aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/infiniband/core
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2015-09-09 08:33:31 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2015-09-09 08:33:31 -0700
commit26d2177e977c912863ac04f6c1a967e793ca3a56 (patch)
tree48da04fb0b947cfa404747690d7081b657e33221 /drivers/infiniband/core
parentMerge tag 'for-linus-4.3' of git://git.code.sf.net/p/openipmi/linux-ipmi (diff)
parentIB/ipoib: Suppress warning for send only join failures (diff)
downloadlinux-dev-26d2177e977c912863ac04f6c1a967e793ca3a56.tar.xz
linux-dev-26d2177e977c912863ac04f6c1a967e793ca3a56.zip
Merge tag 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/dledford/rdma
Pull inifiniband/rdma updates from Doug Ledford: "This is a fairly sizeable set of changes. I've put them through a decent amount of testing prior to sending the pull request due to that. There are still a few fixups that I know are coming, but I wanted to go ahead and get the big, sizable chunk into your hands sooner rather than waiting for those last few fixups. Of note is the fact that this creates what is intended to be a temporary area in the drivers/staging tree specifically for some cleanups and additions that are coming for the RDMA stack. We deprecated two drivers (ipath and amso1100) and are waiting to hear back if we can deprecate another one (ehca). We also put Intel's new hfi1 driver into this area because it needs to be refactored and a transfer library created out of the factored out code, and then it and the qib driver and the soft-roce driver should all be modified to use that library. I expect drivers/staging/rdma to be around for three or four kernel releases and then to go away as all of the work is completed and final deletions of deprecated drivers are done. Summary of changes for 4.3: - Create drivers/staging/rdma - Move amso1100 driver to staging/rdma and schedule for deletion - Move ipath driver to staging/rdma and schedule for deletion - Add hfi1 driver to staging/rdma and set TODO for move to regular tree - Initial support for namespaces to be used on RDMA devices - Add RoCE GID table handling to the RDMA core caching code - Infrastructure to support handling of devices with differing read and write scatter gather capabilities - Various iSER updates - Kill off unsafe usage of global mr registrations - Update SRP driver - Misc mlx4 driver updates - Support for the mr_alloc verb - Support for a netlink interface between kernel and user space cache daemon to speed path record queries and route resolution - Ininitial support for safe hot removal of verbs devices" * tag 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/dledford/rdma: (136 commits) IB/ipoib: Suppress warning for send only join failures IB/ipoib: Clean up send-only multicast joins IB/srp: Fix possible protection fault IB/core: Move SM class defines from ib_mad.h to ib_smi.h IB/core: Remove unnecessary defines from ib_mad.h IB/hfi1: Add PSM2 user space header to header_install IB/hfi1: Add CSRs for CONFIG_SDMA_VERBOSITY mlx5: Fix incorrect wc pkey_index assignment for GSI messages IB/mlx5: avoid destroying a NULL mr in reg_user_mr error flow IB/uverbs: reject invalid or unknown opcodes IB/cxgb4: Fix if statement in pick_local_ip6adddrs IB/sa: Fix rdma netlink message flags IB/ucma: HW Device hot-removal support IB/mlx4_ib: Disassociate support IB/uverbs: Enable device removal when there are active user space applications IB/uverbs: Explicitly pass ib_dev to uverbs commands IB/uverbs: Fix race between ib_uverbs_open and remove_one IB/uverbs: Fix reference counting usage of event files IB/core: Make ib_dealloc_pd return void IB/srp: Create an insecure all physical rkey only if needed ...
Diffstat (limited to 'drivers/infiniband/core')
-rw-r--r--drivers/infiniband/core/Makefile3
-rw-r--r--drivers/infiniband/core/cache.c773
-rw-r--r--drivers/infiniband/core/cm.c215
-rw-r--r--drivers/infiniband/core/cma.c657
-rw-r--r--drivers/infiniband/core/core_priv.h54
-rw-r--r--drivers/infiniband/core/device.c335
-rw-r--r--drivers/infiniband/core/mad.c28
-rw-r--r--drivers/infiniband/core/mad_priv.h1
-rw-r--r--drivers/infiniband/core/multicast.c7
-rw-r--r--drivers/infiniband/core/netlink.c55
-rw-r--r--drivers/infiniband/core/roce_gid_mgmt.c728
-rw-r--r--drivers/infiniband/core/sa_query.c515
-rw-r--r--drivers/infiniband/core/sysfs.c51
-rw-r--r--drivers/infiniband/core/ucm.c9
-rw-r--r--drivers/infiniband/core/ucma.c146
-rw-r--r--drivers/infiniband/core/user_mad.c6
-rw-r--r--drivers/infiniband/core/uverbs.h16
-rw-r--r--drivers/infiniband/core/uverbs_cmd.c147
-rw-r--r--drivers/infiniband/core/uverbs_main.c448
-rw-r--r--drivers/infiniband/core/verbs.c131
20 files changed, 3618 insertions, 707 deletions
diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile
index acf736764445..d43a8994ac5c 100644
--- a/drivers/infiniband/core/Makefile
+++ b/drivers/infiniband/core/Makefile
@@ -9,7 +9,8 @@ obj-$(CONFIG_INFINIBAND_USER_ACCESS) += ib_uverbs.o ib_ucm.o \
$(user_access-y)
ib_core-y := packer.o ud_header.o verbs.o sysfs.o \
- device.o fmr_pool.o cache.o netlink.o
+ device.o fmr_pool.o cache.o netlink.o \
+ roce_gid_mgmt.o
ib_core-$(CONFIG_INFINIBAND_USER_MEM) += umem.o
ib_core-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += umem_odp.o umem_rbtree.o
diff --git a/drivers/infiniband/core/cache.c b/drivers/infiniband/core/cache.c
index 871da832d016..8f66c67ff0df 100644
--- a/drivers/infiniband/core/cache.c
+++ b/drivers/infiniband/core/cache.c
@@ -37,6 +37,8 @@
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/workqueue.h>
+#include <linux/netdevice.h>
+#include <net/addrconf.h>
#include <rdma/ib_cache.h>
@@ -47,76 +49,621 @@ struct ib_pkey_cache {
u16 table[0];
};
-struct ib_gid_cache {
- int table_len;
- union ib_gid table[0];
-};
-
struct ib_update_work {
struct work_struct work;
struct ib_device *device;
u8 port_num;
};
-int ib_get_cached_gid(struct ib_device *device,
- u8 port_num,
- int index,
- union ib_gid *gid)
+union ib_gid zgid;
+EXPORT_SYMBOL(zgid);
+
+static const struct ib_gid_attr zattr;
+
+enum gid_attr_find_mask {
+ GID_ATTR_FIND_MASK_GID = 1UL << 0,
+ GID_ATTR_FIND_MASK_NETDEV = 1UL << 1,
+ GID_ATTR_FIND_MASK_DEFAULT = 1UL << 2,
+};
+
+enum gid_table_entry_props {
+ GID_TABLE_ENTRY_INVALID = 1UL << 0,
+ GID_TABLE_ENTRY_DEFAULT = 1UL << 1,
+};
+
+enum gid_table_write_action {
+ GID_TABLE_WRITE_ACTION_ADD,
+ GID_TABLE_WRITE_ACTION_DEL,
+ /* MODIFY only updates the GID table. Currently only used by
+ * ib_cache_update.
+ */
+ GID_TABLE_WRITE_ACTION_MODIFY
+};
+
+struct ib_gid_table_entry {
+ /* This lock protects an entry from being
+ * read and written simultaneously.
+ */
+ rwlock_t lock;
+ unsigned long props;
+ union ib_gid gid;
+ struct ib_gid_attr attr;
+ void *context;
+};
+
+struct ib_gid_table {
+ int sz;
+ /* In RoCE, adding a GID to the table requires:
+ * (a) Find if this GID is already exists.
+ * (b) Find a free space.
+ * (c) Write the new GID
+ *
+ * Delete requires different set of operations:
+ * (a) Find the GID
+ * (b) Delete it.
+ *
+ * Add/delete should be carried out atomically.
+ * This is done by locking this mutex from multiple
+ * writers. We don't need this lock for IB, as the MAD
+ * layer replaces all entries. All data_vec entries
+ * are locked by this lock.
+ **/
+ struct mutex lock;
+ struct ib_gid_table_entry *data_vec;
+};
+
+static int write_gid(struct ib_device *ib_dev, u8 port,
+ struct ib_gid_table *table, int ix,
+ const union ib_gid *gid,
+ const struct ib_gid_attr *attr,
+ enum gid_table_write_action action,
+ bool default_gid)
{
- struct ib_gid_cache *cache;
+ int ret = 0;
+ struct net_device *old_net_dev;
unsigned long flags;
+
+ /* in rdma_cap_roce_gid_table, this funciton should be protected by a
+ * sleep-able lock.
+ */
+ write_lock_irqsave(&table->data_vec[ix].lock, flags);
+
+ if (rdma_cap_roce_gid_table(ib_dev, port)) {
+ table->data_vec[ix].props |= GID_TABLE_ENTRY_INVALID;
+ write_unlock_irqrestore(&table->data_vec[ix].lock, flags);
+ /* GID_TABLE_WRITE_ACTION_MODIFY currently isn't supported by
+ * RoCE providers and thus only updates the cache.
+ */
+ if (action == GID_TABLE_WRITE_ACTION_ADD)
+ ret = ib_dev->add_gid(ib_dev, port, ix, gid, attr,
+ &table->data_vec[ix].context);
+ else if (action == GID_TABLE_WRITE_ACTION_DEL)
+ ret = ib_dev->del_gid(ib_dev, port, ix,
+ &table->data_vec[ix].context);
+ write_lock_irqsave(&table->data_vec[ix].lock, flags);
+ }
+
+ old_net_dev = table->data_vec[ix].attr.ndev;
+ if (old_net_dev && old_net_dev != attr->ndev)
+ dev_put(old_net_dev);
+ /* if modify_gid failed, just delete the old gid */
+ if (ret || action == GID_TABLE_WRITE_ACTION_DEL) {
+ gid = &zgid;
+ attr = &zattr;
+ table->data_vec[ix].context = NULL;
+ }
+ if (default_gid)
+ table->data_vec[ix].props |= GID_TABLE_ENTRY_DEFAULT;
+ memcpy(&table->data_vec[ix].gid, gid, sizeof(*gid));
+ memcpy(&table->data_vec[ix].attr, attr, sizeof(*attr));
+ if (table->data_vec[ix].attr.ndev &&
+ table->data_vec[ix].attr.ndev != old_net_dev)
+ dev_hold(table->data_vec[ix].attr.ndev);
+
+ table->data_vec[ix].props &= ~GID_TABLE_ENTRY_INVALID;
+
+ write_unlock_irqrestore(&table->data_vec[ix].lock, flags);
+
+ if (!ret && rdma_cap_roce_gid_table(ib_dev, port)) {
+ struct ib_event event;
+
+ event.device = ib_dev;
+ event.element.port_num = port;
+ event.event = IB_EVENT_GID_CHANGE;
+
+ ib_dispatch_event(&event);
+ }
+ return ret;
+}
+
+static int add_gid(struct ib_device *ib_dev, u8 port,
+ struct ib_gid_table *table, int ix,
+ const union ib_gid *gid,
+ const struct ib_gid_attr *attr,
+ bool default_gid) {
+ return write_gid(ib_dev, port, table, ix, gid, attr,
+ GID_TABLE_WRITE_ACTION_ADD, default_gid);
+}
+
+static int modify_gid(struct ib_device *ib_dev, u8 port,
+ struct ib_gid_table *table, int ix,
+ const union ib_gid *gid,
+ const struct ib_gid_attr *attr,
+ bool default_gid) {
+ return write_gid(ib_dev, port, table, ix, gid, attr,
+ GID_TABLE_WRITE_ACTION_MODIFY, default_gid);
+}
+
+static int del_gid(struct ib_device *ib_dev, u8 port,
+ struct ib_gid_table *table, int ix,
+ bool default_gid) {
+ return write_gid(ib_dev, port, table, ix, &zgid, &zattr,
+ GID_TABLE_WRITE_ACTION_DEL, default_gid);
+}
+
+static int find_gid(struct ib_gid_table *table, const union ib_gid *gid,
+ const struct ib_gid_attr *val, bool default_gid,
+ unsigned long mask)
+{
+ int i;
+
+ for (i = 0; i < table->sz; i++) {
+ unsigned long flags;
+ struct ib_gid_attr *attr = &table->data_vec[i].attr;
+
+ read_lock_irqsave(&table->data_vec[i].lock, flags);
+
+ if (table->data_vec[i].props & GID_TABLE_ENTRY_INVALID)
+ goto next;
+
+ if (mask & GID_ATTR_FIND_MASK_GID &&
+ memcmp(gid, &table->data_vec[i].gid, sizeof(*gid)))
+ goto next;
+
+ if (mask & GID_ATTR_FIND_MASK_NETDEV &&
+ attr->ndev != val->ndev)
+ goto next;
+
+ if (mask & GID_ATTR_FIND_MASK_DEFAULT &&
+ !!(table->data_vec[i].props & GID_TABLE_ENTRY_DEFAULT) !=
+ default_gid)
+ goto next;
+
+ read_unlock_irqrestore(&table->data_vec[i].lock, flags);
+ return i;
+next:
+ read_unlock_irqrestore(&table->data_vec[i].lock, flags);
+ }
+
+ return -1;
+}
+
+static void make_default_gid(struct net_device *dev, union ib_gid *gid)
+{
+ gid->global.subnet_prefix = cpu_to_be64(0xfe80000000000000LL);
+ addrconf_ifid_eui48(&gid->raw[8], dev);
+}
+
+int ib_cache_gid_add(struct ib_device *ib_dev, u8 port,
+ union ib_gid *gid, struct ib_gid_attr *attr)
+{
+ struct ib_gid_table **ports_table = ib_dev->cache.gid_cache;
+ struct ib_gid_table *table;
+ int ix;
int ret = 0;
+ struct net_device *idev;
- if (port_num < rdma_start_port(device) || port_num > rdma_end_port(device))
+ table = ports_table[port - rdma_start_port(ib_dev)];
+
+ if (!memcmp(gid, &zgid, sizeof(*gid)))
return -EINVAL;
- read_lock_irqsave(&device->cache.lock, flags);
+ if (ib_dev->get_netdev) {
+ idev = ib_dev->get_netdev(ib_dev, port);
+ if (idev && attr->ndev != idev) {
+ union ib_gid default_gid;
- cache = device->cache.gid_cache[port_num - rdma_start_port(device)];
+ /* Adding default GIDs in not permitted */
+ make_default_gid(idev, &default_gid);
+ if (!memcmp(gid, &default_gid, sizeof(*gid))) {
+ dev_put(idev);
+ return -EPERM;
+ }
+ }
+ if (idev)
+ dev_put(idev);
+ }
- if (index < 0 || index >= cache->table_len)
- ret = -EINVAL;
- else
- *gid = cache->table[index];
+ mutex_lock(&table->lock);
- read_unlock_irqrestore(&device->cache.lock, flags);
+ ix = find_gid(table, gid, attr, false, GID_ATTR_FIND_MASK_GID |
+ GID_ATTR_FIND_MASK_NETDEV);
+ if (ix >= 0)
+ goto out_unlock;
+ ix = find_gid(table, &zgid, NULL, false, GID_ATTR_FIND_MASK_GID |
+ GID_ATTR_FIND_MASK_DEFAULT);
+ if (ix < 0) {
+ ret = -ENOSPC;
+ goto out_unlock;
+ }
+
+ add_gid(ib_dev, port, table, ix, gid, attr, false);
+
+out_unlock:
+ mutex_unlock(&table->lock);
return ret;
}
-EXPORT_SYMBOL(ib_get_cached_gid);
-int ib_find_cached_gid(struct ib_device *device,
- const union ib_gid *gid,
- u8 *port_num,
- u16 *index)
+int ib_cache_gid_del(struct ib_device *ib_dev, u8 port,
+ union ib_gid *gid, struct ib_gid_attr *attr)
{
- struct ib_gid_cache *cache;
+ struct ib_gid_table **ports_table = ib_dev->cache.gid_cache;
+ struct ib_gid_table *table;
+ int ix;
+
+ table = ports_table[port - rdma_start_port(ib_dev)];
+
+ mutex_lock(&table->lock);
+
+ ix = find_gid(table, gid, attr, false,
+ GID_ATTR_FIND_MASK_GID |
+ GID_ATTR_FIND_MASK_NETDEV |
+ GID_ATTR_FIND_MASK_DEFAULT);
+ if (ix < 0)
+ goto out_unlock;
+
+ del_gid(ib_dev, port, table, ix, false);
+
+out_unlock:
+ mutex_unlock(&table->lock);
+ return 0;
+}
+
+int ib_cache_gid_del_all_netdev_gids(struct ib_device *ib_dev, u8 port,
+ struct net_device *ndev)
+{
+ struct ib_gid_table **ports_table = ib_dev->cache.gid_cache;
+ struct ib_gid_table *table;
+ int ix;
+
+ table = ports_table[port - rdma_start_port(ib_dev)];
+
+ mutex_lock(&table->lock);
+
+ for (ix = 0; ix < table->sz; ix++)
+ if (table->data_vec[ix].attr.ndev == ndev)
+ del_gid(ib_dev, port, table, ix, false);
+
+ mutex_unlock(&table->lock);
+ return 0;
+}
+
+static int __ib_cache_gid_get(struct ib_device *ib_dev, u8 port, int index,
+ union ib_gid *gid, struct ib_gid_attr *attr)
+{
+ struct ib_gid_table **ports_table = ib_dev->cache.gid_cache;
+ struct ib_gid_table *table;
unsigned long flags;
- int p, i;
- int ret = -ENOENT;
- *port_num = -1;
- if (index)
- *index = -1;
+ table = ports_table[port - rdma_start_port(ib_dev)];
- read_lock_irqsave(&device->cache.lock, flags);
+ if (index < 0 || index >= table->sz)
+ return -EINVAL;
- for (p = 0; p <= rdma_end_port(device) - rdma_start_port(device); ++p) {
- cache = device->cache.gid_cache[p];
- for (i = 0; i < cache->table_len; ++i) {
- if (!memcmp(gid, &cache->table[i], sizeof *gid)) {
- *port_num = p + rdma_start_port(device);
- if (index)
- *index = i;
- ret = 0;
- goto found;
- }
+ read_lock_irqsave(&table->data_vec[index].lock, flags);
+ if (table->data_vec[index].props & GID_TABLE_ENTRY_INVALID) {
+ read_unlock_irqrestore(&table->data_vec[index].lock, flags);
+ return -EAGAIN;
+ }
+
+ memcpy(gid, &table->data_vec[index].gid, sizeof(*gid));
+ if (attr) {
+ memcpy(attr, &table->data_vec[index].attr, sizeof(*attr));
+ if (attr->ndev)
+ dev_hold(attr->ndev);
+ }
+
+ read_unlock_irqrestore(&table->data_vec[index].lock, flags);
+ return 0;
+}
+
+static int _ib_cache_gid_table_find(struct ib_device *ib_dev,
+ const union ib_gid *gid,
+ const struct ib_gid_attr *val,
+ unsigned long mask,
+ u8 *port, u16 *index)
+{
+ struct ib_gid_table **ports_table = ib_dev->cache.gid_cache;
+ struct ib_gid_table *table;
+ u8 p;
+ int local_index;
+
+ for (p = 0; p < ib_dev->phys_port_cnt; p++) {
+ table = ports_table[p];
+ local_index = find_gid(table, gid, val, false, mask);
+ if (local_index >= 0) {
+ if (index)
+ *index = local_index;
+ if (port)
+ *port = p + rdma_start_port(ib_dev);
+ return 0;
}
}
-found:
- read_unlock_irqrestore(&device->cache.lock, flags);
- return ret;
+ return -ENOENT;
+}
+
+static int ib_cache_gid_find(struct ib_device *ib_dev,
+ const union ib_gid *gid,
+ struct net_device *ndev, u8 *port,
+ u16 *index)
+{
+ unsigned long mask = GID_ATTR_FIND_MASK_GID;
+ struct ib_gid_attr gid_attr_val = {.ndev = ndev};
+
+ if (ndev)
+ mask |= GID_ATTR_FIND_MASK_NETDEV;
+
+ return _ib_cache_gid_table_find(ib_dev, gid, &gid_attr_val,
+ mask, port, index);
+}
+
+int ib_cache_gid_find_by_port(struct ib_device *ib_dev,
+ const union ib_gid *gid,
+ u8 port, struct net_device *ndev,
+ u16 *index)
+{
+ int local_index;
+ struct ib_gid_table **ports_table = ib_dev->cache.gid_cache;
+ struct ib_gid_table *table;
+ unsigned long mask = GID_ATTR_FIND_MASK_GID;
+ struct ib_gid_attr val = {.ndev = ndev};
+
+ if (port < rdma_start_port(ib_dev) ||
+ port > rdma_end_port(ib_dev))
+ return -ENOENT;
+
+ table = ports_table[port - rdma_start_port(ib_dev)];
+
+ if (ndev)
+ mask |= GID_ATTR_FIND_MASK_NETDEV;
+
+ local_index = find_gid(table, gid, &val, false, mask);
+ if (local_index >= 0) {
+ if (index)
+ *index = local_index;
+ return 0;
+ }
+
+ return -ENOENT;
+}
+
+static struct ib_gid_table *alloc_gid_table(int sz)
+{
+ unsigned int i;
+ struct ib_gid_table *table =
+ kzalloc(sizeof(struct ib_gid_table), GFP_KERNEL);
+ if (!table)
+ return NULL;
+
+ table->data_vec = kcalloc(sz, sizeof(*table->data_vec), GFP_KERNEL);
+ if (!table->data_vec)
+ goto err_free_table;
+
+ mutex_init(&table->lock);
+
+ table->sz = sz;
+
+ for (i = 0; i < sz; i++)
+ rwlock_init(&table->data_vec[i].lock);
+
+ return table;
+
+err_free_table:
+ kfree(table);
+ return NULL;
+}
+
+static void release_gid_table(struct ib_gid_table *table)
+{
+ if (table) {
+ kfree(table->data_vec);
+ kfree(table);
+ }
+}
+
+static void cleanup_gid_table_port(struct ib_device *ib_dev, u8 port,
+ struct ib_gid_table *table)
+{
+ int i;
+
+ if (!table)
+ return;
+
+ for (i = 0; i < table->sz; ++i) {
+ if (memcmp(&table->data_vec[i].gid, &zgid,
+ sizeof(table->data_vec[i].gid)))
+ del_gid(ib_dev, port, table, i,
+ table->data_vec[i].props &
+ GID_ATTR_FIND_MASK_DEFAULT);
+ }
+}
+
+void ib_cache_gid_set_default_gid(struct ib_device *ib_dev, u8 port,
+ struct net_device *ndev,
+ enum ib_cache_gid_default_mode mode)
+{
+ struct ib_gid_table **ports_table = ib_dev->cache.gid_cache;
+ union ib_gid gid;
+ struct ib_gid_attr gid_attr;
+ struct ib_gid_table *table;
+ int ix;
+ union ib_gid current_gid;
+ struct ib_gid_attr current_gid_attr = {};
+
+ table = ports_table[port - rdma_start_port(ib_dev)];
+
+ make_default_gid(ndev, &gid);
+ memset(&gid_attr, 0, sizeof(gid_attr));
+ gid_attr.ndev = ndev;
+
+ ix = find_gid(table, NULL, NULL, true, GID_ATTR_FIND_MASK_DEFAULT);
+
+ /* Coudn't find default GID location */
+ WARN_ON(ix < 0);
+
+ mutex_lock(&table->lock);
+ if (!__ib_cache_gid_get(ib_dev, port, ix,
+ &current_gid, &current_gid_attr) &&
+ mode == IB_CACHE_GID_DEFAULT_MODE_SET &&
+ !memcmp(&gid, &current_gid, sizeof(gid)) &&
+ !memcmp(&gid_attr, &current_gid_attr, sizeof(gid_attr)))
+ goto unlock;
+
+ if ((memcmp(&current_gid, &zgid, sizeof(current_gid)) ||
+ memcmp(&current_gid_attr, &zattr,
+ sizeof(current_gid_attr))) &&
+ del_gid(ib_dev, port, table, ix, true)) {
+ pr_warn("ib_cache_gid: can't delete index %d for default gid %pI6\n",
+ ix, gid.raw);
+ goto unlock;
+ }
+
+ if (mode == IB_CACHE_GID_DEFAULT_MODE_SET)
+ if (add_gid(ib_dev, port, table, ix, &gid, &gid_attr, true))
+ pr_warn("ib_cache_gid: unable to add default gid %pI6\n",
+ gid.raw);
+
+unlock:
+ if (current_gid_attr.ndev)
+ dev_put(current_gid_attr.ndev);
+ mutex_unlock(&table->lock);
+}
+
+static int gid_table_reserve_default(struct ib_device *ib_dev, u8 port,
+ struct ib_gid_table *table)
+{
+ if (rdma_protocol_roce(ib_dev, port)) {
+ struct ib_gid_table_entry *entry = &table->data_vec[0];
+
+ entry->props |= GID_TABLE_ENTRY_DEFAULT;
+ }
+
+ return 0;
+}
+
+static int _gid_table_setup_one(struct ib_device *ib_dev)
+{
+ u8 port;
+ struct ib_gid_table **table;
+ int err = 0;
+
+ table = kcalloc(ib_dev->phys_port_cnt, sizeof(*table), GFP_KERNEL);
+
+ if (!table) {
+ pr_warn("failed to allocate ib gid cache for %s\n",
+ ib_dev->name);
+ return -ENOMEM;
+ }
+
+ for (port = 0; port < ib_dev->phys_port_cnt; port++) {
+ u8 rdma_port = port + rdma_start_port(ib_dev);
+
+ table[port] =
+ alloc_gid_table(
+ ib_dev->port_immutable[rdma_port].gid_tbl_len);
+ if (!table[port]) {
+ err = -ENOMEM;
+ goto rollback_table_setup;
+ }
+
+ err = gid_table_reserve_default(ib_dev,
+ port + rdma_start_port(ib_dev),
+ table[port]);
+ if (err)
+ goto rollback_table_setup;
+ }
+
+ ib_dev->cache.gid_cache = table;
+ return 0;
+
+rollback_table_setup:
+ for (port = 0; port < ib_dev->phys_port_cnt; port++) {
+ cleanup_gid_table_port(ib_dev, port + rdma_start_port(ib_dev),
+ table[port]);
+ release_gid_table(table[port]);
+ }
+
+ kfree(table);
+ return err;
+}
+
+static void gid_table_release_one(struct ib_device *ib_dev)
+{
+ struct ib_gid_table **table = ib_dev->cache.gid_cache;
+ u8 port;
+
+ if (!table)
+ return;
+
+ for (port = 0; port < ib_dev->phys_port_cnt; port++)
+ release_gid_table(table[port]);
+
+ kfree(table);
+ ib_dev->cache.gid_cache = NULL;
+}
+
+static void gid_table_cleanup_one(struct ib_device *ib_dev)
+{
+ struct ib_gid_table **table = ib_dev->cache.gid_cache;
+ u8 port;
+
+ if (!table)
+ return;
+
+ for (port = 0; port < ib_dev->phys_port_cnt; port++)
+ cleanup_gid_table_port(ib_dev, port + rdma_start_port(ib_dev),
+ table[port]);
+}
+
+static int gid_table_setup_one(struct ib_device *ib_dev)
+{
+ int err;
+
+ err = _gid_table_setup_one(ib_dev);
+
+ if (err)
+ return err;
+
+ err = roce_rescan_device(ib_dev);
+
+ if (err) {
+ gid_table_cleanup_one(ib_dev);
+ gid_table_release_one(ib_dev);
+ }
+
+ return err;
+}
+
+int ib_get_cached_gid(struct ib_device *device,
+ u8 port_num,
+ int index,
+ union ib_gid *gid)
+{
+ if (port_num < rdma_start_port(device) || port_num > rdma_end_port(device))
+ return -EINVAL;
+
+ return __ib_cache_gid_get(device, port_num, index, gid, NULL);
+}
+EXPORT_SYMBOL(ib_get_cached_gid);
+
+int ib_find_cached_gid(struct ib_device *device,
+ const union ib_gid *gid,
+ u8 *port_num,
+ u16 *index)
+{
+ return ib_cache_gid_find(device, gid, NULL, port_num, index);
}
EXPORT_SYMBOL(ib_find_cached_gid);
@@ -243,9 +790,21 @@ static void ib_cache_update(struct ib_device *device,
{
struct ib_port_attr *tprops = NULL;
struct ib_pkey_cache *pkey_cache = NULL, *old_pkey_cache;
- struct ib_gid_cache *gid_cache = NULL, *old_gid_cache;
+ struct ib_gid_cache {
+ int table_len;
+ union ib_gid table[0];
+ } *gid_cache = NULL;
int i;
int ret;
+ struct ib_gid_table *table;
+ struct ib_gid_table **ports_table = device->cache.gid_cache;
+ bool use_roce_gid_table =
+ rdma_cap_roce_gid_table(device, port);
+
+ if (port < rdma_start_port(device) || port > rdma_end_port(device))
+ return;
+
+ table = ports_table[port - rdma_start_port(device)];
tprops = kmalloc(sizeof *tprops, GFP_KERNEL);
if (!tprops)
@@ -265,12 +824,14 @@ static void ib_cache_update(struct ib_device *device,
pkey_cache->table_len = tprops->pkey_tbl_len;
- gid_cache = kmalloc(sizeof *gid_cache + tprops->gid_tbl_len *
- sizeof *gid_cache->table, GFP_KERNEL);
- if (!gid_cache)
- goto err;
+ if (!use_roce_gid_table) {
+ gid_cache = kmalloc(sizeof(*gid_cache) + tprops->gid_tbl_len *
+ sizeof(*gid_cache->table), GFP_KERNEL);
+ if (!gid_cache)
+ goto err;
- gid_cache->table_len = tprops->gid_tbl_len;
+ gid_cache->table_len = tprops->gid_tbl_len;
+ }
for (i = 0; i < pkey_cache->table_len; ++i) {
ret = ib_query_pkey(device, port, i, pkey_cache->table + i);
@@ -281,29 +842,36 @@ static void ib_cache_update(struct ib_device *device,
}
}
- for (i = 0; i < gid_cache->table_len; ++i) {
- ret = ib_query_gid(device, port, i, gid_cache->table + i);
- if (ret) {
- printk(KERN_WARNING "ib_query_gid failed (%d) for %s (index %d)\n",
- ret, device->name, i);
- goto err;
+ if (!use_roce_gid_table) {
+ for (i = 0; i < gid_cache->table_len; ++i) {
+ ret = ib_query_gid(device, port, i,
+ gid_cache->table + i);
+ if (ret) {
+ printk(KERN_WARNING "ib_query_gid failed (%d) for %s (index %d)\n",
+ ret, device->name, i);
+ goto err;
+ }
}
}
write_lock_irq(&device->cache.lock);
old_pkey_cache = device->cache.pkey_cache[port - rdma_start_port(device)];
- old_gid_cache = device->cache.gid_cache [port - rdma_start_port(device)];
device->cache.pkey_cache[port - rdma_start_port(device)] = pkey_cache;
- device->cache.gid_cache [port - rdma_start_port(device)] = gid_cache;
+ if (!use_roce_gid_table) {
+ for (i = 0; i < gid_cache->table_len; i++) {
+ modify_gid(device, port, table, i, gid_cache->table + i,
+ &zattr, false);
+ }
+ }
device->cache.lmc_cache[port - rdma_start_port(device)] = tprops->lmc;
write_unlock_irq(&device->cache.lock);
+ kfree(gid_cache);
kfree(old_pkey_cache);
- kfree(old_gid_cache);
kfree(tprops);
return;
@@ -344,85 +912,88 @@ static void ib_cache_event(struct ib_event_handler *handler,
}
}
-static void ib_cache_setup_one(struct ib_device *device)
+int ib_cache_setup_one(struct ib_device *device)
{
int p;
+ int err;
rwlock_init(&device->cache.lock);
device->cache.pkey_cache =
- kmalloc(sizeof *device->cache.pkey_cache *
- (rdma_end_port(device) - rdma_start_port(device) + 1), GFP_KERNEL);
- device->cache.gid_cache =
- kmalloc(sizeof *device->cache.gid_cache *
+ kzalloc(sizeof *device->cache.pkey_cache *
(rdma_end_port(device) - rdma_start_port(device) + 1), GFP_KERNEL);
-
device->cache.lmc_cache = kmalloc(sizeof *device->cache.lmc_cache *
(rdma_end_port(device) -
rdma_start_port(device) + 1),
GFP_KERNEL);
-
- if (!device->cache.pkey_cache || !device->cache.gid_cache ||
+ if (!device->cache.pkey_cache ||
!device->cache.lmc_cache) {
printk(KERN_WARNING "Couldn't allocate cache "
"for %s\n", device->name);
- goto err;
+ return -ENOMEM;
}
- for (p = 0; p <= rdma_end_port(device) - rdma_start_port(device); ++p) {
- device->cache.pkey_cache[p] = NULL;
- device->cache.gid_cache [p] = NULL;
+ err = gid_table_setup_one(device);
+ if (err)
+ /* Allocated memory will be cleaned in the release function */
+ return err;
+
+ for (p = 0; p <= rdma_end_port(device) - rdma_start_port(device); ++p)
ib_cache_update(device, p + rdma_start_port(device));
- }
INIT_IB_EVENT_HANDLER(&device->cache.event_handler,
device, ib_cache_event);
- if (ib_register_event_handler(&device->cache.event_handler))
- goto err_cache;
-
- return;
+ err = ib_register_event_handler(&device->cache.event_handler);
+ if (err)
+ goto err;
-err_cache:
- for (p = 0; p <= rdma_end_port(device) - rdma_start_port(device); ++p) {
- kfree(device->cache.pkey_cache[p]);
- kfree(device->cache.gid_cache[p]);
- }
+ return 0;
err:
- kfree(device->cache.pkey_cache);
- kfree(device->cache.gid_cache);
- kfree(device->cache.lmc_cache);
+ gid_table_cleanup_one(device);
+ return err;
}
-static void ib_cache_cleanup_one(struct ib_device *device)
+void ib_cache_release_one(struct ib_device *device)
{
int p;
- ib_unregister_event_handler(&device->cache.event_handler);
- flush_workqueue(ib_wq);
-
- for (p = 0; p <= rdma_end_port(device) - rdma_start_port(device); ++p) {
- kfree(device->cache.pkey_cache[p]);
- kfree(device->cache.gid_cache[p]);
- }
-
+ /*
+ * The release function frees all the cache elements.
+ * This function should be called as part of freeing
+ * all the device's resources when the cache could no
+ * longer be accessed.
+ */
+ if (device->cache.pkey_cache)
+ for (p = 0;
+ p <= rdma_end_port(device) - rdma_start_port(device); ++p)
+ kfree(device->cache.pkey_cache[p]);
+
+ gid_table_release_one(device);
kfree(device->cache.pkey_cache);
- kfree(device->cache.gid_cache);
kfree(device->cache.lmc_cache);
}
-static struct ib_client cache_client = {
- .name = "cache",
- .add = ib_cache_setup_one,
- .remove = ib_cache_cleanup_one
-};
+void ib_cache_cleanup_one(struct ib_device *device)
+{
+ /* The cleanup function unregisters the event handler,
+ * waits for all in-progress workqueue elements and cleans
+ * up the GID cache. This function should be called after
+ * the device was removed from the devices list and all
+ * clients were removed, so the cache exists but is
+ * non-functional and shouldn't be updated anymore.
+ */
+ ib_unregister_event_handler(&device->cache.event_handler);
+ flush_workqueue(ib_wq);
+ gid_table_cleanup_one(device);
+}
-int __init ib_cache_setup(void)
+void __init ib_cache_setup(void)
{
- return ib_register_client(&cache_client);
+ roce_gid_mgmt_init();
}
void __exit ib_cache_cleanup(void)
{
- ib_unregister_client(&cache_client);
+ roce_gid_mgmt_cleanup();
}
diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c
index 3a972ebf3c0d..ea4db9c1d44f 100644
--- a/drivers/infiniband/core/cm.c
+++ b/drivers/infiniband/core/cm.c
@@ -58,7 +58,7 @@ MODULE_DESCRIPTION("InfiniBand CM");
MODULE_LICENSE("Dual BSD/GPL");
static void cm_add_one(struct ib_device *device);
-static void cm_remove_one(struct ib_device *device);
+static void cm_remove_one(struct ib_device *device, void *client_data);
static struct ib_client cm_client = {
.name = "cm",
@@ -213,13 +213,15 @@ struct cm_id_private {
spinlock_t lock; /* Do not acquire inside cm.lock */
struct completion comp;
atomic_t refcount;
+ /* Number of clients sharing this ib_cm_id. Only valid for listeners.
+ * Protected by the cm.lock spinlock. */
+ int listen_sharecount;
struct ib_mad_send_buf *msg;
struct cm_timewait_info *timewait_info;
/* todo: use alternate port on send failure */
struct cm_av av;
struct cm_av alt_av;
- struct ib_cm_compare_data *compare_data;
void *private_data;
__be64 tid;
@@ -440,40 +442,6 @@ static struct cm_id_private * cm_acquire_id(__be32 local_id, __be32 remote_id)
return cm_id_priv;
}
-static void cm_mask_copy(u32 *dst, const u32 *src, const u32 *mask)
-{
- int i;
-
- for (i = 0; i < IB_CM_COMPARE_SIZE; i++)
- dst[i] = src[i] & mask[i];
-}
-
-static int cm_compare_data(struct ib_cm_compare_data *src_data,
- struct ib_cm_compare_data *dst_data)
-{
- u32 src[IB_CM_COMPARE_SIZE];
- u32 dst[IB_CM_COMPARE_SIZE];
-
- if (!src_data || !dst_data)
- return 0;
-
- cm_mask_copy(src, src_data->data, dst_data->mask);
- cm_mask_copy(dst, dst_data->data, src_data->mask);
- return memcmp(src, dst, sizeof(src));
-}
-
-static int cm_compare_private_data(u32 *private_data,
- struct ib_cm_compare_data *dst_data)
-{
- u32 src[IB_CM_COMPARE_SIZE];
-
- if (!dst_data)
- return 0;
-
- cm_mask_copy(src, private_data, dst_data->mask);
- return memcmp(src, dst_data->data, sizeof(src));
-}
-
/*
* Trivial helpers to strip endian annotation and compare; the
* endianness doesn't actually matter since we just need a stable
@@ -506,18 +474,14 @@ static struct cm_id_private * cm_insert_listen(struct cm_id_private *cm_id_priv)
struct cm_id_private *cur_cm_id_priv;
__be64 service_id = cm_id_priv->id.service_id;
__be64 service_mask = cm_id_priv->id.service_mask;
- int data_cmp;
while (*link) {
parent = *link;
cur_cm_id_priv = rb_entry(parent, struct cm_id_private,
service_node);
- data_cmp = cm_compare_data(cm_id_priv->compare_data,
- cur_cm_id_priv->compare_data);
if ((cur_cm_id_priv->id.service_mask & service_id) ==
(service_mask & cur_cm_id_priv->id.service_id) &&
- (cm_id_priv->id.device == cur_cm_id_priv->id.device) &&
- !data_cmp)
+ (cm_id_priv->id.device == cur_cm_id_priv->id.device))
return cur_cm_id_priv;
if (cm_id_priv->id.device < cur_cm_id_priv->id.device)
@@ -528,8 +492,6 @@ static struct cm_id_private * cm_insert_listen(struct cm_id_private *cm_id_priv)
link = &(*link)->rb_left;
else if (be64_gt(service_id, cur_cm_id_priv->id.service_id))
link = &(*link)->rb_right;
- else if (data_cmp < 0)
- link = &(*link)->rb_left;
else
link = &(*link)->rb_right;
}
@@ -539,20 +501,16 @@ static struct cm_id_private * cm_insert_listen(struct cm_id_private *cm_id_priv)
}
static struct cm_id_private * cm_find_listen(struct ib_device *device,
- __be64 service_id,
- u32 *private_data)
+ __be64 service_id)
{
struct rb_node *node = cm.listen_service_table.rb_node;
struct cm_id_private *cm_id_priv;
- int data_cmp;
while (node) {
cm_id_priv = rb_entry(node, struct cm_id_private, service_node);
- data_cmp = cm_compare_private_data(private_data,
- cm_id_priv->compare_data);
if ((cm_id_priv->id.service_mask & service_id) ==
cm_id_priv->id.service_id &&
- (cm_id_priv->id.device == device) && !data_cmp)
+ (cm_id_priv->id.device == device))
return cm_id_priv;
if (device < cm_id_priv->id.device)
@@ -563,8 +521,6 @@ static struct cm_id_private * cm_find_listen(struct ib_device *device,
node = node->rb_left;
else if (be64_gt(service_id, cm_id_priv->id.service_id))
node = node->rb_right;
- else if (data_cmp < 0)
- node = node->rb_left;
else
node = node->rb_right;
}
@@ -859,9 +815,15 @@ retest:
spin_lock_irq(&cm_id_priv->lock);
switch (cm_id->state) {
case IB_CM_LISTEN:
- cm_id->state = IB_CM_IDLE;
spin_unlock_irq(&cm_id_priv->lock);
+
spin_lock_irq(&cm.lock);
+ if (--cm_id_priv->listen_sharecount > 0) {
+ /* The id is still shared. */
+ cm_deref_id(cm_id_priv);
+ spin_unlock_irq(&cm.lock);
+ return;
+ }
rb_erase(&cm_id_priv->service_node, &cm.listen_service_table);
spin_unlock_irq(&cm.lock);
break;
@@ -930,7 +892,6 @@ retest:
wait_for_completion(&cm_id_priv->comp);
while ((work = cm_dequeue_work(cm_id_priv)) != NULL)
cm_free_work(work);
- kfree(cm_id_priv->compare_data);
kfree(cm_id_priv->private_data);
kfree(cm_id_priv);
}
@@ -941,11 +902,23 @@ void ib_destroy_cm_id(struct ib_cm_id *cm_id)
}
EXPORT_SYMBOL(ib_destroy_cm_id);
-int ib_cm_listen(struct ib_cm_id *cm_id, __be64 service_id, __be64 service_mask,
- struct ib_cm_compare_data *compare_data)
+/**
+ * __ib_cm_listen - Initiates listening on the specified service ID for
+ * connection and service ID resolution requests.
+ * @cm_id: Connection identifier associated with the listen request.
+ * @service_id: Service identifier matched against incoming connection
+ * and service ID resolution requests. The service ID should be specified
+ * network-byte order. If set to IB_CM_ASSIGN_SERVICE_ID, the CM will
+ * assign a service ID to the caller.
+ * @service_mask: Mask applied to service ID used to listen across a
+ * range of service IDs. If set to 0, the service ID is matched
+ * exactly. This parameter is ignored if %service_id is set to
+ * IB_CM_ASSIGN_SERVICE_ID.
+ */
+static int __ib_cm_listen(struct ib_cm_id *cm_id, __be64 service_id,
+ __be64 service_mask)
{
struct cm_id_private *cm_id_priv, *cur_cm_id_priv;
- unsigned long flags;
int ret = 0;
service_mask = service_mask ? service_mask : ~cpu_to_be64(0);
@@ -958,20 +931,9 @@ int ib_cm_listen(struct ib_cm_id *cm_id, __be64 service_id, __be64 service_mask,
if (cm_id->state != IB_CM_IDLE)
return -EINVAL;
- if (compare_data) {
- cm_id_priv->compare_data = kzalloc(sizeof *compare_data,
- GFP_KERNEL);
- if (!cm_id_priv->compare_data)
- return -ENOMEM;
- cm_mask_copy(cm_id_priv->compare_data->data,
- compare_data->data, compare_data->mask);
- memcpy(cm_id_priv->compare_data->mask, compare_data->mask,
- sizeof(compare_data->mask));
- }
-
cm_id->state = IB_CM_LISTEN;
+ ++cm_id_priv->listen_sharecount;
- spin_lock_irqsave(&cm.lock, flags);
if (service_id == IB_CM_ASSIGN_SERVICE_ID) {
cm_id->service_id = cpu_to_be64(cm.listen_service_id++);
cm_id->service_mask = ~cpu_to_be64(0);
@@ -980,18 +942,95 @@ int ib_cm_listen(struct ib_cm_id *cm_id, __be64 service_id, __be64 service_mask,
cm_id->service_mask = service_mask;
}
cur_cm_id_priv = cm_insert_listen(cm_id_priv);
- spin_unlock_irqrestore(&cm.lock, flags);
if (cur_cm_id_priv) {
cm_id->state = IB_CM_IDLE;
- kfree(cm_id_priv->compare_data);
- cm_id_priv->compare_data = NULL;
+ --cm_id_priv->listen_sharecount;
ret = -EBUSY;
}
return ret;
}
+
+int ib_cm_listen(struct ib_cm_id *cm_id, __be64 service_id, __be64 service_mask)
+{
+ unsigned long flags;
+ int ret;
+
+ spin_lock_irqsave(&cm.lock, flags);
+ ret = __ib_cm_listen(cm_id, service_id, service_mask);
+ spin_unlock_irqrestore(&cm.lock, flags);
+
+ return ret;
+}
EXPORT_SYMBOL(ib_cm_listen);
+/**
+ * Create a new listening ib_cm_id and listen on the given service ID.
+ *
+ * If there's an existing ID listening on that same device and service ID,
+ * return it.
+ *
+ * @device: Device associated with the cm_id. All related communication will
+ * be associated with the specified device.
+ * @cm_handler: Callback invoked to notify the user of CM events.
+ * @service_id: Service identifier matched against incoming connection
+ * and service ID resolution requests. The service ID should be specified
+ * network-byte order. If set to IB_CM_ASSIGN_SERVICE_ID, the CM will
+ * assign a service ID to the caller.
+ *
+ * Callers should call ib_destroy_cm_id when done with the listener ID.
+ */
+struct ib_cm_id *ib_cm_insert_listen(struct ib_device *device,
+ ib_cm_handler cm_handler,
+ __be64 service_id)
+{
+ struct cm_id_private *cm_id_priv;
+ struct ib_cm_id *cm_id;
+ unsigned long flags;
+ int err = 0;
+
+ /* Create an ID in advance, since the creation may sleep */
+ cm_id = ib_create_cm_id(device, cm_handler, NULL);
+ if (IS_ERR(cm_id))
+ return cm_id;
+
+ spin_lock_irqsave(&cm.lock, flags);
+
+ if (service_id == IB_CM_ASSIGN_SERVICE_ID)
+ goto new_id;
+
+ /* Find an existing ID */
+ cm_id_priv = cm_find_listen(device, service_id);
+ if (cm_id_priv) {
+ if (cm_id->cm_handler != cm_handler || cm_id->context) {
+ /* Sharing an ib_cm_id with different handlers is not
+ * supported */
+ spin_unlock_irqrestore(&cm.lock, flags);
+ return ERR_PTR(-EINVAL);
+ }
+ atomic_inc(&cm_id_priv->refcount);
+ ++cm_id_priv->listen_sharecount;
+ spin_unlock_irqrestore(&cm.lock, flags);
+
+ ib_destroy_cm_id(cm_id);
+ cm_id = &cm_id_priv->id;
+ return cm_id;
+ }
+
+new_id:
+ /* Use newly created ID */
+ err = __ib_cm_listen(cm_id, service_id, 0);
+
+ spin_unlock_irqrestore(&cm.lock, flags);
+
+ if (err) {
+ ib_destroy_cm_id(cm_id);
+ return ERR_PTR(err);
+ }
+ return cm_id;
+}
+EXPORT_SYMBOL(ib_cm_insert_listen);
+
static __be64 cm_form_tid(struct cm_id_private *cm_id_priv,
enum cm_msg_sequence msg_seq)
{
@@ -1268,6 +1307,7 @@ static void cm_format_paths_from_req(struct cm_req_msg *req_msg,
primary_path->packet_life_time =
cm_req_get_primary_local_ack_timeout(req_msg);
primary_path->packet_life_time -= (primary_path->packet_life_time > 0);
+ primary_path->service_id = req_msg->service_id;
if (req_msg->alt_local_lid) {
memset(alt_path, 0, sizeof *alt_path);
@@ -1289,7 +1329,26 @@ static void cm_format_paths_from_req(struct cm_req_msg *req_msg,
alt_path->packet_life_time =
cm_req_get_alt_local_ack_timeout(req_msg);
alt_path->packet_life_time -= (alt_path->packet_life_time > 0);
+ alt_path->service_id = req_msg->service_id;
+ }
+}
+
+static u16 cm_get_bth_pkey(struct cm_work *work)
+{
+ struct ib_device *ib_dev = work->port->cm_dev->ib_device;
+ u8 port_num = work->port->port_num;
+ u16 pkey_index = work->mad_recv_wc->wc->pkey_index;
+ u16 pkey;
+ int ret;
+
+ ret = ib_get_cached_pkey(ib_dev, port_num, pkey_index, &pkey);
+ if (ret) {
+ dev_warn_ratelimited(&ib_dev->dev, "ib_cm: Couldn't retrieve pkey for incoming request (port %d, pkey index %d). %d\n",
+ port_num, pkey_index, ret);
+ return 0;
}
+
+ return pkey;
}
static void cm_format_req_event(struct cm_work *work,
@@ -1302,6 +1361,7 @@ static void cm_format_req_event(struct cm_work *work,
req_msg = (struct cm_req_msg *)work->mad_recv_wc->recv_buf.mad;
param = &work->cm_event.param.req_rcvd;
param->listen_id = listen_id;
+ param->bth_pkey = cm_get_bth_pkey(work);
param->port = cm_id_priv->av.port->port_num;
param->primary_path = &work->path[0];
if (req_msg->alt_local_lid)
@@ -1484,8 +1544,7 @@ static struct cm_id_private * cm_match_req(struct cm_work *work,
/* Find matching listen request. */
listen_cm_id_priv = cm_find_listen(cm_id_priv->id.device,
- req_msg->service_id,
- req_msg->private_data);
+ req_msg->service_id);
if (!listen_cm_id_priv) {
cm_cleanup_timewait(cm_id_priv->timewait_info);
spin_unlock_irq(&cm.lock);
@@ -2992,6 +3051,8 @@ static void cm_format_sidr_req_event(struct cm_work *work,
param = &work->cm_event.param.sidr_req_rcvd;
param->pkey = __be16_to_cpu(sidr_req_msg->pkey);
param->listen_id = listen_id;
+ param->service_id = sidr_req_msg->service_id;
+ param->bth_pkey = cm_get_bth_pkey(work);
param->port = work->port->port_num;
work->cm_event.private_data = &sidr_req_msg->private_data;
}
@@ -3031,8 +3092,7 @@ static int cm_sidr_req_handler(struct cm_work *work)
}
cm_id_priv->id.state = IB_CM_SIDR_REQ_RCVD;
cur_cm_id_priv = cm_find_listen(cm_id->device,
- sidr_req_msg->service_id,
- sidr_req_msg->private_data);
+ sidr_req_msg->service_id);
if (!cur_cm_id_priv) {
spin_unlock_irq(&cm.lock);
cm_reject_sidr_req(cm_id_priv, IB_SIDR_UNSUPPORTED);
@@ -3886,9 +3946,9 @@ free:
kfree(cm_dev);
}
-static void cm_remove_one(struct ib_device *ib_device)
+static void cm_remove_one(struct ib_device *ib_device, void *client_data)
{
- struct cm_device *cm_dev;
+ struct cm_device *cm_dev = client_data;
struct cm_port *port;
struct ib_port_modify port_modify = {
.clr_port_cap_mask = IB_PORT_CM_SUP
@@ -3896,7 +3956,6 @@ static void cm_remove_one(struct ib_device *ib_device)
unsigned long flags;
int i;
- cm_dev = ib_get_client_data(ib_device, &cm_client);
if (!cm_dev)
return;
diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c
index 143ded2bbe7c..b1ab13f3e182 100644
--- a/drivers/infiniband/core/cma.c
+++ b/drivers/infiniband/core/cma.c
@@ -46,6 +46,8 @@
#include <net/tcp.h>
#include <net/ipv6.h>
+#include <net/ip_fib.h>
+#include <net/ip6_route.h>
#include <rdma/rdma_cm.h>
#include <rdma/rdma_cm_ib.h>
@@ -94,7 +96,7 @@ const char *rdma_event_msg(enum rdma_cm_event_type event)
EXPORT_SYMBOL(rdma_event_msg);
static void cma_add_one(struct ib_device *device);
-static void cma_remove_one(struct ib_device *device);
+static void cma_remove_one(struct ib_device *device, void *client_data);
static struct ib_client cma_client = {
.name = "cma",
@@ -113,6 +115,22 @@ static DEFINE_IDR(udp_ps);
static DEFINE_IDR(ipoib_ps);
static DEFINE_IDR(ib_ps);
+static struct idr *cma_idr(enum rdma_port_space ps)
+{
+ switch (ps) {
+ case RDMA_PS_TCP:
+ return &tcp_ps;
+ case RDMA_PS_UDP:
+ return &udp_ps;
+ case RDMA_PS_IPOIB:
+ return &ipoib_ps;
+ case RDMA_PS_IB:
+ return &ib_ps;
+ default:
+ return NULL;
+ }
+}
+
struct cma_device {
struct list_head list;
struct ib_device *device;
@@ -122,11 +140,33 @@ struct cma_device {
};
struct rdma_bind_list {
- struct idr *ps;
+ enum rdma_port_space ps;
struct hlist_head owners;
unsigned short port;
};
+static int cma_ps_alloc(enum rdma_port_space ps,
+ struct rdma_bind_list *bind_list, int snum)
+{
+ struct idr *idr = cma_idr(ps);
+
+ return idr_alloc(idr, bind_list, snum, snum + 1, GFP_KERNEL);
+}
+
+static struct rdma_bind_list *cma_ps_find(enum rdma_port_space ps, int snum)
+{
+ struct idr *idr = cma_idr(ps);
+
+ return idr_find(idr, snum);
+}
+
+static void cma_ps_remove(enum rdma_port_space ps, int snum)
+{
+ struct idr *idr = cma_idr(ps);
+
+ idr_remove(idr, snum);
+}
+
enum {
CMA_OPTION_AFONLY,
};
@@ -225,6 +265,15 @@ struct cma_hdr {
#define CMA_VERSION 0x00
+struct cma_req_info {
+ struct ib_device *device;
+ int port;
+ union ib_gid local_gid;
+ __be64 service_id;
+ u16 pkey;
+ bool has_gid:1;
+};
+
static int cma_comp(struct rdma_id_private *id_priv, enum rdma_cm_state comp)
{
unsigned long flags;
@@ -262,7 +311,7 @@ static enum rdma_cm_state cma_exch(struct rdma_id_private *id_priv,
return old;
}
-static inline u8 cma_get_ip_ver(struct cma_hdr *hdr)
+static inline u8 cma_get_ip_ver(const struct cma_hdr *hdr)
{
return hdr->ip_version >> 4;
}
@@ -870,107 +919,397 @@ static inline int cma_any_port(struct sockaddr *addr)
return !cma_port(addr);
}
-static void cma_save_ib_info(struct rdma_cm_id *id, struct rdma_cm_id *listen_id,
+static void cma_save_ib_info(struct sockaddr *src_addr,
+ struct sockaddr *dst_addr,
+ struct rdma_cm_id *listen_id,
struct ib_sa_path_rec *path)
{
struct sockaddr_ib *listen_ib, *ib;
listen_ib = (struct sockaddr_ib *) &listen_id->route.addr.src_addr;
- ib = (struct sockaddr_ib *) &id->route.addr.src_addr;
- ib->sib_family = listen_ib->sib_family;
- if (path) {
- ib->sib_pkey = path->pkey;
- ib->sib_flowinfo = path->flow_label;
- memcpy(&ib->sib_addr, &path->sgid, 16);
- } else {
- ib->sib_pkey = listen_ib->sib_pkey;
- ib->sib_flowinfo = listen_ib->sib_flowinfo;
- ib->sib_addr = listen_ib->sib_addr;
- }
- ib->sib_sid = listen_ib->sib_sid;
- ib->sib_sid_mask = cpu_to_be64(0xffffffffffffffffULL);
- ib->sib_scope_id = listen_ib->sib_scope_id;
-
- if (path) {
- ib = (struct sockaddr_ib *) &id->route.addr.dst_addr;
- ib->sib_family = listen_ib->sib_family;
- ib->sib_pkey = path->pkey;
- ib->sib_flowinfo = path->flow_label;
- memcpy(&ib->sib_addr, &path->dgid, 16);
+ if (src_addr) {
+ ib = (struct sockaddr_ib *)src_addr;
+ ib->sib_family = AF_IB;
+ if (path) {
+ ib->sib_pkey = path->pkey;
+ ib->sib_flowinfo = path->flow_label;
+ memcpy(&ib->sib_addr, &path->sgid, 16);
+ ib->sib_sid = path->service_id;
+ ib->sib_scope_id = 0;
+ } else {
+ ib->sib_pkey = listen_ib->sib_pkey;
+ ib->sib_flowinfo = listen_ib->sib_flowinfo;
+ ib->sib_addr = listen_ib->sib_addr;
+ ib->sib_sid = listen_ib->sib_sid;
+ ib->sib_scope_id = listen_ib->sib_scope_id;
+ }
+ ib->sib_sid_mask = cpu_to_be64(0xffffffffffffffffULL);
+ }
+ if (dst_addr) {
+ ib = (struct sockaddr_ib *)dst_addr;
+ ib->sib_family = AF_IB;
+ if (path) {
+ ib->sib_pkey = path->pkey;
+ ib->sib_flowinfo = path->flow_label;
+ memcpy(&ib->sib_addr, &path->dgid, 16);
+ }
}
}
-static __be16 ss_get_port(const struct sockaddr_storage *ss)
-{
- if (ss->ss_family == AF_INET)
- return ((struct sockaddr_in *)ss)->sin_port;
- else if (ss->ss_family == AF_INET6)
- return ((struct sockaddr_in6 *)ss)->sin6_port;
- BUG();
-}
-
-static void cma_save_ip4_info(struct rdma_cm_id *id, struct rdma_cm_id *listen_id,
- struct cma_hdr *hdr)
+static void cma_save_ip4_info(struct sockaddr *src_addr,
+ struct sockaddr *dst_addr,
+ struct cma_hdr *hdr,
+ __be16 local_port)
{
struct sockaddr_in *ip4;
- ip4 = (struct sockaddr_in *) &id->route.addr.src_addr;
- ip4->sin_family = AF_INET;
- ip4->sin_addr.s_addr = hdr->dst_addr.ip4.addr;
- ip4->sin_port = ss_get_port(&listen_id->route.addr.src_addr);
+ if (src_addr) {
+ ip4 = (struct sockaddr_in *)src_addr;
+ ip4->sin_family = AF_INET;
+ ip4->sin_addr.s_addr = hdr->dst_addr.ip4.addr;
+ ip4->sin_port = local_port;
+ }
- ip4 = (struct sockaddr_in *) &id->route.addr.dst_addr;
- ip4->sin_family = AF_INET;
- ip4->sin_addr.s_addr = hdr->src_addr.ip4.addr;
- ip4->sin_port = hdr->port;
+ if (dst_addr) {
+ ip4 = (struct sockaddr_in *)dst_addr;
+ ip4->sin_family = AF_INET;
+ ip4->sin_addr.s_addr = hdr->src_addr.ip4.addr;
+ ip4->sin_port = hdr->port;
+ }
}
-static void cma_save_ip6_info(struct rdma_cm_id *id, struct rdma_cm_id *listen_id,
- struct cma_hdr *hdr)
+static void cma_save_ip6_info(struct sockaddr *src_addr,
+ struct sockaddr *dst_addr,
+ struct cma_hdr *hdr,
+ __be16 local_port)
{
struct sockaddr_in6 *ip6;
- ip6 = (struct sockaddr_in6 *) &id->route.addr.src_addr;
- ip6->sin6_family = AF_INET6;
- ip6->sin6_addr = hdr->dst_addr.ip6;
- ip6->sin6_port = ss_get_port(&listen_id->route.addr.src_addr);
+ if (src_addr) {
+ ip6 = (struct sockaddr_in6 *)src_addr;
+ ip6->sin6_family = AF_INET6;
+ ip6->sin6_addr = hdr->dst_addr.ip6;
+ ip6->sin6_port = local_port;
+ }
- ip6 = (struct sockaddr_in6 *) &id->route.addr.dst_addr;
- ip6->sin6_family = AF_INET6;
- ip6->sin6_addr = hdr->src_addr.ip6;
- ip6->sin6_port = hdr->port;
+ if (dst_addr) {
+ ip6 = (struct sockaddr_in6 *)dst_addr;
+ ip6->sin6_family = AF_INET6;
+ ip6->sin6_addr = hdr->src_addr.ip6;
+ ip6->sin6_port = hdr->port;
+ }
}
-static int cma_save_net_info(struct rdma_cm_id *id, struct rdma_cm_id *listen_id,
- struct ib_cm_event *ib_event)
+static u16 cma_port_from_service_id(__be64 service_id)
{
- struct cma_hdr *hdr;
+ return (u16)be64_to_cpu(service_id);
+}
- if (listen_id->route.addr.src_addr.ss_family == AF_IB) {
- if (ib_event->event == IB_CM_REQ_RECEIVED)
- cma_save_ib_info(id, listen_id, ib_event->param.req_rcvd.primary_path);
- else if (ib_event->event == IB_CM_SIDR_REQ_RECEIVED)
- cma_save_ib_info(id, listen_id, NULL);
- return 0;
- }
+static int cma_save_ip_info(struct sockaddr *src_addr,
+ struct sockaddr *dst_addr,
+ struct ib_cm_event *ib_event,
+ __be64 service_id)
+{
+ struct cma_hdr *hdr;
+ __be16 port;
hdr = ib_event->private_data;
if (hdr->cma_version != CMA_VERSION)
return -EINVAL;
+ port = htons(cma_port_from_service_id(service_id));
+
switch (cma_get_ip_ver(hdr)) {
case 4:
- cma_save_ip4_info(id, listen_id, hdr);
+ cma_save_ip4_info(src_addr, dst_addr, hdr, port);
break;
case 6:
- cma_save_ip6_info(id, listen_id, hdr);
+ cma_save_ip6_info(src_addr, dst_addr, hdr, port);
+ break;
+ default:
+ return -EAFNOSUPPORT;
+ }
+
+ return 0;
+}
+
+static int cma_save_net_info(struct sockaddr *src_addr,
+ struct sockaddr *dst_addr,
+ struct rdma_cm_id *listen_id,
+ struct ib_cm_event *ib_event,
+ sa_family_t sa_family, __be64 service_id)
+{
+ if (sa_family == AF_IB) {
+ if (ib_event->event == IB_CM_REQ_RECEIVED)
+ cma_save_ib_info(src_addr, dst_addr, listen_id,
+ ib_event->param.req_rcvd.primary_path);
+ else if (ib_event->event == IB_CM_SIDR_REQ_RECEIVED)
+ cma_save_ib_info(src_addr, dst_addr, listen_id, NULL);
+ return 0;
+ }
+
+ return cma_save_ip_info(src_addr, dst_addr, ib_event, service_id);
+}
+
+static int cma_save_req_info(const struct ib_cm_event *ib_event,
+ struct cma_req_info *req)
+{
+ const struct ib_cm_req_event_param *req_param =
+ &ib_event->param.req_rcvd;
+ const struct ib_cm_sidr_req_event_param *sidr_param =
+ &ib_event->param.sidr_req_rcvd;
+
+ switch (ib_event->event) {
+ case IB_CM_REQ_RECEIVED:
+ req->device = req_param->listen_id->device;
+ req->port = req_param->port;
+ memcpy(&req->local_gid, &req_param->primary_path->sgid,
+ sizeof(req->local_gid));
+ req->has_gid = true;
+ req->service_id = req_param->primary_path->service_id;
+ req->pkey = req_param->bth_pkey;
+ break;
+ case IB_CM_SIDR_REQ_RECEIVED:
+ req->device = sidr_param->listen_id->device;
+ req->port = sidr_param->port;
+ req->has_gid = false;
+ req->service_id = sidr_param->service_id;
+ req->pkey = sidr_param->bth_pkey;
break;
default:
return -EINVAL;
}
+
return 0;
}
+static bool validate_ipv4_net_dev(struct net_device *net_dev,
+ const struct sockaddr_in *dst_addr,
+ const struct sockaddr_in *src_addr)
+{
+ __be32 daddr = dst_addr->sin_addr.s_addr,
+ saddr = src_addr->sin_addr.s_addr;
+ struct fib_result res;
+ struct flowi4 fl4;
+ int err;
+ bool ret;
+
+ if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
+ ipv4_is_lbcast(daddr) || ipv4_is_zeronet(saddr) ||
+ ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr) ||
+ ipv4_is_loopback(saddr))
+ return false;
+
+ memset(&fl4, 0, sizeof(fl4));
+ fl4.flowi4_iif = net_dev->ifindex;
+ fl4.daddr = daddr;
+ fl4.saddr = saddr;
+
+ rcu_read_lock();
+ err = fib_lookup(dev_net(net_dev), &fl4, &res, 0);
+ if (err)
+ return false;
+
+ ret = FIB_RES_DEV(res) == net_dev;
+ rcu_read_unlock();
+
+ return ret;
+}
+
+static bool validate_ipv6_net_dev(struct net_device *net_dev,
+ const struct sockaddr_in6 *dst_addr,
+ const struct sockaddr_in6 *src_addr)
+{
+#if IS_ENABLED(CONFIG_IPV6)
+ const int strict = ipv6_addr_type(&dst_addr->sin6_addr) &
+ IPV6_ADDR_LINKLOCAL;
+ struct rt6_info *rt = rt6_lookup(dev_net(net_dev), &dst_addr->sin6_addr,
+ &src_addr->sin6_addr, net_dev->ifindex,
+ strict);
+ bool ret;
+
+ if (!rt)
+ return false;
+
+ ret = rt->rt6i_idev->dev == net_dev;
+ ip6_rt_put(rt);
+
+ return ret;
+#else
+ return false;
+#endif
+}
+
+static bool validate_net_dev(struct net_device *net_dev,
+ const struct sockaddr *daddr,
+ const struct sockaddr *saddr)
+{
+ const struct sockaddr_in *daddr4 = (const struct sockaddr_in *)daddr;
+ const struct sockaddr_in *saddr4 = (const struct sockaddr_in *)saddr;
+ const struct sockaddr_in6 *daddr6 = (const struct sockaddr_in6 *)daddr;
+ const struct sockaddr_in6 *saddr6 = (const struct sockaddr_in6 *)saddr;
+
+ switch (daddr->sa_family) {
+ case AF_INET:
+ return saddr->sa_family == AF_INET &&
+ validate_ipv4_net_dev(net_dev, daddr4, saddr4);
+
+ case AF_INET6:
+ return saddr->sa_family == AF_INET6 &&
+ validate_ipv6_net_dev(net_dev, daddr6, saddr6);
+
+ default:
+ return false;
+ }
+}
+
+static struct net_device *cma_get_net_dev(struct ib_cm_event *ib_event,
+ const struct cma_req_info *req)
+{
+ struct sockaddr_storage listen_addr_storage, src_addr_storage;
+ struct sockaddr *listen_addr = (struct sockaddr *)&listen_addr_storage,
+ *src_addr = (struct sockaddr *)&src_addr_storage;
+ struct net_device *net_dev;
+ const union ib_gid *gid = req->has_gid ? &req->local_gid : NULL;
+ int err;
+
+ err = cma_save_ip_info(listen_addr, src_addr, ib_event,
+ req->service_id);
+ if (err)
+ return ERR_PTR(err);
+
+ net_dev = ib_get_net_dev_by_params(req->device, req->port, req->pkey,
+ gid, listen_addr);
+ if (!net_dev)
+ return ERR_PTR(-ENODEV);
+
+ if (!validate_net_dev(net_dev, listen_addr, src_addr)) {
+ dev_put(net_dev);
+ return ERR_PTR(-EHOSTUNREACH);
+ }
+
+ return net_dev;
+}
+
+static enum rdma_port_space rdma_ps_from_service_id(__be64 service_id)
+{
+ return (be64_to_cpu(service_id) >> 16) & 0xffff;
+}
+
+static bool cma_match_private_data(struct rdma_id_private *id_priv,
+ const struct cma_hdr *hdr)
+{
+ struct sockaddr *addr = cma_src_addr(id_priv);
+ __be32 ip4_addr;
+ struct in6_addr ip6_addr;
+
+ if (cma_any_addr(addr) && !id_priv->afonly)
+ return true;
+
+ switch (addr->sa_family) {
+ case AF_INET:
+ ip4_addr = ((struct sockaddr_in *)addr)->sin_addr.s_addr;
+ if (cma_get_ip_ver(hdr) != 4)
+ return false;
+ if (!cma_any_addr(addr) &&
+ hdr->dst_addr.ip4.addr != ip4_addr)
+ return false;
+ break;
+ case AF_INET6:
+ ip6_addr = ((struct sockaddr_in6 *)addr)->sin6_addr;
+ if (cma_get_ip_ver(hdr) != 6)
+ return false;
+ if (!cma_any_addr(addr) &&
+ memcmp(&hdr->dst_addr.ip6, &ip6_addr, sizeof(ip6_addr)))
+ return false;
+ break;
+ case AF_IB:
+ return true;
+ default:
+ return false;
+ }
+
+ return true;
+}
+
+static bool cma_match_net_dev(const struct rdma_id_private *id_priv,
+ const struct net_device *net_dev)
+{
+ const struct rdma_addr *addr = &id_priv->id.route.addr;
+
+ if (!net_dev)
+ /* This request is an AF_IB request */
+ return addr->src_addr.ss_family == AF_IB;
+
+ return !addr->dev_addr.bound_dev_if ||
+ (net_eq(dev_net(net_dev), &init_net) &&
+ addr->dev_addr.bound_dev_if == net_dev->ifindex);
+}
+
+static struct rdma_id_private *cma_find_listener(
+ const struct rdma_bind_list *bind_list,
+ const struct ib_cm_id *cm_id,
+ const struct ib_cm_event *ib_event,
+ const struct cma_req_info *req,
+ const struct net_device *net_dev)
+{
+ struct rdma_id_private *id_priv, *id_priv_dev;
+
+ if (!bind_list)
+ return ERR_PTR(-EINVAL);
+
+ hlist_for_each_entry(id_priv, &bind_list->owners, node) {
+ if (cma_match_private_data(id_priv, ib_event->private_data)) {
+ if (id_priv->id.device == cm_id->device &&
+ cma_match_net_dev(id_priv, net_dev))
+ return id_priv;
+ list_for_each_entry(id_priv_dev,
+ &id_priv->listen_list,
+ listen_list) {
+ if (id_priv_dev->id.device == cm_id->device &&
+ cma_match_net_dev(id_priv_dev, net_dev))
+ return id_priv_dev;
+ }
+ }
+ }
+
+ return ERR_PTR(-EINVAL);
+}
+
+static struct rdma_id_private *cma_id_from_event(struct ib_cm_id *cm_id,
+ struct ib_cm_event *ib_event,
+ struct net_device **net_dev)
+{
+ struct cma_req_info req;
+ struct rdma_bind_list *bind_list;
+ struct rdma_id_private *id_priv;
+ int err;
+
+ err = cma_save_req_info(ib_event, &req);
+ if (err)
+ return ERR_PTR(err);
+
+ *net_dev = cma_get_net_dev(ib_event, &req);
+ if (IS_ERR(*net_dev)) {
+ if (PTR_ERR(*net_dev) == -EAFNOSUPPORT) {
+ /* Assuming the protocol is AF_IB */
+ *net_dev = NULL;
+ } else {
+ return ERR_CAST(*net_dev);
+ }
+ }
+
+ bind_list = cma_ps_find(rdma_ps_from_service_id(req.service_id),
+ cma_port_from_service_id(req.service_id));
+ id_priv = cma_find_listener(bind_list, cm_id, ib_event, &req, *net_dev);
+ if (IS_ERR(id_priv)) {
+ dev_put(*net_dev);
+ *net_dev = NULL;
+ }
+
+ return id_priv;
+}
+
static inline int cma_user_data_offset(struct rdma_id_private *id_priv)
{
return cma_family(id_priv) == AF_IB ? 0 : sizeof(struct cma_hdr);
@@ -1038,7 +1377,7 @@ static void cma_release_port(struct rdma_id_private *id_priv)
mutex_lock(&lock);
hlist_del(&id_priv->node);
if (hlist_empty(&bind_list->owners)) {
- idr_remove(bind_list->ps, bind_list->port);
+ cma_ps_remove(bind_list->ps, bind_list->port);
kfree(bind_list);
}
mutex_unlock(&lock);
@@ -1216,11 +1555,15 @@ out:
}
static struct rdma_id_private *cma_new_conn_id(struct rdma_cm_id *listen_id,
- struct ib_cm_event *ib_event)
+ struct ib_cm_event *ib_event,
+ struct net_device *net_dev)
{
struct rdma_id_private *id_priv;
struct rdma_cm_id *id;
struct rdma_route *rt;
+ const sa_family_t ss_family = listen_id->route.addr.src_addr.ss_family;
+ const __be64 service_id =
+ ib_event->param.req_rcvd.primary_path->service_id;
int ret;
id = rdma_create_id(listen_id->event_handler, listen_id->context,
@@ -1229,7 +1572,9 @@ static struct rdma_id_private *cma_new_conn_id(struct rdma_cm_id *listen_id,
return NULL;
id_priv = container_of(id, struct rdma_id_private, id);
- if (cma_save_net_info(id, listen_id, ib_event))
+ if (cma_save_net_info((struct sockaddr *)&id->route.addr.src_addr,
+ (struct sockaddr *)&id->route.addr.dst_addr,
+ listen_id, ib_event, ss_family, service_id))
goto err;
rt = &id->route;
@@ -1243,14 +1588,16 @@ static struct rdma_id_private *cma_new_conn_id(struct rdma_cm_id *listen_id,
if (rt->num_paths == 2)
rt->path_rec[1] = *ib_event->param.req_rcvd.alternate_path;
- if (cma_any_addr(cma_src_addr(id_priv))) {
- rt->addr.dev_addr.dev_type = ARPHRD_INFINIBAND;
- rdma_addr_set_sgid(&rt->addr.dev_addr, &rt->path_rec[0].sgid);
- ib_addr_set_pkey(&rt->addr.dev_addr, be16_to_cpu(rt->path_rec[0].pkey));
- } else {
- ret = cma_translate_addr(cma_src_addr(id_priv), &rt->addr.dev_addr);
+ if (net_dev) {
+ ret = rdma_copy_addr(&rt->addr.dev_addr, net_dev, NULL);
if (ret)
goto err;
+ } else {
+ /* An AF_IB connection */
+ WARN_ON_ONCE(ss_family != AF_IB);
+
+ cma_translate_ib((struct sockaddr_ib *)cma_src_addr(id_priv),
+ &rt->addr.dev_addr);
}
rdma_addr_set_dgid(&rt->addr.dev_addr, &rt->path_rec[0].dgid);
@@ -1263,10 +1610,12 @@ err:
}
static struct rdma_id_private *cma_new_udp_id(struct rdma_cm_id *listen_id,
- struct ib_cm_event *ib_event)
+ struct ib_cm_event *ib_event,
+ struct net_device *net_dev)
{
struct rdma_id_private *id_priv;
struct rdma_cm_id *id;
+ const sa_family_t ss_family = listen_id->route.addr.src_addr.ss_family;
int ret;
id = rdma_create_id(listen_id->event_handler, listen_id->context,
@@ -1275,13 +1624,24 @@ static struct rdma_id_private *cma_new_udp_id(struct rdma_cm_id *listen_id,
return NULL;
id_priv = container_of(id, struct rdma_id_private, id);
- if (cma_save_net_info(id, listen_id, ib_event))
+ if (cma_save_net_info((struct sockaddr *)&id->route.addr.src_addr,
+ (struct sockaddr *)&id->route.addr.dst_addr,
+ listen_id, ib_event, ss_family,
+ ib_event->param.sidr_req_rcvd.service_id))
goto err;
- if (!cma_any_addr((struct sockaddr *) &id->route.addr.src_addr)) {
- ret = cma_translate_addr(cma_src_addr(id_priv), &id->route.addr.dev_addr);
+ if (net_dev) {
+ ret = rdma_copy_addr(&id->route.addr.dev_addr, net_dev, NULL);
if (ret)
goto err;
+ } else {
+ /* An AF_IB connection */
+ WARN_ON_ONCE(ss_family != AF_IB);
+
+ if (!cma_any_addr(cma_src_addr(id_priv)))
+ cma_translate_ib((struct sockaddr_ib *)
+ cma_src_addr(id_priv),
+ &id->route.addr.dev_addr);
}
id_priv->state = RDMA_CM_CONNECT;
@@ -1319,25 +1679,33 @@ static int cma_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event)
{
struct rdma_id_private *listen_id, *conn_id;
struct rdma_cm_event event;
+ struct net_device *net_dev;
int offset, ret;
- listen_id = cm_id->context;
- if (!cma_check_req_qp_type(&listen_id->id, ib_event))
- return -EINVAL;
+ listen_id = cma_id_from_event(cm_id, ib_event, &net_dev);
+ if (IS_ERR(listen_id))
+ return PTR_ERR(listen_id);
- if (cma_disable_callback(listen_id, RDMA_CM_LISTEN))
- return -ECONNABORTED;
+ if (!cma_check_req_qp_type(&listen_id->id, ib_event)) {
+ ret = -EINVAL;
+ goto net_dev_put;
+ }
+
+ if (cma_disable_callback(listen_id, RDMA_CM_LISTEN)) {
+ ret = -ECONNABORTED;
+ goto net_dev_put;
+ }
memset(&event, 0, sizeof event);
offset = cma_user_data_offset(listen_id);
event.event = RDMA_CM_EVENT_CONNECT_REQUEST;
if (ib_event->event == IB_CM_SIDR_REQ_RECEIVED) {
- conn_id = cma_new_udp_id(&listen_id->id, ib_event);
+ conn_id = cma_new_udp_id(&listen_id->id, ib_event, net_dev);
event.param.ud.private_data = ib_event->private_data + offset;
event.param.ud.private_data_len =
IB_CM_SIDR_REQ_PRIVATE_DATA_SIZE - offset;
} else {
- conn_id = cma_new_conn_id(&listen_id->id, ib_event);
+ conn_id = cma_new_conn_id(&listen_id->id, ib_event, net_dev);
cma_set_req_event_data(&event, &ib_event->param.req_rcvd,
ib_event->private_data, offset);
}
@@ -1375,6 +1743,8 @@ static int cma_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event)
mutex_unlock(&conn_id->handler_mutex);
mutex_unlock(&listen_id->handler_mutex);
cma_deref_id(conn_id);
+ if (net_dev)
+ dev_put(net_dev);
return 0;
err3:
@@ -1388,6 +1758,11 @@ err1:
mutex_unlock(&listen_id->handler_mutex);
if (conn_id)
rdma_destroy_id(&conn_id->id);
+
+net_dev_put:
+ if (net_dev)
+ dev_put(net_dev);
+
return ret;
}
@@ -1400,42 +1775,6 @@ __be64 rdma_get_service_id(struct rdma_cm_id *id, struct sockaddr *addr)
}
EXPORT_SYMBOL(rdma_get_service_id);
-static void cma_set_compare_data(enum rdma_port_space ps, struct sockaddr *addr,
- struct ib_cm_compare_data *compare)
-{
- struct cma_hdr *cma_data, *cma_mask;
- __be32 ip4_addr;
- struct in6_addr ip6_addr;
-
- memset(compare, 0, sizeof *compare);
- cma_data = (void *) compare->data;
- cma_mask = (void *) compare->mask;
-
- switch (addr->sa_family) {
- case AF_INET:
- ip4_addr = ((struct sockaddr_in *) addr)->sin_addr.s_addr;
- cma_set_ip_ver(cma_data, 4);
- cma_set_ip_ver(cma_mask, 0xF);
- if (!cma_any_addr(addr)) {
- cma_data->dst_addr.ip4.addr = ip4_addr;
- cma_mask->dst_addr.ip4.addr = htonl(~0);
- }
- break;
- case AF_INET6:
- ip6_addr = ((struct sockaddr_in6 *) addr)->sin6_addr;
- cma_set_ip_ver(cma_data, 6);
- cma_set_ip_ver(cma_mask, 0xF);
- if (!cma_any_addr(addr)) {
- cma_data->dst_addr.ip6 = ip6_addr;
- memset(&cma_mask->dst_addr.ip6, 0xFF,
- sizeof cma_mask->dst_addr.ip6);
- }
- break;
- default:
- break;
- }
-}
-
static int cma_iw_handler(struct iw_cm_id *iw_id, struct iw_cm_event *iw_event)
{
struct rdma_id_private *id_priv = iw_id->context;
@@ -1589,33 +1928,18 @@ out:
static int cma_ib_listen(struct rdma_id_private *id_priv)
{
- struct ib_cm_compare_data compare_data;
struct sockaddr *addr;
struct ib_cm_id *id;
__be64 svc_id;
- int ret;
- id = ib_create_cm_id(id_priv->id.device, cma_req_handler, id_priv);
+ addr = cma_src_addr(id_priv);
+ svc_id = rdma_get_service_id(&id_priv->id, addr);
+ id = ib_cm_insert_listen(id_priv->id.device, cma_req_handler, svc_id);
if (IS_ERR(id))
return PTR_ERR(id);
-
id_priv->cm_id.ib = id;
- addr = cma_src_addr(id_priv);
- svc_id = rdma_get_service_id(&id_priv->id, addr);
- if (cma_any_addr(addr) && !id_priv->afonly)
- ret = ib_cm_listen(id_priv->cm_id.ib, svc_id, 0, NULL);
- else {
- cma_set_compare_data(id_priv->id.ps, addr, &compare_data);
- ret = ib_cm_listen(id_priv->cm_id.ib, svc_id, 0, &compare_data);
- }
-
- if (ret) {
- ib_destroy_cm_id(id_priv->cm_id.ib);
- id_priv->cm_id.ib = NULL;
- }
-
- return ret;
+ return 0;
}
static int cma_iw_listen(struct rdma_id_private *id_priv, int backlog)
@@ -2203,8 +2527,11 @@ static int cma_bind_addr(struct rdma_cm_id *id, struct sockaddr *src_addr,
src_addr = (struct sockaddr *) &id->route.addr.src_addr;
src_addr->sa_family = dst_addr->sa_family;
if (dst_addr->sa_family == AF_INET6) {
- ((struct sockaddr_in6 *) src_addr)->sin6_scope_id =
- ((struct sockaddr_in6 *) dst_addr)->sin6_scope_id;
+ struct sockaddr_in6 *src_addr6 = (struct sockaddr_in6 *) src_addr;
+ struct sockaddr_in6 *dst_addr6 = (struct sockaddr_in6 *) dst_addr;
+ src_addr6->sin6_scope_id = dst_addr6->sin6_scope_id;
+ if (ipv6_addr_type(&dst_addr6->sin6_addr) & IPV6_ADDR_LINKLOCAL)
+ id->route.addr.dev_addr.bound_dev_if = dst_addr6->sin6_scope_id;
} else if (dst_addr->sa_family == AF_IB) {
((struct sockaddr_ib *) src_addr)->sib_pkey =
((struct sockaddr_ib *) dst_addr)->sib_pkey;
@@ -2325,8 +2652,8 @@ static void cma_bind_port(struct rdma_bind_list *bind_list,
hlist_add_head(&id_priv->node, &bind_list->owners);
}
-static int cma_alloc_port(struct idr *ps, struct rdma_id_private *id_priv,
- unsigned short snum)
+static int cma_alloc_port(enum rdma_port_space ps,
+ struct rdma_id_private *id_priv, unsigned short snum)
{
struct rdma_bind_list *bind_list;
int ret;
@@ -2335,7 +2662,7 @@ static int cma_alloc_port(struct idr *ps, struct rdma_id_private *id_priv,
if (!bind_list)
return -ENOMEM;
- ret = idr_alloc(ps, bind_list, snum, snum + 1, GFP_KERNEL);
+ ret = cma_ps_alloc(ps, bind_list, snum);
if (ret < 0)
goto err;
@@ -2348,7 +2675,8 @@ err:
return ret == -ENOSPC ? -EADDRNOTAVAIL : ret;
}
-static int cma_alloc_any_port(struct idr *ps, struct rdma_id_private *id_priv)
+static int cma_alloc_any_port(enum rdma_port_space ps,
+ struct rdma_id_private *id_priv)
{
static unsigned int last_used_port;
int low, high, remaining;
@@ -2359,7 +2687,7 @@ static int cma_alloc_any_port(struct idr *ps, struct rdma_id_private *id_priv)
rover = prandom_u32() % remaining + low;
retry:
if (last_used_port != rover &&
- !idr_find(ps, (unsigned short) rover)) {
+ !cma_ps_find(ps, (unsigned short)rover)) {
int ret = cma_alloc_port(ps, id_priv, rover);
/*
* Remember previously used port number in order to avoid
@@ -2414,7 +2742,8 @@ static int cma_check_port(struct rdma_bind_list *bind_list,
return 0;
}
-static int cma_use_port(struct idr *ps, struct rdma_id_private *id_priv)
+static int cma_use_port(enum rdma_port_space ps,
+ struct rdma_id_private *id_priv)
{
struct rdma_bind_list *bind_list;
unsigned short snum;
@@ -2424,7 +2753,7 @@ static int cma_use_port(struct idr *ps, struct rdma_id_private *id_priv)
if (snum < PROT_SOCK && !capable(CAP_NET_BIND_SERVICE))
return -EACCES;
- bind_list = idr_find(ps, snum);
+ bind_list = cma_ps_find(ps, snum);
if (!bind_list) {
ret = cma_alloc_port(ps, id_priv, snum);
} else {
@@ -2447,25 +2776,24 @@ static int cma_bind_listen(struct rdma_id_private *id_priv)
return ret;
}
-static struct idr *cma_select_inet_ps(struct rdma_id_private *id_priv)
+static enum rdma_port_space cma_select_inet_ps(
+ struct rdma_id_private *id_priv)
{
switch (id_priv->id.ps) {
case RDMA_PS_TCP:
- return &tcp_ps;
case RDMA_PS_UDP:
- return &udp_ps;
case RDMA_PS_IPOIB:
- return &ipoib_ps;
case RDMA_PS_IB:
- return &ib_ps;
+ return id_priv->id.ps;
default:
- return NULL;
+
+ return 0;
}
}
-static struct idr *cma_select_ib_ps(struct rdma_id_private *id_priv)
+static enum rdma_port_space cma_select_ib_ps(struct rdma_id_private *id_priv)
{
- struct idr *ps = NULL;
+ enum rdma_port_space ps = 0;
struct sockaddr_ib *sib;
u64 sid_ps, mask, sid;
@@ -2475,15 +2803,15 @@ static struct idr *cma_select_ib_ps(struct rdma_id_private *id_priv)
if ((id_priv->id.ps == RDMA_PS_IB) && (sid == (RDMA_IB_IP_PS_IB & mask))) {
sid_ps = RDMA_IB_IP_PS_IB;
- ps = &ib_ps;
+ ps = RDMA_PS_IB;
} else if (((id_priv->id.ps == RDMA_PS_IB) || (id_priv->id.ps == RDMA_PS_TCP)) &&
(sid == (RDMA_IB_IP_PS_TCP & mask))) {
sid_ps = RDMA_IB_IP_PS_TCP;
- ps = &tcp_ps;
+ ps = RDMA_PS_TCP;
} else if (((id_priv->id.ps == RDMA_PS_IB) || (id_priv->id.ps == RDMA_PS_UDP)) &&
(sid == (RDMA_IB_IP_PS_UDP & mask))) {
sid_ps = RDMA_IB_IP_PS_UDP;
- ps = &udp_ps;
+ ps = RDMA_PS_UDP;
}
if (ps) {
@@ -2496,7 +2824,7 @@ static struct idr *cma_select_ib_ps(struct rdma_id_private *id_priv)
static int cma_get_port(struct rdma_id_private *id_priv)
{
- struct idr *ps;
+ enum rdma_port_space ps;
int ret;
if (cma_family(id_priv) != AF_IB)
@@ -3551,11 +3879,10 @@ static void cma_process_remove(struct cma_device *cma_dev)
wait_for_completion(&cma_dev->comp);
}
-static void cma_remove_one(struct ib_device *device)
+static void cma_remove_one(struct ib_device *device, void *client_data)
{
- struct cma_device *cma_dev;
+ struct cma_device *cma_dev = client_data;
- cma_dev = ib_get_client_data(device, &cma_client);
if (!cma_dev)
return;
diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h
index 87d1936f5c1c..70bb36ebb03b 100644
--- a/drivers/infiniband/core/core_priv.h
+++ b/drivers/infiniband/core/core_priv.h
@@ -43,12 +43,58 @@ int ib_device_register_sysfs(struct ib_device *device,
u8, struct kobject *));
void ib_device_unregister_sysfs(struct ib_device *device);
-int ib_sysfs_setup(void);
-void ib_sysfs_cleanup(void);
-
-int ib_cache_setup(void);
+void ib_cache_setup(void);
void ib_cache_cleanup(void);
int ib_resolve_eth_l2_attrs(struct ib_qp *qp,
struct ib_qp_attr *qp_attr, int *qp_attr_mask);
+
+typedef void (*roce_netdev_callback)(struct ib_device *device, u8 port,
+ struct net_device *idev, void *cookie);
+
+typedef int (*roce_netdev_filter)(struct ib_device *device, u8 port,
+ struct net_device *idev, void *cookie);
+
+void ib_enum_roce_netdev(struct ib_device *ib_dev,
+ roce_netdev_filter filter,
+ void *filter_cookie,
+ roce_netdev_callback cb,
+ void *cookie);
+void ib_enum_all_roce_netdevs(roce_netdev_filter filter,
+ void *filter_cookie,
+ roce_netdev_callback cb,
+ void *cookie);
+
+int ib_cache_gid_find_by_port(struct ib_device *ib_dev,
+ const union ib_gid *gid,
+ u8 port, struct net_device *ndev,
+ u16 *index);
+
+enum ib_cache_gid_default_mode {
+ IB_CACHE_GID_DEFAULT_MODE_SET,
+ IB_CACHE_GID_DEFAULT_MODE_DELETE
+};
+
+void ib_cache_gid_set_default_gid(struct ib_device *ib_dev, u8 port,
+ struct net_device *ndev,
+ enum ib_cache_gid_default_mode mode);
+
+int ib_cache_gid_add(struct ib_device *ib_dev, u8 port,
+ union ib_gid *gid, struct ib_gid_attr *attr);
+
+int ib_cache_gid_del(struct ib_device *ib_dev, u8 port,
+ union ib_gid *gid, struct ib_gid_attr *attr);
+
+int ib_cache_gid_del_all_netdev_gids(struct ib_device *ib_dev, u8 port,
+ struct net_device *ndev);
+
+int roce_gid_mgmt_init(void);
+void roce_gid_mgmt_cleanup(void);
+
+int roce_rescan_device(struct ib_device *ib_dev);
+
+int ib_cache_setup_one(struct ib_device *device);
+void ib_cache_cleanup_one(struct ib_device *device);
+void ib_cache_release_one(struct ib_device *device);
+
#endif /* _CORE_PRIV_H */
diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
index 9567756ca4f9..17639117afc6 100644
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -38,7 +38,10 @@
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/mutex.h>
+#include <linux/netdevice.h>
#include <rdma/rdma_netlink.h>
+#include <rdma/ib_addr.h>
+#include <rdma/ib_cache.h>
#include "core_priv.h"
@@ -50,22 +53,34 @@ struct ib_client_data {
struct list_head list;
struct ib_client *client;
void * data;
+ /* The device or client is going down. Do not call client or device
+ * callbacks other than remove(). */
+ bool going_down;
};
struct workqueue_struct *ib_wq;
EXPORT_SYMBOL_GPL(ib_wq);
+/* The device_list and client_list contain devices and clients after their
+ * registration has completed, and the devices and clients are removed
+ * during unregistration. */
static LIST_HEAD(device_list);
static LIST_HEAD(client_list);
/*
- * device_mutex protects access to both device_list and client_list.
- * There's no real point to using multiple locks or something fancier
- * like an rwsem: we always access both lists, and we're always
- * modifying one list or the other list. In any case this is not a
- * hot path so there's no point in trying to optimize.
+ * device_mutex and lists_rwsem protect access to both device_list and
+ * client_list. device_mutex protects writer access by device and client
+ * registration / de-registration. lists_rwsem protects reader access to
+ * these lists. Iterators of these lists must lock it for read, while updates
+ * to the lists must be done with a write lock. A special case is when the
+ * device_mutex is locked. In this case locking the lists for read access is
+ * not necessary as the device_mutex implies it.
+ *
+ * lists_rwsem also protects access to the client data list.
*/
static DEFINE_MUTEX(device_mutex);
+static DECLARE_RWSEM(lists_rwsem);
+
static int ib_device_check_mandatory(struct ib_device *device)
{
@@ -152,6 +167,36 @@ static int alloc_name(char *name)
return 0;
}
+static void ib_device_release(struct device *device)
+{
+ struct ib_device *dev = container_of(device, struct ib_device, dev);
+
+ ib_cache_release_one(dev);
+ kfree(dev->port_immutable);
+ kfree(dev);
+}
+
+static int ib_device_uevent(struct device *device,
+ struct kobj_uevent_env *env)
+{
+ struct ib_device *dev = container_of(device, struct ib_device, dev);
+
+ if (add_uevent_var(env, "NAME=%s", dev->name))
+ return -ENOMEM;
+
+ /*
+ * It would be nice to pass the node GUID with the event...
+ */
+
+ return 0;
+}
+
+static struct class ib_class = {
+ .name = "infiniband",
+ .dev_release = ib_device_release,
+ .dev_uevent = ib_device_uevent,
+};
+
/**
* ib_alloc_device - allocate an IB device struct
* @size:size of structure to allocate
@@ -164,9 +209,27 @@ static int alloc_name(char *name)
*/
struct ib_device *ib_alloc_device(size_t size)
{
- BUG_ON(size < sizeof (struct ib_device));
+ struct ib_device *device;
+
+ if (WARN_ON(size < sizeof(struct ib_device)))
+ return NULL;
+
+ device = kzalloc(size, GFP_KERNEL);
+ if (!device)
+ return NULL;
+
+ device->dev.class = &ib_class;
+ device_initialize(&device->dev);
+
+ dev_set_drvdata(&device->dev, device);
+
+ INIT_LIST_HEAD(&device->event_handler_list);
+ spin_lock_init(&device->event_handler_lock);
+ spin_lock_init(&device->client_data_lock);
+ INIT_LIST_HEAD(&device->client_data_list);
+ INIT_LIST_HEAD(&device->port_list);
- return kzalloc(size, GFP_KERNEL);
+ return device;
}
EXPORT_SYMBOL(ib_alloc_device);
@@ -178,13 +241,8 @@ EXPORT_SYMBOL(ib_alloc_device);
*/
void ib_dealloc_device(struct ib_device *device)
{
- if (device->reg_state == IB_DEV_UNINITIALIZED) {
- kfree(device);
- return;
- }
-
- BUG_ON(device->reg_state != IB_DEV_UNREGISTERED);
-
+ WARN_ON(device->reg_state != IB_DEV_UNREGISTERED &&
+ device->reg_state != IB_DEV_UNINITIALIZED);
kobject_put(&device->dev.kobj);
}
EXPORT_SYMBOL(ib_dealloc_device);
@@ -203,10 +261,13 @@ static int add_client_context(struct ib_device *device, struct ib_client *client
context->client = client;
context->data = NULL;
+ context->going_down = false;
+ down_write(&lists_rwsem);
spin_lock_irqsave(&device->client_data_lock, flags);
list_add(&context->list, &device->client_data_list);
spin_unlock_irqrestore(&device->client_data_lock, flags);
+ up_write(&lists_rwsem);
return 0;
}
@@ -219,7 +280,7 @@ static int verify_immutable(const struct ib_device *dev, u8 port)
static int read_port_immutable(struct ib_device *device)
{
- int ret = -ENOMEM;
+ int ret;
u8 start_port = rdma_start_port(device);
u8 end_port = rdma_end_port(device);
u8 port;
@@ -235,26 +296,18 @@ static int read_port_immutable(struct ib_device *device)
* (end_port + 1),
GFP_KERNEL);
if (!device->port_immutable)
- goto err;
+ return -ENOMEM;
for (port = start_port; port <= end_port; ++port) {
ret = device->get_port_immutable(device, port,
&device->port_immutable[port]);
if (ret)
- goto err;
+ return ret;
- if (verify_immutable(device, port)) {
- ret = -EINVAL;
- goto err;
- }
+ if (verify_immutable(device, port))
+ return -EINVAL;
}
-
- ret = 0;
- goto out;
-err:
- kfree(device->port_immutable);
-out:
- return ret;
+ return 0;
}
/**
@@ -271,6 +324,7 @@ int ib_register_device(struct ib_device *device,
u8, struct kobject *))
{
int ret;
+ struct ib_client *client;
mutex_lock(&device_mutex);
@@ -285,11 +339,6 @@ int ib_register_device(struct ib_device *device,
goto out;
}
- INIT_LIST_HEAD(&device->event_handler_list);
- INIT_LIST_HEAD(&device->client_data_list);
- spin_lock_init(&device->event_handler_lock);
- spin_lock_init(&device->client_data_lock);
-
ret = read_port_immutable(device);
if (ret) {
printk(KERN_WARNING "Couldn't create per port immutable data %s\n",
@@ -297,27 +346,30 @@ int ib_register_device(struct ib_device *device,
goto out;
}
+ ret = ib_cache_setup_one(device);
+ if (ret) {
+ printk(KERN_WARNING "Couldn't set up InfiniBand P_Key/GID cache\n");
+ goto out;
+ }
+
ret = ib_device_register_sysfs(device, port_callback);
if (ret) {
printk(KERN_WARNING "Couldn't register device %s with driver model\n",
device->name);
- kfree(device->port_immutable);
+ ib_cache_cleanup_one(device);
goto out;
}
- list_add_tail(&device->core_list, &device_list);
-
device->reg_state = IB_DEV_REGISTERED;
- {
- struct ib_client *client;
-
- list_for_each_entry(client, &client_list, list)
- if (client->add && !add_client_context(device, client))
- client->add(device);
- }
+ list_for_each_entry(client, &client_list, list)
+ if (client->add && !add_client_context(device, client))
+ client->add(device);
- out:
+ down_write(&lists_rwsem);
+ list_add_tail(&device->core_list, &device_list);
+ up_write(&lists_rwsem);
+out:
mutex_unlock(&device_mutex);
return ret;
}
@@ -331,26 +383,37 @@ EXPORT_SYMBOL(ib_register_device);
*/
void ib_unregister_device(struct ib_device *device)
{
- struct ib_client *client;
struct ib_client_data *context, *tmp;
unsigned long flags;
mutex_lock(&device_mutex);
- list_for_each_entry_reverse(client, &client_list, list)
- if (client->remove)
- client->remove(device);
-
+ down_write(&lists_rwsem);
list_del(&device->core_list);
+ spin_lock_irqsave(&device->client_data_lock, flags);
+ list_for_each_entry_safe(context, tmp, &device->client_data_list, list)
+ context->going_down = true;
+ spin_unlock_irqrestore(&device->client_data_lock, flags);
+ downgrade_write(&lists_rwsem);
+
+ list_for_each_entry_safe(context, tmp, &device->client_data_list,
+ list) {
+ if (context->client->remove)
+ context->client->remove(device, context->data);
+ }
+ up_read(&lists_rwsem);
mutex_unlock(&device_mutex);
ib_device_unregister_sysfs(device);
+ ib_cache_cleanup_one(device);
+ down_write(&lists_rwsem);
spin_lock_irqsave(&device->client_data_lock, flags);
list_for_each_entry_safe(context, tmp, &device->client_data_list, list)
kfree(context);
spin_unlock_irqrestore(&device->client_data_lock, flags);
+ up_write(&lists_rwsem);
device->reg_state = IB_DEV_UNREGISTERED;
}
@@ -375,11 +438,14 @@ int ib_register_client(struct ib_client *client)
mutex_lock(&device_mutex);
- list_add_tail(&client->list, &client_list);
list_for_each_entry(device, &device_list, core_list)
if (client->add && !add_client_context(device, client))
client->add(device);
+ down_write(&lists_rwsem);
+ list_add_tail(&client->list, &client_list);
+ up_write(&lists_rwsem);
+
mutex_unlock(&device_mutex);
return 0;
@@ -402,19 +468,41 @@ void ib_unregister_client(struct ib_client *client)
mutex_lock(&device_mutex);
+ down_write(&lists_rwsem);
+ list_del(&client->list);
+ up_write(&lists_rwsem);
+
list_for_each_entry(device, &device_list, core_list) {
- if (client->remove)
- client->remove(device);
+ struct ib_client_data *found_context = NULL;
+ down_write(&lists_rwsem);
spin_lock_irqsave(&device->client_data_lock, flags);
list_for_each_entry_safe(context, tmp, &device->client_data_list, list)
if (context->client == client) {
- list_del(&context->list);
- kfree(context);
+ context->going_down = true;
+ found_context = context;
+ break;
}
spin_unlock_irqrestore(&device->client_data_lock, flags);
+ up_write(&lists_rwsem);
+
+ if (client->remove)
+ client->remove(device, found_context ?
+ found_context->data : NULL);
+
+ if (!found_context) {
+ pr_warn("No client context found for %s/%s\n",
+ device->name, client->name);
+ continue;
+ }
+
+ down_write(&lists_rwsem);
+ spin_lock_irqsave(&device->client_data_lock, flags);
+ list_del(&found_context->list);
+ kfree(found_context);
+ spin_unlock_irqrestore(&device->client_data_lock, flags);
+ up_write(&lists_rwsem);
}
- list_del(&client->list);
mutex_unlock(&device_mutex);
}
@@ -590,11 +678,80 @@ EXPORT_SYMBOL(ib_query_port);
int ib_query_gid(struct ib_device *device,
u8 port_num, int index, union ib_gid *gid)
{
+ if (rdma_cap_roce_gid_table(device, port_num))
+ return ib_get_cached_gid(device, port_num, index, gid);
+
return device->query_gid(device, port_num, index, gid);
}
EXPORT_SYMBOL(ib_query_gid);
/**
+ * ib_enum_roce_netdev - enumerate all RoCE ports
+ * @ib_dev : IB device we want to query
+ * @filter: Should we call the callback?
+ * @filter_cookie: Cookie passed to filter
+ * @cb: Callback to call for each found RoCE ports
+ * @cookie: Cookie passed back to the callback
+ *
+ * Enumerates all of the physical RoCE ports of ib_dev
+ * which are related to netdevice and calls callback() on each
+ * device for which filter() function returns non zero.
+ */
+void ib_enum_roce_netdev(struct ib_device *ib_dev,
+ roce_netdev_filter filter,
+ void *filter_cookie,
+ roce_netdev_callback cb,
+ void *cookie)
+{
+ u8 port;
+
+ for (port = rdma_start_port(ib_dev); port <= rdma_end_port(ib_dev);
+ port++)
+ if (rdma_protocol_roce(ib_dev, port)) {
+ struct net_device *idev = NULL;
+
+ if (ib_dev->get_netdev)
+ idev = ib_dev->get_netdev(ib_dev, port);
+
+ if (idev &&
+ idev->reg_state >= NETREG_UNREGISTERED) {
+ dev_put(idev);
+ idev = NULL;
+ }
+
+ if (filter(ib_dev, port, idev, filter_cookie))
+ cb(ib_dev, port, idev, cookie);
+
+ if (idev)
+ dev_put(idev);
+ }
+}
+
+/**
+ * ib_enum_all_roce_netdevs - enumerate all RoCE devices
+ * @filter: Should we call the callback?
+ * @filter_cookie: Cookie passed to filter
+ * @cb: Callback to call for each found RoCE ports
+ * @cookie: Cookie passed back to the callback
+ *
+ * Enumerates all RoCE devices' physical ports which are related
+ * to netdevices and calls callback() on each device for which
+ * filter() function returns non zero.
+ */
+void ib_enum_all_roce_netdevs(roce_netdev_filter filter,
+ void *filter_cookie,
+ roce_netdev_callback cb,
+ void *cookie)
+{
+ struct ib_device *dev;
+
+ down_read(&lists_rwsem);
+ list_for_each_entry(dev, &device_list, core_list)
+ ib_enum_roce_netdev(dev, filter, filter_cookie, cb, cookie);
+ up_read(&lists_rwsem);
+}
+
+/**
* ib_query_pkey - Get P_Key table entry
* @device:Device to query
* @port_num:Port number to query
@@ -673,6 +830,14 @@ int ib_find_gid(struct ib_device *device, union ib_gid *gid,
int ret, port, i;
for (port = rdma_start_port(device); port <= rdma_end_port(device); ++port) {
+ if (rdma_cap_roce_gid_table(device, port)) {
+ if (!ib_cache_gid_find_by_port(device, gid, port,
+ NULL, index)) {
+ *port_num = port;
+ return 0;
+ }
+ }
+
for (i = 0; i < device->port_immutable[port].gid_tbl_len; ++i) {
ret = ib_query_gid(device, port, i, &tmp_gid);
if (ret)
@@ -729,6 +894,51 @@ int ib_find_pkey(struct ib_device *device,
}
EXPORT_SYMBOL(ib_find_pkey);
+/**
+ * ib_get_net_dev_by_params() - Return the appropriate net_dev
+ * for a received CM request
+ * @dev: An RDMA device on which the request has been received.
+ * @port: Port number on the RDMA device.
+ * @pkey: The Pkey the request came on.
+ * @gid: A GID that the net_dev uses to communicate.
+ * @addr: Contains the IP address that the request specified as its
+ * destination.
+ */
+struct net_device *ib_get_net_dev_by_params(struct ib_device *dev,
+ u8 port,
+ u16 pkey,
+ const union ib_gid *gid,
+ const struct sockaddr *addr)
+{
+ struct net_device *net_dev = NULL;
+ struct ib_client_data *context;
+
+ if (!rdma_protocol_ib(dev, port))
+ return NULL;
+
+ down_read(&lists_rwsem);
+
+ list_for_each_entry(context, &dev->client_data_list, list) {
+ struct ib_client *client = context->client;
+
+ if (context->going_down)
+ continue;
+
+ if (client->get_net_dev_by_params) {
+ net_dev = client->get_net_dev_by_params(dev, port, pkey,
+ gid, addr,
+ context->data);
+ if (net_dev)
+ break;
+ }
+ }
+
+ up_read(&lists_rwsem);
+
+ return net_dev;
+}
+EXPORT_SYMBOL(ib_get_net_dev_by_params);
+
static int __init ib_core_init(void)
{
int ret;
@@ -737,7 +947,7 @@ static int __init ib_core_init(void)
if (!ib_wq)
return -ENOMEM;
- ret = ib_sysfs_setup();
+ ret = class_register(&ib_class);
if (ret) {
printk(KERN_WARNING "Couldn't create InfiniBand device class\n");
goto err;
@@ -749,19 +959,12 @@ static int __init ib_core_init(void)
goto err_sysfs;
}
- ret = ib_cache_setup();
- if (ret) {
- printk(KERN_WARNING "Couldn't set up InfiniBand P_Key/GID cache\n");
- goto err_nl;
- }
+ ib_cache_setup();
return 0;
-err_nl:
- ibnl_cleanup();
-
err_sysfs:
- ib_sysfs_cleanup();
+ class_unregister(&ib_class);
err:
destroy_workqueue(ib_wq);
@@ -772,7 +975,7 @@ static void __exit ib_core_cleanup(void)
{
ib_cache_cleanup();
ibnl_cleanup();
- ib_sysfs_cleanup();
+ class_unregister(&ib_class);
/* Make sure that any pending umem accounting work is done. */
destroy_workqueue(ib_wq);
}
diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c
index 786fc51bf04b..4b5c72311deb 100644
--- a/drivers/infiniband/core/mad.c
+++ b/drivers/infiniband/core/mad.c
@@ -338,13 +338,6 @@ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device,
goto error1;
}
- mad_agent_priv->agent.mr = ib_get_dma_mr(port_priv->qp_info[qpn].qp->pd,
- IB_ACCESS_LOCAL_WRITE);
- if (IS_ERR(mad_agent_priv->agent.mr)) {
- ret = ERR_PTR(-ENOMEM);
- goto error2;
- }
-
if (mad_reg_req) {
reg_req = kmemdup(mad_reg_req, sizeof *reg_req, GFP_KERNEL);
if (!reg_req) {
@@ -429,8 +422,6 @@ error4:
spin_unlock_irqrestore(&port_priv->reg_lock, flags);
kfree(reg_req);
error3:
- ib_dereg_mr(mad_agent_priv->agent.mr);
-error2:
kfree(mad_agent_priv);
error1:
return ret;
@@ -590,7 +581,6 @@ static void unregister_mad_agent(struct ib_mad_agent_private *mad_agent_priv)
wait_for_completion(&mad_agent_priv->comp);
kfree(mad_agent_priv->reg_req);
- ib_dereg_mr(mad_agent_priv->agent.mr);
kfree(mad_agent_priv);
}
@@ -1038,7 +1028,7 @@ struct ib_mad_send_buf * ib_create_send_mad(struct ib_mad_agent *mad_agent,
mad_send_wr->mad_agent_priv = mad_agent_priv;
mad_send_wr->sg_list[0].length = hdr_len;
- mad_send_wr->sg_list[0].lkey = mad_agent->mr->lkey;
+ mad_send_wr->sg_list[0].lkey = mad_agent->qp->pd->local_dma_lkey;
/* OPA MADs don't have to be the full 2048 bytes */
if (opa && base_version == OPA_MGMT_BASE_VERSION &&
@@ -1047,7 +1037,7 @@ struct ib_mad_send_buf * ib_create_send_mad(struct ib_mad_agent *mad_agent,
else
mad_send_wr->sg_list[1].length = mad_size - hdr_len;
- mad_send_wr->sg_list[1].lkey = mad_agent->mr->lkey;
+ mad_send_wr->sg_list[1].lkey = mad_agent->qp->pd->local_dma_lkey;
mad_send_wr->send_wr.wr_id = (unsigned long) mad_send_wr;
mad_send_wr->send_wr.sg_list = mad_send_wr->sg_list;
@@ -2885,7 +2875,7 @@ static int ib_mad_post_receive_mads(struct ib_mad_qp_info *qp_info,
struct ib_mad_queue *recv_queue = &qp_info->recv_queue;
/* Initialize common scatter list fields */
- sg_list.lkey = (*qp_info->port_priv->mr).lkey;
+ sg_list.lkey = qp_info->port_priv->pd->local_dma_lkey;
/* Initialize common receive WR fields */
recv_wr.next = NULL;
@@ -3201,13 +3191,6 @@ static int ib_mad_port_open(struct ib_device *device,
goto error4;
}
- port_priv->mr = ib_get_dma_mr(port_priv->pd, IB_ACCESS_LOCAL_WRITE);
- if (IS_ERR(port_priv->mr)) {
- dev_err(&device->dev, "Couldn't get ib_mad DMA MR\n");
- ret = PTR_ERR(port_priv->mr);
- goto error5;
- }
-
if (has_smi) {
ret = create_mad_qp(&port_priv->qp_info[0], IB_QPT_SMI);
if (ret)
@@ -3248,8 +3231,6 @@ error8:
error7:
destroy_mad_qp(&port_priv->qp_info[0]);
error6:
- ib_dereg_mr(port_priv->mr);
-error5:
ib_dealloc_pd(port_priv->pd);
error4:
ib_destroy_cq(port_priv->cq);
@@ -3284,7 +3265,6 @@ static int ib_mad_port_close(struct ib_device *device, int port_num)
destroy_workqueue(port_priv->wq);
destroy_mad_qp(&port_priv->qp_info[1]);
destroy_mad_qp(&port_priv->qp_info[0]);
- ib_dereg_mr(port_priv->mr);
ib_dealloc_pd(port_priv->pd);
ib_destroy_cq(port_priv->cq);
cleanup_recv_queue(&port_priv->qp_info[1]);
@@ -3335,7 +3315,7 @@ error:
}
}
-static void ib_mad_remove_device(struct ib_device *device)
+static void ib_mad_remove_device(struct ib_device *device, void *client_data)
{
int i;
diff --git a/drivers/infiniband/core/mad_priv.h b/drivers/infiniband/core/mad_priv.h
index 5be89f98928f..4a4f7aad0978 100644
--- a/drivers/infiniband/core/mad_priv.h
+++ b/drivers/infiniband/core/mad_priv.h
@@ -199,7 +199,6 @@ struct ib_mad_port_private {
int port_num;
struct ib_cq *cq;
struct ib_pd *pd;
- struct ib_mr *mr;
spinlock_t reg_lock;
struct ib_mad_mgmt_version_table version[MAX_MGMT_VERSION];
diff --git a/drivers/infiniband/core/multicast.c b/drivers/infiniband/core/multicast.c
index 2cb865c7ce7a..d38d8b2b2979 100644
--- a/drivers/infiniband/core/multicast.c
+++ b/drivers/infiniband/core/multicast.c
@@ -43,7 +43,7 @@
#include "sa.h"
static void mcast_add_one(struct ib_device *device);
-static void mcast_remove_one(struct ib_device *device);
+static void mcast_remove_one(struct ib_device *device, void *client_data);
static struct ib_client mcast_client = {
.name = "ib_multicast",
@@ -840,13 +840,12 @@ static void mcast_add_one(struct ib_device *device)
ib_register_event_handler(&dev->event_handler);
}
-static void mcast_remove_one(struct ib_device *device)
+static void mcast_remove_one(struct ib_device *device, void *client_data)
{
- struct mcast_device *dev;
+ struct mcast_device *dev = client_data;
struct mcast_port *port;
int i;
- dev = ib_get_client_data(device, &mcast_client);
if (!dev)
return;
diff --git a/drivers/infiniband/core/netlink.c b/drivers/infiniband/core/netlink.c
index 23dd5a5c7597..d47df9356779 100644
--- a/drivers/infiniband/core/netlink.c
+++ b/drivers/infiniband/core/netlink.c
@@ -49,6 +49,14 @@ static DEFINE_MUTEX(ibnl_mutex);
static struct sock *nls;
static LIST_HEAD(client_list);
+int ibnl_chk_listeners(unsigned int group)
+{
+ if (netlink_has_listeners(nls, group) == 0)
+ return -1;
+ return 0;
+}
+EXPORT_SYMBOL(ibnl_chk_listeners);
+
int ibnl_add_client(int index, int nops,
const struct ibnl_client_cbs cb_table[])
{
@@ -151,6 +159,23 @@ static int ibnl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
!client->cb_table[op].dump)
return -EINVAL;
+ /*
+ * For response or local service set_timeout request,
+ * there is no need to use netlink_dump_start.
+ */
+ if (!(nlh->nlmsg_flags & NLM_F_REQUEST) ||
+ (index == RDMA_NL_LS &&
+ op == RDMA_NL_LS_OP_SET_TIMEOUT)) {
+ struct netlink_callback cb = {
+ .skb = skb,
+ .nlh = nlh,
+ .dump = client->cb_table[op].dump,
+ .module = client->cb_table[op].module,
+ };
+
+ return cb.dump(skb, &cb);
+ }
+
{
struct netlink_dump_control c = {
.dump = client->cb_table[op].dump,
@@ -165,9 +190,39 @@ static int ibnl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
return -EINVAL;
}
+static void ibnl_rcv_reply_skb(struct sk_buff *skb)
+{
+ struct nlmsghdr *nlh;
+ int msglen;
+
+ /*
+ * Process responses until there is no more message or the first
+ * request. Generally speaking, it is not recommended to mix responses
+ * with requests.
+ */
+ while (skb->len >= nlmsg_total_size(0)) {
+ nlh = nlmsg_hdr(skb);
+
+ if (nlh->nlmsg_len < NLMSG_HDRLEN || skb->len < nlh->nlmsg_len)
+ return;
+
+ /* Handle response only */
+ if (nlh->nlmsg_flags & NLM_F_REQUEST)
+ return;
+
+ ibnl_rcv_msg(skb, nlh);
+
+ msglen = NLMSG_ALIGN(nlh->nlmsg_len);
+ if (msglen > skb->len)
+ msglen = skb->len;
+ skb_pull(skb, msglen);
+ }
+}
+
static void ibnl_rcv(struct sk_buff *skb)
{
mutex_lock(&ibnl_mutex);
+ ibnl_rcv_reply_skb(skb);
netlink_rcv_skb(skb, &ibnl_rcv_msg);
mutex_unlock(&ibnl_mutex);
}
diff --git a/drivers/infiniband/core/roce_gid_mgmt.c b/drivers/infiniband/core/roce_gid_mgmt.c
new file mode 100644
index 000000000000..6b24cba1e474
--- /dev/null
+++ b/drivers/infiniband/core/roce_gid_mgmt.c
@@ -0,0 +1,728 @@
+/*
+ * Copyright (c) 2015, Mellanox Technologies inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "core_priv.h"
+
+#include <linux/in.h>
+#include <linux/in6.h>
+
+/* For in6_dev_get/in6_dev_put */
+#include <net/addrconf.h>
+#include <net/bonding.h>
+
+#include <rdma/ib_cache.h>
+#include <rdma/ib_addr.h>
+
+enum gid_op_type {
+ GID_DEL = 0,
+ GID_ADD
+};
+
+struct update_gid_event_work {
+ struct work_struct work;
+ union ib_gid gid;
+ struct ib_gid_attr gid_attr;
+ enum gid_op_type gid_op;
+};
+
+#define ROCE_NETDEV_CALLBACK_SZ 3
+struct netdev_event_work_cmd {
+ roce_netdev_callback cb;
+ roce_netdev_filter filter;
+ struct net_device *ndev;
+ struct net_device *filter_ndev;
+};
+
+struct netdev_event_work {
+ struct work_struct work;
+ struct netdev_event_work_cmd cmds[ROCE_NETDEV_CALLBACK_SZ];
+};
+
+static void update_gid(enum gid_op_type gid_op, struct ib_device *ib_dev,
+ u8 port, union ib_gid *gid,
+ struct ib_gid_attr *gid_attr)
+{
+ switch (gid_op) {
+ case GID_ADD:
+ ib_cache_gid_add(ib_dev, port, gid, gid_attr);
+ break;
+ case GID_DEL:
+ ib_cache_gid_del(ib_dev, port, gid, gid_attr);
+ break;
+ }
+}
+
+enum bonding_slave_state {
+ BONDING_SLAVE_STATE_ACTIVE = 1UL << 0,
+ BONDING_SLAVE_STATE_INACTIVE = 1UL << 1,
+ /* No primary slave or the device isn't a slave in bonding */
+ BONDING_SLAVE_STATE_NA = 1UL << 2,
+};
+
+static enum bonding_slave_state is_eth_active_slave_of_bonding_rcu(struct net_device *dev,
+ struct net_device *upper)
+{
+ if (upper && netif_is_bond_master(upper)) {
+ struct net_device *pdev =
+ bond_option_active_slave_get_rcu(netdev_priv(upper));
+
+ if (pdev)
+ return dev == pdev ? BONDING_SLAVE_STATE_ACTIVE :
+ BONDING_SLAVE_STATE_INACTIVE;
+ }
+
+ return BONDING_SLAVE_STATE_NA;
+}
+
+static bool is_upper_dev_rcu(struct net_device *dev, struct net_device *upper)
+{
+ struct net_device *_upper = NULL;
+ struct list_head *iter;
+
+ netdev_for_each_all_upper_dev_rcu(dev, _upper, iter)
+ if (_upper == upper)
+ break;
+
+ return _upper == upper;
+}
+
+#define REQUIRED_BOND_STATES (BONDING_SLAVE_STATE_ACTIVE | \
+ BONDING_SLAVE_STATE_NA)
+static int is_eth_port_of_netdev(struct ib_device *ib_dev, u8 port,
+ struct net_device *rdma_ndev, void *cookie)
+{
+ struct net_device *event_ndev = (struct net_device *)cookie;
+ struct net_device *real_dev;
+ int res;
+
+ if (!rdma_ndev)
+ return 0;
+
+ rcu_read_lock();
+ real_dev = rdma_vlan_dev_real_dev(event_ndev);
+ if (!real_dev)
+ real_dev = event_ndev;
+
+ res = ((is_upper_dev_rcu(rdma_ndev, event_ndev) &&
+ (is_eth_active_slave_of_bonding_rcu(rdma_ndev, real_dev) &
+ REQUIRED_BOND_STATES)) ||
+ real_dev == rdma_ndev);
+
+ rcu_read_unlock();
+ return res;
+}
+
+static int is_eth_port_inactive_slave(struct ib_device *ib_dev, u8 port,
+ struct net_device *rdma_ndev, void *cookie)
+{
+ struct net_device *master_dev;
+ int res;
+
+ if (!rdma_ndev)
+ return 0;
+
+ rcu_read_lock();
+ master_dev = netdev_master_upper_dev_get_rcu(rdma_ndev);
+ res = is_eth_active_slave_of_bonding_rcu(rdma_ndev, master_dev) ==
+ BONDING_SLAVE_STATE_INACTIVE;
+ rcu_read_unlock();
+
+ return res;
+}
+
+static int pass_all_filter(struct ib_device *ib_dev, u8 port,
+ struct net_device *rdma_ndev, void *cookie)
+{
+ return 1;
+}
+
+static int upper_device_filter(struct ib_device *ib_dev, u8 port,
+ struct net_device *rdma_ndev, void *cookie)
+{
+ struct net_device *event_ndev = (struct net_device *)cookie;
+ int res;
+
+ if (!rdma_ndev)
+ return 0;
+
+ if (rdma_ndev == event_ndev)
+ return 1;
+
+ rcu_read_lock();
+ res = is_upper_dev_rcu(rdma_ndev, event_ndev);
+ rcu_read_unlock();
+
+ return res;
+}
+
+static void update_gid_ip(enum gid_op_type gid_op,
+ struct ib_device *ib_dev,
+ u8 port, struct net_device *ndev,
+ struct sockaddr *addr)
+{
+ union ib_gid gid;
+ struct ib_gid_attr gid_attr;
+
+ rdma_ip2gid(addr, &gid);
+ memset(&gid_attr, 0, sizeof(gid_attr));
+ gid_attr.ndev = ndev;
+
+ update_gid(gid_op, ib_dev, port, &gid, &gid_attr);
+}
+
+static void enum_netdev_default_gids(struct ib_device *ib_dev,
+ u8 port, struct net_device *event_ndev,
+ struct net_device *rdma_ndev)
+{
+ rcu_read_lock();
+ if (!rdma_ndev ||
+ ((rdma_ndev != event_ndev &&
+ !is_upper_dev_rcu(rdma_ndev, event_ndev)) ||
+ is_eth_active_slave_of_bonding_rcu(rdma_ndev,
+ netdev_master_upper_dev_get_rcu(rdma_ndev)) ==
+ BONDING_SLAVE_STATE_INACTIVE)) {
+ rcu_read_unlock();
+ return;
+ }
+ rcu_read_unlock();
+
+ ib_cache_gid_set_default_gid(ib_dev, port, rdma_ndev,
+ IB_CACHE_GID_DEFAULT_MODE_SET);
+}
+
+static void bond_delete_netdev_default_gids(struct ib_device *ib_dev,
+ u8 port,
+ struct net_device *event_ndev,
+ struct net_device *rdma_ndev)
+{
+ struct net_device *real_dev = rdma_vlan_dev_real_dev(event_ndev);
+
+ if (!rdma_ndev)
+ return;
+
+ if (!real_dev)
+ real_dev = event_ndev;
+
+ rcu_read_lock();
+
+ if (is_upper_dev_rcu(rdma_ndev, event_ndev) &&
+ is_eth_active_slave_of_bonding_rcu(rdma_ndev, real_dev) ==
+ BONDING_SLAVE_STATE_INACTIVE) {
+ rcu_read_unlock();
+
+ ib_cache_gid_set_default_gid(ib_dev, port, rdma_ndev,
+ IB_CACHE_GID_DEFAULT_MODE_DELETE);
+ } else {
+ rcu_read_unlock();
+ }
+}
+
+static void enum_netdev_ipv4_ips(struct ib_device *ib_dev,
+ u8 port, struct net_device *ndev)
+{
+ struct in_device *in_dev;
+
+ if (ndev->reg_state >= NETREG_UNREGISTERING)
+ return;
+
+ in_dev = in_dev_get(ndev);
+ if (!in_dev)
+ return;
+
+ for_ifa(in_dev) {
+ struct sockaddr_in ip;
+
+ ip.sin_family = AF_INET;
+ ip.sin_addr.s_addr = ifa->ifa_address;
+ update_gid_ip(GID_ADD, ib_dev, port, ndev,
+ (struct sockaddr *)&ip);
+ }
+ endfor_ifa(in_dev);
+
+ in_dev_put(in_dev);
+}
+
+static void enum_netdev_ipv6_ips(struct ib_device *ib_dev,
+ u8 port, struct net_device *ndev)
+{
+ struct inet6_ifaddr *ifp;
+ struct inet6_dev *in6_dev;
+ struct sin6_list {
+ struct list_head list;
+ struct sockaddr_in6 sin6;
+ };
+ struct sin6_list *sin6_iter;
+ struct sin6_list *sin6_temp;
+ struct ib_gid_attr gid_attr = {.ndev = ndev};
+ LIST_HEAD(sin6_list);
+
+ if (ndev->reg_state >= NETREG_UNREGISTERING)
+ return;
+
+ in6_dev = in6_dev_get(ndev);
+ if (!in6_dev)
+ return;
+
+ read_lock_bh(&in6_dev->lock);
+ list_for_each_entry(ifp, &in6_dev->addr_list, if_list) {
+ struct sin6_list *entry = kzalloc(sizeof(*entry), GFP_ATOMIC);
+
+ if (!entry) {
+ pr_warn("roce_gid_mgmt: couldn't allocate entry for IPv6 update\n");
+ continue;
+ }
+
+ entry->sin6.sin6_family = AF_INET6;
+ entry->sin6.sin6_addr = ifp->addr;
+ list_add_tail(&entry->list, &sin6_list);
+ }
+ read_unlock_bh(&in6_dev->lock);
+
+ in6_dev_put(in6_dev);
+
+ list_for_each_entry_safe(sin6_iter, sin6_temp, &sin6_list, list) {
+ union ib_gid gid;
+
+ rdma_ip2gid((struct sockaddr *)&sin6_iter->sin6, &gid);
+ update_gid(GID_ADD, ib_dev, port, &gid, &gid_attr);
+ list_del(&sin6_iter->list);
+ kfree(sin6_iter);
+ }
+}
+
+static void _add_netdev_ips(struct ib_device *ib_dev, u8 port,
+ struct net_device *ndev)
+{
+ enum_netdev_ipv4_ips(ib_dev, port, ndev);
+ if (IS_ENABLED(CONFIG_IPV6))
+ enum_netdev_ipv6_ips(ib_dev, port, ndev);
+}
+
+static void add_netdev_ips(struct ib_device *ib_dev, u8 port,
+ struct net_device *rdma_ndev, void *cookie)
+{
+ struct net_device *event_ndev = (struct net_device *)cookie;
+
+ enum_netdev_default_gids(ib_dev, port, event_ndev, rdma_ndev);
+ _add_netdev_ips(ib_dev, port, event_ndev);
+}
+
+static void del_netdev_ips(struct ib_device *ib_dev, u8 port,
+ struct net_device *rdma_ndev, void *cookie)
+{
+ struct net_device *event_ndev = (struct net_device *)cookie;
+
+ ib_cache_gid_del_all_netdev_gids(ib_dev, port, event_ndev);
+}
+
+static void enum_all_gids_of_dev_cb(struct ib_device *ib_dev,
+ u8 port,
+ struct net_device *rdma_ndev,
+ void *cookie)
+{
+ struct net *net;
+ struct net_device *ndev;
+
+ /* Lock the rtnl to make sure the netdevs does not move under
+ * our feet
+ */
+ rtnl_lock();
+ for_each_net(net)
+ for_each_netdev(net, ndev)
+ if (is_eth_port_of_netdev(ib_dev, port, rdma_ndev, ndev))
+ add_netdev_ips(ib_dev, port, rdma_ndev, ndev);
+ rtnl_unlock();
+}
+
+/* This function will rescan all of the network devices in the system
+ * and add their gids, as needed, to the relevant RoCE devices. */
+int roce_rescan_device(struct ib_device *ib_dev)
+{
+ ib_enum_roce_netdev(ib_dev, pass_all_filter, NULL,
+ enum_all_gids_of_dev_cb, NULL);
+
+ return 0;
+}
+
+static void callback_for_addr_gid_device_scan(struct ib_device *device,
+ u8 port,
+ struct net_device *rdma_ndev,
+ void *cookie)
+{
+ struct update_gid_event_work *parsed = cookie;
+
+ return update_gid(parsed->gid_op, device,
+ port, &parsed->gid,
+ &parsed->gid_attr);
+}
+
+static void handle_netdev_upper(struct ib_device *ib_dev, u8 port,
+ void *cookie,
+ void (*handle_netdev)(struct ib_device *ib_dev,
+ u8 port,
+ struct net_device *ndev))
+{
+ struct net_device *ndev = (struct net_device *)cookie;
+ struct upper_list {
+ struct list_head list;
+ struct net_device *upper;
+ };
+ struct net_device *upper;
+ struct list_head *iter;
+ struct upper_list *upper_iter;
+ struct upper_list *upper_temp;
+ LIST_HEAD(upper_list);
+
+ rcu_read_lock();
+ netdev_for_each_all_upper_dev_rcu(ndev, upper, iter) {
+ struct upper_list *entry = kmalloc(sizeof(*entry),
+ GFP_ATOMIC);
+
+ if (!entry) {
+ pr_info("roce_gid_mgmt: couldn't allocate entry to delete ndev\n");
+ continue;
+ }
+
+ list_add_tail(&entry->list, &upper_list);
+ dev_hold(upper);
+ entry->upper = upper;
+ }
+ rcu_read_unlock();
+
+ handle_netdev(ib_dev, port, ndev);
+ list_for_each_entry_safe(upper_iter, upper_temp, &upper_list,
+ list) {
+ handle_netdev(ib_dev, port, upper_iter->upper);
+ dev_put(upper_iter->upper);
+ list_del(&upper_iter->list);
+ kfree(upper_iter);
+ }
+}
+
+static void _roce_del_all_netdev_gids(struct ib_device *ib_dev, u8 port,
+ struct net_device *event_ndev)
+{
+ ib_cache_gid_del_all_netdev_gids(ib_dev, port, event_ndev);
+}
+
+static void del_netdev_upper_ips(struct ib_device *ib_dev, u8 port,
+ struct net_device *rdma_ndev, void *cookie)
+{
+ handle_netdev_upper(ib_dev, port, cookie, _roce_del_all_netdev_gids);
+}
+
+static void add_netdev_upper_ips(struct ib_device *ib_dev, u8 port,
+ struct net_device *rdma_ndev, void *cookie)
+{
+ handle_netdev_upper(ib_dev, port, cookie, _add_netdev_ips);
+}
+
+static void del_netdev_default_ips_join(struct ib_device *ib_dev, u8 port,
+ struct net_device *rdma_ndev,
+ void *cookie)
+{
+ struct net_device *master_ndev;
+
+ rcu_read_lock();
+ master_ndev = netdev_master_upper_dev_get_rcu(rdma_ndev);
+ if (master_ndev)
+ dev_hold(master_ndev);
+ rcu_read_unlock();
+
+ if (master_ndev) {
+ bond_delete_netdev_default_gids(ib_dev, port, master_ndev,
+ rdma_ndev);
+ dev_put(master_ndev);
+ }
+}
+
+static void del_netdev_default_ips(struct ib_device *ib_dev, u8 port,
+ struct net_device *rdma_ndev, void *cookie)
+{
+ struct net_device *event_ndev = (struct net_device *)cookie;
+
+ bond_delete_netdev_default_gids(ib_dev, port, event_ndev, rdma_ndev);
+}
+
+/* The following functions operate on all IB devices. netdevice_event and
+ * addr_event execute ib_enum_all_roce_netdevs through a work.
+ * ib_enum_all_roce_netdevs iterates through all IB devices.
+ */
+
+static void netdevice_event_work_handler(struct work_struct *_work)
+{
+ struct netdev_event_work *work =
+ container_of(_work, struct netdev_event_work, work);
+ unsigned int i;
+
+ for (i = 0; i < ARRAY_SIZE(work->cmds) && work->cmds[i].cb; i++) {
+ ib_enum_all_roce_netdevs(work->cmds[i].filter,
+ work->cmds[i].filter_ndev,
+ work->cmds[i].cb,
+ work->cmds[i].ndev);
+ dev_put(work->cmds[i].ndev);
+ dev_put(work->cmds[i].filter_ndev);
+ }
+
+ kfree(work);
+}
+
+static int netdevice_queue_work(struct netdev_event_work_cmd *cmds,
+ struct net_device *ndev)
+{
+ unsigned int i;
+ struct netdev_event_work *ndev_work =
+ kmalloc(sizeof(*ndev_work), GFP_KERNEL);
+
+ if (!ndev_work) {
+ pr_warn("roce_gid_mgmt: can't allocate work for netdevice_event\n");
+ return NOTIFY_DONE;
+ }
+
+ memcpy(ndev_work->cmds, cmds, sizeof(ndev_work->cmds));
+ for (i = 0; i < ARRAY_SIZE(ndev_work->cmds) && ndev_work->cmds[i].cb; i++) {
+ if (!ndev_work->cmds[i].ndev)
+ ndev_work->cmds[i].ndev = ndev;
+ if (!ndev_work->cmds[i].filter_ndev)
+ ndev_work->cmds[i].filter_ndev = ndev;
+ dev_hold(ndev_work->cmds[i].ndev);
+ dev_hold(ndev_work->cmds[i].filter_ndev);
+ }
+ INIT_WORK(&ndev_work->work, netdevice_event_work_handler);
+
+ queue_work(ib_wq, &ndev_work->work);
+
+ return NOTIFY_DONE;
+}
+
+static const struct netdev_event_work_cmd add_cmd = {
+ .cb = add_netdev_ips, .filter = is_eth_port_of_netdev};
+static const struct netdev_event_work_cmd add_cmd_upper_ips = {
+ .cb = add_netdev_upper_ips, .filter = is_eth_port_of_netdev};
+
+static void netdevice_event_changeupper(struct netdev_notifier_changeupper_info *changeupper_info,
+ struct netdev_event_work_cmd *cmds)
+{
+ static const struct netdev_event_work_cmd upper_ips_del_cmd = {
+ .cb = del_netdev_upper_ips, .filter = upper_device_filter};
+ static const struct netdev_event_work_cmd bonding_default_del_cmd = {
+ .cb = del_netdev_default_ips, .filter = is_eth_port_inactive_slave};
+
+ if (changeupper_info->linking == false) {
+ cmds[0] = upper_ips_del_cmd;
+ cmds[0].ndev = changeupper_info->upper_dev;
+ cmds[1] = add_cmd;
+ } else {
+ cmds[0] = bonding_default_del_cmd;
+ cmds[0].ndev = changeupper_info->upper_dev;
+ cmds[1] = add_cmd_upper_ips;
+ cmds[1].ndev = changeupper_info->upper_dev;
+ cmds[1].filter_ndev = changeupper_info->upper_dev;
+ }
+}
+
+static int netdevice_event(struct notifier_block *this, unsigned long event,
+ void *ptr)
+{
+ static const struct netdev_event_work_cmd del_cmd = {
+ .cb = del_netdev_ips, .filter = pass_all_filter};
+ static const struct netdev_event_work_cmd bonding_default_del_cmd_join = {
+ .cb = del_netdev_default_ips_join, .filter = is_eth_port_inactive_slave};
+ static const struct netdev_event_work_cmd default_del_cmd = {
+ .cb = del_netdev_default_ips, .filter = pass_all_filter};
+ static const struct netdev_event_work_cmd bonding_event_ips_del_cmd = {
+ .cb = del_netdev_upper_ips, .filter = upper_device_filter};
+ struct net_device *ndev = netdev_notifier_info_to_dev(ptr);
+ struct netdev_event_work_cmd cmds[ROCE_NETDEV_CALLBACK_SZ] = { {NULL} };
+
+ if (ndev->type != ARPHRD_ETHER)
+ return NOTIFY_DONE;
+
+ switch (event) {
+ case NETDEV_REGISTER:
+ case NETDEV_UP:
+ cmds[0] = bonding_default_del_cmd_join;
+ cmds[1] = add_cmd;
+ break;
+
+ case NETDEV_UNREGISTER:
+ if (ndev->reg_state < NETREG_UNREGISTERED)
+ cmds[0] = del_cmd;
+ else
+ return NOTIFY_DONE;
+ break;
+
+ case NETDEV_CHANGEADDR:
+ cmds[0] = default_del_cmd;
+ cmds[1] = add_cmd;
+ break;
+
+ case NETDEV_CHANGEUPPER:
+ netdevice_event_changeupper(
+ container_of(ptr, struct netdev_notifier_changeupper_info, info),
+ cmds);
+ break;
+
+ case NETDEV_BONDING_FAILOVER:
+ cmds[0] = bonding_event_ips_del_cmd;
+ cmds[1] = bonding_default_del_cmd_join;
+ cmds[2] = add_cmd_upper_ips;
+ break;
+
+ default:
+ return NOTIFY_DONE;
+ }
+
+ return netdevice_queue_work(cmds, ndev);
+}
+
+static void update_gid_event_work_handler(struct work_struct *_work)
+{
+ struct update_gid_event_work *work =
+ container_of(_work, struct update_gid_event_work, work);
+
+ ib_enum_all_roce_netdevs(is_eth_port_of_netdev, work->gid_attr.ndev,
+ callback_for_addr_gid_device_scan, work);
+
+ dev_put(work->gid_attr.ndev);
+ kfree(work);
+}
+
+static int addr_event(struct notifier_block *this, unsigned long event,
+ struct sockaddr *sa, struct net_device *ndev)
+{
+ struct update_gid_event_work *work;
+ enum gid_op_type gid_op;
+
+ if (ndev->type != ARPHRD_ETHER)
+ return NOTIFY_DONE;
+
+ switch (event) {
+ case NETDEV_UP:
+ gid_op = GID_ADD;
+ break;
+
+ case NETDEV_DOWN:
+ gid_op = GID_DEL;
+ break;
+
+ default:
+ return NOTIFY_DONE;
+ }
+
+ work = kmalloc(sizeof(*work), GFP_ATOMIC);
+ if (!work) {
+ pr_warn("roce_gid_mgmt: Couldn't allocate work for addr_event\n");
+ return NOTIFY_DONE;
+ }
+
+ INIT_WORK(&work->work, update_gid_event_work_handler);
+
+ rdma_ip2gid(sa, &work->gid);
+ work->gid_op = gid_op;
+
+ memset(&work->gid_attr, 0, sizeof(work->gid_attr));
+ dev_hold(ndev);
+ work->gid_attr.ndev = ndev;
+
+ queue_work(ib_wq, &work->work);
+
+ return NOTIFY_DONE;
+}
+
+static int inetaddr_event(struct notifier_block *this, unsigned long event,
+ void *ptr)
+{
+ struct sockaddr_in in;
+ struct net_device *ndev;
+ struct in_ifaddr *ifa = ptr;
+
+ in.sin_family = AF_INET;
+ in.sin_addr.s_addr = ifa->ifa_address;
+ ndev = ifa->ifa_dev->dev;
+
+ return addr_event(this, event, (struct sockaddr *)&in, ndev);
+}
+
+static int inet6addr_event(struct notifier_block *this, unsigned long event,
+ void *ptr)
+{
+ struct sockaddr_in6 in6;
+ struct net_device *ndev;
+ struct inet6_ifaddr *ifa6 = ptr;
+
+ in6.sin6_family = AF_INET6;
+ in6.sin6_addr = ifa6->addr;
+ ndev = ifa6->idev->dev;
+
+ return addr_event(this, event, (struct sockaddr *)&in6, ndev);
+}
+
+static struct notifier_block nb_netdevice = {
+ .notifier_call = netdevice_event
+};
+
+static struct notifier_block nb_inetaddr = {
+ .notifier_call = inetaddr_event
+};
+
+static struct notifier_block nb_inet6addr = {
+ .notifier_call = inet6addr_event
+};
+
+int __init roce_gid_mgmt_init(void)
+{
+ register_inetaddr_notifier(&nb_inetaddr);
+ if (IS_ENABLED(CONFIG_IPV6))
+ register_inet6addr_notifier(&nb_inet6addr);
+ /* We relay on the netdevice notifier to enumerate all
+ * existing devices in the system. Register to this notifier
+ * last to make sure we will not miss any IP add/del
+ * callbacks.
+ */
+ register_netdevice_notifier(&nb_netdevice);
+
+ return 0;
+}
+
+void __exit roce_gid_mgmt_cleanup(void)
+{
+ if (IS_ENABLED(CONFIG_IPV6))
+ unregister_inet6addr_notifier(&nb_inet6addr);
+ unregister_inetaddr_notifier(&nb_inetaddr);
+ unregister_netdevice_notifier(&nb_netdevice);
+ /* Ensure all gid deletion tasks complete before we go down,
+ * to avoid any reference to free'd memory. By the time
+ * ib-core is removed, all physical devices have been removed,
+ * so no issue with remaining hardware contexts.
+ */
+}
diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c
index ca919f429666..8c014b33d8e0 100644
--- a/drivers/infiniband/core/sa_query.c
+++ b/drivers/infiniband/core/sa_query.c
@@ -45,12 +45,21 @@
#include <uapi/linux/if_ether.h>
#include <rdma/ib_pack.h>
#include <rdma/ib_cache.h>
+#include <rdma/rdma_netlink.h>
+#include <net/netlink.h>
+#include <uapi/rdma/ib_user_sa.h>
+#include <rdma/ib_marshall.h>
#include "sa.h"
MODULE_AUTHOR("Roland Dreier");
MODULE_DESCRIPTION("InfiniBand subnet administration query support");
MODULE_LICENSE("Dual BSD/GPL");
+#define IB_SA_LOCAL_SVC_TIMEOUT_MIN 100
+#define IB_SA_LOCAL_SVC_TIMEOUT_DEFAULT 2000
+#define IB_SA_LOCAL_SVC_TIMEOUT_MAX 200000
+static int sa_local_svc_timeout_ms = IB_SA_LOCAL_SVC_TIMEOUT_DEFAULT;
+
struct ib_sa_sm_ah {
struct ib_ah *ah;
struct kref ref;
@@ -80,8 +89,16 @@ struct ib_sa_query {
struct ib_mad_send_buf *mad_buf;
struct ib_sa_sm_ah *sm_ah;
int id;
+ u32 flags;
+ struct list_head list; /* Local svc request list */
+ u32 seq; /* Local svc request sequence number */
+ unsigned long timeout; /* Local svc timeout */
+ u8 path_use; /* How will the pathrecord be used */
};
+#define IB_SA_ENABLE_LOCAL_SERVICE 0x00000001
+#define IB_SA_CANCEL 0x00000002
+
struct ib_sa_service_query {
void (*callback)(int, struct ib_sa_service_rec *, void *);
void *context;
@@ -106,8 +123,28 @@ struct ib_sa_mcmember_query {
struct ib_sa_query sa_query;
};
+static LIST_HEAD(ib_nl_request_list);
+static DEFINE_SPINLOCK(ib_nl_request_lock);
+static atomic_t ib_nl_sa_request_seq;
+static struct workqueue_struct *ib_nl_wq;
+static struct delayed_work ib_nl_timed_work;
+static const struct nla_policy ib_nl_policy[LS_NLA_TYPE_MAX] = {
+ [LS_NLA_TYPE_PATH_RECORD] = {.type = NLA_BINARY,
+ .len = sizeof(struct ib_path_rec_data)},
+ [LS_NLA_TYPE_TIMEOUT] = {.type = NLA_U32},
+ [LS_NLA_TYPE_SERVICE_ID] = {.type = NLA_U64},
+ [LS_NLA_TYPE_DGID] = {.type = NLA_BINARY,
+ .len = sizeof(struct rdma_nla_ls_gid)},
+ [LS_NLA_TYPE_SGID] = {.type = NLA_BINARY,
+ .len = sizeof(struct rdma_nla_ls_gid)},
+ [LS_NLA_TYPE_TCLASS] = {.type = NLA_U8},
+ [LS_NLA_TYPE_PKEY] = {.type = NLA_U16},
+ [LS_NLA_TYPE_QOS_CLASS] = {.type = NLA_U16},
+};
+
+
static void ib_sa_add_one(struct ib_device *device);
-static void ib_sa_remove_one(struct ib_device *device);
+static void ib_sa_remove_one(struct ib_device *device, void *client_data);
static struct ib_client sa_client = {
.name = "sa",
@@ -381,6 +418,427 @@ static const struct ib_field guidinfo_rec_table[] = {
.size_bits = 512 },
};
+static inline void ib_sa_disable_local_svc(struct ib_sa_query *query)
+{
+ query->flags &= ~IB_SA_ENABLE_LOCAL_SERVICE;
+}
+
+static inline int ib_sa_query_cancelled(struct ib_sa_query *query)
+{
+ return (query->flags & IB_SA_CANCEL);
+}
+
+static void ib_nl_set_path_rec_attrs(struct sk_buff *skb,
+ struct ib_sa_query *query)
+{
+ struct ib_sa_path_rec *sa_rec = query->mad_buf->context[1];
+ struct ib_sa_mad *mad = query->mad_buf->mad;
+ ib_sa_comp_mask comp_mask = mad->sa_hdr.comp_mask;
+ u16 val16;
+ u64 val64;
+ struct rdma_ls_resolve_header *header;
+
+ query->mad_buf->context[1] = NULL;
+
+ /* Construct the family header first */
+ header = (struct rdma_ls_resolve_header *)
+ skb_put(skb, NLMSG_ALIGN(sizeof(*header)));
+ memcpy(header->device_name, query->port->agent->device->name,
+ LS_DEVICE_NAME_MAX);
+ header->port_num = query->port->port_num;
+
+ if ((comp_mask & IB_SA_PATH_REC_REVERSIBLE) &&
+ sa_rec->reversible != 0)
+ query->path_use = LS_RESOLVE_PATH_USE_GMP;
+ else
+ query->path_use = LS_RESOLVE_PATH_USE_UNIDIRECTIONAL;
+ header->path_use = query->path_use;
+
+ /* Now build the attributes */
+ if (comp_mask & IB_SA_PATH_REC_SERVICE_ID) {
+ val64 = be64_to_cpu(sa_rec->service_id);
+ nla_put(skb, RDMA_NLA_F_MANDATORY | LS_NLA_TYPE_SERVICE_ID,
+ sizeof(val64), &val64);
+ }
+ if (comp_mask & IB_SA_PATH_REC_DGID)
+ nla_put(skb, RDMA_NLA_F_MANDATORY | LS_NLA_TYPE_DGID,
+ sizeof(sa_rec->dgid), &sa_rec->dgid);
+ if (comp_mask & IB_SA_PATH_REC_SGID)
+ nla_put(skb, RDMA_NLA_F_MANDATORY | LS_NLA_TYPE_SGID,
+ sizeof(sa_rec->sgid), &sa_rec->sgid);
+ if (comp_mask & IB_SA_PATH_REC_TRAFFIC_CLASS)
+ nla_put(skb, RDMA_NLA_F_MANDATORY | LS_NLA_TYPE_TCLASS,
+ sizeof(sa_rec->traffic_class), &sa_rec->traffic_class);
+
+ if (comp_mask & IB_SA_PATH_REC_PKEY) {
+ val16 = be16_to_cpu(sa_rec->pkey);
+ nla_put(skb, RDMA_NLA_F_MANDATORY | LS_NLA_TYPE_PKEY,
+ sizeof(val16), &val16);
+ }
+ if (comp_mask & IB_SA_PATH_REC_QOS_CLASS) {
+ val16 = be16_to_cpu(sa_rec->qos_class);
+ nla_put(skb, RDMA_NLA_F_MANDATORY | LS_NLA_TYPE_QOS_CLASS,
+ sizeof(val16), &val16);
+ }
+}
+
+static int ib_nl_get_path_rec_attrs_len(ib_sa_comp_mask comp_mask)
+{
+ int len = 0;
+
+ if (comp_mask & IB_SA_PATH_REC_SERVICE_ID)
+ len += nla_total_size(sizeof(u64));
+ if (comp_mask & IB_SA_PATH_REC_DGID)
+ len += nla_total_size(sizeof(struct rdma_nla_ls_gid));
+ if (comp_mask & IB_SA_PATH_REC_SGID)
+ len += nla_total_size(sizeof(struct rdma_nla_ls_gid));
+ if (comp_mask & IB_SA_PATH_REC_TRAFFIC_CLASS)
+ len += nla_total_size(sizeof(u8));
+ if (comp_mask & IB_SA_PATH_REC_PKEY)
+ len += nla_total_size(sizeof(u16));
+ if (comp_mask & IB_SA_PATH_REC_QOS_CLASS)
+ len += nla_total_size(sizeof(u16));
+
+ /*
+ * Make sure that at least some of the required comp_mask bits are
+ * set.
+ */
+ if (WARN_ON(len == 0))
+ return len;
+
+ /* Add the family header */
+ len += NLMSG_ALIGN(sizeof(struct rdma_ls_resolve_header));
+
+ return len;
+}
+
+static int ib_nl_send_msg(struct ib_sa_query *query)
+{
+ struct sk_buff *skb = NULL;
+ struct nlmsghdr *nlh;
+ void *data;
+ int ret = 0;
+ struct ib_sa_mad *mad;
+ int len;
+
+ mad = query->mad_buf->mad;
+ len = ib_nl_get_path_rec_attrs_len(mad->sa_hdr.comp_mask);
+ if (len <= 0)
+ return -EMSGSIZE;
+
+ skb = nlmsg_new(len, GFP_KERNEL);
+ if (!skb)
+ return -ENOMEM;
+
+ /* Put nlmsg header only for now */
+ data = ibnl_put_msg(skb, &nlh, query->seq, 0, RDMA_NL_LS,
+ RDMA_NL_LS_OP_RESOLVE, NLM_F_REQUEST);
+ if (!data) {
+ kfree_skb(skb);
+ return -EMSGSIZE;
+ }
+
+ /* Add attributes */
+ ib_nl_set_path_rec_attrs(skb, query);
+
+ /* Repair the nlmsg header length */
+ nlmsg_end(skb, nlh);
+
+ ret = ibnl_multicast(skb, nlh, RDMA_NL_GROUP_LS, GFP_KERNEL);
+ if (!ret)
+ ret = len;
+ else
+ ret = 0;
+
+ return ret;
+}
+
+static int ib_nl_make_request(struct ib_sa_query *query)
+{
+ unsigned long flags;
+ unsigned long delay;
+ int ret;
+
+ INIT_LIST_HEAD(&query->list);
+ query->seq = (u32)atomic_inc_return(&ib_nl_sa_request_seq);
+
+ spin_lock_irqsave(&ib_nl_request_lock, flags);
+ ret = ib_nl_send_msg(query);
+ if (ret <= 0) {
+ ret = -EIO;
+ goto request_out;
+ } else {
+ ret = 0;
+ }
+
+ delay = msecs_to_jiffies(sa_local_svc_timeout_ms);
+ query->timeout = delay + jiffies;
+ list_add_tail(&query->list, &ib_nl_request_list);
+ /* Start the timeout if this is the only request */
+ if (ib_nl_request_list.next == &query->list)
+ queue_delayed_work(ib_nl_wq, &ib_nl_timed_work, delay);
+
+request_out:
+ spin_unlock_irqrestore(&ib_nl_request_lock, flags);
+
+ return ret;
+}
+
+static int ib_nl_cancel_request(struct ib_sa_query *query)
+{
+ unsigned long flags;
+ struct ib_sa_query *wait_query;
+ int found = 0;
+
+ spin_lock_irqsave(&ib_nl_request_lock, flags);
+ list_for_each_entry(wait_query, &ib_nl_request_list, list) {
+ /* Let the timeout to take care of the callback */
+ if (query == wait_query) {
+ query->flags |= IB_SA_CANCEL;
+ query->timeout = jiffies;
+ list_move(&query->list, &ib_nl_request_list);
+ found = 1;
+ mod_delayed_work(ib_nl_wq, &ib_nl_timed_work, 1);
+ break;
+ }
+ }
+ spin_unlock_irqrestore(&ib_nl_request_lock, flags);
+
+ return found;
+}
+
+static void send_handler(struct ib_mad_agent *agent,
+ struct ib_mad_send_wc *mad_send_wc);
+
+static void ib_nl_process_good_resolve_rsp(struct ib_sa_query *query,
+ const struct nlmsghdr *nlh)
+{
+ struct ib_mad_send_wc mad_send_wc;
+ struct ib_sa_mad *mad = NULL;
+ const struct nlattr *head, *curr;
+ struct ib_path_rec_data *rec;
+ int len, rem;
+ u32 mask = 0;
+ int status = -EIO;
+
+ if (query->callback) {
+ head = (const struct nlattr *) nlmsg_data(nlh);
+ len = nlmsg_len(nlh);
+ switch (query->path_use) {
+ case LS_RESOLVE_PATH_USE_UNIDIRECTIONAL:
+ mask = IB_PATH_PRIMARY | IB_PATH_OUTBOUND;
+ break;
+
+ case LS_RESOLVE_PATH_USE_ALL:
+ case LS_RESOLVE_PATH_USE_GMP:
+ default:
+ mask = IB_PATH_PRIMARY | IB_PATH_GMP |
+ IB_PATH_BIDIRECTIONAL;
+ break;
+ }
+ nla_for_each_attr(curr, head, len, rem) {
+ if (curr->nla_type == LS_NLA_TYPE_PATH_RECORD) {
+ rec = nla_data(curr);
+ /*
+ * Get the first one. In the future, we may
+ * need to get up to 6 pathrecords.
+ */
+ if ((rec->flags & mask) == mask) {
+ mad = query->mad_buf->mad;
+ mad->mad_hdr.method |=
+ IB_MGMT_METHOD_RESP;
+ memcpy(mad->data, rec->path_rec,
+ sizeof(rec->path_rec));
+ status = 0;
+ break;
+ }
+ }
+ }
+ query->callback(query, status, mad);
+ }
+
+ mad_send_wc.send_buf = query->mad_buf;
+ mad_send_wc.status = IB_WC_SUCCESS;
+ send_handler(query->mad_buf->mad_agent, &mad_send_wc);
+}
+
+static void ib_nl_request_timeout(struct work_struct *work)
+{
+ unsigned long flags;
+ struct ib_sa_query *query;
+ unsigned long delay;
+ struct ib_mad_send_wc mad_send_wc;
+ int ret;
+
+ spin_lock_irqsave(&ib_nl_request_lock, flags);
+ while (!list_empty(&ib_nl_request_list)) {
+ query = list_entry(ib_nl_request_list.next,
+ struct ib_sa_query, list);
+
+ if (time_after(query->timeout, jiffies)) {
+ delay = query->timeout - jiffies;
+ if ((long)delay <= 0)
+ delay = 1;
+ queue_delayed_work(ib_nl_wq, &ib_nl_timed_work, delay);
+ break;
+ }
+
+ list_del(&query->list);
+ ib_sa_disable_local_svc(query);
+ /* Hold the lock to protect against query cancellation */
+ if (ib_sa_query_cancelled(query))
+ ret = -1;
+ else
+ ret = ib_post_send_mad(query->mad_buf, NULL);
+ if (ret) {
+ mad_send_wc.send_buf = query->mad_buf;
+ mad_send_wc.status = IB_WC_WR_FLUSH_ERR;
+ spin_unlock_irqrestore(&ib_nl_request_lock, flags);
+ send_handler(query->port->agent, &mad_send_wc);
+ spin_lock_irqsave(&ib_nl_request_lock, flags);
+ }
+ }
+ spin_unlock_irqrestore(&ib_nl_request_lock, flags);
+}
+
+static int ib_nl_handle_set_timeout(struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ const struct nlmsghdr *nlh = (struct nlmsghdr *)cb->nlh;
+ int timeout, delta, abs_delta;
+ const struct nlattr *attr;
+ unsigned long flags;
+ struct ib_sa_query *query;
+ long delay = 0;
+ struct nlattr *tb[LS_NLA_TYPE_MAX];
+ int ret;
+
+ if (!netlink_capable(skb, CAP_NET_ADMIN))
+ return -EPERM;
+
+ ret = nla_parse(tb, LS_NLA_TYPE_MAX - 1, nlmsg_data(nlh),
+ nlmsg_len(nlh), ib_nl_policy);
+ attr = (const struct nlattr *)tb[LS_NLA_TYPE_TIMEOUT];
+ if (ret || !attr)
+ goto settimeout_out;
+
+ timeout = *(int *) nla_data(attr);
+ if (timeout < IB_SA_LOCAL_SVC_TIMEOUT_MIN)
+ timeout = IB_SA_LOCAL_SVC_TIMEOUT_MIN;
+ if (timeout > IB_SA_LOCAL_SVC_TIMEOUT_MAX)
+ timeout = IB_SA_LOCAL_SVC_TIMEOUT_MAX;
+
+ delta = timeout - sa_local_svc_timeout_ms;
+ if (delta < 0)
+ abs_delta = -delta;
+ else
+ abs_delta = delta;
+
+ if (delta != 0) {
+ spin_lock_irqsave(&ib_nl_request_lock, flags);
+ sa_local_svc_timeout_ms = timeout;
+ list_for_each_entry(query, &ib_nl_request_list, list) {
+ if (delta < 0 && abs_delta > query->timeout)
+ query->timeout = 0;
+ else
+ query->timeout += delta;
+
+ /* Get the new delay from the first entry */
+ if (!delay) {
+ delay = query->timeout - jiffies;
+ if (delay <= 0)
+ delay = 1;
+ }
+ }
+ if (delay)
+ mod_delayed_work(ib_nl_wq, &ib_nl_timed_work,
+ (unsigned long)delay);
+ spin_unlock_irqrestore(&ib_nl_request_lock, flags);
+ }
+
+settimeout_out:
+ return skb->len;
+}
+
+static inline int ib_nl_is_good_resolve_resp(const struct nlmsghdr *nlh)
+{
+ struct nlattr *tb[LS_NLA_TYPE_MAX];
+ int ret;
+
+ if (nlh->nlmsg_flags & RDMA_NL_LS_F_ERR)
+ return 0;
+
+ ret = nla_parse(tb, LS_NLA_TYPE_MAX - 1, nlmsg_data(nlh),
+ nlmsg_len(nlh), ib_nl_policy);
+ if (ret)
+ return 0;
+
+ return 1;
+}
+
+static int ib_nl_handle_resolve_resp(struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ const struct nlmsghdr *nlh = (struct nlmsghdr *)cb->nlh;
+ unsigned long flags;
+ struct ib_sa_query *query;
+ struct ib_mad_send_buf *send_buf;
+ struct ib_mad_send_wc mad_send_wc;
+ int found = 0;
+ int ret;
+
+ if (!netlink_capable(skb, CAP_NET_ADMIN))
+ return -EPERM;
+
+ spin_lock_irqsave(&ib_nl_request_lock, flags);
+ list_for_each_entry(query, &ib_nl_request_list, list) {
+ /*
+ * If the query is cancelled, let the timeout routine
+ * take care of it.
+ */
+ if (nlh->nlmsg_seq == query->seq) {
+ found = !ib_sa_query_cancelled(query);
+ if (found)
+ list_del(&query->list);
+ break;
+ }
+ }
+
+ if (!found) {
+ spin_unlock_irqrestore(&ib_nl_request_lock, flags);
+ goto resp_out;
+ }
+
+ send_buf = query->mad_buf;
+
+ if (!ib_nl_is_good_resolve_resp(nlh)) {
+ /* if the result is a failure, send out the packet via IB */
+ ib_sa_disable_local_svc(query);
+ ret = ib_post_send_mad(query->mad_buf, NULL);
+ spin_unlock_irqrestore(&ib_nl_request_lock, flags);
+ if (ret) {
+ mad_send_wc.send_buf = send_buf;
+ mad_send_wc.status = IB_WC_GENERAL_ERR;
+ send_handler(query->port->agent, &mad_send_wc);
+ }
+ } else {
+ spin_unlock_irqrestore(&ib_nl_request_lock, flags);
+ ib_nl_process_good_resolve_rsp(query, nlh);
+ }
+
+resp_out:
+ return skb->len;
+}
+
+static struct ibnl_client_cbs ib_sa_cb_table[] = {
+ [RDMA_NL_LS_OP_RESOLVE] = {
+ .dump = ib_nl_handle_resolve_resp,
+ .module = THIS_MODULE },
+ [RDMA_NL_LS_OP_SET_TIMEOUT] = {
+ .dump = ib_nl_handle_set_timeout,
+ .module = THIS_MODULE },
+};
+
static void free_sm_ah(struct kref *kref)
{
struct ib_sa_sm_ah *sm_ah = container_of(kref, struct ib_sa_sm_ah, ref);
@@ -502,7 +960,13 @@ void ib_sa_cancel_query(int id, struct ib_sa_query *query)
mad_buf = query->mad_buf;
spin_unlock_irqrestore(&idr_lock, flags);
- ib_cancel_mad(agent, mad_buf);
+ /*
+ * If the query is still on the netlink request list, schedule
+ * it to be cancelled by the timeout routine. Otherwise, it has been
+ * sent to the MAD layer and has to be cancelled from there.
+ */
+ if (!ib_nl_cancel_request(query))
+ ib_cancel_mad(agent, mad_buf);
}
EXPORT_SYMBOL(ib_sa_cancel_query);
@@ -639,6 +1103,14 @@ static int send_mad(struct ib_sa_query *query, int timeout_ms, gfp_t gfp_mask)
query->mad_buf->context[0] = query;
query->id = id;
+ if (query->flags & IB_SA_ENABLE_LOCAL_SERVICE) {
+ if (!ibnl_chk_listeners(RDMA_NL_GROUP_LS)) {
+ if (!ib_nl_make_request(query))
+ return id;
+ }
+ ib_sa_disable_local_svc(query);
+ }
+
ret = ib_post_send_mad(query->mad_buf, NULL);
if (ret) {
spin_lock_irqsave(&idr_lock, flags);
@@ -740,7 +1212,7 @@ int ib_sa_path_rec_get(struct ib_sa_client *client,
port = &sa_dev->port[port_num - sa_dev->start_port];
agent = port->agent;
- query = kmalloc(sizeof *query, gfp_mask);
+ query = kzalloc(sizeof(*query), gfp_mask);
if (!query)
return -ENOMEM;
@@ -767,6 +1239,9 @@ int ib_sa_path_rec_get(struct ib_sa_client *client,
*sa_query = &query->sa_query;
+ query->sa_query.flags |= IB_SA_ENABLE_LOCAL_SERVICE;
+ query->sa_query.mad_buf->context[1] = rec;
+
ret = send_mad(&query->sa_query, timeout_ms, gfp_mask);
if (ret < 0)
goto err2;
@@ -862,7 +1337,7 @@ int ib_sa_service_rec_query(struct ib_sa_client *client,
method != IB_SA_METHOD_DELETE)
return -EINVAL;
- query = kmalloc(sizeof *query, gfp_mask);
+ query = kzalloc(sizeof(*query), gfp_mask);
if (!query)
return -ENOMEM;
@@ -954,7 +1429,7 @@ int ib_sa_mcmember_rec_query(struct ib_sa_client *client,
port = &sa_dev->port[port_num - sa_dev->start_port];
agent = port->agent;
- query = kmalloc(sizeof *query, gfp_mask);
+ query = kzalloc(sizeof(*query), gfp_mask);
if (!query)
return -ENOMEM;
@@ -1051,7 +1526,7 @@ int ib_sa_guid_info_rec_query(struct ib_sa_client *client,
port = &sa_dev->port[port_num - sa_dev->start_port];
agent = port->agent;
- query = kmalloc(sizeof *query, gfp_mask);
+ query = kzalloc(sizeof(*query), gfp_mask);
if (!query)
return -ENOMEM;
@@ -1221,9 +1696,9 @@ free:
return;
}
-static void ib_sa_remove_one(struct ib_device *device)
+static void ib_sa_remove_one(struct ib_device *device, void *client_data)
{
- struct ib_sa_device *sa_dev = ib_get_client_data(device, &sa_client);
+ struct ib_sa_device *sa_dev = client_data;
int i;
if (!sa_dev)
@@ -1251,6 +1726,8 @@ static int __init ib_sa_init(void)
get_random_bytes(&tid, sizeof tid);
+ atomic_set(&ib_nl_sa_request_seq, 0);
+
ret = ib_register_client(&sa_client);
if (ret) {
printk(KERN_ERR "Couldn't register ib_sa client\n");
@@ -1263,7 +1740,25 @@ static int __init ib_sa_init(void)
goto err2;
}
+ ib_nl_wq = create_singlethread_workqueue("ib_nl_sa_wq");
+ if (!ib_nl_wq) {
+ ret = -ENOMEM;
+ goto err3;
+ }
+
+ if (ibnl_add_client(RDMA_NL_LS, RDMA_NL_LS_NUM_OPS,
+ ib_sa_cb_table)) {
+ pr_err("Failed to add netlink callback\n");
+ ret = -EINVAL;
+ goto err4;
+ }
+ INIT_DELAYED_WORK(&ib_nl_timed_work, ib_nl_request_timeout);
+
return 0;
+err4:
+ destroy_workqueue(ib_nl_wq);
+err3:
+ mcast_cleanup();
err2:
ib_unregister_client(&sa_client);
err1:
@@ -1272,6 +1767,10 @@ err1:
static void __exit ib_sa_cleanup(void)
{
+ ibnl_remove_client(RDMA_NL_LS);
+ cancel_delayed_work(&ib_nl_timed_work);
+ flush_workqueue(ib_nl_wq);
+ destroy_workqueue(ib_nl_wq);
mcast_cleanup();
ib_unregister_client(&sa_client);
idr_destroy(&query_idr);
diff --git a/drivers/infiniband/core/sysfs.c b/drivers/infiniband/core/sysfs.c
index 0b84a9cdfe5b..34cdd74b0a17 100644
--- a/drivers/infiniband/core/sysfs.c
+++ b/drivers/infiniband/core/sysfs.c
@@ -457,29 +457,6 @@ static struct kobj_type port_type = {
.default_attrs = port_default_attrs
};
-static void ib_device_release(struct device *device)
-{
- struct ib_device *dev = container_of(device, struct ib_device, dev);
-
- kfree(dev->port_immutable);
- kfree(dev);
-}
-
-static int ib_device_uevent(struct device *device,
- struct kobj_uevent_env *env)
-{
- struct ib_device *dev = container_of(device, struct ib_device, dev);
-
- if (add_uevent_var(env, "NAME=%s", dev->name))
- return -ENOMEM;
-
- /*
- * It would be nice to pass the node GUID with the event...
- */
-
- return 0;
-}
-
static struct attribute **
alloc_group_attrs(ssize_t (*show)(struct ib_port *,
struct port_attribute *, char *buf),
@@ -702,12 +679,6 @@ static struct device_attribute *ib_class_attributes[] = {
&dev_attr_node_desc
};
-static struct class ib_class = {
- .name = "infiniband",
- .dev_release = ib_device_release,
- .dev_uevent = ib_device_uevent,
-};
-
/* Show a given an attribute in the statistics group */
static ssize_t show_protocol_stat(const struct device *device,
struct device_attribute *attr, char *buf,
@@ -846,14 +817,12 @@ int ib_device_register_sysfs(struct ib_device *device,
int ret;
int i;
- class_dev->class = &ib_class;
- class_dev->parent = device->dma_device;
- dev_set_name(class_dev, "%s", device->name);
- dev_set_drvdata(class_dev, device);
-
- INIT_LIST_HEAD(&device->port_list);
+ device->dev.parent = device->dma_device;
+ ret = dev_set_name(class_dev, "%s", device->name);
+ if (ret)
+ return ret;
- ret = device_register(class_dev);
+ ret = device_add(class_dev);
if (ret)
goto err;
@@ -916,13 +885,3 @@ void ib_device_unregister_sysfs(struct ib_device *device)
device_unregister(&device->dev);
}
-
-int ib_sysfs_setup(void)
-{
- return class_register(&ib_class);
-}
-
-void ib_sysfs_cleanup(void)
-{
- class_unregister(&ib_class);
-}
diff --git a/drivers/infiniband/core/ucm.c b/drivers/infiniband/core/ucm.c
index 009481073644..6b4e8a008bc0 100644
--- a/drivers/infiniband/core/ucm.c
+++ b/drivers/infiniband/core/ucm.c
@@ -109,7 +109,7 @@ enum {
#define IB_UCM_BASE_DEV MKDEV(IB_UCM_MAJOR, IB_UCM_BASE_MINOR)
static void ib_ucm_add_one(struct ib_device *device);
-static void ib_ucm_remove_one(struct ib_device *device);
+static void ib_ucm_remove_one(struct ib_device *device, void *client_data);
static struct ib_client ucm_client = {
.name = "ucm",
@@ -658,8 +658,7 @@ static ssize_t ib_ucm_listen(struct ib_ucm_file *file,
if (result)
goto out;
- result = ib_cm_listen(ctx->cm_id, cmd.service_id, cmd.service_mask,
- NULL);
+ result = ib_cm_listen(ctx->cm_id, cmd.service_id, cmd.service_mask);
out:
ib_ucm_ctx_put(ctx);
return result;
@@ -1310,9 +1309,9 @@ err:
return;
}
-static void ib_ucm_remove_one(struct ib_device *device)
+static void ib_ucm_remove_one(struct ib_device *device, void *client_data)
{
- struct ib_ucm_device *ucm_dev = ib_get_client_data(device, &ucm_client);
+ struct ib_ucm_device *ucm_dev = client_data;
if (!ucm_dev)
return;
diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c
index 29b21213ea75..a53fc9b01c69 100644
--- a/drivers/infiniband/core/ucma.c
+++ b/drivers/infiniband/core/ucma.c
@@ -74,6 +74,7 @@ struct ucma_file {
struct list_head ctx_list;
struct list_head event_list;
wait_queue_head_t poll_wait;
+ struct workqueue_struct *close_wq;
};
struct ucma_context {
@@ -89,6 +90,13 @@ struct ucma_context {
struct list_head list;
struct list_head mc_list;
+ /* mark that device is in process of destroying the internal HW
+ * resources, protected by the global mut
+ */
+ int closing;
+ /* sync between removal event and id destroy, protected by file mut */
+ int destroying;
+ struct work_struct close_work;
};
struct ucma_multicast {
@@ -107,6 +115,7 @@ struct ucma_event {
struct list_head list;
struct rdma_cm_id *cm_id;
struct rdma_ucm_event_resp resp;
+ struct work_struct close_work;
};
static DEFINE_MUTEX(mut);
@@ -132,8 +141,12 @@ static struct ucma_context *ucma_get_ctx(struct ucma_file *file, int id)
mutex_lock(&mut);
ctx = _ucma_find_context(id, file);
- if (!IS_ERR(ctx))
- atomic_inc(&ctx->ref);
+ if (!IS_ERR(ctx)) {
+ if (ctx->closing)
+ ctx = ERR_PTR(-EIO);
+ else
+ atomic_inc(&ctx->ref);
+ }
mutex_unlock(&mut);
return ctx;
}
@@ -144,6 +157,28 @@ static void ucma_put_ctx(struct ucma_context *ctx)
complete(&ctx->comp);
}
+static void ucma_close_event_id(struct work_struct *work)
+{
+ struct ucma_event *uevent_close = container_of(work, struct ucma_event, close_work);
+
+ rdma_destroy_id(uevent_close->cm_id);
+ kfree(uevent_close);
+}
+
+static void ucma_close_id(struct work_struct *work)
+{
+ struct ucma_context *ctx = container_of(work, struct ucma_context, close_work);
+
+ /* once all inflight tasks are finished, we close all underlying
+ * resources. The context is still alive till its explicit destryoing
+ * by its creator.
+ */
+ ucma_put_ctx(ctx);
+ wait_for_completion(&ctx->comp);
+ /* No new events will be generated after destroying the id. */
+ rdma_destroy_id(ctx->cm_id);
+}
+
static struct ucma_context *ucma_alloc_ctx(struct ucma_file *file)
{
struct ucma_context *ctx;
@@ -152,6 +187,7 @@ static struct ucma_context *ucma_alloc_ctx(struct ucma_file *file)
if (!ctx)
return NULL;
+ INIT_WORK(&ctx->close_work, ucma_close_id);
atomic_set(&ctx->ref, 1);
init_completion(&ctx->comp);
INIT_LIST_HEAD(&ctx->mc_list);
@@ -242,6 +278,44 @@ static void ucma_set_event_context(struct ucma_context *ctx,
}
}
+/* Called with file->mut locked for the relevant context. */
+static void ucma_removal_event_handler(struct rdma_cm_id *cm_id)
+{
+ struct ucma_context *ctx = cm_id->context;
+ struct ucma_event *con_req_eve;
+ int event_found = 0;
+
+ if (ctx->destroying)
+ return;
+
+ /* only if context is pointing to cm_id that it owns it and can be
+ * queued to be closed, otherwise that cm_id is an inflight one that
+ * is part of that context event list pending to be detached and
+ * reattached to its new context as part of ucma_get_event,
+ * handled separately below.
+ */
+ if (ctx->cm_id == cm_id) {
+ mutex_lock(&mut);
+ ctx->closing = 1;
+ mutex_unlock(&mut);
+ queue_work(ctx->file->close_wq, &ctx->close_work);
+ return;
+ }
+
+ list_for_each_entry(con_req_eve, &ctx->file->event_list, list) {
+ if (con_req_eve->cm_id == cm_id &&
+ con_req_eve->resp.event == RDMA_CM_EVENT_CONNECT_REQUEST) {
+ list_del(&con_req_eve->list);
+ INIT_WORK(&con_req_eve->close_work, ucma_close_event_id);
+ queue_work(ctx->file->close_wq, &con_req_eve->close_work);
+ event_found = 1;
+ break;
+ }
+ }
+ if (!event_found)
+ printk(KERN_ERR "ucma_removal_event_handler: warning: connect request event wasn't found\n");
+}
+
static int ucma_event_handler(struct rdma_cm_id *cm_id,
struct rdma_cm_event *event)
{
@@ -276,14 +350,21 @@ static int ucma_event_handler(struct rdma_cm_id *cm_id,
* We ignore events for new connections until userspace has set
* their context. This can only happen if an error occurs on a
* new connection before the user accepts it. This is okay,
- * since the accept will just fail later.
+ * since the accept will just fail later. However, we do need
+ * to release the underlying HW resources in case of a device
+ * removal event.
*/
+ if (event->event == RDMA_CM_EVENT_DEVICE_REMOVAL)
+ ucma_removal_event_handler(cm_id);
+
kfree(uevent);
goto out;
}
list_add_tail(&uevent->list, &ctx->file->event_list);
wake_up_interruptible(&ctx->file->poll_wait);
+ if (event->event == RDMA_CM_EVENT_DEVICE_REMOVAL)
+ ucma_removal_event_handler(cm_id);
out:
mutex_unlock(&ctx->file->mut);
return ret;
@@ -442,9 +523,15 @@ static void ucma_cleanup_mc_events(struct ucma_multicast *mc)
}
/*
- * We cannot hold file->mut when calling rdma_destroy_id() or we can
- * deadlock. We also acquire file->mut in ucma_event_handler(), and
- * rdma_destroy_id() will wait until all callbacks have completed.
+ * ucma_free_ctx is called after the underlying rdma CM-ID is destroyed. At
+ * this point, no new events will be reported from the hardware. However, we
+ * still need to cleanup the UCMA context for this ID. Specifically, there
+ * might be events that have not yet been consumed by the user space software.
+ * These might include pending connect requests which we have not completed
+ * processing. We cannot call rdma_destroy_id while holding the lock of the
+ * context (file->mut), as it might cause a deadlock. We therefore extract all
+ * relevant events from the context pending events list while holding the
+ * mutex. After that we release them as needed.
*/
static int ucma_free_ctx(struct ucma_context *ctx)
{
@@ -452,8 +539,6 @@ static int ucma_free_ctx(struct ucma_context *ctx)
struct ucma_event *uevent, *tmp;
LIST_HEAD(list);
- /* No new events will be generated after destroying the id. */
- rdma_destroy_id(ctx->cm_id);
ucma_cleanup_multicast(ctx);
@@ -501,10 +586,24 @@ static ssize_t ucma_destroy_id(struct ucma_file *file, const char __user *inbuf,
if (IS_ERR(ctx))
return PTR_ERR(ctx);
- ucma_put_ctx(ctx);
- wait_for_completion(&ctx->comp);
- resp.events_reported = ucma_free_ctx(ctx);
+ mutex_lock(&ctx->file->mut);
+ ctx->destroying = 1;
+ mutex_unlock(&ctx->file->mut);
+ flush_workqueue(ctx->file->close_wq);
+ /* At this point it's guaranteed that there is no inflight
+ * closing task */
+ mutex_lock(&mut);
+ if (!ctx->closing) {
+ mutex_unlock(&mut);
+ ucma_put_ctx(ctx);
+ wait_for_completion(&ctx->comp);
+ rdma_destroy_id(ctx->cm_id);
+ } else {
+ mutex_unlock(&mut);
+ }
+
+ resp.events_reported = ucma_free_ctx(ctx);
if (copy_to_user((void __user *)(unsigned long)cmd.response,
&resp, sizeof(resp)))
ret = -EFAULT;
@@ -1321,10 +1420,10 @@ static ssize_t ucma_leave_multicast(struct ucma_file *file,
mc = ERR_PTR(-ENOENT);
else if (mc->ctx->file != file)
mc = ERR_PTR(-EINVAL);
- else {
+ else if (!atomic_inc_not_zero(&mc->ctx->ref))
+ mc = ERR_PTR(-ENXIO);
+ else
idr_remove(&multicast_idr, mc->id);
- atomic_inc(&mc->ctx->ref);
- }
mutex_unlock(&mut);
if (IS_ERR(mc)) {
@@ -1529,6 +1628,7 @@ static int ucma_open(struct inode *inode, struct file *filp)
INIT_LIST_HEAD(&file->ctx_list);
init_waitqueue_head(&file->poll_wait);
mutex_init(&file->mut);
+ file->close_wq = create_singlethread_workqueue("ucma_close_id");
filp->private_data = file;
file->filp = filp;
@@ -1543,16 +1643,34 @@ static int ucma_close(struct inode *inode, struct file *filp)
mutex_lock(&file->mut);
list_for_each_entry_safe(ctx, tmp, &file->ctx_list, list) {
+ ctx->destroying = 1;
mutex_unlock(&file->mut);
mutex_lock(&mut);
idr_remove(&ctx_idr, ctx->id);
mutex_unlock(&mut);
+ flush_workqueue(file->close_wq);
+ /* At that step once ctx was marked as destroying and workqueue
+ * was flushed we are safe from any inflights handlers that
+ * might put other closing task.
+ */
+ mutex_lock(&mut);
+ if (!ctx->closing) {
+ mutex_unlock(&mut);
+ /* rdma_destroy_id ensures that no event handlers are
+ * inflight for that id before releasing it.
+ */
+ rdma_destroy_id(ctx->cm_id);
+ } else {
+ mutex_unlock(&mut);
+ }
+
ucma_free_ctx(ctx);
mutex_lock(&file->mut);
}
mutex_unlock(&file->mut);
+ destroy_workqueue(file->close_wq);
kfree(file);
return 0;
}
diff --git a/drivers/infiniband/core/user_mad.c b/drivers/infiniband/core/user_mad.c
index 35567fffaa4e..57f281f8d686 100644
--- a/drivers/infiniband/core/user_mad.c
+++ b/drivers/infiniband/core/user_mad.c
@@ -133,7 +133,7 @@ static DEFINE_SPINLOCK(port_lock);
static DECLARE_BITMAP(dev_map, IB_UMAD_MAX_PORTS);
static void ib_umad_add_one(struct ib_device *device);
-static void ib_umad_remove_one(struct ib_device *device);
+static void ib_umad_remove_one(struct ib_device *device, void *client_data);
static void ib_umad_release_dev(struct kobject *kobj)
{
@@ -1322,9 +1322,9 @@ free:
kobject_put(&umad_dev->kobj);
}
-static void ib_umad_remove_one(struct ib_device *device)
+static void ib_umad_remove_one(struct ib_device *device, void *client_data)
{
- struct ib_umad_device *umad_dev = ib_get_client_data(device, &umad_client);
+ struct ib_umad_device *umad_dev = client_data;
int i;
if (!umad_dev)
diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h
index ba365b6d1e8d..3863d33c243d 100644
--- a/drivers/infiniband/core/uverbs.h
+++ b/drivers/infiniband/core/uverbs.h
@@ -85,15 +85,20 @@
*/
struct ib_uverbs_device {
- struct kref ref;
+ atomic_t refcount;
int num_comp_vectors;
struct completion comp;
struct device *dev;
- struct ib_device *ib_dev;
+ struct ib_device __rcu *ib_dev;
int devnum;
struct cdev cdev;
struct rb_root xrcd_tree;
struct mutex xrcd_tree_mutex;
+ struct kobject kobj;
+ struct srcu_struct disassociate_srcu;
+ struct mutex lists_mutex; /* protect lists */
+ struct list_head uverbs_file_list;
+ struct list_head uverbs_events_file_list;
};
struct ib_uverbs_event_file {
@@ -105,6 +110,7 @@ struct ib_uverbs_event_file {
wait_queue_head_t poll_wait;
struct fasync_struct *async_queue;
struct list_head event_list;
+ struct list_head list;
};
struct ib_uverbs_file {
@@ -114,6 +120,8 @@ struct ib_uverbs_file {
struct ib_ucontext *ucontext;
struct ib_event_handler event_handler;
struct ib_uverbs_event_file *async_file;
+ struct list_head list;
+ int is_closed;
};
struct ib_uverbs_event {
@@ -177,7 +185,9 @@ extern struct idr ib_uverbs_rule_idr;
void idr_remove_uobj(struct idr *idp, struct ib_uobject *uobj);
struct file *ib_uverbs_alloc_event_file(struct ib_uverbs_file *uverbs_file,
+ struct ib_device *ib_dev,
int is_async);
+void ib_uverbs_free_async_event_file(struct ib_uverbs_file *uverbs_file);
struct ib_uverbs_event_file *ib_uverbs_lookup_comp_file(int fd);
void ib_uverbs_release_ucq(struct ib_uverbs_file *file,
@@ -212,6 +222,7 @@ struct ib_uverbs_flow_spec {
#define IB_UVERBS_DECLARE_CMD(name) \
ssize_t ib_uverbs_##name(struct ib_uverbs_file *file, \
+ struct ib_device *ib_dev, \
const char __user *buf, int in_len, \
int out_len)
@@ -253,6 +264,7 @@ IB_UVERBS_DECLARE_CMD(close_xrcd);
#define IB_UVERBS_DECLARE_EX_CMD(name) \
int ib_uverbs_ex_##name(struct ib_uverbs_file *file, \
+ struct ib_device *ib_dev, \
struct ib_udata *ucore, \
struct ib_udata *uhw)
diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c
index bbb02ffe87df..be4cb9f04be3 100644
--- a/drivers/infiniband/core/uverbs_cmd.c
+++ b/drivers/infiniband/core/uverbs_cmd.c
@@ -282,13 +282,13 @@ static void put_xrcd_read(struct ib_uobject *uobj)
}
ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
const char __user *buf,
int in_len, int out_len)
{
struct ib_uverbs_get_context cmd;
struct ib_uverbs_get_context_resp resp;
struct ib_udata udata;
- struct ib_device *ibdev = file->device->ib_dev;
#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
struct ib_device_attr dev_attr;
#endif
@@ -313,13 +313,13 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file,
(unsigned long) cmd.response + sizeof resp,
in_len - sizeof cmd, out_len - sizeof resp);
- ucontext = ibdev->alloc_ucontext(ibdev, &udata);
+ ucontext = ib_dev->alloc_ucontext(ib_dev, &udata);
if (IS_ERR(ucontext)) {
ret = PTR_ERR(ucontext);
goto err;
}
- ucontext->device = ibdev;
+ ucontext->device = ib_dev;
INIT_LIST_HEAD(&ucontext->pd_list);
INIT_LIST_HEAD(&ucontext->mr_list);
INIT_LIST_HEAD(&ucontext->mw_list);
@@ -340,7 +340,7 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file,
ucontext->odp_mrs_count = 0;
INIT_LIST_HEAD(&ucontext->no_private_counters);
- ret = ib_query_device(ibdev, &dev_attr);
+ ret = ib_query_device(ib_dev, &dev_attr);
if (ret)
goto err_free;
if (!(dev_attr.device_cap_flags & IB_DEVICE_ON_DEMAND_PAGING))
@@ -355,7 +355,7 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file,
goto err_free;
resp.async_fd = ret;
- filp = ib_uverbs_alloc_event_file(file, 1);
+ filp = ib_uverbs_alloc_event_file(file, ib_dev, 1);
if (IS_ERR(filp)) {
ret = PTR_ERR(filp);
goto err_fd;
@@ -367,16 +367,6 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file,
goto err_file;
}
- file->async_file = filp->private_data;
-
- INIT_IB_EVENT_HANDLER(&file->event_handler, file->device->ib_dev,
- ib_uverbs_event_handler);
- ret = ib_register_event_handler(&file->event_handler);
- if (ret)
- goto err_file;
-
- kref_get(&file->async_file->ref);
- kref_get(&file->ref);
file->ucontext = ucontext;
fd_install(resp.async_fd, filp);
@@ -386,6 +376,7 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file,
return in_len;
err_file:
+ ib_uverbs_free_async_event_file(file);
fput(filp);
err_fd:
@@ -393,7 +384,7 @@ err_fd:
err_free:
put_pid(ucontext->tgid);
- ibdev->dealloc_ucontext(ucontext);
+ ib_dev->dealloc_ucontext(ucontext);
err:
mutex_unlock(&file->mutex);
@@ -401,11 +392,12 @@ err:
}
static void copy_query_dev_fields(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
struct ib_uverbs_query_device_resp *resp,
struct ib_device_attr *attr)
{
resp->fw_ver = attr->fw_ver;
- resp->node_guid = file->device->ib_dev->node_guid;
+ resp->node_guid = ib_dev->node_guid;
resp->sys_image_guid = attr->sys_image_guid;
resp->max_mr_size = attr->max_mr_size;
resp->page_size_cap = attr->page_size_cap;
@@ -443,10 +435,11 @@ static void copy_query_dev_fields(struct ib_uverbs_file *file,
resp->max_srq_sge = attr->max_srq_sge;
resp->max_pkeys = attr->max_pkeys;
resp->local_ca_ack_delay = attr->local_ca_ack_delay;
- resp->phys_port_cnt = file->device->ib_dev->phys_port_cnt;
+ resp->phys_port_cnt = ib_dev->phys_port_cnt;
}
ssize_t ib_uverbs_query_device(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
const char __user *buf,
int in_len, int out_len)
{
@@ -461,12 +454,12 @@ ssize_t ib_uverbs_query_device(struct ib_uverbs_file *file,
if (copy_from_user(&cmd, buf, sizeof cmd))
return -EFAULT;
- ret = ib_query_device(file->device->ib_dev, &attr);
+ ret = ib_query_device(ib_dev, &attr);
if (ret)
return ret;
memset(&resp, 0, sizeof resp);
- copy_query_dev_fields(file, &resp, &attr);
+ copy_query_dev_fields(file, ib_dev, &resp, &attr);
if (copy_to_user((void __user *) (unsigned long) cmd.response,
&resp, sizeof resp))
@@ -476,6 +469,7 @@ ssize_t ib_uverbs_query_device(struct ib_uverbs_file *file,
}
ssize_t ib_uverbs_query_port(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
const char __user *buf,
int in_len, int out_len)
{
@@ -490,7 +484,7 @@ ssize_t ib_uverbs_query_port(struct ib_uverbs_file *file,
if (copy_from_user(&cmd, buf, sizeof cmd))
return -EFAULT;
- ret = ib_query_port(file->device->ib_dev, cmd.port_num, &attr);
+ ret = ib_query_port(ib_dev, cmd.port_num, &attr);
if (ret)
return ret;
@@ -515,7 +509,7 @@ ssize_t ib_uverbs_query_port(struct ib_uverbs_file *file,
resp.active_width = attr.active_width;
resp.active_speed = attr.active_speed;
resp.phys_state = attr.phys_state;
- resp.link_layer = rdma_port_get_link_layer(file->device->ib_dev,
+ resp.link_layer = rdma_port_get_link_layer(ib_dev,
cmd.port_num);
if (copy_to_user((void __user *) (unsigned long) cmd.response,
@@ -526,6 +520,7 @@ ssize_t ib_uverbs_query_port(struct ib_uverbs_file *file,
}
ssize_t ib_uverbs_alloc_pd(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
const char __user *buf,
int in_len, int out_len)
{
@@ -553,15 +548,15 @@ ssize_t ib_uverbs_alloc_pd(struct ib_uverbs_file *file,
init_uobj(uobj, 0, file->ucontext, &pd_lock_class);
down_write(&uobj->mutex);
- pd = file->device->ib_dev->alloc_pd(file->device->ib_dev,
- file->ucontext, &udata);
+ pd = ib_dev->alloc_pd(ib_dev, file->ucontext, &udata);
if (IS_ERR(pd)) {
ret = PTR_ERR(pd);
goto err;
}
- pd->device = file->device->ib_dev;
+ pd->device = ib_dev;
pd->uobject = uobj;
+ pd->local_mr = NULL;
atomic_set(&pd->usecnt, 0);
uobj->object = pd;
@@ -600,11 +595,13 @@ err:
}
ssize_t ib_uverbs_dealloc_pd(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
const char __user *buf,
int in_len, int out_len)
{
struct ib_uverbs_dealloc_pd cmd;
struct ib_uobject *uobj;
+ struct ib_pd *pd;
int ret;
if (copy_from_user(&cmd, buf, sizeof cmd))
@@ -613,15 +610,20 @@ ssize_t ib_uverbs_dealloc_pd(struct ib_uverbs_file *file,
uobj = idr_write_uobj(&ib_uverbs_pd_idr, cmd.pd_handle, file->ucontext);
if (!uobj)
return -EINVAL;
+ pd = uobj->object;
- ret = ib_dealloc_pd(uobj->object);
- if (!ret)
- uobj->live = 0;
-
- put_uobj_write(uobj);
+ if (atomic_read(&pd->usecnt)) {
+ ret = -EBUSY;
+ goto err_put;
+ }
+ ret = pd->device->dealloc_pd(uobj->object);
+ WARN_ONCE(ret, "Infiniband HW driver failed dealloc_pd");
if (ret)
- return ret;
+ goto err_put;
+
+ uobj->live = 0;
+ put_uobj_write(uobj);
idr_remove_uobj(&ib_uverbs_pd_idr, uobj);
@@ -632,6 +634,10 @@ ssize_t ib_uverbs_dealloc_pd(struct ib_uverbs_file *file,
put_uobj(uobj);
return in_len;
+
+err_put:
+ put_uobj_write(uobj);
+ return ret;
}
struct xrcd_table_entry {
@@ -720,6 +726,7 @@ static void xrcd_table_delete(struct ib_uverbs_device *dev,
}
ssize_t ib_uverbs_open_xrcd(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
const char __user *buf, int in_len,
int out_len)
{
@@ -778,15 +785,14 @@ ssize_t ib_uverbs_open_xrcd(struct ib_uverbs_file *file,
down_write(&obj->uobject.mutex);
if (!xrcd) {
- xrcd = file->device->ib_dev->alloc_xrcd(file->device->ib_dev,
- file->ucontext, &udata);
+ xrcd = ib_dev->alloc_xrcd(ib_dev, file->ucontext, &udata);
if (IS_ERR(xrcd)) {
ret = PTR_ERR(xrcd);
goto err;
}
xrcd->inode = inode;
- xrcd->device = file->device->ib_dev;
+ xrcd->device = ib_dev;
atomic_set(&xrcd->usecnt, 0);
mutex_init(&xrcd->tgt_qp_mutex);
INIT_LIST_HEAD(&xrcd->tgt_qp_list);
@@ -857,6 +863,7 @@ err_tree_mutex_unlock:
}
ssize_t ib_uverbs_close_xrcd(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
const char __user *buf, int in_len,
int out_len)
{
@@ -934,6 +941,7 @@ void ib_uverbs_dealloc_xrcd(struct ib_uverbs_device *dev,
}
ssize_t ib_uverbs_reg_mr(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
const char __user *buf, int in_len,
int out_len)
{
@@ -1043,6 +1051,7 @@ err_free:
}
ssize_t ib_uverbs_rereg_mr(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
const char __user *buf, int in_len,
int out_len)
{
@@ -1136,6 +1145,7 @@ put_uobjs:
}
ssize_t ib_uverbs_dereg_mr(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
const char __user *buf, int in_len,
int out_len)
{
@@ -1174,8 +1184,9 @@ ssize_t ib_uverbs_dereg_mr(struct ib_uverbs_file *file,
}
ssize_t ib_uverbs_alloc_mw(struct ib_uverbs_file *file,
- const char __user *buf, int in_len,
- int out_len)
+ struct ib_device *ib_dev,
+ const char __user *buf, int in_len,
+ int out_len)
{
struct ib_uverbs_alloc_mw cmd;
struct ib_uverbs_alloc_mw_resp resp;
@@ -1256,8 +1267,9 @@ err_free:
}
ssize_t ib_uverbs_dealloc_mw(struct ib_uverbs_file *file,
- const char __user *buf, int in_len,
- int out_len)
+ struct ib_device *ib_dev,
+ const char __user *buf, int in_len,
+ int out_len)
{
struct ib_uverbs_dealloc_mw cmd;
struct ib_mw *mw;
@@ -1294,6 +1306,7 @@ ssize_t ib_uverbs_dealloc_mw(struct ib_uverbs_file *file,
}
ssize_t ib_uverbs_create_comp_channel(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
const char __user *buf, int in_len,
int out_len)
{
@@ -1313,7 +1326,7 @@ ssize_t ib_uverbs_create_comp_channel(struct ib_uverbs_file *file,
return ret;
resp.fd = ret;
- filp = ib_uverbs_alloc_event_file(file, 0);
+ filp = ib_uverbs_alloc_event_file(file, ib_dev, 0);
if (IS_ERR(filp)) {
put_unused_fd(resp.fd);
return PTR_ERR(filp);
@@ -1331,6 +1344,7 @@ ssize_t ib_uverbs_create_comp_channel(struct ib_uverbs_file *file,
}
static struct ib_ucq_object *create_cq(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
struct ib_udata *ucore,
struct ib_udata *uhw,
struct ib_uverbs_ex_create_cq *cmd,
@@ -1379,14 +1393,14 @@ static struct ib_ucq_object *create_cq(struct ib_uverbs_file *file,
if (cmd_sz > offsetof(typeof(*cmd), flags) + sizeof(cmd->flags))
attr.flags = cmd->flags;
- cq = file->device->ib_dev->create_cq(file->device->ib_dev, &attr,
+ cq = ib_dev->create_cq(ib_dev, &attr,
file->ucontext, uhw);
if (IS_ERR(cq)) {
ret = PTR_ERR(cq);
goto err_file;
}
- cq->device = file->device->ib_dev;
+ cq->device = ib_dev;
cq->uobject = &obj->uobject;
cq->comp_handler = ib_uverbs_comp_handler;
cq->event_handler = ib_uverbs_cq_event_handler;
@@ -1447,6 +1461,7 @@ static int ib_uverbs_create_cq_cb(struct ib_uverbs_file *file,
}
ssize_t ib_uverbs_create_cq(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
const char __user *buf, int in_len,
int out_len)
{
@@ -1475,7 +1490,7 @@ ssize_t ib_uverbs_create_cq(struct ib_uverbs_file *file,
cmd_ex.comp_vector = cmd.comp_vector;
cmd_ex.comp_channel = cmd.comp_channel;
- obj = create_cq(file, &ucore, &uhw, &cmd_ex,
+ obj = create_cq(file, ib_dev, &ucore, &uhw, &cmd_ex,
offsetof(typeof(cmd_ex), comp_channel) +
sizeof(cmd.comp_channel), ib_uverbs_create_cq_cb,
NULL);
@@ -1498,6 +1513,7 @@ static int ib_uverbs_ex_create_cq_cb(struct ib_uverbs_file *file,
}
int ib_uverbs_ex_create_cq(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
struct ib_udata *ucore,
struct ib_udata *uhw)
{
@@ -1523,7 +1539,7 @@ int ib_uverbs_ex_create_cq(struct ib_uverbs_file *file,
sizeof(resp.response_length)))
return -ENOSPC;
- obj = create_cq(file, ucore, uhw, &cmd,
+ obj = create_cq(file, ib_dev, ucore, uhw, &cmd,
min(ucore->inlen, sizeof(cmd)),
ib_uverbs_ex_create_cq_cb, NULL);
@@ -1534,6 +1550,7 @@ int ib_uverbs_ex_create_cq(struct ib_uverbs_file *file,
}
ssize_t ib_uverbs_resize_cq(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
const char __user *buf, int in_len,
int out_len)
{
@@ -1597,6 +1614,7 @@ static int copy_wc_to_user(void __user *dest, struct ib_wc *wc)
}
ssize_t ib_uverbs_poll_cq(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
const char __user *buf, int in_len,
int out_len)
{
@@ -1648,6 +1666,7 @@ out_put:
}
ssize_t ib_uverbs_req_notify_cq(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
const char __user *buf, int in_len,
int out_len)
{
@@ -1670,6 +1689,7 @@ ssize_t ib_uverbs_req_notify_cq(struct ib_uverbs_file *file,
}
ssize_t ib_uverbs_destroy_cq(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
const char __user *buf, int in_len,
int out_len)
{
@@ -1722,6 +1742,7 @@ ssize_t ib_uverbs_destroy_cq(struct ib_uverbs_file *file,
}
ssize_t ib_uverbs_create_qp(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
const char __user *buf, int in_len,
int out_len)
{
@@ -1917,6 +1938,7 @@ err_put:
}
ssize_t ib_uverbs_open_qp(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
const char __user *buf, int in_len, int out_len)
{
struct ib_uverbs_open_qp cmd;
@@ -2011,6 +2033,7 @@ err_put:
}
ssize_t ib_uverbs_query_qp(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
const char __user *buf, int in_len,
int out_len)
{
@@ -2125,6 +2148,7 @@ static int modify_qp_mask(enum ib_qp_type qp_type, int mask)
}
ssize_t ib_uverbs_modify_qp(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
const char __user *buf, int in_len,
int out_len)
{
@@ -2221,6 +2245,7 @@ out:
}
ssize_t ib_uverbs_destroy_qp(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
const char __user *buf, int in_len,
int out_len)
{
@@ -2279,6 +2304,7 @@ ssize_t ib_uverbs_destroy_qp(struct ib_uverbs_file *file,
}
ssize_t ib_uverbs_post_send(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
const char __user *buf, int in_len,
int out_len)
{
@@ -2346,6 +2372,12 @@ ssize_t ib_uverbs_post_send(struct ib_uverbs_file *file,
next->send_flags = user_wr->send_flags;
if (is_ud) {
+ if (next->opcode != IB_WR_SEND &&
+ next->opcode != IB_WR_SEND_WITH_IMM) {
+ ret = -EINVAL;
+ goto out_put;
+ }
+
next->wr.ud.ah = idr_read_ah(user_wr->wr.ud.ah,
file->ucontext);
if (!next->wr.ud.ah) {
@@ -2385,9 +2417,11 @@ ssize_t ib_uverbs_post_send(struct ib_uverbs_file *file,
user_wr->wr.atomic.compare_add;
next->wr.atomic.swap = user_wr->wr.atomic.swap;
next->wr.atomic.rkey = user_wr->wr.atomic.rkey;
+ case IB_WR_SEND:
break;
default:
- break;
+ ret = -EINVAL;
+ goto out_put;
}
}
@@ -2523,6 +2557,7 @@ err:
}
ssize_t ib_uverbs_post_recv(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
const char __user *buf, int in_len,
int out_len)
{
@@ -2572,6 +2607,7 @@ out:
}
ssize_t ib_uverbs_post_srq_recv(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
const char __user *buf, int in_len,
int out_len)
{
@@ -2621,6 +2657,7 @@ out:
}
ssize_t ib_uverbs_create_ah(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
const char __user *buf, int in_len,
int out_len)
{
@@ -2713,6 +2750,7 @@ err:
}
ssize_t ib_uverbs_destroy_ah(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
const char __user *buf, int in_len, int out_len)
{
struct ib_uverbs_destroy_ah cmd;
@@ -2749,6 +2787,7 @@ ssize_t ib_uverbs_destroy_ah(struct ib_uverbs_file *file,
}
ssize_t ib_uverbs_attach_mcast(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
const char __user *buf, int in_len,
int out_len)
{
@@ -2796,6 +2835,7 @@ out_put:
}
ssize_t ib_uverbs_detach_mcast(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
const char __user *buf, int in_len,
int out_len)
{
@@ -2876,6 +2916,7 @@ static int kern_spec_to_ib_spec(struct ib_uverbs_flow_spec *kern_spec,
}
int ib_uverbs_ex_create_flow(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
struct ib_udata *ucore,
struct ib_udata *uhw)
{
@@ -3036,6 +3077,7 @@ err_free_attr:
}
int ib_uverbs_ex_destroy_flow(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
struct ib_udata *ucore,
struct ib_udata *uhw)
{
@@ -3078,6 +3120,7 @@ int ib_uverbs_ex_destroy_flow(struct ib_uverbs_file *file,
}
static int __uverbs_create_xsrq(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
struct ib_uverbs_create_xsrq *cmd,
struct ib_udata *udata)
{
@@ -3211,6 +3254,7 @@ err:
}
ssize_t ib_uverbs_create_srq(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
const char __user *buf, int in_len,
int out_len)
{
@@ -3238,7 +3282,7 @@ ssize_t ib_uverbs_create_srq(struct ib_uverbs_file *file,
(unsigned long) cmd.response + sizeof resp,
in_len - sizeof cmd, out_len - sizeof resp);
- ret = __uverbs_create_xsrq(file, &xcmd, &udata);
+ ret = __uverbs_create_xsrq(file, ib_dev, &xcmd, &udata);
if (ret)
return ret;
@@ -3246,6 +3290,7 @@ ssize_t ib_uverbs_create_srq(struct ib_uverbs_file *file,
}
ssize_t ib_uverbs_create_xsrq(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
const char __user *buf, int in_len, int out_len)
{
struct ib_uverbs_create_xsrq cmd;
@@ -3263,7 +3308,7 @@ ssize_t ib_uverbs_create_xsrq(struct ib_uverbs_file *file,
(unsigned long) cmd.response + sizeof resp,
in_len - sizeof cmd, out_len - sizeof resp);
- ret = __uverbs_create_xsrq(file, &cmd, &udata);
+ ret = __uverbs_create_xsrq(file, ib_dev, &cmd, &udata);
if (ret)
return ret;
@@ -3271,6 +3316,7 @@ ssize_t ib_uverbs_create_xsrq(struct ib_uverbs_file *file,
}
ssize_t ib_uverbs_modify_srq(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
const char __user *buf, int in_len,
int out_len)
{
@@ -3301,6 +3347,7 @@ ssize_t ib_uverbs_modify_srq(struct ib_uverbs_file *file,
}
ssize_t ib_uverbs_query_srq(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
const char __user *buf,
int in_len, int out_len)
{
@@ -3341,6 +3388,7 @@ ssize_t ib_uverbs_query_srq(struct ib_uverbs_file *file,
}
ssize_t ib_uverbs_destroy_srq(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
const char __user *buf, int in_len,
int out_len)
{
@@ -3398,16 +3446,15 @@ ssize_t ib_uverbs_destroy_srq(struct ib_uverbs_file *file,
}
int ib_uverbs_ex_query_device(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
struct ib_udata *ucore,
struct ib_udata *uhw)
{
struct ib_uverbs_ex_query_device_resp resp;
struct ib_uverbs_ex_query_device cmd;
struct ib_device_attr attr;
- struct ib_device *device;
int err;
- device = file->device->ib_dev;
if (ucore->inlen < sizeof(cmd))
return -EINVAL;
@@ -3428,11 +3475,11 @@ int ib_uverbs_ex_query_device(struct ib_uverbs_file *file,
memset(&attr, 0, sizeof(attr));
- err = device->query_device(device, &attr, uhw);
+ err = ib_dev->query_device(ib_dev, &attr, uhw);
if (err)
return err;
- copy_query_dev_fields(file, &resp.base, &attr);
+ copy_query_dev_fields(file, ib_dev, &resp.base, &attr);
resp.comp_mask = 0;
if (ucore->outlen < resp.response_length + sizeof(resp.odp_caps))
diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c
index f6eef2da7097..c29a660c72fe 100644
--- a/drivers/infiniband/core/uverbs_main.c
+++ b/drivers/infiniband/core/uverbs_main.c
@@ -79,6 +79,7 @@ static DEFINE_SPINLOCK(map_lock);
static DECLARE_BITMAP(dev_map, IB_UVERBS_MAX_DEVICES);
static ssize_t (*uverbs_cmd_table[])(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
const char __user *buf, int in_len,
int out_len) = {
[IB_USER_VERBS_CMD_GET_CONTEXT] = ib_uverbs_get_context,
@@ -119,6 +120,7 @@ static ssize_t (*uverbs_cmd_table[])(struct ib_uverbs_file *file,
};
static int (*uverbs_ex_cmd_table[])(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
struct ib_udata *ucore,
struct ib_udata *uhw) = {
[IB_USER_VERBS_EX_CMD_CREATE_FLOW] = ib_uverbs_ex_create_flow,
@@ -128,16 +130,21 @@ static int (*uverbs_ex_cmd_table[])(struct ib_uverbs_file *file,
};
static void ib_uverbs_add_one(struct ib_device *device);
-static void ib_uverbs_remove_one(struct ib_device *device);
+static void ib_uverbs_remove_one(struct ib_device *device, void *client_data);
-static void ib_uverbs_release_dev(struct kref *ref)
+static void ib_uverbs_release_dev(struct kobject *kobj)
{
struct ib_uverbs_device *dev =
- container_of(ref, struct ib_uverbs_device, ref);
+ container_of(kobj, struct ib_uverbs_device, kobj);
- complete(&dev->comp);
+ cleanup_srcu_struct(&dev->disassociate_srcu);
+ kfree(dev);
}
+static struct kobj_type ib_uverbs_dev_ktype = {
+ .release = ib_uverbs_release_dev,
+};
+
static void ib_uverbs_release_event_file(struct kref *ref)
{
struct ib_uverbs_event_file *file =
@@ -201,9 +208,6 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file,
{
struct ib_uobject *uobj, *tmp;
- if (!context)
- return 0;
-
context->closing = 1;
list_for_each_entry_safe(uobj, tmp, &context->ah_list, list) {
@@ -303,13 +307,27 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file,
return context->device->dealloc_ucontext(context);
}
+static void ib_uverbs_comp_dev(struct ib_uverbs_device *dev)
+{
+ complete(&dev->comp);
+}
+
static void ib_uverbs_release_file(struct kref *ref)
{
struct ib_uverbs_file *file =
container_of(ref, struct ib_uverbs_file, ref);
+ struct ib_device *ib_dev;
+ int srcu_key;
+
+ srcu_key = srcu_read_lock(&file->device->disassociate_srcu);
+ ib_dev = srcu_dereference(file->device->ib_dev,
+ &file->device->disassociate_srcu);
+ if (ib_dev && !ib_dev->disassociate_ucontext)
+ module_put(ib_dev->owner);
+ srcu_read_unlock(&file->device->disassociate_srcu, srcu_key);
- module_put(file->device->ib_dev->owner);
- kref_put(&file->device->ref, ib_uverbs_release_dev);
+ if (atomic_dec_and_test(&file->device->refcount))
+ ib_uverbs_comp_dev(file->device);
kfree(file);
}
@@ -331,9 +349,19 @@ static ssize_t ib_uverbs_event_read(struct file *filp, char __user *buf,
return -EAGAIN;
if (wait_event_interruptible(file->poll_wait,
- !list_empty(&file->event_list)))
+ (!list_empty(&file->event_list) ||
+ /* The barriers built into wait_event_interruptible()
+ * and wake_up() guarentee this will see the null set
+ * without using RCU
+ */
+ !file->uverbs_file->device->ib_dev)))
return -ERESTARTSYS;
+ /* If device was disassociated and no event exists set an error */
+ if (list_empty(&file->event_list) &&
+ !file->uverbs_file->device->ib_dev)
+ return -EIO;
+
spin_lock_irq(&file->lock);
}
@@ -396,8 +424,11 @@ static int ib_uverbs_event_close(struct inode *inode, struct file *filp)
{
struct ib_uverbs_event_file *file = filp->private_data;
struct ib_uverbs_event *entry, *tmp;
+ int closed_already = 0;
+ mutex_lock(&file->uverbs_file->device->lists_mutex);
spin_lock_irq(&file->lock);
+ closed_already = file->is_closed;
file->is_closed = 1;
list_for_each_entry_safe(entry, tmp, &file->event_list, list) {
if (entry->counter)
@@ -405,11 +436,15 @@ static int ib_uverbs_event_close(struct inode *inode, struct file *filp)
kfree(entry);
}
spin_unlock_irq(&file->lock);
-
- if (file->is_async) {
- ib_unregister_event_handler(&file->uverbs_file->event_handler);
- kref_put(&file->uverbs_file->ref, ib_uverbs_release_file);
+ if (!closed_already) {
+ list_del(&file->list);
+ if (file->is_async)
+ ib_unregister_event_handler(&file->uverbs_file->
+ event_handler);
}
+ mutex_unlock(&file->uverbs_file->device->lists_mutex);
+
+ kref_put(&file->uverbs_file->ref, ib_uverbs_release_file);
kref_put(&file->ref, ib_uverbs_release_event_file);
return 0;
@@ -541,13 +576,21 @@ void ib_uverbs_event_handler(struct ib_event_handler *handler,
NULL, NULL);
}
+void ib_uverbs_free_async_event_file(struct ib_uverbs_file *file)
+{
+ kref_put(&file->async_file->ref, ib_uverbs_release_event_file);
+ file->async_file = NULL;
+}
+
struct file *ib_uverbs_alloc_event_file(struct ib_uverbs_file *uverbs_file,
+ struct ib_device *ib_dev,
int is_async)
{
struct ib_uverbs_event_file *ev_file;
struct file *filp;
+ int ret;
- ev_file = kmalloc(sizeof *ev_file, GFP_KERNEL);
+ ev_file = kzalloc(sizeof(*ev_file), GFP_KERNEL);
if (!ev_file)
return ERR_PTR(-ENOMEM);
@@ -556,16 +599,47 @@ struct file *ib_uverbs_alloc_event_file(struct ib_uverbs_file *uverbs_file,
INIT_LIST_HEAD(&ev_file->event_list);
init_waitqueue_head(&ev_file->poll_wait);
ev_file->uverbs_file = uverbs_file;
+ kref_get(&ev_file->uverbs_file->ref);
ev_file->async_queue = NULL;
- ev_file->is_async = is_async;
ev_file->is_closed = 0;
filp = anon_inode_getfile("[infinibandevent]", &uverbs_event_fops,
ev_file, O_RDONLY);
if (IS_ERR(filp))
- kfree(ev_file);
+ goto err_put_refs;
+
+ mutex_lock(&uverbs_file->device->lists_mutex);
+ list_add_tail(&ev_file->list,
+ &uverbs_file->device->uverbs_events_file_list);
+ mutex_unlock(&uverbs_file->device->lists_mutex);
+
+ if (is_async) {
+ WARN_ON(uverbs_file->async_file);
+ uverbs_file->async_file = ev_file;
+ kref_get(&uverbs_file->async_file->ref);
+ INIT_IB_EVENT_HANDLER(&uverbs_file->event_handler,
+ ib_dev,
+ ib_uverbs_event_handler);
+ ret = ib_register_event_handler(&uverbs_file->event_handler);
+ if (ret)
+ goto err_put_file;
+
+ /* At that point async file stuff was fully set */
+ ev_file->is_async = 1;
+ }
return filp;
+
+err_put_file:
+ fput(filp);
+ kref_put(&uverbs_file->async_file->ref, ib_uverbs_release_event_file);
+ uverbs_file->async_file = NULL;
+ return ERR_PTR(ret);
+
+err_put_refs:
+ kref_put(&ev_file->uverbs_file->ref, ib_uverbs_release_file);
+ kref_put(&ev_file->ref, ib_uverbs_release_event_file);
+ return filp;
}
/*
@@ -601,8 +675,11 @@ static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf,
size_t count, loff_t *pos)
{
struct ib_uverbs_file *file = filp->private_data;
+ struct ib_device *ib_dev;
struct ib_uverbs_cmd_hdr hdr;
__u32 flags;
+ int srcu_key;
+ ssize_t ret;
if (count < sizeof hdr)
return -EINVAL;
@@ -610,6 +687,14 @@ static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf,
if (copy_from_user(&hdr, buf, sizeof hdr))
return -EFAULT;
+ srcu_key = srcu_read_lock(&file->device->disassociate_srcu);
+ ib_dev = srcu_dereference(file->device->ib_dev,
+ &file->device->disassociate_srcu);
+ if (!ib_dev) {
+ ret = -EIO;
+ goto out;
+ }
+
flags = (hdr.command &
IB_USER_VERBS_CMD_FLAGS_MASK) >> IB_USER_VERBS_CMD_FLAGS_SHIFT;
@@ -617,26 +702,36 @@ static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf,
__u32 command;
if (hdr.command & ~(__u32)(IB_USER_VERBS_CMD_FLAGS_MASK |
- IB_USER_VERBS_CMD_COMMAND_MASK))
- return -EINVAL;
+ IB_USER_VERBS_CMD_COMMAND_MASK)) {
+ ret = -EINVAL;
+ goto out;
+ }
command = hdr.command & IB_USER_VERBS_CMD_COMMAND_MASK;
if (command >= ARRAY_SIZE(uverbs_cmd_table) ||
- !uverbs_cmd_table[command])
- return -EINVAL;
+ !uverbs_cmd_table[command]) {
+ ret = -EINVAL;
+ goto out;
+ }
if (!file->ucontext &&
- command != IB_USER_VERBS_CMD_GET_CONTEXT)
- return -EINVAL;
+ command != IB_USER_VERBS_CMD_GET_CONTEXT) {
+ ret = -EINVAL;
+ goto out;
+ }
- if (!(file->device->ib_dev->uverbs_cmd_mask & (1ull << command)))
- return -ENOSYS;
+ if (!(ib_dev->uverbs_cmd_mask & (1ull << command))) {
+ ret = -ENOSYS;
+ goto out;
+ }
- if (hdr.in_words * 4 != count)
- return -EINVAL;
+ if (hdr.in_words * 4 != count) {
+ ret = -EINVAL;
+ goto out;
+ }
- return uverbs_cmd_table[command](file,
+ ret = uverbs_cmd_table[command](file, ib_dev,
buf + sizeof(hdr),
hdr.in_words * 4,
hdr.out_words * 4);
@@ -647,51 +742,72 @@ static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf,
struct ib_uverbs_ex_cmd_hdr ex_hdr;
struct ib_udata ucore;
struct ib_udata uhw;
- int err;
size_t written_count = count;
if (hdr.command & ~(__u32)(IB_USER_VERBS_CMD_FLAGS_MASK |
- IB_USER_VERBS_CMD_COMMAND_MASK))
- return -EINVAL;
+ IB_USER_VERBS_CMD_COMMAND_MASK)) {
+ ret = -EINVAL;
+ goto out;
+ }
command = hdr.command & IB_USER_VERBS_CMD_COMMAND_MASK;
if (command >= ARRAY_SIZE(uverbs_ex_cmd_table) ||
- !uverbs_ex_cmd_table[command])
- return -ENOSYS;
+ !uverbs_ex_cmd_table[command]) {
+ ret = -ENOSYS;
+ goto out;
+ }
- if (!file->ucontext)
- return -EINVAL;
+ if (!file->ucontext) {
+ ret = -EINVAL;
+ goto out;
+ }
- if (!(file->device->ib_dev->uverbs_ex_cmd_mask & (1ull << command)))
- return -ENOSYS;
+ if (!(ib_dev->uverbs_ex_cmd_mask & (1ull << command))) {
+ ret = -ENOSYS;
+ goto out;
+ }
- if (count < (sizeof(hdr) + sizeof(ex_hdr)))
- return -EINVAL;
+ if (count < (sizeof(hdr) + sizeof(ex_hdr))) {
+ ret = -EINVAL;
+ goto out;
+ }
- if (copy_from_user(&ex_hdr, buf + sizeof(hdr), sizeof(ex_hdr)))
- return -EFAULT;
+ if (copy_from_user(&ex_hdr, buf + sizeof(hdr), sizeof(ex_hdr))) {
+ ret = -EFAULT;
+ goto out;
+ }
count -= sizeof(hdr) + sizeof(ex_hdr);
buf += sizeof(hdr) + sizeof(ex_hdr);
- if ((hdr.in_words + ex_hdr.provider_in_words) * 8 != count)
- return -EINVAL;
+ if ((hdr.in_words + ex_hdr.provider_in_words) * 8 != count) {
+ ret = -EINVAL;
+ goto out;
+ }
- if (ex_hdr.cmd_hdr_reserved)
- return -EINVAL;
+ if (ex_hdr.cmd_hdr_reserved) {
+ ret = -EINVAL;
+ goto out;
+ }
if (ex_hdr.response) {
- if (!hdr.out_words && !ex_hdr.provider_out_words)
- return -EINVAL;
+ if (!hdr.out_words && !ex_hdr.provider_out_words) {
+ ret = -EINVAL;
+ goto out;
+ }
if (!access_ok(VERIFY_WRITE,
(void __user *) (unsigned long) ex_hdr.response,
- (hdr.out_words + ex_hdr.provider_out_words) * 8))
- return -EFAULT;
+ (hdr.out_words + ex_hdr.provider_out_words) * 8)) {
+ ret = -EFAULT;
+ goto out;
+ }
} else {
- if (hdr.out_words || ex_hdr.provider_out_words)
- return -EINVAL;
+ if (hdr.out_words || ex_hdr.provider_out_words) {
+ ret = -EINVAL;
+ goto out;
+ }
}
INIT_UDATA_BUF_OR_NULL(&ucore, buf, (unsigned long) ex_hdr.response,
@@ -703,27 +819,43 @@ static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf,
ex_hdr.provider_in_words * 8,
ex_hdr.provider_out_words * 8);
- err = uverbs_ex_cmd_table[command](file,
+ ret = uverbs_ex_cmd_table[command](file,
+ ib_dev,
&ucore,
&uhw);
-
- if (err)
- return err;
-
- return written_count;
+ if (!ret)
+ ret = written_count;
+ } else {
+ ret = -ENOSYS;
}
- return -ENOSYS;
+out:
+ srcu_read_unlock(&file->device->disassociate_srcu, srcu_key);
+ return ret;
}
static int ib_uverbs_mmap(struct file *filp, struct vm_area_struct *vma)
{
struct ib_uverbs_file *file = filp->private_data;
+ struct ib_device *ib_dev;
+ int ret = 0;
+ int srcu_key;
+
+ srcu_key = srcu_read_lock(&file->device->disassociate_srcu);
+ ib_dev = srcu_dereference(file->device->ib_dev,
+ &file->device->disassociate_srcu);
+ if (!ib_dev) {
+ ret = -EIO;
+ goto out;
+ }
if (!file->ucontext)
- return -ENODEV;
+ ret = -ENODEV;
else
- return file->device->ib_dev->mmap(file->ucontext, vma);
+ ret = ib_dev->mmap(file->ucontext, vma);
+out:
+ srcu_read_unlock(&file->device->disassociate_srcu, srcu_key);
+ return ret;
}
/*
@@ -740,23 +872,43 @@ static int ib_uverbs_open(struct inode *inode, struct file *filp)
{
struct ib_uverbs_device *dev;
struct ib_uverbs_file *file;
+ struct ib_device *ib_dev;
int ret;
+ int module_dependent;
+ int srcu_key;
dev = container_of(inode->i_cdev, struct ib_uverbs_device, cdev);
- if (dev)
- kref_get(&dev->ref);
- else
+ if (!atomic_inc_not_zero(&dev->refcount))
return -ENXIO;
- if (!try_module_get(dev->ib_dev->owner)) {
- ret = -ENODEV;
+ srcu_key = srcu_read_lock(&dev->disassociate_srcu);
+ mutex_lock(&dev->lists_mutex);
+ ib_dev = srcu_dereference(dev->ib_dev,
+ &dev->disassociate_srcu);
+ if (!ib_dev) {
+ ret = -EIO;
goto err;
}
- file = kmalloc(sizeof *file, GFP_KERNEL);
+ /* In case IB device supports disassociate ucontext, there is no hard
+ * dependency between uverbs device and its low level device.
+ */
+ module_dependent = !(ib_dev->disassociate_ucontext);
+
+ if (module_dependent) {
+ if (!try_module_get(ib_dev->owner)) {
+ ret = -ENODEV;
+ goto err;
+ }
+ }
+
+ file = kzalloc(sizeof(*file), GFP_KERNEL);
if (!file) {
ret = -ENOMEM;
- goto err_module;
+ if (module_dependent)
+ goto err_module;
+
+ goto err;
}
file->device = dev;
@@ -766,27 +918,47 @@ static int ib_uverbs_open(struct inode *inode, struct file *filp)
mutex_init(&file->mutex);
filp->private_data = file;
+ kobject_get(&dev->kobj);
+ list_add_tail(&file->list, &dev->uverbs_file_list);
+ mutex_unlock(&dev->lists_mutex);
+ srcu_read_unlock(&dev->disassociate_srcu, srcu_key);
return nonseekable_open(inode, filp);
err_module:
- module_put(dev->ib_dev->owner);
+ module_put(ib_dev->owner);
err:
- kref_put(&dev->ref, ib_uverbs_release_dev);
+ mutex_unlock(&dev->lists_mutex);
+ srcu_read_unlock(&dev->disassociate_srcu, srcu_key);
+ if (atomic_dec_and_test(&dev->refcount))
+ ib_uverbs_comp_dev(dev);
+
return ret;
}
static int ib_uverbs_close(struct inode *inode, struct file *filp)
{
struct ib_uverbs_file *file = filp->private_data;
-
- ib_uverbs_cleanup_ucontext(file, file->ucontext);
+ struct ib_uverbs_device *dev = file->device;
+ struct ib_ucontext *ucontext = NULL;
+
+ mutex_lock(&file->device->lists_mutex);
+ ucontext = file->ucontext;
+ file->ucontext = NULL;
+ if (!file->is_closed) {
+ list_del(&file->list);
+ file->is_closed = 1;
+ }
+ mutex_unlock(&file->device->lists_mutex);
+ if (ucontext)
+ ib_uverbs_cleanup_ucontext(file, ucontext);
if (file->async_file)
kref_put(&file->async_file->ref, ib_uverbs_release_event_file);
kref_put(&file->ref, ib_uverbs_release_file);
+ kobject_put(&dev->kobj);
return 0;
}
@@ -817,12 +989,21 @@ static struct ib_client uverbs_client = {
static ssize_t show_ibdev(struct device *device, struct device_attribute *attr,
char *buf)
{
+ int ret = -ENODEV;
+ int srcu_key;
struct ib_uverbs_device *dev = dev_get_drvdata(device);
+ struct ib_device *ib_dev;
if (!dev)
return -ENODEV;
- return sprintf(buf, "%s\n", dev->ib_dev->name);
+ srcu_key = srcu_read_lock(&dev->disassociate_srcu);
+ ib_dev = srcu_dereference(dev->ib_dev, &dev->disassociate_srcu);
+ if (ib_dev)
+ ret = sprintf(buf, "%s\n", ib_dev->name);
+ srcu_read_unlock(&dev->disassociate_srcu, srcu_key);
+
+ return ret;
}
static DEVICE_ATTR(ibdev, S_IRUGO, show_ibdev, NULL);
@@ -830,11 +1011,19 @@ static ssize_t show_dev_abi_version(struct device *device,
struct device_attribute *attr, char *buf)
{
struct ib_uverbs_device *dev = dev_get_drvdata(device);
+ int ret = -ENODEV;
+ int srcu_key;
+ struct ib_device *ib_dev;
if (!dev)
return -ENODEV;
+ srcu_key = srcu_read_lock(&dev->disassociate_srcu);
+ ib_dev = srcu_dereference(dev->ib_dev, &dev->disassociate_srcu);
+ if (ib_dev)
+ ret = sprintf(buf, "%d\n", ib_dev->uverbs_abi_ver);
+ srcu_read_unlock(&dev->disassociate_srcu, srcu_key);
- return sprintf(buf, "%d\n", dev->ib_dev->uverbs_abi_ver);
+ return ret;
}
static DEVICE_ATTR(abi_version, S_IRUGO, show_dev_abi_version, NULL);
@@ -874,6 +1063,7 @@ static void ib_uverbs_add_one(struct ib_device *device)
int devnum;
dev_t base;
struct ib_uverbs_device *uverbs_dev;
+ int ret;
if (!device->alloc_ucontext)
return;
@@ -882,10 +1072,20 @@ static void ib_uverbs_add_one(struct ib_device *device)
if (!uverbs_dev)
return;
- kref_init(&uverbs_dev->ref);
+ ret = init_srcu_struct(&uverbs_dev->disassociate_srcu);
+ if (ret) {
+ kfree(uverbs_dev);
+ return;
+ }
+
+ atomic_set(&uverbs_dev->refcount, 1);
init_completion(&uverbs_dev->comp);
uverbs_dev->xrcd_tree = RB_ROOT;
mutex_init(&uverbs_dev->xrcd_tree_mutex);
+ kobject_init(&uverbs_dev->kobj, &ib_uverbs_dev_ktype);
+ mutex_init(&uverbs_dev->lists_mutex);
+ INIT_LIST_HEAD(&uverbs_dev->uverbs_file_list);
+ INIT_LIST_HEAD(&uverbs_dev->uverbs_events_file_list);
spin_lock(&map_lock);
devnum = find_first_zero_bit(dev_map, IB_UVERBS_MAX_DEVICES);
@@ -906,12 +1106,13 @@ static void ib_uverbs_add_one(struct ib_device *device)
}
spin_unlock(&map_lock);
- uverbs_dev->ib_dev = device;
+ rcu_assign_pointer(uverbs_dev->ib_dev, device);
uverbs_dev->num_comp_vectors = device->num_comp_vectors;
cdev_init(&uverbs_dev->cdev, NULL);
uverbs_dev->cdev.owner = THIS_MODULE;
uverbs_dev->cdev.ops = device->mmap ? &uverbs_mmap_fops : &uverbs_fops;
+ uverbs_dev->cdev.kobj.parent = &uverbs_dev->kobj;
kobject_set_name(&uverbs_dev->cdev.kobj, "uverbs%d", uverbs_dev->devnum);
if (cdev_add(&uverbs_dev->cdev, base, 1))
goto err_cdev;
@@ -942,15 +1143,79 @@ err_cdev:
clear_bit(devnum, overflow_map);
err:
- kref_put(&uverbs_dev->ref, ib_uverbs_release_dev);
+ if (atomic_dec_and_test(&uverbs_dev->refcount))
+ ib_uverbs_comp_dev(uverbs_dev);
wait_for_completion(&uverbs_dev->comp);
- kfree(uverbs_dev);
+ kobject_put(&uverbs_dev->kobj);
return;
}
-static void ib_uverbs_remove_one(struct ib_device *device)
+static void ib_uverbs_free_hw_resources(struct ib_uverbs_device *uverbs_dev,
+ struct ib_device *ib_dev)
{
- struct ib_uverbs_device *uverbs_dev = ib_get_client_data(device, &uverbs_client);
+ struct ib_uverbs_file *file;
+ struct ib_uverbs_event_file *event_file;
+ struct ib_event event;
+
+ /* Pending running commands to terminate */
+ synchronize_srcu(&uverbs_dev->disassociate_srcu);
+ event.event = IB_EVENT_DEVICE_FATAL;
+ event.element.port_num = 0;
+ event.device = ib_dev;
+
+ mutex_lock(&uverbs_dev->lists_mutex);
+ while (!list_empty(&uverbs_dev->uverbs_file_list)) {
+ struct ib_ucontext *ucontext;
+
+ file = list_first_entry(&uverbs_dev->uverbs_file_list,
+ struct ib_uverbs_file, list);
+ file->is_closed = 1;
+ ucontext = file->ucontext;
+ list_del(&file->list);
+ file->ucontext = NULL;
+ kref_get(&file->ref);
+ mutex_unlock(&uverbs_dev->lists_mutex);
+ /* We must release the mutex before going ahead and calling
+ * disassociate_ucontext. disassociate_ucontext might end up
+ * indirectly calling uverbs_close, for example due to freeing
+ * the resources (e.g mmput).
+ */
+ ib_uverbs_event_handler(&file->event_handler, &event);
+ if (ucontext) {
+ ib_dev->disassociate_ucontext(ucontext);
+ ib_uverbs_cleanup_ucontext(file, ucontext);
+ }
+
+ mutex_lock(&uverbs_dev->lists_mutex);
+ kref_put(&file->ref, ib_uverbs_release_file);
+ }
+
+ while (!list_empty(&uverbs_dev->uverbs_events_file_list)) {
+ event_file = list_first_entry(&uverbs_dev->
+ uverbs_events_file_list,
+ struct ib_uverbs_event_file,
+ list);
+ spin_lock_irq(&event_file->lock);
+ event_file->is_closed = 1;
+ spin_unlock_irq(&event_file->lock);
+
+ list_del(&event_file->list);
+ if (event_file->is_async) {
+ ib_unregister_event_handler(&event_file->uverbs_file->
+ event_handler);
+ event_file->uverbs_file->event_handler.device = NULL;
+ }
+
+ wake_up_interruptible(&event_file->poll_wait);
+ kill_fasync(&event_file->async_queue, SIGIO, POLL_IN);
+ }
+ mutex_unlock(&uverbs_dev->lists_mutex);
+}
+
+static void ib_uverbs_remove_one(struct ib_device *device, void *client_data)
+{
+ struct ib_uverbs_device *uverbs_dev = client_data;
+ int wait_clients = 1;
if (!uverbs_dev)
return;
@@ -964,9 +1229,28 @@ static void ib_uverbs_remove_one(struct ib_device *device)
else
clear_bit(uverbs_dev->devnum - IB_UVERBS_MAX_DEVICES, overflow_map);
- kref_put(&uverbs_dev->ref, ib_uverbs_release_dev);
- wait_for_completion(&uverbs_dev->comp);
- kfree(uverbs_dev);
+ if (device->disassociate_ucontext) {
+ /* We disassociate HW resources and immediately return.
+ * Userspace will see a EIO errno for all future access.
+ * Upon returning, ib_device may be freed internally and is not
+ * valid any more.
+ * uverbs_device is still available until all clients close
+ * their files, then the uverbs device ref count will be zero
+ * and its resources will be freed.
+ * Note: At this point no more files can be opened since the
+ * cdev was deleted, however active clients can still issue
+ * commands and close their open files.
+ */
+ rcu_assign_pointer(uverbs_dev->ib_dev, NULL);
+ ib_uverbs_free_hw_resources(uverbs_dev, device);
+ wait_clients = 0;
+ }
+
+ if (atomic_dec_and_test(&uverbs_dev->refcount))
+ ib_uverbs_comp_dev(uverbs_dev);
+ if (wait_clients)
+ wait_for_completion(&uverbs_dev->comp);
+ kobject_put(&uverbs_dev->kobj);
}
static char *uverbs_devnode(struct device *dev, umode_t *mode)
diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c
index 30eb2457000c..e1f2c9887f3f 100644
--- a/drivers/infiniband/core/verbs.c
+++ b/drivers/infiniband/core/verbs.c
@@ -213,28 +213,79 @@ EXPORT_SYMBOL(rdma_port_get_link_layer);
/* Protection domains */
+/**
+ * ib_alloc_pd - Allocates an unused protection domain.
+ * @device: The device on which to allocate the protection domain.
+ *
+ * A protection domain object provides an association between QPs, shared
+ * receive queues, address handles, memory regions, and memory windows.
+ *
+ * Every PD has a local_dma_lkey which can be used as the lkey value for local
+ * memory operations.
+ */
struct ib_pd *ib_alloc_pd(struct ib_device *device)
{
struct ib_pd *pd;
+ struct ib_device_attr devattr;
+ int rc;
+
+ rc = ib_query_device(device, &devattr);
+ if (rc)
+ return ERR_PTR(rc);
pd = device->alloc_pd(device, NULL, NULL);
+ if (IS_ERR(pd))
+ return pd;
+
+ pd->device = device;
+ pd->uobject = NULL;
+ pd->local_mr = NULL;
+ atomic_set(&pd->usecnt, 0);
+
+ if (devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY)
+ pd->local_dma_lkey = device->local_dma_lkey;
+ else {
+ struct ib_mr *mr;
+
+ mr = ib_get_dma_mr(pd, IB_ACCESS_LOCAL_WRITE);
+ if (IS_ERR(mr)) {
+ ib_dealloc_pd(pd);
+ return (struct ib_pd *)mr;
+ }
- if (!IS_ERR(pd)) {
- pd->device = device;
- pd->uobject = NULL;
- atomic_set(&pd->usecnt, 0);
+ pd->local_mr = mr;
+ pd->local_dma_lkey = pd->local_mr->lkey;
}
-
return pd;
}
EXPORT_SYMBOL(ib_alloc_pd);
-int ib_dealloc_pd(struct ib_pd *pd)
+/**
+ * ib_dealloc_pd - Deallocates a protection domain.
+ * @pd: The protection domain to deallocate.
+ *
+ * It is an error to call this function while any resources in the pd still
+ * exist. The caller is responsible to synchronously destroy them and
+ * guarantee no new allocations will happen.
+ */
+void ib_dealloc_pd(struct ib_pd *pd)
{
- if (atomic_read(&pd->usecnt))
- return -EBUSY;
+ int ret;
+
+ if (pd->local_mr) {
+ ret = ib_dereg_mr(pd->local_mr);
+ WARN_ON(ret);
+ pd->local_mr = NULL;
+ }
- return pd->device->dealloc_pd(pd);
+ /* uverbs manipulates usecnt with proper locking, while the kabi
+ requires the caller to guarantee we can't race here. */
+ WARN_ON(atomic_read(&pd->usecnt));
+
+ /* Making delalloc_pd a void return is a WIP, no driver should return
+ an error here. */
+ ret = pd->device->dealloc_pd(pd);
+ WARN_ONCE(ret, "Infiniband HW driver failed dealloc_pd");
}
EXPORT_SYMBOL(ib_dealloc_pd);
@@ -1168,54 +1219,28 @@ int ib_dereg_mr(struct ib_mr *mr)
}
EXPORT_SYMBOL(ib_dereg_mr);
-struct ib_mr *ib_create_mr(struct ib_pd *pd,
- struct ib_mr_init_attr *mr_init_attr)
-{
- struct ib_mr *mr;
-
- if (!pd->device->create_mr)
- return ERR_PTR(-ENOSYS);
-
- mr = pd->device->create_mr(pd, mr_init_attr);
-
- if (!IS_ERR(mr)) {
- mr->device = pd->device;
- mr->pd = pd;
- mr->uobject = NULL;
- atomic_inc(&pd->usecnt);
- atomic_set(&mr->usecnt, 0);
- }
-
- return mr;
-}
-EXPORT_SYMBOL(ib_create_mr);
-
-int ib_destroy_mr(struct ib_mr *mr)
-{
- struct ib_pd *pd;
- int ret;
-
- if (atomic_read(&mr->usecnt))
- return -EBUSY;
-
- pd = mr->pd;
- ret = mr->device->destroy_mr(mr);
- if (!ret)
- atomic_dec(&pd->usecnt);
-
- return ret;
-}
-EXPORT_SYMBOL(ib_destroy_mr);
-
-struct ib_mr *ib_alloc_fast_reg_mr(struct ib_pd *pd, int max_page_list_len)
+/**
+ * ib_alloc_mr() - Allocates a memory region
+ * @pd: protection domain associated with the region
+ * @mr_type: memory region type
+ * @max_num_sg: maximum sg entries available for registration.
+ *
+ * Notes:
+ * Memory registeration page/sg lists must not exceed max_num_sg.
+ * For mr_type IB_MR_TYPE_MEM_REG, the total length cannot exceed
+ * max_num_sg * used_page_size.
+ *
+ */
+struct ib_mr *ib_alloc_mr(struct ib_pd *pd,
+ enum ib_mr_type mr_type,
+ u32 max_num_sg)
{
struct ib_mr *mr;
- if (!pd->device->alloc_fast_reg_mr)
+ if (!pd->device->alloc_mr)
return ERR_PTR(-ENOSYS);
- mr = pd->device->alloc_fast_reg_mr(pd, max_page_list_len);
-
+ mr = pd->device->alloc_mr(pd, mr_type, max_num_sg);
if (!IS_ERR(mr)) {
mr->device = pd->device;
mr->pd = pd;
@@ -1226,7 +1251,7 @@ struct ib_mr *ib_alloc_fast_reg_mr(struct ib_pd *pd, int max_page_list_len)
return mr;
}
-EXPORT_SYMBOL(ib_alloc_fast_reg_mr);
+EXPORT_SYMBOL(ib_alloc_mr);
struct ib_fast_reg_page_list *ib_alloc_fast_reg_page_list(struct ib_device *device,
int max_page_list_len)