aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/infiniband
diff options
context:
space:
mode:
authorJason Gunthorpe <jgg@mellanox.com>2019-08-21 14:10:36 -0300
committerJason Gunthorpe <jgg@mellanox.com>2019-08-21 14:10:36 -0300
commit868df536f5e84672c3e002b949e0e44f97cb0f09 (patch)
treef76da5f6d06125b6d91c75ebfe4079ec9e2f958a /drivers/infiniband
parentRDMA: Delete DEBUG code (diff)
parentRDMA/mlx5: Use odp instead of mr->umem in pagefault_mr (diff)
downloadlinux-dev-868df536f5e84672c3e002b949e0e44f97cb0f09.tar.xz
linux-dev-868df536f5e84672c3e002b949e0e44f97cb0f09.zip
Merge branch 'odp_fixes' into rdma.git for-next
Jason Gunthorpe says: ==================== This is a collection of general cleanups for ODP to clarify some of the flows around umem creation and use of the interval tree. ==================== The branch is based on v5.3-rc5 due to dependencies * odp_fixes: RDMA/mlx5: Use odp instead of mr->umem in pagefault_mr RDMA/mlx5: Use ib_umem_start instead of umem.address RDMA/core: Make invalidate_range a device operation RDMA/odp: Use kvcalloc for the dma_list and page_list RDMA/odp: Check for overflow when computing the umem_odp end RDMA/odp: Provide ib_umem_odp_release() to undo the allocs RDMA/odp: Split creating a umem_odp from ib_umem_get RDMA/odp: Make the three ways to create a umem_odp clear RMDA/odp: Consolidate umem_odp initialization RDMA/odp: Make it clearer when a umem is an implicit ODP umem RDMA/odp: Iterate over the whole rbtree directly RDMA/odp: Use the common interval tree library instead of generic RDMA/mlx5: Fix MR npages calculation for IB_ACCESS_HUGETLB Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
Diffstat (limited to 'drivers/infiniband')
-rw-r--r--drivers/infiniband/Kconfig1
-rw-r--r--drivers/infiniband/core/core_priv.h5
-rw-r--r--drivers/infiniband/core/counters.c6
-rw-r--r--drivers/infiniband/core/device.c103
-rw-r--r--drivers/infiniband/core/mad.c20
-rw-r--r--drivers/infiniband/core/nldev.c8
-rw-r--r--drivers/infiniband/core/umem.c57
-rw-r--r--drivers/infiniband/core/umem_odp.c441
-rw-r--r--drivers/infiniband/core/user_mad.c6
-rw-r--r--drivers/infiniband/core/uverbs_cmd.c2
-rw-r--r--drivers/infiniband/hw/hfi1/verbs.c2
-rw-r--r--drivers/infiniband/hw/hns/hns_roce_hw_v1.c4
-rw-r--r--drivers/infiniband/hw/mlx5/devx.c11
-rw-r--r--drivers/infiniband/hw/mlx5/main.c11
-rw-r--r--drivers/infiniband/hw/mlx5/mem.c12
-rw-r--r--drivers/infiniband/hw/mlx5/mr.c65
-rw-r--r--drivers/infiniband/hw/mlx5/odp.c112
-rw-r--r--drivers/infiniband/sw/siw/Kconfig2
-rw-r--r--drivers/infiniband/sw/siw/siw.h2
-rw-r--r--drivers/infiniband/sw/siw/siw_main.c4
-rw-r--r--drivers/infiniband/sw/siw/siw_qp.c14
-rw-r--r--drivers/infiniband/sw/siw/siw_verbs.c16
22 files changed, 485 insertions, 419 deletions
diff --git a/drivers/infiniband/Kconfig b/drivers/infiniband/Kconfig
index 85e103b147cc..b44b1c322ec8 100644
--- a/drivers/infiniband/Kconfig
+++ b/drivers/infiniband/Kconfig
@@ -55,6 +55,7 @@ config INFINIBAND_ON_DEMAND_PAGING
bool "InfiniBand on-demand paging support"
depends on INFINIBAND_USER_MEM
select MMU_NOTIFIER
+ select INTERVAL_TREE
default y
---help---
On demand paging support for the InfiniBand subsystem.
diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h
index 589ed805e0ad..3a8b0911c3bc 100644
--- a/drivers/infiniband/core/core_priv.h
+++ b/drivers/infiniband/core/core_priv.h
@@ -321,7 +321,9 @@ static inline struct ib_qp *_ib_create_qp(struct ib_device *dev,
struct ib_udata *udata,
struct ib_uobject *uobj)
{
+ enum ib_qp_type qp_type = attr->qp_type;
struct ib_qp *qp;
+ bool is_xrc;
if (!dev->ops.create_qp)
return ERR_PTR(-EOPNOTSUPP);
@@ -339,7 +341,8 @@ static inline struct ib_qp *_ib_create_qp(struct ib_device *dev,
* and more importantly they are created internaly by driver,
* see mlx5 create_dev_resources() as an example.
*/
- if (attr->qp_type < IB_QPT_XRC_INI) {
+ is_xrc = qp_type == IB_QPT_XRC_INI || qp_type == IB_QPT_XRC_TGT;
+ if ((qp_type < IB_QPT_MAX && !is_xrc) || qp_type == IB_QPT_DRIVER) {
qp->res.type = RDMA_RESTRACK_QP;
if (uobj)
rdma_restrack_uadd(&qp->res);
diff --git a/drivers/infiniband/core/counters.c b/drivers/infiniband/core/counters.c
index d60416f0bf3a..61fcb3a31340 100644
--- a/drivers/infiniband/core/counters.c
+++ b/drivers/infiniband/core/counters.c
@@ -38,6 +38,9 @@ int rdma_counter_set_auto_mode(struct ib_device *dev, u8 port,
int ret;
port_counter = &dev->port_data[port].port_counter;
+ if (!port_counter->hstats)
+ return -EOPNOTSUPP;
+
mutex_lock(&port_counter->lock);
if (on) {
ret = __counter_set_mode(&port_counter->mode,
@@ -509,6 +512,9 @@ int rdma_counter_bind_qpn_alloc(struct ib_device *dev, u8 port,
if (!rdma_is_port_valid(dev, port))
return -EINVAL;
+ if (!dev->port_data[port].port_counter.hstats)
+ return -EOPNOTSUPP;
+
qp = rdma_counter_get_qp(dev, qp_num);
if (!qp)
return -ENOENT;
diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
index 8892862fb759..99c4a55545cf 100644
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -93,11 +93,17 @@ static DEFINE_XARRAY_FLAGS(devices, XA_FLAGS_ALLOC);
static DECLARE_RWSEM(devices_rwsem);
#define DEVICE_REGISTERED XA_MARK_1
-static LIST_HEAD(client_list);
+static u32 highest_client_id;
#define CLIENT_REGISTERED XA_MARK_1
static DEFINE_XARRAY_FLAGS(clients, XA_FLAGS_ALLOC);
static DECLARE_RWSEM(clients_rwsem);
+static void ib_client_put(struct ib_client *client)
+{
+ if (refcount_dec_and_test(&client->uses))
+ complete(&client->uses_zero);
+}
+
/*
* If client_data is registered then the corresponding client must also still
* be registered.
@@ -653,6 +659,14 @@ static int add_client_context(struct ib_device *device,
down_write(&device->client_data_rwsem);
/*
+ * So long as the client is registered hold both the client and device
+ * unregistration locks.
+ */
+ if (!refcount_inc_not_zero(&client->uses))
+ goto out_unlock;
+ refcount_inc(&device->refcount);
+
+ /*
* Another caller to add_client_context got here first and has already
* completely initialized context.
*/
@@ -675,6 +689,9 @@ static int add_client_context(struct ib_device *device,
return 0;
out:
+ ib_device_put(device);
+ ib_client_put(client);
+out_unlock:
up_write(&device->client_data_rwsem);
return ret;
}
@@ -694,7 +711,7 @@ static void remove_client_context(struct ib_device *device,
client_data = xa_load(&device->client_data, client_id);
xa_clear_mark(&device->client_data, client_id, CLIENT_DATA_REGISTERED);
client = xa_load(&clients, client_id);
- downgrade_write(&device->client_data_rwsem);
+ up_write(&device->client_data_rwsem);
/*
* Notice we cannot be holding any exclusive locks when calling the
@@ -704,17 +721,13 @@ static void remove_client_context(struct ib_device *device,
*
* For this reason clients and drivers should not call the
* unregistration functions will holdling any locks.
- *
- * It tempting to drop the client_data_rwsem too, but this is required
- * to ensure that unregister_client does not return until all clients
- * are completely unregistered, which is required to avoid module
- * unloading races.
*/
if (client->remove)
client->remove(device, client_data);
xa_erase(&device->client_data, client_id);
- up_read(&device->client_data_rwsem);
+ ib_device_put(device);
+ ib_client_put(client);
}
static int alloc_port_data(struct ib_device *device)
@@ -1223,7 +1236,7 @@ static int setup_device(struct ib_device *device)
static void disable_device(struct ib_device *device)
{
- struct ib_client *client;
+ u32 cid;
WARN_ON(!refcount_read(&device->refcount));
@@ -1231,10 +1244,19 @@ static void disable_device(struct ib_device *device)
xa_clear_mark(&devices, device->index, DEVICE_REGISTERED);
up_write(&devices_rwsem);
+ /*
+ * Remove clients in LIFO order, see assign_client_id. This could be
+ * more efficient if xarray learns to reverse iterate. Since no new
+ * clients can be added to this ib_device past this point we only need
+ * the maximum possible client_id value here.
+ */
down_read(&clients_rwsem);
- list_for_each_entry_reverse(client, &client_list, list)
- remove_client_context(device, client->client_id);
+ cid = highest_client_id;
up_read(&clients_rwsem);
+ while (cid) {
+ cid--;
+ remove_client_context(device, cid);
+ }
/* Pairs with refcount_set in enable_device */
ib_device_put(device);
@@ -1661,30 +1683,31 @@ static int assign_client_id(struct ib_client *client)
/*
* The add/remove callbacks must be called in FIFO/LIFO order. To
* achieve this we assign client_ids so they are sorted in
- * registration order, and retain a linked list we can reverse iterate
- * to get the LIFO order. The extra linked list can go away if xarray
- * learns to reverse iterate.
+ * registration order.
*/
- if (list_empty(&client_list)) {
- client->client_id = 0;
- } else {
- struct ib_client *last;
-
- last = list_last_entry(&client_list, struct ib_client, list);
- client->client_id = last->client_id + 1;
- }
+ client->client_id = highest_client_id;
ret = xa_insert(&clients, client->client_id, client, GFP_KERNEL);
if (ret)
goto out;
+ highest_client_id++;
xa_set_mark(&clients, client->client_id, CLIENT_REGISTERED);
- list_add_tail(&client->list, &client_list);
out:
up_write(&clients_rwsem);
return ret;
}
+static void remove_client_id(struct ib_client *client)
+{
+ down_write(&clients_rwsem);
+ xa_erase(&clients, client->client_id);
+ for (; highest_client_id; highest_client_id--)
+ if (xa_load(&clients, highest_client_id - 1))
+ break;
+ up_write(&clients_rwsem);
+}
+
/**
* ib_register_client - Register an IB client
* @client:Client to register
@@ -1704,6 +1727,8 @@ int ib_register_client(struct ib_client *client)
unsigned long index;
int ret;
+ refcount_set(&client->uses, 1);
+ init_completion(&client->uses_zero);
ret = assign_client_id(client);
if (ret)
return ret;
@@ -1739,21 +1764,30 @@ void ib_unregister_client(struct ib_client *client)
unsigned long index;
down_write(&clients_rwsem);
+ ib_client_put(client);
xa_clear_mark(&clients, client->client_id, CLIENT_REGISTERED);
up_write(&clients_rwsem);
- /*
- * Every device still known must be serialized to make sure we are
- * done with the client callbacks before we return.
- */
- down_read(&devices_rwsem);
- xa_for_each (&devices, index, device)
+
+ /* We do not want to have locks while calling client->remove() */
+ rcu_read_lock();
+ xa_for_each (&devices, index, device) {
+ if (!ib_device_try_get(device))
+ continue;
+ rcu_read_unlock();
+
remove_client_context(device, client->client_id);
- up_read(&devices_rwsem);
- down_write(&clients_rwsem);
- list_del(&client->list);
- xa_erase(&clients, client->client_id);
- up_write(&clients_rwsem);
+ ib_device_put(device);
+ rcu_read_lock();
+ }
+ rcu_read_unlock();
+
+ /*
+ * remove_client_context() is not a fence, it can return even though a
+ * removal is ongoing. Wait until all removals are completed.
+ */
+ wait_for_completion(&client->uses_zero);
+ remove_client_id(client);
}
EXPORT_SYMBOL(ib_unregister_client);
@@ -2582,6 +2616,7 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops)
SET_DEVICE_OP(dev_ops, get_vf_config);
SET_DEVICE_OP(dev_ops, get_vf_stats);
SET_DEVICE_OP(dev_ops, init_port);
+ SET_DEVICE_OP(dev_ops, invalidate_range);
SET_DEVICE_OP(dev_ops, iw_accept);
SET_DEVICE_OP(dev_ops, iw_add_ref);
SET_DEVICE_OP(dev_ops, iw_connect);
diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c
index cc99479b2c09..9947d16edef2 100644
--- a/drivers/infiniband/core/mad.c
+++ b/drivers/infiniband/core/mad.c
@@ -3224,18 +3224,18 @@ static int ib_mad_port_open(struct ib_device *device,
if (has_smi)
cq_size *= 2;
+ port_priv->pd = ib_alloc_pd(device, 0);
+ if (IS_ERR(port_priv->pd)) {
+ dev_err(&device->dev, "Couldn't create ib_mad PD\n");
+ ret = PTR_ERR(port_priv->pd);
+ goto error3;
+ }
+
port_priv->cq = ib_alloc_cq(port_priv->device, port_priv, cq_size, 0,
IB_POLL_UNBOUND_WORKQUEUE);
if (IS_ERR(port_priv->cq)) {
dev_err(&device->dev, "Couldn't create ib_mad CQ\n");
ret = PTR_ERR(port_priv->cq);
- goto error3;
- }
-
- port_priv->pd = ib_alloc_pd(device, 0);
- if (IS_ERR(port_priv->pd)) {
- dev_err(&device->dev, "Couldn't create ib_mad PD\n");
- ret = PTR_ERR(port_priv->pd);
goto error4;
}
@@ -3278,11 +3278,11 @@ error8:
error7:
destroy_mad_qp(&port_priv->qp_info[0]);
error6:
- ib_dealloc_pd(port_priv->pd);
-error4:
ib_free_cq(port_priv->cq);
cleanup_recv_queue(&port_priv->qp_info[1]);
cleanup_recv_queue(&port_priv->qp_info[0]);
+error4:
+ ib_dealloc_pd(port_priv->pd);
error3:
kfree(port_priv);
@@ -3312,8 +3312,8 @@ static int ib_mad_port_close(struct ib_device *device, int port_num)
destroy_workqueue(port_priv->wq);
destroy_mad_qp(&port_priv->qp_info[1]);
destroy_mad_qp(&port_priv->qp_info[0]);
- ib_dealloc_pd(port_priv->pd);
ib_free_cq(port_priv->cq);
+ ib_dealloc_pd(port_priv->pd);
cleanup_recv_queue(&port_priv->qp_info[1]);
cleanup_recv_queue(&port_priv->qp_info[0]);
/* XXX: Handle deallocation of MAD registration tables */
diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c
index e287b71a1cfd..cc08218f1ef7 100644
--- a/drivers/infiniband/core/nldev.c
+++ b/drivers/infiniband/core/nldev.c
@@ -1952,12 +1952,16 @@ static int stat_get_doit_qp(struct sk_buff *skb, struct nlmsghdr *nlh,
if (fill_nldev_handle(msg, device) ||
nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, port) ||
- nla_put_u32(msg, RDMA_NLDEV_ATTR_STAT_MODE, mode))
+ nla_put_u32(msg, RDMA_NLDEV_ATTR_STAT_MODE, mode)) {
+ ret = -EMSGSIZE;
goto err_msg;
+ }
if ((mode == RDMA_COUNTER_MODE_AUTO) &&
- nla_put_u32(msg, RDMA_NLDEV_ATTR_STAT_AUTO_MODE_MASK, mask))
+ nla_put_u32(msg, RDMA_NLDEV_ATTR_STAT_AUTO_MODE_MASK, mask)) {
+ ret = -EMSGSIZE;
goto err_msg;
+ }
nlmsg_end(msg, nlh);
ib_device_put(device);
diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c
index 08da840ed7ee..37eb8643ec29 100644
--- a/drivers/infiniband/core/umem.c
+++ b/drivers/infiniband/core/umem.c
@@ -184,9 +184,6 @@ EXPORT_SYMBOL(ib_umem_find_best_pgsz);
/**
* ib_umem_get - Pin and DMA map userspace memory.
*
- * If access flags indicate ODP memory, avoid pinning. Instead, stores
- * the mm for future page fault handling in conjunction with MMU notifiers.
- *
* @udata: userspace context to pin memory for
* @addr: userspace virtual address to start at
* @size: length of region to pin
@@ -231,17 +228,12 @@ struct ib_umem *ib_umem_get(struct ib_udata *udata, unsigned long addr,
if (!can_do_mlock())
return ERR_PTR(-EPERM);
- if (access & IB_ACCESS_ON_DEMAND) {
- umem = kzalloc(sizeof(struct ib_umem_odp), GFP_KERNEL);
- if (!umem)
- return ERR_PTR(-ENOMEM);
- umem->is_odp = 1;
- } else {
- umem = kzalloc(sizeof(*umem), GFP_KERNEL);
- if (!umem)
- return ERR_PTR(-ENOMEM);
- }
+ if (access & IB_ACCESS_ON_DEMAND)
+ return ERR_PTR(-EOPNOTSUPP);
+ umem = kzalloc(sizeof(*umem), GFP_KERNEL);
+ if (!umem)
+ return ERR_PTR(-ENOMEM);
umem->context = context;
umem->length = size;
umem->address = addr;
@@ -249,18 +241,6 @@ struct ib_umem *ib_umem_get(struct ib_udata *udata, unsigned long addr,
umem->owning_mm = mm = current->mm;
mmgrab(mm);
- if (access & IB_ACCESS_ON_DEMAND) {
- if (WARN_ON_ONCE(!context->invalidate_range)) {
- ret = -EINVAL;
- goto umem_kfree;
- }
-
- ret = ib_umem_odp_get(to_ib_umem_odp(umem), access);
- if (ret)
- goto umem_kfree;
- return umem;
- }
-
page_list = (struct page **) __get_free_page(GFP_KERNEL);
if (!page_list) {
ret = -ENOMEM;
@@ -346,15 +326,6 @@ umem_kfree:
}
EXPORT_SYMBOL(ib_umem_get);
-static void __ib_umem_release_tail(struct ib_umem *umem)
-{
- mmdrop(umem->owning_mm);
- if (umem->is_odp)
- kfree(to_ib_umem_odp(umem));
- else
- kfree(umem);
-}
-
/**
* ib_umem_release - release memory pinned with ib_umem_get
* @umem: umem struct to release
@@ -363,30 +334,22 @@ void ib_umem_release(struct ib_umem *umem)
{
if (!umem)
return;
-
- if (umem->is_odp) {
- ib_umem_odp_release(to_ib_umem_odp(umem));
- __ib_umem_release_tail(umem);
- return;
- }
+ if (umem->is_odp)
+ return ib_umem_odp_release(to_ib_umem_odp(umem));
__ib_umem_release(umem->context->device, umem, 1);
atomic64_sub(ib_umem_num_pages(umem), &umem->owning_mm->pinned_vm);
- __ib_umem_release_tail(umem);
+ mmdrop(umem->owning_mm);
+ kfree(umem);
}
EXPORT_SYMBOL(ib_umem_release);
int ib_umem_page_count(struct ib_umem *umem)
{
- int i;
- int n;
+ int i, n = 0;
struct scatterlist *sg;
- if (umem->is_odp)
- return ib_umem_num_pages(umem);
-
- n = 0;
for_each_sg(umem->sg_head.sgl, sg, umem->nmap, i)
n += sg_dma_len(sg) >> PAGE_SHIFT;
diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c
index 2a75c6f8d827..32fb0b579dec 100644
--- a/drivers/infiniband/core/umem_odp.c
+++ b/drivers/infiniband/core/umem_odp.c
@@ -39,44 +39,14 @@
#include <linux/export.h>
#include <linux/vmalloc.h>
#include <linux/hugetlb.h>
-#include <linux/interval_tree_generic.h>
+#include <linux/interval_tree.h>
#include <linux/pagemap.h>
#include <rdma/ib_verbs.h>
#include <rdma/ib_umem.h>
#include <rdma/ib_umem_odp.h>
-/*
- * The ib_umem list keeps track of memory regions for which the HW
- * device request to receive notification when the related memory
- * mapping is changed.
- *
- * ib_umem_lock protects the list.
- */
-
-static u64 node_start(struct umem_odp_node *n)
-{
- struct ib_umem_odp *umem_odp =
- container_of(n, struct ib_umem_odp, interval_tree);
-
- return ib_umem_start(umem_odp);
-}
-
-/* Note that the representation of the intervals in the interval tree
- * considers the ending point as contained in the interval, while the
- * function ib_umem_end returns the first address which is not contained
- * in the umem.
- */
-static u64 node_last(struct umem_odp_node *n)
-{
- struct ib_umem_odp *umem_odp =
- container_of(n, struct ib_umem_odp, interval_tree);
-
- return ib_umem_end(umem_odp) - 1;
-}
-
-INTERVAL_TREE_DEFINE(struct umem_odp_node, rb, u64, __subtree_last,
- node_start, node_last, static, rbt_ib_umem)
+#include "uverbs.h"
static void ib_umem_notifier_start_account(struct ib_umem_odp *umem_odp)
{
@@ -104,35 +74,34 @@ static void ib_umem_notifier_end_account(struct ib_umem_odp *umem_odp)
mutex_unlock(&umem_odp->umem_mutex);
}
-static int ib_umem_notifier_release_trampoline(struct ib_umem_odp *umem_odp,
- u64 start, u64 end, void *cookie)
-{
- /*
- * Increase the number of notifiers running, to
- * prevent any further fault handling on this MR.
- */
- ib_umem_notifier_start_account(umem_odp);
- umem_odp->dying = 1;
- /* Make sure that the fact the umem is dying is out before we release
- * all pending page faults. */
- smp_wmb();
- complete_all(&umem_odp->notifier_completion);
- umem_odp->umem.context->invalidate_range(
- umem_odp, ib_umem_start(umem_odp), ib_umem_end(umem_odp));
- return 0;
-}
-
static void ib_umem_notifier_release(struct mmu_notifier *mn,
struct mm_struct *mm)
{
struct ib_ucontext_per_mm *per_mm =
container_of(mn, struct ib_ucontext_per_mm, mn);
+ struct rb_node *node;
down_read(&per_mm->umem_rwsem);
- if (per_mm->active)
- rbt_ib_umem_for_each_in_range(
- &per_mm->umem_tree, 0, ULLONG_MAX,
- ib_umem_notifier_release_trampoline, true, NULL);
+ if (!per_mm->active)
+ goto out;
+
+ for (node = rb_first_cached(&per_mm->umem_tree); node;
+ node = rb_next(node)) {
+ struct ib_umem_odp *umem_odp =
+ rb_entry(node, struct ib_umem_odp, interval_tree.rb);
+
+ /*
+ * Increase the number of notifiers running, to prevent any
+ * further fault handling on this MR.
+ */
+ ib_umem_notifier_start_account(umem_odp);
+ complete_all(&umem_odp->notifier_completion);
+ umem_odp->umem.context->device->ops.invalidate_range(
+ umem_odp, ib_umem_start(umem_odp),
+ ib_umem_end(umem_odp));
+ }
+
+out:
up_read(&per_mm->umem_rwsem);
}
@@ -140,7 +109,7 @@ static int invalidate_range_start_trampoline(struct ib_umem_odp *item,
u64 start, u64 end, void *cookie)
{
ib_umem_notifier_start_account(item);
- item->umem.context->invalidate_range(item, start, end);
+ item->umem.context->device->ops.invalidate_range(item, start, end);
return 0;
}
@@ -204,27 +173,13 @@ static const struct mmu_notifier_ops ib_umem_notifiers = {
.invalidate_range_end = ib_umem_notifier_invalidate_range_end,
};
-static void add_umem_to_per_mm(struct ib_umem_odp *umem_odp)
-{
- struct ib_ucontext_per_mm *per_mm = umem_odp->per_mm;
-
- down_write(&per_mm->umem_rwsem);
- if (likely(ib_umem_start(umem_odp) != ib_umem_end(umem_odp)))
- rbt_ib_umem_insert(&umem_odp->interval_tree,
- &per_mm->umem_tree);
- up_write(&per_mm->umem_rwsem);
-}
-
static void remove_umem_from_per_mm(struct ib_umem_odp *umem_odp)
{
struct ib_ucontext_per_mm *per_mm = umem_odp->per_mm;
down_write(&per_mm->umem_rwsem);
- if (likely(ib_umem_start(umem_odp) != ib_umem_end(umem_odp)))
- rbt_ib_umem_remove(&umem_odp->interval_tree,
- &per_mm->umem_tree);
+ interval_tree_remove(&umem_odp->interval_tree, &per_mm->umem_tree);
complete_all(&umem_odp->notifier_completion);
-
up_write(&per_mm->umem_rwsem);
}
@@ -267,33 +222,23 @@ out_pid:
return ERR_PTR(ret);
}
-static int get_per_mm(struct ib_umem_odp *umem_odp)
+static struct ib_ucontext_per_mm *get_per_mm(struct ib_umem_odp *umem_odp)
{
struct ib_ucontext *ctx = umem_odp->umem.context;
struct ib_ucontext_per_mm *per_mm;
+ lockdep_assert_held(&ctx->per_mm_list_lock);
+
/*
* Generally speaking we expect only one or two per_mm in this list,
* so no reason to optimize this search today.
*/
- mutex_lock(&ctx->per_mm_list_lock);
list_for_each_entry(per_mm, &ctx->per_mm_list, ucontext_list) {
if (per_mm->mm == umem_odp->umem.owning_mm)
- goto found;
+ return per_mm;
}
- per_mm = alloc_per_mm(ctx, umem_odp->umem.owning_mm);
- if (IS_ERR(per_mm)) {
- mutex_unlock(&ctx->per_mm_list_lock);
- return PTR_ERR(per_mm);
- }
-
-found:
- umem_odp->per_mm = per_mm;
- per_mm->odp_mrs_count++;
- mutex_unlock(&ctx->per_mm_list_lock);
-
- return 0;
+ return alloc_per_mm(ctx, umem_odp->umem.owning_mm);
}
static void free_per_mm(struct rcu_head *rcu)
@@ -334,76 +279,218 @@ static void put_per_mm(struct ib_umem_odp *umem_odp)
mmu_notifier_call_srcu(&per_mm->rcu, free_per_mm);
}
-struct ib_umem_odp *ib_alloc_odp_umem(struct ib_umem_odp *root,
- unsigned long addr, size_t size)
+static inline int ib_init_umem_odp(struct ib_umem_odp *umem_odp,
+ struct ib_ucontext_per_mm *per_mm)
+{
+ struct ib_ucontext *ctx = umem_odp->umem.context;
+ int ret;
+
+ umem_odp->umem.is_odp = 1;
+ if (!umem_odp->is_implicit_odp) {
+ size_t page_size = 1UL << umem_odp->page_shift;
+ size_t pages;
+
+ umem_odp->interval_tree.start =
+ ALIGN_DOWN(umem_odp->umem.address, page_size);
+ if (check_add_overflow(umem_odp->umem.address,
+ umem_odp->umem.length,
+ &umem_odp->interval_tree.last))
+ return -EOVERFLOW;
+ umem_odp->interval_tree.last =
+ ALIGN(umem_odp->interval_tree.last, page_size);
+ if (unlikely(umem_odp->interval_tree.last < page_size))
+ return -EOVERFLOW;
+
+ pages = (umem_odp->interval_tree.last -
+ umem_odp->interval_tree.start) >>
+ umem_odp->page_shift;
+ if (!pages)
+ return -EINVAL;
+
+ /*
+ * Note that the representation of the intervals in the
+ * interval tree considers the ending point as contained in
+ * the interval.
+ */
+ umem_odp->interval_tree.last--;
+
+ umem_odp->page_list = kvcalloc(
+ pages, sizeof(*umem_odp->page_list), GFP_KERNEL);
+ if (!umem_odp->page_list)
+ return -ENOMEM;
+
+ umem_odp->dma_list = kvcalloc(
+ pages, sizeof(*umem_odp->dma_list), GFP_KERNEL);
+ if (!umem_odp->dma_list) {
+ ret = -ENOMEM;
+ goto out_page_list;
+ }
+ }
+
+ mutex_lock(&ctx->per_mm_list_lock);
+ if (!per_mm) {
+ per_mm = get_per_mm(umem_odp);
+ if (IS_ERR(per_mm)) {
+ ret = PTR_ERR(per_mm);
+ goto out_unlock;
+ }
+ }
+ umem_odp->per_mm = per_mm;
+ per_mm->odp_mrs_count++;
+ mutex_unlock(&ctx->per_mm_list_lock);
+
+ mutex_init(&umem_odp->umem_mutex);
+ init_completion(&umem_odp->notifier_completion);
+
+ if (!umem_odp->is_implicit_odp) {
+ down_write(&per_mm->umem_rwsem);
+ interval_tree_insert(&umem_odp->interval_tree,
+ &per_mm->umem_tree);
+ up_write(&per_mm->umem_rwsem);
+ }
+ mmgrab(umem_odp->umem.owning_mm);
+
+ return 0;
+
+out_unlock:
+ mutex_unlock(&ctx->per_mm_list_lock);
+ kvfree(umem_odp->dma_list);
+out_page_list:
+ kvfree(umem_odp->page_list);
+ return ret;
+}
+
+/**
+ * ib_umem_odp_alloc_implicit - Allocate a parent implicit ODP umem
+ *
+ * Implicit ODP umems do not have a VA range and do not have any page lists.
+ * They exist only to hold the per_mm reference to help the driver create
+ * children umems.
+ *
+ * @udata: udata from the syscall being used to create the umem
+ * @access: ib_reg_mr access flags
+ */
+struct ib_umem_odp *ib_umem_odp_alloc_implicit(struct ib_udata *udata,
+ int access)
+{
+ struct ib_ucontext *context =
+ container_of(udata, struct uverbs_attr_bundle, driver_udata)
+ ->context;
+ struct ib_umem *umem;
+ struct ib_umem_odp *umem_odp;
+ int ret;
+
+ if (access & IB_ACCESS_HUGETLB)
+ return ERR_PTR(-EINVAL);
+
+ if (!context)
+ return ERR_PTR(-EIO);
+ if (WARN_ON_ONCE(!context->device->ops.invalidate_range))
+ return ERR_PTR(-EINVAL);
+
+ umem_odp = kzalloc(sizeof(*umem_odp), GFP_KERNEL);
+ if (!umem_odp)
+ return ERR_PTR(-ENOMEM);
+ umem = &umem_odp->umem;
+ umem->context = context;
+ umem->writable = ib_access_writable(access);
+ umem->owning_mm = current->mm;
+ umem_odp->is_implicit_odp = 1;
+ umem_odp->page_shift = PAGE_SHIFT;
+
+ ret = ib_init_umem_odp(umem_odp, NULL);
+ if (ret) {
+ kfree(umem_odp);
+ return ERR_PTR(ret);
+ }
+ return umem_odp;
+}
+EXPORT_SYMBOL(ib_umem_odp_alloc_implicit);
+
+/**
+ * ib_umem_odp_alloc_child - Allocate a child ODP umem under an implicit
+ * parent ODP umem
+ *
+ * @root: The parent umem enclosing the child. This must be allocated using
+ * ib_alloc_implicit_odp_umem()
+ * @addr: The starting userspace VA
+ * @size: The length of the userspace VA
+ */
+struct ib_umem_odp *ib_umem_odp_alloc_child(struct ib_umem_odp *root,
+ unsigned long addr, size_t size)
{
- struct ib_ucontext_per_mm *per_mm = root->per_mm;
- struct ib_ucontext *ctx = per_mm->context;
+ /*
+ * Caller must ensure that root cannot be freed during the call to
+ * ib_alloc_odp_umem.
+ */
struct ib_umem_odp *odp_data;
struct ib_umem *umem;
- int pages = size >> PAGE_SHIFT;
int ret;
+ if (WARN_ON(!root->is_implicit_odp))
+ return ERR_PTR(-EINVAL);
+
odp_data = kzalloc(sizeof(*odp_data), GFP_KERNEL);
if (!odp_data)
return ERR_PTR(-ENOMEM);
umem = &odp_data->umem;
- umem->context = ctx;
+ umem->context = root->umem.context;
umem->length = size;
umem->address = addr;
- odp_data->page_shift = PAGE_SHIFT;
umem->writable = root->umem.writable;
- umem->is_odp = 1;
- odp_data->per_mm = per_mm;
- umem->owning_mm = per_mm->mm;
- mmgrab(umem->owning_mm);
-
- mutex_init(&odp_data->umem_mutex);
- init_completion(&odp_data->notifier_completion);
-
- odp_data->page_list =
- vzalloc(array_size(pages, sizeof(*odp_data->page_list)));
- if (!odp_data->page_list) {
- ret = -ENOMEM;
- goto out_odp_data;
- }
+ umem->owning_mm = root->umem.owning_mm;
+ odp_data->page_shift = PAGE_SHIFT;
- odp_data->dma_list =
- vzalloc(array_size(pages, sizeof(*odp_data->dma_list)));
- if (!odp_data->dma_list) {
- ret = -ENOMEM;
- goto out_page_list;
+ ret = ib_init_umem_odp(odp_data, root->per_mm);
+ if (ret) {
+ kfree(odp_data);
+ return ERR_PTR(ret);
}
-
- /*
- * Caller must ensure that the umem_odp that the per_mm came from
- * cannot be freed during the call to ib_alloc_odp_umem.
- */
- mutex_lock(&ctx->per_mm_list_lock);
- per_mm->odp_mrs_count++;
- mutex_unlock(&ctx->per_mm_list_lock);
- add_umem_to_per_mm(odp_data);
-
return odp_data;
-
-out_page_list:
- vfree(odp_data->page_list);
-out_odp_data:
- mmdrop(umem->owning_mm);
- kfree(odp_data);
- return ERR_PTR(ret);
}
-EXPORT_SYMBOL(ib_alloc_odp_umem);
+EXPORT_SYMBOL(ib_umem_odp_alloc_child);
-int ib_umem_odp_get(struct ib_umem_odp *umem_odp, int access)
+/**
+ * ib_umem_odp_get - Create a umem_odp for a userspace va
+ *
+ * @udata: userspace context to pin memory for
+ * @addr: userspace virtual address to start at
+ * @size: length of region to pin
+ * @access: IB_ACCESS_xxx flags for memory being pinned
+ *
+ * The driver should use when the access flags indicate ODP memory. It avoids
+ * pinning, instead, stores the mm for future page fault handling in
+ * conjunction with MMU notifiers.
+ */
+struct ib_umem_odp *ib_umem_odp_get(struct ib_udata *udata, unsigned long addr,
+ size_t size, int access)
{
- struct ib_umem *umem = &umem_odp->umem;
- /*
- * NOTE: This must called in a process context where umem->owning_mm
- * == current->mm
- */
- struct mm_struct *mm = umem->owning_mm;
- int ret_val;
+ struct ib_umem_odp *umem_odp;
+ struct ib_ucontext *context;
+ struct mm_struct *mm;
+ int ret;
+
+ if (!udata)
+ return ERR_PTR(-EIO);
+
+ context = container_of(udata, struct uverbs_attr_bundle, driver_udata)
+ ->context;
+ if (!context)
+ return ERR_PTR(-EIO);
+
+ if (WARN_ON_ONCE(!(access & IB_ACCESS_ON_DEMAND)) ||
+ WARN_ON_ONCE(!context->device->ops.invalidate_range))
+ return ERR_PTR(-EINVAL);
+
+ umem_odp = kzalloc(sizeof(struct ib_umem_odp), GFP_KERNEL);
+ if (!umem_odp)
+ return ERR_PTR(-ENOMEM);
+
+ umem_odp->umem.context = context;
+ umem_odp->umem.length = size;
+ umem_odp->umem.address = addr;
+ umem_odp->umem.writable = ib_access_writable(access);
+ umem_odp->umem.owning_mm = mm = current->mm;
umem_odp->page_shift = PAGE_SHIFT;
if (access & IB_ACCESS_HUGETLB) {
@@ -414,46 +501,24 @@ int ib_umem_odp_get(struct ib_umem_odp *umem_odp, int access)
vma = find_vma(mm, ib_umem_start(umem_odp));
if (!vma || !is_vm_hugetlb_page(vma)) {
up_read(&mm->mmap_sem);
- return -EINVAL;
+ ret = -EINVAL;
+ goto err_free;
}
h = hstate_vma(vma);
umem_odp->page_shift = huge_page_shift(h);
up_read(&mm->mmap_sem);
}
- mutex_init(&umem_odp->umem_mutex);
-
- init_completion(&umem_odp->notifier_completion);
+ ret = ib_init_umem_odp(umem_odp, NULL);
+ if (ret)
+ goto err_free;
+ return umem_odp;
- if (ib_umem_odp_num_pages(umem_odp)) {
- umem_odp->page_list =
- vzalloc(array_size(sizeof(*umem_odp->page_list),
- ib_umem_odp_num_pages(umem_odp)));
- if (!umem_odp->page_list)
- return -ENOMEM;
-
- umem_odp->dma_list =
- vzalloc(array_size(sizeof(*umem_odp->dma_list),
- ib_umem_odp_num_pages(umem_odp)));
- if (!umem_odp->dma_list) {
- ret_val = -ENOMEM;
- goto out_page_list;
- }
- }
-
- ret_val = get_per_mm(umem_odp);
- if (ret_val)
- goto out_dma_list;
- add_umem_to_per_mm(umem_odp);
-
- return 0;
-
-out_dma_list:
- vfree(umem_odp->dma_list);
-out_page_list:
- vfree(umem_odp->page_list);
- return ret_val;
+err_free:
+ kfree(umem_odp);
+ return ERR_PTR(ret);
}
+EXPORT_SYMBOL(ib_umem_odp_get);
void ib_umem_odp_release(struct ib_umem_odp *umem_odp)
{
@@ -463,14 +528,18 @@ void ib_umem_odp_release(struct ib_umem_odp *umem_odp)
* It is the driver's responsibility to ensure, before calling us,
* that the hardware will not attempt to access the MR any more.
*/
- ib_umem_odp_unmap_dma_pages(umem_odp, ib_umem_start(umem_odp),
- ib_umem_end(umem_odp));
-
- remove_umem_from_per_mm(umem_odp);
+ if (!umem_odp->is_implicit_odp) {
+ ib_umem_odp_unmap_dma_pages(umem_odp, ib_umem_start(umem_odp),
+ ib_umem_end(umem_odp));
+ remove_umem_from_per_mm(umem_odp);
+ kvfree(umem_odp->dma_list);
+ kvfree(umem_odp->page_list);
+ }
put_per_mm(umem_odp);
- vfree(umem_odp->dma_list);
- vfree(umem_odp->page_list);
+ mmdrop(umem_odp->umem.owning_mm);
+ kfree(umem_odp);
}
+EXPORT_SYMBOL(ib_umem_odp_release);
/*
* Map for DMA and insert a single page into the on-demand paging page tables.
@@ -538,7 +607,7 @@ out:
if (remove_existing_mapping) {
ib_umem_notifier_start_account(umem_odp);
- context->invalidate_range(
+ dev->ops.invalidate_range(
umem_odp,
ib_umem_start(umem_odp) +
(page_index << umem_odp->page_shift),
@@ -765,35 +834,21 @@ int rbt_ib_umem_for_each_in_range(struct rb_root_cached *root,
void *cookie)
{
int ret_val = 0;
- struct umem_odp_node *node, *next;
+ struct interval_tree_node *node, *next;
struct ib_umem_odp *umem;
if (unlikely(start == last))
return ret_val;
- for (node = rbt_ib_umem_iter_first(root, start, last - 1);
+ for (node = interval_tree_iter_first(root, start, last - 1);
node; node = next) {
/* TODO move the blockable decision up to the callback */
if (!blockable)
return -EAGAIN;
- next = rbt_ib_umem_iter_next(node, start, last - 1);
+ next = interval_tree_iter_next(node, start, last - 1);
umem = container_of(node, struct ib_umem_odp, interval_tree);
ret_val = cb(umem, start, last, cookie) || ret_val;
}
return ret_val;
}
-EXPORT_SYMBOL(rbt_ib_umem_for_each_in_range);
-
-struct ib_umem_odp *rbt_ib_umem_lookup(struct rb_root_cached *root,
- u64 addr, u64 length)
-{
- struct umem_odp_node *node;
-
- node = rbt_ib_umem_iter_first(root, addr, addr + length - 1);
- if (node)
- return container_of(node, struct ib_umem_odp, interval_tree);
- return NULL;
-
-}
-EXPORT_SYMBOL(rbt_ib_umem_lookup);
diff --git a/drivers/infiniband/core/user_mad.c b/drivers/infiniband/core/user_mad.c
index e0512aef033c..d1407fa378e8 100644
--- a/drivers/infiniband/core/user_mad.c
+++ b/drivers/infiniband/core/user_mad.c
@@ -49,6 +49,7 @@
#include <linux/sched.h>
#include <linux/semaphore.h>
#include <linux/slab.h>
+#include <linux/nospec.h>
#include <linux/uaccess.h>
@@ -884,11 +885,14 @@ static int ib_umad_unreg_agent(struct ib_umad_file *file, u32 __user *arg)
if (get_user(id, arg))
return -EFAULT;
+ if (id >= IB_UMAD_MAX_AGENTS)
+ return -EINVAL;
mutex_lock(&file->port->file_mutex);
mutex_lock(&file->mutex);
- if (id >= IB_UMAD_MAX_AGENTS || !__get_agent(file, id)) {
+ id = array_index_nospec(id, IB_UMAD_MAX_AGENTS);
+ if (!__get_agent(file, id)) {
ret = -EINVAL;
goto out;
}
diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c
index 7ddd0e5bc6b3..8f4fd4fac159 100644
--- a/drivers/infiniband/core/uverbs_cmd.c
+++ b/drivers/infiniband/core/uverbs_cmd.c
@@ -275,8 +275,6 @@ static int ib_uverbs_get_context(struct uverbs_attr_bundle *attrs)
ret = ib_dev->ops.alloc_ucontext(ucontext, &attrs->driver_udata);
if (ret)
goto err_file;
- if (!(ib_dev->attrs.device_cap_flags & IB_DEVICE_ON_DEMAND_PAGING))
- ucontext->invalidate_range = NULL;
rdma_restrack_uadd(&ucontext->res);
diff --git a/drivers/infiniband/hw/hfi1/verbs.c b/drivers/infiniband/hw/hfi1/verbs.c
index f4ca436118ab..9f53f63b1453 100644
--- a/drivers/infiniband/hw/hfi1/verbs.c
+++ b/drivers/infiniband/hw/hfi1/verbs.c
@@ -54,6 +54,7 @@
#include <linux/mm.h>
#include <linux/vmalloc.h>
#include <rdma/opa_addr.h>
+#include <linux/nospec.h>
#include "hfi.h"
#include "common.h"
@@ -1537,6 +1538,7 @@ static int hfi1_check_ah(struct ib_device *ibdev, struct rdma_ah_attr *ah_attr)
sl = rdma_ah_get_sl(ah_attr);
if (sl >= ARRAY_SIZE(ibp->sl_to_sc))
return -EINVAL;
+ sl = array_index_nospec(sl, ARRAY_SIZE(ibp->sl_to_sc));
sc5 = ibp->sl_to_sc[sl];
if (sc_to_vlt(dd, sc5) > num_vls && sc_to_vlt(dd, sc5) != 0xf)
diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v1.c b/drivers/infiniband/hw/hns/hns_roce_hw_v1.c
index 0ff5f9617639..4c3ac2b75966 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v1.c
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v1.c
@@ -750,8 +750,10 @@ static int hns_roce_v1_rsv_lp_qp(struct hns_roce_dev *hr_dev)
atomic_set(&free_mr->mr_free_cq->ib_cq.usecnt, 0);
pd = rdma_zalloc_drv_obj(ibdev, ib_pd);
- if (!pd)
+ if (!pd) {
+ ret = -ENOMEM;
goto alloc_mem_failed;
+ }
pd->device = ibdev;
ret = hns_roce_alloc_pd(pd, NULL);
diff --git a/drivers/infiniband/hw/mlx5/devx.c b/drivers/infiniband/hw/mlx5/devx.c
index 04b4e937c198..59022b744144 100644
--- a/drivers/infiniband/hw/mlx5/devx.c
+++ b/drivers/infiniband/hw/mlx5/devx.c
@@ -2049,7 +2049,7 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_SUBSCRIBE_EVENT)(
event_sub->eventfd =
eventfd_ctx_fdget(redirect_fd);
- if (IS_ERR(event_sub)) {
+ if (IS_ERR(event_sub->eventfd)) {
err = PTR_ERR(event_sub->eventfd);
event_sub->eventfd = NULL;
goto err;
@@ -2671,12 +2671,13 @@ static int devx_async_event_close(struct inode *inode, struct file *filp)
struct devx_async_event_file *ev_file = filp->private_data;
struct devx_event_subscription *event_sub, *event_sub_tmp;
struct devx_async_event_data *entry, *tmp;
+ struct mlx5_ib_dev *dev = ev_file->dev;
- mutex_lock(&ev_file->dev->devx_event_table.event_xa_lock);
+ mutex_lock(&dev->devx_event_table.event_xa_lock);
/* delete the subscriptions which are related to this FD */
list_for_each_entry_safe(event_sub, event_sub_tmp,
&ev_file->subscribed_events_list, file_list) {
- devx_cleanup_subscription(ev_file->dev, event_sub);
+ devx_cleanup_subscription(dev, event_sub);
if (event_sub->eventfd)
eventfd_ctx_put(event_sub->eventfd);
@@ -2685,7 +2686,7 @@ static int devx_async_event_close(struct inode *inode, struct file *filp)
kfree_rcu(event_sub, rcu);
}
- mutex_unlock(&ev_file->dev->devx_event_table.event_xa_lock);
+ mutex_unlock(&dev->devx_event_table.event_xa_lock);
/* free the pending events allocation */
if (!ev_file->omit_data) {
@@ -2697,7 +2698,7 @@ static int devx_async_event_close(struct inode *inode, struct file *filp)
}
uverbs_close_fd(filp);
- put_device(&ev_file->dev->ib_dev.dev);
+ put_device(&dev->ib_dev.dev);
return 0;
}
diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index 98e566acb746..1ec0e667110e 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -1867,10 +1867,6 @@ static int mlx5_ib_alloc_ucontext(struct ib_ucontext *uctx,
if (err)
goto out_sys_pages;
- if (ibdev->attrs.device_cap_flags & IB_DEVICE_ON_DEMAND_PAGING)
- context->ibucontext.invalidate_range =
- &mlx5_ib_invalidate_range;
-
if (req.flags & MLX5_IB_ALLOC_UCTX_DEVX) {
err = mlx5_ib_devx_create(dev, true);
if (err < 0)
@@ -5838,13 +5834,12 @@ static void mlx5_ib_unbind_slave_port(struct mlx5_ib_dev *ibdev,
return;
}
- if (mpi->mdev_events.notifier_call)
- mlx5_notifier_unregister(mpi->mdev, &mpi->mdev_events);
- mpi->mdev_events.notifier_call = NULL;
-
mpi->ibdev = NULL;
spin_unlock(&port->mp.mpi_lock);
+ if (mpi->mdev_events.notifier_call)
+ mlx5_notifier_unregister(mpi->mdev, &mpi->mdev_events);
+ mpi->mdev_events.notifier_call = NULL;
mlx5_remove_netdev_notifier(ibdev, port_num);
spin_lock(&port->mp.mpi_lock);
diff --git a/drivers/infiniband/hw/mlx5/mem.c b/drivers/infiniband/hw/mlx5/mem.c
index fe1a76d8531c..b5aece786b36 100644
--- a/drivers/infiniband/hw/mlx5/mem.c
+++ b/drivers/infiniband/hw/mlx5/mem.c
@@ -56,18 +56,6 @@ void mlx5_ib_cont_pages(struct ib_umem *umem, u64 addr,
struct scatterlist *sg;
int entry;
- if (umem->is_odp) {
- unsigned int page_shift = to_ib_umem_odp(umem)->page_shift;
-
- *ncont = ib_umem_page_count(umem);
- *count = *ncont << (page_shift - PAGE_SHIFT);
- *shift = page_shift;
- if (order)
- *order = ilog2(roundup_pow_of_two(*ncont));
-
- return;
- }
-
addr = addr >> PAGE_SHIFT;
tmp = (unsigned long)addr;
m = find_first_bit(&tmp, BITS_PER_LONG);
diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
index 2c77456f359f..b7da619614e4 100644
--- a/drivers/infiniband/hw/mlx5/mr.c
+++ b/drivers/infiniband/hw/mlx5/mr.c
@@ -51,22 +51,12 @@ static void clean_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr);
static void dereg_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr);
static int mr_cache_max_order(struct mlx5_ib_dev *dev);
static int unreg_umr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr);
-static bool umr_can_modify_entity_size(struct mlx5_ib_dev *dev)
-{
- return !MLX5_CAP_GEN(dev->mdev, umr_modify_entity_size_disabled);
-}
static bool umr_can_use_indirect_mkey(struct mlx5_ib_dev *dev)
{
return !MLX5_CAP_GEN(dev->mdev, umr_indirect_mkey_disabled);
}
-static bool use_umr(struct mlx5_ib_dev *dev, int order)
-{
- return order <= mr_cache_max_order(dev) &&
- umr_can_modify_entity_size(dev);
-}
-
static int destroy_mkey(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
{
int err = mlx5_core_destroy_mkey(dev->mdev, &mr->mmkey);
@@ -794,19 +784,37 @@ static int mr_umem_get(struct mlx5_ib_dev *dev, struct ib_udata *udata,
int *ncont, int *order)
{
struct ib_umem *u;
- int err;
*umem = NULL;
- u = ib_umem_get(udata, start, length, access_flags, 0);
- err = PTR_ERR_OR_ZERO(u);
- if (err) {
- mlx5_ib_dbg(dev, "umem get failed (%d)\n", err);
- return err;
+ if (access_flags & IB_ACCESS_ON_DEMAND) {
+ struct ib_umem_odp *odp;
+
+ odp = ib_umem_odp_get(udata, start, length, access_flags);
+ if (IS_ERR(odp)) {
+ mlx5_ib_dbg(dev, "umem get failed (%ld)\n",
+ PTR_ERR(odp));
+ return PTR_ERR(odp);
+ }
+
+ u = &odp->umem;
+
+ *page_shift = odp->page_shift;
+ *ncont = ib_umem_odp_num_pages(odp);
+ *npages = *ncont << (*page_shift - PAGE_SHIFT);
+ if (order)
+ *order = ilog2(roundup_pow_of_two(*ncont));
+ } else {
+ u = ib_umem_get(udata, start, length, access_flags, 0);
+ if (IS_ERR(u)) {
+ mlx5_ib_dbg(dev, "umem get failed (%ld)\n", PTR_ERR(u));
+ return PTR_ERR(u);
+ }
+
+ mlx5_ib_cont_pages(u, start, MLX5_MKEY_PAGE_SHIFT_MASK, npages,
+ page_shift, ncont, order);
}
- mlx5_ib_cont_pages(u, start, MLX5_MKEY_PAGE_SHIFT_MASK, npages,
- page_shift, ncont, order);
if (!*npages) {
mlx5_ib_warn(dev, "avoid zero region\n");
ib_umem_release(u);
@@ -1271,7 +1279,7 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
{
struct mlx5_ib_dev *dev = to_mdev(pd->device);
struct mlx5_ib_mr *mr = NULL;
- bool populate_mtts = false;
+ bool use_umr;
struct ib_umem *umem;
int page_shift;
int npages;
@@ -1303,29 +1311,30 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
if (err < 0)
return ERR_PTR(err);
- if (use_umr(dev, order)) {
+ use_umr = !MLX5_CAP_GEN(dev->mdev, umr_modify_entity_size_disabled) &&
+ (!MLX5_CAP_GEN(dev->mdev, umr_modify_atomic_disabled) ||
+ !MLX5_CAP_GEN(dev->mdev, atomic));
+
+ if (order <= mr_cache_max_order(dev) && use_umr) {
mr = alloc_mr_from_cache(pd, umem, virt_addr, length, ncont,
page_shift, order, access_flags);
if (PTR_ERR(mr) == -EAGAIN) {
mlx5_ib_dbg(dev, "cache empty for order %d\n", order);
mr = NULL;
}
- populate_mtts = false;
} else if (!MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset)) {
if (access_flags & IB_ACCESS_ON_DEMAND) {
err = -EINVAL;
pr_err("Got MR registration for ODP MR > 512MB, not supported for Connect-IB\n");
goto error;
}
- populate_mtts = true;
+ use_umr = false;
}
if (!mr) {
- if (!umr_can_modify_entity_size(dev))
- populate_mtts = true;
mutex_lock(&dev->slow_path_mutex);
mr = reg_create(NULL, pd, virt_addr, length, umem, ncont,
- page_shift, access_flags, populate_mtts);
+ page_shift, access_flags, !use_umr);
mutex_unlock(&dev->slow_path_mutex);
}
@@ -1341,7 +1350,7 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
update_odp_mr(mr);
- if (!populate_mtts) {
+ if (use_umr) {
int update_xlt_flags = MLX5_IB_UPD_XLT_ENABLE;
if (access_flags & IB_ACCESS_ON_DEMAND)
@@ -1609,7 +1618,7 @@ static void dereg_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
/* Wait for all running page-fault handlers to finish. */
synchronize_srcu(&dev->mr_srcu);
/* Destroy all page mappings */
- if (umem_odp->page_list)
+ if (!umem_odp->is_implicit_odp)
mlx5_ib_invalidate_range(umem_odp,
ib_umem_start(umem_odp),
ib_umem_end(umem_odp));
@@ -1620,7 +1629,7 @@ static void dereg_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
* so that there will not be any invalidations in
* flight, looking at the *mr struct.
*/
- ib_umem_release(umem);
+ ib_umem_odp_release(umem_odp);
atomic_sub(npages, &dev->mdev->priv.reg_pages);
/* Avoid double-freeing the umem. */
diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c
index b0c5de39d186..817c924e7289 100644
--- a/drivers/infiniband/hw/mlx5/odp.c
+++ b/drivers/infiniband/hw/mlx5/odp.c
@@ -184,7 +184,7 @@ void mlx5_odp_populate_klm(struct mlx5_klm *pklm, size_t offset,
for (i = 0; i < nentries; i++, pklm++) {
pklm->bcount = cpu_to_be32(MLX5_IMR_MTT_SIZE);
va = (offset + i) * MLX5_IMR_MTT_SIZE;
- if (odp && odp->umem.address == va) {
+ if (odp && ib_umem_start(odp) == va) {
struct mlx5_ib_mr *mtt = odp->private;
pklm->key = cpu_to_be32(mtt->ibmr.lkey);
@@ -206,7 +206,7 @@ static void mr_leaf_free_action(struct work_struct *work)
mr->parent = NULL;
synchronize_srcu(&mr->dev->mr_srcu);
- ib_umem_release(&odp->umem);
+ ib_umem_odp_release(odp);
if (imr->live)
mlx5_ib_update_xlt(imr, idx, 1, 0,
MLX5_IB_UPD_XLT_INDIRECT |
@@ -384,7 +384,7 @@ static void mlx5_ib_page_fault_resume(struct mlx5_ib_dev *dev,
}
static struct mlx5_ib_mr *implicit_mr_alloc(struct ib_pd *pd,
- struct ib_umem *umem,
+ struct ib_umem_odp *umem_odp,
bool ksm, int access_flags)
{
struct mlx5_ib_dev *dev = to_mdev(pd->device);
@@ -402,7 +402,7 @@ static struct mlx5_ib_mr *implicit_mr_alloc(struct ib_pd *pd,
mr->dev = dev;
mr->access_flags = access_flags;
mr->mmkey.iova = 0;
- mr->umem = umem;
+ mr->umem = &umem_odp->umem;
if (ksm) {
err = mlx5_ib_update_xlt(mr, 0,
@@ -462,18 +462,17 @@ next_mr:
if (nentries)
nentries++;
} else {
- odp = ib_alloc_odp_umem(odp_mr, addr,
- MLX5_IMR_MTT_SIZE);
+ odp = ib_umem_odp_alloc_child(odp_mr, addr, MLX5_IMR_MTT_SIZE);
if (IS_ERR(odp)) {
mutex_unlock(&odp_mr->umem_mutex);
return ERR_CAST(odp);
}
- mtt = implicit_mr_alloc(mr->ibmr.pd, &odp->umem, 0,
+ mtt = implicit_mr_alloc(mr->ibmr.pd, odp, 0,
mr->access_flags);
if (IS_ERR(mtt)) {
mutex_unlock(&odp_mr->umem_mutex);
- ib_umem_release(&odp->umem);
+ ib_umem_odp_release(odp);
return ERR_CAST(mtt);
}
@@ -495,7 +494,7 @@ next_mr:
addr += MLX5_IMR_MTT_SIZE;
if (unlikely(addr < io_virt + bcnt)) {
odp = odp_next(odp);
- if (odp && odp->umem.address != addr)
+ if (odp && ib_umem_start(odp) != addr)
odp = NULL;
goto next_mr;
}
@@ -519,19 +518,19 @@ struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd,
int access_flags)
{
struct mlx5_ib_mr *imr;
- struct ib_umem *umem;
+ struct ib_umem_odp *umem_odp;
- umem = ib_umem_get(udata, 0, 0, access_flags, 0);
- if (IS_ERR(umem))
- return ERR_CAST(umem);
+ umem_odp = ib_umem_odp_alloc_implicit(udata, access_flags);
+ if (IS_ERR(umem_odp))
+ return ERR_CAST(umem_odp);
- imr = implicit_mr_alloc(&pd->ibpd, umem, 1, access_flags);
+ imr = implicit_mr_alloc(&pd->ibpd, umem_odp, 1, access_flags);
if (IS_ERR(imr)) {
- ib_umem_release(umem);
+ ib_umem_odp_release(umem_odp);
return ERR_CAST(imr);
}
- imr->umem = umem;
+ imr->umem = &umem_odp->umem;
init_waitqueue_head(&imr->q_leaf_free);
atomic_set(&imr->num_leaf_free, 0);
atomic_set(&imr->num_pending_prefetch, 0);
@@ -539,34 +538,31 @@ struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd,
return imr;
}
-static int mr_leaf_free(struct ib_umem_odp *umem_odp, u64 start, u64 end,
- void *cookie)
+void mlx5_ib_free_implicit_mr(struct mlx5_ib_mr *imr)
{
- struct mlx5_ib_mr *mr = umem_odp->private, *imr = cookie;
-
- if (mr->parent != imr)
- return 0;
-
- ib_umem_odp_unmap_dma_pages(umem_odp, ib_umem_start(umem_odp),
- ib_umem_end(umem_odp));
+ struct ib_ucontext_per_mm *per_mm = mr_to_per_mm(imr);
+ struct rb_node *node;
- if (umem_odp->dying)
- return 0;
+ down_read(&per_mm->umem_rwsem);
+ for (node = rb_first_cached(&per_mm->umem_tree); node;
+ node = rb_next(node)) {
+ struct ib_umem_odp *umem_odp =
+ rb_entry(node, struct ib_umem_odp, interval_tree.rb);
+ struct mlx5_ib_mr *mr = umem_odp->private;
- WRITE_ONCE(umem_odp->dying, 1);
- atomic_inc(&imr->num_leaf_free);
- schedule_work(&umem_odp->work);
+ if (mr->parent != imr)
+ continue;
- return 0;
-}
+ ib_umem_odp_unmap_dma_pages(umem_odp, ib_umem_start(umem_odp),
+ ib_umem_end(umem_odp));
-void mlx5_ib_free_implicit_mr(struct mlx5_ib_mr *imr)
-{
- struct ib_ucontext_per_mm *per_mm = mr_to_per_mm(imr);
+ if (umem_odp->dying)
+ continue;
- down_read(&per_mm->umem_rwsem);
- rbt_ib_umem_for_each_in_range(&per_mm->umem_tree, 0, ULLONG_MAX,
- mr_leaf_free, true, imr);
+ WRITE_ONCE(umem_odp->dying, 1);
+ atomic_inc(&imr->num_leaf_free);
+ schedule_work(&umem_odp->work);
+ }
up_read(&per_mm->umem_rwsem);
wait_event(imr->q_leaf_free, !atomic_read(&imr->num_leaf_free));
@@ -579,7 +575,6 @@ static int pagefault_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr,
u32 flags)
{
int npages = 0, current_seq, page_shift, ret, np;
- bool implicit = false;
struct ib_umem_odp *odp_mr = to_ib_umem_odp(mr->umem);
bool downgrade = flags & MLX5_PF_FLAGS_DOWNGRADE;
bool prefetch = flags & MLX5_PF_FLAGS_PREFETCH;
@@ -588,13 +583,12 @@ static int pagefault_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr,
struct ib_umem_odp *odp;
size_t size;
- if (!odp_mr->page_list) {
+ if (odp_mr->is_implicit_odp) {
odp = implicit_mr_get_data(mr, io_virt, bcnt);
if (IS_ERR(odp))
return PTR_ERR(odp);
mr = odp->private;
- implicit = true;
} else {
odp = odp_mr;
}
@@ -607,7 +601,7 @@ next_mr:
start_idx = (io_virt - (mr->mmkey.iova & page_mask)) >> page_shift;
access_mask = ODP_READ_ALLOWED_BIT;
- if (prefetch && !downgrade && !mr->umem->writable) {
+ if (prefetch && !downgrade && !odp->umem.writable) {
/* prefetch with write-access must
* be supported by the MR
*/
@@ -615,7 +609,7 @@ next_mr:
goto out;
}
- if (mr->umem->writable && !downgrade)
+ if (odp->umem.writable && !downgrade)
access_mask |= ODP_WRITE_ALLOWED_BIT;
current_seq = READ_ONCE(odp->notifiers_seq);
@@ -625,8 +619,8 @@ next_mr:
*/
smp_rmb();
- ret = ib_umem_odp_map_dma_pages(to_ib_umem_odp(mr->umem), io_virt, size,
- access_mask, current_seq);
+ ret = ib_umem_odp_map_dma_pages(odp, io_virt, size, access_mask,
+ current_seq);
if (ret < 0)
goto out;
@@ -634,8 +628,7 @@ next_mr:
np = ret;
mutex_lock(&odp->umem_mutex);
- if (!ib_umem_mmu_notifier_retry(to_ib_umem_odp(mr->umem),
- current_seq)) {
+ if (!ib_umem_mmu_notifier_retry(odp, current_seq)) {
/*
* No need to check whether the MTTs really belong to
* this MR, since ib_umem_odp_map_dma_pages already
@@ -668,7 +661,7 @@ next_mr:
io_virt += size;
next = odp_next(odp);
- if (unlikely(!next || next->umem.address != io_virt)) {
+ if (unlikely(!next || ib_umem_start(next) != io_virt)) {
mlx5_ib_dbg(dev, "next implicit leaf removed at 0x%llx. got %p\n",
io_virt, next);
return -EAGAIN;
@@ -682,19 +675,15 @@ next_mr:
out:
if (ret == -EAGAIN) {
- if (implicit || !odp->dying) {
- unsigned long timeout =
- msecs_to_jiffies(MMU_NOTIFIER_TIMEOUT);
-
- if (!wait_for_completion_timeout(
- &odp->notifier_completion,
- timeout)) {
- mlx5_ib_warn(dev, "timeout waiting for mmu notifier. seq %d against %d. notifiers_count=%d\n",
- current_seq, odp->notifiers_seq, odp->notifiers_count);
- }
- } else {
- /* The MR is being killed, kill the QP as well. */
- ret = -EFAULT;
+ unsigned long timeout = msecs_to_jiffies(MMU_NOTIFIER_TIMEOUT);
+
+ if (!wait_for_completion_timeout(&odp->notifier_completion,
+ timeout)) {
+ mlx5_ib_warn(
+ dev,
+ "timeout waiting for mmu notifier. seq %d against %d. notifiers_count=%d\n",
+ current_seq, odp->notifiers_seq,
+ odp->notifiers_count);
}
}
@@ -1598,6 +1587,7 @@ void mlx5_odp_init_mr_cache_entry(struct mlx5_cache_ent *ent)
static const struct ib_device_ops mlx5_ib_dev_odp_ops = {
.advise_mr = mlx5_ib_advise_mr,
+ .invalidate_range = mlx5_ib_invalidate_range,
};
int mlx5_ib_odp_init_one(struct mlx5_ib_dev *dev)
diff --git a/drivers/infiniband/sw/siw/Kconfig b/drivers/infiniband/sw/siw/Kconfig
index dace276aea14..b622fc62f2cd 100644
--- a/drivers/infiniband/sw/siw/Kconfig
+++ b/drivers/infiniband/sw/siw/Kconfig
@@ -1,6 +1,6 @@
config RDMA_SIW
tristate "Software RDMA over TCP/IP (iWARP) driver"
- depends on INET && INFINIBAND && LIBCRC32C && 64BIT
+ depends on INET && INFINIBAND && LIBCRC32C
select DMA_VIRT_OPS
help
This driver implements the iWARP RDMA transport over
diff --git a/drivers/infiniband/sw/siw/siw.h b/drivers/infiniband/sw/siw/siw.h
index 03fd7b2f595f..77b1aabf6ff3 100644
--- a/drivers/infiniband/sw/siw/siw.h
+++ b/drivers/infiniband/sw/siw/siw.h
@@ -214,7 +214,7 @@ struct siw_wqe {
struct siw_cq {
struct ib_cq base_cq;
spinlock_t lock;
- u64 *notify;
+ struct siw_cq_ctrl *notify;
struct siw_cqe *queue;
u32 cq_put;
u32 cq_get;
diff --git a/drivers/infiniband/sw/siw/siw_main.c b/drivers/infiniband/sw/siw/siw_main.c
index d0f140daf659..05a92f997f60 100644
--- a/drivers/infiniband/sw/siw/siw_main.c
+++ b/drivers/infiniband/sw/siw/siw_main.c
@@ -160,10 +160,8 @@ static int siw_init_cpulist(void)
out_err:
siw_cpu_info.num_nodes = 0;
- while (i) {
+ while (--i >= 0)
kfree(siw_cpu_info.tx_valid_cpus[i]);
- siw_cpu_info.tx_valid_cpus[i--] = NULL;
- }
kfree(siw_cpu_info.tx_valid_cpus);
siw_cpu_info.tx_valid_cpus = NULL;
diff --git a/drivers/infiniband/sw/siw/siw_qp.c b/drivers/infiniband/sw/siw/siw_qp.c
index e27bd5b35b96..0990307c5d2c 100644
--- a/drivers/infiniband/sw/siw/siw_qp.c
+++ b/drivers/infiniband/sw/siw/siw_qp.c
@@ -1013,18 +1013,24 @@ out:
*/
static bool siw_cq_notify_now(struct siw_cq *cq, u32 flags)
{
- u64 cq_notify;
+ u32 cq_notify;
if (!cq->base_cq.comp_handler)
return false;
- cq_notify = READ_ONCE(*cq->notify);
+ /* Read application shared notification state */
+ cq_notify = READ_ONCE(cq->notify->flags);
if ((cq_notify & SIW_NOTIFY_NEXT_COMPLETION) ||
((cq_notify & SIW_NOTIFY_SOLICITED) &&
(flags & SIW_WQE_SOLICITED))) {
- /* dis-arm CQ */
- smp_store_mb(*cq->notify, SIW_NOTIFY_NOT);
+ /*
+ * CQ notification is one-shot: Since the
+ * current CQE causes user notification,
+ * the CQ gets dis-aremd and must be re-aremd
+ * by the user for a new notification.
+ */
+ WRITE_ONCE(cq->notify->flags, SIW_NOTIFY_NOT);
return true;
}
diff --git a/drivers/infiniband/sw/siw/siw_verbs.c b/drivers/infiniband/sw/siw/siw_verbs.c
index 404e7ca4b30c..03176a3d1e18 100644
--- a/drivers/infiniband/sw/siw/siw_verbs.c
+++ b/drivers/infiniband/sw/siw/siw_verbs.c
@@ -1050,7 +1050,7 @@ int siw_create_cq(struct ib_cq *base_cq, const struct ib_cq_init_attr *attr,
spin_lock_init(&cq->lock);
- cq->notify = &((struct siw_cq_ctrl *)&cq->queue[size])->notify;
+ cq->notify = (struct siw_cq_ctrl *)&cq->queue[size];
if (udata) {
struct siw_uresp_create_cq uresp = {};
@@ -1142,11 +1142,17 @@ int siw_req_notify_cq(struct ib_cq *base_cq, enum ib_cq_notify_flags flags)
siw_dbg_cq(cq, "flags: 0x%02x\n", flags);
if ((flags & IB_CQ_SOLICITED_MASK) == IB_CQ_SOLICITED)
- /* CQ event for next solicited completion */
- smp_store_mb(*cq->notify, SIW_NOTIFY_SOLICITED);
+ /*
+ * Enable CQ event for next solicited completion.
+ * and make it visible to all associated producers.
+ */
+ smp_store_mb(cq->notify->flags, SIW_NOTIFY_SOLICITED);
else
- /* CQ event for any signalled completion */
- smp_store_mb(*cq->notify, SIW_NOTIFY_ALL);
+ /*
+ * Enable CQ event for any signalled completion.
+ * and make it visible to all associated producers.
+ */
+ smp_store_mb(cq->notify->flags, SIW_NOTIFY_ALL);
if (flags & IB_CQ_REPORT_MISSED_EVENTS)
return cq->cq_put - cq->cq_get;