author    Jason Gunthorpe <jgg@mellanox.com>  2019-10-09 13:09:34 -0300
committer Jason Gunthorpe <jgg@mellanox.com>  2019-10-28 16:41:14 -0300
commit    09689703d29a3b75c510c198c3aca85d7d8b50c7 (patch)
tree      378ec2883f70d270bbc1dc2068ede78979163c39 /drivers/infiniband/hw/mlx5/odp.c
parent    RDMA/mlx5: Do not store implicit children in the odp_mkeys xarray (diff)
RDMA/mlx5: Do not race with mlx5_ib_invalidate_range during create and destroy
For creation, as soon as the umem_odp is created the notifier can be
called, however the underlying MR may not have been set up yet. This
would cause problems if mlx5_ib_invalidate_range() runs. There is some
confusing/unlocked/racy code that might be trying to solve this, but
without locks it isn't going to work right.

Instead, trivially solve the problem by short-circuiting the
invalidation if there are not yet any DMA mapped pages. By definition
there is nothing to invalidate in this case. The create code will have
the umem fully set up before anything is DMA mapped, and npages is
fully locked by the umem_mutex.

For destroy, invalidate the entire MR at the HW to stop DMA, then DMA
unmap the pages before destroying the MR. This drives npages to zero
and prevents similar racing with invalidate while the MR is undergoing
destruction.

Arguably it would be better if the umem was created after the MR and
destroyed before, but that would require a big rework of the MR code.

Fixes: 6aec21f6a832 ("IB/mlx5: Page faults handling infrastructure")
Link: https://lore.kernel.org/r/20191009160934.3143-15-jgg@ziepe.ca
Reviewed-by: Artemy Kovalyov <artemyko@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
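In outline, the invalidation path now takes umem_mutex before reading
anything from the umem_odp and bails out while npages is zero; the
following is a minimal sketch of that flow, condensed from the hunks
below (the range clamping and MTT zap loop are elided):

    /*
     * Condensed sketch of the reworked entry point. umem_mutex
     * serializes against the page-fault path that DMA maps pages, so
     * npages == 0 means nothing is mapped yet and umem_odp->private
     * (the MR pointer) may not even be set up.
     */
    void mlx5_ib_invalidate_range(struct ib_umem_odp *umem_odp,
    			      unsigned long start, unsigned long end)
    {
    	struct mlx5_ib_mr *mr;

    	mutex_lock(&umem_odp->umem_mutex);
    	if (!umem_odp->npages)
    		goto out;		/* nothing DMA mapped, nothing to invalidate */
    	mr = umem_odp->private;		/* valid once the first page is mapped */

    	/* ... clamp [start, end) and zero the MTTs as in the full hunk ... */
    out:
    	mutex_unlock(&umem_odp->umem_mutex);
    }

The destroy side (dma_fence_odp_mr() in the diff) relies on the same
rule from the other direction: it unmaps all pages under umem_mutex,
driving npages to zero, so any later invalidation takes the short
circuit above.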
Diffstat (limited to '')
-rw-r--r--  drivers/infiniband/hw/mlx5/odp.c | 70
 1 file changed, 61 insertions(+), 9 deletions(-)
diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c
index 06e24d5e7609..bcfc09846697 100644
--- a/drivers/infiniband/hw/mlx5/odp.c
+++ b/drivers/infiniband/hw/mlx5/odp.c
@@ -144,6 +144,27 @@ void mlx5_odp_populate_klm(struct mlx5_klm *pklm, size_t idx, size_t nentries,
}
}
+static void dma_fence_odp_mr(struct mlx5_ib_mr *mr)
+{
+ struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem);
+
+ /* Ensure mlx5_ib_invalidate_range() will not touch the MR any more */
+ mutex_lock(&odp->umem_mutex);
+ if (odp->npages) {
+ mlx5_mr_cache_invalidate(mr);
+ ib_umem_odp_unmap_dma_pages(odp, ib_umem_start(odp),
+ ib_umem_end(odp));
+ WARN_ON(odp->npages);
+ }
+ odp->private = NULL;
+ mutex_unlock(&odp->umem_mutex);
+
+ if (!mr->allocated_from_cache) {
+ mlx5_core_destroy_mkey(mr->dev->mdev, &mr->mmkey);
+ WARN_ON(mr->descs);
+ }
+}
+
/*
* This must be called after the mr has been removed from implicit_children
* and the SRCU synchronized. NOTE: The MR does not necessarily have to be
@@ -171,6 +192,8 @@ static void free_implicit_child_mr(struct mlx5_ib_mr *mr, bool need_imr_xlt)
srcu_read_unlock(&mr->dev->odp_srcu, srcu_key);
}
+ dma_fence_odp_mr(mr);
+
mr->parent = NULL;
mlx5_mr_cache_free(mr->dev, mr);
ib_umem_odp_release(odp);
@@ -228,16 +251,15 @@ void mlx5_ib_invalidate_range(struct ib_umem_odp *umem_odp, unsigned long start,
int in_block = 0;
u64 addr;
- if (!umem_odp) {
- pr_err("invalidation called on NULL umem or non-ODP umem\n");
- return;
- }
-
+ mutex_lock(&umem_odp->umem_mutex);
+ /*
+ * If npages is zero then umem_odp->private may not be setup yet. This
+ * does not complete until after the first page is mapped for DMA.
+ */
+ if (!umem_odp->npages)
+ goto out;
mr = umem_odp->private;
- if (!mr || !mr->ibmr.pd)
- return;
-
start = max_t(u64, ib_umem_start(umem_odp), start);
end = min_t(u64, ib_umem_end(umem_odp), end);
@@ -247,7 +269,6 @@ void mlx5_ib_invalidate_range(struct ib_umem_odp *umem_odp, unsigned long start,
* overwrite the same MTTs. Concurent invalidations might race us,
* but they will write 0s as well, so no difference in the end result.
*/
- mutex_lock(&umem_odp->umem_mutex);
for (addr = start; addr < end; addr += BIT(umem_odp->page_shift)) {
idx = (addr - ib_umem_start(umem_odp)) >> umem_odp->page_shift;
/*
@@ -289,6 +310,7 @@ void mlx5_ib_invalidate_range(struct ib_umem_odp *umem_odp, unsigned long start,
if (unlikely(!umem_odp->npages && mr->parent))
destroy_unused_implicit_child_mr(mr);
+out:
mutex_unlock(&umem_odp->umem_mutex);
}
@@ -536,6 +558,13 @@ void mlx5_ib_free_implicit_mr(struct mlx5_ib_mr *imr)
WARN_ON(atomic_read(&imr->num_deferred_work));
}
+ /*
+ * Fence the imr before we destroy the children. This allows us to
+ * skip updating the XLT of the imr during destroy of the child mkey
+ * the imr points to.
+ */
+ mlx5_mr_cache_invalidate(imr);
+
list_for_each_entry_safe (mtt, tmp, &destroy_list, odp_destroy.elm)
free_implicit_child_mr(mtt, false);
@@ -543,6 +572,29 @@ void mlx5_ib_free_implicit_mr(struct mlx5_ib_mr *imr)
ib_umem_odp_release(odp_imr);
}
+/**
+ * mlx5_ib_fence_odp_mr - Stop all access to the ODP MR
+ * @mr: to fence
+ *
+ * On return no parallel threads will be touching this MR and no DMA will be
+ * active.
+ */
+void mlx5_ib_fence_odp_mr(struct mlx5_ib_mr *mr)
+{
+ /* Prevent new page faults and prefetch requests from succeeding */
+ xa_erase(&mr->dev->odp_mkeys, mlx5_base_mkey(mr->mmkey.key));
+
+ /* Wait for all running page-fault handlers to finish. */
+ synchronize_srcu(&mr->dev->odp_srcu);
+
+ if (atomic_read(&mr->num_deferred_work)) {
+ flush_workqueue(system_unbound_wq);
+ WARN_ON(atomic_read(&mr->num_deferred_work));
+ }
+
+ dma_fence_odp_mr(mr);
+}
+
#define MLX5_PF_FLAGS_DOWNGRADE BIT(1)
static int pagefault_real_mr(struct mlx5_ib_mr *mr, struct ib_umem_odp *odp,
u64 user_va, size_t bcnt, u32 *bytes_mapped,