1 files changed, 178 insertions, 147 deletions
diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c
index aa2413b50adc..374698186662 100644
--- a/drivers/infiniband/hw/mlx5/odp.c
+++ b/drivers/infiniband/hw/mlx5/odp.c
@@ -33,6 +33,8 @@
 #include <rdma/ib_umem.h>
 #include <rdma/ib_umem_odp.h>
 #include <linux/kernel.h>
+#include <linux/dma-buf.h>
+#include <linux/dma-resv.h>
 
 #include "mlx5_ib.h"
 #include "cmd.h"
@@ -113,7 +115,6 @@ static void populate_klm(struct mlx5_klm *pklm, size_t idx, size_t nentries,
 	 * xarray would be protected by the umem_mutex, however that is not
 	 * possible. Instead this uses a weaker update-then-lock pattern:
 	 *
-	 *  srcu_read_lock()
 	 *    xa_store()
 	 *    mutex_lock(umem_mutex)
 	 *     mlx5_ib_update_xlt()
@@ -124,12 +125,9 @@ static void populate_klm(struct mlx5_klm *pklm, size_t idx, size_t nentries,
 	 * before destroying.
 	 *
 	 * The umem_mutex provides the acquire/release semantic needed to make
-	 * the xa_store() visible to a racing thread. While SRCU is not
-	 * technically required, using it gives consistent use of the SRCU
-	 * locking around the xarray.
+	 * the xa_store() visible to a racing thread.
 	 */
 	lockdep_assert_held(&to_ib_umem_odp(imr->umem)->umem_mutex);
-	lockdep_assert_held(&mr_to_mdev(imr)->odp_srcu);
 
 	for (; pklm != end; pklm++, idx++) {
 		struct mlx5_ib_mr *mtt = xa_load(&imr->implicit_children, idx);
@@ -205,8 +203,8 @@ static void dma_fence_odp_mr(struct mlx5_ib_mr *mr)
 }
 
 /*
- * This must be called after the mr has been removed from implicit_children
- * and the SRCU synchronized.  NOTE: The MR does not necessarily have to be
+ * This must be called after the mr has been removed from implicit_children.
+ * NOTE: The MR does not necessarily have to be
  * empty here, parallel page faults could have raced with the free process and
  * added pages to it.
  */
@@ -216,19 +214,15 @@ static void free_implicit_child_mr(struct mlx5_ib_mr *mr, bool need_imr_xlt)
 	struct ib_umem_odp *odp_imr = to_ib_umem_odp(imr->umem);
 	struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem);
 	unsigned long idx = ib_umem_start(odp) >> MLX5_IMR_MTT_SHIFT;
-	int srcu_key;
 
-	/* implicit_child_mr's are not allowed to have deferred work */
-	WARN_ON(atomic_read(&mr->num_deferred_work));
+	mlx5r_deref_wait_odp_mkey(&mr->mmkey);
 
 	if (need_imr_xlt) {
-		srcu_key = srcu_read_lock(&mr_to_mdev(mr)->odp_srcu);
 		mutex_lock(&odp_imr->umem_mutex);
 		mlx5_ib_update_xlt(mr->parent, idx, 1, 0,
 				   MLX5_IB_UPD_XLT_INDIRECT |
 				   MLX5_IB_UPD_XLT_ATOMIC);
 		mutex_unlock(&odp_imr->umem_mutex);
-		srcu_read_unlock(&mr_to_mdev(mr)->odp_srcu, srcu_key);
 	}
 
 	dma_fence_odp_mr(mr);
@@ -236,26 +230,16 @@ static void free_implicit_child_mr(struct mlx5_ib_mr *mr, bool need_imr_xlt)
 	mr->parent = NULL;
 	mlx5_mr_cache_free(mr_to_mdev(mr), mr);
 	ib_umem_odp_release(odp);
-	if (atomic_dec_and_test(&imr->num_deferred_work))
-		wake_up(&imr->q_deferred_work);
 }
 
 static void free_implicit_child_mr_work(struct work_struct *work)
 {
 	struct mlx5_ib_mr *mr =
 		container_of(work, struct mlx5_ib_mr, odp_destroy.work);
+	struct mlx5_ib_mr *imr = mr->parent;
 
 	free_implicit_child_mr(mr, true);
-}
-
-static void free_implicit_child_mr_rcu(struct rcu_head *head)
-{
-	struct mlx5_ib_mr *mr =
-		container_of(head, struct mlx5_ib_mr, odp_destroy.rcu);
-
-	/* Freeing a MR is a sleeping operation, so bounce to a work queue */
-	INIT_WORK(&mr->odp_destroy.work, free_implicit_child_mr_work);
-	queue_work(system_unbound_wq, &mr->odp_destroy.work);
+	mlx5r_deref_odp_mkey(&imr->mmkey);
 }
 
 static void destroy_unused_implicit_child_mr(struct mlx5_ib_mr *mr)
@@ -264,21 +248,14 @@ static void destroy_unused_implicit_child_mr(struct mlx5_ib_mr *mr)
 	unsigned long idx = ib_umem_start(odp) >> MLX5_IMR_MTT_SHIFT;
 	struct mlx5_ib_mr *imr = mr->parent;
 
-	xa_lock(&imr->implicit_children);
-	/*
-	 * This can race with mlx5_ib_free_implicit_mr(), the first one to
-	 * reach the xa lock wins the race and destroys the MR.
-	 */
-	if (__xa_cmpxchg(&imr->implicit_children, idx, mr, NULL, GFP_ATOMIC) !=
-	    mr)
-		goto out_unlock;
+	if (!refcount_inc_not_zero(&imr->mmkey.usecount))
+		return;
 
-	atomic_inc(&imr->num_deferred_work);
-	call_srcu(&mr_to_mdev(mr)->odp_srcu, &mr->odp_destroy.rcu,
-		  free_implicit_child_mr_rcu);
+	xa_erase(&imr->implicit_children, idx);
 
-out_unlock:
-	xa_unlock(&imr->implicit_children);
+	/* Freeing a MR is a sleeping operation, so bounce to a work queue */
+	INIT_WORK(&mr->odp_destroy.work, free_implicit_child_mr_work);
+	queue_work(system_unbound_wq, &mr->odp_destroy.work);
 }
 
 static bool mlx5_ib_invalidate_range(struct mmu_interval_notifier *mni,
@@ -490,6 +467,12 @@ static struct mlx5_ib_mr *implicit_get_child_mr(struct mlx5_ib_mr *imr,
 	mr->parent = imr;
 	odp->private = mr;
 
+	/*
+	 * First refcount is owned by the xarray and second refconut
+	 * is returned to the caller.
+	 */
+	refcount_set(&mr->mmkey.usecount, 2);
+
 	err = mlx5_ib_update_xlt(mr, 0,
 				 MLX5_IMR_MTT_ENTRIES,
 				 PAGE_SHIFT,
@@ -500,27 +483,28 @@ static struct mlx5_ib_mr *implicit_get_child_mr(struct mlx5_ib_mr *imr,
 		goto out_mr;
 	}
 
-	/*
-	 * Once the store to either xarray completes any error unwind has to
-	 * use synchronize_srcu(). Avoid this with xa_reserve()
-	 */
-	ret = xa_cmpxchg(&imr->implicit_children, idx, NULL, mr,
-			 GFP_KERNEL);
+	xa_lock(&imr->implicit_children);
+	ret = __xa_cmpxchg(&imr->implicit_children, idx, NULL, mr,
+			   GFP_KERNEL);
 	if (unlikely(ret)) {
 		if (xa_is_err(ret)) {
 			ret = ERR_PTR(xa_err(ret));
-			goto out_mr;
+			goto out_lock;
 		}
 		/*
 		 * Another thread beat us to creating the child mr, use
 		 * theirs.
 		 */
-		goto out_mr;
+		refcount_inc(&ret->mmkey.usecount);
+		goto out_lock;
 	}
+	xa_unlock(&imr->implicit_children);
 
 	mlx5_ib_dbg(mr_to_mdev(imr), "key %x mr %p\n", mr->mmkey.key, mr);
 	return mr;
 
+out_lock:
+	xa_unlock(&imr->implicit_children);
 out_mr:
 	mlx5_mr_cache_free(mr_to_mdev(imr), mr);
 out_umem:
@@ -559,8 +543,6 @@ struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd,
 	imr->ibmr.device = &dev->ib_dev;
 	imr->umem = &umem_odp->umem;
 	imr->is_odp_implicit = true;
-	atomic_set(&imr->num_deferred_work, 0);
-	init_waitqueue_head(&imr->q_deferred_work);
 	xa_init(&imr->implicit_children);
 
 	err = mlx5_ib_update_xlt(imr, 0,
@@ -572,8 +554,7 @@ struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd,
 	if (err)
 		goto out_mr;
 
-	err = xa_err(xa_store(&dev->odp_mkeys, mlx5_base_mkey(imr->mmkey.key),
-			      &imr->mmkey, GFP_KERNEL));
+	err = mlx5r_store_odp_mkey(dev, &imr->mmkey);
 	if (err)
 		goto out_mr;
 
@@ -591,60 +572,35 @@ void mlx5_ib_free_implicit_mr(struct mlx5_ib_mr *imr)
 {
 	struct ib_umem_odp *odp_imr = to_ib_umem_odp(imr->umem);
 	struct mlx5_ib_dev *dev = mr_to_mdev(imr);
-	struct list_head destroy_list;
 	struct mlx5_ib_mr *mtt;
-	struct mlx5_ib_mr *tmp;
 	unsigned long idx;
 
-	INIT_LIST_HEAD(&destroy_list);
-
 	xa_erase(&dev->odp_mkeys, mlx5_base_mkey(imr->mmkey.key));
 	/*
-	 * This stops the SRCU protected page fault path from touching either
-	 * the imr or any children. The page fault path can only reach the
-	 * children xarray via the imr.
-	 */
-	synchronize_srcu(&dev->odp_srcu);
-
-	/*
 	 * All work on the prefetch list must be completed, xa_erase() prevented
 	 * new work from being created.
 	 */
-	wait_event(imr->q_deferred_work, !atomic_read(&imr->num_deferred_work));
-
+	mlx5r_deref_wait_odp_mkey(&imr->mmkey);
 	/*
 	 * At this point it is forbidden for any other thread to enter
 	 * pagefault_mr() on this imr. It is already forbidden to call
 	 * pagefault_mr() on an implicit child. Due to this additions to
 	 * implicit_children are prevented.
+	 * In addition, any new call to destroy_unused_implicit_child_mr()
+	 * may return immediately.
 	 */
 
 	/*
-	 * Block destroy_unused_implicit_child_mr() from incrementing
-	 * num_deferred_work.
-	 */
-	xa_lock(&imr->implicit_children);
-	xa_for_each (&imr->implicit_children, idx, mtt) {
-		__xa_erase(&imr->implicit_children, idx);
-		list_add(&mtt->odp_destroy.elm, &destroy_list);
-	}
-	xa_unlock(&imr->implicit_children);
-
-	/*
-	 * Wait for any concurrent destroy_unused_implicit_child_mr() to
-	 * complete.
-	 */
-	wait_event(imr->q_deferred_work, !atomic_read(&imr->num_deferred_work));
-
-	/*
 	 * Fence the imr before we destroy the children. This allows us to
 	 * skip updating the XLT of the imr during destroy of the child mkey
 	 * the imr points to.
 	 */
 	mlx5_mr_cache_invalidate(imr);
 
-	list_for_each_entry_safe (mtt, tmp, &destroy_list, odp_destroy.elm)
+	xa_for_each(&imr->implicit_children, idx, mtt) {
+		xa_erase(&imr->implicit_children, idx);
 		free_implicit_child_mr(mtt, false);
+	}
 
 	mlx5_mr_cache_free(dev, imr);
 	ib_umem_odp_release(odp_imr);
@@ -663,13 +619,39 @@ void mlx5_ib_fence_odp_mr(struct mlx5_ib_mr *mr)
 	xa_erase(&mr_to_mdev(mr)->odp_mkeys, mlx5_base_mkey(mr->mmkey.key));
 
 	/* Wait for all running page-fault handlers to finish. */
-	synchronize_srcu(&mr_to_mdev(mr)->odp_srcu);
-
-	wait_event(mr->q_deferred_work, !atomic_read(&mr->num_deferred_work));
+	mlx5r_deref_wait_odp_mkey(&mr->mmkey);
 
 	dma_fence_odp_mr(mr);
 }
 
+/**
+ * mlx5_ib_fence_dmabuf_mr - Stop all access to the dmabuf MR
+ * @mr: to fence
+ *
+ * On return no parallel threads will be touching this MR and no DMA will be
+ * active.
+ */
+void mlx5_ib_fence_dmabuf_mr(struct mlx5_ib_mr *mr)
+{
+	struct ib_umem_dmabuf *umem_dmabuf = to_ib_umem_dmabuf(mr->umem);
+
+	/* Prevent new page faults and prefetch requests from succeeding */
+	xa_erase(&mr_to_mdev(mr)->odp_mkeys, mlx5_base_mkey(mr->mmkey.key));
+
+	mlx5r_deref_wait_odp_mkey(&mr->mmkey);
+
+	dma_resv_lock(umem_dmabuf->attach->dmabuf->resv, NULL);
+	mlx5_mr_cache_invalidate(mr);
+	umem_dmabuf->private = NULL;
+	ib_umem_dmabuf_unmap_pages(umem_dmabuf);
+	dma_resv_unlock(umem_dmabuf->attach->dmabuf->resv);
+
+	if (!mr->cache_ent) {
+		mlx5_core_destroy_mkey(mr_to_mdev(mr)->mdev, &mr->mmkey);
+		WARN_ON(mr->descs);
+	}
+}
+
 #define MLX5_PF_FLAGS_DOWNGRADE BIT(1)
 #define MLX5_PF_FLAGS_SNAPSHOT BIT(2)
 #define MLX5_PF_FLAGS_ENABLE BIT(3)
@@ -747,8 +729,10 @@ static int pagefault_implicit_mr(struct mlx5_ib_mr *imr,
 		struct mlx5_ib_mr *mtt;
 		u64 len;
 
+		xa_lock(&imr->implicit_children);
 		mtt = xa_load(&imr->implicit_children, idx);
 		if (unlikely(!mtt)) {
+			xa_unlock(&imr->implicit_children);
 			mtt = implicit_get_child_mr(imr, idx);
 			if (IS_ERR(mtt)) {
 				ret = PTR_ERR(mtt);
@@ -756,6 +740,9 @@ static int pagefault_implicit_mr(struct mlx5_ib_mr *imr,
 			}
 			upd_start_idx = min(upd_start_idx, idx);
 			upd_len = idx - upd_start_idx + 1;
+		} else {
+			refcount_inc(&mtt->mmkey.usecount);
+			xa_unlock(&imr->implicit_children);
 		}
 
 		umem_odp = to_ib_umem_odp(mtt->umem);
@@ -764,6 +751,9 @@ static int pagefault_implicit_mr(struct mlx5_ib_mr *imr,
 
 		ret = pagefault_real_mr(mtt, umem_odp, user_va, len,
 					bytes_mapped, flags);
+
+		mlx5r_deref_odp_mkey(&mtt->mmkey);
+
 		if (ret < 0)
 			goto out;
 		user_va += len;
@@ -803,6 +793,44 @@ out:
 	return ret;
 }
 
+static int pagefault_dmabuf_mr(struct mlx5_ib_mr *mr, size_t bcnt,
+			       u32 *bytes_mapped, u32 flags)
+{
+	struct ib_umem_dmabuf *umem_dmabuf = to_ib_umem_dmabuf(mr->umem);
+	u32 xlt_flags = 0;
+	int err;
+	unsigned int page_size;
+
+	if (flags & MLX5_PF_FLAGS_ENABLE)
+		xlt_flags |= MLX5_IB_UPD_XLT_ENABLE;
+
+	dma_resv_lock(umem_dmabuf->attach->dmabuf->resv, NULL);
+	err = ib_umem_dmabuf_map_pages(umem_dmabuf);
+	if (err) {
+		dma_resv_unlock(umem_dmabuf->attach->dmabuf->resv);
+		return err;
+	}
+
+	page_size = mlx5_umem_find_best_pgsz(&umem_dmabuf->umem, mkc,
+					     log_page_size, 0,
+					     umem_dmabuf->umem.iova);
+	if (unlikely(page_size < PAGE_SIZE)) {
+		ib_umem_dmabuf_unmap_pages(umem_dmabuf);
+		err = -EINVAL;
+	} else {
+		err = mlx5_ib_update_mr_pas(mr, xlt_flags);
+	}
+	dma_resv_unlock(umem_dmabuf->attach->dmabuf->resv);
+
+	if (err)
+		return err;
+
+	if (bytes_mapped)
+		*bytes_mapped += bcnt;
+
+	return ib_umem_num_pages(mr->umem);
+}
+
 /*
  * Returns:
  *  -EFAULT: The io_virt->bcnt is not within the MR, it covers pages that are
@@ -817,10 +845,12 @@ static int pagefault_mr(struct mlx5_ib_mr *mr, u64 io_virt, size_t bcnt,
 {
 	struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem);
 
-	lockdep_assert_held(&mr_to_mdev(mr)->odp_srcu);
 	if (unlikely(io_virt < mr->mmkey.iova))
 		return -EFAULT;
 
+	if (mr->umem->is_dmabuf)
+		return pagefault_dmabuf_mr(mr, bcnt, bytes_mapped, flags);
+
 	if (!odp->is_implicit_odp) {
 		u64 user_va;
 
@@ -847,6 +877,16 @@ int mlx5_ib_init_odp_mr(struct mlx5_ib_mr *mr)
 	return ret >= 0 ? 0 : ret;
 }
 
+int mlx5_ib_init_dmabuf_mr(struct mlx5_ib_mr *mr)
+{
+	int ret;
+
+	ret = pagefault_dmabuf_mr(mr, mr->umem->length, NULL,
+				  MLX5_PF_FLAGS_ENABLE);
+
+	return ret >= 0 ? 0 : ret;
+}
+
 struct pf_frame {
 	struct pf_frame *next;
 	u32 key;
@@ -896,7 +936,7 @@ static int pagefault_single_data_segment(struct mlx5_ib_dev *dev,
 					 u32 *bytes_committed,
 					 u32 *bytes_mapped)
 {
-	int npages = 0, srcu_key, ret, i, outlen, cur_outlen = 0, depth = 0;
+	int npages = 0, ret, i, outlen, cur_outlen = 0, depth = 0;
 	struct pf_frame *head = NULL, *frame;
 	struct mlx5_core_mkey *mmkey;
 	struct mlx5_ib_mr *mr;
@@ -905,14 +945,14 @@ static int pagefault_single_data_segment(struct mlx5_ib_dev *dev,
 	size_t offset;
 	int ndescs;
 
-	srcu_key = srcu_read_lock(&dev->odp_srcu);
-
 	io_virt += *bytes_committed;
 	bcnt -= *bytes_committed;
 
 next_mr:
+	xa_lock(&dev->odp_mkeys);
 	mmkey = xa_load(&dev->odp_mkeys, mlx5_base_mkey(key));
 	if (!mmkey) {
+		xa_unlock(&dev->odp_mkeys);
 		mlx5_ib_dbg(
 			dev,
 			"skipping non ODP MR (lkey=0x%06x) in page fault handler.\n",
@@ -925,12 +965,15 @@ next_mr:
 		 * faulted.
 		 */
 		ret = 0;
-		goto srcu_unlock;
+		goto end;
 	}
+	refcount_inc(&mmkey->usecount);
+	xa_unlock(&dev->odp_mkeys);
+
 	if (!mkey_is_eq(mmkey, key)) {
 		mlx5_ib_dbg(dev, "failed to find mkey %x\n", key);
 		ret = -EFAULT;
-		goto srcu_unlock;
+		goto end;
 	}
 
 	switch (mmkey->type) {
@@ -939,7 +982,7 @@ next_mr:
 
 		ret = pagefault_mr(mr, io_virt, bcnt, bytes_mapped, 0);
 		if (ret < 0)
-			goto srcu_unlock;
+			goto end;
 
 		mlx5_update_odp_stats(mr, faults, ret);
 
@@ -954,7 +997,7 @@ next_mr:
 		if (depth >= MLX5_CAP_GEN(dev->mdev, max_indirection)) {
 			mlx5_ib_dbg(dev, "indirection level exceeded\n");
 			ret = -EFAULT;
-			goto srcu_unlock;
+			goto end;
 		}
 
 		outlen = MLX5_ST_SZ_BYTES(query_mkey_out) +
@@ -965,7 +1008,7 @@ next_mr:
 			out = kzalloc(outlen, GFP_KERNEL);
 			if (!out) {
 				ret = -ENOMEM;
-				goto srcu_unlock;
+				goto end;
 			}
 			cur_outlen = outlen;
 		}
@@ -975,7 +1018,7 @@ next_mr:
 
 		ret = mlx5_core_query_mkey(dev->mdev, mmkey, out, outlen);
 		if (ret)
-			goto srcu_unlock;
+			goto end;
 
 		offset = io_virt - MLX5_GET64(query_mkey_out, out,
 					      memory_key_mkey_entry.start_addr);
@@ -989,7 +1032,7 @@ next_mr:
 			frame = kzalloc(sizeof(*frame), GFP_KERNEL);
 			if (!frame) {
 				ret = -ENOMEM;
-				goto srcu_unlock;
+				goto end;
 			}
 
 			frame->key = be32_to_cpu(pklm->key);
@@ -1008,7 +1051,7 @@ next_mr:
 	default:
 		mlx5_ib_dbg(dev, "wrong mkey type %d\n", mmkey->type);
 		ret = -EFAULT;
-		goto srcu_unlock;
+		goto end;
 	}
 
 	if (head) {
@@ -1021,10 +1064,13 @@ next_mr:
 		depth = frame->depth;
 		kfree(frame);
 
+		mlx5r_deref_odp_mkey(mmkey);
 		goto next_mr;
 	}
 
-srcu_unlock:
+end:
+	if (mmkey)
+		mlx5r_deref_odp_mkey(mmkey);
 	while (head) {
 		frame = head;
 		head = frame->next;
@@ -1032,7 +1078,6 @@ srcu_unlock:
 	}
 	kfree(out);
 
-	srcu_read_unlock(&dev->odp_srcu, srcu_key);
 	*bytes_committed = 0;
 	return ret ? ret : npages;
 }
@@ -1040,16 +1085,18 @@ srcu_unlock:
 /**
  * Parse a series of data segments for page fault handling.
  *
- * @pfault contains page fault information.
- * @wqe points at the first data segment in the WQE.
- * @wqe_end points after the end of the WQE.
- * @bytes_mapped receives the number of bytes that the function was able to
- *               map. This allows the caller to decide intelligently whether
- *               enough memory was mapped to resolve the page fault
- *               successfully (e.g. enough for the next MTU, or the entire
- *               WQE).
- * @total_wqe_bytes receives the total data size of this WQE in bytes (minus
- *                  the committed bytes).
+ * @dev:  Pointer to mlx5 IB device
+ * @pfault: contains page fault information.
+ * @wqe: points at the first data segment in the WQE.
+ * @wqe_end: points after the end of the WQE.
+ * @bytes_mapped: receives the number of bytes that the function was able to
+ *                map. This allows the caller to decide intelligently whether
+ *                enough memory was mapped to resolve the page fault
+ *                successfully (e.g. enough for the next MTU, or the entire
+ *                WQE).
+ * @total_wqe_bytes: receives the total data size of this WQE in bytes (minus
+ *                   the committed bytes).
+ * @receive_queue: receive WQE end of sg list
  *
  * Returns the number of pages loaded if positive, zero for an empty WQE, or a
  * negative error code.
@@ -1738,8 +1785,8 @@ static void destroy_prefetch_work(struct prefetch_mr_work *work)
 	u32 i;
 
 	for (i = 0; i < work->num_sge; ++i)
-		if (atomic_dec_and_test(&work->frags[i].mr->num_deferred_work))
-			wake_up(&work->frags[i].mr->q_deferred_work);
+		mlx5r_deref_odp_mkey(&work->frags[i].mr->mmkey);
+
 	kvfree(work);
 }
 
@@ -1749,27 +1796,30 @@ get_prefetchable_mr(struct ib_pd *pd, enum ib_uverbs_advise_mr_advice advice,
 {
 	struct mlx5_ib_dev *dev = to_mdev(pd->device);
 	struct mlx5_core_mkey *mmkey;
-	struct ib_umem_odp *odp;
-	struct mlx5_ib_mr *mr;
-
-	lockdep_assert_held(&dev->odp_srcu);
+	struct mlx5_ib_mr *mr = NULL;
 
+	xa_lock(&dev->odp_mkeys);
 	mmkey = xa_load(&dev->odp_mkeys, mlx5_base_mkey(lkey));
 	if (!mmkey || mmkey->key != lkey || mmkey->type != MLX5_MKEY_MR)
-		return NULL;
+		goto end;
 
 	mr = container_of(mmkey, struct mlx5_ib_mr, mmkey);
 
-	if (mr->ibmr.pd != pd)
-		return NULL;
-
-	odp = to_ib_umem_odp(mr->umem);
+	if (mr->ibmr.pd != pd) {
+		mr = NULL;
+		goto end;
+	}
 
 	/* prefetch with write-access must be supported by the MR */
 	if (advice == IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_WRITE &&
-	    !odp->umem.writable)
-		return NULL;
+	    !mr->umem->writable) {
+		mr = NULL;
+		goto end;
+	}
 
+	refcount_inc(&mmkey->usecount);
+end:
+	xa_unlock(&dev->odp_mkeys);
 	return mr;
 }
 
@@ -1777,17 +1827,12 @@ static void mlx5_ib_prefetch_mr_work(struct work_struct *w)
 {
 	struct prefetch_mr_work *work =
 		container_of(w, struct prefetch_mr_work, work);
-	struct mlx5_ib_dev *dev;
 	u32 bytes_mapped = 0;
-	int srcu_key;
 	int ret;
 	u32 i;
 
 	/* We rely on IB/core that work is executed if we have num_sge != 0 only. */
 	WARN_ON(!work->num_sge);
-	dev = mr_to_mdev(work->frags[0].mr);
-	/* SRCU should be held when calling to mlx5_odp_populate_xlt() */
-	srcu_key = srcu_read_lock(&dev->odp_srcu);
 	for (i = 0; i < work->num_sge; ++i) {
 		ret = pagefault_mr(work->frags[i].mr, work->frags[i].io_virt,
 				   work->frags[i].length, &bytes_mapped,
@@ -1796,7 +1841,6 @@ static void mlx5_ib_prefetch_mr_work(struct work_struct *w)
 			continue;
 		mlx5_update_odp_stats(work->frags[i].mr, prefetch, ret);
 	}
-	srcu_read_unlock(&dev->odp_srcu, srcu_key);
 
 	destroy_prefetch_work(work);
 }
@@ -1820,9 +1864,6 @@ static bool init_prefetch_work(struct ib_pd *pd,
 			work->num_sge = i;
 			return false;
 		}
-
-		/* Keep the MR pointer will valid outside the SRCU */
-		atomic_inc(&work->frags[i].mr->num_deferred_work);
 	}
 	work->num_sge = num_sge;
 	return true;
@@ -1833,42 +1874,35 @@ static int mlx5_ib_prefetch_sg_list(struct ib_pd *pd,
 				    u32 pf_flags, struct ib_sge *sg_list,
 				    u32 num_sge)
 {
-	struct mlx5_ib_dev *dev = to_mdev(pd->device);
 	u32 bytes_mapped = 0;
-	int srcu_key;
 	int ret = 0;
 	u32 i;
 
-	srcu_key = srcu_read_lock(&dev->odp_srcu);
 	for (i = 0; i < num_sge; ++i) {
 		struct mlx5_ib_mr *mr;
 
 		mr = get_prefetchable_mr(pd, advice, sg_list[i].lkey);
-		if (!mr) {
-			ret = -ENOENT;
-			goto out;
-		}
+		if (!mr)
+			return -ENOENT;
 		ret = pagefault_mr(mr, sg_list[i].addr, sg_list[i].length,
 				   &bytes_mapped, pf_flags);
-		if (ret < 0)
-			goto out;
+		if (ret < 0) {
+			mlx5r_deref_odp_mkey(&mr->mmkey);
+			return ret;
+		}
 		mlx5_update_odp_stats(mr, prefetch, ret);
+		mlx5r_deref_odp_mkey(&mr->mmkey);
 	}
-	ret = 0;
 
-out:
-	srcu_read_unlock(&dev->odp_srcu, srcu_key);
-	return ret;
+	return 0;
 }
 
 int mlx5_ib_advise_mr_prefetch(struct ib_pd *pd,
 			       enum ib_uverbs_advise_mr_advice advice,
 			       u32 flags, struct ib_sge *sg_list, u32 num_sge)
 {
-	struct mlx5_ib_dev *dev = to_mdev(pd->device);
 	u32 pf_flags = 0;
 	struct prefetch_mr_work *work;
-	int srcu_key;
 
 	if (advice == IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH)
 		pf_flags |= MLX5_PF_FLAGS_DOWNGRADE;
@@ -1884,13 +1918,10 @@ int mlx5_ib_advise_mr_prefetch(struct ib_pd *pd,
 	if (!work)
 		return -ENOMEM;
 
-	srcu_key = srcu_read_lock(&dev->odp_srcu);
 	if (!init_prefetch_work(pd, advice, pf_flags, work, sg_list, num_sge)) {
-		srcu_read_unlock(&dev->odp_srcu, srcu_key);
 		destroy_prefetch_work(work);
 		return -EINVAL;
 	}
 	queue_work(system_unbound_wq, &work->work);
-	srcu_read_unlock(&dev->odp_srcu, srcu_key);
 	return 0;
 }