aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/infiniband/hw/mlx5/odp.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/infiniband/hw/mlx5/odp.c')
-rw-r--r--drivers/infiniband/hw/mlx5/odp.c325
1 files changed, 178 insertions, 147 deletions
diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c
index aa2413b50adc..374698186662 100644
--- a/drivers/infiniband/hw/mlx5/odp.c
+++ b/drivers/infiniband/hw/mlx5/odp.c
@@ -33,6 +33,8 @@
#include <rdma/ib_umem.h>
#include <rdma/ib_umem_odp.h>
#include <linux/kernel.h>
+#include <linux/dma-buf.h>
+#include <linux/dma-resv.h>
#include "mlx5_ib.h"
#include "cmd.h"
@@ -113,7 +115,6 @@ static void populate_klm(struct mlx5_klm *pklm, size_t idx, size_t nentries,
* xarray would be protected by the umem_mutex, however that is not
* possible. Instead this uses a weaker update-then-lock pattern:
*
- * srcu_read_lock()
* xa_store()
* mutex_lock(umem_mutex)
* mlx5_ib_update_xlt()
@@ -124,12 +125,9 @@ static void populate_klm(struct mlx5_klm *pklm, size_t idx, size_t nentries,
* before destroying.
*
* The umem_mutex provides the acquire/release semantic needed to make
- * the xa_store() visible to a racing thread. While SRCU is not
- * technically required, using it gives consistent use of the SRCU
- * locking around the xarray.
+ * the xa_store() visible to a racing thread.
*/
lockdep_assert_held(&to_ib_umem_odp(imr->umem)->umem_mutex);
- lockdep_assert_held(&mr_to_mdev(imr)->odp_srcu);
for (; pklm != end; pklm++, idx++) {
struct mlx5_ib_mr *mtt = xa_load(&imr->implicit_children, idx);
@@ -205,8 +203,8 @@ static void dma_fence_odp_mr(struct mlx5_ib_mr *mr)
}
/*
- * This must be called after the mr has been removed from implicit_children
- * and the SRCU synchronized. NOTE: The MR does not necessarily have to be
+ * This must be called after the mr has been removed from implicit_children.
+ * NOTE: The MR does not necessarily have to be
* empty here, parallel page faults could have raced with the free process and
* added pages to it.
*/
@@ -216,19 +214,15 @@ static void free_implicit_child_mr(struct mlx5_ib_mr *mr, bool need_imr_xlt)
struct ib_umem_odp *odp_imr = to_ib_umem_odp(imr->umem);
struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem);
unsigned long idx = ib_umem_start(odp) >> MLX5_IMR_MTT_SHIFT;
- int srcu_key;
- /* implicit_child_mr's are not allowed to have deferred work */
- WARN_ON(atomic_read(&mr->num_deferred_work));
+ mlx5r_deref_wait_odp_mkey(&mr->mmkey);
if (need_imr_xlt) {
- srcu_key = srcu_read_lock(&mr_to_mdev(mr)->odp_srcu);
mutex_lock(&odp_imr->umem_mutex);
mlx5_ib_update_xlt(mr->parent, idx, 1, 0,
MLX5_IB_UPD_XLT_INDIRECT |
MLX5_IB_UPD_XLT_ATOMIC);
mutex_unlock(&odp_imr->umem_mutex);
- srcu_read_unlock(&mr_to_mdev(mr)->odp_srcu, srcu_key);
}
dma_fence_odp_mr(mr);
@@ -236,26 +230,16 @@ static void free_implicit_child_mr(struct mlx5_ib_mr *mr, bool need_imr_xlt)
mr->parent = NULL;
mlx5_mr_cache_free(mr_to_mdev(mr), mr);
ib_umem_odp_release(odp);
- if (atomic_dec_and_test(&imr->num_deferred_work))
- wake_up(&imr->q_deferred_work);
}
static void free_implicit_child_mr_work(struct work_struct *work)
{
struct mlx5_ib_mr *mr =
container_of(work, struct mlx5_ib_mr, odp_destroy.work);
+ struct mlx5_ib_mr *imr = mr->parent;
free_implicit_child_mr(mr, true);
-}
-
-static void free_implicit_child_mr_rcu(struct rcu_head *head)
-{
- struct mlx5_ib_mr *mr =
- container_of(head, struct mlx5_ib_mr, odp_destroy.rcu);
-
- /* Freeing a MR is a sleeping operation, so bounce to a work queue */
- INIT_WORK(&mr->odp_destroy.work, free_implicit_child_mr_work);
- queue_work(system_unbound_wq, &mr->odp_destroy.work);
+ mlx5r_deref_odp_mkey(&imr->mmkey);
}
static void destroy_unused_implicit_child_mr(struct mlx5_ib_mr *mr)
@@ -264,21 +248,14 @@ static void destroy_unused_implicit_child_mr(struct mlx5_ib_mr *mr)
unsigned long idx = ib_umem_start(odp) >> MLX5_IMR_MTT_SHIFT;
struct mlx5_ib_mr *imr = mr->parent;
- xa_lock(&imr->implicit_children);
- /*
- * This can race with mlx5_ib_free_implicit_mr(), the first one to
- * reach the xa lock wins the race and destroys the MR.
- */
- if (__xa_cmpxchg(&imr->implicit_children, idx, mr, NULL, GFP_ATOMIC) !=
- mr)
- goto out_unlock;
+ if (!refcount_inc_not_zero(&imr->mmkey.usecount))
+ return;
- atomic_inc(&imr->num_deferred_work);
- call_srcu(&mr_to_mdev(mr)->odp_srcu, &mr->odp_destroy.rcu,
- free_implicit_child_mr_rcu);
+ xa_erase(&imr->implicit_children, idx);
-out_unlock:
- xa_unlock(&imr->implicit_children);
+ /* Freeing a MR is a sleeping operation, so bounce to a work queue */
+ INIT_WORK(&mr->odp_destroy.work, free_implicit_child_mr_work);
+ queue_work(system_unbound_wq, &mr->odp_destroy.work);
}
static bool mlx5_ib_invalidate_range(struct mmu_interval_notifier *mni,
@@ -490,6 +467,12 @@ static struct mlx5_ib_mr *implicit_get_child_mr(struct mlx5_ib_mr *imr,
mr->parent = imr;
odp->private = mr;
+ /*
+ * First refcount is owned by the xarray and second refconut
+ * is returned to the caller.
+ */
+ refcount_set(&mr->mmkey.usecount, 2);
+
err = mlx5_ib_update_xlt(mr, 0,
MLX5_IMR_MTT_ENTRIES,
PAGE_SHIFT,
@@ -500,27 +483,28 @@ static struct mlx5_ib_mr *implicit_get_child_mr(struct mlx5_ib_mr *imr,
goto out_mr;
}
- /*
- * Once the store to either xarray completes any error unwind has to
- * use synchronize_srcu(). Avoid this with xa_reserve()
- */
- ret = xa_cmpxchg(&imr->implicit_children, idx, NULL, mr,
- GFP_KERNEL);
+ xa_lock(&imr->implicit_children);
+ ret = __xa_cmpxchg(&imr->implicit_children, idx, NULL, mr,
+ GFP_KERNEL);
if (unlikely(ret)) {
if (xa_is_err(ret)) {
ret = ERR_PTR(xa_err(ret));
- goto out_mr;
+ goto out_lock;
}
/*
* Another thread beat us to creating the child mr, use
* theirs.
*/
- goto out_mr;
+ refcount_inc(&ret->mmkey.usecount);
+ goto out_lock;
}
+ xa_unlock(&imr->implicit_children);
mlx5_ib_dbg(mr_to_mdev(imr), "key %x mr %p\n", mr->mmkey.key, mr);
return mr;
+out_lock:
+ xa_unlock(&imr->implicit_children);
out_mr:
mlx5_mr_cache_free(mr_to_mdev(imr), mr);
out_umem:
@@ -559,8 +543,6 @@ struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd,
imr->ibmr.device = &dev->ib_dev;
imr->umem = &umem_odp->umem;
imr->is_odp_implicit = true;
- atomic_set(&imr->num_deferred_work, 0);
- init_waitqueue_head(&imr->q_deferred_work);
xa_init(&imr->implicit_children);
err = mlx5_ib_update_xlt(imr, 0,
@@ -572,8 +554,7 @@ struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd,
if (err)
goto out_mr;
- err = xa_err(xa_store(&dev->odp_mkeys, mlx5_base_mkey(imr->mmkey.key),
- &imr->mmkey, GFP_KERNEL));
+ err = mlx5r_store_odp_mkey(dev, &imr->mmkey);
if (err)
goto out_mr;
@@ -591,60 +572,35 @@ void mlx5_ib_free_implicit_mr(struct mlx5_ib_mr *imr)
{
struct ib_umem_odp *odp_imr = to_ib_umem_odp(imr->umem);
struct mlx5_ib_dev *dev = mr_to_mdev(imr);
- struct list_head destroy_list;
struct mlx5_ib_mr *mtt;
- struct mlx5_ib_mr *tmp;
unsigned long idx;
- INIT_LIST_HEAD(&destroy_list);
-
xa_erase(&dev->odp_mkeys, mlx5_base_mkey(imr->mmkey.key));
/*
- * This stops the SRCU protected page fault path from touching either
- * the imr or any children. The page fault path can only reach the
- * children xarray via the imr.
- */
- synchronize_srcu(&dev->odp_srcu);
-
- /*
* All work on the prefetch list must be completed, xa_erase() prevented
* new work from being created.
*/
- wait_event(imr->q_deferred_work, !atomic_read(&imr->num_deferred_work));
-
+ mlx5r_deref_wait_odp_mkey(&imr->mmkey);
/*
* At this point it is forbidden for any other thread to enter
* pagefault_mr() on this imr. It is already forbidden to call
* pagefault_mr() on an implicit child. Due to this additions to
* implicit_children are prevented.
+ * In addition, any new call to destroy_unused_implicit_child_mr()
+ * may return immediately.
*/
/*
- * Block destroy_unused_implicit_child_mr() from incrementing
- * num_deferred_work.
- */
- xa_lock(&imr->implicit_children);
- xa_for_each (&imr->implicit_children, idx, mtt) {
- __xa_erase(&imr->implicit_children, idx);
- list_add(&mtt->odp_destroy.elm, &destroy_list);
- }
- xa_unlock(&imr->implicit_children);
-
- /*
- * Wait for any concurrent destroy_unused_implicit_child_mr() to
- * complete.
- */
- wait_event(imr->q_deferred_work, !atomic_read(&imr->num_deferred_work));
-
- /*
* Fence the imr before we destroy the children. This allows us to
* skip updating the XLT of the imr during destroy of the child mkey
* the imr points to.
*/
mlx5_mr_cache_invalidate(imr);
- list_for_each_entry_safe (mtt, tmp, &destroy_list, odp_destroy.elm)
+ xa_for_each(&imr->implicit_children, idx, mtt) {
+ xa_erase(&imr->implicit_children, idx);
free_implicit_child_mr(mtt, false);
+ }
mlx5_mr_cache_free(dev, imr);
ib_umem_odp_release(odp_imr);
@@ -663,13 +619,39 @@ void mlx5_ib_fence_odp_mr(struct mlx5_ib_mr *mr)
xa_erase(&mr_to_mdev(mr)->odp_mkeys, mlx5_base_mkey(mr->mmkey.key));
/* Wait for all running page-fault handlers to finish. */
- synchronize_srcu(&mr_to_mdev(mr)->odp_srcu);
-
- wait_event(mr->q_deferred_work, !atomic_read(&mr->num_deferred_work));
+ mlx5r_deref_wait_odp_mkey(&mr->mmkey);
dma_fence_odp_mr(mr);
}
+/**
+ * mlx5_ib_fence_dmabuf_mr - Stop all access to the dmabuf MR
+ * @mr: to fence
+ *
+ * On return no parallel threads will be touching this MR and no DMA will be
+ * active.
+ */
+void mlx5_ib_fence_dmabuf_mr(struct mlx5_ib_mr *mr)
+{
+ struct ib_umem_dmabuf *umem_dmabuf = to_ib_umem_dmabuf(mr->umem);
+
+ /* Prevent new page faults and prefetch requests from succeeding */
+ xa_erase(&mr_to_mdev(mr)->odp_mkeys, mlx5_base_mkey(mr->mmkey.key));
+
+ mlx5r_deref_wait_odp_mkey(&mr->mmkey);
+
+ dma_resv_lock(umem_dmabuf->attach->dmabuf->resv, NULL);
+ mlx5_mr_cache_invalidate(mr);
+ umem_dmabuf->private = NULL;
+ ib_umem_dmabuf_unmap_pages(umem_dmabuf);
+ dma_resv_unlock(umem_dmabuf->attach->dmabuf->resv);
+
+ if (!mr->cache_ent) {
+ mlx5_core_destroy_mkey(mr_to_mdev(mr)->mdev, &mr->mmkey);
+ WARN_ON(mr->descs);
+ }
+}
+
#define MLX5_PF_FLAGS_DOWNGRADE BIT(1)
#define MLX5_PF_FLAGS_SNAPSHOT BIT(2)
#define MLX5_PF_FLAGS_ENABLE BIT(3)
@@ -747,8 +729,10 @@ static int pagefault_implicit_mr(struct mlx5_ib_mr *imr,
struct mlx5_ib_mr *mtt;
u64 len;
+ xa_lock(&imr->implicit_children);
mtt = xa_load(&imr->implicit_children, idx);
if (unlikely(!mtt)) {
+ xa_unlock(&imr->implicit_children);
mtt = implicit_get_child_mr(imr, idx);
if (IS_ERR(mtt)) {
ret = PTR_ERR(mtt);
@@ -756,6 +740,9 @@ static int pagefault_implicit_mr(struct mlx5_ib_mr *imr,
}
upd_start_idx = min(upd_start_idx, idx);
upd_len = idx - upd_start_idx + 1;
+ } else {
+ refcount_inc(&mtt->mmkey.usecount);
+ xa_unlock(&imr->implicit_children);
}
umem_odp = to_ib_umem_odp(mtt->umem);
@@ -764,6 +751,9 @@ static int pagefault_implicit_mr(struct mlx5_ib_mr *imr,
ret = pagefault_real_mr(mtt, umem_odp, user_va, len,
bytes_mapped, flags);
+
+ mlx5r_deref_odp_mkey(&mtt->mmkey);
+
if (ret < 0)
goto out;
user_va += len;
@@ -803,6 +793,44 @@ out:
return ret;
}
+static int pagefault_dmabuf_mr(struct mlx5_ib_mr *mr, size_t bcnt,
+ u32 *bytes_mapped, u32 flags)
+{
+ struct ib_umem_dmabuf *umem_dmabuf = to_ib_umem_dmabuf(mr->umem);
+ u32 xlt_flags = 0;
+ int err;
+ unsigned int page_size;
+
+ if (flags & MLX5_PF_FLAGS_ENABLE)
+ xlt_flags |= MLX5_IB_UPD_XLT_ENABLE;
+
+ dma_resv_lock(umem_dmabuf->attach->dmabuf->resv, NULL);
+ err = ib_umem_dmabuf_map_pages(umem_dmabuf);
+ if (err) {
+ dma_resv_unlock(umem_dmabuf->attach->dmabuf->resv);
+ return err;
+ }
+
+ page_size = mlx5_umem_find_best_pgsz(&umem_dmabuf->umem, mkc,
+ log_page_size, 0,
+ umem_dmabuf->umem.iova);
+ if (unlikely(page_size < PAGE_SIZE)) {
+ ib_umem_dmabuf_unmap_pages(umem_dmabuf);
+ err = -EINVAL;
+ } else {
+ err = mlx5_ib_update_mr_pas(mr, xlt_flags);
+ }
+ dma_resv_unlock(umem_dmabuf->attach->dmabuf->resv);
+
+ if (err)
+ return err;
+
+ if (bytes_mapped)
+ *bytes_mapped += bcnt;
+
+ return ib_umem_num_pages(mr->umem);
+}
+
/*
* Returns:
* -EFAULT: The io_virt->bcnt is not within the MR, it covers pages that are
@@ -817,10 +845,12 @@ static int pagefault_mr(struct mlx5_ib_mr *mr, u64 io_virt, size_t bcnt,
{
struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem);
- lockdep_assert_held(&mr_to_mdev(mr)->odp_srcu);
if (unlikely(io_virt < mr->mmkey.iova))
return -EFAULT;
+ if (mr->umem->is_dmabuf)
+ return pagefault_dmabuf_mr(mr, bcnt, bytes_mapped, flags);
+
if (!odp->is_implicit_odp) {
u64 user_va;
@@ -847,6 +877,16 @@ int mlx5_ib_init_odp_mr(struct mlx5_ib_mr *mr)
return ret >= 0 ? 0 : ret;
}
+int mlx5_ib_init_dmabuf_mr(struct mlx5_ib_mr *mr)
+{
+ int ret;
+
+ ret = pagefault_dmabuf_mr(mr, mr->umem->length, NULL,
+ MLX5_PF_FLAGS_ENABLE);
+
+ return ret >= 0 ? 0 : ret;
+}
+
struct pf_frame {
struct pf_frame *next;
u32 key;
@@ -896,7 +936,7 @@ static int pagefault_single_data_segment(struct mlx5_ib_dev *dev,
u32 *bytes_committed,
u32 *bytes_mapped)
{
- int npages = 0, srcu_key, ret, i, outlen, cur_outlen = 0, depth = 0;
+ int npages = 0, ret, i, outlen, cur_outlen = 0, depth = 0;
struct pf_frame *head = NULL, *frame;
struct mlx5_core_mkey *mmkey;
struct mlx5_ib_mr *mr;
@@ -905,14 +945,14 @@ static int pagefault_single_data_segment(struct mlx5_ib_dev *dev,
size_t offset;
int ndescs;
- srcu_key = srcu_read_lock(&dev->odp_srcu);
-
io_virt += *bytes_committed;
bcnt -= *bytes_committed;
next_mr:
+ xa_lock(&dev->odp_mkeys);
mmkey = xa_load(&dev->odp_mkeys, mlx5_base_mkey(key));
if (!mmkey) {
+ xa_unlock(&dev->odp_mkeys);
mlx5_ib_dbg(
dev,
"skipping non ODP MR (lkey=0x%06x) in page fault handler.\n",
@@ -925,12 +965,15 @@ next_mr:
* faulted.
*/
ret = 0;
- goto srcu_unlock;
+ goto end;
}
+ refcount_inc(&mmkey->usecount);
+ xa_unlock(&dev->odp_mkeys);
+
if (!mkey_is_eq(mmkey, key)) {
mlx5_ib_dbg(dev, "failed to find mkey %x\n", key);
ret = -EFAULT;
- goto srcu_unlock;
+ goto end;
}
switch (mmkey->type) {
@@ -939,7 +982,7 @@ next_mr:
ret = pagefault_mr(mr, io_virt, bcnt, bytes_mapped, 0);
if (ret < 0)
- goto srcu_unlock;
+ goto end;
mlx5_update_odp_stats(mr, faults, ret);
@@ -954,7 +997,7 @@ next_mr:
if (depth >= MLX5_CAP_GEN(dev->mdev, max_indirection)) {
mlx5_ib_dbg(dev, "indirection level exceeded\n");
ret = -EFAULT;
- goto srcu_unlock;
+ goto end;
}
outlen = MLX5_ST_SZ_BYTES(query_mkey_out) +
@@ -965,7 +1008,7 @@ next_mr:
out = kzalloc(outlen, GFP_KERNEL);
if (!out) {
ret = -ENOMEM;
- goto srcu_unlock;
+ goto end;
}
cur_outlen = outlen;
}
@@ -975,7 +1018,7 @@ next_mr:
ret = mlx5_core_query_mkey(dev->mdev, mmkey, out, outlen);
if (ret)
- goto srcu_unlock;
+ goto end;
offset = io_virt - MLX5_GET64(query_mkey_out, out,
memory_key_mkey_entry.start_addr);
@@ -989,7 +1032,7 @@ next_mr:
frame = kzalloc(sizeof(*frame), GFP_KERNEL);
if (!frame) {
ret = -ENOMEM;
- goto srcu_unlock;
+ goto end;
}
frame->key = be32_to_cpu(pklm->key);
@@ -1008,7 +1051,7 @@ next_mr:
default:
mlx5_ib_dbg(dev, "wrong mkey type %d\n", mmkey->type);
ret = -EFAULT;
- goto srcu_unlock;
+ goto end;
}
if (head) {
@@ -1021,10 +1064,13 @@ next_mr:
depth = frame->depth;
kfree(frame);
+ mlx5r_deref_odp_mkey(mmkey);
goto next_mr;
}
-srcu_unlock:
+end:
+ if (mmkey)
+ mlx5r_deref_odp_mkey(mmkey);
while (head) {
frame = head;
head = frame->next;
@@ -1032,7 +1078,6 @@ srcu_unlock:
}
kfree(out);
- srcu_read_unlock(&dev->odp_srcu, srcu_key);
*bytes_committed = 0;
return ret ? ret : npages;
}
@@ -1040,16 +1085,18 @@ srcu_unlock:
/**
* Parse a series of data segments for page fault handling.
*
- * @pfault contains page fault information.
- * @wqe points at the first data segment in the WQE.
- * @wqe_end points after the end of the WQE.
- * @bytes_mapped receives the number of bytes that the function was able to
- * map. This allows the caller to decide intelligently whether
- * enough memory was mapped to resolve the page fault
- * successfully (e.g. enough for the next MTU, or the entire
- * WQE).
- * @total_wqe_bytes receives the total data size of this WQE in bytes (minus
- * the committed bytes).
+ * @dev: Pointer to mlx5 IB device
+ * @pfault: contains page fault information.
+ * @wqe: points at the first data segment in the WQE.
+ * @wqe_end: points after the end of the WQE.
+ * @bytes_mapped: receives the number of bytes that the function was able to
+ * map. This allows the caller to decide intelligently whether
+ * enough memory was mapped to resolve the page fault
+ * successfully (e.g. enough for the next MTU, or the entire
+ * WQE).
+ * @total_wqe_bytes: receives the total data size of this WQE in bytes (minus
+ * the committed bytes).
+ * @receive_queue: receive WQE end of sg list
*
* Returns the number of pages loaded if positive, zero for an empty WQE, or a
* negative error code.
@@ -1738,8 +1785,8 @@ static void destroy_prefetch_work(struct prefetch_mr_work *work)
u32 i;
for (i = 0; i < work->num_sge; ++i)
- if (atomic_dec_and_test(&work->frags[i].mr->num_deferred_work))
- wake_up(&work->frags[i].mr->q_deferred_work);
+ mlx5r_deref_odp_mkey(&work->frags[i].mr->mmkey);
+
kvfree(work);
}
@@ -1749,27 +1796,30 @@ get_prefetchable_mr(struct ib_pd *pd, enum ib_uverbs_advise_mr_advice advice,
{
struct mlx5_ib_dev *dev = to_mdev(pd->device);
struct mlx5_core_mkey *mmkey;
- struct ib_umem_odp *odp;
- struct mlx5_ib_mr *mr;
-
- lockdep_assert_held(&dev->odp_srcu);
+ struct mlx5_ib_mr *mr = NULL;
+ xa_lock(&dev->odp_mkeys);
mmkey = xa_load(&dev->odp_mkeys, mlx5_base_mkey(lkey));
if (!mmkey || mmkey->key != lkey || mmkey->type != MLX5_MKEY_MR)
- return NULL;
+ goto end;
mr = container_of(mmkey, struct mlx5_ib_mr, mmkey);
- if (mr->ibmr.pd != pd)
- return NULL;
-
- odp = to_ib_umem_odp(mr->umem);
+ if (mr->ibmr.pd != pd) {
+ mr = NULL;
+ goto end;
+ }
/* prefetch with write-access must be supported by the MR */
if (advice == IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_WRITE &&
- !odp->umem.writable)
- return NULL;
+ !mr->umem->writable) {
+ mr = NULL;
+ goto end;
+ }
+ refcount_inc(&mmkey->usecount);
+end:
+ xa_unlock(&dev->odp_mkeys);
return mr;
}
@@ -1777,17 +1827,12 @@ static void mlx5_ib_prefetch_mr_work(struct work_struct *w)
{
struct prefetch_mr_work *work =
container_of(w, struct prefetch_mr_work, work);
- struct mlx5_ib_dev *dev;
u32 bytes_mapped = 0;
- int srcu_key;
int ret;
u32 i;
/* We rely on IB/core that work is executed if we have num_sge != 0 only. */
WARN_ON(!work->num_sge);
- dev = mr_to_mdev(work->frags[0].mr);
- /* SRCU should be held when calling to mlx5_odp_populate_xlt() */
- srcu_key = srcu_read_lock(&dev->odp_srcu);
for (i = 0; i < work->num_sge; ++i) {
ret = pagefault_mr(work->frags[i].mr, work->frags[i].io_virt,
work->frags[i].length, &bytes_mapped,
@@ -1796,7 +1841,6 @@ static void mlx5_ib_prefetch_mr_work(struct work_struct *w)
continue;
mlx5_update_odp_stats(work->frags[i].mr, prefetch, ret);
}
- srcu_read_unlock(&dev->odp_srcu, srcu_key);
destroy_prefetch_work(work);
}
@@ -1820,9 +1864,6 @@ static bool init_prefetch_work(struct ib_pd *pd,
work->num_sge = i;
return false;
}
-
- /* Keep the MR pointer will valid outside the SRCU */
- atomic_inc(&work->frags[i].mr->num_deferred_work);
}
work->num_sge = num_sge;
return true;
@@ -1833,42 +1874,35 @@ static int mlx5_ib_prefetch_sg_list(struct ib_pd *pd,
u32 pf_flags, struct ib_sge *sg_list,
u32 num_sge)
{
- struct mlx5_ib_dev *dev = to_mdev(pd->device);
u32 bytes_mapped = 0;
- int srcu_key;
int ret = 0;
u32 i;
- srcu_key = srcu_read_lock(&dev->odp_srcu);
for (i = 0; i < num_sge; ++i) {
struct mlx5_ib_mr *mr;
mr = get_prefetchable_mr(pd, advice, sg_list[i].lkey);
- if (!mr) {
- ret = -ENOENT;
- goto out;
- }
+ if (!mr)
+ return -ENOENT;
ret = pagefault_mr(mr, sg_list[i].addr, sg_list[i].length,
&bytes_mapped, pf_flags);
- if (ret < 0)
- goto out;
+ if (ret < 0) {
+ mlx5r_deref_odp_mkey(&mr->mmkey);
+ return ret;
+ }
mlx5_update_odp_stats(mr, prefetch, ret);
+ mlx5r_deref_odp_mkey(&mr->mmkey);
}
- ret = 0;
-out:
- srcu_read_unlock(&dev->odp_srcu, srcu_key);
- return ret;
+ return 0;
}
int mlx5_ib_advise_mr_prefetch(struct ib_pd *pd,
enum ib_uverbs_advise_mr_advice advice,
u32 flags, struct ib_sge *sg_list, u32 num_sge)
{
- struct mlx5_ib_dev *dev = to_mdev(pd->device);
u32 pf_flags = 0;
struct prefetch_mr_work *work;
- int srcu_key;
if (advice == IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH)
pf_flags |= MLX5_PF_FLAGS_DOWNGRADE;
@@ -1884,13 +1918,10 @@ int mlx5_ib_advise_mr_prefetch(struct ib_pd *pd,
if (!work)
return -ENOMEM;
- srcu_key = srcu_read_lock(&dev->odp_srcu);
if (!init_prefetch_work(pd, advice, pf_flags, work, sg_list, num_sge)) {
- srcu_read_unlock(&dev->odp_srcu, srcu_key);
destroy_prefetch_work(work);
return -EINVAL;
}
queue_work(system_unbound_wq, &work->work);
- srcu_read_unlock(&dev->odp_srcu, srcu_key);
return 0;
}