From fb985e278a30224183fdf3d56e2f69cfdef88d4e Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 9 Oct 2019 13:09:21 -0300 Subject: RDMA/mlx5: Use SRCU properly in ODP prefetch When working with SRCU protected xarrays the xarray itself should be the SRCU 'update' point. Instead prefetch is using live as the SRCU update point and this prevents switching the locking design to use the xarray instead. To solve this the prefetch must only read from the xarray once, and hold on to the actual MR pointer for the duration of the async operation. Incrementing num_pending_prefetch delays destruction of the MR, so it is suitable. Prefetch calls directly to the pagefault_mr using the MR pointer and only does a single xarray lookup. All the testing if a MR is prefetchable or not is now done only in the prefetch code and removed from the pagefault critical path. Link: https://lore.kernel.org/r/20191009160934.3143-2-jgg@ziepe.ca Reviewed-by: Artemy Kovalyov Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/odp.c | 262 ++++++++++++++++++--------------------- 1 file changed, 121 insertions(+), 141 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c index 3f9478d19376..09dac97e4ca4 100644 --- a/drivers/infiniband/hw/mlx5/odp.c +++ b/drivers/infiniband/hw/mlx5/odp.c @@ -606,16 +606,13 @@ void mlx5_ib_free_implicit_mr(struct mlx5_ib_mr *imr) wait_event(imr->q_leaf_free, !atomic_read(&imr->num_leaf_free)); } -#define MLX5_PF_FLAGS_PREFETCH BIT(0) #define MLX5_PF_FLAGS_DOWNGRADE BIT(1) -static int pagefault_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr, - u64 io_virt, size_t bcnt, u32 *bytes_mapped, - u32 flags) +static int pagefault_mr(struct mlx5_ib_mr *mr, u64 io_virt, size_t bcnt, + u32 *bytes_mapped, u32 flags) { int npages = 0, current_seq, page_shift, ret, np; struct ib_umem_odp *odp_mr = to_ib_umem_odp(mr->umem); bool downgrade = flags & MLX5_PF_FLAGS_DOWNGRADE; - bool prefetch = flags & MLX5_PF_FLAGS_PREFETCH; u64 access_mask; u64 start_idx, page_mask; struct ib_umem_odp *odp; @@ -639,14 +636,6 @@ next_mr: start_idx = (io_virt - (mr->mmkey.iova & page_mask)) >> page_shift; access_mask = ODP_READ_ALLOWED_BIT; - if (prefetch && !downgrade && !odp->umem.writable) { - /* prefetch with write-access must - * be supported by the MR - */ - ret = -EINVAL; - goto out; - } - if (odp->umem.writable && !downgrade) access_mask |= ODP_WRITE_ALLOWED_BIT; @@ -681,7 +670,8 @@ next_mr: if (ret < 0) { if (ret != -EAGAIN) - mlx5_ib_err(dev, "Failed to update mkey page tables\n"); + mlx5_ib_err(mr->dev, + "Failed to update mkey page tables\n"); goto out; } @@ -700,8 +690,10 @@ next_mr: io_virt += size; next = odp_next(odp); if (unlikely(!next || ib_umem_start(next) != io_virt)) { - mlx5_ib_dbg(dev, "next implicit leaf removed at 0x%llx. got %p\n", - io_virt, next); + mlx5_ib_dbg( + mr->dev, + "next implicit leaf removed at 0x%llx. got %p\n", + io_virt, next); return -EAGAIN; } odp = next; @@ -718,7 +710,7 @@ out: if (!wait_for_completion_timeout(&odp->notifier_completion, timeout)) { mlx5_ib_warn( - dev, + mr->dev, "timeout waiting for mmu notifier. seq %d against %d. notifiers_count=%d\n", current_seq, odp->notifiers_seq, odp->notifiers_count); @@ -775,10 +767,9 @@ static int pagefault_single_data_segment(struct mlx5_ib_dev *dev, struct ib_pd *pd, u32 key, u64 io_virt, size_t bcnt, u32 *bytes_committed, - u32 *bytes_mapped, u32 flags) + u32 *bytes_mapped) { int npages = 0, srcu_key, ret, i, outlen, cur_outlen = 0, depth = 0; - bool prefetch = flags & MLX5_PF_FLAGS_PREFETCH; struct pf_frame *head = NULL, *frame; struct mlx5_core_mkey *mmkey; struct mlx5_ib_mr *mr; @@ -800,12 +791,6 @@ next_mr: goto srcu_unlock; } - if (prefetch && mmkey->type != MLX5_MKEY_MR) { - mlx5_ib_dbg(dev, "prefetch is allowed only for MR\n"); - ret = -EINVAL; - goto srcu_unlock; - } - switch (mmkey->type) { case MLX5_MKEY_MR: mr = container_of(mmkey, struct mlx5_ib_mr, mmkey); @@ -815,17 +800,6 @@ next_mr: goto srcu_unlock; } - if (prefetch) { - if (!is_odp_mr(mr) || - mr->ibmr.pd != pd) { - mlx5_ib_dbg(dev, "Invalid prefetch request: %s\n", - is_odp_mr(mr) ? "MR is not ODP" : - "PD is not of the MR"); - ret = -EINVAL; - goto srcu_unlock; - } - } - if (!is_odp_mr(mr)) { mlx5_ib_dbg(dev, "skipping non ODP MR (lkey=0x%06x) in page fault handler.\n", key); @@ -835,7 +809,7 @@ next_mr: goto srcu_unlock; } - ret = pagefault_mr(dev, mr, io_virt, bcnt, bytes_mapped, flags); + ret = pagefault_mr(mr, io_virt, bcnt, bytes_mapped, 0); if (ret < 0) goto srcu_unlock; @@ -1009,7 +983,7 @@ static int pagefault_data_segments(struct mlx5_ib_dev *dev, ret = pagefault_single_data_segment(dev, NULL, key, io_virt, bcnt, &pfault->bytes_committed, - bytes_mapped, 0); + bytes_mapped); if (ret < 0) break; npages += ret; @@ -1292,8 +1266,7 @@ static void mlx5_ib_mr_rdma_pfault_handler(struct mlx5_ib_dev *dev, } ret = pagefault_single_data_segment(dev, NULL, rkey, address, length, - &pfault->bytes_committed, NULL, - 0); + &pfault->bytes_committed, NULL); if (ret == -EAGAIN) { /* We're racing with an invalidation, don't prefetch */ prefetch_activated = 0; @@ -1320,8 +1293,7 @@ static void mlx5_ib_mr_rdma_pfault_handler(struct mlx5_ib_dev *dev, ret = pagefault_single_data_segment(dev, NULL, rkey, address, prefetch_len, - &bytes_committed, NULL, - 0); + &bytes_committed, NULL); if (ret < 0 && ret != -EAGAIN) { mlx5_ib_dbg(dev, "Prefetch failed. ret: %d, QP 0x%x, address: 0x%.16llx, length = 0x%.16x\n", ret, pfault->token, address, prefetch_len); @@ -1624,114 +1596,138 @@ int mlx5_ib_odp_init(void) struct prefetch_mr_work { struct work_struct work; - struct ib_pd *pd; u32 pf_flags; u32 num_sge; - struct ib_sge sg_list[0]; + struct { + u64 io_virt; + struct mlx5_ib_mr *mr; + size_t length; + } frags[]; }; -static void num_pending_prefetch_dec(struct mlx5_ib_dev *dev, - struct ib_sge *sg_list, u32 num_sge, - u32 from) +static void destroy_prefetch_work(struct prefetch_mr_work *work) { u32 i; - int srcu_key; - - srcu_key = srcu_read_lock(&dev->mr_srcu); - for (i = from; i < num_sge; ++i) { - struct mlx5_core_mkey *mmkey; - struct mlx5_ib_mr *mr; - - mmkey = xa_load(&dev->mdev->priv.mkey_table, - mlx5_base_mkey(sg_list[i].lkey)); - mr = container_of(mmkey, struct mlx5_ib_mr, mmkey); - atomic_dec(&mr->num_pending_prefetch); - } - - srcu_read_unlock(&dev->mr_srcu, srcu_key); + for (i = 0; i < work->num_sge; ++i) + atomic_dec(&work->frags[i].mr->num_pending_prefetch); + kvfree(work); } -static bool num_pending_prefetch_inc(struct ib_pd *pd, - struct ib_sge *sg_list, u32 num_sge) +static struct mlx5_ib_mr * +get_prefetchable_mr(struct ib_pd *pd, enum ib_uverbs_advise_mr_advice advice, + u32 lkey) { struct mlx5_ib_dev *dev = to_mdev(pd->device); - bool ret = true; - u32 i; + struct mlx5_core_mkey *mmkey; + struct ib_umem_odp *odp; + struct mlx5_ib_mr *mr; - for (i = 0; i < num_sge; ++i) { - struct mlx5_core_mkey *mmkey; - struct mlx5_ib_mr *mr; + lockdep_assert_held(&dev->mr_srcu); - mmkey = xa_load(&dev->mdev->priv.mkey_table, - mlx5_base_mkey(sg_list[i].lkey)); - if (!mmkey || mmkey->key != sg_list[i].lkey) { - ret = false; - break; - } + mmkey = xa_load(&dev->mdev->priv.mkey_table, mlx5_base_mkey(lkey)); + if (!mmkey || mmkey->key != lkey || mmkey->type != MLX5_MKEY_MR) + return NULL; - if (mmkey->type != MLX5_MKEY_MR) { - ret = false; - break; - } + mr = container_of(mmkey, struct mlx5_ib_mr, mmkey); - mr = container_of(mmkey, struct mlx5_ib_mr, mmkey); + if (!smp_load_acquire(&mr->live)) + return NULL; - if (!smp_load_acquire(&mr->live)) { - ret = false; - break; - } + if (mr->ibmr.pd != pd || !is_odp_mr(mr)) + return NULL; - if (mr->ibmr.pd != pd) { - ret = false; - break; - } + /* + * Implicit child MRs are internal and userspace should not refer to + * them. + */ + if (mr->parent) + return NULL; - atomic_inc(&mr->num_pending_prefetch); - } + odp = to_ib_umem_odp(mr->umem); - if (!ret) - num_pending_prefetch_dec(dev, sg_list, i, 0); + /* prefetch with write-access must be supported by the MR */ + if (advice == IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_WRITE && + !odp->umem.writable) + return NULL; - return ret; + return mr; } -static int mlx5_ib_prefetch_sg_list(struct ib_pd *pd, u32 pf_flags, - struct ib_sge *sg_list, u32 num_sge) +static void mlx5_ib_prefetch_mr_work(struct work_struct *w) { + struct prefetch_mr_work *work = + container_of(w, struct prefetch_mr_work, work); + u32 bytes_mapped = 0; u32 i; - int ret = 0; - struct mlx5_ib_dev *dev = to_mdev(pd->device); + + for (i = 0; i < work->num_sge; ++i) + pagefault_mr(work->frags[i].mr, work->frags[i].io_virt, + work->frags[i].length, &bytes_mapped, + work->pf_flags); + + destroy_prefetch_work(work); +} + +static bool init_prefetch_work(struct ib_pd *pd, + enum ib_uverbs_advise_mr_advice advice, + u32 pf_flags, struct prefetch_mr_work *work, + struct ib_sge *sg_list, u32 num_sge) +{ + u32 i; + + INIT_WORK(&work->work, mlx5_ib_prefetch_mr_work); + work->pf_flags = pf_flags; for (i = 0; i < num_sge; ++i) { - struct ib_sge *sg = &sg_list[i]; - int bytes_committed = 0; + work->frags[i].io_virt = sg_list[i].addr; + work->frags[i].length = sg_list[i].length; + work->frags[i].mr = + get_prefetchable_mr(pd, advice, sg_list[i].lkey); + if (!work->frags[i].mr) { + work->num_sge = i - 1; + if (i) + destroy_prefetch_work(work); + return false; + } - ret = pagefault_single_data_segment(dev, pd, sg->lkey, sg->addr, - sg->length, - &bytes_committed, NULL, - pf_flags); - if (ret < 0) - break; + /* Keep the MR pointer will valid outside the SRCU */ + atomic_inc(&work->frags[i].mr->num_pending_prefetch); } - - return ret < 0 ? ret : 0; + work->num_sge = num_sge; + return true; } -static void mlx5_ib_prefetch_mr_work(struct work_struct *work) +static int mlx5_ib_prefetch_sg_list(struct ib_pd *pd, + enum ib_uverbs_advise_mr_advice advice, + u32 pf_flags, struct ib_sge *sg_list, + u32 num_sge) { - struct prefetch_mr_work *w = - container_of(work, struct prefetch_mr_work, work); + struct mlx5_ib_dev *dev = to_mdev(pd->device); + u32 bytes_mapped = 0; + int srcu_key; + int ret = 0; + u32 i; - if (ib_device_try_get(w->pd->device)) { - mlx5_ib_prefetch_sg_list(w->pd, w->pf_flags, w->sg_list, - w->num_sge); - ib_device_put(w->pd->device); + srcu_key = srcu_read_lock(&dev->mr_srcu); + for (i = 0; i < num_sge; ++i) { + struct mlx5_ib_mr *mr; + + mr = get_prefetchable_mr(pd, advice, sg_list[i].lkey); + if (!mr) { + ret = -ENOENT; + goto out; + } + ret = pagefault_mr(mr, sg_list[i].addr, sg_list[i].length, + &bytes_mapped, pf_flags); + if (ret < 0) + goto out; } + ret = 0; - num_pending_prefetch_dec(to_mdev(w->pd->device), w->sg_list, - w->num_sge, 0); - kvfree(w); +out: + srcu_read_unlock(&dev->mr_srcu, srcu_key); + return ret; } int mlx5_ib_advise_mr_prefetch(struct ib_pd *pd, @@ -1739,43 +1735,27 @@ int mlx5_ib_advise_mr_prefetch(struct ib_pd *pd, u32 flags, struct ib_sge *sg_list, u32 num_sge) { struct mlx5_ib_dev *dev = to_mdev(pd->device); - u32 pf_flags = MLX5_PF_FLAGS_PREFETCH; + u32 pf_flags = 0; struct prefetch_mr_work *work; - bool valid_req; int srcu_key; if (advice == IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH) pf_flags |= MLX5_PF_FLAGS_DOWNGRADE; if (flags & IB_UVERBS_ADVISE_MR_FLAG_FLUSH) - return mlx5_ib_prefetch_sg_list(pd, pf_flags, sg_list, + return mlx5_ib_prefetch_sg_list(pd, advice, pf_flags, sg_list, num_sge); - work = kvzalloc(struct_size(work, sg_list, num_sge), GFP_KERNEL); + work = kvzalloc(struct_size(work, frags, num_sge), GFP_KERNEL); if (!work) return -ENOMEM; - memcpy(work->sg_list, sg_list, num_sge * sizeof(struct ib_sge)); - - /* It is guaranteed that the pd when work is executed is the pd when - * work was queued since pd can't be destroyed while it holds MRs and - * destroying a MR leads to flushing the workquque - */ - work->pd = pd; - work->pf_flags = pf_flags; - work->num_sge = num_sge; - - INIT_WORK(&work->work, mlx5_ib_prefetch_mr_work); - srcu_key = srcu_read_lock(&dev->mr_srcu); - - valid_req = num_pending_prefetch_inc(pd, sg_list, num_sge); - if (valid_req) - queue_work(system_unbound_wq, &work->work); - else - kvfree(work); - + if (!init_prefetch_work(pd, advice, pf_flags, work, sg_list, num_sge)) { + srcu_read_unlock(&dev->mr_srcu, srcu_key); + return -EINVAL; + } + queue_work(system_unbound_wq, &work->work); srcu_read_unlock(&dev->mr_srcu, srcu_key); - - return valid_req ? 0 : -EINVAL; + return 0; } -- cgit v1.2.3-59-g8ed1b From 50211ec9443ff2e16db43f691dfcc0ef435cf45d Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 9 Oct 2019 13:09:22 -0300 Subject: RDMA/mlx5: Split sig_err MR data into its own xarray The locking model for signature is completely different than ODP, do not share the same xarray that relies on SRCU locking to support ODP. Simply store the active mlx5_core_sig_ctx's in an xarray when signature MRs are created and rely on trivial xarray locking to serialize everything. The overhead of storing only a handful of SIG related MRs is going to be much less than an xarray full of every mkey. Link: https://lore.kernel.org/r/20191009160934.3143-3-jgg@ziepe.ca Reviewed-by: Artemy Kovalyov Reviewed-by: Max Gurtovoy Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/cq.c | 33 ++++++++++++++++----------------- drivers/infiniband/hw/mlx5/main.c | 2 ++ drivers/infiniband/hw/mlx5/mlx5_ib.h | 2 ++ drivers/infiniband/hw/mlx5/mr.c | 8 ++++++++ 4 files changed, 28 insertions(+), 17 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c index 45f48cde6b9d..cc938d27f913 100644 --- a/drivers/infiniband/hw/mlx5/cq.c +++ b/drivers/infiniband/hw/mlx5/cq.c @@ -423,9 +423,6 @@ static int mlx5_poll_one(struct mlx5_ib_cq *cq, struct mlx5_cqe64 *cqe64; struct mlx5_core_qp *mqp; struct mlx5_ib_wq *wq; - struct mlx5_sig_err_cqe *sig_err_cqe; - struct mlx5_core_mkey *mmkey; - struct mlx5_ib_mr *mr; uint8_t opcode; uint32_t qpn; u16 wqe_ctr; @@ -519,27 +516,29 @@ repoll: } } break; - case MLX5_CQE_SIG_ERR: - sig_err_cqe = (struct mlx5_sig_err_cqe *)cqe64; + case MLX5_CQE_SIG_ERR: { + struct mlx5_sig_err_cqe *sig_err_cqe = + (struct mlx5_sig_err_cqe *)cqe64; + struct mlx5_core_sig_ctx *sig; - xa_lock(&dev->mdev->priv.mkey_table); - mmkey = xa_load(&dev->mdev->priv.mkey_table, + xa_lock(&dev->sig_mrs); + sig = xa_load(&dev->sig_mrs, mlx5_base_mkey(be32_to_cpu(sig_err_cqe->mkey))); - mr = to_mibmr(mmkey); - get_sig_err_item(sig_err_cqe, &mr->sig->err_item); - mr->sig->sig_err_exists = true; - mr->sig->sigerr_count++; + get_sig_err_item(sig_err_cqe, &sig->err_item); + sig->sig_err_exists = true; + sig->sigerr_count++; mlx5_ib_warn(dev, "CQN: 0x%x Got SIGERR on key: 0x%x err_type %x err_offset %llx expected %x actual %x\n", - cq->mcq.cqn, mr->sig->err_item.key, - mr->sig->err_item.err_type, - mr->sig->err_item.sig_err_offset, - mr->sig->err_item.expected, - mr->sig->err_item.actual); + cq->mcq.cqn, sig->err_item.key, + sig->err_item.err_type, + sig->err_item.sig_err_offset, + sig->err_item.expected, + sig->err_item.actual); - xa_unlock(&dev->mdev->priv.mkey_table); + xa_unlock(&dev->sig_mrs); goto repoll; } + } return 0; } diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 831539419c30..b7eea724beaa 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -6150,6 +6150,7 @@ static void mlx5_ib_stage_init_cleanup(struct mlx5_ib_dev *dev) cleanup_srcu_struct(&dev->mr_srcu); } + WARN_ON(!xa_empty(&dev->sig_mrs)); WARN_ON(!bitmap_empty(dev->dm.memic_alloc_pages, MLX5_MAX_MEMIC_PAGES)); } @@ -6201,6 +6202,7 @@ static int mlx5_ib_stage_init_init(struct mlx5_ib_dev *dev) mutex_init(&dev->cap_mask_mutex); INIT_LIST_HEAD(&dev->qp_list); spin_lock_init(&dev->reset_flow_resource_lock); + xa_init(&dev->sig_mrs); spin_lock_init(&dev->dm.lock); dev->dm.dev = mdev; diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index 1a98ee2e01c4..f4ce58354544 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -999,6 +999,8 @@ struct mlx5_ib_dev { struct mlx5_srq_table srq_table; struct mlx5_async_ctx async_ctx; struct mlx5_devx_event_table devx_event_table; + + struct xarray sig_mrs; }; static inline struct mlx5_ib_cq *to_mibcq(struct mlx5_core_cq *mcq) diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index 630599311586..fd24640606c1 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -1560,6 +1560,7 @@ static void clean_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) mr->sig->psv_wire.psv_idx)) mlx5_ib_warn(dev, "failed to destroy wire psv %d\n", mr->sig->psv_wire.psv_idx); + xa_erase(&dev->sig_mrs, mlx5_base_mkey(mr->mmkey.key)); kfree(mr->sig); mr->sig = NULL; } @@ -1797,8 +1798,15 @@ static int mlx5_alloc_integrity_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr, if (err) goto err_free_mtt_mr; + err = xa_err(xa_store(&dev->sig_mrs, mlx5_base_mkey(mr->mmkey.key), + mr->sig, GFP_KERNEL)); + if (err) + goto err_free_descs; return 0; +err_free_descs: + destroy_mkey(dev, mr); + mlx5_free_priv_descs(mr); err_free_mtt_mr: dereg_mr(to_mdev(mr->mtt_mr->ibmr.device), mr->mtt_mr); mr->mtt_mr = NULL; -- cgit v1.2.3-59-g8ed1b From 806b101b2bfa800a9c779336b750bee39c7fb3b4 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 9 Oct 2019 13:09:23 -0300 Subject: RDMA/mlx5: Use a dedicated mkey xarray for ODP There is a per device xarray storing mkeys that is used to store every mkey in the system. However, this xarray is now only read by ODP for certain ODP designated MRs (ODP, implicit ODP, MW, DEVX_INDIRECT). Create an xarray only for use by ODP, that only contains ODP related MKeys. This xarray is protected by SRCU and all erases are protected by a synchronize. This improves performance: - All MRs in the odp_mkeys xarray are ODP MRs, so some tests for is_odp() can be deleted. The xarray will also consume fewer nodes. - normal MR's are never mixed with ODP MRs in a SRCU data structure so performance sucking synchronize_srcu() on every MR destruction is not needed. - No smp_load_acquire(live) and xa_load() double barrier on read Due to the SRCU locking scheme care must be taken with the placement of the xa_store(). Once it completes the MR is immediately visible to other threads and only through a xa_erase() & synchronize_srcu() cycle could it be destroyed. Link: https://lore.kernel.org/r/20191009160934.3143-4-jgg@ziepe.ca Reviewed-by: Artemy Kovalyov Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/devx.c | 8 ++-- drivers/infiniband/hw/mlx5/main.c | 17 ++++---- drivers/infiniband/hw/mlx5/mlx5_ib.h | 5 ++- drivers/infiniband/hw/mlx5/mr.c | 43 ++++++++++--------- drivers/infiniband/hw/mlx5/odp.c | 83 +++++++++++++++++++----------------- 5 files changed, 83 insertions(+), 73 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/hw/mlx5/devx.c b/drivers/infiniband/hw/mlx5/devx.c index d609f4659afb..6b1fca91d7d3 100644 --- a/drivers/infiniband/hw/mlx5/devx.c +++ b/drivers/infiniband/hw/mlx5/devx.c @@ -1265,8 +1265,8 @@ static int devx_handle_mkey_indirect(struct devx_obj *obj, mkey->pd = MLX5_GET(mkc, mkc, pd); devx_mr->ndescs = MLX5_GET(mkc, mkc, translations_octword_size); - return xa_err(xa_store(&dev->mdev->priv.mkey_table, - mlx5_base_mkey(mkey->key), mkey, GFP_KERNEL)); + return xa_err(xa_store(&dev->odp_mkeys, mlx5_base_mkey(mkey->key), mkey, + GFP_KERNEL)); } static int devx_handle_mkey_create(struct mlx5_ib_dev *dev, @@ -1345,9 +1345,9 @@ static int devx_obj_cleanup(struct ib_uobject *uobject, * the mmkey, we must wait for that to stop before freeing the * mkey, as another allocation could get the same mkey #. */ - xa_erase(&obj->ib_dev->mdev->priv.mkey_table, + xa_erase(&obj->ib_dev->odp_mkeys, mlx5_base_mkey(obj->devx_mr.mmkey.key)); - synchronize_srcu(&dev->mr_srcu); + synchronize_srcu(&dev->odp_srcu); } if (obj->flags & DEVX_OBJ_FLAGS_DCT) diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index b7eea724beaa..4692c37b057c 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -6145,10 +6145,10 @@ static struct ib_counters *mlx5_ib_create_counters(struct ib_device *device, static void mlx5_ib_stage_init_cleanup(struct mlx5_ib_dev *dev) { mlx5_ib_cleanup_multiport_master(dev); - if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) { - srcu_barrier(&dev->mr_srcu); - cleanup_srcu_struct(&dev->mr_srcu); - } + WARN_ON(!xa_empty(&dev->odp_mkeys)); + if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) + srcu_barrier(&dev->odp_srcu); + cleanup_srcu_struct(&dev->odp_srcu); WARN_ON(!xa_empty(&dev->sig_mrs)); WARN_ON(!bitmap_empty(dev->dm.memic_alloc_pages, MLX5_MAX_MEMIC_PAGES)); @@ -6202,16 +6202,15 @@ static int mlx5_ib_stage_init_init(struct mlx5_ib_dev *dev) mutex_init(&dev->cap_mask_mutex); INIT_LIST_HEAD(&dev->qp_list); spin_lock_init(&dev->reset_flow_resource_lock); + xa_init(&dev->odp_mkeys); xa_init(&dev->sig_mrs); spin_lock_init(&dev->dm.lock); dev->dm.dev = mdev; - if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) { - err = init_srcu_struct(&dev->mr_srcu); - if (err) - goto err_mp; - } + err = init_srcu_struct(&dev->odp_srcu); + if (err) + goto err_mp; return 0; diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index f4ce58354544..2cf91db6a36f 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -606,7 +606,6 @@ struct mlx5_ib_mr { struct mlx5_ib_dev *dev; u32 out[MLX5_ST_SZ_DW(create_mkey_out)]; struct mlx5_core_sig_ctx *sig; - unsigned int live; void *descs_alloc; int access_flags; /* Needed for rereg MR */ @@ -975,7 +974,9 @@ struct mlx5_ib_dev { * Sleepable RCU that prevents destruction of MRs while they are still * being used by a page fault handler. */ - struct srcu_struct mr_srcu; + struct srcu_struct odp_srcu; + struct xarray odp_mkeys; + u32 null_mkey; struct mlx5_ib_flow_db *flow_db; /* protect resources needed as part of reset flow */ diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index fd24640606c1..60b12dc53004 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -59,13 +59,9 @@ static bool umr_can_use_indirect_mkey(struct mlx5_ib_dev *dev) static int destroy_mkey(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) { - int err = mlx5_core_destroy_mkey(dev->mdev, &mr->mmkey); + WARN_ON(xa_load(&dev->odp_mkeys, mlx5_base_mkey(mr->mmkey.key))); - if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) - /* Wait until all page fault handlers using the mr complete. */ - synchronize_srcu(&dev->mr_srcu); - - return err; + return mlx5_core_destroy_mkey(dev->mdev, &mr->mmkey); } static int order2idx(struct mlx5_ib_dev *dev, int order) @@ -218,9 +214,6 @@ static void remove_keys(struct mlx5_ib_dev *dev, int c, int num) mlx5_core_destroy_mkey(dev->mdev, &mr->mmkey); } - if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) - synchronize_srcu(&dev->mr_srcu); - list_for_each_entry_safe(mr, tmp_mr, &del_list, list) { list_del(&mr->list); kfree(mr); @@ -555,10 +548,6 @@ static void clean_keys(struct mlx5_ib_dev *dev, int c) mlx5_core_destroy_mkey(dev->mdev, &mr->mmkey); } -#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING - synchronize_srcu(&dev->mr_srcu); -#endif - list_for_each_entry_safe(mr, tmp_mr, &del_list, list) { list_del(&mr->list); kfree(mr); @@ -1338,9 +1327,14 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, if (is_odp_mr(mr)) { to_ib_umem_odp(mr->umem)->private = mr; atomic_set(&mr->num_pending_prefetch, 0); + err = xa_err(xa_store(&dev->odp_mkeys, + mlx5_base_mkey(mr->mmkey.key), &mr->mmkey, + GFP_KERNEL)); + if (err) { + dereg_mr(dev, mr); + return ERR_PTR(err); + } } - if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) - smp_store_release(&mr->live, 1); return &mr->ibmr; error: @@ -1582,10 +1576,10 @@ static void dereg_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) /* Prevent new page faults and * prefetch requests from succeeding */ - WRITE_ONCE(mr->live, 0); + xa_erase(&dev->odp_mkeys, mlx5_base_mkey(mr->mmkey.key)); /* Wait for all running page-fault handlers to finish. */ - synchronize_srcu(&dev->mr_srcu); + synchronize_srcu(&dev->odp_srcu); /* dequeue pending prefetch requests for the mr */ if (atomic_read(&mr->num_pending_prefetch)) @@ -1959,9 +1953,19 @@ struct ib_mw *mlx5_ib_alloc_mw(struct ib_pd *pd, enum ib_mw_type type, } } + if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) { + err = xa_err(xa_store(&dev->odp_mkeys, + mlx5_base_mkey(mw->mmkey.key), &mw->mmkey, + GFP_KERNEL)); + if (err) + goto free_mkey; + } + kfree(in); return &mw->ibmw; +free_mkey: + mlx5_core_destroy_mkey(dev->mdev, &mw->mmkey); free: kfree(mw); kfree(in); @@ -1975,13 +1979,12 @@ int mlx5_ib_dealloc_mw(struct ib_mw *mw) int err; if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) { - xa_erase(&dev->mdev->priv.mkey_table, - mlx5_base_mkey(mmw->mmkey.key)); + xa_erase(&dev->odp_mkeys, mlx5_base_mkey(mmw->mmkey.key)); /* * pagefault_single_data_segment() may be accessing mmw under * SRCU if the user bound an ODP MR to this MW. */ - synchronize_srcu(&dev->mr_srcu); + synchronize_srcu(&dev->odp_srcu); } err = mlx5_core_destroy_mkey(dev->mdev, &mmw->mmkey); diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c index 09dac97e4ca4..1f06433bcfc8 100644 --- a/drivers/infiniband/hw/mlx5/odp.c +++ b/drivers/infiniband/hw/mlx5/odp.c @@ -199,7 +199,7 @@ void mlx5_odp_populate_klm(struct mlx5_klm *pklm, size_t offset, * locking around the children list. */ lockdep_assert_held(&to_ib_umem_odp(mr->umem)->umem_mutex); - lockdep_assert_held(&mr->dev->mr_srcu); + lockdep_assert_held(&mr->dev->odp_srcu); odp = odp_lookup(offset * MLX5_IMR_MTT_SIZE, nentries * MLX5_IMR_MTT_SIZE, mr); @@ -229,16 +229,16 @@ static void mr_leaf_free_action(struct work_struct *work) int srcu_key; mr->parent = NULL; - synchronize_srcu(&mr->dev->mr_srcu); + synchronize_srcu(&mr->dev->odp_srcu); - if (smp_load_acquire(&imr->live)) { - srcu_key = srcu_read_lock(&mr->dev->mr_srcu); + if (xa_load(&mr->dev->odp_mkeys, mlx5_base_mkey(imr->mmkey.key))) { + srcu_key = srcu_read_lock(&mr->dev->odp_srcu); mutex_lock(&odp_imr->umem_mutex); mlx5_ib_update_xlt(imr, idx, 1, 0, MLX5_IB_UPD_XLT_INDIRECT | MLX5_IB_UPD_XLT_ATOMIC); mutex_unlock(&odp_imr->umem_mutex); - srcu_read_unlock(&mr->dev->mr_srcu, srcu_key); + srcu_read_unlock(&mr->dev->odp_srcu, srcu_key); } ib_umem_odp_release(odp); mlx5_mr_cache_free(mr->dev, mr); @@ -318,7 +318,7 @@ void mlx5_ib_invalidate_range(struct ib_umem_odp *umem_odp, unsigned long start, if (unlikely(!umem_odp->npages && mr->parent && !umem_odp->dying)) { - WRITE_ONCE(mr->live, 0); + xa_erase(&mr->dev->odp_mkeys, mlx5_base_mkey(mr->mmkey.key)); umem_odp->dying = 1; atomic_inc(&mr->parent->num_leaf_free); schedule_work(&umem_odp->work); @@ -430,6 +430,11 @@ static struct mlx5_ib_mr *implicit_mr_alloc(struct ib_pd *pd, if (IS_ERR(mr)) return mr; + err = xa_reserve(&dev->odp_mkeys, mlx5_base_mkey(mr->mmkey.key), + GFP_KERNEL); + if (err) + goto out_mr; + mr->ibmr.pd = pd; mr->dev = dev; @@ -455,7 +460,7 @@ static struct mlx5_ib_mr *implicit_mr_alloc(struct ib_pd *pd, } if (err) - goto fail; + goto out_release; mr->ibmr.lkey = mr->mmkey.key; mr->ibmr.rkey = mr->mmkey.key; @@ -465,7 +470,9 @@ static struct mlx5_ib_mr *implicit_mr_alloc(struct ib_pd *pd, return mr; -fail: +out_release: + xa_release(&dev->odp_mkeys, mlx5_base_mkey(mr->mmkey.key)); +out_mr: mlx5_ib_err(dev, "Failed to register MKEY %d\n", err); mlx5_mr_cache_free(dev, mr); @@ -513,7 +520,8 @@ next_mr: mtt->parent = mr; INIT_WORK(&odp->work, mr_leaf_free_action); - smp_store_release(&mtt->live, 1); + xa_store(&dev->odp_mkeys, mlx5_base_mkey(mtt->mmkey.key), + &mtt->mmkey, GFP_ATOMIC); if (!nentries) start_idx = addr >> MLX5_IMR_MTT_SHIFT; @@ -567,7 +575,8 @@ struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd, init_waitqueue_head(&imr->q_leaf_free); atomic_set(&imr->num_leaf_free, 0); atomic_set(&imr->num_pending_prefetch, 0); - smp_store_release(&imr->live, 1); + xa_store(&imr->dev->odp_mkeys, mlx5_base_mkey(imr->mmkey.key), + &imr->mmkey, GFP_ATOMIC); return imr; } @@ -778,13 +787,28 @@ static int pagefault_single_data_segment(struct mlx5_ib_dev *dev, size_t offset; int ndescs; - srcu_key = srcu_read_lock(&dev->mr_srcu); + srcu_key = srcu_read_lock(&dev->odp_srcu); io_virt += *bytes_committed; bcnt -= *bytes_committed; next_mr: - mmkey = xa_load(&dev->mdev->priv.mkey_table, mlx5_base_mkey(key)); + mmkey = xa_load(&dev->odp_mkeys, mlx5_base_mkey(key)); + if (!mmkey) { + mlx5_ib_dbg( + dev, + "skipping non ODP MR (lkey=0x%06x) in page fault handler.\n", + key); + if (bytes_mapped) + *bytes_mapped += bcnt; + /* + * The user could specify a SGL with multiple lkeys and only + * some of them are ODP. Treat the non-ODP ones as fully + * faulted. + */ + ret = 0; + goto srcu_unlock; + } if (!mkey_is_eq(mmkey, key)) { mlx5_ib_dbg(dev, "failed to find mkey %x\n", key); ret = -EFAULT; @@ -794,20 +818,6 @@ next_mr: switch (mmkey->type) { case MLX5_MKEY_MR: mr = container_of(mmkey, struct mlx5_ib_mr, mmkey); - if (!smp_load_acquire(&mr->live) || !mr->ibmr.pd) { - mlx5_ib_dbg(dev, "got dead MR\n"); - ret = -EFAULT; - goto srcu_unlock; - } - - if (!is_odp_mr(mr)) { - mlx5_ib_dbg(dev, "skipping non ODP MR (lkey=0x%06x) in page fault handler.\n", - key); - if (bytes_mapped) - *bytes_mapped += bcnt; - ret = 0; - goto srcu_unlock; - } ret = pagefault_mr(mr, io_virt, bcnt, bytes_mapped, 0); if (ret < 0) @@ -902,7 +912,7 @@ srcu_unlock: } kfree(out); - srcu_read_unlock(&dev->mr_srcu, srcu_key); + srcu_read_unlock(&dev->odp_srcu, srcu_key); *bytes_committed = 0; return ret ? ret : npages; } @@ -1623,18 +1633,15 @@ get_prefetchable_mr(struct ib_pd *pd, enum ib_uverbs_advise_mr_advice advice, struct ib_umem_odp *odp; struct mlx5_ib_mr *mr; - lockdep_assert_held(&dev->mr_srcu); + lockdep_assert_held(&dev->odp_srcu); - mmkey = xa_load(&dev->mdev->priv.mkey_table, mlx5_base_mkey(lkey)); + mmkey = xa_load(&dev->odp_mkeys, mlx5_base_mkey(lkey)); if (!mmkey || mmkey->key != lkey || mmkey->type != MLX5_MKEY_MR) return NULL; mr = container_of(mmkey, struct mlx5_ib_mr, mmkey); - if (!smp_load_acquire(&mr->live)) - return NULL; - - if (mr->ibmr.pd != pd || !is_odp_mr(mr)) + if (mr->ibmr.pd != pd) return NULL; /* @@ -1709,7 +1716,7 @@ static int mlx5_ib_prefetch_sg_list(struct ib_pd *pd, int ret = 0; u32 i; - srcu_key = srcu_read_lock(&dev->mr_srcu); + srcu_key = srcu_read_lock(&dev->odp_srcu); for (i = 0; i < num_sge; ++i) { struct mlx5_ib_mr *mr; @@ -1726,7 +1733,7 @@ static int mlx5_ib_prefetch_sg_list(struct ib_pd *pd, ret = 0; out: - srcu_read_unlock(&dev->mr_srcu, srcu_key); + srcu_read_unlock(&dev->odp_srcu, srcu_key); return ret; } @@ -1750,12 +1757,12 @@ int mlx5_ib_advise_mr_prefetch(struct ib_pd *pd, if (!work) return -ENOMEM; - srcu_key = srcu_read_lock(&dev->mr_srcu); + srcu_key = srcu_read_lock(&dev->odp_srcu); if (!init_prefetch_work(pd, advice, pf_flags, work, sg_list, num_sge)) { - srcu_read_unlock(&dev->mr_srcu, srcu_key); + srcu_read_unlock(&dev->odp_srcu, srcu_key); return -EINVAL; } queue_work(system_unbound_wq, &work->work); - srcu_read_unlock(&dev->mr_srcu, srcu_key); + srcu_read_unlock(&dev->odp_srcu, srcu_key); return 0; } -- cgit v1.2.3-59-g8ed1b From 74bddb3682f60df16ba24be335c94de348ba1b07 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 9 Oct 2019 13:09:24 -0300 Subject: RDMA/mlx5: Delete struct mlx5_priv->mkey_table No users are left, delete it. Link: https://lore.kernel.org/r/20191009160934.3143-5-jgg@ziepe.ca Reviewed-by: Artemy Kovalyov Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/mr.c | 9 --------- drivers/net/ethernet/mellanox/mlx5/core/main.c | 4 ---- drivers/net/ethernet/mellanox/mlx5/core/mr.c | 28 +------------------------- include/linux/mlx5/driver.h | 4 ---- 4 files changed, 1 insertion(+), 44 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index 60b12dc53004..fd94838a8845 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -90,8 +90,6 @@ static void reg_mr_callback(int status, struct mlx5_async_work *context) struct mlx5_cache_ent *ent = &cache->ent[c]; u8 key; unsigned long flags; - struct xarray *mkeys = &dev->mdev->priv.mkey_table; - int err; spin_lock_irqsave(&ent->lock, flags); ent->pending--; @@ -118,13 +116,6 @@ static void reg_mr_callback(int status, struct mlx5_async_work *context) ent->size++; spin_unlock_irqrestore(&ent->lock, flags); - xa_lock_irqsave(mkeys, flags); - err = xa_err(__xa_store(mkeys, mlx5_base_mkey(mr->mmkey.key), - &mr->mmkey, GFP_ATOMIC)); - xa_unlock_irqrestore(mkeys, flags); - if (err) - pr_err("Error inserting to mkey tree. 0x%x\n", -err); - if (!completion_done(&ent->compl)) complete(&ent->compl); } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index e47dd7c1b909..d10051f39b9c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -837,8 +837,6 @@ static int mlx5_init_once(struct mlx5_core_dev *dev) mlx5_init_qp_table(dev); - mlx5_init_mkey_table(dev); - mlx5_init_reserved_gids(dev); mlx5_init_clock(dev); @@ -896,7 +894,6 @@ err_rl_cleanup: err_tables_cleanup: mlx5_geneve_destroy(dev->geneve); mlx5_vxlan_destroy(dev->vxlan); - mlx5_cleanup_mkey_table(dev); mlx5_cleanup_qp_table(dev); mlx5_cq_debugfs_cleanup(dev); mlx5_events_cleanup(dev); @@ -924,7 +921,6 @@ static void mlx5_cleanup_once(struct mlx5_core_dev *dev) mlx5_vxlan_destroy(dev->vxlan); mlx5_cleanup_clock(dev); mlx5_cleanup_reserved_gids(dev); - mlx5_cleanup_mkey_table(dev); mlx5_cleanup_qp_table(dev); mlx5_cq_debugfs_cleanup(dev); mlx5_events_cleanup(dev); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mr.c b/drivers/net/ethernet/mellanox/mlx5/core/mr.c index c501bf2a0252..42cc3c7ac5b6 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/mr.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/mr.c @@ -36,16 +36,6 @@ #include #include "mlx5_core.h" -void mlx5_init_mkey_table(struct mlx5_core_dev *dev) -{ - xa_init_flags(&dev->priv.mkey_table, XA_FLAGS_LOCK_IRQ); -} - -void mlx5_cleanup_mkey_table(struct mlx5_core_dev *dev) -{ - WARN_ON(!xa_empty(&dev->priv.mkey_table)); -} - int mlx5_core_create_mkey_cb(struct mlx5_core_dev *dev, struct mlx5_core_mkey *mkey, struct mlx5_async_ctx *async_ctx, u32 *in, @@ -54,7 +44,6 @@ int mlx5_core_create_mkey_cb(struct mlx5_core_dev *dev, struct mlx5_async_work *context) { u32 lout[MLX5_ST_SZ_DW(create_mkey_out)] = {0}; - struct xarray *mkeys = &dev->priv.mkey_table; u32 mkey_index; void *mkc; int err; @@ -84,16 +73,7 @@ int mlx5_core_create_mkey_cb(struct mlx5_core_dev *dev, mlx5_core_dbg(dev, "out 0x%x, key 0x%x, mkey 0x%x\n", mkey_index, key, mkey->key); - - err = xa_err(xa_store_irq(mkeys, mlx5_base_mkey(mkey->key), mkey, - GFP_KERNEL)); - if (err) { - mlx5_core_warn(dev, "failed xarray insert of mkey 0x%x, %d\n", - mlx5_base_mkey(mkey->key), err); - mlx5_core_destroy_mkey(dev, mkey); - } - - return err; + return 0; } EXPORT_SYMBOL(mlx5_core_create_mkey_cb); @@ -111,12 +91,6 @@ int mlx5_core_destroy_mkey(struct mlx5_core_dev *dev, { u32 out[MLX5_ST_SZ_DW(destroy_mkey_out)] = {0}; u32 in[MLX5_ST_SZ_DW(destroy_mkey_in)] = {0}; - struct xarray *mkeys = &dev->priv.mkey_table; - unsigned long flags; - - xa_lock_irqsave(mkeys, flags); - __xa_erase(mkeys, mlx5_base_mkey(mkey->key)); - xa_unlock_irqrestore(mkeys, flags); MLX5_SET(destroy_mkey_in, in, opcode, MLX5_CMD_OP_DESTROY_MKEY); MLX5_SET(destroy_mkey_in, in, mkey_index, mlx5_mkey_to_idx(mkey->key)); diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 3e80f03a387f..8288b62b8f37 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -556,8 +556,6 @@ struct mlx5_priv { struct dentry *cmdif_debugfs; /* end: qp staff */ - struct xarray mkey_table; - /* start: alloc staff */ /* protect buffer alocation according to numa node */ struct mutex alloc_mutex; @@ -942,8 +940,6 @@ struct mlx5_cmd_mailbox *mlx5_alloc_cmd_mailbox_chain(struct mlx5_core_dev *dev, gfp_t flags, int npages); void mlx5_free_cmd_mailbox_chain(struct mlx5_core_dev *dev, struct mlx5_cmd_mailbox *head); -void mlx5_init_mkey_table(struct mlx5_core_dev *dev); -void mlx5_cleanup_mkey_table(struct mlx5_core_dev *dev); int mlx5_core_create_mkey_cb(struct mlx5_core_dev *dev, struct mlx5_core_mkey *mkey, struct mlx5_async_ctx *async_ctx, u32 *in, -- cgit v1.2.3-59-g8ed1b From 3d5f3c54e7bc82a279c80c18087462c0ce00ba44 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 9 Oct 2019 13:09:25 -0300 Subject: RDMA/mlx5: Rework implicit_mr_get_data This function is intended to loop across each MTT chunk in the implicit parent that intersects the range [io_virt, io_virt+bnct). But it is has a confusing construction, so: - Consistently use imr and odp_imr to refer to the implicit parent to avoid confusion with the normal mr and odp of the child - Directly compute the inclusive start/end indexes by shifting. This is clearer to understand the intent and avoids any errors from unaligned values of addr - Iterate directly over the range of MTT indexes, do not make a loop out of goto - Follow 'success oriented flow', with goto error unwind - Directly calculate the range of idx's that need update_xlt - Ensure that any leaf MR added to the interval tree always results in an update to the XLT Link: https://lore.kernel.org/r/20191009160934.3143-6-jgg@ziepe.ca Reviewed-by: Artemy Kovalyov Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/odp.c | 123 ++++++++++++++++++++++----------------- 1 file changed, 69 insertions(+), 54 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c index 1f06433bcfc8..78e31dc694d9 100644 --- a/drivers/infiniband/hw/mlx5/odp.c +++ b/drivers/infiniband/hw/mlx5/odp.c @@ -479,78 +479,93 @@ out_mr: return ERR_PTR(err); } -static struct ib_umem_odp *implicit_mr_get_data(struct mlx5_ib_mr *mr, - u64 io_virt, size_t bcnt) +static struct mlx5_ib_mr *implicit_get_child_mr(struct mlx5_ib_mr *imr, + unsigned long idx) { - struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.pd->device); - struct ib_umem_odp *odp, *result = NULL; - struct ib_umem_odp *odp_mr = to_ib_umem_odp(mr->umem); - u64 addr = io_virt & MLX5_IMR_MTT_MASK; - int nentries = 0, start_idx = 0, ret; + struct ib_umem_odp *odp; struct mlx5_ib_mr *mtt; - mutex_lock(&odp_mr->umem_mutex); - odp = odp_lookup(addr, 1, mr); + odp = ib_umem_odp_alloc_child(to_ib_umem_odp(imr->umem), + idx * MLX5_IMR_MTT_SIZE, + MLX5_IMR_MTT_SIZE); + if (IS_ERR(odp)) + return ERR_CAST(odp); - mlx5_ib_dbg(dev, "io_virt:%llx bcnt:%zx addr:%llx odp:%p\n", - io_virt, bcnt, addr, odp); + mtt = implicit_mr_alloc(imr->ibmr.pd, odp, 0, imr->access_flags); + if (IS_ERR(mtt)) { + ib_umem_odp_release(odp); + return mtt; + } -next_mr: - if (likely(odp)) { - if (nentries) - nentries++; - } else { - odp = ib_umem_odp_alloc_child(odp_mr, addr, MLX5_IMR_MTT_SIZE); - if (IS_ERR(odp)) { - mutex_unlock(&odp_mr->umem_mutex); - return ERR_CAST(odp); - } + odp->private = mtt; + mtt->umem = &odp->umem; + mtt->mmkey.iova = idx * MLX5_IMR_MTT_SIZE; + mtt->parent = imr; + INIT_WORK(&odp->work, mr_leaf_free_action); - mtt = implicit_mr_alloc(mr->ibmr.pd, odp, 0, - mr->access_flags); - if (IS_ERR(mtt)) { - mutex_unlock(&odp_mr->umem_mutex); - ib_umem_odp_release(odp); - return ERR_CAST(mtt); - } + xa_store(&mtt->dev->odp_mkeys, mlx5_base_mkey(mtt->mmkey.key), + &mtt->mmkey, GFP_ATOMIC); + return mtt; +} - odp->private = mtt; - mtt->umem = &odp->umem; - mtt->mmkey.iova = addr; - mtt->parent = mr; - INIT_WORK(&odp->work, mr_leaf_free_action); +static struct ib_umem_odp *implicit_mr_get_data(struct mlx5_ib_mr *imr, + u64 io_virt, size_t bcnt) +{ + struct ib_umem_odp *odp_imr = to_ib_umem_odp(imr->umem); + unsigned long end_idx = (io_virt + bcnt - 1) >> MLX5_IMR_MTT_SHIFT; + unsigned long idx = io_virt >> MLX5_IMR_MTT_SHIFT; + unsigned long inv_start_idx = end_idx + 1; + unsigned long inv_len = 0; + struct ib_umem_odp *result = NULL; + struct ib_umem_odp *odp; + int ret; - xa_store(&dev->odp_mkeys, mlx5_base_mkey(mtt->mmkey.key), - &mtt->mmkey, GFP_ATOMIC); + mutex_lock(&odp_imr->umem_mutex); + odp = odp_lookup(idx * MLX5_IMR_MTT_SIZE, 1, imr); + for (idx = idx; idx <= end_idx; idx++) { + if (unlikely(!odp)) { + struct mlx5_ib_mr *mtt; - if (!nentries) - start_idx = addr >> MLX5_IMR_MTT_SHIFT; - nentries++; - } + mtt = implicit_get_child_mr(imr, idx); + if (IS_ERR(mtt)) { + result = ERR_CAST(mtt); + goto out; + } + odp = to_ib_umem_odp(mtt->umem); + inv_start_idx = min(inv_start_idx, idx); + inv_len = idx - inv_start_idx + 1; + } - /* Return first odp if region not covered by single one */ - if (likely(!result)) - result = odp; + /* Return first odp if region not covered by single one */ + if (likely(!result)) + result = odp; - addr += MLX5_IMR_MTT_SIZE; - if (unlikely(addr < io_virt + bcnt)) { odp = odp_next(odp); - if (odp && ib_umem_start(odp) != addr) + if (odp && ib_umem_start(odp) != idx * MLX5_IMR_MTT_SIZE) odp = NULL; - goto next_mr; } - if (unlikely(nentries)) { - ret = mlx5_ib_update_xlt(mr, start_idx, nentries, 0, - MLX5_IB_UPD_XLT_INDIRECT | + /* + * Any time the children in the interval tree are changed we must + * perform an update of the xlt before exiting to ensure the HW and + * the tree remains synchronized. + */ +out: + if (likely(!inv_len)) + goto out_unlock; + + ret = mlx5_ib_update_xlt(imr, inv_start_idx, inv_len, 0, + MLX5_IB_UPD_XLT_INDIRECT | MLX5_IB_UPD_XLT_ATOMIC); - if (ret) { - mlx5_ib_err(dev, "Failed to update PAS\n"); - result = ERR_PTR(ret); - } + if (ret) { + mlx5_ib_err(to_mdev(imr->ibmr.pd->device), + "Failed to update PAS\n"); + result = ERR_PTR(ret); + goto out_unlock; } - mutex_unlock(&odp_mr->umem_mutex); +out_unlock: + mutex_unlock(&odp_imr->umem_mutex); return result; } -- cgit v1.2.3-59-g8ed1b From c2edcd69351f681594a30b17b7fbc5259a038fb0 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 9 Oct 2019 13:09:26 -0300 Subject: RDMA/mlx5: Lift implicit_mr_alloc() into the two routines that call it This makes the routines easier to understand, particularly with respect the locking requirements of the entire sequence. The implicit_mr_alloc() had a lot of ifs specializing it to each of the callers, and only a very small amount of code was actually shared. Following patches will cause the flow in the two functions to diverge further. Link: https://lore.kernel.org/r/20191009160934.3143-7-jgg@ziepe.ca Reviewed-by: Artemy Kovalyov Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/odp.c | 151 +++++++++++++++++++-------------------- 1 file changed, 74 insertions(+), 77 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c index 78e31dc694d9..dfaa39fc4d3f 100644 --- a/drivers/infiniband/hw/mlx5/odp.c +++ b/drivers/infiniband/hw/mlx5/odp.c @@ -416,96 +416,66 @@ static void mlx5_ib_page_fault_resume(struct mlx5_ib_dev *dev, wq_num, err); } -static struct mlx5_ib_mr *implicit_mr_alloc(struct ib_pd *pd, - struct ib_umem_odp *umem_odp, - bool ksm, int access_flags) +static struct mlx5_ib_mr *implicit_get_child_mr(struct mlx5_ib_mr *imr, + unsigned long idx) { - struct mlx5_ib_dev *dev = to_mdev(pd->device); + struct ib_umem_odp *odp; struct mlx5_ib_mr *mr; + struct mlx5_ib_mr *ret; int err; - mr = mlx5_mr_cache_alloc(dev, ksm ? MLX5_IMR_KSM_CACHE_ENTRY : - MLX5_IMR_MTT_CACHE_ENTRY); + odp = ib_umem_odp_alloc_child(to_ib_umem_odp(imr->umem), + idx * MLX5_IMR_MTT_SIZE, + MLX5_IMR_MTT_SIZE); + if (IS_ERR(odp)) + return ERR_CAST(odp); + ret = mr = mlx5_mr_cache_alloc(imr->dev, MLX5_IMR_MTT_CACHE_ENTRY); if (IS_ERR(mr)) - return mr; + goto out_umem; - err = xa_reserve(&dev->odp_mkeys, mlx5_base_mkey(mr->mmkey.key), + err = xa_reserve(&imr->dev->odp_mkeys, mlx5_base_mkey(mr->mmkey.key), GFP_KERNEL); - if (err) + if (err) { + ret = ERR_PTR(err); goto out_mr; - - mr->ibmr.pd = pd; - - mr->dev = dev; - mr->access_flags = access_flags; - mr->mmkey.iova = 0; - mr->umem = &umem_odp->umem; - - if (ksm) { - err = mlx5_ib_update_xlt(mr, 0, - mlx5_imr_ksm_entries, - MLX5_KSM_PAGE_SHIFT, - MLX5_IB_UPD_XLT_INDIRECT | - MLX5_IB_UPD_XLT_ZAP | - MLX5_IB_UPD_XLT_ENABLE); - - } else { - err = mlx5_ib_update_xlt(mr, 0, - MLX5_IMR_MTT_ENTRIES, - PAGE_SHIFT, - MLX5_IB_UPD_XLT_ZAP | - MLX5_IB_UPD_XLT_ENABLE | - MLX5_IB_UPD_XLT_ATOMIC); } - if (err) - goto out_release; - + mr->ibmr.pd = imr->ibmr.pd; + mr->access_flags = imr->access_flags; + mr->umem = &odp->umem; mr->ibmr.lkey = mr->mmkey.key; mr->ibmr.rkey = mr->mmkey.key; + mr->mmkey.iova = 0; + mr->parent = imr; + odp->private = mr; + INIT_WORK(&odp->work, mr_leaf_free_action); + + err = mlx5_ib_update_xlt(mr, 0, + MLX5_IMR_MTT_ENTRIES, + PAGE_SHIFT, + MLX5_IB_UPD_XLT_ZAP | + MLX5_IB_UPD_XLT_ENABLE | + MLX5_IB_UPD_XLT_ATOMIC); + if (err) { + ret = ERR_PTR(err); + goto out_release; + } - mlx5_ib_dbg(dev, "key %x dev %p mr %p\n", - mr->mmkey.key, dev->mdev, mr); + mr->mmkey.iova = idx * MLX5_IMR_MTT_SIZE; + xa_store(&imr->dev->odp_mkeys, mlx5_base_mkey(mr->mmkey.key), + &mr->mmkey, GFP_ATOMIC); + mlx5_ib_dbg(imr->dev, "key %x mr %p\n", mr->mmkey.key, mr); return mr; out_release: - xa_release(&dev->odp_mkeys, mlx5_base_mkey(mr->mmkey.key)); + xa_release(&imr->dev->odp_mkeys, mlx5_base_mkey(mr->mmkey.key)); out_mr: - mlx5_ib_err(dev, "Failed to register MKEY %d\n", err); - mlx5_mr_cache_free(dev, mr); - - return ERR_PTR(err); -} - -static struct mlx5_ib_mr *implicit_get_child_mr(struct mlx5_ib_mr *imr, - unsigned long idx) -{ - struct ib_umem_odp *odp; - struct mlx5_ib_mr *mtt; - - odp = ib_umem_odp_alloc_child(to_ib_umem_odp(imr->umem), - idx * MLX5_IMR_MTT_SIZE, - MLX5_IMR_MTT_SIZE); - if (IS_ERR(odp)) - return ERR_CAST(odp); - - mtt = implicit_mr_alloc(imr->ibmr.pd, odp, 0, imr->access_flags); - if (IS_ERR(mtt)) { - ib_umem_odp_release(odp); - return mtt; - } - - odp->private = mtt; - mtt->umem = &odp->umem; - mtt->mmkey.iova = idx * MLX5_IMR_MTT_SIZE; - mtt->parent = imr; - INIT_WORK(&odp->work, mr_leaf_free_action); - - xa_store(&mtt->dev->odp_mkeys, mlx5_base_mkey(mtt->mmkey.key), - &mtt->mmkey, GFP_ATOMIC); - return mtt; + mlx5_mr_cache_free(imr->dev, mr); +out_umem: + ib_umem_odp_release(odp); + return ret; } static struct ib_umem_odp *implicit_mr_get_data(struct mlx5_ib_mr *imr, @@ -573,27 +543,54 @@ struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd, struct ib_udata *udata, int access_flags) { - struct mlx5_ib_mr *imr; + struct mlx5_ib_dev *dev = to_mdev(pd->ibpd.device); struct ib_umem_odp *umem_odp; + struct mlx5_ib_mr *imr; + int err; umem_odp = ib_umem_odp_alloc_implicit(udata, access_flags); if (IS_ERR(umem_odp)) return ERR_CAST(umem_odp); - imr = implicit_mr_alloc(&pd->ibpd, umem_odp, 1, access_flags); + imr = mlx5_mr_cache_alloc(dev, MLX5_IMR_KSM_CACHE_ENTRY); if (IS_ERR(imr)) { - ib_umem_odp_release(umem_odp); - return ERR_CAST(imr); + err = PTR_ERR(imr); + goto out_umem; } + imr->ibmr.pd = &pd->ibpd; + imr->access_flags = access_flags; + imr->mmkey.iova = 0; + imr->umem = &umem_odp->umem; + imr->ibmr.lkey = imr->mmkey.key; + imr->ibmr.rkey = imr->mmkey.key; imr->umem = &umem_odp->umem; init_waitqueue_head(&imr->q_leaf_free); atomic_set(&imr->num_leaf_free, 0); atomic_set(&imr->num_pending_prefetch, 0); - xa_store(&imr->dev->odp_mkeys, mlx5_base_mkey(imr->mmkey.key), - &imr->mmkey, GFP_ATOMIC); + err = mlx5_ib_update_xlt(imr, 0, + mlx5_imr_ksm_entries, + MLX5_KSM_PAGE_SHIFT, + MLX5_IB_UPD_XLT_INDIRECT | + MLX5_IB_UPD_XLT_ZAP | + MLX5_IB_UPD_XLT_ENABLE); + if (err) + goto out_mr; + + err = xa_err(xa_store(&dev->odp_mkeys, mlx5_base_mkey(imr->mmkey.key), + &imr->mmkey, GFP_KERNEL)); + if (err) + goto out_mr; + + mlx5_ib_dbg(dev, "key %x mr %p\n", imr->mmkey.key, imr); return imr; +out_mr: + mlx5_ib_err(dev, "Failed to register MKEY %d\n", err); + mlx5_mr_cache_free(dev, imr); +out_umem: + ib_umem_odp_release(umem_odp); + return ERR_PTR(err); } void mlx5_ib_free_implicit_mr(struct mlx5_ib_mr *imr) -- cgit v1.2.3-59-g8ed1b From 9162420dde49c9a8f4819f28bf2d5c675fb12552 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 9 Oct 2019 13:09:27 -0300 Subject: RDMA/mlx5: Set the HW IOVA of the child MRs to their place in the tree Instead of rewriting all the IOVA's to 0 as things progress down the tree make the IOVA of the children equal to placement in the tree. This makes things easier to understand by keeping mmkey.iova == HW configuration. Link: https://lore.kernel.org/r/20191009160934.3143-8-jgg@ziepe.ca Reviewed-by: Artemy Kovalyov Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/odp.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c index dfaa39fc4d3f..b25cf7836544 100644 --- a/drivers/infiniband/hw/mlx5/odp.c +++ b/drivers/infiniband/hw/mlx5/odp.c @@ -211,9 +211,11 @@ void mlx5_odp_populate_klm(struct mlx5_klm *pklm, size_t offset, struct mlx5_ib_mr *mtt = odp->private; pklm->key = cpu_to_be32(mtt->ibmr.lkey); + pklm->va = cpu_to_be64(va); odp = odp_next(odp); } else { pklm->key = cpu_to_be32(dev->null_mkey); + pklm->va = 0; } mlx5_ib_dbg(dev, "[%d] va %lx key %x\n", i, va, be32_to_cpu(pklm->key)); @@ -446,7 +448,7 @@ static struct mlx5_ib_mr *implicit_get_child_mr(struct mlx5_ib_mr *imr, mr->umem = &odp->umem; mr->ibmr.lkey = mr->mmkey.key; mr->ibmr.rkey = mr->mmkey.key; - mr->mmkey.iova = 0; + mr->mmkey.iova = idx * MLX5_IMR_MTT_SIZE; mr->parent = imr; odp->private = mr; INIT_WORK(&odp->work, mr_leaf_free_action); @@ -462,7 +464,6 @@ static struct mlx5_ib_mr *implicit_get_child_mr(struct mlx5_ib_mr *imr, goto out_release; } - mr->mmkey.iova = idx * MLX5_IMR_MTT_SIZE; xa_store(&imr->dev->odp_mkeys, mlx5_base_mkey(mr->mmkey.key), &mr->mmkey, GFP_ATOMIC); -- cgit v1.2.3-59-g8ed1b From 54375e7382952daded7002d1618eadaae859cecb Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 9 Oct 2019 13:09:28 -0300 Subject: RDMA/mlx5: Split implicit handling from pagefault_mr The single routine has a very confusing scheme to advance to the next child MR when working on an implicit parent. This scheme can only be used when working with an implicit parent and must not be triggered when working on a normal MR. Re-arrange things by directly putting all the single-MR stuff into one function and calling it in a loop for the implicit case. Simplify some of the error handling in the new pagefault_real_mr() to remove unneeded gotos. Link: https://lore.kernel.org/r/20191009160934.3143-9-jgg@ziepe.ca Reviewed-by: Artemy Kovalyov Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/odp.c | 125 ++++++++++++++++++++++++--------------- 1 file changed, 76 insertions(+), 49 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c index b25cf7836544..8f43af6580ce 100644 --- a/drivers/infiniband/hw/mlx5/odp.c +++ b/drivers/infiniband/hw/mlx5/odp.c @@ -629,33 +629,18 @@ void mlx5_ib_free_implicit_mr(struct mlx5_ib_mr *imr) } #define MLX5_PF_FLAGS_DOWNGRADE BIT(1) -static int pagefault_mr(struct mlx5_ib_mr *mr, u64 io_virt, size_t bcnt, - u32 *bytes_mapped, u32 flags) +static int pagefault_real_mr(struct mlx5_ib_mr *mr, struct ib_umem_odp *odp, + u64 user_va, size_t bcnt, u32 *bytes_mapped, + u32 flags) { - int npages = 0, current_seq, page_shift, ret, np; - struct ib_umem_odp *odp_mr = to_ib_umem_odp(mr->umem); + int current_seq, page_shift, ret, np; bool downgrade = flags & MLX5_PF_FLAGS_DOWNGRADE; u64 access_mask; u64 start_idx, page_mask; - struct ib_umem_odp *odp; - size_t size; - - if (odp_mr->is_implicit_odp) { - odp = implicit_mr_get_data(mr, io_virt, bcnt); - - if (IS_ERR(odp)) - return PTR_ERR(odp); - mr = odp->private; - } else { - odp = odp_mr; - } - -next_mr: - size = min_t(size_t, bcnt, ib_umem_end(odp) - io_virt); page_shift = odp->page_shift; page_mask = ~(BIT(page_shift) - 1); - start_idx = (io_virt - (mr->mmkey.iova & page_mask)) >> page_shift; + start_idx = (user_va - (mr->mmkey.iova & page_mask)) >> page_shift; access_mask = ODP_READ_ALLOWED_BIT; if (odp->umem.writable && !downgrade) @@ -668,13 +653,10 @@ next_mr: */ smp_rmb(); - ret = ib_umem_odp_map_dma_pages(odp, io_virt, size, access_mask, - current_seq); - - if (ret < 0) - goto out; - - np = ret; + np = ib_umem_odp_map_dma_pages(odp, user_va, bcnt, access_mask, + current_seq); + if (np < 0) + return np; mutex_lock(&odp->umem_mutex); if (!ib_umem_mmu_notifier_retry(odp, current_seq)) { @@ -699,31 +681,12 @@ next_mr: if (bytes_mapped) { u32 new_mappings = (np << page_shift) - - (io_virt - round_down(io_virt, 1 << page_shift)); - *bytes_mapped += min_t(u32, new_mappings, size); - } - - npages += np << (page_shift - PAGE_SHIFT); - bcnt -= size; + (user_va - round_down(user_va, 1 << page_shift)); - if (unlikely(bcnt)) { - struct ib_umem_odp *next; - - io_virt += size; - next = odp_next(odp); - if (unlikely(!next || ib_umem_start(next) != io_virt)) { - mlx5_ib_dbg( - mr->dev, - "next implicit leaf removed at 0x%llx. got %p\n", - io_virt, next); - return -EAGAIN; - } - odp = next; - mr = odp->private; - goto next_mr; + *bytes_mapped += min_t(u32, new_mappings, bcnt); } - return npages; + return np << (page_shift - PAGE_SHIFT); out: if (ret == -EAGAIN) { @@ -742,6 +705,70 @@ out: return ret; } +/* + * Returns: + * -EFAULT: The io_virt->bcnt is not within the MR, it covers pages that are + * not accessible, or the MR is no longer valid. + * -EAGAIN/-ENOMEM: The operation should be retried + * + * -EINVAL/others: General internal malfunction + * >0: Number of pages mapped + */ +static int pagefault_mr(struct mlx5_ib_mr *mr, u64 io_virt, size_t bcnt, + u32 *bytes_mapped, u32 flags) +{ + struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem); + struct ib_umem_odp *child; + int npages = 0; + + if (!odp->is_implicit_odp) { + if (unlikely(io_virt < ib_umem_start(odp) || + ib_umem_end(odp) - io_virt < bcnt)) + return -EFAULT; + return pagefault_real_mr(mr, odp, io_virt, bcnt, bytes_mapped, + flags); + } + + if (unlikely(io_virt >= mlx5_imr_ksm_entries * MLX5_IMR_MTT_SIZE || + mlx5_imr_ksm_entries * MLX5_IMR_MTT_SIZE - io_virt < bcnt)) + return -EFAULT; + + child = implicit_mr_get_data(mr, io_virt, bcnt); + if (IS_ERR(child)) + return PTR_ERR(child); + + /* Fault each child mr that intersects with our interval. */ + while (bcnt) { + u64 end = min_t(u64, io_virt + bcnt, ib_umem_end(child)); + u64 len = end - io_virt; + int ret; + + ret = pagefault_real_mr(child->private, child, io_virt, len, + bytes_mapped, flags); + if (ret < 0) + return ret; + io_virt += len; + bcnt -= len; + npages += ret; + + if (unlikely(bcnt)) { + child = odp_next(child); + /* + * implicit_mr_get_data sets up all the leaves, this + * means they got invalidated before we got to them. + */ + if (!child || ib_umem_start(child) != io_virt) { + mlx5_ib_dbg( + mr->dev, + "next implicit leaf removed at 0x%llx.\n", + io_virt); + return -EAGAIN; + } + } + } + return npages; +} + struct pf_frame { struct pf_frame *next; u32 key; -- cgit v1.2.3-59-g8ed1b From 423f52d65005e8f5067d94bd4f41d8a7d8388135 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 9 Oct 2019 13:09:29 -0300 Subject: RDMA/mlx5: Use an xarray for the children of an implicit ODP Currently the child leaves are stored in the shared interval tree and every lookup for a child must be done under the interval tree rwsem. This is further complicated by dropping the rwsem during iteration (ie the odp_lookup(), odp_next() pattern), which requires a very tricky an difficult to understand locking scheme with SRCU. Instead reserve the interval tree for the exclusive use of the mmu notifier related code in umem_odp.c and give each implicit MR a xarray containing all the child MRs. Since the size of each child is 1GB of VA, a 1 level xarray will index 64G of VA, and a 2 level will index 2TB, making xarray a much better data structure choice than an interval tree. The locking properties of xarray will be used in the next patches to rework the implicit ODP locking scheme into something simpler. At this point, the xarray is locked by the implicit MR's umem_mutex, and read can also be locked by the odp_srcu. Link: https://lore.kernel.org/r/20191009160934.3143-10-jgg@ziepe.ca Reviewed-by: Artemy Kovalyov Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/mlx5_ib.h | 5 +- drivers/infiniband/hw/mlx5/odp.c | 195 +++++++++++------------------------ include/rdma/ib_umem_odp.h | 16 --- 3 files changed, 67 insertions(+), 149 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index 2cf91db6a36f..88769fcffb5a 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -617,10 +617,13 @@ struct mlx5_ib_mr { u64 data_iova; u64 pi_iova; + /* For ODP and implicit */ atomic_t num_leaf_free; wait_queue_head_t q_leaf_free; - struct mlx5_async_work cb_work; atomic_t num_pending_prefetch; + struct xarray implicit_children; + + struct mlx5_async_work cb_work; }; static inline bool is_odp_mr(struct mlx5_ib_mr *mr) diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c index 8f43af6580ce..6f7eea175c72 100644 --- a/drivers/infiniband/hw/mlx5/odp.c +++ b/drivers/infiniband/hw/mlx5/odp.c @@ -93,132 +93,54 @@ struct mlx5_pagefault { static u64 mlx5_imr_ksm_entries; -static int check_parent(struct ib_umem_odp *odp, - struct mlx5_ib_mr *parent) +void mlx5_odp_populate_klm(struct mlx5_klm *pklm, size_t idx, size_t nentries, + struct mlx5_ib_mr *imr, int flags) { - struct mlx5_ib_mr *mr = odp->private; - - return mr && mr->parent == parent && !odp->dying; -} - -static struct ib_ucontext_per_mm *mr_to_per_mm(struct mlx5_ib_mr *mr) -{ - if (WARN_ON(!mr || !is_odp_mr(mr))) - return NULL; - - return to_ib_umem_odp(mr->umem)->per_mm; -} - -static struct ib_umem_odp *odp_next(struct ib_umem_odp *odp) -{ - struct mlx5_ib_mr *mr = odp->private, *parent = mr->parent; - struct ib_ucontext_per_mm *per_mm = odp->per_mm; - struct rb_node *rb; - - down_read(&per_mm->umem_rwsem); - while (1) { - rb = rb_next(&odp->interval_tree.rb); - if (!rb) - goto not_found; - odp = rb_entry(rb, struct ib_umem_odp, interval_tree.rb); - if (check_parent(odp, parent)) - goto end; - } -not_found: - odp = NULL; -end: - up_read(&per_mm->umem_rwsem); - return odp; -} - -static struct ib_umem_odp *odp_lookup(u64 start, u64 length, - struct mlx5_ib_mr *parent) -{ - struct ib_ucontext_per_mm *per_mm = mr_to_per_mm(parent); - struct ib_umem_odp *odp; - struct rb_node *rb; - - down_read(&per_mm->umem_rwsem); - odp = rbt_ib_umem_lookup(&per_mm->umem_tree, start, length); - if (!odp) - goto end; - - while (1) { - if (check_parent(odp, parent)) - goto end; - rb = rb_next(&odp->interval_tree.rb); - if (!rb) - goto not_found; - odp = rb_entry(rb, struct ib_umem_odp, interval_tree.rb); - if (ib_umem_start(odp) > start + length) - goto not_found; - } -not_found: - odp = NULL; -end: - up_read(&per_mm->umem_rwsem); - return odp; -} - -void mlx5_odp_populate_klm(struct mlx5_klm *pklm, size_t offset, - size_t nentries, struct mlx5_ib_mr *mr, int flags) -{ - struct ib_pd *pd = mr->ibmr.pd; - struct mlx5_ib_dev *dev = to_mdev(pd->device); - struct ib_umem_odp *odp; - unsigned long va; - int i; + struct mlx5_klm *end = pklm + nentries; if (flags & MLX5_IB_UPD_XLT_ZAP) { - for (i = 0; i < nentries; i++, pklm++) { + for (; pklm != end; pklm++, idx++) { pklm->bcount = cpu_to_be32(MLX5_IMR_MTT_SIZE); - pklm->key = cpu_to_be32(dev->null_mkey); + pklm->key = cpu_to_be32(imr->dev->null_mkey); pklm->va = 0; } return; } /* - * The locking here is pretty subtle. Ideally the implicit children - * list would be protected by the umem_mutex, however that is not + * The locking here is pretty subtle. Ideally the implicit_children + * xarray would be protected by the umem_mutex, however that is not * possible. Instead this uses a weaker update-then-lock pattern: * * srcu_read_lock() - * + * xa_store() * mutex_lock(umem_mutex) * mlx5_ib_update_xlt() * mutex_unlock(umem_mutex) * destroy lkey * - * ie any change the children list must be followed by the locked - * update_xlt before destroying. + * ie any change the xarray must be followed by the locked update_xlt + * before destroying. * * The umem_mutex provides the acquire/release semantic needed to make - * the children list visible to a racing thread. While SRCU is not + * the xa_store() visible to a racing thread. While SRCU is not * technically required, using it gives consistent use of the SRCU - * locking around the children list. + * locking around the xarray. */ - lockdep_assert_held(&to_ib_umem_odp(mr->umem)->umem_mutex); - lockdep_assert_held(&mr->dev->odp_srcu); + lockdep_assert_held(&to_ib_umem_odp(imr->umem)->umem_mutex); + lockdep_assert_held(&imr->dev->odp_srcu); - odp = odp_lookup(offset * MLX5_IMR_MTT_SIZE, - nentries * MLX5_IMR_MTT_SIZE, mr); + for (; pklm != end; pklm++, idx++) { + struct mlx5_ib_mr *mtt = xa_load(&imr->implicit_children, idx); - for (i = 0; i < nentries; i++, pklm++) { pklm->bcount = cpu_to_be32(MLX5_IMR_MTT_SIZE); - va = (offset + i) * MLX5_IMR_MTT_SIZE; - if (odp && ib_umem_start(odp) == va) { - struct mlx5_ib_mr *mtt = odp->private; - + if (mtt) { pklm->key = cpu_to_be32(mtt->ibmr.lkey); - pklm->va = cpu_to_be64(va); - odp = odp_next(odp); + pklm->va = cpu_to_be64(idx * MLX5_IMR_MTT_SIZE); } else { - pklm->key = cpu_to_be32(dev->null_mkey); + pklm->key = cpu_to_be32(imr->dev->null_mkey); pklm->va = 0; } - mlx5_ib_dbg(dev, "[%d] va %lx key %x\n", - i, va, be32_to_cpu(pklm->key)); } } @@ -320,6 +242,8 @@ void mlx5_ib_invalidate_range(struct ib_umem_odp *umem_odp, unsigned long start, if (unlikely(!umem_odp->npages && mr->parent && !umem_odp->dying)) { + xa_erase(&mr->parent->implicit_children, + ib_umem_start(umem_odp) >> MLX5_IMR_MTT_SHIFT); xa_erase(&mr->dev->odp_mkeys, mlx5_base_mkey(mr->mmkey.key)); umem_odp->dying = 1; atomic_inc(&mr->parent->num_leaf_free); @@ -464,6 +388,16 @@ static struct mlx5_ib_mr *implicit_get_child_mr(struct mlx5_ib_mr *imr, goto out_release; } + /* + * Once the store to either xarray completes any error unwind has to + * use synchronize_srcu(). Avoid this with xa_reserve() + */ + err = xa_err(xa_store(&imr->implicit_children, idx, mr, GFP_KERNEL)); + if (err) { + ret = ERR_PTR(err); + goto out_release; + } + xa_store(&imr->dev->odp_mkeys, mlx5_base_mkey(mr->mmkey.key), &mr->mmkey, GFP_ATOMIC); @@ -479,7 +413,7 @@ out_umem: return ret; } -static struct ib_umem_odp *implicit_mr_get_data(struct mlx5_ib_mr *imr, +static struct mlx5_ib_mr *implicit_mr_get_data(struct mlx5_ib_mr *imr, u64 io_virt, size_t bcnt) { struct ib_umem_odp *odp_imr = to_ib_umem_odp(imr->umem); @@ -487,39 +421,32 @@ static struct ib_umem_odp *implicit_mr_get_data(struct mlx5_ib_mr *imr, unsigned long idx = io_virt >> MLX5_IMR_MTT_SHIFT; unsigned long inv_start_idx = end_idx + 1; unsigned long inv_len = 0; - struct ib_umem_odp *result = NULL; - struct ib_umem_odp *odp; + struct mlx5_ib_mr *result = NULL; int ret; mutex_lock(&odp_imr->umem_mutex); - odp = odp_lookup(idx * MLX5_IMR_MTT_SIZE, 1, imr); for (idx = idx; idx <= end_idx; idx++) { - if (unlikely(!odp)) { - struct mlx5_ib_mr *mtt; + struct mlx5_ib_mr *mtt = xa_load(&imr->implicit_children, idx); + if (unlikely(!mtt)) { mtt = implicit_get_child_mr(imr, idx); if (IS_ERR(mtt)) { - result = ERR_CAST(mtt); + result = mtt; goto out; } - odp = to_ib_umem_odp(mtt->umem); inv_start_idx = min(inv_start_idx, idx); inv_len = idx - inv_start_idx + 1; } /* Return first odp if region not covered by single one */ if (likely(!result)) - result = odp; - - odp = odp_next(odp); - if (odp && ib_umem_start(odp) != idx * MLX5_IMR_MTT_SIZE) - odp = NULL; + result = mtt; } /* - * Any time the children in the interval tree are changed we must - * perform an update of the xlt before exiting to ensure the HW and - * the tree remains synchronized. + * Any time the implicit_children are changed we must perform an + * update of the xlt before exiting to ensure the HW and the + * implicit_children remains synchronized. */ out: if (likely(!inv_len)) @@ -569,6 +496,7 @@ struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd, init_waitqueue_head(&imr->q_leaf_free); atomic_set(&imr->num_leaf_free, 0); atomic_set(&imr->num_pending_prefetch, 0); + xa_init(&imr->implicit_children); err = mlx5_ib_update_xlt(imr, 0, mlx5_imr_ksm_entries, @@ -596,18 +524,15 @@ out_umem: void mlx5_ib_free_implicit_mr(struct mlx5_ib_mr *imr) { - struct ib_ucontext_per_mm *per_mm = mr_to_per_mm(imr); - struct rb_node *node; + struct ib_umem_odp *odp_imr = to_ib_umem_odp(imr->umem); + struct mlx5_ib_mr *mtt; + unsigned long idx; - down_read(&per_mm->umem_rwsem); - for (node = rb_first_cached(&per_mm->umem_tree); node; - node = rb_next(node)) { - struct ib_umem_odp *umem_odp = - rb_entry(node, struct ib_umem_odp, interval_tree.rb); - struct mlx5_ib_mr *mr = umem_odp->private; + mutex_lock(&odp_imr->umem_mutex); + xa_for_each (&imr->implicit_children, idx, mtt) { + struct ib_umem_odp *umem_odp = to_ib_umem_odp(mtt->umem); - if (mr->parent != imr) - continue; + xa_erase(&imr->implicit_children, idx); mutex_lock(&umem_odp->umem_mutex); ib_umem_odp_unmap_dma_pages(umem_odp, ib_umem_start(umem_odp), @@ -623,9 +548,12 @@ void mlx5_ib_free_implicit_mr(struct mlx5_ib_mr *imr) schedule_work(&umem_odp->work); mutex_unlock(&umem_odp->umem_mutex); } - up_read(&per_mm->umem_rwsem); + mutex_unlock(&odp_imr->umem_mutex); wait_event(imr->q_leaf_free, !atomic_read(&imr->num_leaf_free)); + WARN_ON(!xa_empty(&imr->implicit_children)); + /* Remove any left over reserved elements */ + xa_destroy(&imr->implicit_children); } #define MLX5_PF_FLAGS_DOWNGRADE BIT(1) @@ -718,7 +646,7 @@ static int pagefault_mr(struct mlx5_ib_mr *mr, u64 io_virt, size_t bcnt, u32 *bytes_mapped, u32 flags) { struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem); - struct ib_umem_odp *child; + struct mlx5_ib_mr *mtt; int npages = 0; if (!odp->is_implicit_odp) { @@ -733,17 +661,18 @@ static int pagefault_mr(struct mlx5_ib_mr *mr, u64 io_virt, size_t bcnt, mlx5_imr_ksm_entries * MLX5_IMR_MTT_SIZE - io_virt < bcnt)) return -EFAULT; - child = implicit_mr_get_data(mr, io_virt, bcnt); - if (IS_ERR(child)) - return PTR_ERR(child); + mtt = implicit_mr_get_data(mr, io_virt, bcnt); + if (IS_ERR(mtt)) + return PTR_ERR(mtt); /* Fault each child mr that intersects with our interval. */ while (bcnt) { - u64 end = min_t(u64, io_virt + bcnt, ib_umem_end(child)); + struct ib_umem_odp *umem_odp = to_ib_umem_odp(mtt->umem); + u64 end = min_t(u64, io_virt + bcnt, ib_umem_end(umem_odp)); u64 len = end - io_virt; int ret; - ret = pagefault_real_mr(child->private, child, io_virt, len, + ret = pagefault_real_mr(mtt, umem_odp, io_virt, len, bytes_mapped, flags); if (ret < 0) return ret; @@ -752,12 +681,14 @@ static int pagefault_mr(struct mlx5_ib_mr *mr, u64 io_virt, size_t bcnt, npages += ret; if (unlikely(bcnt)) { - child = odp_next(child); + mtt = xa_load(&mr->implicit_children, + io_virt >> MLX5_IMR_MTT_SHIFT); + /* * implicit_mr_get_data sets up all the leaves, this * means they got invalidated before we got to them. */ - if (!child || ib_umem_start(child) != io_virt) { + if (!mtt) { mlx5_ib_dbg( mr->dev, "next implicit leaf removed at 0x%llx.\n", diff --git a/include/rdma/ib_umem_odp.h b/include/rdma/ib_umem_odp.h index 253df1a1fa54..28078efc3833 100644 --- a/include/rdma/ib_umem_odp.h +++ b/include/rdma/ib_umem_odp.h @@ -156,22 +156,6 @@ int rbt_ib_umem_for_each_in_range(struct rb_root_cached *root, umem_call_back cb, bool blockable, void *cookie); -/* - * Find first region intersecting with address range. - * Return NULL if not found - */ -static inline struct ib_umem_odp * -rbt_ib_umem_lookup(struct rb_root_cached *root, u64 addr, u64 length) -{ - struct interval_tree_node *node; - - node = interval_tree_iter_first(root, addr, addr + length - 1); - if (!node) - return NULL; - return container_of(node, struct ib_umem_odp, interval_tree); - -} - static inline int ib_umem_mmu_notifier_retry(struct ib_umem_odp *umem_odp, unsigned long mmu_seq) { -- cgit v1.2.3-59-g8ed1b From 3389baa831b6a09e3c96e2a6283a1b952be2f0cd Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 9 Oct 2019 13:09:30 -0300 Subject: RDMA/mlx5: Reduce locking in implicit_mr_get_data() Now that the child MRs are stored in an xarray we can rely on the SRCU lock to protect the xa_load and use xa_cmpxchg on the slow allocation path to resolve races with concurrent page fault. This reduces the scope of the critical section of umem_mutex for implicit MRs to only cover mlx5_ib_update_xlt, and avoids taking a lock at all if the child MR is already in the xarray. This makes it consistent with the normal ODP MR critical section for umem_lock, and the locking approach used for destroying an unusued implicit child MR. The MLX5_IB_UPD_XLT_ATOMIC is no longer needed in implicit_get_child_mr() since it is no longer called with any locks. Link: https://lore.kernel.org/r/20191009160934.3143-11-jgg@ziepe.ca Reviewed-by: Artemy Kovalyov Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/odp.c | 38 ++++++++++++++++++++++++++------------ 1 file changed, 26 insertions(+), 12 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c index 6f7eea175c72..00e14b6acd98 100644 --- a/drivers/infiniband/hw/mlx5/odp.c +++ b/drivers/infiniband/hw/mlx5/odp.c @@ -381,8 +381,7 @@ static struct mlx5_ib_mr *implicit_get_child_mr(struct mlx5_ib_mr *imr, MLX5_IMR_MTT_ENTRIES, PAGE_SHIFT, MLX5_IB_UPD_XLT_ZAP | - MLX5_IB_UPD_XLT_ENABLE | - MLX5_IB_UPD_XLT_ATOMIC); + MLX5_IB_UPD_XLT_ENABLE); if (err) { ret = ERR_PTR(err); goto out_release; @@ -392,9 +391,16 @@ static struct mlx5_ib_mr *implicit_get_child_mr(struct mlx5_ib_mr *imr, * Once the store to either xarray completes any error unwind has to * use synchronize_srcu(). Avoid this with xa_reserve() */ - err = xa_err(xa_store(&imr->implicit_children, idx, mr, GFP_KERNEL)); - if (err) { - ret = ERR_PTR(err); + ret = xa_cmpxchg(&imr->implicit_children, idx, NULL, mr, GFP_KERNEL); + if (unlikely(ret)) { + if (xa_is_err(ret)) { + ret = ERR_PTR(xa_err(ret)); + goto out_release; + } + /* + * Another thread beat us to creating the child mr, use + * theirs. + */ goto out_release; } @@ -424,7 +430,8 @@ static struct mlx5_ib_mr *implicit_mr_get_data(struct mlx5_ib_mr *imr, struct mlx5_ib_mr *result = NULL; int ret; - mutex_lock(&odp_imr->umem_mutex); + lockdep_assert_held(&imr->dev->odp_srcu); + for (idx = idx; idx <= end_idx; idx++) { struct mlx5_ib_mr *mtt = xa_load(&imr->implicit_children, idx); @@ -450,20 +457,27 @@ static struct mlx5_ib_mr *implicit_mr_get_data(struct mlx5_ib_mr *imr, */ out: if (likely(!inv_len)) - goto out_unlock; + return result; + /* + * Notice this is not strictly ordered right, the KSM is updated after + * the implicit_leaves is updated, so a parallel page fault could see + * a MR that is not yet visible in the KSM. This is similar to a + * parallel page fault seeing a MR that is being concurrently removed + * from the KSM. Both of these improbable situations are resolved + * safely by resuming the HW and then taking another page fault. The + * next pagefault handler will see the new information. + */ + mutex_lock(&odp_imr->umem_mutex); ret = mlx5_ib_update_xlt(imr, inv_start_idx, inv_len, 0, MLX5_IB_UPD_XLT_INDIRECT | MLX5_IB_UPD_XLT_ATOMIC); + mutex_unlock(&odp_imr->umem_mutex); if (ret) { mlx5_ib_err(to_mdev(imr->ibmr.pd->device), "Failed to update PAS\n"); - result = ERR_PTR(ret); - goto out_unlock; + return ERR_PTR(ret); } - -out_unlock: - mutex_unlock(&odp_imr->umem_mutex); return result; } -- cgit v1.2.3-59-g8ed1b From b70d785d237c0d3e4235c511f38f8ce64620f945 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 9 Oct 2019 13:09:31 -0300 Subject: RDMA/mlx5: Avoid double lookups on the pagefault path Now that the locking is simplified combine pagefault_implicit_mr() with implicit_mr_get_data() so that we sweep over the idx range only once, and do the single xlt update at the end, after the child umems are setup. This avoids double iteration/xa_loads plus the sketchy failure path if the xa_load() fails. Link: https://lore.kernel.org/r/20191009160934.3143-12-jgg@ziepe.ca Reviewed-by: Artemy Kovalyov Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/odp.c | 186 +++++++++++++++++---------------------- 1 file changed, 80 insertions(+), 106 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c index 00e14b6acd98..8bd30db87c21 100644 --- a/drivers/infiniband/hw/mlx5/odp.c +++ b/drivers/infiniband/hw/mlx5/odp.c @@ -419,68 +419,6 @@ out_umem: return ret; } -static struct mlx5_ib_mr *implicit_mr_get_data(struct mlx5_ib_mr *imr, - u64 io_virt, size_t bcnt) -{ - struct ib_umem_odp *odp_imr = to_ib_umem_odp(imr->umem); - unsigned long end_idx = (io_virt + bcnt - 1) >> MLX5_IMR_MTT_SHIFT; - unsigned long idx = io_virt >> MLX5_IMR_MTT_SHIFT; - unsigned long inv_start_idx = end_idx + 1; - unsigned long inv_len = 0; - struct mlx5_ib_mr *result = NULL; - int ret; - - lockdep_assert_held(&imr->dev->odp_srcu); - - for (idx = idx; idx <= end_idx; idx++) { - struct mlx5_ib_mr *mtt = xa_load(&imr->implicit_children, idx); - - if (unlikely(!mtt)) { - mtt = implicit_get_child_mr(imr, idx); - if (IS_ERR(mtt)) { - result = mtt; - goto out; - } - inv_start_idx = min(inv_start_idx, idx); - inv_len = idx - inv_start_idx + 1; - } - - /* Return first odp if region not covered by single one */ - if (likely(!result)) - result = mtt; - } - - /* - * Any time the implicit_children are changed we must perform an - * update of the xlt before exiting to ensure the HW and the - * implicit_children remains synchronized. - */ -out: - if (likely(!inv_len)) - return result; - - /* - * Notice this is not strictly ordered right, the KSM is updated after - * the implicit_leaves is updated, so a parallel page fault could see - * a MR that is not yet visible in the KSM. This is similar to a - * parallel page fault seeing a MR that is being concurrently removed - * from the KSM. Both of these improbable situations are resolved - * safely by resuming the HW and then taking another page fault. The - * next pagefault handler will see the new information. - */ - mutex_lock(&odp_imr->umem_mutex); - ret = mlx5_ib_update_xlt(imr, inv_start_idx, inv_len, 0, - MLX5_IB_UPD_XLT_INDIRECT | - MLX5_IB_UPD_XLT_ATOMIC); - mutex_unlock(&odp_imr->umem_mutex); - if (ret) { - mlx5_ib_err(to_mdev(imr->ibmr.pd->device), - "Failed to update PAS\n"); - return ERR_PTR(ret); - } - return result; -} - struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd, struct ib_udata *udata, int access_flags) @@ -647,6 +585,84 @@ out: return ret; } +static int pagefault_implicit_mr(struct mlx5_ib_mr *imr, + struct ib_umem_odp *odp_imr, u64 user_va, + size_t bcnt, u32 *bytes_mapped, u32 flags) +{ + unsigned long end_idx = (user_va + bcnt - 1) >> MLX5_IMR_MTT_SHIFT; + unsigned long upd_start_idx = end_idx + 1; + unsigned long upd_len = 0; + unsigned long npages = 0; + int err; + int ret; + + if (unlikely(user_va >= mlx5_imr_ksm_entries * MLX5_IMR_MTT_SIZE || + mlx5_imr_ksm_entries * MLX5_IMR_MTT_SIZE - user_va < bcnt)) + return -EFAULT; + + /* Fault each child mr that intersects with our interval. */ + while (bcnt) { + unsigned long idx = user_va >> MLX5_IMR_MTT_SHIFT; + struct ib_umem_odp *umem_odp; + struct mlx5_ib_mr *mtt; + u64 len; + + mtt = xa_load(&imr->implicit_children, idx); + if (unlikely(!mtt)) { + mtt = implicit_get_child_mr(imr, idx); + if (IS_ERR(mtt)) { + ret = PTR_ERR(mtt); + goto out; + } + upd_start_idx = min(upd_start_idx, idx); + upd_len = idx - upd_start_idx + 1; + } + + umem_odp = to_ib_umem_odp(mtt->umem); + len = min_t(u64, user_va + bcnt, ib_umem_end(umem_odp)) - + user_va; + + ret = pagefault_real_mr(mtt, umem_odp, user_va, len, + bytes_mapped, flags); + if (ret < 0) + goto out; + user_va += len; + bcnt -= len; + npages += ret; + } + + ret = npages; + + /* + * Any time the implicit_children are changed we must perform an + * update of the xlt before exiting to ensure the HW and the + * implicit_children remains synchronized. + */ +out: + if (likely(!upd_len)) + return ret; + + /* + * Notice this is not strictly ordered right, the KSM is updated after + * the implicit_children is updated, so a parallel page fault could + * see a MR that is not yet visible in the KSM. This is similar to a + * parallel page fault seeing a MR that is being concurrently removed + * from the KSM. Both of these improbable situations are resolved + * safely by resuming the HW and then taking another page fault. The + * next pagefault handler will see the new information. + */ + mutex_lock(&odp_imr->umem_mutex); + err = mlx5_ib_update_xlt(imr, upd_start_idx, upd_len, 0, + MLX5_IB_UPD_XLT_INDIRECT | + MLX5_IB_UPD_XLT_ATOMIC); + mutex_unlock(&odp_imr->umem_mutex); + if (err) { + mlx5_ib_err(imr->dev, "Failed to update PAS\n"); + return err; + } + return ret; +} + /* * Returns: * -EFAULT: The io_virt->bcnt is not within the MR, it covers pages that are @@ -660,8 +676,6 @@ static int pagefault_mr(struct mlx5_ib_mr *mr, u64 io_virt, size_t bcnt, u32 *bytes_mapped, u32 flags) { struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem); - struct mlx5_ib_mr *mtt; - int npages = 0; if (!odp->is_implicit_odp) { if (unlikely(io_virt < ib_umem_start(odp) || @@ -670,48 +684,8 @@ static int pagefault_mr(struct mlx5_ib_mr *mr, u64 io_virt, size_t bcnt, return pagefault_real_mr(mr, odp, io_virt, bcnt, bytes_mapped, flags); } - - if (unlikely(io_virt >= mlx5_imr_ksm_entries * MLX5_IMR_MTT_SIZE || - mlx5_imr_ksm_entries * MLX5_IMR_MTT_SIZE - io_virt < bcnt)) - return -EFAULT; - - mtt = implicit_mr_get_data(mr, io_virt, bcnt); - if (IS_ERR(mtt)) - return PTR_ERR(mtt); - - /* Fault each child mr that intersects with our interval. */ - while (bcnt) { - struct ib_umem_odp *umem_odp = to_ib_umem_odp(mtt->umem); - u64 end = min_t(u64, io_virt + bcnt, ib_umem_end(umem_odp)); - u64 len = end - io_virt; - int ret; - - ret = pagefault_real_mr(mtt, umem_odp, io_virt, len, - bytes_mapped, flags); - if (ret < 0) - return ret; - io_virt += len; - bcnt -= len; - npages += ret; - - if (unlikely(bcnt)) { - mtt = xa_load(&mr->implicit_children, - io_virt >> MLX5_IMR_MTT_SHIFT); - - /* - * implicit_mr_get_data sets up all the leaves, this - * means they got invalidated before we got to them. - */ - if (!mtt) { - mlx5_ib_dbg( - mr->dev, - "next implicit leaf removed at 0x%llx.\n", - io_virt); - return -EAGAIN; - } - } - } - return npages; + return pagefault_implicit_mr(mr, odp, io_virt, bcnt, bytes_mapped, + flags); } struct pf_frame { -- cgit v1.2.3-59-g8ed1b From 5256edcb98a14b11409a2d323f56a70a8b366363 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 9 Oct 2019 13:09:32 -0300 Subject: RDMA/mlx5: Rework implicit ODP destroy Use SRCU in a sensible way by removing all MRs in the implicit tree from the two xarrays (the update operation), then a synchronize, followed by a normal single threaded teardown. This is only a little unusual from the normal pattern as there can still be some work pending in the unbound wq that may also require a workqueue flush. This is tracked with a single atomic, consolidating the redundant existing atomics and wait queue. For understand-ability the entire ODP implicit create/destroy flow now largely exists in a single pair of functions within odp.c, with a few support functions for tearing down an unused child. Link: https://lore.kernel.org/r/20191009160934.3143-13-jgg@ziepe.ca Reviewed-by: Artemy Kovalyov Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/main.c | 2 - drivers/infiniband/hw/mlx5/mlx5_ib.h | 9 ++- drivers/infiniband/hw/mlx5/mr.c | 21 ++--- drivers/infiniband/hw/mlx5/odp.c | 152 +++++++++++++++++++++++------------ include/rdma/ib_umem_odp.h | 2 - 5 files changed, 120 insertions(+), 66 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 4692c37b057c..add24b628900 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -6146,8 +6146,6 @@ static void mlx5_ib_stage_init_cleanup(struct mlx5_ib_dev *dev) { mlx5_ib_cleanup_multiport_master(dev); WARN_ON(!xa_empty(&dev->odp_mkeys)); - if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) - srcu_barrier(&dev->odp_srcu); cleanup_srcu_struct(&dev->odp_srcu); WARN_ON(!xa_empty(&dev->sig_mrs)); diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index 88769fcffb5a..b8c958f62628 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -618,10 +618,13 @@ struct mlx5_ib_mr { u64 pi_iova; /* For ODP and implicit */ - atomic_t num_leaf_free; - wait_queue_head_t q_leaf_free; - atomic_t num_pending_prefetch; + atomic_t num_deferred_work; struct xarray implicit_children; + union { + struct rcu_head rcu; + struct list_head elm; + struct work_struct work; + } odp_destroy; struct mlx5_async_work cb_work; }; diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index fd94838a8845..1e91f61efa8a 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -1317,7 +1317,7 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, if (is_odp_mr(mr)) { to_ib_umem_odp(mr->umem)->private = mr; - atomic_set(&mr->num_pending_prefetch, 0); + atomic_set(&mr->num_deferred_work, 0); err = xa_err(xa_store(&dev->odp_mkeys, mlx5_base_mkey(mr->mmkey.key), &mr->mmkey, GFP_KERNEL)); @@ -1573,17 +1573,15 @@ static void dereg_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) synchronize_srcu(&dev->odp_srcu); /* dequeue pending prefetch requests for the mr */ - if (atomic_read(&mr->num_pending_prefetch)) + if (atomic_read(&mr->num_deferred_work)) { flush_workqueue(system_unbound_wq); - WARN_ON(atomic_read(&mr->num_pending_prefetch)); + WARN_ON(atomic_read(&mr->num_deferred_work)); + } /* Destroy all page mappings */ - if (!umem_odp->is_implicit_odp) - mlx5_ib_invalidate_range(umem_odp, - ib_umem_start(umem_odp), - ib_umem_end(umem_odp)); - else - mlx5_ib_free_implicit_mr(mr); + mlx5_ib_invalidate_range(umem_odp, ib_umem_start(umem_odp), + ib_umem_end(umem_odp)); + /* * We kill the umem before the MR for ODP, * so that there will not be any invalidations in @@ -1620,6 +1618,11 @@ int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata) dereg_mr(to_mdev(mmr->klm_mr->ibmr.device), mmr->klm_mr); } + if (is_odp_mr(mmr) && to_ib_umem_odp(mmr->umem)->is_implicit_odp) { + mlx5_ib_free_implicit_mr(mmr); + return 0; + } + dereg_mr(to_mdev(ibmr->device), mmr); return 0; diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c index 8bd30db87c21..5cf93d8129a9 100644 --- a/drivers/infiniband/hw/mlx5/odp.c +++ b/drivers/infiniband/hw/mlx5/odp.c @@ -144,31 +144,79 @@ void mlx5_odp_populate_klm(struct mlx5_klm *pklm, size_t idx, size_t nentries, } } -static void mr_leaf_free_action(struct work_struct *work) +/* + * This must be called after the mr has been removed from implicit_children + * and odp_mkeys and the SRCU synchronized. NOTE: The MR does not necessarily + * have to be empty here, parallel page faults could have raced with the free + * process and added pages to it. + */ +static void free_implicit_child_mr(struct mlx5_ib_mr *mr, bool need_imr_xlt) { - struct ib_umem_odp *odp = container_of(work, struct ib_umem_odp, work); - int idx = ib_umem_start(odp) >> MLX5_IMR_MTT_SHIFT; - struct mlx5_ib_mr *mr = odp->private, *imr = mr->parent; + struct mlx5_ib_mr *imr = mr->parent; struct ib_umem_odp *odp_imr = to_ib_umem_odp(imr->umem); + struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem); + unsigned long idx = ib_umem_start(odp) >> MLX5_IMR_MTT_SHIFT; int srcu_key; - mr->parent = NULL; - synchronize_srcu(&mr->dev->odp_srcu); + /* implicit_child_mr's are not allowed to have deferred work */ + WARN_ON(atomic_read(&mr->num_deferred_work)); - if (xa_load(&mr->dev->odp_mkeys, mlx5_base_mkey(imr->mmkey.key))) { + if (need_imr_xlt) { srcu_key = srcu_read_lock(&mr->dev->odp_srcu); mutex_lock(&odp_imr->umem_mutex); - mlx5_ib_update_xlt(imr, idx, 1, 0, + mlx5_ib_update_xlt(mr->parent, idx, 1, 0, MLX5_IB_UPD_XLT_INDIRECT | MLX5_IB_UPD_XLT_ATOMIC); mutex_unlock(&odp_imr->umem_mutex); srcu_read_unlock(&mr->dev->odp_srcu, srcu_key); } - ib_umem_odp_release(odp); + + mr->parent = NULL; mlx5_mr_cache_free(mr->dev, mr); + ib_umem_odp_release(odp); + atomic_dec(&imr->num_deferred_work); +} + +static void free_implicit_child_mr_work(struct work_struct *work) +{ + struct mlx5_ib_mr *mr = + container_of(work, struct mlx5_ib_mr, odp_destroy.work); + + free_implicit_child_mr(mr, true); +} + +static void free_implicit_child_mr_rcu(struct rcu_head *head) +{ + struct mlx5_ib_mr *mr = + container_of(head, struct mlx5_ib_mr, odp_destroy.rcu); + + /* Freeing a MR is a sleeping operation, so bounce to a work queue */ + INIT_WORK(&mr->odp_destroy.work, free_implicit_child_mr_work); + queue_work(system_unbound_wq, &mr->odp_destroy.work); +} + +static void destroy_unused_implicit_child_mr(struct mlx5_ib_mr *mr) +{ + struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem); + unsigned long idx = ib_umem_start(odp) >> MLX5_IMR_MTT_SHIFT; + struct mlx5_ib_mr *imr = mr->parent; - if (atomic_dec_and_test(&imr->num_leaf_free)) - wake_up(&imr->q_leaf_free); + xa_lock(&imr->implicit_children); + /* + * This can race with mlx5_ib_free_implicit_mr(), the first one to + * reach the xa lock wins the race and destroys the MR. + */ + if (__xa_cmpxchg(&imr->implicit_children, idx, mr, NULL, GFP_ATOMIC) != + mr) + goto out_unlock; + + __xa_erase(&mr->dev->odp_mkeys, mlx5_base_mkey(mr->mmkey.key)); + atomic_inc(&imr->num_deferred_work); + call_srcu(&mr->dev->odp_srcu, &mr->odp_destroy.rcu, + free_implicit_child_mr_rcu); + +out_unlock: + xa_unlock(&imr->implicit_children); } void mlx5_ib_invalidate_range(struct ib_umem_odp *umem_odp, unsigned long start, @@ -240,15 +288,8 @@ void mlx5_ib_invalidate_range(struct ib_umem_odp *umem_odp, unsigned long start, ib_umem_odp_unmap_dma_pages(umem_odp, start, end); - if (unlikely(!umem_odp->npages && mr->parent && - !umem_odp->dying)) { - xa_erase(&mr->parent->implicit_children, - ib_umem_start(umem_odp) >> MLX5_IMR_MTT_SHIFT); - xa_erase(&mr->dev->odp_mkeys, mlx5_base_mkey(mr->mmkey.key)); - umem_odp->dying = 1; - atomic_inc(&mr->parent->num_leaf_free); - schedule_work(&umem_odp->work); - } + if (unlikely(!umem_odp->npages && mr->parent)) + destroy_unused_implicit_child_mr(mr); mutex_unlock(&umem_odp->umem_mutex); } @@ -375,7 +416,6 @@ static struct mlx5_ib_mr *implicit_get_child_mr(struct mlx5_ib_mr *imr, mr->mmkey.iova = idx * MLX5_IMR_MTT_SIZE; mr->parent = imr; odp->private = mr; - INIT_WORK(&odp->work, mr_leaf_free_action); err = mlx5_ib_update_xlt(mr, 0, MLX5_IMR_MTT_ENTRIES, @@ -391,7 +431,11 @@ static struct mlx5_ib_mr *implicit_get_child_mr(struct mlx5_ib_mr *imr, * Once the store to either xarray completes any error unwind has to * use synchronize_srcu(). Avoid this with xa_reserve() */ - ret = xa_cmpxchg(&imr->implicit_children, idx, NULL, mr, GFP_KERNEL); + ret = xa_cmpxchg(&imr->implicit_children, idx, NULL, mr, + GFP_KERNEL); + if (likely(!ret)) + xa_store(&imr->dev->odp_mkeys, mlx5_base_mkey(mr->mmkey.key), + &mr->mmkey, GFP_ATOMIC); if (unlikely(ret)) { if (xa_is_err(ret)) { ret = ERR_PTR(xa_err(ret)); @@ -404,9 +448,6 @@ static struct mlx5_ib_mr *implicit_get_child_mr(struct mlx5_ib_mr *imr, goto out_release; } - xa_store(&imr->dev->odp_mkeys, mlx5_base_mkey(mr->mmkey.key), - &mr->mmkey, GFP_ATOMIC); - mlx5_ib_dbg(imr->dev, "key %x mr %p\n", mr->mmkey.key, mr); return mr; @@ -445,9 +486,7 @@ struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd, imr->ibmr.lkey = imr->mmkey.key; imr->ibmr.rkey = imr->mmkey.key; imr->umem = &umem_odp->umem; - init_waitqueue_head(&imr->q_leaf_free); - atomic_set(&imr->num_leaf_free, 0); - atomic_set(&imr->num_pending_prefetch, 0); + atomic_set(&imr->num_deferred_work, 0); xa_init(&imr->implicit_children); err = mlx5_ib_update_xlt(imr, 0, @@ -477,35 +516,48 @@ out_umem: void mlx5_ib_free_implicit_mr(struct mlx5_ib_mr *imr) { struct ib_umem_odp *odp_imr = to_ib_umem_odp(imr->umem); + struct mlx5_ib_dev *dev = imr->dev; + struct list_head destroy_list; struct mlx5_ib_mr *mtt; + struct mlx5_ib_mr *tmp; unsigned long idx; - mutex_lock(&odp_imr->umem_mutex); - xa_for_each (&imr->implicit_children, idx, mtt) { - struct ib_umem_odp *umem_odp = to_ib_umem_odp(mtt->umem); + INIT_LIST_HEAD(&destroy_list); - xa_erase(&imr->implicit_children, idx); + xa_erase(&dev->odp_mkeys, mlx5_base_mkey(imr->mmkey.key)); + /* + * This stops the SRCU protected page fault path from touching either + * the imr or any children. The page fault path can only reach the + * children xarray via the imr. + */ + synchronize_srcu(&dev->odp_srcu); - mutex_lock(&umem_odp->umem_mutex); - ib_umem_odp_unmap_dma_pages(umem_odp, ib_umem_start(umem_odp), - ib_umem_end(umem_odp)); + xa_lock(&imr->implicit_children); + xa_for_each (&imr->implicit_children, idx, mtt) { + __xa_erase(&imr->implicit_children, idx); + __xa_erase(&dev->odp_mkeys, mlx5_base_mkey(mtt->mmkey.key)); + list_add(&mtt->odp_destroy.elm, &destroy_list); + } + xa_unlock(&imr->implicit_children); - if (umem_odp->dying) { - mutex_unlock(&umem_odp->umem_mutex); - continue; - } + /* Fence access to the child pointers via the pagefault thread */ + synchronize_srcu(&dev->odp_srcu); - umem_odp->dying = 1; - atomic_inc(&imr->num_leaf_free); - schedule_work(&umem_odp->work); - mutex_unlock(&umem_odp->umem_mutex); + /* + * num_deferred_work can only be incremented inside the odp_srcu, or + * under xa_lock while the child is in the xarray. Thus at this point + * it is only decreasing, and all work holding it is now on the wq. + */ + if (atomic_read(&imr->num_deferred_work)) { + flush_workqueue(system_unbound_wq); + WARN_ON(atomic_read(&imr->num_deferred_work)); } - mutex_unlock(&odp_imr->umem_mutex); - wait_event(imr->q_leaf_free, !atomic_read(&imr->num_leaf_free)); - WARN_ON(!xa_empty(&imr->implicit_children)); - /* Remove any left over reserved elements */ - xa_destroy(&imr->implicit_children); + list_for_each_entry_safe (mtt, tmp, &destroy_list, odp_destroy.elm) + free_implicit_child_mr(mtt, false); + + mlx5_mr_cache_free(dev, imr); + ib_umem_odp_release(odp_imr); } #define MLX5_PF_FLAGS_DOWNGRADE BIT(1) @@ -1579,7 +1631,7 @@ static void destroy_prefetch_work(struct prefetch_mr_work *work) u32 i; for (i = 0; i < work->num_sge; ++i) - atomic_dec(&work->frags[i].mr->num_pending_prefetch); + atomic_dec(&work->frags[i].mr->num_deferred_work); kvfree(work); } @@ -1658,7 +1710,7 @@ static bool init_prefetch_work(struct ib_pd *pd, } /* Keep the MR pointer will valid outside the SRCU */ - atomic_inc(&work->frags[i].mr->num_pending_prefetch); + atomic_inc(&work->frags[i].mr->num_deferred_work); } work->num_sge = num_sge; return true; diff --git a/include/rdma/ib_umem_odp.h b/include/rdma/ib_umem_odp.h index 28078efc3833..09b0e4494986 100644 --- a/include/rdma/ib_umem_odp.h +++ b/include/rdma/ib_umem_odp.h @@ -78,9 +78,7 @@ struct ib_umem_odp { bool is_implicit_odp; struct completion notifier_completion; - int dying; unsigned int page_shift; - struct work_struct work; }; static inline struct ib_umem_odp *to_ib_umem_odp(struct ib_umem *umem) -- cgit v1.2.3-59-g8ed1b From d561987f34f263dd176ccd8fb782cb153d72f441 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 9 Oct 2019 13:09:33 -0300 Subject: RDMA/mlx5: Do not store implicit children in the odp_mkeys xarray These mkeys are entirely internal and are never used by the HW for page fault. They should also never be used by userspace for prefetch. Simplify & optimize things by not including them in the xarray. Since the prefetch path can now never see a child mkey there is no need for the second synchronize_srcu() during imr destroy. Link: https://lore.kernel.org/r/20191009160934.3143-14-jgg@ziepe.ca Reviewed-by: Artemy Kovalyov Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/odp.c | 36 ++++++------------------------------ 1 file changed, 6 insertions(+), 30 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c index 5cf93d8129a9..06e24d5e7609 100644 --- a/drivers/infiniband/hw/mlx5/odp.c +++ b/drivers/infiniband/hw/mlx5/odp.c @@ -146,9 +146,9 @@ void mlx5_odp_populate_klm(struct mlx5_klm *pklm, size_t idx, size_t nentries, /* * This must be called after the mr has been removed from implicit_children - * and odp_mkeys and the SRCU synchronized. NOTE: The MR does not necessarily - * have to be empty here, parallel page faults could have raced with the free - * process and added pages to it. + * and the SRCU synchronized. NOTE: The MR does not necessarily have to be + * empty here, parallel page faults could have raced with the free process and + * added pages to it. */ static void free_implicit_child_mr(struct mlx5_ib_mr *mr, bool need_imr_xlt) { @@ -210,7 +210,6 @@ static void destroy_unused_implicit_child_mr(struct mlx5_ib_mr *mr) mr) goto out_unlock; - __xa_erase(&mr->dev->odp_mkeys, mlx5_base_mkey(mr->mmkey.key)); atomic_inc(&imr->num_deferred_work); call_srcu(&mr->dev->odp_srcu, &mr->odp_destroy.rcu, free_implicit_child_mr_rcu); @@ -401,13 +400,6 @@ static struct mlx5_ib_mr *implicit_get_child_mr(struct mlx5_ib_mr *imr, if (IS_ERR(mr)) goto out_umem; - err = xa_reserve(&imr->dev->odp_mkeys, mlx5_base_mkey(mr->mmkey.key), - GFP_KERNEL); - if (err) { - ret = ERR_PTR(err); - goto out_mr; - } - mr->ibmr.pd = imr->ibmr.pd; mr->access_flags = imr->access_flags; mr->umem = &odp->umem; @@ -424,7 +416,7 @@ static struct mlx5_ib_mr *implicit_get_child_mr(struct mlx5_ib_mr *imr, MLX5_IB_UPD_XLT_ENABLE); if (err) { ret = ERR_PTR(err); - goto out_release; + goto out_mr; } /* @@ -433,26 +425,21 @@ static struct mlx5_ib_mr *implicit_get_child_mr(struct mlx5_ib_mr *imr, */ ret = xa_cmpxchg(&imr->implicit_children, idx, NULL, mr, GFP_KERNEL); - if (likely(!ret)) - xa_store(&imr->dev->odp_mkeys, mlx5_base_mkey(mr->mmkey.key), - &mr->mmkey, GFP_ATOMIC); if (unlikely(ret)) { if (xa_is_err(ret)) { ret = ERR_PTR(xa_err(ret)); - goto out_release; + goto out_mr; } /* * Another thread beat us to creating the child mr, use * theirs. */ - goto out_release; + goto out_mr; } mlx5_ib_dbg(imr->dev, "key %x mr %p\n", mr->mmkey.key, mr); return mr; -out_release: - xa_release(&imr->dev->odp_mkeys, mlx5_base_mkey(mr->mmkey.key)); out_mr: mlx5_mr_cache_free(imr->dev, mr); out_umem: @@ -535,14 +522,10 @@ void mlx5_ib_free_implicit_mr(struct mlx5_ib_mr *imr) xa_lock(&imr->implicit_children); xa_for_each (&imr->implicit_children, idx, mtt) { __xa_erase(&imr->implicit_children, idx); - __xa_erase(&dev->odp_mkeys, mlx5_base_mkey(mtt->mmkey.key)); list_add(&mtt->odp_destroy.elm, &destroy_list); } xa_unlock(&imr->implicit_children); - /* Fence access to the child pointers via the pagefault thread */ - synchronize_srcu(&dev->odp_srcu); - /* * num_deferred_work can only be incremented inside the odp_srcu, or * under xa_lock while the child is in the xarray. Thus at this point @@ -1655,13 +1638,6 @@ get_prefetchable_mr(struct ib_pd *pd, enum ib_uverbs_advise_mr_advice advice, if (mr->ibmr.pd != pd) return NULL; - /* - * Implicit child MRs are internal and userspace should not refer to - * them. - */ - if (mr->parent) - return NULL; - odp = to_ib_umem_odp(mr->umem); /* prefetch with write-access must be supported by the MR */ -- cgit v1.2.3-59-g8ed1b From 09689703d29a3b75c510c198c3aca85d7d8b50c7 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 9 Oct 2019 13:09:34 -0300 Subject: RDMA/mlx5: Do not race with mlx5_ib_invalidate_range during create and destroy For creation, as soon as the umem_odp is created the notifier can be called, however the underlying MR may not have been setup yet. This would cause problems if mlx5_ib_invalidate_range() runs. There is some confusing/ulocked/racy code that might by trying to solve this, but without locks it isn't going to work right. Instead trivially solve the problem by short-circuiting the invalidation if there are not yet any DMA mapped pages. By definition there is nothing to invalidate in this case. The create code will have the umem fully setup before anything is DMA mapped, and npages is fully locked by the umem_mutex. For destroy, invalidate the entire MR at the HW to stop DMA then DMA unmap the pages before destroying the MR. This drives npages to zero and prevents similar racing with invalidate while the MR is undergoing destruction. Arguably it would be better if the umem was created after the MR and destroyed before, but that would require a big rework of the MR code. Fixes: 6aec21f6a832 ("IB/mlx5: Page faults handling infrastructure") Link: https://lore.kernel.org/r/20191009160934.3143-15-jgg@ziepe.ca Reviewed-by: Artemy Kovalyov Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/mlx5_ib.h | 3 ++ drivers/infiniband/hw/mlx5/mr.c | 74 ++++++++++++------------------------ drivers/infiniband/hw/mlx5/odp.c | 70 +++++++++++++++++++++++++++++----- 3 files changed, 88 insertions(+), 59 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index b8c958f62628..f61d4005c6c3 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -1171,6 +1171,7 @@ struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd, struct ib_udata *udata, int access_flags); void mlx5_ib_free_implicit_mr(struct mlx5_ib_mr *mr); +void mlx5_ib_fence_odp_mr(struct mlx5_ib_mr *mr); int mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start, u64 length, u64 virt_addr, int access_flags, struct ib_pd *pd, struct ib_udata *udata); @@ -1232,6 +1233,8 @@ int mlx5_mr_cache_cleanup(struct mlx5_ib_dev *dev); struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev, int entry); void mlx5_mr_cache_free(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr); +int mlx5_mr_cache_invalidate(struct mlx5_ib_mr *mr); + int mlx5_ib_check_mr_status(struct ib_mr *ibmr, u32 check_mask, struct ib_mr_status *mr_status); struct ib_wq *mlx5_ib_create_wq(struct ib_pd *pd, diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index 1e91f61efa8a..199f7959aaa5 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -50,7 +50,6 @@ enum { static void clean_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr); static void dereg_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr); static int mr_cache_max_order(struct mlx5_ib_dev *dev); -static int unreg_umr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr); static bool umr_can_use_indirect_mkey(struct mlx5_ib_dev *dev) { @@ -495,7 +494,7 @@ void mlx5_mr_cache_free(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) c = order2idx(dev, mr->order); WARN_ON(c < 0 || c >= MAX_MR_CACHE_ENTRIES); - if (unreg_umr(dev, mr)) { + if (mlx5_mr_cache_invalidate(mr)) { mr->allocated_from_cache = false; destroy_mkey(dev, mr); ent = &cache->ent[c]; @@ -1333,22 +1332,29 @@ error: return ERR_PTR(err); } -static int unreg_umr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) +/** + * mlx5_mr_cache_invalidate - Fence all DMA on the MR + * @mr: The MR to fence + * + * Upon return the NIC will not be doing any DMA to the pages under the MR, + * and any DMA inprogress will be completed. Failure of this function + * indicates the HW has failed catastrophically. + */ +int mlx5_mr_cache_invalidate(struct mlx5_ib_mr *mr) { - struct mlx5_core_dev *mdev = dev->mdev; struct mlx5_umr_wr umrwr = {}; - if (mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) + if (mr->dev->mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) return 0; umrwr.wr.send_flags = MLX5_IB_SEND_UMR_DISABLE_MR | MLX5_IB_SEND_UMR_UPDATE_PD_ACCESS; umrwr.wr.opcode = MLX5_IB_WR_UMR; - umrwr.pd = dev->umrc.pd; + umrwr.pd = mr->dev->umrc.pd; umrwr.mkey = mr->mmkey.key; umrwr.ignore_free_state = 1; - return mlx5_ib_post_send_wait(dev, &umrwr); + return mlx5_ib_post_send_wait(mr->dev, &umrwr); } static int rereg_umr(struct ib_pd *pd, struct mlx5_ib_mr *mr, @@ -1432,7 +1438,7 @@ int mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start, * UMR can't be used - MKey needs to be replaced. */ if (mr->allocated_from_cache) - err = unreg_umr(dev, mr); + err = mlx5_mr_cache_invalidate(mr); else err = destroy_mkey(dev, mr); if (err) @@ -1561,52 +1567,20 @@ static void dereg_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) int npages = mr->npages; struct ib_umem *umem = mr->umem; - if (is_odp_mr(mr)) { - struct ib_umem_odp *umem_odp = to_ib_umem_odp(umem); - - /* Prevent new page faults and - * prefetch requests from succeeding - */ - xa_erase(&dev->odp_mkeys, mlx5_base_mkey(mr->mmkey.key)); - - /* Wait for all running page-fault handlers to finish. */ - synchronize_srcu(&dev->odp_srcu); - - /* dequeue pending prefetch requests for the mr */ - if (atomic_read(&mr->num_deferred_work)) { - flush_workqueue(system_unbound_wq); - WARN_ON(atomic_read(&mr->num_deferred_work)); - } - - /* Destroy all page mappings */ - mlx5_ib_invalidate_range(umem_odp, ib_umem_start(umem_odp), - ib_umem_end(umem_odp)); - - /* - * We kill the umem before the MR for ODP, - * so that there will not be any invalidations in - * flight, looking at the *mr struct. - */ - ib_umem_odp_release(umem_odp); - atomic_sub(npages, &dev->mdev->priv.reg_pages); - - /* Avoid double-freeing the umem. */ - umem = NULL; - } + /* Stop all DMA */ + if (is_odp_mr(mr)) + mlx5_ib_fence_odp_mr(mr); + else + clean_mr(dev, mr); - clean_mr(dev, mr); + if (mr->allocated_from_cache) + mlx5_mr_cache_free(dev, mr); + else + kfree(mr); - /* - * We should unregister the DMA address from the HCA before - * remove the DMA mapping. - */ - mlx5_mr_cache_free(dev, mr); ib_umem_release(umem); - if (umem) - atomic_sub(npages, &dev->mdev->priv.reg_pages); + atomic_sub(npages, &dev->mdev->priv.reg_pages); - if (!mr->allocated_from_cache) - kfree(mr); } int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata) diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c index 06e24d5e7609..bcfc09846697 100644 --- a/drivers/infiniband/hw/mlx5/odp.c +++ b/drivers/infiniband/hw/mlx5/odp.c @@ -144,6 +144,27 @@ void mlx5_odp_populate_klm(struct mlx5_klm *pklm, size_t idx, size_t nentries, } } +static void dma_fence_odp_mr(struct mlx5_ib_mr *mr) +{ + struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem); + + /* Ensure mlx5_ib_invalidate_range() will not touch the MR any more */ + mutex_lock(&odp->umem_mutex); + if (odp->npages) { + mlx5_mr_cache_invalidate(mr); + ib_umem_odp_unmap_dma_pages(odp, ib_umem_start(odp), + ib_umem_end(odp)); + WARN_ON(odp->npages); + } + odp->private = NULL; + mutex_unlock(&odp->umem_mutex); + + if (!mr->allocated_from_cache) { + mlx5_core_destroy_mkey(mr->dev->mdev, &mr->mmkey); + WARN_ON(mr->descs); + } +} + /* * This must be called after the mr has been removed from implicit_children * and the SRCU synchronized. NOTE: The MR does not necessarily have to be @@ -171,6 +192,8 @@ static void free_implicit_child_mr(struct mlx5_ib_mr *mr, bool need_imr_xlt) srcu_read_unlock(&mr->dev->odp_srcu, srcu_key); } + dma_fence_odp_mr(mr); + mr->parent = NULL; mlx5_mr_cache_free(mr->dev, mr); ib_umem_odp_release(odp); @@ -228,16 +251,15 @@ void mlx5_ib_invalidate_range(struct ib_umem_odp *umem_odp, unsigned long start, int in_block = 0; u64 addr; - if (!umem_odp) { - pr_err("invalidation called on NULL umem or non-ODP umem\n"); - return; - } - + mutex_lock(&umem_odp->umem_mutex); + /* + * If npages is zero then umem_odp->private may not be setup yet. This + * does not complete until after the first page is mapped for DMA. + */ + if (!umem_odp->npages) + goto out; mr = umem_odp->private; - if (!mr || !mr->ibmr.pd) - return; - start = max_t(u64, ib_umem_start(umem_odp), start); end = min_t(u64, ib_umem_end(umem_odp), end); @@ -247,7 +269,6 @@ void mlx5_ib_invalidate_range(struct ib_umem_odp *umem_odp, unsigned long start, * overwrite the same MTTs. Concurent invalidations might race us, * but they will write 0s as well, so no difference in the end result. */ - mutex_lock(&umem_odp->umem_mutex); for (addr = start; addr < end; addr += BIT(umem_odp->page_shift)) { idx = (addr - ib_umem_start(umem_odp)) >> umem_odp->page_shift; /* @@ -289,6 +310,7 @@ void mlx5_ib_invalidate_range(struct ib_umem_odp *umem_odp, unsigned long start, if (unlikely(!umem_odp->npages && mr->parent)) destroy_unused_implicit_child_mr(mr); +out: mutex_unlock(&umem_odp->umem_mutex); } @@ -536,6 +558,13 @@ void mlx5_ib_free_implicit_mr(struct mlx5_ib_mr *imr) WARN_ON(atomic_read(&imr->num_deferred_work)); } + /* + * Fence the imr before we destroy the children. This allows us to + * skip updating the XLT of the imr during destroy of the child mkey + * the imr points to. + */ + mlx5_mr_cache_invalidate(imr); + list_for_each_entry_safe (mtt, tmp, &destroy_list, odp_destroy.elm) free_implicit_child_mr(mtt, false); @@ -543,6 +572,29 @@ void mlx5_ib_free_implicit_mr(struct mlx5_ib_mr *imr) ib_umem_odp_release(odp_imr); } +/** + * mlx5_ib_fence_odp_mr - Stop all access to the ODP MR + * @mr: to fence + * + * On return no parallel threads will be touching this MR and no DMA will be + * active. + */ +void mlx5_ib_fence_odp_mr(struct mlx5_ib_mr *mr) +{ + /* Prevent new page faults and prefetch requests from succeeding */ + xa_erase(&mr->dev->odp_mkeys, mlx5_base_mkey(mr->mmkey.key)); + + /* Wait for all running page-fault handlers to finish. */ + synchronize_srcu(&mr->dev->odp_srcu); + + if (atomic_read(&mr->num_deferred_work)) { + flush_workqueue(system_unbound_wq); + WARN_ON(atomic_read(&mr->num_deferred_work)); + } + + dma_fence_odp_mr(mr); +} + #define MLX5_PF_FLAGS_DOWNGRADE BIT(1) static int pagefault_real_mr(struct mlx5_ib_mr *mr, struct ib_umem_odp *odp, u64 user_va, size_t bcnt, u32 *bytes_mapped, -- cgit v1.2.3-59-g8ed1b From 46870b2391d5163e84180e051fdabadd502d8b44 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 9 Oct 2019 13:09:35 -0300 Subject: RDMA/odp: Remove broken debugging call to invalidate_range invalidate_range() also obtains the umem_mutex which is being held at this point, so if this path were was ever called it would deadlock. Thus conclude the debugging never triggers and rework it into a simple WARN_ON and leave things as they are. While here add a note to explain how we could possibly get inconsistent page pointers. Link: https://lore.kernel.org/r/20191009160934.3143-16-jgg@ziepe.ca Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/umem_odp.c | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c index 163ff7ba92b7..d7d5fadf0899 100644 --- a/drivers/infiniband/core/umem_odp.c +++ b/drivers/infiniband/core/umem_odp.c @@ -508,7 +508,6 @@ static int ib_umem_odp_map_dma_single_page( { struct ib_device *dev = umem_odp->umem.ibdev; dma_addr_t dma_addr; - int remove_existing_mapping = 0; int ret = 0; /* @@ -534,28 +533,29 @@ static int ib_umem_odp_map_dma_single_page( } else if (umem_odp->page_list[page_index] == page) { umem_odp->dma_list[page_index] |= access_mask; } else { - pr_err("error: got different pages in IB device and from get_user_pages. IB device page: %p, gup page: %p\n", - umem_odp->page_list[page_index], page); - /* Better remove the mapping now, to prevent any further - * damage. */ - remove_existing_mapping = 1; + /* + * This is a race here where we could have done: + * + * CPU0 CPU1 + * get_user_pages() + * invalidate() + * page_fault() + * mutex_lock(umem_mutex) + * page from GUP != page in ODP + * + * It should be prevented by the retry test above as reading + * the seq number should be reliable under the + * umem_mutex. Thus something is really not working right if + * things get here. + */ + WARN(true, + "Got different pages in IB device and from get_user_pages. IB device page: %p, gup page: %p\n", + umem_odp->page_list[page_index], page); + ret = -EAGAIN; } out: put_user_page(page); - - if (remove_existing_mapping) { - ib_umem_notifier_start_account(umem_odp); - dev->ops.invalidate_range( - umem_odp, - ib_umem_start(umem_odp) + - (page_index << umem_odp->page_shift), - ib_umem_start(umem_odp) + - ((page_index + 1) << umem_odp->page_shift)); - ib_umem_notifier_end_account(umem_odp); - ret = -EAGAIN; - } - return ret; } -- cgit v1.2.3-59-g8ed1b