From c320e527e1548305f31d95ec405140b04aed25f5 Mon Sep 17 00:00:00 2001 From: Moni Shoua Date: Wed, 15 Jan 2020 14:43:31 +0200 Subject: IB: Allow calls to ib_umem_get from kernel ULPs So far the assumption was that ib_umem_get() and ib_umem_odp_get() are called from flows that start in UVERBS and therefore has a user context. This assumption restricts flows that are initiated by ULPs and need the service that ib_umem_get() provides. This patch changes ib_umem_get() and ib_umem_odp_get() to get IB device directly by relying on the fact that both UVERBS and ULPs sets that field correctly. Reviewed-by: Guy Levi Signed-off-by: Moni Shoua Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/umem.c | 27 +++++++++---------------- drivers/infiniband/core/umem_odp.c | 29 +++++++-------------------- drivers/infiniband/hw/bnxt_re/ib_verbs.c | 12 ++++++----- drivers/infiniband/hw/cxgb4/mem.c | 2 +- drivers/infiniband/hw/efa/efa_verbs.c | 2 +- drivers/infiniband/hw/hns/hns_roce_cq.c | 2 +- drivers/infiniband/hw/hns/hns_roce_db.c | 3 ++- drivers/infiniband/hw/hns/hns_roce_mr.c | 4 ++-- drivers/infiniband/hw/hns/hns_roce_qp.c | 2 +- drivers/infiniband/hw/hns/hns_roce_srq.c | 5 +++-- drivers/infiniband/hw/i40iw/i40iw_verbs.c | 2 +- drivers/infiniband/hw/mlx4/cq.c | 2 +- drivers/infiniband/hw/mlx4/doorbell.c | 3 ++- drivers/infiniband/hw/mlx4/mr.c | 8 ++++---- drivers/infiniband/hw/mlx4/qp.c | 5 +++-- drivers/infiniband/hw/mlx4/srq.c | 3 ++- drivers/infiniband/hw/mlx5/cq.c | 6 +++--- drivers/infiniband/hw/mlx5/devx.c | 2 +- drivers/infiniband/hw/mlx5/doorbell.c | 3 ++- drivers/infiniband/hw/mlx5/mr.c | 18 ++++++++--------- drivers/infiniband/hw/mlx5/odp.c | 2 +- drivers/infiniband/hw/mlx5/qp.c | 4 ++-- drivers/infiniband/hw/mlx5/srq.c | 2 +- drivers/infiniband/hw/mthca/mthca_provider.c | 2 +- drivers/infiniband/hw/ocrdma/ocrdma_verbs.c | 2 +- drivers/infiniband/hw/qedr/verbs.c | 9 ++++----- drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c | 2 +- drivers/infiniband/hw/vmw_pvrdma/pvrdma_mr.c | 2 +- drivers/infiniband/hw/vmw_pvrdma/pvrdma_qp.c | 7 ++++--- drivers/infiniband/hw/vmw_pvrdma/pvrdma_srq.c | 2 +- drivers/infiniband/sw/rdmavt/mr.c | 2 +- drivers/infiniband/sw/rxe/rxe_mr.c | 2 +- include/rdma/ib_umem.h | 4 ++-- include/rdma/ib_umem_odp.h | 6 +++--- 34 files changed, 85 insertions(+), 103 deletions(-) diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c index 7a3b99597ead..146f98fbf22b 100644 --- a/drivers/infiniband/core/umem.c +++ b/drivers/infiniband/core/umem.c @@ -181,15 +181,14 @@ EXPORT_SYMBOL(ib_umem_find_best_pgsz); /** * ib_umem_get - Pin and DMA map userspace memory. * - * @udata: userspace context to pin memory for + * @device: IB device to connect UMEM * @addr: userspace virtual address to start at * @size: length of region to pin * @access: IB_ACCESS_xxx flags for memory being pinned */ -struct ib_umem *ib_umem_get(struct ib_udata *udata, unsigned long addr, +struct ib_umem *ib_umem_get(struct ib_device *device, unsigned long addr, size_t size, int access) { - struct ib_ucontext *context; struct ib_umem *umem; struct page **page_list; unsigned long lock_limit; @@ -201,14 +200,6 @@ struct ib_umem *ib_umem_get(struct ib_udata *udata, unsigned long addr, struct scatterlist *sg; unsigned int gup_flags = FOLL_WRITE; - if (!udata) - return ERR_PTR(-EIO); - - context = container_of(udata, struct uverbs_attr_bundle, driver_udata) - ->context; - if (!context) - return ERR_PTR(-EIO); - /* * If the combination of the addr and size requested for this memory * region causes an integer overflow, return error. @@ -226,7 +217,7 @@ struct ib_umem *ib_umem_get(struct ib_udata *udata, unsigned long addr, umem = kzalloc(sizeof(*umem), GFP_KERNEL); if (!umem) return ERR_PTR(-ENOMEM); - umem->ibdev = context->device; + umem->ibdev = device; umem->length = size; umem->address = addr; umem->writable = ib_access_writable(access); @@ -281,7 +272,7 @@ struct ib_umem *ib_umem_get(struct ib_udata *udata, unsigned long addr, npages -= ret; sg = ib_umem_add_sg_table(sg, page_list, ret, - dma_get_max_seg_size(context->device->dma_device), + dma_get_max_seg_size(device->dma_device), &umem->sg_nents); up_read(&mm->mmap_sem); @@ -289,10 +280,10 @@ struct ib_umem *ib_umem_get(struct ib_udata *udata, unsigned long addr, sg_mark_end(sg); - umem->nmap = ib_dma_map_sg(context->device, - umem->sg_head.sgl, - umem->sg_nents, - DMA_BIDIRECTIONAL); + umem->nmap = ib_dma_map_sg(device, + umem->sg_head.sgl, + umem->sg_nents, + DMA_BIDIRECTIONAL); if (!umem->nmap) { ret = -ENOMEM; @@ -303,7 +294,7 @@ struct ib_umem *ib_umem_get(struct ib_udata *udata, unsigned long addr, goto out; umem_release: - __ib_umem_release(context->device, umem, 0); + __ib_umem_release(device, umem, 0); vma: atomic64_sub(ib_umem_num_pages(umem), &mm->pinned_vm); out: diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c index e42d44e501fd..dac3fd2ebc26 100644 --- a/drivers/infiniband/core/umem_odp.c +++ b/drivers/infiniband/core/umem_odp.c @@ -110,15 +110,12 @@ out_page_list: * They exist only to hold the per_mm reference to help the driver create * children umems. * - * @udata: udata from the syscall being used to create the umem + * @device: IB device to create UMEM * @access: ib_reg_mr access flags */ -struct ib_umem_odp *ib_umem_odp_alloc_implicit(struct ib_udata *udata, +struct ib_umem_odp *ib_umem_odp_alloc_implicit(struct ib_device *device, int access) { - struct ib_ucontext *context = - container_of(udata, struct uverbs_attr_bundle, driver_udata) - ->context; struct ib_umem *umem; struct ib_umem_odp *umem_odp; int ret; @@ -126,14 +123,11 @@ struct ib_umem_odp *ib_umem_odp_alloc_implicit(struct ib_udata *udata, if (access & IB_ACCESS_HUGETLB) return ERR_PTR(-EINVAL); - if (!context) - return ERR_PTR(-EIO); - umem_odp = kzalloc(sizeof(*umem_odp), GFP_KERNEL); if (!umem_odp) return ERR_PTR(-ENOMEM); umem = &umem_odp->umem; - umem->ibdev = context->device; + umem->ibdev = device; umem->writable = ib_access_writable(access); umem->owning_mm = current->mm; umem_odp->is_implicit_odp = 1; @@ -201,7 +195,7 @@ EXPORT_SYMBOL(ib_umem_odp_alloc_child); /** * ib_umem_odp_get - Create a umem_odp for a userspace va * - * @udata: userspace context to pin memory for + * @device: IB device struct to get UMEM * @addr: userspace virtual address to start at * @size: length of region to pin * @access: IB_ACCESS_xxx flags for memory being pinned @@ -210,23 +204,14 @@ EXPORT_SYMBOL(ib_umem_odp_alloc_child); * pinning, instead, stores the mm for future page fault handling in * conjunction with MMU notifiers. */ -struct ib_umem_odp *ib_umem_odp_get(struct ib_udata *udata, unsigned long addr, - size_t size, int access, +struct ib_umem_odp *ib_umem_odp_get(struct ib_device *device, + unsigned long addr, size_t size, int access, const struct mmu_interval_notifier_ops *ops) { struct ib_umem_odp *umem_odp; - struct ib_ucontext *context; struct mm_struct *mm; int ret; - if (!udata) - return ERR_PTR(-EIO); - - context = container_of(udata, struct uverbs_attr_bundle, driver_udata) - ->context; - if (!context) - return ERR_PTR(-EIO); - if (WARN_ON_ONCE(!(access & IB_ACCESS_ON_DEMAND))) return ERR_PTR(-EINVAL); @@ -234,7 +219,7 @@ struct ib_umem_odp *ib_umem_odp_get(struct ib_udata *udata, unsigned long addr, if (!umem_odp) return ERR_PTR(-ENOMEM); - umem_odp->umem.ibdev = context->device; + umem_odp->umem.ibdev = device; umem_odp->umem.length = size; umem_odp->umem.address = addr; umem_odp->umem.writable = ib_access_writable(access); diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c index ad5112a2325f..52b6a4d85460 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c @@ -837,7 +837,8 @@ static int bnxt_re_init_user_qp(struct bnxt_re_dev *rdev, struct bnxt_re_pd *pd, bytes += (qplib_qp->sq.max_wqe * psn_sz); } bytes = PAGE_ALIGN(bytes); - umem = ib_umem_get(udata, ureq.qpsva, bytes, IB_ACCESS_LOCAL_WRITE); + umem = ib_umem_get(&rdev->ibdev, ureq.qpsva, bytes, + IB_ACCESS_LOCAL_WRITE); if (IS_ERR(umem)) return PTR_ERR(umem); @@ -850,7 +851,7 @@ static int bnxt_re_init_user_qp(struct bnxt_re_dev *rdev, struct bnxt_re_pd *pd, if (!qp->qplib_qp.srq) { bytes = (qplib_qp->rq.max_wqe * BNXT_QPLIB_MAX_RQE_ENTRY_SIZE); bytes = PAGE_ALIGN(bytes); - umem = ib_umem_get(udata, ureq.qprva, bytes, + umem = ib_umem_get(&rdev->ibdev, ureq.qprva, bytes, IB_ACCESS_LOCAL_WRITE); if (IS_ERR(umem)) goto rqfail; @@ -1304,7 +1305,8 @@ static int bnxt_re_init_user_srq(struct bnxt_re_dev *rdev, bytes = (qplib_srq->max_wqe * BNXT_QPLIB_MAX_RQE_ENTRY_SIZE); bytes = PAGE_ALIGN(bytes); - umem = ib_umem_get(udata, ureq.srqva, bytes, IB_ACCESS_LOCAL_WRITE); + umem = ib_umem_get(&rdev->ibdev, ureq.srqva, bytes, + IB_ACCESS_LOCAL_WRITE); if (IS_ERR(umem)) return PTR_ERR(umem); @@ -2545,7 +2547,7 @@ int bnxt_re_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, goto fail; } - cq->umem = ib_umem_get(udata, req.cq_va, + cq->umem = ib_umem_get(&rdev->ibdev, req.cq_va, entries * sizeof(struct cq_base), IB_ACCESS_LOCAL_WRITE); if (IS_ERR(cq->umem)) { @@ -3514,7 +3516,7 @@ struct ib_mr *bnxt_re_reg_user_mr(struct ib_pd *ib_pd, u64 start, u64 length, /* The fixed portion of the rkey is the same as the lkey */ mr->ib_mr.rkey = mr->qplib_mr.rkey; - umem = ib_umem_get(udata, start, length, mr_access_flags); + umem = ib_umem_get(&rdev->ibdev, start, length, mr_access_flags); if (IS_ERR(umem)) { dev_err(rdev_to_dev(rdev), "Failed to get umem"); rc = -EFAULT; diff --git a/drivers/infiniband/hw/cxgb4/mem.c b/drivers/infiniband/hw/cxgb4/mem.c index fe3a7e8561df..962dc97a8ff2 100644 --- a/drivers/infiniband/hw/cxgb4/mem.c +++ b/drivers/infiniband/hw/cxgb4/mem.c @@ -543,7 +543,7 @@ struct ib_mr *c4iw_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, mhp->rhp = rhp; - mhp->umem = ib_umem_get(udata, start, length, acc); + mhp->umem = ib_umem_get(pd->device, start, length, acc); if (IS_ERR(mhp->umem)) goto err_free_skb; diff --git a/drivers/infiniband/hw/efa/efa_verbs.c b/drivers/infiniband/hw/efa/efa_verbs.c index 50c22575aed6..5c35aa72f515 100644 --- a/drivers/infiniband/hw/efa/efa_verbs.c +++ b/drivers/infiniband/hw/efa/efa_verbs.c @@ -1384,7 +1384,7 @@ struct ib_mr *efa_reg_mr(struct ib_pd *ibpd, u64 start, u64 length, goto err_out; } - mr->umem = ib_umem_get(udata, start, length, access_flags); + mr->umem = ib_umem_get(ibpd->device, start, length, access_flags); if (IS_ERR(mr->umem)) { err = PTR_ERR(mr->umem); ibdev_dbg(&dev->ibdev, diff --git a/drivers/infiniband/hw/hns/hns_roce_cq.c b/drivers/infiniband/hw/hns/hns_roce_cq.c index af1d8823b3f0..a2d1e5331bf1 100644 --- a/drivers/infiniband/hw/hns/hns_roce_cq.c +++ b/drivers/infiniband/hw/hns/hns_roce_cq.c @@ -163,7 +163,7 @@ static int get_cq_umem(struct hns_roce_dev *hr_dev, struct hns_roce_cq *hr_cq, u32 npages; int ret; - *umem = ib_umem_get(udata, ucmd.buf_addr, buf->size, + *umem = ib_umem_get(&hr_dev->ib_dev, ucmd.buf_addr, buf->size, IB_ACCESS_LOCAL_WRITE); if (IS_ERR(*umem)) return PTR_ERR(*umem); diff --git a/drivers/infiniband/hw/hns/hns_roce_db.c b/drivers/infiniband/hw/hns/hns_roce_db.c index 10af6958ab69..bff6abdccfb0 100644 --- a/drivers/infiniband/hw/hns/hns_roce_db.c +++ b/drivers/infiniband/hw/hns/hns_roce_db.c @@ -31,7 +31,8 @@ int hns_roce_db_map_user(struct hns_roce_ucontext *context, refcount_set(&page->refcount, 1); page->user_virt = page_addr; - page->umem = ib_umem_get(udata, page_addr, PAGE_SIZE, 0); + page->umem = ib_umem_get(context->ibucontext.device, page_addr, + PAGE_SIZE, 0); if (IS_ERR(page->umem)) { ret = PTR_ERR(page->umem); kfree(page); diff --git a/drivers/infiniband/hw/hns/hns_roce_mr.c b/drivers/infiniband/hw/hns/hns_roce_mr.c index 9ad19170c3f9..3ff610549c74 100644 --- a/drivers/infiniband/hw/hns/hns_roce_mr.c +++ b/drivers/infiniband/hw/hns/hns_roce_mr.c @@ -1145,7 +1145,7 @@ struct ib_mr *hns_roce_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, if (!mr) return ERR_PTR(-ENOMEM); - mr->umem = ib_umem_get(udata, start, length, access_flags); + mr->umem = ib_umem_get(pd->device, start, length, access_flags); if (IS_ERR(mr->umem)) { ret = PTR_ERR(mr->umem); goto err_free; @@ -1230,7 +1230,7 @@ static int rereg_mr_trans(struct ib_mr *ibmr, int flags, } ib_umem_release(mr->umem); - mr->umem = ib_umem_get(udata, start, length, mr_access_flags); + mr->umem = ib_umem_get(ibmr->device, start, length, mr_access_flags); if (IS_ERR(mr->umem)) { ret = PTR_ERR(mr->umem); mr->umem = NULL; diff --git a/drivers/infiniband/hw/hns/hns_roce_qp.c b/drivers/infiniband/hw/hns/hns_roce_qp.c index a6565b674801..eb2ee6a581aa 100644 --- a/drivers/infiniband/hw/hns/hns_roce_qp.c +++ b/drivers/infiniband/hw/hns/hns_roce_qp.c @@ -744,7 +744,7 @@ static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev, goto err_alloc_rq_inline_buf; } - hr_qp->umem = ib_umem_get(udata, ucmd.buf_addr, + hr_qp->umem = ib_umem_get(ib_pd->device, ucmd.buf_addr, hr_qp->buff_size, 0); if (IS_ERR(hr_qp->umem)) { dev_err(dev, "ib_umem_get error for create qp\n"); diff --git a/drivers/infiniband/hw/hns/hns_roce_srq.c b/drivers/infiniband/hw/hns/hns_roce_srq.c index 7113ebfdb4f0..c6d5f06f9cde 100644 --- a/drivers/infiniband/hw/hns/hns_roce_srq.c +++ b/drivers/infiniband/hw/hns/hns_roce_srq.c @@ -186,7 +186,8 @@ static int create_user_srq(struct hns_roce_srq *srq, struct ib_udata *udata, if (ib_copy_from_udata(&ucmd, udata, sizeof(ucmd))) return -EFAULT; - srq->umem = ib_umem_get(udata, ucmd.buf_addr, srq_buf_size, 0); + srq->umem = + ib_umem_get(srq->ibsrq.device, ucmd.buf_addr, srq_buf_size, 0); if (IS_ERR(srq->umem)) return PTR_ERR(srq->umem); @@ -205,7 +206,7 @@ static int create_user_srq(struct hns_roce_srq *srq, struct ib_udata *udata, goto err_user_srq_mtt; /* config index queue BA */ - srq->idx_que.umem = ib_umem_get(udata, ucmd.que_addr, + srq->idx_que.umem = ib_umem_get(srq->ibsrq.device, ucmd.que_addr, srq->idx_que.buf_size, 0); if (IS_ERR(srq->idx_que.umem)) { dev_err(hr_dev->dev, "ib_umem_get error for index queue\n"); diff --git a/drivers/infiniband/hw/i40iw/i40iw_verbs.c b/drivers/infiniband/hw/i40iw/i40iw_verbs.c index dbd96d029d8b..96488fb443eb 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_verbs.c +++ b/drivers/infiniband/hw/i40iw/i40iw_verbs.c @@ -1761,7 +1761,7 @@ static struct ib_mr *i40iw_reg_user_mr(struct ib_pd *pd, if (length > I40IW_MAX_MR_SIZE) return ERR_PTR(-EINVAL); - region = ib_umem_get(udata, start, length, acc); + region = ib_umem_get(pd->device, start, length, acc); if (IS_ERR(region)) return (struct ib_mr *)region; diff --git a/drivers/infiniband/hw/mlx4/cq.c b/drivers/infiniband/hw/mlx4/cq.c index 306b21281fa2..a57033d4b0e5 100644 --- a/drivers/infiniband/hw/mlx4/cq.c +++ b/drivers/infiniband/hw/mlx4/cq.c @@ -144,7 +144,7 @@ static int mlx4_ib_get_cq_umem(struct mlx4_ib_dev *dev, struct ib_udata *udata, int shift; int n; - *umem = ib_umem_get(udata, buf_addr, cqe * cqe_size, + *umem = ib_umem_get(&dev->ib_dev, buf_addr, cqe * cqe_size, IB_ACCESS_LOCAL_WRITE); if (IS_ERR(*umem)) return PTR_ERR(*umem); diff --git a/drivers/infiniband/hw/mlx4/doorbell.c b/drivers/infiniband/hw/mlx4/doorbell.c index 714f9df5bf39..d41f03ccb0e1 100644 --- a/drivers/infiniband/hw/mlx4/doorbell.c +++ b/drivers/infiniband/hw/mlx4/doorbell.c @@ -64,7 +64,8 @@ int mlx4_ib_db_map_user(struct ib_udata *udata, unsigned long virt, page->user_virt = (virt & PAGE_MASK); page->refcnt = 0; - page->umem = ib_umem_get(udata, virt & PAGE_MASK, PAGE_SIZE, 0); + page->umem = ib_umem_get(context->ibucontext.device, virt & PAGE_MASK, + PAGE_SIZE, 0); if (IS_ERR(page->umem)) { err = PTR_ERR(page->umem); kfree(page); diff --git a/drivers/infiniband/hw/mlx4/mr.c b/drivers/infiniband/hw/mlx4/mr.c index dfa17bcdcdbc..b0121c90c561 100644 --- a/drivers/infiniband/hw/mlx4/mr.c +++ b/drivers/infiniband/hw/mlx4/mr.c @@ -367,7 +367,7 @@ end: return block_shift; } -static struct ib_umem *mlx4_get_umem_mr(struct ib_udata *udata, u64 start, +static struct ib_umem *mlx4_get_umem_mr(struct ib_device *device, u64 start, u64 length, int access_flags) { /* @@ -398,7 +398,7 @@ static struct ib_umem *mlx4_get_umem_mr(struct ib_udata *udata, u64 start, up_read(¤t->mm->mmap_sem); } - return ib_umem_get(udata, start, length, access_flags); + return ib_umem_get(device, start, length, access_flags); } struct ib_mr *mlx4_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, @@ -415,7 +415,7 @@ struct ib_mr *mlx4_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, if (!mr) return ERR_PTR(-ENOMEM); - mr->umem = mlx4_get_umem_mr(udata, start, length, access_flags); + mr->umem = mlx4_get_umem_mr(pd->device, start, length, access_flags); if (IS_ERR(mr->umem)) { err = PTR_ERR(mr->umem); goto err_free; @@ -504,7 +504,7 @@ int mlx4_ib_rereg_user_mr(struct ib_mr *mr, int flags, mlx4_mr_rereg_mem_cleanup(dev->dev, &mmr->mmr); ib_umem_release(mmr->umem); - mmr->umem = mlx4_get_umem_mr(udata, start, length, + mmr->umem = mlx4_get_umem_mr(mr->device, start, length, mr_access_flags); if (IS_ERR(mmr->umem)) { err = PTR_ERR(mmr->umem); diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c index 85f57b76e446..cb23cdb9389a 100644 --- a/drivers/infiniband/hw/mlx4/qp.c +++ b/drivers/infiniband/hw/mlx4/qp.c @@ -916,7 +916,7 @@ static int create_rq(struct ib_pd *pd, struct ib_qp_init_attr *init_attr, qp->buf_size = (qp->rq.wqe_cnt << qp->rq.wqe_shift) + (qp->sq.wqe_cnt << qp->sq.wqe_shift); - qp->umem = ib_umem_get(udata, wq.buf_addr, qp->buf_size, 0); + qp->umem = ib_umem_get(pd->device, wq.buf_addr, qp->buf_size, 0); if (IS_ERR(qp->umem)) { err = PTR_ERR(qp->umem); goto err; @@ -1110,7 +1110,8 @@ static int create_qp_common(struct ib_pd *pd, struct ib_qp_init_attr *init_attr, if (err) goto err; - qp->umem = ib_umem_get(udata, ucmd.buf_addr, qp->buf_size, 0); + qp->umem = + ib_umem_get(pd->device, ucmd.buf_addr, qp->buf_size, 0); if (IS_ERR(qp->umem)) { err = PTR_ERR(qp->umem); goto err; diff --git a/drivers/infiniband/hw/mlx4/srq.c b/drivers/infiniband/hw/mlx4/srq.c index 8dcf6e3d9ae2..8f9d5035142d 100644 --- a/drivers/infiniband/hw/mlx4/srq.c +++ b/drivers/infiniband/hw/mlx4/srq.c @@ -110,7 +110,8 @@ int mlx4_ib_create_srq(struct ib_srq *ib_srq, if (ib_copy_from_udata(&ucmd, udata, sizeof(ucmd))) return -EFAULT; - srq->umem = ib_umem_get(udata, ucmd.buf_addr, buf_size, 0); + srq->umem = + ib_umem_get(ib_srq->device, ucmd.buf_addr, buf_size, 0); if (IS_ERR(srq->umem)) return PTR_ERR(srq->umem); diff --git a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c index dd8d24ee8e1d..367a71bc5f4b 100644 --- a/drivers/infiniband/hw/mlx5/cq.c +++ b/drivers/infiniband/hw/mlx5/cq.c @@ -708,8 +708,8 @@ static int create_cq_user(struct mlx5_ib_dev *dev, struct ib_udata *udata, *cqe_size = ucmd.cqe_size; cq->buf.umem = - ib_umem_get(udata, ucmd.buf_addr, entries * ucmd.cqe_size, - IB_ACCESS_LOCAL_WRITE); + ib_umem_get(&dev->ib_dev, ucmd.buf_addr, + entries * ucmd.cqe_size, IB_ACCESS_LOCAL_WRITE); if (IS_ERR(cq->buf.umem)) { err = PTR_ERR(cq->buf.umem); return err; @@ -1108,7 +1108,7 @@ static int resize_user(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *cq, if (ucmd.cqe_size && SIZE_MAX / ucmd.cqe_size <= entries - 1) return -EINVAL; - umem = ib_umem_get(udata, ucmd.buf_addr, + umem = ib_umem_get(&dev->ib_dev, ucmd.buf_addr, (size_t)ucmd.cqe_size * entries, IB_ACCESS_LOCAL_WRITE); if (IS_ERR(umem)) { diff --git a/drivers/infiniband/hw/mlx5/devx.c b/drivers/infiniband/hw/mlx5/devx.c index 9d0a18cf9e5e..685b8ed96b4e 100644 --- a/drivers/infiniband/hw/mlx5/devx.c +++ b/drivers/infiniband/hw/mlx5/devx.c @@ -2134,7 +2134,7 @@ static int devx_umem_get(struct mlx5_ib_dev *dev, struct ib_ucontext *ucontext, if (err) return err; - obj->umem = ib_umem_get(&attrs->driver_udata, addr, size, access); + obj->umem = ib_umem_get(&dev->ib_dev, addr, size, access); if (IS_ERR(obj->umem)) return PTR_ERR(obj->umem); diff --git a/drivers/infiniband/hw/mlx5/doorbell.c b/drivers/infiniband/hw/mlx5/doorbell.c index 12737c509aa2..61475b571531 100644 --- a/drivers/infiniband/hw/mlx5/doorbell.c +++ b/drivers/infiniband/hw/mlx5/doorbell.c @@ -64,7 +64,8 @@ int mlx5_ib_db_map_user(struct mlx5_ib_ucontext *context, page->user_virt = (virt & PAGE_MASK); page->refcnt = 0; - page->umem = ib_umem_get(udata, virt & PAGE_MASK, PAGE_SIZE, 0); + page->umem = ib_umem_get(context->ibucontext.device, virt & PAGE_MASK, + PAGE_SIZE, 0); if (IS_ERR(page->umem)) { err = PTR_ERR(page->umem); kfree(page); diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index ea8bfc3e2d8d..f79bb44b94fe 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -737,10 +737,9 @@ static int mr_cache_max_order(struct mlx5_ib_dev *dev) return MLX5_MAX_UMR_SHIFT; } -static int mr_umem_get(struct mlx5_ib_dev *dev, struct ib_udata *udata, - u64 start, u64 length, int access_flags, - struct ib_umem **umem, int *npages, int *page_shift, - int *ncont, int *order) +static int mr_umem_get(struct mlx5_ib_dev *dev, u64 start, u64 length, + int access_flags, struct ib_umem **umem, int *npages, + int *page_shift, int *ncont, int *order) { struct ib_umem *u; @@ -749,7 +748,7 @@ static int mr_umem_get(struct mlx5_ib_dev *dev, struct ib_udata *udata, if (access_flags & IB_ACCESS_ON_DEMAND) { struct ib_umem_odp *odp; - odp = ib_umem_odp_get(udata, start, length, access_flags, + odp = ib_umem_odp_get(&dev->ib_dev, start, length, access_flags, &mlx5_mn_ops); if (IS_ERR(odp)) { mlx5_ib_dbg(dev, "umem get failed (%ld)\n", @@ -765,7 +764,7 @@ static int mr_umem_get(struct mlx5_ib_dev *dev, struct ib_udata *udata, if (order) *order = ilog2(roundup_pow_of_two(*ncont)); } else { - u = ib_umem_get(udata, start, length, access_flags); + u = ib_umem_get(&dev->ib_dev, start, length, access_flags); if (IS_ERR(u)) { mlx5_ib_dbg(dev, "umem get failed (%ld)\n", PTR_ERR(u)); return PTR_ERR(u); @@ -1257,7 +1256,7 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, return &mr->ibmr; } - err = mr_umem_get(dev, udata, start, length, access_flags, &umem, + err = mr_umem_get(dev, start, length, access_flags, &umem, &npages, &page_shift, &ncont, &order); if (err < 0) @@ -1424,9 +1423,8 @@ int mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start, flags |= IB_MR_REREG_TRANS; ib_umem_release(mr->umem); mr->umem = NULL; - err = mr_umem_get(dev, udata, addr, len, access_flags, - &mr->umem, &npages, &page_shift, &ncont, - &order); + err = mr_umem_get(dev, addr, len, access_flags, &mr->umem, + &npages, &page_shift, &ncont, &order); if (err) goto err; } diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c index f924250f80c2..3b3ceb5acdd3 100644 --- a/drivers/infiniband/hw/mlx5/odp.c +++ b/drivers/infiniband/hw/mlx5/odp.c @@ -497,7 +497,7 @@ struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd, struct mlx5_ib_mr *imr; int err; - umem_odp = ib_umem_odp_alloc_implicit(udata, access_flags); + umem_odp = ib_umem_odp_alloc_implicit(&dev->ib_dev, access_flags); if (IS_ERR(umem_odp)) return ERR_CAST(umem_odp); diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 7e51870e9e01..7f0bde313560 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -749,7 +749,7 @@ static int mlx5_ib_umem_get(struct mlx5_ib_dev *dev, struct ib_udata *udata, { int err; - *umem = ib_umem_get(udata, addr, size, 0); + *umem = ib_umem_get(&dev->ib_dev, addr, size, 0); if (IS_ERR(*umem)) { mlx5_ib_dbg(dev, "umem_get failed\n"); return PTR_ERR(*umem); @@ -806,7 +806,7 @@ static int create_user_rq(struct mlx5_ib_dev *dev, struct ib_pd *pd, if (!ucmd->buf_addr) return -EINVAL; - rwq->umem = ib_umem_get(udata, ucmd->buf_addr, rwq->buf_size, 0); + rwq->umem = ib_umem_get(&dev->ib_dev, ucmd->buf_addr, rwq->buf_size, 0); if (IS_ERR(rwq->umem)) { mlx5_ib_dbg(dev, "umem_get failed\n"); err = PTR_ERR(rwq->umem); diff --git a/drivers/infiniband/hw/mlx5/srq.c b/drivers/infiniband/hw/mlx5/srq.c index 62939df3c692..b1a8a9175040 100644 --- a/drivers/infiniband/hw/mlx5/srq.c +++ b/drivers/infiniband/hw/mlx5/srq.c @@ -80,7 +80,7 @@ static int create_srq_user(struct ib_pd *pd, struct mlx5_ib_srq *srq, srq->wq_sig = !!(ucmd.flags & MLX5_SRQ_FLAG_SIGNATURE); - srq->umem = ib_umem_get(udata, ucmd.buf_addr, buf_size, 0); + srq->umem = ib_umem_get(pd->device, ucmd.buf_addr, buf_size, 0); if (IS_ERR(srq->umem)) { mlx5_ib_dbg(dev, "failed umem get, size %d\n", buf_size); err = PTR_ERR(srq->umem); diff --git a/drivers/infiniband/hw/mthca/mthca_provider.c b/drivers/infiniband/hw/mthca/mthca_provider.c index 33002530fee7..ac19d57803b5 100644 --- a/drivers/infiniband/hw/mthca/mthca_provider.c +++ b/drivers/infiniband/hw/mthca/mthca_provider.c @@ -880,7 +880,7 @@ static struct ib_mr *mthca_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, if (!mr) return ERR_PTR(-ENOMEM); - mr->umem = ib_umem_get(udata, start, length, acc); + mr->umem = ib_umem_get(pd->device, start, length, acc); if (IS_ERR(mr->umem)) { err = PTR_ERR(mr->umem); goto err; diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c index 9bc1ca6f6f9e..d47ea675734b 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c @@ -869,7 +869,7 @@ struct ib_mr *ocrdma_reg_user_mr(struct ib_pd *ibpd, u64 start, u64 len, mr = kzalloc(sizeof(*mr), GFP_KERNEL); if (!mr) return ERR_PTR(status); - mr->umem = ib_umem_get(udata, start, len, acc); + mr->umem = ib_umem_get(ibpd->device, start, len, acc); if (IS_ERR(mr->umem)) { status = -EFAULT; goto umem_err; diff --git a/drivers/infiniband/hw/qedr/verbs.c b/drivers/infiniband/hw/qedr/verbs.c index 4cd292966aa9..920f35e28cfc 100644 --- a/drivers/infiniband/hw/qedr/verbs.c +++ b/drivers/infiniband/hw/qedr/verbs.c @@ -772,7 +772,7 @@ static inline int qedr_init_user_queue(struct ib_udata *udata, q->buf_addr = buf_addr; q->buf_len = buf_len; - q->umem = ib_umem_get(udata, q->buf_addr, q->buf_len, access); + q->umem = ib_umem_get(&dev->ibdev, q->buf_addr, q->buf_len, access); if (IS_ERR(q->umem)) { DP_ERR(dev, "create user queue: failed ib_umem_get, got %ld\n", PTR_ERR(q->umem)); @@ -1415,9 +1415,8 @@ static int qedr_init_srq_user_params(struct ib_udata *udata, if (rc) return rc; - srq->prod_umem = - ib_umem_get(udata, ureq->prod_pair_addr, - sizeof(struct rdma_srq_producers), access); + srq->prod_umem = ib_umem_get(srq->ibsrq.device, ureq->prod_pair_addr, + sizeof(struct rdma_srq_producers), access); if (IS_ERR(srq->prod_umem)) { qedr_free_pbl(srq->dev, &srq->usrq.pbl_info, srq->usrq.pbl_tbl); ib_umem_release(srq->usrq.umem); @@ -2839,7 +2838,7 @@ struct ib_mr *qedr_reg_user_mr(struct ib_pd *ibpd, u64 start, u64 len, mr->type = QEDR_MR_USER; - mr->umem = ib_umem_get(udata, start, len, acc); + mr->umem = ib_umem_get(ibpd->device, start, len, acc); if (IS_ERR(mr->umem)) { rc = -EFAULT; goto err0; diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c index a26a4fd86bf4..4f6cc0de7ef9 100644 --- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c +++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c @@ -135,7 +135,7 @@ int pvrdma_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, goto err_cq; } - cq->umem = ib_umem_get(udata, ucmd.buf_addr, ucmd.buf_size, + cq->umem = ib_umem_get(ibdev, ucmd.buf_addr, ucmd.buf_size, IB_ACCESS_LOCAL_WRITE); if (IS_ERR(cq->umem)) { ret = PTR_ERR(cq->umem); diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_mr.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_mr.c index c61e665ff261..b039f1f00e05 100644 --- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_mr.c +++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_mr.c @@ -126,7 +126,7 @@ struct ib_mr *pvrdma_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, return ERR_PTR(-EINVAL); } - umem = ib_umem_get(udata, start, length, access_flags); + umem = ib_umem_get(pd->device, start, length, access_flags); if (IS_ERR(umem)) { dev_warn(&dev->pdev->dev, "could not get umem for mem region\n"); diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_qp.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_qp.c index f15809c28f67..9de1281f9a3b 100644 --- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_qp.c +++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_qp.c @@ -276,8 +276,9 @@ struct ib_qp *pvrdma_create_qp(struct ib_pd *pd, if (!is_srq) { /* set qp->sq.wqe_cnt, shift, buf_size.. */ - qp->rumem = ib_umem_get(udata, ucmd.rbuf_addr, - ucmd.rbuf_size, 0); + qp->rumem = + ib_umem_get(pd->device, ucmd.rbuf_addr, + ucmd.rbuf_size, 0); if (IS_ERR(qp->rumem)) { ret = PTR_ERR(qp->rumem); goto err_qp; @@ -288,7 +289,7 @@ struct ib_qp *pvrdma_create_qp(struct ib_pd *pd, qp->srq = to_vsrq(init_attr->srq); } - qp->sumem = ib_umem_get(udata, ucmd.sbuf_addr, + qp->sumem = ib_umem_get(pd->device, ucmd.sbuf_addr, ucmd.sbuf_size, 0); if (IS_ERR(qp->sumem)) { if (!is_srq) diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_srq.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_srq.c index 98c8be71d91d..d330decfb80a 100644 --- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_srq.c +++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_srq.c @@ -146,7 +146,7 @@ int pvrdma_create_srq(struct ib_srq *ibsrq, struct ib_srq_init_attr *init_attr, goto err_srq; } - srq->umem = ib_umem_get(udata, ucmd.buf_addr, ucmd.buf_size, 0); + srq->umem = ib_umem_get(ibsrq->device, ucmd.buf_addr, ucmd.buf_size, 0); if (IS_ERR(srq->umem)) { ret = PTR_ERR(srq->umem); goto err_srq; diff --git a/drivers/infiniband/sw/rdmavt/mr.c b/drivers/infiniband/sw/rdmavt/mr.c index b9a76bf74857..72f6534fbb52 100644 --- a/drivers/infiniband/sw/rdmavt/mr.c +++ b/drivers/infiniband/sw/rdmavt/mr.c @@ -390,7 +390,7 @@ struct ib_mr *rvt_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, if (length == 0) return ERR_PTR(-EINVAL); - umem = ib_umem_get(udata, start, length, mr_access_flags); + umem = ib_umem_get(pd->device, start, length, mr_access_flags); if (IS_ERR(umem)) return (void *)umem; diff --git a/drivers/infiniband/sw/rxe/rxe_mr.c b/drivers/infiniband/sw/rxe/rxe_mr.c index 35a2baf2f364..e83c7b518bfa 100644 --- a/drivers/infiniband/sw/rxe/rxe_mr.c +++ b/drivers/infiniband/sw/rxe/rxe_mr.c @@ -169,7 +169,7 @@ int rxe_mem_init_user(struct rxe_pd *pd, u64 start, void *vaddr; int err; - umem = ib_umem_get(udata, start, length, access); + umem = ib_umem_get(pd->ibpd.device, start, length, access); if (IS_ERR(umem)) { pr_warn("err %d from rxe_umem_get\n", (int)PTR_ERR(umem)); diff --git a/include/rdma/ib_umem.h b/include/rdma/ib_umem.h index 753f54e17e0a..e3518fd6b95b 100644 --- a/include/rdma/ib_umem.h +++ b/include/rdma/ib_umem.h @@ -69,7 +69,7 @@ static inline size_t ib_umem_num_pages(struct ib_umem *umem) #ifdef CONFIG_INFINIBAND_USER_MEM -struct ib_umem *ib_umem_get(struct ib_udata *udata, unsigned long addr, +struct ib_umem *ib_umem_get(struct ib_device *device, unsigned long addr, size_t size, int access); void ib_umem_release(struct ib_umem *umem); int ib_umem_page_count(struct ib_umem *umem); @@ -83,7 +83,7 @@ unsigned long ib_umem_find_best_pgsz(struct ib_umem *umem, #include -static inline struct ib_umem *ib_umem_get(struct ib_udata *udata, +static inline struct ib_umem *ib_umem_get(struct ib_device *device, unsigned long addr, size_t size, int access) { diff --git a/include/rdma/ib_umem_odp.h b/include/rdma/ib_umem_odp.h index 81429acc8257..64314ff76612 100644 --- a/include/rdma/ib_umem_odp.h +++ b/include/rdma/ib_umem_odp.h @@ -114,9 +114,9 @@ static inline size_t ib_umem_odp_num_pages(struct ib_umem_odp *umem_odp) #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING struct ib_umem_odp * -ib_umem_odp_get(struct ib_udata *udata, unsigned long addr, size_t size, +ib_umem_odp_get(struct ib_device *device, unsigned long addr, size_t size, int access, const struct mmu_interval_notifier_ops *ops); -struct ib_umem_odp *ib_umem_odp_alloc_implicit(struct ib_udata *udata, +struct ib_umem_odp *ib_umem_odp_alloc_implicit(struct ib_device *device, int access); struct ib_umem_odp * ib_umem_odp_alloc_child(struct ib_umem_odp *root_umem, unsigned long addr, @@ -134,7 +134,7 @@ void ib_umem_odp_unmap_dma_pages(struct ib_umem_odp *umem_odp, u64 start_offset, #else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */ static inline struct ib_umem_odp * -ib_umem_odp_get(struct ib_udata *udata, unsigned long addr, size_t size, +ib_umem_odp_get(struct ib_device *device, unsigned long addr, size_t size, int access, const struct mmu_interval_notifier_ops *ops) { return ERR_PTR(-EINVAL); -- cgit v1.2.3-59-g8ed1b From 33006bd4f37f7d2c3d1cf0268b4f327b5fdc2558 Mon Sep 17 00:00:00 2001 From: Moni Shoua Date: Wed, 15 Jan 2020 14:43:32 +0200 Subject: IB/core: Introduce ib_reg_user_mr Add ib_reg_user_mr() for kernel ULPs to register user MRs. The common use case that uses this function is a userspace application that allocates memory for HCA access but the responsibility to register the memory at the HCA is on an kernel ULP. This ULP that acts as an agent for the userspace application. This function is intended to be used without a user context so vendor drivers need to be aware of calling reg_user_mr() device operation with udata equal to NULL. Among all drivers, i40iw is the only driver which relies on presence of udata, so check udata existence for that driver. Signed-off-by: Moni Shoua Reviewed-by: Guy Levi Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/verbs.c | 30 ++++++++++++++++++++++++++++++ drivers/infiniband/hw/efa/efa_verbs.c | 2 +- drivers/infiniband/hw/i40iw/i40iw_verbs.c | 3 +++ include/rdma/ib_verbs.h | 6 ++++++ 4 files changed, 40 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index dd765e176cdd..7a69e4bbe877 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -1990,6 +1990,36 @@ EXPORT_SYMBOL(ib_resize_cq); /* Memory regions */ +struct ib_mr *ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, + u64 virt_addr, int access_flags) +{ + struct ib_mr *mr; + + if (access_flags & IB_ACCESS_ON_DEMAND) { + if (!(pd->device->attrs.device_cap_flags & + IB_DEVICE_ON_DEMAND_PAGING)) { + pr_debug("ODP support not available\n"); + return ERR_PTR(-EINVAL); + } + } + + mr = pd->device->ops.reg_user_mr(pd, start, length, virt_addr, + access_flags, NULL); + + if (IS_ERR(mr)) + return mr; + + mr->device = pd->device; + mr->pd = pd; + mr->dm = NULL; + atomic_inc(&pd->usecnt); + mr->res.type = RDMA_RESTRACK_MR; + rdma_restrack_kadd(&mr->res); + + return mr; +} +EXPORT_SYMBOL(ib_reg_user_mr); + int ib_dereg_mr_user(struct ib_mr *mr, struct ib_udata *udata) { struct ib_pd *pd = mr->pd; diff --git a/drivers/infiniband/hw/efa/efa_verbs.c b/drivers/infiniband/hw/efa/efa_verbs.c index 5c35aa72f515..4822f5fa12be 100644 --- a/drivers/infiniband/hw/efa/efa_verbs.c +++ b/drivers/infiniband/hw/efa/efa_verbs.c @@ -1358,7 +1358,7 @@ struct ib_mr *efa_reg_mr(struct ib_pd *ibpd, u64 start, u64 length, int inline_size; int err; - if (udata->inlen && + if (udata && udata->inlen && !ib_is_udata_cleared(udata, 0, sizeof(udata->inlen))) { ibdev_dbg(&dev->ibdev, "Incompatible ABI params, udata not cleared\n"); diff --git a/drivers/infiniband/hw/i40iw/i40iw_verbs.c b/drivers/infiniband/hw/i40iw/i40iw_verbs.c index 96488fb443eb..c335de91508f 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_verbs.c +++ b/drivers/infiniband/hw/i40iw/i40iw_verbs.c @@ -1756,6 +1756,9 @@ static struct ib_mr *i40iw_reg_user_mr(struct ib_pd *pd, int ret; int pg_shift; + if (!udata) + return ERR_PTR(-EOPNOTSUPP); + if (iwdev->closing) return ERR_PTR(-ENODEV); diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 5608e14e3aad..170d5ec95b79 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -4153,6 +4153,12 @@ static inline void ib_dma_free_coherent(struct ib_device *dev, dma_free_coherent(dev->dma_device, size, cpu_addr, dma_handle); } +/* ib_reg_user_mr - register a memory region for virtual addresses from kernel + * space. This function should be called when 'current' is the owning MM. + */ +struct ib_mr *ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, + u64 virt_addr, int mr_access_flags); + /** * ib_dereg_mr_user - Deregisters a memory region and removes it from the * HCA translation table. -- cgit v1.2.3-59-g8ed1b From 87d8069f6b028793254ddd0a66df1d7b6d79b450 Mon Sep 17 00:00:00 2001 From: Moni Shoua Date: Wed, 15 Jan 2020 14:43:33 +0200 Subject: IB/core: Add interface to advise_mr for kernel users Allow ULPs to call advise_mr, so they can control ODP regions in the same way as user space applications. Signed-off-by: Moni Shoua Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/verbs.c | 11 +++++++++++ include/rdma/ib_verbs.h | 3 +++ 2 files changed, 14 insertions(+) diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index 7a69e4bbe877..d33bdc9b01cd 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -2020,6 +2020,17 @@ struct ib_mr *ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, } EXPORT_SYMBOL(ib_reg_user_mr); +int ib_advise_mr(struct ib_pd *pd, enum ib_uverbs_advise_mr_advice advice, + u32 flags, struct ib_sge *sg_list, u32 num_sge) +{ + if (!pd->device->ops.advise_mr) + return -EOPNOTSUPP; + + return pd->device->ops.advise_mr(pd, advice, flags, sg_list, num_sge, + NULL); +} +EXPORT_SYMBOL(ib_advise_mr); + int ib_dereg_mr_user(struct ib_mr *mr, struct ib_udata *udata) { struct ib_pd *pd = mr->pd; diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 170d5ec95b79..e2cc62217cc2 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -4159,6 +4159,9 @@ static inline void ib_dma_free_coherent(struct ib_device *dev, struct ib_mr *ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, u64 virt_addr, int mr_access_flags); +/* ib_advise_mr - give an advice about an address range in a memory region */ +int ib_advise_mr(struct ib_pd *pd, enum ib_uverbs_advise_mr_advice advice, + u32 flags, struct ib_sge *sg_list, u32 num_sge); /** * ib_dereg_mr_user - Deregisters a memory region and removes it from the * HCA translation table. -- cgit v1.2.3-59-g8ed1b From da9ee9d8a8745e70e481446e0bfe2d773b1c364b Mon Sep 17 00:00:00 2001 From: Moni Shoua Date: Wed, 15 Jan 2020 14:43:34 +0200 Subject: IB/mlx5: Add ODP WQE handlers for kernel QPs One of the steps in ODP page fault handler for WQEs is to read a WQE from a QP send queue or receive queue buffer at a specific index. Since the implementation of this buffer is different between kernel and user QP the implementation of the handler needs to be aware of that and handle it in a different way. ODP for kernel MRs is currently supported only for RDMA_READ and RDMA_WRITE operations so change the handler to - read a WQE from a kernel QP send queue - fail if access to receive queue or shared receive queue is required for a kernel QP Signed-off-by: Moni Shoua Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/mlx5_ib.h | 12 +-- drivers/infiniband/hw/mlx5/odp.c | 12 +-- drivers/infiniband/hw/mlx5/qp.c | 163 ++++++++++++++++++++++------------- 3 files changed, 117 insertions(+), 70 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index b06f32ff5748..77d495b2032d 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -1153,12 +1153,12 @@ int mlx5_ib_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr, const struct ib_send_wr **bad_wr); int mlx5_ib_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr, const struct ib_recv_wr **bad_wr); -int mlx5_ib_read_user_wqe_sq(struct mlx5_ib_qp *qp, int wqe_index, void *buffer, - int buflen, size_t *bc); -int mlx5_ib_read_user_wqe_rq(struct mlx5_ib_qp *qp, int wqe_index, void *buffer, - int buflen, size_t *bc); -int mlx5_ib_read_user_wqe_srq(struct mlx5_ib_srq *srq, int wqe_index, - void *buffer, int buflen, size_t *bc); +int mlx5_ib_read_wqe_sq(struct mlx5_ib_qp *qp, int wqe_index, void *buffer, + size_t buflen, size_t *bc); +int mlx5_ib_read_wqe_rq(struct mlx5_ib_qp *qp, int wqe_index, void *buffer, + size_t buflen, size_t *bc); +int mlx5_ib_read_wqe_srq(struct mlx5_ib_srq *srq, int wqe_index, void *buffer, + size_t buflen, size_t *bc); int mlx5_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, struct ib_udata *udata); void mlx5_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata); diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c index 3b3ceb5acdd3..3642c6a491c2 100644 --- a/drivers/infiniband/hw/mlx5/odp.c +++ b/drivers/infiniband/hw/mlx5/odp.c @@ -1237,15 +1237,15 @@ static void mlx5_ib_mr_wqe_pfault_handler(struct mlx5_ib_dev *dev, wqe = wqe_start; qp = (res->res == MLX5_RES_QP) ? res_to_qp(res) : NULL; if (qp && sq) { - ret = mlx5_ib_read_user_wqe_sq(qp, wqe_index, wqe, PAGE_SIZE, - &bytes_copied); + ret = mlx5_ib_read_wqe_sq(qp, wqe_index, wqe, PAGE_SIZE, + &bytes_copied); if (ret) goto read_user; ret = mlx5_ib_mr_initiator_pfault_handler( dev, pfault, qp, &wqe, &wqe_end, bytes_copied); } else if (qp && !sq) { - ret = mlx5_ib_read_user_wqe_rq(qp, wqe_index, wqe, PAGE_SIZE, - &bytes_copied); + ret = mlx5_ib_read_wqe_rq(qp, wqe_index, wqe, PAGE_SIZE, + &bytes_copied); if (ret) goto read_user; ret = mlx5_ib_mr_responder_pfault_handler_rq( @@ -1253,8 +1253,8 @@ static void mlx5_ib_mr_wqe_pfault_handler(struct mlx5_ib_dev *dev, } else if (!qp) { struct mlx5_ib_srq *srq = res_to_srq(res); - ret = mlx5_ib_read_user_wqe_srq(srq, wqe_index, wqe, PAGE_SIZE, - &bytes_copied); + ret = mlx5_ib_read_wqe_srq(srq, wqe_index, wqe, PAGE_SIZE, + &bytes_copied); if (ret) goto read_user; ret = mlx5_ib_mr_responder_pfault_handler_srq( diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 7f0bde313560..ae7cbd9c9bca 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -129,14 +129,10 @@ static int is_sqp(enum ib_qp_type qp_type) * * Return: zero on success, or an error code. */ -static int mlx5_ib_read_user_wqe_common(struct ib_umem *umem, - void *buffer, - u32 buflen, - int wqe_index, - int wq_offset, - int wq_wqe_cnt, - int wq_wqe_shift, - int bcnt, +static int mlx5_ib_read_user_wqe_common(struct ib_umem *umem, void *buffer, + size_t buflen, int wqe_index, + int wq_offset, int wq_wqe_cnt, + int wq_wqe_shift, int bcnt, size_t *bytes_copied) { size_t offset = wq_offset + ((wqe_index % wq_wqe_cnt) << wq_wqe_shift); @@ -160,11 +156,43 @@ static int mlx5_ib_read_user_wqe_common(struct ib_umem *umem, return 0; } -int mlx5_ib_read_user_wqe_sq(struct mlx5_ib_qp *qp, - int wqe_index, - void *buffer, - int buflen, - size_t *bc) +static int mlx5_ib_read_kernel_wqe_sq(struct mlx5_ib_qp *qp, int wqe_index, + void *buffer, size_t buflen, size_t *bc) +{ + struct mlx5_wqe_ctrl_seg *ctrl; + size_t bytes_copied = 0; + size_t wqe_length; + void *p; + int ds; + + wqe_index = wqe_index & qp->sq.fbc.sz_m1; + + /* read the control segment first */ + p = mlx5_frag_buf_get_wqe(&qp->sq.fbc, wqe_index); + ctrl = p; + ds = be32_to_cpu(ctrl->qpn_ds) & MLX5_WQE_CTRL_DS_MASK; + wqe_length = ds * MLX5_WQE_DS_UNITS; + + /* read rest of WQE if it spreads over more than one stride */ + while (bytes_copied < wqe_length) { + size_t copy_length = + min_t(size_t, buflen - bytes_copied, MLX5_SEND_WQE_BB); + + if (!copy_length) + break; + + memcpy(buffer + bytes_copied, p, copy_length); + bytes_copied += copy_length; + + wqe_index = (wqe_index + 1) & qp->sq.fbc.sz_m1; + p = mlx5_frag_buf_get_wqe(&qp->sq.fbc, wqe_index); + } + *bc = bytes_copied; + return 0; +} + +static int mlx5_ib_read_user_wqe_sq(struct mlx5_ib_qp *qp, int wqe_index, + void *buffer, size_t buflen, size_t *bc) { struct mlx5_ib_qp_base *base = &qp->trans_qp.base; struct ib_umem *umem = base->ubuffer.umem; @@ -176,18 +204,10 @@ int mlx5_ib_read_user_wqe_sq(struct mlx5_ib_qp *qp, int ret; int ds; - if (buflen < sizeof(*ctrl)) - return -EINVAL; - /* at first read as much as possible */ - ret = mlx5_ib_read_user_wqe_common(umem, - buffer, - buflen, - wqe_index, - wq->offset, - wq->wqe_cnt, - wq->wqe_shift, - buflen, + ret = mlx5_ib_read_user_wqe_common(umem, buffer, buflen, wqe_index, + wq->offset, wq->wqe_cnt, + wq->wqe_shift, buflen, &bytes_copied); if (ret) return ret; @@ -210,13 +230,9 @@ int mlx5_ib_read_user_wqe_sq(struct mlx5_ib_qp *qp, * so read the remaining bytes starting * from wqe_index 0 */ - ret = mlx5_ib_read_user_wqe_common(umem, - buffer + bytes_copied, - buflen - bytes_copied, - 0, - wq->offset, - wq->wqe_cnt, - wq->wqe_shift, + ret = mlx5_ib_read_user_wqe_common(umem, buffer + bytes_copied, + buflen - bytes_copied, 0, wq->offset, + wq->wqe_cnt, wq->wqe_shift, wqe_length - bytes_copied, &bytes_copied2); @@ -226,11 +242,24 @@ int mlx5_ib_read_user_wqe_sq(struct mlx5_ib_qp *qp, return 0; } -int mlx5_ib_read_user_wqe_rq(struct mlx5_ib_qp *qp, - int wqe_index, - void *buffer, - int buflen, - size_t *bc) +int mlx5_ib_read_wqe_sq(struct mlx5_ib_qp *qp, int wqe_index, void *buffer, + size_t buflen, size_t *bc) +{ + struct mlx5_ib_qp_base *base = &qp->trans_qp.base; + struct ib_umem *umem = base->ubuffer.umem; + + if (buflen < sizeof(struct mlx5_wqe_ctrl_seg)) + return -EINVAL; + + if (!umem) + return mlx5_ib_read_kernel_wqe_sq(qp, wqe_index, buffer, + buflen, bc); + + return mlx5_ib_read_user_wqe_sq(qp, wqe_index, buffer, buflen, bc); +} + +static int mlx5_ib_read_user_wqe_rq(struct mlx5_ib_qp *qp, int wqe_index, + void *buffer, size_t buflen, size_t *bc) { struct mlx5_ib_qp_base *base = &qp->trans_qp.base; struct ib_umem *umem = base->ubuffer.umem; @@ -238,14 +267,9 @@ int mlx5_ib_read_user_wqe_rq(struct mlx5_ib_qp *qp, size_t bytes_copied; int ret; - ret = mlx5_ib_read_user_wqe_common(umem, - buffer, - buflen, - wqe_index, - wq->offset, - wq->wqe_cnt, - wq->wqe_shift, - buflen, + ret = mlx5_ib_read_user_wqe_common(umem, buffer, buflen, wqe_index, + wq->offset, wq->wqe_cnt, + wq->wqe_shift, buflen, &bytes_copied); if (ret) @@ -254,25 +278,33 @@ int mlx5_ib_read_user_wqe_rq(struct mlx5_ib_qp *qp, return 0; } -int mlx5_ib_read_user_wqe_srq(struct mlx5_ib_srq *srq, - int wqe_index, - void *buffer, - int buflen, - size_t *bc) +int mlx5_ib_read_wqe_rq(struct mlx5_ib_qp *qp, int wqe_index, void *buffer, + size_t buflen, size_t *bc) +{ + struct mlx5_ib_qp_base *base = &qp->trans_qp.base; + struct ib_umem *umem = base->ubuffer.umem; + struct mlx5_ib_wq *wq = &qp->rq; + size_t wqe_size = 1 << wq->wqe_shift; + + if (buflen < wqe_size) + return -EINVAL; + + if (!umem) + return -EOPNOTSUPP; + + return mlx5_ib_read_user_wqe_rq(qp, wqe_index, buffer, buflen, bc); +} + +static int mlx5_ib_read_user_wqe_srq(struct mlx5_ib_srq *srq, int wqe_index, + void *buffer, size_t buflen, size_t *bc) { struct ib_umem *umem = srq->umem; size_t bytes_copied; int ret; - ret = mlx5_ib_read_user_wqe_common(umem, - buffer, - buflen, - wqe_index, - 0, - srq->msrq.max, - srq->msrq.wqe_shift, - buflen, - &bytes_copied); + ret = mlx5_ib_read_user_wqe_common(umem, buffer, buflen, wqe_index, 0, + srq->msrq.max, srq->msrq.wqe_shift, + buflen, &bytes_copied); if (ret) return ret; @@ -280,6 +312,21 @@ int mlx5_ib_read_user_wqe_srq(struct mlx5_ib_srq *srq, return 0; } +int mlx5_ib_read_wqe_srq(struct mlx5_ib_srq *srq, int wqe_index, void *buffer, + size_t buflen, size_t *bc) +{ + struct ib_umem *umem = srq->umem; + size_t wqe_size = 1 << srq->msrq.wqe_shift; + + if (buflen < wqe_size) + return -EINVAL; + + if (!umem) + return -EOPNOTSUPP; + + return mlx5_ib_read_user_wqe_srq(srq, wqe_index, buffer, buflen, bc); +} + static void mlx5_ib_qp_event(struct mlx5_core_qp *qp, int type) { struct ib_qp *ibqp = &to_mibqp(qp)->ibqp; -- cgit v1.2.3-59-g8ed1b From 4835709176e8ccf6561abc9f5c405293e008095f Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Wed, 15 Jan 2020 14:43:35 +0200 Subject: RDMA/mlx5: Don't fake udata for kernel path Kernel paths must not set udata and provide NULL pointer, instead of faking zeroed udata struct. Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/main.c | 34 ++++++++++++++++------------------ 1 file changed, 16 insertions(+), 18 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 997cbfe4b90c..760630c7aae7 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -815,6 +815,7 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, struct ib_device_attr *props, struct ib_udata *uhw) { + size_t uhw_outlen = (uhw) ? uhw->outlen : 0; struct mlx5_ib_dev *dev = to_mdev(ibdev); struct mlx5_core_dev *mdev = dev->mdev; int err = -ENOMEM; @@ -828,12 +829,12 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, u64 max_tso; resp_len = sizeof(resp.comp_mask) + sizeof(resp.response_length); - if (uhw->outlen && uhw->outlen < resp_len) + if (uhw_outlen && uhw_outlen < resp_len) return -EINVAL; resp.response_length = resp_len; - if (uhw->inlen && !ib_is_udata_cleared(uhw, 0, uhw->inlen)) + if (uhw && uhw->inlen && !ib_is_udata_cleared(uhw, 0, uhw->inlen)) return -EINVAL; memset(props, 0, sizeof(*props)); @@ -897,7 +898,7 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, props->raw_packet_caps |= IB_RAW_PACKET_CAP_CVLAN_STRIPPING; - if (field_avail(typeof(resp), tso_caps, uhw->outlen)) { + if (field_avail(typeof(resp), tso_caps, uhw_outlen)) { max_tso = MLX5_CAP_ETH(mdev, max_lso_cap); if (max_tso) { resp.tso_caps.max_tso = 1 << max_tso; @@ -907,7 +908,7 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, } } - if (field_avail(typeof(resp), rss_caps, uhw->outlen)) { + if (field_avail(typeof(resp), rss_caps, uhw_outlen)) { resp.rss_caps.rx_hash_function = MLX5_RX_HASH_FUNC_TOEPLITZ; resp.rss_caps.rx_hash_fields_mask = @@ -927,9 +928,9 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, resp.response_length += sizeof(resp.rss_caps); } } else { - if (field_avail(typeof(resp), tso_caps, uhw->outlen)) + if (field_avail(typeof(resp), tso_caps, uhw_outlen)) resp.response_length += sizeof(resp.tso_caps); - if (field_avail(typeof(resp), rss_caps, uhw->outlen)) + if (field_avail(typeof(resp), rss_caps, uhw_outlen)) resp.response_length += sizeof(resp.rss_caps); } @@ -1054,7 +1055,7 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, MLX5_MAX_CQ_PERIOD; } - if (field_avail(typeof(resp), cqe_comp_caps, uhw->outlen)) { + if (field_avail(typeof(resp), cqe_comp_caps, uhw_outlen)) { resp.response_length += sizeof(resp.cqe_comp_caps); if (MLX5_CAP_GEN(dev->mdev, cqe_compression)) { @@ -1072,7 +1073,7 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, } } - if (field_avail(typeof(resp), packet_pacing_caps, uhw->outlen) && + if (field_avail(typeof(resp), packet_pacing_caps, uhw_outlen) && raw_support) { if (MLX5_CAP_QOS(mdev, packet_pacing) && MLX5_CAP_GEN(mdev, qos)) { @@ -1091,7 +1092,7 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, } if (field_avail(typeof(resp), mlx5_ib_support_multi_pkt_send_wqes, - uhw->outlen)) { + uhw_outlen)) { if (MLX5_CAP_ETH(mdev, multi_pkt_send_wqe)) resp.mlx5_ib_support_multi_pkt_send_wqes = MLX5_IB_ALLOW_MPW; @@ -1104,7 +1105,7 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, sizeof(resp.mlx5_ib_support_multi_pkt_send_wqes); } - if (field_avail(typeof(resp), flags, uhw->outlen)) { + if (field_avail(typeof(resp), flags, uhw_outlen)) { resp.response_length += sizeof(resp.flags); if (MLX5_CAP_GEN(mdev, cqe_compression_128)) @@ -1120,8 +1121,7 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, resp.flags |= MLX5_IB_QUERY_DEV_RESP_FLAGS_SCAT2CQE_DCT; } - if (field_avail(typeof(resp), sw_parsing_caps, - uhw->outlen)) { + if (field_avail(typeof(resp), sw_parsing_caps, uhw_outlen)) { resp.response_length += sizeof(resp.sw_parsing_caps); if (MLX5_CAP_ETH(mdev, swp)) { resp.sw_parsing_caps.sw_parsing_offloads |= @@ -1141,7 +1141,7 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, } } - if (field_avail(typeof(resp), striding_rq_caps, uhw->outlen) && + if (field_avail(typeof(resp), striding_rq_caps, uhw_outlen) && raw_support) { resp.response_length += sizeof(resp.striding_rq_caps); if (MLX5_CAP_GEN(mdev, striding_rq)) { @@ -1164,8 +1164,7 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, } } - if (field_avail(typeof(resp), tunnel_offloads_caps, - uhw->outlen)) { + if (field_avail(typeof(resp), tunnel_offloads_caps, uhw_outlen)) { resp.response_length += sizeof(resp.tunnel_offloads_caps); if (MLX5_CAP_ETH(mdev, tunnel_stateless_vxlan)) resp.tunnel_offloads_caps |= @@ -1186,7 +1185,7 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, MLX5_IB_TUNNELED_OFFLOADS_MPLS_UDP; } - if (uhw->outlen) { + if (uhw_outlen) { err = ib_copy_to_udata(uhw, &resp, resp.response_length); if (err) @@ -4771,7 +4770,6 @@ static int __get_port_caps(struct mlx5_ib_dev *dev, u8 port) struct ib_device_attr *dprops = NULL; struct ib_port_attr *pprops = NULL; int err = -ENOMEM; - struct ib_udata uhw = {.inlen = 0, .outlen = 0}; pprops = kzalloc(sizeof(*pprops), GFP_KERNEL); if (!pprops) @@ -4781,7 +4779,7 @@ static int __get_port_caps(struct mlx5_ib_dev *dev, u8 port) if (!dprops) goto out; - err = mlx5_ib_query_device(&dev->ib_dev, dprops, &uhw); + err = mlx5_ib_query_device(&dev->ib_dev, dprops, NULL); if (err) { mlx5_ib_warn(dev, "query_device failed %d\n", err); goto out; -- cgit v1.2.3-59-g8ed1b From a73a895588520a40e52f967a7b2892baf810c708 Mon Sep 17 00:00:00 2001 From: Moni Shoua Date: Wed, 15 Jan 2020 14:43:36 +0200 Subject: IB/mlx5: Mask out unsupported ODP capabilities for kernel QPs The ODP handler for WQEs in RQ or SRQ is not implented for kernel QPs. Therefore don't report support in these if query comes from a kernel user. Signed-off-by: Moni Shoua Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/main.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 760630c7aae7..994d59cd9daf 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -1015,6 +1015,23 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, if (dev->odp_caps.general_caps & IB_ODP_SUPPORT) props->device_cap_flags |= IB_DEVICE_ON_DEMAND_PAGING; props->odp_caps = dev->odp_caps; + if (!uhw) { + /* ODP for kernel QPs is not implemented for receive + * WQEs and SRQ WQEs + */ + props->odp_caps.per_transport_caps.rc_odp_caps &= + ~(IB_ODP_SUPPORT_READ | + IB_ODP_SUPPORT_SRQ_RECV); + props->odp_caps.per_transport_caps.uc_odp_caps &= + ~(IB_ODP_SUPPORT_READ | + IB_ODP_SUPPORT_SRQ_RECV); + props->odp_caps.per_transport_caps.ud_odp_caps &= + ~(IB_ODP_SUPPORT_READ | + IB_ODP_SUPPORT_SRQ_RECV); + props->odp_caps.per_transport_caps.xrc_odp_caps &= + ~(IB_ODP_SUPPORT_READ | + IB_ODP_SUPPORT_SRQ_RECV); + } } if (MLX5_CAP_GEN(mdev, cd)) -- cgit v1.2.3-59-g8ed1b From 8ffc32485158528f870b62707077ab494ba31deb Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 15 Jan 2020 14:43:37 +0200 Subject: RDMA/mlx5: Fix handling of IOVA != user_va in ODP paths Till recently it was not possible for userspace to specify a different IOVA, but with the new ibv_reg_mr_iova() library call this can be done. To compute the user_va we must compute: user_va = (iova - iova_start) + user_va_start while being cautious of overflow and other math problems. The iova is not reliably stored in the mmkey when the MR is created. Only the cached creation path (the common one) set it, so it must also be set when creating uncached MRs. Fix the weird use of iova when computing the starting page index in the MR. In the normal case, when iova == umem.address: iova & (~(BIT(page_shift) - 1)) == ALIGN_DOWN(umem.address, odp->page_size) == ib_umem_start(odp) And when iova is different using it in math with a user_va is wrong. Finally, do not allow an implicit ODP to be created with a non-zero IOVA as we have no support for that. Fixes: 7bdf65d411c1 ("IB/mlx5: Handle page faults") Signed-off-by: Moni Shoua Signed-off-by: Jason Gunthorpe Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/mr.c | 2 ++ drivers/infiniband/hw/mlx5/odp.c | 19 +++++++++++++------ 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index f79bb44b94fe..44a0ee6bd9f1 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -1246,6 +1246,8 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING) && !start && length == U64_MAX) { + if (virt_addr != start) + return ERR_PTR(-EINVAL); if (!(access_flags & IB_ACCESS_ON_DEMAND) || !(dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT)) return ERR_PTR(-EINVAL); diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c index 3642c6a491c2..0afb0042bd53 100644 --- a/drivers/infiniband/hw/mlx5/odp.c +++ b/drivers/infiniband/hw/mlx5/odp.c @@ -624,11 +624,10 @@ static int pagefault_real_mr(struct mlx5_ib_mr *mr, struct ib_umem_odp *odp, bool downgrade = flags & MLX5_PF_FLAGS_DOWNGRADE; unsigned long current_seq; u64 access_mask; - u64 start_idx, page_mask; + u64 start_idx; page_shift = odp->page_shift; - page_mask = ~(BIT(page_shift) - 1); - start_idx = (user_va - (mr->mmkey.iova & page_mask)) >> page_shift; + start_idx = (user_va - ib_umem_start(odp)) >> page_shift; access_mask = ODP_READ_ALLOWED_BIT; if (odp->umem.writable && !downgrade) @@ -767,11 +766,19 @@ static int pagefault_mr(struct mlx5_ib_mr *mr, u64 io_virt, size_t bcnt, { struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem); + if (unlikely(io_virt < mr->mmkey.iova)) + return -EFAULT; + if (!odp->is_implicit_odp) { - if (unlikely(io_virt < ib_umem_start(odp) || - ib_umem_end(odp) - io_virt < bcnt)) + u64 user_va; + + if (check_add_overflow(io_virt - mr->mmkey.iova, + (u64)odp->umem.address, &user_va)) + return -EFAULT; + if (unlikely(user_va >= ib_umem_end(odp) || + ib_umem_end(odp) - user_va < bcnt)) return -EFAULT; - return pagefault_real_mr(mr, odp, io_virt, bcnt, bytes_mapped, + return pagefault_real_mr(mr, odp, user_va, bcnt, bytes_mapped, flags); } return pagefault_implicit_mr(mr, odp, io_virt, bcnt, bytes_mapped, -- cgit v1.2.3-59-g8ed1b From c4c86abb3f9fb3d5bc113fb996298a30ec63e07b Mon Sep 17 00:00:00 2001 From: Hans Westgaard Ry Date: Wed, 15 Jan 2020 14:43:38 +0200 Subject: net/rds: Detect need of On-Demand-Paging memory registration Add code to check if memory intended for RDMA is FS-DAX-memory. RDS will fail with error code EOPNOTSUPP if FS-DAX-memory is detected. Signed-off-by: Hans Westgaard Ry Acked-by: Santosh Shilimkar Signed-off-by: Leon Romanovsky --- net/rds/rdma.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/net/rds/rdma.c b/net/rds/rdma.c index 916f5ec373d8..eb23c38ce2b3 100644 --- a/net/rds/rdma.c +++ b/net/rds/rdma.c @@ -156,11 +156,13 @@ void rds_rdma_drop_keys(struct rds_sock *rs) static int rds_pin_pages(unsigned long user_addr, unsigned int nr_pages, struct page **pages, int write) { + unsigned int gup_flags = FOLL_LONGTERM; int ret; - ret = get_user_pages_fast(user_addr, nr_pages, write ? FOLL_WRITE : 0, - pages); + if (write) + gup_flags |= FOLL_WRITE; + ret = get_user_pages_fast(user_addr, nr_pages, gup_flags, pages); if (ret >= 0 && ret < nr_pages) { while (ret--) put_page(pages[ret]); -- cgit v1.2.3-59-g8ed1b From 2eafa1746f17872483d1033b0116ec71435ea19d Mon Sep 17 00:00:00 2001 From: Hans Westgaard Ry Date: Wed, 15 Jan 2020 14:43:39 +0200 Subject: net/rds: Handle ODP mr registration/unregistration On-Demand-Paging MRs are registered using ib_reg_user_mr and unregistered with ib_dereg_mr. Signed-off-by: Hans Westgaard Ry Acked-by: Santosh Shilimkar Signed-off-by: Leon Romanovsky --- net/rds/ib.c | 7 +++ net/rds/ib.h | 3 +- net/rds/ib_mr.h | 7 ++- net/rds/ib_rdma.c | 75 ++++++++++++++++++++++++++- net/rds/ib_send.c | 44 +++++++++++----- net/rds/rdma.c | 151 ++++++++++++++++++++++++++++++++++++++++-------------- net/rds/rds.h | 13 ++++- 7 files changed, 244 insertions(+), 56 deletions(-) diff --git a/net/rds/ib.c b/net/rds/ib.c index 3fd5f40189bd..a792d8a3872a 100644 --- a/net/rds/ib.c +++ b/net/rds/ib.c @@ -156,6 +156,13 @@ static void rds_ib_add_one(struct ib_device *device) has_fmr = (device->ops.alloc_fmr && device->ops.dealloc_fmr && device->ops.map_phys_fmr && device->ops.unmap_fmr); rds_ibdev->use_fastreg = (has_fr && !has_fmr); + rds_ibdev->odp_capable = + !!(device->attrs.device_cap_flags & + IB_DEVICE_ON_DEMAND_PAGING) && + !!(device->attrs.odp_caps.per_transport_caps.rc_odp_caps & + IB_ODP_SUPPORT_WRITE) && + !!(device->attrs.odp_caps.per_transport_caps.rc_odp_caps & + IB_ODP_SUPPORT_READ); rds_ibdev->fmr_max_remaps = device->attrs.max_map_per_fmr?: 32; rds_ibdev->max_1m_mrs = device->attrs.max_mr ? diff --git a/net/rds/ib.h b/net/rds/ib.h index 6e6f24753998..0296f1f7acda 100644 --- a/net/rds/ib.h +++ b/net/rds/ib.h @@ -247,7 +247,8 @@ struct rds_ib_device { struct ib_device *dev; struct ib_pd *pd; struct dma_pool *rid_hdrs_pool; /* RDS headers DMA pool */ - bool use_fastreg; + u8 use_fastreg:1; + u8 odp_capable:1; unsigned int max_mrs; struct rds_ib_mr_pool *mr_1m_pool; diff --git a/net/rds/ib_mr.h b/net/rds/ib_mr.h index 9045a8c0edff..0c8252d7fe2b 100644 --- a/net/rds/ib_mr.h +++ b/net/rds/ib_mr.h @@ -67,6 +67,7 @@ struct rds_ib_frmr { /* This is stored as mr->r_trans_private. */ struct rds_ib_mr { + struct delayed_work work; struct rds_ib_device *device; struct rds_ib_mr_pool *pool; struct rds_ib_connection *ic; @@ -81,9 +82,11 @@ struct rds_ib_mr { unsigned int sg_len; int sg_dma_len; + u8 odp:1; union { struct rds_ib_fmr fmr; struct rds_ib_frmr frmr; + struct ib_mr *mr; } u; }; @@ -122,12 +125,14 @@ void rds6_ib_get_mr_info(struct rds_ib_device *rds_ibdev, void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *); void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents, struct rds_sock *rs, u32 *key_ret, - struct rds_connection *conn); + struct rds_connection *conn, u64 start, u64 length, + int need_odp); void rds_ib_sync_mr(void *trans_private, int dir); void rds_ib_free_mr(void *trans_private, int invalidate); void rds_ib_flush_mrs(void); int rds_ib_mr_init(void); void rds_ib_mr_exit(void); +u32 rds_ib_get_lkey(void *trans_private); void __rds_ib_teardown_mr(struct rds_ib_mr *); void rds_ib_teardown_mr(struct rds_ib_mr *); diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c index c8c1e3ae8d84..ea4d7daec251 100644 --- a/net/rds/ib_rdma.c +++ b/net/rds/ib_rdma.c @@ -37,8 +37,15 @@ #include "rds_single_path.h" #include "ib_mr.h" +#include "rds.h" struct workqueue_struct *rds_ib_mr_wq; +struct rds_ib_dereg_odp_mr { + struct work_struct work; + struct ib_mr *mr; +}; + +static void rds_ib_odp_mr_worker(struct work_struct *work); static struct rds_ib_device *rds_ib_get_device(__be32 ipaddr) { @@ -213,6 +220,9 @@ void rds_ib_sync_mr(void *trans_private, int direction) struct rds_ib_mr *ibmr = trans_private; struct rds_ib_device *rds_ibdev = ibmr->device; + if (ibmr->odp) + return; + switch (direction) { case DMA_FROM_DEVICE: ib_dma_sync_sg_for_cpu(rds_ibdev->dev, ibmr->sg, @@ -482,6 +492,16 @@ void rds_ib_free_mr(void *trans_private, int invalidate) rdsdebug("RDS/IB: free_mr nents %u\n", ibmr->sg_len); + if (ibmr->odp) { + /* A MR created and marked as use_once. We use delayed work, + * because there is a change that we are in interrupt and can't + * call to ib_dereg_mr() directly. + */ + INIT_DELAYED_WORK(&ibmr->work, rds_ib_odp_mr_worker); + queue_delayed_work(rds_ib_mr_wq, &ibmr->work, 0); + return; + } + /* Return it to the pool's free list */ if (rds_ibdev->use_fastreg) rds_ib_free_frmr_list(ibmr); @@ -526,9 +546,17 @@ void rds_ib_flush_mrs(void) up_read(&rds_ib_devices_lock); } +u32 rds_ib_get_lkey(void *trans_private) +{ + struct rds_ib_mr *ibmr = trans_private; + + return ibmr->u.mr->lkey; +} + void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents, struct rds_sock *rs, u32 *key_ret, - struct rds_connection *conn) + struct rds_connection *conn, + u64 start, u64 length, int need_odp) { struct rds_ib_device *rds_ibdev; struct rds_ib_mr *ibmr = NULL; @@ -541,6 +569,42 @@ void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents, goto out; } + if (need_odp == ODP_ZEROBASED || need_odp == ODP_VIRTUAL) { + u64 virt_addr = need_odp == ODP_ZEROBASED ? 0 : start; + int access_flags = + (IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_READ | + IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_ATOMIC | + IB_ACCESS_ON_DEMAND); + struct ib_mr *ib_mr; + + if (!rds_ibdev->odp_capable) { + ret = -EOPNOTSUPP; + goto out; + } + + ib_mr = ib_reg_user_mr(rds_ibdev->pd, start, length, virt_addr, + access_flags); + + if (IS_ERR(ib_mr)) { + rdsdebug("rds_ib_get_user_mr returned %d\n", + IS_ERR(ib_mr)); + ret = PTR_ERR(ib_mr); + goto out; + } + if (key_ret) + *key_ret = ib_mr->rkey; + + ibmr = kzalloc(sizeof(*ibmr), GFP_KERNEL); + if (!ibmr) { + ib_dereg_mr(ib_mr); + ret = -ENOMEM; + goto out; + } + ibmr->u.mr = ib_mr; + ibmr->odp = 1; + return ibmr; + } + if (conn) ic = conn->c_transport_data; @@ -629,3 +693,12 @@ void rds_ib_mr_exit(void) { destroy_workqueue(rds_ib_mr_wq); } + +static void rds_ib_odp_mr_worker(struct work_struct *work) +{ + struct rds_ib_mr *ibmr; + + ibmr = container_of(work, struct rds_ib_mr, work.work); + ib_dereg_mr(ibmr->u.mr); + kfree(ibmr); +} diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c index d1cc1d7778d8..dfe778220657 100644 --- a/net/rds/ib_send.c +++ b/net/rds/ib_send.c @@ -39,6 +39,7 @@ #include "rds_single_path.h" #include "rds.h" #include "ib.h" +#include "ib_mr.h" /* * Convert IB-specific error message to RDS error message and call core @@ -635,6 +636,7 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, send->s_sge[0].addr = ic->i_send_hdrs_dma[pos]; send->s_sge[0].length = sizeof(struct rds_header); + send->s_sge[0].lkey = ic->i_pd->local_dma_lkey; memcpy(ic->i_send_hdrs[pos], &rm->m_inc.i_hdr, sizeof(struct rds_header)); @@ -650,6 +652,7 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, send->s_sge[1].addr = sg_dma_address(scat); send->s_sge[1].addr += rm->data.op_dmaoff; send->s_sge[1].length = len; + send->s_sge[1].lkey = ic->i_pd->local_dma_lkey; bytes_sent += len; rm->data.op_dmaoff += len; @@ -858,20 +861,29 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op) int ret; int num_sge; int nr_sig = 0; + u64 odp_addr = op->op_odp_addr; + u32 odp_lkey = 0; /* map the op the first time we see it */ - if (!op->op_mapped) { - op->op_count = ib_dma_map_sg(ic->i_cm_id->device, - op->op_sg, op->op_nents, (op->op_write) ? - DMA_TO_DEVICE : DMA_FROM_DEVICE); - rdsdebug("ic %p mapping op %p: %d\n", ic, op, op->op_count); - if (op->op_count == 0) { - rds_ib_stats_inc(s_ib_tx_sg_mapping_failure); - ret = -ENOMEM; /* XXX ? */ - goto out; + if (!op->op_odp_mr) { + if (!op->op_mapped) { + op->op_count = + ib_dma_map_sg(ic->i_cm_id->device, op->op_sg, + op->op_nents, + (op->op_write) ? DMA_TO_DEVICE : + DMA_FROM_DEVICE); + rdsdebug("ic %p mapping op %p: %d\n", ic, op, + op->op_count); + if (op->op_count == 0) { + rds_ib_stats_inc(s_ib_tx_sg_mapping_failure); + ret = -ENOMEM; /* XXX ? */ + goto out; + } + op->op_mapped = 1; } - - op->op_mapped = 1; + } else { + op->op_count = op->op_nents; + odp_lkey = rds_ib_get_lkey(op->op_odp_mr->r_trans_private); } /* @@ -923,14 +935,20 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op) for (j = 0; j < send->s_rdma_wr.wr.num_sge && scat != &op->op_sg[op->op_count]; j++) { len = sg_dma_len(scat); - send->s_sge[j].addr = sg_dma_address(scat); + if (!op->op_odp_mr) { + send->s_sge[j].addr = sg_dma_address(scat); + send->s_sge[j].lkey = ic->i_pd->local_dma_lkey; + } else { + send->s_sge[j].addr = odp_addr; + send->s_sge[j].lkey = odp_lkey; + } send->s_sge[j].length = len; - send->s_sge[j].lkey = ic->i_pd->local_dma_lkey; sent += len; rdsdebug("ic %p sent %d remote_addr %llu\n", ic, sent, remote_addr); remote_addr += len; + odp_addr += len; scat++; } diff --git a/net/rds/rdma.c b/net/rds/rdma.c index eb23c38ce2b3..3341eee87bf9 100644 --- a/net/rds/rdma.c +++ b/net/rds/rdma.c @@ -177,13 +177,14 @@ static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args, struct rds_conn_path *cp) { struct rds_mr *mr = NULL, *found; + struct scatterlist *sg = NULL; unsigned int nr_pages; struct page **pages = NULL; - struct scatterlist *sg; void *trans_private; unsigned long flags; rds_rdma_cookie_t cookie; - unsigned int nents; + unsigned int nents = 0; + int need_odp = 0; long i; int ret; @@ -197,6 +198,21 @@ static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args, goto out; } + /* If the combination of the addr and size requested for this memory + * region causes an integer overflow, return error. + */ + if (((args->vec.addr + args->vec.bytes) < args->vec.addr) || + PAGE_ALIGN(args->vec.addr + args->vec.bytes) < + (args->vec.addr + args->vec.bytes)) { + ret = -EINVAL; + goto out; + } + + if (!can_do_mlock()) { + ret = -EPERM; + goto out; + } + nr_pages = rds_pages_in_vec(&args->vec); if (nr_pages == 0) { ret = -EINVAL; @@ -250,36 +266,44 @@ static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args, * the zero page. */ ret = rds_pin_pages(args->vec.addr, nr_pages, pages, 1); - if (ret < 0) - goto out; - - nents = ret; - sg = kcalloc(nents, sizeof(*sg), GFP_KERNEL); - if (!sg) { - ret = -ENOMEM; + if (ret == -EOPNOTSUPP) { + need_odp = 1; + } else if (ret <= 0) { goto out; - } - WARN_ON(!nents); - sg_init_table(sg, nents); - - /* Stick all pages into the scatterlist */ - for (i = 0 ; i < nents; i++) - sg_set_page(&sg[i], pages[i], PAGE_SIZE, 0); + } else { + nents = ret; + sg = kcalloc(nents, sizeof(*sg), GFP_KERNEL); + if (!sg) { + ret = -ENOMEM; + goto out; + } + WARN_ON(!nents); + sg_init_table(sg, nents); - rdsdebug("RDS: trans_private nents is %u\n", nents); + /* Stick all pages into the scatterlist */ + for (i = 0 ; i < nents; i++) + sg_set_page(&sg[i], pages[i], PAGE_SIZE, 0); + rdsdebug("RDS: trans_private nents is %u\n", nents); + } /* Obtain a transport specific MR. If this succeeds, the * s/g list is now owned by the MR. * Note that dma_map() implies that pending writes are * flushed to RAM, so no dma_sync is needed here. */ - trans_private = rs->rs_transport->get_mr(sg, nents, rs, - &mr->r_key, - cp ? cp->cp_conn : NULL); + trans_private = rs->rs_transport->get_mr( + sg, nents, rs, &mr->r_key, cp ? cp->cp_conn : NULL, + args->vec.addr, args->vec.bytes, + need_odp ? ODP_ZEROBASED : ODP_NOT_NEEDED); if (IS_ERR(trans_private)) { - for (i = 0 ; i < nents; i++) - put_page(sg_page(&sg[i])); - kfree(sg); + /* In ODP case, we don't GUP pages, so don't need + * to release anything. + */ + if (!need_odp) { + for (i = 0 ; i < nents; i++) + put_page(sg_page(&sg[i])); + kfree(sg); + } ret = PTR_ERR(trans_private); goto out; } @@ -293,7 +317,11 @@ static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args, * map page aligned regions. So we keep the offset, and build * a 64bit cookie containing and pass that * around. */ - cookie = rds_rdma_make_cookie(mr->r_key, args->vec.addr & ~PAGE_MASK); + if (need_odp) + cookie = rds_rdma_make_cookie(mr->r_key, 0); + else + cookie = rds_rdma_make_cookie(mr->r_key, + args->vec.addr & ~PAGE_MASK); if (cookie_ret) *cookie_ret = cookie; @@ -458,22 +486,26 @@ void rds_rdma_free_op(struct rm_rdma_op *ro) { unsigned int i; - for (i = 0; i < ro->op_nents; i++) { - struct page *page = sg_page(&ro->op_sg[i]); - - /* Mark page dirty if it was possibly modified, which - * is the case for a RDMA_READ which copies from remote - * to local memory */ - if (!ro->op_write) { - WARN_ON(!page->mapping && irqs_disabled()); - set_page_dirty(page); + if (ro->op_odp_mr) { + rds_mr_put(ro->op_odp_mr); + } else { + for (i = 0; i < ro->op_nents; i++) { + struct page *page = sg_page(&ro->op_sg[i]); + + /* Mark page dirty if it was possibly modified, which + * is the case for a RDMA_READ which copies from remote + * to local memory + */ + if (!ro->op_write) + set_page_dirty(page); + put_page(page); } - put_page(page); } kfree(ro->op_notifier); ro->op_notifier = NULL; ro->op_active = 0; + ro->op_odp_mr = NULL; } void rds_atomic_free_op(struct rm_atomic_op *ao) @@ -583,6 +615,7 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, struct rds_iovec *iovs; unsigned int i, j; int ret = 0; + bool odp_supported = true; if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_rdma_args)) || rm->rdma.op_active) @@ -604,6 +637,9 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, ret = -EINVAL; goto out_ret; } + /* odp-mr is not supported for multiple requests within one message */ + if (args->nr_local != 1) + odp_supported = false; iovs = vec->iov; @@ -625,6 +661,8 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, op->op_silent = !!(args->flags & RDS_RDMA_SILENT); op->op_active = 1; op->op_recverr = rs->rs_recverr; + op->op_odp_mr = NULL; + WARN_ON(!nr_pages); op->op_sg = rds_message_alloc_sgs(rm, nr_pages, &ret); if (!op->op_sg) @@ -674,10 +712,44 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, * If it's a READ operation, we need to pin the pages for writing. */ ret = rds_pin_pages(iov->addr, nr, pages, !op->op_write); - if (ret < 0) + if ((!odp_supported && ret <= 0) || + (odp_supported && ret <= 0 && ret != -EOPNOTSUPP)) goto out_pages; - else - ret = 0; + + if (ret == -EOPNOTSUPP) { + struct rds_mr *local_odp_mr; + + if (!rs->rs_transport->get_mr) { + ret = -EOPNOTSUPP; + goto out_pages; + } + local_odp_mr = + kzalloc(sizeof(*local_odp_mr), GFP_KERNEL); + if (!local_odp_mr) { + ret = -ENOMEM; + goto out_pages; + } + RB_CLEAR_NODE(&local_odp_mr->r_rb_node); + refcount_set(&local_odp_mr->r_refcount, 1); + local_odp_mr->r_trans = rs->rs_transport; + local_odp_mr->r_sock = rs; + local_odp_mr->r_trans_private = + rs->rs_transport->get_mr( + NULL, 0, rs, &local_odp_mr->r_key, NULL, + iov->addr, iov->bytes, ODP_VIRTUAL); + if (IS_ERR(local_odp_mr->r_trans_private)) { + ret = IS_ERR(local_odp_mr->r_trans_private); + rdsdebug("get_mr ret %d %p\"", ret, + local_odp_mr->r_trans_private); + kfree(local_odp_mr); + ret = -EOPNOTSUPP; + goto out_pages; + } + rdsdebug("Need odp; local_odp_mr %p trans_private %p\n", + local_odp_mr, local_odp_mr->r_trans_private); + op->op_odp_mr = local_odp_mr; + op->op_odp_addr = iov->addr; + } rdsdebug("RDS: nr_bytes %u nr %u iov->bytes %llu iov->addr %llx\n", nr_bytes, nr, iov->bytes, iov->addr); @@ -693,6 +765,7 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, min_t(unsigned int, iov->bytes, PAGE_SIZE - offset), offset); + sg_dma_len(sg) = sg->length; rdsdebug("RDS: sg->offset %x sg->len %x iov->addr %llx iov->bytes %llu\n", sg->offset, sg->length, iov->addr, iov->bytes); @@ -711,6 +784,7 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, goto out_pages; } op->op_bytes = nr_bytes; + ret = 0; out_pages: kfree(pages); @@ -757,7 +831,8 @@ int rds_cmsg_rdma_dest(struct rds_sock *rs, struct rds_message *rm, spin_unlock_irqrestore(&rs->rs_rdma_lock, flags); if (mr) { - mr->r_trans->sync_mr(mr->r_trans_private, DMA_TO_DEVICE); + mr->r_trans->sync_mr(mr->r_trans_private, + DMA_TO_DEVICE); rm->rdma.op_rdma_mr = mr; } return err; diff --git a/net/rds/rds.h b/net/rds/rds.h index 53e86911773a..e4a603523083 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -40,7 +40,6 @@ #ifdef ATOMIC64_INIT #define KERNEL_HAS_ATOMIC64 #endif - #ifdef RDS_DEBUG #define rdsdebug(fmt, args...) pr_debug("%s(): " fmt, __func__ , ##args) #else @@ -478,6 +477,9 @@ struct rds_message { struct rds_notifier *op_notifier; struct rds_mr *op_rdma_mr; + + u64 op_odp_addr; + struct rds_mr *op_odp_mr; } rdma; struct rm_data_op { unsigned int op_active:1; @@ -573,7 +575,8 @@ struct rds_transport { void (*exit)(void); void *(*get_mr)(struct scatterlist *sg, unsigned long nr_sg, struct rds_sock *rs, u32 *key_ret, - struct rds_connection *conn); + struct rds_connection *conn, + u64 start, u64 length, int need_odp); void (*sync_mr)(void *trans_private, int direction); void (*free_mr)(void *trans_private, int invalidate); void (*flush_mrs)(void); @@ -956,6 +959,12 @@ static inline bool rds_destroy_pending(struct rds_connection *conn) (conn->c_trans->t_unloading && conn->c_trans->t_unloading(conn)); } +enum { + ODP_NOT_NEEDED, + ODP_ZEROBASED, + ODP_VIRTUAL +}; + /* stats.c */ DECLARE_PER_CPU_SHARED_ALIGNED(struct rds_statistics, rds_stats); #define rds_stats_inc_which(which, member) do { \ -- cgit v1.2.3-59-g8ed1b From b2dfc6765e45a3154800333234e4952b5412d792 Mon Sep 17 00:00:00 2001 From: Hans Westgaard Ry Date: Wed, 15 Jan 2020 14:43:40 +0200 Subject: net/rds: Use prefetch for On-Demand-Paging MR Try prefetching pages when using On-Demand-Paging MR using ib_advise_mr. Signed-off-by: Hans Westgaard Ry Acked-by: Santosh Shilimkar Signed-off-by: Leon Romanovsky --- net/rds/ib_rdma.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c index ea4d7daec251..b34b24e237f8 100644 --- a/net/rds/ib_rdma.c +++ b/net/rds/ib_rdma.c @@ -575,6 +575,7 @@ void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents, (IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_READ | IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_ATOMIC | IB_ACCESS_ON_DEMAND); + struct ib_sge sge = {}; struct ib_mr *ib_mr; if (!rds_ibdev->odp_capable) { @@ -602,6 +603,14 @@ void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents, } ibmr->u.mr = ib_mr; ibmr->odp = 1; + + sge.addr = virt_addr; + sge.length = length; + sge.lkey = ib_mr->lkey; + + ib_advise_mr(rds_ibdev->pd, + IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_WRITE, + IB_UVERBS_ADVISE_MR_FLAG_FLUSH, &sge, 1); return ibmr; } -- cgit v1.2.3-59-g8ed1b