From c320e527e1548305f31d95ec405140b04aed25f5 Mon Sep 17 00:00:00 2001
From: Moni Shoua
Date: Wed, 15 Jan 2020 14:43:31 +0200
Subject: IB: Allow calls to ib_umem_get from kernel ULPs

So far the assumption was that ib_umem_get() and ib_umem_odp_get() are
called from flows that start in UVERBS and therefore have a user context.
This assumption restricts flows that are initiated by ULPs and need the
service that ib_umem_get() provides.

This patch changes ib_umem_get() and ib_umem_odp_get() to get the IB
device directly, relying on the fact that both UVERBS and ULPs set that
field correctly.

Reviewed-by: Guy Levi
Signed-off-by: Moni Shoua
Signed-off-by: Leon Romanovsky
---
 drivers/infiniband/hw/mlx5/odp.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'drivers/infiniband/hw/mlx5/odp.c')

diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c
index f924250f80c2..3b3ceb5acdd3 100644
--- a/drivers/infiniband/hw/mlx5/odp.c
+++ b/drivers/infiniband/hw/mlx5/odp.c
@@ -497,7 +497,7 @@ struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd,
 	struct mlx5_ib_mr *imr;
 	int err;
 
-	umem_odp = ib_umem_odp_alloc_implicit(udata, access_flags);
+	umem_odp = ib_umem_odp_alloc_implicit(&dev->ib_dev, access_flags);
 	if (IS_ERR(umem_odp))
 		return ERR_CAST(umem_odp);
 
--
cgit v1.2.3-59-g8ed1b
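For illustration only, a minimal sketch of the kind of flow this change
enables: a driver path initiated by a kernel ULP, where no struct ib_udata
exists, building a umem from the device that owns the PD. It assumes the
post-patch ib_umem_get() signature that takes the ib_device as its first
argument; example_get_umem() is a hypothetical helper, not part of this
series.

#include <rdma/ib_verbs.h>
#include <rdma/ib_umem.h>

/*
 * Hypothetical ULP-initiated flow: no user context is available here, so
 * the umem is created against the ib_device owning the PD rather than
 * against a struct ib_udata.
 */
static struct ib_umem *example_get_umem(struct ib_pd *pd, unsigned long addr,
					size_t size)
{
	/* pd->device is valid for both UVERBS- and ULP-created PDs */
	return ib_umem_get(pd->device, addr, size, IB_ACCESS_LOCAL_WRITE);
}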
From da9ee9d8a8745e70e481446e0bfe2d773b1c364b Mon Sep 17 00:00:00 2001
From: Moni Shoua
Date: Wed, 15 Jan 2020 14:43:34 +0200
Subject: IB/mlx5: Add ODP WQE handlers for kernel QPs

One of the steps in the ODP page fault handler for WQEs is to read a WQE
from a QP send queue or receive queue buffer at a specific index. Since
the implementation of this buffer is different between kernel and user
QPs, the handler needs to be aware of that and handle each case
differently.

ODP for kernel MRs is currently supported only for RDMA_READ and
RDMA_WRITE operations, so change the handler to:

 - read a WQE from a kernel QP send queue
 - fail if access to a receive queue or shared receive queue is required
   for a kernel QP

Signed-off-by: Moni Shoua
Signed-off-by: Leon Romanovsky
---
 drivers/infiniband/hw/mlx5/mlx5_ib.h |  12 +--
 drivers/infiniband/hw/mlx5/odp.c     |  12 +--
 drivers/infiniband/hw/mlx5/qp.c      | 163 ++++++++++++++++++++++-------------
 3 files changed, 117 insertions(+), 70 deletions(-)

(limited to 'drivers/infiniband/hw/mlx5/odp.c')

diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index b06f32ff5748..77d495b2032d 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -1153,12 +1153,12 @@ int mlx5_ib_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr,
 		      const struct ib_send_wr **bad_wr);
 int mlx5_ib_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr,
 		      const struct ib_recv_wr **bad_wr);
-int mlx5_ib_read_user_wqe_sq(struct mlx5_ib_qp *qp, int wqe_index, void *buffer,
-			     int buflen, size_t *bc);
-int mlx5_ib_read_user_wqe_rq(struct mlx5_ib_qp *qp, int wqe_index, void *buffer,
-			     int buflen, size_t *bc);
-int mlx5_ib_read_user_wqe_srq(struct mlx5_ib_srq *srq, int wqe_index,
-			      void *buffer, int buflen, size_t *bc);
+int mlx5_ib_read_wqe_sq(struct mlx5_ib_qp *qp, int wqe_index, void *buffer,
+			size_t buflen, size_t *bc);
+int mlx5_ib_read_wqe_rq(struct mlx5_ib_qp *qp, int wqe_index, void *buffer,
+			size_t buflen, size_t *bc);
+int mlx5_ib_read_wqe_srq(struct mlx5_ib_srq *srq, int wqe_index, void *buffer,
+			 size_t buflen, size_t *bc);
 int mlx5_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
 		      struct ib_udata *udata);
 void mlx5_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata);
diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c
index 3b3ceb5acdd3..3642c6a491c2 100644
--- a/drivers/infiniband/hw/mlx5/odp.c
+++ b/drivers/infiniband/hw/mlx5/odp.c
@@ -1237,15 +1237,15 @@ static void mlx5_ib_mr_wqe_pfault_handler(struct mlx5_ib_dev *dev,
 	wqe = wqe_start;
 	qp = (res->res == MLX5_RES_QP) ? res_to_qp(res) : NULL;
 	if (qp && sq) {
-		ret = mlx5_ib_read_user_wqe_sq(qp, wqe_index, wqe, PAGE_SIZE,
-					       &bytes_copied);
+		ret = mlx5_ib_read_wqe_sq(qp, wqe_index, wqe, PAGE_SIZE,
+					  &bytes_copied);
 		if (ret)
 			goto read_user;
 		ret = mlx5_ib_mr_initiator_pfault_handler(
 			dev, pfault, qp, &wqe, &wqe_end, bytes_copied);
 	} else if (qp && !sq) {
-		ret = mlx5_ib_read_user_wqe_rq(qp, wqe_index, wqe, PAGE_SIZE,
-					       &bytes_copied);
+		ret = mlx5_ib_read_wqe_rq(qp, wqe_index, wqe, PAGE_SIZE,
+					  &bytes_copied);
 		if (ret)
 			goto read_user;
 		ret = mlx5_ib_mr_responder_pfault_handler_rq(
@@ -1253,8 +1253,8 @@ static void mlx5_ib_mr_wqe_pfault_handler(struct mlx5_ib_dev *dev,
 	} else if (!qp) {
 		struct mlx5_ib_srq *srq = res_to_srq(res);
 
-		ret = mlx5_ib_read_user_wqe_srq(srq, wqe_index, wqe, PAGE_SIZE,
-						&bytes_copied);
+		ret = mlx5_ib_read_wqe_srq(srq, wqe_index, wqe, PAGE_SIZE,
+					   &bytes_copied);
 		if (ret)
 			goto read_user;
 		ret = mlx5_ib_mr_responder_pfault_handler_srq(
diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c
index 7f0bde313560..ae7cbd9c9bca 100644
--- a/drivers/infiniband/hw/mlx5/qp.c
+++ b/drivers/infiniband/hw/mlx5/qp.c
@@ -129,14 +129,10 @@ static int is_sqp(enum ib_qp_type qp_type)
  *
  * Return: zero on success, or an error code.
  */
-static int mlx5_ib_read_user_wqe_common(struct ib_umem *umem,
-					void *buffer,
-					u32 buflen,
-					int wqe_index,
-					int wq_offset,
-					int wq_wqe_cnt,
-					int wq_wqe_shift,
-					int bcnt,
+static int mlx5_ib_read_user_wqe_common(struct ib_umem *umem, void *buffer,
+					size_t buflen, int wqe_index,
+					int wq_offset, int wq_wqe_cnt,
+					int wq_wqe_shift, int bcnt,
 					size_t *bytes_copied)
 {
 	size_t offset = wq_offset + ((wqe_index % wq_wqe_cnt) << wq_wqe_shift);
@@ -160,11 +156,43 @@ static int mlx5_ib_read_user_wqe_common(struct ib_umem *umem,
 	return 0;
 }
 
-int mlx5_ib_read_user_wqe_sq(struct mlx5_ib_qp *qp,
-			     int wqe_index,
-			     void *buffer,
-			     int buflen,
-			     size_t *bc)
+static int mlx5_ib_read_kernel_wqe_sq(struct mlx5_ib_qp *qp, int wqe_index,
+				      void *buffer, size_t buflen, size_t *bc)
+{
+	struct mlx5_wqe_ctrl_seg *ctrl;
+	size_t bytes_copied = 0;
+	size_t wqe_length;
+	void *p;
+	int ds;
+
+	wqe_index = wqe_index & qp->sq.fbc.sz_m1;
+
+	/* read the control segment first */
+	p = mlx5_frag_buf_get_wqe(&qp->sq.fbc, wqe_index);
+	ctrl = p;
+	ds = be32_to_cpu(ctrl->qpn_ds) & MLX5_WQE_CTRL_DS_MASK;
+	wqe_length = ds * MLX5_WQE_DS_UNITS;
+
+	/* read rest of WQE if it spreads over more than one stride */
+	while (bytes_copied < wqe_length) {
+		size_t copy_length =
+			min_t(size_t, buflen - bytes_copied, MLX5_SEND_WQE_BB);
+
+		if (!copy_length)
+			break;
+
+		memcpy(buffer + bytes_copied, p, copy_length);
+		bytes_copied += copy_length;
+
+		wqe_index = (wqe_index + 1) & qp->sq.fbc.sz_m1;
+		p = mlx5_frag_buf_get_wqe(&qp->sq.fbc, wqe_index);
+	}
+	*bc = bytes_copied;
+	return 0;
+}
+
+static int mlx5_ib_read_user_wqe_sq(struct mlx5_ib_qp *qp, int wqe_index,
+				    void *buffer, size_t buflen, size_t *bc)
 {
 	struct mlx5_ib_qp_base *base = &qp->trans_qp.base;
 	struct ib_umem *umem = base->ubuffer.umem;
@@ -176,18 +204,10 @@ int mlx5_ib_read_user_wqe_sq(struct mlx5_ib_qp *qp,
 	int ret;
 	int ds;
 
-	if (buflen < sizeof(*ctrl))
-		return -EINVAL;
-
 	/* at first read as much as possible */
-	ret = mlx5_ib_read_user_wqe_common(umem,
-					   buffer,
-					   buflen,
-					   wqe_index,
-					   wq->offset,
-					   wq->wqe_cnt,
-					   wq->wqe_shift,
-					   buflen,
+	ret = mlx5_ib_read_user_wqe_common(umem, buffer, buflen, wqe_index,
+					   wq->offset, wq->wqe_cnt,
+					   wq->wqe_shift, buflen,
 					   &bytes_copied);
 	if (ret)
 		return ret;
@@ -210,13 +230,9 @@ int mlx5_ib_read_user_wqe_sq(struct mlx5_ib_qp *qp,
 	 * so read the remaining bytes starting
 	 * from wqe_index 0
 	 */
-	ret = mlx5_ib_read_user_wqe_common(umem,
-					   buffer + bytes_copied,
-					   buflen - bytes_copied,
-					   0,
-					   wq->offset,
-					   wq->wqe_cnt,
-					   wq->wqe_shift,
+	ret = mlx5_ib_read_user_wqe_common(umem, buffer + bytes_copied,
+					   buflen - bytes_copied, 0, wq->offset,
+					   wq->wqe_cnt, wq->wqe_shift,
 					   wqe_length - bytes_copied,
 					   &bytes_copied2);
 
@@ -226,11 +242,24 @@ int mlx5_ib_read_user_wqe_sq(struct mlx5_ib_qp *qp,
 	return 0;
 }
 
-int mlx5_ib_read_user_wqe_rq(struct mlx5_ib_qp *qp,
-			     int wqe_index,
-			     void *buffer,
-			     int buflen,
-			     size_t *bc)
+int mlx5_ib_read_wqe_sq(struct mlx5_ib_qp *qp, int wqe_index, void *buffer,
+			size_t buflen, size_t *bc)
+{
+	struct mlx5_ib_qp_base *base = &qp->trans_qp.base;
+	struct ib_umem *umem = base->ubuffer.umem;
+
+	if (buflen < sizeof(struct mlx5_wqe_ctrl_seg))
+		return -EINVAL;
+
+	if (!umem)
+		return mlx5_ib_read_kernel_wqe_sq(qp, wqe_index, buffer,
+						  buflen, bc);
+
+	return mlx5_ib_read_user_wqe_sq(qp, wqe_index, buffer, buflen, bc);
+}
+
+static int mlx5_ib_read_user_wqe_rq(struct mlx5_ib_qp *qp, int wqe_index,
+				    void *buffer, size_t buflen, size_t *bc)
 {
 	struct mlx5_ib_qp_base *base = &qp->trans_qp.base;
 	struct ib_umem *umem = base->ubuffer.umem;
@@ -238,14 +267,9 @@ int mlx5_ib_read_user_wqe_rq(struct mlx5_ib_qp *qp,
 	size_t bytes_copied;
 	int ret;
 
-	ret = mlx5_ib_read_user_wqe_common(umem,
-					   buffer,
-					   buflen,
-					   wqe_index,
-					   wq->offset,
-					   wq->wqe_cnt,
-					   wq->wqe_shift,
-					   buflen,
+	ret = mlx5_ib_read_user_wqe_common(umem, buffer, buflen, wqe_index,
+					   wq->offset, wq->wqe_cnt,
+					   wq->wqe_shift, buflen,
 					   &bytes_copied);
 
 	if (ret)
@@ -254,25 +278,33 @@ int mlx5_ib_read_user_wqe_rq(struct mlx5_ib_qp *qp,
 	return 0;
 }
 
-int mlx5_ib_read_user_wqe_srq(struct mlx5_ib_srq *srq,
-			      int wqe_index,
-			      void *buffer,
-			      int buflen,
-			      size_t *bc)
+int mlx5_ib_read_wqe_rq(struct mlx5_ib_qp *qp, int wqe_index, void *buffer,
+			size_t buflen, size_t *bc)
+{
+	struct mlx5_ib_qp_base *base = &qp->trans_qp.base;
+	struct ib_umem *umem = base->ubuffer.umem;
+	struct mlx5_ib_wq *wq = &qp->rq;
+	size_t wqe_size = 1 << wq->wqe_shift;
+
+	if (buflen < wqe_size)
+		return -EINVAL;
+
+	if (!umem)
+		return -EOPNOTSUPP;
+
+	return mlx5_ib_read_user_wqe_rq(qp, wqe_index, buffer, buflen, bc);
+}
+
+static int mlx5_ib_read_user_wqe_srq(struct mlx5_ib_srq *srq, int wqe_index,
+				     void *buffer, size_t buflen, size_t *bc)
 {
 	struct ib_umem *umem = srq->umem;
 	size_t bytes_copied;
 	int ret;
 
-	ret = mlx5_ib_read_user_wqe_common(umem,
-					   buffer,
-					   buflen,
-					   wqe_index,
-					   0,
-					   srq->msrq.max,
-					   srq->msrq.wqe_shift,
-					   buflen,
-					   &bytes_copied);
+	ret = mlx5_ib_read_user_wqe_common(umem, buffer, buflen, wqe_index, 0,
+					   srq->msrq.max, srq->msrq.wqe_shift,
+					   buflen, &bytes_copied);
 	if (ret)
 		return ret;
 
@@ -280,6 +312,21 @@ int mlx5_ib_read_user_wqe_srq(struct mlx5_ib_srq *srq,
 	return 0;
 }
 
+int mlx5_ib_read_wqe_srq(struct mlx5_ib_srq *srq, int wqe_index, void *buffer,
+			 size_t buflen, size_t *bc)
+{
+	struct ib_umem *umem = srq->umem;
+	size_t wqe_size = 1 << srq->msrq.wqe_shift;
+
+	if (buflen < wqe_size)
+		return -EINVAL;
+
+	if (!umem)
+		return -EOPNOTSUPP;
+
+	return mlx5_ib_read_user_wqe_srq(srq, wqe_index, buffer, buflen, bc);
+}
+
 static void mlx5_ib_qp_event(struct mlx5_core_qp *qp, int type)
 {
 	struct ib_qp *ibqp = &to_mibqp(qp)->ibqp;
--
cgit v1.2.3-59-g8ed1b
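As a usage illustration (not part of the patch), a hedged sketch of a
caller copying a single send-queue WQE through the new wrapper, which
dispatches internally to the kernel- or user-QP variant. The helper name
example_dump_sq_wqe() is hypothetical; PAGE_SIZE is the same per-WQE upper
bound the ODP fault handler passes.

#include <linux/slab.h>

/* Hypothetical debug helper: copy one SQ WQE of a QP into a local buffer. */
static int example_dump_sq_wqe(struct mlx5_ib_qp *qp, int wqe_index)
{
	size_t copied = 0;
	void *buf;
	int ret;

	buf = kzalloc(PAGE_SIZE, GFP_KERNEL);
	if (!buf)
		return -ENOMEM;

	/* Works for both kernel and user QPs after this patch */
	ret = mlx5_ib_read_wqe_sq(qp, wqe_index, buf, PAGE_SIZE, &copied);
	if (!ret)
		pr_debug("wqe %d: copied %zu bytes\n", wqe_index, copied);

	kfree(buf);
	return ret;
}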
From 8ffc32485158528f870b62707077ab494ba31deb Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe
Date: Wed, 15 Jan 2020 14:43:37 +0200
Subject: RDMA/mlx5: Fix handling of IOVA != user_va in ODP paths

Until recently it was not possible for userspace to specify a different
IOVA, but with the new ibv_reg_mr_iova() library call this can be done.

To compute the user_va we must compute:

  user_va = (iova - iova_start) + user_va_start

while being cautious of overflow and other math problems.

The iova is not reliably stored in the mmkey when the MR is created. Only
the cached creation path (the common one) sets it, so it must also be set
when creating uncached MRs.

Fix the weird use of iova when computing the starting page index in the
MR. In the normal case, when iova == umem.address:

  iova & (~(BIT(page_shift) - 1)) ==
	ALIGN_DOWN(umem.address, odp->page_size) == ib_umem_start(odp)

When the iova is different, using it in math with a user_va is wrong.

Finally, do not allow an implicit ODP to be created with a non-zero IOVA
as we have no support for that.

Fixes: 7bdf65d411c1 ("IB/mlx5: Handle page faults")
Signed-off-by: Moni Shoua
Signed-off-by: Jason Gunthorpe
Signed-off-by: Leon Romanovsky
---
 drivers/infiniband/hw/mlx5/mr.c  |  2 ++
 drivers/infiniband/hw/mlx5/odp.c | 19 +++++++++++++------
 2 files changed, 15 insertions(+), 6 deletions(-)

(limited to 'drivers/infiniband/hw/mlx5/odp.c')

diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
index f79bb44b94fe..44a0ee6bd9f1 100644
--- a/drivers/infiniband/hw/mlx5/mr.c
+++ b/drivers/infiniband/hw/mlx5/mr.c
@@ -1246,6 +1246,8 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
 
 	if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING) &&
 	    !start && length == U64_MAX) {
+		if (virt_addr != start)
+			return ERR_PTR(-EINVAL);
 		if (!(access_flags & IB_ACCESS_ON_DEMAND) ||
 		    !(dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT))
 			return ERR_PTR(-EINVAL);
diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c
index 3642c6a491c2..0afb0042bd53 100644
--- a/drivers/infiniband/hw/mlx5/odp.c
+++ b/drivers/infiniband/hw/mlx5/odp.c
@@ -624,11 +624,10 @@ static int pagefault_real_mr(struct mlx5_ib_mr *mr, struct ib_umem_odp *odp,
 	bool downgrade = flags & MLX5_PF_FLAGS_DOWNGRADE;
 	unsigned long current_seq;
 	u64 access_mask;
-	u64 start_idx, page_mask;
+	u64 start_idx;
 
 	page_shift = odp->page_shift;
-	page_mask = ~(BIT(page_shift) - 1);
-	start_idx = (user_va - (mr->mmkey.iova & page_mask)) >> page_shift;
+	start_idx = (user_va - ib_umem_start(odp)) >> page_shift;
 	access_mask = ODP_READ_ALLOWED_BIT;
 
 	if (odp->umem.writable && !downgrade)
@@ -767,11 +766,19 @@ static int pagefault_mr(struct mlx5_ib_mr *mr, u64 io_virt, size_t bcnt,
 {
 	struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem);
 
+	if (unlikely(io_virt < mr->mmkey.iova))
+		return -EFAULT;
+
 	if (!odp->is_implicit_odp) {
-		if (unlikely(io_virt < ib_umem_start(odp) ||
-			     ib_umem_end(odp) - io_virt < bcnt))
+		u64 user_va;
+
+		if (check_add_overflow(io_virt - mr->mmkey.iova,
+				       (u64)odp->umem.address, &user_va))
+			return -EFAULT;
+		if (unlikely(user_va >= ib_umem_end(odp) ||
+			     ib_umem_end(odp) - user_va < bcnt))
 			return -EFAULT;
-		return pagefault_real_mr(mr, odp, io_virt, bcnt, bytes_mapped,
+		return pagefault_real_mr(mr, odp, user_va, bcnt, bytes_mapped,
 					 flags);
 	}
 	return pagefault_implicit_mr(mr, odp, io_virt, bcnt, bytes_mapped,
--
cgit v1.2.3-59-g8ed1b
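To make the address translation in the fixed pagefault_mr() concrete, here
is a standalone, illustrative sketch using the same quantities as the
patch: io_virt is the faulting I/O virtual address, iova the MR's
mmkey.iova, and umem_address the start of the user mapping. The helper
name is hypothetical.

#include <linux/errno.h>
#include <linux/overflow.h>
#include <linux/types.h>

/*
 * Illustrative only: user_va = (io_virt - iova) + umem_address, rejecting
 * addresses below the MR's iova and guarding the addition against
 * wrap-around with check_add_overflow(), as the patch does.
 */
static int example_io_virt_to_user_va(u64 io_virt, u64 iova, u64 umem_address,
				      u64 *user_va)
{
	if (io_virt < iova)
		return -EFAULT;

	if (check_add_overflow(io_virt - iova, umem_address, user_va))
		return -EFAULT;

	return 0;
}

With iova == umem_address this reduces to user_va == io_virt, which is why
the old code appeared correct as long as userspace never set a custom IOVA.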