From d34d0bdb500e6f9d8d61d390e1000d7d153d8a04 Mon Sep 17 00:00:00 2001 From: Zhu Yanjun Date: Sun, 19 Jan 2025 18:28:29 +0100 Subject: RDMA/rxe: Replace netdev dev addr with raw_gid A TUN device has no dev_addr, but RDMA still needs a GID, so generate a raw_gid to act as the GID. This follows the same approach as commit bad5b6e34ffb ("RDMA/siw: Fabricate a GID on tun and loopback devices") in SIW. Signed-off-by: Zhu Yanjun Link: https://patch.msgid.link/20250119172831.3123110-2-yanjun.zhu@linux.dev Signed-off-by: Leon Romanovsky --- drivers/infiniband/sw/rxe/rxe.c | 15 +++++++++++++-- drivers/infiniband/sw/rxe/rxe_verbs.c | 2 +- drivers/infiniband/sw/rxe/rxe_verbs.h | 4 +++- 3 files changed, 17 insertions(+), 4 deletions(-) (limited to 'drivers') diff --git a/drivers/infiniband/sw/rxe/rxe.c b/drivers/infiniband/sw/rxe/rxe.c index 1ba4a0c8726a..432e54a29b99 100644 --- a/drivers/infiniband/sw/rxe/rxe.c +++ b/drivers/infiniband/sw/rxe/rxe.c @@ -78,8 +78,19 @@ static void rxe_init_device_param(struct rxe_dev *rxe) if (!ndev) return; + if (ndev->addr_len) { + memcpy(rxe->raw_gid, ndev->dev_addr, + min_t(unsigned int, ndev->addr_len, ETH_ALEN)); + } else { + /* + * This device does not have a HW address, but + * connection management requires a unique gid. + */ + eth_random_addr(rxe->raw_gid); + } + addrconf_addr_eui48((unsigned char *)&rxe->attr.sys_image_guid, - ndev->dev_addr); + rxe->raw_gid); dev_put(ndev); @@ -125,7 +136,7 @@ static void rxe_init_ports(struct rxe_dev *rxe) if (!ndev) return; addrconf_addr_eui48((unsigned char *)&port->port_guid, - ndev->dev_addr); + rxe->raw_gid); dev_put(ndev); spin_lock_init(&port->port_lock); } diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c index 6152a0fdfc8c..c46e94f7e86e 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.c +++ b/drivers/infiniband/sw/rxe/rxe_verbs.c @@ -1523,7 +1523,7 @@ int rxe_register_device(struct rxe_dev *rxe, const char *ibdev_name, dev->num_comp_vectors = num_possible_cpus(); dev->local_dma_lkey = 0; addrconf_addr_eui48((unsigned char *)&dev->node_guid, - ndev->dev_addr); + rxe->raw_gid); dev->uverbs_cmd_mask |= BIT_ULL(IB_USER_VERBS_CMD_POST_SEND) | BIT_ULL(IB_USER_VERBS_CMD_REQ_NOTIFY_CQ); diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.h b/drivers/infiniband/sw/rxe/rxe_verbs.h index 6573ceec0ef5..729a6ada46af 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.h +++ b/drivers/infiniband/sw/rxe/rxe_verbs.h @@ -376,7 +376,9 @@ struct rxe_dev { struct ib_device_attr attr; int max_ucontext; int max_inline_data; - struct mutex usdev_lock; + struct mutex usdev_lock; + + char raw_gid[ETH_ALEN]; struct rxe_pool uc_pool; struct rxe_pool pd_pool; -- cgit v1.2.3-59-g8ed1b From 93486fc96f0e262581ee889e41dd6ddaa95066ad Mon Sep 17 00:00:00 2001 From: Zhu Yanjun Date: Sun, 19 Jan 2025 18:28:30 +0100 Subject: RDMA/rxe: Add query_gid support The query_gid verb is not implemented in RXE. Now that raw_gid has been added, implement query_gid so the fabricated GID can be reported.
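For illustration, the reported GID can be inspected from userspace through the standard libibverbs call (a minimal sketch, assuming an opened device context ctx; the port and index values are illustrative):

	union ibv_gid gid;

	/* Read the GID at index 0 of port 1; on rxe over a TUN device
	 * this entry is expected to derive from the generated raw_gid.
	 */
	if (!ibv_query_gid(ctx, 1, 0, &gid))
		printf("gid byte 0: 0x%02x\n", gid.raw[0]);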
Signed-off-by: Zhu Yanjun Link: https://patch.msgid.link/20250119172831.3123110-3-yanjun.zhu@linux.dev Signed-off-by: Leon Romanovsky --- drivers/infiniband/sw/rxe/rxe_verbs.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'drivers') diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c index c46e94f7e86e..08a5836c2600 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.c +++ b/drivers/infiniband/sw/rxe/rxe_verbs.c @@ -80,6 +80,18 @@ err_out: return err; } +static int rxe_query_gid(struct ib_device *ibdev, u32 port, int idx, + union ib_gid *gid) +{ + struct rxe_dev *rxe = to_rdev(ibdev); + + /* subnet_prefix == interface_id == 0; */ + memset(gid, 0, sizeof(*gid)); + memcpy(gid->raw, rxe->raw_gid, ETH_ALEN); + + return 0; +} + static int rxe_query_pkey(struct ib_device *ibdev, u32 port_num, u16 index, u16 *pkey) { @@ -1493,6 +1505,7 @@ static const struct ib_device_ops rxe_dev_ops = { .query_ah = rxe_query_ah, .query_device = rxe_query_device, .query_pkey = rxe_query_pkey, + .query_gid = rxe_query_gid, .query_port = rxe_query_port, .query_qp = rxe_query_qp, .query_srq = rxe_query_srq, -- cgit v1.2.3-59-g8ed1b From 190797d47f16d2d5bd32e2d3360218111d83869d Mon Sep 17 00:00:00 2001 From: Zhu Yanjun Date: Sun, 19 Jan 2025 18:28:31 +0100 Subject: RDMA/rxe: Make rping work with tun device The type of a TUN device is ARPHRD_NONE, which cma does not recognize. Add ARPHRD_NONE support for RXE devices, just as SIW does, so that rping works over a TUN device. Signed-off-by: Zhu Yanjun Link: https://patch.msgid.link/20250119172831.3123110-4-yanjun.zhu@linux.dev Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/cma.c | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) (limited to 'drivers') diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 91db10515d74..fedcdb56fb6b 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -739,12 +739,26 @@ cma_validate_port(struct ib_device *device, u32 port, goto out; } - if (dev_type == ARPHRD_ETHER && rdma_protocol_roce(device, port)) { - ndev = dev_get_by_index(dev_addr->net, bound_if_index); - if (!ndev) - goto out; + /* + * For a RXE device, it should work with TUN device and normal ethernet + * devices. Use driver_id to check if a device is a RXE device or not. + * ARPHRD_NONE means a TUN device. + */ + if (device->ops.driver_id == RDMA_DRIVER_RXE) { + if ((dev_type == ARPHRD_NONE || dev_type == ARPHRD_ETHER) + && rdma_protocol_roce(device, port)) { + ndev = dev_get_by_index(dev_addr->net, bound_if_index); + if (!ndev) + goto out; + } } else { - gid_type = IB_GID_TYPE_IB; + if (dev_type == ARPHRD_ETHER && rdma_protocol_roce(device, port)) { + ndev = dev_get_by_index(dev_addr->net, bound_if_index); + if (!ndev) + goto out; + } else { + gid_type = IB_GID_TYPE_IB; + } } sgid_attr = rdma_find_gid_by_port(device, gid, gid_type, port, ndev); -- cgit v1.2.3-59-g8ed1b From 78683c25c80e54bf3e8015fdfb8cba2fcd03daa5 Mon Sep 17 00:00:00 2001 From: Konstantin Taranov Date: Mon, 20 Jan 2025 09:27:07 -0800 Subject: RDMA/mana_ib: Allow registration of DMA-mapped memory in PDs Allow the HW to register DMA-mapped memory for kernel-level PDs.
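For context, "kernel-level" here means PDs allocated with a NULL udata, i.e. through the in-kernel verbs API rather than uverbs. A hedged sketch of such a caller (generic kernel ULP code, not mana-specific):

	struct ib_pd *pd;

	/* In-kernel allocation: the driver sees udata == NULL and can
	 * set GDMA_PD_FLAG_ALLOW_GPA_MR on the PD.
	 */
	pd = ib_alloc_pd(ibdev, 0);
	if (IS_ERR(pd))
		return PTR_ERR(pd);

If the device does not advertise IB_DEVICE_LOCAL_DMA_LKEY, the core then obtains pd->local_dma_lkey through the driver's get_dma_mr() callback, which the next patch implements; that is why the PD must allow GPA MRs.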
Signed-off-by: Konstantin Taranov Link: https://patch.msgid.link/1737394039-28772-2-git-send-email-kotaranov@linux.microsoft.com Reviewed-by: Shiraz Saleem Reviewed-by: Long Li Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mana/main.c | 3 +++ include/net/mana/gdma.h | 1 + 2 files changed, 4 insertions(+) (limited to 'drivers') diff --git a/drivers/infiniband/hw/mana/main.c b/drivers/infiniband/hw/mana/main.c index 67c2d43135a8..45b251b91131 100644 --- a/drivers/infiniband/hw/mana/main.c +++ b/drivers/infiniband/hw/mana/main.c @@ -82,6 +82,9 @@ int mana_ib_alloc_pd(struct ib_pd *ibpd, struct ib_udata *udata) mana_gd_init_req_hdr(&req.hdr, GDMA_CREATE_PD, sizeof(req), sizeof(resp)); + if (!udata) + flags |= GDMA_PD_FLAG_ALLOW_GPA_MR; + req.flags = flags; err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp); diff --git a/include/net/mana/gdma.h b/include/net/mana/gdma.h index 90f56656b572..03e1b257953e 100644 --- a/include/net/mana/gdma.h +++ b/include/net/mana/gdma.h @@ -775,6 +775,7 @@ struct gdma_destroy_dma_region_req { enum gdma_pd_flags { GDMA_PD_FLAG_INVALID = 0, + GDMA_PD_FLAG_ALLOW_GPA_MR = 1, }; struct gdma_create_pd_req { -- cgit v1.2.3-59-g8ed1b From 6e1b8bdcd04f4e84c924489e2f837cf620002b69 Mon Sep 17 00:00:00 2001 From: Konstantin Taranov Date: Mon, 20 Jan 2025 09:27:08 -0800 Subject: RDMA/mana_ib: implement get_dma_mr Implement allocation of DMA-mapped memory regions. Signed-off-by: Konstantin Taranov Link: https://patch.msgid.link/1737394039-28772-3-git-send-email-kotaranov@linux.microsoft.com Reviewed-by: Shiraz Saleem Reviewed-by: Long Li Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mana/device.c | 1 + drivers/infiniband/hw/mana/mr.c | 36 ++++++++++++++++++++++++++++++++++++ include/net/mana/gdma.h | 5 +++++ 3 files changed, 42 insertions(+) (limited to 'drivers') diff --git a/drivers/infiniband/hw/mana/device.c b/drivers/infiniband/hw/mana/device.c index 3416a85f8738..681e243b349c 100644 --- a/drivers/infiniband/hw/mana/device.c +++ b/drivers/infiniband/hw/mana/device.c @@ -32,6 +32,7 @@ static const struct ib_device_ops mana_ib_dev_ops = { .destroy_rwq_ind_table = mana_ib_destroy_rwq_ind_table, .destroy_wq = mana_ib_destroy_wq, .disassociate_ucontext = mana_ib_disassociate_ucontext, + .get_dma_mr = mana_ib_get_dma_mr, .get_link_layer = mana_ib_get_link_layer, .get_port_immutable = mana_ib_get_port_immutable, .mmap = mana_ib_mmap, diff --git a/drivers/infiniband/hw/mana/mr.c b/drivers/infiniband/hw/mana/mr.c index 887b09dd86e7..3a047f8c9fc5 100644 --- a/drivers/infiniband/hw/mana/mr.c +++ b/drivers/infiniband/hw/mana/mr.c @@ -8,6 +8,8 @@ #define VALID_MR_FLAGS \ (IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ) +#define VALID_DMA_MR_FLAGS (IB_ACCESS_LOCAL_WRITE) + static enum gdma_mr_access_flags mana_ib_verbs_to_gdma_access_flags(int access_flags) { @@ -39,6 +41,8 @@ static int mana_ib_gd_create_mr(struct mana_ib_dev *dev, struct mana_ib_mr *mr, req.mr_type = mr_params->mr_type; switch (mr_params->mr_type) { + case GDMA_MR_TYPE_GPA: + break; case GDMA_MR_TYPE_GVA: req.gva.dma_region_handle = mr_params->gva.dma_region_handle; req.gva.virtual_address = mr_params->gva.virtual_address; @@ -169,6 +173,38 @@ err_free: return ERR_PTR(err); } +struct ib_mr *mana_ib_get_dma_mr(struct ib_pd *ibpd, int access_flags) +{ + struct mana_ib_pd *pd = container_of(ibpd, struct mana_ib_pd, ibpd); + struct gdma_create_mr_params mr_params = {}; + struct ib_device *ibdev = ibpd->device; + struct mana_ib_dev *dev; + struct mana_ib_mr 
*mr; + int err; + + dev = container_of(ibdev, struct mana_ib_dev, ib_dev); + + if (access_flags & ~VALID_DMA_MR_FLAGS) + return ERR_PTR(-EINVAL); + + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) + return ERR_PTR(-ENOMEM); + + mr_params.pd_handle = pd->pd_handle; + mr_params.mr_type = GDMA_MR_TYPE_GPA; + + err = mana_ib_gd_create_mr(dev, mr, &mr_params); + if (err) + goto err_free; + + return &mr->ibmr; + +err_free: + kfree(mr); + return ERR_PTR(err); +} + int mana_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata) { struct mana_ib_mr *mr = container_of(ibmr, struct mana_ib_mr, ibmr); diff --git a/include/net/mana/gdma.h b/include/net/mana/gdma.h index 03e1b257953e..a94b04ea926f 100644 --- a/include/net/mana/gdma.h +++ b/include/net/mana/gdma.h @@ -801,6 +801,11 @@ struct gdma_destory_pd_resp { };/* HW DATA */ enum gdma_mr_type { + /* + * Guest Physical Address - MRs of this type allow access + * to any DMA-mapped memory using bus-logical address + */ + GDMA_MR_TYPE_GPA = 1, /* Guest Virtual Address - MRs of this type allow access * to memory mapped by PTEs associated with this MR using a virtual * address that is set up in the MST -- cgit v1.2.3-59-g8ed1b From 1440bdbd9c4ee2a7461a5e547c4f7c27b4740f91 Mon Sep 17 00:00:00 2001 From: Konstantin Taranov Date: Mon, 20 Jan 2025 09:27:09 -0800 Subject: RDMA/mana_ib: helpers to allocate kernel queues Introduce helpers to allocate queues for kernel-level use. Signed-off-by: Konstantin Taranov Link: https://patch.msgid.link/1737394039-28772-4-git-send-email-kotaranov@linux.microsoft.com Reviewed-by: Shiraz Saleem Reviewed-by: Long Li Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mana/main.c | 23 +++++++++++++++++++++++ drivers/infiniband/hw/mana/mana_ib.h | 3 +++ drivers/net/ethernet/microsoft/mana/gdma_main.c | 1 + 3 files changed, 27 insertions(+) (limited to 'drivers') diff --git a/drivers/infiniband/hw/mana/main.c b/drivers/infiniband/hw/mana/main.c index 45b251b91131..f2f6bb352048 100644 --- a/drivers/infiniband/hw/mana/main.c +++ b/drivers/infiniband/hw/mana/main.c @@ -240,6 +240,27 @@ void mana_ib_dealloc_ucontext(struct ib_ucontext *ibcontext) ibdev_dbg(ibdev, "Failed to destroy doorbell page %d\n", ret); } +int mana_ib_create_kernel_queue(struct mana_ib_dev *mdev, u32 size, enum gdma_queue_type type, + struct mana_ib_queue *queue) +{ + struct gdma_context *gc = mdev_to_gc(mdev); + struct gdma_queue_spec spec = {}; + int err; + + queue->id = INVALID_QUEUE_ID; + queue->gdma_region = GDMA_INVALID_DMA_REGION; + spec.type = type; + spec.monitor_avl_buf = false; + spec.queue_size = size; + err = mana_gd_create_mana_wq_cq(&gc->mana_ib, &spec, &queue->kmem); + if (err) + return err; + /* take ownership into mana_ib from mana */ + queue->gdma_region = queue->kmem->mem_info.dma_region_handle; + queue->kmem->mem_info.dma_region_handle = GDMA_INVALID_DMA_REGION; + return 0; +} + int mana_ib_create_queue(struct mana_ib_dev *mdev, u64 addr, u32 size, struct mana_ib_queue *queue) { @@ -279,6 +300,8 @@ void mana_ib_destroy_queue(struct mana_ib_dev *mdev, struct mana_ib_queue *queue */ mana_ib_gd_destroy_dma_region(mdev, queue->gdma_region); ib_umem_release(queue->umem); + if (queue->kmem) + mana_gd_destroy_queue(mdev_to_gc(mdev), queue->kmem); } static int diff --git a/drivers/infiniband/hw/mana/mana_ib.h b/drivers/infiniband/hw/mana/mana_ib.h index b53a5b4de908..79ebd9598006 100644 --- a/drivers/infiniband/hw/mana/mana_ib.h +++ b/drivers/infiniband/hw/mana/mana_ib.h @@ -52,6 +52,7 @@ struct mana_ib_adapter_caps { struct mana_ib_queue 
{ struct ib_umem *umem; + struct gdma_queue *kmem; u64 gdma_region; u64 id; }; @@ -388,6 +389,8 @@ int mana_ib_create_dma_region(struct mana_ib_dev *dev, struct ib_umem *umem, int mana_ib_gd_destroy_dma_region(struct mana_ib_dev *dev, mana_handle_t gdma_region); +int mana_ib_create_kernel_queue(struct mana_ib_dev *mdev, u32 size, enum gdma_queue_type type, + struct mana_ib_queue *queue); int mana_ib_create_queue(struct mana_ib_dev *mdev, u64 addr, u32 size, struct mana_ib_queue *queue); void mana_ib_destroy_queue(struct mana_ib_dev *mdev, struct mana_ib_queue *queue); diff --git a/drivers/net/ethernet/microsoft/mana/gdma_main.c b/drivers/net/ethernet/microsoft/mana/gdma_main.c index be95336ce089..4e71987bbf2c 100644 --- a/drivers/net/ethernet/microsoft/mana/gdma_main.c +++ b/drivers/net/ethernet/microsoft/mana/gdma_main.c @@ -867,6 +867,7 @@ free_q: kfree(queue); return err; } +EXPORT_SYMBOL_NS(mana_gd_create_mana_wq_cq, "NET_MANA"); void mana_gd_destroy_queue(struct gdma_context *gc, struct gdma_queue *queue) { -- cgit v1.2.3-59-g8ed1b From bec127e45d9fd0080df679835d041615b0023ea6 Mon Sep 17 00:00:00 2001 From: Konstantin Taranov Date: Mon, 20 Jan 2025 09:27:10 -0800 Subject: RDMA/mana_ib: create kernel-level CQs Implement creation of CQs for the kernel. Signed-off-by: Konstantin Taranov Link: https://patch.msgid.link/1737394039-28772-5-git-send-email-kotaranov@linux.microsoft.com Reviewed-by: Shiraz Saleem Reviewed-by: Long Li Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mana/cq.c | 80 ++++++++++++++++++++++++++--------------- 1 file changed, 52 insertions(+), 28 deletions(-) (limited to 'drivers') diff --git a/drivers/infiniband/hw/mana/cq.c b/drivers/infiniband/hw/mana/cq.c index f04a679d2871..d26d82d9f8ce 100644 --- a/drivers/infiniband/hw/mana/cq.c +++ b/drivers/infiniband/hw/mana/cq.c @@ -15,42 +15,57 @@ int mana_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, struct ib_device *ibdev = ibcq->device; struct mana_ib_create_cq ucmd = {}; struct mana_ib_dev *mdev; + struct gdma_context *gc; bool is_rnic_cq; u32 doorbell; + u32 buf_size; int err; mdev = container_of(ibdev, struct mana_ib_dev, ib_dev); + gc = mdev_to_gc(mdev); cq->comp_vector = attr->comp_vector % ibdev->num_comp_vectors; cq->cq_handle = INVALID_MANA_HANDLE; - if (udata->inlen < offsetof(struct mana_ib_create_cq, flags)) - return -EINVAL; + if (udata) { + if (udata->inlen < offsetof(struct mana_ib_create_cq, flags)) + return -EINVAL; - err = ib_copy_from_udata(&ucmd, udata, min(sizeof(ucmd), udata->inlen)); - if (err) { - ibdev_dbg(ibdev, - "Failed to copy from udata for create cq, %d\n", err); - return err; - } + err = ib_copy_from_udata(&ucmd, udata, min(sizeof(ucmd), udata->inlen)); + if (err) { + ibdev_dbg(ibdev, "Failed to copy from udata for create cq, %d\n", err); + return err; + } - is_rnic_cq = !!(ucmd.flags & MANA_IB_CREATE_RNIC_CQ); + is_rnic_cq = !!(ucmd.flags & MANA_IB_CREATE_RNIC_CQ); - if (!is_rnic_cq && attr->cqe > mdev->adapter_caps.max_qp_wr) { - ibdev_dbg(ibdev, "CQE %d exceeding limit\n", attr->cqe); - return -EINVAL; - } + if (!is_rnic_cq && attr->cqe > mdev->adapter_caps.max_qp_wr) { + ibdev_dbg(ibdev, "CQE %d exceeding limit\n", attr->cqe); + return -EINVAL; + } - cq->cqe = attr->cqe; - err = mana_ib_create_queue(mdev, ucmd.buf_addr, cq->cqe * COMP_ENTRY_SIZE, &cq->queue); - if (err) { - ibdev_dbg(ibdev, "Failed to create queue for create cq, %d\n", err); - return err; - } + cq->cqe = attr->cqe; + err = mana_ib_create_queue(mdev, ucmd.buf_addr, cq->cqe * 
COMP_ENTRY_SIZE, + &cq->queue); + if (err) { + ibdev_dbg(ibdev, "Failed to create queue for create cq, %d\n", err); + return err; + } - mana_ucontext = rdma_udata_to_drv_context(udata, struct mana_ib_ucontext, - ibucontext); - doorbell = mana_ucontext->doorbell; + mana_ucontext = rdma_udata_to_drv_context(udata, struct mana_ib_ucontext, + ibucontext); + doorbell = mana_ucontext->doorbell; + } else { + is_rnic_cq = true; + buf_size = MANA_PAGE_ALIGN(roundup_pow_of_two(attr->cqe * COMP_ENTRY_SIZE)); + cq->cqe = buf_size / COMP_ENTRY_SIZE; + err = mana_ib_create_kernel_queue(mdev, buf_size, GDMA_CQ, &cq->queue); + if (err) { + ibdev_dbg(ibdev, "Failed to create kernel queue for create cq, %d\n", err); + return err; + } + doorbell = gc->mana_ib.doorbell; + } if (is_rnic_cq) { err = mana_ib_gd_create_cq(mdev, cq, doorbell); @@ -66,11 +81,13 @@ int mana_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, } } - resp.cqid = cq->queue.id; - err = ib_copy_to_udata(udata, &resp, min(sizeof(resp), udata->outlen)); - if (err) { - ibdev_dbg(&mdev->ib_dev, "Failed to copy to udata, %d\n", err); - goto err_remove_cq_cb; + if (udata) { + resp.cqid = cq->queue.id; + err = ib_copy_to_udata(udata, &resp, min(sizeof(resp), udata->outlen)); + if (err) { + ibdev_dbg(&mdev->ib_dev, "Failed to copy to udata, %d\n", err); + goto err_remove_cq_cb; + } } return 0; @@ -122,7 +139,10 @@ int mana_ib_install_cq_cb(struct mana_ib_dev *mdev, struct mana_ib_cq *cq) return -EINVAL; /* Create CQ table entry */ WARN_ON(gc->cq_table[cq->queue.id]); - gdma_cq = kzalloc(sizeof(*gdma_cq), GFP_KERNEL); + if (cq->queue.kmem) + gdma_cq = cq->queue.kmem; + else + gdma_cq = kzalloc(sizeof(*gdma_cq), GFP_KERNEL); if (!gdma_cq) return -ENOMEM; @@ -141,6 +161,10 @@ void mana_ib_remove_cq_cb(struct mana_ib_dev *mdev, struct mana_ib_cq *cq) if (cq->queue.id >= gc->max_num_cqs || cq->queue.id == INVALID_QUEUE_ID) return; + if (cq->queue.kmem) + /* Then it will be cleaned up and removed by mana */ + return; + kfree(gc->cq_table[cq->queue.id]); gc->cq_table[cq->queue.id] = NULL; } -- cgit v1.2.3-59-g8ed1b From 7f5192a82b37fd70050b7f6c5c119edf4740e8e4 Mon Sep 17 00:00:00 2001 From: Konstantin Taranov Date: Mon, 20 Jan 2025 09:27:11 -0800 Subject: RDMA/mana_ib: Create and destroy UD/GSI QP Implement HW requests to create and destroy UD/GSI QPs. A UD/GSI QP has send and receive queues.
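These requests are reached through the standard verbs path; a kernel consumer would drive them roughly as follows (a hedged sketch with illustrative capacities; pd, scq and rcq are assumed to exist):

	struct ib_qp_init_attr init_attr = {
		.qp_type  = IB_QPT_UD,
		.send_cq  = scq,
		.recv_cq  = rcq,
		.cap = {
			.max_send_wr  = 64,
			.max_recv_wr  = 64,
			.max_send_sge = 1,
			.max_recv_sge = 1,
		},
	};
	struct ib_qp *qp;

	qp = ib_create_qp(pd, &init_attr);	/* ends up in the create request added here */
	if (IS_ERR(qp))
		return PTR_ERR(qp);
	/* ... use the QP ... */
	ib_destroy_qp(qp);			/* ends up in the destroy request added here */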
Signed-off-by: Konstantin Taranov Link: https://patch.msgid.link/1737394039-28772-6-git-send-email-kotaranov@linux.microsoft.com Reviewed-by: Shiraz Saleem Reviewed-by: Long Li Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mana/main.c | 58 ++++++++++++++++++++++++++++++++++++ drivers/infiniband/hw/mana/mana_ib.h | 49 ++++++++++++++++++++++++++++++ 2 files changed, 107 insertions(+) (limited to 'drivers') diff --git a/drivers/infiniband/hw/mana/main.c b/drivers/infiniband/hw/mana/main.c index f2f6bb352048..b0c55cb80a88 100644 --- a/drivers/infiniband/hw/mana/main.c +++ b/drivers/infiniband/hw/mana/main.c @@ -1013,3 +1013,61 @@ int mana_ib_gd_destroy_rc_qp(struct mana_ib_dev *mdev, struct mana_ib_qp *qp) } return 0; } + +int mana_ib_gd_create_ud_qp(struct mana_ib_dev *mdev, struct mana_ib_qp *qp, + struct ib_qp_init_attr *attr, u32 doorbell, u32 type) +{ + struct mana_ib_cq *send_cq = container_of(qp->ibqp.send_cq, struct mana_ib_cq, ibcq); + struct mana_ib_cq *recv_cq = container_of(qp->ibqp.recv_cq, struct mana_ib_cq, ibcq); + struct mana_ib_pd *pd = container_of(qp->ibqp.pd, struct mana_ib_pd, ibpd); + struct gdma_context *gc = mdev_to_gc(mdev); + struct mana_rnic_create_udqp_resp resp = {}; + struct mana_rnic_create_udqp_req req = {}; + int err, i; + + mana_gd_init_req_hdr(&req.hdr, MANA_IB_CREATE_UD_QP, sizeof(req), sizeof(resp)); + req.hdr.dev_id = gc->mana_ib.dev_id; + req.adapter = mdev->adapter_handle; + req.pd_handle = pd->pd_handle; + req.send_cq_handle = send_cq->cq_handle; + req.recv_cq_handle = recv_cq->cq_handle; + for (i = 0; i < MANA_UD_QUEUE_TYPE_MAX; i++) + req.dma_region[i] = qp->ud_qp.queues[i].gdma_region; + req.doorbell_page = doorbell; + req.max_send_wr = attr->cap.max_send_wr; + req.max_recv_wr = attr->cap.max_recv_wr; + req.max_send_sge = attr->cap.max_send_sge; + req.max_recv_sge = attr->cap.max_recv_sge; + req.qp_type = type; + err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp); + if (err) { + ibdev_err(&mdev->ib_dev, "Failed to create ud qp err %d", err); + return err; + } + qp->qp_handle = resp.qp_handle; + for (i = 0; i < MANA_UD_QUEUE_TYPE_MAX; i++) { + qp->ud_qp.queues[i].id = resp.queue_ids[i]; + /* The GDMA regions are now owned by the RNIC QP handle */ + qp->ud_qp.queues[i].gdma_region = GDMA_INVALID_DMA_REGION; + } + return 0; +} + +int mana_ib_gd_destroy_ud_qp(struct mana_ib_dev *mdev, struct mana_ib_qp *qp) +{ + struct mana_rnic_destroy_udqp_resp resp = {0}; + struct mana_rnic_destroy_udqp_req req = {0}; + struct gdma_context *gc = mdev_to_gc(mdev); + int err; + + mana_gd_init_req_hdr(&req.hdr, MANA_IB_DESTROY_UD_QP, sizeof(req), sizeof(resp)); + req.hdr.dev_id = gc->mana_ib.dev_id; + req.adapter = mdev->adapter_handle; + req.qp_handle = qp->qp_handle; + err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp); + if (err) { + ibdev_err(&mdev->ib_dev, "Failed to destroy ud qp err %d", err); + return err; + } + return 0; +} diff --git a/drivers/infiniband/hw/mana/mana_ib.h b/drivers/infiniband/hw/mana/mana_ib.h index 79ebd9598006..5e470f15435d 100644 --- a/drivers/infiniband/hw/mana/mana_ib.h +++ b/drivers/infiniband/hw/mana/mana_ib.h @@ -115,6 +115,17 @@ struct mana_ib_rc_qp { struct mana_ib_queue queues[MANA_RC_QUEUE_TYPE_MAX]; }; +enum mana_ud_queue_type { + MANA_UD_SEND_QUEUE = 0, + MANA_UD_RECV_QUEUE, + MANA_UD_QUEUE_TYPE_MAX, +}; + +struct mana_ib_ud_qp { + struct mana_ib_queue queues[MANA_UD_QUEUE_TYPE_MAX]; + u32 sq_psn; +}; + struct mana_ib_qp { struct ib_qp ibqp; @@ -122,6 +133,7 @@ struct mana_ib_qp 
{ union { struct mana_ib_queue raw_sq; struct mana_ib_rc_qp rc_qp; + struct mana_ib_ud_qp ud_qp; }; /* The port on the IB device, starting with 1 */ @@ -146,6 +158,8 @@ enum mana_ib_command_code { MANA_IB_DESTROY_ADAPTER = 0x30003, MANA_IB_CONFIG_IP_ADDR = 0x30004, MANA_IB_CONFIG_MAC_ADDR = 0x30005, + MANA_IB_CREATE_UD_QP = 0x30006, + MANA_IB_DESTROY_UD_QP = 0x30007, MANA_IB_CREATE_CQ = 0x30008, MANA_IB_DESTROY_CQ = 0x30009, MANA_IB_CREATE_RC_QP = 0x3000a, @@ -297,6 +311,37 @@ struct mana_rnic_destroy_rc_qp_resp { struct gdma_resp_hdr hdr; }; /* HW Data */ +struct mana_rnic_create_udqp_req { + struct gdma_req_hdr hdr; + mana_handle_t adapter; + mana_handle_t pd_handle; + mana_handle_t send_cq_handle; + mana_handle_t recv_cq_handle; + u64 dma_region[MANA_UD_QUEUE_TYPE_MAX]; + u32 qp_type; + u32 doorbell_page; + u32 max_send_wr; + u32 max_recv_wr; + u32 max_send_sge; + u32 max_recv_sge; +}; /* HW Data */ + +struct mana_rnic_create_udqp_resp { + struct gdma_resp_hdr hdr; + mana_handle_t qp_handle; + u32 queue_ids[MANA_UD_QUEUE_TYPE_MAX]; +}; /* HW Data*/ + +struct mana_rnic_destroy_udqp_req { + struct gdma_req_hdr hdr; + mana_handle_t adapter; + mana_handle_t qp_handle; +}; /* HW Data */ + +struct mana_rnic_destroy_udqp_resp { + struct gdma_resp_hdr hdr; +}; /* HW Data */ + struct mana_ib_ah_attr { u8 src_addr[16]; u8 dest_addr[16]; @@ -483,4 +528,8 @@ int mana_ib_gd_destroy_cq(struct mana_ib_dev *mdev, struct mana_ib_cq *cq); int mana_ib_gd_create_rc_qp(struct mana_ib_dev *mdev, struct mana_ib_qp *qp, struct ib_qp_init_attr *attr, u32 doorbell, u64 flags); int mana_ib_gd_destroy_rc_qp(struct mana_ib_dev *mdev, struct mana_ib_qp *qp); + +int mana_ib_gd_create_ud_qp(struct mana_ib_dev *mdev, struct mana_ib_qp *qp, + struct ib_qp_init_attr *attr, u32 doorbell, u32 type); +int mana_ib_gd_destroy_ud_qp(struct mana_ib_dev *mdev, struct mana_ib_qp *qp); #endif -- cgit v1.2.3-59-g8ed1b From bd4ee700870a732c62f32cbc102c79f75b5e88f1 Mon Sep 17 00:00:00 2001 From: Konstantin Taranov Date: Mon, 20 Jan 2025 09:27:12 -0800 Subject: RDMA/mana_ib: UD/GSI QP creation for kernel Implement UD/GSI QPs for the kernel. Allow create/modify/destroy for such QPs. 
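The modify path follows the usual UD state machine; for reference, a sketch of the transitions a kernel consumer performs (attribute masks per the IB spec; pkey index, port and qkey values are illustrative):

	struct ib_qp_attr attr = {};

	attr.qp_state = IB_QPS_INIT;
	attr.pkey_index = 0;
	attr.port_num = 1;
	attr.qkey = IB_QP1_QKEY;
	ib_modify_qp(qp, &attr, IB_QP_STATE | IB_QP_PKEY_INDEX |
				IB_QP_PORT | IB_QP_QKEY);

	attr.qp_state = IB_QPS_RTR;
	ib_modify_qp(qp, &attr, IB_QP_STATE);

	attr.qp_state = IB_QPS_RTS;
	attr.sq_psn = 0;
	ib_modify_qp(qp, &attr, IB_QP_STATE | IB_QP_SQ_PSN);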
Signed-off-by: Konstantin Taranov Link: https://patch.msgid.link/1737394039-28772-7-git-send-email-kotaranov@linux.microsoft.com Reviewed-by: Shiraz Saleem Reviewed-by: Long Li Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mana/qp.c | 115 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 115 insertions(+) (limited to 'drivers') diff --git a/drivers/infiniband/hw/mana/qp.c b/drivers/infiniband/hw/mana/qp.c index 73d67c853b6f..192b6eef5bc7 100644 --- a/drivers/infiniband/hw/mana/qp.c +++ b/drivers/infiniband/hw/mana/qp.c @@ -398,6 +398,52 @@ err_free_vport: return err; } +static u32 mana_ib_wqe_size(u32 sge, u32 oob_size) +{ + u32 wqe_size = sge * sizeof(struct gdma_sge) + sizeof(struct gdma_wqe) + oob_size; + + return ALIGN(wqe_size, GDMA_WQE_BU_SIZE); +} + +static u32 mana_ib_queue_size(struct ib_qp_init_attr *attr, u32 queue_type) +{ + u32 queue_size; + + switch (attr->qp_type) { + case IB_QPT_UD: + case IB_QPT_GSI: + if (queue_type == MANA_UD_SEND_QUEUE) + queue_size = attr->cap.max_send_wr * + mana_ib_wqe_size(attr->cap.max_send_sge, INLINE_OOB_LARGE_SIZE); + else + queue_size = attr->cap.max_recv_wr * + mana_ib_wqe_size(attr->cap.max_recv_sge, INLINE_OOB_SMALL_SIZE); + break; + default: + return 0; + } + + return MANA_PAGE_ALIGN(roundup_pow_of_two(queue_size)); +} + +static enum gdma_queue_type mana_ib_queue_type(struct ib_qp_init_attr *attr, u32 queue_type) +{ + enum gdma_queue_type type; + + switch (attr->qp_type) { + case IB_QPT_UD: + case IB_QPT_GSI: + if (queue_type == MANA_UD_SEND_QUEUE) + type = GDMA_SQ; + else + type = GDMA_RQ; + break; + default: + type = GDMA_INVALID_QUEUE; + } + return type; +} + static int mana_table_store_qp(struct mana_ib_dev *mdev, struct mana_ib_qp *qp) { refcount_set(&qp->refcount, 1); @@ -490,6 +536,51 @@ destroy_queues: return err; } +static int mana_ib_create_ud_qp(struct ib_qp *ibqp, struct ib_pd *ibpd, + struct ib_qp_init_attr *attr, struct ib_udata *udata) +{ + struct mana_ib_dev *mdev = container_of(ibpd->device, struct mana_ib_dev, ib_dev); + struct mana_ib_qp *qp = container_of(ibqp, struct mana_ib_qp, ibqp); + struct gdma_context *gc = mdev_to_gc(mdev); + u32 doorbell, queue_size; + int i, err; + + if (udata) { + ibdev_dbg(&mdev->ib_dev, "User-level UD QPs are not supported\n"); + return -EOPNOTSUPP; + } + + for (i = 0; i < MANA_UD_QUEUE_TYPE_MAX; ++i) { + queue_size = mana_ib_queue_size(attr, i); + err = mana_ib_create_kernel_queue(mdev, queue_size, mana_ib_queue_type(attr, i), + &qp->ud_qp.queues[i]); + if (err) { + ibdev_err(&mdev->ib_dev, "Failed to create queue %d, err %d\n", + i, err); + goto destroy_queues; + } + } + doorbell = gc->mana_ib.doorbell; + + err = mana_ib_gd_create_ud_qp(mdev, qp, attr, doorbell, attr->qp_type); + if (err) { + ibdev_err(&mdev->ib_dev, "Failed to create ud qp %d\n", err); + goto destroy_queues; + } + qp->ibqp.qp_num = qp->ud_qp.queues[MANA_UD_RECV_QUEUE].id; + qp->port = attr->port_num; + + for (i = 0; i < MANA_UD_QUEUE_TYPE_MAX; ++i) + qp->ud_qp.queues[i].kmem->id = qp->ud_qp.queues[i].id; + + return 0; + +destroy_queues: + while (i-- > 0) + mana_ib_destroy_queue(mdev, &qp->ud_qp.queues[i]); + return err; +} + int mana_ib_create_qp(struct ib_qp *ibqp, struct ib_qp_init_attr *attr, struct ib_udata *udata) { @@ -503,6 +594,9 @@ int mana_ib_create_qp(struct ib_qp *ibqp, struct ib_qp_init_attr *attr, return mana_ib_create_qp_raw(ibqp, ibqp->pd, attr, udata); case IB_QPT_RC: return mana_ib_create_rc_qp(ibqp, ibqp->pd, attr, udata); + case IB_QPT_UD: + case IB_QPT_GSI: + return 
mana_ib_create_ud_qp(ibqp, ibqp->pd, attr, udata); default: ibdev_dbg(ibqp->device, "Creating QP type %u not supported\n", attr->qp_type); @@ -579,6 +673,8 @@ int mana_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, { switch (ibqp->qp_type) { case IB_QPT_RC: + case IB_QPT_UD: + case IB_QPT_GSI: return mana_ib_gd_modify_qp(ibqp, attr, attr_mask, udata); default: ibdev_dbg(ibqp->device, "Modify QP type %u not supported", ibqp->qp_type); @@ -652,6 +748,22 @@ static int mana_ib_destroy_rc_qp(struct mana_ib_qp *qp, struct ib_udata *udata) return 0; } +static int mana_ib_destroy_ud_qp(struct mana_ib_qp *qp, struct ib_udata *udata) +{ + struct mana_ib_dev *mdev = + container_of(qp->ibqp.device, struct mana_ib_dev, ib_dev); + int i; + + /* Ignore return code as there is not much we can do about it. + * The error message is printed inside. + */ + mana_ib_gd_destroy_ud_qp(mdev, qp); + for (i = 0; i < MANA_UD_QUEUE_TYPE_MAX; ++i) + mana_ib_destroy_queue(mdev, &qp->ud_qp.queues[i]); + + return 0; +} + int mana_ib_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata) { struct mana_ib_qp *qp = container_of(ibqp, struct mana_ib_qp, ibqp); @@ -665,6 +777,9 @@ int mana_ib_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata) return mana_ib_destroy_qp_raw(qp, udata); case IB_QPT_RC: return mana_ib_destroy_rc_qp(qp, udata); + case IB_QPT_UD: + case IB_QPT_GSI: + return mana_ib_destroy_ud_qp(qp, udata); default: ibdev_dbg(ibqp->device, "Unexpected QP type %u\n", ibqp->qp_type); -- cgit v1.2.3-59-g8ed1b From df91c470d9e57b99e7d918dc89e1c13949f7b95e Mon Sep 17 00:00:00 2001 From: Konstantin Taranov Date: Mon, 20 Jan 2025 09:27:13 -0800 Subject: RDMA/mana_ib: create/destroy AH Implement create and destroy AH for kernel. In mana_ib, AV is passed as an sge in WQE. Allocate DMA memory and write an AV there. Signed-off-by: Konstantin Taranov Link: https://patch.msgid.link/1737394039-28772-8-git-send-email-kotaranov@linux.microsoft.com Reviewed-by: Shiraz Saleem Reviewed-by: Long Li Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mana/Makefile | 2 +- drivers/infiniband/hw/mana/ah.c | 58 ++++++++++++++++++++++++++++++++++++ drivers/infiniband/hw/mana/device.c | 13 +++++++- drivers/infiniband/hw/mana/mana_ib.h | 30 +++++++++++++++++++ 4 files changed, 101 insertions(+), 2 deletions(-) create mode 100644 drivers/infiniband/hw/mana/ah.c (limited to 'drivers') diff --git a/drivers/infiniband/hw/mana/Makefile b/drivers/infiniband/hw/mana/Makefile index 88655fe5e398..6e56f778d6db 100644 --- a/drivers/infiniband/hw/mana/Makefile +++ b/drivers/infiniband/hw/mana/Makefile @@ -1,4 +1,4 @@ # SPDX-License-Identifier: GPL-2.0-only obj-$(CONFIG_MANA_INFINIBAND) += mana_ib.o -mana_ib-y := device.o main.o wq.o qp.o cq.o mr.o +mana_ib-y := device.o main.o wq.o qp.o cq.o mr.o ah.o diff --git a/drivers/infiniband/hw/mana/ah.c b/drivers/infiniband/hw/mana/ah.c new file mode 100644 index 000000000000..f56952eebbaa --- /dev/null +++ b/drivers/infiniband/hw/mana/ah.c @@ -0,0 +1,58 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2024, Microsoft Corporation. All rights reserved. 
+ */ + +#include "mana_ib.h" + +int mana_ib_create_ah(struct ib_ah *ibah, struct rdma_ah_init_attr *attr, + struct ib_udata *udata) +{ + struct mana_ib_dev *mdev = container_of(ibah->device, struct mana_ib_dev, ib_dev); + struct mana_ib_ah *ah = container_of(ibah, struct mana_ib_ah, ibah); + struct rdma_ah_attr *ah_attr = attr->ah_attr; + const struct ib_global_route *grh; + enum rdma_network_type ntype; + + if (ah_attr->type != RDMA_AH_ATTR_TYPE_ROCE || + !(rdma_ah_get_ah_flags(ah_attr) & IB_AH_GRH)) + return -EINVAL; + + if (udata) + return -EINVAL; + + ah->av = dma_pool_zalloc(mdev->av_pool, GFP_ATOMIC, &ah->dma_handle); + if (!ah->av) + return -ENOMEM; + + grh = rdma_ah_read_grh(ah_attr); + ntype = rdma_gid_attr_network_type(grh->sgid_attr); + + copy_in_reverse(ah->av->dest_mac, ah_attr->roce.dmac, ETH_ALEN); + ah->av->udp_src_port = rdma_flow_label_to_udp_sport(grh->flow_label); + ah->av->hop_limit = grh->hop_limit; + ah->av->dscp = (grh->traffic_class >> 2) & 0x3f; + ah->av->is_ipv6 = (ntype == RDMA_NETWORK_IPV6); + + if (ah->av->is_ipv6) { + copy_in_reverse(ah->av->dest_ip, grh->dgid.raw, 16); + copy_in_reverse(ah->av->src_ip, grh->sgid_attr->gid.raw, 16); + } else { + ah->av->dest_ip[10] = 0xFF; + ah->av->dest_ip[11] = 0xFF; + copy_in_reverse(&ah->av->dest_ip[12], &grh->dgid.raw[12], 4); + copy_in_reverse(&ah->av->src_ip[12], &grh->sgid_attr->gid.raw[12], 4); + } + + return 0; +} + +int mana_ib_destroy_ah(struct ib_ah *ibah, u32 flags) +{ + struct mana_ib_dev *mdev = container_of(ibah->device, struct mana_ib_dev, ib_dev); + struct mana_ib_ah *ah = container_of(ibah, struct mana_ib_ah, ibah); + + dma_pool_free(mdev->av_pool, ah->av, ah->dma_handle); + + return 0; +} diff --git a/drivers/infiniband/hw/mana/device.c b/drivers/infiniband/hw/mana/device.c index 681e243b349c..df33aab8d990 100644 --- a/drivers/infiniband/hw/mana/device.c +++ b/drivers/infiniband/hw/mana/device.c @@ -19,6 +19,7 @@ static const struct ib_device_ops mana_ib_dev_ops = { .add_gid = mana_ib_gd_add_gid, .alloc_pd = mana_ib_alloc_pd, .alloc_ucontext = mana_ib_alloc_ucontext, + .create_ah = mana_ib_create_ah, .create_cq = mana_ib_create_cq, .create_qp = mana_ib_create_qp, .create_rwq_ind_table = mana_ib_create_rwq_ind_table, @@ -27,6 +28,7 @@ static const struct ib_device_ops mana_ib_dev_ops = { .dealloc_ucontext = mana_ib_dealloc_ucontext, .del_gid = mana_ib_gd_del_gid, .dereg_mr = mana_ib_dereg_mr, + .destroy_ah = mana_ib_destroy_ah, .destroy_cq = mana_ib_destroy_cq, .destroy_qp = mana_ib_destroy_qp, .destroy_rwq_ind_table = mana_ib_destroy_rwq_ind_table, @@ -44,6 +46,7 @@ static const struct ib_device_ops mana_ib_dev_ops = { .query_port = mana_ib_query_port, .reg_user_mr = mana_ib_reg_user_mr, + INIT_RDMA_OBJ_SIZE(ib_ah, mana_ib_ah, ibah), INIT_RDMA_OBJ_SIZE(ib_cq, mana_ib_cq, ibcq), INIT_RDMA_OBJ_SIZE(ib_pd, mana_ib_pd, ibpd), INIT_RDMA_OBJ_SIZE(ib_qp, mana_ib_qp, ibqp), @@ -135,15 +138,22 @@ static int mana_ib_probe(struct auxiliary_device *adev, goto destroy_rnic; } + dev->av_pool = dma_pool_create("mana_ib_av", mdev->gdma_context->dev, + MANA_AV_BUFFER_SIZE, MANA_AV_BUFFER_SIZE, 0); + if (!dev->av_pool) + goto destroy_rnic; + ret = ib_register_device(&dev->ib_dev, "mana_%d", mdev->gdma_context->dev); if (ret) - goto destroy_rnic; + goto deallocate_pool; dev_set_drvdata(&adev->dev, dev); return 0; +deallocate_pool: + dma_pool_destroy(dev->av_pool); destroy_rnic: xa_destroy(&dev->qp_table_wq); mana_ib_gd_destroy_rnic_adapter(dev); @@ -161,6 +171,7 @@ static void mana_ib_remove(struct auxiliary_device *adev) 
struct mana_ib_dev *dev = dev_get_drvdata(&adev->dev); ib_unregister_device(&dev->ib_dev); + dma_pool_destroy(dev->av_pool); xa_destroy(&dev->qp_table_wq); mana_ib_gd_destroy_rnic_adapter(dev); mana_ib_destroy_eqs(dev); diff --git a/drivers/infiniband/hw/mana/mana_ib.h b/drivers/infiniband/hw/mana/mana_ib.h index 5e470f15435d..7b079d875e6c 100644 --- a/drivers/infiniband/hw/mana/mana_ib.h +++ b/drivers/infiniband/hw/mana/mana_ib.h @@ -11,6 +11,7 @@ #include #include #include +#include #include @@ -32,6 +33,11 @@ */ #define MANA_CA_ACK_DELAY 16 +/* + * The buffer used for writing AV + */ +#define MANA_AV_BUFFER_SIZE 64 + struct mana_ib_adapter_caps { u32 max_sq_id; u32 max_rq_id; @@ -65,6 +71,7 @@ struct mana_ib_dev { struct gdma_queue **eqs; struct xarray qp_table_wq; struct mana_ib_adapter_caps adapter_caps; + struct dma_pool *av_pool; }; struct mana_ib_wq { @@ -88,6 +95,25 @@ struct mana_ib_pd { u32 tx_vp_offset; }; +struct mana_ib_av { + u8 dest_ip[16]; + u8 dest_mac[ETH_ALEN]; + u16 udp_src_port; + u8 src_ip[16]; + u32 hop_limit : 8; + u32 reserved1 : 12; + u32 dscp : 6; + u32 reserved2 : 5; + u32 is_ipv6 : 1; + u32 reserved3 : 32; +}; + +struct mana_ib_ah { + struct ib_ah ibah; + struct mana_ib_av *av; + dma_addr_t dma_handle; +}; + struct mana_ib_mr { struct ib_mr ibmr; struct ib_umem *umem; @@ -532,4 +558,8 @@ int mana_ib_gd_destroy_rc_qp(struct mana_ib_dev *mdev, struct mana_ib_qp *qp); int mana_ib_gd_create_ud_qp(struct mana_ib_dev *mdev, struct mana_ib_qp *qp, struct ib_qp_init_attr *attr, u32 doorbell, u32 type); int mana_ib_gd_destroy_ud_qp(struct mana_ib_dev *mdev, struct mana_ib_qp *qp); + +int mana_ib_create_ah(struct ib_ah *ibah, struct rdma_ah_init_attr *init_attr, + struct ib_udata *udata); +int mana_ib_destroy_ah(struct ib_ah *ah, u32 flags); #endif -- cgit v1.2.3-59-g8ed1b From 5ec7e1c86c441c46a374577bccd9488abea30037 Mon Sep 17 00:00:00 2001 From: Konstantin Taranov Date: Mon, 20 Jan 2025 09:27:14 -0800 Subject: net/mana: fix warning in the writer of client oob Do not warn on missing pad_data when oob is in sgl. Signed-off-by: Konstantin Taranov Link: https://patch.msgid.link/1737394039-28772-9-git-send-email-kotaranov@linux.microsoft.com Reviewed-by: Shiraz Saleem Reviewed-by: Long Li Signed-off-by: Leon Romanovsky --- drivers/net/ethernet/microsoft/mana/gdma_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/net/ethernet/microsoft/mana/gdma_main.c b/drivers/net/ethernet/microsoft/mana/gdma_main.c index 4e71987bbf2c..4701bcaa7808 100644 --- a/drivers/net/ethernet/microsoft/mana/gdma_main.c +++ b/drivers/net/ethernet/microsoft/mana/gdma_main.c @@ -1042,7 +1042,7 @@ static u32 mana_gd_write_client_oob(const struct gdma_wqe_request *wqe_req, header->inline_oob_size_div4 = client_oob_size / sizeof(u32); if (oob_in_sgl) { - WARN_ON_ONCE(!pad_data || wqe_req->num_sge < 2); + WARN_ON_ONCE(wqe_req->num_sge < 2); header->client_oob_in_sgl = 1; -- cgit v1.2.3-59-g8ed1b From c8017f5b4856d52c490f6517d2df7bd6eed212f3 Mon Sep 17 00:00:00 2001 From: Konstantin Taranov Date: Mon, 20 Jan 2025 09:27:15 -0800 Subject: RDMA/mana_ib: UD/GSI work requests Implement post send and post recv for UD/GSI QPs. Add information about posted requests into shadow queues. 
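A kernel consumer exercises these entry points through the usual posting API; a hedged sketch of a signaled UD send (ah, dma_addr, len, dqpn and qkey are assumed to exist):

	struct ib_sge sge = {
		.addr   = dma_addr,
		.length = len,
		.lkey   = pd->local_dma_lkey,
	};
	struct ib_ud_wr wr = {
		.wr = {
			.wr_id      = 1,
			.opcode     = IB_WR_SEND,
			.send_flags = IB_SEND_SIGNALED,
			.sg_list    = &sge,
			.num_sge    = 1,
		},
		.ah          = ah,
		.remote_qpn  = dqpn,
		.remote_qkey = qkey,
	};
	const struct ib_send_wr *bad_wr;
	int err = ib_post_send(qp, &wr.wr, &bad_wr);

Note that the driver prepends the AV as an extra SGE internally, which is why at most MAX_WR_SGL_NUM caller-supplied SGEs are accepted per WR.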
Co-developed-by: Shiraz Saleem Signed-off-by: Shiraz Saleem Signed-off-by: Konstantin Taranov Link: https://patch.msgid.link/1737394039-28772-10-git-send-email-kotaranov@linux.microsoft.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mana/Makefile | 2 +- drivers/infiniband/hw/mana/device.c | 2 + drivers/infiniband/hw/mana/mana_ib.h | 33 +++++ drivers/infiniband/hw/mana/qp.c | 21 ++- drivers/infiniband/hw/mana/shadow_queue.h | 115 ++++++++++++++++ drivers/infiniband/hw/mana/wr.c | 168 ++++++++++++++++++++++++ drivers/net/ethernet/microsoft/mana/gdma_main.c | 2 + 7 files changed, 341 insertions(+), 2 deletions(-) create mode 100644 drivers/infiniband/hw/mana/shadow_queue.h create mode 100644 drivers/infiniband/hw/mana/wr.c (limited to 'drivers') diff --git a/drivers/infiniband/hw/mana/Makefile b/drivers/infiniband/hw/mana/Makefile index 6e56f778d6db..79426e725122 100644 --- a/drivers/infiniband/hw/mana/Makefile +++ b/drivers/infiniband/hw/mana/Makefile @@ -1,4 +1,4 @@ # SPDX-License-Identifier: GPL-2.0-only obj-$(CONFIG_MANA_INFINIBAND) += mana_ib.o -mana_ib-y := device.o main.o wq.o qp.o cq.o mr.o ah.o +mana_ib-y := device.o main.o wq.o qp.o cq.o mr.o ah.o wr.o diff --git a/drivers/infiniband/hw/mana/device.c b/drivers/infiniband/hw/mana/device.c index df33aab8d990..6fa5d7e2d0ed 100644 --- a/drivers/infiniband/hw/mana/device.c +++ b/drivers/infiniband/hw/mana/device.c @@ -40,6 +40,8 @@ static const struct ib_device_ops mana_ib_dev_ops = { .mmap = mana_ib_mmap, .modify_qp = mana_ib_modify_qp, .modify_wq = mana_ib_modify_wq, + .post_recv = mana_ib_post_recv, + .post_send = mana_ib_post_send, .query_device = mana_ib_query_device, .query_gid = mana_ib_query_gid, .query_pkey = mana_ib_query_pkey, diff --git a/drivers/infiniband/hw/mana/mana_ib.h b/drivers/infiniband/hw/mana/mana_ib.h index 7b079d875e6c..6265c39eeff6 100644 --- a/drivers/infiniband/hw/mana/mana_ib.h +++ b/drivers/infiniband/hw/mana/mana_ib.h @@ -14,6 +14,7 @@ #include #include +#include "shadow_queue.h" #define PAGE_SZ_BM \ (SZ_4K | SZ_8K | SZ_16K | SZ_32K | SZ_64K | SZ_128K | SZ_256K | \ @@ -165,6 +166,9 @@ struct mana_ib_qp { /* The port on the IB device, starting with 1 */ u32 port; + struct shadow_queue shadow_rq; + struct shadow_queue shadow_sq; + refcount_t refcount; struct completion free; }; @@ -404,6 +408,30 @@ struct mana_rnic_set_qp_state_resp { struct gdma_resp_hdr hdr; }; /* HW Data */ +enum WQE_OPCODE_TYPES { + WQE_TYPE_UD_SEND = 0, + WQE_TYPE_UD_RECV = 8, +}; /* HW DATA */ + +struct rdma_send_oob { + u32 wqe_type : 5; + u32 fence : 1; + u32 signaled : 1; + u32 solicited : 1; + u32 psn : 24; + + u32 ssn_or_rqpn : 24; + u32 reserved1 : 8; + union { + struct { + u32 remote_qkey; + u32 immediate; + u32 reserved1; + u32 reserved2; + } ud_send; + }; +}; /* HW DATA */ + static inline struct gdma_context *mdev_to_gc(struct mana_ib_dev *mdev) { return mdev->gdma_dev->gdma_context; @@ -562,4 +590,9 @@ int mana_ib_gd_destroy_ud_qp(struct mana_ib_dev *mdev, struct mana_ib_qp *qp); int mana_ib_create_ah(struct ib_ah *ibah, struct rdma_ah_init_attr *init_attr, struct ib_udata *udata); int mana_ib_destroy_ah(struct ib_ah *ah, u32 flags); + +int mana_ib_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr, + const struct ib_recv_wr **bad_wr); +int mana_ib_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr, + const struct ib_send_wr **bad_wr); #endif diff --git a/drivers/infiniband/hw/mana/qp.c b/drivers/infiniband/hw/mana/qp.c index 192b6eef5bc7..13ee30f63d03 100644 --- a/drivers/infiniband/hw/mana/qp.c 
+++ b/drivers/infiniband/hw/mana/qp.c @@ -562,10 +562,23 @@ static int mana_ib_create_ud_qp(struct ib_qp *ibqp, struct ib_pd *ibpd, } doorbell = gc->mana_ib.doorbell; + err = create_shadow_queue(&qp->shadow_rq, attr->cap.max_recv_wr, + sizeof(struct ud_rq_shadow_wqe)); + if (err) { + ibdev_err(&mdev->ib_dev, "Failed to create shadow rq err %d\n", err); + goto destroy_queues; + } + err = create_shadow_queue(&qp->shadow_sq, attr->cap.max_send_wr, + sizeof(struct ud_sq_shadow_wqe)); + if (err) { + ibdev_err(&mdev->ib_dev, "Failed to create shadow sq err %d\n", err); + goto destroy_shadow_queues; + } + err = mana_ib_gd_create_ud_qp(mdev, qp, attr, doorbell, attr->qp_type); if (err) { ibdev_err(&mdev->ib_dev, "Failed to create ud qp %d\n", err); - goto destroy_queues; + goto destroy_shadow_queues; } qp->ibqp.qp_num = qp->ud_qp.queues[MANA_UD_RECV_QUEUE].id; qp->port = attr->port_num; @@ -575,6 +588,9 @@ static int mana_ib_create_ud_qp(struct ib_qp *ibqp, struct ib_pd *ibpd, return 0; +destroy_shadow_queues: + destroy_shadow_queue(&qp->shadow_rq); + destroy_shadow_queue(&qp->shadow_sq); destroy_queues: while (i-- > 0) mana_ib_destroy_queue(mdev, &qp->ud_qp.queues[i]); @@ -754,6 +770,9 @@ static int mana_ib_destroy_ud_qp(struct mana_ib_qp *qp, struct ib_udata *udata) container_of(qp->ibqp.device, struct mana_ib_dev, ib_dev); int i; + destroy_shadow_queue(&qp->shadow_rq); + destroy_shadow_queue(&qp->shadow_sq); + /* Ignore return code as there is not much we can do about it. * The error message is printed inside. */ diff --git a/drivers/infiniband/hw/mana/shadow_queue.h b/drivers/infiniband/hw/mana/shadow_queue.h new file mode 100644 index 000000000000..d8bfb4c712d5 --- /dev/null +++ b/drivers/infiniband/hw/mana/shadow_queue.h @@ -0,0 +1,115 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* + * Copyright (c) 2024, Microsoft Corporation. All rights reserved. 
+ */ + +#ifndef _MANA_SHADOW_QUEUE_H_ +#define _MANA_SHADOW_QUEUE_H_ + +struct shadow_wqe_header { + u16 opcode; + u16 error_code; + u32 posted_wqe_size; + u64 wr_id; +}; + +struct ud_rq_shadow_wqe { + struct shadow_wqe_header header; + u32 byte_len; + u32 src_qpn; +}; + +struct ud_sq_shadow_wqe { + struct shadow_wqe_header header; +}; + +struct shadow_queue { + /* Unmasked producer index, Incremented on wqe posting */ + u64 prod_idx; + /* Unmasked consumer index, Incremented on cq polling */ + u64 cons_idx; + /* Unmasked index of next-to-complete (from HW) shadow WQE */ + u64 next_to_complete_idx; + /* queue size in wqes */ + u32 length; + /* distance between elements in bytes */ + u32 stride; + /* ring buffer holding wqes */ + void *buffer; +}; + +static inline int create_shadow_queue(struct shadow_queue *queue, uint32_t length, uint32_t stride) +{ + queue->buffer = kvmalloc(length * stride, GFP_KERNEL); + if (!queue->buffer) + return -ENOMEM; + + queue->length = length; + queue->stride = stride; + + return 0; +} + +static inline void destroy_shadow_queue(struct shadow_queue *queue) +{ + kvfree(queue->buffer); +} + +static inline bool shadow_queue_full(struct shadow_queue *queue) +{ + return (queue->prod_idx - queue->cons_idx) >= queue->length; +} + +static inline bool shadow_queue_empty(struct shadow_queue *queue) +{ + return queue->prod_idx == queue->cons_idx; +} + +static inline void * +shadow_queue_get_element(const struct shadow_queue *queue, u64 unmasked_index) +{ + u32 index = unmasked_index % queue->length; + + return ((u8 *)queue->buffer + index * queue->stride); +} + +static inline void * +shadow_queue_producer_entry(struct shadow_queue *queue) +{ + return shadow_queue_get_element(queue, queue->prod_idx); +} + +static inline void * +shadow_queue_get_next_to_consume(const struct shadow_queue *queue) +{ + if (queue->cons_idx == queue->next_to_complete_idx) + return NULL; + + return shadow_queue_get_element(queue, queue->cons_idx); +} + +static inline void * +shadow_queue_get_next_to_complete(struct shadow_queue *queue) +{ + if (queue->next_to_complete_idx == queue->prod_idx) + return NULL; + + return shadow_queue_get_element(queue, queue->next_to_complete_idx); +} + +static inline void shadow_queue_advance_producer(struct shadow_queue *queue) +{ + queue->prod_idx++; +} + +static inline void shadow_queue_advance_consumer(struct shadow_queue *queue) +{ + queue->cons_idx++; +} + +static inline void shadow_queue_advance_next_to_complete(struct shadow_queue *queue) +{ + queue->next_to_complete_idx++; +} + +#endif diff --git a/drivers/infiniband/hw/mana/wr.c b/drivers/infiniband/hw/mana/wr.c new file mode 100644 index 000000000000..1813567d3b16 --- /dev/null +++ b/drivers/infiniband/hw/mana/wr.c @@ -0,0 +1,168 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2024, Microsoft Corporation. All rights reserved. 
+ */ + +#include "mana_ib.h" + +#define MAX_WR_SGL_NUM (2) + +static int mana_ib_post_recv_ud(struct mana_ib_qp *qp, const struct ib_recv_wr *wr) +{ + struct mana_ib_dev *mdev = container_of(qp->ibqp.device, struct mana_ib_dev, ib_dev); + struct gdma_queue *queue = qp->ud_qp.queues[MANA_UD_RECV_QUEUE].kmem; + struct gdma_posted_wqe_info wqe_info = {0}; + struct gdma_sge gdma_sgl[MAX_WR_SGL_NUM]; + struct gdma_wqe_request wqe_req = {0}; + struct ud_rq_shadow_wqe *shadow_wqe; + int err, i; + + if (shadow_queue_full(&qp->shadow_rq)) + return -EINVAL; + + if (wr->num_sge > MAX_WR_SGL_NUM) + return -EINVAL; + + for (i = 0; i < wr->num_sge; ++i) { + gdma_sgl[i].address = wr->sg_list[i].addr; + gdma_sgl[i].mem_key = wr->sg_list[i].lkey; + gdma_sgl[i].size = wr->sg_list[i].length; + } + wqe_req.num_sge = wr->num_sge; + wqe_req.sgl = gdma_sgl; + + err = mana_gd_post_work_request(queue, &wqe_req, &wqe_info); + if (err) + return err; + + shadow_wqe = shadow_queue_producer_entry(&qp->shadow_rq); + memset(shadow_wqe, 0, sizeof(*shadow_wqe)); + shadow_wqe->header.opcode = IB_WC_RECV; + shadow_wqe->header.wr_id = wr->wr_id; + shadow_wqe->header.posted_wqe_size = wqe_info.wqe_size_in_bu; + shadow_queue_advance_producer(&qp->shadow_rq); + + mana_gd_wq_ring_doorbell(mdev_to_gc(mdev), queue); + return 0; +} + +int mana_ib_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr, + const struct ib_recv_wr **bad_wr) +{ + struct mana_ib_qp *qp = container_of(ibqp, struct mana_ib_qp, ibqp); + int err = 0; + + for (; wr; wr = wr->next) { + switch (ibqp->qp_type) { + case IB_QPT_UD: + case IB_QPT_GSI: + err = mana_ib_post_recv_ud(qp, wr); + if (unlikely(err)) { + *bad_wr = wr; + return err; + } + break; + default: + ibdev_dbg(ibqp->device, "Posting recv wr on qp type %u is not supported\n", + ibqp->qp_type); + return -EINVAL; + } + } + + return err; +} + +static int mana_ib_post_send_ud(struct mana_ib_qp *qp, const struct ib_ud_wr *wr) +{ + struct mana_ib_dev *mdev = container_of(qp->ibqp.device, struct mana_ib_dev, ib_dev); + struct mana_ib_ah *ah = container_of(wr->ah, struct mana_ib_ah, ibah); + struct net_device *ndev = mana_ib_get_netdev(&mdev->ib_dev, qp->port); + struct gdma_queue *queue = qp->ud_qp.queues[MANA_UD_SEND_QUEUE].kmem; + struct gdma_sge gdma_sgl[MAX_WR_SGL_NUM + 1]; + struct gdma_posted_wqe_info wqe_info = {0}; + struct gdma_wqe_request wqe_req = {0}; + struct rdma_send_oob send_oob = {0}; + struct ud_sq_shadow_wqe *shadow_wqe; + int err, i; + + if (!ndev) { + ibdev_dbg(&mdev->ib_dev, "Invalid port %u in QP %u\n", + qp->port, qp->ibqp.qp_num); + return -EINVAL; + } + + if (wr->wr.opcode != IB_WR_SEND) + return -EINVAL; + + if (shadow_queue_full(&qp->shadow_sq)) + return -EINVAL; + + if (wr->wr.num_sge > MAX_WR_SGL_NUM) + return -EINVAL; + + gdma_sgl[0].address = ah->dma_handle; + gdma_sgl[0].mem_key = qp->ibqp.pd->local_dma_lkey; + gdma_sgl[0].size = sizeof(struct mana_ib_av); + for (i = 0; i < wr->wr.num_sge; ++i) { + gdma_sgl[i + 1].address = wr->wr.sg_list[i].addr; + gdma_sgl[i + 1].mem_key = wr->wr.sg_list[i].lkey; + gdma_sgl[i + 1].size = wr->wr.sg_list[i].length; + } + + wqe_req.num_sge = wr->wr.num_sge + 1; + wqe_req.sgl = gdma_sgl; + wqe_req.inline_oob_size = sizeof(struct rdma_send_oob); + wqe_req.inline_oob_data = &send_oob; + wqe_req.flags = GDMA_WR_OOB_IN_SGL; + wqe_req.client_data_unit = ib_mtu_enum_to_int(ib_mtu_int_to_enum(ndev->mtu)); + + send_oob.wqe_type = WQE_TYPE_UD_SEND; + send_oob.fence = !!(wr->wr.send_flags & IB_SEND_FENCE); + send_oob.signaled = !!(wr->wr.send_flags & 
IB_SEND_SIGNALED); + send_oob.solicited = !!(wr->wr.send_flags & IB_SEND_SOLICITED); + send_oob.psn = qp->ud_qp.sq_psn; + send_oob.ssn_or_rqpn = wr->remote_qpn; + send_oob.ud_send.remote_qkey = + qp->ibqp.qp_type == IB_QPT_GSI ? IB_QP1_QKEY : wr->remote_qkey; + + err = mana_gd_post_work_request(queue, &wqe_req, &wqe_info); + if (err) + return err; + + qp->ud_qp.sq_psn++; + shadow_wqe = shadow_queue_producer_entry(&qp->shadow_sq); + memset(shadow_wqe, 0, sizeof(*shadow_wqe)); + shadow_wqe->header.opcode = IB_WC_SEND; + shadow_wqe->header.wr_id = wr->wr.wr_id; + shadow_wqe->header.posted_wqe_size = wqe_info.wqe_size_in_bu; + shadow_queue_advance_producer(&qp->shadow_sq); + + mana_gd_wq_ring_doorbell(mdev_to_gc(mdev), queue); + return 0; +} + +int mana_ib_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr, + const struct ib_send_wr **bad_wr) +{ + int err; + struct mana_ib_qp *qp = container_of(ibqp, struct mana_ib_qp, ibqp); + + for (; wr; wr = wr->next) { + switch (ibqp->qp_type) { + case IB_QPT_UD: + case IB_QPT_GSI: + err = mana_ib_post_send_ud(qp, ud_wr(wr)); + if (unlikely(err)) { + *bad_wr = wr; + return err; + } + break; + default: + ibdev_dbg(ibqp->device, "Posting send wr on qp type %u is not supported\n", + ibqp->qp_type); + return -EINVAL; + } + } + + return err; +} diff --git a/drivers/net/ethernet/microsoft/mana/gdma_main.c b/drivers/net/ethernet/microsoft/mana/gdma_main.c index 4701bcaa7808..a67d4d16c6a7 100644 --- a/drivers/net/ethernet/microsoft/mana/gdma_main.c +++ b/drivers/net/ethernet/microsoft/mana/gdma_main.c @@ -331,6 +331,7 @@ void mana_gd_wq_ring_doorbell(struct gdma_context *gc, struct gdma_queue *queue) mana_gd_ring_doorbell(gc, queue->gdma_dev->doorbell, queue->type, queue->id, queue->head * GDMA_WQE_BU_SIZE, 0); } +EXPORT_SYMBOL_NS(mana_gd_wq_ring_doorbell, "NET_MANA"); void mana_gd_ring_cq(struct gdma_queue *cq, u8 arm_bit) { @@ -1149,6 +1150,7 @@ int mana_gd_post_work_request(struct gdma_queue *wq, return 0; } +EXPORT_SYMBOL_NS(mana_gd_post_work_request, "NET_MANA"); int mana_gd_post_and_ring(struct gdma_queue *queue, const struct gdma_wqe_request *wqe_req, -- cgit v1.2.3-59-g8ed1b From 40ebdacb4e433f344437b3a499d3bb5175775f2d Mon Sep 17 00:00:00 2001 From: Konstantin Taranov Date: Mon, 20 Jan 2025 09:27:16 -0800 Subject: RDMA/mana_ib: implement req_notify_cq Arm a CQ when req_notify_cq is called. 
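The usual consumer pattern arms the CQ and then re-polls to close the race with completions that arrived before arming (a sketch; handle_wc() is a hypothetical handler):

	struct ib_wc wc;

	ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);	/* lands in mana_ib_arm_cq() */
	while (ib_poll_cq(cq, 1, &wc) > 0)
		handle_wc(&wc);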
Signed-off-by: Konstantin Taranov Link: https://patch.msgid.link/1737394039-28772-11-git-send-email-kotaranov@linux.microsoft.com Reviewed-by: Shiraz Saleem Reviewed-by: Long Li Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mana/cq.c | 12 ++++++++++++ drivers/infiniband/hw/mana/device.c | 1 + drivers/infiniband/hw/mana/mana_ib.h | 2 ++ drivers/net/ethernet/microsoft/mana/gdma_main.c | 1 + 4 files changed, 16 insertions(+) (limited to 'drivers') diff --git a/drivers/infiniband/hw/mana/cq.c b/drivers/infiniband/hw/mana/cq.c index d26d82d9f8ce..82f14623f83c 100644 --- a/drivers/infiniband/hw/mana/cq.c +++ b/drivers/infiniband/hw/mana/cq.c @@ -168,3 +168,15 @@ void mana_ib_remove_cq_cb(struct mana_ib_dev *mdev, struct mana_ib_cq *cq) kfree(gc->cq_table[cq->queue.id]); gc->cq_table[cq->queue.id] = NULL; } + +int mana_ib_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags) +{ + struct mana_ib_cq *cq = container_of(ibcq, struct mana_ib_cq, ibcq); + struct gdma_queue *gdma_cq = cq->queue.kmem; + + if (!gdma_cq) + return -EINVAL; + + mana_gd_ring_cq(gdma_cq, SET_ARM_BIT); + return 0; +} diff --git a/drivers/infiniband/hw/mana/device.c b/drivers/infiniband/hw/mana/device.c index 6fa5d7e2d0ed..637bc9833325 100644 --- a/drivers/infiniband/hw/mana/device.c +++ b/drivers/infiniband/hw/mana/device.c @@ -47,6 +47,7 @@ static const struct ib_device_ops mana_ib_dev_ops = { .query_pkey = mana_ib_query_pkey, .query_port = mana_ib_query_port, .reg_user_mr = mana_ib_reg_user_mr, + .req_notify_cq = mana_ib_arm_cq, INIT_RDMA_OBJ_SIZE(ib_ah, mana_ib_ah, ibah), INIT_RDMA_OBJ_SIZE(ib_cq, mana_ib_cq, ibcq), diff --git a/drivers/infiniband/hw/mana/mana_ib.h b/drivers/infiniband/hw/mana/mana_ib.h index 6265c39eeff6..bd34ad608a37 100644 --- a/drivers/infiniband/hw/mana/mana_ib.h +++ b/drivers/infiniband/hw/mana/mana_ib.h @@ -595,4 +595,6 @@ int mana_ib_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr, const struct ib_recv_wr **bad_wr); int mana_ib_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr, const struct ib_send_wr **bad_wr); + +int mana_ib_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags); #endif diff --git a/drivers/net/ethernet/microsoft/mana/gdma_main.c b/drivers/net/ethernet/microsoft/mana/gdma_main.c index a67d4d16c6a7..e032c9f9f7c2 100644 --- a/drivers/net/ethernet/microsoft/mana/gdma_main.c +++ b/drivers/net/ethernet/microsoft/mana/gdma_main.c @@ -344,6 +344,7 @@ void mana_gd_ring_cq(struct gdma_queue *cq, u8 arm_bit) mana_gd_ring_doorbell(gc, cq->gdma_dev->doorbell, cq->type, cq->id, head, arm_bit); } +EXPORT_SYMBOL_NS(mana_gd_ring_cq, "NET_MANA"); static void mana_gd_process_eqe(struct gdma_queue *eq) { -- cgit v1.2.3-59-g8ed1b From 8001e9257eca23264550ff9e34598ee43a80f0f9 Mon Sep 17 00:00:00 2001 From: Konstantin Taranov Date: Mon, 20 Jan 2025 09:27:17 -0800 Subject: RDMA/mana_ib: extend mana QP table Enable mana QP table to store UD/GSI QPs. For send queues, set the most significant bit to one, as send and receive WQs can have the same ID in mana. 
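Concretely, with MANA_SENDQ_MASK being BIT(31), a send WQ and a receive WQ that both have hardware ID 5 map to distinct xarray keys (illustrative values):

	u32 qids = 5 | MANA_SENDQ_MASK;	/* send-queue key: 0x80000005 */
	u32 qidr = 5;			/* recv-queue key: 0x00000005 */

This relies on hardware queue IDs not using the top bit, which the chosen mask implies.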
Signed-off-by: Konstantin Taranov Link: https://patch.msgid.link/1737394039-28772-12-git-send-email-kotaranov@linux.microsoft.com Reviewed-by: Shiraz Saleem Reviewed-by: Long Li Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mana/main.c | 2 +- drivers/infiniband/hw/mana/mana_ib.h | 8 +++- drivers/infiniband/hw/mana/qp.c | 78 ++++++++++++++++++++++++++++++++++-- 3 files changed, 83 insertions(+), 5 deletions(-) (limited to 'drivers') diff --git a/drivers/infiniband/hw/mana/main.c b/drivers/infiniband/hw/mana/main.c index b0c55cb80a88..114e39109511 100644 --- a/drivers/infiniband/hw/mana/main.c +++ b/drivers/infiniband/hw/mana/main.c @@ -704,7 +704,7 @@ mana_ib_event_handler(void *ctx, struct gdma_queue *q, struct gdma_event *event) switch (event->type) { case GDMA_EQE_RNIC_QP_FATAL: qpn = event->details[0]; - qp = mana_get_qp_ref(mdev, qpn); + qp = mana_get_qp_ref(mdev, qpn, false); if (!qp) break; if (qp->ibqp.event_handler) { diff --git a/drivers/infiniband/hw/mana/mana_ib.h b/drivers/infiniband/hw/mana/mana_ib.h index bd34ad608a37..5e4ca55b67b4 100644 --- a/drivers/infiniband/hw/mana/mana_ib.h +++ b/drivers/infiniband/hw/mana/mana_ib.h @@ -23,6 +23,9 @@ /* MANA doesn't have any limit for MR size */ #define MANA_IB_MAX_MR_SIZE U64_MAX +/* Send queue ID mask */ +#define MANA_SENDQ_MASK BIT(31) + /* * The hardware limit of number of MRs is greater than maximum number of MRs * that can possibly represent in 24 bits @@ -438,11 +441,14 @@ static inline struct gdma_context *mdev_to_gc(struct mana_ib_dev *mdev) } static inline struct mana_ib_qp *mana_get_qp_ref(struct mana_ib_dev *mdev, - uint32_t qid) + u32 qid, bool is_sq) { struct mana_ib_qp *qp; unsigned long flag; + if (is_sq) + qid |= MANA_SENDQ_MASK; + xa_lock_irqsave(&mdev->qp_table_wq, flag); qp = xa_load(&mdev->qp_table_wq, qid); if (qp) diff --git a/drivers/infiniband/hw/mana/qp.c b/drivers/infiniband/hw/mana/qp.c index 13ee30f63d03..6bbfd5b90839 100644 --- a/drivers/infiniband/hw/mana/qp.c +++ b/drivers/infiniband/hw/mana/qp.c @@ -444,18 +444,82 @@ static enum gdma_queue_type mana_ib_queue_type(struct ib_qp_init_attr *attr, u32 return type; } +static int mana_table_store_rc_qp(struct mana_ib_dev *mdev, struct mana_ib_qp *qp) +{ + return xa_insert_irq(&mdev->qp_table_wq, qp->ibqp.qp_num, qp, + GFP_KERNEL); +} + +static void mana_table_remove_rc_qp(struct mana_ib_dev *mdev, struct mana_ib_qp *qp) +{ + xa_erase_irq(&mdev->qp_table_wq, qp->ibqp.qp_num); +} + +static int mana_table_store_ud_qp(struct mana_ib_dev *mdev, struct mana_ib_qp *qp) +{ + u32 qids = qp->ud_qp.queues[MANA_UD_SEND_QUEUE].id | MANA_SENDQ_MASK; + u32 qidr = qp->ud_qp.queues[MANA_UD_RECV_QUEUE].id; + int err; + + err = xa_insert_irq(&mdev->qp_table_wq, qids, qp, GFP_KERNEL); + if (err) + return err; + + err = xa_insert_irq(&mdev->qp_table_wq, qidr, qp, GFP_KERNEL); + if (err) + goto remove_sq; + + return 0; + +remove_sq: + xa_erase_irq(&mdev->qp_table_wq, qids); + return err; +} + +static void mana_table_remove_ud_qp(struct mana_ib_dev *mdev, struct mana_ib_qp *qp) +{ + u32 qids = qp->ud_qp.queues[MANA_UD_SEND_QUEUE].id | MANA_SENDQ_MASK; + u32 qidr = qp->ud_qp.queues[MANA_UD_RECV_QUEUE].id; + + xa_erase_irq(&mdev->qp_table_wq, qids); + xa_erase_irq(&mdev->qp_table_wq, qidr); +} + static int mana_table_store_qp(struct mana_ib_dev *mdev, struct mana_ib_qp *qp) { refcount_set(&qp->refcount, 1); init_completion(&qp->free); - return xa_insert_irq(&mdev->qp_table_wq, qp->ibqp.qp_num, qp, - GFP_KERNEL); + + switch (qp->ibqp.qp_type) { + case IB_QPT_RC: + return 
mana_table_store_rc_qp(mdev, qp); + case IB_QPT_UD: + case IB_QPT_GSI: + return mana_table_store_ud_qp(mdev, qp); + default: + ibdev_dbg(&mdev->ib_dev, "Unknown QP type for storing in mana table, %d\n", + qp->ibqp.qp_type); + } + + return -EINVAL; } static void mana_table_remove_qp(struct mana_ib_dev *mdev, struct mana_ib_qp *qp) { - xa_erase_irq(&mdev->qp_table_wq, qp->ibqp.qp_num); + switch (qp->ibqp.qp_type) { + case IB_QPT_RC: + mana_table_remove_rc_qp(mdev, qp); + break; + case IB_QPT_UD: + case IB_QPT_GSI: + mana_table_remove_ud_qp(mdev, qp); + break; + default: + ibdev_dbg(&mdev->ib_dev, "Unknown QP type for removing from mana table, %d\n", + qp->ibqp.qp_type); + return; + } mana_put_qp_ref(qp); wait_for_completion(&qp->free); } @@ -586,8 +650,14 @@ static int mana_ib_create_ud_qp(struct ib_qp *ibqp, struct ib_pd *ibpd, for (i = 0; i < MANA_UD_QUEUE_TYPE_MAX; ++i) qp->ud_qp.queues[i].kmem->id = qp->ud_qp.queues[i].id; + err = mana_table_store_qp(mdev, qp); + if (err) + goto destroy_qp; + return 0; +destroy_qp: + mana_ib_gd_destroy_ud_qp(mdev, qp); destroy_shadow_queues: destroy_shadow_queue(&qp->shadow_rq); destroy_shadow_queue(&qp->shadow_sq); @@ -770,6 +840,8 @@ static int mana_ib_destroy_ud_qp(struct mana_ib_qp *qp, struct ib_udata *udata) container_of(qp->ibqp.device, struct mana_ib_dev, ib_dev); int i; + mana_table_remove_qp(mdev, qp); + destroy_shadow_queue(&qp->shadow_rq); destroy_shadow_queue(&qp->shadow_sq); -- cgit v1.2.3-59-g8ed1b From cfef4525924e394005a47b02a78cde0f0318c74d Mon Sep 17 00:00:00 2001 From: Konstantin Taranov Date: Mon, 20 Jan 2025 09:27:18 -0800 Subject: RDMA/mana_ib: polling of CQs for GSI/UD Add polling for the kernel CQs. Process completion events for UD/GSI QPs. Signed-off-by: Konstantin Taranov Link: https://patch.msgid.link/1737394039-28772-13-git-send-email-kotaranov@linux.microsoft.com Reviewed-by: Shiraz Saleem Reviewed-by: Long Li Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mana/cq.c | 135 ++++++++++++++++++++++++ drivers/infiniband/hw/mana/device.c | 1 + drivers/infiniband/hw/mana/mana_ib.h | 32 ++++++ drivers/infiniband/hw/mana/qp.c | 33 ++++++ drivers/net/ethernet/microsoft/mana/gdma_main.c | 1 + 5 files changed, 202 insertions(+) (limited to 'drivers') diff --git a/drivers/infiniband/hw/mana/cq.c b/drivers/infiniband/hw/mana/cq.c index 82f14623f83c..5c325ef4ac56 100644 --- a/drivers/infiniband/hw/mana/cq.c +++ b/drivers/infiniband/hw/mana/cq.c @@ -90,6 +90,10 @@ int mana_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, } } + spin_lock_init(&cq->cq_lock); + INIT_LIST_HEAD(&cq->list_send_qp); + INIT_LIST_HEAD(&cq->list_recv_qp); + return 0; err_remove_cq_cb: @@ -180,3 +184,134 @@ int mana_ib_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags) mana_gd_ring_cq(gdma_cq, SET_ARM_BIT); return 0; } + +static inline void handle_ud_sq_cqe(struct mana_ib_qp *qp, struct gdma_comp *cqe) +{ + struct mana_rdma_cqe *rdma_cqe = (struct mana_rdma_cqe *)cqe->cqe_data; + struct gdma_queue *wq = qp->ud_qp.queues[MANA_UD_SEND_QUEUE].kmem; + struct ud_sq_shadow_wqe *shadow_wqe; + + shadow_wqe = shadow_queue_get_next_to_complete(&qp->shadow_sq); + if (!shadow_wqe) + return; + + shadow_wqe->header.error_code = rdma_cqe->ud_send.vendor_error; + + wq->tail += shadow_wqe->header.posted_wqe_size; + shadow_queue_advance_next_to_complete(&qp->shadow_sq); +} + +static inline void handle_ud_rq_cqe(struct mana_ib_qp *qp, struct gdma_comp *cqe) +{ + struct mana_rdma_cqe *rdma_cqe = (struct mana_rdma_cqe *)cqe->cqe_data; + struct 
gdma_queue *wq = qp->ud_qp.queues[MANA_UD_RECV_QUEUE].kmem; + struct ud_rq_shadow_wqe *shadow_wqe; + + shadow_wqe = shadow_queue_get_next_to_complete(&qp->shadow_rq); + if (!shadow_wqe) + return; + + shadow_wqe->byte_len = rdma_cqe->ud_recv.msg_len; + shadow_wqe->src_qpn = rdma_cqe->ud_recv.src_qpn; + shadow_wqe->header.error_code = IB_WC_SUCCESS; + + wq->tail += shadow_wqe->header.posted_wqe_size; + shadow_queue_advance_next_to_complete(&qp->shadow_rq); +} + +static void mana_handle_cqe(struct mana_ib_dev *mdev, struct gdma_comp *cqe) +{ + struct mana_ib_qp *qp = mana_get_qp_ref(mdev, cqe->wq_num, cqe->is_sq); + + if (!qp) + return; + + if (qp->ibqp.qp_type == IB_QPT_GSI || qp->ibqp.qp_type == IB_QPT_UD) { + if (cqe->is_sq) + handle_ud_sq_cqe(qp, cqe); + else + handle_ud_rq_cqe(qp, cqe); + } + + mana_put_qp_ref(qp); +} + +static void fill_verbs_from_shadow_wqe(struct mana_ib_qp *qp, struct ib_wc *wc, + const struct shadow_wqe_header *shadow_wqe) +{ + const struct ud_rq_shadow_wqe *ud_wqe = (const struct ud_rq_shadow_wqe *)shadow_wqe; + + wc->wr_id = shadow_wqe->wr_id; + wc->status = shadow_wqe->error_code; + wc->opcode = shadow_wqe->opcode; + wc->vendor_err = shadow_wqe->error_code; + wc->wc_flags = 0; + wc->qp = &qp->ibqp; + wc->pkey_index = 0; + + if (shadow_wqe->opcode == IB_WC_RECV) { + wc->byte_len = ud_wqe->byte_len; + wc->src_qp = ud_wqe->src_qpn; + wc->wc_flags |= IB_WC_GRH; + } +} + +static int mana_process_completions(struct mana_ib_cq *cq, int nwc, struct ib_wc *wc) +{ + struct shadow_wqe_header *shadow_wqe; + struct mana_ib_qp *qp; + int wc_index = 0; + + /* process send shadow queue completions */ + list_for_each_entry(qp, &cq->list_send_qp, cq_send_list) { + while ((shadow_wqe = shadow_queue_get_next_to_consume(&qp->shadow_sq)) + != NULL) { + if (wc_index >= nwc) + goto out; + + fill_verbs_from_shadow_wqe(qp, &wc[wc_index], shadow_wqe); + shadow_queue_advance_consumer(&qp->shadow_sq); + wc_index++; + } + } + + /* process recv shadow queue completions */ + list_for_each_entry(qp, &cq->list_recv_qp, cq_recv_list) { + while ((shadow_wqe = shadow_queue_get_next_to_consume(&qp->shadow_rq)) + != NULL) { + if (wc_index >= nwc) + goto out; + + fill_verbs_from_shadow_wqe(qp, &wc[wc_index], shadow_wqe); + shadow_queue_advance_consumer(&qp->shadow_rq); + wc_index++; + } + } + +out: + return wc_index; +} + +int mana_ib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc) +{ + struct mana_ib_cq *cq = container_of(ibcq, struct mana_ib_cq, ibcq); + struct mana_ib_dev *mdev = container_of(ibcq->device, struct mana_ib_dev, ib_dev); + struct gdma_queue *queue = cq->queue.kmem; + struct gdma_comp gdma_cqe; + unsigned long flags; + int num_polled = 0; + int comp_read, i; + + spin_lock_irqsave(&cq->cq_lock, flags); + for (i = 0; i < num_entries; i++) { + comp_read = mana_gd_poll_cq(queue, &gdma_cqe, 1); + if (comp_read < 1) + break; + mana_handle_cqe(mdev, &gdma_cqe); + } + + num_polled = mana_process_completions(cq, num_entries, wc); + spin_unlock_irqrestore(&cq->cq_lock, flags); + + return num_polled; +} diff --git a/drivers/infiniband/hw/mana/device.c b/drivers/infiniband/hw/mana/device.c index 637bc9833325..0a7553f819ba 100644 --- a/drivers/infiniband/hw/mana/device.c +++ b/drivers/infiniband/hw/mana/device.c @@ -40,6 +40,7 @@ static const struct ib_device_ops mana_ib_dev_ops = { .mmap = mana_ib_mmap, .modify_qp = mana_ib_modify_qp, .modify_wq = mana_ib_modify_wq, + .poll_cq = mana_ib_poll_cq, .post_recv = mana_ib_post_recv, .post_send = mana_ib_post_send, .query_device = 
mana_ib_query_device, diff --git a/drivers/infiniband/hw/mana/mana_ib.h b/drivers/infiniband/hw/mana/mana_ib.h index 5e4ca55b67b4..cd771af22b80 100644 --- a/drivers/infiniband/hw/mana/mana_ib.h +++ b/drivers/infiniband/hw/mana/mana_ib.h @@ -127,6 +127,10 @@ struct mana_ib_mr { struct mana_ib_cq { struct ib_cq ibcq; struct mana_ib_queue queue; + /* protects CQ polling */ + spinlock_t cq_lock; + struct list_head list_send_qp; + struct list_head list_recv_qp; int cqe; u32 comp_vector; mana_handle_t cq_handle; @@ -169,6 +173,8 @@ struct mana_ib_qp { /* The port on the IB device, starting with 1 */ u32 port; + struct list_head cq_send_list; + struct list_head cq_recv_list; struct shadow_queue shadow_rq; struct shadow_queue shadow_sq; @@ -435,6 +441,31 @@ struct rdma_send_oob { }; }; /* HW DATA */ +struct mana_rdma_cqe { + union { + struct { + u8 cqe_type; + u8 data[GDMA_COMP_DATA_SIZE - 1]; + }; + struct { + u32 cqe_type : 8; + u32 vendor_error : 9; + u32 reserved1 : 15; + u32 sge_offset : 5; + u32 tx_wqe_offset : 27; + } ud_send; + struct { + u32 cqe_type : 8; + u32 reserved1 : 24; + u32 msg_len; + u32 src_qpn : 24; + u32 reserved2 : 8; + u32 imm_data; + u32 rx_wqe_offset; + } ud_recv; + }; +}; /* HW DATA */ + static inline struct gdma_context *mdev_to_gc(struct mana_ib_dev *mdev) { return mdev->gdma_dev->gdma_context; @@ -602,5 +633,6 @@ int mana_ib_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr, int mana_ib_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr, const struct ib_send_wr **bad_wr); +int mana_ib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc); int mana_ib_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags); #endif diff --git a/drivers/infiniband/hw/mana/qp.c b/drivers/infiniband/hw/mana/qp.c index 6bbfd5b90839..c928af58f38b 100644 --- a/drivers/infiniband/hw/mana/qp.c +++ b/drivers/infiniband/hw/mana/qp.c @@ -600,6 +600,36 @@ destroy_queues: return err; } +static void mana_add_qp_to_cqs(struct mana_ib_qp *qp) +{ + struct mana_ib_cq *send_cq = container_of(qp->ibqp.send_cq, struct mana_ib_cq, ibcq); + struct mana_ib_cq *recv_cq = container_of(qp->ibqp.recv_cq, struct mana_ib_cq, ibcq); + unsigned long flags; + + spin_lock_irqsave(&send_cq->cq_lock, flags); + list_add_tail(&qp->cq_send_list, &send_cq->list_send_qp); + spin_unlock_irqrestore(&send_cq->cq_lock, flags); + + spin_lock_irqsave(&recv_cq->cq_lock, flags); + list_add_tail(&qp->cq_recv_list, &recv_cq->list_recv_qp); + spin_unlock_irqrestore(&recv_cq->cq_lock, flags); +} + +static void mana_remove_qp_from_cqs(struct mana_ib_qp *qp) +{ + struct mana_ib_cq *send_cq = container_of(qp->ibqp.send_cq, struct mana_ib_cq, ibcq); + struct mana_ib_cq *recv_cq = container_of(qp->ibqp.recv_cq, struct mana_ib_cq, ibcq); + unsigned long flags; + + spin_lock_irqsave(&send_cq->cq_lock, flags); + list_del(&qp->cq_send_list); + spin_unlock_irqrestore(&send_cq->cq_lock, flags); + + spin_lock_irqsave(&recv_cq->cq_lock, flags); + list_del(&qp->cq_recv_list); + spin_unlock_irqrestore(&recv_cq->cq_lock, flags); +} + static int mana_ib_create_ud_qp(struct ib_qp *ibqp, struct ib_pd *ibpd, struct ib_qp_init_attr *attr, struct ib_udata *udata) { @@ -654,6 +684,8 @@ static int mana_ib_create_ud_qp(struct ib_qp *ibqp, struct ib_pd *ibpd, if (err) goto destroy_qp; + mana_add_qp_to_cqs(qp); + return 0; destroy_qp: @@ -840,6 +872,7 @@ static int mana_ib_destroy_ud_qp(struct mana_ib_qp *qp, struct ib_udata *udata) container_of(qp->ibqp.device, struct mana_ib_dev, ib_dev); int i; + mana_remove_qp_from_cqs(qp); 
mana_table_remove_qp(mdev, qp); destroy_shadow_queue(&qp->shadow_rq); diff --git a/drivers/net/ethernet/microsoft/mana/gdma_main.c b/drivers/net/ethernet/microsoft/mana/gdma_main.c index e032c9f9f7c2..4a2b17f35f82 100644 --- a/drivers/net/ethernet/microsoft/mana/gdma_main.c +++ b/drivers/net/ethernet/microsoft/mana/gdma_main.c @@ -1222,6 +1222,7 @@ int mana_gd_poll_cq(struct gdma_queue *cq, struct gdma_comp *comp, int num_cqe) return cqe_idx; } +EXPORT_SYMBOL_NS(mana_gd_poll_cq, "NET_MANA"); static irqreturn_t mana_gd_intr(int irq, void *arg) { -- cgit v1.2.3-59-g8ed1b
From 6c53bf9cff03504ad43265883ae7881f92e2e3a0 Mon Sep 17 00:00:00 2001 From: Konstantin Taranov Date: Mon, 20 Jan 2025 09:27:19 -0800 Subject: RDMA/mana_ib: indicate CM support Set max_mad_size and the IB_PORT_CM_SUP capability to enable the connection manager. Signed-off-by: Konstantin Taranov Link: https://patch.msgid.link/1737394039-28772-14-git-send-email-kotaranov@linux.microsoft.com Reviewed-by: Shiraz Saleem Reviewed-by: Long Li Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mana/main.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'drivers') diff --git a/drivers/infiniband/hw/mana/main.c b/drivers/infiniband/hw/mana/main.c index 114e39109511..ae1fb6983f66 100644 --- a/drivers/infiniband/hw/mana/main.c +++ b/drivers/infiniband/hw/mana/main.c @@ -561,8 +561,10 @@ int mana_ib_get_port_immutable(struct ib_device *ibdev, u32 port_num, immutable->pkey_tbl_len = attr.pkey_tbl_len; immutable->gid_tbl_len = attr.gid_tbl_len; immutable->core_cap_flags = RDMA_CORE_PORT_RAW_PACKET; - if (port_num == 1) + if (port_num == 1) { immutable->core_cap_flags |= RDMA_CORE_PORT_IBA_ROCE_UDP_ENCAP; + immutable->max_mad_size = IB_MGMT_MAD_SIZE; + } return 0; } @@ -621,8 +623,11 @@ int mana_ib_query_port(struct ib_device *ibdev, u32 port, props->active_width = IB_WIDTH_4X; props->active_speed = IB_SPEED_EDR; props->pkey_tbl_len = 1; - if (port == 1) + if (port == 1) { props->gid_tbl_len = 16; + props->port_cap_flags = IB_PORT_CM_SUP; + props->ip_gids = true; + } return 0; } -- cgit v1.2.3-59-g8ed1b
From 656dff55da19eb889f2b1df5a5f2aa1d28f024fd Mon Sep 17 00:00:00 2001 From: Selvin Xavier Date: Sun, 19 Jan 2025 07:45:35 -0800 Subject: RDMA/bnxt_re: Congestion control settings using debugfs hook Implements routines to set and get the different congestion control settings, enabling users to tune them to their network. Currently only the GEN 0 version of the parameters is supported. Reading these files queries the firmware and reports the currently programmed values.
Writing to the files sends commands that update the congestion control settings Signed-off-by: Selvin Xavier Link: https://patch.msgid.link/1737301535-6599-1-git-send-email-selvin.xavier@broadcom.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/bnxt_re/bnxt_re.h | 2 + drivers/infiniband/hw/bnxt_re/debugfs.c | 212 +++++++++++++++++++++++++++++++- drivers/infiniband/hw/bnxt_re/debugfs.h | 15 +++ 3 files changed, 228 insertions(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/infiniband/hw/bnxt_re/bnxt_re.h b/drivers/infiniband/hw/bnxt_re/bnxt_re.h index b91a85a491d0..b33b04e4f8e4 100644 --- a/drivers/infiniband/hw/bnxt_re/bnxt_re.h +++ b/drivers/infiniband/hw/bnxt_re/bnxt_re.h @@ -232,6 +232,8 @@ struct bnxt_re_dev { unsigned long event_bitmap; struct bnxt_qplib_cc_param cc_param; struct workqueue_struct *dcb_wq; + struct dentry *cc_config; + struct bnxt_re_dbg_cc_config_params *cc_config_params; }; #define to_bnxt_re_dev(ptr, member) \ diff --git a/drivers/infiniband/hw/bnxt_re/debugfs.c b/drivers/infiniband/hw/bnxt_re/debugfs.c index 7c47039044ef..f4dd2fb51867 100644 --- a/drivers/infiniband/hw/bnxt_re/debugfs.c +++ b/drivers/infiniband/hw/bnxt_re/debugfs.c @@ -22,6 +22,23 @@ static struct dentry *bnxt_re_debugfs_root; +static const char * const bnxt_re_cc_gen0_name[] = { + "enable_cc", + "run_avg_weight_g", + "num_phase_per_state", + "init_cr", + "init_tr", + "tos_ecn", + "tos_dscp", + "alt_vlan_pcp", + "alt_vlan_dscp", + "rtt", + "cc_mode", + "tcp_cp", + "tx_queue", + "inactivity_cp", +}; + static inline const char *bnxt_re_qp_state_str(u8 state) { switch (state) { @@ -110,19 +127,212 @@ void bnxt_re_debug_rem_qpinfo(struct bnxt_re_dev *rdev, struct bnxt_re_qp *qp) debugfs_remove(qp->dentry); } +static int map_cc_config_offset_gen0_ext0(u32 offset, struct bnxt_qplib_cc_param *ccparam, u32 *val) +{ + u64 map_offset; + + map_offset = BIT(offset); + + switch (map_offset) { + case CMDQ_MODIFY_ROCE_CC_MODIFY_MASK_ENABLE_CC: + *val = ccparam->enable; + break; + case CMDQ_MODIFY_ROCE_CC_MODIFY_MASK_G: + *val = ccparam->g; + break; + case CMDQ_MODIFY_ROCE_CC_MODIFY_MASK_NUMPHASEPERSTATE: + *val = ccparam->nph_per_state; + break; + case CMDQ_MODIFY_ROCE_CC_MODIFY_MASK_INIT_CR: + *val = ccparam->init_cr; + break; + case CMDQ_MODIFY_ROCE_CC_MODIFY_MASK_INIT_TR: + *val = ccparam->init_tr; + break; + case CMDQ_MODIFY_ROCE_CC_MODIFY_MASK_TOS_ECN: + *val = ccparam->tos_ecn; + break; + case CMDQ_MODIFY_ROCE_CC_MODIFY_MASK_TOS_DSCP: + *val = ccparam->tos_dscp; + break; + case CMDQ_MODIFY_ROCE_CC_MODIFY_MASK_ALT_VLAN_PCP: + *val = ccparam->alt_vlan_pcp; + break; + case CMDQ_MODIFY_ROCE_CC_MODIFY_MASK_ALT_TOS_DSCP: + *val = ccparam->alt_tos_dscp; + break; + case CMDQ_MODIFY_ROCE_CC_MODIFY_MASK_RTT: + *val = ccparam->rtt; + break; + case CMDQ_MODIFY_ROCE_CC_MODIFY_MASK_CC_MODE: + *val = ccparam->cc_mode; + break; + case CMDQ_MODIFY_ROCE_CC_MODIFY_MASK_TCP_CP: + *val = ccparam->tcp_cp; + break; + default: + return -EINVAL; + } + + return 0; +} + +static ssize_t bnxt_re_cc_config_get(struct file *filp, char __user *buffer, + size_t usr_buf_len, loff_t *ppos) +{ + struct bnxt_re_cc_param *dbg_cc_param = filp->private_data; + struct bnxt_re_dev *rdev = dbg_cc_param->rdev; + struct bnxt_qplib_cc_param ccparam = {}; + u32 offset = dbg_cc_param->offset; + char buf[16]; + u32 val; + int rc; + + rc = bnxt_qplib_query_cc_param(&rdev->qplib_res, &ccparam); + if (rc) + return rc; + + rc = map_cc_config_offset_gen0_ext0(offset, &ccparam, &val); + if (rc) + return rc; + + rc = snprintf(buf, 
sizeof(buf), "%d\n", val); + if (rc < 0) + return rc; + + return simple_read_from_buffer(buffer, usr_buf_len, ppos, (u8 *)(buf), rc); +} + +static void bnxt_re_fill_gen0_ext0(struct bnxt_qplib_cc_param *ccparam, u32 offset, u32 val) +{ + u32 modify_mask; + + modify_mask = BIT(offset); + + switch (modify_mask) { + case CMDQ_MODIFY_ROCE_CC_MODIFY_MASK_ENABLE_CC: + ccparam->enable = val; + break; + case CMDQ_MODIFY_ROCE_CC_MODIFY_MASK_G: + ccparam->g = val; + break; + case CMDQ_MODIFY_ROCE_CC_MODIFY_MASK_NUMPHASEPERSTATE: + ccparam->nph_per_state = val; + break; + case CMDQ_MODIFY_ROCE_CC_MODIFY_MASK_INIT_CR: + ccparam->init_cr = val; + break; + case CMDQ_MODIFY_ROCE_CC_MODIFY_MASK_INIT_TR: + ccparam->init_tr = val; + break; + case CMDQ_MODIFY_ROCE_CC_MODIFY_MASK_TOS_ECN: + ccparam->tos_ecn = val; + break; + case CMDQ_MODIFY_ROCE_CC_MODIFY_MASK_TOS_DSCP: + ccparam->tos_dscp = val; + break; + case CMDQ_MODIFY_ROCE_CC_MODIFY_MASK_ALT_VLAN_PCP: + ccparam->alt_vlan_pcp = val; + break; + case CMDQ_MODIFY_ROCE_CC_MODIFY_MASK_ALT_TOS_DSCP: + ccparam->alt_tos_dscp = val; + break; + case CMDQ_MODIFY_ROCE_CC_MODIFY_MASK_RTT: + ccparam->rtt = val; + break; + case CMDQ_MODIFY_ROCE_CC_MODIFY_MASK_CC_MODE: + ccparam->cc_mode = val; + break; + case CMDQ_MODIFY_ROCE_CC_MODIFY_MASK_TCP_CP: + ccparam->tcp_cp = val; + break; + case CMDQ_MODIFY_ROCE_CC_MODIFY_MASK_TX_QUEUE: + case CMDQ_MODIFY_ROCE_CC_MODIFY_MASK_INACTIVITY_CP: + break; + case CMDQ_MODIFY_ROCE_CC_MODIFY_MASK_TIME_PER_PHASE: + ccparam->time_pph = val; + break; + case CMDQ_MODIFY_ROCE_CC_MODIFY_MASK_PKTS_PER_PHASE: + ccparam->pkts_pph = val; + break; + } + + ccparam->mask = modify_mask; +} + +static int bnxt_re_configure_cc(struct bnxt_re_dev *rdev, u32 gen_ext, u32 offset, u32 val) +{ + struct bnxt_qplib_cc_param ccparam = { }; + + /* Supporting only Gen 0 now */ + if (gen_ext != CC_CONFIG_GEN0_EXT0) + bnxt_re_fill_gen0_ext0(&ccparam, offset, val); + else + return -EINVAL; + + bnxt_qplib_modify_cc(&rdev->qplib_res, &ccparam); + return 0; +} + +static ssize_t bnxt_re_cc_config_set(struct file *filp, const char __user *buffer, + size_t count, loff_t *ppos) +{ + struct bnxt_re_cc_param *dbg_cc_param = filp->private_data; + struct bnxt_re_dev *rdev = dbg_cc_param->rdev; + u32 offset = dbg_cc_param->offset; + u8 cc_gen = dbg_cc_param->cc_gen; + char buf[16]; + u32 val; + int rc; + + if (copy_from_user(buf, buffer, count)) + return -EFAULT; + + buf[count] = '\0'; + if (kstrtou32(buf, 0, &val)) + return -EINVAL; + + rc = bnxt_re_configure_cc(rdev, cc_gen, offset, val); + return rc ? 
rc : count; +} + +static const struct file_operations bnxt_re_cc_config_ops = { + .owner = THIS_MODULE, + .open = simple_open, + .read = bnxt_re_cc_config_get, + .write = bnxt_re_cc_config_set, +}; + void bnxt_re_debugfs_add_pdev(struct bnxt_re_dev *rdev) { struct pci_dev *pdev = rdev->en_dev->pdev; + struct bnxt_re_dbg_cc_config_params *cc_params; + int i; rdev->dbg_root = debugfs_create_dir(dev_name(&pdev->dev), bnxt_re_debugfs_root); rdev->qp_debugfs = debugfs_create_dir("QPs", rdev->dbg_root); + rdev->cc_config = debugfs_create_dir("cc_config", rdev->dbg_root); + + rdev->cc_config_params = kzalloc(sizeof(*cc_params), GFP_KERNEL); + + for (i = 0; i < BNXT_RE_CC_PARAM_GEN0; i++) { + struct bnxt_re_cc_param *tmp_params = &rdev->cc_config_params->gen0_parms[i]; + + tmp_params->rdev = rdev; + tmp_params->offset = i; + tmp_params->cc_gen = CC_CONFIG_GEN0_EXT0; + tmp_params->dentry = debugfs_create_file(bnxt_re_cc_gen0_name[i], 0400, + rdev->cc_config, tmp_params, + &bnxt_re_cc_config_ops); + } } void bnxt_re_debugfs_rem_pdev(struct bnxt_re_dev *rdev) { debugfs_remove_recursive(rdev->qp_debugfs); - + debugfs_remove_recursive(rdev->cc_config); + kfree(rdev->cc_config_params); debugfs_remove_recursive(rdev->dbg_root); rdev->dbg_root = NULL; } diff --git a/drivers/infiniband/hw/bnxt_re/debugfs.h b/drivers/infiniband/hw/bnxt_re/debugfs.h index cd3be0a9ec7e..8f101df4e838 100644 --- a/drivers/infiniband/hw/bnxt_re/debugfs.h +++ b/drivers/infiniband/hw/bnxt_re/debugfs.h @@ -18,4 +18,19 @@ void bnxt_re_debugfs_rem_pdev(struct bnxt_re_dev *rdev); void bnxt_re_register_debugfs(void); void bnxt_re_unregister_debugfs(void); +#define CC_CONFIG_GEN_EXT(x, y) (((x) << 16) | (y)) +#define CC_CONFIG_GEN0_EXT0 CC_CONFIG_GEN_EXT(0, 0) + +#define BNXT_RE_CC_PARAM_GEN0 14 + +struct bnxt_re_cc_param { + struct bnxt_re_dev *rdev; + struct dentry *dentry; + u32 offset; + u8 cc_gen; +}; + +struct bnxt_re_dbg_cc_config_params { + struct bnxt_re_cc_param gen0_parms[BNXT_RE_CC_PARAM_GEN0]; +}; #endif -- cgit v1.2.3-59-g8ed1b From 5459f6523c1f1a053cdc6411a810061ee717219c Mon Sep 17 00:00:00 2001 From: Maher Sanalla Date: Mon, 3 Feb 2025 14:48:04 +0200 Subject: IB/cache: Add log messages for IB device state changes Enhance visibility into IB device state transitions by adding log messages to the kernel log (dmesg). 
Whenever an IB device changes state, a corresponding message is logged, such as: "mlx5_core 0000:08:00.0 mlx5_0: Port: 1 Link DOWN" "mlx5_core 0000:08:00.0 rdmap8s0f0: Port: 2 Link ACTIVE" Signed-off-by: Maher Sanalla Link: https://patch.msgid.link/2d26ccbd669bad99089fa2aebb5cba4014fc4999.1738586601.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/cache.c | 6 ++++++ include/rdma/ib_verbs.h | 17 +++++++++++++++++ 2 files changed, 23 insertions(+) (limited to 'drivers') diff --git a/drivers/infiniband/core/cache.c b/drivers/infiniband/core/cache.c index f8413f8a9f26..9979a351577f 100644 --- a/drivers/infiniband/core/cache.c +++ b/drivers/infiniband/core/cache.c @@ -1501,6 +1501,12 @@ ib_cache_update(struct ib_device *device, u32 port, bool update_gids, device->port_data[port].cache.pkey = pkey_cache; } device->port_data[port].cache.lmc = tprops->lmc; + + if (device->port_data[port].cache.port_state != IB_PORT_NOP && + device->port_data[port].cache.port_state != tprops->state) + ibdev_info(device, "Port: %d Link %s\n", port, + ib_port_state_to_str(tprops->state)); + device->port_data[port].cache.port_state = tprops->state; device->port_data[port].cache.subnet_prefix = tprops->subnet_prefix; diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 0ad104dae253..b59bf30de430 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -519,6 +519,23 @@ enum ib_port_state { IB_PORT_ACTIVE_DEFER = 5 }; +static inline const char *__attribute_const__ +ib_port_state_to_str(enum ib_port_state state) +{ + const char * const states[] = { + [IB_PORT_NOP] = "NOP", + [IB_PORT_DOWN] = "DOWN", + [IB_PORT_INIT] = "INIT", + [IB_PORT_ARMED] = "ARMED", + [IB_PORT_ACTIVE] = "ACTIVE", + [IB_PORT_ACTIVE_DEFER] = "ACTIVE_DEFER", + }; + + if (state < ARRAY_SIZE(states)) + return states[state]; + return "UNKNOWN"; +} + enum ib_port_phys_state { IB_PORT_PHYS_STATE_SLEEP = 1, IB_PORT_PHYS_STATE_POLLING = 2, -- cgit v1.2.3-59-g8ed1b
From 1fd119c6db838dbed7084ce48f76ad12f5441d59 Mon Sep 17 00:00:00 2001 From: Maher Sanalla Date: Mon, 3 Feb 2025 14:48:05 +0200 Subject: RDMA/core: Use ib_port_state_to_str() for IB state sysfs Refactor the IB state sysfs implementation to replace the local array used for converting IB state to string with the ib_port_state_to_str() function. Signed-off-by: Maher Sanalla Link: https://patch.msgid.link/06affabbbf144f990e64b447918af39483c78c3e.1738586601.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/sysfs.c | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) (limited to 'drivers') diff --git a/drivers/infiniband/core/sysfs.c b/drivers/infiniband/core/sysfs.c index 9f97bef02149..7491328ca5e6 100644 --- a/drivers/infiniband/core/sysfs.c +++ b/drivers/infiniband/core/sysfs.c @@ -216,24 +216,12 @@ static ssize_t state_show(struct ib_device *ibdev, u32 port_num, struct ib_port_attr attr; ssize_t ret; - static const char *state_name[] = { - [IB_PORT_NOP] = "NOP", - [IB_PORT_DOWN] = "DOWN", - [IB_PORT_INIT] = "INIT", - [IB_PORT_ARMED] = "ARMED", - [IB_PORT_ACTIVE] = "ACTIVE", - [IB_PORT_ACTIVE_DEFER] = "ACTIVE_DEFER" - }; - ret = ib_query_port(ibdev, port_num, &attr); if (ret) return ret; return sysfs_emit(buf, "%d: %s\n", attr.state, - attr.state >= 0 && - attr.state < ARRAY_SIZE(state_name) ?
- state_name[attr.state] : - "UNKNOWN"); + ib_port_state_to_str(attr.state)); } static ssize_t lid_show(struct ib_device *ibdev, u32 port_num, -- cgit v1.2.3-59-g8ed1b From d9d9434a3fee5c2b05ef661d57e3e31e6990ed7b Mon Sep 17 00:00:00 2001 From: Maher Sanalla Date: Mon, 3 Feb 2025 14:48:06 +0200 Subject: IB/hfi1: Remove state transition log message and opa_lstate_name() Remove the state transition log message from the hfi1 driver, as the IB core now logs the same information when handling a cache update event. While at it, replace the hfi1-specific opa_lstate_name() function with the ib_verbs equivalent function, ib_port_state_to_str(), for converting IB port state to a string. Signed-off-by: Maher Sanalla Link: https://patch.msgid.link/64e48bef00630e33f4b8c830cb7d9a69aedc34dc.1738586601.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/hfi1/chip.c | 18 ------------------ drivers/infiniband/hw/hfi1/chip.h | 1 - drivers/infiniband/hw/hfi1/driver.c | 2 +- drivers/infiniband/hw/hfi1/mad.c | 4 ++-- 4 files changed, 3 insertions(+), 22 deletions(-) (limited to 'drivers') diff --git a/drivers/infiniband/hw/hfi1/chip.c b/drivers/infiniband/hw/hfi1/chip.c index a442eca498b8..368b6be3226f 100644 --- a/drivers/infiniband/hw/hfi1/chip.c +++ b/drivers/infiniband/hw/hfi1/chip.c @@ -12882,22 +12882,6 @@ u32 chip_to_opa_pstate(struct hfi1_devdata *dd, u32 chip_pstate) } } -/* return the OPA port logical state name */ -const char *opa_lstate_name(u32 lstate) -{ - static const char * const port_logical_names[] = { - "PORT_NOP", - "PORT_DOWN", - "PORT_INIT", - "PORT_ARMED", - "PORT_ACTIVE", - "PORT_ACTIVE_DEFER", - }; - if (lstate < ARRAY_SIZE(port_logical_names)) - return port_logical_names[lstate]; - return "unknown"; -} - /* return the OPA port physical state name */ const char *opa_pstate_name(u32 pstate) { @@ -12956,8 +12940,6 @@ static void update_statusp(struct hfi1_pportdata *ppd, u32 state) break; } } - dd_dev_info(ppd->dd, "logical state changed to %s (0x%x)\n", - opa_lstate_name(state), state); } /** diff --git a/drivers/infiniband/hw/hfi1/chip.h b/drivers/infiniband/hw/hfi1/chip.h index 8841db16bde7..6992f6d40255 100644 --- a/drivers/infiniband/hw/hfi1/chip.h +++ b/drivers/infiniband/hw/hfi1/chip.h @@ -771,7 +771,6 @@ int is_bx(struct hfi1_devdata *dd); bool is_urg_masked(struct hfi1_ctxtdata *rcd); u32 read_physical_state(struct hfi1_devdata *dd); u32 chip_to_opa_pstate(struct hfi1_devdata *dd, u32 chip_pstate); -const char *opa_lstate_name(u32 lstate); const char *opa_pstate_name(u32 pstate); u32 driver_pstate(struct hfi1_pportdata *ppd); u32 driver_lstate(struct hfi1_pportdata *ppd); diff --git a/drivers/infiniband/hw/hfi1/driver.c b/drivers/infiniband/hw/hfi1/driver.c index 37a6794885d3..50826e7cdb7e 100644 --- a/drivers/infiniband/hw/hfi1/driver.c +++ b/drivers/infiniband/hw/hfi1/driver.c @@ -968,7 +968,7 @@ static bool __set_armed_to_active(struct hfi1_packet *packet) if (hwstate != IB_PORT_ACTIVE) { dd_dev_info(packet->rcd->dd, "Unexpected link state %s\n", - opa_lstate_name(hwstate)); + ib_port_state_to_str(hwstate)); return false; } diff --git a/drivers/infiniband/hw/hfi1/mad.c b/drivers/infiniband/hw/hfi1/mad.c index a9883295f4af..b39f63ce6dfc 100644 --- a/drivers/infiniband/hw/hfi1/mad.c +++ b/drivers/infiniband/hw/hfi1/mad.c @@ -1160,8 +1160,8 @@ static int port_states_transition_allowed(struct hfi1_pportdata *ppd, if (ret == HFI_TRANSITION_DISALLOWED || ret == HFI_TRANSITION_UNDEFINED) { pr_warn("invalid logical state transition %s -> %s\n", - 
opa_lstate_name(logical_old), - opa_lstate_name(logical_new)); + ib_port_state_to_str(logical_old), + ib_port_state_to_str(logical_new)); return ret; } -- cgit v1.2.3-59-g8ed1b From bad4480934c8227d8a03b6b76e276349506b2bfe Mon Sep 17 00:00:00 2001 From: Shiraz Saleem Date: Wed, 5 Feb 2025 02:35:12 -0800 Subject: RDMA/mana_ib: Query feature_flags bitmask from FW Extend the mana_ib_gd_query_adapter_caps function to retrieve and store the feature_flags from the firmware response. Signed-off-by: Shiraz Saleem Signed-off-by: Konstantin Taranov Link: https://patch.msgid.link/1738751713-16169-2-git-send-email-kotaranov@linux.microsoft.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mana/main.c | 3 ++- drivers/infiniband/hw/mana/mana_ib.h | 2 ++ include/net/mana/gdma.h | 1 + 3 files changed, 5 insertions(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/infiniband/hw/mana/main.c b/drivers/infiniband/hw/mana/main.c index ae1fb6983f66..3d4b8e21d63a 100644 --- a/drivers/infiniband/hw/mana/main.c +++ b/drivers/infiniband/hw/mana/main.c @@ -665,7 +665,7 @@ int mana_ib_gd_query_adapter_caps(struct mana_ib_dev *dev) mana_gd_init_req_hdr(&req.hdr, MANA_IB_GET_ADAPTER_CAP, sizeof(req), sizeof(resp)); - req.hdr.resp.msg_version = GDMA_MESSAGE_V3; + req.hdr.resp.msg_version = GDMA_MESSAGE_V4; req.hdr.dev_id = dev->gdma_dev->dev_id; err = mana_gd_send_request(mdev_to_gc(dev), sizeof(req), @@ -694,6 +694,7 @@ int mana_ib_gd_query_adapter_caps(struct mana_ib_dev *dev) caps->max_inline_data_size = resp.max_inline_data_size; caps->max_send_sge_count = resp.max_send_sge_count; caps->max_recv_sge_count = resp.max_recv_sge_count; + caps->feature_flags = resp.feature_flags; return 0; } diff --git a/drivers/infiniband/hw/mana/mana_ib.h b/drivers/infiniband/hw/mana/mana_ib.h index cd771af22b80..baaeef0dc8bb 100644 --- a/drivers/infiniband/hw/mana/mana_ib.h +++ b/drivers/infiniband/hw/mana/mana_ib.h @@ -58,6 +58,7 @@ struct mana_ib_adapter_caps { u32 max_send_sge_count; u32 max_recv_sge_count; u32 max_inline_data_size; + u64 feature_flags; }; struct mana_ib_queue { @@ -230,6 +231,7 @@ struct mana_ib_query_adapter_caps_resp { u32 max_send_sge_count; u32 max_recv_sge_count; u32 max_inline_data_size; + u64 feature_flags; }; /* HW Data */ struct mana_rnic_create_adapter_req { diff --git a/include/net/mana/gdma.h b/include/net/mana/gdma.h index a94b04ea926f..50ffbc408346 100644 --- a/include/net/mana/gdma.h +++ b/include/net/mana/gdma.h @@ -152,6 +152,7 @@ struct gdma_general_req { #define GDMA_MESSAGE_V1 1 #define GDMA_MESSAGE_V2 2 #define GDMA_MESSAGE_V3 3 +#define GDMA_MESSAGE_V4 4 struct gdma_general_resp { struct gdma_resp_hdr hdr; -- cgit v1.2.3-59-g8ed1b From cd3c5ddf823016b0c670d1965c5d312b3cf8bb7b Mon Sep 17 00:00:00 2001 From: Konstantin Taranov Date: Wed, 5 Feb 2025 02:35:13 -0800 Subject: RDMA/mana_ib: request error CQEs when supported Request an adapter with error CQEs when it is supported. 
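The handshake is advertise-then-request; note from the diff below that the advertised bit (BIT(4) in the caps response) and the requested bit (BIT(1) in the create-adapter request) live in different flag namespaces. A minimal sketch of the pattern, with simplified stand-ins for the structures involved:

	/* Sketch: opt into error CQEs only when the firmware advertises
	 * support. 'adapter_caps' and 'create_adapter_req' are reduced
	 * stand-ins for the structures in the diff below.
	 */
	struct adapter_caps { u64 feature_flags; };
	struct create_adapter_req { u64 feature_flags; };

	static void maybe_request_error_cqes(const struct adapter_caps *caps,
					     struct create_adapter_req *req)
	{
		if (caps->feature_flags & MANA_IB_FEATURE_CLIENT_ERROR_CQE_SUPPORT)
			req->feature_flags |= MANA_IB_FEATURE_CLIENT_ERROR_CQE_REQUEST;
	}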
Signed-off-by: Konstantin Taranov Link: https://patch.msgid.link/1738751713-16169-3-git-send-email-kotaranov@linux.microsoft.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mana/main.c | 3 +++ drivers/infiniband/hw/mana/mana_ib.h | 8 ++++++++ 2 files changed, 11 insertions(+) (limited to 'drivers') diff --git a/drivers/infiniband/hw/mana/main.c b/drivers/infiniband/hw/mana/main.c index 3d4b8e21d63a..e3230fe31fbc 100644 --- a/drivers/infiniband/hw/mana/main.c +++ b/drivers/infiniband/hw/mana/main.c @@ -794,6 +794,9 @@ int mana_ib_gd_create_rnic_adapter(struct mana_ib_dev *mdev) req.hdr.dev_id = gc->mana_ib.dev_id; req.notify_eq_id = mdev->fatal_err_eq->id; + if (mdev->adapter_caps.feature_flags & MANA_IB_FEATURE_CLIENT_ERROR_CQE_SUPPORT) + req.feature_flags |= MANA_IB_FEATURE_CLIENT_ERROR_CQE_REQUEST; + err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp); if (err) { ibdev_err(&mdev->ib_dev, "Failed to create RNIC adapter err %d", err); diff --git a/drivers/infiniband/hw/mana/mana_ib.h b/drivers/infiniband/hw/mana/mana_ib.h index baaeef0dc8bb..ad716a90c7be 100644 --- a/drivers/infiniband/hw/mana/mana_ib.h +++ b/drivers/infiniband/hw/mana/mana_ib.h @@ -211,6 +211,10 @@ struct mana_ib_query_adapter_caps_req { struct gdma_req_hdr hdr; }; /*HW Data */ +enum mana_ib_adapter_features { + MANA_IB_FEATURE_CLIENT_ERROR_CQE_SUPPORT = BIT(4), +}; + struct mana_ib_query_adapter_caps_resp { struct gdma_resp_hdr hdr; u32 max_sq_id; @@ -234,6 +238,10 @@ struct mana_ib_query_adapter_caps_resp { u64 feature_flags; }; /* HW Data */ +enum mana_ib_adapter_features_request { + MANA_IB_FEATURE_CLIENT_ERROR_CQE_REQUEST = BIT(1), +}; /*HW Data */ + struct mana_rnic_create_adapter_req { struct gdma_req_hdr hdr; u32 notify_eq_id; -- cgit v1.2.3-59-g8ed1b From 79bccd746132afe12b8bc3785e7b74b90440060d Mon Sep 17 00:00:00 2001 From: Shiraz Saleem Date: Wed, 5 Feb 2025 02:32:07 -0800 Subject: RDMA/mana_ib: Add port statistics support Implement alloc_hw_port_stats and get_hw_stats APIs to support querying MANA VF port level statistics from rdma stat tool. 
Example output from rdma stat tool: $rdma statistic show link mana_0/1 -p link mana_0/1 requester_timeout 45 requester_oos_nak 0 requester_rnr_nak 0 responder_rnr_nak 0 responder_oos 0 responder_dup_request 0 requester_implicit_nak 0 requester_readresp_psn_mismatch 0 nak_inv_req 0 nak_access_error 0 nak_opp_error 0 nak_inv_read 0 responder_local_len_error 0 requestor_local_prot_error 0 responder_rem_access_error 0 responder_local_qp_error 0 responder_malformed_wqe 0 general_hw_error 6 requester_rnr_nak_retries_exceeded 0 requester_retries_exceeded 5 total_fatal_error 6 received_cnps 0 num_qps_congested 0 rate_inc_events 0 num_qps_recovered 0 current_rate 100000 Signed-off-by: Shiraz Saleem Signed-off-by: Konstantin Taranov Link: https://patch.msgid.link/1738751527-15517-1-git-send-email-kotaranov@linux.microsoft.com Reviewed-by: Long Li Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mana/Makefile | 2 +- drivers/infiniband/hw/mana/counters.c | 105 ++++++++++++++++++++++++++++++++++ drivers/infiniband/hw/mana/counters.h | 44 ++++++++++++++ drivers/infiniband/hw/mana/device.c | 7 +++ drivers/infiniband/hw/mana/mana_ib.h | 37 ++++++++++++ 5 files changed, 194 insertions(+), 1 deletion(-) create mode 100644 drivers/infiniband/hw/mana/counters.c create mode 100644 drivers/infiniband/hw/mana/counters.h (limited to 'drivers') diff --git a/drivers/infiniband/hw/mana/Makefile b/drivers/infiniband/hw/mana/Makefile index 79426e725122..921c05e08b11 100644 --- a/drivers/infiniband/hw/mana/Makefile +++ b/drivers/infiniband/hw/mana/Makefile @@ -1,4 +1,4 @@ # SPDX-License-Identifier: GPL-2.0-only obj-$(CONFIG_MANA_INFINIBAND) += mana_ib.o -mana_ib-y := device.o main.o wq.o qp.o cq.o mr.o ah.o wr.o +mana_ib-y := device.o main.o wq.o qp.o cq.o mr.o ah.o wr.o counters.o diff --git a/drivers/infiniband/hw/mana/counters.c b/drivers/infiniband/hw/mana/counters.c new file mode 100644 index 000000000000..e533ce21013d --- /dev/null +++ b/drivers/infiniband/hw/mana/counters.c @@ -0,0 +1,105 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2024, Microsoft Corporation. All rights reserved. 
+ */ + +#include "counters.h" + +static const struct rdma_stat_desc mana_ib_port_stats_desc[] = { + [MANA_IB_REQUESTER_TIMEOUT].name = "requester_timeout", + [MANA_IB_REQUESTER_OOS_NAK].name = "requester_oos_nak", + [MANA_IB_REQUESTER_RNR_NAK].name = "requester_rnr_nak", + [MANA_IB_RESPONDER_RNR_NAK].name = "responder_rnr_nak", + [MANA_IB_RESPONDER_OOS].name = "responder_oos", + [MANA_IB_RESPONDER_DUP_REQUEST].name = "responder_dup_request", + [MANA_IB_REQUESTER_IMPLICIT_NAK].name = "requester_implicit_nak", + [MANA_IB_REQUESTER_READRESP_PSN_MISMATCH].name = "requester_readresp_psn_mismatch", + [MANA_IB_NAK_INV_REQ].name = "nak_inv_req", + [MANA_IB_NAK_ACCESS_ERR].name = "nak_access_error", + [MANA_IB_NAK_OPP_ERR].name = "nak_opp_error", + [MANA_IB_NAK_INV_READ].name = "nak_inv_read", + [MANA_IB_RESPONDER_LOCAL_LEN_ERR].name = "responder_local_len_error", + [MANA_IB_REQUESTOR_LOCAL_PROT_ERR].name = "requestor_local_prot_error", + [MANA_IB_RESPONDER_REM_ACCESS_ERR].name = "responder_rem_access_error", + [MANA_IB_RESPONDER_LOCAL_QP_ERR].name = "responder_local_qp_error", + [MANA_IB_RESPONDER_MALFORMED_WQE].name = "responder_malformed_wqe", + [MANA_IB_GENERAL_HW_ERR].name = "general_hw_error", + [MANA_IB_REQUESTER_RNR_NAK_RETRIES_EXCEEDED].name = "requester_rnr_nak_retries_exceeded", + [MANA_IB_REQUESTER_RETRIES_EXCEEDED].name = "requester_retries_exceeded", + [MANA_IB_TOTAL_FATAL_ERR].name = "total_fatal_error", + [MANA_IB_RECEIVED_CNPS].name = "received_cnps", + [MANA_IB_NUM_QPS_CONGESTED].name = "num_qps_congested", + [MANA_IB_RATE_INC_EVENTS].name = "rate_inc_events", + [MANA_IB_NUM_QPS_RECOVERED].name = "num_qps_recovered", + [MANA_IB_CURRENT_RATE].name = "current_rate", +}; + +struct rdma_hw_stats *mana_ib_alloc_hw_port_stats(struct ib_device *ibdev, + u32 port_num) +{ + return rdma_alloc_hw_stats_struct(mana_ib_port_stats_desc, + ARRAY_SIZE(mana_ib_port_stats_desc), + RDMA_HW_STATS_DEFAULT_LIFESPAN); +} + +int mana_ib_get_hw_stats(struct ib_device *ibdev, struct rdma_hw_stats *stats, + u32 port_num, int index) +{ + struct mana_ib_dev *mdev = container_of(ibdev, struct mana_ib_dev, + ib_dev); + struct mana_rnic_query_vf_cntrs_resp resp = {}; + struct mana_rnic_query_vf_cntrs_req req = {}; + int err; + + mana_gd_init_req_hdr(&req.hdr, MANA_IB_QUERY_VF_COUNTERS, + sizeof(req), sizeof(resp)); + req.hdr.dev_id = mdev->gdma_dev->dev_id; + req.adapter = mdev->adapter_handle; + + err = mana_gd_send_request(mdev_to_gc(mdev), sizeof(req), &req, + sizeof(resp), &resp); + if (err) { + ibdev_err(&mdev->ib_dev, "Failed to query vf counters err %d", + err); + return err; + } + + stats->value[MANA_IB_REQUESTER_TIMEOUT] = resp.requester_timeout; + stats->value[MANA_IB_REQUESTER_OOS_NAK] = resp.requester_oos_nak; + stats->value[MANA_IB_REQUESTER_RNR_NAK] = resp.requester_rnr_nak; + stats->value[MANA_IB_RESPONDER_RNR_NAK] = resp.responder_rnr_nak; + stats->value[MANA_IB_RESPONDER_OOS] = resp.responder_oos; + stats->value[MANA_IB_RESPONDER_DUP_REQUEST] = resp.responder_dup_request; + stats->value[MANA_IB_REQUESTER_IMPLICIT_NAK] = + resp.requester_implicit_nak; + stats->value[MANA_IB_REQUESTER_READRESP_PSN_MISMATCH] = + resp.requester_readresp_psn_mismatch; + stats->value[MANA_IB_NAK_INV_REQ] = resp.nak_inv_req; + stats->value[MANA_IB_NAK_ACCESS_ERR] = resp.nak_access_err; + stats->value[MANA_IB_NAK_OPP_ERR] = resp.nak_opp_err; + stats->value[MANA_IB_NAK_INV_READ] = resp.nak_inv_read; + stats->value[MANA_IB_RESPONDER_LOCAL_LEN_ERR] = + resp.responder_local_len_err; + 
stats->value[MANA_IB_REQUESTOR_LOCAL_PROT_ERR] = + resp.requestor_local_prot_err; + stats->value[MANA_IB_RESPONDER_REM_ACCESS_ERR] = + resp.responder_rem_access_err; + stats->value[MANA_IB_RESPONDER_LOCAL_QP_ERR] = + resp.responder_local_qp_err; + stats->value[MANA_IB_RESPONDER_MALFORMED_WQE] = + resp.responder_malformed_wqe; + stats->value[MANA_IB_GENERAL_HW_ERR] = resp.general_hw_err; + stats->value[MANA_IB_REQUESTER_RNR_NAK_RETRIES_EXCEEDED] = + resp.requester_rnr_nak_retries_exceeded; + stats->value[MANA_IB_REQUESTER_RETRIES_EXCEEDED] = + resp.requester_retries_exceeded; + stats->value[MANA_IB_TOTAL_FATAL_ERR] = resp.total_fatal_err; + + stats->value[MANA_IB_RECEIVED_CNPS] = resp.received_cnps; + stats->value[MANA_IB_NUM_QPS_CONGESTED] = resp.num_qps_congested; + stats->value[MANA_IB_RATE_INC_EVENTS] = resp.rate_inc_events; + stats->value[MANA_IB_NUM_QPS_RECOVERED] = resp.num_qps_recovered; + stats->value[MANA_IB_CURRENT_RATE] = resp.current_rate; + + return ARRAY_SIZE(mana_ib_port_stats_desc); +} diff --git a/drivers/infiniband/hw/mana/counters.h b/drivers/infiniband/hw/mana/counters.h new file mode 100644 index 000000000000..7ff92d27f6c3 --- /dev/null +++ b/drivers/infiniband/hw/mana/counters.h @@ -0,0 +1,44 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2024 Microsoft Corporation. All rights reserved. + */ + +#ifndef _COUNTERS_H_ +#define _COUNTERS_H_ + +#include "mana_ib.h" + +enum mana_ib_port_counters { + MANA_IB_REQUESTER_TIMEOUT, + MANA_IB_REQUESTER_OOS_NAK, + MANA_IB_REQUESTER_RNR_NAK, + MANA_IB_RESPONDER_RNR_NAK, + MANA_IB_RESPONDER_OOS, + MANA_IB_RESPONDER_DUP_REQUEST, + MANA_IB_REQUESTER_IMPLICIT_NAK, + MANA_IB_REQUESTER_READRESP_PSN_MISMATCH, + MANA_IB_NAK_INV_REQ, + MANA_IB_NAK_ACCESS_ERR, + MANA_IB_NAK_OPP_ERR, + MANA_IB_NAK_INV_READ, + MANA_IB_RESPONDER_LOCAL_LEN_ERR, + MANA_IB_REQUESTOR_LOCAL_PROT_ERR, + MANA_IB_RESPONDER_REM_ACCESS_ERR, + MANA_IB_RESPONDER_LOCAL_QP_ERR, + MANA_IB_RESPONDER_MALFORMED_WQE, + MANA_IB_GENERAL_HW_ERR, + MANA_IB_REQUESTER_RNR_NAK_RETRIES_EXCEEDED, + MANA_IB_REQUESTER_RETRIES_EXCEEDED, + MANA_IB_TOTAL_FATAL_ERR, + MANA_IB_RECEIVED_CNPS, + MANA_IB_NUM_QPS_CONGESTED, + MANA_IB_RATE_INC_EVENTS, + MANA_IB_NUM_QPS_RECOVERED, + MANA_IB_CURRENT_RATE, +}; + +struct rdma_hw_stats *mana_ib_alloc_hw_port_stats(struct ib_device *ibdev, + u32 port_num); +int mana_ib_get_hw_stats(struct ib_device *ibdev, struct rdma_hw_stats *stats, + u32 port_num, int index); +#endif /* _COUNTERS_H_ */ diff --git a/drivers/infiniband/hw/mana/device.c b/drivers/infiniband/hw/mana/device.c index 0a7553f819ba..640a2c96b5ca 100644 --- a/drivers/infiniband/hw/mana/device.c +++ b/drivers/infiniband/hw/mana/device.c @@ -59,6 +59,11 @@ static const struct ib_device_ops mana_ib_dev_ops = { ib_ind_table), }; +static const struct ib_device_ops mana_ib_stats_ops = { + .alloc_hw_port_stats = mana_ib_alloc_hw_port_stats, + .get_hw_stats = mana_ib_get_hw_stats, +}; + static int mana_ib_probe(struct auxiliary_device *adev, const struct auxiliary_device_id *id) { @@ -124,6 +129,8 @@ static int mana_ib_probe(struct auxiliary_device *adev, goto deregister_device; } + ib_set_device_ops(&dev->ib_dev, &mana_ib_stats_ops); + ret = mana_ib_create_eqs(dev); if (ret) { ibdev_err(&dev->ib_dev, "Failed to create EQs, ret %d", ret); diff --git a/drivers/infiniband/hw/mana/mana_ib.h b/drivers/infiniband/hw/mana/mana_ib.h index ad716a90c7be..7e342e070b70 100644 --- a/drivers/infiniband/hw/mana/mana_ib.h +++ b/drivers/infiniband/hw/mana/mana_ib.h @@ -15,6 +15,7 @@ #include 
#include "shadow_queue.h" +#include "counters.h" #define PAGE_SZ_BM \ (SZ_4K | SZ_8K | SZ_16K | SZ_32K | SZ_64K | SZ_128K | SZ_256K | \ @@ -205,6 +206,7 @@ enum mana_ib_command_code { MANA_IB_CREATE_RC_QP = 0x3000a, MANA_IB_DESTROY_RC_QP = 0x3000b, MANA_IB_SET_QP_STATE = 0x3000d, + MANA_IB_QUERY_VF_COUNTERS = 0x30022, }; struct mana_ib_query_adapter_caps_req { @@ -476,6 +478,41 @@ struct mana_rdma_cqe { }; }; /* HW DATA */ +struct mana_rnic_query_vf_cntrs_req { + struct gdma_req_hdr hdr; + mana_handle_t adapter; +}; /* HW Data */ + +struct mana_rnic_query_vf_cntrs_resp { + struct gdma_resp_hdr hdr; + u64 requester_timeout; + u64 requester_oos_nak; + u64 requester_rnr_nak; + u64 responder_rnr_nak; + u64 responder_oos; + u64 responder_dup_request; + u64 requester_implicit_nak; + u64 requester_readresp_psn_mismatch; + u64 nak_inv_req; + u64 nak_access_err; + u64 nak_opp_err; + u64 nak_inv_read; + u64 responder_local_len_err; + u64 requestor_local_prot_err; + u64 responder_rem_access_err; + u64 responder_local_qp_err; + u64 responder_malformed_wqe; + u64 general_hw_err; + u64 requester_rnr_nak_retries_exceeded; + u64 requester_retries_exceeded; + u64 total_fatal_err; + u64 received_cnps; + u64 num_qps_congested; + u64 rate_inc_events; + u64 num_qps_recovered; + u64 current_rate; +}; /* HW Data */ + static inline struct gdma_context *mdev_to_gc(struct mana_ib_dev *mdev) { return mdev->gdma_dev->gdma_context; -- cgit v1.2.3-59-g8ed1b From ccca5e8aa14572315dc9b9dadb3a06a6a6a607f7 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 6 Feb 2025 19:23:16 -0800 Subject: RDMA/rxe: switch to using the crc32 library Now that the crc32_le() library function takes advantage of architecture-specific optimizations, it is unnecessary to go through the crypto API. Just use crc32_le(). This is much simpler, and it improves performance due to eliminating the crypto API overhead. Signed-off-by: Eric Biggers Link: https://patch.msgid.link/20250207032316.53941-1-ebiggers@kernel.org Reviewed-by: Zhu Yanjun Signed-off-by: Leon Romanovsky --- drivers/infiniband/sw/rxe/Kconfig | 3 +-- drivers/infiniband/sw/rxe/rxe.c | 3 --- drivers/infiniband/sw/rxe/rxe.h | 1 - drivers/infiniband/sw/rxe/rxe_icrc.c | 40 +---------------------------------- drivers/infiniband/sw/rxe/rxe_loc.h | 1 - drivers/infiniband/sw/rxe/rxe_req.c | 1 - drivers/infiniband/sw/rxe/rxe_verbs.c | 4 ---- drivers/infiniband/sw/rxe/rxe_verbs.h | 1 - 8 files changed, 2 insertions(+), 52 deletions(-) (limited to 'drivers') diff --git a/drivers/infiniband/sw/rxe/Kconfig b/drivers/infiniband/sw/rxe/Kconfig index 06b8dc5093f7..c180e7ebcfc5 100644 --- a/drivers/infiniband/sw/rxe/Kconfig +++ b/drivers/infiniband/sw/rxe/Kconfig @@ -4,8 +4,7 @@ config RDMA_RXE depends on INET && PCI && INFINIBAND depends on INFINIBAND_VIRT_DMA select NET_UDP_TUNNEL - select CRYPTO - select CRYPTO_CRC32 + select CRC32 help This driver implements the InfiniBand RDMA transport over the Linux network stack. 
It enables a system with a diff --git a/drivers/infiniband/sw/rxe/rxe.c b/drivers/infiniband/sw/rxe/rxe.c index 432e54a29b99..83a16dba1eff 100644 --- a/drivers/infiniband/sw/rxe/rxe.c +++ b/drivers/infiniband/sw/rxe/rxe.c @@ -31,9 +31,6 @@ void rxe_dealloc(struct ib_device *ib_dev) WARN_ON(!RB_EMPTY_ROOT(&rxe->mcg_tree)); - if (rxe->tfm) - crypto_free_shash(rxe->tfm); - mutex_destroy(&rxe->usdev_lock); } diff --git a/drivers/infiniband/sw/rxe/rxe.h b/drivers/infiniband/sw/rxe/rxe.h index fe7f97066732..8db65731499d 100644 --- a/drivers/infiniband/sw/rxe/rxe.h +++ b/drivers/infiniband/sw/rxe/rxe.h @@ -21,7 +21,6 @@ #include #include #include -#include #include "rxe_net.h" #include "rxe_opcode.h" diff --git a/drivers/infiniband/sw/rxe/rxe_icrc.c b/drivers/infiniband/sw/rxe/rxe_icrc.c index fdf5f08cd8f1..76d760fbe7ea 100644 --- a/drivers/infiniband/sw/rxe/rxe_icrc.c +++ b/drivers/infiniband/sw/rxe/rxe_icrc.c @@ -9,28 +9,6 @@ #include "rxe.h" #include "rxe_loc.h" -/** - * rxe_icrc_init() - Initialize crypto function for computing crc32 - * @rxe: rdma_rxe device object - * - * Return: 0 on success else an error - */ -int rxe_icrc_init(struct rxe_dev *rxe) -{ - struct crypto_shash *tfm; - - tfm = crypto_alloc_shash("crc32", 0, 0); - if (IS_ERR(tfm)) { - rxe_dbg_dev(rxe, "failed to init crc32 algorithm err: %ld\n", - PTR_ERR(tfm)); - return PTR_ERR(tfm); - } - - rxe->tfm = tfm; - - return 0; -} - /** * rxe_crc32() - Compute cumulative crc32 for a contiguous segment * @rxe: rdma_rxe device object @@ -42,23 +20,7 @@ int rxe_icrc_init(struct rxe_dev *rxe) */ static __be32 rxe_crc32(struct rxe_dev *rxe, __be32 crc, void *next, size_t len) { - __be32 icrc; - int err; - - SHASH_DESC_ON_STACK(shash, rxe->tfm); - - shash->tfm = rxe->tfm; - *(__be32 *)shash_desc_ctx(shash) = crc; - err = crypto_shash_update(shash, next, len); - if (unlikely(err)) { - rxe_dbg_dev(rxe, "failed crc calculation, err: %d\n", err); - return (__force __be32)crc32_le((__force u32)crc, next, len); - } - - icrc = *(__be32 *)shash_desc_ctx(shash); - barrier_data(shash_desc_ctx(shash)); - - return icrc; + return (__force __be32)crc32_le((__force u32)crc, next, len); } /** diff --git a/drivers/infiniband/sw/rxe/rxe_loc.h b/drivers/infiniband/sw/rxe/rxe_loc.h index ded46119151b..c57ab8975c5d 100644 --- a/drivers/infiniband/sw/rxe/rxe_loc.h +++ b/drivers/infiniband/sw/rxe/rxe_loc.h @@ -168,7 +168,6 @@ int rxe_sender(struct rxe_qp *qp); int rxe_receiver(struct rxe_qp *qp); /* rxe_icrc.c */ -int rxe_icrc_init(struct rxe_dev *rxe); int rxe_icrc_check(struct sk_buff *skb, struct rxe_pkt_info *pkt); void rxe_icrc_generate(struct sk_buff *skb, struct rxe_pkt_info *pkt); diff --git a/drivers/infiniband/sw/rxe/rxe_req.c b/drivers/infiniband/sw/rxe/rxe_req.c index 87a02f0deb00..9d0392df8a92 100644 --- a/drivers/infiniband/sw/rxe/rxe_req.c +++ b/drivers/infiniband/sw/rxe/rxe_req.c @@ -5,7 +5,6 @@ */ #include -#include #include "rxe.h" #include "rxe_loc.h" diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c index 08a5836c2600..a9494c6030af 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.c +++ b/drivers/infiniband/sw/rxe/rxe_verbs.c @@ -1546,10 +1546,6 @@ int rxe_register_device(struct rxe_dev *rxe, const char *ibdev_name, if (err) return err; - err = rxe_icrc_init(rxe); - if (err) - return err; - err = ib_register_device(dev, ibdev_name, NULL); if (err) rxe_dbg_dev(rxe, "failed with error %d\n", err); diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.h b/drivers/infiniband/sw/rxe/rxe_verbs.h index 
729a6ada46af..3d0faab9a8f7 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.h +++ b/drivers/infiniband/sw/rxe/rxe_verbs.h @@ -404,7 +404,6 @@ struct rxe_dev { atomic64_t stats_counters[RXE_NUM_OF_COUNTERS]; struct rxe_port port; - struct crypto_shash *tfm; }; static inline struct net_device *rxe_ib_device_get_netdev(struct ib_device *dev) -- cgit v1.2.3-59-g8ed1b From 607a7dcf2e981449a4b200f497f8ea97ddb5e13f Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 7 Feb 2025 12:16:03 +0300 Subject: RDMA/mana_ib: Fix error code in probe() Return -ENOMEM if dma_pool_create() fails. Don't return success. Fixes: df91c470d9e5 ("RDMA/mana_ib: create/destroy AH") Signed-off-by: Dan Carpenter Link: https://patch.msgid.link/2bbe900e-18b3-46b5-a08c-42eb71886da6@stanley.mountain Reviewed-by: Long Li Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mana/device.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/infiniband/hw/mana/device.c b/drivers/infiniband/hw/mana/device.c index 640a2c96b5ca..0c7a9929168f 100644 --- a/drivers/infiniband/hw/mana/device.c +++ b/drivers/infiniband/hw/mana/device.c @@ -151,8 +151,10 @@ static int mana_ib_probe(struct auxiliary_device *adev, dev->av_pool = dma_pool_create("mana_ib_av", mdev->gdma_context->dev, MANA_AV_BUFFER_SIZE, MANA_AV_BUFFER_SIZE, 0); - if (!dev->av_pool) + if (!dev->av_pool) { + ret = -ENOMEM; goto destroy_rnic; + } ret = ib_register_device(&dev->ib_dev, "mana_%d", mdev->gdma_context->dev); -- cgit v1.2.3-59-g8ed1b From dbc641ecf1cbd41a649e7ac6ea7175562ef599b2 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 7 Feb 2025 12:16:19 +0300 Subject: RDMA/bnxt_re: Fix buffer overflow in debugfs code Add some bounds checking to prevent memory corruption in bnxt_re_cc_config_set(). This is debugfs code so the bug can only be triggered by root. Fixes: 656dff55da19 ("RDMA/bnxt_re: Congestion control settings using debugfs hook") Signed-off-by: Dan Carpenter Link: https://patch.msgid.link/a6b081ab-55fe-4d0c-8f69-c5e5a59e9141@stanley.mountain Acked-by: Selvin Xavier Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/bnxt_re/debugfs.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'drivers') diff --git a/drivers/infiniband/hw/bnxt_re/debugfs.c b/drivers/infiniband/hw/bnxt_re/debugfs.c index f4dd2fb51867..d7354e7753fe 100644 --- a/drivers/infiniband/hw/bnxt_re/debugfs.c +++ b/drivers/infiniband/hw/bnxt_re/debugfs.c @@ -285,6 +285,9 @@ static ssize_t bnxt_re_cc_config_set(struct file *filp, const char __user *buffe u32 val; int rc; + if (count >= sizeof(buf)) + return -EINVAL; + if (copy_from_user(buf, buffer, count)) return -EFAULT; -- cgit v1.2.3-59-g8ed1b From f26e648a978ae7958e0958095768363c851a736d Mon Sep 17 00:00:00 2001 From: Selvin Xavier Date: Sat, 8 Feb 2025 05:48:26 -0800 Subject: RDMA/bnxt_re: Fix the condition check while programming congestion control Program the Congestion control values when the CC gen matches. Fix the condition check for the same. 
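To make the one-character fix easier to follow, here is the before/after logic, condensed from the diff below and the original debugfs patch earlier on this page; with the inverted test, gen-0 writes (the only kind the debugfs files issue, since cc_gen is always set to CC_CONFIG_GEN0_EXT0) always returned -EINVAL:

	/* Before the fix (inverted test):
	 *	if (gen_ext != CC_CONFIG_GEN0_EXT0)
	 *		bnxt_re_fill_gen0_ext0(&ccparam, offset, val);
	 *	else
	 *		return -EINVAL;
	 *
	 * After: program the values only when the generation matches Gen 0.
	 */
	if (gen_ext == CC_CONFIG_GEN0_EXT0)
		bnxt_re_fill_gen0_ext0(&ccparam, offset, val);
	else
		return -EINVAL;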
Fixes: 656dff55da19 ("RDMA/bnxt_re: Congestion control settings using debugfs hook") Reported-by: Kalesh AP Reported-by: Chengchang Tang Signed-off-by: Selvin Xavier Link: https://patch.msgid.link/1739022506-8937-1-git-send-email-selvin.xavier@broadcom.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/bnxt_re/debugfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/infiniband/hw/bnxt_re/debugfs.c b/drivers/infiniband/hw/bnxt_re/debugfs.c index d7354e7753fe..af91d16c3c77 100644 --- a/drivers/infiniband/hw/bnxt_re/debugfs.c +++ b/drivers/infiniband/hw/bnxt_re/debugfs.c @@ -265,7 +265,7 @@ static int bnxt_re_configure_cc(struct bnxt_re_dev *rdev, u32 gen_ext, u32 offse struct bnxt_qplib_cc_param ccparam = { }; /* Supporting only Gen 0 now */ - if (gen_ext != CC_CONFIG_GEN0_EXT0) + if (gen_ext == CC_CONFIG_GEN0_EXT0) bnxt_re_fill_gen0_ext0(&ccparam, offset, val); else return -EINVAL; -- cgit v1.2.3-59-g8ed1b
From 161072d43a8cd2f1e4c9612f7e41d5d070c1d01b Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 6 Feb 2025 20:08:16 -0800 Subject: RDMA/irdma: Switch to using the crc32c library Now that the crc32c() library function directly takes advantage of architecture-specific optimizations, it is unnecessary to go through the crypto API. Just use crc32c(). This is much simpler, and it improves performance due to eliminating the crypto API overhead. Note that for crc32c the equivalent of crypto_shash_digest() is cpu_to_le32(~crc32c(~0, ...)), considering that crypto_shash_digest() had the pre- and post-inversions as well as a cpu_to_le32() built in. This means that this driver is using u32 for fixed-endian types; this patch does not try to fix that but rather just keeps the exact same behavior. Link: https://lore.kernel.org/r/20250207032316.53941-1-ebiggers@kernel.org Signed-off-by: Eric Biggers Link: https://patch.msgid.link/20250207040816.69163-1-ebiggers@kernel.org Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/irdma/Kconfig | 1 + drivers/infiniband/hw/irdma/main.h | 1 - drivers/infiniband/hw/irdma/osdep.h | 6 +---- drivers/infiniband/hw/irdma/puda.c | 19 ++++++--------- drivers/infiniband/hw/irdma/puda.h | 5 +--- drivers/infiniband/hw/irdma/utils.c | 47 ++----------------------------------- 6 files changed, 12 insertions(+), 67 deletions(-) (limited to 'drivers') diff --git a/drivers/infiniband/hw/irdma/Kconfig b/drivers/infiniband/hw/irdma/Kconfig index b6f9c41bca51..5f49a58590ed 100644 --- a/drivers/infiniband/hw/irdma/Kconfig +++ b/drivers/infiniband/hw/irdma/Kconfig @@ -7,6 +7,7 @@ config INFINIBAND_IRDMA depends on ICE && I40E select GENERIC_ALLOCATOR select AUXILIARY_BUS + select CRC32 help This is an Intel(R) Ethernet Protocol Driver for RDMA driver that support E810 (iWARP/RoCE) and X722 (iWARP) network devices.
diff --git a/drivers/infiniband/hw/irdma/main.h b/drivers/infiniband/hw/irdma/main.h index 9f0ed6e84471..0705ef3d72a9 100644 --- a/drivers/infiniband/hw/irdma/main.h +++ b/drivers/infiniband/hw/irdma/main.h @@ -30,7 +30,6 @@ #endif #include #include -#include #include #include #include diff --git a/drivers/infiniband/hw/irdma/osdep.h b/drivers/infiniband/hw/irdma/osdep.h index ddf02a462efa..4b4f78288d12 100644 --- a/drivers/infiniband/hw/irdma/osdep.h +++ b/drivers/infiniband/hw/irdma/osdep.h @@ -6,7 +6,6 @@ #include #include #include -#include #include #define STATS_TIMER_DELAY 60000 @@ -43,15 +42,12 @@ enum irdma_status_code irdma_vf_wait_vchnl_resp(struct irdma_sc_dev *dev); bool irdma_vf_clear_to_send(struct irdma_sc_dev *dev); void irdma_add_dev_ref(struct irdma_sc_dev *dev); void irdma_put_dev_ref(struct irdma_sc_dev *dev); -int irdma_ieq_check_mpacrc(struct shash_desc *desc, void *addr, u32 len, - u32 val); +int irdma_ieq_check_mpacrc(const void *addr, u32 len, u32 val); struct irdma_sc_qp *irdma_ieq_get_qp(struct irdma_sc_dev *dev, struct irdma_puda_buf *buf); void irdma_send_ieq_ack(struct irdma_sc_qp *qp); void irdma_ieq_update_tcpip_info(struct irdma_puda_buf *buf, u16 len, u32 seqnum); -void irdma_free_hash_desc(struct shash_desc *hash_desc); -int irdma_init_hash_desc(struct shash_desc **hash_desc); int irdma_puda_get_tcpip_info(struct irdma_puda_cmpl_info *info, struct irdma_puda_buf *buf); int irdma_cqp_sds_cmd(struct irdma_sc_dev *dev, diff --git a/drivers/infiniband/hw/irdma/puda.c b/drivers/infiniband/hw/irdma/puda.c index 7e3f9bca2c23..694e5a9ed15d 100644 --- a/drivers/infiniband/hw/irdma/puda.c +++ b/drivers/infiniband/hw/irdma/puda.c @@ -923,8 +923,6 @@ void irdma_puda_dele_rsrc(struct irdma_sc_vsi *vsi, enum puda_rsrc_type type, switch (rsrc->cmpl) { case PUDA_HASH_CRC_COMPLETE: - irdma_free_hash_desc(rsrc->hash_desc); - fallthrough; case PUDA_QP_CREATED: irdma_qp_rem_qos(&rsrc->qp); @@ -1095,15 +1093,12 @@ int irdma_puda_create_rsrc(struct irdma_sc_vsi *vsi, goto error; if (info->type == IRDMA_PUDA_RSRC_TYPE_IEQ) { - if (!irdma_init_hash_desc(&rsrc->hash_desc)) { - rsrc->check_crc = true; - rsrc->cmpl = PUDA_HASH_CRC_COMPLETE; - ret = 0; - } + rsrc->check_crc = true; + rsrc->cmpl = PUDA_HASH_CRC_COMPLETE; } irdma_sc_ccq_arm(&rsrc->cq); - return ret; + return 0; error: irdma_puda_dele_rsrc(vsi, info->type, false); @@ -1396,8 +1391,8 @@ static int irdma_ieq_handle_partial(struct irdma_puda_rsrc *ieq, crcptr = txbuf->data + fpdu_len - 4; mpacrc = *(u32 *)crcptr; if (ieq->check_crc) { - status = irdma_ieq_check_mpacrc(ieq->hash_desc, txbuf->data, - (fpdu_len - 4), mpacrc); + status = irdma_ieq_check_mpacrc(txbuf->data, fpdu_len - 4, + mpacrc); if (status) { ibdev_dbg(to_ibdev(ieq->dev), "IEQ: error bad crc\n"); goto error; @@ -1465,8 +1460,8 @@ static int irdma_ieq_process_buf(struct irdma_puda_rsrc *ieq, crcptr = datap + fpdu_len - 4; mpacrc = *(u32 *)crcptr; if (ieq->check_crc) - ret = irdma_ieq_check_mpacrc(ieq->hash_desc, datap, - fpdu_len - 4, mpacrc); + ret = irdma_ieq_check_mpacrc(datap, fpdu_len - 4, + mpacrc); if (ret) { list_add(&buf->list, rxlist); ibdev_dbg(to_ibdev(ieq->dev), diff --git a/drivers/infiniband/hw/irdma/puda.h b/drivers/infiniband/hw/irdma/puda.h index bc6d9514c9c1..2fc638f2b143 100644 --- a/drivers/infiniband/hw/irdma/puda.h +++ b/drivers/infiniband/hw/irdma/puda.h @@ -119,7 +119,6 @@ struct irdma_puda_rsrc { u32 rx_wqe_idx; u32 rxq_invalid_cnt; u32 tx_wqe_avail_cnt; - struct shash_desc *hash_desc; struct list_head txpend; struct list_head 
bufpool; /* free buffers pool list for recv and xmit */ u32 alloc_buf_count; @@ -163,10 +162,8 @@ struct irdma_sc_qp *irdma_ieq_get_qp(struct irdma_sc_dev *dev, struct irdma_puda_buf *buf); int irdma_puda_get_tcpip_info(struct irdma_puda_cmpl_info *info, struct irdma_puda_buf *buf); -int irdma_ieq_check_mpacrc(struct shash_desc *desc, void *addr, u32 len, u32 val); -int irdma_init_hash_desc(struct shash_desc **desc); +int irdma_ieq_check_mpacrc(const void *addr, u32 len, u32 val); void irdma_ieq_mpa_crc_ae(struct irdma_sc_dev *dev, struct irdma_sc_qp *qp); -void irdma_free_hash_desc(struct shash_desc *desc); void irdma_ieq_update_tcpip_info(struct irdma_puda_buf *buf, u16 len, u32 seqnum); int irdma_cqp_qp_create_cmd(struct irdma_sc_dev *dev, struct irdma_sc_qp *qp); int irdma_cqp_cq_create_cmd(struct irdma_sc_dev *dev, struct irdma_sc_cq *cq); diff --git a/drivers/infiniband/hw/irdma/utils.c b/drivers/infiniband/hw/irdma/utils.c index 0e594122baa7..e73b14fd95ef 100644 --- a/drivers/infiniband/hw/irdma/utils.c +++ b/drivers/infiniband/hw/irdma/utils.c @@ -1273,58 +1273,15 @@ void irdma_ieq_mpa_crc_ae(struct irdma_sc_dev *dev, struct irdma_sc_qp *qp) irdma_gen_ae(rf, qp, &info, false); } -/** - * irdma_init_hash_desc - initialize hash for crc calculation - * @desc: cryption type - */ -int irdma_init_hash_desc(struct shash_desc **desc) -{ - struct crypto_shash *tfm; - struct shash_desc *tdesc; - - tfm = crypto_alloc_shash("crc32c", 0, 0); - if (IS_ERR(tfm)) - return -EINVAL; - - tdesc = kzalloc(sizeof(*tdesc) + crypto_shash_descsize(tfm), - GFP_KERNEL); - if (!tdesc) { - crypto_free_shash(tfm); - return -EINVAL; - } - - tdesc->tfm = tfm; - *desc = tdesc; - - return 0; -} - -/** - * irdma_free_hash_desc - free hash desc - * @desc: to be freed - */ -void irdma_free_hash_desc(struct shash_desc *desc) -{ - if (desc) { - crypto_free_shash(desc->tfm); - kfree(desc); - } -} - /** * irdma_ieq_check_mpacrc - check if mpa crc is OK - * @desc: desc for hash * @addr: address of buffer for crc * @len: length of buffer * @val: value to be compared */ -int irdma_ieq_check_mpacrc(struct shash_desc *desc, void *addr, u32 len, - u32 val) +int irdma_ieq_check_mpacrc(const void *addr, u32 len, u32 val) { - u32 crc = 0; - - crypto_shash_digest(desc, addr, len, (u8 *)&crc); - if (crc != val) + if ((__force u32)cpu_to_le32(~crc32c(~0, addr, len)) != val) return -EINVAL; return 0; -- cgit v1.2.3-59-g8ed1b From ffd67b6b420d0549e250f91eb670e98e189cd73a Mon Sep 17 00:00:00 2001 From: Konstantin Taranov Date: Thu, 13 Feb 2025 05:54:21 -0800 Subject: RDMA/mana_ib: Implement DMABUF MR support Add support of dmabuf MRs to mana_ib. 
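As a rough usage sketch (not part of the patch): a consumer reaches this path through rdma-core's ibv_reg_dmabuf_mr(), with the dmabuf fd exported elsewhere; get_dmabuf_fd() below is a hypothetical placeholder for a GPU or allocator export.

    #include <infiniband/verbs.h>

    /* Sketch only: register a dmabuf-backed MR from userspace.
     * get_dmabuf_fd() is a hypothetical helper; the rest is the
     * stock rdma-core API. */
    static struct ibv_mr *register_dmabuf_mr(struct ibv_pd *pd, size_t len)
    {
            int fd = get_dmabuf_fd(len);        /* hypothetical helper */

            if (fd < 0)
                    return NULL;

            /* offset 0 within the dmabuf; iova 0 chosen for simplicity */
            return ibv_reg_dmabuf_mr(pd, 0, len, 0, fd,
                                     IBV_ACCESS_LOCAL_WRITE |
                                     IBV_ACCESS_REMOTE_READ |
                                     IBV_ACCESS_REMOTE_WRITE);
    }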
Signed-off-by: Konstantin Taranov Link: https://patch.msgid.link/1739454861-4456-1-git-send-email-kotaranov@linux.microsoft.com Reviewed-by: Long Li Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mana/device.c | 1 + drivers/infiniband/hw/mana/mana_ib.h | 4 +++ drivers/infiniband/hw/mana/mr.c | 69 ++++++++++++++++++++++++++++++++++++ 3 files changed, 74 insertions(+) (limited to 'drivers') diff --git a/drivers/infiniband/hw/mana/device.c b/drivers/infiniband/hw/mana/device.c index 0c7a9929168f..d1a02c54a236 100644 --- a/drivers/infiniband/hw/mana/device.c +++ b/drivers/infiniband/hw/mana/device.c @@ -48,6 +48,7 @@ static const struct ib_device_ops mana_ib_dev_ops = { .query_pkey = mana_ib_query_pkey, .query_port = mana_ib_query_port, .reg_user_mr = mana_ib_reg_user_mr, + .reg_user_mr_dmabuf = mana_ib_reg_user_mr_dmabuf, .req_notify_cq = mana_ib_arm_cq, INIT_RDMA_OBJ_SIZE(ib_ah, mana_ib_ah, ibah), diff --git a/drivers/infiniband/hw/mana/mana_ib.h b/drivers/infiniband/hw/mana/mana_ib.h index 7e342e070b70..77fc1032eda8 100644 --- a/drivers/infiniband/hw/mana/mana_ib.h +++ b/drivers/infiniband/hw/mana/mana_ib.h @@ -682,4 +682,8 @@ int mana_ib_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr, int mana_ib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc); int mana_ib_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags); + +struct ib_mr *mana_ib_reg_user_mr_dmabuf(struct ib_pd *ibpd, u64 start, u64 length, + u64 iova, int fd, int mr_access_flags, + struct uverbs_attr_bundle *attrs); #endif diff --git a/drivers/infiniband/hw/mana/mr.c b/drivers/infiniband/hw/mana/mr.c index 3a047f8c9fc5..f99557ec7767 100644 --- a/drivers/infiniband/hw/mana/mr.c +++ b/drivers/infiniband/hw/mana/mr.c @@ -173,6 +173,75 @@ err_free: return ERR_PTR(err); } +struct ib_mr *mana_ib_reg_user_mr_dmabuf(struct ib_pd *ibpd, u64 start, u64 length, + u64 iova, int fd, int access_flags, + struct uverbs_attr_bundle *attrs) +{ + struct mana_ib_pd *pd = container_of(ibpd, struct mana_ib_pd, ibpd); + struct gdma_create_mr_params mr_params = {}; + struct ib_device *ibdev = ibpd->device; + struct ib_umem_dmabuf *umem_dmabuf; + struct mana_ib_dev *dev; + struct mana_ib_mr *mr; + u64 dma_region_handle; + int err; + + dev = container_of(ibdev, struct mana_ib_dev, ib_dev); + + access_flags &= ~IB_ACCESS_OPTIONAL; + if (access_flags & ~VALID_MR_FLAGS) + return ERR_PTR(-EOPNOTSUPP); + + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) + return ERR_PTR(-ENOMEM); + + umem_dmabuf = ib_umem_dmabuf_get_pinned(ibdev, start, length, fd, access_flags); + if (IS_ERR(umem_dmabuf)) { + err = PTR_ERR(umem_dmabuf); + ibdev_dbg(ibdev, "Failed to get dmabuf umem, %d\n", err); + goto err_free; + } + + mr->umem = &umem_dmabuf->umem; + + err = mana_ib_create_dma_region(dev, mr->umem, &dma_region_handle, iova); + if (err) { + ibdev_dbg(ibdev, "Failed create dma region for user-mr, %d\n", + err); + goto err_umem; + } + + mr_params.pd_handle = pd->pd_handle; + mr_params.mr_type = GDMA_MR_TYPE_GVA; + mr_params.gva.dma_region_handle = dma_region_handle; + mr_params.gva.virtual_address = iova; + mr_params.gva.access_flags = + mana_ib_verbs_to_gdma_access_flags(access_flags); + + err = mana_ib_gd_create_mr(dev, mr, &mr_params); + if (err) + goto err_dma_region; + + /* + * There is no need to keep track of dma_region_handle after MR is + * successfully created. The dma_region_handle is tracked in the PF + * as part of the lifecycle of this MR. 
+ */ + + return &mr->ibmr; + +err_dma_region: + mana_gd_destroy_dma_region(mdev_to_gc(dev), dma_region_handle); + +err_umem: + ib_umem_release(mr->umem); + +err_free: + kfree(mr); + return ERR_PTR(err); +} + struct ib_mr *mana_ib_get_dma_mr(struct ib_pd *ibpd, int access_flags) { struct mana_ib_pd *pd = container_of(ibpd, struct mana_ib_pd, ibpd); -- cgit v1.2.3-59-g8ed1b From 0172be244ce367dd51d77b777244ea9c8de34a3a Mon Sep 17 00:00:00 2001 From: Imanol Date: Mon, 17 Feb 2025 18:30:54 +0000 Subject: IB/iser: fix typos in iscsi_iser.c comments Fixes multiple occurrences of the misspelled word "occured" in the comments of `iscsi_iser.c`, replacing them with the correct spelling "occurred". This improves readability without affecting functionality. Signed-off-by: Imanol Link: https://patch.msgid.link/20250217183048.9394-1-imvalient@protonmail.com Acked-by: Max Gurtovoy Signed-off-by: Leon Romanovsky --- drivers/infiniband/ulp/iser/iscsi_iser.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'drivers') diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.c b/drivers/infiniband/ulp/iser/iscsi_iser.c index bb9aaff92ca3..a5be6f1ba12b 100644 --- a/drivers/infiniband/ulp/iser/iscsi_iser.c +++ b/drivers/infiniband/ulp/iser/iscsi_iser.c @@ -393,10 +393,10 @@ static void iscsi_iser_cleanup_task(struct iscsi_task *task) * @task: iscsi task * @sector: error sector if exsists (output) * - * Return: zero if no data-integrity errors have occured - * 0x1: data-integrity error occured in the guard-block - * 0x2: data-integrity error occured in the reference tag - * 0x3: data-integrity error occured in the application tag + * Return: zero if no data-integrity errors have occurred + * 0x1: data-integrity error occurred in the guard-block + * 0x2: data-integrity error occurred in the reference tag + * 0x3: data-integrity error occurred in the application tag * * In addition the error sector is marked. */ -- cgit v1.2.3-59-g8ed1b From 486055f5e09df959ad4e3aa4ee75b5c91ddeec2e Mon Sep 17 00:00:00 2001 From: Michael Margolin Date: Mon, 17 Feb 2025 14:16:23 +0000 Subject: RDMA/core: Fix best page size finding when it can cross SG entries A single scatter-gather entry is limited by a 32-bit "length" field that is practically 4GB - PAGE_SIZE. This means that even when the memory is physically contiguous, we might need more than one entry to represent it. Additionally, when using dmabuf, the sg_table might originate outside the subsystem and be optimized for other needs. For instance an SGT of 16GB of contiguous GPU memory might look like this: (a real life example) dma_address 34401400000, length fffff000 dma_address 345013ff000, length fffff000 dma_address 346013fe000, length fffff000 dma_address 347013fd000, length fffff000 dma_address 348013fc000, length 4000 Since ib_umem_find_best_pgsz works within SG entries, in the above case we will end up with the worst possible 4KB page size. Fix this by taking into consideration only the alignment of addresses of real discontinuity points rather than treating SG entries as such, and adjust the page iterator to correctly handle cross SG entry pages. There is currently an assumption that drivers do not ask for pages bigger than the maximal DMA size supported by their devices.
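The five entries above are in fact one physically contiguous region: each dma_address equals the previous dma_address plus its length (0x34401400000 + 0xfffff000 = 0x345013ff000, and so on). A minimal standalone sketch of the discontinuity test the fix is built on (illustrative only, not the kernel code):

    #include <stdbool.h>
    #include <stdint.h>

    /* An SG entry should only limit the page size if it starts a new
     * physical chunk, i.e. it does not begin where the previous entry
     * ended. */
    static bool is_discontinuity(uint64_t prev_base, uint64_t prev_len,
                                 uint64_t cur_base)
    {
            return prev_base + prev_len != cur_base;
    }

    /* e.g. is_discontinuity(0x34401400000, 0xfffff000, 0x345013ff000)
     * is false, so the second entry of the 16GB example must not cap
     * the page size. */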
Reviewed-by: Firas Jahjah Reviewed-by: Yonatan Nachum Signed-off-by: Michael Margolin Link: https://patch.msgid.link/20250217141623.12428-1-mrgolin@amazon.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/umem.c | 36 ++++++++++++++++++++++++++---------- drivers/infiniband/core/verbs.c | 11 ++++++----- 2 files changed, 32 insertions(+), 15 deletions(-) (limited to 'drivers') diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c index 07c571c7b699..c5b686394760 100644 --- a/drivers/infiniband/core/umem.c +++ b/drivers/infiniband/core/umem.c @@ -80,9 +80,12 @@ unsigned long ib_umem_find_best_pgsz(struct ib_umem *umem, unsigned long pgsz_bitmap, unsigned long virt) { - struct scatterlist *sg; + unsigned long curr_len = 0; + dma_addr_t curr_base = ~0; unsigned long va, pgoff; + struct scatterlist *sg; dma_addr_t mask; + dma_addr_t end; int i; umem->iova = va = virt; @@ -107,17 +110,30 @@ unsigned long ib_umem_find_best_pgsz(struct ib_umem *umem, pgoff = umem->address & ~PAGE_MASK; for_each_sgtable_dma_sg(&umem->sgt_append.sgt, sg, i) { - /* Walk SGL and reduce max page size if VA/PA bits differ - * for any address. + /* If the current entry is physically contiguous with the previous + * one, no need to take its start addresses into consideration. */ - mask |= (sg_dma_address(sg) + pgoff) ^ va; + if (check_add_overflow(curr_base, curr_len, &end) || + end != sg_dma_address(sg)) { + + curr_base = sg_dma_address(sg); + curr_len = 0; + + /* Reduce max page size if VA/PA bits differ */ + mask |= (curr_base + pgoff) ^ va; + + /* The alignment of any VA matching a discontinuity point + * in the physical memory sets the maximum possible page + * size as this must be a starting point of a new page that + * needs to be aligned. + */ + if (i != 0) + mask |= va; + } + + curr_len += sg_dma_len(sg); va += sg_dma_len(sg) - pgoff; - /* Except for the last entry, the ending iova alignment sets - * the maximum possible page size as the low bits of the iova - * must be zero when starting the next chunk. 
- */ - if (i != (umem->sgt_append.sgt.nents - 1)) - mask |= va; + pgoff = 0; } diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index 473ee0831307..dc40001072a5 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -3109,22 +3109,23 @@ EXPORT_SYMBOL(__rdma_block_iter_start); bool __rdma_block_iter_next(struct ib_block_iter *biter) { unsigned int block_offset; - unsigned int sg_delta; + unsigned int delta; if (!biter->__sg_nents || !biter->__sg) return false; biter->__dma_addr = sg_dma_address(biter->__sg) + biter->__sg_advance; block_offset = biter->__dma_addr & (BIT_ULL(biter->__pg_bit) - 1); - sg_delta = BIT_ULL(biter->__pg_bit) - block_offset; + delta = BIT_ULL(biter->__pg_bit) - block_offset; - if (sg_dma_len(biter->__sg) - biter->__sg_advance > sg_delta) { - biter->__sg_advance += sg_delta; - } else { + while (biter->__sg_nents && biter->__sg && + sg_dma_len(biter->__sg) - biter->__sg_advance <= delta) { + delta -= sg_dma_len(biter->__sg) - biter->__sg_advance; biter->__sg_advance = 0; biter->__sg = sg_next(biter->__sg); biter->__sg_nents--; } + biter->__sg_advance += delta; return true; } -- cgit v1.2.3-59-g8ed1b From 7f880725078d2314f6761f24c2a244e4a68c69c2 Mon Sep 17 00:00:00 2001 From: Daisuke Matsuda Date: Fri, 20 Dec 2024 19:09:32 +0900 Subject: RDMA/rxe: Move some code to rxe_loc.h in preparation for ODP rxe_mr_init() and resp_states are going to be used in rxe_odp.c, which is to be created in the subsequent patch. Link: https://patch.msgid.link/r/20241220100936.2193541-2-matsuda-daisuke@fujitsu.com Signed-off-by: Daisuke Matsuda Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe.h | 37 ----------------------------------- drivers/infiniband/sw/rxe/rxe_loc.h | 1 + drivers/infiniband/sw/rxe/rxe_mr.c | 2 +- drivers/infiniband/sw/rxe/rxe_verbs.h | 37 +++++++++++++++++++++++++++++++++++ 4 files changed, 39 insertions(+), 38 deletions(-) (limited to 'drivers') diff --git a/drivers/infiniband/sw/rxe/rxe.h b/drivers/infiniband/sw/rxe/rxe.h index 8db65731499d..ff8cd53f5f28 100644 --- a/drivers/infiniband/sw/rxe/rxe.h +++ b/drivers/infiniband/sw/rxe/rxe.h @@ -99,43 +99,6 @@ #define rxe_info_mw(mw, fmt, ...) 
ibdev_info_ratelimited((mw)->ibmw.device, \ "mw#%d %s: " fmt, (mw)->elem.index, __func__, ##__VA_ARGS__) -/* responder states */ -enum resp_states { - RESPST_NONE, - RESPST_GET_REQ, - RESPST_CHK_PSN, - RESPST_CHK_OP_SEQ, - RESPST_CHK_OP_VALID, - RESPST_CHK_RESOURCE, - RESPST_CHK_LENGTH, - RESPST_CHK_RKEY, - RESPST_EXECUTE, - RESPST_READ_REPLY, - RESPST_ATOMIC_REPLY, - RESPST_ATOMIC_WRITE_REPLY, - RESPST_PROCESS_FLUSH, - RESPST_COMPLETE, - RESPST_ACKNOWLEDGE, - RESPST_CLEANUP, - RESPST_DUPLICATE_REQUEST, - RESPST_ERR_MALFORMED_WQE, - RESPST_ERR_UNSUPPORTED_OPCODE, - RESPST_ERR_MISALIGNED_ATOMIC, - RESPST_ERR_PSN_OUT_OF_SEQ, - RESPST_ERR_MISSING_OPCODE_FIRST, - RESPST_ERR_MISSING_OPCODE_LAST_C, - RESPST_ERR_MISSING_OPCODE_LAST_D1E, - RESPST_ERR_TOO_MANY_RDMA_ATM_REQ, - RESPST_ERR_RNR, - RESPST_ERR_RKEY_VIOLATION, - RESPST_ERR_INVALIDATE_RKEY, - RESPST_ERR_LENGTH, - RESPST_ERR_CQ_OVERFLOW, - RESPST_ERROR, - RESPST_DONE, - RESPST_EXIT, -}; - void rxe_set_mtu(struct rxe_dev *rxe, unsigned int dev_mtu); int rxe_add(struct rxe_dev *rxe, unsigned int mtu, const char *ibdev_name, diff --git a/drivers/infiniband/sw/rxe/rxe_loc.h b/drivers/infiniband/sw/rxe/rxe_loc.h index c57ab8975c5d..738d3bca8d4f 100644 --- a/drivers/infiniband/sw/rxe/rxe_loc.h +++ b/drivers/infiniband/sw/rxe/rxe_loc.h @@ -58,6 +58,7 @@ int rxe_mmap(struct ib_ucontext *context, struct vm_area_struct *vma); /* rxe_mr.c */ u8 rxe_get_next_key(u32 last_key); +void rxe_mr_init(int access, struct rxe_mr *mr); void rxe_mr_init_dma(int access, struct rxe_mr *mr); int rxe_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length, int access, struct rxe_mr *mr); diff --git a/drivers/infiniband/sw/rxe/rxe_mr.c b/drivers/infiniband/sw/rxe/rxe_mr.c index da3dee520876..12cacc4f60f2 100644 --- a/drivers/infiniband/sw/rxe/rxe_mr.c +++ b/drivers/infiniband/sw/rxe/rxe_mr.c @@ -45,7 +45,7 @@ int mr_check_range(struct rxe_mr *mr, u64 iova, size_t length) } } -static void rxe_mr_init(int access, struct rxe_mr *mr) +void rxe_mr_init(int access, struct rxe_mr *mr) { u32 key = mr->elem.index << 8 | rxe_get_next_key(-1); diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.h b/drivers/infiniband/sw/rxe/rxe_verbs.h index 3d0faab9a8f7..fd48075810dd 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.h +++ b/drivers/infiniband/sw/rxe/rxe_verbs.h @@ -126,6 +126,43 @@ struct rxe_comp_info { u32 rnr_retry; }; +/* responder states */ +enum resp_states { + RESPST_NONE, + RESPST_GET_REQ, + RESPST_CHK_PSN, + RESPST_CHK_OP_SEQ, + RESPST_CHK_OP_VALID, + RESPST_CHK_RESOURCE, + RESPST_CHK_LENGTH, + RESPST_CHK_RKEY, + RESPST_EXECUTE, + RESPST_READ_REPLY, + RESPST_ATOMIC_REPLY, + RESPST_ATOMIC_WRITE_REPLY, + RESPST_PROCESS_FLUSH, + RESPST_COMPLETE, + RESPST_ACKNOWLEDGE, + RESPST_CLEANUP, + RESPST_DUPLICATE_REQUEST, + RESPST_ERR_MALFORMED_WQE, + RESPST_ERR_UNSUPPORTED_OPCODE, + RESPST_ERR_MISALIGNED_ATOMIC, + RESPST_ERR_PSN_OUT_OF_SEQ, + RESPST_ERR_MISSING_OPCODE_FIRST, + RESPST_ERR_MISSING_OPCODE_LAST_C, + RESPST_ERR_MISSING_OPCODE_LAST_D1E, + RESPST_ERR_TOO_MANY_RDMA_ATM_REQ, + RESPST_ERR_RNR, + RESPST_ERR_RKEY_VIOLATION, + RESPST_ERR_INVALIDATE_RKEY, + RESPST_ERR_LENGTH, + RESPST_ERR_CQ_OVERFLOW, + RESPST_ERROR, + RESPST_DONE, + RESPST_EXIT, +}; + enum rdatm_res_state { rdatm_res_state_next, rdatm_res_state_new, -- cgit v1.2.3-59-g8ed1b From b601792392f9fe8d9d1ae9028ca61d0f70ffb986 Mon Sep 17 00:00:00 2001 From: Daisuke Matsuda Date: Fri, 20 Dec 2024 19:09:33 +0900 Subject: RDMA/rxe: Add page invalidation support On page invalidation, an MMU notifier callback is invoked to 
unmap DMA addresses and update the driver page table (umem_odp->dma_list). The callback is registered when an ODP-enabled MR is created. Link: https://patch.msgid.link/r/20241220100936.2193541-3-matsuda-daisuke@fujitsu.com Signed-off-by: Daisuke Matsuda Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/Makefile | 2 ++ drivers/infiniband/sw/rxe/rxe_loc.h | 3 +++ drivers/infiniband/sw/rxe/rxe_odp.c | 38 +++++++++++++++++++++++++++++++++++ 3 files changed, 43 insertions(+) create mode 100644 drivers/infiniband/sw/rxe/rxe_odp.c (limited to 'drivers') diff --git a/drivers/infiniband/sw/rxe/Makefile b/drivers/infiniband/sw/rxe/Makefile index 5395a581f4bb..93134f1d1d0c 100644 --- a/drivers/infiniband/sw/rxe/Makefile +++ b/drivers/infiniband/sw/rxe/Makefile @@ -23,3 +23,5 @@ rdma_rxe-y := \ rxe_task.o \ rxe_net.o \ rxe_hw_counters.o + +rdma_rxe-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += rxe_odp.o diff --git a/drivers/infiniband/sw/rxe/rxe_loc.h b/drivers/infiniband/sw/rxe/rxe_loc.h index 738d3bca8d4f..465ab188c109 100644 --- a/drivers/infiniband/sw/rxe/rxe_loc.h +++ b/drivers/infiniband/sw/rxe/rxe_loc.h @@ -181,4 +181,7 @@ static inline unsigned int wr_opcode_mask(int opcode, struct rxe_qp *qp) return rxe_wr_opcode_info[opcode].mask[qp->ibqp.qp_type]; } +/* rxe_odp.c */ +extern const struct mmu_interval_notifier_ops rxe_mn_ops; + #endif /* RXE_LOC_H */ diff --git a/drivers/infiniband/sw/rxe/rxe_odp.c b/drivers/infiniband/sw/rxe/rxe_odp.c new file mode 100644 index 000000000000..2be8066db012 --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_odp.c @@ -0,0 +1,38 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2022-2023 Fujitsu Ltd. All rights reserved. + */ + +#include + +#include + +#include "rxe.h" + +static bool rxe_ib_invalidate_range(struct mmu_interval_notifier *mni, + const struct mmu_notifier_range *range, + unsigned long cur_seq) +{ + struct ib_umem_odp *umem_odp = + container_of(mni, struct ib_umem_odp, notifier); + unsigned long start, end; + + if (!mmu_notifier_range_blockable(range)) + return false; + + mutex_lock(&umem_odp->umem_mutex); + mmu_interval_set_seq(mni, cur_seq); + + start = max_t(u64, ib_umem_start(umem_odp), range->start); + end = min_t(u64, ib_umem_end(umem_odp), range->end); + + /* update umem_odp->dma_list */ + ib_umem_odp_unmap_dma_pages(umem_odp, start, end); + + mutex_unlock(&umem_odp->umem_mutex); + return true; +} + +const struct mmu_interval_notifier_ops rxe_mn_ops = { + .invalidate = rxe_ib_invalidate_range, +}; -- cgit v1.2.3-59-g8ed1b From d03fb5c6599e31b90c6b5f65d43d6ccc6b49eb91 Mon Sep 17 00:00:00 2001 From: Daisuke Matsuda Date: Fri, 20 Dec 2024 19:09:34 +0900 Subject: RDMA/rxe: Allow registering MRs for On-Demand Paging Allow userspace to register an ODP-enabled MR, in which case the flag IB_ACCESS_ON_DEMAND is passed to rxe_reg_user_mr(). However, no RDMA operations are enabled yet; they will be supported in the two subsequent patches. rxe_odp_do_pagefault() is called to initialize an ODP-enabled MR. It syncs the process address space from the CPU page table to the driver page table (dma_list/pfn_list in umem_odp) when called with the RXE_PAGEFAULT_SNAPSHOT flag. Additionally, it can be used to trigger a page fault when pages being accessed are not present or do not have proper read/write permissions, and possibly to prefetch pages in the future.
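From the application's point of view nothing rxe-specific is required; ODP is requested per-MR. A hedged rdma-core sketch (buf is assumed to be a valid mapping of len bytes):

    #include <infiniband/verbs.h>

    /* Sketch: IBV_ACCESS_ON_DEMAND maps to IB_ACCESS_ON_DEMAND in the
     * kernel, which steers rxe_reg_user_mr() to rxe_odp_mr_init_user(). */
    static struct ibv_mr *reg_odp_mr(struct ibv_pd *pd, void *buf, size_t len)
    {
            return ibv_reg_mr(pd, buf, len,
                              IBV_ACCESS_ON_DEMAND |
                              IBV_ACCESS_LOCAL_WRITE |
                              IBV_ACCESS_REMOTE_READ |
                              IBV_ACCESS_REMOTE_WRITE);
    }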
Link: https://patch.msgid.link/r/20241220100936.2193541-4-matsuda-daisuke@fujitsu.com Signed-off-by: Daisuke Matsuda Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe.c | 7 +++ drivers/infiniband/sw/rxe/rxe_loc.h | 12 +++++ drivers/infiniband/sw/rxe/rxe_mr.c | 9 +++- drivers/infiniband/sw/rxe/rxe_odp.c | 86 +++++++++++++++++++++++++++++++++++ drivers/infiniband/sw/rxe/rxe_resp.c | 15 ++++-- drivers/infiniband/sw/rxe/rxe_verbs.c | 5 +- 6 files changed, 128 insertions(+), 6 deletions(-) (limited to 'drivers') diff --git a/drivers/infiniband/sw/rxe/rxe.c b/drivers/infiniband/sw/rxe/rxe.c index 83a16dba1eff..c08868c7bc55 100644 --- a/drivers/infiniband/sw/rxe/rxe.c +++ b/drivers/infiniband/sw/rxe/rxe.c @@ -92,6 +92,13 @@ static void rxe_init_device_param(struct rxe_dev *rxe) dev_put(ndev); rxe->max_ucontext = RXE_MAX_UCONTEXT; + + if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) { + rxe->attr.kernel_cap_flags |= IBK_ON_DEMAND_PAGING; + + /* IB_ODP_SUPPORT_IMPLICIT is not supported right now. */ + rxe->attr.odp_caps.general_caps |= IB_ODP_SUPPORT; + } } /* initialize port attributes */ diff --git a/drivers/infiniband/sw/rxe/rxe_loc.h b/drivers/infiniband/sw/rxe/rxe_loc.h index 465ab188c109..a63fa7ad583b 100644 --- a/drivers/infiniband/sw/rxe/rxe_loc.h +++ b/drivers/infiniband/sw/rxe/rxe_loc.h @@ -184,4 +184,16 @@ static inline unsigned int wr_opcode_mask(int opcode, struct rxe_qp *qp) /* rxe_odp.c */ extern const struct mmu_interval_notifier_ops rxe_mn_ops; +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING +int rxe_odp_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length, + u64 iova, int access_flags, struct rxe_mr *mr); +#else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */ +static inline int +rxe_odp_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length, u64 iova, + int access_flags, struct rxe_mr *mr) +{ + return -EOPNOTSUPP; +} +#endif /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */ + #endif /* RXE_LOC_H */ diff --git a/drivers/infiniband/sw/rxe/rxe_mr.c b/drivers/infiniband/sw/rxe/rxe_mr.c index 12cacc4f60f2..190225a34139 100644 --- a/drivers/infiniband/sw/rxe/rxe_mr.c +++ b/drivers/infiniband/sw/rxe/rxe_mr.c @@ -323,7 +323,10 @@ int rxe_mr_copy(struct rxe_mr *mr, u64 iova, void *addr, return err; } - return rxe_mr_copy_xarray(mr, iova, addr, length, dir); + if (mr->umem->is_odp) + return -EOPNOTSUPP; + else + return rxe_mr_copy_xarray(mr, iova, addr, length, dir); } /* copy data in or out of a wqe, i.e. sg list @@ -532,6 +535,10 @@ int rxe_mr_do_atomic_write(struct rxe_mr *mr, u64 iova, u64 value) struct page *page; u64 *va; + /* ODP is not supported right now. WIP. 
*/ + if (mr->umem->is_odp) + return RESPST_ERR_UNSUPPORTED_OPCODE; + /* See IBA oA19-28 */ if (unlikely(mr->state != RXE_MR_STATE_VALID)) { rxe_dbg_mr(mr, "mr not in valid state\n"); diff --git a/drivers/infiniband/sw/rxe/rxe_odp.c b/drivers/infiniband/sw/rxe/rxe_odp.c index 2be8066db012..141290078754 100644 --- a/drivers/infiniband/sw/rxe/rxe_odp.c +++ b/drivers/infiniband/sw/rxe/rxe_odp.c @@ -36,3 +36,89 @@ static bool rxe_ib_invalidate_range(struct mmu_interval_notifier *mni, const struct mmu_interval_notifier_ops rxe_mn_ops = { .invalidate = rxe_ib_invalidate_range, }; + +#define RXE_PAGEFAULT_RDONLY BIT(1) +#define RXE_PAGEFAULT_SNAPSHOT BIT(2) +static int rxe_odp_do_pagefault_and_lock(struct rxe_mr *mr, u64 user_va, int bcnt, u32 flags) +{ + struct ib_umem_odp *umem_odp = to_ib_umem_odp(mr->umem); + bool fault = !(flags & RXE_PAGEFAULT_SNAPSHOT); + u64 access_mask; + int np; + + access_mask = ODP_READ_ALLOWED_BIT; + if (umem_odp->umem.writable && !(flags & RXE_PAGEFAULT_RDONLY)) + access_mask |= ODP_WRITE_ALLOWED_BIT; + + /* + * ib_umem_odp_map_dma_and_lock() locks umem_mutex on success. + * Callers must release the lock later to let invalidation handler + * do its work again. + */ + np = ib_umem_odp_map_dma_and_lock(umem_odp, user_va, bcnt, + access_mask, fault); + return np; +} + +static int rxe_odp_init_pages(struct rxe_mr *mr) +{ + struct ib_umem_odp *umem_odp = to_ib_umem_odp(mr->umem); + int ret; + + ret = rxe_odp_do_pagefault_and_lock(mr, mr->umem->address, + mr->umem->length, + RXE_PAGEFAULT_SNAPSHOT); + + if (ret >= 0) + mutex_unlock(&umem_odp->umem_mutex); + + return ret >= 0 ? 0 : ret; +} + +int rxe_odp_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length, + u64 iova, int access_flags, struct rxe_mr *mr) +{ + struct ib_umem_odp *umem_odp; + int err; + + if (!IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) + return -EOPNOTSUPP; + + rxe_mr_init(access_flags, mr); + + if (!start && length == U64_MAX) { + if (iova != 0) + return -EINVAL; + if (!(rxe->attr.odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT)) + return -EINVAL; + + /* Never reach here, for implicit ODP is not implemented. */ + } + + umem_odp = ib_umem_odp_get(&rxe->ib_dev, start, length, access_flags, + &rxe_mn_ops); + if (IS_ERR(umem_odp)) { + rxe_dbg_mr(mr, "Unable to create umem_odp err = %d\n", + (int)PTR_ERR(umem_odp)); + return PTR_ERR(umem_odp); + } + + umem_odp->private = mr; + + mr->umem = &umem_odp->umem; + mr->access = access_flags; + mr->ibmr.length = length; + mr->ibmr.iova = iova; + mr->page_offset = ib_umem_offset(&umem_odp->umem); + + err = rxe_odp_init_pages(mr); + if (err) { + ib_umem_odp_release(umem_odp); + return err; + } + + mr->state = RXE_MR_STATE_VALID; + mr->ibmr.type = IB_MR_TYPE_USER; + + return err; +} diff --git a/drivers/infiniband/sw/rxe/rxe_resp.c b/drivers/infiniband/sw/rxe/rxe_resp.c index c11ab280551a..e703a3ab82d4 100644 --- a/drivers/infiniband/sw/rxe/rxe_resp.c +++ b/drivers/infiniband/sw/rxe/rxe_resp.c @@ -649,6 +649,10 @@ static enum resp_states process_flush(struct rxe_qp *qp, struct rxe_mr *mr = qp->resp.mr; struct resp_res *res = qp->resp.res; + /* ODP is not supported right now. WIP. 
*/ + if (mr->umem->is_odp) + return RESPST_ERR_UNSUPPORTED_OPCODE; + /* oA19-14, oA19-15 */ if (res && res->replay) return RESPST_ACKNOWLEDGE; @@ -702,10 +706,13 @@ static enum resp_states atomic_reply(struct rxe_qp *qp, if (!res->replay) { u64 iova = qp->resp.va + qp->resp.offset; - err = rxe_mr_do_atomic_op(mr, iova, pkt->opcode, - atmeth_comp(pkt), - atmeth_swap_add(pkt), - &res->atomic.orig_val); + if (mr->umem->is_odp) + err = RESPST_ERR_UNSUPPORTED_OPCODE; + else + err = rxe_mr_do_atomic_op(mr, iova, pkt->opcode, + atmeth_comp(pkt), + atmeth_swap_add(pkt), + &res->atomic.orig_val); if (err) return err; diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c index a9494c6030af..2331e698a65b 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.c +++ b/drivers/infiniband/sw/rxe/rxe_verbs.c @@ -1298,7 +1298,10 @@ static struct ib_mr *rxe_reg_user_mr(struct ib_pd *ibpd, u64 start, mr->ibmr.pd = ibpd; mr->ibmr.device = ibpd->device; - err = rxe_mr_init_user(rxe, start, length, access, mr); + if (access & IB_ACCESS_ON_DEMAND) + err = rxe_odp_mr_init_user(rxe, start, length, iova, access, mr); + else + err = rxe_mr_init_user(rxe, start, length, access, mr); if (err) { rxe_dbg_mr(mr, "reg_user_mr failed, err = %d\n", err); goto err_cleanup; -- cgit v1.2.3-59-g8ed1b From 2fae67ab63db7e8b35520f897935d694ad92f2fe Mon Sep 17 00:00:00 2001 From: Daisuke Matsuda Date: Fri, 20 Dec 2024 19:09:35 +0900 Subject: RDMA/rxe: Add support for Send/Recv/Write/Read with ODP rxe_mr_copy() is used widely to copy data to/from a user MR. A requester uses it to load payloads of requesting packets; a responder uses it to process Send, Write, and Read operations; a completer uses it to copy data from response packets of Read and Atomic operations to a user MR. Allow these operations to be used with ODP by adding a subordinate function rxe_odp_mr_copy(). It comprises the following steps: 1. Check the driver page table (umem_odp->dma_list) to see if pages being accessed are present with the appropriate permissions. 2. If necessary, trigger a page fault to map the pages. 3. Convert their user space addresses to kernel logical addresses using PFNs in the driver page table (umem_odp->pfn_list). 4. Execute data copy to/from the pages. Link: https://patch.msgid.link/r/20241220100936.2193541-5-matsuda-daisuke@fujitsu.com Signed-off-by: Daisuke Matsuda Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe.c | 10 +++ drivers/infiniband/sw/rxe/rxe_loc.h | 7 ++ drivers/infiniband/sw/rxe/rxe_mr.c | 2 +- drivers/infiniband/sw/rxe/rxe_odp.c | 131 ++++++++++++++++++++++++++++++++++++ 4 files changed, 149 insertions(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/infiniband/sw/rxe/rxe.c b/drivers/infiniband/sw/rxe/rxe.c index c08868c7bc55..d7daaafa8b32 100644 --- a/drivers/infiniband/sw/rxe/rxe.c +++ b/drivers/infiniband/sw/rxe/rxe.c @@ -98,6 +98,16 @@ static void rxe_init_device_param(struct rxe_dev *rxe) /* IB_ODP_SUPPORT_IMPLICIT is not supported right now.
*/ rxe->attr.odp_caps.general_caps |= IB_ODP_SUPPORT; + + rxe->attr.odp_caps.per_transport_caps.ud_odp_caps |= IB_ODP_SUPPORT_SEND; + rxe->attr.odp_caps.per_transport_caps.ud_odp_caps |= IB_ODP_SUPPORT_RECV; + rxe->attr.odp_caps.per_transport_caps.ud_odp_caps |= IB_ODP_SUPPORT_SRQ_RECV; + + rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_SEND; + rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_RECV; + rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_WRITE; + rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_READ; + rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_SRQ_RECV; } } diff --git a/drivers/infiniband/sw/rxe/rxe_loc.h b/drivers/infiniband/sw/rxe/rxe_loc.h index a63fa7ad583b..ec53ba2bd7d8 100644 --- a/drivers/infiniband/sw/rxe/rxe_loc.h +++ b/drivers/infiniband/sw/rxe/rxe_loc.h @@ -187,6 +187,8 @@ extern const struct mmu_interval_notifier_ops rxe_mn_ops; #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING int rxe_odp_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length, u64 iova, int access_flags, struct rxe_mr *mr); +int rxe_odp_mr_copy(struct rxe_mr *mr, u64 iova, void *addr, int length, + enum rxe_mr_copy_dir dir); #else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */ static inline int rxe_odp_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length, u64 iova, @@ -194,6 +196,11 @@ rxe_odp_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length, u64 iova, { return -EOPNOTSUPP; } +static inline int rxe_odp_mr_copy(struct rxe_mr *mr, u64 iova, void *addr, + int length, enum rxe_mr_copy_dir dir) +{ + return -EOPNOTSUPP; +} #endif /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */ #endif /* RXE_LOC_H */ diff --git a/drivers/infiniband/sw/rxe/rxe_mr.c b/drivers/infiniband/sw/rxe/rxe_mr.c index 190225a34139..6cd668a8dfb2 100644 --- a/drivers/infiniband/sw/rxe/rxe_mr.c +++ b/drivers/infiniband/sw/rxe/rxe_mr.c @@ -324,7 +324,7 @@ int rxe_mr_copy(struct rxe_mr *mr, u64 iova, void *addr, } if (mr->umem->is_odp) - return -EOPNOTSUPP; + return rxe_odp_mr_copy(mr, iova, addr, length, dir); else return rxe_mr_copy_xarray(mr, iova, addr, length, dir); } diff --git a/drivers/infiniband/sw/rxe/rxe_odp.c b/drivers/infiniband/sw/rxe/rxe_odp.c index 141290078754..d3c67d18c173 100644 --- a/drivers/infiniband/sw/rxe/rxe_odp.c +++ b/drivers/infiniband/sw/rxe/rxe_odp.c @@ -122,3 +122,134 @@ int rxe_odp_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length, return err; } + +static inline bool rxe_check_pagefault(struct ib_umem_odp *umem_odp, + u64 iova, int length, u32 perm) +{ + bool need_fault = false; + u64 addr; + int idx; + + addr = iova & (~(BIT(umem_odp->page_shift) - 1)); + + /* Skim through all pages that are to be accessed. */ + while (addr < iova + length) { + idx = (addr - ib_umem_start(umem_odp)) >> umem_odp->page_shift; + + if (!(umem_odp->dma_list[idx] & perm)) { + need_fault = true; + break; + } + + addr += BIT(umem_odp->page_shift); + } + return need_fault; +} + +static int rxe_odp_map_range_and_lock(struct rxe_mr *mr, u64 iova, int length, u32 flags) +{ + struct ib_umem_odp *umem_odp = to_ib_umem_odp(mr->umem); + bool need_fault; + u64 perm; + int err; + + if (unlikely(length < 1)) + return -EINVAL; + + perm = ODP_READ_ALLOWED_BIT; + if (!(flags & RXE_PAGEFAULT_RDONLY)) + perm |= ODP_WRITE_ALLOWED_BIT; + + mutex_lock(&umem_odp->umem_mutex); + + need_fault = rxe_check_pagefault(umem_odp, iova, length, perm); + if (need_fault) { + mutex_unlock(&umem_odp->umem_mutex); + + /* umem_mutex is locked on success. 
*/ + err = rxe_odp_do_pagefault_and_lock(mr, iova, length, + flags); + if (err < 0) + return err; + + need_fault = rxe_check_pagefault(umem_odp, iova, length, perm); + if (need_fault) + return -EFAULT; + } + + return 0; +} + +static int __rxe_odp_mr_copy(struct rxe_mr *mr, u64 iova, void *addr, + int length, enum rxe_mr_copy_dir dir) +{ + struct ib_umem_odp *umem_odp = to_ib_umem_odp(mr->umem); + struct page *page; + int idx, bytes; + size_t offset; + u8 *user_va; + + idx = (iova - ib_umem_start(umem_odp)) >> umem_odp->page_shift; + offset = iova & (BIT(umem_odp->page_shift) - 1); + + while (length > 0) { + u8 *src, *dest; + + page = hmm_pfn_to_page(umem_odp->pfn_list[idx]); + user_va = kmap_local_page(page); + if (!user_va) + return -EFAULT; + + src = (dir == RXE_TO_MR_OBJ) ? addr : user_va; + dest = (dir == RXE_TO_MR_OBJ) ? user_va : addr; + + bytes = BIT(umem_odp->page_shift) - offset; + if (bytes > length) + bytes = length; + + memcpy(dest, src, bytes); + kunmap_local(user_va); + + length -= bytes; + idx++; + offset = 0; + } + + return 0; +} + +int rxe_odp_mr_copy(struct rxe_mr *mr, u64 iova, void *addr, int length, + enum rxe_mr_copy_dir dir) +{ + struct ib_umem_odp *umem_odp = to_ib_umem_odp(mr->umem); + u32 flags = 0; + int err; + + if (length == 0) + return 0; + + if (unlikely(!mr->umem->is_odp)) + return -EOPNOTSUPP; + + switch (dir) { + case RXE_TO_MR_OBJ: + break; + + case RXE_FROM_MR_OBJ: + flags = RXE_PAGEFAULT_RDONLY; + break; + + default: + return -EINVAL; + } + + err = rxe_odp_map_range_and_lock(mr, iova, length, flags); + if (err) + return err; + + err = __rxe_odp_mr_copy(mr, iova, addr, length, dir); + + mutex_unlock(&umem_odp->umem_mutex); + + return err; +} -- cgit v1.2.3-59-g8ed1b From b55e9d29ec6a268c7d077d6796fd52f98e150a33 Mon Sep 17 00:00:00 2001 From: Daisuke Matsuda Date: Fri, 20 Dec 2024 19:09:36 +0900 Subject: RDMA/rxe: Add support for the traditional Atomic operations with ODP Enable 'fetch and add' and 'compare and swap' operations to be used with ODP. This comprises the following steps: 1. Check the driver page table (umem_odp->dma_list) to see if the target page is both readable and writable. 2. If not, then trigger a page fault to map the page. 3. Convert its user space address to a kernel logical address using PFNs in the driver page table (umem_odp->pfn_list). 4. Execute the operation (see the sketch below).
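Step 4 has the usual IBA semantics; as a standalone model (an editorial sketch, not the driver code, which additionally performs the lookup of steps 1-3 and holds atomic_ops_lock while touching the kmapped ODP page):

    #include <stdbool.h>
    #include <stdint.h>

    /* Both atomics return the original 8-byte value; compare-and-swap
     * stores swap_add only when the compare value matches, while
     * fetch-and-add stores the sum unconditionally. */
    static uint64_t atomic_op_model(uint64_t *va, bool is_cmp_swap,
                                    uint64_t compare, uint64_t swap_add)
    {
            uint64_t orig = *va;

            if (is_cmp_swap) {
                    if (orig == compare)
                            *va = swap_add;
            } else {
                    *va = orig + swap_add;
            }
            return orig;
    }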
Link: https://patch.msgid.link/r/20241220100936.2193541-6-matsuda-daisuke@fujitsu.com Signed-off-by: Daisuke Matsuda Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe.c | 1 + drivers/infiniband/sw/rxe/rxe_loc.h | 11 ++++++ drivers/infiniband/sw/rxe/rxe_mr.c | 2 +- drivers/infiniband/sw/rxe/rxe_odp.c | 69 ++++++++++++++++++++++++++++++++++++ drivers/infiniband/sw/rxe/rxe_resp.c | 5 ++- 5 files changed, 86 insertions(+), 2 deletions(-) (limited to 'drivers') diff --git a/drivers/infiniband/sw/rxe/rxe.c b/drivers/infiniband/sw/rxe/rxe.c index d7daaafa8b32..4e56a371deb5 100644 --- a/drivers/infiniband/sw/rxe/rxe.c +++ b/drivers/infiniband/sw/rxe/rxe.c @@ -107,6 +107,7 @@ static void rxe_init_device_param(struct rxe_dev *rxe) rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_RECV; rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_WRITE; rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_READ; + rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_ATOMIC; rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_SRQ_RECV; } } diff --git a/drivers/infiniband/sw/rxe/rxe_loc.h b/drivers/infiniband/sw/rxe/rxe_loc.h index ec53ba2bd7d8..feb386d98d1d 100644 --- a/drivers/infiniband/sw/rxe/rxe_loc.h +++ b/drivers/infiniband/sw/rxe/rxe_loc.h @@ -81,6 +81,9 @@ int rxe_invalidate_mr(struct rxe_qp *qp, u32 key); int rxe_reg_fast_mr(struct rxe_qp *qp, struct rxe_send_wqe *wqe); void rxe_mr_cleanup(struct rxe_pool_elem *elem); +/* defined in rxe_mr.c; used in rxe_mr.c and rxe_odp.c */ +extern spinlock_t atomic_ops_lock; + /* rxe_mw.c */ int rxe_alloc_mw(struct ib_mw *ibmw, struct ib_udata *udata); int rxe_dealloc_mw(struct ib_mw *ibmw); @@ -189,6 +192,8 @@ int rxe_odp_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length, u64 iova, int access_flags, struct rxe_mr *mr); int rxe_odp_mr_copy(struct rxe_mr *mr, u64 iova, void *addr, int length, enum rxe_mr_copy_dir dir); +int rxe_odp_atomic_op(struct rxe_mr *mr, u64 iova, int opcode, + u64 compare, u64 swap_add, u64 *orig_val); #else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */ static inline int rxe_odp_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length, u64 iova, @@ -201,6 +206,12 @@ static inline int rxe_odp_mr_copy(struct rxe_mr *mr, u64 iova, void *addr, { return -EOPNOTSUPP; } +static inline int +rxe_odp_atomic_op(struct rxe_mr *mr, u64 iova, int opcode, + u64 compare, u64 swap_add, u64 *orig_val) +{ + return RESPST_ERR_UNSUPPORTED_OPCODE; +} #endif /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */ #endif /* RXE_LOC_H */ diff --git a/drivers/infiniband/sw/rxe/rxe_mr.c b/drivers/infiniband/sw/rxe/rxe_mr.c index 6cd668a8dfb2..868d2f0b74e9 100644 --- a/drivers/infiniband/sw/rxe/rxe_mr.c +++ b/drivers/infiniband/sw/rxe/rxe_mr.c @@ -469,7 +469,7 @@ int rxe_flush_pmem_iova(struct rxe_mr *mr, u64 iova, unsigned int length) } /* Guarantee atomicity of atomic operations at the machine level. 
*/ -static DEFINE_SPINLOCK(atomic_ops_lock); +DEFINE_SPINLOCK(atomic_ops_lock); int rxe_mr_do_atomic_op(struct rxe_mr *mr, u64 iova, int opcode, u64 compare, u64 swap_add, u64 *orig_val) diff --git a/drivers/infiniband/sw/rxe/rxe_odp.c b/drivers/infiniband/sw/rxe/rxe_odp.c index d3c67d18c173..a82e5011360c 100644 --- a/drivers/infiniband/sw/rxe/rxe_odp.c +++ b/drivers/infiniband/sw/rxe/rxe_odp.c @@ -253,3 +253,72 @@ int rxe_odp_mr_copy(struct rxe_mr *mr, u64 iova, void *addr, int length, return err; } + +static int rxe_odp_do_atomic_op(struct rxe_mr *mr, u64 iova, int opcode, + u64 compare, u64 swap_add, u64 *orig_val) +{ + struct ib_umem_odp *umem_odp = to_ib_umem_odp(mr->umem); + unsigned int page_offset; + struct page *page; + unsigned int idx; + u64 value; + u64 *va; + int err; + + if (unlikely(mr->state != RXE_MR_STATE_VALID)) { + rxe_dbg_mr(mr, "mr not in valid state\n"); + return RESPST_ERR_RKEY_VIOLATION; + } + + err = mr_check_range(mr, iova, sizeof(value)); + if (err) { + rxe_dbg_mr(mr, "iova out of range\n"); + return RESPST_ERR_RKEY_VIOLATION; + } + + idx = (iova - ib_umem_start(umem_odp)) >> umem_odp->page_shift; + page_offset = iova & (BIT(umem_odp->page_shift) - 1); + page = hmm_pfn_to_page(umem_odp->pfn_list[idx]); + if (!page) + return RESPST_ERR_RKEY_VIOLATION; + + if (unlikely(page_offset & 0x7)) { + rxe_dbg_mr(mr, "iova not aligned\n"); + return RESPST_ERR_MISALIGNED_ATOMIC; + } + + va = kmap_local_page(page); + + spin_lock_bh(&atomic_ops_lock); + value = *orig_val = va[page_offset >> 3]; + + if (opcode == IB_OPCODE_RC_COMPARE_SWAP) { + if (value == compare) + va[page_offset >> 3] = swap_add; + } else { + value += swap_add; + va[page_offset >> 3] = value; + } + spin_unlock_bh(&atomic_ops_lock); + + kunmap_local(va); + + return 0; +} + +int rxe_odp_atomic_op(struct rxe_mr *mr, u64 iova, int opcode, + u64 compare, u64 swap_add, u64 *orig_val) +{ + struct ib_umem_odp *umem_odp = to_ib_umem_odp(mr->umem); + int err; + + err = rxe_odp_map_range_and_lock(mr, iova, sizeof(char), 0); + if (err < 0) + return err; + + err = rxe_odp_do_atomic_op(mr, iova, opcode, compare, swap_add, + orig_val); + mutex_unlock(&umem_odp->umem_mutex); + + return err; +} diff --git a/drivers/infiniband/sw/rxe/rxe_resp.c b/drivers/infiniband/sw/rxe/rxe_resp.c index e703a3ab82d4..54ba9ee1acc5 100644 --- a/drivers/infiniband/sw/rxe/rxe_resp.c +++ b/drivers/infiniband/sw/rxe/rxe_resp.c @@ -707,7 +707,10 @@ static enum resp_states atomic_reply(struct rxe_qp *qp, u64 iova = qp->resp.va + qp->resp.offset; if (mr->umem->is_odp) - err = RESPST_ERR_UNSUPPORTED_OPCODE; + err = rxe_odp_atomic_op(mr, iova, pkt->opcode, + atmeth_comp(pkt), + atmeth_swap_add(pkt), + &res->atomic.orig_val); else err = rxe_mr_do_atomic_op(mr, iova, pkt->opcode, atmeth_comp(pkt), atmeth_swap_add(pkt), -- cgit v1.2.3-59-g8ed1b From be35a3127d60964b338da95c7bfaaf4a01b330d4 Mon Sep 17 00:00:00 2001 From: Kees Bakker Date: Fri, 21 Feb 2025 20:39:03 +0100 Subject: RDMA/mana_ib: Ensure variable err is initialized In the function mana_ib_gd_create_dma_region(), if there are no DMA blocks to process, the variable `err` remains uninitialized.
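The bug class is the classic zero-iteration loop; a minimal standalone illustration (hypothetical names, not the driver code):

    /* If num_blocks is 0 the loop body never runs, and without the
     * initializer the function would return an indeterminate value. */
    extern int process_block(int i);        /* hypothetical helper */

    static int create_region(int num_blocks)
    {
            int err = 0;    /* was effectively "int err;" before the fix */
            int i;

            for (i = 0; i < num_blocks; i++) {
                    err = process_block(i);
                    if (err)
                            break;
            }
            return err;
    }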
Fixes: 0266a177631d ("RDMA/mana_ib: Add a driver for Microsoft Azure Network Adapter") Signed-off-by: Kees Bakker Link: https://patch.msgid.link/20250221195833.7516C16290A@bout3.ijzerbout.nl Reviewed-by: Long Li Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mana/main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/infiniband/hw/mana/main.c b/drivers/infiniband/hw/mana/main.c index e3230fe31fbc..730f958aa2a8 100644 --- a/drivers/infiniband/hw/mana/main.c +++ b/drivers/infiniband/hw/mana/main.c @@ -384,7 +384,7 @@ static int mana_ib_gd_create_dma_region(struct mana_ib_dev *dev, struct ib_umem unsigned int tail = 0; u64 *page_addr_list; void *request_buf; - int err; + int err = 0; gc = mdev_to_gc(dev); hwc = gc->hwc.driver_data; -- cgit v1.2.3-59-g8ed1b From ba7fbaa6a83e5c68124cc943ba96a95b528bc253 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Sun, 23 Feb 2025 21:55:43 +0000 Subject: RDMA/hfi1: Remove unused one_qsfp_write The last use of one_qsfp_write() was removed in 2016's commit 145dd2b39958 ("IB/hfi1: Always turn on CDRs for low power QSFP modules") Remove it. Signed-off-by: Dr. David Alan Gilbert Link: https://patch.msgid.link/20250223215543.153312-1-linux@treblig.org Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/hfi1/qsfp.c | 20 -------------------- drivers/infiniband/hw/hfi1/qsfp.h | 2 -- 2 files changed, 22 deletions(-) (limited to 'drivers') diff --git a/drivers/infiniband/hw/hfi1/qsfp.c b/drivers/infiniband/hw/hfi1/qsfp.c index 52cce1c8b76a..3b7842a7f634 100644 --- a/drivers/infiniband/hw/hfi1/qsfp.c +++ b/drivers/infiniband/hw/hfi1/qsfp.c @@ -404,26 +404,6 @@ int qsfp_write(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp, return count; } -/* - * Perform a stand-alone single QSFP write. Acquire the resource, do the - * write, then release the resource. - */ -int one_qsfp_write(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp, - int len) -{ - struct hfi1_devdata *dd = ppd->dd; - u32 resource = qsfp_resource(dd); - int ret; - - ret = acquire_chip_resource(dd, resource, QSFP_WAIT); - if (ret) - return ret; - ret = qsfp_write(ppd, target, addr, bp, len); - release_chip_resource(dd, resource); - - return ret; -} - /* * Access page n, offset m of QSFP memory as defined by SFF 8636 * by reading @addr = ((256 * n) + m) diff --git a/drivers/infiniband/hw/hfi1/qsfp.h b/drivers/infiniband/hw/hfi1/qsfp.h index df1389bad86b..5c59d53fcb63 100644 --- a/drivers/infiniband/hw/hfi1/qsfp.h +++ b/drivers/infiniband/hw/hfi1/qsfp.h @@ -195,8 +195,6 @@ int qsfp_write(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp, int len); int qsfp_read(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp, int len); -int one_qsfp_write(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp, - int len); int one_qsfp_read(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp, int len); struct hfi1_asic_data; -- cgit v1.2.3-59-g8ed1b From 426370c860e8a276000144dd6470de4f1a96f8ee Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Wed, 26 Feb 2025 21:12:07 -0800 Subject: RDMA/siw: Switch to using the crc32c library Now that the crc32c() library function directly takes advantage of architecture-specific optimizations, it is unnecessary to go through the crypto API. Just use crc32c(). This is much simpler, and it improves performance due to eliminating the crypto API overhead. 
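The old-to-new mapping is mechanical; a sketch of the digest equivalence (mirroring the siw_crc_* helpers added below; the include paths are assumptions and may differ by kernel version):

    #include <linux/crc32.h>
    #include <linux/types.h>
    #include <linux/unaligned.h>    /* put_unaligned_le32(); path is an assumption */

    /* crc32c() has no pre/post-inversion and no endianness conversion
     * built in, so the crypto_shash_digest("crc32c") equivalent is: */
    static inline void crc32c_digest(const void *data, size_t len, u8 out[4])
    {
            put_unaligned_le32(~crc32c(~0, data, len), out);
    }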
Signed-off-by: Eric Biggers Link: https://patch.msgid.link/20250227051207.19470-1-ebiggers@kernel.org Acked-by: Bernard Metzler Signed-off-by: Leon Romanovsky --- drivers/infiniband/sw/siw/Kconfig | 4 +-- drivers/infiniband/sw/siw/siw.h | 37 ++++++++++++++++++++---- drivers/infiniband/sw/siw/siw_main.c | 22 +------------- drivers/infiniband/sw/siw/siw_qp.c | 54 +++++++---------------------------- drivers/infiniband/sw/siw/siw_qp_rx.c | 23 +++++++-------- drivers/infiniband/sw/siw/siw_qp_tx.c | 44 ++++++++++++---------------- drivers/infiniband/sw/siw/siw_verbs.c | 3 -- 7 files changed, 74 insertions(+), 113 deletions(-) (limited to 'drivers') diff --git a/drivers/infiniband/sw/siw/Kconfig b/drivers/infiniband/sw/siw/Kconfig index 81b70a3eeb87..ae4a953e2a03 100644 --- a/drivers/infiniband/sw/siw/Kconfig +++ b/drivers/infiniband/sw/siw/Kconfig @@ -2,9 +2,7 @@ config RDMA_SIW tristate "Software RDMA over TCP/IP (iWARP) driver" depends on INET && INFINIBAND depends on INFINIBAND_VIRT_DMA - select LIBCRC32C - select CRYPTO - select CRYPTO_CRC32C + select CRC32 help This driver implements the iWARP RDMA transport over the Linux TCP/IP network stack. It enables a system with a diff --git a/drivers/infiniband/sw/siw/siw.h b/drivers/infiniband/sw/siw/siw.h index ea5eee50dc39..21238746716a 100644 --- a/drivers/infiniband/sw/siw/siw.h +++ b/drivers/infiniband/sw/siw/siw.h @@ -10,9 +10,9 @@ #include #include #include -#include #include #include +#include #include #include "iwarp.h" @@ -289,7 +289,8 @@ struct siw_rx_stream { union iwarp_hdr hdr; struct mpa_trailer trailer; - struct shash_desc *mpa_crc_hd; + u32 mpa_crc; + bool mpa_crc_enabled; /* * For each FPDU, main RX loop runs through 3 stages: @@ -390,7 +391,8 @@ struct siw_iwarp_tx { int burst; int bytes_unsent; /* ddp payload bytes */ - struct shash_desc *mpa_crc_hd; + u32 mpa_crc; + bool mpa_crc_enabled; u8 do_crc : 1; /* do crc for segment */ u8 use_sendpage : 1; /* send w/o copy */ @@ -496,7 +498,6 @@ extern u_char mpa_version; extern const bool peer_to_peer; extern struct task_struct *siw_tx_thread[]; -extern struct crypto_shash *siw_crypto_shash; extern struct iwarp_msg_info iwarp_pktinfo[RDMAP_TERMINATE + 1]; /* QP general functions */ @@ -668,6 +669,30 @@ static inline struct siw_sqe *irq_alloc_free(struct siw_qp *qp) return NULL; } +static inline void siw_crc_init(u32 *crc) +{ + *crc = ~0; +} + +static inline void siw_crc_update(u32 *crc, const void *data, size_t len) +{ + *crc = crc32c(*crc, data, len); +} + +static inline void siw_crc_final(u32 *crc, u8 out[4]) +{ + put_unaligned_le32(~*crc, out); +} + +static inline void siw_crc_oneshot(const void *data, size_t len, u8 out[4]) +{ + u32 crc; + + siw_crc_init(&crc); + siw_crc_update(&crc, data, len); + return siw_crc_final(&crc, out); +} + static inline __wsum siw_csum_update(const void *buff, int len, __wsum sum) { return (__force __wsum)crc32c((__force __u32)sum, buff, len); @@ -686,11 +711,11 @@ static inline void siw_crc_skb(struct siw_rx_stream *srx, unsigned int len) .update = siw_csum_update, .combine = siw_csum_combine, }; - __wsum crc = *(u32 *)shash_desc_ctx(srx->mpa_crc_hd); + __wsum crc = (__force __wsum)srx->mpa_crc; crc = __skb_checksum(srx->skb, srx->skb_offset, len, crc, &siw_cs_ops); - *(u32 *)shash_desc_ctx(srx->mpa_crc_hd) = crc; + srx->mpa_crc = (__force u32)crc; } #define siw_dbg(ibdev, fmt, ...) 
\ diff --git a/drivers/infiniband/sw/siw/siw_main.c b/drivers/infiniband/sw/siw/siw_main.c index b17752bd1ecc..5168307229a9 100644 --- a/drivers/infiniband/sw/siw/siw_main.c +++ b/drivers/infiniband/sw/siw/siw_main.c @@ -59,7 +59,6 @@ u_char mpa_version = MPA_REVISION_2; const bool peer_to_peer; struct task_struct *siw_tx_thread[NR_CPUS]; -struct crypto_shash *siw_crypto_shash; static int siw_device_register(struct siw_device *sdev, const char *name) { @@ -467,20 +466,7 @@ static __init int siw_init_module(void) rv = -ENOMEM; goto out_error; } - /* - * Locate CRC32 algorithm. If unsuccessful, fail - * loading siw only, if CRC is required. - */ - siw_crypto_shash = crypto_alloc_shash("crc32c", 0, 0); - if (IS_ERR(siw_crypto_shash)) { - pr_info("siw: Loading CRC32c failed: %ld\n", - PTR_ERR(siw_crypto_shash)); - siw_crypto_shash = NULL; - if (mpa_crc_required) { - rv = -EOPNOTSUPP; - goto out_error; - } - } + rv = register_netdevice_notifier(&siw_netdev_nb); if (rv) goto out_error; @@ -493,9 +479,6 @@ static __init int siw_init_module(void) out_error: siw_stop_tx_threads(); - if (siw_crypto_shash) - crypto_free_shash(siw_crypto_shash); - pr_info("SoftIWARP attach failed. Error: %d\n", rv); siw_cm_exit(); @@ -516,9 +499,6 @@ static void __exit siw_exit_module(void) siw_destroy_cpulist(siw_cpu_info.num_nodes); - if (siw_crypto_shash) - crypto_free_shash(siw_crypto_shash); - pr_info("SoftiWARP detached\n"); } diff --git a/drivers/infiniband/sw/siw/siw_qp.c b/drivers/infiniband/sw/siw/siw_qp.c index da92cfa2073d..c1e6e7d6e32f 100644 --- a/drivers/infiniband/sw/siw/siw_qp.c +++ b/drivers/infiniband/sw/siw/siw_qp.c @@ -226,33 +226,6 @@ static int siw_qp_readq_init(struct siw_qp *qp, int irq_size, int orq_size) return 0; } -static int siw_qp_enable_crc(struct siw_qp *qp) -{ - struct siw_rx_stream *c_rx = &qp->rx_stream; - struct siw_iwarp_tx *c_tx = &qp->tx_ctx; - int size; - - if (siw_crypto_shash == NULL) - return -ENOENT; - - size = crypto_shash_descsize(siw_crypto_shash) + - sizeof(struct shash_desc); - - c_tx->mpa_crc_hd = kzalloc(size, GFP_KERNEL); - c_rx->mpa_crc_hd = kzalloc(size, GFP_KERNEL); - if (!c_tx->mpa_crc_hd || !c_rx->mpa_crc_hd) { - kfree(c_tx->mpa_crc_hd); - kfree(c_rx->mpa_crc_hd); - c_tx->mpa_crc_hd = NULL; - c_rx->mpa_crc_hd = NULL; - return -ENOMEM; - } - c_tx->mpa_crc_hd->tfm = siw_crypto_shash; - c_rx->mpa_crc_hd->tfm = siw_crypto_shash; - - return 0; -} - /* * Send a non signalled READ or WRITE to peer side as negotiated * with MPAv2 P2P setup protocol. The work request is only created @@ -583,20 +556,15 @@ void siw_send_terminate(struct siw_qp *qp) term->ctrl.mpa_len = cpu_to_be16(len_terminate - (MPA_HDR_SIZE + MPA_CRC_SIZE)); - if (qp->tx_ctx.mpa_crc_hd) { - crypto_shash_init(qp->tx_ctx.mpa_crc_hd); - if (crypto_shash_update(qp->tx_ctx.mpa_crc_hd, - (u8 *)iov[0].iov_base, - iov[0].iov_len)) - goto out; - + if (qp->tx_ctx.mpa_crc_enabled) { + siw_crc_init(&qp->tx_ctx.mpa_crc); + siw_crc_update(&qp->tx_ctx.mpa_crc, + iov[0].iov_base, iov[0].iov_len); if (num_frags == 3) { - if (crypto_shash_update(qp->tx_ctx.mpa_crc_hd, - (u8 *)iov[1].iov_base, - iov[1].iov_len)) - goto out; + siw_crc_update(&qp->tx_ctx.mpa_crc, + iov[1].iov_base, iov[1].iov_len); } - crypto_shash_final(qp->tx_ctx.mpa_crc_hd, (u8 *)&crc); + siw_crc_final(&qp->tx_ctx.mpa_crc, (u8 *)&crc); } rv = kernel_sendmsg(s, &msg, iov, num_frags, len_terminate); @@ -604,7 +572,6 @@ void siw_send_terminate(struct siw_qp *qp) rv == len_terminate ? 
"success" : "failure", __rdmap_term_layer(term), __rdmap_term_etype(term), __rdmap_term_ecode(term), rv); -out: kfree(term); kfree(err_hdr); } @@ -643,9 +610,10 @@ static int siw_qp_nextstate_from_idle(struct siw_qp *qp, switch (attrs->state) { case SIW_QP_STATE_RTS: if (attrs->flags & SIW_MPA_CRC) { - rv = siw_qp_enable_crc(qp); - if (rv) - break; + siw_crc_init(&qp->tx_ctx.mpa_crc); + qp->tx_ctx.mpa_crc_enabled = true; + siw_crc_init(&qp->rx_stream.mpa_crc); + qp->rx_stream.mpa_crc_enabled = true; } if (!(mask & SIW_QP_ATTR_LLP_HANDLE)) { siw_dbg_qp(qp, "no socket\n"); diff --git a/drivers/infiniband/sw/siw/siw_qp_rx.c b/drivers/infiniband/sw/siw/siw_qp_rx.c index ed4fc39718b4..32554eba1eac 100644 --- a/drivers/infiniband/sw/siw/siw_qp_rx.c +++ b/drivers/infiniband/sw/siw/siw_qp_rx.c @@ -67,10 +67,10 @@ static int siw_rx_umem(struct siw_rx_stream *srx, struct siw_umem *umem, return -EFAULT; } - if (srx->mpa_crc_hd) { + if (srx->mpa_crc_enabled) { if (rdma_is_kernel_res(&rx_qp(srx)->base_qp.res)) { - crypto_shash_update(srx->mpa_crc_hd, - (u8 *)(dest + pg_off), bytes); + siw_crc_update(&srx->mpa_crc, dest + pg_off, + bytes); kunmap_atomic(dest); } else { kunmap_atomic(dest); @@ -114,8 +114,8 @@ static int siw_rx_kva(struct siw_rx_stream *srx, void *kva, int len) return rv; } - if (srx->mpa_crc_hd) - crypto_shash_update(srx->mpa_crc_hd, (u8 *)kva, len); + if (srx->mpa_crc_enabled) + siw_crc_update(&srx->mpa_crc, kva, len); srx->skb_offset += len; srx->skb_copied += len; @@ -966,16 +966,16 @@ static int siw_get_trailer(struct siw_qp *qp, struct siw_rx_stream *srx) if (srx->fpdu_part_rem) return -EAGAIN; - if (!srx->mpa_crc_hd) + if (!srx->mpa_crc_enabled) return 0; if (srx->pad) - crypto_shash_update(srx->mpa_crc_hd, tbuf, srx->pad); + siw_crc_update(&srx->mpa_crc, tbuf, srx->pad); /* * CRC32 is computed, transmitted and received directly in NBO, * so there's never a reason to convert byte order. */ - crypto_shash_final(srx->mpa_crc_hd, (u8 *)&crc_own); + siw_crc_final(&srx->mpa_crc, (u8 *)&crc_own); crc_in = (__force __wsum)srx->trailer.crc; if (unlikely(crc_in != crc_own)) { @@ -1093,13 +1093,12 @@ static int siw_get_hdr(struct siw_rx_stream *srx) * (tagged/untagged). E.g., a WRITE can get intersected by a SEND, * but not by a READ RESPONSE etc. 
*/ - if (srx->mpa_crc_hd) { + if (srx->mpa_crc_enabled) { /* * Restart CRC computation */ - crypto_shash_init(srx->mpa_crc_hd); - crypto_shash_update(srx->mpa_crc_hd, (u8 *)c_hdr, - srx->fpdu_part_rcvd); + siw_crc_init(&srx->mpa_crc); + siw_crc_update(&srx->mpa_crc, c_hdr, srx->fpdu_part_rcvd); } if (frx->more_ddp_segs) { frx->first_ddp_seg = 0; diff --git a/drivers/infiniband/sw/siw/siw_qp_tx.c b/drivers/infiniband/sw/siw/siw_qp_tx.c index a034264c5669..6432bce7d083 100644 --- a/drivers/infiniband/sw/siw/siw_qp_tx.c +++ b/drivers/infiniband/sw/siw/siw_qp_tx.c @@ -248,10 +248,8 @@ static int siw_qp_prepare_tx(struct siw_iwarp_tx *c_tx) /* * Do complete CRC if enabled and short packet */ - if (c_tx->mpa_crc_hd && - crypto_shash_digest(c_tx->mpa_crc_hd, (u8 *)&c_tx->pkt, - c_tx->ctrl_len, (u8 *)crc) != 0) - return -EINVAL; + if (c_tx->mpa_crc_enabled) + siw_crc_oneshot(&c_tx->pkt, c_tx->ctrl_len, (u8 *)crc); c_tx->ctrl_len += MPA_CRC_SIZE; return PKT_COMPLETE; @@ -482,9 +480,8 @@ static int siw_tx_hdt(struct siw_iwarp_tx *c_tx, struct socket *s) iov[seg].iov_len = sge_len; if (do_crc) - crypto_shash_update(c_tx->mpa_crc_hd, - iov[seg].iov_base, - sge_len); + siw_crc_update(&c_tx->mpa_crc, + iov[seg].iov_base, sge_len); sge_off += sge_len; data_len -= sge_len; seg++; @@ -516,15 +513,14 @@ static int siw_tx_hdt(struct siw_iwarp_tx *c_tx, struct socket *s) iov[seg].iov_len = plen; if (do_crc) - crypto_shash_update( - c_tx->mpa_crc_hd, + siw_crc_update( + &c_tx->mpa_crc, iov[seg].iov_base, plen); } else if (do_crc) { kaddr = kmap_local_page(p); - crypto_shash_update(c_tx->mpa_crc_hd, - kaddr + fp_off, - plen); + siw_crc_update(&c_tx->mpa_crc, + kaddr + fp_off, plen); kunmap_local(kaddr); } } else { @@ -536,10 +532,9 @@ static int siw_tx_hdt(struct siw_iwarp_tx *c_tx, struct socket *s) page_array[seg] = ib_virt_dma_to_page(va); if (do_crc) - crypto_shash_update( - c_tx->mpa_crc_hd, - ib_virt_dma_to_ptr(va), - plen); + siw_crc_update(&c_tx->mpa_crc, + ib_virt_dma_to_ptr(va), + plen); } sge_len -= plen; @@ -576,14 +571,14 @@ sge_done: if (c_tx->pad) { *(u32 *)c_tx->trailer.pad = 0; if (do_crc) - crypto_shash_update(c_tx->mpa_crc_hd, - (u8 *)&c_tx->trailer.crc - c_tx->pad, - c_tx->pad); + siw_crc_update(&c_tx->mpa_crc, + (u8 *)&c_tx->trailer.crc - c_tx->pad, + c_tx->pad); } - if (!c_tx->mpa_crc_hd) + if (!c_tx->mpa_crc_enabled) c_tx->trailer.crc = 0; else if (do_crc) - crypto_shash_final(c_tx->mpa_crc_hd, (u8 *)&c_tx->trailer.crc); + siw_crc_final(&c_tx->mpa_crc, (u8 *)&c_tx->trailer.crc); data_len = c_tx->bytes_unsent; @@ -736,10 +731,9 @@ static void siw_prepare_fpdu(struct siw_qp *qp, struct siw_wqe *wqe) /* * Init MPA CRC computation */ - if (c_tx->mpa_crc_hd) { - crypto_shash_init(c_tx->mpa_crc_hd); - crypto_shash_update(c_tx->mpa_crc_hd, (u8 *)&c_tx->pkt, - c_tx->ctrl_len); + if (c_tx->mpa_crc_enabled) { + siw_crc_init(&c_tx->mpa_crc); + siw_crc_update(&c_tx->mpa_crc, &c_tx->pkt, c_tx->ctrl_len); c_tx->do_crc = 1; } } diff --git a/drivers/infiniband/sw/siw/siw_verbs.c b/drivers/infiniband/sw/siw/siw_verbs.c index 5ac8bd450d24..fd7b266a221b 100644 --- a/drivers/infiniband/sw/siw/siw_verbs.c +++ b/drivers/infiniband/sw/siw/siw_verbs.c @@ -631,9 +631,6 @@ int siw_destroy_qp(struct ib_qp *base_qp, struct ib_udata *udata) } up_write(&qp->state_lock); - kfree(qp->tx_ctx.mpa_crc_hd); - kfree(qp->rx_stream.mpa_crc_hd); - qp->scq = qp->rcq = NULL; siw_qp_put(qp); -- cgit v1.2.3-59-g8ed1b From a1ecb30f90856b0be4168ad51b8875148e285c1f Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Thu, 27 Feb 2025 
16:54:20 +0000 Subject: RDMA/core: Don't expose hw_counters outside of init net namespace Commit 467f432a521a ("RDMA/core: Split port and device counter sysfs attributes") accidentally almost exposed hw counters to non-init net namespaces. It didn't expose them fully, as an attempt to read any of those counters leads to a crash like this one: [42021.807566] BUG: kernel NULL pointer dereference, address: 0000000000000028 [42021.814463] #PF: supervisor read access in kernel mode [42021.819549] #PF: error_code(0x0000) - not-present page [42021.824636] PGD 0 P4D 0 [42021.827145] Oops: 0000 [#1] SMP PTI [42021.830598] CPU: 82 PID: 2843922 Comm: switchto-defaul Kdump: loaded Tainted: G S W I XXX [42021.841697] Hardware name: XXX [42021.849619] RIP: 0010:hw_stat_device_show+0x1e/0x40 [ib_core] [42021.855362] Code: 90 90 90 90 90 90 90 90 90 90 90 90 f3 0f 1e fa 0f 1f 44 00 00 49 89 d0 4c 8b 5e 20 48 8b 8f b8 04 00 00 48 81 c7 f0 fa ff ff <48> 8b 41 28 48 29 ce 48 83 c6 d0 48 c1 ee 04 69 d6 ab aa aa aa 48 [42021.873931] RSP: 0018:ffff97fe90f03da0 EFLAGS: 00010287 [42021.879108] RAX: ffff9406988a8c60 RBX: ffff940e1072d438 RCX: 0000000000000000 [42021.886169] RDX: ffff94085f1aa000 RSI: ffff93c6cbbdbcb0 RDI: ffff940c7517aef0 [42021.893230] RBP: ffff97fe90f03e70 R08: ffff94085f1aa000 R09: 0000000000000000 [42021.900294] R10: ffff94085f1aa000 R11: ffffffffc0775680 R12: ffffffff87ca2530 [42021.907355] R13: ffff940651602840 R14: ffff93c6cbbdbcb0 R15: ffff94085f1aa000 [42021.914418] FS: 00007fda1a3b9700(0000) GS:ffff94453fb80000(0000) knlGS:0000000000000000 [42021.922423] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [42021.928130] CR2: 0000000000000028 CR3: 00000042dcfb8003 CR4: 00000000003726f0 [42021.935194] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [42021.942257] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [42021.949324] Call Trace: [42021.951756] [42021.953842] [] ? show_regs+0x64/0x70 [42021.959030] [] ? __die+0x78/0xc0 [42021.963874] [] ? page_fault_oops+0x2b5/0x3b0 [42021.969749] [] ? exc_page_fault+0x1a2/0x3c0 [42021.975549] [] ? asm_exc_page_fault+0x26/0x30 [42021.981517] [] ? __pfx_show_hw_stats+0x10/0x10 [ib_core] [42021.988482] [] ? hw_stat_device_show+0x1e/0x40 [ib_core] [42021.995438] [] dev_attr_show+0x1e/0x50 [42022.000803] [] sysfs_kf_seq_show+0x81/0xe0 [42022.006508] [] seq_read_iter+0xf4/0x410 [42022.011954] [] vfs_read+0x16e/0x2f0 [42022.017058] [] ksys_read+0x6e/0xe0 [42022.022073] [] do_syscall_64+0x6a/0xa0 [42022.027441] [] entry_SYSCALL_64_after_hwframe+0x78/0xe2 The problem can be reproduced using the following steps: ip netns add foo ip netns exec foo bash cat /sys/class/infiniband/mlx4_0/hw_counters/* The panic occurs because casting the device pointer into an ib_device pointer using container_of() in hw_stat_device_show() is wrong and leads to memory corruption. However, the real problem is that hw counters should never be exposed outside of the init net namespace. Fix this by saving the index of the corresponding attribute group (it might be 1 or 2 depending on the presence of driver-specific attributes) and zeroing the pointer to the hw_counters group for compat devices during initialization.
With this fix applied, hw_counters are not available in a non-init net namespace: find /sys/class/infiniband/mlx4_0/ -name hw_counters /sys/class/infiniband/mlx4_0/ports/1/hw_counters /sys/class/infiniband/mlx4_0/ports/2/hw_counters /sys/class/infiniband/mlx4_0/hw_counters ip netns add foo ip netns exec foo bash find /sys/class/infiniband/mlx4_0/ -name hw_counters Fixes: 467f432a521a ("RDMA/core: Split port and device counter sysfs attributes") Signed-off-by: Roman Gushchin Cc: Jason Gunthorpe Cc: Leon Romanovsky Cc: Maher Sanalla Cc: linux-rdma@vger.kernel.org Cc: linux-kernel@vger.kernel.org Link: https://patch.msgid.link/20250227165420.3430301-1-roman.gushchin@linux.dev Reviewed-by: Parav Pandit Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/device.c | 9 +++++++++ drivers/infiniband/core/sysfs.c | 1 + include/rdma/ib_verbs.h | 1 + 3 files changed, 11 insertions(+) (limited to 'drivers') diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index 0ded91f056f3..8feb22089cbb 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -528,6 +528,8 @@ static struct class ib_class = { static void rdma_init_coredev(struct ib_core_device *coredev, struct ib_device *dev, struct net *net) { + bool is_full_dev = &dev->coredev == coredev; + /* This BUILD_BUG_ON is intended to catch layout change * of union of ib_core_device and device. * dev must be the first element as ib_core and providers @@ -539,6 +541,13 @@ static void rdma_init_coredev(struct ib_core_device *coredev, coredev->dev.class = &ib_class; coredev->dev.groups = dev->groups; + + /* + * Don't expose hw counters outside of the init namespace. + */ + if (!is_full_dev && dev->hw_stats_attr_index) + coredev->dev.groups[dev->hw_stats_attr_index] = NULL; + device_initialize(&coredev->dev); coredev->owner = dev; INIT_LIST_HEAD(&coredev->port_list); diff --git a/drivers/infiniband/core/sysfs.c b/drivers/infiniband/core/sysfs.c index 7491328ca5e6..0ed862b38b44 100644 --- a/drivers/infiniband/core/sysfs.c +++ b/drivers/infiniband/core/sysfs.c @@ -976,6 +976,7 @@ int ib_setup_device_attrs(struct ib_device *ibdev) for (i = 0; i != ARRAY_SIZE(ibdev->groups); i++) if (!ibdev->groups[i]) { ibdev->groups[i] = &data->group; + ibdev->hw_stats_attr_index = i; return 0; } WARN(true, "struct ib_device->groups is too small"); diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index b59bf30de430..a5761038935d 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2767,6 +2767,7 @@ struct ib_device { * It is a NULL terminated array. */ const struct attribute_group *groups[4]; + u8 hw_stats_attr_index; u64 uverbs_cmd_mask; -- cgit v1.2.3-59-g8ed1b From f33cd9b3fd03a791296ab37550ffd26213a90c4e Mon Sep 17 00:00:00 2001 From: Nicolas Bouchinet Date: Mon, 24 Feb 2025 10:58:20 +0100 Subject: RDMA/core: Fix infiniband sysctl bounds Bound the infiniband iwcm and ucma sysctl writes between SYSCTL_ZERO and SYSCTL_INT_MAX. The proc_handler has thus been updated to proc_dointvec_minmax.
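As a distilled illustration of the pattern this patch applies (not part of the patch itself; example_backlog is a hypothetical tunable), a bounded ctl_table entry looks like this. proc_dointvec_minmax consults extra1/extra2 and rejects out-of-range writes with -EINVAL instead of storing them:

	#include <linux/sysctl.h>

	static int example_backlog = 128;	/* hypothetical tunable */

	static struct ctl_table example_ctl_table[] = {
		{
			.procname	= "example_backlog",
			.data		= &example_backlog,
			.maxlen		= sizeof(example_backlog),
			.mode		= 0644,
			.proc_handler	= proc_dointvec_minmax,
			.extra1		= SYSCTL_ZERO,		/* reject values < 0 */
			.extra2		= SYSCTL_INT_MAX,	/* reject values > INT_MAX */
		},
	};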
Signed-off-by: Nicolas Bouchinet Link: https://patch.msgid.link/20250224095826.16458-6-nicolas.bouchinet@clip-os.org Reviewed-by: Zhu Yanjun Reviewed-by: Joel Granados Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/iwcm.c | 4 +++- drivers/infiniband/core/ucma.c | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'drivers') diff --git a/drivers/infiniband/core/iwcm.c b/drivers/infiniband/core/iwcm.c index 7e3a55349e10..f4486cbd8f45 100644 --- a/drivers/infiniband/core/iwcm.c +++ b/drivers/infiniband/core/iwcm.c @@ -109,7 +109,9 @@ static struct ctl_table iwcm_ctl_table[] = { .data = &default_backlog, .maxlen = sizeof(default_backlog), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_INT_MAX, }, }; diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c index 02f1666f3cba..6e700b974033 100644 --- a/drivers/infiniband/core/ucma.c +++ b/drivers/infiniband/core/ucma.c @@ -69,7 +69,9 @@ static struct ctl_table ucma_ctl_table[] = { .data = &max_backlog, .maxlen = sizeof max_backlog, .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_INT_MAX, }, }; -- cgit v1.2.3-59-g8ed1b From 3745242ad1e1c07d5990b33764529eb13565db44 Mon Sep 17 00:00:00 2001 From: Christian Göttsche Date: Sun, 2 Mar 2025 17:06:47 +0100 Subject: RDMA/mlx5: Reorder capability check last MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit capable() calls consult the enabled LSMs on whether to permit or deny the request. This is relevant in connection with SELinux, where a capability check results in a policy decision and, by default, a denial message is issued on insufficient permission. This can lead to three undesired cases: 1. A denial message is generated even when the operation was unprivileged and the syscall therefore succeeded, creating noise. 2. To avoid the noise from 1., the policy writer adds a rule to ignore those denial messages, which also hides future denials where the task performs an actual privileged operation, silently limiting that task's functionality. 3. To avoid the noise from 1., the policy writer adds a rule to permit the task the requested capability even though it does not need it, violating the principle of least privilege.
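To make the reordering rule concrete, here is a minimal sketch (fw_supports_raw_tx and EXAMPLE_CAP_RAW_TX are hypothetical stand-ins for the MLX5_CAP_GEN() test and capability bit): capable() is evaluated last, after every side-effect-free condition, so the LSM is consulted, and a potential SELinux denial logged, only when the capability would actually be used.

	#include <linux/capability.h>
	#include <linux/bits.h>
	#include <linux/types.h>

	#define EXAMPLE_CAP_RAW_TX	BIT(0)	/* hypothetical capability bit */

	static u32 example_uctx_caps(bool is_user, bool fw_supports_raw_tx)
	{
		u32 cap = 0;

		/* capable() goes last: no LSM query unless the FW cap exists */
		if (is_user && fw_supports_raw_tx && capable(CAP_NET_RAW))
			cap |= EXAMPLE_CAP_RAW_TX;

		return cap;
	}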
Signed-off-by: Christian Göttsche Link: https://patch.msgid.link/20250302160657.127253-10-cgoettsche@seltendoof.de Reviewed-by: Serge Hallyn Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/devx.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'drivers') diff --git a/drivers/infiniband/hw/mlx5/devx.c b/drivers/infiniband/hw/mlx5/devx.c index 4186884c66e1..39304cae5b10 100644 --- a/drivers/infiniband/hw/mlx5/devx.c +++ b/drivers/infiniband/hw/mlx5/devx.c @@ -136,12 +136,14 @@ int mlx5_ib_devx_create(struct mlx5_ib_dev *dev, bool is_user) return -EINVAL; uctx = MLX5_ADDR_OF(create_uctx_in, in, uctx); - if (is_user && capable(CAP_NET_RAW) && - (MLX5_CAP_GEN(dev->mdev, uctx_cap) & MLX5_UCTX_CAP_RAW_TX)) + if (is_user && + (MLX5_CAP_GEN(dev->mdev, uctx_cap) & MLX5_UCTX_CAP_RAW_TX) && + capable(CAP_NET_RAW)) cap |= MLX5_UCTX_CAP_RAW_TX; - if (is_user && capable(CAP_SYS_RAWIO) && + if (is_user && (MLX5_CAP_GEN(dev->mdev, uctx_cap) & - MLX5_UCTX_CAP_INTERNAL_DEV_RES)) + MLX5_UCTX_CAP_INTERNAL_DEV_RES) && + capable(CAP_SYS_RAWIO)) cap |= MLX5_UCTX_CAP_INTERNAL_DEV_RES; MLX5_SET(create_uctx_in, in, opcode, MLX5_CMD_OP_CREATE_UCTX); -- cgit v1.2.3-59-g8ed1b From 0b27b0e4d43aff78f996abd2d60faa838e8e30b1 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Tue, 4 Mar 2025 21:56:36 +0000 Subject: RDMA/vmw_pvrdma: Remove unused pvrdma_modify_device pvrdma_modify_device() was added in 2016 as part of commit 29c8d9eba550 ("IB: Add vmw_pvrdma driver"), but it was accidentally never wired into the device_ops struct. After some discussion, the best course seems to be to simply remove it; see: https://lore.kernel.org/all/Z8TWF6coBUF3l_jk@gallifrey/ Signed-off-by: Dr. David Alan Gilbert Link: https://patch.msgid.link/20250304215637.68559-1-linux@treblig.org Acked-by: Vishnu Dasa Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.c | 28 ------------------------- drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.h | 2 -- 2 files changed, 30 deletions(-) (limited to 'drivers') diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.c index 9f54aa90a35a..bcd43dc30e21 100644 --- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.c +++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.c @@ -237,34 +237,6 @@ enum rdma_link_layer pvrdma_port_link_layer(struct ib_device *ibdev, return IB_LINK_LAYER_ETHERNET; } -int pvrdma_modify_device(struct ib_device *ibdev, int mask, - struct ib_device_modify *props) -{ - unsigned long flags; - - if (mask & ~(IB_DEVICE_MODIFY_SYS_IMAGE_GUID | - IB_DEVICE_MODIFY_NODE_DESC)) { - dev_warn(&to_vdev(ibdev)->pdev->dev, - "unsupported device modify mask %#x\n", mask); - return -EOPNOTSUPP; - } - - if (mask & IB_DEVICE_MODIFY_NODE_DESC) { - spin_lock_irqsave(&to_vdev(ibdev)->desc_lock, flags); - memcpy(ibdev->node_desc, props->node_desc, 64); - spin_unlock_irqrestore(&to_vdev(ibdev)->desc_lock, flags); - } - - if (mask & IB_DEVICE_MODIFY_SYS_IMAGE_GUID) { - mutex_lock(&to_vdev(ibdev)->port_mutex); - to_vdev(ibdev)->sys_image_guid = - cpu_to_be64(props->sys_image_guid); - mutex_unlock(&to_vdev(ibdev)->port_mutex); - } - - return 0; -} - /** * pvrdma_modify_port - modify device port attributes * @ibdev: the device to modify diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.h b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.h index 4b9edc03d73d..fd47b0b1df5c 100644 --- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.h +++
b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.h @@ -356,8 +356,6 @@ int pvrdma_query_pkey(struct ib_device *ibdev, u32 port, u16 index, u16 *pkey); enum rdma_link_layer pvrdma_port_link_layer(struct ib_device *ibdev, u32 port); -int pvrdma_modify_device(struct ib_device *ibdev, int mask, - struct ib_device_modify *props); int pvrdma_modify_port(struct ib_device *ibdev, u32 port, int mask, struct ib_port_modify *props); int pvrdma_mmap(struct ib_ucontext *context, struct vm_area_struct *vma); -- cgit v1.2.3-59-g8ed1b From 83437689249e6a17b25e27712fbee292e42e7855 Mon Sep 17 00:00:00 2001 From: Cheng Xu Date: Thu, 6 Mar 2025 20:04:40 +0800 Subject: RDMA/erdma: Prevent use-after-free in erdma_accept_newconn() After erdma_cep_put(new_cep) is called, new_cep is freed, and the subsequent dereference causes a use-after-free. Fix this issue. Fixes: 920d93eac8b9 ("RDMA/erdma: Add connection management (CM) support") Signed-off-by: Markus Elfring Signed-off-by: Cheng Xu Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/erdma/erdma_cm.c | 1 - 1 file changed, 1 deletion(-) (limited to 'drivers') diff --git a/drivers/infiniband/hw/erdma/erdma_cm.c b/drivers/infiniband/hw/erdma/erdma_cm.c index 1b23c698ec25..e0acc185e719 100644 --- a/drivers/infiniband/hw/erdma/erdma_cm.c +++ b/drivers/infiniband/hw/erdma/erdma_cm.c @@ -709,7 +709,6 @@ error: erdma_cancel_mpatimer(new_cep); erdma_cep_put(new_cep); - new_cep->sock = NULL; } if (new_s) { -- cgit v1.2.3-59-g8ed1b From 1d5c69514e742846ad3b8727b51b1fd46ea251fd Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 6 Mar 2025 22:49:06 +0300 Subject: RDMA/mana_ib: Use safer allocation function My static checker says this multiplication can overflow. I'm not an expert in this code, but the call tree would be: ib_uverbs_handler_UVERBS_METHOD_QP_CREATE() <- reads cap from the user -> ib_create_qp_user() -> create_qp() -> mana_ib_create_qp() -> mana_ib_create_ud_qp() -> create_shadow_queue() It can't hurt to use safer interfaces. Fixes: c8017f5b4856 ("RDMA/mana_ib: UD/GSI work requests") Signed-off-by: Dan Carpenter Link: https://patch.msgid.link/58439ac0-1ee5-4f96-a595-7ab83b59139b@stanley.mountain Reviewed-by: Long Li Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mana/shadow_queue.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/infiniband/hw/mana/shadow_queue.h b/drivers/infiniband/hw/mana/shadow_queue.h index d8bfb4c712d5..a4b3818f9c39 100644 --- a/drivers/infiniband/hw/mana/shadow_queue.h +++ b/drivers/infiniband/hw/mana/shadow_queue.h @@ -40,7 +40,7 @@ struct shadow_queue { static inline int create_shadow_queue(struct shadow_queue *queue, uint32_t length, uint32_t stride) { - queue->buffer = kvmalloc(length * stride, GFP_KERNEL); + queue->buffer = kvmalloc_array(length, stride, GFP_KERNEL); if (!queue->buffer) return -ENOMEM; -- cgit v1.2.3-59-g8ed1b From 61e51682816d395307f78ae06d640089054c28ab Mon Sep 17 00:00:00 2001 From: Chiara Meiohas Date: Thu, 6 Mar 2025 13:51:26 +0200 Subject: RDMA/uverbs: Introduce UCAP (User CAPabilities) API Implement a new User CAPabilities (UCAP) API to provide fine-grained control over specific firmware features. This approach offers more granular capabilities than the existing Linux capabilities, which may be too generic for certain FW features. This mechanism represents each capability as a character device with root read-write access.
Root processes can grant users special privileges by allowing access to these character devices (e.g., using chown). UCAP character devices are located in /dev/infiniband and the class path is /sys/class/infiniband_ucaps. Signed-off-by: Chiara Meiohas Link: https://patch.msgid.link/5a1379187cd21178e8554afc81a3c941f21af22f.1741261611.git.leon@kernel.org Reviewed-by: Yishai Hadas Reviewed-by: Zhu Yanjun Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/Makefile | 3 +- drivers/infiniband/core/ucaps.c | 267 ++++++++++++++++++++++++++++++++++ drivers/infiniband/core/uverbs_main.c | 2 + include/rdma/ib_ucaps.h | 30 ++++ 4 files changed, 301 insertions(+), 1 deletion(-) create mode 100644 drivers/infiniband/core/ucaps.c create mode 100644 include/rdma/ib_ucaps.h (limited to 'drivers') diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile index 8ab4eea5a0a5..d49ded7e95f0 100644 --- a/drivers/infiniband/core/Makefile +++ b/drivers/infiniband/core/Makefile @@ -39,6 +39,7 @@ ib_uverbs-y := uverbs_main.o uverbs_cmd.o uverbs_marshall.o \ uverbs_std_types_async_fd.o \ uverbs_std_types_srq.o \ uverbs_std_types_wq.o \ - uverbs_std_types_qp.o + uverbs_std_types_qp.o \ + ucaps.o ib_uverbs-$(CONFIG_INFINIBAND_USER_MEM) += umem.o umem_dmabuf.o ib_uverbs-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += umem_odp.o diff --git a/drivers/infiniband/core/ucaps.c b/drivers/infiniband/core/ucaps.c new file mode 100644 index 000000000000..6853c6d078f9 --- /dev/null +++ b/drivers/infiniband/core/ucaps.c @@ -0,0 +1,267 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved + */ + +#include +#include +#include +#include +#include +#include + +#define RDMA_UCAP_FIRST RDMA_UCAP_MLX5_CTRL_LOCAL + +static DEFINE_MUTEX(ucaps_mutex); +static struct ib_ucap *ucaps_list[RDMA_UCAP_MAX]; +static bool ucaps_class_is_registered; +static dev_t ucaps_base_dev; + +struct ib_ucap { + struct cdev cdev; + struct device dev; + struct kref ref; +}; + +static const char *ucap_names[RDMA_UCAP_MAX] = { + [RDMA_UCAP_MLX5_CTRL_LOCAL] = "mlx5_perm_ctrl_local", + [RDMA_UCAP_MLX5_CTRL_OTHER_VHCA] = "mlx5_perm_ctrl_other_vhca" +}; + +static char *ucaps_devnode(const struct device *dev, umode_t *mode) +{ + if (mode) + *mode = 0600; + + return kasprintf(GFP_KERNEL, "infiniband/%s", dev_name(dev)); +} + +static const struct class ucaps_class = { + .name = "infiniband_ucaps", + .devnode = ucaps_devnode, +}; + +static const struct file_operations ucaps_cdev_fops = { + .owner = THIS_MODULE, + .open = simple_open, +}; + +/** + * ib_cleanup_ucaps - cleanup all API resources and class. + * + * This is called once, when removing the ib_uverbs module. 
+ */ +void ib_cleanup_ucaps(void) +{ + mutex_lock(&ucaps_mutex); + if (!ucaps_class_is_registered) { + mutex_unlock(&ucaps_mutex); + return; + } + + for (int i = RDMA_UCAP_FIRST; i < RDMA_UCAP_MAX; i++) + WARN_ON(ucaps_list[i]); + + class_unregister(&ucaps_class); + ucaps_class_is_registered = false; + unregister_chrdev_region(ucaps_base_dev, RDMA_UCAP_MAX); + mutex_unlock(&ucaps_mutex); +} + +static int get_ucap_from_devt(dev_t devt, u64 *idx_mask) +{ + for (int type = RDMA_UCAP_FIRST; type < RDMA_UCAP_MAX; type++) { + if (ucaps_list[type] && ucaps_list[type]->dev.devt == devt) { + *idx_mask |= 1 << type; + return 0; + } + } + + return -EINVAL; +} + +static int get_devt_from_fd(unsigned int fd, dev_t *ret_dev) +{ + struct file *file; + + file = fget(fd); + if (!file) + return -EBADF; + + *ret_dev = file_inode(file)->i_rdev; + fput(file); + return 0; +} + +/** + * ib_ucaps_init - Initialization required before ucap creation. + * + * Return: 0 on success, or a negative errno value on failure + */ +static int ib_ucaps_init(void) +{ + int ret = 0; + + if (ucaps_class_is_registered) + return ret; + + ret = class_register(&ucaps_class); + if (ret) + return ret; + + ret = alloc_chrdev_region(&ucaps_base_dev, 0, RDMA_UCAP_MAX, + ucaps_class.name); + if (ret < 0) { + class_unregister(&ucaps_class); + return ret; + } + + ucaps_class_is_registered = true; + + return 0; +} + +static void ucap_dev_release(struct device *device) +{ + struct ib_ucap *ucap = container_of(device, struct ib_ucap, dev); + + kfree(ucap); +} + +/** + * ib_create_ucap - Add a ucap character device + * @type: UCAP type + * + * Creates a ucap character device in the /dev/infiniband directory. By default, + * the device has root-only read-write access. + * + * A driver may call this multiple times with the same UCAP type. A reference + * count tracks creations and deletions. + * + * Return: 0 on success, or a negative errno value on failure + */ +int ib_create_ucap(enum rdma_user_cap type) +{ + struct ib_ucap *ucap; + int ret; + + if (type >= RDMA_UCAP_MAX) + return -EINVAL; + + mutex_lock(&ucaps_mutex); + ret = ib_ucaps_init(); + if (ret) + goto unlock; + + ucap = ucaps_list[type]; + if (ucap) { + kref_get(&ucap->ref); + mutex_unlock(&ucaps_mutex); + return 0; + } + + ucap = kzalloc(sizeof(*ucap), GFP_KERNEL); + if (!ucap) { + ret = -ENOMEM; + goto unlock; + } + + device_initialize(&ucap->dev); + ucap->dev.class = &ucaps_class; + ucap->dev.devt = MKDEV(MAJOR(ucaps_base_dev), type); + ucap->dev.release = ucap_dev_release; + ret = dev_set_name(&ucap->dev, ucap_names[type]); + if (ret) + goto err_device; + + cdev_init(&ucap->cdev, &ucaps_cdev_fops); + ucap->cdev.owner = THIS_MODULE; + + ret = cdev_device_add(&ucap->cdev, &ucap->dev); + if (ret) + goto err_device; + + kref_init(&ucap->ref); + ucaps_list[type] = ucap; + mutex_unlock(&ucaps_mutex); + + return 0; + +err_device: + put_device(&ucap->dev); +unlock: + mutex_unlock(&ucaps_mutex); + return ret; +} +EXPORT_SYMBOL(ib_create_ucap); + +static void ib_release_ucap(struct kref *ref) +{ + struct ib_ucap *ucap = container_of(ref, struct ib_ucap, ref); + enum rdma_user_cap type; + + for (type = RDMA_UCAP_FIRST; type < RDMA_UCAP_MAX; type++) { + if (ucaps_list[type] == ucap) + break; + } + WARN_ON(type == RDMA_UCAP_MAX); + + ucaps_list[type] = NULL; + cdev_device_del(&ucap->cdev, &ucap->dev); + put_device(&ucap->dev); +} + +/** + * ib_remove_ucap - Remove a ucap character device + * @type: User cap type + * + * Removes the ucap character device according to type. 
The device is completely + * removed from the filesystem when its reference count reaches 0. + */ +void ib_remove_ucap(enum rdma_user_cap type) +{ + struct ib_ucap *ucap; + + mutex_lock(&ucaps_mutex); + ucap = ucaps_list[type]; + if (WARN_ON(!ucap)) + goto end; + + kref_put(&ucap->ref, ib_release_ucap); +end: + mutex_unlock(&ucaps_mutex); +} +EXPORT_SYMBOL(ib_remove_ucap); + +/** + * ib_get_ucaps - Get bitmask of ucap types from file descriptors + * @fds: Array of file descriptors + * @fd_count: Number of file descriptors in the array + * @idx_mask: Bitmask to be updated based on the ucaps in the fd list + * + * Given an array of file descriptors, this function returns a bitmask of + * the ucaps where a bit is set if an FD for that ucap type was in the array. + * + * Return: 0 on success, or a negative errno value on failure + */ +int ib_get_ucaps(int *fds, int fd_count, uint64_t *idx_mask) +{ + int ret = 0; + dev_t dev; + + *idx_mask = 0; + mutex_lock(&ucaps_mutex); + for (int i = 0; i < fd_count; i++) { + ret = get_devt_from_fd(fds[i], &dev); + if (ret) + goto end; + + ret = get_ucap_from_devt(dev, idx_mask); + if (ret) + goto end; + } + +end: + mutex_unlock(&ucaps_mutex); + return ret; +} diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index 85cfc790a7bb..973fe2c7ef53 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -52,6 +52,7 @@ #include #include #include +#include #include "uverbs.h" #include "core_priv.h" @@ -1345,6 +1346,7 @@ static void __exit ib_uverbs_cleanup(void) IB_UVERBS_NUM_FIXED_MINOR); unregister_chrdev_region(dynamic_uverbs_dev, IB_UVERBS_NUM_DYNAMIC_MINOR); + ib_cleanup_ucaps(); mmu_notifier_synchronize(); } diff --git a/include/rdma/ib_ucaps.h b/include/rdma/ib_ucaps.h new file mode 100644 index 000000000000..d9f96be3a553 --- /dev/null +++ b/include/rdma/ib_ucaps.h @@ -0,0 +1,30 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* + * Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved + */ + +#ifndef _IB_UCAPS_H_ +#define _IB_UCAPS_H_ + +#define UCAP_ENABLED(ucaps, type) (!!((ucaps) & (1U << (type)))) + +enum rdma_user_cap { + RDMA_UCAP_MLX5_CTRL_LOCAL, + RDMA_UCAP_MLX5_CTRL_OTHER_VHCA, + RDMA_UCAP_MAX +}; + +void ib_cleanup_ucaps(void); +int ib_get_ucaps(int *fds, int fd_count, uint64_t *idx_mask); +#if IS_ENABLED(CONFIG_INFINIBAND_USER_ACCESS) +int ib_create_ucap(enum rdma_user_cap type); +void ib_remove_ucap(enum rdma_user_cap type); +#else +static inline int ib_create_ucap(enum rdma_user_cap type) +{ + return -EOPNOTSUPP; +} +static inline void ib_remove_ucap(enum rdma_user_cap type) {} +#endif /* CONFIG_INFINIBAND_USER_ACCESS */ + +#endif /* _IB_UCAPS_H_ */ -- cgit v1.2.3-59-g8ed1b From cf7174e8982f8e07a04db23e549a3f9c9a80d3ca Mon Sep 17 00:00:00 2001 From: Chiara Meiohas Date: Thu, 6 Mar 2025 13:51:27 +0200 Subject: RDMA/mlx5: Create UCAP char devices for supported device capabilities Create UCAP character devices when probing an IB device with supported firmware capabilities. 
If the RDMA_CTRL general object type is supported, check for specific UCTX capabilities: Create /dev/infiniband/mlx5_perm_ctrl_local for RDMA_UCAP_MLX5_CTRL_LOCAL Create /dev/infiniband/mlx5_perm_ctrl_other_vhca for RDMA_UCAP_MLX5_CTRL_OTHER_VHCA Signed-off-by: Chiara Meiohas Link: https://patch.msgid.link/30ed40e7a12a694cf4ee257459ed61b145b7837d.1741261611.git.leon@kernel.org Reviewed-by: Yishai Hadas Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/main.c | 47 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) (limited to 'drivers') diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 81849eb671a1..04b489a6a449 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -47,6 +47,7 @@ #include #include #include +#include #include "macsec.h" #include "data_direct.h" @@ -4201,8 +4202,47 @@ static int mlx5_ib_init_var_table(struct mlx5_ib_dev *dev) return (var_table->bitmap) ? 0 : -ENOMEM; } +static void mlx5_ib_cleanup_ucaps(struct mlx5_ib_dev *dev) +{ + if (MLX5_CAP_GEN(dev->mdev, uctx_cap) & MLX5_UCTX_CAP_RDMA_CTRL) + ib_remove_ucap(RDMA_UCAP_MLX5_CTRL_LOCAL); + + if (MLX5_CAP_GEN(dev->mdev, uctx_cap) & + MLX5_UCTX_CAP_RDMA_CTRL_OTHER_VHCA) + ib_remove_ucap(RDMA_UCAP_MLX5_CTRL_OTHER_VHCA); +} + +static int mlx5_ib_init_ucaps(struct mlx5_ib_dev *dev) +{ + int ret; + + if (MLX5_CAP_GEN(dev->mdev, uctx_cap) & MLX5_UCTX_CAP_RDMA_CTRL) { + ret = ib_create_ucap(RDMA_UCAP_MLX5_CTRL_LOCAL); + if (ret) + return ret; + } + + if (MLX5_CAP_GEN(dev->mdev, uctx_cap) & + MLX5_UCTX_CAP_RDMA_CTRL_OTHER_VHCA) { + ret = ib_create_ucap(RDMA_UCAP_MLX5_CTRL_OTHER_VHCA); + if (ret) + goto remove_local; + } + + return 0; + +remove_local: + if (MLX5_CAP_GEN(dev->mdev, uctx_cap) & MLX5_UCTX_CAP_RDMA_CTRL) + ib_remove_ucap(RDMA_UCAP_MLX5_CTRL_LOCAL); + return ret; +} + static void mlx5_ib_stage_caps_cleanup(struct mlx5_ib_dev *dev) { + if (MLX5_CAP_GEN_2_64(dev->mdev, general_obj_types_127_64) & + MLX5_HCA_CAP_2_GENERAL_OBJECT_TYPES_RDMA_CTRL) + mlx5_ib_cleanup_ucaps(dev); + bitmap_free(dev->var_table.bitmap); } @@ -4253,6 +4293,13 @@ static int mlx5_ib_stage_caps_init(struct mlx5_ib_dev *dev) return err; } + if (MLX5_CAP_GEN_2_64(dev->mdev, general_obj_types_127_64) & + MLX5_HCA_CAP_2_GENERAL_OBJECT_TYPES_RDMA_CTRL) { + err = mlx5_ib_init_ucaps(dev); + if (err) + return err; + } + dev->ib_dev.use_cq_dim = true; return 0; -- cgit v1.2.3-59-g8ed1b From fe9d7822baee65d8e77542391799b2777f5216f5 Mon Sep 17 00:00:00 2001 From: Chiara Meiohas Date: Thu, 6 Mar 2025 13:51:28 +0200 Subject: RDMA/uverbs: Add support for UCAPs in context creation Add support for file descriptor array attribute for GET_CONTEXT commands. Check that the file descriptor (fd) array represents fds for valid UCAPs. Store the enabled UCAPs from the fd array as a bitmask in ib_ucontext. 
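A hedged userspace sketch of the flow this enables (only the open() of the UCAP character device and the idea of forwarding its fd in the UVERBS_ATTR_GET_CONTEXT_FD_ARR attribute come from this series; the provider hand-off shown as a comment is a hypothetical placeholder):

	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		/* Device created by the mlx5 patch above; access is granted
		 * by root, e.g. via chown, as described in the series.
		 */
		int fd = open("/dev/infiniband/mlx5_perm_ctrl_local", O_RDWR);

		if (fd < 0) {
			perror("open ucap device");
			return 1;
		}

		/*
		 * provider_create_context_with_ucaps(&fd, 1) would forward
		 * the fd in UVERBS_ATTR_GET_CONTEXT_FD_ARR; that helper is
		 * hypothetical and would live in the userspace provider.
		 */
		close(fd);
		return 0;
	}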
Signed-off-by: Chiara Meiohas Link: https://patch.msgid.link/ebfb30bc947e2259b193c96a319c80e82599045b.1741261611.git.leon@kernel.org Reviewed-by: Yishai Hadas Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/uverbs_cmd.c | 19 +++++++++++++++++++ drivers/infiniband/core/uverbs_std_types_device.c | 4 ++++ include/rdma/ib_verbs.h | 1 + include/uapi/rdma/ib_user_ioctl_cmds.h | 1 + 4 files changed, 25 insertions(+) (limited to 'drivers') diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 5ad14c39d48c..96d639e1ffa0 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -42,6 +42,7 @@ #include #include +#include #include "rdma_core.h" #include "uverbs.h" @@ -232,6 +233,8 @@ int ib_init_ucontext(struct uverbs_attr_bundle *attrs) { struct ib_ucontext *ucontext = attrs->context; struct ib_uverbs_file *file = attrs->ufile; + int *fd_array; + int fd_count; int ret; if (!down_read_trylock(&file->hw_destroy_rwsem)) @@ -247,6 +250,22 @@ int ib_init_ucontext(struct uverbs_attr_bundle *attrs) if (ret) goto err; + if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_GET_CONTEXT_FD_ARR)) { + fd_count = uverbs_attr_ptr_get_array_size(attrs, + UVERBS_ATTR_GET_CONTEXT_FD_ARR, + sizeof(int)); + if (fd_count < 0) { + ret = fd_count; + goto err_uncharge; + } + + fd_array = uverbs_attr_get_alloced_ptr(attrs, + UVERBS_ATTR_GET_CONTEXT_FD_ARR); + ret = ib_get_ucaps(fd_array, fd_count, &ucontext->enabled_caps); + if (ret) + goto err_uncharge; + } + ret = ucontext->device->ops.alloc_ucontext(ucontext, &attrs->driver_udata); if (ret) diff --git a/drivers/infiniband/core/uverbs_std_types_device.c b/drivers/infiniband/core/uverbs_std_types_device.c index fb0555647336..c0fd283d9d6c 100644 --- a/drivers/infiniband/core/uverbs_std_types_device.c +++ b/drivers/infiniband/core/uverbs_std_types_device.c @@ -437,6 +437,10 @@ DECLARE_UVERBS_NAMED_METHOD( UVERBS_ATTR_TYPE(u32), UA_OPTIONAL), UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_GET_CONTEXT_CORE_SUPPORT, UVERBS_ATTR_TYPE(u64), UA_OPTIONAL), + UVERBS_ATTR_PTR_IN(UVERBS_ATTR_GET_CONTEXT_FD_ARR, + UVERBS_ATTR_MIN_SIZE(sizeof(int)), + UA_OPTIONAL, + UA_ALLOC_AND_COPY), UVERBS_ATTR_UHW()); DECLARE_UVERBS_NAMED_METHOD( diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index a5761038935d..9941f4185c79 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -1530,6 +1530,7 @@ struct ib_ucontext { struct ib_uverbs_file *ufile; struct ib_rdmacg_object cg_obj; + u64 enabled_caps; /* * Implementation details of the RDMA core, don't use in drivers: */ diff --git a/include/uapi/rdma/ib_user_ioctl_cmds.h b/include/uapi/rdma/ib_user_ioctl_cmds.h index ec719053aab9..ac7b162611ed 100644 --- a/include/uapi/rdma/ib_user_ioctl_cmds.h +++ b/include/uapi/rdma/ib_user_ioctl_cmds.h @@ -88,6 +88,7 @@ enum uverbs_attrs_query_port_cmd_attr_ids { enum uverbs_attrs_get_context_attr_ids { UVERBS_ATTR_GET_CONTEXT_NUM_COMP_VECTORS, UVERBS_ATTR_GET_CONTEXT_CORE_SUPPORT, + UVERBS_ATTR_GET_CONTEXT_FD_ARR, }; enum uverbs_attrs_query_context_attr_ids { -- cgit v1.2.3-59-g8ed1b From 17ade5366345656e1a7f4e9da16863a7499da21b Mon Sep 17 00:00:00 2001 From: Chiara Meiohas Date: Thu, 6 Mar 2025 13:51:29 +0200 Subject: RDMA/mlx5: Check enabled UCAPs when creating ucontext Verify that the enabled UCAPs are supported by the device before creating the ucontext. If supported, create the ucontext with the associated capabilities. 
Store the privileged ucontext UID on creation and remove it when destroying the privileged ucontext. This allows the command interface to recognize privileged commands through its UID. Signed-off-by: Chiara Meiohas Link: https://patch.msgid.link/8b180583a207cb30deb7a2967934079749cdcc44.1741261611.git.leon@kernel.org Reviewed-by: Yishai Hadas Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/devx.c | 31 +++++++++++++++++++++++++++++-- drivers/infiniband/hw/mlx5/devx.h | 5 +++-- drivers/infiniband/hw/mlx5/main.c | 30 ++++++++++++++++++++++++++---- 3 files changed, 58 insertions(+), 8 deletions(-) (limited to 'drivers') diff --git a/drivers/infiniband/hw/mlx5/devx.c b/drivers/infiniband/hw/mlx5/devx.c index 39304cae5b10..2479da8620ca 100644 --- a/drivers/infiniband/hw/mlx5/devx.c +++ b/drivers/infiniband/hw/mlx5/devx.c @@ -13,6 +13,7 @@ #include #include #include +#include #include "mlx5_ib.h" #include "devx.h" #include "qp.h" @@ -122,7 +123,27 @@ devx_ufile2uctx(const struct uverbs_attr_bundle *attrs) return to_mucontext(ib_uverbs_get_ucontext(attrs)); } -int mlx5_ib_devx_create(struct mlx5_ib_dev *dev, bool is_user) +static int set_uctx_ucaps(struct mlx5_ib_dev *dev, u64 req_ucaps, u32 *cap) +{ + if (UCAP_ENABLED(req_ucaps, RDMA_UCAP_MLX5_CTRL_LOCAL)) { + if (MLX5_CAP_GEN(dev->mdev, uctx_cap) & MLX5_UCTX_CAP_RDMA_CTRL) + *cap |= MLX5_UCTX_CAP_RDMA_CTRL; + else + return -EOPNOTSUPP; + } + + if (UCAP_ENABLED(req_ucaps, RDMA_UCAP_MLX5_CTRL_OTHER_VHCA)) { + if (MLX5_CAP_GEN(dev->mdev, uctx_cap) & + MLX5_UCTX_CAP_RDMA_CTRL_OTHER_VHCA) + *cap |= MLX5_UCTX_CAP_RDMA_CTRL_OTHER_VHCA; + else + return -EOPNOTSUPP; + } + + return 0; +} + +int mlx5_ib_devx_create(struct mlx5_ib_dev *dev, bool is_user, u64 req_ucaps) { u32 in[MLX5_ST_SZ_DW(create_uctx_in)] = {}; u32 out[MLX5_ST_SZ_DW(create_uctx_out)] = {}; @@ -146,6 +167,12 @@ int mlx5_ib_devx_create(struct mlx5_ib_dev *dev, bool is_user) capable(CAP_SYS_RAWIO)) cap |= MLX5_UCTX_CAP_INTERNAL_DEV_RES; + if (req_ucaps) { + err = set_uctx_ucaps(dev, req_ucaps, &cap); + if (err) + return err; + } + MLX5_SET(create_uctx_in, in, opcode, MLX5_CMD_OP_CREATE_UCTX); MLX5_SET(uctx, uctx, cap, cap); @@ -2575,7 +2602,7 @@ int mlx5_ib_devx_init(struct mlx5_ib_dev *dev) struct mlx5_devx_event_table *table = &dev->devx_event_table; int uid; - uid = mlx5_ib_devx_create(dev, false); + uid = mlx5_ib_devx_create(dev, false, 0); if (uid > 0) { dev->devx_whitelist_uid = uid; xa_init(&table->event_xa); diff --git a/drivers/infiniband/hw/mlx5/devx.h b/drivers/infiniband/hw/mlx5/devx.h index 1344bf4c9d21..ee9e7d3af93f 100644 --- a/drivers/infiniband/hw/mlx5/devx.h +++ b/drivers/infiniband/hw/mlx5/devx.h @@ -24,13 +24,14 @@ struct devx_obj { struct list_head event_sub; /* holds devx_event_subscription entries */ }; #if IS_ENABLED(CONFIG_INFINIBAND_USER_ACCESS) -int mlx5_ib_devx_create(struct mlx5_ib_dev *dev, bool is_user); +int mlx5_ib_devx_create(struct mlx5_ib_dev *dev, bool is_user, u64 req_ucaps); void mlx5_ib_devx_destroy(struct mlx5_ib_dev *dev, u16 uid); int mlx5_ib_devx_init(struct mlx5_ib_dev *dev); void mlx5_ib_devx_cleanup(struct mlx5_ib_dev *dev); void mlx5_ib_ufile_hw_cleanup(struct ib_uverbs_file *ufile); #else -static inline int mlx5_ib_devx_create(struct mlx5_ib_dev *dev, bool is_user) +static inline int mlx5_ib_devx_create(struct mlx5_ib_dev *dev, bool is_user, + u64 req_ucaps) { return -EOPNOTSUPP; } diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 04b489a6a449..d07cacaa0abd 100644 --- 
a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -1935,6 +1935,12 @@ static int set_ucontext_resp(struct ib_ucontext *uctx, return 0; } +static bool uctx_rdma_ctrl_is_enabled(u64 enabled_caps) +{ + return UCAP_ENABLED(enabled_caps, RDMA_UCAP_MLX5_CTRL_LOCAL) || + UCAP_ENABLED(enabled_caps, RDMA_UCAP_MLX5_CTRL_OTHER_VHCA); +} + static int mlx5_ib_alloc_ucontext(struct ib_ucontext *uctx, struct ib_udata *udata) { @@ -1977,10 +1983,17 @@ static int mlx5_ib_alloc_ucontext(struct ib_ucontext *uctx, return -EINVAL; if (req.flags & MLX5_IB_ALLOC_UCTX_DEVX) { - err = mlx5_ib_devx_create(dev, true); + err = mlx5_ib_devx_create(dev, true, uctx->enabled_caps); if (err < 0) goto out_ctx; context->devx_uid = err; + + if (uctx_rdma_ctrl_is_enabled(uctx->enabled_caps)) { + err = mlx5_cmd_add_privileged_uid(dev->mdev, + context->devx_uid); + if (err) + goto out_devx; + } } lib_uar_4k = req.lib_caps & MLX5_LIB_CAP_4K_UAR; @@ -1995,7 +2008,7 @@ static int mlx5_ib_alloc_ucontext(struct ib_ucontext *uctx, /* updates req->total_num_bfregs */ err = calc_total_bfregs(dev, lib_uar_4k, &req, bfregi); if (err) - goto out_devx; + goto out_ucap; mutex_init(&bfregi->lock); bfregi->lib_uar_4k = lib_uar_4k; @@ -2003,7 +2016,7 @@ static int mlx5_ib_alloc_ucontext(struct ib_ucontext *uctx, GFP_KERNEL); if (!bfregi->count) { err = -ENOMEM; - goto out_devx; + goto out_ucap; } bfregi->sys_pages = kcalloc(bfregi->num_sys_pages, @@ -2067,6 +2080,11 @@ out_sys_pages: out_count: kfree(bfregi->count); +out_ucap: + if (req.flags & MLX5_IB_ALLOC_UCTX_DEVX && + uctx_rdma_ctrl_is_enabled(uctx->enabled_caps)) + mlx5_cmd_remove_privileged_uid(dev->mdev, context->devx_uid); + out_devx: if (req.flags & MLX5_IB_ALLOC_UCTX_DEVX) mlx5_ib_devx_destroy(dev, context->devx_uid); @@ -2111,8 +2129,12 @@ static void mlx5_ib_dealloc_ucontext(struct ib_ucontext *ibcontext) kfree(bfregi->sys_pages); kfree(bfregi->count); - if (context->devx_uid) + if (context->devx_uid) { + if (uctx_rdma_ctrl_is_enabled(ibcontext->enabled_caps)) + mlx5_cmd_remove_privileged_uid(dev->mdev, + context->devx_uid); mlx5_ib_devx_destroy(dev, context->devx_uid); + } } static phys_addr_t uar_index2pfn(struct mlx5_ib_dev *dev, -- cgit v1.2.3-59-g8ed1b From 74934ddf124a6f6e438b570ab5fb70ef83626707 Mon Sep 17 00:00:00 2001 From: Patrisious Haddad Date: Thu, 6 Mar 2025 13:51:30 +0200 Subject: RDMA/mlx5: Expose RDMA TRANSPORT flow table types to userspace This patch adds RDMA_TRANSPORT_RX and RDMA_TRANSPORT_TX as a new flow table type for matcher creation. 
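Condensed into a standalone check (matcher_port_args_ok() is a hypothetical helper with simplified types), the validation rules the diff below introduces are: a matcher on one of the two RDMA_TRANSPORT table types must name a valid IB port, and the port attribute is rejected for every other table type.

	#include <linux/types.h>

	static bool matcher_port_args_ok(bool is_transport_ft,
					 unsigned int ib_port,
					 unsigned int num_ports)
	{
		/* The IB port attribute is only valid for transport types */
		if (!is_transport_ft)
			return ib_port == 0;

		/* Transport tables require a valid, nonzero port number */
		return ib_port >= 1 && ib_port <= num_ports;
	}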
Signed-off-by: Patrisious Haddad Reviewed-by: Mark Bloch Signed-off-by: Leon Romanovsky Link: https://patch.msgid.link/2287d8c50483e880450c7e8e08d9de34cdec1b14.1741261611.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/fs.c | 154 +++++++++++++++++++++++++++--- drivers/infiniband/hw/mlx5/fs.h | 2 + drivers/infiniband/hw/mlx5/mlx5_ib.h | 3 + include/uapi/rdma/mlx5_user_ioctl_cmds.h | 1 + include/uapi/rdma/mlx5_user_ioctl_verbs.h | 2 + 5 files changed, 149 insertions(+), 13 deletions(-) (limited to 'drivers') diff --git a/drivers/infiniband/hw/mlx5/fs.c b/drivers/infiniband/hw/mlx5/fs.c index 162814ae8cb4..6ae2801fa13f 100644 --- a/drivers/infiniband/hw/mlx5/fs.c +++ b/drivers/infiniband/hw/mlx5/fs.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -690,7 +691,7 @@ static struct mlx5_ib_flow_prio *_get_prio(struct mlx5_ib_dev *dev, struct mlx5_ib_flow_prio *prio, int priority, int num_entries, int num_groups, - u32 flags) + u32 flags, u16 vport) { struct mlx5_flow_table_attr ft_attr = {}; struct mlx5_flow_table *ft; @@ -698,6 +699,7 @@ static struct mlx5_ib_flow_prio *_get_prio(struct mlx5_ib_dev *dev, ft_attr.prio = priority; ft_attr.max_fte = num_entries; ft_attr.flags = flags; + ft_attr.vport = vport; ft_attr.autogroup.max_num_groups = num_groups; ft = mlx5_create_auto_grouped_flow_table(ns, &ft_attr); if (IS_ERR(ft)) @@ -792,7 +794,7 @@ static struct mlx5_ib_flow_prio *get_flow_table(struct mlx5_ib_dev *dev, ft = prio->flow_table; if (!ft) return _get_prio(dev, ns, prio, priority, max_table_size, - num_groups, flags); + num_groups, flags, 0); return prio; } @@ -935,7 +937,7 @@ int mlx5_ib_fs_add_op_fc(struct mlx5_ib_dev *dev, u32 port_num, prio = &dev->flow_db->opfcs[type]; if (!prio->flow_table) { prio = _get_prio(dev, ns, prio, priority, - dev->num_ports * MAX_OPFC_RULES, 1, 0); + dev->num_ports * MAX_OPFC_RULES, 1, 0, 0); if (IS_ERR(prio)) { err = PTR_ERR(prio); goto free; @@ -1413,17 +1415,51 @@ free_ucmd: return ERR_PTR(err); } +static int mlx5_ib_fill_transport_ns_info(struct mlx5_ib_dev *dev, + enum mlx5_flow_namespace_type type, + u32 *flags, u16 *vport_idx, + u16 *vport, + struct mlx5_core_dev **ft_mdev, + u32 ib_port) +{ + struct mlx5_core_dev *esw_mdev; + + if (!is_mdev_switchdev_mode(dev->mdev)) + return 0; + + if (!MLX5_CAP_ADV_RDMA(dev->mdev, rdma_transport_manager)) + return -EOPNOTSUPP; + + if (!dev->port[ib_port - 1].rep) + return -EINVAL; + + esw_mdev = mlx5_eswitch_get_core_dev(dev->port[ib_port - 1].rep->esw); + if (esw_mdev != dev->mdev) + return -EOPNOTSUPP; + + *flags |= MLX5_FLOW_TABLE_OTHER_VPORT; + *ft_mdev = esw_mdev; + *vport = dev->port[ib_port - 1].rep->vport; + *vport_idx = dev->port[ib_port - 1].rep->vport_index; + + return 0; +} + static struct mlx5_ib_flow_prio * _get_flow_table(struct mlx5_ib_dev *dev, u16 user_priority, enum mlx5_flow_namespace_type ns_type, - bool mcast) + bool mcast, u32 ib_port) { + struct mlx5_core_dev *ft_mdev = dev->mdev; struct mlx5_flow_namespace *ns = NULL; struct mlx5_ib_flow_prio *prio = NULL; int max_table_size = 0; + u16 vport_idx = 0; bool esw_encap; u32 flags = 0; + u16 vport = 0; int priority; + int ret; if (mcast) priority = MLX5_IB_FLOW_MCAST_PRIO; @@ -1471,13 +1507,38 @@ _get_flow_table(struct mlx5_ib_dev *dev, u16 user_priority, MLX5_CAP_FLOWTABLE_RDMA_TX(dev->mdev, log_max_ft_size)); priority = user_priority; break; + case MLX5_FLOW_NAMESPACE_RDMA_TRANSPORT_RX: + case MLX5_FLOW_NAMESPACE_RDMA_TRANSPORT_TX: + if (ib_port == 0 || user_priority > 
MLX5_RDMA_TRANSPORT_BYPASS_PRIO) + return ERR_PTR(-EINVAL); + ret = mlx5_ib_fill_transport_ns_info(dev, ns_type, &flags, + &vport_idx, &vport, + &ft_mdev, ib_port); + if (ret) + return ERR_PTR(ret); + + if (ns_type == MLX5_FLOW_NAMESPACE_RDMA_TRANSPORT_RX) + max_table_size = + BIT(MLX5_CAP_FLOWTABLE_RDMA_TRANSPORT_RX( + ft_mdev, log_max_ft_size)); + else + max_table_size = + BIT(MLX5_CAP_FLOWTABLE_RDMA_TRANSPORT_TX( + ft_mdev, log_max_ft_size)); + priority = user_priority; + break; default: break; } max_table_size = min_t(int, max_table_size, MLX5_FS_MAX_ENTRIES); - ns = mlx5_get_flow_namespace(dev->mdev, ns_type); + if (ns_type == MLX5_FLOW_NAMESPACE_RDMA_TRANSPORT_RX || + ns_type == MLX5_FLOW_NAMESPACE_RDMA_TRANSPORT_TX) + ns = mlx5_get_flow_vport_namespace(ft_mdev, ns_type, vport_idx); + else + ns = mlx5_get_flow_namespace(ft_mdev, ns_type); + if (!ns) return ERR_PTR(-EOPNOTSUPP); @@ -1497,6 +1558,12 @@ _get_flow_table(struct mlx5_ib_dev *dev, u16 user_priority, case MLX5_FLOW_NAMESPACE_RDMA_TX: prio = &dev->flow_db->rdma_tx[priority]; break; + case MLX5_FLOW_NAMESPACE_RDMA_TRANSPORT_RX: + prio = &dev->flow_db->rdma_transport_rx[ib_port - 1]; + break; + case MLX5_FLOW_NAMESPACE_RDMA_TRANSPORT_TX: + prio = &dev->flow_db->rdma_transport_tx[ib_port - 1]; + break; default: return ERR_PTR(-EINVAL); } @@ -1507,7 +1574,7 @@ _get_flow_table(struct mlx5_ib_dev *dev, u16 user_priority, return prio; return _get_prio(dev, ns, prio, priority, max_table_size, - MLX5_FS_MAX_TYPES, flags); + MLX5_FS_MAX_TYPES, flags, vport); } static struct mlx5_ib_flow_handler * @@ -1626,7 +1693,8 @@ static struct mlx5_ib_flow_handler *raw_fs_rule_add( mutex_lock(&dev->flow_db->lock); ft_prio = _get_flow_table(dev, fs_matcher->priority, - fs_matcher->ns_type, mcast); + fs_matcher->ns_type, mcast, + fs_matcher->ib_port); if (IS_ERR(ft_prio)) { err = PTR_ERR(ft_prio); goto unlock; @@ -1742,6 +1810,12 @@ mlx5_ib_ft_type_to_namespace(enum mlx5_ib_uapi_flow_table_type table_type, case MLX5_IB_UAPI_FLOW_TABLE_TYPE_RDMA_TX: *namespace = MLX5_FLOW_NAMESPACE_RDMA_TX; break; + case MLX5_IB_UAPI_FLOW_TABLE_TYPE_RDMA_TRANSPORT_RX: + *namespace = MLX5_FLOW_NAMESPACE_RDMA_TRANSPORT_RX; + break; + case MLX5_IB_UAPI_FLOW_TABLE_TYPE_RDMA_TRANSPORT_TX: + *namespace = MLX5_FLOW_NAMESPACE_RDMA_TRANSPORT_TX; + break; default: return -EINVAL; } @@ -1831,7 +1905,8 @@ static int get_dests(struct uverbs_attr_bundle *attrs, return -EINVAL; /* Allow only DEVX object or QP as dest when inserting to RDMA_RX */ - if ((fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_RDMA_RX) && + if ((fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_RDMA_RX || + fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_RDMA_TRANSPORT_RX) && ((!dest_devx && !dest_qp) || (dest_devx && dest_qp))) return -EINVAL; @@ -1848,7 +1923,8 @@ static int get_dests(struct uverbs_attr_bundle *attrs, return -EINVAL; /* Allow only flow table as dest when inserting to FDB or RDMA_RX */ if ((fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_FDB_BYPASS || - fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_RDMA_RX) && + fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_RDMA_RX || + fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_RDMA_TRANSPORT_RX) && *dest_type != MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE) return -EINVAL; } else if (dest_qp) { @@ -1869,14 +1945,16 @@ static int get_dests(struct uverbs_attr_bundle *attrs, *dest_id = mqp->raw_packet_qp.rq.tirn; *dest_type = MLX5_FLOW_DESTINATION_TYPE_TIR; } else if ((fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_EGRESS || - fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_RDMA_TX) && + 
fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_RDMA_TX || + fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_RDMA_TRANSPORT_TX) && !(*flags & MLX5_IB_ATTR_CREATE_FLOW_FLAGS_DROP)) { *dest_type = MLX5_FLOW_DESTINATION_TYPE_PORT; } if (*dest_type == MLX5_FLOW_DESTINATION_TYPE_TIR && (fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_EGRESS || - fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_RDMA_TX)) + fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_RDMA_TX || + fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_RDMA_TRANSPORT_TX)) return -EINVAL; return 0; @@ -2353,6 +2431,15 @@ static int mlx5_ib_matcher_ns(struct uverbs_attr_bundle *attrs, return 0; } +static bool verify_context_caps(struct mlx5_ib_dev *dev, u64 enabled_caps) +{ + if (is_mdev_switchdev_mode(dev->mdev)) + return UCAP_ENABLED(enabled_caps, + RDMA_UCAP_MLX5_CTRL_OTHER_VHCA); + + return UCAP_ENABLED(enabled_caps, RDMA_UCAP_MLX5_CTRL_LOCAL); +} + static int UVERBS_HANDLER(MLX5_IB_METHOD_FLOW_MATCHER_CREATE)( struct uverbs_attr_bundle *attrs) { @@ -2401,6 +2488,26 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_FLOW_MATCHER_CREATE)( goto end; } + if (uverbs_attr_is_valid(attrs, MLX5_IB_ATTR_FLOW_MATCHER_IB_PORT)) { + err = uverbs_copy_from(&obj->ib_port, attrs, + MLX5_IB_ATTR_FLOW_MATCHER_IB_PORT); + if (err) + goto end; + if (!rdma_is_port_valid(&dev->ib_dev, obj->ib_port)) { + err = -EINVAL; + goto end; + } + if (obj->ns_type != MLX5_FLOW_NAMESPACE_RDMA_TRANSPORT_RX && + obj->ns_type != MLX5_FLOW_NAMESPACE_RDMA_TRANSPORT_TX) { + err = -EINVAL; + goto end; + } + if (!verify_context_caps(dev, uobj->context->enabled_caps)) { + err = -EOPNOTSUPP; + goto end; + } + } + uobj->object = obj; obj->mdev = dev->mdev; atomic_set(&obj->usecnt, 0); @@ -2448,7 +2555,7 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_STEERING_ANCHOR_CREATE)( mutex_lock(&dev->flow_db->lock); - ft_prio = _get_flow_table(dev, priority, ns_type, 0); + ft_prio = _get_flow_table(dev, priority, ns_type, 0, 0); if (IS_ERR(ft_prio)) { err = PTR_ERR(ft_prio); goto free_obj; @@ -2834,7 +2941,10 @@ DECLARE_UVERBS_NAMED_METHOD( UA_OPTIONAL), UVERBS_ATTR_CONST_IN(MLX5_IB_ATTR_FLOW_MATCHER_FT_TYPE, enum mlx5_ib_uapi_flow_table_type, - UA_OPTIONAL)); + UA_OPTIONAL), + UVERBS_ATTR_PTR_IN(MLX5_IB_ATTR_FLOW_MATCHER_IB_PORT, + UVERBS_ATTR_TYPE(u32), + UA_OPTIONAL)); DECLARE_UVERBS_NAMED_METHOD_DESTROY( MLX5_IB_METHOD_FLOW_MATCHER_DESTROY, @@ -2904,8 +3014,26 @@ int mlx5_ib_fs_init(struct mlx5_ib_dev *dev) if (!dev->flow_db) return -ENOMEM; + dev->flow_db->rdma_transport_rx = kcalloc(dev->num_ports, + sizeof(struct mlx5_ib_flow_prio), + GFP_KERNEL); + if (!dev->flow_db->rdma_transport_rx) + goto free_flow_db; + + dev->flow_db->rdma_transport_tx = kcalloc(dev->num_ports, + sizeof(struct mlx5_ib_flow_prio), + GFP_KERNEL); + if (!dev->flow_db->rdma_transport_tx) + goto free_rdma_transport_rx; + mutex_init(&dev->flow_db->lock); ib_set_device_ops(&dev->ib_dev, &flow_ops); return 0; + +free_rdma_transport_rx: + kfree(dev->flow_db->rdma_transport_rx); +free_flow_db: + kfree(dev->flow_db); + return -ENOMEM; } diff --git a/drivers/infiniband/hw/mlx5/fs.h b/drivers/infiniband/hw/mlx5/fs.h index b9734904f5f0..0516555eb1c1 100644 --- a/drivers/infiniband/hw/mlx5/fs.h +++ b/drivers/infiniband/hw/mlx5/fs.h @@ -40,6 +40,8 @@ static inline void mlx5_ib_fs_cleanup(struct mlx5_ib_dev *dev) * is a safe assumption that all references are gone. 
*/ mlx5_ib_fs_cleanup_anchor(dev); + kfree(dev->flow_db->rdma_transport_tx); + kfree(dev->flow_db->rdma_transport_rx); kfree(dev->flow_db); } #endif /* _MLX5_IB_FS_H */ diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index 974a45c92fbb..ccaaef20f50d 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -276,6 +276,7 @@ struct mlx5_ib_flow_matcher { struct mlx5_core_dev *mdev; atomic_t usecnt; u8 match_criteria_enable; + u32 ib_port; }; struct mlx5_ib_steering_anchor { @@ -307,6 +308,8 @@ struct mlx5_ib_flow_db { struct mlx5_ib_flow_prio rdma_tx[MLX5_IB_NUM_FLOW_FT]; struct mlx5_ib_flow_prio opfcs[MLX5_IB_OPCOUNTER_MAX]; struct mlx5_flow_table *lag_demux_ft; + struct mlx5_ib_flow_prio *rdma_transport_rx; + struct mlx5_ib_flow_prio *rdma_transport_tx; /* Protect flow steering bypass flow tables * when add/del flow rules. * only single add/removal of flow steering rule could be done diff --git a/include/uapi/rdma/mlx5_user_ioctl_cmds.h b/include/uapi/rdma/mlx5_user_ioctl_cmds.h index fd2e4a3a56b3..18f9fe070213 100644 --- a/include/uapi/rdma/mlx5_user_ioctl_cmds.h +++ b/include/uapi/rdma/mlx5_user_ioctl_cmds.h @@ -239,6 +239,7 @@ enum mlx5_ib_flow_matcher_create_attrs { MLX5_IB_ATTR_FLOW_MATCHER_MATCH_CRITERIA, MLX5_IB_ATTR_FLOW_MATCHER_FLOW_FLAGS, MLX5_IB_ATTR_FLOW_MATCHER_FT_TYPE, + MLX5_IB_ATTR_FLOW_MATCHER_IB_PORT, }; enum mlx5_ib_flow_matcher_destroy_attrs { diff --git a/include/uapi/rdma/mlx5_user_ioctl_verbs.h b/include/uapi/rdma/mlx5_user_ioctl_verbs.h index 7c233df475e7..8f86e79d78a5 100644 --- a/include/uapi/rdma/mlx5_user_ioctl_verbs.h +++ b/include/uapi/rdma/mlx5_user_ioctl_verbs.h @@ -45,6 +45,8 @@ enum mlx5_ib_uapi_flow_table_type { MLX5_IB_UAPI_FLOW_TABLE_TYPE_FDB = 0x2, MLX5_IB_UAPI_FLOW_TABLE_TYPE_RDMA_RX = 0x3, MLX5_IB_UAPI_FLOW_TABLE_TYPE_RDMA_TX = 0x4, + MLX5_IB_UAPI_FLOW_TABLE_TYPE_RDMA_TRANSPORT_RX = 0x5, + MLX5_IB_UAPI_FLOW_TABLE_TYPE_RDMA_TRANSPORT_TX = 0x6, }; enum mlx5_ib_uapi_flow_action_packet_reformat_type { -- cgit v1.2.3-59-g8ed1b From 8b6745b9f6b209ff3d535416a15e60743b6cbcba Mon Sep 17 00:00:00 2001 From: Guofeng Yue Date: Tue, 11 Mar 2025 16:48:51 +0800 Subject: RDMA/hns: Inappropriate format characters cleanup Use %u for unsigned type and %d for enum. 
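As a standalone illustration of the rule (example_dump() and its fields are hypothetical): an unsigned int is printed with %u, while an enum, which is promoted to int through varargs, is printed with %d, so the specifier matches what the compiler actually passes.

	#include <linux/printk.h>

	enum example_event { EXAMPLE_EVENT_ERR = 1 };

	static void example_dump(unsigned int region_count, enum example_event ev)
	{
		/* %u matches the unsigned field, %d the int-promoted enum */
		pr_err("region count %u, event type %d\n", region_count, ev);
	}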
Signed-off-by: Guofeng Yue Signed-off-by: Junxian Huang Link: https://patch.msgid.link/20250311084857.3803665-2-huangjunxian6@hisilicon.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/hns/hns_roce_mr.c | 2 +- drivers/infiniband/hw/hns/hns_roce_qp.c | 2 +- drivers/infiniband/hw/hns/hns_roce_srq.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'drivers') diff --git a/drivers/infiniband/hw/hns/hns_roce_mr.c b/drivers/infiniband/hw/hns/hns_roce_mr.c index 55b9283bfc6f..09da3496843b 100644 --- a/drivers/infiniband/hw/hns/hns_roce_mr.c +++ b/drivers/infiniband/hw/hns/hns_roce_mr.c @@ -998,7 +998,7 @@ static bool is_buf_attr_valid(struct hns_roce_dev *hr_dev, if (attr->region_count > ARRAY_SIZE(attr->region) || attr->region_count < 1 || attr->page_shift < HNS_HW_PAGE_SHIFT) { ibdev_err(ibdev, - "invalid buf attr, region count %d, page shift %u.\n", + "invalid buf attr, region count %u, page shift %u.\n", attr->region_count, attr->page_shift); return false; } diff --git a/drivers/infiniband/hw/hns/hns_roce_qp.c b/drivers/infiniband/hw/hns/hns_roce_qp.c index 9e2e76c59406..ccfef673a976 100644 --- a/drivers/infiniband/hw/hns/hns_roce_qp.c +++ b/drivers/infiniband/hw/hns/hns_roce_qp.c @@ -1319,7 +1319,7 @@ int hns_roce_create_qp(struct ib_qp *qp, struct ib_qp_init_attr *init_attr, ret = hns_roce_create_qp_common(hr_dev, init_attr, udata, hr_qp); if (ret) - ibdev_err(ibdev, "create QP type 0x%x failed(%d)\n", + ibdev_err(ibdev, "create QP type %d failed(%d)\n", init_attr->qp_type, ret); err_out: diff --git a/drivers/infiniband/hw/hns/hns_roce_srq.c b/drivers/infiniband/hw/hns/hns_roce_srq.c index 70c06ef65603..1090051f493b 100644 --- a/drivers/infiniband/hw/hns/hns_roce_srq.c +++ b/drivers/infiniband/hw/hns/hns_roce_srq.c @@ -51,7 +51,7 @@ static void hns_roce_ib_srq_event(struct hns_roce_srq *srq, break; default: dev_err(hr_dev->dev, - "hns_roce:Unexpected event type 0x%x on SRQ %06lx\n", + "hns_roce:Unexpected event type %d on SRQ %06lx\n", event_type, srq->srqn); return; } -- cgit v1.2.3-59-g8ed1b From 0a924decd4a3a27c557a6d3c29add61d45ab592a Mon Sep 17 00:00:00 2001 From: Daisuke Matsuda Date: Wed, 12 Mar 2025 15:59:37 +0900 Subject: RDMA/rxe: Improve readability of ODP pagefault interface Use a meaningful constant explicitly instead of hard-coding a literal. 
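The pattern, restated as a minimal sketch (the EXAMPLE_* names are hypothetical stand-ins for the RXE_PAGEFAULT_* constants in the diff below): the "no flags" case gets an explicit name and the real flag bits start at BIT(0), so call sites state their intent instead of passing a bare 0.

	#include <linux/bits.h>
	#include <linux/types.h>

	#define EXAMPLE_PAGEFAULT_DEFAULT	0
	#define EXAMPLE_PAGEFAULT_RDONLY	BIT(0)
	#define EXAMPLE_PAGEFAULT_SNAPSHOT	BIT(1)

	static u32 example_pagefault_flags(bool readonly)
	{
		u32 flags = EXAMPLE_PAGEFAULT_DEFAULT;	/* named, not a bare 0 */

		if (readonly)
			flags |= EXAMPLE_PAGEFAULT_RDONLY;

		return flags;
	}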
Signed-off-by: Daisuke Matsuda Link: https://patch.msgid.link/20250312065937.1787241-1-matsuda-daisuke@fujitsu.com Reviewed-by: Zhu Yanjun Signed-off-by: Leon Romanovsky --- drivers/infiniband/sw/rxe/rxe_odp.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'drivers') diff --git a/drivers/infiniband/sw/rxe/rxe_odp.c b/drivers/infiniband/sw/rxe/rxe_odp.c index a82e5011360c..94f7bbe14981 100644 --- a/drivers/infiniband/sw/rxe/rxe_odp.c +++ b/drivers/infiniband/sw/rxe/rxe_odp.c @@ -37,8 +37,9 @@ const struct mmu_interval_notifier_ops rxe_mn_ops = { .invalidate = rxe_ib_invalidate_range, }; -#define RXE_PAGEFAULT_RDONLY BIT(1) -#define RXE_PAGEFAULT_SNAPSHOT BIT(2) +#define RXE_PAGEFAULT_DEFAULT 0 +#define RXE_PAGEFAULT_RDONLY BIT(0) +#define RXE_PAGEFAULT_SNAPSHOT BIT(1) static int rxe_odp_do_pagefault_and_lock(struct rxe_mr *mr, u64 user_va, int bcnt, u32 flags) { struct ib_umem_odp *umem_odp = to_ib_umem_odp(mr->umem); @@ -222,7 +223,7 @@ int rxe_odp_mr_copy(struct rxe_mr *mr, u64 iova, void *addr, int length, enum rxe_mr_copy_dir dir) { struct ib_umem_odp *umem_odp = to_ib_umem_odp(mr->umem); - u32 flags = 0; + u32 flags = RXE_PAGEFAULT_DEFAULT; int err; if (length == 0) @@ -236,7 +237,7 @@ int rxe_odp_mr_copy(struct rxe_mr *mr, u64 iova, void *addr, int length, break; case RXE_FROM_MR_OBJ: - flags = RXE_PAGEFAULT_RDONLY; + flags |= RXE_PAGEFAULT_RDONLY; break; default: @@ -312,7 +313,8 @@ int rxe_odp_atomic_op(struct rxe_mr *mr, u64 iova, int opcode, struct ib_umem_odp *umem_odp = to_ib_umem_odp(mr->umem); int err; - err = rxe_odp_map_range_and_lock(mr, iova, sizeof(char), 0); + err = rxe_odp_map_range_and_lock(mr, iova, sizeof(char), + RXE_PAGEFAULT_DEFAULT); if (err < 0) return err; -- cgit v1.2.3-59-g8ed1b From a8445cfec101c42e9d64cdb2dac13973b22c205c Mon Sep 17 00:00:00 2001 From: Long Li Date: Wed, 12 Mar 2025 16:15:31 -0700 Subject: net: mana: Change the function signature of mana_get_primary_netdev_rcu Change mana_get_primary_netdev_rcu() to mana_get_primary_netdev(), and return the ndev with refcount held. The caller is responsible for dropping the refcount. Also drop the check for IFF_SLAVE as it is not necessary if the upper device is present. 
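From the caller's side, the new contract reads as follows (a sketch assuming a minimal consumer; the work done while the reference is held is elided):

	#include <linux/netdevice.h>
	#include <net/mana/mana.h>

	static int example_use_primary_netdev(struct mana_context *ac)
	{
		netdevice_tracker tracker;
		struct net_device *ndev;

		/* Returned with a tracked reference held; no RCU lock needed */
		ndev = mana_get_primary_netdev(ac, 0, &tracker);
		if (!ndev)
			return -ENODEV;

		/* ... use ndev while the reference is held ... */

		/* The caller is responsible for dropping the refcount */
		netdev_put(ndev, &tracker);
		return 0;
	}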
Signed-off-by: Long Li Link: https://patch.msgid.link/1741821332-9392-1-git-send-email-longli@linuxonhyperv.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mana/device.c | 7 +++---- drivers/infiniband/hw/mana/mana_ib.h | 1 + drivers/net/ethernet/microsoft/mana/mana_en.c | 22 ++++++++++++++-------- include/net/mana/mana.h | 4 +++- 4 files changed, 21 insertions(+), 13 deletions(-) (limited to 'drivers') diff --git a/drivers/infiniband/hw/mana/device.c b/drivers/infiniband/hw/mana/device.c index d1a02c54a236..9357a9845c2c 100644 --- a/drivers/infiniband/hw/mana/device.c +++ b/drivers/infiniband/hw/mana/device.c @@ -98,10 +98,8 @@ static int mana_ib_probe(struct auxiliary_device *adev, dev->ib_dev.num_comp_vectors = mdev->gdma_context->max_num_queues; dev->ib_dev.dev.parent = mdev->gdma_context->dev; - rcu_read_lock(); /* required to get primary netdev */ - ndev = mana_get_primary_netdev_rcu(mc, 0); + ndev = mana_get_primary_netdev(mc, 0, &dev->dev_tracker); if (!ndev) { - rcu_read_unlock(); ret = -ENODEV; ibdev_err(&dev->ib_dev, "Failed to get netdev for IB port 1"); goto free_ib_device; @@ -109,7 +107,8 @@ static int mana_ib_probe(struct auxiliary_device *adev, ether_addr_copy(mac_addr, ndev->dev_addr); addrconf_addr_eui48((u8 *)&dev->ib_dev.node_guid, ndev->dev_addr); ret = ib_device_set_netdev(&dev->ib_dev, ndev, 1); - rcu_read_unlock(); + /* mana_get_primary_netdev() returns ndev with refcount held */ + netdev_put(ndev, &dev->dev_tracker); if (ret) { ibdev_err(&dev->ib_dev, "Failed to set ib netdev, ret %d", ret); goto free_ib_device; diff --git a/drivers/infiniband/hw/mana/mana_ib.h b/drivers/infiniband/hw/mana/mana_ib.h index 77fc1032eda8..81a7e7474462 100644 --- a/drivers/infiniband/hw/mana/mana_ib.h +++ b/drivers/infiniband/hw/mana/mana_ib.h @@ -78,6 +78,7 @@ struct mana_ib_dev { struct xarray qp_table_wq; struct mana_ib_adapter_caps adapter_caps; struct dma_pool *av_pool; + netdevice_tracker dev_tracker; }; struct mana_ib_wq { diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c index aa1e47233fe5..4e870b11f946 100644 --- a/drivers/net/ethernet/microsoft/mana/mana_en.c +++ b/drivers/net/ethernet/microsoft/mana/mana_en.c @@ -3131,21 +3131,27 @@ out: kfree(ac); } -struct net_device *mana_get_primary_netdev_rcu(struct mana_context *ac, u32 port_index) +struct net_device *mana_get_primary_netdev(struct mana_context *ac, + u32 port_index, + netdevice_tracker *tracker) { struct net_device *ndev; - RCU_LOCKDEP_WARN(!rcu_read_lock_held(), - "Taking primary netdev without holding the RCU read lock"); if (port_index >= ac->num_ports) return NULL; - /* When mana is used in netvsc, the upper netdevice should be returned. */ - if (ac->ports[port_index]->flags & IFF_SLAVE) - ndev = netdev_master_upper_dev_get_rcu(ac->ports[port_index]); - else + rcu_read_lock(); + + /* If mana is used in netvsc, the upper netdevice should be returned. 
*/ + ndev = netdev_master_upper_dev_get_rcu(ac->ports[port_index]); + + /* If there is no upper device, use the parent Ethernet device */ + if (!ndev) ndev = ac->ports[port_index]; + netdev_hold(ndev, tracker, GFP_ATOMIC); + rcu_read_unlock(); + return ndev; } -EXPORT_SYMBOL_NS(mana_get_primary_netdev_rcu, "NET_MANA"); +EXPORT_SYMBOL_NS(mana_get_primary_netdev, "NET_MANA"); diff --git a/include/net/mana/mana.h b/include/net/mana/mana.h index 0d00b24eacaf..0f78065de8fe 100644 --- a/include/net/mana/mana.h +++ b/include/net/mana/mana.h @@ -827,5 +827,7 @@ int mana_cfg_vport(struct mana_port_context *apc, u32 protection_dom_id, u32 doorbell_pg_id); void mana_uncfg_vport(struct mana_port_context *apc); -struct net_device *mana_get_primary_netdev_rcu(struct mana_context *ac, u32 port_index); +struct net_device *mana_get_primary_netdev(struct mana_context *ac, + u32 port_index, + netdevice_tracker *tracker); #endif /* _MANA_H */ -- cgit v1.2.3-59-g8ed1b From bee35b7161aaaed9831e2f14876c374b9c566952 Mon Sep 17 00:00:00 2001 From: Long Li Date: Wed, 12 Mar 2025 16:15:32 -0700 Subject: RDMA/mana_ib: Handle net event for pointing to the current netdev When running under Hyper-V, the master device for the RDMA device is always bonded to that RDMA device; this is not user-configurable. The master device can, however, be unbound and rebound from the kernel. During those events, the RDMA device should be pointed at the current netdev to reflect the change of master device. Signed-off-by: Long Li Link: https://patch.msgid.link/1741821332-9392-2-git-send-email-longli@linuxonhyperv.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mana/device.c | 47 ++++++++++++++++++++++++++++++++++-- drivers/infiniband/hw/mana/mana_ib.h | 1 + 2 files changed, 46 insertions(+), 2 deletions(-) (limited to 'drivers') diff --git a/drivers/infiniband/hw/mana/device.c b/drivers/infiniband/hw/mana/device.c index 9357a9845c2c..b31089320aa5 100644 --- a/drivers/infiniband/hw/mana/device.c +++ b/drivers/infiniband/hw/mana/device.c @@ -65,6 +65,38 @@ static const struct ib_device_ops mana_ib_stats_ops = { .get_hw_stats = mana_ib_get_hw_stats, }; +static int mana_ib_netdev_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + struct mana_ib_dev *dev = container_of(this, struct mana_ib_dev, nb); + struct net_device *event_dev = netdev_notifier_info_to_dev(ptr); + struct gdma_context *gc = dev->gdma_dev->gdma_context; + struct mana_context *mc = gc->mana.driver_data; + struct net_device *ndev; + + /* Only process events from our parent device */ + if (event_dev != mc->ports[0]) + return NOTIFY_DONE; + + switch (event) { + case NETDEV_CHANGEUPPER: + ndev = mana_get_primary_netdev(mc, 0, &dev->dev_tracker); + /* + * RDMA core will setup GID based on updated netdev. + * It's not possible to race with the core as rtnl lock is being + * held.
+ */ + ib_device_set_netdev(&dev->ib_dev, ndev, 1); + + /* mana_get_primary_netdev() returns ndev with refcount held */ + netdev_put(ndev, &dev->dev_tracker); + + return NOTIFY_OK; + default: + return NOTIFY_DONE; + } +} + static int mana_ib_probe(struct auxiliary_device *adev, const struct auxiliary_device_id *id) { @@ -122,11 +154,19 @@ static int mana_ib_probe(struct auxiliary_device *adev, } dev->gdma_dev = &mdev->gdma_context->mana_ib; + dev->nb.notifier_call = mana_ib_netdev_event; + ret = register_netdevice_notifier(&dev->nb); + if (ret) { + ibdev_err(&dev->ib_dev, "Failed to register net notifier, %d", + ret); + goto deregister_device; + } + ret = mana_ib_gd_query_adapter_caps(dev); if (ret) { ibdev_err(&dev->ib_dev, "Failed to query device caps, ret %d", ret); - goto deregister_device; + goto deregister_net_notifier; } ib_set_device_ops(&dev->ib_dev, &mana_ib_stats_ops); @@ -134,7 +174,7 @@ static int mana_ib_probe(struct auxiliary_device *adev, ret = mana_ib_create_eqs(dev); if (ret) { ibdev_err(&dev->ib_dev, "Failed to create EQs, ret %d", ret); - goto deregister_device; + goto deregister_net_notifier; } ret = mana_ib_gd_create_rnic_adapter(dev); @@ -172,6 +212,8 @@ destroy_rnic: mana_ib_gd_destroy_rnic_adapter(dev); destroy_eqs: mana_ib_destroy_eqs(dev); +deregister_net_notifier: + unregister_netdevice_notifier(&dev->nb); deregister_device: mana_gd_deregister_device(dev->gdma_dev); free_ib_device: @@ -188,6 +230,7 @@ static void mana_ib_remove(struct auxiliary_device *adev) xa_destroy(&dev->qp_table_wq); mana_ib_gd_destroy_rnic_adapter(dev); mana_ib_destroy_eqs(dev); + unregister_netdevice_notifier(&dev->nb); mana_gd_deregister_device(dev->gdma_dev); ib_dealloc_device(&dev->ib_dev); } diff --git a/drivers/infiniband/hw/mana/mana_ib.h b/drivers/infiniband/hw/mana/mana_ib.h index 81a7e7474462..6903946677e5 100644 --- a/drivers/infiniband/hw/mana/mana_ib.h +++ b/drivers/infiniband/hw/mana/mana_ib.h @@ -79,6 +79,7 @@ struct mana_ib_dev { struct mana_ib_adapter_caps adapter_caps; struct dma_pool *av_pool; netdevice_tracker dev_tracker; + struct notifier_block nb; }; struct mana_ib_wq { -- cgit v1.2.3-59-g8ed1b From 81f8f7454ad9e0bf95efdec6542afdc9a6ab1e24 Mon Sep 17 00:00:00 2001 From: Maher Sanalla Date: Wed, 26 Feb 2025 15:54:13 +0200 Subject: RDMA/uverbs: Propagate errors from rdma_lookup_get_uobject() Currently, the IB uverbs API calls uobj_get_uobj_read(), which in turn uses the rdma_lookup_get_uobject() helper to retrieve user objects. In case of failure, uobj_get_uobj_read() returns NULL, overriding the error code from rdma_lookup_get_uobject(). The IB uverbs API then translates this NULL to -EINVAL, masking the actual error and complicating debugging. For example, applications calling ibv_modify_qp that fails with EBUSY when retrieving the QP uobject will see the overridden error code EINVAL instead, masking the actual error. Furthermore, based on rdma-core commit: "2a22f1ced5f3 ("Merge pull request #1568 from jakemoroni/master")" Kernel's IB uverbs return values are either ignored and passed on as is to application or overridden with other errnos in a few cases. Thus, to improve error reporting and debuggability, propagate the original error from rdma_lookup_get_uobject() instead of replacing it with EINVAL. 
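The conversion below is an instance of the kernel's ERR_PTR convention; a minimal, self-contained illustration of why it preserves the original errno (hypothetical functions, not part of the patch):

#include <linux/err.h>
#include <rdma/ib_verbs.h>

/* A lookup that encodes the real errno in the returned pointer. */
static struct ib_pd *example_lookup_pd(bool busy)
{
        return busy ? ERR_PTR(-EBUSY) : ERR_PTR(-EINVAL);
}

static int example_caller(void)
{
        struct ib_pd *pd = example_lookup_pd(true);

        if (IS_ERR(pd))
                return PTR_ERR(pd);     /* caller sees -EBUSY, not a blanket -EINVAL */
        return 0;
}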
Signed-off-by: Maher Sanalla Link: https://patch.msgid.link/64f9d3711b183984e939962c2f83383904f97dfb.1740577869.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/uverbs_cmd.c | 144 ++++++++++++++++++----------------- include/rdma/uverbs_std_types.h | 2 +- 2 files changed, 77 insertions(+), 69 deletions(-) (limited to 'drivers') diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 96d639e1ffa0..3c3bb670c805 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -735,8 +735,8 @@ static int ib_uverbs_reg_mr(struct uverbs_attr_bundle *attrs) goto err_free; pd = uobj_get_obj_read(pd, UVERBS_OBJECT_PD, cmd.pd_handle, attrs); - if (!pd) { - ret = -EINVAL; + if (IS_ERR(pd)) { + ret = PTR_ERR(pd); goto err_free; } @@ -826,8 +826,8 @@ static int ib_uverbs_rereg_mr(struct uverbs_attr_bundle *attrs) if (cmd.flags & IB_MR_REREG_PD) { new_pd = uobj_get_obj_read(pd, UVERBS_OBJECT_PD, cmd.pd_handle, attrs); - if (!new_pd) { - ret = -EINVAL; + if (IS_ERR(new_pd)) { + ret = PTR_ERR(new_pd); goto put_uobjs; } } else { @@ -936,8 +936,8 @@ static int ib_uverbs_alloc_mw(struct uverbs_attr_bundle *attrs) return PTR_ERR(uobj); pd = uobj_get_obj_read(pd, UVERBS_OBJECT_PD, cmd.pd_handle, attrs); - if (!pd) { - ret = -EINVAL; + if (IS_ERR(pd)) { + ret = PTR_ERR(pd); goto err_free; } @@ -1144,8 +1144,8 @@ static int ib_uverbs_resize_cq(struct uverbs_attr_bundle *attrs) return ret; cq = uobj_get_obj_read(cq, UVERBS_OBJECT_CQ, cmd.cq_handle, attrs); - if (!cq) - return -EINVAL; + if (IS_ERR(cq)) + return PTR_ERR(cq); ret = cq->device->ops.resize_cq(cq, cmd.cqe, &attrs->driver_udata); if (ret) @@ -1206,8 +1206,8 @@ static int ib_uverbs_poll_cq(struct uverbs_attr_bundle *attrs) return ret; cq = uobj_get_obj_read(cq, UVERBS_OBJECT_CQ, cmd.cq_handle, attrs); - if (!cq) - return -EINVAL; + if (IS_ERR(cq)) + return PTR_ERR(cq); /* we copy a struct ib_uverbs_poll_cq_resp to user space */ header_ptr = attrs->ucore.outbuf; @@ -1255,8 +1255,8 @@ static int ib_uverbs_req_notify_cq(struct uverbs_attr_bundle *attrs) return ret; cq = uobj_get_obj_read(cq, UVERBS_OBJECT_CQ, cmd.cq_handle, attrs); - if (!cq) - return -EINVAL; + if (IS_ERR(cq)) + return PTR_ERR(cq); ib_req_notify_cq(cq, cmd.solicited_only ? IB_CQ_SOLICITED : IB_CQ_NEXT_COMP); @@ -1338,8 +1338,8 @@ static int create_qp(struct uverbs_attr_bundle *attrs, ind_tbl = uobj_get_obj_read(rwq_ind_table, UVERBS_OBJECT_RWQ_IND_TBL, cmd->rwq_ind_tbl_handle, attrs); - if (!ind_tbl) { - ret = -EINVAL; + if (IS_ERR(ind_tbl)) { + ret = PTR_ERR(ind_tbl); goto err_put; } @@ -1377,8 +1377,10 @@ static int create_qp(struct uverbs_attr_bundle *attrs, if (cmd->is_srq) { srq = uobj_get_obj_read(srq, UVERBS_OBJECT_SRQ, cmd->srq_handle, attrs); - if (!srq || srq->srq_type == IB_SRQT_XRC) { - ret = -EINVAL; + if (IS_ERR(srq) || + srq->srq_type == IB_SRQT_XRC) { + ret = IS_ERR(srq) ? 
PTR_ERR(srq) : + -EINVAL; goto err_put; } } @@ -1388,23 +1390,29 @@ static int create_qp(struct uverbs_attr_bundle *attrs, rcq = uobj_get_obj_read( cq, UVERBS_OBJECT_CQ, cmd->recv_cq_handle, attrs); - if (!rcq) { - ret = -EINVAL; + if (IS_ERR(rcq)) { + ret = PTR_ERR(rcq); goto err_put; } } } } - if (has_sq) + if (has_sq) { scq = uobj_get_obj_read(cq, UVERBS_OBJECT_CQ, cmd->send_cq_handle, attrs); + if (IS_ERR(scq)) { + ret = PTR_ERR(scq); + goto err_put; + } + } + if (!ind_tbl && cmd->qp_type != IB_QPT_XRC_INI) rcq = rcq ?: scq; pd = uobj_get_obj_read(pd, UVERBS_OBJECT_PD, cmd->pd_handle, attrs); - if (!pd || (!scq && has_sq)) { - ret = -EINVAL; + if (IS_ERR(pd)) { + ret = PTR_ERR(pd); goto err_put; } @@ -1499,18 +1507,18 @@ static int create_qp(struct uverbs_attr_bundle *attrs, err_put: if (!IS_ERR(xrcd_uobj)) uobj_put_read(xrcd_uobj); - if (pd) + if (!IS_ERR_OR_NULL(pd)) uobj_put_obj_read(pd); - if (scq) + if (!IS_ERR_OR_NULL(scq)) rdma_lookup_put_uobject(&scq->uobject->uevent.uobject, UVERBS_LOOKUP_READ); - if (rcq && rcq != scq) + if (!IS_ERR_OR_NULL(rcq) && rcq != scq) rdma_lookup_put_uobject(&rcq->uobject->uevent.uobject, UVERBS_LOOKUP_READ); - if (srq) + if (!IS_ERR_OR_NULL(srq)) rdma_lookup_put_uobject(&srq->uobject->uevent.uobject, UVERBS_LOOKUP_READ); - if (ind_tbl) + if (!IS_ERR_OR_NULL(ind_tbl)) uobj_put_obj_read(ind_tbl); uobj_alloc_abort(&obj->uevent.uobject, attrs); @@ -1672,8 +1680,8 @@ static int ib_uverbs_query_qp(struct uverbs_attr_bundle *attrs) } qp = uobj_get_obj_read(qp, UVERBS_OBJECT_QP, cmd.qp_handle, attrs); - if (!qp) { - ret = -EINVAL; + if (IS_ERR(qp)) { + ret = PTR_ERR(qp); goto out; } @@ -1778,8 +1786,8 @@ static int modify_qp(struct uverbs_attr_bundle *attrs, qp = uobj_get_obj_read(qp, UVERBS_OBJECT_QP, cmd->base.qp_handle, attrs); - if (!qp) { - ret = -EINVAL; + if (IS_ERR(qp)) { + ret = PTR_ERR(qp); goto out; } @@ -2045,8 +2053,8 @@ static int ib_uverbs_post_send(struct uverbs_attr_bundle *attrs) return -ENOMEM; qp = uobj_get_obj_read(qp, UVERBS_OBJECT_QP, cmd.qp_handle, attrs); - if (!qp) { - ret = -EINVAL; + if (IS_ERR(qp)) { + ret = PTR_ERR(qp); goto out; } @@ -2083,9 +2091,9 @@ static int ib_uverbs_post_send(struct uverbs_attr_bundle *attrs) ud->ah = uobj_get_obj_read(ah, UVERBS_OBJECT_AH, user_wr->wr.ud.ah, attrs); - if (!ud->ah) { + if (IS_ERR(ud->ah)) { + ret = PTR_ERR(ud->ah); kfree(ud); - ret = -EINVAL; goto out_put; } ud->remote_qpn = user_wr->wr.ud.remote_qpn; @@ -2322,8 +2330,8 @@ static int ib_uverbs_post_recv(struct uverbs_attr_bundle *attrs) return PTR_ERR(wr); qp = uobj_get_obj_read(qp, UVERBS_OBJECT_QP, cmd.qp_handle, attrs); - if (!qp) { - ret = -EINVAL; + if (IS_ERR(qp)) { + ret = PTR_ERR(qp); goto out; } @@ -2373,8 +2381,8 @@ static int ib_uverbs_post_srq_recv(struct uverbs_attr_bundle *attrs) return PTR_ERR(wr); srq = uobj_get_obj_read(srq, UVERBS_OBJECT_SRQ, cmd.srq_handle, attrs); - if (!srq) { - ret = -EINVAL; + if (IS_ERR(srq)) { + ret = PTR_ERR(srq); goto out; } @@ -2430,8 +2438,8 @@ static int ib_uverbs_create_ah(struct uverbs_attr_bundle *attrs) } pd = uobj_get_obj_read(pd, UVERBS_OBJECT_PD, cmd.pd_handle, attrs); - if (!pd) { - ret = -EINVAL; + if (IS_ERR(pd)) { + ret = PTR_ERR(pd); goto err; } @@ -2500,8 +2508,8 @@ static int ib_uverbs_attach_mcast(struct uverbs_attr_bundle *attrs) return ret; qp = uobj_get_obj_read(qp, UVERBS_OBJECT_QP, cmd.qp_handle, attrs); - if (!qp) - return -EINVAL; + if (IS_ERR(qp)) + return PTR_ERR(qp); obj = qp->uobject; @@ -2550,8 +2558,8 @@ static int ib_uverbs_detach_mcast(struct uverbs_attr_bundle 
*attrs) return ret; qp = uobj_get_obj_read(qp, UVERBS_OBJECT_QP, cmd.qp_handle, attrs); - if (!qp) - return -EINVAL; + if (IS_ERR(qp)) + return PTR_ERR(qp); obj = qp->uobject; mutex_lock(&obj->mcast_lock); @@ -2685,8 +2693,8 @@ static int kern_spec_to_ib_spec_action(struct uverbs_attr_bundle *attrs, UVERBS_OBJECT_FLOW_ACTION, kern_spec->action.handle, attrs); - if (!ib_spec->action.act) - return -EINVAL; + if (IS_ERR(ib_spec->action.act)) + return PTR_ERR(ib_spec->action.act); ib_spec->action.size = sizeof(struct ib_flow_spec_action_handle); flow_resources_add(uflow_res, @@ -2703,8 +2711,8 @@ static int kern_spec_to_ib_spec_action(struct uverbs_attr_bundle *attrs, UVERBS_OBJECT_COUNTERS, kern_spec->flow_count.handle, attrs); - if (!ib_spec->flow_count.counters) - return -EINVAL; + if (IS_ERR(ib_spec->flow_count.counters)) + return PTR_ERR(ib_spec->flow_count.counters); ib_spec->flow_count.size = sizeof(struct ib_flow_spec_action_count); flow_resources_add(uflow_res, @@ -2922,14 +2930,14 @@ static int ib_uverbs_ex_create_wq(struct uverbs_attr_bundle *attrs) return PTR_ERR(obj); pd = uobj_get_obj_read(pd, UVERBS_OBJECT_PD, cmd.pd_handle, attrs); - if (!pd) { - err = -EINVAL; + if (IS_ERR(pd)) { + err = PTR_ERR(pd); goto err_uobj; } cq = uobj_get_obj_read(cq, UVERBS_OBJECT_CQ, cmd.cq_handle, attrs); - if (!cq) { - err = -EINVAL; + if (IS_ERR(cq)) { + err = PTR_ERR(cq); goto err_put_pd; } @@ -3030,8 +3038,8 @@ static int ib_uverbs_ex_modify_wq(struct uverbs_attr_bundle *attrs) return -EINVAL; wq = uobj_get_obj_read(wq, UVERBS_OBJECT_WQ, cmd.wq_handle, attrs); - if (!wq) - return -EINVAL; + if (IS_ERR(wq)) + return PTR_ERR(wq); if (cmd.attr_mask & IB_WQ_FLAGS) { wq_attr.flags = cmd.flags; @@ -3114,8 +3122,8 @@ static int ib_uverbs_ex_create_rwq_ind_table(struct uverbs_attr_bundle *attrs) num_read_wqs++) { wq = uobj_get_obj_read(wq, UVERBS_OBJECT_WQ, wqs_handles[num_read_wqs], attrs); - if (!wq) { - err = -EINVAL; + if (IS_ERR(wq)) { + err = PTR_ERR(wq); goto put_wqs; } @@ -3270,8 +3278,8 @@ static int ib_uverbs_ex_create_flow(struct uverbs_attr_bundle *attrs) } qp = uobj_get_obj_read(qp, UVERBS_OBJECT_QP, cmd.qp_handle, attrs); - if (!qp) { - err = -EINVAL; + if (IS_ERR(qp)) { + err = PTR_ERR(qp); goto err_uobj; } @@ -3417,15 +3425,15 @@ static int __uverbs_create_xsrq(struct uverbs_attr_bundle *attrs, if (ib_srq_has_cq(cmd->srq_type)) { attr.ext.cq = uobj_get_obj_read(cq, UVERBS_OBJECT_CQ, cmd->cq_handle, attrs); - if (!attr.ext.cq) { - ret = -EINVAL; + if (IS_ERR(attr.ext.cq)) { + ret = PTR_ERR(attr.ext.cq); goto err_put_xrcd; } } pd = uobj_get_obj_read(pd, UVERBS_OBJECT_PD, cmd->pd_handle, attrs); - if (!pd) { - ret = -EINVAL; + if (IS_ERR(pd)) { + ret = PTR_ERR(pd); goto err_put_cq; } @@ -3532,8 +3540,8 @@ static int ib_uverbs_modify_srq(struct uverbs_attr_bundle *attrs) return ret; srq = uobj_get_obj_read(srq, UVERBS_OBJECT_SRQ, cmd.srq_handle, attrs); - if (!srq) - return -EINVAL; + if (IS_ERR(srq)) + return PTR_ERR(srq); attr.max_wr = cmd.max_wr; attr.srq_limit = cmd.srq_limit; @@ -3560,8 +3568,8 @@ static int ib_uverbs_query_srq(struct uverbs_attr_bundle *attrs) return ret; srq = uobj_get_obj_read(srq, UVERBS_OBJECT_SRQ, cmd.srq_handle, attrs); - if (!srq) - return -EINVAL; + if (IS_ERR(srq)) + return PTR_ERR(srq); ret = ib_query_srq(srq, &attr); @@ -3686,8 +3694,8 @@ static int ib_uverbs_ex_modify_cq(struct uverbs_attr_bundle *attrs) return -EOPNOTSUPP; cq = uobj_get_obj_read(cq, UVERBS_OBJECT_CQ, cmd.cq_handle, attrs); - if (!cq) - return -EINVAL; + if (IS_ERR(cq)) + return 
PTR_ERR(cq); ret = rdma_set_cq_moderation(cq, cmd.attr.cq_count, cmd.attr.cq_period); diff --git a/include/rdma/uverbs_std_types.h b/include/rdma/uverbs_std_types.h index fe0512116958..555ea3d142a4 100644 --- a/include/rdma/uverbs_std_types.h +++ b/include/rdma/uverbs_std_types.h @@ -34,7 +34,7 @@ static inline void *_uobj_get_obj_read(struct ib_uobject *uobj) { if (IS_ERR(uobj)) - return NULL; + return ERR_CAST(uobj); return uobj->object; } #define uobj_get_obj_read(_object, _type, _id, _attrs) \ -- cgit v1.2.3-59-g8ed1b From 86ab05366b556a41843f071302aab94122cf9f8a Mon Sep 17 00:00:00 2001 From: Daisuke Matsuda Date: Thu, 13 Mar 2025 15:45:40 +0900 Subject: RDMA/rxe: Fix incorrect return value of rxe_odp_atomic_op() rxe_mr_do_atomic_op() returns enum resp_states numbers, so the ODP counterpart must not return raw errno codes. Signed-off-by: Daisuke Matsuda Reviewed-by: Zhu Yanjun Link: https://patch.msgid.link/20250313064540.2619115-1-matsuda-daisuke@fujitsu.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/sw/rxe/rxe_odp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/infiniband/sw/rxe/rxe_odp.c b/drivers/infiniband/sw/rxe/rxe_odp.c index 94f7bbe14981..9f6e2bb2a269 100644 --- a/drivers/infiniband/sw/rxe/rxe_odp.c +++ b/drivers/infiniband/sw/rxe/rxe_odp.c @@ -316,7 +316,7 @@ int rxe_odp_atomic_op(struct rxe_mr *mr, u64 iova, int opcode, err = rxe_odp_map_range_and_lock(mr, iova, sizeof(char), RXE_PAGEFAULT_DEFAULT); if (err < 0) - return err; + return RESPST_ERR_RKEY_VIOLATION; err = rxe_odp_do_atomic_op(mr, iova, opcode, compare, swap_add, orig_val); -- cgit v1.2.3-59-g8ed1b From d9d5c8ed98738f0e4e9ecbce1effa87a8569d35d Mon Sep 17 00:00:00 2001 From: Preethi G Date: Thu, 13 Mar 2025 01:44:24 -0700 Subject: RDMA/bnxt_re: Support perf management counters Add support for process_mad hook to retrieve the perf management counters. Supports IB_PMA_PORT_COUNTERS and IB_PMA_PORT_COUNTERS_EXT counters. Query the data from HW contexts and FW commands. 
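One detail worth noting in the diff below: the IBA PMA "Data" counters (PortXmitData/PortRcvData) are defined in units of 32-bit words, which is why every byte value is divided by four before being reported. A small sketch of that conversion (illustrative helper, not part of the patch):

#include <linux/types.h>
#include <asm/byteorder.h>

/* PMA Data counters are in 32-bit words, not bytes. */
static __be64 example_bytes_to_pma_data(u64 bytes)
{
        return cpu_to_be64(bytes / 4);
}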
Signed-off-by: Preethi G Signed-off-by: Selvin Xavier Link: https://patch.msgid.link/1741855464-27921-1-git-send-email-selvin.xavier@broadcom.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/bnxt_re/bnxt_re.h | 4 ++ drivers/infiniband/hw/bnxt_re/hw_counters.c | 92 +++++++++++++++++++++++++++++ drivers/infiniband/hw/bnxt_re/ib_verbs.c | 36 +++++++++++ drivers/infiniband/hw/bnxt_re/ib_verbs.h | 6 ++ drivers/infiniband/hw/bnxt_re/main.c | 1 + 5 files changed, 139 insertions(+) (limited to 'drivers') diff --git a/drivers/infiniband/hw/bnxt_re/bnxt_re.h b/drivers/infiniband/hw/bnxt_re/bnxt_re.h index b33b04e4f8e4..8bc023775c20 100644 --- a/drivers/infiniband/hw/bnxt_re/bnxt_re.h +++ b/drivers/infiniband/hw/bnxt_re/bnxt_re.h @@ -246,6 +246,10 @@ struct bnxt_re_dev { #define BNXT_RE_CHECK_RC(x) ((x) && ((x) != -ETIMEDOUT)) void bnxt_re_pacing_alert(struct bnxt_re_dev *rdev); +int bnxt_re_assign_pma_port_counters(struct bnxt_re_dev *rdev, struct ib_mad *out_mad); +int bnxt_re_assign_pma_port_ext_counters(struct bnxt_re_dev *rdev, + struct ib_mad *out_mad); + static inline struct device *rdev_to_dev(struct bnxt_re_dev *rdev) { if (rdev) diff --git a/drivers/infiniband/hw/bnxt_re/hw_counters.c b/drivers/infiniband/hw/bnxt_re/hw_counters.c index 3ac47f4e6122..3f53e5db9594 100644 --- a/drivers/infiniband/hw/bnxt_re/hw_counters.c +++ b/drivers/infiniband/hw/bnxt_re/hw_counters.c @@ -39,6 +39,8 @@ #include #include +#include +#include #include "roce_hsi.h" #include "qplib_res.h" @@ -285,6 +287,96 @@ static void bnxt_re_copy_db_pacing_stats(struct bnxt_re_dev *rdev, readl(rdev->en_dev->bar0 + rdev->pacing.dbr_db_fifo_reg_off); } +int bnxt_re_assign_pma_port_ext_counters(struct bnxt_re_dev *rdev, struct ib_mad *out_mad) +{ + struct ib_pma_portcounters_ext *pma_cnt_ext; + struct bnxt_qplib_ext_stat *estat = &rdev->stats.rstat.ext_stat; + struct ctx_hw_stats *hw_stats = NULL; + int rc; + + hw_stats = rdev->qplib_ctx.stats.dma; + + pma_cnt_ext = (struct ib_pma_portcounters_ext *)(out_mad->data + 40); + if (_is_ext_stats_supported(rdev->dev_attr->dev_cap_flags)) { + u32 fid = PCI_FUNC(rdev->en_dev->pdev->devfn); + + rc = bnxt_qplib_qext_stat(&rdev->rcfw, fid, estat); + if (rc) + return rc; + } + + pma_cnt_ext = (struct ib_pma_portcounters_ext *)(out_mad->data + 40); + if ((bnxt_qplib_is_chip_gen_p5(rdev->chip_ctx) && rdev->is_virtfn) || + !bnxt_qplib_is_chip_gen_p5(rdev->chip_ctx)) { + pma_cnt_ext->port_xmit_data = + cpu_to_be64(le64_to_cpu(hw_stats->tx_ucast_bytes) / 4); + pma_cnt_ext->port_rcv_data = + cpu_to_be64(le64_to_cpu(hw_stats->rx_ucast_bytes) / 4); + pma_cnt_ext->port_xmit_packets = + cpu_to_be64(le64_to_cpu(hw_stats->tx_ucast_pkts)); + pma_cnt_ext->port_rcv_packets = + cpu_to_be64(le64_to_cpu(hw_stats->rx_ucast_pkts)); + pma_cnt_ext->port_unicast_rcv_packets = + cpu_to_be64(le64_to_cpu(hw_stats->rx_ucast_pkts)); + pma_cnt_ext->port_unicast_xmit_packets = + cpu_to_be64(le64_to_cpu(hw_stats->tx_ucast_pkts)); + + } else { + pma_cnt_ext->port_rcv_packets = cpu_to_be64(estat->rx_roce_good_pkts); + pma_cnt_ext->port_rcv_data = cpu_to_be64(estat->rx_roce_good_bytes / 4); + pma_cnt_ext->port_xmit_packets = cpu_to_be64(estat->tx_roce_pkts); + pma_cnt_ext->port_xmit_data = cpu_to_be64(estat->tx_roce_bytes / 4); + pma_cnt_ext->port_unicast_rcv_packets = cpu_to_be64(estat->rx_roce_good_pkts); + pma_cnt_ext->port_unicast_xmit_packets = cpu_to_be64(estat->tx_roce_pkts); + } + return 0; +} + +int bnxt_re_assign_pma_port_counters(struct bnxt_re_dev *rdev, struct ib_mad *out_mad) +{ + struct 
bnxt_qplib_ext_stat *estat = &rdev->stats.rstat.ext_stat; + struct ib_pma_portcounters *pma_cnt; + struct ctx_hw_stats *hw_stats = NULL; + int rc; + + hw_stats = rdev->qplib_ctx.stats.dma; + + pma_cnt = (struct ib_pma_portcounters *)(out_mad->data + 40); + if (_is_ext_stats_supported(rdev->dev_attr->dev_cap_flags)) { + u32 fid = PCI_FUNC(rdev->en_dev->pdev->devfn); + + rc = bnxt_qplib_qext_stat(&rdev->rcfw, fid, estat); + if (rc) + return rc; + } + if ((bnxt_qplib_is_chip_gen_p5(rdev->chip_ctx) && rdev->is_virtfn) || + !bnxt_qplib_is_chip_gen_p5(rdev->chip_ctx)) { + pma_cnt->port_rcv_packets = + cpu_to_be32((u32)(le64_to_cpu(hw_stats->rx_ucast_pkts)) & 0xFFFFFFFF); + pma_cnt->port_rcv_data = + cpu_to_be32((u32)((le64_to_cpu(hw_stats->rx_ucast_bytes) & + 0xFFFFFFFF) / 4)); + pma_cnt->port_xmit_packets = + cpu_to_be32((u32)(le64_to_cpu(hw_stats->tx_ucast_pkts)) & 0xFFFFFFFF); + pma_cnt->port_xmit_data = + cpu_to_be32((u32)((le64_to_cpu(hw_stats->tx_ucast_bytes) + & 0xFFFFFFFF) / 4)); + } else { + pma_cnt->port_rcv_packets = cpu_to_be32(estat->rx_roce_good_pkts); + pma_cnt->port_rcv_data = cpu_to_be32((estat->rx_roce_good_bytes / 4)); + pma_cnt->port_xmit_packets = cpu_to_be32(estat->tx_roce_pkts); + pma_cnt->port_xmit_data = cpu_to_be32((estat->tx_roce_bytes / 4)); + } + pma_cnt->port_rcv_constraint_errors = (u8)(le64_to_cpu(hw_stats->rx_discard_pkts) & 0xFF); + pma_cnt->port_rcv_errors = cpu_to_be16((u16)(le64_to_cpu(hw_stats->rx_error_pkts) + & 0xFFFF)); + pma_cnt->port_xmit_constraint_errors = (u8)(le64_to_cpu(hw_stats->tx_error_pkts) & 0xFF); + pma_cnt->port_xmit_discards = cpu_to_be16((u16)(le64_to_cpu(hw_stats->tx_discard_pkts) + & 0xFFFF)); + + return 0; +} + int bnxt_re_ib_get_hw_stats(struct ib_device *ibdev, struct rdma_hw_stats *stats, u32 port, int index) diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c index 2de101d6e825..37e58d48e22e 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c @@ -49,6 +49,7 @@ #include #include #include +#include #include #include @@ -4489,6 +4490,41 @@ void bnxt_re_mmap_free(struct rdma_user_mmap_entry *rdma_entry) kfree(bnxt_entry); } +int bnxt_re_process_mad(struct ib_device *ibdev, int mad_flags, + u32 port_num, const struct ib_wc *in_wc, + const struct ib_grh *in_grh, + const struct ib_mad *in_mad, struct ib_mad *out_mad, + size_t *out_mad_size, u16 *out_mad_pkey_index) +{ + struct bnxt_re_dev *rdev = to_bnxt_re_dev(ibdev, ibdev); + struct ib_class_port_info cpi = {}; + int ret = IB_MAD_RESULT_SUCCESS; + int rc = 0; + + if (in_mad->mad_hdr.mgmt_class != IB_MGMT_CLASS_PERF_MGMT) + return ret; + + switch (in_mad->mad_hdr.attr_id) { + case IB_PMA_CLASS_PORT_INFO: + cpi.capability_mask = IB_PMA_CLASS_CAP_EXT_WIDTH; + memcpy((out_mad->data + 40), &cpi, sizeof(cpi)); + break; + case IB_PMA_PORT_COUNTERS_EXT: + rc = bnxt_re_assign_pma_port_ext_counters(rdev, out_mad); + break; + case IB_PMA_PORT_COUNTERS: + rc = bnxt_re_assign_pma_port_counters(rdev, out_mad); + break; + default: + rc = -EINVAL; + break; + } + if (rc) + return IB_MAD_RESULT_FAILURE; + ret |= IB_MAD_RESULT_REPLY; + return ret; +} + static int UVERBS_HANDLER(BNXT_RE_METHOD_NOTIFY_DRV)(struct uverbs_attr_bundle *attrs) { struct bnxt_re_ucontext *uctx; diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.h b/drivers/infiniband/hw/bnxt_re/ib_verbs.h index fbb16a411d6a..22c9eb8e9cfc 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.h +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.h @@ -268,6 +268,12 
@@ void bnxt_re_dealloc_ucontext(struct ib_ucontext *context); int bnxt_re_mmap(struct ib_ucontext *context, struct vm_area_struct *vma); void bnxt_re_mmap_free(struct rdma_user_mmap_entry *rdma_entry); +int bnxt_re_process_mad(struct ib_device *device, int process_mad_flags, + u32 port_num, const struct ib_wc *in_wc, + const struct ib_grh *in_grh, + const struct ib_mad *in_mad, struct ib_mad *out_mad, + size_t *out_mad_size, u16 *out_mad_pkey_index); + static inline u32 __to_ib_port_num(u16 port_id) { return (u32)port_id + 1; diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c index e9e4da4dd576..59ddb366978a 100644 --- a/drivers/infiniband/hw/bnxt_re/main.c +++ b/drivers/infiniband/hw/bnxt_re/main.c @@ -1276,6 +1276,7 @@ static const struct ib_device_ops bnxt_re_dev_ops = { .post_recv = bnxt_re_post_recv, .post_send = bnxt_re_post_send, .post_srq_recv = bnxt_re_post_srq_recv, + .process_mad = bnxt_re_process_mad, .query_ah = bnxt_re_query_ah, .query_device = bnxt_re_query_device, .modify_device = bnxt_re_modify_device, -- cgit v1.2.3-59-g8ed1b From 1d6a9e7449e2a0c1e2934eee7880ba8bd1e464cd Mon Sep 17 00:00:00 2001 From: Wang Liang Date: Thu, 13 Mar 2025 17:24:21 +0800 Subject: RDMA/core: Fix use-after-free when rename device name Syzbot reported a slab-use-after-free with the following call trace: ================================================================== BUG: KASAN: slab-use-after-free in nla_put+0xd3/0x150 lib/nlattr.c:1099 Read of size 5 at addr ffff888140ea1c60 by task syz.0.988/10025 CPU: 0 UID: 0 PID: 10025 Comm: syz.0.988 Not tainted 6.14.0-rc4-syzkaller-00859-gf77f12010f67 #0 Hardware name: Google Compute Engine, BIOS Google 02/12/2025 Call Trace: __dump_stack lib/dump_stack.c:94 [inline] dump_stack_lvl+0x241/0x360 lib/dump_stack.c:120 print_address_description mm/kasan/report.c:408 [inline] print_report+0x16e/0x5b0 mm/kasan/report.c:521 kasan_report+0x143/0x180 mm/kasan/report.c:634 kasan_check_range+0x282/0x290 mm/kasan/generic.c:189 __asan_memcpy+0x29/0x70 mm/kasan/shadow.c:105 nla_put+0xd3/0x150 lib/nlattr.c:1099 nla_put_string include/net/netlink.h:1621 [inline] fill_nldev_handle+0x16e/0x200 drivers/infiniband/core/nldev.c:265 rdma_nl_notify_event+0x561/0xef0 drivers/infiniband/core/nldev.c:2857 ib_device_notify_register+0x22/0x230 drivers/infiniband/core/device.c:1344 ib_register_device+0x1292/0x1460 drivers/infiniband/core/device.c:1460 rxe_register_device+0x233/0x350 drivers/infiniband/sw/rxe/rxe_verbs.c:1540 rxe_net_add+0x74/0xf0 drivers/infiniband/sw/rxe/rxe_net.c:550 rxe_newlink+0xde/0x1a0 drivers/infiniband/sw/rxe/rxe.c:212 nldev_newlink+0x5ea/0x680 drivers/infiniband/core/nldev.c:1795 rdma_nl_rcv_skb drivers/infiniband/core/netlink.c:239 [inline] rdma_nl_rcv+0x6dd/0x9e0 drivers/infiniband/core/netlink.c:259 netlink_unicast_kernel net/netlink/af_netlink.c:1313 [inline] netlink_unicast+0x7f6/0x990 net/netlink/af_netlink.c:1339 netlink_sendmsg+0x8de/0xcb0 net/netlink/af_netlink.c:1883 sock_sendmsg_nosec net/socket.c:709 [inline] __sock_sendmsg+0x221/0x270 net/socket.c:724 ____sys_sendmsg+0x53a/0x860 net/socket.c:2564 ___sys_sendmsg net/socket.c:2618 [inline] __sys_sendmsg+0x269/0x350 net/socket.c:2650 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xf3/0x230 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x7f42d1b8d169 Code: ff ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 40 00 48 89 f8 48 ... 
RSP: 002b:00007f42d2960038 EFLAGS: 00000246 ORIG_RAX: 000000000000002e RAX: ffffffffffffffda RBX: 00007f42d1da6320 RCX: 00007f42d1b8d169 RDX: 0000000000000000 RSI: 00004000000002c0 RDI: 000000000000000c RBP: 00007f42d1c0e2a0 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000 R13: 0000000000000000 R14: 00007f42d1da6320 R15: 00007ffe399344a8 Allocated by task 10025: kasan_save_stack mm/kasan/common.c:47 [inline] kasan_save_track+0x3f/0x80 mm/kasan/common.c:68 poison_kmalloc_redzone mm/kasan/common.c:377 [inline] __kasan_kmalloc+0x98/0xb0 mm/kasan/common.c:394 kasan_kmalloc include/linux/kasan.h:260 [inline] __do_kmalloc_node mm/slub.c:4294 [inline] __kmalloc_node_track_caller_noprof+0x28b/0x4c0 mm/slub.c:4313 __kmemdup_nul mm/util.c:61 [inline] kstrdup+0x42/0x100 mm/util.c:81 kobject_set_name_vargs+0x61/0x120 lib/kobject.c:274 dev_set_name+0xd5/0x120 drivers/base/core.c:3468 assign_name drivers/infiniband/core/device.c:1202 [inline] ib_register_device+0x178/0x1460 drivers/infiniband/core/device.c:1384 rxe_register_device+0x233/0x350 drivers/infiniband/sw/rxe/rxe_verbs.c:1540 rxe_net_add+0x74/0xf0 drivers/infiniband/sw/rxe/rxe_net.c:550 rxe_newlink+0xde/0x1a0 drivers/infiniband/sw/rxe/rxe.c:212 nldev_newlink+0x5ea/0x680 drivers/infiniband/core/nldev.c:1795 rdma_nl_rcv_skb drivers/infiniband/core/netlink.c:239 [inline] rdma_nl_rcv+0x6dd/0x9e0 drivers/infiniband/core/netlink.c:259 netlink_unicast_kernel net/netlink/af_netlink.c:1313 [inline] netlink_unicast+0x7f6/0x990 net/netlink/af_netlink.c:1339 netlink_sendmsg+0x8de/0xcb0 net/netlink/af_netlink.c:1883 sock_sendmsg_nosec net/socket.c:709 [inline] __sock_sendmsg+0x221/0x270 net/socket.c:724 ____sys_sendmsg+0x53a/0x860 net/socket.c:2564 ___sys_sendmsg net/socket.c:2618 [inline] __sys_sendmsg+0x269/0x350 net/socket.c:2650 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xf3/0x230 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f Freed by task 10035: kasan_save_stack mm/kasan/common.c:47 [inline] kasan_save_track+0x3f/0x80 mm/kasan/common.c:68 kasan_save_free_info+0x40/0x50 mm/kasan/generic.c:576 poison_slab_object mm/kasan/common.c:247 [inline] __kasan_slab_free+0x59/0x70 mm/kasan/common.c:264 kasan_slab_free include/linux/kasan.h:233 [inline] slab_free_hook mm/slub.c:2353 [inline] slab_free mm/slub.c:4609 [inline] kfree+0x196/0x430 mm/slub.c:4757 kobject_rename+0x38f/0x410 lib/kobject.c:524 device_rename+0x16a/0x200 drivers/base/core.c:4525 ib_device_rename+0x270/0x710 drivers/infiniband/core/device.c:402 nldev_set_doit+0x30e/0x4c0 drivers/infiniband/core/nldev.c:1146 rdma_nl_rcv_skb drivers/infiniband/core/netlink.c:239 [inline] rdma_nl_rcv+0x6dd/0x9e0 drivers/infiniband/core/netlink.c:259 netlink_unicast_kernel net/netlink/af_netlink.c:1313 [inline] netlink_unicast+0x7f6/0x990 net/netlink/af_netlink.c:1339 netlink_sendmsg+0x8de/0xcb0 net/netlink/af_netlink.c:1883 sock_sendmsg_nosec net/socket.c:709 [inline] __sock_sendmsg+0x221/0x270 net/socket.c:724 ____sys_sendmsg+0x53a/0x860 net/socket.c:2564 ___sys_sendmsg net/socket.c:2618 [inline] __sys_sendmsg+0x269/0x350 net/socket.c:2650 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xf3/0x230 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f This is because if rename device happens, the old name is freed in ib_device_rename() with lock, but ib_device_notify_register() may visit the dev name locklessly by event RDMA_REGISTER_EVENT or RDMA_NETDEV_ATTACH_EVENT. 
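The fix that follows makes the notifier a reader against that writer: the rename path takes devices_rwsem for write, so holding it for read in the notifier keeps the name stable. A minimal sketch of the reader-side discipline (illustrative names, not the actual call sites):

#include <linux/rwsem.h>

static DECLARE_RWSEM(example_devices_rwsem);

/* Reader: mirrors what ib_device_notify_register() does after the fix. */
static void example_notify(void)
{
        down_read(&example_devices_rwsem);
        /* The device name cannot be freed by a concurrent rename here,
         * because the rename path holds the rwsem for write. */
        up_read(&example_devices_rwsem);
}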
Fix this by holding devices_rwsem in ib_device_notify_register(). Reported-by: syzbot+f60349ba1f9f08df349f@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=25bc6f0ed2b88b9eb9b8 Fixes: 9cbed5aab5ae ("RDMA/nldev: Add support for RDMA monitoring") Signed-off-by: Wang Liang Link: https://patch.msgid.link/20250313092421.944658-1-wangliang74@huawei.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/device.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'drivers') diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index 8feb22089cbb..ee75b99f84bc 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -1350,9 +1350,11 @@ static void ib_device_notify_register(struct ib_device *device) u32 port; int ret; + down_read(&devices_rwsem); + ret = rdma_nl_notify_event(device, 0, RDMA_REGISTER_EVENT); if (ret) - return; + goto out; rdma_for_each_port(device, port) { netdev = ib_device_get_netdev(device, port); @@ -1363,8 +1365,11 @@ static void ib_device_notify_register(struct ib_device *device) RDMA_NETDEV_ATTACH_EVENT); dev_put(netdev); if (ret) - return; + goto out; } + +out: + up_read(&devices_rwsem); } /** -- cgit v1.2.3-59-g8ed1b From d375db42a8effdfeb7973d4f1b0b5985e066fd28 Mon Sep 17 00:00:00 2001 From: Patrisious Haddad Date: Thu, 13 Mar 2025 16:18:41 +0200 Subject: RDMA/mlx5: Add optional counters for RDMA_TX/RX_packets/bytes Add the following optional counters: rdma_tx_packets, rdma_tx_bytes, rdma_rx_packets, rdma_rx_bytes, which count all RDMA packets/bytes sent and received per link. Note that since each direction's packet and byte counters share one hardware flow counter, the counter is only reset once both counters of that direction are removed; from the user's perspective, however, each can still be enabled and disabled separately.
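The sharing works because a single mlx5 flow counter always carries both a packet and a byte value; a minimal read-side sketch (an illustrative wrapper, assuming the usual two-output signature of mlx5_fc_query()):

#include <linux/mlx5/driver.h>
#include <linux/mlx5/fs.h>

/* One struct mlx5_fc, two exposed statistics. */
static int example_read_shared_fc(struct mlx5_core_dev *mdev,
                                  struct mlx5_fc *fc,
                                  u64 *packets, u64 *bytes)
{
        return mlx5_fc_query(mdev, fc, packets, bytes);
}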
The counters can be enabled using: sudo rdma stat set link rocep8s0f0/1 optional-counters rdma_tx_packets And can be seen using: rdma stat -j show link rocep8s0f0/1 Signed-off-by: Patrisious Haddad Reviewed-by: Mark Bloch Link: https://patch.msgid.link/9f2753ad636f21704416df64b47395c8991d1123.1741875070.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/counters.c | 86 +++++++++++++++++++++++++++++++++-- drivers/infiniband/hw/mlx5/fs.c | 46 ++++++++++++++++++- drivers/infiniband/hw/mlx5/mlx5_ib.h | 4 ++ include/linux/mlx5/device.h | 4 +- 4 files changed, 133 insertions(+), 7 deletions(-) (limited to 'drivers') diff --git a/drivers/infiniband/hw/mlx5/counters.c b/drivers/infiniband/hw/mlx5/counters.c index 4f6c1968a2ee..e2a54c1dbc7a 100644 --- a/drivers/infiniband/hw/mlx5/counters.c +++ b/drivers/infiniband/hw/mlx5/counters.c @@ -140,6 +140,13 @@ static const struct mlx5_ib_counter rdmatx_cnp_op_cnts[] = { INIT_OP_COUNTER(cc_tx_cnp_pkts, CC_TX_CNP_PKTS), }; +static const struct mlx5_ib_counter packets_op_cnts[] = { + INIT_OP_COUNTER(rdma_tx_packets, RDMA_TX_PACKETS), + INIT_OP_COUNTER(rdma_tx_bytes, RDMA_TX_BYTES), + INIT_OP_COUNTER(rdma_rx_packets, RDMA_RX_PACKETS), + INIT_OP_COUNTER(rdma_rx_bytes, RDMA_RX_BYTES), +}; + static int mlx5_ib_read_counters(struct ib_counters *counters, struct ib_counters_read_attr *read_attr, struct uverbs_attr_bundle *attrs) @@ -427,6 +434,15 @@ done: return num_counters; } +static bool is_rdma_bytes_counter(u32 type) +{ + if (type == MLX5_IB_OPCOUNTER_RDMA_TX_BYTES || + type == MLX5_IB_OPCOUNTER_RDMA_RX_BYTES) + return true; + + return false; +} + static int do_get_op_stat(struct ib_device *ibdev, struct rdma_hw_stats *stats, u32 port_num, int index) @@ -434,7 +450,7 @@ static int do_get_op_stat(struct ib_device *ibdev, struct mlx5_ib_dev *dev = to_mdev(ibdev); const struct mlx5_ib_counters *cnts; const struct mlx5_ib_op_fc *opfcs; - u64 packets = 0, bytes; + u64 packets, bytes; u32 type; int ret; @@ -453,8 +469,11 @@ static int do_get_op_stat(struct ib_device *ibdev, if (ret) return ret; + if (is_rdma_bytes_counter(type)) + stats->value[index] = bytes; + else + stats->value[index] = packets; out: - stats->value[index] = packets; return index; } @@ -677,6 +696,12 @@ static void mlx5_ib_fill_counters(struct mlx5_ib_dev *dev, descs[j].priv = &rdmatx_cnp_op_cnts[i].type; } } + + for (i = 0; i < ARRAY_SIZE(packets_op_cnts); i++, j++) { + descs[j].name = packets_op_cnts[i].name; + descs[j].flags |= IB_STAT_FLAG_OPTIONAL; + descs[j].priv = &packets_op_cnts[i].type; + } } @@ -727,6 +752,8 @@ static int __mlx5_ib_alloc_counters(struct mlx5_ib_dev *dev, num_op_counters = ARRAY_SIZE(basic_op_cnts); + num_op_counters += ARRAY_SIZE(packets_op_cnts); + if (MLX5_CAP_FLOWTABLE(dev->mdev, ft_field_support_2_nic_receive_rdma.bth_opcode)) num_op_counters += ARRAY_SIZE(rdmarx_cnp_op_cnts); @@ -756,10 +783,47 @@ err: return -ENOMEM; } +/* + * Checks if the given flow counter type should be sharing the same flow counter + * with another type and if it should, checks if that other type flow counter + * was already created, if both conditions are met return true and the counter + * else return false. 
+ */ +static bool mlx5r_is_opfc_shared_and_in_use(struct mlx5_ib_op_fc *opfcs, + u32 type, + struct mlx5_ib_op_fc **opfc) +{ + u32 shared_fc_type; + + switch (type) { + case MLX5_IB_OPCOUNTER_RDMA_TX_PACKETS: + shared_fc_type = MLX5_IB_OPCOUNTER_RDMA_TX_BYTES; + break; + case MLX5_IB_OPCOUNTER_RDMA_TX_BYTES: + shared_fc_type = MLX5_IB_OPCOUNTER_RDMA_TX_PACKETS; + break; + case MLX5_IB_OPCOUNTER_RDMA_RX_PACKETS: + shared_fc_type = MLX5_IB_OPCOUNTER_RDMA_RX_BYTES; + break; + case MLX5_IB_OPCOUNTER_RDMA_RX_BYTES: + shared_fc_type = MLX5_IB_OPCOUNTER_RDMA_RX_PACKETS; + break; + default: + return false; + } + + *opfc = &opfcs[shared_fc_type]; + if (!(*opfc)->fc) + return false; + + return true; +} + static void mlx5_ib_dealloc_counters(struct mlx5_ib_dev *dev) { u32 in[MLX5_ST_SZ_DW(dealloc_q_counter_in)] = {}; int num_cnt_ports = dev->num_ports; + struct mlx5_ib_op_fc *in_use_opfc; int i, j; if (is_mdev_switchdev_mode(dev->mdev)) @@ -781,11 +845,16 @@ static void mlx5_ib_dealloc_counters(struct mlx5_ib_dev *dev) if (!dev->port[i].cnts.opfcs[j].fc) continue; + if (mlx5r_is_opfc_shared_and_in_use( + dev->port[i].cnts.opfcs, j, &in_use_opfc)) + goto skip; + if (IS_ENABLED(CONFIG_INFINIBAND_USER_ACCESS)) mlx5_ib_fs_remove_op_fc(dev, &dev->port[i].cnts.opfcs[j], j); mlx5_fc_destroy(dev->mdev, dev->port[i].cnts.opfcs[j].fc); +skip: dev->port[i].cnts.opfcs[j].fc = NULL; } } @@ -979,8 +1048,8 @@ static int mlx5_ib_modify_stat(struct ib_device *device, u32 port, unsigned int index, bool enable) { struct mlx5_ib_dev *dev = to_mdev(device); + struct mlx5_ib_op_fc *opfc, *in_use_opfc; struct mlx5_ib_counters *cnts; - struct mlx5_ib_op_fc *opfc; u32 num_hw_counters, type; int ret; @@ -1004,6 +1073,13 @@ static int mlx5_ib_modify_stat(struct ib_device *device, u32 port, if (opfc->fc) return -EEXIST; + if (mlx5r_is_opfc_shared_and_in_use(cnts->opfcs, type, + &in_use_opfc)) { + opfc->fc = in_use_opfc->fc; + opfc->rule[0] = in_use_opfc->rule[0]; + return 0; + } + opfc->fc = mlx5_fc_create(dev->mdev, false); if (IS_ERR(opfc->fc)) return PTR_ERR(opfc->fc); @@ -1019,8 +1095,12 @@ static int mlx5_ib_modify_stat(struct ib_device *device, u32 port, if (!opfc->fc) return -EINVAL; + if (mlx5r_is_opfc_shared_and_in_use(cnts->opfcs, type, &in_use_opfc)) + goto out; + mlx5_ib_fs_remove_op_fc(dev, opfc, type); mlx5_fc_destroy(dev->mdev, opfc->fc); +out: opfc->fc = NULL; return 0; } diff --git a/drivers/infiniband/hw/mlx5/fs.c b/drivers/infiniband/hw/mlx5/fs.c index 6ae2801fa13f..93b229e9aab3 100644 --- a/drivers/infiniband/hw/mlx5/fs.c +++ b/drivers/infiniband/hw/mlx5/fs.c @@ -802,10 +802,12 @@ static struct mlx5_ib_flow_prio *get_flow_table(struct mlx5_ib_dev *dev, enum { RDMA_RX_ECN_OPCOUNTER_PRIO, RDMA_RX_CNP_OPCOUNTER_PRIO, + RDMA_RX_PKTS_BYTES_OPCOUNTER_PRIO, }; enum { RDMA_TX_CNP_OPCOUNTER_PRIO, + RDMA_TX_PKTS_BYTES_OPCOUNTER_PRIO, }; static int set_vhca_port_spec(struct mlx5_ib_dev *dev, u32 port_num, @@ -869,6 +871,29 @@ static int set_cnp_spec(struct mlx5_ib_dev *dev, u32 port_num, return 0; } +/* Returns the prio we should use for the given optional counter type, + * whereas for bytes type we use the packet type, since they share the same + * resources. 
+ */ +static struct mlx5_ib_flow_prio *get_opfc_prio(struct mlx5_ib_dev *dev, + u32 type) +{ + u32 prio_type; + + switch (type) { + case MLX5_IB_OPCOUNTER_RDMA_TX_BYTES: + prio_type = MLX5_IB_OPCOUNTER_RDMA_TX_PACKETS; + break; + case MLX5_IB_OPCOUNTER_RDMA_RX_BYTES: + prio_type = MLX5_IB_OPCOUNTER_RDMA_RX_PACKETS; + break; + default: + prio_type = type; + } + + return &dev->flow_db->opfcs[prio_type]; +} + int mlx5_ib_fs_add_op_fc(struct mlx5_ib_dev *dev, u32 port_num, struct mlx5_ib_op_fc *opfc, enum mlx5_ib_optional_counter_type type) @@ -923,6 +948,20 @@ int mlx5_ib_fs_add_op_fc(struct mlx5_ib_dev *dev, u32 port_num, priority = RDMA_TX_CNP_OPCOUNTER_PRIO; break; + case MLX5_IB_OPCOUNTER_RDMA_TX_PACKETS: + case MLX5_IB_OPCOUNTER_RDMA_TX_BYTES: + spec_num = 1; + fn_type = MLX5_FLOW_NAMESPACE_RDMA_TX_COUNTERS; + priority = RDMA_TX_PKTS_BYTES_OPCOUNTER_PRIO; + break; + + case MLX5_IB_OPCOUNTER_RDMA_RX_PACKETS: + case MLX5_IB_OPCOUNTER_RDMA_RX_BYTES: + spec_num = 1; + fn_type = MLX5_FLOW_NAMESPACE_RDMA_RX_COUNTERS; + priority = RDMA_RX_PKTS_BYTES_OPCOUNTER_PRIO; + break; + default: err = -EOPNOTSUPP; goto free; @@ -934,7 +973,7 @@ int mlx5_ib_fs_add_op_fc(struct mlx5_ib_dev *dev, u32 port_num, goto free; } - prio = &dev->flow_db->opfcs[type]; + prio = get_opfc_prio(dev, type); if (!prio->flow_table) { prio = _get_prio(dev, ns, prio, priority, dev->num_ports * MAX_OPFC_RULES, 1, 0, 0); @@ -976,11 +1015,14 @@ void mlx5_ib_fs_remove_op_fc(struct mlx5_ib_dev *dev, struct mlx5_ib_op_fc *opfc, enum mlx5_ib_optional_counter_type type) { + struct mlx5_ib_flow_prio *prio; int i; + prio = get_opfc_prio(dev, type); + for (i = 0; i < MAX_OPFC_RULES && opfc->rule[i]; i++) { mlx5_del_flow_rules(opfc->rule[i]); - put_flow_table(dev, &dev->flow_db->opfcs[type], true); + put_flow_table(dev, prio, true); } } diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index ccaaef20f50d..26397eb08684 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -294,6 +294,10 @@ enum mlx5_ib_optional_counter_type { MLX5_IB_OPCOUNTER_CC_RX_CE_PKTS, MLX5_IB_OPCOUNTER_CC_RX_CNP_PKTS, MLX5_IB_OPCOUNTER_CC_TX_CNP_PKTS, + MLX5_IB_OPCOUNTER_RDMA_TX_PACKETS, + MLX5_IB_OPCOUNTER_RDMA_TX_BYTES, + MLX5_IB_OPCOUNTER_RDMA_RX_PACKETS, + MLX5_IB_OPCOUNTER_RDMA_RX_BYTES, MLX5_IB_OPCOUNTER_MAX, }; diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 8fe56d0362c6..63f0d9fb94b4 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -1532,8 +1532,8 @@ static inline u16 mlx5_to_sw_pkey_sz(int pkey_sz) return MLX5_MIN_PKEY_TABLE_SIZE << pkey_sz; } -#define MLX5_RDMA_RX_NUM_COUNTERS_PRIOS 2 -#define MLX5_RDMA_TX_NUM_COUNTERS_PRIOS 1 +#define MLX5_RDMA_RX_NUM_COUNTERS_PRIOS 3 +#define MLX5_RDMA_TX_NUM_COUNTERS_PRIOS 2 #define MLX5_BY_PASS_NUM_REGULAR_PRIOS 16 #define MLX5_BY_PASS_NUM_DONT_TRAP_PRIOS 16 #define MLX5_BY_PASS_NUM_MULTICAST_PRIOS 1 -- cgit v1.2.3-59-g8ed1b From 7e53b31acc7f976d01a0718724a4c36f1fddf739 Mon Sep 17 00:00:00 2001 From: Patrisious Haddad Date: Thu, 13 Mar 2025 16:18:42 +0200 Subject: RDMA/core: Create and destroy rdma_counter using rdma_zalloc_drv_obj() Change the rdma_counter allocation to use rdma_zalloc_drv_obj() instead of explicitly allocating it in the core, so that it can be contained inside driver-specific structures. Adjust all drivers that use it to provide their containing structure, and add a driver-specific initialization operation.
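The containment in question is the kernel's standard embedded-struct idiom: the core allocates an object of the driver's declared size, and the driver recovers its wrapper with container_of(). A minimal sketch (example_counter is a hypothetical driver struct, mirroring the mlx5_rdma_counter wrapper added by this patch):

#include <linux/container_of.h>
#include <rdma/rdma_counter.h>

struct example_counter {
        struct rdma_counter rdma_counter;       /* must stay embedded */
        /* driver-private state, e.g. bound optional-counters, goes here */
};

static inline struct example_counter *
to_example_counter(struct rdma_counter *counter)
{
        return container_of(counter, struct example_counter, rdma_counter);
}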
This change is needed to allow upcoming patches to implement optional-counters binding, where each driver-specific counter struct will maintain the optional counters bound to it. Signed-off-by: Patrisious Haddad Reviewed-by: Mark Bloch Link: https://patch.msgid.link/a5a484f421fc2e5595158e61a354fba43272b02d.1741875070.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/counters.c | 4 +++- drivers/infiniband/core/device.c | 2 ++ drivers/infiniband/hw/mlx5/counters.c | 8 ++++++++ drivers/infiniband/hw/mlx5/counters.h | 11 +++++++++++ include/rdma/ib_verbs.h | 6 ++++++ 5 files changed, 30 insertions(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/infiniband/core/counters.c b/drivers/infiniband/core/counters.c index af59486fe418..981d5a28614a 100644 --- a/drivers/infiniband/core/counters.c +++ b/drivers/infiniband/core/counters.c @@ -149,13 +149,15 @@ static struct rdma_counter *alloc_and_bind(struct ib_device *dev, u32 port, if (!dev->ops.counter_dealloc || !dev->ops.counter_alloc_stats) return NULL; - counter = kzalloc(sizeof(*counter), GFP_KERNEL); + counter = rdma_zalloc_drv_obj(dev, rdma_counter); if (!counter) return NULL; counter->device = dev; counter->port = port; + dev->ops.counter_init(counter); + rdma_restrack_new(&counter->res, RDMA_RESTRACK_COUNTER); counter->stats = dev->ops.counter_alloc_stats(counter); if (!counter->stats) diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index ee75b99f84bc..b4e3e4beb7f4 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -2683,6 +2683,7 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops) SET_DEVICE_OP(dev_ops, counter_alloc_stats); SET_DEVICE_OP(dev_ops, counter_bind_qp); SET_DEVICE_OP(dev_ops, counter_dealloc); + SET_DEVICE_OP(dev_ops, counter_init); SET_DEVICE_OP(dev_ops, counter_unbind_qp); SET_DEVICE_OP(dev_ops, counter_update_stats); SET_DEVICE_OP(dev_ops, create_ah); @@ -2797,6 +2798,7 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops) SET_OBJ_SIZE(dev_ops, ib_srq); SET_OBJ_SIZE(dev_ops, ib_ucontext); SET_OBJ_SIZE(dev_ops, ib_xrcd); + SET_OBJ_SIZE(dev_ops, rdma_counter); } EXPORT_SYMBOL(ib_set_device_ops); diff --git a/drivers/infiniband/hw/mlx5/counters.c b/drivers/infiniband/hw/mlx5/counters.c index e2a54c1dbc7a..018bb96bdbf4 100644 --- a/drivers/infiniband/hw/mlx5/counters.c +++ b/drivers/infiniband/hw/mlx5/counters.c @@ -1105,6 +1105,8 @@ out: return 0; } +static void mlx5_ib_counter_init(struct rdma_counter *counter) {} + static const struct ib_device_ops hw_stats_ops = { .alloc_hw_port_stats = mlx5_ib_alloc_hw_port_stats, .get_hw_stats = mlx5_ib_get_hw_stats, @@ -1115,6 +1117,9 @@ static const struct ib_device_ops hw_stats_ops = { .counter_update_stats = mlx5_ib_counter_update_stats, .modify_hw_stat = IS_ENABLED(CONFIG_INFINIBAND_USER_ACCESS) ?
mlx5_ib_modify_stat : NULL, + .counter_init = mlx5_ib_counter_init, + + INIT_RDMA_OBJ_SIZE(rdma_counter, mlx5_rdma_counter, rdma_counter), }; static const struct ib_device_ops hw_switchdev_vport_op = { @@ -1129,6 +1134,9 @@ static const struct ib_device_ops hw_switchdev_stats_ops = { .counter_dealloc = mlx5_ib_counter_dealloc, .counter_alloc_stats = mlx5_ib_counter_alloc_stats, .counter_update_stats = mlx5_ib_counter_update_stats, + .counter_init = mlx5_ib_counter_init, + + INIT_RDMA_OBJ_SIZE(rdma_counter, mlx5_rdma_counter, rdma_counter), }; static const struct ib_device_ops counters_ops = { diff --git a/drivers/infiniband/hw/mlx5/counters.h b/drivers/infiniband/hw/mlx5/counters.h index 6bcaaa52e2b2..f153901a43be 100644 --- a/drivers/infiniband/hw/mlx5/counters.h +++ b/drivers/infiniband/hw/mlx5/counters.h @@ -8,6 +8,17 @@ #include "mlx5_ib.h" + +struct mlx5_rdma_counter { + struct rdma_counter rdma_counter; +}; + +static inline struct mlx5_rdma_counter * +to_mcounter(struct rdma_counter *counter) +{ + return container_of(counter, struct mlx5_rdma_counter, rdma_counter); +} + int mlx5_ib_counters_init(struct mlx5_ib_dev *dev); void mlx5_ib_counters_cleanup(struct mlx5_ib_dev *dev); void mlx5_ib_counters_clear_description(struct ib_counters *counters); diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 9941f4185c79..90e93297d59e 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2665,6 +2665,11 @@ struct ib_device_ops { */ int (*counter_update_stats)(struct rdma_counter *counter); + /** + * counter_init - Initialize the driver specific rdma counter struct. + */ + void (*counter_init)(struct rdma_counter *counter); + /** * Allows rdma drivers to add their own restrack attributes * dumped via 'rdma stat' iproute2 command. @@ -2716,6 +2721,7 @@ struct ib_device_ops { DECLARE_RDMA_OBJ_SIZE(ib_srq); DECLARE_RDMA_OBJ_SIZE(ib_ucontext); DECLARE_RDMA_OBJ_SIZE(ib_xrcd); + DECLARE_RDMA_OBJ_SIZE(rdma_counter); }; struct ib_core_device { -- cgit v1.2.3-59-g8ed1b From da3711074f5252ee35d6348ca286351013f1d7fe Mon Sep 17 00:00:00 2001 From: Patrisious Haddad Date: Thu, 13 Mar 2025 16:18:43 +0200 Subject: RDMA/core: Add support to optional-counters binding configuration Whenever a new counter is created, save inside it the user requested configuration for optional-counters binding, for manual configuration it is requested directly by the user and for the automatic configuration it depends on if the automatic binding was enabled with or without optional-counters binding. This argument will later be used by the driver to determine if to bind the optional-counters as well or not when trying to bind this counter to a QP. It indicates that when binding counters to a QP we also want the currently enabled link optional-counters to be bound as well. 
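On the netlink side, the new attribute is optional and defaults to off when absent; a minimal sketch of that parse pattern (illustrative helper, matching how RDMA_NLDEV_ATTR_STAT_OPCOUNTER_ENABLED is read in the diff below):

#include <net/netlink.h>

/* An absent optional u8 attribute is treated as "disabled". */
static bool example_get_opt_flag(struct nlattr *tb[], int attr)
{
        return tb[attr] ? !!nla_get_u8(tb[attr]) : false;
}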
Signed-off-by: Patrisious Haddad Reviewed-by: Mark Bloch Link: https://patch.msgid.link/82f1c357606a16932979ef9a5910122675c74a3a.1741875070.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/counters.c | 28 +++++++++++++++++++--------- drivers/infiniband/core/nldev.c | 18 ++++++++++++++++-- include/rdma/rdma_counter.h | 5 ++++- include/uapi/rdma/rdma_netlink.h | 2 ++ 4 files changed, 41 insertions(+), 12 deletions(-) (limited to 'drivers') diff --git a/drivers/infiniband/core/counters.c b/drivers/infiniband/core/counters.c index 981d5a28614a..b270a208214e 100644 --- a/drivers/infiniband/core/counters.c +++ b/drivers/infiniband/core/counters.c @@ -12,7 +12,8 @@ static int __counter_set_mode(struct rdma_port_counter *port_counter, enum rdma_nl_counter_mode new_mode, - enum rdma_nl_counter_mask new_mask) + enum rdma_nl_counter_mask new_mask, + bool bind_opcnt) { if (new_mode == RDMA_COUNTER_MODE_AUTO) { if (new_mask & (~ALL_AUTO_MODE_MASKS)) @@ -23,6 +24,7 @@ static int __counter_set_mode(struct rdma_port_counter *port_counter, port_counter->mode.mode = new_mode; port_counter->mode.mask = new_mask; + port_counter->mode.bind_opcnt = bind_opcnt; return 0; } @@ -41,6 +43,7 @@ static int __counter_set_mode(struct rdma_port_counter *port_counter, */ int rdma_counter_set_auto_mode(struct ib_device *dev, u32 port, enum rdma_nl_counter_mask mask, + bool bind_opcnt, struct netlink_ext_ack *extack) { struct rdma_port_counter *port_counter; @@ -59,12 +62,13 @@ int rdma_counter_set_auto_mode(struct ib_device *dev, u32 port, RDMA_COUNTER_MODE_NONE; if (port_counter->mode.mode == mode && - port_counter->mode.mask == mask) { + port_counter->mode.mask == mask && + port_counter->mode.bind_opcnt == bind_opcnt) { ret = 0; goto out; } - ret = __counter_set_mode(port_counter, mode, mask); + ret = __counter_set_mode(port_counter, mode, mask, bind_opcnt); out: mutex_unlock(&port_counter->lock); @@ -140,7 +144,8 @@ out: static struct rdma_counter *alloc_and_bind(struct ib_device *dev, u32 port, struct ib_qp *qp, - enum rdma_nl_counter_mode mode) + enum rdma_nl_counter_mode mode, + bool bind_opcnt) { struct rdma_port_counter *port_counter; struct rdma_counter *counter; @@ -168,7 +173,7 @@ static struct rdma_counter *alloc_and_bind(struct ib_device *dev, u32 port, switch (mode) { case RDMA_COUNTER_MODE_MANUAL: ret = __counter_set_mode(port_counter, RDMA_COUNTER_MODE_MANUAL, - 0); + 0, bind_opcnt); if (ret) { mutex_unlock(&port_counter->lock); goto err_mode; @@ -187,6 +192,7 @@ static struct rdma_counter *alloc_and_bind(struct ib_device *dev, u32 port, mutex_unlock(&port_counter->lock); counter->mode.mode = mode; + counter->mode.bind_opcnt = bind_opcnt; kref_init(&counter->kref); mutex_init(&counter->lock); @@ -215,7 +221,8 @@ static void rdma_counter_free(struct rdma_counter *counter) port_counter->num_counters--; if (!port_counter->num_counters && (port_counter->mode.mode == RDMA_COUNTER_MODE_MANUAL)) - __counter_set_mode(port_counter, RDMA_COUNTER_MODE_NONE, 0); + __counter_set_mode(port_counter, RDMA_COUNTER_MODE_NONE, 0, + false); mutex_unlock(&port_counter->lock); @@ -347,7 +354,8 @@ int rdma_counter_bind_qp_auto(struct ib_qp *qp, u32 port) return ret; } } else { - counter = alloc_and_bind(dev, port, qp, RDMA_COUNTER_MODE_AUTO); + counter = alloc_and_bind(dev, port, qp, RDMA_COUNTER_MODE_AUTO, + port_counter->mode.bind_opcnt); if (!counter) return -ENOMEM; } @@ -560,7 +568,7 @@ int rdma_counter_bind_qpn_alloc(struct ib_device *dev, u32 port, goto err; } - counter = alloc_and_bind(dev, 
port, qp, RDMA_COUNTER_MODE_MANUAL); + counter = alloc_and_bind(dev, port, qp, RDMA_COUNTER_MODE_MANUAL, true); if (!counter) { ret = -ENOMEM; goto err; @@ -615,13 +623,15 @@ out: int rdma_counter_get_mode(struct ib_device *dev, u32 port, enum rdma_nl_counter_mode *mode, - enum rdma_nl_counter_mask *mask) + enum rdma_nl_counter_mask *mask, + bool *opcnt) { struct rdma_port_counter *port_counter; port_counter = &dev->port_data[port].port_counter; *mode = port_counter->mode.mode; *mask = port_counter->mode.mask; + *opcnt = port_counter->mode.bind_opcnt; return 0; } diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c index cb987ab0177c..a872643e8039 100644 --- a/drivers/infiniband/core/nldev.c +++ b/drivers/infiniband/core/nldev.c @@ -171,6 +171,7 @@ static const struct nla_policy nldev_policy[RDMA_NLDEV_ATTR_MAX] = { [RDMA_NLDEV_ATTR_PARENT_NAME] = { .type = NLA_NUL_STRING }, [RDMA_NLDEV_ATTR_NAME_ASSIGN_TYPE] = { .type = NLA_U8 }, [RDMA_NLDEV_ATTR_EVENT_TYPE] = { .type = NLA_U8 }, + [RDMA_NLDEV_ATTR_STAT_OPCOUNTER_ENABLED] = { .type = NLA_U8 }, }; static int put_driver_name_print_type(struct sk_buff *msg, const char *name, @@ -2028,6 +2029,7 @@ static int nldev_stat_set_mode_doit(struct sk_buff *msg, struct ib_device *device, u32 port) { u32 mode, mask = 0, qpn, cntn = 0; + bool opcnt = false; int ret; /* Currently only counter for QP is supported */ @@ -2035,12 +2037,17 @@ static int nldev_stat_set_mode_doit(struct sk_buff *msg, nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_RES]) != RDMA_NLDEV_ATTR_RES_QP) return -EINVAL; + if (tb[RDMA_NLDEV_ATTR_STAT_OPCOUNTER_ENABLED]) + opcnt = !!nla_get_u8( + tb[RDMA_NLDEV_ATTR_STAT_OPCOUNTER_ENABLED]); + mode = nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_MODE]); if (mode == RDMA_COUNTER_MODE_AUTO) { if (tb[RDMA_NLDEV_ATTR_STAT_AUTO_MODE_MASK]) mask = nla_get_u32( tb[RDMA_NLDEV_ATTR_STAT_AUTO_MODE_MASK]); - return rdma_counter_set_auto_mode(device, port, mask, extack); + return rdma_counter_set_auto_mode(device, port, mask, opcnt, + extack); } if (!tb[RDMA_NLDEV_ATTR_RES_LQPN]) @@ -2358,6 +2365,7 @@ static int stat_get_doit_qp(struct sk_buff *skb, struct nlmsghdr *nlh, struct ib_device *device; struct sk_buff *msg; u32 index, port; + bool opcnt; int ret; if (tb[RDMA_NLDEV_ATTR_STAT_COUNTER_ID]) @@ -2393,7 +2401,7 @@ static int stat_get_doit_qp(struct sk_buff *skb, struct nlmsghdr *nlh, goto err_msg; } - ret = rdma_counter_get_mode(device, port, &mode, &mask); + ret = rdma_counter_get_mode(device, port, &mode, &mask, &opcnt); if (ret) goto err_msg; @@ -2410,6 +2418,12 @@ static int stat_get_doit_qp(struct sk_buff *skb, struct nlmsghdr *nlh, goto err_msg; } + if ((mode == RDMA_COUNTER_MODE_AUTO) && + nla_put_u8(msg, RDMA_NLDEV_ATTR_STAT_OPCOUNTER_ENABLED, opcnt)) { + ret = -EMSGSIZE; + goto err_msg; + } + nlmsg_end(msg, nlh); ib_device_put(device); return rdma_nl_unicast(sock_net(skb->sk), msg, NETLINK_CB(skb).portid); diff --git a/include/rdma/rdma_counter.h b/include/rdma/rdma_counter.h index 45d5481a7846..74e635409ff7 100644 --- a/include/rdma/rdma_counter.h +++ b/include/rdma/rdma_counter.h @@ -23,6 +23,7 @@ struct rdma_counter_mode { enum rdma_nl_counter_mode mode; enum rdma_nl_counter_mask mask; struct auto_mode_param param; + bool bind_opcnt; }; struct rdma_port_counter { @@ -47,6 +48,7 @@ void rdma_counter_init(struct ib_device *dev); void rdma_counter_release(struct ib_device *dev); int rdma_counter_set_auto_mode(struct ib_device *dev, u32 port, enum rdma_nl_counter_mask mask, + bool bind_opcnt, struct netlink_ext_ack *extack); int 
rdma_counter_bind_qp_auto(struct ib_qp *qp, u32 port); int rdma_counter_unbind_qp(struct ib_qp *qp, bool force); @@ -61,7 +63,8 @@ int rdma_counter_unbind_qpn(struct ib_device *dev, u32 port, u32 qp_num, u32 counter_id); int rdma_counter_get_mode(struct ib_device *dev, u32 port, enum rdma_nl_counter_mode *mode, - enum rdma_nl_counter_mask *mask); + enum rdma_nl_counter_mask *mask, + bool *opcnt); int rdma_counter_modify(struct ib_device *dev, u32 port, unsigned int index, bool enable); diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h index 9f9cf20c1cd8..f41f0228fcd0 100644 --- a/include/uapi/rdma/rdma_netlink.h +++ b/include/uapi/rdma/rdma_netlink.h @@ -580,6 +580,8 @@ enum rdma_nldev_attr { RDMA_NLDEV_ATTR_EVENT_TYPE, /* u8 */ RDMA_NLDEV_SYS_ATTR_MONITOR_MODE, /* u8 */ + + RDMA_NLDEV_ATTR_STAT_OPCOUNTER_ENABLED, /* u8 */ /* * Always the end */ -- cgit v1.2.3-59-g8ed1b From 88ae02feda84f0f78ff7243ef437e79624fdcd80 Mon Sep 17 00:00:00 2001 From: Patrisious Haddad Date: Thu, 13 Mar 2025 16:18:44 +0200 Subject: RDMA/core: Pass port to counter bind/unbind operations This will be useful for the next patches in the series since port number is needed for optional counters binding and unbinding. Note that this change is needed since when the operation is done qp->port isn't necessarily initialized yet and can't be used. Signed-off-by: Patrisious Haddad Reviewed-by: Mark Bloch Link: https://patch.msgid.link/b6f6797844acbd517358e8d2a270ea9b3e6ecba1.1741875070.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/counters.c | 20 ++++++++++---------- drivers/infiniband/core/verbs.c | 2 +- drivers/infiniband/hw/mlx5/counters.c | 4 ++-- include/rdma/ib_verbs.h | 5 +++-- include/rdma/rdma_counter.h | 2 +- 5 files changed, 17 insertions(+), 16 deletions(-) (limited to 'drivers') diff --git a/drivers/infiniband/core/counters.c b/drivers/infiniband/core/counters.c index b270a208214e..e6ec7b7a40af 100644 --- a/drivers/infiniband/core/counters.c +++ b/drivers/infiniband/core/counters.c @@ -93,7 +93,7 @@ static void auto_mode_init_counter(struct rdma_counter *counter, } static int __rdma_counter_bind_qp(struct rdma_counter *counter, - struct ib_qp *qp) + struct ib_qp *qp, u32 port) { int ret; @@ -104,7 +104,7 @@ static int __rdma_counter_bind_qp(struct rdma_counter *counter, return -EOPNOTSUPP; mutex_lock(&counter->lock); - ret = qp->device->ops.counter_bind_qp(counter, qp); + ret = qp->device->ops.counter_bind_qp(counter, qp, port); mutex_unlock(&counter->lock); return ret; @@ -196,7 +196,7 @@ static struct rdma_counter *alloc_and_bind(struct ib_device *dev, u32 port, kref_init(&counter->kref); mutex_init(&counter->lock); - ret = __rdma_counter_bind_qp(counter, qp); + ret = __rdma_counter_bind_qp(counter, qp, port); if (ret) goto err_mode; @@ -247,7 +247,7 @@ static bool auto_mode_match(struct ib_qp *qp, struct rdma_counter *counter, return match; } -static int __rdma_counter_unbind_qp(struct ib_qp *qp) +static int __rdma_counter_unbind_qp(struct ib_qp *qp, u32 port) { struct rdma_counter *counter = qp->counter; int ret; @@ -256,7 +256,7 @@ static int __rdma_counter_unbind_qp(struct ib_qp *qp) return -EOPNOTSUPP; mutex_lock(&counter->lock); - ret = qp->device->ops.counter_unbind_qp(qp); + ret = qp->device->ops.counter_unbind_qp(qp, port); mutex_unlock(&counter->lock); return ret; @@ -348,7 +348,7 @@ int rdma_counter_bind_qp_auto(struct ib_qp *qp, u32 port) counter = rdma_get_counter_auto_mode(qp, port); if (counter) { - ret = 
__rdma_counter_bind_qp(counter, qp); + ret = __rdma_counter_bind_qp(counter, qp, port); if (ret) { kref_put(&counter->kref, counter_release); return ret; @@ -368,7 +368,7 @@ int rdma_counter_bind_qp_auto(struct ib_qp *qp, u32 port) * @force: * true - Decrease the counter ref-count anyway (e.g., qp destroy) */ -int rdma_counter_unbind_qp(struct ib_qp *qp, bool force) +int rdma_counter_unbind_qp(struct ib_qp *qp, u32 port, bool force) { struct rdma_counter *counter = qp->counter; int ret; @@ -376,7 +376,7 @@ int rdma_counter_unbind_qp(struct ib_qp *qp, bool force) if (!counter) return -EINVAL; - ret = __rdma_counter_unbind_qp(qp); + ret = __rdma_counter_unbind_qp(qp, port); if (ret && !force) return ret; @@ -523,7 +523,7 @@ int rdma_counter_bind_qpn(struct ib_device *dev, u32 port, goto err_task; } - ret = __rdma_counter_bind_qp(counter, qp); + ret = __rdma_counter_bind_qp(counter, qp, port); if (ret) goto err_task; @@ -614,7 +614,7 @@ int rdma_counter_unbind_qpn(struct ib_device *dev, u32 port, goto out; } - ret = rdma_counter_unbind_qp(qp, false); + ret = rdma_counter_unbind_qp(qp, port, false); out: rdma_restrack_put(&qp->res); diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index dc40001072a5..c5e78bbefbd0 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -2105,7 +2105,7 @@ int ib_destroy_qp_user(struct ib_qp *qp, struct ib_udata *udata) if (!qp->uobject) rdma_rw_cleanup_mrs(qp); - rdma_counter_unbind_qp(qp, true); + rdma_counter_unbind_qp(qp, qp->port, true); ret = qp->device->ops.destroy_qp(qp, udata); if (ret) { if (sec) diff --git a/drivers/infiniband/hw/mlx5/counters.c b/drivers/infiniband/hw/mlx5/counters.c index 018bb96bdbf4..d826f03b6ec5 100644 --- a/drivers/infiniband/hw/mlx5/counters.c +++ b/drivers/infiniband/hw/mlx5/counters.c @@ -562,7 +562,7 @@ static int mlx5_ib_counter_dealloc(struct rdma_counter *counter) } static int mlx5_ib_counter_bind_qp(struct rdma_counter *counter, - struct ib_qp *qp) + struct ib_qp *qp, u32 port) { struct mlx5_ib_dev *dev = to_mdev(qp->device); int err; @@ -594,7 +594,7 @@ fail_set_counter: return err; } -static int mlx5_ib_counter_unbind_qp(struct ib_qp *qp) +static int mlx5_ib_counter_unbind_qp(struct ib_qp *qp, u32 port) { return mlx5_ib_qp_set_counter(qp, NULL); } diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 90e93297d59e..d42eae69d9a8 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2644,12 +2644,13 @@ struct ib_device_ops { * @counter - The counter to be bound. 
If counter->id is zero then * the driver needs to allocate a new counter and set counter->id */ - int (*counter_bind_qp)(struct rdma_counter *counter, struct ib_qp *qp); + int (*counter_bind_qp)(struct rdma_counter *counter, struct ib_qp *qp, + u32 port); /** * counter_unbind_qp - Unbind the qp from the dynamically-allocated * counter and bind it onto the default one */ - int (*counter_unbind_qp)(struct ib_qp *qp); + int (*counter_unbind_qp)(struct ib_qp *qp, u32 port); /** * counter_dealloc -De-allocate the hw counter */ diff --git a/include/rdma/rdma_counter.h b/include/rdma/rdma_counter.h index 74e635409ff7..4204d08a010a 100644 --- a/include/rdma/rdma_counter.h +++ b/include/rdma/rdma_counter.h @@ -51,7 +51,7 @@ int rdma_counter_set_auto_mode(struct ib_device *dev, u32 port, bool bind_opcnt, struct netlink_ext_ack *extack); int rdma_counter_bind_qp_auto(struct ib_qp *qp, u32 port); -int rdma_counter_unbind_qp(struct ib_qp *qp, bool force); +int rdma_counter_unbind_qp(struct ib_qp *qp, u32 port, bool force); int rdma_counter_query_stats(struct rdma_counter *counter); u64 rdma_counter_get_hwstat_value(struct ib_device *dev, u32 port, u32 index); -- cgit v1.2.3-59-g8ed1b From 36e0d433672f1e65400d69e7eaf7150752c45e29 Mon Sep 17 00:00:00 2001 From: Patrisious Haddad Date: Thu, 13 Mar 2025 16:18:45 +0200 Subject: RDMA/mlx5: Compile fs.c regardless of INFINIBAND_USER_ACCESS config Change mlx5 Makefile, fs.c and fs.h to support fs compilation regardless of INFINIBAND_USER_ACCESS config. In addition allow optional counters support regardless of the config. Signed-off-by: Patrisious Haddad Link: https://patch.msgid.link/b8dd220456a91538b22c3aff150ab021d7b9e1bf.1741875070.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/Makefile | 2 +- drivers/infiniband/hw/mlx5/counters.c | 8 +++----- drivers/infiniband/hw/mlx5/fs.c | 4 +++- drivers/infiniband/hw/mlx5/fs.h | 15 --------------- 4 files changed, 7 insertions(+), 22 deletions(-) (limited to 'drivers') diff --git a/drivers/infiniband/hw/mlx5/Makefile b/drivers/infiniband/hw/mlx5/Makefile index b38961f5058e..11878ddf7cc7 100644 --- a/drivers/infiniband/hw/mlx5/Makefile +++ b/drivers/infiniband/hw/mlx5/Makefile @@ -9,6 +9,7 @@ mlx5_ib-y := ah.o \ data_direct.o \ dm.o \ doorbell.o \ + fs.o \ gsi.o \ ib_virt.o \ mad.o \ @@ -26,7 +27,6 @@ mlx5_ib-y := ah.o \ mlx5_ib-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += odp.o mlx5_ib-$(CONFIG_MLX5_ESWITCH) += ib_rep.o mlx5_ib-$(CONFIG_INFINIBAND_USER_ACCESS) += devx.o \ - fs.o \ qos.o \ std_types.o mlx5_ib-$(CONFIG_MLX5_MACSEC) += macsec.o diff --git a/drivers/infiniband/hw/mlx5/counters.c b/drivers/infiniband/hw/mlx5/counters.c index d826f03b6ec5..542f591d73e0 100644 --- a/drivers/infiniband/hw/mlx5/counters.c +++ b/drivers/infiniband/hw/mlx5/counters.c @@ -849,9 +849,8 @@ static void mlx5_ib_dealloc_counters(struct mlx5_ib_dev *dev) dev->port[i].cnts.opfcs, j, &in_use_opfc)) goto skip; - if (IS_ENABLED(CONFIG_INFINIBAND_USER_ACCESS)) - mlx5_ib_fs_remove_op_fc(dev, - &dev->port[i].cnts.opfcs[j], j); + mlx5_ib_fs_remove_op_fc(dev, + &dev->port[i].cnts.opfcs[j], j); mlx5_fc_destroy(dev->mdev, dev->port[i].cnts.opfcs[j].fc); skip: @@ -1115,8 +1114,7 @@ static const struct ib_device_ops hw_stats_ops = { .counter_dealloc = mlx5_ib_counter_dealloc, .counter_alloc_stats = mlx5_ib_counter_alloc_stats, .counter_update_stats = mlx5_ib_counter_update_stats, - .modify_hw_stat = IS_ENABLED(CONFIG_INFINIBAND_USER_ACCESS) ? 
- mlx5_ib_modify_stat : NULL, + .modify_hw_stat = mlx5_ib_modify_stat, .counter_init = mlx5_ib_counter_init, INIT_RDMA_OBJ_SIZE(rdma_counter, mlx5_rdma_counter, rdma_counter), diff --git a/drivers/infiniband/hw/mlx5/fs.c b/drivers/infiniband/hw/mlx5/fs.c index 93b229e9aab3..71a47821a564 100644 --- a/drivers/infiniband/hw/mlx5/fs.c +++ b/drivers/infiniband/hw/mlx5/fs.c @@ -679,7 +679,7 @@ enum flow_table_type { #define MLX5_FS_MAX_TYPES 6 #define MLX5_FS_MAX_ENTRIES BIT(16) -static bool mlx5_ib_shared_ft_allowed(struct ib_device *device) +static bool __maybe_unused mlx5_ib_shared_ft_allowed(struct ib_device *device) { struct mlx5_ib_dev *dev = to_mdev(device); @@ -3030,6 +3030,7 @@ DECLARE_UVERBS_NAMED_OBJECT( &UVERBS_METHOD(MLX5_IB_METHOD_STEERING_ANCHOR_DESTROY)); const struct uapi_definition mlx5_ib_flow_defs[] = { +#if IS_ENABLED(CONFIG_INFINIBAND_USER_ACCESS) UAPI_DEF_CHAIN_OBJ_TREE_NAMED( MLX5_IB_OBJECT_FLOW_MATCHER), UAPI_DEF_CHAIN_OBJ_TREE( @@ -3040,6 +3041,7 @@ const struct uapi_definition mlx5_ib_flow_defs[] = { UAPI_DEF_CHAIN_OBJ_TREE_NAMED( MLX5_IB_OBJECT_STEERING_ANCHOR, UAPI_DEF_IS_OBJ_SUPPORTED(mlx5_ib_shared_ft_allowed)), +#endif {}, }; diff --git a/drivers/infiniband/hw/mlx5/fs.h b/drivers/infiniband/hw/mlx5/fs.h index 0516555eb1c1..2ebe86e5be10 100644 --- a/drivers/infiniband/hw/mlx5/fs.h +++ b/drivers/infiniband/hw/mlx5/fs.h @@ -8,23 +8,8 @@ #include "mlx5_ib.h" -#if IS_ENABLED(CONFIG_INFINIBAND_USER_ACCESS) int mlx5_ib_fs_init(struct mlx5_ib_dev *dev); void mlx5_ib_fs_cleanup_anchor(struct mlx5_ib_dev *dev); -#else -static inline int mlx5_ib_fs_init(struct mlx5_ib_dev *dev) -{ - dev->flow_db = kzalloc(sizeof(*dev->flow_db), GFP_KERNEL); - - if (!dev->flow_db) - return -ENOMEM; - - mutex_init(&dev->flow_db->lock); - return 0; -} - -inline void mlx5_ib_fs_cleanup_anchor(struct mlx5_ib_dev *dev) {} -#endif static inline void mlx5_ib_fs_cleanup(struct mlx5_ib_dev *dev) { -- cgit v1.2.3-59-g8ed1b From fd24c9ef6c8f12fd045524c7cae1992f7967e94b Mon Sep 17 00:00:00 2001 From: Patrisious Haddad Date: Thu, 13 Mar 2025 16:18:46 +0200 Subject: RDMA/mlx5: Support optional-counters binding for QPs Add support for binding optional counters to a QP: when a bind operation is requested, the driver checks the counter's optional-counter binding state to decide whether optional counters should also be added to that QP binding. The optional-counter binding is done by simply adding a steering rule for the specific optional-counter condition, with an additional match on that QP number. Note that per-QP optional-counter rules are handled at an earlier prio than per-device counters, and per-device counter correctness is maintained by core, which is responsible for summing the active counters when the device counter is queried and for adding them to the history count when they are deallocated. 
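As background for the shared-counter handling in this patch: the RDMA TX/RX packets and bytes opcounters of a pair can share one hardware flow counter, because a single flow-counter query already returns both a packet count and a byte count. A loose sketch of that reuse pattern, using hypothetical stand-in types and names rather than the real mlx5 API:

	#include <stdbool.h>

	/* hypothetical stand-ins, for illustration only */
	struct hw_fc { unsigned long long packets, bytes; };
	struct opfc { struct hw_fc *fc; };

	enum { TX_PACKETS, TX_BYTES };

	/*
	 * If the sibling opcounter already owns a hardware counter,
	 * reuse it instead of allocating a second counter and
	 * installing a second steering rule.
	 */
	static bool reuse_sibling_fc(struct opfc *opfcs, int type,
				     struct hw_fc **fc)
	{
		int sibling = (type == TX_PACKETS) ? TX_BYTES : TX_PACKETS;

		if (!opfcs[sibling].fc)
			return false;
		*fc = opfcs[sibling].fc;
		return true;
	}
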
Signed-off-by: Patrisious Haddad Reviewed-by: Mark Bloch Link: https://patch.msgid.link/2cad1b891a6641ae61fe8d92f867e1059121813a.1741875070.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/counters.c | 99 +++++++- drivers/infiniband/hw/mlx5/counters.h | 6 +- drivers/infiniband/hw/mlx5/fs.c | 433 +++++++++++++++++++++++++++++++++- drivers/infiniband/hw/mlx5/mlx5_ib.h | 16 ++ include/linux/mlx5/device.h | 4 +- 5 files changed, 546 insertions(+), 12 deletions(-) (limited to 'drivers') diff --git a/drivers/infiniband/hw/mlx5/counters.c b/drivers/infiniband/hw/mlx5/counters.c index 542f591d73e0..a37242d308ff 100644 --- a/drivers/infiniband/hw/mlx5/counters.c +++ b/drivers/infiniband/hw/mlx5/counters.c @@ -437,12 +437,49 @@ done: static bool is_rdma_bytes_counter(u32 type) { if (type == MLX5_IB_OPCOUNTER_RDMA_TX_BYTES || - type == MLX5_IB_OPCOUNTER_RDMA_RX_BYTES) + type == MLX5_IB_OPCOUNTER_RDMA_RX_BYTES || + type == MLX5_IB_OPCOUNTER_RDMA_TX_BYTES_PER_QP || + type == MLX5_IB_OPCOUNTER_RDMA_RX_BYTES_PER_QP) return true; return false; } +static int do_per_qp_get_op_stat(struct rdma_counter *counter) +{ + struct mlx5_ib_dev *dev = to_mdev(counter->device); + const struct mlx5_ib_counters *cnts = get_counters(dev, counter->port); + struct mlx5_rdma_counter *mcounter = to_mcounter(counter); + int i, ret, index, num_hw_counters; + u64 packets = 0, bytes = 0; + + for (i = MLX5_IB_OPCOUNTER_CC_RX_CE_PKTS_PER_QP; + i <= MLX5_IB_OPCOUNTER_RDMA_RX_BYTES_PER_QP; i++) { + if (!mcounter->fc[i]) + continue; + + ret = mlx5_fc_query(dev->mdev, mcounter->fc[i], + &packets, &bytes); + if (ret) + return ret; + + num_hw_counters = cnts->num_q_counters + + cnts->num_cong_counters + + cnts->num_ext_ppcnt_counters; + + index = i - MLX5_IB_OPCOUNTER_CC_RX_CE_PKTS_PER_QP + + num_hw_counters; + + if (is_rdma_bytes_counter(i)) + counter->stats->value[index] = bytes; + else + counter->stats->value[index] = packets; + + clear_bit(index, counter->stats->is_disabled); + } + return 0; +} + static int do_get_op_stat(struct ib_device *ibdev, struct rdma_hw_stats *stats, u32 port_num, int index) @@ -542,19 +579,30 @@ static int mlx5_ib_counter_update_stats(struct rdma_counter *counter) { struct mlx5_ib_dev *dev = to_mdev(counter->device); const struct mlx5_ib_counters *cnts = get_counters(dev, counter->port); + int ret; + + ret = mlx5_ib_query_q_counters(dev->mdev, cnts, counter->stats, + counter->id); + if (ret) + return ret; + + if (!counter->mode.bind_opcnt) + return 0; - return mlx5_ib_query_q_counters(dev->mdev, cnts, - counter->stats, counter->id); + return do_per_qp_get_op_stat(counter); } static int mlx5_ib_counter_dealloc(struct rdma_counter *counter) { + struct mlx5_rdma_counter *mcounter = to_mcounter(counter); struct mlx5_ib_dev *dev = to_mdev(counter->device); u32 in[MLX5_ST_SZ_DW(dealloc_q_counter_in)] = {}; if (!counter->id) return 0; + WARN_ON(!xa_empty(&mcounter->qpn_opfc_xa)); + mlx5r_fs_destroy_fcs(dev, counter); MLX5_SET(dealloc_q_counter_in, in, opcode, MLX5_CMD_OP_DEALLOC_Q_COUNTER); MLX5_SET(dealloc_q_counter_in, in, counter_set_id, counter->id); @@ -585,8 +633,14 @@ static int mlx5_ib_counter_bind_qp(struct rdma_counter *counter, if (err) goto fail_set_counter; + err = mlx5r_fs_bind_op_fc(qp, counter, port); + if (err) + goto fail_bind_op_fc; + return 0; +fail_bind_op_fc: + mlx5_ib_qp_set_counter(qp, NULL); fail_set_counter: mlx5_ib_counter_dealloc(counter); counter->id = 0; @@ -596,7 +650,20 @@ fail_set_counter: static int mlx5_ib_counter_unbind_qp(struct ib_qp *qp, u32 
port) { - return mlx5_ib_qp_set_counter(qp, NULL); + struct rdma_counter *counter = qp->counter; + int err; + + mlx5r_fs_unbind_op_fc(qp, counter); + + err = mlx5_ib_qp_set_counter(qp, NULL); + if (err) + goto fail_set_counter; + + return 0; + +fail_set_counter: + mlx5r_fs_bind_op_fc(qp, counter, port); + return err; } static void mlx5_ib_fill_counters(struct mlx5_ib_dev *dev, @@ -789,9 +856,8 @@ err: * was already created, if both conditions are met return true and the counter * else return false. */ -static bool mlx5r_is_opfc_shared_and_in_use(struct mlx5_ib_op_fc *opfcs, - u32 type, - struct mlx5_ib_op_fc **opfc) +bool mlx5r_is_opfc_shared_and_in_use(struct mlx5_ib_op_fc *opfcs, u32 type, + struct mlx5_ib_op_fc **opfc) { u32 shared_fc_type; @@ -808,6 +874,18 @@ static bool mlx5r_is_opfc_shared_and_in_use(struct mlx5_ib_op_fc *opfcs, case MLX5_IB_OPCOUNTER_RDMA_RX_BYTES: shared_fc_type = MLX5_IB_OPCOUNTER_RDMA_RX_PACKETS; break; + case MLX5_IB_OPCOUNTER_RDMA_TX_PACKETS_PER_QP: + shared_fc_type = MLX5_IB_OPCOUNTER_RDMA_TX_BYTES_PER_QP; + break; + case MLX5_IB_OPCOUNTER_RDMA_TX_BYTES_PER_QP: + shared_fc_type = MLX5_IB_OPCOUNTER_RDMA_TX_PACKETS_PER_QP; + break; + case MLX5_IB_OPCOUNTER_RDMA_RX_PACKETS_PER_QP: + shared_fc_type = MLX5_IB_OPCOUNTER_RDMA_RX_BYTES_PER_QP; + break; + case MLX5_IB_OPCOUNTER_RDMA_RX_BYTES_PER_QP: + shared_fc_type = MLX5_IB_OPCOUNTER_RDMA_RX_PACKETS_PER_QP; + break; default: return false; } @@ -1104,7 +1182,12 @@ out: return 0; } -static void mlx5_ib_counter_init(struct rdma_counter *counter) {} +static void mlx5_ib_counter_init(struct rdma_counter *counter) +{ + struct mlx5_rdma_counter *mcounter = to_mcounter(counter); + + xa_init(&mcounter->qpn_opfc_xa); +} static const struct ib_device_ops hw_stats_ops = { .alloc_hw_port_stats = mlx5_ib_alloc_hw_port_stats, diff --git a/drivers/infiniband/hw/mlx5/counters.h b/drivers/infiniband/hw/mlx5/counters.h index f153901a43be..bd03cee42014 100644 --- a/drivers/infiniband/hw/mlx5/counters.h +++ b/drivers/infiniband/hw/mlx5/counters.h @@ -8,9 +8,11 @@ #include "mlx5_ib.h" - struct mlx5_rdma_counter { struct rdma_counter rdma_counter; + + struct mlx5_fc *fc[MLX5_IB_OPCOUNTER_MAX]; + struct xarray qpn_opfc_xa; }; static inline struct mlx5_rdma_counter * @@ -25,4 +27,6 @@ void mlx5_ib_counters_clear_description(struct ib_counters *counters); int mlx5_ib_flow_counters_set_data(struct ib_counters *ibcounters, struct mlx5_ib_create_flow *ucmd); u16 mlx5_ib_get_counters_id(struct mlx5_ib_dev *dev, u32 port_num); +bool mlx5r_is_opfc_shared_and_in_use(struct mlx5_ib_op_fc *opfcs, u32 type, + struct mlx5_ib_op_fc **opfc); #endif /* _MLX5_IB_COUNTERS_H */ diff --git a/drivers/infiniband/hw/mlx5/fs.c b/drivers/infiniband/hw/mlx5/fs.c index 71a47821a564..251246c73b33 100644 --- a/drivers/infiniband/hw/mlx5/fs.c +++ b/drivers/infiniband/hw/mlx5/fs.c @@ -33,6 +33,11 @@ enum { MATCH_CRITERIA_ENABLE_MISC2_BIT }; + +struct mlx5_per_qp_opfc { + struct mlx5_ib_op_fc opfcs[MLX5_IB_OPCOUNTER_MAX]; +}; + #define HEADER_IS_ZERO(match_criteria, headers) \ !(memchr_inv(MLX5_ADDR_OF(fte_match_param, match_criteria, headers), \ 0, MLX5_FLD_SZ_BYTES(fte_match_param, headers))) \ @@ -800,12 +805,17 @@ static struct mlx5_ib_flow_prio *get_flow_table(struct mlx5_ib_dev *dev, } enum { + RDMA_RX_ECN_OPCOUNTER_PER_QP_PRIO, + RDMA_RX_CNP_OPCOUNTER_PER_QP_PRIO, + RDMA_RX_PKTS_BYTES_OPCOUNTER_PER_QP_PRIO, RDMA_RX_ECN_OPCOUNTER_PRIO, RDMA_RX_CNP_OPCOUNTER_PRIO, RDMA_RX_PKTS_BYTES_OPCOUNTER_PRIO, }; enum { + RDMA_TX_CNP_OPCOUNTER_PER_QP_PRIO, + 
RDMA_TX_PKTS_BYTES_OPCOUNTER_PER_QP_PRIO, RDMA_TX_CNP_OPCOUNTER_PRIO, RDMA_TX_PKTS_BYTES_OPCOUNTER_PRIO, }; @@ -887,6 +897,12 @@ static struct mlx5_ib_flow_prio *get_opfc_prio(struct mlx5_ib_dev *dev, case MLX5_IB_OPCOUNTER_RDMA_RX_BYTES: prio_type = MLX5_IB_OPCOUNTER_RDMA_RX_PACKETS; break; + case MLX5_IB_OPCOUNTER_RDMA_TX_BYTES_PER_QP: + prio_type = MLX5_IB_OPCOUNTER_RDMA_TX_PACKETS_PER_QP; + break; + case MLX5_IB_OPCOUNTER_RDMA_RX_BYTES_PER_QP: + prio_type = MLX5_IB_OPCOUNTER_RDMA_RX_PACKETS_PER_QP; + break; default: prio_type = type; } @@ -894,6 +910,315 @@ static struct mlx5_ib_flow_prio *get_opfc_prio(struct mlx5_ib_dev *dev, return &dev->flow_db->opfcs[prio_type]; } +static void put_per_qp_prio(struct mlx5_ib_dev *dev, + enum mlx5_ib_optional_counter_type type) +{ + enum mlx5_ib_optional_counter_type per_qp_type; + struct mlx5_ib_flow_prio *prio; + + switch (type) { + case MLX5_IB_OPCOUNTER_CC_RX_CE_PKTS: + per_qp_type = MLX5_IB_OPCOUNTER_CC_RX_CE_PKTS_PER_QP; + break; + case MLX5_IB_OPCOUNTER_CC_RX_CNP_PKTS: + per_qp_type = MLX5_IB_OPCOUNTER_CC_RX_CNP_PKTS_PER_QP; + break; + case MLX5_IB_OPCOUNTER_CC_TX_CNP_PKTS: + per_qp_type = MLX5_IB_OPCOUNTER_CC_TX_CNP_PKTS_PER_QP; + break; + case MLX5_IB_OPCOUNTER_RDMA_TX_PACKETS: + per_qp_type = MLX5_IB_OPCOUNTER_RDMA_TX_PACKETS_PER_QP; + break; + case MLX5_IB_OPCOUNTER_RDMA_TX_BYTES: + per_qp_type = MLX5_IB_OPCOUNTER_RDMA_TX_BYTES_PER_QP; + break; + case MLX5_IB_OPCOUNTER_RDMA_RX_PACKETS: + per_qp_type = MLX5_IB_OPCOUNTER_RDMA_RX_PACKETS_PER_QP; + break; + case MLX5_IB_OPCOUNTER_RDMA_RX_BYTES: + per_qp_type = MLX5_IB_OPCOUNTER_RDMA_RX_BYTES_PER_QP; + break; + default: + return; + } + + prio = get_opfc_prio(dev, per_qp_type); + put_flow_table(dev, prio, true); +} + +static int get_per_qp_prio(struct mlx5_ib_dev *dev, + enum mlx5_ib_optional_counter_type type) +{ + enum mlx5_ib_optional_counter_type per_qp_type; + enum mlx5_flow_namespace_type fn_type; + struct mlx5_flow_namespace *ns; + struct mlx5_ib_flow_prio *prio; + int priority; + + switch (type) { + case MLX5_IB_OPCOUNTER_CC_RX_CE_PKTS: + fn_type = MLX5_FLOW_NAMESPACE_RDMA_RX_COUNTERS; + priority = RDMA_RX_ECN_OPCOUNTER_PER_QP_PRIO; + per_qp_type = MLX5_IB_OPCOUNTER_CC_RX_CE_PKTS_PER_QP; + break; + case MLX5_IB_OPCOUNTER_CC_RX_CNP_PKTS: + fn_type = MLX5_FLOW_NAMESPACE_RDMA_RX_COUNTERS; + priority = RDMA_RX_CNP_OPCOUNTER_PER_QP_PRIO; + per_qp_type = MLX5_IB_OPCOUNTER_CC_RX_CNP_PKTS_PER_QP; + break; + case MLX5_IB_OPCOUNTER_CC_TX_CNP_PKTS: + fn_type = MLX5_FLOW_NAMESPACE_RDMA_TX_COUNTERS; + priority = RDMA_TX_CNP_OPCOUNTER_PER_QP_PRIO; + per_qp_type = MLX5_IB_OPCOUNTER_CC_TX_CNP_PKTS_PER_QP; + break; + case MLX5_IB_OPCOUNTER_RDMA_TX_PACKETS: + fn_type = MLX5_FLOW_NAMESPACE_RDMA_TX_COUNTERS; + priority = RDMA_TX_PKTS_BYTES_OPCOUNTER_PER_QP_PRIO; + per_qp_type = MLX5_IB_OPCOUNTER_RDMA_TX_PACKETS_PER_QP; + break; + case MLX5_IB_OPCOUNTER_RDMA_TX_BYTES: + fn_type = MLX5_FLOW_NAMESPACE_RDMA_TX_COUNTERS; + priority = RDMA_TX_PKTS_BYTES_OPCOUNTER_PER_QP_PRIO; + per_qp_type = MLX5_IB_OPCOUNTER_RDMA_TX_BYTES_PER_QP; + break; + case MLX5_IB_OPCOUNTER_RDMA_RX_PACKETS: + fn_type = MLX5_FLOW_NAMESPACE_RDMA_RX_COUNTERS; + priority = RDMA_RX_PKTS_BYTES_OPCOUNTER_PER_QP_PRIO; + per_qp_type = MLX5_IB_OPCOUNTER_RDMA_RX_PACKETS_PER_QP; + break; + case MLX5_IB_OPCOUNTER_RDMA_RX_BYTES: + fn_type = MLX5_FLOW_NAMESPACE_RDMA_RX_COUNTERS; + priority = RDMA_RX_PKTS_BYTES_OPCOUNTER_PER_QP_PRIO; + per_qp_type = MLX5_IB_OPCOUNTER_RDMA_RX_BYTES_PER_QP; + break; + default: + return -EINVAL; + } + + ns = 
mlx5_get_flow_namespace(dev->mdev, fn_type); + if (!ns) + return -EOPNOTSUPP; + + prio = get_opfc_prio(dev, per_qp_type); + if (prio->flow_table) + return 0; + + prio = _get_prio(dev, ns, prio, priority, MLX5_FS_MAX_POOL_SIZE, 1, 0, 0); + if (IS_ERR(prio)) + return PTR_ERR(prio); + + prio->refcount = 1; + + return 0; +} + +static struct mlx5_per_qp_opfc * +get_per_qp_opfc(struct mlx5_rdma_counter *mcounter, u32 qp_num, bool *new) +{ + struct mlx5_per_qp_opfc *per_qp_opfc; + + *new = false; + + per_qp_opfc = xa_load(&mcounter->qpn_opfc_xa, qp_num); + if (per_qp_opfc) + return per_qp_opfc; + per_qp_opfc = kzalloc(sizeof(*per_qp_opfc), GFP_KERNEL); + + if (!per_qp_opfc) + return NULL; + + *new = true; + return per_qp_opfc; +} + +static int add_op_fc_rules(struct mlx5_ib_dev *dev, + struct mlx5_rdma_counter *mcounter, + struct mlx5_per_qp_opfc *per_qp_opfc, + struct mlx5_ib_flow_prio *prio, + enum mlx5_ib_optional_counter_type type, + u32 qp_num, u32 port_num) +{ + struct mlx5_ib_op_fc *opfc = &per_qp_opfc->opfcs[type], *in_use_opfc; + struct mlx5_flow_act flow_act = {}; + struct mlx5_flow_destination dst; + struct mlx5_flow_spec *spec; + int i, err, spec_num; + bool is_tx; + + if (opfc->fc) + return -EEXIST; + + if (mlx5r_is_opfc_shared_and_in_use(per_qp_opfc->opfcs, type, + &in_use_opfc)) { + opfc->fc = in_use_opfc->fc; + opfc->rule[0] = in_use_opfc->rule[0]; + return 0; + } + + opfc->fc = mcounter->fc[type]; + + spec = kcalloc(MAX_OPFC_RULES, sizeof(*spec), GFP_KERNEL); + if (!spec) { + err = -ENOMEM; + goto null_fc; + } + + switch (type) { + case MLX5_IB_OPCOUNTER_CC_RX_CE_PKTS_PER_QP: + if (set_ecn_ce_spec(dev, port_num, &spec[0], + MLX5_FS_IPV4_VERSION) || + set_ecn_ce_spec(dev, port_num, &spec[1], + MLX5_FS_IPV6_VERSION)) { + err = -EOPNOTSUPP; + goto free_spec; + } + spec_num = 2; + is_tx = false; + + MLX5_SET_TO_ONES(fte_match_param, spec[1].match_criteria, + misc_parameters.bth_dst_qp); + MLX5_SET(fte_match_param, spec[1].match_value, + misc_parameters.bth_dst_qp, qp_num); + spec[1].match_criteria_enable |= MLX5_MATCH_MISC_PARAMETERS; + break; + case MLX5_IB_OPCOUNTER_CC_RX_CNP_PKTS_PER_QP: + if (!MLX5_CAP_FLOWTABLE( + dev->mdev, + ft_field_support_2_nic_receive_rdma.bth_opcode) || + set_cnp_spec(dev, port_num, &spec[0])) { + err = -EOPNOTSUPP; + goto free_spec; + } + spec_num = 1; + is_tx = false; + break; + case MLX5_IB_OPCOUNTER_CC_TX_CNP_PKTS_PER_QP: + if (!MLX5_CAP_FLOWTABLE( + dev->mdev, + ft_field_support_2_nic_transmit_rdma.bth_opcode) || + set_cnp_spec(dev, port_num, &spec[0])) { + err = -EOPNOTSUPP; + goto free_spec; + } + spec_num = 1; + is_tx = true; + break; + case MLX5_IB_OPCOUNTER_RDMA_TX_PACKETS_PER_QP: + case MLX5_IB_OPCOUNTER_RDMA_TX_BYTES_PER_QP: + spec_num = 1; + is_tx = true; + break; + case MLX5_IB_OPCOUNTER_RDMA_RX_PACKETS_PER_QP: + case MLX5_IB_OPCOUNTER_RDMA_RX_BYTES_PER_QP: + spec_num = 1; + is_tx = false; + break; + default: + err = -EINVAL; + goto free_spec; + } + + if (is_tx) { + MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, + misc_parameters.source_sqn); + MLX5_SET(fte_match_param, spec->match_value, + misc_parameters.source_sqn, qp_num); + } else { + MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, + misc_parameters.bth_dst_qp); + MLX5_SET(fte_match_param, spec->match_value, + misc_parameters.bth_dst_qp, qp_num); + } + + spec->match_criteria_enable |= MLX5_MATCH_MISC_PARAMETERS; + + dst.type = MLX5_FLOW_DESTINATION_TYPE_COUNTER; + dst.counter = opfc->fc; + + flow_act.action = + MLX5_FLOW_CONTEXT_ACTION_COUNT | 
MLX5_FLOW_CONTEXT_ACTION_ALLOW; + + for (i = 0; i < spec_num; i++) { + opfc->rule[i] = mlx5_add_flow_rules(prio->flow_table, &spec[i], + &flow_act, &dst, 1); + if (IS_ERR(opfc->rule[i])) { + err = PTR_ERR(opfc->rule[i]); + goto del_rules; + } + } + prio->refcount += spec_num; + + err = xa_err(xa_store(&mcounter->qpn_opfc_xa, qp_num, per_qp_opfc, + GFP_KERNEL)); + if (err) + goto del_rules; + + kfree(spec); + + return 0; + +del_rules: + while (i--) + mlx5_del_flow_rules(opfc->rule[i]); + put_flow_table(dev, prio, false); +free_spec: + kfree(spec); +null_fc: + opfc->fc = NULL; + return err; +} + +static bool is_fc_shared_and_in_use(struct mlx5_rdma_counter *mcounter, + u32 type, struct mlx5_fc **fc) +{ + u32 shared_fc_type; + + switch (type) { + case MLX5_IB_OPCOUNTER_RDMA_TX_PACKETS_PER_QP: + shared_fc_type = MLX5_IB_OPCOUNTER_RDMA_TX_BYTES_PER_QP; + break; + case MLX5_IB_OPCOUNTER_RDMA_TX_BYTES_PER_QP: + shared_fc_type = MLX5_IB_OPCOUNTER_RDMA_TX_PACKETS_PER_QP; + break; + case MLX5_IB_OPCOUNTER_RDMA_RX_PACKETS_PER_QP: + shared_fc_type = MLX5_IB_OPCOUNTER_RDMA_RX_BYTES_PER_QP; + break; + case MLX5_IB_OPCOUNTER_RDMA_RX_BYTES_PER_QP: + shared_fc_type = MLX5_IB_OPCOUNTER_RDMA_RX_PACKETS_PER_QP; + break; + default: + return false; + } + + *fc = mcounter->fc[shared_fc_type]; + if (!(*fc)) + return false; + + return true; +} + +void mlx5r_fs_destroy_fcs(struct mlx5_ib_dev *dev, + struct rdma_counter *counter) +{ + struct mlx5_rdma_counter *mcounter = to_mcounter(counter); + struct mlx5_fc *in_use_fc; + int i; + + for (i = MLX5_IB_OPCOUNTER_CC_RX_CE_PKTS_PER_QP; + i <= MLX5_IB_OPCOUNTER_RDMA_RX_BYTES_PER_QP; i++) { + if (!mcounter->fc[i]) + continue; + + if (is_fc_shared_and_in_use(mcounter, i, &in_use_fc)) { + mcounter->fc[i] = NULL; + continue; + } + + mlx5_fc_destroy(dev->mdev, mcounter->fc[i]); + mcounter->fc[i] = NULL; + } +} + int mlx5_ib_fs_add_op_fc(struct mlx5_ib_dev *dev, u32 port_num, struct mlx5_ib_op_fc *opfc, enum mlx5_ib_optional_counter_type type) @@ -975,11 +1300,15 @@ int mlx5_ib_fs_add_op_fc(struct mlx5_ib_dev *dev, u32 port_num, prio = get_opfc_prio(dev, type); if (!prio->flow_table) { + err = get_per_qp_prio(dev, type); + if (err) + goto free; + prio = _get_prio(dev, ns, prio, priority, dev->num_ports * MAX_OPFC_RULES, 1, 0, 0); if (IS_ERR(prio)) { err = PTR_ERR(prio); - goto free; + goto put_prio; } } @@ -1006,6 +1335,8 @@ del_rules: for (i -= 1; i >= 0; i--) mlx5_del_flow_rules(opfc->rule[i]); put_flow_table(dev, prio, false); +put_prio: + put_per_qp_prio(dev, type); free: kfree(spec); return err; @@ -1024,6 +1355,106 @@ void mlx5_ib_fs_remove_op_fc(struct mlx5_ib_dev *dev, mlx5_del_flow_rules(opfc->rule[i]); put_flow_table(dev, prio, true); } + + put_per_qp_prio(dev, type); +} + +void mlx5r_fs_unbind_op_fc(struct ib_qp *qp, struct rdma_counter *counter) +{ + struct mlx5_rdma_counter *mcounter = to_mcounter(counter); + struct mlx5_ib_dev *dev = to_mdev(counter->device); + struct mlx5_per_qp_opfc *per_qp_opfc; + struct mlx5_ib_op_fc *in_use_opfc; + struct mlx5_ib_flow_prio *prio; + int i, j; + + per_qp_opfc = xa_load(&mcounter->qpn_opfc_xa, qp->qp_num); + if (!per_qp_opfc) + return; + + for (i = MLX5_IB_OPCOUNTER_CC_RX_CE_PKTS_PER_QP; + i <= MLX5_IB_OPCOUNTER_RDMA_RX_BYTES_PER_QP; i++) { + if (!per_qp_opfc->opfcs[i].fc) + continue; + + if (mlx5r_is_opfc_shared_and_in_use(per_qp_opfc->opfcs, i, + &in_use_opfc)) { + per_qp_opfc->opfcs[i].fc = NULL; + continue; + } + + for (j = 0; j < MAX_OPFC_RULES; j++) { + if (!per_qp_opfc->opfcs[i].rule[j]) + continue; + 
mlx5_del_flow_rules(per_qp_opfc->opfcs[i].rule[j]); + prio = get_opfc_prio(dev, i); + put_flow_table(dev, prio, true); + } + per_qp_opfc->opfcs[i].fc = NULL; + } + + kfree(per_qp_opfc); + xa_erase(&mcounter->qpn_opfc_xa, qp->qp_num); +} + +int mlx5r_fs_bind_op_fc(struct ib_qp *qp, struct rdma_counter *counter, + u32 port) +{ + struct mlx5_rdma_counter *mcounter = to_mcounter(counter); + struct mlx5_ib_dev *dev = to_mdev(qp->device); + struct mlx5_per_qp_opfc *per_qp_opfc; + struct mlx5_ib_flow_prio *prio; + struct mlx5_ib_counters *cnts; + struct mlx5_ib_op_fc *opfc; + struct mlx5_fc *in_use_fc; + int i, err, per_qp_type; + bool new; + + if (!counter->mode.bind_opcnt) + return 0; + + cnts = &dev->port[port - 1].cnts; + + for (i = 0; i <= MLX5_IB_OPCOUNTER_RDMA_RX_BYTES; i++) { + opfc = &cnts->opfcs[i]; + if (!opfc->fc) + continue; + + per_qp_type = i + MLX5_IB_OPCOUNTER_CC_RX_CE_PKTS_PER_QP; + prio = get_opfc_prio(dev, per_qp_type); + WARN_ON(!prio->flow_table); + + if (is_fc_shared_and_in_use(mcounter, per_qp_type, &in_use_fc)) + mcounter->fc[per_qp_type] = in_use_fc; + + if (!mcounter->fc[per_qp_type]) { + mcounter->fc[per_qp_type] = mlx5_fc_create(dev->mdev, + false); + if (IS_ERR(mcounter->fc[per_qp_type])) + return PTR_ERR(mcounter->fc[per_qp_type]); + } + + per_qp_opfc = get_per_qp_opfc(mcounter, qp->qp_num, &new); + if (!per_qp_opfc) { + err = -ENOMEM; + goto free_fc; + } + err = add_op_fc_rules(dev, mcounter, per_qp_opfc, prio, + per_qp_type, qp->qp_num, port); + if (err) + goto del_rules; + } + + return 0; + +del_rules: + mlx5r_fs_unbind_op_fc(qp, counter); + if (new) + kfree(per_qp_opfc); +free_fc: + if (xa_empty(&mcounter->qpn_opfc_xa)) + mlx5r_fs_destroy_fcs(dev, counter); + return err; } static void set_underlay_qp(struct mlx5_ib_dev *dev, diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index 26397eb08684..ace2df3e1d9f 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -299,6 +299,14 @@ enum mlx5_ib_optional_counter_type { MLX5_IB_OPCOUNTER_RDMA_RX_PACKETS, MLX5_IB_OPCOUNTER_RDMA_RX_BYTES, + MLX5_IB_OPCOUNTER_CC_RX_CE_PKTS_PER_QP, + MLX5_IB_OPCOUNTER_CC_RX_CNP_PKTS_PER_QP, + MLX5_IB_OPCOUNTER_CC_TX_CNP_PKTS_PER_QP, + MLX5_IB_OPCOUNTER_RDMA_TX_PACKETS_PER_QP, + MLX5_IB_OPCOUNTER_RDMA_TX_BYTES_PER_QP, + MLX5_IB_OPCOUNTER_RDMA_RX_PACKETS_PER_QP, + MLX5_IB_OPCOUNTER_RDMA_RX_BYTES_PER_QP, + MLX5_IB_OPCOUNTER_MAX, }; @@ -890,6 +898,14 @@ void mlx5_ib_fs_remove_op_fc(struct mlx5_ib_dev *dev, struct mlx5_ib_op_fc *opfc, enum mlx5_ib_optional_counter_type type); +int mlx5r_fs_bind_op_fc(struct ib_qp *qp, struct rdma_counter *counter, + u32 port); + +void mlx5r_fs_unbind_op_fc(struct ib_qp *qp, struct rdma_counter *counter); + +void mlx5r_fs_destroy_fcs(struct mlx5_ib_dev *dev, + struct rdma_counter *counter); + struct mlx5_ib_multiport_info; struct mlx5_ib_multiport { diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 63f0d9fb94b4..344124644697 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -1532,8 +1532,8 @@ static inline u16 mlx5_to_sw_pkey_sz(int pkey_sz) return MLX5_MIN_PKEY_TABLE_SIZE << pkey_sz; } -#define MLX5_RDMA_RX_NUM_COUNTERS_PRIOS 3 -#define MLX5_RDMA_TX_NUM_COUNTERS_PRIOS 2 +#define MLX5_RDMA_RX_NUM_COUNTERS_PRIOS 6 +#define MLX5_RDMA_TX_NUM_COUNTERS_PRIOS 4 #define MLX5_BY_PASS_NUM_REGULAR_PRIOS 16 #define MLX5_BY_PASS_NUM_DONT_TRAP_PRIOS 16 #define MLX5_BY_PASS_NUM_MULTICAST_PRIOS 1 -- cgit v1.2.3-59-g8ed1b From 
a0130ef84b00c68ba0b79ee974a0f01459741421 Mon Sep 17 00:00:00 2001 From: Michael Guralnik Date: Thu, 13 Mar 2025 16:29:48 +0200 Subject: RDMA/mlx5: Fix MR cache initialization error flow Destroy all previously created cache entries and work queue when rolling back the MR cache initialization upon an error. Fixes: 73d09b2fe833 ("RDMA/mlx5: Introduce mlx5r_cache_rb_key") Signed-off-by: Michael Guralnik Reviewed-by: Yishai Hadas Link: https://patch.msgid.link/c41d525fb3c72e28dd38511bf3aaccb5d584063e.1741875692.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/mr.c | 33 ++++++++++++++++++++++----------- 1 file changed, 22 insertions(+), 11 deletions(-) (limited to 'drivers') diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index bb02b6adbf2c..1ffa4b3d0f76 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -919,6 +919,25 @@ mkeys_err: return ERR_PTR(ret); } +static void mlx5r_destroy_cache_entries(struct mlx5_ib_dev *dev) +{ + struct rb_root *root = &dev->cache.rb_root; + struct mlx5_cache_ent *ent; + struct rb_node *node; + + mutex_lock(&dev->cache.rb_lock); + node = rb_first(root); + while (node) { + ent = rb_entry(node, struct mlx5_cache_ent, node); + node = rb_next(node); + clean_keys(dev, ent); + rb_erase(&ent->node, root); + mlx5r_mkeys_uninit(ent); + kfree(ent); + } + mutex_unlock(&dev->cache.rb_lock); +} + int mlx5_mkey_cache_init(struct mlx5_ib_dev *dev) { struct mlx5_mkey_cache *cache = &dev->cache; @@ -970,6 +989,8 @@ int mlx5_mkey_cache_init(struct mlx5_ib_dev *dev) err: mutex_unlock(&cache->rb_lock); mlx5_mkey_cache_debugfs_cleanup(dev); + mlx5r_destroy_cache_entries(dev); + destroy_workqueue(cache->wq); mlx5_ib_warn(dev, "failed to create mkey cache entry\n"); return ret; } @@ -1003,17 +1024,7 @@ void mlx5_mkey_cache_cleanup(struct mlx5_ib_dev *dev) mlx5_cmd_cleanup_async_ctx(&dev->async_ctx); /* At this point all entries are disabled and have no concurrent work. */ - mutex_lock(&dev->cache.rb_lock); - node = rb_first(root); - while (node) { - ent = rb_entry(node, struct mlx5_cache_ent, node); - node = rb_next(node); - clean_keys(dev, ent); - rb_erase(&ent->node, root); - mlx5r_mkeys_uninit(ent); - kfree(ent); - } - mutex_unlock(&dev->cache.rb_lock); + mlx5r_destroy_cache_entries(dev); destroy_workqueue(dev->cache.wq); del_timer_sync(&dev->delay_timer); -- cgit v1.2.3-59-g8ed1b From 24d693cf6c89d216a68634d44fa93e4400775d94 Mon Sep 17 00:00:00 2001 From: Michael Guralnik Date: Thu, 13 Mar 2025 16:29:49 +0200 Subject: RDMA/mlx5: Fix cache entry update on dereg error Fix double decrement of 'in_use' counter on push_mkey_locked() failure while deregistering an MR. If we fail to return an mkey to the cache in cache_ent_find_and_store() it'll update the 'in_use' counter. Its caller, revoke_mr(), also updates it, thus having double decrement. Wrong value of 'in_use' counter will be exposed through debugfs and can also cause wrong resizing of the cache when users try to set cache entry size using the 'size' debugfs. To address this issue, the 'in_use' counter is now decremented within mlx5_revoke_mr() also after a successful call to cache_ent_find_and_store() and not within cache_ent_find_and_store(). Other success or failure flows remains unchanged where it was also decremented. 
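For clarity, a minimal stand-alone distillation of the double decrement described above (hypothetical helpers, not the actual driver flow):

	#include <stdio.h>

	struct ent { int in_use; };

	/* stand-in for the callee: decremented 'in_use' itself */
	static int find_and_store(struct ent *e, int fail)
	{
		e->in_use--;		/* first decrement */
		return fail ? -1 : 0;
	}

	/* stand-in for the caller's failure flow before the fix */
	static void revoke(struct ent *e, int fail)
	{
		if (find_and_store(e, fail))
			e->in_use--;	/* second decrement on failure */
	}

	int main(void)
	{
		struct ent e = { .in_use = 1 };

		revoke(&e, 1);
		printf("in_use = %d\n", e.in_use);	/* -1, not 0 */
		return 0;
	}

The fix keeps the decrement at a single level, in the caller, so each deregistration accounts for the mkey exactly once.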
Fixes: 8c1185fef68c ("RDMA/mlx5: Change check for cacheable mkeys") Signed-off-by: Michael Guralnik Reviewed-by: Yishai Hadas Link: https://patch.msgid.link/97e979dff636f232ff4c83ce709c17c727da1fdb.1741875692.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/mr.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index 1ffa4b3d0f76..c88016630f9d 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -1967,7 +1967,6 @@ static int cache_ent_find_and_store(struct mlx5_ib_dev *dev, if (mr->mmkey.cache_ent) { spin_lock_irq(&mr->mmkey.cache_ent->mkeys_queue.lock); - mr->mmkey.cache_ent->in_use--; goto end; } @@ -2033,6 +2032,7 @@ static int mlx5_revoke_mr(struct mlx5_ib_mr *mr) struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device); struct mlx5_cache_ent *ent = mr->mmkey.cache_ent; bool is_odp = is_odp_mr(mr); + bool from_cache = !!ent; int ret = 0; if (is_odp) @@ -2042,6 +2042,8 @@ static int mlx5_revoke_mr(struct mlx5_ib_mr *mr) ent = mr->mmkey.cache_ent; /* upon storing to a clean temp entry - schedule its cleanup */ spin_lock_irq(&ent->mkeys_queue.lock); + if (from_cache) + ent->in_use--; if (ent->is_tmp && !ent->tmp_cleanup_scheduled) { mod_delayed_work(ent->dev->cache.wq, &ent->dwork, msecs_to_jiffies(30 * 1000)); -- cgit v1.2.3-59-g8ed1b From 9a68356c309a37aebb2442ada872a38dfce17645 Mon Sep 17 00:00:00 2001 From: Michael Guralnik Date: Thu, 13 Mar 2025 16:29:50 +0200 Subject: RDMA/mlx5: Drop access_flags from _mlx5_mr_cache_alloc() Drop the unused access_flags parameter. Signed-off-by: Michael Guralnik Reviewed-by: Yishai Hadas Link: https://patch.msgid.link/4d769c51eb012c62b3a92fd916b7886c25b56fbf.1741875692.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/mr.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'drivers') diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index c88016630f9d..58486f79c5e5 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -718,8 +718,7 @@ mkey_cache_ent_from_rb_key(struct mlx5_ib_dev *dev, } static struct mlx5_ib_mr *_mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev, - struct mlx5_cache_ent *ent, - int access_flags) + struct mlx5_cache_ent *ent) { struct mlx5_ib_mr *mr; int err; @@ -794,7 +793,7 @@ struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev, if (!ent) return ERR_PTR(-EOPNOTSUPP); - return _mlx5_mr_cache_alloc(dev, ent, access_flags); + return _mlx5_mr_cache_alloc(dev, ent); } static void mlx5_mkey_cache_debugfs_cleanup(struct mlx5_ib_dev *dev) @@ -1155,7 +1154,7 @@ static struct mlx5_ib_mr *alloc_cacheable_mr(struct ib_pd *pd, return mr; } - mr = _mlx5_mr_cache_alloc(dev, ent, access_flags); + mr = _mlx5_mr_cache_alloc(dev, ent); if (IS_ERR(mr)) return mr; -- cgit v1.2.3-59-g8ed1b From f0c2427412b43cdf1b7b0944749ea17ddb97d5a5 Mon Sep 17 00:00:00 2001 From: Michael Guralnik Date: Thu, 13 Mar 2025 16:29:51 +0200 Subject: RDMA/mlx5: Fix page_size variable overflow Change all variables storing mlx5_umem_mkc_find_best_pgsz() result to unsigned long to support values larger than 31 and avoid overflow. For example: If we try to register 4GB of memory that is contiguous in physical memory, the driver will optimize the page_size and try to use an mkey with 4GB entity size. 
The 'unsigned int' page_size variable will overflow to '0' and we'll hit the WARN_ON() in alloc_cacheable_mr(). WARNING: CPU: 2 PID: 1203 at drivers/infiniband/hw/mlx5/mr.c:1124 alloc_cacheable_mr+0x22/0x580 [mlx5_ib] Modules linked in: mlx5_ib mlx5_core bonding ip6_gre ip6_tunnel tunnel6 ip_gre gre rdma_rxe rdma_ucm ib_uverbs ib_ipoib ib_umad rpcrdma ib_iser libiscsi scsi_transport_iscsi rdma_cm iw_cm ib_cm fuse ib_core [last unloaded: mlx5_core] CPU: 2 UID: 70878 PID: 1203 Comm: rdma_resource_l Tainted: G W 6.14.0-rc4-dirty #43 Tainted: [W]=WARN Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 RIP: 0010:alloc_cacheable_mr+0x22/0x580 [mlx5_ib] Code: 90 90 90 90 90 90 90 90 0f 1f 44 00 00 55 48 89 e5 41 57 41 56 41 55 41 54 41 52 53 48 83 ec 30 f6 46 28 04 4c 8b 77 08 75 21 <0f> 0b 49 c7 c2 ea ff ff ff 48 8d 65 d0 4c 89 d0 5b 41 5a 41 5c 41 RSP: 0018:ffffc900006ffac8 EFLAGS: 00010246 RAX: 0000000004c0d0d0 RBX: ffff888217a22000 RCX: 0000000000100001 RDX: 00007fb7ac480000 RSI: ffff8882037b1240 RDI: ffff8882046f0600 RBP: ffffc900006ffb28 R08: 0000000000000001 R09: 0000000000000000 R10: 00000000000007e0 R11: ffffea0008011d40 R12: ffff8882037b1240 R13: ffff8882046f0600 R14: ffff888217a22000 R15: ffffc900006ffe00 FS: 00007fb7ed013340(0000) GS:ffff88885fd00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007fb7ed1d8000 CR3: 00000001fd8f6006 CR4: 0000000000772eb0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 PKRU: 55555554 Call Trace: ? __warn+0x81/0x130 ? alloc_cacheable_mr+0x22/0x580 [mlx5_ib] ? report_bug+0xfc/0x1e0 ? handle_bug+0x55/0x90 ? exc_invalid_op+0x17/0x70 ? asm_exc_invalid_op+0x1a/0x20 ? alloc_cacheable_mr+0x22/0x580 [mlx5_ib] create_real_mr+0x54/0x150 [mlx5_ib] ib_uverbs_reg_mr+0x17f/0x2a0 [ib_uverbs] ib_uverbs_handler_UVERBS_METHOD_INVOKE_WRITE+0xca/0x140 [ib_uverbs] ib_uverbs_run_method+0x6d0/0x780 [ib_uverbs] ? __pfx_ib_uverbs_handler_UVERBS_METHOD_INVOKE_WRITE+0x10/0x10 [ib_uverbs] ib_uverbs_cmd_verbs+0x19b/0x360 [ib_uverbs] ? walk_system_ram_range+0x79/0xd0 ? ___pte_offset_map+0x1b/0x110 ? 
__pte_offset_map_lock+0x80/0x100 ib_uverbs_ioctl+0xac/0x110 [ib_uverbs] __x64_sys_ioctl+0x94/0xb0 do_syscall_64+0x50/0x110 entry_SYSCALL_64_after_hwframe+0x76/0x7e RIP: 0033:0x7fb7ecf0737b Code: ff ff ff 85 c0 79 9b 49 c7 c4 ff ff ff ff 5b 5d 4c 89 e0 41 5c c3 66 0f 1f 84 00 00 00 00 00 f3 0f 1e fa b8 10 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 7d 2a 0f 00 f7 d8 64 89 01 48 RSP: 002b:00007ffdbe03ecc8 EFLAGS: 00000246 ORIG_RAX: 0000000000000010 RAX: ffffffffffffffda RBX: 00007ffdbe03edb8 RCX: 00007fb7ecf0737b RDX: 00007ffdbe03eda0 RSI: 00000000c0181b01 RDI: 0000000000000003 RBP: 00007ffdbe03ed80 R08: 00007fb7ecc84010 R09: 00007ffdbe03eed4 R10: 0000000000000009 R11: 0000000000000246 R12: 00007ffdbe03eed4 R13: 000000000000000c R14: 000000000000000c R15: 00007fb7ecc84150 Fixes: cef7dde8836a ("net/mlx5: Expand mkey page size to support 6 bits") Signed-off-by: Michael Guralnik Reviewed-by: Yishai Hadas Link: https://patch.msgid.link/2479a4a3f6fd9bd032e1b6d396274a89c4c5e22f.1741875692.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/mr.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'drivers') diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index 58486f79c5e5..2080458cabd1 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -56,7 +56,7 @@ static void create_mkey_callback(int status, struct mlx5_async_work *context); static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem, u64 iova, int access_flags, - unsigned int page_size, bool populate, + unsigned long page_size, bool populate, int access_mode); static int __mlx5_ib_dereg_mr(struct ib_mr *ibmr); @@ -1125,7 +1125,7 @@ static struct mlx5_ib_mr *alloc_cacheable_mr(struct ib_pd *pd, struct mlx5r_cache_rb_key rb_key = {}; struct mlx5_cache_ent *ent; struct mlx5_ib_mr *mr; - unsigned int page_size; + unsigned long page_size; if (umem->is_dmabuf) page_size = mlx5_umem_dmabuf_default_pgsz(umem, iova); @@ -1229,7 +1229,7 @@ err_1: */ static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem, u64 iova, int access_flags, - unsigned int page_size, bool populate, + unsigned long page_size, bool populate, int access_mode) { struct mlx5_ib_dev *dev = to_mdev(pd->device); @@ -1435,7 +1435,7 @@ static struct ib_mr *create_real_mr(struct ib_pd *pd, struct ib_umem *umem, mr = alloc_cacheable_mr(pd, umem, iova, access_flags, MLX5_MKC_ACCESS_MODE_MTT); } else { - unsigned int page_size = + unsigned long page_size = mlx5_umem_mkc_find_best_pgsz(dev, umem, iova); mutex_lock(&dev->slow_path_mutex); -- cgit v1.2.3-59-g8ed1b From 5ed3b0cb3f827072e93b4c5b6e2b8106fd7cccbd Mon Sep 17 00:00:00 2001 From: Patrisious Haddad Date: Thu, 13 Mar 2025 16:29:53 +0200 Subject: RDMA/mlx5: Fix mlx5_poll_one() cur_qp update flow When cur_qp isn't NULL, in order to avoid fetching the QP from the radix tree again we check if the next cqe QP is identical to the one we already have. The bug however is that we are checking if the QP is identical by checking the QP number inside the CQE against the QP number inside the mlx5_ib_qp, but that's wrong since the QP number from the CQE is from FW so it should be matched against mlx5_core_qp which is our FW QP number. Otherwise we could use the wrong QP when handling a CQE which could cause the kernel trace below. 
This issue is mainly noticeable over QPs 0 & 1, since for now they are the only QPs in our driver whereas the QP number inside mlx5_ib_qp doesn't match the QP number inside mlx5_core_qp. BUG: kernel NULL pointer dereference, address: 0000000000000012 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 0 P4D 0 Oops: Oops: 0000 [#1] SMP CPU: 0 UID: 0 PID: 7927 Comm: kworker/u62:1 Not tainted 6.14.0-rc3+ #189 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.16.3-0-ga6ed6b701f0a-prebuilt.qemu.org 04/01/2014 Workqueue: ib-comp-unb-wq ib_cq_poll_work [ib_core] RIP: 0010:mlx5_ib_poll_cq+0x4c7/0xd90 [mlx5_ib] Code: 03 00 00 8d 58 ff 21 cb 66 39 d3 74 39 48 c7 c7 3c 89 6e a0 0f b7 db e8 b7 d2 b3 e0 49 8b 86 60 03 00 00 48 c7 c7 4a 89 6e a0 <0f> b7 5c 98 02 e8 9f d2 b3 e0 41 0f b7 86 78 03 00 00 83 e8 01 21 RSP: 0018:ffff88810511bd60 EFLAGS: 00010046 RAX: 0000000000000010 RBX: 0000000000000000 RCX: 0000000000000000 RDX: 0000000000000000 RSI: ffff88885fa1b3c0 RDI: ffffffffa06e894a RBP: 00000000000000b0 R08: 0000000000000000 R09: ffff88810511bc10 R10: 0000000000000001 R11: 0000000000000001 R12: ffff88810d593000 R13: ffff88810e579108 R14: ffff888105146000 R15: 00000000000000b0 FS: 0000000000000000(0000) GS:ffff88885fa00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000012 CR3: 00000001077e6001 CR4: 0000000000370eb0 Call Trace: ? __die+0x20/0x60 ? page_fault_oops+0x150/0x3e0 ? exc_page_fault+0x74/0x130 ? asm_exc_page_fault+0x22/0x30 ? mlx5_ib_poll_cq+0x4c7/0xd90 [mlx5_ib] __ib_process_cq+0x5a/0x150 [ib_core] ib_cq_poll_work+0x31/0x90 [ib_core] process_one_work+0x169/0x320 worker_thread+0x288/0x3a0 ? work_busy+0xb0/0xb0 kthread+0xd7/0x1f0 ? kthreads_online_cpu+0x130/0x130 ? kthreads_online_cpu+0x130/0x130 ret_from_fork+0x2d/0x50 ? kthreads_online_cpu+0x130/0x130 ret_from_fork_asm+0x11/0x20 Fixes: e126ba97dba9 ("mlx5: Add driver for Mellanox Connect-IB adapters") Signed-off-by: Patrisious Haddad Reviewed-by: Edward Srouji Link: https://patch.msgid.link/4ada09d41f1e36db62c44a9b25c209ea5f054316.1741875692.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/cq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c index 4c54dc578069..1aa5311b03e9 100644 --- a/drivers/infiniband/hw/mlx5/cq.c +++ b/drivers/infiniband/hw/mlx5/cq.c @@ -490,7 +490,7 @@ repoll: } qpn = ntohl(cqe64->sop_drop_qpn) & 0xffffff; - if (!*cur_qp || (qpn != (*cur_qp)->ibqp.qp_num)) { + if (!*cur_qp || (qpn != (*cur_qp)->trans_qp.base.mqp.qpn)) { /* We do not have to take the QP table lock here, * because CQs will be locked while QPs are removed * from the table. -- cgit v1.2.3-59-g8ed1b From 79195147644653ebffadece31a42181e4c48c07d Mon Sep 17 00:00:00 2001 From: Chiara Meiohas Date: Thu, 13 Mar 2025 16:29:54 +0200 Subject: RDMA/mlx5: Fix calculation of total invalidated pages When invalidating an address range in mlx5, there is an optimization to do UMR operations in chunks. Previously, the invalidation counter was incorrectly updated for the same indexes within a chunk. Now, the invalidation counter is updated only when a chunk is complete and mlx5r_umr_update_xlt() is called. This ensures that the counter accurately represents the number of pages invalidated using UMR. 
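The corrected counting rule can be distilled as follows (a self-contained sketch over a present/absent page map, simplified to contiguous runs; not the driver code):

	/*
	 * Count invalidated pages, adding to the total only when a
	 * contiguous chunk completes -- never per page inside a chunk.
	 */
	static unsigned long count_invalidations(const unsigned char *present,
						 unsigned long npages)
	{
		unsigned long invalidations = 0, idx, blk_start_idx = 0;
		int in_block = 0;

		for (idx = 0; idx < npages; idx++) {
			if (present[idx]) {
				if (!in_block) {
					blk_start_idx = idx;
					in_block = 1;
				}
			} else if (in_block) {
				/* chunk complete: count it exactly once */
				invalidations += idx - blk_start_idx;
				in_block = 0;
			}
		}
		if (in_block)	/* trailing chunk */
			invalidations += npages - blk_start_idx;

		return invalidations;
	}
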
Fixes: a3de94e3d61e ("IB/mlx5: Introduce ODP diagnostic counters") Signed-off-by: Chiara Meiohas Reviewed-by: Michael Guralnik Link: https://patch.msgid.link/560deb2433318e5947282b070c915f3c81fef77f.1741875692.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/odp.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'drivers') diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c index f1e23583e6c0..de9683344f5e 100644 --- a/drivers/infiniband/hw/mlx5/odp.c +++ b/drivers/infiniband/hw/mlx5/odp.c @@ -308,9 +308,6 @@ static bool mlx5_ib_invalidate_range(struct mmu_interval_notifier *mni, blk_start_idx = idx; in_block = 1; } - - /* Count page invalidations */ - invalidations += idx - blk_start_idx + 1; } else { u64 umr_offset = idx & umr_block_mask; @@ -320,14 +317,19 @@ static bool mlx5_ib_invalidate_range(struct mmu_interval_notifier *mni, MLX5_IB_UPD_XLT_ZAP | MLX5_IB_UPD_XLT_ATOMIC); in_block = 0; + /* Count page invalidations */ + invalidations += idx - blk_start_idx + 1; } } } - if (in_block) + if (in_block) { mlx5r_umr_update_xlt(mr, blk_start_idx, idx - blk_start_idx + 1, 0, MLX5_IB_UPD_XLT_ZAP | MLX5_IB_UPD_XLT_ATOMIC); + /* Count page invalidations */ + invalidations += idx - blk_start_idx + 1; + } mlx5_update_odp_stats_with_handled(mr, invalidations, invalidations); -- cgit v1.2.3-59-g8ed1b From 0c55174524227a174c1a367889cd704212d15b45 Mon Sep 17 00:00:00 2001 From: Konstantin Taranov Date: Tue, 18 Mar 2025 08:45:44 -0700 Subject: RDMA/mana_ib: Fix integer overflow during queue creation Check the queue size during CQ creation for users, to prevent a u32 overflow. Fixes: bec127e45d9f ("RDMA/mana_ib: create kernel-level CQs") Signed-off-by: Konstantin Taranov Link: https://patch.msgid.link/1742312744-14370-1-git-send-email-kotaranov@linux.microsoft.com Reviewed-by: Long Li Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mana/cq.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/infiniband/hw/mana/cq.c b/drivers/infiniband/hw/mana/cq.c index 5c325ef4ac56..0fc4e2679218 100644 --- a/drivers/infiniband/hw/mana/cq.c +++ b/drivers/infiniband/hw/mana/cq.c @@ -39,7 +39,8 @@ int mana_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, is_rnic_cq = !!(ucmd.flags & MANA_IB_CREATE_RNIC_CQ); - if (!is_rnic_cq && attr->cqe > mdev->adapter_caps.max_qp_wr) { + if ((!is_rnic_cq && attr->cqe > mdev->adapter_caps.max_qp_wr) || + attr->cqe > U32_MAX / COMP_ENTRY_SIZE) { ibdev_dbg(ibdev, "CQE %d exceeding limit\n", attr->cqe); return -EINVAL; } -- cgit v1.2.3-59-g8ed1b From 37826f0a8c2f6b6add5179003b8597e32a445362 Mon Sep 17 00:00:00 2001 From: Maher Sanalla Date: Thu, 13 Mar 2025 16:20:17 +0200 Subject: IB/mad: Check available slots before posting receive WRs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The ib_post_receive_mads() function handles posting receive work requests (WRs) to MAD QPs and is called in two cases: 1) When a MAD port is opened. 2) When a receive WQE is consumed upon receiving a new MAD. However, if MADs arrive during the port-open phase, a race condition may cause an extra WR to be posted, exceeding the QP’s capacity. 
This leads to failures such as: infiniband mlx5_0: ib_post_recv failed: -12 infiniband mlx5_0: Couldn't post receive WRs infiniband mlx5_0: Couldn't start port infiniband mlx5_0: Couldn't open port 1 Fix this by checking the current receive count before posting a new WR. If the QP’s receive queue is full, do not post additional WRs. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Maher Sanalla Link: https://patch.msgid.link/c4984ba3c3a98a5711a558bccefcad789587ecf1.1741875592.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/mad.c | 38 ++++++++++++++++++++------------------ 1 file changed, 20 insertions(+), 18 deletions(-) (limited to 'drivers') diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c index 1fd54d5c4dd8..73f3a0b9a54b 100644 --- a/drivers/infiniband/core/mad.c +++ b/drivers/infiniband/core/mad.c @@ -2671,11 +2671,11 @@ static int ib_mad_post_receive_mads(struct ib_mad_qp_info *qp_info, struct ib_mad_private *mad) { unsigned long flags; - int post, ret; struct ib_mad_private *mad_priv; struct ib_sge sg_list; struct ib_recv_wr recv_wr; struct ib_mad_queue *recv_queue = &qp_info->recv_queue; + int ret = 0; /* Initialize common scatter list fields */ sg_list.lkey = qp_info->port_priv->pd->local_dma_lkey; @@ -2685,7 +2685,7 @@ static int ib_mad_post_receive_mads(struct ib_mad_qp_info *qp_info, recv_wr.sg_list = &sg_list; recv_wr.num_sge = 1; - do { + while (true) { /* Allocate and map receive buffer */ if (mad) { mad_priv = mad; @@ -2693,10 +2693,8 @@ static int ib_mad_post_receive_mads(struct ib_mad_qp_info *qp_info, } else { mad_priv = alloc_mad_private(port_mad_size(qp_info->port_priv), GFP_ATOMIC); - if (!mad_priv) { - ret = -ENOMEM; - break; - } + if (!mad_priv) + return -ENOMEM; } sg_list.length = mad_priv_dma_size(mad_priv); sg_list.addr = ib_dma_map_single(qp_info->port_priv->device, @@ -2705,37 +2703,41 @@ static int ib_mad_post_receive_mads(struct ib_mad_qp_info *qp_info, DMA_FROM_DEVICE); if (unlikely(ib_dma_mapping_error(qp_info->port_priv->device, sg_list.addr))) { - kfree(mad_priv); ret = -ENOMEM; - break; + goto free_mad_priv; } mad_priv->header.mapping = sg_list.addr; mad_priv->header.mad_list.mad_queue = recv_queue; mad_priv->header.mad_list.cqe.done = ib_mad_recv_done; recv_wr.wr_cqe = &mad_priv->header.mad_list.cqe; - - /* Post receive WR */ spin_lock_irqsave(&recv_queue->lock, flags); - post = (++recv_queue->count < recv_queue->max_active); - list_add_tail(&mad_priv->header.mad_list.list, &recv_queue->list); + if (recv_queue->count >= recv_queue->max_active) { + /* Fully populated the receive queue */ + spin_unlock_irqrestore(&recv_queue->lock, flags); + break; + } + recv_queue->count++; + list_add_tail(&mad_priv->header.mad_list.list, + &recv_queue->list); spin_unlock_irqrestore(&recv_queue->lock, flags); + ret = ib_post_recv(qp_info->qp, &recv_wr, NULL); if (ret) { spin_lock_irqsave(&recv_queue->lock, flags); list_del(&mad_priv->header.mad_list.list); recv_queue->count--; spin_unlock_irqrestore(&recv_queue->lock, flags); - ib_dma_unmap_single(qp_info->port_priv->device, - mad_priv->header.mapping, - mad_priv_dma_size(mad_priv), - DMA_FROM_DEVICE); - kfree(mad_priv); dev_err(&qp_info->port_priv->device->dev, "ib_post_recv failed: %d\n", ret); break; } - } while (post); + } + ib_dma_unmap_single(qp_info->port_priv->device, + mad_priv->header.mapping, + mad_priv_dma_size(mad_priv), DMA_FROM_DEVICE); +free_mad_priv: + kfree(mad_priv); return ret; } -- cgit v1.2.3-59-g8ed1b