From 6cd7397d01c4a3e09757840299e4f114f0aa5fa0 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Thu, 11 Nov 2021 13:45:00 +0200 Subject: RDMA/core: Set send and receive CQ before forwarding to the driver Preset both receive and send CQ pointers prior to call to the drivers and overwrite it later again till the mlx4 is going to be changed do not overwrite ibqp properties. This change is needed for mlx5, because in case of QP creation failure, it will go to the path of QP destroy which relies on proper CQ pointers. BUG: KASAN: use-after-free in create_qp.cold+0x164/0x16e [mlx5_ib] Write of size 8 at addr ffff8880064c55c0 by task a.out/246 CPU: 0 PID: 246 Comm: a.out Not tainted 5.15.0+ #291 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 Call Trace: dump_stack_lvl+0x45/0x59 print_address_description.constprop.0+0x1f/0x140 kasan_report.cold+0x83/0xdf create_qp.cold+0x164/0x16e [mlx5_ib] mlx5_ib_create_qp+0x358/0x28a0 [mlx5_ib] create_qp.part.0+0x45b/0x6a0 [ib_core] ib_create_qp_user+0x97/0x150 [ib_core] ib_uverbs_handler_UVERBS_METHOD_QP_CREATE+0x92c/0x1250 [ib_uverbs] ib_uverbs_cmd_verbs+0x1c38/0x3150 [ib_uverbs] ib_uverbs_ioctl+0x169/0x260 [ib_uverbs] __x64_sys_ioctl+0x866/0x14d0 do_syscall_64+0x3d/0x90 entry_SYSCALL_64_after_hwframe+0x44/0xae Allocated by task 246: kasan_save_stack+0x1b/0x40 __kasan_kmalloc+0xa4/0xd0 create_qp.part.0+0x92/0x6a0 [ib_core] ib_create_qp_user+0x97/0x150 [ib_core] ib_uverbs_handler_UVERBS_METHOD_QP_CREATE+0x92c/0x1250 [ib_uverbs] ib_uverbs_cmd_verbs+0x1c38/0x3150 [ib_uverbs] ib_uverbs_ioctl+0x169/0x260 [ib_uverbs] __x64_sys_ioctl+0x866/0x14d0 do_syscall_64+0x3d/0x90 entry_SYSCALL_64_after_hwframe+0x44/0xae Freed by task 246: kasan_save_stack+0x1b/0x40 kasan_set_track+0x1c/0x30 kasan_set_free_info+0x20/0x30 __kasan_slab_free+0x10c/0x150 slab_free_freelist_hook+0xb4/0x1b0 kfree+0xe7/0x2a0 create_qp.part.0+0x52b/0x6a0 [ib_core] ib_create_qp_user+0x97/0x150 [ib_core] ib_uverbs_handler_UVERBS_METHOD_QP_CREATE+0x92c/0x1250 [ib_uverbs] ib_uverbs_cmd_verbs+0x1c38/0x3150 [ib_uverbs] ib_uverbs_ioctl+0x169/0x260 [ib_uverbs] __x64_sys_ioctl+0x866/0x14d0 do_syscall_64+0x3d/0x90 entry_SYSCALL_64_after_hwframe+0x44/0xae Fixes: 514aee660df4 ("RDMA: Globally allocate and release QP memory") Link: https://lore.kernel.org/r/2dbb2e2cbb1efb188a500e5634be1d71956424ce.1636631035.git.leonro@nvidia.com Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/verbs.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index 692d5ff657df..c18634bec212 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -1232,6 +1232,9 @@ static struct ib_qp *create_qp(struct ib_device *dev, struct ib_pd *pd, INIT_LIST_HEAD(&qp->rdma_mrs); INIT_LIST_HEAD(&qp->sig_mrs); + qp->send_cq = attr->send_cq; + qp->recv_cq = attr->recv_cq; + rdma_restrack_new(&qp->res, RDMA_RESTRACK_QP); WARN_ONCE(!udata && !caller, "Missing kernel QP owner"); rdma_restrack_set_name(&qp->res, udata ? NULL : caller); -- cgit v1.2.3-59-g8ed1b From da86dc175b5af1f3e95642cdb536bfa4f7ddb1a9 Mon Sep 17 00:00:00 2001 From: Dennis Dalessandro Date: Mon, 15 Nov 2021 15:09:13 -0500 Subject: IB/hfi1: Properly allocate rdma counter desc memory When optional counter support was added the allocation of the memory holding the counter descriptors was not cleared properly. This caused WARN_ON()s in the IB/sysfs code to be hit. This is because the uninitialized memory made some of the counters wrongly look like optional counters. Use kzalloc. While here change the sizeof() calls to use the pointer rather than the name of the type. WARNING: CPU: 0 PID: 32644 at drivers/infiniband/core/sysfs.c:1064 ib_setup_port_attrs+0x7e1/0x890 [ib_core] CPU: 0 PID: 32644 Comm: kworker/0:2 Tainted: G S W 5.15.0+ #36 Hardware name: Intel Corporation S2600WTT/S2600WTT, BIOS SE5C610.86B.01.01.0018.C4.072020161249 07/20/2016 Workqueue: events work_for_cpu_fn RIP: 0010:ib_setup_port_attrs+0x7e1/0x890 [ib_core] RSP: 0018:ffffc90006ea3c40 EFLAGS: 00010202 RAX: 0000000000000068 RBX: ffff888106ad8000 RCX: 0000000000000138 RDX: ffff888126c84c00 RSI: ffff888103c41000 RDI: 0000000000000124 RBP: ffff88810f63a801 R08: ffff888126c8a000 R09: 0000000000000001 R10: ffffffffa09acf20 R11: 0000000000000065 R12: ffff88810f63a800 R13: ffff88810f63a800 R14: ffff88810f63a8e0 R15: 0000000000000001 FS: 0000000000000000(0000) GS:ffff888667a00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00005590102cb078 CR3: 000000000240a003 CR4: 00000000001706f0 Call Trace: ib_register_device.cold.44+0x23e/0x2d0 [ib_core] rvt_register_device+0xfa/0x230 [rdmavt] hfi1_register_ib_device+0x623/0x690 [hfi1] init_one.cold.36+0x2d1/0x49b [hfi1] local_pci_probe+0x45/0x80 work_for_cpu_fn+0x16/0x20 process_one_work+0x1b1/0x360 worker_thread+0x1d4/0x3a0 kthread+0x11a/0x140 ret_from_fork+0x22/0x30 Fixes: 5e2ddd1e5982 ("RDMA/counter: Add optional counter support") Link: https://lore.kernel.org/r/20211115200913.124104.47770.stgit@awfm-01.cornelisnetworks.com Reviewed-by: Mike Marciniszyn Signed-off-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hfi1/verbs.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/hw/hfi1/verbs.c b/drivers/infiniband/hw/hfi1/verbs.c index ed9fa0d84e9e..dc9211f3a009 100644 --- a/drivers/infiniband/hw/hfi1/verbs.c +++ b/drivers/infiniband/hw/hfi1/verbs.c @@ -1628,8 +1628,7 @@ static int init_cntr_names(const char *names_in, const size_t names_len, n++; names_out = - kmalloc((n + num_extra_names) * sizeof(struct rdma_stat_desc) + - names_len, + kzalloc((n + num_extra_names) * sizeof(*q) + names_len, GFP_KERNEL); if (!names_out) { *num_cntrs = 0; @@ -1637,7 +1636,7 @@ static int init_cntr_names(const char *names_in, const size_t names_len, return -ENOMEM; } - p = names_out + (n + num_extra_names) * sizeof(struct rdma_stat_desc); + p = names_out + (n + num_extra_names) * sizeof(*q); memcpy(p, names_in, names_len); q = (struct rdma_stat_desc *)names_out; -- cgit v1.2.3-59-g8ed1b From 378c67413de18b69fb3bb78d8c4f0f1192cfa973 Mon Sep 17 00:00:00 2001 From: Jack Wang Date: Mon, 15 Nov 2021 11:15:19 +0100 Subject: RDMA/mlx4: Do not fail the registration on port stats If the FW doesn't support MLX4_DEV_CAP_FLAG2_DIAG_PER_PORT, mlx4 driver will fail the ib_setup_port_attrs, which is called from ib_register_device()/enable_device_and_get(), in the end leads to device not detected[1][2] To fix it, add a new mlx4_ib_hw_stats_ops1, w/o alloc_hw_port_stats if FW does not support MLX4_DEV_CAP_FLAG2_DIAG_PER_PORT. [1] https://bugzilla.redhat.com/show_bug.cgi?id=2014094 [2] https://lore.kernel.org/linux-rdma/CAMGffEn2wvEnmzc0xe=xYiCLqpphiHDBxCxqAELrBofbUAMQxw@mail.gmail.com Fixes: 4b5f4d3fb408 ("RDMA: Split the alloc_hw_stats() ops to port and device variants") Link: https://lore.kernel.org/r/20211115101519.27210-1-jinpu.wang@ionos.com Signed-off-by: Jack Wang Reviewed-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx4/main.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index ceca05982f61..0d2fa3338784 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -2215,6 +2215,11 @@ static const struct ib_device_ops mlx4_ib_hw_stats_ops = { .get_hw_stats = mlx4_ib_get_hw_stats, }; +static const struct ib_device_ops mlx4_ib_hw_stats_ops1 = { + .alloc_hw_device_stats = mlx4_ib_alloc_hw_device_stats, + .get_hw_stats = mlx4_ib_get_hw_stats, +}; + static int mlx4_ib_alloc_diag_counters(struct mlx4_ib_dev *ibdev) { struct mlx4_ib_diag_counters *diag = ibdev->diag_counters; @@ -2227,9 +2232,16 @@ static int mlx4_ib_alloc_diag_counters(struct mlx4_ib_dev *ibdev) return 0; for (i = 0; i < MLX4_DIAG_COUNTERS_TYPES; i++) { - /* i == 1 means we are building port counters */ - if (i && !per_port) - continue; + /* + * i == 1 means we are building port counters, set a different + * stats ops without port stats callback. + */ + if (i && !per_port) { + ib_set_device_ops(&ibdev->ib_dev, + &mlx4_ib_hw_stats_ops1); + + return 0; + } ret = __mlx4_ib_alloc_diag_counters(ibdev, &diag[i].descs, &diag[i].offset, -- cgit v1.2.3-59-g8ed1b From d821f7c13ca03318ad1bdc64ce64afb43080a07a Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Wed, 17 Nov 2021 14:27:04 +0200 Subject: RDMA/nldev: Check stat attribute before accessing it The access to non-existent netlink attribute causes to the following kernel panic. Fix it by checking existence before trying to read it. general protection fault, probably for non-canonical address 0xdffffc0000000000: 0000 [#1] PREEMPT SMP KASAN KASAN: null-ptr-deref in range [0x0000000000000000-0x0000000000000007] CPU: 0 PID: 6744 Comm: syz-executor.0 Not tainted 5.15.0-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:nla_get_u32 include/net/netlink.h:1554 [inline] RIP: 0010:nldev_stat_set_mode_doit drivers/infiniband/core/nldev.c:1909 [inline] RIP: 0010:nldev_stat_set_doit+0x578/0x10d0 drivers/infiniband/core/nldev.c:2040 Code: fa 4c 8b a4 24 f8 02 00 00 48 b8 00 00 00 00 00 fc ff df c7 84 24 80 00 00 00 00 00 00 00 49 8d 7c 24 04 48 89 fa 48 c1 ea 03 <0f> b6 14 02 48 89 f8 83 e0 07 83 c0 03 38 d0 7c 08 84 d2 0f 85 02 RSP: 0018:ffffc90004acf2e8 EFLAGS: 00010247 RAX: dffffc0000000000 RBX: 0000000000000000 RCX: ffffc90002b94000 RDX: 0000000000000000 RSI: ffffffff8684c5ff RDI: 0000000000000004 RBP: ffff88807cda4000 R08: 0000000000000000 R09: ffff888023fb8027 R10: ffffffff8684c5d7 R11: 0000000000000000 R12: 0000000000000000 R13: 0000000000000001 R14: ffff888041024280 R15: ffff888031ade780 FS: 00007eff9dddd700(0000) GS:ffff8880b9c00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000001b2ef24000 CR3: 0000000036902000 CR4: 00000000003506f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: rdma_nl_rcv_msg+0x36d/0x690 drivers/infiniband/core/netlink.c:195 rdma_nl_rcv_skb drivers/infiniband/core/netlink.c:239 [inline] rdma_nl_rcv+0x2ee/0x430 drivers/infiniband/core/netlink.c:259 netlink_unicast_kernel net/netlink/af_netlink.c:1319 [inline] netlink_unicast+0x533/0x7d0 net/netlink/af_netlink.c:1345 netlink_sendmsg+0x86d/0xda0 net/netlink/af_netlink.c:1916 sock_sendmsg_nosec net/socket.c:704 [inline] sock_sendmsg+0xcf/0x120 net/socket.c:724 ____sys_sendmsg+0x6e8/0x810 net/socket.c:2409 ___sys_sendmsg+0xf3/0x170 net/socket.c:2463 __sys_sendmsg+0xe5/0x1b0 net/socket.c:2492 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae Fixes: 822cf785ac6d ("RDMA/nldev: Split nldev_stat_set_mode_doit out of nldev_stat_set_doit") Link: https://lore.kernel.org/r/b21967c366f076ff1988862f9c8a1aa0244c599f.1637151999.git.leonro@nvidia.com Reported-by: syzbot+9111d2255a9710e87562@syzkaller.appspotmail.com Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/nldev.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c index fedc0fa6ebf9..f5aacaf7fb8e 100644 --- a/drivers/infiniband/core/nldev.c +++ b/drivers/infiniband/core/nldev.c @@ -1906,7 +1906,8 @@ static int nldev_stat_set_mode_doit(struct sk_buff *msg, int ret; /* Currently only counter for QP is supported */ - if (nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_RES]) != RDMA_NLDEV_ATTR_RES_QP) + if (!tb[RDMA_NLDEV_ATTR_STAT_RES] || + nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_RES]) != RDMA_NLDEV_ATTR_RES_QP) return -EINVAL; mode = nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_MODE]); -- cgit v1.2.3-59-g8ed1b From 84b01721e8042cdd1e8ffeb648844a09cd4213e0 Mon Sep 17 00:00:00 2001 From: Pavel Skripkin Date: Sun, 21 Nov 2021 23:22:39 +0300 Subject: RDMA: Fix use-after-free in rxe_queue_cleanup On error handling path in rxe_qp_from_init() qp->sq.queue is freed and then rxe_create_qp() will drop last reference to this object. qp clean up function will try to free this queue one time and it causes UAF bug. Fix it by zeroing queue pointer after freeing queue in rxe_qp_from_init(). Fixes: 514aee660df4 ("RDMA: Globally allocate and release QP memory") Link: https://lore.kernel.org/r/20211121202239.3129-1-paskripkin@gmail.com Reported-by: syzbot+aab53008a5adf26abe91@syzkaller.appspotmail.com Signed-off-by: Pavel Skripkin Reviewed-by: Zhu Yanjun Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_qp.c | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/sw/rxe/rxe_qp.c b/drivers/infiniband/sw/rxe/rxe_qp.c index 975321812c87..54b8711321c1 100644 --- a/drivers/infiniband/sw/rxe/rxe_qp.c +++ b/drivers/infiniband/sw/rxe/rxe_qp.c @@ -359,6 +359,7 @@ int rxe_qp_from_init(struct rxe_dev *rxe, struct rxe_qp *qp, struct rxe_pd *pd, err2: rxe_queue_cleanup(qp->sq.queue); + qp->sq.queue = NULL; err1: qp->pd = NULL; qp->rcq = NULL; -- cgit v1.2.3-59-g8ed1b From f0ae4afe3d35e67db042c58a52909e06262b740f Mon Sep 17 00:00:00 2001 From: Alaa Hleihel Date: Mon, 22 Nov 2021 13:41:51 +0200 Subject: RDMA/mlx5: Fix releasing unallocated memory in dereg MR flow For the case of IB_MR_TYPE_DM the mr does doesn't have a umem, even though it is a user MR. This causes function mlx5_free_priv_descs() to think that it is a kernel MR, leading to wrongly accessing mr->descs that will get wrong values in the union which leads to attempt to release resources that were not allocated in the first place. For example: DMA-API: mlx5_core 0000:08:00.1: device driver tries to free DMA memory it has not allocated [device address=0x0000000000000000] [size=0 bytes] WARNING: CPU: 8 PID: 1021 at kernel/dma/debug.c:961 check_unmap+0x54f/0x8b0 RIP: 0010:check_unmap+0x54f/0x8b0 Call Trace: debug_dma_unmap_page+0x57/0x60 mlx5_free_priv_descs+0x57/0x70 [mlx5_ib] mlx5_ib_dereg_mr+0x1fb/0x3d0 [mlx5_ib] ib_dereg_mr_user+0x60/0x140 [ib_core] uverbs_destroy_uobject+0x59/0x210 [ib_uverbs] uobj_destroy+0x3f/0x80 [ib_uverbs] ib_uverbs_cmd_verbs+0x435/0xd10 [ib_uverbs] ? uverbs_finalize_object+0x50/0x50 [ib_uverbs] ? lock_acquire+0xc4/0x2e0 ? lock_acquired+0x12/0x380 ? lock_acquire+0xc4/0x2e0 ? lock_acquire+0xc4/0x2e0 ? ib_uverbs_ioctl+0x7c/0x140 [ib_uverbs] ? lock_release+0x28a/0x400 ib_uverbs_ioctl+0xc0/0x140 [ib_uverbs] ? ib_uverbs_ioctl+0x7c/0x140 [ib_uverbs] __x64_sys_ioctl+0x7f/0xb0 do_syscall_64+0x38/0x90 Fix it by reorganizing the dereg flow and mlx5_ib_mr structure: - Move the ib_umem field into the user MRs structure in the union as it's applicable only there. - Function mlx5_ib_dereg_mr() will now call mlx5_free_priv_descs() only in case there isn't udata, which indicates that this isn't a user MR. Fixes: f18ec4223117 ("RDMA/mlx5: Use a union inside mlx5_ib_mr") Link: https://lore.kernel.org/r/66bb1dd253c1fd7ceaa9fc411061eefa457b86fb.1637581144.git.leonro@nvidia.com Signed-off-by: Alaa Hleihel Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/mlx5_ib.h | 6 +++--- drivers/infiniband/hw/mlx5/mr.c | 26 ++++++++++++-------------- 2 files changed, 15 insertions(+), 17 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index e636e954f6bf..4a7a56ed740b 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -664,7 +664,6 @@ struct mlx5_ib_mr { /* User MR data */ struct mlx5_cache_ent *cache_ent; - struct ib_umem *umem; /* This is zero'd when the MR is allocated */ union { @@ -676,7 +675,7 @@ struct mlx5_ib_mr { struct list_head list; }; - /* Used only by kernel MRs (umem == NULL) */ + /* Used only by kernel MRs */ struct { void *descs; void *descs_alloc; @@ -697,8 +696,9 @@ struct mlx5_ib_mr { int data_length; }; - /* Used only by User MRs (umem != NULL) */ + /* Used only by User MRs */ struct { + struct ib_umem *umem; unsigned int page_shift; /* Current access_flags */ int access_flags; diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index 157d862fb864..63e2129f1142 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -1904,19 +1904,18 @@ err: return ret; } -static void -mlx5_free_priv_descs(struct mlx5_ib_mr *mr) +static void mlx5_free_priv_descs(struct mlx5_ib_mr *mr) { - if (!mr->umem && mr->descs) { - struct ib_device *device = mr->ibmr.device; - int size = mr->max_descs * mr->desc_size; - struct mlx5_ib_dev *dev = to_mdev(device); + struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device); + int size = mr->max_descs * mr->desc_size; - dma_unmap_single(&dev->mdev->pdev->dev, mr->desc_map, size, - DMA_TO_DEVICE); - kfree(mr->descs_alloc); - mr->descs = NULL; - } + if (!mr->descs) + return; + + dma_unmap_single(&dev->mdev->pdev->dev, mr->desc_map, size, + DMA_TO_DEVICE); + kfree(mr->descs_alloc); + mr->descs = NULL; } int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata) @@ -1992,7 +1991,8 @@ int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata) if (mr->cache_ent) { mlx5_mr_cache_free(dev, mr); } else { - mlx5_free_priv_descs(mr); + if (!udata) + mlx5_free_priv_descs(mr); kfree(mr); } return 0; @@ -2079,7 +2079,6 @@ static struct mlx5_ib_mr *mlx5_ib_alloc_pi_mr(struct ib_pd *pd, if (err) goto err_free_in; - mr->umem = NULL; kfree(in); return mr; @@ -2206,7 +2205,6 @@ static struct ib_mr *__mlx5_ib_alloc_mr(struct ib_pd *pd, } mr->ibmr.device = pd->device; - mr->umem = NULL; switch (mr_type) { case IB_MR_TYPE_MEM_REG: -- cgit v1.2.3-59-g8ed1b From 52414e27d6b568120b087d1fbafbb4482b0ccaab Mon Sep 17 00:00:00 2001 From: Yangyang Li Date: Tue, 23 Nov 2021 16:48:09 +0800 Subject: RDMA/hns: Do not halt commands during reset until later is_reset is used to indicate whether the hardware starts to reset. When hns_roce_hw_v2_reset_notify_down() is called, the hardware has not yet started to reset. If is_reset is set at this time, all mailbox operations of resource destroy actions will be intercepted by driver. When the driver cleans up resources, but the hardware is still accessed, the following errors will appear: arm-smmu-v3 arm-smmu-v3.2.auto: event 0x10 received: arm-smmu-v3 arm-smmu-v3.2.auto: 0x0000350100000010 arm-smmu-v3 arm-smmu-v3.2.auto: 0x000002088000003f arm-smmu-v3 arm-smmu-v3.2.auto: 0x00000000a50e0800 arm-smmu-v3 arm-smmu-v3.2.auto: 0x0000000000000000 arm-smmu-v3 arm-smmu-v3.2.auto: event 0x10 received: arm-smmu-v3 arm-smmu-v3.2.auto: 0x0000350100000010 arm-smmu-v3 arm-smmu-v3.2.auto: 0x000002088000043e arm-smmu-v3 arm-smmu-v3.2.auto: 0x00000000a50a0800 arm-smmu-v3 arm-smmu-v3.2.auto: 0x0000000000000000 arm-smmu-v3 arm-smmu-v3.2.auto: event 0x10 received: arm-smmu-v3 arm-smmu-v3.2.auto: 0x0000350100000010 arm-smmu-v3 arm-smmu-v3.2.auto: 0x0000020880000436 arm-smmu-v3 arm-smmu-v3.2.auto: 0x00000000a50a0880 arm-smmu-v3 arm-smmu-v3.2.auto: 0x0000000000000000 arm-smmu-v3 arm-smmu-v3.2.auto: event 0x10 received: arm-smmu-v3 arm-smmu-v3.2.auto: 0x0000350100000010 arm-smmu-v3 arm-smmu-v3.2.auto: 0x000002088000043a arm-smmu-v3 arm-smmu-v3.2.auto: 0x00000000a50e0840 hns3 0000:35:00.0: INT status: CMDQ(0x0) HW errors(0x0) other(0x0) arm-smmu-v3 arm-smmu-v3.2.auto: 0x0000000000000000 hns3 0000:35:00.0: received unknown or unhandled event of vector0 arm-smmu-v3 arm-smmu-v3.2.auto: event 0x10 received: arm-smmu-v3 arm-smmu-v3.2.auto: 0x0000350100000010 {34}[Hardware Error]: Hardware error from APEI Generic Hardware Error Source: 7 is_reset will be set correctly in check_aedev_reset_status(), so the setting in hns_roce_hw_v2_reset_notify_down() should be deleted. Fixes: 726be12f5ca0 ("RDMA/hns: Set reset flag when hw resetting") Link: https://lore.kernel.org/r/20211123084809.37318-1-liangwenpeng@huawei.com Signed-off-by: Yangyang Li Signed-off-by: Wenpeng Liang Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index 9bfbaddd1763..ae14329c619c 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -6387,10 +6387,8 @@ static int hns_roce_hw_v2_reset_notify_down(struct hnae3_handle *handle) if (!hr_dev) return 0; - hr_dev->is_reset = true; hr_dev->active = false; hr_dev->dis_db = true; - hr_dev->state = HNS_ROCE_DEVICE_STATE_RST_DOWN; return 0; -- cgit v1.2.3-59-g8ed1b From b0969f83890bf8b47f5c8bd42539599b2b52fdeb Mon Sep 17 00:00:00 2001 From: Yangyang Li Date: Tue, 23 Nov 2021 22:24:02 +0800 Subject: RDMA/hns: Do not destroy QP resources in the hw resetting phase When hns_roce_v2_destroy_qp() is called, the brief calling process of the driver is as follows: ...... hns_roce_v2_destroy_qp hns_roce_v2_qp_modify hns_roce_cmd_mbox hns_roce_qp_destroy If hns_roce_cmd_mbox() detects that the hardware is being reset during the execution of the hns_roce_cmd_mbox(), the driver will not be able to get the return value from the hardware (the firmware cannot respond to the driver's mailbox during the hardware reset phase). The driver needs to wait for the hardware reset to complete before continuing to execute hns_roce_qp_destroy(), otherwise it may happen that the driver releases the resources but the hardware is still accessing. In order to fix this problem, HNS RoCE needs to add a piece of code to wait for the hardware reset to complete. The original interface get_hw_reset_stat() is the instantaneous state of the hardware reset, which cannot accurately reflect whether the hardware reset is completed, so it needs to be replaced with the ae_dev_reset_cnt interface. The sign that the hardware reset is complete is that the return value of the ae_dev_reset_cnt interface is greater than the original value reset_cnt recorded by the driver. Fixes: 6a04aed6afae ("RDMA/hns: Fix the chip hanging caused by sending mailbox&CMQ during reset") Link: https://lore.kernel.org/r/20211123142402.26936-1-liangwenpeng@huawei.com Signed-off-by: Yangyang Li Signed-off-by: Wenpeng Liang Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index ae14329c619c..bbfa1332dedc 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include @@ -1050,9 +1051,14 @@ static u32 hns_roce_v2_cmd_hw_resetting(struct hns_roce_dev *hr_dev, unsigned long instance_stage, unsigned long reset_stage) { +#define HW_RESET_TIMEOUT_US 1000000 +#define HW_RESET_SLEEP_US 1000 + struct hns_roce_v2_priv *priv = hr_dev->priv; struct hnae3_handle *handle = priv->handle; const struct hnae3_ae_ops *ops = handle->ae_algo->ops; + unsigned long val; + int ret; /* When hardware reset is detected, we should stop sending mailbox&cmq& * doorbell to hardware. If now in .init_instance() function, we should @@ -1064,7 +1070,11 @@ static u32 hns_roce_v2_cmd_hw_resetting(struct hns_roce_dev *hr_dev, * again. */ hr_dev->dis_db = true; - if (!ops->get_hw_reset_stat(handle)) + + ret = read_poll_timeout(ops->ae_dev_reset_cnt, val, + val > hr_dev->reset_cnt, HW_RESET_SLEEP_US, + HW_RESET_TIMEOUT_US, false, handle); + if (!ret) hr_dev->is_reset = true; if (!hr_dev->is_reset || reset_stage == HNS_ROCE_STATE_RST_INIT || -- cgit v1.2.3-59-g8ed1b From db6169b5bac1c75ed37cfdaedc7dfb1618f3f362 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Sun, 28 Nov 2021 21:35:01 +0800 Subject: RDMA/rtrs: Call {get,put}_cpu_ptr to silence a debug kernel warning With preemption enabled (CONFIG_DEBUG_PREEMPT=y), the following appeared when rnbd client tries to map remote block device. BUG: using smp_processor_id() in preemptible [00000000] code: bash/1733 caller is debug_smp_processor_id+0x17/0x20 CPU: 0 PID: 1733 Comm: bash Not tainted 5.16.0-rc1 #5 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.14.0-0-g155821a-rebuilt.opensuse.org 04/01/2014 Call Trace: dump_stack_lvl+0x5d/0x78 dump_stack+0x10/0x12 check_preemption_disabled+0xe4/0xf0 debug_smp_processor_id+0x17/0x20 rtrs_clt_update_all_stats+0x3b/0x70 [rtrs_client] rtrs_clt_read_req+0xc3/0x380 [rtrs_client] ? rtrs_clt_init_req+0xe3/0x120 [rtrs_client] rtrs_clt_request+0x1a7/0x320 [rtrs_client] ? 0xffffffffc0ab1000 send_usr_msg+0xbf/0x160 [rnbd_client] ? rnbd_clt_put_sess+0x60/0x60 [rnbd_client] ? send_usr_msg+0x160/0x160 [rnbd_client] ? sg_alloc_table+0x27/0xb0 ? sg_zero_buffer+0xd0/0xd0 send_msg_sess_info+0xe9/0x180 [rnbd_client] ? rnbd_clt_put_sess+0x60/0x60 [rnbd_client] ? blk_mq_alloc_tag_set+0x2ef/0x370 rnbd_clt_map_device+0xba8/0xcd0 [rnbd_client] ? send_msg_open+0x200/0x200 [rnbd_client] rnbd_clt_map_device_store+0x3e5/0x620 [rnbd_client To supress the calltrace, let's call get_cpu_ptr/put_cpu_ptr pair in rtrs_clt_update_rdma_stats to disable preemption when accessing per-cpu variable. While at it, let's make the similar change in rtrs_clt_update_wc_stats. And for rtrs_clt_inc_failover_cnt, though it was only called inside rcu section, but it still can be preempted in case CONFIG_PREEMPT_RCU is enabled, so change it to {get,put}_cpu_ptr pair either. Link: https://lore.kernel.org/r/20211128133501.38710-1-guoqing.jiang@linux.dev Signed-off-by: Guoqing Jiang Reviewed-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/ulp/rtrs/rtrs-clt-stats.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/ulp/rtrs/rtrs-clt-stats.c b/drivers/infiniband/ulp/rtrs/rtrs-clt-stats.c index f7e459fe68be..76e4352fe3f6 100644 --- a/drivers/infiniband/ulp/rtrs/rtrs-clt-stats.c +++ b/drivers/infiniband/ulp/rtrs/rtrs-clt-stats.c @@ -19,7 +19,7 @@ void rtrs_clt_update_wc_stats(struct rtrs_clt_con *con) int cpu; cpu = raw_smp_processor_id(); - s = this_cpu_ptr(stats->pcpu_stats); + s = get_cpu_ptr(stats->pcpu_stats); if (con->cpu != cpu) { s->cpu_migr.to++; @@ -27,14 +27,16 @@ void rtrs_clt_update_wc_stats(struct rtrs_clt_con *con) s = per_cpu_ptr(stats->pcpu_stats, con->cpu); atomic_inc(&s->cpu_migr.from); } + put_cpu_ptr(stats->pcpu_stats); } void rtrs_clt_inc_failover_cnt(struct rtrs_clt_stats *stats) { struct rtrs_clt_stats_pcpu *s; - s = this_cpu_ptr(stats->pcpu_stats); + s = get_cpu_ptr(stats->pcpu_stats); s->rdma.failover_cnt++; + put_cpu_ptr(stats->pcpu_stats); } int rtrs_clt_stats_migration_from_cnt_to_str(struct rtrs_clt_stats *stats, char *buf) @@ -169,9 +171,10 @@ static inline void rtrs_clt_update_rdma_stats(struct rtrs_clt_stats *stats, { struct rtrs_clt_stats_pcpu *s; - s = this_cpu_ptr(stats->pcpu_stats); + s = get_cpu_ptr(stats->pcpu_stats); s->rdma.dir[d].cnt++; s->rdma.dir[d].size_total += size; + put_cpu_ptr(stats->pcpu_stats); } void rtrs_clt_update_all_stats(struct rtrs_clt_io_req *req, int dir) -- cgit v1.2.3-59-g8ed1b From 9292f8f9a2ac42eb320bced7153aa2e63d8cc13a Mon Sep 17 00:00:00 2001 From: Mike Marciniszyn Date: Mon, 29 Nov 2021 14:19:52 -0500 Subject: IB/hfi1: Correct guard on eager buffer deallocation The code tests the dma address which legitimately can be 0. The code should test the kernel logical address to avoid leaking eager buffer allocations that happen to map to a dma address of 0. Fixes: 60368186fd85 ("IB/hfi1: Fix user-space buffers mapping with IOMMU enabled") Link: https://lore.kernel.org/r/20211129191952.101968.17137.stgit@awfm-01.cornelisnetworks.com Signed-off-by: Mike Marciniszyn Signed-off-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hfi1/init.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/hw/hfi1/init.c b/drivers/infiniband/hw/hfi1/init.c index dbd1c31830b9..8e1236be46e1 100644 --- a/drivers/infiniband/hw/hfi1/init.c +++ b/drivers/infiniband/hw/hfi1/init.c @@ -1120,7 +1120,7 @@ void hfi1_free_ctxtdata(struct hfi1_devdata *dd, struct hfi1_ctxtdata *rcd) rcd->egrbufs.rcvtids = NULL; for (e = 0; e < rcd->egrbufs.alloced; e++) { - if (rcd->egrbufs.buffers[e].dma) + if (rcd->egrbufs.buffers[e].addr) dma_free_coherent(&dd->pcidev->dev, rcd->egrbufs.buffers[e].len, rcd->egrbufs.buffers[e].addr, -- cgit v1.2.3-59-g8ed1b From b6d57e24ce6cc3df8a8845e1b193e88a65d501b1 Mon Sep 17 00:00:00 2001 From: Mike Marciniszyn Date: Mon, 29 Nov 2021 14:19:58 -0500 Subject: IB/hfi1: Insure use of smp_processor_id() is preempt disabled The following BUG has just surfaced with our 5.16 testing: BUG: using smp_processor_id() in preemptible [00000000] code: mpicheck/1581081 caller is sdma_select_user_engine+0x72/0x210 [hfi1] CPU: 0 PID: 1581081 Comm: mpicheck Tainted: G S 5.16.0-rc1+ #1 Hardware name: Intel Corporation S2600WT2R/S2600WT2R, BIOS SE5C610.86B.01.01.0016.033120161139 03/31/2016 Call Trace: dump_stack_lvl+0x33/0x42 check_preemption_disabled+0xbf/0xe0 sdma_select_user_engine+0x72/0x210 [hfi1] ? _raw_spin_unlock_irqrestore+0x1f/0x31 ? hfi1_mmu_rb_insert+0x6b/0x200 [hfi1] hfi1_user_sdma_process_request+0xa02/0x1120 [hfi1] ? hfi1_write_iter+0xb8/0x200 [hfi1] hfi1_write_iter+0xb8/0x200 [hfi1] do_iter_readv_writev+0x163/0x1c0 do_iter_write+0x80/0x1c0 vfs_writev+0x88/0x1a0 ? recalibrate_cpu_khz+0x10/0x10 ? ktime_get+0x3e/0xa0 ? __fget_files+0x66/0xa0 do_writev+0x65/0x100 do_syscall_64+0x3a/0x80 Fix this long standing bug by moving the smp_processor_id() to after the rcu_read_lock(). The rcu_read_lock() implicitly disables preemption. Link: https://lore.kernel.org/r/20211129191958.101968.87329.stgit@awfm-01.cornelisnetworks.com Cc: stable@vger.kernel.org Fixes: 0cb2aa690c7e ("IB/hfi1: Add sysfs interface for affinity setup") Signed-off-by: Mike Marciniszyn Signed-off-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hfi1/sdma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/hw/hfi1/sdma.c b/drivers/infiniband/hw/hfi1/sdma.c index 2b6c24b7b586..f07d328689d3 100644 --- a/drivers/infiniband/hw/hfi1/sdma.c +++ b/drivers/infiniband/hw/hfi1/sdma.c @@ -838,8 +838,8 @@ struct sdma_engine *sdma_select_user_engine(struct hfi1_devdata *dd, if (current->nr_cpus_allowed != 1) goto out; - cpu_id = smp_processor_id(); rcu_read_lock(); + cpu_id = smp_processor_id(); rht_node = rhashtable_lookup(dd->sdma_rht, &cpu_id, sdma_rht_params); -- cgit v1.2.3-59-g8ed1b From f6a3cfec3c01f9983e961c3327cef0db129a3c43 Mon Sep 17 00:00:00 2001 From: Mike Marciniszyn Date: Mon, 29 Nov 2021 14:20:03 -0500 Subject: IB/hfi1: Fix early init panic The following trace can be observed with an init failure such as firmware load failures: BUG: unable to handle kernel NULL pointer dereference at 0000000000000000 PGD 0 P4D 0 Oops: 0010 [#1] SMP PTI CPU: 0 PID: 537 Comm: kworker/0:3 Tainted: G OE --------- - - 4.18.0-240.el8.x86_64 #1 Workqueue: events work_for_cpu_fn RIP: 0010:0x0 Code: Bad RIP value. RSP: 0000:ffffae5f878a3c98 EFLAGS: 00010046 RAX: 0000000000000000 RBX: ffff95e48e025c00 RCX: 0000000000000000 RDX: 0000000000000001 RSI: 0000000000000000 RDI: ffff95e48e025c00 RBP: ffff95e4bf3660a4 R08: 0000000000000000 R09: ffffffff86d5e100 R10: ffff95e49e1de600 R11: 0000000000000001 R12: ffff95e4bf366180 R13: ffff95e48e025c00 R14: ffff95e4bf366028 R15: ffff95e4bf366000 FS: 0000000000000000(0000) GS:ffff95e4df200000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: ffffffffffffffd6 CR3: 0000000f86a0a003 CR4: 00000000001606f0 Call Trace: receive_context_interrupt+0x1f/0x40 [hfi1] __free_irq+0x201/0x300 free_irq+0x2e/0x60 pci_free_irq+0x18/0x30 msix_free_irq.part.2+0x46/0x80 [hfi1] msix_clean_up_interrupts+0x2b/0x70 [hfi1] hfi1_init_dd+0x640/0x1a90 [hfi1] do_init_one.isra.19+0x34d/0x680 [hfi1] local_pci_probe+0x41/0x90 work_for_cpu_fn+0x16/0x20 process_one_work+0x1a7/0x360 worker_thread+0x1cf/0x390 ? create_worker+0x1a0/0x1a0 kthread+0x112/0x130 ? kthread_flush_work_fn+0x10/0x10 ret_from_fork+0x35/0x40 The free_irq() results in a callback to the registered interrupt handler, and rcd->do_interrupt is NULL because the receive context data structures are not fully initialized. Fix by ensuring that the do_interrupt is always assigned and adding a guards in the slow path handler to detect and handle a partially initialized receive context and noop the receive. Link: https://lore.kernel.org/r/20211129192003.101968.33612.stgit@awfm-01.cornelisnetworks.com Cc: stable@vger.kernel.org Fixes: b0ba3c18d6bf ("IB/hfi1: Move normal functions from hfi1_devdata to const array") Signed-off-by: Mike Marciniszyn Signed-off-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hfi1/chip.c | 2 ++ drivers/infiniband/hw/hfi1/driver.c | 2 ++ drivers/infiniband/hw/hfi1/init.c | 5 ++--- 3 files changed, 6 insertions(+), 3 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/hw/hfi1/chip.c b/drivers/infiniband/hw/hfi1/chip.c index ec37f4fd8e96..f1245c94ae26 100644 --- a/drivers/infiniband/hw/hfi1/chip.c +++ b/drivers/infiniband/hw/hfi1/chip.c @@ -8415,6 +8415,8 @@ static void receive_interrupt_common(struct hfi1_ctxtdata *rcd) */ static void __hfi1_rcd_eoi_intr(struct hfi1_ctxtdata *rcd) { + if (!rcd->rcvhdrq) + return; clear_recv_intr(rcd); if (check_packet_present(rcd)) force_recv_intr(rcd); diff --git a/drivers/infiniband/hw/hfi1/driver.c b/drivers/infiniband/hw/hfi1/driver.c index 61f341c3005c..e2c634af40e9 100644 --- a/drivers/infiniband/hw/hfi1/driver.c +++ b/drivers/infiniband/hw/hfi1/driver.c @@ -1012,6 +1012,8 @@ int handle_receive_interrupt(struct hfi1_ctxtdata *rcd, int thread) struct hfi1_packet packet; int skip_pkt = 0; + if (!rcd->rcvhdrq) + return RCV_PKT_OK; /* Control context will always use the slow path interrupt handler */ needset = (rcd->ctxt == HFI1_CTRL_CTXT) ? 0 : 1; diff --git a/drivers/infiniband/hw/hfi1/init.c b/drivers/infiniband/hw/hfi1/init.c index 8e1236be46e1..6422dd6cae60 100644 --- a/drivers/infiniband/hw/hfi1/init.c +++ b/drivers/infiniband/hw/hfi1/init.c @@ -113,7 +113,6 @@ static int hfi1_create_kctxt(struct hfi1_devdata *dd, rcd->fast_handler = get_dma_rtail_setting(rcd) ? handle_receive_interrupt_dma_rtail : handle_receive_interrupt_nodma_rtail; - rcd->slow_handler = handle_receive_interrupt; hfi1_set_seq_cnt(rcd, 1); @@ -334,6 +333,8 @@ int hfi1_create_ctxtdata(struct hfi1_pportdata *ppd, int numa, rcd->numa_id = numa; rcd->rcv_array_groups = dd->rcv_entries.ngroups; rcd->rhf_rcv_function_map = normal_rhf_rcv_functions; + rcd->slow_handler = handle_receive_interrupt; + rcd->do_interrupt = rcd->slow_handler; rcd->msix_intr = CCE_NUM_MSIX_VECTORS; mutex_init(&rcd->exp_mutex); @@ -898,8 +899,6 @@ int hfi1_init(struct hfi1_devdata *dd, int reinit) if (!rcd) continue; - rcd->do_interrupt = &handle_receive_interrupt; - lastfail = hfi1_create_rcvhdrq(dd, rcd); if (!lastfail) lastfail = hfi1_setup_eagerbufs(rcd); -- cgit v1.2.3-59-g8ed1b From 60a8b5a1611b4a26de4839ab9c1fc2a9cf3e17c1 Mon Sep 17 00:00:00 2001 From: Mike Marciniszyn Date: Mon, 29 Nov 2021 14:20:08 -0500 Subject: IB/hfi1: Fix leak of rcvhdrtail_dummy_kvaddr This buffer is currently allocated in hfi1_init(): if (reinit) ret = init_after_reset(dd); else ret = loadtime_init(dd); if (ret) goto done; /* allocate dummy tail memory for all receive contexts */ dd->rcvhdrtail_dummy_kvaddr = dma_alloc_coherent(&dd->pcidev->dev, sizeof(u64), &dd->rcvhdrtail_dummy_dma, GFP_KERNEL); if (!dd->rcvhdrtail_dummy_kvaddr) { dd_dev_err(dd, "cannot allocate dummy tail memory\n"); ret = -ENOMEM; goto done; } The reinit triggered path will overwrite the old allocation and leak it. Fix by moving the allocation to hfi1_alloc_devdata() and the deallocation to hfi1_free_devdata(). Link: https://lore.kernel.org/r/20211129192008.101968.91302.stgit@awfm-01.cornelisnetworks.com Cc: stable@vger.kernel.org Fixes: 46b010d3eeb8 ("staging/rdma/hfi1: Workaround to prevent corruption during packet delivery") Signed-off-by: Mike Marciniszyn Signed-off-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hfi1/init.c | 33 ++++++++++++++------------------- 1 file changed, 14 insertions(+), 19 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/hw/hfi1/init.c b/drivers/infiniband/hw/hfi1/init.c index 6422dd6cae60..4436ed41547c 100644 --- a/drivers/infiniband/hw/hfi1/init.c +++ b/drivers/infiniband/hw/hfi1/init.c @@ -875,18 +875,6 @@ int hfi1_init(struct hfi1_devdata *dd, int reinit) if (ret) goto done; - /* allocate dummy tail memory for all receive contexts */ - dd->rcvhdrtail_dummy_kvaddr = dma_alloc_coherent(&dd->pcidev->dev, - sizeof(u64), - &dd->rcvhdrtail_dummy_dma, - GFP_KERNEL); - - if (!dd->rcvhdrtail_dummy_kvaddr) { - dd_dev_err(dd, "cannot allocate dummy tail memory\n"); - ret = -ENOMEM; - goto done; - } - /* dd->rcd can be NULL if early initialization failed */ for (i = 0; dd->rcd && i < dd->first_dyn_alloc_ctxt; ++i) { /* @@ -1200,6 +1188,11 @@ void hfi1_free_devdata(struct hfi1_devdata *dd) dd->tx_opstats = NULL; kfree(dd->comp_vect); dd->comp_vect = NULL; + if (dd->rcvhdrtail_dummy_kvaddr) + dma_free_coherent(&dd->pcidev->dev, sizeof(u64), + (void *)dd->rcvhdrtail_dummy_kvaddr, + dd->rcvhdrtail_dummy_dma); + dd->rcvhdrtail_dummy_kvaddr = NULL; sdma_clean(dd, dd->num_sdma); rvt_dealloc_device(&dd->verbs_dev.rdi); } @@ -1297,6 +1290,15 @@ static struct hfi1_devdata *hfi1_alloc_devdata(struct pci_dev *pdev, goto bail; } + /* allocate dummy tail memory for all receive contexts */ + dd->rcvhdrtail_dummy_kvaddr = + dma_alloc_coherent(&dd->pcidev->dev, sizeof(u64), + &dd->rcvhdrtail_dummy_dma, GFP_KERNEL); + if (!dd->rcvhdrtail_dummy_kvaddr) { + ret = -ENOMEM; + goto bail; + } + atomic_set(&dd->ipoib_rsm_usr_num, 0); return dd; @@ -1504,13 +1506,6 @@ static void cleanup_device_data(struct hfi1_devdata *dd) free_credit_return(dd); - if (dd->rcvhdrtail_dummy_kvaddr) { - dma_free_coherent(&dd->pcidev->dev, sizeof(u64), - (void *)dd->rcvhdrtail_dummy_kvaddr, - dd->rcvhdrtail_dummy_dma); - dd->rcvhdrtail_dummy_kvaddr = NULL; - } - /* * Free any resources still in use (usually just kernel contexts) * at unload; we do for ctxtcnt, because that's what we allocate. -- cgit v1.2.3-59-g8ed1b From 1e11a39a82e95ce86f849f40dda0d9c0498cebd9 Mon Sep 17 00:00:00 2001 From: Shiraz Saleem Date: Tue, 7 Dec 2021 09:21:36 -0600 Subject: RDMA/irdma: Fix a user-after-free in add_pble_prm When irdma_hmc_sd_one fails, 'chunk' is freed while its still on the PBLE info list. Add the chunk entry to the PBLE info list only after successful setting of the SD in irdma_hmc_sd_one. Fixes: e8c4dbc2fcac ("RDMA/irdma: Add PBLE resource manager") Link: https://lore.kernel.org/r/20211207152135.2192-1-shiraz.saleem@intel.com Reported-by: Dan Carpenter Signed-off-by: Shiraz Saleem Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/irdma/pble.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/hw/irdma/pble.c b/drivers/infiniband/hw/irdma/pble.c index aeeb1c310965..da032b952755 100644 --- a/drivers/infiniband/hw/irdma/pble.c +++ b/drivers/infiniband/hw/irdma/pble.c @@ -283,7 +283,6 @@ add_pble_prm(struct irdma_hmc_pble_rsrc *pble_rsrc) "PBLE: next_fpm_addr = %llx chunk_size[%llu] = 0x%llx\n", pble_rsrc->next_fpm_addr, chunk->size, chunk->size); pble_rsrc->unallocated_pble -= (u32)(chunk->size >> 3); - list_add(&chunk->list, &pble_rsrc->pinfo.clist); sd_reg_val = (sd_entry_type == IRDMA_SD_TYPE_PAGED) ? sd_entry->u.pd_table.pd_page_addr.pa : sd_entry->u.bp.addr.pa; @@ -295,6 +294,7 @@ add_pble_prm(struct irdma_hmc_pble_rsrc *pble_rsrc) goto error; } + list_add(&chunk->list, &pble_rsrc->pinfo.clist); sd_entry->valid = true; return 0; -- cgit v1.2.3-59-g8ed1b From 117697cc935b0ab04ec66274d8e64ccfebd7d0d2 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sun, 5 Dec 2021 09:17:24 +0100 Subject: RDMA/irdma: Fix a potential memory allocation issue in 'irdma_prm_add_pble_mem()' 'pchunk->bitmapbuf' is a bitmap. Its size (in number of bits) is stored in 'pchunk->sizeofbitmap'. When it is allocated, the size (in bytes) is computed by: size_in_bits >> 3 There are 2 issues (numbers bellow assume that longs are 64 bits): - there is no guarantee here that 'pchunk->bitmapmem.size' is modulo BITS_PER_LONG but bitmaps are stored as longs (sizeofbitmap=8 bits will only allocate 1 byte, instead of 8 (1 long)) - the number of bytes is computed with a shift, not a round up, so we may allocate less memory than needed (sizeofbitmap=65 bits will only allocate 8 bytes (i.e. 1 long), when 2 longs are needed = 16 bytes) Fix both issues by using 'bitmap_zalloc()' and remove the useless 'bitmapmem' from 'struct irdma_chunk'. While at it, remove some useless NULL test before calling kfree/bitmap_free. Fixes: 915cc7ac0f8e ("RDMA/irdma: Add miscellaneous utility definitions") Link: https://lore.kernel.org/r/5e670b640508e14b1869c3e8e4fb970d78cbe997.1638692171.git.christophe.jaillet@wanadoo.fr Signed-off-by: Christophe JAILLET Reviewed-by: Shiraz Saleem Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/irdma/pble.c | 6 ++---- drivers/infiniband/hw/irdma/pble.h | 1 - drivers/infiniband/hw/irdma/utils.c | 9 ++------- 3 files changed, 4 insertions(+), 12 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/hw/irdma/pble.c b/drivers/infiniband/hw/irdma/pble.c index da032b952755..fed49da770f3 100644 --- a/drivers/infiniband/hw/irdma/pble.c +++ b/drivers/infiniband/hw/irdma/pble.c @@ -25,8 +25,7 @@ void irdma_destroy_pble_prm(struct irdma_hmc_pble_rsrc *pble_rsrc) list_del(&chunk->list); if (chunk->type == PBLE_SD_PAGED) irdma_pble_free_paged_mem(chunk); - if (chunk->bitmapbuf) - kfree(chunk->bitmapmem.va); + bitmap_free(chunk->bitmapbuf); kfree(chunk->chunkmem.va); } } @@ -299,8 +298,7 @@ add_pble_prm(struct irdma_hmc_pble_rsrc *pble_rsrc) return 0; error: - if (chunk->bitmapbuf) - kfree(chunk->bitmapmem.va); + bitmap_free(chunk->bitmapbuf); kfree(chunk->chunkmem.va); return ret_code; diff --git a/drivers/infiniband/hw/irdma/pble.h b/drivers/infiniband/hw/irdma/pble.h index e1b3b8118a2c..aa20827dcc9d 100644 --- a/drivers/infiniband/hw/irdma/pble.h +++ b/drivers/infiniband/hw/irdma/pble.h @@ -78,7 +78,6 @@ struct irdma_chunk { u32 pg_cnt; enum irdma_alloc_type type; struct irdma_sc_dev *dev; - struct irdma_virt_mem bitmapmem; struct irdma_virt_mem chunkmem; }; diff --git a/drivers/infiniband/hw/irdma/utils.c b/drivers/infiniband/hw/irdma/utils.c index 8b42c43fc14f..981107b40c90 100644 --- a/drivers/infiniband/hw/irdma/utils.c +++ b/drivers/infiniband/hw/irdma/utils.c @@ -2239,15 +2239,10 @@ enum irdma_status_code irdma_prm_add_pble_mem(struct irdma_pble_prm *pprm, sizeofbitmap = (u64)pchunk->size >> pprm->pble_shift; - pchunk->bitmapmem.size = sizeofbitmap >> 3; - pchunk->bitmapmem.va = kzalloc(pchunk->bitmapmem.size, GFP_KERNEL); - - if (!pchunk->bitmapmem.va) + pchunk->bitmapbuf = bitmap_zalloc(sizeofbitmap, GFP_KERNEL); + if (!pchunk->bitmapbuf) return IRDMA_ERR_NO_MEMORY; - pchunk->bitmapbuf = pchunk->bitmapmem.va; - bitmap_zero(pchunk->bitmapbuf, sizeofbitmap); - pchunk->sizeofbitmap = sizeofbitmap; /* each pble is 8 bytes hence shift by 3 */ pprm->total_pble_alloc += pchunk->size >> 3; -- cgit v1.2.3-59-g8ed1b From 25b5d6fd6d13b2de3780a0ae247befc43c4576fe Mon Sep 17 00:00:00 2001 From: Shiraz Saleem Date: Wed, 1 Dec 2021 17:15:08 -0600 Subject: RDMA/irdma: Report correct WC errors Return IBV_WC_REM_OP_ERR for responder QP errors instead of IBV_WC_REM_ACCESS_ERR. Return IBV_WC_LOC_QP_OP_ERR for errors detected on the SQ with bad opcodes Fixes: 44d9e52977a1 ("RDMA/irdma: Implement device initialization definitions") Link: https://lore.kernel.org/r/20211201231509.1930-1-shiraz.saleem@intel.com Signed-off-by: Shiraz Saleem Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/irdma/hw.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/hw/irdma/hw.c b/drivers/infiniband/hw/irdma/hw.c index 4108dcabece2..1bae1dca6a62 100644 --- a/drivers/infiniband/hw/irdma/hw.c +++ b/drivers/infiniband/hw/irdma/hw.c @@ -146,6 +146,7 @@ static void irdma_set_flush_fields(struct irdma_sc_qp *qp, qp->flush_code = FLUSH_PROT_ERR; break; case IRDMA_AE_AMP_BAD_QP: + case IRDMA_AE_WQE_UNEXPECTED_OPCODE: qp->flush_code = FLUSH_LOC_QP_OP_ERR; break; case IRDMA_AE_AMP_BAD_STAG_KEY: @@ -156,7 +157,6 @@ static void irdma_set_flush_fields(struct irdma_sc_qp *qp, case IRDMA_AE_PRIV_OPERATION_DENIED: case IRDMA_AE_IB_INVALID_REQUEST: case IRDMA_AE_IB_REMOTE_ACCESS_ERROR: - case IRDMA_AE_IB_REMOTE_OP_ERROR: qp->flush_code = FLUSH_REM_ACCESS_ERR; qp->event_type = IRDMA_QP_EVENT_ACCESS_ERR; break; @@ -184,6 +184,9 @@ static void irdma_set_flush_fields(struct irdma_sc_qp *qp, case IRDMA_AE_AMP_MWBIND_INVALID_BOUNDS: qp->flush_code = FLUSH_MW_BIND_ERR; break; + case IRDMA_AE_IB_REMOTE_OP_ERROR: + qp->flush_code = FLUSH_REM_OP_ERR; + break; default: qp->flush_code = FLUSH_FATAL_ERR; break; -- cgit v1.2.3-59-g8ed1b From 10467ce09fefa2e74359f5b2ab1efb8909402f19 Mon Sep 17 00:00:00 2001 From: Tatyana Nikolova Date: Wed, 1 Dec 2021 17:15:09 -0600 Subject: RDMA/irdma: Don't arm the CQ more than two times if no CE for this CQ Completion events (CEs) are lost if the application is allowed to arm the CQ more than two times when no new CE for this CQ has been generated by the HW. Check if arming has been done for the CQ and if not, arm the CQ for any event otherwise promote to arm the CQ for any event only when the last arm event was solicited. Fixes: b48c24c2d710 ("RDMA/irdma: Implement device supported verb APIs") Link: https://lore.kernel.org/r/20211201231509.1930-2-shiraz.saleem@intel.com Signed-off-by: Tatyana Nikolova Signed-off-by: Shiraz Saleem Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/irdma/hw.c | 2 ++ drivers/infiniband/hw/irdma/main.h | 1 + drivers/infiniband/hw/irdma/utils.c | 15 +++++++++++++++ drivers/infiniband/hw/irdma/verbs.c | 23 ++++++++++++++++++----- drivers/infiniband/hw/irdma/verbs.h | 2 ++ 5 files changed, 38 insertions(+), 5 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/hw/irdma/hw.c b/drivers/infiniband/hw/irdma/hw.c index 1bae1dca6a62..b4c657f5f2f9 100644 --- a/drivers/infiniband/hw/irdma/hw.c +++ b/drivers/infiniband/hw/irdma/hw.c @@ -60,6 +60,8 @@ static void irdma_iwarp_ce_handler(struct irdma_sc_cq *iwcq) { struct irdma_cq *cq = iwcq->back_cq; + if (!cq->user_mode) + cq->armed = false; if (cq->ibcq.comp_handler) cq->ibcq.comp_handler(&cq->ibcq, cq->ibcq.cq_context); } diff --git a/drivers/infiniband/hw/irdma/main.h b/drivers/infiniband/hw/irdma/main.h index 91a497139ba3..cb218cab79ac 100644 --- a/drivers/infiniband/hw/irdma/main.h +++ b/drivers/infiniband/hw/irdma/main.h @@ -542,6 +542,7 @@ int irdma_ah_cqp_op(struct irdma_pci_f *rf, struct irdma_sc_ah *sc_ah, u8 cmd, void (*callback_fcn)(struct irdma_cqp_request *cqp_request), void *cb_param); void irdma_gsi_ud_qp_ah_cb(struct irdma_cqp_request *cqp_request); +bool irdma_cq_empty(struct irdma_cq *iwcq); int irdma_inetaddr_event(struct notifier_block *notifier, unsigned long event, void *ptr); int irdma_inet6addr_event(struct notifier_block *notifier, unsigned long event, diff --git a/drivers/infiniband/hw/irdma/utils.c b/drivers/infiniband/hw/irdma/utils.c index 981107b40c90..398736d8c78a 100644 --- a/drivers/infiniband/hw/irdma/utils.c +++ b/drivers/infiniband/hw/irdma/utils.c @@ -2486,3 +2486,18 @@ void irdma_ib_qp_event(struct irdma_qp *iwqp, enum irdma_qp_event_type event) ibevent.element.qp = &iwqp->ibqp; iwqp->ibqp.event_handler(&ibevent, iwqp->ibqp.qp_context); } + +bool irdma_cq_empty(struct irdma_cq *iwcq) +{ + struct irdma_cq_uk *ukcq; + u64 qword3; + __le64 *cqe; + u8 polarity; + + ukcq = &iwcq->sc_cq.cq_uk; + cqe = IRDMA_GET_CURRENT_CQ_ELEM(ukcq); + get_64bit_val(cqe, 24, &qword3); + polarity = (u8)FIELD_GET(IRDMA_CQ_VALID, qword3); + + return polarity != ukcq->polarity; +} diff --git a/drivers/infiniband/hw/irdma/verbs.c b/drivers/infiniband/hw/irdma/verbs.c index 0f66e809d418..8cd5f9261692 100644 --- a/drivers/infiniband/hw/irdma/verbs.c +++ b/drivers/infiniband/hw/irdma/verbs.c @@ -3584,18 +3584,31 @@ static int irdma_req_notify_cq(struct ib_cq *ibcq, struct irdma_cq *iwcq; struct irdma_cq_uk *ukcq; unsigned long flags; - enum irdma_cmpl_notify cq_notify = IRDMA_CQ_COMPL_EVENT; + enum irdma_cmpl_notify cq_notify; + bool promo_event = false; + int ret = 0; + cq_notify = notify_flags == IB_CQ_SOLICITED ? + IRDMA_CQ_COMPL_SOLICITED : IRDMA_CQ_COMPL_EVENT; iwcq = to_iwcq(ibcq); ukcq = &iwcq->sc_cq.cq_uk; - if (notify_flags == IB_CQ_SOLICITED) - cq_notify = IRDMA_CQ_COMPL_SOLICITED; spin_lock_irqsave(&iwcq->lock, flags); - irdma_uk_cq_request_notification(ukcq, cq_notify); + /* Only promote to arm the CQ for any event if the last arm event was solicited. */ + if (iwcq->last_notify == IRDMA_CQ_COMPL_SOLICITED && notify_flags != IB_CQ_SOLICITED) + promo_event = true; + + if (!iwcq->armed || promo_event) { + iwcq->armed = true; + iwcq->last_notify = cq_notify; + irdma_uk_cq_request_notification(ukcq, cq_notify); + } + + if ((notify_flags & IB_CQ_REPORT_MISSED_EVENTS) && !irdma_cq_empty(iwcq)) + ret = 1; spin_unlock_irqrestore(&iwcq->lock, flags); - return 0; + return ret; } static int irdma_roce_port_immutable(struct ib_device *ibdev, u32 port_num, diff --git a/drivers/infiniband/hw/irdma/verbs.h b/drivers/infiniband/hw/irdma/verbs.h index 5c244cd321a3..d0fdef8d09ea 100644 --- a/drivers/infiniband/hw/irdma/verbs.h +++ b/drivers/infiniband/hw/irdma/verbs.h @@ -110,6 +110,8 @@ struct irdma_cq { u16 cq_size; u16 cq_num; bool user_mode; + bool armed; + enum irdma_cmpl_notify last_notify; u32 polled_cmpls; u32 cq_mem_size; struct irdma_dma_mem kmem; -- cgit v1.2.3-59-g8ed1b