From 9a4467a6b282a299b932608ac2c9034f8415359f Mon Sep 17 00:00:00 2001 From: Selvin Xavier Date: Mon, 25 Nov 2019 00:39:29 -0800 Subject: RDMA/bnxt_re: Avoid freeing MR resources if dereg fails The driver returns an error code for MR dereg, but frees the MR structure. When the MR dereg is retried due to previous error, the system crashes as the structure is already freed. BUG: unable to handle kernel NULL pointer dereference at 00000000000001b8 PGD 0 P4D 0 Oops: 0000 [#1] SMP PTI CPU: 7 PID: 12178 Comm: ib_send_bw Kdump: loaded Not tainted 4.18.0-124.el8.x86_64 #1 Hardware name: Dell Inc. PowerEdge R430/03XKDV, BIOS 1.1.10 03/10/2015 RIP: 0010:__dev_printk+0x2a/0x70 Code: 0f 1f 44 00 00 49 89 d1 48 85 f6 0f 84 f6 2b 00 00 4c 8b 46 70 4d 85 c0 75 04 4c 8b 46 10 48 8b 86 a8 00 00 00 48 85 c0 74 16 <48> 8b 08 0f be 7f 01 48 c7 c2 13 ac ac 83 83 ef 30 e9 10 fe ff ff RSP: 0018:ffffaf7c04607a60 EFLAGS: 00010006 RAX: 00000000000001b8 RBX: ffffa0010c91c488 RCX: 0000000000000246 RDX: ffffaf7c04607a68 RSI: ffffa0010c91caa8 RDI: ffffffff83a788eb RBP: ffffaf7c04607ac8 R08: 0000000000000000 R09: ffffaf7c04607a68 R10: 0000000000000000 R11: 0000000000000001 R12: ffffaf7c04607b90 R13: 000000000000000e R14: 0000000000000000 R15: 00000000ffffa001 FS: 0000146fa1f1cdc0(0000) GS:ffffa0012fac0000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00000000000001b8 CR3: 000000007680a003 CR4: 00000000001606e0 Call Trace: dev_err+0x6c/0x90 ? dev_printk_emit+0x4e/0x70 bnxt_qplib_rcfw_send_message+0x594/0x660 [bnxt_re] ? dev_err+0x6c/0x90 bnxt_qplib_free_mrw+0x80/0xe0 [bnxt_re] bnxt_re_dereg_mr+0x2e/0xd0 [bnxt_re] ib_dereg_mr+0x2f/0x50 [ib_core] destroy_hw_idr_uobject+0x20/0x70 [ib_uverbs] uverbs_destroy_uobject+0x2e/0x170 [ib_uverbs] __uverbs_cleanup_ufile+0x6e/0x90 [ib_uverbs] uverbs_destroy_ufile_hw+0x61/0x130 [ib_uverbs] ib_uverbs_close+0x1f/0x80 [ib_uverbs] __fput+0xb7/0x230 task_work_run+0x8a/0xb0 do_exit+0x2da/0xb40 ... RIP: 0033:0x146fa113a387 Code: Bad RIP value. RSP: 002b:00007fff945d1478 EFLAGS: 00000246 ORIG_RAX: ffffffffffffff02 RAX: 0000000000000000 RBX: 000055a248908d70 RCX: 0000000000000000 RDX: 0000146fa1f2b000 RSI: 0000000000000001 RDI: 000055a248906488 RBP: 000055a248909630 R08: 0000000000010000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000000 R12: 000055a248906488 R13: 0000000000000001 R14: 0000000000000000 R15: 000055a2489095f0 Do not free the MR structures, when driver returns error to the stack. Fixes: 872f3578241d ("RDMA/bnxt_re: Add support for MRs with Huge pages") Link: https://lore.kernel.org/r/1574671174-5064-2-git-send-email-selvin.xavier@broadcom.com Signed-off-by: Selvin Xavier Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/bnxt_re/ib_verbs.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c index 9b6ca15a183c..ad5112a2325f 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c @@ -3305,8 +3305,10 @@ int bnxt_re_dereg_mr(struct ib_mr *ib_mr, struct ib_udata *udata) int rc; rc = bnxt_qplib_free_mrw(&rdev->qplib_res, &mr->qplib_mr); - if (rc) + if (rc) { dev_err(rdev_to_dev(rdev), "Dereg MR failed: %#x\n", rc); + return rc; + } if (mr->pages) { rc = bnxt_qplib_free_fast_reg_page_list(&rdev->qplib_res, -- cgit v1.2.3-59-g8ed1b From c5275723580922e5f3264f96751337661a153c7d Mon Sep 17 00:00:00 2001 From: Selvin Xavier Date: Mon, 25 Nov 2019 00:39:30 -0800 Subject: RDMA/bnxt_re: Fix Send Work Entry state check while polling completions Some adapters need a fence Work Entry to handle retransmission. Currently the driver checks for this condition, only if the Send queue entry is signalled. Implement the condition check, irrespective of the signalled state of the Work queue entries Failure to add the fence can result in access to memory that is already marked as completed, triggering data corruption, transmission failure, IOMMU failures, etc. Fixes: 9152e0b722b2 ("RDMA/bnxt_re: HW workarounds for handling specific conditions") Link: https://lore.kernel.org/r/1574671174-5064-3-git-send-email-selvin.xavier@broadcom.com Signed-off-by: Selvin Xavier Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/bnxt_re/qplib_fp.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.c b/drivers/infiniband/hw/bnxt_re/qplib_fp.c index 958c1ff9c515..4d07d22bfa7b 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_fp.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_fp.c @@ -2283,13 +2283,13 @@ static int bnxt_qplib_cq_process_req(struct bnxt_qplib_cq *cq, /* Add qp to flush list of the CQ */ bnxt_qplib_add_flush_qp(qp); } else { + /* Before we complete, do WA 9060 */ + if (do_wa9060(qp, cq, cq_cons, sw_sq_cons, + cqe_sq_cons)) { + *lib_qp = qp; + goto out; + } if (swq->flags & SQ_SEND_FLAGS_SIGNAL_COMP) { - /* Before we complete, do WA 9060 */ - if (do_wa9060(qp, cq, cq_cons, sw_sq_cons, - cqe_sq_cons)) { - *lib_qp = qp; - goto out; - } cqe->status = CQ_REQ_STATUS_OK; cqe++; (*budget)--; -- cgit v1.2.3-59-g8ed1b From ca9033ba69c7e3477f207df69867b2ea969197c8 Mon Sep 17 00:00:00 2001 From: Kaike Wan Date: Thu, 19 Dec 2019 16:19:41 -0500 Subject: IB/hfi1: Don't cancel unused work item In the iowait structure, two iowait_work entries were included to queue a given object: one for normal IB operations, and the other for TID RDMA operations. For non-TID RDMA operations, the iowait_work structure for TID RDMA is initialized to contain a NULL function (not used). When the QP is reset, the function iowait_cancel_work will be called to cancel any pending work. The problem is that this function will call cancel_work_sync() for both iowait_work entries, even though the one for TID RDMA is not used at all. Eventually, the call cascades to __flush_work(), wherein a WARN_ON will be triggered due to the fact that work->func is NULL. The WARN_ON was introduced in commit 4d43d395fed1 ("workqueue: Try to catch flush_work() without INIT_WORK().") This patch fixes the issue by making sure that a work function is present for TID RDMA before calling cancel_work_sync in iowait_cancel_work. Fixes: 4d43d395fed1 ("workqueue: Try to catch flush_work() without INIT_WORK().") Fixes: 5da0fc9dbf89 ("IB/hfi1: Prepare resource waits for dual leg") Link: https://lore.kernel.org/r/20191219211941.58387.39883.stgit@awfm-01.aw.intel.com Reviewed-by: Mike Marciniszyn Signed-off-by: Kaike Wan Signed-off-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hfi1/iowait.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/hw/hfi1/iowait.c b/drivers/infiniband/hw/hfi1/iowait.c index adb4a1ba921b..5836fe7b2817 100644 --- a/drivers/infiniband/hw/hfi1/iowait.c +++ b/drivers/infiniband/hw/hfi1/iowait.c @@ -81,7 +81,9 @@ void iowait_init(struct iowait *wait, u32 tx_limit, void iowait_cancel_work(struct iowait *w) { cancel_work_sync(&iowait_get_ib_work(w)->iowork); - cancel_work_sync(&iowait_get_tid_work(w)->iowork); + /* Make sure that the iowork for TID RDMA is used */ + if (iowait_get_tid_work(w)->iowork.func) + cancel_work_sync(&iowait_get_tid_work(w)->iowork); } /** -- cgit v1.2.3-59-g8ed1b From b2ff0d510182eb5cc05a65d1b2371af62c4b170c Mon Sep 17 00:00:00 2001 From: Kaike Wan Date: Thu, 19 Dec 2019 18:19:20 -0500 Subject: IB/hfi1: Adjust flow PSN with the correct resync_psn When a TID RDMA ACK to RESYNC request is received, the flow PSNs for pending TID RDMA WRITE segments will be adjusted with the next flow generation number, based on the resync_psn value extracted from the flow PSN of the TID RDMA ACK packet. The resync_psn value indicates the last flow PSN for which a TID RDMA WRITE DATA packet has been received by the responder and the requester should resend TID RDMA WRITE DATA packets, starting from the next flow PSN. However, if resync_psn points to the last flow PSN for a segment and the next segment flow PSN starts with a new generation number, use of the old resync_psn to adjust the flow PSN for the next segment will lead to miscalculation, resulting in WARN_ON and sge rewinding errors: WARNING: CPU: 4 PID: 146961 at /nfs/site/home/phcvs2/gitrepo/ifs-all/components/Drivers/tmp/rpmbuild/BUILD/ifs-kernel-updates-3.10.0_957.el7.x86_64/hfi1/tid_rdma.c:4764 hfi1_rc_rcv_tid_rdma_ack+0x8f6/0xa90 [hfi1] Modules linked in: ib_ipoib(OE) hfi1(OE) rdmavt(OE) rpcsec_gss_krb5 auth_rpcgss nfsv4 dns_resolver nfsv3 nfs_acl nfs lockd grace fscache iTCO_wdt iTCO_vendor_support skx_edac intel_powerclamp coretemp intel_rapl iosf_mbi kvm irqbypass crc32_pclmul ghash_clmulni_intel ib_isert iscsi_target_mod target_core_mod aesni_intel lrw gf128mul glue_helper ablk_helper cryptd rpcrdma sunrpc opa_vnic ast ttm ib_iser libiscsi drm_kms_helper scsi_transport_iscsi ipmi_ssif syscopyarea sysfillrect sysimgblt fb_sys_fops drm joydev ipmi_si pcspkr sg drm_panel_orientation_quirks ipmi_devintf lpc_ich i2c_i801 ipmi_msghandler wmi rdma_ucm ib_ucm ib_uverbs acpi_cpufreq acpi_power_meter ib_umad rdma_cm ib_cm iw_cm ip_tables ext4 mbcache jbd2 sd_mod crc_t10dif crct10dif_generic crct10dif_pclmul i2c_algo_bit crct10dif_common crc32c_intel e1000e ib_core ahci libahci ptp libata pps_core nfit libnvdimm [last unloaded: rdmavt] CPU: 4 PID: 146961 Comm: kworker/4:0H Kdump: loaded Tainted: G W OE ------------ 3.10.0-957.el7.x86_64 #1 Hardware name: Intel Corporation S2600WFT/S2600WFT, BIOS SE5C620.86B.0X.02.0117.040420182310 04/04/2018 Workqueue: hfi0_0 _hfi1_do_tid_send [hfi1] Call Trace: [] dump_stack+0x19/0x1b [] __warn+0xd8/0x100 [] warn_slowpath_null+0x1d/0x20 [] hfi1_rc_rcv_tid_rdma_ack+0x8f6/0xa90 [hfi1] [] hfi1_kdeth_eager_rcv+0x1dc/0x210 [hfi1] [] ? hfi1_kdeth_expected_rcv+0x1ef/0x210 [hfi1] [] kdeth_process_eager+0x35/0x90 [hfi1] [] handle_receive_interrupt_nodma_rtail+0x17a/0x2b0 [hfi1] [] receive_context_interrupt+0x23/0x40 [hfi1] [] __handle_irq_event_percpu+0x44/0x1c0 [] handle_irq_event_percpu+0x32/0x80 [] handle_irq_event+0x3c/0x60 [] handle_edge_irq+0x7f/0x150 [] handle_irq+0xe4/0x1a0 [] do_IRQ+0x4d/0xf0 [] common_interrupt+0x162/0x162 [] ? swiotlb_map_page+0x49/0x150 [] hfi1_verbs_send_dma+0x291/0xb70 [hfi1] [] ? hfi1_wait_kmem+0xf0/0xf0 [hfi1] [] hfi1_verbs_send+0x126/0x2b0 [hfi1] [] _hfi1_do_tid_send+0x1d3/0x320 [hfi1] [] process_one_work+0x17f/0x440 [] worker_thread+0x126/0x3c0 [] ? manage_workers.isra.25+0x2a0/0x2a0 [] kthread+0xd1/0xe0 [] ? insert_kthread_work+0x40/0x40 [] ret_from_fork_nospec_begin+0x7/0x21 [] ? insert_kthread_work+0x40/0x40 This patch fixes the issue by adjusting the resync_psn first if the flow generation has been advanced for a pending segment. Fixes: 9e93e967f7b4 ("IB/hfi1: Add a function to receive TID RDMA ACK packet") Link: https://lore.kernel.org/r/20191219231920.51069.37147.stgit@awfm-01.aw.intel.com Cc: Reviewed-by: Mike Marciniszyn Signed-off-by: Kaike Wan Signed-off-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hfi1/tid_rdma.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.c b/drivers/infiniband/hw/hfi1/tid_rdma.c index e53f542b60af..8a2e0d9351e9 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.c +++ b/drivers/infiniband/hw/hfi1/tid_rdma.c @@ -4633,6 +4633,15 @@ void hfi1_rc_rcv_tid_rdma_ack(struct hfi1_packet *packet) */ fpsn = full_flow_psn(flow, flow->flow_state.spsn); req->r_ack_psn = psn; + /* + * If resync_psn points to the last flow PSN for a + * segment and the new segment (likely from a new + * request) starts with a new generation number, we + * need to adjust resync_psn accordingly. + */ + if (flow->flow_state.generation != + (resync_psn >> HFI1_KDETH_BTH_SEQ_SHIFT)) + resync_psn = mask_psn(fpsn - 1); flow->resync_npkts += delta_psn(mask_psn(resync_psn + 1), fpsn); /* -- cgit v1.2.3-59-g8ed1b From 9554de394b7eee01606e64c3806cd43893f3037e Mon Sep 17 00:00:00 2001 From: Shiraz Saleem Date: Tue, 7 Jan 2020 10:22:23 -0600 Subject: i40iw: Remove setting of VMA private data and use rdma_user_mmap_io vm_ops is now initialized in ib_uverbs_mmap() with the recent rdma mmap API changes. Earlier it was done in rdma_umap_priv_init() which would not be called unless a driver called rdma_user_mmap_io() in its mmap. i40iw does not use the rdma_user_mmap_io API but sets the vma's vm_private_data to a driver object. This now conflicts with the vm_op rdma_umap_close as priv pointer points to the i40iw driver object instead of the private data setup by core when rdma_user_mmap_io is called. This leads to a crash in rdma_umap_close with a mmap put being called when it should not have. Remove the redundant setting of the vma private_data in i40iw as it is not used. Also move i40iw over to use the rdma_user_mmap_io API. This gives the extra protection of having the mappings zapped when the context is detsroyed. BUG: unable to handle page fault for address: 0000000100000001 #PF: supervisor write access in kernel mode #PF: error_code(0x0002) - not-present page PGD 0 P4D 0 Oops: 0002 [#1] SMP PTI CPU: 6 PID: 9528 Comm: rping Kdump: loaded Not tainted 5.5.0-rc4+ #117 Hardware name: Gigabyte Technology Co., Ltd. To be filled by O.E.M./Q87M-D2H, BIOS F7 01/17/2014 RIP: 0010:rdma_user_mmap_entry_put+0xa/0x30 [ib_core] RSP: 0018:ffffb340c04c7c38 EFLAGS: 00010202 RAX: 00000000ffffffff RBX: ffff9308e7be2a00 RCX: 000000000000cec0 RDX: 0000000000000000 RSI: 0000000000000006 RDI: 0000000100000001 RBP: ffff9308dc7641f0 R08: 0000000000000001 R09: 0000000000000000 R10: 0000000000000001 R11: ffffffff8d4414d8 R12: ffff93075182c780 R13: 0000000000000001 R14: ffff93075182d2a8 R15: ffff9308e2ddc840 FS: 0000000000000000(0000) GS:ffff9308fdc00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000100000001 CR3: 00000002e0412004 CR4: 00000000001606e0 Call Trace: rdma_umap_close+0x40/0x90 [ib_uverbs] remove_vma+0x43/0x80 exit_mmap+0xfd/0x1b0 mmput+0x6e/0x130 do_exit+0x290/0xcc0 ? get_signal+0x152/0xc40 do_group_exit+0x46/0xc0 get_signal+0x1bd/0xc40 ? prepare_to_wait_event+0x97/0x190 do_signal+0x36/0x630 ? remove_wait_queue+0x60/0x60 ? __audit_syscall_exit+0x1d9/0x290 ? rcu_read_lock_sched_held+0x52/0x90 ? kfree+0x21c/0x2e0 exit_to_usermode_loop+0x4f/0xc3 do_syscall_64+0x1ed/0x270 entry_SYSCALL_64_after_hwframe+0x49/0xbe RIP: 0033:0x7fae715a81fd Code: Bad RIP value. RSP: 002b:00007fae6e163cb0 EFLAGS: 00000293 ORIG_RAX: 0000000000000001 RAX: fffffffffffffe00 RBX: 00007fae6e163d30 RCX: 00007fae715a81fd RDX: 0000000000000010 RSI: 00007fae6e163cf0 RDI: 0000000000000003 RBP: 00000000013413a0 R08: 00007fae68000000 R09: 0000000000000017 R10: 0000000000000001 R11: 0000000000000293 R12: 00007fae680008c0 R13: 00007fae6e163cf0 R14: 00007fae717c9804 R15: 00007fae6e163ed0 CR2: 0000000100000001 ---[ end trace b33d58d3a06782cb ]--- RIP: 0010:rdma_user_mmap_entry_put+0xa/0x30 [ib_core] Fixes: b86deba977a9 ("RDMA/core: Move core content from ib_uverbs to ib_core") Link: https://lore.kernel.org/r/20200107162223.1745-1-shiraz.saleem@intel.com Signed-off-by: Shiraz Saleem Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/i40iw/i40iw_verbs.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/hw/i40iw/i40iw_verbs.c b/drivers/infiniband/hw/i40iw/i40iw_verbs.c index 86375947bc67..dbd96d029d8b 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_verbs.c +++ b/drivers/infiniband/hw/i40iw/i40iw_verbs.c @@ -169,8 +169,7 @@ static void i40iw_dealloc_ucontext(struct ib_ucontext *context) static int i40iw_mmap(struct ib_ucontext *context, struct vm_area_struct *vma) { struct i40iw_ucontext *ucontext; - u64 db_addr_offset; - u64 push_offset; + u64 db_addr_offset, push_offset, pfn; ucontext = to_ucontext(context); if (ucontext->iwdev->sc_dev.is_pf) { @@ -189,7 +188,6 @@ static int i40iw_mmap(struct ib_ucontext *context, struct vm_area_struct *vma) if (vma->vm_pgoff == (db_addr_offset >> PAGE_SHIFT)) { vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); - vma->vm_private_data = ucontext; } else { if ((vma->vm_pgoff - (push_offset >> PAGE_SHIFT)) % 2) vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); @@ -197,12 +195,12 @@ static int i40iw_mmap(struct ib_ucontext *context, struct vm_area_struct *vma) vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot); } - if (io_remap_pfn_range(vma, vma->vm_start, - vma->vm_pgoff + (pci_resource_start(ucontext->iwdev->ldev->pcidev, 0) >> PAGE_SHIFT), - PAGE_SIZE, vma->vm_page_prot)) - return -EAGAIN; + pfn = vma->vm_pgoff + + (pci_resource_start(ucontext->iwdev->ldev->pcidev, 0) >> + PAGE_SHIFT); - return 0; + return rdma_user_mmap_io(context, vma, pfn, PAGE_SIZE, + vma->vm_page_prot, NULL); } /** -- cgit v1.2.3-59-g8ed1b