From 8f28b178f71cc56eccf2a6e2c0ace17c82f900d7 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Mon, 3 Sep 2018 09:11:14 +0300 Subject: RDMA/mlx4: Ensure that maximal send/receive SGE less than supported by HW In calculating the global maximum number of the Scatter/Gather elements supported, the following four maximum parameters must be taken into consideration: max_sg_rq, max_sg_sq, max_desc_sz_rq and max_desc_sz_sq. However instead of bringing this complexity to query_device, which still won't be sufficient anyway (the calculations are dependent on QP type), the safer approach will be to restore old code, which will give us 32 SGEs. Fixes: 33023fb85a42 ("IB/core: add max_send_sge and max_recv_sge attributes") Reported-by: Chuck Lever Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx4/main.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'drivers/infiniband/hw') diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index ca0f1ee26091..0bbeaaae47e0 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -517,9 +517,11 @@ static int mlx4_ib_query_device(struct ib_device *ibdev, props->page_size_cap = dev->dev->caps.page_size_cap; props->max_qp = dev->dev->quotas.qp; props->max_qp_wr = dev->dev->caps.max_wqes - MLX4_IB_SQ_MAX_SPARE; - props->max_send_sge = dev->dev->caps.max_sq_sg; - props->max_recv_sge = dev->dev->caps.max_rq_sg; - props->max_sge_rd = MLX4_MAX_SGE_RD; + props->max_send_sge = + min(dev->dev->caps.max_sq_sg, dev->dev->caps.max_rq_sg); + props->max_recv_sge = + min(dev->dev->caps.max_sq_sg, dev->dev->caps.max_rq_sg); + props->max_sge_rd = MLX4_MAX_SGE_RD; props->max_cq = dev->dev->quotas.cq; props->max_cqe = dev->dev->caps.max_cqes; props->max_mr = dev->dev->quotas.mpt; -- cgit v1.2.3-59-g8ed1b From 0dbfaa9f2813787679e296eb5476e40938ab48c8 Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Thu, 20 Sep 2018 12:58:46 -0700 Subject: IB/hfi1: Fix SL array bounds check The SL specified by a user needs to be a valid SL. Add a range check to the user specified SL value which protects from running off the end of the SL to SC table. CC: stable@vger.kernel.org Fixes: 7724105686e7 ("IB/hfi1: add driver files") Signed-off-by: Ira Weiny Signed-off-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hfi1/verbs.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'drivers/infiniband/hw') diff --git a/drivers/infiniband/hw/hfi1/verbs.c b/drivers/infiniband/hw/hfi1/verbs.c index 13374c727b14..a7c586a5589d 100644 --- a/drivers/infiniband/hw/hfi1/verbs.c +++ b/drivers/infiniband/hw/hfi1/verbs.c @@ -1582,6 +1582,7 @@ static int hfi1_check_ah(struct ib_device *ibdev, struct rdma_ah_attr *ah_attr) struct hfi1_pportdata *ppd; struct hfi1_devdata *dd; u8 sc5; + u8 sl; if (hfi1_check_mcast(rdma_ah_get_dlid(ah_attr)) && !(rdma_ah_get_ah_flags(ah_attr) & IB_AH_GRH)) @@ -1590,8 +1591,13 @@ static int hfi1_check_ah(struct ib_device *ibdev, struct rdma_ah_attr *ah_attr) /* test the mapping for validity */ ibp = to_iport(ibdev, rdma_ah_get_port_num(ah_attr)); ppd = ppd_from_ibp(ibp); - sc5 = ibp->sl_to_sc[rdma_ah_get_sl(ah_attr)]; dd = dd_from_ppd(ppd); + + sl = rdma_ah_get_sl(ah_attr); + if (sl >= ARRAY_SIZE(ibp->sl_to_sc)) + return -EINVAL; + + sc5 = ibp->sl_to_sc[sl]; if (sc_to_vlt(dd, sc5) > num_vls && sc_to_vlt(dd, sc5) != 0xf) return -EINVAL; return 0; -- cgit v1.2.3-59-g8ed1b From 94694d18cf27a6faad91487a38ce516c2b16e7d9 Mon Sep 17 00:00:00 2001 From: "Michael J. Ruhl" Date: Thu, 20 Sep 2018 12:58:56 -0700 Subject: IB/hfi1: Invalid user input can result in crash If the number of packets in a user sdma request does not match the actual iovectors being sent, sdma_cleanup can be called on an uninitialized request structure, resulting in a crash similar to this: BUG: unable to handle kernel NULL pointer dereference at 0000000000000008 IP: [] __sdma_txclean+0x57/0x1e0 [hfi1] PGD 8000001044f61067 PUD 1052706067 PMD 0 Oops: 0000 [#1] SMP CPU: 30 PID: 69912 Comm: upsm Kdump: loaded Tainted: G OE ------------ 3.10.0-862.el7.x86_64 #1 Hardware name: Intel Corporation S2600KPR/S2600KPR, BIOS SE5C610.86B.01.01.0019.101220160604 10/12/2016 task: ffff8b331c890000 ti: ffff8b2ed1f98000 task.ti: ffff8b2ed1f98000 RIP: 0010:[] [] __sdma_txclean+0x57/0x1e0 [hfi1] RSP: 0018:ffff8b2ed1f9bab0 EFLAGS: 00010286 RAX: 0000000000008b2b RBX: ffff8b2adf6e0000 RCX: 0000000000000000 RDX: 00000000000000a0 RSI: ffff8b2e9eedc540 RDI: ffff8b2adf6e0000 RBP: ffff8b2ed1f9bad8 R08: 0000000000000000 R09: ffffffffc0b04a06 R10: ffff8b331c890190 R11: ffffe6ed00bf1840 R12: ffff8b3315480000 R13: ffff8b33154800f0 R14: 00000000fffffff2 R15: ffff8b2e9eedc540 FS: 00007f035ac47740(0000) GS:ffff8b331e100000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000008 CR3: 0000000c03fe6000 CR4: 00000000001607e0 Call Trace: [] user_sdma_send_pkts+0xdcd/0x1990 [hfi1] [] ? gup_pud_range+0x140/0x290 [] ? hfi1_mmu_rb_insert+0x155/0x1b0 [hfi1] [] hfi1_user_sdma_process_request+0xc5b/0x11b0 [hfi1] [] hfi1_aio_write+0xba/0x110 [hfi1] [] do_sync_readv_writev+0x7b/0xd0 [] do_readv_writev+0xce/0x260 [] ? tty_ldisc_deref+0x19/0x20 [] ? n_tty_ioctl+0xe0/0xe0 [] vfs_writev+0x35/0x60 [] SyS_writev+0x7f/0x110 [] system_call_fastpath+0x1c/0x21 Code: 06 49 c7 47 18 00 00 00 00 0f 87 89 01 00 00 5b 41 5c 41 5d 41 5e 41 5f 5d c3 66 2e 0f 1f 84 00 00 00 00 00 48 8b 4e 10 48 89 fb <48> 8b 51 08 49 89 d4 83 e2 0c 41 81 e4 00 e0 00 00 48 c1 ea 02 RIP [] __sdma_txclean+0x57/0x1e0 [hfi1] RSP CR2: 0000000000000008 There are two exit points from user_sdma_send_pkts(). One (free_tx) merely frees the slab entry and one (free_txreq) cleans the sdma_txreq prior to freeing the slab entry. The free_txreq variation can only be called after one of the sdma_init*() variations has been called. In the panic case, the slab entry had been allocated but not inited. Fix the issue by exiting through free_tx thus avoiding sdma_clean(). Cc: # 4.9.x+ Fixes: 7724105686e7 ("IB/hfi1: add driver files") Reviewed-by: Mike Marciniszyn Reviewed-by: Lukasz Odzioba Signed-off-by: Michael J. Ruhl Signed-off-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hfi1/user_sdma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/infiniband/hw') diff --git a/drivers/infiniband/hw/hfi1/user_sdma.c b/drivers/infiniband/hw/hfi1/user_sdma.c index a3a7b33196d6..5c88706121c1 100644 --- a/drivers/infiniband/hw/hfi1/user_sdma.c +++ b/drivers/infiniband/hw/hfi1/user_sdma.c @@ -828,7 +828,7 @@ static int user_sdma_send_pkts(struct user_sdma_request *req, unsigned maxpkts) if (READ_ONCE(iovec->offset) == iovec->iov.iov_len) { if (++req->iov_idx == req->data_iovs) { ret = -EFAULT; - goto free_txreq; + goto free_tx; } iovec = &req->iovs[req->iov_idx]; WARN_ON(iovec->offset); -- cgit v1.2.3-59-g8ed1b From d623500b3c4efd8d4e945ac9003c6b87b469a9ab Mon Sep 17 00:00:00 2001 From: "Michael J. Ruhl" Date: Thu, 20 Sep 2018 12:59:05 -0700 Subject: IB/hfi1: Fix context recovery when PBC has an UnsupportedVL If a packet stream uses an UnsupportedVL (virtual lane), the send engine will not send the packet, and it will not indicate that an error has occurred. This will cause the packet stream to block. HFI has 8 virtual lanes available for packet streams. Each lane can be enabled or disabled using the UnsupportedVL mask. If a lane is disabled, adding a packet to the send context must be disallowed. The current mask for determining unsupported VLs defaults to 0 (allow all). This is incorrect. Only the VLs that are defined should be allowed. Determine which VLs are disabled (mtu == 0), and set the appropriate unsupported bit in the mask. The correct mask will allow the send engine to error on the invalid VL, and error recovery will work correctly. Cc: # 4.9.x+ Fixes: 7724105686e7 ("IB/hfi1: add driver files") Reviewed-by: Mike Marciniszyn Reviewed-by: Lukasz Odzioba Signed-off-by: Michael J. Ruhl Signed-off-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hfi1/pio.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'drivers/infiniband/hw') diff --git a/drivers/infiniband/hw/hfi1/pio.c b/drivers/infiniband/hw/hfi1/pio.c index c2c1cba5b23b..cd962c9ea6bc 100644 --- a/drivers/infiniband/hw/hfi1/pio.c +++ b/drivers/infiniband/hw/hfi1/pio.c @@ -86,6 +86,7 @@ void pio_send_control(struct hfi1_devdata *dd, int op) unsigned long flags; int write = 1; /* write sendctrl back */ int flush = 0; /* re-read sendctrl to make sure it is flushed */ + int i; spin_lock_irqsave(&dd->sendctrl_lock, flags); @@ -95,9 +96,13 @@ void pio_send_control(struct hfi1_devdata *dd, int op) reg |= SEND_CTRL_SEND_ENABLE_SMASK; /* Fall through */ case PSC_DATA_VL_ENABLE: + mask = 0; + for (i = 0; i < ARRAY_SIZE(dd->vld); i++) + if (!dd->vld[i].mtu) + mask |= BIT_ULL(i); /* Disallow sending on VLs not enabled */ - mask = (((~0ull) << num_vls) & SEND_CTRL_UNSUPPORTED_VL_MASK) << - SEND_CTRL_UNSUPPORTED_VL_SHIFT; + mask = (mask & SEND_CTRL_UNSUPPORTED_VL_MASK) << + SEND_CTRL_UNSUPPORTED_VL_SHIFT; reg = (reg & ~SEND_CTRL_UNSUPPORTED_VL_SMASK) | mask; break; case PSC_GLOBAL_DISABLE: -- cgit v1.2.3-59-g8ed1b From b4a4957d3d1c328b733fce783b7264996f866ad2 Mon Sep 17 00:00:00 2001 From: "Michael J. Ruhl" Date: Thu, 20 Sep 2018 12:59:14 -0700 Subject: IB/hfi1: Fix destroy_qp hang after a link down rvt_destroy_qp() cannot complete until all in process packets have been released from the underlying hardware. If a link down event occurs, an application can hang with a kernel stack similar to: cat /proc//stack quiesce_qp+0x178/0x250 [hfi1] rvt_reset_qp+0x23d/0x400 [rdmavt] rvt_destroy_qp+0x69/0x210 [rdmavt] ib_destroy_qp+0xba/0x1c0 [ib_core] nvme_rdma_destroy_queue_ib+0x46/0x80 [nvme_rdma] nvme_rdma_free_queue+0x3c/0xd0 [nvme_rdma] nvme_rdma_destroy_io_queues+0x88/0xd0 [nvme_rdma] nvme_rdma_error_recovery_work+0x52/0xf0 [nvme_rdma] process_one_work+0x17a/0x440 worker_thread+0x126/0x3c0 kthread+0xcf/0xe0 ret_from_fork+0x58/0x90 0xffffffffffffffff quiesce_qp() waits until all outstanding packets have been freed. This wait should be momentary. During a link down event, the cleanup handling does not ensure that all packets caught by the link down are flushed properly. This is caused by the fact that the freeze path and the link down event is handled the same. This is not correct. The freeze path waits until the HFI is unfrozen and then restarts PIO. A link down is not a freeze event. The link down path cannot restart the PIO until link is restored. If the PIO path is restarted before the link comes up, the application (QP) using the PIO path will hang (until link is restored). Fix by separating the linkdown path from the freeze path and use the link down path for link down events. Close a race condition sc_disable() by acquiring both the progress and release locks. Close a race condition in sc_stop() by moving the setting of the flag bits under the alloc lock. Cc: # 4.9.x+ Fixes: 7724105686e7 ("IB/hfi1: add driver files") Reviewed-by: Mike Marciniszyn Signed-off-by: Michael J. Ruhl Signed-off-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hfi1/chip.c | 6 +++++- drivers/infiniband/hw/hfi1/pio.c | 42 +++++++++++++++++++++++++++++++-------- drivers/infiniband/hw/hfi1/pio.h | 2 ++ 3 files changed, 41 insertions(+), 9 deletions(-) (limited to 'drivers/infiniband/hw') diff --git a/drivers/infiniband/hw/hfi1/chip.c b/drivers/infiniband/hw/hfi1/chip.c index 2c19bf772451..e1668bcc2d13 100644 --- a/drivers/infiniband/hw/hfi1/chip.c +++ b/drivers/infiniband/hw/hfi1/chip.c @@ -6733,6 +6733,7 @@ void start_freeze_handling(struct hfi1_pportdata *ppd, int flags) struct hfi1_devdata *dd = ppd->dd; struct send_context *sc; int i; + int sc_flags; if (flags & FREEZE_SELF) write_csr(dd, CCE_CTRL, CCE_CTRL_SPC_FREEZE_SMASK); @@ -6743,11 +6744,13 @@ void start_freeze_handling(struct hfi1_pportdata *ppd, int flags) /* notify all SDMA engines that they are going into a freeze */ sdma_freeze_notify(dd, !!(flags & FREEZE_LINK_DOWN)); + sc_flags = SCF_FROZEN | SCF_HALTED | (flags & FREEZE_LINK_DOWN ? + SCF_LINK_DOWN : 0); /* do halt pre-handling on all enabled send contexts */ for (i = 0; i < dd->num_send_contexts; i++) { sc = dd->send_contexts[i].sc; if (sc && (sc->flags & SCF_ENABLED)) - sc_stop(sc, SCF_FROZEN | SCF_HALTED); + sc_stop(sc, sc_flags); } /* Send context are frozen. Notify user space */ @@ -10674,6 +10677,7 @@ int set_link_state(struct hfi1_pportdata *ppd, u32 state) add_rcvctrl(dd, RCV_CTRL_RCV_PORT_ENABLE_SMASK); handle_linkup_change(dd, 1); + pio_kernel_linkup(dd); /* * After link up, a new link width will have been set. diff --git a/drivers/infiniband/hw/hfi1/pio.c b/drivers/infiniband/hw/hfi1/pio.c index cd962c9ea6bc..752057647f09 100644 --- a/drivers/infiniband/hw/hfi1/pio.c +++ b/drivers/infiniband/hw/hfi1/pio.c @@ -926,20 +926,18 @@ void sc_free(struct send_context *sc) void sc_disable(struct send_context *sc) { u64 reg; - unsigned long flags; struct pio_buf *pbuf; if (!sc) return; /* do all steps, even if already disabled */ - spin_lock_irqsave(&sc->alloc_lock, flags); + spin_lock_irq(&sc->alloc_lock); reg = read_kctxt_csr(sc->dd, sc->hw_context, SC(CTRL)); reg &= ~SC(CTRL_CTXT_ENABLE_SMASK); sc->flags &= ~SCF_ENABLED; sc_wait_for_packet_egress(sc, 1); write_kctxt_csr(sc->dd, sc->hw_context, SC(CTRL), reg); - spin_unlock_irqrestore(&sc->alloc_lock, flags); /* * Flush any waiters. Once the context is disabled, @@ -949,7 +947,7 @@ void sc_disable(struct send_context *sc) * proceed with the flush. */ udelay(1); - spin_lock_irqsave(&sc->release_lock, flags); + spin_lock(&sc->release_lock); if (sc->sr) { /* this context has a shadow ring */ while (sc->sr_tail != sc->sr_head) { pbuf = &sc->sr[sc->sr_tail].pbuf; @@ -960,7 +958,8 @@ void sc_disable(struct send_context *sc) sc->sr_tail = 0; } } - spin_unlock_irqrestore(&sc->release_lock, flags); + spin_unlock(&sc->release_lock); + spin_unlock_irq(&sc->alloc_lock); } /* return SendEgressCtxtStatus.PacketOccupancy */ @@ -1183,11 +1182,39 @@ void pio_kernel_unfreeze(struct hfi1_devdata *dd) sc = dd->send_contexts[i].sc; if (!sc || !(sc->flags & SCF_FROZEN) || sc->type == SC_USER) continue; + if (sc->flags & SCF_LINK_DOWN) + continue; sc_enable(sc); /* will clear the sc frozen flag */ } } +/** + * pio_kernel_linkup() - Re-enable send contexts after linkup event + * @dd: valid devive data + * + * When the link goes down, the freeze path is taken. However, a link down + * event is different from a freeze because if the send context is re-enabled + * whowever is sending data will start sending data again, which will hang + * any QP that is sending data. + * + * The freeze path now looks at the type of event that occurs and takes this + * path for link down event. + */ +void pio_kernel_linkup(struct hfi1_devdata *dd) +{ + struct send_context *sc; + int i; + + for (i = 0; i < dd->num_send_contexts; i++) { + sc = dd->send_contexts[i].sc; + if (!sc || !(sc->flags & SCF_LINK_DOWN) || sc->type == SC_USER) + continue; + + sc_enable(sc); /* will clear the sc link down flag */ + } +} + /* * Wait for the SendPioInitCtxt.PioInitInProgress bit to clear. * Returns: @@ -1387,11 +1414,10 @@ void sc_stop(struct send_context *sc, int flag) { unsigned long flags; - /* mark the context */ - sc->flags |= flag; - /* stop buffer allocations */ spin_lock_irqsave(&sc->alloc_lock, flags); + /* mark the context */ + sc->flags |= flag; sc->flags &= ~SCF_ENABLED; spin_unlock_irqrestore(&sc->alloc_lock, flags); wake_up(&sc->halt_wait); diff --git a/drivers/infiniband/hw/hfi1/pio.h b/drivers/infiniband/hw/hfi1/pio.h index 058b08f459ab..aaf372c3e5d6 100644 --- a/drivers/infiniband/hw/hfi1/pio.h +++ b/drivers/infiniband/hw/hfi1/pio.h @@ -139,6 +139,7 @@ struct send_context { #define SCF_IN_FREE 0x02 #define SCF_HALTED 0x04 #define SCF_FROZEN 0x08 +#define SCF_LINK_DOWN 0x10 struct send_context_info { struct send_context *sc; /* allocated working context */ @@ -306,6 +307,7 @@ void set_pio_integrity(struct send_context *sc); void pio_reset_all(struct hfi1_devdata *dd); void pio_freeze(struct hfi1_devdata *dd); void pio_kernel_unfreeze(struct hfi1_devdata *dd); +void pio_kernel_linkup(struct hfi1_devdata *dd); /* global PIO send control operations */ #define PSC_GLOBAL_ENABLE 0 -- cgit v1.2.3-59-g8ed1b From de5c95d0f518537f59ee5aef762abc46f868c377 Mon Sep 17 00:00:00 2001 From: Selvin Xavier Date: Thu, 20 Sep 2018 22:33:00 -0700 Subject: RDMA/bnxt_re: Fix system crash during RDMA resource initialization bnxt_re_ib_reg acquires and releases the rtnl lock whenever it accesses the L2 driver. The following sequence can trigger a crash Acquires the rtnl_lock -> Registers roce driver callback with L2 driver -> release the rtnl lock bnxt_re acquires the rtnl_lock -> Request for MSIx vectors -> release the rtnl_lock Issue happens when bnxt_re proceeds with remaining part of initialization and L2 driver invokes bnxt_ulp_irq_stop as a part of bnxt_open_nic. The crash is in bnxt_qplib_nq_stop_irq as the NQ structures are not initialized yet, [ 3551.726647] BUG: unable to handle kernel NULL pointer dereference at (null) [ 3551.726656] IP: [] bnxt_qplib_nq_stop_irq+0x59/0xb0 [bnxt_re] [ 3551.726674] PGD 0 [ 3551.726679] Oops: 0002 1 SMP ... [ 3551.726822] Hardware name: Dell Inc. PowerEdge R720/08RW36, BIOS 2.4.3 07/09/2014 [ 3551.726826] task: ffff97e30eec5ee0 ti: ffff97e3173bc000 task.ti: ffff97e3173bc000 [ 3551.726829] RIP: 0010:[] [] bnxt_qplib_nq_stop_irq+0x59/0xb0 [bnxt_re] ... [ 3551.726872] Call Trace: [ 3551.726886] [] bnxt_re_stop_irq+0x4e/0x70 [bnxt_re] [ 3551.726899] [] bnxt_ulp_irq_stop+0x43/0x70 [bnxt_en] [ 3551.726908] [] bnxt_reserve_rings+0x174/0x1e0 [bnxt_en] [ 3551.726917] [] __bnxt_open_nic+0x368/0x9a0 [bnxt_en] [ 3551.726925] [] bnxt_open_nic+0x1b/0x50 [bnxt_en] [ 3551.726934] [] bnxt_setup_mq_tc+0x11f/0x260 [bnxt_en] [ 3551.726943] [] bnxt_dcbnl_ieee_setets+0xb8/0x1f0 [bnxt_en] [ 3551.726954] [] dcbnl_ieee_set+0x9a/0x250 [ 3551.726966] [] ? __alloc_skb+0xa1/0x2d0 [ 3551.726972] [] dcb_doit+0x13a/0x210 [ 3551.726981] [] rtnetlink_rcv_msg+0xa7/0x260 [ 3551.726989] [] ? rtnl_unicast+0x20/0x30 [ 3551.726996] [] ? __kmalloc_node_track_caller+0x58/0x290 [ 3551.727002] [] ? dcb_doit+0x166/0x210 [ 3551.727007] [] ? __alloc_skb+0x8d/0x2d0 [ 3551.727012] [] ? rtnl_newlink+0x880/0x880 ... [ 3551.727104] [] system_call_fastpath+0x1c/0x21 ... [ 3551.727164] RIP [] bnxt_qplib_nq_stop_irq+0x59/0xb0 [bnxt_re] [ 3551.727175] RSP [ 3551.727177] CR2: 0000000000000000 Avoid this inconsistent state and system crash by acquiring the rtnl lock for the entire duration of device initialization. Re-factor the code to remove the rtnl lock from the individual function and acquire and release it from the caller. Fixes: 1ac5a4047975 ("RDMA/bnxt_re: Add bnxt_re RoCE driver") Fixes: 6e04b1035689 ("RDMA/bnxt_re: Fix broken RoCE driver due to recent L2 driver changes") Signed-off-by: Selvin Xavier Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/bnxt_re/main.c | 93 +++++++++++++++--------------------- 1 file changed, 38 insertions(+), 55 deletions(-) (limited to 'drivers/infiniband/hw') diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c index 20b9f31052bf..85cd1a3593d6 100644 --- a/drivers/infiniband/hw/bnxt_re/main.c +++ b/drivers/infiniband/hw/bnxt_re/main.c @@ -78,7 +78,7 @@ static struct list_head bnxt_re_dev_list = LIST_HEAD_INIT(bnxt_re_dev_list); /* Mutex to protect the list of bnxt_re devices added */ static DEFINE_MUTEX(bnxt_re_dev_lock); static struct workqueue_struct *bnxt_re_wq; -static void bnxt_re_ib_unreg(struct bnxt_re_dev *rdev, bool lock_wait); +static void bnxt_re_ib_unreg(struct bnxt_re_dev *rdev); /* SR-IOV helper functions */ @@ -182,7 +182,7 @@ static void bnxt_re_shutdown(void *p) if (!rdev) return; - bnxt_re_ib_unreg(rdev, false); + bnxt_re_ib_unreg(rdev); } static void bnxt_re_stop_irq(void *handle) @@ -251,7 +251,7 @@ static struct bnxt_ulp_ops bnxt_re_ulp_ops = { /* Driver registration routines used to let the networking driver (bnxt_en) * to know that the RoCE driver is now installed */ -static int bnxt_re_unregister_netdev(struct bnxt_re_dev *rdev, bool lock_wait) +static int bnxt_re_unregister_netdev(struct bnxt_re_dev *rdev) { struct bnxt_en_dev *en_dev; int rc; @@ -260,14 +260,9 @@ static int bnxt_re_unregister_netdev(struct bnxt_re_dev *rdev, bool lock_wait) return -EINVAL; en_dev = rdev->en_dev; - /* Acquire rtnl lock if it is not invokded from netdev event */ - if (lock_wait) - rtnl_lock(); rc = en_dev->en_ops->bnxt_unregister_device(rdev->en_dev, BNXT_ROCE_ULP); - if (lock_wait) - rtnl_unlock(); return rc; } @@ -281,14 +276,12 @@ static int bnxt_re_register_netdev(struct bnxt_re_dev *rdev) en_dev = rdev->en_dev; - rtnl_lock(); rc = en_dev->en_ops->bnxt_register_device(en_dev, BNXT_ROCE_ULP, &bnxt_re_ulp_ops, rdev); - rtnl_unlock(); return rc; } -static int bnxt_re_free_msix(struct bnxt_re_dev *rdev, bool lock_wait) +static int bnxt_re_free_msix(struct bnxt_re_dev *rdev) { struct bnxt_en_dev *en_dev; int rc; @@ -298,13 +291,9 @@ static int bnxt_re_free_msix(struct bnxt_re_dev *rdev, bool lock_wait) en_dev = rdev->en_dev; - if (lock_wait) - rtnl_lock(); rc = en_dev->en_ops->bnxt_free_msix(rdev->en_dev, BNXT_ROCE_ULP); - if (lock_wait) - rtnl_unlock(); return rc; } @@ -320,7 +309,6 @@ static int bnxt_re_request_msix(struct bnxt_re_dev *rdev) num_msix_want = min_t(u32, BNXT_RE_MAX_MSIX, num_online_cpus()); - rtnl_lock(); num_msix_got = en_dev->en_ops->bnxt_request_msix(en_dev, BNXT_ROCE_ULP, rdev->msix_entries, num_msix_want); @@ -335,7 +323,6 @@ static int bnxt_re_request_msix(struct bnxt_re_dev *rdev) } rdev->num_msix = num_msix_got; done: - rtnl_unlock(); return rc; } @@ -358,24 +345,18 @@ static void bnxt_re_fill_fw_msg(struct bnxt_fw_msg *fw_msg, void *msg, fw_msg->timeout = timeout; } -static int bnxt_re_net_ring_free(struct bnxt_re_dev *rdev, u16 fw_ring_id, - bool lock_wait) +static int bnxt_re_net_ring_free(struct bnxt_re_dev *rdev, u16 fw_ring_id) { struct bnxt_en_dev *en_dev = rdev->en_dev; struct hwrm_ring_free_input req = {0}; struct hwrm_ring_free_output resp; struct bnxt_fw_msg fw_msg; - bool do_unlock = false; int rc = -EINVAL; if (!en_dev) return rc; memset(&fw_msg, 0, sizeof(fw_msg)); - if (lock_wait) { - rtnl_lock(); - do_unlock = true; - } bnxt_re_init_hwrm_hdr(rdev, (void *)&req, HWRM_RING_FREE, -1, -1); req.ring_type = RING_ALLOC_REQ_RING_TYPE_L2_CMPL; @@ -386,8 +367,6 @@ static int bnxt_re_net_ring_free(struct bnxt_re_dev *rdev, u16 fw_ring_id, if (rc) dev_err(rdev_to_dev(rdev), "Failed to free HW ring:%d :%#x", req.ring_id, rc); - if (do_unlock) - rtnl_unlock(); return rc; } @@ -405,7 +384,6 @@ static int bnxt_re_net_ring_alloc(struct bnxt_re_dev *rdev, dma_addr_t *dma_arr, return rc; memset(&fw_msg, 0, sizeof(fw_msg)); - rtnl_lock(); bnxt_re_init_hwrm_hdr(rdev, (void *)&req, HWRM_RING_ALLOC, -1, -1); req.enables = 0; req.page_tbl_addr = cpu_to_le64(dma_arr[0]); @@ -426,27 +404,21 @@ static int bnxt_re_net_ring_alloc(struct bnxt_re_dev *rdev, dma_addr_t *dma_arr, if (!rc) *fw_ring_id = le16_to_cpu(resp.ring_id); - rtnl_unlock(); return rc; } static int bnxt_re_net_stats_ctx_free(struct bnxt_re_dev *rdev, - u32 fw_stats_ctx_id, bool lock_wait) + u32 fw_stats_ctx_id) { struct bnxt_en_dev *en_dev = rdev->en_dev; struct hwrm_stat_ctx_free_input req = {0}; struct bnxt_fw_msg fw_msg; - bool do_unlock = false; int rc = -EINVAL; if (!en_dev) return rc; memset(&fw_msg, 0, sizeof(fw_msg)); - if (lock_wait) { - rtnl_lock(); - do_unlock = true; - } bnxt_re_init_hwrm_hdr(rdev, (void *)&req, HWRM_STAT_CTX_FREE, -1, -1); req.stat_ctx_id = cpu_to_le32(fw_stats_ctx_id); @@ -457,8 +429,6 @@ static int bnxt_re_net_stats_ctx_free(struct bnxt_re_dev *rdev, dev_err(rdev_to_dev(rdev), "Failed to free HW stats context %#x", rc); - if (do_unlock) - rtnl_unlock(); return rc; } @@ -478,7 +448,6 @@ static int bnxt_re_net_stats_ctx_alloc(struct bnxt_re_dev *rdev, return rc; memset(&fw_msg, 0, sizeof(fw_msg)); - rtnl_lock(); bnxt_re_init_hwrm_hdr(rdev, (void *)&req, HWRM_STAT_CTX_ALLOC, -1, -1); req.update_period_ms = cpu_to_le32(1000); @@ -490,7 +459,6 @@ static int bnxt_re_net_stats_ctx_alloc(struct bnxt_re_dev *rdev, if (!rc) *fw_stats_ctx_id = le32_to_cpu(resp.stat_ctx_id); - rtnl_unlock(); return rc; } @@ -929,19 +897,19 @@ fail: return rc; } -static void bnxt_re_free_nq_res(struct bnxt_re_dev *rdev, bool lock_wait) +static void bnxt_re_free_nq_res(struct bnxt_re_dev *rdev) { int i; for (i = 0; i < rdev->num_msix - 1; i++) { - bnxt_re_net_ring_free(rdev, rdev->nq[i].ring_id, lock_wait); + bnxt_re_net_ring_free(rdev, rdev->nq[i].ring_id); bnxt_qplib_free_nq(&rdev->nq[i]); } } -static void bnxt_re_free_res(struct bnxt_re_dev *rdev, bool lock_wait) +static void bnxt_re_free_res(struct bnxt_re_dev *rdev) { - bnxt_re_free_nq_res(rdev, lock_wait); + bnxt_re_free_nq_res(rdev); if (rdev->qplib_res.dpi_tbl.max) { bnxt_qplib_dealloc_dpi(&rdev->qplib_res, @@ -1219,7 +1187,7 @@ static int bnxt_re_setup_qos(struct bnxt_re_dev *rdev) return 0; } -static void bnxt_re_ib_unreg(struct bnxt_re_dev *rdev, bool lock_wait) +static void bnxt_re_ib_unreg(struct bnxt_re_dev *rdev) { int i, rc; @@ -1234,28 +1202,27 @@ static void bnxt_re_ib_unreg(struct bnxt_re_dev *rdev, bool lock_wait) cancel_delayed_work(&rdev->worker); bnxt_re_cleanup_res(rdev); - bnxt_re_free_res(rdev, lock_wait); + bnxt_re_free_res(rdev); if (test_and_clear_bit(BNXT_RE_FLAG_RCFW_CHANNEL_EN, &rdev->flags)) { rc = bnxt_qplib_deinit_rcfw(&rdev->rcfw); if (rc) dev_warn(rdev_to_dev(rdev), "Failed to deinitialize RCFW: %#x", rc); - bnxt_re_net_stats_ctx_free(rdev, rdev->qplib_ctx.stats.fw_id, - lock_wait); + bnxt_re_net_stats_ctx_free(rdev, rdev->qplib_ctx.stats.fw_id); bnxt_qplib_free_ctx(rdev->en_dev->pdev, &rdev->qplib_ctx); bnxt_qplib_disable_rcfw_channel(&rdev->rcfw); - bnxt_re_net_ring_free(rdev, rdev->rcfw.creq_ring_id, lock_wait); + bnxt_re_net_ring_free(rdev, rdev->rcfw.creq_ring_id); bnxt_qplib_free_rcfw_channel(&rdev->rcfw); } if (test_and_clear_bit(BNXT_RE_FLAG_GOT_MSIX, &rdev->flags)) { - rc = bnxt_re_free_msix(rdev, lock_wait); + rc = bnxt_re_free_msix(rdev); if (rc) dev_warn(rdev_to_dev(rdev), "Failed to free MSI-X vectors: %#x", rc); } if (test_and_clear_bit(BNXT_RE_FLAG_NETDEV_REGISTERED, &rdev->flags)) { - rc = bnxt_re_unregister_netdev(rdev, lock_wait); + rc = bnxt_re_unregister_netdev(rdev); if (rc) dev_warn(rdev_to_dev(rdev), "Failed to unregister with netdev: %#x", rc); @@ -1276,6 +1243,12 @@ static int bnxt_re_ib_reg(struct bnxt_re_dev *rdev) { int i, j, rc; + bool locked; + + /* Acquire rtnl lock through out this function */ + rtnl_lock(); + locked = true; + /* Registered a new RoCE device instance to netdev */ rc = bnxt_re_register_netdev(rdev); if (rc) { @@ -1374,12 +1347,16 @@ static int bnxt_re_ib_reg(struct bnxt_re_dev *rdev) schedule_delayed_work(&rdev->worker, msecs_to_jiffies(30000)); } + rtnl_unlock(); + locked = false; + /* Register ib dev */ rc = bnxt_re_register_ib(rdev); if (rc) { pr_err("Failed to register with IB: %#x\n", rc); goto fail; } + set_bit(BNXT_RE_FLAG_IBDEV_REGISTERED, &rdev->flags); dev_info(rdev_to_dev(rdev), "Device registered successfully"); for (i = 0; i < ARRAY_SIZE(bnxt_re_attributes); i++) { rc = device_create_file(&rdev->ibdev.dev, @@ -1395,7 +1372,6 @@ static int bnxt_re_ib_reg(struct bnxt_re_dev *rdev) goto fail; } } - set_bit(BNXT_RE_FLAG_IBDEV_REGISTERED, &rdev->flags); ib_get_eth_speed(&rdev->ibdev, 1, &rdev->active_speed, &rdev->active_width); set_bit(BNXT_RE_FLAG_ISSUE_ROCE_STATS, &rdev->flags); @@ -1404,17 +1380,21 @@ static int bnxt_re_ib_reg(struct bnxt_re_dev *rdev) return 0; free_sctx: - bnxt_re_net_stats_ctx_free(rdev, rdev->qplib_ctx.stats.fw_id, true); + bnxt_re_net_stats_ctx_free(rdev, rdev->qplib_ctx.stats.fw_id); free_ctx: bnxt_qplib_free_ctx(rdev->en_dev->pdev, &rdev->qplib_ctx); disable_rcfw: bnxt_qplib_disable_rcfw_channel(&rdev->rcfw); free_ring: - bnxt_re_net_ring_free(rdev, rdev->rcfw.creq_ring_id, true); + bnxt_re_net_ring_free(rdev, rdev->rcfw.creq_ring_id); free_rcfw: bnxt_qplib_free_rcfw_channel(&rdev->rcfw); fail: - bnxt_re_ib_unreg(rdev, true); + if (!locked) + rtnl_lock(); + bnxt_re_ib_unreg(rdev); + rtnl_unlock(); + return rc; } @@ -1567,7 +1547,7 @@ static int bnxt_re_netdev_event(struct notifier_block *notifier, */ if (atomic_read(&rdev->sched_count) > 0) goto exit; - bnxt_re_ib_unreg(rdev, false); + bnxt_re_ib_unreg(rdev); bnxt_re_remove_one(rdev); bnxt_re_dev_unreg(rdev); break; @@ -1646,7 +1626,10 @@ static void __exit bnxt_re_mod_exit(void) */ flush_workqueue(bnxt_re_wq); bnxt_re_dev_stop(rdev); - bnxt_re_ib_unreg(rdev, true); + /* Acquire the rtnl_lock as the L2 resources are freed here */ + rtnl_lock(); + bnxt_re_ib_unreg(rdev); + rtnl_unlock(); bnxt_re_remove_one(rdev); bnxt_re_dev_unreg(rdev); } -- cgit v1.2.3-59-g8ed1b From e8ef090a614292db01b5956a6f5467afbe6c5cf7 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Tue, 25 Sep 2018 12:11:12 +0300 Subject: IB/mlx5: Destroy the DEVX object upon error flow Upon DEVX object creation the object must be destroyed upon a follows error flow. Fixes: 7efce3691d33 ("IB/mlx5: Add obj create and destroy functionality") Signed-off-by: Yishai Hadas Reviewed-by: Artemy Kovalyov Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/devx.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'drivers/infiniband/hw') diff --git a/drivers/infiniband/hw/mlx5/devx.c b/drivers/infiniband/hw/mlx5/devx.c index ac116d63e466..f2f11e652dcd 100644 --- a/drivers/infiniband/hw/mlx5/devx.c +++ b/drivers/infiniband/hw/mlx5/devx.c @@ -723,6 +723,7 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_OBJ_CREATE)( attrs, MLX5_IB_ATTR_DEVX_OBJ_CREATE_HANDLE); struct mlx5_ib_ucontext *c = to_mucontext(uobj->context); struct mlx5_ib_dev *dev = to_mdev(c->ibucontext.device); + u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)]; struct devx_obj *obj; int err; @@ -754,10 +755,12 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_OBJ_CREATE)( err = uverbs_copy_to(attrs, MLX5_IB_ATTR_DEVX_OBJ_CREATE_CMD_OUT, cmd_out, cmd_out_len); if (err) - goto obj_free; + goto obj_destroy; return 0; +obj_destroy: + mlx5_cmd_exec(obj->mdev, obj->dinbox, obj->dinlen, out, sizeof(out)); obj_free: kfree(obj); return err; -- cgit v1.2.3-59-g8ed1b From dd9a403495704fc80fb9f399003013ef2be2ee23 Mon Sep 17 00:00:00 2001 From: Valentine Fatiev Date: Wed, 10 Oct 2018 09:56:25 +0300 Subject: IB/mlx5: Unmap DMA addr from HCA before IOMMU The function that puts back the MR in cache also removes the DMA address from the HCA. Therefore we need to call this function before we remove the DMA mapping from MMU. Otherwise the HCA may access a memory that is no longer DMA mapped. Call trace: NMI: IOCK error (debug interrupt?) for reason 71 on CPU 0. CPU: 0 PID: 0 Comm: swapper/0 Not tainted 4.19.0-rc6+ #4 Hardware name: HP ProLiant DL360p Gen8, BIOS P71 08/20/2012 RIP: 0010:intel_idle+0x73/0x120 Code: 80 5c 01 00 0f ae 38 0f ae f0 31 d2 65 48 8b 04 25 80 5c 01 00 48 89 d1 0f 60 02 RSP: 0018:ffffffff9a403e38 EFLAGS: 00000046 RAX: 0000000000000030 RBX: 0000000000000005 RCX: 0000000000000001 RDX: 0000000000000000 RSI: ffffffff9a5790c0 RDI: 0000000000000000 RBP: 0000000000000030 R08: 0000000000000000 R09: 0000000000007cf9 R10: 000000000000030a R11: 0000000000000018 R12: 0000000000000000 R13: ffffffff9a5792b8 R14: ffffffff9a5790c0 R15: 0000002b48471e4d FS: 0000000000000000(0000) GS:ffff9c6caf400000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f5737185000 CR3: 0000000590c0a002 CR4: 00000000000606f0 Call Trace: cpuidle_enter_state+0x7e/0x2e0 do_idle+0x1ed/0x290 cpu_startup_entry+0x6f/0x80 start_kernel+0x524/0x544 ? set_init_arg+0x55/0x55 secondary_startup_64+0xa4/0xb0 DMAR: DRHD: handling fault status reg 2 DMAR: [DMA Read] Request device [04:00.0] fault addr b34d2000 [fault reason 06] PTE Read access is not set DMAR: [DMA Read] Request device [01:00.2] fault addr bff8b000 [fault reason 06] PTE Read access is not set Fixes: f3f134f5260a ("RDMA/mlx5: Fix crash while accessing garbage pointer and freed memory") Signed-off-by: Valentine Fatiev Reviewed-by: Moni Shoua Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/hw/mlx5/mr.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'drivers/infiniband/hw') diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index 9fb1d9cb9401..e22314837645 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -544,6 +544,9 @@ void mlx5_mr_cache_free(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) int shrink = 0; int c; + if (!mr->allocated_from_cache) + return; + c = order2idx(dev, mr->order); if (c < 0 || c >= MAX_MR_CACHE_ENTRIES) { mlx5_ib_warn(dev, "order %d, cache index %d\n", mr->order, c); @@ -1647,18 +1650,19 @@ static void dereg_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) umem = NULL; } #endif - clean_mr(dev, mr); + /* + * We should unregister the DMA address from the HCA before + * remove the DMA mapping. + */ + mlx5_mr_cache_free(dev, mr); if (umem) { ib_umem_release(umem); atomic_sub(npages, &dev->mdev->priv.reg_pages); } - if (!mr->allocated_from_cache) kfree(mr); - else - mlx5_mr_cache_free(dev, mr); } int mlx5_ib_dereg_mr(struct ib_mr *ibmr) -- cgit v1.2.3-59-g8ed1b