aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2019-03-09 15:53:03 -0800
committerLinus Torvalds <torvalds@linux-foundation.org>2019-03-09 15:53:03 -0800
commita50243b1ddcdd766d0d17fbfeeb1a22e62fdc461 (patch)
tree3dbf847105558eaac3658a46c4934df503c866a2
parentMerge tag 'pci-v5.1-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/helgaas/pci (diff)
parentnet/mlx5: ODP support for XRC transport is not enabled by default in FW (diff)
downloadlinux-dev-a50243b1ddcdd766d0d17fbfeeb1a22e62fdc461.tar.xz
linux-dev-a50243b1ddcdd766d0d17fbfeeb1a22e62fdc461.zip
Merge tag 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma
Pull rdma updates from Jason Gunthorpe: "This has been a slightly more active cycle than normal with ongoing core changes and quite a lot of collected driver updates. - Various driver fixes for bnxt_re, cxgb4, hns, mlx5, pvrdma, rxe - A new data transfer mode for HFI1 giving higher performance - Significant functional and bug fix update to the mlx5 On-Demand-Paging MR feature - A chip hang reset recovery system for hns - Change mm->pinned_vm to an atomic64 - Update bnxt_re to support a new 57500 chip - A sane netlink 'rdma link add' method for creating rxe devices and fixing the various unregistration race conditions in rxe's unregister flow - Allow lookup up objects by an ID over netlink - Various reworking of the core to driver interface: - drivers should not assume umem SGLs are in PAGE_SIZE chunks - ucontext is accessed via udata not other means - start to make the core code responsible for object memory allocation - drivers should convert struct device to struct ib_device via a helper - drivers have more tools to avoid use after unregister problems" * tag 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma: (280 commits) net/mlx5: ODP support for XRC transport is not enabled by default in FW IB/hfi1: Close race condition on user context disable and close RDMA/umem: Revert broken 'off by one' fix RDMA/umem: minor bug fix in error handling path RDMA/hns: Use GFP_ATOMIC in hns_roce_v2_modify_qp cxgb4: kfree mhp after the debug print IB/rdmavt: Fix concurrency panics in QP post_send and modify to error IB/rdmavt: Fix loopback send with invalidate ordering IB/iser: Fix dma_nents type definition IB/mlx5: Set correct write permissions for implicit ODP MR bnxt_re: Clean cq for kernel consumers only RDMA/uverbs: Don't do double free of allocated PD RDMA: Handle ucontext allocations by IB/core RDMA/core: Fix a WARN() message bnxt_re: fix the regression due to changes in alloc_pbl IB/mlx4: Increase the timeout for CM cache IB/core: Abort page fault handler silently during owning process exit IB/mlx5: Validate correct PD before prefetch MR IB/mlx5: Protect against prefetch of invalid MR RDMA/uverbs: Store PR pointer before it is overwritten ...
-rw-r--r--.clang-format2
-rw-r--r--Documentation/infiniband/user_verbs.txt4
-rw-r--r--drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c8
-rw-r--r--drivers/infiniband/Kconfig15
-rw-r--r--drivers/infiniband/core/Makefile4
-rw-r--r--drivers/infiniband/core/cache.c118
-rw-r--r--drivers/infiniband/core/cgroup.c5
-rw-r--r--drivers/infiniband/core/cm.c3
-rw-r--r--drivers/infiniband/core/cma.c139
-rw-r--r--drivers/infiniband/core/cma_priv.h4
-rw-r--r--drivers/infiniband/core/core_priv.h35
-rw-r--r--drivers/infiniband/core/device.c1323
-rw-r--r--drivers/infiniband/core/iwcm.c13
-rw-r--r--drivers/infiniband/core/iwpm_msg.c232
-rw-r--r--drivers/infiniband/core/iwpm_util.c86
-rw-r--r--drivers/infiniband/core/iwpm_util.h12
-rw-r--r--drivers/infiniband/core/mad.c4
-rw-r--r--drivers/infiniband/core/netlink.c4
-rw-r--r--drivers/infiniband/core/nldev.c492
-rw-r--r--drivers/infiniband/core/rdma_core.c42
-rw-r--r--drivers/infiniband/core/restrack.c210
-rw-r--r--drivers/infiniband/core/restrack.h28
-rw-r--r--drivers/infiniband/core/rw.c12
-rw-r--r--drivers/infiniband/core/sa_query.c4
-rw-r--r--drivers/infiniband/core/security.c96
-rw-r--r--drivers/infiniband/core/sysfs.c93
-rw-r--r--drivers/infiniband/core/ucma.c7
-rw-r--r--drivers/infiniband/core/umem.c60
-rw-r--r--drivers/infiniband/core/umem_odp.c21
-rw-r--r--drivers/infiniband/core/user_mad.c52
-rw-r--r--drivers/infiniband/core/uverbs_cmd.c69
-rw-r--r--drivers/infiniband/core/uverbs_ioctl.c3
-rw-r--r--drivers/infiniband/core/uverbs_main.c2
-rw-r--r--drivers/infiniband/core/uverbs_std_types.c2
-rw-r--r--drivers/infiniband/core/uverbs_uapi.c15
-rw-r--r--drivers/infiniband/core/verbs.c73
-rw-r--r--drivers/infiniband/hw/bnxt_re/Kconfig1
-rw-r--r--drivers/infiniband/hw/bnxt_re/Makefile2
-rw-r--r--drivers/infiniband/hw/bnxt_re/bnxt_re.h1
-rw-r--r--drivers/infiniband/hw/bnxt_re/ib_verbs.c268
-rw-r--r--drivers/infiniband/hw/bnxt_re/ib_verbs.h16
-rw-r--r--drivers/infiniband/hw/bnxt_re/main.c134
-rw-r--r--drivers/infiniband/hw/bnxt_re/qplib_fp.c193
-rw-r--r--drivers/infiniband/hw/bnxt_re/qplib_fp.h47
-rw-r--r--drivers/infiniband/hw/bnxt_re/qplib_rcfw.c40
-rw-r--r--drivers/infiniband/hw/bnxt_re/qplib_rcfw.h45
-rw-r--r--drivers/infiniband/hw/bnxt_re/qplib_res.c22
-rw-r--r--drivers/infiniband/hw/bnxt_re/qplib_res.h30
-rw-r--r--drivers/infiniband/hw/bnxt_re/qplib_sp.c3
-rw-r--r--drivers/infiniband/hw/bnxt_re/roce_hsi.h160
-rw-r--r--drivers/infiniband/hw/cxgb3/Makefile2
-rw-r--r--drivers/infiniband/hw/cxgb3/iwch.c2
-rw-r--r--drivers/infiniband/hw/cxgb3/iwch_provider.c95
-rw-r--r--drivers/infiniband/hw/cxgb4/Makefile4
-rw-r--r--drivers/infiniband/hw/cxgb4/cm.c199
-rw-r--r--drivers/infiniband/hw/cxgb4/device.c10
-rw-r--r--drivers/infiniband/hw/cxgb4/iw_cxgb4.h16
-rw-r--r--drivers/infiniband/hw/cxgb4/mem.c36
-rw-r--r--drivers/infiniband/hw/cxgb4/provider.c85
-rw-r--r--drivers/infiniband/hw/cxgb4/qp.c33
-rw-r--r--drivers/infiniband/hw/cxgb4/t4.h1
-rw-r--r--drivers/infiniband/hw/hfi1/Makefile1
-rw-r--r--drivers/infiniband/hw/hfi1/chip.c13
-rw-r--r--drivers/infiniband/hw/hfi1/chip.h4
-rw-r--r--drivers/infiniband/hw/hfi1/common.h4
-rw-r--r--drivers/infiniband/hw/hfi1/debugfs.c58
-rw-r--r--drivers/infiniband/hw/hfi1/debugfs.h12
-rw-r--r--drivers/infiniband/hw/hfi1/driver.c58
-rw-r--r--drivers/infiniband/hw/hfi1/fault.c53
-rw-r--r--drivers/infiniband/hw/hfi1/hfi.h24
-rw-r--r--drivers/infiniband/hw/hfi1/init.c35
-rw-r--r--drivers/infiniband/hw/hfi1/iowait.c34
-rw-r--r--drivers/infiniband/hw/hfi1/iowait.h99
-rw-r--r--drivers/infiniband/hw/hfi1/opfn.c323
-rw-r--r--drivers/infiniband/hw/hfi1/opfn.h85
-rw-r--r--drivers/infiniband/hw/hfi1/pio.c18
-rw-r--r--drivers/infiniband/hw/hfi1/qp.c76
-rw-r--r--drivers/infiniband/hw/hfi1/qp.h7
-rw-r--r--drivers/infiniband/hw/hfi1/rc.c1141
-rw-r--r--drivers/infiniband/hw/hfi1/rc.h51
-rw-r--r--drivers/infiniband/hw/hfi1/ruc.c48
-rw-r--r--drivers/infiniband/hw/hfi1/sdma.c24
-rw-r--r--drivers/infiniband/hw/hfi1/sdma_txreq.h1
-rw-r--r--drivers/infiniband/hw/hfi1/sysfs.c16
-rw-r--r--drivers/infiniband/hw/hfi1/tid_rdma.c5418
-rw-r--r--drivers/infiniband/hw/hfi1/tid_rdma.h311
-rw-r--r--drivers/infiniband/hw/hfi1/trace.c118
-rw-r--r--drivers/infiniband/hw/hfi1/trace.h1
-rw-r--r--drivers/infiniband/hw/hfi1/trace_ibhdrs.h8
-rw-r--r--drivers/infiniband/hw/hfi1/trace_rc.h48
-rw-r--r--drivers/infiniband/hw/hfi1/trace_rx.h107
-rw-r--r--drivers/infiniband/hw/hfi1/trace_tid.h1610
-rw-r--r--drivers/infiniband/hw/hfi1/trace_tx.h18
-rw-r--r--drivers/infiniband/hw/hfi1/uc.c3
-rw-r--r--drivers/infiniband/hw/hfi1/ud.c24
-rw-r--r--drivers/infiniband/hw/hfi1/user_exp_rcv.h1
-rw-r--r--drivers/infiniband/hw/hfi1/user_pages.c12
-rw-r--r--drivers/infiniband/hw/hfi1/user_sdma.c9
-rw-r--r--drivers/infiniband/hw/hfi1/verbs.c210
-rw-r--r--drivers/infiniband/hw/hfi1/verbs.h104
-rw-r--r--drivers/infiniband/hw/hfi1/verbs_txreq.h1
-rw-r--r--drivers/infiniband/hw/hfi1/vnic_sdma.c6
-rw-r--r--drivers/infiniband/hw/hns/Kconfig1
-rw-r--r--drivers/infiniband/hw/hns/Makefile2
-rw-r--r--drivers/infiniband/hw/hns/hns_roce_cmd.c32
-rw-r--r--drivers/infiniband/hw/hns/hns_roce_cmd.h12
-rw-r--r--drivers/infiniband/hw/hns/hns_roce_cq.c9
-rw-r--r--drivers/infiniband/hw/hns/hns_roce_db.c6
-rw-r--r--drivers/infiniband/hw/hns/hns_roce_device.h63
-rw-r--r--drivers/infiniband/hw/hns/hns_roce_hem.c68
-rw-r--r--drivers/infiniband/hw/hns/hns_roce_hem.h3
-rw-r--r--drivers/infiniband/hw/hns/hns_roce_hw_v1.c36
-rw-r--r--drivers/infiniband/hw/hns/hns_roce_hw_v2.c596
-rw-r--r--drivers/infiniband/hw/hns/hns_roce_hw_v2.h92
-rw-r--r--drivers/infiniband/hw/hns/hns_roce_main.c88
-rw-r--r--drivers/infiniband/hw/hns/hns_roce_mr.c95
-rw-r--r--drivers/infiniband/hw/hns/hns_roce_pd.c25
-rw-r--r--drivers/infiniband/hw/hns/hns_roce_qp.c92
-rw-r--r--drivers/infiniband/hw/hns/hns_roce_srq.c16
-rw-r--r--drivers/infiniband/hw/i40iw/Makefile2
-rw-r--r--drivers/infiniband/hw/i40iw/i40iw_utils.c1
-rw-r--r--drivers/infiniband/hw/i40iw/i40iw_verbs.c137
-rw-r--r--drivers/infiniband/hw/mlx4/Kconfig1
-rw-r--r--drivers/infiniband/hw/mlx4/cm.c2
-rw-r--r--drivers/infiniband/hw/mlx4/cq.c19
-rw-r--r--drivers/infiniband/hw/mlx4/doorbell.c6
-rw-r--r--drivers/infiniband/hw/mlx4/main.c77
-rw-r--r--drivers/infiniband/hw/mlx4/mlx4_ib.h3
-rw-r--r--drivers/infiniband/hw/mlx4/mr.c13
-rw-r--r--drivers/infiniband/hw/mlx4/qp.c84
-rw-r--r--drivers/infiniband/hw/mlx4/srq.c12
-rw-r--r--drivers/infiniband/hw/mlx5/Kconfig1
-rw-r--r--drivers/infiniband/hw/mlx5/cong.c15
-rw-r--r--drivers/infiniband/hw/mlx5/cq.c15
-rw-r--r--drivers/infiniband/hw/mlx5/devx.c463
-rw-r--r--drivers/infiniband/hw/mlx5/doorbell.c6
-rw-r--r--drivers/infiniband/hw/mlx5/ib_rep.c6
-rw-r--r--drivers/infiniband/hw/mlx5/main.c249
-rw-r--r--drivers/infiniband/hw/mlx5/mem.c5
-rw-r--r--drivers/infiniband/hw/mlx5/mlx5_ib.h41
-rw-r--r--drivers/infiniband/hw/mlx5/mr.c126
-rw-r--r--drivers/infiniband/hw/mlx5/odp.c316
-rw-r--r--drivers/infiniband/hw/mlx5/qp.c308
-rw-r--r--drivers/infiniband/hw/mlx5/srq.c11
-rw-r--r--drivers/infiniband/hw/mlx5/srq.h2
-rw-r--r--drivers/infiniband/hw/mlx5/srq_cmd.c16
-rw-r--r--drivers/infiniband/hw/mthca/mthca_main.c2
-rw-r--r--drivers/infiniband/hw/mthca/mthca_provider.c139
-rw-r--r--drivers/infiniband/hw/mthca/mthca_qp.c13
-rw-r--r--drivers/infiniband/hw/mthca/mthca_srq.c21
-rw-r--r--drivers/infiniband/hw/nes/Kconfig2
-rw-r--r--drivers/infiniband/hw/nes/nes_verbs.c313
-rw-r--r--drivers/infiniband/hw/nes/nes_verbs.h1
-rw-r--r--drivers/infiniband/hw/ocrdma/Makefile2
-rw-r--r--drivers/infiniband/hw/ocrdma/ocrdma_main.c12
-rw-r--r--drivers/infiniband/hw/ocrdma/ocrdma_stats.c67
-rw-r--r--drivers/infiniband/hw/ocrdma/ocrdma_verbs.c189
-rw-r--r--drivers/infiniband/hw/ocrdma/ocrdma_verbs.h11
-rw-r--r--drivers/infiniband/hw/qedr/main.c9
-rw-r--r--drivers/infiniband/hw/qedr/qedr_iw_cm.c2
-rw-r--r--drivers/infiniband/hw/qedr/verbs.c192
-rw-r--r--drivers/infiniband/hw/qedr/verbs.h10
-rw-r--r--drivers/infiniband/hw/qib/qib_debugfs.c27
-rw-r--r--drivers/infiniband/hw/qib/qib_rc.c7
-rw-r--r--drivers/infiniband/hw/qib/qib_sdma.c26
-rw-r--r--drivers/infiniband/hw/qib/qib_sysfs.c18
-rw-r--r--drivers/infiniband/hw/qib/qib_ud.c6
-rw-r--r--drivers/infiniband/hw/qib/qib_user_pages.c75
-rw-r--r--drivers/infiniband/hw/qib/qib_verbs.c20
-rw-r--r--drivers/infiniband/hw/usnic/Makefile2
-rw-r--r--drivers/infiniband/hw/usnic/usnic_debugfs.c26
-rw-r--r--drivers/infiniband/hw/usnic/usnic_ib_main.c57
-rw-r--r--drivers/infiniband/hw/usnic/usnic_ib_sysfs.c26
-rw-r--r--drivers/infiniband/hw/usnic/usnic_ib_verbs.c114
-rw-r--r--drivers/infiniband/hw/usnic/usnic_ib_verbs.h28
-rw-r--r--drivers/infiniband/hw/usnic/usnic_uiom.c65
-rw-r--r--drivers/infiniband/hw/usnic/usnic_uiom.h1
-rw-r--r--drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c2
-rw-r--r--drivers/infiniband/hw/vmw_pvrdma/pvrdma_dev_api.h15
-rw-r--r--drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c12
-rw-r--r--drivers/infiniband/hw/vmw_pvrdma/pvrdma_misc.c21
-rw-r--r--drivers/infiniband/hw/vmw_pvrdma/pvrdma_mr.c3
-rw-r--r--drivers/infiniband/hw/vmw_pvrdma/pvrdma_qp.c6
-rw-r--r--drivers/infiniband/hw/vmw_pvrdma/pvrdma_srq.c4
-rw-r--r--drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.c98
-rw-r--r--drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.h12
-rw-r--r--drivers/infiniband/sw/rdmavt/mr.c21
-rw-r--r--drivers/infiniband/sw/rdmavt/pd.c29
-rw-r--r--drivers/infiniband/sw/rdmavt/pd.h7
-rw-r--r--drivers/infiniband/sw/rdmavt/qp.c104
-rw-r--r--drivers/infiniband/sw/rdmavt/rc.c13
-rw-r--r--drivers/infiniband/sw/rdmavt/srq.c5
-rw-r--r--drivers/infiniband/sw/rdmavt/trace_cq.h10
-rw-r--r--drivers/infiniband/sw/rdmavt/vt.c34
-rw-r--r--drivers/infiniband/sw/rxe/rxe.c67
-rw-r--r--drivers/infiniband/sw/rxe/rxe.h16
-rw-r--r--drivers/infiniband/sw/rxe/rxe_av.c7
-rw-r--r--drivers/infiniband/sw/rxe/rxe_comp.c6
-rw-r--r--drivers/infiniband/sw/rxe/rxe_loc.h9
-rw-r--r--drivers/infiniband/sw/rxe/rxe_mr.c15
-rw-r--r--drivers/infiniband/sw/rxe/rxe_net.c97
-rw-r--r--drivers/infiniband/sw/rxe/rxe_net.h2
-rw-r--r--drivers/infiniband/sw/rxe/rxe_param.h3
-rw-r--r--drivers/infiniband/sw/rxe/rxe_pool.c77
-rw-r--r--drivers/infiniband/sw/rxe/rxe_pool.h4
-rw-r--r--drivers/infiniband/sw/rxe/rxe_qp.c15
-rw-r--r--drivers/infiniband/sw/rxe/rxe_recv.c12
-rw-r--r--drivers/infiniband/sw/rxe/rxe_resp.c3
-rw-r--r--drivers/infiniband/sw/rxe/rxe_sysfs.c40
-rw-r--r--drivers/infiniband/sw/rxe/rxe_verbs.c103
-rw-r--r--drivers/infiniband/sw/rxe/rxe_verbs.h9
-rw-r--r--drivers/infiniband/ulp/ipoib/ipoib.h4
-rw-r--r--drivers/infiniband/ulp/ipoib/ipoib_fs.c7
-rw-r--r--drivers/infiniband/ulp/ipoib/ipoib_main.c14
-rw-r--r--drivers/infiniband/ulp/iser/iscsi_iser.h2
-rw-r--r--drivers/infiniband/ulp/iser/iser_memory.c19
-rw-r--r--drivers/infiniband/ulp/isert/Makefile1
-rw-r--r--drivers/infiniband/ulp/srp/ib_srp.c26
-rw-r--r--drivers/infiniband/ulp/srpt/Makefile1
-rw-r--r--drivers/media/pci/intel/ipu3/ipu3-cio2.c4
-rw-r--r--drivers/misc/mic/scif/scif_rma.c38
-rw-r--r--drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c3
-rw-r--r--drivers/net/ethernet/chelsio/cxgb4/t4_msg.h8
-rw-r--r--drivers/net/ethernet/chelsio/cxgb4/t4_tcb.h12
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/main.c85
-rw-r--r--fs/proc/task_mmu.c2
-rw-r--r--include/linux/cgroup_rdma.h2
-rw-r--r--include/linux/mlx5/driver.h5
-rw-r--r--include/linux/mm_types.h2
-rw-r--r--include/linux/scatterlist.h49
-rw-r--r--include/rdma/ib_hdrs.h14
-rw-r--r--include/rdma/ib_mad.h5
-rw-r--r--include/rdma/ib_umem.h8
-rw-r--r--include/rdma/ib_umem_odp.h34
-rw-r--r--include/rdma/ib_verbs.h274
-rw-r--r--include/rdma/iw_cm.h16
-rw-r--r--include/rdma/iw_portmap.h144
-rw-r--r--include/rdma/rdma_cm.h1
-rw-r--r--include/rdma/rdma_netlink.h11
-rw-r--r--include/rdma/rdma_vt.h30
-rw-r--r--include/rdma/rdmavt_qp.h20
-rw-r--r--include/rdma/restrack.h58
-rw-r--r--include/rdma/tid_rdma_defs.h108
-rw-r--r--include/rdma/uverbs_ioctl.h18
-rw-r--r--include/rdma/uverbs_std_types.h18
-rw-r--r--include/rdma/uverbs_types.h1
-rw-r--r--include/uapi/rdma/bnxt_re-abi.h11
-rw-r--r--include/uapi/rdma/ib_user_verbs.h2
-rw-r--r--include/uapi/rdma/mlx5_user_ioctl_cmds.h18
-rw-r--r--include/uapi/rdma/mlx5_user_ioctl_verbs.h5
-rw-r--r--include/uapi/rdma/rdma_netlink.h74
-rw-r--r--include/uapi/rdma/rdma_user_cm.h4
-rw-r--r--include/uapi/rdma/rdma_user_rxe.h3
-rw-r--r--kernel/cgroup/rdma.c5
-rw-r--r--kernel/events/core.c8
-rw-r--r--kernel/fork.c2
-rw-r--r--lib/irq_poll.c2
-rw-r--r--lib/scatterlist.c26
-rw-r--r--mm/debug.c5
-rw-r--r--net/rds/ib.h12
-rw-r--r--net/rds/ib_fmr.c8
-rw-r--r--net/rds/ib_frmr.c4
-rw-r--r--net/rds/ib_recv.c8
-rw-r--r--net/rds/ib_send.c15
264 files changed, 16710 insertions, 5014 deletions
diff --git a/.clang-format b/.clang-format
index bc2ffb2a0b53..201a4f531b90 100644
--- a/.clang-format
+++ b/.clang-format
@@ -240,6 +240,7 @@ ForEachMacros:
- 'for_each_set_bit'
- 'for_each_set_bit_from'
- 'for_each_sg'
+ - 'for_each_sg_dma_page'
- 'for_each_sg_page'
- 'for_each_sibling_event'
- '__for_each_thread'
@@ -360,6 +361,7 @@ ForEachMacros:
- 'radix_tree_for_each_slot'
- 'radix_tree_for_each_tagged'
- 'rbtree_postorder_for_each_entry_safe'
+ - 'rdma_for_each_port'
- 'resource_list_for_each_entry'
- 'resource_list_for_each_entry_safe'
- 'rhl_for_each_entry_rcu'
diff --git a/Documentation/infiniband/user_verbs.txt b/Documentation/infiniband/user_verbs.txt
index df049b9f5b6e..47ebf2f80b2b 100644
--- a/Documentation/infiniband/user_verbs.txt
+++ b/Documentation/infiniband/user_verbs.txt
@@ -46,11 +46,11 @@ Memory pinning
I/O targets be kept resident at the same physical address. The
ib_uverbs module manages pinning and unpinning memory regions via
get_user_pages() and put_page() calls. It also accounts for the
- amount of memory pinned in the process's locked_vm, and checks that
+ amount of memory pinned in the process's pinned_vm, and checks that
unprivileged processes do not exceed their RLIMIT_MEMLOCK limit.
Pages that are pinned multiple times are counted each time they are
- pinned, so the value of locked_vm may be an overestimate of the
+ pinned, so the value of pinned_vm may be an overestimate of the
number of pages pinned by a process.
/dev files
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c b/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c
index 31786b200afc..a3357ff7540d 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c
@@ -311,7 +311,13 @@ static dma_addr_t __vmw_piter_dma_addr(struct vmw_piter *viter)
static dma_addr_t __vmw_piter_sg_addr(struct vmw_piter *viter)
{
- return sg_page_iter_dma_address(&viter->iter);
+ /*
+ * FIXME: This driver wrongly mixes DMA and CPU SG list iteration and
+ * needs revision. See
+ * https://lore.kernel.org/lkml/20190104223531.GA1705@ziepe.ca/
+ */
+ return sg_page_iter_dma_address(
+ container_of(&viter->iter, struct sg_dma_page_iter, base));
}
diff --git a/drivers/infiniband/Kconfig b/drivers/infiniband/Kconfig
index 0a3ec7c726ec..a1fb840de45d 100644
--- a/drivers/infiniband/Kconfig
+++ b/drivers/infiniband/Kconfig
@@ -89,6 +89,7 @@ config INFINIBAND_ADDR_TRANS_CONFIGFS
This allows the user to config the default GID type that the CM
uses for each device, when initiaing new connections.
+if INFINIBAND_USER_ACCESS || !INFINIBAND_USER_ACCESS
source "drivers/infiniband/hw/mthca/Kconfig"
source "drivers/infiniband/hw/qib/Kconfig"
source "drivers/infiniband/hw/cxgb3/Kconfig"
@@ -101,6 +102,12 @@ source "drivers/infiniband/hw/ocrdma/Kconfig"
source "drivers/infiniband/hw/vmw_pvrdma/Kconfig"
source "drivers/infiniband/hw/usnic/Kconfig"
source "drivers/infiniband/hw/hns/Kconfig"
+source "drivers/infiniband/hw/bnxt_re/Kconfig"
+source "drivers/infiniband/hw/hfi1/Kconfig"
+source "drivers/infiniband/hw/qedr/Kconfig"
+source "drivers/infiniband/sw/rdmavt/Kconfig"
+source "drivers/infiniband/sw/rxe/Kconfig"
+endif
source "drivers/infiniband/ulp/ipoib/Kconfig"
@@ -111,13 +118,5 @@ source "drivers/infiniband/ulp/iser/Kconfig"
source "drivers/infiniband/ulp/isert/Kconfig"
source "drivers/infiniband/ulp/opa_vnic/Kconfig"
-source "drivers/infiniband/sw/rdmavt/Kconfig"
-source "drivers/infiniband/sw/rxe/Kconfig"
-
-source "drivers/infiniband/hw/hfi1/Kconfig"
-
-source "drivers/infiniband/hw/qedr/Kconfig"
-
-source "drivers/infiniband/hw/bnxt_re/Kconfig"
endif # INFINIBAND
diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile
index 69dee36e0e89..313f2349b518 100644
--- a/drivers/infiniband/core/Makefile
+++ b/drivers/infiniband/core/Makefile
@@ -15,8 +15,6 @@ ib_core-y := packer.o ud_header.o verbs.o cq.o rw.o sysfs.o \
nldev.o restrack.o
ib_core-$(CONFIG_SECURITY_INFINIBAND) += security.o
-ib_core-$(CONFIG_INFINIBAND_USER_MEM) += umem.o
-ib_core-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += umem_odp.o
ib_core-$(CONFIG_CGROUP_RDMA) += cgroup.o
ib_cm-y := cm.o
@@ -39,3 +37,5 @@ ib_uverbs-y := uverbs_main.o uverbs_cmd.o uverbs_marshall.o \
uverbs_std_types_flow_action.o uverbs_std_types_dm.o \
uverbs_std_types_mr.o uverbs_std_types_counters.o \
uverbs_uapi.o uverbs_std_types_device.o
+ib_uverbs-$(CONFIG_INFINIBAND_USER_MEM) += umem.o
+ib_uverbs-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += umem_odp.o
diff --git a/drivers/infiniband/core/cache.c b/drivers/infiniband/core/cache.c
index 7b04590f307f..43c67e5f43c6 100644
--- a/drivers/infiniband/core/cache.c
+++ b/drivers/infiniband/core/cache.c
@@ -185,7 +185,7 @@ EXPORT_SYMBOL(ib_cache_gid_parse_type_str);
static struct ib_gid_table *rdma_gid_table(struct ib_device *device, u8 port)
{
- return device->cache.ports[port - rdma_start_port(device)].gid;
+ return device->port_data[port].cache.gid;
}
static bool is_gid_entry_free(const struct ib_gid_table_entry *entry)
@@ -547,21 +547,19 @@ int ib_cache_gid_add(struct ib_device *ib_dev, u8 port,
unsigned long mask;
int ret;
- if (ib_dev->ops.get_netdev) {
- idev = ib_dev->ops.get_netdev(ib_dev, port);
- if (idev && attr->ndev != idev) {
- union ib_gid default_gid;
+ idev = ib_device_get_netdev(ib_dev, port);
+ if (idev && attr->ndev != idev) {
+ union ib_gid default_gid;
- /* Adding default GIDs in not permitted */
- make_default_gid(idev, &default_gid);
- if (!memcmp(gid, &default_gid, sizeof(*gid))) {
- dev_put(idev);
- return -EPERM;
- }
- }
- if (idev)
+ /* Adding default GIDs is not permitted */
+ make_default_gid(idev, &default_gid);
+ if (!memcmp(gid, &default_gid, sizeof(*gid))) {
dev_put(idev);
+ return -EPERM;
+ }
}
+ if (idev)
+ dev_put(idev);
mask = GID_ATTR_FIND_MASK_GID |
GID_ATTR_FIND_MASK_GID_TYPE |
@@ -765,7 +763,7 @@ err_free_table:
return NULL;
}
-static void release_gid_table(struct ib_device *device, u8 port,
+static void release_gid_table(struct ib_device *device,
struct ib_gid_table *table)
{
bool leak = false;
@@ -863,31 +861,27 @@ static void gid_table_reserve_default(struct ib_device *ib_dev, u8 port,
static void gid_table_release_one(struct ib_device *ib_dev)
{
- struct ib_gid_table *table;
- u8 port;
+ unsigned int p;
- for (port = 0; port < ib_dev->phys_port_cnt; port++) {
- table = ib_dev->cache.ports[port].gid;
- release_gid_table(ib_dev, port, table);
- ib_dev->cache.ports[port].gid = NULL;
+ rdma_for_each_port (ib_dev, p) {
+ release_gid_table(ib_dev, ib_dev->port_data[p].cache.gid);
+ ib_dev->port_data[p].cache.gid = NULL;
}
}
static int _gid_table_setup_one(struct ib_device *ib_dev)
{
- u8 port;
struct ib_gid_table *table;
+ unsigned int rdma_port;
- for (port = 0; port < ib_dev->phys_port_cnt; port++) {
- u8 rdma_port = port + rdma_start_port(ib_dev);
-
- table = alloc_gid_table(
- ib_dev->port_immutable[rdma_port].gid_tbl_len);
+ rdma_for_each_port (ib_dev, rdma_port) {
+ table = alloc_gid_table(
+ ib_dev->port_data[rdma_port].immutable.gid_tbl_len);
if (!table)
goto rollback_table_setup;
gid_table_reserve_default(ib_dev, rdma_port, table);
- ib_dev->cache.ports[port].gid = table;
+ ib_dev->port_data[rdma_port].cache.gid = table;
}
return 0;
@@ -898,14 +892,11 @@ rollback_table_setup:
static void gid_table_cleanup_one(struct ib_device *ib_dev)
{
- struct ib_gid_table *table;
- u8 port;
+ unsigned int p;
- for (port = 0; port < ib_dev->phys_port_cnt; port++) {
- table = ib_dev->cache.ports[port].gid;
- cleanup_gid_table_port(ib_dev, port + rdma_start_port(ib_dev),
- table);
- }
+ rdma_for_each_port (ib_dev, p)
+ cleanup_gid_table_port(ib_dev, p,
+ ib_dev->port_data[p].cache.gid);
}
static int gid_table_setup_one(struct ib_device *ib_dev)
@@ -983,17 +974,17 @@ const struct ib_gid_attr *rdma_find_gid(struct ib_device *device,
unsigned long mask = GID_ATTR_FIND_MASK_GID |
GID_ATTR_FIND_MASK_GID_TYPE;
struct ib_gid_attr gid_attr_val = {.ndev = ndev, .gid_type = gid_type};
- u8 p;
+ unsigned int p;
if (ndev)
mask |= GID_ATTR_FIND_MASK_NETDEV;
- for (p = 0; p < device->phys_port_cnt; p++) {
+ rdma_for_each_port(device, p) {
struct ib_gid_table *table;
unsigned long flags;
int index;
- table = device->cache.ports[p].gid;
+ table = device->port_data[p].cache.gid;
read_lock_irqsave(&table->rwlock, flags);
index = find_gid(table, gid, &gid_attr_val, false, mask, NULL);
if (index >= 0) {
@@ -1025,7 +1016,7 @@ int ib_get_cached_pkey(struct ib_device *device,
read_lock_irqsave(&device->cache.lock, flags);
- cache = device->cache.ports[port_num - rdma_start_port(device)].pkey;
+ cache = device->port_data[port_num].cache.pkey;
if (index < 0 || index >= cache->table_len)
ret = -EINVAL;
@@ -1043,14 +1034,12 @@ int ib_get_cached_subnet_prefix(struct ib_device *device,
u64 *sn_pfx)
{
unsigned long flags;
- int p;
if (!rdma_is_port_valid(device, port_num))
return -EINVAL;
- p = port_num - rdma_start_port(device);
read_lock_irqsave(&device->cache.lock, flags);
- *sn_pfx = device->cache.ports[p].subnet_prefix;
+ *sn_pfx = device->port_data[port_num].cache.subnet_prefix;
read_unlock_irqrestore(&device->cache.lock, flags);
return 0;
@@ -1073,7 +1062,7 @@ int ib_find_cached_pkey(struct ib_device *device,
read_lock_irqsave(&device->cache.lock, flags);
- cache = device->cache.ports[port_num - rdma_start_port(device)].pkey;
+ cache = device->port_data[port_num].cache.pkey;
*index = -1;
@@ -1113,7 +1102,7 @@ int ib_find_exact_cached_pkey(struct ib_device *device,
read_lock_irqsave(&device->cache.lock, flags);
- cache = device->cache.ports[port_num - rdma_start_port(device)].pkey;
+ cache = device->port_data[port_num].cache.pkey;
*index = -1;
@@ -1141,7 +1130,7 @@ int ib_get_cached_lmc(struct ib_device *device,
return -EINVAL;
read_lock_irqsave(&device->cache.lock, flags);
- *lmc = device->cache.ports[port_num - rdma_start_port(device)].lmc;
+ *lmc = device->port_data[port_num].cache.lmc;
read_unlock_irqrestore(&device->cache.lock, flags);
return ret;
@@ -1159,8 +1148,7 @@ int ib_get_cached_port_state(struct ib_device *device,
return -EINVAL;
read_lock_irqsave(&device->cache.lock, flags);
- *port_state = device->cache.ports[port_num
- - rdma_start_port(device)].port_state;
+ *port_state = device->port_data[port_num].cache.port_state;
read_unlock_irqrestore(&device->cache.lock, flags);
return ret;
@@ -1361,16 +1349,13 @@ static void ib_cache_update(struct ib_device *device,
write_lock_irq(&device->cache.lock);
- old_pkey_cache = device->cache.ports[port -
- rdma_start_port(device)].pkey;
+ old_pkey_cache = device->port_data[port].cache.pkey;
- device->cache.ports[port - rdma_start_port(device)].pkey = pkey_cache;
- device->cache.ports[port - rdma_start_port(device)].lmc = tprops->lmc;
- device->cache.ports[port - rdma_start_port(device)].port_state =
- tprops->state;
+ device->port_data[port].cache.pkey = pkey_cache;
+ device->port_data[port].cache.lmc = tprops->lmc;
+ device->port_data[port].cache.port_state = tprops->state;
- device->cache.ports[port - rdma_start_port(device)].subnet_prefix =
- tprops->subnet_prefix;
+ device->port_data[port].cache.subnet_prefix = tprops->subnet_prefix;
write_unlock_irq(&device->cache.lock);
if (enforce_security)
@@ -1428,27 +1413,17 @@ static void ib_cache_event(struct ib_event_handler *handler,
int ib_cache_setup_one(struct ib_device *device)
{
- int p;
+ unsigned int p;
int err;
rwlock_init(&device->cache.lock);
- device->cache.ports =
- kcalloc(rdma_end_port(device) - rdma_start_port(device) + 1,
- sizeof(*device->cache.ports),
- GFP_KERNEL);
- if (!device->cache.ports)
- return -ENOMEM;
-
err = gid_table_setup_one(device);
- if (err) {
- kfree(device->cache.ports);
- device->cache.ports = NULL;
+ if (err)
return err;
- }
- for (p = 0; p <= rdma_end_port(device) - rdma_start_port(device); ++p)
- ib_cache_update(device, p + rdma_start_port(device), true);
+ rdma_for_each_port (device, p)
+ ib_cache_update(device, p, true);
INIT_IB_EVENT_HANDLER(&device->cache.event_handler,
device, ib_cache_event);
@@ -1458,7 +1433,7 @@ int ib_cache_setup_one(struct ib_device *device)
void ib_cache_release_one(struct ib_device *device)
{
- int p;
+ unsigned int p;
/*
* The release function frees all the cache elements.
@@ -1466,11 +1441,10 @@ void ib_cache_release_one(struct ib_device *device)
* all the device's resources when the cache could no
* longer be accessed.
*/
- for (p = 0; p <= rdma_end_port(device) - rdma_start_port(device); ++p)
- kfree(device->cache.ports[p].pkey);
+ rdma_for_each_port (device, p)
+ kfree(device->port_data[p].cache.pkey);
gid_table_release_one(device);
- kfree(device->cache.ports);
}
void ib_cache_cleanup_one(struct ib_device *device)
diff --git a/drivers/infiniband/core/cgroup.c b/drivers/infiniband/core/cgroup.c
index 126ac5f99db7..388fd04e5f63 100644
--- a/drivers/infiniband/core/cgroup.c
+++ b/drivers/infiniband/core/cgroup.c
@@ -21,12 +21,11 @@
* Register with the rdma cgroup. Should be called before
* exposing rdma device to user space applications to avoid
* resource accounting leak.
- * Returns 0 on success or otherwise failure code.
*/
-int ib_device_register_rdmacg(struct ib_device *device)
+void ib_device_register_rdmacg(struct ib_device *device)
{
device->cg_device.name = device->name;
- return rdmacg_register_device(&device->cg_device);
+ rdmacg_register_device(&device->cg_device);
}
/**
diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c
index 37980c7564c0..b9416a6fca36 100644
--- a/drivers/infiniband/core/cm.c
+++ b/drivers/infiniband/core/cm.c
@@ -4052,8 +4052,7 @@ static void cm_recv_handler(struct ib_mad_agent *mad_agent,
atomic_long_inc(&port->counter_group[CM_RECV].
counter[attr_id - CM_ATTR_ID_OFFSET]);
- work = kmalloc(sizeof(*work) + sizeof(struct sa_path_rec) * paths,
- GFP_KERNEL);
+ work = kmalloc(struct_size(work, path, paths), GFP_KERNEL);
if (!work) {
ib_free_recv_mad(mad_recv_wc);
return;
diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c
index 84f077b2b90a..68c997be2429 100644
--- a/drivers/infiniband/core/cma.c
+++ b/drivers/infiniband/core/cma.c
@@ -659,7 +659,7 @@ static int cma_acquire_dev_by_src_ip(struct rdma_id_private *id_priv)
struct cma_device *cma_dev;
enum ib_gid_type gid_type;
int ret = -ENODEV;
- u8 port;
+ unsigned int port;
if (dev_addr->dev_type != ARPHRD_INFINIBAND &&
id_priv->id.ps == RDMA_PS_IPOIB)
@@ -673,8 +673,7 @@ static int cma_acquire_dev_by_src_ip(struct rdma_id_private *id_priv)
mutex_lock(&lock);
list_for_each_entry(cma_dev, &dev_list, list) {
- for (port = rdma_start_port(cma_dev->device);
- port <= rdma_end_port(cma_dev->device); port++) {
+ rdma_for_each_port (cma_dev->device, port) {
gidp = rdma_protocol_roce(cma_dev->device, port) ?
&iboe_gid : &gid;
gid_type = cma_dev->default_gid_type[port - 1];
@@ -888,6 +887,7 @@ struct rdma_cm_id *__rdma_create_id(struct net *net,
id_priv->id.ps = ps;
id_priv->id.qp_type = qp_type;
id_priv->tos_set = false;
+ id_priv->timeout_set = false;
id_priv->gid_type = IB_GID_TYPE_IB;
spin_lock_init(&id_priv->lock);
mutex_init(&id_priv->qp_mutex);
@@ -1130,6 +1130,9 @@ int rdma_init_qp_attr(struct rdma_cm_id *id, struct ib_qp_attr *qp_attr,
} else
ret = -ENOSYS;
+ if ((*qp_attr_mask & IB_QP_TIMEOUT) && id_priv->timeout_set)
+ qp_attr->timeout = id_priv->timeout;
+
return ret;
}
EXPORT_SYMBOL(rdma_init_qp_attr);
@@ -2410,6 +2413,7 @@ static int cma_iw_listen(struct rdma_id_private *id_priv, int backlog)
return PTR_ERR(id);
id->tos = id_priv->tos;
+ id->tos_set = id_priv->tos_set;
id_priv->cm_id.iw = id;
memcpy(&id_priv->cm_id.iw->local_addr, cma_src_addr(id_priv),
@@ -2462,6 +2466,8 @@ static void cma_listen_on_dev(struct rdma_id_private *id_priv,
atomic_inc(&id_priv->refcount);
dev_id_priv->internal_id = 1;
dev_id_priv->afonly = id_priv->afonly;
+ dev_id_priv->tos_set = id_priv->tos_set;
+ dev_id_priv->tos = id_priv->tos;
ret = rdma_listen(id, id_priv->backlog);
if (ret)
@@ -2490,6 +2496,34 @@ void rdma_set_service_type(struct rdma_cm_id *id, int tos)
}
EXPORT_SYMBOL(rdma_set_service_type);
+/**
+ * rdma_set_ack_timeout() - Set the ack timeout of QP associated
+ * with a connection identifier.
+ * @id: Communication identifier to associated with service type.
+ * @timeout: Ack timeout to set a QP, expressed as 4.096 * 2^(timeout) usec.
+ *
+ * This function should be called before rdma_connect() on active side,
+ * and on passive side before rdma_accept(). It is applicable to primary
+ * path only. The timeout will affect the local side of the QP, it is not
+ * negotiated with remote side and zero disables the timer.
+ *
+ * Return: 0 for success
+ */
+int rdma_set_ack_timeout(struct rdma_cm_id *id, u8 timeout)
+{
+ struct rdma_id_private *id_priv;
+
+ if (id->qp_type != IB_QPT_RC)
+ return -EINVAL;
+
+ id_priv = container_of(id, struct rdma_id_private, id);
+ id_priv->timeout = timeout;
+ id_priv->timeout_set = true;
+
+ return 0;
+}
+EXPORT_SYMBOL(rdma_set_ack_timeout);
+
static void cma_query_handler(int status, struct sa_path_rec *path_rec,
void *context)
{
@@ -2966,13 +3000,22 @@ static void addr_handler(int status, struct sockaddr *src_addr,
{
struct rdma_id_private *id_priv = context;
struct rdma_cm_event event = {};
+ struct sockaddr *addr;
+ struct sockaddr_storage old_addr;
mutex_lock(&id_priv->handler_mutex);
if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_QUERY,
RDMA_CM_ADDR_RESOLVED))
goto out;
- memcpy(cma_src_addr(id_priv), src_addr, rdma_addr_size(src_addr));
+ /*
+ * Store the previous src address, so that if we fail to acquire
+ * matching rdma device, old address can be restored back, which helps
+ * to cancel the cma listen operation correctly.
+ */
+ addr = cma_src_addr(id_priv);
+ memcpy(&old_addr, addr, rdma_addr_size(addr));
+ memcpy(addr, src_addr, rdma_addr_size(src_addr));
if (!status && !id_priv->cma_dev) {
status = cma_acquire_dev_by_src_ip(id_priv);
if (status)
@@ -2983,6 +3026,8 @@ static void addr_handler(int status, struct sockaddr *src_addr,
}
if (status) {
+ memcpy(addr, &old_addr,
+ rdma_addr_size((struct sockaddr *)&old_addr));
if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_RESOLVED,
RDMA_CM_ADDR_BOUND))
goto out;
@@ -3798,6 +3843,7 @@ static int cma_connect_iw(struct rdma_id_private *id_priv,
return PTR_ERR(cm_id);
cm_id->tos = id_priv->tos;
+ cm_id->tos_set = id_priv->tos_set;
id_priv->cm_id.iw = cm_id;
memcpy(&cm_id->local_addr, cma_src_addr(id_priv),
@@ -4501,7 +4547,7 @@ static void cma_add_one(struct ib_device *device)
if (!cma_dev->default_roce_tos)
goto free_gid_type;
- for (i = rdma_start_port(device); i <= rdma_end_port(device); i++) {
+ rdma_for_each_port (device, i) {
supported_gids = roce_gid_type_mask_support(device, i);
WARN_ON(!supported_gids);
if (supported_gids & (1 << CMA_PREFERRED_ROCE_GID_TYPE))
@@ -4605,85 +4651,6 @@ static void cma_remove_one(struct ib_device *device, void *client_data)
kfree(cma_dev);
}
-static int cma_get_id_stats(struct sk_buff *skb, struct netlink_callback *cb)
-{
- struct nlmsghdr *nlh;
- struct rdma_cm_id_stats *id_stats;
- struct rdma_id_private *id_priv;
- struct rdma_cm_id *id = NULL;
- struct cma_device *cma_dev;
- int i_dev = 0, i_id = 0;
-
- /*
- * We export all of the IDs as a sequence of messages. Each
- * ID gets its own netlink message.
- */
- mutex_lock(&lock);
-
- list_for_each_entry(cma_dev, &dev_list, list) {
- if (i_dev < cb->args[0]) {
- i_dev++;
- continue;
- }
-
- i_id = 0;
- list_for_each_entry(id_priv, &cma_dev->id_list, list) {
- if (i_id < cb->args[1]) {
- i_id++;
- continue;
- }
-
- id_stats = ibnl_put_msg(skb, &nlh, cb->nlh->nlmsg_seq,
- sizeof *id_stats, RDMA_NL_RDMA_CM,
- RDMA_NL_RDMA_CM_ID_STATS,
- NLM_F_MULTI);
- if (!id_stats)
- goto out;
-
- memset(id_stats, 0, sizeof *id_stats);
- id = &id_priv->id;
- id_stats->node_type = id->route.addr.dev_addr.dev_type;
- id_stats->port_num = id->port_num;
- id_stats->bound_dev_if =
- id->route.addr.dev_addr.bound_dev_if;
-
- if (ibnl_put_attr(skb, nlh,
- rdma_addr_size(cma_src_addr(id_priv)),
- cma_src_addr(id_priv),
- RDMA_NL_RDMA_CM_ATTR_SRC_ADDR))
- goto out;
- if (ibnl_put_attr(skb, nlh,
- rdma_addr_size(cma_dst_addr(id_priv)),
- cma_dst_addr(id_priv),
- RDMA_NL_RDMA_CM_ATTR_DST_ADDR))
- goto out;
-
- id_stats->pid = task_pid_vnr(id_priv->res.task);
- id_stats->port_space = id->ps;
- id_stats->cm_state = id_priv->state;
- id_stats->qp_num = id_priv->qp_num;
- id_stats->qp_type = id->qp_type;
-
- i_id++;
- nlmsg_end(skb, nlh);
- }
-
- cb->args[1] = 0;
- i_dev++;
- }
-
-out:
- mutex_unlock(&lock);
- cb->args[0] = i_dev;
- cb->args[1] = i_id;
-
- return skb->len;
-}
-
-static const struct rdma_nl_cbs cma_cb_table[RDMA_NL_RDMA_CM_NUM_OPS] = {
- [RDMA_NL_RDMA_CM_ID_STATS] = { .dump = cma_get_id_stats},
-};
-
static int cma_init_net(struct net *net)
{
struct cma_pernet *pernet = cma_pernet(net);
@@ -4732,7 +4699,6 @@ static int __init cma_init(void)
if (ret)
goto err;
- rdma_nl_register(RDMA_NL_RDMA_CM, cma_cb_table);
cma_configfs_init();
return 0;
@@ -4748,7 +4714,6 @@ err_wq:
static void __exit cma_cleanup(void)
{
cma_configfs_exit();
- rdma_nl_unregister(RDMA_NL_RDMA_CM);
ib_unregister_client(&cma_client);
unregister_netdevice_notifier(&cma_nb);
ib_sa_unregister_client(&sa_client);
@@ -4756,7 +4721,5 @@ static void __exit cma_cleanup(void)
destroy_workqueue(cma_wq);
}
-MODULE_ALIAS_RDMA_NETLINK(RDMA_NL_RDMA_CM, 1);
-
module_init(cma_init);
module_exit(cma_cleanup);
diff --git a/drivers/infiniband/core/cma_priv.h b/drivers/infiniband/core/cma_priv.h
index cf47c69436a7..ca7307277518 100644
--- a/drivers/infiniband/core/cma_priv.h
+++ b/drivers/infiniband/core/cma_priv.h
@@ -84,9 +84,11 @@ struct rdma_id_private {
u32 options;
u8 srq;
u8 tos;
- bool tos_set;
+ u8 tos_set:1;
+ u8 timeout_set:1;
u8 reuseaddr;
u8 afonly;
+ u8 timeout;
enum ib_gid_type gid_type;
/*
diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h
index 616734313f0c..08c690249594 100644
--- a/drivers/infiniband/core/core_priv.h
+++ b/drivers/infiniband/core/core_priv.h
@@ -54,9 +54,9 @@ struct pkey_index_qp_list {
struct list_head qp_list;
};
-int ib_device_register_sysfs(struct ib_device *device,
- int (*port_callback)(struct ib_device *,
- u8, struct kobject *));
+extern const struct attribute_group ib_dev_attr_group;
+
+int ib_device_register_sysfs(struct ib_device *device);
void ib_device_unregister_sysfs(struct ib_device *device);
int ib_device_rename(struct ib_device *ibdev, const char *name);
@@ -66,6 +66,9 @@ typedef void (*roce_netdev_callback)(struct ib_device *device, u8 port,
typedef bool (*roce_netdev_filter)(struct ib_device *device, u8 port,
struct net_device *idev, void *cookie);
+struct net_device *ib_device_get_netdev(struct ib_device *ib_dev,
+ unsigned int port);
+
void ib_enum_roce_netdev(struct ib_device *ib_dev,
roce_netdev_filter filter,
void *filter_cookie,
@@ -117,7 +120,7 @@ void ib_cache_cleanup_one(struct ib_device *device);
void ib_cache_release_one(struct ib_device *device);
#ifdef CONFIG_CGROUP_RDMA
-int ib_device_register_rdmacg(struct ib_device *device);
+void ib_device_register_rdmacg(struct ib_device *device);
void ib_device_unregister_rdmacg(struct ib_device *device);
int ib_rdmacg_try_charge(struct ib_rdmacg_object *cg_obj,
@@ -128,21 +131,26 @@ void ib_rdmacg_uncharge(struct ib_rdmacg_object *cg_obj,
struct ib_device *device,
enum rdmacg_resource_type resource_index);
#else
-static inline int ib_device_register_rdmacg(struct ib_device *device)
-{ return 0; }
+static inline void ib_device_register_rdmacg(struct ib_device *device)
+{
+}
static inline void ib_device_unregister_rdmacg(struct ib_device *device)
-{ }
+{
+}
static inline int ib_rdmacg_try_charge(struct ib_rdmacg_object *cg_obj,
struct ib_device *device,
enum rdmacg_resource_type resource_index)
-{ return 0; }
+{
+ return 0;
+}
static inline void ib_rdmacg_uncharge(struct ib_rdmacg_object *cg_obj,
struct ib_device *device,
enum rdmacg_resource_type resource_index)
-{ }
+{
+}
#endif
static inline bool rdma_is_upper_dev_rcu(struct net_device *dev,
@@ -178,7 +186,7 @@ int ib_get_cached_subnet_prefix(struct ib_device *device,
u64 *sn_pfx);
#ifdef CONFIG_SECURITY_INFINIBAND
-void ib_security_destroy_port_pkey_list(struct ib_device *device);
+void ib_security_release_port_pkey_list(struct ib_device *device);
void ib_security_cache_change(struct ib_device *device,
u8 port_num,
@@ -199,8 +207,9 @@ int ib_mad_agent_security_setup(struct ib_mad_agent *agent,
enum ib_qp_type qp_type);
void ib_mad_agent_security_cleanup(struct ib_mad_agent *agent);
int ib_mad_enforce_security(struct ib_mad_agent_private *map, u16 pkey_index);
+void ib_mad_agent_security_change(void);
#else
-static inline void ib_security_destroy_port_pkey_list(struct ib_device *device)
+static inline void ib_security_release_port_pkey_list(struct ib_device *device)
{
}
@@ -264,6 +273,10 @@ static inline int ib_mad_enforce_security(struct ib_mad_agent_private *map,
{
return 0;
}
+
+static inline void ib_mad_agent_security_change(void)
+{
+}
#endif
struct ib_device *ib_device_get_by_index(u32 ifindex);
diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
index 238ec42778ef..a9f29156e486 100644
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -37,54 +37,111 @@
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/init.h>
-#include <linux/mutex.h>
#include <linux/netdevice.h>
#include <linux/security.h>
#include <linux/notifier.h>
+#include <linux/hashtable.h>
#include <rdma/rdma_netlink.h>
#include <rdma/ib_addr.h>
#include <rdma/ib_cache.h>
#include "core_priv.h"
+#include "restrack.h"
MODULE_AUTHOR("Roland Dreier");
MODULE_DESCRIPTION("core kernel InfiniBand API");
MODULE_LICENSE("Dual BSD/GPL");
-struct ib_client_data {
- struct list_head list;
- struct ib_client *client;
- void * data;
- /* The device or client is going down. Do not call client or device
- * callbacks other than remove(). */
- bool going_down;
-};
-
struct workqueue_struct *ib_comp_wq;
struct workqueue_struct *ib_comp_unbound_wq;
struct workqueue_struct *ib_wq;
EXPORT_SYMBOL_GPL(ib_wq);
-/* The device_list and client_list contain devices and clients after their
- * registration has completed, and the devices and clients are removed
- * during unregistration. */
-static LIST_HEAD(device_list);
-static LIST_HEAD(client_list);
+/*
+ * Each of the three rwsem locks (devices, clients, client_data) protects the
+ * xarray of the same name. Specifically it allows the caller to assert that
+ * the MARK will/will not be changing under the lock, and for devices and
+ * clients, that the value in the xarray is still a valid pointer. Change of
+ * the MARK is linked to the object state, so holding the lock and testing the
+ * MARK also asserts that the contained object is in a certain state.
+ *
+ * This is used to build a two stage register/unregister flow where objects
+ * can continue to be in the xarray even though they are still in progress to
+ * register/unregister.
+ *
+ * The xarray itself provides additional locking, and restartable iteration,
+ * which is also relied on.
+ *
+ * Locks should not be nested, with the exception of client_data, which is
+ * allowed to nest under the read side of the other two locks.
+ *
+ * The devices_rwsem also protects the device name list, any change or
+ * assignment of device name must also hold the write side to guarantee unique
+ * names.
+ */
/*
- * device_mutex and lists_rwsem protect access to both device_list and
- * client_list. device_mutex protects writer access by device and client
- * registration / de-registration. lists_rwsem protects reader access to
- * these lists. Iterators of these lists must lock it for read, while updates
- * to the lists must be done with a write lock. A special case is when the
- * device_mutex is locked. In this case locking the lists for read access is
- * not necessary as the device_mutex implies it.
+ * devices contains devices that have had their names assigned. The
+ * devices may not be registered. Users that care about the registration
+ * status need to call ib_device_try_get() on the device to ensure it is
+ * registered, and keep it registered, for the required duration.
*
- * lists_rwsem also protects access to the client data list.
*/
-static DEFINE_MUTEX(device_mutex);
-static DECLARE_RWSEM(lists_rwsem);
+static DEFINE_XARRAY_FLAGS(devices, XA_FLAGS_ALLOC);
+static DECLARE_RWSEM(devices_rwsem);
+#define DEVICE_REGISTERED XA_MARK_1
+static LIST_HEAD(client_list);
+#define CLIENT_REGISTERED XA_MARK_1
+static DEFINE_XARRAY_FLAGS(clients, XA_FLAGS_ALLOC);
+static DECLARE_RWSEM(clients_rwsem);
+
+/*
+ * If client_data is registered then the corresponding client must also still
+ * be registered.
+ */
+#define CLIENT_DATA_REGISTERED XA_MARK_1
+/*
+ * xarray has this behavior where it won't iterate over NULL values stored in
+ * allocated arrays. So we need our own iterator to see all values stored in
+ * the array. This does the same thing as xa_for_each except that it also
+ * returns NULL valued entries if the array is allocating. Simplified to only
+ * work on simple xarrays.
+ */
+static void *xan_find_marked(struct xarray *xa, unsigned long *indexp,
+ xa_mark_t filter)
+{
+ XA_STATE(xas, xa, *indexp);
+ void *entry;
+
+ rcu_read_lock();
+ do {
+ entry = xas_find_marked(&xas, ULONG_MAX, filter);
+ if (xa_is_zero(entry))
+ break;
+ } while (xas_retry(&xas, entry));
+ rcu_read_unlock();
+
+ if (entry) {
+ *indexp = xas.xa_index;
+ if (xa_is_zero(entry))
+ return NULL;
+ return entry;
+ }
+ return XA_ERROR(-ENOENT);
+}
+#define xan_for_each_marked(xa, index, entry, filter) \
+ for (index = 0, entry = xan_find_marked(xa, &(index), filter); \
+ !xa_is_err(entry); \
+ (index)++, entry = xan_find_marked(xa, &(index), filter))
+
+/* RCU hash table mapping netdevice pointers to struct ib_port_data */
+static DEFINE_SPINLOCK(ndev_hash_lock);
+static DECLARE_HASHTABLE(ndev_hash, 5);
+
+static void free_netdevs(struct ib_device *ib_dev);
+static void ib_unregister_work(struct work_struct *work);
+static void __ib_unregister_device(struct ib_device *device);
static int ib_security_change(struct notifier_block *nb, unsigned long event,
void *lsm_data);
static void ib_policy_change_task(struct work_struct *work);
@@ -94,6 +151,12 @@ static struct notifier_block ibdev_lsm_nb = {
.notifier_call = ib_security_change,
};
+/* Pointer to the RCU head at the start of the ib_port_data array */
+struct ib_port_data_rcu {
+ struct rcu_head rcu_head;
+ struct ib_port_data pdata[];
+};
+
static int ib_device_check_mandatory(struct ib_device *device)
{
#define IB_MANDATORY_FUNC(x) { offsetof(struct ib_device_ops, x), #x }
@@ -121,30 +184,18 @@ static int ib_device_check_mandatory(struct ib_device *device)
};
int i;
+ device->kverbs_provider = true;
for (i = 0; i < ARRAY_SIZE(mandatory_table); ++i) {
if (!*(void **) ((void *) &device->ops +
mandatory_table[i].offset)) {
- dev_warn(&device->dev,
- "Device is missing mandatory function %s\n",
- mandatory_table[i].name);
- return -EINVAL;
+ device->kverbs_provider = false;
+ break;
}
}
return 0;
}
-static struct ib_device *__ib_device_get_by_index(u32 index)
-{
- struct ib_device *device;
-
- list_for_each_entry(device, &device_list, core_list)
- if (device->index == index)
- return device;
-
- return NULL;
-}
-
/*
* Caller must perform ib_device_put() to return the device reference count
* when ib_device_get_by_index() returns valid device pointer.
@@ -153,13 +204,13 @@ struct ib_device *ib_device_get_by_index(u32 index)
{
struct ib_device *device;
- down_read(&lists_rwsem);
- device = __ib_device_get_by_index(index);
+ down_read(&devices_rwsem);
+ device = xa_load(&devices, index);
if (device) {
if (!ib_device_try_get(device))
device = NULL;
}
- up_read(&lists_rwsem);
+ up_read(&devices_rwsem);
return device;
}
@@ -180,28 +231,56 @@ EXPORT_SYMBOL(ib_device_put);
static struct ib_device *__ib_device_get_by_name(const char *name)
{
struct ib_device *device;
+ unsigned long index;
- list_for_each_entry(device, &device_list, core_list)
+ xa_for_each (&devices, index, device)
if (!strcmp(name, dev_name(&device->dev)))
return device;
return NULL;
}
-int ib_device_rename(struct ib_device *ibdev, const char *name)
+/**
+ * ib_device_get_by_name - Find an IB device by name
+ * @name: The name to look for
+ * @driver_id: The driver ID that must match (RDMA_DRIVER_UNKNOWN matches all)
+ *
+ * Find and hold an ib_device by its name. The caller must call
+ * ib_device_put() on the returned pointer.
+ */
+struct ib_device *ib_device_get_by_name(const char *name,
+ enum rdma_driver_id driver_id)
{
struct ib_device *device;
- int ret = 0;
- if (!strcmp(name, dev_name(&ibdev->dev)))
- return ret;
+ down_read(&devices_rwsem);
+ device = __ib_device_get_by_name(name);
+ if (device && driver_id != RDMA_DRIVER_UNKNOWN &&
+ device->driver_id != driver_id)
+ device = NULL;
- mutex_lock(&device_mutex);
- list_for_each_entry(device, &device_list, core_list) {
- if (!strcmp(name, dev_name(&device->dev))) {
- ret = -EEXIST;
- goto out;
- }
+ if (device) {
+ if (!ib_device_try_get(device))
+ device = NULL;
+ }
+ up_read(&devices_rwsem);
+ return device;
+}
+EXPORT_SYMBOL(ib_device_get_by_name);
+
+int ib_device_rename(struct ib_device *ibdev, const char *name)
+{
+ int ret;
+
+ down_write(&devices_rwsem);
+ if (!strcmp(name, dev_name(&ibdev->dev))) {
+ ret = 0;
+ goto out;
+ }
+
+ if (__ib_device_get_by_name(name)) {
+ ret = -EEXIST;
+ goto out;
}
ret = device_rename(&ibdev->dev, name);
@@ -209,53 +288,60 @@ int ib_device_rename(struct ib_device *ibdev, const char *name)
goto out;
strlcpy(ibdev->name, name, IB_DEVICE_NAME_MAX);
out:
- mutex_unlock(&device_mutex);
+ up_write(&devices_rwsem);
return ret;
}
static int alloc_name(struct ib_device *ibdev, const char *name)
{
- unsigned long *inuse;
struct ib_device *device;
+ unsigned long index;
+ struct ida inuse;
+ int rc;
int i;
- inuse = (unsigned long *) get_zeroed_page(GFP_KERNEL);
- if (!inuse)
- return -ENOMEM;
-
- list_for_each_entry(device, &device_list, core_list) {
+ lockdep_assert_held_exclusive(&devices_rwsem);
+ ida_init(&inuse);
+ xa_for_each (&devices, index, device) {
char buf[IB_DEVICE_NAME_MAX];
if (sscanf(dev_name(&device->dev), name, &i) != 1)
continue;
- if (i < 0 || i >= PAGE_SIZE * 8)
+ if (i < 0 || i >= INT_MAX)
continue;
snprintf(buf, sizeof buf, name, i);
- if (!strcmp(buf, dev_name(&device->dev)))
- set_bit(i, inuse);
+ if (strcmp(buf, dev_name(&device->dev)) != 0)
+ continue;
+
+ rc = ida_alloc_range(&inuse, i, i, GFP_KERNEL);
+ if (rc < 0)
+ goto out;
}
- i = find_first_zero_bit(inuse, PAGE_SIZE * 8);
- free_page((unsigned long) inuse);
+ rc = ida_alloc(&inuse, GFP_KERNEL);
+ if (rc < 0)
+ goto out;
- return dev_set_name(&ibdev->dev, name, i);
+ rc = dev_set_name(&ibdev->dev, name, rc);
+out:
+ ida_destroy(&inuse);
+ return rc;
}
static void ib_device_release(struct device *device)
{
struct ib_device *dev = container_of(device, struct ib_device, dev);
- WARN_ON(dev->reg_state == IB_DEV_REGISTERED);
- if (dev->reg_state == IB_DEV_UNREGISTERED) {
- /*
- * In IB_DEV_UNINITIALIZED state, cache or port table
- * is not even created. Free cache and port table only when
- * device reaches UNREGISTERED state.
- */
- ib_cache_release_one(dev);
- kfree(dev->port_immutable);
- }
- kfree(dev);
+ free_netdevs(dev);
+ WARN_ON(refcount_read(&dev->refcount));
+ ib_cache_release_one(dev);
+ ib_security_release_port_pkey_list(dev);
+ xa_destroy(&dev->client_data);
+ if (dev->port_data)
+ kfree_rcu(container_of(dev->port_data, struct ib_port_data_rcu,
+ pdata[0]),
+ rcu_head);
+ kfree_rcu(dev, rcu_head);
}
static int ib_device_uevent(struct device *device,
@@ -278,7 +364,7 @@ static struct class ib_class = {
};
/**
- * ib_alloc_device - allocate an IB device struct
+ * _ib_alloc_device - allocate an IB device struct
* @size:size of structure to allocate
*
* Low-level drivers should use ib_alloc_device() to allocate &struct
@@ -287,7 +373,7 @@ static struct class ib_class = {
* ib_dealloc_device() must be used to free structures allocated with
* ib_alloc_device().
*/
-struct ib_device *ib_alloc_device(size_t size)
+struct ib_device *_ib_alloc_device(size_t size)
{
struct ib_device *device;
@@ -298,23 +384,32 @@ struct ib_device *ib_alloc_device(size_t size)
if (!device)
return NULL;
- rdma_restrack_init(&device->res);
+ if (rdma_restrack_init(device)) {
+ kfree(device);
+ return NULL;
+ }
device->dev.class = &ib_class;
+ device->groups[0] = &ib_dev_attr_group;
+ device->dev.groups = device->groups;
device_initialize(&device->dev);
- dev_set_drvdata(&device->dev, device);
-
INIT_LIST_HEAD(&device->event_handler_list);
spin_lock_init(&device->event_handler_lock);
- rwlock_init(&device->client_data_lock);
- INIT_LIST_HEAD(&device->client_data_list);
+ mutex_init(&device->unregistration_lock);
+ /*
+ * client_data needs to be alloc because we don't want our mark to be
+ * destroyed if the user stores NULL in the client data.
+ */
+ xa_init_flags(&device->client_data, XA_FLAGS_ALLOC);
+ init_rwsem(&device->client_data_rwsem);
INIT_LIST_HEAD(&device->port_list);
init_completion(&device->unreg_completion);
+ INIT_WORK(&device->unregistration_work, ib_unregister_work);
return device;
}
-EXPORT_SYMBOL(ib_alloc_device);
+EXPORT_SYMBOL(_ib_alloc_device);
/**
* ib_dealloc_device - free an IB device struct
@@ -324,32 +419,153 @@ EXPORT_SYMBOL(ib_alloc_device);
*/
void ib_dealloc_device(struct ib_device *device)
{
- WARN_ON(!list_empty(&device->client_data_list));
- WARN_ON(device->reg_state != IB_DEV_UNREGISTERED &&
- device->reg_state != IB_DEV_UNINITIALIZED);
- rdma_restrack_clean(&device->res);
+ if (device->ops.dealloc_driver)
+ device->ops.dealloc_driver(device);
+
+ /*
+ * ib_unregister_driver() requires all devices to remain in the xarray
+ * while their ops are callable. The last op we call is dealloc_driver
+ * above. This is needed to create a fence on op callbacks prior to
+ * allowing the driver module to unload.
+ */
+ down_write(&devices_rwsem);
+ if (xa_load(&devices, device->index) == device)
+ xa_erase(&devices, device->index);
+ up_write(&devices_rwsem);
+
+ /* Expedite releasing netdev references */
+ free_netdevs(device);
+
+ WARN_ON(!xa_empty(&device->client_data));
+ WARN_ON(refcount_read(&device->refcount));
+ rdma_restrack_clean(device);
+ /* Balances with device_initialize */
put_device(&device->dev);
}
EXPORT_SYMBOL(ib_dealloc_device);
-static int add_client_context(struct ib_device *device, struct ib_client *client)
+/*
+ * add_client_context() and remove_client_context() must be safe against
+ * parallel calls on the same device - registration/unregistration of both the
+ * device and client can be occurring in parallel.
+ *
+ * The routines need to be a fence, any caller must not return until the add
+ * or remove is fully completed.
+ */
+static int add_client_context(struct ib_device *device,
+ struct ib_client *client)
{
- struct ib_client_data *context;
+ int ret = 0;
- context = kmalloc(sizeof(*context), GFP_KERNEL);
- if (!context)
- return -ENOMEM;
+ if (!device->kverbs_provider && !client->no_kverbs_req)
+ return 0;
+
+ down_write(&device->client_data_rwsem);
+ /*
+ * Another caller to add_client_context got here first and has already
+ * completely initialized context.
+ */
+ if (xa_get_mark(&device->client_data, client->client_id,
+ CLIENT_DATA_REGISTERED))
+ goto out;
+
+ ret = xa_err(xa_store(&device->client_data, client->client_id, NULL,
+ GFP_KERNEL));
+ if (ret)
+ goto out;
+ downgrade_write(&device->client_data_rwsem);
+ if (client->add)
+ client->add(device);
+
+ /* Readers shall not see a client until add has been completed */
+ xa_set_mark(&device->client_data, client->client_id,
+ CLIENT_DATA_REGISTERED);
+ up_read(&device->client_data_rwsem);
+ return 0;
- context->client = client;
- context->data = NULL;
- context->going_down = false;
+out:
+ up_write(&device->client_data_rwsem);
+ return ret;
+}
- down_write(&lists_rwsem);
- write_lock_irq(&device->client_data_lock);
- list_add(&context->list, &device->client_data_list);
- write_unlock_irq(&device->client_data_lock);
- up_write(&lists_rwsem);
+static void remove_client_context(struct ib_device *device,
+ unsigned int client_id)
+{
+ struct ib_client *client;
+ void *client_data;
+
+ down_write(&device->client_data_rwsem);
+ if (!xa_get_mark(&device->client_data, client_id,
+ CLIENT_DATA_REGISTERED)) {
+ up_write(&device->client_data_rwsem);
+ return;
+ }
+ client_data = xa_load(&device->client_data, client_id);
+ xa_clear_mark(&device->client_data, client_id, CLIENT_DATA_REGISTERED);
+ client = xa_load(&clients, client_id);
+ downgrade_write(&device->client_data_rwsem);
+
+ /*
+ * Notice we cannot be holding any exclusive locks when calling the
+ * remove callback as the remove callback can recurse back into any
+ * public functions in this module and thus try for any locks those
+ * functions take.
+ *
+ * For this reason clients and drivers should not call the
+ * unregistration functions will holdling any locks.
+ *
+ * It tempting to drop the client_data_rwsem too, but this is required
+ * to ensure that unregister_client does not return until all clients
+ * are completely unregistered, which is required to avoid module
+ * unloading races.
+ */
+ if (client->remove)
+ client->remove(device, client_data);
+
+ xa_erase(&device->client_data, client_id);
+ up_read(&device->client_data_rwsem);
+}
+
+static int alloc_port_data(struct ib_device *device)
+{
+ struct ib_port_data_rcu *pdata_rcu;
+ unsigned int port;
+
+ if (device->port_data)
+ return 0;
+ /* This can only be called once the physical port range is defined */
+ if (WARN_ON(!device->phys_port_cnt))
+ return -EINVAL;
+
+ /*
+ * device->port_data is indexed directly by the port number to make
+ * access to this data as efficient as possible.
+ *
+ * Therefore port_data is declared as a 1 based array with potential
+ * empty slots at the beginning.
+ */
+ pdata_rcu = kzalloc(struct_size(pdata_rcu, pdata,
+ rdma_end_port(device) + 1),
+ GFP_KERNEL);
+ if (!pdata_rcu)
+ return -ENOMEM;
+ /*
+ * The rcu_head is put in front of the port data array and the stored
+ * pointer is adjusted since we never need to see that member until
+ * kfree_rcu.
+ */
+ device->port_data = pdata_rcu->pdata;
+
+ rdma_for_each_port (device, port) {
+ struct ib_port_data *pdata = &device->port_data[port];
+
+ pdata->ib_dev = device;
+ spin_lock_init(&pdata->pkey_list_lock);
+ INIT_LIST_HEAD(&pdata->pkey_list);
+ spin_lock_init(&pdata->netdev_lock);
+ INIT_HLIST_NODE(&pdata->ndev_hash_link);
+ }
return 0;
}
@@ -359,29 +575,20 @@ static int verify_immutable(const struct ib_device *dev, u8 port)
rdma_max_mad_size(dev, port) != 0);
}
-static int read_port_immutable(struct ib_device *device)
+static int setup_port_data(struct ib_device *device)
{
+ unsigned int port;
int ret;
- u8 start_port = rdma_start_port(device);
- u8 end_port = rdma_end_port(device);
- u8 port;
- /**
- * device->port_immutable is indexed directly by the port number to make
- * access to this data as efficient as possible.
- *
- * Therefore port_immutable is declared as a 1 based array with
- * potential empty slots at the beginning.
- */
- device->port_immutable = kcalloc(end_port + 1,
- sizeof(*device->port_immutable),
- GFP_KERNEL);
- if (!device->port_immutable)
- return -ENOMEM;
+ ret = alloc_port_data(device);
+ if (ret)
+ return ret;
- for (port = start_port; port <= end_port; ++port) {
- ret = device->ops.get_port_immutable(
- device, port, &device->port_immutable[port]);
+ rdma_for_each_port (device, port) {
+ struct ib_port_data *pdata = &device->port_data[port];
+
+ ret = device->ops.get_port_immutable(device, port,
+ &pdata->immutable);
if (ret)
return ret;
@@ -400,39 +607,16 @@ void ib_get_device_fw_str(struct ib_device *dev, char *str)
}
EXPORT_SYMBOL(ib_get_device_fw_str);
-static int setup_port_pkey_list(struct ib_device *device)
-{
- int i;
-
- /**
- * device->port_pkey_list is indexed directly by the port number,
- * Therefore it is declared as a 1 based array with potential empty
- * slots at the beginning.
- */
- device->port_pkey_list = kcalloc(rdma_end_port(device) + 1,
- sizeof(*device->port_pkey_list),
- GFP_KERNEL);
-
- if (!device->port_pkey_list)
- return -ENOMEM;
-
- for (i = 0; i < (rdma_end_port(device) + 1); i++) {
- spin_lock_init(&device->port_pkey_list[i].list_lock);
- INIT_LIST_HEAD(&device->port_pkey_list[i].pkey_list);
- }
-
- return 0;
-}
-
static void ib_policy_change_task(struct work_struct *work)
{
struct ib_device *dev;
+ unsigned long index;
- down_read(&lists_rwsem);
- list_for_each_entry(dev, &device_list, core_list) {
- int i;
+ down_read(&devices_rwsem);
+ xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) {
+ unsigned int i;
- for (i = rdma_start_port(dev); i <= rdma_end_port(dev); i++) {
+ rdma_for_each_port (dev, i) {
u64 sp;
int ret = ib_get_cached_subnet_prefix(dev,
i,
@@ -445,7 +629,7 @@ static void ib_policy_change_task(struct work_struct *work)
ib_security_cache_change(dev, i, sp);
}
}
- up_read(&lists_rwsem);
+ up_read(&devices_rwsem);
}
static int ib_security_change(struct notifier_block *nb, unsigned long event,
@@ -455,32 +639,52 @@ static int ib_security_change(struct notifier_block *nb, unsigned long event,
return NOTIFY_DONE;
schedule_work(&ib_policy_change_work);
+ ib_mad_agent_security_change();
return NOTIFY_OK;
}
-/**
- * __dev_new_index - allocate an device index
- *
- * Returns a suitable unique value for a new device interface
- * number. It assumes that there are less than 2^32-1 ib devices
- * will be present in the system.
+/*
+ * Assign the unique string device name and the unique device index. This is
+ * undone by ib_dealloc_device.
*/
-static u32 __dev_new_index(void)
+static int assign_name(struct ib_device *device, const char *name)
{
- /*
- * The device index to allow stable naming.
- * Similar to struct net -> ifindex.
- */
- static u32 index;
+ static u32 last_id;
+ int ret;
- for (;;) {
- if (!(++index))
- index = 1;
+ down_write(&devices_rwsem);
+ /* Assign a unique name to the device */
+ if (strchr(name, '%'))
+ ret = alloc_name(device, name);
+ else
+ ret = dev_set_name(&device->dev, name);
+ if (ret)
+ goto out;
+
+ if (__ib_device_get_by_name(dev_name(&device->dev))) {
+ ret = -ENFILE;
+ goto out;
+ }
+ strlcpy(device->name, dev_name(&device->dev), IB_DEVICE_NAME_MAX);
- if (!__ib_device_get_by_index(index))
- return index;
+ /* Cyclically allocate a user visible ID for the device */
+ device->index = last_id;
+ ret = xa_alloc(&devices, &device->index, INT_MAX, device, GFP_KERNEL);
+ if (ret == -ENOSPC) {
+ device->index = 0;
+ ret = xa_alloc(&devices, &device->index, INT_MAX, device,
+ GFP_KERNEL);
}
+ if (ret)
+ goto out;
+ last_id = device->index + 1;
+
+ ret = 0;
+
+out:
+ up_write(&devices_rwsem);
+ return ret;
}
static void setup_dma_device(struct ib_device *device)
@@ -518,27 +722,25 @@ static void setup_dma_device(struct ib_device *device)
}
}
-static void cleanup_device(struct ib_device *device)
-{
- ib_cache_cleanup_one(device);
- ib_cache_release_one(device);
- kfree(device->port_pkey_list);
- kfree(device->port_immutable);
-}
-
+/*
+ * setup_device() allocates memory and sets up data that requires calling the
+ * device ops, this is the only reason these actions are not done during
+ * ib_alloc_device. It is undone by ib_dealloc_device().
+ */
static int setup_device(struct ib_device *device)
{
struct ib_udata uhw = {.outlen = 0, .inlen = 0};
int ret;
+ setup_dma_device(device);
+
ret = ib_device_check_mandatory(device);
if (ret)
return ret;
- ret = read_port_immutable(device);
+ ret = setup_port_data(device);
if (ret) {
- dev_warn(&device->dev,
- "Couldn't create per port immutable data\n");
+ dev_warn(&device->dev, "Couldn't create per-port data\n");
return ret;
}
@@ -547,27 +749,76 @@ static int setup_device(struct ib_device *device)
if (ret) {
dev_warn(&device->dev,
"Couldn't query the device attributes\n");
- goto port_cleanup;
+ return ret;
}
- ret = setup_port_pkey_list(device);
- if (ret) {
- dev_warn(&device->dev, "Couldn't create per port_pkey_list\n");
- goto port_cleanup;
+ return 0;
+}
+
+static void disable_device(struct ib_device *device)
+{
+ struct ib_client *client;
+
+ WARN_ON(!refcount_read(&device->refcount));
+
+ down_write(&devices_rwsem);
+ xa_clear_mark(&devices, device->index, DEVICE_REGISTERED);
+ up_write(&devices_rwsem);
+
+ down_read(&clients_rwsem);
+ list_for_each_entry_reverse(client, &client_list, list)
+ remove_client_context(device, client->client_id);
+ up_read(&clients_rwsem);
+
+ /* Pairs with refcount_set in enable_device */
+ ib_device_put(device);
+ wait_for_completion(&device->unreg_completion);
+
+ /* Expedite removing unregistered pointers from the hash table */
+ free_netdevs(device);
+}
+
+/*
+ * An enabled device is visible to all clients and to all the public facing
+ * APIs that return a device pointer. This always returns with a new get, even
+ * if it fails.
+ */
+static int enable_device_and_get(struct ib_device *device)
+{
+ struct ib_client *client;
+ unsigned long index;
+ int ret = 0;
+
+ /*
+ * One ref belongs to the xa and the other belongs to this
+ * thread. This is needed to guard against parallel unregistration.
+ */
+ refcount_set(&device->refcount, 2);
+ down_write(&devices_rwsem);
+ xa_set_mark(&devices, device->index, DEVICE_REGISTERED);
+
+ /*
+ * By using downgrade_write() we ensure that no other thread can clear
+ * DEVICE_REGISTERED while we are completing the client setup.
+ */
+ downgrade_write(&devices_rwsem);
+
+ if (device->ops.enable_driver) {
+ ret = device->ops.enable_driver(device);
+ if (ret)
+ goto out;
}
- ret = ib_cache_setup_one(device);
- if (ret) {
- dev_warn(&device->dev,
- "Couldn't set up InfiniBand P_Key/GID cache\n");
- goto pkey_cleanup;
+ down_read(&clients_rwsem);
+ xa_for_each_marked (&clients, index, client, CLIENT_REGISTERED) {
+ ret = add_client_context(device, client);
+ if (ret)
+ break;
}
- return 0;
+ up_read(&clients_rwsem);
-pkey_cleanup:
- kfree(device->port_pkey_list);
-port_cleanup:
- kfree(device->port_immutable);
+out:
+ up_read(&devices_rwsem);
return ret;
}
@@ -579,133 +830,253 @@ port_cleanup:
* devices with the IB core. All registered clients will receive a
* callback for each device that is added. @device must be allocated
* with ib_alloc_device().
+ *
+ * If the driver uses ops.dealloc_driver and calls any ib_unregister_device()
+ * asynchronously then the device pointer may become freed as soon as this
+ * function returns.
*/
-int ib_register_device(struct ib_device *device, const char *name,
- int (*port_callback)(struct ib_device *, u8,
- struct kobject *))
+int ib_register_device(struct ib_device *device, const char *name)
{
int ret;
- struct ib_client *client;
-
- setup_dma_device(device);
-
- mutex_lock(&device_mutex);
- if (strchr(name, '%')) {
- ret = alloc_name(device, name);
- if (ret)
- goto out;
- } else {
- ret = dev_set_name(&device->dev, name);
- if (ret)
- goto out;
- }
- if (__ib_device_get_by_name(dev_name(&device->dev))) {
- ret = -ENFILE;
- goto out;
- }
- strlcpy(device->name, dev_name(&device->dev), IB_DEVICE_NAME_MAX);
+ ret = assign_name(device, name);
+ if (ret)
+ return ret;
ret = setup_device(device);
if (ret)
- goto out;
-
- device->index = __dev_new_index();
+ return ret;
- ret = ib_device_register_rdmacg(device);
+ ret = ib_cache_setup_one(device);
if (ret) {
dev_warn(&device->dev,
- "Couldn't register device with rdma cgroup\n");
- goto dev_cleanup;
+ "Couldn't set up InfiniBand P_Key/GID cache\n");
+ return ret;
}
- ret = ib_device_register_sysfs(device, port_callback);
+ ib_device_register_rdmacg(device);
+
+ ret = device_add(&device->dev);
+ if (ret)
+ goto cg_cleanup;
+
+ ret = ib_device_register_sysfs(device);
if (ret) {
dev_warn(&device->dev,
"Couldn't register device with driver model\n");
- goto cg_cleanup;
+ goto dev_cleanup;
}
- refcount_set(&device->refcount, 1);
- device->reg_state = IB_DEV_REGISTERED;
+ ret = enable_device_and_get(device);
+ if (ret) {
+ void (*dealloc_fn)(struct ib_device *);
- list_for_each_entry(client, &client_list, list)
- if (!add_client_context(device, client) && client->add)
- client->add(device);
+ /*
+ * If we hit this error flow then we don't want to
+ * automatically dealloc the device since the caller is
+ * expected to call ib_dealloc_device() after
+ * ib_register_device() fails. This is tricky due to the
+ * possibility for a parallel unregistration along with this
+ * error flow. Since we have a refcount here we know any
+ * parallel flow is stopped in disable_device and will see the
+ * NULL pointers, causing the responsibility to
+ * ib_dealloc_device() to revert back to this thread.
+ */
+ dealloc_fn = device->ops.dealloc_driver;
+ device->ops.dealloc_driver = NULL;
+ ib_device_put(device);
+ __ib_unregister_device(device);
+ device->ops.dealloc_driver = dealloc_fn;
+ return ret;
+ }
+ ib_device_put(device);
- down_write(&lists_rwsem);
- list_add_tail(&device->core_list, &device_list);
- up_write(&lists_rwsem);
- mutex_unlock(&device_mutex);
return 0;
+dev_cleanup:
+ device_del(&device->dev);
cg_cleanup:
ib_device_unregister_rdmacg(device);
-dev_cleanup:
- cleanup_device(device);
-out:
- mutex_unlock(&device_mutex);
+ ib_cache_cleanup_one(device);
return ret;
}
EXPORT_SYMBOL(ib_register_device);
+/* Callers must hold a get on the device. */
+static void __ib_unregister_device(struct ib_device *ib_dev)
+{
+ /*
+ * We have a registration lock so that all the calls to unregister are
+ * fully fenced, once any unregister returns the device is truely
+ * unregistered even if multiple callers are unregistering it at the
+ * same time. This also interacts with the registration flow and
+ * provides sane semantics if register and unregister are racing.
+ */
+ mutex_lock(&ib_dev->unregistration_lock);
+ if (!refcount_read(&ib_dev->refcount))
+ goto out;
+
+ disable_device(ib_dev);
+ ib_device_unregister_sysfs(ib_dev);
+ device_del(&ib_dev->dev);
+ ib_device_unregister_rdmacg(ib_dev);
+ ib_cache_cleanup_one(ib_dev);
+
+ /*
+ * Drivers using the new flow may not call ib_dealloc_device except
+ * in error unwind prior to registration success.
+ */
+ if (ib_dev->ops.dealloc_driver) {
+ WARN_ON(kref_read(&ib_dev->dev.kobj.kref) <= 1);
+ ib_dealloc_device(ib_dev);
+ }
+out:
+ mutex_unlock(&ib_dev->unregistration_lock);
+}
+
/**
* ib_unregister_device - Unregister an IB device
- * @device:Device to unregister
+ * @device: The device to unregister
*
* Unregister an IB device. All clients will receive a remove callback.
+ *
+ * Callers should call this routine only once, and protect against races with
+ * registration. Typically it should only be called as part of a remove
+ * callback in an implementation of driver core's struct device_driver and
+ * related.
+ *
+ * If ops.dealloc_driver is used then ib_dev will be freed upon return from
+ * this function.
*/
-void ib_unregister_device(struct ib_device *device)
+void ib_unregister_device(struct ib_device *ib_dev)
{
- struct ib_client_data *context, *tmp;
- unsigned long flags;
+ get_device(&ib_dev->dev);
+ __ib_unregister_device(ib_dev);
+ put_device(&ib_dev->dev);
+}
+EXPORT_SYMBOL(ib_unregister_device);
- /*
- * Wait for all netlink command callers to finish working on the
- * device.
- */
- ib_device_put(device);
- wait_for_completion(&device->unreg_completion);
+/**
+ * ib_unregister_device_and_put - Unregister a device while holding a 'get'
+ * device: The device to unregister
+ *
+ * This is the same as ib_unregister_device(), except it includes an internal
+ * ib_device_put() that should match a 'get' obtained by the caller.
+ *
+ * It is safe to call this routine concurrently from multiple threads while
+ * holding the 'get'. When the function returns the device is fully
+ * unregistered.
+ *
+ * Drivers using this flow MUST use the driver_unregister callback to clean up
+ * their resources associated with the device and dealloc it.
+ */
+void ib_unregister_device_and_put(struct ib_device *ib_dev)
+{
+ WARN_ON(!ib_dev->ops.dealloc_driver);
+ get_device(&ib_dev->dev);
+ ib_device_put(ib_dev);
+ __ib_unregister_device(ib_dev);
+ put_device(&ib_dev->dev);
+}
+EXPORT_SYMBOL(ib_unregister_device_and_put);
- mutex_lock(&device_mutex);
+/**
+ * ib_unregister_driver - Unregister all IB devices for a driver
+ * @driver_id: The driver to unregister
+ *
+ * This implements a fence for device unregistration. It only returns once all
+ * devices associated with the driver_id have fully completed their
+ * unregistration and returned from ib_unregister_device*().
+ *
+ * If device's are not yet unregistered it goes ahead and starts unregistering
+ * them.
+ *
+ * This does not block creation of new devices with the given driver_id, that
+ * is the responsibility of the caller.
+ */
+void ib_unregister_driver(enum rdma_driver_id driver_id)
+{
+ struct ib_device *ib_dev;
+ unsigned long index;
- down_write(&lists_rwsem);
- list_del(&device->core_list);
- write_lock_irq(&device->client_data_lock);
- list_for_each_entry(context, &device->client_data_list, list)
- context->going_down = true;
- write_unlock_irq(&device->client_data_lock);
- downgrade_write(&lists_rwsem);
+ down_read(&devices_rwsem);
+ xa_for_each (&devices, index, ib_dev) {
+ if (ib_dev->driver_id != driver_id)
+ continue;
- list_for_each_entry(context, &device->client_data_list, list) {
- if (context->client->remove)
- context->client->remove(device, context->data);
+ get_device(&ib_dev->dev);
+ up_read(&devices_rwsem);
+
+ WARN_ON(!ib_dev->ops.dealloc_driver);
+ __ib_unregister_device(ib_dev);
+
+ put_device(&ib_dev->dev);
+ down_read(&devices_rwsem);
}
- up_read(&lists_rwsem);
+ up_read(&devices_rwsem);
+}
+EXPORT_SYMBOL(ib_unregister_driver);
- ib_device_unregister_sysfs(device);
- ib_device_unregister_rdmacg(device);
+static void ib_unregister_work(struct work_struct *work)
+{
+ struct ib_device *ib_dev =
+ container_of(work, struct ib_device, unregistration_work);
- mutex_unlock(&device_mutex);
+ __ib_unregister_device(ib_dev);
+ put_device(&ib_dev->dev);
+}
- ib_cache_cleanup_one(device);
+/**
+ * ib_unregister_device_queued - Unregister a device using a work queue
+ * device: The device to unregister
+ *
+ * This schedules an asynchronous unregistration using a WQ for the device. A
+ * driver should use this to avoid holding locks while doing unregistration,
+ * such as holding the RTNL lock.
+ *
+ * Drivers using this API must use ib_unregister_driver before module unload
+ * to ensure that all scheduled unregistrations have completed.
+ */
+void ib_unregister_device_queued(struct ib_device *ib_dev)
+{
+ WARN_ON(!refcount_read(&ib_dev->refcount));
+ WARN_ON(!ib_dev->ops.dealloc_driver);
+ get_device(&ib_dev->dev);
+ if (!queue_work(system_unbound_wq, &ib_dev->unregistration_work))
+ put_device(&ib_dev->dev);
+}
+EXPORT_SYMBOL(ib_unregister_device_queued);
- ib_security_destroy_port_pkey_list(device);
- kfree(device->port_pkey_list);
+static int assign_client_id(struct ib_client *client)
+{
+ int ret;
- down_write(&lists_rwsem);
- write_lock_irqsave(&device->client_data_lock, flags);
- list_for_each_entry_safe(context, tmp, &device->client_data_list,
- list) {
- list_del(&context->list);
- kfree(context);
- }
- write_unlock_irqrestore(&device->client_data_lock, flags);
- up_write(&lists_rwsem);
+ down_write(&clients_rwsem);
+ /*
+ * The add/remove callbacks must be called in FIFO/LIFO order. To
+ * achieve this we assign client_ids so they are sorted in
+ * registration order, and retain a linked list we can reverse iterate
+ * to get the LIFO order. The extra linked list can go away if xarray
+ * learns to reverse iterate.
+ */
+ if (list_empty(&client_list))
+ client->client_id = 0;
+ else
+ client->client_id =
+ list_last_entry(&client_list, struct ib_client, list)
+ ->client_id;
+ ret = xa_alloc(&clients, &client->client_id, INT_MAX, client,
+ GFP_KERNEL);
+ if (ret)
+ goto out;
+
+ xa_set_mark(&clients, client->client_id, CLIENT_REGISTERED);
+ list_add_tail(&client->list, &client_list);
- device->reg_state = IB_DEV_UNREGISTERED;
+out:
+ up_write(&clients_rwsem);
+ return ret;
}
-EXPORT_SYMBOL(ib_unregister_device);
/**
* ib_register_client - Register an IB client
@@ -723,19 +1094,23 @@ EXPORT_SYMBOL(ib_unregister_device);
int ib_register_client(struct ib_client *client)
{
struct ib_device *device;
+ unsigned long index;
+ int ret;
- mutex_lock(&device_mutex);
-
- list_for_each_entry(device, &device_list, core_list)
- if (!add_client_context(device, client) && client->add)
- client->add(device);
-
- down_write(&lists_rwsem);
- list_add_tail(&client->list, &client_list);
- up_write(&lists_rwsem);
-
- mutex_unlock(&device_mutex);
+ ret = assign_client_id(client);
+ if (ret)
+ return ret;
+ down_read(&devices_rwsem);
+ xa_for_each_marked (&devices, index, device, DEVICE_REGISTERED) {
+ ret = add_client_context(device, client);
+ if (ret) {
+ up_read(&devices_rwsem);
+ ib_unregister_client(client);
+ return ret;
+ }
+ }
+ up_read(&devices_rwsem);
return 0;
}
EXPORT_SYMBOL(ib_register_client);
@@ -747,108 +1122,56 @@ EXPORT_SYMBOL(ib_register_client);
* Upper level users use ib_unregister_client() to remove their client
* registration. When ib_unregister_client() is called, the client
* will receive a remove callback for each IB device still registered.
+ *
+ * This is a full fence, once it returns no client callbacks will be called,
+ * or are running in another thread.
*/
void ib_unregister_client(struct ib_client *client)
{
- struct ib_client_data *context;
struct ib_device *device;
+ unsigned long index;
- mutex_lock(&device_mutex);
+ down_write(&clients_rwsem);
+ xa_clear_mark(&clients, client->client_id, CLIENT_REGISTERED);
+ up_write(&clients_rwsem);
+ /*
+ * Every device still known must be serialized to make sure we are
+ * done with the client callbacks before we return.
+ */
+ down_read(&devices_rwsem);
+ xa_for_each (&devices, index, device)
+ remove_client_context(device, client->client_id);
+ up_read(&devices_rwsem);
- down_write(&lists_rwsem);
+ down_write(&clients_rwsem);
list_del(&client->list);
- up_write(&lists_rwsem);
-
- list_for_each_entry(device, &device_list, core_list) {
- struct ib_client_data *found_context = NULL;
-
- down_write(&lists_rwsem);
- write_lock_irq(&device->client_data_lock);
- list_for_each_entry(context, &device->client_data_list, list)
- if (context->client == client) {
- context->going_down = true;
- found_context = context;
- break;
- }
- write_unlock_irq(&device->client_data_lock);
- up_write(&lists_rwsem);
-
- if (client->remove)
- client->remove(device, found_context ?
- found_context->data : NULL);
-
- if (!found_context) {
- dev_warn(&device->dev,
- "No client context found for %s\n",
- client->name);
- continue;
- }
-
- down_write(&lists_rwsem);
- write_lock_irq(&device->client_data_lock);
- list_del(&found_context->list);
- write_unlock_irq(&device->client_data_lock);
- up_write(&lists_rwsem);
- kfree(found_context);
- }
-
- mutex_unlock(&device_mutex);
+ xa_erase(&clients, client->client_id);
+ up_write(&clients_rwsem);
}
EXPORT_SYMBOL(ib_unregister_client);
/**
- * ib_get_client_data - Get IB client context
- * @device:Device to get context for
- * @client:Client to get context for
- *
- * ib_get_client_data() returns client context set with
- * ib_set_client_data().
- */
-void *ib_get_client_data(struct ib_device *device, struct ib_client *client)
-{
- struct ib_client_data *context;
- void *ret = NULL;
- unsigned long flags;
-
- read_lock_irqsave(&device->client_data_lock, flags);
- list_for_each_entry(context, &device->client_data_list, list)
- if (context->client == client) {
- ret = context->data;
- break;
- }
- read_unlock_irqrestore(&device->client_data_lock, flags);
-
- return ret;
-}
-EXPORT_SYMBOL(ib_get_client_data);
-
-/**
* ib_set_client_data - Set IB client context
* @device:Device to set context for
* @client:Client to set context for
* @data:Context to set
*
- * ib_set_client_data() sets client context that can be retrieved with
- * ib_get_client_data().
+ * ib_set_client_data() sets client context data that can be retrieved with
+ * ib_get_client_data(). This can only be called while the client is
+ * registered to the device, once the ib_client remove() callback returns this
+ * cannot be called.
*/
void ib_set_client_data(struct ib_device *device, struct ib_client *client,
void *data)
{
- struct ib_client_data *context;
- unsigned long flags;
-
- write_lock_irqsave(&device->client_data_lock, flags);
- list_for_each_entry(context, &device->client_data_list, list)
- if (context->client == client) {
- context->data = data;
- goto out;
- }
+ void *rc;
- dev_warn(&device->dev, "No client context found for %s\n",
- client->name);
+ if (WARN_ON(IS_ERR(data)))
+ data = NULL;
-out:
- write_unlock_irqrestore(&device->client_data_lock, flags);
+ rc = xa_store(&device->client_data, client->client_id, data,
+ GFP_KERNEL);
+ WARN_ON(xa_is_err(rc));
}
EXPORT_SYMBOL(ib_set_client_data);
@@ -947,6 +1270,185 @@ int ib_query_port(struct ib_device *device,
}
EXPORT_SYMBOL(ib_query_port);
+static void add_ndev_hash(struct ib_port_data *pdata)
+{
+ unsigned long flags;
+
+ might_sleep();
+
+ spin_lock_irqsave(&ndev_hash_lock, flags);
+ if (hash_hashed(&pdata->ndev_hash_link)) {
+ hash_del_rcu(&pdata->ndev_hash_link);
+ spin_unlock_irqrestore(&ndev_hash_lock, flags);
+ /*
+ * We cannot do hash_add_rcu after a hash_del_rcu until the
+ * grace period
+ */
+ synchronize_rcu();
+ spin_lock_irqsave(&ndev_hash_lock, flags);
+ }
+ if (pdata->netdev)
+ hash_add_rcu(ndev_hash, &pdata->ndev_hash_link,
+ (uintptr_t)pdata->netdev);
+ spin_unlock_irqrestore(&ndev_hash_lock, flags);
+}
+
+/**
+ * ib_device_set_netdev - Associate the ib_dev with an underlying net_device
+ * @ib_dev: Device to modify
+ * @ndev: net_device to affiliate, may be NULL
+ * @port: IB port the net_device is connected to
+ *
+ * Drivers should use this to link the ib_device to a netdev so the netdev
+ * shows up in interfaces like ib_enum_roce_netdev. Only one netdev may be
+ * affiliated with any port.
+ *
+ * The caller must ensure that the given ndev is not unregistered or
+ * unregistering, and that either the ib_device is unregistered or
+ * ib_device_set_netdev() is called with NULL when the ndev sends a
+ * NETDEV_UNREGISTER event.
+ */
+int ib_device_set_netdev(struct ib_device *ib_dev, struct net_device *ndev,
+ unsigned int port)
+{
+ struct net_device *old_ndev;
+ struct ib_port_data *pdata;
+ unsigned long flags;
+ int ret;
+
+ /*
+ * Drivers wish to call this before ib_register_driver, so we have to
+ * setup the port data early.
+ */
+ ret = alloc_port_data(ib_dev);
+ if (ret)
+ return ret;
+
+ if (!rdma_is_port_valid(ib_dev, port))
+ return -EINVAL;
+
+ pdata = &ib_dev->port_data[port];
+ spin_lock_irqsave(&pdata->netdev_lock, flags);
+ old_ndev = rcu_dereference_protected(
+ pdata->netdev, lockdep_is_held(&pdata->netdev_lock));
+ if (old_ndev == ndev) {
+ spin_unlock_irqrestore(&pdata->netdev_lock, flags);
+ return 0;
+ }
+
+ if (ndev)
+ dev_hold(ndev);
+ rcu_assign_pointer(pdata->netdev, ndev);
+ spin_unlock_irqrestore(&pdata->netdev_lock, flags);
+
+ add_ndev_hash(pdata);
+ if (old_ndev)
+ dev_put(old_ndev);
+
+ return 0;
+}
+EXPORT_SYMBOL(ib_device_set_netdev);
+
+static void free_netdevs(struct ib_device *ib_dev)
+{
+ unsigned long flags;
+ unsigned int port;
+
+ rdma_for_each_port (ib_dev, port) {
+ struct ib_port_data *pdata = &ib_dev->port_data[port];
+ struct net_device *ndev;
+
+ spin_lock_irqsave(&pdata->netdev_lock, flags);
+ ndev = rcu_dereference_protected(
+ pdata->netdev, lockdep_is_held(&pdata->netdev_lock));
+ if (ndev) {
+ spin_lock(&ndev_hash_lock);
+ hash_del_rcu(&pdata->ndev_hash_link);
+ spin_unlock(&ndev_hash_lock);
+
+ /*
+ * If this is the last dev_put there is still a
+ * synchronize_rcu before the netdev is kfreed, so we
+ * can continue to rely on unlocked pointer
+ * comparisons after the put
+ */
+ rcu_assign_pointer(pdata->netdev, NULL);
+ dev_put(ndev);
+ }
+ spin_unlock_irqrestore(&pdata->netdev_lock, flags);
+ }
+}
+
+struct net_device *ib_device_get_netdev(struct ib_device *ib_dev,
+ unsigned int port)
+{
+ struct ib_port_data *pdata;
+ struct net_device *res;
+
+ if (!rdma_is_port_valid(ib_dev, port))
+ return NULL;
+
+ pdata = &ib_dev->port_data[port];
+
+ /*
+ * New drivers should use ib_device_set_netdev() not the legacy
+ * get_netdev().
+ */
+ if (ib_dev->ops.get_netdev)
+ res = ib_dev->ops.get_netdev(ib_dev, port);
+ else {
+ spin_lock(&pdata->netdev_lock);
+ res = rcu_dereference_protected(
+ pdata->netdev, lockdep_is_held(&pdata->netdev_lock));
+ if (res)
+ dev_hold(res);
+ spin_unlock(&pdata->netdev_lock);
+ }
+
+ /*
+ * If we are starting to unregister expedite things by preventing
+ * propagation of an unregistering netdev.
+ */
+ if (res && res->reg_state != NETREG_REGISTERED) {
+ dev_put(res);
+ return NULL;
+ }
+
+ return res;
+}
+
+/**
+ * ib_device_get_by_netdev - Find an IB device associated with a netdev
+ * @ndev: netdev to locate
+ * @driver_id: The driver ID that must match (RDMA_DRIVER_UNKNOWN matches all)
+ *
+ * Find and hold an ib_device that is associated with a netdev via
+ * ib_device_set_netdev(). The caller must call ib_device_put() on the
+ * returned pointer.
+ */
+struct ib_device *ib_device_get_by_netdev(struct net_device *ndev,
+ enum rdma_driver_id driver_id)
+{
+ struct ib_device *res = NULL;
+ struct ib_port_data *cur;
+
+ rcu_read_lock();
+ hash_for_each_possible_rcu (ndev_hash, cur, ndev_hash_link,
+ (uintptr_t)ndev) {
+ if (rcu_access_pointer(cur->netdev) == ndev &&
+ (driver_id == RDMA_DRIVER_UNKNOWN ||
+ cur->ib_dev->driver_id == driver_id) &&
+ ib_device_try_get(cur->ib_dev)) {
+ res = cur->ib_dev;
+ break;
+ }
+ }
+ rcu_read_unlock();
+
+ return res;
+}
+EXPORT_SYMBOL(ib_device_get_by_netdev);
+
/**
* ib_enum_roce_netdev - enumerate all RoCE ports
* @ib_dev : IB device we want to query
@@ -965,21 +1467,12 @@ void ib_enum_roce_netdev(struct ib_device *ib_dev,
roce_netdev_callback cb,
void *cookie)
{
- u8 port;
+ unsigned int port;
- for (port = rdma_start_port(ib_dev); port <= rdma_end_port(ib_dev);
- port++)
+ rdma_for_each_port (ib_dev, port)
if (rdma_protocol_roce(ib_dev, port)) {
- struct net_device *idev = NULL;
-
- if (ib_dev->ops.get_netdev)
- idev = ib_dev->ops.get_netdev(ib_dev, port);
-
- if (idev &&
- idev->reg_state >= NETREG_UNREGISTERED) {
- dev_put(idev);
- idev = NULL;
- }
+ struct net_device *idev =
+ ib_device_get_netdev(ib_dev, port);
if (filter(ib_dev, port, idev, filter_cookie))
cb(ib_dev, port, idev, cookie);
@@ -1006,11 +1499,12 @@ void ib_enum_all_roce_netdevs(roce_netdev_filter filter,
void *cookie)
{
struct ib_device *dev;
+ unsigned long index;
- down_read(&lists_rwsem);
- list_for_each_entry(dev, &device_list, core_list)
+ down_read(&devices_rwsem);
+ xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED)
ib_enum_roce_netdev(dev, filter, filter_cookie, cb, cookie);
- up_read(&lists_rwsem);
+ up_read(&devices_rwsem);
}
/**
@@ -1022,19 +1516,19 @@ void ib_enum_all_roce_netdevs(roce_netdev_filter filter,
int ib_enum_all_devs(nldev_callback nldev_cb, struct sk_buff *skb,
struct netlink_callback *cb)
{
+ unsigned long index;
struct ib_device *dev;
unsigned int idx = 0;
int ret = 0;
- down_read(&lists_rwsem);
- list_for_each_entry(dev, &device_list, core_list) {
+ down_read(&devices_rwsem);
+ xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) {
ret = nldev_cb(dev, skb, cb, idx);
if (ret)
break;
idx++;
}
-
- up_read(&lists_rwsem);
+ up_read(&devices_rwsem);
return ret;
}
@@ -1121,13 +1615,15 @@ int ib_find_gid(struct ib_device *device, union ib_gid *gid,
u8 *port_num, u16 *index)
{
union ib_gid tmp_gid;
- int ret, port, i;
+ unsigned int port;
+ int ret, i;
- for (port = rdma_start_port(device); port <= rdma_end_port(device); ++port) {
+ rdma_for_each_port (device, port) {
if (!rdma_protocol_ib(device, port))
continue;
- for (i = 0; i < device->port_immutable[port].gid_tbl_len; ++i) {
+ for (i = 0; i < device->port_data[port].immutable.gid_tbl_len;
+ ++i) {
ret = rdma_query_gid(device, port, i, &tmp_gid);
if (ret)
return ret;
@@ -1159,7 +1655,8 @@ int ib_find_pkey(struct ib_device *device,
u16 tmp_pkey;
int partial_ix = -1;
- for (i = 0; i < device->port_immutable[port_num].pkey_tbl_len; ++i) {
+ for (i = 0; i < device->port_data[port_num].immutable.pkey_tbl_len;
+ ++i) {
ret = ib_query_pkey(device, port_num, i, &tmp_pkey);
if (ret)
return ret;
@@ -1192,6 +1689,7 @@ EXPORT_SYMBOL(ib_find_pkey);
* @gid: A GID that the net_dev uses to communicate.
* @addr: Contains the IP address that the request specified as its
* destination.
+ *
*/
struct net_device *ib_get_net_dev_by_params(struct ib_device *dev,
u8 port,
@@ -1200,29 +1698,30 @@ struct net_device *ib_get_net_dev_by_params(struct ib_device *dev,
const struct sockaddr *addr)
{
struct net_device *net_dev = NULL;
- struct ib_client_data *context;
+ unsigned long index;
+ void *client_data;
if (!rdma_protocol_ib(dev, port))
return NULL;
- down_read(&lists_rwsem);
-
- list_for_each_entry(context, &dev->client_data_list, list) {
- struct ib_client *client = context->client;
+ /*
+ * Holding the read side guarantees that the client will not become
+ * unregistered while we are calling get_net_dev_by_params()
+ */
+ down_read(&dev->client_data_rwsem);
+ xan_for_each_marked (&dev->client_data, index, client_data,
+ CLIENT_DATA_REGISTERED) {
+ struct ib_client *client = xa_load(&clients, index);
- if (context->going_down)
+ if (!client || !client->get_net_dev_by_params)
continue;
- if (client->get_net_dev_by_params) {
- net_dev = client->get_net_dev_by_params(dev, port, pkey,
- gid, addr,
- context->data);
- if (net_dev)
- break;
- }
+ net_dev = client->get_net_dev_by_params(dev, port, pkey, gid,
+ addr, client_data);
+ if (net_dev)
+ break;
}
-
- up_read(&lists_rwsem);
+ up_read(&dev->client_data_rwsem);
return net_dev;
}
@@ -1238,6 +1737,8 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops)
(ptr)->name = ops->name; \
} while (0)
+#define SET_OBJ_SIZE(ptr, name) SET_DEVICE_OP(ptr, size_##name)
+
SET_DEVICE_OP(dev_ops, add_gid);
SET_DEVICE_OP(dev_ops, advise_mr);
SET_DEVICE_OP(dev_ops, alloc_dm);
@@ -1261,6 +1762,7 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops)
SET_DEVICE_OP(dev_ops, create_srq);
SET_DEVICE_OP(dev_ops, create_wq);
SET_DEVICE_OP(dev_ops, dealloc_dm);
+ SET_DEVICE_OP(dev_ops, dealloc_driver);
SET_DEVICE_OP(dev_ops, dealloc_fmr);
SET_DEVICE_OP(dev_ops, dealloc_mw);
SET_DEVICE_OP(dev_ops, dealloc_pd);
@@ -1281,6 +1783,8 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops)
SET_DEVICE_OP(dev_ops, disassociate_ucontext);
SET_DEVICE_OP(dev_ops, drain_rq);
SET_DEVICE_OP(dev_ops, drain_sq);
+ SET_DEVICE_OP(dev_ops, enable_driver);
+ SET_DEVICE_OP(dev_ops, fill_res_entry);
SET_DEVICE_OP(dev_ops, get_dev_fw_str);
SET_DEVICE_OP(dev_ops, get_dma_mr);
SET_DEVICE_OP(dev_ops, get_hw_stats);
@@ -1290,6 +1794,7 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops)
SET_DEVICE_OP(dev_ops, get_vector_affinity);
SET_DEVICE_OP(dev_ops, get_vf_config);
SET_DEVICE_OP(dev_ops, get_vf_stats);
+ SET_DEVICE_OP(dev_ops, init_port);
SET_DEVICE_OP(dev_ops, map_mr_sg);
SET_DEVICE_OP(dev_ops, map_phys_fmr);
SET_DEVICE_OP(dev_ops, mmap);
@@ -1325,6 +1830,9 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops)
SET_DEVICE_OP(dev_ops, set_vf_guid);
SET_DEVICE_OP(dev_ops, set_vf_link_state);
SET_DEVICE_OP(dev_ops, unmap_fmr);
+
+ SET_OBJ_SIZE(dev_ops, ib_pd);
+ SET_OBJ_SIZE(dev_ops, ib_ucontext);
}
EXPORT_SYMBOL(ib_set_device_ops);
@@ -1443,6 +1951,9 @@ static void __exit ib_core_cleanup(void)
destroy_workqueue(ib_comp_wq);
/* Make sure that any pending umem accounting work is done. */
destroy_workqueue(ib_wq);
+ flush_workqueue(system_unbound_wq);
+ WARN_ON(!xa_empty(&clients));
+ WARN_ON(!xa_empty(&devices));
}
MODULE_ALIAS_RDMA_NETLINK(RDMA_NL_LS, 4);
diff --git a/drivers/infiniband/core/iwcm.c b/drivers/infiniband/core/iwcm.c
index 476abc74178e..732637c913d9 100644
--- a/drivers/infiniband/core/iwcm.c
+++ b/drivers/infiniband/core/iwcm.c
@@ -87,7 +87,8 @@ static struct rdma_nl_cbs iwcm_nl_cb_table[RDMA_NL_IWPM_NUM_OPS] = {
[RDMA_NL_IWPM_REMOTE_INFO] = {.dump = iwpm_remote_info_cb},
[RDMA_NL_IWPM_HANDLE_ERR] = {.dump = iwpm_mapping_error_cb},
[RDMA_NL_IWPM_MAPINFO] = {.dump = iwpm_mapping_info_cb},
- [RDMA_NL_IWPM_MAPINFO_NUM] = {.dump = iwpm_ack_mapping_info_cb}
+ [RDMA_NL_IWPM_MAPINFO_NUM] = {.dump = iwpm_ack_mapping_info_cb},
+ [RDMA_NL_IWPM_HELLO] = {.dump = iwpm_hello_cb}
};
static struct workqueue_struct *iwcm_wq;
@@ -504,7 +505,7 @@ static int iw_cm_map(struct iw_cm_id *cm_id, bool active)
{
const char *devname = dev_name(&cm_id->device->dev);
const char *ifname = cm_id->device->iwcm->ifname;
- struct iwpm_dev_data pm_reg_msg;
+ struct iwpm_dev_data pm_reg_msg = {};
struct iwpm_sa_data pm_msg;
int status;
@@ -515,8 +516,8 @@ static int iw_cm_map(struct iw_cm_id *cm_id, bool active)
cm_id->m_local_addr = cm_id->local_addr;
cm_id->m_remote_addr = cm_id->remote_addr;
- strncpy(pm_reg_msg.dev_name, devname, sizeof(pm_reg_msg.dev_name));
- strncpy(pm_reg_msg.if_name, ifname, sizeof(pm_reg_msg.if_name));
+ strcpy(pm_reg_msg.dev_name, devname);
+ strcpy(pm_reg_msg.if_name, ifname);
if (iwpm_register_pid(&pm_reg_msg, RDMA_NL_IWCM) ||
!iwpm_valid_pid())
@@ -525,6 +526,8 @@ static int iw_cm_map(struct iw_cm_id *cm_id, bool active)
cm_id->mapped = true;
pm_msg.loc_addr = cm_id->local_addr;
pm_msg.rem_addr = cm_id->remote_addr;
+ pm_msg.flags = (cm_id->device->iwcm->driver_flags & IW_F_NO_PORT_MAP) ?
+ IWPM_FLAGS_NO_PORT_MAP : 0;
if (active)
status = iwpm_add_and_query_mapping(&pm_msg,
RDMA_NL_IWCM);
@@ -543,7 +546,7 @@ static int iw_cm_map(struct iw_cm_id *cm_id, bool active)
return iwpm_create_mapinfo(&cm_id->local_addr,
&cm_id->m_local_addr,
- RDMA_NL_IWCM);
+ RDMA_NL_IWCM, pm_msg.flags);
}
/*
diff --git a/drivers/infiniband/core/iwpm_msg.c b/drivers/infiniband/core/iwpm_msg.c
index 8861c052155a..2452b0ddcf0d 100644
--- a/drivers/infiniband/core/iwpm_msg.c
+++ b/drivers/infiniband/core/iwpm_msg.c
@@ -34,18 +34,25 @@
#include "iwpm_util.h"
static const char iwpm_ulib_name[IWPM_ULIBNAME_SIZE] = "iWarpPortMapperUser";
-static int iwpm_ulib_version = 3;
+u16 iwpm_ulib_version = IWPM_UABI_VERSION_MIN;
static int iwpm_user_pid = IWPM_PID_UNDEFINED;
static atomic_t echo_nlmsg_seq;
+/**
+ * iwpm_valid_pid - Check if the userspace iwarp port mapper pid is valid
+ *
+ * Returns true if the pid is greater than zero, otherwise returns false
+ */
int iwpm_valid_pid(void)
{
return iwpm_user_pid > 0;
}
-/*
- * iwpm_register_pid - Send a netlink query to user space
- * for the iwarp port mapper pid
+/**
+ * iwpm_register_pid - Send a netlink query to userspace
+ * to get the iwarp port mapper pid
+ * @pm_msg: Contains driver info to send to the userspace port mapper
+ * @nl_client: The index of the netlink client
*
* nlmsg attributes:
* [IWPM_NLA_REG_PID_SEQ]
@@ -124,12 +131,19 @@ pid_query_error:
return ret;
}
-/*
- * iwpm_add_mapping - Send a netlink add mapping message
- * to the port mapper
+/**
+ * iwpm_add_mapping - Send a netlink add mapping request to
+ * the userspace port mapper
+ * @pm_msg: Contains the local ip/tcp address info to send
+ * @nl_client: The index of the netlink client
+ *
* nlmsg attributes:
* [IWPM_NLA_MANAGE_MAPPING_SEQ]
* [IWPM_NLA_MANAGE_ADDR]
+ * [IWPM_NLA_MANAGE_FLAGS]
+ *
+ * If the request is successful, the pm_msg stores
+ * the port mapper response (mapped address info)
*/
int iwpm_add_mapping(struct iwpm_sa_data *pm_msg, u8 nl_client)
{
@@ -173,6 +187,18 @@ int iwpm_add_mapping(struct iwpm_sa_data *pm_msg, u8 nl_client)
if (ret)
goto add_mapping_error;
+ /* If flags are required and we're not V4, then return a quiet error */
+ if (pm_msg->flags && iwpm_ulib_version == IWPM_UABI_VERSION_MIN) {
+ ret = -EINVAL;
+ goto add_mapping_error_nowarn;
+ }
+ if (iwpm_ulib_version > IWPM_UABI_VERSION_MIN) {
+ ret = ibnl_put_attr(skb, nlh, sizeof(u32), &pm_msg->flags,
+ IWPM_NLA_MANAGE_FLAGS);
+ if (ret)
+ goto add_mapping_error;
+ }
+
nlmsg_end(skb, nlh);
nlmsg_request->req_buffer = pm_msg;
@@ -187,6 +213,7 @@ int iwpm_add_mapping(struct iwpm_sa_data *pm_msg, u8 nl_client)
return ret;
add_mapping_error:
pr_info("%s: %s (client = %d)\n", __func__, err_str, nl_client);
+add_mapping_error_nowarn:
if (skb)
dev_kfree_skb(skb);
if (nlmsg_request)
@@ -194,13 +221,17 @@ add_mapping_error:
return ret;
}
-/*
- * iwpm_add_and_query_mapping - Send a netlink add and query
- * mapping message to the port mapper
+/**
+ * iwpm_add_and_query_mapping - Process the port mapper response to
+ * iwpm_add_and_query_mapping request
+ * @pm_msg: Contains the local ip/tcp address info to send
+ * @nl_client: The index of the netlink client
+ *
* nlmsg attributes:
* [IWPM_NLA_QUERY_MAPPING_SEQ]
* [IWPM_NLA_QUERY_LOCAL_ADDR]
* [IWPM_NLA_QUERY_REMOTE_ADDR]
+ * [IWPM_NLA_QUERY_FLAGS]
*/
int iwpm_add_and_query_mapping(struct iwpm_sa_data *pm_msg, u8 nl_client)
{
@@ -251,6 +282,18 @@ int iwpm_add_and_query_mapping(struct iwpm_sa_data *pm_msg, u8 nl_client)
if (ret)
goto query_mapping_error;
+ /* If flags are required and we're not V4, then return a quite error */
+ if (pm_msg->flags && iwpm_ulib_version == IWPM_UABI_VERSION_MIN) {
+ ret = -EINVAL;
+ goto query_mapping_error_nowarn;
+ }
+ if (iwpm_ulib_version > IWPM_UABI_VERSION_MIN) {
+ ret = ibnl_put_attr(skb, nlh, sizeof(u32), &pm_msg->flags,
+ IWPM_NLA_QUERY_FLAGS);
+ if (ret)
+ goto query_mapping_error;
+ }
+
nlmsg_end(skb, nlh);
nlmsg_request->req_buffer = pm_msg;
@@ -264,6 +307,7 @@ int iwpm_add_and_query_mapping(struct iwpm_sa_data *pm_msg, u8 nl_client)
return ret;
query_mapping_error:
pr_info("%s: %s (client = %d)\n", __func__, err_str, nl_client);
+query_mapping_error_nowarn:
if (skb)
dev_kfree_skb(skb);
if (nlmsg_request)
@@ -271,9 +315,13 @@ query_mapping_error:
return ret;
}
-/*
- * iwpm_remove_mapping - Send a netlink remove mapping message
- * to the port mapper
+/**
+ * iwpm_remove_mapping - Send a netlink remove mapping request
+ * to the userspace port mapper
+ *
+ * @local_addr: Local ip/tcp address to remove
+ * @nl_client: The index of the netlink client
+ *
* nlmsg attributes:
* [IWPM_NLA_MANAGE_MAPPING_SEQ]
* [IWPM_NLA_MANAGE_ADDR]
@@ -344,9 +392,14 @@ static const struct nla_policy resp_reg_policy[IWPM_NLA_RREG_PID_MAX] = {
[IWPM_NLA_RREG_PID_ERR] = { .type = NLA_U16 }
};
-/*
- * iwpm_register_pid_cb - Process a port mapper response to
- * iwpm_register_pid()
+/**
+ * iwpm_register_pid_cb - Process the port mapper response to
+ * iwpm_register_pid query
+ * @skb:
+ * @cb: Contains the received message (payload and netlink header)
+ *
+ * If successful, the function receives the userspace port mapper pid
+ * which is used in future communication with the port mapper
*/
int iwpm_register_pid_cb(struct sk_buff *skb, struct netlink_callback *cb)
{
@@ -379,7 +432,7 @@ int iwpm_register_pid_cb(struct sk_buff *skb, struct netlink_callback *cb)
/* check device name, ulib name and version */
if (strcmp(pm_msg->dev_name, dev_name) ||
strcmp(iwpm_ulib_name, iwpm_name) ||
- iwpm_version != iwpm_ulib_version) {
+ iwpm_version < IWPM_UABI_VERSION_MIN) {
pr_info("%s: Incorrect info (dev = %s name = %s version = %d)\n",
__func__, dev_name, iwpm_name, iwpm_version);
@@ -387,6 +440,10 @@ int iwpm_register_pid_cb(struct sk_buff *skb, struct netlink_callback *cb)
goto register_pid_response_exit;
}
iwpm_user_pid = cb->nlh->nlmsg_pid;
+ iwpm_ulib_version = iwpm_version;
+ if (iwpm_ulib_version < IWPM_UABI_VERSION)
+ pr_warn_once("%s: Down level iwpmd/pid %u. Continuing...",
+ __func__, iwpm_user_pid);
atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq);
pr_debug("%s: iWarp Port Mapper (pid = %d) is available!\n",
__func__, iwpm_user_pid);
@@ -403,15 +460,19 @@ register_pid_response_exit:
/* netlink attribute policy for the received response to add mapping request */
static const struct nla_policy resp_add_policy[IWPM_NLA_RMANAGE_MAPPING_MAX] = {
- [IWPM_NLA_MANAGE_MAPPING_SEQ] = { .type = NLA_U32 },
- [IWPM_NLA_MANAGE_ADDR] = { .len = sizeof(struct sockaddr_storage) },
- [IWPM_NLA_MANAGE_MAPPED_LOC_ADDR] = { .len = sizeof(struct sockaddr_storage) },
- [IWPM_NLA_RMANAGE_MAPPING_ERR] = { .type = NLA_U16 }
+ [IWPM_NLA_RMANAGE_MAPPING_SEQ] = { .type = NLA_U32 },
+ [IWPM_NLA_RMANAGE_ADDR] = {
+ .len = sizeof(struct sockaddr_storage) },
+ [IWPM_NLA_RMANAGE_MAPPED_LOC_ADDR] = {
+ .len = sizeof(struct sockaddr_storage) },
+ [IWPM_NLA_RMANAGE_MAPPING_ERR] = { .type = NLA_U16 }
};
-/*
- * iwpm_add_mapping_cb - Process a port mapper response to
- * iwpm_add_mapping()
+/**
+ * iwpm_add_mapping_cb - Process the port mapper response to
+ * iwpm_add_mapping request
+ * @skb:
+ * @cb: Contains the received message (payload and netlink header)
*/
int iwpm_add_mapping_cb(struct sk_buff *skb, struct netlink_callback *cb)
{
@@ -430,7 +491,7 @@ int iwpm_add_mapping_cb(struct sk_buff *skb, struct netlink_callback *cb)
atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq);
- msg_seq = nla_get_u32(nltb[IWPM_NLA_MANAGE_MAPPING_SEQ]);
+ msg_seq = nla_get_u32(nltb[IWPM_NLA_RMANAGE_MAPPING_SEQ]);
nlmsg_request = iwpm_find_nlmsg_request(msg_seq);
if (!nlmsg_request) {
pr_info("%s: Could not find a matching request (seq = %u)\n",
@@ -439,9 +500,9 @@ int iwpm_add_mapping_cb(struct sk_buff *skb, struct netlink_callback *cb)
}
pm_msg = nlmsg_request->req_buffer;
local_sockaddr = (struct sockaddr_storage *)
- nla_data(nltb[IWPM_NLA_MANAGE_ADDR]);
+ nla_data(nltb[IWPM_NLA_RMANAGE_ADDR]);
mapped_sockaddr = (struct sockaddr_storage *)
- nla_data(nltb[IWPM_NLA_MANAGE_MAPPED_LOC_ADDR]);
+ nla_data(nltb[IWPM_NLA_RMANAGE_MAPPED_LOC_ADDR]);
if (iwpm_compare_sockaddr(local_sockaddr, &pm_msg->loc_addr)) {
nlmsg_request->err_code = IWPM_USER_LIB_INFO_ERR;
@@ -472,17 +533,23 @@ add_mapping_response_exit:
/* netlink attribute policy for the response to add and query mapping request
* and response with remote address info */
static const struct nla_policy resp_query_policy[IWPM_NLA_RQUERY_MAPPING_MAX] = {
- [IWPM_NLA_QUERY_MAPPING_SEQ] = { .type = NLA_U32 },
- [IWPM_NLA_QUERY_LOCAL_ADDR] = { .len = sizeof(struct sockaddr_storage) },
- [IWPM_NLA_QUERY_REMOTE_ADDR] = { .len = sizeof(struct sockaddr_storage) },
- [IWPM_NLA_RQUERY_MAPPED_LOC_ADDR] = { .len = sizeof(struct sockaddr_storage) },
- [IWPM_NLA_RQUERY_MAPPED_REM_ADDR] = { .len = sizeof(struct sockaddr_storage) },
+ [IWPM_NLA_RQUERY_MAPPING_SEQ] = { .type = NLA_U32 },
+ [IWPM_NLA_RQUERY_LOCAL_ADDR] = {
+ .len = sizeof(struct sockaddr_storage) },
+ [IWPM_NLA_RQUERY_REMOTE_ADDR] = {
+ .len = sizeof(struct sockaddr_storage) },
+ [IWPM_NLA_RQUERY_MAPPED_LOC_ADDR] = {
+ .len = sizeof(struct sockaddr_storage) },
+ [IWPM_NLA_RQUERY_MAPPED_REM_ADDR] = {
+ .len = sizeof(struct sockaddr_storage) },
[IWPM_NLA_RQUERY_MAPPING_ERR] = { .type = NLA_U16 }
};
-/*
- * iwpm_add_and_query_mapping_cb - Process a port mapper response to
- * iwpm_add_and_query_mapping()
+/**
+ * iwpm_add_and_query_mapping_cb - Process the port mapper response to
+ * iwpm_add_and_query_mapping request
+ * @skb:
+ * @cb: Contains the received message (payload and netlink header)
*/
int iwpm_add_and_query_mapping_cb(struct sk_buff *skb,
struct netlink_callback *cb)
@@ -502,7 +569,7 @@ int iwpm_add_and_query_mapping_cb(struct sk_buff *skb,
return -EINVAL;
atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq);
- msg_seq = nla_get_u32(nltb[IWPM_NLA_QUERY_MAPPING_SEQ]);
+ msg_seq = nla_get_u32(nltb[IWPM_NLA_RQUERY_MAPPING_SEQ]);
nlmsg_request = iwpm_find_nlmsg_request(msg_seq);
if (!nlmsg_request) {
pr_info("%s: Could not find a matching request (seq = %u)\n",
@@ -511,9 +578,9 @@ int iwpm_add_and_query_mapping_cb(struct sk_buff *skb,
}
pm_msg = nlmsg_request->req_buffer;
local_sockaddr = (struct sockaddr_storage *)
- nla_data(nltb[IWPM_NLA_QUERY_LOCAL_ADDR]);
+ nla_data(nltb[IWPM_NLA_RQUERY_LOCAL_ADDR]);
remote_sockaddr = (struct sockaddr_storage *)
- nla_data(nltb[IWPM_NLA_QUERY_REMOTE_ADDR]);
+ nla_data(nltb[IWPM_NLA_RQUERY_REMOTE_ADDR]);
mapped_loc_sockaddr = (struct sockaddr_storage *)
nla_data(nltb[IWPM_NLA_RQUERY_MAPPED_LOC_ADDR]);
mapped_rem_sockaddr = (struct sockaddr_storage *)
@@ -560,9 +627,13 @@ query_mapping_response_exit:
return 0;
}
-/*
- * iwpm_remote_info_cb - Process a port mapper message, containing
- * the remote connecting peer address info
+/**
+ * iwpm_remote_info_cb - Process remote connecting peer address info, which
+ * the port mapper has received from the connecting peer
+ * @skb:
+ * @cb: Contains the received message (payload and netlink header)
+ *
+ * Stores the IPv4/IPv6 address info in a hash table
*/
int iwpm_remote_info_cb(struct sk_buff *skb, struct netlink_callback *cb)
{
@@ -588,9 +659,9 @@ int iwpm_remote_info_cb(struct sk_buff *skb, struct netlink_callback *cb)
atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq);
local_sockaddr = (struct sockaddr_storage *)
- nla_data(nltb[IWPM_NLA_QUERY_LOCAL_ADDR]);
+ nla_data(nltb[IWPM_NLA_RQUERY_LOCAL_ADDR]);
remote_sockaddr = (struct sockaddr_storage *)
- nla_data(nltb[IWPM_NLA_QUERY_REMOTE_ADDR]);
+ nla_data(nltb[IWPM_NLA_RQUERY_REMOTE_ADDR]);
mapped_loc_sockaddr = (struct sockaddr_storage *)
nla_data(nltb[IWPM_NLA_RQUERY_MAPPED_LOC_ADDR]);
mapped_rem_sockaddr = (struct sockaddr_storage *)
@@ -635,8 +706,14 @@ static const struct nla_policy resp_mapinfo_policy[IWPM_NLA_MAPINFO_REQ_MAX] = {
[IWPM_NLA_MAPINFO_ULIB_VER] = { .type = NLA_U16 }
};
-/*
- * iwpm_mapping_info_cb - Process a port mapper request for mapping info
+/**
+ * iwpm_mapping_info_cb - Process a notification that the userspace
+ * port mapper daemon is started
+ * @skb:
+ * @cb: Contains the received message (payload and netlink header)
+ *
+ * Using the received port mapper pid, send all the local mapping
+ * info records to the userspace port mapper
*/
int iwpm_mapping_info_cb(struct sk_buff *skb, struct netlink_callback *cb)
{
@@ -655,7 +732,7 @@ int iwpm_mapping_info_cb(struct sk_buff *skb, struct netlink_callback *cb)
iwpm_name = (char *)nla_data(nltb[IWPM_NLA_MAPINFO_ULIB_NAME]);
iwpm_version = nla_get_u16(nltb[IWPM_NLA_MAPINFO_ULIB_VER]);
if (strcmp(iwpm_ulib_name, iwpm_name) ||
- iwpm_version != iwpm_ulib_version) {
+ iwpm_version < IWPM_UABI_VERSION_MIN) {
pr_info("%s: Invalid port mapper name = %s version = %d\n",
__func__, iwpm_name, iwpm_version);
return ret;
@@ -669,6 +746,11 @@ int iwpm_mapping_info_cb(struct sk_buff *skb, struct netlink_callback *cb)
iwpm_set_registration(nl_client, IWPM_REG_INCOMPL);
atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq);
iwpm_user_pid = cb->nlh->nlmsg_pid;
+
+ if (iwpm_ulib_version < IWPM_UABI_VERSION)
+ pr_warn_once("%s: Down level iwpmd/pid %u. Continuing...",
+ __func__, iwpm_user_pid);
+
if (!iwpm_mapinfo_available())
return 0;
pr_debug("%s: iWarp Port Mapper (pid = %d) is available!\n",
@@ -684,9 +766,11 @@ static const struct nla_policy ack_mapinfo_policy[IWPM_NLA_MAPINFO_NUM_MAX] = {
[IWPM_NLA_MAPINFO_ACK_NUM] = { .type = NLA_U32 }
};
-/*
- * iwpm_ack_mapping_info_cb - Process a port mapper ack for
- * the provided mapping info records
+/**
+ * iwpm_ack_mapping_info_cb - Process the port mapper ack for
+ * the provided local mapping info records
+ * @skb:
+ * @cb: Contains the received message (payload and netlink header)
*/
int iwpm_ack_mapping_info_cb(struct sk_buff *skb, struct netlink_callback *cb)
{
@@ -712,8 +796,11 @@ static const struct nla_policy map_error_policy[IWPM_NLA_ERR_MAX] = {
[IWPM_NLA_ERR_CODE] = { .type = NLA_U16 },
};
-/*
- * iwpm_mapping_error_cb - Process a port mapper error message
+/**
+ * iwpm_mapping_error_cb - Process port mapper notification for error
+ *
+ * @skb:
+ * @cb: Contains the received message (payload and netlink header)
*/
int iwpm_mapping_error_cb(struct sk_buff *skb, struct netlink_callback *cb)
{
@@ -748,3 +835,46 @@ int iwpm_mapping_error_cb(struct sk_buff *skb, struct netlink_callback *cb)
up(&nlmsg_request->sem);
return 0;
}
+
+/* netlink attribute policy for the received hello request */
+static const struct nla_policy hello_policy[IWPM_NLA_HELLO_MAX] = {
+ [IWPM_NLA_HELLO_ABI_VERSION] = { .type = NLA_U16 }
+};
+
+/**
+ * iwpm_hello_cb - Process a hello message from iwpmd
+ *
+ * @skb:
+ * @cb: Contains the received message (payload and netlink header)
+ *
+ * Using the received port mapper pid, send the kernel's abi_version
+ * after adjusting it to support the iwpmd version.
+ */
+int iwpm_hello_cb(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct nlattr *nltb[IWPM_NLA_HELLO_MAX];
+ const char *msg_type = "Hello request";
+ u8 nl_client;
+ u16 abi_version;
+ int ret = -EINVAL;
+
+ if (iwpm_parse_nlmsg(cb, IWPM_NLA_HELLO_MAX, hello_policy, nltb,
+ msg_type)) {
+ pr_info("%s: Unable to parse nlmsg\n", __func__);
+ return ret;
+ }
+ abi_version = nla_get_u16(nltb[IWPM_NLA_HELLO_ABI_VERSION]);
+ nl_client = RDMA_NL_GET_CLIENT(cb->nlh->nlmsg_type);
+ if (!iwpm_valid_client(nl_client)) {
+ pr_info("%s: Invalid port mapper client = %d\n",
+ __func__, nl_client);
+ return ret;
+ }
+ iwpm_set_registration(nl_client, IWPM_REG_INCOMPL);
+ atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq);
+ iwpm_ulib_version = min_t(u16, IWPM_UABI_VERSION, abi_version);
+ pr_debug("Using ABI version %u\n", iwpm_ulib_version);
+ iwpm_user_pid = cb->nlh->nlmsg_pid;
+ ret = iwpm_send_hello(nl_client, iwpm_user_pid, iwpm_ulib_version);
+ return ret;
+}
diff --git a/drivers/infiniband/core/iwpm_util.c b/drivers/infiniband/core/iwpm_util.c
index cdb63f3f4de7..a5d2a20ee697 100644
--- a/drivers/infiniband/core/iwpm_util.c
+++ b/drivers/infiniband/core/iwpm_util.c
@@ -51,6 +51,12 @@ static DEFINE_SPINLOCK(iwpm_reminfo_lock);
static DEFINE_MUTEX(iwpm_admin_lock);
static struct iwpm_admin_data iwpm_admin;
+/**
+ * iwpm_init - Allocate resources for the iwarp port mapper
+ * @nl_client: The index of the netlink client
+ *
+ * Should be called when network interface goes up.
+ */
int iwpm_init(u8 nl_client)
{
int ret = 0;
@@ -87,6 +93,12 @@ init_exit:
static void free_hash_bucket(void);
static void free_reminfo_bucket(void);
+/**
+ * iwpm_exit - Deallocate resources for the iwarp port mapper
+ * @nl_client: The index of the netlink client
+ *
+ * Should be called when network interface goes down.
+ */
int iwpm_exit(u8 nl_client)
{
@@ -112,9 +124,17 @@ int iwpm_exit(u8 nl_client)
static struct hlist_head *get_mapinfo_hash_bucket(struct sockaddr_storage *,
struct sockaddr_storage *);
+/**
+ * iwpm_create_mapinfo - Store local and mapped IPv4/IPv6 address
+ * info in a hash table
+ * @local_addr: Local ip/tcp address
+ * @mapped_addr: Mapped local ip/tcp address
+ * @nl_client: The index of the netlink client
+ * @map_flags: IWPM mapping flags
+ */
int iwpm_create_mapinfo(struct sockaddr_storage *local_sockaddr,
struct sockaddr_storage *mapped_sockaddr,
- u8 nl_client)
+ u8 nl_client, u32 map_flags)
{
struct hlist_head *hash_bucket_head = NULL;
struct iwpm_mapping_info *map_info;
@@ -132,6 +152,7 @@ int iwpm_create_mapinfo(struct sockaddr_storage *local_sockaddr,
memcpy(&map_info->mapped_sockaddr, mapped_sockaddr,
sizeof(struct sockaddr_storage));
map_info->nl_client = nl_client;
+ map_info->map_flags = map_flags;
spin_lock_irqsave(&iwpm_mapinfo_lock, flags);
if (iwpm_hash_bucket) {
@@ -150,6 +171,15 @@ int iwpm_create_mapinfo(struct sockaddr_storage *local_sockaddr,
return ret;
}
+/**
+ * iwpm_remove_mapinfo - Remove local and mapped IPv4/IPv6 address
+ * info from the hash table
+ * @local_addr: Local ip/tcp address
+ * @mapped_local_addr: Mapped local ip/tcp address
+ *
+ * Returns err code if mapping info is not found in the hash table,
+ * otherwise returns 0
+ */
int iwpm_remove_mapinfo(struct sockaddr_storage *local_sockaddr,
struct sockaddr_storage *mapped_local_addr)
{
@@ -250,6 +280,17 @@ void iwpm_add_remote_info(struct iwpm_remote_info *rem_info)
spin_unlock_irqrestore(&iwpm_reminfo_lock, flags);
}
+/**
+ * iwpm_get_remote_info - Get the remote connecting peer address info
+ *
+ * @mapped_loc_addr: Mapped local address of the listening peer
+ * @mapped_rem_addr: Mapped remote address of the connecting peer
+ * @remote_addr: To store the remote address of the connecting peer
+ * @nl_client: The index of the netlink client
+ *
+ * The remote address info is retrieved and provided to the client in
+ * the remote_addr. After that it is removed from the hash table
+ */
int iwpm_get_remote_info(struct sockaddr_storage *mapped_loc_addr,
struct sockaddr_storage *mapped_rem_addr,
struct sockaddr_storage *remote_addr,
@@ -686,6 +727,14 @@ int iwpm_send_mapinfo(u8 nl_client, int iwpm_pid)
if (ret)
goto send_mapping_info_unlock;
+ if (iwpm_ulib_version > IWPM_UABI_VERSION_MIN) {
+ ret = ibnl_put_attr(skb, nlh, sizeof(u32),
+ &map_info->map_flags,
+ IWPM_NLA_MAPINFO_FLAGS);
+ if (ret)
+ goto send_mapping_info_unlock;
+ }
+
nlmsg_end(skb, nlh);
iwpm_print_sockaddr(&map_info->local_sockaddr,
@@ -754,3 +803,38 @@ int iwpm_mapinfo_available(void)
spin_unlock_irqrestore(&iwpm_mapinfo_lock, flags);
return full_bucket;
}
+
+int iwpm_send_hello(u8 nl_client, int iwpm_pid, u16 abi_version)
+{
+ struct sk_buff *skb = NULL;
+ struct nlmsghdr *nlh;
+ const char *err_str = "";
+ int ret = -EINVAL;
+
+ skb = iwpm_create_nlmsg(RDMA_NL_IWPM_HELLO, &nlh, nl_client);
+ if (!skb) {
+ err_str = "Unable to create a nlmsg";
+ goto hello_num_error;
+ }
+ nlh->nlmsg_seq = iwpm_get_nlmsg_seq();
+ err_str = "Unable to put attribute of abi_version into nlmsg";
+ ret = ibnl_put_attr(skb, nlh, sizeof(u16), &abi_version,
+ IWPM_NLA_HELLO_ABI_VERSION);
+ if (ret)
+ goto hello_num_error;
+ nlmsg_end(skb, nlh);
+
+ ret = rdma_nl_unicast(skb, iwpm_pid);
+ if (ret) {
+ skb = NULL;
+ err_str = "Unable to send a nlmsg";
+ goto hello_num_error;
+ }
+ pr_debug("%s: Sent hello abi_version = %u\n", __func__, abi_version);
+ return 0;
+hello_num_error:
+ pr_info("%s: %s\n", __func__, err_str);
+ if (skb)
+ dev_kfree_skb(skb);
+ return ret;
+}
diff --git a/drivers/infiniband/core/iwpm_util.h b/drivers/infiniband/core/iwpm_util.h
index af1fc14a0d3d..7e2bcc72f66c 100644
--- a/drivers/infiniband/core/iwpm_util.h
+++ b/drivers/infiniband/core/iwpm_util.h
@@ -78,6 +78,7 @@ struct iwpm_mapping_info {
struct sockaddr_storage local_sockaddr;
struct sockaddr_storage mapped_sockaddr;
u8 nl_client;
+ u32 map_flags;
};
struct iwpm_remote_info {
@@ -266,4 +267,15 @@ int iwpm_parse_nlmsg(struct netlink_callback *cb, int policy_max,
* @msg: Message to print
*/
void iwpm_print_sockaddr(struct sockaddr_storage *sockaddr, char *msg);
+
+/**
+ * iwpm_send_hello - Send hello response to iwpmd
+ *
+ * @nl_client: The index of the netlink client
+ * @abi_version: The kernel's abi_version
+ *
+ * Returns 0 on success or a negative error code
+ */
+int iwpm_send_hello(u8 nl_client, int iwpm_pid, u16 abi_version);
+extern u16 iwpm_ulib_version;
#endif
diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c
index 7870823bac47..e742a6a2c138 100644
--- a/drivers/infiniband/core/mad.c
+++ b/drivers/infiniband/core/mad.c
@@ -3326,9 +3326,9 @@ error:
static void ib_mad_remove_device(struct ib_device *device, void *client_data)
{
- int i;
+ unsigned int i;
- for (i = rdma_start_port(device); i <= rdma_end_port(device); i++) {
+ rdma_for_each_port (device, i) {
if (!rdma_cap_ib_mad(device, i))
continue;
diff --git a/drivers/infiniband/core/netlink.c b/drivers/infiniband/core/netlink.c
index 724f5a62e82f..eecfc0b377c9 100644
--- a/drivers/infiniband/core/netlink.c
+++ b/drivers/infiniband/core/netlink.c
@@ -56,7 +56,6 @@ EXPORT_SYMBOL(rdma_nl_chk_listeners);
static bool is_nl_msg_valid(unsigned int type, unsigned int op)
{
static const unsigned int max_num_ops[RDMA_NL_NUM_CLIENTS] = {
- [RDMA_NL_RDMA_CM] = RDMA_NL_RDMA_CM_NUM_OPS,
[RDMA_NL_IWCM] = RDMA_NL_IWPM_NUM_OPS,
[RDMA_NL_LS] = RDMA_NL_LS_NUM_OPS,
[RDMA_NL_NLDEV] = RDMA_NLDEV_NUM_OPS,
@@ -181,8 +180,7 @@ static int rdma_nl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh,
return -EINVAL;
}
/* FIXME: Convert IWCM to properly handle doit callbacks */
- if ((nlh->nlmsg_flags & NLM_F_DUMP) || index == RDMA_NL_RDMA_CM ||
- index == RDMA_NL_IWCM) {
+ if ((nlh->nlmsg_flags & NLM_F_DUMP) || index == RDMA_NL_IWCM) {
struct netlink_dump_control c = {
.dump = cb_table[op].dump,
};
diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c
index 3c97a8b6bf1e..11ed58d3fce5 100644
--- a/drivers/infiniband/core/nldev.c
+++ b/drivers/infiniband/core/nldev.c
@@ -33,12 +33,14 @@
#include <linux/module.h>
#include <linux/pid.h>
#include <linux/pid_namespace.h>
+#include <linux/mutex.h>
#include <net/netlink.h>
#include <rdma/rdma_cm.h>
#include <rdma/rdma_netlink.h>
#include "core_priv.h"
#include "cma_priv.h"
+#include "restrack.h"
static const struct nla_policy nldev_policy[RDMA_NLDEV_ATTR_MAX] = {
[RDMA_NLDEV_ATTR_DEV_INDEX] = { .type = NLA_U32 },
@@ -107,6 +109,13 @@ static const struct nla_policy nldev_policy[RDMA_NLDEV_ATTR_MAX] = {
[RDMA_NLDEV_ATTR_DRIVER_U32] = { .type = NLA_U32 },
[RDMA_NLDEV_ATTR_DRIVER_S64] = { .type = NLA_S64 },
[RDMA_NLDEV_ATTR_DRIVER_U64] = { .type = NLA_U64 },
+ [RDMA_NLDEV_ATTR_RES_PDN] = { .type = NLA_U32 },
+ [RDMA_NLDEV_ATTR_RES_CQN] = { .type = NLA_U32 },
+ [RDMA_NLDEV_ATTR_RES_MRN] = { .type = NLA_U32 },
+ [RDMA_NLDEV_ATTR_RES_CM_IDN] = { .type = NLA_U32 },
+ [RDMA_NLDEV_ATTR_RES_CTXN] = { .type = NLA_U32 },
+ [RDMA_NLDEV_ATTR_LINK_TYPE] = { .type = NLA_NUL_STRING,
+ .len = RDMA_NLDEV_ATTR_ENTRY_STRLEN },
};
static int put_driver_name_print_type(struct sk_buff *msg, const char *name,
@@ -262,9 +271,7 @@ static int fill_port_info(struct sk_buff *msg,
if (nla_put_u8(msg, RDMA_NLDEV_ATTR_PORT_PHYS_STATE, attr.phys_state))
return -EMSGSIZE;
- if (device->ops.get_netdev)
- netdev = device->ops.get_netdev(device, port);
-
+ netdev = ib_device_get_netdev(device, port);
if (netdev && net_eq(dev_net(netdev), net)) {
ret = nla_put_u32(msg,
RDMA_NLDEV_ATTR_NDEV_INDEX, netdev->ifindex);
@@ -314,7 +321,6 @@ static int fill_res_info(struct sk_buff *msg, struct ib_device *device)
[RDMA_RESTRACK_CTX] = "ctx",
};
- struct rdma_restrack_root *res = &device->res;
struct nlattr *table_attr;
int ret, i, curr;
@@ -328,7 +334,8 @@ static int fill_res_info(struct sk_buff *msg, struct ib_device *device)
for (i = 0; i < RDMA_RESTRACK_MAX; i++) {
if (!names[i])
continue;
- curr = rdma_restrack_count(res, i, task_active_pid_ns(current));
+ curr = rdma_restrack_count(device, i,
+ task_active_pid_ns(current));
ret = fill_res_info_entry(msg, names[i], curr);
if (ret)
goto err;
@@ -361,13 +368,20 @@ static int fill_res_name_pid(struct sk_buff *msg,
return 0;
}
-static int fill_res_qp_entry(struct sk_buff *msg, struct netlink_callback *cb,
+static bool fill_res_entry(struct ib_device *dev, struct sk_buff *msg,
+ struct rdma_restrack_entry *res)
+{
+ if (!dev->ops.fill_res_entry)
+ return false;
+ return dev->ops.fill_res_entry(msg, res);
+}
+
+static int fill_res_qp_entry(struct sk_buff *msg, bool has_cap_net_admin,
struct rdma_restrack_entry *res, uint32_t port)
{
struct ib_qp *qp = container_of(res, struct ib_qp, res);
- struct rdma_restrack_root *resroot = &qp->device->res;
+ struct ib_device *dev = qp->device;
struct ib_qp_init_attr qp_init_attr;
- struct nlattr *entry_attr;
struct ib_qp_attr qp_attr;
int ret;
@@ -376,11 +390,7 @@ static int fill_res_qp_entry(struct sk_buff *msg, struct netlink_callback *cb,
return ret;
if (port && port != qp_attr.port_num)
- return 0;
-
- entry_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_RES_QP_ENTRY);
- if (!entry_attr)
- goto out;
+ return -EAGAIN;
/* In create_qp() port is not set yet */
if (qp_attr.port_num &&
@@ -412,38 +422,32 @@ static int fill_res_qp_entry(struct sk_buff *msg, struct netlink_callback *cb,
if (nla_put_u8(msg, RDMA_NLDEV_ATTR_RES_STATE, qp_attr.qp_state))
goto err;
+ if (!rdma_is_kernel_res(res) &&
+ nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_PDN, qp->pd->res.id))
+ goto err;
+
if (fill_res_name_pid(msg, res))
goto err;
- if (resroot->fill_res_entry(msg, res))
+ if (fill_res_entry(dev, msg, res))
goto err;
- nla_nest_end(msg, entry_attr);
return 0;
-err:
- nla_nest_cancel(msg, entry_attr);
-out:
- return -EMSGSIZE;
+err: return -EMSGSIZE;
}
-static int fill_res_cm_id_entry(struct sk_buff *msg,
- struct netlink_callback *cb,
+static int fill_res_cm_id_entry(struct sk_buff *msg, bool has_cap_net_admin,
struct rdma_restrack_entry *res, uint32_t port)
{
struct rdma_id_private *id_priv =
container_of(res, struct rdma_id_private, res);
- struct rdma_restrack_root *resroot = &id_priv->id.device->res;
+ struct ib_device *dev = id_priv->id.device;
struct rdma_cm_id *cm_id = &id_priv->id;
- struct nlattr *entry_attr;
if (port && port != cm_id->port_num)
return 0;
- entry_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_RES_CM_ID_ENTRY);
- if (!entry_attr)
- goto out;
-
if (cm_id->port_num &&
nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, cm_id->port_num))
goto err;
@@ -472,31 +476,25 @@ static int fill_res_cm_id_entry(struct sk_buff *msg,
&cm_id->route.addr.dst_addr))
goto err;
+ if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_CM_IDN, res->id))
+ goto err;
+
if (fill_res_name_pid(msg, res))
goto err;
- if (resroot->fill_res_entry(msg, res))
+ if (fill_res_entry(dev, msg, res))
goto err;
- nla_nest_end(msg, entry_attr);
return 0;
-err:
- nla_nest_cancel(msg, entry_attr);
-out:
- return -EMSGSIZE;
+err: return -EMSGSIZE;
}
-static int fill_res_cq_entry(struct sk_buff *msg, struct netlink_callback *cb,
+static int fill_res_cq_entry(struct sk_buff *msg, bool has_cap_net_admin,
struct rdma_restrack_entry *res, uint32_t port)
{
struct ib_cq *cq = container_of(res, struct ib_cq, res);
- struct rdma_restrack_root *resroot = &cq->device->res;
- struct nlattr *entry_attr;
-
- entry_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_RES_CQ_ENTRY);
- if (!entry_attr)
- goto out;
+ struct ib_device *dev = cq->device;
if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_CQE, cq->cqe))
goto err;
@@ -509,33 +507,31 @@ static int fill_res_cq_entry(struct sk_buff *msg, struct netlink_callback *cb,
nla_put_u8(msg, RDMA_NLDEV_ATTR_RES_POLL_CTX, cq->poll_ctx))
goto err;
+ if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_CQN, res->id))
+ goto err;
+ if (!rdma_is_kernel_res(res) &&
+ nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_CTXN,
+ cq->uobject->context->res.id))
+ goto err;
+
if (fill_res_name_pid(msg, res))
goto err;
- if (resroot->fill_res_entry(msg, res))
+ if (fill_res_entry(dev, msg, res))
goto err;
- nla_nest_end(msg, entry_attr);
return 0;
-err:
- nla_nest_cancel(msg, entry_attr);
-out:
- return -EMSGSIZE;
+err: return -EMSGSIZE;
}
-static int fill_res_mr_entry(struct sk_buff *msg, struct netlink_callback *cb,
+static int fill_res_mr_entry(struct sk_buff *msg, bool has_cap_net_admin,
struct rdma_restrack_entry *res, uint32_t port)
{
struct ib_mr *mr = container_of(res, struct ib_mr, res);
- struct rdma_restrack_root *resroot = &mr->pd->device->res;
- struct nlattr *entry_attr;
-
- entry_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_RES_MR_ENTRY);
- if (!entry_attr)
- goto out;
+ struct ib_device *dev = mr->pd->device;
- if (netlink_capable(cb->skb, CAP_NET_ADMIN)) {
+ if (has_cap_net_admin) {
if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_RKEY, mr->rkey))
goto err;
if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_LKEY, mr->lkey))
@@ -546,33 +542,31 @@ static int fill_res_mr_entry(struct sk_buff *msg, struct netlink_callback *cb,
RDMA_NLDEV_ATTR_PAD))
goto err;
+ if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_MRN, res->id))
+ goto err;
+
+ if (!rdma_is_kernel_res(res) &&
+ nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_PDN, mr->pd->res.id))
+ goto err;
+
if (fill_res_name_pid(msg, res))
goto err;
- if (resroot->fill_res_entry(msg, res))
+ if (fill_res_entry(dev, msg, res))
goto err;
- nla_nest_end(msg, entry_attr);
return 0;
-err:
- nla_nest_cancel(msg, entry_attr);
-out:
- return -EMSGSIZE;
+err: return -EMSGSIZE;
}
-static int fill_res_pd_entry(struct sk_buff *msg, struct netlink_callback *cb,
+static int fill_res_pd_entry(struct sk_buff *msg, bool has_cap_net_admin,
struct rdma_restrack_entry *res, uint32_t port)
{
struct ib_pd *pd = container_of(res, struct ib_pd, res);
- struct rdma_restrack_root *resroot = &pd->device->res;
- struct nlattr *entry_attr;
+ struct ib_device *dev = pd->device;
- entry_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_RES_PD_ENTRY);
- if (!entry_attr)
- goto out;
-
- if (netlink_capable(cb->skb, CAP_NET_ADMIN)) {
+ if (has_cap_net_admin) {
if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_LOCAL_DMA_LKEY,
pd->local_dma_lkey))
goto err;
@@ -585,19 +579,23 @@ static int fill_res_pd_entry(struct sk_buff *msg, struct netlink_callback *cb,
atomic_read(&pd->usecnt), RDMA_NLDEV_ATTR_PAD))
goto err;
+ if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_PDN, res->id))
+ goto err;
+
+ if (!rdma_is_kernel_res(res) &&
+ nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_CTXN,
+ pd->uobject->context->res.id))
+ goto err;
+
if (fill_res_name_pid(msg, res))
goto err;
- if (resroot->fill_res_entry(msg, res))
+ if (fill_res_entry(dev, msg, res))
goto err;
- nla_nest_end(msg, entry_attr);
return 0;
-err:
- nla_nest_cancel(msg, entry_attr);
-out:
- return -EMSGSIZE;
+err: return -EMSGSIZE;
}
static int nldev_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
@@ -777,7 +775,7 @@ static int nldev_port_get_dumpit(struct sk_buff *skb,
u32 idx = 0;
u32 ifindex;
int err;
- u32 p;
+ unsigned int p;
err = nlmsg_parse(cb->nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
nldev_policy, NULL);
@@ -789,7 +787,7 @@ static int nldev_port_get_dumpit(struct sk_buff *skb,
if (!device)
return -EINVAL;
- for (p = rdma_start_port(device); p <= rdma_end_port(device); ++p) {
+ rdma_for_each_port (device, p) {
/*
* The dumpit function returns all information from specific
* index. This specific index is taken from the netlink
@@ -905,10 +903,17 @@ static int nldev_res_get_dumpit(struct sk_buff *skb,
}
struct nldev_fill_res_entry {
- int (*fill_res_func)(struct sk_buff *msg, struct netlink_callback *cb,
+ int (*fill_res_func)(struct sk_buff *msg, bool has_cap_net_admin,
struct rdma_restrack_entry *res, u32 port);
enum rdma_nldev_attr nldev_attr;
enum rdma_nldev_command nldev_cmd;
+ u8 flags;
+ u32 entry;
+ u32 id;
+};
+
+enum nldev_res_flags {
+ NLDEV_PER_DEV = 1 << 0,
};
static const struct nldev_fill_res_entry fill_entries[RDMA_RESTRACK_MAX] = {
@@ -916,29 +921,136 @@ static const struct nldev_fill_res_entry fill_entries[RDMA_RESTRACK_MAX] = {
.fill_res_func = fill_res_qp_entry,
.nldev_cmd = RDMA_NLDEV_CMD_RES_QP_GET,
.nldev_attr = RDMA_NLDEV_ATTR_RES_QP,
+ .entry = RDMA_NLDEV_ATTR_RES_QP_ENTRY,
+ .id = RDMA_NLDEV_ATTR_RES_LQPN,
},
[RDMA_RESTRACK_CM_ID] = {
.fill_res_func = fill_res_cm_id_entry,
.nldev_cmd = RDMA_NLDEV_CMD_RES_CM_ID_GET,
.nldev_attr = RDMA_NLDEV_ATTR_RES_CM_ID,
+ .entry = RDMA_NLDEV_ATTR_RES_CM_ID_ENTRY,
+ .id = RDMA_NLDEV_ATTR_RES_CM_IDN,
},
[RDMA_RESTRACK_CQ] = {
.fill_res_func = fill_res_cq_entry,
.nldev_cmd = RDMA_NLDEV_CMD_RES_CQ_GET,
.nldev_attr = RDMA_NLDEV_ATTR_RES_CQ,
+ .flags = NLDEV_PER_DEV,
+ .entry = RDMA_NLDEV_ATTR_RES_CQ_ENTRY,
+ .id = RDMA_NLDEV_ATTR_RES_CQN,
},
[RDMA_RESTRACK_MR] = {
.fill_res_func = fill_res_mr_entry,
.nldev_cmd = RDMA_NLDEV_CMD_RES_MR_GET,
.nldev_attr = RDMA_NLDEV_ATTR_RES_MR,
+ .flags = NLDEV_PER_DEV,
+ .entry = RDMA_NLDEV_ATTR_RES_MR_ENTRY,
+ .id = RDMA_NLDEV_ATTR_RES_MRN,
},
[RDMA_RESTRACK_PD] = {
.fill_res_func = fill_res_pd_entry,
.nldev_cmd = RDMA_NLDEV_CMD_RES_PD_GET,
.nldev_attr = RDMA_NLDEV_ATTR_RES_PD,
+ .flags = NLDEV_PER_DEV,
+ .entry = RDMA_NLDEV_ATTR_RES_PD_ENTRY,
+ .id = RDMA_NLDEV_ATTR_RES_PDN,
},
};
+static bool is_visible_in_pid_ns(struct rdma_restrack_entry *res)
+{
+ /*
+ * 1. Kern resources should be visible in init name space only
+ * 2. Present only resources visible in the current namespace
+ */
+ if (rdma_is_kernel_res(res))
+ return task_active_pid_ns(current) == &init_pid_ns;
+ return task_active_pid_ns(current) == task_active_pid_ns(res->task);
+}
+
+static int res_get_common_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack,
+ enum rdma_restrack_type res_type)
+{
+ const struct nldev_fill_res_entry *fe = &fill_entries[res_type];
+ struct nlattr *tb[RDMA_NLDEV_ATTR_MAX];
+ struct rdma_restrack_entry *res;
+ struct ib_device *device;
+ u32 index, id, port = 0;
+ bool has_cap_net_admin;
+ struct sk_buff *msg;
+ int ret;
+
+ ret = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
+ nldev_policy, extack);
+ if (ret || !tb[RDMA_NLDEV_ATTR_DEV_INDEX] || !fe->id || !tb[fe->id])
+ return -EINVAL;
+
+ index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]);
+ device = ib_device_get_by_index(index);
+ if (!device)
+ return -EINVAL;
+
+ if (tb[RDMA_NLDEV_ATTR_PORT_INDEX]) {
+ port = nla_get_u32(tb[RDMA_NLDEV_ATTR_PORT_INDEX]);
+ if (!rdma_is_port_valid(device, port)) {
+ ret = -EINVAL;
+ goto err;
+ }
+ }
+
+ if ((port && fe->flags & NLDEV_PER_DEV) ||
+ (!port && ~fe->flags & NLDEV_PER_DEV)) {
+ ret = -EINVAL;
+ goto err;
+ }
+
+ id = nla_get_u32(tb[fe->id]);
+ res = rdma_restrack_get_byid(device, res_type, id);
+ if (IS_ERR(res)) {
+ ret = PTR_ERR(res);
+ goto err;
+ }
+
+ if (!is_visible_in_pid_ns(res)) {
+ ret = -ENOENT;
+ goto err_get;
+ }
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq,
+ RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, fe->nldev_cmd),
+ 0, 0);
+
+ if (fill_nldev_handle(msg, device)) {
+ ret = -EMSGSIZE;
+ goto err_free;
+ }
+
+ has_cap_net_admin = netlink_capable(skb, CAP_NET_ADMIN);
+ ret = fe->fill_res_func(msg, has_cap_net_admin, res, port);
+ rdma_restrack_put(res);
+ if (ret)
+ goto err_free;
+
+ nlmsg_end(msg, nlh);
+ ib_device_put(device);
+ return rdma_nl_unicast(msg, NETLINK_CB(skb).portid);
+
+err_free:
+ nlmsg_free(msg);
+err_get:
+ rdma_restrack_put(res);
+err:
+ ib_device_put(device);
+ return ret;
+}
+
static int res_get_common_dumpit(struct sk_buff *skb,
struct netlink_callback *cb,
enum rdma_restrack_type res_type)
@@ -946,11 +1058,15 @@ static int res_get_common_dumpit(struct sk_buff *skb,
const struct nldev_fill_res_entry *fe = &fill_entries[res_type];
struct nlattr *tb[RDMA_NLDEV_ATTR_MAX];
struct rdma_restrack_entry *res;
+ struct rdma_restrack_root *rt;
int err, ret = 0, idx = 0;
struct nlattr *table_attr;
+ struct nlattr *entry_attr;
struct ib_device *device;
int start = cb->args[0];
+ bool has_cap_net_admin;
struct nlmsghdr *nlh;
+ unsigned long id;
u32 index, port = 0;
bool filled = false;
@@ -998,55 +1114,51 @@ static int res_get_common_dumpit(struct sk_buff *skb,
goto err;
}
- down_read(&device->res.rwsem);
- hash_for_each_possible(device->res.hash, res, node, res_type) {
- if (idx < start)
- goto next;
+ has_cap_net_admin = netlink_capable(cb->skb, CAP_NET_ADMIN);
- if ((rdma_is_kernel_res(res) &&
- task_active_pid_ns(current) != &init_pid_ns) ||
- (!rdma_is_kernel_res(res) && task_active_pid_ns(current) !=
- task_active_pid_ns(res->task)))
- /*
- * 1. Kern resources should be visible in init
- * namspace only
- * 2. Present only resources visible in the current
- * namespace
- */
- goto next;
+ rt = &device->res[res_type];
+ xa_lock(&rt->xa);
+ /*
+ * FIXME: if the skip ahead is something common this loop should
+ * use xas_for_each & xas_pause to optimize, we can have a lot of
+ * objects.
+ */
+ xa_for_each(&rt->xa, id, res) {
+ if (!is_visible_in_pid_ns(res))
+ continue;
- if (!rdma_restrack_get(res))
- /*
- * Resource is under release now, but we are not
- * relesing lock now, so it will be released in
- * our next pass, once we will get ->next pointer.
- */
+ if (idx < start || !rdma_restrack_get(res))
goto next;
+ xa_unlock(&rt->xa);
+
filled = true;
- up_read(&device->res.rwsem);
- ret = fe->fill_res_func(skb, cb, res, port);
- down_read(&device->res.rwsem);
- /*
- * Return resource back, but it won't be released till
- * the &device->res.rwsem will be released for write.
- */
+ entry_attr = nla_nest_start(skb, fe->entry);
+ if (!entry_attr) {
+ ret = -EMSGSIZE;
+ rdma_restrack_put(res);
+ goto msg_full;
+ }
+
+ ret = fe->fill_res_func(skb, has_cap_net_admin, res, port);
rdma_restrack_put(res);
- if (ret == -EMSGSIZE)
- /*
- * There is a chance to optimize here.
- * It can be done by using list_prepare_entry
- * and list_for_each_entry_continue afterwards.
- */
- break;
- if (ret)
+ if (ret) {
+ nla_nest_cancel(skb, entry_attr);
+ if (ret == -EMSGSIZE)
+ goto msg_full;
+ if (ret == -EAGAIN)
+ goto again;
goto res_err;
+ }
+ nla_nest_end(skb, entry_attr);
+again: xa_lock(&rt->xa);
next: idx++;
}
- up_read(&device->res.rwsem);
+ xa_unlock(&rt->xa);
+msg_full:
nla_nest_end(skb, table_attr);
nlmsg_end(skb, nlh);
cb->args[0] = idx;
@@ -1063,7 +1175,6 @@ next: idx++;
res_err:
nla_nest_cancel(skb, table_attr);
- up_read(&device->res.rwsem);
err:
nlmsg_cancel(skb, nlh);
@@ -1073,34 +1184,132 @@ err_index:
return ret;
}
-static int nldev_res_get_qp_dumpit(struct sk_buff *skb,
- struct netlink_callback *cb)
+#define RES_GET_FUNCS(name, type) \
+ static int nldev_res_get_##name##_dumpit(struct sk_buff *skb, \
+ struct netlink_callback *cb) \
+ { \
+ return res_get_common_dumpit(skb, cb, type); \
+ } \
+ static int nldev_res_get_##name##_doit(struct sk_buff *skb, \
+ struct nlmsghdr *nlh, \
+ struct netlink_ext_ack *extack) \
+ { \
+ return res_get_common_doit(skb, nlh, extack, type); \
+ }
+
+RES_GET_FUNCS(qp, RDMA_RESTRACK_QP);
+RES_GET_FUNCS(cm_id, RDMA_RESTRACK_CM_ID);
+RES_GET_FUNCS(cq, RDMA_RESTRACK_CQ);
+RES_GET_FUNCS(pd, RDMA_RESTRACK_PD);
+RES_GET_FUNCS(mr, RDMA_RESTRACK_MR);
+
+static LIST_HEAD(link_ops);
+static DECLARE_RWSEM(link_ops_rwsem);
+
+static const struct rdma_link_ops *link_ops_get(const char *type)
{
- return res_get_common_dumpit(skb, cb, RDMA_RESTRACK_QP);
+ const struct rdma_link_ops *ops;
+
+ list_for_each_entry(ops, &link_ops, list) {
+ if (!strcmp(ops->type, type))
+ goto out;
+ }
+ ops = NULL;
+out:
+ return ops;
}
-static int nldev_res_get_cm_id_dumpit(struct sk_buff *skb,
- struct netlink_callback *cb)
+void rdma_link_register(struct rdma_link_ops *ops)
{
- return res_get_common_dumpit(skb, cb, RDMA_RESTRACK_CM_ID);
+ down_write(&link_ops_rwsem);
+ if (WARN_ON_ONCE(link_ops_get(ops->type)))
+ goto out;
+ list_add(&ops->list, &link_ops);
+out:
+ up_write(&link_ops_rwsem);
}
+EXPORT_SYMBOL(rdma_link_register);
-static int nldev_res_get_cq_dumpit(struct sk_buff *skb,
- struct netlink_callback *cb)
+void rdma_link_unregister(struct rdma_link_ops *ops)
{
- return res_get_common_dumpit(skb, cb, RDMA_RESTRACK_CQ);
+ down_write(&link_ops_rwsem);
+ list_del(&ops->list);
+ up_write(&link_ops_rwsem);
}
+EXPORT_SYMBOL(rdma_link_unregister);
-static int nldev_res_get_mr_dumpit(struct sk_buff *skb,
- struct netlink_callback *cb)
+static int nldev_newlink(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
{
- return res_get_common_dumpit(skb, cb, RDMA_RESTRACK_MR);
+ struct nlattr *tb[RDMA_NLDEV_ATTR_MAX];
+ char ibdev_name[IB_DEVICE_NAME_MAX];
+ const struct rdma_link_ops *ops;
+ char ndev_name[IFNAMSIZ];
+ struct net_device *ndev;
+ char type[IFNAMSIZ];
+ int err;
+
+ err = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
+ nldev_policy, extack);
+ if (err || !tb[RDMA_NLDEV_ATTR_DEV_NAME] ||
+ !tb[RDMA_NLDEV_ATTR_LINK_TYPE] || !tb[RDMA_NLDEV_ATTR_NDEV_NAME])
+ return -EINVAL;
+
+ nla_strlcpy(ibdev_name, tb[RDMA_NLDEV_ATTR_DEV_NAME],
+ sizeof(ibdev_name));
+ if (strchr(ibdev_name, '%'))
+ return -EINVAL;
+
+ nla_strlcpy(type, tb[RDMA_NLDEV_ATTR_LINK_TYPE], sizeof(type));
+ nla_strlcpy(ndev_name, tb[RDMA_NLDEV_ATTR_NDEV_NAME],
+ sizeof(ndev_name));
+
+ ndev = dev_get_by_name(&init_net, ndev_name);
+ if (!ndev)
+ return -ENODEV;
+
+ down_read(&link_ops_rwsem);
+ ops = link_ops_get(type);
+#ifdef CONFIG_MODULES
+ if (!ops) {
+ up_read(&link_ops_rwsem);
+ request_module("rdma-link-%s", type);
+ down_read(&link_ops_rwsem);
+ ops = link_ops_get(type);
+ }
+#endif
+ err = ops ? ops->newlink(ibdev_name, ndev) : -EINVAL;
+ up_read(&link_ops_rwsem);
+ dev_put(ndev);
+
+ return err;
}
-static int nldev_res_get_pd_dumpit(struct sk_buff *skb,
- struct netlink_callback *cb)
+static int nldev_dellink(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
{
- return res_get_common_dumpit(skb, cb, RDMA_RESTRACK_PD);
+ struct nlattr *tb[RDMA_NLDEV_ATTR_MAX];
+ struct ib_device *device;
+ u32 index;
+ int err;
+
+ err = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
+ nldev_policy, extack);
+ if (err || !tb[RDMA_NLDEV_ATTR_DEV_INDEX])
+ return -EINVAL;
+
+ index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]);
+ device = ib_device_get_by_index(index);
+ if (!device)
+ return -EINVAL;
+
+ if (!(device->attrs.device_cap_flags & IB_DEVICE_ALLOW_USER_UNREG)) {
+ ib_device_put(device);
+ return -EINVAL;
+ }
+
+ ib_unregister_device_and_put(device);
+ return 0;
}
static const struct rdma_nl_cbs nldev_cb_table[RDMA_NLDEV_NUM_OPS] = {
@@ -1112,6 +1321,14 @@ static const struct rdma_nl_cbs nldev_cb_table[RDMA_NLDEV_NUM_OPS] = {
.doit = nldev_set_doit,
.flags = RDMA_NL_ADMIN_PERM,
},
+ [RDMA_NLDEV_CMD_NEWLINK] = {
+ .doit = nldev_newlink,
+ .flags = RDMA_NL_ADMIN_PERM,
+ },
+ [RDMA_NLDEV_CMD_DELLINK] = {
+ .doit = nldev_dellink,
+ .flags = RDMA_NL_ADMIN_PERM,
+ },
[RDMA_NLDEV_CMD_PORT_GET] = {
.doit = nldev_port_get_doit,
.dump = nldev_port_get_dumpit,
@@ -1121,28 +1338,23 @@ static const struct rdma_nl_cbs nldev_cb_table[RDMA_NLDEV_NUM_OPS] = {
.dump = nldev_res_get_dumpit,
},
[RDMA_NLDEV_CMD_RES_QP_GET] = {
+ .doit = nldev_res_get_qp_doit,
.dump = nldev_res_get_qp_dumpit,
- /*
- * .doit is not implemented yet for two reasons:
- * 1. It is not needed yet.
- * 2. There is a need to provide identifier, while it is easy
- * for the QPs (device index + port index + LQPN), it is not
- * the case for the rest of resources (PD and CQ). Because it
- * is better to provide similar interface for all resources,
- * let's wait till we will have other resources implemented
- * too.
- */
},
[RDMA_NLDEV_CMD_RES_CM_ID_GET] = {
+ .doit = nldev_res_get_cm_id_doit,
.dump = nldev_res_get_cm_id_dumpit,
},
[RDMA_NLDEV_CMD_RES_CQ_GET] = {
+ .doit = nldev_res_get_cq_doit,
.dump = nldev_res_get_cq_dumpit,
},
[RDMA_NLDEV_CMD_RES_MR_GET] = {
+ .doit = nldev_res_get_mr_doit,
.dump = nldev_res_get_mr_dumpit,
},
[RDMA_NLDEV_CMD_RES_PD_GET] = {
+ .doit = nldev_res_get_pd_doit,
.dump = nldev_res_get_pd_dumpit,
},
};
diff --git a/drivers/infiniband/core/rdma_core.c b/drivers/infiniband/core/rdma_core.c
index 6c4747e61d2b..778375ff664e 100644
--- a/drivers/infiniband/core/rdma_core.c
+++ b/drivers/infiniband/core/rdma_core.c
@@ -438,6 +438,38 @@ free:
uverbs_uobject_put(uobj);
return ERR_PTR(ret);
}
+struct ib_uobject *_uobj_get_read(enum uverbs_default_objects type,
+ u32 object_id,
+ struct uverbs_attr_bundle *attrs)
+{
+ struct ib_uobject *uobj;
+
+ uobj = rdma_lookup_get_uobject(uobj_get_type(attrs, type), attrs->ufile,
+ object_id, UVERBS_LOOKUP_READ);
+ if (IS_ERR(uobj))
+ return uobj;
+
+ attrs->context = uobj->context;
+
+ return uobj;
+}
+
+struct ib_uobject *_uobj_get_write(enum uverbs_default_objects type,
+ u32 object_id,
+ struct uverbs_attr_bundle *attrs)
+{
+ struct ib_uobject *uobj;
+
+ uobj = rdma_lookup_get_uobject(uobj_get_type(attrs, type), attrs->ufile,
+ object_id, UVERBS_LOOKUP_WRITE);
+
+ if (IS_ERR(uobj))
+ return uobj;
+
+ attrs->context = uobj->context;
+
+ return uobj;
+}
static struct ib_uobject *
alloc_begin_idr_uobject(const struct uverbs_api_object *obj,
@@ -801,6 +833,7 @@ void uverbs_close_fd(struct file *f)
/* Pairs with filp->private_data in alloc_begin_fd_uobject */
uverbs_uobject_put(uobj);
}
+EXPORT_SYMBOL(uverbs_close_fd);
/*
* Drop the ucontext off the ufile and completely disconnect it from the
@@ -811,7 +844,6 @@ static void ufile_destroy_ucontext(struct ib_uverbs_file *ufile,
{
struct ib_ucontext *ucontext = ufile->ucontext;
struct ib_device *ib_dev = ucontext->device;
- int ret;
/*
* If we are closing the FD then the user mmap VMAs must have
@@ -829,12 +861,8 @@ static void ufile_destroy_ucontext(struct ib_uverbs_file *ufile,
rdma_restrack_del(&ucontext->res);
- /*
- * FIXME: Drivers are not permitted to fail dealloc_ucontext, remove
- * the error return.
- */
- ret = ib_dev->ops.dealloc_ucontext(ucontext);
- WARN_ON(ret);
+ ib_dev->ops.dealloc_ucontext(ucontext);
+ kfree(ucontext);
ufile->ucontext = NULL;
}
diff --git a/drivers/infiniband/core/restrack.c b/drivers/infiniband/core/restrack.c
index 46a5c553c624..fa804093fafb 100644
--- a/drivers/infiniband/core/restrack.c
+++ b/drivers/infiniband/core/restrack.c
@@ -11,17 +11,51 @@
#include <linux/pid_namespace.h>
#include "cma_priv.h"
+#include "restrack.h"
-static int fill_res_noop(struct sk_buff *msg,
- struct rdma_restrack_entry *entry)
+static int rt_xa_alloc_cyclic(struct xarray *xa, u32 *id, void *entry,
+ u32 *next)
{
- return 0;
+ int err;
+
+ *id = *next;
+ if (*next == U32_MAX)
+ *id = 0;
+
+ xa_lock(xa);
+ err = __xa_alloc(xa, id, U32_MAX, entry, GFP_KERNEL);
+ if (err && *next != U32_MAX) {
+ *id = 0;
+ err = __xa_alloc(xa, id, *next, entry, GFP_KERNEL);
+ }
+
+ if (!err)
+ *next = *id + 1;
+ xa_unlock(xa);
+ return err;
}
-void rdma_restrack_init(struct rdma_restrack_root *res)
+/**
+ * rdma_restrack_init() - initialize and allocate resource tracking
+ * @dev: IB device
+ *
+ * Return: 0 on success
+ */
+int rdma_restrack_init(struct ib_device *dev)
{
- init_rwsem(&res->rwsem);
- res->fill_res_entry = fill_res_noop;
+ struct rdma_restrack_root *rt;
+ int i;
+
+ dev->res = kcalloc(RDMA_RESTRACK_MAX, sizeof(*rt), GFP_KERNEL);
+ if (!dev->res)
+ return -ENOMEM;
+
+ rt = dev->res;
+
+ for (i = 0; i < RDMA_RESTRACK_MAX; i++)
+ xa_init_flags(&rt[i].xa, XA_FLAGS_ALLOC);
+
+ return 0;
}
static const char *type2str(enum rdma_restrack_type type)
@@ -38,55 +72,79 @@ static const char *type2str(enum rdma_restrack_type type)
return names[type];
};
-void rdma_restrack_clean(struct rdma_restrack_root *res)
+/**
+ * rdma_restrack_clean() - clean resource tracking
+ * @dev: IB device
+ */
+void rdma_restrack_clean(struct ib_device *dev)
{
+ struct rdma_restrack_root *rt = dev->res;
struct rdma_restrack_entry *e;
char buf[TASK_COMM_LEN];
- struct ib_device *dev;
+ bool found = false;
const char *owner;
- int bkt;
-
- if (hash_empty(res->hash))
- return;
-
- dev = container_of(res, struct ib_device, res);
- pr_err("restrack: %s", CUT_HERE);
- dev_err(&dev->dev, "BUG: RESTRACK detected leak of resources\n");
- hash_for_each(res->hash, bkt, e, node) {
- if (rdma_is_kernel_res(e)) {
- owner = e->kern_name;
- } else {
- /*
- * There is no need to call get_task_struct here,
- * because we can be here only if there are more
- * get_task_struct() call than put_task_struct().
- */
- get_task_comm(buf, e->task);
- owner = buf;
+ int i;
+
+ for (i = 0 ; i < RDMA_RESTRACK_MAX; i++) {
+ struct xarray *xa = &dev->res[i].xa;
+
+ if (!xa_empty(xa)) {
+ unsigned long index;
+
+ if (!found) {
+ pr_err("restrack: %s", CUT_HERE);
+ dev_err(&dev->dev, "BUG: RESTRACK detected leak of resources\n");
+ }
+ xa_for_each(xa, index, e) {
+ if (rdma_is_kernel_res(e)) {
+ owner = e->kern_name;
+ } else {
+ /*
+ * There is no need to call get_task_struct here,
+ * because we can be here only if there are more
+ * get_task_struct() call than put_task_struct().
+ */
+ get_task_comm(buf, e->task);
+ owner = buf;
+ }
+
+ pr_err("restrack: %s %s object allocated by %s is not freed\n",
+ rdma_is_kernel_res(e) ? "Kernel" :
+ "User",
+ type2str(e->type), owner);
+ }
+ found = true;
}
-
- pr_err("restrack: %s %s object allocated by %s is not freed\n",
- rdma_is_kernel_res(e) ? "Kernel" : "User",
- type2str(e->type), owner);
+ xa_destroy(xa);
}
- pr_err("restrack: %s", CUT_HERE);
+ if (found)
+ pr_err("restrack: %s", CUT_HERE);
+
+ kfree(rt);
}
-int rdma_restrack_count(struct rdma_restrack_root *res,
- enum rdma_restrack_type type,
+/**
+ * rdma_restrack_count() - the current usage of specific object
+ * @dev: IB device
+ * @type: actual type of object to operate
+ * @ns: PID namespace
+ */
+int rdma_restrack_count(struct ib_device *dev, enum rdma_restrack_type type,
struct pid_namespace *ns)
{
+ struct rdma_restrack_root *rt = &dev->res[type];
struct rdma_restrack_entry *e;
+ XA_STATE(xas, &rt->xa, 0);
u32 cnt = 0;
- down_read(&res->rwsem);
- hash_for_each_possible(res->hash, e, node, type) {
+ xa_lock(&rt->xa);
+ xas_for_each(&xas, e, U32_MAX) {
if (ns == &init_pid_ns ||
(!rdma_is_kernel_res(e) &&
ns == task_active_pid_ns(e->task)))
cnt++;
}
- up_read(&res->rwsem);
+ xa_unlock(&rt->xa);
return cnt;
}
EXPORT_SYMBOL(rdma_restrack_count);
@@ -157,28 +215,28 @@ EXPORT_SYMBOL(rdma_restrack_set_task);
static void rdma_restrack_add(struct rdma_restrack_entry *res)
{
struct ib_device *dev = res_to_dev(res);
+ struct rdma_restrack_root *rt;
+ int ret;
if (!dev)
return;
- if (res->type != RDMA_RESTRACK_CM_ID || rdma_is_kernel_res(res))
- res->task = NULL;
-
- if (!rdma_is_kernel_res(res)) {
- if (!res->task)
- rdma_restrack_set_task(res, NULL);
- res->kern_name = NULL;
- } else {
- set_kern_name(res);
- }
+ rt = &dev->res[res->type];
kref_init(&res->kref);
init_completion(&res->comp);
- res->valid = true;
+ if (res->type != RDMA_RESTRACK_QP)
+ ret = rt_xa_alloc_cyclic(&rt->xa, &res->id, res, &rt->next_id);
+ else {
+ /* Special case to ensure that LQPN points to right QP */
+ struct ib_qp *qp = container_of(res, struct ib_qp, res);
+
+ ret = xa_insert(&rt->xa, qp->qp_num, res, GFP_KERNEL);
+ res->id = ret ? 0 : qp->qp_num;
+ }
- down_write(&dev->res.rwsem);
- hash_add(dev->res.hash, &res->node, res->type);
- up_write(&dev->res.rwsem);
+ if (!ret)
+ res->valid = true;
}
/**
@@ -187,6 +245,8 @@ static void rdma_restrack_add(struct rdma_restrack_entry *res)
*/
void rdma_restrack_kadd(struct rdma_restrack_entry *res)
{
+ res->task = NULL;
+ set_kern_name(res);
res->user = false;
rdma_restrack_add(res);
}
@@ -198,6 +258,13 @@ EXPORT_SYMBOL(rdma_restrack_kadd);
*/
void rdma_restrack_uadd(struct rdma_restrack_entry *res)
{
+ if (res->type != RDMA_RESTRACK_CM_ID)
+ res->task = NULL;
+
+ if (!res->task)
+ rdma_restrack_set_task(res, NULL);
+ res->kern_name = NULL;
+
res->user = true;
rdma_restrack_add(res);
}
@@ -209,6 +276,31 @@ int __must_check rdma_restrack_get(struct rdma_restrack_entry *res)
}
EXPORT_SYMBOL(rdma_restrack_get);
+/**
+ * rdma_restrack_get_byid() - translate from ID to restrack object
+ * @dev: IB device
+ * @type: resource track type
+ * @id: ID to take a look
+ *
+ * Return: Pointer to restrack entry or -ENOENT in case of error.
+ */
+struct rdma_restrack_entry *
+rdma_restrack_get_byid(struct ib_device *dev,
+ enum rdma_restrack_type type, u32 id)
+{
+ struct rdma_restrack_root *rt = &dev->res[type];
+ struct rdma_restrack_entry *res;
+
+ xa_lock(&rt->xa);
+ res = xa_load(&rt->xa, id);
+ if (!res || !rdma_restrack_get(res))
+ res = ERR_PTR(-ENOENT);
+ xa_unlock(&rt->xa);
+
+ return res;
+}
+EXPORT_SYMBOL(rdma_restrack_get_byid);
+
static void restrack_release(struct kref *kref)
{
struct rdma_restrack_entry *res;
@@ -225,23 +317,25 @@ EXPORT_SYMBOL(rdma_restrack_put);
void rdma_restrack_del(struct rdma_restrack_entry *res)
{
+ struct rdma_restrack_entry *old;
+ struct rdma_restrack_root *rt;
struct ib_device *dev;
if (!res->valid)
goto out;
dev = res_to_dev(res);
- if (!dev)
+ if (WARN_ON(!dev))
return;
- rdma_restrack_put(res);
+ rt = &dev->res[res->type];
- wait_for_completion(&res->comp);
-
- down_write(&dev->res.rwsem);
- hash_del(&res->node);
+ old = xa_erase(&rt->xa, res->id);
+ WARN_ON(old != res);
res->valid = false;
- up_write(&dev->res.rwsem);
+
+ rdma_restrack_put(res);
+ wait_for_completion(&res->comp);
out:
if (res->task) {
diff --git a/drivers/infiniband/core/restrack.h b/drivers/infiniband/core/restrack.h
new file mode 100644
index 000000000000..09a1fbdf578e
--- /dev/null
+++ b/drivers/infiniband/core/restrack.h
@@ -0,0 +1,28 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/*
+ * Copyright (c) 2017-2019 Mellanox Technologies. All rights reserved.
+ */
+
+#ifndef _RDMA_CORE_RESTRACK_H_
+#define _RDMA_CORE_RESTRACK_H_
+
+#include <linux/mutex.h>
+
+/**
+ * struct rdma_restrack_root - main resource tracking management
+ * entity, per-device
+ */
+struct rdma_restrack_root {
+ /**
+ * @xa: Array of XArray structure to hold restrack entries.
+ */
+ struct xarray xa;
+ /**
+ * @next_id: Next ID to support cyclic allocation
+ */
+ u32 next_id;
+};
+
+int rdma_restrack_init(struct ib_device *dev);
+void rdma_restrack_clean(struct ib_device *dev);
+#endif /* _RDMA_CORE_RESTRACK_H_ */
diff --git a/drivers/infiniband/core/rw.c b/drivers/infiniband/core/rw.c
index d22c4a2ebac6..89a5be3a2f97 100644
--- a/drivers/infiniband/core/rw.c
+++ b/drivers/infiniband/core/rw.c
@@ -179,7 +179,6 @@ static int rdma_rw_init_map_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
struct scatterlist *sg, u32 sg_cnt, u32 offset,
u64 remote_addr, u32 rkey, enum dma_data_direction dir)
{
- struct ib_device *dev = qp->pd->device;
u32 max_sge = dir == DMA_TO_DEVICE ? qp->max_write_sge :
qp->max_read_sge;
struct ib_sge *sge;
@@ -209,8 +208,8 @@ static int rdma_rw_init_map_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
rdma_wr->wr.sg_list = sge;
for (j = 0; j < nr_sge; j++, sg = sg_next(sg)) {
- sge->addr = ib_sg_dma_address(dev, sg) + offset;
- sge->length = ib_sg_dma_len(dev, sg) - offset;
+ sge->addr = sg_dma_address(sg) + offset;
+ sge->length = sg_dma_len(sg) - offset;
sge->lkey = qp->pd->local_dma_lkey;
total_len += sge->length;
@@ -236,14 +235,13 @@ static int rdma_rw_init_single_wr(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
struct scatterlist *sg, u32 offset, u64 remote_addr, u32 rkey,
enum dma_data_direction dir)
{
- struct ib_device *dev = qp->pd->device;
struct ib_rdma_wr *rdma_wr = &ctx->single.wr;
ctx->nr_ops = 1;
ctx->single.sge.lkey = qp->pd->local_dma_lkey;
- ctx->single.sge.addr = ib_sg_dma_address(dev, sg) + offset;
- ctx->single.sge.length = ib_sg_dma_len(dev, sg) - offset;
+ ctx->single.sge.addr = sg_dma_address(sg) + offset;
+ ctx->single.sge.length = sg_dma_len(sg) - offset;
memset(rdma_wr, 0, sizeof(*rdma_wr));
if (dir == DMA_TO_DEVICE)
@@ -294,7 +292,7 @@ int rdma_rw_ctx_init(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u8 port_num,
* Skip to the S/G entry that sg_offset falls into:
*/
for (;;) {
- u32 len = ib_sg_dma_len(dev, sg);
+ u32 len = sg_dma_len(sg);
if (sg_offset < len)
break;
diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c
index 97e6d7b69abf..7925e45ea88a 100644
--- a/drivers/infiniband/core/sa_query.c
+++ b/drivers/infiniband/core/sa_query.c
@@ -2342,9 +2342,7 @@ static void ib_sa_add_one(struct ib_device *device)
s = rdma_start_port(device);
e = rdma_end_port(device);
- sa_dev = kzalloc(sizeof *sa_dev +
- (e - s + 1) * sizeof (struct ib_sa_port),
- GFP_KERNEL);
+ sa_dev = kzalloc(struct_size(sa_dev, port, e - s + 1), GFP_KERNEL);
if (!sa_dev)
return;
diff --git a/drivers/infiniband/core/security.c b/drivers/infiniband/core/security.c
index 1efadbccf394..1ab423b19f77 100644
--- a/drivers/infiniband/core/security.c
+++ b/drivers/infiniband/core/security.c
@@ -39,22 +39,25 @@
#include "core_priv.h"
#include "mad_priv.h"
+static LIST_HEAD(mad_agent_list);
+/* Lock to protect mad_agent_list */
+static DEFINE_SPINLOCK(mad_agent_list_lock);
+
static struct pkey_index_qp_list *get_pkey_idx_qp_list(struct ib_port_pkey *pp)
{
struct pkey_index_qp_list *pkey = NULL;
struct pkey_index_qp_list *tmp_pkey;
struct ib_device *dev = pp->sec->dev;
- spin_lock(&dev->port_pkey_list[pp->port_num].list_lock);
- list_for_each_entry(tmp_pkey,
- &dev->port_pkey_list[pp->port_num].pkey_list,
- pkey_index_list) {
+ spin_lock(&dev->port_data[pp->port_num].pkey_list_lock);
+ list_for_each_entry (tmp_pkey, &dev->port_data[pp->port_num].pkey_list,
+ pkey_index_list) {
if (tmp_pkey->pkey_index == pp->pkey_index) {
pkey = tmp_pkey;
break;
}
}
- spin_unlock(&dev->port_pkey_list[pp->port_num].list_lock);
+ spin_unlock(&dev->port_data[pp->port_num].pkey_list_lock);
return pkey;
}
@@ -259,12 +262,12 @@ static int port_pkey_list_insert(struct ib_port_pkey *pp)
if (!pkey)
return -ENOMEM;
- spin_lock(&dev->port_pkey_list[port_num].list_lock);
+ spin_lock(&dev->port_data[port_num].pkey_list_lock);
/* Check for the PKey again. A racing process may
* have created it.
*/
list_for_each_entry(tmp_pkey,
- &dev->port_pkey_list[port_num].pkey_list,
+ &dev->port_data[port_num].pkey_list,
pkey_index_list) {
if (tmp_pkey->pkey_index == pp->pkey_index) {
kfree(pkey);
@@ -279,9 +282,9 @@ static int port_pkey_list_insert(struct ib_port_pkey *pp)
spin_lock_init(&pkey->qp_list_lock);
INIT_LIST_HEAD(&pkey->qp_list);
list_add(&pkey->pkey_index_list,
- &dev->port_pkey_list[port_num].pkey_list);
+ &dev->port_data[port_num].pkey_list);
}
- spin_unlock(&dev->port_pkey_list[port_num].list_lock);
+ spin_unlock(&dev->port_data[port_num].pkey_list_lock);
}
spin_lock(&pkey->qp_list_lock);
@@ -418,12 +421,15 @@ void ib_close_shared_qp_security(struct ib_qp_security *sec)
int ib_create_qp_security(struct ib_qp *qp, struct ib_device *dev)
{
- u8 i = rdma_start_port(dev);
+ unsigned int i;
bool is_ib = false;
int ret;
- while (i <= rdma_end_port(dev) && !is_ib)
+ rdma_for_each_port (dev, i) {
is_ib = rdma_protocol_ib(dev, i++);
+ if (is_ib)
+ break;
+ }
/* If this isn't an IB device don't create the security context */
if (!is_ib)
@@ -544,9 +550,8 @@ void ib_security_cache_change(struct ib_device *device,
{
struct pkey_index_qp_list *pkey;
- list_for_each_entry(pkey,
- &device->port_pkey_list[port_num].pkey_list,
- pkey_index_list) {
+ list_for_each_entry (pkey, &device->port_data[port_num].pkey_list,
+ pkey_index_list) {
check_pkey_qps(pkey,
device,
port_num,
@@ -554,21 +559,19 @@ void ib_security_cache_change(struct ib_device *device,
}
}
-void ib_security_destroy_port_pkey_list(struct ib_device *device)
+void ib_security_release_port_pkey_list(struct ib_device *device)
{
struct pkey_index_qp_list *pkey, *tmp_pkey;
- int i;
+ unsigned int i;
- for (i = rdma_start_port(device); i <= rdma_end_port(device); i++) {
- spin_lock(&device->port_pkey_list[i].list_lock);
+ rdma_for_each_port (device, i) {
list_for_each_entry_safe(pkey,
tmp_pkey,
- &device->port_pkey_list[i].pkey_list,
+ &device->port_data[i].pkey_list,
pkey_index_list) {
list_del(&pkey->pkey_index_list);
kfree(pkey);
}
- spin_unlock(&device->port_pkey_list[i].list_lock);
}
}
@@ -676,19 +679,18 @@ static int ib_security_pkey_access(struct ib_device *dev,
return security_ib_pkey_access(sec, subnet_prefix, pkey);
}
-static int ib_mad_agent_security_change(struct notifier_block *nb,
- unsigned long event,
- void *data)
+void ib_mad_agent_security_change(void)
{
- struct ib_mad_agent *ag = container_of(nb, struct ib_mad_agent, lsm_nb);
-
- if (event != LSM_POLICY_CHANGE)
- return NOTIFY_DONE;
-
- ag->smp_allowed = !security_ib_endport_manage_subnet(
- ag->security, dev_name(&ag->device->dev), ag->port_num);
-
- return NOTIFY_OK;
+ struct ib_mad_agent *ag;
+
+ spin_lock(&mad_agent_list_lock);
+ list_for_each_entry(ag,
+ &mad_agent_list,
+ mad_agent_sec_list)
+ WRITE_ONCE(ag->smp_allowed,
+ !security_ib_endport_manage_subnet(ag->security,
+ dev_name(&ag->device->dev), ag->port_num));
+ spin_unlock(&mad_agent_list_lock);
}
int ib_mad_agent_security_setup(struct ib_mad_agent *agent,
@@ -699,6 +701,8 @@ int ib_mad_agent_security_setup(struct ib_mad_agent *agent,
if (!rdma_protocol_ib(agent->device, agent->port_num))
return 0;
+ INIT_LIST_HEAD(&agent->mad_agent_sec_list);
+
ret = security_ib_alloc_security(&agent->security);
if (ret)
return ret;
@@ -706,20 +710,22 @@ int ib_mad_agent_security_setup(struct ib_mad_agent *agent,
if (qp_type != IB_QPT_SMI)
return 0;
+ spin_lock(&mad_agent_list_lock);
ret = security_ib_endport_manage_subnet(agent->security,
dev_name(&agent->device->dev),
agent->port_num);
if (ret)
- return ret;
+ goto free_security;
- agent->lsm_nb.notifier_call = ib_mad_agent_security_change;
- ret = register_lsm_notifier(&agent->lsm_nb);
- if (ret)
- return ret;
-
- agent->smp_allowed = true;
- agent->lsm_nb_reg = true;
+ WRITE_ONCE(agent->smp_allowed, true);
+ list_add(&agent->mad_agent_sec_list, &mad_agent_list);
+ spin_unlock(&mad_agent_list_lock);
return 0;
+
+free_security:
+ spin_unlock(&mad_agent_list_lock);
+ security_ib_free_security(agent->security);
+ return ret;
}
void ib_mad_agent_security_cleanup(struct ib_mad_agent *agent)
@@ -727,9 +733,13 @@ void ib_mad_agent_security_cleanup(struct ib_mad_agent *agent)
if (!rdma_protocol_ib(agent->device, agent->port_num))
return;
+ if (agent->qp->qp_type == IB_QPT_SMI) {
+ spin_lock(&mad_agent_list_lock);
+ list_del(&agent->mad_agent_sec_list);
+ spin_unlock(&mad_agent_list_lock);
+ }
+
security_ib_free_security(agent->security);
- if (agent->lsm_nb_reg)
- unregister_lsm_notifier(&agent->lsm_nb);
}
int ib_mad_enforce_security(struct ib_mad_agent_private *map, u16 pkey_index)
@@ -738,7 +748,7 @@ int ib_mad_enforce_security(struct ib_mad_agent_private *map, u16 pkey_index)
return 0;
if (map->agent.qp->qp_type == IB_QPT_SMI) {
- if (!map->agent.smp_allowed)
+ if (!READ_ONCE(map->agent.smp_allowed))
return -EACCES;
return 0;
}
diff --git a/drivers/infiniband/core/sysfs.c b/drivers/infiniband/core/sysfs.c
index 80f68eb0ba5c..9b6a065bdfa5 100644
--- a/drivers/infiniband/core/sysfs.c
+++ b/drivers/infiniband/core/sysfs.c
@@ -1015,9 +1015,7 @@ err_free_stats:
return;
}
-static int add_port(struct ib_device *device, int port_num,
- int (*port_callback)(struct ib_device *,
- u8, struct kobject *))
+static int add_port(struct ib_device *device, int port_num)
{
struct ib_port *p;
struct ib_port_attr attr;
@@ -1113,8 +1111,8 @@ static int add_port(struct ib_device *device, int port_num,
if (ret)
goto err_free_pkey;
- if (port_callback) {
- ret = port_callback(device, port_num, &p->kobj);
+ if (device->ops.init_port) {
+ ret = device->ops.init_port(device, port_num, &p->kobj);
if (ret)
goto err_remove_pkey;
}
@@ -1189,7 +1187,7 @@ err_put:
static ssize_t node_type_show(struct device *device,
struct device_attribute *attr, char *buf)
{
- struct ib_device *dev = container_of(device, struct ib_device, dev);
+ struct ib_device *dev = rdma_device_to_ibdev(device);
switch (dev->node_type) {
case RDMA_NODE_IB_CA: return sprintf(buf, "%d: CA\n", dev->node_type);
@@ -1206,7 +1204,7 @@ static DEVICE_ATTR_RO(node_type);
static ssize_t sys_image_guid_show(struct device *device,
struct device_attribute *dev_attr, char *buf)
{
- struct ib_device *dev = container_of(device, struct ib_device, dev);
+ struct ib_device *dev = rdma_device_to_ibdev(device);
return sprintf(buf, "%04x:%04x:%04x:%04x\n",
be16_to_cpu(((__be16 *) &dev->attrs.sys_image_guid)[0]),
@@ -1219,7 +1217,7 @@ static DEVICE_ATTR_RO(sys_image_guid);
static ssize_t node_guid_show(struct device *device,
struct device_attribute *attr, char *buf)
{
- struct ib_device *dev = container_of(device, struct ib_device, dev);
+ struct ib_device *dev = rdma_device_to_ibdev(device);
return sprintf(buf, "%04x:%04x:%04x:%04x\n",
be16_to_cpu(((__be16 *) &dev->node_guid)[0]),
@@ -1232,7 +1230,7 @@ static DEVICE_ATTR_RO(node_guid);
static ssize_t node_desc_show(struct device *device,
struct device_attribute *attr, char *buf)
{
- struct ib_device *dev = container_of(device, struct ib_device, dev);
+ struct ib_device *dev = rdma_device_to_ibdev(device);
return sprintf(buf, "%.64s\n", dev->node_desc);
}
@@ -1241,7 +1239,7 @@ static ssize_t node_desc_store(struct device *device,
struct device_attribute *attr,
const char *buf, size_t count)
{
- struct ib_device *dev = container_of(device, struct ib_device, dev);
+ struct ib_device *dev = rdma_device_to_ibdev(device);
struct ib_device_modify desc = {};
int ret;
@@ -1260,7 +1258,7 @@ static DEVICE_ATTR_RW(node_desc);
static ssize_t fw_ver_show(struct device *device, struct device_attribute *attr,
char *buf)
{
- struct ib_device *dev = container_of(device, struct ib_device, dev);
+ struct ib_device *dev = rdma_device_to_ibdev(device);
ib_get_device_fw_str(dev, buf);
strlcat(buf, "\n", IB_FW_VERSION_NAME_MAX);
@@ -1277,21 +1275,21 @@ static struct attribute *ib_dev_attrs[] = {
NULL,
};
-static const struct attribute_group dev_attr_group = {
+const struct attribute_group ib_dev_attr_group = {
.attrs = ib_dev_attrs,
};
-static void free_port_list_attributes(struct ib_device *device)
+static void ib_free_port_attrs(struct ib_device *device)
{
struct kobject *p, *t;
list_for_each_entry_safe(p, t, &device->port_list, entry) {
struct ib_port *port = container_of(p, struct ib_port, kobj);
+
list_del(&p->entry);
- if (port->hw_stats) {
- kfree(port->hw_stats);
+ if (port->hw_stats_ag)
free_hsag(&port->kobj, port->hw_stats_ag);
- }
+ kfree(port->hw_stats);
if (port->pma_table)
sysfs_remove_group(p, port->pma_table);
@@ -1308,62 +1306,47 @@ static void free_port_list_attributes(struct ib_device *device)
kobject_put(device->ports_kobj);
}
-int ib_device_register_sysfs(struct ib_device *device,
- int (*port_callback)(struct ib_device *,
- u8, struct kobject *))
+static int ib_setup_port_attrs(struct ib_device *device)
{
- struct device *class_dev = &device->dev;
+ unsigned int port;
int ret;
- int i;
-
- device->groups[0] = &dev_attr_group;
- class_dev->groups = device->groups;
- ret = device_add(class_dev);
- if (ret)
- goto err;
-
- device->ports_kobj = kobject_create_and_add("ports", &class_dev->kobj);
- if (!device->ports_kobj) {
- ret = -ENOMEM;
- goto err_put;
- }
+ device->ports_kobj = kobject_create_and_add("ports", &device->dev.kobj);
+ if (!device->ports_kobj)
+ return -ENOMEM;
- if (rdma_cap_ib_switch(device)) {
- ret = add_port(device, 0, port_callback);
+ rdma_for_each_port (device, port) {
+ ret = add_port(device, port);
if (ret)
goto err_put;
- } else {
- for (i = 1; i <= device->phys_port_cnt; ++i) {
- ret = add_port(device, i, port_callback);
- if (ret)
- goto err_put;
- }
}
- if (device->ops.alloc_hw_stats)
- setup_hw_stats(device, NULL, 0);
-
return 0;
err_put:
- free_port_list_attributes(device);
- device_del(class_dev);
-err:
+ ib_free_port_attrs(device);
return ret;
}
-void ib_device_unregister_sysfs(struct ib_device *device)
+int ib_device_register_sysfs(struct ib_device *device)
{
- /* Hold device until ib_dealloc_device() */
- get_device(&device->dev);
+ int ret;
+
+ ret = ib_setup_port_attrs(device);
+ if (ret)
+ return ret;
+
+ if (device->ops.alloc_hw_stats)
+ setup_hw_stats(device, NULL, 0);
- free_port_list_attributes(device);
+ return 0;
+}
- if (device->hw_stats) {
- kfree(device->hw_stats);
+void ib_device_unregister_sysfs(struct ib_device *device)
+{
+ if (device->hw_stats_ag)
free_hsag(&device->dev.kobj, device->hw_stats_ag);
- }
+ kfree(device->hw_stats);
- device_unregister(&device->dev);
+ ib_free_port_attrs(device);
}
diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c
index 01d68ed46c1b..7468b26b8a01 100644
--- a/drivers/infiniband/core/ucma.c
+++ b/drivers/infiniband/core/ucma.c
@@ -1236,6 +1236,13 @@ static int ucma_set_option_id(struct ucma_context *ctx, int optname,
}
ret = rdma_set_afonly(ctx->cm_id, *((int *) optval) ? 1 : 0);
break;
+ case RDMA_OPTION_ID_ACK_TIMEOUT:
+ if (optlen != sizeof(u8)) {
+ ret = -EINVAL;
+ break;
+ }
+ ret = rdma_set_ack_timeout(ctx->cm_id, *((u8 *)optval));
+ break;
default:
ret = -ENOSYS;
}
diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c
index c6144df47ea4..fe5551562dbc 100644
--- a/drivers/infiniband/core/umem.c
+++ b/drivers/infiniband/core/umem.c
@@ -72,15 +72,16 @@ static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int d
* If access flags indicate ODP memory, avoid pinning. Instead, stores
* the mm for future page fault handling in conjunction with MMU notifiers.
*
- * @context: userspace context to pin memory for
+ * @udata: userspace context to pin memory for
* @addr: userspace virtual address to start at
* @size: length of region to pin
* @access: IB_ACCESS_xxx flags for memory being pinned
* @dmasync: flush in-flight DMA when the memory region is written
*/
-struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr,
+struct ib_umem *ib_umem_get(struct ib_udata *udata, unsigned long addr,
size_t size, int access, int dmasync)
{
+ struct ib_ucontext *context;
struct ib_umem *umem;
struct page **page_list;
struct vm_area_struct **vma_list;
@@ -95,6 +96,14 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr,
struct scatterlist *sg, *sg_list_start;
unsigned int gup_flags = FOLL_WRITE;
+ if (!udata)
+ return ERR_PTR(-EIO);
+
+ context = container_of(udata, struct uverbs_attr_bundle, driver_udata)
+ ->context;
+ if (!context)
+ return ERR_PTR(-EIO);
+
if (dmasync)
dma_attrs |= DMA_ATTR_WRITE_BARRIER;
@@ -160,15 +169,12 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr,
lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
- down_write(&mm->mmap_sem);
- if (check_add_overflow(mm->pinned_vm, npages, &new_pinned) ||
- (new_pinned > lock_limit && !capable(CAP_IPC_LOCK))) {
- up_write(&mm->mmap_sem);
+ new_pinned = atomic64_add_return(npages, &mm->pinned_vm);
+ if (new_pinned > lock_limit && !capable(CAP_IPC_LOCK)) {
+ atomic64_sub(npages, &mm->pinned_vm);
ret = -ENOMEM;
goto out;
}
- mm->pinned_vm = new_pinned;
- up_write(&mm->mmap_sem);
cur_base = addr & PAGE_MASK;
@@ -228,9 +234,7 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr,
umem_release:
__ib_umem_release(context->device, umem, 0);
vma:
- down_write(&mm->mmap_sem);
- mm->pinned_vm -= ib_umem_num_pages(umem);
- up_write(&mm->mmap_sem);
+ atomic64_sub(ib_umem_num_pages(umem), &mm->pinned_vm);
out:
if (vma_list)
free_page((unsigned long) vma_list);
@@ -253,25 +257,12 @@ static void __ib_umem_release_tail(struct ib_umem *umem)
kfree(umem);
}
-static void ib_umem_release_defer(struct work_struct *work)
-{
- struct ib_umem *umem = container_of(work, struct ib_umem, work);
-
- down_write(&umem->owning_mm->mmap_sem);
- umem->owning_mm->pinned_vm -= ib_umem_num_pages(umem);
- up_write(&umem->owning_mm->mmap_sem);
-
- __ib_umem_release_tail(umem);
-}
-
/**
* ib_umem_release - release memory pinned with ib_umem_get
* @umem: umem struct to release
*/
void ib_umem_release(struct ib_umem *umem)
{
- struct ib_ucontext *context = umem->context;
-
if (umem->is_odp) {
ib_umem_odp_release(to_ib_umem_odp(umem));
__ib_umem_release_tail(umem);
@@ -280,26 +271,7 @@ void ib_umem_release(struct ib_umem *umem)
__ib_umem_release(umem->context->device, umem, 1);
- /*
- * We may be called with the mm's mmap_sem already held. This
- * can happen when a userspace munmap() is the call that drops
- * the last reference to our file and calls our release
- * method. If there are memory regions to destroy, we'll end
- * up here and not be able to take the mmap_sem. In that case
- * we defer the vm_locked accounting a workqueue.
- */
- if (context->closing) {
- if (!down_write_trylock(&umem->owning_mm->mmap_sem)) {
- INIT_WORK(&umem->work, ib_umem_release_defer);
- queue_work(ib_wq, &umem->work);
- return;
- }
- } else {
- down_write(&umem->owning_mm->mmap_sem);
- }
- umem->owning_mm->pinned_vm -= ib_umem_num_pages(umem);
- up_write(&umem->owning_mm->mmap_sem);
-
+ atomic64_sub(ib_umem_num_pages(umem), &umem->owning_mm->pinned_vm);
__ib_umem_release_tail(umem);
}
EXPORT_SYMBOL(ib_umem_release);
diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c
index acb882f279cb..e6ec79ad9cc8 100644
--- a/drivers/infiniband/core/umem_odp.c
+++ b/drivers/infiniband/core/umem_odp.c
@@ -40,6 +40,7 @@
#include <linux/vmalloc.h>
#include <linux/hugetlb.h>
#include <linux/interval_tree_generic.h>
+#include <linux/pagemap.h>
#include <rdma/ib_verbs.h>
#include <rdma/ib_umem.h>
@@ -299,7 +300,7 @@ static void free_per_mm(struct rcu_head *rcu)
kfree(container_of(rcu, struct ib_ucontext_per_mm, rcu));
}
-void put_per_mm(struct ib_umem_odp *umem_odp)
+static void put_per_mm(struct ib_umem_odp *umem_odp)
{
struct ib_ucontext_per_mm *per_mm = umem_odp->per_mm;
struct ib_ucontext *ctx = umem_odp->umem.context;
@@ -332,9 +333,10 @@ void put_per_mm(struct ib_umem_odp *umem_odp)
mmu_notifier_call_srcu(&per_mm->rcu, free_per_mm);
}
-struct ib_umem_odp *ib_alloc_odp_umem(struct ib_ucontext_per_mm *per_mm,
+struct ib_umem_odp *ib_alloc_odp_umem(struct ib_umem_odp *root,
unsigned long addr, size_t size)
{
+ struct ib_ucontext_per_mm *per_mm = root->per_mm;
struct ib_ucontext *ctx = per_mm->context;
struct ib_umem_odp *odp_data;
struct ib_umem *umem;
@@ -349,7 +351,7 @@ struct ib_umem_odp *ib_alloc_odp_umem(struct ib_ucontext_per_mm *per_mm,
umem->length = size;
umem->address = addr;
umem->page_shift = PAGE_SHIFT;
- umem->writable = 1;
+ umem->writable = root->umem.writable;
umem->is_odp = 1;
odp_data->per_mm = per_mm;
umem->owning_mm = per_mm->mm;
@@ -617,7 +619,7 @@ int ib_umem_odp_map_dma_pages(struct ib_umem_odp *umem_odp, u64 user_virt,
* mmget_not_zero will fail in this case.
*/
owning_process = get_pid_task(umem_odp->per_mm->tgid, PIDTYPE_PID);
- if (WARN_ON(!mmget_not_zero(umem_odp->umem.owning_mm))) {
+ if (!owning_process || !mmget_not_zero(owning_mm)) {
ret = -EINVAL;
goto out_put_task;
}
@@ -684,9 +686,14 @@ int ib_umem_odp_map_dma_pages(struct ib_umem_odp *umem_odp, u64 user_virt,
mutex_unlock(&umem_odp->umem_mutex);
if (ret < 0) {
- /* Release left over pages when handling errors. */
- for (++j; j < npages; ++j)
- put_page(local_page_list[j]);
+ /*
+ * Release pages, remembering that the first page
+ * to hit an error was already released by
+ * ib_umem_odp_map_dma_single_page().
+ */
+ if (npages - (j + 1) > 0)
+ release_pages(&local_page_list[j+1],
+ npages - (j + 1));
break;
}
}
diff --git a/drivers/infiniband/core/user_mad.c b/drivers/infiniband/core/user_mad.c
index de8d31ab8945..02b7947ab215 100644
--- a/drivers/infiniband/core/user_mad.c
+++ b/drivers/infiniband/core/user_mad.c
@@ -957,19 +957,22 @@ static int ib_umad_open(struct inode *inode, struct file *filp)
{
struct ib_umad_port *port;
struct ib_umad_file *file;
- int ret = -ENXIO;
+ int ret = 0;
port = container_of(inode->i_cdev, struct ib_umad_port, cdev);
mutex_lock(&port->file_mutex);
- if (!port->ib_dev)
+ if (!port->ib_dev) {
+ ret = -ENXIO;
goto out;
+ }
- ret = -ENOMEM;
- file = kzalloc(sizeof *file, GFP_KERNEL);
- if (!file)
+ file = kzalloc(sizeof(*file), GFP_KERNEL);
+ if (!file) {
+ ret = -ENOMEM;
goto out;
+ }
mutex_init(&file->mutex);
spin_lock_init(&file->send_lock);
@@ -982,14 +985,7 @@ static int ib_umad_open(struct inode *inode, struct file *filp)
list_add_tail(&file->port_list, &port->file_list);
- ret = nonseekable_open(inode, filp);
- if (ret) {
- list_del(&file->port_list);
- kfree(file);
- goto out;
- }
-
- ib_umad_dev_get(port->umad_dev);
+ nonseekable_open(inode, filp);
out:
mutex_unlock(&port->file_mutex);
return ret;
@@ -998,7 +994,6 @@ out:
static int ib_umad_close(struct inode *inode, struct file *filp)
{
struct ib_umad_file *file = filp->private_data;
- struct ib_umad_device *dev = file->port->umad_dev;
struct ib_umad_packet *packet, *tmp;
int already_dead;
int i;
@@ -1027,7 +1022,6 @@ static int ib_umad_close(struct inode *inode, struct file *filp)
mutex_unlock(&file->port->file_mutex);
kfree(file);
- ib_umad_dev_put(dev);
return 0;
}
@@ -1073,17 +1067,9 @@ static int ib_umad_sm_open(struct inode *inode, struct file *filp)
filp->private_data = port;
- ret = nonseekable_open(inode, filp);
- if (ret)
- goto err_clr_sm_cap;
-
- ib_umad_dev_get(port->umad_dev);
+ nonseekable_open(inode, filp);
return 0;
-err_clr_sm_cap:
- swap(props.set_port_cap_mask, props.clr_port_cap_mask);
- ib_modify_port(port->ib_dev, port->port_num, 0, &props);
-
err_up_sem:
up(&port->sm_sem);
@@ -1106,7 +1092,6 @@ static int ib_umad_sm_close(struct inode *inode, struct file *filp)
up(&port->sm_sem);
- ib_umad_dev_put(port->umad_dev);
return ret;
}
@@ -1283,10 +1268,12 @@ static void ib_umad_kill_port(struct ib_umad_port *port)
mutex_unlock(&port->file_mutex);
cdev_device_del(&port->sm_cdev, &port->sm_dev);
- put_device(&port->sm_dev);
cdev_device_del(&port->cdev, &port->dev);
- put_device(&port->dev);
ida_free(&umad_ida, port->dev_num);
+
+ /* balances device_initialize() */
+ put_device(&port->sm_dev);
+ put_device(&port->dev);
}
static void ib_umad_add_one(struct ib_device *device)
@@ -1329,21 +1316,24 @@ err:
ib_umad_kill_port(&umad_dev->ports[i - s]);
}
free:
+ /* balances kref_init */
ib_umad_dev_put(umad_dev);
}
static void ib_umad_remove_one(struct ib_device *device, void *client_data)
{
struct ib_umad_device *umad_dev = client_data;
- int i;
+ unsigned int i;
if (!umad_dev)
return;
- for (i = 0; i <= rdma_end_port(device) - rdma_start_port(device); ++i) {
- if (rdma_cap_ib_mad(device, i + rdma_start_port(device)))
- ib_umad_kill_port(&umad_dev->ports[i]);
+ rdma_for_each_port (device, i) {
+ if (rdma_cap_ib_mad(device, i))
+ ib_umad_kill_port(
+ &umad_dev->ports[i - rdma_start_port(device)]);
}
+ /* balances kref_init() */
ib_umad_dev_put(umad_dev);
}
diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c
index 3317300ab036..062a86c04123 100644
--- a/drivers/infiniband/core/uverbs_cmd.c
+++ b/drivers/infiniband/core/uverbs_cmd.c
@@ -224,12 +224,13 @@ static int ib_uverbs_get_context(struct uverbs_attr_bundle *attrs)
if (ret)
goto err;
- ucontext = ib_dev->ops.alloc_ucontext(ib_dev, &attrs->driver_udata);
- if (IS_ERR(ucontext)) {
- ret = PTR_ERR(ucontext);
+ ucontext = rdma_zalloc_drv_obj(ib_dev, ib_ucontext);
+ if (!ucontext) {
+ ret = -ENOMEM;
goto err_alloc;
}
+ ucontext->res.type = RDMA_RESTRACK_CTX;
ucontext->device = ib_dev;
ucontext->cg_obj = cg_obj;
/* ufile is required when some objects are released */
@@ -238,15 +239,8 @@ static int ib_uverbs_get_context(struct uverbs_attr_bundle *attrs)
ucontext->closing = false;
ucontext->cleanup_retryable = false;
-#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
mutex_init(&ucontext->per_mm_list_lock);
INIT_LIST_HEAD(&ucontext->per_mm_list);
- if (!(ib_dev->attrs.device_cap_flags & IB_DEVICE_ON_DEMAND_PAGING))
- ucontext->invalidate_range = NULL;
-
-#endif
-
- resp.num_comp_vectors = file->device->num_comp_vectors;
ret = get_unused_fd_flags(O_CLOEXEC);
if (ret < 0)
@@ -259,15 +253,22 @@ static int ib_uverbs_get_context(struct uverbs_attr_bundle *attrs)
goto err_fd;
}
+ resp.num_comp_vectors = file->device->num_comp_vectors;
+
ret = uverbs_response(attrs, &resp, sizeof(resp));
if (ret)
goto err_file;
- fd_install(resp.async_fd, filp);
+ ret = ib_dev->ops.alloc_ucontext(ucontext, &attrs->driver_udata);
+ if (ret)
+ goto err_file;
+ if (!(ib_dev->attrs.device_cap_flags & IB_DEVICE_ON_DEMAND_PAGING))
+ ucontext->invalidate_range = NULL;
- ucontext->res.type = RDMA_RESTRACK_CTX;
rdma_restrack_uadd(&ucontext->res);
+ fd_install(resp.async_fd, filp);
+
/*
* Make sure that ib_uverbs_get_ucontext() sees the pointer update
* only after all writes to setup the ucontext have completed
@@ -286,7 +287,7 @@ err_fd:
put_unused_fd(resp.async_fd);
err_free:
- ib_dev->ops.dealloc_ucontext(ucontext);
+ kfree(ucontext);
err_alloc:
ib_rdmacg_uncharge(&cg_obj, ib_dev, RDMACG_RESOURCE_HCA_HANDLE);
@@ -410,9 +411,9 @@ static int ib_uverbs_alloc_pd(struct uverbs_attr_bundle *attrs)
if (IS_ERR(uobj))
return PTR_ERR(uobj);
- pd = ib_dev->ops.alloc_pd(ib_dev, uobj->context, &attrs->driver_udata);
- if (IS_ERR(pd)) {
- ret = PTR_ERR(pd);
+ pd = rdma_zalloc_drv_obj(ib_dev, ib_pd);
+ if (!pd) {
+ ret = -ENOMEM;
goto err;
}
@@ -420,11 +421,15 @@ static int ib_uverbs_alloc_pd(struct uverbs_attr_bundle *attrs)
pd->uobject = uobj;
pd->__internal_mr = NULL;
atomic_set(&pd->usecnt, 0);
+ pd->res.type = RDMA_RESTRACK_PD;
+
+ ret = ib_dev->ops.alloc_pd(pd, uobj->context, &attrs->driver_udata);
+ if (ret)
+ goto err_alloc;
uobj->object = pd;
memset(&resp, 0, sizeof resp);
resp.pd_handle = uobj->id;
- pd->res.type = RDMA_RESTRACK_PD;
rdma_restrack_uadd(&pd->res);
ret = uverbs_response(attrs, &resp, sizeof(resp));
@@ -435,7 +440,9 @@ static int ib_uverbs_alloc_pd(struct uverbs_attr_bundle *attrs)
err_copy:
ib_dealloc_pd(pd);
-
+ pd = NULL;
+err_alloc:
+ kfree(pd);
err:
uobj_alloc_abort(uobj);
return ret;
@@ -822,14 +829,13 @@ static int ib_uverbs_rereg_mr(struct uverbs_attr_bundle *attrs)
cmd.length, cmd.hca_va,
cmd.access_flags, pd,
&attrs->driver_udata);
- if (!ret) {
- if (cmd.flags & IB_MR_REREG_PD) {
- atomic_inc(&pd->usecnt);
- mr->pd = pd;
- atomic_dec(&old_pd->usecnt);
- }
- } else {
+ if (ret)
goto put_uobj_pd;
+
+ if (cmd.flags & IB_MR_REREG_PD) {
+ atomic_inc(&pd->usecnt);
+ mr->pd = pd;
+ atomic_dec(&old_pd->usecnt);
}
memset(&resp, 0, sizeof(resp));
@@ -884,6 +890,11 @@ static int ib_uverbs_alloc_mw(struct uverbs_attr_bundle *attrs)
goto err_free;
}
+ if (cmd.mw_type != IB_MW_TYPE_1 && cmd.mw_type != IB_MW_TYPE_2) {
+ ret = -EINVAL;
+ goto err_put;
+ }
+
mw = pd->device->ops.alloc_mw(pd, cmd.mw_type, &attrs->driver_udata);
if (IS_ERR(mw)) {
ret = PTR_ERR(mw);
@@ -1184,12 +1195,11 @@ static int ib_uverbs_poll_cq(struct uverbs_attr_bundle *attrs)
ret = -EFAULT;
goto out_put;
}
+ ret = 0;
if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_CORE_OUT))
ret = uverbs_output_written(attrs, UVERBS_ATTR_CORE_OUT);
- ret = 0;
-
out_put:
uobj_put_obj_read(cq);
return ret;
@@ -2632,7 +2642,7 @@ void flow_resources_add(struct ib_uflow_resources *uflow_res,
}
EXPORT_SYMBOL(flow_resources_add);
-static int kern_spec_to_ib_spec_action(const struct uverbs_attr_bundle *attrs,
+static int kern_spec_to_ib_spec_action(struct uverbs_attr_bundle *attrs,
struct ib_uverbs_flow_spec *kern_spec,
union ib_flow_spec *ib_spec,
struct ib_uflow_resources *uflow_res)
@@ -3618,7 +3628,6 @@ static int ib_uverbs_ex_query_device(struct uverbs_attr_bundle *attrs)
copy_query_dev_fields(ucontext, &resp.base, &attr);
-#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
resp.odp_caps.general_caps = attr.odp_caps.general_caps;
resp.odp_caps.per_transport_caps.rc_odp_caps =
attr.odp_caps.per_transport_caps.rc_odp_caps;
@@ -3626,7 +3635,7 @@ static int ib_uverbs_ex_query_device(struct uverbs_attr_bundle *attrs)
attr.odp_caps.per_transport_caps.uc_odp_caps;
resp.odp_caps.per_transport_caps.ud_odp_caps =
attr.odp_caps.per_transport_caps.ud_odp_caps;
-#endif
+ resp.xrc_odp_caps = attr.odp_caps.per_transport_caps.xrc_odp_caps;
resp.timestamp_mask = attr.timestamp_mask;
resp.hca_core_clock = attr.hca_core_clock;
diff --git a/drivers/infiniband/core/uverbs_ioctl.c b/drivers/infiniband/core/uverbs_ioctl.c
index 0ca04d224015..e1379949e663 100644
--- a/drivers/infiniband/core/uverbs_ioctl.c
+++ b/drivers/infiniband/core/uverbs_ioctl.c
@@ -213,6 +213,7 @@ static int uverbs_process_idrs_array(struct bundle_priv *pbundle,
ret = PTR_ERR(attr->uobjects[i]);
break;
}
+ pbundle->bundle.context = attr->uobjects[i]->context;
}
attr->len = i;
@@ -330,6 +331,7 @@ static int uverbs_process_attr(struct bundle_priv *pbundle,
uattr->data_s64);
if (IS_ERR(o_attr->uobject))
return PTR_ERR(o_attr->uobject);
+ pbundle->bundle.context = o_attr->uobject->context;
__set_bit(attr_bkey, pbundle->uobj_finalize);
if (spec->u.obj.access == UVERBS_ACCESS_NEW) {
@@ -592,6 +594,7 @@ static int ib_uverbs_cmd_verbs(struct ib_uverbs_file *ufile,
pbundle->method_elm = method_elm;
pbundle->method_key = attrs_iter.index;
pbundle->bundle.ufile = ufile;
+ pbundle->bundle.context = NULL; /* only valid if bundle has uobject */
pbundle->radix = &uapi->radix;
pbundle->radix_slots = slot;
pbundle->radix_slots_len = radix_tree_chunk_size(&attrs_iter);
diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c
index 5f366838b7ff..70b7d80431a9 100644
--- a/drivers/infiniband/core/uverbs_main.c
+++ b/drivers/infiniband/core/uverbs_main.c
@@ -695,6 +695,7 @@ static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf,
memset(bundle.attr_present, 0, sizeof(bundle.attr_present));
bundle.ufile = file;
+ bundle.context = NULL; /* only valid if bundle has uobject */
if (!method_elm->is_ex) {
size_t in_len = hdr.in_words * 4 - sizeof(hdr);
size_t out_len = hdr.out_words * 4;
@@ -1135,6 +1136,7 @@ static const struct file_operations uverbs_mmap_fops = {
static struct ib_client uverbs_client = {
.name = "uverbs",
+ .no_kverbs_req = true,
.add = ib_uverbs_add_one,
.remove = ib_uverbs_remove_one
};
diff --git a/drivers/infiniband/core/uverbs_std_types.c b/drivers/infiniband/core/uverbs_std_types.c
index cbc72312eb41..f224cb727224 100644
--- a/drivers/infiniband/core/uverbs_std_types.c
+++ b/drivers/infiniband/core/uverbs_std_types.c
@@ -188,7 +188,7 @@ static int uverbs_free_pd(struct ib_uobject *uobject,
if (ret)
return ret;
- ib_dealloc_pd((struct ib_pd *)uobject->object);
+ ib_dealloc_pd(pd);
return 0;
}
diff --git a/drivers/infiniband/core/uverbs_uapi.c b/drivers/infiniband/core/uverbs_uapi.c
index 9ae08e4b78a3..7a987acf0c0b 100644
--- a/drivers/infiniband/core/uverbs_uapi.c
+++ b/drivers/infiniband/core/uverbs_uapi.c
@@ -188,13 +188,18 @@ static int uapi_merge_obj_tree(struct uverbs_api *uapi,
obj_elm->type_attrs = obj->type_attrs;
obj_elm->type_class = obj->type_attrs->type_class;
/*
- * Today drivers are only permitted to use idr_class
- * types. They cannot use FD types because we currently have
- * no way to revoke the fops pointer after device
- * disassociation.
+ * Today drivers are only permitted to use idr_class and
+ * fd_class types. We can revoke the IDR types during
+ * disassociation, and the FD types require the driver to use
+ * struct file_operations.owner to prevent the driver module
+ * code from unloading while the file is open. This provides
+ * enough safety that uverbs_close_fd() will continue to work.
+ * Drivers using FD are responsible to handle disassociation of
+ * the device on their own.
*/
if (WARN_ON(is_driver &&
- obj->type_attrs->type_class != &uverbs_idr_class))
+ obj->type_attrs->type_class != &uverbs_idr_class &&
+ obj->type_attrs->type_class != &uverbs_fd_class))
return -EINVAL;
}
diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c
index ac011836bb54..5a5e83f5f0fc 100644
--- a/drivers/infiniband/core/verbs.c
+++ b/drivers/infiniband/core/verbs.c
@@ -254,10 +254,11 @@ struct ib_pd *__ib_alloc_pd(struct ib_device *device, unsigned int flags,
{
struct ib_pd *pd;
int mr_access_flags = 0;
+ int ret;
- pd = device->ops.alloc_pd(device, NULL, NULL);
- if (IS_ERR(pd))
- return pd;
+ pd = rdma_zalloc_drv_obj(device, ib_pd);
+ if (!pd)
+ return ERR_PTR(-ENOMEM);
pd->device = device;
pd->uobject = NULL;
@@ -265,6 +266,16 @@ struct ib_pd *__ib_alloc_pd(struct ib_device *device, unsigned int flags,
atomic_set(&pd->usecnt, 0);
pd->flags = flags;
+ pd->res.type = RDMA_RESTRACK_PD;
+ rdma_restrack_set_task(&pd->res, caller);
+
+ ret = device->ops.alloc_pd(pd, NULL, NULL);
+ if (ret) {
+ kfree(pd);
+ return ERR_PTR(ret);
+ }
+ rdma_restrack_kadd(&pd->res);
+
if (device->attrs.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY)
pd->local_dma_lkey = device->local_dma_lkey;
else
@@ -275,10 +286,6 @@ struct ib_pd *__ib_alloc_pd(struct ib_device *device, unsigned int flags,
mr_access_flags |= IB_ACCESS_REMOTE_READ | IB_ACCESS_REMOTE_WRITE;
}
- pd->res.type = RDMA_RESTRACK_PD;
- rdma_restrack_set_task(&pd->res, caller);
- rdma_restrack_kadd(&pd->res);
-
if (mr_access_flags) {
struct ib_mr *mr;
@@ -329,10 +336,8 @@ void ib_dealloc_pd(struct ib_pd *pd)
WARN_ON(atomic_read(&pd->usecnt));
rdma_restrack_del(&pd->res);
- /* Making delalloc_pd a void return is a WIP, no driver should return
- an error here. */
- ret = pd->device->ops.dealloc_pd(pd);
- WARN_ONCE(ret, "Infiniband HW driver failed dealloc_pd");
+ pd->device->ops.dealloc_pd(pd);
+ kfree(pd);
}
EXPORT_SYMBOL(ib_dealloc_pd);
@@ -1106,8 +1111,8 @@ struct ib_qp *ib_open_qp(struct ib_xrcd *xrcd,
}
EXPORT_SYMBOL(ib_open_qp);
-static struct ib_qp *ib_create_xrc_qp(struct ib_qp *qp,
- struct ib_qp_init_attr *qp_init_attr)
+static struct ib_qp *create_xrc_qp(struct ib_qp *qp,
+ struct ib_qp_init_attr *qp_init_attr)
{
struct ib_qp *real_qp = qp;
@@ -1122,10 +1127,10 @@ static struct ib_qp *ib_create_xrc_qp(struct ib_qp *qp,
qp = __ib_open_qp(real_qp, qp_init_attr->event_handler,
qp_init_attr->qp_context);
- if (!IS_ERR(qp))
- __ib_insert_xrcd_qp(qp_init_attr->xrcd, real_qp);
- else
- real_qp->device->ops.destroy_qp(real_qp);
+ if (IS_ERR(qp))
+ return qp;
+
+ __ib_insert_xrcd_qp(qp_init_attr->xrcd, real_qp);
return qp;
}
@@ -1156,10 +1161,8 @@ struct ib_qp *ib_create_qp(struct ib_pd *pd,
return qp;
ret = ib_create_qp_security(qp, device);
- if (ret) {
- ib_destroy_qp(qp);
- return ERR_PTR(ret);
- }
+ if (ret)
+ goto err;
qp->real_qp = qp;
qp->qp_type = qp_init_attr->qp_type;
@@ -1172,8 +1175,15 @@ struct ib_qp *ib_create_qp(struct ib_pd *pd,
INIT_LIST_HEAD(&qp->sig_mrs);
qp->port = 0;
- if (qp_init_attr->qp_type == IB_QPT_XRC_TGT)
- return ib_create_xrc_qp(qp, qp_init_attr);
+ if (qp_init_attr->qp_type == IB_QPT_XRC_TGT) {
+ struct ib_qp *xrc_qp = create_xrc_qp(qp, qp_init_attr);
+
+ if (IS_ERR(xrc_qp)) {
+ ret = PTR_ERR(xrc_qp);
+ goto err;
+ }
+ return xrc_qp;
+ }
qp->event_handler = qp_init_attr->event_handler;
qp->qp_context = qp_init_attr->qp_context;
@@ -1200,11 +1210,8 @@ struct ib_qp *ib_create_qp(struct ib_pd *pd,
if (qp_init_attr->cap.max_rdma_ctxs) {
ret = rdma_rw_init_mrs(qp, qp_init_attr);
- if (ret) {
- pr_err("failed to init MR pool ret= %d\n", ret);
- ib_destroy_qp(qp);
- return ERR_PTR(ret);
- }
+ if (ret)
+ goto err;
}
/*
@@ -1217,6 +1224,11 @@ struct ib_qp *ib_create_qp(struct ib_pd *pd,
device->attrs.max_sge_rd);
return qp;
+
+err:
+ ib_destroy_qp(qp);
+ return ERR_PTR(ret);
+
}
EXPORT_SYMBOL(ib_create_qp);
@@ -1711,10 +1723,7 @@ int ib_get_eth_speed(struct ib_device *dev, u8 port_num, u8 *speed, u8 *width)
if (rdma_port_get_link_layer(dev, port_num) != IB_LINK_LAYER_ETHERNET)
return -EINVAL;
- if (!dev->ops.get_netdev)
- return -EOPNOTSUPP;
-
- netdev = dev->ops.get_netdev(dev, port_num);
+ netdev = ib_device_get_netdev(dev, port_num);
if (!netdev)
return -ENODEV;
diff --git a/drivers/infiniband/hw/bnxt_re/Kconfig b/drivers/infiniband/hw/bnxt_re/Kconfig
index 19982a4a9bba..d25439c305f7 100644
--- a/drivers/infiniband/hw/bnxt_re/Kconfig
+++ b/drivers/infiniband/hw/bnxt_re/Kconfig
@@ -1,5 +1,6 @@
config INFINIBAND_BNXT_RE
tristate "Broadcom Netxtreme HCA support"
+ depends on 64BIT
depends on ETHERNET && NETDEVICES && PCI && INET && DCB
select NET_VENDOR_BROADCOM
select BNXT
diff --git a/drivers/infiniband/hw/bnxt_re/Makefile b/drivers/infiniband/hw/bnxt_re/Makefile
index 6e3bc25cc140..ee9bb1be61ea 100644
--- a/drivers/infiniband/hw/bnxt_re/Makefile
+++ b/drivers/infiniband/hw/bnxt_re/Makefile
@@ -1,6 +1,6 @@
# SPDX-License-Identifier: GPL-2.0
-ccflags-y := -Idrivers/net/ethernet/broadcom/bnxt
+ccflags-y := -I $(srctree)/drivers/net/ethernet/broadcom/bnxt
obj-$(CONFIG_INFINIBAND_BNXT_RE) += bnxt_re.o
bnxt_re-y := main.o ib_verbs.o \
qplib_res.o qplib_rcfw.o \
diff --git a/drivers/infiniband/hw/bnxt_re/bnxt_re.h b/drivers/infiniband/hw/bnxt_re/bnxt_re.h
index 31baa8939a4f..e55a1666c0cd 100644
--- a/drivers/infiniband/hw/bnxt_re/bnxt_re.h
+++ b/drivers/infiniband/hw/bnxt_re/bnxt_re.h
@@ -124,6 +124,7 @@ struct bnxt_re_dev {
#define BNXT_RE_FLAG_ISSUE_ROCE_STATS 29
struct net_device *netdev;
unsigned int version, major, minor;
+ struct bnxt_qplib_chip_ctx chip_ctx;
struct bnxt_en_dev *en_dev;
struct bnxt_msix_entry msix_entries[BNXT_RE_MAX_MSIX];
int num_msix;
diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c
index 1e2515e2eb62..071b2fc38b0b 100644
--- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c
+++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c
@@ -48,6 +48,7 @@
#include <rdma/ib_addr.h>
#include <rdma/ib_mad.h>
#include <rdma/ib_cache.h>
+#include <rdma/uverbs_ioctl.h>
#include "bnxt_ulp.h"
@@ -563,41 +564,29 @@ fail:
}
/* Protection Domains */
-int bnxt_re_dealloc_pd(struct ib_pd *ib_pd)
+void bnxt_re_dealloc_pd(struct ib_pd *ib_pd)
{
struct bnxt_re_pd *pd = container_of(ib_pd, struct bnxt_re_pd, ib_pd);
struct bnxt_re_dev *rdev = pd->rdev;
- int rc;
bnxt_re_destroy_fence_mr(pd);
- if (pd->qplib_pd.id) {
- rc = bnxt_qplib_dealloc_pd(&rdev->qplib_res,
- &rdev->qplib_res.pd_tbl,
- &pd->qplib_pd);
- if (rc)
- dev_err(rdev_to_dev(rdev), "Failed to deallocate HW PD");
- }
-
- kfree(pd);
- return 0;
+ if (pd->qplib_pd.id)
+ bnxt_qplib_dealloc_pd(&rdev->qplib_res, &rdev->qplib_res.pd_tbl,
+ &pd->qplib_pd);
}
-struct ib_pd *bnxt_re_alloc_pd(struct ib_device *ibdev,
- struct ib_ucontext *ucontext,
- struct ib_udata *udata)
+int bnxt_re_alloc_pd(struct ib_pd *ibpd, struct ib_ucontext *ucontext,
+ struct ib_udata *udata)
{
+ struct ib_device *ibdev = ibpd->device;
struct bnxt_re_dev *rdev = to_bnxt_re_dev(ibdev, ibdev);
struct bnxt_re_ucontext *ucntx = container_of(ucontext,
struct bnxt_re_ucontext,
ib_uctx);
- struct bnxt_re_pd *pd;
+ struct bnxt_re_pd *pd = container_of(ibpd, struct bnxt_re_pd, ib_pd);
int rc;
- pd = kzalloc(sizeof(*pd), GFP_KERNEL);
- if (!pd)
- return ERR_PTR(-ENOMEM);
-
pd->rdev = rdev;
if (bnxt_qplib_alloc_pd(&rdev->qplib_res.pd_tbl, &pd->qplib_pd)) {
dev_err(rdev_to_dev(rdev), "Failed to allocate HW PD");
@@ -637,13 +626,12 @@ struct ib_pd *bnxt_re_alloc_pd(struct ib_device *ibdev,
if (bnxt_re_create_fence_mr(pd))
dev_warn(rdev_to_dev(rdev),
"Failed to create Fence-MR\n");
- return &pd->ib_pd;
+ return 0;
dbfail:
- (void)bnxt_qplib_dealloc_pd(&rdev->qplib_res, &rdev->qplib_res.pd_tbl,
- &pd->qplib_pd);
+ bnxt_qplib_dealloc_pd(&rdev->qplib_res, &rdev->qplib_res.pd_tbl,
+ &pd->qplib_pd);
fail:
- kfree(pd);
- return ERR_PTR(rc);
+ return rc;
}
/* Address Handles */
@@ -663,17 +651,36 @@ int bnxt_re_destroy_ah(struct ib_ah *ib_ah, u32 flags)
return 0;
}
+static u8 bnxt_re_stack_to_dev_nw_type(enum rdma_network_type ntype)
+{
+ u8 nw_type;
+
+ switch (ntype) {
+ case RDMA_NETWORK_IPV4:
+ nw_type = CMDQ_CREATE_AH_TYPE_V2IPV4;
+ break;
+ case RDMA_NETWORK_IPV6:
+ nw_type = CMDQ_CREATE_AH_TYPE_V2IPV6;
+ break;
+ default:
+ nw_type = CMDQ_CREATE_AH_TYPE_V1;
+ break;
+ }
+ return nw_type;
+}
+
struct ib_ah *bnxt_re_create_ah(struct ib_pd *ib_pd,
struct rdma_ah_attr *ah_attr,
u32 flags,
struct ib_udata *udata)
{
struct bnxt_re_pd *pd = container_of(ib_pd, struct bnxt_re_pd, ib_pd);
+ const struct ib_global_route *grh = rdma_ah_read_grh(ah_attr);
struct bnxt_re_dev *rdev = pd->rdev;
+ const struct ib_gid_attr *sgid_attr;
struct bnxt_re_ah *ah;
- const struct ib_global_route *grh = rdma_ah_read_grh(ah_attr);
- int rc;
u8 nw_type;
+ int rc;
if (!(rdma_ah_get_ah_flags(ah_attr) & IB_AH_GRH)) {
dev_err(rdev_to_dev(rdev), "Failed to alloc AH: GRH not set");
@@ -700,28 +707,11 @@ struct ib_ah *bnxt_re_create_ah(struct ib_pd *ib_pd,
ah->qplib_ah.flow_label = grh->flow_label;
ah->qplib_ah.hop_limit = grh->hop_limit;
ah->qplib_ah.sl = rdma_ah_get_sl(ah_attr);
- if (udata &&
- !rdma_is_multicast_addr((struct in6_addr *)
- grh->dgid.raw) &&
- !rdma_link_local_addr((struct in6_addr *)
- grh->dgid.raw)) {
- const struct ib_gid_attr *sgid_attr;
- sgid_attr = grh->sgid_attr;
- /* Get network header type for this GID */
- nw_type = rdma_gid_attr_network_type(sgid_attr);
- switch (nw_type) {
- case RDMA_NETWORK_IPV4:
- ah->qplib_ah.nw_type = CMDQ_CREATE_AH_TYPE_V2IPV4;
- break;
- case RDMA_NETWORK_IPV6:
- ah->qplib_ah.nw_type = CMDQ_CREATE_AH_TYPE_V2IPV6;
- break;
- default:
- ah->qplib_ah.nw_type = CMDQ_CREATE_AH_TYPE_V1;
- break;
- }
- }
+ sgid_attr = grh->sgid_attr;
+ /* Get network header type for this GID */
+ nw_type = rdma_gid_attr_network_type(sgid_attr);
+ ah->qplib_ah.nw_type = bnxt_re_stack_to_dev_nw_type(nw_type);
memcpy(ah->qplib_ah.dmac, ah_attr->roce.dmac, ETH_ALEN);
rc = bnxt_qplib_create_ah(&rdev->qplib_res, &ah->qplib_ah,
@@ -733,12 +723,11 @@ struct ib_ah *bnxt_re_create_ah(struct ib_pd *ib_pd,
/* Write AVID to shared page. */
if (udata) {
- struct ib_ucontext *ib_uctx = ib_pd->uobject->context;
- struct bnxt_re_ucontext *uctx;
+ struct bnxt_re_ucontext *uctx = rdma_udata_to_drv_context(
+ udata, struct bnxt_re_ucontext, ib_uctx);
unsigned long flag;
u32 *wrptr;
- uctx = container_of(ib_uctx, struct bnxt_re_ucontext, ib_uctx);
spin_lock_irqsave(&uctx->sh_lock, flag);
wrptr = (u32 *)(uctx->shpg + BNXT_RE_AVID_OFFT);
*wrptr = ah->qplib_ah.id;
@@ -804,8 +793,8 @@ int bnxt_re_destroy_qp(struct ib_qp *ib_qp)
{
struct bnxt_re_qp *qp = container_of(ib_qp, struct bnxt_re_qp, ib_qp);
struct bnxt_re_dev *rdev = qp->rdev;
- int rc;
unsigned int flags;
+ int rc;
bnxt_qplib_flush_cqn_wq(&qp->qplib_qp);
rc = bnxt_qplib_destroy_qp(&rdev->qplib_res, &qp->qplib_qp);
@@ -814,9 +803,12 @@ int bnxt_re_destroy_qp(struct ib_qp *ib_qp)
return rc;
}
- flags = bnxt_re_lock_cqs(qp);
- bnxt_qplib_clean_qp(&qp->qplib_qp);
- bnxt_re_unlock_cqs(qp, flags);
+ if (rdma_is_kernel_res(&qp->ib_qp.res)) {
+ flags = bnxt_re_lock_cqs(qp);
+ bnxt_qplib_clean_qp(&qp->qplib_qp);
+ bnxt_re_unlock_cqs(qp, flags);
+ }
+
bnxt_qplib_free_qp_res(&rdev->qplib_res, &qp->qplib_qp);
if (ib_qp->qp_type == IB_QPT_GSI && rdev->qp1_sqp) {
@@ -882,21 +874,23 @@ static int bnxt_re_init_user_qp(struct bnxt_re_dev *rdev, struct bnxt_re_pd *pd,
struct bnxt_re_qp_req ureq;
struct bnxt_qplib_qp *qplib_qp = &qp->qplib_qp;
struct ib_umem *umem;
- int bytes = 0;
- struct ib_ucontext *context = pd->ib_pd.uobject->context;
- struct bnxt_re_ucontext *cntx = container_of(context,
- struct bnxt_re_ucontext,
- ib_uctx);
+ int bytes = 0, psn_sz;
+ struct bnxt_re_ucontext *cntx = rdma_udata_to_drv_context(
+ udata, struct bnxt_re_ucontext, ib_uctx);
+
if (ib_copy_from_udata(&ureq, udata, sizeof(ureq)))
return -EFAULT;
bytes = (qplib_qp->sq.max_wqe * BNXT_QPLIB_MAX_SQE_ENTRY_SIZE);
/* Consider mapping PSN search memory only for RC QPs. */
- if (qplib_qp->type == CMDQ_CREATE_QP_TYPE_RC)
- bytes += (qplib_qp->sq.max_wqe * sizeof(struct sq_psn_search));
+ if (qplib_qp->type == CMDQ_CREATE_QP_TYPE_RC) {
+ psn_sz = bnxt_qplib_is_chip_gen_p5(&rdev->chip_ctx) ?
+ sizeof(struct sq_psn_search_ext) :
+ sizeof(struct sq_psn_search);
+ bytes += (qplib_qp->sq.max_wqe * psn_sz);
+ }
bytes = PAGE_ALIGN(bytes);
- umem = ib_umem_get(context, ureq.qpsva, bytes,
- IB_ACCESS_LOCAL_WRITE, 1);
+ umem = ib_umem_get(udata, ureq.qpsva, bytes, IB_ACCESS_LOCAL_WRITE, 1);
if (IS_ERR(umem))
return PTR_ERR(umem);
@@ -908,7 +902,7 @@ static int bnxt_re_init_user_qp(struct bnxt_re_dev *rdev, struct bnxt_re_pd *pd,
if (!qp->qplib_qp.srq) {
bytes = (qplib_qp->rq.max_wqe * BNXT_QPLIB_MAX_RQE_ENTRY_SIZE);
bytes = PAGE_ALIGN(bytes);
- umem = ib_umem_get(context, ureq.qprva, bytes,
+ umem = ib_umem_get(udata, ureq.qprva, bytes,
IB_ACCESS_LOCAL_WRITE, 1);
if (IS_ERR(umem))
goto rqfail;
@@ -1066,12 +1060,17 @@ struct ib_qp *bnxt_re_create_qp(struct ib_pd *ib_pd,
qp->qplib_qp.pd = &pd->qplib_pd;
qp->qplib_qp.qp_handle = (u64)(unsigned long)(&qp->qplib_qp);
qp->qplib_qp.type = __from_ib_qp_type(qp_init_attr->qp_type);
+
+ if (qp_init_attr->qp_type == IB_QPT_GSI &&
+ bnxt_qplib_is_chip_gen_p5(&rdev->chip_ctx))
+ qp->qplib_qp.type = CMDQ_CREATE_QP_TYPE_GSI;
if (qp->qplib_qp.type == IB_QPT_MAX) {
dev_err(rdev_to_dev(rdev), "QP type 0x%x not supported",
qp->qplib_qp.type);
rc = -EINVAL;
goto fail;
}
+
qp->qplib_qp.max_inline_data = qp_init_attr->cap.max_inline_data;
qp->qplib_qp.sig_type = ((qp_init_attr->sq_sig_type ==
IB_SIGNAL_ALL_WR) ? true : false);
@@ -1132,7 +1131,8 @@ struct ib_qp *bnxt_re_create_qp(struct ib_pd *ib_pd,
qp->qplib_qp.mtu = ib_mtu_enum_to_int(iboe_get_mtu(rdev->netdev->mtu));
- if (qp_init_attr->qp_type == IB_QPT_GSI) {
+ if (qp_init_attr->qp_type == IB_QPT_GSI &&
+ !(bnxt_qplib_is_chip_gen_p5(&rdev->chip_ctx))) {
/* Allocate 1 more than what's provided */
entries = roundup_pow_of_two(qp_init_attr->cap.max_send_wr + 1);
qp->qplib_qp.sq.max_wqe = min_t(u32, entries,
@@ -1361,17 +1361,15 @@ static int bnxt_re_init_user_srq(struct bnxt_re_dev *rdev,
struct bnxt_qplib_srq *qplib_srq = &srq->qplib_srq;
struct ib_umem *umem;
int bytes = 0;
- struct ib_ucontext *context = pd->ib_pd.uobject->context;
- struct bnxt_re_ucontext *cntx = container_of(context,
- struct bnxt_re_ucontext,
- ib_uctx);
+ struct bnxt_re_ucontext *cntx = rdma_udata_to_drv_context(
+ udata, struct bnxt_re_ucontext, ib_uctx);
+
if (ib_copy_from_udata(&ureq, udata, sizeof(ureq)))
return -EFAULT;
bytes = (qplib_srq->max_wqe * BNXT_QPLIB_MAX_RQE_ENTRY_SIZE);
bytes = PAGE_ALIGN(bytes);
- umem = ib_umem_get(context, ureq.srqva, bytes,
- IB_ACCESS_LOCAL_WRITE, 1);
+ umem = ib_umem_get(udata, ureq.srqva, bytes, IB_ACCESS_LOCAL_WRITE, 1);
if (IS_ERR(umem))
return PTR_ERR(umem);
@@ -1646,6 +1644,9 @@ int bnxt_re_modify_qp(struct ib_qp *ib_qp, struct ib_qp_attr *qp_attr,
__from_ib_access_flags(qp_attr->qp_access_flags);
/* LOCAL_WRITE access must be set to allow RC receive */
qp->qplib_qp.access |= BNXT_QPLIB_ACCESS_LOCAL_WRITE;
+ /* Temp: Set all params on QP as of now */
+ qp->qplib_qp.access |= CMDQ_MODIFY_QP_ACCESS_REMOTE_WRITE;
+ qp->qplib_qp.access |= CMDQ_MODIFY_QP_ACCESS_REMOTE_READ;
}
if (qp_attr_mask & IB_QP_PKEY_INDEX) {
qp->qplib_qp.modify_flags |= CMDQ_MODIFY_QP_MODIFY_MASK_PKEY;
@@ -2093,7 +2094,8 @@ static int bnxt_re_build_qp1_shadow_qp_recv(struct bnxt_re_qp *qp,
static int is_ud_qp(struct bnxt_re_qp *qp)
{
- return qp->qplib_qp.type == CMDQ_CREATE_QP_TYPE_UD;
+ return (qp->qplib_qp.type == CMDQ_CREATE_QP_TYPE_UD ||
+ qp->qplib_qp.type == CMDQ_CREATE_QP_TYPE_GSI);
}
static int bnxt_re_build_send_wqe(struct bnxt_re_qp *qp,
@@ -2397,7 +2399,7 @@ int bnxt_re_post_send(struct ib_qp *ib_qp, const struct ib_send_wr *wr,
switch (wr->opcode) {
case IB_WR_SEND:
case IB_WR_SEND_WITH_IMM:
- if (ib_qp->qp_type == IB_QPT_GSI) {
+ if (qp->qplib_qp.type == CMDQ_CREATE_QP1_TYPE_GSI) {
rc = bnxt_re_build_qp1_send_v2(qp, wr, &wqe,
payload_sz);
if (rc)
@@ -2527,7 +2529,8 @@ int bnxt_re_post_recv(struct ib_qp *ib_qp, const struct ib_recv_wr *wr,
wqe.wr_id = wr->wr_id;
wqe.type = BNXT_QPLIB_SWQE_TYPE_RECV;
- if (ib_qp->qp_type == IB_QPT_GSI)
+ if (ib_qp->qp_type == IB_QPT_GSI &&
+ qp->qplib_qp.type != CMDQ_CREATE_QP_TYPE_GSI)
rc = bnxt_re_build_qp1_shadow_qp_recv(qp, wr, &wqe,
payload_sz);
if (!rc)
@@ -2622,7 +2625,7 @@ struct ib_cq *bnxt_re_create_cq(struct ib_device *ibdev,
goto fail;
}
- cq->umem = ib_umem_get(context, req.cq_va,
+ cq->umem = ib_umem_get(udata, req.cq_va,
entries * sizeof(struct cq_base),
IB_ACCESS_LOCAL_WRITE, 1);
if (IS_ERR(cq->umem)) {
@@ -3122,19 +3125,33 @@ static void bnxt_re_process_res_shadow_qp_wc(struct bnxt_re_qp *qp,
}
}
-static void bnxt_re_process_res_ud_wc(struct ib_wc *wc,
+static void bnxt_re_process_res_ud_wc(struct bnxt_re_qp *qp,
+ struct ib_wc *wc,
struct bnxt_qplib_cqe *cqe)
{
+ u8 nw_type;
+
wc->opcode = IB_WC_RECV;
wc->status = __rc_to_ib_wc_status(cqe->status);
- if (cqe->flags & CQ_RES_RC_FLAGS_IMM)
+ if (cqe->flags & CQ_RES_UD_FLAGS_IMM)
wc->wc_flags |= IB_WC_WITH_IMM;
- if (cqe->flags & CQ_RES_RC_FLAGS_INV)
- wc->wc_flags |= IB_WC_WITH_INVALIDATE;
- if ((cqe->flags & (CQ_RES_RC_FLAGS_RDMA | CQ_RES_RC_FLAGS_IMM)) ==
- (CQ_RES_RC_FLAGS_RDMA | CQ_RES_RC_FLAGS_IMM))
- wc->opcode = IB_WC_RECV_RDMA_WITH_IMM;
+ /* report only on GSI QP for Thor */
+ if (qp->qplib_qp.type == CMDQ_CREATE_QP_TYPE_GSI) {
+ wc->wc_flags |= IB_WC_GRH;
+ memcpy(wc->smac, cqe->smac, ETH_ALEN);
+ wc->wc_flags |= IB_WC_WITH_SMAC;
+ if (cqe->flags & CQ_RES_UD_FLAGS_META_FORMAT_VLAN) {
+ wc->vlan_id = (cqe->cfa_meta & 0xFFF);
+ if (wc->vlan_id < 0x1000)
+ wc->wc_flags |= IB_WC_WITH_VLAN;
+ }
+ nw_type = (cqe->flags & CQ_RES_UD_FLAGS_ROCE_IP_VER_MASK) >>
+ CQ_RES_UD_FLAGS_ROCE_IP_VER_SFT;
+ wc->network_hdr_type = bnxt_re_to_ib_nw_type(nw_type);
+ wc->wc_flags |= IB_WC_WITH_NETWORK_HDR_TYPE;
+ }
+
}
static int send_phantom_wqe(struct bnxt_re_qp *qp)
@@ -3226,7 +3243,7 @@ int bnxt_re_poll_cq(struct ib_cq *ib_cq, int num_entries, struct ib_wc *wc)
switch (cqe->opcode) {
case CQ_BASE_CQE_TYPE_REQ:
- if (qp->qplib_qp.id ==
+ if (qp->rdev->qp1_sqp && qp->qplib_qp.id ==
qp->rdev->qp1_sqp->qplib_qp.id) {
/* Handle this completion with
* the stored completion
@@ -3261,7 +3278,7 @@ int bnxt_re_poll_cq(struct ib_cq *ib_cq, int num_entries, struct ib_wc *wc)
bnxt_re_process_res_rc_wc(wc, cqe);
break;
case CQ_BASE_CQE_TYPE_RES_UD:
- if (qp->qplib_qp.id ==
+ if (qp->rdev->qp1_sqp && qp->qplib_qp.id ==
qp->rdev->qp1_sqp->qplib_qp.id) {
/* Handle this completion with
* the stored completion
@@ -3274,7 +3291,7 @@ int bnxt_re_poll_cq(struct ib_cq *ib_cq, int num_entries, struct ib_wc *wc)
break;
}
}
- bnxt_re_process_res_ud_wc(wc, cqe);
+ bnxt_re_process_res_ud_wc(qp, wc, cqe);
break;
default:
dev_err(rdev_to_dev(cq->rdev),
@@ -3301,10 +3318,10 @@ int bnxt_re_req_notify_cq(struct ib_cq *ib_cq,
spin_lock_irqsave(&cq->cq_lock, flags);
/* Trigger on the very next completion */
if (ib_cqn_flags & IB_CQ_NEXT_COMP)
- type = DBR_DBR_TYPE_CQ_ARMALL;
+ type = DBC_DBC_TYPE_CQ_ARMALL;
/* Trigger on the next solicited completion */
else if (ib_cqn_flags & IB_CQ_SOLICITED)
- type = DBR_DBR_TYPE_CQ_ARMSE;
+ type = DBC_DBC_TYPE_CQ_ARMSE;
/* Poll to see if there are missed events */
if ((ib_cqn_flags & IB_CQ_REPORT_MISSED_EVENTS) &&
@@ -3537,19 +3554,14 @@ static int fill_umem_pbl_tbl(struct ib_umem *umem, u64 *pbl_tbl_orig,
u64 *pbl_tbl = pbl_tbl_orig;
u64 paddr;
u64 page_mask = (1ULL << page_shift) - 1;
- int i, pages;
- struct scatterlist *sg;
- int entry;
-
- for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) {
- pages = sg_dma_len(sg) >> PAGE_SHIFT;
- for (i = 0; i < pages; i++) {
- paddr = sg_dma_address(sg) + (i << PAGE_SHIFT);
- if (pbl_tbl == pbl_tbl_orig)
- *pbl_tbl++ = paddr & ~page_mask;
- else if ((paddr & page_mask) == 0)
- *pbl_tbl++ = paddr;
- }
+ struct sg_dma_page_iter sg_iter;
+
+ for_each_sg_dma_page (umem->sg_head.sgl, &sg_iter, umem->nmap, 0) {
+ paddr = sg_page_iter_dma_address(&sg_iter);
+ if (pbl_tbl == pbl_tbl_orig)
+ *pbl_tbl++ = paddr & ~page_mask;
+ else if ((paddr & page_mask) == 0)
+ *pbl_tbl++ = paddr;
}
return pbl_tbl - pbl_tbl_orig;
}
@@ -3589,8 +3601,7 @@ struct ib_mr *bnxt_re_reg_user_mr(struct ib_pd *ib_pd, u64 start, u64 length,
/* The fixed portion of the rkey is the same as the lkey */
mr->ib_mr.rkey = mr->qplib_mr.rkey;
- umem = ib_umem_get(ib_pd->uobject->context, start, length,
- mr_access_flags, 0);
+ umem = ib_umem_get(udata, start, length, mr_access_flags, 0);
if (IS_ERR(umem)) {
dev_err(rdev_to_dev(rdev), "Failed to get umem");
rc = -EFAULT;
@@ -3613,7 +3624,7 @@ struct ib_mr *bnxt_re_reg_user_mr(struct ib_pd *ib_pd, u64 start, u64 length,
goto free_umem;
}
- page_shift = umem->page_shift;
+ page_shift = PAGE_SHIFT;
if (!bnxt_re_page_size_ok(page_shift)) {
dev_err(rdev_to_dev(rdev), "umem page size unsupported!");
@@ -3660,13 +3671,15 @@ free_mr:
return ERR_PTR(rc);
}
-struct ib_ucontext *bnxt_re_alloc_ucontext(struct ib_device *ibdev,
- struct ib_udata *udata)
+int bnxt_re_alloc_ucontext(struct ib_ucontext *ctx, struct ib_udata *udata)
{
+ struct ib_device *ibdev = ctx->device;
+ struct bnxt_re_ucontext *uctx =
+ container_of(ctx, struct bnxt_re_ucontext, ib_uctx);
struct bnxt_re_dev *rdev = to_bnxt_re_dev(ibdev, ibdev);
- struct bnxt_re_uctx_resp resp;
- struct bnxt_re_ucontext *uctx;
struct bnxt_qplib_dev_attr *dev_attr = &rdev->dev_attr;
+ struct bnxt_re_uctx_resp resp;
+ u32 chip_met_rev_num = 0;
int rc;
dev_dbg(rdev_to_dev(rdev), "ABI version requested %d",
@@ -3675,13 +3688,9 @@ struct ib_ucontext *bnxt_re_alloc_ucontext(struct ib_device *ibdev,
if (ibdev->uverbs_abi_ver != BNXT_RE_ABI_VERSION) {
dev_dbg(rdev_to_dev(rdev), " is different from the device %d ",
BNXT_RE_ABI_VERSION);
- return ERR_PTR(-EPERM);
+ return -EPERM;
}
- uctx = kzalloc(sizeof(*uctx), GFP_KERNEL);
- if (!uctx)
- return ERR_PTR(-ENOMEM);
-
uctx->rdev = rdev;
uctx->shpg = (void *)__get_free_page(GFP_KERNEL);
@@ -3691,37 +3700,45 @@ struct ib_ucontext *bnxt_re_alloc_ucontext(struct ib_device *ibdev,
}
spin_lock_init(&uctx->sh_lock);
- resp.dev_id = rdev->en_dev->pdev->devfn; /*Temp, Use idr_alloc instead*/
+ resp.comp_mask = BNXT_RE_UCNTX_CMASK_HAVE_CCTX;
+ chip_met_rev_num = rdev->chip_ctx.chip_num;
+ chip_met_rev_num |= ((u32)rdev->chip_ctx.chip_rev & 0xFF) <<
+ BNXT_RE_CHIP_ID0_CHIP_REV_SFT;
+ chip_met_rev_num |= ((u32)rdev->chip_ctx.chip_metal & 0xFF) <<
+ BNXT_RE_CHIP_ID0_CHIP_MET_SFT;
+ resp.chip_id0 = chip_met_rev_num;
+ /* Future extension of chip info */
+ resp.chip_id1 = 0;
+ /*Temp, Use idr_alloc instead */
+ resp.dev_id = rdev->en_dev->pdev->devfn;
resp.max_qp = rdev->qplib_ctx.qpc_count;
resp.pg_size = PAGE_SIZE;
resp.cqe_sz = sizeof(struct cq_base);
resp.max_cqd = dev_attr->max_cq_wqes;
resp.rsvd = 0;
- rc = ib_copy_to_udata(udata, &resp, sizeof(resp));
+ rc = ib_copy_to_udata(udata, &resp, min(udata->outlen, sizeof(resp)));
if (rc) {
dev_err(rdev_to_dev(rdev), "Failed to copy user context");
rc = -EFAULT;
goto cfail;
}
- return &uctx->ib_uctx;
+ return 0;
cfail:
free_page((unsigned long)uctx->shpg);
uctx->shpg = NULL;
fail:
- kfree(uctx);
- return ERR_PTR(rc);
+ return rc;
}
-int bnxt_re_dealloc_ucontext(struct ib_ucontext *ib_uctx)
+void bnxt_re_dealloc_ucontext(struct ib_ucontext *ib_uctx)
{
struct bnxt_re_ucontext *uctx = container_of(ib_uctx,
struct bnxt_re_ucontext,
ib_uctx);
struct bnxt_re_dev *rdev = uctx->rdev;
- int rc = 0;
if (uctx->shpg)
free_page((unsigned long)uctx->shpg);
@@ -3730,17 +3747,10 @@ int bnxt_re_dealloc_ucontext(struct ib_ucontext *ib_uctx)
/* Free DPI only if this is the first PD allocated by the
* application and mark the context dpi as NULL
*/
- rc = bnxt_qplib_dealloc_dpi(&rdev->qplib_res,
- &rdev->qplib_res.dpi_tbl,
- &uctx->dpi);
- if (rc)
- dev_err(rdev_to_dev(rdev), "Deallocate HW DPI failed!");
- /* Don't fail, continue*/
+ bnxt_qplib_dealloc_dpi(&rdev->qplib_res,
+ &rdev->qplib_res.dpi_tbl, &uctx->dpi);
uctx->dpi.dbr = NULL;
}
-
- kfree(uctx);
- return 0;
}
/* Helper function to mmap the virtual memory from user app */
diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.h b/drivers/infiniband/hw/bnxt_re/ib_verbs.h
index c4af72604b4f..e45465ed4eee 100644
--- a/drivers/infiniband/hw/bnxt_re/ib_verbs.h
+++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.h
@@ -56,8 +56,8 @@ struct bnxt_re_fence_data {
};
struct bnxt_re_pd {
+ struct ib_pd ib_pd;
struct bnxt_re_dev *rdev;
- struct ib_pd ib_pd;
struct bnxt_qplib_pd qplib_pd;
struct bnxt_re_fence_data fence;
};
@@ -135,8 +135,8 @@ struct bnxt_re_mw {
};
struct bnxt_re_ucontext {
+ struct ib_ucontext ib_uctx;
struct bnxt_re_dev *rdev;
- struct ib_ucontext ib_uctx;
struct bnxt_qplib_dpi dpi;
void *shpg;
spinlock_t sh_lock; /* protect shpg */
@@ -163,10 +163,9 @@ int bnxt_re_query_gid(struct ib_device *ibdev, u8 port_num,
int index, union ib_gid *gid);
enum rdma_link_layer bnxt_re_get_link_layer(struct ib_device *ibdev,
u8 port_num);
-struct ib_pd *bnxt_re_alloc_pd(struct ib_device *ibdev,
- struct ib_ucontext *context,
- struct ib_udata *udata);
-int bnxt_re_dealloc_pd(struct ib_pd *pd);
+int bnxt_re_alloc_pd(struct ib_pd *pd, struct ib_ucontext *context,
+ struct ib_udata *udata);
+void bnxt_re_dealloc_pd(struct ib_pd *pd);
struct ib_ah *bnxt_re_create_ah(struct ib_pd *pd,
struct rdma_ah_attr *ah_attr,
u32 flags,
@@ -216,9 +215,8 @@ int bnxt_re_dealloc_mw(struct ib_mw *mw);
struct ib_mr *bnxt_re_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
u64 virt_addr, int mr_access_flags,
struct ib_udata *udata);
-struct ib_ucontext *bnxt_re_alloc_ucontext(struct ib_device *ibdev,
- struct ib_udata *udata);
-int bnxt_re_dealloc_ucontext(struct ib_ucontext *context);
+int bnxt_re_alloc_ucontext(struct ib_ucontext *ctx, struct ib_udata *udata);
+void bnxt_re_dealloc_ucontext(struct ib_ucontext *context);
int bnxt_re_mmap(struct ib_ucontext *context, struct vm_area_struct *vma);
unsigned long bnxt_re_lock_cqs(struct bnxt_re_qp *qp);
diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c
index e7a997f2a537..2bd24ac45ee4 100644
--- a/drivers/infiniband/hw/bnxt_re/main.c
+++ b/drivers/infiniband/hw/bnxt_re/main.c
@@ -80,6 +80,29 @@ static DEFINE_MUTEX(bnxt_re_dev_lock);
static struct workqueue_struct *bnxt_re_wq;
static void bnxt_re_ib_unreg(struct bnxt_re_dev *rdev);
+static void bnxt_re_destroy_chip_ctx(struct bnxt_re_dev *rdev)
+{
+ rdev->rcfw.res = NULL;
+ rdev->qplib_res.cctx = NULL;
+}
+
+static int bnxt_re_setup_chip_ctx(struct bnxt_re_dev *rdev)
+{
+ struct bnxt_en_dev *en_dev;
+ struct bnxt *bp;
+
+ en_dev = rdev->en_dev;
+ bp = netdev_priv(en_dev->net);
+
+ rdev->chip_ctx.chip_num = bp->chip_num;
+ /* rest members to follow eventually */
+
+ rdev->qplib_res.cctx = &rdev->chip_ctx;
+ rdev->rcfw.res = &rdev->qplib_res;
+
+ return 0;
+}
+
/* SR-IOV helper functions */
static void bnxt_re_get_sriov_func_type(struct bnxt_re_dev *rdev)
@@ -278,6 +301,7 @@ static int bnxt_re_register_netdev(struct bnxt_re_dev *rdev)
rc = en_dev->en_ops->bnxt_register_device(en_dev, BNXT_ROCE_ULP,
&bnxt_re_ulp_ops, rdev);
+ rdev->qplib_res.pdev = rdev->en_dev->pdev;
return rc;
}
@@ -345,7 +369,8 @@ static void bnxt_re_fill_fw_msg(struct bnxt_fw_msg *fw_msg, void *msg,
fw_msg->timeout = timeout;
}
-static int bnxt_re_net_ring_free(struct bnxt_re_dev *rdev, u16 fw_ring_id)
+static int bnxt_re_net_ring_free(struct bnxt_re_dev *rdev,
+ u16 fw_ring_id, int type)
{
struct bnxt_en_dev *en_dev = rdev->en_dev;
struct hwrm_ring_free_input req = {0};
@@ -359,7 +384,7 @@ static int bnxt_re_net_ring_free(struct bnxt_re_dev *rdev, u16 fw_ring_id)
memset(&fw_msg, 0, sizeof(fw_msg));
bnxt_re_init_hwrm_hdr(rdev, (void *)&req, HWRM_RING_FREE, -1, -1);
- req.ring_type = RING_ALLOC_REQ_RING_TYPE_L2_CMPL;
+ req.ring_type = type;
req.ring_id = cpu_to_le16(fw_ring_id);
bnxt_re_fill_fw_msg(&fw_msg, (void *)&req, sizeof(req), (void *)&resp,
sizeof(resp), DFLT_HWRM_CMD_TIMEOUT);
@@ -396,7 +421,7 @@ static int bnxt_re_net_ring_alloc(struct bnxt_re_dev *rdev, dma_addr_t *dma_arr,
/* Association of ring index with doorbell index and MSIX number */
req.logical_id = cpu_to_le16(map_index);
req.length = cpu_to_le32(ring_mask + 1);
- req.ring_type = RING_ALLOC_REQ_RING_TYPE_L2_CMPL;
+ req.ring_type = type;
req.int_mode = RING_ALLOC_REQ_INT_MODE_MSIX;
bnxt_re_fill_fw_msg(&fw_msg, (void *)&req, sizeof(req), (void *)&resp,
sizeof(resp), DFLT_HWRM_CMD_TIMEOUT);
@@ -538,7 +563,8 @@ static struct bnxt_en_dev *bnxt_re_dev_probe(struct net_device *netdev)
static ssize_t hw_rev_show(struct device *device, struct device_attribute *attr,
char *buf)
{
- struct bnxt_re_dev *rdev = to_bnxt_re_dev(device, ibdev.dev);
+ struct bnxt_re_dev *rdev =
+ rdma_device_to_drv_device(device, struct bnxt_re_dev, ibdev);
return scnprintf(buf, PAGE_SIZE, "0x%x\n", rdev->en_dev->pdev->vendor);
}
@@ -547,7 +573,8 @@ static DEVICE_ATTR_RO(hw_rev);
static ssize_t hca_type_show(struct device *device,
struct device_attribute *attr, char *buf)
{
- struct bnxt_re_dev *rdev = to_bnxt_re_dev(device, ibdev.dev);
+ struct bnxt_re_dev *rdev =
+ rdma_device_to_drv_device(device, struct bnxt_re_dev, ibdev);
return scnprintf(buf, PAGE_SIZE, "%s\n", rdev->ibdev.node_desc);
}
@@ -610,6 +637,8 @@ static const struct ib_device_ops bnxt_re_dev_ops = {
.query_srq = bnxt_re_query_srq,
.reg_user_mr = bnxt_re_reg_user_mr,
.req_notify_cq = bnxt_re_req_notify_cq,
+ INIT_RDMA_OBJ_SIZE(ib_pd, bnxt_re_pd, ib_pd),
+ INIT_RDMA_OBJ_SIZE(ib_ucontext, bnxt_re_ucontext, ib_uctx),
};
static int bnxt_re_register_ib(struct bnxt_re_dev *rdev)
@@ -662,7 +691,7 @@ static int bnxt_re_register_ib(struct bnxt_re_dev *rdev)
rdma_set_device_sysfs_group(ibdev, &bnxt_re_dev_attr_group);
ibdev->driver_id = RDMA_DRIVER_BNXT_RE;
ib_set_device_ops(ibdev, &bnxt_re_dev_ops);
- return ib_register_device(ibdev, "bnxt_re%d", NULL);
+ return ib_register_device(ibdev, "bnxt_re%d");
}
static void bnxt_re_dev_remove(struct bnxt_re_dev *rdev)
@@ -686,7 +715,7 @@ static struct bnxt_re_dev *bnxt_re_dev_add(struct net_device *netdev,
struct bnxt_re_dev *rdev;
/* Allocate bnxt_re_dev instance here */
- rdev = (struct bnxt_re_dev *)ib_alloc_device(sizeof(*rdev));
+ rdev = ib_alloc_device(bnxt_re_dev, ibdev);
if (!rdev) {
dev_err(NULL, "%s: bnxt_re_dev allocation failure!",
ROCE_DRV_MODULE_NAME);
@@ -858,6 +887,12 @@ static int bnxt_re_cqn_handler(struct bnxt_qplib_nq *nq,
return 0;
}
+static u32 bnxt_re_get_nqdb_offset(struct bnxt_re_dev *rdev, u16 indx)
+{
+ return bnxt_qplib_is_chip_gen_p5(&rdev->chip_ctx) ?
+ 0x10000 : rdev->msix_entries[indx].db_offset;
+}
+
static void bnxt_re_cleanup_res(struct bnxt_re_dev *rdev)
{
int i;
@@ -871,18 +906,18 @@ static void bnxt_re_cleanup_res(struct bnxt_re_dev *rdev)
static int bnxt_re_init_res(struct bnxt_re_dev *rdev)
{
- int rc = 0, i;
int num_vec_enabled = 0;
+ int rc = 0, i;
+ u32 db_offt;
bnxt_qplib_init_res(&rdev->qplib_res);
for (i = 1; i < rdev->num_msix ; i++) {
+ db_offt = bnxt_re_get_nqdb_offset(rdev, i);
rc = bnxt_qplib_enable_nq(rdev->en_dev->pdev, &rdev->nq[i - 1],
i - 1, rdev->msix_entries[i].vector,
- rdev->msix_entries[i].db_offset,
- &bnxt_re_cqn_handler,
+ db_offt, &bnxt_re_cqn_handler,
&bnxt_re_srqn_handler);
-
if (rc) {
dev_err(rdev_to_dev(rdev),
"Failed to enable NQ with rc = 0x%x", rc);
@@ -894,16 +929,18 @@ static int bnxt_re_init_res(struct bnxt_re_dev *rdev)
fail:
for (i = num_vec_enabled; i >= 0; i--)
bnxt_qplib_disable_nq(&rdev->nq[i]);
-
return rc;
}
static void bnxt_re_free_nq_res(struct bnxt_re_dev *rdev)
{
+ u8 type;
int i;
for (i = 0; i < rdev->num_msix - 1; i++) {
- bnxt_re_net_ring_free(rdev, rdev->nq[i].ring_id);
+ type = bnxt_qplib_get_ring_type(&rdev->chip_ctx);
+ bnxt_re_net_ring_free(rdev, rdev->nq[i].ring_id, type);
+ rdev->nq[i].res = NULL;
bnxt_qplib_free_nq(&rdev->nq[i]);
}
}
@@ -925,8 +962,11 @@ static void bnxt_re_free_res(struct bnxt_re_dev *rdev)
static int bnxt_re_alloc_res(struct bnxt_re_dev *rdev)
{
- int rc = 0, i;
int num_vec_created = 0;
+ dma_addr_t *pg_map;
+ int rc = 0, i;
+ int pages;
+ u8 type;
/* Configure and allocate resources for qplib */
rdev->qplib_res.rcfw = &rdev->rcfw;
@@ -947,6 +987,7 @@ static int bnxt_re_alloc_res(struct bnxt_re_dev *rdev)
goto dealloc_res;
for (i = 0; i < rdev->num_msix - 1; i++) {
+ rdev->nq[i].res = &rdev->qplib_res;
rdev->nq[i].hwq.max_elements = BNXT_RE_MAX_CQ_COUNT +
BNXT_RE_MAX_SRQC_COUNT + 2;
rc = bnxt_qplib_alloc_nq(rdev->en_dev->pdev, &rdev->nq[i]);
@@ -955,13 +996,13 @@ static int bnxt_re_alloc_res(struct bnxt_re_dev *rdev)
i, rc);
goto free_nq;
}
- rc = bnxt_re_net_ring_alloc
- (rdev, rdev->nq[i].hwq.pbl[PBL_LVL_0].pg_map_arr,
- rdev->nq[i].hwq.pbl[rdev->nq[i].hwq.level].pg_count,
- HWRM_RING_ALLOC_CMPL,
- BNXT_QPLIB_NQE_MAX_CNT - 1,
- rdev->msix_entries[i + 1].ring_idx,
- &rdev->nq[i].ring_id);
+ type = bnxt_qplib_get_ring_type(&rdev->chip_ctx);
+ pg_map = rdev->nq[i].hwq.pbl[PBL_LVL_0].pg_map_arr;
+ pages = rdev->nq[i].hwq.pbl[rdev->nq[i].hwq.level].pg_count;
+ rc = bnxt_re_net_ring_alloc(rdev, pg_map, pages, type,
+ BNXT_QPLIB_NQE_MAX_CNT - 1,
+ rdev->msix_entries[i + 1].ring_idx,
+ &rdev->nq[i].ring_id);
if (rc) {
dev_err(rdev_to_dev(rdev),
"Failed to allocate NQ fw id with rc = 0x%x",
@@ -974,7 +1015,8 @@ static int bnxt_re_alloc_res(struct bnxt_re_dev *rdev)
return 0;
free_nq:
for (i = num_vec_created; i >= 0; i--) {
- bnxt_re_net_ring_free(rdev, rdev->nq[i].ring_id);
+ type = bnxt_qplib_get_ring_type(&rdev->chip_ctx);
+ bnxt_re_net_ring_free(rdev, rdev->nq[i].ring_id, type);
bnxt_qplib_free_nq(&rdev->nq[i]);
}
bnxt_qplib_dealloc_dpi(&rdev->qplib_res,
@@ -1228,6 +1270,7 @@ static void bnxt_re_query_hwrm_intf_version(struct bnxt_re_dev *rdev)
static void bnxt_re_ib_unreg(struct bnxt_re_dev *rdev)
{
+ u8 type;
int rc;
if (test_and_clear_bit(BNXT_RE_FLAG_IBDEV_REGISTERED, &rdev->flags)) {
@@ -1251,7 +1294,8 @@ static void bnxt_re_ib_unreg(struct bnxt_re_dev *rdev)
bnxt_re_net_stats_ctx_free(rdev, rdev->qplib_ctx.stats.fw_id);
bnxt_qplib_free_ctx(rdev->en_dev->pdev, &rdev->qplib_ctx);
bnxt_qplib_disable_rcfw_channel(&rdev->rcfw);
- bnxt_re_net_ring_free(rdev, rdev->rcfw.creq_ring_id);
+ type = bnxt_qplib_get_ring_type(&rdev->chip_ctx);
+ bnxt_re_net_ring_free(rdev, rdev->rcfw.creq_ring_id, type);
bnxt_qplib_free_rcfw_channel(&rdev->rcfw);
}
if (test_and_clear_bit(BNXT_RE_FLAG_GOT_MSIX, &rdev->flags)) {
@@ -1260,6 +1304,8 @@ static void bnxt_re_ib_unreg(struct bnxt_re_dev *rdev)
dev_warn(rdev_to_dev(rdev),
"Failed to free MSI-X vectors: %#x", rc);
}
+
+ bnxt_re_destroy_chip_ctx(rdev);
if (test_and_clear_bit(BNXT_RE_FLAG_NETDEV_REGISTERED, &rdev->flags)) {
rc = bnxt_re_unregister_netdev(rdev);
if (rc)
@@ -1280,9 +1326,12 @@ static void bnxt_re_worker(struct work_struct *work)
static int bnxt_re_ib_reg(struct bnxt_re_dev *rdev)
{
- int rc;
-
+ dma_addr_t *pg_map;
+ u32 db_offt, ridx;
+ int pages, vid;
bool locked;
+ u8 type;
+ int rc;
/* Acquire rtnl lock through out this function */
rtnl_lock();
@@ -1297,6 +1346,12 @@ static int bnxt_re_ib_reg(struct bnxt_re_dev *rdev)
}
set_bit(BNXT_RE_FLAG_NETDEV_REGISTERED, &rdev->flags);
+ rc = bnxt_re_setup_chip_ctx(rdev);
+ if (rc) {
+ dev_err(rdev_to_dev(rdev), "Failed to get chip context\n");
+ return -EINVAL;
+ }
+
/* Check whether VF or PF */
bnxt_re_get_sriov_func_type(rdev);
@@ -1320,21 +1375,22 @@ static int bnxt_re_ib_reg(struct bnxt_re_dev *rdev)
pr_err("Failed to allocate RCFW Channel: %#x\n", rc);
goto fail;
}
- rc = bnxt_re_net_ring_alloc
- (rdev, rdev->rcfw.creq.pbl[PBL_LVL_0].pg_map_arr,
- rdev->rcfw.creq.pbl[rdev->rcfw.creq.level].pg_count,
- HWRM_RING_ALLOC_CMPL, BNXT_QPLIB_CREQE_MAX_CNT - 1,
- rdev->msix_entries[BNXT_RE_AEQ_IDX].ring_idx,
- &rdev->rcfw.creq_ring_id);
+ type = bnxt_qplib_get_ring_type(&rdev->chip_ctx);
+ pg_map = rdev->rcfw.creq.pbl[PBL_LVL_0].pg_map_arr;
+ pages = rdev->rcfw.creq.pbl[rdev->rcfw.creq.level].pg_count;
+ ridx = rdev->msix_entries[BNXT_RE_AEQ_IDX].ring_idx;
+ rc = bnxt_re_net_ring_alloc(rdev, pg_map, pages, type,
+ BNXT_QPLIB_CREQE_MAX_CNT - 1,
+ ridx, &rdev->rcfw.creq_ring_id);
if (rc) {
pr_err("Failed to allocate CREQ: %#x\n", rc);
goto free_rcfw;
}
- rc = bnxt_qplib_enable_rcfw_channel
- (rdev->en_dev->pdev, &rdev->rcfw,
- rdev->msix_entries[BNXT_RE_AEQ_IDX].vector,
- rdev->msix_entries[BNXT_RE_AEQ_IDX].db_offset,
- rdev->is_virtfn, &bnxt_re_aeq_handler);
+ db_offt = bnxt_re_get_nqdb_offset(rdev, BNXT_RE_AEQ_IDX);
+ vid = rdev->msix_entries[BNXT_RE_AEQ_IDX].vector;
+ rc = bnxt_qplib_enable_rcfw_channel(rdev->en_dev->pdev, &rdev->rcfw,
+ vid, db_offt, rdev->is_virtfn,
+ &bnxt_re_aeq_handler);
if (rc) {
pr_err("Failed to enable RCFW channel: %#x\n", rc);
goto free_ring;
@@ -1347,7 +1403,8 @@ static int bnxt_re_ib_reg(struct bnxt_re_dev *rdev)
if (!rdev->is_virtfn)
bnxt_re_set_resource_limits(rdev);
- rc = bnxt_qplib_alloc_ctx(rdev->en_dev->pdev, &rdev->qplib_ctx, 0);
+ rc = bnxt_qplib_alloc_ctx(rdev->en_dev->pdev, &rdev->qplib_ctx, 0,
+ bnxt_qplib_is_chip_gen_p5(&rdev->chip_ctx));
if (rc) {
pr_err("Failed to allocate QPLIB context: %#x\n", rc);
goto disable_rcfw;
@@ -1418,7 +1475,8 @@ free_ctx:
disable_rcfw:
bnxt_qplib_disable_rcfw_channel(&rdev->rcfw);
free_ring:
- bnxt_re_net_ring_free(rdev, rdev->rcfw.creq_ring_id);
+ type = bnxt_qplib_get_ring_type(&rdev->chip_ctx);
+ bnxt_re_net_ring_free(rdev, rdev->rcfw.creq_ring_id, type);
free_rcfw:
bnxt_qplib_free_rcfw_channel(&rdev->rcfw);
fail:
diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.c b/drivers/infiniband/hw/bnxt_re/qplib_fp.c
index b98b054148cd..71c34d5b0ac0 100644
--- a/drivers/infiniband/hw/bnxt_re/qplib_fp.c
+++ b/drivers/infiniband/hw/bnxt_re/qplib_fp.c
@@ -44,6 +44,7 @@
#include <linux/slab.h>
#include <linux/pci.h>
#include <linux/prefetch.h>
+#include <linux/if_ether.h>
#include "roce_hsi.h"
@@ -244,6 +245,7 @@ static void bnxt_qplib_service_nq(unsigned long data)
u16 type;
int budget = nq->budget;
uintptr_t q_handle;
+ bool gen_p5 = bnxt_qplib_is_chip_gen_p5(nq->res->cctx);
/* Service the NQ until empty */
raw_cons = hwq->cons;
@@ -290,7 +292,7 @@ static void bnxt_qplib_service_nq(unsigned long data)
q_handle |= (u64)le32_to_cpu(nqsrqe->srq_handle_high)
<< 32;
bnxt_qplib_arm_srq((struct bnxt_qplib_srq *)q_handle,
- DBR_DBR_TYPE_SRQ_ARMENA);
+ DBC_DBC_TYPE_SRQ_ARMENA);
if (!nq->srqn_handler(nq,
(struct bnxt_qplib_srq *)q_handle,
nqsrqe->event))
@@ -312,7 +314,9 @@ static void bnxt_qplib_service_nq(unsigned long data)
}
if (hwq->cons != raw_cons) {
hwq->cons = raw_cons;
- NQ_DB_REARM(nq->bar_reg_iomem, hwq->cons, hwq->max_elements);
+ bnxt_qplib_ring_nq_db_rearm(nq->bar_reg_iomem, hwq->cons,
+ hwq->max_elements, nq->ring_id,
+ gen_p5);
}
}
@@ -336,9 +340,11 @@ static irqreturn_t bnxt_qplib_nq_irq(int irq, void *dev_instance)
void bnxt_qplib_nq_stop_irq(struct bnxt_qplib_nq *nq, bool kill)
{
+ bool gen_p5 = bnxt_qplib_is_chip_gen_p5(nq->res->cctx);
tasklet_disable(&nq->worker);
/* Mask h/w interrupt */
- NQ_DB(nq->bar_reg_iomem, nq->hwq.cons, nq->hwq.max_elements);
+ bnxt_qplib_ring_nq_db(nq->bar_reg_iomem, nq->hwq.cons,
+ nq->hwq.max_elements, nq->ring_id, gen_p5);
/* Sync with last running IRQ handler */
synchronize_irq(nq->vector);
if (kill)
@@ -373,6 +379,7 @@ void bnxt_qplib_disable_nq(struct bnxt_qplib_nq *nq)
int bnxt_qplib_nq_start_irq(struct bnxt_qplib_nq *nq, int nq_indx,
int msix_vector, bool need_init)
{
+ bool gen_p5 = bnxt_qplib_is_chip_gen_p5(nq->res->cctx);
int rc;
if (nq->requested)
@@ -399,7 +406,8 @@ int bnxt_qplib_nq_start_irq(struct bnxt_qplib_nq *nq, int nq_indx,
nq->vector, nq_indx);
}
nq->requested = true;
- NQ_DB_REARM(nq->bar_reg_iomem, nq->hwq.cons, nq->hwq.max_elements);
+ bnxt_qplib_ring_nq_db_rearm(nq->bar_reg_iomem, nq->hwq.cons,
+ nq->hwq.max_elements, nq->ring_id, gen_p5);
return rc;
}
@@ -433,7 +441,8 @@ int bnxt_qplib_enable_nq(struct pci_dev *pdev, struct bnxt_qplib_nq *nq,
rc = -ENOMEM;
goto fail;
}
- nq->bar_reg_iomem = ioremap_nocache(nq_base + nq->bar_reg_off, 4);
+ /* Unconditionally map 8 bytes to support 57500 series */
+ nq->bar_reg_iomem = ioremap_nocache(nq_base + nq->bar_reg_off, 8);
if (!nq->bar_reg_iomem) {
rc = -ENOMEM;
goto fail;
@@ -462,15 +471,17 @@ void bnxt_qplib_free_nq(struct bnxt_qplib_nq *nq)
int bnxt_qplib_alloc_nq(struct pci_dev *pdev, struct bnxt_qplib_nq *nq)
{
+ u8 hwq_type;
+
nq->pdev = pdev;
if (!nq->hwq.max_elements ||
nq->hwq.max_elements > BNXT_QPLIB_NQE_MAX_CNT)
nq->hwq.max_elements = BNXT_QPLIB_NQE_MAX_CNT;
-
+ hwq_type = bnxt_qplib_get_hwq_type(nq->res);
if (bnxt_qplib_alloc_init_hwq(nq->pdev, &nq->hwq, NULL, 0,
&nq->hwq.max_elements,
BNXT_QPLIB_MAX_NQE_ENTRY_SIZE, 0,
- PAGE_SIZE, HWQ_TYPE_L2_CMPL))
+ PAGE_SIZE, hwq_type))
return -ENOMEM;
nq->budget = 8;
@@ -481,21 +492,19 @@ int bnxt_qplib_alloc_nq(struct pci_dev *pdev, struct bnxt_qplib_nq *nq)
static void bnxt_qplib_arm_srq(struct bnxt_qplib_srq *srq, u32 arm_type)
{
struct bnxt_qplib_hwq *srq_hwq = &srq->hwq;
- struct dbr_dbr db_msg = { 0 };
void __iomem *db;
- u32 sw_prod = 0;
+ u32 sw_prod;
+ u64 val = 0;
/* Ring DB */
- sw_prod = (arm_type == DBR_DBR_TYPE_SRQ_ARM) ? srq->threshold :
- HWQ_CMP(srq_hwq->prod, srq_hwq);
- db_msg.index = cpu_to_le32((sw_prod << DBR_DBR_INDEX_SFT) &
- DBR_DBR_INDEX_MASK);
- db_msg.type_xid = cpu_to_le32(((srq->id << DBR_DBR_XID_SFT) &
- DBR_DBR_XID_MASK) | arm_type);
- db = (arm_type == DBR_DBR_TYPE_SRQ_ARMENA) ?
- srq->dbr_base : srq->dpi->dbr;
- wmb(); /* barrier before db ring */
- __iowrite64_copy(db, &db_msg, sizeof(db_msg) / sizeof(u64));
+ sw_prod = (arm_type == DBC_DBC_TYPE_SRQ_ARM) ?
+ srq->threshold : HWQ_CMP(srq_hwq->prod, srq_hwq);
+ db = (arm_type == DBC_DBC_TYPE_SRQ_ARMENA) ? srq->dbr_base :
+ srq->dpi->dbr;
+ val = ((srq->id << DBC_DBC_XID_SFT) & DBC_DBC_XID_MASK) | arm_type;
+ val <<= 32;
+ val |= (sw_prod << DBC_DBC_INDEX_SFT) & DBC_DBC_INDEX_MASK;
+ writeq(val, db);
}
int bnxt_qplib_destroy_srq(struct bnxt_qplib_res *res,
@@ -590,7 +599,7 @@ int bnxt_qplib_create_srq(struct bnxt_qplib_res *res,
srq->id = le32_to_cpu(resp.xid);
srq->dbr_base = res->dpi_tbl.dbr_bar_reg_iomem;
if (srq->threshold)
- bnxt_qplib_arm_srq(srq, DBR_DBR_TYPE_SRQ_ARMENA);
+ bnxt_qplib_arm_srq(srq, DBC_DBC_TYPE_SRQ_ARMENA);
srq->arm_req = false;
return 0;
@@ -614,7 +623,7 @@ int bnxt_qplib_modify_srq(struct bnxt_qplib_res *res,
srq_hwq->max_elements - sw_cons + sw_prod;
if (count > srq->threshold) {
srq->arm_req = false;
- bnxt_qplib_arm_srq(srq, DBR_DBR_TYPE_SRQ_ARM);
+ bnxt_qplib_arm_srq(srq, DBC_DBC_TYPE_SRQ_ARM);
} else {
/* Deferred arming */
srq->arm_req = true;
@@ -702,10 +711,10 @@ int bnxt_qplib_post_srq_recv(struct bnxt_qplib_srq *srq,
srq_hwq->max_elements - sw_cons + sw_prod;
spin_unlock(&srq_hwq->lock);
/* Ring DB */
- bnxt_qplib_arm_srq(srq, DBR_DBR_TYPE_SRQ);
+ bnxt_qplib_arm_srq(srq, DBC_DBC_TYPE_SRQ);
if (srq->arm_req == true && count > srq->threshold) {
srq->arm_req = false;
- bnxt_qplib_arm_srq(srq, DBR_DBR_TYPE_SRQ_ARM);
+ bnxt_qplib_arm_srq(srq, DBC_DBC_TYPE_SRQ_ARM);
}
done:
return rc;
@@ -853,18 +862,19 @@ exit:
int bnxt_qplib_create_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp)
{
struct bnxt_qplib_rcfw *rcfw = res->rcfw;
- struct sq_send *hw_sq_send_hdr, **hw_sq_send_ptr;
- struct cmdq_create_qp req;
- struct creq_create_qp_resp resp;
- struct bnxt_qplib_pbl *pbl;
- struct sq_psn_search **psn_search_ptr;
unsigned long int psn_search, poff = 0;
+ struct sq_psn_search **psn_search_ptr;
struct bnxt_qplib_q *sq = &qp->sq;
struct bnxt_qplib_q *rq = &qp->rq;
+ int i, rc, req_size, psn_sz = 0;
+ struct sq_send **hw_sq_send_ptr;
+ struct creq_create_qp_resp resp;
struct bnxt_qplib_hwq *xrrq;
- int i, rc, req_size, psn_sz;
u16 cmd_flags = 0, max_ssge;
- u32 sw_prod, qp_flags = 0;
+ struct cmdq_create_qp req;
+ struct bnxt_qplib_pbl *pbl;
+ u32 qp_flags = 0;
+ u16 max_rsge;
RCFW_CMD_PREP(req, CREATE_QP, cmd_flags);
@@ -874,8 +884,11 @@ int bnxt_qplib_create_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp)
req.qp_handle = cpu_to_le64(qp->qp_handle);
/* SQ */
- psn_sz = (qp->type == CMDQ_CREATE_QP_TYPE_RC) ?
- sizeof(struct sq_psn_search) : 0;
+ if (qp->type == CMDQ_CREATE_QP_TYPE_RC) {
+ psn_sz = bnxt_qplib_is_chip_gen_p5(res->cctx) ?
+ sizeof(struct sq_psn_search_ext) :
+ sizeof(struct sq_psn_search);
+ }
sq->hwq.max_elements = sq->max_wqe;
rc = bnxt_qplib_alloc_init_hwq(res->pdev, &sq->hwq, sq->sglist,
sq->nmap, &sq->hwq.max_elements,
@@ -905,10 +918,16 @@ int bnxt_qplib_create_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp)
poff = (psn_search & ~PAGE_MASK) /
BNXT_QPLIB_MAX_PSNE_ENTRY_SIZE;
}
- for (i = 0; i < sq->hwq.max_elements; i++)
+ for (i = 0; i < sq->hwq.max_elements; i++) {
sq->swq[i].psn_search =
&psn_search_ptr[get_psne_pg(i + poff)]
[get_psne_idx(i + poff)];
+ /*psns_ext will be used only for P5 chips. */
+ sq->swq[i].psn_ext =
+ (struct sq_psn_search_ext *)
+ &psn_search_ptr[get_psne_pg(i + poff)]
+ [get_psne_idx(i + poff)];
+ }
}
pbl = &sq->hwq.pbl[PBL_LVL_0];
req.sq_pbl = cpu_to_le64(pbl->pg_map_arr[0]);
@@ -929,14 +948,6 @@ int bnxt_qplib_create_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp)
CMDQ_CREATE_QP_SQ_PG_SIZE_PG_1G :
CMDQ_CREATE_QP_SQ_PG_SIZE_PG_4K);
- /* initialize all SQ WQEs to LOCAL_INVALID (sq prep for hw fetch) */
- hw_sq_send_ptr = (struct sq_send **)sq->hwq.pbl_ptr;
- for (sw_prod = 0; sw_prod < sq->hwq.max_elements; sw_prod++) {
- hw_sq_send_hdr = &hw_sq_send_ptr[get_sqe_pg(sw_prod)]
- [get_sqe_idx(sw_prod)];
- hw_sq_send_hdr->wqe_type = SQ_BASE_WQE_TYPE_LOCAL_INVALID;
- }
-
if (qp->scq)
req.scq_cid = cpu_to_le32(qp->scq->id);
@@ -1007,8 +1018,9 @@ int bnxt_qplib_create_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp)
req.sq_fwo_sq_sge = cpu_to_le16(
((max_ssge & CMDQ_CREATE_QP_SQ_SGE_MASK)
<< CMDQ_CREATE_QP_SQ_SGE_SFT) | 0);
+ max_rsge = bnxt_qplib_is_chip_gen_p5(res->cctx) ? 6 : rq->max_sge;
req.rq_fwo_rq_sge = cpu_to_le16(
- ((rq->max_sge & CMDQ_CREATE_QP_RQ_SGE_MASK)
+ ((max_rsge & CMDQ_CREATE_QP_RQ_SGE_MASK)
<< CMDQ_CREATE_QP_RQ_SGE_SFT) | 0);
/* ORRQ and IRRQ */
if (psn_sz) {
@@ -1053,6 +1065,7 @@ int bnxt_qplib_create_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp)
qp->id = le32_to_cpu(resp.xid);
qp->cur_qp_state = CMDQ_MODIFY_QP_NEW_STATE_RESET;
+ qp->cctx = res->cctx;
INIT_LIST_HEAD(&qp->sq_flush);
INIT_LIST_HEAD(&qp->rq_flush);
rcfw->qp_tbl[qp->id].qp_id = qp->id;
@@ -1494,19 +1507,16 @@ void *bnxt_qplib_get_qp1_rq_buf(struct bnxt_qplib_qp *qp,
void bnxt_qplib_post_send_db(struct bnxt_qplib_qp *qp)
{
struct bnxt_qplib_q *sq = &qp->sq;
- struct dbr_dbr db_msg = { 0 };
u32 sw_prod;
+ u64 val = 0;
+ val = (((qp->id << DBC_DBC_XID_SFT) & DBC_DBC_XID_MASK) |
+ DBC_DBC_TYPE_SQ);
+ val <<= 32;
sw_prod = HWQ_CMP(sq->hwq.prod, &sq->hwq);
-
- db_msg.index = cpu_to_le32((sw_prod << DBR_DBR_INDEX_SFT) &
- DBR_DBR_INDEX_MASK);
- db_msg.type_xid =
- cpu_to_le32(((qp->id << DBR_DBR_XID_SFT) & DBR_DBR_XID_MASK) |
- DBR_DBR_TYPE_SQ);
+ val |= (sw_prod << DBC_DBC_INDEX_SFT) & DBC_DBC_INDEX_MASK;
/* Flush all the WQE writes to HW */
- wmb();
- __iowrite64_copy(qp->dpi->dbr, &db_msg, sizeof(db_msg) / sizeof(u64));
+ writeq(val, qp->dpi->dbr);
}
int bnxt_qplib_post_send(struct bnxt_qplib_qp *qp,
@@ -1617,7 +1627,8 @@ int bnxt_qplib_post_send(struct bnxt_qplib_qp *qp,
((offsetof(typeof(*sqe), data) + 15) >> 4);
sqe->inv_key_or_imm_data = cpu_to_le32(
wqe->send.inv_key);
- if (qp->type == CMDQ_CREATE_QP_TYPE_UD) {
+ if (qp->type == CMDQ_CREATE_QP_TYPE_UD ||
+ qp->type == CMDQ_CREATE_QP_TYPE_GSI) {
sqe->q_key = cpu_to_le32(wqe->send.q_key);
sqe->dst_qp = cpu_to_le32(
wqe->send.dst_qp & SQ_SEND_DST_QP_MASK);
@@ -1741,14 +1752,26 @@ int bnxt_qplib_post_send(struct bnxt_qplib_qp *qp,
}
swq->next_psn = sq->psn & BTH_PSN_MASK;
if (swq->psn_search) {
- swq->psn_search->opcode_start_psn = cpu_to_le32(
- ((swq->start_psn << SQ_PSN_SEARCH_START_PSN_SFT) &
- SQ_PSN_SEARCH_START_PSN_MASK) |
- ((wqe->type << SQ_PSN_SEARCH_OPCODE_SFT) &
- SQ_PSN_SEARCH_OPCODE_MASK));
- swq->psn_search->flags_next_psn = cpu_to_le32(
- ((swq->next_psn << SQ_PSN_SEARCH_NEXT_PSN_SFT) &
- SQ_PSN_SEARCH_NEXT_PSN_MASK));
+ u32 opcd_spsn;
+ u32 flg_npsn;
+
+ opcd_spsn = ((swq->start_psn << SQ_PSN_SEARCH_START_PSN_SFT) &
+ SQ_PSN_SEARCH_START_PSN_MASK);
+ opcd_spsn |= ((wqe->type << SQ_PSN_SEARCH_OPCODE_SFT) &
+ SQ_PSN_SEARCH_OPCODE_MASK);
+ flg_npsn = ((swq->next_psn << SQ_PSN_SEARCH_NEXT_PSN_SFT) &
+ SQ_PSN_SEARCH_NEXT_PSN_MASK);
+ if (bnxt_qplib_is_chip_gen_p5(qp->cctx)) {
+ swq->psn_ext->opcode_start_psn =
+ cpu_to_le32(opcd_spsn);
+ swq->psn_ext->flags_next_psn =
+ cpu_to_le32(flg_npsn);
+ } else {
+ swq->psn_search->opcode_start_psn =
+ cpu_to_le32(opcd_spsn);
+ swq->psn_search->flags_next_psn =
+ cpu_to_le32(flg_npsn);
+ }
}
queue_err:
if (sch_handler) {
@@ -1785,19 +1808,16 @@ done:
void bnxt_qplib_post_recv_db(struct bnxt_qplib_qp *qp)
{
struct bnxt_qplib_q *rq = &qp->rq;
- struct dbr_dbr db_msg = { 0 };
u32 sw_prod;
+ u64 val = 0;
+ val = (((qp->id << DBC_DBC_XID_SFT) & DBC_DBC_XID_MASK) |
+ DBC_DBC_TYPE_RQ);
+ val <<= 32;
sw_prod = HWQ_CMP(rq->hwq.prod, &rq->hwq);
- db_msg.index = cpu_to_le32((sw_prod << DBR_DBR_INDEX_SFT) &
- DBR_DBR_INDEX_MASK);
- db_msg.type_xid =
- cpu_to_le32(((qp->id << DBR_DBR_XID_SFT) & DBR_DBR_XID_MASK) |
- DBR_DBR_TYPE_RQ);
-
+ val |= (sw_prod << DBC_DBC_INDEX_SFT) & DBC_DBC_INDEX_MASK;
/* Flush the writes to HW Rx WQE before the ringing Rx DB */
- wmb();
- __iowrite64_copy(qp->dpi->dbr, &db_msg, sizeof(db_msg) / sizeof(u64));
+ writeq(val, qp->dpi->dbr);
}
int bnxt_qplib_post_recv(struct bnxt_qplib_qp *qp,
@@ -1881,32 +1901,28 @@ done:
/* Spinlock must be held */
static void bnxt_qplib_arm_cq_enable(struct bnxt_qplib_cq *cq)
{
- struct dbr_dbr db_msg = { 0 };
+ u64 val = 0;
- db_msg.type_xid =
- cpu_to_le32(((cq->id << DBR_DBR_XID_SFT) & DBR_DBR_XID_MASK) |
- DBR_DBR_TYPE_CQ_ARMENA);
+ val = ((cq->id << DBC_DBC_XID_SFT) & DBC_DBC_XID_MASK) |
+ DBC_DBC_TYPE_CQ_ARMENA;
+ val <<= 32;
/* Flush memory writes before enabling the CQ */
- wmb();
- __iowrite64_copy(cq->dbr_base, &db_msg, sizeof(db_msg) / sizeof(u64));
+ writeq(val, cq->dbr_base);
}
static void bnxt_qplib_arm_cq(struct bnxt_qplib_cq *cq, u32 arm_type)
{
struct bnxt_qplib_hwq *cq_hwq = &cq->hwq;
- struct dbr_dbr db_msg = { 0 };
u32 sw_cons;
+ u64 val = 0;
/* Ring DB */
+ val = ((cq->id << DBC_DBC_XID_SFT) & DBC_DBC_XID_MASK) | arm_type;
+ val <<= 32;
sw_cons = HWQ_CMP(cq_hwq->cons, cq_hwq);
- db_msg.index = cpu_to_le32((sw_cons << DBR_DBR_INDEX_SFT) &
- DBR_DBR_INDEX_MASK);
- db_msg.type_xid =
- cpu_to_le32(((cq->id << DBR_DBR_XID_SFT) & DBR_DBR_XID_MASK) |
- arm_type);
+ val |= (sw_cons << DBC_DBC_INDEX_SFT) & DBC_DBC_INDEX_MASK;
/* flush memory writes before arming the CQ */
- wmb();
- __iowrite64_copy(cq->dpi->dbr, &db_msg, sizeof(db_msg) / sizeof(u64));
+ writeq(val, cq->dpi->dbr);
}
int bnxt_qplib_create_cq(struct bnxt_qplib_res *res, struct bnxt_qplib_cq *cq)
@@ -2053,6 +2069,7 @@ static int __flush_rq(struct bnxt_qplib_q *rq, struct bnxt_qplib_qp *qp,
opcode = CQ_BASE_CQE_TYPE_RES_RC;
break;
case CMDQ_CREATE_QP_TYPE_UD:
+ case CMDQ_CREATE_QP_TYPE_GSI:
opcode = CQ_BASE_CQE_TYPE_RES_UD;
break;
}
@@ -2125,7 +2142,7 @@ static int do_wa9060(struct bnxt_qplib_qp *qp, struct bnxt_qplib_cq *cq,
sq->send_phantom = true;
/* TODO: Only ARM if the previous SQE is ARMALL */
- bnxt_qplib_arm_cq(cq, DBR_DBR_TYPE_CQ_ARMALL);
+ bnxt_qplib_arm_cq(cq, DBC_DBC_TYPE_CQ_ARMALL);
rc = -EAGAIN;
goto out;
@@ -2410,12 +2427,14 @@ static int bnxt_qplib_cq_process_res_ud(struct bnxt_qplib_cq *cq,
}
cqe = *pcqe;
cqe->opcode = hwcqe->cqe_type_toggle & CQ_BASE_CQE_TYPE_MASK;
- cqe->length = le32_to_cpu(hwcqe->length);
+ cqe->length = (u32)le16_to_cpu(hwcqe->length);
+ cqe->cfa_meta = le16_to_cpu(hwcqe->cfa_metadata);
cqe->invrkey = le32_to_cpu(hwcqe->imm_data);
cqe->flags = le16_to_cpu(hwcqe->flags);
cqe->status = hwcqe->status;
cqe->qp_handle = (u64)(unsigned long)qp;
- memcpy(cqe->smac, hwcqe->src_mac, 6);
+ /*FIXME: Endianness fix needed for smace */
+ memcpy(cqe->smac, hwcqe->src_mac, ETH_ALEN);
wr_id_idx = le32_to_cpu(hwcqe->src_qp_high_srq_or_rq_wr_id)
& CQ_RES_UD_SRQ_OR_RQ_WR_ID_MASK;
cqe->src_qp = le16_to_cpu(hwcqe->src_qp_low) |
@@ -2794,7 +2813,7 @@ int bnxt_qplib_poll_cq(struct bnxt_qplib_cq *cq, struct bnxt_qplib_cqe *cqe,
}
if (cq->hwq.cons != raw_cons) {
cq->hwq.cons = raw_cons;
- bnxt_qplib_arm_cq(cq, DBR_DBR_TYPE_CQ);
+ bnxt_qplib_arm_cq(cq, DBC_DBC_TYPE_CQ);
}
exit:
return num_cqes - budget;
diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.h b/drivers/infiniband/hw/bnxt_re/qplib_fp.h
index 72352ca80ace..3f618b5f1f06 100644
--- a/drivers/infiniband/hw/bnxt_re/qplib_fp.h
+++ b/drivers/infiniband/hw/bnxt_re/qplib_fp.h
@@ -106,6 +106,7 @@ struct bnxt_qplib_swq {
u32 start_psn;
u32 next_psn;
struct sq_psn_search *psn_search;
+ struct sq_psn_search_ext *psn_ext;
};
struct bnxt_qplib_swqe {
@@ -254,6 +255,7 @@ struct bnxt_qplib_q {
struct bnxt_qplib_qp {
struct bnxt_qplib_pd *pd;
struct bnxt_qplib_dpi *dpi;
+ struct bnxt_qplib_chip_ctx *cctx;
u64 qp_handle;
#define BNXT_QPLIB_QP_ID_INVALID 0xFFFFFFFF
u32 id;
@@ -347,6 +349,7 @@ struct bnxt_qplib_cqe {
u8 type;
u8 opcode;
u32 length;
+ u16 cfa_meta;
u64 wr_id;
union {
__be32 immdata;
@@ -432,13 +435,47 @@ struct bnxt_qplib_cq {
#define NQ_DB_CP_FLAGS (NQ_DB_KEY_CP | \
NQ_DB_IDX_VALID | \
NQ_DB_IRQ_DIS)
-#define NQ_DB_REARM(db, raw_cons, cp_bit) \
- writel(NQ_DB_CP_FLAGS_REARM | ((raw_cons) & ((cp_bit) - 1)), db)
-#define NQ_DB(db, raw_cons, cp_bit) \
- writel(NQ_DB_CP_FLAGS | ((raw_cons) & ((cp_bit) - 1)), db)
+
+static inline void bnxt_qplib_ring_nq_db64(void __iomem *db, u32 index,
+ u32 xid, bool arm)
+{
+ u64 val;
+
+ val = xid & DBC_DBC_XID_MASK;
+ val |= DBC_DBC_PATH_ROCE;
+ val |= arm ? DBC_DBC_TYPE_NQ_ARM : DBC_DBC_TYPE_NQ;
+ val <<= 32;
+ val |= index & DBC_DBC_INDEX_MASK;
+ writeq(val, db);
+}
+
+static inline void bnxt_qplib_ring_nq_db_rearm(void __iomem *db, u32 raw_cons,
+ u32 max_elements, u32 xid,
+ bool gen_p5)
+{
+ u32 index = raw_cons & (max_elements - 1);
+
+ if (gen_p5)
+ bnxt_qplib_ring_nq_db64(db, index, xid, true);
+ else
+ writel(NQ_DB_CP_FLAGS_REARM | (index & DBC_DBC32_XID_MASK), db);
+}
+
+static inline void bnxt_qplib_ring_nq_db(void __iomem *db, u32 raw_cons,
+ u32 max_elements, u32 xid,
+ bool gen_p5)
+{
+ u32 index = raw_cons & (max_elements - 1);
+
+ if (gen_p5)
+ bnxt_qplib_ring_nq_db64(db, index, xid, false);
+ else
+ writel(NQ_DB_CP_FLAGS | (index & DBC_DBC32_XID_MASK), db);
+}
struct bnxt_qplib_nq {
struct pci_dev *pdev;
+ struct bnxt_qplib_res *res;
int vector;
cpumask_t mask;
@@ -448,7 +485,7 @@ struct bnxt_qplib_nq {
struct bnxt_qplib_hwq hwq;
u16 bar_reg;
- u16 bar_reg_off;
+ u32 bar_reg_off;
u16 ring_id;
void __iomem *bar_reg_iomem;
diff --git a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c
index 19551aa43850..c6461e957078 100644
--- a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c
+++ b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c
@@ -359,11 +359,12 @@ static int bnxt_qplib_process_qp_event(struct bnxt_qplib_rcfw *rcfw,
static void bnxt_qplib_service_creq(unsigned long data)
{
struct bnxt_qplib_rcfw *rcfw = (struct bnxt_qplib_rcfw *)data;
+ bool gen_p5 = bnxt_qplib_is_chip_gen_p5(rcfw->res->cctx);
struct bnxt_qplib_hwq *creq = &rcfw->creq;
+ u32 type, budget = CREQ_ENTRY_POLL_BUDGET;
struct creq_base *creqe, **creq_ptr;
u32 sw_cons, raw_cons;
unsigned long flags;
- u32 type, budget = CREQ_ENTRY_POLL_BUDGET;
/* Service the CREQ until budget is over */
spin_lock_irqsave(&creq->lock, flags);
@@ -407,8 +408,9 @@ static void bnxt_qplib_service_creq(unsigned long data)
if (creq->cons != raw_cons) {
creq->cons = raw_cons;
- CREQ_DB_REARM(rcfw->creq_bar_reg_iomem, raw_cons,
- creq->max_elements);
+ bnxt_qplib_ring_creq_db_rearm(rcfw->creq_bar_reg_iomem,
+ raw_cons, creq->max_elements,
+ rcfw->creq_ring_id, gen_p5);
}
spin_unlock_irqrestore(&creq->lock, flags);
}
@@ -480,11 +482,13 @@ int bnxt_qplib_init_rcfw(struct bnxt_qplib_rcfw *rcfw,
req.log2_dbr_pg_size = cpu_to_le16(PAGE_SHIFT -
RCFW_DBR_BASE_PAGE_SHIFT);
/*
- * VFs need not setup the HW context area, PF
+ * Gen P5 devices doesn't require this allocation
+ * as the L2 driver does the same for RoCE also.
+ * Also, VFs need not setup the HW context area, PF
* shall setup this area for VF. Skipping the
* HW programming
*/
- if (is_virtfn)
+ if (is_virtfn || bnxt_qplib_is_chip_gen_p5(rcfw->res->cctx))
goto skip_ctx_setup;
level = ctx->qpc_tbl.level;
@@ -560,12 +564,15 @@ int bnxt_qplib_alloc_rcfw_channel(struct pci_dev *pdev,
struct bnxt_qplib_ctx *ctx,
int qp_tbl_sz)
{
+ u8 hwq_type;
+
rcfw->pdev = pdev;
rcfw->creq.max_elements = BNXT_QPLIB_CREQE_MAX_CNT;
+ hwq_type = bnxt_qplib_get_hwq_type(rcfw->res);
if (bnxt_qplib_alloc_init_hwq(rcfw->pdev, &rcfw->creq, NULL, 0,
&rcfw->creq.max_elements,
- BNXT_QPLIB_CREQE_UNITS, 0, PAGE_SIZE,
- HWQ_TYPE_L2_CMPL)) {
+ BNXT_QPLIB_CREQE_UNITS,
+ 0, PAGE_SIZE, hwq_type)) {
dev_err(&rcfw->pdev->dev,
"HW channel CREQ allocation failed\n");
goto fail;
@@ -607,10 +614,13 @@ fail:
void bnxt_qplib_rcfw_stop_irq(struct bnxt_qplib_rcfw *rcfw, bool kill)
{
+ bool gen_p5 = bnxt_qplib_is_chip_gen_p5(rcfw->res->cctx);
+
tasklet_disable(&rcfw->worker);
/* Mask h/w interrupts */
- CREQ_DB(rcfw->creq_bar_reg_iomem, rcfw->creq.cons,
- rcfw->creq.max_elements);
+ bnxt_qplib_ring_creq_db(rcfw->creq_bar_reg_iomem, rcfw->creq.cons,
+ rcfw->creq.max_elements, rcfw->creq_ring_id,
+ gen_p5);
/* Sync with last running IRQ-handler */
synchronize_irq(rcfw->vector);
if (kill)
@@ -647,6 +657,7 @@ void bnxt_qplib_disable_rcfw_channel(struct bnxt_qplib_rcfw *rcfw)
int bnxt_qplib_rcfw_start_irq(struct bnxt_qplib_rcfw *rcfw, int msix_vector,
bool need_init)
{
+ bool gen_p5 = bnxt_qplib_is_chip_gen_p5(rcfw->res->cctx);
int rc;
if (rcfw->requested)
@@ -663,8 +674,9 @@ int bnxt_qplib_rcfw_start_irq(struct bnxt_qplib_rcfw *rcfw, int msix_vector,
if (rc)
return rc;
rcfw->requested = true;
- CREQ_DB_REARM(rcfw->creq_bar_reg_iomem, rcfw->creq.cons,
- rcfw->creq.max_elements);
+ bnxt_qplib_ring_creq_db_rearm(rcfw->creq_bar_reg_iomem,
+ rcfw->creq.cons, rcfw->creq.max_elements,
+ rcfw->creq_ring_id, gen_p5);
return 0;
}
@@ -684,8 +696,7 @@ int bnxt_qplib_enable_rcfw_channel(struct pci_dev *pdev,
/* General */
rcfw->seq_num = 0;
set_bit(FIRMWARE_FIRST_FLAG, &rcfw->flags);
- bmap_size = BITS_TO_LONGS(rcfw->cmdq_depth *
- sizeof(unsigned long));
+ bmap_size = BITS_TO_LONGS(rcfw->cmdq_depth) * sizeof(unsigned long);
rcfw->cmdq_bitmap = kzalloc(bmap_size, GFP_KERNEL);
if (!rcfw->cmdq_bitmap)
return -ENOMEM;
@@ -718,8 +729,9 @@ int bnxt_qplib_enable_rcfw_channel(struct pci_dev *pdev,
dev_err(&rcfw->pdev->dev,
"CREQ BAR region %d resc start is 0!\n",
rcfw->creq_bar_reg);
+ /* Unconditionally map 8 bytes to support 57500 series */
rcfw->creq_bar_reg_iomem = ioremap_nocache(res_base + cp_bar_reg_off,
- 4);
+ 8);
if (!rcfw->creq_bar_reg_iomem) {
dev_err(&rcfw->pdev->dev, "CREQ BAR region %d mapping failed\n",
rcfw->creq_bar_reg);
diff --git a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h
index be0ef0e8c53e..2138533bb642 100644
--- a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h
+++ b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h
@@ -157,10 +157,46 @@ static inline u32 get_creq_idx(u32 val)
#define CREQ_DB_CP_FLAGS (CREQ_DB_KEY_CP | \
CREQ_DB_IDX_VALID | \
CREQ_DB_IRQ_DIS)
-#define CREQ_DB_REARM(db, raw_cons, cp_bit) \
- writel(CREQ_DB_CP_FLAGS_REARM | ((raw_cons) & ((cp_bit) - 1)), db)
-#define CREQ_DB(db, raw_cons, cp_bit) \
- writel(CREQ_DB_CP_FLAGS | ((raw_cons) & ((cp_bit) - 1)), db)
+
+static inline void bnxt_qplib_ring_creq_db64(void __iomem *db, u32 index,
+ u32 xid, bool arm)
+{
+ u64 val = 0;
+
+ val = xid & DBC_DBC_XID_MASK;
+ val |= DBC_DBC_PATH_ROCE;
+ val |= arm ? DBC_DBC_TYPE_NQ_ARM : DBC_DBC_TYPE_NQ;
+ val <<= 32;
+ val |= index & DBC_DBC_INDEX_MASK;
+
+ writeq(val, db);
+}
+
+static inline void bnxt_qplib_ring_creq_db_rearm(void __iomem *db, u32 raw_cons,
+ u32 max_elements, u32 xid,
+ bool gen_p5)
+{
+ u32 index = raw_cons & (max_elements - 1);
+
+ if (gen_p5)
+ bnxt_qplib_ring_creq_db64(db, index, xid, true);
+ else
+ writel(CREQ_DB_CP_FLAGS_REARM | (index & DBC_DBC32_XID_MASK),
+ db);
+}
+
+static inline void bnxt_qplib_ring_creq_db(void __iomem *db, u32 raw_cons,
+ u32 max_elements, u32 xid,
+ bool gen_p5)
+{
+ u32 index = raw_cons & (max_elements - 1);
+
+ if (gen_p5)
+ bnxt_qplib_ring_creq_db64(db, index, xid, true);
+ else
+ writel(CREQ_DB_CP_FLAGS | (index & DBC_DBC32_XID_MASK),
+ db);
+}
#define CREQ_ENTRY_POLL_BUDGET 0x100
@@ -187,6 +223,7 @@ struct bnxt_qplib_qp_node {
/* RCFW Communication Channels */
struct bnxt_qplib_rcfw {
struct pci_dev *pdev;
+ struct bnxt_qplib_res *res;
int vector;
struct tasklet_struct worker;
bool requested;
diff --git a/drivers/infiniband/hw/bnxt_re/qplib_res.c b/drivers/infiniband/hw/bnxt_re/qplib_res.c
index 57d4951679cb..0bc24f934829 100644
--- a/drivers/infiniband/hw/bnxt_re/qplib_res.c
+++ b/drivers/infiniband/hw/bnxt_re/qplib_res.c
@@ -85,7 +85,7 @@ static void __free_pbl(struct pci_dev *pdev, struct bnxt_qplib_pbl *pbl,
static int __alloc_pbl(struct pci_dev *pdev, struct bnxt_qplib_pbl *pbl,
struct scatterlist *sghead, u32 pages, u32 pg_size)
{
- struct scatterlist *sg;
+ struct sg_dma_page_iter sg_iter;
bool is_umem = false;
int i;
@@ -116,13 +116,11 @@ static int __alloc_pbl(struct pci_dev *pdev, struct bnxt_qplib_pbl *pbl,
} else {
i = 0;
is_umem = true;
- for_each_sg(sghead, sg, pages, i) {
- pbl->pg_map_arr[i] = sg_dma_address(sg);
- pbl->pg_arr[i] = sg_virt(sg);
- if (!pbl->pg_arr[i])
- goto fail;
-
+ for_each_sg_dma_page (sghead, &sg_iter, pages, 0) {
+ pbl->pg_map_arr[i] = sg_page_iter_dma_address(&sg_iter);
+ pbl->pg_arr[i] = NULL;
pbl->pg_count++;
+ i++;
}
}
@@ -330,13 +328,13 @@ void bnxt_qplib_free_ctx(struct pci_dev *pdev,
*/
int bnxt_qplib_alloc_ctx(struct pci_dev *pdev,
struct bnxt_qplib_ctx *ctx,
- bool virt_fn)
+ bool virt_fn, bool is_p5)
{
int i, j, k, rc = 0;
int fnz_idx = -1;
__le64 **pbl_ptr;
- if (virt_fn)
+ if (virt_fn || is_p5)
goto stats_alloc;
/* QPC Tables */
@@ -762,7 +760,11 @@ static int bnxt_qplib_alloc_stats_ctx(struct pci_dev *pdev,
{
memset(stats, 0, sizeof(*stats));
stats->fw_id = -1;
- stats->size = sizeof(struct ctx_hw_stats);
+ /* 128 byte aligned context memory is required only for 57500.
+ * However making this unconditional, it does not harm previous
+ * generation.
+ */
+ stats->size = ALIGN(sizeof(struct ctx_hw_stats), 128);
stats->dma = dma_alloc_coherent(&pdev->dev, stats->size,
&stats->dma_map, GFP_KERNEL);
if (!stats->dma) {
diff --git a/drivers/infiniband/hw/bnxt_re/qplib_res.h b/drivers/infiniband/hw/bnxt_re/qplib_res.h
index 1e80aa7bbcce..32cebd0f1436 100644
--- a/drivers/infiniband/hw/bnxt_re/qplib_res.h
+++ b/drivers/infiniband/hw/bnxt_re/qplib_res.h
@@ -180,12 +180,20 @@ struct bnxt_qplib_ctx {
u64 hwrm_intf_ver;
};
+struct bnxt_qplib_chip_ctx {
+ u16 chip_num;
+ u8 chip_rev;
+ u8 chip_metal;
+};
+
+#define CHIP_NUM_57500 0x1750
+
struct bnxt_qplib_res {
struct pci_dev *pdev;
+ struct bnxt_qplib_chip_ctx *cctx;
struct net_device *netdev;
struct bnxt_qplib_rcfw *rcfw;
-
struct bnxt_qplib_pd_tbl pd_tbl;
struct bnxt_qplib_sgid_tbl sgid_tbl;
struct bnxt_qplib_pkey_tbl pkey_tbl;
@@ -193,6 +201,24 @@ struct bnxt_qplib_res {
bool prio;
};
+static inline bool bnxt_qplib_is_chip_gen_p5(struct bnxt_qplib_chip_ctx *cctx)
+{
+ return (cctx->chip_num == CHIP_NUM_57500);
+}
+
+static inline u8 bnxt_qplib_get_hwq_type(struct bnxt_qplib_res *res)
+{
+ return bnxt_qplib_is_chip_gen_p5(res->cctx) ?
+ HWQ_TYPE_QUEUE : HWQ_TYPE_L2_CMPL;
+}
+
+static inline u8 bnxt_qplib_get_ring_type(struct bnxt_qplib_chip_ctx *cctx)
+{
+ return bnxt_qplib_is_chip_gen_p5(cctx) ?
+ RING_ALLOC_REQ_RING_TYPE_NQ :
+ RING_ALLOC_REQ_RING_TYPE_ROCE_CMPL;
+}
+
#define to_bnxt_qplib(ptr, type, member) \
container_of(ptr, type, member)
@@ -226,5 +252,5 @@ void bnxt_qplib_free_ctx(struct pci_dev *pdev,
struct bnxt_qplib_ctx *ctx);
int bnxt_qplib_alloc_ctx(struct pci_dev *pdev,
struct bnxt_qplib_ctx *ctx,
- bool virt_fn);
+ bool virt_fn, bool is_p5);
#endif /* __BNXT_QPLIB_RES_H__ */
diff --git a/drivers/infiniband/hw/bnxt_re/qplib_sp.c b/drivers/infiniband/hw/bnxt_re/qplib_sp.c
index efa0f2949dc7..e9c53e406404 100644
--- a/drivers/infiniband/hw/bnxt_re/qplib_sp.c
+++ b/drivers/infiniband/hw/bnxt_re/qplib_sp.c
@@ -119,7 +119,8 @@ int bnxt_qplib_get_dev_attr(struct bnxt_qplib_rcfw *rcfw,
* reporting the max number
*/
attr->max_qp_wqes -= BNXT_QPLIB_RESERVED_QP_WRS;
- attr->max_qp_sges = sb->max_sge;
+ attr->max_qp_sges = bnxt_qplib_is_chip_gen_p5(rcfw->res->cctx) ?
+ 6 : sb->max_sge;
attr->max_cq = le32_to_cpu(sb->max_cq);
attr->max_cq_wqes = le32_to_cpu(sb->max_cqe);
attr->max_cq_sges = attr->max_qp_sges;
diff --git a/drivers/infiniband/hw/bnxt_re/roce_hsi.h b/drivers/infiniband/hw/bnxt_re/roce_hsi.h
index 8a9ead419ac2..e4b09e7c2175 100644
--- a/drivers/infiniband/hw/bnxt_re/roce_hsi.h
+++ b/drivers/infiniband/hw/bnxt_re/roce_hsi.h
@@ -49,11 +49,11 @@ struct cmpl_doorbell {
#define CMPL_DOORBELL_IDX_SFT 0
#define CMPL_DOORBELL_RESERVED_MASK 0x3000000UL
#define CMPL_DOORBELL_RESERVED_SFT 24
- #define CMPL_DOORBELL_IDX_VALID 0x4000000UL
+ #define CMPL_DOORBELL_IDX_VALID 0x4000000UL
#define CMPL_DOORBELL_MASK 0x8000000UL
#define CMPL_DOORBELL_KEY_MASK 0xf0000000UL
#define CMPL_DOORBELL_KEY_SFT 28
- #define CMPL_DOORBELL_KEY_CMPL (0x2UL << 28)
+ #define CMPL_DOORBELL_KEY_CMPL (0x2UL << 28)
};
/* Status Door Bell Format (4 bytes) */
@@ -71,46 +71,56 @@ struct status_doorbell {
/* RoCE Host Structures */
/* Doorbell Structures */
-/* 64b Doorbell Format (8 bytes) */
-struct dbr_dbr {
- __le32 index;
- #define DBR_DBR_INDEX_MASK 0xfffffUL
- #define DBR_DBR_INDEX_SFT 0
- #define DBR_DBR_RESERVED12_MASK 0xfff00000UL
- #define DBR_DBR_RESERVED12_SFT 20
- __le32 type_xid;
- #define DBR_DBR_XID_MASK 0xfffffUL
- #define DBR_DBR_XID_SFT 0
- #define DBR_DBR_RESERVED8_MASK 0xff00000UL
- #define DBR_DBR_RESERVED8_SFT 20
- #define DBR_DBR_TYPE_MASK 0xf0000000UL
- #define DBR_DBR_TYPE_SFT 28
- #define DBR_DBR_TYPE_SQ (0x0UL << 28)
- #define DBR_DBR_TYPE_RQ (0x1UL << 28)
- #define DBR_DBR_TYPE_SRQ (0x2UL << 28)
- #define DBR_DBR_TYPE_SRQ_ARM (0x3UL << 28)
- #define DBR_DBR_TYPE_CQ (0x4UL << 28)
- #define DBR_DBR_TYPE_CQ_ARMSE (0x5UL << 28)
- #define DBR_DBR_TYPE_CQ_ARMALL (0x6UL << 28)
- #define DBR_DBR_TYPE_CQ_ARMENA (0x7UL << 28)
- #define DBR_DBR_TYPE_SRQ_ARMENA (0x8UL << 28)
- #define DBR_DBR_TYPE_CQ_CUTOFF_ACK (0x9UL << 28)
- #define DBR_DBR_TYPE_NULL (0xfUL << 28)
-};
-
-/* 32b Doorbell Format (4 bytes) */
-struct dbr_dbr32 {
- __le32 type_abs_incr_xid;
- #define DBR_DBR32_XID_MASK 0xfffffUL
- #define DBR_DBR32_XID_SFT 0
- #define DBR_DBR32_RESERVED4_MASK 0xf00000UL
- #define DBR_DBR32_RESERVED4_SFT 20
- #define DBR_DBR32_INCR_MASK 0xf000000UL
- #define DBR_DBR32_INCR_SFT 24
- #define DBR_DBR32_ABS 0x10000000UL
- #define DBR_DBR32_TYPE_MASK 0xe0000000UL
- #define DBR_DBR32_TYPE_SFT 29
- #define DBR_DBR32_TYPE_SQ (0x0UL << 29)
+/* dbc_dbc (size:64b/8B) */
+struct dbc_dbc {
+ __le32 index;
+ #define DBC_DBC_INDEX_MASK 0xffffffUL
+ #define DBC_DBC_INDEX_SFT 0
+ __le32 type_path_xid;
+ #define DBC_DBC_XID_MASK 0xfffffUL
+ #define DBC_DBC_XID_SFT 0
+ #define DBC_DBC_PATH_MASK 0x3000000UL
+ #define DBC_DBC_PATH_SFT 24
+ #define DBC_DBC_PATH_ROCE (0x0UL << 24)
+ #define DBC_DBC_PATH_L2 (0x1UL << 24)
+ #define DBC_DBC_PATH_ENGINE (0x2UL << 24)
+ #define DBC_DBC_PATH_LAST DBC_DBC_PATH_ENGINE
+ #define DBC_DBC_DEBUG_TRACE 0x8000000UL
+ #define DBC_DBC_TYPE_MASK 0xf0000000UL
+ #define DBC_DBC_TYPE_SFT 28
+ #define DBC_DBC_TYPE_SQ (0x0UL << 28)
+ #define DBC_DBC_TYPE_RQ (0x1UL << 28)
+ #define DBC_DBC_TYPE_SRQ (0x2UL << 28)
+ #define DBC_DBC_TYPE_SRQ_ARM (0x3UL << 28)
+ #define DBC_DBC_TYPE_CQ (0x4UL << 28)
+ #define DBC_DBC_TYPE_CQ_ARMSE (0x5UL << 28)
+ #define DBC_DBC_TYPE_CQ_ARMALL (0x6UL << 28)
+ #define DBC_DBC_TYPE_CQ_ARMENA (0x7UL << 28)
+ #define DBC_DBC_TYPE_SRQ_ARMENA (0x8UL << 28)
+ #define DBC_DBC_TYPE_CQ_CUTOFF_ACK (0x9UL << 28)
+ #define DBC_DBC_TYPE_NQ (0xaUL << 28)
+ #define DBC_DBC_TYPE_NQ_ARM (0xbUL << 28)
+ #define DBC_DBC_TYPE_NULL (0xfUL << 28)
+ #define DBC_DBC_TYPE_LAST DBC_DBC_TYPE_NULL
+};
+
+/* dbc_dbc32 (size:32b/4B) */
+struct dbc_dbc32 {
+ __le32 type_abs_incr_xid;
+ #define DBC_DBC32_XID_MASK 0xfffffUL
+ #define DBC_DBC32_XID_SFT 0
+ #define DBC_DBC32_PATH_MASK 0xc00000UL
+ #define DBC_DBC32_PATH_SFT 22
+ #define DBC_DBC32_PATH_ROCE (0x0UL << 22)
+ #define DBC_DBC32_PATH_L2 (0x1UL << 22)
+ #define DBC_DBC32_PATH_LAST DBC_DBC32_PATH_L2
+ #define DBC_DBC32_INCR_MASK 0xf000000UL
+ #define DBC_DBC32_INCR_SFT 24
+ #define DBC_DBC32_ABS 0x10000000UL
+ #define DBC_DBC32_TYPE_MASK 0xe0000000UL
+ #define DBC_DBC32_TYPE_SFT 29
+ #define DBC_DBC32_TYPE_SQ (0x0UL << 29)
+ #define DBC_DBC32_TYPE_LAST DBC_DBC32_TYPE_SQ
};
/* SQ WQE Structures */
@@ -149,7 +159,24 @@ struct sq_psn_search {
#define SQ_PSN_SEARCH_NEXT_PSN_MASK 0xffffffUL
#define SQ_PSN_SEARCH_NEXT_PSN_SFT 0
#define SQ_PSN_SEARCH_FLAGS_MASK 0xff000000UL
- #define SQ_PSN_SEARCH_FLAGS_SFT 24
+ #define SQ_PSN_SEARCH_FLAGS_SFT 24
+};
+
+/* sq_psn_search_ext (size:128b/16B) */
+struct sq_psn_search_ext {
+ __le32 opcode_start_psn;
+ #define SQ_PSN_SEARCH_EXT_START_PSN_MASK 0xffffffUL
+ #define SQ_PSN_SEARCH_EXT_START_PSN_SFT 0
+ #define SQ_PSN_SEARCH_EXT_OPCODE_MASK 0xff000000UL
+ #define SQ_PSN_SEARCH_EXT_OPCODE_SFT 24
+ __le32 flags_next_psn;
+ #define SQ_PSN_SEARCH_EXT_NEXT_PSN_MASK 0xffffffUL
+ #define SQ_PSN_SEARCH_EXT_NEXT_PSN_SFT 0
+ #define SQ_PSN_SEARCH_EXT_FLAGS_MASK 0xff000000UL
+ #define SQ_PSN_SEARCH_EXT_FLAGS_SFT 24
+ __le16 start_slot_idx;
+ __le16 reserved16;
+ __le32 reserved32;
};
/* Send SQ WQE (40 bytes) */
@@ -505,22 +532,24 @@ struct cq_res_rc {
/* Responder UD CQE (32 bytes) */
struct cq_res_ud {
- __le32 length;
+ __le16 length;
#define CQ_RES_UD_LENGTH_MASK 0x3fffUL
#define CQ_RES_UD_LENGTH_SFT 0
- #define CQ_RES_UD_RESERVED18_MASK 0xffffc000UL
- #define CQ_RES_UD_RESERVED18_SFT 14
+ __le16 cfa_metadata;
+ #define CQ_RES_UD_CFA_METADATA_VID_MASK 0xfffUL
+ #define CQ_RES_UD_CFA_METADATA_VID_SFT 0
+ #define CQ_RES_UD_CFA_METADATA_DE 0x1000UL
+ #define CQ_RES_UD_CFA_METADATA_PRI_MASK 0xe000UL
+ #define CQ_RES_UD_CFA_METADATA_PRI_SFT 13
__le32 imm_data;
__le64 qp_handle;
__le16 src_mac[3];
__le16 src_qp_low;
u8 cqe_type_toggle;
- #define CQ_RES_UD_TOGGLE 0x1UL
- #define CQ_RES_UD_CQE_TYPE_MASK 0x1eUL
- #define CQ_RES_UD_CQE_TYPE_SFT 1
+ #define CQ_RES_UD_TOGGLE 0x1UL
+ #define CQ_RES_UD_CQE_TYPE_MASK 0x1eUL
+ #define CQ_RES_UD_CQE_TYPE_SFT 1
#define CQ_RES_UD_CQE_TYPE_RES_UD (0x2UL << 1)
- #define CQ_RES_UD_RESERVED3_MASK 0xe0UL
- #define CQ_RES_UD_RESERVED3_SFT 5
u8 status;
#define CQ_RES_UD_STATUS_OK 0x0UL
#define CQ_RES_UD_STATUS_LOCAL_ACCESS_ERROR 0x1UL
@@ -536,18 +565,30 @@ struct cq_res_ud {
#define CQ_RES_UD_FLAGS_SRQ_SRQ (0x1UL << 0)
#define CQ_RES_UD_FLAGS_SRQ_LAST CQ_RES_UD_FLAGS_SRQ_SRQ
#define CQ_RES_UD_FLAGS_IMM 0x2UL
- #define CQ_RES_UD_FLAGS_ROCE_IP_VER_MASK 0xcUL
- #define CQ_RES_UD_FLAGS_ROCE_IP_VER_SFT 2
- #define CQ_RES_UD_FLAGS_ROCE_IP_VER_V1 (0x0UL << 2)
- #define CQ_RES_UD_FLAGS_ROCE_IP_VER_V2IPV4 (0x2UL << 2)
- #define CQ_RES_UD_FLAGS_ROCE_IP_VER_V2IPV6 (0x3UL << 2)
+ #define CQ_RES_UD_FLAGS_UNUSED_MASK 0xcUL
+ #define CQ_RES_UD_FLAGS_UNUSED_SFT 2
+ #define CQ_RES_UD_FLAGS_ROCE_IP_VER_MASK 0x30UL
+ #define CQ_RES_UD_FLAGS_ROCE_IP_VER_SFT 4
+ #define CQ_RES_UD_FLAGS_ROCE_IP_VER_V1 (0x0UL << 4)
+ #define CQ_RES_UD_FLAGS_ROCE_IP_VER_V2IPV4 (0x2UL << 4)
+ #define CQ_RES_UD_FLAGS_ROCE_IP_VER_V2IPV6 (0x3UL << 4)
#define CQ_RES_UD_FLAGS_ROCE_IP_VER_LAST \
CQ_RES_UD_FLAGS_ROCE_IP_VER_V2IPV6
+ #define CQ_RES_UD_FLAGS_META_FORMAT_MASK 0x3c0UL
+ #define CQ_RES_UD_FLAGS_META_FORMAT_SFT 6
+ #define CQ_RES_UD_FLAGS_META_FORMAT_NONE (0x0UL << 6)
+ #define CQ_RES_UD_FLAGS_META_FORMAT_VLAN (0x1UL << 6)
+ #define CQ_RES_UD_FLAGS_META_FORMAT_TUNNEL_ID (0x2UL << 6)
+ #define CQ_RES_UD_FLAGS_META_FORMAT_CHDR_DATA (0x3UL << 6)
+ #define CQ_RES_UD_FLAGS_META_FORMAT_HDR_OFFSET (0x4UL << 6)
+ #define CQ_RES_UD_FLAGS_META_FORMAT_LAST \
+ CQ_RES_UD_FLAGS_META_FORMAT_HDR_OFFSET
+ #define CQ_RES_UD_FLAGS_EXT_META_FORMAT_MASK 0xc00UL
+ #define CQ_RES_UD_FLAGS_EXT_META_FORMAT_SFT 10
+
__le32 src_qp_high_srq_or_rq_wr_id;
#define CQ_RES_UD_SRQ_OR_RQ_WR_ID_MASK 0xfffffUL
#define CQ_RES_UD_SRQ_OR_RQ_WR_ID_SFT 0
- #define CQ_RES_UD_RESERVED4_MASK 0xf00000UL
- #define CQ_RES_UD_RESERVED4_SFT 20
#define CQ_RES_UD_SRC_QP_HIGH_MASK 0xff000000UL
#define CQ_RES_UD_SRC_QP_HIGH_SFT 24
};
@@ -983,6 +1024,7 @@ struct cmdq_create_qp {
#define CMDQ_CREATE_QP_TYPE_RC 0x2UL
#define CMDQ_CREATE_QP_TYPE_UD 0x4UL
#define CMDQ_CREATE_QP_TYPE_RAW_ETHERTYPE 0x6UL
+ #define CMDQ_CREATE_QP_TYPE_GSI 0x7UL
u8 sq_pg_size_sq_lvl;
#define CMDQ_CREATE_QP_SQ_LVL_MASK 0xfUL
#define CMDQ_CREATE_QP_SQ_LVL_SFT 0
@@ -2719,6 +2761,8 @@ struct creq_query_func_resp_sb {
__le16 max_srq;
__le32 max_gid;
__le32 tqm_alloc_reqs[12];
+ __le32 max_dpi;
+ __le32 reserved_32;
};
/* Set resources command response (16 bytes) */
diff --git a/drivers/infiniband/hw/cxgb3/Makefile b/drivers/infiniband/hw/cxgb3/Makefile
index 66fe0917aba0..34bb86a6ae3a 100644
--- a/drivers/infiniband/hw/cxgb3/Makefile
+++ b/drivers/infiniband/hw/cxgb3/Makefile
@@ -1,5 +1,5 @@
# SPDX-License-Identifier: GPL-2.0
-ccflags-y := -Idrivers/net/ethernet/chelsio/cxgb3
+ccflags-y := -I $(srctree)/drivers/net/ethernet/chelsio/cxgb3
obj-$(CONFIG_INFINIBAND_CXGB3) += iw_cxgb3.o
diff --git a/drivers/infiniband/hw/cxgb3/iwch.c b/drivers/infiniband/hw/cxgb3/iwch.c
index 591de319c178..fb03bc492ef7 100644
--- a/drivers/infiniband/hw/cxgb3/iwch.c
+++ b/drivers/infiniband/hw/cxgb3/iwch.c
@@ -146,7 +146,7 @@ static void open_rnic_dev(struct t3cdev *tdev)
pr_debug("%s t3cdev %p\n", __func__, tdev);
pr_info_once("Chelsio T3 RDMA Driver - version %s\n", DRV_VERSION);
- rnicp = (struct iwch_dev *)ib_alloc_device(sizeof(*rnicp));
+ rnicp = ib_alloc_device(iwch_dev, ibdev);
if (!rnicp) {
pr_err("Cannot allocate ib device\n");
return;
diff --git a/drivers/infiniband/hw/cxgb3/iwch_provider.c b/drivers/infiniband/hw/cxgb3/iwch_provider.c
index b34b1a1bd94b..4accf7b3dcf2 100644
--- a/drivers/infiniband/hw/cxgb3/iwch_provider.c
+++ b/drivers/infiniband/hw/cxgb3/iwch_provider.c
@@ -53,6 +53,7 @@
#include <rdma/ib_smi.h>
#include <rdma/ib_umem.h>
#include <rdma/ib_user_verbs.h>
+#include <rdma/uverbs_ioctl.h>
#include "cxio_hal.h"
#include "iwch.h"
@@ -61,7 +62,7 @@
#include <rdma/cxgb3-abi.h>
#include "common.h"
-static int iwch_dealloc_ucontext(struct ib_ucontext *context)
+static void iwch_dealloc_ucontext(struct ib_ucontext *context)
{
struct iwch_dev *rhp = to_iwch_dev(context->device);
struct iwch_ucontext *ucontext = to_iwch_ucontext(context);
@@ -71,24 +72,20 @@ static int iwch_dealloc_ucontext(struct ib_ucontext *context)
list_for_each_entry_safe(mm, tmp, &ucontext->mmaps, entry)
kfree(mm);
cxio_release_ucontext(&rhp->rdev, &ucontext->uctx);
- kfree(ucontext);
- return 0;
}
-static struct ib_ucontext *iwch_alloc_ucontext(struct ib_device *ibdev,
- struct ib_udata *udata)
+static int iwch_alloc_ucontext(struct ib_ucontext *ucontext,
+ struct ib_udata *udata)
{
- struct iwch_ucontext *context;
+ struct ib_device *ibdev = ucontext->device;
+ struct iwch_ucontext *context = to_iwch_ucontext(ucontext);
struct iwch_dev *rhp = to_iwch_dev(ibdev);
pr_debug("%s ibdev %p\n", __func__, ibdev);
- context = kzalloc(sizeof(*context), GFP_KERNEL);
- if (!context)
- return ERR_PTR(-ENOMEM);
cxio_init_ucontext(&rhp->rdev, &context->uctx);
INIT_LIST_HEAD(&context->mmaps);
spin_lock_init(&context->mmap_lock);
- return &context->ibucontext;
+ return 0;
}
static int iwch_destroy_cq(struct ib_cq *ib_cq)
@@ -370,7 +367,7 @@ static int iwch_mmap(struct ib_ucontext *context, struct vm_area_struct *vma)
return ret;
}
-static int iwch_deallocate_pd(struct ib_pd *pd)
+static void iwch_deallocate_pd(struct ib_pd *pd)
{
struct iwch_dev *rhp;
struct iwch_pd *php;
@@ -379,15 +376,13 @@ static int iwch_deallocate_pd(struct ib_pd *pd)
rhp = php->rhp;
pr_debug("%s ibpd %p pdid 0x%x\n", __func__, pd, php->pdid);
cxio_hal_put_pdid(rhp->rdev.rscp, php->pdid);
- kfree(php);
- return 0;
}
-static struct ib_pd *iwch_allocate_pd(struct ib_device *ibdev,
- struct ib_ucontext *context,
- struct ib_udata *udata)
+static int iwch_allocate_pd(struct ib_pd *pd, struct ib_ucontext *context,
+ struct ib_udata *udata)
{
- struct iwch_pd *php;
+ struct iwch_pd *php = to_iwch_pd(pd);
+ struct ib_device *ibdev = pd->device;
u32 pdid;
struct iwch_dev *rhp;
@@ -395,12 +390,8 @@ static struct ib_pd *iwch_allocate_pd(struct ib_device *ibdev,
rhp = (struct iwch_dev *) ibdev;
pdid = cxio_hal_get_pdid(rhp->rdev.rscp);
if (!pdid)
- return ERR_PTR(-EINVAL);
- php = kzalloc(sizeof(*php), GFP_KERNEL);
- if (!php) {
- cxio_hal_put_pdid(rhp->rdev.rscp, pdid);
- return ERR_PTR(-ENOMEM);
- }
+ return -EINVAL;
+
php->pdid = pdid;
php->rhp = rhp;
if (context) {
@@ -408,11 +399,11 @@ static struct ib_pd *iwch_allocate_pd(struct ib_device *ibdev,
if (ib_copy_to_udata(udata, &resp, sizeof(resp))) {
iwch_deallocate_pd(&php->ibpd);
- return ERR_PTR(-EFAULT);
+ return -EFAULT;
}
}
pr_debug("%s pdid 0x%0x ptr 0x%p\n", __func__, pdid, php);
- return &php->ibpd;
+ return 0;
}
static int iwch_dereg_mr(struct ib_mr *ib_mr)
@@ -522,14 +513,13 @@ static struct ib_mr *iwch_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
u64 virt, int acc, struct ib_udata *udata)
{
__be64 *pages;
- int shift, n, len;
- int i, k, entry;
+ int shift, n, i;
int err = 0;
struct iwch_dev *rhp;
struct iwch_pd *php;
struct iwch_mr *mhp;
struct iwch_reg_user_mr_resp uresp;
- struct scatterlist *sg;
+ struct sg_dma_page_iter sg_iter;
pr_debug("%s ib_pd %p\n", __func__, pd);
php = to_iwch_pd(pd);
@@ -540,14 +530,14 @@ static struct ib_mr *iwch_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
mhp->rhp = rhp;
- mhp->umem = ib_umem_get(pd->uobject->context, start, length, acc, 0);
+ mhp->umem = ib_umem_get(udata, start, length, acc, 0);
if (IS_ERR(mhp->umem)) {
err = PTR_ERR(mhp->umem);
kfree(mhp);
return ERR_PTR(err);
}
- shift = mhp->umem->page_shift;
+ shift = PAGE_SHIFT;
n = mhp->umem->nmap;
@@ -563,19 +553,15 @@ static struct ib_mr *iwch_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
i = n = 0;
- for_each_sg(mhp->umem->sg_head.sgl, sg, mhp->umem->nmap, entry) {
- len = sg_dma_len(sg) >> shift;
- for (k = 0; k < len; ++k) {
- pages[i++] = cpu_to_be64(sg_dma_address(sg) +
- (k << shift));
- if (i == PAGE_SIZE / sizeof *pages) {
- err = iwch_write_pbl(mhp, pages, i, n);
- if (err)
- goto pbl_done;
- n += i;
- i = 0;
- }
- }
+ for_each_sg_dma_page(mhp->umem->sg_head.sgl, &sg_iter, mhp->umem->nmap, 0) {
+ pages[i++] = cpu_to_be64(sg_page_iter_dma_address(&sg_iter));
+ if (i == PAGE_SIZE / sizeof *pages) {
+ err = iwch_write_pbl(mhp, pages, i, n);
+ if (err)
+ goto pbl_done;
+ n += i;
+ i = 0;
+ }
}
if (i)
@@ -836,7 +822,8 @@ static struct ib_qp *iwch_create_qp(struct ib_pd *pd,
* Kernel users need more wq space for fastreg WRs which can take
* 2 WR fragments.
*/
- ucontext = udata ? to_iwch_ucontext(pd->uobject->context) : NULL;
+ ucontext = rdma_udata_to_drv_context(udata, struct iwch_ucontext,
+ ibucontext);
if (!ucontext && wqsize < (rqsize + (2 * sqsize)))
wqsize = roundup_pow_of_two(rqsize +
roundup_pow_of_two(attrs->cap.max_send_wr * 2));
@@ -1130,8 +1117,9 @@ static int iwch_query_port(struct ib_device *ibdev,
static ssize_t hw_rev_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
- struct iwch_dev *iwch_dev = container_of(dev, struct iwch_dev,
- ibdev.dev);
+ struct iwch_dev *iwch_dev =
+ rdma_device_to_drv_device(dev, struct iwch_dev, ibdev);
+
pr_debug("%s dev 0x%p\n", __func__, dev);
return sprintf(buf, "%d\n", iwch_dev->rdev.t3cdev_p->type);
}
@@ -1140,8 +1128,8 @@ static DEVICE_ATTR_RO(hw_rev);
static ssize_t hca_type_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
- struct iwch_dev *iwch_dev = container_of(dev, struct iwch_dev,
- ibdev.dev);
+ struct iwch_dev *iwch_dev =
+ rdma_device_to_drv_device(dev, struct iwch_dev, ibdev);
struct ethtool_drvinfo info;
struct net_device *lldev = iwch_dev->rdev.t3cdev_p->lldev;
@@ -1154,8 +1142,9 @@ static DEVICE_ATTR_RO(hca_type);
static ssize_t board_id_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
- struct iwch_dev *iwch_dev = container_of(dev, struct iwch_dev,
- ibdev.dev);
+ struct iwch_dev *iwch_dev =
+ rdma_device_to_drv_device(dev, struct iwch_dev, ibdev);
+
pr_debug("%s dev 0x%p\n", __func__, dev);
return sprintf(buf, "%x.%x\n", iwch_dev->rdev.rnic_info.pdev->vendor,
iwch_dev->rdev.rnic_info.pdev->device);
@@ -1348,6 +1337,8 @@ static const struct ib_device_ops iwch_dev_ops = {
.reg_user_mr = iwch_reg_user_mr,
.req_notify_cq = iwch_arm_cq,
.resize_cq = iwch_resize_cq,
+ INIT_RDMA_OBJ_SIZE(ib_pd, iwch_pd, ibpd),
+ INIT_RDMA_OBJ_SIZE(ib_ucontext, iwch_ucontext, ibucontext),
};
int iwch_register_device(struct iwch_dev *dev)
@@ -1391,7 +1382,7 @@ int iwch_register_device(struct iwch_dev *dev)
dev->ibdev.dev.parent = &dev->rdev.rnic_info.pdev->dev;
dev->ibdev.uverbs_abi_ver = IWCH_UVERBS_ABI_VERSION;
- dev->ibdev.iwcm = kmalloc(sizeof(struct iw_cm_verbs), GFP_KERNEL);
+ dev->ibdev.iwcm = kzalloc(sizeof(struct iw_cm_verbs), GFP_KERNEL);
if (!dev->ibdev.iwcm)
return -ENOMEM;
@@ -1409,7 +1400,7 @@ int iwch_register_device(struct iwch_dev *dev)
dev->ibdev.driver_id = RDMA_DRIVER_CXGB3;
rdma_set_device_sysfs_group(&dev->ibdev, &iwch_attr_group);
ib_set_device_ops(&dev->ibdev, &iwch_dev_ops);
- ret = ib_register_device(&dev->ibdev, "cxgb3_%d", NULL);
+ ret = ib_register_device(&dev->ibdev, "cxgb3_%d");
if (ret)
kfree(dev->ibdev.iwcm);
return ret;
diff --git a/drivers/infiniband/hw/cxgb4/Makefile b/drivers/infiniband/hw/cxgb4/Makefile
index 9edd92023e18..31a87d90a40b 100644
--- a/drivers/infiniband/hw/cxgb4/Makefile
+++ b/drivers/infiniband/hw/cxgb4/Makefile
@@ -1,5 +1,5 @@
-ccflags-y := -Idrivers/net/ethernet/chelsio/cxgb4
-ccflags-y += -Idrivers/net/ethernet/chelsio/libcxgb
+ccflags-y := -I $(srctree)/drivers/net/ethernet/chelsio/cxgb4
+ccflags-y += -I $(srctree)/drivers/net/ethernet/chelsio/libcxgb
obj-$(CONFIG_INFINIBAND_CXGB4) += iw_cxgb4.o
diff --git a/drivers/infiniband/hw/cxgb4/cm.c b/drivers/infiniband/hw/cxgb4/cm.c
index 8221813219e5..4d232bdf9e97 100644
--- a/drivers/infiniband/hw/cxgb4/cm.c
+++ b/drivers/infiniband/hw/cxgb4/cm.c
@@ -655,7 +655,33 @@ static int send_halfclose(struct c4iw_ep *ep)
return c4iw_l2t_send(&ep->com.dev->rdev, skb, ep->l2t);
}
-static int send_abort(struct c4iw_ep *ep)
+static void read_tcb(struct c4iw_ep *ep)
+{
+ struct sk_buff *skb;
+ struct cpl_get_tcb *req;
+ int wrlen = roundup(sizeof(*req), 16);
+
+ skb = get_skb(NULL, sizeof(*req), GFP_KERNEL);
+ if (WARN_ON(!skb))
+ return;
+
+ set_wr_txq(skb, CPL_PRIORITY_CONTROL, ep->ctrlq_idx);
+ req = (struct cpl_get_tcb *) skb_put(skb, wrlen);
+ memset(req, 0, wrlen);
+ INIT_TP_WR(req, ep->hwtid);
+ OPCODE_TID(req) = cpu_to_be32(MK_OPCODE_TID(CPL_GET_TCB, ep->hwtid));
+ req->reply_ctrl = htons(REPLY_CHAN_V(0) | QUEUENO_V(ep->rss_qid));
+
+ /*
+ * keep a ref on the ep so the tcb is not unlocked before this
+ * cpl completes. The ref is released in read_tcb_rpl().
+ */
+ c4iw_get_ep(&ep->com);
+ if (WARN_ON(c4iw_ofld_send(&ep->com.dev->rdev, skb)))
+ c4iw_put_ep(&ep->com);
+}
+
+static int send_abort_req(struct c4iw_ep *ep)
{
u32 wrlen = roundup(sizeof(struct cpl_abort_req), 16);
struct sk_buff *req_skb = skb_dequeue(&ep->com.ep_skb_list);
@@ -670,6 +696,17 @@ static int send_abort(struct c4iw_ep *ep)
return c4iw_l2t_send(&ep->com.dev->rdev, req_skb, ep->l2t);
}
+static int send_abort(struct c4iw_ep *ep)
+{
+ if (!ep->com.qp || !ep->com.qp->srq) {
+ send_abort_req(ep);
+ return 0;
+ }
+ set_bit(ABORT_REQ_IN_PROGRESS, &ep->com.flags);
+ read_tcb(ep);
+ return 0;
+}
+
static int send_connect(struct c4iw_ep *ep)
{
struct cpl_act_open_req *req = NULL;
@@ -1851,14 +1888,11 @@ static int rx_data(struct c4iw_dev *dev, struct sk_buff *skb)
return 0;
}
-static void complete_cached_srq_buffers(struct c4iw_ep *ep,
- __be32 srqidx_status)
+static void complete_cached_srq_buffers(struct c4iw_ep *ep, u32 srqidx)
{
enum chip_type adapter_type;
- u32 srqidx;
adapter_type = ep->com.dev->rdev.lldi.adapter_type;
- srqidx = ABORT_RSS_SRQIDX_G(be32_to_cpu(srqidx_status));
/*
* If this TCB had a srq buffer cached, then we must complete
@@ -1876,6 +1910,7 @@ static void complete_cached_srq_buffers(struct c4iw_ep *ep,
static int abort_rpl(struct c4iw_dev *dev, struct sk_buff *skb)
{
+ u32 srqidx;
struct c4iw_ep *ep;
struct cpl_abort_rpl_rss6 *rpl = cplhdr(skb);
int release = 0;
@@ -1887,7 +1922,10 @@ static int abort_rpl(struct c4iw_dev *dev, struct sk_buff *skb)
return 0;
}
- complete_cached_srq_buffers(ep, rpl->srqidx_status);
+ if (ep->com.qp && ep->com.qp->srq) {
+ srqidx = ABORT_RSS_SRQIDX_G(be32_to_cpu(rpl->srqidx_status));
+ complete_cached_srq_buffers(ep, srqidx ? srqidx : ep->srqe_idx);
+ }
pr_debug("ep %p tid %u\n", ep, ep->hwtid);
mutex_lock(&ep->com.mutex);
@@ -1903,8 +1941,10 @@ static int abort_rpl(struct c4iw_dev *dev, struct sk_buff *skb)
}
mutex_unlock(&ep->com.mutex);
- if (release)
+ if (release) {
+ close_complete_upcall(ep, -ECONNRESET);
release_ep_resources(ep);
+ }
c4iw_put_ep(&ep->com);
return 0;
}
@@ -2072,7 +2112,7 @@ static int import_ep(struct c4iw_ep *ep, int iptype, __u8 *peer_ip,
} else {
pdev = get_real_dev(n->dev);
ep->l2t = cxgb4_l2t_get(cdev->rdev.lldi.l2t,
- n, pdev, 0);
+ n, pdev, rt_tos2priority(tos));
if (!ep->l2t)
goto out;
ep->mtu = dst_mtu(dst);
@@ -2161,7 +2201,8 @@ static int c4iw_reconnect(struct c4iw_ep *ep)
laddr6->sin6_addr.s6_addr,
raddr6->sin6_addr.s6_addr,
laddr6->sin6_port,
- raddr6->sin6_port, 0,
+ raddr6->sin6_port,
+ ep->com.cm_id->tos,
raddr6->sin6_scope_id);
iptype = 6;
ra = (__u8 *)&raddr6->sin6_addr;
@@ -2476,7 +2517,7 @@ static int pass_accept_req(struct c4iw_dev *dev, struct sk_buff *skb)
u16 peer_mss = ntohs(req->tcpopt.mss);
int iptype;
unsigned short hdrs;
- u8 tos = PASS_OPEN_TOS_G(ntohl(req->tos_stid));
+ u8 tos;
parent_ep = (struct c4iw_ep *)get_ep_from_stid(dev, stid);
if (!parent_ep) {
@@ -2490,6 +2531,11 @@ static int pass_accept_req(struct c4iw_dev *dev, struct sk_buff *skb)
goto reject;
}
+ if (parent_ep->com.cm_id->tos_set)
+ tos = parent_ep->com.cm_id->tos;
+ else
+ tos = PASS_OPEN_TOS_G(ntohl(req->tos_stid));
+
cxgb_get_4tuple(req, parent_ep->com.dev->rdev.lldi.adapter_type,
&iptype, local_ip, peer_ip, &local_port, &peer_port);
@@ -2509,7 +2555,7 @@ static int pass_accept_req(struct c4iw_dev *dev, struct sk_buff *skb)
ntohs(peer_port), peer_mss);
dst = cxgb_find_route6(&dev->rdev.lldi, get_real_dev,
local_ip, peer_ip, local_port, peer_port,
- PASS_OPEN_TOS_G(ntohl(req->tos_stid)),
+ tos,
((struct sockaddr_in6 *)
&parent_ep->com.local_addr)->sin6_scope_id);
}
@@ -2740,6 +2786,21 @@ static int peer_close(struct c4iw_dev *dev, struct sk_buff *skb)
return 0;
}
+static void finish_peer_abort(struct c4iw_dev *dev, struct c4iw_ep *ep)
+{
+ complete_cached_srq_buffers(ep, ep->srqe_idx);
+ if (ep->com.cm_id && ep->com.qp) {
+ struct c4iw_qp_attributes attrs;
+
+ attrs.next_state = C4IW_QP_STATE_ERROR;
+ c4iw_modify_qp(ep->com.qp->rhp, ep->com.qp,
+ C4IW_QP_ATTR_NEXT_STATE, &attrs, 1);
+ }
+ peer_abort_upcall(ep);
+ release_ep_resources(ep);
+ c4iw_put_ep(&ep->com);
+}
+
static int peer_abort(struct c4iw_dev *dev, struct sk_buff *skb)
{
struct cpl_abort_req_rss6 *req = cplhdr(skb);
@@ -2750,6 +2811,7 @@ static int peer_abort(struct c4iw_dev *dev, struct sk_buff *skb)
int release = 0;
unsigned int tid = GET_TID(req);
u8 status;
+ u32 srqidx;
u32 len = roundup(sizeof(struct cpl_abort_rpl), 16);
@@ -2769,8 +2831,6 @@ static int peer_abort(struct c4iw_dev *dev, struct sk_buff *skb)
goto deref_ep;
}
- complete_cached_srq_buffers(ep, req->srqidx_status);
-
pr_debug("ep %p tid %u state %u\n", ep, ep->hwtid,
ep->com.state);
set_bit(PEER_ABORT, &ep->com.history);
@@ -2819,6 +2879,23 @@ static int peer_abort(struct c4iw_dev *dev, struct sk_buff *skb)
stop_ep_timer(ep);
/*FALLTHROUGH*/
case FPDU_MODE:
+ if (ep->com.qp && ep->com.qp->srq) {
+ srqidx = ABORT_RSS_SRQIDX_G(
+ be32_to_cpu(req->srqidx_status));
+ if (srqidx) {
+ complete_cached_srq_buffers(ep,
+ req->srqidx_status);
+ } else {
+ /* Hold ep ref until finish_peer_abort() */
+ c4iw_get_ep(&ep->com);
+ __state_set(&ep->com, ABORTING);
+ set_bit(PEER_ABORT_IN_PROGRESS, &ep->com.flags);
+ read_tcb(ep);
+ break;
+
+ }
+ }
+
if (ep->com.cm_id && ep->com.qp) {
attrs.next_state = C4IW_QP_STATE_ERROR;
ret = c4iw_modify_qp(ep->com.qp->rhp,
@@ -2942,15 +3019,18 @@ static int terminate(struct c4iw_dev *dev, struct sk_buff *skb)
ep = get_ep_from_tid(dev, tid);
- if (ep && ep->com.qp) {
- pr_warn("TERM received tid %u qpid %u\n",
- tid, ep->com.qp->wq.sq.qid);
- attrs.next_state = C4IW_QP_STATE_TERMINATE;
- c4iw_modify_qp(ep->com.qp->rhp, ep->com.qp,
- C4IW_QP_ATTR_NEXT_STATE, &attrs, 1);
+ if (ep) {
+ if (ep->com.qp) {
+ pr_warn("TERM received tid %u qpid %u\n", tid,
+ ep->com.qp->wq.sq.qid);
+ attrs.next_state = C4IW_QP_STATE_TERMINATE;
+ c4iw_modify_qp(ep->com.qp->rhp, ep->com.qp,
+ C4IW_QP_ATTR_NEXT_STATE, &attrs, 1);
+ }
+
+ c4iw_put_ep(&ep->com);
} else
pr_warn("TERM received tid %u no ep/qp\n", tid);
- c4iw_put_ep(&ep->com);
return 0;
}
@@ -3318,7 +3398,7 @@ int c4iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)
laddr6->sin6_addr.s6_addr,
raddr6->sin6_addr.s6_addr,
laddr6->sin6_port,
- raddr6->sin6_port, 0,
+ raddr6->sin6_port, cm_id->tos,
raddr6->sin6_scope_id);
}
if (!ep->dst) {
@@ -3606,7 +3686,6 @@ int c4iw_ep_disconnect(struct c4iw_ep *ep, int abrupt, gfp_t gfp)
if (close) {
if (abrupt) {
set_bit(EP_DISC_ABORT, &ep->com.history);
- close_complete_upcall(ep, -ECONNRESET);
ret = send_abort(ep);
} else {
set_bit(EP_DISC_CLOSE, &ep->com.history);
@@ -3717,6 +3796,80 @@ static void passive_ofld_conn_reply(struct c4iw_dev *dev, struct sk_buff *skb,
return;
}
+static inline u64 t4_tcb_get_field64(__be64 *tcb, u16 word)
+{
+ u64 tlo = be64_to_cpu(tcb[((31 - word) / 2)]);
+ u64 thi = be64_to_cpu(tcb[((31 - word) / 2) - 1]);
+ u64 t;
+ u32 shift = 32;
+
+ t = (thi << shift) | (tlo >> shift);
+
+ return t;
+}
+
+static inline u32 t4_tcb_get_field32(__be64 *tcb, u16 word, u32 mask, u32 shift)
+{
+ u32 v;
+ u64 t = be64_to_cpu(tcb[(31 - word) / 2]);
+
+ if (word & 0x1)
+ shift += 32;
+ v = (t >> shift) & mask;
+ return v;
+}
+
+static int read_tcb_rpl(struct c4iw_dev *dev, struct sk_buff *skb)
+{
+ struct cpl_get_tcb_rpl *rpl = cplhdr(skb);
+ __be64 *tcb = (__be64 *)(rpl + 1);
+ unsigned int tid = GET_TID(rpl);
+ struct c4iw_ep *ep;
+ u64 t_flags_64;
+ u32 rx_pdu_out;
+
+ ep = get_ep_from_tid(dev, tid);
+ if (!ep)
+ return 0;
+ /* Examine the TF_RX_PDU_OUT (bit 49 of the t_flags) in order to
+ * determine if there's a rx PDU feedback event pending.
+ *
+ * If that bit is set, it means we'll need to re-read the TCB's
+ * rq_start value. The final value is the one present in a TCB
+ * with the TF_RX_PDU_OUT bit cleared.
+ */
+
+ t_flags_64 = t4_tcb_get_field64(tcb, TCB_T_FLAGS_W);
+ rx_pdu_out = (t_flags_64 & TF_RX_PDU_OUT_V(1)) >> TF_RX_PDU_OUT_S;
+
+ c4iw_put_ep(&ep->com); /* from get_ep_from_tid() */
+ c4iw_put_ep(&ep->com); /* from read_tcb() */
+
+ /* If TF_RX_PDU_OUT bit is set, re-read the TCB */
+ if (rx_pdu_out) {
+ if (++ep->rx_pdu_out_cnt >= 2) {
+ WARN_ONCE(1, "tcb re-read() reached the guard limit, finishing the cleanup\n");
+ goto cleanup;
+ }
+ read_tcb(ep);
+ return 0;
+ }
+
+ ep->srqe_idx = t4_tcb_get_field32(tcb, TCB_RQ_START_W, TCB_RQ_START_W,
+ TCB_RQ_START_S);
+cleanup:
+ pr_debug("ep %p tid %u %016x\n", ep, ep->hwtid, ep->srqe_idx);
+
+ if (test_bit(PEER_ABORT_IN_PROGRESS, &ep->com.flags))
+ finish_peer_abort(dev, ep);
+ else if (test_bit(ABORT_REQ_IN_PROGRESS, &ep->com.flags))
+ send_abort_req(ep);
+ else
+ WARN_ONCE(1, "unexpected state!");
+
+ return 0;
+}
+
static int deferred_fw6_msg(struct c4iw_dev *dev, struct sk_buff *skb)
{
struct cpl_fw6_msg *rpl = cplhdr(skb);
@@ -4037,6 +4190,7 @@ static c4iw_handler_func work_handlers[NUM_CPL_CMDS + NUM_FAKE_CPLS] = {
[CPL_CLOSE_CON_RPL] = close_con_rpl,
[CPL_RDMA_TERMINATE] = terminate,
[CPL_FW4_ACK] = fw4_ack,
+ [CPL_GET_TCB_RPL] = read_tcb_rpl,
[CPL_FW6_MSG] = deferred_fw6_msg,
[CPL_RX_PKT] = rx_pkt,
[FAKE_CPL_PUT_EP_SAFE] = _put_ep_safe,
@@ -4268,6 +4422,7 @@ c4iw_handler_func c4iw_handlers[NUM_CPL_CMDS] = {
[CPL_RDMA_TERMINATE] = sched,
[CPL_FW4_ACK] = sched,
[CPL_SET_TCB_RPL] = set_tcb_rpl,
+ [CPL_GET_TCB_RPL] = sched,
[CPL_FW6_MSG] = fw6_msg,
[CPL_RX_PKT] = sched
};
diff --git a/drivers/infiniband/hw/cxgb4/device.c b/drivers/infiniband/hw/cxgb4/device.c
index d499cd61c0e8..c79cf63fb0bb 100644
--- a/drivers/infiniband/hw/cxgb4/device.c
+++ b/drivers/infiniband/hw/cxgb4/device.c
@@ -720,11 +720,8 @@ static const struct file_operations ep_debugfs_fops = {
.read = debugfs_read,
};
-static int setup_debugfs(struct c4iw_dev *devp)
+static void setup_debugfs(struct c4iw_dev *devp)
{
- if (!devp->debugfs_root)
- return -1;
-
debugfs_create_file_size("qps", S_IWUSR, devp->debugfs_root,
(void *)devp, &qp_debugfs_fops, 4096);
@@ -740,7 +737,6 @@ static int setup_debugfs(struct c4iw_dev *devp)
if (c4iw_wr_log)
debugfs_create_file_size("wr_log", S_IWUSR, devp->debugfs_root,
(void *)devp, &wr_log_debugfs_fops, 4096);
- return 0;
}
void c4iw_release_dev_ucontext(struct c4iw_rdev *rdev,
@@ -981,7 +977,7 @@ static struct c4iw_dev *c4iw_alloc(const struct cxgb4_lld_info *infop)
pr_info("%s: On-Chip Queues not supported on this device\n",
pci_name(infop->pdev));
- devp = (struct c4iw_dev *)ib_alloc_device(sizeof(*devp));
+ devp = ib_alloc_device(c4iw_dev, ibdev);
if (!devp) {
pr_err("Cannot allocate ib device\n");
return ERR_PTR(-ENOMEM);
@@ -1564,8 +1560,6 @@ static int __init c4iw_init_module(void)
return err;
c4iw_debugfs_root = debugfs_create_dir(DRV_NAME, NULL);
- if (!c4iw_debugfs_root)
- pr_warn("could not create debugfs entry, continuing\n");
reg_workq = create_singlethread_workqueue("Register_iWARP_device");
if (!reg_workq) {
diff --git a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h
index f0fceadd0d12..5a5da41faef6 100644
--- a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h
+++ b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h
@@ -589,7 +589,6 @@ struct c4iw_ucontext {
u32 key;
spinlock_t mmap_lock;
struct list_head mmaps;
- struct kref kref;
bool is_32b_cqe;
};
@@ -598,18 +597,6 @@ static inline struct c4iw_ucontext *to_c4iw_ucontext(struct ib_ucontext *c)
return container_of(c, struct c4iw_ucontext, ibucontext);
}
-void _c4iw_free_ucontext(struct kref *kref);
-
-static inline void c4iw_put_ucontext(struct c4iw_ucontext *ucontext)
-{
- kref_put(&ucontext->kref, _c4iw_free_ucontext);
-}
-
-static inline void c4iw_get_ucontext(struct c4iw_ucontext *ucontext)
-{
- kref_get(&ucontext->kref);
-}
-
struct c4iw_mm_entry {
struct list_head entry;
u64 addr;
@@ -982,6 +969,9 @@ struct c4iw_ep {
int rcv_win;
u32 snd_wscale;
struct c4iw_ep_stats stats;
+ u32 srqe_idx;
+ u32 rx_pdu_out_cnt;
+ struct sk_buff *peer_abort_skb;
};
static inline struct c4iw_ep *to_ep(struct iw_cm_id *cm_id)
diff --git a/drivers/infiniband/hw/cxgb4/mem.c b/drivers/infiniband/hw/cxgb4/mem.c
index 7b76e6f81aeb..5baa31ab6366 100644
--- a/drivers/infiniband/hw/cxgb4/mem.c
+++ b/drivers/infiniband/hw/cxgb4/mem.c
@@ -502,10 +502,9 @@ struct ib_mr *c4iw_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
u64 virt, int acc, struct ib_udata *udata)
{
__be64 *pages;
- int shift, n, len;
- int i, k, entry;
+ int shift, n, i;
int err = -ENOMEM;
- struct scatterlist *sg;
+ struct sg_dma_page_iter sg_iter;
struct c4iw_dev *rhp;
struct c4iw_pd *php;
struct c4iw_mr *mhp;
@@ -537,11 +536,11 @@ struct ib_mr *c4iw_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
mhp->rhp = rhp;
- mhp->umem = ib_umem_get(pd->uobject->context, start, length, acc, 0);
+ mhp->umem = ib_umem_get(udata, start, length, acc, 0);
if (IS_ERR(mhp->umem))
goto err_free_skb;
- shift = mhp->umem->page_shift;
+ shift = PAGE_SHIFT;
n = mhp->umem->nmap;
err = alloc_pbl(mhp, n);
@@ -556,21 +555,16 @@ struct ib_mr *c4iw_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
i = n = 0;
- for_each_sg(mhp->umem->sg_head.sgl, sg, mhp->umem->nmap, entry) {
- len = sg_dma_len(sg) >> shift;
- for (k = 0; k < len; ++k) {
- pages[i++] = cpu_to_be64(sg_dma_address(sg) +
- (k << shift));
- if (i == PAGE_SIZE / sizeof *pages) {
- err = write_pbl(&mhp->rhp->rdev,
- pages,
- mhp->attr.pbl_addr + (n << 3), i,
- mhp->wr_waitp);
- if (err)
- goto pbl_done;
- n += i;
- i = 0;
- }
+ for_each_sg_dma_page(mhp->umem->sg_head.sgl, &sg_iter, mhp->umem->nmap, 0) {
+ pages[i++] = cpu_to_be64(sg_page_iter_dma_address(&sg_iter));
+ if (i == PAGE_SIZE / sizeof(*pages)) {
+ err = write_pbl(&mhp->rhp->rdev, pages,
+ mhp->attr.pbl_addr + (n << 3), i,
+ mhp->wr_waitp);
+ if (err)
+ goto pbl_done;
+ n += i;
+ i = 0;
}
}
@@ -684,8 +678,8 @@ int c4iw_dealloc_mw(struct ib_mw *mw)
mhp->wr_waitp);
kfree_skb(mhp->dereg_skb);
c4iw_put_wr_wait(mhp->wr_waitp);
- kfree(mhp);
pr_debug("ib_mw %p mmid 0x%x ptr %p\n", mw, mmid, mhp);
+ kfree(mhp);
return 0;
}
diff --git a/drivers/infiniband/hw/cxgb4/provider.c b/drivers/infiniband/hw/cxgb4/provider.c
index 586b0c37481f..507c54572cc9 100644
--- a/drivers/infiniband/hw/cxgb4/provider.c
+++ b/drivers/infiniband/hw/cxgb4/provider.c
@@ -58,51 +58,34 @@ static int fastreg_support = 1;
module_param(fastreg_support, int, 0644);
MODULE_PARM_DESC(fastreg_support, "Advertise fastreg support (default=1)");
-void _c4iw_free_ucontext(struct kref *kref)
+static void c4iw_dealloc_ucontext(struct ib_ucontext *context)
{
- struct c4iw_ucontext *ucontext;
+ struct c4iw_ucontext *ucontext = to_c4iw_ucontext(context);
struct c4iw_dev *rhp;
struct c4iw_mm_entry *mm, *tmp;
- ucontext = container_of(kref, struct c4iw_ucontext, kref);
+ pr_debug("context %p\n", context);
rhp = to_c4iw_dev(ucontext->ibucontext.device);
- pr_debug("ucontext %p\n", ucontext);
list_for_each_entry_safe(mm, tmp, &ucontext->mmaps, entry)
kfree(mm);
c4iw_release_dev_ucontext(&rhp->rdev, &ucontext->uctx);
- kfree(ucontext);
}
-static int c4iw_dealloc_ucontext(struct ib_ucontext *context)
+static int c4iw_alloc_ucontext(struct ib_ucontext *ucontext,
+ struct ib_udata *udata)
{
- struct c4iw_ucontext *ucontext = to_c4iw_ucontext(context);
-
- pr_debug("context %p\n", context);
- c4iw_put_ucontext(ucontext);
- return 0;
-}
-
-static struct ib_ucontext *c4iw_alloc_ucontext(struct ib_device *ibdev,
- struct ib_udata *udata)
-{
- struct c4iw_ucontext *context;
+ struct ib_device *ibdev = ucontext->device;
+ struct c4iw_ucontext *context = to_c4iw_ucontext(ucontext);
struct c4iw_dev *rhp = to_c4iw_dev(ibdev);
struct c4iw_alloc_ucontext_resp uresp;
int ret = 0;
struct c4iw_mm_entry *mm = NULL;
pr_debug("ibdev %p\n", ibdev);
- context = kzalloc(sizeof(*context), GFP_KERNEL);
- if (!context) {
- ret = -ENOMEM;
- goto err;
- }
-
c4iw_init_dev_ucontext(&rhp->rdev, &context->uctx);
INIT_LIST_HEAD(&context->mmaps);
spin_lock_init(&context->mmap_lock);
- kref_init(&context->kref);
if (udata->outlen < sizeof(uresp) - sizeof(uresp.reserved)) {
pr_err_once("Warning - downlevel libcxgb4 (non-fatal), device status page disabled\n");
@@ -111,7 +94,7 @@ static struct ib_ucontext *c4iw_alloc_ucontext(struct ib_device *ibdev,
mm = kmalloc(sizeof(*mm), GFP_KERNEL);
if (!mm) {
ret = -ENOMEM;
- goto err_free;
+ goto err;
}
uresp.status_page_size = PAGE_SIZE;
@@ -131,13 +114,11 @@ static struct ib_ucontext *c4iw_alloc_ucontext(struct ib_device *ibdev,
mm->len = PAGE_SIZE;
insert_mmap(context, mm);
}
- return &context->ibucontext;
+ return 0;
err_mm:
kfree(mm);
-err_free:
- kfree(context);
err:
- return ERR_PTR(ret);
+ return ret;
}
static int c4iw_mmap(struct ib_ucontext *context, struct vm_area_struct *vma)
@@ -209,7 +190,7 @@ static int c4iw_mmap(struct ib_ucontext *context, struct vm_area_struct *vma)
return ret;
}
-static int c4iw_deallocate_pd(struct ib_pd *pd)
+static void c4iw_deallocate_pd(struct ib_pd *pd)
{
struct c4iw_dev *rhp;
struct c4iw_pd *php;
@@ -221,15 +202,13 @@ static int c4iw_deallocate_pd(struct ib_pd *pd)
mutex_lock(&rhp->rdev.stats.lock);
rhp->rdev.stats.pd.cur--;
mutex_unlock(&rhp->rdev.stats.lock);
- kfree(php);
- return 0;
}
-static struct ib_pd *c4iw_allocate_pd(struct ib_device *ibdev,
- struct ib_ucontext *context,
- struct ib_udata *udata)
+static int c4iw_allocate_pd(struct ib_pd *pd, struct ib_ucontext *context,
+ struct ib_udata *udata)
{
- struct c4iw_pd *php;
+ struct c4iw_pd *php = to_c4iw_pd(pd);
+ struct ib_device *ibdev = pd->device;
u32 pdid;
struct c4iw_dev *rhp;
@@ -237,12 +216,8 @@ static struct ib_pd *c4iw_allocate_pd(struct ib_device *ibdev,
rhp = (struct c4iw_dev *) ibdev;
pdid = c4iw_get_resource(&rhp->rdev.resource.pdid_table);
if (!pdid)
- return ERR_PTR(-EINVAL);
- php = kzalloc(sizeof(*php), GFP_KERNEL);
- if (!php) {
- c4iw_put_resource(&rhp->rdev.resource.pdid_table, pdid);
- return ERR_PTR(-ENOMEM);
- }
+ return -EINVAL;
+
php->pdid = pdid;
php->rhp = rhp;
if (context) {
@@ -250,7 +225,7 @@ static struct ib_pd *c4iw_allocate_pd(struct ib_device *ibdev,
if (ib_copy_to_udata(udata, &uresp, sizeof(uresp))) {
c4iw_deallocate_pd(&php->ibpd);
- return ERR_PTR(-EFAULT);
+ return -EFAULT;
}
}
mutex_lock(&rhp->rdev.stats.lock);
@@ -259,7 +234,7 @@ static struct ib_pd *c4iw_allocate_pd(struct ib_device *ibdev,
rhp->rdev.stats.pd.max = rhp->rdev.stats.pd.cur;
mutex_unlock(&rhp->rdev.stats.lock);
pr_debug("pdid 0x%0x ptr 0x%p\n", pdid, php);
- return &php->ibpd;
+ return 0;
}
static int c4iw_query_pkey(struct ib_device *ibdev, u8 port, u16 index,
@@ -376,8 +351,9 @@ static int c4iw_query_port(struct ib_device *ibdev, u8 port,
static ssize_t hw_rev_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
- struct c4iw_dev *c4iw_dev = container_of(dev, struct c4iw_dev,
- ibdev.dev);
+ struct c4iw_dev *c4iw_dev =
+ rdma_device_to_drv_device(dev, struct c4iw_dev, ibdev);
+
pr_debug("dev 0x%p\n", dev);
return sprintf(buf, "%d\n",
CHELSIO_CHIP_RELEASE(c4iw_dev->rdev.lldi.adapter_type));
@@ -387,8 +363,8 @@ static DEVICE_ATTR_RO(hw_rev);
static ssize_t hca_type_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
- struct c4iw_dev *c4iw_dev = container_of(dev, struct c4iw_dev,
- ibdev.dev);
+ struct c4iw_dev *c4iw_dev =
+ rdma_device_to_drv_device(dev, struct c4iw_dev, ibdev);
struct ethtool_drvinfo info;
struct net_device *lldev = c4iw_dev->rdev.lldi.ports[0];
@@ -401,8 +377,9 @@ static DEVICE_ATTR_RO(hca_type);
static ssize_t board_id_show(struct device *dev, struct device_attribute *attr,
char *buf)
{
- struct c4iw_dev *c4iw_dev = container_of(dev, struct c4iw_dev,
- ibdev.dev);
+ struct c4iw_dev *c4iw_dev =
+ rdma_device_to_drv_device(dev, struct c4iw_dev, ibdev);
+
pr_debug("dev 0x%p\n", dev);
return sprintf(buf, "%x.%x\n", c4iw_dev->rdev.lldi.pdev->vendor,
c4iw_dev->rdev.lldi.pdev->device);
@@ -547,6 +524,7 @@ static const struct ib_device_ops c4iw_dev_ops = {
.destroy_cq = c4iw_destroy_cq,
.destroy_qp = c4iw_destroy_qp,
.destroy_srq = c4iw_destroy_srq,
+ .fill_res_entry = fill_res_entry,
.get_dev_fw_str = get_dev_fw_str,
.get_dma_mr = c4iw_get_dma_mr,
.get_hw_stats = c4iw_get_mib,
@@ -567,6 +545,8 @@ static const struct ib_device_ops c4iw_dev_ops = {
.query_qp = c4iw_ib_query_qp,
.reg_user_mr = c4iw_reg_user_mr,
.req_notify_cq = c4iw_arm_cq,
+ INIT_RDMA_OBJ_SIZE(ib_pd, c4iw_pd, ibpd),
+ INIT_RDMA_OBJ_SIZE(ib_ucontext, c4iw_ucontext, ibucontext),
};
void c4iw_register_device(struct work_struct *work)
@@ -613,7 +593,7 @@ void c4iw_register_device(struct work_struct *work)
dev->ibdev.dev.parent = &dev->rdev.lldi.pdev->dev;
dev->ibdev.uverbs_abi_ver = C4IW_UVERBS_ABI_VERSION;
- dev->ibdev.iwcm = kmalloc(sizeof(struct iw_cm_verbs), GFP_KERNEL);
+ dev->ibdev.iwcm = kzalloc(sizeof(struct iw_cm_verbs), GFP_KERNEL);
if (!dev->ibdev.iwcm) {
ret = -ENOMEM;
goto err_dealloc_ctx;
@@ -627,14 +607,13 @@ void c4iw_register_device(struct work_struct *work)
dev->ibdev.iwcm->add_ref = c4iw_qp_add_ref;
dev->ibdev.iwcm->rem_ref = c4iw_qp_rem_ref;
dev->ibdev.iwcm->get_qp = c4iw_get_qp;
- dev->ibdev.res.fill_res_entry = fill_res_entry;
memcpy(dev->ibdev.iwcm->ifname, dev->rdev.lldi.ports[0]->name,
sizeof(dev->ibdev.iwcm->ifname));
rdma_set_device_sysfs_group(&dev->ibdev, &c4iw_attr_group);
dev->ibdev.driver_id = RDMA_DRIVER_CXGB4;
ib_set_device_ops(&dev->ibdev, &c4iw_dev_ops);
- ret = ib_register_device(&dev->ibdev, "cxgb4_%d", NULL);
+ ret = ib_register_device(&dev->ibdev, "cxgb4_%d");
if (ret)
goto err_kfree_iwcm;
return;
diff --git a/drivers/infiniband/hw/cxgb4/qp.c b/drivers/infiniband/hw/cxgb4/qp.c
index 504cf525508f..d3a82839f5ea 100644
--- a/drivers/infiniband/hw/cxgb4/qp.c
+++ b/drivers/infiniband/hw/cxgb4/qp.c
@@ -31,6 +31,7 @@
*/
#include <linux/module.h>
+#include <rdma/uverbs_ioctl.h>
#include "iw_cxgb4.h"
@@ -632,7 +633,10 @@ static void build_rdma_write_cmpl(struct t4_sq *sq,
wcwr->stag_sink = cpu_to_be32(rdma_wr(wr)->rkey);
wcwr->to_sink = cpu_to_be64(rdma_wr(wr)->remote_addr);
- wcwr->stag_inv = cpu_to_be32(wr->next->ex.invalidate_rkey);
+ if (wr->next->opcode == IB_WR_SEND)
+ wcwr->stag_inv = 0;
+ else
+ wcwr->stag_inv = cpu_to_be32(wr->next->ex.invalidate_rkey);
wcwr->r2 = 0;
wcwr->r3 = 0;
@@ -726,7 +730,10 @@ static void post_write_cmpl(struct c4iw_qp *qhp, const struct ib_send_wr *wr)
/* SEND_WITH_INV swsqe */
swsqe = &qhp->wq.sq.sw_sq[qhp->wq.sq.pidx];
- swsqe->opcode = FW_RI_SEND_WITH_INV;
+ if (wr->next->opcode == IB_WR_SEND)
+ swsqe->opcode = FW_RI_SEND;
+ else
+ swsqe->opcode = FW_RI_SEND_WITH_INV;
swsqe->idx = qhp->wq.sq.pidx;
swsqe->complete = 0;
swsqe->signaled = send_signaled;
@@ -897,8 +904,6 @@ static void free_qp_work(struct work_struct *work)
destroy_qp(&rhp->rdev, &qhp->wq,
ucontext ? &ucontext->uctx : &rhp->rdev.uctx, !qhp->srq);
- if (ucontext)
- c4iw_put_ucontext(ucontext);
c4iw_put_wr_wait(qhp->wr_waitp);
kfree(qhp);
}
@@ -1133,9 +1138,9 @@ int c4iw_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr,
/*
* Fastpath for NVMe-oF target WRITE + SEND_WITH_INV wr chain which is
* the response for small NVMEe-oF READ requests. If the chain is
- * exactly a WRITE->SEND_WITH_INV and the sgl depths and lengths
- * meet the requirements of the fw_ri_write_cmpl_wr work request,
- * then build and post the write_cmpl WR. If any of the tests
+ * exactly a WRITE->SEND_WITH_INV or a WRITE->SEND and the sgl depths
+ * and lengths meet the requirements of the fw_ri_write_cmpl_wr work
+ * request, then build and post the write_cmpl WR. If any of the tests
* below are not true, then we continue on with the tradtional WRITE
* and SEND WRs.
*/
@@ -1145,7 +1150,8 @@ int c4iw_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr,
wr && wr->next && !wr->next->next &&
wr->opcode == IB_WR_RDMA_WRITE &&
wr->sg_list[0].length && wr->num_sge <= T4_WRITE_CMPL_MAX_SGL &&
- wr->next->opcode == IB_WR_SEND_WITH_INV &&
+ (wr->next->opcode == IB_WR_SEND ||
+ wr->next->opcode == IB_WR_SEND_WITH_INV) &&
wr->next->sg_list[0].length == T4_WRITE_CMPL_MAX_CQE &&
wr->next->num_sge == 1 && num_wrs >= 2) {
post_write_cmpl(qhp, wr);
@@ -2129,7 +2135,8 @@ struct ib_qp *c4iw_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *attrs,
struct c4iw_cq *rchp;
struct c4iw_create_qp_resp uresp;
unsigned int sqsize, rqsize = 0;
- struct c4iw_ucontext *ucontext;
+ struct c4iw_ucontext *ucontext = rdma_udata_to_drv_context(
+ udata, struct c4iw_ucontext, ibucontext);
int ret;
struct c4iw_mm_entry *sq_key_mm, *rq_key_mm = NULL, *sq_db_key_mm;
struct c4iw_mm_entry *rq_db_key_mm = NULL, *ma_sync_key_mm = NULL;
@@ -2163,8 +2170,6 @@ struct ib_qp *c4iw_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *attrs,
if (sqsize < 8)
sqsize = 8;
- ucontext = udata ? to_c4iw_ucontext(pd->uobject->context) : NULL;
-
qhp = kzalloc(sizeof(*qhp), GFP_KERNEL);
if (!qhp)
return ERR_PTR(-ENOMEM);
@@ -2331,7 +2336,6 @@ struct ib_qp *c4iw_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *attrs,
insert_mmap(ucontext, ma_sync_key_mm);
}
- c4iw_get_ucontext(ucontext);
qhp->ucontext = ucontext;
}
if (!attrs->srq) {
@@ -2589,7 +2593,7 @@ static int alloc_srq_queue(struct c4iw_srq *srq, struct c4iw_dev_ucontext *uctx,
/* build fw_ri_res_wr */
wr_len = sizeof(*res_wr) + sizeof(*res);
- skb = alloc_skb(wr_len, GFP_KERNEL | __GFP_NOFAIL);
+ skb = alloc_skb(wr_len, GFP_KERNEL);
if (!skb)
goto err_free_queue;
set_wr_txq(skb, CPL_PRIORITY_CONTROL, 0);
@@ -2711,7 +2715,8 @@ struct ib_srq *c4iw_create_srq(struct ib_pd *pd, struct ib_srq_init_attr *attrs,
rqsize = attrs->attr.max_wr + 1;
rqsize = roundup_pow_of_two(max_t(u16, rqsize, 16));
- ucontext = udata ? to_c4iw_ucontext(pd->uobject->context) : NULL;
+ ucontext = rdma_udata_to_drv_context(udata, struct c4iw_ucontext,
+ ibucontext);
srq = kzalloc(sizeof(*srq), GFP_KERNEL);
if (!srq)
diff --git a/drivers/infiniband/hw/cxgb4/t4.h b/drivers/infiniband/hw/cxgb4/t4.h
index fff6d48d262f..b170817b2741 100644
--- a/drivers/infiniband/hw/cxgb4/t4.h
+++ b/drivers/infiniband/hw/cxgb4/t4.h
@@ -35,6 +35,7 @@
#include "t4_regs.h"
#include "t4_values.h"
#include "t4_msg.h"
+#include "t4_tcb.h"
#include "t4fw_ri_api.h"
#define T4_MAX_NUM_PD 65536
diff --git a/drivers/infiniband/hw/hfi1/Makefile b/drivers/infiniband/hw/hfi1/Makefile
index 3ce9dc8c3463..4044a8c8dbf4 100644
--- a/drivers/infiniband/hw/hfi1/Makefile
+++ b/drivers/infiniband/hw/hfi1/Makefile
@@ -24,6 +24,7 @@ hfi1-y := \
mad.o \
mmu_rb.o \
msix.o \
+ opfn.o \
pcie.o \
pio.o \
pio_copy.o \
diff --git a/drivers/infiniband/hw/hfi1/chip.c b/drivers/infiniband/hw/hfi1/chip.c
index b443642eac02..612f04190ed8 100644
--- a/drivers/infiniband/hw/hfi1/chip.c
+++ b/drivers/infiniband/hw/hfi1/chip.c
@@ -4253,6 +4253,8 @@ static struct cntr_entry dev_cntrs[DEV_CNTR_LAST] = {
access_sw_pio_drain),
[C_SW_KMEM_WAIT] = CNTR_ELEM("KmemWait", 0, 0, CNTR_NORMAL,
access_sw_kmem_wait),
+[C_SW_TID_WAIT] = CNTR_ELEM("TidWait", 0, 0, CNTR_NORMAL,
+ hfi1_access_sw_tid_wait),
[C_SW_SEND_SCHED] = CNTR_ELEM("SendSched", 0, 0, CNTR_NORMAL,
access_sw_send_schedule),
[C_SDMA_DESC_FETCHED_CNT] = CNTR_ELEM("SDEDscFdCn",
@@ -5222,6 +5224,17 @@ int is_bx(struct hfi1_devdata *dd)
return (chip_rev_minor & 0xF0) == 0x10;
}
+/* return true is kernel urg disabled for rcd */
+bool is_urg_masked(struct hfi1_ctxtdata *rcd)
+{
+ u64 mask;
+ u32 is = IS_RCVURGENT_START + rcd->ctxt;
+ u8 bit = is % 64;
+
+ mask = read_csr(rcd->dd, CCE_INT_MASK + (8 * (is / 64)));
+ return !(mask & BIT_ULL(bit));
+}
+
/*
* Append string s to buffer buf. Arguments curp and len are the current
* position and remaining length, respectively.
diff --git a/drivers/infiniband/hw/hfi1/chip.h b/drivers/infiniband/hw/hfi1/chip.h
index 6b9c8f12dff8..6c27c1c6a868 100644
--- a/drivers/infiniband/hw/hfi1/chip.h
+++ b/drivers/infiniband/hw/hfi1/chip.h
@@ -1,7 +1,7 @@
#ifndef _CHIP_H
#define _CHIP_H
/*
- * Copyright(c) 2015 - 2017 Intel Corporation.
+ * Copyright(c) 2015 - 2018 Intel Corporation.
*
* This file is provided under a dual BSD/GPLv2 license. When using or
* redistributing this file, you may do so under either license.
@@ -804,6 +804,7 @@ void clear_linkup_counters(struct hfi1_devdata *dd);
u32 hdrqempty(struct hfi1_ctxtdata *rcd);
int is_ax(struct hfi1_devdata *dd);
int is_bx(struct hfi1_devdata *dd);
+bool is_urg_masked(struct hfi1_ctxtdata *rcd);
u32 read_physical_state(struct hfi1_devdata *dd);
u32 chip_to_opa_pstate(struct hfi1_devdata *dd, u32 chip_pstate);
const char *opa_lstate_name(u32 lstate);
@@ -926,6 +927,7 @@ enum {
C_SW_PIO_WAIT,
C_SW_PIO_DRAIN,
C_SW_KMEM_WAIT,
+ C_SW_TID_WAIT,
C_SW_SEND_SCHED,
C_SDMA_DESC_FETCHED_CNT,
C_SDMA_INT_CNT,
diff --git a/drivers/infiniband/hw/hfi1/common.h b/drivers/infiniband/hw/hfi1/common.h
index 40d3cfb58bd1..7310a5dba420 100644
--- a/drivers/infiniband/hw/hfi1/common.h
+++ b/drivers/infiniband/hw/hfi1/common.h
@@ -340,6 +340,10 @@ struct diag_pkt {
#define HFI1_PSM_IOC_BASE_SEQ 0x0
+/* Number of BTH.PSN bits used for sequence number in expected rcvs */
+#define HFI1_KDETH_BTH_SEQ_SHIFT 11
+#define HFI1_KDETH_BTH_SEQ_MASK (BIT(HFI1_KDETH_BTH_SEQ_SHIFT) - 1)
+
static inline __u64 rhf_to_cpu(const __le32 *rbuf)
{
return __le64_to_cpu(*((__le64 *)rbuf));
diff --git a/drivers/infiniband/hw/hfi1/debugfs.c b/drivers/infiniband/hw/hfi1/debugfs.c
index 0a557795563c..427ba0ce74a5 100644
--- a/drivers/infiniband/hw/hfi1/debugfs.c
+++ b/drivers/infiniband/hw/hfi1/debugfs.c
@@ -1167,6 +1167,7 @@ void hfi1_dbg_ibdev_init(struct hfi1_ibdev *ibd)
char link[10];
struct hfi1_devdata *dd = dd_from_dev(ibd);
struct hfi1_pportdata *ppd;
+ struct dentry *root;
int unit = dd->unit;
int i, j;
@@ -1174,31 +1175,29 @@ void hfi1_dbg_ibdev_init(struct hfi1_ibdev *ibd)
return;
snprintf(name, sizeof(name), "%s_%d", class_name(), unit);
snprintf(link, sizeof(link), "%d", unit);
- ibd->hfi1_ibdev_dbg = debugfs_create_dir(name, hfi1_dbg_root);
- if (!ibd->hfi1_ibdev_dbg) {
- pr_warn("create of %s failed\n", name);
- return;
- }
+ root = debugfs_create_dir(name, hfi1_dbg_root);
+ ibd->hfi1_ibdev_dbg = root;
+
ibd->hfi1_ibdev_link =
debugfs_create_symlink(link, hfi1_dbg_root, name);
- if (!ibd->hfi1_ibdev_link) {
- pr_warn("create of %s symlink failed\n", name);
- return;
- }
- DEBUGFS_SEQ_FILE_CREATE(opcode_stats, ibd->hfi1_ibdev_dbg, ibd);
- DEBUGFS_SEQ_FILE_CREATE(tx_opcode_stats, ibd->hfi1_ibdev_dbg, ibd);
- DEBUGFS_SEQ_FILE_CREATE(ctx_stats, ibd->hfi1_ibdev_dbg, ibd);
- DEBUGFS_SEQ_FILE_CREATE(qp_stats, ibd->hfi1_ibdev_dbg, ibd);
- DEBUGFS_SEQ_FILE_CREATE(sdes, ibd->hfi1_ibdev_dbg, ibd);
- DEBUGFS_SEQ_FILE_CREATE(rcds, ibd->hfi1_ibdev_dbg, ibd);
- DEBUGFS_SEQ_FILE_CREATE(pios, ibd->hfi1_ibdev_dbg, ibd);
- DEBUGFS_SEQ_FILE_CREATE(sdma_cpu_list, ibd->hfi1_ibdev_dbg, ibd);
+
+ debugfs_create_file("opcode_stats", 0444, root, ibd,
+ &_opcode_stats_file_ops);
+ debugfs_create_file("tx_opcode_stats", 0444, root, ibd,
+ &_tx_opcode_stats_file_ops);
+ debugfs_create_file("ctx_stats", 0444, root, ibd, &_ctx_stats_file_ops);
+ debugfs_create_file("qp_stats", 0444, root, ibd, &_qp_stats_file_ops);
+ debugfs_create_file("sdes", 0444, root, ibd, &_sdes_file_ops);
+ debugfs_create_file("rcds", 0444, root, ibd, &_rcds_file_ops);
+ debugfs_create_file("pios", 0444, root, ibd, &_pios_file_ops);
+ debugfs_create_file("sdma_cpu_list", 0444, root, ibd,
+ &_sdma_cpu_list_file_ops);
+
/* dev counter files */
for (i = 0; i < ARRAY_SIZE(cntr_ops); i++)
- DEBUGFS_FILE_CREATE(cntr_ops[i].name,
- ibd->hfi1_ibdev_dbg,
- dd,
- &cntr_ops[i].ops, S_IRUGO);
+ debugfs_create_file(cntr_ops[i].name, 0444, root, dd,
+ &cntr_ops[i].ops);
+
/* per port files */
for (ppd = dd->pport, j = 0; j < dd->num_pports; j++, ppd++)
for (i = 0; i < ARRAY_SIZE(port_cntr_ops); i++) {
@@ -1206,12 +1205,11 @@ void hfi1_dbg_ibdev_init(struct hfi1_ibdev *ibd)
sizeof(name),
port_cntr_ops[i].name,
j + 1);
- DEBUGFS_FILE_CREATE(name,
- ibd->hfi1_ibdev_dbg,
- ppd,
- &port_cntr_ops[i].ops,
+ debugfs_create_file(name,
!port_cntr_ops[i].ops.write ?
- S_IRUGO : S_IRUGO | S_IWUSR);
+ S_IRUGO :
+ S_IRUGO | S_IWUSR,
+ root, ppd, &port_cntr_ops[i].ops);
}
hfi1_fault_init_debugfs(ibd);
@@ -1341,10 +1339,10 @@ DEBUGFS_FILE_OPS(driver_stats);
void hfi1_dbg_init(void)
{
hfi1_dbg_root = debugfs_create_dir(DRIVER_NAME, NULL);
- if (!hfi1_dbg_root)
- pr_warn("init of debugfs failed\n");
- DEBUGFS_SEQ_FILE_CREATE(driver_stats_names, hfi1_dbg_root, NULL);
- DEBUGFS_SEQ_FILE_CREATE(driver_stats, hfi1_dbg_root, NULL);
+ debugfs_create_file("driver_stats_names", 0444, hfi1_dbg_root, NULL,
+ &_driver_stats_names_file_ops);
+ debugfs_create_file("driver_stats", 0444, hfi1_dbg_root, NULL,
+ &_driver_stats_file_ops);
}
void hfi1_dbg_exit(void)
diff --git a/drivers/infiniband/hw/hfi1/debugfs.h b/drivers/infiniband/hw/hfi1/debugfs.h
index d5d824459fcc..57e582caa5eb 100644
--- a/drivers/infiniband/hw/hfi1/debugfs.h
+++ b/drivers/infiniband/hw/hfi1/debugfs.h
@@ -49,16 +49,6 @@
struct hfi1_ibdev;
-#define DEBUGFS_FILE_CREATE(name, parent, data, ops, mode) \
-do { \
- struct dentry *ent; \
- const char *__name = name; \
- ent = debugfs_create_file(__name, mode, parent, \
- data, ops); \
- if (!ent) \
- pr_warn("create of %s failed\n", __name); \
-} while (0)
-
#define DEBUGFS_SEQ_FILE_OPS(name) \
static const struct seq_operations _##name##_seq_ops = { \
.start = _##name##_seq_start, \
@@ -89,8 +79,6 @@ static const struct file_operations _##name##_file_ops = { \
.release = seq_release \
}
-#define DEBUGFS_SEQ_FILE_CREATE(name, parent, data) \
- DEBUGFS_FILE_CREATE(#name, parent, data, &_##name##_file_ops, 0444)
ssize_t hfi1_seq_read(struct file *file, char __user *buf, size_t size,
loff_t *ppos);
diff --git a/drivers/infiniband/hw/hfi1/driver.c b/drivers/infiniband/hw/hfi1/driver.c
index a8ad70730203..2a9d2912f5db 100644
--- a/drivers/infiniband/hw/hfi1/driver.c
+++ b/drivers/infiniband/hw/hfi1/driver.c
@@ -1575,25 +1575,32 @@ drop:
return -EINVAL;
}
-void handle_eflags(struct hfi1_packet *packet)
+static void show_eflags_errs(struct hfi1_packet *packet)
{
struct hfi1_ctxtdata *rcd = packet->rcd;
u32 rte = rhf_rcv_type_err(packet->rhf);
+ dd_dev_err(rcd->dd,
+ "receive context %d: rhf 0x%016llx, errs [ %s%s%s%s%s%s%s%s] rte 0x%x\n",
+ rcd->ctxt, packet->rhf,
+ packet->rhf & RHF_K_HDR_LEN_ERR ? "k_hdr_len " : "",
+ packet->rhf & RHF_DC_UNC_ERR ? "dc_unc " : "",
+ packet->rhf & RHF_DC_ERR ? "dc " : "",
+ packet->rhf & RHF_TID_ERR ? "tid " : "",
+ packet->rhf & RHF_LEN_ERR ? "len " : "",
+ packet->rhf & RHF_ECC_ERR ? "ecc " : "",
+ packet->rhf & RHF_VCRC_ERR ? "vcrc " : "",
+ packet->rhf & RHF_ICRC_ERR ? "icrc " : "",
+ rte);
+}
+
+void handle_eflags(struct hfi1_packet *packet)
+{
+ struct hfi1_ctxtdata *rcd = packet->rcd;
+
rcv_hdrerr(rcd, rcd->ppd, packet);
if (rhf_err_flags(packet->rhf))
- dd_dev_err(rcd->dd,
- "receive context %d: rhf 0x%016llx, errs [ %s%s%s%s%s%s%s%s] rte 0x%x\n",
- rcd->ctxt, packet->rhf,
- packet->rhf & RHF_K_HDR_LEN_ERR ? "k_hdr_len " : "",
- packet->rhf & RHF_DC_UNC_ERR ? "dc_unc " : "",
- packet->rhf & RHF_DC_ERR ? "dc " : "",
- packet->rhf & RHF_TID_ERR ? "tid " : "",
- packet->rhf & RHF_LEN_ERR ? "len " : "",
- packet->rhf & RHF_ECC_ERR ? "ecc " : "",
- packet->rhf & RHF_VCRC_ERR ? "vcrc " : "",
- packet->rhf & RHF_ICRC_ERR ? "icrc " : "",
- rte);
+ show_eflags_errs(packet);
}
/*
@@ -1699,11 +1706,14 @@ static int kdeth_process_expected(struct hfi1_packet *packet)
if (unlikely(hfi1_dbg_should_fault_rx(packet)))
return RHF_RCV_CONTINUE;
- if (unlikely(rhf_err_flags(packet->rhf)))
- handle_eflags(packet);
+ if (unlikely(rhf_err_flags(packet->rhf))) {
+ struct hfi1_ctxtdata *rcd = packet->rcd;
- dd_dev_err(packet->rcd->dd,
- "Unhandled expected packet received. Dropping.\n");
+ if (hfi1_handle_kdeth_eflags(rcd, rcd->ppd, packet))
+ return RHF_RCV_CONTINUE;
+ }
+
+ hfi1_kdeth_expected_rcv(packet);
return RHF_RCV_CONTINUE;
}
@@ -1712,11 +1722,17 @@ static int kdeth_process_eager(struct hfi1_packet *packet)
hfi1_setup_9B_packet(packet);
if (unlikely(hfi1_dbg_should_fault_rx(packet)))
return RHF_RCV_CONTINUE;
- if (unlikely(rhf_err_flags(packet->rhf)))
- handle_eflags(packet);
- dd_dev_err(packet->rcd->dd,
- "Unhandled eager packet received. Dropping.\n");
+ trace_hfi1_rcvhdr(packet);
+ if (unlikely(rhf_err_flags(packet->rhf))) {
+ struct hfi1_ctxtdata *rcd = packet->rcd;
+
+ show_eflags_errs(packet);
+ if (hfi1_handle_kdeth_eflags(rcd, rcd->ppd, packet))
+ return RHF_RCV_CONTINUE;
+ }
+
+ hfi1_kdeth_eager_rcv(packet);
return RHF_RCV_CONTINUE;
}
diff --git a/drivers/infiniband/hw/hfi1/fault.c b/drivers/infiniband/hw/hfi1/fault.c
index e2290f32c8d9..3fd3315d0fb0 100644
--- a/drivers/infiniband/hw/hfi1/fault.c
+++ b/drivers/infiniband/hw/hfi1/fault.c
@@ -250,6 +250,7 @@ void hfi1_fault_exit_debugfs(struct hfi1_ibdev *ibd)
int hfi1_fault_init_debugfs(struct hfi1_ibdev *ibd)
{
struct dentry *parent = ibd->hfi1_ibdev_dbg;
+ struct dentry *fault_dir;
ibd->fault = kzalloc(sizeof(*ibd->fault), GFP_KERNEL);
if (!ibd->fault)
@@ -269,45 +270,31 @@ int hfi1_fault_init_debugfs(struct hfi1_ibdev *ibd)
bitmap_zero(ibd->fault->opcodes,
sizeof(ibd->fault->opcodes) * BITS_PER_BYTE);
- ibd->fault->dir =
- fault_create_debugfs_attr("fault", parent,
- &ibd->fault->attr);
- if (IS_ERR(ibd->fault->dir)) {
+ fault_dir =
+ fault_create_debugfs_attr("fault", parent, &ibd->fault->attr);
+ if (IS_ERR(fault_dir)) {
kfree(ibd->fault);
ibd->fault = NULL;
return -ENOENT;
}
-
- DEBUGFS_SEQ_FILE_CREATE(fault_stats, ibd->fault->dir, ibd);
- if (!debugfs_create_bool("enable", 0600, ibd->fault->dir,
- &ibd->fault->enable))
- goto fail;
- if (!debugfs_create_bool("suppress_err", 0600,
- ibd->fault->dir,
- &ibd->fault->suppress_err))
- goto fail;
- if (!debugfs_create_bool("opcode_mode", 0600, ibd->fault->dir,
- &ibd->fault->opcode))
- goto fail;
- if (!debugfs_create_file("opcodes", 0600, ibd->fault->dir,
- ibd->fault, &__fault_opcodes_fops))
- goto fail;
- if (!debugfs_create_u64("skip_pkts", 0600,
- ibd->fault->dir,
- &ibd->fault->fault_skip))
- goto fail;
- if (!debugfs_create_u64("skip_usec", 0600,
- ibd->fault->dir,
- &ibd->fault->fault_skip_usec))
- goto fail;
- if (!debugfs_create_u8("direction", 0600, ibd->fault->dir,
- &ibd->fault->direction))
- goto fail;
+ ibd->fault->dir = fault_dir;
+
+ debugfs_create_file("fault_stats", 0444, fault_dir, ibd,
+ &_fault_stats_file_ops);
+ debugfs_create_bool("enable", 0600, fault_dir, &ibd->fault->enable);
+ debugfs_create_bool("suppress_err", 0600, fault_dir,
+ &ibd->fault->suppress_err);
+ debugfs_create_bool("opcode_mode", 0600, fault_dir,
+ &ibd->fault->opcode);
+ debugfs_create_file("opcodes", 0600, fault_dir, ibd->fault,
+ &__fault_opcodes_fops);
+ debugfs_create_u64("skip_pkts", 0600, fault_dir,
+ &ibd->fault->fault_skip);
+ debugfs_create_u64("skip_usec", 0600, fault_dir,
+ &ibd->fault->fault_skip_usec);
+ debugfs_create_u8("direction", 0600, fault_dir, &ibd->fault->direction);
return 0;
-fail:
- hfi1_fault_exit_debugfs(ibd);
- return -ENOMEM;
}
bool hfi1_dbg_fault_suppress_err(struct hfi1_ibdev *ibd)
diff --git a/drivers/infiniband/hw/hfi1/hfi.h b/drivers/infiniband/hw/hfi1/hfi.h
index 6db2276f5c13..048b5d73ba39 100644
--- a/drivers/infiniband/hw/hfi1/hfi.h
+++ b/drivers/infiniband/hw/hfi1/hfi.h
@@ -73,6 +73,7 @@
#include "chip_registers.h"
#include "common.h"
+#include "opfn.h"
#include "verbs.h"
#include "pio.h"
#include "chip.h"
@@ -98,6 +99,8 @@
#define NEIGHBOR_TYPE_HFI 0
#define NEIGHBOR_TYPE_SWITCH 1
+#define HFI1_MAX_ACTIVE_WORKQUEUE_ENTRIES 5
+
extern unsigned long hfi1_cap_mask;
#define HFI1_CAP_KGET_MASK(mask, cap) ((mask) & HFI1_CAP_##cap)
#define HFI1_CAP_UGET_MASK(mask, cap) \
@@ -195,6 +198,14 @@ struct exp_tid_set {
};
typedef int (*rhf_rcv_function_ptr)(struct hfi1_packet *packet);
+
+struct tid_queue {
+ struct list_head queue_head;
+ /* queue head for QP TID resource waiters */
+ u32 enqueue; /* count of tid enqueues */
+ u32 dequeue; /* count of tid dequeues */
+};
+
struct hfi1_ctxtdata {
/* rcvhdrq base, needs mmap before useful */
void *rcvhdrq;
@@ -288,6 +299,12 @@ struct hfi1_ctxtdata {
/* PSM Specific fields */
/* lock protecting all Expected TID data */
struct mutex exp_mutex;
+ /* lock protecting all Expected TID data of kernel contexts */
+ spinlock_t exp_lock;
+ /* Queue for QP's waiting for HW TID flows */
+ struct tid_queue flow_queue;
+ /* Queue for QP's waiting for HW receive array entries */
+ struct tid_queue rarr_queue;
/* when waiting for rcv or pioavail */
wait_queue_head_t wait;
/* uuid from PSM */
@@ -320,6 +337,9 @@ struct hfi1_ctxtdata {
*/
u8 subctxt_cnt;
+ /* Bit mask to track free TID RDMA HW flows */
+ unsigned long flow_mask;
+ struct tid_flow_state flows[RXE_NUM_TID_FLOWS];
};
/**
@@ -1435,7 +1455,7 @@ void hfi1_init_pportdata(struct pci_dev *pdev, struct hfi1_pportdata *ppd,
struct hfi1_devdata *dd, u8 hw_pidx, u8 port);
void hfi1_free_ctxtdata(struct hfi1_devdata *dd, struct hfi1_ctxtdata *rcd);
int hfi1_rcd_put(struct hfi1_ctxtdata *rcd);
-void hfi1_rcd_get(struct hfi1_ctxtdata *rcd);
+int hfi1_rcd_get(struct hfi1_ctxtdata *rcd);
struct hfi1_ctxtdata *hfi1_rcd_get_by_index_safe(struct hfi1_devdata *dd,
u16 ctxt);
struct hfi1_ctxtdata *hfi1_rcd_get_by_index(struct hfi1_devdata *dd, u16 ctxt);
@@ -2100,7 +2120,7 @@ static inline u64 hfi1_pkt_default_send_ctxt_mask(struct hfi1_devdata *dd,
SEND_CTXT_CHECK_ENABLE_DISALLOW_PBC_TEST_SMASK |
#endif
HFI1_PKT_USER_SC_INTEGRITY;
- else
+ else if (ctxt_type != SC_KERNEL)
base_sc_integrity |= HFI1_PKT_KERNEL_SC_INTEGRITY;
/* turn on send-side job key checks if !A0 */
diff --git a/drivers/infiniband/hw/hfi1/init.c b/drivers/infiniband/hw/hfi1/init.c
index 441b06e2a154..faaaac8fbc55 100644
--- a/drivers/infiniband/hw/hfi1/init.c
+++ b/drivers/infiniband/hw/hfi1/init.c
@@ -73,7 +73,6 @@
#undef pr_fmt
#define pr_fmt(fmt) DRIVER_NAME ": " fmt
-#define HFI1_MAX_ACTIVE_WORKQUEUE_ENTRIES 5
/*
* min buffers we want to have per context, after driver
*/
@@ -216,12 +215,12 @@ static void hfi1_rcd_free(struct kref *kref)
struct hfi1_ctxtdata *rcd =
container_of(kref, struct hfi1_ctxtdata, kref);
- hfi1_free_ctxtdata(rcd->dd, rcd);
-
spin_lock_irqsave(&rcd->dd->uctxt_lock, flags);
rcd->dd->rcd[rcd->ctxt] = NULL;
spin_unlock_irqrestore(&rcd->dd->uctxt_lock, flags);
+ hfi1_free_ctxtdata(rcd->dd, rcd);
+
kfree(rcd);
}
@@ -244,10 +243,13 @@ int hfi1_rcd_put(struct hfi1_ctxtdata *rcd)
* @rcd: pointer to an initialized rcd data structure
*
* Use this to get a reference after the init.
+ *
+ * Return : reflect kref_get_unless_zero(), which returns non-zero on
+ * increment, otherwise 0.
*/
-void hfi1_rcd_get(struct hfi1_ctxtdata *rcd)
+int hfi1_rcd_get(struct hfi1_ctxtdata *rcd)
{
- kref_get(&rcd->kref);
+ return kref_get_unless_zero(&rcd->kref);
}
/**
@@ -327,7 +329,8 @@ struct hfi1_ctxtdata *hfi1_rcd_get_by_index(struct hfi1_devdata *dd, u16 ctxt)
spin_lock_irqsave(&dd->uctxt_lock, flags);
if (dd->rcd[ctxt]) {
rcd = dd->rcd[ctxt];
- hfi1_rcd_get(rcd);
+ if (!hfi1_rcd_get(rcd))
+ rcd = NULL;
}
spin_unlock_irqrestore(&dd->uctxt_lock, flags);
@@ -372,6 +375,9 @@ int hfi1_create_ctxtdata(struct hfi1_pportdata *ppd, int numa,
rcd->rhf_rcv_function_map = normal_rhf_rcv_functions;
mutex_init(&rcd->exp_mutex);
+ spin_lock_init(&rcd->exp_lock);
+ INIT_LIST_HEAD(&rcd->flow_queue.queue_head);
+ INIT_LIST_HEAD(&rcd->rarr_queue.queue_head);
hfi1_cdbg(PROC, "setting up context %u\n", rcd->ctxt);
@@ -474,6 +480,9 @@ int hfi1_create_ctxtdata(struct hfi1_pportdata *ppd, int numa,
GFP_KERNEL, numa);
if (!rcd->opstats)
goto bail;
+
+ /* Initialize TID flow generations for the context */
+ hfi1_kern_init_ctxt_generations(rcd);
}
*context = rcd;
@@ -773,6 +782,8 @@ static void enable_chip(struct hfi1_devdata *dd)
rcvmask |= HFI1_RCVCTRL_NO_RHQ_DROP_ENB;
if (HFI1_CAP_KGET_MASK(rcd->flags, NODROP_EGR_FULL))
rcvmask |= HFI1_RCVCTRL_NO_EGR_DROP_ENB;
+ if (HFI1_CAP_IS_KSET(TID_RDMA))
+ rcvmask |= HFI1_RCVCTRL_TIDFLOW_ENB;
hfi1_rcvctrl(dd, rcvmask, rcd);
sc_enable(rcd->sc);
hfi1_rcd_put(rcd);
@@ -928,6 +939,8 @@ int hfi1_init(struct hfi1_devdata *dd, int reinit)
lastfail = hfi1_create_rcvhdrq(dd, rcd);
if (!lastfail)
lastfail = hfi1_setup_eagerbufs(rcd);
+ if (!lastfail)
+ lastfail = hfi1_kern_exp_rcv_init(rcd, reinit);
if (lastfail) {
dd_dev_err(dd,
"failed to allocate kernel ctxt's rcvhdrq and/or egr bufs\n");
@@ -1498,6 +1511,13 @@ static int __init hfi1_mod_init(void)
/* sanitize link CRC options */
link_crc_mask &= SUPPORTED_CRCS;
+ ret = opfn_init();
+ if (ret < 0) {
+ pr_err("Failed to allocate opfn_wq");
+ goto bail_dev;
+ }
+
+ hfi1_compute_tid_rdma_flow_wt();
/*
* These must be called before the driver is registered with
* the PCI subsystem.
@@ -1528,6 +1548,7 @@ module_init(hfi1_mod_init);
static void __exit hfi1_mod_cleanup(void)
{
pci_unregister_driver(&hfi1_pci_driver);
+ opfn_exit();
node_affinity_destroy_all();
hfi1_dbg_exit();
@@ -1582,7 +1603,7 @@ static void cleanup_device_data(struct hfi1_devdata *dd)
struct hfi1_ctxtdata *rcd = dd->rcd[ctxt];
if (rcd) {
- hfi1_clear_tids(rcd);
+ hfi1_free_ctxt_rcv_groups(rcd);
hfi1_free_ctxt(rcd);
}
}
diff --git a/drivers/infiniband/hw/hfi1/iowait.c b/drivers/infiniband/hw/hfi1/iowait.c
index 582f1ba136ff..adb4a1ba921b 100644
--- a/drivers/infiniband/hw/hfi1/iowait.c
+++ b/drivers/infiniband/hw/hfi1/iowait.c
@@ -6,6 +6,9 @@
#include "iowait.h"
#include "trace_iowait.h"
+/* 1 priority == 16 starve_cnt */
+#define IOWAIT_PRIORITY_STARVE_SHIFT 4
+
void iowait_set_flag(struct iowait *wait, u32 flag)
{
trace_hfi1_iowait_set(wait, flag);
@@ -44,7 +47,8 @@ void iowait_init(struct iowait *wait, u32 tx_limit,
uint seq,
bool pkts_sent),
void (*wakeup)(struct iowait *wait, int reason),
- void (*sdma_drained)(struct iowait *wait))
+ void (*sdma_drained)(struct iowait *wait),
+ void (*init_priority)(struct iowait *wait))
{
int i;
@@ -58,6 +62,7 @@ void iowait_init(struct iowait *wait, u32 tx_limit,
wait->sleep = sleep;
wait->wakeup = wakeup;
wait->sdma_drained = sdma_drained;
+ wait->init_priority = init_priority;
wait->flags = 0;
for (i = 0; i < IOWAIT_SES; i++) {
wait->wait[i].iow = wait;
@@ -92,3 +97,30 @@ int iowait_set_work_flag(struct iowait_work *w)
iowait_set_flag(w->iow, IOWAIT_PENDING_TID);
return IOWAIT_TID_SE;
}
+
+/**
+ * iowait_priority_update_top - update the top priority entry
+ * @w: the iowait struct
+ * @top: a pointer to the top priority entry
+ * @idx: the index of the current iowait in an array
+ * @top_idx: the array index for the iowait entry that has the top priority
+ *
+ * This function is called to compare the priority of a given
+ * iowait with the given top priority entry. The top index will
+ * be returned.
+ */
+uint iowait_priority_update_top(struct iowait *w,
+ struct iowait *top,
+ uint idx, uint top_idx)
+{
+ u8 cnt, tcnt;
+
+ /* Convert priority into starve_cnt and compare the total.*/
+ cnt = (w->priority << IOWAIT_PRIORITY_STARVE_SHIFT) + w->starved_cnt;
+ tcnt = (top->priority << IOWAIT_PRIORITY_STARVE_SHIFT) +
+ top->starved_cnt;
+ if (cnt > tcnt)
+ return idx;
+ else
+ return top_idx;
+}
diff --git a/drivers/infiniband/hw/hfi1/iowait.h b/drivers/infiniband/hw/hfi1/iowait.h
index 23a58ac0d47c..07847cb72169 100644
--- a/drivers/infiniband/hw/hfi1/iowait.h
+++ b/drivers/infiniband/hw/hfi1/iowait.h
@@ -100,6 +100,7 @@ struct iowait_work {
* @sleep: no space callback
* @wakeup: space callback wakeup
* @sdma_drained: sdma count drained
+ * @init_priority: callback to manipulate priority
* @lock: lock protected head of wait queue
* @iowork: workqueue overhead
* @wait_dma: wait for sdma_busy == 0
@@ -109,7 +110,7 @@ struct iowait_work {
* @tx_limit: limit for overflow queuing
* @tx_count: number of tx entry's in tx_head'ed list
* @flags: wait flags (one per QP)
- * @wait: SE array
+ * @wait: SE array for multiple legs
*
* This is to be embedded in user's state structure
* (QP or PQ).
@@ -120,10 +121,13 @@ struct iowait_work {
* are callbacks for the ULP to implement
* what ever queuing/dequeuing of
* the embedded iowait and its containing struct
- * when a resource shortage like SDMA ring space is seen.
+ * when a resource shortage like SDMA ring space
+ * or PIO credit space is seen.
*
* Both potentially have locks help
- * so sleeping is not allowed.
+ * so sleeping is not allowed and it is not
+ * supported to submit txreqs from the wakeup
+ * call directly because of lock conflicts.
*
* The wait_dma member along with the iow
*
@@ -143,6 +147,7 @@ struct iowait {
);
void (*wakeup)(struct iowait *wait, int reason);
void (*sdma_drained)(struct iowait *wait);
+ void (*init_priority)(struct iowait *wait);
seqlock_t *lock;
wait_queue_head_t wait_dma;
wait_queue_head_t wait_pio;
@@ -152,6 +157,7 @@ struct iowait {
u32 tx_limit;
u32 tx_count;
u8 starved_cnt;
+ u8 priority;
unsigned long flags;
struct iowait_work wait[IOWAIT_SES];
};
@@ -171,7 +177,8 @@ void iowait_init(struct iowait *wait, u32 tx_limit,
uint seq,
bool pkts_sent),
void (*wakeup)(struct iowait *wait, int reason),
- void (*sdma_drained)(struct iowait *wait));
+ void (*sdma_drained)(struct iowait *wait),
+ void (*init_priority)(struct iowait *wait));
/**
* iowait_schedule() - schedule the default send engine work
@@ -186,6 +193,18 @@ static inline bool iowait_schedule(struct iowait *wait,
}
/**
+ * iowait_tid_schedule - schedule the tid SE
+ * @wait: the iowait structure
+ * @wq: the work queue
+ * @cpu: the cpu
+ */
+static inline bool iowait_tid_schedule(struct iowait *wait,
+ struct workqueue_struct *wq, int cpu)
+{
+ return !!queue_work_on(cpu, wq, &wait->wait[IOWAIT_TID_SE].iowork);
+}
+
+/**
* iowait_sdma_drain() - wait for DMAs to drain
*
* @wait: iowait structure
@@ -327,6 +346,8 @@ static inline u16 iowait_get_desc(struct iowait_work *w)
tx = list_first_entry(&w->tx_head, struct sdma_txreq,
list);
num_desc = tx->num_desc;
+ if (tx->flags & SDMA_TXREQ_F_VIP)
+ w->iow->priority++;
}
return num_desc;
}
@@ -340,6 +361,37 @@ static inline u32 iowait_get_all_desc(struct iowait *w)
return num_desc;
}
+static inline void iowait_update_priority(struct iowait_work *w)
+{
+ struct sdma_txreq *tx = NULL;
+
+ if (!list_empty(&w->tx_head)) {
+ tx = list_first_entry(&w->tx_head, struct sdma_txreq,
+ list);
+ if (tx->flags & SDMA_TXREQ_F_VIP)
+ w->iow->priority++;
+ }
+}
+
+static inline void iowait_update_all_priority(struct iowait *w)
+{
+ iowait_update_priority(&w->wait[IOWAIT_IB_SE]);
+ iowait_update_priority(&w->wait[IOWAIT_TID_SE]);
+}
+
+static inline void iowait_init_priority(struct iowait *w)
+{
+ w->priority = 0;
+ if (w->init_priority)
+ w->init_priority(w);
+}
+
+static inline void iowait_get_priority(struct iowait *w)
+{
+ iowait_init_priority(w);
+ iowait_update_all_priority(w);
+}
+
/**
* iowait_queue - Put the iowait on a wait queue
* @pkts_sent: have some packets been sent before queuing?
@@ -356,14 +408,18 @@ static inline void iowait_queue(bool pkts_sent, struct iowait *w,
/*
* To play fair, insert the iowait at the tail of the wait queue if it
* has already sent some packets; Otherwise, put it at the head.
+ * However, if it has priority packets to send, also put it at the
+ * head.
*/
- if (pkts_sent) {
- list_add_tail(&w->list, wait_head);
+ if (pkts_sent)
w->starved_cnt = 0;
- } else {
- list_add(&w->list, wait_head);
+ else
w->starved_cnt++;
- }
+
+ if (w->priority > 0 || !pkts_sent)
+ list_add(&w->list, wait_head);
+ else
+ list_add_tail(&w->list, wait_head);
}
/**
@@ -380,27 +436,10 @@ static inline void iowait_starve_clear(bool pkts_sent, struct iowait *w)
w->starved_cnt = 0;
}
-/**
- * iowait_starve_find_max - Find the maximum of the starve count
- * @w: the iowait struct
- * @max: a variable containing the max starve count
- * @idx: the index of the current iowait in an array
- * @max_idx: a variable containing the array index for the
- * iowait entry that has the max starve count
- *
- * This function is called to compare the starve count of a
- * given iowait with the given max starve count. The max starve
- * count and the index will be updated if the iowait's start
- * count is larger.
- */
-static inline void iowait_starve_find_max(struct iowait *w, u8 *max,
- uint idx, uint *max_idx)
-{
- if (w->starved_cnt > *max) {
- *max = w->starved_cnt;
- *max_idx = idx;
- }
-}
+/* Update the top priority index */
+uint iowait_priority_update_top(struct iowait *w,
+ struct iowait *top,
+ uint idx, uint top_idx);
/**
* iowait_packet_queued() - determine if a packet is queued
diff --git a/drivers/infiniband/hw/hfi1/opfn.c b/drivers/infiniband/hw/hfi1/opfn.c
new file mode 100644
index 000000000000..370a5a8eaa71
--- /dev/null
+++ b/drivers/infiniband/hw/hfi1/opfn.c
@@ -0,0 +1,323 @@
+// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause)
+/*
+ * Copyright(c) 2018 Intel Corporation.
+ *
+ */
+#include "hfi.h"
+#include "trace.h"
+#include "qp.h"
+#include "opfn.h"
+
+#define IB_BTHE_E BIT(IB_BTHE_E_SHIFT)
+
+#define OPFN_CODE(code) BIT((code) - 1)
+#define OPFN_MASK(code) OPFN_CODE(STL_VERBS_EXTD_##code)
+
+struct hfi1_opfn_type {
+ bool (*request)(struct rvt_qp *qp, u64 *data);
+ bool (*response)(struct rvt_qp *qp, u64 *data);
+ bool (*reply)(struct rvt_qp *qp, u64 data);
+ void (*error)(struct rvt_qp *qp);
+};
+
+static struct hfi1_opfn_type hfi1_opfn_handlers[STL_VERBS_EXTD_MAX] = {
+ [STL_VERBS_EXTD_TID_RDMA] = {
+ .request = tid_rdma_conn_req,
+ .response = tid_rdma_conn_resp,
+ .reply = tid_rdma_conn_reply,
+ .error = tid_rdma_conn_error,
+ },
+};
+
+static struct workqueue_struct *opfn_wq;
+
+static void opfn_schedule_conn_request(struct rvt_qp *qp);
+
+static bool hfi1_opfn_extended(u32 bth1)
+{
+ return !!(bth1 & IB_BTHE_E);
+}
+
+static void opfn_conn_request(struct rvt_qp *qp)
+{
+ struct hfi1_qp_priv *priv = qp->priv;
+ struct ib_atomic_wr wr;
+ u16 mask, capcode;
+ struct hfi1_opfn_type *extd;
+ u64 data;
+ unsigned long flags;
+ int ret = 0;
+
+ trace_hfi1_opfn_state_conn_request(qp);
+ spin_lock_irqsave(&priv->opfn.lock, flags);
+ /*
+ * Exit if the extended bit is not set, or if nothing is requested, or
+ * if we have completed all requests, or if a previous request is in
+ * progress
+ */
+ if (!priv->opfn.extended || !priv->opfn.requested ||
+ priv->opfn.requested == priv->opfn.completed || priv->opfn.curr)
+ goto done;
+
+ mask = priv->opfn.requested & ~priv->opfn.completed;
+ capcode = ilog2(mask & ~(mask - 1)) + 1;
+ if (capcode >= STL_VERBS_EXTD_MAX) {
+ priv->opfn.completed |= OPFN_CODE(capcode);
+ goto done;
+ }
+
+ extd = &hfi1_opfn_handlers[capcode];
+ if (!extd || !extd->request || !extd->request(qp, &data)) {
+ /*
+ * Either there is no handler for this capability or the request
+ * packet could not be generated. Either way, mark it as done so
+ * we don't keep attempting to complete it.
+ */
+ priv->opfn.completed |= OPFN_CODE(capcode);
+ goto done;
+ }
+
+ trace_hfi1_opfn_data_conn_request(qp, capcode, data);
+ data = (data & ~0xf) | capcode;
+
+ memset(&wr, 0, sizeof(wr));
+ wr.wr.opcode = IB_WR_OPFN;
+ wr.remote_addr = HFI1_VERBS_E_ATOMIC_VADDR;
+ wr.compare_add = data;
+
+ priv->opfn.curr = capcode; /* A new request is now in progress */
+ /* Drop opfn.lock before calling ib_post_send() */
+ spin_unlock_irqrestore(&priv->opfn.lock, flags);
+
+ ret = ib_post_send(&qp->ibqp, &wr.wr, NULL);
+ if (ret)
+ goto err;
+ trace_hfi1_opfn_state_conn_request(qp);
+ return;
+err:
+ trace_hfi1_msg_opfn_conn_request(qp, "ib_ost_send failed: ret = ",
+ (u64)ret);
+ spin_lock_irqsave(&priv->opfn.lock, flags);
+ /*
+ * In case of an unexpected error return from ib_post_send
+ * clear opfn.curr and reschedule to try again
+ */
+ priv->opfn.curr = STL_VERBS_EXTD_NONE;
+ opfn_schedule_conn_request(qp);
+done:
+ spin_unlock_irqrestore(&priv->opfn.lock, flags);
+}
+
+void opfn_send_conn_request(struct work_struct *work)
+{
+ struct hfi1_opfn_data *od;
+ struct hfi1_qp_priv *qpriv;
+
+ od = container_of(work, struct hfi1_opfn_data, opfn_work);
+ qpriv = container_of(od, struct hfi1_qp_priv, opfn);
+
+ opfn_conn_request(qpriv->owner);
+}
+
+/*
+ * When QP s_lock is held in the caller, the OPFN request must be scheduled
+ * to a different workqueue to avoid double locking QP s_lock in call to
+ * ib_post_send in opfn_conn_request
+ */
+static void opfn_schedule_conn_request(struct rvt_qp *qp)
+{
+ struct hfi1_qp_priv *priv = qp->priv;
+
+ trace_hfi1_opfn_state_sched_conn_request(qp);
+ queue_work(opfn_wq, &priv->opfn.opfn_work);
+}
+
+void opfn_conn_response(struct rvt_qp *qp, struct rvt_ack_entry *e,
+ struct ib_atomic_eth *ateth)
+{
+ struct hfi1_qp_priv *priv = qp->priv;
+ u64 data = be64_to_cpu(ateth->compare_data);
+ struct hfi1_opfn_type *extd;
+ u8 capcode;
+ unsigned long flags;
+
+ trace_hfi1_opfn_state_conn_response(qp);
+ capcode = data & 0xf;
+ trace_hfi1_opfn_data_conn_response(qp, capcode, data);
+ if (!capcode || capcode >= STL_VERBS_EXTD_MAX)
+ return;
+
+ extd = &hfi1_opfn_handlers[capcode];
+
+ if (!extd || !extd->response) {
+ e->atomic_data = capcode;
+ return;
+ }
+
+ spin_lock_irqsave(&priv->opfn.lock, flags);
+ if (priv->opfn.completed & OPFN_CODE(capcode)) {
+ /*
+ * We are receiving a request for a feature that has already
+ * been negotiated. This may mean that the other side has reset
+ */
+ priv->opfn.completed &= ~OPFN_CODE(capcode);
+ if (extd->error)
+ extd->error(qp);
+ }
+
+ if (extd->response(qp, &data))
+ priv->opfn.completed |= OPFN_CODE(capcode);
+ e->atomic_data = (data & ~0xf) | capcode;
+ trace_hfi1_opfn_state_conn_response(qp);
+ spin_unlock_irqrestore(&priv->opfn.lock, flags);
+}
+
+void opfn_conn_reply(struct rvt_qp *qp, u64 data)
+{
+ struct hfi1_qp_priv *priv = qp->priv;
+ struct hfi1_opfn_type *extd;
+ u8 capcode;
+ unsigned long flags;
+
+ trace_hfi1_opfn_state_conn_reply(qp);
+ capcode = data & 0xf;
+ trace_hfi1_opfn_data_conn_reply(qp, capcode, data);
+ if (!capcode || capcode >= STL_VERBS_EXTD_MAX)
+ return;
+
+ spin_lock_irqsave(&priv->opfn.lock, flags);
+ /*
+ * Either there is no previous request or the reply is not for the
+ * current request
+ */
+ if (!priv->opfn.curr || capcode != priv->opfn.curr)
+ goto done;
+
+ extd = &hfi1_opfn_handlers[capcode];
+
+ if (!extd || !extd->reply)
+ goto clear;
+
+ if (extd->reply(qp, data))
+ priv->opfn.completed |= OPFN_CODE(capcode);
+clear:
+ /*
+ * Clear opfn.curr to indicate that the previous request is no longer in
+ * progress
+ */
+ priv->opfn.curr = STL_VERBS_EXTD_NONE;
+ trace_hfi1_opfn_state_conn_reply(qp);
+done:
+ spin_unlock_irqrestore(&priv->opfn.lock, flags);
+}
+
+void opfn_conn_error(struct rvt_qp *qp)
+{
+ struct hfi1_qp_priv *priv = qp->priv;
+ struct hfi1_opfn_type *extd = NULL;
+ unsigned long flags;
+ u16 capcode;
+
+ trace_hfi1_opfn_state_conn_error(qp);
+ trace_hfi1_msg_opfn_conn_error(qp, "error. qp state ", (u64)qp->state);
+ /*
+ * The QP has gone into the Error state. We have to invalidate all
+ * negotiated feature, including the one in progress (if any). The RC
+ * QP handling will clean the WQE for the connection request.
+ */
+ spin_lock_irqsave(&priv->opfn.lock, flags);
+ while (priv->opfn.completed) {
+ capcode = priv->opfn.completed & ~(priv->opfn.completed - 1);
+ extd = &hfi1_opfn_handlers[ilog2(capcode) + 1];
+ if (extd->error)
+ extd->error(qp);
+ priv->opfn.completed &= ~OPFN_CODE(capcode);
+ }
+ priv->opfn.extended = 0;
+ priv->opfn.requested = 0;
+ priv->opfn.curr = STL_VERBS_EXTD_NONE;
+ spin_unlock_irqrestore(&priv->opfn.lock, flags);
+}
+
+void opfn_qp_init(struct rvt_qp *qp, struct ib_qp_attr *attr, int attr_mask)
+{
+ struct ib_qp *ibqp = &qp->ibqp;
+ struct hfi1_qp_priv *priv = qp->priv;
+ unsigned long flags;
+
+ if (attr_mask & IB_QP_RETRY_CNT)
+ priv->s_retry = attr->retry_cnt;
+
+ spin_lock_irqsave(&priv->opfn.lock, flags);
+ if (ibqp->qp_type == IB_QPT_RC && HFI1_CAP_IS_KSET(TID_RDMA)) {
+ struct tid_rdma_params *local = &priv->tid_rdma.local;
+
+ if (attr_mask & IB_QP_TIMEOUT)
+ priv->tid_retry_timeout_jiffies = qp->timeout_jiffies;
+ if (qp->pmtu == enum_to_mtu(OPA_MTU_4096) ||
+ qp->pmtu == enum_to_mtu(OPA_MTU_8192)) {
+ tid_rdma_opfn_init(qp, local);
+ /*
+ * We only want to set the OPFN requested bit when the
+ * QP transitions to RTS.
+ */
+ if (attr_mask & IB_QP_STATE &&
+ attr->qp_state == IB_QPS_RTS) {
+ priv->opfn.requested |= OPFN_MASK(TID_RDMA);
+ /*
+ * If the QP is transitioning to RTS and the
+ * opfn.completed for TID RDMA has already been
+ * set, the QP is being moved *back* into RTS.
+ * We can now renegotiate the TID RDMA
+ * parameters.
+ */
+ if (priv->opfn.completed &
+ OPFN_MASK(TID_RDMA)) {
+ priv->opfn.completed &=
+ ~OPFN_MASK(TID_RDMA);
+ /*
+ * Since the opfn.completed bit was
+ * already set, it is safe to assume
+ * that the opfn.extended is also set.
+ */
+ opfn_schedule_conn_request(qp);
+ }
+ }
+ } else {
+ memset(local, 0, sizeof(*local));
+ }
+ }
+ spin_unlock_irqrestore(&priv->opfn.lock, flags);
+}
+
+void opfn_trigger_conn_request(struct rvt_qp *qp, u32 bth1)
+{
+ struct hfi1_qp_priv *priv = qp->priv;
+
+ if (!priv->opfn.extended && hfi1_opfn_extended(bth1) &&
+ HFI1_CAP_IS_KSET(OPFN)) {
+ priv->opfn.extended = 1;
+ if (qp->state == IB_QPS_RTS)
+ opfn_conn_request(qp);
+ }
+}
+
+int opfn_init(void)
+{
+ opfn_wq = alloc_workqueue("hfi_opfn",
+ WQ_SYSFS | WQ_HIGHPRI | WQ_CPU_INTENSIVE |
+ WQ_MEM_RECLAIM,
+ HFI1_MAX_ACTIVE_WORKQUEUE_ENTRIES);
+ if (!opfn_wq)
+ return -ENOMEM;
+
+ return 0;
+}
+
+void opfn_exit(void)
+{
+ if (opfn_wq) {
+ destroy_workqueue(opfn_wq);
+ opfn_wq = NULL;
+ }
+}
diff --git a/drivers/infiniband/hw/hfi1/opfn.h b/drivers/infiniband/hw/hfi1/opfn.h
new file mode 100644
index 000000000000..5f2011cabc25
--- /dev/null
+++ b/drivers/infiniband/hw/hfi1/opfn.h
@@ -0,0 +1,85 @@
+/* SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) */
+/*
+ * Copyright(c) 2018 Intel Corporation.
+ *
+ */
+#ifndef _HFI1_OPFN_H
+#define _HFI1_OPFN_H
+
+/**
+ * DOC: Omni Path Feature Negotion (OPFN)
+ *
+ * OPFN is a discovery protocol for Intel Omni-Path fabric that
+ * allows two RC QPs to negotiate a common feature that both QPs
+ * can support. Currently, the only OPA feature that OPFN
+ * supports is TID RDMA.
+ *
+ * Architecture
+ *
+ * OPFN involves the communication between two QPs on the HFI
+ * level on an Omni-Path fabric, and ULPs have no knowledge of
+ * OPFN at all.
+ *
+ * Implementation
+ *
+ * OPFN extends the existing IB RC protocol with the following
+ * changes:
+ * -- Uses Bit 24 (reserved) of DWORD 1 of Base Transport
+ * Header (BTH1) to indicate that the RC QP supports OPFN;
+ * -- Uses a combination of RC COMPARE_SWAP opcode (0x13) and
+ * the address U64_MAX (0xFFFFFFFFFFFFFFFF) as an OPFN
+ * request; The 64-bit data carried with the request/response
+ * contains the parameters for negotiation and will be
+ * defined in tid_rdma.c file;
+ * -- Defines IB_WR_RESERVED3 as IB_WR_OPFN.
+ *
+ * The OPFN communication will be triggered when an RC QP
+ * receives a request with Bit 24 of BTH1 set. The responder QP
+ * will then post send an OPFN request with its local
+ * parameters, which will be sent to the requester QP once all
+ * existing requests on the responder QP side have been sent.
+ * Once the requester QP receives the OPFN request, it will
+ * keep a copy of the responder QP's parameters, and return a
+ * response packet with its own local parameters. The responder
+ * QP receives the response packet and keeps a copy of the requester
+ * QP's parameters. After this exchange, each side has the parameters
+ * for both sides and therefore can select the right parameters
+ * for future transactions
+ */
+
+/* STL Verbs Extended */
+#define IB_BTHE_E_SHIFT 24
+#define HFI1_VERBS_E_ATOMIC_VADDR U64_MAX
+
+struct ib_atomic_eth;
+
+enum hfi1_opfn_codes {
+ STL_VERBS_EXTD_NONE = 0,
+ STL_VERBS_EXTD_TID_RDMA,
+ STL_VERBS_EXTD_MAX
+};
+
+struct hfi1_opfn_data {
+ u8 extended;
+ u16 requested;
+ u16 completed;
+ enum hfi1_opfn_codes curr;
+ /* serialize opfn function calls */
+ spinlock_t lock;
+ struct work_struct opfn_work;
+};
+
+/* WR opcode for OPFN */
+#define IB_WR_OPFN IB_WR_RESERVED3
+
+void opfn_send_conn_request(struct work_struct *work);
+void opfn_conn_response(struct rvt_qp *qp, struct rvt_ack_entry *e,
+ struct ib_atomic_eth *ateth);
+void opfn_conn_reply(struct rvt_qp *qp, u64 data);
+void opfn_conn_error(struct rvt_qp *qp);
+void opfn_qp_init(struct rvt_qp *qp, struct ib_qp_attr *attr, int attr_mask);
+void opfn_trigger_conn_request(struct rvt_qp *qp, u32 bth1);
+int opfn_init(void);
+void opfn_exit(void);
+
+#endif /* _HFI1_OPFN_H */
diff --git a/drivers/infiniband/hw/hfi1/pio.c b/drivers/infiniband/hw/hfi1/pio.c
index 04126d7e318d..a1de566fe95e 100644
--- a/drivers/infiniband/hw/hfi1/pio.c
+++ b/drivers/infiniband/hw/hfi1/pio.c
@@ -1599,8 +1599,7 @@ static void sc_piobufavail(struct send_context *sc)
struct rvt_qp *qp;
struct hfi1_qp_priv *priv;
unsigned long flags;
- uint i, n = 0, max_idx = 0;
- u8 max_starved_cnt = 0;
+ uint i, n = 0, top_idx = 0;
if (dd->send_contexts[sc->sw_index].type != SC_KERNEL &&
dd->send_contexts[sc->sw_index].type != SC_VL15)
@@ -1619,11 +1618,18 @@ static void sc_piobufavail(struct send_context *sc)
if (n == ARRAY_SIZE(qps))
break;
wait = list_first_entry(list, struct iowait, list);
+ iowait_get_priority(wait);
qp = iowait_to_qp(wait);
priv = qp->priv;
list_del_init(&priv->s_iowait.list);
priv->s_iowait.lock = NULL;
- iowait_starve_find_max(wait, &max_starved_cnt, n, &max_idx);
+ if (n) {
+ priv = qps[top_idx]->priv;
+ top_idx = iowait_priority_update_top(wait,
+ &priv->s_iowait,
+ n, top_idx);
+ }
+
/* refcount held until actual wake up */
qps[n++] = qp;
}
@@ -1638,12 +1644,12 @@ static void sc_piobufavail(struct send_context *sc)
}
write_sequnlock_irqrestore(&sc->waitlock, flags);
- /* Wake up the most starved one first */
+ /* Wake up the top-priority one first */
if (n)
- hfi1_qp_wakeup(qps[max_idx],
+ hfi1_qp_wakeup(qps[top_idx],
RVT_S_WAIT_PIO | HFI1_S_WAIT_PIO_DRAIN);
for (i = 0; i < n; i++)
- if (i != max_idx)
+ if (i != top_idx)
hfi1_qp_wakeup(qps[i],
RVT_S_WAIT_PIO | HFI1_S_WAIT_PIO_DRAIN);
}
diff --git a/drivers/infiniband/hw/hfi1/qp.c b/drivers/infiniband/hw/hfi1/qp.c
index 5344e8993b28..9b643c2409cf 100644
--- a/drivers/infiniband/hw/hfi1/qp.c
+++ b/drivers/infiniband/hw/hfi1/qp.c
@@ -132,6 +132,18 @@ const struct rvt_operation_params hfi1_post_parms[RVT_OPERATION_MAX] = {
.qpt_support = BIT(IB_QPT_RC),
},
+[IB_WR_OPFN] = {
+ .length = sizeof(struct ib_atomic_wr),
+ .qpt_support = BIT(IB_QPT_RC),
+ .flags = RVT_OPERATION_USE_RESERVE,
+},
+
+[IB_WR_TID_RDMA_WRITE] = {
+ .length = sizeof(struct ib_rdma_wr),
+ .qpt_support = BIT(IB_QPT_RC),
+ .flags = RVT_OPERATION_IGN_RNR_CNT,
+},
+
};
static void flush_list_head(struct list_head *l)
@@ -285,6 +297,8 @@ void hfi1_modify_qp(struct rvt_qp *qp, struct ib_qp_attr *attr,
priv->s_sendcontext = qp_to_send_context(qp, priv->s_sc);
qp_set_16b(qp);
}
+
+ opfn_qp_init(qp, attr, attr_mask);
}
/**
@@ -311,6 +325,8 @@ int hfi1_setup_wqe(struct rvt_qp *qp, struct rvt_swqe *wqe, bool *call_send)
switch (qp->ibqp.qp_type) {
case IB_QPT_RC:
+ hfi1_setup_tid_rdma_wqe(qp, wqe);
+ /* fall through */
case IB_QPT_UC:
if (wqe->length > 0x80000000U)
return -EINVAL;
@@ -422,6 +438,11 @@ static void hfi1_qp_schedule(struct rvt_qp *qp)
if (ret)
iowait_clear_flag(&priv->s_iowait, IOWAIT_PENDING_IB);
}
+ if (iowait_flag_set(&priv->s_iowait, IOWAIT_PENDING_TID)) {
+ ret = hfi1_schedule_tid_send(qp);
+ if (ret)
+ iowait_clear_flag(&priv->s_iowait, IOWAIT_PENDING_TID);
+ }
}
void hfi1_qp_wakeup(struct rvt_qp *qp, u32 flag)
@@ -441,8 +462,27 @@ void hfi1_qp_wakeup(struct rvt_qp *qp, u32 flag)
void hfi1_qp_unbusy(struct rvt_qp *qp, struct iowait_work *wait)
{
- if (iowait_set_work_flag(wait) == IOWAIT_IB_SE)
+ struct hfi1_qp_priv *priv = qp->priv;
+
+ if (iowait_set_work_flag(wait) == IOWAIT_IB_SE) {
qp->s_flags &= ~RVT_S_BUSY;
+ /*
+ * If we are sending a first-leg packet from the second leg,
+ * we need to clear the busy flag from priv->s_flags to
+ * avoid a race condition when the qp wakes up before
+ * the call to hfi1_verbs_send() returns to the second
+ * leg. In that case, the second leg will terminate without
+ * being re-scheduled, resulting in failure to send TID RDMA
+ * WRITE DATA and TID RDMA ACK packets.
+ */
+ if (priv->s_flags & HFI1_S_TID_BUSY_SET) {
+ priv->s_flags &= ~(HFI1_S_TID_BUSY_SET |
+ RVT_S_BUSY);
+ iowait_set_flag(&priv->s_iowait, IOWAIT_PENDING_TID);
+ }
+ } else {
+ priv->s_flags &= ~RVT_S_BUSY;
+ }
}
static int iowait_sleep(
@@ -479,6 +519,7 @@ static int iowait_sleep(
ibp->rvp.n_dmawait++;
qp->s_flags |= RVT_S_WAIT_DMA_DESC;
+ iowait_get_priority(&priv->s_iowait);
iowait_queue(pkts_sent, &priv->s_iowait,
&sde->dmawait);
priv->s_iowait.lock = &sde->waitlock;
@@ -528,6 +569,17 @@ static void iowait_sdma_drained(struct iowait *wait)
spin_unlock_irqrestore(&qp->s_lock, flags);
}
+static void hfi1_init_priority(struct iowait *w)
+{
+ struct rvt_qp *qp = iowait_to_qp(w);
+ struct hfi1_qp_priv *priv = qp->priv;
+
+ if (qp->s_flags & RVT_S_ACK_PENDING)
+ w->priority++;
+ if (priv->s_flags & RVT_S_ACK_PENDING)
+ w->priority++;
+}
+
/**
* qp_to_sdma_engine - map a qp to a send engine
* @qp: the QP
@@ -685,10 +737,11 @@ void *qp_priv_alloc(struct rvt_dev_info *rdi, struct rvt_qp *qp)
&priv->s_iowait,
1,
_hfi1_do_send,
- NULL,
+ _hfi1_do_tid_send,
iowait_sleep,
iowait_wakeup,
- iowait_sdma_drained);
+ iowait_sdma_drained,
+ hfi1_init_priority);
return priv;
}
@@ -696,6 +749,7 @@ void qp_priv_free(struct rvt_dev_info *rdi, struct rvt_qp *qp)
{
struct hfi1_qp_priv *priv = qp->priv;
+ hfi1_qp_priv_tid_free(rdi, qp);
kfree(priv->s_ahg);
kfree(priv);
}
@@ -729,6 +783,7 @@ void flush_qp_waiters(struct rvt_qp *qp)
{
lockdep_assert_held(&qp->s_lock);
flush_iowait(qp);
+ hfi1_tid_rdma_flush_wait(qp);
}
void stop_send_queue(struct rvt_qp *qp)
@@ -736,12 +791,16 @@ void stop_send_queue(struct rvt_qp *qp)
struct hfi1_qp_priv *priv = qp->priv;
iowait_cancel_work(&priv->s_iowait);
+ if (cancel_work_sync(&priv->tid_rdma.trigger_work))
+ rvt_put_qp(qp);
}
void quiesce_qp(struct rvt_qp *qp)
{
struct hfi1_qp_priv *priv = qp->priv;
+ hfi1_del_tid_reap_timer(qp);
+ hfi1_del_tid_retry_timer(qp);
iowait_sdma_drain(&priv->s_iowait);
qp_pio_drain(qp);
flush_tx_list(qp);
@@ -749,8 +808,13 @@ void quiesce_qp(struct rvt_qp *qp)
void notify_qp_reset(struct rvt_qp *qp)
{
+ hfi1_qp_kern_exp_rcv_clear_all(qp);
qp->r_adefered = 0;
clear_ahg(qp);
+
+ /* Clear any OPFN state */
+ if (qp->ibqp.qp_type == IB_QPT_RC)
+ opfn_conn_error(qp);
}
/*
@@ -832,7 +896,8 @@ void notify_error_qp(struct rvt_qp *qp)
if (lock) {
write_seqlock(lock);
if (!list_empty(&priv->s_iowait.list) &&
- !(qp->s_flags & RVT_S_BUSY)) {
+ !(qp->s_flags & RVT_S_BUSY) &&
+ !(priv->s_flags & RVT_S_BUSY)) {
qp->s_flags &= ~RVT_S_ANY_WAIT_IO;
list_del_init(&priv->s_iowait.list);
priv->s_iowait.lock = NULL;
@@ -841,7 +906,8 @@ void notify_error_qp(struct rvt_qp *qp)
write_sequnlock(lock);
}
- if (!(qp->s_flags & RVT_S_BUSY)) {
+ if (!(qp->s_flags & RVT_S_BUSY) && !(priv->s_flags & RVT_S_BUSY)) {
+ qp->s_hdrwords = 0;
if (qp->s_rdma_mr) {
rvt_put_mr(qp->s_rdma_mr);
qp->s_rdma_mr = NULL;
diff --git a/drivers/infiniband/hw/hfi1/qp.h b/drivers/infiniband/hw/hfi1/qp.h
index 7adb6dff6813..b670321365d3 100644
--- a/drivers/infiniband/hw/hfi1/qp.h
+++ b/drivers/infiniband/hw/hfi1/qp.h
@@ -63,11 +63,17 @@ extern const struct rvt_operation_params hfi1_post_parms[];
* HFI1_S_AHG_VALID - ahg header valid on chip
* HFI1_S_AHG_CLEAR - have send engine clear ahg state
* HFI1_S_WAIT_PIO_DRAIN - qp waiting for PIOs to drain
+ * HFI1_S_WAIT_TID_SPACE - a QP is waiting for TID resource
+ * HFI1_S_WAIT_TID_RESP - waiting for a TID RDMA WRITE response
+ * HFI1_S_WAIT_HALT - halt the first leg send engine
* HFI1_S_MIN_BIT_MASK - the lowest bit that can be used by hfi1
*/
#define HFI1_S_AHG_VALID 0x80000000
#define HFI1_S_AHG_CLEAR 0x40000000
#define HFI1_S_WAIT_PIO_DRAIN 0x20000000
+#define HFI1_S_WAIT_TID_SPACE 0x10000000
+#define HFI1_S_WAIT_TID_RESP 0x08000000
+#define HFI1_S_WAIT_HALT 0x04000000
#define HFI1_S_MIN_BIT_MASK 0x01000000
/*
@@ -76,6 +82,7 @@ extern const struct rvt_operation_params hfi1_post_parms[];
#define HFI1_S_ANY_WAIT_IO (RVT_S_ANY_WAIT_IO | HFI1_S_WAIT_PIO_DRAIN)
#define HFI1_S_ANY_WAIT (HFI1_S_ANY_WAIT_IO | RVT_S_ANY_WAIT_SEND)
+#define HFI1_S_ANY_TID_WAIT_SEND (RVT_S_WAIT_SSN_CREDIT | RVT_S_WAIT_DMA)
/*
* Send if not busy or waiting for I/O and either
diff --git a/drivers/infiniband/hw/hfi1/rc.c b/drivers/infiniband/hw/hfi1/rc.c
index be603f35d7e4..e6726c1ab866 100644
--- a/drivers/infiniband/hw/hfi1/rc.c
+++ b/drivers/infiniband/hw/hfi1/rc.c
@@ -51,24 +51,48 @@
#include "hfi.h"
#include "qp.h"
+#include "rc.h"
#include "verbs_txreq.h"
#include "trace.h"
-/* cut down ridiculously long IB macro names */
-#define OP(x) RC_OP(x)
-
-static u32 restart_sge(struct rvt_sge_state *ss, struct rvt_swqe *wqe,
- u32 psn, u32 pmtu)
+struct rvt_ack_entry *find_prev_entry(struct rvt_qp *qp, u32 psn, u8 *prev,
+ u8 *prev_ack, bool *scheduled)
+ __must_hold(&qp->s_lock)
{
- u32 len;
-
- len = delta_psn(psn, wqe->psn) * pmtu;
- ss->sge = wqe->sg_list[0];
- ss->sg_list = wqe->sg_list + 1;
- ss->num_sge = wqe->wr.num_sge;
- ss->total_len = wqe->length;
- rvt_skip_sge(ss, len, false);
- return wqe->length - len;
+ struct rvt_ack_entry *e = NULL;
+ u8 i, p;
+ bool s = true;
+
+ for (i = qp->r_head_ack_queue; ; i = p) {
+ if (i == qp->s_tail_ack_queue)
+ s = false;
+ if (i)
+ p = i - 1;
+ else
+ p = rvt_size_atomic(ib_to_rvt(qp->ibqp.device));
+ if (p == qp->r_head_ack_queue) {
+ e = NULL;
+ break;
+ }
+ e = &qp->s_ack_queue[p];
+ if (!e->opcode) {
+ e = NULL;
+ break;
+ }
+ if (cmp_psn(psn, e->psn) >= 0) {
+ if (p == qp->s_tail_ack_queue &&
+ cmp_psn(psn, e->lpsn) <= 0)
+ s = false;
+ break;
+ }
+ }
+ if (prev)
+ *prev = p;
+ if (prev_ack)
+ *prev_ack = i;
+ if (scheduled)
+ *scheduled = s;
+ return e;
}
/**
@@ -87,20 +111,25 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp,
struct hfi1_pkt_state *ps)
{
struct rvt_ack_entry *e;
- u32 hwords;
- u32 len;
- u32 bth0;
- u32 bth2;
+ u32 hwords, hdrlen;
+ u32 len = 0;
+ u32 bth0 = 0, bth2 = 0;
+ u32 bth1 = qp->remote_qpn | (HFI1_CAP_IS_KSET(OPFN) << IB_BTHE_E_SHIFT);
int middle = 0;
u32 pmtu = qp->pmtu;
- struct hfi1_qp_priv *priv = qp->priv;
+ struct hfi1_qp_priv *qpriv = qp->priv;
+ bool last_pkt;
+ u32 delta;
+ u8 next = qp->s_tail_ack_queue;
+ struct tid_rdma_request *req;
+ trace_hfi1_rsp_make_rc_ack(qp, 0);
lockdep_assert_held(&qp->s_lock);
/* Don't send an ACK if we aren't supposed to. */
if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK))
goto bail;
- if (priv->hdr_type == HFI1_PKT_TYPE_9B)
+ if (qpriv->hdr_type == HFI1_PKT_TYPE_9B)
/* header size in 32-bit words LRH+BTH = (8+12)/4. */
hwords = 5;
else
@@ -122,8 +151,18 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp,
* response has been sent instead of only being
* constructed.
*/
- if (++qp->s_tail_ack_queue > HFI1_MAX_RDMA_ATOMIC)
- qp->s_tail_ack_queue = 0;
+ if (++next > rvt_size_atomic(&dev->rdi))
+ next = 0;
+ /*
+ * Only advance the s_acked_ack_queue pointer if there
+ * have been no TID RDMA requests.
+ */
+ e = &qp->s_ack_queue[qp->s_tail_ack_queue];
+ if (e->opcode != TID_OP(WRITE_REQ) &&
+ qp->s_acked_ack_queue == qp->s_tail_ack_queue)
+ qp->s_acked_ack_queue = next;
+ qp->s_tail_ack_queue = next;
+ trace_hfi1_rsp_make_rc_ack(qp, e->psn);
/* FALLTHROUGH */
case OP(SEND_ONLY):
case OP(ACKNOWLEDGE):
@@ -135,6 +174,12 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp,
}
e = &qp->s_ack_queue[qp->s_tail_ack_queue];
+ /* Check for tid write fence */
+ if ((qpriv->s_flags & HFI1_R_TID_WAIT_INTERLCK) ||
+ hfi1_tid_rdma_ack_interlock(qp, e)) {
+ iowait_set_flag(&qpriv->s_iowait, IOWAIT_PENDING_IB);
+ goto bail;
+ }
if (e->opcode == OP(RDMA_READ_REQUEST)) {
/*
* If a RDMA read response is being resent and
@@ -144,6 +189,10 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp,
*/
len = e->rdma_sge.sge_length;
if (len && !e->rdma_sge.mr) {
+ if (qp->s_acked_ack_queue ==
+ qp->s_tail_ack_queue)
+ qp->s_acked_ack_queue =
+ qp->r_head_ack_queue;
qp->s_tail_ack_queue = qp->r_head_ack_queue;
goto bail;
}
@@ -165,6 +214,45 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp,
hwords++;
qp->s_ack_rdma_psn = e->psn;
bth2 = mask_psn(qp->s_ack_rdma_psn++);
+ } else if (e->opcode == TID_OP(WRITE_REQ)) {
+ /*
+ * If a TID RDMA WRITE RESP is being resent, we have to
+ * wait for the actual request. All requests that are to
+ * be resent will have their state set to
+ * TID_REQUEST_RESEND. When the new request arrives, the
+ * state will be changed to TID_REQUEST_RESEND_ACTIVE.
+ */
+ req = ack_to_tid_req(e);
+ if (req->state == TID_REQUEST_RESEND ||
+ req->state == TID_REQUEST_INIT_RESEND)
+ goto bail;
+ qp->s_ack_state = TID_OP(WRITE_RESP);
+ qp->s_ack_rdma_psn = mask_psn(e->psn + req->cur_seg);
+ goto write_resp;
+ } else if (e->opcode == TID_OP(READ_REQ)) {
+ /*
+ * If a TID RDMA read response is being resent and
+ * we haven't seen the duplicate request yet,
+ * then stop sending the remaining responses the
+ * responder has seen until the requester re-sends it.
+ */
+ len = e->rdma_sge.sge_length;
+ if (len && !e->rdma_sge.mr) {
+ if (qp->s_acked_ack_queue ==
+ qp->s_tail_ack_queue)
+ qp->s_acked_ack_queue =
+ qp->r_head_ack_queue;
+ qp->s_tail_ack_queue = qp->r_head_ack_queue;
+ goto bail;
+ }
+ /* Copy SGE state in case we need to resend */
+ ps->s_txreq->mr = e->rdma_sge.mr;
+ if (ps->s_txreq->mr)
+ rvt_get_mr(ps->s_txreq->mr);
+ qp->s_ack_rdma_sge.sge = e->rdma_sge;
+ qp->s_ack_rdma_sge.num_sge = 1;
+ qp->s_ack_state = TID_OP(READ_RESP);
+ goto read_resp;
} else {
/* COMPARE_SWAP or FETCH_ADD */
ps->s_txreq->ss = NULL;
@@ -176,6 +264,7 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp,
bth2 = mask_psn(e->psn);
e->sent = 1;
}
+ trace_hfi1_tid_write_rsp_make_rc_ack(qp);
bth0 = qp->s_ack_state << 24;
break;
@@ -202,6 +291,83 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp,
bth2 = mask_psn(qp->s_ack_rdma_psn++);
break;
+ case TID_OP(WRITE_RESP):
+write_resp:
+ /*
+ * 1. Check if RVT_S_ACK_PENDING is set. If yes,
+ * goto normal.
+ * 2. Attempt to allocate TID resources.
+ * 3. Remove RVT_S_RESP_PENDING flags from s_flags
+ * 4. If resources not available:
+ * 4.1 Set RVT_S_WAIT_TID_SPACE
+ * 4.2 Queue QP on RCD TID queue
+ * 4.3 Put QP on iowait list.
+ * 4.4 Build IB RNR NAK with appropriate timeout value
+ * 4.5 Return indication progress made.
+ * 5. If resources are available:
+ * 5.1 Program HW flow CSRs
+ * 5.2 Build TID RDMA WRITE RESP packet
+ * 5.3 If more resources needed, do 2.1 - 2.3.
+ * 5.4 Wake up next QP on RCD TID queue.
+ * 5.5 Return indication progress made.
+ */
+
+ e = &qp->s_ack_queue[qp->s_tail_ack_queue];
+ req = ack_to_tid_req(e);
+
+ /*
+ * Send scheduled RNR NAK's. RNR NAK's need to be sent at
+ * segment boundaries, not at request boundaries. Don't change
+ * s_ack_state because we are still in the middle of a request
+ */
+ if (qpriv->rnr_nak_state == TID_RNR_NAK_SEND &&
+ qp->s_tail_ack_queue == qpriv->r_tid_alloc &&
+ req->cur_seg == req->alloc_seg) {
+ qpriv->rnr_nak_state = TID_RNR_NAK_SENT;
+ goto normal_no_state;
+ }
+
+ bth2 = mask_psn(qp->s_ack_rdma_psn);
+ hdrlen = hfi1_build_tid_rdma_write_resp(qp, e, ohdr, &bth1,
+ bth2, &len,
+ &ps->s_txreq->ss);
+ if (!hdrlen)
+ return 0;
+
+ hwords += hdrlen;
+ bth0 = qp->s_ack_state << 24;
+ qp->s_ack_rdma_psn++;
+ trace_hfi1_tid_req_make_rc_ack_write(qp, 0, e->opcode, e->psn,
+ e->lpsn, req);
+ if (req->cur_seg != req->total_segs)
+ break;
+
+ e->sent = 1;
+ qp->s_ack_state = OP(RDMA_READ_RESPONSE_LAST);
+ break;
+
+ case TID_OP(READ_RESP):
+read_resp:
+ e = &qp->s_ack_queue[qp->s_tail_ack_queue];
+ ps->s_txreq->ss = &qp->s_ack_rdma_sge;
+ delta = hfi1_build_tid_rdma_read_resp(qp, e, ohdr, &bth0,
+ &bth1, &bth2, &len,
+ &last_pkt);
+ if (delta == 0)
+ goto error_qp;
+ hwords += delta;
+ if (last_pkt) {
+ e->sent = 1;
+ /*
+ * Increment qp->s_tail_ack_queue through s_ack_state
+ * transition.
+ */
+ qp->s_ack_state = OP(RDMA_READ_RESPONSE_LAST);
+ }
+ break;
+ case TID_OP(READ_REQ):
+ goto bail;
+
default:
normal:
/*
@@ -211,8 +377,7 @@ normal:
* (see above).
*/
qp->s_ack_state = OP(SEND_ONLY);
- qp->s_flags &= ~RVT_S_ACK_PENDING;
- ps->s_txreq->ss = NULL;
+normal_no_state:
if (qp->s_nak_state)
ohdr->u.aeth =
cpu_to_be32((qp->r_msn & IB_MSN_MASK) |
@@ -224,14 +389,24 @@ normal:
len = 0;
bth0 = OP(ACKNOWLEDGE) << 24;
bth2 = mask_psn(qp->s_ack_psn);
+ qp->s_flags &= ~RVT_S_ACK_PENDING;
+ ps->s_txreq->txreq.flags |= SDMA_TXREQ_F_VIP;
+ ps->s_txreq->ss = NULL;
}
qp->s_rdma_ack_cnt++;
- ps->s_txreq->sde = priv->s_sde;
+ ps->s_txreq->sde = qpriv->s_sde;
ps->s_txreq->s_cur_size = len;
ps->s_txreq->hdr_dwords = hwords;
- hfi1_make_ruc_header(qp, ohdr, bth0, bth2, middle, ps);
+ hfi1_make_ruc_header(qp, ohdr, bth0, bth1, bth2, middle, ps);
return 1;
-
+error_qp:
+ spin_unlock_irqrestore(&qp->s_lock, ps->flags);
+ spin_lock_irqsave(&qp->r_lock, ps->flags);
+ spin_lock(&qp->s_lock);
+ rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR);
+ spin_unlock(&qp->s_lock);
+ spin_unlock_irqrestore(&qp->r_lock, ps->flags);
+ spin_lock_irqsave(&qp->s_lock, ps->flags);
bail:
qp->s_ack_state = OP(ACKNOWLEDGE);
/*
@@ -258,17 +433,23 @@ int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
struct hfi1_qp_priv *priv = qp->priv;
struct hfi1_ibdev *dev = to_idev(qp->ibqp.device);
struct ib_other_headers *ohdr;
- struct rvt_sge_state *ss;
+ struct rvt_sge_state *ss = NULL;
struct rvt_swqe *wqe;
- u32 hwords;
- u32 len;
- u32 bth0 = 0;
- u32 bth2;
+ struct hfi1_swqe_priv *wpriv;
+ struct tid_rdma_request *req = NULL;
+ /* header size in 32-bit words LRH+BTH = (8+12)/4. */
+ u32 hwords = 5;
+ u32 len = 0;
+ u32 bth0 = 0, bth2 = 0;
+ u32 bth1 = qp->remote_qpn | (HFI1_CAP_IS_KSET(OPFN) << IB_BTHE_E_SHIFT);
u32 pmtu = qp->pmtu;
char newreq;
int middle = 0;
int delta;
+ struct tid_rdma_flow *flow = NULL;
+ struct tid_rdma_params *remote;
+ trace_hfi1_sender_make_rc_req(qp);
lockdep_assert_held(&qp->s_lock);
ps->s_txreq = get_txreq(ps->dev, qp);
if (!ps->s_txreq)
@@ -309,13 +490,13 @@ int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
}
clear_ahg(qp);
wqe = rvt_get_swqe_ptr(qp, qp->s_last);
- rvt_send_complete(qp, wqe, qp->s_last != qp->s_acked ?
- IB_WC_SUCCESS : IB_WC_WR_FLUSH_ERR);
+ hfi1_trdma_send_complete(qp, wqe, qp->s_last != qp->s_acked ?
+ IB_WC_SUCCESS : IB_WC_WR_FLUSH_ERR);
/* will get called again */
goto done_free_tx;
}
- if (qp->s_flags & (RVT_S_WAIT_RNR | RVT_S_WAIT_ACK))
+ if (qp->s_flags & (RVT_S_WAIT_RNR | RVT_S_WAIT_ACK | HFI1_S_WAIT_HALT))
goto bail;
if (cmp_psn(qp->s_psn, qp->s_sending_hpsn) <= 0) {
@@ -329,6 +510,7 @@ int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
/* Send a request. */
wqe = rvt_get_swqe_ptr(qp, qp->s_cur);
+check_s_state:
switch (qp->s_state) {
default:
if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_NEXT_SEND_OK))
@@ -350,9 +532,13 @@ int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
/*
* If a fence is requested, wait for previous
* RDMA read and atomic operations to finish.
+ * However, there is no need to guard against
+ * TID RDMA READ after TID RDMA READ.
*/
if ((wqe->wr.send_flags & IB_SEND_FENCE) &&
- qp->s_num_rd_atomic) {
+ qp->s_num_rd_atomic &&
+ (wqe->wr.opcode != IB_WR_TID_RDMA_READ ||
+ priv->pending_tid_r_segs < qp->s_num_rd_atomic)) {
qp->s_flags |= RVT_S_WAIT_FENCE;
goto bail;
}
@@ -397,6 +583,15 @@ int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
len = wqe->length;
ss = &qp->s_sge;
bth2 = mask_psn(qp->s_psn);
+
+ /*
+ * Interlock between various IB requests and TID RDMA
+ * if necessary.
+ */
+ if ((priv->s_flags & HFI1_S_TID_WAIT_INTERLCK) ||
+ hfi1_tid_rdma_wqe_interlock(qp, wqe))
+ goto bail;
+
switch (wqe->wr.opcode) {
case IB_WR_SEND:
case IB_WR_SEND_WITH_IMM:
@@ -473,21 +668,126 @@ no_flow_control:
qp->s_cur = 0;
break;
+ case IB_WR_TID_RDMA_WRITE:
+ if (newreq) {
+ /*
+ * Limit the number of TID RDMA WRITE requests.
+ */
+ if (atomic_read(&priv->n_tid_requests) >=
+ HFI1_TID_RDMA_WRITE_CNT)
+ goto bail;
+
+ if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT))
+ qp->s_lsn++;
+ }
+
+ hwords += hfi1_build_tid_rdma_write_req(qp, wqe, ohdr,
+ &bth1, &bth2,
+ &len);
+ ss = NULL;
+ if (priv->s_tid_cur == HFI1_QP_WQE_INVALID) {
+ priv->s_tid_cur = qp->s_cur;
+ if (priv->s_tid_tail == HFI1_QP_WQE_INVALID) {
+ priv->s_tid_tail = qp->s_cur;
+ priv->s_state = TID_OP(WRITE_RESP);
+ }
+ } else if (priv->s_tid_cur == priv->s_tid_head) {
+ struct rvt_swqe *__w;
+ struct tid_rdma_request *__r;
+
+ __w = rvt_get_swqe_ptr(qp, priv->s_tid_cur);
+ __r = wqe_to_tid_req(__w);
+
+ /*
+ * The s_tid_cur pointer is advanced to s_cur if
+ * any of the following conditions about the WQE
+ * to which s_ti_cur currently points to are
+ * satisfied:
+ * 1. The request is not a TID RDMA WRITE
+ * request,
+ * 2. The request is in the INACTIVE or
+ * COMPLETE states (TID RDMA READ requests
+ * stay at INACTIVE and TID RDMA WRITE
+ * transition to COMPLETE when done),
+ * 3. The request is in the ACTIVE or SYNC
+ * state and the number of completed
+ * segments is equal to the total segment
+ * count.
+ * (If ACTIVE, the request is waiting for
+ * ACKs. If SYNC, the request has not
+ * received any responses because it's
+ * waiting on a sync point.)
+ */
+ if (__w->wr.opcode != IB_WR_TID_RDMA_WRITE ||
+ __r->state == TID_REQUEST_INACTIVE ||
+ __r->state == TID_REQUEST_COMPLETE ||
+ ((__r->state == TID_REQUEST_ACTIVE ||
+ __r->state == TID_REQUEST_SYNC) &&
+ __r->comp_seg == __r->total_segs)) {
+ if (priv->s_tid_tail ==
+ priv->s_tid_cur &&
+ priv->s_state ==
+ TID_OP(WRITE_DATA_LAST)) {
+ priv->s_tid_tail = qp->s_cur;
+ priv->s_state =
+ TID_OP(WRITE_RESP);
+ }
+ priv->s_tid_cur = qp->s_cur;
+ }
+ /*
+ * A corner case: when the last TID RDMA WRITE
+ * request was completed, s_tid_head,
+ * s_tid_cur, and s_tid_tail all point to the
+ * same location. Other requests are posted and
+ * s_cur wraps around to the same location,
+ * where a new TID RDMA WRITE is posted. In
+ * this case, none of the indices need to be
+ * updated. However, the priv->s_state should.
+ */
+ if (priv->s_tid_tail == qp->s_cur &&
+ priv->s_state == TID_OP(WRITE_DATA_LAST))
+ priv->s_state = TID_OP(WRITE_RESP);
+ }
+ req = wqe_to_tid_req(wqe);
+ if (newreq) {
+ priv->s_tid_head = qp->s_cur;
+ priv->pending_tid_w_resp += req->total_segs;
+ atomic_inc(&priv->n_tid_requests);
+ atomic_dec(&priv->n_requests);
+ } else {
+ req->state = TID_REQUEST_RESEND;
+ req->comp_seg = delta_psn(bth2, wqe->psn);
+ /*
+ * Pull back any segments since we are going
+ * to re-receive them.
+ */
+ req->setup_head = req->clear_tail;
+ priv->pending_tid_w_resp +=
+ delta_psn(wqe->lpsn, bth2) + 1;
+ }
+
+ trace_hfi1_tid_write_sender_make_req(qp, newreq);
+ trace_hfi1_tid_req_make_req_write(qp, newreq,
+ wqe->wr.opcode,
+ wqe->psn, wqe->lpsn,
+ req);
+ if (++qp->s_cur == qp->s_size)
+ qp->s_cur = 0;
+ break;
+
case IB_WR_RDMA_READ:
/*
* Don't allow more operations to be started
* than the QP limits allow.
*/
- if (newreq) {
- if (qp->s_num_rd_atomic >=
- qp->s_max_rd_atomic) {
- qp->s_flags |= RVT_S_WAIT_RDMAR;
- goto bail;
- }
- qp->s_num_rd_atomic++;
- if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT))
- qp->s_lsn++;
+ if (qp->s_num_rd_atomic >=
+ qp->s_max_rd_atomic) {
+ qp->s_flags |= RVT_S_WAIT_RDMAR;
+ goto bail;
}
+ qp->s_num_rd_atomic++;
+ if (newreq && !(qp->s_flags & RVT_S_UNLIMITED_CREDIT))
+ qp->s_lsn++;
put_ib_reth_vaddr(
wqe->rdma_wr.remote_addr,
&ohdr->u.rc.reth);
@@ -503,23 +803,99 @@ no_flow_control:
qp->s_cur = 0;
break;
+ case IB_WR_TID_RDMA_READ:
+ trace_hfi1_tid_read_sender_make_req(qp, newreq);
+ wpriv = wqe->priv;
+ req = wqe_to_tid_req(wqe);
+ trace_hfi1_tid_req_make_req_read(qp, newreq,
+ wqe->wr.opcode,
+ wqe->psn, wqe->lpsn,
+ req);
+ delta = cmp_psn(qp->s_psn, wqe->psn);
+
+ /*
+ * Don't allow more operations to be started
+ * than the QP limits allow. We could get here under
+ * three conditions; (1) It's a new request; (2) We are
+ * sending the second or later segment of a request,
+ * but the qp->s_state is set to OP(RDMA_READ_REQUEST)
+ * when the last segment of a previous request is
+ * received just before this; (3) We are re-sending a
+ * request.
+ */
+ if (qp->s_num_rd_atomic >= qp->s_max_rd_atomic) {
+ qp->s_flags |= RVT_S_WAIT_RDMAR;
+ goto bail;
+ }
+ if (newreq) {
+ struct tid_rdma_flow *flow =
+ &req->flows[req->setup_head];
+
+ /*
+ * Set up s_sge as it is needed for TID
+ * allocation. However, if the pages have been
+ * walked and mapped, skip it. An earlier try
+ * has failed to allocate the TID entries.
+ */
+ if (!flow->npagesets) {
+ qp->s_sge.sge = wqe->sg_list[0];
+ qp->s_sge.sg_list = wqe->sg_list + 1;
+ qp->s_sge.num_sge = wqe->wr.num_sge;
+ qp->s_sge.total_len = wqe->length;
+ qp->s_len = wqe->length;
+ req->isge = 0;
+ req->clear_tail = req->setup_head;
+ req->flow_idx = req->setup_head;
+ req->state = TID_REQUEST_ACTIVE;
+ }
+ } else if (delta == 0) {
+ /* Re-send a request */
+ req->cur_seg = 0;
+ req->comp_seg = 0;
+ req->ack_pending = 0;
+ req->flow_idx = req->clear_tail;
+ req->state = TID_REQUEST_RESEND;
+ }
+ req->s_next_psn = qp->s_psn;
+ /* Read one segment at a time */
+ len = min_t(u32, req->seg_len,
+ wqe->length - req->seg_len * req->cur_seg);
+ delta = hfi1_build_tid_rdma_read_req(qp, wqe, ohdr,
+ &bth1, &bth2,
+ &len);
+ if (delta <= 0) {
+ /* Wait for TID space */
+ goto bail;
+ }
+ if (newreq && !(qp->s_flags & RVT_S_UNLIMITED_CREDIT))
+ qp->s_lsn++;
+ hwords += delta;
+ ss = &wpriv->ss;
+ /* Check if this is the last segment */
+ if (req->cur_seg >= req->total_segs &&
+ ++qp->s_cur == qp->s_size)
+ qp->s_cur = 0;
+ break;
+
case IB_WR_ATOMIC_CMP_AND_SWP:
case IB_WR_ATOMIC_FETCH_AND_ADD:
/*
* Don't allow more operations to be started
* than the QP limits allow.
*/
- if (newreq) {
- if (qp->s_num_rd_atomic >=
- qp->s_max_rd_atomic) {
- qp->s_flags |= RVT_S_WAIT_RDMAR;
- goto bail;
- }
- qp->s_num_rd_atomic++;
- if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT))
- qp->s_lsn++;
+ if (qp->s_num_rd_atomic >=
+ qp->s_max_rd_atomic) {
+ qp->s_flags |= RVT_S_WAIT_RDMAR;
+ goto bail;
}
- if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP) {
+ qp->s_num_rd_atomic++;
+
+ /* FALLTHROUGH */
+ case IB_WR_OPFN:
+ if (newreq && !(qp->s_flags & RVT_S_UNLIMITED_CREDIT))
+ qp->s_lsn++;
+ if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
+ wqe->wr.opcode == IB_WR_OPFN) {
qp->s_state = OP(COMPARE_SWAP);
put_ib_ateth_swap(wqe->atomic_wr.swap,
&ohdr->u.atomic_eth);
@@ -546,18 +922,23 @@ no_flow_control:
default:
goto bail;
}
- qp->s_sge.sge = wqe->sg_list[0];
- qp->s_sge.sg_list = wqe->sg_list + 1;
- qp->s_sge.num_sge = wqe->wr.num_sge;
- qp->s_sge.total_len = wqe->length;
- qp->s_len = wqe->length;
+ if (wqe->wr.opcode != IB_WR_TID_RDMA_READ) {
+ qp->s_sge.sge = wqe->sg_list[0];
+ qp->s_sge.sg_list = wqe->sg_list + 1;
+ qp->s_sge.num_sge = wqe->wr.num_sge;
+ qp->s_sge.total_len = wqe->length;
+ qp->s_len = wqe->length;
+ }
if (newreq) {
qp->s_tail++;
if (qp->s_tail >= qp->s_size)
qp->s_tail = 0;
}
- if (wqe->wr.opcode == IB_WR_RDMA_READ)
+ if (wqe->wr.opcode == IB_WR_RDMA_READ ||
+ wqe->wr.opcode == IB_WR_TID_RDMA_WRITE)
qp->s_psn = wqe->lpsn + 1;
+ else if (wqe->wr.opcode == IB_WR_TID_RDMA_READ)
+ qp->s_psn = req->s_next_psn;
else
qp->s_psn++;
break;
@@ -674,10 +1055,137 @@ no_flow_control:
if (qp->s_cur == qp->s_size)
qp->s_cur = 0;
break;
+
+ case TID_OP(WRITE_RESP):
+ /*
+ * This value for s_state is used for restarting a TID RDMA
+ * WRITE request. See comment in OP(RDMA_READ_RESPONSE_MIDDLE
+ * for more).
+ */
+ req = wqe_to_tid_req(wqe);
+ req->state = TID_REQUEST_RESEND;
+ rcu_read_lock();
+ remote = rcu_dereference(priv->tid_rdma.remote);
+ req->comp_seg = delta_psn(qp->s_psn, wqe->psn);
+ len = wqe->length - (req->comp_seg * remote->max_len);
+ rcu_read_unlock();
+
+ bth2 = mask_psn(qp->s_psn);
+ hwords += hfi1_build_tid_rdma_write_req(qp, wqe, ohdr, &bth1,
+ &bth2, &len);
+ qp->s_psn = wqe->lpsn + 1;
+ ss = NULL;
+ qp->s_state = TID_OP(WRITE_REQ);
+ priv->pending_tid_w_resp += delta_psn(wqe->lpsn, bth2) + 1;
+ priv->s_tid_cur = qp->s_cur;
+ if (++qp->s_cur == qp->s_size)
+ qp->s_cur = 0;
+ trace_hfi1_tid_req_make_req_write(qp, 0, wqe->wr.opcode,
+ wqe->psn, wqe->lpsn, req);
+ break;
+
+ case TID_OP(READ_RESP):
+ if (wqe->wr.opcode != IB_WR_TID_RDMA_READ)
+ goto bail;
+ /* This is used to restart a TID read request */
+ req = wqe_to_tid_req(wqe);
+ wpriv = wqe->priv;
+ /*
+ * Back down. The field qp->s_psn has been set to the psn with
+ * which the request should be restart. It's OK to use division
+ * as this is on the retry path.
+ */
+ req->cur_seg = delta_psn(qp->s_psn, wqe->psn) / priv->pkts_ps;
+
+ /*
+ * The following function need to be redefined to return the
+ * status to make sure that we find the flow. At the same
+ * time, we can use the req->state change to check if the
+ * call succeeds or not.
+ */
+ req->state = TID_REQUEST_RESEND;
+ hfi1_tid_rdma_restart_req(qp, wqe, &bth2);
+ if (req->state != TID_REQUEST_ACTIVE) {
+ /*
+ * Failed to find the flow. Release all allocated tid
+ * resources.
+ */
+ hfi1_kern_exp_rcv_clear_all(req);
+ hfi1_kern_clear_hw_flow(priv->rcd, qp);
+
+ hfi1_trdma_send_complete(qp, wqe, IB_WC_LOC_QP_OP_ERR);
+ goto bail;
+ }
+ req->state = TID_REQUEST_RESEND;
+ len = min_t(u32, req->seg_len,
+ wqe->length - req->seg_len * req->cur_seg);
+ flow = &req->flows[req->flow_idx];
+ len -= flow->sent;
+ req->s_next_psn = flow->flow_state.ib_lpsn + 1;
+ delta = hfi1_build_tid_rdma_read_packet(wqe, ohdr, &bth1,
+ &bth2, &len);
+ if (delta <= 0) {
+ /* Wait for TID space */
+ goto bail;
+ }
+ hwords += delta;
+ ss = &wpriv->ss;
+ /* Check if this is the last segment */
+ if (req->cur_seg >= req->total_segs &&
+ ++qp->s_cur == qp->s_size)
+ qp->s_cur = 0;
+ qp->s_psn = req->s_next_psn;
+ trace_hfi1_tid_req_make_req_read(qp, 0, wqe->wr.opcode,
+ wqe->psn, wqe->lpsn, req);
+ break;
+ case TID_OP(READ_REQ):
+ req = wqe_to_tid_req(wqe);
+ delta = cmp_psn(qp->s_psn, wqe->psn);
+ /*
+ * If the current WR is not TID RDMA READ, or this is the start
+ * of a new request, we need to change the qp->s_state so that
+ * the request can be set up properly.
+ */
+ if (wqe->wr.opcode != IB_WR_TID_RDMA_READ || delta == 0 ||
+ qp->s_cur == qp->s_tail) {
+ qp->s_state = OP(RDMA_READ_REQUEST);
+ if (delta == 0 || qp->s_cur == qp->s_tail)
+ goto check_s_state;
+ else
+ goto bail;
+ }
+
+ /* Rate limiting */
+ if (qp->s_num_rd_atomic >= qp->s_max_rd_atomic) {
+ qp->s_flags |= RVT_S_WAIT_RDMAR;
+ goto bail;
+ }
+
+ wpriv = wqe->priv;
+ /* Read one segment at a time */
+ len = min_t(u32, req->seg_len,
+ wqe->length - req->seg_len * req->cur_seg);
+ delta = hfi1_build_tid_rdma_read_req(qp, wqe, ohdr, &bth1,
+ &bth2, &len);
+ if (delta <= 0) {
+ /* Wait for TID space */
+ goto bail;
+ }
+ hwords += delta;
+ ss = &wpriv->ss;
+ /* Check if this is the last segment */
+ if (req->cur_seg >= req->total_segs &&
+ ++qp->s_cur == qp->s_size)
+ qp->s_cur = 0;
+ qp->s_psn = req->s_next_psn;
+ trace_hfi1_tid_req_make_req_read(qp, 0, wqe->wr.opcode,
+ wqe->psn, wqe->lpsn, req);
+ break;
}
qp->s_sending_hpsn = bth2;
delta = delta_psn(bth2, wqe->psn);
- if (delta && delta % HFI1_PSN_CREDIT == 0)
+ if (delta && delta % HFI1_PSN_CREDIT == 0 &&
+ wqe->wr.opcode != IB_WR_TID_RDMA_WRITE)
bth2 |= IB_BTH_REQ_ACK;
if (qp->s_flags & RVT_S_SEND_ONE) {
qp->s_flags &= ~RVT_S_SEND_ONE;
@@ -693,6 +1201,7 @@ no_flow_control:
qp,
ohdr,
bth0 | (qp->s_state << 24),
+ bth1,
bth2,
middle,
ps);
@@ -709,6 +1218,12 @@ bail:
bail_no_tx:
ps->s_txreq = NULL;
qp->s_flags &= ~RVT_S_BUSY;
+ /*
+ * If we didn't get a txreq, the QP will be woken up later to try
+ * again. Set the flags to indicate which work item to wake
+ * up.
+ */
+ iowait_set_flag(&priv->s_iowait, IOWAIT_PENDING_IB);
return 0;
}
@@ -796,6 +1311,11 @@ static inline void hfi1_make_rc_ack_9B(struct hfi1_packet *packet,
if (qp->s_mig_state == IB_MIG_MIGRATED)
bth0 |= IB_BTH_MIG_REQ;
bth1 = (!!is_fecn) << IB_BECN_SHIFT;
+ /*
+ * Inline ACKs go out without the use of the Verbs send engine, so
+ * we need to set the STL Verbs Extended bit here
+ */
+ bth1 |= HFI1_CAP_IS_KSET(OPFN) << IB_BTHE_E_SHIFT;
hfi1_make_bth_aeth(qp, ohdr, bth0, bth1);
}
@@ -936,6 +1456,43 @@ void hfi1_send_rc_ack(struct hfi1_packet *packet, bool is_fecn)
}
/**
+ * update_num_rd_atomic - update the qp->s_num_rd_atomic
+ * @qp: the QP
+ * @psn: the packet sequence number to restart at
+ * @wqe: the wqe
+ *
+ * This is called from reset_psn() to update qp->s_num_rd_atomic
+ * for the current wqe.
+ * Called at interrupt level with the QP s_lock held.
+ */
+static void update_num_rd_atomic(struct rvt_qp *qp, u32 psn,
+ struct rvt_swqe *wqe)
+{
+ u32 opcode = wqe->wr.opcode;
+
+ if (opcode == IB_WR_RDMA_READ ||
+ opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
+ opcode == IB_WR_ATOMIC_FETCH_AND_ADD) {
+ qp->s_num_rd_atomic++;
+ } else if (opcode == IB_WR_TID_RDMA_READ) {
+ struct tid_rdma_request *req = wqe_to_tid_req(wqe);
+ struct hfi1_qp_priv *priv = qp->priv;
+
+ if (cmp_psn(psn, wqe->lpsn) <= 0) {
+ u32 cur_seg;
+
+ cur_seg = (psn - wqe->psn) / priv->pkts_ps;
+ req->ack_pending = cur_seg - req->comp_seg;
+ priv->pending_tid_r_segs += req->ack_pending;
+ qp->s_num_rd_atomic += req->ack_pending;
+ } else {
+ priv->pending_tid_r_segs += req->total_segs;
+ qp->s_num_rd_atomic += req->total_segs;
+ }
+ }
+}
+
+/**
* reset_psn - reset the QP state to send starting from PSN
* @qp: the QP
* @psn: the packet sequence number to restart at
@@ -949,9 +1506,13 @@ static void reset_psn(struct rvt_qp *qp, u32 psn)
u32 n = qp->s_acked;
struct rvt_swqe *wqe = rvt_get_swqe_ptr(qp, n);
u32 opcode;
+ struct hfi1_qp_priv *priv = qp->priv;
lockdep_assert_held(&qp->s_lock);
qp->s_cur = n;
+ priv->pending_tid_r_segs = 0;
+ priv->pending_tid_w_resp = 0;
+ qp->s_num_rd_atomic = 0;
/*
* If we are starting the request from the beginning,
@@ -961,9 +1522,9 @@ static void reset_psn(struct rvt_qp *qp, u32 psn)
qp->s_state = OP(SEND_LAST);
goto done;
}
+ update_num_rd_atomic(qp, psn, wqe);
/* Find the work request opcode corresponding to the given PSN. */
- opcode = wqe->wr.opcode;
for (;;) {
int diff;
@@ -973,8 +1534,11 @@ static void reset_psn(struct rvt_qp *qp, u32 psn)
break;
wqe = rvt_get_swqe_ptr(qp, n);
diff = cmp_psn(psn, wqe->psn);
- if (diff < 0)
+ if (diff < 0) {
+ /* Point wqe back to the previous one*/
+ wqe = rvt_get_swqe_ptr(qp, qp->s_cur);
break;
+ }
qp->s_cur = n;
/*
* If we are starting the request from the beginning,
@@ -984,8 +1548,10 @@ static void reset_psn(struct rvt_qp *qp, u32 psn)
qp->s_state = OP(SEND_LAST);
goto done;
}
- opcode = wqe->wr.opcode;
+
+ update_num_rd_atomic(qp, psn, wqe);
}
+ opcode = wqe->wr.opcode;
/*
* Set the state to restart in the middle of a request.
@@ -1003,10 +1569,18 @@ static void reset_psn(struct rvt_qp *qp, u32 psn)
qp->s_state = OP(RDMA_READ_RESPONSE_LAST);
break;
+ case IB_WR_TID_RDMA_WRITE:
+ qp->s_state = TID_OP(WRITE_RESP);
+ break;
+
case IB_WR_RDMA_READ:
qp->s_state = OP(RDMA_READ_RESPONSE_MIDDLE);
break;
+ case IB_WR_TID_RDMA_READ:
+ qp->s_state = TID_OP(READ_RESP);
+ break;
+
default:
/*
* This case shouldn't happen since its only
@@ -1015,6 +1589,7 @@ static void reset_psn(struct rvt_qp *qp, u32 psn)
qp->s_state = OP(SEND_LAST);
}
done:
+ priv->s_flags &= ~HFI1_S_TID_WAIT_INTERLCK;
qp->s_psn = psn;
/*
* Set RVT_S_WAIT_PSN as rc_complete() may start the timer
@@ -1025,6 +1600,7 @@ done:
(cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) <= 0))
qp->s_flags |= RVT_S_WAIT_PSN;
qp->s_flags &= ~HFI1_S_AHG_VALID;
+ trace_hfi1_sender_reset_psn(qp);
}
/*
@@ -1033,18 +1609,47 @@ done:
*/
void hfi1_restart_rc(struct rvt_qp *qp, u32 psn, int wait)
{
+ struct hfi1_qp_priv *priv = qp->priv;
struct rvt_swqe *wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
struct hfi1_ibport *ibp;
lockdep_assert_held(&qp->r_lock);
lockdep_assert_held(&qp->s_lock);
+ trace_hfi1_sender_restart_rc(qp);
if (qp->s_retry == 0) {
if (qp->s_mig_state == IB_MIG_ARMED) {
hfi1_migrate_qp(qp);
qp->s_retry = qp->s_retry_cnt;
} else if (qp->s_last == qp->s_acked) {
- rvt_send_complete(qp, wqe, IB_WC_RETRY_EXC_ERR);
- rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR);
+ /*
+ * We need special handling for the OPFN request WQEs as
+ * they are not allowed to generate real user errors
+ */
+ if (wqe->wr.opcode == IB_WR_OPFN) {
+ struct hfi1_ibport *ibp =
+ to_iport(qp->ibqp.device, qp->port_num);
+ /*
+ * Call opfn_conn_reply() with capcode and
+ * remaining data as 0 to close out the
+ * current request
+ */
+ opfn_conn_reply(qp, priv->opfn.curr);
+ wqe = do_rc_completion(qp, wqe, ibp);
+ qp->s_flags &= ~RVT_S_WAIT_ACK;
+ } else {
+ trace_hfi1_tid_write_sender_restart_rc(qp, 0);
+ if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) {
+ struct tid_rdma_request *req;
+
+ req = wqe_to_tid_req(wqe);
+ hfi1_kern_exp_rcv_clear_all(req);
+ hfi1_kern_clear_hw_flow(priv->rcd, qp);
+ }
+
+ hfi1_trdma_send_complete(qp, wqe,
+ IB_WC_RETRY_EXC_ERR);
+ rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR);
+ }
return;
} else { /* need to handle delayed completion */
return;
@@ -1054,14 +1659,15 @@ void hfi1_restart_rc(struct rvt_qp *qp, u32 psn, int wait)
}
ibp = to_iport(qp->ibqp.device, qp->port_num);
- if (wqe->wr.opcode == IB_WR_RDMA_READ)
+ if (wqe->wr.opcode == IB_WR_RDMA_READ ||
+ wqe->wr.opcode == IB_WR_TID_RDMA_READ)
ibp->rvp.n_rc_resends++;
else
ibp->rvp.n_rc_resends += delta_psn(qp->s_psn, psn);
qp->s_flags &= ~(RVT_S_WAIT_FENCE | RVT_S_WAIT_RDMAR |
RVT_S_WAIT_SSN_CREDIT | RVT_S_WAIT_PSN |
- RVT_S_WAIT_ACK);
+ RVT_S_WAIT_ACK | HFI1_S_WAIT_TID_RESP);
if (wait)
qp->s_flags |= RVT_S_SEND_ONE;
reset_psn(qp, psn);
@@ -1069,7 +1675,8 @@ void hfi1_restart_rc(struct rvt_qp *qp, u32 psn, int wait)
/*
* Set qp->s_sending_psn to the next PSN after the given one.
- * This would be psn+1 except when RDMA reads are present.
+ * This would be psn+1 except when RDMA reads or TID RDMA ops
+ * are present.
*/
static void reset_sending_psn(struct rvt_qp *qp, u32 psn)
{
@@ -1081,7 +1688,9 @@ static void reset_sending_psn(struct rvt_qp *qp, u32 psn)
for (;;) {
wqe = rvt_get_swqe_ptr(qp, n);
if (cmp_psn(psn, wqe->lpsn) <= 0) {
- if (wqe->wr.opcode == IB_WR_RDMA_READ)
+ if (wqe->wr.opcode == IB_WR_RDMA_READ ||
+ wqe->wr.opcode == IB_WR_TID_RDMA_READ ||
+ wqe->wr.opcode == IB_WR_TID_RDMA_WRITE)
qp->s_sending_psn = wqe->lpsn + 1;
else
qp->s_sending_psn = psn + 1;
@@ -1104,8 +1713,9 @@ void hfi1_rc_send_complete(struct rvt_qp *qp, struct hfi1_opa_header *opah)
struct rvt_swqe *wqe;
struct ib_header *hdr = NULL;
struct hfi1_16b_header *hdr_16b = NULL;
- u32 opcode;
+ u32 opcode, head, tail;
u32 psn;
+ struct tid_rdma_request *req;
lockdep_assert_held(&qp->s_lock);
if (!(ib_rvt_state_ops[qp->state] & RVT_SEND_OR_FLUSH_OR_RECV_OK))
@@ -1130,25 +1740,85 @@ void hfi1_rc_send_complete(struct rvt_qp *qp, struct hfi1_opa_header *opah)
}
opcode = ib_bth_get_opcode(ohdr);
- if (opcode >= OP(RDMA_READ_RESPONSE_FIRST) &&
- opcode <= OP(ATOMIC_ACKNOWLEDGE)) {
+ if ((opcode >= OP(RDMA_READ_RESPONSE_FIRST) &&
+ opcode <= OP(ATOMIC_ACKNOWLEDGE)) ||
+ opcode == TID_OP(READ_RESP) ||
+ opcode == TID_OP(WRITE_RESP)) {
WARN_ON(!qp->s_rdma_ack_cnt);
qp->s_rdma_ack_cnt--;
return;
}
psn = ib_bth_get_psn(ohdr);
- reset_sending_psn(qp, psn);
+ /*
+ * Don't attempt to reset the sending PSN for packets in the
+ * KDETH PSN space since the PSN does not match anything.
+ */
+ if (opcode != TID_OP(WRITE_DATA) &&
+ opcode != TID_OP(WRITE_DATA_LAST) &&
+ opcode != TID_OP(ACK) && opcode != TID_OP(RESYNC))
+ reset_sending_psn(qp, psn);
+
+ /* Handle TID RDMA WRITE packets differently */
+ if (opcode >= TID_OP(WRITE_REQ) &&
+ opcode <= TID_OP(WRITE_DATA_LAST)) {
+ head = priv->s_tid_head;
+ tail = priv->s_tid_cur;
+ /*
+ * s_tid_cur is set to s_tid_head in the case, where
+ * a new TID RDMA request is being started and all
+ * previous ones have been completed.
+ * Therefore, we need to do a secondary check in order
+ * to properly determine whether we should start the
+ * RC timer.
+ */
+ wqe = rvt_get_swqe_ptr(qp, tail);
+ req = wqe_to_tid_req(wqe);
+ if (head == tail && req->comp_seg < req->total_segs) {
+ if (tail == 0)
+ tail = qp->s_size - 1;
+ else
+ tail -= 1;
+ }
+ } else {
+ head = qp->s_tail;
+ tail = qp->s_acked;
+ }
/*
* Start timer after a packet requesting an ACK has been sent and
* there are still requests that haven't been acked.
*/
- if ((psn & IB_BTH_REQ_ACK) && qp->s_acked != qp->s_tail &&
+ if ((psn & IB_BTH_REQ_ACK) && tail != head &&
+ opcode != TID_OP(WRITE_DATA) && opcode != TID_OP(WRITE_DATA_LAST) &&
+ opcode != TID_OP(RESYNC) &&
!(qp->s_flags &
- (RVT_S_TIMER | RVT_S_WAIT_RNR | RVT_S_WAIT_PSN)) &&
- (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK))
- rvt_add_retry_timer(qp);
+ (RVT_S_TIMER | RVT_S_WAIT_RNR | RVT_S_WAIT_PSN)) &&
+ (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) {
+ if (opcode == TID_OP(READ_REQ))
+ rvt_add_retry_timer_ext(qp, priv->timeout_shift);
+ else
+ rvt_add_retry_timer(qp);
+ }
+
+ /* Start TID RDMA ACK timer */
+ if ((opcode == TID_OP(WRITE_DATA) ||
+ opcode == TID_OP(WRITE_DATA_LAST) ||
+ opcode == TID_OP(RESYNC)) &&
+ (psn & IB_BTH_REQ_ACK) &&
+ !(priv->s_flags & HFI1_S_TID_RETRY_TIMER) &&
+ (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) {
+ /*
+ * The TID RDMA ACK packet could be received before this
+ * function is called. Therefore, add the timer only if TID
+ * RDMA ACK packets are actually pending.
+ */
+ wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
+ req = wqe_to_tid_req(wqe);
+ if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE &&
+ req->ack_seg < req->cur_seg)
+ hfi1_add_tid_retry_timer(qp);
+ }
while (qp->s_last != qp->s_acked) {
u32 s_last;
@@ -1157,6 +1827,7 @@ void hfi1_rc_send_complete(struct rvt_qp *qp, struct hfi1_opa_header *opah)
if (cmp_psn(wqe->lpsn, qp->s_sending_psn) >= 0 &&
cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) <= 0)
break;
+ trdma_clean_swqe(qp, wqe);
rvt_qp_wqe_unreserve(qp, wqe);
s_last = qp->s_last;
trace_hfi1_qp_send_completion(qp, wqe, s_last);
@@ -1195,20 +1866,24 @@ static inline void update_last_psn(struct rvt_qp *qp, u32 psn)
* This is similar to hfi1_send_complete but has to check to be sure
* that the SGEs are not being referenced if the SWQE is being resent.
*/
-static struct rvt_swqe *do_rc_completion(struct rvt_qp *qp,
- struct rvt_swqe *wqe,
- struct hfi1_ibport *ibp)
+struct rvt_swqe *do_rc_completion(struct rvt_qp *qp,
+ struct rvt_swqe *wqe,
+ struct hfi1_ibport *ibp)
{
+ struct hfi1_qp_priv *priv = qp->priv;
+
lockdep_assert_held(&qp->s_lock);
/*
* Don't decrement refcount and don't generate a
* completion if the SWQE is being resent until the send
* is finished.
*/
+ trace_hfi1_rc_completion(qp, wqe->lpsn);
if (cmp_psn(wqe->lpsn, qp->s_sending_psn) < 0 ||
cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) > 0) {
u32 s_last;
+ trdma_clean_swqe(qp, wqe);
rvt_put_swqe(wqe);
rvt_qp_wqe_unreserve(qp, wqe);
s_last = qp->s_last;
@@ -1243,7 +1918,16 @@ static struct rvt_swqe *do_rc_completion(struct rvt_qp *qp,
}
qp->s_retry = qp->s_retry_cnt;
- update_last_psn(qp, wqe->lpsn);
+ /*
+ * Don't update the last PSN if the request being completed is
+ * a TID RDMA WRITE request.
+ * Completion of the TID RDMA WRITE requests are done by the
+ * TID RDMA ACKs and as such could be for a request that has
+ * already been ACKed as far as the IB state machine is
+ * concerned.
+ */
+ if (wqe->wr.opcode != IB_WR_TID_RDMA_WRITE)
+ update_last_psn(qp, wqe->lpsn);
/*
* If we are completing a request which is in the process of
@@ -1266,9 +1950,61 @@ static struct rvt_swqe *do_rc_completion(struct rvt_qp *qp,
qp->s_draining = 0;
wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
}
+ if (priv->s_flags & HFI1_S_TID_WAIT_INTERLCK) {
+ priv->s_flags &= ~HFI1_S_TID_WAIT_INTERLCK;
+ hfi1_schedule_send(qp);
+ }
return wqe;
}
+static void set_restart_qp(struct rvt_qp *qp, struct hfi1_ctxtdata *rcd)
+{
+ /* Retry this request. */
+ if (!(qp->r_flags & RVT_R_RDMAR_SEQ)) {
+ qp->r_flags |= RVT_R_RDMAR_SEQ;
+ hfi1_restart_rc(qp, qp->s_last_psn + 1, 0);
+ if (list_empty(&qp->rspwait)) {
+ qp->r_flags |= RVT_R_RSP_SEND;
+ rvt_get_qp(qp);
+ list_add_tail(&qp->rspwait, &rcd->qp_wait_list);
+ }
+ }
+}
+
+/**
+ * update_qp_retry_state - Update qp retry state.
+ * @qp: the QP
+ * @psn: the packet sequence number of the TID RDMA WRITE RESP.
+ * @spsn: The start psn for the given TID RDMA WRITE swqe.
+ * @lpsn: The last psn for the given TID RDMA WRITE swqe.
+ *
+ * This function is called to update the qp retry state upon
+ * receiving a TID WRITE RESP after the qp is scheduled to retry
+ * a request.
+ */
+static void update_qp_retry_state(struct rvt_qp *qp, u32 psn, u32 spsn,
+ u32 lpsn)
+{
+ struct hfi1_qp_priv *qpriv = qp->priv;
+
+ qp->s_psn = psn + 1;
+ /*
+ * If this is the first TID RDMA WRITE RESP packet for the current
+ * request, change the s_state so that the retry will be processed
+ * correctly. Similarly, if this is the last TID RDMA WRITE RESP
+ * packet, change the s_state and advance the s_cur.
+ */
+ if (cmp_psn(psn, lpsn) >= 0) {
+ qp->s_cur = qpriv->s_tid_cur + 1;
+ if (qp->s_cur >= qp->s_size)
+ qp->s_cur = 0;
+ qp->s_state = TID_OP(WRITE_REQ);
+ } else if (!cmp_psn(psn, spsn)) {
+ qp->s_cur = qpriv->s_tid_cur;
+ qp->s_state = TID_OP(WRITE_RESP);
+ }
+}
+
/**
* do_rc_ack - process an incoming RC ACK
* @qp: the QP the ACK came in on
@@ -1280,15 +2016,17 @@ static struct rvt_swqe *do_rc_completion(struct rvt_qp *qp,
* May be called at interrupt level, with the QP s_lock held.
* Returns 1 if OK, 0 if current operation should be aborted (NAK).
*/
-static int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode,
- u64 val, struct hfi1_ctxtdata *rcd)
+int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode,
+ u64 val, struct hfi1_ctxtdata *rcd)
{
struct hfi1_ibport *ibp;
enum ib_wc_status status;
+ struct hfi1_qp_priv *qpriv = qp->priv;
struct rvt_swqe *wqe;
int ret = 0;
u32 ack_psn;
int diff;
+ struct rvt_dev_info *rdi;
lockdep_assert_held(&qp->s_lock);
/*
@@ -1331,20 +2069,14 @@ static int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode,
*/
if ((wqe->wr.opcode == IB_WR_RDMA_READ &&
(opcode != OP(RDMA_READ_RESPONSE_LAST) || diff != 0)) ||
+ (wqe->wr.opcode == IB_WR_TID_RDMA_READ &&
+ (opcode != TID_OP(READ_RESP) || diff != 0)) ||
((wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) &&
- (opcode != OP(ATOMIC_ACKNOWLEDGE) || diff != 0))) {
- /* Retry this request. */
- if (!(qp->r_flags & RVT_R_RDMAR_SEQ)) {
- qp->r_flags |= RVT_R_RDMAR_SEQ;
- hfi1_restart_rc(qp, qp->s_last_psn + 1, 0);
- if (list_empty(&qp->rspwait)) {
- qp->r_flags |= RVT_R_RSP_SEND;
- rvt_get_qp(qp);
- list_add_tail(&qp->rspwait,
- &rcd->qp_wait_list);
- }
- }
+ (opcode != OP(ATOMIC_ACKNOWLEDGE) || diff != 0)) ||
+ (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE &&
+ (delta_psn(psn, qp->s_last_psn) != 1))) {
+ set_restart_qp(qp, rcd);
/*
* No need to process the ACK/NAK since we are
* restarting an earlier request.
@@ -1356,6 +2088,9 @@ static int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode,
u64 *vaddr = wqe->sg_list[0].vaddr;
*vaddr = val;
}
+ if (wqe->wr.opcode == IB_WR_OPFN)
+ opfn_conn_reply(qp, val);
+
if (qp->s_num_rd_atomic &&
(wqe->wr.opcode == IB_WR_RDMA_READ ||
wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
@@ -1373,26 +2108,85 @@ static int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode,
hfi1_schedule_send(qp);
}
}
+
+ /*
+ * TID RDMA WRITE requests will be completed by the TID RDMA
+ * ACK packet handler (see tid_rdma.c).
+ */
+ if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE)
+ break;
+
wqe = do_rc_completion(qp, wqe, ibp);
if (qp->s_acked == qp->s_tail)
break;
}
+ trace_hfi1_rc_ack_do(qp, aeth, psn, wqe);
+ trace_hfi1_sender_do_rc_ack(qp);
switch (aeth >> IB_AETH_NAK_SHIFT) {
case 0: /* ACK */
this_cpu_inc(*ibp->rvp.rc_acks);
- if (qp->s_acked != qp->s_tail) {
- /*
- * We are expecting more ACKs so
- * mod the retry timer.
- */
- rvt_mod_retry_timer(qp);
+ if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) {
+ if (wqe_to_tid_req(wqe)->ack_pending)
+ rvt_mod_retry_timer_ext(qp,
+ qpriv->timeout_shift);
+ else
+ rvt_stop_rc_timers(qp);
+ } else if (qp->s_acked != qp->s_tail) {
+ struct rvt_swqe *__w = NULL;
+
+ if (qpriv->s_tid_cur != HFI1_QP_WQE_INVALID)
+ __w = rvt_get_swqe_ptr(qp, qpriv->s_tid_cur);
+
/*
- * We can stop re-sending the earlier packets and
- * continue with the next packet the receiver wants.
+ * Stop timers if we've received all of the TID RDMA
+ * WRITE * responses.
*/
- if (cmp_psn(qp->s_psn, psn) <= 0)
- reset_psn(qp, psn + 1);
+ if (__w && __w->wr.opcode == IB_WR_TID_RDMA_WRITE &&
+ opcode == TID_OP(WRITE_RESP)) {
+ /*
+ * Normally, the loop above would correctly
+ * process all WQEs from s_acked onward and
+ * either complete them or check for correct
+ * PSN sequencing.
+ * However, for TID RDMA, due to pipelining,
+ * the response may not be for the request at
+ * s_acked so the above look would just be
+ * skipped. This does not allow for checking
+ * the PSN sequencing. It has to be done
+ * separately.
+ */
+ if (cmp_psn(psn, qp->s_last_psn + 1)) {
+ set_restart_qp(qp, rcd);
+ goto bail_stop;
+ }
+ /*
+ * If the psn is being resent, stop the
+ * resending.
+ */
+ if (qp->s_cur != qp->s_tail &&
+ cmp_psn(qp->s_psn, psn) <= 0)
+ update_qp_retry_state(qp, psn,
+ __w->psn,
+ __w->lpsn);
+ else if (--qpriv->pending_tid_w_resp)
+ rvt_mod_retry_timer(qp);
+ else
+ rvt_stop_rc_timers(qp);
+ } else {
+ /*
+ * We are expecting more ACKs so
+ * mod the retry timer.
+ */
+ rvt_mod_retry_timer(qp);
+ /*
+ * We can stop re-sending the earlier packets
+ * and continue with the next packet the
+ * receiver wants.
+ */
+ if (cmp_psn(qp->s_psn, psn) <= 0)
+ reset_psn(qp, psn + 1);
+ }
} else {
/* No more acks - kill all timers */
rvt_stop_rc_timers(qp);
@@ -1408,6 +2202,15 @@ static int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode,
rvt_get_credit(qp, aeth);
qp->s_rnr_retry = qp->s_rnr_retry_cnt;
qp->s_retry = qp->s_retry_cnt;
+ /*
+ * If the current request is a TID RDMA WRITE request and the
+ * response is not a TID RDMA WRITE RESP packet, s_last_psn
+ * can't be advanced.
+ */
+ if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE &&
+ opcode != TID_OP(WRITE_RESP) &&
+ cmp_psn(psn, wqe->psn) >= 0)
+ return 1;
update_last_psn(qp, psn);
return 1;
@@ -1417,20 +2220,31 @@ static int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode,
goto bail_stop;
if (qp->s_flags & RVT_S_WAIT_RNR)
goto bail_stop;
- if (qp->s_rnr_retry == 0) {
+ rdi = ib_to_rvt(qp->ibqp.device);
+ if (qp->s_rnr_retry == 0 &&
+ !((rdi->post_parms[wqe->wr.opcode].flags &
+ RVT_OPERATION_IGN_RNR_CNT) &&
+ qp->s_rnr_retry_cnt == 0)) {
status = IB_WC_RNR_RETRY_EXC_ERR;
goto class_b;
}
- if (qp->s_rnr_retry_cnt < 7)
+ if (qp->s_rnr_retry_cnt < 7 && qp->s_rnr_retry_cnt > 0)
qp->s_rnr_retry--;
- /* The last valid PSN is the previous PSN. */
- update_last_psn(qp, psn - 1);
+ /*
+ * The last valid PSN is the previous PSN. For TID RDMA WRITE
+ * request, s_last_psn should be incremented only when a TID
+ * RDMA WRITE RESP is received to avoid skipping lost TID RDMA
+ * WRITE RESP packets.
+ */
+ if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE) {
+ reset_psn(qp, qp->s_last_psn + 1);
+ } else {
+ update_last_psn(qp, psn - 1);
+ reset_psn(qp, psn);
+ }
ibp->rvp.n_rc_resends += delta_psn(qp->s_psn, psn);
-
- reset_psn(qp, psn);
-
qp->s_flags &= ~(RVT_S_WAIT_SSN_CREDIT | RVT_S_WAIT_ACK);
rvt_stop_rc_timers(qp);
rvt_add_rnr_timer(qp, aeth);
@@ -1470,7 +2284,10 @@ static int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode,
ibp->rvp.n_other_naks++;
class_b:
if (qp->s_last == qp->s_acked) {
- rvt_send_complete(qp, wqe, status);
+ if (wqe->wr.opcode == IB_WR_TID_RDMA_READ)
+ hfi1_kern_read_tid_flow_free(qp);
+
+ hfi1_trdma_send_complete(qp, wqe, status);
rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR);
}
break;
@@ -1511,6 +2328,8 @@ static void rdma_seq_err(struct rvt_qp *qp, struct hfi1_ibport *ibp, u32 psn,
while (cmp_psn(psn, wqe->lpsn) > 0) {
if (wqe->wr.opcode == IB_WR_RDMA_READ ||
+ wqe->wr.opcode == IB_WR_TID_RDMA_READ ||
+ wqe->wr.opcode == IB_WR_TID_RDMA_WRITE ||
wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD)
break;
@@ -1717,16 +2536,6 @@ bail:
return;
}
-static inline void rc_defered_ack(struct hfi1_ctxtdata *rcd,
- struct rvt_qp *qp)
-{
- if (list_empty(&qp->rspwait)) {
- qp->r_flags |= RVT_R_RSP_NAK;
- rvt_get_qp(qp);
- list_add_tail(&qp->rspwait, &rcd->qp_wait_list);
- }
-}
-
static inline void rc_cancel_ack(struct rvt_qp *qp)
{
qp->r_adefered = 0;
@@ -1759,8 +2568,9 @@ static noinline int rc_rcv_error(struct ib_other_headers *ohdr, void *data,
struct hfi1_ibport *ibp = rcd_to_iport(rcd);
struct rvt_ack_entry *e;
unsigned long flags;
- u8 i, prev;
- int old_req;
+ u8 prev;
+ u8 mra; /* most recent ACK */
+ bool old_req;
trace_hfi1_rcv_error(qp, psn);
if (diff > 0) {
@@ -1806,29 +2616,8 @@ static noinline int rc_rcv_error(struct ib_other_headers *ohdr, void *data,
spin_lock_irqsave(&qp->s_lock, flags);
- for (i = qp->r_head_ack_queue; ; i = prev) {
- if (i == qp->s_tail_ack_queue)
- old_req = 0;
- if (i)
- prev = i - 1;
- else
- prev = HFI1_MAX_RDMA_ATOMIC;
- if (prev == qp->r_head_ack_queue) {
- e = NULL;
- break;
- }
- e = &qp->s_ack_queue[prev];
- if (!e->opcode) {
- e = NULL;
- break;
- }
- if (cmp_psn(psn, e->psn) >= 0) {
- if (prev == qp->s_tail_ack_queue &&
- cmp_psn(psn, e->lpsn) <= 0)
- old_req = 0;
- break;
- }
- }
+ e = find_prev_entry(qp, psn, &prev, &mra, &old_req);
+
switch (opcode) {
case OP(RDMA_READ_REQUEST): {
struct ib_reth *reth;
@@ -1875,6 +2664,8 @@ static noinline int rc_rcv_error(struct ib_other_headers *ohdr, void *data,
e->psn = psn;
if (old_req)
goto unlock_done;
+ if (qp->s_acked_ack_queue == qp->s_tail_ack_queue)
+ qp->s_acked_ack_queue = prev;
qp->s_tail_ack_queue = prev;
break;
}
@@ -1888,6 +2679,8 @@ static noinline int rc_rcv_error(struct ib_other_headers *ohdr, void *data,
*/
if (!e || e->opcode != (u8)opcode || old_req)
goto unlock_done;
+ if (qp->s_tail_ack_queue == qp->s_acked_ack_queue)
+ qp->s_acked_ack_queue = prev;
qp->s_tail_ack_queue = prev;
break;
}
@@ -1903,7 +2696,7 @@ static noinline int rc_rcv_error(struct ib_other_headers *ohdr, void *data,
* Resend the most recent ACK if this request is
* after all the previous RDMA reads and atomics.
*/
- if (i == qp->r_head_ack_queue) {
+ if (mra == qp->r_head_ack_queue) {
spin_unlock_irqrestore(&qp->s_lock, flags);
qp->r_nak_state = 0;
qp->r_ack_psn = qp->r_psn - 1;
@@ -1914,7 +2707,9 @@ static noinline int rc_rcv_error(struct ib_other_headers *ohdr, void *data,
* Resend the RDMA read or atomic op which
* ACKs this duplicate request.
*/
- qp->s_tail_ack_queue = i;
+ if (qp->s_tail_ack_queue == qp->s_acked_ack_queue)
+ qp->s_acked_ack_queue = mra;
+ qp->s_tail_ack_queue = mra;
break;
}
qp->s_ack_state = OP(ACKNOWLEDGE);
@@ -1931,17 +2726,6 @@ send_ack:
return 0;
}
-static inline void update_ack_queue(struct rvt_qp *qp, unsigned n)
-{
- unsigned next;
-
- next = n + 1;
- if (next > HFI1_MAX_RDMA_ATOMIC)
- next = 0;
- qp->s_tail_ack_queue = next;
- qp->s_ack_state = OP(ACKNOWLEDGE);
-}
-
static void log_cca_event(struct hfi1_pportdata *ppd, u8 sl, u32 rlid,
u32 lqpn, u32 rqpn, u8 svc_type)
{
@@ -2039,6 +2823,7 @@ void hfi1_rc_rcv(struct hfi1_packet *packet)
void *data = packet->payload;
u32 tlen = packet->tlen;
struct rvt_qp *qp = packet->qp;
+ struct hfi1_qp_priv *qpriv = qp->priv;
struct hfi1_ibport *ibp = rcd_to_iport(rcd);
struct ib_other_headers *ohdr = packet->ohdr;
u32 opcode = packet->opcode;
@@ -2061,6 +2846,7 @@ void hfi1_rc_rcv(struct hfi1_packet *packet)
return;
fecn = process_ecn(qp, packet);
+ opfn_trigger_conn_request(qp, be32_to_cpu(ohdr->bth[1]));
/*
* Process responses (ACKs) before anything else. Note that the
@@ -2292,11 +3078,11 @@ send_last:
if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_READ)))
goto nack_inv;
next = qp->r_head_ack_queue + 1;
- /* s_ack_queue is size HFI1_MAX_RDMA_ATOMIC+1 so use > not >= */
- if (next > HFI1_MAX_RDMA_ATOMIC)
+ /* s_ack_queue is size rvt_size_atomic()+1 so use > not >= */
+ if (next > rvt_size_atomic(ib_to_rvt(qp->ibqp.device)))
next = 0;
spin_lock_irqsave(&qp->s_lock, flags);
- if (unlikely(next == qp->s_tail_ack_queue)) {
+ if (unlikely(next == qp->s_acked_ack_queue)) {
if (!qp->s_ack_queue[next].sent)
goto nack_inv_unlck;
update_ack_queue(qp, next);
@@ -2343,6 +3129,7 @@ send_last:
qp->r_state = opcode;
qp->r_nak_state = 0;
qp->r_head_ack_queue = next;
+ qpriv->r_tid_alloc = qp->r_head_ack_queue;
/* Schedule the send engine. */
qp->s_flags |= RVT_S_RESP_PENDING;
@@ -2356,21 +3143,24 @@ send_last:
case OP(COMPARE_SWAP):
case OP(FETCH_ADD): {
- struct ib_atomic_eth *ateth;
+ struct ib_atomic_eth *ateth = &ohdr->u.atomic_eth;
+ u64 vaddr = get_ib_ateth_vaddr(ateth);
+ bool opfn = opcode == OP(COMPARE_SWAP) &&
+ vaddr == HFI1_VERBS_E_ATOMIC_VADDR;
struct rvt_ack_entry *e;
- u64 vaddr;
atomic64_t *maddr;
u64 sdata;
u32 rkey;
u8 next;
- if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC)))
+ if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC) &&
+ !opfn))
goto nack_inv;
next = qp->r_head_ack_queue + 1;
- if (next > HFI1_MAX_RDMA_ATOMIC)
+ if (next > rvt_size_atomic(ib_to_rvt(qp->ibqp.device)))
next = 0;
spin_lock_irqsave(&qp->s_lock, flags);
- if (unlikely(next == qp->s_tail_ack_queue)) {
+ if (unlikely(next == qp->s_acked_ack_queue)) {
if (!qp->s_ack_queue[next].sent)
goto nack_inv_unlck;
update_ack_queue(qp, next);
@@ -2380,8 +3170,11 @@ send_last:
rvt_put_mr(e->rdma_sge.mr);
e->rdma_sge.mr = NULL;
}
- ateth = &ohdr->u.atomic_eth;
- vaddr = get_ib_ateth_vaddr(ateth);
+ /* Process OPFN special virtual address */
+ if (opfn) {
+ opfn_conn_response(qp, e, ateth);
+ goto ack;
+ }
if (unlikely(vaddr & (sizeof(u64) - 1)))
goto nack_inv_unlck;
rkey = be32_to_cpu(ateth->rkey);
@@ -2400,6 +3193,7 @@ send_last:
sdata);
rvt_put_mr(qp->r_sge.sge.mr);
qp->r_sge.num_sge = 0;
+ack:
e->opcode = opcode;
e->sent = 0;
e->psn = psn;
@@ -2409,6 +3203,7 @@ send_last:
qp->r_state = opcode;
qp->r_nak_state = 0;
qp->r_head_ack_queue = next;
+ qpriv->r_tid_alloc = qp->r_head_ack_queue;
/* Schedule the send engine. */
qp->s_flags |= RVT_S_RESP_PENDING;
diff --git a/drivers/infiniband/hw/hfi1/rc.h b/drivers/infiniband/hw/hfi1/rc.h
new file mode 100644
index 000000000000..8e0935b9bf2a
--- /dev/null
+++ b/drivers/infiniband/hw/hfi1/rc.h
@@ -0,0 +1,51 @@
+/* SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) */
+/*
+ * Copyright(c) 2018 Intel Corporation.
+ *
+ */
+
+#ifndef HFI1_RC_H
+#define HFI1_RC_H
+
+/* cut down ridiculously long IB macro names */
+#define OP(x) IB_OPCODE_RC_##x
+
+static inline void update_ack_queue(struct rvt_qp *qp, unsigned int n)
+{
+ unsigned int next;
+
+ next = n + 1;
+ if (next > rvt_size_atomic(ib_to_rvt(qp->ibqp.device)))
+ next = 0;
+ qp->s_tail_ack_queue = next;
+ qp->s_acked_ack_queue = next;
+ qp->s_ack_state = OP(ACKNOWLEDGE);
+}
+
+static inline void rc_defered_ack(struct hfi1_ctxtdata *rcd,
+ struct rvt_qp *qp)
+{
+ if (list_empty(&qp->rspwait)) {
+ qp->r_flags |= RVT_R_RSP_NAK;
+ rvt_get_qp(qp);
+ list_add_tail(&qp->rspwait, &rcd->qp_wait_list);
+ }
+}
+
+static inline u32 restart_sge(struct rvt_sge_state *ss, struct rvt_swqe *wqe,
+ u32 psn, u32 pmtu)
+{
+ u32 len;
+
+ len = delta_psn(psn, wqe->psn) * pmtu;
+ return rvt_restart_sge(ss, wqe, len);
+}
+
+struct rvt_ack_entry *find_prev_entry(struct rvt_qp *qp, u32 psn, u8 *prev,
+ u8 *prev_ack, bool *scheduled);
+int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode, u64 val,
+ struct hfi1_ctxtdata *rcd);
+struct rvt_swqe *do_rc_completion(struct rvt_qp *qp, struct rvt_swqe *wqe,
+ struct hfi1_ibport *ibp);
+
+#endif /* HFI1_RC_H */
diff --git a/drivers/infiniband/hw/hfi1/ruc.c b/drivers/infiniband/hw/hfi1/ruc.c
index 7fb317c711df..124a3ec1e15c 100644
--- a/drivers/infiniband/hw/hfi1/ruc.c
+++ b/drivers/infiniband/hw/hfi1/ruc.c
@@ -250,7 +250,6 @@ static inline void hfi1_make_ruc_bth(struct rvt_qp *qp,
struct ib_other_headers *ohdr,
u32 bth0, u32 bth1, u32 bth2)
{
- bth1 |= qp->remote_qpn;
ohdr->bth[0] = cpu_to_be32(bth0);
ohdr->bth[1] = cpu_to_be32(bth1);
ohdr->bth[2] = cpu_to_be32(bth2);
@@ -272,13 +271,13 @@ static inline void hfi1_make_ruc_bth(struct rvt_qp *qp,
*/
static inline void hfi1_make_ruc_header_16B(struct rvt_qp *qp,
struct ib_other_headers *ohdr,
- u32 bth0, u32 bth2, int middle,
+ u32 bth0, u32 bth1, u32 bth2,
+ int middle,
struct hfi1_pkt_state *ps)
{
struct hfi1_qp_priv *priv = qp->priv;
struct hfi1_ibport *ibp = ps->ibp;
struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
- u32 bth1 = 0;
u32 slid;
u16 pkey = hfi1_get_pkey(ibp, qp->s_pkey_index);
u8 l4 = OPA_16B_L4_IB_LOCAL;
@@ -360,12 +359,12 @@ static inline void hfi1_make_ruc_header_16B(struct rvt_qp *qp,
*/
static inline void hfi1_make_ruc_header_9B(struct rvt_qp *qp,
struct ib_other_headers *ohdr,
- u32 bth0, u32 bth2, int middle,
+ u32 bth0, u32 bth1, u32 bth2,
+ int middle,
struct hfi1_pkt_state *ps)
{
struct hfi1_qp_priv *priv = qp->priv;
struct hfi1_ibport *ibp = ps->ibp;
- u32 bth1 = 0;
u16 pkey = hfi1_get_pkey(ibp, qp->s_pkey_index);
u16 lrh0 = HFI1_LRH_BTH;
u8 extra_bytes = -ps->s_txreq->s_cur_size & 3;
@@ -415,7 +414,7 @@ static inline void hfi1_make_ruc_header_9B(struct rvt_qp *qp,
typedef void (*hfi1_make_ruc_hdr)(struct rvt_qp *qp,
struct ib_other_headers *ohdr,
- u32 bth0, u32 bth2, int middle,
+ u32 bth0, u32 bth1, u32 bth2, int middle,
struct hfi1_pkt_state *ps);
/* We support only two types - 9B and 16B for now */
@@ -425,7 +424,7 @@ static const hfi1_make_ruc_hdr hfi1_ruc_header_tbl[2] = {
};
void hfi1_make_ruc_header(struct rvt_qp *qp, struct ib_other_headers *ohdr,
- u32 bth0, u32 bth2, int middle,
+ u32 bth0, u32 bth1, u32 bth2, int middle,
struct hfi1_pkt_state *ps)
{
struct hfi1_qp_priv *priv = qp->priv;
@@ -446,18 +445,21 @@ void hfi1_make_ruc_header(struct rvt_qp *qp, struct ib_other_headers *ohdr,
priv->s_ahg->ahgidx = 0;
/* Make the appropriate header */
- hfi1_ruc_header_tbl[priv->hdr_type](qp, ohdr, bth0, bth2, middle, ps);
+ hfi1_ruc_header_tbl[priv->hdr_type](qp, ohdr, bth0, bth1, bth2, middle,
+ ps);
}
/* when sending, force a reschedule every one of these periods */
#define SEND_RESCHED_TIMEOUT (5 * HZ) /* 5s in jiffies */
/**
- * schedule_send_yield - test for a yield required for QP send engine
+ * hfi1_schedule_send_yield - test for a yield required for QP
+ * send engine
* @timeout: Final time for timeout slice for jiffies
* @qp: a pointer to QP
* @ps: a pointer to a structure with commonly lookup values for
* the the send engine progress
+ * @tid - true if it is the tid leg
*
* This routine checks if the time slice for the QP has expired
* for RC QPs, if so an additional work entry is queued. At this
@@ -465,8 +467,8 @@ void hfi1_make_ruc_header(struct rvt_qp *qp, struct ib_other_headers *ohdr,
* returns true if a yield is required, otherwise, false
* is returned.
*/
-static bool schedule_send_yield(struct rvt_qp *qp,
- struct hfi1_pkt_state *ps)
+bool hfi1_schedule_send_yield(struct rvt_qp *qp, struct hfi1_pkt_state *ps,
+ bool tid)
{
ps->pkts_sent = true;
@@ -474,8 +476,24 @@ static bool schedule_send_yield(struct rvt_qp *qp,
if (!ps->in_thread ||
workqueue_congested(ps->cpu, ps->ppd->hfi1_wq)) {
spin_lock_irqsave(&qp->s_lock, ps->flags);
- qp->s_flags &= ~RVT_S_BUSY;
- hfi1_schedule_send(qp);
+ if (!tid) {
+ qp->s_flags &= ~RVT_S_BUSY;
+ hfi1_schedule_send(qp);
+ } else {
+ struct hfi1_qp_priv *priv = qp->priv;
+
+ if (priv->s_flags &
+ HFI1_S_TID_BUSY_SET) {
+ qp->s_flags &= ~RVT_S_BUSY;
+ priv->s_flags &=
+ ~(HFI1_S_TID_BUSY_SET |
+ RVT_S_BUSY);
+ } else {
+ priv->s_flags &= ~RVT_S_BUSY;
+ }
+ hfi1_schedule_tid_send(qp);
+ }
+
spin_unlock_irqrestore(&qp->s_lock, ps->flags);
this_cpu_inc(*ps->ppd->dd->send_schedule);
trace_hfi1_rc_expired_time_slice(qp, true);
@@ -576,6 +594,8 @@ void hfi1_do_send(struct rvt_qp *qp, bool in_thread)
do {
/* Check for a constructed packet to be sent. */
if (ps.s_txreq) {
+ if (priv->s_flags & HFI1_S_TID_BUSY_SET)
+ qp->s_flags |= RVT_S_BUSY;
spin_unlock_irqrestore(&qp->s_lock, ps.flags);
/*
* If the packet cannot be sent now, return and
@@ -585,7 +605,7 @@ void hfi1_do_send(struct rvt_qp *qp, bool in_thread)
return;
/* allow other tasks to run */
- if (schedule_send_yield(qp, &ps))
+ if (hfi1_schedule_send_yield(qp, &ps, false))
return;
spin_lock_irqsave(&qp->s_lock, ps.flags);
diff --git a/drivers/infiniband/hw/hfi1/sdma.c b/drivers/infiniband/hw/hfi1/sdma.c
index 96897a91fb0a..b0110728f541 100644
--- a/drivers/infiniband/hw/hfi1/sdma.c
+++ b/drivers/infiniband/hw/hfi1/sdma.c
@@ -1747,10 +1747,9 @@ retry:
*/
static void sdma_desc_avail(struct sdma_engine *sde, uint avail)
{
- struct iowait *wait, *nw;
+ struct iowait *wait, *nw, *twait;
struct iowait *waits[SDMA_WAIT_BATCH_SIZE];
- uint i, n = 0, seq, max_idx = 0;
- u8 max_starved_cnt = 0;
+ uint i, n = 0, seq, tidx = 0;
#ifdef CONFIG_SDMA_VERBOSITY
dd_dev_err(sde->dd, "CONFIG SDMA(%u) %s:%d %s()\n", sde->this_idx,
@@ -1775,13 +1774,20 @@ static void sdma_desc_avail(struct sdma_engine *sde, uint avail)
continue;
if (n == ARRAY_SIZE(waits))
break;
+ iowait_init_priority(wait);
num_desc = iowait_get_all_desc(wait);
if (num_desc > avail)
break;
avail -= num_desc;
- /* Find the most starved wait memeber */
- iowait_starve_find_max(wait, &max_starved_cnt,
- n, &max_idx);
+ /* Find the top-priority wait memeber */
+ if (n) {
+ twait = waits[tidx];
+ tidx =
+ iowait_priority_update_top(wait,
+ twait,
+ n,
+ tidx);
+ }
list_del_init(&wait->list);
waits[n++] = wait;
}
@@ -1790,12 +1796,12 @@ static void sdma_desc_avail(struct sdma_engine *sde, uint avail)
}
} while (read_seqretry(&sde->waitlock, seq));
- /* Schedule the most starved one first */
+ /* Schedule the top-priority entry first */
if (n)
- waits[max_idx]->wakeup(waits[max_idx], SDMA_AVAIL_REASON);
+ waits[tidx]->wakeup(waits[tidx], SDMA_AVAIL_REASON);
for (i = 0; i < n; i++)
- if (i != max_idx)
+ if (i != tidx)
waits[i]->wakeup(waits[i], SDMA_AVAIL_REASON);
}
diff --git a/drivers/infiniband/hw/hfi1/sdma_txreq.h b/drivers/infiniband/hw/hfi1/sdma_txreq.h
index bf7d777d756e..514a4784566b 100644
--- a/drivers/infiniband/hw/hfi1/sdma_txreq.h
+++ b/drivers/infiniband/hw/hfi1/sdma_txreq.h
@@ -91,6 +91,7 @@ struct sdma_desc {
#define SDMA_TXREQ_F_URGENT 0x0001
#define SDMA_TXREQ_F_AHG_COPY 0x0002
#define SDMA_TXREQ_F_USE_AHG 0x0004
+#define SDMA_TXREQ_F_VIP 0x0010
struct sdma_txreq;
typedef void (*callback_t)(struct sdma_txreq *, int);
diff --git a/drivers/infiniband/hw/hfi1/sysfs.c b/drivers/infiniband/hw/hfi1/sysfs.c
index 2be513d4c9da..90f62c4bddba 100644
--- a/drivers/infiniband/hw/hfi1/sysfs.c
+++ b/drivers/infiniband/hw/hfi1/sysfs.c
@@ -498,7 +498,7 @@ static ssize_t hw_rev_show(struct device *device, struct device_attribute *attr,
char *buf)
{
struct hfi1_ibdev *dev =
- container_of(device, struct hfi1_ibdev, rdi.ibdev.dev);
+ rdma_device_to_drv_device(device, struct hfi1_ibdev, rdi.ibdev);
return sprintf(buf, "%x\n", dd_from_dev(dev)->minrev);
}
@@ -508,7 +508,7 @@ static ssize_t board_id_show(struct device *device,
struct device_attribute *attr, char *buf)
{
struct hfi1_ibdev *dev =
- container_of(device, struct hfi1_ibdev, rdi.ibdev.dev);
+ rdma_device_to_drv_device(device, struct hfi1_ibdev, rdi.ibdev);
struct hfi1_devdata *dd = dd_from_dev(dev);
int ret;
@@ -524,7 +524,7 @@ static ssize_t boardversion_show(struct device *device,
struct device_attribute *attr, char *buf)
{
struct hfi1_ibdev *dev =
- container_of(device, struct hfi1_ibdev, rdi.ibdev.dev);
+ rdma_device_to_drv_device(device, struct hfi1_ibdev, rdi.ibdev);
struct hfi1_devdata *dd = dd_from_dev(dev);
/* The string printed here is already newline-terminated. */
@@ -536,7 +536,7 @@ static ssize_t nctxts_show(struct device *device,
struct device_attribute *attr, char *buf)
{
struct hfi1_ibdev *dev =
- container_of(device, struct hfi1_ibdev, rdi.ibdev.dev);
+ rdma_device_to_drv_device(device, struct hfi1_ibdev, rdi.ibdev);
struct hfi1_devdata *dd = dd_from_dev(dev);
/*
@@ -555,7 +555,7 @@ static ssize_t nfreectxts_show(struct device *device,
struct device_attribute *attr, char *buf)
{
struct hfi1_ibdev *dev =
- container_of(device, struct hfi1_ibdev, rdi.ibdev.dev);
+ rdma_device_to_drv_device(device, struct hfi1_ibdev, rdi.ibdev);
struct hfi1_devdata *dd = dd_from_dev(dev);
/* Return the number of free user ports (contexts) available. */
@@ -567,7 +567,7 @@ static ssize_t serial_show(struct device *device,
struct device_attribute *attr, char *buf)
{
struct hfi1_ibdev *dev =
- container_of(device, struct hfi1_ibdev, rdi.ibdev.dev);
+ rdma_device_to_drv_device(device, struct hfi1_ibdev, rdi.ibdev);
struct hfi1_devdata *dd = dd_from_dev(dev);
return scnprintf(buf, PAGE_SIZE, "%s", dd->serial);
@@ -579,7 +579,7 @@ static ssize_t chip_reset_store(struct device *device,
size_t count)
{
struct hfi1_ibdev *dev =
- container_of(device, struct hfi1_ibdev, rdi.ibdev.dev);
+ rdma_device_to_drv_device(device, struct hfi1_ibdev, rdi.ibdev);
struct hfi1_devdata *dd = dd_from_dev(dev);
int ret;
@@ -609,7 +609,7 @@ static ssize_t tempsense_show(struct device *device,
struct device_attribute *attr, char *buf)
{
struct hfi1_ibdev *dev =
- container_of(device, struct hfi1_ibdev, rdi.ibdev.dev);
+ rdma_device_to_drv_device(device, struct hfi1_ibdev, rdi.ibdev);
struct hfi1_devdata *dd = dd_from_dev(dev);
struct hfi1_temp temp;
int ret;
diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.c b/drivers/infiniband/hw/hfi1/tid_rdma.c
index da1ecb68a928..fdda33aca77f 100644
--- a/drivers/infiniband/hw/hfi1/tid_rdma.c
+++ b/drivers/infiniband/hw/hfi1/tid_rdma.c
@@ -5,8 +5,282 @@
*/
#include "hfi.h"
+#include "qp.h"
+#include "rc.h"
#include "verbs.h"
#include "tid_rdma.h"
+#include "exp_rcv.h"
+#include "trace.h"
+
+/**
+ * DOC: TID RDMA READ protocol
+ *
+ * This is an end-to-end protocol at the hfi1 level between two nodes that
+ * improves performance by avoiding data copy on the requester side. It
+ * converts a qualified RDMA READ request into a TID RDMA READ request on
+ * the requester side and thereafter handles the request and response
+ * differently. To be qualified, the RDMA READ request should meet the
+ * following:
+ * -- The total data length should be greater than 256K;
+ * -- The total data length should be a multiple of 4K page size;
+ * -- Each local scatter-gather entry should be 4K page aligned;
+ * -- Each local scatter-gather entry should be a multiple of 4K page size;
+ */
+
+#define RCV_TID_FLOW_TABLE_CTRL_FLOW_VALID_SMASK BIT_ULL(32)
+#define RCV_TID_FLOW_TABLE_CTRL_HDR_SUPP_EN_SMASK BIT_ULL(33)
+#define RCV_TID_FLOW_TABLE_CTRL_KEEP_AFTER_SEQ_ERR_SMASK BIT_ULL(34)
+#define RCV_TID_FLOW_TABLE_CTRL_KEEP_ON_GEN_ERR_SMASK BIT_ULL(35)
+#define RCV_TID_FLOW_TABLE_STATUS_SEQ_MISMATCH_SMASK BIT_ULL(37)
+#define RCV_TID_FLOW_TABLE_STATUS_GEN_MISMATCH_SMASK BIT_ULL(38)
+
+/* Maximum number of packets within a flow generation. */
+#define MAX_TID_FLOW_PSN BIT(HFI1_KDETH_BTH_SEQ_SHIFT)
+
+#define GENERATION_MASK 0xFFFFF
+
+static u32 mask_generation(u32 a)
+{
+ return a & GENERATION_MASK;
+}
+
+/* Reserved generation value to set to unused flows for kernel contexts */
+#define KERN_GENERATION_RESERVED mask_generation(U32_MAX)
+
+/*
+ * J_KEY for kernel contexts when TID RDMA is used.
+ * See generate_jkey() in hfi.h for more information.
+ */
+#define TID_RDMA_JKEY 32
+#define HFI1_KERNEL_MIN_JKEY HFI1_ADMIN_JKEY_RANGE
+#define HFI1_KERNEL_MAX_JKEY (2 * HFI1_ADMIN_JKEY_RANGE - 1)
+
+/* Maximum number of segments in flight per QP request. */
+#define TID_RDMA_MAX_READ_SEGS_PER_REQ 6
+#define TID_RDMA_MAX_WRITE_SEGS_PER_REQ 4
+#define MAX_REQ max_t(u16, TID_RDMA_MAX_READ_SEGS_PER_REQ, \
+ TID_RDMA_MAX_WRITE_SEGS_PER_REQ)
+#define MAX_FLOWS roundup_pow_of_two(MAX_REQ + 1)
+
+#define MAX_EXPECTED_PAGES (MAX_EXPECTED_BUFFER / PAGE_SIZE)
+
+#define TID_RDMA_DESTQP_FLOW_SHIFT 11
+#define TID_RDMA_DESTQP_FLOW_MASK 0x1f
+
+#define TID_FLOW_SW_PSN BIT(0)
+
+#define TID_OPFN_QP_CTXT_MASK 0xff
+#define TID_OPFN_QP_CTXT_SHIFT 56
+#define TID_OPFN_QP_KDETH_MASK 0xff
+#define TID_OPFN_QP_KDETH_SHIFT 48
+#define TID_OPFN_MAX_LEN_MASK 0x7ff
+#define TID_OPFN_MAX_LEN_SHIFT 37
+#define TID_OPFN_TIMEOUT_MASK 0x1f
+#define TID_OPFN_TIMEOUT_SHIFT 32
+#define TID_OPFN_RESERVED_MASK 0x3f
+#define TID_OPFN_RESERVED_SHIFT 26
+#define TID_OPFN_URG_MASK 0x1
+#define TID_OPFN_URG_SHIFT 25
+#define TID_OPFN_VER_MASK 0x7
+#define TID_OPFN_VER_SHIFT 22
+#define TID_OPFN_JKEY_MASK 0x3f
+#define TID_OPFN_JKEY_SHIFT 16
+#define TID_OPFN_MAX_READ_MASK 0x3f
+#define TID_OPFN_MAX_READ_SHIFT 10
+#define TID_OPFN_MAX_WRITE_MASK 0x3f
+#define TID_OPFN_MAX_WRITE_SHIFT 4
+
+/*
+ * OPFN TID layout
+ *
+ * 63 47 31 15
+ * NNNNNNNNKKKKKKKK MMMMMMMMMMMTTTTT DDDDDDUVVVJJJJJJ RRRRRRWWWWWWCCCC
+ * 3210987654321098 7654321098765432 1098765432109876 5432109876543210
+ * N - the context Number
+ * K - the Kdeth_qp
+ * M - Max_len
+ * T - Timeout
+ * D - reserveD
+ * V - version
+ * U - Urg capable
+ * J - Jkey
+ * R - max_Read
+ * W - max_Write
+ * C - Capcode
+ */
+
+static u32 tid_rdma_flow_wt;
+
+static void tid_rdma_trigger_resume(struct work_struct *work);
+static void hfi1_kern_exp_rcv_free_flows(struct tid_rdma_request *req);
+static int hfi1_kern_exp_rcv_alloc_flows(struct tid_rdma_request *req,
+ gfp_t gfp);
+static void hfi1_init_trdma_req(struct rvt_qp *qp,
+ struct tid_rdma_request *req);
+static void hfi1_tid_write_alloc_resources(struct rvt_qp *qp, bool intr_ctx);
+static void hfi1_tid_timeout(struct timer_list *t);
+static void hfi1_add_tid_reap_timer(struct rvt_qp *qp);
+static void hfi1_mod_tid_reap_timer(struct rvt_qp *qp);
+static void hfi1_mod_tid_retry_timer(struct rvt_qp *qp);
+static int hfi1_stop_tid_retry_timer(struct rvt_qp *qp);
+static void hfi1_tid_retry_timeout(struct timer_list *t);
+static int make_tid_rdma_ack(struct rvt_qp *qp,
+ struct ib_other_headers *ohdr,
+ struct hfi1_pkt_state *ps);
+static void hfi1_do_tid_send(struct rvt_qp *qp);
+
+static u64 tid_rdma_opfn_encode(struct tid_rdma_params *p)
+{
+ return
+ (((u64)p->qp & TID_OPFN_QP_CTXT_MASK) <<
+ TID_OPFN_QP_CTXT_SHIFT) |
+ ((((u64)p->qp >> 16) & TID_OPFN_QP_KDETH_MASK) <<
+ TID_OPFN_QP_KDETH_SHIFT) |
+ (((u64)((p->max_len >> PAGE_SHIFT) - 1) &
+ TID_OPFN_MAX_LEN_MASK) << TID_OPFN_MAX_LEN_SHIFT) |
+ (((u64)p->timeout & TID_OPFN_TIMEOUT_MASK) <<
+ TID_OPFN_TIMEOUT_SHIFT) |
+ (((u64)p->urg & TID_OPFN_URG_MASK) << TID_OPFN_URG_SHIFT) |
+ (((u64)p->jkey & TID_OPFN_JKEY_MASK) << TID_OPFN_JKEY_SHIFT) |
+ (((u64)p->max_read & TID_OPFN_MAX_READ_MASK) <<
+ TID_OPFN_MAX_READ_SHIFT) |
+ (((u64)p->max_write & TID_OPFN_MAX_WRITE_MASK) <<
+ TID_OPFN_MAX_WRITE_SHIFT);
+}
+
+static void tid_rdma_opfn_decode(struct tid_rdma_params *p, u64 data)
+{
+ p->max_len = (((data >> TID_OPFN_MAX_LEN_SHIFT) &
+ TID_OPFN_MAX_LEN_MASK) + 1) << PAGE_SHIFT;
+ p->jkey = (data >> TID_OPFN_JKEY_SHIFT) & TID_OPFN_JKEY_MASK;
+ p->max_write = (data >> TID_OPFN_MAX_WRITE_SHIFT) &
+ TID_OPFN_MAX_WRITE_MASK;
+ p->max_read = (data >> TID_OPFN_MAX_READ_SHIFT) &
+ TID_OPFN_MAX_READ_MASK;
+ p->qp =
+ ((((data >> TID_OPFN_QP_KDETH_SHIFT) & TID_OPFN_QP_KDETH_MASK)
+ << 16) |
+ ((data >> TID_OPFN_QP_CTXT_SHIFT) & TID_OPFN_QP_CTXT_MASK));
+ p->urg = (data >> TID_OPFN_URG_SHIFT) & TID_OPFN_URG_MASK;
+ p->timeout = (data >> TID_OPFN_TIMEOUT_SHIFT) & TID_OPFN_TIMEOUT_MASK;
+}
+
+void tid_rdma_opfn_init(struct rvt_qp *qp, struct tid_rdma_params *p)
+{
+ struct hfi1_qp_priv *priv = qp->priv;
+
+ p->qp = (kdeth_qp << 16) | priv->rcd->ctxt;
+ p->max_len = TID_RDMA_MAX_SEGMENT_SIZE;
+ p->jkey = priv->rcd->jkey;
+ p->max_read = TID_RDMA_MAX_READ_SEGS_PER_REQ;
+ p->max_write = TID_RDMA_MAX_WRITE_SEGS_PER_REQ;
+ p->timeout = qp->timeout;
+ p->urg = is_urg_masked(priv->rcd);
+}
+
+bool tid_rdma_conn_req(struct rvt_qp *qp, u64 *data)
+{
+ struct hfi1_qp_priv *priv = qp->priv;
+
+ *data = tid_rdma_opfn_encode(&priv->tid_rdma.local);
+ return true;
+}
+
+bool tid_rdma_conn_reply(struct rvt_qp *qp, u64 data)
+{
+ struct hfi1_qp_priv *priv = qp->priv;
+ struct tid_rdma_params *remote, *old;
+ bool ret = true;
+
+ old = rcu_dereference_protected(priv->tid_rdma.remote,
+ lockdep_is_held(&priv->opfn.lock));
+ data &= ~0xfULL;
+ /*
+ * If data passed in is zero, return true so as not to continue the
+ * negotiation process
+ */
+ if (!data || !HFI1_CAP_IS_KSET(TID_RDMA))
+ goto null;
+ /*
+ * If kzalloc fails, return false. This will result in:
+ * * at the requester a new OPFN request being generated to retry
+ * the negotiation
+ * * at the responder, 0 being returned to the requester so as to
+ * disable TID RDMA at both the requester and the responder
+ */
+ remote = kzalloc(sizeof(*remote), GFP_ATOMIC);
+ if (!remote) {
+ ret = false;
+ goto null;
+ }
+
+ tid_rdma_opfn_decode(remote, data);
+ priv->tid_timer_timeout_jiffies =
+ usecs_to_jiffies((((4096UL * (1UL << remote->timeout)) /
+ 1000UL) << 3) * 7);
+ trace_hfi1_opfn_param(qp, 0, &priv->tid_rdma.local);
+ trace_hfi1_opfn_param(qp, 1, remote);
+ rcu_assign_pointer(priv->tid_rdma.remote, remote);
+ /*
+ * A TID RDMA READ request's segment size is not equal to
+ * remote->max_len only when the request's data length is smaller
+ * than remote->max_len. In that case, there will be only one segment.
+ * Therefore, when priv->pkts_ps is used to calculate req->cur_seg
+ * during retry, it will lead to req->cur_seg = 0, which is exactly
+ * what is expected.
+ */
+ priv->pkts_ps = (u16)rvt_div_mtu(qp, remote->max_len);
+ priv->timeout_shift = ilog2(priv->pkts_ps - 1) + 1;
+ goto free;
+null:
+ RCU_INIT_POINTER(priv->tid_rdma.remote, NULL);
+ priv->timeout_shift = 0;
+free:
+ if (old)
+ kfree_rcu(old, rcu_head);
+ return ret;
+}
+
+bool tid_rdma_conn_resp(struct rvt_qp *qp, u64 *data)
+{
+ bool ret;
+
+ ret = tid_rdma_conn_reply(qp, *data);
+ *data = 0;
+ /*
+ * If tid_rdma_conn_reply() returns error, set *data as 0 to indicate
+ * TID RDMA could not be enabled. This will result in TID RDMA being
+ * disabled at the requester too.
+ */
+ if (ret)
+ (void)tid_rdma_conn_req(qp, data);
+ return ret;
+}
+
+void tid_rdma_conn_error(struct rvt_qp *qp)
+{
+ struct hfi1_qp_priv *priv = qp->priv;
+ struct tid_rdma_params *old;
+
+ old = rcu_dereference_protected(priv->tid_rdma.remote,
+ lockdep_is_held(&priv->opfn.lock));
+ RCU_INIT_POINTER(priv->tid_rdma.remote, NULL);
+ if (old)
+ kfree_rcu(old, rcu_head);
+}
+
+/* This is called at context initialization time */
+int hfi1_kern_exp_rcv_init(struct hfi1_ctxtdata *rcd, int reinit)
+{
+ if (reinit)
+ return 0;
+
+ BUILD_BUG_ON(TID_RDMA_JKEY < HFI1_KERNEL_MIN_JKEY);
+ BUILD_BUG_ON(TID_RDMA_JKEY > HFI1_KERNEL_MAX_JKEY);
+ rcd->jkey = TID_RDMA_JKEY;
+ hfi1_set_ctxt_jkey(rcd->dd, rcd, rcd->jkey);
+ return hfi1_alloc_ctxt_rcv_groups(rcd);
+}
/**
* qp_to_rcd - determine the receive context used by a qp
@@ -41,8 +315,5152 @@ int hfi1_qp_priv_init(struct rvt_dev_info *rdi, struct rvt_qp *qp,
struct ib_qp_init_attr *init_attr)
{
struct hfi1_qp_priv *qpriv = qp->priv;
+ int i, ret;
qpriv->rcd = qp_to_rcd(rdi, qp);
+ spin_lock_init(&qpriv->opfn.lock);
+ INIT_WORK(&qpriv->opfn.opfn_work, opfn_send_conn_request);
+ INIT_WORK(&qpriv->tid_rdma.trigger_work, tid_rdma_trigger_resume);
+ qpriv->flow_state.psn = 0;
+ qpriv->flow_state.index = RXE_NUM_TID_FLOWS;
+ qpriv->flow_state.last_index = RXE_NUM_TID_FLOWS;
+ qpriv->flow_state.generation = KERN_GENERATION_RESERVED;
+ qpriv->s_state = TID_OP(WRITE_RESP);
+ qpriv->s_tid_cur = HFI1_QP_WQE_INVALID;
+ qpriv->s_tid_head = HFI1_QP_WQE_INVALID;
+ qpriv->s_tid_tail = HFI1_QP_WQE_INVALID;
+ qpriv->rnr_nak_state = TID_RNR_NAK_INIT;
+ qpriv->r_tid_head = HFI1_QP_WQE_INVALID;
+ qpriv->r_tid_tail = HFI1_QP_WQE_INVALID;
+ qpriv->r_tid_ack = HFI1_QP_WQE_INVALID;
+ qpriv->r_tid_alloc = HFI1_QP_WQE_INVALID;
+ atomic_set(&qpriv->n_requests, 0);
+ atomic_set(&qpriv->n_tid_requests, 0);
+ timer_setup(&qpriv->s_tid_timer, hfi1_tid_timeout, 0);
+ timer_setup(&qpriv->s_tid_retry_timer, hfi1_tid_retry_timeout, 0);
+ INIT_LIST_HEAD(&qpriv->tid_wait);
+
+ if (init_attr->qp_type == IB_QPT_RC && HFI1_CAP_IS_KSET(TID_RDMA)) {
+ struct hfi1_devdata *dd = qpriv->rcd->dd;
+
+ qpriv->pages = kzalloc_node(TID_RDMA_MAX_PAGES *
+ sizeof(*qpriv->pages),
+ GFP_KERNEL, dd->node);
+ if (!qpriv->pages)
+ return -ENOMEM;
+ for (i = 0; i < qp->s_size; i++) {
+ struct hfi1_swqe_priv *priv;
+ struct rvt_swqe *wqe = rvt_get_swqe_ptr(qp, i);
+
+ priv = kzalloc_node(sizeof(*priv), GFP_KERNEL,
+ dd->node);
+ if (!priv)
+ return -ENOMEM;
+
+ hfi1_init_trdma_req(qp, &priv->tid_req);
+ priv->tid_req.e.swqe = wqe;
+ wqe->priv = priv;
+ }
+ for (i = 0; i < rvt_max_atomic(rdi); i++) {
+ struct hfi1_ack_priv *priv;
+
+ priv = kzalloc_node(sizeof(*priv), GFP_KERNEL,
+ dd->node);
+ if (!priv)
+ return -ENOMEM;
+
+ hfi1_init_trdma_req(qp, &priv->tid_req);
+ priv->tid_req.e.ack = &qp->s_ack_queue[i];
+
+ ret = hfi1_kern_exp_rcv_alloc_flows(&priv->tid_req,
+ GFP_KERNEL);
+ if (ret) {
+ kfree(priv);
+ return ret;
+ }
+ qp->s_ack_queue[i].priv = priv;
+ }
+ }
+
return 0;
}
+
+void hfi1_qp_priv_tid_free(struct rvt_dev_info *rdi, struct rvt_qp *qp)
+{
+ struct hfi1_qp_priv *qpriv = qp->priv;
+ struct rvt_swqe *wqe;
+ u32 i;
+
+ if (qp->ibqp.qp_type == IB_QPT_RC && HFI1_CAP_IS_KSET(TID_RDMA)) {
+ for (i = 0; i < qp->s_size; i++) {
+ wqe = rvt_get_swqe_ptr(qp, i);
+ kfree(wqe->priv);
+ wqe->priv = NULL;
+ }
+ for (i = 0; i < rvt_max_atomic(rdi); i++) {
+ struct hfi1_ack_priv *priv = qp->s_ack_queue[i].priv;
+
+ if (priv)
+ hfi1_kern_exp_rcv_free_flows(&priv->tid_req);
+ kfree(priv);
+ qp->s_ack_queue[i].priv = NULL;
+ }
+ cancel_work_sync(&qpriv->opfn.opfn_work);
+ kfree(qpriv->pages);
+ qpriv->pages = NULL;
+ }
+}
+
+/* Flow and tid waiter functions */
+/**
+ * DOC: lock ordering
+ *
+ * There are two locks involved with the queuing
+ * routines: the qp s_lock and the exp_lock.
+ *
+ * Since the tid space allocation is called from
+ * the send engine, the qp s_lock is already held.
+ *
+ * The allocation routines will get the exp_lock.
+ *
+ * The first_qp() call is provided to allow the head of
+ * the rcd wait queue to be fetched under the exp_lock and
+ * followed by a drop of the exp_lock.
+ *
+ * Any qp in the wait list will have the qp reference count held
+ * to hold the qp in memory.
+ */
+
+/*
+ * return head of rcd wait list
+ *
+ * Must hold the exp_lock.
+ *
+ * Get a reference to the QP to hold the QP in memory.
+ *
+ * The caller must release the reference when the local
+ * is no longer being used.
+ */
+static struct rvt_qp *first_qp(struct hfi1_ctxtdata *rcd,
+ struct tid_queue *queue)
+ __must_hold(&rcd->exp_lock)
+{
+ struct hfi1_qp_priv *priv;
+
+ lockdep_assert_held(&rcd->exp_lock);
+ priv = list_first_entry_or_null(&queue->queue_head,
+ struct hfi1_qp_priv,
+ tid_wait);
+ if (!priv)
+ return NULL;
+ rvt_get_qp(priv->owner);
+ return priv->owner;
+}
+
+/**
+ * kernel_tid_waiters - determine rcd wait
+ * @rcd: the receive context
+ * @qp: the head of the qp being processed
+ *
+ * This routine will return false IFF
+ * the list is NULL or the head of the
+ * list is the indicated qp.
+ *
+ * Must hold the qp s_lock and the exp_lock.
+ *
+ * Return:
+ * false if either of the conditions below are statisfied:
+ * 1. The list is empty or
+ * 2. The indicated qp is at the head of the list and the
+ * HFI1_S_WAIT_TID_SPACE bit is set in qp->s_flags.
+ * true is returned otherwise.
+ */
+static bool kernel_tid_waiters(struct hfi1_ctxtdata *rcd,
+ struct tid_queue *queue, struct rvt_qp *qp)
+ __must_hold(&rcd->exp_lock) __must_hold(&qp->s_lock)
+{
+ struct rvt_qp *fqp;
+ bool ret = true;
+
+ lockdep_assert_held(&qp->s_lock);
+ lockdep_assert_held(&rcd->exp_lock);
+ fqp = first_qp(rcd, queue);
+ if (!fqp || (fqp == qp && (qp->s_flags & HFI1_S_WAIT_TID_SPACE)))
+ ret = false;
+ rvt_put_qp(fqp);
+ return ret;
+}
+
+/**
+ * dequeue_tid_waiter - dequeue the qp from the list
+ * @qp - the qp to remove the wait list
+ *
+ * This routine removes the indicated qp from the
+ * wait list if it is there.
+ *
+ * This should be done after the hardware flow and
+ * tid array resources have been allocated.
+ *
+ * Must hold the qp s_lock and the rcd exp_lock.
+ *
+ * It assumes the s_lock to protect the s_flags
+ * field and to reliably test the HFI1_S_WAIT_TID_SPACE flag.
+ */
+static void dequeue_tid_waiter(struct hfi1_ctxtdata *rcd,
+ struct tid_queue *queue, struct rvt_qp *qp)
+ __must_hold(&rcd->exp_lock) __must_hold(&qp->s_lock)
+{
+ struct hfi1_qp_priv *priv = qp->priv;
+
+ lockdep_assert_held(&qp->s_lock);
+ lockdep_assert_held(&rcd->exp_lock);
+ if (list_empty(&priv->tid_wait))
+ return;
+ list_del_init(&priv->tid_wait);
+ qp->s_flags &= ~HFI1_S_WAIT_TID_SPACE;
+ queue->dequeue++;
+ rvt_put_qp(qp);
+}
+
+/**
+ * queue_qp_for_tid_wait - suspend QP on tid space
+ * @rcd: the receive context
+ * @qp: the qp
+ *
+ * The qp is inserted at the tail of the rcd
+ * wait queue and the HFI1_S_WAIT_TID_SPACE s_flag is set.
+ *
+ * Must hold the qp s_lock and the exp_lock.
+ */
+static void queue_qp_for_tid_wait(struct hfi1_ctxtdata *rcd,
+ struct tid_queue *queue, struct rvt_qp *qp)
+ __must_hold(&rcd->exp_lock) __must_hold(&qp->s_lock)
+{
+ struct hfi1_qp_priv *priv = qp->priv;
+
+ lockdep_assert_held(&qp->s_lock);
+ lockdep_assert_held(&rcd->exp_lock);
+ if (list_empty(&priv->tid_wait)) {
+ qp->s_flags |= HFI1_S_WAIT_TID_SPACE;
+ list_add_tail(&priv->tid_wait, &queue->queue_head);
+ priv->tid_enqueue = ++queue->enqueue;
+ rcd->dd->verbs_dev.n_tidwait++;
+ trace_hfi1_qpsleep(qp, HFI1_S_WAIT_TID_SPACE);
+ rvt_get_qp(qp);
+ }
+}
+
+/**
+ * __trigger_tid_waiter - trigger tid waiter
+ * @qp: the qp
+ *
+ * This is a private entrance to schedule the qp
+ * assuming the caller is holding the qp->s_lock.
+ */
+static void __trigger_tid_waiter(struct rvt_qp *qp)
+ __must_hold(&qp->s_lock)
+{
+ lockdep_assert_held(&qp->s_lock);
+ if (!(qp->s_flags & HFI1_S_WAIT_TID_SPACE))
+ return;
+ trace_hfi1_qpwakeup(qp, HFI1_S_WAIT_TID_SPACE);
+ hfi1_schedule_send(qp);
+}
+
+/**
+ * tid_rdma_schedule_tid_wakeup - schedule wakeup for a qp
+ * @qp - the qp
+ *
+ * trigger a schedule or a waiting qp in a deadlock
+ * safe manner. The qp reference is held prior
+ * to this call via first_qp().
+ *
+ * If the qp trigger was already scheduled (!rval)
+ * the the reference is dropped, otherwise the resume
+ * or the destroy cancel will dispatch the reference.
+ */
+static void tid_rdma_schedule_tid_wakeup(struct rvt_qp *qp)
+{
+ struct hfi1_qp_priv *priv;
+ struct hfi1_ibport *ibp;
+ struct hfi1_pportdata *ppd;
+ struct hfi1_devdata *dd;
+ bool rval;
+
+ if (!qp)
+ return;
+
+ priv = qp->priv;
+ ibp = to_iport(qp->ibqp.device, qp->port_num);
+ ppd = ppd_from_ibp(ibp);
+ dd = dd_from_ibdev(qp->ibqp.device);
+
+ rval = queue_work_on(priv->s_sde ?
+ priv->s_sde->cpu :
+ cpumask_first(cpumask_of_node(dd->node)),
+ ppd->hfi1_wq,
+ &priv->tid_rdma.trigger_work);
+ if (!rval)
+ rvt_put_qp(qp);
+}
+
+/**
+ * tid_rdma_trigger_resume - field a trigger work request
+ * @work - the work item
+ *
+ * Complete the off qp trigger processing by directly
+ * calling the progress routine.
+ */
+static void tid_rdma_trigger_resume(struct work_struct *work)
+{
+ struct tid_rdma_qp_params *tr;
+ struct hfi1_qp_priv *priv;
+ struct rvt_qp *qp;
+
+ tr = container_of(work, struct tid_rdma_qp_params, trigger_work);
+ priv = container_of(tr, struct hfi1_qp_priv, tid_rdma);
+ qp = priv->owner;
+ spin_lock_irq(&qp->s_lock);
+ if (qp->s_flags & HFI1_S_WAIT_TID_SPACE) {
+ spin_unlock_irq(&qp->s_lock);
+ hfi1_do_send(priv->owner, true);
+ } else {
+ spin_unlock_irq(&qp->s_lock);
+ }
+ rvt_put_qp(qp);
+}
+
+/**
+ * tid_rdma_flush_wait - unwind any tid space wait
+ *
+ * This is called when resetting a qp to
+ * allow a destroy or reset to get rid
+ * of any tid space linkage and reference counts.
+ */
+static void _tid_rdma_flush_wait(struct rvt_qp *qp, struct tid_queue *queue)
+ __must_hold(&qp->s_lock)
+{
+ struct hfi1_qp_priv *priv;
+
+ if (!qp)
+ return;
+ lockdep_assert_held(&qp->s_lock);
+ priv = qp->priv;
+ qp->s_flags &= ~HFI1_S_WAIT_TID_SPACE;
+ spin_lock(&priv->rcd->exp_lock);
+ if (!list_empty(&priv->tid_wait)) {
+ list_del_init(&priv->tid_wait);
+ qp->s_flags &= ~HFI1_S_WAIT_TID_SPACE;
+ queue->dequeue++;
+ rvt_put_qp(qp);
+ }
+ spin_unlock(&priv->rcd->exp_lock);
+}
+
+void hfi1_tid_rdma_flush_wait(struct rvt_qp *qp)
+ __must_hold(&qp->s_lock)
+{
+ struct hfi1_qp_priv *priv = qp->priv;
+
+ _tid_rdma_flush_wait(qp, &priv->rcd->flow_queue);
+ _tid_rdma_flush_wait(qp, &priv->rcd->rarr_queue);
+}
+
+/* Flow functions */
+/**
+ * kern_reserve_flow - allocate a hardware flow
+ * @rcd - the context to use for allocation
+ * @last - the index of the preferred flow. Use RXE_NUM_TID_FLOWS to
+ * signify "don't care".
+ *
+ * Use a bit mask based allocation to reserve a hardware
+ * flow for use in receiving KDETH data packets. If a preferred flow is
+ * specified the function will attempt to reserve that flow again, if
+ * available.
+ *
+ * The exp_lock must be held.
+ *
+ * Return:
+ * On success: a value postive value between 0 and RXE_NUM_TID_FLOWS - 1
+ * On failure: -EAGAIN
+ */
+static int kern_reserve_flow(struct hfi1_ctxtdata *rcd, int last)
+ __must_hold(&rcd->exp_lock)
+{
+ int nr;
+
+ /* Attempt to reserve the preferred flow index */
+ if (last >= 0 && last < RXE_NUM_TID_FLOWS &&
+ !test_and_set_bit(last, &rcd->flow_mask))
+ return last;
+
+ nr = ffz(rcd->flow_mask);
+ BUILD_BUG_ON(RXE_NUM_TID_FLOWS >=
+ (sizeof(rcd->flow_mask) * BITS_PER_BYTE));
+ if (nr > (RXE_NUM_TID_FLOWS - 1))
+ return -EAGAIN;
+ set_bit(nr, &rcd->flow_mask);
+ return nr;
+}
+
+static void kern_set_hw_flow(struct hfi1_ctxtdata *rcd, u32 generation,
+ u32 flow_idx)
+{
+ u64 reg;
+
+ reg = ((u64)generation << HFI1_KDETH_BTH_SEQ_SHIFT) |
+ RCV_TID_FLOW_TABLE_CTRL_FLOW_VALID_SMASK |
+ RCV_TID_FLOW_TABLE_CTRL_KEEP_AFTER_SEQ_ERR_SMASK |
+ RCV_TID_FLOW_TABLE_CTRL_KEEP_ON_GEN_ERR_SMASK |
+ RCV_TID_FLOW_TABLE_STATUS_SEQ_MISMATCH_SMASK |
+ RCV_TID_FLOW_TABLE_STATUS_GEN_MISMATCH_SMASK;
+
+ if (generation != KERN_GENERATION_RESERVED)
+ reg |= RCV_TID_FLOW_TABLE_CTRL_HDR_SUPP_EN_SMASK;
+
+ write_uctxt_csr(rcd->dd, rcd->ctxt,
+ RCV_TID_FLOW_TABLE + 8 * flow_idx, reg);
+}
+
+static u32 kern_setup_hw_flow(struct hfi1_ctxtdata *rcd, u32 flow_idx)
+ __must_hold(&rcd->exp_lock)
+{
+ u32 generation = rcd->flows[flow_idx].generation;
+
+ kern_set_hw_flow(rcd, generation, flow_idx);
+ return generation;
+}
+
+static u32 kern_flow_generation_next(u32 gen)
+{
+ u32 generation = mask_generation(gen + 1);
+
+ if (generation == KERN_GENERATION_RESERVED)
+ generation = mask_generation(generation + 1);
+ return generation;
+}
+
+static void kern_clear_hw_flow(struct hfi1_ctxtdata *rcd, u32 flow_idx)
+ __must_hold(&rcd->exp_lock)
+{
+ rcd->flows[flow_idx].generation =
+ kern_flow_generation_next(rcd->flows[flow_idx].generation);
+ kern_set_hw_flow(rcd, KERN_GENERATION_RESERVED, flow_idx);
+}
+
+int hfi1_kern_setup_hw_flow(struct hfi1_ctxtdata *rcd, struct rvt_qp *qp)
+{
+ struct hfi1_qp_priv *qpriv = (struct hfi1_qp_priv *)qp->priv;
+ struct tid_flow_state *fs = &qpriv->flow_state;
+ struct rvt_qp *fqp;
+ unsigned long flags;
+ int ret = 0;
+
+ /* The QP already has an allocated flow */
+ if (fs->index != RXE_NUM_TID_FLOWS)
+ return ret;
+
+ spin_lock_irqsave(&rcd->exp_lock, flags);
+ if (kernel_tid_waiters(rcd, &rcd->flow_queue, qp))
+ goto queue;
+
+ ret = kern_reserve_flow(rcd, fs->last_index);
+ if (ret < 0)
+ goto queue;
+ fs->index = ret;
+ fs->last_index = fs->index;
+
+ /* Generation received in a RESYNC overrides default flow generation */
+ if (fs->generation != KERN_GENERATION_RESERVED)
+ rcd->flows[fs->index].generation = fs->generation;
+ fs->generation = kern_setup_hw_flow(rcd, fs->index);
+ fs->psn = 0;
+ fs->flags = 0;
+ dequeue_tid_waiter(rcd, &rcd->flow_queue, qp);
+ /* get head before dropping lock */
+ fqp = first_qp(rcd, &rcd->flow_queue);
+ spin_unlock_irqrestore(&rcd->exp_lock, flags);
+
+ tid_rdma_schedule_tid_wakeup(fqp);
+ return 0;
+queue:
+ queue_qp_for_tid_wait(rcd, &rcd->flow_queue, qp);
+ spin_unlock_irqrestore(&rcd->exp_lock, flags);
+ return -EAGAIN;
+}
+
+void hfi1_kern_clear_hw_flow(struct hfi1_ctxtdata *rcd, struct rvt_qp *qp)
+{
+ struct hfi1_qp_priv *qpriv = (struct hfi1_qp_priv *)qp->priv;
+ struct tid_flow_state *fs = &qpriv->flow_state;
+ struct rvt_qp *fqp;
+ unsigned long flags;
+
+ if (fs->index >= RXE_NUM_TID_FLOWS)
+ return;
+ spin_lock_irqsave(&rcd->exp_lock, flags);
+ kern_clear_hw_flow(rcd, fs->index);
+ clear_bit(fs->index, &rcd->flow_mask);
+ fs->index = RXE_NUM_TID_FLOWS;
+ fs->psn = 0;
+ fs->generation = KERN_GENERATION_RESERVED;
+
+ /* get head before dropping lock */
+ fqp = first_qp(rcd, &rcd->flow_queue);
+ spin_unlock_irqrestore(&rcd->exp_lock, flags);
+
+ if (fqp == qp) {
+ __trigger_tid_waiter(fqp);
+ rvt_put_qp(fqp);
+ } else {
+ tid_rdma_schedule_tid_wakeup(fqp);
+ }
+}
+
+void hfi1_kern_init_ctxt_generations(struct hfi1_ctxtdata *rcd)
+{
+ int i;
+
+ for (i = 0; i < RXE_NUM_TID_FLOWS; i++) {
+ rcd->flows[i].generation = mask_generation(prandom_u32());
+ kern_set_hw_flow(rcd, KERN_GENERATION_RESERVED, i);
+ }
+}
+
+/* TID allocation functions */
+static u8 trdma_pset_order(struct tid_rdma_pageset *s)
+{
+ u8 count = s->count;
+
+ return ilog2(count) + 1;
+}
+
+/**
+ * tid_rdma_find_phys_blocks_4k - get groups base on mr info
+ * @npages - number of pages
+ * @pages - pointer to an array of page structs
+ * @list - page set array to return
+ *
+ * This routine returns the number of groups associated with
+ * the current sge information. This implementation is based
+ * on the expected receive find_phys_blocks() adjusted to
+ * use the MR information vs. the pfn.
+ *
+ * Return:
+ * the number of RcvArray entries
+ */
+static u32 tid_rdma_find_phys_blocks_4k(struct tid_rdma_flow *flow,
+ struct page **pages,
+ u32 npages,
+ struct tid_rdma_pageset *list)
+{
+ u32 pagecount, pageidx, setcount = 0, i;
+ void *vaddr, *this_vaddr;
+
+ if (!npages)
+ return 0;
+
+ /*
+ * Look for sets of physically contiguous pages in the user buffer.
+ * This will allow us to optimize Expected RcvArray entry usage by
+ * using the bigger supported sizes.
+ */
+ vaddr = page_address(pages[0]);
+ trace_hfi1_tid_flow_page(flow->req->qp, flow, 0, 0, 0, vaddr);
+ for (pageidx = 0, pagecount = 1, i = 1; i <= npages; i++) {
+ this_vaddr = i < npages ? page_address(pages[i]) : NULL;
+ trace_hfi1_tid_flow_page(flow->req->qp, flow, i, 0, 0,
+ this_vaddr);
+ /*
+ * If the vaddr's are not sequential, pages are not physically
+ * contiguous.
+ */
+ if (this_vaddr != (vaddr + PAGE_SIZE)) {
+ /*
+ * At this point we have to loop over the set of
+ * physically contiguous pages and break them down it
+ * sizes supported by the HW.
+ * There are two main constraints:
+ * 1. The max buffer size is MAX_EXPECTED_BUFFER.
+ * If the total set size is bigger than that
+ * program only a MAX_EXPECTED_BUFFER chunk.
+ * 2. The buffer size has to be a power of two. If
+ * it is not, round down to the closes power of
+ * 2 and program that size.
+ */
+ while (pagecount) {
+ int maxpages = pagecount;
+ u32 bufsize = pagecount * PAGE_SIZE;
+
+ if (bufsize > MAX_EXPECTED_BUFFER)
+ maxpages =
+ MAX_EXPECTED_BUFFER >>
+ PAGE_SHIFT;
+ else if (!is_power_of_2(bufsize))
+ maxpages =
+ rounddown_pow_of_two(bufsize) >>
+ PAGE_SHIFT;
+
+ list[setcount].idx = pageidx;
+ list[setcount].count = maxpages;
+ trace_hfi1_tid_pageset(flow->req->qp, setcount,
+ list[setcount].idx,
+ list[setcount].count);
+ pagecount -= maxpages;
+ pageidx += maxpages;
+ setcount++;
+ }
+ pageidx = i;
+ pagecount = 1;
+ vaddr = this_vaddr;
+ } else {
+ vaddr += PAGE_SIZE;
+ pagecount++;
+ }
+ }
+ /* insure we always return an even number of sets */
+ if (setcount & 1)
+ list[setcount++].count = 0;
+ return setcount;
+}
+
+/**
+ * tid_flush_pages - dump out pages into pagesets
+ * @list - list of pagesets
+ * @idx - pointer to current page index
+ * @pages - number of pages to dump
+ * @sets - current number of pagesset
+ *
+ * This routine flushes out accumuated pages.
+ *
+ * To insure an even number of sets the
+ * code may add a filler.
+ *
+ * This can happen with when pages is not
+ * a power of 2 or pages is a power of 2
+ * less than the maximum pages.
+ *
+ * Return:
+ * The new number of sets
+ */
+
+static u32 tid_flush_pages(struct tid_rdma_pageset *list,
+ u32 *idx, u32 pages, u32 sets)
+{
+ while (pages) {
+ u32 maxpages = pages;
+
+ if (maxpages > MAX_EXPECTED_PAGES)
+ maxpages = MAX_EXPECTED_PAGES;
+ else if (!is_power_of_2(maxpages))
+ maxpages = rounddown_pow_of_two(maxpages);
+ list[sets].idx = *idx;
+ list[sets++].count = maxpages;
+ *idx += maxpages;
+ pages -= maxpages;
+ }
+ /* might need a filler */
+ if (sets & 1)
+ list[sets++].count = 0;
+ return sets;
+}
+
+/**
+ * tid_rdma_find_phys_blocks_8k - get groups base on mr info
+ * @pages - pointer to an array of page structs
+ * @npages - number of pages
+ * @list - page set array to return
+ *
+ * This routine parses an array of pages to compute pagesets
+ * in an 8k compatible way.
+ *
+ * pages are tested two at a time, i, i + 1 for contiguous
+ * pages and i - 1 and i contiguous pages.
+ *
+ * If any condition is false, any accumlated pages are flushed and
+ * v0,v1 are emitted as separate PAGE_SIZE pagesets
+ *
+ * Otherwise, the current 8k is totaled for a future flush.
+ *
+ * Return:
+ * The number of pagesets
+ * list set with the returned number of pagesets
+ *
+ */
+static u32 tid_rdma_find_phys_blocks_8k(struct tid_rdma_flow *flow,
+ struct page **pages,
+ u32 npages,
+ struct tid_rdma_pageset *list)
+{
+ u32 idx, sets = 0, i;
+ u32 pagecnt = 0;
+ void *v0, *v1, *vm1;
+
+ if (!npages)
+ return 0;
+ for (idx = 0, i = 0, vm1 = NULL; i < npages; i += 2) {
+ /* get a new v0 */
+ v0 = page_address(pages[i]);
+ trace_hfi1_tid_flow_page(flow->req->qp, flow, i, 1, 0, v0);
+ v1 = i + 1 < npages ?
+ page_address(pages[i + 1]) : NULL;
+ trace_hfi1_tid_flow_page(flow->req->qp, flow, i, 1, 1, v1);
+ /* compare i, i + 1 vaddr */
+ if (v1 != (v0 + PAGE_SIZE)) {
+ /* flush out pages */
+ sets = tid_flush_pages(list, &idx, pagecnt, sets);
+ /* output v0,v1 as two pagesets */
+ list[sets].idx = idx++;
+ list[sets++].count = 1;
+ if (v1) {
+ list[sets].count = 1;
+ list[sets++].idx = idx++;
+ } else {
+ list[sets++].count = 0;
+ }
+ vm1 = NULL;
+ pagecnt = 0;
+ continue;
+ }
+ /* i,i+1 consecutive, look at i-1,i */
+ if (vm1 && v0 != (vm1 + PAGE_SIZE)) {
+ /* flush out pages */
+ sets = tid_flush_pages(list, &idx, pagecnt, sets);
+ pagecnt = 0;
+ }
+ /* pages will always be a multiple of 8k */
+ pagecnt += 2;
+ /* save i-1 */
+ vm1 = v1;
+ /* move to next pair */
+ }
+ /* dump residual pages at end */
+ sets = tid_flush_pages(list, &idx, npages - idx, sets);
+ /* by design cannot be odd sets */
+ WARN_ON(sets & 1);
+ return sets;
+}
+
+/**
+ * Find pages for one segment of a sge array represented by @ss. The function
+ * does not check the sge, the sge must have been checked for alignment with a
+ * prior call to hfi1_kern_trdma_ok. Other sge checking is done as part of
+ * rvt_lkey_ok and rvt_rkey_ok. Also, the function only modifies the local sge
+ * copy maintained in @ss->sge, the original sge is not modified.
+ *
+ * Unlike IB RDMA WRITE, we can't decrement ss->num_sge here because we are not
+ * releasing the MR reference count at the same time. Otherwise, we'll "leak"
+ * references to the MR. This difference requires that we keep track of progress
+ * into the sg_list. This is done by the cur_seg cursor in the tid_rdma_request
+ * structure.
+ */
+static u32 kern_find_pages(struct tid_rdma_flow *flow,
+ struct page **pages,
+ struct rvt_sge_state *ss, bool *last)
+{
+ struct tid_rdma_request *req = flow->req;
+ struct rvt_sge *sge = &ss->sge;
+ u32 length = flow->req->seg_len;
+ u32 len = PAGE_SIZE;
+ u32 i = 0;
+
+ while (length && req->isge < ss->num_sge) {
+ pages[i++] = virt_to_page(sge->vaddr);
+
+ sge->vaddr += len;
+ sge->length -= len;
+ sge->sge_length -= len;
+ if (!sge->sge_length) {
+ if (++req->isge < ss->num_sge)
+ *sge = ss->sg_list[req->isge - 1];
+ } else if (sge->length == 0 && sge->mr->lkey) {
+ if (++sge->n >= RVT_SEGSZ) {
+ ++sge->m;
+ sge->n = 0;
+ }
+ sge->vaddr = sge->mr->map[sge->m]->segs[sge->n].vaddr;
+ sge->length = sge->mr->map[sge->m]->segs[sge->n].length;
+ }
+ length -= len;
+ }
+
+ flow->length = flow->req->seg_len - length;
+ *last = req->isge == ss->num_sge ? false : true;
+ return i;
+}
+
+static void dma_unmap_flow(struct tid_rdma_flow *flow)
+{
+ struct hfi1_devdata *dd;
+ int i;
+ struct tid_rdma_pageset *pset;
+
+ dd = flow->req->rcd->dd;
+ for (i = 0, pset = &flow->pagesets[0]; i < flow->npagesets;
+ i++, pset++) {
+ if (pset->count && pset->addr) {
+ dma_unmap_page(&dd->pcidev->dev,
+ pset->addr,
+ PAGE_SIZE * pset->count,
+ DMA_FROM_DEVICE);
+ pset->mapped = 0;
+ }
+ }
+}
+
+static int dma_map_flow(struct tid_rdma_flow *flow, struct page **pages)
+{
+ int i;
+ struct hfi1_devdata *dd = flow->req->rcd->dd;
+ struct tid_rdma_pageset *pset;
+
+ for (i = 0, pset = &flow->pagesets[0]; i < flow->npagesets;
+ i++, pset++) {
+ if (pset->count) {
+ pset->addr = dma_map_page(&dd->pcidev->dev,
+ pages[pset->idx],
+ 0,
+ PAGE_SIZE * pset->count,
+ DMA_FROM_DEVICE);
+
+ if (dma_mapping_error(&dd->pcidev->dev, pset->addr)) {
+ dma_unmap_flow(flow);
+ return -ENOMEM;
+ }
+ pset->mapped = 1;
+ }
+ }
+ return 0;
+}
+
+static inline bool dma_mapped(struct tid_rdma_flow *flow)
+{
+ return !!flow->pagesets[0].mapped;
+}
+
+/*
+ * Get pages pointers and identify contiguous physical memory chunks for a
+ * segment. All segments are of length flow->req->seg_len.
+ */
+static int kern_get_phys_blocks(struct tid_rdma_flow *flow,
+ struct page **pages,
+ struct rvt_sge_state *ss, bool *last)
+{
+ u8 npages;
+
+ /* Reuse previously computed pagesets, if any */
+ if (flow->npagesets) {
+ trace_hfi1_tid_flow_alloc(flow->req->qp, flow->req->setup_head,
+ flow);
+ if (!dma_mapped(flow))
+ return dma_map_flow(flow, pages);
+ return 0;
+ }
+
+ npages = kern_find_pages(flow, pages, ss, last);
+
+ if (flow->req->qp->pmtu == enum_to_mtu(OPA_MTU_4096))
+ flow->npagesets =
+ tid_rdma_find_phys_blocks_4k(flow, pages, npages,
+ flow->pagesets);
+ else
+ flow->npagesets =
+ tid_rdma_find_phys_blocks_8k(flow, pages, npages,
+ flow->pagesets);
+
+ return dma_map_flow(flow, pages);
+}
+
+static inline void kern_add_tid_node(struct tid_rdma_flow *flow,
+ struct hfi1_ctxtdata *rcd, char *s,
+ struct tid_group *grp, u8 cnt)
+{
+ struct kern_tid_node *node = &flow->tnode[flow->tnode_cnt++];
+
+ WARN_ON_ONCE(flow->tnode_cnt >=
+ (TID_RDMA_MAX_SEGMENT_SIZE >> PAGE_SHIFT));
+ if (WARN_ON_ONCE(cnt & 1))
+ dd_dev_err(rcd->dd,
+ "unexpected odd allocation cnt %u map 0x%x used %u",
+ cnt, grp->map, grp->used);
+
+ node->grp = grp;
+ node->map = grp->map;
+ node->cnt = cnt;
+ trace_hfi1_tid_node_add(flow->req->qp, s, flow->tnode_cnt - 1,
+ grp->base, grp->map, grp->used, cnt);
+}
+
+/*
+ * Try to allocate pageset_count TID's from TID groups for a context
+ *
+ * This function allocates TID's without moving groups between lists or
+ * modifying grp->map. This is done as follows, being cogizant of the lists
+ * between which the TID groups will move:
+ * 1. First allocate complete groups of 8 TID's since this is more efficient,
+ * these groups will move from group->full without affecting used
+ * 2. If more TID's are needed allocate from used (will move from used->full or
+ * stay in used)
+ * 3. If we still don't have the required number of TID's go back and look again
+ * at a complete group (will move from group->used)
+ */
+static int kern_alloc_tids(struct tid_rdma_flow *flow)
+{
+ struct hfi1_ctxtdata *rcd = flow->req->rcd;
+ struct hfi1_devdata *dd = rcd->dd;
+ u32 ngroups, pageidx = 0;
+ struct tid_group *group = NULL, *used;
+ u8 use;
+
+ flow->tnode_cnt = 0;
+ ngroups = flow->npagesets / dd->rcv_entries.group_size;
+ if (!ngroups)
+ goto used_list;
+
+ /* First look at complete groups */
+ list_for_each_entry(group, &rcd->tid_group_list.list, list) {
+ kern_add_tid_node(flow, rcd, "complete groups", group,
+ group->size);
+
+ pageidx += group->size;
+ if (!--ngroups)
+ break;
+ }
+
+ if (pageidx >= flow->npagesets)
+ goto ok;
+
+used_list:
+ /* Now look at partially used groups */
+ list_for_each_entry(used, &rcd->tid_used_list.list, list) {
+ use = min_t(u32, flow->npagesets - pageidx,
+ used->size - used->used);
+ kern_add_tid_node(flow, rcd, "used groups", used, use);
+
+ pageidx += use;
+ if (pageidx >= flow->npagesets)
+ goto ok;
+ }
+
+ /*
+ * Look again at a complete group, continuing from where we left.
+ * However, if we are at the head, we have reached the end of the
+ * complete groups list from the first loop above
+ */
+ if (group && &group->list == &rcd->tid_group_list.list)
+ goto bail_eagain;
+ group = list_prepare_entry(group, &rcd->tid_group_list.list,
+ list);
+ if (list_is_last(&group->list, &rcd->tid_group_list.list))
+ goto bail_eagain;
+ group = list_next_entry(group, list);
+ use = min_t(u32, flow->npagesets - pageidx, group->size);
+ kern_add_tid_node(flow, rcd, "complete continue", group, use);
+ pageidx += use;
+ if (pageidx >= flow->npagesets)
+ goto ok;
+bail_eagain:
+ trace_hfi1_msg_alloc_tids(flow->req->qp, " insufficient tids: needed ",
+ (u64)flow->npagesets);
+ return -EAGAIN;
+ok:
+ return 0;
+}
+
+static void kern_program_rcv_group(struct tid_rdma_flow *flow, int grp_num,
+ u32 *pset_idx)
+{
+ struct hfi1_ctxtdata *rcd = flow->req->rcd;
+ struct hfi1_devdata *dd = rcd->dd;
+ struct kern_tid_node *node = &flow->tnode[grp_num];
+ struct tid_group *grp = node->grp;
+ struct tid_rdma_pageset *pset;
+ u32 pmtu_pg = flow->req->qp->pmtu >> PAGE_SHIFT;
+ u32 rcventry, npages = 0, pair = 0, tidctrl;
+ u8 i, cnt = 0;
+
+ for (i = 0; i < grp->size; i++) {
+ rcventry = grp->base + i;
+
+ if (node->map & BIT(i) || cnt >= node->cnt) {
+ rcv_array_wc_fill(dd, rcventry);
+ continue;
+ }
+ pset = &flow->pagesets[(*pset_idx)++];
+ if (pset->count) {
+ hfi1_put_tid(dd, rcventry, PT_EXPECTED,
+ pset->addr, trdma_pset_order(pset));
+ } else {
+ hfi1_put_tid(dd, rcventry, PT_INVALID, 0, 0);
+ }
+ npages += pset->count;
+
+ rcventry -= rcd->expected_base;
+ tidctrl = pair ? 0x3 : rcventry & 0x1 ? 0x2 : 0x1;
+ /*
+ * A single TID entry will be used to use a rcvarr pair (with
+ * tidctrl 0x3), if ALL these are true (a) the bit pos is even
+ * (b) the group map shows current and the next bits as free
+ * indicating two consecutive rcvarry entries are available (c)
+ * we actually need 2 more entries
+ */
+ pair = !(i & 0x1) && !((node->map >> i) & 0x3) &&
+ node->cnt >= cnt + 2;
+ if (!pair) {
+ if (!pset->count)
+ tidctrl = 0x1;
+ flow->tid_entry[flow->tidcnt++] =
+ EXP_TID_SET(IDX, rcventry >> 1) |
+ EXP_TID_SET(CTRL, tidctrl) |
+ EXP_TID_SET(LEN, npages);
+ trace_hfi1_tid_entry_alloc(/* entry */
+ flow->req->qp, flow->tidcnt - 1,
+ flow->tid_entry[flow->tidcnt - 1]);
+
+ /* Efficient DIV_ROUND_UP(npages, pmtu_pg) */
+ flow->npkts += (npages + pmtu_pg - 1) >> ilog2(pmtu_pg);
+ npages = 0;
+ }
+
+ if (grp->used == grp->size - 1)
+ tid_group_move(grp, &rcd->tid_used_list,
+ &rcd->tid_full_list);
+ else if (!grp->used)
+ tid_group_move(grp, &rcd->tid_group_list,
+ &rcd->tid_used_list);
+
+ grp->used++;
+ grp->map |= BIT(i);
+ cnt++;
+ }
+}
+
+static void kern_unprogram_rcv_group(struct tid_rdma_flow *flow, int grp_num)
+{
+ struct hfi1_ctxtdata *rcd = flow->req->rcd;
+ struct hfi1_devdata *dd = rcd->dd;
+ struct kern_tid_node *node = &flow->tnode[grp_num];
+ struct tid_group *grp = node->grp;
+ u32 rcventry;
+ u8 i, cnt = 0;
+
+ for (i = 0; i < grp->size; i++) {
+ rcventry = grp->base + i;
+
+ if (node->map & BIT(i) || cnt >= node->cnt) {
+ rcv_array_wc_fill(dd, rcventry);
+ continue;
+ }
+
+ hfi1_put_tid(dd, rcventry, PT_INVALID, 0, 0);
+
+ grp->used--;
+ grp->map &= ~BIT(i);
+ cnt++;
+
+ if (grp->used == grp->size - 1)
+ tid_group_move(grp, &rcd->tid_full_list,
+ &rcd->tid_used_list);
+ else if (!grp->used)
+ tid_group_move(grp, &rcd->tid_used_list,
+ &rcd->tid_group_list);
+ }
+ if (WARN_ON_ONCE(cnt & 1)) {
+ struct hfi1_ctxtdata *rcd = flow->req->rcd;
+ struct hfi1_devdata *dd = rcd->dd;
+
+ dd_dev_err(dd, "unexpected odd free cnt %u map 0x%x used %u",
+ cnt, grp->map, grp->used);
+ }
+}
+
+static void kern_program_rcvarray(struct tid_rdma_flow *flow)
+{
+ u32 pset_idx = 0;
+ int i;
+
+ flow->npkts = 0;
+ flow->tidcnt = 0;
+ for (i = 0; i < flow->tnode_cnt; i++)
+ kern_program_rcv_group(flow, i, &pset_idx);
+ trace_hfi1_tid_flow_alloc(flow->req->qp, flow->req->setup_head, flow);
+}
+
+/**
+ * hfi1_kern_exp_rcv_setup() - setup TID's and flow for one segment of a
+ * TID RDMA request
+ *
+ * @req: TID RDMA request for which the segment/flow is being set up
+ * @ss: sge state, maintains state across successive segments of a sge
+ * @last: set to true after the last sge segment has been processed
+ *
+ * This function
+ * (1) finds a free flow entry in the flow circular buffer
+ * (2) finds pages and continuous physical chunks constituing one segment
+ * of an sge
+ * (3) allocates TID group entries for those chunks
+ * (4) programs rcvarray entries in the hardware corresponding to those
+ * TID's
+ * (5) computes a tidarray with formatted TID entries which can be sent
+ * to the sender
+ * (6) Reserves and programs HW flows.
+ * (7) It also manages queing the QP when TID/flow resources are not
+ * available.
+ *
+ * @req points to struct tid_rdma_request of which the segments are a part. The
+ * function uses qp, rcd and seg_len members of @req. In the absence of errors,
+ * req->flow_idx is the index of the flow which has been prepared in this
+ * invocation of function call. With flow = &req->flows[req->flow_idx],
+ * flow->tid_entry contains the TID array which the sender can use for TID RDMA
+ * sends and flow->npkts contains number of packets required to send the
+ * segment.
+ *
+ * hfi1_check_sge_align should be called prior to calling this function and if
+ * it signals error TID RDMA cannot be used for this sge and this function
+ * should not be called.
+ *
+ * For the queuing, caller must hold the flow->req->qp s_lock from the send
+ * engine and the function will procure the exp_lock.
+ *
+ * Return:
+ * The function returns -EAGAIN if sufficient number of TID/flow resources to
+ * map the segment could not be allocated. In this case the function should be
+ * called again with previous arguments to retry the TID allocation. There are
+ * no other error returns. The function returns 0 on success.
+ */
+int hfi1_kern_exp_rcv_setup(struct tid_rdma_request *req,
+ struct rvt_sge_state *ss, bool *last)
+ __must_hold(&req->qp->s_lock)
+{
+ struct tid_rdma_flow *flow = &req->flows[req->setup_head];
+ struct hfi1_ctxtdata *rcd = req->rcd;
+ struct hfi1_qp_priv *qpriv = req->qp->priv;
+ unsigned long flags;
+ struct rvt_qp *fqp;
+ u16 clear_tail = req->clear_tail;
+
+ lockdep_assert_held(&req->qp->s_lock);
+ /*
+ * We return error if either (a) we don't have space in the flow
+ * circular buffer, or (b) we already have max entries in the buffer.
+ * Max entries depend on the type of request we are processing and the
+ * negotiated TID RDMA parameters.
+ */
+ if (!CIRC_SPACE(req->setup_head, clear_tail, MAX_FLOWS) ||
+ CIRC_CNT(req->setup_head, clear_tail, MAX_FLOWS) >=
+ req->n_flows)
+ return -EINVAL;
+
+ /*
+ * Get pages, identify contiguous physical memory chunks for the segment
+ * If we can not determine a DMA address mapping we will treat it just
+ * like if we ran out of space above.
+ */
+ if (kern_get_phys_blocks(flow, qpriv->pages, ss, last)) {
+ hfi1_wait_kmem(flow->req->qp);
+ return -ENOMEM;
+ }
+
+ spin_lock_irqsave(&rcd->exp_lock, flags);
+ if (kernel_tid_waiters(rcd, &rcd->rarr_queue, flow->req->qp))
+ goto queue;
+
+ /*
+ * At this point we know the number of pagesets and hence the number of
+ * TID's to map the segment. Allocate the TID's from the TID groups. If
+ * we cannot allocate the required number we exit and try again later
+ */
+ if (kern_alloc_tids(flow))
+ goto queue;
+ /*
+ * Finally program the TID entries with the pagesets, compute the
+ * tidarray and enable the HW flow
+ */
+ kern_program_rcvarray(flow);
+
+ /*
+ * Setup the flow state with relevant information.
+ * This information is used for tracking the sequence of data packets
+ * for the segment.
+ * The flow is setup here as this is the most accurate time and place
+ * to do so. Doing at a later time runs the risk of the flow data in
+ * qpriv getting out of sync.
+ */
+ memset(&flow->flow_state, 0x0, sizeof(flow->flow_state));
+ flow->idx = qpriv->flow_state.index;
+ flow->flow_state.generation = qpriv->flow_state.generation;
+ flow->flow_state.spsn = qpriv->flow_state.psn;
+ flow->flow_state.lpsn = flow->flow_state.spsn + flow->npkts - 1;
+ flow->flow_state.r_next_psn =
+ full_flow_psn(flow, flow->flow_state.spsn);
+ qpriv->flow_state.psn += flow->npkts;
+
+ dequeue_tid_waiter(rcd, &rcd->rarr_queue, flow->req->qp);
+ /* get head before dropping lock */
+ fqp = first_qp(rcd, &rcd->rarr_queue);
+ spin_unlock_irqrestore(&rcd->exp_lock, flags);
+ tid_rdma_schedule_tid_wakeup(fqp);
+
+ req->setup_head = (req->setup_head + 1) & (MAX_FLOWS - 1);
+ return 0;
+queue:
+ queue_qp_for_tid_wait(rcd, &rcd->rarr_queue, flow->req->qp);
+ spin_unlock_irqrestore(&rcd->exp_lock, flags);
+ return -EAGAIN;
+}
+
+static void hfi1_tid_rdma_reset_flow(struct tid_rdma_flow *flow)
+{
+ flow->npagesets = 0;
+}
+
+/*
+ * This function is called after one segment has been successfully sent to
+ * release the flow and TID HW/SW resources for that segment. The segments for a
+ * TID RDMA request are setup and cleared in FIFO order which is managed using a
+ * circular buffer.
+ */
+int hfi1_kern_exp_rcv_clear(struct tid_rdma_request *req)
+ __must_hold(&req->qp->s_lock)
+{
+ struct tid_rdma_flow *flow = &req->flows[req->clear_tail];
+ struct hfi1_ctxtdata *rcd = req->rcd;
+ unsigned long flags;
+ int i;
+ struct rvt_qp *fqp;
+
+ lockdep_assert_held(&req->qp->s_lock);
+ /* Exit if we have nothing in the flow circular buffer */
+ if (!CIRC_CNT(req->setup_head, req->clear_tail, MAX_FLOWS))
+ return -EINVAL;
+
+ spin_lock_irqsave(&rcd->exp_lock, flags);
+
+ for (i = 0; i < flow->tnode_cnt; i++)
+ kern_unprogram_rcv_group(flow, i);
+ /* To prevent double unprogramming */
+ flow->tnode_cnt = 0;
+ /* get head before dropping lock */
+ fqp = first_qp(rcd, &rcd->rarr_queue);
+ spin_unlock_irqrestore(&rcd->exp_lock, flags);
+
+ dma_unmap_flow(flow);
+
+ hfi1_tid_rdma_reset_flow(flow);
+ req->clear_tail = (req->clear_tail + 1) & (MAX_FLOWS - 1);
+
+ if (fqp == req->qp) {
+ __trigger_tid_waiter(fqp);
+ rvt_put_qp(fqp);
+ } else {
+ tid_rdma_schedule_tid_wakeup(fqp);
+ }
+
+ return 0;
+}
+
+/*
+ * This function is called to release all the tid entries for
+ * a request.
+ */
+void hfi1_kern_exp_rcv_clear_all(struct tid_rdma_request *req)
+ __must_hold(&req->qp->s_lock)
+{
+ /* Use memory barrier for proper ordering */
+ while (CIRC_CNT(req->setup_head, req->clear_tail, MAX_FLOWS)) {
+ if (hfi1_kern_exp_rcv_clear(req))
+ break;
+ }
+}
+
+/**
+ * hfi1_kern_exp_rcv_free_flows - free priviously allocated flow information
+ * @req - the tid rdma request to be cleaned
+ */
+static void hfi1_kern_exp_rcv_free_flows(struct tid_rdma_request *req)
+{
+ kfree(req->flows);
+ req->flows = NULL;
+}
+
+/**
+ * __trdma_clean_swqe - clean up for large sized QPs
+ * @qp: the queue patch
+ * @wqe: the send wqe
+ */
+void __trdma_clean_swqe(struct rvt_qp *qp, struct rvt_swqe *wqe)
+{
+ struct hfi1_swqe_priv *p = wqe->priv;
+
+ hfi1_kern_exp_rcv_free_flows(&p->tid_req);
+}
+
+/*
+ * This can be called at QP create time or in the data path.
+ */
+static int hfi1_kern_exp_rcv_alloc_flows(struct tid_rdma_request *req,
+ gfp_t gfp)
+{
+ struct tid_rdma_flow *flows;
+ int i;
+
+ if (likely(req->flows))
+ return 0;
+ flows = kmalloc_node(MAX_FLOWS * sizeof(*flows), gfp,
+ req->rcd->numa_id);
+ if (!flows)
+ return -ENOMEM;
+ /* mini init */
+ for (i = 0; i < MAX_FLOWS; i++) {
+ flows[i].req = req;
+ flows[i].npagesets = 0;
+ flows[i].pagesets[0].mapped = 0;
+ }
+ req->flows = flows;
+ return 0;
+}
+
+static void hfi1_init_trdma_req(struct rvt_qp *qp,
+ struct tid_rdma_request *req)
+{
+ struct hfi1_qp_priv *qpriv = qp->priv;
+
+ /*
+ * Initialize various TID RDMA request variables.
+ * These variables are "static", which is why they
+ * can be pre-initialized here before the WRs has
+ * even been submitted.
+ * However, non-NULL values for these variables do not
+ * imply that this WQE has been enabled for TID RDMA.
+ * Drivers should check the WQE's opcode to determine
+ * if a request is a TID RDMA one or not.
+ */
+ req->qp = qp;
+ req->rcd = qpriv->rcd;
+}
+
+u64 hfi1_access_sw_tid_wait(const struct cntr_entry *entry,
+ void *context, int vl, int mode, u64 data)
+{
+ struct hfi1_devdata *dd = context;
+
+ return dd->verbs_dev.n_tidwait;
+}
+
+static struct tid_rdma_flow *find_flow_ib(struct tid_rdma_request *req,
+ u32 psn, u16 *fidx)
+{
+ u16 head, tail;
+ struct tid_rdma_flow *flow;
+
+ head = req->setup_head;
+ tail = req->clear_tail;
+ for ( ; CIRC_CNT(head, tail, MAX_FLOWS);
+ tail = CIRC_NEXT(tail, MAX_FLOWS)) {
+ flow = &req->flows[tail];
+ if (cmp_psn(psn, flow->flow_state.ib_spsn) >= 0 &&
+ cmp_psn(psn, flow->flow_state.ib_lpsn) <= 0) {
+ if (fidx)
+ *fidx = tail;
+ return flow;
+ }
+ }
+ return NULL;
+}
+
+static struct tid_rdma_flow *
+__find_flow_ranged(struct tid_rdma_request *req, u16 head, u16 tail,
+ u32 psn, u16 *fidx)
+{
+ for ( ; CIRC_CNT(head, tail, MAX_FLOWS);
+ tail = CIRC_NEXT(tail, MAX_FLOWS)) {
+ struct tid_rdma_flow *flow = &req->flows[tail];
+ u32 spsn, lpsn;
+
+ spsn = full_flow_psn(flow, flow->flow_state.spsn);
+ lpsn = full_flow_psn(flow, flow->flow_state.lpsn);
+
+ if (cmp_psn(psn, spsn) >= 0 && cmp_psn(psn, lpsn) <= 0) {
+ if (fidx)
+ *fidx = tail;
+ return flow;
+ }
+ }
+ return NULL;
+}
+
+static struct tid_rdma_flow *find_flow(struct tid_rdma_request *req,
+ u32 psn, u16 *fidx)
+{
+ return __find_flow_ranged(req, req->setup_head, req->clear_tail, psn,
+ fidx);
+}
+
+/* TID RDMA READ functions */
+u32 hfi1_build_tid_rdma_read_packet(struct rvt_swqe *wqe,
+ struct ib_other_headers *ohdr, u32 *bth1,
+ u32 *bth2, u32 *len)
+{
+ struct tid_rdma_request *req = wqe_to_tid_req(wqe);
+ struct tid_rdma_flow *flow = &req->flows[req->flow_idx];
+ struct rvt_qp *qp = req->qp;
+ struct hfi1_qp_priv *qpriv = qp->priv;
+ struct hfi1_swqe_priv *wpriv = wqe->priv;
+ struct tid_rdma_read_req *rreq = &ohdr->u.tid_rdma.r_req;
+ struct tid_rdma_params *remote;
+ u32 req_len = 0;
+ void *req_addr = NULL;
+
+ /* This is the IB psn used to send the request */
+ *bth2 = mask_psn(flow->flow_state.ib_spsn + flow->pkt);
+ trace_hfi1_tid_flow_build_read_pkt(qp, req->flow_idx, flow);
+
+ /* TID Entries for TID RDMA READ payload */
+ req_addr = &flow->tid_entry[flow->tid_idx];
+ req_len = sizeof(*flow->tid_entry) *
+ (flow->tidcnt - flow->tid_idx);
+
+ memset(&ohdr->u.tid_rdma.r_req, 0, sizeof(ohdr->u.tid_rdma.r_req));
+ wpriv->ss.sge.vaddr = req_addr;
+ wpriv->ss.sge.sge_length = req_len;
+ wpriv->ss.sge.length = wpriv->ss.sge.sge_length;
+ /*
+ * We can safely zero these out. Since the first SGE covers the
+ * entire packet, nothing else should even look at the MR.
+ */
+ wpriv->ss.sge.mr = NULL;
+ wpriv->ss.sge.m = 0;
+ wpriv->ss.sge.n = 0;
+
+ wpriv->ss.sg_list = NULL;
+ wpriv->ss.total_len = wpriv->ss.sge.sge_length;
+ wpriv->ss.num_sge = 1;
+
+ /* Construct the TID RDMA READ REQ packet header */
+ rcu_read_lock();
+ remote = rcu_dereference(qpriv->tid_rdma.remote);
+
+ KDETH_RESET(rreq->kdeth0, KVER, 0x1);
+ KDETH_RESET(rreq->kdeth1, JKEY, remote->jkey);
+ rreq->reth.vaddr = cpu_to_be64(wqe->rdma_wr.remote_addr +
+ req->cur_seg * req->seg_len + flow->sent);
+ rreq->reth.rkey = cpu_to_be32(wqe->rdma_wr.rkey);
+ rreq->reth.length = cpu_to_be32(*len);
+ rreq->tid_flow_psn =
+ cpu_to_be32((flow->flow_state.generation <<
+ HFI1_KDETH_BTH_SEQ_SHIFT) |
+ ((flow->flow_state.spsn + flow->pkt) &
+ HFI1_KDETH_BTH_SEQ_MASK));
+ rreq->tid_flow_qp =
+ cpu_to_be32(qpriv->tid_rdma.local.qp |
+ ((flow->idx & TID_RDMA_DESTQP_FLOW_MASK) <<
+ TID_RDMA_DESTQP_FLOW_SHIFT) |
+ qpriv->rcd->ctxt);
+ rreq->verbs_qp = cpu_to_be32(qp->remote_qpn);
+ *bth1 &= ~RVT_QPN_MASK;
+ *bth1 |= remote->qp;
+ *bth2 |= IB_BTH_REQ_ACK;
+ rcu_read_unlock();
+
+ /* We are done with this segment */
+ flow->sent += *len;
+ req->cur_seg++;
+ qp->s_state = TID_OP(READ_REQ);
+ req->ack_pending++;
+ req->flow_idx = (req->flow_idx + 1) & (MAX_FLOWS - 1);
+ qpriv->pending_tid_r_segs++;
+ qp->s_num_rd_atomic++;
+
+ /* Set the TID RDMA READ request payload size */
+ *len = req_len;
+
+ return sizeof(ohdr->u.tid_rdma.r_req) / sizeof(u32);
+}
+
+/*
+ * @len: contains the data length to read upon entry and the read request
+ * payload length upon exit.
+ */
+u32 hfi1_build_tid_rdma_read_req(struct rvt_qp *qp, struct rvt_swqe *wqe,
+ struct ib_other_headers *ohdr, u32 *bth1,
+ u32 *bth2, u32 *len)
+ __must_hold(&qp->s_lock)
+{
+ struct hfi1_qp_priv *qpriv = qp->priv;
+ struct tid_rdma_request *req = wqe_to_tid_req(wqe);
+ struct tid_rdma_flow *flow = NULL;
+ u32 hdwords = 0;
+ bool last;
+ bool retry = true;
+ u32 npkts = rvt_div_round_up_mtu(qp, *len);
+
+ trace_hfi1_tid_req_build_read_req(qp, 0, wqe->wr.opcode, wqe->psn,
+ wqe->lpsn, req);
+ /*
+ * Check sync conditions. Make sure that there are no pending
+ * segments before freeing the flow.
+ */
+sync_check:
+ if (req->state == TID_REQUEST_SYNC) {
+ if (qpriv->pending_tid_r_segs)
+ goto done;
+
+ hfi1_kern_clear_hw_flow(req->rcd, qp);
+ req->state = TID_REQUEST_ACTIVE;
+ }
+
+ /*
+ * If the request for this segment is resent, the tid resources should
+ * have been allocated before. In this case, req->flow_idx should
+ * fall behind req->setup_head.
+ */
+ if (req->flow_idx == req->setup_head) {
+ retry = false;
+ if (req->state == TID_REQUEST_RESEND) {
+ /*
+ * This is the first new segment for a request whose
+ * earlier segments have been re-sent. We need to
+ * set up the sge pointer correctly.
+ */
+ restart_sge(&qp->s_sge, wqe, req->s_next_psn,
+ qp->pmtu);
+ req->isge = 0;
+ req->state = TID_REQUEST_ACTIVE;
+ }
+
+ /*
+ * Check sync. The last PSN of each generation is reserved for
+ * RESYNC.
+ */
+ if ((qpriv->flow_state.psn + npkts) > MAX_TID_FLOW_PSN - 1) {
+ req->state = TID_REQUEST_SYNC;
+ goto sync_check;
+ }
+
+ /* Allocate the flow if not yet */
+ if (hfi1_kern_setup_hw_flow(qpriv->rcd, qp))
+ goto done;
+
+ /*
+ * The following call will advance req->setup_head after
+ * allocating the tid entries.
+ */
+ if (hfi1_kern_exp_rcv_setup(req, &qp->s_sge, &last)) {
+ req->state = TID_REQUEST_QUEUED;
+
+ /*
+ * We don't have resources for this segment. The QP has
+ * already been queued.
+ */
+ goto done;
+ }
+ }
+
+ /* req->flow_idx should only be one slot behind req->setup_head */
+ flow = &req->flows[req->flow_idx];
+ flow->pkt = 0;
+ flow->tid_idx = 0;
+ flow->sent = 0;
+ if (!retry) {
+ /* Set the first and last IB PSN for the flow in use.*/
+ flow->flow_state.ib_spsn = req->s_next_psn;
+ flow->flow_state.ib_lpsn =
+ flow->flow_state.ib_spsn + flow->npkts - 1;
+ }
+
+ /* Calculate the next segment start psn.*/
+ req->s_next_psn += flow->npkts;
+
+ /* Build the packet header */
+ hdwords = hfi1_build_tid_rdma_read_packet(wqe, ohdr, bth1, bth2, len);
+done:
+ return hdwords;
+}
+
+/*
+ * Validate and accept the TID RDMA READ request parameters.
+ * Return 0 if the request is accepted successfully;
+ * Return 1 otherwise.
+ */
+static int tid_rdma_rcv_read_request(struct rvt_qp *qp,
+ struct rvt_ack_entry *e,
+ struct hfi1_packet *packet,
+ struct ib_other_headers *ohdr,
+ u32 bth0, u32 psn, u64 vaddr, u32 len)
+{
+ struct hfi1_qp_priv *qpriv = qp->priv;
+ struct tid_rdma_request *req;
+ struct tid_rdma_flow *flow;
+ u32 flow_psn, i, tidlen = 0, pktlen, tlen;
+
+ req = ack_to_tid_req(e);
+
+ /* Validate the payload first */
+ flow = &req->flows[req->setup_head];
+
+ /* payload length = packet length - (header length + ICRC length) */
+ pktlen = packet->tlen - (packet->hlen + 4);
+ if (pktlen > sizeof(flow->tid_entry))
+ return 1;
+ memcpy(flow->tid_entry, packet->ebuf, pktlen);
+ flow->tidcnt = pktlen / sizeof(*flow->tid_entry);
+
+ /*
+ * Walk the TID_ENTRY list to make sure we have enough space for a
+ * complete segment. Also calculate the number of required packets.
+ */
+ flow->npkts = rvt_div_round_up_mtu(qp, len);
+ for (i = 0; i < flow->tidcnt; i++) {
+ trace_hfi1_tid_entry_rcv_read_req(qp, i,
+ flow->tid_entry[i]);
+ tlen = EXP_TID_GET(flow->tid_entry[i], LEN);
+ if (!tlen)
+ return 1;
+
+ /*
+ * For tid pair (tidctr == 3), the buffer size of the pair
+ * should be the sum of the buffer size described by each
+ * tid entry. However, only the first entry needs to be
+ * specified in the request (see WFR HAS Section 8.5.7.1).
+ */
+ tidlen += tlen;
+ }
+ if (tidlen * PAGE_SIZE < len)
+ return 1;
+
+ /* Empty the flow array */
+ req->clear_tail = req->setup_head;
+ flow->pkt = 0;
+ flow->tid_idx = 0;
+ flow->tid_offset = 0;
+ flow->sent = 0;
+ flow->tid_qpn = be32_to_cpu(ohdr->u.tid_rdma.r_req.tid_flow_qp);
+ flow->idx = (flow->tid_qpn >> TID_RDMA_DESTQP_FLOW_SHIFT) &
+ TID_RDMA_DESTQP_FLOW_MASK;
+ flow_psn = mask_psn(be32_to_cpu(ohdr->u.tid_rdma.r_req.tid_flow_psn));
+ flow->flow_state.generation = flow_psn >> HFI1_KDETH_BTH_SEQ_SHIFT;
+ flow->flow_state.spsn = flow_psn & HFI1_KDETH_BTH_SEQ_MASK;
+ flow->length = len;
+
+ flow->flow_state.lpsn = flow->flow_state.spsn +
+ flow->npkts - 1;
+ flow->flow_state.ib_spsn = psn;
+ flow->flow_state.ib_lpsn = flow->flow_state.ib_spsn + flow->npkts - 1;
+
+ trace_hfi1_tid_flow_rcv_read_req(qp, req->setup_head, flow);
+ /* Set the initial flow index to the current flow. */
+ req->flow_idx = req->setup_head;
+
+ /* advance circular buffer head */
+ req->setup_head = (req->setup_head + 1) & (MAX_FLOWS - 1);
+
+ /*
+ * Compute last PSN for request.
+ */
+ e->opcode = (bth0 >> 24) & 0xff;
+ e->psn = psn;
+ e->lpsn = psn + flow->npkts - 1;
+ e->sent = 0;
+
+ req->n_flows = qpriv->tid_rdma.local.max_read;
+ req->state = TID_REQUEST_ACTIVE;
+ req->cur_seg = 0;
+ req->comp_seg = 0;
+ req->ack_seg = 0;
+ req->isge = 0;
+ req->seg_len = qpriv->tid_rdma.local.max_len;
+ req->total_len = len;
+ req->total_segs = 1;
+ req->r_flow_psn = e->psn;
+
+ trace_hfi1_tid_req_rcv_read_req(qp, 0, e->opcode, e->psn, e->lpsn,
+ req);
+ return 0;
+}
+
+static int tid_rdma_rcv_error(struct hfi1_packet *packet,
+ struct ib_other_headers *ohdr,
+ struct rvt_qp *qp, u32 psn, int diff)
+{
+ struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
+ struct hfi1_ctxtdata *rcd = ((struct hfi1_qp_priv *)qp->priv)->rcd;
+ struct hfi1_ibdev *dev = to_idev(qp->ibqp.device);
+ struct hfi1_qp_priv *qpriv = qp->priv;
+ struct rvt_ack_entry *e;
+ struct tid_rdma_request *req;
+ unsigned long flags;
+ u8 prev;
+ bool old_req;
+
+ trace_hfi1_rsp_tid_rcv_error(qp, psn);
+ trace_hfi1_tid_rdma_rcv_err(qp, 0, psn, diff);
+ if (diff > 0) {
+ /* sequence error */
+ if (!qp->r_nak_state) {
+ ibp->rvp.n_rc_seqnak++;
+ qp->r_nak_state = IB_NAK_PSN_ERROR;
+ qp->r_ack_psn = qp->r_psn;
+ rc_defered_ack(rcd, qp);
+ }
+ goto done;
+ }
+
+ ibp->rvp.n_rc_dupreq++;
+
+ spin_lock_irqsave(&qp->s_lock, flags);
+ e = find_prev_entry(qp, psn, &prev, NULL, &old_req);
+ if (!e || (e->opcode != TID_OP(READ_REQ) &&
+ e->opcode != TID_OP(WRITE_REQ)))
+ goto unlock;
+
+ req = ack_to_tid_req(e);
+ req->r_flow_psn = psn;
+ trace_hfi1_tid_req_rcv_err(qp, 0, e->opcode, e->psn, e->lpsn, req);
+ if (e->opcode == TID_OP(READ_REQ)) {
+ struct ib_reth *reth;
+ u32 offset;
+ u32 len;
+ u32 rkey;
+ u64 vaddr;
+ int ok;
+ u32 bth0;
+
+ reth = &ohdr->u.tid_rdma.r_req.reth;
+ /*
+ * The requester always restarts from the start of the original
+ * request.
+ */
+ offset = delta_psn(psn, e->psn) * qp->pmtu;
+ len = be32_to_cpu(reth->length);
+ if (psn != e->psn || len != req->total_len)
+ goto unlock;
+
+ if (e->rdma_sge.mr) {
+ rvt_put_mr(e->rdma_sge.mr);
+ e->rdma_sge.mr = NULL;
+ }
+
+ rkey = be32_to_cpu(reth->rkey);
+ vaddr = get_ib_reth_vaddr(reth);
+
+ qp->r_len = len;
+ ok = rvt_rkey_ok(qp, &e->rdma_sge, len, vaddr, rkey,
+ IB_ACCESS_REMOTE_READ);
+ if (unlikely(!ok))
+ goto unlock;
+
+ /*
+ * If all the response packets for the current request have
+ * been sent out and this request is complete (old_request
+ * == false) and the TID flow may be unusable (the
+ * req->clear_tail is advanced). However, when an earlier
+ * request is received, this request will not be complete any
+ * more (qp->s_tail_ack_queue is moved back, see below).
+ * Consequently, we need to update the TID flow info everytime
+ * a duplicate request is received.
+ */
+ bth0 = be32_to_cpu(ohdr->bth[0]);
+ if (tid_rdma_rcv_read_request(qp, e, packet, ohdr, bth0, psn,
+ vaddr, len))
+ goto unlock;
+
+ /*
+ * True if the request is already scheduled (between
+ * qp->s_tail_ack_queue and qp->r_head_ack_queue);
+ */
+ if (old_req)
+ goto unlock;
+ } else {
+ struct flow_state *fstate;
+ bool schedule = false;
+ u8 i;
+
+ if (req->state == TID_REQUEST_RESEND) {
+ req->state = TID_REQUEST_RESEND_ACTIVE;
+ } else if (req->state == TID_REQUEST_INIT_RESEND) {
+ req->state = TID_REQUEST_INIT;
+ schedule = true;
+ }
+
+ /*
+ * True if the request is already scheduled (between
+ * qp->s_tail_ack_queue and qp->r_head_ack_queue).
+ * Also, don't change requests, which are at the SYNC
+ * point and haven't generated any responses yet.
+ * There is nothing to retransmit for them yet.
+ */
+ if (old_req || req->state == TID_REQUEST_INIT ||
+ (req->state == TID_REQUEST_SYNC && !req->cur_seg)) {
+ for (i = prev + 1; ; i++) {
+ if (i > rvt_size_atomic(&dev->rdi))
+ i = 0;
+ if (i == qp->r_head_ack_queue)
+ break;
+ e = &qp->s_ack_queue[i];
+ req = ack_to_tid_req(e);
+ if (e->opcode == TID_OP(WRITE_REQ) &&
+ req->state == TID_REQUEST_INIT)
+ req->state = TID_REQUEST_INIT_RESEND;
+ }
+ /*
+ * If the state of the request has been changed,
+ * the first leg needs to get scheduled in order to
+ * pick up the change. Otherwise, normal response
+ * processing should take care of it.
+ */
+ if (!schedule)
+ goto unlock;
+ }
+
+ /*
+ * If there is no more allocated segment, just schedule the qp
+ * without changing any state.
+ */
+ if (req->clear_tail == req->setup_head)
+ goto schedule;
+ /*
+ * If this request has sent responses for segments, which have
+ * not received data yet (flow_idx != clear_tail), the flow_idx
+ * pointer needs to be adjusted so the same responses can be
+ * re-sent.
+ */
+ if (CIRC_CNT(req->flow_idx, req->clear_tail, MAX_FLOWS)) {
+ fstate = &req->flows[req->clear_tail].flow_state;
+ qpriv->pending_tid_w_segs -=
+ CIRC_CNT(req->flow_idx, req->clear_tail,
+ MAX_FLOWS);
+ req->flow_idx =
+ CIRC_ADD(req->clear_tail,
+ delta_psn(psn, fstate->resp_ib_psn),
+ MAX_FLOWS);
+ qpriv->pending_tid_w_segs +=
+ delta_psn(psn, fstate->resp_ib_psn);
+ /*
+ * When flow_idx == setup_head, we've gotten a duplicate
+ * request for a segment, which has not been allocated
+ * yet. In that case, don't adjust this request.
+ * However, we still want to go through the loop below
+ * to adjust all subsequent requests.
+ */
+ if (CIRC_CNT(req->setup_head, req->flow_idx,
+ MAX_FLOWS)) {
+ req->cur_seg = delta_psn(psn, e->psn);
+ req->state = TID_REQUEST_RESEND_ACTIVE;
+ }
+ }
+
+ for (i = prev + 1; ; i++) {
+ /*
+ * Look at everything up to and including
+ * s_tail_ack_queue
+ */
+ if (i > rvt_size_atomic(&dev->rdi))
+ i = 0;
+ if (i == qp->r_head_ack_queue)
+ break;
+ e = &qp->s_ack_queue[i];
+ req = ack_to_tid_req(e);
+ trace_hfi1_tid_req_rcv_err(qp, 0, e->opcode, e->psn,
+ e->lpsn, req);
+ if (e->opcode != TID_OP(WRITE_REQ) ||
+ req->cur_seg == req->comp_seg ||
+ req->state == TID_REQUEST_INIT ||
+ req->state == TID_REQUEST_INIT_RESEND) {
+ if (req->state == TID_REQUEST_INIT)
+ req->state = TID_REQUEST_INIT_RESEND;
+ continue;
+ }
+ qpriv->pending_tid_w_segs -=
+ CIRC_CNT(req->flow_idx,
+ req->clear_tail,
+ MAX_FLOWS);
+ req->flow_idx = req->clear_tail;
+ req->state = TID_REQUEST_RESEND;
+ req->cur_seg = req->comp_seg;
+ }
+ qpriv->s_flags &= ~HFI1_R_TID_WAIT_INTERLCK;
+ }
+ /* Re-process old requests.*/
+ if (qp->s_acked_ack_queue == qp->s_tail_ack_queue)
+ qp->s_acked_ack_queue = prev;
+ qp->s_tail_ack_queue = prev;
+ /*
+ * Since the qp->s_tail_ack_queue is modified, the
+ * qp->s_ack_state must be changed to re-initialize
+ * qp->s_ack_rdma_sge; Otherwise, we will end up in
+ * wrong memory region.
+ */
+ qp->s_ack_state = OP(ACKNOWLEDGE);
+schedule:
+ /*
+ * It's possible to receive a retry psn that is earlier than an RNRNAK
+ * psn. In this case, the rnrnak state should be cleared.
+ */
+ if (qpriv->rnr_nak_state) {
+ qp->s_nak_state = 0;
+ qpriv->rnr_nak_state = TID_RNR_NAK_INIT;
+ qp->r_psn = e->lpsn + 1;
+ hfi1_tid_write_alloc_resources(qp, true);
+ }
+
+ qp->r_state = e->opcode;
+ qp->r_nak_state = 0;
+ qp->s_flags |= RVT_S_RESP_PENDING;
+ hfi1_schedule_send(qp);
+unlock:
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+done:
+ return 1;
+}
+
+void hfi1_rc_rcv_tid_rdma_read_req(struct hfi1_packet *packet)
+{
+ /* HANDLER FOR TID RDMA READ REQUEST packet (Responder side)*/
+
+ /*
+ * 1. Verify TID RDMA READ REQ as per IB_OPCODE_RC_RDMA_READ
+ * (see hfi1_rc_rcv())
+ * 2. Put TID RDMA READ REQ into the response queueu (s_ack_queue)
+ * - Setup struct tid_rdma_req with request info
+ * - Initialize struct tid_rdma_flow info;
+ * - Copy TID entries;
+ * 3. Set the qp->s_ack_state.
+ * 4. Set RVT_S_RESP_PENDING in s_flags.
+ * 5. Kick the send engine (hfi1_schedule_send())
+ */
+ struct hfi1_ctxtdata *rcd = packet->rcd;
+ struct rvt_qp *qp = packet->qp;
+ struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
+ struct ib_other_headers *ohdr = packet->ohdr;
+ struct rvt_ack_entry *e;
+ unsigned long flags;
+ struct ib_reth *reth;
+ struct hfi1_qp_priv *qpriv = qp->priv;
+ u32 bth0, psn, len, rkey;
+ bool is_fecn;
+ u8 next;
+ u64 vaddr;
+ int diff;
+ u8 nack_state = IB_NAK_INVALID_REQUEST;
+
+ bth0 = be32_to_cpu(ohdr->bth[0]);
+ if (hfi1_ruc_check_hdr(ibp, packet))
+ return;
+
+ is_fecn = process_ecn(qp, packet);
+ psn = mask_psn(be32_to_cpu(ohdr->bth[2]));
+ trace_hfi1_rsp_rcv_tid_read_req(qp, psn);
+
+ if (qp->state == IB_QPS_RTR && !(qp->r_flags & RVT_R_COMM_EST))
+ rvt_comm_est(qp);
+
+ if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_READ)))
+ goto nack_inv;
+
+ reth = &ohdr->u.tid_rdma.r_req.reth;
+ vaddr = be64_to_cpu(reth->vaddr);
+ len = be32_to_cpu(reth->length);
+ /* The length needs to be in multiples of PAGE_SIZE */
+ if (!len || len & ~PAGE_MASK || len > qpriv->tid_rdma.local.max_len)
+ goto nack_inv;
+
+ diff = delta_psn(psn, qp->r_psn);
+ if (unlikely(diff)) {
+ if (tid_rdma_rcv_error(packet, ohdr, qp, psn, diff))
+ return;
+ goto send_ack;
+ }
+
+ /* We've verified the request, insert it into the ack queue. */
+ next = qp->r_head_ack_queue + 1;
+ if (next > rvt_size_atomic(ib_to_rvt(qp->ibqp.device)))
+ next = 0;
+ spin_lock_irqsave(&qp->s_lock, flags);
+ if (unlikely(next == qp->s_tail_ack_queue)) {
+ if (!qp->s_ack_queue[next].sent) {
+ nack_state = IB_NAK_REMOTE_OPERATIONAL_ERROR;
+ goto nack_inv_unlock;
+ }
+ update_ack_queue(qp, next);
+ }
+ e = &qp->s_ack_queue[qp->r_head_ack_queue];
+ if (e->rdma_sge.mr) {
+ rvt_put_mr(e->rdma_sge.mr);
+ e->rdma_sge.mr = NULL;
+ }
+
+ rkey = be32_to_cpu(reth->rkey);
+ qp->r_len = len;
+
+ if (unlikely(!rvt_rkey_ok(qp, &e->rdma_sge, qp->r_len, vaddr,
+ rkey, IB_ACCESS_REMOTE_READ)))
+ goto nack_acc;
+
+ /* Accept the request parameters */
+ if (tid_rdma_rcv_read_request(qp, e, packet, ohdr, bth0, psn, vaddr,
+ len))
+ goto nack_inv_unlock;
+
+ qp->r_state = e->opcode;
+ qp->r_nak_state = 0;
+ /*
+ * We need to increment the MSN here instead of when we
+ * finish sending the result since a duplicate request would
+ * increment it more than once.
+ */
+ qp->r_msn++;
+ qp->r_psn += e->lpsn - e->psn + 1;
+
+ qp->r_head_ack_queue = next;
+
+ /*
+ * For all requests other than TID WRITE which are added to the ack
+ * queue, qpriv->r_tid_alloc follows qp->r_head_ack_queue. It is ok to
+ * do this because of interlocks between these and TID WRITE
+ * requests. The same change has also been made in hfi1_rc_rcv().
+ */
+ qpriv->r_tid_alloc = qp->r_head_ack_queue;
+
+ /* Schedule the send tasklet. */
+ qp->s_flags |= RVT_S_RESP_PENDING;
+ hfi1_schedule_send(qp);
+
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+ if (is_fecn)
+ goto send_ack;
+ return;
+
+nack_inv_unlock:
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+nack_inv:
+ rvt_rc_error(qp, IB_WC_LOC_QP_OP_ERR);
+ qp->r_nak_state = nack_state;
+ qp->r_ack_psn = qp->r_psn;
+ /* Queue NAK for later */
+ rc_defered_ack(rcd, qp);
+ return;
+nack_acc:
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+ rvt_rc_error(qp, IB_WC_LOC_PROT_ERR);
+ qp->r_nak_state = IB_NAK_REMOTE_ACCESS_ERROR;
+ qp->r_ack_psn = qp->r_psn;
+send_ack:
+ hfi1_send_rc_ack(packet, is_fecn);
+}
+
+u32 hfi1_build_tid_rdma_read_resp(struct rvt_qp *qp, struct rvt_ack_entry *e,
+ struct ib_other_headers *ohdr, u32 *bth0,
+ u32 *bth1, u32 *bth2, u32 *len, bool *last)
+{
+ struct hfi1_ack_priv *epriv = e->priv;
+ struct tid_rdma_request *req = &epriv->tid_req;
+ struct hfi1_qp_priv *qpriv = qp->priv;
+ struct tid_rdma_flow *flow = &req->flows[req->clear_tail];
+ u32 tidentry = flow->tid_entry[flow->tid_idx];
+ u32 tidlen = EXP_TID_GET(tidentry, LEN) << PAGE_SHIFT;
+ struct tid_rdma_read_resp *resp = &ohdr->u.tid_rdma.r_rsp;
+ u32 next_offset, om = KDETH_OM_LARGE;
+ bool last_pkt;
+ u32 hdwords = 0;
+ struct tid_rdma_params *remote;
+
+ *len = min_t(u32, qp->pmtu, tidlen - flow->tid_offset);
+ flow->sent += *len;
+ next_offset = flow->tid_offset + *len;
+ last_pkt = (flow->sent >= flow->length);
+
+ trace_hfi1_tid_entry_build_read_resp(qp, flow->tid_idx, tidentry);
+ trace_hfi1_tid_flow_build_read_resp(qp, req->clear_tail, flow);
+
+ rcu_read_lock();
+ remote = rcu_dereference(qpriv->tid_rdma.remote);
+ if (!remote) {
+ rcu_read_unlock();
+ goto done;
+ }
+ KDETH_RESET(resp->kdeth0, KVER, 0x1);
+ KDETH_SET(resp->kdeth0, SH, !last_pkt);
+ KDETH_SET(resp->kdeth0, INTR, !!(!last_pkt && remote->urg));
+ KDETH_SET(resp->kdeth0, TIDCTRL, EXP_TID_GET(tidentry, CTRL));
+ KDETH_SET(resp->kdeth0, TID, EXP_TID_GET(tidentry, IDX));
+ KDETH_SET(resp->kdeth0, OM, om == KDETH_OM_LARGE);
+ KDETH_SET(resp->kdeth0, OFFSET, flow->tid_offset / om);
+ KDETH_RESET(resp->kdeth1, JKEY, remote->jkey);
+ resp->verbs_qp = cpu_to_be32(qp->remote_qpn);
+ rcu_read_unlock();
+
+ resp->aeth = rvt_compute_aeth(qp);
+ resp->verbs_psn = cpu_to_be32(mask_psn(flow->flow_state.ib_spsn +
+ flow->pkt));
+
+ *bth0 = TID_OP(READ_RESP) << 24;
+ *bth1 = flow->tid_qpn;
+ *bth2 = mask_psn(((flow->flow_state.spsn + flow->pkt++) &
+ HFI1_KDETH_BTH_SEQ_MASK) |
+ (flow->flow_state.generation <<
+ HFI1_KDETH_BTH_SEQ_SHIFT));
+ *last = last_pkt;
+ if (last_pkt)
+ /* Advance to next flow */
+ req->clear_tail = (req->clear_tail + 1) &
+ (MAX_FLOWS - 1);
+
+ if (next_offset >= tidlen) {
+ flow->tid_offset = 0;
+ flow->tid_idx++;
+ } else {
+ flow->tid_offset = next_offset;
+ }
+
+ hdwords = sizeof(ohdr->u.tid_rdma.r_rsp) / sizeof(u32);
+
+done:
+ return hdwords;
+}
+
+static inline struct tid_rdma_request *
+find_tid_request(struct rvt_qp *qp, u32 psn, enum ib_wr_opcode opcode)
+ __must_hold(&qp->s_lock)
+{
+ struct rvt_swqe *wqe;
+ struct tid_rdma_request *req = NULL;
+ u32 i, end;
+
+ end = qp->s_cur + 1;
+ if (end == qp->s_size)
+ end = 0;
+ for (i = qp->s_acked; i != end;) {
+ wqe = rvt_get_swqe_ptr(qp, i);
+ if (cmp_psn(psn, wqe->psn) >= 0 &&
+ cmp_psn(psn, wqe->lpsn) <= 0) {
+ if (wqe->wr.opcode == opcode)
+ req = wqe_to_tid_req(wqe);
+ break;
+ }
+ if (++i == qp->s_size)
+ i = 0;
+ }
+
+ return req;
+}
+
+void hfi1_rc_rcv_tid_rdma_read_resp(struct hfi1_packet *packet)
+{
+ /* HANDLER FOR TID RDMA READ RESPONSE packet (Requestor side */
+
+ /*
+ * 1. Find matching SWQE
+ * 2. Check that the entire segment has been read.
+ * 3. Remove HFI1_S_WAIT_TID_RESP from s_flags.
+ * 4. Free the TID flow resources.
+ * 5. Kick the send engine (hfi1_schedule_send())
+ */
+ struct ib_other_headers *ohdr = packet->ohdr;
+ struct rvt_qp *qp = packet->qp;
+ struct hfi1_qp_priv *priv = qp->priv;
+ struct hfi1_ctxtdata *rcd = packet->rcd;
+ struct tid_rdma_request *req;
+ struct tid_rdma_flow *flow;
+ u32 opcode, aeth;
+ bool is_fecn;
+ unsigned long flags;
+ u32 kpsn, ipsn;
+
+ trace_hfi1_sender_rcv_tid_read_resp(qp);
+ is_fecn = process_ecn(qp, packet);
+ kpsn = mask_psn(be32_to_cpu(ohdr->bth[2]));
+ aeth = be32_to_cpu(ohdr->u.tid_rdma.r_rsp.aeth);
+ opcode = (be32_to_cpu(ohdr->bth[0]) >> 24) & 0xff;
+
+ spin_lock_irqsave(&qp->s_lock, flags);
+ ipsn = mask_psn(be32_to_cpu(ohdr->u.tid_rdma.r_rsp.verbs_psn));
+ req = find_tid_request(qp, ipsn, IB_WR_TID_RDMA_READ);
+ if (unlikely(!req))
+ goto ack_op_err;
+
+ flow = &req->flows[req->clear_tail];
+ /* When header suppression is disabled */
+ if (cmp_psn(ipsn, flow->flow_state.ib_lpsn))
+ goto ack_done;
+ req->ack_pending--;
+ priv->pending_tid_r_segs--;
+ qp->s_num_rd_atomic--;
+ if ((qp->s_flags & RVT_S_WAIT_FENCE) &&
+ !qp->s_num_rd_atomic) {
+ qp->s_flags &= ~(RVT_S_WAIT_FENCE |
+ RVT_S_WAIT_ACK);
+ hfi1_schedule_send(qp);
+ }
+ if (qp->s_flags & RVT_S_WAIT_RDMAR) {
+ qp->s_flags &= ~(RVT_S_WAIT_RDMAR | RVT_S_WAIT_ACK);
+ hfi1_schedule_send(qp);
+ }
+
+ trace_hfi1_ack(qp, ipsn);
+ trace_hfi1_tid_req_rcv_read_resp(qp, 0, req->e.swqe->wr.opcode,
+ req->e.swqe->psn, req->e.swqe->lpsn,
+ req);
+ trace_hfi1_tid_flow_rcv_read_resp(qp, req->clear_tail, flow);
+
+ /* Release the tid resources */
+ hfi1_kern_exp_rcv_clear(req);
+
+ if (!do_rc_ack(qp, aeth, ipsn, opcode, 0, rcd))
+ goto ack_done;
+
+ /* If not done yet, build next read request */
+ if (++req->comp_seg >= req->total_segs) {
+ priv->tid_r_comp++;
+ req->state = TID_REQUEST_COMPLETE;
+ }
+
+ /*
+ * Clear the hw flow under two conditions:
+ * 1. This request is a sync point and it is complete;
+ * 2. Current request is completed and there are no more requests.
+ */
+ if ((req->state == TID_REQUEST_SYNC &&
+ req->comp_seg == req->cur_seg) ||
+ priv->tid_r_comp == priv->tid_r_reqs) {
+ hfi1_kern_clear_hw_flow(priv->rcd, qp);
+ if (req->state == TID_REQUEST_SYNC)
+ req->state = TID_REQUEST_ACTIVE;
+ }
+
+ hfi1_schedule_send(qp);
+ goto ack_done;
+
+ack_op_err:
+ /*
+ * The test indicates that the send engine has finished its cleanup
+ * after sending the request and it's now safe to put the QP into error
+ * state. However, if the wqe queue is empty (qp->s_acked == qp->s_tail
+ * == qp->s_head), it would be unsafe to complete the wqe pointed by
+ * qp->s_acked here. Putting the qp into error state will safely flush
+ * all remaining requests.
+ */
+ if (qp->s_last == qp->s_acked)
+ rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR);
+
+ack_done:
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+ if (is_fecn)
+ hfi1_send_rc_ack(packet, is_fecn);
+}
+
+void hfi1_kern_read_tid_flow_free(struct rvt_qp *qp)
+ __must_hold(&qp->s_lock)
+{
+ u32 n = qp->s_acked;
+ struct rvt_swqe *wqe;
+ struct tid_rdma_request *req;
+ struct hfi1_qp_priv *priv = qp->priv;
+
+ lockdep_assert_held(&qp->s_lock);
+ /* Free any TID entries */
+ while (n != qp->s_tail) {
+ wqe = rvt_get_swqe_ptr(qp, n);
+ if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) {
+ req = wqe_to_tid_req(wqe);
+ hfi1_kern_exp_rcv_clear_all(req);
+ }
+
+ if (++n == qp->s_size)
+ n = 0;
+ }
+ /* Free flow */
+ hfi1_kern_clear_hw_flow(priv->rcd, qp);
+}
+
+static bool tid_rdma_tid_err(struct hfi1_ctxtdata *rcd,
+ struct hfi1_packet *packet, u8 rcv_type,
+ u8 opcode)
+{
+ struct rvt_qp *qp = packet->qp;
+ struct hfi1_qp_priv *qpriv = qp->priv;
+ u32 ipsn;
+ struct ib_other_headers *ohdr = packet->ohdr;
+ struct rvt_ack_entry *e;
+ struct tid_rdma_request *req;
+ struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device);
+ u32 i;
+
+ if (rcv_type >= RHF_RCV_TYPE_IB)
+ goto done;
+
+ spin_lock(&qp->s_lock);
+
+ /*
+ * We've ran out of space in the eager buffer.
+ * Eagerly received KDETH packets which require space in the
+ * Eager buffer (packet that have payload) are TID RDMA WRITE
+ * response packets. In this case, we have to re-transmit the
+ * TID RDMA WRITE request.
+ */
+ if (rcv_type == RHF_RCV_TYPE_EAGER) {
+ hfi1_restart_rc(qp, qp->s_last_psn + 1, 1);
+ hfi1_schedule_send(qp);
+ goto done_unlock;
+ }
+
+ /*
+ * For TID READ response, error out QP after freeing the tid
+ * resources.
+ */
+ if (opcode == TID_OP(READ_RESP)) {
+ ipsn = mask_psn(be32_to_cpu(ohdr->u.tid_rdma.r_rsp.verbs_psn));
+ if (cmp_psn(ipsn, qp->s_last_psn) > 0 &&
+ cmp_psn(ipsn, qp->s_psn) < 0) {
+ hfi1_kern_read_tid_flow_free(qp);
+ spin_unlock(&qp->s_lock);
+ rvt_rc_error(qp, IB_WC_LOC_QP_OP_ERR);
+ goto done;
+ }
+ goto done_unlock;
+ }
+
+ /*
+ * Error out the qp for TID RDMA WRITE
+ */
+ hfi1_kern_clear_hw_flow(qpriv->rcd, qp);
+ for (i = 0; i < rvt_max_atomic(rdi); i++) {
+ e = &qp->s_ack_queue[i];
+ if (e->opcode == TID_OP(WRITE_REQ)) {
+ req = ack_to_tid_req(e);
+ hfi1_kern_exp_rcv_clear_all(req);
+ }
+ }
+ spin_unlock(&qp->s_lock);
+ rvt_rc_error(qp, IB_WC_LOC_LEN_ERR);
+ goto done;
+
+done_unlock:
+ spin_unlock(&qp->s_lock);
+done:
+ return true;
+}
+
+static void restart_tid_rdma_read_req(struct hfi1_ctxtdata *rcd,
+ struct rvt_qp *qp, struct rvt_swqe *wqe)
+{
+ struct tid_rdma_request *req;
+ struct tid_rdma_flow *flow;
+
+ /* Start from the right segment */
+ qp->r_flags |= RVT_R_RDMAR_SEQ;
+ req = wqe_to_tid_req(wqe);
+ flow = &req->flows[req->clear_tail];
+ hfi1_restart_rc(qp, flow->flow_state.ib_spsn, 0);
+ if (list_empty(&qp->rspwait)) {
+ qp->r_flags |= RVT_R_RSP_SEND;
+ rvt_get_qp(qp);
+ list_add_tail(&qp->rspwait, &rcd->qp_wait_list);
+ }
+}
+
+/*
+ * Handle the KDETH eflags for TID RDMA READ response.
+ *
+ * Return true if the last packet for a segment has been received and it is
+ * time to process the response normally; otherwise, return true.
+ *
+ * The caller must hold the packet->qp->r_lock and the rcu_read_lock.
+ */
+static bool handle_read_kdeth_eflags(struct hfi1_ctxtdata *rcd,
+ struct hfi1_packet *packet, u8 rcv_type,
+ u8 rte, u32 psn, u32 ibpsn)
+ __must_hold(&packet->qp->r_lock) __must_hold(RCU)
+{
+ struct hfi1_pportdata *ppd = rcd->ppd;
+ struct hfi1_devdata *dd = ppd->dd;
+ struct hfi1_ibport *ibp;
+ struct rvt_swqe *wqe;
+ struct tid_rdma_request *req;
+ struct tid_rdma_flow *flow;
+ u32 ack_psn;
+ struct rvt_qp *qp = packet->qp;
+ struct hfi1_qp_priv *priv = qp->priv;
+ bool ret = true;
+ int diff = 0;
+ u32 fpsn;
+
+ lockdep_assert_held(&qp->r_lock);
+ /* If the psn is out of valid range, drop the packet */
+ if (cmp_psn(ibpsn, qp->s_last_psn) < 0 ||
+ cmp_psn(ibpsn, qp->s_psn) > 0)
+ return ret;
+
+ spin_lock(&qp->s_lock);
+ /*
+ * Note that NAKs implicitly ACK outstanding SEND and RDMA write
+ * requests and implicitly NAK RDMA read and atomic requests issued
+ * before the NAK'ed request.
+ */
+ ack_psn = ibpsn - 1;
+ wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
+ ibp = to_iport(qp->ibqp.device, qp->port_num);
+
+ /* Complete WQEs that the PSN finishes. */
+ while ((int)delta_psn(ack_psn, wqe->lpsn) >= 0) {
+ /*
+ * If this request is a RDMA read or atomic, and the NACK is
+ * for a later operation, this NACK NAKs the RDMA read or
+ * atomic.
+ */
+ if (wqe->wr.opcode == IB_WR_RDMA_READ ||
+ wqe->wr.opcode == IB_WR_TID_RDMA_READ ||
+ wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
+ wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) {
+ /* Retry this request. */
+ if (!(qp->r_flags & RVT_R_RDMAR_SEQ)) {
+ qp->r_flags |= RVT_R_RDMAR_SEQ;
+ if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) {
+ restart_tid_rdma_read_req(rcd, qp,
+ wqe);
+ } else {
+ hfi1_restart_rc(qp, qp->s_last_psn + 1,
+ 0);
+ if (list_empty(&qp->rspwait)) {
+ qp->r_flags |= RVT_R_RSP_SEND;
+ rvt_get_qp(qp);
+ list_add_tail(/* wait */
+ &qp->rspwait,
+ &rcd->qp_wait_list);
+ }
+ }
+ }
+ /*
+ * No need to process the NAK since we are
+ * restarting an earlier request.
+ */
+ break;
+ }
+
+ wqe = do_rc_completion(qp, wqe, ibp);
+ if (qp->s_acked == qp->s_tail)
+ break;
+ }
+
+ /* Handle the eflags for the request */
+ if (wqe->wr.opcode != IB_WR_TID_RDMA_READ)
+ goto s_unlock;
+
+ req = wqe_to_tid_req(wqe);
+ switch (rcv_type) {
+ case RHF_RCV_TYPE_EXPECTED:
+ switch (rte) {
+ case RHF_RTE_EXPECTED_FLOW_SEQ_ERR:
+ /*
+ * On the first occurrence of a Flow Sequence error,
+ * the flag TID_FLOW_SW_PSN is set.
+ *
+ * After that, the flow is *not* reprogrammed and the
+ * protocol falls back to SW PSN checking. This is done
+ * to prevent continuous Flow Sequence errors for any
+ * packets that could be still in the fabric.
+ */
+ flow = find_flow(req, psn, NULL);
+ if (!flow) {
+ /*
+ * We can't find the IB PSN matching the
+ * received KDETH PSN. The only thing we can
+ * do at this point is report the error to
+ * the QP.
+ */
+ hfi1_kern_read_tid_flow_free(qp);
+ spin_unlock(&qp->s_lock);
+ rvt_rc_error(qp, IB_WC_LOC_QP_OP_ERR);
+ return ret;
+ }
+ if (priv->flow_state.flags & TID_FLOW_SW_PSN) {
+ diff = cmp_psn(psn,
+ priv->flow_state.r_next_psn);
+ if (diff > 0) {
+ if (!(qp->r_flags & RVT_R_RDMAR_SEQ))
+ restart_tid_rdma_read_req(rcd,
+ qp,
+ wqe);
+
+ /* Drop the packet.*/
+ goto s_unlock;
+ } else if (diff < 0) {
+ /*
+ * If a response packet for a restarted
+ * request has come back, reset the
+ * restart flag.
+ */
+ if (qp->r_flags & RVT_R_RDMAR_SEQ)
+ qp->r_flags &=
+ ~RVT_R_RDMAR_SEQ;
+
+ /* Drop the packet.*/
+ goto s_unlock;
+ }
+
+ /*
+ * If SW PSN verification is successful and
+ * this is the last packet in the segment, tell
+ * the caller to process it as a normal packet.
+ */
+ fpsn = full_flow_psn(flow,
+ flow->flow_state.lpsn);
+ if (cmp_psn(fpsn, psn) == 0) {
+ ret = false;
+ if (qp->r_flags & RVT_R_RDMAR_SEQ)
+ qp->r_flags &=
+ ~RVT_R_RDMAR_SEQ;
+ }
+ priv->flow_state.r_next_psn++;
+ } else {
+ u64 reg;
+ u32 last_psn;
+
+ /*
+ * The only sane way to get the amount of
+ * progress is to read the HW flow state.
+ */
+ reg = read_uctxt_csr(dd, rcd->ctxt,
+ RCV_TID_FLOW_TABLE +
+ (8 * flow->idx));
+ last_psn = mask_psn(reg);
+
+ priv->flow_state.r_next_psn = last_psn;
+ priv->flow_state.flags |= TID_FLOW_SW_PSN;
+ /*
+ * If no request has been restarted yet,
+ * restart the current one.
+ */
+ if (!(qp->r_flags & RVT_R_RDMAR_SEQ))
+ restart_tid_rdma_read_req(rcd, qp,
+ wqe);
+ }
+
+ break;
+
+ case RHF_RTE_EXPECTED_FLOW_GEN_ERR:
+ /*
+ * Since the TID flow is able to ride through
+ * generation mismatch, drop this stale packet.
+ */
+ break;
+
+ default:
+ break;
+ }
+ break;
+
+ case RHF_RCV_TYPE_ERROR:
+ switch (rte) {
+ case RHF_RTE_ERROR_OP_CODE_ERR:
+ case RHF_RTE_ERROR_KHDR_MIN_LEN_ERR:
+ case RHF_RTE_ERROR_KHDR_HCRC_ERR:
+ case RHF_RTE_ERROR_KHDR_KVER_ERR:
+ case RHF_RTE_ERROR_CONTEXT_ERR:
+ case RHF_RTE_ERROR_KHDR_TID_ERR:
+ default:
+ break;
+ }
+ default:
+ break;
+ }
+s_unlock:
+ spin_unlock(&qp->s_lock);
+ return ret;
+}
+
+bool hfi1_handle_kdeth_eflags(struct hfi1_ctxtdata *rcd,
+ struct hfi1_pportdata *ppd,
+ struct hfi1_packet *packet)
+{
+ struct hfi1_ibport *ibp = &ppd->ibport_data;
+ struct hfi1_devdata *dd = ppd->dd;
+ struct rvt_dev_info *rdi = &dd->verbs_dev.rdi;
+ u8 rcv_type = rhf_rcv_type(packet->rhf);
+ u8 rte = rhf_rcv_type_err(packet->rhf);
+ struct ib_header *hdr = packet->hdr;
+ struct ib_other_headers *ohdr = NULL;
+ int lnh = be16_to_cpu(hdr->lrh[0]) & 3;
+ u16 lid = be16_to_cpu(hdr->lrh[1]);
+ u8 opcode;
+ u32 qp_num, psn, ibpsn;
+ struct rvt_qp *qp;
+ struct hfi1_qp_priv *qpriv;
+ unsigned long flags;
+ bool ret = true;
+ struct rvt_ack_entry *e;
+ struct tid_rdma_request *req;
+ struct tid_rdma_flow *flow;
+
+ trace_hfi1_msg_handle_kdeth_eflags(NULL, "Kdeth error: rhf ",
+ packet->rhf);
+ if (packet->rhf & (RHF_VCRC_ERR | RHF_ICRC_ERR))
+ return ret;
+
+ packet->ohdr = &hdr->u.oth;
+ ohdr = packet->ohdr;
+ trace_input_ibhdr(rcd->dd, packet, !!(rhf_dc_info(packet->rhf)));
+
+ /* Get the destination QP number. */
+ qp_num = be32_to_cpu(ohdr->u.tid_rdma.r_rsp.verbs_qp) &
+ RVT_QPN_MASK;
+ if (lid >= be16_to_cpu(IB_MULTICAST_LID_BASE))
+ goto drop;
+
+ psn = mask_psn(be32_to_cpu(ohdr->bth[2]));
+ opcode = (be32_to_cpu(ohdr->bth[0]) >> 24) & 0xff;
+
+ rcu_read_lock();
+ qp = rvt_lookup_qpn(rdi, &ibp->rvp, qp_num);
+ if (!qp)
+ goto rcu_unlock;
+
+ packet->qp = qp;
+
+ /* Check for valid receive state. */
+ spin_lock_irqsave(&qp->r_lock, flags);
+ if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) {
+ ibp->rvp.n_pkt_drops++;
+ goto r_unlock;
+ }
+
+ if (packet->rhf & RHF_TID_ERR) {
+ /* For TIDERR and RC QPs preemptively schedule a NAK */
+ u32 tlen = rhf_pkt_len(packet->rhf); /* in bytes */
+
+ /* Sanity check packet */
+ if (tlen < 24)
+ goto r_unlock;
+
+ /*
+ * Check for GRH. We should never get packets with GRH in this
+ * path.
+ */
+ if (lnh == HFI1_LRH_GRH)
+ goto r_unlock;
+
+ if (tid_rdma_tid_err(rcd, packet, rcv_type, opcode))
+ goto r_unlock;
+ }
+
+ /* handle TID RDMA READ */
+ if (opcode == TID_OP(READ_RESP)) {
+ ibpsn = be32_to_cpu(ohdr->u.tid_rdma.r_rsp.verbs_psn);
+ ibpsn = mask_psn(ibpsn);
+ ret = handle_read_kdeth_eflags(rcd, packet, rcv_type, rte, psn,
+ ibpsn);
+ goto r_unlock;
+ }
+
+ /*
+ * qp->s_tail_ack_queue points to the rvt_ack_entry currently being
+ * processed. These a completed sequentially so we can be sure that
+ * the pointer will not change until the entire request has completed.
+ */
+ spin_lock(&qp->s_lock);
+ qpriv = qp->priv;
+ e = &qp->s_ack_queue[qpriv->r_tid_tail];
+ req = ack_to_tid_req(e);
+ flow = &req->flows[req->clear_tail];
+ trace_hfi1_eflags_err_write(qp, rcv_type, rte, psn);
+ trace_hfi1_rsp_handle_kdeth_eflags(qp, psn);
+ trace_hfi1_tid_write_rsp_handle_kdeth_eflags(qp);
+ trace_hfi1_tid_req_handle_kdeth_eflags(qp, 0, e->opcode, e->psn,
+ e->lpsn, req);
+ trace_hfi1_tid_flow_handle_kdeth_eflags(qp, req->clear_tail, flow);
+
+ switch (rcv_type) {
+ case RHF_RCV_TYPE_EXPECTED:
+ switch (rte) {
+ case RHF_RTE_EXPECTED_FLOW_SEQ_ERR:
+ if (!(qpriv->s_flags & HFI1_R_TID_SW_PSN)) {
+ u64 reg;
+
+ qpriv->s_flags |= HFI1_R_TID_SW_PSN;
+ /*
+ * The only sane way to get the amount of
+ * progress is to read the HW flow state.
+ */
+ reg = read_uctxt_csr(dd, rcd->ctxt,
+ RCV_TID_FLOW_TABLE +
+ (8 * flow->idx));
+ flow->flow_state.r_next_psn = mask_psn(reg);
+ qpriv->r_next_psn_kdeth =
+ flow->flow_state.r_next_psn;
+ goto nak_psn;
+ } else {
+ /*
+ * If the received PSN does not match the next
+ * expected PSN, NAK the packet.
+ * However, only do that if we know that the a
+ * NAK has already been sent. Otherwise, this
+ * mismatch could be due to packets that were
+ * already in flight.
+ */
+ if (psn != flow->flow_state.r_next_psn) {
+ psn = flow->flow_state.r_next_psn;
+ goto nak_psn;
+ }
+
+ qpriv->s_nak_state = 0;
+ /*
+ * If SW PSN verification is successful and this
+ * is the last packet in the segment, tell the
+ * caller to process it as a normal packet.
+ */
+ if (psn == full_flow_psn(flow,
+ flow->flow_state.lpsn))
+ ret = false;
+ qpriv->r_next_psn_kdeth =
+ ++flow->flow_state.r_next_psn;
+ }
+ break;
+
+ case RHF_RTE_EXPECTED_FLOW_GEN_ERR:
+ goto nak_psn;
+
+ default:
+ break;
+ }
+ break;
+
+ case RHF_RCV_TYPE_ERROR:
+ switch (rte) {
+ case RHF_RTE_ERROR_OP_CODE_ERR:
+ case RHF_RTE_ERROR_KHDR_MIN_LEN_ERR:
+ case RHF_RTE_ERROR_KHDR_HCRC_ERR:
+ case RHF_RTE_ERROR_KHDR_KVER_ERR:
+ case RHF_RTE_ERROR_CONTEXT_ERR:
+ case RHF_RTE_ERROR_KHDR_TID_ERR:
+ default:
+ break;
+ }
+ default:
+ break;
+ }
+
+unlock:
+ spin_unlock(&qp->s_lock);
+r_unlock:
+ spin_unlock_irqrestore(&qp->r_lock, flags);
+rcu_unlock:
+ rcu_read_unlock();
+drop:
+ return ret;
+nak_psn:
+ ibp->rvp.n_rc_seqnak++;
+ if (!qpriv->s_nak_state) {
+ qpriv->s_nak_state = IB_NAK_PSN_ERROR;
+ /* We are NAK'ing the next expected PSN */
+ qpriv->s_nak_psn = mask_psn(flow->flow_state.r_next_psn);
+ qpriv->s_flags |= RVT_S_ACK_PENDING;
+ if (qpriv->r_tid_ack == HFI1_QP_WQE_INVALID)
+ qpriv->r_tid_ack = qpriv->r_tid_tail;
+ hfi1_schedule_tid_send(qp);
+ }
+ goto unlock;
+}
+
+/*
+ * "Rewind" the TID request information.
+ * This means that we reset the state back to ACTIVE,
+ * find the proper flow, set the flow index to that flow,
+ * and reset the flow information.
+ */
+void hfi1_tid_rdma_restart_req(struct rvt_qp *qp, struct rvt_swqe *wqe,
+ u32 *bth2)
+{
+ struct tid_rdma_request *req = wqe_to_tid_req(wqe);
+ struct tid_rdma_flow *flow;
+ struct hfi1_qp_priv *qpriv = qp->priv;
+ int diff, delta_pkts;
+ u32 tididx = 0, i;
+ u16 fidx;
+
+ if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) {
+ *bth2 = mask_psn(qp->s_psn);
+ flow = find_flow_ib(req, *bth2, &fidx);
+ if (!flow) {
+ trace_hfi1_msg_tid_restart_req(/* msg */
+ qp, "!!!!!! Could not find flow to restart: bth2 ",
+ (u64)*bth2);
+ trace_hfi1_tid_req_restart_req(qp, 0, wqe->wr.opcode,
+ wqe->psn, wqe->lpsn,
+ req);
+ return;
+ }
+ } else {
+ fidx = req->acked_tail;
+ flow = &req->flows[fidx];
+ *bth2 = mask_psn(req->r_ack_psn);
+ }
+
+ if (wqe->wr.opcode == IB_WR_TID_RDMA_READ)
+ delta_pkts = delta_psn(*bth2, flow->flow_state.ib_spsn);
+ else
+ delta_pkts = delta_psn(*bth2,
+ full_flow_psn(flow,
+ flow->flow_state.spsn));
+
+ trace_hfi1_tid_flow_restart_req(qp, fidx, flow);
+ diff = delta_pkts + flow->resync_npkts;
+
+ flow->sent = 0;
+ flow->pkt = 0;
+ flow->tid_idx = 0;
+ flow->tid_offset = 0;
+ if (diff) {
+ for (tididx = 0; tididx < flow->tidcnt; tididx++) {
+ u32 tidentry = flow->tid_entry[tididx], tidlen,
+ tidnpkts, npkts;
+
+ flow->tid_offset = 0;
+ tidlen = EXP_TID_GET(tidentry, LEN) * PAGE_SIZE;
+ tidnpkts = rvt_div_round_up_mtu(qp, tidlen);
+ npkts = min_t(u32, diff, tidnpkts);
+ flow->pkt += npkts;
+ flow->sent += (npkts == tidnpkts ? tidlen :
+ npkts * qp->pmtu);
+ flow->tid_offset += npkts * qp->pmtu;
+ diff -= npkts;
+ if (!diff)
+ break;
+ }
+ }
+ if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE) {
+ rvt_skip_sge(&qpriv->tid_ss, (req->cur_seg * req->seg_len) +
+ flow->sent, 0);
+ /*
+ * Packet PSN is based on flow_state.spsn + flow->pkt. However,
+ * during a RESYNC, the generation is incremented and the
+ * sequence is reset to 0. Since we've adjusted the npkts in the
+ * flow and the SGE has been sufficiently advanced, we have to
+ * adjust flow->pkt in order to calculate the correct PSN.
+ */
+ flow->pkt -= flow->resync_npkts;
+ }
+
+ if (flow->tid_offset ==
+ EXP_TID_GET(flow->tid_entry[tididx], LEN) * PAGE_SIZE) {
+ tididx++;
+ flow->tid_offset = 0;
+ }
+ flow->tid_idx = tididx;
+ if (wqe->wr.opcode == IB_WR_TID_RDMA_READ)
+ /* Move flow_idx to correct index */
+ req->flow_idx = fidx;
+ else
+ req->clear_tail = fidx;
+
+ trace_hfi1_tid_flow_restart_req(qp, fidx, flow);
+ trace_hfi1_tid_req_restart_req(qp, 0, wqe->wr.opcode, wqe->psn,
+ wqe->lpsn, req);
+ req->state = TID_REQUEST_ACTIVE;
+ if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE) {
+ /* Reset all the flows that we are going to resend */
+ fidx = CIRC_NEXT(fidx, MAX_FLOWS);
+ i = qpriv->s_tid_tail;
+ do {
+ for (; CIRC_CNT(req->setup_head, fidx, MAX_FLOWS);
+ fidx = CIRC_NEXT(fidx, MAX_FLOWS)) {
+ req->flows[fidx].sent = 0;
+ req->flows[fidx].pkt = 0;
+ req->flows[fidx].tid_idx = 0;
+ req->flows[fidx].tid_offset = 0;
+ req->flows[fidx].resync_npkts = 0;
+ }
+ if (i == qpriv->s_tid_cur)
+ break;
+ do {
+ i = (++i == qp->s_size ? 0 : i);
+ wqe = rvt_get_swqe_ptr(qp, i);
+ } while (wqe->wr.opcode != IB_WR_TID_RDMA_WRITE);
+ req = wqe_to_tid_req(wqe);
+ req->cur_seg = req->ack_seg;
+ fidx = req->acked_tail;
+ /* Pull req->clear_tail back */
+ req->clear_tail = fidx;
+ } while (1);
+ }
+}
+
+void hfi1_qp_kern_exp_rcv_clear_all(struct rvt_qp *qp)
+{
+ int i, ret;
+ struct hfi1_qp_priv *qpriv = qp->priv;
+ struct tid_flow_state *fs;
+
+ if (qp->ibqp.qp_type != IB_QPT_RC || !HFI1_CAP_IS_KSET(TID_RDMA))
+ return;
+
+ /*
+ * First, clear the flow to help prevent any delayed packets from
+ * being delivered.
+ */
+ fs = &qpriv->flow_state;
+ if (fs->index != RXE_NUM_TID_FLOWS)
+ hfi1_kern_clear_hw_flow(qpriv->rcd, qp);
+
+ for (i = qp->s_acked; i != qp->s_head;) {
+ struct rvt_swqe *wqe = rvt_get_swqe_ptr(qp, i);
+
+ if (++i == qp->s_size)
+ i = 0;
+ /* Free only locally allocated TID entries */
+ if (wqe->wr.opcode != IB_WR_TID_RDMA_READ)
+ continue;
+ do {
+ struct hfi1_swqe_priv *priv = wqe->priv;
+
+ ret = hfi1_kern_exp_rcv_clear(&priv->tid_req);
+ } while (!ret);
+ }
+ for (i = qp->s_acked_ack_queue; i != qp->r_head_ack_queue;) {
+ struct rvt_ack_entry *e = &qp->s_ack_queue[i];
+
+ if (++i == rvt_max_atomic(ib_to_rvt(qp->ibqp.device)))
+ i = 0;
+ /* Free only locally allocated TID entries */
+ if (e->opcode != TID_OP(WRITE_REQ))
+ continue;
+ do {
+ struct hfi1_ack_priv *priv = e->priv;
+
+ ret = hfi1_kern_exp_rcv_clear(&priv->tid_req);
+ } while (!ret);
+ }
+}
+
+bool hfi1_tid_rdma_wqe_interlock(struct rvt_qp *qp, struct rvt_swqe *wqe)
+{
+ struct rvt_swqe *prev;
+ struct hfi1_qp_priv *priv = qp->priv;
+ u32 s_prev;
+ struct tid_rdma_request *req;
+
+ s_prev = (qp->s_cur == 0 ? qp->s_size : qp->s_cur) - 1;
+ prev = rvt_get_swqe_ptr(qp, s_prev);
+
+ switch (wqe->wr.opcode) {
+ case IB_WR_SEND:
+ case IB_WR_SEND_WITH_IMM:
+ case IB_WR_SEND_WITH_INV:
+ case IB_WR_ATOMIC_CMP_AND_SWP:
+ case IB_WR_ATOMIC_FETCH_AND_ADD:
+ case IB_WR_RDMA_WRITE:
+ switch (prev->wr.opcode) {
+ case IB_WR_TID_RDMA_WRITE:
+ req = wqe_to_tid_req(prev);
+ if (req->ack_seg != req->total_segs)
+ goto interlock;
+ default:
+ break;
+ }
+ break;
+ case IB_WR_RDMA_READ:
+ if (prev->wr.opcode != IB_WR_TID_RDMA_WRITE)
+ break;
+ /* fall through */
+ case IB_WR_TID_RDMA_READ:
+ switch (prev->wr.opcode) {
+ case IB_WR_RDMA_READ:
+ if (qp->s_acked != qp->s_cur)
+ goto interlock;
+ break;
+ case IB_WR_TID_RDMA_WRITE:
+ req = wqe_to_tid_req(prev);
+ if (req->ack_seg != req->total_segs)
+ goto interlock;
+ default:
+ break;
+ }
+ default:
+ break;
+ }
+ return false;
+
+interlock:
+ priv->s_flags |= HFI1_S_TID_WAIT_INTERLCK;
+ return true;
+}
+
+/* Does @sge meet the alignment requirements for tid rdma? */
+static inline bool hfi1_check_sge_align(struct rvt_qp *qp,
+ struct rvt_sge *sge, int num_sge)
+{
+ int i;
+
+ for (i = 0; i < num_sge; i++, sge++) {
+ trace_hfi1_sge_check_align(qp, i, sge);
+ if ((u64)sge->vaddr & ~PAGE_MASK ||
+ sge->sge_length & ~PAGE_MASK)
+ return false;
+ }
+ return true;
+}
+
+void setup_tid_rdma_wqe(struct rvt_qp *qp, struct rvt_swqe *wqe)
+{
+ struct hfi1_qp_priv *qpriv = (struct hfi1_qp_priv *)qp->priv;
+ struct hfi1_swqe_priv *priv = wqe->priv;
+ struct tid_rdma_params *remote;
+ enum ib_wr_opcode new_opcode;
+ bool do_tid_rdma = false;
+ struct hfi1_pportdata *ppd = qpriv->rcd->ppd;
+
+ if ((rdma_ah_get_dlid(&qp->remote_ah_attr) & ~((1 << ppd->lmc) - 1)) ==
+ ppd->lid)
+ return;
+ if (qpriv->hdr_type != HFI1_PKT_TYPE_9B)
+ return;
+
+ rcu_read_lock();
+ remote = rcu_dereference(qpriv->tid_rdma.remote);
+ /*
+ * If TID RDMA is disabled by the negotiation, don't
+ * use it.
+ */
+ if (!remote)
+ goto exit;
+
+ if (wqe->wr.opcode == IB_WR_RDMA_READ) {
+ if (hfi1_check_sge_align(qp, &wqe->sg_list[0],
+ wqe->wr.num_sge)) {
+ new_opcode = IB_WR_TID_RDMA_READ;
+ do_tid_rdma = true;
+ }
+ } else if (wqe->wr.opcode == IB_WR_RDMA_WRITE) {
+ /*
+ * TID RDMA is enabled for this RDMA WRITE request iff:
+ * 1. The remote address is page-aligned,
+ * 2. The length is larger than the minimum segment size,
+ * 3. The length is page-multiple.
+ */
+ if (!(wqe->rdma_wr.remote_addr & ~PAGE_MASK) &&
+ !(wqe->length & ~PAGE_MASK)) {
+ new_opcode = IB_WR_TID_RDMA_WRITE;
+ do_tid_rdma = true;
+ }
+ }
+
+ if (do_tid_rdma) {
+ if (hfi1_kern_exp_rcv_alloc_flows(&priv->tid_req, GFP_ATOMIC))
+ goto exit;
+ wqe->wr.opcode = new_opcode;
+ priv->tid_req.seg_len =
+ min_t(u32, remote->max_len, wqe->length);
+ priv->tid_req.total_segs =
+ DIV_ROUND_UP(wqe->length, priv->tid_req.seg_len);
+ /* Compute the last PSN of the request */
+ wqe->lpsn = wqe->psn;
+ if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) {
+ priv->tid_req.n_flows = remote->max_read;
+ qpriv->tid_r_reqs++;
+ wqe->lpsn += rvt_div_round_up_mtu(qp, wqe->length) - 1;
+ } else {
+ wqe->lpsn += priv->tid_req.total_segs - 1;
+ atomic_inc(&qpriv->n_requests);
+ }
+
+ priv->tid_req.cur_seg = 0;
+ priv->tid_req.comp_seg = 0;
+ priv->tid_req.ack_seg = 0;
+ priv->tid_req.state = TID_REQUEST_INACTIVE;
+ /*
+ * Reset acked_tail.
+ * TID RDMA READ does not have ACKs so it does not
+ * update the pointer. We have to reset it so TID RDMA
+ * WRITE does not get confused.
+ */
+ priv->tid_req.acked_tail = priv->tid_req.setup_head;
+ trace_hfi1_tid_req_setup_tid_wqe(qp, 1, wqe->wr.opcode,
+ wqe->psn, wqe->lpsn,
+ &priv->tid_req);
+ }
+exit:
+ rcu_read_unlock();
+}
+
+/* TID RDMA WRITE functions */
+
+u32 hfi1_build_tid_rdma_write_req(struct rvt_qp *qp, struct rvt_swqe *wqe,
+ struct ib_other_headers *ohdr,
+ u32 *bth1, u32 *bth2, u32 *len)
+{
+ struct hfi1_qp_priv *qpriv = qp->priv;
+ struct tid_rdma_request *req = wqe_to_tid_req(wqe);
+ struct tid_rdma_params *remote;
+
+ rcu_read_lock();
+ remote = rcu_dereference(qpriv->tid_rdma.remote);
+ /*
+ * Set the number of flow to be used based on negotiated
+ * parameters.
+ */
+ req->n_flows = remote->max_write;
+ req->state = TID_REQUEST_ACTIVE;
+
+ KDETH_RESET(ohdr->u.tid_rdma.w_req.kdeth0, KVER, 0x1);
+ KDETH_RESET(ohdr->u.tid_rdma.w_req.kdeth1, JKEY, remote->jkey);
+ ohdr->u.tid_rdma.w_req.reth.vaddr =
+ cpu_to_be64(wqe->rdma_wr.remote_addr + (wqe->length - *len));
+ ohdr->u.tid_rdma.w_req.reth.rkey =
+ cpu_to_be32(wqe->rdma_wr.rkey);
+ ohdr->u.tid_rdma.w_req.reth.length = cpu_to_be32(*len);
+ ohdr->u.tid_rdma.w_req.verbs_qp = cpu_to_be32(qp->remote_qpn);
+ *bth1 &= ~RVT_QPN_MASK;
+ *bth1 |= remote->qp;
+ qp->s_state = TID_OP(WRITE_REQ);
+ qp->s_flags |= HFI1_S_WAIT_TID_RESP;
+ *bth2 |= IB_BTH_REQ_ACK;
+ *len = 0;
+
+ rcu_read_unlock();
+ return sizeof(ohdr->u.tid_rdma.w_req) / sizeof(u32);
+}
+
+void hfi1_compute_tid_rdma_flow_wt(void)
+{
+ /*
+ * Heuristic for computing the RNR timeout when waiting on the flow
+ * queue. Rather than a computationaly expensive exact estimate of when
+ * a flow will be available, we assume that if a QP is at position N in
+ * the flow queue it has to wait approximately (N + 1) * (number of
+ * segments between two sync points), assuming PMTU of 4K. The rationale
+ * for this is that flows are released and recycled at each sync point.
+ */
+ tid_rdma_flow_wt = MAX_TID_FLOW_PSN * enum_to_mtu(OPA_MTU_4096) /
+ TID_RDMA_MAX_SEGMENT_SIZE;
+}
+
+static u32 position_in_queue(struct hfi1_qp_priv *qpriv,
+ struct tid_queue *queue)
+{
+ return qpriv->tid_enqueue - queue->dequeue;
+}
+
+/*
+ * @qp: points to rvt_qp context.
+ * @to_seg: desired RNR timeout in segments.
+ * Return: index of the next highest timeout in the ib_hfi1_rnr_table[]
+ */
+static u32 hfi1_compute_tid_rnr_timeout(struct rvt_qp *qp, u32 to_seg)
+{
+ struct hfi1_qp_priv *qpriv = qp->priv;
+ u64 timeout;
+ u32 bytes_per_us;
+ u8 i;
+
+ bytes_per_us = active_egress_rate(qpriv->rcd->ppd) / 8;
+ timeout = (to_seg * TID_RDMA_MAX_SEGMENT_SIZE) / bytes_per_us;
+ /*
+ * Find the next highest value in the RNR table to the required
+ * timeout. This gives the responder some padding.
+ */
+ for (i = 1; i <= IB_AETH_CREDIT_MASK; i++)
+ if (rvt_rnr_tbl_to_usec(i) >= timeout)
+ return i;
+ return 0;
+}
+
+/**
+ * Central place for resource allocation at TID write responder,
+ * is called from write_req and write_data interrupt handlers as
+ * well as the send thread when a queued QP is scheduled for
+ * resource allocation.
+ *
+ * Iterates over (a) segments of a request and then (b) queued requests
+ * themselves to allocate resources for up to local->max_write
+ * segments across multiple requests. Stop allocating when we
+ * hit a sync point, resume allocating after data packets at
+ * sync point have been received.
+ *
+ * Resource allocation and sending of responses is decoupled. The
+ * request/segment which are being allocated and sent are as follows.
+ * Resources are allocated for:
+ * [request: qpriv->r_tid_alloc, segment: req->alloc_seg]
+ * The send thread sends:
+ * [request: qp->s_tail_ack_queue, segment:req->cur_seg]
+ */
+static void hfi1_tid_write_alloc_resources(struct rvt_qp *qp, bool intr_ctx)
+{
+ struct tid_rdma_request *req;
+ struct hfi1_qp_priv *qpriv = qp->priv;
+ struct hfi1_ctxtdata *rcd = qpriv->rcd;
+ struct tid_rdma_params *local = &qpriv->tid_rdma.local;
+ struct rvt_ack_entry *e;
+ u32 npkts, to_seg;
+ bool last;
+ int ret = 0;
+
+ lockdep_assert_held(&qp->s_lock);
+
+ while (1) {
+ trace_hfi1_rsp_tid_write_alloc_res(qp, 0);
+ trace_hfi1_tid_write_rsp_alloc_res(qp);
+ /*
+ * Don't allocate more segments if a RNR NAK has already been
+ * scheduled to avoid messing up qp->r_psn: the RNR NAK will
+ * be sent only when all allocated segments have been sent.
+ * However, if more segments are allocated before that, TID RDMA
+ * WRITE RESP packets will be sent out for these new segments
+ * before the RNR NAK packet. When the requester receives the
+ * RNR NAK packet, it will restart with qp->s_last_psn + 1,
+ * which does not match qp->r_psn and will be dropped.
+ * Consequently, the requester will exhaust its retries and
+ * put the qp into error state.
+ */
+ if (qpriv->rnr_nak_state == TID_RNR_NAK_SEND)
+ break;
+
+ /* No requests left to process */
+ if (qpriv->r_tid_alloc == qpriv->r_tid_head) {
+ /* If all data has been received, clear the flow */
+ if (qpriv->flow_state.index < RXE_NUM_TID_FLOWS &&
+ !qpriv->alloc_w_segs)
+ hfi1_kern_clear_hw_flow(rcd, qp);
+ break;
+ }
+
+ e = &qp->s_ack_queue[qpriv->r_tid_alloc];
+ if (e->opcode != TID_OP(WRITE_REQ))
+ goto next_req;
+ req = ack_to_tid_req(e);
+ trace_hfi1_tid_req_write_alloc_res(qp, 0, e->opcode, e->psn,
+ e->lpsn, req);
+ /* Finished allocating for all segments of this request */
+ if (req->alloc_seg >= req->total_segs)
+ goto next_req;
+
+ /* Can allocate only a maximum of local->max_write for a QP */
+ if (qpriv->alloc_w_segs >= local->max_write)
+ break;
+
+ /* Don't allocate at a sync point with data packets pending */
+ if (qpriv->sync_pt && qpriv->alloc_w_segs)
+ break;
+
+ /* All data received at the sync point, continue */
+ if (qpriv->sync_pt && !qpriv->alloc_w_segs) {
+ hfi1_kern_clear_hw_flow(rcd, qp);
+ qpriv->sync_pt = false;
+ if (qpriv->s_flags & HFI1_R_TID_SW_PSN)
+ qpriv->s_flags &= ~HFI1_R_TID_SW_PSN;
+ }
+
+ /* Allocate flow if we don't have one */
+ if (qpriv->flow_state.index >= RXE_NUM_TID_FLOWS) {
+ ret = hfi1_kern_setup_hw_flow(qpriv->rcd, qp);
+ if (ret) {
+ to_seg = tid_rdma_flow_wt *
+ position_in_queue(qpriv,
+ &rcd->flow_queue);
+ break;
+ }
+ }
+
+ npkts = rvt_div_round_up_mtu(qp, req->seg_len);
+
+ /*
+ * We are at a sync point if we run out of KDETH PSN space.
+ * Last PSN of every generation is reserved for RESYNC.
+ */
+ if (qpriv->flow_state.psn + npkts > MAX_TID_FLOW_PSN - 1) {
+ qpriv->sync_pt = true;
+ break;
+ }
+
+ /*
+ * If overtaking req->acked_tail, send an RNR NAK. Because the
+ * QP is not queued in this case, and the issue can only be
+ * caused due a delay in scheduling the second leg which we
+ * cannot estimate, we use a rather arbitrary RNR timeout of
+ * (MAX_FLOWS / 2) segments
+ */
+ if (!CIRC_SPACE(req->setup_head, req->acked_tail,
+ MAX_FLOWS)) {
+ ret = -EAGAIN;
+ to_seg = MAX_FLOWS >> 1;
+ qpriv->s_flags |= RVT_S_ACK_PENDING;
+ hfi1_schedule_tid_send(qp);
+ break;
+ }
+
+ /* Try to allocate rcv array / TID entries */
+ ret = hfi1_kern_exp_rcv_setup(req, &req->ss, &last);
+ if (ret == -EAGAIN)
+ to_seg = position_in_queue(qpriv, &rcd->rarr_queue);
+ if (ret)
+ break;
+
+ qpriv->alloc_w_segs++;
+ req->alloc_seg++;
+ continue;
+next_req:
+ /* Begin processing the next request */
+ if (++qpriv->r_tid_alloc >
+ rvt_size_atomic(ib_to_rvt(qp->ibqp.device)))
+ qpriv->r_tid_alloc = 0;
+ }
+
+ /*
+ * Schedule an RNR NAK to be sent if (a) flow or rcv array allocation
+ * has failed (b) we are called from the rcv handler interrupt context
+ * (c) an RNR NAK has not already been scheduled
+ */
+ if (ret == -EAGAIN && intr_ctx && !qp->r_nak_state)
+ goto send_rnr_nak;
+
+ return;
+
+send_rnr_nak:
+ lockdep_assert_held(&qp->r_lock);
+
+ /* Set r_nak_state to prevent unrelated events from generating NAK's */
+ qp->r_nak_state = hfi1_compute_tid_rnr_timeout(qp, to_seg) | IB_RNR_NAK;
+
+ /* Pull back r_psn to the segment being RNR NAK'd */
+ qp->r_psn = e->psn + req->alloc_seg;
+ qp->r_ack_psn = qp->r_psn;
+ /*
+ * Pull back r_head_ack_queue to the ack entry following the request
+ * being RNR NAK'd. This allows resources to be allocated to the request
+ * if the queued QP is scheduled.
+ */
+ qp->r_head_ack_queue = qpriv->r_tid_alloc + 1;
+ if (qp->r_head_ack_queue > rvt_size_atomic(ib_to_rvt(qp->ibqp.device)))
+ qp->r_head_ack_queue = 0;
+ qpriv->r_tid_head = qp->r_head_ack_queue;
+ /*
+ * These send side fields are used in make_rc_ack(). They are set in
+ * hfi1_send_rc_ack() but must be set here before dropping qp->s_lock
+ * for consistency
+ */
+ qp->s_nak_state = qp->r_nak_state;
+ qp->s_ack_psn = qp->r_ack_psn;
+ /*
+ * Clear the ACK PENDING flag to prevent unwanted ACK because we
+ * have modified qp->s_ack_psn here.
+ */
+ qp->s_flags &= ~(RVT_S_ACK_PENDING);
+
+ trace_hfi1_rsp_tid_write_alloc_res(qp, qp->r_psn);
+ /*
+ * qpriv->rnr_nak_state is used to determine when the scheduled RNR NAK
+ * has actually been sent. qp->s_flags RVT_S_ACK_PENDING bit cannot be
+ * used for this because qp->s_lock is dropped before calling
+ * hfi1_send_rc_ack() leading to inconsistency between the receive
+ * interrupt handlers and the send thread in make_rc_ack()
+ */
+ qpriv->rnr_nak_state = TID_RNR_NAK_SEND;
+
+ /*
+ * Schedule RNR NAK to be sent. RNR NAK's are scheduled from the receive
+ * interrupt handlers but will be sent from the send engine behind any
+ * previous responses that may have been scheduled
+ */
+ rc_defered_ack(rcd, qp);
+}
+
+void hfi1_rc_rcv_tid_rdma_write_req(struct hfi1_packet *packet)
+{
+ /* HANDLER FOR TID RDMA WRITE REQUEST packet (Responder side)*/
+
+ /*
+ * 1. Verify TID RDMA WRITE REQ as per IB_OPCODE_RC_RDMA_WRITE_FIRST
+ * (see hfi1_rc_rcv())
+ * - Don't allow 0-length requests.
+ * 2. Put TID RDMA WRITE REQ into the response queueu (s_ack_queue)
+ * - Setup struct tid_rdma_req with request info
+ * - Prepare struct tid_rdma_flow array?
+ * 3. Set the qp->s_ack_state as state diagram in design doc.
+ * 4. Set RVT_S_RESP_PENDING in s_flags.
+ * 5. Kick the send engine (hfi1_schedule_send())
+ */
+ struct hfi1_ctxtdata *rcd = packet->rcd;
+ struct rvt_qp *qp = packet->qp;
+ struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
+ struct ib_other_headers *ohdr = packet->ohdr;
+ struct rvt_ack_entry *e;
+ unsigned long flags;
+ struct ib_reth *reth;
+ struct hfi1_qp_priv *qpriv = qp->priv;
+ struct tid_rdma_request *req;
+ u32 bth0, psn, len, rkey, num_segs;
+ bool is_fecn;
+ u8 next;
+ u64 vaddr;
+ int diff;
+
+ bth0 = be32_to_cpu(ohdr->bth[0]);
+ if (hfi1_ruc_check_hdr(ibp, packet))
+ return;
+
+ is_fecn = process_ecn(qp, packet);
+ psn = mask_psn(be32_to_cpu(ohdr->bth[2]));
+ trace_hfi1_rsp_rcv_tid_write_req(qp, psn);
+
+ if (qp->state == IB_QPS_RTR && !(qp->r_flags & RVT_R_COMM_EST))
+ rvt_comm_est(qp);
+
+ if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE)))
+ goto nack_inv;
+
+ reth = &ohdr->u.tid_rdma.w_req.reth;
+ vaddr = be64_to_cpu(reth->vaddr);
+ len = be32_to_cpu(reth->length);
+
+ num_segs = DIV_ROUND_UP(len, qpriv->tid_rdma.local.max_len);
+ diff = delta_psn(psn, qp->r_psn);
+ if (unlikely(diff)) {
+ if (tid_rdma_rcv_error(packet, ohdr, qp, psn, diff))
+ return;
+ goto send_ack;
+ }
+
+ /*
+ * The resent request which was previously RNR NAK'd is inserted at the
+ * location of the original request, which is one entry behind
+ * r_head_ack_queue
+ */
+ if (qpriv->rnr_nak_state)
+ qp->r_head_ack_queue = qp->r_head_ack_queue ?
+ qp->r_head_ack_queue - 1 :
+ rvt_size_atomic(ib_to_rvt(qp->ibqp.device));
+
+ /* We've verified the request, insert it into the ack queue. */
+ next = qp->r_head_ack_queue + 1;
+ if (next > rvt_size_atomic(ib_to_rvt(qp->ibqp.device)))
+ next = 0;
+ spin_lock_irqsave(&qp->s_lock, flags);
+ if (unlikely(next == qp->s_acked_ack_queue)) {
+ if (!qp->s_ack_queue[next].sent)
+ goto nack_inv_unlock;
+ update_ack_queue(qp, next);
+ }
+ e = &qp->s_ack_queue[qp->r_head_ack_queue];
+ req = ack_to_tid_req(e);
+
+ /* Bring previously RNR NAK'd request back to life */
+ if (qpriv->rnr_nak_state) {
+ qp->r_nak_state = 0;
+ qp->s_nak_state = 0;
+ qpriv->rnr_nak_state = TID_RNR_NAK_INIT;
+ qp->r_psn = e->lpsn + 1;
+ req->state = TID_REQUEST_INIT;
+ goto update_head;
+ }
+
+ if (e->rdma_sge.mr) {
+ rvt_put_mr(e->rdma_sge.mr);
+ e->rdma_sge.mr = NULL;
+ }
+
+ /* The length needs to be in multiples of PAGE_SIZE */
+ if (!len || len & ~PAGE_MASK)
+ goto nack_inv_unlock;
+
+ rkey = be32_to_cpu(reth->rkey);
+ qp->r_len = len;
+
+ if (e->opcode == TID_OP(WRITE_REQ) &&
+ (req->setup_head != req->clear_tail ||
+ req->clear_tail != req->acked_tail))
+ goto nack_inv_unlock;
+
+ if (unlikely(!rvt_rkey_ok(qp, &e->rdma_sge, qp->r_len, vaddr,
+ rkey, IB_ACCESS_REMOTE_WRITE)))
+ goto nack_acc;
+
+ qp->r_psn += num_segs - 1;
+
+ e->opcode = (bth0 >> 24) & 0xff;
+ e->psn = psn;
+ e->lpsn = qp->r_psn;
+ e->sent = 0;
+
+ req->n_flows = min_t(u16, num_segs, qpriv->tid_rdma.local.max_write);
+ req->state = TID_REQUEST_INIT;
+ req->cur_seg = 0;
+ req->comp_seg = 0;
+ req->ack_seg = 0;
+ req->alloc_seg = 0;
+ req->isge = 0;
+ req->seg_len = qpriv->tid_rdma.local.max_len;
+ req->total_len = len;
+ req->total_segs = num_segs;
+ req->r_flow_psn = e->psn;
+ req->ss.sge = e->rdma_sge;
+ req->ss.num_sge = 1;
+
+ req->flow_idx = req->setup_head;
+ req->clear_tail = req->setup_head;
+ req->acked_tail = req->setup_head;
+
+ qp->r_state = e->opcode;
+ qp->r_nak_state = 0;
+ /*
+ * We need to increment the MSN here instead of when we
+ * finish sending the result since a duplicate request would
+ * increment it more than once.
+ */
+ qp->r_msn++;
+ qp->r_psn++;
+
+ trace_hfi1_tid_req_rcv_write_req(qp, 0, e->opcode, e->psn, e->lpsn,
+ req);
+
+ if (qpriv->r_tid_tail == HFI1_QP_WQE_INVALID) {
+ qpriv->r_tid_tail = qp->r_head_ack_queue;
+ } else if (qpriv->r_tid_tail == qpriv->r_tid_head) {
+ struct tid_rdma_request *ptr;
+
+ e = &qp->s_ack_queue[qpriv->r_tid_tail];
+ ptr = ack_to_tid_req(e);
+
+ if (e->opcode != TID_OP(WRITE_REQ) ||
+ ptr->comp_seg == ptr->total_segs) {
+ if (qpriv->r_tid_tail == qpriv->r_tid_ack)
+ qpriv->r_tid_ack = qp->r_head_ack_queue;
+ qpriv->r_tid_tail = qp->r_head_ack_queue;
+ }
+ }
+update_head:
+ qp->r_head_ack_queue = next;
+ qpriv->r_tid_head = qp->r_head_ack_queue;
+
+ hfi1_tid_write_alloc_resources(qp, true);
+ trace_hfi1_tid_write_rsp_rcv_req(qp);
+
+ /* Schedule the send tasklet. */
+ qp->s_flags |= RVT_S_RESP_PENDING;
+ hfi1_schedule_send(qp);
+
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+ if (is_fecn)
+ goto send_ack;
+ return;
+
+nack_inv_unlock:
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+nack_inv:
+ rvt_rc_error(qp, IB_WC_LOC_QP_OP_ERR);
+ qp->r_nak_state = IB_NAK_INVALID_REQUEST;
+ qp->r_ack_psn = qp->r_psn;
+ /* Queue NAK for later */
+ rc_defered_ack(rcd, qp);
+ return;
+nack_acc:
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+ rvt_rc_error(qp, IB_WC_LOC_PROT_ERR);
+ qp->r_nak_state = IB_NAK_REMOTE_ACCESS_ERROR;
+ qp->r_ack_psn = qp->r_psn;
+send_ack:
+ hfi1_send_rc_ack(packet, is_fecn);
+}
+
+u32 hfi1_build_tid_rdma_write_resp(struct rvt_qp *qp, struct rvt_ack_entry *e,
+ struct ib_other_headers *ohdr, u32 *bth1,
+ u32 bth2, u32 *len,
+ struct rvt_sge_state **ss)
+{
+ struct hfi1_ack_priv *epriv = e->priv;
+ struct tid_rdma_request *req = &epriv->tid_req;
+ struct hfi1_qp_priv *qpriv = qp->priv;
+ struct tid_rdma_flow *flow = NULL;
+ u32 resp_len = 0, hdwords = 0;
+ void *resp_addr = NULL;
+ struct tid_rdma_params *remote;
+
+ trace_hfi1_tid_req_build_write_resp(qp, 0, e->opcode, e->psn, e->lpsn,
+ req);
+ trace_hfi1_tid_write_rsp_build_resp(qp);
+ trace_hfi1_rsp_build_tid_write_resp(qp, bth2);
+ flow = &req->flows[req->flow_idx];
+ switch (req->state) {
+ default:
+ /*
+ * Try to allocate resources here in case QP was queued and was
+ * later scheduled when resources became available
+ */
+ hfi1_tid_write_alloc_resources(qp, false);
+
+ /* We've already sent everything which is ready */
+ if (req->cur_seg >= req->alloc_seg)
+ goto done;
+
+ /*
+ * Resources can be assigned but responses cannot be sent in
+ * rnr_nak state, till the resent request is recei