aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/infiniband
diff options
context:
space:
mode:
author: Dmitry Torokhov <dmitry.torokhov@gmail.com> 2019-07-10 23:24:10 -0700
committer: Dmitry Torokhov <dmitry.torokhov@gmail.com> 2019-07-10 23:24:10 -0700
commit: 597473720f4dc69749542bfcfed4a927a43d935e (patch)
tree: 711bf773910fb93d1dd9120c633adc807685e0d8 /drivers/infiniband
parent: Input: atmel_mxt_ts - fix leak in mxt_update_cfg() (diff)
parent: Input: gpio_keys_polled - allow specifying name of input device (diff)
download: linux-dev-597473720f4dc69749542bfcfed4a927a43d935e.tar.xz
linux-dev-597473720f4dc69749542bfcfed4a927a43d935e.zip
Merge branch 'next' into for-linus
Prepare input updates for 5.3 merge window.
Diffstat (limited to 'drivers/infiniband')
-rw-r--r--drivers/infiniband/Kconfig15
-rw-r--r--drivers/infiniband/core/Makefile6
-rw-r--r--drivers/infiniband/core/agent.c4
-rw-r--r--drivers/infiniband/core/cache.c131
-rw-r--r--drivers/infiniband/core/cgroup.c5
-rw-r--r--drivers/infiniband/core/cm.c9
-rw-r--r--drivers/infiniband/core/cma.c144
-rw-r--r--drivers/infiniband/core/cma_configfs.c3
-rw-r--r--drivers/infiniband/core/cma_priv.h32
-rw-r--r--drivers/infiniband/core/core_priv.h79
-rw-r--r--drivers/infiniband/core/cq.c8
-rw-r--r--drivers/infiniband/core/device.c1465
-rw-r--r--drivers/infiniband/core/fmr_pool.c8
-rw-r--r--drivers/infiniband/core/iwcm.c21
-rw-r--r--drivers/infiniband/core/iwpm_msg.c232
-rw-r--r--drivers/infiniband/core/iwpm_util.c86
-rw-r--r--drivers/infiniband/core/iwpm_util.h12
-rw-r--r--drivers/infiniband/core/mad.c26
-rw-r--r--drivers/infiniband/core/mad_rmpp.c11
-rw-r--r--drivers/infiniband/core/netlink.c4
-rw-r--r--drivers/infiniband/core/nldev.c524
-rw-r--r--drivers/infiniband/core/opa_smi.h4
-rw-r--r--drivers/infiniband/core/rdma_core.c94
-rw-r--r--drivers/infiniband/core/rdma_core.h81
-rw-r--r--drivers/infiniband/core/restrack.c240
-rw-r--r--drivers/infiniband/core/restrack.h28
-rw-r--r--drivers/infiniband/core/rw.c12
-rw-r--r--drivers/infiniband/core/sa_query.c9
-rw-r--r--drivers/infiniband/core/security.c104
-rw-r--r--drivers/infiniband/core/smi.h4
-rw-r--r--drivers/infiniband/core/sysfs.c119
-rw-r--r--drivers/infiniband/core/ucm.c2
-rw-r--r--drivers/infiniband/core/ucma.c7
-rw-r--r--drivers/infiniband/core/umem.c60
-rw-r--r--drivers/infiniband/core/umem_odp.c58
-rw-r--r--drivers/infiniband/core/user_mad.c282
-rw-r--r--drivers/infiniband/core/uverbs.h87
-rw-r--r--drivers/infiniband/core/uverbs_cmd.c2128
-rw-r--r--drivers/infiniband/core/uverbs_ioctl.c118
-rw-r--r--drivers/infiniband/core/uverbs_main.c328
-rw-r--r--drivers/infiniband/core/uverbs_std_types.c120
-rw-r--r--drivers/infiniband/core/uverbs_std_types_counters.c20
-rw-r--r--drivers/infiniband/core/uverbs_std_types_cq.c23
-rw-r--r--drivers/infiniband/core/uverbs_std_types_device.c230
-rw-r--r--drivers/infiniband/core/uverbs_std_types_dm.c17
-rw-r--r--drivers/infiniband/core/uverbs_std_types_flow_action.c31
-rw-r--r--drivers/infiniband/core/uverbs_std_types_mr.c76
-rw-r--r--drivers/infiniband/core/uverbs_uapi.c519
-rw-r--r--drivers/infiniband/core/verbs.c255
-rw-r--r--drivers/infiniband/hw/bnxt_re/Kconfig2
-rw-r--r--drivers/infiniband/hw/bnxt_re/Makefile2
-rw-r--r--drivers/infiniband/hw/bnxt_re/bnxt_re.h1
-rw-r--r--drivers/infiniband/hw/bnxt_re/ib_verbs.c283
-rw-r--r--drivers/infiniband/hw/bnxt_re/ib_verbs.h19
-rw-r--r--drivers/infiniband/hw/bnxt_re/main.c262
-rw-r--r--drivers/infiniband/hw/bnxt_re/qplib_fp.c193
-rw-r--r--drivers/infiniband/hw/bnxt_re/qplib_fp.h47
-rw-r--r--drivers/infiniband/hw/bnxt_re/qplib_rcfw.c76
-rw-r--r--drivers/infiniband/hw/bnxt_re/qplib_rcfw.h104
-rw-r--r--drivers/infiniband/hw/bnxt_re/qplib_res.c30
-rw-r--r--drivers/infiniband/hw/bnxt_re/qplib_res.h31
-rw-r--r--drivers/infiniband/hw/bnxt_re/qplib_sp.c18
-rw-r--r--drivers/infiniband/hw/bnxt_re/qplib_sp.h6
-rw-r--r--drivers/infiniband/hw/bnxt_re/roce_hsi.h160
-rw-r--r--drivers/infiniband/hw/cxgb3/Makefile2
-rw-r--r--drivers/infiniband/hw/cxgb3/cxio_hal.c5
-rw-r--r--drivers/infiniband/hw/cxgb3/iwch.c2
-rw-r--r--drivers/infiniband/hw/cxgb3/iwch_provider.c159
-rw-r--r--drivers/infiniband/hw/cxgb4/Makefile4
-rw-r--r--drivers/infiniband/hw/cxgb4/cm.c210
-rw-r--r--drivers/infiniband/hw/cxgb4/device.c25
-rw-r--r--drivers/infiniband/hw/cxgb4/iw_cxgb4.h16
-rw-r--r--drivers/infiniband/hw/cxgb4/mem.c36
-rw-r--r--drivers/infiniband/hw/cxgb4/provider.c159
-rw-r--r--drivers/infiniband/hw/cxgb4/qp.c39
-rw-r--r--drivers/infiniband/hw/cxgb4/t4.h1
-rw-r--r--drivers/infiniband/hw/hfi1/Makefile2
-rw-r--r--drivers/infiniband/hw/hfi1/affinity.c3
-rw-r--r--drivers/infiniband/hw/hfi1/chip.c86
-rw-r--r--drivers/infiniband/hw/hfi1/chip.h4
-rw-r--r--drivers/infiniband/hw/hfi1/chip_registers.h4
-rw-r--r--drivers/infiniband/hw/hfi1/common.h23
-rw-r--r--drivers/infiniband/hw/hfi1/debugfs.c105
-rw-r--r--drivers/infiniband/hw/hfi1/debugfs.h12
-rw-r--r--drivers/infiniband/hw/hfi1/driver.c130
-rw-r--r--drivers/infiniband/hw/hfi1/fault.c53
-rw-r--r--drivers/infiniband/hw/hfi1/file_ops.c2
-rw-r--r--drivers/infiniband/hw/hfi1/hfi.h59
-rw-r--r--drivers/infiniband/hw/hfi1/init.c67
-rw-r--r--drivers/infiniband/hw/hfi1/iowait.c34
-rw-r--r--drivers/infiniband/hw/hfi1/iowait.h99
-rw-r--r--drivers/infiniband/hw/hfi1/mad.c2
-rw-r--r--drivers/infiniband/hw/hfi1/mmu_rb.c13
-rw-r--r--drivers/infiniband/hw/hfi1/opfn.c323
-rw-r--r--drivers/infiniband/hw/hfi1/opfn.h85
-rw-r--r--drivers/infiniband/hw/hfi1/pio.c58
-rw-r--r--drivers/infiniband/hw/hfi1/pio.h5
-rw-r--r--drivers/infiniband/hw/hfi1/qp.c100
-rw-r--r--drivers/infiniband/hw/hfi1/qp.h7
-rw-r--r--drivers/infiniband/hw/hfi1/rc.c1177
-rw-r--r--drivers/infiniband/hw/hfi1/rc.h51
-rw-r--r--drivers/infiniband/hw/hfi1/ruc.c48
-rw-r--r--drivers/infiniband/hw/hfi1/sdma.c61
-rw-r--r--drivers/infiniband/hw/hfi1/sdma.h1
-rw-r--r--drivers/infiniband/hw/hfi1/sdma_txreq.h1
-rw-r--r--drivers/infiniband/hw/hfi1/sysfs.c16
-rw-r--r--drivers/infiniband/hw/hfi1/tid_rdma.c5451
-rw-r--r--drivers/infiniband/hw/hfi1/tid_rdma.h322
-rw-r--r--drivers/infiniband/hw/hfi1/trace.c118
-rw-r--r--drivers/infiniband/hw/hfi1/trace.h1
-rw-r--r--drivers/infiniband/hw/hfi1/trace_ibhdrs.h8
-rw-r--r--drivers/infiniband/hw/hfi1/trace_rc.h48
-rw-r--r--drivers/infiniband/hw/hfi1/trace_rx.h107
-rw-r--r--drivers/infiniband/hw/hfi1/trace_tid.h1610
-rw-r--r--drivers/infiniband/hw/hfi1/trace_tx.h18
-rw-r--r--drivers/infiniband/hw/hfi1/uc.c5
-rw-r--r--drivers/infiniband/hw/hfi1/ud.c58
-rw-r--r--drivers/infiniband/hw/hfi1/user_exp_rcv.c2
-rw-r--r--drivers/infiniband/hw/hfi1/user_exp_rcv.h1
-rw-r--r--drivers/infiniband/hw/hfi1/user_pages.c12
-rw-r--r--drivers/infiniband/hw/hfi1/user_sdma.c14
-rw-r--r--drivers/infiniband/hw/hfi1/verbs.c234
-rw-r--r--drivers/infiniband/hw/hfi1/verbs.h106
-rw-r--r--drivers/infiniband/hw/hfi1/verbs_txreq.h1
-rw-r--r--drivers/infiniband/hw/hfi1/vnic_main.c4
-rw-r--r--drivers/infiniband/hw/hfi1/vnic_sdma.c24
-rw-r--r--drivers/infiniband/hw/hns/Kconfig1
-rw-r--r--drivers/infiniband/hw/hns/Makefile4
-rw-r--r--drivers/infiniband/hw/hns/hns_roce_ah.c3
-rw-r--r--drivers/infiniband/hw/hns/hns_roce_alloc.c13
-rw-r--r--drivers/infiniband/hw/hns/hns_roce_cmd.c32
-rw-r--r--drivers/infiniband/hw/hns/hns_roce_cmd.h16
-rw-r--r--drivers/infiniband/hw/hns/hns_roce_common.h3
-rw-r--r--drivers/infiniband/hw/hns/hns_roce_cq.c9
-rw-r--r--drivers/infiniband/hw/hns/hns_roce_db.c6
-rw-r--r--drivers/infiniband/hw/hns/hns_roce_device.h160
-rw-r--r--drivers/infiniband/hw/hns/hns_roce_hem.c115
-rw-r--r--drivers/infiniband/hw/hns/hns_roce_hem.h5
-rw-r--r--drivers/infiniband/hw/hns/hns_roce_hw_v1.c49
-rw-r--r--drivers/infiniband/hw/hns/hns_roce_hw_v2.c1162
-rw-r--r--drivers/infiniband/hw/hns/hns_roce_hw_v2.h234
-rw-r--r--drivers/infiniband/hw/hns/hns_roce_main.c273
-rw-r--r--drivers/infiniband/hw/hns/hns_roce_mr.c240
-rw-r--r--drivers/infiniband/hw/hns/hns_roce_pd.c25
-rw-r--r--drivers/infiniband/hw/hns/hns_roce_qp.c126
-rw-r--r--drivers/infiniband/hw/hns/hns_roce_srq.c463
-rw-r--r--drivers/infiniband/hw/i40iw/Makefile2
-rw-r--r--drivers/infiniband/hw/i40iw/i40iw_cm.c10
-rw-r--r--drivers/infiniband/hw/i40iw/i40iw_utils.c17
-rw-r--r--drivers/infiniband/hw/i40iw/i40iw_verbs.c262
-rw-r--r--drivers/infiniband/hw/mlx4/Kconfig2
-rw-r--r--drivers/infiniband/hw/mlx4/ah.c6
-rw-r--r--drivers/infiniband/hw/mlx4/alias_GUID.c4
-rw-r--r--drivers/infiniband/hw/mlx4/cm.c2
-rw-r--r--drivers/infiniband/hw/mlx4/cq.c26
-rw-r--r--drivers/infiniband/hw/mlx4/doorbell.c6
-rw-r--r--drivers/infiniband/hw/mlx4/mad.c32
-rw-r--r--drivers/infiniband/hw/mlx4/main.c282
-rw-r--r--drivers/infiniband/hw/mlx4/mlx4_ib.h8
-rw-r--r--drivers/infiniband/hw/mlx4/mr.c13
-rw-r--r--drivers/infiniband/hw/mlx4/qp.c115
-rw-r--r--drivers/infiniband/hw/mlx4/srq.c22
-rw-r--r--drivers/infiniband/hw/mlx4/sysfs.c12
-rw-r--r--drivers/infiniband/hw/mlx5/Kconfig1
-rw-r--r--drivers/infiniband/hw/mlx5/Makefile4
-rw-r--r--drivers/infiniband/hw/mlx5/ah.c4
-rw-r--r--drivers/infiniband/hw/mlx5/cmd.c56
-rw-r--r--drivers/infiniband/hw/mlx5/cmd.h4
-rw-r--r--drivers/infiniband/hw/mlx5/cong.c15
-rw-r--r--drivers/infiniband/hw/mlx5/cq.c129
-rw-r--r--drivers/infiniband/hw/mlx5/devx.c720
-rw-r--r--drivers/infiniband/hw/mlx5/doorbell.c6
-rw-r--r--drivers/infiniband/hw/mlx5/flow.c72
-rw-r--r--drivers/infiniband/hw/mlx5/ib_rep.c78
-rw-r--r--drivers/infiniband/hw/mlx5/ib_rep.h10
-rw-r--r--drivers/infiniband/hw/mlx5/mad.c29
-rw-r--r--drivers/infiniband/hw/mlx5/main.c936
-rw-r--r--drivers/infiniband/hw/mlx5/mem.c5
-rw-r--r--drivers/infiniband/hw/mlx5/mlx5_ib.h122
-rw-r--r--drivers/infiniband/hw/mlx5/mr.c188
-rw-r--r--drivers/infiniband/hw/mlx5/odp.c721
-rw-r--r--drivers/infiniband/hw/mlx5/qp.c783
-rw-r--r--drivers/infiniband/hw/mlx5/srq.c84
-rw-r--r--drivers/infiniband/hw/mlx5/srq.h71
-rw-r--r--drivers/infiniband/hw/mlx5/srq_cmd.c720
-rw-r--r--drivers/infiniband/hw/mthca/mthca_dev.h9
-rw-r--r--drivers/infiniband/hw/mthca/mthca_mad.c7
-rw-r--r--drivers/infiniband/hw/mthca/mthca_main.c2
-rw-r--r--drivers/infiniband/hw/mthca/mthca_memfree.c5
-rw-r--r--drivers/infiniband/hw/mthca/mthca_provider.c301
-rw-r--r--drivers/infiniband/hw/mthca/mthca_qp.c35
-rw-r--r--drivers/infiniband/hw/mthca/mthca_srq.c42
-rw-r--r--drivers/infiniband/hw/nes/Kconfig2
-rw-r--r--drivers/infiniband/hw/nes/nes_cm.c2
-rw-r--r--drivers/infiniband/hw/nes/nes_mgt.c8
-rw-r--r--drivers/infiniband/hw/nes/nes_verbs.c382
-rw-r--r--drivers/infiniband/hw/nes/nes_verbs.h1
-rw-r--r--drivers/infiniband/hw/ocrdma/Makefile2
-rw-r--r--drivers/infiniband/hw/ocrdma/ocrdma_ah.c4
-rw-r--r--drivers/infiniband/hw/ocrdma/ocrdma_ah.h4
-rw-r--r--drivers/infiniband/hw/ocrdma/ocrdma_hw.c14
-rw-r--r--drivers/infiniband/hw/ocrdma/ocrdma_main.c104
-rw-r--r--drivers/infiniband/hw/ocrdma/ocrdma_stats.c76
-rw-r--r--drivers/infiniband/hw/ocrdma/ocrdma_verbs.c214
-rw-r--r--drivers/infiniband/hw/ocrdma/ocrdma_verbs.h11
-rw-r--r--drivers/infiniband/hw/qedr/main.c112
-rw-r--r--drivers/infiniband/hw/qedr/qedr_iw_cm.c4
-rw-r--r--drivers/infiniband/hw/qedr/verbs.c221
-rw-r--r--drivers/infiniband/hw/qedr/verbs.h14
-rw-r--r--drivers/infiniband/hw/qib/qib_debugfs.c27
-rw-r--r--drivers/infiniband/hw/qib/qib_file_ops.c2
-rw-r--r--drivers/infiniband/hw/qib/qib_iba6120.c1
-rw-r--r--drivers/infiniband/hw/qib/qib_iba7220.c2
-rw-r--r--drivers/infiniband/hw/qib/qib_iba7322.c3
-rw-r--r--drivers/infiniband/hw/qib/qib_init.c1
-rw-r--r--drivers/infiniband/hw/qib/qib_mad.c3
-rw-r--r--drivers/infiniband/hw/qib/qib_pcie.c2
-rw-r--r--drivers/infiniband/hw/qib/qib_rc.c7
-rw-r--r--drivers/infiniband/hw/qib/qib_sdma.c33
-rw-r--r--drivers/infiniband/hw/qib/qib_sysfs.c18
-rw-r--r--drivers/infiniband/hw/qib/qib_ud.c8
-rw-r--r--drivers/infiniband/hw/qib/qib_user_pages.c75
-rw-r--r--drivers/infiniband/hw/qib/qib_user_sdma.c1
-rw-r--r--drivers/infiniband/hw/qib/qib_verbs.c33
-rw-r--r--drivers/infiniband/hw/usnic/Makefile2
-rw-r--r--drivers/infiniband/hw/usnic/usnic_debugfs.c26
-rw-r--r--drivers/infiniband/hw/usnic/usnic_ib_main.c104
-rw-r--r--drivers/infiniband/hw/usnic/usnic_ib_qp_grp.c2
-rw-r--r--drivers/infiniband/hw/usnic/usnic_ib_sysfs.c26
-rw-r--r--drivers/infiniband/hw/usnic/usnic_ib_verbs.c124
-rw-r--r--drivers/infiniband/hw/usnic/usnic_ib_verbs.h27
-rw-r--r--drivers/infiniband/hw/usnic/usnic_uiom.c65
-rw-r--r--drivers/infiniband/hw/usnic/usnic_uiom.h1
-rw-r--r--drivers/infiniband/hw/vmw_pvrdma/pvrdma.h35
-rw-r--r--drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c2
-rw-r--r--drivers/infiniband/hw/vmw_pvrdma/pvrdma_dev_api.h15
-rw-r--r--drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c100
-rw-r--r--drivers/infiniband/hw/vmw_pvrdma/pvrdma_misc.c21
-rw-r--r--drivers/infiniband/hw/vmw_pvrdma/pvrdma_mr.c3
-rw-r--r--drivers/infiniband/hw/vmw_pvrdma/pvrdma_qp.c14
-rw-r--r--drivers/infiniband/hw/vmw_pvrdma/pvrdma_srq.c6
-rw-r--r--drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.c106
-rw-r--r--drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.h16
-rw-r--r--drivers/infiniband/sw/rdmavt/ah.c5
-rw-r--r--drivers/infiniband/sw/rdmavt/ah.h3
-rw-r--r--drivers/infiniband/sw/rdmavt/mad.c3
-rw-r--r--drivers/infiniband/sw/rdmavt/mr.c38
-rw-r--r--drivers/infiniband/sw/rdmavt/pd.c29
-rw-r--r--drivers/infiniband/sw/rdmavt/pd.h7
-rw-r--r--drivers/infiniband/sw/rdmavt/qp.c120
-rw-r--r--drivers/infiniband/sw/rdmavt/rc.c13
-rw-r--r--drivers/infiniband/sw/rdmavt/srq.c5
-rw-r--r--drivers/infiniband/sw/rdmavt/trace_cq.h10
-rw-r--r--drivers/infiniband/sw/rdmavt/vt.c333
-rw-r--r--drivers/infiniband/sw/rxe/rxe.c67
-rw-r--r--drivers/infiniband/sw/rxe/rxe.h22
-rw-r--r--drivers/infiniband/sw/rxe/rxe_av.c7
-rw-r--r--drivers/infiniband/sw/rxe/rxe_comp.c12
-rw-r--r--drivers/infiniband/sw/rxe/rxe_hw_counters.c9
-rw-r--r--drivers/infiniband/sw/rxe/rxe_hw_counters.h3
-rw-r--r--drivers/infiniband/sw/rxe/rxe_loc.h16
-rw-r--r--drivers/infiniband/sw/rxe/rxe_mr.c15
-rw-r--r--drivers/infiniband/sw/rxe/rxe_net.c113
-rw-r--r--drivers/infiniband/sw/rxe/rxe_net.h2
-rw-r--r--drivers/infiniband/sw/rxe/rxe_param.h3
-rw-r--r--drivers/infiniband/sw/rxe/rxe_pool.c103
-rw-r--r--drivers/infiniband/sw/rxe/rxe_pool.h6
-rw-r--r--drivers/infiniband/sw/rxe/rxe_qp.c24
-rw-r--r--drivers/infiniband/sw/rxe/rxe_recv.c12
-rw-r--r--drivers/infiniband/sw/rxe/rxe_req.c3
-rw-r--r--drivers/infiniband/sw/rxe/rxe_resp.c36
-rw-r--r--drivers/infiniband/sw/rxe/rxe_sysfs.c56
-rw-r--r--drivers/infiniband/sw/rxe/rxe_verbs.c228
-rw-r--r--drivers/infiniband/sw/rxe/rxe_verbs.h15
-rw-r--r--drivers/infiniband/ulp/ipoib/ipoib.h5
-rw-r--r--drivers/infiniband/ulp/ipoib/ipoib_cm.c3
-rw-r--r--drivers/infiniband/ulp/ipoib/ipoib_fs.c7
-rw-r--r--drivers/infiniband/ulp/ipoib/ipoib_ib.c5
-rw-r--r--drivers/infiniband/ulp/ipoib/ipoib_main.c24
-rw-r--r--drivers/infiniband/ulp/iser/iscsi_iser.c1
-rw-r--r--drivers/infiniband/ulp/iser/iscsi_iser.h2
-rw-r--r--drivers/infiniband/ulp/iser/iser_memory.c28
-rw-r--r--drivers/infiniband/ulp/isert/Makefile1
-rw-r--r--drivers/infiniband/ulp/isert/ib_isert.c2
-rw-r--r--drivers/infiniband/ulp/opa_vnic/opa_vnic_netdev.c8
-rw-r--r--drivers/infiniband/ulp/opa_vnic/opa_vnic_vema.c8
-rw-r--r--drivers/infiniband/ulp/srp/ib_srp.c192
-rw-r--r--drivers/infiniband/ulp/srp/ib_srp.h20
-rw-r--r--drivers/infiniband/ulp/srpt/Makefile1
-rw-r--r--drivers/infiniband/ulp/srpt/ib_srpt.c371
-rw-r--r--drivers/infiniband/ulp/srpt/ib_srpt.h48
291 files changed, 24660 insertions, 8937 deletions
diff --git a/drivers/infiniband/Kconfig b/drivers/infiniband/Kconfig
index 0a3ec7c726ec..a1fb840de45d 100644
--- a/drivers/infiniband/Kconfig
+++ b/drivers/infiniband/Kconfig
@@ -89,6 +89,7 @@ config INFINIBAND_ADDR_TRANS_CONFIGFS
This allows the user to config the default GID type that the CM
uses for each device, when initiating new connections.
+if INFINIBAND_USER_ACCESS || !INFINIBAND_USER_ACCESS
source "drivers/infiniband/hw/mthca/Kconfig"
source "drivers/infiniband/hw/qib/Kconfig"
source "drivers/infiniband/hw/cxgb3/Kconfig"
@@ -101,6 +102,12 @@ source "drivers/infiniband/hw/ocrdma/Kconfig"
source "drivers/infiniband/hw/vmw_pvrdma/Kconfig"
source "drivers/infiniband/hw/usnic/Kconfig"
source "drivers/infiniband/hw/hns/Kconfig"
+source "drivers/infiniband/hw/bnxt_re/Kconfig"
+source "drivers/infiniband/hw/hfi1/Kconfig"
+source "drivers/infiniband/hw/qedr/Kconfig"
+source "drivers/infiniband/sw/rdmavt/Kconfig"
+source "drivers/infiniband/sw/rxe/Kconfig"
+endif
source "drivers/infiniband/ulp/ipoib/Kconfig"
@@ -111,13 +118,5 @@ source "drivers/infiniband/ulp/iser/Kconfig"
source "drivers/infiniband/ulp/isert/Kconfig"
source "drivers/infiniband/ulp/opa_vnic/Kconfig"
-source "drivers/infiniband/sw/rdmavt/Kconfig"
-source "drivers/infiniband/sw/rxe/Kconfig"
-
-source "drivers/infiniband/hw/hfi1/Kconfig"
-
-source "drivers/infiniband/hw/qedr/Kconfig"
-
-source "drivers/infiniband/hw/bnxt_re/Kconfig"
endif # INFINIBAND
diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile
index 867cee5e27b2..313f2349b518 100644
--- a/drivers/infiniband/core/Makefile
+++ b/drivers/infiniband/core/Makefile
@@ -15,8 +15,6 @@ ib_core-y := packer.o ud_header.o verbs.o cq.o rw.o sysfs.o \
nldev.o restrack.o
ib_core-$(CONFIG_SECURITY_INFINIBAND) += security.o
-ib_core-$(CONFIG_INFINIBAND_USER_MEM) += umem.o
-ib_core-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += umem_odp.o
ib_core-$(CONFIG_CGROUP_RDMA) += cgroup.o
ib_cm-y := cm.o
@@ -38,4 +36,6 @@ ib_uverbs-y := uverbs_main.o uverbs_cmd.o uverbs_marshall.o \
uverbs_std_types_cq.o \
uverbs_std_types_flow_action.o uverbs_std_types_dm.o \
uverbs_std_types_mr.o uverbs_std_types_counters.o \
- uverbs_uapi.o
+ uverbs_uapi.o uverbs_std_types_device.o
+ib_uverbs-$(CONFIG_INFINIBAND_USER_MEM) += umem.o
+ib_uverbs-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += umem_odp.o
diff --git a/drivers/infiniband/core/agent.c b/drivers/infiniband/core/agent.c
index 324ef85a13b6..f82b4260de42 100644
--- a/drivers/infiniband/core/agent.c
+++ b/drivers/infiniband/core/agent.c
@@ -137,13 +137,13 @@ void agent_send_response(const struct ib_mad_hdr *mad_hdr, const struct ib_grh *
err2:
ib_free_send_mad(send_buf);
err1:
- rdma_destroy_ah(ah);
+ rdma_destroy_ah(ah, RDMA_DESTROY_AH_SLEEPABLE);
}
static void agent_send_handler(struct ib_mad_agent *mad_agent,
struct ib_mad_send_wc *mad_send_wc)
{
- rdma_destroy_ah(mad_send_wc->send_buf->ah);
+ rdma_destroy_ah(mad_send_wc->send_buf->ah, RDMA_DESTROY_AH_SLEEPABLE);
ib_free_send_mad(mad_send_wc->send_buf);
}
diff --git a/drivers/infiniband/core/cache.c b/drivers/infiniband/core/cache.c
index 5b2fce4a7091..43c67e5f43c6 100644
--- a/drivers/infiniband/core/cache.c
+++ b/drivers/infiniband/core/cache.c
@@ -185,7 +185,7 @@ EXPORT_SYMBOL(ib_cache_gid_parse_type_str);
static struct ib_gid_table *rdma_gid_table(struct ib_device *device, u8 port)
{
- return device->cache.ports[port - rdma_start_port(device)].gid;
+ return device->port_data[port].cache.gid;
}
static bool is_gid_entry_free(const struct ib_gid_table_entry *entry)
@@ -215,10 +215,6 @@ static void free_gid_entry_locked(struct ib_gid_table_entry *entry)
dev_dbg(&device->dev, "%s port=%d index=%d gid %pI6\n", __func__,
port_num, entry->attr.index, entry->attr.gid.raw);
- if (rdma_cap_roce_gid_table(device, port_num) &&
- entry->state != GID_TABLE_ENTRY_INVALID)
- device->del_gid(&entry->attr, &entry->context);
-
write_lock_irq(&table->rwlock);
/*
@@ -324,7 +320,7 @@ static int add_roce_gid(struct ib_gid_table_entry *entry)
return -EINVAL;
}
if (rdma_cap_roce_gid_table(attr->device, attr->port_num)) {
- ret = attr->device->add_gid(attr, &entry->context);
+ ret = attr->device->ops.add_gid(attr, &entry->context);
if (ret) {
dev_err(&attr->device->dev,
"%s GID add failed port=%d index=%d\n",
@@ -364,6 +360,9 @@ static void del_gid(struct ib_device *ib_dev, u8 port,
table->data_vec[ix] = NULL;
write_unlock_irq(&table->rwlock);
+ if (rdma_cap_roce_gid_table(ib_dev, port))
+ ib_dev->ops.del_gid(&entry->attr, &entry->context);
+
put_gid_entry_locked(entry);
}
@@ -548,21 +547,19 @@ int ib_cache_gid_add(struct ib_device *ib_dev, u8 port,
unsigned long mask;
int ret;
- if (ib_dev->get_netdev) {
- idev = ib_dev->get_netdev(ib_dev, port);
- if (idev && attr->ndev != idev) {
- union ib_gid default_gid;
+ idev = ib_device_get_netdev(ib_dev, port);
+ if (idev && attr->ndev != idev) {
+ union ib_gid default_gid;
- /* Adding default GIDs in not permitted */
- make_default_gid(idev, &default_gid);
- if (!memcmp(gid, &default_gid, sizeof(*gid))) {
- dev_put(idev);
- return -EPERM;
- }
- }
- if (idev)
+ /* Adding default GIDs is not permitted */
+ make_default_gid(idev, &default_gid);
+ if (!memcmp(gid, &default_gid, sizeof(*gid))) {
dev_put(idev);
+ return -EPERM;
+ }
}
+ if (idev)
+ dev_put(idev);
mask = GID_ATTR_FIND_MASK_GID |
GID_ATTR_FIND_MASK_GID_TYPE |
@@ -766,7 +763,7 @@ err_free_table:
return NULL;
}
-static void release_gid_table(struct ib_device *device, u8 port,
+static void release_gid_table(struct ib_device *device,
struct ib_gid_table *table)
{
bool leak = false;
@@ -864,31 +861,27 @@ static void gid_table_reserve_default(struct ib_device *ib_dev, u8 port,
static void gid_table_release_one(struct ib_device *ib_dev)
{
- struct ib_gid_table *table;
- u8 port;
+ unsigned int p;
- for (port = 0; port < ib_dev->phys_port_cnt; port++) {
- table = ib_dev->cache.ports[port].gid;
- release_gid_table(ib_dev, port, table);
- ib_dev->cache.ports[port].gid = NULL;
+ rdma_for_each_port (ib_dev, p) {
+ release_gid_table(ib_dev, ib_dev->port_data[p].cache.gid);
+ ib_dev->port_data[p].cache.gid = NULL;
}
}
static int _gid_table_setup_one(struct ib_device *ib_dev)
{
- u8 port;
struct ib_gid_table *table;
+ unsigned int rdma_port;
- for (port = 0; port < ib_dev->phys_port_cnt; port++) {
- u8 rdma_port = port + rdma_start_port(ib_dev);
-
- table = alloc_gid_table(
- ib_dev->port_immutable[rdma_port].gid_tbl_len);
+ rdma_for_each_port (ib_dev, rdma_port) {
+ table = alloc_gid_table(
+ ib_dev->port_data[rdma_port].immutable.gid_tbl_len);
if (!table)
goto rollback_table_setup;
gid_table_reserve_default(ib_dev, rdma_port, table);
- ib_dev->cache.ports[port].gid = table;
+ ib_dev->port_data[rdma_port].cache.gid = table;
}
return 0;
@@ -899,14 +892,11 @@ rollback_table_setup:
static void gid_table_cleanup_one(struct ib_device *ib_dev)
{
- struct ib_gid_table *table;
- u8 port;
+ unsigned int p;
- for (port = 0; port < ib_dev->phys_port_cnt; port++) {
- table = ib_dev->cache.ports[port].gid;
- cleanup_gid_table_port(ib_dev, port + rdma_start_port(ib_dev),
- table);
- }
+ rdma_for_each_port (ib_dev, p)
+ cleanup_gid_table_port(ib_dev, p,
+ ib_dev->port_data[p].cache.gid);
}
static int gid_table_setup_one(struct ib_device *ib_dev)
@@ -984,17 +974,17 @@ const struct ib_gid_attr *rdma_find_gid(struct ib_device *device,
unsigned long mask = GID_ATTR_FIND_MASK_GID |
GID_ATTR_FIND_MASK_GID_TYPE;
struct ib_gid_attr gid_attr_val = {.ndev = ndev, .gid_type = gid_type};
- u8 p;
+ unsigned int p;
if (ndev)
mask |= GID_ATTR_FIND_MASK_NETDEV;
- for (p = 0; p < device->phys_port_cnt; p++) {
+ rdma_for_each_port(device, p) {
struct ib_gid_table *table;
unsigned long flags;
int index;
- table = device->cache.ports[p].gid;
+ table = device->port_data[p].cache.gid;
read_lock_irqsave(&table->rwlock, flags);
index = find_gid(table, gid, &gid_attr_val, false, mask, NULL);
if (index >= 0) {
@@ -1026,7 +1016,7 @@ int ib_get_cached_pkey(struct ib_device *device,
read_lock_irqsave(&device->cache.lock, flags);
- cache = device->cache.ports[port_num - rdma_start_port(device)].pkey;
+ cache = device->port_data[port_num].cache.pkey;
if (index < 0 || index >= cache->table_len)
ret = -EINVAL;
@@ -1044,14 +1034,12 @@ int ib_get_cached_subnet_prefix(struct ib_device *device,
u64 *sn_pfx)
{
unsigned long flags;
- int p;
if (!rdma_is_port_valid(device, port_num))
return -EINVAL;
- p = port_num - rdma_start_port(device);
read_lock_irqsave(&device->cache.lock, flags);
- *sn_pfx = device->cache.ports[p].subnet_prefix;
+ *sn_pfx = device->port_data[port_num].cache.subnet_prefix;
read_unlock_irqrestore(&device->cache.lock, flags);
return 0;
@@ -1074,7 +1062,7 @@ int ib_find_cached_pkey(struct ib_device *device,
read_lock_irqsave(&device->cache.lock, flags);
- cache = device->cache.ports[port_num - rdma_start_port(device)].pkey;
+ cache = device->port_data[port_num].cache.pkey;
*index = -1;
@@ -1114,7 +1102,7 @@ int ib_find_exact_cached_pkey(struct ib_device *device,
read_lock_irqsave(&device->cache.lock, flags);
- cache = device->cache.ports[port_num - rdma_start_port(device)].pkey;
+ cache = device->port_data[port_num].cache.pkey;
*index = -1;
@@ -1142,7 +1130,7 @@ int ib_get_cached_lmc(struct ib_device *device,
return -EINVAL;
read_lock_irqsave(&device->cache.lock, flags);
- *lmc = device->cache.ports[port_num - rdma_start_port(device)].lmc;
+ *lmc = device->port_data[port_num].cache.lmc;
read_unlock_irqrestore(&device->cache.lock, flags);
return ret;
@@ -1160,8 +1148,7 @@ int ib_get_cached_port_state(struct ib_device *device,
return -EINVAL;
read_lock_irqsave(&device->cache.lock, flags);
- *port_state = device->cache.ports[port_num
- - rdma_start_port(device)].port_state;
+ *port_state = device->port_data[port_num].cache.port_state;
read_unlock_irqrestore(&device->cache.lock, flags);
return ret;
@@ -1296,9 +1283,9 @@ static int config_non_roce_gid_cache(struct ib_device *device,
mutex_lock(&table->lock);
for (i = 0; i < gid_tbl_len; ++i) {
- if (!device->query_gid)
+ if (!device->ops.query_gid)
continue;
- ret = device->query_gid(device, port, i, &gid_attr.gid);
+ ret = device->ops.query_gid(device, port, i, &gid_attr.gid);
if (ret) {
dev_warn(&device->dev,
"query_gid failed (%d) for index %d\n", ret,
@@ -1362,16 +1349,13 @@ static void ib_cache_update(struct ib_device *device,
write_lock_irq(&device->cache.lock);
- old_pkey_cache = device->cache.ports[port -
- rdma_start_port(device)].pkey;
+ old_pkey_cache = device->port_data[port].cache.pkey;
- device->cache.ports[port - rdma_start_port(device)].pkey = pkey_cache;
- device->cache.ports[port - rdma_start_port(device)].lmc = tprops->lmc;
- device->cache.ports[port - rdma_start_port(device)].port_state =
- tprops->state;
+ device->port_data[port].cache.pkey = pkey_cache;
+ device->port_data[port].cache.lmc = tprops->lmc;
+ device->port_data[port].cache.port_state = tprops->state;
- device->cache.ports[port - rdma_start_port(device)].subnet_prefix =
- tprops->subnet_prefix;
+ device->port_data[port].cache.subnet_prefix = tprops->subnet_prefix;
write_unlock_irq(&device->cache.lock);
if (enforce_security)
@@ -1429,27 +1413,17 @@ static void ib_cache_event(struct ib_event_handler *handler,
int ib_cache_setup_one(struct ib_device *device)
{
- int p;
+ unsigned int p;
int err;
rwlock_init(&device->cache.lock);
- device->cache.ports =
- kcalloc(rdma_end_port(device) - rdma_start_port(device) + 1,
- sizeof(*device->cache.ports),
- GFP_KERNEL);
- if (!device->cache.ports)
- return -ENOMEM;
-
err = gid_table_setup_one(device);
- if (err) {
- kfree(device->cache.ports);
- device->cache.ports = NULL;
+ if (err)
return err;
- }
- for (p = 0; p <= rdma_end_port(device) - rdma_start_port(device); ++p)
- ib_cache_update(device, p + rdma_start_port(device), true);
+ rdma_for_each_port (device, p)
+ ib_cache_update(device, p, true);
INIT_IB_EVENT_HANDLER(&device->cache.event_handler,
device, ib_cache_event);
@@ -1459,7 +1433,7 @@ int ib_cache_setup_one(struct ib_device *device)
void ib_cache_release_one(struct ib_device *device)
{
- int p;
+ unsigned int p;
/*
* The release function frees all the cache elements.
@@ -1467,11 +1441,10 @@ void ib_cache_release_one(struct ib_device *device)
* all the device's resources when the cache could no
* longer be accessed.
*/
- for (p = 0; p <= rdma_end_port(device) - rdma_start_port(device); ++p)
- kfree(device->cache.ports[p].pkey);
+ rdma_for_each_port (device, p)
+ kfree(device->port_data[p].cache.pkey);
gid_table_release_one(device);
- kfree(device->cache.ports);
}
void ib_cache_cleanup_one(struct ib_device *device)
diff --git a/drivers/infiniband/core/cgroup.c b/drivers/infiniband/core/cgroup.c
index 126ac5f99db7..388fd04e5f63 100644
--- a/drivers/infiniband/core/cgroup.c
+++ b/drivers/infiniband/core/cgroup.c
@@ -21,12 +21,11 @@
* Register with the rdma cgroup. Should be called before
* exposing rdma device to user space applications to avoid
* resource accounting leak.
- * Returns 0 on success or otherwise failure code.
*/
-int ib_device_register_rdmacg(struct ib_device *device)
+void ib_device_register_rdmacg(struct ib_device *device)
{
device->cg_device.name = device->name;
- return rdmacg_register_device(&device->cg_device);
+ rdmacg_register_device(&device->cg_device);
}
/**
diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c
index edb2cb758be7..b9416a6fca36 100644
--- a/drivers/infiniband/core/cm.c
+++ b/drivers/infiniband/core/cm.c
@@ -343,7 +343,7 @@ static int cm_alloc_msg(struct cm_id_private *cm_id_priv,
ret = -ENODEV;
goto out;
}
- ah = rdma_create_ah(mad_agent->qp->pd, &av->ah_attr);
+ ah = rdma_create_ah(mad_agent->qp->pd, &av->ah_attr, 0);
if (IS_ERR(ah)) {
ret = PTR_ERR(ah);
goto out;
@@ -355,7 +355,7 @@ static int cm_alloc_msg(struct cm_id_private *cm_id_priv,
GFP_ATOMIC,
IB_MGMT_BASE_VERSION);
if (IS_ERR(m)) {
- rdma_destroy_ah(ah);
+ rdma_destroy_ah(ah, 0);
ret = PTR_ERR(m);
goto out;
}
@@ -400,7 +400,7 @@ static int cm_create_response_msg_ah(struct cm_port *port,
static void cm_free_msg(struct ib_mad_send_buf *msg)
{
if (msg->ah)
- rdma_destroy_ah(msg->ah);
+ rdma_destroy_ah(msg->ah, 0);
if (msg->context[0])
cm_deref_id(msg->context[0]);
ib_free_send_mad(msg);
@@ -4052,8 +4052,7 @@ static void cm_recv_handler(struct ib_mad_agent *mad_agent,
atomic_long_inc(&port->counter_group[CM_RECV].
counter[attr_id - CM_ATTR_ID_OFFSET]);
- work = kmalloc(sizeof(*work) + sizeof(struct sa_path_rec) * paths,
- GFP_KERNEL);
+ work = kmalloc(struct_size(work, path, paths), GFP_KERNEL);
if (!work) {
ib_free_recv_mad(mad_recv_wc);
return;
diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c
index 15d5bb7bf6bb..68c997be2429 100644
--- a/drivers/infiniband/core/cma.c
+++ b/drivers/infiniband/core/cma.c
@@ -494,7 +494,10 @@ static void _cma_attach_to_dev(struct rdma_id_private *id_priv,
id_priv->id.route.addr.dev_addr.transport =
rdma_node_get_transport(cma_dev->device->node_type);
list_add_tail(&id_priv->list, &cma_dev->id_list);
- rdma_restrack_add(&id_priv->res);
+ if (id_priv->res.kern_name)
+ rdma_restrack_kadd(&id_priv->res);
+ else
+ rdma_restrack_uadd(&id_priv->res);
}
static void cma_attach_to_dev(struct rdma_id_private *id_priv,
@@ -656,7 +659,7 @@ static int cma_acquire_dev_by_src_ip(struct rdma_id_private *id_priv)
struct cma_device *cma_dev;
enum ib_gid_type gid_type;
int ret = -ENODEV;
- u8 port;
+ unsigned int port;
if (dev_addr->dev_type != ARPHRD_INFINIBAND &&
id_priv->id.ps == RDMA_PS_IPOIB)
@@ -670,8 +673,7 @@ static int cma_acquire_dev_by_src_ip(struct rdma_id_private *id_priv)
mutex_lock(&lock);
list_for_each_entry(cma_dev, &dev_list, list) {
- for (port = rdma_start_port(cma_dev->device);
- port <= rdma_end_port(cma_dev->device); port++) {
+ rdma_for_each_port (cma_dev->device, port) {
gidp = rdma_protocol_roce(cma_dev->device, port) ?
&iboe_gid : &gid;
gid_type = cma_dev->default_gid_type[port - 1];
@@ -885,6 +887,7 @@ struct rdma_cm_id *__rdma_create_id(struct net *net,
id_priv->id.ps = ps;
id_priv->id.qp_type = qp_type;
id_priv->tos_set = false;
+ id_priv->timeout_set = false;
id_priv->gid_type = IB_GID_TYPE_IB;
spin_lock_init(&id_priv->lock);
mutex_init(&id_priv->qp_mutex);
@@ -1127,6 +1130,9 @@ int rdma_init_qp_attr(struct rdma_cm_id *id, struct ib_qp_attr *qp_attr,
} else
ret = -ENOSYS;
+ if ((*qp_attr_mask & IB_QP_TIMEOUT) && id_priv->timeout_set)
+ qp_attr->timeout = id_priv->timeout;
+
return ret;
}
EXPORT_SYMBOL(rdma_init_qp_attr);
@@ -2407,6 +2413,7 @@ static int cma_iw_listen(struct rdma_id_private *id_priv, int backlog)
return PTR_ERR(id);
id->tos = id_priv->tos;
+ id->tos_set = id_priv->tos_set;
id_priv->cm_id.iw = id;
memcpy(&id_priv->cm_id.iw->local_addr, cma_src_addr(id_priv),
@@ -2459,6 +2466,8 @@ static void cma_listen_on_dev(struct rdma_id_private *id_priv,
atomic_inc(&id_priv->refcount);
dev_id_priv->internal_id = 1;
dev_id_priv->afonly = id_priv->afonly;
+ dev_id_priv->tos_set = id_priv->tos_set;
+ dev_id_priv->tos = id_priv->tos;
ret = rdma_listen(id, id_priv->backlog);
if (ret)
@@ -2487,6 +2496,34 @@ void rdma_set_service_type(struct rdma_cm_id *id, int tos)
}
EXPORT_SYMBOL(rdma_set_service_type);
+/**
+ * rdma_set_ack_timeout() - Set the ack timeout of QP associated
+ * with a connection identifier.
+ * @id: Communication identifier to associate with the ack timeout.
+ * @timeout: Ack timeout to set a QP, expressed as 4.096 * 2^(timeout) usec.
+ *
+ * This function should be called before rdma_connect() on active side,
+ * and on passive side before rdma_accept(). It is applicable to primary
+ * path only. The timeout will affect the local side of the QP, it is not
+ * negotiated with remote side and zero disables the timer.
+ *
+ * Return: 0 for success
+ */
+int rdma_set_ack_timeout(struct rdma_cm_id *id, u8 timeout)
+{
+ struct rdma_id_private *id_priv;
+
+ if (id->qp_type != IB_QPT_RC)
+ return -EINVAL;
+
+ id_priv = container_of(id, struct rdma_id_private, id);
+ id_priv->timeout = timeout;
+ id_priv->timeout_set = true;
+
+ return 0;
+}
+EXPORT_SYMBOL(rdma_set_ack_timeout);
+
static void cma_query_handler(int status, struct sa_path_rec *path_rec,
void *context)
{
@@ -2963,13 +3000,22 @@ static void addr_handler(int status, struct sockaddr *src_addr,
{
struct rdma_id_private *id_priv = context;
struct rdma_cm_event event = {};
+ struct sockaddr *addr;
+ struct sockaddr_storage old_addr;
mutex_lock(&id_priv->handler_mutex);
if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_QUERY,
RDMA_CM_ADDR_RESOLVED))
goto out;
- memcpy(cma_src_addr(id_priv), src_addr, rdma_addr_size(src_addr));
+ /*
+ * Store the previous src address, so that if we fail to acquire
+ * matching rdma device, old address can be restored back, which helps
+ * to cancel the cma listen operation correctly.
+ */
+ addr = cma_src_addr(id_priv);
+ memcpy(&old_addr, addr, rdma_addr_size(addr));
+ memcpy(addr, src_addr, rdma_addr_size(src_addr));
if (!status && !id_priv->cma_dev) {
status = cma_acquire_dev_by_src_ip(id_priv);
if (status)
@@ -2980,6 +3026,8 @@ static void addr_handler(int status, struct sockaddr *src_addr,
}
if (status) {
+ memcpy(addr, &old_addr,
+ rdma_addr_size((struct sockaddr *)&old_addr));
if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_RESOLVED,
RDMA_CM_ADDR_BOUND))
goto out;
@@ -3795,6 +3843,7 @@ static int cma_connect_iw(struct rdma_id_private *id_priv,
return PTR_ERR(cm_id);
cm_id->tos = id_priv->tos;
+ cm_id->tos_set = id_priv->tos_set;
id_priv->cm_id.iw = cm_id;
memcpy(&cm_id->local_addr, cma_src_addr(id_priv),
@@ -4498,7 +4547,7 @@ static void cma_add_one(struct ib_device *device)
if (!cma_dev->default_roce_tos)
goto free_gid_type;
- for (i = rdma_start_port(device); i <= rdma_end_port(device); i++) {
+ rdma_for_each_port (device, i) {
supported_gids = roce_gid_type_mask_support(device, i);
WARN_ON(!supported_gids);
if (supported_gids & (1 << CMA_PREFERRED_ROCE_GID_TYPE))
@@ -4602,85 +4651,6 @@ static void cma_remove_one(struct ib_device *device, void *client_data)
kfree(cma_dev);
}
-static int cma_get_id_stats(struct sk_buff *skb, struct netlink_callback *cb)
-{
- struct nlmsghdr *nlh;
- struct rdma_cm_id_stats *id_stats;
- struct rdma_id_private *id_priv;
- struct rdma_cm_id *id = NULL;
- struct cma_device *cma_dev;
- int i_dev = 0, i_id = 0;
-
- /*
- * We export all of the IDs as a sequence of messages. Each
- * ID gets its own netlink message.
- */
- mutex_lock(&lock);
-
- list_for_each_entry(cma_dev, &dev_list, list) {
- if (i_dev < cb->args[0]) {
- i_dev++;
- continue;
- }
-
- i_id = 0;
- list_for_each_entry(id_priv, &cma_dev->id_list, list) {
- if (i_id < cb->args[1]) {
- i_id++;
- continue;
- }
-
- id_stats = ibnl_put_msg(skb, &nlh, cb->nlh->nlmsg_seq,
- sizeof *id_stats, RDMA_NL_RDMA_CM,
- RDMA_NL_RDMA_CM_ID_STATS,
- NLM_F_MULTI);
- if (!id_stats)
- goto out;
-
- memset(id_stats, 0, sizeof *id_stats);
- id = &id_priv->id;
- id_stats->node_type = id->route.addr.dev_addr.dev_type;
- id_stats->port_num = id->port_num;
- id_stats->bound_dev_if =
- id->route.addr.dev_addr.bound_dev_if;
-
- if (ibnl_put_attr(skb, nlh,
- rdma_addr_size(cma_src_addr(id_priv)),
- cma_src_addr(id_priv),
- RDMA_NL_RDMA_CM_ATTR_SRC_ADDR))
- goto out;
- if (ibnl_put_attr(skb, nlh,
- rdma_addr_size(cma_dst_addr(id_priv)),
- cma_dst_addr(id_priv),
- RDMA_NL_RDMA_CM_ATTR_DST_ADDR))
- goto out;
-
- id_stats->pid = task_pid_vnr(id_priv->res.task);
- id_stats->port_space = id->ps;
- id_stats->cm_state = id_priv->state;
- id_stats->qp_num = id_priv->qp_num;
- id_stats->qp_type = id->qp_type;
-
- i_id++;
- nlmsg_end(skb, nlh);
- }
-
- cb->args[1] = 0;
- i_dev++;
- }
-
-out:
- mutex_unlock(&lock);
- cb->args[0] = i_dev;
- cb->args[1] = i_id;
-
- return skb->len;
-}
-
-static const struct rdma_nl_cbs cma_cb_table[RDMA_NL_RDMA_CM_NUM_OPS] = {
- [RDMA_NL_RDMA_CM_ID_STATS] = { .dump = cma_get_id_stats},
-};
-
static int cma_init_net(struct net *net)
{
struct cma_pernet *pernet = cma_pernet(net);
@@ -4729,7 +4699,6 @@ static int __init cma_init(void)
if (ret)
goto err;
- rdma_nl_register(RDMA_NL_RDMA_CM, cma_cb_table);
cma_configfs_init();
return 0;
@@ -4745,7 +4714,6 @@ err_wq:
static void __exit cma_cleanup(void)
{
cma_configfs_exit();
- rdma_nl_unregister(RDMA_NL_RDMA_CM);
ib_unregister_client(&cma_client);
unregister_netdevice_notifier(&cma_nb);
ib_sa_unregister_client(&sa_client);
@@ -4753,7 +4721,5 @@ static void __exit cma_cleanup(void)
destroy_workqueue(cma_wq);
}
-MODULE_ALIAS_RDMA_NETLINK(RDMA_NL_RDMA_CM, 1);
-
module_init(cma_init);
module_exit(cma_cleanup);
diff --git a/drivers/infiniband/core/cma_configfs.c b/drivers/infiniband/core/cma_configfs.c
index 8c2dfb3e294e..3ec2c415bb70 100644
--- a/drivers/infiniband/core/cma_configfs.c
+++ b/drivers/infiniband/core/cma_configfs.c
@@ -33,7 +33,10 @@
#include <linux/module.h>
#include <linux/configfs.h>
#include <rdma/ib_verbs.h>
+#include <rdma/rdma_cm.h>
+
#include "core_priv.h"
+#include "cma_priv.h"
struct cma_device;
diff --git a/drivers/infiniband/core/cma_priv.h b/drivers/infiniband/core/cma_priv.h
index 194cfe78c447..ca7307277518 100644
--- a/drivers/infiniband/core/cma_priv.h
+++ b/drivers/infiniband/core/cma_priv.h
@@ -84,9 +84,11 @@ struct rdma_id_private {
u32 options;
u8 srq;
u8 tos;
- bool tos_set;
+ u8 tos_set:1;
+ u8 timeout_set:1;
u8 reuseaddr;
u8 afonly;
+ u8 timeout;
enum ib_gid_type gid_type;
/*
@@ -94,4 +96,32 @@ struct rdma_id_private {
*/
struct rdma_restrack_entry res;
};
+
+#if IS_ENABLED(CONFIG_INFINIBAND_ADDR_TRANS_CONFIGFS)
+int cma_configfs_init(void);
+void cma_configfs_exit(void);
+#else
+static inline int cma_configfs_init(void)
+{
+ return 0;
+}
+
+static inline void cma_configfs_exit(void)
+{
+}
+#endif
+
+void cma_ref_dev(struct cma_device *dev);
+void cma_deref_dev(struct cma_device *dev);
+typedef bool (*cma_device_filter)(struct ib_device *, void *);
+struct cma_device *cma_enum_devices_by_ibdev(cma_device_filter filter,
+ void *cookie);
+int cma_get_default_gid_type(struct cma_device *dev, unsigned int port);
+int cma_set_default_gid_type(struct cma_device *dev, unsigned int port,
+ enum ib_gid_type default_gid_type);
+int cma_get_default_roce_tos(struct cma_device *dev, unsigned int port);
+int cma_set_default_roce_tos(struct cma_device *dev, unsigned int port,
+ u8 default_roce_tos);
+struct ib_device *cma_get_ib_dev(struct cma_device *dev);
+
#endif /* _CMA_PRIV_H */
diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h
index bb9007a0cca7..08c690249594 100644
--- a/drivers/infiniband/core/core_priv.h
+++ b/drivers/infiniband/core/core_priv.h
@@ -54,38 +54,9 @@ struct pkey_index_qp_list {
struct list_head qp_list;
};
-#if IS_ENABLED(CONFIG_INFINIBAND_ADDR_TRANS_CONFIGFS)
-int cma_configfs_init(void);
-void cma_configfs_exit(void);
-#else
-static inline int cma_configfs_init(void)
-{
- return 0;
-}
+extern const struct attribute_group ib_dev_attr_group;
-static inline void cma_configfs_exit(void)
-{
-}
-#endif
-struct cma_device;
-void cma_ref_dev(struct cma_device *cma_dev);
-void cma_deref_dev(struct cma_device *cma_dev);
-typedef bool (*cma_device_filter)(struct ib_device *, void *);
-struct cma_device *cma_enum_devices_by_ibdev(cma_device_filter filter,
- void *cookie);
-int cma_get_default_gid_type(struct cma_device *cma_dev,
- unsigned int port);
-int cma_set_default_gid_type(struct cma_device *cma_dev,
- unsigned int port,
- enum ib_gid_type default_gid_type);
-int cma_get_default_roce_tos(struct cma_device *cma_dev, unsigned int port);
-int cma_set_default_roce_tos(struct cma_device *a_dev, unsigned int port,
- u8 default_roce_tos);
-struct ib_device *cma_get_ib_dev(struct cma_device *cma_dev);
-
-int ib_device_register_sysfs(struct ib_device *device,
- int (*port_callback)(struct ib_device *,
- u8, struct kobject *));
+int ib_device_register_sysfs(struct ib_device *device);
void ib_device_unregister_sysfs(struct ib_device *device);
int ib_device_rename(struct ib_device *ibdev, const char *name);
@@ -95,6 +66,9 @@ typedef void (*roce_netdev_callback)(struct ib_device *device, u8 port,
typedef bool (*roce_netdev_filter)(struct ib_device *device, u8 port,
struct net_device *idev, void *cookie);
+struct net_device *ib_device_get_netdev(struct ib_device *ib_dev,
+ unsigned int port);
+
void ib_enum_roce_netdev(struct ib_device *ib_dev,
roce_netdev_filter filter,
void *filter_cookie,
@@ -146,7 +120,7 @@ void ib_cache_cleanup_one(struct ib_device *device);
void ib_cache_release_one(struct ib_device *device);
#ifdef CONFIG_CGROUP_RDMA
-int ib_device_register_rdmacg(struct ib_device *device);
+void ib_device_register_rdmacg(struct ib_device *device);
void ib_device_unregister_rdmacg(struct ib_device *device);
int ib_rdmacg_try_charge(struct ib_rdmacg_object *cg_obj,
@@ -157,21 +131,26 @@ void ib_rdmacg_uncharge(struct ib_rdmacg_object *cg_obj,
struct ib_device *device,
enum rdmacg_resource_type resource_index);
#else
-static inline int ib_device_register_rdmacg(struct ib_device *device)
-{ return 0; }
+static inline void ib_device_register_rdmacg(struct ib_device *device)
+{
+}
static inline void ib_device_unregister_rdmacg(struct ib_device *device)
-{ }
+{
+}
static inline int ib_rdmacg_try_charge(struct ib_rdmacg_object *cg_obj,
struct ib_device *device,
enum rdmacg_resource_type resource_index)
-{ return 0; }
+{
+ return 0;
+}
static inline void ib_rdmacg_uncharge(struct ib_rdmacg_object *cg_obj,
struct ib_device *device,
enum rdmacg_resource_type resource_index)
-{ }
+{
+}
#endif
static inline bool rdma_is_upper_dev_rcu(struct net_device *dev,
@@ -207,7 +186,7 @@ int ib_get_cached_subnet_prefix(struct ib_device *device,
u64 *sn_pfx);
#ifdef CONFIG_SECURITY_INFINIBAND
-void ib_security_destroy_port_pkey_list(struct ib_device *device);
+void ib_security_release_port_pkey_list(struct ib_device *device);
void ib_security_cache_change(struct ib_device *device,
u8 port_num,
@@ -228,8 +207,9 @@ int ib_mad_agent_security_setup(struct ib_mad_agent *agent,
enum ib_qp_type qp_type);
void ib_mad_agent_security_cleanup(struct ib_mad_agent *agent);
int ib_mad_enforce_security(struct ib_mad_agent_private *map, u16 pkey_index);
+void ib_mad_agent_security_change(void);
#else
-static inline void ib_security_destroy_port_pkey_list(struct ib_device *device)
+static inline void ib_security_release_port_pkey_list(struct ib_device *device)
{
}
@@ -244,10 +224,10 @@ static inline int ib_security_modify_qp(struct ib_qp *qp,
int qp_attr_mask,
struct ib_udata *udata)
{
- return qp->device->modify_qp(qp->real_qp,
- qp_attr,
- qp_attr_mask,
- udata);
+ return qp->device->ops.modify_qp(qp->real_qp,
+ qp_attr,
+ qp_attr_mask,
+ udata);
}
static inline int ib_create_qp_security(struct ib_qp *qp,
@@ -293,6 +273,10 @@ static inline int ib_mad_enforce_security(struct ib_mad_agent_private *map,
{
return 0;
}
+
+static inline void ib_mad_agent_security_change(void)
+{
+}
#endif
struct ib_device *ib_device_get_by_index(u32 ifindex);
@@ -308,10 +292,10 @@ static inline struct ib_qp *_ib_create_qp(struct ib_device *dev,
{
struct ib_qp *qp;
- if (!dev->create_qp)
+ if (!dev->ops.create_qp)
return ERR_PTR(-EOPNOTSUPP);
- qp = dev->create_qp(pd, attr, udata);
+ qp = dev->ops.create_qp(pd, attr, udata);
if (IS_ERR(qp))
return qp;
@@ -325,7 +309,10 @@ static inline struct ib_qp *_ib_create_qp(struct ib_device *dev,
*/
if (attr->qp_type < IB_QPT_XRC_INI) {
qp->res.type = RDMA_RESTRACK_QP;
- rdma_restrack_add(&qp->res);
+ if (uobj)
+ rdma_restrack_uadd(&qp->res);
+ else
+ rdma_restrack_kadd(&qp->res);
} else
qp->res.valid = false;
diff --git a/drivers/infiniband/core/cq.c b/drivers/infiniband/core/cq.c
index b1e5365ddafa..d61e5e1427c2 100644
--- a/drivers/infiniband/core/cq.c
+++ b/drivers/infiniband/core/cq.c
@@ -145,7 +145,7 @@ struct ib_cq *__ib_alloc_cq(struct ib_device *dev, void *private,
struct ib_cq *cq;
int ret = -ENOMEM;
- cq = dev->create_cq(dev, &cq_attr, NULL, NULL);
+ cq = dev->ops.create_cq(dev, &cq_attr, NULL, NULL);
if (IS_ERR(cq))
return cq;
@@ -162,7 +162,7 @@ struct ib_cq *__ib_alloc_cq(struct ib_device *dev, void *private,
cq->res.type = RDMA_RESTRACK_CQ;
rdma_restrack_set_task(&cq->res, caller);
- rdma_restrack_add(&cq->res);
+ rdma_restrack_kadd(&cq->res);
switch (cq->poll_ctx) {
case IB_POLL_DIRECT:
@@ -193,7 +193,7 @@ out_free_wc:
kfree(cq->wc);
rdma_restrack_del(&cq->res);
out_destroy_cq:
- cq->device->destroy_cq(cq);
+ cq->device->ops.destroy_cq(cq);
return ERR_PTR(ret);
}
EXPORT_SYMBOL(__ib_alloc_cq);
@@ -225,7 +225,7 @@ void ib_free_cq(struct ib_cq *cq)
kfree(cq->wc);
rdma_restrack_del(&cq->res);
- ret = cq->device->destroy_cq(cq);
+ ret = cq->device->ops.destroy_cq(cq);
WARN_ON_ONCE(ret);
}
EXPORT_SYMBOL(ib_free_cq);
diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
index 87eb4f2cdd7d..7421ec4883fb 100644
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -37,54 +37,111 @@
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/init.h>
-#include <linux/mutex.h>
#include <linux/netdevice.h>
#include <linux/security.h>
#include <linux/notifier.h>
+#include <linux/hashtable.h>
#include <rdma/rdma_netlink.h>
#include <rdma/ib_addr.h>
#include <rdma/ib_cache.h>
#include "core_priv.h"
+#include "restrack.h"
MODULE_AUTHOR("Roland Dreier");
MODULE_DESCRIPTION("core kernel InfiniBand API");
MODULE_LICENSE("Dual BSD/GPL");
-struct ib_client_data {
- struct list_head list;
- struct ib_client *client;
- void * data;
- /* The device or client is going down. Do not call client or device
- * callbacks other than remove(). */
- bool going_down;
-};
-
struct workqueue_struct *ib_comp_wq;
struct workqueue_struct *ib_comp_unbound_wq;
struct workqueue_struct *ib_wq;
EXPORT_SYMBOL_GPL(ib_wq);
-/* The device_list and client_list contain devices and clients after their
- * registration has completed, and the devices and clients are removed
- * during unregistration. */
-static LIST_HEAD(device_list);
-static LIST_HEAD(client_list);
+/*
+ * Each of the three rwsem locks (devices, clients, client_data) protects the
+ * xarray of the same name. Specifically it allows the caller to assert that
+ * the MARK will/will not be changing under the lock, and for devices and
+ * clients, that the value in the xarray is still a valid pointer. Change of
+ * the MARK is linked to the object state, so holding the lock and testing the
+ * MARK also asserts that the contained object is in a certain state.
+ *
+ * This is used to build a two stage register/unregister flow where objects
+ * can continue to be in the xarray even though they are still in progress to
+ * register/unregister.
+ *
+ * The xarray itself provides additional locking, and restartable iteration,
+ * which is also relied on.
+ *
+ * Locks should not be nested, with the exception of client_data, which is
+ * allowed to nest under the read side of the other two locks.
+ *
+ * The devices_rwsem also protects the device name list, any change or
+ * assignment of device name must also hold the write side to guarantee unique
+ * names.
+ */
/*
- * device_mutex and lists_rwsem protect access to both device_list and
- * client_list. device_mutex protects writer access by device and client
- * registration / de-registration. lists_rwsem protects reader access to
- * these lists. Iterators of these lists must lock it for read, while updates
- * to the lists must be done with a write lock. A special case is when the
- * device_mutex is locked. In this case locking the lists for read access is
- * not necessary as the device_mutex implies it.
+ * devices contains devices that have had their names assigned. The
+ * devices may not be registered. Users that care about the registration
+ * status need to call ib_device_try_get() on the device to ensure it is
+ * registered, and keep it registered, for the required duration.
*
- * lists_rwsem also protects access to the client data list.
*/
-static DEFINE_MUTEX(device_mutex);
-static DECLARE_RWSEM(lists_rwsem);
+static DEFINE_XARRAY_FLAGS(devices, XA_FLAGS_ALLOC);
+static DECLARE_RWSEM(devices_rwsem);
+#define DEVICE_REGISTERED XA_MARK_1
+static LIST_HEAD(client_list);
+#define CLIENT_REGISTERED XA_MARK_1
+static DEFINE_XARRAY_FLAGS(clients, XA_FLAGS_ALLOC);
+static DECLARE_RWSEM(clients_rwsem);
+
+/*
+ * If client_data is registered then the corresponding client must also still
+ * be registered.
+ */
+#define CLIENT_DATA_REGISTERED XA_MARK_1
+/*
+ * xarray has this behavior where it won't iterate over NULL values stored in
+ * allocated arrays. So we need our own iterator to see all values stored in
+ * the array. This does the same thing as xa_for_each except that it also
+ * returns NULL valued entries if the array is allocating. Simplified to only
+ * work on simple xarrays.
+ */
+static void *xan_find_marked(struct xarray *xa, unsigned long *indexp,
+ xa_mark_t filter)
+{
+ XA_STATE(xas, xa, *indexp);
+ void *entry;
+
+ rcu_read_lock();
+ do {
+ entry = xas_find_marked(&xas, ULONG_MAX, filter);
+ if (xa_is_zero(entry))
+ break;
+ } while (xas_retry(&xas, entry));
+ rcu_read_unlock();
+
+ if (entry) {
+ *indexp = xas.xa_index;
+ if (xa_is_zero(entry))
+ return NULL;
+ return entry;
+ }
+ return XA_ERROR(-ENOENT);
+}
+#define xan_for_each_marked(xa, index, entry, filter) \
+ for (index = 0, entry = xan_find_marked(xa, &(index), filter); \
+ !xa_is_err(entry); \
+ (index)++, entry = xan_find_marked(xa, &(index), filter))
+
+/* RCU hash table mapping netdevice pointers to struct ib_port_data */
+static DEFINE_SPINLOCK(ndev_hash_lock);
+static DECLARE_HASHTABLE(ndev_hash, 5);
+
+static void free_netdevs(struct ib_device *ib_dev);
+static void ib_unregister_work(struct work_struct *work);
+static void __ib_unregister_device(struct ib_device *device);
static int ib_security_change(struct notifier_block *nb, unsigned long event,
void *lsm_data);
static void ib_policy_change_task(struct work_struct *work);
@@ -94,9 +151,15 @@ static struct notifier_block ibdev_lsm_nb = {
.notifier_call = ib_security_change,
};
+/* Pointer to the RCU head at the start of the ib_port_data array */
+struct ib_port_data_rcu {
+ struct rcu_head rcu_head;
+ struct ib_port_data pdata[];
+};
+
static int ib_device_check_mandatory(struct ib_device *device)
{
-#define IB_MANDATORY_FUNC(x) { offsetof(struct ib_device, x), #x }
+#define IB_MANDATORY_FUNC(x) { offsetof(struct ib_device_ops, x), #x }
static const struct {
size_t offset;
char *name;
@@ -121,70 +184,103 @@ static int ib_device_check_mandatory(struct ib_device *device)
};
int i;
+ device->kverbs_provider = true;
for (i = 0; i < ARRAY_SIZE(mandatory_table); ++i) {
- if (!*(void **) ((void *) device + mandatory_table[i].offset)) {
- dev_warn(&device->dev,
- "Device is missing mandatory function %s\n",
- mandatory_table[i].name);
- return -EINVAL;
+ if (!*(void **) ((void *) &device->ops +
+ mandatory_table[i].offset)) {
+ device->kverbs_provider = false;
+ break;
}
}
return 0;
}
-static struct ib_device *__ib_device_get_by_index(u32 index)
-{
- struct ib_device *device;
-
- list_for_each_entry(device, &device_list, core_list)
- if (device->index == index)
- return device;
-
- return NULL;
-}
-
/*
- * Caller is responsible to return refrerence count by calling put_device()
+ * Caller must perform ib_device_put() to return the device reference count
+ * when ib_device_get_by_index() returns a valid device pointer.
*/
struct ib_device *ib_device_get_by_index(u32 index)
{
struct ib_device *device;
- down_read(&lists_rwsem);
- device = __ib_device_get_by_index(index);
- if (device)
- get_device(&device->dev);
-
- up_read(&lists_rwsem);
+ down_read(&devices_rwsem);
+ device = xa_load(&devices, index);
+ if (device) {
+ if (!ib_device_try_get(device))
+ device = NULL;
+ }
+ up_read(&devices_rwsem);
return device;
}
+/**
+ * ib_device_put - Release IB device reference
+ * @device: device whose reference is to be released
+ *
+ * ib_device_put() releases reference to the IB device to allow it to be
+ * unregistered and eventually freed.
+ */
+void ib_device_put(struct ib_device *device)
+{
+ if (refcount_dec_and_test(&device->refcount))
+ complete(&device->unreg_completion);
+}
+EXPORT_SYMBOL(ib_device_put);
+
static struct ib_device *__ib_device_get_by_name(const char *name)
{
struct ib_device *device;
+ unsigned long index;
- list_for_each_entry(device, &device_list, core_list)
+ xa_for_each (&devices, index, device)
if (!strcmp(name, dev_name(&device->dev)))
return device;
return NULL;
}
-int ib_device_rename(struct ib_device *ibdev, const char *name)
+/**
+ * ib_device_get_by_name - Find an IB device by name
+ * @name: The name to look for
+ * @driver_id: The driver ID that must match (RDMA_DRIVER_UNKNOWN matches all)
+ *
+ * Find and hold an ib_device by its name. The caller must call
+ * ib_device_put() on the returned pointer.
+ */
+struct ib_device *ib_device_get_by_name(const char *name,
+ enum rdma_driver_id driver_id)
{
struct ib_device *device;
- int ret = 0;
- if (!strcmp(name, dev_name(&ibdev->dev)))
- return ret;
+ down_read(&devices_rwsem);
+ device = __ib_device_get_by_name(name);
+ if (device && driver_id != RDMA_DRIVER_UNKNOWN &&
+ device->driver_id != driver_id)
+ device = NULL;
- mutex_lock(&device_mutex);
- list_for_each_entry(device, &device_list, core_list) {
- if (!strcmp(name, dev_name(&device->dev))) {
- ret = -EEXIST;
- goto out;
- }
+ if (device) {
+ if (!ib_device_try_get(device))
+ device = NULL;
+ }
+ up_read(&devices_rwsem);
+ return device;
+}
+EXPORT_SYMBOL(ib_device_get_by_name);
+
+int ib_device_rename(struct ib_device *ibdev, const char *name)
+{
+ int ret;
+
+ down_write(&devices_rwsem);
+ if (!strcmp(name, dev_name(&ibdev->dev))) {
+ ret = 0;
+ goto out;
+ }
+
+ if (__ib_device_get_by_name(name)) {
+ ret = -EEXIST;
+ goto out;
}
ret = device_rename(&ibdev->dev, name);
@@ -192,53 +288,60 @@ int ib_device_rename(struct ib_device *ibdev, const char *name)
goto out;
strlcpy(ibdev->name, name, IB_DEVICE_NAME_MAX);
out:
- mutex_unlock(&device_mutex);
+ up_write(&devices_rwsem);
return ret;
}
static int alloc_name(struct ib_device *ibdev, const char *name)
{
- unsigned long *inuse;
struct ib_device *device;
+ unsigned long index;
+ struct ida inuse;
+ int rc;
int i;
- inuse = (unsigned long *) get_zeroed_page(GFP_KERNEL);
- if (!inuse)
- return -ENOMEM;
-
- list_for_each_entry(device, &device_list, core_list) {
+ lockdep_assert_held_exclusive(&devices_rwsem);
+ ida_init(&inuse);
+ xa_for_each (&devices, index, device) {
char buf[IB_DEVICE_NAME_MAX];
if (sscanf(dev_name(&device->dev), name, &i) != 1)
continue;
- if (i < 0 || i >= PAGE_SIZE * 8)
+ if (i < 0 || i >= INT_MAX)
continue;
snprintf(buf, sizeof buf, name, i);
- if (!strcmp(buf, dev_name(&device->dev)))
- set_bit(i, inuse);
+ if (strcmp(buf, dev_name(&device->dev)) != 0)
+ continue;
+
+ rc = ida_alloc_range(&inuse, i, i, GFP_KERNEL);
+ if (rc < 0)
+ goto out;
}
- i = find_first_zero_bit(inuse, PAGE_SIZE * 8);
- free_page((unsigned long) inuse);
+ rc = ida_alloc(&inuse, GFP_KERNEL);
+ if (rc < 0)
+ goto out;
- return dev_set_name(&ibdev->dev, name, i);
+ rc = dev_set_name(&ibdev->dev, name, rc);
+out:
+ ida_destroy(&inuse);
+ return rc;
}
static void ib_device_release(struct device *device)
{
struct ib_device *dev = container_of(device, struct ib_device, dev);
- WARN_ON(dev->reg_state == IB_DEV_REGISTERED);
- if (dev->reg_state == IB_DEV_UNREGISTERED) {
- /*
- * In IB_DEV_UNINITIALIZED state, cache or port table
- * is not even created. Free cache and port table only when
- * device reaches UNREGISTERED state.
- */
- ib_cache_release_one(dev);
- kfree(dev->port_immutable);
- }
- kfree(dev);
+ free_netdevs(dev);
+ WARN_ON(refcount_read(&dev->refcount));
+ ib_cache_release_one(dev);
+ ib_security_release_port_pkey_list(dev);
+ xa_destroy(&dev->client_data);
+ if (dev->port_data)
+ kfree_rcu(container_of(dev->port_data, struct ib_port_data_rcu,
+ pdata[0]),
+ rcu_head);
+ kfree_rcu(dev, rcu_head);
}
static int ib_device_uevent(struct device *device,
@@ -261,7 +364,7 @@ static struct class ib_class = {
};
/**
- * ib_alloc_device - allocate an IB device struct
+ * _ib_alloc_device - allocate an IB device struct
* @size:size of structure to allocate
*
* Low-level drivers should use ib_alloc_device() to allocate &struct
@@ -270,7 +373,7 @@ static struct class ib_class = {
* ib_dealloc_device() must be used to free structures allocated with
* ib_alloc_device().
*/
-struct ib_device *ib_alloc_device(size_t size)
+struct ib_device *_ib_alloc_device(size_t size)
{
struct ib_device *device;
@@ -281,22 +384,32 @@ struct ib_device *ib_alloc_device(size_t size)
if (!device)
return NULL;
- rdma_restrack_init(&device->res);
+ if (rdma_restrack_init(device)) {
+ kfree(device);
+ return NULL;
+ }
device->dev.class = &ib_class;
+ device->groups[0] = &ib_dev_attr_group;
+ device->dev.groups = device->groups;
device_initialize(&device->dev);
- dev_set_drvdata(&device->dev, device);
-
INIT_LIST_HEAD(&device->event_handler_list);
spin_lock_init(&device->event_handler_lock);
- rwlock_init(&device->client_data_lock);
- INIT_LIST_HEAD(&device->client_data_list);
+ mutex_init(&device->unregistration_lock);
+ /*
+ * client_data must use XA_FLAGS_ALLOC because we don't want our mark to be
+ * destroyed if the user stores NULL in the client data.
+ */
+ xa_init_flags(&device->client_data, XA_FLAGS_ALLOC);
+ init_rwsem(&device->client_data_rwsem);
INIT_LIST_HEAD(&device->port_list);
+ init_completion(&device->unreg_completion);
+ INIT_WORK(&device->unregistration_work, ib_unregister_work);
return device;
}
-EXPORT_SYMBOL(ib_alloc_device);
+EXPORT_SYMBOL(_ib_alloc_device);
/**
* ib_dealloc_device - free an IB device struct
@@ -306,32 +419,153 @@ EXPORT_SYMBOL(ib_alloc_device);
*/
void ib_dealloc_device(struct ib_device *device)
{
- WARN_ON(!list_empty(&device->client_data_list));
- WARN_ON(device->reg_state != IB_DEV_UNREGISTERED &&
- device->reg_state != IB_DEV_UNINITIALIZED);
- rdma_restrack_clean(&device->res);
+ if (device->ops.dealloc_driver)
+ device->ops.dealloc_driver(device);
+
+ /*
+ * ib_unregister_driver() requires all devices to remain in the xarray
+ * while their ops are callable. The last op we call is dealloc_driver
+ * above. This is needed to create a fence on op callbacks prior to
+ * allowing the driver module to unload.
+ */
+ down_write(&devices_rwsem);
+ if (xa_load(&devices, device->index) == device)
+ xa_erase(&devices, device->index);
+ up_write(&devices_rwsem);
+
+ /* Expedite releasing netdev references */
+ free_netdevs(device);
+
+ WARN_ON(!xa_empty(&device->client_data));
+ WARN_ON(refcount_read(&device->refcount));
+ rdma_restrack_clean(device);
+ /* Balances with device_initialize */
put_device(&device->dev);
}
EXPORT_SYMBOL(ib_dealloc_device);
-static int add_client_context(struct ib_device *device, struct ib_client *client)
+/*
+ * add_client_context() and remove_client_context() must be safe against
+ * parallel calls on the same device - registration/unregistration of both the
+ * device and client can be occurring in parallel.
+ *
+ * The routines need to be a fence, any caller must not return until the add
+ * or remove is fully completed.
+ */
+static int add_client_context(struct ib_device *device,
+ struct ib_client *client)
{
- struct ib_client_data *context;
+ int ret = 0;
- context = kmalloc(sizeof(*context), GFP_KERNEL);
- if (!context)
- return -ENOMEM;
+ if (!device->kverbs_provider && !client->no_kverbs_req)
+ return 0;
- context->client = client;
- context->data = NULL;
- context->going_down = false;
+ down_write(&device->client_data_rwsem);
+ /*
+ * Another caller to add_client_context got here first and has already
+ * completely initialized context.
+ */
+ if (xa_get_mark(&device->client_data, client->client_id,
+ CLIENT_DATA_REGISTERED))
+ goto out;
- down_write(&lists_rwsem);
- write_lock_irq(&device->client_data_lock);
- list_add(&context->list, &device->client_data_list);
- write_unlock_irq(&device->client_data_lock);
- up_write(&lists_rwsem);
+ ret = xa_err(xa_store(&device->client_data, client->client_id, NULL,
+ GFP_KERNEL));
+ if (ret)
+ goto out;
+ downgrade_write(&device->client_data_rwsem);
+ if (client->add)
+ client->add(device);
+
+ /* Readers shall not see a client until add has been completed */
+ xa_set_mark(&device->client_data, client->client_id,
+ CLIENT_DATA_REGISTERED);
+ up_read(&device->client_data_rwsem);
+ return 0;
+out:
+ up_write(&device->client_data_rwsem);
+ return ret;
+}
+
+static void remove_client_context(struct ib_device *device,
+ unsigned int client_id)
+{
+ struct ib_client *client;
+ void *client_data;
+
+ down_write(&device->client_data_rwsem);
+ if (!xa_get_mark(&device->client_data, client_id,
+ CLIENT_DATA_REGISTERED)) {
+ up_write(&device->client_data_rwsem);
+ return;
+ }
+ client_data = xa_load(&device->client_data, client_id);
+ xa_clear_mark(&device->client_data, client_id, CLIENT_DATA_REGISTERED);
+ client = xa_load(&clients, client_id);
+ downgrade_write(&device->client_data_rwsem);
+
+ /*
+ * Notice we cannot be holding any exclusive locks when calling the
+ * remove callback as the remove callback can recurse back into any
+ * public functions in this module and thus try for any locks those
+ * functions take.
+ *
+ * For this reason clients and drivers should not call the
+ * unregistration functions while holding any locks.
+ *
+ * It is tempting to drop the client_data_rwsem too, but this is required
+ * to ensure that unregister_client does not return until all clients
+ * are completely unregistered, which is required to avoid module
+ * unloading races.
+ */
+ if (client->remove)
+ client->remove(device, client_data);
+
+ xa_erase(&device->client_data, client_id);
+ up_read(&device->client_data_rwsem);
+}
+
+static int alloc_port_data(struct ib_device *device)
+{
+ struct ib_port_data_rcu *pdata_rcu;
+ unsigned int port;
+
+ if (device->port_data)
+ return 0;
+
+ /* This can only be called once the physical port range is defined */
+ if (WARN_ON(!device->phys_port_cnt))
+ return -EINVAL;
+
+ /*
+ * device->port_data is indexed directly by the port number to make
+ * access to this data as efficient as possible.
+ *
+ * Therefore port_data is declared as a 1 based array with potential
+ * empty slots at the beginning.
+ */
+ pdata_rcu = kzalloc(struct_size(pdata_rcu, pdata,
+ rdma_end_port(device) + 1),
+ GFP_KERNEL);
+ if (!pdata_rcu)
+ return -ENOMEM;
+ /*
+ * The rcu_head is put in front of the port data array and the stored
+ * pointer is adjusted since we never need to see that member until
+ * kfree_rcu.
+ */
+ device->port_data = pdata_rcu->pdata;
+
+ rdma_for_each_port (device, port) {
+ struct ib_port_data *pdata = &device->port_data[port];
+
+ pdata->ib_dev = device;
+ spin_lock_init(&pdata->pkey_list_lock);
+ INIT_LIST_HEAD(&pdata->pkey_list);
+ spin_lock_init(&pdata->netdev_lock);
+ INIT_HLIST_NODE(&pdata->ndev_hash_link);
+ }
return 0;
}
@@ -341,29 +575,20 @@ static int verify_immutable(const struct ib_device *dev, u8 port)
rdma_max_mad_size(dev, port) != 0);
}
-static int read_port_immutable(struct ib_device *device)
+static int setup_port_data(struct ib_device *device)
{
+ unsigned int port;
int ret;
- u8 start_port = rdma_start_port(device);
- u8 end_port = rdma_end_port(device);
- u8 port;
- /**
- * device->port_immutable is indexed directly by the port number to make
- * access to this data as efficient as possible.
- *
- * Therefore port_immutable is declared as a 1 based array with
- * potential empty slots at the beginning.
- */
- device->port_immutable = kcalloc(end_port + 1,
- sizeof(*device->port_immutable),
- GFP_KERNEL);
- if (!device->port_immutable)
- return -ENOMEM;
+ ret = alloc_port_data(device);
+ if (ret)
+ return ret;
+
+ rdma_for_each_port (device, port) {
+ struct ib_port_data *pdata = &device->port_data[port];
- for (port = start_port; port <= end_port; ++port) {
- ret = device->get_port_immutable(device, port,
- &device->port_immutable[port]);
+ ret = device->ops.get_port_immutable(device, port,
+ &pdata->immutable);
if (ret)
return ret;
@@ -375,46 +600,23 @@ static int read_port_immutable(struct ib_device *device)
void ib_get_device_fw_str(struct ib_device *dev, char *str)
{
- if (dev->get_dev_fw_str)
- dev->get_dev_fw_str(dev, str);
+ if (dev->ops.get_dev_fw_str)
+ dev->ops.get_dev_fw_str(dev, str);
else
str[0] = '\0';
}
EXPORT_SYMBOL(ib_get_device_fw_str);
-static int setup_port_pkey_list(struct ib_device *device)
-{
- int i;
-
- /**
- * device->port_pkey_list is indexed directly by the port number,
- * Therefore it is declared as a 1 based array with potential empty
- * slots at the beginning.
- */
- device->port_pkey_list = kcalloc(rdma_end_port(device) + 1,
- sizeof(*device->port_pkey_list),
- GFP_KERNEL);
-
- if (!device->port_pkey_list)
- return -ENOMEM;
-
- for (i = 0; i < (rdma_end_port(device) + 1); i++) {
- spin_lock_init(&device->port_pkey_list[i].list_lock);
- INIT_LIST_HEAD(&device->port_pkey_list[i].pkey_list);
- }
-
- return 0;
-}
-
static void ib_policy_change_task(struct work_struct *work)
{
struct ib_device *dev;
+ unsigned long index;
- down_read(&lists_rwsem);
- list_for_each_entry(dev, &device_list, core_list) {
- int i;
+ down_read(&devices_rwsem);
+ xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) {
+ unsigned int i;
- for (i = rdma_start_port(dev); i <= rdma_end_port(dev); i++) {
+ rdma_for_each_port (dev, i) {
u64 sp;
int ret = ib_get_cached_subnet_prefix(dev,
i,
@@ -427,7 +629,7 @@ static void ib_policy_change_task(struct work_struct *work)
ib_security_cache_change(dev, i, sp);
}
}
- up_read(&lists_rwsem);
+ up_read(&devices_rwsem);
}
static int ib_security_change(struct notifier_block *nb, unsigned long event,
@@ -437,32 +639,43 @@ static int ib_security_change(struct notifier_block *nb, unsigned long event,
return NOTIFY_DONE;
schedule_work(&ib_policy_change_work);
+ ib_mad_agent_security_change();
return NOTIFY_OK;
}
-/**
- * __dev_new_index - allocate an device index
- *
- * Returns a suitable unique value for a new device interface
- * number. It assumes that there are less than 2^32-1 ib devices
- * will be present in the system.
+/*
+ * Assign the unique string device name and the unique device index. This is
+ * undone by ib_dealloc_device.
*/
-static u32 __dev_new_index(void)
+static int assign_name(struct ib_device *device, const char *name)
{
- /*
- * The device index to allow stable naming.
- * Similar to struct net -> ifindex.
- */
- static u32 index;
+ static u32 last_id;
+ int ret;
- for (;;) {
- if (!(++index))
- index = 1;
+ down_write(&devices_rwsem);
+ /* Assign a unique name to the device */
+ if (strchr(name, '%'))
+ ret = alloc_name(device, name);
+ else
+ ret = dev_set_name(&device->dev, name);
+ if (ret)
+ goto out;
- if (!__ib_device_get_by_index(index))
- return index;
+ if (__ib_device_get_by_name(dev_name(&device->dev))) {
+ ret = -ENFILE;
+ goto out;
}
+ strlcpy(device->name, dev_name(&device->dev), IB_DEVICE_NAME_MAX);
+
+ ret = xa_alloc_cyclic(&devices, &device->index, device, xa_limit_31b,
+ &last_id, GFP_KERNEL);
+ if (ret > 0)
+ ret = 0;
+
+out:
+ up_write(&devices_rwsem);
+ return ret;
}
static void setup_dma_device(struct ib_device *device)
@@ -500,56 +713,103 @@ static void setup_dma_device(struct ib_device *device)
}
}
-static void cleanup_device(struct ib_device *device)
-{
- ib_cache_cleanup_one(device);
- ib_cache_release_one(device);
- kfree(device->port_pkey_list);
- kfree(device->port_immutable);
-}
-
+/*
+ * setup_device() allocates memory and sets up data that requires calling the
+ * device ops, this is the only reason these actions are not done during
+ * ib_alloc_device. It is undone by ib_dealloc_device().
+ */
static int setup_device(struct ib_device *device)
{
struct ib_udata uhw = {.outlen = 0, .inlen = 0};
int ret;
+ setup_dma_device(device);
+
ret = ib_device_check_mandatory(device);
if (ret)
return ret;
- ret = read_port_immutable(device);
+ ret = setup_port_data(device);
if (ret) {
- dev_warn(&device->dev,
- "Couldn't create per port immutable data\n");
+ dev_warn(&device->dev, "Couldn't create per-port data\n");
return ret;
}
memset(&device->attrs, 0, sizeof(device->attrs));
- ret = device->query_device(device, &device->attrs, &uhw);
+ ret = device->ops.query_device(device, &device->attrs, &uhw);
if (ret) {
dev_warn(&device->dev,
"Couldn't query the device attributes\n");
- goto port_cleanup;
+ return ret;
}
- ret = setup_port_pkey_list(device);
- if (ret) {
- dev_warn(&device->dev, "Couldn't create per port_pkey_list\n");
- goto port_cleanup;
+ return 0;
+}
+
+static void disable_device(struct ib_device *device)
+{
+ struct ib_client *client;
+
+ WARN_ON(!refcount_read(&device->refcount));
+
+ down_write(&devices_rwsem);
+ xa_clear_mark(&devices, device->index, DEVICE_REGISTERED);
+ up_write(&devices_rwsem);
+
+ down_read(&clients_rwsem);
+ list_for_each_entry_reverse(client, &client_list, list)
+ remove_client_context(device, client->client_id);
+ up_read(&clients_rwsem);
+
+ /* Pairs with refcount_set in enable_device */
+ ib_device_put(device);
+ wait_for_completion(&device->unreg_completion);
+
+ /* Expedite removing unregistered pointers from the hash table */
+ free_netdevs(device);
+}
+
+/*
+ * An enabled device is visible to all clients and to all the public facing
+ * APIs that return a device pointer. This always returns with a new get, even
+ * if it fails.
+ */
+static int enable_device_and_get(struct ib_device *device)
+{
+ struct ib_client *client;
+ unsigned long index;
+ int ret = 0;
+
+ /*
+ * One ref belongs to the xa and the other belongs to this
+ * thread. This is needed to guard against parallel unregistration.
+ */
+ refcount_set(&device->refcount, 2);
+ down_write(&devices_rwsem);
+ xa_set_mark(&devices, device->index, DEVICE_REGISTERED);
+
+ /*
+ * By using downgrade_write() we ensure that no other thread can clear
+ * DEVICE_REGISTERED while we are completing the client setup.
+ */
+ downgrade_write(&devices_rwsem);
+
+ if (device->ops.enable_driver) {
+ ret = device->ops.enable_driver(device);
+ if (ret)
+ goto out;
}
- ret = ib_cache_setup_one(device);
- if (ret) {
- dev_warn(&device->dev,
- "Couldn't set up InfiniBand P_Key/GID cache\n");
- goto pkey_cleanup;
+ down_read(&clients_rwsem);
+ xa_for_each_marked (&clients, index, client, CLIENT_REGISTERED) {
+ ret = add_client_context(device, client);
+ if (ret)
+ break;
}
- return 0;
+ up_read(&clients_rwsem);
-pkey_cleanup:
- kfree(device->port_pkey_list);
-port_cleanup:
- kfree(device->port_immutable);
+out:
+ up_read(&devices_rwsem);
return ret;
}
@@ -561,125 +821,254 @@ port_cleanup:
* devices with the IB core. All registered clients will receive a
* callback for each device that is added. @device must be allocated
* with ib_alloc_device().
+ *
+ * If the driver uses ops.dealloc_driver and calls any ib_unregister_device()
+ * asynchronously then the device pointer may become freed as soon as this
+ * function returns.
*/
-int ib_register_device(struct ib_device *device, const char *name,
- int (*port_callback)(struct ib_device *, u8,
- struct kobject *))
+int ib_register_device(struct ib_device *device, const char *name)
{
int ret;
- struct ib_client *client;
-
- setup_dma_device(device);
- mutex_lock(&device_mutex);
-
- if (strchr(name, '%')) {
- ret = alloc_name(device, name);
- if (ret)
- goto out;
- } else {
- ret = dev_set_name(&device->dev, name);
- if (ret)
- goto out;
- }
- if (__ib_device_get_by_name(dev_name(&device->dev))) {
- ret = -ENFILE;
- goto out;
- }
- strlcpy(device->name, dev_name(&device->dev), IB_DEVICE_NAME_MAX);
+ ret = assign_name(device, name);
+ if (ret)
+ return ret;
ret = setup_device(device);
if (ret)
- goto out;
-
- device->index = __dev_new_index();
+ return ret;
- ret = ib_device_register_rdmacg(device);
+ ret = ib_cache_setup_one(device);
if (ret) {
dev_warn(&device->dev,
- "Couldn't register device with rdma cgroup\n");
- goto dev_cleanup;
+ "Couldn't set up InfiniBand P_Key/GID cache\n");
+ return ret;
}
- ret = ib_device_register_sysfs(device, port_callback);
+ ib_device_register_rdmacg(device);
+
+ ret = device_add(&device->dev);
+ if (ret)
+ goto cg_cleanup;
+
+ ret = ib_device_register_sysfs(device);
if (ret) {
dev_warn(&device->dev,
"Couldn't register device with driver model\n");
- goto cg_cleanup;
+ goto dev_cleanup;
}
- device->reg_state = IB_DEV_REGISTERED;
+ ret = enable_device_and_get(device);
+ if (ret) {
+ void (*dealloc_fn)(struct ib_device *);
- list_for_each_entry(client, &client_list, list)
- if (!add_client_context(device, client) && client->add)
- client->add(device);
+ /*
+ * If we hit this error flow then we don't want to
+ * automatically dealloc the device since the caller is
+ * expected to call ib_dealloc_device() after
+ * ib_register_device() fails. This is tricky due to the
+ * possibility for a parallel unregistration along with this
+ * error flow. Since we have a refcount here we know any
+ * parallel flow is stopped in disable_device and will see the
+ * NULL pointers, causing the responsibility to
+ * ib_dealloc_device() to revert back to this thread.
+ */
+ dealloc_fn = device->ops.dealloc_driver;
+ device->ops.dealloc_driver = NULL;
+ ib_device_put(device);
+ __ib_unregister_device(device);
+ device->ops.dealloc_driver = dealloc_fn;
+ return ret;
+ }
+ ib_device_put(device);
- down_write(&lists_rwsem);
- list_add_tail(&device->core_list, &device_list);
- up_write(&lists_rwsem);
- mutex_unlock(&device_mutex);
return 0;
+dev_cleanup:
+ device_del(&device->dev);
cg_cleanup:
ib_device_unregister_rdmacg(device);
-dev_cleanup:
- cleanup_device(device);
-out:
- mutex_unlock(&device_mutex);
+ ib_cache_cleanup_one(device);
return ret;
}
EXPORT_SYMBOL(ib_register_device);
+/* Callers must hold a get on the device. */
+static void __ib_unregister_device(struct ib_device *ib_dev)
+{
+ /*
+ * We have a registration lock so that all the calls to unregister are
+ * fully fenced, once any unregister returns the device is truly
+ * unregistered even if multiple callers are unregistering it at the
+ * same time. This also interacts with the registration flow and
+ * provides sane semantics if register and unregister are racing.
+ */
+ mutex_lock(&ib_dev->unregistration_lock);
+ if (!refcount_read(&ib_dev->refcount))
+ goto out;
+
+ disable_device(ib_dev);
+ ib_device_unregister_sysfs(ib_dev);
+ device_del(&ib_dev->dev);
+ ib_device_unregister_rdmacg(ib_dev);
+ ib_cache_cleanup_one(ib_dev);
+
+ /*
+ * Drivers using the new flow may not call ib_dealloc_device except
+ * in error unwind prior to registration success.
+ */
+ if (ib_dev->ops.dealloc_driver) {
+ WARN_ON(kref_read(&ib_dev->dev.kobj.kref) <= 1);
+ ib_dealloc_device(ib_dev);
+ }
+out:
+ mutex_unlock(&ib_dev->unregistration_lock);
+}
+
/**
* ib_unregister_device - Unregister an IB device
- * @device:Device to unregister
+ * @device: The device to unregister
*
* Unregister an IB device. All clients will receive a remove callback.
+ *
+ * Callers should call this routine only once, and protect against races with
+ * registration. Typically it should only be called as part of a remove
+ * callback in an implementation of driver core's struct device_driver and
+ * related.
+ *
+ * If ops.dealloc_driver is used then ib_dev will be freed upon return from
+ * this function.
*/
-void ib_unregister_device(struct ib_device *device)
+void ib_unregister_device(struct ib_device *ib_dev)
{
- struct ib_client_data *context, *tmp;
- unsigned long flags;
+ get_device(&ib_dev->dev);
+ __ib_unregister_device(ib_dev);
+ put_device(&ib_dev->dev);
+}
+EXPORT_SYMBOL(ib_unregister_device);
- mutex_lock(&device_mutex);
+/**
+ * ib_unregister_device_and_put - Unregister a device while holding a 'get'
+ * @ib_dev: The device to unregister
+ *
+ * This is the same as ib_unregister_device(), except it includes an internal
+ * ib_device_put() that should match a 'get' obtained by the caller.
+ *
+ * It is safe to call this routine concurrently from multiple threads while
+ * holding the 'get'. When the function returns the device is fully
+ * unregistered.
+ *
+ * Drivers using this flow MUST use the ops.dealloc_driver callback to clean up
+ * their resources associated with the device and dealloc it.
+ */
+void ib_unregister_device_and_put(struct ib_device *ib_dev)
+{
+ WARN_ON(!ib_dev->ops.dealloc_driver);
+ get_device(&ib_dev->dev);
+ ib_device_put(ib_dev);
+ __ib_unregister_device(ib_dev);
+ put_device(&ib_dev->dev);
+}
+EXPORT_SYMBOL(ib_unregister_device_and_put);
+
+/**
+ * ib_unregister_driver - Unregister all IB devices for a driver
+ * @driver_id: The driver to unregister
+ *
+ * This implements a fence for device unregistration. It only returns once all
+ * devices associated with the driver_id have fully completed their
+ * unregistration and returned from ib_unregister_device*().
+ *
+ * If devices are not yet unregistered it goes ahead and starts unregistering
+ * them.
+ *
+ * This does not block creation of new devices with the given driver_id, that
+ * is the responsibility of the caller.
+ */
+void ib_unregister_driver(enum rdma_driver_id driver_id)
+{
+ struct ib_device *ib_dev;
+ unsigned long index;
+
+ down_read(&devices_rwsem);
+ xa_for_each (&devices, index, ib_dev) {
+ if (ib_dev->driver_id != driver_id)
+ continue;
- down_write(&lists_rwsem);
- list_del(&device->core_list);
- write_lock_irq(&device->client_data_lock);
- list_for_each_entry(context, &device->client_data_list, list)
- context->going_down = true;
- write_unlock_irq(&device->client_data_lock);
- downgrade_write(&lists_rwsem);
+ get_device(&ib_dev->dev);
+ up_read(&devices_rwsem);
- list_for_each_entry(context, &device->client_data_list, list) {
- if (context->client->remove)
- context->client->remove(device, context->data);
+ WARN_ON(!ib_dev->ops.dealloc_driver);
+ __ib_unregister_device(ib_dev);
+
+ put_device(&ib_dev->dev);
+ down_read(&devices_rwsem);
}
- up_read(&lists_rwsem);
+ up_read(&devices_rwsem);
+}
+EXPORT_SYMBOL(ib_unregister_driver);
- ib_device_unregister_sysfs(device);
- ib_device_unregister_rdmacg(device);
+static void ib_unregister_work(struct work_struct *work)
+{
+ struct ib_device *ib_dev =
+ container_of(work, struct ib_device, unregistration_work);
- mutex_unlock(&device_mutex);
+ __ib_unregister_device(ib_dev);
+ put_device(&ib_dev->dev);
+}
- ib_cache_cleanup_one(device);
+/**
+ * ib_unregister_device_queued - Unregister a device using a work queue
+ * @ib_dev: The device to unregister
+ *
+ * This schedules an asynchronous unregistration using a WQ for the device. A
+ * driver should use this to avoid holding locks while doing unregistration,
+ * such as holding the RTNL lock.
+ *
+ * Drivers using this API must use ib_unregister_driver before module unload
+ * to ensure that all scheduled unregistrations have completed.
+ */
+void ib_unregister_device_queued(struct ib_device *ib_dev)
+{
+ WARN_ON(!refcount_read(&ib_dev->refcount));
+ WARN_ON(!ib_dev->ops.dealloc_driver);
+ get_device(&ib_dev->dev);
+ if (!queue_work(system_unbound_wq, &ib_dev->unregistration_work))
+ put_device(&ib_dev->dev);
+}
+EXPORT_SYMBOL(ib_unregister_device_queued);
- ib_security_destroy_port_pkey_list(device);
- kfree(device->port_pkey_list);
+static int assign_client_id(struct ib_client *client)
+{
+ int ret;
- down_write(&lists_rwsem);
- write_lock_irqsave(&device->client_data_lock, flags);
- list_for_each_entry_safe(context, tmp, &device->client_data_list,
- list) {
- list_del(&context->list);
- kfree(context);
+ down_write(&clients_rwsem);
+ /*
+ * The add/remove callbacks must be called in FIFO/LIFO order. To
+ * achieve this we assign client_ids so they are sorted in
+ * registration order, and retain a linked list we can reverse iterate
+ * to get the LIFO order. The extra linked list can go away if xarray
+ * learns to reverse iterate.
+ */
+ if (list_empty(&client_list)) {
+ client->client_id = 0;
+ } else {
+ struct ib_client *last;
+
+ last = list_last_entry(&client_list, struct ib_client, list);
+ client->client_id = last->client_id + 1;
}
- write_unlock_irqrestore(&device->client_data_lock, flags);
- up_write(&lists_rwsem);
+ ret = xa_insert(&clients, client->client_id, client, GFP_KERNEL);
+ if (ret)
+ goto out;
- device->reg_state = IB_DEV_UNREGISTERED;
+ xa_set_mark(&clients, client->client_id, CLIENT_REGISTERED);
+ list_add_tail(&client->list, &client_list);
+
+out:
+ up_write(&clients_rwsem);
+ return ret;
}
-EXPORT_SYMBOL(ib_unregister_device);
/**
* ib_register_client - Register an IB client
@@ -697,19 +1086,23 @@ EXPORT_SYMBOL(ib_unregister_device);
int ib_register_client(struct ib_client *client)
{
struct ib_device *device;
+ unsigned long index;
+ int ret;
- mutex_lock(&device_mutex);
-
- list_for_each_entry(device, &device_list, core_list)
- if (!add_client_context(device, client) && client->add)
- client->add(device);
-
- down_write(&lists_rwsem);
- list_add_tail(&client->list, &client_list);
- up_write(&lists_rwsem);
-
- mutex_unlock(&device_mutex);
+ ret = assign_client_id(client);
+ if (ret)
+ return ret;
+ down_read(&devices_rwsem);
+ xa_for_each_marked (&devices, index, device, DEVICE_REGISTERED) {
+ ret = add_client_context(device, client);
+ if (ret) {
+ up_read(&devices_rwsem);
+ ib_unregister_client(client);
+ return ret;
+ }
+ }
+ up_read(&devices_rwsem);
return 0;
}
EXPORT_SYMBOL(ib_register_client);
@@ -721,108 +1114,56 @@ EXPORT_SYMBOL(ib_register_client);
* Upper level users use ib_unregister_client() to remove their client
* registration. When ib_unregister_client() is called, the client
* will receive a remove callback for each IB device still registered.
+ *
+ * This is a full fence: once it returns, no client callbacks will be called
+ * and none are still running in another thread.
*/
void ib_unregister_client(struct ib_client *client)
{
- struct ib_client_data *context;
struct ib_device *device;
+ unsigned long index;
- mutex_lock(&device_mutex);
+ down_write(&clients_rwsem);
+ xa_clear_mark(&clients, client->client_id, CLIENT_REGISTERED);
+ up_write(&clients_rwsem);
+ /*
+ * Every device still known must be serialized to make sure we are
+ * done with the client callbacks before we return.
+ */
+ down_read(&devices_rwsem);
+ xa_for_each (&devices, index, device)
+ remove_client_context(device, client->client_id);
+ up_read(&devices_rwsem);
- down_write(&lists_rwsem);
+ down_write(&clients_rwsem);
list_del(&client->list);
- up_write(&lists_rwsem);
-
- list_for_each_entry(device, &device_list, core_list) {
- struct ib_client_data *found_context = NULL;
-
- down_write(&lists_rwsem);
- write_lock_irq(&device->client_data_lock);
- list_for_each_entry(context, &device->client_data_list, list)
- if (context->client == client) {
- context->going_down = true;
- found_context = context;
- break;
- }
- write_unlock_irq(&device->client_data_lock);
- up_write(&lists_rwsem);
-
- if (client->remove)
- client->remove(device, found_context ?
- found_context->data : NULL);
-
- if (!found_context) {
- dev_warn(&device->dev,
- "No client context found for %s\n",
- client->name);
- continue;
- }
-
- down_write(&lists_rwsem);
- write_lock_irq(&device->client_data_lock);
- list_del(&found_context->list);
- write_unlock_irq(&device->client_data_lock);
- up_write(&lists_rwsem);
- kfree(found_context);
- }
-
- mutex_unlock(&device_mutex);
+ xa_erase(&clients, client->client_id);
+ up_write(&clients_rwsem);
}
EXPORT_SYMBOL(ib_unregister_client);
/**
- * ib_get_client_data - Get IB client context
- * @device:Device to get context for
- * @client:Client to get context for
- *
- * ib_get_client_data() returns client context set with
- * ib_set_client_data().
- */
-void *ib_get_client_data(struct ib_device *device, struct ib_client *client)
-{
- struct ib_client_data *context;
- void *ret = NULL;
- unsigned long flags;
-
- read_lock_irqsave(&device->client_data_lock, flags);
- list_for_each_entry(context, &device->client_data_list, list)
- if (context->client == client) {
- ret = context->data;
- break;
- }
- read_unlock_irqrestore(&device->client_data_lock, flags);
-
- return ret;
-}
-EXPORT_SYMBOL(ib_get_client_data);
-
-/**
* ib_set_client_data - Set IB client context
* @device:Device to set context for
* @client:Client to set context for
* @data:Context to set
*
- * ib_set_client_data() sets client context that can be retrieved with
- * ib_get_client_data().
+ * ib_set_client_data() sets client context data that can be retrieved with
+ * ib_get_client_data(). This can only be called while the client is
+ * registered to the device, once the ib_client remove() callback returns this
+ * cannot be called.
*/
void ib_set_client_data(struct ib_device *device, struct ib_client *client,
void *data)
{
- struct ib_client_data *context;
- unsigned long flags;
+ void *rc;
- write_lock_irqsave(&device->client_data_lock, flags);
- list_for_each_entry(context, &device->client_data_list, list)
- if (context->client == client) {
- context->data = data;
- goto out;
- }
-
- dev_warn(&device->dev, "No client context found for %s\n",
- client->name);
+ if (WARN_ON(IS_ERR(data)))
+ data = NULL;
-out:
- write_unlock_irqrestore(&device->client_data_lock, flags);
+ rc = xa_store(&device->client_data, client->client_id, data,
+ GFP_KERNEL);
+ WARN_ON(xa_is_err(rc));
}
EXPORT_SYMBOL(ib_set_client_data);
@@ -905,14 +1246,14 @@ int ib_query_port(struct ib_device *device,
return -EINVAL;
memset(port_attr, 0, sizeof(*port_attr));
- err = device->query_port(device, port_num, port_attr);
+ err = device->ops.query_port(device, port_num, port_attr);
if (err || port_attr->subnet_prefix)
return err;
if (rdma_port_get_link_layer(device, port_num) != IB_LINK_LAYER_INFINIBAND)
return 0;
- err = device->query_gid(device, port_num, 0, &gid);
+ err = device->ops.query_gid(device, port_num, 0, &gid);
if (err)
return err;
@@ -921,6 +1262,185 @@ int ib_query_port(struct ib_device *device,
}
EXPORT_SYMBOL(ib_query_port);
+static void add_ndev_hash(struct ib_port_data *pdata)
+{
+ unsigned long flags;
+
+ might_sleep();
+
+ spin_lock_irqsave(&ndev_hash_lock, flags);
+ if (hash_hashed(&pdata->ndev_hash_link)) {
+ hash_del_rcu(&pdata->ndev_hash_link);
+ spin_unlock_irqrestore(&ndev_hash_lock, flags);
+ /*
+ * We cannot do hash_add_rcu after a hash_del_rcu until the
+ * grace period has elapsed
+ */
+ synchronize_rcu();
+ spin_lock_irqsave(&ndev_hash_lock, flags);
+ }
+ if (pdata->netdev)
+ hash_add_rcu(ndev_hash, &pdata->ndev_hash_link,
+ (uintptr_t)pdata->netdev);
+ spin_unlock_irqrestore(&ndev_hash_lock, flags);
+}
+
+/**
+ * ib_device_set_netdev - Associate the ib_dev with an underlying net_device
+ * @ib_dev: Device to modify
+ * @ndev: net_device to affiliate, may be NULL
+ * @port: IB port the net_device is connected to
+ *
+ * Drivers should use this to link the ib_device to a netdev so the netdev
+ * shows up in interfaces like ib_enum_roce_netdev. Only one netdev may be
+ * affiliated with any port.
+ *
+ * The caller must ensure that the given ndev is not unregistered or
+ * unregistering, and that either the ib_device is unregistered or
+ * ib_device_set_netdev() is called with NULL when the ndev sends a
+ * NETDEV_UNREGISTER event.
+ */
+int ib_device_set_netdev(struct ib_device *ib_dev, struct net_device *ndev,
+ unsigned int port)
+{
+ struct net_device *old_ndev;
+ struct ib_port_data *pdata;
+ unsigned long flags;
+ int ret;
+
+ /*
+ * Drivers wish to call this before ib_register_device, so we have to
+ * setup the port data early.
+ */
+ ret = alloc_port_data(ib_dev);
+ if (ret)
+ return ret;
+
+ if (!rdma_is_port_valid(ib_dev, port))
+ return -EINVAL;
+
+ pdata = &ib_dev->port_data[port];
+ spin_lock_irqsave(&pdata->netdev_lock, flags);
+ old_ndev = rcu_dereference_protected(
+ pdata->netdev, lockdep_is_held(&pdata->netdev_lock));
+ if (old_ndev == ndev) {
+ spin_unlock_irqrestore(&pdata->netdev_lock, flags);
+ return 0;
+ }
+
+ if (ndev)
+ dev_hold(ndev);
+ rcu_assign_pointer(pdata->netdev, ndev);
+ spin_unlock_irqrestore(&pdata->netdev_lock, flags);
+
+ add_ndev_hash(pdata);
+ if (old_ndev)
+ dev_put(old_ndev);
+
+ return 0;
+}
+EXPORT_SYMBOL(ib_device_set_netdev);
+
+static void free_netdevs(struct ib_device *ib_dev)
+{
+ unsigned long flags;
+ unsigned int port;
+
+ rdma_for_each_port (ib_dev, port) {
+ struct ib_port_data *pdata = &ib_dev->port_data[port];
+ struct net_device *ndev;
+
+ spin_lock_irqsave(&pdata->netdev_lock, flags);
+ ndev = rcu_dereference_protected(
+ pdata->netdev, lockdep_is_held(&pdata->netdev_lock));
+ if (ndev) {
+ spin_lock(&ndev_hash_lock);
+ hash_del_rcu(&pdata->ndev_hash_link);
+ spin_unlock(&ndev_hash_lock);
+
+ /*
+ * If this is the last dev_put there is still a
+ * synchronize_rcu before the netdev is kfreed, so we
+ * can continue to rely on unlocked pointer
+ * comparisons after the put
+ */
+ rcu_assign_pointer(pdata->netdev, NULL);
+ dev_put(ndev);
+ }
+ spin_unlock_irqrestore(&pdata->netdev_lock, flags);
+ }
+}
+
+struct net_device *ib_device_get_netdev(struct ib_device *ib_dev,
+ unsigned int port)
+{
+ struct ib_port_data *pdata;
+ struct net_device *res;
+
+ if (!rdma_is_port_valid(ib_dev, port))
+ return NULL;
+
+ pdata = &ib_dev->port_data[port];
+
+ /*
+ * New drivers should use ib_device_set_netdev() not the legacy
+ * get_netdev().
+ */
+ if (ib_dev->ops.get_netdev)
+ res = ib_dev->ops.get_netdev(ib_dev, port);
+ else {
+ spin_lock(&pdata->netdev_lock);
+ res = rcu_dereference_protected(
+ pdata->netdev, lockdep_is_held(&pdata->netdev_lock));
+ if (res)
+ dev_hold(res);
+ spin_unlock(&pdata->netdev_lock);
+ }
+
+ /*
+ * If we are starting to unregister expedite things by preventing
+ * propagation of an unregistering netdev.
+ */
+ if (res && res->reg_state != NETREG_REGISTERED) {
+ dev_put(res);
+ return NULL;
+ }
+
+ return res;
+}
+
+/**
+ * ib_device_get_by_netdev - Find an IB device associated with a netdev
+ * @ndev: netdev to locate
+ * @driver_id: The driver ID that must match (RDMA_DRIVER_UNKNOWN matches all)
+ *
+ * Find and hold an ib_device that is associated with a netdev via
+ * ib_device_set_netdev(). The caller must call ib_device_put() on the
+ * returned pointer.
+ */
+struct ib_device *ib_device_get_by_netdev(struct net_device *ndev,
+ enum rdma_driver_id driver_id)
+{
+ struct ib_device *res = NULL;
+ struct ib_port_data *cur;
+
+ rcu_read_lock();
+ hash_for_each_possible_rcu (ndev_hash, cur, ndev_hash_link,
+ (uintptr_t)ndev) {
+ if (rcu_access_pointer(cur->netdev) == ndev &&
+ (driver_id == RDMA_DRIVER_UNKNOWN ||
+ cur->ib_dev->driver_id == driver_id) &&
+ ib_device_try_get(cur->ib_dev)) {
+ res = cur->ib_dev;
+ break;
+ }
+ }
+ rcu_read_unlock();
+
+ return res;
+}
+EXPORT_SYMBOL(ib_device_get_by_netdev);
+
/**
* ib_enum_roce_netdev - enumerate all RoCE ports
* @ib_dev : IB device we want to query
@@ -939,21 +1459,12 @@ void ib_enum_roce_netdev(struct ib_device *ib_dev,
roce_netdev_callback cb,
void *cookie)
{
- u8 port;
+ unsigned int port;
- for (port = rdma_start_port(ib_dev); port <= rdma_end_port(ib_dev);
- port++)
+ rdma_for_each_port (ib_dev, port)
if (rdma_protocol_roce(ib_dev, port)) {
- struct net_device *idev = NULL;
-
- if (ib_dev->get_netdev)
- idev = ib_dev->get_netdev(ib_dev, port);
-
- if (idev &&
- idev->reg_state >= NETREG_UNREGISTERED) {
- dev_put(idev);
- idev = NULL;
- }
+ struct net_device *idev =
+ ib_device_get_netdev(ib_dev, port);
if (filter(ib_dev, port, idev, filter_cookie))
cb(ib_dev, port, idev, cookie);
@@ -980,11 +1491,12 @@ void ib_enum_all_roce_netdevs(roce_netdev_filter filter,
void *cookie)
{
struct ib_device *dev;
+ unsigned long index;
- down_read(&lists_rwsem);
- list_for_each_entry(dev, &device_list, core_list)
+ down_read(&devices_rwsem);
+ xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED)
ib_enum_roce_netdev(dev, filter, filter_cookie, cb, cookie);
- up_read(&lists_rwsem);
+ up_read(&devices_rwsem);
}
/**
@@ -996,19 +1508,19 @@ void ib_enum_all_roce_netdevs(roce_netdev_filter filter,
int ib_enum_all_devs(nldev_callback nldev_cb, struct sk_buff *skb,
struct netlink_callback *cb)
{
+ unsigned long index;
struct ib_device *dev;
unsigned int idx = 0;
int ret = 0;
- down_read(&lists_rwsem);
- list_for_each_entry(dev, &device_list, core_list) {
+ down_read(&devices_rwsem);
+ xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) {
ret = nldev_cb(dev, skb, cb, idx);
if (ret)
break;
idx++;
}
-
- up_read(&lists_rwsem);
+ up_read(&devices_rwsem);
return ret;
}
@@ -1024,7 +1536,10 @@ int ib_enum_all_devs(nldev_callback nldev_cb, struct sk_buff *skb,
int ib_query_pkey(struct ib_device *device,
u8 port_num, u16 index, u16 *pkey)
{
- return device->query_pkey(device, port_num, index, pkey);
+ if (!rdma_is_port_valid(device, port_num))
+ return -EINVAL;
+
+ return device->ops.query_pkey(device, port_num, index, pkey);
}
EXPORT_SYMBOL(ib_query_pkey);
@@ -1041,11 +1556,11 @@ int ib_modify_device(struct ib_device *device,
int device_modify_mask,
struct ib_device_modify *device_modify)
{
- if (!device->modify_device)
+ if (!device->ops.modify_device)
return -ENOSYS;
- return device->modify_device(device, device_modify_mask,
- device_modify);
+ return device->ops.modify_device(device, device_modify_mask,
+ device_modify);
}
EXPORT_SYMBOL(ib_modify_device);
@@ -1069,9 +1584,10 @@ int ib_modify_port(struct ib_device *device,
if (!rdma_is_port_valid(device, port_num))
return -EINVAL;
- if (device->modify_port)
- rc = device->modify_port(device, port_num, port_modify_mask,
- port_modify);
+ if (device->ops.modify_port)
+ rc = device->ops.modify_port(device, port_num,
+ port_modify_mask,
+ port_modify);
else
rc = rdma_protocol_roce(device, port_num) ? 0 : -ENOSYS;
return rc;
@@ -1091,13 +1607,15 @@ int ib_find_gid(struct ib_device *device, union ib_gid *gid,
u8 *port_num, u16 *index)
{
union ib_gid tmp_gid;
- int ret, port, i;
+ unsigned int port;
+ int ret, i;
- for (port = rdma_start_port(device); port <= rdma_end_port(device); ++port) {
+ rdma_for_each_port (device, port) {
if (!rdma_protocol_ib(device, port))
continue;
- for (i = 0; i < device->port_immutable[port].gid_tbl_len; ++i) {
+ for (i = 0; i < device->port_data[port].immutable.gid_tbl_len;
+ ++i) {
ret = rdma_query_gid(device, port, i, &tmp_gid);
if (ret)
return ret;
@@ -1129,7 +1647,8 @@ int ib_find_pkey(struct ib_device *device,
u16 tmp_pkey;
int partial_ix = -1;
- for (i = 0; i < device->port_immutable[port_num].pkey_tbl_len; ++i) {
+ for (i = 0; i < device->port_data[port_num].immutable.pkey_tbl_len;
+ ++i) {
ret = ib_query_pkey(device, port_num, i, &tmp_pkey);
if (ret)
return ret;
@@ -1162,6 +1681,7 @@ EXPORT_SYMBOL(ib_find_pkey);
* @gid: A GID that the net_dev uses to communicate.
* @addr: Contains the IP address that the request specified as its
* destination.
+ *
*/
struct net_device *ib_get_net_dev_by_params(struct ib_device *dev,
u8 port,
@@ -1170,34 +1690,144 @@ struct net_device *ib_get_net_dev_by_params(struct ib_device *dev,
const struct sockaddr *addr)
{
struct net_device *net_dev = NULL;
- struct ib_client_data *context;
+ unsigned long index;
+ void *client_data;
if (!rdma_protocol_ib(dev, port))
return NULL;
- down_read(&lists_rwsem);
-
- list_for_each_entry(context, &dev->client_data_list, list) {
- struct ib_client *client = context->client;
+ /*
+ * Holding the read side guarantees that the client will not become
+ * unregistered while we are calling get_net_dev_by_params()
+ */
+ down_read(&dev->client_data_rwsem);
+ xan_for_each_marked (&dev->client_data, index, client_data,
+ CLIENT_DATA_REGISTERED) {
+ struct ib_client *client = xa_load(&clients, index);
- if (context->going_down)
+ if (!client || !client->get_net_dev_by_params)
continue;
- if (client->get_net_dev_by_params) {
- net_dev = client->get_net_dev_by_params(dev, port, pkey,
- gid, addr,
- context->data);
- if (net_dev)
- break;
- }
+ net_dev = client->get_net_dev_by_params(dev, port, pkey, gid,
+ addr, client_data);
+ if (net_dev)
+ break;
}
-
- up_read(&lists_rwsem);
+ up_read(&dev->client_data_rwsem);
return net_dev;
}
EXPORT_SYMBOL(ib_get_net_dev_by_params);
+void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops)
+{
+ struct ib_device_ops *dev_ops = &dev->ops;
+#define SET_DEVICE_OP(ptr, name) \
+ do { \
+ if (ops->name) \
+ if (!((ptr)->name)) \
+ (ptr)->name = ops->name; \
+ } while (0)
+
+#define SET_OBJ_SIZE(ptr, name) SET_DEVICE_OP(ptr, size_##name)
+
+ SET_DEVICE_OP(dev_ops, add_gid);
+ SET_DEVICE_OP(dev_ops, advise_mr);
+ SET_DEVICE_OP(dev_ops, alloc_dm);
+ SET_DEVICE_OP(dev_ops, alloc_fmr);
+ SET_DEVICE_OP(dev_ops, alloc_hw_stats);
+ SET_DEVICE_OP(dev_ops, alloc_mr);
+ SET_DEVICE_OP(dev_ops, alloc_mw);
+ SET_DEVICE_OP(dev_ops, alloc_pd);
+ SET_DEVICE_OP(dev_ops, alloc_rdma_netdev);
+ SET_DEVICE_OP(dev_ops, alloc_ucontext);
+ SET_DEVICE_OP(dev_ops, alloc_xrcd);
+ SET_DEVICE_OP(dev_ops, attach_mcast);
+ SET_DEVICE_OP(dev_ops, check_mr_status);
+ SET_DEVICE_OP(dev_ops, create_ah);
+ SET_DEVICE_OP(dev_ops, create_counters);
+ SET_DEVICE_OP(dev_ops, create_cq);
+ SET_DEVICE_OP(dev_ops, create_flow);
+ SET_DEVICE_OP(dev_ops, create_flow_action_esp);
+ SET_DEVICE_OP(dev_ops, create_qp);
+ SET_DEVICE_OP(dev_ops, create_rwq_ind_table);
+ SET_DEVICE_OP(dev_ops, create_srq);
+ SET_DEVICE_OP(dev_ops, create_wq);
+ SET_DEVICE_OP(dev_ops, dealloc_dm);
+ SET_DEVICE_OP(dev_ops, dealloc_driver);
+ SET_DEVICE_OP(dev_ops, dealloc_fmr);
+ SET_DEVICE_OP(dev_ops, dealloc_mw);
+ SET_DEVICE_OP(dev_ops, dealloc_pd);
+ SET_DEVICE_OP(dev_ops, dealloc_ucontext);
+ SET_DEVICE_OP(dev_ops, dealloc_xrcd);
+ SET_DEVICE_OP(dev_ops, del_gid);
+ SET_DEVICE_OP(dev_ops, dereg_mr);
+ SET_DEVICE_OP(dev_ops, destroy_ah);
+ SET_DEVICE_OP(dev_ops, destroy_counters);
+ SET_DEVICE_OP(dev_ops, destroy_cq);
+ SET_DEVICE_OP(dev_ops, destroy_flow);
+ SET_DEVICE_OP(dev_ops, destroy_flow_action);
+ SET_DEVICE_OP(dev_ops, destroy_qp);
+ SET_DEVICE_OP(dev_ops, destroy_rwq_ind_table);
+ SET_DEVICE_OP(dev_ops, destroy_srq);
+ SET_DEVICE_OP(dev_ops, destroy_wq);
+ SET_DEVICE_OP(dev_ops, detach_mcast);
+ SET_DEVICE_OP(dev_ops, disassociate_ucontext);
+ SET_DEVICE_OP(dev_ops, drain_rq);
+ SET_DEVICE_OP(dev_ops, drain_sq);
+ SET_DEVICE_OP(dev_ops, enable_driver);
+ SET_DEVICE_OP(dev_ops, fill_res_entry);
+ SET_DEVICE_OP(dev_ops, get_dev_fw_str);
+ SET_DEVICE_OP(dev_ops, get_dma_mr);
+ SET_DEVICE_OP(dev_ops, get_hw_stats);
+ SET_DEVICE_OP(dev_ops, get_link_layer);
+ SET_DEVICE_OP(dev_ops, get_netdev);
+ SET_DEVICE_OP(dev_ops, get_port_immutable);
+ SET_DEVICE_OP(dev_ops, get_vector_affinity);
+ SET_DEVICE_OP(dev_ops, get_vf_config);
+ SET_DEVICE_OP(dev_ops, get_vf_stats);
+ SET_DEVICE_OP(dev_ops, init_port);
+ SET_DEVICE_OP(dev_ops, map_mr_sg);
+ SET_DEVICE_OP(dev_ops, map_phys_fmr);
+ SET_DEVICE_OP(dev_ops, mmap);
+ SET_DEVICE_OP(dev_ops, modify_ah);
+ SET_DEVICE_OP(dev_ops, modify_cq);
+ SET_DEVICE_OP(dev_ops, modify_device);
+ SET_DEVICE_OP(dev_ops, modify_flow_action_esp);
+ SET_DEVICE_OP(dev_ops, modify_port);
+ SET_DEVICE_OP(dev_ops, modify_qp);
+ SET_DEVICE_OP(dev_ops, modify_srq);
+ SET_DEVICE_OP(dev_ops, modify_wq);
+ SET_DEVICE_OP(dev_ops, peek_cq);
+ SET_DEVICE_OP(dev_ops, poll_cq);
+ SET_DEVICE_OP(dev_ops, post_recv);
+ SET_DEVICE_OP(dev_ops, post_send);
+ SET_DEVICE_OP(dev_ops, post_srq_recv);
+ SET_DEVICE_OP(dev_ops, process_mad);
+ SET_DEVICE_OP(dev_ops, query_ah);
+ SET_DEVICE_OP(dev_ops, query_device);
+ SET_DEVICE_OP(dev_ops, query_gid);
+ SET_DEVICE_OP(dev_ops, query_pkey);
+ SET_DEVICE_OP(dev_ops, query_port);
+ SET_DEVICE_OP(dev_ops, query_qp);
+ SET_DEVICE_OP(dev_ops, query_srq);
+ SET_DEVICE_OP(dev_ops, rdma_netdev_get_params);
+ SET_DEVICE_OP(dev_ops, read_counters);
+ SET_DEVICE_OP(dev_ops, reg_dm_mr);
+ SET_DEVICE_OP(dev_ops, reg_user_mr);
+ SET_DEVICE_OP(dev_ops, req_ncomp_notif);
+ SET_DEVICE_OP(dev_ops, req_notify_cq);
+ SET_DEVICE_OP(dev_ops, rereg_user_mr);
+ SET_DEVICE_OP(dev_ops, resize_cq);
+ SET_DEVICE_OP(dev_ops, set_vf_guid);
+ SET_DEVICE_OP(dev_ops, set_vf_link_state);
+ SET_DEVICE_OP(dev_ops, unmap_fmr);
+
+ SET_OBJ_SIZE(dev_ops, ib_pd);
+ SET_OBJ_SIZE(dev_ops, ib_ucontext);
+}
+EXPORT_SYMBOL(ib_set_device_ops);
+
static const struct rdma_nl_cbs ibnl_ls_cb_table[RDMA_NL_LS_NUM_OPS] = {
[RDMA_NL_LS_OP_RESOLVE] = {
.doit = ib_nl_handle_resolve_resp,
@@ -1313,6 +1943,9 @@ static void __exit ib_core_cleanup(void)
destroy_workqueue(ib_comp_wq);
/* Make sure that any pending umem accounting work is done. */
destroy_workqueue(ib_wq);
+ flush_workqueue(system_unbound_wq);
+ WARN_ON(!xa_empty(&clients));
+ WARN_ON(!xa_empty(&devices));
}
MODULE_ALIAS_RDMA_NETLINK(RDMA_NL_LS, 4);
diff --git a/drivers/infiniband/core/fmr_pool.c b/drivers/infiniband/core/fmr_pool.c
index 83ba0068e8bb..7d841b689a1e 100644
--- a/drivers/infiniband/core/fmr_pool.c
+++ b/drivers/infiniband/core/fmr_pool.c
@@ -211,8 +211,8 @@ struct ib_fmr_pool *ib_create_fmr_pool(struct ib_pd *pd,
return ERR_PTR(-EINVAL);
device = pd->device;
- if (!device->alloc_fmr || !device->dealloc_fmr ||
- !device->map_phys_fmr || !device->unmap_fmr) {
+ if (!device->ops.alloc_fmr || !device->ops.dealloc_fmr ||
+ !device->ops.map_phys_fmr || !device->ops.unmap_fmr) {
dev_info(&device->dev, "Device does not support FMRs\n");
return ERR_PTR(-ENOSYS);
}
@@ -474,7 +474,7 @@ EXPORT_SYMBOL(ib_fmr_pool_map_phys);
* Unmap an FMR. The FMR mapping may remain valid until the FMR is
* reused (or until ib_flush_fmr_pool() is called).
*/
-int ib_fmr_pool_unmap(struct ib_pool_fmr *fmr)
+void ib_fmr_pool_unmap(struct ib_pool_fmr *fmr)
{
struct ib_fmr_pool *pool;
unsigned long flags;
@@ -503,7 +503,5 @@ int ib_fmr_pool_unmap(struct ib_pool_fmr *fmr)
#endif
spin_unlock_irqrestore(&pool->pool_lock, flags);
-
- return 0;
}
EXPORT_SYMBOL(ib_fmr_pool_unmap);
diff --git a/drivers/infiniband/core/iwcm.c b/drivers/infiniband/core/iwcm.c
index ba668d49c751..732637c913d9 100644
--- a/drivers/infiniband/core/iwcm.c
+++ b/drivers/infiniband/core/iwcm.c
@@ -87,7 +87,8 @@ static struct rdma_nl_cbs iwcm_nl_cb_table[RDMA_NL_IWPM_NUM_OPS] = {
[RDMA_NL_IWPM_REMOTE_INFO] = {.dump = iwpm_remote_info_cb},
[RDMA_NL_IWPM_HANDLE_ERR] = {.dump = iwpm_mapping_error_cb},
[RDMA_NL_IWPM_MAPINFO] = {.dump = iwpm_mapping_info_cb},
- [RDMA_NL_IWPM_MAPINFO_NUM] = {.dump = iwpm_ack_mapping_info_cb}
+ [RDMA_NL_IWPM_MAPINFO_NUM] = {.dump = iwpm_ack_mapping_info_cb},
+ [RDMA_NL_IWPM_HELLO] = {.dump = iwpm_hello_cb}
};
static struct workqueue_struct *iwcm_wq;
@@ -502,17 +503,21 @@ static void iw_cm_check_wildcard(struct sockaddr_storage *pm_addr,
*/
static int iw_cm_map(struct iw_cm_id *cm_id, bool active)
{
- struct iwpm_dev_data pm_reg_msg;
+ const char *devname = dev_name(&cm_id->device->dev);
+ const char *ifname = cm_id->device->iwcm->ifname;
+ struct iwpm_dev_data pm_reg_msg = {};
struct iwpm_sa_data pm_msg;
int status;
+ if (strlen(devname) >= sizeof(pm_reg_msg.dev_name) ||
+ strlen(ifname) >= sizeof(pm_reg_msg.if_name))
+ return -EINVAL;
+
cm_id->m_local_addr = cm_id->local_addr;
cm_id->m_remote_addr = cm_id->remote_addr;
- memcpy(pm_reg_msg.dev_name, dev_name(&cm_id->device->dev),
- sizeof(pm_reg_msg.dev_name));
- memcpy(pm_reg_msg.if_name, cm_id->device->iwcm->ifname,
- sizeof(pm_reg_msg.if_name));
+ strcpy(pm_reg_msg.dev_name, devname);
+ strcpy(pm_reg_msg.if_name, ifname);
if (iwpm_register_pid(&pm_reg_msg, RDMA_NL_IWCM) ||
!iwpm_valid_pid())
@@ -521,6 +526,8 @@ static int iw_cm_map(struct iw_cm_id *cm_id, bool active)
cm_id->mapped = true;
pm_msg.loc_addr = cm_id->local_addr;
pm_msg.rem_addr = cm_id->remote_addr;
+ pm_msg.flags = (cm_id->device->iwcm->driver_flags & IW_F_NO_PORT_MAP) ?
+ IWPM_FLAGS_NO_PORT_MAP : 0;
if (active)
status = iwpm_add_and_query_mapping(&pm_msg,
RDMA_NL_IWCM);
@@ -539,7 +546,7 @@ static int iw_cm_map(struct iw_cm_id *cm_id, bool active)
return iwpm_create_mapinfo(&cm_id->local_addr,
&cm_id->m_local_addr,
- RDMA_NL_IWCM);
+ RDMA_NL_IWCM, pm_msg.flags);
}
/*
diff --git a/drivers/infiniband/core/iwpm_msg.c b/drivers/infiniband/core/iwpm_msg.c
index 8861c052155a..2452b0ddcf0d 100644
--- a/drivers/infiniband/core/iwpm_msg.c
+++ b/drivers/infiniband/core/iwpm_msg.c
@@ -34,18 +34,25 @@
#include "iwpm_util.h"
static const char iwpm_ulib_name[IWPM_ULIBNAME_SIZE] = "iWarpPortMapperUser";
-static int iwpm_ulib_version = 3;
+u16 iwpm_ulib_version = IWPM_UABI_VERSION_MIN;
static int iwpm_user_pid = IWPM_PID_UNDEFINED;
static atomic_t echo_nlmsg_seq;
+/**
+ * iwpm_valid_pid - Check if the userspace iwarp port mapper pid is valid
+ *
+ * Returns true if the pid is greater than zero, otherwise returns false
+ */
int iwpm_valid_pid(void)
{
return iwpm_user_pid > 0;
}
-/*
- * iwpm_register_pid - Send a netlink query to user space
- * for the iwarp port mapper pid
+/**
+ * iwpm_register_pid - Send a netlink query to userspace
+ * to get the iwarp port mapper pid
+ * @pm_msg: Contains driver info to send to the userspace port mapper
+ * @nl_client: The index of the netlink client
*
* nlmsg attributes:
* [IWPM_NLA_REG_PID_SEQ]
@@ -124,12 +131,19 @@ pid_query_error:
return ret;
}
-/*
- * iwpm_add_mapping - Send a netlink add mapping message
- * to the port mapper
+/**
+ * iwpm_add_mapping - Send a netlink add mapping request to
+ * the userspace port mapper
+ * @pm_msg: Contains the local ip/tcp address info to send
+ * @nl_client: The index of the netlink client
+ *
* nlmsg attributes:
* [IWPM_NLA_MANAGE_MAPPING_SEQ]
* [IWPM_NLA_MANAGE_ADDR]
+ * [IWPM_NLA_MANAGE_FLAGS]
+ *
+ * If the request is successful, the pm_msg stores
+ * the port mapper response (mapped address info)
*/
int iwpm_add_mapping(struct iwpm_sa_data *pm_msg, u8 nl_client)
{
@@ -173,6 +187,18 @@ int iwpm_add_mapping(struct iwpm_sa_data *pm_msg, u8 nl_client)
if (ret)
goto add_mapping_error;
+ /* If flags are required and we're not V4, then return a quiet error */
+ if (pm_msg->flags && iwpm_ulib_version == IWPM_UABI_VERSION_MIN) {
+ ret = -EINVAL;
+ goto add_mapping_error_nowarn;
+ }
+ if (iwpm_ulib_version > IWPM_UABI_VERSION_MIN) {
+ ret = ibnl_put_attr(skb, nlh, sizeof(u32), &pm_msg->flags,
+ IWPM_NLA_MANAGE_FLAGS);
+ if (ret)
+ goto add_mapping_error;
+ }
+
nlmsg_end(skb, nlh);
nlmsg_request->req_buffer = pm_msg;
@@ -187,6 +213,7 @@ int iwpm_add_mapping(struct iwpm_sa_data *pm_msg, u8 nl_client)
return ret;
add_mapping_error:
pr_info("%s: %s (client = %d)\n", __func__, err_str, nl_client);
+add_mapping_error_nowarn:
if (skb)
dev_kfree_skb(skb);
if (nlmsg_request)
@@ -194,13 +221,17 @@ add_mapping_error:
return ret;
}
-/*
- * iwpm_add_and_query_mapping - Send a netlink add and query
- * mapping message to the port mapper
+/**
+ * iwpm_add_and_query_mapping - Send a netlink add and query mapping
+ * request to the userspace port mapper
+ * @pm_msg: Contains the local ip/tcp address info to send
+ * @nl_client: The index of the netlink client
+ *
* nlmsg attributes:
* [IWPM_NLA_QUERY_MAPPING_SEQ]
* [IWPM_NLA_QUERY_LOCAL_ADDR]
* [IWPM_NLA_QUERY_REMOTE_ADDR]
+ * [IWPM_NLA_QUERY_FLAGS]
*/
int iwpm_add_and_query_mapping(struct iwpm_sa_data *pm_msg, u8 nl_client)
{
@@ -251,6 +282,18 @@ int iwpm_add_and_query_mapping(struct iwpm_sa_data *pm_msg, u8 nl_client)
if (ret)
goto query_mapping_error;
+ /* If flags are required and we're not V4, then return a quiet error */
+ if (pm_msg->flags && iwpm_ulib_version == IWPM_UABI_VERSION_MIN) {
+ ret = -EINVAL;
+ goto query_mapping_error_nowarn;
+ }
+ if (iwpm_ulib_version > IWPM_UABI_VERSION_MIN) {
+ ret = ibnl_put_attr(skb, nlh, sizeof(u32), &pm_msg->flags,
+ IWPM_NLA_QUERY_FLAGS);
+ if (ret)
+ goto query_mapping_error;
+ }
+
nlmsg_end(skb, nlh);
nlmsg_request->req_buffer = pm_msg;
@@ -264,6 +307,7 @@ int iwpm_add_and_query_mapping(struct iwpm_sa_data *pm_msg, u8 nl_client)
return ret;
query_mapping_error:
pr_info("%s: %s (client = %d)\n", __func__, err_str, nl_client);
+query_mapping_error_nowarn:
if (skb)
dev_kfree_skb(skb);
if (nlmsg_request)
@@ -271,9 +315,13 @@ query_mapping_error:
return ret;
}
-/*
- * iwpm_remove_mapping - Send a netlink remove mapping message
- * to the port mapper
+/**
+ * iwpm_remove_mapping - Send a netlink remove mapping request
+ * to the userspace port mapper
+ *
+ * @local_addr: Local ip/tcp address to remove
+ * @nl_client: The index of the netlink client
+ *
* nlmsg attributes:
* [IWPM_NLA_MANAGE_MAPPING_SEQ]
* [IWPM_NLA_MANAGE_ADDR]
@@ -344,9 +392,14 @@ static const struct nla_policy resp_reg_policy[IWPM_NLA_RREG_PID_MAX] = {
[IWPM_NLA_RREG_PID_ERR] = { .type = NLA_U16 }
};
-/*
- * iwpm_register_pid_cb - Process a port mapper response to
- * iwpm_register_pid()
+/**
+ * iwpm_register_pid_cb - Process the port mapper response to
+ * iwpm_register_pid query
+ * @skb:
+ * @cb: Contains the received message (payload and netlink header)
+ *
+ * If successful, the function receives the userspace port mapper pid
+ * which is used in future communication with the port mapper
*/
int iwpm_register_pid_cb(struct sk_buff *skb, struct netlink_callback *cb)
{
@@ -379,7 +432,7 @@ int iwpm_register_pid_cb(struct sk_buff *skb, struct netlink_callback *cb)
/* check device name, ulib name and version */
if (strcmp(pm_msg->dev_name, dev_name) ||
strcmp(iwpm_ulib_name, iwpm_name) ||
- iwpm_version != iwpm_ulib_version) {
+ iwpm_version < IWPM_UABI_VERSION_MIN) {
pr_info("%s: Incorrect info (dev = %s name = %s version = %d)\n",
__func__, dev_name, iwpm_name, iwpm_version);
@@ -387,6 +440,10 @@ int iwpm_register_pid_cb(struct sk_buff *skb, struct netlink_callback *cb)
goto register_pid_response_exit;
}
iwpm_user_pid = cb->nlh->nlmsg_pid;
+ iwpm_ulib_version = iwpm_version;
+ if (iwpm_ulib_version < IWPM_UABI_VERSION)
+ pr_warn_once("%s: Down level iwpmd/pid %u. Continuing...",
+ __func__, iwpm_user_pid);
atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq);
pr_debug("%s: iWarp Port Mapper (pid = %d) is available!\n",
__func__, iwpm_user_pid);
@@ -403,15 +460,19 @@ register_pid_response_exit:
/* netlink attribute policy for the received response to add mapping request */
static const struct nla_policy resp_add_policy[IWPM_NLA_RMANAGE_MAPPING_MAX] = {
- [IWPM_NLA_MANAGE_MAPPING_SEQ] = { .type = NLA_U32 },
- [IWPM_NLA_MANAGE_ADDR] = { .len = sizeof(struct sockaddr_storage) },
- [IWPM_NLA_MANAGE_MAPPED_LOC_ADDR] = { .len = sizeof(struct sockaddr_storage) },
- [IWPM_NLA_RMANAGE_MAPPING_ERR] = { .type = NLA_U16 }
+ [IWPM_NLA_RMANAGE_MAPPING_SEQ] = { .type = NLA_U32 },
+ [IWPM_NLA_RMANAGE_ADDR] = {
+ .len = sizeof(struct sockaddr_storage) },
+ [IWPM_NLA_RMANAGE_MAPPED_LOC_ADDR] = {
+ .len = sizeof(struct sockaddr_storage) },
+ [IWPM_NLA_RMANAGE_MAPPING_ERR] = { .type = NLA_U16 }
};
-/*
- * iwpm_add_mapping_cb - Process a port mapper response to
- * iwpm_add_mapping()
+/**
+ * iwpm_add_mapping_cb - Process the port mapper response to
+ * iwpm_add_mapping request
+ * @skb:
+ * @cb: Contains the received message (payload and netlink header)
*/
int iwpm_add_mapping_cb(struct sk_buff *skb, struct netlink_callback *cb)
{
@@ -430,7 +491,7 @@ int iwpm_add_mapping_cb(struct sk_buff *skb, struct netlink_callback *cb)
atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq);
- msg_seq = nla_get_u32(nltb[IWPM_NLA_MANAGE_MAPPING_SEQ]);
+ msg_seq = nla_get_u32(nltb[IWPM_NLA_RMANAGE_MAPPING_SEQ]);
nlmsg_request = iwpm_find_nlmsg_request(msg_seq);
if (!nlmsg_request) {
pr_info("%s: Could not find a matching request (seq = %u)\n",
@@ -439,9 +500,9 @@ int iwpm_add_mapping_cb(struct sk_buff *skb, struct netlink_callback *cb)
}
pm_msg = nlmsg_request->req_buffer;
local_sockaddr = (struct sockaddr_storage *)
- nla_data(nltb[IWPM_NLA_MANAGE_ADDR]);
+ nla_data(nltb[IWPM_NLA_RMANAGE_ADDR]);
mapped_sockaddr = (struct sockaddr_storage *)
- nla_data(nltb[IWPM_NLA_MANAGE_MAPPED_LOC_ADDR]);
+ nla_data(nltb[IWPM_NLA_RMANAGE_MAPPED_LOC_ADDR]);
if (iwpm_compare_sockaddr(local_sockaddr, &pm_msg->loc_addr)) {
nlmsg_request->err_code = IWPM_USER_LIB_INFO_ERR;
@@ -472,17 +533,23 @@ add_mapping_response_exit:
/* netlink attribute policy for the response to add and query mapping request
* and response with remote address info */
static const struct nla_policy resp_query_policy[IWPM_NLA_RQUERY_MAPPING_MAX] = {
- [IWPM_NLA_QUERY_MAPPING_SEQ] = { .type = NLA_U32 },
- [IWPM_NLA_QUERY_LOCAL_ADDR] = { .len = sizeof(struct sockaddr_storage) },
- [IWPM_NLA_QUERY_REMOTE_ADDR] = { .len = sizeof(struct sockaddr_storage) },
- [IWPM_NLA_RQUERY_MAPPED_LOC_ADDR] = { .len = sizeof(struct sockaddr_storage) },
- [IWPM_NLA_RQUERY_MAPPED_REM_ADDR] = { .len = sizeof(struct sockaddr_storage) },
+ [IWPM_NLA_RQUERY_MAPPING_SEQ] = { .type = NLA_U32 },
+ [IWPM_NLA_RQUERY_LOCAL_ADDR] = {
+ .len = sizeof(struct sockaddr_storage) },
+ [IWPM_NLA_RQUERY_REMOTE_ADDR] = {
+ .len = sizeof(struct sockaddr_storage) },
+ [IWPM_NLA_RQUERY_MAPPED_LOC_ADDR] = {
+ .len = sizeof(struct sockaddr_storage) },
+ [IWPM_NLA_RQUERY_MAPPED_REM_ADDR] = {
+ .len = sizeof(struct sockaddr_storage) },
[IWPM_NLA_RQUERY_MAPPING_ERR] = { .type = NLA_U16 }
};
-/*
- * iwpm_add_and_query_mapping_cb - Process a port mapper response to
- * iwpm_add_and_query_mapping()
+/**
+ * iwpm_add_and_query_mapping_cb - Process the port mapper response to
+ * iwpm_add_and_query_mapping request
+ * @skb:
+ * @cb: Contains the received message (payload and netlink header)
*/
int iwpm_add_and_query_mapping_cb(struct sk_buff *skb,
struct netlink_callback *cb)
@@ -502,7 +569,7 @@ int iwpm_add_and_query_mapping_cb(struct sk_buff *skb,
return -EINVAL;
atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq);
- msg_seq = nla_get_u32(nltb[IWPM_NLA_QUERY_MAPPING_SEQ]);
+ msg_seq = nla_get_u32(nltb[IWPM_NLA_RQUERY_MAPPING_SEQ]);
nlmsg_request = iwpm_find_nlmsg_request(msg_seq);
if (!nlmsg_request) {
pr_info("%s: Could not find a matching request (seq = %u)\n",
@@ -511,9 +578,9 @@ int iwpm_add_and_query_mapping_cb(struct sk_buff *skb,
}
pm_msg = nlmsg_request->req_buffer;
local_sockaddr = (struct sockaddr_storage *)
- nla_data(nltb[IWPM_NLA_QUERY_LOCAL_ADDR]);
+ nla_data(nltb[IWPM_NLA_RQUERY_LOCAL_ADDR]);
remote_sockaddr = (struct sockaddr_storage *)
- nla_data(nltb[IWPM_NLA_QUERY_REMOTE_ADDR]);
+ nla_data(nltb[IWPM_NLA_RQUERY_REMOTE_ADDR]);
mapped_loc_sockaddr = (struct sockaddr_storage *)
nla_data(nltb[IWPM_NLA_RQUERY_MAPPED_LOC_ADDR]);
mapped_rem_sockaddr = (struct sockaddr_storage *)
@@ -560,9 +627,13 @@ query_mapping_response_exit:
return 0;
}
-/*
- * iwpm_remote_info_cb - Process a port mapper message, containing
- * the remote connecting peer address info
+/**
+ * iwpm_remote_info_cb - Process remote connecting peer address info, which
+ * the port mapper has received from the connecting peer
+ * @skb:
+ * @cb: Contains the received message (payload and netlink header)
+ *
+ * Stores the IPv4/IPv6 address info in a hash table
*/
int iwpm_remote_info_cb(struct sk_buff *skb, struct netlink_callback *cb)
{
@@ -588,9 +659,9 @@ int iwpm_remote_info_cb(struct sk_buff *skb, struct netlink_callback *cb)
atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq);
local_sockaddr = (struct sockaddr_storage *)
- nla_data(nltb[IWPM_NLA_QUERY_LOCAL_ADDR]);
+ nla_data(nltb[IWPM_NLA_RQUERY_LOCAL_ADDR]);
remote_sockaddr = (struct sockaddr_storage *)
- nla_data(nltb[IWPM_NLA_QUERY_REMOTE_ADDR]);
+ nla_data(nltb[IWPM_NLA_RQUERY_REMOTE_ADDR]);
mapped_loc_sockaddr = (struct sockaddr_storage *)
nla_data(nltb[IWPM_NLA_RQUERY_MAPPED_LOC_ADDR]);
mapped_rem_sockaddr = (struct sockaddr_storage *)
@@ -635,8 +706,14 @@ static const struct nla_policy resp_mapinfo_policy[IWPM_NLA_MAPINFO_REQ_MAX] = {
[IWPM_NLA_MAPINFO_ULIB_VER] = { .type = NLA_U16 }
};
-/*
- * iwpm_mapping_info_cb - Process a port mapper request for mapping info
+/**
+ * iwpm_mapping_info_cb - Process a notification that the userspace
+ * port mapper daemon is started
+ * @skb:
+ * @cb: Contains the received message (payload and netlink header)
+ *
+ * Using the received port mapper pid, send all the local mapping
+ * info records to the userspace port mapper
*/
int iwpm_mapping_info_cb(struct sk_buff *skb, struct netlink_callback *cb)
{
@@ -655,7 +732,7 @@ int iwpm_mapping_info_cb(struct sk_buff *skb, struct netlink_callback *cb)
iwpm_name = (char *)nla_data(nltb[IWPM_NLA_MAPINFO_ULIB_NAME]);
iwpm_version = nla_get_u16(nltb[IWPM_NLA_MAPINFO_ULIB_VER]);
if (strcmp(iwpm_ulib_name, iwpm_name) ||
- iwpm_version != iwpm_ulib_version) {
+ iwpm_version < IWPM_UABI_VERSION_MIN) {
pr_info("%s: Invalid port mapper name = %s version = %d\n",
__func__, iwpm_name, iwpm_version);
return ret;
@@ -669,6 +746,11 @@ int iwpm_mapping_info_cb(struct sk_buff *skb, struct netlink_callback *cb)
iwpm_set_registration(nl_client, IWPM_REG_INCOMPL);
atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq);
iwpm_user_pid = cb->nlh->nlmsg_pid;
+
+ if (iwpm_ulib_version < IWPM_UABI_VERSION)
+ pr_warn_once("%s: Down level iwpmd/pid %u. Continuing...",
+ __func__, iwpm_user_pid);
+
if (!iwpm_mapinfo_available())
return 0;
pr_debug("%s: iWarp Port Mapper (pid = %d) is available!\n",
@@ -684,9 +766,11 @@ static const struct nla_policy ack_mapinfo_policy[IWPM_NLA_MAPINFO_NUM_MAX] = {
[IWPM_NLA_MAPINFO_ACK_NUM] = { .type = NLA_U32 }
};
-/*
- * iwpm_ack_mapping_info_cb - Process a port mapper ack for
- * the provided mapping info records
+/**
+ * iwpm_ack_mapping_info_cb - Process the port mapper ack for
+ * the provided local mapping info records
+ * @skb:
+ * @cb: Contains the received message (payload and netlink header)
*/
int iwpm_ack_mapping_info_cb(struct sk_buff *skb, struct netlink_callback *cb)
{
@@ -712,8 +796,11 @@ static const struct nla_policy map_error_policy[IWPM_NLA_ERR_MAX] = {
[IWPM_NLA_ERR_CODE] = { .type = NLA_U16 },
};
-/*
- * iwpm_mapping_error_cb - Process a port mapper error message
+/**
+ * iwpm_mapping_error_cb - Process port mapper notification for error
+ *
+ * @skb:
+ * @cb: Contains the received message (payload and netlink header)
*/
int iwpm_mapping_error_cb(struct sk_buff *skb, struct netlink_callback *cb)
{
@@ -748,3 +835,46 @@ int iwpm_mapping_error_cb(struct sk_buff *skb, struct netlink_callback *cb)
up(&nlmsg_request->sem);
return 0;
}
+
+/* netlink attribute policy for the received hello request */
+static const struct nla_policy hello_policy[IWPM_NLA_HELLO_MAX] = {
+ [IWPM_NLA_HELLO_ABI_VERSION] = { .type = NLA_U16 }
+};
+
+/**
+ * iwpm_hello_cb - Process a hello message from iwpmd
+ *
+ * @skb:
+ * @cb: Contains the received message (payload and netlink header)
+ *
+ * Using the received port mapper pid, send the kernel's abi_version
+ * after adjusting it to support the iwpmd version.
+ */
+int iwpm_hello_cb(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct nlattr *nltb[IWPM_NLA_HELLO_MAX];
+ const char *msg_type = "Hello request";
+ u8 nl_client;
+ u16 abi_version;
+ int ret = -EINVAL;
+
+ if (iwpm_parse_nlmsg(cb, IWPM_NLA_HELLO_MAX, hello_policy, nltb,
+ msg_type)) {
+ pr_info("%s: Unable to parse nlmsg\n", __func__);
+ return ret;
+ }
+ abi_version = nla_get_u16(nltb[IWPM_NLA_HELLO_ABI_VERSION]);
+ nl_client = RDMA_NL_GET_CLIENT(cb->nlh->nlmsg_type);
+ if (!iwpm_valid_client(nl_client)) {
+ pr_info("%s: Invalid port mapper client = %d\n",
+ __func__, nl_client);
+ return ret;
+ }
+ iwpm_set_registration(nl_client, IWPM_REG_INCOMPL);
+ atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq);
+ iwpm_ulib_version = min_t(u16, IWPM_UABI_VERSION, abi_version);
+ pr_debug("Using ABI version %u\n", iwpm_ulib_version);
+ iwpm_user_pid = cb->nlh->nlmsg_pid;
+ ret = iwpm_send_hello(nl_client, iwpm_user_pid, iwpm_ulib_version);
+ return ret;
+}
diff --git a/drivers/infiniband/core/iwpm_util.c b/drivers/infiniband/core/iwpm_util.c
index cdb63f3f4de7..a5d2a20ee697 100644
--- a/drivers/infiniband/core/iwpm_util.c
+++ b/drivers/infiniband/core/iwpm_util.c
@@ -51,6 +51,12 @@ static DEFINE_SPINLOCK(iwpm_reminfo_lock);
static DEFINE_MUTEX(iwpm_admin_lock);
static struct iwpm_admin_data iwpm_admin;
+/**
+ * iwpm_init - Allocate resources for the iwarp port mapper
+ * @nl_client: The index of the netlink client
+ *
+ * Should be called when network interface goes up.
+ */
int iwpm_init(u8 nl_client)
{
int ret = 0;
@@ -87,6 +93,12 @@ init_exit:
static void free_hash_bucket(void);
static void free_reminfo_bucket(void);
+/**
+ * iwpm_exit - Deallocate resources for the iwarp port mapper
+ * @nl_client: The index of the netlink client
+ *
+ * Should be called when network interface goes down.
+ */
int iwpm_exit(u8 nl_client)
{
@@ -112,9 +124,17 @@ int iwpm_exit(u8 nl_client)
static struct hlist_head *get_mapinfo_hash_bucket(struct sockaddr_storage *,
struct sockaddr_storage *);
+/**
+ * iwpm_create_mapinfo - Store local and mapped IPv4/IPv6 address
+ * info in a hash table
+ * @local_sockaddr: Local ip/tcp address
+ * @mapped_sockaddr: Mapped local ip/tcp address
+ * @nl_client: The index of the netlink client
+ * @map_flags: IWPM mapping flags
+ */
int iwpm_create_mapinfo(struct sockaddr_storage *local_sockaddr,
struct sockaddr_storage *mapped_sockaddr,
- u8 nl_client)
+ u8 nl_client, u32 map_flags)
{
struct hlist_head *hash_bucket_head = NULL;
struct iwpm_mapping_info *map_info;
@@ -132,6 +152,7 @@ int iwpm_create_mapinfo(struct sockaddr_storage *local_sockaddr,
memcpy(&map_info->mapped_sockaddr, mapped_sockaddr,
sizeof(struct sockaddr_storage));
map_info->nl_client = nl_client;
+ map_info->map_flags = map_flags;
spin_lock_irqsave(&iwpm_mapinfo_lock, flags);
if (iwpm_hash_bucket) {
@@ -150,6 +171,15 @@ int iwpm_create_mapinfo(struct sockaddr_storage *local_sockaddr,
return ret;
}
+/**
+ * iwpm_remove_mapinfo - Remove local and mapped IPv4/IPv6 address
+ * info from the hash table
+ * @local_sockaddr: Local ip/tcp address
+ * @mapped_local_addr: Mapped local ip/tcp address
+ *
+ * Returns err code if mapping info is not found in the hash table,
+ * otherwise returns 0
+ */
int iwpm_remove_mapinfo(struct sockaddr_storage *local_sockaddr,
struct sockaddr_storage *mapped_local_addr)
{
@@ -250,6 +280,17 @@ void iwpm_add_remote_info(struct iwpm_remote_info *rem_info)
spin_unlock_irqrestore(&iwpm_reminfo_lock, flags);
}
+/**
+ * iwpm_get_remote_info - Get the remote connecting peer address info
+ *
+ * @mapped_loc_addr: Mapped local address of the listening peer
+ * @mapped_rem_addr: Mapped remote address of the connecting peer
+ * @remote_addr: To store the remote address of the connecting peer
+ * @nl_client: The index of the netlink client
+ *
+ * The remote address info is retrieved and provided to the client in
+ * the remote_addr. After that it is removed from the hash table
+ */
int iwpm_get_remote_info(struct sockaddr_storage *mapped_loc_addr,
struct sockaddr_storage *mapped_rem_addr,
struct sockaddr_storage *remote_addr,
@@ -686,6 +727,14 @@ int iwpm_send_mapinfo(u8 nl_client, int iwpm_pid)
if (ret)
goto send_mapping_info_unlock;
+ if (iwpm_ulib_version > IWPM_UABI_VERSION_MIN) {
+ ret = ibnl_put_attr(skb, nlh, sizeof(u32),
+ &map_info->map_flags,
+ IWPM_NLA_MAPINFO_FLAGS);
+ if (ret)
+ goto send_mapping_info_unlock;
+ }
+
nlmsg_end(skb, nlh);
iwpm_print_sockaddr(&map_info->local_sockaddr,
@@ -754,3 +803,38 @@ int iwpm_mapinfo_available(void)
spin_unlock_irqrestore(&iwpm_mapinfo_lock, flags);
return full_bucket;
}
+
+int iwpm_send_hello(u8 nl_client, int iwpm_pid, u16 abi_version)
+{
+ struct sk_buff *skb = NULL;
+ struct nlmsghdr *nlh;
+ const char *err_str = "";
+ int ret = -EINVAL;
+
+ skb = iwpm_create_nlmsg(RDMA_NL_IWPM_HELLO, &nlh, nl_client);
+ if (!skb) {
+ err_str = "Unable to create a nlmsg";
+ goto hello_num_error;
+ }
+ nlh->nlmsg_seq = iwpm_get_nlmsg_seq();
+ err_str = "Unable to put attribute of abi_version into nlmsg";
+ ret = ibnl_put_attr(skb, nlh, sizeof(u16), &abi_version,
+ IWPM_NLA_HELLO_ABI_VERSION);
+ if (ret)
+ goto hello_num_error;
+ nlmsg_end(skb, nlh);
+
+ ret = rdma_nl_unicast(skb, iwpm_pid);
+ if (ret) {
+ skb = NULL;
+ err_str = "Unable to send a nlmsg";
+ goto hello_num_error;
+ }
+ pr_debug("%s: Sent hello abi_version = %u\n", __func__, abi_version);
+ return 0;
+hello_num_error:
+ pr_info("%s: %s\n", __func__, err_str);
+ if (skb)
+ dev_kfree_skb(skb);
+ return ret;
+}
diff --git a/drivers/infiniband/core/iwpm_util.h b/drivers/infiniband/core/iwpm_util.h
index af1fc14a0d3d..7e2bcc72f66c 100644
--- a/drivers/infiniband/core/iwpm_util.h
+++ b/drivers/infiniband/core/iwpm_util.h
@@ -78,6 +78,7 @@ struct iwpm_mapping_info {
struct sockaddr_storage local_sockaddr;
struct sockaddr_storage mapped_sockaddr;
u8 nl_client;
+ u32 map_flags;
};
struct iwpm_remote_info {
@@ -266,4 +267,15 @@ int iwpm_parse_nlmsg(struct netlink_callback *cb, int policy_max,
* @msg: Message to print
*/
void iwpm_print_sockaddr(struct sockaddr_storage *sockaddr, char *msg);
+
+/**
+ * iwpm_send_hello - Send hello response to iwpmd
+ * @nl_client: The index of the netlink client
+ * @iwpm_pid: The pid of the userspace port mapper the message is sent to
+ * @abi_version: The kernel's abi_version
+ *
+ * Returns 0 on success or a negative error code
+ */
+int iwpm_send_hello(u8 nl_client, int iwpm_pid, u16 abi_version);
+extern u16 iwpm_ulib_version;
#endif
diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c
index d7025cd5be28..e742a6a2c138 100644
--- a/drivers/infiniband/core/mad.c
+++ b/drivers/infiniband/core/mad.c
@@ -888,10 +888,10 @@ static int handle_outgoing_dr_smp(struct ib_mad_agent_private *mad_agent_priv,
}
/* No GRH for DR SMP */
- ret = device->process_mad(device, 0, port_num, &mad_wc, NULL,
- (const struct ib_mad_hdr *)smp, mad_size,
- (struct ib_mad_hdr *)mad_priv->mad,
- &mad_size, &out_mad_pkey_index);
+ ret = device->ops.process_mad(device, 0, port_num, &mad_wc, NULL,
+ (const struct ib_mad_hdr *)smp, mad_size,
+ (struct ib_mad_hdr *)mad_priv->mad,
+ &mad_size, &out_mad_pkey_index);
switch (ret)
{
case IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY:
@@ -2305,14 +2305,12 @@ static void ib_mad_recv_done(struct ib_cq *cq, struct ib_wc *wc)
}
/* Give driver "right of first refusal" on incoming MAD */
- if (port_priv->device->process_mad) {
- ret = port_priv->device->process_mad(port_priv->device, 0,
- port_priv->port_num,
- wc, &recv->grh,
- (const struct ib_mad_hdr *)recv->mad,
- recv->mad_size,
- (struct ib_mad_hdr *)response->mad,
- &mad_size, &resp_mad_pkey_index);
+ if (port_priv->device->ops.process_mad) {
+ ret = port_priv->device->ops.process_mad(
+ port_priv->device, 0, port_priv->port_num, wc,
+ &recv->grh, (const struct ib_mad_hdr *)recv->mad,
+ recv->mad_size, (struct ib_mad_hdr *)response->mad,
+ &mad_size, &resp_mad_pkey_index);
if (opa)
wc->pkey_index = resp_mad_pkey_index;
@@ -3328,9 +3326,9 @@ error:
static void ib_mad_remove_device(struct ib_device *device, void *client_data)
{
- int i;
+ unsigned int i;
- for (i = rdma_start_port(device); i <= rdma_end_port(device); i++) {
+ rdma_for_each_port (device, i) {
if (!rdma_cap_ib_mad(device, i))
continue;
diff --git a/drivers/infiniband/core/mad_rmpp.c b/drivers/infiniband/core/mad_rmpp.c
index e5cf09c66fe6..5ec57abc0849 100644
--- a/drivers/infiniband/core/mad_rmpp.c
+++ b/drivers/infiniband/core/mad_rmpp.c
@@ -81,7 +81,7 @@ static void destroy_rmpp_recv(struct mad_rmpp_recv *rmpp_recv)
{
deref_rmpp_recv(rmpp_recv);
wait_for_completion(&rmpp_recv->comp);
- rdma_destroy_ah(rmpp_recv->ah);
+ rdma_destroy_ah(rmpp_recv->ah, RDMA_DESTROY_AH_SLEEPABLE);
kfree(rmpp_recv);
}
@@ -171,7 +171,7 @@ static struct ib_mad_send_buf *alloc_response_msg(struct ib_mad_agent *agent,
hdr_len, 0, GFP_KERNEL,
IB_MGMT_BASE_VERSION);
if (IS_ERR(msg))
- rdma_destroy_ah(ah);
+ rdma_destroy_ah(ah, RDMA_DESTROY_AH_SLEEPABLE);
else {
msg->ah = ah;
msg->context[0] = ah;
@@ -201,7 +201,7 @@ static void ack_ds_ack(struct ib_mad_agent_private *agent,
ret = ib_post_send_mad(msg, NULL);
if (ret) {
- rdma_destroy_ah(msg->ah);
+ rdma_destroy_ah(msg->ah, RDMA_DESTROY_AH_SLEEPABLE);
ib_free_send_mad(msg);
}
}
@@ -209,7 +209,8 @@ static void ack_ds_ack(struct ib_mad_agent_private *agent,
void ib_rmpp_send_handler(struct ib_mad_send_wc *mad_send_wc)
{
if (mad_send_wc->send_buf->context[0] == mad_send_wc->send_buf->ah)
- rdma_destroy_ah(mad_send_wc->send_buf->ah);
+ rdma_destroy_ah(mad_send_wc->send_buf->ah,
+ RDMA_DESTROY_AH_SLEEPABLE);
ib_free_send_mad(mad_send_wc->send_buf);
}
@@ -237,7 +238,7 @@ static void nack_recv(struct ib_mad_agent_private *agent,
ret = ib_post_send_mad(msg, NULL);
if (ret) {
- rdma_destroy_ah(msg->ah);
+ rdma_destroy_ah(msg->ah, RDMA_DESTROY_AH_SLEEPABLE);
ib_free_send_mad(msg);
}
}
diff --git a/drivers/infiniband/core/netlink.c b/drivers/infiniband/core/netlink.c
index 724f5a62e82f..eecfc0b377c9 100644
--- a/drivers/infiniband/core/netlink.c
+++ b/drivers/infiniband/core/netlink.c
@@ -56,7 +56,6 @@ EXPORT_SYMBOL(rdma_nl_chk_listeners);
static bool is_nl_msg_valid(unsigned int type, unsigned int op)
{
static const unsigned int max_num_ops[RDMA_NL_NUM_CLIENTS] = {
- [RDMA_NL_RDMA_CM] = RDMA_NL_RDMA_CM_NUM_OPS,
[RDMA_NL_IWCM] = RDMA_NL_IWPM_NUM_OPS,
[RDMA_NL_LS] = RDMA_NL_LS_NUM_OPS,
[RDMA_NL_NLDEV] = RDMA_NLDEV_NUM_OPS,
@@ -181,8 +180,7 @@ static int rdma_nl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh,
return -EINVAL;
}
/* FIXME: Convert IWCM to properly handle doit callbacks */
- if ((nlh->nlmsg_flags & NLM_F_DUMP) || index == RDMA_NL_RDMA_CM ||
- index == RDMA_NL_IWCM) {
+ if ((nlh->nlmsg_flags & NLM_F_DUMP) || index == RDMA_NL_IWCM) {
struct netlink_dump_control c = {
.dump = cb_table[op].dump,
};
diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c
index 573399e3ccc1..11ed58d3fce5 100644
--- a/drivers/infiniband/core/nldev.c
+++ b/drivers/infiniband/core/nldev.c
@@ -33,12 +33,14 @@
#include <linux/module.h>
#include <linux/pid.h>
#include <linux/pid_namespace.h>
+#include <linux/mutex.h>
#include <net/netlink.h>
#include <rdma/rdma_cm.h>
#include <rdma/rdma_netlink.h>
#include "core_priv.h"
#include "cma_priv.h"
+#include "restrack.h"
static const struct nla_policy nldev_policy[RDMA_NLDEV_ATTR_MAX] = {
[RDMA_NLDEV_ATTR_DEV_INDEX] = { .type = NLA_U32 },
@@ -107,6 +109,13 @@ static const struct nla_policy nldev_policy[RDMA_NLDEV_ATTR_MAX] = {
[RDMA_NLDEV_ATTR_DRIVER_U32] = { .type = NLA_U32 },
[RDMA_NLDEV_ATTR_DRIVER_S64] = { .type = NLA_S64 },
[RDMA_NLDEV_ATTR_DRIVER_U64] = { .type = NLA_U64 },
+ [RDMA_NLDEV_ATTR_RES_PDN] = { .type = NLA_U32 },
+ [RDMA_NLDEV_ATTR_RES_CQN] = { .type = NLA_U32 },
+ [RDMA_NLDEV_ATTR_RES_MRN] = { .type = NLA_U32 },
+ [RDMA_NLDEV_ATTR_RES_CM_IDN] = { .type = NLA_U32 },
+ [RDMA_NLDEV_ATTR_RES_CTXN] = { .type = NLA_U32 },
+ [RDMA_NLDEV_ATTR_LINK_TYPE] = { .type = NLA_NUL_STRING,
+ .len = RDMA_NLDEV_ATTR_ENTRY_STRLEN },
};
static int put_driver_name_print_type(struct sk_buff *msg, const char *name,
@@ -227,6 +236,7 @@ static int fill_port_info(struct sk_buff *msg,
struct net_device *netdev = NULL;
struct ib_port_attr attr;
int ret;
+ u64 cap_flags = 0;
if (fill_nldev_handle(msg, device))
return -EMSGSIZE;
@@ -239,10 +249,12 @@ static int fill_port_info(struct sk_buff *msg,
return ret;
if (rdma_protocol_ib(device, port)) {
- BUILD_BUG_ON(sizeof(attr.port_cap_flags) > sizeof(u64));
+ BUILD_BUG_ON((sizeof(attr.port_cap_flags) +
+ sizeof(attr.port_cap_flags2)) > sizeof(u64));
+ cap_flags = attr.port_cap_flags |
+ ((u64)attr.port_cap_flags2 << 32);
if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_CAP_FLAGS,
- (u64)attr.port_cap_flags,
- RDMA_NLDEV_ATTR_PAD))
+ cap_flags, RDMA_NLDEV_ATTR_PAD))
return -EMSGSIZE;
if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_SUBNET_PREFIX,
attr.subnet_prefix, RDMA_NLDEV_ATTR_PAD))
@@ -259,9 +271,7 @@ static int fill_port_info(struct sk_buff *msg,
if (nla_put_u8(msg, RDMA_NLDEV_ATTR_PORT_PHYS_STATE, attr.phys_state))
return -EMSGSIZE;
- if (device->get_netdev)
- netdev = device->get_netdev(device, port);
-
+ netdev = ib_device_get_netdev(device, port);
if (netdev && net_eq(dev_net(netdev), net)) {
ret = nla_put_u32(msg,
RDMA_NLDEV_ATTR_NDEV_INDEX, netdev->ifindex);
@@ -308,9 +318,9 @@ static int fill_res_info(struct sk_buff *msg, struct ib_device *device)
[RDMA_RESTRACK_QP] = "qp",
[RDMA_RESTRACK_CM_ID] = "cm_id",
[RDMA_RESTRACK_MR] = "mr",
+ [RDMA_RESTRACK_CTX] = "ctx",
};
- struct rdma_restrack_root *res = &device->res;
struct nlattr *table_attr;
int ret, i, curr;
@@ -324,7 +334,8 @@ static int fill_res_info(struct sk_buff *msg, struct ib_device *device)
for (i = 0; i < RDMA_RESTRACK_MAX; i++) {
if (!names[i])
continue;
- curr = rdma_restrack_count(res, i, task_active_pid_ns(current));
+ curr = rdma_restrack_count(device, i,
+ task_active_pid_ns(current));
ret = fill_res_info_entry(msg, names[i], curr);
if (ret)
goto err;
@@ -357,13 +368,20 @@ static int fill_res_name_pid(struct sk_buff *msg,
return 0;
}
-static int fill_res_qp_entry(struct sk_buff *msg, struct netlink_callback *cb,
+/*
+ * Run the driver's optional fill_res_entry op for @res on @msg.
+ *
+ * Returns false when @dev supplies no such op; otherwise returns the op's
+ * result converted to bool, which callers in this file treat as failure
+ * when true.
+ */
+static bool fill_res_entry(struct ib_device *dev, struct sk_buff *msg,
+			   struct rdma_restrack_entry *res)
+{
+	if (dev->ops.fill_res_entry)
+		return dev->ops.fill_res_entry(msg, res);
+
+	return false;
+}
+
+static int fill_res_qp_entry(struct sk_buff *msg, bool has_cap_net_admin,
struct rdma_restrack_entry *res, uint32_t port)
{
struct ib_qp *qp = container_of(res, struct ib_qp, res);
- struct rdma_restrack_root *resroot = &qp->device->res;
+ struct ib_device *dev = qp->device;
struct ib_qp_init_attr qp_init_attr;
- struct nlattr *entry_attr;
struct ib_qp_attr qp_attr;
int ret;
@@ -372,11 +390,7 @@ static int fill_res_qp_entry(struct sk_buff *msg, struct netlink_callback *cb,
return ret;
if (port && port != qp_attr.port_num)
- return 0;
-
- entry_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_RES_QP_ENTRY);
- if (!entry_attr)
- goto out;
+ return -EAGAIN;
/* In create_qp() port is not set yet */
if (qp_attr.port_num &&
@@ -408,38 +422,32 @@ static int fill_res_qp_entry(struct sk_buff *msg, struct netlink_callback *cb,
if (nla_put_u8(msg, RDMA_NLDEV_ATTR_RES_STATE, qp_attr.qp_state))
goto err;
+ if (!rdma_is_kernel_res(res) &&
+ nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_PDN, qp->pd->res.id))
+ goto err;
+
if (fill_res_name_pid(msg, res))
goto err;
- if (resroot->fill_res_entry(msg, res))
+ if (fill_res_entry(dev, msg, res))
goto err;
- nla_nest_end(msg, entry_attr);
return 0;
-err:
- nla_nest_cancel(msg, entry_attr);
-out:
- return -EMSGSIZE;
+err: return -EMSGSIZE;
}
-static int fill_res_cm_id_entry(struct sk_buff *msg,
- struct netlink_callback *cb,
+static int fill_res_cm_id_entry(struct sk_buff *msg, bool has_cap_net_admin,
struct rdma_restrack_entry *res, uint32_t port)
{
struct rdma_id_private *id_priv =
container_of(res, struct rdma_id_private, res);
- struct rdma_restrack_root *resroot = &id_priv->id.device->res;
+ struct ib_device *dev = id_priv->id.device;
struct rdma_cm_id *cm_id = &id_priv->id;
- struct nlattr *entry_attr;
if (port && port != cm_id->port_num)
return 0;
- entry_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_RES_CM_ID_ENTRY);
- if (!entry_attr)
- goto out;
-
if (cm_id->port_num &&
nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, cm_id->port_num))
goto err;
@@ -468,31 +476,25 @@ static int fill_res_cm_id_entry(struct sk_buff *msg,
&cm_id->route.addr.dst_addr))
goto err;
+ if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_CM_IDN, res->id))
+ goto err;
+
if (fill_res_name_pid(msg, res))
goto err;
- if (resroot->fill_res_entry(msg, res))
+ if (fill_res_entry(dev, msg, res))
goto err;
- nla_nest_end(msg, entry_attr);
return 0;
-err:
- nla_nest_cancel(msg, entry_attr);
-out:
- return -EMSGSIZE;
+err: return -EMSGSIZE;
}
-static int fill_res_cq_entry(struct sk_buff *msg, struct netlink_callback *cb,
+static int fill_res_cq_entry(struct sk_buff *msg, bool has_cap_net_admin,
struct rdma_restrack_entry *res, uint32_t port)
{
struct ib_cq *cq = container_of(res, struct ib_cq, res);
- struct rdma_restrack_root *resroot = &cq->device->res;
- struct nlattr *entry_attr;
-
- entry_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_RES_CQ_ENTRY);
- if (!entry_attr)
- goto out;
+ struct ib_device *dev = cq->device;
if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_CQE, cq->cqe))
goto err;
@@ -505,33 +507,31 @@ static int fill_res_cq_entry(struct sk_buff *msg, struct netlink_callback *cb,
nla_put_u8(msg, RDMA_NLDEV_ATTR_RES_POLL_CTX, cq->poll_ctx))
goto err;
+ if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_CQN, res->id))
+ goto err;
+ if (!rdma_is_kernel_res(res) &&
+ nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_CTXN,
+ cq->uobject->context->res.id))
+ goto err;
+
if (fill_res_name_pid(msg, res))
goto err;
- if (resroot->fill_res_entry(msg, res))
+ if (fill_res_entry(dev, msg, res))
goto err;
- nla_nest_end(msg, entry_attr);
return 0;
-err:
- nla_nest_cancel(msg, entry_attr);
-out:
- return -EMSGSIZE;
+err: return -EMSGSIZE;
}
-static int fill_res_mr_entry(struct sk_buff *msg, struct netlink_callback *cb,
+static int fill_res_mr_entry(struct sk_buff *msg, bool has_cap_net_admin,
struct rdma_restrack_entry *res, uint32_t port)
{
struct ib_mr *mr = container_of(res, struct ib_mr, res);
- struct rdma_restrack_root *resroot = &mr->pd->device->res;
- struct nlattr *entry_attr;
-
- entry_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_RES_MR_ENTRY);
- if (!entry_attr)
- goto out;
+ struct ib_device *dev = mr->pd->device;
- if (netlink_capable(cb->skb, CAP_NET_ADMIN)) {
+ if (has_cap_net_admin) {
if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_RKEY, mr->rkey))
goto err;
if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_LKEY, mr->lkey))
@@ -542,33 +542,31 @@ static int fill_res_mr_entry(struct sk_buff *msg, struct netlink_callback *cb,
RDMA_NLDEV_ATTR_PAD))
goto err;
+ if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_MRN, res->id))
+ goto err;
+
+ if (!rdma_is_kernel_res(res) &&
+ nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_PDN, mr->pd->res.id))
+ goto err;
+
if (fill_res_name_pid(msg, res))
goto err;
- if (resroot->fill_res_entry(msg, res))
+ if (fill_res_entry(dev, msg, res))
goto err;
- nla_nest_end(msg, entry_attr);
return 0;
-err:
- nla_nest_cancel(msg, entry_attr);
-out:
- return -EMSGSIZE;
+err: return -EMSGSIZE;
}
-static int fill_res_pd_entry(struct sk_buff *msg, struct netlink_callback *cb,
+static int fill_res_pd_entry(struct sk_buff *msg, bool has_cap_net_admin,
struct rdma_restrack_entry *res, uint32_t port)
{
struct ib_pd *pd = container_of(res, struct ib_pd, res);
- struct rdma_restrack_root *resroot = &pd->device->res;
- struct nlattr *entry_attr;
-
- entry_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_RES_PD_ENTRY);
- if (!entry_attr)
- goto out;
+ struct ib_device *dev = pd->device;
- if (netlink_capable(cb->skb, CAP_NET_ADMIN)) {
+ if (has_cap_net_admin) {
if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_LOCAL_DMA_LKEY,
pd->local_dma_lkey))
goto err;
@@ -580,24 +578,24 @@ static int fill_res_pd_entry(struct sk_buff *msg, struct netlink_callback *cb,
if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_RES_USECNT,
atomic_read(&pd->usecnt), RDMA_NLDEV_ATTR_PAD))
goto err;
- if ((pd->flags & IB_PD_UNSAFE_GLOBAL_RKEY) &&
- nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_UNSAFE_GLOBAL_RKEY,
- pd->unsafe_global_rkey))
+
+ if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_PDN, res->id))
+ goto err;
+
+ if (!rdma_is_kernel_res(res) &&
+ nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_CTXN,
+ pd->uobject->context->res.id))
goto err;
if (fill_res_name_pid(msg, res))
goto err;
- if (resroot->fill_res_entry(msg, res))
+ if (fill_res_entry(dev, msg, res))
goto err;
- nla_nest_end(msg, entry_attr);
return 0;
-err:
- nla_nest_cancel(msg, entry_attr);
-out:
- return -EMSGSIZE;
+err: return -EMSGSIZE;
}
static int nldev_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
@@ -636,13 +634,13 @@ static int nldev_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
nlmsg_end(msg, nlh);
- put_device(&device->dev);
+ ib_device_put(device);
return rdma_nl_unicast(msg, NETLINK_CB(skb).portid);
err_free:
nlmsg_free(msg);
err:
- put_device(&device->dev);
+ ib_device_put(device);
return err;
}
@@ -672,7 +670,7 @@ static int nldev_set_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
err = ib_device_rename(device, name);
}
- put_device(&device->dev);
+ ib_device_put(device);
return err;
}
@@ -756,14 +754,14 @@ static int nldev_port_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
goto err_free;
nlmsg_end(msg, nlh);
- put_device(&device->dev);
+ ib_device_put(device);
return rdma_nl_unicast(msg, NETLINK_CB(skb).portid);
err_free:
nlmsg_free(msg);
err:
- put_device(&device->dev);
+ ib_device_put(device);
return err;
}
@@ -777,7 +775,7 @@ static int nldev_port_get_dumpit(struct sk_buff *skb,
u32 idx = 0;
u32 ifindex;
int err;
- u32 p;
+ unsigned int p;
err = nlmsg_parse(cb->nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
nldev_policy, NULL);
@@ -789,7 +787,7 @@ static int nldev_port_get_dumpit(struct sk_buff *skb,
if (!device)
return -EINVAL;
- for (p = rdma_start_port(device); p <= rdma_end_port(device); ++p) {
+ rdma_for_each_port (device, p) {
/*
* The dumpit function returns all information from specific
* index. This specific index is taken from the netlink
@@ -820,7 +818,7 @@ static int nldev_port_get_dumpit(struct sk_buff *skb,
}
out:
- put_device(&device->dev);
+ ib_device_put(device);
cb->args[0] = idx;
return skb->len;
}
@@ -859,13 +857,13 @@ static int nldev_res_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
goto err_free;
nlmsg_end(msg, nlh);
- put_device(&device->dev);
+ ib_device_put(device);
return rdma_nl_unicast(msg, NETLINK_CB(skb).portid);
err_free:
nlmsg_free(msg);
err:
- put_device(&device->dev);
+ ib_device_put(device);
return ret;
}
@@ -905,10 +903,17 @@ static int nldev_res_get_dumpit(struct sk_buff *skb,
}
struct nldev_fill_res_entry {
- int (*fill_res_func)(struct sk_buff *msg, struct netlink_callback *cb,
+ int (*fill_res_func)(struct sk_buff *msg, bool has_cap_net_admin,
struct rdma_restrack_entry *res, u32 port);
enum rdma_nldev_attr nldev_attr;
enum rdma_nldev_command nldev_cmd;
+ u8 flags;
+ u32 entry;
+ u32 id;
+};
+
+enum nldev_res_flags {
+ NLDEV_PER_DEV = 1 << 0,
};
static const struct nldev_fill_res_entry fill_entries[RDMA_RESTRACK_MAX] = {
@@ -916,29 +921,136 @@ static const struct nldev_fill_res_entry fill_entries[RDMA_RESTRACK_MAX] = {
.fill_res_func = fill_res_qp_entry,
.nldev_cmd = RDMA_NLDEV_CMD_RES_QP_GET,
.nldev_attr = RDMA_NLDEV_ATTR_RES_QP,
+ .entry = RDMA_NLDEV_ATTR_RES_QP_ENTRY,
+ .id = RDMA_NLDEV_ATTR_RES_LQPN,
},
[RDMA_RESTRACK_CM_ID] = {
.fill_res_func = fill_res_cm_id_entry,
.nldev_cmd = RDMA_NLDEV_CMD_RES_CM_ID_GET,
.nldev_attr = RDMA_NLDEV_ATTR_RES_CM_ID,
+ .entry = RDMA_NLDEV_ATTR_RES_CM_ID_ENTRY,
+ .id = RDMA_NLDEV_ATTR_RES_CM_IDN,
},
[RDMA_RESTRACK_CQ] = {
.fill_res_func = fill_res_cq_entry,
.nldev_cmd = RDMA_NLDEV_CMD_RES_CQ_GET,
.nldev_attr = RDMA_NLDEV_ATTR_RES_CQ,
+ .flags = NLDEV_PER_DEV,
+ .entry = RDMA_NLDEV_ATTR_RES_CQ_ENTRY,
+ .id = RDMA_NLDEV_ATTR_RES_CQN,
},
[RDMA_RESTRACK_MR] = {
.fill_res_func = fill_res_mr_entry,
.nldev_cmd = RDMA_NLDEV_CMD_RES_MR_GET,
.nldev_attr = RDMA_NLDEV_ATTR_RES_MR,
+ .flags = NLDEV_PER_DEV,
+ .entry = RDMA_NLDEV_ATTR_RES_MR_ENTRY,
+ .id = RDMA_NLDEV_ATTR_RES_MRN,
},
[RDMA_RESTRACK_PD] = {
.fill_res_func = fill_res_pd_entry,
.nldev_cmd = RDMA_NLDEV_CMD_RES_PD_GET,
.nldev_attr = RDMA_NLDEV_ATTR_RES_PD,
+ .flags = NLDEV_PER_DEV,
+ .entry = RDMA_NLDEV_ATTR_RES_PD_ENTRY,
+ .id = RDMA_NLDEV_ATTR_RES_PDN,
},
};
+/*
+ * Decide whether @res may be reported to the calling task.
+ *
+ * Kernel resources are visible only from the initial PID namespace; every
+ * other resource is visible only when the caller's active PID namespace is
+ * the same as that of res->task.
+ */
+static bool is_visible_in_pid_ns(struct rdma_restrack_entry *res)
+{
+	struct pid_namespace *caller_ns = task_active_pid_ns(current);
+
+	if (!rdma_is_kernel_res(res))
+		return caller_ns == task_active_pid_ns(res->task);
+
+	return caller_ns == &init_pid_ns;
+}
+
+/*
+ * res_get_common_doit() - answer a netlink request for one tracked resource.
+ * @skb: request buffer; its sender receives the unicast reply
+ * @nlh: request header
+ * @extack: extended ack passed to the attribute parser
+ * @res_type: restrack type, used to index fill_entries[]
+ *
+ * The request must carry RDMA_NLDEV_ATTR_DEV_INDEX and the per-type ID
+ * attribute (fe->id). Types flagged NLDEV_PER_DEV must not carry a port
+ * index; all other types must.
+ *
+ * Reference handling: the entry returned by rdma_restrack_get_byid() is put
+ * exactly once on every path - explicitly on success, and through err_get
+ * on every failure after the lookup.
+ *
+ * Returns the result of rdma_nl_unicast() on success. On failure returns
+ * -EINVAL (bad request, unknown device, invalid port, port/per-device
+ * mismatch), the lookup's error, -ENOENT (entry not visible in the caller's
+ * PID namespace), -ENOMEM, -EMSGSIZE, or the error from fe->fill_res_func.
+ */
+static int res_get_common_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
+			       struct netlink_ext_ack *extack,
+			       enum rdma_restrack_type res_type)
+{
+	const struct nldev_fill_res_entry *fe = &fill_entries[res_type];
+	struct nlattr *tb[RDMA_NLDEV_ATTR_MAX];
+	struct rdma_restrack_entry *res;
+	struct ib_device *device;
+	u32 index, id, port = 0;
+	bool has_cap_net_admin;
+	struct sk_buff *msg;
+	int ret;
+
+	ret = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
+			  nldev_policy, extack);
+	if (ret || !tb[RDMA_NLDEV_ATTR_DEV_INDEX] || !fe->id || !tb[fe->id])
+		return -EINVAL;
+
+	index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]);
+	device = ib_device_get_by_index(index);
+	if (!device)
+		return -EINVAL;
+
+	if (tb[RDMA_NLDEV_ATTR_PORT_INDEX]) {
+		port = nla_get_u32(tb[RDMA_NLDEV_ATTR_PORT_INDEX]);
+		if (!rdma_is_port_valid(device, port)) {
+			ret = -EINVAL;
+			goto err;
+		}
+	}
+
+	/* Reject a port on per-device types and a missing port on the rest. */
+	if ((port && fe->flags & NLDEV_PER_DEV) ||
+	    (!port && ~fe->flags & NLDEV_PER_DEV)) {
+		ret = -EINVAL;
+		goto err;
+	}
+
+	id = nla_get_u32(tb[fe->id]);
+	res = rdma_restrack_get_byid(device, res_type, id);
+	if (IS_ERR(res)) {
+		ret = PTR_ERR(res);
+		goto err;
+	}
+
+	if (!is_visible_in_pid_ns(res)) {
+		ret = -ENOENT;
+		goto err_get;
+	}
+
+	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!msg) {
+		ret = -ENOMEM;
+		/* res is still held here; leave through err_get to put it. */
+		goto err_get;
+	}
+
+	nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq,
+			RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, fe->nldev_cmd),
+			0, 0);
+
+	/* A NULL header must never reach nlmsg_end() below. */
+	if (!nlh || fill_nldev_handle(msg, device)) {
+		ret = -EMSGSIZE;
+		goto err_free;
+	}
+
+	has_cap_net_admin = netlink_capable(skb, CAP_NET_ADMIN);
+	ret = fe->fill_res_func(msg, has_cap_net_admin, res, port);
+	if (ret)
+		goto err_free;
+
+	/*
+	 * Drop the lookup reference only on success: the error labels below
+	 * fall through err_get, so putting before the check would put twice.
+	 */
+	rdma_restrack_put(res);
+	nlmsg_end(msg, nlh);
+	ib_device_put(device);
+	return rdma_nl_unicast(msg, NETLINK_CB(skb).portid);
+
+err_free:
+	nlmsg_free(msg);
+err_get:
+	rdma_restrack_put(res);
+err:
+	ib_device_put(device);
+	return ret;
+}
+
static int res_get_common_dumpit(struct sk_buff *skb,
struct netlink_callback *cb,
enum rdma_restrack_type res_type)
@@ -946,11 +1058,15 @@ static int res_get_common_dumpit(struct sk_buff *skb,
const struct nldev_fill_res_entry *fe = &fill_entries[res_type];
struct nlattr *tb[RDMA_NLDEV_ATTR_MAX];
struct rdma_restrack_entry *res;
+ struct rdma_restrack_root *rt;
int err, ret = 0, idx = 0;
struct nlattr *table_attr;
+ struct nlattr *entry_attr;
struct ib_device *device;
int start = cb->args[0];
+ bool has_cap_net_admin;
struct nlmsghdr *nlh;
+ unsigned long id;
u32 index, port = 0;
bool filled = false;
@@ -998,55 +1114,51 @@ static int res_get_common_dumpit(struct sk_buff *skb,
goto err;
}
- down_read(&device->res.rwsem);
- hash_for_each_possible(device->res.hash, res, node, res_type) {
- if (idx < start)
- goto next;
+ has_cap_net_admin = netlink_capable(cb->skb, CAP_NET_ADMIN);
- if ((rdma_is_kernel_res(res) &&
- task_active_pid_ns(current) != &init_pid_ns) ||
- (!rdma_is_kernel_res(res) && task_active_pid_ns(current) !=
- task_active_pid_ns(res->task)))
- /*
- * 1. Kern resources should be visible in init
- * namspace only
- * 2. Present only resources visible in the current
- * namespace
- */
- goto next;
+ rt = &device->res[res_type];
+ xa_lock(&rt->xa);
+ /*
+ * FIXME: if the skip ahead is something common this loop should
+ * use xas_for_each & xas_pause to optimize, we can have a lot of
+ * objects.
+ */
+ xa_for_each(&rt->xa, id, res) {
+ if (!is_visible_in_pid_ns(res))
+ continue;
- if (!rdma_restrack_get(res))
- /*
- * Resource is under release now, but we are not
- * relesing lock now, so it will be released in
- * our next pass, once we will get ->next pointer.
- */
+ if (idx < start || !rdma_restrack_get(res))
goto next;
+ xa_unlock(&rt->xa);
+
filled = true;
- up_read(&device->res.rwsem);
- ret = fe->fill_res_func(skb, cb, res, port);
- down_read(&device->res.rwsem);
- /*
- * Return resource back, but it won't be released till
- * the &device->res.rwsem will be released for write.
- */
+ entry_attr = nla_nest_start(skb, fe->entry);
+ if (!entry_attr) {
+ ret = -EMSGSIZE;
+ rdma_restrack_put(res);
+ goto msg_full;
+ }
+
+ ret = fe->fill_res_func(skb, has_cap_net_admin, res, port);
rdma_restrack_put(res);
- if (ret == -EMSGSIZE)
- /*
- * There is a chance to optimize here.
- * It can be done by using list_prepare_entry
- * and list_for_each_entry_continue afterwards.
- */
- break;
- if (ret)
+ if (ret) {
+ nla_nest_cancel(skb, entry_attr);
+ if (ret == -EMSGSIZE)
+ goto msg_full;
+ if (ret == -EAGAIN)
+ goto again;
goto res_err;
+ }
+ nla_nest_end(skb, entry_attr);
+again: xa_lock(&rt->xa);
next: idx++;
}
- up_read(&device->res.rwsem);
+ xa_unlock(&rt->xa);
+msg_full:
nla_nest_end(skb, table_attr);
nlmsg_end(skb, nlh);
cb->args[0] = idx;
@@ -1058,49 +1170,146 @@ next: idx++;
if (!filled)
goto err;
- put_device(&device->dev);
+ ib_device_put(device);
return skb->len;
res_err:
nla_nest_cancel(skb, table_attr);
- up_read(&device->res.rwsem);
err:
nlmsg_cancel(skb, nlh);
err_index:
- put_device(&device->dev);
+ ib_device_put(device);
return ret;
}
-static int nldev_res_get_qp_dumpit(struct sk_buff *skb,
- struct netlink_callback *cb)
+/*
+ * Emit the two netlink handlers for one resource type:
+ * nldev_res_get_<suffix>_doit() and nldev_res_get_<suffix>_dumpit().
+ * Each forwards to the matching common routine with restrack_type bound.
+ */
+#define RES_GET_FUNCS(suffix, restrack_type)                                   \
+	static int nldev_res_get_##suffix##_doit(                              \
+		struct sk_buff *skb, struct nlmsghdr *nlh,                     \
+		struct netlink_ext_ack *extack)                                \
+	{                                                                      \
+		return res_get_common_doit(skb, nlh, extack, restrack_type);   \
+	}                                                                      \
+	static int nldev_res_get_##suffix##_dumpit(                            \
+		struct sk_buff *skb, struct netlink_callback *cb)              \
+	{                                                                      \
+		return res_get_common_dumpit(skb, cb, restrack_type);          \
+	}
+
+RES_GET_FUNCS(qp, RDMA_RESTRACK_QP);
+RES_GET_FUNCS(cm_id, RDMA_RESTRACK_CM_ID);
+RES_GET_FUNCS(cq, RDMA_RESTRACK_CQ);
+RES_GET_FUNCS(pd, RDMA_RESTRACK_PD);
+RES_GET_FUNCS(mr, RDMA_RESTRACK_MR);
+
+static LIST_HEAD(link_ops);
+static DECLARE_RWSEM(link_ops_rwsem);
+
+static const struct rdma_link_ops *link_ops_get(const char *type)
{
- return res_get_common_dumpit(skb, cb, RDMA_RESTRACK_QP);
+ const struct rdma_link_ops *ops;
+
+ list_for_each_entry(ops, &link_ops, list) {
+ if (!strcmp(ops->type, type))
+ goto out;
+ }
+ ops = NULL;
+out:
+ return ops;
}
-static int nldev_res_get_cm_id_dumpit(struct sk_buff *skb,
- struct netlink_callback *cb)
+void rdma_link_register(struct rdma_link_ops *ops)
{
- return res_get_common_dumpit(skb, cb, RDMA_RESTRACK_CM_ID);
+ down_write(&link_ops_rwsem);
+ if (WARN_ON_ONCE(link_ops_get(ops->type)))
+ goto out;
+ list_add(&ops->list, &link_ops);
+out:
+ up_write(&link_ops_rwsem);
}
+EXPORT_SYMBOL(rdma_link_register);
-static int nldev_res_get_cq_dumpit(struct sk_buff *skb,
- struct netlink_callback *cb)
+void rdma_link_unregister(struct rdma_link_ops *ops)
{
- return res_get_common_dumpit(skb, cb, RDMA_RESTRACK_CQ);
+ down_write(&link_ops_rwsem);
+ list_del(&ops->list);
+ up_write(&link_ops_rwsem);
}
+EXPORT_SYMBOL(rdma_link_unregister);
-static int nldev_res_get_mr_dumpit(struct sk_buff *skb,
- struct netlink_callback *cb)
+static int nldev_newlink(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
{
- return res_get_common_dumpit(skb, cb, RDMA_RESTRACK_MR);
+ struct nlattr *tb[RDMA_NLDEV_ATTR_MAX];
+ char ibdev_name[IB_DEVICE_NAME_MAX];
+ const struct rdma_link_ops *ops;
+ char ndev_name[IFNAMSIZ];
+ struct net_device *ndev;
+ char type[IFNAMSIZ];
+ int err;
+
+ err = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
+ nldev_policy, extack);
+ if (err || !tb[RDMA_NLDEV_ATTR_DEV_NAME] ||
+ !tb[RDMA_NLDEV_ATTR_LINK_TYPE] || !tb[RDMA_NLDEV_ATTR_NDEV_NAME])
+ return -EINVAL;
+
+ nla_strlcpy(ibdev_name, tb[RDMA_NLDEV_ATTR_DEV_NAME],
+ sizeof(ibdev_name));
+ if (strchr(ibdev_name, '%'))
+ return -EINVAL;
+
+ nla_strlcpy(type, tb[RDMA_NLDEV_ATTR_LINK_TYPE], sizeof(type));
+ nla_strlcpy(ndev_name, tb[RDMA_NLDEV_ATTR_NDEV_NAME],
+ sizeof(ndev_name));
+
+ ndev = dev_get_by_name(&init_net, ndev_name);
+ if (!ndev)
+ return -ENODEV;
+
+ down_read(&link_ops_rwsem);
+ ops = link_ops_get(type);
+#ifdef CONFIG_MODULES
+ if (!ops) {
+ up_read(&link_ops_rwsem);
+ request_module("rdma-link-%s", type);
+ down_read(&link_ops_rwsem);
+ ops = link_ops_get(type);
+ }
+#endif
+ err = ops ? ops->newlink(ibdev_name, ndev) : -EINVAL;
+ up_read(&link_ops_rwsem);
+ dev_put(ndev);
+
+ return err;
}
-static int nldev_res_get_pd_dumpit(struct sk_buff *skb,
- struct netlink_callback *cb)
+static int nldev_dellink(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
{
- return res_get_common_dumpit(skb, cb, RDMA_RESTRACK_PD);
+ struct nlattr *tb[RDMA_NLDEV_ATTR_MAX];
+ struct ib_device *device;
+ u32 index;
+ int err;
+
+ err = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
+ nldev_policy, extack);
+ if (err || !tb[RDMA_NLDEV_ATTR_DEV_INDEX])
+ return -EINVAL;
+
+ index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]);
+ device = ib_device_get_by_index(index);
+ if (!device)
+ return -EINVAL;
+
+ if (!(device->attrs.device_cap_flags & IB_DEVICE_ALLOW_USER_UNREG)) {
+ ib_device_put(device);
+ return -EINVAL;
+ }
+
+ ib_unregister_device_and_put(device);
+ return 0;
}
static const struct rdma_nl_cbs nldev_cb_table[RDMA_NLDEV_NUM_OPS] = {
@@ -1112,6 +1321,14 @@ static const struct rdma_nl_cbs nldev_cb_table[RDMA_NLDEV_NUM_OPS] = {
.doit = nldev_set_doit,
.flags = RDMA_NL_ADMIN_PERM,
},
+ [RDMA_NLDEV_CMD_NEWLINK] = {
+ .doit = nldev_newlink,
+ .flags = RDMA_NL_ADMIN_PERM,
+ },
+ [RDMA_NLDEV_CMD_DELLINK] = {
+ .doit = nldev_dellink,
+ .flags = RDMA_NL_ADMIN_PERM,
+ },
[RDMA_NLDEV_CMD_PORT_GET] = {
.doit = nldev_port_get_doit,
.dump = nldev_port_get_dumpit,
@@ -1121,28 +1338,23 @@ static const struct rdma_nl_cbs nldev_cb_table[RDMA_NLDEV_NUM_OPS] = {
.dump = nldev_res_get_dumpit,
},
[RDMA_NLDEV_CMD_RES_QP_GET] = {
+ .doit = nldev_res_get_qp_doit,
.dump = nldev_res_get_qp_dumpit,
- /*
- * .doit is not implemented yet for two reasons:
- * 1. It is not needed yet.
- * 2. There is a need to provide identifier, while it is easy
- * for the QPs (device index + port index + LQPN), it is not
- * the case for the rest of resources (PD and CQ). Because it
- * is better to provide similar interface for all resources,
- * let's wait till we will have other resources implemented
- * too.
- */
},
[RDMA_NLDEV_CMD_RES_CM_ID_GET] = {
+ .doit = nldev_res_get_cm_id_doit,
.dump = nldev_res_get_cm_id_dumpit,
},
[RDMA_NLDEV_CMD_RES_CQ_GET] = {
+ .doit = nldev_res_get_cq_doit,
.dump = nldev_res_get_cq_dumpit,
},
[RDMA_NLDEV_CMD_RES_MR_GET] = {
+ .doit = nldev_res_get_mr_doit,
.dump = nldev_res_get_mr_dumpit,
},
[RDMA_NLDEV_CMD_RES_PD_GET] = {
+ .doit = nldev_res_get_pd_doit,
.dump = nldev_res_get_pd_dumpit,
},
};
diff --git a/drivers/infiniband/core/opa_smi.h b/drivers/infiniband/core/opa_smi.h
index 3bfab3505a29..af4879bdf3d6 100644
--- a/drivers/infiniband/core/opa_smi.h
+++ b/drivers/infiniband/core/opa_smi.h
@@ -55,7 +55,7 @@ static inline enum smi_action opa_smi_check_local_smp(struct opa_smp *smp,
{
/* C14-9:3 -- We're at the end of the DR segment of path */
/* C14-9:4 -- Hop Pointer = Hop Count + 1 -> give to SMA/SM */
- return (device->process_mad &&
+ return (device->ops.process_mad &&
!opa_get_smp_direction(smp) &&
(smp->hop_ptr == smp->hop_cnt + 1)) ?
IB_SMI_HANDLE : IB_SMI_DISCARD;
@@ -70,7 +70,7 @@ static inline enum smi_action opa_smi_check_local_returning_smp(struct opa_smp *
{
/* C14-13:3 -- We're at the end of the DR segment of path */
/* C14-13:4 -- Hop Pointer == 0 -> give to SM */
- return (device->process_mad &&
+ return (device->ops.process_mad &&
opa_get_smp_direction(smp) &&
!smp->hop_ptr) ? IB_SMI_HANDLE : IB_SMI_DISCARD;
}
diff --git a/drivers/infiniband/core/rdma_core.c b/drivers/infiniband/core/rdma_core.c
index 752a55c6bdce..778375ff664e 100644
--- a/drivers/infiniband/core/rdma_core.c
+++ b/drivers/infiniband/core/rdma_core.c
@@ -224,12 +224,14 @@ out_unlock:
* uverbs_put_destroy.
*/
struct ib_uobject *__uobj_get_destroy(const struct uverbs_api_object *obj,
- u32 id, struct ib_uverbs_file *ufile)
+ u32 id,
+ const struct uverbs_attr_bundle *attrs)
{
struct ib_uobject *uobj;
int ret;
- uobj = rdma_lookup_get_uobject(obj, ufile, id, UVERBS_LOOKUP_DESTROY);
+ uobj = rdma_lookup_get_uobject(obj, attrs->ufile, id,
+ UVERBS_LOOKUP_DESTROY);
if (IS_ERR(uobj))
return uobj;
@@ -243,21 +245,20 @@ struct ib_uobject *__uobj_get_destroy(const struct uverbs_api_object *obj,
}
/*
- * Does both uobj_get_destroy() and uobj_put_destroy(). Returns success_res
- * on success (negative errno on failure). For use by callers that do not need
- * the uobj.
+ * Does both uobj_get_destroy() and uobj_put_destroy(). Returns 0 on success
+ * (negative errno on failure). For use by callers that do not need the uobj.
*/
int __uobj_perform_destroy(const struct uverbs_api_object *obj, u32 id,
- struct ib_uverbs_file *ufile, int success_res)
+ const struct uverbs_attr_bundle *attrs)
{
struct ib_uobject *uobj;
- uobj = __uobj_get_destroy(obj, id, ufile);
+ uobj = __uobj_get_destroy(obj, id, attrs);
if (IS_ERR(uobj))
return PTR_ERR(uobj);
rdma_lookup_put_uobject(uobj, UVERBS_LOOKUP_WRITE);
- return success_res;
+ return 0;
}
/* alloc_uobj must be undone by uverbs_destroy_uobject() */
@@ -267,7 +268,7 @@ static struct ib_uobject *alloc_uobj(struct ib_uverbs_file *ufile,
struct ib_uobject *uobj;
struct ib_ucontext *ucontext;
- ucontext = ib_uverbs_get_ucontext(ufile);
+ ucontext = ib_uverbs_get_ucontext_file(ufile);
if (IS_ERR(ucontext))
return ERR_CAST(ucontext);
@@ -397,16 +398,23 @@ struct ib_uobject *rdma_lookup_get_uobject(const struct uverbs_api_object *obj,
struct ib_uobject *uobj;
int ret;
- if (!obj)
- return ERR_PTR(-EINVAL);
+ if (IS_ERR(obj) && PTR_ERR(obj) == -ENOMSG) {
+ /* must be UVERBS_IDR_ANY_OBJECT, see uapi_get_object() */
+ uobj = lookup_get_idr_uobject(NULL, ufile, id, mode);
+ if (IS_ERR(uobj))
+ return uobj;
+ } else {
+ if (IS_ERR(obj))
+ return ERR_PTR(-EINVAL);
- uobj = obj->type_class->lookup_get(obj, ufile, id, mode);
- if (IS_ERR(uobj))
- return uobj;
+ uobj = obj->type_class->lookup_get(obj, ufile, id, mode);
+ if (IS_ERR(uobj))
+ return uobj;
- if (uobj->uapi_object != obj) {
- ret = -EINVAL;
- goto free;
+ if (uobj->uapi_object != obj) {
+ ret = -EINVAL;
+ goto free;
+ }
}
/*
@@ -426,10 +434,42 @@ struct ib_uobject *rdma_lookup_get_uobject(const struct uverbs_api_object *obj,
return uobj;
free:
- obj->type_class->lookup_put(uobj, mode);
+ uobj->uapi_object->type_class->lookup_put(uobj, mode);
uverbs_uobject_put(uobj);
return ERR_PTR(ret);
}
+/*
+ * Look up @object_id of default object type @type with UVERBS_LOOKUP_READ.
+ *
+ * On success the uobject's context is stored in attrs->context and the
+ * uobject is returned; on failure the ERR_PTR from the lookup is returned
+ * and @attrs is left untouched.
+ */
+struct ib_uobject *_uobj_get_read(enum uverbs_default_objects type,
+				  u32 object_id,
+				  struct uverbs_attr_bundle *attrs)
+{
+	struct ib_uobject *found;
+
+	found = rdma_lookup_get_uobject(uobj_get_type(attrs, type),
+					attrs->ufile, object_id,
+					UVERBS_LOOKUP_READ);
+	if (!IS_ERR(found))
+		attrs->context = found->context;
+
+	return found;
+}
+
+/*
+ * Look up @object_id of default object type @type with UVERBS_LOOKUP_WRITE.
+ *
+ * On success the uobject's context is stored in attrs->context and the
+ * uobject is returned; on failure the ERR_PTR from the lookup is returned
+ * and @attrs is left untouched.
+ */
+struct ib_uobject *_uobj_get_write(enum uverbs_default_objects type,
+				   u32 object_id,
+				   struct uverbs_attr_bundle *attrs)
+{
+	struct ib_uobject *found;
+
+	found = rdma_lookup_get_uobject(uobj_get_type(attrs, type),
+					attrs->ufile, object_id,
+					UVERBS_LOOKUP_WRITE);
+	if (!IS_ERR(found))
+		attrs->context = found->context;
+
+	return found;
+}
static struct ib_uobject *
alloc_begin_idr_uobject(const struct uverbs_api_object *obj,
@@ -490,7 +530,7 @@ struct ib_uobject *rdma_alloc_begin_uobject(const struct uverbs_api_object *obj,
{
struct ib_uobject *ret;
- if (!obj)
+ if (IS_ERR(obj))
return ERR_PTR(-EINVAL);
/*
@@ -793,6 +833,7 @@ void uverbs_close_fd(struct file *f)
/* Pairs with filp->private_data in alloc_begin_fd_uobject */
uverbs_uobject_put(uobj);
}
+EXPORT_SYMBOL(uverbs_close_fd);
/*
* Drop the ucontext off the ufile and completely disconnect it from the
@@ -803,7 +844,6 @@ static void ufile_destroy_ucontext(struct ib_uverbs_file *ufile,
{
struct ib_ucontext *ucontext = ufile->ucontext;
struct ib_device *ib_dev = ucontext->device;
- int ret;
/*
* If we are closing the FD then the user mmap VMAs must have
@@ -812,19 +852,17 @@ static void ufile_destroy_ucontext(struct ib_uverbs_file *ufile,
*/
if (reason == RDMA_REMOVE_DRIVER_REMOVE) {
uverbs_user_mmap_disassociate(ufile);
- if (ib_dev->disassociate_ucontext)
- ib_dev->disassociate_ucontext(ucontext);
+ if (ib_dev->ops.disassociate_ucontext)
+ ib_dev->ops.disassociate_ucontext(ucontext);
}
ib_rdmacg_uncharge(&ucontext->cg_obj, ib_dev,
RDMACG_RESOURCE_HCA_HANDLE);
- /*
- * FIXME: Drivers are not permitted to fail dealloc_ucontext, remove
- * the error return.
- */
- ret = ib_dev->dealloc_ucontext(ucontext);
- WARN_ON(ret);
+ rdma_restrack_del(&ucontext->res);
+
+ ib_dev->ops.dealloc_ucontext(ucontext);
+ kfree(ucontext);
ufile->ucontext = NULL;
}
diff --git a/drivers/infiniband/core/rdma_core.h b/drivers/infiniband/core/rdma_core.h
index 4886d2bba7c7..69f8db66925e 100644
--- a/drivers/infiniband/core/rdma_core.h
+++ b/drivers/infiniband/core/rdma_core.h
@@ -106,6 +106,8 @@ int uverbs_finalize_object(struct ib_uobject *uobj,
enum uverbs_obj_access access,
bool commit);
+int uverbs_output_written(const struct uverbs_attr_bundle *bundle, size_t idx);
+
void setup_ufile_idr_uobject(struct ib_uverbs_file *ufile);
void release_ufile_idr_uobject(struct ib_uverbs_file *ufile);
@@ -118,43 +120,67 @@ void release_ufile_idr_uobject(struct ib_uverbs_file *ufile);
* Depending on ID the slot pointer in the radix tree points at one of these
* structs.
*/
-struct uverbs_api_object {
- const struct uverbs_obj_type *type_attrs;
- const struct uverbs_obj_type_class *type_class;
-};
struct uverbs_api_ioctl_method {
- int (__rcu *handler)(struct ib_uverbs_file *ufile,
- struct uverbs_attr_bundle *ctx);
+ int(__rcu *handler)(struct uverbs_attr_bundle *attrs);
DECLARE_BITMAP(attr_mandatory, UVERBS_API_ATTR_BKEY_LEN);
u16 bundle_size;
u8 use_stack:1;
u8 driver_method:1;
+ u8 disabled:1;
+ u8 has_udata:1;
u8 key_bitmap_len;
u8 destroy_bkey;
};
+struct uverbs_api_write_method {
+ int (*handler)(struct uverbs_attr_bundle *attrs);
+ u8 disabled:1;
+ u8 is_ex:1;
+ u8 has_udata:1;
+ u8 has_resp:1;
+ u8 req_size;
+ u8 resp_size;
+};
+
struct uverbs_api_attr {
struct uverbs_attr_spec spec;
};
-struct uverbs_api_object;
struct uverbs_api {
/* radix tree contains struct uverbs_api_* pointers */
struct radix_tree_root radix;
enum rdma_driver_id driver_id;
+
+ unsigned int num_write;
+ unsigned int num_write_ex;
+ struct uverbs_api_write_method notsupp_method;
+ const struct uverbs_api_write_method **write_methods;
+ const struct uverbs_api_write_method **write_ex_methods;
};
+/*
+ * Get an uverbs_api_object that corresponds to the given object_id.
+ * Note:
+ * -ENOMSG means that any object is allowed to match during lookup.
+ */
static inline const struct uverbs_api_object *
uapi_get_object(struct uverbs_api *uapi, u16 object_id)
{
- return radix_tree_lookup(&uapi->radix, uapi_key_obj(object_id));
+ const struct uverbs_api_object *res;
+
+ if (object_id == UVERBS_IDR_ANY_OBJECT)
+ return ERR_PTR(-ENOMSG);
+
+ res = radix_tree_lookup(&uapi->radix, uapi_key_obj(object_id));
+ if (!res)
+ return ERR_PTR(-ENOENT);
+
+ return res;
}
char *uapi_key_format(char *S, unsigned int key);
-struct uverbs_api *uverbs_alloc_api(
- const struct uverbs_object_tree_def *const *driver_specs,
- enum rdma_driver_id driver_id);
+struct uverbs_api *uverbs_alloc_api(struct ib_device *ibdev);
void uverbs_disassociate_api_pre(struct ib_uverbs_device *uverbs_dev);
void uverbs_disassociate_api(struct uverbs_api *uapi);
void uverbs_destroy_api(struct uverbs_api *uapi);
@@ -162,4 +188,37 @@ void uapi_compute_bundle_size(struct uverbs_api_ioctl_method *method_elm,
unsigned int num_attrs);
void uverbs_user_mmap_disassociate(struct ib_uverbs_file *ufile);
+extern const struct uapi_definition uverbs_def_obj_counters[];
+extern const struct uapi_definition uverbs_def_obj_cq[];
+extern const struct uapi_definition uverbs_def_obj_device[];
+extern const struct uapi_definition uverbs_def_obj_dm[];
+extern const struct uapi_definition uverbs_def_obj_flow_action[];
+extern const struct uapi_definition uverbs_def_obj_intf[];
+extern const struct uapi_definition uverbs_def_obj_mr[];
+extern const struct uapi_definition uverbs_def_write_intf[];
+
+static inline const struct uverbs_api_write_method *
+uapi_get_method(const struct uverbs_api *uapi, u32 command)
+{
+ u32 cmd_idx = command & IB_USER_VERBS_CMD_COMMAND_MASK;
+
+ if (command & ~(u32)(IB_USER_VERBS_CMD_FLAG_EXTENDED |
+ IB_USER_VERBS_CMD_COMMAND_MASK))
+ return ERR_PTR(-EINVAL);
+
+ if (command & IB_USER_VERBS_CMD_FLAG_EXTENDED) {
+ if (cmd_idx >= uapi->num_write_ex)
+ return ERR_PTR(-EOPNOTSUPP);
+ return uapi->write_ex_methods[cmd_idx];
+ }
+
+ if (cmd_idx >= uapi->num_write)
+ return ERR_PTR(-EOPNOTSUPP);
+ return uapi->write_methods[cmd_idx];
+}
+
+void uverbs_fill_udata(struct uverbs_attr_bundle *bundle,
+ struct ib_udata *udata, unsigned int attr_in,
+ unsigned int attr_out);
+
#endif /* RDMA_CORE_H */
diff --git a/drivers/infiniband/core/restrack.c b/drivers/infiniband/core/restrack.c
index 06d8657ce583..3b5ff2f7b5f8 100644
--- a/drivers/infiniband/core/restrack.c
+++ b/drivers/infiniband/core/restrack.c
@@ -11,17 +11,29 @@
#include <linux/pid_namespace.h>
#include "cma_priv.h"
+#include "restrack.h"
-static int fill_res_noop(struct sk_buff *msg,
- struct rdma_restrack_entry *entry)
+/**
+ * rdma_restrack_init() - initialize and allocate resource tracking
+ * @dev: IB device
+ *
+ * Return: 0 on success
+ */
+int rdma_restrack_init(struct ib_device *dev)
{
- return 0;
-}
+ struct rdma_restrack_root *rt;
+ int i;
-void rdma_restrack_init(struct rdma_restrack_root *res)
-{
- init_rwsem(&res->rwsem);
- res->fill_res_entry = fill_res_noop;
+ dev->res = kcalloc(RDMA_RESTRACK_MAX, sizeof(*rt), GFP_KERNEL);
+ if (!dev->res)
+ return -ENOMEM;
+
+ rt = dev->res;
+
+ for (i = 0; i < RDMA_RESTRACK_MAX; i++)
+ xa_init_flags(&rt[i].xa, XA_FLAGS_ALLOC);
+
+ return 0;
}
static const char *type2str(enum rdma_restrack_type type)
@@ -32,60 +44,85 @@ static const char *type2str(enum rdma_restrack_type type)
[RDMA_RESTRACK_QP] = "QP",
[RDMA_RESTRACK_CM_ID] = "CM_ID",
[RDMA_RESTRACK_MR] = "MR",
+ [RDMA_RESTRACK_CTX] = "CTX",
};
return names[type];
};
-void rdma_restrack_clean(struct rdma_restrack_root *res)
+/**
+ * rdma_restrack_clean() - clean resource tracking
+ * @dev: IB device
+ */
+void rdma_restrack_clean(struct ib_device *dev)
{
+ struct rdma_restrack_root *rt = dev->res;
struct rdma_restrack_entry *e;
char buf[TASK_COMM_LEN];
- struct ib_device *dev;
+ bool found = false;
const char *owner;
- int bkt;
-
- if (hash_empty(res->hash))
- return;
-
- dev = container_of(res, struct ib_device, res);
- pr_err("restrack: %s", CUT_HERE);
- dev_err(&dev->dev, "BUG: RESTRACK detected leak of resources\n");
- hash_for_each(res->hash, bkt, e, node) {
- if (rdma_is_kernel_res(e)) {
- owner = e->kern_name;
- } else {
- /*
- * There is no need to call get_task_struct here,
- * because we can be here only if there are more
- * get_task_struct() call than put_task_struct().
- */
- get_task_comm(buf, e->task);
- owner = buf;
+ int i;
+
+ for (i = 0 ; i < RDMA_RESTRACK_MAX; i++) {
+ struct xarray *xa = &dev->res[i].xa;
+
+ if (!xa_empty(xa)) {
+ unsigned long index;
+
+ if (!found) {
+ pr_err("restrack: %s", CUT_HERE);
+ dev_err(&dev->dev, "BUG: RESTRACK detected leak of resources\n");
+ }
+ xa_for_each(xa, index, e) {
+ if (rdma_is_kernel_res(e)) {
+ owner = e->kern_name;
+ } else {
+ /*
+ * There is no need to call get_task_struct here,
+ * because we can be here only if there are more
+ * get_task_struct() call than put_task_struct().
+ */
+ get_task_comm(buf, e->task);
+ owner = buf;
+ }
+
+ pr_err("restrack: %s %s object allocated by %s is not freed\n",
+ rdma_is_kernel_res(e) ? "Kernel" :
+ "User",
+ type2str(e->type), owner);
+ }
+ found = true;
}
-
- pr_err("restrack: %s %s object allocated by %s is not freed\n",
- rdma_is_kernel_res(e) ? "Kernel" : "User",
- type2str(e->type), owner);
+ xa_destroy(xa);
}
- pr_err("restrack: %s", CUT_HERE);
+ if (found)
+ pr_err("restrack: %s", CUT_HERE);
+
+ kfree(rt);
}
-int rdma_restrack_count(struct rdma_restrack_root *res,
- enum rdma_restrack_type type,
+/**
+ * rdma_restrack_count() - the current usage of specific object
+ * @dev: IB device
+ * @type: actual type of object to operate
+ * @ns: PID namespace
+ */
+int rdma_restrack_count(struct ib_device *dev, enum rdma_restrack_type type,
struct pid_namespace *ns)
{
+ struct rdma_restrack_root *rt = &dev->res[type];
struct rdma_restrack_entry *e;
+ XA_STATE(xas, &rt->xa, 0);
u32 cnt = 0;
- down_read(&res->rwsem);
- hash_for_each_possible(res->hash, e, node, type) {
+ xa_lock(&rt->xa);
+ xas_for_each(&xas, e, U32_MAX) {
if (ns == &init_pid_ns ||
(!rdma_is_kernel_res(e) &&
ns == task_active_pid_ns(e->task)))
cnt++;
}
- up_read(&res->rwsem);
+ xa_unlock(&rt->xa);
return cnt;
}
EXPORT_SYMBOL(rdma_restrack_count);
@@ -130,31 +167,14 @@ static struct ib_device *res_to_dev(struct rdma_restrack_entry *res)
res)->id.device;
case RDMA_RESTRACK_MR:
return container_of(res, struct ib_mr, res)->device;
+ case RDMA_RESTRACK_CTX:
+ return container_of(res, struct ib_ucontext, res)->device;
default:
WARN_ONCE(true, "Wrong resource tracking type %u\n", res->type);
return NULL;
}
}
-static bool res_is_user(struct rdma_restrack_entry *res)
-{
- switch (res->type) {
- case RDMA_RESTRACK_PD:
- return container_of(res, struct ib_pd, res)->uobject;
- case RDMA_RESTRACK_CQ:
- return container_of(res, struct ib_cq, res)->uobject;
- case RDMA_RESTRACK_QP:
- return container_of(res, struct ib_qp, res)->uobject;
- case RDMA_RESTRACK_CM_ID:
- return !res->kern_name;
- case RDMA_RESTRACK_MR:
- return container_of(res, struct ib_mr, res)->pd->uobject;
- default:
- WARN_ONCE(true, "Wrong resource tracking type %u\n", res->type);
- return false;
- }
-}
-
void rdma_restrack_set_task(struct rdma_restrack_entry *res,
const char *caller)
{
@@ -170,33 +190,64 @@ void rdma_restrack_set_task(struct rdma_restrack_entry *res,
}
EXPORT_SYMBOL(rdma_restrack_set_task);
-void rdma_restrack_add(struct rdma_restrack_entry *res)
+static void rdma_restrack_add(struct rdma_restrack_entry *res)
{
struct ib_device *dev = res_to_dev(res);
+ struct rdma_restrack_root *rt;
+ int ret;
if (!dev)
return;
- if (res->type != RDMA_RESTRACK_CM_ID || !res_is_user(res))
- res->task = NULL;
-
- if (res_is_user(res)) {
- if (!res->task)
- rdma_restrack_set_task(res, NULL);
- res->kern_name = NULL;
- } else {
- set_kern_name(res);
- }
+ rt = &dev->res[res->type];
kref_init(&res->kref);
init_completion(&res->comp);
- res->valid = true;
+ if (res->type != RDMA_RESTRACK_QP)
+ ret = xa_alloc_cyclic(&rt->xa, &res->id, res, xa_limit_32b,
+ &rt->next_id, GFP_KERNEL);
+ else {
+ /* Special case to ensure that LQPN points to right QP */
+ struct ib_qp *qp = container_of(res, struct ib_qp, res);
+
+ ret = xa_insert(&rt->xa, qp->qp_num, res, GFP_KERNEL);
+ res->id = ret ? 0 : qp->qp_num;
+ }
+
+ if (!ret)
+ res->valid = true;
+}
+
+/**
+ * rdma_restrack_kadd() - add kernel object to the resource tracking database
+ * @res: resource entry
+ */
+void rdma_restrack_kadd(struct rdma_restrack_entry *res)
+{
+ res->task = NULL;
+ set_kern_name(res);
+ res->user = false;
+ rdma_restrack_add(res);
+}
+EXPORT_SYMBOL(rdma_restrack_kadd);
+
+/**
+ * rdma_restrack_uadd() - add user object to the resource tracking database
+ * @res: resource entry
+ */
+void rdma_restrack_uadd(struct rdma_restrack_entry *res)
+{
+ if (res->type != RDMA_RESTRACK_CM_ID)
+ res->task = NULL;
+
+ if (!res->task)
+ rdma_restrack_set_task(res, NULL);
+ res->kern_name = NULL;
- down_write(&dev->res.rwsem);
- hash_add(dev->res.hash, &res->node, res->type);
- up_write(&dev->res.rwsem);
+ res->user = true;
+ rdma_restrack_add(res);
}
-EXPORT_SYMBOL(rdma_restrack_add);
+EXPORT_SYMBOL(rdma_restrack_uadd);
int __must_check rdma_restrack_get(struct rdma_restrack_entry *res)
{
@@ -204,6 +255,31 @@ int __must_check rdma_restrack_get(struct rdma_restrack_entry *res)
}
EXPORT_SYMBOL(rdma_restrack_get);
+/**
+ * rdma_restrack_get_byid() - translate from ID to restrack object
+ * @dev: IB device
+ * @type: resource track type
+ * @id: ID to take a look
+ *
+ * Return: Pointer to restrack entry or -ENOENT in case of error.
+ */
+struct rdma_restrack_entry *
+rdma_restrack_get_byid(struct ib_device *dev,
+ enum rdma_restrack_type type, u32 id)
+{
+ struct rdma_restrack_root *rt = &dev->res[type];
+ struct rdma_restrack_entry *res;
+
+ xa_lock(&rt->xa);
+ res = xa_load(&rt->xa, id);
+ if (!res || !rdma_restrack_get(res))
+ res = ERR_PTR(-ENOENT);
+ xa_unlock(&rt->xa);
+
+ return res;
+}
+EXPORT_SYMBOL(rdma_restrack_get_byid);
+
static void restrack_release(struct kref *kref)
{
struct rdma_restrack_entry *res;
@@ -220,23 +296,25 @@ EXPORT_SYMBOL(rdma_restrack_put);
void rdma_restrack_del(struct rdma_restrack_entry *res)
{
+ struct rdma_restrack_entry *old;
+ struct rdma_restrack_root *rt;
struct ib_device *dev;
if (!res->valid)
goto out;
dev = res_to_dev(res);
- if (!dev)
+ if (WARN_ON(!dev))
return;
- rdma_restrack_put(res);
-
- wait_for_completion(&res->comp);
+ rt = &dev->res[res->type];
- down_write(&dev->res.rwsem);
- hash_del(&res->node);
+ old = xa_erase(&rt->xa, res->id);
+ WARN_ON(old != res);
res->valid = false;
- up_write(&dev->res.rwsem);
+
+ rdma_restrack_put(res);
+ wait_for_completion(&res->comp);
out:
if (res->task) {
diff --git a/drivers/infiniband/core/restrack.h b/drivers/infiniband/core/restrack.h
new file mode 100644
index 000000000000..09a1fbdf578e
--- /dev/null
+++ b/drivers/infiniband/core/restrack.h
@@ -0,0 +1,28 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/*
+ * Copyright (c) 2017-2019 Mellanox Technologies. All rights reserved.
+ */
+
+#ifndef _RDMA_CORE_RESTRACK_H_
+#define _RDMA_CORE_RESTRACK_H_
+
+#include <linux/mutex.h>
+
+/**
+ * struct rdma_restrack_root - main resource tracking management
+ * entity, per-device
+ */
+struct rdma_restrack_root {
+ /**
+ * @xa: Array of XArray structure to hold restrack entries.
+ */
+ struct xarray xa;
+ /**
+ * @next_id: Next ID to support cyclic allocation
+ */
+ u32 next_id;
+};
+
+int rdma_restrack_init(struct ib_device *dev);
+void rdma_restrack_clean(struct ib_device *dev);
+#endif /* _RDMA_CORE_RESTRACK_H_ */
diff --git a/drivers/infiniband/core/rw.c b/drivers/infiniband/core/rw.c
index d22c4a2ebac6..89a5be3a2f97 100644
--- a/drivers/infiniband/core/rw.c
+++ b/drivers/infiniband/core/rw.c
@@ -179,7 +179,6 @@ static int rdma_rw_init_map_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
struct scatterlist *sg, u32 sg_cnt, u32 offset,
u64 remote_addr, u32 rkey, enum dma_data_direction dir)
{
- struct ib_device *dev = qp->pd->device;
u32 max_sge = dir == DMA_TO_DEVICE ? qp->max_write_sge :
qp->max_read_sge;
struct ib_sge *sge;
@@ -209,8 +208,8 @@ static int rdma_rw_init_map_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
rdma_wr->wr.sg_list = sge;
for (j = 0; j < nr_sge; j++, sg = sg_next(sg)) {
- sge->addr = ib_sg_dma_address(dev, sg) + offset;
- sge->length = ib_sg_dma_len(dev, sg) - offset;
+ sge->addr = sg_dma_address(sg) + offset;
+ sge->length = sg_dma_len(sg) - offset;
sge->lkey = qp->pd->local_dma_lkey;
total_len += sge->length;
@@ -236,14 +235,13 @@ static int rdma_rw_init_single_wr(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
struct scatterlist *sg, u32 offset, u64 remote_addr, u32 rkey,
enum dma_data_direction dir)
{
- struct ib_device *dev = qp->pd->device;
struct ib_rdma_wr *rdma_wr = &ctx->single.wr;
ctx->nr_ops = 1;
ctx->single.sge.lkey = qp->pd->local_dma_lkey;
- ctx->single.sge.addr = ib_sg_dma_address(dev, sg) + offset;
- ctx->single.sge.length = ib_sg_dma_len(dev, sg) - offset;
+ ctx->single.sge.addr = sg_dma_address(sg) + offset;
+ ctx->single.sge.length = sg_dma_len(sg) - offset;
memset(rdma_wr, 0, sizeof(*rdma_wr));
if (dir == DMA_TO_DEVICE)
@@ -294,7 +292,7 @@ int rdma_rw_ctx_init(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u8 port_num,
* Skip to the S/G entry that sg_offset falls into:
*/
for (;;) {
- u32 len = ib_sg_dma_len(dev, sg);
+ u32 len = sg_dma_len(sg);
if (sg_offset < len)
break;
diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c
index be5ba5e15496..7925e45ea88a 100644
--- a/drivers/infiniband/core/sa_query.c
+++ b/drivers/infiniband/core/sa_query.c
@@ -1147,7 +1147,7 @@ static void free_sm_ah(struct kref *kref)
{
struct ib_sa_sm_ah *sm_ah = container_of(kref, struct ib_sa_sm_ah, ref);
- rdma_destroy_ah(sm_ah->ah);
+ rdma_destroy_ah(sm_ah->ah, 0);
kfree(sm_ah);
}
@@ -2276,7 +2276,8 @@ static void update_sm_ah(struct work_struct *work)
cpu_to_be64(IB_SA_WELL_KNOWN_GUID));
}
- new_ah->ah = rdma_create_ah(port->agent->qp->pd, &ah_attr);
+ new_ah->ah = rdma_create_ah(port->agent->qp->pd, &ah_attr,
+ RDMA_CREATE_AH_SLEEPABLE);
if (IS_ERR(new_ah->ah)) {
pr_warn("Couldn't create new SM AH\n");
kfree(new_ah);
@@ -2341,9 +2342,7 @@ static void ib_sa_add_one(struct ib_device *device)
s = rdma_start_port(device);
e = rdma_end_port(device);
- sa_dev = kzalloc(sizeof *sa_dev +
- (e - s + 1) * sizeof (struct ib_sa_port),
- GFP_KERNEL);
+ sa_dev = kzalloc(struct_size(sa_dev, port, e - s + 1), GFP_KERNEL);
if (!sa_dev)
return;
diff --git a/drivers/infiniband/core/security.c b/drivers/infiniband/core/security.c
index 1143c0448666..1ab423b19f77 100644
--- a/drivers/infiniband/core/security.c
+++ b/drivers/infiniband/core/security.c
@@ -39,22 +39,25 @@
#include "core_priv.h"
#include "mad_priv.h"
+static LIST_HEAD(mad_agent_list);
+/* Lock to protect mad_agent_list */
+static DEFINE_SPINLOCK(mad_agent_list_lock);
+
static struct pkey_index_qp_list *get_pkey_idx_qp_list(struct ib_port_pkey *pp)
{
struct pkey_index_qp_list *pkey = NULL;
struct pkey_index_qp_list *tmp_pkey;
struct ib_device *dev = pp->sec->dev;
- spin_lock(&dev->port_pkey_list[pp->port_num].list_lock);
- list_for_each_entry(tmp_pkey,
- &dev->port_pkey_list[pp->port_num].pkey_list,
- pkey_index_list) {
+ spin_lock(&dev->port_data[pp->port_num].pkey_list_lock);
+ list_for_each_entry (tmp_pkey, &dev->port_data[pp->port_num].pkey_list,
+ pkey_index_list) {
if (tmp_pkey->pkey_index == pp->pkey_index) {
pkey = tmp_pkey;
break;
}
}
- spin_unlock(&dev->port_pkey_list[pp->port_num].list_lock);
+ spin_unlock(&dev->port_data[pp->port_num].pkey_list_lock);
return pkey;
}
@@ -259,12 +262,12 @@ static int port_pkey_list_insert(struct ib_port_pkey *pp)
if (!pkey)
return -ENOMEM;
- spin_lock(&dev->port_pkey_list[port_num].list_lock);
+ spin_lock(&dev->port_data[port_num].pkey_list_lock);
/* Check for the PKey again. A racing process may
* have created it.
*/
list_for_each_entry(tmp_pkey,
- &dev->port_pkey_list[port_num].pkey_list,
+ &dev->port_data[port_num].pkey_list,
pkey_index_list) {
if (tmp_pkey->pkey_index == pp->pkey_index) {
kfree(pkey);
@@ -279,9 +282,9 @@ static int port_pkey_list_insert(struct ib_port_pkey *pp)
spin_lock_init(&pkey->qp_list_lock);
INIT_LIST_HEAD(&pkey->qp_list);
list_add(&pkey->pkey_index_list,
- &dev->port_pkey_list[port_num].pkey_list);
+ &dev->port_data[port_num].pkey_list);
}
- spin_unlock(&dev->port_pkey_list[port_num].list_lock);
+ spin_unlock(&dev->port_data[port_num].pkey_list_lock);
}
spin_lock(&pkey->qp_list_lock);
@@ -418,12 +421,15 @@ void ib_close_shared_qp_security(struct ib_qp_security *sec)
int ib_create_qp_security(struct ib_qp *qp, struct ib_device *dev)
{
- u8 i = rdma_start_port(dev);
+ unsigned int i;
bool is_ib = false;
int ret;
- while (i <= rdma_end_port(dev) && !is_ib)
+ rdma_for_each_port (dev, i) {
is_ib = rdma_protocol_ib(dev, i++);
+ if (is_ib)
+ break;
+ }
/* If this isn't an IB device don't create the security context */
if (!is_ib)
@@ -544,9 +550,8 @@ void ib_security_cache_change(struct ib_device *device,
{
struct pkey_index_qp_list *pkey;
- list_for_each_entry(pkey,
- &device->port_pkey_list[port_num].pkey_list,
- pkey_index_list) {
+ list_for_each_entry (pkey, &device->port_data[port_num].pkey_list,
+ pkey_index_list) {
check_pkey_qps(pkey,
device,
port_num,
@@ -554,21 +559,19 @@ void ib_security_cache_change(struct ib_device *device,
}
}
-void ib_security_destroy_port_pkey_list(struct ib_device *device)
+void ib_security_release_port_pkey_list(struct ib_device *device)
{
struct pkey_index_qp_list *pkey, *tmp_pkey;
- int i;
+ unsigned int i;
- for (i = rdma_start_port(device); i <= rdma_end_port(device); i++) {
- spin_lock(&device->port_pkey_list[i].list_lock);
+ rdma_for_each_port (device, i) {
list_for_each_entry_safe(pkey,
tmp_pkey,
- &device->port_pkey_list[i].pkey_list,
+ &device->port_data[i].pkey_list,
pkey_index_list) {
list_del(&pkey->pkey_index_list);
kfree(pkey);
}
- spin_unlock(&device->port_pkey_list[i].list_lock);
}
}
@@ -626,10 +629,10 @@ int ib_security_modify_qp(struct ib_qp *qp,
}
if (!ret)
- ret = real_qp->device->modify_qp(real_qp,
- qp_attr,
- qp_attr_mask,
- udata);
+ ret = real_qp->device->ops.modify_qp(real_qp,
+ qp_attr,
+ qp_attr_mask,
+ udata);
if (new_pps) {
/* Clean up the lists and free the appropriate
@@ -676,19 +679,18 @@ static int ib_security_pkey_access(struct ib_device *dev,
return security_ib_pkey_access(sec, subnet_prefix, pkey);
}
-static int ib_mad_agent_security_change(struct notifier_block *nb,
- unsigned long event,
- void *data)
+void ib_mad_agent_security_change(void)
{
- struct ib_mad_agent *ag = container_of(nb, struct ib_mad_agent, lsm_nb);
-
- if (event != LSM_POLICY_CHANGE)
- return NOTIFY_DONE;
-
- ag->smp_allowed = !security_ib_endport_manage_subnet(
- ag->security, dev_name(&ag->device->dev), ag->port_num);
-
- return NOTIFY_OK;
+ struct ib_mad_agent *ag;
+
+ spin_lock(&mad_agent_list_lock);
+ list_for_each_entry(ag,
+ &mad_agent_list,
+ mad_agent_sec_list)
+ WRITE_ONCE(ag->smp_allowed,
+ !security_ib_endport_manage_subnet(ag->security,
+ dev_name(&ag->device->dev), ag->port_num));
+ spin_unlock(&mad_agent_list_lock);
}
int ib_mad_agent_security_setup(struct ib_mad_agent *agent,
@@ -699,6 +701,8 @@ int ib_mad_agent_security_setup(struct ib_mad_agent *agent,
if (!rdma_protocol_ib(agent->device, agent->port_num))
return 0;
+ INIT_LIST_HEAD(&agent->mad_agent_sec_list);
+
ret = security_ib_alloc_security(&agent->security);
if (ret)
return ret;
@@ -706,20 +710,22 @@ int ib_mad_agent_security_setup(struct ib_mad_agent *agent,
if (qp_type != IB_QPT_SMI)
return 0;
+ spin_lock(&mad_agent_list_lock);
ret = security_ib_endport_manage_subnet(agent->security,
dev_name(&agent->device->dev),
agent->port_num);
if (ret)
- return ret;
+ goto free_security;
- agent->lsm_nb.notifier_call = ib_mad_agent_security_change;
- ret = register_lsm_notifier(&agent->lsm_nb);
- if (ret)
- return ret;
-
- agent->smp_allowed = true;
- agent->lsm_nb_reg = true;
+ WRITE_ONCE(agent->smp_allowed, true);
+ list_add(&agent->mad_agent_sec_list, &mad_agent_list);
+ spin_unlock(&mad_agent_list_lock);
return 0;
+
+free_security:
+ spin_unlock(&mad_agent_list_lock);
+ security_ib_free_security(agent->security);
+ return ret;
}
void ib_mad_agent_security_cleanup(struct ib_mad_agent *agent)
@@ -727,9 +733,13 @@ void ib_mad_agent_security_cleanup(struct ib_mad_agent *agent)
if (!rdma_protocol_ib(agent->device, agent->port_num))
return;
+ if (agent->qp->qp_type == IB_QPT_SMI) {
+ spin_lock(&mad_agent_list_lock);
+ list_del(&agent->mad_agent_sec_list);
+ spin_unlock(&mad_agent_list_lock);
+ }
+
security_ib_free_security(agent->security);
- if (agent->lsm_nb_reg)
- unregister_lsm_notifier(&agent->lsm_nb);
}
int ib_mad_enforce_security(struct ib_mad_agent_private *map, u16 pkey_index)
@@ -738,7 +748,7 @@ int ib_mad_enforce_security(struct ib_mad_agent_private *map, u16 pkey_index)
return 0;
if (map->agent.qp->qp_type == IB_QPT_SMI) {
- if (!map->agent.smp_allowed)
+ if (!READ_ONCE(map->agent.smp_allowed))
return -EACCES;
return 0;
}
diff --git a/drivers/infiniband/core/smi.h b/drivers/infiniband/core/smi.h
index 33c91c8a16e9..91d9b353ab85 100644
--- a/drivers/infiniband/core/smi.h
+++ b/drivers/infiniband/core/smi.h
@@ -67,7 +67,7 @@ static inline enum smi_action smi_check_local_smp(struct ib_smp *smp,
{
/* C14-9:3 -- We're at the end of the DR segment of path */
/* C14-9:4 -- Hop Pointer = Hop Count + 1 -> give to SMA/SM */
- return ((device->process_mad &&
+ return ((device->ops.process_mad &&
!ib_get_smp_direction(smp) &&
(smp->hop_ptr == smp->hop_cnt + 1)) ?
IB_SMI_HANDLE : IB_SMI_DISCARD);
@@ -82,7 +82,7 @@ static inline enum smi_action smi_check_local_returning_smp(struct ib_smp *smp,
{
/* C14-13:3 -- We're at the end of the DR segment of path */
/* C14-13:4 -- Hop Pointer == 0 -> give to SM */
- return ((device->process_mad &&
+ return ((device->ops.process_mad &&
ib_get_smp_direction(smp) &&
!smp->hop_ptr) ? IB_SMI_HANDLE : IB_SMI_DISCARD);
}
diff --git a/drivers/infiniband/core/sysfs.c b/drivers/infiniband/core/sysfs.c
index 6fcce2c206c6..9b6a065bdfa5 100644
--- a/drivers/infiniband/core/sysfs.c
+++ b/drivers/infiniband/core/sysfs.c
@@ -462,7 +462,7 @@ static int get_perf_mad(struct ib_device *dev, int port_num, __be16 attr,
u16 out_mad_pkey_index = 0;
ssize_t ret;
- if (!dev->process_mad)
+ if (!dev->ops.process_mad)
return -ENOSYS;
in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL);
@@ -481,11 +481,11 @@ static int get_perf_mad(struct ib_device *dev, int port_num, __be16 attr,
if (attr != IB_PMA_CLASS_PORT_INFO)
in_mad->data[41] = port_num; /* PortSelect field */
- if ((dev->process_mad(dev, IB_MAD_IGNORE_MKEY,
- port_num, NULL, NULL,
- (const struct ib_mad_hdr *)in_mad, mad_size,
- (struct ib_mad_hdr *)out_mad, &mad_size,
- &out_mad_pkey_index) &
+ if ((dev->ops.process_mad(dev, IB_MAD_IGNORE_MKEY,
+ port_num, NULL, NULL,
+ (const struct ib_mad_hdr *)in_mad, mad_size,
+ (struct ib_mad_hdr *)out_mad, &mad_size,
+ &out_mad_pkey_index) &
(IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY)) !=
(IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY)) {
ret = -EINVAL;
@@ -786,7 +786,7 @@ static int update_hw_stats(struct ib_device *dev, struct rdma_hw_stats *stats,
if (time_is_after_eq_jiffies(stats->timestamp + stats->lifespan))
return 0;
- ret = dev->get_hw_stats(dev, stats, port_num, index);
+ ret = dev->ops.get_hw_stats(dev, stats, port_num, index);
if (ret < 0)
return ret;
if (ret == stats->num_counters)
@@ -946,7 +946,7 @@ static void setup_hw_stats(struct ib_device *device, struct ib_port *port,
struct rdma_hw_stats *stats;
int i, ret;
- stats = device->alloc_hw_stats(device, port_num);
+ stats = device->ops.alloc_hw_stats(device, port_num);
if (!stats)
return;
@@ -964,8 +964,8 @@ static void setup_hw_stats(struct ib_device *device, struct ib_port *port,
if (!hsag)
goto err_free_stats;
- ret = device->get_hw_stats(device, stats, port_num,
- stats->num_counters);
+ ret = device->ops.get_hw_stats(device, stats, port_num,
+ stats->num_counters);
if (ret != stats->num_counters)
goto err_free_hsag;
@@ -1015,9 +1015,7 @@ err_free_stats:
return;
}
-static int add_port(struct ib_device *device, int port_num,
- int (*port_callback)(struct ib_device *,
- u8, struct kobject *))
+static int add_port(struct ib_device *device, int port_num)
{
struct ib_port *p;
struct ib_port_attr attr;
@@ -1057,7 +1055,7 @@ static int add_port(struct ib_device *device, int port_num,
goto err_put;
}
- if (device->process_mad) {
+ if (device->ops.process_mad) {
p->pma_table = get_counter_table(device, port_num);
ret = sysfs_create_group(&p->kobj, p->pma_table);
if (ret)
@@ -1113,8 +1111,8 @@ static int add_port(struct ib_device *device, int port_num,
if (ret)
goto err_free_pkey;
- if (port_callback) {
- ret = port_callback(device, port_num, &p->kobj);
+ if (device->ops.init_port) {
+ ret = device->ops.init_port(device, port_num, &p->kobj);
if (ret)
goto err_remove_pkey;
}
@@ -1124,7 +1122,7 @@ static int add_port(struct ib_device *device, int port_num,
* port, so holder should be device. Therefore skip per port counter
* initialization.
*/
- if (device->alloc_hw_stats && port_num)
+ if (device->ops.alloc_hw_stats && port_num)
setup_hw_stats(device, p, port_num);
list_add_tail(&p->kobj.entry, &device->port_list);
@@ -1189,7 +1187,7 @@ err_put:
static ssize_t node_type_show(struct device *device,
struct device_attribute *attr, char *buf)
{
- struct ib_device *dev = container_of(device, struct ib_device, dev);
+ struct ib_device *dev = rdma_device_to_ibdev(device);
switch (dev->node_type) {
case RDMA_NODE_IB_CA: return sprintf(buf, "%d: CA\n", dev->node_type);
@@ -1206,7 +1204,7 @@ static DEVICE_ATTR_RO(node_type);
static ssize_t sys_image_guid_show(struct device *device,
struct device_attribute *dev_attr, char *buf)
{
- struct ib_device *dev = container_of(device, struct ib_device, dev);
+ struct ib_device *dev = rdma_device_to_ibdev(device);
return sprintf(buf, "%04x:%04x:%04x:%04x\n",
be16_to_cpu(((__be16 *) &dev->attrs.sys_image_guid)[0]),
@@ -1219,7 +1217,7 @@ static DEVICE_ATTR_RO(sys_image_guid);
static ssize_t node_guid_show(struct device *device,
struct device_attribute *attr, char *buf)
{
- struct ib_device *dev = container_of(device, struct ib_device, dev);
+ struct ib_device *dev = rdma_device_to_ibdev(device);
return sprintf(buf, "%04x:%04x:%04x:%04x\n",
be16_to_cpu(((__be16 *) &dev->node_guid)[0]),
@@ -1232,7 +1230,7 @@ static DEVICE_ATTR_RO(node_guid);
static ssize_t node_desc_show(struct device *device,
struct device_attribute *attr, char *buf)
{
- struct ib_device *dev = container_of(device, struct ib_device, dev);
+ struct ib_device *dev = rdma_device_to_ibdev(device);
return sprintf(buf, "%.64s\n", dev->node_desc);
}
@@ -1241,11 +1239,11 @@ static ssize_t node_desc_store(struct device *device,
struct device_attribute *attr,
const char *buf, size_t count)
{
- struct ib_device *dev = container_of(device, struct ib_device, dev);
+ struct ib_device *dev = rdma_device_to_ibdev(device);
struct ib_device_modify desc = {};
int ret;
- if (!dev->modify_device)
+ if (!dev->ops.modify_device)
return -EIO;
memcpy(desc.node_desc, buf, min_t(int, count, IB_DEVICE_NODE_DESC_MAX));
@@ -1260,7 +1258,7 @@ static DEVICE_ATTR_RW(node_desc);
static ssize_t fw_ver_show(struct device *device, struct device_attribute *attr,
char *buf)
{
- struct ib_device *dev = container_of(device, struct ib_device, dev);
+ struct ib_device *dev = rdma_device_to_ibdev(device);
ib_get_device_fw_str(dev, buf);
strlcat(buf, "\n", IB_FW_VERSION_NAME_MAX);
@@ -1277,21 +1275,21 @@ static struct attribute *ib_dev_attrs[] = {
NULL,
};
-static const struct attribute_group dev_attr_group = {
+const struct attribute_group ib_dev_attr_group = {
.attrs = ib_dev_attrs,
};
-static void free_port_list_attributes(struct ib_device *device)
+static void ib_free_port_attrs(struct ib_device *device)
{
struct kobject *p, *t;
list_for_each_entry_safe(p, t, &device->port_list, entry) {
struct ib_port *port = container_of(p, struct ib_port, kobj);
+
list_del(&p->entry);
- if (port->hw_stats) {
- kfree(port->hw_stats);
+ if (port->hw_stats_ag)
free_hsag(&port->kobj, port->hw_stats_ag);
- }
+ kfree(port->hw_stats);
if (port->pma_table)
sysfs_remove_group(p, port->pma_table);
@@ -1308,62 +1306,47 @@ static void free_port_list_attributes(struct ib_device *device)
kobject_put(device->ports_kobj);
}
-int ib_device_register_sysfs(struct ib_device *device,
- int (*port_callback)(struct ib_device *,
- u8, struct kobject *))
+static int ib_setup_port_attrs(struct ib_device *device)
{
- struct device *class_dev = &device->dev;
+ unsigned int port;
int ret;
- int i;
-
- device->groups[0] = &dev_attr_group;
- class_dev->groups = device->groups;
- ret = device_add(class_dev);
- if (ret)
- goto err;
-
- device->ports_kobj = kobject_create_and_add("ports", &class_dev->kobj);
- if (!device->ports_kobj) {
- ret = -ENOMEM;
- goto err_put;
- }
+ device->ports_kobj = kobject_create_and_add("ports", &device->dev.kobj);
+ if (!device->ports_kobj)
+ return -ENOMEM;
- if (rdma_cap_ib_switch(device)) {
- ret = add_port(device, 0, port_callback);
+ rdma_for_each_port (device, port) {
+ ret = add_port(device, port);
if (ret)
goto err_put;
- } else {
- for (i = 1; i <= device->phys_port_cnt; ++i) {
- ret = add_port(device, i, port_callback);
- if (ret)
- goto err_put;
- }
}
- if (device->alloc_hw_stats)
- setup_hw_stats(device, NULL, 0);
-
return 0;
err_put:
- free_port_list_attributes(device);
- device_del(class_dev);
-err:
+ ib_free_port_attrs(device);
return ret;
}
-void ib_device_unregister_sysfs(struct ib_device *device)
+int ib_device_register_sysfs(struct ib_device *device)
{
- /* Hold device until ib_dealloc_device() */
- get_device(&device->dev);
+ int ret;
- free_port_list_attributes(device);
+ ret = ib_setup_port_attrs(device);
+ if (ret)
+ return ret;
+
+ if (device->ops.alloc_hw_stats)
+ setup_hw_stats(device, NULL, 0);
- if (device->hw_stats) {
- kfree(device->hw_stats);
+ return 0;
+}
+
+void ib_device_unregister_sysfs(struct ib_device *device)
+{
+ if (device->hw_stats_ag)
free_hsag(&device->dev.kobj, device->hw_stats_ag);
- }
+ kfree(device->hw_stats);
- device_unregister(&device->dev);
+ ib_free_port_attrs(device);
}
diff --git a/drivers/infiniband/core/ucm.c b/drivers/infiniband/core/ucm.c
index 73332b9a25b5..7541fbaf58a3 100644
--- a/drivers/infiniband/core/ucm.c
+++ b/drivers/infiniband/core/ucm.c
@@ -1242,7 +1242,7 @@ static void ib_ucm_add_one(struct ib_device *device)
dev_t base;
struct ib_ucm_device *ucm_dev;
- if (!device->alloc_ucontext || !rdma_cap_ib_cm(device, 1))
+ if (!device->ops.alloc_ucontext || !rdma_cap_ib_cm(device, 1))
return;
ucm_dev = kzalloc(sizeof *ucm_dev, GFP_KERNEL);
diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c
index 01d68ed46c1b..7468b26b8a01 100644
--- a/drivers/infiniband/core/ucma.c
+++ b/drivers/infiniband/core/ucma.c
@@ -1236,6 +1236,13 @@ static int ucma_set_option_id(struct ucma_context *ctx, int optname,
}
ret = rdma_set_afonly(ctx->cm_id, *((int *) optval) ? 1 : 0);
break;
+ case RDMA_OPTION_ID_ACK_TIMEOUT:
+ if (optlen != sizeof(u8)) {
+ ret = -EINVAL;
+ break;
+ }
+ ret = rdma_set_ack_timeout(ctx->cm_id, *((u8 *)optval));
+ break;
default:
ret = -ENOSYS;
}
diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c
index c6144df47ea4..fe5551562dbc 100644
--- a/drivers/infiniband/core/umem.c
+++ b/drivers/infiniband/core/umem.c
@@ -72,15 +72,16 @@ static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int d
* If access flags indicate ODP memory, avoid pinning. Instead, stores
* the mm for future page fault handling in conjunction with MMU notifiers.
*
- * @context: userspace context to pin memory for
+ * @udata: userspace context to pin memory for
* @addr: userspace virtual address to start at
* @size: length of region to pin
* @access: IB_ACCESS_xxx flags for memory being pinned
* @dmasync: flush in-flight DMA when the memory region is written
*/
-struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr,
+struct ib_umem *ib_umem_get(struct ib_udata *udata, unsigned long addr,
size_t size, int access, int dmasync)
{
+ struct ib_ucontext *context;
struct ib_umem *umem;
struct page **page_list;
struct vm_area_struct **vma_list;
@@ -95,6 +96,14 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr,
struct scatterlist *sg, *sg_list_start;
unsigned int gup_flags = FOLL_WRITE;
+ if (!udata)
+ return ERR_PTR(-EIO);
+
+ context = container_of(udata, struct uverbs_attr_bundle, driver_udata)
+ ->context;
+ if (!context)
+ return ERR_PTR(-EIO);
+
if (dmasync)
dma_attrs |= DMA_ATTR_WRITE_BARRIER;
@@ -160,15 +169,12 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr,
lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
- down_write(&mm->mmap_sem);
- if (check_add_overflow(mm->pinned_vm, npages, &new_pinned) ||
- (new_pinned > lock_limit && !capable(CAP_IPC_LOCK))) {
- up_write(&mm->mmap_sem);
+ new_pinned = atomic64_add_return(npages, &mm->pinned_vm);
+ if (new_pinned > lock_limit && !capable(CAP_IPC_LOCK)) {
+ atomic64_sub(npages, &mm->pinned_vm);
ret = -ENOMEM;
goto out;
}
- mm->pinned_vm = new_pinned;
- up_write(&mm->mmap_sem);
cur_base = addr & PAGE_MASK;
@@ -228,9 +234,7 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr,
umem_release:
__ib_umem_release(context->device, umem, 0);
vma:
- down_write(&mm->mmap_sem);
- mm->pinned_vm -= ib_umem_num_pages(umem);
- up_write(&mm->mmap_sem);
+ atomic64_sub(ib_umem_num_pages(umem), &mm->pinned_vm);
out:
if (vma_list)
free_page((unsigned long) vma_list);
@@ -253,25 +257,12 @@ static void __ib_umem_release_tail(struct ib_umem *umem)
kfree(umem);
}
-static void ib_umem_release_defer(struct work_struct *work)
-{
- struct ib_umem *umem = container_of(work, struct ib_umem, work);
-
- down_write(&umem->owning_mm->mmap_sem);
- umem->owning_mm->pinned_vm -= ib_umem_num_pages(umem);
- up_write(&umem->owning_mm->mmap_sem);
-
- __ib_umem_release_tail(umem);
-}
-
/**
* ib_umem_release - release memory pinned with ib_umem_get
* @umem: umem struct to release
*/
void ib_umem_release(struct ib_umem *umem)
{
- struct ib_ucontext *context = umem->context;
-
if (umem->is_odp) {
ib_umem_odp_release(to_ib_umem_odp(umem));
__ib_umem_release_tail(umem);
@@ -280,26 +271,7 @@ void ib_umem_release(struct ib_umem *umem)
__ib_umem_release(umem->context->device, umem, 1);
- /*
- * We may be called with the mm's mmap_sem already held. This
- * can happen when a userspace munmap() is the call that drops
- * the last reference to our file and calls our release
- * method. If there are memory regions to destroy, we'll end
- * up here and not be able to take the mmap_sem. In that case
- * we defer the vm_locked accounting a workqueue.
- */
- if (context->closing) {
- if (!down_write_trylock(&umem->owning_mm->mmap_sem)) {
- INIT_WORK(&umem->work, ib_umem_release_defer);
- queue_work(ib_wq, &umem->work);
- return;
- }
- } else {
- down_write(&umem->owning_mm->mmap_sem);
- }
- umem->owning_mm->pinned_vm -= ib_umem_num_pages(umem);
- up_write(&umem->owning_mm->mmap_sem);
-
+ atomic64_sub(ib_umem_num_pages(umem), &umem->owning_mm->pinned_vm);
__ib_umem_release_tail(umem);
}
EXPORT_SYMBOL(ib_umem_release);
diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c
index 676c1fd1119d..e6ec79ad9cc8 100644
--- a/drivers/infiniband/core/umem_odp.c
+++ b/drivers/infiniband/core/umem_odp.c
@@ -40,6 +40,7 @@
#include <linux/vmalloc.h>
#include <linux/hugetlb.h>
#include <linux/interval_tree_generic.h>
+#include <linux/pagemap.h>
#include <rdma/ib_verbs.h>
#include <rdma/ib_umem.h>
@@ -146,15 +147,12 @@ static int invalidate_range_start_trampoline(struct ib_umem_odp *item,
}
static int ib_umem_notifier_invalidate_range_start(struct mmu_notifier *mn,
- struct mm_struct *mm,
- unsigned long start,
- unsigned long end,
- bool blockable)
+ const struct mmu_notifier_range *range)
{
struct ib_ucontext_per_mm *per_mm =
container_of(mn, struct ib_ucontext_per_mm, mn);
- if (blockable)
+ if (range->blockable)
down_read(&per_mm->umem_rwsem);
else if (!down_read_trylock(&per_mm->umem_rwsem))
return -EAGAIN;
@@ -169,9 +167,10 @@ static int ib_umem_notifier_invalidate_range_start(struct mmu_notifier *mn,
return 0;
}
- return rbt_ib_umem_for_each_in_range(&per_mm->umem_tree, start, end,
+ return rbt_ib_umem_for_each_in_range(&per_mm->umem_tree, range->start,
+ range->end,
invalidate_range_start_trampoline,
- blockable, NULL);
+ range->blockable, NULL);
}
static int invalidate_range_end_trampoline(struct ib_umem_odp *item, u64 start,
@@ -182,9 +181,7 @@ static int invalidate_range_end_trampoline(struct ib_umem_odp *item, u64 start,
}
static void ib_umem_notifier_invalidate_range_end(struct mmu_notifier *mn,
- struct mm_struct *mm,
- unsigned long start,
- unsigned long end)
+ const struct mmu_notifier_range *range)
{
struct ib_ucontext_per_mm *per_mm =
container_of(mn, struct ib_ucontext_per_mm, mn);
@@ -192,8 +189,8 @@ static void ib_umem_notifier_invalidate_range_end(struct mmu_notifier *mn,
if (unlikely(!per_mm->active))
return;
- rbt_ib_umem_for_each_in_range(&per_mm->umem_tree, start,
- end,
+ rbt_ib_umem_for_each_in_range(&per_mm->umem_tree, range->start,
+ range->end,
invalidate_range_end_trampoline, true, NULL);
up_read(&per_mm->umem_rwsem);
}
@@ -303,7 +300,7 @@ static void free_per_mm(struct rcu_head *rcu)
kfree(container_of(rcu, struct ib_ucontext_per_mm, rcu));
}
-void put_per_mm(struct ib_umem_odp *umem_odp)
+static void put_per_mm(struct ib_umem_odp *umem_odp)
{
struct ib_ucontext_per_mm *per_mm = umem_odp->per_mm;
struct ib_ucontext *ctx = umem_odp->umem.context;
@@ -336,9 +333,10 @@ void put_per_mm(struct ib_umem_odp *umem_odp)
mmu_notifier_call_srcu(&per_mm->rcu, free_per_mm);
}
-struct ib_umem_odp *ib_alloc_odp_umem(struct ib_ucontext_per_mm *per_mm,
+struct ib_umem_odp *ib_alloc_odp_umem(struct ib_umem_odp *root,
unsigned long addr, size_t size)
{
+ struct ib_ucontext_per_mm *per_mm = root->per_mm;
struct ib_ucontext *ctx = per_mm->context;
struct ib_umem_odp *odp_data;
struct ib_umem *umem;
@@ -353,9 +351,11 @@ struct ib_umem_odp *ib_alloc_odp_umem(struct ib_ucontext_per_mm *per_mm,
umem->length = size;
umem->address = addr;
umem->page_shift = PAGE_SHIFT;
- umem->writable = 1;
+ umem->writable = root->umem.writable;
umem->is_odp = 1;
odp_data->per_mm = per_mm;
+ umem->owning_mm = per_mm->mm;
+ mmgrab(umem->owning_mm);
mutex_init(&odp_data->umem_mutex);
init_completion(&odp_data->notifier_completion);
@@ -388,6 +388,7 @@ struct ib_umem_odp *ib_alloc_odp_umem(struct ib_ucontext_per_mm *per_mm,
out_page_list:
vfree(odp_data->page_list);
out_odp_data:
+ mmdrop(umem->owning_mm);
kfree(odp_data);
return ERR_PTR(ret);
}
@@ -618,7 +619,7 @@ int ib_umem_odp_map_dma_pages(struct ib_umem_odp *umem_odp, u64 user_virt,
* mmget_not_zero will fail in this case.
*/
owning_process = get_pid_task(umem_odp->per_mm->tgid, PIDTYPE_PID);
- if (WARN_ON(!mmget_not_zero(umem_odp->umem.owning_mm))) {
+ if (!owning_process || !mmget_not_zero(owning_mm)) {
ret = -EINVAL;
goto out_put_task;
}
@@ -647,8 +648,13 @@ int ib_umem_odp_map_dma_pages(struct ib_umem_odp *umem_odp, u64 user_virt,
flags, local_page_list, NULL, NULL);
up_read(&owning_mm->mmap_sem);
- if (npages < 0)
+ if (npages < 0) {
+ if (npages != -EAGAIN)
+ pr_warn("fail to get %zu user pages with error %d\n", gup_num_pages, npages);
+ else
+ pr_debug("fail to get %zu user pages with error %d\n", gup_num_pages, npages);
break;
+ }
bcnt -= min_t(size_t, npages << PAGE_SHIFT, bcnt);
mutex_lock(&umem_odp->umem_mutex);
@@ -666,8 +672,13 @@ int ib_umem_odp_map_dma_pages(struct ib_umem_odp *umem_odp, u64 user_virt,
ret = ib_umem_odp_map_dma_single_page(
umem_odp, k, local_page_list[j],
access_mask, current_seq);
- if (ret < 0)
+ if (ret < 0) {
+ if (ret != -EAGAIN)
+ pr_warn("ib_umem_odp_map_dma_single_page failed with error %d\n", ret);
+ else
+ pr_debug("ib_umem_odp_map_dma_single_page failed with error %d\n", ret);
break;
+ }
p = page_to_phys(local_page_list[j]);
k++;
@@ -675,9 +686,14 @@ int ib_umem_odp_map_dma_pages(struct ib_umem_odp *umem_odp, u64 user_virt,
mutex_unlock(&umem_odp->umem_mutex);
if (ret < 0) {
- /* Release left over pages when handling errors. */
- for (++j; j < npages; ++j)
- put_page(local_page_list[j]);
+ /*
+ * Release pages, remembering that the first page
+ * to hit an error was already released by
+ * ib_umem_odp_map_dma_single_page().
+ */
+ if (npages - (j + 1) > 0)
+ release_pages(&local_page_list[j+1],
+ npages - (j + 1));
break;
}
}
diff --git a/drivers/infiniband/core/user_mad.c b/drivers/infiniband/core/user_mad.c
index f55f48f6b272..02b7947ab215 100644
--- a/drivers/infiniband/core/user_mad.c
+++ b/drivers/infiniband/core/user_mad.c
@@ -88,10 +88,9 @@ enum {
struct ib_umad_port {
struct cdev cdev;
- struct device *dev;
-
+ struct device dev;
struct cdev sm_cdev;
- struct device *sm_dev;
+ struct device sm_dev;
struct semaphore sm_sem;
struct mutex file_mutex;
@@ -104,8 +103,8 @@ struct ib_umad_port {
};
struct ib_umad_device {
- struct kobject kobj;
- struct ib_umad_port port[0];
+ struct kref kref;
+ struct ib_umad_port ports[];
};
struct ib_umad_file {
@@ -130,8 +129,6 @@ struct ib_umad_packet {
struct ib_user_mad mad;
};
-static struct class *umad_class;
-
static const dev_t base_umad_dev = MKDEV(IB_UMAD_MAJOR, IB_UMAD_MINOR_BASE);
static const dev_t base_issm_dev = MKDEV(IB_UMAD_MAJOR, IB_UMAD_MINOR_BASE) +
IB_UMAD_NUM_FIXED_MINOR;
@@ -143,17 +140,23 @@ static DEFINE_IDA(umad_ida);
static void ib_umad_add_one(struct ib_device *device);
static void ib_umad_remove_one(struct ib_device *device, void *client_data);
-static void ib_umad_release_dev(struct kobject *kobj)
+static void ib_umad_dev_free(struct kref *kref)
{
struct ib_umad_device *dev =
- container_of(kobj, struct ib_umad_device, kobj);
+ container_of(kref, struct ib_umad_device, kref);
kfree(dev);
}
-static struct kobj_type ib_umad_dev_ktype = {
- .release = ib_umad_release_dev,
-};
+static void ib_umad_dev_get(struct ib_umad_device *dev)
+{
+ kref_get(&dev->kref);
+}
+
+static void ib_umad_dev_put(struct ib_umad_device *dev)
+{
+ kref_put(&dev->kref, ib_umad_dev_free);
+}
static int hdr_size(struct ib_umad_file *file)
{
@@ -205,7 +208,7 @@ static void send_handler(struct ib_mad_agent *agent,
struct ib_umad_packet *packet = send_wc->send_buf->context[0];
dequeue_send(file, packet);
- rdma_destroy_ah(packet->msg->ah);
+ rdma_destroy_ah(packet->msg->ah, RDMA_DESTROY_AH_SLEEPABLE);
ib_free_send_mad(packet->msg);
if (send_wc->status == IB_WC_RESP_TIMEOUT_ERR) {
@@ -621,7 +624,7 @@ err_send:
err_msg:
ib_free_send_mad(packet->msg);
err_ah:
- rdma_destroy_ah(ah);
+ rdma_destroy_ah(ah, RDMA_DESTROY_AH_SLEEPABLE);
err_up:
mutex_unlock(&file->mutex);
err:
@@ -657,7 +660,7 @@ static int ib_umad_reg_agent(struct ib_umad_file *file, void __user *arg,
mutex_lock(&file->mutex);
if (!file->port->ib_dev) {
- dev_notice(file->port->dev,
+ dev_notice(&file->port->dev,
"ib_umad_reg_agent: invalid device\n");
ret = -EPIPE;
goto out;
@@ -669,7 +672,7 @@ static int ib_umad_reg_agent(struct ib_umad_file *file, void __user *arg,
}
if (ureq.qpn != 0 && ureq.qpn != 1) {
- dev_notice(file->port->dev,
+ dev_notice(&file->port->dev,
"ib_umad_reg_agent: invalid QPN %d specified\n",
ureq.qpn);
ret = -EINVAL;
@@ -680,7 +683,7 @@ static int ib_umad_reg_agent(struct ib_umad_file *file, void __user *arg,
if (!__get_agent(file, agent_id))
goto found;
- dev_notice(file->port->dev,
+ dev_notice(&file->port->dev,
"ib_umad_reg_agent: Max Agents (%u) reached\n",
IB_UMAD_MAX_AGENTS);
ret = -ENOMEM;
@@ -725,10 +728,10 @@ found:
if (!file->already_used) {
file->already_used = 1;
if (!file->use_pkey_index) {
- dev_warn(file->port->dev,
+ dev_warn(&file->port->dev,
"process %s did not enable P_Key index support.\n",
current->comm);
- dev_warn(file->port->dev,
+ dev_warn(&file->port->dev,
" Documentation/infiniband/user_mad.txt has info on the new ABI.\n");
}
}
@@ -759,7 +762,7 @@ static int ib_umad_reg_agent2(struct ib_umad_file *file, void __user *arg)
mutex_lock(&file->mutex);
if (!file->port->ib_dev) {
- dev_notice(file->port->dev,
+ dev_notice(&file->port->dev,
"ib_umad_reg_agent2: invalid device\n");
ret = -EPIPE;
goto out;
@@ -771,7 +774,7 @@ static int ib_umad_reg_agent2(struct ib_umad_file *file, void __user *arg)
}
if (ureq.qpn != 0 && ureq.qpn != 1) {
- dev_notice(file->port->dev,
+ dev_notice(&file->port->dev,
"ib_umad_reg_agent2: invalid QPN %d specified\n",
ureq.qpn);
ret = -EINVAL;
@@ -779,7 +782,7 @@ static int ib_umad_reg_agent2(struct ib_umad_file *file, void __user *arg)
}
if (ureq.flags & ~IB_USER_MAD_REG_FLAGS_CAP) {
- dev_notice(file->port->dev,
+ dev_notice(&file->port->dev,
"ib_umad_reg_agent2 failed: invalid registration flags specified 0x%x; supported 0x%x\n",
ureq.flags, IB_USER_MAD_REG_FLAGS_CAP);
ret = -EINVAL;
@@ -796,7 +799,7 @@ static int ib_umad_reg_agent2(struct ib_umad_file *file, void __user *arg)
if (!__get_agent(file, agent_id))
goto found;
- dev_notice(file->port->dev,
+ dev_notice(&file->port->dev,
"ib_umad_reg_agent2: Max Agents (%u) reached\n",
IB_UMAD_MAX_AGENTS);
ret = -ENOMEM;
@@ -808,7 +811,7 @@ found:
req.mgmt_class = ureq.mgmt_class;
req.mgmt_class_version = ureq.mgmt_class_version;
if (ureq.oui & 0xff000000) {
- dev_notice(file->port->dev,
+ dev_notice(&file->port->dev,
"ib_umad_reg_agent2 failed: oui invalid 0x%08x\n",
ureq.oui);
ret = -EINVAL;
@@ -954,19 +957,22 @@ static int ib_umad_open(struct inode *inode, struct file *filp)
{
struct ib_umad_port *port;
struct ib_umad_file *file;
- int ret = -ENXIO;
+ int ret = 0;
port = container_of(inode->i_cdev, struct ib_umad_port, cdev);
mutex_lock(&port->file_mutex);
- if (!port->ib_dev)
+ if (!port->ib_dev) {
+ ret = -ENXIO;
goto out;
+ }
- ret = -ENOMEM;
- file = kzalloc(sizeof *file, GFP_KERNEL);
- if (!file)
+ file = kzalloc(sizeof(*file), GFP_KERNEL);
+ if (!file) {
+ ret = -ENOMEM;
goto out;
+ }
mutex_init(&file->mutex);
spin_lock_init(&file->send_lock);
@@ -979,15 +985,7 @@ static int ib_umad_open(struct inode *inode, struct file *filp)
list_add_tail(&file->port_list, &port->file_list);
- ret = nonseekable_open(inode, filp);
- if (ret) {
- list_del(&file->port_list);
- kfree(file);
- goto out;
- }
-
- kobject_get(&port->umad_dev->kobj);
-
+ nonseekable_open(inode, filp);
out:
mutex_unlock(&port->file_mutex);
return ret;
@@ -996,7 +994,6 @@ out:
static int ib_umad_close(struct inode *inode, struct file *filp)
{
struct ib_umad_file *file = filp->private_data;
- struct ib_umad_device *dev = file->port->umad_dev;
struct ib_umad_packet *packet, *tmp;
int already_dead;
int i;
@@ -1025,8 +1022,6 @@ static int ib_umad_close(struct inode *inode, struct file *filp)
mutex_unlock(&file->port->file_mutex);
kfree(file);
- kobject_put(&dev->kobj);
-
return 0;
}
@@ -1072,18 +1067,9 @@ static int ib_umad_sm_open(struct inode *inode, struct file *filp)
filp->private_data = port;
- ret = nonseekable_open(inode, filp);
- if (ret)
- goto err_clr_sm_cap;
-
- kobject_get(&port->umad_dev->kobj);
-
+ nonseekable_open(inode, filp);
return 0;
-err_clr_sm_cap:
- swap(props.set_port_cap_mask, props.clr_port_cap_mask);
- ib_modify_port(port->ib_dev, port->port_num, 0, &props);
-
err_up_sem:
up(&port->sm_sem);
@@ -1106,8 +1092,6 @@ static int ib_umad_sm_close(struct inode *inode, struct file *filp)
up(&port->sm_sem);
- kobject_put(&port->umad_dev->kobj);
-
return ret;
}
@@ -1124,7 +1108,7 @@ static struct ib_client umad_client = {
.remove = ib_umad_remove_one
};
-static ssize_t show_ibdev(struct device *dev, struct device_attribute *attr,
+static ssize_t ibdev_show(struct device *dev, struct device_attribute *attr,
char *buf)
{
struct ib_umad_port *port = dev_get_drvdata(dev);
@@ -1134,9 +1118,9 @@ static ssize_t show_ibdev(struct device *dev, struct device_attribute *attr,
return sprintf(buf, "%s\n", dev_name(&port->ib_dev->dev));
}
-static DEVICE_ATTR(ibdev, S_IRUGO, show_ibdev, NULL);
+static DEVICE_ATTR_RO(ibdev);
-static ssize_t show_port(struct device *dev, struct device_attribute *attr,
+static ssize_t port_show(struct device *dev, struct device_attribute *attr,
char *buf)
{
struct ib_umad_port *port = dev_get_drvdata(dev);
@@ -1146,10 +1130,59 @@ static ssize_t show_port(struct device *dev, struct device_attribute *attr,
return sprintf(buf, "%d\n", port->port_num);
}
-static DEVICE_ATTR(port, S_IRUGO, show_port, NULL);
+static DEVICE_ATTR_RO(port);
+
+static struct attribute *umad_class_dev_attrs[] = {
+ &dev_attr_ibdev.attr,
+ &dev_attr_port.attr,
+ NULL,
+};
+ATTRIBUTE_GROUPS(umad_class_dev);
+
+static char *umad_devnode(struct device *dev, umode_t *mode)
+{
+ return kasprintf(GFP_KERNEL, "infiniband/%s", dev_name(dev));
+}
+
+static ssize_t abi_version_show(struct class *class,
+ struct class_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%d\n", IB_USER_MAD_ABI_VERSION);
+}
+static CLASS_ATTR_RO(abi_version);
+
+static struct attribute *umad_class_attrs[] = {
+ &class_attr_abi_version.attr,
+ NULL,
+};
+ATTRIBUTE_GROUPS(umad_class);
+
+static struct class umad_class = {
+ .name = "infiniband_mad",
+ .devnode = umad_devnode,
+ .class_groups = umad_class_groups,
+ .dev_groups = umad_class_dev_groups,
+};
+
+static void ib_umad_release_port(struct device *device)
+{
+ struct ib_umad_port *port = dev_get_drvdata(device);
+ struct ib_umad_device *umad_dev = port->umad_dev;
+
+ ib_umad_dev_put(umad_dev);
+}
-static CLASS_ATTR_STRING(abi_version, S_IRUGO,
- __stringify(IB_USER_MAD_ABI_VERSION));
+static void ib_umad_init_port_dev(struct device *dev,
+ struct ib_umad_port *port,
+ const struct ib_device *device)
+{
+ device_initialize(dev);
+ ib_umad_dev_get(port->umad_dev);
+ dev->class = &umad_class;
+ dev->parent = device->dev.parent;
+ dev_set_drvdata(dev, port);
+ dev->release = ib_umad_release_port;
+}
static int ib_umad_init_port(struct ib_device *device, int port_num,
struct ib_umad_device *umad_dev,
@@ -1158,6 +1191,7 @@ static int ib_umad_init_port(struct ib_device *device, int port_num,
int devnum;
dev_t base_umad;
dev_t base_issm;
+ int ret;
devnum = ida_alloc_max(&umad_ida, IB_UMAD_MAX_PORTS - 1, GFP_KERNEL);
if (devnum < 0)
@@ -1172,63 +1206,41 @@ static int ib_umad_init_port(struct ib_device *device, int port_num,
}
port->ib_dev = device;
+ port->umad_dev = umad_dev;
port->port_num = port_num;
sema_init(&port->sm_sem, 1);
mutex_init(&port->file_mutex);
INIT_LIST_HEAD(&port->file_list);
+ ib_umad_init_port_dev(&port->dev, port, device);
+ port->dev.devt = base_umad;
+ dev_set_name(&port->dev, "umad%d", port->dev_num);
cdev_init(&port->cdev, &umad_fops);
port->cdev.owner = THIS_MODULE;
- cdev_set_parent(&port->cdev, &umad_dev->kobj);
- kobject_set_name(&port->cdev.kobj, "umad%d", port->dev_num);
- if (cdev_add(&port->cdev, base_umad, 1))
- goto err_cdev;
- port->dev = device_create(umad_class, device->dev.parent,
- port->cdev.dev, port,
- "umad%d", port->dev_num);
- if (IS_ERR(port->dev))
+ ret = cdev_device_add(&port->cdev, &port->dev);
+ if (ret)
goto err_cdev;
- if (device_create_file(port->dev, &dev_attr_ibdev))
- goto err_dev;
- if (device_create_file(port->dev, &dev_attr_port))
- goto err_dev;
-
+ ib_umad_init_port_dev(&port->sm_dev, port, device);
+ port->sm_dev.devt = base_issm;
+ dev_set_name(&port->sm_dev, "issm%d", port->dev_num);
cdev_init(&port->sm_cdev, &umad_sm_fops);
port->sm_cdev.owner = THIS_MODULE;
- cdev_set_parent(&port->sm_cdev, &umad_dev->kobj);
- kobject_set_name(&port->sm_cdev.kobj, "issm%d", port->dev_num);
- if (cdev_add(&port->sm_cdev, base_issm, 1))
- goto err_sm_cdev;
-
- port->sm_dev = device_create(umad_class, device->dev.parent,
- port->sm_cdev.dev, port,
- "issm%d", port->dev_num);
- if (IS_ERR(port->sm_dev))
- goto err_sm_cdev;
-
- if (device_create_file(port->sm_dev, &dev_attr_ibdev))
- goto err_sm_dev;
- if (device_create_file(port->sm_dev, &dev_attr_port))
- goto err_sm_dev;
- return 0;
-
-err_sm_dev:
- device_destroy(umad_class, port->sm_cdev.dev);
+ ret = cdev_device_add(&port->sm_cdev, &port->sm_dev);
+ if (ret)
+ goto err_dev;
-err_sm_cdev:
- cdev_del(&port->sm_cdev);
+ return 0;
err_dev:
- device_destroy(umad_class, port->cdev.dev);
-
+ put_device(&port->sm_dev);
+ cdev_device_del(&port->cdev, &port->dev);
err_cdev:
- cdev_del(&port->cdev);
+ put_device(&port->dev);
ida_free(&umad_ida, devnum);
-
- return -1;
+ return ret;
}
static void ib_umad_kill_port(struct ib_umad_port *port)
@@ -1236,17 +1248,11 @@ static void ib_umad_kill_port(struct ib_umad_port *port)
struct ib_umad_file *file;
int id;
- dev_set_drvdata(port->dev, NULL);
- dev_set_drvdata(port->sm_dev, NULL);
-
- device_destroy(umad_class, port->cdev.dev);
- device_destroy(umad_class, port->sm_cdev.dev);
-
- cdev_del(&port->cdev);
- cdev_del(&port->sm_cdev);
-
mutex_lock(&port->file_mutex);
+ /* Mark ib_dev NULL and block ioctl or other file ops to progress
+ * further.
+ */
port->ib_dev = NULL;
list_for_each_entry(file, &port->file_list, port_list) {
@@ -1260,7 +1266,14 @@ static void ib_umad_kill_port(struct ib_umad_port *port)
}
mutex_unlock(&port->file_mutex);
+
+ cdev_device_del(&port->sm_cdev, &port->sm_dev);
+ cdev_device_del(&port->cdev, &port->dev);
ida_free(&umad_ida, port->dev_num);
+
+ /* balances device_initialize() */
+ put_device(&port->sm_dev);
+ put_device(&port->dev);
}
static void ib_umad_add_one(struct ib_device *device)
@@ -1272,22 +1285,17 @@ static void ib_umad_add_one(struct ib_device *device)
s = rdma_start_port(device);
e = rdma_end_port(device);
- umad_dev = kzalloc(sizeof *umad_dev +
- (e - s + 1) * sizeof (struct ib_umad_port),
- GFP_KERNEL);
+ umad_dev = kzalloc(struct_size(umad_dev, ports, e - s + 1), GFP_KERNEL);
if (!umad_dev)
return;
- kobject_init(&umad_dev->kobj, &ib_umad_dev_ktype);
-
+ kref_init(&umad_dev->kref);
for (i = s; i <= e; ++i) {
if (!rdma_cap_ib_mad(device, i))
continue;
- umad_dev->port[i - s].umad_dev = umad_dev;
-
if (ib_umad_init_port(device, i, umad_dev,
- &umad_dev->port[i - s]))
+ &umad_dev->ports[i - s]))
goto err;
count++;
@@ -1305,31 +1313,28 @@ err:
if (!rdma_cap_ib_mad(device, i))
continue;
- ib_umad_kill_port(&umad_dev->port[i - s]);
+ ib_umad_kill_port(&umad_dev->ports[i - s]);
}
free:
- kobject_put(&umad_dev->kobj);
+ /* balances kref_init */
+ ib_umad_dev_put(umad_dev);
}
static void ib_umad_remove_one(struct ib_device *device, void *client_data)
{
struct ib_umad_device *umad_dev = client_data;
- int i;
+ unsigned int i;
if (!umad_dev)
return;
- for (i = 0; i <= rdma_end_port(device) - rdma_start_port(device); ++i) {
- if (rdma_cap_ib_mad(device, i + rdma_start_port(device)))
- ib_umad_kill_port(&umad_dev->port[i]);
+ rdma_for_each_port (device, i) {
+ if (rdma_cap_ib_mad(device, i))
+ ib_umad_kill_port(
+ &umad_dev->ports[i - rdma_start_port(device)]);
}
-
- kobject_put(&umad_dev->kobj);
-}
-
-static char *umad_devnode(struct device *dev, umode_t *mode)
-{
- return kasprintf(GFP_KERNEL, "infiniband/%s", dev_name(dev));
+ /* balances kref_init() */
+ ib_umad_dev_put(umad_dev);
}
static int __init ib_umad_init(void)
@@ -1338,7 +1343,7 @@ static int __init ib_umad_init(void)
ret = register_chrdev_region(base_umad_dev,
IB_UMAD_NUM_FIXED_MINOR * 2,
- "infiniband_mad");
+ umad_class.name);
if (ret) {
pr_err("couldn't register device number\n");
goto out;
@@ -1346,28 +1351,19 @@ static int __init ib_umad_init(void)
ret = alloc_chrdev_region(&dynamic_umad_dev, 0,
IB_UMAD_NUM_DYNAMIC_MINOR * 2,
- "infiniband_mad");
+ umad_class.name);
if (ret) {
pr_err("couldn't register dynamic device number\n");
goto out_alloc;
}
dynamic_issm_dev = dynamic_umad_dev + IB_UMAD_NUM_DYNAMIC_MINOR;
- umad_class = class_create(THIS_MODULE, "infiniband_mad");
- if (IS_ERR(umad_class)) {
- ret = PTR_ERR(umad_class);
+ ret = class_register(&umad_class);
+ if (ret) {
pr_err("couldn't create class infiniband_mad\n");
goto out_chrdev;
}
- umad_class->devnode = umad_devnode;
-
- ret = class_create_file(umad_class, &class_attr_abi_version.attr);
- if (ret) {
- pr_err("couldn't create abi_version attribute\n");
- goto out_class;
- }
-
ret = ib_register_client(&umad_client);
if (ret) {
pr_err("couldn't register ib_umad client\n");
@@ -1377,7 +1373,7 @@ static int __init ib_umad_init(void)
return 0;
out_class:
- class_destroy(umad_class);
+ class_unregister(&umad_class);
out_chrdev:
unregister_chrdev_region(dynamic_umad_dev,
@@ -1394,7 +1390,7 @@ out:
static void __exit ib_umad_cleanup(void)
{
ib_unregister_client(&umad_client);
- class_destroy(umad_class);
+ class_unregister(&umad_class);
unregister_chrdev_region(base_umad_dev,
IB_UMAD_NUM_FIXED_MINOR * 2);
unregister_chrdev_region(dynamic_umad_dev,
diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h
index c97935a0c7c6..32cc8fe7902f 100644
--- a/drivers/infiniband/core/uverbs.h
+++ b/drivers/infiniband/core/uverbs.h
@@ -160,9 +160,7 @@ struct ib_uverbs_file {
struct mutex umap_lock;
struct list_head umaps;
-
- u64 uverbs_cmd_mask;
- u64 uverbs_ex_cmd_mask;
+ struct page *disassociate_page;
struct idr idr;
/* spinlock protects write access to idr */
@@ -249,7 +247,6 @@ int uverbs_dealloc_mw(struct ib_mw *mw);
void ib_uverbs_detach_umcast(struct ib_qp *qp,
struct ib_uqp_object *uobj);
-void create_udata(struct uverbs_attr_bundle *ctx, struct ib_udata *udata);
long ib_uverbs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg);
struct ib_uverbs_flow_spec {
@@ -297,63 +294,29 @@ extern const struct uverbs_object_def UVERBS_OBJECT(UVERBS_OBJECT_FLOW_ACTION);
extern const struct uverbs_object_def UVERBS_OBJECT(UVERBS_OBJECT_DM);
extern const struct uverbs_object_def UVERBS_OBJECT(UVERBS_OBJECT_COUNTERS);
-#define IB_UVERBS_DECLARE_CMD(name) \
- ssize_t ib_uverbs_##name(struct ib_uverbs_file *file, \
- const char __user *buf, int in_len, \
- int out_len)
-
-IB_UVERBS_DECLARE_CMD(get_context);
-IB_UVERBS_DECLARE_CMD(query_device);
-IB_UVERBS_DECLARE_CMD(query_port);
-IB_UVERBS_DECLARE_CMD(alloc_pd);
-IB_UVERBS_DECLARE_CMD(dealloc_pd);
-IB_UVERBS_DECLARE_CMD(reg_mr);
-IB_UVERBS_DECLARE_CMD(rereg_mr);
-IB_UVERBS_DECLARE_CMD(dereg_mr);
-IB_UVERBS_DECLARE_CMD(alloc_mw);
-IB_UVERBS_DECLARE_CMD(dealloc_mw);
-IB_UVERBS_DECLARE_CMD(create_comp_channel);
-IB_UVERBS_DECLARE_CMD(create_cq);
-IB_UVERBS_DECLARE_CMD(resize_cq);
-IB_UVERBS_DECLARE_CMD(poll_cq);
-IB_UVERBS_DECLARE_CMD(req_notify_cq);
-IB_UVERBS_DECLARE_CMD(destroy_cq);
-IB_UVERBS_DECLARE_CMD(create_qp);
-IB_UVERBS_DECLARE_CMD(open_qp);
-IB_UVERBS_DECLARE_CMD(query_qp);
-IB_UVERBS_DECLARE_CMD(modify_qp);
-IB_UVERBS_DECLARE_CMD(destroy_qp);
-IB_UVERBS_DECLARE_CMD(post_send);
-IB_UVERBS_DECLARE_CMD(post_recv);
-IB_UVERBS_DECLARE_CMD(post_srq_recv);
-IB_UVERBS_DECLARE_CMD(create_ah);
-IB_UVERBS_DECLARE_CMD(destroy_ah);
-IB_UVERBS_DECLARE_CMD(attach_mcast);
-IB_UVERBS_DECLARE_CMD(detach_mcast);
-IB_UVERBS_DECLARE_CMD(create_srq);
-IB_UVERBS_DECLARE_CMD(modify_srq);
-IB_UVERBS_DECLARE_CMD(query_srq);
-IB_UVERBS_DECLARE_CMD(destroy_srq);
-IB_UVERBS_DECLARE_CMD(create_xsrq);
-IB_UVERBS_DECLARE_CMD(open_xrcd);
-IB_UVERBS_DECLARE_CMD(close_xrcd);
-
-#define IB_UVERBS_DECLARE_EX_CMD(name) \
- int ib_uverbs_ex_##name(struct ib_uverbs_file *file, \
- struct ib_udata *ucore, \
- struct ib_udata *uhw)
-
-IB_UVERBS_DECLARE_EX_CMD(create_flow);
-IB_UVERBS_DECLARE_EX_CMD(destroy_flow);
-IB_UVERBS_DECLARE_EX_CMD(query_device);
-IB_UVERBS_DECLARE_EX_CMD(create_cq);
-IB_UVERBS_DECLARE_EX_CMD(create_qp);
-IB_UVERBS_DECLARE_EX_CMD(create_wq);
-IB_UVERBS_DECLARE_EX_CMD(modify_wq);
-IB_UVERBS_DECLARE_EX_CMD(destroy_wq);
-IB_UVERBS_DECLARE_EX_CMD(create_rwq_ind_table);
-IB_UVERBS_DECLARE_EX_CMD(destroy_rwq_ind_table);
-IB_UVERBS_DECLARE_EX_CMD(modify_qp);
-IB_UVERBS_DECLARE_EX_CMD(modify_cq);
+/*
+ * ib_uverbs_query_port_resp.port_cap_flags started out as just a copy of the
+ * PortInfo CapabilityMask, but was extended with unique bits.
+ */
+static inline u32 make_port_cap_flags(const struct ib_port_attr *attr)
+{
+ u32 res;
+
+ /* All IBA CapabilityMask bits are passed through here, except bit 26,
+ * which is overridden with IP_BASED_GIDS. This is due to a historical
+ * mistake in the implementation of IP_BASED_GIDS. Otherwise all other
+ * bits match the IBA definition across all kernel versions.
+ */
+ res = attr->port_cap_flags & ~(u32)IB_UVERBS_PCF_IP_BASED_GIDS;
+
+ if (attr->ip_gids)
+ res |= IB_UVERBS_PCF_IP_BASED_GIDS;
+
+ return res;
+}
+
+void copy_port_attr_to_resp(struct ib_port_attr *attr,
+ struct ib_uverbs_query_port_resp *resp,
+ struct ib_device *ib_dev, u8 port_num);
#endif /* UVERBS_H */
diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c
index a93853770e3c..062a86c04123 100644
--- a/drivers/infiniband/core/uverbs_cmd.c
+++ b/drivers/infiniband/core/uverbs_cmd.c
@@ -47,11 +47,138 @@
#include "uverbs.h"
#include "core_priv.h"
+/*
+ * Hand a response structure back to userspace.
+ *
+ * When 'resp' is longer than the buffer the user supplied, the excess is
+ * dropped without an error. When the user buffer is the longer one, the
+ * bytes past 'resp_len' are zero filled. This lets the output structures
+ * grow in the future without breaking existing callers.
+ *
+ * Returns 0 on success or -EFAULT if the user buffer cannot be written; when
+ * UVERBS_ATTR_CORE_OUT is valid the result of
+ * uverbs_copy_to_struct_or_zero() is returned instead.
+ */
+static int uverbs_response(struct uverbs_attr_bundle *attrs, const void *resp,
+			   size_t resp_len)
+{
+	size_t copy_len;
+
+	if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_CORE_OUT))
+		return uverbs_copy_to_struct_or_zero(attrs,
+						     UVERBS_ATTR_CORE_OUT,
+						     resp, resp_len);
+
+	copy_len = min(attrs->ucore.outlen, resp_len);
+	if (copy_to_user(attrs->ucore.outbuf, resp, copy_len))
+		return -EFAULT;
+
+	/* Nothing left over in the user buffer: done. */
+	if (resp_len >= attrs->ucore.outlen)
+		return 0;
+
+	/* Zero whatever extra space userspace provided beyond 'resp'. */
+	if (clear_user(attrs->ucore.outbuf + resp_len,
+		       attrs->ucore.outlen - resp_len))
+		return -EFAULT;
+
+	return 0;
+}
+
+/*
+ * Fetch a request structure from userspace.
+ *
+ * A user buffer shorter than 'req' is zero extended to fill 'req'. A user
+ * buffer longer than 'req' is accepted only if every byte that does not fit
+ * into 'req' is zero.
+ *
+ * Returns 0 on success, -EFAULT if the user buffer cannot be read, or
+ * -EOPNOTSUPP if the uncopied tail of the user buffer is not all zero.
+ */
+static int uverbs_request(struct uverbs_attr_bundle *attrs, void *req,
+			  size_t req_len)
+{
+	size_t user_len = attrs->ucore.inlen;
+
+	if (copy_from_user(req, attrs->ucore.inbuf, min(user_len, req_len)))
+		return -EFAULT;
+
+	if (user_len < req_len) {
+		/* Short input: the part of 'req' the user omitted reads as 0. */
+		memset(req + user_len, 0, req_len - user_len);
+		return 0;
+	}
+
+	/* Long input: the part that did not fit must be zero. */
+	if (user_len > req_len &&
+	    !ib_is_buffer_cleared(attrs->ucore.inbuf + req_len,
+				  user_len - req_len))
+		return -EOPNOTSUPP;
+
+	return 0;
+}
+
+/*
+ * Compute the 'response_length' value used by the write_ex protocol: the
+ * number of bytes the kernel actually wrote, i.e. the smaller of the user's
+ * output buffer length and 'resp_len'. Userspace can compare it against the
+ * structure layout to learn which response members the kernel understood.
+ */
+static u32 uverbs_response_length(struct uverbs_attr_bundle *attrs,
+				  size_t resp_len)
+{
+	size_t written = attrs->ucore.outlen;
+
+	if (resp_len < written)
+		written = resp_len;
+
+	return written;
+}
+
+/*
+ * The iterator version of the request interface is for handlers that need to
+ * step over a flex array at the end of a command header.
+ *
+ * uverbs_request_start() initialises both fields; uverbs_request_next() and
+ * uverbs_request_next_ptr() advance 'cur'; uverbs_request_finish() inspects
+ * the bytes between 'cur' and 'end'.
+ */
+struct uverbs_req_iter {
+ /* Next unconsumed byte of the user input buffer. */
+ const void __user *cur;
+ /* One past the last byte of the user input buffer (inbuf + inlen). */
+ const void __user *end;
+};
+
+/*
+ * Begin iterating over a request: copy the fixed-size header of 'req_len'
+ * bytes into 'req' and position 'iter' on the data that follows it.
+ *
+ * Returns 0 on success, -ENOSPC if the user buffer is shorter than the
+ * header, or -EFAULT if the header cannot be read.
+ */
+static int uverbs_request_start(struct uverbs_attr_bundle *attrs,
+				struct uverbs_req_iter *iter,
+				void *req,
+				size_t req_len)
+{
+	const void __user *in = attrs->ucore.inbuf;
+
+	if (req_len > attrs->ucore.inlen)
+		return -ENOSPC;
+
+	if (copy_from_user(req, in, req_len))
+		return -EFAULT;
+
+	iter->end = in + attrs->ucore.inlen;
+	iter->cur = in + req_len;
+	return 0;
+}
+
+/*
+ * Copy the next 'len' bytes of the request into 'val' and advance the
+ * iterator past them.
+ *
+ * Returns 0 on success, -ENOSPC if fewer than 'len' bytes remain, or -EFAULT
+ * if the user memory cannot be read. The iterator is left untouched on
+ * failure.
+ */
+static int uverbs_request_next(struct uverbs_req_iter *iter, void *val,
+			       size_t len)
+{
+	/*
+	 * Compare against the remaining byte count instead of forming
+	 * 'cur + len': with a large user-controlled 'len' that sum can wrap
+	 * around the address space and slip below 'end'. 'cur' never passes
+	 * 'end', so the subtraction cannot go negative.
+	 */
+	if (len > (size_t)(iter->end - iter->cur))
+		return -ENOSPC;
+
+	if (copy_from_user(val, iter->cur, len))
+		return -EFAULT;
+
+	iter->cur += len;
+	return 0;
+}
+
+/*
+ * Reserve the next 'len' bytes of the request without copying them.
+ *
+ * Returns the user pointer to the start of the reserved region and advances
+ * the iterator past it, or ERR_PTR(-ENOSPC) if fewer than 'len' bytes remain
+ * (the iterator is then left untouched).
+ */
+static const void __user *uverbs_request_next_ptr(struct uverbs_req_iter *iter,
+						  size_t len)
+{
+	const void __user *res = iter->cur;
+
+	/*
+	 * Compare against the remaining byte count instead of forming
+	 * 'cur + len': with a large user-controlled 'len' that sum can wrap
+	 * around the address space and slip below 'end'. 'cur' never passes
+	 * 'end', so the subtraction cannot go negative.
+	 */
+	if (len > (size_t)(iter->end - iter->cur))
+		return ERR_PTR(-ENOSPC);
+
+	iter->cur += len;
+	return res;
+}
+
+/*
+ * Finish iterating over a request: whatever the handler did not consume must
+ * pass ib_is_buffer_cleared(), otherwise the request is rejected with
+ * -EOPNOTSUPP. Returns 0 when the unconsumed tail is acceptable.
+ */
+static int uverbs_request_finish(struct uverbs_req_iter *iter)
+{
+	size_t trailing = iter->end - iter->cur;
+
+	return ib_is_buffer_cleared(iter->cur, trailing) ? 0 : -EOPNOTSUPP;
+}
+
static struct ib_uverbs_completion_event_file *
-_ib_uverbs_lookup_comp_file(s32 fd, struct ib_uverbs_file *ufile)
+_ib_uverbs_lookup_comp_file(s32 fd, const struct uverbs_attr_bundle *attrs)
{
struct ib_uobject *uobj = ufd_get_read(UVERBS_OBJECT_COMP_CHANNEL,
- fd, ufile);
+ fd, attrs);
if (IS_ERR(uobj))
return (void *)uobj;
@@ -65,24 +192,20 @@ _ib_uverbs_lookup_comp_file(s32 fd, struct ib_uverbs_file *ufile)
#define ib_uverbs_lookup_comp_file(_fd, _ufile) \
_ib_uverbs_lookup_comp_file((_fd)*typecheck(s32, _fd), _ufile)
-ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file,
- const char __user *buf,
- int in_len, int out_len)
+static int ib_uverbs_get_context(struct uverbs_attr_bundle *attrs)
{
+ struct ib_uverbs_file *file = attrs->ufile;
struct ib_uverbs_get_context cmd;
struct ib_uverbs_get_context_resp resp;
- struct ib_udata udata;
struct ib_ucontext *ucontext;
struct file *filp;
struct ib_rdmacg_object cg_obj;
struct ib_device *ib_dev;
int ret;
- if (out_len < sizeof resp)
- return -ENOSPC;
-
- if (copy_from_user(&cmd, buf, sizeof cmd))
- return -EFAULT;
+ ret = uverbs_request(attrs, &cmd, sizeof(cmd));
+ if (ret)
+ return ret;
mutex_lock(&file->ucontext_lock);
ib_dev = srcu_dereference(file->device->ib_dev,
@@ -97,21 +220,17 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file,
goto err;
}
- ib_uverbs_init_udata(&udata, buf + sizeof(cmd),
- u64_to_user_ptr(cmd.response) + sizeof(resp),
- in_len - sizeof(cmd) - sizeof(struct ib_uverbs_cmd_hdr),
- out_len - sizeof(resp));
-
ret = ib_rdmacg_try_charge(&cg_obj, ib_dev, RDMACG_RESOURCE_HCA_HANDLE);
if (ret)
goto err;
- ucontext = ib_dev->alloc_ucontext(ib_dev, &udata);
- if (IS_ERR(ucontext)) {
- ret = PTR_ERR(ucontext);
+ ucontext = rdma_zalloc_drv_obj(ib_dev, ib_ucontext);
+ if (!ucontext) {
+ ret = -ENOMEM;
goto err_alloc;
}
+ ucontext->res.type = RDMA_RESTRACK_CTX;
ucontext->device = ib_dev;
ucontext->cg_obj = cg_obj;
/* ufile is required when some objects are released */
@@ -120,15 +239,8 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file,
ucontext->closing = false;
ucontext->cleanup_retryable = false;
-#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
mutex_init(&ucontext->per_mm_list_lock);
INIT_LIST_HEAD(&ucontext->per_mm_list);
- if (!(ib_dev->attrs.device_cap_flags & IB_DEVICE_ON_DEMAND_PAGING))
- ucontext->invalidate_range = NULL;
-
-#endif
-
- resp.num_comp_vectors = file->device->num_comp_vectors;
ret = get_unused_fd_flags(O_CLOEXEC);
if (ret < 0)
@@ -141,10 +253,19 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file,
goto err_fd;
}
- if (copy_to_user(u64_to_user_ptr(cmd.response), &resp, sizeof resp)) {
- ret = -EFAULT;
+ resp.num_comp_vectors = file->device->num_comp_vectors;
+
+ ret = uverbs_response(attrs, &resp, sizeof(resp));
+ if (ret)
goto err_file;
- }
+
+ ret = ib_dev->ops.alloc_ucontext(ucontext, &attrs->driver_udata);
+ if (ret)
+ goto err_file;
+ if (!(ib_dev->attrs.device_cap_flags & IB_DEVICE_ON_DEMAND_PAGING))
+ ucontext->invalidate_range = NULL;
+
+ rdma_restrack_uadd(&ucontext->res);
fd_install(resp.async_fd, filp);
@@ -156,7 +277,7 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file,
mutex_unlock(&file->ucontext_lock);
- return in_len;
+ return 0;
err_file:
ib_uverbs_free_async_event_file(file);
@@ -166,7 +287,7 @@ err_fd:
put_unused_fd(resp.async_fd);
err_free:
- ib_dev->dealloc_ucontext(ucontext);
+ kfree(ucontext);
err_alloc:
ib_rdmacg_uncharge(&cg_obj, ib_dev, RDMACG_RESOURCE_HCA_HANDLE);
@@ -224,57 +345,28 @@ static void copy_query_dev_fields(struct ib_ucontext *ucontext,
resp->phys_port_cnt = ib_dev->phys_port_cnt;
}
-ssize_t ib_uverbs_query_device(struct ib_uverbs_file *file,
- const char __user *buf,
- int in_len, int out_len)
+static int ib_uverbs_query_device(struct uverbs_attr_bundle *attrs)
{
struct ib_uverbs_query_device cmd;
struct ib_uverbs_query_device_resp resp;
struct ib_ucontext *ucontext;
+ int ret;
- ucontext = ib_uverbs_get_ucontext(file);
+ ucontext = ib_uverbs_get_ucontext(attrs);
if (IS_ERR(ucontext))
return PTR_ERR(ucontext);
- if (out_len < sizeof resp)
- return -ENOSPC;
-
- if (copy_from_user(&cmd, buf, sizeof cmd))
- return -EFAULT;
+ ret = uverbs_request(attrs, &cmd, sizeof(cmd));
+ if (ret)
+ return ret;
memset(&resp, 0, sizeof resp);
copy_query_dev_fields(ucontext, &resp, &ucontext->device->attrs);
- if (copy_to_user(u64_to_user_ptr(cmd.response), &resp, sizeof resp))
- return -EFAULT;
-
- return in_len;
-}
-
-/*
- * ib_uverbs_query_port_resp.port_cap_flags started out as just a copy of the
- * PortInfo CapabilityMask, but was extended with unique bits.
- */
-static u32 make_port_cap_flags(const struct ib_port_attr *attr)
-{
- u32 res;
-
- /* All IBA CapabilityMask bits are passed through here, except bit 26,
- * which is overridden with IP_BASED_GIDS. This is due to a historical
- * mistake in the implementation of IP_BASED_GIDS. Otherwise all other
- * bits match the IBA definition across all kernel versions.
- */
- res = attr->port_cap_flags & ~(u32)IB_UVERBS_PCF_IP_BASED_GIDS;
-
- if (attr->ip_gids)
- res |= IB_UVERBS_PCF_IP_BASED_GIDS;
-
- return res;
+ return uverbs_response(attrs, &resp, sizeof(resp));
}
-ssize_t ib_uverbs_query_port(struct ib_uverbs_file *file,
- const char __user *buf,
- int in_len, int out_len)
+static int ib_uverbs_query_port(struct uverbs_attr_bundle *attrs)
{
struct ib_uverbs_query_port cmd;
struct ib_uverbs_query_port_resp resp;
@@ -283,90 +375,45 @@ ssize_t ib_uverbs_query_port(struct ib_uverbs_file *file,
struct ib_ucontext *ucontext;
struct ib_device *ib_dev;
- ucontext = ib_uverbs_get_ucontext(file);
+ ucontext = ib_uverbs_get_ucontext(attrs);
if (IS_ERR(ucontext))
return PTR_ERR(ucontext);
ib_dev = ucontext->device;
- if (out_len < sizeof resp)
- return -ENOSPC;
-
- if (copy_from_user(&cmd, buf, sizeof cmd))
- return -EFAULT;
+ ret = uverbs_request(attrs, &cmd, sizeof(cmd));
+ if (ret)
+ return ret;
ret = ib_query_port(ib_dev, cmd.port_num, &attr);
if (ret)
return ret;
memset(&resp, 0, sizeof resp);
+ copy_port_attr_to_resp(&attr, &resp, ib_dev, cmd.port_num);
- resp.state = attr.state;
- resp.max_mtu = attr.max_mtu;
- resp.active_mtu = attr.active_mtu;
- resp.gid_tbl_len = attr.gid_tbl_len;
- resp.port_cap_flags = make_port_cap_flags(&attr);
- resp.max_msg_sz = attr.max_msg_sz;
- resp.bad_pkey_cntr = attr.bad_pkey_cntr;
- resp.qkey_viol_cntr = attr.qkey_viol_cntr;
- resp.pkey_tbl_len = attr.pkey_tbl_len;
-
- if (rdma_is_grh_required(ib_dev, cmd.port_num))
- resp.flags |= IB_UVERBS_QPF_GRH_REQUIRED;
-
- if (rdma_cap_opa_ah(ib_dev, cmd.port_num)) {
- resp.lid = OPA_TO_IB_UCAST_LID(attr.lid);
- resp.sm_lid = OPA_TO_IB_UCAST_LID(attr.sm_lid);
- } else {
- resp.lid = ib_lid_cpu16(attr.lid);
- resp.sm_lid = ib_lid_cpu16(attr.sm_lid);
- }
- resp.lmc = attr.lmc;
- resp.max_vl_num = attr.max_vl_num;
- resp.sm_sl = attr.sm_sl;
- resp.subnet_timeout = attr.subnet_timeout;
- resp.init_type_reply = attr.init_type_reply;
- resp.active_width = attr.active_width;
- resp.active_speed = attr.active_speed;
- resp.phys_state = attr.phys_state;
- resp.link_layer = rdma_port_get_link_layer(ib_dev,
- cmd.port_num);
-
- if (copy_to_user(u64_to_user_ptr(cmd.response), &resp, sizeof resp))
- return -EFAULT;
-
- return in_len;
+ return uverbs_response(attrs, &resp, sizeof(resp));
}
-ssize_t ib_uverbs_alloc_pd(struct ib_uverbs_file *file,
- const char __user *buf,
- int in_len, int out_len)
+static int ib_uverbs_alloc_pd(struct uverbs_attr_bundle *attrs)
{
struct ib_uverbs_alloc_pd cmd;
struct ib_uverbs_alloc_pd_resp resp;
- struct ib_udata udata;
struct ib_uobject *uobj;
struct ib_pd *pd;
int ret;
struct ib_device *ib_dev;
- if (out_len < sizeof resp)
- return -ENOSPC;
-
- if (copy_from_user(&cmd, buf, sizeof cmd))
- return -EFAULT;
-
- ib_uverbs_init_udata(&udata, buf + sizeof(cmd),
- u64_to_user_ptr(cmd.response) + sizeof(resp),
- in_len - sizeof(cmd) - sizeof(struct ib_uverbs_cmd_hdr),
- out_len - sizeof(resp));
+ ret = uverbs_request(attrs, &cmd, sizeof(cmd));
+ if (ret)
+ return ret;
- uobj = uobj_alloc(UVERBS_OBJECT_PD, file, &ib_dev);
+ uobj = uobj_alloc(UVERBS_OBJECT_PD, attrs, &ib_dev);
if (IS_ERR(uobj))
return PTR_ERR(uobj);
- pd = ib_dev->alloc_pd(ib_dev, uobj->context, &udata);
- if (IS_ERR(pd)) {
- ret = PTR_ERR(pd);
+ pd = rdma_zalloc_drv_obj(ib_dev, ib_pd);
+ if (!pd) {
+ ret = -ENOMEM;
goto err;
}
@@ -374,39 +421,43 @@ ssize_t ib_uverbs_alloc_pd(struct ib_uverbs_file *file,
pd->uobject = uobj;
pd->__internal_mr = NULL;
atomic_set(&pd->usecnt, 0);
+ pd->res.type = RDMA_RESTRACK_PD;
+
+ ret = ib_dev->ops.alloc_pd(pd, uobj->context, &attrs->driver_udata);
+ if (ret)
+ goto err_alloc;
uobj->object = pd;
memset(&resp, 0, sizeof resp);
resp.pd_handle = uobj->id;
- pd->res.type = RDMA_RESTRACK_PD;
- rdma_restrack_add(&pd->res);
+ rdma_restrack_uadd(&pd->res);
- if (copy_to_user(u64_to_user_ptr(cmd.response), &resp, sizeof resp)) {
- ret = -EFAULT;
+ ret = uverbs_response(attrs, &resp, sizeof(resp));
+ if (ret)
goto err_copy;
- }
- return uobj_alloc_commit(uobj, in_len);
+ return uobj_alloc_commit(uobj);
err_copy:
ib_dealloc_pd(pd);
-
+ pd = NULL;
+err_alloc:
+ kfree(pd);
err:
uobj_alloc_abort(uobj);
return ret;
}
-ssize_t ib_uverbs_dealloc_pd(struct ib_uverbs_file *file,
- const char __user *buf,
- int in_len, int out_len)
+static int ib_uverbs_dealloc_pd(struct uverbs_attr_bundle *attrs)
{
struct ib_uverbs_dealloc_pd cmd;
+ int ret;
- if (copy_from_user(&cmd, buf, sizeof cmd))
- return -EFAULT;
+ ret = uverbs_request(attrs, &cmd, sizeof(cmd));
+ if (ret)
+ return ret;
- return uobj_perform_destroy(UVERBS_OBJECT_PD, cmd.pd_handle, file,
- in_len);
+ return uobj_perform_destroy(UVERBS_OBJECT_PD, cmd.pd_handle, attrs);
}
struct xrcd_table_entry {
@@ -494,13 +545,11 @@ static void xrcd_table_delete(struct ib_uverbs_device *dev,
}
}
-ssize_t ib_uverbs_open_xrcd(struct ib_uverbs_file *file,
- const char __user *buf, int in_len,
- int out_len)
+static int ib_uverbs_open_xrcd(struct uverbs_attr_bundle *attrs)
{
+ struct ib_uverbs_device *ibudev = attrs->ufile->device;
struct ib_uverbs_open_xrcd cmd;
struct ib_uverbs_open_xrcd_resp resp;
- struct ib_udata udata;
struct ib_uxrcd_object *obj;
struct ib_xrcd *xrcd = NULL;
struct fd f = {NULL, 0};
@@ -509,18 +558,11 @@ ssize_t ib_uverbs_open_xrcd(struct ib_uverbs_file *file,
int new_xrcd = 0;
struct ib_device *ib_dev;
- if (out_len < sizeof resp)
- return -ENOSPC;
-
- if (copy_from_user(&cmd, buf, sizeof cmd))
- return -EFAULT;
-
- ib_uverbs_init_udata(&udata, buf + sizeof(cmd),
- u64_to_user_ptr(cmd.response) + sizeof(resp),
- in_len - sizeof(cmd) - sizeof(struct ib_uverbs_cmd_hdr),
- out_len - sizeof(resp));
+ ret = uverbs_request(attrs, &cmd, sizeof(cmd));
+ if (ret)
+ return ret;
- mutex_lock(&file->device->xrcd_tree_mutex);
+ mutex_lock(&ibudev->xrcd_tree_mutex);
if (cmd.fd != -1) {
/* search for file descriptor */
@@ -531,7 +573,7 @@ ssize_t ib_uverbs_open_xrcd(struct ib_uverbs_file *file,
}
inode = file_inode(f.file);
- xrcd = find_xrcd(file->device, inode);
+ xrcd = find_xrcd(ibudev, inode);
if (!xrcd && !(cmd.oflags & O_CREAT)) {
/* no file descriptor. Need CREATE flag */
ret = -EAGAIN;
@@ -544,7 +586,7 @@ ssize_t ib_uverbs_open_xrcd(struct ib_uverbs_file *file,
}
}
- obj = (struct ib_uxrcd_object *)uobj_alloc(UVERBS_OBJECT_XRCD, file,
+ obj = (struct ib_uxrcd_object *)uobj_alloc(UVERBS_OBJECT_XRCD, attrs,
&ib_dev);
if (IS_ERR(obj)) {
ret = PTR_ERR(obj);
@@ -552,7 +594,8 @@ ssize_t ib_uverbs_open_xrcd(struct ib_uverbs_file *file,
}
if (!xrcd) {
- xrcd = ib_dev->alloc_xrcd(ib_dev, obj->uobject.context, &udata);
+ xrcd = ib_dev->ops.alloc_xrcd(ib_dev, obj->uobject.context,
+ &attrs->driver_udata);
if (IS_ERR(xrcd)) {
ret = PTR_ERR(xrcd);
goto err;
@@ -574,29 +617,28 @@ ssize_t ib_uverbs_open_xrcd(struct ib_uverbs_file *file,
if (inode) {
if (new_xrcd) {
/* create new inode/xrcd table entry */
- ret = xrcd_table_insert(file->device, inode, xrcd);
+ ret = xrcd_table_insert(ibudev, inode, xrcd);
if (ret)
goto err_dealloc_xrcd;
}
atomic_inc(&xrcd->usecnt);
}
- if (copy_to_user(u64_to_user_ptr(cmd.response), &resp, sizeof resp)) {
- ret = -EFAULT;
+ ret = uverbs_response(attrs, &resp, sizeof(resp));
+ if (ret)
goto err_copy;
- }
if (f.file)
fdput(f);
- mutex_unlock(&file->device->xrcd_tree_mutex);
+ mutex_unlock(&ibudev->xrcd_tree_mutex);
- return uobj_alloc_commit(&obj->uobject, in_len);
+ return uobj_alloc_commit(&obj->uobject);
err_copy:
if (inode) {
if (new_xrcd)
- xrcd_table_delete(file->device, inode);
+ xrcd_table_delete(ibudev, inode);
atomic_dec(&xrcd->usecnt);
}
@@ -610,22 +652,21 @@ err_tree_mutex_unlock:
if (f.file)
fdput(f);
- mutex_unlock(&file->device->xrcd_tree_mutex);
+ mutex_unlock(&ibudev->xrcd_tree_mutex);
return ret;
}
-ssize_t ib_uverbs_close_xrcd(struct ib_uverbs_file *file,
- const char __user *buf, int in_len,
- int out_len)
+static int ib_uverbs_close_xrcd(struct uverbs_attr_bundle *attrs)
{
struct ib_uverbs_close_xrcd cmd;
+ int ret;
- if (copy_from_user(&cmd, buf, sizeof cmd))
- return -EFAULT;
+ ret = uverbs_request(attrs, &cmd, sizeof(cmd));
+ if (ret)
+ return ret;
- return uobj_perform_destroy(UVERBS_OBJECT_XRCD, cmd.xrcd_handle, file,
- in_len);
+ return uobj_perform_destroy(UVERBS_OBJECT_XRCD, cmd.xrcd_handle, attrs);
}
int ib_uverbs_dealloc_xrcd(struct ib_uobject *uobject,
@@ -653,29 +694,19 @@ int ib_uverbs_dealloc_xrcd(struct ib_uobject *uobject,
return ret;
}
-ssize_t ib_uverbs_reg_mr(struct ib_uverbs_file *file,
- const char __user *buf, int in_len,
- int out_len)
+static int ib_uverbs_reg_mr(struct uverbs_attr_bundle *attrs)
{
struct ib_uverbs_reg_mr cmd;
struct ib_uverbs_reg_mr_resp resp;
- struct ib_udata udata;
struct ib_uobject *uobj;
struct ib_pd *pd;
struct ib_mr *mr;
int ret;
struct ib_device *ib_dev;
- if (out_len < sizeof resp)
- return -ENOSPC;
-
- if (copy_from_user(&cmd, buf, sizeof cmd))
- return -EFAULT;
-
- ib_uverbs_init_udata(&udata, buf + sizeof(cmd),
- u64_to_user_ptr(cmd.response) + sizeof(resp),
- in_len - sizeof(cmd) - sizeof(struct ib_uverbs_cmd_hdr),
- out_len - sizeof(resp));
+ ret = uverbs_request(attrs, &cmd, sizeof(cmd));
+ if (ret)
+ return ret;
if ((cmd.start & ~PAGE_MASK) != (cmd.hca_va & ~PAGE_MASK))
return -EINVAL;
@@ -684,11 +715,11 @@ ssize_t ib_uverbs_reg_mr(struct ib_uverbs_file *file,
if (ret)
return ret;
- uobj = uobj_alloc(UVERBS_OBJECT_MR, file, &ib_dev);
+ uobj = uobj_alloc(UVERBS_OBJECT_MR, attrs, &ib_dev);
if (IS_ERR(uobj))
return PTR_ERR(uobj);
- pd = uobj_get_obj_read(pd, UVERBS_OBJECT_PD, cmd.pd_handle, file);
+ pd = uobj_get_obj_read(pd, UVERBS_OBJECT_PD, cmd.pd_handle, attrs);
if (!pd) {
ret = -EINVAL;
goto err_free;
@@ -703,8 +734,9 @@ ssize_t ib_uverbs_reg_mr(struct ib_uverbs_file *file,
}
}
- mr = pd->device->reg_user_mr(pd, cmd.start, cmd.length, cmd.hca_va,
- cmd.access_flags, &udata);
+ mr = pd->device->ops.reg_user_mr(pd, cmd.start, cmd.length, cmd.hca_va,
+ cmd.access_flags,
+ &attrs->driver_udata);
if (IS_ERR(mr)) {
ret = PTR_ERR(mr);
goto err_put;
@@ -716,7 +748,7 @@ ssize_t ib_uverbs_reg_mr(struct ib_uverbs_file *file,
mr->uobject = uobj;
atomic_inc(&pd->usecnt);
mr->res.type = RDMA_RESTRACK_MR;
- rdma_restrack_add(&mr->res);
+ rdma_restrack_uadd(&mr->res);
uobj->object = mr;
@@ -725,14 +757,13 @@ ssize_t ib_uverbs_reg_mr(struct ib_uverbs_file *file,
resp.rkey = mr->rkey;
resp.mr_handle = uobj->id;
- if (copy_to_user(u64_to_user_ptr(cmd.response), &resp, sizeof resp)) {
- ret = -EFAULT;
+ ret = uverbs_response(attrs, &resp, sizeof(resp));
+ if (ret)
goto err_copy;
- }
uobj_put_obj_read(pd);
- return uobj_alloc_commit(uobj, in_len);
+ return uobj_alloc_commit(uobj);
err_copy:
ib_dereg_mr(mr);
@@ -745,29 +776,19 @@ err_free:
return ret;
}
-ssize_t ib_uverbs_rereg_mr(struct ib_uverbs_file *file,
- const char __user *buf, int in_len,
- int out_len)
+static int ib_uverbs_rereg_mr(struct uverbs_attr_bundle *attrs)
{
struct ib_uverbs_rereg_mr cmd;
struct ib_uverbs_rereg_mr_resp resp;
- struct ib_udata udata;
struct ib_pd *pd = NULL;
struct ib_mr *mr;
struct ib_pd *old_pd;
int ret;
struct ib_uobject *uobj;
- if (out_len < sizeof(resp))
- return -ENOSPC;
-
- if (copy_from_user(&cmd, buf, sizeof(cmd)))
- return -EFAULT;
-
- ib_uverbs_init_udata(&udata, buf + sizeof(cmd),
- u64_to_user_ptr(cmd.response) + sizeof(resp),
- in_len - sizeof(cmd) - sizeof(struct ib_uverbs_cmd_hdr),
- out_len - sizeof(resp));
+ ret = uverbs_request(attrs, &cmd, sizeof(cmd));
+ if (ret)
+ return ret;
if (cmd.flags & ~IB_MR_REREG_SUPPORTED || !cmd.flags)
return -EINVAL;
@@ -777,7 +798,7 @@ ssize_t ib_uverbs_rereg_mr(struct ib_uverbs_file *file,
(cmd.start & ~PAGE_MASK) != (cmd.hca_va & ~PAGE_MASK)))
return -EINVAL;
- uobj = uobj_get_write(UVERBS_OBJECT_MR, cmd.mr_handle, file);
+ uobj = uobj_get_write(UVERBS_OBJECT_MR, cmd.mr_handle, attrs);
if (IS_ERR(uobj))
return PTR_ERR(uobj);
@@ -796,7 +817,7 @@ ssize_t ib_uverbs_rereg_mr(struct ib_uverbs_file *file,
if (cmd.flags & IB_MR_REREG_PD) {
pd = uobj_get_obj_read(pd, UVERBS_OBJECT_PD, cmd.pd_handle,
- file);
+ attrs);
if (!pd) {
ret = -EINVAL;
goto put_uobjs;
@@ -804,27 +825,24 @@ ssize_t ib_uverbs_rereg_mr(struct ib_uverbs_file *file,
}
old_pd = mr->pd;
- ret = mr->device->rereg_user_mr(mr, cmd.flags, cmd.start,
- cmd.length, cmd.hca_va,
- cmd.access_flags, pd, &udata);
- if (!ret) {
- if (cmd.flags & IB_MR_REREG_PD) {
- atomic_inc(&pd->usecnt);
- mr->pd = pd;
- atomic_dec(&old_pd->usecnt);
- }
- } else {
+ ret = mr->device->ops.rereg_user_mr(mr, cmd.flags, cmd.start,
+ cmd.length, cmd.hca_va,
+ cmd.access_flags, pd,
+ &attrs->driver_udata);
+ if (ret)
goto put_uobj_pd;
+
+ if (cmd.flags & IB_MR_REREG_PD) {
+ atomic_inc(&pd->usecnt);
+ mr->pd = pd;
+ atomic_dec(&old_pd->usecnt);
}
memset(&resp, 0, sizeof(resp));
resp.lkey = mr->lkey;
resp.rkey = mr->rkey;
- if (copy_to_user(u64_to_user_ptr(cmd.response), &resp, sizeof(resp)))
- ret = -EFAULT;
- else
- ret = in_len;
+ ret = uverbs_response(attrs, &resp, sizeof(resp));
put_uobj_pd:
if (cmd.flags & IB_MR_REREG_PD)
@@ -836,54 +854,48 @@ put_uobjs:
return ret;
}
-ssize_t ib_uverbs_dereg_mr(struct ib_uverbs_file *file,
- const char __user *buf, int in_len,
- int out_len)
+static int ib_uverbs_dereg_mr(struct uverbs_attr_bundle *attrs)
{
struct ib_uverbs_dereg_mr cmd;
+ int ret;
- if (copy_from_user(&cmd, buf, sizeof cmd))
- return -EFAULT;
+ ret = uverbs_request(attrs, &cmd, sizeof(cmd));
+ if (ret)
+ return ret;
- return uobj_perform_destroy(UVERBS_OBJECT_MR, cmd.mr_handle, file,
- in_len);
+ return uobj_perform_destroy(UVERBS_OBJECT_MR, cmd.mr_handle, attrs);
}
-ssize_t ib_uverbs_alloc_mw(struct ib_uverbs_file *file,
- const char __user *buf, int in_len,
- int out_len)
+static int ib_uverbs_alloc_mw(struct uverbs_attr_bundle *attrs)
{
struct ib_uverbs_alloc_mw cmd;
struct ib_uverbs_alloc_mw_resp resp;
struct ib_uobject *uobj;
struct ib_pd *pd;
struct ib_mw *mw;
- struct ib_udata udata;
int ret;
struct ib_device *ib_dev;
- if (out_len < sizeof(resp))
- return -ENOSPC;
-
- if (copy_from_user(&cmd, buf, sizeof(cmd)))
- return -EFAULT;
+ ret = uverbs_request(attrs, &cmd, sizeof(cmd));
+ if (ret)
+ return ret;
- uobj = uobj_alloc(UVERBS_OBJECT_MW, file, &ib_dev);
+ uobj = uobj_alloc(UVERBS_OBJECT_MW, attrs, &ib_dev);
if (IS_ERR(uobj))
return PTR_ERR(uobj);
- pd = uobj_get_obj_read(pd, UVERBS_OBJECT_PD, cmd.pd_handle, file);
+ pd = uobj_get_obj_read(pd, UVERBS_OBJECT_PD, cmd.pd_handle, attrs);
if (!pd) {
ret = -EINVAL;
goto err_free;
}
- ib_uverbs_init_udata(&udata, buf + sizeof(cmd),
- u64_to_user_ptr(cmd.response) + sizeof(resp),
- in_len - sizeof(cmd) - sizeof(struct ib_uverbs_cmd_hdr),
- out_len - sizeof(resp));
+ if (cmd.mw_type != IB_MW_TYPE_1 && cmd.mw_type != IB_MW_TYPE_2) {
+ ret = -EINVAL;
+ goto err_put;
+ }
- mw = pd->device->alloc_mw(pd, cmd.mw_type, &udata);
+ mw = pd->device->ops.alloc_mw(pd, cmd.mw_type, &attrs->driver_udata);
if (IS_ERR(mw)) {
ret = PTR_ERR(mw);
goto err_put;
@@ -900,13 +912,12 @@ ssize_t ib_uverbs_alloc_mw(struct ib_uverbs_file *file,
resp.rkey = mw->rkey;
resp.mw_handle = uobj->id;
- if (copy_to_user(u64_to_user_ptr(cmd.response), &resp, sizeof(resp))) {
- ret = -EFAULT;
+ ret = uverbs_response(attrs, &resp, sizeof(resp));
+ if (ret)
goto err_copy;
- }
uobj_put_obj_read(pd);
- return uobj_alloc_commit(uobj, in_len);
+ return uobj_alloc_commit(uobj);
err_copy:
uverbs_dealloc_mw(mw);
@@ -917,36 +928,32 @@ err_free:
return ret;
}
-ssize_t ib_uverbs_dealloc_mw(struct ib_uverbs_file *file,
- const char __user *buf, int in_len,
- int out_len)
+static int ib_uverbs_dealloc_mw(struct uverbs_attr_bundle *attrs)
{
struct ib_uverbs_dealloc_mw cmd;
+ int ret;
- if (copy_from_user(&cmd, buf, sizeof(cmd)))
- return -EFAULT;
+ ret = uverbs_request(attrs, &cmd, sizeof(cmd));
+ if (ret)
+ return ret;
- return uobj_perform_destroy(UVERBS_OBJECT_MW, cmd.mw_handle, file,
- in_len);
+ return uobj_perform_destroy(UVERBS_OBJECT_MW, cmd.mw_handle, attrs);
}
-ssize_t ib_uverbs_create_comp_channel(struct ib_uverbs_file *file,
- const char __user *buf, int in_len,
- int out_len)
+static int ib_uverbs_create_comp_channel(struct uverbs_attr_bundle *attrs)
{
struct ib_uverbs_create_comp_channel cmd;
struct ib_uverbs_create_comp_channel_resp resp;
struct ib_uobject *uobj;
struct ib_uverbs_completion_event_file *ev_file;
struct ib_device *ib_dev;
+ int ret;
- if (out_len < sizeof resp)
- return -ENOSPC;
-
- if (copy_from_user(&cmd, buf, sizeof cmd))
- return -EFAULT;
+ ret = uverbs_request(attrs, &cmd, sizeof(cmd));
+ if (ret)
+ return ret;
- uobj = uobj_alloc(UVERBS_OBJECT_COMP_CHANNEL, file, &ib_dev);
+ uobj = uobj_alloc(UVERBS_OBJECT_COMP_CHANNEL, attrs, &ib_dev);
if (IS_ERR(uobj))
return PTR_ERR(uobj);
@@ -956,25 +963,17 @@ ssize_t ib_uverbs_create_comp_channel(struct ib_uverbs_file *file,
uobj);
ib_uverbs_init_event_queue(&ev_file->ev_queue);
- if (copy_to_user(u64_to_user_ptr(cmd.response), &resp, sizeof resp)) {
+ ret = uverbs_response(attrs, &resp, sizeof(resp));
+ if (ret) {
uobj_alloc_abort(uobj);
- return -EFAULT;
+ return ret;
}
- return uobj_alloc_commit(uobj, in_len);
+ return uobj_alloc_commit(uobj);
}
-static struct ib_ucq_object *create_cq(struct ib_uverbs_file *file,
- struct ib_udata *ucore,
- struct ib_udata *uhw,
- struct ib_uverbs_ex_create_cq *cmd,
- size_t cmd_sz,
- int (*cb)(struct ib_uverbs_file *file,
- struct ib_ucq_object *obj,
- struct ib_uverbs_ex_create_cq_resp *resp,
- struct ib_udata *udata,
- void *context),
- void *context)
+static struct ib_ucq_object *create_cq(struct uverbs_attr_bundle *attrs,
+ struct ib_uverbs_ex_create_cq *cmd)
{
struct ib_ucq_object *obj;
struct ib_uverbs_completion_event_file *ev_file = NULL;
@@ -984,21 +983,16 @@ static struct ib_ucq_object *create_cq(struct ib_uverbs_file *file,
struct ib_cq_init_attr attr = {};
struct ib_device *ib_dev;
- if (cmd->comp_vector >= file->device->num_comp_vectors)
+ if (cmd->comp_vector >= attrs->ufile->device->num_comp_vectors)
return ERR_PTR(-EINVAL);
- obj = (struct ib_ucq_object *)uobj_alloc(UVERBS_OBJECT_CQ, file,
+ obj = (struct ib_ucq_object *)uobj_alloc(UVERBS_OBJECT_CQ, attrs,
&ib_dev);
if (IS_ERR(obj))
return obj;
- if (!ib_dev->create_cq) {
- ret = -EOPNOTSUPP;
- goto err;
- }
-
if (cmd->comp_channel >= 0) {
- ev_file = ib_uverbs_lookup_comp_file(cmd->comp_channel, file);
+ ev_file = ib_uverbs_lookup_comp_file(cmd->comp_channel, attrs);
if (IS_ERR(ev_file)) {
ret = PTR_ERR(ev_file);
goto err;
@@ -1013,11 +1007,10 @@ static struct ib_ucq_object *create_cq(struct ib_uverbs_file *file,
attr.cqe = cmd->cqe;
attr.comp_vector = cmd->comp_vector;
+ attr.flags = cmd->flags;
- if (cmd_sz > offsetof(typeof(*cmd), flags) + sizeof(cmd->flags))
- attr.flags = cmd->flags;
-
- cq = ib_dev->create_cq(ib_dev, &attr, obj->uobject.context, uhw);
+ cq = ib_dev->ops.create_cq(ib_dev, &attr, obj->uobject.context,
+ &attrs->driver_udata);
if (IS_ERR(cq)) {
ret = PTR_ERR(cq);
goto err_file;
@@ -1034,18 +1027,16 @@ static struct ib_ucq_object *create_cq(struct ib_uverbs_file *file,
memset(&resp, 0, sizeof resp);
resp.base.cq_handle = obj->uobject.id;
resp.base.cqe = cq->cqe;
-
- resp.response_length = offsetof(typeof(resp), response_length) +
- sizeof(resp.response_length);
+ resp.response_length = uverbs_response_length(attrs, sizeof(resp));
cq->res.type = RDMA_RESTRACK_CQ;
- rdma_restrack_add(&cq->res);
+ rdma_restrack_uadd(&cq->res);
- ret = cb(file, obj, &resp, ucore, context);
+ ret = uverbs_response(attrs, &resp, sizeof(resp));
if (ret)
goto err_cb;
- ret = uobj_alloc_commit(&obj->uobject, 0);
+ ret = uobj_alloc_commit(&obj->uobject);
if (ret)
return ERR_PTR(ret);
return obj;
@@ -1055,7 +1046,7 @@ err_cb:
err_file:
if (ev_file)
- ib_uverbs_release_ucq(file, ev_file, obj);
+ ib_uverbs_release_ucq(attrs->ufile, ev_file, obj);
err:
uobj_alloc_abort(&obj->uobject);
@@ -1063,41 +1054,16 @@ err:
return ERR_PTR(ret);
}
-static int ib_uverbs_create_cq_cb(struct ib_uverbs_file *file,
- struct ib_ucq_object *obj,
- struct ib_uverbs_ex_create_cq_resp *resp,
- struct ib_udata *ucore, void *context)
-{
- if (ib_copy_to_udata(ucore, &resp->base, sizeof(resp->base)))
- return -EFAULT;
-
- return 0;
-}
-
-ssize_t ib_uverbs_create_cq(struct ib_uverbs_file *file,
- const char __user *buf, int in_len,
- int out_len)
+static int ib_uverbs_create_cq(struct uverbs_attr_bundle *attrs)
{
struct ib_uverbs_create_cq cmd;
struct ib_uverbs_ex_create_cq cmd_ex;
- struct ib_uverbs_create_cq_resp resp;
- struct ib_udata ucore;
- struct ib_udata uhw;
struct ib_ucq_object *obj;
+ int ret;
- if (out_len < sizeof(resp))
- return -ENOSPC;
-
- if (copy_from_user(&cmd, buf, sizeof(cmd)))
- return -EFAULT;
-
- ib_uverbs_init_udata(&ucore, buf, u64_to_user_ptr(cmd.response),
- sizeof(cmd), sizeof(resp));
-
- ib_uverbs_init_udata(&uhw, buf + sizeof(cmd),
- u64_to_user_ptr(cmd.response) + sizeof(resp),
- in_len - sizeof(cmd) - sizeof(struct ib_uverbs_cmd_hdr),
- out_len - sizeof(resp));
+ ret = uverbs_request(attrs, &cmd, sizeof(cmd));
+ if (ret)
+ return ret;
memset(&cmd_ex, 0, sizeof(cmd_ex));
cmd_ex.user_handle = cmd.user_handle;
@@ -1105,43 +1071,19 @@ ssize_t ib_uverbs_create_cq(struct ib_uverbs_file *file,
cmd_ex.comp_vector = cmd.comp_vector;
cmd_ex.comp_channel = cmd.comp_channel;
- obj = create_cq(file, &ucore, &uhw, &cmd_ex,
- offsetof(typeof(cmd_ex), comp_channel) +
- sizeof(cmd.comp_channel), ib_uverbs_create_cq_cb,
- NULL);
-
- if (IS_ERR(obj))
- return PTR_ERR(obj);
-
- return in_len;
-}
-
-static int ib_uverbs_ex_create_cq_cb(struct ib_uverbs_file *file,
- struct ib_ucq_object *obj,
- struct ib_uverbs_ex_create_cq_resp *resp,
- struct ib_udata *ucore, void *context)
-{
- if (ib_copy_to_udata(ucore, resp, resp->response_length))
- return -EFAULT;
-
- return 0;
+ obj = create_cq(attrs, &cmd_ex);
+ return PTR_ERR_OR_ZERO(obj);
}
-int ib_uverbs_ex_create_cq(struct ib_uverbs_file *file,
- struct ib_udata *ucore,
- struct ib_udata *uhw)
+static int ib_uverbs_ex_create_cq(struct uverbs_attr_bundle *attrs)
{
- struct ib_uverbs_ex_create_cq_resp resp;
struct ib_uverbs_ex_create_cq cmd;
struct ib_ucq_object *obj;
- int err;
-
- if (ucore->inlen < sizeof(cmd))
- return -EINVAL;
+ int ret;
- err = ib_copy_from_udata(&cmd, ucore, sizeof(cmd));
- if (err)
- return err;
+ ret = uverbs_request(attrs, &cmd, sizeof(cmd));
+ if (ret)
+ return ret;
if (cmd.comp_mask)
return -EINVAL;
@@ -1149,52 +1091,36 @@ int ib_uverbs_ex_create_cq(struct ib_uverbs_file *file,
if (cmd.reserved)
return -EINVAL;
- if (ucore->outlen < (offsetof(typeof(resp), response_length) +
- sizeof(resp.response_length)))
- return -ENOSPC;
-
- obj = create_cq(file, ucore, uhw, &cmd,
- min(ucore->inlen, sizeof(cmd)),
- ib_uverbs_ex_create_cq_cb, NULL);
-
+ obj = create_cq(attrs, &cmd);
return PTR_ERR_OR_ZERO(obj);
}
-ssize_t ib_uverbs_resize_cq(struct ib_uverbs_file *file,
- const char __user *buf, int in_len,
- int out_len)
+static int ib_uverbs_resize_cq(struct uverbs_attr_bundle *attrs)
{
struct ib_uverbs_resize_cq cmd;
struct ib_uverbs_resize_cq_resp resp = {};
- struct ib_udata udata;
struct ib_cq *cq;
int ret = -EINVAL;
- if (copy_from_user(&cmd, buf, sizeof cmd))
- return -EFAULT;
-
- ib_uverbs_init_udata(&udata, buf + sizeof(cmd),
- u64_to_user_ptr(cmd.response) + sizeof(resp),
- in_len - sizeof(cmd) - sizeof(struct ib_uverbs_cmd_hdr),
- out_len - sizeof(resp));
+ ret = uverbs_request(attrs, &cmd, sizeof(cmd));
+ if (ret)
+ return ret;
- cq = uobj_get_obj_read(cq, UVERBS_OBJECT_CQ, cmd.cq_handle, file);
+ cq = uobj_get_obj_read(cq, UVERBS_OBJECT_CQ, cmd.cq_handle, attrs);
if (!cq)
return -EINVAL;
- ret = cq->device->resize_cq(cq, cmd.cqe, &udata);
+ ret = cq->device->ops.resize_cq(cq, cmd.cqe, &attrs->driver_udata);
if (ret)
goto out;
resp.cqe = cq->cqe;
- if (copy_to_user(u64_to_user_ptr(cmd.response), &resp, sizeof resp.cqe))
- ret = -EFAULT;
-
+ ret = uverbs_response(attrs, &resp, sizeof(resp));
out:
uobj_put_obj_read(cq);
- return ret ? ret : in_len;
+ return ret;
}
static int copy_wc_to_user(struct ib_device *ib_dev, void __user *dest,
@@ -1227,9 +1153,7 @@ static int copy_wc_to_user(struct ib_device *ib_dev, void __user *dest,
return 0;
}
-ssize_t ib_uverbs_poll_cq(struct ib_uverbs_file *file,
- const char __user *buf, int in_len,
- int out_len)
+static int ib_uverbs_poll_cq(struct uverbs_attr_bundle *attrs)
{
struct ib_uverbs_poll_cq cmd;
struct ib_uverbs_poll_cq_resp resp;
@@ -1239,15 +1163,16 @@ ssize_t ib_uverbs_poll_cq(struct ib_uverbs_file *file,
struct ib_wc wc;
int ret;
- if (copy_from_user(&cmd, buf, sizeof cmd))
- return -EFAULT;
+ ret = uverbs_request(attrs, &cmd, sizeof(cmd));
+ if (ret)
+ return ret;
- cq = uobj_get_obj_read(cq, UVERBS_OBJECT_CQ, cmd.cq_handle, file);
+ cq = uobj_get_obj_read(cq, UVERBS_OBJECT_CQ, cmd.cq_handle, attrs);
if (!cq)
return -EINVAL;
/* we copy a struct ib_uverbs_poll_cq_resp to user space */
- header_ptr = u64_to_user_ptr(cmd.response);
+ header_ptr = attrs->ucore.outbuf;
data_ptr = header_ptr + sizeof resp;
memset(&resp, 0, sizeof resp);
@@ -1270,25 +1195,27 @@ ssize_t ib_uverbs_poll_cq(struct ib_uverbs_file *file,
ret = -EFAULT;
goto out_put;
}
+ ret = 0;
- ret = in_len;
+ if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_CORE_OUT))
+ ret = uverbs_output_written(attrs, UVERBS_ATTR_CORE_OUT);
out_put:
uobj_put_obj_read(cq);
return ret;
}
-ssize_t ib_uverbs_req_notify_cq(struct ib_uverbs_file *file,
- const char __user *buf, int in_len,
- int out_len)
+static int ib_uverbs_req_notify_cq(struct uverbs_attr_bundle *attrs)
{
struct ib_uverbs_req_notify_cq cmd;
struct ib_cq *cq;
+ int ret;
- if (copy_from_user(&cmd, buf, sizeof cmd))
- return -EFAULT;
+ ret = uverbs_request(attrs, &cmd, sizeof(cmd));
+ if (ret)
+ return ret;
- cq = uobj_get_obj_read(cq, UVERBS_OBJECT_CQ, cmd.cq_handle, file);
+ cq = uobj_get_obj_read(cq, UVERBS_OBJECT_CQ, cmd.cq_handle, attrs);
if (!cq)
return -EINVAL;
@@ -1297,22 +1224,22 @@ ssize_t ib_uverbs_req_notify_cq(struct ib_uverbs_file *file,
uobj_put_obj_read(cq);
- return in_len;
+ return 0;
}
-ssize_t ib_uverbs_destroy_cq(struct ib_uverbs_file *file,
- const char __user *buf, int in_len,
- int out_len)
+static int ib_uverbs_destroy_cq(struct uverbs_attr_bundle *attrs)
{
struct ib_uverbs_destroy_cq cmd;
struct ib_uverbs_destroy_cq_resp resp;
struct ib_uobject *uobj;
struct ib_ucq_object *obj;
+ int ret;
- if (copy_from_user(&cmd, buf, sizeof cmd))
- return -EFAULT;
+ ret = uverbs_request(attrs, &cmd, sizeof(cmd));
+ if (ret)
+ return ret;
- uobj = uobj_get_destroy(UVERBS_OBJECT_CQ, cmd.cq_handle, file);
+ uobj = uobj_get_destroy(UVERBS_OBJECT_CQ, cmd.cq_handle, attrs);
if (IS_ERR(uobj))
return PTR_ERR(uobj);
@@ -1323,21 +1250,11 @@ ssize_t ib_uverbs_destroy_cq(struct ib_uverbs_file *file,
uobj_put_destroy(uobj);
- if (copy_to_user(u64_to_user_ptr(cmd.response), &resp, sizeof resp))
- return -EFAULT;
-
- return in_len;
+ return uverbs_response(attrs, &resp, sizeof(resp));
}
-static int create_qp(struct ib_uverbs_file *file,
- struct ib_udata *ucore,
- struct ib_udata *uhw,
- struct ib_uverbs_ex_create_qp *cmd,
- size_t cmd_sz,
- int (*cb)(struct ib_uverbs_file *file,
- struct ib_uverbs_ex_create_qp_resp *resp,
- struct ib_udata *udata),
- void *context)
+static int create_qp(struct uverbs_attr_bundle *attrs,
+ struct ib_uverbs_ex_create_qp *cmd)
{
struct ib_uqp_object *obj;
struct ib_device *device;
@@ -1347,7 +1264,6 @@ static int create_qp(struct ib_uverbs_file *file,
struct ib_cq *scq = NULL, *rcq = NULL;
struct ib_srq *srq = NULL;
struct ib_qp *qp;
- char *buf;
struct ib_qp_init_attr attr = {};
struct ib_uverbs_ex_create_qp_resp resp;
int ret;
@@ -1358,7 +1274,7 @@ static int create_qp(struct ib_uverbs_file *file,
if (cmd->qp_type == IB_QPT_RAW_PACKET && !capable(CAP_NET_RAW))
return -EPERM;
- obj = (struct ib_uqp_object *)uobj_alloc(UVERBS_OBJECT_QP, file,
+ obj = (struct ib_uqp_object *)uobj_alloc(UVERBS_OBJECT_QP, attrs,
&ib_dev);
if (IS_ERR(obj))
return PTR_ERR(obj);
@@ -1366,12 +1282,10 @@ static int create_qp(struct ib_uverbs_file *file,
obj->uevent.uobject.user_handle = cmd->user_handle;
mutex_init(&obj->mcast_lock);
- if (cmd_sz >= offsetof(typeof(*cmd), rwq_ind_tbl_handle) +
- sizeof(cmd->rwq_ind_tbl_handle) &&
- (cmd->comp_mask & IB_UVERBS_CREATE_QP_MASK_IND_TABLE)) {
+ if (cmd->comp_mask & IB_UVERBS_CREATE_QP_MASK_IND_TABLE) {
ind_tbl = uobj_get_obj_read(rwq_ind_table,
UVERBS_OBJECT_RWQ_IND_TBL,
- cmd->rwq_ind_tbl_handle, file);
+ cmd->rwq_ind_tbl_handle, attrs);
if (!ind_tbl) {
ret = -EINVAL;
goto err_put;
@@ -1380,13 +1294,6 @@ static int create_qp(struct ib_uverbs_file *file,
attr.rwq_ind_tbl = ind_tbl;
}
- if (cmd_sz > sizeof(*cmd) &&
- !ib_is_udata_cleared(ucore, sizeof(*cmd),
- cmd_sz - sizeof(*cmd))) {
- ret = -EOPNOTSUPP;
- goto err_put;
- }
-
if (ind_tbl && (cmd->max_recv_wr || cmd->max_recv_sge || cmd->is_srq)) {
ret = -EINVAL;
goto err_put;
@@ -1397,7 +1304,7 @@ static int create_qp(struct ib_uverbs_file *file,
if (cmd->qp_type == IB_QPT_XRC_TGT) {
xrcd_uobj = uobj_get_read(UVERBS_OBJECT_XRCD, cmd->pd_handle,
- file);
+ attrs);
if (IS_ERR(xrcd_uobj)) {
ret = -EINVAL;
@@ -1417,7 +1324,7 @@ static int create_qp(struct ib_uverbs_file *file,
} else {
if (cmd->is_srq) {
srq = uobj_get_obj_read(srq, UVERBS_OBJECT_SRQ,
- cmd->srq_handle, file);
+ cmd->srq_handle, attrs);
if (!srq || srq->srq_type == IB_SRQT_XRC) {
ret = -EINVAL;
goto err_put;
@@ -1428,7 +1335,7 @@ static int create_qp(struct ib_uverbs_file *file,
if (cmd->recv_cq_handle != cmd->send_cq_handle) {
rcq = uobj_get_obj_read(
cq, UVERBS_OBJECT_CQ,
- cmd->recv_cq_handle, file);
+ cmd->recv_cq_handle, attrs);
if (!rcq) {
ret = -EINVAL;
goto err_put;
@@ -1439,11 +1346,11 @@ static int create_qp(struct ib_uverbs_file *file,
if (has_sq)
scq = uobj_get_obj_read(cq, UVERBS_OBJECT_CQ,
- cmd->send_cq_handle, file);
+ cmd->send_cq_handle, attrs);
if (!ind_tbl)
rcq = rcq ?: scq;
pd = uobj_get_obj_read(pd, UVERBS_OBJECT_PD, cmd->pd_handle,
- file);
+ attrs);
if (!pd || (!scq && has_sq)) {
ret = -EINVAL;
goto err_put;
@@ -1453,7 +1360,7 @@ static int create_qp(struct ib_uverbs_file *file,
}
attr.event_handler = ib_uverbs_qp_event_handler;
- attr.qp_context = file;
+ attr.qp_context = attrs->ufile;
attr.send_cq = scq;
attr.recv_cq = rcq;
attr.srq = srq;
@@ -1473,10 +1380,7 @@ static int create_qp(struct ib_uverbs_file *file,
INIT_LIST_HEAD(&obj->uevent.event_list);
INIT_LIST_HEAD(&obj->mcast_list);
- if (cmd_sz >= offsetof(typeof(*cmd), create_flags) +
- sizeof(cmd->create_flags))
- attr.create_flags = cmd->create_flags;
-
+ attr.create_flags = cmd->create_flags;
if (attr.create_flags & ~(IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK |
IB_QP_CREATE_CROSS_CHANNEL |
IB_QP_CREATE_MANAGED_SEND |
@@ -1498,18 +1402,10 @@ static int create_qp(struct ib_uverbs_file *file,
attr.source_qpn = cmd->source_qpn;
}
- buf = (void *)cmd + sizeof(*cmd);
- if (cmd_sz > sizeof(*cmd))
- if (!(buf[0] == 0 && !memcmp(buf, buf + 1,
- cmd_sz - sizeof(*cmd) - 1))) {
- ret = -EINVAL;
- goto err_put;
- }
-
if (cmd->qp_type == IB_QPT_XRC_TGT)
qp = ib_create_qp(pd, &attr);
else
- qp = _ib_create_qp(device, pd, &attr, uhw,
+ qp = _ib_create_qp(device, pd, &attr, &attrs->driver_udata,
&obj->uevent.uobject);
if (IS_ERR(qp)) {
@@ -1557,11 +1453,9 @@ static int create_qp(struct ib_uverbs_file *file,
resp.base.max_recv_wr = attr.cap.max_recv_wr;
resp.base.max_send_wr = attr.cap.max_send_wr;
resp.base.max_inline_data = attr.cap.max_inline_data;
+ resp.response_length = uverbs_response_length(attrs, sizeof(resp));
- resp.response_length = offsetof(typeof(resp), response_length) +
- sizeof(resp.response_length);
-
- ret = cb(file, &resp, ucore);
+ ret = uverbs_response(attrs, &resp, sizeof(resp));
if (ret)
goto err_cb;
@@ -1583,7 +1477,7 @@ static int create_qp(struct ib_uverbs_file *file,
if (ind_tbl)
uobj_put_obj_read(ind_tbl);
- return uobj_alloc_commit(&obj->uevent.uobject, 0);
+ return uobj_alloc_commit(&obj->uevent.uobject);
err_cb:
ib_destroy_qp(qp);
@@ -1605,39 +1499,15 @@ err_put:
return ret;
}
-static int ib_uverbs_create_qp_cb(struct ib_uverbs_file *file,
- struct ib_uverbs_ex_create_qp_resp *resp,
- struct ib_udata *ucore)
-{
- if (ib_copy_to_udata(ucore, &resp->base, sizeof(resp->base)))
- return -EFAULT;
-
- return 0;
-}
-
-ssize_t ib_uverbs_create_qp(struct ib_uverbs_file *file,
- const char __user *buf, int in_len,
- int out_len)
+static int ib_uverbs_create_qp(struct uverbs_attr_bundle *attrs)
{
struct ib_uverbs_create_qp cmd;
struct ib_uverbs_ex_create_qp cmd_ex;
- struct ib_udata ucore;
- struct ib_udata uhw;
- ssize_t resp_size = sizeof(struct ib_uverbs_create_qp_resp);
- int err;
-
- if (out_len < resp_size)
- return -ENOSPC;
-
- if (copy_from_user(&cmd, buf, sizeof(cmd)))
- return -EFAULT;
+ int ret;
- ib_uverbs_init_udata(&ucore, buf, u64_to_user_ptr(cmd.response),
- sizeof(cmd), resp_size);
- ib_uverbs_init_udata(&uhw, buf + sizeof(cmd),
- u64_to_user_ptr(cmd.response) + resp_size,
- in_len - sizeof(cmd) - sizeof(struct ib_uverbs_cmd_hdr),
- out_len - resp_size);
+ ret = uverbs_request(attrs, &cmd, sizeof(cmd));
+ if (ret)
+ return ret;
memset(&cmd_ex, 0, sizeof(cmd_ex));
cmd_ex.user_handle = cmd.user_handle;
@@ -1654,42 +1524,17 @@ ssize_t ib_uverbs_create_qp(struct ib_uverbs_file *file,
cmd_ex.qp_type = cmd.qp_type;
cmd_ex.is_srq = cmd.is_srq;
- err = create_qp(file, &ucore, &uhw, &cmd_ex,
- offsetof(typeof(cmd_ex), is_srq) +
- sizeof(cmd.is_srq), ib_uverbs_create_qp_cb,
- NULL);
-
- if (err)
- return err;
-
- return in_len;
+ return create_qp(attrs, &cmd_ex);
}
-static int ib_uverbs_ex_create_qp_cb(struct ib_uverbs_file *file,
- struct ib_uverbs_ex_create_qp_resp *resp,
- struct ib_udata *ucore)
+static int ib_uverbs_ex_create_qp(struct uverbs_attr_bundle *attrs)
{
- if (ib_copy_to_udata(ucore, resp, resp->response_length))
- return -EFAULT;
-
- return 0;
-}
-
-int ib_uverbs_ex_create_qp(struct ib_uverbs_file *file,
- struct ib_udata *ucore,
- struct ib_udata *uhw)
-{
- struct ib_uverbs_ex_create_qp_resp resp;
- struct ib_uverbs_ex_create_qp cmd = {0};
- int err;
-
- if (ucore->inlen < (offsetof(typeof(cmd), comp_mask) +
- sizeof(cmd.comp_mask)))
- return -EINVAL;
+ struct ib_uverbs_ex_create_qp cmd;
+ int ret;
- err = ib_copy_from_udata(&cmd, ucore, min(sizeof(cmd), ucore->inlen));
- if (err)
- return err;
+ ret = uverbs_request(attrs, &cmd, sizeof(cmd));
+ if (ret)
+ return ret;
if (cmd.comp_mask & ~IB_UVERBS_CREATE_QP_SUP_COMP_MASK)
return -EINVAL;
@@ -1697,26 +1542,13 @@ int ib_uverbs_ex_create_qp(struct ib_uverbs_file *file,
if (cmd.reserved)
return -EINVAL;
- if (ucore->outlen < (offsetof(typeof(resp), response_length) +
- sizeof(resp.response_length)))
- return -ENOSPC;
-
- err = create_qp(file, ucore, uhw, &cmd,
- min(ucore->inlen, sizeof(cmd)),
- ib_uverbs_ex_create_qp_cb, NULL);
-
- if (err)
- return err;
-
- return 0;
+ return create_qp(attrs, &cmd);
}
-ssize_t ib_uverbs_open_qp(struct ib_uverbs_file *file,
- const char __user *buf, int in_len, int out_len)
+static int ib_uverbs_open_qp(struct uverbs_attr_bundle *attrs)
{
struct ib_uverbs_open_qp cmd;
struct ib_uverbs_create_qp_resp resp;
- struct ib_udata udata;
struct ib_uqp_object *obj;
struct ib_xrcd *xrcd;
struct ib_uobject *uninitialized_var(xrcd_uobj);
@@ -1725,23 +1557,16 @@ ssize_t ib_uverbs_open_qp(struct ib_uverbs_file *file,
int ret;
struct ib_device *ib_dev;
- if (out_len < sizeof resp)
- return -ENOSPC;
-
- if (copy_from_user(&cmd, buf, sizeof cmd))
- return -EFAULT;
-
- ib_uverbs_init_udata(&udata, buf + sizeof(cmd),
- u64_to_user_ptr(cmd.response) + sizeof(resp),
- in_len - sizeof(cmd) - sizeof(struct ib_uverbs_cmd_hdr),
- out_len - sizeof(resp));
+ ret = uverbs_request(attrs, &cmd, sizeof(cmd));
+ if (ret)
+ return ret;
- obj = (struct ib_uqp_object *)uobj_alloc(UVERBS_OBJECT_QP, file,
+ obj = (struct ib_uqp_object *)uobj_alloc(UVERBS_OBJECT_QP, attrs,
&ib_dev);
if (IS_ERR(obj))
return PTR_ERR(obj);
- xrcd_uobj = uobj_get_read(UVERBS_OBJECT_XRCD, cmd.pd_handle, file);
+ xrcd_uobj = uobj_get_read(UVERBS_OBJECT_XRCD, cmd.pd_handle, attrs);
if (IS_ERR(xrcd_uobj)) {
ret = -EINVAL;
goto err_put;
@@ -1754,7 +1579,7 @@ ssize_t ib_uverbs_open_qp(struct ib_uverbs_file *file,
}
attr.event_handler = ib_uverbs_qp_event_handler;
- attr.qp_context = file;
+ attr.qp_context = attrs->ufile;
attr.qp_num = cmd.qpn;
attr.qp_type = cmd.qp_type;
@@ -1775,17 +1600,16 @@ ssize_t ib_uverbs_open_qp(struct ib_uverbs_file *file,
resp.qpn = qp->qp_num;
resp.qp_handle = obj->uevent.uobject.id;
- if (copy_to_user(u64_to_user_ptr(cmd.response), &resp, sizeof resp)) {
- ret = -EFAULT;
+ ret = uverbs_response(attrs, &resp, sizeof(resp));
+ if (ret)
goto err_destroy;
- }
obj->uxrcd = container_of(xrcd_uobj, struct ib_uxrcd_object, uobject);
atomic_inc(&obj->uxrcd->refcnt);
qp->uobject = &obj->uevent.uobject;
uobj_put_read(xrcd_uobj);
- return uobj_alloc_commit(&obj->uevent.uobject, in_len);
+ return uobj_alloc_commit(&obj->uevent.uobject);
err_destroy:
ib_destroy_qp(qp);
@@ -1818,9 +1642,7 @@ static void copy_ah_attr_to_uverbs(struct ib_uverbs_qp_dest *uverb_attr,
uverb_attr->port_num = rdma_ah_get_port_num(rdma_attr);
}
-ssize_t ib_uverbs_query_qp(struct ib_uverbs_file *file,
- const char __user *buf, int in_len,
- int out_len)
+static int ib_uverbs_query_qp(struct uverbs_attr_bundle *attrs)
{
struct ib_uverbs_query_qp cmd;
struct ib_uverbs_query_qp_resp resp;
@@ -1829,8 +1651,9 @@ ssize_t ib_uverbs_query_qp(struct ib_uverbs_file *file,
struct ib_qp_init_attr *init_attr;
int ret;
- if (copy_from_user(&cmd, buf, sizeof cmd))
- return -EFAULT;
+ ret = uverbs_request(attrs, &cmd, sizeof(cmd));
+ if (ret)
+ return ret;
attr = kmalloc(sizeof *attr, GFP_KERNEL);
init_attr = kmalloc(sizeof *init_attr, GFP_KERNEL);
@@ -1839,7 +1662,7 @@ ssize_t ib_uverbs_query_qp(struct ib_uverbs_file *file,
goto out;
}
- qp = uobj_get_obj_read(qp, UVERBS_OBJECT_QP, cmd.qp_handle, file);
+ qp = uobj_get_obj_read(qp, UVERBS_OBJECT_QP, cmd.qp_handle, attrs);
if (!qp) {
ret = -EINVAL;
goto out;
@@ -1886,14 +1709,13 @@ ssize_t ib_uverbs_query_qp(struct ib_uverbs_file *file,
resp.max_inline_data = init_attr->cap.max_inline_data;
resp.sq_sig_all = init_attr->sq_sig_type == IB_SIGNAL_ALL_WR;
- if (copy_to_user(u64_to_user_ptr(cmd.response), &resp, sizeof resp))
- ret = -EFAULT;
+ ret = uverbs_response(attrs, &resp, sizeof(resp));
out:
kfree(attr);
kfree(init_attr);
- return ret ? ret : in_len;
+ return ret;
}
/* Remove ignored fields set in the attribute mask */
@@ -1933,8 +1755,8 @@ static void copy_ah_attr_from_uverbs(struct ib_device *dev,
rdma_ah_set_make_grd(rdma_attr, false);
}
-static int modify_qp(struct ib_uverbs_file *file,
- struct ib_uverbs_ex_modify_qp *cmd, struct ib_udata *udata)
+static int modify_qp(struct uverbs_attr_bundle *attrs,
+ struct ib_uverbs_ex_modify_qp *cmd)
{
struct ib_qp_attr *attr;
struct ib_qp *qp;
@@ -1944,7 +1766,8 @@ static int modify_qp(struct ib_uverbs_file *file,
if (!attr)
return -ENOMEM;
- qp = uobj_get_obj_read(qp, UVERBS_OBJECT_QP, cmd->base.qp_handle, file);
+ qp = uobj_get_obj_read(qp, UVERBS_OBJECT_QP, cmd->base.qp_handle,
+ attrs);
if (!qp) {
ret = -EINVAL;
goto out;
@@ -2081,7 +1904,7 @@ static int modify_qp(struct ib_uverbs_file *file,
ret = ib_modify_qp_with_udata(qp, attr,
modify_qp_mask(qp->qp_type,
cmd->base.attr_mask),
- udata);
+ &attrs->driver_udata);
release_qp:
uobj_put_obj_read(qp);
@@ -2091,80 +1914,64 @@ out:
return ret;
}
-ssize_t ib_uverbs_modify_qp(struct ib_uverbs_file *file,
- const char __user *buf, int in_len,
- int out_len)
+static int ib_uverbs_modify_qp(struct uverbs_attr_bundle *attrs)
{
- struct ib_uverbs_ex_modify_qp cmd = {};
- struct ib_udata udata;
+ struct ib_uverbs_ex_modify_qp cmd;
int ret;
- if (copy_from_user(&cmd.base, buf, sizeof(cmd.base)))
- return -EFAULT;
+ ret = uverbs_request(attrs, &cmd.base, sizeof(cmd.base));
+ if (ret)
+ return ret;
if (cmd.base.attr_mask &
~((IB_USER_LEGACY_LAST_QP_ATTR_MASK << 1) - 1))
return -EOPNOTSUPP;
- ib_uverbs_init_udata(&udata, buf + sizeof(cmd.base), NULL,
- in_len - sizeof(cmd.base) - sizeof(struct ib_uverbs_cmd_hdr),
- out_len);
-
- ret = modify_qp(file, &cmd, &udata);
- if (ret)
- return ret;
-
- return in_len;
+ return modify_qp(attrs, &cmd);
}
-int ib_uverbs_ex_modify_qp(struct ib_uverbs_file *file,
- struct ib_udata *ucore,
- struct ib_udata *uhw)
+static int ib_uverbs_ex_modify_qp(struct uverbs_attr_bundle *attrs)
{
- struct ib_uverbs_ex_modify_qp cmd = {};
+ struct ib_uverbs_ex_modify_qp cmd;
+ struct ib_uverbs_ex_modify_qp_resp resp = {
+ .response_length = uverbs_response_length(attrs, sizeof(resp))
+ };
int ret;
+ ret = uverbs_request(attrs, &cmd, sizeof(cmd));
+ if (ret)
+ return ret;
+
/*
* Last bit is reserved for extending the attr_mask by
* using another field.
*/
BUILD_BUG_ON(IB_USER_LAST_QP_ATTR_MASK == (1 << 31));
- if (ucore->inlen < sizeof(cmd.base))
- return -EINVAL;
-
- ret = ib_copy_from_udata(&cmd, ucore, min(sizeof(cmd), ucore->inlen));
- if (ret)
- return ret;
-
if (cmd.base.attr_mask &
~((IB_USER_LAST_QP_ATTR_MASK << 1) - 1))
return -EOPNOTSUPP;
- if (ucore->inlen > sizeof(cmd)) {
- if (!ib_is_udata_cleared(ucore, sizeof(cmd),
- ucore->inlen - sizeof(cmd)))
- return -EOPNOTSUPP;
- }
-
- ret = modify_qp(file, &cmd, uhw);
+ ret = modify_qp(attrs, &cmd);
+ if (ret)
+ return ret;
- return ret;
+ return uverbs_response(attrs, &resp, sizeof(resp));
}
-ssize_t ib_uverbs_destroy_qp(struct ib_uverbs_file *file,
- const char __user *buf, int in_len,
- int out_len)
+static int ib_uverbs_destroy_qp(struct uverbs_attr_bundle *attrs)
{
struct ib_uverbs_destroy_qp cmd;
struct ib_uverbs_destroy_qp_resp resp;
struct ib_uobject *uobj;
struct ib_uqp_object *obj;
+ int ret;
- if (copy_from_user(&cmd, buf, sizeof cmd))
- return -EFAULT;
+ ret = uverbs_request(attrs, &cmd, sizeof(cmd));
+ if (ret)
+ return ret;
- uobj = uobj_get_destroy(UVERBS_OBJECT_QP, cmd.qp_handle, file);
+ uobj = uobj_get_destroy(UVERBS_OBJECT_QP, cmd.qp_handle, attrs);
if (IS_ERR(uobj))
return PTR_ERR(uobj);
@@ -2174,10 +1981,7 @@ ssize_t ib_uverbs_destroy_qp(struct ib_uverbs_file *file,
uobj_put_destroy(uobj);
- if (copy_to_user(u64_to_user_ptr(cmd.response), &resp, sizeof resp))
- return -EFAULT;
-
- return in_len;
+ return uverbs_response(attrs, &resp, sizeof(resp));
}
static void *alloc_wr(size_t wr_size, __u32 num_sge)
@@ -2190,9 +1994,7 @@ static void *alloc_wr(size_t wr_size, __u32 num_sge)
num_sge * sizeof (struct ib_sge), GFP_KERNEL);
}
-ssize_t ib_uverbs_post_send(struct ib_uverbs_file *file,
- const char __user *buf, int in_len,
- int out_len)
+static int ib_uverbs_post_send(struct uverbs_attr_bundle *attrs)
{
struct ib_uverbs_post_send cmd;
struct ib_uverbs_post_send_resp resp;
@@ -2202,33 +2004,41 @@ ssize_t ib_uverbs_post_send(struct ib_uverbs_file *file,
struct ib_qp *qp;
int i, sg_ind;
int is_ud;
- ssize_t ret = -EINVAL;
+ int ret, ret2;
size_t next_size;
+ const struct ib_sge __user *sgls;
+ const void __user *wqes;
+ struct uverbs_req_iter iter;
- if (copy_from_user(&cmd, buf, sizeof cmd))
- return -EFAULT;
-
- if (in_len < sizeof cmd + cmd.wqe_size * cmd.wr_count +
- cmd.sge_count * sizeof (struct ib_uverbs_sge))
- return -EINVAL;
-
- if (cmd.wqe_size < sizeof (struct ib_uverbs_send_wr))
- return -EINVAL;
+ ret = uverbs_request_start(attrs, &iter, &cmd, sizeof(cmd));
+ if (ret)
+ return ret;
+ wqes = uverbs_request_next_ptr(&iter, cmd.wqe_size * cmd.wr_count);
+ if (IS_ERR(wqes))
+ return PTR_ERR(wqes);
+ sgls = uverbs_request_next_ptr(
+ &iter, cmd.sge_count * sizeof(struct ib_uverbs_sge));
+ if (IS_ERR(sgls))
+ return PTR_ERR(sgls);
+ ret = uverbs_request_finish(&iter);
+ if (ret)
+ return ret;
user_wr = kmalloc(cmd.wqe_size, GFP_KERNEL);
if (!user_wr)
return -ENOMEM;
- qp = uobj_get_obj_read(qp, UVERBS_OBJECT_QP, cmd.qp_handle, file);
- if (!qp)
+ qp = uobj_get_obj_read(qp, UVERBS_OBJECT_QP, cmd.qp_handle, attrs);
+ if (!qp) {
+ ret = -EINVAL;
goto out;
+ }
is_ud = qp->qp_type == IB_QPT_UD;
sg_ind = 0;
last = NULL;
for (i = 0; i < cmd.wr_count; ++i) {
- if (copy_from_user(user_wr,
- buf + sizeof cmd + i * cmd.wqe_size,
+ if (copy_from_user(user_wr, wqes + i * cmd.wqe_size,
cmd.wqe_size)) {
ret = -EFAULT;
goto out_put;
@@ -2256,7 +2066,7 @@ ssize_t ib_uverbs_post_send(struct ib_uverbs_file *file,
}
ud->ah = uobj_get_obj_read(ah, UVERBS_OBJECT_AH,
- user_wr->wr.ud.ah, file);
+ user_wr->wr.ud.ah, attrs);
if (!ud->ah) {
kfree(ud);
ret = -EINVAL;
@@ -2336,11 +2146,9 @@ ssize_t ib_uverbs_post_send(struct ib_uverbs_file *file,
if (next->num_sge) {
next->sg_list = (void *) next +
ALIGN(next_size, sizeof(struct ib_sge));
- if (copy_from_user(next->sg_list,
- buf + sizeof cmd +
- cmd.wr_count * cmd.wqe_size +
- sg_ind * sizeof (struct ib_sge),
- next->num_sge * sizeof (struct ib_sge))) {
+ if (copy_from_user(next->sg_list, sgls + sg_ind,
+ next->num_sge *
+ sizeof(struct ib_sge))) {
ret = -EFAULT;
goto out_put;
}
@@ -2350,7 +2158,7 @@ ssize_t ib_uverbs_post_send(struct ib_uverbs_file *file,
}
resp.bad_wr = 0;
- ret = qp->device->post_send(qp->real_qp, wr, &bad_wr);
+ ret = qp->device->ops.post_send(qp->real_qp, wr, &bad_wr);
if (ret)
for (next = wr; next; next = next->next) {
++resp.bad_wr;
@@ -2358,8 +2166,9 @@ ssize_t ib_uverbs_post_send(struct ib_uverbs_file *file,
break;
}
- if (copy_to_user(u64_to_user_ptr(cmd.response), &resp, sizeof resp))
- ret = -EFAULT;
+ ret2 = uverbs_response(attrs, &resp, sizeof(resp));
+ if (ret2)
+ ret = ret2;
out_put:
uobj_put_obj_read(qp);
@@ -2375,28 +2184,35 @@ out_put:
out:
kfree(user_wr);
- return ret ? ret : in_len;
+ return ret;
}
-static struct ib_recv_wr *ib_uverbs_unmarshall_recv(const char __user *buf,
- int in_len,
- u32 wr_count,
- u32 sge_count,
- u32 wqe_size)
+static struct ib_recv_wr *
+ib_uverbs_unmarshall_recv(struct uverbs_req_iter *iter, u32 wr_count,
+ u32 wqe_size, u32 sge_count)
{
struct ib_uverbs_recv_wr *user_wr;
struct ib_recv_wr *wr = NULL, *last, *next;
int sg_ind;
int i;
int ret;
-
- if (in_len < wqe_size * wr_count +
- sge_count * sizeof (struct ib_uverbs_sge))
- return ERR_PTR(-EINVAL);
+ const struct ib_sge __user *sgls;
+ const void __user *wqes;
if (wqe_size < sizeof (struct ib_uverbs_recv_wr))
return ERR_PTR(-EINVAL);
+ wqes = uverbs_request_next_ptr(iter, wqe_size * wr_count);
+ if (IS_ERR(wqes))
+ return ERR_CAST(wqes);
+ sgls = uverbs_request_next_ptr(
+ iter, sge_count * sizeof(struct ib_uverbs_sge));
+ if (IS_ERR(sgls))
+ return ERR_CAST(sgls);
+ ret = uverbs_request_finish(iter);
+ if (ret)
+ return ERR_PTR(ret);
+
user_wr = kmalloc(wqe_size, GFP_KERNEL);
if (!user_wr)
return ERR_PTR(-ENOMEM);
@@ -2404,7 +2220,7 @@ static struct ib_recv_wr *ib_uverbs_unmarshall_recv(const char __user *buf,
sg_ind = 0;
last = NULL;
for (i = 0; i < wr_count; ++i) {
- if (copy_from_user(user_wr, buf + i * wqe_size,
+ if (copy_from_user(user_wr, wqes + i * wqe_size,
wqe_size)) {
ret = -EFAULT;
goto err;
@@ -2443,10 +2259,9 @@ static struct ib_recv_wr *ib_uverbs_unmarshall_recv(const char __user *buf,
if (next->num_sge) {
next->sg_list = (void *) next +
ALIGN(sizeof *next, sizeof (struct ib_sge));
- if (copy_from_user(next->sg_list,
- buf + wr_count * wqe_size +
- sg_ind * sizeof (struct ib_sge),
- next->num_sge * sizeof (struct ib_sge))) {
+ if (copy_from_user(next->sg_list, sgls + sg_ind,
+ next->num_sge *
+ sizeof(struct ib_sge))) {
ret = -EFAULT;
goto err;
}
@@ -2470,32 +2285,33 @@ err:
return ERR_PTR(ret);
}
-ssize_t ib_uverbs_post_recv(struct ib_uverbs_file *file,
- const char __user *buf, int in_len,
- int out_len)
+static int ib_uverbs_post_recv(struct uverbs_attr_bundle *attrs)
{
struct ib_uverbs_post_recv cmd;
struct ib_uverbs_post_recv_resp resp;
struct ib_recv_wr *wr, *next;
const struct ib_recv_wr *bad_wr;
struct ib_qp *qp;
- ssize_t ret = -EINVAL;
+ int ret, ret2;
+ struct uverbs_req_iter iter;
- if (copy_from_user(&cmd, buf, sizeof cmd))
- return -EFAULT;
+ ret = uverbs_request_start(attrs, &iter, &cmd, sizeof(cmd));
+ if (ret)
+ return ret;
- wr = ib_uverbs_unmarshall_recv(buf + sizeof cmd,
- in_len - sizeof cmd, cmd.wr_count,
- cmd.sge_count, cmd.wqe_size);
+ wr = ib_uverbs_unmarshall_recv(&iter, cmd.wr_count, cmd.wqe_size,
+ cmd.sge_count);
if (IS_ERR(wr))
return PTR_ERR(wr);
- qp = uobj_get_obj_read(qp, UVERBS_OBJECT_QP, cmd.qp_handle, file);
- if (!qp)
+ qp = uobj_get_obj_read(qp, UVERBS_OBJECT_QP, cmd.qp_handle, attrs);
+ if (!qp) {
+ ret = -EINVAL;
goto out;
+ }
resp.bad_wr = 0;
- ret = qp->device->post_recv(qp->real_qp, wr, &bad_wr);
+ ret = qp->device->ops.post_recv(qp->real_qp, wr, &bad_wr);
uobj_put_obj_read(qp);
if (ret) {
@@ -2506,9 +2322,9 @@ ssize_t ib_uverbs_post_recv(struct ib_uverbs_file *file,
}
}
- if (copy_to_user(u64_to_user_ptr(cmd.response), &resp, sizeof resp))
- ret = -EFAULT;
-
+ ret2 = uverbs_response(attrs, &resp, sizeof(resp));
+ if (ret2)
+ ret = ret2;
out:
while (wr) {
next = wr->next;
@@ -2516,36 +2332,36 @@ out:
wr = next;
}
- return ret ? ret : in_len;
+ return ret;
}
-ssize_t ib_uverbs_post_srq_recv(struct ib_uverbs_file *file,
- const char __user *buf, int in_len,
- int out_len)
+static int ib_uverbs_post_srq_recv(struct uverbs_attr_bundle *attrs)
{
struct ib_uverbs_post_srq_recv cmd;
struct ib_uverbs_post_srq_recv_resp resp;
struct ib_recv_wr *wr, *next;
const struct ib_recv_wr *bad_wr;
struct ib_srq *srq;
- ssize_t ret = -EINVAL;
+ int ret, ret2;
+ struct uverbs_req_iter iter;
- if (copy_from_user(&cmd, buf, sizeof cmd))
- return -EFAULT;
+ ret = uverbs_request_start(attrs, &iter, &cmd, sizeof(cmd));
+ if (ret)
+ return ret;
- wr = ib_uverbs_unmarshall_recv(buf + sizeof cmd,
- in_len - sizeof cmd, cmd.wr_count,
- cmd.sge_count, cmd.wqe_size);
+ wr = ib_uverbs_unmarshall_recv(&iter, cmd.wr_count, cmd.wqe_size,
+ cmd.sge_count);
if (IS_ERR(wr))
return PTR_ERR(wr);
- srq = uobj_get_obj_read(srq, UVERBS_OBJECT_SRQ, cmd.srq_handle, file);
- if (!srq)
+ srq = uobj_get_obj_read(srq, UVERBS_OBJECT_SRQ, cmd.srq_handle, attrs);
+ if (!srq) {
+ ret = -EINVAL;
goto out;
+ }
resp.bad_wr = 0;
- ret = srq->device->post_srq_recv ?
- srq->device->post_srq_recv(srq, wr, &bad_wr) : -EOPNOTSUPP;
+ ret = srq->device->ops.post_srq_recv(srq, wr, &bad_wr);
uobj_put_obj_read(srq);
@@ -2556,8 +2372,9 @@ ssize_t ib_uverbs_post_srq_recv(struct ib_uverbs_file *file,
break;
}
- if (copy_to_user(u64_to_user_ptr(cmd.response), &resp, sizeof resp))
- ret = -EFAULT;
+ ret2 = uverbs_response(attrs, &resp, sizeof(resp));
+ if (ret2)
+ ret = ret2;
out:
while (wr) {
@@ -2566,12 +2383,10 @@ out:
wr = next;
}
- return ret ? ret : in_len;
+ return ret;
}
-ssize_t ib_uverbs_create_ah(struct ib_uverbs_file *file,
- const char __user *buf, int in_len,
- int out_len)
+static int ib_uverbs_create_ah(struct uverbs_attr_bundle *attrs)
{
struct ib_uverbs_create_ah cmd;
struct ib_uverbs_create_ah_resp resp;
@@ -2580,21 +2395,13 @@ ssize_t ib_uverbs_create_ah(struct ib_uverbs_file *file,
struct ib_ah *ah;
struct rdma_ah_attr attr = {};
int ret;
- struct ib_udata udata;
struct ib_device *ib_dev;
- if (out_len < sizeof resp)
- return -ENOSPC;
-
- if (copy_from_user(&cmd, buf, sizeof cmd))
- return -EFAULT;
-
- ib_uverbs_init_udata(&udata, buf + sizeof(cmd),
- u64_to_user_ptr(cmd.response) + sizeof(resp),
- in_len - sizeof(cmd) - sizeof(struct ib_uverbs_cmd_hdr),
- out_len - sizeof(resp));
+ ret = uverbs_request(attrs, &cmd, sizeof(cmd));
+ if (ret)
+ return ret;
- uobj = uobj_alloc(UVERBS_OBJECT_AH, file, &ib_dev);
+ uobj = uobj_alloc(UVERBS_OBJECT_AH, attrs, &ib_dev);
if (IS_ERR(uobj))
return PTR_ERR(uobj);
@@ -2603,7 +2410,7 @@ ssize_t ib_uverbs_create_ah(struct ib_uverbs_file *file,
goto err;
}
- pd = uobj_get_obj_read(pd, UVERBS_OBJECT_PD, cmd.pd_handle, file);
+ pd = uobj_get_obj_read(pd, UVERBS_OBJECT_PD, cmd.pd_handle, attrs);
if (!pd) {
ret = -EINVAL;
goto err;
@@ -2627,7 +2434,7 @@ ssize_t ib_uverbs_create_ah(struct ib_uverbs_file *file,
rdma_ah_set_ah_flags(&attr, 0);
}
- ah = rdma_create_user_ah(pd, &attr, &udata);
+ ah = rdma_create_user_ah(pd, &attr, &attrs->driver_udata);
if (IS_ERR(ah)) {
ret = PTR_ERR(ah);
goto err_put;
@@ -2639,16 +2446,15 @@ ssize_t ib_uverbs_create_ah(struct ib_uverbs_file *file,
resp.ah_handle = uobj->id;
- if (copy_to_user(u64_to_user_ptr(cmd.response), &resp, sizeof resp)) {
- ret = -EFAULT;
+ ret = uverbs_response(attrs, &resp, sizeof(resp));
+ if (ret)
goto err_copy;
- }
uobj_put_obj_read(pd);
- return uobj_alloc_commit(uobj, in_len);
+ return uobj_alloc_commit(uobj);
err_copy:
- rdma_destroy_ah(ah);
+ rdma_destroy_ah(ah, RDMA_DESTROY_AH_SLEEPABLE);
err_put:
uobj_put_obj_read(pd);
@@ -2658,21 +2464,19 @@ err:
return ret;
}
-ssize_t ib_uverbs_destroy_ah(struct ib_uverbs_file *file,
- const char __user *buf, int in_len, int out_len)
+static int ib_uverbs_destroy_ah(struct uverbs_attr_bundle *attrs)
{
struct ib_uverbs_destroy_ah cmd;
+ int ret;
- if (copy_from_user(&cmd, buf, sizeof cmd))
- return -EFAULT;
+ ret = uverbs_request(attrs, &cmd, sizeof(cmd));
+ if (ret)
+ return ret;
- return uobj_perform_destroy(UVERBS_OBJECT_AH, cmd.ah_handle, file,
- in_len);
+ return uobj_perform_destroy(UVERBS_OBJECT_AH, cmd.ah_handle, attrs);
}
-ssize_t ib_uverbs_attach_mcast(struct ib_uverbs_file *file,
- const char __user *buf, int in_len,
- int out_len)
+static int ib_uverbs_attach_mcast(struct uverbs_attr_bundle *attrs)
{
struct ib_uverbs_attach_mcast cmd;
struct ib_qp *qp;
@@ -2680,10 +2484,11 @@ ssize_t ib_uverbs_attach_mcast(struct ib_uverbs_file *file,
struct ib_uverbs_mcast_entry *mcast;
int ret;
- if (copy_from_user(&cmd, buf, sizeof cmd))
- return -EFAULT;
+ ret = uverbs_request(attrs, &cmd, sizeof(cmd));
+ if (ret)
+ return ret;
- qp = uobj_get_obj_read(qp, UVERBS_OBJECT_QP, cmd.qp_handle, file);
+ qp = uobj_get_obj_read(qp, UVERBS_OBJECT_QP, cmd.qp_handle, attrs);
if (!qp)
return -EINVAL;
@@ -2716,12 +2521,10 @@ out_put:
mutex_unlock(&obj->mcast_lock);
uobj_put_obj_read(qp);
- return ret ? ret : in_len;
+ return ret;
}
-ssize_t ib_uverbs_detach_mcast(struct ib_uverbs_file *file,
- const char __user *buf, int in_len,
- int out_len)
+static int ib_uverbs_detach_mcast(struct uverbs_attr_bundle *attrs)
{
struct ib_uverbs_detach_mcast cmd;
struct ib_uqp_object *obj;
@@ -2730,10 +2533,11 @@ ssize_t ib_uverbs_detach_mcast(struct ib_uverbs_file *file,
int ret = -EINVAL;
bool found = false;
- if (copy_from_user(&cmd, buf, sizeof cmd))
- return -EFAULT;
+ ret = uverbs_request(attrs, &cmd, sizeof(cmd));
+ if (ret)
+ return ret;
- qp = uobj_get_obj_read(qp, UVERBS_OBJECT_QP, cmd.qp_handle, file);
+ qp = uobj_get_obj_read(qp, UVERBS_OBJECT_QP, cmd.qp_handle, attrs);
if (!qp)
return -EINVAL;
@@ -2759,7 +2563,7 @@ ssize_t ib_uverbs_detach_mcast(struct ib_uverbs_file *file,
out_put:
mutex_unlock(&obj->mcast_lock);
uobj_put_obj_read(qp);
- return ret ? ret : in_len;
+ return ret;
}
struct ib_uflow_resources *flow_resources_alloc(size_t num_specs)
@@ -2838,7 +2642,7 @@ void flow_resources_add(struct ib_uflow_resources *uflow_res,
}
EXPORT_SYMBOL(flow_resources_add);
-static int kern_spec_to_ib_spec_action(struct ib_uverbs_file *ufile,
+static int kern_spec_to_ib_spec_action(struct uverbs_attr_bundle *attrs,
struct ib_uverbs_flow_spec *kern_spec,
union ib_flow_spec *ib_spec,
struct ib_uflow_resources *uflow_res)
@@ -2867,7 +2671,7 @@ static int kern_spec_to_ib_spec_action(struct ib_uverbs_file *ufile,
ib_spec->action.act = uobj_get_obj_read(flow_action,
UVERBS_OBJECT_FLOW_ACTION,
kern_spec->action.handle,
- ufile);
+ attrs);
if (!ib_spec->action.act)
return -EINVAL;
ib_spec->action.size =
@@ -2885,7 +2689,7 @@ static int kern_spec_to_ib_spec_action(struct ib_uverbs_file *ufile,
uobj_get_obj_read(counters,
UVERBS_OBJECT_COUNTERS,
kern_spec->flow_count.handle,
- ufile);
+ attrs);
if (!ib_spec->flow_count.counters)
return -EINVAL;
ib_spec->flow_count.size =
@@ -3066,7 +2870,7 @@ static int kern_spec_to_ib_spec_filter(struct ib_uverbs_flow_spec *kern_spec,
kern_filter_sz, ib_spec);
}
-static int kern_spec_to_ib_spec(struct ib_uverbs_file *ufile,
+static int kern_spec_to_ib_spec(struct uverbs_attr_bundle *attrs,
struct ib_uverbs_flow_spec *kern_spec,
union ib_flow_spec *ib_spec,
struct ib_uflow_resources *uflow_res)
@@ -3075,17 +2879,15 @@ static int kern_spec_to_ib_spec(struct ib_uverbs_file *ufile,
return -EINVAL;
if (kern_spec->type >= IB_FLOW_SPEC_ACTION_TAG)
- return kern_spec_to_ib_spec_action(ufile, kern_spec, ib_spec,
+ return kern_spec_to_ib_spec_action(attrs, kern_spec, ib_spec,
uflow_res);
else
return kern_spec_to_ib_spec_filter(kern_spec, ib_spec);
}
-int ib_uverbs_ex_create_wq(struct ib_uverbs_file *file,
- struct ib_udata *ucore,
- struct ib_udata *uhw)
+static int ib_uverbs_ex_create_wq(struct uverbs_attr_bundle *attrs)
{
- struct ib_uverbs_ex_create_wq cmd = {};
+ struct ib_uverbs_ex_create_wq cmd;
struct ib_uverbs_ex_create_wq_resp resp = {};
struct ib_uwq_object *obj;
int err = 0;
@@ -3093,43 +2895,27 @@ int ib_uverbs_ex_create_wq(struct ib_uverbs_file *file,
struct ib_pd *pd;
struct ib_wq *wq;
struct ib_wq_init_attr wq_init_attr = {};
- size_t required_cmd_sz;
- size_t required_resp_len;
struct ib_device *ib_dev;
- required_cmd_sz = offsetof(typeof(cmd), max_sge) + sizeof(cmd.max_sge);
- required_resp_len = offsetof(typeof(resp), wqn) + sizeof(resp.wqn);
-
- if (ucore->inlen < required_cmd_sz)
- return -EINVAL;
-
- if (ucore->outlen < required_resp_len)
- return -ENOSPC;
-
- if (ucore->inlen > sizeof(cmd) &&
- !ib_is_udata_cleared(ucore, sizeof(cmd),
- ucore->inlen - sizeof(cmd)))
- return -EOPNOTSUPP;
-
- err = ib_copy_from_udata(&cmd, ucore, min(sizeof(cmd), ucore->inlen));
+ err = uverbs_request(attrs, &cmd, sizeof(cmd));
if (err)
return err;
if (cmd.comp_mask)
return -EOPNOTSUPP;
- obj = (struct ib_uwq_object *)uobj_alloc(UVERBS_OBJECT_WQ, file,
+ obj = (struct ib_uwq_object *)uobj_alloc(UVERBS_OBJECT_WQ, attrs,
&ib_dev);
if (IS_ERR(obj))
return PTR_ERR(obj);
- pd = uobj_get_obj_read(pd, UVERBS_OBJECT_PD, cmd.pd_handle, file);
+ pd = uobj_get_obj_read(pd, UVERBS_OBJECT_PD, cmd.pd_handle, attrs);
if (!pd) {
err = -EINVAL;
goto err_uobj;
}
- cq = uobj_get_obj_read(cq, UVERBS_OBJECT_CQ, cmd.cq_handle, file);
+ cq = uobj_get_obj_read(cq, UVERBS_OBJECT_CQ, cmd.cq_handle, attrs);
if (!cq) {
err = -EINVAL;
goto err_put_pd;
@@ -3138,20 +2924,14 @@ int ib_uverbs_ex_create_wq(struct ib_uverbs_file *file,
wq_init_attr.cq = cq;
wq_init_attr.max_sge = cmd.max_sge;
wq_init_attr.max_wr = cmd.max_wr;
- wq_init_attr.wq_context = file;
+ wq_init_attr.wq_context = attrs->ufile;
wq_init_attr.wq_type = cmd.wq_type;
wq_init_attr.event_handler = ib_uverbs_wq_event_handler;
- if (ucore->inlen >= (offsetof(typeof(cmd), create_flags) +
- sizeof(cmd.create_flags)))
- wq_init_attr.create_flags = cmd.create_flags;
+ wq_init_attr.create_flags = cmd.create_flags;
obj->uevent.events_reported = 0;
INIT_LIST_HEAD(&obj->uevent.event_list);
- if (!pd->device->create_wq) {
- err = -EOPNOTSUPP;
- goto err_put_cq;
- }
- wq = pd->device->create_wq(pd, &wq_init_attr, uhw);
+ wq = pd->device->ops.create_wq(pd, &wq_init_attr, &attrs->driver_udata);
if (IS_ERR(wq)) {
err = PTR_ERR(wq);
goto err_put_cq;
@@ -3175,15 +2955,14 @@ int ib_uverbs_ex_create_wq(struct ib_uverbs_file *file,
resp.max_sge = wq_init_attr.max_sge;
resp.max_wr = wq_init_attr.max_wr;
resp.wqn = wq->wq_num;
- resp.response_length = required_resp_len;
- err = ib_copy_to_udata(ucore,
- &resp, resp.response_length);
+ resp.response_length = uverbs_response_length(attrs, sizeof(resp));
+ err = uverbs_response(attrs, &resp, sizeof(resp));
if (err)
goto err_copy;
uobj_put_obj_read(pd);
uobj_put_obj_read(cq);
- return uobj_alloc_commit(&obj->uevent.uobject, 0);
+ return uobj_alloc_commit(&obj->uevent.uobject);
err_copy:
ib_destroy_wq(wq);
@@ -3197,41 +2976,23 @@ err_uobj:
return err;
}
-int ib_uverbs_ex_destroy_wq(struct ib_uverbs_file *file,
- struct ib_udata *ucore,
- struct ib_udata *uhw)
+static int ib_uverbs_ex_destroy_wq(struct uverbs_attr_bundle *attrs)
{
- struct ib_uverbs_ex_destroy_wq cmd = {};
+ struct ib_uverbs_ex_destroy_wq cmd;
struct ib_uverbs_ex_destroy_wq_resp resp = {};
struct ib_uobject *uobj;
struct ib_uwq_object *obj;
- size_t required_cmd_sz;
- size_t required_resp_len;
int ret;
- required_cmd_sz = offsetof(typeof(cmd), wq_handle) + sizeof(cmd.wq_handle);
- required_resp_len = offsetof(typeof(resp), reserved) + sizeof(resp.reserved);
-
- if (ucore->inlen < required_cmd_sz)
- return -EINVAL;
-
- if (ucore->outlen < required_resp_len)
- return -ENOSPC;
-
- if (ucore->inlen > sizeof(cmd) &&
- !ib_is_udata_cleared(ucore, sizeof(cmd),
- ucore->inlen - sizeof(cmd)))
- return -EOPNOTSUPP;
-
- ret = ib_copy_from_udata(&cmd, ucore, min(sizeof(cmd), ucore->inlen));
+ ret = uverbs_request(attrs, &cmd, sizeof(cmd));
if (ret)
return ret;
if (cmd.comp_mask)
return -EOPNOTSUPP;
- resp.response_length = required_resp_len;
- uobj = uobj_get_destroy(UVERBS_OBJECT_WQ, cmd.wq_handle, file);
+ resp.response_length = uverbs_response_length(attrs, sizeof(resp));
+ uobj = uobj_get_destroy(UVERBS_OBJECT_WQ, cmd.wq_handle, attrs);
if (IS_ERR(uobj))
return PTR_ERR(uobj);
@@ -3240,29 +3001,17 @@ int ib_uverbs_ex_destroy_wq(struct ib_uverbs_file *file,
uobj_put_destroy(uobj);
- return ib_copy_to_udata(ucore, &resp, resp.response_length);
+ return uverbs_response(attrs, &resp, sizeof(resp));
}
-int ib_uverbs_ex_modify_wq(struct ib_uverbs_file *file,
- struct ib_udata *ucore,
- struct ib_udata *uhw)
+static int ib_uverbs_ex_modify_wq(struct uverbs_attr_bundle *attrs)
{
- struct ib_uverbs_ex_modify_wq cmd = {};
+ struct ib_uverbs_ex_modify_wq cmd;
struct ib_wq *wq;
struct ib_wq_attr wq_attr = {};
- size_t required_cmd_sz;
int ret;
- required_cmd_sz = offsetof(typeof(cmd), curr_wq_state) + sizeof(cmd.curr_wq_state);
- if (ucore->inlen < required_cmd_sz)
- return -EINVAL;
-
- if (ucore->inlen > sizeof(cmd) &&
- !ib_is_udata_cleared(ucore, sizeof(cmd),
- ucore->inlen - sizeof(cmd)))
- return -EOPNOTSUPP;
-
- ret = ib_copy_from_udata(&cmd, ucore, min(sizeof(cmd), ucore->inlen));
+ ret = uverbs_request(attrs, &cmd, sizeof(cmd));
if (ret)
return ret;
@@ -3272,7 +3021,7 @@ int ib_uverbs_ex_modify_wq(struct ib_uverbs_file *file,
if (cmd.attr_mask > (IB_WQ_STATE | IB_WQ_CUR_STATE | IB_WQ_FLAGS))
return -EINVAL;
- wq = uobj_get_obj_read(wq, UVERBS_OBJECT_WQ, cmd.wq_handle, file);
+ wq = uobj_get_obj_read(wq, UVERBS_OBJECT_WQ, cmd.wq_handle, attrs);
if (!wq)
return -EINVAL;
@@ -3282,24 +3031,18 @@ int ib_uverbs_ex_modify_wq(struct ib_uverbs_file *file,
wq_attr.flags = cmd.flags;
wq_attr.flags_mask = cmd.flags_mask;
}
- if (!wq->device->modify_wq) {
- ret = -EOPNOTSUPP;
- goto out;
- }
- ret = wq->device->modify_wq(wq, &wq_attr, cmd.attr_mask, uhw);
-out:
+ ret = wq->device->ops.modify_wq(wq, &wq_attr, cmd.attr_mask,
+ &attrs->driver_udata);
uobj_put_obj_read(wq);
return ret;
}
-int ib_uverbs_ex_create_rwq_ind_table(struct ib_uverbs_file *file,
- struct ib_udata *ucore,
- struct ib_udata *uhw)
+static int ib_uverbs_ex_create_rwq_ind_table(struct uverbs_attr_bundle *attrs)
{
- struct ib_uverbs_ex_create_rwq_ind_table cmd = {};
+ struct ib_uverbs_ex_create_rwq_ind_table cmd;
struct ib_uverbs_ex_create_rwq_ind_table_resp resp = {};
struct ib_uobject *uobj;
- int err = 0;
+ int err;
struct ib_rwq_ind_table_init_attr init_attr = {};
struct ib_rwq_ind_table *rwq_ind_tbl;
struct ib_wq **wqs = NULL;
@@ -3307,27 +3050,13 @@ int ib_uverbs_ex_create_rwq_ind_table(struct ib_uverbs_file *file,
struct ib_wq *wq = NULL;
int i, j, num_read_wqs;
u32 num_wq_handles;
- u32 expected_in_size;
- size_t required_cmd_sz_header;
- size_t required_resp_len;
+ struct uverbs_req_iter iter;
struct ib_device *ib_dev;
- required_cmd_sz_header = offsetof(typeof(cmd), log_ind_tbl_size) + sizeof(cmd.log_ind_tbl_size);
- required_resp_len = offsetof(typeof(resp), ind_tbl_num) + sizeof(resp.ind_tbl_num);
-
- if (ucore->inlen < required_cmd_sz_header)
- return -EINVAL;
-
- if (ucore->outlen < required_resp_len)
- return -ENOSPC;
-
- err = ib_copy_from_udata(&cmd, ucore, required_cmd_sz_header);
+ err = uverbs_request_start(attrs, &iter, &cmd, sizeof(cmd));
if (err)
return err;
- ucore->inbuf += required_cmd_sz_header;
- ucore->inlen -= required_cmd_sz_header;
-
if (cmd.comp_mask)
return -EOPNOTSUPP;
@@ -3335,26 +3064,17 @@ int ib_uverbs_ex_create_rwq_ind_table(struct ib_uverbs_file *file,
return -EINVAL;
num_wq_handles = 1 << cmd.log_ind_tbl_size;
- expected_in_size = num_wq_handles * sizeof(__u32);
- if (num_wq_handles == 1)
- /* input size for wq handles is u64 aligned */
- expected_in_size += sizeof(__u32);
-
- if (ucore->inlen < expected_in_size)
- return -EINVAL;
-
- if (ucore->inlen > expected_in_size &&
- !ib_is_udata_cleared(ucore, expected_in_size,
- ucore->inlen - expected_in_size))
- return -EOPNOTSUPP;
-
wqs_handles = kcalloc(num_wq_handles, sizeof(*wqs_handles),
GFP_KERNEL);
if (!wqs_handles)
return -ENOMEM;
- err = ib_copy_from_udata(wqs_handles, ucore,
- num_wq_handles * sizeof(__u32));
+ err = uverbs_request_next(&iter, wqs_handles,
+ num_wq_handles * sizeof(__u32));
+ if (err)
+ goto err_free;
+
+ err = uverbs_request_finish(&iter);
if (err)
goto err_free;
@@ -3367,7 +3087,7 @@ int ib_uverbs_ex_create_rwq_ind_table(struct ib_uverbs_file *file,
for (num_read_wqs = 0; num_read_wqs < num_wq_handles;
num_read_wqs++) {
wq = uobj_get_obj_read(wq, UVERBS_OBJECT_WQ,
- wqs_handles[num_read_wqs], file);
+ wqs_handles[num_read_wqs], attrs);
if (!wq) {
err = -EINVAL;
goto put_wqs;
@@ -3376,7 +3096,7 @@ int ib_uverbs_ex_create_rwq_ind_table(struct ib_uverbs_file *file,
wqs[num_read_wqs] = wq;
}
- uobj = uobj_alloc(UVERBS_OBJECT_RWQ_IND_TBL, file, &ib_dev);
+ uobj = uobj_alloc(UVERBS_OBJECT_RWQ_IND_TBL, attrs, &ib_dev);
if (IS_ERR(uobj)) {
err = PTR_ERR(uobj);
goto put_wqs;
@@ -3385,11 +3105,8 @@ int ib_uverbs_ex_create_rwq_ind_table(struct ib_uverbs_file *file,
init_attr.log_ind_tbl_size = cmd.log_ind_tbl_size;
init_attr.ind_tbl = wqs;
- if (!ib_dev->create_rwq_ind_table) {
- err = -EOPNOTSUPP;
- goto err_uobj;
- }
- rwq_ind_tbl = ib_dev->create_rwq_ind_table(ib_dev, &init_attr, uhw);
+ rwq_ind_tbl = ib_dev->ops.create_rwq_ind_table(ib_dev, &init_attr,
+ &attrs->driver_udata);
if (IS_ERR(rwq_ind_tbl)) {
err = PTR_ERR(rwq_ind_tbl);
@@ -3408,10 +3125,9 @@ int ib_uverbs_ex_create_rwq_ind_table(struct ib_uverbs_file *file,
resp.ind_tbl_handle = uobj->id;
resp.ind_tbl_num = rwq_ind_tbl->ind_tbl_num;
- resp.response_length = required_resp_len;
+ resp.response_length = uverbs_response_length(attrs, sizeof(resp));
- err = ib_copy_to_udata(ucore,
- &resp, resp.response_length);
+ err = uverbs_response(attrs, &resp, sizeof(resp));
if (err)
goto err_copy;
@@ -3420,7 +3136,7 @@ int ib_uverbs_ex_create_rwq_ind_table(struct ib_uverbs_file *file,
for (j = 0; j < num_read_wqs; j++)
uobj_put_obj_read(wqs[j]);
- return uobj_alloc_commit(uobj, 0);
+ return uobj_alloc_commit(uobj);
err_copy:
ib_destroy_rwq_ind_table(rwq_ind_tbl);
@@ -3435,25 +3151,12 @@ err_free:
return err;
}
-int ib_uverbs_ex_destroy_rwq_ind_table(struct ib_uverbs_file *file,
- struct ib_udata *ucore,
- struct ib_udata *uhw)
+static int ib_uverbs_ex_destroy_rwq_ind_table(struct uverbs_attr_bundle *attrs)
{
- struct ib_uverbs_ex_destroy_rwq_ind_table cmd = {};
- int ret;
- size_t required_cmd_sz;
-
- required_cmd_sz = offsetof(typeof(cmd), ind_tbl_handle) + sizeof(cmd.ind_tbl_handle);
-
- if (ucore->inlen < required_cmd_sz)
- return -EINVAL;
-
- if (ucore->inlen > sizeof(cmd) &&
- !ib_is_udata_cleared(ucore, sizeof(cmd),
- ucore->inlen - sizeof(cmd)))
- return -EOPNOTSUPP;
+ struct ib_uverbs_ex_destroy_rwq_ind_table cmd;
+ int ret;
- ret = ib_copy_from_udata(&cmd, ucore, min(sizeof(cmd), ucore->inlen));
+ ret = uverbs_request(attrs, &cmd, sizeof(cmd));
if (ret)
return ret;
@@ -3461,12 +3164,10 @@ int ib_uverbs_ex_destroy_rwq_ind_table(struct ib_uverbs_file *file,
return -EOPNOTSUPP;
return uobj_perform_destroy(UVERBS_OBJECT_RWQ_IND_TBL,
- cmd.ind_tbl_handle, file, 0);
+ cmd.ind_tbl_handle, attrs);
}
-int ib_uverbs_ex_create_flow(struct ib_uverbs_file *file,
- struct ib_udata *ucore,
- struct ib_udata *uhw)
+static int ib_uverbs_ex_create_flow(struct uverbs_attr_bundle *attrs)
{
struct ib_uverbs_create_flow cmd;
struct ib_uverbs_create_flow_resp resp;
@@ -3477,24 +3178,16 @@ int ib_uverbs_ex_create_flow(struct ib_uverbs_file *file,
struct ib_qp *qp;
struct ib_uflow_resources *uflow_res;
struct ib_uverbs_flow_spec_hdr *kern_spec;
- int err = 0;
+ struct uverbs_req_iter iter;
+ int err;
void *ib_spec;
int i;
struct ib_device *ib_dev;
- if (ucore->inlen < sizeof(cmd))
- return -EINVAL;
-
- if (ucore->outlen < sizeof(resp))
- return -ENOSPC;
-
- err = ib_copy_from_udata(&cmd, ucore, sizeof(cmd));
+ err = uverbs_request_start(attrs, &iter, &cmd, sizeof(cmd));
if (err)
return err;
- ucore->inbuf += sizeof(cmd);
- ucore->inlen -= sizeof(cmd);
-
if (cmd.comp_mask)
return -EINVAL;
@@ -3512,8 +3205,7 @@ int ib_uverbs_ex_create_flow(struct ib_uverbs_file *file,
if (cmd.flow_attr.num_of_specs > IB_FLOW_SPEC_SUPPORT_LAYERS)
return -EINVAL;
- if (cmd.flow_attr.size > ucore->inlen ||
- cmd.flow_attr.size >
+ if (cmd.flow_attr.size >
(cmd.flow_attr.num_of_specs * sizeof(struct ib_uverbs_flow_spec)))
return -EINVAL;
@@ -3528,21 +3220,25 @@ int ib_uverbs_ex_create_flow(struct ib_uverbs_file *file,
return -ENOMEM;
*kern_flow_attr = cmd.flow_attr;
- err = ib_copy_from_udata(&kern_flow_attr->flow_specs, ucore,
- cmd.flow_attr.size);
+ err = uverbs_request_next(&iter, &kern_flow_attr->flow_specs,
+ cmd.flow_attr.size);
if (err)
goto err_free_attr;
} else {
kern_flow_attr = &cmd.flow_attr;
}
- uobj = uobj_alloc(UVERBS_OBJECT_FLOW, file, &ib_dev);
+ err = uverbs_request_finish(&iter);
+ if (err)
+ goto err_free_attr;
+
+ uobj = uobj_alloc(UVERBS_OBJECT_FLOW, attrs, &ib_dev);
if (IS_ERR(uobj)) {
err = PTR_ERR(uobj);
goto err_free_attr;
}
- qp = uobj_get_obj_read(qp, UVERBS_OBJECT_QP, cmd.qp_handle, file);
+ qp = uobj_get_obj_read(qp, UVERBS_OBJECT_QP, cmd.qp_handle, attrs);
if (!qp) {
err = -EINVAL;
goto err_uobj;
@@ -3553,11 +3249,6 @@ int ib_uverbs_ex_create_flow(struct ib_uverbs_file *file,
goto err_put;
}
- if (!qp->device->create_flow) {
- err = -EOPNOTSUPP;
- goto err_put;
- }
-
flow_attr = kzalloc(struct_size(flow_attr, flows,
cmd.flow_attr.num_of_specs), GFP_KERNEL);
if (!flow_attr) {
@@ -3584,7 +3275,7 @@ int ib_uverbs_ex_create_flow(struct ib_uverbs_file *file,
cmd.flow_attr.size >= kern_spec->size;
i++) {
err = kern_spec_to_ib_spec(
- file, (struct ib_uverbs_flow_spec *)kern_spec,
+ attrs, (struct ib_uverbs_flow_spec *)kern_spec,
ib_spec, uflow_res);
if (err)
goto err_free;
@@ -3602,8 +3293,8 @@ int ib_uverbs_ex_create_flow(struct ib_uverbs_file *file,
goto err_free;
}
- flow_id = qp->device->create_flow(qp, flow_attr,
- IB_FLOW_DOMAIN_USER, uhw);
+ flow_id = qp->device->ops.create_flow(
+ qp, flow_attr, IB_FLOW_DOMAIN_USER, &attrs->driver_udata);
if (IS_ERR(flow_id)) {
err = PTR_ERR(flow_id);
@@ -3615,8 +3306,7 @@ int ib_uverbs_ex_create_flow(struct ib_uverbs_file *file,
memset(&resp, 0, sizeof(resp));
resp.flow_handle = uobj->id;
- err = ib_copy_to_udata(ucore,
- &resp, sizeof(resp));
+ err = uverbs_response(attrs, &resp, sizeof(resp));
if (err)
goto err_copy;
@@ -3624,9 +3314,9 @@ int ib_uverbs_ex_create_flow(struct ib_uverbs_file *file,
kfree(flow_attr);
if (cmd.flow_attr.num_of_specs)
kfree(kern_flow_attr);
- return uobj_alloc_commit(uobj, 0);
+ return uobj_alloc_commit(uobj);
err_copy:
- if (!qp->device->destroy_flow(flow_id))
+ if (!qp->device->ops.destroy_flow(flow_id))
atomic_dec(&qp->usecnt);
err_free:
ib_uverbs_flow_resources_free(uflow_res);
@@ -3642,28 +3332,22 @@ err_free_attr:
return err;
}
-int ib_uverbs_ex_destroy_flow(struct ib_uverbs_file *file,
- struct ib_udata *ucore,
- struct ib_udata *uhw)
+static int ib_uverbs_ex_destroy_flow(struct uverbs_attr_bundle *attrs)
{
struct ib_uverbs_destroy_flow cmd;
int ret;
- if (ucore->inlen < sizeof(cmd))
- return -EINVAL;
-
- ret = ib_copy_from_udata(&cmd, ucore, sizeof(cmd));
+ ret = uverbs_request(attrs, &cmd, sizeof(cmd));
if (ret)
return ret;
if (cmd.comp_mask)
return -EINVAL;
- return uobj_perform_destroy(UVERBS_OBJECT_FLOW, cmd.flow_handle, file,
- 0);
+ return uobj_perform_destroy(UVERBS_OBJECT_FLOW, cmd.flow_handle, attrs);
}
-static int __uverbs_create_xsrq(struct ib_uverbs_file *file,
+static int __uverbs_create_xsrq(struct uverbs_attr_bundle *attrs,
struct ib_uverbs_create_xsrq *cmd,
struct ib_udata *udata)
{
@@ -3676,7 +3360,7 @@ static int __uverbs_create_xsrq(struct ib_uverbs_file *file,
int ret;
struct ib_device *ib_dev;
- obj = (struct ib_usrq_object *)uobj_alloc(UVERBS_OBJECT_SRQ, file,
+ obj = (struct ib_usrq_object *)uobj_alloc(UVERBS_OBJECT_SRQ, attrs,
&ib_dev);
if (IS_ERR(obj))
return PTR_ERR(obj);
@@ -3686,7 +3370,7 @@ static int __uverbs_create_xsrq(struct ib_uverbs_file *file,
if (cmd->srq_type == IB_SRQT_XRC) {
xrcd_uobj = uobj_get_read(UVERBS_OBJECT_XRCD, cmd->xrcd_handle,
- file);
+ attrs);
if (IS_ERR(xrcd_uobj)) {
ret = -EINVAL;
goto err;
@@ -3704,21 +3388,21 @@ static int __uverbs_create_xsrq(struct ib_uverbs_file *file,
if (ib_srq_has_cq(cmd->srq_type)) {
attr.ext.cq = uobj_get_obj_read(cq, UVERBS_OBJECT_CQ,
- cmd->cq_handle, file);
+ cmd->cq_handle, attrs);
if (!attr.ext.cq) {
ret = -EINVAL;
goto err_put_xrcd;
}
}
- pd = uobj_get_obj_read(pd, UVERBS_OBJECT_PD, cmd->pd_handle, file);
+ pd = uobj_get_obj_read(pd, UVERBS_OBJECT_PD, cmd->pd_handle, attrs);
if (!pd) {
ret = -EINVAL;
goto err_put_cq;
}
attr.event_handler = ib_uverbs_srq_event_handler;
- attr.srq_context = file;
+ attr.srq_context = attrs->ufile;
attr.srq_type = cmd->srq_type;
attr.attr.max_wr = cmd->max_wr;
attr.attr.max_sge = cmd->max_sge;
@@ -3727,7 +3411,7 @@ static int __uverbs_create_xsrq(struct ib_uverbs_file *file,
obj->uevent.events_reported = 0;
INIT_LIST_HEAD(&obj->uevent.event_list);
- srq = pd->device->create_srq(pd, &attr, udata);
+ srq = pd->device->ops.create_srq(pd, &attr, udata);
if (IS_ERR(srq)) {
ret = PTR_ERR(srq);
goto err_put;
@@ -3763,11 +3447,9 @@ static int __uverbs_create_xsrq(struct ib_uverbs_file *file,
if (cmd->srq_type == IB_SRQT_XRC)
resp.srqn = srq->ext.xrc.srq_num;
- if (copy_to_user(u64_to_user_ptr(cmd->response),
- &resp, sizeof resp)) {
- ret = -EFAULT;
+ ret = uverbs_response(attrs, &resp, sizeof(resp));
+ if (ret)
goto err_copy;
- }
if (cmd->srq_type == IB_SRQT_XRC)
uobj_put_read(xrcd_uobj);
@@ -3776,7 +3458,7 @@ static int __uverbs_create_xsrq(struct ib_uverbs_file *file,
uobj_put_obj_read(attr.ext.cq);
uobj_put_obj_read(pd);
- return uobj_alloc_commit(&obj->uevent.uobject, 0);
+ return uobj_alloc_commit(&obj->uevent.uobject);
err_copy:
ib_destroy_srq(srq);
@@ -3799,21 +3481,15 @@ err:
return ret;
}
-ssize_t ib_uverbs_create_srq(struct ib_uverbs_file *file,
- const char __user *buf, int in_len,
- int out_len)
+static int ib_uverbs_create_srq(struct uverbs_attr_bundle *attrs)
{
struct ib_uverbs_create_srq cmd;
struct ib_uverbs_create_xsrq xcmd;
- struct ib_uverbs_create_srq_resp resp;
- struct ib_udata udata;
int ret;
- if (out_len < sizeof resp)
- return -ENOSPC;
-
- if (copy_from_user(&cmd, buf, sizeof cmd))
- return -EFAULT;
+ ret = uverbs_request(attrs, &cmd, sizeof(cmd));
+ if (ret)
+ return ret;
memset(&xcmd, 0, sizeof(xcmd));
xcmd.response = cmd.response;
@@ -3824,77 +3500,48 @@ ssize_t ib_uverbs_create_srq(struct ib_uverbs_file *file,
xcmd.max_sge = cmd.max_sge;
xcmd.srq_limit = cmd.srq_limit;
- ib_uverbs_init_udata(&udata, buf + sizeof(cmd),
- u64_to_user_ptr(cmd.response) + sizeof(resp),
- in_len - sizeof(cmd) - sizeof(struct ib_uverbs_cmd_hdr),
- out_len - sizeof(resp));
-
- ret = __uverbs_create_xsrq(file, &xcmd, &udata);
- if (ret)
- return ret;
-
- return in_len;
+ return __uverbs_create_xsrq(attrs, &xcmd, &attrs->driver_udata);
}
-ssize_t ib_uverbs_create_xsrq(struct ib_uverbs_file *file,
- const char __user *buf, int in_len, int out_len)
+static int ib_uverbs_create_xsrq(struct uverbs_attr_bundle *attrs)
{
struct ib_uverbs_create_xsrq cmd;
- struct ib_uverbs_create_srq_resp resp;
- struct ib_udata udata;
int ret;
- if (out_len < sizeof resp)
- return -ENOSPC;
-
- if (copy_from_user(&cmd, buf, sizeof cmd))
- return -EFAULT;
-
- ib_uverbs_init_udata(&udata, buf + sizeof(cmd),
- u64_to_user_ptr(cmd.response) + sizeof(resp),
- in_len - sizeof(cmd) - sizeof(struct ib_uverbs_cmd_hdr),
- out_len - sizeof(resp));
-
- ret = __uverbs_create_xsrq(file, &cmd, &udata);
+ ret = uverbs_request(attrs, &cmd, sizeof(cmd));
if (ret)
return ret;
- return in_len;
+ return __uverbs_create_xsrq(attrs, &cmd, &attrs->driver_udata);
}
-ssize_t ib_uverbs_modify_srq(struct ib_uverbs_file *file,
- const char __user *buf, int in_len,
- int out_len)
+static int ib_uverbs_modify_srq(struct uverbs_attr_bundle *attrs)
{
struct ib_uverbs_modify_srq cmd;
- struct ib_udata udata;
struct ib_srq *srq;
struct ib_srq_attr attr;
int ret;
- if (copy_from_user(&cmd, buf, sizeof cmd))
- return -EFAULT;
-
- ib_uverbs_init_udata(&udata, buf + sizeof cmd, NULL, in_len - sizeof cmd,
- out_len);
+ ret = uverbs_request(attrs, &cmd, sizeof(cmd));
+ if (ret)
+ return ret;
- srq = uobj_get_obj_read(srq, UVERBS_OBJECT_SRQ, cmd.srq_handle, file);
+ srq = uobj_get_obj_read(srq, UVERBS_OBJECT_SRQ, cmd.srq_handle, attrs);
if (!srq)
return -EINVAL;
attr.max_wr = cmd.max_wr;
attr.srq_limit = cmd.srq_limit;
- ret = srq->device->modify_srq(srq, &attr, cmd.attr_mask, &udata);
+ ret = srq->device->ops.modify_srq(srq, &attr, cmd.attr_mask,
+ &attrs->driver_udata);
uobj_put_obj_read(srq);
- return ret ? ret : in_len;
+ return ret;
}
-ssize_t ib_uverbs_query_srq(struct ib_uverbs_file *file,
- const char __user *buf,
- int in_len, int out_len)
+static int ib_uverbs_query_srq(struct uverbs_attr_bundle *attrs)
{
struct ib_uverbs_query_srq cmd;
struct ib_uverbs_query_srq_resp resp;
@@ -3902,13 +3549,11 @@ ssize_t ib_uverbs_query_srq(struct ib_uverbs_file *file,
struct ib_srq *srq;
int ret;
- if (out_len < sizeof resp)
- return -ENOSPC;
-
- if (copy_from_user(&cmd, buf, sizeof cmd))
- return -EFAULT;
+ ret = uverbs_request(attrs, &cmd, sizeof(cmd));
+ if (ret)
+ return ret;
- srq = uobj_get_obj_read(srq, UVERBS_OBJECT_SRQ, cmd.srq_handle, file);
+ srq = uobj_get_obj_read(srq, UVERBS_OBJECT_SRQ, cmd.srq_handle, attrs);
if (!srq)
return -EINVAL;
@@ -3925,25 +3570,22 @@ ssize_t ib_uverbs_query_srq(struct ib_uverbs_file *file,
resp.max_sge = attr.max_sge;
resp.srq_limit = attr.srq_limit;
- if (copy_to_user(u64_to_user_ptr(cmd.response), &resp, sizeof resp))
- return -EFAULT;
-
- return in_len;
+ return uverbs_response(attrs, &resp, sizeof(resp));
}
-ssize_t ib_uverbs_destroy_srq(struct ib_uverbs_file *file,
- const char __user *buf, int in_len,
- int out_len)
+static int ib_uverbs_destroy_srq(struct uverbs_attr_bundle *attrs)
{
struct ib_uverbs_destroy_srq cmd;
struct ib_uverbs_destroy_srq_resp resp;
struct ib_uobject *uobj;
struct ib_uevent_object *obj;
+ int ret;
- if (copy_from_user(&cmd, buf, sizeof cmd))
- return -EFAULT;
+ ret = uverbs_request(attrs, &cmd, sizeof(cmd));
+ if (ret)
+ return ret;
- uobj = uobj_get_destroy(UVERBS_OBJECT_SRQ, cmd.srq_handle, file);
+ uobj = uobj_get_destroy(UVERBS_OBJECT_SRQ, cmd.srq_handle, attrs);
if (IS_ERR(uobj))
return PTR_ERR(uobj);
@@ -3953,35 +3595,24 @@ ssize_t ib_uverbs_destroy_srq(struct ib_uverbs_file *file,
uobj_put_destroy(uobj);
- if (copy_to_user(u64_to_user_ptr(cmd.response), &resp, sizeof(resp)))
- return -EFAULT;
-
- return in_len;
+ return uverbs_response(attrs, &resp, sizeof(resp));
}
-int ib_uverbs_ex_query_device(struct ib_uverbs_file *file,
- struct ib_udata *ucore,
- struct ib_udata *uhw)
+static int ib_uverbs_ex_query_device(struct uverbs_attr_bundle *attrs)
{
- struct ib_uverbs_ex_query_device_resp resp = { {0} };
+ struct ib_uverbs_ex_query_device_resp resp = {};
struct ib_uverbs_ex_query_device cmd;
struct ib_device_attr attr = {0};
struct ib_ucontext *ucontext;
struct ib_device *ib_dev;
int err;
- ucontext = ib_uverbs_get_ucontext(file);
+ ucontext = ib_uverbs_get_ucontext(attrs);
if (IS_ERR(ucontext))
return PTR_ERR(ucontext);
ib_dev = ucontext->device;
- if (!ib_dev->query_device)
- return -EOPNOTSUPP;
-
- if (ucore->inlen < sizeof(cmd))
- return -EINVAL;
-
- err = ib_copy_from_udata(&cmd, ucore, sizeof(cmd));
+ err = uverbs_request(attrs, &cmd, sizeof(cmd));
if (err)
return err;
@@ -3991,21 +3622,12 @@ int ib_uverbs_ex_query_device(struct ib_uverbs_file *file,
if (cmd.reserved)
return -EINVAL;
- resp.response_length = offsetof(typeof(resp), odp_caps);
-
- if (ucore->outlen < resp.response_length)
- return -ENOSPC;
-
- err = ib_dev->query_device(ib_dev, &attr, uhw);
+ err = ib_dev->ops.query_device(ib_dev, &attr, &attrs->driver_udata);
if (err)
return err;
copy_query_dev_fields(ucontext, &resp.base, &attr);
- if (ucore->outlen < resp.response_length + sizeof(resp.odp_caps))
- goto end;
-
-#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
resp.odp_caps.general_caps = attr.odp_caps.general_caps;
resp.odp_caps.per_transport_caps.rc_odp_caps =
attr.odp_caps.per_transport_caps.rc_odp_caps;
@@ -4013,100 +3635,40 @@ int ib_uverbs_ex_query_device(struct ib_uverbs_file *file,
attr.odp_caps.per_transport_caps.uc_odp_caps;
resp.odp_caps.per_transport_caps.ud_odp_caps =
attr.odp_caps.per_transport_caps.ud_odp_caps;
-#endif
- resp.response_length += sizeof(resp.odp_caps);
-
- if (ucore->outlen < resp.response_length + sizeof(resp.timestamp_mask))
- goto end;
+ resp.xrc_odp_caps = attr.odp_caps.per_transport_caps.xrc_odp_caps;
resp.timestamp_mask = attr.timestamp_mask;
- resp.response_length += sizeof(resp.timestamp_mask);
-
- if (ucore->outlen < resp.response_length + sizeof(resp.hca_core_clock))
- goto end;
-
resp.hca_core_clock = attr.hca_core_clock;
- resp.response_length += sizeof(resp.hca_core_clock);
-
- if (ucore->outlen < resp.response_length + sizeof(resp.device_cap_flags_ex))
- goto end;
-
resp.device_cap_flags_ex = attr.device_cap_flags;
- resp.response_length += sizeof(resp.device_cap_flags_ex);
-
- if (ucore->outlen < resp.response_length + sizeof(resp.rss_caps))
- goto end;
-
resp.rss_caps.supported_qpts = attr.rss_caps.supported_qpts;
resp.rss_caps.max_rwq_indirection_tables =
attr.rss_caps.max_rwq_indirection_tables;
resp.rss_caps.max_rwq_indirection_table_size =
attr.rss_caps.max_rwq_indirection_table_size;
-
- resp.response_length += sizeof(resp.rss_caps);
-
- if (ucore->outlen < resp.response_length + sizeof(resp.max_wq_type_rq))
- goto end;
-
resp.max_wq_type_rq = attr.max_wq_type_rq;
- resp.response_length += sizeof(resp.max_wq_type_rq);
-
- if (ucore->outlen < resp.response_length + sizeof(resp.raw_packet_caps))
- goto end;
-
resp.raw_packet_caps = attr.raw_packet_caps;
- resp.response_length += sizeof(resp.raw_packet_caps);
-
- if (ucore->outlen < resp.response_length + sizeof(resp.tm_caps))
- goto end;
-
resp.tm_caps.max_rndv_hdr_size = attr.tm_caps.max_rndv_hdr_size;
resp.tm_caps.max_num_tags = attr.tm_caps.max_num_tags;
resp.tm_caps.max_ops = attr.tm_caps.max_ops;
resp.tm_caps.max_sge = attr.tm_caps.max_sge;
resp.tm_caps.flags = attr.tm_caps.flags;
- resp.response_length += sizeof(resp.tm_caps);
-
- if (ucore->outlen < resp.response_length + sizeof(resp.cq_moderation_caps))
- goto end;
-
resp.cq_moderation_caps.max_cq_moderation_count =
attr.cq_caps.max_cq_moderation_count;
resp.cq_moderation_caps.max_cq_moderation_period =
attr.cq_caps.max_cq_moderation_period;
- resp.response_length += sizeof(resp.cq_moderation_caps);
-
- if (ucore->outlen < resp.response_length + sizeof(resp.max_dm_size))
- goto end;
-
resp.max_dm_size = attr.max_dm_size;
- resp.response_length += sizeof(resp.max_dm_size);
-end:
- err = ib_copy_to_udata(ucore, &resp, resp.response_length);
- return err;
+ resp.response_length = uverbs_response_length(attrs, sizeof(resp));
+
+ return uverbs_response(attrs, &resp, sizeof(resp));
}
-int ib_uverbs_ex_modify_cq(struct ib_uverbs_file *file,
- struct ib_udata *ucore,
- struct ib_udata *uhw)
+static int ib_uverbs_ex_modify_cq(struct uverbs_attr_bundle *attrs)
{
- struct ib_uverbs_ex_modify_cq cmd = {};
+ struct ib_uverbs_ex_modify_cq cmd;
struct ib_cq *cq;
- size_t required_cmd_sz;
int ret;
- required_cmd_sz = offsetof(typeof(cmd), reserved) +
- sizeof(cmd.reserved);
- if (ucore->inlen < required_cmd_sz)
- return -EINVAL;
-
- /* sanity checks */
- if (ucore->inlen > sizeof(cmd) &&
- !ib_is_udata_cleared(ucore, sizeof(cmd),
- ucore->inlen - sizeof(cmd)))
- return -EOPNOTSUPP;
-
- ret = ib_copy_from_udata(&cmd, ucore, min(sizeof(cmd), ucore->inlen));
+ ret = uverbs_request(attrs, &cmd, sizeof(cmd));
if (ret)
return ret;
@@ -4116,7 +3678,7 @@ int ib_uverbs_ex_modify_cq(struct ib_uverbs_file *file,
if (cmd.attr_mask > IB_CQ_MODERATE)
return -EOPNOTSUPP;
- cq = uobj_get_obj_read(cq, UVERBS_OBJECT_CQ, cmd.cq_handle, file);
+ cq = uobj_get_obj_read(cq, UVERBS_OBJECT_CQ, cmd.cq_handle, attrs);
if (!cq)
return -EINVAL;
@@ -4126,3 +3688,381 @@ int ib_uverbs_ex_modify_cq(struct ib_uverbs_file *file,
return ret;
}
+
+/*
+ * Describe the input structs for write(). Some write methods have an input
+ * only struct, most have an input and output. If the struct has an output then
+ * the 'response' u64 must be the first field in the request structure.
+ *
+ * If udata is present then both the request and response structs have a
+ * trailing driver_data flex array. In this case the size of the base struct
+ * cannot be changed.
+ */
+/* Offset of the first byte after _member: its offsetof() plus its sizeof(). */
+#define offsetof_after(_struct, _member) \
+ (offsetof(_struct, _member) + sizeof(((_struct *)NULL)->_member))
+
+/*
+ * Method with a request and a response struct. has_resp is 1 plus two
+ * BUILD_BUG_ON_ZERO() terms that check, at build time, that 'response' is at
+ * offset 0 of req and has the size of a u64. The sizes recorded are the full
+ * sizeof() of each struct.
+ */
+#define UAPI_DEF_WRITE_IO(req, resp) \
+ .write.has_resp = 1 + \
+ BUILD_BUG_ON_ZERO(offsetof(req, response) != 0) + \
+ BUILD_BUG_ON_ZERO(sizeof(((req *)0)->response) != \
+ sizeof(u64)), \
+ .write.req_size = sizeof(req), .write.resp_size = sizeof(resp)
+
+/* Method with a request struct only; has_resp is left unset. */
+#define UAPI_DEF_WRITE_I(req) .write.req_size = sizeof(req)
+
+/*
+ * As UAPI_DEF_WRITE_IO, and also sets has_udata. The BUILD_BUG_ON_ZERO() terms
+ * check that driver_data starts exactly at sizeof() of both req and resp, i.e.
+ * that it is the trailing member of each struct.
+ */
+#define UAPI_DEF_WRITE_UDATA_IO(req, resp) \
+ UAPI_DEF_WRITE_IO(req, resp), \
+ .write.has_udata = \
+ 1 + \
+ BUILD_BUG_ON_ZERO(offsetof(req, driver_data) != \
+ sizeof(req)) + \
+ BUILD_BUG_ON_ZERO(offsetof(resp, driver_data) != \
+ sizeof(resp))
+
+/*
+ * As UAPI_DEF_WRITE_I, and also sets has_udata after checking that driver_data
+ * starts exactly at sizeof(req).
+ */
+#define UAPI_DEF_WRITE_UDATA_I(req) \
+ UAPI_DEF_WRITE_I(req), \
+ .write.has_udata = \
+ 1 + BUILD_BUG_ON_ZERO(offsetof(req, driver_data) != \
+ sizeof(req))
+
+/*
+ * The _EX versions are for use with WRITE_EX and allow the last struct member
+ * to be specified. Buffers that do not include that member will be rejected.
+ */
+/* Sizes run up to and including the named last member of req and of resp. */
+#define UAPI_DEF_WRITE_IO_EX(req, req_last_member, resp, resp_last_member) \
+ .write.has_resp = 1, \
+ .write.req_size = offsetof_after(req, req_last_member), \
+ .write.resp_size = offsetof_after(resp, resp_last_member)
+
+/* Request-only _EX form: size runs up to and including req_last_member. */
+#define UAPI_DEF_WRITE_I_EX(req, req_last_member) \
+ .write.req_size = offsetof_after(req, req_last_member)
+
+const struct uapi_definition uverbs_def_write_intf[] = {
+ DECLARE_UVERBS_OBJECT(
+ UVERBS_OBJECT_AH,
+ DECLARE_UVERBS_WRITE(IB_USER_VERBS_CMD_CREATE_AH,
+ ib_uverbs_create_ah,
+ UAPI_DEF_WRITE_UDATA_IO(
+ struct ib_uverbs_create_ah,
+ struct ib_uverbs_create_ah_resp),
+ UAPI_DEF_METHOD_NEEDS_FN(create_ah)),
+ DECLARE_UVERBS_WRITE(
+ IB_USER_VERBS_CMD_DESTROY_AH,
+ ib_uverbs_destroy_ah,
+ UAPI_DEF_WRITE_I(struct ib_uverbs_destroy_ah),
+ UAPI_DEF_METHOD_NEEDS_FN(destroy_ah))),
+
+ DECLARE_UVERBS_OBJECT(
+ UVERBS_OBJECT_COMP_CHANNEL,
+ DECLARE_UVERBS_WRITE(
+ IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL,
+ ib_uverbs_create_comp_channel,
+ UAPI_DEF_WRITE_IO(
+ struct ib_uverbs_create_comp_channel,
+ struct ib_uverbs_create_comp_channel_resp))),
+
+ /*
+ * write()/write_ex() methods of the CQ object. Each method names the
+ * driver op it needs through UAPI_DEF_METHOD_NEEDS_FN().
+ */
+ DECLARE_UVERBS_OBJECT(
+ UVERBS_OBJECT_CQ,
+ DECLARE_UVERBS_WRITE(IB_USER_VERBS_CMD_CREATE_CQ,
+ ib_uverbs_create_cq,
+ UAPI_DEF_WRITE_UDATA_IO(
+ struct ib_uverbs_create_cq,
+ struct ib_uverbs_create_cq_resp),
+ UAPI_DEF_METHOD_NEEDS_FN(create_cq)),
+ DECLARE_UVERBS_WRITE(
+ IB_USER_VERBS_CMD_DESTROY_CQ,
+ ib_uverbs_destroy_cq,
+ UAPI_DEF_WRITE_IO(struct ib_uverbs_destroy_cq,
+ struct ib_uverbs_destroy_cq_resp),
+ UAPI_DEF_METHOD_NEEDS_FN(destroy_cq)),
+ DECLARE_UVERBS_WRITE(
+ IB_USER_VERBS_CMD_POLL_CQ,
+ ib_uverbs_poll_cq,
+ UAPI_DEF_WRITE_IO(struct ib_uverbs_poll_cq,
+ struct ib_uverbs_poll_cq_resp),
+ UAPI_DEF_METHOD_NEEDS_FN(poll_cq)),
+ DECLARE_UVERBS_WRITE(
+ IB_USER_VERBS_CMD_REQ_NOTIFY_CQ,
+ ib_uverbs_req_notify_cq,
+ UAPI_DEF_WRITE_I(struct ib_uverbs_req_notify_cq),
+ UAPI_DEF_METHOD_NEEDS_FN(req_notify_cq)),
+ DECLARE_UVERBS_WRITE(IB_USER_VERBS_CMD_RESIZE_CQ,
+ ib_uverbs_resize_cq,
+ UAPI_DEF_WRITE_UDATA_IO(
+ struct ib_uverbs_resize_cq,
+ struct ib_uverbs_resize_cq_resp),
+ UAPI_DEF_METHOD_NEEDS_FN(resize_cq)),
+ DECLARE_UVERBS_WRITE_EX(
+ IB_USER_VERBS_EX_CMD_CREATE_CQ,
+ ib_uverbs_ex_create_cq,
+ UAPI_DEF_WRITE_IO_EX(struct ib_uverbs_ex_create_cq,
+ reserved,
+ struct ib_uverbs_ex_create_cq_resp,
+ response_length),
+ UAPI_DEF_METHOD_NEEDS_FN(create_cq)),
+ /*
+ * MODIFY_CQ acts on an existing CQ, so gate it on the driver providing
+ * modify_cq. It was gated on create_cq, which looks like a copy/paste
+ * slip from the entry above.
+ * NOTE(review): confirm ib_uverbs_ex_modify_cq ends up in ops.modify_cq.
+ */
+ DECLARE_UVERBS_WRITE_EX(
+ IB_USER_VERBS_EX_CMD_MODIFY_CQ,
+ ib_uverbs_ex_modify_cq,
+ UAPI_DEF_WRITE_I(struct ib_uverbs_ex_modify_cq),
+ UAPI_DEF_METHOD_NEEDS_FN(modify_cq))),
+
+ DECLARE_UVERBS_OBJECT(
+ UVERBS_OBJECT_DEVICE,
+ DECLARE_UVERBS_WRITE(IB_USER_VERBS_CMD_GET_CONTEXT,
+ ib_uverbs_get_context,
+ UAPI_DEF_WRITE_UDATA_IO(
+ struct ib_uverbs_get_context,
+ struct ib_uverbs_get_context_resp)),
+ DECLARE_UVERBS_WRITE(
+ IB_USER_VERBS_CMD_QUERY_DEVICE,
+ ib_uverbs_query_device,
+ UAPI_DEF_WRITE_IO(struct ib_uverbs_query_device,
+ struct ib_uverbs_query_device_resp)),
+ DECLARE_UVERBS_WRITE(
+ IB_USER_VERBS_CMD_QUERY_PORT,
+ ib_uverbs_query_port,
+ UAPI_DEF_WRITE_IO(struct ib_uverbs_query_port,
+ struct ib_uverbs_query_port_resp),
+ UAPI_DEF_METHOD_NEEDS_FN(query_port)),
+ DECLARE_UVERBS_WRITE_EX(
+ IB_USER_VERBS_EX_CMD_QUERY_DEVICE,
+ ib_uverbs_ex_query_device,
+ UAPI_DEF_WRITE_IO_EX(
+ struct ib_uverbs_ex_query_device,
+ reserved,
+ struct ib_uverbs_ex_query_device_resp,
+ response_length),
+ UAPI_DEF_METHOD_NEEDS_FN(query_device)),
+ UAPI_DEF_OBJ_NEEDS_FN(alloc_ucontext),
+ UAPI_DEF_OBJ_NEEDS_FN(dealloc_ucontext)),
+
+ DECLARE_UVERBS_OBJECT(
+ UVERBS_OBJECT_FLOW,
+ DECLARE_UVERBS_WRITE_EX(
+ IB_USER_VERBS_EX_CMD_CREATE_FLOW,
+ ib_uverbs_ex_create_flow,
+ UAPI_DEF_WRITE_IO_EX(struct ib_uverbs_create_flow,
+ flow_attr,
+ struct ib_uverbs_create_flow_resp,
+ flow_handle),
+ UAPI_DEF_METHOD_NEEDS_FN(create_flow)),
+ DECLARE_UVERBS_WRITE_EX(
+ IB_USER_VERBS_EX_CMD_DESTROY_FLOW,
+ ib_uverbs_ex_destroy_flow,
+ UAPI_DEF_WRITE_I(struct ib_uverbs_destroy_flow),
+ UAPI_DEF_METHOD_NEEDS_FN(destroy_flow))),
+
+ DECLARE_UVERBS_OBJECT(
+ UVERBS_OBJECT_MR,
+ DECLARE_UVERBS_WRITE(IB_USER_VERBS_CMD_DEREG_MR,
+ ib_uverbs_dereg_mr,
+ UAPI_DEF_WRITE_I(struct ib_uverbs_dereg_mr),
+ UAPI_DEF_METHOD_NEEDS_FN(dereg_mr)),
+ DECLARE_UVERBS_WRITE(
+ IB_USER_VERBS_CMD_REG_MR,
+ ib_uverbs_reg_mr,
+ UAPI_DEF_WRITE_UDATA_IO(struct ib_uverbs_reg_mr,
+ struct ib_uverbs_reg_mr_resp),
+ UAPI_DEF_METHOD_NEEDS_FN(reg_user_mr)),
+ DECLARE_UVERBS_WRITE(
+ IB_USER_VERBS_CMD_REREG_MR,
+ ib_uverbs_rereg_mr,
+ UAPI_DEF_WRITE_UDATA_IO(struct ib_uverbs_rereg_mr,
+ struct ib_uverbs_rereg_mr_resp),
+ UAPI_DEF_METHOD_NEEDS_FN(rereg_user_mr))),
+
+ DECLARE_UVERBS_OBJECT(
+ UVERBS_OBJECT_MW,
+ DECLARE_UVERBS_WRITE(
+ IB_USER_VERBS_CMD_ALLOC_MW,
+ ib_uverbs_alloc_mw,
+ UAPI_DEF_WRITE_UDATA_IO(struct ib_uverbs_alloc_mw,
+ struct ib_uverbs_alloc_mw_resp),
+ UAPI_DEF_METHOD_NEEDS_FN(alloc_mw)),
+ DECLARE_UVERBS_WRITE(
+ IB_USER_VERBS_CMD_DEALLOC_MW,
+ ib_uverbs_dealloc_mw,
+ UAPI_DEF_WRITE_I(struct ib_uverbs_dealloc_mw),
+ UAPI_DEF_METHOD_NEEDS_FN(dealloc_mw))),
+
+ DECLARE_UVERBS_OBJECT(
+ UVERBS_OBJECT_PD,
+ DECLARE_UVERBS_WRITE(
+ IB_USER_VERBS_CMD_ALLOC_PD,
+ ib_uverbs_alloc_pd,
+ UAPI_DEF_WRITE_UDATA_IO(struct ib_uverbs_alloc_pd,
+ struct ib_uverbs_alloc_pd_resp),
+ UAPI_DEF_METHOD_NEEDS_FN(alloc_pd)),
+ DECLARE_UVERBS_WRITE(
+ IB_USER_VERBS_CMD_DEALLOC_PD,
+ ib_uverbs_dealloc_pd,
+ UAPI_DEF_WRITE_I(struct ib_uverbs_dealloc_pd),
+ UAPI_DEF_METHOD_NEEDS_FN(dealloc_pd))),
+
+ DECLARE_UVERBS_OBJECT(
+ UVERBS_OBJECT_QP,
+ DECLARE_UVERBS_WRITE(
+ IB_USER_VERBS_CMD_ATTACH_MCAST,
+ ib_uverbs_attach_mcast,
+ UAPI_DEF_WRITE_I(struct ib_uverbs_attach_mcast),
+ UAPI_DEF_METHOD_NEEDS_FN(attach_mcast),
+ UAPI_DEF_METHOD_NEEDS_FN(detach_mcast)),
+ DECLARE_UVERBS_WRITE(IB_USER_VERBS_CMD_CREATE_QP,
+ ib_uverbs_create_qp,
+ UAPI_DEF_WRITE_UDATA_IO(
+ struct ib_uverbs_create_qp,
+ struct ib_uverbs_create_qp_resp),
+ UAPI_DEF_METHOD_NEEDS_FN(create_qp)),
+ DECLARE_UVERBS_WRITE(
+ IB_USER_VERBS_CMD_DESTROY_QP,
+ ib_uverbs_destroy_qp,
+ UAPI_DEF_WRITE_IO(struct ib_uverbs_destroy_qp,
+ struct ib_uverbs_destroy_qp_resp),
+ UAPI_DEF_METHOD_NEEDS_FN(destroy_qp)),
+ DECLARE_UVERBS_WRITE(
+ IB_USER_VERBS_CMD_DETACH_MCAST,
+ ib_uverbs_detach_mcast,
+ UAPI_DEF_WRITE_I(struct ib_uverbs_detach_mcast),
+ UAPI_DEF_METHOD_NEEDS_FN(detach_mcast)),
+ DECLARE_UVERBS_WRITE(
+ IB_USER_VERBS_CMD_MODIFY_QP,
+ ib_uverbs_modify_qp,
+ UAPI_DEF_WRITE_I(struct ib_uverbs_modify_qp),
+ UAPI_DEF_METHOD_NEEDS_FN(modify_qp)),
+ DECLARE_UVERBS_WRITE(
+ IB_USER_VERBS_CMD_POST_RECV,
+ ib_uverbs_post_recv,
+ UAPI_DEF_WRITE_IO(struct ib_uverbs_post_recv,
+ struct ib_uverbs_post_recv_resp),
+ UAPI_DEF_METHOD_NEEDS_FN(post_recv)),
+ DECLARE_UVERBS_WRITE(
+ IB_USER_VERBS_CMD_POST_SEND,
+ ib_uverbs_post_send,
+ UAPI_DEF_WRITE_IO(struct ib_uverbs_post_send,
+ struct ib_uverbs_post_send_resp),
+ UAPI_DEF_METHOD_NEEDS_FN(post_send)),
+ DECLARE_UVERBS_WRITE(
+ IB_USER_VERBS_CMD_QUERY_QP,
+ ib_uverbs_query_qp,
+ UAPI_DEF_WRITE_IO(struct ib_uverbs_query_qp,
+ struct ib_uverbs_query_qp_resp),
+ UAPI_DEF_METHOD_NEEDS_FN(query_qp)),
+ DECLARE_UVERBS_WRITE_EX(
+ IB_USER_VERBS_EX_CMD_CREATE_QP,
+ ib_uverbs_ex_create_qp,
+ UAPI_DEF_WRITE_IO_EX(struct ib_uverbs_ex_create_qp,
+ comp_mask,
+ struct ib_uverbs_ex_create_qp_resp,
+ response_length),
+ UAPI_DEF_METHOD_NEEDS_FN(create_qp)),
+ DECLARE_UVERBS_WRITE_EX(
+ IB_USER_VERBS_EX_CMD_MODIFY_QP,
+ ib_uverbs_ex_modify_qp,
+ UAPI_DEF_WRITE_IO_EX(struct ib_uverbs_ex_modify_qp,
+ base,
+ struct ib_uverbs_ex_modify_qp_resp,
+ response_length),
+ UAPI_DEF_METHOD_NEEDS_FN(modify_qp))),
+
+ DECLARE_UVERBS_OBJECT(
+ UVERBS_OBJECT_RWQ_IND_TBL,
+ DECLARE_UVERBS_WRITE_EX(
+ IB_USER_VERBS_EX_CMD_CREATE_RWQ_IND_TBL,
+ ib_uverbs_ex_create_rwq_ind_table,
+ UAPI_DEF_WRITE_IO_EX(
+ struct ib_uverbs_ex_create_rwq_ind_table,
+ log_ind_tbl_size,
+ struct ib_uverbs_ex_create_rwq_ind_table_resp,
+ ind_tbl_num),
+ UAPI_DEF_METHOD_NEEDS_FN(create_rwq_ind_table)),
+ DECLARE_UVERBS_WRITE_EX(
+ IB_USER_VERBS_EX_CMD_DESTROY_RWQ_IND_TBL,
+ ib_uverbs_ex_destroy_rwq_ind_table,
+ UAPI_DEF_WRITE_I(
+ struct ib_uverbs_ex_destroy_rwq_ind_table),
+ UAPI_DEF_METHOD_NEEDS_FN(destroy_rwq_ind_table))),
+
+ DECLARE_UVERBS_OBJECT(
+ UVERBS_OBJECT_WQ,
+ DECLARE_UVERBS_WRITE_EX(
+ IB_USER_VERBS_EX_CMD_CREATE_WQ,
+ ib_uverbs_ex_create_wq,
+ UAPI_DEF_WRITE_IO_EX(struct ib_uverbs_ex_create_wq,
+ max_sge,
+ struct ib_uverbs_ex_create_wq_resp,
+ wqn),
+ UAPI_DEF_METHOD_NEEDS_FN(create_wq)),
+ DECLARE_UVERBS_WRITE_EX(
+ IB_USER_VERBS_EX_CMD_DESTROY_WQ,
+ ib_uverbs_ex_destroy_wq,
+ UAPI_DEF_WRITE_IO_EX(struct ib_uverbs_ex_destroy_wq,
+ wq_handle,
+ struct ib_uverbs_ex_destroy_wq_resp,
+ reserved),
+ UAPI_DEF_METHOD_NEEDS_FN(destroy_wq)),
+ DECLARE_UVERBS_WRITE_EX(
+ IB_USER_VERBS_EX_CMD_MODIFY_WQ,
+ ib_uverbs_ex_modify_wq,
+ UAPI_DEF_WRITE_I_EX(struct ib_uverbs_ex_modify_wq,
+ curr_wq_state),
+ UAPI_DEF_METHOD_NEEDS_FN(modify_wq))),
+
+ DECLARE_UVERBS_OBJECT(
+ UVERBS_OBJECT_SRQ,
+ DECLARE_UVERBS_WRITE(IB_USER_VERBS_CMD_CREATE_SRQ,
+ ib_uverbs_create_srq,
+ UAPI_DEF_WRITE_UDATA_IO(
+ struct ib_uverbs_create_srq,
+ struct ib_uverbs_create_srq_resp),
+ UAPI_DEF_METHOD_NEEDS_FN(create_srq)),
+ DECLARE_UVERBS_WRITE(IB_USER_VERBS_CMD_CREATE_XSRQ,
+ ib_uverbs_create_xsrq,
+ UAPI_DEF_WRITE_UDATA_IO(
+ struct ib_uverbs_create_xsrq,
+ struct ib_uverbs_create_srq_resp),
+ UAPI_DEF_METHOD_NEEDS_FN(create_srq)),
+ DECLARE_UVERBS_WRITE(
+ IB_USER_VERBS_CMD_DESTROY_SRQ,
+ ib_uverbs_destroy_srq,
+ UAPI_DEF_WRITE_IO(struct ib_uverbs_destroy_srq,
+ struct ib_uverbs_destroy_srq_resp),
+ UAPI_DEF_METHOD_NEEDS_FN(destroy_srq)),
+ DECLARE_UVERBS_WRITE(
+ IB_USER_VERBS_CMD_MODIFY_SRQ,
+ ib_uverbs_modify_srq,
+ UAPI_DEF_WRITE_UDATA_I(struct ib_uverbs_modify_srq),
+ UAPI_DEF_METHOD_NEEDS_FN(modify_srq)),
+ DECLARE_UVERBS_WRITE(
+ IB_USER_VERBS_CMD_POST_SRQ_RECV,
+ ib_uverbs_post_srq_recv,
+ UAPI_DEF_WRITE_IO(struct ib_uverbs_post_srq_recv,
+ struct ib_uverbs_post_srq_recv_resp),
+ UAPI_DEF_METHOD_NEEDS_FN(post_srq_recv)),
+ DECLARE_UVERBS_WRITE(
+ IB_USER_VERBS_CMD_QUERY_SRQ,
+ ib_uverbs_query_srq,
+ UAPI_DEF_WRITE_IO(struct ib_uverbs_query_srq,
+ struct ib_uverbs_query_srq_resp),
+ UAPI_DEF_METHOD_NEEDS_FN(query_srq))),
+
+ DECLARE_UVERBS_OBJECT(
+ UVERBS_OBJECT_XRCD,
+ DECLARE_UVERBS_WRITE(
+ IB_USER_VERBS_CMD_CLOSE_XRCD,
+ ib_uverbs_close_xrcd,
+ UAPI_DEF_WRITE_I(struct ib_uverbs_close_xrcd),
+ UAPI_DEF_METHOD_NEEDS_FN(dealloc_xrcd)),
+ DECLARE_UVERBS_WRITE(IB_USER_VERBS_CMD_OPEN_QP,
+ ib_uverbs_open_qp,
+ UAPI_DEF_WRITE_UDATA_IO(
+ struct ib_uverbs_open_qp,
+ struct ib_uverbs_create_qp_resp)),
+ DECLARE_UVERBS_WRITE(IB_USER_VERBS_CMD_OPEN_XRCD,
+ ib_uverbs_open_xrcd,
+ UAPI_DEF_WRITE_UDATA_IO(
+ struct ib_uverbs_open_xrcd,
+ struct ib_uverbs_open_xrcd_resp),
+ UAPI_DEF_METHOD_NEEDS_FN(alloc_xrcd))),
+
+ {},
+};
diff --git a/drivers/infiniband/core/uverbs_ioctl.c b/drivers/infiniband/core/uverbs_ioctl.c
index b0e493e8d860..e1379949e663 100644
--- a/drivers/infiniband/core/uverbs_ioctl.c
+++ b/drivers/infiniband/core/uverbs_ioctl.c
@@ -144,6 +144,21 @@ static bool uverbs_is_attr_cleared(const struct ib_uverbs_attr *uattr,
0, uattr->len - len);
}
+/*
+ * Mark an output attribute as written: take the flags recorded for the
+ * attribute's slot in the kernel copy (uattrs), OR in
+ * UVERBS_ATTR_F_VALID_OUTPUT and store the result into the matching slot of
+ * the user-space attribute array (user_attrs).
+ *
+ * Returns 0 on success or -EFAULT if the store to user memory fails.
+ */
+static int uverbs_set_output(const struct uverbs_attr_bundle *bundle,
+			     const struct uverbs_attr *attr)
+{
+	struct bundle_priv *pbundle =
+		container_of(bundle, struct bundle_priv, bundle);
+	unsigned int slot = attr->ptr_attr.uattr_idx;
+	u16 out_flags =
+		pbundle->uattrs[slot].flags | UVERBS_ATTR_F_VALID_OUTPUT;
+
+	return put_user(out_flags, &pbundle->user_attrs[slot].flags) ?
+		       -EFAULT :
+		       0;
+}
+
static int uverbs_process_idrs_array(struct bundle_priv *pbundle,
const struct uverbs_api_attr *attr_uapi,
struct uverbs_objs_arr_attr *attr,
@@ -198,6 +213,7 @@ static int uverbs_process_idrs_array(struct bundle_priv *pbundle,
ret = PTR_ERR(attr->uobjects[i]);
break;
}
+ pbundle->bundle.context = attr->uobjects[i]->context;
}
attr->len = i;
@@ -315,6 +331,7 @@ static int uverbs_process_attr(struct bundle_priv *pbundle,
uattr->data_s64);
if (IS_ERR(o_attr->uobject))
return PTR_ERR(o_attr->uobject);
+ pbundle->bundle.context = o_attr->uobject->context;
__set_bit(attr_bkey, pbundle->uobj_finalize);
if (spec->u.obj.access == UVERBS_ACCESS_NEW) {
@@ -404,8 +421,7 @@ static int uverbs_set_attr(struct bundle_priv *pbundle,
static int ib_uverbs_run_method(struct bundle_priv *pbundle,
unsigned int num_attrs)
{
- int (*handler)(struct ib_uverbs_file *ufile,
- struct uverbs_attr_bundle *ctx);
+ int (*handler)(struct uverbs_attr_bundle *attrs);
size_t uattrs_size = array_size(sizeof(*pbundle->uattrs), num_attrs);
unsigned int destroy_bkey = pbundle->method_elm->destroy_bkey;
unsigned int i;
@@ -436,6 +452,11 @@ static int ib_uverbs_run_method(struct bundle_priv *pbundle,
pbundle->method_elm->key_bitmap_len)))
return -EINVAL;
+ if (pbundle->method_elm->has_udata)
+ uverbs_fill_udata(&pbundle->bundle,
+ &pbundle->bundle.driver_udata,
+ UVERBS_ATTR_UHW_IN, UVERBS_ATTR_UHW_OUT);
+
if (destroy_bkey != UVERBS_API_ATTR_BKEY_LEN) {
struct uverbs_obj_attr *destroy_attr =
&pbundle->bundle.attrs[destroy_bkey].obj_attr;
@@ -445,10 +466,23 @@ static int ib_uverbs_run_method(struct bundle_priv *pbundle,
return ret;
__clear_bit(destroy_bkey, pbundle->uobj_finalize);
- ret = handler(pbundle->bundle.ufile, &pbundle->bundle);
+ ret = handler(&pbundle->bundle);
uobj_put_destroy(destroy_attr->uobject);
} else {
- ret = handler(pbundle->bundle.ufile, &pbundle->bundle);
+ ret = handler(&pbundle->bundle);
+ }
+
+ /*
+ * Until the drivers are revised to use the bundle directly we have to
+ * assume that the driver wrote to its UHW_OUT and flag userspace
+ * appropriately.
+ */
+ if (!ret && pbundle->method_elm->has_udata) {
+ const struct uverbs_attr *attr =
+ uverbs_attr_get(&pbundle->bundle, UVERBS_ATTR_UHW_OUT);
+
+ if (!IS_ERR(attr))
+ ret = uverbs_set_output(&pbundle->bundle, attr);
}
/*
@@ -560,6 +594,7 @@ static int ib_uverbs_cmd_verbs(struct ib_uverbs_file *ufile,
pbundle->method_elm = method_elm;
pbundle->method_key = attrs_iter.index;
pbundle->bundle.ufile = ufile;
+ pbundle->bundle.context = NULL; /* only valid if bundle has uobject */
pbundle->radix = &uapi->radix;
pbundle->radix_slots = slot;
pbundle->radix_slots_len = radix_tree_chunk_size(&attrs_iter);
@@ -662,35 +697,37 @@ int uverbs_get_flags32(u32 *to, const struct uverbs_attr_bundle *attrs_bundle,
EXPORT_SYMBOL(uverbs_get_flags32);
/*
- * This is for ease of conversion. The purpose is to convert all drivers to
- * use uverbs_attr_bundle instead of ib_udata. Assume attr == 0 is input and
- * attr == 1 is output.
+ * Fill an ib_udata struct (core or uhw) using the given attribute IDs.
+ * This is primarily used to convert the UVERBS_ATTR_UHW() into the
+ * ib_udata format used by the drivers.
*/
-void create_udata(struct uverbs_attr_bundle *bundle, struct ib_udata *udata)
+void uverbs_fill_udata(struct uverbs_attr_bundle *bundle,
+ struct ib_udata *udata, unsigned int attr_in,
+ unsigned int attr_out)
{
struct bundle_priv *pbundle =
container_of(bundle, struct bundle_priv, bundle);
- const struct uverbs_attr *uhw_in =
- uverbs_attr_get(bundle, UVERBS_ATTR_UHW_IN);
- const struct uverbs_attr *uhw_out =
- uverbs_attr_get(bundle, UVERBS_ATTR_UHW_OUT);
-
- if (!IS_ERR(uhw_in)) {
- udata->inlen = uhw_in->ptr_attr.len;
- if (uverbs_attr_ptr_is_inline(uhw_in))
+ const struct uverbs_attr *in =
+ uverbs_attr_get(&pbundle->bundle, attr_in);
+ const struct uverbs_attr *out =
+ uverbs_attr_get(&pbundle->bundle, attr_out);
+
+ if (!IS_ERR(in)) {
+ udata->inlen = in->ptr_attr.len;
+ if (uverbs_attr_ptr_is_inline(in))
udata->inbuf =
- &pbundle->user_attrs[uhw_in->ptr_attr.uattr_idx]
+ &pbundle->user_attrs[in->ptr_attr.uattr_idx]
.data;
else
- udata->inbuf = u64_to_user_ptr(uhw_in->ptr_attr.data);
+ udata->inbuf = u64_to_user_ptr(in->ptr_attr.data);
} else {
udata->inbuf = NULL;
udata->inlen = 0;
}
- if (!IS_ERR(uhw_out)) {
- udata->outbuf = u64_to_user_ptr(uhw_out->ptr_attr.data);
- udata->outlen = uhw_out->ptr_attr.len;
+ if (!IS_ERR(out)) {
+ udata->outbuf = u64_to_user_ptr(out->ptr_attr.data);
+ udata->outlen = out->ptr_attr.len;
} else {
udata->outbuf = NULL;
udata->outlen = 0;
@@ -700,10 +737,7 @@ void create_udata(struct uverbs_attr_bundle *bundle, struct ib_udata *udata)
int uverbs_copy_to(const struct uverbs_attr_bundle *bundle, size_t idx,
const void *from, size_t size)
{
- struct bundle_priv *pbundle =
- container_of(bundle, struct bundle_priv, bundle);
const struct uverbs_attr *attr = uverbs_attr_get(bundle, idx);
- u16 flags;
size_t min_size;
if (IS_ERR(attr))
@@ -713,16 +747,25 @@ int uverbs_copy_to(const struct uverbs_attr_bundle *bundle, size_t idx,
if (copy_to_user(u64_to_user_ptr(attr->ptr_attr.data), from, min_size))
return -EFAULT;
- flags = pbundle->uattrs[attr->ptr_attr.uattr_idx].flags |
- UVERBS_ATTR_F_VALID_OUTPUT;
- if (put_user(flags,
- &pbundle->user_attrs[attr->ptr_attr.uattr_idx].flags))
- return -EFAULT;
-
- return 0;
+ return uverbs_set_output(bundle, attr);
}
EXPORT_SYMBOL(uverbs_copy_to);
+
+/*
+ * This is only used if the caller has directly used copy_to_user to write the
+ * data. It signals to user space that the buffer is filled in.
+ */
+int uverbs_output_written(const struct uverbs_attr_bundle *bundle, size_t idx)
+{
+	const struct uverbs_attr *out_attr = uverbs_attr_get(bundle, idx);
+
+	/* A failed lookup is reported as-is; otherwise flag the output valid. */
+	return IS_ERR(out_attr) ? PTR_ERR(out_attr) :
+				  uverbs_set_output(bundle, out_attr);
+}
+
int _uverbs_get_const(s64 *to, const struct uverbs_attr_bundle *attrs_bundle,
size_t idx, s64 lower_bound, u64 upper_bound,
s64 *def_val)
@@ -745,3 +788,16 @@ int _uverbs_get_const(s64 *to, const struct uverbs_attr_bundle *attrs_bundle,
return 0;
}
EXPORT_SYMBOL(_uverbs_get_const);
+
+/*
+ * Copy a kernel struct of @size bytes to the output attribute @idx. If the
+ * user buffer is longer than @size, the bytes past @size are cleared first so
+ * the part of the buffer the kernel does not fill reads as zero.
+ *
+ * Returns the uverbs_attr_get() error if the attribute cannot be looked up,
+ * -EFAULT if the user buffer cannot be cleared, and otherwise whatever
+ * uverbs_copy_to() returns.
+ */
+int uverbs_copy_to_struct_or_zero(const struct uverbs_attr_bundle *bundle,
+				  size_t idx, const void *from, size_t size)
+{
+	const struct uverbs_attr *attr = uverbs_attr_get(bundle, idx);
+
+	/* attr may be an ERR_PTR; it must not be dereferenced below. */
+	if (IS_ERR(attr))
+		return PTR_ERR(attr);
+
+	if (size < attr->ptr_attr.len) {
+		if (clear_user(u64_to_user_ptr(attr->ptr_attr.data) + size,
+			       attr->ptr_attr.len - size))
+			return -EFAULT;
+	}
+	return uverbs_copy_to(bundle, idx, from, size);
+}
diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c
index 6d373f5515b7..c489f545baae 100644
--- a/drivers/infiniband/core/uverbs_main.c
+++ b/drivers/infiniband/core/uverbs_main.c
@@ -74,64 +74,6 @@ static dev_t dynamic_uverbs_dev;
static struct class *uverbs_class;
static DEFINE_IDA(uverbs_ida);
-
-static ssize_t (*uverbs_cmd_table[])(struct ib_uverbs_file *file,
- const char __user *buf, int in_len,
- int out_len) = {
- [IB_USER_VERBS_CMD_GET_CONTEXT] = ib_uverbs_get_context,
- [IB_USER_VERBS_CMD_QUERY_DEVICE] = ib_uverbs_query_device,
- [IB_USER_VERBS_CMD_QUERY_PORT] = ib_uverbs_query_port,
- [IB_USER_VERBS_CMD_ALLOC_PD] = ib_uverbs_alloc_pd,
- [IB_USER_VERBS_CMD_DEALLOC_PD] = ib_uverbs_dealloc_pd,
- [IB_USER_VERBS_CMD_REG_MR] = ib_uverbs_reg_mr,
- [IB_USER_VERBS_CMD_REREG_MR] = ib_uverbs_rereg_mr,
- [IB_USER_VERBS_CMD_DEREG_MR] = ib_uverbs_dereg_mr,
- [IB_USER_VERBS_CMD_ALLOC_MW] = ib_uverbs_alloc_mw,
- [IB_USER_VERBS_CMD_DEALLOC_MW] = ib_uverbs_dealloc_mw,
- [IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL] = ib_uverbs_create_comp_channel,
- [IB_USER_VERBS_CMD_CREATE_CQ] = ib_uverbs_create_cq,
- [IB_USER_VERBS_CMD_RESIZE_CQ] = ib_uverbs_resize_cq,
- [IB_USER_VERBS_CMD_POLL_CQ] = ib_uverbs_poll_cq,
- [IB_USER_VERBS_CMD_REQ_NOTIFY_CQ] = ib_uverbs_req_notify_cq,
- [IB_USER_VERBS_CMD_DESTROY_CQ] = ib_uverbs_destroy_cq,
- [IB_USER_VERBS_CMD_CREATE_QP] = ib_uverbs_create_qp,
- [IB_USER_VERBS_CMD_QUERY_QP] = ib_uverbs_query_qp,
- [IB_USER_VERBS_CMD_MODIFY_QP] = ib_uverbs_modify_qp,
- [IB_USER_VERBS_CMD_DESTROY_QP] = ib_uverbs_destroy_qp,
- [IB_USER_VERBS_CMD_POST_SEND] = ib_uverbs_post_send,
- [IB_USER_VERBS_CMD_POST_RECV] = ib_uverbs_post_recv,
- [IB_USER_VERBS_CMD_POST_SRQ_RECV] = ib_uverbs_post_srq_recv,
- [IB_USER_VERBS_CMD_CREATE_AH] = ib_uverbs_create_ah,
- [IB_USER_VERBS_CMD_DESTROY_AH] = ib_uverbs_destroy_ah,
- [IB_USER_VERBS_CMD_ATTACH_MCAST] = ib_uverbs_attach_mcast,
- [IB_USER_VERBS_CMD_DETACH_MCAST] = ib_uverbs_detach_mcast,
- [IB_USER_VERBS_CMD_CREATE_SRQ] = ib_uverbs_create_srq,
- [IB_USER_VERBS_CMD_MODIFY_SRQ] = ib_uverbs_modify_srq,
- [IB_USER_VERBS_CMD_QUERY_SRQ] = ib_uverbs_query_srq,
- [IB_USER_VERBS_CMD_DESTROY_SRQ] = ib_uverbs_destroy_srq,
- [IB_USER_VERBS_CMD_OPEN_XRCD] = ib_uverbs_open_xrcd,
- [IB_USER_VERBS_CMD_CLOSE_XRCD] = ib_uverbs_close_xrcd,
- [IB_USER_VERBS_CMD_CREATE_XSRQ] = ib_uverbs_create_xsrq,
- [IB_USER_VERBS_CMD_OPEN_QP] = ib_uverbs_open_qp,
-};
-
-static int (*uverbs_ex_cmd_table[])(struct ib_uverbs_file *file,
- struct ib_udata *ucore,
- struct ib_udata *uhw) = {
- [IB_USER_VERBS_EX_CMD_CREATE_FLOW] = ib_uverbs_ex_create_flow,
- [IB_USER_VERBS_EX_CMD_DESTROY_FLOW] = ib_uverbs_ex_destroy_flow,
- [IB_USER_VERBS_EX_CMD_QUERY_DEVICE] = ib_uverbs_ex_query_device,
- [IB_USER_VERBS_EX_CMD_CREATE_CQ] = ib_uverbs_ex_create_cq,
- [IB_USER_VERBS_EX_CMD_CREATE_QP] = ib_uverbs_ex_create_qp,
- [IB_USER_VERBS_EX_CMD_CREATE_WQ] = ib_uverbs_ex_create_wq,
- [IB_USER_VERBS_EX_CMD_MODIFY_WQ] = ib_uverbs_ex_modify_wq,
- [IB_USER_VERBS_EX_CMD_DESTROY_WQ] = ib_uverbs_ex_destroy_wq,
- [IB_USER_VERBS_EX_CMD_CREATE_RWQ_IND_TBL] = ib_uverbs_ex_create_rwq_ind_table,
- [IB_USER_VERBS_EX_CMD_DESTROY_RWQ_IND_TBL] = ib_uverbs_ex_destroy_rwq_ind_table,
- [IB_USER_VERBS_EX_CMD_MODIFY_QP] = ib_uverbs_ex_modify_qp,
- [IB_USER_VERBS_EX_CMD_MODIFY_CQ] = ib_uverbs_ex_modify_cq,
-};
-
static void ib_uverbs_add_one(struct ib_device *device);
static void ib_uverbs_remove_one(struct ib_device *device, void *client_data);
@@ -139,7 +81,7 @@ static void ib_uverbs_remove_one(struct ib_device *device, void *client_data);
* Must be called with the ufile->device->disassociate_srcu held, and the lock
* must be held until use of the ucontext is finished.
*/
-struct ib_ucontext *ib_uverbs_get_ucontext(struct ib_uverbs_file *ufile)
+struct ib_ucontext *ib_uverbs_get_ucontext_file(struct ib_uverbs_file *ufile)
{
/*
* We do not hold the hw_destroy_rwsem lock for this flow, instead
@@ -157,14 +99,14 @@ struct ib_ucontext *ib_uverbs_get_ucontext(struct ib_uverbs_file *ufile)
return ucontext;
}
-EXPORT_SYMBOL(ib_uverbs_get_ucontext);
+EXPORT_SYMBOL(ib_uverbs_get_ucontext_file);
int uverbs_dealloc_mw(struct ib_mw *mw)
{
struct ib_pd *pd = mw->pd;
int ret;
- ret = mw->device->dealloc_mw(mw);
+ ret = mw->device->ops.dealloc_mw(mw);
if (!ret)
atomic_dec(&pd->usecnt);
return ret;
@@ -255,14 +197,20 @@ void ib_uverbs_release_file(struct kref *ref)
srcu_key = srcu_read_lock(&file->device->disassociate_srcu);
ib_dev = srcu_dereference(file->device->ib_dev,
&file->device->disassociate_srcu);
- if (ib_dev && !ib_dev->disassociate_ucontext)
+ if (ib_dev && !ib_dev->ops.disassociate_ucontext)
module_put(ib_dev->owner);
srcu_read_unlock(&file->device->disassociate_srcu, srcu_key);
if (atomic_dec_and_test(&file->device->refcount))
ib_uverbs_comp_dev(file->device);
+ if (file->async_file)
+ kref_put(&file->async_file->ref,
+ ib_uverbs_release_async_event_file);
put_device(&file->device->dev);
+
+ if (file->disassociate_page)
+ __free_pages(file->disassociate_page, 0);
kfree(file);
}
@@ -646,51 +594,19 @@ err_put_refs:
return filp;
}
-static bool verify_command_mask(struct ib_uverbs_file *ufile, u32 command,
- bool extended)
-{
- if (!extended)
- return ufile->uverbs_cmd_mask & BIT_ULL(command);
-
- return ufile->uverbs_ex_cmd_mask & BIT_ULL(command);
-}
-
-static bool verify_command_idx(u32 command, bool extended)
-{
- if (extended)
- return command < ARRAY_SIZE(uverbs_ex_cmd_table) &&
- uverbs_ex_cmd_table[command];
-
- return command < ARRAY_SIZE(uverbs_cmd_table) &&
- uverbs_cmd_table[command];
-}
-
-static ssize_t process_hdr(struct ib_uverbs_cmd_hdr *hdr,
- u32 *command, bool *extended)
-{
- if (hdr->command & ~(u32)(IB_USER_VERBS_CMD_FLAG_EXTENDED |
- IB_USER_VERBS_CMD_COMMAND_MASK))
- return -EINVAL;
-
- *command = hdr->command & IB_USER_VERBS_CMD_COMMAND_MASK;
- *extended = hdr->command & IB_USER_VERBS_CMD_FLAG_EXTENDED;
-
- if (!verify_command_idx(*command, *extended))
- return -EOPNOTSUPP;
-
- return 0;
-}
-
static ssize_t verify_hdr(struct ib_uverbs_cmd_hdr *hdr,
- struct ib_uverbs_ex_cmd_hdr *ex_hdr,
- size_t count, bool extended)
+ struct ib_uverbs_ex_cmd_hdr *ex_hdr, size_t count,
+ const struct uverbs_api_write_method *method_elm)
{
- if (extended) {
+ if (method_elm->is_ex) {
count -= sizeof(*hdr) + sizeof(*ex_hdr);
if ((hdr->in_words + ex_hdr->provider_in_words) * 8 != count)
return -EINVAL;
+ if (hdr->in_words * 8 < method_elm->req_size)
+ return -ENOSPC;
+
if (ex_hdr->cmd_hdr_reserved)
return -EINVAL;
@@ -698,8 +614,10 @@ static ssize_t verify_hdr(struct ib_uverbs_cmd_hdr *hdr,
if (!hdr->out_words && !ex_hdr->provider_out_words)
return -EINVAL;
- if (!access_ok(VERIFY_WRITE,
- u64_to_user_ptr(ex_hdr->response),
+ if (hdr->out_words * 8 < method_elm->resp_size)
+ return -ENOSPC;
+
+ if (!access_ok(u64_to_user_ptr(ex_hdr->response),
(hdr->out_words + ex_hdr->provider_out_words) * 8))
return -EFAULT;
} else {
@@ -714,6 +632,24 @@ static ssize_t verify_hdr(struct ib_uverbs_cmd_hdr *hdr,
if (hdr->in_words * 4 != count)
return -EINVAL;
+ /*
+ * hdr is a pointer in this function, so the header length has to come
+ * from sizeof(*hdr). sizeof(hdr) is the size of a pointer, which is not
+ * guaranteed to equal the size of the header struct.
+ */
+ if (count < method_elm->req_size + sizeof(*hdr)) {
+ /*
+ * rdma-core v18 and v19 have a bug where they send DESTROY_CQ
+ * with a 16 byte write instead of 24. Old kernels didn't
+ * check the size so they allowed this. Now that the size is
+ * checked provide a compatibility work around to not break
+ * those userspaces.
+ */
+ if (hdr->command == IB_USER_VERBS_CMD_DESTROY_CQ &&
+ count == 16) {
+ hdr->in_words = 6;
+ return 0;
+ }
+ return -ENOSPC;
+ }
+ /* The response buffer must be able to hold the fixed response struct. */
+ if (hdr->out_words * 4 < method_elm->resp_size)
+ return -ENOSPC;
+
+
return 0;
}
@@ -721,11 +657,12 @@ static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf,
size_t count, loff_t *pos)
{
struct ib_uverbs_file *file = filp->private_data;
+ const struct uverbs_api_write_method *method_elm;
+ struct uverbs_api *uapi = file->device->uapi;
struct ib_uverbs_ex_cmd_hdr ex_hdr;
struct ib_uverbs_cmd_hdr hdr;
- bool extended;
+ struct uverbs_attr_bundle bundle;
int srcu_key;
- u32 command;
ssize_t ret;
if (!ib_safe_file_access(filp)) {
@@ -740,57 +677,94 @@ static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf,
if (copy_from_user(&hdr, buf, sizeof(hdr)))
return -EFAULT;
- ret = process_hdr(&hdr, &command, &extended);
- if (ret)
- return ret;
+ method_elm = uapi_get_method(uapi, hdr.command);
+ if (IS_ERR(method_elm))
+ return PTR_ERR(method_elm);
- if (extended) {
+ if (method_elm->is_ex) {
if (count < (sizeof(hdr) + sizeof(ex_hdr)))
return -EINVAL;
if (copy_from_user(&ex_hdr, buf + sizeof(hdr), sizeof(ex_hdr)))
return -EFAULT;
}
- ret = verify_hdr(&hdr, &ex_hdr, count, extended);
+ ret = verify_hdr(&hdr, &ex_hdr, count, method_elm);
if (ret)
return ret;
srcu_key = srcu_read_lock(&file->device->disassociate_srcu);
- if (!verify_command_mask(file, command, extended)) {
- ret = -EOPNOTSUPP;
- goto out;
- }
-
buf += sizeof(hdr);
- if (!extended) {
- ret = uverbs_cmd_table[command](file, buf,
- hdr.in_words * 4,
- hdr.out_words * 4);
- } else {
- struct ib_udata ucore;
- struct ib_udata uhw;
+ memset(bundle.attr_present, 0, sizeof(bundle.attr_present));
+ bundle.ufile = file;
+ bundle.context = NULL; /* only valid if bundle has uobject */
+ if (!method_elm->is_ex) {
+ size_t in_len = hdr.in_words * 4 - sizeof(hdr);
+ size_t out_len = hdr.out_words * 4;
+ u64 response = 0;
+
+ if (method_elm->has_udata) {
+ bundle.driver_udata.inlen =
+ in_len - method_elm->req_size;
+ in_len = method_elm->req_size;
+ if (bundle.driver_udata.inlen)
+ bundle.driver_udata.inbuf = buf + in_len;
+ else
+ bundle.driver_udata.inbuf = NULL;
+ } else {
+ memset(&bundle.driver_udata, 0,
+ sizeof(bundle.driver_udata));
+ }
+
+ if (method_elm->has_resp) {
+ /*
+ * The macros check that if has_resp is set
+ * then the command request structure starts
+ * with a '__aligned u64 response' member.
+ */
+ ret = get_user(response, (const u64 *)buf);
+ if (ret)
+ goto out_unlock;
+
+ if (method_elm->has_udata) {
+ bundle.driver_udata.outlen =
+ out_len - method_elm->resp_size;
+ out_len = method_elm->resp_size;
+ if (bundle.driver_udata.outlen)
+ bundle.driver_udata.outbuf =
+ u64_to_user_ptr(response +
+ out_len);
+ else
+ bundle.driver_udata.outbuf = NULL;
+ }
+ } else {
+ bundle.driver_udata.outlen = 0;
+ bundle.driver_udata.outbuf = NULL;
+ }
+ ib_uverbs_init_udata_buf_or_null(
+ &bundle.ucore, buf, u64_to_user_ptr(response),
+ in_len, out_len);
+ } else {
buf += sizeof(ex_hdr);
- ib_uverbs_init_udata_buf_or_null(&ucore, buf,
+ ib_uverbs_init_udata_buf_or_null(&bundle.ucore, buf,
u64_to_user_ptr(ex_hdr.response),
hdr.in_words * 8, hdr.out_words * 8);
- ib_uverbs_init_udata_buf_or_null(&uhw,
- buf + ucore.inlen,
- u64_to_user_ptr(ex_hdr.response) + ucore.outlen,
- ex_hdr.provider_in_words * 8,
- ex_hdr.provider_out_words * 8);
+ ib_uverbs_init_udata_buf_or_null(
+ &bundle.driver_udata, buf + bundle.ucore.inlen,
+ u64_to_user_ptr(ex_hdr.response) + bundle.ucore.outlen,
+ ex_hdr.provider_in_words * 8,
+ ex_hdr.provider_out_words * 8);
- ret = uverbs_ex_cmd_table[command](file, &ucore, &uhw);
- ret = (ret) ? : count;
}
-out:
+ ret = method_elm->handler(&bundle);
+out_unlock:
srcu_read_unlock(&file->device->disassociate_srcu, srcu_key);
- return ret;
+ return (ret) ? : count;
}
static int ib_uverbs_mmap(struct file *filp, struct vm_area_struct *vma)
@@ -801,13 +775,13 @@ static int ib_uverbs_mmap(struct file *filp, struct vm_area_struct *vma)
int srcu_key;
srcu_key = srcu_read_lock(&file->device->disassociate_srcu);
- ucontext = ib_uverbs_get_ucontext(file);
+ ucontext = ib_uverbs_get_ucontext_file(file);
if (IS_ERR(ucontext)) {
ret = PTR_ERR(ucontext);
goto out;
}
- ret = ucontext->device->mmap(ucontext, vma);
+ ret = ucontext->device->ops.mmap(ucontext, vma);
out:
srcu_read_unlock(&file->device->disassociate_srcu, srcu_key);
return ret;
@@ -906,9 +880,50 @@ static void rdma_umap_close(struct vm_area_struct *vma)
kfree(priv);
}
+/*
+ * Once the zap_vma_ptes has been called touches to the VMA will come here and
+ * we return a dummy writable zero page for all the pfns.
+ *
+ * Returns VM_FAULT_SIGBUS when the VMA carries no rdma_umap_priv or when the
+ * replacement page cannot be allocated. Otherwise returns 0 with vmf->page
+ * set and a reference taken on it with get_page().
+ */
+static vm_fault_t rdma_umap_fault(struct vm_fault *vmf)
+{
+ struct ib_uverbs_file *ufile = vmf->vma->vm_file->private_data;
+ struct rdma_umap_priv *priv = vmf->vma->vm_private_data;
+ vm_fault_t ret = 0;
+
+ if (!priv)
+ return VM_FAULT_SIGBUS;
+
+ /* Read only pages can just use the system zero page. */
+ if (!(vmf->vma->vm_flags & (VM_WRITE | VM_MAYWRITE))) {
+ vmf->page = ZERO_PAGE(vmf->address);
+ get_page(vmf->page);
+ return 0;
+ }
+
+ mutex_lock(&ufile->umap_lock); /* held across the allocate-on-first-fault check below */
+ if (!ufile->disassociate_page)
+ ufile->disassociate_page =
+ alloc_pages(vmf->gfp_mask | __GFP_ZERO, 0); /* one zeroed order-0 page, stored in ufile and reused by later faults */
+
+ if (ufile->disassociate_page) {
+ /*
+ * This VMA is forced to always be shared so this doesn't have
+ * to worry about COW.
+ */
+ vmf->page = ufile->disassociate_page;
+ get_page(vmf->page);
+ } else {
+ ret = VM_FAULT_SIGBUS;
+ }
+ mutex_unlock(&ufile->umap_lock);
+
+ return ret;
+}
+
static const struct vm_operations_struct rdma_umap_ops = {
.open = rdma_umap_open,
.close = rdma_umap_close,
+ .fault = rdma_umap_fault,
};
static struct rdma_umap_priv *rdma_user_mmap_pre(struct ib_ucontext *ucontext,
@@ -918,6 +933,9 @@ static struct rdma_umap_priv *rdma_user_mmap_pre(struct ib_ucontext *ucontext,
struct ib_uverbs_file *ufile = ucontext->ufile;
struct rdma_umap_priv *priv;
+ if (!(vma->vm_flags & VM_SHARED))
+ return ERR_PTR(-EINVAL);
+
if (vma->vm_end - vma->vm_start != size)
return ERR_PTR(-EINVAL);
@@ -997,11 +1015,19 @@ void uverbs_user_mmap_disassociate(struct ib_uverbs_file *ufile)
/* Get an arbitrary mm pointer that hasn't been cleaned yet */
mutex_lock(&ufile->umap_lock);
- if (!list_empty(&ufile->umaps)) {
- mm = list_first_entry(&ufile->umaps,
- struct rdma_umap_priv, list)
- ->vma->vm_mm;
- mmget(mm);
+ while (!list_empty(&ufile->umaps)) {
+ int ret;
+
+ priv = list_first_entry(&ufile->umaps,
+ struct rdma_umap_priv, list);
+ mm = priv->vma->vm_mm;
+ ret = mmget_not_zero(mm);
+ if (!ret) {
+ list_del_init(&priv->list);
+ mm = NULL;
+ continue;
+ }
+ break;
}
mutex_unlock(&ufile->umap_lock);
if (!mm)
@@ -1013,7 +1039,9 @@ void uverbs_user_mmap_disassociate(struct ib_uverbs_file *ufile)
* at a time to get the lock ordering right. Typically there
* will only be one mm, so no big deal.
*/
- down_write(&mm->mmap_sem);
+ down_read(&mm->mmap_sem);
+ if (!mmget_still_valid(mm))
+ goto skip_mm;
mutex_lock(&ufile->umap_lock);
list_for_each_entry_safe (priv, next_priv, &ufile->umaps,
list) {
@@ -1025,10 +1053,10 @@ void uverbs_user_mmap_disassociate(struct ib_uverbs_file *ufile)
zap_vma_ptes(vma, vma->vm_start,
vma->vm_end - vma->vm_start);
- vma->vm_flags &= ~(VM_SHARED | VM_MAYSHARE);
}
mutex_unlock(&ufile->umap_lock);
- up_write(&mm->mmap_sem);
+ skip_mm:
+ up_read(&mm->mmap_sem);
mmput(mm);
}
}
@@ -1069,7 +1097,7 @@ static int ib_uverbs_open(struct inode *inode, struct file *filp)
/* In case IB device supports disassociate ucontext, there is no hard
* dependency between uverbs device and its low level device.
*/
- module_dependent = !(ib_dev->disassociate_ucontext);
+ module_dependent = !(ib_dev->ops.disassociate_ucontext);
if (module_dependent) {
if (!try_module_get(ib_dev->owner)) {
@@ -1102,9 +1130,6 @@ static int ib_uverbs_open(struct inode *inode, struct file *filp)
mutex_unlock(&dev->lists_mutex);
srcu_read_unlock(&dev->disassociate_srcu, srcu_key);
- file->uverbs_cmd_mask = ib_dev->uverbs_cmd_mask;
- file->uverbs_ex_cmd_mask = ib_dev->uverbs_ex_cmd_mask;
-
setup_ufile_idr_uobject(file);
return nonseekable_open(inode, filp);
@@ -1132,10 +1157,6 @@ static int ib_uverbs_close(struct inode *inode, struct file *filp)
list_del_init(&file->list);
mutex_unlock(&file->device->lists_mutex);
- if (file->async_file)
- kref_put(&file->async_file->ref,
- ib_uverbs_release_async_event_file);
-
kref_put(&file->ref, ib_uverbs_release_file);
return 0;
@@ -1164,6 +1185,7 @@ static const struct file_operations uverbs_mmap_fops = {
static struct ib_client uverbs_client = {
.name = "uverbs",
+ .no_kverbs_req = true,
.add = ib_uverbs_add_one,
.remove = ib_uverbs_remove_one
};
@@ -1224,7 +1246,7 @@ static int ib_uverbs_create_uapi(struct ib_device *device,
{
struct uverbs_api *uapi;
- uapi = uverbs_alloc_api(device->driver_specs, device->driver_id);
+ uapi = uverbs_alloc_api(device);
if (IS_ERR(uapi))
return PTR_ERR(uapi);
@@ -1239,7 +1261,7 @@ static void ib_uverbs_add_one(struct ib_device *device)
struct ib_uverbs_device *uverbs_dev;
int ret;
- if (!device->alloc_ucontext)
+ if (!device->ops.alloc_ucontext)
return;
uverbs_dev = kzalloc(sizeof(*uverbs_dev), GFP_KERNEL);
@@ -1285,7 +1307,7 @@ static void ib_uverbs_add_one(struct ib_device *device)
dev_set_name(&uverbs_dev->dev, "uverbs%d", uverbs_dev->devnum);
cdev_init(&uverbs_dev->cdev,
- device->mmap ? &uverbs_mmap_fops : &uverbs_fops);
+ device->ops.mmap ? &uverbs_mmap_fops : &uverbs_fops);
uverbs_dev->cdev.owner = THIS_MODULE;
ret = cdev_device_add(&uverbs_dev->cdev, &uverbs_dev->dev);
@@ -1373,7 +1395,7 @@ static void ib_uverbs_remove_one(struct ib_device *device, void *client_data)
cdev_device_del(&uverbs_dev->cdev, &uverbs_dev->dev);
ida_free(&uverbs_ida, uverbs_dev->devnum);
- if (device->disassociate_ucontext) {
+ if (device->ops.disassociate_ucontext) {
/* We disassociate HW resources and immediately return.
* Userspace will see a EIO errno for all future access.
* Upon returning, ib_device may be freed internally and is not
diff --git a/drivers/infiniband/core/uverbs_std_types.c b/drivers/infiniband/core/uverbs_std_types.c
index 203cc96ac6f5..f224cb727224 100644
--- a/drivers/infiniband/core/uverbs_std_types.c
+++ b/drivers/infiniband/core/uverbs_std_types.c
@@ -42,7 +42,8 @@
static int uverbs_free_ah(struct ib_uobject *uobject,
enum rdma_remove_reason why)
{
- return rdma_destroy_ah((struct ib_ah *)uobject->object);
+ return rdma_destroy_ah((struct ib_ah *)uobject->object,
+ RDMA_DESTROY_AH_SLEEPABLE);
}
static int uverbs_free_flow(struct ib_uobject *uobject,
@@ -54,7 +55,7 @@ static int uverbs_free_flow(struct ib_uobject *uobject,
struct ib_qp *qp = flow->qp;
int ret;
- ret = flow->device->destroy_flow(flow);
+ ret = flow->device->ops.destroy_flow(flow);
if (!ret) {
if (qp)
atomic_dec(&qp->usecnt);
@@ -187,7 +188,7 @@ static int uverbs_free_pd(struct ib_uobject *uobject,
if (ret)
return ret;
- ib_dealloc_pd((struct ib_pd *)uobject->object);
+ ib_dealloc_pd(pd);
return 0;
}
@@ -210,8 +211,7 @@ static int uverbs_hot_unplug_completion_event_file(struct ib_uobject *uobj,
return 0;
};
-int uverbs_destroy_def_handler(struct ib_uverbs_file *file,
- struct uverbs_attr_bundle *attrs)
+int uverbs_destroy_def_handler(struct uverbs_attr_bundle *attrs)
{
return 0;
}
@@ -229,58 +229,106 @@ DECLARE_UVERBS_NAMED_OBJECT(
UVERBS_OBJECT_QP,
UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_uqp_object), uverbs_free_qp));
+DECLARE_UVERBS_NAMED_METHOD_DESTROY(
+ UVERBS_METHOD_MW_DESTROY,
+ UVERBS_ATTR_IDR(UVERBS_ATTR_DESTROY_MW_HANDLE,
+ UVERBS_OBJECT_MW,
+ UVERBS_ACCESS_DESTROY,
+ UA_MANDATORY));
+
DECLARE_UVERBS_NAMED_OBJECT(UVERBS_OBJECT_MW,
- UVERBS_TYPE_ALLOC_IDR(uverbs_free_mw));
+ UVERBS_TYPE_ALLOC_IDR(uverbs_free_mw),
+ &UVERBS_METHOD(UVERBS_METHOD_MW_DESTROY));
DECLARE_UVERBS_NAMED_OBJECT(
UVERBS_OBJECT_SRQ,
UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_usrq_object),
uverbs_free_srq));
+DECLARE_UVERBS_NAMED_METHOD_DESTROY(
+ UVERBS_METHOD_AH_DESTROY,
+ UVERBS_ATTR_IDR(UVERBS_ATTR_DESTROY_AH_HANDLE,
+ UVERBS_OBJECT_AH,
+ UVERBS_ACCESS_DESTROY,
+ UA_MANDATORY));
+
DECLARE_UVERBS_NAMED_OBJECT(UVERBS_OBJECT_AH,
- UVERBS_TYPE_ALLOC_IDR(uverbs_free_ah));
+ UVERBS_TYPE_ALLOC_IDR(uverbs_free_ah),
+ &UVERBS_METHOD(UVERBS_METHOD_AH_DESTROY));
+
+DECLARE_UVERBS_NAMED_METHOD_DESTROY(
+ UVERBS_METHOD_FLOW_DESTROY,
+ UVERBS_ATTR_IDR(UVERBS_ATTR_DESTROY_FLOW_HANDLE,
+ UVERBS_OBJECT_FLOW,
+ UVERBS_ACCESS_DESTROY,
+ UA_MANDATORY));
DECLARE_UVERBS_NAMED_OBJECT(
UVERBS_OBJECT_FLOW,
UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_uflow_object),
- uverbs_free_flow));
+ uverbs_free_flow),
+ &UVERBS_METHOD(UVERBS_METHOD_FLOW_DESTROY));
DECLARE_UVERBS_NAMED_OBJECT(
UVERBS_OBJECT_WQ,
UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_uwq_object), uverbs_free_wq));
+DECLARE_UVERBS_NAMED_METHOD_DESTROY(
+ UVERBS_METHOD_RWQ_IND_TBL_DESTROY,
+ UVERBS_ATTR_IDR(UVERBS_ATTR_DESTROY_RWQ_IND_TBL_HANDLE,
+ UVERBS_OBJECT_RWQ_IND_TBL,
+ UVERBS_ACCESS_DESTROY,
+ UA_MANDATORY));
+
DECLARE_UVERBS_NAMED_OBJECT(UVERBS_OBJECT_RWQ_IND_TBL,
- UVERBS_TYPE_ALLOC_IDR(uverbs_free_rwq_ind_tbl));
+ UVERBS_TYPE_ALLOC_IDR(uverbs_free_rwq_ind_tbl),
+ &UVERBS_METHOD(UVERBS_METHOD_RWQ_IND_TBL_DESTROY));
+
+DECLARE_UVERBS_NAMED_METHOD_DESTROY(
+ UVERBS_METHOD_XRCD_DESTROY,
+ UVERBS_ATTR_IDR(UVERBS_ATTR_DESTROY_XRCD_HANDLE,
+ UVERBS_OBJECT_XRCD,
+ UVERBS_ACCESS_DESTROY,
+ UA_MANDATORY));
DECLARE_UVERBS_NAMED_OBJECT(
UVERBS_OBJECT_XRCD,
UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_uxrcd_object),
- uverbs_free_xrcd));
+ uverbs_free_xrcd),
+ &UVERBS_METHOD(UVERBS_METHOD_XRCD_DESTROY));
+
+DECLARE_UVERBS_NAMED_METHOD_DESTROY(
+ UVERBS_METHOD_PD_DESTROY,
+ UVERBS_ATTR_IDR(UVERBS_ATTR_DESTROY_PD_HANDLE,
+ UVERBS_OBJECT_PD,
+ UVERBS_ACCESS_DESTROY,
+ UA_MANDATORY));
DECLARE_UVERBS_NAMED_OBJECT(UVERBS_OBJECT_PD,
- UVERBS_TYPE_ALLOC_IDR(uverbs_free_pd));
-
-DECLARE_UVERBS_GLOBAL_METHODS(UVERBS_OBJECT_DEVICE);
-
-DECLARE_UVERBS_OBJECT_TREE(uverbs_default_objects,
- &UVERBS_OBJECT(UVERBS_OBJECT_DEVICE),
- &UVERBS_OBJECT(UVERBS_OBJECT_PD),
- &UVERBS_OBJECT(UVERBS_OBJECT_MR),
- &UVERBS_OBJECT(UVERBS_OBJECT_COMP_CHANNEL),
- &UVERBS_OBJECT(UVERBS_OBJECT_CQ),
- &UVERBS_OBJECT(UVERBS_OBJECT_QP),
- &UVERBS_OBJECT(UVERBS_OBJECT_AH),
- &UVERBS_OBJECT(UVERBS_OBJECT_MW),
- &UVERBS_OBJECT(UVERBS_OBJECT_SRQ),
- &UVERBS_OBJECT(UVERBS_OBJECT_FLOW),
- &UVERBS_OBJECT(UVERBS_OBJECT_WQ),
- &UVERBS_OBJECT(UVERBS_OBJECT_RWQ_IND_TBL),
- &UVERBS_OBJECT(UVERBS_OBJECT_XRCD),
- &UVERBS_OBJECT(UVERBS_OBJECT_FLOW_ACTION),
- &UVERBS_OBJECT(UVERBS_OBJECT_DM),
- &UVERBS_OBJECT(UVERBS_OBJECT_COUNTERS));
-
-const struct uverbs_object_tree_def *uverbs_default_get_objects(void)
-{
- return &uverbs_default_objects;
-}
+ UVERBS_TYPE_ALLOC_IDR(uverbs_free_pd),
+ &UVERBS_METHOD(UVERBS_METHOD_PD_DESTROY));
+
+const struct uapi_definition uverbs_def_obj_intf[] = { /*
+ * Chained object-tree entries, each naming one device op through
+ * UAPI_DEF_OBJ_NEEDS_FN(). NOTE(review): presumably an object is only
+ * exposed when the driver provides the named op - confirm against the
+ * macro definition.
+ */
+ UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_PD,
+ UAPI_DEF_OBJ_NEEDS_FN(dealloc_pd)),
+ UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_COMP_CHANNEL,
+ UAPI_DEF_OBJ_NEEDS_FN(dealloc_pd)), /* NOTE(review): same op as the UVERBS_OBJECT_PD entry; confirm this is the intended requirement for completion channels */
+ UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_QP,
+ UAPI_DEF_OBJ_NEEDS_FN(destroy_qp)),
+ UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_AH,
+ UAPI_DEF_OBJ_NEEDS_FN(destroy_ah)),
+ UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_MW,
+ UAPI_DEF_OBJ_NEEDS_FN(dealloc_mw)),
+ UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_SRQ,
+ UAPI_DEF_OBJ_NEEDS_FN(destroy_srq)),
+ UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_FLOW,
+ UAPI_DEF_OBJ_NEEDS_FN(destroy_flow)),
+ UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_WQ,
+ UAPI_DEF_OBJ_NEEDS_FN(destroy_wq)),
+ UAPI_DEF_CHAIN_OBJ_TREE_NAMED(
+ UVERBS_OBJECT_RWQ_IND_TBL,
+ UAPI_DEF_OBJ_NEEDS_FN(destroy_rwq_ind_table)),
+ UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_XRCD,
+ UAPI_DEF_OBJ_NEEDS_FN(dealloc_xrcd)),
+ {} /* empty last entry; presumably the list terminator */
+};
diff --git a/drivers/infiniband/core/uverbs_std_types_counters.c b/drivers/infiniband/core/uverbs_std_types_counters.c
index a0ffdcf9a51c..309c5e80988d 100644
--- a/drivers/infiniband/core/uverbs_std_types_counters.c
+++ b/drivers/infiniband/core/uverbs_std_types_counters.c
@@ -44,11 +44,11 @@ static int uverbs_free_counters(struct ib_uobject *uobject,
if (ret)
return ret;
- return counters->device->destroy_counters(counters);
+ return counters->device->ops.destroy_counters(counters);
}
static int UVERBS_HANDLER(UVERBS_METHOD_COUNTERS_CREATE)(
- struct ib_uverbs_file *file, struct uverbs_attr_bundle *attrs)
+ struct uverbs_attr_bundle *attrs)
{
struct ib_uobject *uobj = uverbs_attr_get_uobject(
attrs, UVERBS_ATTR_CREATE_COUNTERS_HANDLE);
@@ -61,10 +61,10 @@ static int UVERBS_HANDLER(UVERBS_METHOD_COUNTERS_CREATE)(
* have the ability to remove methods from parse tree once
* such condition is met.
*/
- if (!ib_dev->create_counters)
+ if (!ib_dev->ops.create_counters)
return -EOPNOTSUPP;
- counters = ib_dev->create_counters(ib_dev, attrs);
+ counters = ib_dev->ops.create_counters(ib_dev, attrs);
if (IS_ERR(counters)) {
ret = PTR_ERR(counters);
goto err_create_counters;
@@ -82,7 +82,7 @@ err_create_counters:
}
static int UVERBS_HANDLER(UVERBS_METHOD_COUNTERS_READ)(
- struct ib_uverbs_file *file, struct uverbs_attr_bundle *attrs)
+ struct uverbs_attr_bundle *attrs)
{
struct ib_counters_read_attr read_attr = {};
const struct uverbs_attr *uattr;
@@ -90,7 +90,7 @@ static int UVERBS_HANDLER(UVERBS_METHOD_COUNTERS_READ)(
uverbs_attr_get_obj(attrs, UVERBS_ATTR_READ_COUNTERS_HANDLE);
int ret;
- if (!counters->device->read_counters)
+ if (!counters->device->ops.read_counters)
return -EOPNOTSUPP;
if (!atomic_read(&counters->usecnt))
@@ -109,7 +109,7 @@ static int UVERBS_HANDLER(UVERBS_METHOD_COUNTERS_READ)(
if (IS_ERR(read_attr.counters_buff))
return PTR_ERR(read_attr.counters_buff);
- ret = counters->device->read_counters(counters, &read_attr, attrs);
+ ret = counters->device->ops.read_counters(counters, &read_attr, attrs);
if (ret)
return ret;
@@ -149,3 +149,9 @@ DECLARE_UVERBS_NAMED_OBJECT(UVERBS_OBJECT_COUNTERS,
&UVERBS_METHOD(UVERBS_METHOD_COUNTERS_CREATE),
&UVERBS_METHOD(UVERBS_METHOD_COUNTERS_DESTROY),
&UVERBS_METHOD(UVERBS_METHOD_COUNTERS_READ));
+
+const struct uapi_definition uverbs_def_obj_counters[] = {
+ UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_COUNTERS,
+ UAPI_DEF_OBJ_NEEDS_FN(destroy_counters)),
+ {}
+};
diff --git a/drivers/infiniband/core/uverbs_std_types_cq.c b/drivers/infiniband/core/uverbs_std_types_cq.c
index 5b5f2052cd52..a59ea89e3f2b 100644
--- a/drivers/infiniband/core/uverbs_std_types_cq.c
+++ b/drivers/infiniband/core/uverbs_std_types_cq.c
@@ -58,13 +58,12 @@ static int uverbs_free_cq(struct ib_uobject *uobject,
}
static int UVERBS_HANDLER(UVERBS_METHOD_CQ_CREATE)(
- struct ib_uverbs_file *file, struct uverbs_attr_bundle *attrs)
+ struct uverbs_attr_bundle *attrs)
{
struct ib_ucq_object *obj = container_of(
uverbs_attr_get_uobject(attrs, UVERBS_ATTR_CREATE_CQ_HANDLE),
typeof(*obj), uobject);
struct ib_device *ib_dev = obj->uobject.context->device;
- struct ib_udata uhw;
int ret;
u64 user_handle;
struct ib_cq_init_attr attr = {};
@@ -72,7 +71,7 @@ static int UVERBS_HANDLER(UVERBS_METHOD_CQ_CREATE)(
struct ib_uverbs_completion_event_file *ev_file = NULL;
struct ib_uobject *ev_file_uobj;
- if (!ib_dev->create_cq || !ib_dev->destroy_cq)
+ if (!ib_dev->ops.create_cq || !ib_dev->ops.destroy_cq)
return -EOPNOTSUPP;
ret = uverbs_copy_from(&attr.comp_vector, attrs,
@@ -101,7 +100,7 @@ static int UVERBS_HANDLER(UVERBS_METHOD_CQ_CREATE)(
uverbs_uobject_get(ev_file_uobj);
}
- if (attr.comp_vector >= file->device->num_comp_vectors) {
+ if (attr.comp_vector >= attrs->ufile->device->num_comp_vectors) {
ret = -EINVAL;
goto err_event_file;
}
@@ -111,10 +110,8 @@ static int UVERBS_HANDLER(UVERBS_METHOD_CQ_CREATE)(
INIT_LIST_HEAD(&obj->comp_list);
INIT_LIST_HEAD(&obj->async_list);
- /* Temporary, only until drivers get the new uverbs_attr_bundle */
- create_udata(attrs, &uhw);
-
- cq = ib_dev->create_cq(ib_dev, &attr, obj->uobject.context, &uhw);
+ cq = ib_dev->ops.create_cq(ib_dev, &attr, obj->uobject.context,
+ &attrs->driver_udata);
if (IS_ERR(cq)) {
ret = PTR_ERR(cq);
goto err_event_file;
@@ -129,7 +126,7 @@ static int UVERBS_HANDLER(UVERBS_METHOD_CQ_CREATE)(
obj->uobject.user_handle = user_handle;
atomic_set(&cq->usecnt, 0);
cq->res.type = RDMA_RESTRACK_CQ;
- rdma_restrack_add(&cq->res);
+ rdma_restrack_uadd(&cq->res);
ret = uverbs_copy_to(attrs, UVERBS_ATTR_CREATE_CQ_RESP_CQE, &cq->cqe,
sizeof(cq->cqe));
@@ -173,7 +170,7 @@ DECLARE_UVERBS_NAMED_METHOD(
UVERBS_ATTR_UHW());
static int UVERBS_HANDLER(UVERBS_METHOD_CQ_DESTROY)(
- struct ib_uverbs_file *file, struct uverbs_attr_bundle *attrs)
+ struct uverbs_attr_bundle *attrs)
{
struct ib_uobject *uobj =
uverbs_attr_get_uobject(attrs, UVERBS_ATTR_DESTROY_CQ_HANDLE);
@@ -207,3 +204,9 @@ DECLARE_UVERBS_NAMED_OBJECT(
&UVERBS_METHOD(UVERBS_METHOD_CQ_DESTROY)
#endif
);
+
+const struct uapi_definition uverbs_def_obj_cq[] = {
+ UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_CQ,
+ UAPI_DEF_OBJ_NEEDS_FN(destroy_cq)),
+ {}
+};
diff --git a/drivers/infiniband/core/uverbs_std_types_device.c b/drivers/infiniband/core/uverbs_std_types_device.c
new file mode 100644
index 000000000000..2a3f2f01028d
--- /dev/null
+++ b/drivers/infiniband/core/uverbs_std_types_device.c
@@ -0,0 +1,230 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/*
+ * Copyright (c) 2018, Mellanox Technologies inc. All rights reserved.
+ */
+
+#include <rdma/uverbs_std_types.h>
+#include "rdma_core.h"
+#include "uverbs.h"
+#include <rdma/uverbs_ioctl.h>
+#include <rdma/opa_addr.h>
+
+/*
+ * This ioctl method allows calling any defined write or write_ex
+ * handler. This essentially replaces the hdr/ex_hdr system with the ioctl
+ * marshalling, and brings the non-ex path into the same marshalling as the ex
+ * path.
+ *
+ * The command number comes from UVERBS_ATTR_WRITE_CMD and is resolved with
+ * uapi_get_method(). attrs->ucore is filled from UVERBS_ATTR_CORE_IN and
+ * UVERBS_ATTR_CORE_OUT before the handler runs.
+ *
+ * Returns the error from uverbs_get_const() or uapi_get_method() if either
+ * fails, -ENOSPC if the core buffers are smaller than the method's req_size
+ * or resp_size, and otherwise whatever the selected handler returns.
+ */
+static int UVERBS_HANDLER(UVERBS_METHOD_INVOKE_WRITE)(
+ struct uverbs_attr_bundle *attrs)
+{
+ struct uverbs_api *uapi = attrs->ufile->device->uapi;
+ const struct uverbs_api_write_method *method_elm;
+ u32 cmd;
+ int rc;
+
+ rc = uverbs_get_const(&cmd, attrs, UVERBS_ATTR_WRITE_CMD);
+ if (rc)
+ return rc;
+
+ method_elm = uapi_get_method(uapi, cmd);
+ if (IS_ERR(method_elm))
+ return PTR_ERR(method_elm);
+
+ uverbs_fill_udata(attrs, &attrs->ucore, UVERBS_ATTR_CORE_IN,
+ UVERBS_ATTR_CORE_OUT);
+
+ if (attrs->ucore.inlen < method_elm->req_size ||
+ attrs->ucore.outlen < method_elm->resp_size)
+ return -ENOSPC; /* caller's core request or response buffer is too small for this command */
+
+ return method_elm->handler(attrs);
+}
+
+DECLARE_UVERBS_NAMED_METHOD(UVERBS_METHOD_INVOKE_WRITE,
+ UVERBS_ATTR_CONST_IN(UVERBS_ATTR_WRITE_CMD,
+ enum ib_uverbs_write_cmds,
+ UA_MANDATORY),
+ UVERBS_ATTR_PTR_IN(UVERBS_ATTR_CORE_IN,
+ UVERBS_ATTR_MIN_SIZE(sizeof(u32)),
+ UA_OPTIONAL),
+ UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_CORE_OUT,
+ UVERBS_ATTR_MIN_SIZE(0),
+ UA_OPTIONAL),
+ UVERBS_ATTR_UHW());
+
+static uint32_t *
+gather_objects_handle(struct ib_uverbs_file *ufile,
+ const struct uverbs_api_object *uapi_object,
+ struct uverbs_attr_bundle *attrs,
+ ssize_t out_len,
+ u64 *total)
+{ /*
+ * Walk ufile->uobjects and collect the id of every object whose
+ * uapi_object matches the one requested, stopping once the array of
+ * out_len / sizeof(u32) entries is full.
+ *
+ * Returns the array obtained from uverbs_zalloc() (or its error pointer)
+ * and stores the number of ids written in *total. *total is left
+ * untouched when the allocation fails.
+ */
+ u64 max_count = out_len / sizeof(u32);
+ struct ib_uobject *obj;
+ u64 count = 0;
+ u32 *handles;
+
+ /* Allocated memory that cannot page out where we gather
+ * all object ids under a spin_lock.
+ */
+ handles = uverbs_zalloc(attrs, out_len);
+ if (IS_ERR(handles))
+ return handles;
+
+ spin_lock_irq(&ufile->uobjects_lock);
+ list_for_each_entry(obj, &ufile->uobjects, list) {
+ u32 obj_id = obj->id;
+
+ if (obj->uapi_object != uapi_object)
+ continue;
+
+ if (count >= max_count) /* output array is full: remaining matches are not reported */
+ break;
+
+ handles[count] = obj_id;
+ count++;
+ }
+ spin_unlock_irq(&ufile->uobjects_lock);
+
+ *total = count; /* number of ids stored, never more than max_count */
+ return handles;
+}
+
+static int UVERBS_HANDLER(UVERBS_METHOD_INFO_HANDLES)(
+ struct uverbs_attr_bundle *attrs)
+{ /*
+ * Report the handles of this file's objects of the type selected by
+ * UVERBS_ATTR_INFO_OBJECT_ID. The ids go to
+ * UVERBS_ATTR_INFO_HANDLES_LIST and their count to
+ * UVERBS_ATTR_INFO_TOTAL_HANDLES.
+ *
+ * Returns -EINVAL when the list length is not a positive multiple of
+ * sizeof(u32) or when uapi_get_object() finds no such object type;
+ * otherwise the first error from the helpers called below, or the
+ * result of the final uverbs_copy_to().
+ */
+ const struct uverbs_api_object *uapi_object;
+ ssize_t out_len;
+ u64 total = 0; /* set by gather_objects_handle(), which caps it at the capacity of the output list */
+ u16 object_id;
+ u32 *handles;
+ int ret;
+
+ out_len = uverbs_attr_get_len(attrs, UVERBS_ATTR_INFO_HANDLES_LIST);
+ if (out_len <= 0 || (out_len % sizeof(u32) != 0))
+ return -EINVAL;
+
+ ret = uverbs_get_const(&object_id, attrs, UVERBS_ATTR_INFO_OBJECT_ID);
+ if (ret)
+ return ret;
+
+ uapi_object = uapi_get_object(attrs->ufile->device->uapi, object_id);
+ if (!uapi_object)
+ return -EINVAL;
+
+ handles = gather_objects_handle(attrs->ufile, uapi_object, attrs,
+ out_len, &total);
+ if (IS_ERR(handles))
+ return PTR_ERR(handles);
+
+ ret = uverbs_copy_to(attrs, UVERBS_ATTR_INFO_HANDLES_LIST, handles,
+ sizeof(u32) * total); /* only the entries actually filled in are copied out */
+ if (ret)
+ goto err;
+
+ ret = uverbs_copy_to(attrs, UVERBS_ATTR_INFO_TOTAL_HANDLES, &total,
+ sizeof(total)); /* NOTE(review): total is u64 but the method declares this attribute as UVERBS_ATTR_TYPE(u32); confirm how uverbs_copy_to() treats the size mismatch, in particular on big-endian */
+err:
+ return ret;
+}
+
+void copy_port_attr_to_resp(struct ib_port_attr *attr,
+ struct ib_uverbs_query_port_resp *resp,
+ struct ib_device *ib_dev, u8 port_num)
+{ /*
+ * Fill the user-visible query-port response from an ib_port_attr.
+ * Most fields are copied one to one; port_cap_flags, flags, lid,
+ * sm_lid and link_layer are derived through helpers using ib_dev and
+ * port_num. Nothing is returned and attr is only read.
+ */
+ resp->state = attr->state;
+ resp->max_mtu = attr->max_mtu;
+ resp->active_mtu = attr->active_mtu;
+ resp->gid_tbl_len = attr->gid_tbl_len;
+ resp->port_cap_flags = make_port_cap_flags(attr);
+ resp->max_msg_sz = attr->max_msg_sz;
+ resp->bad_pkey_cntr = attr->bad_pkey_cntr;
+ resp->qkey_viol_cntr = attr->qkey_viol_cntr;
+ resp->pkey_tbl_len = attr->pkey_tbl_len;
+
+ if (rdma_is_grh_required(ib_dev, port_num))
+ resp->flags |= IB_UVERBS_QPF_GRH_REQUIRED; /* flags is only OR-ed here, so bits already set in resp are kept. NOTE(review): callers presumably pass a zeroed resp - confirm */
+
+ if (rdma_cap_opa_ah(ib_dev, port_num)) { /* the two branches differ only in how the LIDs are converted for the response */
+ resp->lid = OPA_TO_IB_UCAST_LID(attr->lid);
+ resp->sm_lid = OPA_TO_IB_UCAST_LID(attr->sm_lid);
+ } else {
+ resp->lid = ib_lid_cpu16(attr->lid);
+ resp->sm_lid = ib_lid_cpu16(attr->sm_lid);
+ }
+
+ resp->lmc = attr->lmc;
+ resp->max_vl_num = attr->max_vl_num;
+ resp->sm_sl = attr->sm_sl;
+ resp->subnet_timeout = attr->subnet_timeout;
+ resp->init_type_reply = attr->init_type_reply;
+ resp->active_width = attr->active_width;
+ resp->active_speed = attr->active_speed;
+ resp->phys_state = attr->phys_state;
+ resp->link_layer = rdma_port_get_link_layer(ib_dev, port_num);
+}
+
+static int UVERBS_HANDLER(UVERBS_METHOD_QUERY_PORT)(
+ struct uverbs_attr_bundle *attrs)
+{ /*
+ * Query the port named by UVERBS_ATTR_QUERY_PORT_PORT_NUM and write an
+ * ib_uverbs_query_port_resp_ex to UVERBS_ATTR_QUERY_PORT_RESP.
+ *
+ * Returns the ucontext lookup error, -EOPNOTSUPP when the device has no
+ * query_port op, the error from uverbs_get_const() or ib_query_port(),
+ * or the result of uverbs_copy_to_struct_or_zero().
+ */
+ struct ib_device *ib_dev;
+ struct ib_port_attr attr = {};
+ struct ib_uverbs_query_port_resp_ex resp = {}; /* starts zeroed; copy_port_attr_to_resp() only ORs into legacy_resp.flags */
+ struct ib_ucontext *ucontext;
+ int ret;
+ u8 port_num;
+
+ ucontext = ib_uverbs_get_ucontext(attrs);
+ if (IS_ERR(ucontext))
+ return PTR_ERR(ucontext);
+ ib_dev = ucontext->device;
+
+ /* FIXME: Extend the UAPI_DEF_OBJ_NEEDS_FN stuff.. */
+ if (!ib_dev->ops.query_port)
+ return -EOPNOTSUPP;
+
+ ret = uverbs_get_const(&port_num, attrs,
+ UVERBS_ATTR_QUERY_PORT_PORT_NUM);
+ if (ret)
+ return ret;
+
+ ret = ib_query_port(ib_dev, port_num, &attr);
+ if (ret)
+ return ret;
+
+ copy_port_attr_to_resp(&attr, &resp.legacy_resp, ib_dev, port_num);
+ resp.port_cap_flags2 = attr.port_cap_flags2; /* the one field set here in addition to the legacy response */
+
+ return uverbs_copy_to_struct_or_zero(attrs, UVERBS_ATTR_QUERY_PORT_RESP,
+ &resp, sizeof(resp));
+}
+
+DECLARE_UVERBS_NAMED_METHOD(
+ UVERBS_METHOD_INFO_HANDLES,
+ /* Also includes any device specific object ids */
+ UVERBS_ATTR_CONST_IN(UVERBS_ATTR_INFO_OBJECT_ID,
+ enum uverbs_default_objects, UA_MANDATORY),
+ UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_INFO_TOTAL_HANDLES,
+ UVERBS_ATTR_TYPE(u32), UA_OPTIONAL),
+ UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_INFO_HANDLES_LIST,
+ UVERBS_ATTR_MIN_SIZE(sizeof(u32)), UA_OPTIONAL));
+
+DECLARE_UVERBS_NAMED_METHOD(
+ UVERBS_METHOD_QUERY_PORT,
+ UVERBS_ATTR_CONST_IN(UVERBS_ATTR_QUERY_PORT_PORT_NUM, u8, UA_MANDATORY),
+ UVERBS_ATTR_PTR_OUT(
+ UVERBS_ATTR_QUERY_PORT_RESP,
+ UVERBS_ATTR_STRUCT(struct ib_uverbs_query_port_resp_ex,
+ reserved),
+ UA_MANDATORY));
+
+DECLARE_UVERBS_GLOBAL_METHODS(UVERBS_OBJECT_DEVICE,
+ &UVERBS_METHOD(UVERBS_METHOD_INVOKE_WRITE),
+ &UVERBS_METHOD(UVERBS_METHOD_INFO_HANDLES),
+ &UVERBS_METHOD(UVERBS_METHOD_QUERY_PORT));
+
+const struct uapi_definition uverbs_def_obj_device[] = {
+ UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_DEVICE),
+ {},
+};
diff --git a/drivers/infiniband/core/uverbs_std_types_dm.c b/drivers/infiniband/core/uverbs_std_types_dm.c
index edc3ff7733d4..2ef70637bee1 100644
--- a/drivers/infiniband/core/uverbs_std_types_dm.c
+++ b/drivers/infiniband/core/uverbs_std_types_dm.c
@@ -43,12 +43,11 @@ static int uverbs_free_dm(struct ib_uobject *uobject,
if (ret)
return ret;
- return dm->device->dealloc_dm(dm);
+ return dm->device->ops.dealloc_dm(dm);
}
-static int
-UVERBS_HANDLER(UVERBS_METHOD_DM_ALLOC)(struct ib_uverbs_file *file,
- struct uverbs_attr_bundle *attrs)
+static int UVERBS_HANDLER(UVERBS_METHOD_DM_ALLOC)(
+ struct uverbs_attr_bundle *attrs)
{
struct ib_dm_alloc_attr attr = {};
struct ib_uobject *uobj =
@@ -58,7 +57,7 @@ UVERBS_HANDLER(UVERBS_METHOD_DM_ALLOC)(struct ib_uverbs_file *file,
struct ib_dm *dm;
int ret;
- if (!ib_dev->alloc_dm)
+ if (!ib_dev->ops.alloc_dm)
return -EOPNOTSUPP;
ret = uverbs_copy_from(&attr.length, attrs,
@@ -71,7 +70,7 @@ UVERBS_HANDLER(UVERBS_METHOD_DM_ALLOC)(struct ib_uverbs_file *file,
if (ret)
return ret;
- dm = ib_dev->alloc_dm(ib_dev, uobj->context, &attr, attrs);
+ dm = ib_dev->ops.alloc_dm(ib_dev, uobj->context, &attr, attrs);
if (IS_ERR(dm))
return PTR_ERR(dm);
@@ -109,3 +108,9 @@ DECLARE_UVERBS_NAMED_OBJECT(UVERBS_OBJECT_DM,
UVERBS_TYPE_ALLOC_IDR(uverbs_free_dm),
&UVERBS_METHOD(UVERBS_METHOD_DM_ALLOC),
&UVERBS_METHOD(UVERBS_METHOD_DM_FREE));
+
+const struct uapi_definition uverbs_def_obj_dm[] = {
+ UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_DM,
+ UAPI_DEF_OBJ_NEEDS_FN(dealloc_dm)),
+ {}
+};
diff --git a/drivers/infiniband/core/uverbs_std_types_flow_action.c b/drivers/infiniband/core/uverbs_std_types_flow_action.c
index cb9486ad5c67..4962b87fa600 100644
--- a/drivers/infiniband/core/uverbs_std_types_flow_action.c
+++ b/drivers/infiniband/core/uverbs_std_types_flow_action.c
@@ -43,7 +43,7 @@ static int uverbs_free_flow_action(struct ib_uobject *uobject,
if (ret)
return ret;
- return action->device->destroy_flow_action(action);
+ return action->device->ops.destroy_flow_action(action);
}
static u64 esp_flags_uverbs_to_verbs(struct uverbs_attr_bundle *attrs,
@@ -223,7 +223,6 @@ struct ib_flow_action_esp_attr {
#define ESP_LAST_SUPPORTED_FLAG IB_UVERBS_FLOW_ACTION_ESP_FLAGS_ESN_NEW_WINDOW
static int parse_flow_action_esp(struct ib_device *ib_dev,
- struct ib_uverbs_file *file,
struct uverbs_attr_bundle *attrs,
struct ib_flow_action_esp_attr *esp_attr,
bool is_modify)
@@ -305,7 +304,7 @@ static int parse_flow_action_esp(struct ib_device *ib_dev,
}
static int UVERBS_HANDLER(UVERBS_METHOD_FLOW_ACTION_ESP_CREATE)(
- struct ib_uverbs_file *file, struct uverbs_attr_bundle *attrs)
+ struct uverbs_attr_bundle *attrs)
{
struct ib_uobject *uobj = uverbs_attr_get_uobject(
attrs, UVERBS_ATTR_CREATE_FLOW_ACTION_ESP_HANDLE);
@@ -314,15 +313,16 @@ static int UVERBS_HANDLER(UVERBS_METHOD_FLOW_ACTION_ESP_CREATE)(
struct ib_flow_action *action;
struct ib_flow_action_esp_attr esp_attr = {};
- if (!ib_dev->create_flow_action_esp)
+ if (!ib_dev->ops.create_flow_action_esp)
return -EOPNOTSUPP;
- ret = parse_flow_action_esp(ib_dev, file, attrs, &esp_attr, false);
+ ret = parse_flow_action_esp(ib_dev, attrs, &esp_attr, false);
if (ret)
return ret;
/* No need to check as this attribute is marked as MANDATORY */
- action = ib_dev->create_flow_action_esp(ib_dev, &esp_attr.hdr, attrs);
+ action = ib_dev->ops.create_flow_action_esp(ib_dev, &esp_attr.hdr,
+ attrs);
if (IS_ERR(action))
return PTR_ERR(action);
@@ -333,7 +333,7 @@ static int UVERBS_HANDLER(UVERBS_METHOD_FLOW_ACTION_ESP_CREATE)(
}
static int UVERBS_HANDLER(UVERBS_METHOD_FLOW_ACTION_ESP_MODIFY)(
- struct ib_uverbs_file *file, struct uverbs_attr_bundle *attrs)
+ struct uverbs_attr_bundle *attrs)
{
struct ib_uobject *uobj = uverbs_attr_get_uobject(
attrs, UVERBS_ATTR_MODIFY_FLOW_ACTION_ESP_HANDLE);
@@ -341,19 +341,19 @@ static int UVERBS_HANDLER(UVERBS_METHOD_FLOW_ACTION_ESP_MODIFY)(
int ret;
struct ib_flow_action_esp_attr esp_attr = {};
- if (!action->device->modify_flow_action_esp)
+ if (!action->device->ops.modify_flow_action_esp)
return -EOPNOTSUPP;
- ret = parse_flow_action_esp(action->device, file, attrs, &esp_attr,
- true);
+ ret = parse_flow_action_esp(action->device, attrs, &esp_attr, true);
if (ret)
return ret;
if (action->type != IB_FLOW_ACTION_ESP)
return -EINVAL;
- return action->device->modify_flow_action_esp(action, &esp_attr.hdr,
- attrs);
+ return action->device->ops.modify_flow_action_esp(action,
+ &esp_attr.hdr,
+ attrs);
}
static const struct uverbs_attr_spec uverbs_flow_action_esp_keymat[] = {
@@ -438,3 +438,10 @@ DECLARE_UVERBS_NAMED_OBJECT(
&UVERBS_METHOD(UVERBS_METHOD_FLOW_ACTION_ESP_CREATE),
&UVERBS_METHOD(UVERBS_METHOD_FLOW_ACTION_DESTROY),
&UVERBS_METHOD(UVERBS_METHOD_FLOW_ACTION_ESP_MODIFY));
+
+const struct uapi_definition uverbs_def_obj_flow_action[] = {
+ UAPI_DEF_CHAIN_OBJ_TREE_NAMED(
+ UVERBS_OBJECT_FLOW_ACTION,
+ UAPI_DEF_OBJ_NEEDS_FN(destroy_flow_action)),
+ {}
+};
diff --git a/drivers/infiniband/core/uverbs_std_types_mr.c b/drivers/infiniband/core/uverbs_std_types_mr.c
index cf02e774303e..4d4be0c2b752 100644
--- a/drivers/infiniband/core/uverbs_std_types_mr.c
+++ b/drivers/infiniband/core/uverbs_std_types_mr.c
@@ -39,8 +39,44 @@ static int uverbs_free_mr(struct ib_uobject *uobject,
return ib_dereg_mr((struct ib_mr *)uobject->object);
}
+static int UVERBS_HANDLER(UVERBS_METHOD_ADVISE_MR)(
+ struct uverbs_attr_bundle *attrs)
+{ /*
+ * Forward an advise-MR request to the driver's advise_mr op for the PD
+ * given in UVERBS_ATTR_ADVISE_MR_PD_HANDLE, passing the advice value,
+ * the flags and the scatter/gather list taken from the attributes.
+ *
+ * Returns -EOPNOTSUPP when the device has no advise_mr op, the error
+ * from any attribute accessor that fails, or the driver's return value.
+ */
+ struct ib_pd *pd =
+ uverbs_attr_get_obj(attrs, UVERBS_ATTR_ADVISE_MR_PD_HANDLE);
+ enum ib_uverbs_advise_mr_advice advice;
+ struct ib_device *ib_dev = pd->device;
+ struct ib_sge *sg_list;
+ int num_sge;
+ u32 flags;
+ int ret;
+
+ /* FIXME: Extend the UAPI_DEF_OBJ_NEEDS_FN stuff.. */
+ if (!ib_dev->ops.advise_mr)
+ return -EOPNOTSUPP;
+
+ ret = uverbs_get_const(&advice, attrs, UVERBS_ATTR_ADVISE_MR_ADVICE);
+ if (ret)
+ return ret;
+
+ ret = uverbs_get_flags32(&flags, attrs, UVERBS_ATTR_ADVISE_MR_FLAGS,
+ IB_UVERBS_ADVISE_MR_FLAG_FLUSH);
+ if (ret)
+ return ret;
+
+ num_sge = uverbs_attr_ptr_get_array_size(
+ attrs, UVERBS_ATTR_ADVISE_MR_SGE_LIST, sizeof(struct ib_sge)); /* NOTE(review): element size is struct ib_sge here, while the method declares the attribute's minimum size as struct ib_uverbs_sge; presumably the two have the same layout - confirm */
+ if (num_sge < 0)
+ return num_sge;
+
+ sg_list = uverbs_attr_get_alloced_ptr(attrs,
+ UVERBS_ATTR_ADVISE_MR_SGE_LIST);
+ return ib_dev->ops.advise_mr(pd, advice, flags, sg_list, num_sge,
+ attrs);
+}
+
static int UVERBS_HANDLER(UVERBS_METHOD_DM_MR_REG)(
- struct ib_uverbs_file *file, struct uverbs_attr_bundle *attrs)
+ struct uverbs_attr_bundle *attrs)
{
struct ib_dm_mr_attr attr = {};
struct ib_uobject *uobj =
@@ -54,7 +90,7 @@ static int UVERBS_HANDLER(UVERBS_METHOD_DM_MR_REG)(
struct ib_mr *mr;
int ret;
- if (!ib_dev->reg_dm_mr)
+ if (!ib_dev->ops.reg_dm_mr)
return -EOPNOTSUPP;
ret = uverbs_copy_from(&attr.offset, attrs, UVERBS_ATTR_REG_DM_MR_OFFSET);
@@ -83,7 +119,7 @@ static int UVERBS_HANDLER(UVERBS_METHOD_DM_MR_REG)(
attr.length > dm->length - attr.offset)
return -EINVAL;
- mr = pd->device->reg_dm_mr(pd, dm, &attr, attrs);
+ mr = pd->device->ops.reg_dm_mr(pd, dm, &attr, attrs);
if (IS_ERR(mr))
return PTR_ERR(mr);
@@ -115,6 +151,23 @@ err_dereg:
}
DECLARE_UVERBS_NAMED_METHOD(
+ UVERBS_METHOD_ADVISE_MR,
+ UVERBS_ATTR_IDR(UVERBS_ATTR_ADVISE_MR_PD_HANDLE,
+ UVERBS_OBJECT_PD,
+ UVERBS_ACCESS_READ,
+ UA_MANDATORY),
+ UVERBS_ATTR_CONST_IN(UVERBS_ATTR_ADVISE_MR_ADVICE,
+ enum ib_uverbs_advise_mr_advice,
+ UA_MANDATORY),
+ UVERBS_ATTR_FLAGS_IN(UVERBS_ATTR_ADVISE_MR_FLAGS,
+ enum ib_uverbs_advise_mr_flag,
+ UA_MANDATORY),
+ UVERBS_ATTR_PTR_IN(UVERBS_ATTR_ADVISE_MR_SGE_LIST,
+ UVERBS_ATTR_MIN_SIZE(sizeof(struct ib_uverbs_sge)),
+ UA_MANDATORY,
+ UA_ALLOC_AND_COPY));
+
+DECLARE_UVERBS_NAMED_METHOD(
UVERBS_METHOD_DM_MR_REG,
UVERBS_ATTR_IDR(UVERBS_ATTR_REG_DM_MR_HANDLE,
UVERBS_OBJECT_MR,
@@ -143,7 +196,22 @@ DECLARE_UVERBS_NAMED_METHOD(
UVERBS_ATTR_TYPE(u32),
UA_MANDATORY));
+DECLARE_UVERBS_NAMED_METHOD_DESTROY(
+ UVERBS_METHOD_MR_DESTROY,
+ UVERBS_ATTR_IDR(UVERBS_ATTR_DESTROY_MR_HANDLE,
+ UVERBS_OBJECT_MR,
+ UVERBS_ACCESS_DESTROY,
+ UA_MANDATORY));
+
DECLARE_UVERBS_NAMED_OBJECT(
UVERBS_OBJECT_MR,
UVERBS_TYPE_ALLOC_IDR(uverbs_free_mr),
- &UVERBS_METHOD(UVERBS_METHOD_DM_MR_REG));
+ &UVERBS_METHOD(UVERBS_METHOD_DM_MR_REG),
+ &UVERBS_METHOD(UVERBS_METHOD_MR_DESTROY),
+ &UVERBS_METHOD(UVERBS_METHOD_ADVISE_MR));
+
+const struct uapi_definition uverbs_def_obj_mr[] = {
+ UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_MR,
+ UAPI_DEF_OBJ_NEEDS_FN(dereg_mr)),
+ {}
+};
diff --git a/drivers/infiniband/core/uverbs_uapi.c b/drivers/infiniband/core/uverbs_uapi.c
index 86f3fc5e04b4..7a987acf0c0b 100644
--- a/drivers/infiniband/core/uverbs_uapi.c
+++ b/drivers/infiniband/core/uverbs_uapi.c
@@ -8,6 +8,11 @@
#include "rdma_core.h"
#include "uverbs.h"
+static int ib_uverbs_notsupp(struct uverbs_attr_bundle *attrs)
+{ /* Handler that ignores attrs and always reports -EOPNOTSUPP. */
+ return -EOPNOTSUPP;
+}
+
static void *uapi_add_elm(struct uverbs_api *uapi, u32 key, size_t alloc_size)
{
void *elm;
@@ -26,6 +31,70 @@ static void *uapi_add_elm(struct uverbs_api *uapi, u32 key, size_t alloc_size)
return elm;
}
+/*
+ * Add a new element at @key, or return the one that is already stored there.
+ *
+ * uapi_add_elm() is tried first; on success the new element is returned with
+ * *exists set to false. If it fails with -EEXIST the existing element is
+ * looked up in uapi->radix and returned with *exists set to true. Any other
+ * uapi_add_elm() error is passed through unchanged, and a failed lookup warns
+ * and yields ERR_PTR(-EINVAL). *exists is not written on the error paths.
+ */
+static void *uapi_add_get_elm(struct uverbs_api *uapi, u32 key,
+ size_t alloc_size, bool *exists)
+{
+ void *elm;
+
+ elm = uapi_add_elm(uapi, key, alloc_size);
+ if (!IS_ERR(elm)) {
+ *exists = false;
+ return elm;
+ }
+
+ if (elm != ERR_PTR(-EEXIST))
+ return elm;
+
+ elm = radix_tree_lookup(&uapi->radix, key);
+ if (WARN_ON(!elm))
+ return ERR_PTR(-EINVAL);
+ *exists = true;
+ return elm;
+}
+
+/*
+ * Create or update the write()-interface method described by @def under the
+ * object identified by @obj_key.
+ *
+ * The method key is @obj_key combined with the write or write_ex key of
+ * def->write.command_num. The handler comes from def->func_write, and the
+ * method is marked disabled when the command's bit is clear in
+ * ibdev->uverbs_ex_cmd_mask (extended commands) or ibdev->uverbs_cmd_mask
+ * (regular commands).
+ *
+ * Returns 0 and stores the method key in *cur_method_key, or a negative errno.
+ */
+static int uapi_create_write(struct uverbs_api *uapi,
+ struct ib_device *ibdev,
+ const struct uapi_definition *def,
+ u32 obj_key,
+ u32 *cur_method_key)
+{
+ struct uverbs_api_write_method *method_elm;
+ u32 method_key = obj_key;
+ bool exists;
+
+ if (def->write.is_ex)
+ method_key |= uapi_key_write_ex_method(def->write.command_num);
+ else
+ method_key |= uapi_key_write_method(def->write.command_num);
+
+ method_elm = uapi_add_get_elm(uapi, method_key, sizeof(*method_elm),
+ &exists);
+ if (IS_ERR(method_elm))
+ return PTR_ERR(method_elm);
+
+ /* An existing entry may be redefined, but must keep its is_ex kind */
+ if (WARN_ON(exists && (def->write.is_ex != method_elm->is_ex)))
+ return -EINVAL;
+
+ method_elm->is_ex = def->write.is_ex;
+ method_elm->handler = def->func_write;
+ if (def->write.is_ex)
+ method_elm->disabled = !(ibdev->uverbs_ex_cmd_mask &
+ BIT_ULL(def->write.command_num));
+ else
+ method_elm->disabled = !(ibdev->uverbs_cmd_mask &
+ BIT_ULL(def->write.command_num));
+
+ /* Request/response metadata is only copied for regular commands with a handler */
+ if (!def->write.is_ex && def->func_write) {
+ method_elm->has_udata = def->write.has_udata;
+ method_elm->has_resp = def->write.has_resp;
+ method_elm->req_size = def->write.req_size;
+ method_elm->resp_size = def->write.resp_size;
+ }
+
+ *cur_method_key = method_key;
+ return 0;
+}
+
static int uapi_merge_method(struct uverbs_api *uapi,
struct uverbs_api_object *obj_elm, u32 obj_key,
const struct uverbs_method_def *method,
@@ -34,23 +103,21 @@ static int uapi_merge_method(struct uverbs_api *uapi,
u32 method_key = obj_key | uapi_key_ioctl_method(method->id);
struct uverbs_api_ioctl_method *method_elm;
unsigned int i;
+ bool exists;
if (!method->attrs)
return 0;
- method_elm = uapi_add_elm(uapi, method_key, sizeof(*method_elm));
- if (IS_ERR(method_elm)) {
- if (method_elm != ERR_PTR(-EEXIST))
- return PTR_ERR(method_elm);
-
+ method_elm = uapi_add_get_elm(uapi, method_key, sizeof(*method_elm),
+ &exists);
+ if (IS_ERR(method_elm))
+ return PTR_ERR(method_elm);
+ if (exists) {
/*
* This occurs when a driver uses ADD_UVERBS_ATTRIBUTES_SIMPLE
*/
if (WARN_ON(method->handler))
return -EINVAL;
- method_elm = radix_tree_lookup(&uapi->radix, method_key);
- if (WARN_ON(!method_elm))
- return -EINVAL;
} else {
WARN_ON(!method->handler);
rcu_assign_pointer(method_elm->handler, method->handler);
@@ -98,72 +165,188 @@ static int uapi_merge_method(struct uverbs_api *uapi,
return 0;
}
-static int uapi_merge_tree(struct uverbs_api *uapi,
- const struct uverbs_object_tree_def *tree,
- bool is_driver)
+/*
+ * Merge one object definition into the uapi radix tree.
+ *
+ * The object element for obj->id is created if it is not already present.
+ * When @obj carries type_attrs they are installed on the element; it is an
+ * error (-EINVAL with a warning) if the element already has type_attrs, or if
+ * a driver object uses a type class other than the idr or fd class. Every
+ * non-NULL entry of obj->methods is then merged with uapi_merge_method().
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int uapi_merge_obj_tree(struct uverbs_api *uapi,
+ const struct uverbs_object_def *obj,
+ bool is_driver)
{
- unsigned int i, j;
+ struct uverbs_api_object *obj_elm;
+ unsigned int i;
+ u32 obj_key;
+ bool exists;
int rc;
- if (!tree->objects)
+ obj_key = uapi_key_obj(obj->id);
+ obj_elm = uapi_add_get_elm(uapi, obj_key, sizeof(*obj_elm), &exists);
+ if (IS_ERR(obj_elm))
+ return PTR_ERR(obj_elm);
+
+ if (obj->type_attrs) {
+ if (WARN_ON(obj_elm->type_attrs))
+ return -EINVAL;
+
+ obj_elm->id = obj->id;
+ obj_elm->type_attrs = obj->type_attrs;
+ obj_elm->type_class = obj->type_attrs->type_class;
+ /*
+ * Today drivers are only permitted to use idr_class and
+ * fd_class types. We can revoke the IDR types during
+ * disassociation, and the FD types require the driver to use
+ * struct file_operations.owner to prevent the driver module
+ * code from unloading while the file is open. This provides
+ * enough safety that uverbs_close_fd() will continue to work.
+ * Drivers using FD are responsible to handle disassociation of
+ * the device on their own.
+ */
+ if (WARN_ON(is_driver &&
+ obj->type_attrs->type_class != &uverbs_idr_class &&
+ obj->type_attrs->type_class != &uverbs_fd_class))
+ return -EINVAL;
+ }
+
+ if (!obj->methods)
return 0;
- for (i = 0; i != tree->num_objects; i++) {
- const struct uverbs_object_def *obj = (*tree->objects)[i];
- struct uverbs_api_object *obj_elm;
- u32 obj_key;
+ for (i = 0; i != obj->num_methods; i++) {
+ const struct uverbs_method_def *method = (*obj->methods)[i];
- if (!obj)
+ if (!method)
continue;
- obj_key = uapi_key_obj(obj->id);
- obj_elm = uapi_add_elm(uapi, obj_key, sizeof(*obj_elm));
- if (IS_ERR(obj_elm)) {
- if (obj_elm != ERR_PTR(-EEXIST))
- return PTR_ERR(obj_elm);
+ rc = uapi_merge_method(uapi, obj_elm, obj_key, method,
+ is_driver);
+ if (rc)
+ return rc;
+ }
- /* This occurs when a driver uses ADD_UVERBS_METHODS */
- if (WARN_ON(obj->type_attrs))
- return -EINVAL;
- obj_elm = radix_tree_lookup(&uapi->radix, obj_key);
- if (WARN_ON(!obj_elm))
+ return 0;
+}
+
+/*
+ * Mark the element that @def applies to as disabled.
+ *
+ * With UAPI_SCOPE_OBJECT the object at @obj_key is disabled. With
+ * UAPI_SCOPE_METHOD the element at @method_key is disabled, handled as an
+ * ioctl method or as a write/write_ex method depending on the key. The
+ * element is created if it does not exist yet. Any other scope/key
+ * combination warns and returns -EINVAL.
+ */
+static int uapi_disable_elm(struct uverbs_api *uapi,
+ const struct uapi_definition *def,
+ u32 obj_key,
+ u32 method_key)
+{
+ bool exists;
+
+ if (def->scope == UAPI_SCOPE_OBJECT) {
+ struct uverbs_api_object *obj_elm;
+
+ obj_elm = uapi_add_get_elm(
+ uapi, obj_key, sizeof(*obj_elm), &exists);
+ if (IS_ERR(obj_elm))
+ return PTR_ERR(obj_elm);
+ obj_elm->disabled = 1;
+ return 0;
+ }
+
+ if (def->scope == UAPI_SCOPE_METHOD &&
+ uapi_key_is_ioctl_method(method_key)) {
+ struct uverbs_api_ioctl_method *method_elm;
+
+ method_elm = uapi_add_get_elm(uapi, method_key,
+ sizeof(*method_elm), &exists);
+ if (IS_ERR(method_elm))
+ return PTR_ERR(method_elm);
+ method_elm->disabled = 1;
+ return 0;
+ }
+
+ if (def->scope == UAPI_SCOPE_METHOD &&
+ (uapi_key_is_write_method(method_key) ||
+ uapi_key_is_write_ex_method(method_key))) {
+ struct uverbs_api_write_method *write_elm;
+
+ write_elm = uapi_add_get_elm(uapi, method_key,
+ sizeof(*write_elm), &exists);
+ if (IS_ERR(write_elm))
+ return PTR_ERR(write_elm);
+ write_elm->disabled = 1;
+ return 0;
+ }
+
+ WARN_ON(true);
+ return -EINVAL;
+}
+
+/*
+ * Walk a uapi_definition list and merge its contents into @uapi.
+ *
+ * Entries are processed in order until UAPI_DEF_END. cur_obj_key and
+ * cur_method_key remember the most recently started object and write method
+ * so that a following IS_SUPPORTED entry can disable the right element.
+ * UAPI_DEF_CHAIN entries recurse into the chained list. A NULL @def_list is
+ * accepted and treated as empty.
+ *
+ * Returns 0 on success or a negative errno; an unrecognised entry kind warns
+ * and returns -EINVAL.
+ */
+static int uapi_merge_def(struct uverbs_api *uapi, struct ib_device *ibdev,
+ const struct uapi_definition *def_list,
+ bool is_driver)
+{
+ const struct uapi_definition *def = def_list;
+ u32 cur_obj_key = UVERBS_API_KEY_ERR;
+ u32 cur_method_key = UVERBS_API_KEY_ERR;
+ bool exists;
+ int rc;
+
+ if (!def_list)
+ return 0;
+
+ for (;; def++) {
+ switch ((enum uapi_definition_kind)def->kind) {
+ case UAPI_DEF_CHAIN:
+ rc = uapi_merge_def(uapi, ibdev, def->chain, is_driver);
+ if (rc)
+ return rc;
+ continue;
+
+ case UAPI_DEF_CHAIN_OBJ_TREE:
+ if (WARN_ON(def->object_start.object_id !=
+ def->chain_obj_tree->id))
return -EINVAL;
- } else {
- obj_elm->type_attrs = obj->type_attrs;
- if (obj->type_attrs) {
- obj_elm->type_class =
- obj->type_attrs->type_class;
- /*
- * Today drivers are only permitted to use
- * idr_class types. They cannot use FD types
- * because we currently have no way to revoke
- * the fops pointer after device
- * disassociation.
- */
- if (WARN_ON(is_driver &&
- obj->type_attrs->type_class !=
- &uverbs_idr_class))
- return -EINVAL;
- }
- }
- if (!obj->methods)
+ cur_obj_key = uapi_key_obj(def->object_start.object_id);
+ rc = uapi_merge_obj_tree(uapi, def->chain_obj_tree,
+ is_driver);
+ if (rc)
+ return rc;
continue;
- for (j = 0; j != obj->num_methods; j++) {
- const struct uverbs_method_def *method =
- (*obj->methods)[j];
- if (!method)
+ case UAPI_DEF_END:
+ return 0;
+
+ /*
+ * Read the pointer stored needs_fn_offset bytes into ibdev->ops;
+ * if it is NULL the element this entry follows gets disabled.
+ */
+ case UAPI_DEF_IS_SUPPORTED_DEV_FN: {
+ void **ibdev_fn =
+ (void *)(&ibdev->ops) + def->needs_fn_offset;
+
+ if (*ibdev_fn)
continue;
+ rc = uapi_disable_elm(
+ uapi, def, cur_obj_key, cur_method_key);
+ if (rc)
+ return rc;
+ continue;
+ }
- rc = uapi_merge_method(uapi, obj_elm, obj_key, method,
- is_driver);
+ case UAPI_DEF_IS_SUPPORTED_FUNC:
+ if (def->func_is_supported(ibdev))
+ continue;
+ rc = uapi_disable_elm(
+ uapi, def, cur_obj_key, cur_method_key);
if (rc)
return rc;
+ continue;
+
+ case UAPI_DEF_OBJECT_START: {
+ struct uverbs_api_object *obj_elm;
+
+ cur_obj_key = uapi_key_obj(def->object_start.object_id);
+ obj_elm = uapi_add_get_elm(uapi, cur_obj_key,
+ sizeof(*obj_elm), &exists);
+ if (IS_ERR(obj_elm))
+ return PTR_ERR(obj_elm);
+ continue;
}
- }
- return 0;
+ case UAPI_DEF_WRITE:
+ rc = uapi_create_write(
+ uapi, ibdev, def, cur_obj_key, &cur_method_key);
+ if (rc)
+ return rc;
+ continue;
+ }
+ WARN_ON(true);
+ return -EINVAL;
+ }
}
static int
@@ -186,13 +369,16 @@ uapi_finalize_ioctl_method(struct uverbs_api *uapi,
u32 attr_bkey = uapi_bkey_attr(attr_key);
u8 type = elm->spec.type;
- if (uapi_key_attr_to_method(iter.index) !=
- uapi_key_attr_to_method(method_key))
+ if (uapi_key_attr_to_ioctl_method(iter.index) !=
+ uapi_key_attr_to_ioctl_method(method_key))
break;
if (elm->spec.mandatory)
__set_bit(attr_bkey, method_elm->attr_mandatory);
+ if (elm->spec.is_udata)
+ method_elm->has_udata = true;
+
if (type == UVERBS_ATTR_TYPE_IDR ||
type == UVERBS_ATTR_TYPE_FD) {
u8 access = elm->spec.u.obj.access;
@@ -229,9 +415,13 @@ uapi_finalize_ioctl_method(struct uverbs_api *uapi,
+/*
+ * Final pass over the merged radix tree.
+ *
+ * The first walk (partly elided by this hunk) returns early on a non-zero rc
+ * and records the highest write and write_ex method indices. A single array
+ * of num_write + num_write_ex method pointers is then allocated, every slot
+ * is pointed at uapi->notsupp_method, and a second walk stores the real
+ * write/write_ex methods at their index. write_ex_methods points into the
+ * same allocation, directly after the num_write regular entries.
+ *
+ * Returns 0 on success, -ENOMEM if the dispatch array cannot be allocated,
+ * or the error from the first walk.
+ */
static int uapi_finalize(struct uverbs_api *uapi)
{
+ const struct uverbs_api_write_method **data;
+ unsigned long max_write_ex = 0;
+ unsigned long max_write = 0;
struct radix_tree_iter iter;
void __rcu **slot;
int rc;
+ int i;
radix_tree_for_each_slot (slot, &uapi->radix, &iter, 0) {
struct uverbs_api_ioctl_method *method_elm =
@@ -243,29 +433,209 @@ static int uapi_finalize(struct uverbs_api *uapi)
if (rc)
return rc;
}
+
+ if (uapi_key_is_write_method(iter.index))
+ max_write = max(max_write,
+ iter.index & UVERBS_API_ATTR_KEY_MASK);
+ if (uapi_key_is_write_ex_method(iter.index))
+ max_write_ex =
+ max(max_write_ex,
+ iter.index & UVERBS_API_ATTR_KEY_MASK);
+ }
+
+ uapi->notsupp_method.handler = ib_uverbs_notsupp;
+ uapi->num_write = max_write + 1;
+ uapi->num_write_ex = max_write_ex + 1;
+ data = kmalloc_array(uapi->num_write + uapi->num_write_ex,
+ sizeof(*uapi->write_methods), GFP_KERNEL);
+ /*
+ * kmalloc_array() can fail; bail out before the fill loop dereferences
+ * the result. NOTE(review): write_methods is left unset on this path,
+ * so this relies on uapi being zero-initialised for the kfree() in
+ * uverbs_destroy_api() - the allocation is outside this hunk, confirm.
+ */
+ if (!data)
+ return -ENOMEM;
+ for (i = 0; i != uapi->num_write + uapi->num_write_ex; i++)
+ data[i] = &uapi->notsupp_method;
+ uapi->write_methods = data;
+ uapi->write_ex_methods = data + uapi->num_write;
+
+ radix_tree_for_each_slot (slot, &uapi->radix, &iter, 0) {
+ if (uapi_key_is_write_method(iter.index))
+ uapi->write_methods[iter.index &
+ UVERBS_API_ATTR_KEY_MASK] =
+ rcu_dereference_protected(*slot, true);
+ if (uapi_key_is_write_ex_method(iter.index))
+ uapi->write_ex_methods[iter.index &
+ UVERBS_API_ATTR_KEY_MASK] =
+ rcu_dereference_protected(*slot, true);
}
return 0;
}
-void uverbs_destroy_api(struct uverbs_api *uapi)
+/*
+ * Free and delete every radix tree element whose key lies in the inclusive
+ * range [start, last]. The walk begins at @start and stops at the first key
+ * greater than @last.
+ */
+static void uapi_remove_range(struct uverbs_api *uapi, u32 start, u32 last)
{
struct radix_tree_iter iter;
void __rcu **slot;
- if (!uapi)
- return;
-
- radix_tree_for_each_slot (slot, &uapi->radix, &iter, 0) {
+ radix_tree_for_each_slot (slot, &uapi->radix, &iter, start) {
+ if (iter.index > last)
+ return;
kfree(rcu_dereference_protected(*slot, true));
radix_tree_iter_delete(&uapi->radix, &iter, slot);
}
+}
+
+/*
+ * Remove the element at @obj_key and every element up to @obj_key with all
+ * method and attribute key bits set (presumably the object's methods and
+ * their attributes - the key layout is defined outside this file).
+ */
+static void uapi_remove_object(struct uverbs_api *uapi, u32 obj_key)
+{
+ uapi_remove_range(uapi, obj_key,
+ obj_key | UVERBS_API_METHOD_KEY_MASK |
+ UVERBS_API_ATTR_KEY_MASK);
+}
+
+/*
+ * Remove the element at @method_key and every element up to @method_key with
+ * all attribute key bits set (presumably the method's attributes).
+ */
+static void uapi_remove_method(struct uverbs_api *uapi, u32 method_key)
+{
+ uapi_remove_range(uapi, method_key,
+ method_key | UVERBS_API_ATTR_KEY_MASK);
+}
+
+
+/*
+ * Return the object type an attribute spec refers to: u.obj.obj_type for IDR
+ * and FD attributes, u2.objs_arr.obj_type for IDRS_ARRAY attributes. Any
+ * other attribute type yields UVERBS_API_KEY_ERR.
+ */
+static u32 uapi_get_obj_id(struct uverbs_attr_spec *spec)
+{
+ if (spec->type == UVERBS_ATTR_TYPE_IDR ||
+ spec->type == UVERBS_ATTR_TYPE_FD)
+ return spec->u.obj.obj_type;
+ if (spec->type == UVERBS_ATTR_TYPE_IDRS_ARRAY)
+ return spec->u2.objs_arr.obj_type;
+ return UVERBS_API_KEY_ERR;
+}
+
+/*
+ * Sanity check: @key must be classified as exactly one of object, ioctl
+ * method, write method, write_ex method or attribute. Emits a warning with
+ * the match count and the key otherwise; nothing is returned to the caller.
+ */
+static void uapi_key_okay(u32 key)
+{
+ unsigned int count = 0;
+
+ if (uapi_key_is_object(key))
+ count++;
+ if (uapi_key_is_ioctl_method(key))
+ count++;
+ if (uapi_key_is_write_method(key))
+ count++;
+ if (uapi_key_is_write_ex_method(key))
+ count++;
+ if (uapi_key_is_attr(key))
+ count++;
+ WARN(count != 1, "Bad count %d key=%x", count, key);
+}
+
+/*
+ * Remove everything that was marked disabled while merging, plus every ioctl
+ * method with a mandatory object-handle attribute whose object is missing or
+ * disabled.
+ *
+ * After an object or ioctl method is removed the walk restarts from the key
+ * that triggered the removal. Removing an object also sets scan_again, which
+ * forces one more full pass from key 0 once the current pass completes.
+ * Disabled write/write_ex methods are freed and deleted in place without
+ * restarting the walk.
+ */
+static void uapi_finalize_disable(struct uverbs_api *uapi)
+{
+ struct radix_tree_iter iter;
+ u32 starting_key = 0;
+ bool scan_again = false;
+ void __rcu **slot;
+
+again:
+ radix_tree_for_each_slot (slot, &uapi->radix, &iter, starting_key) {
+ uapi_key_okay(iter.index);
+
+ if (uapi_key_is_object(iter.index)) {
+ struct uverbs_api_object *obj_elm =
+ rcu_dereference_protected(*slot, true);
+
+ if (obj_elm->disabled) {
+ /* Have to check all the attrs again */
+ scan_again = true;
+ starting_key = iter.index;
+ uapi_remove_object(uapi, iter.index);
+ goto again;
+ }
+ continue;
+ }
+
+ if (uapi_key_is_ioctl_method(iter.index)) {
+ struct uverbs_api_ioctl_method *method_elm =
+ rcu_dereference_protected(*slot, true);
+
+ if (method_elm->disabled) {
+ starting_key = iter.index;
+ uapi_remove_method(uapi, iter.index);
+ goto again;
+ }
+ continue;
+ }
+
+ if (uapi_key_is_write_method(iter.index) ||
+ uapi_key_is_write_ex_method(iter.index)) {
+ struct uverbs_api_write_method *method_elm =
+ rcu_dereference_protected(*slot, true);
+
+ if (method_elm->disabled) {
+ kfree(method_elm);
+ radix_tree_iter_delete(&uapi->radix, &iter, slot);
+ }
+ continue;
+ }
+
+ if (uapi_key_is_attr(iter.index)) {
+ struct uverbs_api_attr *attr_elm =
+ rcu_dereference_protected(*slot, true);
+ const struct uverbs_api_object *tmp_obj;
+ u32 obj_key;
+
+ /*
+ * If the method has a mandatory object handle
+ * attribute which relies on an object which is not
+ * present then the entire method is uncallable.
+ */
+ if (!attr_elm->spec.mandatory)
+ continue;
+ obj_key = uapi_get_obj_id(&attr_elm->spec);
+ if (obj_key == UVERBS_API_KEY_ERR)
+ continue;
+ tmp_obj = uapi_get_object(uapi, obj_key);
+ /*
+ * NOTE(review): an -ENOMSG error from uapi_get_object() keeps
+ * the method; its meaning is defined outside this hunk.
+ */
+ if (IS_ERR(tmp_obj)) {
+ if (PTR_ERR(tmp_obj) == -ENOMSG)
+ continue;
+ } else {
+ if (!tmp_obj->disabled)
+ continue;
+ }
+
+ /* Drop the owning method: keep only the object and method key bits */
+ starting_key = iter.index;
+ uapi_remove_method(
+ uapi,
+ iter.index & (UVERBS_API_OBJ_KEY_MASK |
+ UVERBS_API_METHOD_KEY_MASK));
+ goto again;
+ }
+
+ /*
+ * NOTE(review): WARN_ON(false) can never trigger. Every branch
+ * above ends in continue or goto, so this point is only reached
+ * for a key that matches no known class; WARN_ON(true) may have
+ * been intended - confirm.
+ */
+ WARN_ON(false);
+ }
+
+ if (!scan_again)
+ return;
+ scan_again = false;
+ starting_key = 0;
+ goto again;
+}
+
+/*
+ * Free a uverbs_api: every radix tree element, the write-method dispatch
+ * array, and finally the structure itself. A NULL @uapi is ignored.
+ */
+void uverbs_destroy_api(struct uverbs_api *uapi)
+{
+ if (!uapi)
+ return;
+
+ uapi_remove_range(uapi, 0, U32_MAX);
+ /* write_ex_methods points into this same allocation (see uapi_finalize) */
+ kfree(uapi->write_methods);
kfree(uapi);
}
-struct uverbs_api *uverbs_alloc_api(
- const struct uverbs_object_tree_def *const *driver_specs,
- enum rdma_driver_id driver_id)
+/*
+ * Definition chains that make up the core uverbs API, terminated by an empty
+ * entry. uverbs_alloc_api() merges this list before the driver's own
+ * definitions (ibdev->driver_def).
+ */
+static const struct uapi_definition uverbs_core_api[] = {
+ UAPI_DEF_CHAIN(uverbs_def_obj_counters),
+ UAPI_DEF_CHAIN(uverbs_def_obj_cq),
+ UAPI_DEF_CHAIN(uverbs_def_obj_device),
+ UAPI_DEF_CHAIN(uverbs_def_obj_dm),
+ UAPI_DEF_CHAIN(uverbs_def_obj_flow_action),
+ UAPI_DEF_CHAIN(uverbs_def_obj_intf),
+ UAPI_DEF_CHAIN(uverbs_def_obj_mr),
+ UAPI_DEF_CHAIN(uverbs_def_write_intf),
+ {},
+};
+
+struct uverbs_api *uverbs_alloc_api(struct ib_device *ibdev)
{
struct uverbs_api *uapi;
int rc;
@@ -275,18 +645,16 @@ struct uverbs_api *uverbs_alloc_api(
return ERR_PTR(-ENOMEM);
INIT_RADIX_TREE(&uapi->radix, GFP_KERNEL);
- uapi->driver_id = driver_id;
+ uapi->driver_id = ibdev->driver_id;
- rc = uapi_merge_tree(uapi, uverbs_default_get_objects(), false);
+ rc = uapi_merge_def(uapi, ibdev, uverbs_core_api, false);
+ if (rc)
+ goto err;
+ rc = uapi_merge_def(uapi, ibdev, ibdev->driver_def, true);
if (rc)
goto err;
- for (; driver_specs && *driver_specs; driver_specs++) {
- rc = uapi_merge_tree(uapi, *driver_specs, true);
- if (rc)
- goto err;
- }
-
+ uapi_finalize_disable(uapi);
rc = uapi_finalize(uapi);
if (rc)
goto err;
@@ -294,8 +662,9 @@ struct uverbs_api *uverbs_alloc_api(
return uapi;
err:
if (rc != -ENOMEM)
- pr_err("Setup of uverbs_api failed, kernel parsing tree description is not valid (%d)??\n",
- rc);
+ dev_err(&ibdev->dev,
+ "Setup of uverbs_api failed, kernel parsing tree description is not valid (%d)??\n",
+ rc);
uverbs_destroy_api(uapi);
return ERR_PTR(rc);
diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c
index 178899e3ce73..5a5e83f5f0fc 100644
--- a/drivers/infiniband/core/verbs.c
+++ b/drivers/infiniband/core/verbs.c
@@ -141,6 +141,10 @@ __attribute_const__ int ib_rate_to_mult(enum ib_rate rate)
case IB_RATE_100_GBPS: return 40;
case IB_RATE_200_GBPS: return 80;
case IB_RATE_300_GBPS: return 120;
+ case IB_RATE_28_GBPS: return 11;
+ case IB_RATE_50_GBPS: return 20;
+ case IB_RATE_400_GBPS: return 160;
+ case IB_RATE_600_GBPS: return 240;
default: return -1;
}
}
@@ -166,6 +170,10 @@ __attribute_const__ enum ib_rate mult_to_ib_rate(int mult)
case 40: return IB_RATE_100_GBPS;
case 80: return IB_RATE_200_GBPS;
case 120: return IB_RATE_300_GBPS;
+ case 11: return IB_RATE_28_GBPS;
+ case 20: return IB_RATE_50_GBPS;
+ case 160: return IB_RATE_400_GBPS;
+ case 240: return IB_RATE_600_GBPS;
default: return IB_RATE_PORT_CURRENT;
}
}
@@ -191,6 +199,10 @@ __attribute_const__ int ib_rate_to_mbps(enum ib_rate rate)
case IB_RATE_100_GBPS: return 103125;
case IB_RATE_200_GBPS: return 206250;
case IB_RATE_300_GBPS: return 309375;
+ case IB_RATE_28_GBPS: return 28125;
+ case IB_RATE_50_GBPS: return 53125;
+ case IB_RATE_400_GBPS: return 425000;
+ case IB_RATE_600_GBPS: return 637500;
default: return -1;
}
}
@@ -214,8 +226,8 @@ EXPORT_SYMBOL(rdma_node_get_transport);
enum rdma_link_layer rdma_port_get_link_layer(struct ib_device *device, u8 port_num)
{
enum rdma_transport_type lt;
- if (device->get_link_layer)
- return device->get_link_layer(device, port_num);
+ if (device->ops.get_link_layer)
+ return device->ops.get_link_layer(device, port_num);
lt = rdma_node_get_transport(device->node_type);
if (lt == RDMA_TRANSPORT_IB)
@@ -242,10 +254,11 @@ struct ib_pd *__ib_alloc_pd(struct ib_device *device, unsigned int flags,
{
struct ib_pd *pd;
int mr_access_flags = 0;
+ int ret;
- pd = device->alloc_pd(device, NULL, NULL);
- if (IS_ERR(pd))
- return pd;
+ pd = rdma_zalloc_drv_obj(device, ib_pd);
+ if (!pd)
+ return ERR_PTR(-ENOMEM);
pd->device = device;
pd->uobject = NULL;
@@ -253,6 +266,16 @@ struct ib_pd *__ib_alloc_pd(struct ib_device *device, unsigned int flags,
atomic_set(&pd->usecnt, 0);
pd->flags = flags;
+ pd->res.type = RDMA_RESTRACK_PD;
+ rdma_restrack_set_task(&pd->res, caller);
+
+ ret = device->ops.alloc_pd(pd, NULL, NULL);
+ if (ret) {
+ kfree(pd);
+ return ERR_PTR(ret);
+ }
+ rdma_restrack_kadd(&pd->res);
+
if (device->attrs.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY)
pd->local_dma_lkey = device->local_dma_lkey;
else
@@ -263,14 +286,10 @@ struct ib_pd *__ib_alloc_pd(struct ib_device *device, unsigned int flags,
mr_access_flags |= IB_ACCESS_REMOTE_READ | IB_ACCESS_REMOTE_WRITE;
}
- pd->res.type = RDMA_RESTRACK_PD;
- rdma_restrack_set_task(&pd->res, caller);
- rdma_restrack_add(&pd->res);
-
if (mr_access_flags) {
struct ib_mr *mr;
- mr = pd->device->get_dma_mr(pd, mr_access_flags);
+ mr = pd->device->ops.get_dma_mr(pd, mr_access_flags);
if (IS_ERR(mr)) {
ib_dealloc_pd(pd);
return ERR_CAST(mr);
@@ -307,7 +326,7 @@ void ib_dealloc_pd(struct ib_pd *pd)
int ret;
if (pd->__internal_mr) {
- ret = pd->device->dereg_mr(pd->__internal_mr);
+ ret = pd->device->ops.dereg_mr(pd->__internal_mr);
WARN_ON(ret);
pd->__internal_mr = NULL;
}
@@ -317,10 +336,8 @@ void ib_dealloc_pd(struct ib_pd *pd)
WARN_ON(atomic_read(&pd->usecnt));
rdma_restrack_del(&pd->res);
- /* Making delalloc_pd a void return is a WIP, no driver should return
- an error here. */
- ret = pd->device->dealloc_pd(pd);
- WARN_ONCE(ret, "Infiniband HW driver failed dealloc_pd");
+ pd->device->ops.dealloc_pd(pd);
+ kfree(pd);
}
EXPORT_SYMBOL(ib_dealloc_pd);
@@ -475,14 +492,17 @@ rdma_update_sgid_attr(struct rdma_ah_attr *ah_attr,
static struct ib_ah *_rdma_create_ah(struct ib_pd *pd,
struct rdma_ah_attr *ah_attr,
+ u32 flags,
struct ib_udata *udata)
{
struct ib_ah *ah;
- if (!pd->device->create_ah)
+ might_sleep_if(flags & RDMA_CREATE_AH_SLEEPABLE);
+
+ if (!pd->device->ops.create_ah)
return ERR_PTR(-EOPNOTSUPP);
- ah = pd->device->create_ah(pd, ah_attr, udata);
+ ah = pd->device->ops.create_ah(pd, ah_attr, flags, udata);
if (!IS_ERR(ah)) {
ah->device = pd->device;
@@ -502,12 +522,14 @@ static struct ib_ah *_rdma_create_ah(struct ib_pd *pd,
* given address vector.
* @pd: The protection domain associated with the address handle.
* @ah_attr: The attributes of the address vector.
+ * @flags: Create address handle flags (see enum rdma_create_ah_flags).
*
* It returns the new address handle on success and an ERR_PTR() encoded
* error code on error.
* The address handle is used to reference a local or global destination
* in all UD QP post sends.
*/
-struct ib_ah *rdma_create_ah(struct ib_pd *pd, struct rdma_ah_attr *ah_attr)
+struct ib_ah *rdma_create_ah(struct ib_pd *pd, struct rdma_ah_attr *ah_attr,
+ u32 flags)
{
const struct ib_gid_attr *old_sgid_attr;
struct ib_ah *ah;
@@ -517,7 +539,7 @@ struct ib_ah *rdma_create_ah(struct ib_pd *pd, struct rdma_ah_attr *ah_attr)
if (ret)
return ERR_PTR(ret);
- ah = _rdma_create_ah(pd, ah_attr, NULL);
+ ah = _rdma_create_ah(pd, ah_attr, flags, NULL);
rdma_unfill_sgid_attr(ah_attr, old_sgid_attr);
return ah;
@@ -557,7 +579,7 @@ struct ib_ah *rdma_create_user_ah(struct ib_pd *pd,
}
}
- ah = _rdma_create_ah(pd, ah_attr, udata);
+ ah = _rdma_create_ah(pd, ah_attr, RDMA_CREATE_AH_SLEEPABLE, udata);
out:
rdma_unfill_sgid_attr(ah_attr, old_sgid_attr);
@@ -869,7 +891,7 @@ struct ib_ah *ib_create_ah_from_wc(struct ib_pd *pd, const struct ib_wc *wc,
if (ret)
return ERR_PTR(ret);
- ah = rdma_create_ah(pd, &ah_attr);
+ ah = rdma_create_ah(pd, &ah_attr, RDMA_CREATE_AH_SLEEPABLE);
rdma_destroy_ah_attr(&ah_attr);
return ah;
@@ -888,8 +910,8 @@ int rdma_modify_ah(struct ib_ah *ah, struct rdma_ah_attr *ah_attr)
if (ret)
return ret;
- ret = ah->device->modify_ah ?
- ah->device->modify_ah(ah, ah_attr) :
+ ret = ah->device->ops.modify_ah ?
+ ah->device->ops.modify_ah(ah, ah_attr) :
-EOPNOTSUPP;
ah->sgid_attr = rdma_update_sgid_attr(ah_attr, ah->sgid_attr);
@@ -902,20 +924,22 @@ int rdma_query_ah(struct ib_ah *ah, struct rdma_ah_attr *ah_attr)
{
ah_attr->grh.sgid_attr = NULL;
- return ah->device->query_ah ?
- ah->device->query_ah(ah, ah_attr) :
+ return ah->device->ops.query_ah ?
+ ah->device->ops.query_ah(ah, ah_attr) :
-EOPNOTSUPP;
}
EXPORT_SYMBOL(rdma_query_ah);
-int rdma_destroy_ah(struct ib_ah *ah)
+int rdma_destroy_ah(struct ib_ah *ah, u32 flags)
{
const struct ib_gid_attr *sgid_attr = ah->sgid_attr;
struct ib_pd *pd;
int ret;
+ might_sleep_if(flags & RDMA_DESTROY_AH_SLEEPABLE);
+
pd = ah->pd;
- ret = ah->device->destroy_ah(ah);
+ ret = ah->device->ops.destroy_ah(ah, flags);
if (!ret) {
atomic_dec(&pd->usecnt);
if (sgid_attr)
@@ -933,10 +957,10 @@ struct ib_srq *ib_create_srq(struct ib_pd *pd,
{
struct ib_srq *srq;
- if (!pd->device->create_srq)
+ if (!pd->device->ops.create_srq)
return ERR_PTR(-EOPNOTSUPP);
- srq = pd->device->create_srq(pd, srq_init_attr, NULL);
+ srq = pd->device->ops.create_srq(pd, srq_init_attr, NULL);
if (!IS_ERR(srq)) {
srq->device = pd->device;
@@ -965,17 +989,17 @@ int ib_modify_srq(struct ib_srq *srq,
struct ib_srq_attr *srq_attr,
enum ib_srq_attr_mask srq_attr_mask)
{
- return srq->device->modify_srq ?
- srq->device->modify_srq(srq, srq_attr, srq_attr_mask, NULL) :
- -EOPNOTSUPP;
+ return srq->device->ops.modify_srq ?
+ srq->device->ops.modify_srq(srq, srq_attr, srq_attr_mask,
+ NULL) : -EOPNOTSUPP;
}
EXPORT_SYMBOL(ib_modify_srq);
+/*
+ * Query an SRQ through the driver's query_srq op. Returns the op's result,
+ * or -EOPNOTSUPP when the driver does not provide one.
+ */
int ib_query_srq(struct ib_srq *srq,
struct ib_srq_attr *srq_attr)
{
- return srq->device->query_srq ?
- srq->device->query_srq(srq, srq_attr) : -EOPNOTSUPP;
+ return srq->device->ops.query_srq ?
+ srq->device->ops.query_srq(srq, srq_attr) : -EOPNOTSUPP;
}
EXPORT_SYMBOL(ib_query_srq);
@@ -997,7 +1021,7 @@ int ib_destroy_srq(struct ib_srq *srq)
if (srq_type == IB_SRQT_XRC)
xrcd = srq->ext.xrc.xrcd;
- ret = srq->device->destroy_srq(srq);
+ ret = srq->device->ops.destroy_srq(srq);
if (!ret) {
atomic_dec(&pd->usecnt);
if (srq_type == IB_SRQT_XRC)
@@ -1087,8 +1111,8 @@ struct ib_qp *ib_open_qp(struct ib_xrcd *xrcd,
}
EXPORT_SYMBOL(ib_open_qp);
-static struct ib_qp *ib_create_xrc_qp(struct ib_qp *qp,
- struct ib_qp_init_attr *qp_init_attr)
+static struct ib_qp *create_xrc_qp(struct ib_qp *qp,
+ struct ib_qp_init_attr *qp_init_attr)
{
struct ib_qp *real_qp = qp;
@@ -1103,10 +1127,10 @@ static struct ib_qp *ib_create_xrc_qp(struct ib_qp *qp,
qp = __ib_open_qp(real_qp, qp_init_attr->event_handler,
qp_init_attr->qp_context);
- if (!IS_ERR(qp))
- __ib_insert_xrcd_qp(qp_init_attr->xrcd, real_qp);
- else
- real_qp->device->destroy_qp(real_qp);
+ if (IS_ERR(qp))
+ return qp;
+
+ __ib_insert_xrcd_qp(qp_init_attr->xrcd, real_qp);
return qp;
}
@@ -1137,10 +1161,8 @@ struct ib_qp *ib_create_qp(struct ib_pd *pd,
return qp;
ret = ib_create_qp_security(qp, device);
- if (ret) {
- ib_destroy_qp(qp);
- return ERR_PTR(ret);
- }
+ if (ret)
+ goto err;
qp->real_qp = qp;
qp->qp_type = qp_init_attr->qp_type;
@@ -1153,8 +1175,15 @@ struct ib_qp *ib_create_qp(struct ib_pd *pd,
INIT_LIST_HEAD(&qp->sig_mrs);
qp->port = 0;
- if (qp_init_attr->qp_type == IB_QPT_XRC_TGT)
- return ib_create_xrc_qp(qp, qp_init_attr);
+ if (qp_init_attr->qp_type == IB_QPT_XRC_TGT) {
+ struct ib_qp *xrc_qp = create_xrc_qp(qp, qp_init_attr);
+
+ if (IS_ERR(xrc_qp)) {
+ ret = PTR_ERR(xrc_qp);
+ goto err;
+ }
+ return xrc_qp;
+ }
qp->event_handler = qp_init_attr->event_handler;
qp->qp_context = qp_init_attr->qp_context;
@@ -1181,11 +1210,8 @@ struct ib_qp *ib_create_qp(struct ib_pd *pd,
if (qp_init_attr->cap.max_rdma_ctxs) {
ret = rdma_rw_init_mrs(qp, qp_init_attr);
- if (ret) {
- pr_err("failed to init MR pool ret= %d\n", ret);
- ib_destroy_qp(qp);
- return ERR_PTR(ret);
- }
+ if (ret)
+ goto err;
}
/*
@@ -1198,6 +1224,11 @@ struct ib_qp *ib_create_qp(struct ib_pd *pd,
device->attrs.max_sge_rd);
return qp;
+
+err:
+ ib_destroy_qp(qp);
+ return ERR_PTR(ret);
+
}
EXPORT_SYMBOL(ib_create_qp);
@@ -1692,10 +1723,7 @@ int ib_get_eth_speed(struct ib_device *dev, u8 port_num, u8 *speed, u8 *width)
if (rdma_port_get_link_layer(dev, port_num) != IB_LINK_LAYER_ETHERNET)
return -EINVAL;
- if (!dev->get_netdev)
- return -EOPNOTSUPP;
-
- netdev = dev->get_netdev(dev, port_num);
+ netdev = ib_device_get_netdev(dev, port_num);
if (!netdev)
return -ENODEV;
@@ -1753,9 +1781,9 @@ int ib_query_qp(struct ib_qp *qp,
qp_attr->ah_attr.grh.sgid_attr = NULL;
qp_attr->alt_ah_attr.grh.sgid_attr = NULL;
- return qp->device->query_qp ?
- qp->device->query_qp(qp->real_qp, qp_attr, qp_attr_mask, qp_init_attr) :
- -EOPNOTSUPP;
+ return qp->device->ops.query_qp ?
+ qp->device->ops.query_qp(qp->real_qp, qp_attr, qp_attr_mask,
+ qp_init_attr) : -EOPNOTSUPP;
}
EXPORT_SYMBOL(ib_query_qp);
@@ -1841,7 +1869,7 @@ int ib_destroy_qp(struct ib_qp *qp)
rdma_rw_cleanup_mrs(qp);
rdma_restrack_del(&qp->res);
- ret = qp->device->destroy_qp(qp);
+ ret = qp->device->ops.destroy_qp(qp);
if (!ret) {
if (alt_path_sgid_attr)
rdma_put_gid_attr(alt_path_sgid_attr);
@@ -1879,7 +1907,7 @@ struct ib_cq *__ib_create_cq(struct ib_device *device,
{
struct ib_cq *cq;
- cq = device->create_cq(device, cq_attr, NULL, NULL);
+ cq = device->ops.create_cq(device, cq_attr, NULL, NULL);
if (!IS_ERR(cq)) {
cq->device = device;
@@ -1890,7 +1918,7 @@ struct ib_cq *__ib_create_cq(struct ib_device *device,
atomic_set(&cq->usecnt, 0);
cq->res.type = RDMA_RESTRACK_CQ;
rdma_restrack_set_task(&cq->res, caller);
- rdma_restrack_add(&cq->res);
+ rdma_restrack_kadd(&cq->res);
}
return cq;
@@ -1899,8 +1927,9 @@ EXPORT_SYMBOL(__ib_create_cq);
+/*
+ * Pass @cq_count and @cq_period to the driver's modify_cq op. Returns the
+ * op's result, or -EOPNOTSUPP when the driver does not provide one.
+ */
int rdma_set_cq_moderation(struct ib_cq *cq, u16 cq_count, u16 cq_period)
{
- return cq->device->modify_cq ?
- cq->device->modify_cq(cq, cq_count, cq_period) : -EOPNOTSUPP;
+ return cq->device->ops.modify_cq ?
+ cq->device->ops.modify_cq(cq, cq_count,
+ cq_period) : -EOPNOTSUPP;
}
EXPORT_SYMBOL(rdma_set_cq_moderation);
@@ -1910,14 +1939,14 @@ int ib_destroy_cq(struct ib_cq *cq)
return -EBUSY;
rdma_restrack_del(&cq->res);
- return cq->device->destroy_cq(cq);
+ return cq->device->ops.destroy_cq(cq);
}
EXPORT_SYMBOL(ib_destroy_cq);
+/*
+ * Resize a CQ to @cqe entries through the driver's resize_cq op, passing no
+ * udata. Returns the op's result, or -EOPNOTSUPP when the driver does not
+ * provide one.
+ */
int ib_resize_cq(struct ib_cq *cq, int cqe)
{
- return cq->device->resize_cq ?
- cq->device->resize_cq(cq, cqe, NULL) : -EOPNOTSUPP;
+ return cq->device->ops.resize_cq ?
+ cq->device->ops.resize_cq(cq, cqe, NULL) : -EOPNOTSUPP;
}
EXPORT_SYMBOL(ib_resize_cq);
@@ -1930,7 +1959,7 @@ int ib_dereg_mr(struct ib_mr *mr)
int ret;
rdma_restrack_del(&mr->res);
- ret = mr->device->dereg_mr(mr);
+ ret = mr->device->ops.dereg_mr(mr);
if (!ret) {
atomic_dec(&pd->usecnt);
if (dm)
@@ -1959,10 +1988,10 @@ struct ib_mr *ib_alloc_mr(struct ib_pd *pd,
{
struct ib_mr *mr;
- if (!pd->device->alloc_mr)
+ if (!pd->device->ops.alloc_mr)
return ERR_PTR(-EOPNOTSUPP);
- mr = pd->device->alloc_mr(pd, mr_type, max_num_sg);
+ mr = pd->device->ops.alloc_mr(pd, mr_type, max_num_sg);
if (!IS_ERR(mr)) {
mr->device = pd->device;
mr->pd = pd;
@@ -1971,7 +2000,7 @@ struct ib_mr *ib_alloc_mr(struct ib_pd *pd,
atomic_inc(&pd->usecnt);
mr->need_inval = false;
mr->res.type = RDMA_RESTRACK_MR;
- rdma_restrack_add(&mr->res);
+ rdma_restrack_kadd(&mr->res);
}
return mr;
@@ -1986,10 +2015,10 @@ struct ib_fmr *ib_alloc_fmr(struct ib_pd *pd,
{
struct ib_fmr *fmr;
- if (!pd->device->alloc_fmr)
+ if (!pd->device->ops.alloc_fmr)
return ERR_PTR(-EOPNOTSUPP);
- fmr = pd->device->alloc_fmr(pd, mr_access_flags, fmr_attr);
+ fmr = pd->device->ops.alloc_fmr(pd, mr_access_flags, fmr_attr);
if (!IS_ERR(fmr)) {
fmr->device = pd->device;
fmr->pd = pd;
@@ -2008,7 +2037,7 @@ int ib_unmap_fmr(struct list_head *fmr_list)
return 0;
fmr = list_entry(fmr_list->next, struct ib_fmr, list);
- return fmr->device->unmap_fmr(fmr_list);
+ return fmr->device->ops.unmap_fmr(fmr_list);
}
EXPORT_SYMBOL(ib_unmap_fmr);
@@ -2018,7 +2047,7 @@ int ib_dealloc_fmr(struct ib_fmr *fmr)
int ret;
pd = fmr->pd;
- ret = fmr->device->dealloc_fmr(fmr);
+ ret = fmr->device->ops.dealloc_fmr(fmr);
if (!ret)
atomic_dec(&pd->usecnt);
@@ -2070,14 +2099,14 @@ int ib_attach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid)
{
int ret;
- if (!qp->device->attach_mcast)
+ if (!qp->device->ops.attach_mcast)
return -EOPNOTSUPP;
if (!rdma_is_multicast_addr((struct in6_addr *)gid->raw) ||
qp->qp_type != IB_QPT_UD || !is_valid_mcast_lid(qp, lid))
return -EINVAL;
- ret = qp->device->attach_mcast(qp, gid, lid);
+ ret = qp->device->ops.attach_mcast(qp, gid, lid);
if (!ret)
atomic_inc(&qp->usecnt);
return ret;
@@ -2088,14 +2117,14 @@ int ib_detach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid)
{
int ret;
- if (!qp->device->detach_mcast)
+ if (!qp->device->ops.detach_mcast)
return -EOPNOTSUPP;
if (!rdma_is_multicast_addr((struct in6_addr *)gid->raw) ||
qp->qp_type != IB_QPT_UD || !is_valid_mcast_lid(qp, lid))
return -EINVAL;
- ret = qp->device->detach_mcast(qp, gid, lid);
+ ret = qp->device->ops.detach_mcast(qp, gid, lid);
if (!ret)
atomic_dec(&qp->usecnt);
return ret;
@@ -2106,10 +2135,10 @@ struct ib_xrcd *__ib_alloc_xrcd(struct ib_device *device, const char *caller)
{
struct ib_xrcd *xrcd;
- if (!device->alloc_xrcd)
+ if (!device->ops.alloc_xrcd)
return ERR_PTR(-EOPNOTSUPP);
- xrcd = device->alloc_xrcd(device, NULL, NULL);
+ xrcd = device->ops.alloc_xrcd(device, NULL, NULL);
if (!IS_ERR(xrcd)) {
xrcd->device = device;
xrcd->inode = NULL;
@@ -2137,7 +2166,7 @@ int ib_dealloc_xrcd(struct ib_xrcd *xrcd)
return ret;
}
- return xrcd->device->dealloc_xrcd(xrcd);
+ return xrcd->device->ops.dealloc_xrcd(xrcd);
}
EXPORT_SYMBOL(ib_dealloc_xrcd);
@@ -2160,10 +2189,10 @@ struct ib_wq *ib_create_wq(struct ib_pd *pd,
{
struct ib_wq *wq;
- if (!pd->device->create_wq)
+ if (!pd->device->ops.create_wq)
return ERR_PTR(-EOPNOTSUPP);
- wq = pd->device->create_wq(pd, wq_attr, NULL);
+ wq = pd->device->ops.create_wq(pd, wq_attr, NULL);
if (!IS_ERR(wq)) {
wq->event_handler = wq_attr->event_handler;
wq->wq_context = wq_attr->wq_context;
@@ -2193,7 +2222,7 @@ int ib_destroy_wq(struct ib_wq *wq)
if (atomic_read(&wq->usecnt))
return -EBUSY;
- err = wq->device->destroy_wq(wq);
+ err = wq->device->ops.destroy_wq(wq);
if (!err) {
atomic_dec(&pd->usecnt);
atomic_dec(&cq->usecnt);
@@ -2215,10 +2244,10 @@ int ib_modify_wq(struct ib_wq *wq, struct ib_wq_attr *wq_attr,
{
int err;
- if (!wq->device->modify_wq)
+ if (!wq->device->ops.modify_wq)
return -EOPNOTSUPP;
- err = wq->device->modify_wq(wq, wq_attr, wq_attr_mask, NULL);
+ err = wq->device->ops.modify_wq(wq, wq_attr, wq_attr_mask, NULL);
return err;
}
EXPORT_SYMBOL(ib_modify_wq);
@@ -2240,12 +2269,12 @@ struct ib_rwq_ind_table *ib_create_rwq_ind_table(struct ib_device *device,
int i;
u32 table_size;
- if (!device->create_rwq_ind_table)
+ if (!device->ops.create_rwq_ind_table)
return ERR_PTR(-EOPNOTSUPP);
table_size = (1 << init_attr->log_ind_tbl_size);
- rwq_ind_table = device->create_rwq_ind_table(device,
- init_attr, NULL);
+ rwq_ind_table = device->ops.create_rwq_ind_table(device,
+ init_attr, NULL);
if (IS_ERR(rwq_ind_table))
return rwq_ind_table;
@@ -2275,7 +2304,7 @@ int ib_destroy_rwq_ind_table(struct ib_rwq_ind_table *rwq_ind_table)
if (atomic_read(&rwq_ind_table->usecnt))
return -EBUSY;
- err = rwq_ind_table->device->destroy_rwq_ind_table(rwq_ind_table);
+ err = rwq_ind_table->device->ops.destroy_rwq_ind_table(rwq_ind_table);
if (!err) {
for (i = 0; i < table_size; i++)
atomic_dec(&ind_tbl[i]->usecnt);
@@ -2288,48 +2317,50 @@ EXPORT_SYMBOL(ib_destroy_rwq_ind_table);
int ib_check_mr_status(struct ib_mr *mr, u32 check_mask,
struct ib_mr_status *mr_status)
{
- return mr->device->check_mr_status ?
- mr->device->check_mr_status(mr, check_mask, mr_status) : -EOPNOTSUPP;
+ if (!mr->device->ops.check_mr_status)
+ return -EOPNOTSUPP;
+
+ return mr->device->ops.check_mr_status(mr, check_mask, mr_status);
}
EXPORT_SYMBOL(ib_check_mr_status);
int ib_set_vf_link_state(struct ib_device *device, int vf, u8 port,
int state)
{
- if (!device->set_vf_link_state)
+ if (!device->ops.set_vf_link_state)
return -EOPNOTSUPP;
- return device->set_vf_link_state(device, vf, port, state);
+ return device->ops.set_vf_link_state(device, vf, port, state);
}
EXPORT_SYMBOL(ib_set_vf_link_state);
int ib_get_vf_config(struct ib_device *device, int vf, u8 port,
struct ifla_vf_info *info)
{
- if (!device->get_vf_config)
+ if (!device->ops.get_vf_config)
return -EOPNOTSUPP;
- return device->get_vf_config(device, vf, port, info);
+ return device->ops.get_vf_config(device, vf, port, info);
}
EXPORT_SYMBOL(ib_get_vf_config);
int ib_get_vf_stats(struct ib_device *device, int vf, u8 port,
struct ifla_vf_stats *stats)
{
- if (!device->get_vf_stats)
+ if (!device->ops.get_vf_stats)
return -EOPNOTSUPP;
- return device->get_vf_stats(device, vf, port, stats);
+ return device->ops.get_vf_stats(device, vf, port, stats);
}
EXPORT_SYMBOL(ib_get_vf_stats);
int ib_set_vf_guid(struct ib_device *device, int vf, u8 port, u64 guid,
int type)
{
- if (!device->set_vf_guid)
+ if (!device->ops.set_vf_guid)
return -EOPNOTSUPP;
- return device->set_vf_guid(device, vf, port, guid, type);
+ return device->ops.set_vf_guid(device, vf, port, guid, type);
}
EXPORT_SYMBOL(ib_set_vf_guid);
@@ -2361,12 +2392,12 @@ EXPORT_SYMBOL(ib_set_vf_guid);
int ib_map_mr_sg(struct ib_mr *mr, struct scatterlist *sg, int sg_nents,
unsigned int *sg_offset, unsigned int page_size)
{
- if (unlikely(!mr->device->map_mr_sg))
+ if (unlikely(!mr->device->ops.map_mr_sg))
return -EOPNOTSUPP;
mr->page_size = page_size;
- return mr->device->map_mr_sg(mr, sg, sg_nents, sg_offset);
+ return mr->device->ops.map_mr_sg(mr, sg, sg_nents, sg_offset);
}
EXPORT_SYMBOL(ib_map_mr_sg);
@@ -2565,8 +2596,8 @@ static void __ib_drain_rq(struct ib_qp *qp)
*/
void ib_drain_sq(struct ib_qp *qp)
{
- if (qp->device->drain_sq)
- qp->device->drain_sq(qp);
+ if (qp->device->ops.drain_sq)
+ qp->device->ops.drain_sq(qp);
else
__ib_drain_sq(qp);
}
@@ -2593,8 +2624,8 @@ EXPORT_SYMBOL(ib_drain_sq);
*/
void ib_drain_rq(struct ib_qp *qp)
{
- if (qp->device->drain_rq)
- qp->device->drain_rq(qp);
+ if (qp->device->ops.drain_rq)
+ qp->device->ops.drain_rq(qp);
else
__ib_drain_rq(qp);
}
@@ -2632,10 +2663,11 @@ struct net_device *rdma_alloc_netdev(struct ib_device *device, u8 port_num,
struct net_device *netdev;
int rc;
- if (!device->rdma_netdev_get_params)
+ if (!device->ops.rdma_netdev_get_params)
return ERR_PTR(-EOPNOTSUPP);
- rc = device->rdma_netdev_get_params(device, port_num, type, &params);
+ rc = device->ops.rdma_netdev_get_params(device, port_num, type,
+ &params);
if (rc)
return ERR_PTR(rc);
@@ -2657,10 +2689,11 @@ int rdma_init_netdev(struct ib_device *device, u8 port_num,
struct rdma_netdev_alloc_params params;
int rc;
- if (!device->rdma_netdev_get_params)
+ if (!device->ops.rdma_netdev_get_params)
return -EOPNOTSUPP;
- rc = device->rdma_netdev_get_params(device, port_num, type, &params);
+ rc = device->ops.rdma_netdev_get_params(device, port_num, type,
+ &params);
if (rc)
return rc;
diff --git a/drivers/infiniband/hw/bnxt_re/Kconfig b/drivers/infiniband/hw/bnxt_re/Kconfig
index 18f5ed082f41..d25439c305f7 100644
--- a/drivers/infiniband/hw/bnxt_re/Kconfig
+++ b/drivers/infiniband/hw/bnxt_re/Kconfig
@@ -1,7 +1,7 @@
config INFINIBAND_BNXT_RE
tristate "Broadcom Netxtreme HCA support"
+ depends on 64BIT
depends on ETHERNET && NETDEVICES && PCI && INET && DCB
- depends on MAY_USE_DEVLINK
select NET_VENDOR_BROADCOM
select BNXT
---help---
diff --git a/drivers/infiniband/hw/bnxt_re/Makefile b/drivers/infiniband/hw/bnxt_re/Makefile
index 6e3bc25cc140..ee9bb1be61ea 100644
--- a/drivers/infiniband/hw/bnxt_re/Makefile
+++ b/drivers/infiniband/hw/bnxt_re/Makefile
@@ -1,6 +1,6 @@
# SPDX-License-Identifier: GPL-2.0
-ccflags-y := -Idrivers/net/ethernet/broadcom/bnxt
+ccflags-y := -I $(srctree)/drivers/net/ethernet/broadcom/bnxt
obj-$(CONFIG_INFINIBAND_BNXT_RE) += bnxt_re.o
bnxt_re-y := main.o ib_verbs.o \
qplib_res.o qplib_rcfw.o \
diff --git a/drivers/infiniband/hw/bnxt_re/bnxt_re.h b/drivers/infiniband/hw/bnxt_re/bnxt_re.h
index 31baa8939a4f..e55a1666c0cd 100644
--- a/drivers/infiniband/hw/bnxt_re/bnxt_re.h
+++ b/drivers/infiniband/hw/bnxt_re/bnxt_re.h
@@ -124,6 +124,7 @@ struct bnxt_re_dev {
#define BNXT_RE_FLAG_ISSUE_ROCE_STATS 29
struct net_device *netdev;
unsigned int version, major, minor;
+ struct bnxt_qplib_chip_ctx chip_ctx;
struct bnxt_en_dev *en_dev;
struct bnxt_msix_entry msix_entries[BNXT_RE_MAX_MSIX];
int num_msix;
diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c
index 54fdd4cf5288..071b2fc38b0b 100644
--- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c
+++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c
@@ -48,6 +48,7 @@
#include <rdma/ib_addr.h>
#include <rdma/ib_mad.h>
#include <rdma/ib_cache.h>
+#include <rdma/uverbs_ioctl.h>
#include "bnxt_ulp.h"
@@ -563,41 +564,29 @@ fail:
}
/* Protection Domains */
-int bnxt_re_dealloc_pd(struct ib_pd *ib_pd)
+void bnxt_re_dealloc_pd(struct ib_pd *ib_pd)
{
struct bnxt_re_pd *pd = container_of(ib_pd, struct bnxt_re_pd, ib_pd);
struct bnxt_re_dev *rdev = pd->rdev;
- int rc;
bnxt_re_destroy_fence_mr(pd);
- if (pd->qplib_pd.id) {
- rc = bnxt_qplib_dealloc_pd(&rdev->qplib_res,
- &rdev->qplib_res.pd_tbl,
- &pd->qplib_pd);
- if (rc)
- dev_err(rdev_to_dev(rdev), "Failed to deallocate HW PD");
- }
-
- kfree(pd);
- return 0;
+ if (pd->qplib_pd.id)
+ bnxt_qplib_dealloc_pd(&rdev->qplib_res, &rdev->qplib_res.pd_tbl,
+ &pd->qplib_pd);
}
-struct ib_pd *bnxt_re_alloc_pd(struct ib_device *ibdev,
- struct ib_ucontext *ucontext,
- struct ib_udata *udata)
+int bnxt_re_alloc_pd(struct ib_pd *ibpd, struct ib_ucontext *ucontext,
+ struct ib_udata *udata)
{
+ struct ib_device *ibdev = ibpd->device;
struct bnxt_re_dev *rdev = to_bnxt_re_dev(ibdev, ibdev);
struct bnxt_re_ucontext *ucntx = container_of(ucontext,
struct bnxt_re_ucontext,
ib_uctx);
- struct bnxt_re_pd *pd;
+ struct bnxt_re_pd *pd = container_of(ibpd, struct bnxt_re_pd, ib_pd);
int rc;
- pd = kzalloc(sizeof(*pd), GFP_KERNEL);
- if (!pd)
- return ERR_PTR(-ENOMEM);
-
pd->rdev = rdev;
if (bnxt_qplib_alloc_pd(&rdev->qplib_res.pd_tbl, &pd->qplib_pd)) {
dev_err(rdev_to_dev(rdev), "Failed to allocate HW PD");
@@ -637,23 +626,23 @@ struct ib_pd *bnxt_re_alloc_pd(struct ib_device *ibdev,
if (bnxt_re_create_fence_mr(pd))
dev_warn(rdev_to_dev(rdev),
"Failed to create Fence-MR\n");
- return &pd->ib_pd;
+ return 0;
dbfail:
- (void)bnxt_qplib_dealloc_pd(&rdev->qplib_res, &rdev->qplib_res.pd_tbl,
- &pd->qplib_pd);
+ bnxt_qplib_dealloc_pd(&rdev->qplib_res, &rdev->qplib_res.pd_tbl,
+ &pd->qplib_pd);
fail:
- kfree(pd);
- return ERR_PTR(rc);
+ return rc;
}
/* Address Handles */
-int bnxt_re_destroy_ah(struct ib_ah *ib_ah)
+int bnxt_re_destroy_ah(struct ib_ah *ib_ah, u32 flags)
{
struct bnxt_re_ah *ah = container_of(ib_ah, struct bnxt_re_ah, ib_ah);
struct bnxt_re_dev *rdev = ah->rdev;
int rc;
- rc = bnxt_qplib_destroy_ah(&rdev->qplib_res, &ah->qplib_ah);
+ rc = bnxt_qplib_destroy_ah(&rdev->qplib_res, &ah->qplib_ah,
+ !(flags & RDMA_DESTROY_AH_SLEEPABLE));
if (rc) {
dev_err(rdev_to_dev(rdev), "Failed to destroy HW AH");
return rc;
@@ -662,16 +651,36 @@ int bnxt_re_destroy_ah(struct ib_ah *ib_ah)
return 0;
}
+static u8 bnxt_re_stack_to_dev_nw_type(enum rdma_network_type ntype)
+{
+ u8 nw_type;
+
+ switch (ntype) {
+ case RDMA_NETWORK_IPV4:
+ nw_type = CMDQ_CREATE_AH_TYPE_V2IPV4;
+ break;
+ case RDMA_NETWORK_IPV6:
+ nw_type = CMDQ_CREATE_AH_TYPE_V2IPV6;
+ break;
+ default:
+ nw_type = CMDQ_CREATE_AH_TYPE_V1;
+ break;
+ }
+ return nw_type;
+}
+
struct ib_ah *bnxt_re_create_ah(struct ib_pd *ib_pd,
struct rdma_ah_attr *ah_attr,
+ u32 flags,
struct ib_udata *udata)
{
struct bnxt_re_pd *pd = container_of(ib_pd, struct bnxt_re_pd, ib_pd);
+ const struct ib_global_route *grh = rdma_ah_read_grh(ah_attr);
struct bnxt_re_dev *rdev = pd->rdev;
+ const struct ib_gid_attr *sgid_attr;
struct bnxt_re_ah *ah;
- const struct ib_global_route *grh = rdma_ah_read_grh(ah_attr);
- int rc;
u8 nw_type;
+ int rc;
if (!(rdma_ah_get_ah_flags(ah_attr) & IB_AH_GRH)) {
dev_err(rdev_to_dev(rdev), "Failed to alloc AH: GRH not set");
@@ -698,44 +707,27 @@ struct ib_ah *bnxt_re_create_ah(struct ib_pd *ib_pd,
ah->qplib_ah.flow_label = grh->flow_label;
ah->qplib_ah.hop_limit = grh->hop_limit;
ah->qplib_ah.sl = rdma_ah_get_sl(ah_attr);
- if (ib_pd->uobject &&
- !rdma_is_multicast_addr((struct in6_addr *)
- grh->dgid.raw) &&
- !rdma_link_local_addr((struct in6_addr *)
- grh->dgid.raw)) {
- const struct ib_gid_attr *sgid_attr;
- sgid_attr = grh->sgid_attr;
- /* Get network header type for this GID */
- nw_type = rdma_gid_attr_network_type(sgid_attr);
- switch (nw_type) {
- case RDMA_NETWORK_IPV4:
- ah->qplib_ah.nw_type = CMDQ_CREATE_AH_TYPE_V2IPV4;
- break;
- case RDMA_NETWORK_IPV6:
- ah->qplib_ah.nw_type = CMDQ_CREATE_AH_TYPE_V2IPV6;
- break;
- default:
- ah->qplib_ah.nw_type = CMDQ_CREATE_AH_TYPE_V1;
- break;
- }
- }
+ sgid_attr = grh->sgid_attr;
+ /* Get network header type for this GID */
+ nw_type = rdma_gid_attr_network_type(sgid_attr);
+ ah->qplib_ah.nw_type = bnxt_re_stack_to_dev_nw_type(nw_type);
memcpy(ah->qplib_ah.dmac, ah_attr->roce.dmac, ETH_ALEN);
- rc = bnxt_qplib_create_ah(&rdev->qplib_res, &ah->qplib_ah);
+ rc = bnxt_qplib_create_ah(&rdev->qplib_res, &ah->qplib_ah,
+ !(flags & RDMA_CREATE_AH_SLEEPABLE));
if (rc) {
dev_err(rdev_to_dev(rdev), "Failed to allocate HW AH");
goto fail;
}
/* Write AVID to shared page. */
- if (ib_pd->uobject) {
- struct ib_ucontext *ib_uctx = ib_pd->uobject->context;
- struct bnxt_re_ucontext *uctx;
+ if (udata) {
+ struct bnxt_re_ucontext *uctx = rdma_udata_to_drv_context(
+ udata, struct bnxt_re_ucontext, ib_uctx);
unsigned long flag;
u32 *wrptr;
- uctx = container_of(ib_uctx, struct bnxt_re_ucontext, ib_uctx);
spin_lock_irqsave(&uctx->sh_lock, flag);
wrptr = (u32 *)(uctx->shpg + BNXT_RE_AVID_OFFT);
*wrptr = ah->qplib_ah.id;
@@ -801,8 +793,8 @@ int bnxt_re_destroy_qp(struct ib_qp *ib_qp)
{
struct bnxt_re_qp *qp = container_of(ib_qp, struct bnxt_re_qp, ib_qp);
struct bnxt_re_dev *rdev = qp->rdev;
- int rc;
unsigned int flags;
+ int rc;
bnxt_qplib_flush_cqn_wq(&qp->qplib_qp);
rc = bnxt_qplib_destroy_qp(&rdev->qplib_res, &qp->qplib_qp);
@@ -811,14 +803,17 @@ int bnxt_re_destroy_qp(struct ib_qp *ib_qp)
return rc;
}
- flags = bnxt_re_lock_cqs(qp);
- bnxt_qplib_clean_qp(&qp->qplib_qp);
- bnxt_re_unlock_cqs(qp, flags);
+ if (rdma_is_kernel_res(&qp->ib_qp.res)) {
+ flags = bnxt_re_lock_cqs(qp);
+ bnxt_qplib_clean_qp(&qp->qplib_qp);
+ bnxt_re_unlock_cqs(qp, flags);
+ }
+
bnxt_qplib_free_qp_res(&rdev->qplib_res, &qp->qplib_qp);
if (ib_qp->qp_type == IB_QPT_GSI && rdev->qp1_sqp) {
rc = bnxt_qplib_destroy_ah(&rdev->qplib_res,
- &rdev->sqp_ah->qplib_ah);
+ &rdev->sqp_ah->qplib_ah, false);
if (rc) {
dev_err(rdev_to_dev(rdev),
"Failed to destroy HW AH for shadow QP");
@@ -879,21 +874,23 @@ static int bnxt_re_init_user_qp(struct bnxt_re_dev *rdev, struct bnxt_re_pd *pd,
struct bnxt_re_qp_req ureq;
struct bnxt_qplib_qp *qplib_qp = &qp->qplib_qp;
struct ib_umem *umem;
- int bytes = 0;
- struct ib_ucontext *context = pd->ib_pd.uobject->context;
- struct bnxt_re_ucontext *cntx = container_of(context,
- struct bnxt_re_ucontext,
- ib_uctx);
+ int bytes = 0, psn_sz;
+ struct bnxt_re_ucontext *cntx = rdma_udata_to_drv_context(
+ udata, struct bnxt_re_ucontext, ib_uctx);
+
if (ib_copy_from_udata(&ureq, udata, sizeof(ureq)))
return -EFAULT;
bytes = (qplib_qp->sq.max_wqe * BNXT_QPLIB_MAX_SQE_ENTRY_SIZE);
/* Consider mapping PSN search memory only for RC QPs. */
- if (qplib_qp->type == CMDQ_CREATE_QP_TYPE_RC)
- bytes += (qplib_qp->sq.max_wqe * sizeof(struct sq_psn_search));
+ if (qplib_qp->type == CMDQ_CREATE_QP_TYPE_RC) {
+ psn_sz = bnxt_qplib_is_chip_gen_p5(&rdev->chip_ctx) ?
+ sizeof(struct sq_psn_search_ext) :
+ sizeof(struct sq_psn_search);
+ bytes += (qplib_qp->sq.max_wqe * psn_sz);
+ }
bytes = PAGE_ALIGN(bytes);
- umem = ib_umem_get(context, ureq.qpsva, bytes,
- IB_ACCESS_LOCAL_WRITE, 1);
+ umem = ib_umem_get(udata, ureq.qpsva, bytes, IB_ACCESS_LOCAL_WRITE, 1);
if (IS_ERR(umem))
return PTR_ERR(umem);
@@ -905,7 +902,7 @@ static int bnxt_re_init_user_qp(struct bnxt_re_dev *rdev, struct bnxt_re_pd *pd,
if (!qp->qplib_qp.srq) {
bytes = (qplib_qp->rq.max_wqe * BNXT_QPLIB_MAX_RQE_ENTRY_SIZE);
bytes = PAGE_ALIGN(bytes);
- umem = ib_umem_get(context, ureq.qprva, bytes,
+ umem = ib_umem_get(udata, ureq.qprva, bytes,
IB_ACCESS_LOCAL_WRITE, 1);
if (IS_ERR(umem))
goto rqfail;
@@ -958,7 +955,7 @@ static struct bnxt_re_ah *bnxt_re_create_shadow_qp_ah
/* Have DMAC same as SMAC */
ether_addr_copy(ah->qplib_ah.dmac, rdev->netdev->dev_addr);
- rc = bnxt_qplib_create_ah(&rdev->qplib_res, &ah->qplib_ah);
+ rc = bnxt_qplib_create_ah(&rdev->qplib_res, &ah->qplib_ah, false);
if (rc) {
dev_err(rdev_to_dev(rdev),
"Failed to allocate HW AH for Shadow QP");
@@ -1063,12 +1060,17 @@ struct ib_qp *bnxt_re_create_qp(struct ib_pd *ib_pd,
qp->qplib_qp.pd = &pd->qplib_pd;
qp->qplib_qp.qp_handle = (u64)(unsigned long)(&qp->qplib_qp);
qp->qplib_qp.type = __from_ib_qp_type(qp_init_attr->qp_type);
+
+ if (qp_init_attr->qp_type == IB_QPT_GSI &&
+ bnxt_qplib_is_chip_gen_p5(&rdev->chip_ctx))
+ qp->qplib_qp.type = CMDQ_CREATE_QP_TYPE_GSI;
if (qp->qplib_qp.type == IB_QPT_MAX) {
dev_err(rdev_to_dev(rdev), "QP type 0x%x not supported",
qp->qplib_qp.type);
rc = -EINVAL;
goto fail;
}
+
qp->qplib_qp.max_inline_data = qp_init_attr->cap.max_inline_data;
qp->qplib_qp.sig_type = ((qp_init_attr->sq_sig_type ==
IB_SIGNAL_ALL_WR) ? true : false);
@@ -1129,7 +1131,8 @@ struct ib_qp *bnxt_re_create_qp(struct ib_pd *ib_pd,
qp->qplib_qp.mtu = ib_mtu_enum_to_int(iboe_get_mtu(rdev->netdev->mtu));
- if (qp_init_attr->qp_type == IB_QPT_GSI) {
+ if (qp_init_attr->qp_type == IB_QPT_GSI &&
+ !(bnxt_qplib_is_chip_gen_p5(&rdev->chip_ctx))) {
/* Allocate 1 more than what's provided */
entries = roundup_pow_of_two(qp_init_attr->cap.max_send_wr + 1);
qp->qplib_qp.sq.max_wqe = min_t(u32, entries,
@@ -1358,17 +1361,15 @@ static int bnxt_re_init_user_srq(struct bnxt_re_dev *rdev,
struct bnxt_qplib_srq *qplib_srq = &srq->qplib_srq;
struct ib_umem *umem;
int bytes = 0;
- struct ib_ucontext *context = pd->ib_pd.uobject->context;
- struct bnxt_re_ucontext *cntx = container_of(context,
- struct bnxt_re_ucontext,
- ib_uctx);
+ struct bnxt_re_ucontext *cntx = rdma_udata_to_drv_context(
+ udata, struct bnxt_re_ucontext, ib_uctx);
+
if (ib_copy_from_udata(&ureq, udata, sizeof(ureq)))
return -EFAULT;
bytes = (qplib_srq->max_wqe * BNXT_QPLIB_MAX_RQE_ENTRY_SIZE);
bytes = PAGE_ALIGN(bytes);
- umem = ib_umem_get(context, ureq.srqva, bytes,
- IB_ACCESS_LOCAL_WRITE, 1);
+ umem = ib_umem_get(udata, ureq.srqva, bytes, IB_ACCESS_LOCAL_WRITE, 1);
if (IS_ERR(umem))
return PTR_ERR(umem);
@@ -1643,6 +1644,9 @@ int bnxt_re_modify_qp(struct ib_qp *ib_qp, struct ib_qp_attr *qp_attr,
__from_ib_access_flags(qp_attr->qp_access_flags);
/* LOCAL_WRITE access must be set to allow RC receive */
qp->qplib_qp.access |= BNXT_QPLIB_ACCESS_LOCAL_WRITE;
+ /* Temp: Set all params on QP as of now */
+ qp->qplib_qp.access |= CMDQ_MODIFY_QP_ACCESS_REMOTE_WRITE;
+ qp->qplib_qp.access |= CMDQ_MODIFY_QP_ACCESS_REMOTE_READ;
}
if (qp_attr_mask & IB_QP_PKEY_INDEX) {
qp->qplib_qp.modify_flags |= CMDQ_MODIFY_QP_MODIFY_MASK_PKEY;
@@ -2090,7 +2094,8 @@ static int bnxt_re_build_qp1_shadow_qp_recv(struct bnxt_re_qp *qp,
static int is_ud_qp(struct bnxt_re_qp *qp)
{
- return qp->qplib_qp.type == CMDQ_CREATE_QP_TYPE_UD;
+ return (qp->qplib_qp.type == CMDQ_CREATE_QP_TYPE_UD ||
+ qp->qplib_qp.type == CMDQ_CREATE_QP_TYPE_GSI);
}
static int bnxt_re_build_send_wqe(struct bnxt_re_qp *qp,
@@ -2394,7 +2399,7 @@ int bnxt_re_post_send(struct ib_qp *ib_qp, const struct ib_send_wr *wr,
switch (wr->opcode) {
case IB_WR_SEND:
case IB_WR_SEND_WITH_IMM:
- if (ib_qp->qp_type == IB_QPT_GSI) {
+ if (qp->qplib_qp.type == CMDQ_CREATE_QP1_TYPE_GSI) {
rc = bnxt_re_build_qp1_send_v2(qp, wr, &wqe,
payload_sz);
if (rc)
@@ -2524,7 +2529,8 @@ int bnxt_re_post_recv(struct ib_qp *ib_qp, const struct ib_recv_wr *wr,
wqe.wr_id = wr->wr_id;
wqe.type = BNXT_QPLIB_SWQE_TYPE_RECV;
- if (ib_qp->qp_type == IB_QPT_GSI)
+ if (ib_qp->qp_type == IB_QPT_GSI &&
+ qp->qplib_qp.type != CMDQ_CREATE_QP_TYPE_GSI)
rc = bnxt_re_build_qp1_shadow_qp_recv(qp, wr, &wqe,
payload_sz);
if (!rc)
@@ -2619,7 +2625,7 @@ struct ib_cq *bnxt_re_create_cq(struct ib_device *ibdev,
goto fail;
}
- cq->umem = ib_umem_get(context, req.cq_va,
+ cq->umem = ib_umem_get(udata, req.cq_va,
entries * sizeof(struct cq_base),
IB_ACCESS_LOCAL_WRITE, 1);
if (IS_ERR(cq->umem)) {
@@ -3119,19 +3125,33 @@ static void bnxt_re_process_res_shadow_qp_wc(struct bnxt_re_qp *qp,
}
}
-static void bnxt_re_process_res_ud_wc(struct ib_wc *wc,
+static void bnxt_re_process_res_ud_wc(struct bnxt_re_qp *qp,
+ struct ib_wc *wc,
struct bnxt_qplib_cqe *cqe)
{
+ u8 nw_type;
+
wc->opcode = IB_WC_RECV;
wc->status = __rc_to_ib_wc_status(cqe->status);
- if (cqe->flags & CQ_RES_RC_FLAGS_IMM)
+ if (cqe->flags & CQ_RES_UD_FLAGS_IMM)
wc->wc_flags |= IB_WC_WITH_IMM;
- if (cqe->flags & CQ_RES_RC_FLAGS_INV)
- wc->wc_flags |= IB_WC_WITH_INVALIDATE;
- if ((cqe->flags & (CQ_RES_RC_FLAGS_RDMA | CQ_RES_RC_FLAGS_IMM)) ==
- (CQ_RES_RC_FLAGS_RDMA | CQ_RES_RC_FLAGS_IMM))
- wc->opcode = IB_WC_RECV_RDMA_WITH_IMM;
+ /* report only on GSI QP for Thor */
+ if (qp->qplib_qp.type == CMDQ_CREATE_QP_TYPE_GSI) {
+ wc->wc_flags |= IB_WC_GRH;
+ memcpy(wc->smac, cqe->smac, ETH_ALEN);
+ wc->wc_flags |= IB_WC_WITH_SMAC;
+ if (cqe->flags & CQ_RES_UD_FLAGS_META_FORMAT_VLAN) {
+ wc->vlan_id = (cqe->cfa_meta & 0xFFF);
+ if (wc->vlan_id < 0x1000)
+ wc->wc_flags |= IB_WC_WITH_VLAN;
+ }
+ nw_type = (cqe->flags & CQ_RES_UD_FLAGS_ROCE_IP_VER_MASK) >>
+ CQ_RES_UD_FLAGS_ROCE_IP_VER_SFT;
+ wc->network_hdr_type = bnxt_re_to_ib_nw_type(nw_type);
+ wc->wc_flags |= IB_WC_WITH_NETWORK_HDR_TYPE;
+ }
+
}
static int send_phantom_wqe(struct bnxt_re_qp *qp)
@@ -3223,7 +3243,7 @@ int bnxt_re_poll_cq(struct ib_cq *ib_cq, int num_entries, struct ib_wc *wc)
switch (cqe->opcode) {
case CQ_BASE_CQE_TYPE_REQ:
- if (qp->qplib_qp.id ==
+ if (qp->rdev->qp1_sqp && qp->qplib_qp.id ==
qp->rdev->qp1_sqp->qplib_qp.id) {
/* Handle this completion with
* the stored completion
@@ -3258,7 +3278,7 @@ int bnxt_re_poll_cq(struct ib_cq *ib_cq, int num_entries, struct ib_wc *wc)
bnxt_re_process_res_rc_wc(wc, cqe);
break;
case CQ_BASE_CQE_TYPE_RES_UD:
- if (qp->qplib_qp.id ==
+ if (qp->rdev->qp1_sqp && qp->qplib_qp.id ==
qp->rdev->qp1_sqp->qplib_qp.id) {
/* Handle this completion with
* the stored completion
@@ -3271,7 +3291,7 @@ int bnxt_re_poll_cq(struct ib_cq *ib_cq, int num_entries, struct ib_wc *wc)
break;
}
}
- bnxt_re_process_res_ud_wc(wc, cqe);
+ bnxt_re_process_res_ud_wc(qp, wc, cqe);
break;
default:
dev_err(rdev_to_dev(cq->rdev),
@@ -3298,10 +3318,10 @@ int bnxt_re_req_notify_cq(struct ib_cq *ib_cq,
spin_lock_irqsave(&cq->cq_lock, flags);
/* Trigger on the very next completion */
if (ib_cqn_flags & IB_CQ_NEXT_COMP)
- type = DBR_DBR_TYPE_CQ_ARMALL;
+ type = DBC_DBC_TYPE_CQ_ARMALL;
/* Trigger on the next solicited completion */
else if (ib_cqn_flags & IB_CQ_SOLICITED)
- type = DBR_DBR_TYPE_CQ_ARMSE;
+ type = DBC_DBC_TYPE_CQ_ARMSE;
/* Poll to see if there are missed events */
if ((ib_cqn_flags & IB_CQ_REPORT_MISSED_EVENTS) &&
@@ -3534,19 +3554,14 @@ static int fill_umem_pbl_tbl(struct ib_umem *umem, u64 *pbl_tbl_orig,
u64 *pbl_tbl = pbl_tbl_orig;
u64 paddr;
u64 page_mask = (1ULL << page_shift) - 1;
- int i, pages;
- struct scatterlist *sg;
- int entry;
-
- for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) {
- pages = sg_dma_len(sg) >> PAGE_SHIFT;
- for (i = 0; i < pages; i++) {
- paddr = sg_dma_address(sg) + (i << PAGE_SHIFT);
- if (pbl_tbl == pbl_tbl_orig)
- *pbl_tbl++ = paddr & ~page_mask;
- else if ((paddr & page_mask) == 0)
- *pbl_tbl++ = paddr;
- }
+ struct sg_dma_page_iter sg_iter;
+
+ for_each_sg_dma_page (umem->sg_head.sgl, &sg_iter, umem->nmap, 0) {
+ paddr = sg_page_iter_dma_address(&sg_iter);
+ if (pbl_tbl == pbl_tbl_orig)
+ *pbl_tbl++ = paddr & ~page_mask;
+ else if ((paddr & page_mask) == 0)
+ *pbl_tbl++ = paddr;
}
return pbl_tbl - pbl_tbl_orig;
}
@@ -3586,8 +3601,7 @@ struct ib_mr *bnxt_re_reg_user_mr(struct ib_pd *ib_pd, u64 start, u64 length,
/* The fixed portion of the rkey is the same as the lkey */
mr->ib_mr.rkey = mr->qplib_mr.rkey;
- umem = ib_umem_get(ib_pd->uobject->context, start, length,
- mr_access_flags, 0);
+ umem = ib_umem_get(udata, start, length, mr_access_flags, 0);
if (IS_ERR(umem)) {
dev_err(rdev_to_dev(rdev), "Failed to get umem");
rc = -EFAULT;
@@ -3610,7 +3624,7 @@ struct ib_mr *bnxt_re_reg_user_mr(struct ib_pd *ib_pd, u64 start, u64 length,
goto free_umem;
}
- page_shift = umem->page_shift;
+ page_shift = PAGE_SHIFT;
if (!bnxt_re_page_size_ok(page_shift)) {
dev_err(rdev_to_dev(rdev), "umem page size unsupported!");
@@ -3657,13 +3671,15 @@ free_mr:
return ERR_PTR(rc);
}
-struct ib_ucontext *bnxt_re_alloc_ucontext(struct ib_device *ibdev,
- struct ib_udata *udata)
+int bnxt_re_alloc_ucontext(struct ib_ucontext *ctx, struct ib_udata *udata)
{
+ struct ib_device *ibdev = ctx->device;
+ struct bnxt_re_ucontext *uctx =
+ container_of(ctx, struct bnxt_re_ucontext, ib_uctx);
struct bnxt_re_dev *rdev = to_bnxt_re_dev(ibdev, ibdev);
- struct bnxt_re_uctx_resp resp;
- struct bnxt_re_ucontext *uctx;
struct bnxt_qplib_dev_attr *dev_attr = &rdev->dev_attr;
+ struct bnxt_re_uctx_resp resp;
+ u32 chip_met_rev_num = 0;
int rc;
dev_dbg(rdev_to_dev(rdev), "ABI version requested %d",
@@ -3672,13 +3688,9 @@ struct ib_ucontext *bnxt_re_alloc_ucontext(struct ib_device *ibdev,
if (ibdev->uverbs_abi_ver != BNXT_RE_ABI_VERSION) {
dev_dbg(rdev_to_dev(rdev), " is different from the device %d ",
BNXT_RE_ABI_VERSION);
- return ERR_PTR(-EPERM);
+ return -EPERM;
}
- uctx = kzalloc(sizeof(*uctx), GFP_KERNEL);
- if (!uctx)
- return ERR_PTR(-ENOMEM);
-
uctx->rdev = rdev;
uctx->shpg = (void *)__get_free_page(GFP_KERNEL);
@@ -3688,37 +3700,45 @@ struct ib_ucontext *bnxt_re_alloc_ucontext(struct ib_device *ibdev,
}
spin_lock_init(&uctx->sh_lock);
- resp.dev_id = rdev->en_dev->pdev->devfn; /*Temp, Use idr_alloc instead*/
+ resp.comp_mask = BNXT_RE_UCNTX_CMASK_HAVE_CCTX;
+ chip_met_rev_num = rdev->chip_ctx.chip_num;
+ chip_met_rev_num |= ((u32)rdev->chip_ctx.chip_rev & 0xFF) <<
+ BNXT_RE_CHIP_ID0_CHIP_REV_SFT;
+ chip_met_rev_num |= ((u32)rdev->chip_ctx.chip_metal & 0xFF) <<
+ BNXT_RE_CHIP_ID0_CHIP_MET_SFT;
+ resp.chip_id0 = chip_met_rev_num;
+ /* Future extension of chip info */
+ resp.chip_id1 = 0;
+ /*Temp, Use idr_alloc instead */
+ resp.dev_id = rdev->en_dev->pdev->devfn;
resp.max_qp = rdev->qplib_ctx.qpc_count;
resp.pg_size = PAGE_SIZE;
resp.cqe_sz = sizeof(struct cq_base);
resp.max_cqd = dev_attr->max_cq_wqes;
resp.rsvd = 0;
- rc = ib_copy_to_udata(udata, &resp, sizeof(resp));
+ rc = ib_copy_to_udata(udata, &resp, min(udata->outlen, sizeof(resp)));
if (rc) {
dev_err(rdev_to_dev(rdev), "Failed to copy user context");
rc = -EFAULT;
goto cfail;
}
- return &uctx->ib_uctx;
+ return 0;
cfail:
free_page((unsigned long)uctx->shpg);
uctx->shpg = NULL;
fail:
- kfree(uctx);
- return ERR_PTR(rc);
+ return rc;
}
-int bnxt_re_dealloc_ucontext(struct ib_ucontext *ib_uctx)
+void bnxt_re_dealloc_ucontext(struct ib_ucontext *ib_uctx)
{
struct bnxt_re_ucontext *uctx = container_of(ib_uctx,
struct bnxt_re_ucontext,
ib_uctx);
struct bnxt_re_dev *rdev = uctx->rdev;
- int rc = 0;
if (uctx->shpg)
free_page((unsigned long)uctx->shpg);
@@ -3727,17 +3747,10 @@ int bnxt_re_dealloc_ucontext(struct ib_ucontext *ib_uctx)
/* Free DPI only if this is the first PD allocated by the
* application and mark the context dpi as NULL
*/
- rc = bnxt_qplib_dealloc_dpi(&rdev->qplib_res,
- &rdev->qplib_res.dpi_tbl,
- &uctx->dpi);
- if (rc)
- dev_err(rdev_to_dev(rdev), "Deallocate HW DPI failed!");
- /* Don't fail, continue*/
+ bnxt_qplib_dealloc_dpi(&rdev->qplib_res,
+ &rdev->qplib_res.dpi_tbl, &uctx->dpi);
uctx->dpi.dbr = NULL;
}
-
- kfree(uctx);
- return 0;
}
/* Helper function to mmap the virtual memory from user app */
diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.h b/drivers/infiniband/hw/bnxt_re/ib_verbs.h
index aa33e7b82c84..e45465ed4eee 100644
--- a/drivers/infiniband/hw/bnxt_re/ib_verbs.h
+++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.h
@@ -56,8 +56,8 @@ struct bnxt_re_fence_data {
};
struct bnxt_re_pd {
+ struct ib_pd ib_pd;
struct bnxt_re_dev *rdev;
- struct ib_pd ib_pd;
struct bnxt_qplib_pd qplib_pd;
struct bnxt_re_fence_data fence;
};
@@ -135,8 +135,8 @@ struct bnxt_re_mw {
};
struct bnxt_re_ucontext {
+ struct ib_ucontext ib_uctx;
struct bnxt_re_dev *rdev;
- struct ib_ucontext ib_uctx;
struct bnxt_qplib_dpi dpi;
void *shpg;
spinlock_t sh_lock; /* protect shpg */
@@ -163,16 +163,16 @@ int bnxt_re_query_gid(struct ib_device *ibdev, u8 port_num,
int index, union ib_gid *gid);
enum rdma_link_layer bnxt_re_get_link_layer(struct ib_device *ibdev,
u8 port_num);
-struct ib_pd *bnxt_re_alloc_pd(struct ib_device *ibdev,
- struct ib_ucontext *context,
- struct ib_udata *udata);
-int bnxt_re_dealloc_pd(struct ib_pd *pd);
+int bnxt_re_alloc_pd(struct ib_pd *pd, struct ib_ucontext *context,
+ struct ib_udata *udata);
+void bnxt_re_dealloc_pd(struct ib_pd *pd);
struct ib_ah *bnxt_re_create_ah(struct ib_pd *pd,
struct rdma_ah_attr *ah_attr,
+ u32 flags,
struct ib_udata *udata);
int bnxt_re_modify_ah(struct ib_ah *ah, struct rdma_ah_attr *ah_attr);
int bnxt_re_query_ah(struct ib_ah *ah, struct rdma_ah_attr *ah_attr);
-int bnxt_re_destroy_ah(struct ib_ah *ah);
+int bnxt_re_destroy_ah(struct ib_ah *ah, u32 flags);
struct ib_srq *bnxt_re_create_srq(struct ib_pd *pd,
struct ib_srq_init_attr *srq_init_attr,
struct ib_udata *udata);
@@ -215,9 +215,8 @@ int bnxt_re_dealloc_mw(struct ib_mw *mw);
struct ib_mr *bnxt_re_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
u64 virt_addr, int mr_access_flags,
struct ib_udata *udata);
-struct ib_ucontext *bnxt_re_alloc_ucontext(struct ib_device *ibdev,
- struct ib_udata *udata);
-int bnxt_re_dealloc_ucontext(struct ib_ucontext *context);
+int bnxt_re_alloc_ucontext(struct ib_ucontext *ctx, struct ib_udata *udata);
+void bnxt_re_dealloc_ucontext(struct ib_ucontext *context);
int bnxt_re_mmap(struct ib_ucontext *context, struct vm_area_struct *vma);
unsigned long bnxt_re_lock_cqs(struct bnxt_re_qp *qp);
diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c
index 77f095e5fbe3..2bd24ac45ee4 100644
--- a/drivers/infiniband/hw/bnxt_re/main.c
+++ b/drivers/infiniband/hw/bnxt_re/main.c
@@ -80,6 +80,29 @@ static DEFINE_MUTEX(bnxt_re_dev_lock);
static struct workqueue_struct *bnxt_re_wq;
static void bnxt_re_ib_unreg(struct bnxt_re_dev *rdev);
+static void bnxt_re_destroy_chip_ctx(struct bnxt_re_dev *rdev)
+{
+ rdev->rcfw.res = NULL;
+ rdev->qplib_res.cctx = NULL;
+}
+
+static int bnxt_re_setup_chip_ctx(struct bnxt_re_dev *rdev)
+{
+ struct bnxt_en_dev *en_dev;
+ struct bnxt *bp;
+
+ en_dev = rdev->en_dev;
+ bp = netdev_priv(en_dev->net);
+
+ rdev->chip_ctx.chip_num = bp->chip_num;
+ /* rest members to follow eventually */
+
+ rdev->qplib_res.cctx = &rdev->chip_ctx;
+ rdev->rcfw.res = &rdev->qplib_res;
+
+ return 0;
+}
+
/* SR-IOV helper functions */
static void bnxt_re_get_sriov_func_type(struct bnxt_re_dev *rdev)
@@ -278,6 +301,7 @@ static int bnxt_re_register_netdev(struct bnxt_re_dev *rdev)
rc = en_dev->en_ops->bnxt_register_device(en_dev, BNXT_ROCE_ULP,
&bnxt_re_ulp_ops, rdev);
+ rdev->qplib_res.pdev = rdev->en_dev->pdev;
return rc;
}
@@ -345,7 +369,8 @@ static void bnxt_re_fill_fw_msg(struct bnxt_fw_msg *fw_msg, void *msg,
fw_msg->timeout = timeout;
}
-static int bnxt_re_net_ring_free(struct bnxt_re_dev *rdev, u16 fw_ring_id)
+static int bnxt_re_net_ring_free(struct bnxt_re_dev *rdev,
+ u16 fw_ring_id, int type)
{
struct bnxt_en_dev *en_dev = rdev->en_dev;
struct hwrm_ring_free_input req = {0};
@@ -359,7 +384,7 @@ static int bnxt_re_net_ring_free(struct bnxt_re_dev *rdev, u16 fw_ring_id)
memset(&fw_msg, 0, sizeof(fw_msg));
bnxt_re_init_hwrm_hdr(rdev, (void *)&req, HWRM_RING_FREE, -1, -1);
- req.ring_type = RING_ALLOC_REQ_RING_TYPE_L2_CMPL;
+ req.ring_type = type;
req.ring_id = cpu_to_le16(fw_ring_id);
bnxt_re_fill_fw_msg(&fw_msg, (void *)&req, sizeof(req), (void *)&resp,
sizeof(resp), DFLT_HWRM_CMD_TIMEOUT);
@@ -396,7 +421,7 @@ static int bnxt_re_net_ring_alloc(struct bnxt_re_dev *rdev, dma_addr_t *dma_arr,
/* Association of ring index with doorbell index and MSIX number */
req.logical_id = cpu_to_le16(map_index);
req.length = cpu_to_le32(ring_mask + 1);
- req.ring_type = RING_ALLOC_REQ_RING_TYPE_L2_CMPL;
+ req.ring_type = type;
req.int_mode = RING_ALLOC_REQ_INT_MODE_MSIX;
bnxt_re_fill_fw_msg(&fw_msg, (void *)&req, sizeof(req), (void *)&resp,
sizeof(resp), DFLT_HWRM_CMD_TIMEOUT);
@@ -538,7 +563,8 @@ static struct bnxt_en_dev *bnxt_re_dev_probe(struct net_device *netdev)
static ssize_t hw_rev_show(struct device *device, struct device_attribute *attr,
char *buf)
{
- struct bnxt_re_dev *rdev = to_bnxt_re_dev(device, ibdev.dev);
+ struct bnxt_re_dev *rdev =
+ rdma_device_to_drv_device(device, struct bnxt_re_dev, ibdev);
return scnprintf(buf, PAGE_SIZE, "0x%x\n", rdev->en_dev->pdev->vendor);
}
@@ -547,7 +573,8 @@ static DEVICE_ATTR_RO(hw_rev);
static ssize_t hca_type_show(struct device *device,
struct device_attribute *attr, char *buf)
{
- struct bnxt_re_dev *rdev = to_bnxt_re_dev(device, ibdev.dev);
+ struct bnxt_re_dev *rdev =
+ rdma_device_to_drv_device(device, struct bnxt_re_dev, ibdev);
return scnprintf(buf, PAGE_SIZE, "%s\n", rdev->ibdev.node_desc);
}
@@ -568,6 +595,52 @@ static void bnxt_re_unregister_ib(struct bnxt_re_dev *rdev)
ib_unregister_device(&rdev->ibdev);
}
+static const struct ib_device_ops bnxt_re_dev_ops = {
+ .add_gid = bnxt_re_add_gid,
+ .alloc_hw_stats = bnxt_re_ib_alloc_hw_stats,
+ .alloc_mr = bnxt_re_alloc_mr,
+ .alloc_pd = bnxt_re_alloc_pd,
+ .alloc_ucontext = bnxt_re_alloc_ucontext,
+ .create_ah = bnxt_re_create_ah,
+ .create_cq = bnxt_re_create_cq,
+ .create_qp = bnxt_re_create_qp,
+ .create_srq = bnxt_re_create_srq,
+ .dealloc_pd = bnxt_re_dealloc_pd,
+ .dealloc_ucontext = bnxt_re_dealloc_ucontext,
+ .del_gid = bnxt_re_del_gid,
+ .dereg_mr = bnxt_re_dereg_mr,
+ .destroy_ah = bnxt_re_destroy_ah,
+ .destroy_cq = bnxt_re_destroy_cq,
+ .destroy_qp = bnxt_re_destroy_qp,
+ .destroy_srq = bnxt_re_destroy_srq,
+ .get_dev_fw_str = bnxt_re_query_fw_str,
+ .get_dma_mr = bnxt_re_get_dma_mr,
+ .get_hw_stats = bnxt_re_ib_get_hw_stats,
+ .get_link_layer = bnxt_re_get_link_layer,
+ .get_netdev = bnxt_re_get_netdev,
+ .get_port_immutable = bnxt_re_get_port_immutable,
+ .map_mr_sg = bnxt_re_map_mr_sg,
+ .mmap = bnxt_re_mmap,
+ .modify_ah = bnxt_re_modify_ah,
+ .modify_device = bnxt_re_modify_device,
+ .modify_qp = bnxt_re_modify_qp,
+ .modify_srq = bnxt_re_modify_srq,
+ .poll_cq = bnxt_re_poll_cq,
+ .post_recv = bnxt_re_post_recv,
+ .post_send = bnxt_re_post_send,
+ .post_srq_recv = bnxt_re_post_srq_recv,
+ .query_ah = bnxt_re_query_ah,
+ .query_device = bnxt_re_query_device,
+ .query_pkey = bnxt_re_query_pkey,
+ .query_port = bnxt_re_query_port,
+ .query_qp = bnxt_re_query_qp,
+ .query_srq = bnxt_re_query_srq,
+ .reg_user_mr = bnxt_re_reg_user_mr,
+ .req_notify_cq = bnxt_re_req_notify_cq,
+ INIT_RDMA_OBJ_SIZE(ib_pd, bnxt_re_pd, ib_pd),
+ INIT_RDMA_OBJ_SIZE(ib_ucontext, bnxt_re_ucontext, ib_uctx),
+};
+
static int bnxt_re_register_ib(struct bnxt_re_dev *rdev)
{
struct ib_device *ibdev = &rdev->ibdev;
@@ -614,61 +687,11 @@ static int bnxt_re_register_ib(struct bnxt_re_dev *rdev)
(1ull << IB_USER_VERBS_CMD_DESTROY_AH);
/* POLL_CQ and REQ_NOTIFY_CQ is directly handled in libbnxt_re */
- /* Kernel verbs */
- ibdev->query_device = bnxt_re_query_device;
- ibdev->modify_device = bnxt_re_modify_device;
-
- ibdev->query_port = bnxt_re_query_port;
- ibdev->get_port_immutable = bnxt_re_get_port_immutable;
- ibdev->get_dev_fw_str = bnxt_re_query_fw_str;
- ibdev->query_pkey = bnxt_re_query_pkey;
- ibdev->get_netdev = bnxt_re_get_netdev;
- ibdev->add_gid = bnxt_re_add_gid;
- ibdev->del_gid = bnxt_re_del_gid;
- ibdev->get_link_layer = bnxt_re_get_link_layer;
-
- ibdev->alloc_pd = bnxt_re_alloc_pd;
- ibdev->dealloc_pd = bnxt_re_dealloc_pd;
-
- ibdev->create_ah = bnxt_re_create_ah;
- ibdev->modify_ah = bnxt_re_modify_ah;
- ibdev->query_ah = bnxt_re_query_ah;
- ibdev->destroy_ah = bnxt_re_destroy_ah;
-
- ibdev->create_srq = bnxt_re_create_srq;
- ibdev->modify_srq = bnxt_re_modify_srq;
- ibdev->query_srq = bnxt_re_query_srq;
- ibdev->destroy_srq = bnxt_re_destroy_srq;
- ibdev->post_srq_recv = bnxt_re_post_srq_recv;
-
- ibdev->create_qp = bnxt_re_create_qp;
- ibdev->modify_qp = bnxt_re_modify_qp;
- ibdev->query_qp = bnxt_re_query_qp;
- ibdev->destroy_qp = bnxt_re_destroy_qp;
-
- ibdev->post_send = bnxt_re_post_send;
- ibdev->post_recv = bnxt_re_post_recv;
-
- ibdev->create_cq = bnxt_re_create_cq;
- ibdev->destroy_cq = bnxt_re_destroy_cq;
- ibdev->poll_cq = bnxt_re_poll_cq;
- ibdev->req_notify_cq = bnxt_re_req_notify_cq;
-
- ibdev->get_dma_mr = bnxt_re_get_dma_mr;
- ibdev->dereg_mr = bnxt_re_dereg_mr;
- ibdev->alloc_mr = bnxt_re_alloc_mr;
- ibdev->map_mr_sg = bnxt_re_map_mr_sg;
-
- ibdev->reg_user_mr = bnxt_re_reg_user_mr;
- ibdev->alloc_ucontext = bnxt_re_alloc_ucontext;
- ibdev->dealloc_ucontext = bnxt_re_dealloc_ucontext;
- ibdev->mmap = bnxt_re_mmap;
- ibdev->get_hw_stats = bnxt_re_ib_get_hw_stats;
- ibdev->alloc_hw_stats = bnxt_re_ib_alloc_hw_stats;
rdma_set_device_sysfs_group(ibdev, &bnxt_re_dev_attr_group);
ibdev->driver_id = RDMA_DRIVER_BNXT_RE;
- return ib_register_device(ibdev, "bnxt_re%d", NULL);
+ ib_set_device_ops(ibdev, &bnxt_re_dev_ops);
+ return ib_register_device(ibdev, "bnxt_re%d");
}
static void bnxt_re_dev_remove(struct bnxt_re_dev *rdev)
@@ -692,7 +715,7 @@ static struct bnxt_re_dev *bnxt_re_dev_add(struct net_device *netdev,
struct bnxt_re_dev *rdev;
/* Allocate bnxt_re_dev instance here */
- rdev = (struct bnxt_re_dev *)ib_alloc_device(sizeof(*rdev));
+ rdev = ib_alloc_device(bnxt_re_dev, ibdev);
if (!rdev) {
dev_err(NULL, "%s: bnxt_re_dev allocation failure!",
ROCE_DRV_MODULE_NAME);
@@ -864,6 +887,12 @@ static int bnxt_re_cqn_handler(struct bnxt_qplib_nq *nq,
return 0;
}
+static u32 bnxt_re_get_nqdb_offset(struct bnxt_re_dev *rdev, u16 indx)
+{
+ return bnxt_qplib_is_chip_gen_p5(&rdev->chip_ctx) ?
+ 0x10000 : rdev->msix_entries[indx].db_offset;
+}
+
static void bnxt_re_cleanup_res(struct bnxt_re_dev *rdev)
{
int i;
@@ -877,18 +906,18 @@ static void bnxt_re_cleanup_res(struct bnxt_re_dev *rdev)
static int bnxt_re_init_res(struct bnxt_re_dev *rdev)
{
- int rc = 0, i;
int num_vec_enabled = 0;
+ int rc = 0, i;
+ u32 db_offt;
bnxt_qplib_init_res(&rdev->qplib_res);
for (i = 1; i < rdev->num_msix ; i++) {
+ db_offt = bnxt_re_get_nqdb_offset(rdev, i);
rc = bnxt_qplib_enable_nq(rdev->en_dev->pdev, &rdev->nq[i - 1],
i - 1, rdev->msix_entries[i].vector,
- rdev->msix_entries[i].db_offset,
- &bnxt_re_cqn_handler,
+ db_offt, &bnxt_re_cqn_handler,
&bnxt_re_srqn_handler);
-
if (rc) {
dev_err(rdev_to_dev(rdev),
"Failed to enable NQ with rc = 0x%x", rc);
@@ -900,16 +929,18 @@ static int bnxt_re_init_res(struct bnxt_re_dev *rdev)
fail:
for (i = num_vec_enabled; i >= 0; i--)
bnxt_qplib_disable_nq(&rdev->nq[i]);
-
return rc;
}
static void bnxt_re_free_nq_res(struct bnxt_re_dev *rdev)
{
+ u8 type;
int i;
for (i = 0; i < rdev->num_msix - 1; i++) {
- bnxt_re_net_ring_free(rdev, rdev->nq[i].ring_id);
+ type = bnxt_qplib_get_ring_type(&rdev->chip_ctx);
+ bnxt_re_net_ring_free(rdev, rdev->nq[i].ring_id, type);
+ rdev->nq[i].res = NULL;
bnxt_qplib_free_nq(&rdev->nq[i]);
}
}
@@ -931,8 +962,11 @@ static void bnxt_re_free_res(struct bnxt_re_dev *rdev)
static int bnxt_re_alloc_res(struct bnxt_re_dev *rdev)
{
- int rc = 0, i;
int num_vec_created = 0;
+ dma_addr_t *pg_map;
+ int rc = 0, i;
+ int pages;
+ u8 type;
/* Configure and allocate resources for qplib */
rdev->qplib_res.rcfw = &rdev->rcfw;
@@ -953,6 +987,7 @@ static int bnxt_re_alloc_res(struct bnxt_re_dev *rdev)
goto dealloc_res;
for (i = 0; i < rdev->num_msix - 1; i++) {
+ rdev->nq[i].res = &rdev->qplib_res;
rdev->nq[i].hwq.max_elements = BNXT_RE_MAX_CQ_COUNT +
BNXT_RE_MAX_SRQC_COUNT + 2;
rc = bnxt_qplib_alloc_nq(rdev->en_dev->pdev, &rdev->nq[i]);
@@ -961,13 +996,13 @@ static int bnxt_re_alloc_res(struct bnxt_re_dev *rdev)
i, rc);
goto free_nq;
}
- rc = bnxt_re_net_ring_alloc
- (rdev, rdev->nq[i].hwq.pbl[PBL_LVL_0].pg_map_arr,
- rdev->nq[i].hwq.pbl[rdev->nq[i].hwq.level].pg_count,
- HWRM_RING_ALLOC_CMPL,
- BNXT_QPLIB_NQE_MAX_CNT - 1,
- rdev->msix_entries[i + 1].ring_idx,
- &rdev->nq[i].ring_id);
+ type = bnxt_qplib_get_ring_type(&rdev->chip_ctx);
+ pg_map = rdev->nq[i].hwq.pbl[PBL_LVL_0].pg_map_arr;
+ pages = rdev->nq[i].hwq.pbl[rdev->nq[i].hwq.level].pg_count;
+ rc = bnxt_re_net_ring_alloc(rdev, pg_map, pages, type,
+ BNXT_QPLIB_NQE_MAX_CNT - 1,
+ rdev->msix_entries[i + 1].ring_idx,
+ &rdev->nq[i].ring_id);
if (rc) {
dev_err(rdev_to_dev(rdev),
"Failed to allocate NQ fw id with rc = 0x%x",
@@ -980,7 +1015,8 @@ static int bnxt_re_alloc_res(struct bnxt_re_dev *rdev)
return 0;
free_nq:
for (i = num_vec_created; i >= 0; i--) {
- bnxt_re_net_ring_free(rdev, rdev->nq[i].ring_id);
+ type = bnxt_qplib_get_ring_type(&rdev->chip_ctx);
+ bnxt_re_net_ring_free(rdev, rdev->nq[i].ring_id, type);
bnxt_qplib_free_nq(&rdev->nq[i]);
}
bnxt_qplib_dealloc_dpi(&rdev->qplib_res,
@@ -1203,8 +1239,38 @@ static int bnxt_re_setup_qos(struct bnxt_re_dev *rdev)
return 0;
}
+static void bnxt_re_query_hwrm_intf_version(struct bnxt_re_dev *rdev)
+{
+ struct bnxt_en_dev *en_dev = rdev->en_dev;
+ struct hwrm_ver_get_output resp = {0};
+ struct hwrm_ver_get_input req = {0};
+ struct bnxt_fw_msg fw_msg;
+ int rc = 0;
+
+ memset(&fw_msg, 0, sizeof(fw_msg));
+ bnxt_re_init_hwrm_hdr(rdev, (void *)&req,
+ HWRM_VER_GET, -1, -1);
+ req.hwrm_intf_maj = HWRM_VERSION_MAJOR;
+ req.hwrm_intf_min = HWRM_VERSION_MINOR;
+ req.hwrm_intf_upd = HWRM_VERSION_UPDATE;
+ bnxt_re_fill_fw_msg(&fw_msg, (void *)&req, sizeof(req), (void *)&resp,
+ sizeof(resp), DFLT_HWRM_CMD_TIMEOUT);
+ rc = en_dev->en_ops->bnxt_send_fw_msg(en_dev, BNXT_ROCE_ULP, &fw_msg);
+ if (rc) {
+ dev_err(rdev_to_dev(rdev),
+ "Failed to query HW version, rc = 0x%x", rc);
+ return;
+ }
+ rdev->qplib_ctx.hwrm_intf_ver =
+ (u64)resp.hwrm_intf_major << 48 |
+ (u64)resp.hwrm_intf_minor << 32 |
+ (u64)resp.hwrm_intf_build << 16 |
+ resp.hwrm_intf_patch;
+}
+
static void bnxt_re_ib_unreg(struct bnxt_re_dev *rdev)
{
+ u8 type;
int rc;
if (test_and_clear_bit(BNXT_RE_FLAG_IBDEV_REGISTERED, &rdev->flags)) {
@@ -1228,7 +1294,8 @@ static void bnxt_re_ib_unreg(struct bnxt_re_dev *rdev)
bnxt_re_net_stats_ctx_free(rdev, rdev->qplib_ctx.stats.fw_id);
bnxt_qplib_free_ctx(rdev->en_dev->pdev, &rdev->qplib_ctx);
bnxt_qplib_disable_rcfw_channel(&rdev->rcfw);
- bnxt_re_net_ring_free(rdev, rdev->rcfw.creq_ring_id);
+ type = bnxt_qplib_get_ring_type(&rdev->chip_ctx);
+ bnxt_re_net_ring_free(rdev, rdev->rcfw.creq_ring_id, type);
bnxt_qplib_free_rcfw_channel(&rdev->rcfw);
}
if (test_and_clear_bit(BNXT_RE_FLAG_GOT_MSIX, &rdev->flags)) {
@@ -1237,6 +1304,8 @@ static void bnxt_re_ib_unreg(struct bnxt_re_dev *rdev)
dev_warn(rdev_to_dev(rdev),
"Failed to free MSI-X vectors: %#x", rc);
}
+
+ bnxt_re_destroy_chip_ctx(rdev);
if (test_and_clear_bit(BNXT_RE_FLAG_NETDEV_REGISTERED, &rdev->flags)) {
rc = bnxt_re_unregister_netdev(rdev);
if (rc)
@@ -1257,9 +1326,12 @@ static void bnxt_re_worker(struct work_struct *work)
static int bnxt_re_ib_reg(struct bnxt_re_dev *rdev)
{
- int rc;
-
+ dma_addr_t *pg_map;
+ u32 db_offt, ridx;
+ int pages, vid;
bool locked;
+ u8 type;
+ int rc;
/* Acquire rtnl lock through out this function */
rtnl_lock();
@@ -1274,6 +1346,12 @@ static int bnxt_re_ib_reg(struct bnxt_re_dev *rdev)
}
set_bit(BNXT_RE_FLAG_NETDEV_REGISTERED, &rdev->flags);
+ rc = bnxt_re_setup_chip_ctx(rdev);
+ if (rc) {
+ dev_err(rdev_to_dev(rdev), "Failed to get chip context\n");
+ return -EINVAL;
+ }
+
/* Check whether VF or PF */
bnxt_re_get_sriov_func_type(rdev);
@@ -1285,30 +1363,34 @@ static int bnxt_re_ib_reg(struct bnxt_re_dev *rdev)
}
set_bit(BNXT_RE_FLAG_GOT_MSIX, &rdev->flags);
+ bnxt_re_query_hwrm_intf_version(rdev);
+
/* Establish RCFW Communication Channel to initialize the context
* memory for the function and all child VFs
*/
rc = bnxt_qplib_alloc_rcfw_channel(rdev->en_dev->pdev, &rdev->rcfw,
+ &rdev->qplib_ctx,
BNXT_RE_MAX_QPC_COUNT);
if (rc) {
pr_err("Failed to allocate RCFW Channel: %#x\n", rc);
goto fail;
}
- rc = bnxt_re_net_ring_alloc
- (rdev, rdev->rcfw.creq.pbl[PBL_LVL_0].pg_map_arr,
- rdev->rcfw.creq.pbl[rdev->rcfw.creq.level].pg_count,
- HWRM_RING_ALLOC_CMPL, BNXT_QPLIB_CREQE_MAX_CNT - 1,
- rdev->msix_entries[BNXT_RE_AEQ_IDX].ring_idx,
- &rdev->rcfw.creq_ring_id);
+ type = bnxt_qplib_get_ring_type(&rdev->chip_ctx);
+ pg_map = rdev->rcfw.creq.pbl[PBL_LVL_0].pg_map_arr;
+ pages = rdev->rcfw.creq.pbl[rdev->rcfw.creq.level].pg_count;
+ ridx = rdev->msix_entries[BNXT_RE_AEQ_IDX].ring_idx;
+ rc = bnxt_re_net_ring_alloc(rdev, pg_map, pages, type,
+ BNXT_QPLIB_CREQE_MAX_CNT - 1,
+ ridx, &rdev->rcfw.creq_ring_id);
if (rc) {
pr_err("Failed to allocate CREQ: %#x\n", rc);
goto free_rcfw;
}
- rc = bnxt_qplib_enable_rcfw_channel
- (rdev->en_dev->pdev, &rdev->rcfw,
- rdev->msix_entries[BNXT_RE_AEQ_IDX].vector,
- rdev->msix_entries[BNXT_RE_AEQ_IDX].db_offset,
- rdev->is_virtfn, &bnxt_re_aeq_handler);
+ db_offt = bnxt_re_get_nqdb_offset(rdev, BNXT_RE_AEQ_IDX);
+ vid = rdev->msix_entries[BNXT_RE_AEQ_IDX].vector;
+ rc = bnxt_qplib_enable_rcfw_channel(rdev->en_dev->pdev, &rdev->rcfw,
+ vid, db_offt, rdev->is_virtfn,
+ &bnxt_re_aeq_handler);
if (rc) {
pr_err("Failed to enable RCFW channel: %#x\n", rc);
goto free_ring;
@@ -1321,7 +1403,8 @@ static int bnxt_re_ib_reg(struct bnxt_re_dev *rdev)
if (!rdev->is_virtfn)
bnxt_re_set_resource_limits(rdev);
- rc = bnxt_qplib_alloc_ctx(rdev->en_dev->pdev, &rdev->qplib_ctx, 0);
+ rc = bnxt_qplib_alloc_ctx(rdev->en_dev->pdev, &rdev->qplib_ctx, 0,
+ bnxt_qplib_is_chip_gen_p5(&rdev->chip_ctx));
if (rc) {
pr_err("Failed to allocate QPLIB context: %#x\n", rc);
goto disable_rcfw;
@@ -1392,7 +1475,8 @@ free_ctx:
disable_rcfw:
bnxt_qplib_disable_rcfw_channel(&rdev->rcfw);
free_ring:
- bnxt_re_net_ring_free(rdev, rdev->rcfw.creq_ring_id);
+ type = bnxt_qplib_get_ring_type(&rdev->chip_ctx);
+ bnxt_re_net_ring_free(rdev, rdev->rcfw.creq_ring_id, type);
free_rcfw:
bnxt_qplib_free_rcfw_channel(&rdev->rcfw);
fail:
diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.c b/drivers/infiniband/hw/bnxt_re/qplib_fp.c
index b98b054148cd..71c34d5b0ac0 100644
--- a/drivers/infiniband/hw/bnxt_re/qplib_fp.c
+++ b/drivers/infiniband/hw/bnxt_re/qplib_fp.c
@@ -44,6 +44,7 @@
#include <linux/slab.h>
#include <linux/pci.h>
#include <linux/prefetch.h>
+#include <linux/if_ether.h>
#include "roce_hsi.h"
@@ -244,6 +245,7 @@ static void bnxt_qplib_service_nq(unsigned long data)
u16 type;
int budget = nq->budget;
uintptr_t q_handle;
+ bool gen_p5 = bnxt_qplib_is_chip_gen_p5(nq->res->cctx);
/* Service the NQ until empty */
raw_cons = hwq->cons;
@@ -290,7 +292,7 @@ static void bnxt_qplib_service_nq(unsigned long data)
q_handle |= (u64)le32_to_cpu(nqsrqe->srq_handle_high)
<< 32;
bnxt_qplib_arm_srq((struct bnxt_qplib_srq *)q_handle,
- DBR_DBR_TYPE_SRQ_ARMENA);
+ DBC_DBC_TYPE_SRQ_ARMENA);
if (!nq->srqn_handler(nq,
(struct bnxt_qplib_srq *)q_handle,
nqsrqe->event))
@@ -312,7 +314,9 @@ static void bnxt_qplib_service_nq(unsigned long data)
}
if (hwq->cons != raw_cons) {
hwq->cons = raw_cons;
- NQ_DB_REARM(nq->bar_reg_iomem, hwq->cons, hwq->max_elements);
+ bnxt_qplib_ring_nq_db_rearm(nq->bar_reg_iomem, hwq->cons,
+ hwq->max_elements, nq->ring_id,
+ gen_p5);
}
}
@@ -336,9 +340,11 @@ static irqreturn_t bnxt_qplib_nq_irq(int irq, void *dev_instance)
void bnxt_qplib_nq_stop_irq(struct bnxt_qplib_nq *nq, bool kill)
{
+ bool gen_p5 = bnxt_qplib_is_chip_gen_p5(nq->res->cctx);
tasklet_disable(&nq->worker);
/* Mask h/w interrupt */
- NQ_DB(nq->bar_reg_iomem, nq->hwq.cons, nq->hwq.max_elements);
+ bnxt_qplib_ring_nq_db(nq->bar_reg_iomem, nq->hwq.cons,
+ nq->hwq.max_elements, nq->ring_id, gen_p5);
/* Sync with last running IRQ handler */
synchronize_irq(nq->vector);
if (kill)
@@ -373,6 +379,7 @@ void bnxt_qplib_disable_nq(struct bnxt_qplib_nq *nq)
int bnxt_qplib_nq_start_irq(struct bnxt_qplib_nq *nq, int nq_indx,
int msix_vector, bool need_init)
{
+ bool gen_p5 = bnxt_qplib_is_chip_gen_p5(nq->res->cctx);
int rc;
if (nq->requested)
@@ -399,7 +406,8 @@ int bnxt_qplib_nq_start_irq(struct bnxt_qplib_nq *nq, int nq_indx,
nq->vector, nq_indx);
}
nq->requested = true;
- NQ_DB_REARM(nq->bar_reg_iomem, nq->hwq.cons, nq->hwq.max_elements);
+ bnxt_qplib_ring_nq_db_rearm(nq->bar_reg_iomem, nq->hwq.cons,
+ nq->hwq.max_elements, nq->ring_id, gen_p5);
return rc;
}
@@ -433,7 +441,8 @@ int bnxt_qplib_enable_nq(struct pci_dev *pdev, struct bnxt_qplib_nq *nq,
rc = -ENOMEM;
goto fail;
}
- nq->bar_reg_iomem = ioremap_nocache(nq_base + nq->bar_reg_off, 4);
+ /* Unconditionally map 8 bytes to support 57500 series */
+ nq->bar_reg_iomem = ioremap_nocache(nq_base + nq->bar_reg_off, 8);
if (!nq->bar_reg_iomem) {
rc = -ENOMEM;
goto fail;
@@ -462,15 +471,17 @@ void bnxt_qplib_free_nq(struct bnxt_qplib_nq *nq)
int bnxt_qplib_alloc_nq(struct pci_dev *pdev, struct bnxt_qplib_nq *nq)
{
+ u8 hwq_type;
+
nq->pdev = pdev;
if (!nq->hwq.max_elements ||
nq->hwq.max_elements > BNXT_QPLIB_NQE_MAX_CNT)
nq->hwq.max_elements = BNXT_QPLIB_NQE_MAX_CNT;
-
+ hwq_type = bnxt_qplib_get_hwq_type(nq->res);
if (bnxt_qplib_alloc_init_hwq(nq->pdev, &nq->hwq, NULL, 0,
&nq->hwq.max_elements,
BNXT_QPLIB_MAX_NQE_ENTRY_SIZE, 0,
- PAGE_SIZE, HWQ_TYPE_L2_CMPL))
+ PAGE_SIZE, hwq_type))
return -ENOMEM;
nq->budget = 8;
@@ -481,21 +492,19 @@ int bnxt_qplib_alloc_nq(struct pci_dev *pdev, struct bnxt_qplib_nq *nq)
static void bnxt_qplib_arm_srq(struct bnxt_qplib_srq *srq, u32 arm_type)
{
struct bnxt_qplib_hwq *srq_hwq = &srq->hwq;
- struct dbr_dbr db_msg = { 0 };
void __iomem *db;
- u32 sw_prod = 0;
+ u32 sw_prod;
+ u64 val = 0;
/* Ring DB */
- sw_prod = (arm_type == DBR_DBR_TYPE_SRQ_ARM) ? srq->threshold :
- HWQ_CMP(srq_hwq->prod, srq_hwq);
- db_msg.index = cpu_to_le32((sw_prod << DBR_DBR_INDEX_SFT) &
- DBR_DBR_INDEX_MASK);
- db_msg.type_xid = cpu_to_le32(((srq->id << DBR_DBR_XID_SFT) &
- DBR_DBR_XID_MASK) | arm_type);
- db = (arm_type == DBR_DBR_TYPE_SRQ_ARMENA) ?
- srq->dbr_base : srq->dpi->dbr;
- wmb(); /* barrier before db ring */
- __iowrite64_copy(db, &db_msg, sizeof(db_msg) / sizeof(u64));
+ sw_prod = (arm_type == DBC_DBC_TYPE_SRQ_ARM) ?
+ srq->threshold : HWQ_CMP(srq_hwq->prod, srq_hwq);
+ db = (arm_type == DBC_DBC_TYPE_SRQ_ARMENA) ? srq->dbr_base :
+ srq->dpi->dbr;
+ val = ((srq->id << DBC_DBC_XID_SFT) & DBC_DBC_XID_MASK) | arm_type;
+ val <<= 32;
+ val |= (sw_prod << DBC_DBC_INDEX_SFT) & DBC_DBC_INDEX_MASK;
+ writeq(val, db);
}
int bnxt_qplib_destroy_srq(struct bnxt_qplib_res *res,
@@ -590,7 +599,7 @@ int bnxt_qplib_create_srq(struct bnxt_qplib_res *res,
srq->id = le32_to_cpu(resp.xid);
srq->dbr_base = res->dpi_tbl.dbr_bar_reg_iomem;
if (srq->threshold)
- bnxt_qplib_arm_srq(srq, DBR_DBR_TYPE_SRQ_ARMENA);
+ bnxt_qplib_arm_srq(srq, DBC_DBC_TYPE_SRQ_ARMENA);
srq->arm_req = false;
return 0;
@@ -614,7 +623,7 @@ int bnxt_qplib_modify_srq(struct bnxt_qplib_res *res,
srq_hwq->max_elements - sw_cons + sw_prod;
if (count > srq->threshold) {
srq->arm_req = false;
- bnxt_qplib_arm_srq(srq, DBR_DBR_TYPE_SRQ_ARM);
+ bnxt_qplib_arm_srq(srq, DBC_DBC_TYPE_SRQ_ARM);
} else {
/* Deferred arming */
srq->arm_req = true;
@@ -702,10 +711,10 @@ int bnxt_qplib_post_srq_recv(struct bnxt_qplib_srq *srq,
srq_hwq->max_elements - sw_cons + sw_prod;
spin_unlock(&srq_hwq->lock);
/* Ring DB */
- bnxt_qplib_arm_srq(srq, DBR_DBR_TYPE_SRQ);
+ bnxt_qplib_arm_srq(srq, DBC_DBC_TYPE_SRQ);
if (srq->arm_req == true && count > srq->threshold) {
srq->arm_req = false;
- bnxt_qplib_arm_srq(srq, DBR_DBR_TYPE_SRQ_ARM);
+ bnxt_qplib_arm_srq(srq, DBC_DBC_TYPE_SRQ_ARM);
}
done:
return rc;
@@ -853,18 +862,19 @@ exit:
int bnxt_qplib_create_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp)
{
struct bnxt_qplib_rcfw *rcfw = res->rcfw;
- struct sq_send *hw_sq_send_hdr, **hw_sq_send_ptr;
- struct cmdq_create_qp req;
- struct creq_create_qp_resp resp;
- struct bnxt_qplib_pbl *pbl;
- struct sq_psn_search **psn_search_ptr;
unsigned long int psn_search, poff = 0;
+ struct sq_psn_search **psn_search_ptr;
struct bnxt_qplib_q *sq = &qp->sq;
struct bnxt_qplib_q *rq = &qp->rq;
+ int i, rc, req_size, psn_sz = 0;
+ struct sq_send **hw_sq_send_ptr;
+ struct creq_create_qp_resp resp;
struct bnxt_qplib_hwq *xrrq;
- int i, rc, req_size, psn_sz;
u16 cmd_flags = 0, max_ssge;
- u32 sw_prod, qp_flags = 0;
+ struct cmdq_create_qp req;
+ struct bnxt_qplib_pbl *pbl;
+ u32 qp_flags = 0;
+ u16 max_rsge;
RCFW_CMD_PREP(req, CREATE_QP, cmd_flags);
@@ -874,8 +884,11 @@ int bnxt_qplib_create_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp)
req.qp_handle = cpu_to_le64(qp->qp_handle);
/* SQ */
- psn_sz = (qp->type == CMDQ_CREATE_QP_TYPE_RC) ?
- sizeof(struct sq_psn_search) : 0;
+ if (qp->type == CMDQ_CREATE_QP_TYPE_RC) {
+ psn_sz = bnxt_qplib_is_chip_gen_p5(res->cctx) ?
+ sizeof(struct sq_psn_search_ext) :
+ sizeof(struct sq_psn_search);
+ }
sq->hwq.max_elements = sq->max_wqe;
rc = bnxt_qplib_alloc_init_hwq(res->pdev, &sq->hwq, sq->sglist,
sq->nmap, &sq->hwq.max_elements,
@@ -905,10 +918,16 @@ int bnxt_qplib_create_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp)
poff = (psn_search & ~PAGE_MASK) /
BNXT_QPLIB_MAX_PSNE_ENTRY_SIZE;
}
- for (i = 0; i < sq->hwq.max_elements; i++)
+ for (i = 0; i < sq->hwq.max_elements; i++) {
sq->swq[i].psn_search =
&psn_search_ptr[get_psne_pg(i + poff)]
[get_psne_idx(i + poff)];
+ /*psns_ext will be used only for P5 chips. */
+ sq->swq[i].psn_ext =
+ (struct sq_psn_search_ext *)
+ &psn_search_ptr[get_psne_pg(i + poff)]
+ [get_psne_idx(i + poff)];
+ }
}
pbl = &sq->hwq.pbl[PBL_LVL_0];
req.sq_pbl = cpu_to_le64(pbl->pg_map_arr[0]);
@@ -929,14 +948,6 @@ int bnxt_qplib_create_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp)
CMDQ_CREATE_QP_SQ_PG_SIZE_PG_1G :
CMDQ_CREATE_QP_SQ_PG_SIZE_PG_4K);
- /* initialize all SQ WQEs to LOCAL_INVALID (sq prep for hw fetch) */
- hw_sq_send_ptr = (struct sq_send **)sq->hwq.pbl_ptr;
- for (sw_prod = 0; sw_prod < sq->hwq.max_elements; sw_prod++) {
- hw_sq_send_hdr = &hw_sq_send_ptr[get_sqe_pg(sw_prod)]
- [get_sqe_idx(sw_prod)];
- hw_sq_send_hdr->wqe_type = SQ_BASE_WQE_TYPE_LOCAL_INVALID;
- }
-
if (qp->scq)
req.scq_cid = cpu_to_le32(qp->scq->id);
@@ -1007,8 +1018,9 @@ int bnxt_qplib_create_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp)
req.sq_fwo_sq_sge = cpu_to_le16(
((max_ssge & CMDQ_CREATE_QP_SQ_SGE_MASK)
<< CMDQ_CREATE_QP_SQ_SGE_SFT) | 0);
+ max_rsge = bnxt_qplib_is_chip_gen_p5(res->cctx) ? 6 : rq->max_sge;
req.rq_fwo_rq_sge = cpu_to_le16(
- ((rq->max_sge & CMDQ_CREATE_QP_RQ_SGE_MASK)
+ ((max_rsge & CMDQ_CREATE_QP_RQ_SGE_MASK)
<< CMDQ_CREATE_QP_RQ_SGE_SFT) | 0);
/* ORRQ and IRRQ */
if (psn_sz) {
@@ -1053,6 +1065,7 @@ int bnxt_qplib_create_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp)
qp->id = le32_to_cpu(resp.xid);
qp->cur_qp_state = CMDQ_MODIFY_QP_NEW_STATE_RESET;
+ qp->cctx = res->cctx;
INIT_LIST_HEAD(&qp->sq_flush);
INIT_LIST_HEAD(&qp->rq_flush);
rcfw->qp_tbl[qp->id].qp_id = qp->id;
@@ -1494,19 +1507,16 @@ void *bnxt_qplib_get_qp1_rq_buf(struct bnxt_qplib_qp *qp,
void bnxt_qplib_post_send_db(struct bnxt_qplib_qp *qp)
{
struct bnxt_qplib_q *sq = &qp->sq;
- struct dbr_dbr db_msg = { 0 };
u32 sw_prod;
+ u64 val = 0;
+ val = (((qp->id << DBC_DBC_XID_SFT) & DBC_DBC_XID_MASK) |
+ DBC_DBC_TYPE_SQ);
+ val <<= 32;
sw_prod = HWQ_CMP(sq->hwq.prod, &sq->hwq);
-
- db_msg.index = cpu_to_le32((sw_prod << DBR_DBR_INDEX_SFT) &
- DBR_DBR_INDEX_MASK);
- db_msg.type_xid =
- cpu_to_le32(((qp->id << DBR_DBR_XID_SFT) & DBR_DBR_XID_MASK) |
- DBR_DBR_TYPE_SQ);
+ val |= (sw_prod << DBC_DBC_INDEX_SFT) & DBC_DBC_INDEX_MASK;
/* Flush all the WQE writes to HW */
- wmb();
- __iowrite64_copy(qp->dpi->dbr, &db_msg, sizeof(db_msg) / sizeof(u64));
+ writeq(val, qp->dpi->dbr);
}
int bnxt_qplib_post_send(struct bnxt_qplib_qp *qp,
@@ -1617,7 +1627,8 @@ int bnxt_qplib_post_send(struct bnxt_qplib_qp *qp,
((offsetof(typeof(*sqe), data) + 15) >> 4);
sqe->inv_key_or_imm_data = cpu_to_le32(
wqe->send.inv_key);
- if (qp->type == CMDQ_CREATE_QP_TYPE_UD) {
+ if (qp->type == CMDQ_CREATE_QP_TYPE_UD ||
+ qp->type == CMDQ_CREATE_QP_TYPE_GSI) {
sqe->q_key = cpu_to_le32(wqe->send.q_key);
sqe->dst_qp = cpu_to_le32(
wqe->send.dst_qp & SQ_SEND_DST_QP_MASK);
@@ -1741,14 +1752,26 @@ int bnxt_qplib_post_send(struct bnxt_qplib_qp *qp,
}
swq->next_psn = sq->psn & BTH_PSN_MASK;
if (swq->psn_search) {
- swq->psn_search->opcode_start_psn = cpu_to_le32(
- ((swq->start_psn << SQ_PSN_SEARCH_START_PSN_SFT) &
- SQ_PSN_SEARCH_START_PSN_MASK) |
- ((wqe->type << SQ_PSN_SEARCH_OPCODE_SFT) &
- SQ_PSN_SEARCH_OPCODE_MASK));
- swq->psn_search->flags_next_psn = cpu_to_le32(
- ((swq->next_psn << SQ_PSN_SEARCH_NEXT_PSN_SFT) &
- SQ_PSN_SEARCH_NEXT_PSN_MASK));
+ u32 opcd_spsn;
+ u32 flg_npsn;
+
+ opcd_spsn = ((swq->start_psn << SQ_PSN_SEARCH_START_PSN_SFT) &
+ SQ_PSN_SEARCH_START_PSN_MASK);
+ opcd_spsn |= ((wqe->type << SQ_PSN_SEARCH_OPCODE_SFT) &
+ SQ_PSN_SEARCH_OPCODE_MASK);
+ flg_npsn = ((swq->next_psn << SQ_PSN_SEARCH_NEXT_PSN_SFT) &
+ SQ_PSN_SEARCH_NEXT_PSN_MASK);
+ if (bnxt_qplib_is_chip_gen_p5(qp->cctx)) {
+ swq->psn_ext->opcode_start_psn =
+ cpu_to_le32(opcd_spsn);
+ swq->psn_ext->flags_next_psn =
+ cpu_to_le32(flg_npsn);
+ } else {
+ swq->psn_search->opcode_start_psn =
+ cpu_to_le32(opcd_spsn);
+ swq->psn_search->flags_next_psn =
+ cpu_to_le32(flg_npsn);
+ }
}
queue_err:
if (sch_handler) {
@@ -1785,19 +1808,16 @@ done:
void bnxt_qplib_post_recv_db(struct bnxt_qplib_qp *qp)
{
struct bnxt_qplib_q *rq = &qp->rq;
- struct dbr_dbr db_msg = { 0 };
u32 sw_prod;
+ u64 val = 0;
+ val = (((qp->id << DBC_DBC_XID_SFT) & DBC_DBC_XID_MASK) |
+ DBC_DBC_TYPE_RQ);
+ val <<= 32;
sw_prod = HWQ_CMP(rq->hwq.prod, &rq->hwq);
- db_msg.index = cpu_to_le32((sw_prod << DBR_DBR_INDEX_SFT) &
- DBR_DBR_INDEX_MASK);
- db_msg.type_xid =
- cpu_to_le32(((qp->id << DBR_DBR_XID_SFT) & DBR_DBR_XID_MASK) |
- DBR_DBR_TYPE_RQ);
-
+ val |= (sw_prod << DBC_DBC_INDEX_SFT) & DBC_DBC_INDEX_MASK;
/* Flush the writes to HW Rx WQE before the ringing Rx DB */
- wmb();
- __iowrite64_copy(qp->dpi->dbr, &db_msg, sizeof(db_msg) / sizeof(u64));
+ writeq(val, qp->dpi->dbr);
}
int bnxt_qplib_post_recv(struct bnxt_qplib_qp *qp,
@@ -1881,32 +1901,28 @@ done:
/* Spinlock must be held */
static void bnxt_qplib_arm_cq_enable(struct bnxt_qplib_cq *cq)
{
- struct dbr_dbr db_msg = { 0 };
+ u64 val = 0;
- db_msg.type_xid =
- cpu_to_le32(((cq->id << DBR_DBR_XID_SFT) & DBR_DBR_XID_MASK) |
- DBR_DBR_TYPE_CQ_ARMENA);
+ val = ((cq->id << DBC_DBC_XID_SFT) & DBC_DBC_XID_MASK) |
+ DBC_DBC_TYPE_CQ_ARMENA;
+ val <<= 32;
/* Flush memory writes before enabling the CQ */
- wmb();
- __iowrite64_copy(cq->dbr_base, &db_msg, sizeof(db_msg) / sizeof(u64));
+ writeq(val, cq->dbr_base);
}
static void bnxt_qplib_arm_cq(struct bnxt_qplib_cq *cq, u32 arm_type)
{
struct bnxt_qplib_hwq *cq_hwq = &cq->hwq;
- struct dbr_dbr db_msg = { 0 };
u32 sw_cons;
+ u64 val = 0;
/* Ring DB */
+ val = ((cq->id << DBC_DBC_XID_SFT) & DBC_DBC_XID_MASK) | arm_type;
+ val <<= 32;
sw_cons = HWQ_CMP(cq_hwq->cons, cq_hwq);
- db_msg.index = cpu_to_le32((sw_cons << DBR_DBR_INDEX_SFT) &
- DBR_DBR_INDEX_MASK);
- db_msg.type_xid =
- cpu_to_le32(((cq->id << DBR_DBR_XID_SFT) & DBR_DBR_XID_MASK) |
- arm_type);
+ val |= (sw_cons << DBC_DBC_INDEX_SFT) & DBC_DBC_INDEX_MASK;
/* flush memory writes before arming the CQ */
- wmb();
- __iowrite64_copy(cq->dpi->dbr, &db_msg, sizeof(db_msg) / sizeof(u64));
+ writeq(val, cq->dpi->dbr);
}
int bnxt_qplib_create_cq(struct bnxt_qplib_res *res, struct bnxt_qplib_cq *cq)
@@ -2053,6 +2069,7 @@ static int __flush_rq(struct bnxt_qplib_q *rq, struct bnxt_qplib_qp *qp,
opcode = CQ_BASE_CQE_TYPE_RES_RC;
break;
case CMDQ_CREATE_QP_TYPE_UD:
+ case CMDQ_CREATE_QP_TYPE_GSI:
opcode = CQ_BASE_CQE_TYPE_RES_UD;
break;
}
@@ -2125,7 +2142,7 @@ static int do_wa9060(struct bnxt_qplib_qp *qp, struct bnxt_qplib_cq *cq,
sq->send_phantom = true;
/* TODO: Only ARM if the previous SQE is ARMALL */
- bnxt_qplib_arm_cq(cq, DBR_DBR_TYPE_CQ_ARMALL);
+ bnxt_qplib_arm_cq(cq, DBC_DBC_TYPE_CQ_ARMALL);
rc = -EAGAIN;
goto out;
@@ -2410,12 +2427,14 @@ static int bnxt_qplib_cq_process_res_ud(struct bnxt_qplib_cq *cq,
}
cqe = *pcqe;
cqe->opcode = hwcqe->cqe_type_toggle & CQ_BASE_CQE_TYPE_MASK;
- cqe->length = le32_to_cpu(hwcqe->length);
+ cqe->length = (u32)le16_to_cpu(hwcqe->length);
+ cqe->cfa_meta = le16_to_cpu(hwcqe->cfa_metadata);
cqe->invrkey = le32_to_cpu(hwcqe->imm_data);
cqe->flags = le16_to_cpu(hwcqe->flags);
cqe->status = hwcqe->status;
cqe->qp_handle = (u64)(unsigned long)qp;
- memcpy(cqe->smac, hwcqe->src_mac, 6);
+ /*FIXME: Endianness fix needed for smace */
+ memcpy(cqe->smac, hwcqe->src_mac, ETH_ALEN);
wr_id_idx = le32_to_cpu(hwcqe->src_qp_high_srq_or_rq_wr_id)
& CQ_RES_UD_SRQ_OR_RQ_WR_ID_MASK;
cqe->src_qp = le16_to_cpu(hwcqe->src_qp_low) |
@@ -2794,7 +2813,7 @@ int bnxt_qplib_poll_cq(struct bnxt_qplib_cq *cq, struct bnxt_qplib_cqe *cqe,
}
if (cq->hwq.cons != raw_cons) {
cq->hwq.cons = raw_cons;
- bnxt_qplib_arm_cq(cq, DBR_DBR_TYPE_CQ);
+ bnxt_qplib_arm_cq(cq, DBC_DBC_TYPE_CQ);
}
exit:
return num_cqes - budget;
diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.h b/drivers/infiniband/hw/bnxt_re/qplib_fp.h
index 72352ca80ace..3f618b5f1f06 100644
--- a/drivers/infiniband/hw/bnxt_re/qplib_fp.h
+++ b/drivers/infiniband/hw/bnxt_re/qplib_fp.h
@@ -106,6 +106,7 @@ struct bnxt_qplib_swq {
u32 start_psn;
u32 next_psn;
struct sq_psn_search *psn_search;
+ struct sq_psn_search_ext *psn_ext;
};
struct bnxt_qplib_swqe {
@@ -254,6 +255,7 @@ struct bnxt_qplib_q {
struct bnxt_qplib_qp {
struct bnxt_qplib_pd *pd;
struct bnxt_qplib_dpi *dpi;
+ struct bnxt_qplib_chip_ctx *cctx;
u64 qp_handle;
#define BNXT_QPLIB_QP_ID_INVALID 0xFFFFFFFF
u32 id;
@@ -347,6 +349,7 @@ struct bnxt_qplib_cqe {
u8 type;
u8 opcode;
u32 length;
+ u16 cfa_meta;
u64 wr_id;
union {
__be32 immdata;
@@ -432,13 +435,47 @@ struct bnxt_qplib_cq {
#define NQ_DB_CP_FLAGS (NQ_DB_KEY_CP | \
NQ_DB_IDX_VALID | \
NQ_DB_IRQ_DIS)
-#define NQ_DB_REARM(db, raw_cons, cp_bit) \
- writel(NQ_DB_CP_FLAGS_REARM | ((raw_cons) & ((cp_bit) - 1)), db)
-#define NQ_DB(db, raw_cons, cp_bit) \
- writel(NQ_DB_CP_FLAGS | ((raw_cons) & ((cp_bit) - 1)), db)
+
+static inline void bnxt_qplib_ring_nq_db64(void __iomem *db, u32 index,
+ u32 xid, bool arm)
+{
+ u64 val;
+
+ val = xid & DBC_DBC_XID_MASK;
+ val |= DBC_DBC_PATH_ROCE;
+ val |= arm ? DBC_DBC_TYPE_NQ_ARM : DBC_DBC_TYPE_NQ;
+ val <<= 32;
+ val |= index & DBC_DBC_INDEX_MASK;
+ writeq(val, db);
+}
+
+static inline void bnxt_qplib_ring_nq_db_rearm(void __iomem *db, u32 raw_cons,
+ u32 max_elements, u32 xid,
+ bool gen_p5)
+{
+ u32 index = raw_cons & (max_elements - 1);
+
+ if (gen_p5)
+ bnxt_qplib_ring_nq_db64(db, index, xid, true);
+ else
+ writel(NQ_DB_CP_FLAGS_REARM | (index & DBC_DBC32_XID_MASK), db);
+}
+
+static inline void bnxt_qplib_ring_nq_db(void __iomem *db, u32 raw_cons,
+ u32 max_elements, u32 xid,
+ bool gen_p5)
+{
+ u32 index = raw_cons & (max_elements - 1);
+
+ if (gen_p5)
+ bnxt_qplib_ring_nq_db64(db, index, xid, false);
+ else
+ writel(NQ_DB_CP_FLAGS | (index & DBC_DBC32_XID_MASK), db);
+}
struct bnxt_qplib_nq {
struct pci_dev *pdev;
+ struct bnxt_qplib_res *res;
int vector;
cpumask_t mask;
@@ -448,7 +485,7 @@ struct bnxt_qplib_nq {
struct bnxt_qplib_hwq hwq;
u16 bar_reg;
- u16 bar_reg_off;
+ u32 bar_reg_off;
u16 ring_id;
void __iomem *bar_reg_iomem;
diff --git a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c
index be4e33e9f962..c6461e957078 100644
--- a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c
+++ b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c
@@ -58,7 +58,7 @@ static int __wait_for_resp(struct bnxt_qplib_rcfw *rcfw, u16 cookie)
u16 cbit;
int rc;
- cbit = cookie % RCFW_MAX_OUTSTANDING_CMD;
+ cbit = cookie % rcfw->cmdq_depth;
rc = wait_event_timeout(rcfw->waitq,
!test_bit(cbit, rcfw->cmdq_bitmap),
msecs_to_jiffies(RCFW_CMD_WAIT_TIME_MS));
@@ -70,7 +70,7 @@ static int __block_for_resp(struct bnxt_qplib_rcfw *rcfw, u16 cookie)
u32 count = RCFW_BLOCKED_CMD_WAIT_COUNT;
u16 cbit;
- cbit = cookie % RCFW_MAX_OUTSTANDING_CMD;
+ cbit = cookie % rcfw->cmdq_depth;
if (!test_bit(cbit, rcfw->cmdq_bitmap))
goto done;
do {
@@ -86,6 +86,7 @@ static int __send_message(struct bnxt_qplib_rcfw *rcfw, struct cmdq_base *req,
{
struct bnxt_qplib_cmdqe *cmdqe, **cmdq_ptr;
struct bnxt_qplib_hwq *cmdq = &rcfw->cmdq;
+ u32 cmdq_depth = rcfw->cmdq_depth;
struct bnxt_qplib_crsq *crsqe;
u32 sw_prod, cmdq_prod;
unsigned long flags;
@@ -124,7 +125,7 @@ static int __send_message(struct bnxt_qplib_rcfw *rcfw, struct cmdq_base *req,
cookie = rcfw->seq_num & RCFW_MAX_COOKIE_VALUE;
- cbit = cookie % RCFW_MAX_OUTSTANDING_CMD;
+ cbit = cookie % rcfw->cmdq_depth;
if (is_block)
cookie |= RCFW_CMD_IS_BLOCKING;
@@ -153,7 +154,8 @@ static int __send_message(struct bnxt_qplib_rcfw *rcfw, struct cmdq_base *req,
do {
/* Locate the next cmdq slot */
sw_prod = HWQ_CMP(cmdq->prod, cmdq);
- cmdqe = &cmdq_ptr[get_cmdq_pg(sw_prod)][get_cmdq_idx(sw_prod)];
+ cmdqe = &cmdq_ptr[get_cmdq_pg(sw_prod, cmdq_depth)]
+ [get_cmdq_idx(sw_prod, cmdq_depth)];
if (!cmdqe) {
dev_err(&rcfw->pdev->dev,
"RCFW request failed with no cmdqe!\n");
@@ -326,7 +328,7 @@ static int bnxt_qplib_process_qp_event(struct bnxt_qplib_rcfw *rcfw,
mcookie = qp_event->cookie;
blocked = cookie & RCFW_CMD_IS_BLOCKING;
cookie &= RCFW_MAX_COOKIE_VALUE;
- cbit = cookie % RCFW_MAX_OUTSTANDING_CMD;
+ cbit = cookie % rcfw->cmdq_depth;
crsqe = &rcfw->crsqe_tbl[cbit];
if (crsqe->resp &&
crsqe->resp->cookie == mcookie) {
@@ -357,11 +359,12 @@ static int bnxt_qplib_process_qp_event(struct bnxt_qplib_rcfw *rcfw,
static void bnxt_qplib_service_creq(unsigned long data)
{
struct bnxt_qplib_rcfw *rcfw = (struct bnxt_qplib_rcfw *)data;
+ bool gen_p5 = bnxt_qplib_is_chip_gen_p5(rcfw->res->cctx);
struct bnxt_qplib_hwq *creq = &rcfw->creq;
+ u32 type, budget = CREQ_ENTRY_POLL_BUDGET;
struct creq_base *creqe, **creq_ptr;
u32 sw_cons, raw_cons;
unsigned long flags;
- u32 type, budget = CREQ_ENTRY_POLL_BUDGET;
/* Service the CREQ until budget is over */
spin_lock_irqsave(&creq->lock, flags);
@@ -405,8 +408,9 @@ static void bnxt_qplib_service_creq(unsigned long data)
if (creq->cons != raw_cons) {
creq->cons = raw_cons;
- CREQ_DB_REARM(rcfw->creq_bar_reg_iomem, raw_cons,
- creq->max_elements);
+ bnxt_qplib_ring_creq_db_rearm(rcfw->creq_bar_reg_iomem,
+ raw_cons, creq->max_elements,
+ rcfw->creq_ring_id, gen_p5);
}
spin_unlock_irqrestore(&creq->lock, flags);
}
@@ -478,11 +482,13 @@ int bnxt_qplib_init_rcfw(struct bnxt_qplib_rcfw *rcfw,
req.log2_dbr_pg_size = cpu_to_le16(PAGE_SHIFT -
RCFW_DBR_BASE_PAGE_SHIFT);
/*
- * VFs need not setup the HW context area, PF
+ * Gen P5 devices don't require this allocation
+ * as the L2 driver does the same for RoCE also.
+ * Also, VFs need not setup the HW context area, PF
* shall setup this area for VF. Skipping the
* HW programming
*/
- if (is_virtfn)
+ if (is_virtfn || bnxt_qplib_is_chip_gen_p5(rcfw->res->cctx))
goto skip_ctx_setup;
level = ctx->qpc_tbl.level;
@@ -555,23 +561,34 @@ void bnxt_qplib_free_rcfw_channel(struct bnxt_qplib_rcfw *rcfw)
int bnxt_qplib_alloc_rcfw_channel(struct pci_dev *pdev,
struct bnxt_qplib_rcfw *rcfw,
+ struct bnxt_qplib_ctx *ctx,
int qp_tbl_sz)
{
+ u8 hwq_type;
+
rcfw->pdev = pdev;
rcfw->creq.max_elements = BNXT_QPLIB_CREQE_MAX_CNT;
+ hwq_type = bnxt_qplib_get_hwq_type(rcfw->res);
if (bnxt_qplib_alloc_init_hwq(rcfw->pdev, &rcfw->creq, NULL, 0,
&rcfw->creq.max_elements,
- BNXT_QPLIB_CREQE_UNITS, 0, PAGE_SIZE,
- HWQ_TYPE_L2_CMPL)) {
+ BNXT_QPLIB_CREQE_UNITS,
+ 0, PAGE_SIZE, hwq_type)) {
dev_err(&rcfw->pdev->dev,
"HW channel CREQ allocation failed\n");
goto fail;
}
- rcfw->cmdq.max_elements = BNXT_QPLIB_CMDQE_MAX_CNT;
- if (bnxt_qplib_alloc_init_hwq(rcfw->pdev, &rcfw->cmdq, NULL, 0,
- &rcfw->cmdq.max_elements,
- BNXT_QPLIB_CMDQE_UNITS, 0, PAGE_SIZE,
- HWQ_TYPE_CTX)) {
+ if (ctx->hwrm_intf_ver < HWRM_VERSION_RCFW_CMDQ_DEPTH_CHECK)
+ rcfw->cmdq_depth = BNXT_QPLIB_CMDQE_MAX_CNT_256;
+ else
+ rcfw->cmdq_depth = BNXT_QPLIB_CMDQE_MAX_CNT_8192;
+
+ rcfw->cmdq.max_elements = rcfw->cmdq_depth;
+ if (bnxt_qplib_alloc_init_hwq
+ (rcfw->pdev, &rcfw->cmdq, NULL, 0,
+ &rcfw->cmdq.max_elements,
+ BNXT_QPLIB_CMDQE_UNITS, 0,
+ bnxt_qplib_cmdqe_page_size(rcfw->cmdq_depth),
+ HWQ_TYPE_CTX)) {
dev_err(&rcfw->pdev->dev,
"HW channel CMDQ allocation failed\n");
goto fail;
@@ -597,10 +614,13 @@ fail:
void bnxt_qplib_rcfw_stop_irq(struct bnxt_qplib_rcfw *rcfw, bool kill)
{
+ bool gen_p5 = bnxt_qplib_is_chip_gen_p5(rcfw->res->cctx);
+
tasklet_disable(&rcfw->worker);
/* Mask h/w interrupts */
- CREQ_DB(rcfw->creq_bar_reg_iomem, rcfw->creq.cons,
- rcfw->creq.max_elements);
+ bnxt_qplib_ring_creq_db(rcfw->creq_bar_reg_iomem, rcfw->creq.cons,
+ rcfw->creq.max_elements, rcfw->creq_ring_id,
+ gen_p5);
/* Sync with last running IRQ-handler */
synchronize_irq(rcfw->vector);
if (kill)
@@ -637,6 +657,7 @@ void bnxt_qplib_disable_rcfw_channel(struct bnxt_qplib_rcfw *rcfw)
int bnxt_qplib_rcfw_start_irq(struct bnxt_qplib_rcfw *rcfw, int msix_vector,
bool need_init)
{
+ bool gen_p5 = bnxt_qplib_is_chip_gen_p5(rcfw->res->cctx);
int rc;
if (rcfw->requested)
@@ -653,8 +674,9 @@ int bnxt_qplib_rcfw_start_irq(struct bnxt_qplib_rcfw *rcfw, int msix_vector,
if (rc)
return rc;
rcfw->requested = true;
- CREQ_DB_REARM(rcfw->creq_bar_reg_iomem, rcfw->creq.cons,
- rcfw->creq.max_elements);
+ bnxt_qplib_ring_creq_db_rearm(rcfw->creq_bar_reg_iomem,
+ rcfw->creq.cons, rcfw->creq.max_elements,
+ rcfw->creq_ring_id, gen_p5);
return 0;
}
@@ -674,8 +696,7 @@ int bnxt_qplib_enable_rcfw_channel(struct pci_dev *pdev,
/* General */
rcfw->seq_num = 0;
set_bit(FIRMWARE_FIRST_FLAG, &rcfw->flags);
- bmap_size = BITS_TO_LONGS(RCFW_MAX_OUTSTANDING_CMD *
- sizeof(unsigned long));
+ bmap_size = BITS_TO_LONGS(rcfw->cmdq_depth) * sizeof(unsigned long);
rcfw->cmdq_bitmap = kzalloc(bmap_size, GFP_KERNEL);
if (!rcfw->cmdq_bitmap)
return -ENOMEM;
@@ -708,8 +729,9 @@ int bnxt_qplib_enable_rcfw_channel(struct pci_dev *pdev,
dev_err(&rcfw->pdev->dev,
"CREQ BAR region %d resc start is 0!\n",
rcfw->creq_bar_reg);
+ /* Unconditionally map 8 bytes to support 57500 series */
rcfw->creq_bar_reg_iomem = ioremap_nocache(res_base + cp_bar_reg_off,
- 4);
+ 8);
if (!rcfw->creq_bar_reg_iomem) {
dev_err(&rcfw->pdev->dev, "CREQ BAR region %d mapping failed\n",
rcfw->creq_bar_reg);
@@ -734,7 +756,7 @@ int bnxt_qplib_enable_rcfw_channel(struct pci_dev *pdev,
init.cmdq_pbl = cpu_to_le64(rcfw->cmdq.pbl[PBL_LVL_0].pg_map_arr[0]);
init.cmdq_size_cmdq_lvl = cpu_to_le16(
- ((BNXT_QPLIB_CMDQE_MAX_CNT << CMDQ_INIT_CMDQ_SIZE_SFT) &
+ ((rcfw->cmdq_depth << CMDQ_INIT_CMDQ_SIZE_SFT) &
CMDQ_INIT_CMDQ_SIZE_MASK) |
((rcfw->cmdq.level << CMDQ_INIT_CMDQ_LVL_SFT) &
CMDQ_INIT_CMDQ_LVL_MASK));
@@ -756,8 +778,8 @@ struct bnxt_qplib_rcfw_sbuf *bnxt_qplib_rcfw_alloc_sbuf(
return NULL;
sbuf->size = size;
- sbuf->sb = dma_zalloc_coherent(&rcfw->pdev->dev, sbuf->size,
- &sbuf->dma_addr, GFP_ATOMIC);
+ sbuf->sb = dma_alloc_coherent(&rcfw->pdev->dev, sbuf->size,
+ &sbuf->dma_addr, GFP_ATOMIC);
if (!sbuf->sb)
goto bail;
diff --git a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h
index 9a8687dc0a79..2138533bb642 100644
--- a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h
+++ b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h
@@ -63,32 +63,60 @@
#define RCFW_CMD_WAIT_TIME_MS 20000 /* 20 Seconds timeout */
+/* Cmdq contains a fixed number of 16-byte slots */
+struct bnxt_qplib_cmdqe {
+ u8 data[16]; /* raw bytes of one command-queue slot */
+};
+
/* CMDQ elements */
-#define BNXT_QPLIB_CMDQE_MAX_CNT 256
+#define BNXT_QPLIB_CMDQE_MAX_CNT_256 256
+#define BNXT_QPLIB_CMDQE_MAX_CNT_8192 8192
#define BNXT_QPLIB_CMDQE_UNITS sizeof(struct bnxt_qplib_cmdqe)
-#define BNXT_QPLIB_CMDQE_CNT_PER_PG (PAGE_SIZE / BNXT_QPLIB_CMDQE_UNITS)
+#define BNXT_QPLIB_CMDQE_BYTES(depth) ((depth) * BNXT_QPLIB_CMDQE_UNITS)
+
+static inline u32 bnxt_qplib_cmdqe_npages(u32 depth) /* number of PAGE_SIZE pages needed to hold depth CMDQ slots, rounded up */
+{
+ u32 npages;
+
+ npages = BNXT_QPLIB_CMDQE_BYTES(depth) / PAGE_SIZE; /* whole pages covered by the queue's byte size */
+ if (BNXT_QPLIB_CMDQE_BYTES(depth) % PAGE_SIZE) /* a partial trailing page still needs a full page */
+ npages++;
+ return npages;
+}
-#define MAX_CMDQ_IDX (BNXT_QPLIB_CMDQE_MAX_CNT - 1)
-#define MAX_CMDQ_IDX_PER_PG (BNXT_QPLIB_CMDQE_CNT_PER_PG - 1)
+static inline u32 bnxt_qplib_cmdqe_page_size(u32 depth) /* CMDQ byte size for depth slots, rounded up to a multiple of PAGE_SIZE */
+{
+ return (bnxt_qplib_cmdqe_npages(depth) * PAGE_SIZE);
+}
+
+static inline u32 bnxt_qplib_cmdqe_cnt_per_pg(u32 depth) /* number of 16-byte slots that fit in one CMDQ "page" of bnxt_qplib_cmdqe_page_size(depth) bytes */
+{
+ return (bnxt_qplib_cmdqe_page_size(depth) /
+ BNXT_QPLIB_CMDQE_UNITS);
+}
+
+#define MAX_CMDQ_IDX(depth) ((depth) - 1)
+
+static inline u32 bnxt_qplib_max_cmdq_idx_per_pg(u32 depth) /* highest slot index within one CMDQ page; used as a bit mask by get_cmdq_pg()/get_cmdq_idx() */
+{
+ return (bnxt_qplib_cmdqe_cnt_per_pg(depth) - 1); /* NOTE(review): only a valid mask if the per-page count is a power of two - confirm */
+}
-#define RCFW_MAX_OUTSTANDING_CMD BNXT_QPLIB_CMDQE_MAX_CNT
#define RCFW_MAX_COOKIE_VALUE 0x7FFF
#define RCFW_CMD_IS_BLOCKING 0x8000
#define RCFW_BLOCKED_CMD_WAIT_COUNT 0x4E20
-/* Cmdq contains a fix number of a 16-Byte slots */
-struct bnxt_qplib_cmdqe {
- u8 data[16];
-};
+#define HWRM_VERSION_RCFW_CMDQ_DEPTH_CHECK 0x1000900020011ULL
-static inline u32 get_cmdq_pg(u32 val)
+static inline u32 get_cmdq_pg(u32 val, u32 depth)
{
- return (val & ~MAX_CMDQ_IDX_PER_PG) / BNXT_QPLIB_CMDQE_CNT_PER_PG;
+ return (val & ~(bnxt_qplib_max_cmdq_idx_per_pg(depth))) /
+ (bnxt_qplib_cmdqe_cnt_per_pg(depth));
}
-static inline u32 get_cmdq_idx(u32 val)
+static inline u32 get_cmdq_idx(u32 val, u32 depth)
{
- return val & MAX_CMDQ_IDX_PER_PG;
+ return val & (bnxt_qplib_max_cmdq_idx_per_pg(depth));
}
/* Crsq buf is 1024-Byte */
@@ -129,10 +157,46 @@ static inline u32 get_creq_idx(u32 val)
#define CREQ_DB_CP_FLAGS (CREQ_DB_KEY_CP | \
CREQ_DB_IDX_VALID | \
CREQ_DB_IRQ_DIS)
-#define CREQ_DB_REARM(db, raw_cons, cp_bit) \
- writel(CREQ_DB_CP_FLAGS_REARM | ((raw_cons) & ((cp_bit) - 1)), db)
-#define CREQ_DB(db, raw_cons, cp_bit) \
- writel(CREQ_DB_CP_FLAGS | ((raw_cons) & ((cp_bit) - 1)), db)
+
+static inline void bnxt_qplib_ring_creq_db64(void __iomem *db, u32 index, /* compose and write one 64-bit doorbell word to db */
+ u32 xid, bool arm)
+{
+ u64 val = 0;
+
+ val = xid & DBC_DBC_XID_MASK; /* ring id goes into the XID field */
+ val |= DBC_DBC_PATH_ROCE;
+ val |= arm ? DBC_DBC_TYPE_NQ_ARM : DBC_DBC_TYPE_NQ; /* arm selects the re-arming doorbell type */
+ val <<= 32; /* xid/path/type occupy the upper 32 bits */
+ val |= index & DBC_DBC_INDEX_MASK; /* consumer index occupies the lower 32 bits */
+
+ writeq(val, db);
+}
+
+static inline void bnxt_qplib_ring_creq_db_rearm(void __iomem *db, u32 raw_cons, /* ring the CREQ doorbell at raw_cons and re-arm it */
+ u32 max_elements, u32 xid,
+ bool gen_p5)
+{
+ u32 index = raw_cons & (max_elements - 1); /* wrap raw_cons into the ring; assumes max_elements is a power of two - TODO confirm */
+
+ if (gen_p5)
+ bnxt_qplib_ring_creq_db64(db, index, xid, true); /* gen_p5: 64-bit doorbell, arm = true */
+ else
+ writel(CREQ_DB_CP_FLAGS_REARM | (index & DBC_DBC32_XID_MASK), /* otherwise: 32-bit doorbell with the REARM flag set; xid is unused here */
+ db);
+}
+
+/*
+ * Ring the CREQ doorbell at the current consumer index WITHOUT re-arming it.
+ *
+ * @db: mapped doorbell register
+ * @raw_cons: raw (unwrapped) consumer index
+ * @max_elements: ring size used to wrap raw_cons (assumed power of two)
+ * @xid: ring id, only used by the 64-bit doorbell format
+ * @gen_p5: true to use the 64-bit doorbell, false for the 32-bit one
+ *
+ * The 32-bit path uses CREQ_DB_CP_FLAGS, which carries CREQ_DB_IRQ_DIS, so
+ * the 64-bit path must use the non-arming doorbell type (arm == false).
+ * Passing true here made this helper identical to
+ * bnxt_qplib_ring_creq_db_rearm() for gen_p5 devices.
+ */
+static inline void bnxt_qplib_ring_creq_db(void __iomem *db, u32 raw_cons,
+ u32 max_elements, u32 xid,
+ bool gen_p5)
+{
+ u32 index = raw_cons & (max_elements - 1);
+
+ if (gen_p5)
+ bnxt_qplib_ring_creq_db64(db, index, xid, false);
+ else
+ writel(CREQ_DB_CP_FLAGS | (index & DBC_DBC32_XID_MASK),
+ db);
+}
#define CREQ_ENTRY_POLL_BUDGET 0x100
@@ -159,6 +223,7 @@ struct bnxt_qplib_qp_node {
/* RCFW Communication Channels */
struct bnxt_qplib_rcfw {
struct pci_dev *pdev;
+ struct bnxt_qplib_res *res;
int vector;
struct tasklet_struct worker;
bool requested;
@@ -194,11 +259,14 @@ struct bnxt_qplib_rcfw {
struct bnxt_qplib_qp_node *qp_tbl;
u64 oos_prev;
u32 init_oos_stats;
+ u32 cmdq_depth;
};
void bnxt_qplib_free_rcfw_channel(struct bnxt_qplib_rcfw *rcfw);
int bnxt_qplib_alloc_rcfw_channel(struct pci_dev *pdev,
- struct bnxt_qplib_rcfw *rcfw, int qp_tbl_sz);
+ struct bnxt_qplib_rcfw *rcfw,
+ struct bnxt_qplib_ctx *ctx,
+ int qp_tbl_sz);
void bnxt_qplib_rcfw_stop_irq(struct bnxt_qplib_rcfw *rcfw, bool kill);
void bnxt_qplib_disable_rcfw_channel(struct bnxt_qplib_rcfw *rcfw);
int bnxt_qplib_rcfw_start_irq(struct bnxt_qplib_rcfw *rcfw, int msix_vector,
diff --git a/drivers/infiniband/hw/bnxt_re/qplib_res.c b/drivers/infiniband/hw/bnxt_re/qplib_res.c
index 59eeac55626f..0bc24f934829 100644
--- a/drivers/infiniband/hw/bnxt_re/qplib_res.c
+++ b/drivers/infiniband/hw/bnxt_re/qplib_res.c
@@ -85,7 +85,7 @@ static void __free_pbl(struct pci_dev *pdev, struct bnxt_qplib_pbl *pbl,
static int __alloc_pbl(struct pci_dev *pdev, struct bnxt_qplib_pbl *pbl,
struct scatterlist *sghead, u32 pages, u32 pg_size)
{
- struct scatterlist *sg;
+ struct sg_dma_page_iter sg_iter;
bool is_umem = false;
int i;
@@ -105,10 +105,10 @@ static int __alloc_pbl(struct pci_dev *pdev, struct bnxt_qplib_pbl *pbl,
if (!sghead) {
for (i = 0; i < pages; i++) {
- pbl->pg_arr[i] = dma_zalloc_coherent(&pdev->dev,
- pbl->pg_size,
- &pbl->pg_map_arr[i],
- GFP_KERNEL);
+ pbl->pg_arr[i] = dma_alloc_coherent(&pdev->dev,
+ pbl->pg_size,
+ &pbl->pg_map_arr[i],
+ GFP_KERNEL);
if (!pbl->pg_arr[i])
goto fail;
pbl->pg_count++;
@@ -116,13 +116,11 @@ static int __alloc_pbl(struct pci_dev *pdev, struct bnxt_qplib_pbl *pbl,
} else {
i = 0;
is_umem = true;
- for_each_sg(sghead, sg, pages, i) {
- pbl->pg_map_arr[i] = sg_dma_address(sg);
- pbl->pg_arr[i] = sg_virt(sg);
- if (!pbl->pg_arr[i])
- goto fail;
-
+ for_each_sg_dma_page (sghead, &sg_iter, pages, 0) {
+ pbl->pg_map_arr[i] = sg_page_iter_dma_address(&sg_iter);
+ pbl->pg_arr[i] = NULL;
pbl->pg_count++;
+ i++;
}
}
@@ -330,13 +328,13 @@ void bnxt_qplib_free_ctx(struct pci_dev *pdev,
*/
int bnxt_qplib_alloc_ctx(struct pci_dev *pdev,
struct bnxt_qplib_ctx *ctx,
- bool virt_fn)
+ bool virt_fn, bool is_p5)
{
int i, j, k, rc = 0;
int fnz_idx = -1;
__le64 **pbl_ptr;
- if (virt_fn)
+ if (virt_fn || is_p5)
goto stats_alloc;
/* QPC Tables */
@@ -762,7 +760,11 @@ static int bnxt_qplib_alloc_stats_ctx(struct pci_dev *pdev,
{
memset(stats, 0, sizeof(*stats));
stats->fw_id = -1;
- stats->size = sizeof(struct ctx_hw_stats);
+ /* 128-byte aligned context memory is required only for 57500.
+ * However, applying the alignment unconditionally does not harm
+ * previous generations.
+ */
+ stats->size = ALIGN(sizeof(struct ctx_hw_stats), 128);
stats->dma = dma_alloc_coherent(&pdev->dev, stats->size,
&stats->dma_map, GFP_KERNEL);
if (!stats->dma) {
diff --git a/drivers/infiniband/hw/bnxt_re/qplib_res.h b/drivers/infiniband/hw/bnxt_re/qplib_res.h
index 2e5c052da5a9..32cebd0f1436 100644
--- a/drivers/infiniband/hw/bnxt_re/qplib_res.h
+++ b/drivers/infiniband/hw/bnxt_re/qplib_res.h
@@ -177,14 +177,23 @@ struct bnxt_qplib_ctx {
struct bnxt_qplib_hwq tqm_tbl[MAX_TQM_ALLOC_REQ];
struct bnxt_qplib_stats stats;
struct bnxt_qplib_vf_res vf_res;
+ u64 hwrm_intf_ver;
};
+struct bnxt_qplib_chip_ctx { /* chip identification, consulted via bnxt_qplib_is_chip_gen_p5() */
+ u16 chip_num; /* compared against CHIP_NUM_57500 to detect Gen P5 */
+ u8 chip_rev; /* NOTE(review): presumably silicon revision; not read in this chunk - confirm */
+ u8 chip_metal; /* NOTE(review): presumably metal revision; not read in this chunk - confirm */
+};
+
+#define CHIP_NUM_57500 0x1750
+
struct bnxt_qplib_res {
struct pci_dev *pdev;
+ struct bnxt_qplib_chip_ctx *cctx;
struct net_device *netdev;
struct bnxt_qplib_rcfw *rcfw;
-
struct bnxt_qplib_pd_tbl pd_tbl;
struct bnxt_qplib_sgid_tbl sgid_tbl;
struct bnxt_qplib_pkey_tbl pkey_tbl;
@@ -192,6 +201,24 @@ struct bnxt_qplib_res {
bool prio;
};
+static inline bool bnxt_qplib_is_chip_gen_p5(struct bnxt_qplib_chip_ctx *cctx) /* true when the chip number is CHIP_NUM_57500 */
+{
+ return (cctx->chip_num == CHIP_NUM_57500); /* cctx is dereferenced unconditionally; callers must pass a valid pointer */
+}
+
+static inline u8 bnxt_qplib_get_hwq_type(struct bnxt_qplib_res *res) /* pick the HWQ type for this chip generation */
+{
+ return bnxt_qplib_is_chip_gen_p5(res->cctx) ? /* Gen P5 -> HWQ_TYPE_QUEUE, otherwise HWQ_TYPE_L2_CMPL */
+ HWQ_TYPE_QUEUE : HWQ_TYPE_L2_CMPL;
+}
+
+static inline u8 bnxt_qplib_get_ring_type(struct bnxt_qplib_chip_ctx *cctx) /* pick the ring-alloc request type for this chip generation */
+{
+ return bnxt_qplib_is_chip_gen_p5(cctx) ? /* Gen P5 -> NQ ring, otherwise RoCE completion ring */
+ RING_ALLOC_REQ_RING_TYPE_NQ :
+ RING_ALLOC_REQ_RING_TYPE_ROCE_CMPL;
+}
+
#define to_bnxt_qplib(ptr, type, member) \
container_of(ptr, type, member)
@@ -225,5 +252,5 @@ void bnxt_qplib_free_ctx(struct pci_dev *pdev,
struct bnxt_qplib_ctx *ctx);
int bnxt_qplib_alloc_ctx(struct pci_dev *pdev,
struct bnxt_qplib_ctx *ctx,
- bool virt_fn);
+ bool virt_fn, bool is_p5);
#endif /* __BNXT_QPLIB_RES_H__ */
diff --git a/drivers/infiniband/hw/bnxt_re/qplib_sp.c b/drivers/infiniband/hw/bnxt_re/qplib_sp.c
index 5216b5f844cc..e9c53e406404 100644
--- a/drivers/infiniband/hw/bnxt_re/qplib_sp.c
+++ b/drivers/infiniband/hw/bnxt_re/qplib_sp.c
@@ -119,7 +119,8 @@ int bnxt_qplib_get_dev_attr(struct bnxt_qplib_rcfw *rcfw,
* reporting the max number
*/
attr->max_qp_wqes -= BNXT_QPLIB_RESERVED_QP_WRS;
- attr->max_qp_sges = sb->max_sge;
+ attr->max_qp_sges = bnxt_qplib_is_chip_gen_p5(rcfw->res->cctx) ?
+ 6 : sb->max_sge;
attr->max_cq = le32_to_cpu(sb->max_cq);
attr->max_cq_wqes = le32_to_cpu(sb->max_cqe);
attr->max_cq_sges = attr->max_qp_sges;
@@ -488,7 +489,8 @@ int bnxt_qplib_add_pkey(struct bnxt_qplib_res *res,
}
/* AH */
-int bnxt_qplib_create_ah(struct bnxt_qplib_res *res, struct bnxt_qplib_ah *ah)
+int bnxt_qplib_create_ah(struct bnxt_qplib_res *res, struct bnxt_qplib_ah *ah,
+ bool block)
{
struct bnxt_qplib_rcfw *rcfw = res->rcfw;
struct cmdq_create_ah req;
@@ -522,7 +524,7 @@ int bnxt_qplib_create_ah(struct bnxt_qplib_res *res, struct bnxt_qplib_ah *ah)
req.dest_mac[2] = cpu_to_le16(temp16[2]);
rc = bnxt_qplib_rcfw_send_message(rcfw, (void *)&req, (void *)&resp,
- NULL, 1);
+ NULL, block);
if (rc)
return rc;
@@ -530,7 +532,8 @@ int bnxt_qplib_create_ah(struct bnxt_qplib_res *res, struct bnxt_qplib_ah *ah)
return 0;
}
-int bnxt_qplib_destroy_ah(struct bnxt_qplib_res *res, struct bnxt_qplib_ah *ah)
+int bnxt_qplib_destroy_ah(struct bnxt_qplib_res *res, struct bnxt_qplib_ah *ah,
+ bool block)
{
struct bnxt_qplib_rcfw *rcfw = res->rcfw;
struct cmdq_destroy_ah req;
@@ -544,7 +547,7 @@ int bnxt_qplib_destroy_ah(struct bnxt_qplib_res *res, struct bnxt_qplib_ah *ah)
req.ah_cid = cpu_to_le32(ah->id);
rc = bnxt_qplib_rcfw_send_message(rcfw, (void *)&req, (void *)&resp,
- NULL, 1);
+ NULL, block);
if (rc)
return rc;
return 0;
@@ -778,9 +781,8 @@ int bnxt_qplib_map_tc2cos(struct bnxt_qplib_res *res, u16 *cids)
req.cos0 = cpu_to_le16(cids[0]);
req.cos1 = cpu_to_le16(cids[1]);
- bnxt_qplib_rcfw_send_message(rcfw, (void *)&req, (void *)&resp, NULL,
- 0);
- return 0;
+ return bnxt_qplib_rcfw_send_message(rcfw, (void *)&req, (void *)&resp,
+ NULL, 0);
}
int bnxt_qplib_get_roce_stats(struct bnxt_qplib_rcfw *rcfw,
diff --git a/drivers/infiniband/hw/bnxt_re/qplib_sp.h b/drivers/infiniband/hw/bnxt_re/qplib_sp.h
index 8079d7f5a008..39454b3f738d 100644
--- a/drivers/infiniband/hw/bnxt_re/qplib_sp.h
+++ b/drivers/infiniband/hw/bnxt_re/qplib_sp.h
@@ -241,8 +241,10 @@ int bnxt_qplib_get_dev_attr(struct bnxt_qplib_rcfw *rcfw,
int bnxt_qplib_set_func_resources(struct bnxt_qplib_res *res,
struct bnxt_qplib_rcfw *rcfw,
struct bnxt_qplib_ctx *ctx);
-int bnxt_qplib_create_ah(struct bnxt_qplib_res *res, struct bnxt_qplib_ah *ah);
-int bnxt_qplib_destroy_ah(struct bnxt_qplib_res *res, struct bnxt_qplib_ah *ah);
+int bnxt_qplib_create_ah(struct bnxt_qplib_res *res, struct bnxt_qplib_ah *ah,
+ bool block);
+int bnxt_qplib_destroy_ah(struct bnxt_qplib_res *res, struct bnxt_qplib_ah *ah,
+ bool block);
int bnxt_qplib_alloc_mrw(struct bnxt_qplib_res *res,
struct bnxt_qplib_mrw *mrw);
int bnxt_qplib_dereg_mrw(struct bnxt_qplib_res *res, struct bnxt_qplib_mrw *mrw,
diff --git a/drivers/infiniband/hw/bnxt_re/roce_hsi.h b/drivers/infiniband/hw/bnxt_re/roce_hsi.h
index 8a9ead419ac2..e4b09e7c2175 100644
--- a/drivers/infiniband/hw/bnxt_re/roce_hsi.h
+++ b/drivers/infiniband/hw/bnxt_re/roce_hsi.h
@@ -49,11 +49,11 @@ struct cmpl_doorbell {
#define CMPL_DOORBELL_IDX_SFT 0
#define CMPL_DOORBELL_RESERVED_MASK 0x3000000UL
#define CMPL_DOORBELL_RESERVED_SFT 24
- #define CMPL_DOORBELL_IDX_VALID 0x4000000UL
+ #define CMPL_DOORBELL_IDX_VALID 0x4000000UL
#define CMPL_DOORBELL_MASK 0x8000000UL
#define CMPL_DOORBELL_KEY_MASK 0xf0000000UL
#define CMPL_DOORBELL_KEY_SFT 28
- #define CMPL_DOORBELL_KEY_CMPL (0x2UL << 28)
+ #define CMPL_DOORBELL_KEY_CMPL (0x2UL << 28)
};
/* Status Door Bell Format (4 bytes) */
@@ -71,46 +71,56 @@ struct status_doorbell {
/* RoCE Host Structures */
/* Doorbell Structures */
-/* 64b Doorbell Format (8 bytes) */
-struct dbr_dbr {
- __le32 index;
- #define DBR_DBR_INDEX_MASK 0xfffffUL
- #define DBR_DBR_INDEX_SFT 0
- #define DBR_DBR_RESERVED12_MASK 0xfff00000UL
- #define DBR_DBR_RESERVED12_SFT 20
- __le32 type_xid;
- #define DBR_DBR_XID_MASK 0xfffffUL
- #define DBR_DBR_XID_SFT 0
- #define DBR_DBR_RESERVED8_MASK 0xff00000UL
- #define DBR_DBR_RESERVED8_SFT 20
- #define DBR_DBR_TYPE_MASK 0xf0000000UL
- #define DBR_DBR_TYPE_SFT 28
- #define DBR_DBR_TYPE_SQ (0x0UL << 28)
- #define DBR_DBR_TYPE_RQ (0x1UL << 28)
- #define DBR_DBR_TYPE_SRQ (0x2UL << 28)
- #define DBR_DBR_TYPE_SRQ_ARM (0x3UL << 28)
- #define DBR_DBR_TYPE_CQ (0x4UL << 28)
- #define DBR_DBR_TYPE_CQ_ARMSE (0x5UL << 28)
- #define DBR_DBR_TYPE_CQ_ARMALL (0x6UL << 28)
- #define DBR_DBR_TYPE_CQ_ARMENA (0x7UL << 28)
- #define DBR_DBR_TYPE_SRQ_ARMENA (0x8UL << 28)
- #define DBR_DBR_TYPE_CQ_CUTOFF_ACK (0x9UL << 28)
- #define DBR_DBR_TYPE_NULL (0xfUL << 28)
-};
-
-/* 32b Doorbell Format (4 bytes) */
-struct dbr_dbr32 {
- __le32 type_abs_incr_xid;
- #define DBR_DBR32_XID_MASK 0xfffffUL
- #define DBR_DBR32_XID_SFT 0
- #define DBR_DBR32_RESERVED4_MASK 0xf00000UL
- #define DBR_DBR32_RESERVED4_SFT 20
- #define DBR_DBR32_INCR_MASK 0xf000000UL
- #define DBR_DBR32_INCR_SFT 24
- #define DBR_DBR32_ABS 0x10000000UL
- #define DBR_DBR32_TYPE_MASK 0xe0000000UL
- #define DBR_DBR32_TYPE_SFT 29
- #define DBR_DBR32_TYPE_SQ (0x0UL << 29)
+/* dbc_dbc (size:64b/8B) - 64-bit doorbell layout; the DBC_DBC_* masks below select fields within each 32-bit word */
+struct dbc_dbc {
+ __le32 index; /* first word: ring index, masked with DBC_DBC_INDEX_MASK */
+ #define DBC_DBC_INDEX_MASK 0xffffffUL
+ #define DBC_DBC_INDEX_SFT 0
+ __le32 type_path_xid; /* second word: xid in bits 19:0, path in bits 25:24, debug trace bit 27, type in bits 31:28 */
+ #define DBC_DBC_XID_MASK 0xfffffUL
+ #define DBC_DBC_XID_SFT 0
+ #define DBC_DBC_PATH_MASK 0x3000000UL
+ #define DBC_DBC_PATH_SFT 24
+ #define DBC_DBC_PATH_ROCE (0x0UL << 24)
+ #define DBC_DBC_PATH_L2 (0x1UL << 24)
+ #define DBC_DBC_PATH_ENGINE (0x2UL << 24)
+ #define DBC_DBC_PATH_LAST DBC_DBC_PATH_ENGINE
+ #define DBC_DBC_DEBUG_TRACE 0x8000000UL
+ #define DBC_DBC_TYPE_MASK 0xf0000000UL
+ #define DBC_DBC_TYPE_SFT 28
+ #define DBC_DBC_TYPE_SQ (0x0UL << 28)
+ #define DBC_DBC_TYPE_RQ (0x1UL << 28)
+ #define DBC_DBC_TYPE_SRQ (0x2UL << 28)
+ #define DBC_DBC_TYPE_SRQ_ARM (0x3UL << 28)
+ #define DBC_DBC_TYPE_CQ (0x4UL << 28)
+ #define DBC_DBC_TYPE_CQ_ARMSE (0x5UL << 28)
+ #define DBC_DBC_TYPE_CQ_ARMALL (0x6UL << 28)
+ #define DBC_DBC_TYPE_CQ_ARMENA (0x7UL << 28)
+ #define DBC_DBC_TYPE_SRQ_ARMENA (0x8UL << 28)
+ #define DBC_DBC_TYPE_CQ_CUTOFF_ACK (0x9UL << 28)
+ #define DBC_DBC_TYPE_NQ (0xaUL << 28)
+ #define DBC_DBC_TYPE_NQ_ARM (0xbUL << 28)
+ #define DBC_DBC_TYPE_NULL (0xfUL << 28)
+ #define DBC_DBC_TYPE_LAST DBC_DBC_TYPE_NULL
+};
+
+/* dbc_dbc32 (size:32b/4B) - 32-bit doorbell layout; all fields share one word */
+struct dbc_dbc32 {
+ __le32 type_abs_incr_xid; /* xid in bits 19:0, path in bits 23:22, incr in bits 27:24, abs flag bit 28, type in bits 31:29 */
+ #define DBC_DBC32_XID_MASK 0xfffffUL
+ #define DBC_DBC32_XID_SFT 0
+ #define DBC_DBC32_PATH_MASK 0xc00000UL
+ #define DBC_DBC32_PATH_SFT 22
+ #define DBC_DBC32_PATH_ROCE (0x0UL << 22)
+ #define DBC_DBC32_PATH_L2 (0x1UL << 22)
+ #define DBC_DBC32_PATH_LAST DBC_DBC32_PATH_L2
+ #define DBC_DBC32_INCR_MASK 0xf000000UL
+ #define DBC_DBC32_INCR_SFT 24
+ #define DBC_DBC32_ABS 0x10000000UL
+ #define DBC_DBC32_TYPE_MASK 0xe0000000UL
+ #define DBC_DBC32_TYPE_SFT 29
+ #define DBC_DBC32_TYPE_SQ (0x0UL << 29)
+ #define DBC_DBC32_TYPE_LAST DBC_DBC32_TYPE_SQ
};
/* SQ WQE Structures */
@@ -149,7 +159,24 @@ struct sq_psn_search {
#define SQ_PSN_SEARCH_NEXT_PSN_MASK 0xffffffUL
#define SQ_PSN_SEARCH_NEXT_PSN_SFT 0
#define SQ_PSN_SEARCH_FLAGS_MASK 0xff000000UL
- #define SQ_PSN_SEARCH_FLAGS_SFT 24
+ #define SQ_PSN_SEARCH_FLAGS_SFT 24
+};
+
+/* sq_psn_search_ext (size:128b/16B) - extended PSN search entry; masks below select the packed sub-fields */
+struct sq_psn_search_ext {
+ __le32 opcode_start_psn; /* start PSN in bits 23:0, opcode in bits 31:24 */
+ #define SQ_PSN_SEARCH_EXT_START_PSN_MASK 0xffffffUL
+ #define SQ_PSN_SEARCH_EXT_START_PSN_SFT 0
+ #define SQ_PSN_SEARCH_EXT_OPCODE_MASK 0xff000000UL
+ #define SQ_PSN_SEARCH_EXT_OPCODE_SFT 24
+ __le32 flags_next_psn; /* next PSN in bits 23:0, flags in bits 31:24 */
+ #define SQ_PSN_SEARCH_EXT_NEXT_PSN_MASK 0xffffffUL
+ #define SQ_PSN_SEARCH_EXT_NEXT_PSN_SFT 0
+ #define SQ_PSN_SEARCH_EXT_FLAGS_MASK 0xff000000UL
+ #define SQ_PSN_SEARCH_EXT_FLAGS_SFT 24
+ __le16 start_slot_idx;
+ __le16 reserved16; /* padding; together with reserved32 brings the struct to 16 bytes */
+ __le32 reserved32;
};
/* Send SQ WQE (40 bytes) */
@@ -505,22 +532,24 @@ struct cq_res_rc {
/* Responder UD CQE (32 bytes) */
struct cq_res_ud {
- __le32 length;
+ __le16 length;
#define CQ_RES_UD_LENGTH_MASK 0x3fffUL
#define CQ_RES_UD_LENGTH_SFT 0
- #define CQ_RES_UD_RESERVED18_MASK 0xffffc000UL
- #define CQ_RES_UD_RESERVED18_SFT 14
+ __le16 cfa_metadata;
+ #define CQ_RES_UD_CFA_METADATA_VID_MASK 0xfffUL
+ #define CQ_RES_UD_CFA_METADATA_VID_SFT 0
+ #define CQ_RES_UD_CFA_METADATA_DE 0x1000UL
+ #define CQ_RES_UD_CFA_METADATA_PRI_MASK 0xe000UL
+ #define CQ_RES_UD_CFA_METADATA_PRI_SFT 13
__le32 imm_data;
__le64 qp_handle;
__le16 src_mac[3];
__le16 src_qp_low;
u8 cqe_type_toggle;
- #define CQ_RES_UD_TOGGLE 0x1UL
- #define CQ_RES_UD_CQE_TYPE_MASK 0x1eUL
- #define CQ_RES_UD_CQE_TYPE_SFT 1
+ #define CQ_RES_UD_TOGGLE 0x1UL
+ #define CQ_RES_UD_CQE_TYPE_MASK 0x1eUL
+ #define CQ_RES_UD_CQE_TYPE_SFT 1
#define CQ_RES_UD_CQE_TYPE_RES_UD (0x2UL << 1)
- #define CQ_RES_UD_RESERVED3_MASK 0xe0UL
- #define CQ_RES_UD_RESERVED3_SFT 5
u8 status;
#define CQ_RES_UD_STATUS_OK 0x0UL
#define CQ_RES_UD_STATUS_LOCAL_ACCESS_ERROR 0x1UL
@@ -536,18 +565,30 @@ struct cq_res_ud {
#define CQ_RES_UD_FLAGS_SRQ_SRQ (0x1UL << 0)
#define CQ_RES_UD_FLAGS_SRQ_LAST CQ_RES_UD_FLAGS_SRQ_SRQ
#define CQ_RES_UD_FLAGS_IMM 0x2UL
- #define CQ_RES_UD_FLAGS_ROCE_IP_VER_MASK 0xcUL
- #define CQ_RES_UD_FLAGS_ROCE_IP_VER_SFT 2
- #define CQ_RES_UD_FLAGS_ROCE_IP_VER_V1 (0x0UL << 2)
- #define CQ_RES_UD_FLAGS_ROCE_IP_VER_V2IPV4 (0x2UL << 2)
- #define CQ_RES_UD_FLAGS_ROCE_IP_VER_V2IPV6 (0x3UL << 2)
+ #define CQ_RES_UD_FLAGS_UNUSED_MASK 0xcUL
+ #define CQ_RES_UD_FLAGS_UNUSED_SFT 2
+ #define CQ_RES_UD_FLAGS_ROCE_IP_VER_MASK 0x30UL
+ #define CQ_RES_UD_FLAGS_ROCE_IP_VER_SFT 4
+ #define CQ_RES_UD_FLAGS_ROCE_IP_VER_V1 (0x0UL << 4)
+ #define CQ_RES_UD_FLAGS_ROCE_IP_VER_V2IPV4 (0x2UL << 4)
+ #define CQ_RES_UD_FLAGS_ROCE_IP_VER_V2IPV6 (0x3UL << 4)
#define CQ_RES_UD_FLAGS_ROCE_IP_VER_LAST \
CQ_RES_UD_FLAGS_ROCE_IP_VER_V2IPV6
+ #define CQ_RES_UD_FLAGS_META_FORMAT_MASK 0x3c0UL
+ #define CQ_RES_UD_FLAGS_META_FORMAT_SFT 6
+ #define CQ_RES_UD_FLAGS_META_FORMAT_NONE (0x0UL << 6)
+ #define CQ_RES_UD_FLAGS_META_FORMAT_VLAN (0x1UL << 6)
+ #define CQ_RES_UD_FLAGS_META_FORMAT_TUNNEL_ID (0x2UL << 6)
+ #define CQ_RES_UD_FLAGS_META_FORMAT_CHDR_DATA (0x3UL << 6)
+ #define CQ_RES_UD_FLAGS_META_FORMAT_HDR_OFFSET (0x4UL << 6)
+ #define CQ_RES_UD_FLAGS_META_FORMAT_LAST \
+ CQ_RES_UD_FLAGS_META_FORMAT_HDR_OFFSET
+ #define CQ_RES_UD_FLAGS_EXT_META_FORMAT_MASK 0xc00UL
+ #define CQ_RES_UD_FLAGS_EXT_META_FORMAT_SFT 10
+
__le32 src_qp_high_srq_or_rq_wr_id;
#define CQ_RES_UD_SRQ_OR_RQ_WR_ID_MASK 0xfffffUL
#define CQ_RES_UD_SRQ_OR_RQ_WR_ID_SFT 0
- #define CQ_RES_UD_RESERVED4_MASK 0xf00000UL
- #define CQ_RES_UD_RESERVED4_SFT 20
#define CQ_RES_UD_SRC_QP_HIGH_MASK 0xff000000UL
#define CQ_RES_UD_SRC_QP_HIGH_SFT 24
};
@@ -983,6 +1024,7 @@ struct cmdq_create_qp {
#define CMDQ_CREATE_QP_TYPE_RC 0x2UL
#define CMDQ_CREATE_QP_TYPE_UD 0x4UL
#define CMDQ_CREATE_QP_TYPE_RAW_ETHERTYPE 0x6UL
+ #define CMDQ_CREATE_QP_TYPE_GSI 0x7UL
u8 sq_pg_size_sq_lvl;
#define CMDQ_CREATE_QP_SQ_LVL_MASK 0xfUL
#define CMDQ_CREATE_QP_SQ_LVL_SFT 0
@@ -2719,6 +2761,8 @@ struct creq_query_func_resp_sb {
__le16 max_srq;
__le32 max_gid;
__le32 tqm_alloc_reqs[12];
+ __le32 max_dpi;
+ __le32 reserved_32;
};
/* Set resources command response (16 bytes) */
diff --git a/drivers/infiniband/hw/cxgb3/Makefile b/drivers/infiniband/hw/cxgb3/Makefile
index 66fe0917aba0..34bb86a6ae3a 100644
--- a/drivers/infiniband/hw/cxgb3/Makefile
+++ b/drivers/infiniband/hw/cxgb3/Makefile
@@ -1,5 +1,5 @@
# SPDX-License-Identifier: GPL-2.0
-ccflags-y := -Idrivers/net/ethernet/chelsio/cxgb3
+ccflags-y := -I $(srctree)/drivers/net/ethernet/chelsio/cxgb3
obj-$(CONFIG_INFINIBAND_CXGB3) += iw_cxgb3.o
diff --git a/drivers/infiniband/hw/cxgb3/cxio_hal.c b/drivers/infiniband/hw/cxgb3/cxio_hal.c
index dcb4bba522ba..8ac72ac7cbac 100644
--- a/drivers/infiniband/hw/cxgb3/cxio_hal.c
+++ b/drivers/infiniband/hw/cxgb3/cxio_hal.c
@@ -292,12 +292,11 @@ int cxio_create_qp(struct cxio_rdev *rdev_p, u32 kernel_domain,
goto err3;
wq->queue = dma_alloc_coherent(&(rdev_p->rnic_info.pdev->dev),
- depth * sizeof(union t3_wr),
- &(wq->dma_addr), GFP_KERNEL);
+ depth * sizeof(union t3_wr),
+ &(wq->dma_addr), GFP_KERNEL);
if (!wq->queue)
goto err4;
- memset(wq->queue, 0, depth * sizeof(union t3_wr));
dma_unmap_addr_set(wq, mapping, wq->dma_addr);
wq->doorbell = (void __iomem *)rdev_p->rnic_info.kdb_addr;
if (!kernel_domain)
diff --git a/drivers/infiniband/hw/cxgb3/iwch.c b/drivers/infiniband/hw/cxgb3/iwch.c
index 591de319c178..fb03bc492ef7 100644
--- a/drivers/infiniband/hw/cxgb3/iwch.c
+++ b/drivers/infiniband/hw/cxgb3/iwch.c
@@ -146,7 +146,7 @@ static void open_rnic_dev(struct t3cdev *tdev)
pr_debug("%s t3cdev %p\n", __func__, tdev);
pr_info_once("Chelsio T3 RDMA Driver - version %s\n", DRV_VERSION);
- rnicp = (struct iwch_dev *)ib_alloc_device(sizeof(*rnicp));
+ rnicp = ib_alloc_device(iwch_dev, ibdev);
if (!rnicp) {
pr_err("Cannot allocate ib device\n");
return;
diff --git a/drivers/infiniband/hw/cxgb3/iwch_provider.c b/drivers/infiniband/hw/cxgb3/iwch_provider.c
index ebbec02cebe0..4accf7b3dcf2 100644
--- a/drivers/infiniband/hw/cxgb3/iwch_provider.c
+++ b/drivers/infiniband/hw/cxgb3/iwch_provider.c
@@ -53,6 +53,7 @@
#include <rdma/ib_smi.h>
#include <rdma/ib_umem.h>
#include <rdma/ib_user_verbs.h>
+#include <rdma/uverbs_ioctl.h>
#include "cxio_hal.h"
#include "iwch.h"
@@ -61,7 +62,7 @@
#include <rdma/cxgb3-abi.h>
#include "common.h"
-static int iwch_dealloc_ucontext(struct ib_ucontext *context)
+static void iwch_dealloc_ucontext(struct ib_ucontext *context)
{
struct iwch_dev *rhp = to_iwch_dev(context->device);
struct iwch_ucontext *ucontext = to_iwch_ucontext(context);
@@ -71,24 +72,20 @@ static int iwch_dealloc_ucontext(struct ib_ucontext *context)
list_for_each_entry_safe(mm, tmp, &ucontext->mmaps, entry)
kfree(mm);
cxio_release_ucontext(&rhp->rdev, &ucontext->uctx);
- kfree(ucontext);
- return 0;
}
-static struct ib_ucontext *iwch_alloc_ucontext(struct ib_device *ibdev,
- struct ib_udata *udata)
+static int iwch_alloc_ucontext(struct ib_ucontext *ucontext,
+ struct ib_udata *udata)
{
- struct iwch_ucontext *context;
+ struct ib_device *ibdev = ucontext->device;
+ struct iwch_ucontext *context = to_iwch_ucontext(ucontext);
struct iwch_dev *rhp = to_iwch_dev(ibdev);
pr_debug("%s ibdev %p\n", __func__, ibdev);
- context = kzalloc(sizeof(*context), GFP_KERNEL);
- if (!context)
- return ERR_PTR(-ENOMEM);
cxio_init_ucontext(&rhp->rdev, &context->uctx);
INIT_LIST_HEAD(&context->mmaps);
spin_lock_init(&context->mmap_lock);
- return &context->ibucontext;
+ return 0;
}
static int iwch_destroy_cq(struct ib_cq *ib_cq)
@@ -370,7 +367,7 @@ static int iwch_mmap(struct ib_ucontext *context, struct vm_area_struct *vma)
return ret;
}
-static int iwch_deallocate_pd(struct ib_pd *pd)
+static void iwch_deallocate_pd(struct ib_pd *pd)
{
struct iwch_dev *rhp;
struct iwch_pd *php;
@@ -379,15 +376,13 @@ static int iwch_deallocate_pd(struct ib_pd *pd)
rhp = php->rhp;
pr_debug("%s ibpd %p pdid 0x%x\n", __func__, pd, php->pdid);
cxio_hal_put_pdid(rhp->rdev.rscp, php->pdid);
- kfree(php);
- return 0;
}
-static struct ib_pd *iwch_allocate_pd(struct ib_device *ibdev,
- struct ib_ucontext *context,
- struct ib_udata *udata)
+static int iwch_allocate_pd(struct ib_pd *pd, struct ib_ucontext *context,
+ struct ib_udata *udata)
{
- struct iwch_pd *php;
+ struct iwch_pd *php = to_iwch_pd(pd);
+ struct ib_device *ibdev = pd->device;
u32 pdid;
struct iwch_dev *rhp;
@@ -395,12 +390,8 @@ static struct ib_pd *iwch_allocate_pd(struct ib_device *ibdev,
rhp = (struct iwch_dev *) ibdev;
pdid = cxio_hal_get_pdid(rhp->rdev.rscp);
if (!pdid)
- return ERR_PTR(-EINVAL);
- php = kzalloc(sizeof(*php), GFP_KERNEL);
- if (!php) {
- cxio_hal_put_pdid(rhp->rdev.rscp, pdid);
- return ERR_PTR(-ENOMEM);
- }
+ return -EINVAL;
+
php->pdid = pdid;
php->rhp = rhp;
if (context) {
@@ -408,11 +399,11 @@ static struct ib_pd *iwch_allocate_pd(struct ib_device *ibdev,
if (ib_copy_to_udata(udata, &resp, sizeof(resp))) {
iwch_deallocate_pd(&php->ibpd);
- return ERR_PTR(-EFAULT);
+ return -EFAULT;
}
}
pr_debug("%s pdid 0x%0x ptr 0x%p\n", __func__, pdid, php);
- return &php->ibpd;
+ return 0;
}
static int iwch_dereg_mr(struct ib_mr *ib_mr)
@@ -522,14 +513,13 @@ static struct ib_mr *iwch_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
u64 virt, int acc, struct ib_udata *udata)
{
__be64 *pages;
- int shift, n, len;
- int i, k, entry;
+ int shift, n, i;
int err = 0;
struct iwch_dev *rhp;
struct iwch_pd *php;
struct iwch_mr *mhp;
struct iwch_reg_user_mr_resp uresp;
- struct scatterlist *sg;
+ struct sg_dma_page_iter sg_iter;
pr_debug("%s ib_pd %p\n", __func__, pd);
php = to_iwch_pd(pd);
@@ -540,14 +530,14 @@ static struct ib_mr *iwch_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
mhp->rhp = rhp;
- mhp->umem = ib_umem_get(pd->uobject->context, start, length, acc, 0);
+ mhp->umem = ib_umem_get(udata, start, length, acc, 0);
if (IS_ERR(mhp->umem)) {
err = PTR_ERR(mhp->umem);
kfree(mhp);
return ERR_PTR(err);
}
- shift = mhp->umem->page_shift;
+ shift = PAGE_SHIFT;
n = mhp->umem->nmap;
@@ -563,19 +553,15 @@ static struct ib_mr *iwch_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
i = n = 0;
- for_each_sg(mhp->umem->sg_head.sgl, sg, mhp->umem->nmap, entry) {
- len = sg_dma_len(sg) >> shift;
- for (k = 0; k < len; ++k) {
- pages[i++] = cpu_to_be64(sg_dma_address(sg) +
- (k << shift));
- if (i == PAGE_SIZE / sizeof *pages) {
- err = iwch_write_pbl(mhp, pages, i, n);
- if (err)
- goto pbl_done;
- n += i;
- i = 0;
- }
- }
+ for_each_sg_dma_page(mhp->umem->sg_head.sgl, &sg_iter, mhp->umem->nmap, 0) {
+ pages[i++] = cpu_to_be64(sg_page_iter_dma_address(&sg_iter));
+ if (i == PAGE_SIZE / sizeof *pages) {
+ err = iwch_write_pbl(mhp, pages, i, n);
+ if (err)
+ goto pbl_done;
+ n += i;
+ i = 0;
+ }
}
if (i)
@@ -836,7 +822,8 @@ static struct ib_qp *iwch_create_qp(struct ib_pd *pd,
* Kernel users need more wq space for fastreg WRs which can take
* 2 WR fragments.
*/
- ucontext = pd->uobject ? to_iwch_ucontext(pd->uobject->context) : NULL;
+ ucontext = rdma_udata_to_drv_context(udata, struct iwch_ucontext,
+ ibucontext);
if (!ucontext && wqsize < (rqsize + (2 * sqsize)))
wqsize = roundup_pow_of_two(rqsize +
roundup_pow_of_two(attrs->cap.max_send_wr * 2));
@@ -1130,8 +1117,9 @@ static int iwch_query_port(struct ib_device *ibdev,
static ssize_t hw_rev_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
- struct iwch_dev *iwch_dev = container_of(dev, struct iwch_dev,
- ibdev.dev);
+ struct iwch_dev *iwch_dev =
+ rdma_device_to_drv_device(dev, struct iwch_dev, ibdev);
+
pr_debug("%s dev 0x%p\n", __func__, dev);
return sprintf(buf, "%d\n", iwch_dev->rdev.t3cdev_p->type);
}
@@ -1140,8 +1128,8 @@ static DEVICE_ATTR_RO(hw_rev);
static ssize_t hca_type_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
- struct iwch_dev *iwch_dev = container_of(dev, struct iwch_dev,
- ibdev.dev);
+ struct iwch_dev *iwch_dev =
+ rdma_device_to_drv_device(dev, struct iwch_dev, ibdev);
struct ethtool_drvinfo info;
struct net_device *lldev = iwch_dev->rdev.t3cdev_p->lldev;
@@ -1154,8 +1142,9 @@ static DEVICE_ATTR_RO(hca_type);
static ssize_t board_id_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
- struct iwch_dev *iwch_dev = container_of(dev, struct iwch_dev,
- ibdev.dev);
+ struct iwch_dev *iwch_dev =
+ rdma_device_to_drv_device(dev, struct iwch_dev, ibdev);
+
pr_debug("%s dev 0x%p\n", __func__, dev);
return sprintf(buf, "%x.%x\n", iwch_dev->rdev.rnic_info.pdev->vendor,
iwch_dev->rdev.rnic_info.pdev->device);
@@ -1317,6 +1306,41 @@ static void get_dev_fw_ver_str(struct ib_device *ibdev, char *str)
snprintf(str, IB_FW_VERSION_NAME_MAX, "%s", info.fw_version);
}
+/*
+ * Verb callback table for the cxgb3 iWARP device.  It replaces the
+ * per-field dev->ibdev.<op> assignments that iwch_register_device() used
+ * to make and is installed there with ib_set_device_ops().
+ *
+ * The two INIT_RDMA_OBJ_SIZE() entries name the driver structures that
+ * embed ib_pd and ib_ucontext.  iwch_allocate_pd() no longer allocates
+ * its own iwch_pd, so presumably the core uses these sizes to allocate
+ * the objects on the driver's behalf -- confirm against the macro.
+ */
+static const struct ib_device_ops iwch_dev_ops = {
+ .alloc_hw_stats = iwch_alloc_stats,
+ .alloc_mr = iwch_alloc_mr,
+ .alloc_mw = iwch_alloc_mw,
+ .alloc_pd = iwch_allocate_pd,
+ .alloc_ucontext = iwch_alloc_ucontext,
+ .create_cq = iwch_create_cq,
+ .create_qp = iwch_create_qp,
+ .dealloc_mw = iwch_dealloc_mw,
+ .dealloc_pd = iwch_deallocate_pd,
+ .dealloc_ucontext = iwch_dealloc_ucontext,
+ .dereg_mr = iwch_dereg_mr,
+ .destroy_cq = iwch_destroy_cq,
+ .destroy_qp = iwch_destroy_qp,
+ .get_dev_fw_str = get_dev_fw_ver_str,
+ .get_dma_mr = iwch_get_dma_mr,
+ .get_hw_stats = iwch_get_mib,
+ .get_port_immutable = iwch_port_immutable,
+ .map_mr_sg = iwch_map_mr_sg,
+ .mmap = iwch_mmap,
+ .modify_qp = iwch_ib_modify_qp,
+ .poll_cq = iwch_poll_cq,
+ .post_recv = iwch_post_receive,
+ .post_send = iwch_post_send,
+ .query_device = iwch_query_device,
+ .query_gid = iwch_query_gid,
+ .query_pkey = iwch_query_pkey,
+ .query_port = iwch_query_port,
+ .reg_user_mr = iwch_reg_user_mr,
+ .req_notify_cq = iwch_arm_cq,
+ .resize_cq = iwch_resize_cq,
+ INIT_RDMA_OBJ_SIZE(ib_pd, iwch_pd, ibpd),
+ INIT_RDMA_OBJ_SIZE(ib_ucontext, iwch_ucontext, ibucontext),
+};
+
int iwch_register_device(struct iwch_dev *dev)
{
int ret;
@@ -1356,39 +1380,9 @@ int iwch_register_device(struct iwch_dev *dev)
dev->ibdev.phys_port_cnt = dev->rdev.port_info.nports;
dev->ibdev.num_comp_vectors = 1;
dev->ibdev.dev.parent = &dev->rdev.rnic_info.pdev->dev;
- dev->ibdev.query_device = iwch_query_device;
- dev->ibdev.query_port = iwch_query_port;
- dev->ibdev.query_pkey = iwch_query_pkey;
- dev->ibdev.query_gid = iwch_query_gid;
- dev->ibdev.alloc_ucontext = iwch_alloc_ucontext;
- dev->ibdev.dealloc_ucontext = iwch_dealloc_ucontext;
- dev->ibdev.mmap = iwch_mmap;
- dev->ibdev.alloc_pd = iwch_allocate_pd;
- dev->ibdev.dealloc_pd = iwch_deallocate_pd;
- dev->ibdev.create_qp = iwch_create_qp;
- dev->ibdev.modify_qp = iwch_ib_modify_qp;
- dev->ibdev.destroy_qp = iwch_destroy_qp;
- dev->ibdev.create_cq = iwch_create_cq;
- dev->ibdev.destroy_cq = iwch_destroy_cq;
- dev->ibdev.resize_cq = iwch_resize_cq;
- dev->ibdev.poll_cq = iwch_poll_cq;
- dev->ibdev.get_dma_mr = iwch_get_dma_mr;
- dev->ibdev.reg_user_mr = iwch_reg_user_mr;
- dev->ibdev.dereg_mr = iwch_dereg_mr;
- dev->ibdev.alloc_mw = iwch_alloc_mw;
- dev->ibdev.dealloc_mw = iwch_dealloc_mw;
- dev->ibdev.alloc_mr = iwch_alloc_mr;
- dev->ibdev.map_mr_sg = iwch_map_mr_sg;
- dev->ibdev.req_notify_cq = iwch_arm_cq;
- dev->ibdev.post_send = iwch_post_send;
- dev->ibdev.post_recv = iwch_post_receive;
- dev->ibdev.alloc_hw_stats = iwch_alloc_stats;
- dev->ibdev.get_hw_stats = iwch_get_mib;
dev->ibdev.uverbs_abi_ver = IWCH_UVERBS_ABI_VERSION;
- dev->ibdev.get_port_immutable = iwch_port_immutable;
- dev->ibdev.get_dev_fw_str = get_dev_fw_ver_str;
- dev->ibdev.iwcm = kmalloc(sizeof(struct iw_cm_verbs), GFP_KERNEL);
+ dev->ibdev.iwcm = kzalloc(sizeof(struct iw_cm_verbs), GFP_KERNEL);
if (!dev->ibdev.iwcm)
return -ENOMEM;
@@ -1405,7 +1399,8 @@ int iwch_register_device(struct iwch_dev *dev)
dev->ibdev.driver_id = RDMA_DRIVER_CXGB3;
rdma_set_device_sysfs_group(&dev->ibdev, &iwch_attr_group);
- ret = ib_register_device(&dev->ibdev, "cxgb3_%d", NULL);
+ ib_set_device_ops(&dev->ibdev, &iwch_dev_ops);
+ ret = ib_register_device(&dev->ibdev, "cxgb3_%d");
if (ret)
kfree(dev->ibdev.iwcm);
return ret;
diff --git a/drivers/infiniband/hw/cxgb4/Makefile b/drivers/infiniband/hw/cxgb4/Makefile
index 9edd92023e18..31a87d90a40b 100644
--- a/drivers/infiniband/hw/cxgb4/Makefile
+++ b/drivers/infiniband/hw/cxgb4/Makefile
@@ -1,5 +1,5 @@
-ccflags-y := -Idrivers/net/ethernet/chelsio/cxgb4
-ccflags-y += -Idrivers/net/ethernet/chelsio/libcxgb
+ccflags-y := -I $(srctree)/drivers/net/ethernet/chelsio/cxgb4
+ccflags-y += -I $(srctree)/drivers/net/ethernet/chelsio/libcxgb
obj-$(CONFIG_INFINIBAND_CXGB4) += iw_cxgb4.o
diff --git a/drivers/infiniband/hw/cxgb4/cm.c b/drivers/infiniband/hw/cxgb4/cm.c
index 615413bd3e8d..4d232bdf9e97 100644
--- a/drivers/infiniband/hw/cxgb4/cm.c
+++ b/drivers/infiniband/hw/cxgb4/cm.c
@@ -655,7 +655,33 @@ static int send_halfclose(struct c4iw_ep *ep)
return c4iw_l2t_send(&ep->com.dev->rdev, skb, ep->l2t);
}
-static int send_abort(struct c4iw_ep *ep)
+/*
+ * read_tcb - post a CPL_GET_TCB request for the endpoint's hardware TID.
+ * @ep: endpoint whose TCB (ep->hwtid) is to be read
+ *
+ * The request names ep->rss_qid in reply_ctrl; the CPL_GET_TCB_RPL reply
+ * is handled by read_tcb_rpl().  Nothing is returned to the caller: an
+ * skb allocation failure or a failed send only raises a WARN.
+ */
+static void read_tcb(struct c4iw_ep *ep)
+{
+	struct sk_buff *skb;
+	struct cpl_get_tcb *req;
+	int wrlen = roundup(sizeof(*req), 16);
+
+	/*
+	 * Allocate the rounded-up length: skb_put() below consumes wrlen
+	 * bytes, which can exceed sizeof(*req).
+	 */
+	skb = get_skb(NULL, wrlen, GFP_KERNEL);
+	if (WARN_ON(!skb))
+		return;
+
+	set_wr_txq(skb, CPL_PRIORITY_CONTROL, ep->ctrlq_idx);
+	req = (struct cpl_get_tcb *) skb_put(skb, wrlen);
+	memset(req, 0, wrlen);
+	INIT_TP_WR(req, ep->hwtid);
+	OPCODE_TID(req) = cpu_to_be32(MK_OPCODE_TID(CPL_GET_TCB, ep->hwtid));
+	req->reply_ctrl = htons(REPLY_CHAN_V(0) | QUEUENO_V(ep->rss_qid));
+
+	/*
+	 * Hold a reference on the ep while the request is outstanding;
+	 * read_tcb_rpl() releases it.  If the send fails no reply will
+	 * arrive, so the reference is dropped here instead.
+	 */
+	c4iw_get_ep(&ep->com);
+	if (WARN_ON(c4iw_ofld_send(&ep->com.dev->rdev, skb)))
+		c4iw_put_ep(&ep->com);
+}
+
+static int send_abort_req(struct c4iw_ep *ep)
{
u32 wrlen = roundup(sizeof(struct cpl_abort_req), 16);
struct sk_buff *req_skb = skb_dequeue(&ep->com.ep_skb_list);
@@ -670,6 +696,17 @@ static int send_abort(struct c4iw_ep *ep)
return c4iw_l2t_send(&ep->com.dev->rdev, req_skb, ep->l2t);
}
+/*
+ * send_abort - begin aborting the connection behind @ep.
+ *
+ * When the QP is attached to an SRQ, the abort request is deferred: the
+ * ABORT_REQ_IN_PROGRESS flag is set and the TCB is read first, and
+ * read_tcb_rpl() issues send_abort_req() once the reply arrives.
+ * Otherwise the abort request is sent straight away.
+ *
+ * Always returns 0; the result of send_abort_req() is not propagated.
+ */
+static int send_abort(struct c4iw_ep *ep)
+{
+	if (ep->com.qp && ep->com.qp->srq) {
+		set_bit(ABORT_REQ_IN_PROGRESS, &ep->com.flags);
+		read_tcb(ep);
+	} else {
+		send_abort_req(ep);
+	}
+
+	return 0;
+}
+
static int send_connect(struct c4iw_ep *ep)
{
struct cpl_act_open_req *req = NULL;
@@ -1851,14 +1888,11 @@ static int rx_data(struct c4iw_dev *dev, struct sk_buff *skb)
return 0;
}
-static void complete_cached_srq_buffers(struct c4iw_ep *ep,
- __be32 srqidx_status)
+static void complete_cached_srq_buffers(struct c4iw_ep *ep, u32 srqidx)
{
enum chip_type adapter_type;
- u32 srqidx;
adapter_type = ep->com.dev->rdev.lldi.adapter_type;
- srqidx = ABORT_RSS_SRQIDX_G(be32_to_cpu(srqidx_status));
/*
* If this TCB had a srq buffer cached, then we must complete
@@ -1876,6 +1910,7 @@ static void complete_cached_srq_buffers(struct c4iw_ep *ep,
static int abort_rpl(struct c4iw_dev *dev, struct sk_buff *skb)
{
+ u32 srqidx;
struct c4iw_ep *ep;
struct cpl_abort_rpl_rss6 *rpl = cplhdr(skb);
int release = 0;
@@ -1887,7 +1922,10 @@ static int abort_rpl(struct c4iw_dev *dev, struct sk_buff *skb)
return 0;
}
- complete_cached_srq_buffers(ep, rpl->srqidx_status);
+ if (ep->com.qp && ep->com.qp->srq) {
+ srqidx = ABORT_RSS_SRQIDX_G(be32_to_cpu(rpl->srqidx_status));
+ complete_cached_srq_buffers(ep, srqidx ? srqidx : ep->srqe_idx);
+ }
pr_debug("ep %p tid %u\n", ep, ep->hwtid);
mutex_lock(&ep->com.mutex);
@@ -1903,8 +1941,10 @@ static int abort_rpl(struct c4iw_dev *dev, struct sk_buff *skb)
}
mutex_unlock(&ep->com.mutex);
- if (release)
+ if (release) {
+ close_complete_upcall(ep, -ECONNRESET);
release_ep_resources(ep);
+ }
c4iw_put_ep(&ep->com);
return 0;
}
@@ -2058,8 +2098,7 @@ static int import_ep(struct c4iw_ep *ep, int iptype, __u8 *peer_ip,
}
ep->mtu = pdev->mtu;
ep->tx_chan = cxgb4_port_chan(pdev);
- ep->smac_idx = cxgb4_tp_smt_idx(adapter_type,
- cxgb4_port_viid(pdev));
+ ep->smac_idx = ((struct port_info *)netdev_priv(pdev))->smt_idx;
step = cdev->rdev.lldi.ntxq /
cdev->rdev.lldi.nchan;
ep->txq_idx = cxgb4_port_idx(pdev) * step;
@@ -2073,13 +2112,12 @@ static int import_ep(struct c4iw_ep *ep, int iptype, __u8 *peer_ip,
} else {
pdev = get_real_dev(n->dev);
ep->l2t = cxgb4_l2t_get(cdev->rdev.lldi.l2t,
- n, pdev, 0);
+ n, pdev, rt_tos2priority(tos));
if (!ep->l2t)
goto out;
ep->mtu = dst_mtu(dst);
ep->tx_chan = cxgb4_port_chan(pdev);
- ep->smac_idx = cxgb4_tp_smt_idx(adapter_type,
- cxgb4_port_viid(pdev));
+ ep->smac_idx = ((struct port_info *)netdev_priv(pdev))->smt_idx;
step = cdev->rdev.lldi.ntxq /
cdev->rdev.lldi.nchan;
ep->txq_idx = cxgb4_port_idx(pdev) * step;
@@ -2163,7 +2201,8 @@ static int c4iw_reconnect(struct c4iw_ep *ep)
laddr6->sin6_addr.s6_addr,
raddr6->sin6_addr.s6_addr,
laddr6->sin6_port,
- raddr6->sin6_port, 0,
+ raddr6->sin6_port,
+ ep->com.cm_id->tos,
raddr6->sin6_scope_id);
iptype = 6;
ra = (__u8 *)&raddr6->sin6_addr;
@@ -2478,7 +2517,7 @@ static int pass_accept_req(struct c4iw_dev *dev, struct sk_buff *skb)
u16 peer_mss = ntohs(req->tcpopt.mss);
int iptype;
unsigned short hdrs;
- u8 tos = PASS_OPEN_TOS_G(ntohl(req->tos_stid));
+ u8 tos;
parent_ep = (struct c4iw_ep *)get_ep_from_stid(dev, stid);
if (!parent_ep) {
@@ -2492,6 +2531,11 @@ static int pass_accept_req(struct c4iw_dev *dev, struct sk_buff *skb)
goto reject;
}
+ if (parent_ep->com.cm_id->tos_set)
+ tos = parent_ep->com.cm_id->tos;
+ else
+ tos = PASS_OPEN_TOS_G(ntohl(req->tos_stid));
+
cxgb_get_4tuple(req, parent_ep->com.dev->rdev.lldi.adapter_type,
&iptype, local_ip, peer_ip, &local_port, &peer_port);
@@ -2511,7 +2555,7 @@ static int pass_accept_req(struct c4iw_dev *dev, struct sk_buff *skb)
ntohs(peer_port), peer_mss);
dst = cxgb_find_route6(&dev->rdev.lldi, get_real_dev,
local_ip, peer_ip, local_port, peer_port,
- PASS_OPEN_TOS_G(ntohl(req->tos_stid)),
+ tos,
((struct sockaddr_in6 *)
&parent_ep->com.local_addr)->sin6_scope_id);
}
@@ -2742,6 +2786,21 @@ static int peer_close(struct c4iw_dev *dev, struct sk_buff *skb)
return 0;
}
+/*
+ * finish_peer_abort - complete a peer abort that was deferred for a TCB read.
+ * @dev: owning device (not used in the body)
+ * @ep:  endpoint being aborted
+ *
+ * Called from read_tcb_rpl() when PEER_ABORT_IN_PROGRESS is set.  It
+ * completes the cached SRQ buffer identified by ep->srqe_idx, moves the
+ * QP (if any) to the ERROR state, delivers the peer-abort upcall and
+ * releases the endpoint resources.  The final c4iw_put_ep() pairs with
+ * the reference peer_abort() takes before starting the TCB read.
+ */
+static void finish_peer_abort(struct c4iw_dev *dev, struct c4iw_ep *ep)
+{
+	struct c4iw_qp_attributes err_attrs;
+
+	complete_cached_srq_buffers(ep, ep->srqe_idx);
+
+	if (ep->com.cm_id && ep->com.qp) {
+		err_attrs.next_state = C4IW_QP_STATE_ERROR;
+		c4iw_modify_qp(ep->com.qp->rhp, ep->com.qp,
+			       C4IW_QP_ATTR_NEXT_STATE, &err_attrs, 1);
+	}
+
+	peer_abort_upcall(ep);
+	release_ep_resources(ep);
+	c4iw_put_ep(&ep->com);
+}
+
static int peer_abort(struct c4iw_dev *dev, struct sk_buff *skb)
{
struct cpl_abort_req_rss6 *req = cplhdr(skb);
@@ -2752,6 +2811,7 @@ static int peer_abort(struct c4iw_dev *dev, struct sk_buff *skb)
int release = 0;
unsigned int tid = GET_TID(req);
u8 status;
+ u32 srqidx;
u32 len = roundup(sizeof(struct cpl_abort_rpl), 16);
@@ -2771,8 +2831,6 @@ static int peer_abort(struct c4iw_dev *dev, struct sk_buff *skb)
goto deref_ep;
}
- complete_cached_srq_buffers(ep, req->srqidx_status);
-
pr_debug("ep %p tid %u state %u\n", ep, ep->hwtid,
ep->com.state);
set_bit(PEER_ABORT, &ep->com.history);
@@ -2795,7 +2853,8 @@ static int peer_abort(struct c4iw_dev *dev, struct sk_buff *skb)
break;
case MPA_REQ_SENT:
(void)stop_ep_timer(ep);
- if (mpa_rev == 1 || (mpa_rev == 2 && ep->tried_with_mpa_v1))
+ if (status != CPL_ERR_CONN_RESET || mpa_rev == 1 ||
+ (mpa_rev == 2 && ep->tried_with_mpa_v1))
connect_reply_upcall(ep, -ECONNRESET);
else {
/*
@@ -2820,6 +2879,23 @@ static int peer_abort(struct c4iw_dev *dev, struct sk_buff *skb)
stop_ep_timer(ep);
/*FALLTHROUGH*/
case FPDU_MODE:
+		if (ep->com.qp && ep->com.qp->srq) {
+			srqidx = ABORT_RSS_SRQIDX_G(
+					be32_to_cpu(req->srqidx_status));
+			if (srqidx) {
+				/*
+				 * complete_cached_srq_buffers() takes the
+				 * decoded CPU-order index, so pass srqidx
+				 * rather than the raw big-endian status word.
+				 */
+				complete_cached_srq_buffers(ep, srqidx);
+			} else {
+				/*
+				 * No index in the abort message: read the TCB
+				 * and let read_tcb_rpl() finish the abort.
+				 * Hold ep ref until finish_peer_abort().
+				 */
+				c4iw_get_ep(&ep->com);
+				__state_set(&ep->com, ABORTING);
+				set_bit(PEER_ABORT_IN_PROGRESS, &ep->com.flags);
+				read_tcb(ep);
+				break;
+			}
+		}
+
if (ep->com.cm_id && ep->com.qp) {
attrs.next_state = C4IW_QP_STATE_ERROR;
ret = c4iw_modify_qp(ep->com.qp->rhp,
@@ -2943,15 +3019,18 @@ static int terminate(struct c4iw_dev *dev, struct sk_buff *skb)
ep = get_ep_from_tid(dev, tid);
- if (ep && ep->com.qp) {
- pr_warn("TERM received tid %u qpid %u\n",
- tid, ep->com.qp->wq.sq.qid);
- attrs.next_state = C4IW_QP_STATE_TERMINATE;
- c4iw_modify_qp(ep->com.qp->rhp, ep->com.qp,
- C4IW_QP_ATTR_NEXT_STATE, &attrs, 1);
+ if (ep) {
+ if (ep->com.qp) {
+ pr_warn("TERM received tid %u qpid %u\n", tid,
+ ep->com.qp->wq.sq.qid);
+ attrs.next_state = C4IW_QP_STATE_TERMINATE;
+ c4iw_modify_qp(ep->com.qp->rhp, ep->com.qp,
+ C4IW_QP_ATTR_NEXT_STATE, &attrs, 1);
+ }
+
+ c4iw_put_ep(&ep->com);
} else
pr_warn("TERM received tid %u no ep/qp\n", tid);
- c4iw_put_ep(&ep->com);
return 0;
}
@@ -3319,7 +3398,7 @@ int c4iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)
laddr6->sin6_addr.s6_addr,
raddr6->sin6_addr.s6_addr,
laddr6->sin6_port,
- raddr6->sin6_port, 0,
+ raddr6->sin6_port, cm_id->tos,
raddr6->sin6_scope_id);
}
if (!ep->dst) {
@@ -3607,7 +3686,6 @@ int c4iw_ep_disconnect(struct c4iw_ep *ep, int abrupt, gfp_t gfp)
if (close) {
if (abrupt) {
set_bit(EP_DISC_ABORT, &ep->com.history);
- close_complete_upcall(ep, -ECONNRESET);
ret = send_abort(ep);
} else {
set_bit(EP_DISC_CLOSE, &ep->com.history);
@@ -3718,6 +3796,80 @@ static void passive_ofld_conn_reply(struct c4iw_dev *dev, struct sk_buff *skb,
return;
}
+/*
+ * t4_tcb_get_field64 - extract a 64-bit value from a big-endian TCB image.
+ * @tcb:  TCB contents as an array of big-endian 64-bit words
+ * @word: word number of the field
+ *
+ * Reads the 64-bit elements at index (31 - word) / 2 and the one before
+ * it, and returns the upper 32 bits of the former as the low half joined
+ * with the lower 32 bits of the latter as the high half.
+ */
+static inline u64 t4_tcb_get_field64(__be64 *tcb, u16 word)
+{
+	const int idx = (31 - word) / 2;
+	u64 lo = be64_to_cpu(tcb[idx]);
+	u64 hi = be64_to_cpu(tcb[idx - 1]);
+
+	return (hi << 32) | (lo >> 32);
+}
+
+/*
+ * t4_tcb_get_field32 - extract a masked field from a big-endian TCB image.
+ * @tcb:   TCB contents as an array of big-endian 64-bit words
+ * @word:  word number of the field
+ * @mask:  mask applied after shifting
+ * @shift: bit offset of the field within its word
+ *
+ * The 64-bit element at index (31 - word) / 2 is converted to CPU order;
+ * odd word numbers select its upper half by adding 32 to the shift.
+ */
+static inline u32 t4_tcb_get_field32(__be64 *tcb, u16 word, u32 mask, u32 shift)
+{
+	u64 qword = be64_to_cpu(tcb[(31 - word) / 2]);
+	u32 bit_off = (word & 0x1) ? shift + 32 : shift;
+
+	return (qword >> bit_off) & mask;
+}
+
+/*
+ * read_tcb_rpl - handle the CPL_GET_TCB_RPL reply to read_tcb().
+ * @dev: device the reply arrived on
+ * @skb: reply message; the TCB image follows the cpl_get_tcb_rpl header
+ *
+ * Records the TCB's rq_start value in ep->srqe_idx and then resumes the
+ * abort that requested the read: finish_peer_abort() when
+ * PEER_ABORT_IN_PROGRESS is set, send_abort_req() when
+ * ABORT_REQ_IN_PROGRESS is set.  If the TCB still has TF_RX_PDU_OUT set
+ * the TCB is read again, bounded by ep->rx_pdu_out_cnt.
+ *
+ * Always returns 0, including when no endpoint matches the TID.
+ */
+static int read_tcb_rpl(struct c4iw_dev *dev, struct sk_buff *skb)
+{
+	struct cpl_get_tcb_rpl *rpl = cplhdr(skb);
+	__be64 *tcb = (__be64 *)(rpl + 1);
+	unsigned int tid = GET_TID(rpl);
+	struct c4iw_ep *ep;
+	u64 t_flags_64;
+	u32 rx_pdu_out;
+
+	ep = get_ep_from_tid(dev, tid);
+	if (!ep)
+		return 0;
+	/* Examine the TF_RX_PDU_OUT (bit 49 of the t_flags) in order to
+	 * determine if there's a rx PDU feedback event pending.
+	 *
+	 * If that bit is set, it means we'll need to re-read the TCB's
+	 * rq_start value. The final value is the one present in a TCB
+	 * with the TF_RX_PDU_OUT bit cleared.
+	 */
+
+	t_flags_64 = t4_tcb_get_field64(tcb, TCB_T_FLAGS_W);
+	rx_pdu_out = (t_flags_64 & TF_RX_PDU_OUT_V(1)) >> TF_RX_PDU_OUT_S;
+
+	/*
+	 * NOTE(review): ep is still dereferenced below after both
+	 * references are dropped; this presumably relies on another
+	 * reference keeping the ep alive -- confirm.
+	 */
+	c4iw_put_ep(&ep->com); /* from get_ep_from_tid() */
+	c4iw_put_ep(&ep->com); /* from read_tcb() */
+
+	/* If TF_RX_PDU_OUT bit is set, re-read the TCB */
+	if (rx_pdu_out) {
+		if (++ep->rx_pdu_out_cnt >= 2) {
+			WARN_ONCE(1, "tcb re-read() reached the guard limit, finishing the cleanup\n");
+			goto cleanup;
+		}
+		read_tcb(ep);
+		return 0;
+	}
+
+	/* The third argument is the field mask, not the word index. */
+	ep->srqe_idx = t4_tcb_get_field32(tcb, TCB_RQ_START_W, TCB_RQ_START_M,
+					  TCB_RQ_START_S);
+cleanup:
+	pr_debug("ep %p tid %u %016x\n", ep, ep->hwtid, ep->srqe_idx);
+
+	if (test_bit(PEER_ABORT_IN_PROGRESS, &ep->com.flags))
+		finish_peer_abort(dev, ep);
+	else if (test_bit(ABORT_REQ_IN_PROGRESS, &ep->com.flags))
+		send_abort_req(ep);
+	else
+		WARN_ONCE(1, "unexpected state!");
+
+	return 0;
+}
+
static int deferred_fw6_msg(struct c4iw_dev *dev, struct sk_buff *skb)
{
struct cpl_fw6_msg *rpl = cplhdr(skb);
@@ -3944,7 +4096,7 @@ static int rx_pkt(struct c4iw_dev *dev, struct sk_buff *skb)
} else {
vlan_eh = (struct vlan_ethhdr *)(req + 1);
iph = (struct iphdr *)(vlan_eh + 1);
- skb->vlan_tci = ntohs(cpl->vlan);
+ __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), ntohs(cpl->vlan));
}
if (iph->version != 0x4)
@@ -4038,6 +4190,7 @@ static c4iw_handler_func work_handlers[NUM_CPL_CMDS + NUM_FAKE_CPLS] = {
[CPL_CLOSE_CON_RPL] = close_con_rpl,
[CPL_RDMA_TERMINATE] = terminate,
[CPL_FW4_ACK] = fw4_ack,
+ [CPL_GET_TCB_RPL] = read_tcb_rpl,
[CPL_FW6_MSG] = deferred_fw6_msg,
[CPL_RX_PKT] = rx_pkt,
[FAKE_CPL_PUT_EP_SAFE] = _put_ep_safe,
@@ -4269,6 +4422,7 @@ c4iw_handler_func c4iw_handlers[NUM_CPL_CMDS] = {
[CPL_RDMA_TERMINATE] = sched,
[CPL_FW4_ACK] = sched,
[CPL_SET_TCB_RPL] = set_tcb_rpl,
+ [CPL_GET_TCB_RPL] = sched,
[CPL_FW6_MSG] = fw6_msg,
[CPL_RX_PKT] = sched
};
diff --git a/drivers/infiniband/hw/cxgb4/device.c b/drivers/infiniband/hw/cxgb4/device.c
index c13c0ba30f63..c79cf63fb0bb 100644
--- a/drivers/infiniband/hw/cxgb4/device.c
+++ b/drivers/infiniband/hw/cxgb4/device.c
@@ -720,11 +720,8 @@ static const struct file_operations ep_debugfs_fops = {
.read = debugfs_read,
};
-static int setup_debugfs(struct c4iw_dev *devp)
+static void setup_debugfs(struct c4iw_dev *devp)
{
- if (!devp->debugfs_root)
- return -1;
-
debugfs_create_file_size("qps", S_IWUSR, devp->debugfs_root,
(void *)devp, &qp_debugfs_fops, 4096);
@@ -740,7 +737,6 @@ static int setup_debugfs(struct c4iw_dev *devp)
if (c4iw_wr_log)
debugfs_create_file_size("wr_log", S_IWUSR, devp->debugfs_root,
(void *)devp, &wr_log_debugfs_fops, 4096);
- return 0;
}
void c4iw_release_dev_ucontext(struct c4iw_rdev *rdev,
@@ -783,6 +779,7 @@ void c4iw_init_dev_ucontext(struct c4iw_rdev *rdev,
static int c4iw_rdev_open(struct c4iw_rdev *rdev)
{
int err;
+ unsigned int factor;
c4iw_init_dev_ucontext(rdev, &rdev->uctx);
@@ -806,8 +803,18 @@ static int c4iw_rdev_open(struct c4iw_rdev *rdev)
return -EINVAL;
}
- rdev->qpmask = rdev->lldi.udb_density - 1;
- rdev->cqmask = rdev->lldi.ucq_density - 1;
+ /* This implementation requires a sge_host_page_size <= PAGE_SIZE. */
+ if (rdev->lldi.sge_host_page_size > PAGE_SIZE) {
+ pr_err("%s: unsupported sge host page size %u\n",
+ pci_name(rdev->lldi.pdev),
+ rdev->lldi.sge_host_page_size);
+ return -EINVAL;
+ }
+
+ factor = PAGE_SIZE / rdev->lldi.sge_host_page_size;
+ rdev->qpmask = (rdev->lldi.udb_density * factor) - 1;
+ rdev->cqmask = (rdev->lldi.ucq_density * factor) - 1;
+
pr_debug("dev %s stag start 0x%0x size 0x%0x num stags %d pbl start 0x%0x size 0x%0x rq start 0x%0x size 0x%0x qp qid start %u size %u cq qid start %u size %u srq size %u\n",
pci_name(rdev->lldi.pdev), rdev->lldi.vr->stag.start,
rdev->lldi.vr->stag.size, c4iw_num_stags(rdev),
@@ -970,7 +977,7 @@ static struct c4iw_dev *c4iw_alloc(const struct cxgb4_lld_info *infop)
pr_info("%s: On-Chip Queues not supported on this device\n",
pci_name(infop->pdev));
- devp = (struct c4iw_dev *)ib_alloc_device(sizeof(*devp));
+ devp = ib_alloc_device(c4iw_dev, ibdev);
if (!devp) {
pr_err("Cannot allocate ib device\n");
return ERR_PTR(-ENOMEM);
@@ -1553,8 +1560,6 @@ static int __init c4iw_init_module(void)
return err;
c4iw_debugfs_root = debugfs_create_dir(DRV_NAME, NULL);
- if (!c4iw_debugfs_root)
- pr_warn("could not create debugfs entry, continuing\n");
reg_workq = create_singlethread_workqueue("Register_iWARP_device");
if (!reg_workq) {
diff --git a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h
index f0fceadd0d12..5a5da41faef6 100644
--- a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h
+++ b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h
@@ -589,7 +589,6 @@ struct c4iw_ucontext {
u32 key;
spinlock_t mmap_lock;
struct list_head mmaps;
- struct kref kref;
bool is_32b_cqe;
};
@@ -598,18 +597,6 @@ static inline struct c4iw_ucontext *to_c4iw_ucontext(struct ib_ucontext *c)
return container_of(c, struct c4iw_ucontext, ibucontext);
}
-void _c4iw_free_ucontext(struct kref *kref);
-
-static inline void c4iw_put_ucontext(struct c4iw_ucontext *ucontext)
-{
- kref_put(&ucontext->kref, _c4iw_free_ucontext);
-}
-
-static inline void c4iw_get_ucontext(struct c4iw_ucontext *ucontext)
-{
- kref_get(&ucontext->kref);
-}
-
struct c4iw_mm_entry {
struct list_head entry;
u64 addr;
@@ -982,6 +969,9 @@ struct c4iw_ep {
int rcv_win;
u32 snd_wscale;
struct c4iw_ep_stats stats;
+ u32 srqe_idx;
+ u32 rx_pdu_out_cnt;
+ struct sk_buff *peer_abort_skb;
};
static inline struct c4iw_ep *to_ep(struct iw_cm_id *cm_id)
diff --git a/drivers/infiniband/hw/cxgb4/mem.c b/drivers/infiniband/hw/cxgb4/mem.c
index 7b76e6f81aeb..5baa31ab6366 100644
--- a/drivers/infiniband/hw/cxgb4/mem.c
+++ b/drivers/infiniband/hw/cxgb4/mem.c
@@ -502,10 +502,9 @@ struct ib_mr *c4iw_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
u64 virt, int acc, struct ib_udata *udata)
{
__be64 *pages;
- int shift, n, len;
- int i, k, entry;
+ int shift, n, i;
int err = -ENOMEM;
- struct scatterlist *sg;
+ struct sg_dma_page_iter sg_iter;
struct c4iw_dev *rhp;
struct c4iw_pd *php;
struct c4iw_mr *mhp;
@@ -537,11 +536,11 @@ struct ib_mr *c4iw_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
mhp->rhp = rhp;
- mhp->umem = ib_umem_get(pd->uobject->context, start, length, acc, 0);
+ mhp->umem = ib_umem_get(udata, start, length, acc, 0);
if (IS_ERR(mhp->umem))
goto err_free_skb;
- shift = mhp->umem->page_shift;
+ shift = PAGE_SHIFT;
n = mhp->umem->nmap;
err = alloc_pbl(mhp, n);
@@ -556,21 +555,16 @@ struct ib_mr *c4iw_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
i = n = 0;
- for_each_sg(mhp->umem->sg_head.sgl, sg, mhp->umem->nmap, entry) {
- len = sg_dma_len(sg) >> shift;
- for (k = 0; k < len; ++k) {
- pages[i++] = cpu_to_be64(sg_dma_address(sg) +
- (k << shift));
- if (i == PAGE_SIZE / sizeof *pages) {
- err = write_pbl(&mhp->rhp->rdev,
- pages,
- mhp->attr.pbl_addr + (n << 3), i,
- mhp->wr_waitp);
- if (err)
- goto pbl_done;
- n += i;
- i = 0;
- }
+ for_each_sg_dma_page(mhp->umem->sg_head.sgl, &sg_iter, mhp->umem->nmap, 0) {
+ pages[i++] = cpu_to_be64(sg_page_iter_dma_address(&sg_iter));
+ if (i == PAGE_SIZE / sizeof(*pages)) {
+ err = write_pbl(&mhp->rhp->rdev, pages,
+ mhp->attr.pbl_addr + (n << 3), i,
+ mhp->wr_waitp);
+ if (err)
+ goto pbl_done;
+ n += i;
+ i = 0;
}
}
@@ -684,8 +678,8 @@ int c4iw_dealloc_mw(struct ib_mw *mw)
mhp->wr_waitp);
kfree_skb(mhp->dereg_skb);
c4iw_put_wr_wait(mhp->wr_waitp);
- kfree(mhp);
pr_debug("ib_mw %p mmid 0x%x ptr %p\n", mw, mmid, mhp);
+ kfree(mhp);
return 0;
}
diff --git a/drivers/infiniband/hw/cxgb4/provider.c b/drivers/infiniband/hw/cxgb4/provider.c
index cbb3c0ddd990..507c54572cc9 100644
--- a/drivers/infiniband/hw/cxgb4/provider.c
+++ b/drivers/infiniband/hw/cxgb4/provider.c
@@ -58,51 +58,34 @@ static int fastreg_support = 1;
module_param(fastreg_support, int, 0644);
MODULE_PARM_DESC(fastreg_support, "Advertise fastreg support (default=1)");
-void _c4iw_free_ucontext(struct kref *kref)
+static void c4iw_dealloc_ucontext(struct ib_ucontext *context)
{
- struct c4iw_ucontext *ucontext;
+ struct c4iw_ucontext *ucontext = to_c4iw_ucontext(context);
struct c4iw_dev *rhp;
struct c4iw_mm_entry *mm, *tmp;
- ucontext = container_of(kref, struct c4iw_ucontext, kref);
+ pr_debug("context %p\n", context);
rhp = to_c4iw_dev(ucontext->ibucontext.device);
- pr_debug("ucontext %p\n", ucontext);
list_for_each_entry_safe(mm, tmp, &ucontext->mmaps, entry)
kfree(mm);
c4iw_release_dev_ucontext(&rhp->rdev, &ucontext->uctx);
- kfree(ucontext);
-}
-
-static int c4iw_dealloc_ucontext(struct ib_ucontext *context)
-{
- struct c4iw_ucontext *ucontext = to_c4iw_ucontext(context);
-
- pr_debug("context %p\n", context);
- c4iw_put_ucontext(ucontext);
- return 0;
}
-static struct ib_ucontext *c4iw_alloc_ucontext(struct ib_device *ibdev,
- struct ib_udata *udata)
+static int c4iw_alloc_ucontext(struct ib_ucontext *ucontext,
+ struct ib_udata *udata)
{
- struct c4iw_ucontext *context;
+ struct ib_device *ibdev = ucontext->device;
+ struct c4iw_ucontext *context = to_c4iw_ucontext(ucontext);
struct c4iw_dev *rhp = to_c4iw_dev(ibdev);
struct c4iw_alloc_ucontext_resp uresp;
int ret = 0;
struct c4iw_mm_entry *mm = NULL;
pr_debug("ibdev %p\n", ibdev);
- context = kzalloc(sizeof(*context), GFP_KERNEL);
- if (!context) {
- ret = -ENOMEM;
- goto err;
- }
-
c4iw_init_dev_ucontext(&rhp->rdev, &context->uctx);
INIT_LIST_HEAD(&context->mmaps);
spin_lock_init(&context->mmap_lock);
- kref_init(&context->kref);
if (udata->outlen < sizeof(uresp) - sizeof(uresp.reserved)) {
pr_err_once("Warning - downlevel libcxgb4 (non-fatal), device status page disabled\n");
@@ -111,7 +94,7 @@ static struct ib_ucontext *c4iw_alloc_ucontext(struct ib_device *ibdev,
mm = kmalloc(sizeof(*mm), GFP_KERNEL);
if (!mm) {
ret = -ENOMEM;
- goto err_free;
+ goto err;
}
uresp.status_page_size = PAGE_SIZE;
@@ -131,13 +114,11 @@ static struct ib_ucontext *c4iw_alloc_ucontext(struct ib_device *ibdev,
mm->len = PAGE_SIZE;
insert_mmap(context, mm);
}
- return &context->ibucontext;
+ return 0;
err_mm:
kfree(mm);
-err_free:
- kfree(context);
err:
- return ERR_PTR(ret);
+ return ret;
}
static int c4iw_mmap(struct ib_ucontext *context, struct vm_area_struct *vma)
@@ -209,7 +190,7 @@ static int c4iw_mmap(struct ib_ucontext *context, struct vm_area_struct *vma)
return ret;
}
-static int c4iw_deallocate_pd(struct ib_pd *pd)
+static void c4iw_deallocate_pd(struct ib_pd *pd)
{
struct c4iw_dev *rhp;
struct c4iw_pd *php;
@@ -221,15 +202,13 @@ static int c4iw_deallocate_pd(struct ib_pd *pd)
mutex_lock(&rhp->rdev.stats.lock);
rhp->rdev.stats.pd.cur--;
mutex_unlock(&rhp->rdev.stats.lock);
- kfree(php);
- return 0;
}
-static struct ib_pd *c4iw_allocate_pd(struct ib_device *ibdev,
- struct ib_ucontext *context,
- struct ib_udata *udata)
+static int c4iw_allocate_pd(struct ib_pd *pd, struct ib_ucontext *context,
+ struct ib_udata *udata)
{
- struct c4iw_pd *php;
+ struct c4iw_pd *php = to_c4iw_pd(pd);
+ struct ib_device *ibdev = pd->device;
u32 pdid;
struct c4iw_dev *rhp;
@@ -237,12 +216,8 @@ static struct ib_pd *c4iw_allocate_pd(struct ib_device *ibdev,
rhp = (struct c4iw_dev *) ibdev;
pdid = c4iw_get_resource(&rhp->rdev.resource.pdid_table);
if (!pdid)
- return ERR_PTR(-EINVAL);
- php = kzalloc(sizeof(*php), GFP_KERNEL);
- if (!php) {
- c4iw_put_resource(&rhp->rdev.resource.pdid_table, pdid);
- return ERR_PTR(-ENOMEM);
- }
+ return -EINVAL;
+
php->pdid = pdid;
php->rhp = rhp;
if (context) {
@@ -250,7 +225,7 @@ static struct ib_pd *c4iw_allocate_pd(struct ib_device *ibdev,
if (ib_copy_to_udata(udata, &uresp, sizeof(uresp))) {
c4iw_deallocate_pd(&php->ibpd);
- return ERR_PTR(-EFAULT);
+ return -EFAULT;
}
}
mutex_lock(&rhp->rdev.stats.lock);
@@ -259,7 +234,7 @@ static struct ib_pd *c4iw_allocate_pd(struct ib_device *ibdev,
rhp->rdev.stats.pd.max = rhp->rdev.stats.pd.cur;
mutex_unlock(&rhp->rdev.stats.lock);
pr_debug("pdid 0x%0x ptr 0x%p\n", pdid, php);
- return &php->ibpd;
+ return 0;
}
static int c4iw_query_pkey(struct ib_device *ibdev, u8 port, u16 index,
@@ -376,8 +351,9 @@ static int c4iw_query_port(struct ib_device *ibdev, u8 port,
static ssize_t hw_rev_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
- struct c4iw_dev *c4iw_dev = container_of(dev, struct c4iw_dev,
- ibdev.dev);
+ struct c4iw_dev *c4iw_dev =
+ rdma_device_to_drv_device(dev, struct c4iw_dev, ibdev);
+
pr_debug("dev 0x%p\n", dev);
return sprintf(buf, "%d\n",
CHELSIO_CHIP_RELEASE(c4iw_dev->rdev.lldi.adapter_type));
@@ -387,8 +363,8 @@ static DEVICE_ATTR_RO(hw_rev);
static ssize_t hca_type_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
- struct c4iw_dev *c4iw_dev = container_of(dev, struct c4iw_dev,
- ibdev.dev);
+ struct c4iw_dev *c4iw_dev =
+ rdma_device_to_drv_device(dev, struct c4iw_dev, ibdev);
struct ethtool_drvinfo info;
struct net_device *lldev = c4iw_dev->rdev.lldi.ports[0];
@@ -401,8 +377,9 @@ static DEVICE_ATTR_RO(hca_type);
static ssize_t board_id_show(struct device *dev, struct device_attribute *attr,
char *buf)
{
- struct c4iw_dev *c4iw_dev = container_of(dev, struct c4iw_dev,
- ibdev.dev);
+ struct c4iw_dev *c4iw_dev =
+ rdma_device_to_drv_device(dev, struct c4iw_dev, ibdev);
+
pr_debug("dev 0x%p\n", dev);
return sprintf(buf, "%x.%x\n", c4iw_dev->rdev.lldi.pdev->vendor,
c4iw_dev->rdev.lldi.pdev->device);
@@ -531,6 +508,47 @@ static int fill_res_entry(struct sk_buff *msg, struct rdma_restrack_entry *res)
c4iw_restrack_funcs[res->type](msg, res) : 0;
}
+/*
+ * Verb callback table for the cxgb4 iWARP device.  It replaces the
+ * per-field dev->ibdev.<op> assignments removed from
+ * c4iw_register_device(); presumably it is installed there with
+ * ib_set_device_ops() -- the call is outside this view.
+ *
+ * The two INIT_RDMA_OBJ_SIZE() entries name the driver structures that
+ * embed ib_pd and ib_ucontext.  c4iw_allocate_pd() and
+ * c4iw_alloc_ucontext() no longer allocate those structures themselves,
+ * so presumably the core uses these sizes to allocate them -- confirm
+ * against the macro.
+ */
+static const struct ib_device_ops c4iw_dev_ops = {
+ .alloc_hw_stats = c4iw_alloc_stats,
+ .alloc_mr = c4iw_alloc_mr,
+ .alloc_mw = c4iw_alloc_mw,
+ .alloc_pd = c4iw_allocate_pd,
+ .alloc_ucontext = c4iw_alloc_ucontext,
+ .create_cq = c4iw_create_cq,
+ .create_qp = c4iw_create_qp,
+ .create_srq = c4iw_create_srq,
+ .dealloc_mw = c4iw_dealloc_mw,
+ .dealloc_pd = c4iw_deallocate_pd,
+ .dealloc_ucontext = c4iw_dealloc_ucontext,
+ .dereg_mr = c4iw_dereg_mr,
+ .destroy_cq = c4iw_destroy_cq,
+ .destroy_qp = c4iw_destroy_qp,
+ .destroy_srq = c4iw_destroy_srq,
+ .fill_res_entry = fill_res_entry,
+ .get_dev_fw_str = get_dev_fw_str,
+ .get_dma_mr = c4iw_get_dma_mr,
+ .get_hw_stats = c4iw_get_mib,
+ .get_netdev = get_netdev,
+ .get_port_immutable = c4iw_port_immutable,
+ .map_mr_sg = c4iw_map_mr_sg,
+ .mmap = c4iw_mmap,
+ .modify_qp = c4iw_ib_modify_qp,
+ .modify_srq = c4iw_modify_srq,
+ .poll_cq = c4iw_poll_cq,
+ .post_recv = c4iw_post_receive,
+ .post_send = c4iw_post_send,
+ .post_srq_recv = c4iw_post_srq_recv,
+ .query_device = c4iw_query_device,
+ .query_gid = c4iw_query_gid,
+ .query_pkey = c4iw_query_pkey,
+ .query_port = c4iw_query_port,
+ .query_qp = c4iw_ib_query_qp,
+ .reg_user_mr = c4iw_reg_user_mr,
+ .req_notify_cq = c4iw_arm_cq,
+ INIT_RDMA_OBJ_SIZE(ib_pd, c4iw_pd, ibpd),
+ INIT_RDMA_OBJ_SIZE(ib_ucontext, c4iw_ucontext, ibucontext),
+};
+
void c4iw_register_device(struct work_struct *work)
{
int ret;
@@ -573,44 +591,9 @@ void c4iw_register_device(struct work_struct *work)
dev->ibdev.phys_port_cnt = dev->rdev.lldi.nports;
dev->ibdev.num_comp_vectors = dev->rdev.lldi.nciq;
dev->ibdev.dev.parent = &dev->rdev.lldi.pdev->dev;
- dev->ibdev.query_device = c4iw_query_device;
- dev->ibdev.query_port = c4iw_query_port;
- dev->ibdev.query_pkey = c4iw_query_pkey;
- dev->ibdev.query_gid = c4iw_query_gid;
- dev->ibdev.alloc_ucontext = c4iw_alloc_ucontext;
- dev->ibdev.dealloc_ucontext = c4iw_dealloc_ucontext;
- dev->ibdev.mmap = c4iw_mmap;
- dev->ibdev.alloc_pd = c4iw_allocate_pd;
- dev->ibdev.dealloc_pd = c4iw_deallocate_pd;
- dev->ibdev.create_qp = c4iw_create_qp;
- dev->ibdev.modify_qp = c4iw_ib_modify_qp;
- dev->ibdev.query_qp = c4iw_ib_query_qp;
- dev->ibdev.destroy_qp = c4iw_destroy_qp;
- dev->ibdev.create_srq = c4iw_create_srq;
- dev->ibdev.modify_srq = c4iw_modify_srq;
- dev->ibdev.destroy_srq = c4iw_destroy_srq;
- dev->ibdev.create_cq = c4iw_create_cq;
- dev->ibdev.destroy_cq = c4iw_destroy_cq;
- dev->ibdev.poll_cq = c4iw_poll_cq;
- dev->ibdev.get_dma_mr = c4iw_get_dma_mr;
- dev->ibdev.reg_user_mr = c4iw_reg_user_mr;
- dev->ibdev.dereg_mr = c4iw_dereg_mr;
- dev->ibdev.alloc_mw = c4iw_alloc_mw;
- dev->ibdev.dealloc_mw = c4iw_dealloc_mw;
- dev->ibdev.alloc_mr = c4iw_alloc_mr;
- dev->ibdev.map_mr_sg = c4iw_map_mr_sg;
- dev->ibdev.req_notify_cq = c4iw_arm_cq;
- dev->ibdev.post_send = c4iw_post_send;
- dev->ibdev.post_recv = c4iw_post_receive;
- dev->ibdev.post_srq_recv = c4iw_post_srq_recv;
- dev->ibdev.alloc_hw_stats = c4iw_alloc_stats;
- dev->ibdev.get_hw_stats = c4iw_get_mib;
dev->ibdev.uverbs_abi_ver = C4IW_UVERBS_ABI_VERSION;
- dev->ibdev.get_port_immutable = c4iw_port_immutable;
- dev->ibdev.get_dev_fw_str = get_dev_fw_str;
- dev->ibdev.get_netdev = get_netdev;
- dev->ibdev.iwcm = kmalloc(sizeof(struct iw_cm_verbs), GFP_KERNEL);
+ dev->ibdev.iwcm = kzalloc(sizeof(struct iw_cm_verbs), GFP_KERNEL);
if (!dev->ibdev.iwcm) {
ret = -ENOMEM;
goto err_dealloc_ctx;
@@ -624,13 +607,13 @@ void c4iw_register_device(struct work_struct *work)
dev->ibdev.iwcm->add_ref = c4iw_qp_add_ref;
dev->ibdev.iwcm->rem_ref = c4iw_qp_rem_ref;
dev->ibdev.iwcm->get_qp = c4iw_get_qp;
- dev->ibdev.res.fill_res_entry = fill_res_entry;
memcpy(dev->ibdev.iwcm->ifname, dev->rdev.lldi.ports[0]->name,
sizeof(dev->ibdev.iwcm->ifname));
rdma_set_device_sysfs_group(&dev->ibdev, &c4iw_attr_group);
dev->ibdev.driver_id = RDMA_DRIVER_CXGB4;
- ret = ib_register_device(&dev->ibdev, "cxgb4_%d", NULL);
+ ib_set_device_ops(&dev->ibdev, &c4iw_dev_ops);
+ ret = ib_register_device(&dev->ibdev, "cxgb4_%d");
if (ret)
goto err_kfree_iwcm;
return;
diff --git a/drivers/infiniband/hw/cxgb4/qp.c b/drivers/infiniband/hw/cxgb4/qp.c
index 13478f3b7057..d3a82839f5ea 100644
--- a/drivers/infiniband/hw/cxgb4/qp.c
+++ b/drivers/infiniband/hw/cxgb4/qp.c
@@ -31,6 +31,7 @@
*/
#include <linux/module.h>
+#include <rdma/uverbs_ioctl.h>
#include "iw_cxgb4.h"
@@ -632,7 +633,10 @@ static void build_rdma_write_cmpl(struct t4_sq *sq,
wcwr->stag_sink = cpu_to_be32(rdma_wr(wr)->rkey);
wcwr->to_sink = cpu_to_be64(rdma_wr(wr)->remote_addr);
- wcwr->stag_inv = cpu_to_be32(wr->next->ex.invalidate_rkey);
+ if (wr->next->opcode == IB_WR_SEND)
+ wcwr->stag_inv = 0;
+ else
+ wcwr->stag_inv = cpu_to_be32(wr->next->ex.invalidate_rkey);
wcwr->r2 = 0;
wcwr->r3 = 0;
@@ -726,7 +730,10 @@ static void post_write_cmpl(struct c4iw_qp *qhp, const struct ib_send_wr *wr)
/* SEND_WITH_INV swsqe */
swsqe = &qhp->wq.sq.sw_sq[qhp->wq.sq.pidx];
- swsqe->opcode = FW_RI_SEND_WITH_INV;
+ if (wr->next->opcode == IB_WR_SEND)
+ swsqe->opcode = FW_RI_SEND;
+ else
+ swsqe->opcode = FW_RI_SEND_WITH_INV;
swsqe->idx = qhp->wq.sq.pidx;
swsqe->complete = 0;
swsqe->signaled = send_signaled;
@@ -897,8 +904,6 @@ static void free_qp_work(struct work_struct *work)
destroy_qp(&rhp->rdev, &qhp->wq,
ucontext ? &ucontext->uctx : &rhp->rdev.uctx, !qhp->srq);
- if (ucontext)
- c4iw_put_ucontext(ucontext);
c4iw_put_wr_wait(qhp->wr_waitp);
kfree(qhp);
}
@@ -1133,9 +1138,9 @@ int c4iw_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr,
/*
* Fastpath for NVMe-oF target WRITE + SEND_WITH_INV wr chain which is
* the response for small NVMEe-oF READ requests. If the chain is
- * exactly a WRITE->SEND_WITH_INV and the sgl depths and lengths
- * meet the requirements of the fw_ri_write_cmpl_wr work request,
- * then build and post the write_cmpl WR. If any of the tests
+ * exactly a WRITE->SEND_WITH_INV or a WRITE->SEND and the sgl depths
+ * and lengths meet the requirements of the fw_ri_write_cmpl_wr work
+ * request, then build and post the write_cmpl WR. If any of the tests
* below are not true, then we continue on with the tradtional WRITE
* and SEND WRs.
*/
@@ -1145,7 +1150,8 @@ int c4iw_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr,
wr && wr->next && !wr->next->next &&
wr->opcode == IB_WR_RDMA_WRITE &&
wr->sg_list[0].length && wr->num_sge <= T4_WRITE_CMPL_MAX_SGL &&
- wr->next->opcode == IB_WR_SEND_WITH_INV &&
+ (wr->next->opcode == IB_WR_SEND ||
+ wr->next->opcode == IB_WR_SEND_WITH_INV) &&
wr->next->sg_list[0].length == T4_WRITE_CMPL_MAX_CQE &&
wr->next->num_sge == 1 && num_wrs >= 2) {
post_write_cmpl(qhp, wr);
@@ -2129,7 +2135,8 @@ struct ib_qp *c4iw_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *attrs,
struct c4iw_cq *rchp;
struct c4iw_create_qp_resp uresp;
unsigned int sqsize, rqsize = 0;
- struct c4iw_ucontext *ucontext;
+ struct c4iw_ucontext *ucontext = rdma_udata_to_drv_context(
+ udata, struct c4iw_ucontext, ibucontext);
int ret;
struct c4iw_mm_entry *sq_key_mm, *rq_key_mm = NULL, *sq_db_key_mm;
struct c4iw_mm_entry *rq_db_key_mm = NULL, *ma_sync_key_mm = NULL;
@@ -2163,8 +2170,6 @@ struct ib_qp *c4iw_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *attrs,
if (sqsize < 8)
sqsize = 8;
- ucontext = pd->uobject ? to_c4iw_ucontext(pd->uobject->context) : NULL;
-
qhp = kzalloc(sizeof(*qhp), GFP_KERNEL);
if (!qhp)
return ERR_PTR(-ENOMEM);
@@ -2331,7 +2336,6 @@ struct ib_qp *c4iw_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *attrs,
insert_mmap(ucontext, ma_sync_key_mm);
}
- c4iw_get_ucontext(ucontext);
qhp->ucontext = ucontext;
}
if (!attrs->srq) {
@@ -2564,13 +2568,11 @@ static int alloc_srq_queue(struct c4iw_srq *srq, struct c4iw_dev_ucontext *uctx,
wq->rqt_abs_idx = (wq->rqt_hwaddr - rdev->lldi.vr->rq.start) >>
T4_RQT_ENTRY_SHIFT;
- wq->queue = dma_alloc_coherent(&rdev->lldi.pdev->dev,
- wq->memsize, &wq->dma_addr,
- GFP_KERNEL);
+ wq->queue = dma_alloc_coherent(&rdev->lldi.pdev->dev, wq->memsize,
+ &wq->dma_addr, GFP_KERNEL);
if (!wq->queue)
goto err_free_rqtpool;
- memset(wq->queue, 0, wq->memsize);
dma_unmap_addr_set(wq, mapping, wq->dma_addr);
wq->bar2_va = c4iw_bar2_addrs(rdev, wq->qid, CXGB4_BAR2_QTYPE_EGRESS,
@@ -2591,7 +2593,7 @@ static int alloc_srq_queue(struct c4iw_srq *srq, struct c4iw_dev_ucontext *uctx,
/* build fw_ri_res_wr */
wr_len = sizeof(*res_wr) + sizeof(*res);
- skb = alloc_skb(wr_len, GFP_KERNEL | __GFP_NOFAIL);
+ skb = alloc_skb(wr_len, GFP_KERNEL);
if (!skb)
goto err_free_queue;
set_wr_txq(skb, CPL_PRIORITY_CONTROL, 0);
@@ -2713,7 +2715,8 @@ struct ib_srq *c4iw_create_srq(struct ib_pd *pd, struct ib_srq_init_attr *attrs,
rqsize = attrs->attr.max_wr + 1;
rqsize = roundup_pow_of_two(max_t(u16, rqsize, 16));
- ucontext = pd->uobject ? to_c4iw_ucontext(pd->uobject->context) : NULL;
+ ucontext = rdma_udata_to_drv_context(udata, struct c4iw_ucontext,
+ ibucontext);
srq = kzalloc(sizeof(*srq), GFP_KERNEL);
if (!srq)
diff --git a/drivers/infiniband/hw/cxgb4/t4.h b/drivers/infiniband/hw/cxgb4/t4.h
index fff6d48d262f..b170817b2741 100644
--- a/drivers/infiniband/hw/cxgb4/t4.h
+++ b/drivers/infiniband/hw/cxgb4/t4.h
@@ -35,6 +35,7 @@
#include "t4_regs.h"
#include "t4_values.h"
#include "t4_msg.h"
+#include "t4_tcb.h"
#include "t4fw_ri_api.h"
#define T4_MAX_NUM_PD 65536
diff --git a/drivers/infiniband/hw/hfi1/Makefile b/drivers/infiniband/hw/hfi1/Makefile
index ff790390c91a..4044a8c8dbf4 100644
--- a/drivers/infiniband/hw/hfi1/Makefile
+++ b/drivers/infiniband/hw/hfi1/Makefile
@@ -24,6 +24,7 @@ hfi1-y := \
mad.o \
mmu_rb.o \
msix.o \
+ opfn.o \
pcie.o \
pio.o \
pio_copy.o \
@@ -34,6 +35,7 @@ hfi1-y := \
ruc.o \
sdma.o \
sysfs.o \
+ tid_rdma.o \
trace.o \
uc.o \
ud.o \
diff --git a/drivers/infiniband/hw/hfi1/affinity.c b/drivers/infiniband/hw/hfi1/affinity.c
index 2baf38cc1e23..4fe662c3bbc1 100644
--- a/drivers/infiniband/hw/hfi1/affinity.c
+++ b/drivers/infiniband/hw/hfi1/affinity.c
@@ -48,6 +48,7 @@
#include <linux/cpumask.h>
#include <linux/module.h>
#include <linux/interrupt.h>
+#include <linux/numa.h>
#include "hfi.h"
#include "affinity.h"
@@ -777,7 +778,7 @@ void hfi1_dev_affinity_clean_up(struct hfi1_devdata *dd)
_dev_comp_vect_cpu_mask_clean_up(dd, entry);
unlock:
mutex_unlock(&node_affinity.lock);
- dd->node = -1;
+ dd->node = NUMA_NO_NODE;
}
/*
diff --git a/drivers/infiniband/hw/hfi1/chip.c b/drivers/infiniband/hw/hfi1/chip.c
index 7e6d70936c63..9784c6c0d2ec 100644
--- a/drivers/infiniband/hw/hfi1/chip.c
+++ b/drivers/infiniband/hw/hfi1/chip.c
@@ -1072,6 +1072,8 @@ static void log_state_transition(struct hfi1_pportdata *ppd, u32 state);
static void log_physical_state(struct hfi1_pportdata *ppd, u32 state);
static int wait_physical_linkstate(struct hfi1_pportdata *ppd, u32 state,
int msecs);
+static int wait_phys_link_out_of_offline(struct hfi1_pportdata *ppd,
+ int msecs);
static void read_planned_down_reason_code(struct hfi1_devdata *dd, u8 *pdrrc);
static void read_link_down_reason(struct hfi1_devdata *dd, u8 *ldr);
static void handle_temp_err(struct hfi1_devdata *dd);
@@ -4251,6 +4253,8 @@ static struct cntr_entry dev_cntrs[DEV_CNTR_LAST] = {
access_sw_pio_drain),
[C_SW_KMEM_WAIT] = CNTR_ELEM("KmemWait", 0, 0, CNTR_NORMAL,
access_sw_kmem_wait),
+[C_SW_TID_WAIT] = CNTR_ELEM("TidWait", 0, 0, CNTR_NORMAL,
+ hfi1_access_sw_tid_wait),
[C_SW_SEND_SCHED] = CNTR_ELEM("SendSched", 0, 0, CNTR_NORMAL,
access_sw_send_schedule),
[C_SDMA_DESC_FETCHED_CNT] = CNTR_ELEM("SDEDscFdCn",
@@ -5220,6 +5224,17 @@ int is_bx(struct hfi1_devdata *dd)
return (chip_rev_minor & 0xF0) == 0x10;
}
+/* return true if kernel urg is disabled for rcd */
+bool is_urg_masked(struct hfi1_ctxtdata *rcd)
+{
+ u64 mask;
+ u32 is = IS_RCVURGENT_START + rcd->ctxt;
+ u8 bit = is % 64;
+
+ mask = read_csr(rcd->dd, CCE_INT_MASK + (8 * (is / 64)));
+ return !(mask & BIT_ULL(bit));
+}
+
/*
* Append string s to buffer buf. Arguments curp and len are the current
* position and remaining length, respectively.
@@ -10770,13 +10785,15 @@ int set_link_state(struct hfi1_pportdata *ppd, u32 state)
break;
ppd->port_error_action = 0;
- ppd->host_link_state = HLS_DN_POLL;
if (quick_linkup) {
/* quick linkup does not go into polling */
ret = do_quick_linkup(dd);
} else {
ret1 = set_physical_link_state(dd, PLS_POLLING);
+ if (!ret1)
+ ret1 = wait_phys_link_out_of_offline(ppd,
+ 3000);
if (ret1 != HCMD_SUCCESS) {
dd_dev_err(dd,
"Failed to transition to Polling link state, return 0x%x\n",
@@ -10784,6 +10801,14 @@ int set_link_state(struct hfi1_pportdata *ppd, u32 state)
ret = -EINVAL;
}
}
+
+ /*
+ * Change the host link state after requesting DC8051 to
+ * change its physical state so that we can ignore any
+ * interrupt with stale LNI(XX) error, which will not be
+ * cleared until DC8051 transitions to Polling state.
+ */
+ ppd->host_link_state = HLS_DN_POLL;
ppd->offline_disabled_reason =
HFI1_ODR_MASK(OPA_LINKDOWN_REASON_NONE);
/*
@@ -12928,6 +12953,39 @@ static int wait_phys_link_offline_substates(struct hfi1_pportdata *ppd,
return read_state;
}
+/*
+ * wait_phys_link_out_of_offline - wait for any out of offline state
+ * @ppd: port device
+ * @msecs: the number of milliseconds to wait
+ *
+ * Wait up to msecs milliseconds for any out of offline physical link
+ * state change to occur.
+ * Returns the non-offline physical state that was read, otherwise -ETIMEDOUT.
+ */
+static int wait_phys_link_out_of_offline(struct hfi1_pportdata *ppd,
+ int msecs)
+{
+ u32 read_state;
+ unsigned long timeout;
+
+ timeout = jiffies + msecs_to_jiffies(msecs);
+ while (1) {
+ read_state = read_physical_state(ppd->dd);
+ if ((read_state & 0xF0) != PLS_OFFLINE)
+ break;
+ if (time_after(jiffies, timeout)) {
+ dd_dev_err(ppd->dd,
+ "timeout waiting for phy link out of offline. Read state 0x%x, %dms\n",
+ read_state, msecs);
+ return -ETIMEDOUT;
+ }
+ usleep_range(1950, 2050); /* sleep 2ms-ish */
+ }
+
+ log_state_transition(ppd, read_state);
+ return read_state;
+}
+
#define CLEAR_STATIC_RATE_CONTROL_SMASK(r) \
(r &= ~SEND_CTXT_CHECK_ENABLE_DISALLOW_PBC_STATIC_RATE_CONTROL_SMASK)
@@ -13174,7 +13232,7 @@ static int set_up_context_variables(struct hfi1_devdata *dd)
int total_contexts;
int ret;
unsigned ngroups;
- int qos_rmt_count;
+ int rmt_count;
int user_rmt_reduced;
u32 n_usr_ctxts;
u32 send_contexts = chip_send_contexts(dd);
@@ -13236,10 +13294,20 @@ static int set_up_context_variables(struct hfi1_devdata *dd)
n_usr_ctxts = rcv_contexts - total_contexts;
}
- /* each user context requires an entry in the RMT */
- qos_rmt_count = qos_rmt_entries(dd, NULL, NULL);
- if (qos_rmt_count + n_usr_ctxts > NUM_MAP_ENTRIES) {
- user_rmt_reduced = NUM_MAP_ENTRIES - qos_rmt_count;
+ /*
+ * The RMT entries are currently allocated as shown below:
+ * 1. QOS (0 to 128 entries);
+ * 2. FECN for PSM (num_user_contexts + num_vnic_contexts);
+ * 3. VNIC (num_vnic_contexts).
+ * It should be noted that PSM FECN oversubscribes num_vnic_contexts
+ * entries of RMT because both VNIC and PSM could allocate any receive
+ * context between dd->first_dyn_alloc_ctxt and dd->num_rcv_contexts,
+ * and PSM FECN must reserve an RMT entry for each possible PSM receive
+ * context.
+ */
+ rmt_count = qos_rmt_entries(dd, NULL, NULL) + (num_vnic_contexts * 2);
+ if (rmt_count + n_usr_ctxts > NUM_MAP_ENTRIES) {
+ user_rmt_reduced = NUM_MAP_ENTRIES - rmt_count;
dd_dev_err(dd,
"RMT size is reducing the number of user receive contexts from %u to %d\n",
n_usr_ctxts,
@@ -14227,9 +14295,11 @@ static void init_user_fecn_handling(struct hfi1_devdata *dd,
u64 reg;
int i, idx, regoff, regidx;
u8 offset;
+ u32 total_cnt;
/* there needs to be enough room in the map table */
- if (rmt->used + dd->num_user_contexts >= NUM_MAP_ENTRIES) {
+ total_cnt = dd->num_rcv_contexts - dd->first_dyn_alloc_ctxt;
+ if (rmt->used + total_cnt >= NUM_MAP_ENTRIES) {
dd_dev_err(dd, "User FECN handling disabled - too many user contexts allocated\n");
return;
}
@@ -14283,7 +14353,7 @@ static void init_user_fecn_handling(struct hfi1_devdata *dd,
/* add rule 1 */
add_rsm_rule(dd, RSM_INS_FECN, &rrd);
- rmt->used += dd->num_user_contexts;
+ rmt->used += total_cnt;
}
/* Initialize RSM for VNIC */
diff --git a/drivers/infiniband/hw/hfi1/chip.h b/drivers/infiniband/hw/hfi1/chip.h
index 6b9c8f12dff8..6c27c1c6a868 100644
--- a/drivers/infiniband/hw/hfi1/chip.h
+++ b/drivers/infiniband/hw/hfi1/chip.h
@@ -1,7 +1,7 @@
#ifndef _CHIP_H
#define _CHIP_H
/*
- * Copyright(c) 2015 - 2017 Intel Corporation.
+ * Copyright(c) 2015 - 2018 Intel Corporation.
*
* This file is provided under a dual BSD/GPLv2 license. When using or
* redistributing this file, you may do so under either license.
@@ -804,6 +804,7 @@ void clear_linkup_counters(struct hfi1_devdata *dd);
u32 hdrqempty(struct hfi1_ctxtdata *rcd);
int is_ax(struct hfi1_devdata *dd);
int is_bx(struct hfi1_devdata *dd);
+bool is_urg_masked(struct hfi1_ctxtdata *rcd);
u32 read_physical_state(struct hfi1_devdata *dd);
u32 chip_to_opa_pstate(struct hfi1_devdata *dd, u32 chip_pstate);
const char *opa_lstate_name(u32 lstate);
@@ -926,6 +927,7 @@ enum {
C_SW_PIO_WAIT,
C_SW_PIO_DRAIN,
C_SW_KMEM_WAIT,
+ C_SW_TID_WAIT,
C_SW_SEND_SCHED,
C_SDMA_DESC_FETCHED_CNT,
C_SDMA_INT_CNT,
diff --git a/drivers/infiniband/hw/hfi1/chip_registers.h b/drivers/infiniband/hw/hfi1/chip_registers.h
index c6163a347e93..c0800ea5a3f8 100644
--- a/drivers/infiniband/hw/hfi1/chip_registers.h
+++ b/drivers/infiniband/hw/hfi1/chip_registers.h
@@ -935,6 +935,10 @@
#define SEND_CTXT_CREDIT_CTRL_THRESHOLD_MASK 0x7FFull
#define SEND_CTXT_CREDIT_CTRL_THRESHOLD_SHIFT 0
#define SEND_CTXT_CREDIT_CTRL_THRESHOLD_SMASK 0x7FFull
+#define SEND_CTXT_CREDIT_STATUS (TXE + 0x000000100018)
+#define SEND_CTXT_CREDIT_STATUS_CURRENT_FREE_COUNTER_MASK 0x7FFull
+#define SEND_CTXT_CREDIT_STATUS_CURRENT_FREE_COUNTER_SHIFT 32
+#define SEND_CTXT_CREDIT_STATUS_LAST_RETURNED_COUNTER_SMASK 0x7FFull
#define SEND_CTXT_CREDIT_FORCE (TXE + 0x000000100028)
#define SEND_CTXT_CREDIT_FORCE_FORCE_RETURN_SMASK 0x1ull
#define SEND_CTXT_CREDIT_RETURN_ADDR (TXE + 0x000000100020)
diff --git a/drivers/infiniband/hw/hfi1/common.h b/drivers/infiniband/hw/hfi1/common.h
index 7108d4d92259..7310a5dba420 100644
--- a/drivers/infiniband/hw/hfi1/common.h
+++ b/drivers/infiniband/hw/hfi1/common.h
@@ -1,5 +1,5 @@
/*
- * Copyright(c) 2015, 2016 Intel Corporation.
+ * Copyright(c) 2015 - 2018 Intel Corporation.
*
* This file is provided under a dual BSD/GPLv2 license. When using or
* redistributing this file, you may do so under either license.
@@ -136,18 +136,21 @@
HFI1_CAP_ALLOW_PERM_JKEY | \
HFI1_CAP_STATIC_RATE_CTRL | \
HFI1_CAP_PRINT_UNIMPL | \
- HFI1_CAP_TID_UNMAP)
+ HFI1_CAP_TID_UNMAP | \
+ HFI1_CAP_OPFN)
/*
* A set of capability bits that are "global" and are not allowed to be
* set in the user bitmask.
*/
#define HFI1_CAP_RESERVED_MASK ((HFI1_CAP_SDMA | \
- HFI1_CAP_USE_SDMA_HEAD | \
- HFI1_CAP_EXTENDED_PSN | \
- HFI1_CAP_PRINT_UNIMPL | \
- HFI1_CAP_NO_INTEGRITY | \
- HFI1_CAP_PKEY_CHECK) << \
- HFI1_CAP_USER_SHIFT)
+ HFI1_CAP_USE_SDMA_HEAD | \
+ HFI1_CAP_EXTENDED_PSN | \
+ HFI1_CAP_PRINT_UNIMPL | \
+ HFI1_CAP_NO_INTEGRITY | \
+ HFI1_CAP_PKEY_CHECK | \
+ HFI1_CAP_TID_RDMA | \
+ HFI1_CAP_OPFN) << \
+ HFI1_CAP_USER_SHIFT)
/*
* Set of capabilities that need to be enabled for kernel context in
* order to be allowed for user contexts, as well.
@@ -337,6 +340,10 @@ struct diag_pkt {
#define HFI1_PSM_IOC_BASE_SEQ 0x0
+/* Number of BTH.PSN bits used for sequence number in expected rcvs */
+#define HFI1_KDETH_BTH_SEQ_SHIFT 11
+#define HFI1_KDETH_BTH_SEQ_MASK (BIT(HFI1_KDETH_BTH_SEQ_SHIFT) - 1)
+
static inline __u64 rhf_to_cpu(const __le32 *rbuf)
{
return __le64_to_cpu(*((__le64 *)rbuf));
diff --git a/drivers/infiniband/hw/hfi1/debugfs.c b/drivers/infiniband/hw/hfi1/debugfs.c
index 9f992ae36c89..427ba0ce74a5 100644
--- a/drivers/infiniband/hw/hfi1/debugfs.c
+++ b/drivers/infiniband/hw/hfi1/debugfs.c
@@ -407,6 +407,54 @@ DEBUGFS_SEQ_FILE_OPS(rcds);
DEBUGFS_SEQ_FILE_OPEN(rcds)
DEBUGFS_FILE_OPS(rcds);
+static void *_pios_seq_start(struct seq_file *s, loff_t *pos)
+{
+ struct hfi1_ibdev *ibd;
+ struct hfi1_devdata *dd;
+
+ ibd = (struct hfi1_ibdev *)s->private;
+ dd = dd_from_dev(ibd);
+ if (!dd->send_contexts || *pos >= dd->num_send_contexts)
+ return NULL;
+ return pos;
+}
+
+static void *_pios_seq_next(struct seq_file *s, void *v, loff_t *pos)
+{
+ struct hfi1_ibdev *ibd = (struct hfi1_ibdev *)s->private;
+ struct hfi1_devdata *dd = dd_from_dev(ibd);
+
+ ++*pos;
+ if (!dd->send_contexts || *pos >= dd->num_send_contexts)
+ return NULL;
+ return pos;
+}
+
+static void _pios_seq_stop(struct seq_file *s, void *v)
+{
+}
+
+static int _pios_seq_show(struct seq_file *s, void *v)
+{
+ struct hfi1_ibdev *ibd = (struct hfi1_ibdev *)s->private;
+ struct hfi1_devdata *dd = dd_from_dev(ibd);
+ struct send_context_info *sci;
+ loff_t *spos = v;
+ loff_t i = *spos;
+ unsigned long flags;
+
+ spin_lock_irqsave(&dd->sc_lock, flags);
+ sci = &dd->send_contexts[i];
+ if (sci && sci->type != SC_USER && sci->allocated && sci->sc)
+ seqfile_dump_sci(s, i, sci);
+ spin_unlock_irqrestore(&dd->sc_lock, flags);
+ return 0;
+}
+
+DEBUGFS_SEQ_FILE_OPS(pios);
+DEBUGFS_SEQ_FILE_OPEN(pios)
+DEBUGFS_FILE_OPS(pios);
+
/* read the per-device counters */
static ssize_t dev_counters_read(struct file *file, char __user *buf,
size_t count, loff_t *ppos)
@@ -1119,6 +1167,7 @@ void hfi1_dbg_ibdev_init(struct hfi1_ibdev *ibd)
char link[10];
struct hfi1_devdata *dd = dd_from_dev(ibd);
struct hfi1_pportdata *ppd;
+ struct dentry *root;
int unit = dd->unit;
int i, j;
@@ -1126,30 +1175,29 @@ void hfi1_dbg_ibdev_init(struct hfi1_ibdev *ibd)
return;
snprintf(name, sizeof(name), "%s_%d", class_name(), unit);
snprintf(link, sizeof(link), "%d", unit);
- ibd->hfi1_ibdev_dbg = debugfs_create_dir(name, hfi1_dbg_root);
- if (!ibd->hfi1_ibdev_dbg) {
- pr_warn("create of %s failed\n", name);
- return;
- }
+ root = debugfs_create_dir(name, hfi1_dbg_root);
+ ibd->hfi1_ibdev_dbg = root;
+
ibd->hfi1_ibdev_link =
debugfs_create_symlink(link, hfi1_dbg_root, name);
- if (!ibd->hfi1_ibdev_link) {
- pr_warn("create of %s symlink failed\n", name);
- return;
- }
- DEBUGFS_SEQ_FILE_CREATE(opcode_stats, ibd->hfi1_ibdev_dbg, ibd);
- DEBUGFS_SEQ_FILE_CREATE(tx_opcode_stats, ibd->hfi1_ibdev_dbg, ibd);
- DEBUGFS_SEQ_FILE_CREATE(ctx_stats, ibd->hfi1_ibdev_dbg, ibd);
- DEBUGFS_SEQ_FILE_CREATE(qp_stats, ibd->hfi1_ibdev_dbg, ibd);
- DEBUGFS_SEQ_FILE_CREATE(sdes, ibd->hfi1_ibdev_dbg, ibd);
- DEBUGFS_SEQ_FILE_CREATE(rcds, ibd->hfi1_ibdev_dbg, ibd);
- DEBUGFS_SEQ_FILE_CREATE(sdma_cpu_list, ibd->hfi1_ibdev_dbg, ibd);
+
+ debugfs_create_file("opcode_stats", 0444, root, ibd,
+ &_opcode_stats_file_ops);
+ debugfs_create_file("tx_opcode_stats", 0444, root, ibd,
+ &_tx_opcode_stats_file_ops);
+ debugfs_create_file("ctx_stats", 0444, root, ibd, &_ctx_stats_file_ops);
+ debugfs_create_file("qp_stats", 0444, root, ibd, &_qp_stats_file_ops);
+ debugfs_create_file("sdes", 0444, root, ibd, &_sdes_file_ops);
+ debugfs_create_file("rcds", 0444, root, ibd, &_rcds_file_ops);
+ debugfs_create_file("pios", 0444, root, ibd, &_pios_file_ops);
+ debugfs_create_file("sdma_cpu_list", 0444, root, ibd,
+ &_sdma_cpu_list_file_ops);
+
/* dev counter files */
for (i = 0; i < ARRAY_SIZE(cntr_ops); i++)
- DEBUGFS_FILE_CREATE(cntr_ops[i].name,
- ibd->hfi1_ibdev_dbg,
- dd,
- &cntr_ops[i].ops, S_IRUGO);
+ debugfs_create_file(cntr_ops[i].name, 0444, root, dd,
+ &cntr_ops[i].ops);
+
/* per port files */
for (ppd = dd->pport, j = 0; j < dd->num_pports; j++, ppd++)
for (i = 0; i < ARRAY_SIZE(port_cntr_ops); i++) {
@@ -1157,12 +1205,11 @@ void hfi1_dbg_ibdev_init(struct hfi1_ibdev *ibd)
sizeof(name),
port_cntr_ops[i].name,
j + 1);
- DEBUGFS_FILE_CREATE(name,
- ibd->hfi1_ibdev_dbg,
- ppd,
- &port_cntr_ops[i].ops,
+ debugfs_create_file(name,
!port_cntr_ops[i].ops.write ?
- S_IRUGO : S_IRUGO | S_IWUSR);
+ S_IRUGO :
+ S_IRUGO | S_IWUSR,
+ root, ppd, &port_cntr_ops[i].ops);
}
hfi1_fault_init_debugfs(ibd);
@@ -1292,10 +1339,10 @@ DEBUGFS_FILE_OPS(driver_stats);
void hfi1_dbg_init(void)
{
hfi1_dbg_root = debugfs_create_dir(DRIVER_NAME, NULL);
- if (!hfi1_dbg_root)
- pr_warn("init of debugfs failed\n");
- DEBUGFS_SEQ_FILE_CREATE(driver_stats_names, hfi1_dbg_root, NULL);
- DEBUGFS_SEQ_FILE_CREATE(driver_stats, hfi1_dbg_root, NULL);
+ debugfs_create_file("driver_stats_names", 0444, hfi1_dbg_root, NULL,
+ &_driver_stats_names_file_ops);
+ debugfs_create_file("driver_stats", 0444, hfi1_dbg_root, NULL,
+ &_driver_stats_file_ops);
}
void hfi1_dbg_exit(void)
diff --git a/drivers/infiniband/hw/hfi1/debugfs.h b/drivers/infiniband/hw/hfi1/debugfs.h
index d5d824459fcc..57e582caa5eb 100644
--- a/drivers/infiniband/hw/hfi1/debugfs.h
+++ b/drivers/infiniband/hw/hfi1/debugfs.h
@@ -49,16 +49,6 @@
struct hfi1_ibdev;
-#define DEBUGFS_FILE_CREATE(name, parent, data, ops, mode) \
-do { \
- struct dentry *ent; \
- const char *__name = name; \
- ent = debugfs_create_file(__name, mode, parent, \
- data, ops); \
- if (!ent) \
- pr_warn("create of %s failed\n", __name); \
-} while (0)
-
#define DEBUGFS_SEQ_FILE_OPS(name) \
static const struct seq_operations _##name##_seq_ops = { \
.start = _##name##_seq_start, \
@@ -89,8 +79,6 @@ static const struct file_operations _##name##_file_ops = { \
.release = seq_release \
}
-#define DEBUGFS_SEQ_FILE_CREATE(name, parent, data) \
- DEBUGFS_FILE_CREATE(#name, parent, data, &_##name##_file_ops, 0444)
ssize_t hfi1_seq_read(struct file *file, char __user *buf, size_t size,
loff_t *ppos);
diff --git a/drivers/infiniband/hw/hfi1/driver.c b/drivers/infiniband/hw/hfi1/driver.c
index a41f85558312..2a9d2912f5db 100644
--- a/drivers/infiniband/hw/hfi1/driver.c
+++ b/drivers/infiniband/hw/hfi1/driver.c
@@ -430,40 +430,60 @@ static const hfi1_handle_cnp hfi1_handle_cnp_tbl[2] = {
[HFI1_PKT_TYPE_16B] = &return_cnp_16B
};
-void hfi1_process_ecn_slowpath(struct rvt_qp *qp, struct hfi1_packet *pkt,
- bool do_cnp)
+/**
+ * hfi1_process_ecn_slowpath - Process FECN or BECN bits
+ * @qp: The packet's destination QP
+ * @pkt: The packet itself.
+ * @prescan: Is the caller the RXQ prescan
+ *
+ * Process the packet's FECN or BECN bits. By now, the packet
+ * has already been evaluated whether processing of those bits should
+ * be done.
+ * The significance of the @prescan argument is that if the caller
+ * is the RXQ prescan, a CNP will be sent out instead of waiting for the
+ * normal packet processing to send an ACK with BECN set (or a CNP).
+ */
+bool hfi1_process_ecn_slowpath(struct rvt_qp *qp, struct hfi1_packet *pkt,
+ bool prescan)
{
struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
struct ib_other_headers *ohdr = pkt->ohdr;
struct ib_grh *grh = pkt->grh;
- u32 rqpn = 0, bth1;
+ u32 rqpn = 0;
u16 pkey;
u32 rlid, slid, dlid = 0;
- u8 hdr_type, sc, svc_type;
- bool is_mcast = false;
+ u8 hdr_type, sc, svc_type, opcode;
+ bool is_mcast = false, ignore_fecn = false, do_cnp = false,
+ fecn, becn;
/* can be called from prescan */
if (pkt->etype == RHF_RCV_TYPE_BYPASS) {
- is_mcast = hfi1_is_16B_mcast(dlid);
pkey = hfi1_16B_get_pkey(pkt->hdr);
sc = hfi1_16B_get_sc(pkt->hdr);
dlid = hfi1_16B_get_dlid(pkt->hdr);
slid = hfi1_16B_get_slid(pkt->hdr);
+ is_mcast = hfi1_is_16B_mcast(dlid);
+ opcode = ib_bth_get_opcode(ohdr);
hdr_type = HFI1_PKT_TYPE_16B;
+ fecn = hfi1_16B_get_fecn(pkt->hdr);
+ becn = hfi1_16B_get_becn(pkt->hdr);
} else {
- is_mcast = (dlid > be16_to_cpu(IB_MULTICAST_LID_BASE)) &&
- (dlid != be16_to_cpu(IB_LID_PERMISSIVE));
pkey = ib_bth_get_pkey(ohdr);
sc = hfi1_9B_get_sc5(pkt->hdr, pkt->rhf);
- dlid = ib_get_dlid(pkt->hdr);
+ dlid = qp->ibqp.qp_type != IB_QPT_UD ? ib_get_dlid(pkt->hdr) :
+ ppd->lid;
slid = ib_get_slid(pkt->hdr);
+ is_mcast = (dlid > be16_to_cpu(IB_MULTICAST_LID_BASE)) &&
+ (dlid != be16_to_cpu(IB_LID_PERMISSIVE));
+ opcode = ib_bth_get_opcode(ohdr);
hdr_type = HFI1_PKT_TYPE_9B;
+ fecn = ib_bth_get_fecn(ohdr);
+ becn = ib_bth_get_becn(ohdr);
}
switch (qp->ibqp.qp_type) {
case IB_QPT_UD:
- dlid = ppd->lid;
rlid = slid;
rqpn = ib_get_sqpn(pkt->ohdr);
svc_type = IB_CC_SVCTYPE_UD;
@@ -485,22 +505,31 @@ void hfi1_process_ecn_slowpath(struct rvt_qp *qp, struct hfi1_packet *pkt,
svc_type = IB_CC_SVCTYPE_RC;
break;
default:
- return;
+ return false;
}
- bth1 = be32_to_cpu(ohdr->bth[1]);
+ ignore_fecn = is_mcast || (opcode == IB_OPCODE_CNP) ||
+ (opcode == IB_OPCODE_RC_ACKNOWLEDGE);
+ /*
+ * ACKNOWLEDGE packets do not get a CNP but this will be
+ * guarded by ignore_fecn above.
+ */
+ do_cnp = prescan ||
+ (opcode >= IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST &&
+ opcode <= IB_OPCODE_RC_ATOMIC_ACKNOWLEDGE);
+
/* Call appropriate CNP handler */
- if (do_cnp && (bth1 & IB_FECN_SMASK))
+ if (!ignore_fecn && do_cnp && fecn)
hfi1_handle_cnp_tbl[hdr_type](ibp, qp, rqpn, pkey,
dlid, rlid, sc, grh);
- if (!is_mcast && (bth1 & IB_BECN_SMASK)) {
- u32 lqpn = bth1 & RVT_QPN_MASK;
+ if (becn) {
+ u32 lqpn = be32_to_cpu(ohdr->bth[1]) & RVT_QPN_MASK;
u8 sl = ibp->sc_to_sl[sc];
process_becn(ppd, sl, rlid, lqpn, rqpn, svc_type);
}
-
+ return !ignore_fecn && fecn;
}
struct ps_mdata {
@@ -599,7 +628,6 @@ static void __prescan_rxq(struct hfi1_packet *packet)
struct rvt_dev_info *rdi = &rcd->dd->verbs_dev.rdi;
u64 rhf = rhf_to_cpu(rhf_addr);
u32 etype = rhf_rcv_type(rhf), qpn, bth1;
- int is_ecn = 0;
u8 lnh;
if (ps_done(&mdata, rhf, rcd))
@@ -625,12 +653,10 @@ static void __prescan_rxq(struct hfi1_packet *packet)
goto next; /* just in case */
}
- bth1 = be32_to_cpu(packet->ohdr->bth[1]);
- is_ecn = !!(bth1 & (IB_FECN_SMASK | IB_BECN_SMASK));
-
- if (!is_ecn)
+ if (!hfi1_may_ecn(packet))
goto next;
+ bth1 = be32_to_cpu(packet->ohdr->bth[1]);
qpn = bth1 & RVT_QPN_MASK;
rcu_read_lock();
qp = rvt_lookup_qpn(rdi, &ibp->rvp, qpn);
@@ -640,7 +666,7 @@ static void __prescan_rxq(struct hfi1_packet *packet)
goto next;
}
- process_ecn(qp, packet, true);
+ hfi1_process_ecn_slowpath(qp, packet, true);
rcu_read_unlock();
/* turn off BECN, FECN */
@@ -1400,7 +1426,7 @@ static int hfi1_bypass_ingress_pkt_check(struct hfi1_packet *packet)
if ((!(hfi1_is_16B_mcast(packet->dlid))) &&
(packet->dlid !=
opa_get_lid(be32_to_cpu(OPA_LID_PERMISSIVE), 16B))) {
- if (packet->dlid != ppd->lid)
+ if ((packet->dlid & ~((1 << ppd->lmc) - 1)) != ppd->lid)
return -EINVAL;
}
@@ -1549,25 +1575,32 @@ drop:
return -EINVAL;
}
-void handle_eflags(struct hfi1_packet *packet)
+static void show_eflags_errs(struct hfi1_packet *packet)
{
struct hfi1_ctxtdata *rcd = packet->rcd;
u32 rte = rhf_rcv_type_err(packet->rhf);
+ dd_dev_err(rcd->dd,
+ "receive context %d: rhf 0x%016llx, errs [ %s%s%s%s%s%s%s%s] rte 0x%x\n",
+ rcd->ctxt, packet->rhf,
+ packet->rhf & RHF_K_HDR_LEN_ERR ? "k_hdr_len " : "",
+ packet->rhf & RHF_DC_UNC_ERR ? "dc_unc " : "",
+ packet->rhf & RHF_DC_ERR ? "dc " : "",
+ packet->rhf & RHF_TID_ERR ? "tid " : "",
+ packet->rhf & RHF_LEN_ERR ? "len " : "",
+ packet->rhf & RHF_ECC_ERR ? "ecc " : "",
+ packet->rhf & RHF_VCRC_ERR ? "vcrc " : "",
+ packet->rhf & RHF_ICRC_ERR ? "icrc " : "",
+ rte);
+}
+
+void handle_eflags(struct hfi1_packet *packet)
+{
+ struct hfi1_ctxtdata *rcd = packet->rcd;
+
rcv_hdrerr(rcd, rcd->ppd, packet);
if (rhf_err_flags(packet->rhf))
- dd_dev_err(rcd->dd,
- "receive context %d: rhf 0x%016llx, errs [ %s%s%s%s%s%s%s%s] rte 0x%x\n",
- rcd->ctxt, packet->rhf,
- packet->rhf & RHF_K_HDR_LEN_ERR ? "k_hdr_len " : "",
- packet->rhf & RHF_DC_UNC_ERR ? "dc_unc " : "",
- packet->rhf & RHF_DC_ERR ? "dc " : "",
- packet->rhf & RHF_TID_ERR ? "tid " : "",
- packet->rhf & RHF_LEN_ERR ? "len " : "",
- packet->rhf & RHF_ECC_ERR ? "ecc " : "",
- packet->rhf & RHF_VCRC_ERR ? "vcrc " : "",
- packet->rhf & RHF_ICRC_ERR ? "icrc " : "",
- rte);
+ show_eflags_errs(packet);
}
/*
@@ -1673,11 +1706,14 @@ static int kdeth_process_expected(struct hfi1_packet *packet)
if (unlikely(hfi1_dbg_should_fault_rx(packet)))
return RHF_RCV_CONTINUE;
- if (unlikely(rhf_err_flags(packet->rhf)))
- handle_eflags(packet);
+ if (unlikely(rhf_err_flags(packet->rhf))) {
+ struct hfi1_ctxtdata *rcd = packet->rcd;
- dd_dev_err(packet->rcd->dd,
- "Unhandled expected packet received. Dropping.\n");
+ if (hfi1_handle_kdeth_eflags(rcd, rcd->ppd, packet))
+ return RHF_RCV_CONTINUE;
+ }
+
+ hfi1_kdeth_expected_rcv(packet);
return RHF_RCV_CONTINUE;
}
@@ -1686,11 +1722,17 @@ static int kdeth_process_eager(struct hfi1_packet *packet)
hfi1_setup_9B_packet(packet);
if (unlikely(hfi1_dbg_should_fault_rx(packet)))
return RHF_RCV_CONTINUE;
- if (unlikely(rhf_err_flags(packet->rhf)))
- handle_eflags(packet);
- dd_dev_err(packet->rcd->dd,
- "Unhandled eager packet received. Dropping.\n");
+ trace_hfi1_rcvhdr(packet);
+ if (unlikely(rhf_err_flags(packet->rhf))) {
+ struct hfi1_ctxtdata *rcd = packet->rcd;
+
+ show_eflags_errs(packet);
+ if (hfi1_handle_kdeth_eflags(rcd, rcd->ppd, packet))
+ return RHF_RCV_CONTINUE;
+ }
+
+ hfi1_kdeth_eager_rcv(packet);
return RHF_RCV_CONTINUE;
}
diff --git a/drivers/infiniband/hw/hfi1/fault.c b/drivers/infiniband/hw/hfi1/fault.c
index e2290f32c8d9..3fd3315d0fb0 100644
--- a/drivers/infiniband/hw/hfi1/fault.c
+++ b/drivers/infiniband/hw/hfi1/fault.c
@@ -250,6 +250,7 @@ void hfi1_fault_exit_debugfs(struct hfi1_ibdev *ibd)
int hfi1_fault_init_debugfs(struct hfi1_ibdev *ibd)
{
struct dentry *parent = ibd->hfi1_ibdev_dbg;
+ struct dentry *fault_dir;
ibd->fault = kzalloc(sizeof(*ibd->fault), GFP_KERNEL);
if (!ibd->fault)
@@ -269,45 +270,31 @@ int hfi1_fault_init_debugfs(struct hfi1_ibdev *ibd)
bitmap_zero(ibd->fault->opcodes,
sizeof(ibd->fault->opcodes) * BITS_PER_BYTE);
- ibd->fault->dir =
- fault_create_debugfs_attr("fault", parent,
- &ibd->fault->attr);
- if (IS_ERR(ibd->fault->dir)) {
+ fault_dir =
+ fault_create_debugfs_attr("fault", parent, &ibd->fault->attr);
+ if (IS_ERR(fault_dir)) {
kfree(ibd->fault);
ibd->fault = NULL;
return -ENOENT;
}
-
- DEBUGFS_SEQ_FILE_CREATE(fault_stats, ibd->fault->dir, ibd);
- if (!debugfs_create_bool("enable", 0600, ibd->fault->dir,
- &ibd->fault->enable))
- goto fail;
- if (!debugfs_create_bool("suppress_err", 0600,
- ibd->fault->dir,
- &ibd->fault->suppress_err))
- goto fail;
- if (!debugfs_create_bool("opcode_mode", 0600, ibd->fault->dir,
- &ibd->fault->opcode))
- goto fail;
- if (!debugfs_create_file("opcodes", 0600, ibd->fault->dir,
- ibd->fault, &__fault_opcodes_fops))
- goto fail;
- if (!debugfs_create_u64("skip_pkts", 0600,
- ibd->fault->dir,
- &ibd->fault->fault_skip))
- goto fail;
- if (!debugfs_create_u64("skip_usec", 0600,
- ibd->fault->dir,
- &ibd->fault->fault_skip_usec))
- goto fail;
- if (!debugfs_create_u8("direction", 0600, ibd->fault->dir,
- &ibd->fault->direction))
- goto fail;
+ ibd->fault->dir = fault_dir;
+
+ debugfs_create_file("fault_stats", 0444, fault_dir, ibd,
+ &_fault_stats_file_ops);
+ debugfs_create_bool("enable", 0600, fault_dir, &ibd->fault->enable);
+ debugfs_create_bool("suppress_err", 0600, fault_dir,
+ &ibd->fault->suppress_err);
+ debugfs_create_bool("opcode_mode", 0600, fault_dir,
+ &ibd->fault->opcode);
+ debugfs_create_file("opcodes", 0600, fault_dir, ibd->fault,
+ &__fault_opcodes_fops);
+ debugfs_create_u64("skip_pkts", 0600, fault_dir,
+ &ibd->fault->fault_skip);
+ debugfs_create_u64("skip_usec", 0600, fault_dir,
+ &ibd->fault->fault_skip_usec);
+ debugfs_create_u8("direction", 0600, fault_dir, &ibd->fault->direction);
return 0;
-fail:
- hfi1_fault_exit_debugfs(ibd);
- return -ENOMEM;
}
bool hfi1_dbg_fault_suppress_err(struct hfi1_ibdev *ibd)
diff --git a/drivers/infiniband/hw/hfi1/file_ops.c b/drivers/infiniband/hw/hfi1/file_ops.c
index c22ebc774a6a..f9a7e9d29c8b 100644
--- a/drivers/infiniband/hw/hfi1/file_ops.c
+++ b/drivers/infiniband/hw/hfi1/file_ops.c
@@ -488,7 +488,7 @@ static int hfi1_file_mmap(struct file *fp, struct vm_area_struct *vma)
vmf = 1;
break;
case STATUS:
- if (flags & (unsigned long)(VM_WRITE | VM_EXEC)) {
+ if (flags & VM_WRITE) {
ret = -EPERM;
goto done;
}
diff --git a/drivers/infiniband/hw/hfi1/hfi.h b/drivers/infiniband/hw/hfi1/hfi.h
index 2b882347d0c2..048b5d73ba39 100644
--- a/drivers/infiniband/hw/hfi1/hfi.h
+++ b/drivers/infiniband/hw/hfi1/hfi.h
@@ -73,6 +73,7 @@
#include "chip_registers.h"
#include "common.h"
+#include "opfn.h"
#include "verbs.h"
#include "pio.h"
#include "chip.h"
@@ -98,6 +99,8 @@
#define NEIGHBOR_TYPE_HFI 0
#define NEIGHBOR_TYPE_SWITCH 1
+#define HFI1_MAX_ACTIVE_WORKQUEUE_ENTRIES 5
+
extern unsigned long hfi1_cap_mask;
#define HFI1_CAP_KGET_MASK(mask, cap) ((mask) & HFI1_CAP_##cap)
#define HFI1_CAP_UGET_MASK(mask, cap) \
@@ -195,6 +198,14 @@ struct exp_tid_set {
};
typedef int (*rhf_rcv_function_ptr)(struct hfi1_packet *packet);
+
+struct tid_queue {
+ struct list_head queue_head;
+ /* queue head for QP TID resource waiters */
+ u32 enqueue; /* count of tid enqueues */
+ u32 dequeue; /* count of tid dequeues */
+};
+
struct hfi1_ctxtdata {
/* rcvhdrq base, needs mmap before useful */
void *rcvhdrq;
@@ -288,6 +299,12 @@ struct hfi1_ctxtdata {
/* PSM Specific fields */
/* lock protecting all Expected TID data */
struct mutex exp_mutex;
+ /* lock protecting all Expected TID data of kernel contexts */
+ spinlock_t exp_lock;
+ /* Queue for QPs waiting for HW TID flows */
+ struct tid_queue flow_queue;
+ /* Queue for QPs waiting for HW receive array entries */
+ struct tid_queue rarr_queue;
/* when waiting for rcv or pioavail */
wait_queue_head_t wait;
/* uuid from PSM */
@@ -320,6 +337,9 @@ struct hfi1_ctxtdata {
*/
u8 subctxt_cnt;
+ /* Bit mask to track free TID RDMA HW flows */
+ unsigned long flow_mask;
+ struct tid_flow_state flows[RXE_NUM_TID_FLOWS];
};
/**
@@ -1435,7 +1455,7 @@ void hfi1_init_pportdata(struct pci_dev *pdev, struct hfi1_pportdata *ppd,
struct hfi1_devdata *dd, u8 hw_pidx, u8 port);
void hfi1_free_ctxtdata(struct hfi1_devdata *dd, struct hfi1_ctxtdata *rcd);
int hfi1_rcd_put(struct hfi1_ctxtdata *rcd);
-void hfi1_rcd_get(struct hfi1_ctxtdata *rcd);
+int hfi1_rcd_get(struct hfi1_ctxtdata *rcd);
struct hfi1_ctxtdata *hfi1_rcd_get_by_index_safe(struct hfi1_devdata *dd,
u16 ctxt);
struct hfi1_ctxtdata *hfi1_rcd_get_by_index(struct hfi1_devdata *dd, u16 ctxt);
@@ -1804,13 +1824,20 @@ static inline struct hfi1_ibport *rcd_to_iport(struct hfi1_ctxtdata *rcd)
return &rcd->ppd->ibport_data;
}
-void hfi1_process_ecn_slowpath(struct rvt_qp *qp, struct hfi1_packet *pkt,
- bool do_cnp);
-static inline bool process_ecn(struct rvt_qp *qp, struct hfi1_packet *pkt,
- bool do_cnp)
+/**
+ * hfi1_may_ecn - Check whether FECN or BECN processing should be done
+ * @pkt: the packet to be evaluated
+ *
+ * Check whether the FECN or BECN bits in the packet's header are
+ * enabled, depending on packet type.
+ *
+ * This function only checks for FECN and BECN bits. Additional checks
+ * are done in the slowpath (hfi1_process_ecn_slowpath()) in order to
+ * ensure correct handling.
+ */
+static inline bool hfi1_may_ecn(struct hfi1_packet *pkt)
{
- bool becn;
- bool fecn;
+ bool fecn, becn;
if (pkt->etype == RHF_RCV_TYPE_BYPASS) {
fecn = hfi1_16B_get_fecn(pkt->hdr);
@@ -1819,10 +1846,18 @@ static inline bool process_ecn(struct rvt_qp *qp, struct hfi1_packet *pkt,
fecn = ib_bth_get_fecn(pkt->ohdr);
becn = ib_bth_get_becn(pkt->ohdr);
}
- if (unlikely(fecn || becn)) {
- hfi1_process_ecn_slowpath(qp, pkt, do_cnp);
- return fecn;
- }
+ return fecn || becn;
+}
+
+bool hfi1_process_ecn_slowpath(struct rvt_qp *qp, struct hfi1_packet *pkt,
+ bool prescan);
+static inline bool process_ecn(struct rvt_qp *qp, struct hfi1_packet *pkt)
+{
+ bool do_work;
+
+ do_work = hfi1_may_ecn(pkt);
+ if (unlikely(do_work))
+ return hfi1_process_ecn_slowpath(qp, pkt, false);
return false;
}
@@ -2085,7 +2120,7 @@ static inline u64 hfi1_pkt_default_send_ctxt_mask(struct hfi1_devdata *dd,
SEND_CTXT_CHECK_ENABLE_DISALLOW_PBC_TEST_SMASK |
#endif
HFI1_PKT_USER_SC_INTEGRITY;
- else
+ else if (ctxt_type != SC_KERNEL)
base_sc_integrity |= HFI1_PKT_KERNEL_SC_INTEGRITY;
/* turn on send-side job key checks if !A0 */
diff --git a/drivers/infiniband/hw/hfi1/init.c b/drivers/infiniband/hw/hfi1/init.c
index 09044905284f..faaaac8fbc55 100644
--- a/drivers/infiniband/hw/hfi1/init.c
+++ b/drivers/infiniband/hw/hfi1/init.c
@@ -54,6 +54,7 @@
#include <linux/printk.h>
#include <linux/hrtimer.h>
#include <linux/bitmap.h>
+#include <linux/numa.h>
#include <rdma/rdma_vt.h>
#include "hfi.h"
@@ -72,7 +73,6 @@
#undef pr_fmt
#define pr_fmt(fmt) DRIVER_NAME ": " fmt
-#define HFI1_MAX_ACTIVE_WORKQUEUE_ENTRIES 5
/*
* min buffers we want to have per context, after driver
*/
@@ -215,12 +215,12 @@ static void hfi1_rcd_free(struct kref *kref)
struct hfi1_ctxtdata *rcd =
container_of(kref, struct hfi1_ctxtdata, kref);
- hfi1_free_ctxtdata(rcd->dd, rcd);
-
spin_lock_irqsave(&rcd->dd->uctxt_lock, flags);
rcd->dd->rcd[rcd->ctxt] = NULL;
spin_unlock_irqrestore(&rcd->dd->uctxt_lock, flags);
+ hfi1_free_ctxtdata(rcd->dd, rcd);
+
kfree(rcd);
}
@@ -243,10 +243,13 @@ int hfi1_rcd_put(struct hfi1_ctxtdata *rcd)
* @rcd: pointer to an initialized rcd data structure
*
* Use this to get a reference after the init.
+ *
+ * Return: reflects kref_get_unless_zero(), which returns non-zero on
+ * increment, otherwise 0.
*/
-void hfi1_rcd_get(struct hfi1_ctxtdata *rcd)
+int hfi1_rcd_get(struct hfi1_ctxtdata *rcd)
{
- kref_get(&rcd->kref);
+ return kref_get_unless_zero(&rcd->kref);
}
/**
@@ -326,7 +329,8 @@ struct hfi1_ctxtdata *hfi1_rcd_get_by_index(struct hfi1_devdata *dd, u16 ctxt)
spin_lock_irqsave(&dd->uctxt_lock, flags);
if (dd->rcd[ctxt]) {
rcd = dd->rcd[ctxt];
- hfi1_rcd_get(rcd);
+ if (!hfi1_rcd_get(rcd))
+ rcd = NULL;
}
spin_unlock_irqrestore(&dd->uctxt_lock, flags);
@@ -371,6 +375,9 @@ int hfi1_create_ctxtdata(struct hfi1_pportdata *ppd, int numa,
rcd->rhf_rcv_function_map = normal_rhf_rcv_functions;
mutex_init(&rcd->exp_mutex);
+ spin_lock_init(&rcd->exp_lock);
+ INIT_LIST_HEAD(&rcd->flow_queue.queue_head);
+ INIT_LIST_HEAD(&rcd->rarr_queue.queue_head);
hfi1_cdbg(PROC, "setting up context %u\n", rcd->ctxt);
@@ -473,6 +480,9 @@ int hfi1_create_ctxtdata(struct hfi1_pportdata *ppd, int numa,
GFP_KERNEL, numa);
if (!rcd->opstats)
goto bail;
+
+ /* Initialize TID flow generations for the context */
+ hfi1_kern_init_ctxt_generations(rcd);
}
*context = rcd;
@@ -772,6 +782,8 @@ static void enable_chip(struct hfi1_devdata *dd)
rcvmask |= HFI1_RCVCTRL_NO_RHQ_DROP_ENB;
if (HFI1_CAP_KGET_MASK(rcd->flags, NODROP_EGR_FULL))
rcvmask |= HFI1_RCVCTRL_NO_EGR_DROP_ENB;
+ if (HFI1_CAP_IS_KSET(TID_RDMA))
+ rcvmask |= HFI1_RCVCTRL_TIDFLOW_ENB;
hfi1_rcvctrl(dd, rcvmask, rcd);
sc_enable(rcd->sc);
hfi1_rcd_put(rcd);
@@ -899,10 +911,10 @@ int hfi1_init(struct hfi1_devdata *dd, int reinit)
goto done;
/* allocate dummy tail memory for all receive contexts */
- dd->rcvhdrtail_dummy_kvaddr = dma_zalloc_coherent(
- &dd->pcidev->dev, sizeof(u64),
- &dd->rcvhdrtail_dummy_dma,
- GFP_KERNEL);
+ dd->rcvhdrtail_dummy_kvaddr = dma_alloc_coherent(&dd->pcidev->dev,
+ sizeof(u64),
+ &dd->rcvhdrtail_dummy_dma,
+ GFP_KERNEL);
if (!dd->rcvhdrtail_dummy_kvaddr) {
dd_dev_err(dd, "cannot allocate dummy tail memory\n");
@@ -927,6 +939,8 @@ int hfi1_init(struct hfi1_devdata *dd, int reinit)
lastfail = hfi1_create_rcvhdrq(dd, rcd);
if (!lastfail)
lastfail = hfi1_setup_eagerbufs(rcd);
+ if (!lastfail)
+ lastfail = hfi1_kern_exp_rcv_init(rcd, reinit);
if (lastfail) {
dd_dev_err(dd,
"failed to allocate kernel ctxt's rcvhdrq and/or egr bufs\n");
@@ -1303,7 +1317,7 @@ static struct hfi1_devdata *hfi1_alloc_devdata(struct pci_dev *pdev,
dd->unit = ret;
list_add(&dd->list, &hfi1_dev_list);
}
- dd->node = -1;
+ dd->node = NUMA_NO_NODE;
spin_unlock_irqrestore(&hfi1_devs_lock, flags);
idr_preload_end();
@@ -1497,6 +1511,13 @@ static int __init hfi1_mod_init(void)
/* sanitize link CRC options */
link_crc_mask &= SUPPORTED_CRCS;
+ ret = opfn_init();
+ if (ret < 0) {
+ pr_err("Failed to allocate opfn_wq");
+ goto bail_dev;
+ }
+
+ hfi1_compute_tid_rdma_flow_wt();
/*
* These must be called before the driver is registered with
* the PCI subsystem.
@@ -1527,6 +1548,7 @@ module_init(hfi1_mod_init);
static void __exit hfi1_mod_cleanup(void)
{
pci_unregister_driver(&hfi1_pci_driver);
+ opfn_exit();
node_affinity_destroy_all();
hfi1_dbg_exit();
@@ -1581,7 +1603,7 @@ static void cleanup_device_data(struct hfi1_devdata *dd)
struct hfi1_ctxtdata *rcd = dd->rcd[ctxt];
if (rcd) {
- hfi1_clear_tids(rcd);
+ hfi1_free_ctxt_rcv_groups(rcd);
hfi1_free_ctxt(rcd);
}
}
@@ -1863,9 +1885,9 @@ int hfi1_create_rcvhdrq(struct hfi1_devdata *dd, struct hfi1_ctxtdata *rcd)
gfp_flags = GFP_KERNEL;
else
gfp_flags = GFP_USER;
- rcd->rcvhdrq = dma_zalloc_coherent(
- &dd->pcidev->dev, amt, &rcd->rcvhdrq_dma,
- gfp_flags | __GFP_COMP);
+ rcd->rcvhdrq = dma_alloc_coherent(&dd->pcidev->dev, amt,
+ &rcd->rcvhdrq_dma,
+ gfp_flags | __GFP_COMP);
if (!rcd->rcvhdrq) {
dd_dev_err(dd,
@@ -1876,9 +1898,10 @@ int hfi1_create_rcvhdrq(struct hfi1_devdata *dd, struct hfi1_ctxtdata *rcd)
if (HFI1_CAP_KGET_MASK(rcd->flags, DMA_RTAIL) ||
HFI1_CAP_UGET_MASK(rcd->flags, DMA_RTAIL)) {
- rcd->rcvhdrtail_kvaddr = dma_zalloc_coherent(
- &dd->pcidev->dev, PAGE_SIZE,
- &rcd->rcvhdrqtailaddr_dma, gfp_flags);
+ rcd->rcvhdrtail_kvaddr = dma_alloc_coherent(&dd->pcidev->dev,
+ PAGE_SIZE,
+ &rcd->rcvhdrqtailaddr_dma,
+ gfp_flags);
if (!rcd->rcvhdrtail_kvaddr)
goto bail_free;
}
@@ -1974,10 +1997,10 @@ int hfi1_setup_eagerbufs(struct hfi1_ctxtdata *rcd)
while (alloced_bytes < rcd->egrbufs.size &&
rcd->egrbufs.alloced < rcd->egrbufs.count) {
rcd->egrbufs.buffers[idx].addr =
- dma_zalloc_coherent(&dd->pcidev->dev,
- rcd->egrbufs.rcvtid_size,
- &rcd->egrbufs.buffers[idx].dma,
- gfp_flags);
+ dma_alloc_coherent(&dd->pcidev->dev,
+ rcd->egrbufs.rcvtid_size,
+ &rcd->egrbufs.buffers[idx].dma,
+ gfp_flags);
if (rcd->egrbufs.buffers[idx].addr) {
rcd->egrbufs.buffers[idx].len =
rcd->egrbufs.rcvtid_size;
diff --git a/drivers/infiniband/hw/hfi1/iowait.c b/drivers/infiniband/hw/hfi1/iowait.c
index 582f1ba136ff..adb4a1ba921b 100644
--- a/drivers/infiniband/hw/hfi1/iowait.c
+++ b/drivers/infiniband/hw/hfi1/iowait.c
@@ -6,6 +6,9 @@
#include "iowait.h"
#include "trace_iowait.h"
+/* 1 priority == 16 starve_cnt */
+#define IOWAIT_PRIORITY_STARVE_SHIFT 4
+
void iowait_set_flag(struct iowait *wait, u32 flag)
{
trace_hfi1_iowait_set(wait, flag);
@@ -44,7 +47,8 @@ void iowait_init(struct iowait *wait, u32 tx_limit,
uint seq,
bool pkts_sent),
void (*wakeup)(struct iowait *wait, int reason),
- void (*sdma_drained)(struct iowait *wait))
+ void (*sdma_drained)(struct iowait *wait),
+ void (*init_priority)(struct iowait *wait))
{
int i;
@@ -58,6 +62,7 @@ void iowait_init(struct iowait *wait, u32 tx_limit,
wait->sleep = sleep;
wait->wakeup = wakeup;
wait->sdma_drained = sdma_drained;
+ wait->init_priority = init_priority;
wait->flags = 0;
for (i = 0; i < IOWAIT_SES; i++) {
wait->wait[i].iow = wait;
@@ -92,3 +97,30 @@ int iowait_set_work_flag(struct iowait_work *w)
iowait_set_flag(w->iow, IOWAIT_PENDING_TID);
return IOWAIT_TID_SE;
}
+
+/**
+ * iowait_priority_update_top - update the top priority entry
+ * @w: the iowait struct
+ * @top: a pointer to the top priority entry
+ * @idx: the index of the current iowait in an array
+ * @top_idx: the array index for the iowait entry that has the top priority
+ *
+ * This function is called to compare the priority of a given
+ * iowait with the given top priority entry. The top index will
+ * be returned.
+ */
+uint iowait_priority_update_top(struct iowait *w,
+ struct iowait *top,
+ uint idx, uint top_idx)
+{
+ u8 cnt, tcnt;
+
+ /* Convert priority into starve_cnt and compare the total. */
+ cnt = (w->priority << IOWAIT_PRIORITY_STARVE_SHIFT) + w->starved_cnt;
+ tcnt = (top->priority << IOWAIT_PRIORITY_STARVE_SHIFT) +
+ top->starved_cnt;
+ if (cnt > tcnt)
+ return idx;
+ else
+ return top_idx;
+}
diff --git a/drivers/infiniband/hw/hfi1/iowait.h b/drivers/infiniband/hw/hfi1/iowait.h
index 23a58ac0d47c..07847cb72169 100644
--- a/drivers/infiniband/hw/hfi1/iowait.h
+++ b/drivers/infiniband/hw/hfi1/iowait.h
@@ -100,6 +100,7 @@ struct iowait_work {
* @sleep: no space callback
* @wakeup: space callback wakeup
* @sdma_drained: sdma count drained
+ * @init_priority: callback to manipulate priority
* @lock: lock protected head of wait queue
* @iowork: workqueue overhead
* @wait_dma: wait for sdma_busy == 0
@@ -109,7 +110,7 @@ struct iowait_work {
* @tx_limit: limit for overflow queuing
* @tx_count: number of tx entry's in tx_head'ed list
* @flags: wait flags (one per QP)
- * @wait: SE array
+ * @wait: SE array for multiple legs
*
* This is to be embedded in user's state structure
* (QP or PQ).
@@ -120,10 +121,13 @@ struct iowait_work {
* are callbacks for the ULP to implement
* what ever queuing/dequeuing of
* the embedded iowait and its containing struct
- * when a resource shortage like SDMA ring space is seen.
+ * when a resource shortage like SDMA ring space
+ * or PIO credit space is seen.
*
* Both potentially have locks help
- * so sleeping is not allowed.
+ * so sleeping is not allowed and it is not
+ * supported to submit txreqs from the wakeup
+ * call directly because of lock conflicts.
*
* The wait_dma member along with the iow
*
@@ -143,6 +147,7 @@ struct iowait {
);
void (*wakeup)(struct iowait *wait, int reason);
void (*sdma_drained)(struct iowait *wait);
+ void (*init_priority)(struct iowait *wait);
seqlock_t *lock;
wait_queue_head_t wait_dma;
wait_queue_head_t wait_pio;
@@ -152,6 +157,7 @@ struct iowait {
u32 tx_limit;
u32 tx_count;
u8 starved_cnt;
+ u8 priority;
unsigned long flags;
struct iowait_work wait[IOWAIT_SES];
};
@@ -171,7 +177,8 @@ void iowait_init(struct iowait *wait, u32 tx_limit,
uint seq,
bool pkts_sent),
void (*wakeup)(struct iowait *wait, int reason),
- void (*sdma_drained)(struct iowait *wait));
+ void (*sdma_drained)(struct iowait *wait),
+ void (*init_priority)(struct iowait *wait));
/**
* iowait_schedule() - schedule the default send engine work
@@ -186,6 +193,18 @@ static inline bool iowait_schedule(struct iowait *wait,
}
/**
+ * iowait_tid_schedule - schedule the tid SE
+ * @wait: the iowait structure
+ * @wq: the work queue
+ * @cpu: the cpu
+ */
+static inline bool iowait_tid_schedule(struct iowait *wait,
+ struct workqueue_struct *wq, int cpu)
+{
+ return !!queue_work_on(cpu, wq, &wait->wait[IOWAIT_TID_SE].iowork);
+}
+
+/**
* iowait_sdma_drain() - wait for DMAs to drain
*
* @wait: iowait structure
@@ -327,6 +346,8 @@ static inline u16 iowait_get_desc(struct iowait_work *w)
tx = list_first_entry(&w->tx_head, struct sdma_txreq,
list);
num_desc = tx->num_desc;
+ if (tx->flags & SDMA_TXREQ_F_VIP)
+ w->iow->priority++;
}
return num_desc;
}
@@ -340,6 +361,37 @@ static inline u32 iowait_get_all_desc(struct iowait *w)
return num_desc;
}
+static inline void iowait_update_priority(struct iowait_work *w)
+{
+ struct sdma_txreq *tx = NULL;
+
+ if (!list_empty(&w->tx_head)) {
+ tx = list_first_entry(&w->tx_head, struct sdma_txreq,
+ list);
+ if (tx->flags & SDMA_TXREQ_F_VIP)
+ w->iow->priority++;
+ }
+}
+
+static inline void iowait_update_all_priority(struct iowait *w)
+{
+ iowait_update_priority(&w->wait[IOWAIT_IB_SE]);
+ iowait_update_priority(&w->wait[IOWAIT_TID_SE]);
+}
+
+static inline void iowait_init_priority(struct iowait *w)
+{
+ w->priority = 0;
+ if (w->init_priority)
+ w->init_priority(w);
+}
+
+static inline void iowait_get_priority(struct iowait *w)
+{
+ iowait_init_priority(w);
+ iowait_update_all_priority(w);
+}
+
/**
* iowait_queue - Put the iowait on a wait queue
* @pkts_sent: have some packets been sent before queuing?
@@ -356,14 +408,18 @@ static inline void iowait_queue(bool pkts_sent, struct iowait *w,
/*
* To play fair, insert the iowait at the tail of the wait queue if it
* has already sent some packets; Otherwise, put it at the head.
+ * However, if it has priority packets to send, also put it at the
+ * head.
*/
- if (pkts_sent) {
- list_add_tail(&w->list, wait_head);
+ if (pkts_sent)
w->starved_cnt = 0;
- } else {
- list_add(&w->list, wait_head);
+ else
w->starved_cnt++;
- }
+
+ if (w->priority > 0 || !pkts_sent)
+ list_add(&w->list, wait_head);
+ else
+ list_add_tail(&w->list, wait_head);
}
/**
@@ -380,27 +436,10 @@ static inline void iowait_starve_clear(bool pkts_sent, struct iowait *w)
w->starved_cnt = 0;
}
-/**
- * iowait_starve_find_max - Find the maximum of the starve count
- * @w: the iowait struct
- * @max: a variable containing the max starve count
- * @idx: the index of the current iowait in an array
- * @max_idx: a variable containing the array index for the
- * iowait entry that has the max starve count
- *
- * This function is called to compare the starve count of a
- * given iowait with the given max starve count. The max starve
- * count and the index will be updated if the iowait's start
- * count is larger.
- */
-static inline void iowait_starve_find_max(struct iowait *w, u8 *max,
- uint idx, uint *max_idx)
-{
- if (w->starved_cnt > *max) {
- *max = w->starved_cnt;
- *max_idx = idx;
- }
-}
+/* Update the top priority index */
+uint iowait_priority_update_top(struct iowait *w,
+ struct iowait *top,
+ uint idx, uint top_idx);
/**
* iowait_packet_queued() - determine if a packet is queued
diff --git a/drivers/infiniband/hw/hfi1/mad.c b/drivers/infiniband/hw/hfi1/mad.c
index 88a0cf930136..4228393e6c4c 100644
--- a/drivers/infiniband/hw/hfi1/mad.c
+++ b/drivers/infiniband/hw/hfi1/mad.c
@@ -305,7 +305,7 @@ static struct ib_ah *hfi1_create_qp0_ah(struct hfi1_ibport *ibp, u32 dlid)
rcu_read_lock();
qp0 = rcu_dereference(ibp->rvp.qp[0]);
if (qp0)
- ah = rdma_create_ah(qp0->ibqp.pd, &attr);
+ ah = rdma_create_ah(qp0->ibqp.pd, &attr, 0);
rcu_read_unlock();
return ah;
}
diff --git a/drivers/infiniband/hw/hfi1/mmu_rb.c b/drivers/infiniband/hw/hfi1/mmu_rb.c
index 475b769e120c..14d2a90964c3 100644
--- a/drivers/infiniband/hw/hfi1/mmu_rb.c
+++ b/drivers/infiniband/hw/hfi1/mmu_rb.c
@@ -68,8 +68,7 @@ struct mmu_rb_handler {
static unsigned long mmu_node_start(struct mmu_rb_node *);
static unsigned long mmu_node_last(struct mmu_rb_node *);
static int mmu_notifier_range_start(struct mmu_notifier *,
- struct mm_struct *,
- unsigned long, unsigned long, bool);
+ const struct mmu_notifier_range *);
static struct mmu_rb_node *__mmu_rb_search(struct mmu_rb_handler *,
unsigned long, unsigned long);
static void do_remove(struct mmu_rb_handler *handler,
@@ -284,10 +283,7 @@ void hfi1_mmu_rb_remove(struct mmu_rb_handler *handler,
}
static int mmu_notifier_range_start(struct mmu_notifier *mn,
- struct mm_struct *mm,
- unsigned long start,
- unsigned long end,
- bool blockable)
+ const struct mmu_notifier_range *range)
{
struct mmu_rb_handler *handler =
container_of(mn, struct mmu_rb_handler, mn);
@@ -297,10 +293,11 @@ static int mmu_notifier_range_start(struct mmu_notifier *mn,
bool added = false;
spin_lock_irqsave(&handler->lock, flags);
- for (node = __mmu_int_rb_iter_first(root, start, end - 1);
+ for (node = __mmu_int_rb_iter_first(root, range->start, range->end-1);
node; node = ptr) {
/* Guard against node removal. */
- ptr = __mmu_int_rb_iter_next(node, start, end - 1);
+ ptr = __mmu_int_rb_iter_next(node, range->start,
+ range->end - 1);
trace_hfi1_mmu_mem_invalidate(node->addr, node->len);
if (handler->ops->invalidate(handler->ops_arg, node)) {
__mmu_int_rb_remove(node, root);
diff --git a/drivers/infiniband/hw/hfi1/opfn.c b/drivers/infiniband/hw/hfi1/opfn.c
new file mode 100644
index 000000000000..370a5a8eaa71
--- /dev/null
+++ b/drivers/infiniband/hw/hfi1/opfn.c
@@ -0,0 +1,323 @@
+// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause)
+/*
+ * Copyright(c) 2018 Intel Corporation.
+ *
+ */
+#include "hfi.h"
+#include "trace.h"
+#include "qp.h"
+#include "opfn.h"
+
+#define IB_BTHE_E BIT(IB_BTHE_E_SHIFT)
+
+#define OPFN_CODE(code) BIT((code) - 1)
+#define OPFN_MASK(code) OPFN_CODE(STL_VERBS_EXTD_##code)
+
+struct hfi1_opfn_type {
+ bool (*request)(struct rvt_qp *qp, u64 *data);
+ bool (*response)(struct rvt_qp *qp, u64 *data);
+ bool (*reply)(struct rvt_qp *qp, u64 data);
+ void (*error)(struct rvt_qp *qp);
+};
+
+static struct hfi1_opfn_type hfi1_opfn_handlers[STL_VERBS_EXTD_MAX] = {
+ [STL_VERBS_EXTD_TID_RDMA] = {
+ .request = tid_rdma_conn_req,
+ .response = tid_rdma_conn_resp,
+ .reply = tid_rdma_conn_reply,
+ .error = tid_rdma_conn_error,
+ },
+};
+
+static struct workqueue_struct *opfn_wq;
+
+static void opfn_schedule_conn_request(struct rvt_qp *qp);
+
+static bool hfi1_opfn_extended(u32 bth1)
+{
+ return !!(bth1 & IB_BTHE_E);
+}
+
+static void opfn_conn_request(struct rvt_qp *qp)
+{
+ struct hfi1_qp_priv *priv = qp->priv;
+ struct ib_atomic_wr wr;
+ u16 mask, capcode;
+ struct hfi1_opfn_type *extd;
+ u64 data;
+ unsigned long flags;
+ int ret = 0;
+
+ trace_hfi1_opfn_state_conn_request(qp);
+ spin_lock_irqsave(&priv->opfn.lock, flags);
+ /*
+ * Exit if the extended bit is not set, or if nothing is requested, or
+ * if we have completed all requests, or if a previous request is in
+ * progress
+ */
+ if (!priv->opfn.extended || !priv->opfn.requested ||
+ priv->opfn.requested == priv->opfn.completed || priv->opfn.curr)
+ goto done;
+
+ mask = priv->opfn.requested & ~priv->opfn.completed;
+ capcode = ilog2(mask & ~(mask - 1)) + 1;
+ if (capcode >= STL_VERBS_EXTD_MAX) {
+ priv->opfn.completed |= OPFN_CODE(capcode);
+ goto done;
+ }
+
+ extd = &hfi1_opfn_handlers[capcode];
+ if (!extd || !extd->request || !extd->request(qp, &data)) {
+ /*
+ * Either there is no handler for this capability or the request
+ * packet could not be generated. Either way, mark it as done so
+ * we don't keep attempting to complete it.
+ */
+ priv->opfn.completed |= OPFN_CODE(capcode);
+ goto done;
+ }
+
+ trace_hfi1_opfn_data_conn_request(qp, capcode, data);
+ data = (data & ~0xf) | capcode;
+
+ memset(&wr, 0, sizeof(wr));
+ wr.wr.opcode = IB_WR_OPFN;
+ wr.remote_addr = HFI1_VERBS_E_ATOMIC_VADDR;
+ wr.compare_add = data;
+
+ priv->opfn.curr = capcode; /* A new request is now in progress */
+ /* Drop opfn.lock before calling ib_post_send() */
+ spin_unlock_irqrestore(&priv->opfn.lock, flags);
+
+ ret = ib_post_send(&qp->ibqp, &wr.wr, NULL);
+ if (ret)
+ goto err;
+ trace_hfi1_opfn_state_conn_request(qp);
+ return;
+err:
+ trace_hfi1_msg_opfn_conn_request(qp, "ib_ost_send failed: ret = ",
+ (u64)ret);
+ spin_lock_irqsave(&priv->opfn.lock, flags);
+ /*
+ * In case of an unexpected error return from ib_post_send
+ * clear opfn.curr and reschedule to try again
+ */
+ priv->opfn.curr = STL_VERBS_EXTD_NONE;
+ opfn_schedule_conn_request(qp);
+done:
+ spin_unlock_irqrestore(&priv->opfn.lock, flags);
+}
+
+void opfn_send_conn_request(struct work_struct *work)
+{
+ struct hfi1_opfn_data *od;
+ struct hfi1_qp_priv *qpriv;
+
+ od = container_of(work, struct hfi1_opfn_data, opfn_work);
+ qpriv = container_of(od, struct hfi1_qp_priv, opfn);
+
+ opfn_conn_request(qpriv->owner);
+}
+
+/*
+ * When QP s_lock is held in the caller, the OPFN request must be scheduled
+ * to a different workqueue to avoid double locking QP s_lock in call to
+ * ib_post_send in opfn_conn_request
+ */
+static void opfn_schedule_conn_request(struct rvt_qp *qp)
+{
+ struct hfi1_qp_priv *priv = qp->priv;
+
+ trace_hfi1_opfn_state_sched_conn_request(qp);
+ queue_work(opfn_wq, &priv->opfn.opfn_work);
+}
+
+void opfn_conn_response(struct rvt_qp *qp, struct rvt_ack_entry *e,
+ struct ib_atomic_eth *ateth)
+{
+ struct hfi1_qp_priv *priv = qp->priv;
+ u64 data = be64_to_cpu(ateth->compare_data);
+ struct hfi1_opfn_type *extd;
+ u8 capcode;
+ unsigned long flags;
+
+ trace_hfi1_opfn_state_conn_response(qp);
+ capcode = data & 0xf;
+ trace_hfi1_opfn_data_conn_response(qp, capcode, data);
+ if (!capcode || capcode >= STL_VERBS_EXTD_MAX)
+ return;
+
+ extd = &hfi1_opfn_handlers[capcode];
+
+ if (!extd || !extd->response) {
+ e->atomic_data = capcode;
+ return;
+ }
+
+ spin_lock_irqsave(&priv->opfn.lock, flags);
+ if (priv->opfn.completed & OPFN_CODE(capcode)) {
+ /*
+ * We are receiving a request for a feature that has already
+ * been negotiated. This may mean that the other side has reset
+ */
+ priv->opfn.completed &= ~OPFN_CODE(capcode);
+ if (extd->error)
+ extd->error(qp);
+ }
+
+ if (extd->response(qp, &data))
+ priv->opfn.completed |= OPFN_CODE(capcode);
+ e->atomic_data = (data & ~0xf) | capcode;
+ trace_hfi1_opfn_state_conn_response(qp);
+ spin_unlock_irqrestore(&priv->opfn.lock, flags);
+}
+
+void opfn_conn_reply(struct rvt_qp *qp, u64 data)
+{
+ struct hfi1_qp_priv *priv = qp->priv;
+ struct hfi1_opfn_type *extd;
+ u8 capcode;
+ unsigned long flags;
+
+ trace_hfi1_opfn_state_conn_reply(qp);
+ capcode = data & 0xf;
+ trace_hfi1_opfn_data_conn_reply(qp, capcode, data);
+ if (!capcode || capcode >= STL_VERBS_EXTD_MAX)
+ return;
+
+ spin_lock_irqsave(&priv->opfn.lock, flags);
+ /*
+ * Either there is no previous request or the reply is not for the
+ * current request
+ */
+ if (!priv->opfn.curr || capcode != priv->opfn.curr)
+ goto done;
+
+ extd = &hfi1_opfn_handlers[capcode];
+
+ if (!extd || !extd->reply)
+ goto clear;
+
+ if (extd->reply(qp, data))
+ priv->opfn.completed |= OPFN_CODE(capcode);
+clear:
+ /*
+ * Clear opfn.curr to indicate that the previous request is no longer in
+ * progress
+ */
+ priv->opfn.curr = STL_VERBS_EXTD_NONE;
+ trace_hfi1_opfn_state_conn_reply(qp);
+done:
+ spin_unlock_irqrestore(&priv->opfn.lock, flags);
+}
+
+void opfn_conn_error(struct rvt_qp *qp)
+{
+ struct hfi1_qp_priv *priv = qp->priv;
+ struct hfi1_opfn_type *extd = NULL;
+ unsigned long flags;
+ u16 capcode;
+
+ trace_hfi1_opfn_state_conn_error(qp);
+ trace_hfi1_msg_opfn_conn_error(qp, "error. qp state ", (u64)qp->state);
+ /*
+ * The QP has gone into the Error state. We have to invalidate all
+ * negotiated features, including the one in progress (if any). The RC
+ * QP handling will clean the WQE for the connection request.
+ */
+ spin_lock_irqsave(&priv->opfn.lock, flags);
+ while (priv->opfn.completed) {
+ capcode = priv->opfn.completed & ~(priv->opfn.completed - 1);
+ extd = &hfi1_opfn_handlers[ilog2(capcode) + 1];
+ if (extd->error)
+ extd->error(qp);
+ priv->opfn.completed &= ~OPFN_CODE(capcode);
+ }
+ priv->opfn.extended = 0;
+ priv->opfn.requested = 0;
+ priv->opfn.curr = STL_VERBS_EXTD_NONE;
+ spin_unlock_irqrestore(&priv->opfn.lock, flags);
+}
+
+void opfn_qp_init(struct rvt_qp *qp, struct ib_qp_attr *attr, int attr_mask)
+{
+ struct ib_qp *ibqp = &qp->ibqp;
+ struct hfi1_qp_priv *priv = qp->priv;
+ unsigned long flags;
+
+ if (attr_mask & IB_QP_RETRY_CNT)
+ priv->s_retry = attr->retry_cnt;
+
+ spin_lock_irqsave(&priv->opfn.lock, flags);
+ if (ibqp->qp_type == IB_QPT_RC && HFI1_CAP_IS_KSET(TID_RDMA)) {
+ struct tid_rdma_params *local = &priv->tid_rdma.local;
+
+ if (attr_mask & IB_QP_TIMEOUT)
+ priv->tid_retry_timeout_jiffies = qp->timeout_jiffies;
+ if (qp->pmtu == enum_to_mtu(OPA_MTU_4096) ||
+ qp->pmtu == enum_to_mtu(OPA_MTU_8192)) {
+ tid_rdma_opfn_init(qp, local);
+ /*
+ * We only want to set the OPFN requested bit when the
+ * QP transitions to RTS.
+ */
+ if (attr_mask & IB_QP_STATE &&
+ attr->qp_state == IB_QPS_RTS) {
+ priv->opfn.requested |= OPFN_MASK(TID_RDMA);
+ /*
+ * If the QP is transitioning to RTS and the
+ * opfn.completed for TID RDMA has already been
+ * set, the QP is being moved *back* into RTS.
+ * We can now renegotiate the TID RDMA
+ * parameters.
+ */
+ if (priv->opfn.completed &
+ OPFN_MASK(TID_RDMA)) {
+ priv->opfn.completed &=
+ ~OPFN_MASK(TID_RDMA);
+ /*
+ * Since the opfn.completed bit was
+ * already set, it is safe to assume
+ * that the opfn.extended is also set.
+ */
+ opfn_schedule_conn_request(qp);
+ }
+ }
+ } else {
+ memset(local, 0, sizeof(*local));
+ }
+ }
+ spin_unlock_irqrestore(&priv->opfn.lock, flags);
+}
+
+void opfn_trigger_conn_request(struct rvt_qp *qp, u32 bth1)
+{
+ struct hfi1_qp_priv *priv = qp->priv;
+
+ if (!priv->opfn.extended && hfi1_opfn_extended(bth1) &&
+ HFI1_CAP_IS_KSET(OPFN)) {
+ priv->opfn.extended = 1;
+ if (qp->state == IB_QPS_RTS)
+ opfn_conn_request(qp);
+ }
+}
+
+int opfn_init(void)
+{
+ opfn_wq = alloc_workqueue("hfi_opfn",
+ WQ_SYSFS | WQ_HIGHPRI | WQ_CPU_INTENSIVE |
+ WQ_MEM_RECLAIM,
+ HFI1_MAX_ACTIVE_WORKQUEUE_ENTRIES);
+ if (!opfn_wq)
+ return -ENOMEM;
+
+ return 0;
+}
+
+void opfn_exit(void)
+{
+ if (opfn_wq) {
+ destroy_workqueue(opfn_wq);
+ opfn_wq = NULL;
+ }
+}
diff --git a/drivers/infiniband/hw/hfi1/opfn.h b/drivers/infiniband/hw/hfi1/opfn.h
new file mode 100644
index 000000000000..5f2011cabc25
--- /dev/null
+++ b/drivers/infiniband/hw/hfi1/opfn.h
@@ -0,0 +1,85 @@
+/* SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) */
+/*
+ * Copyright(c) 2018 Intel Corporation.
+ *
+ */
+#ifndef _HFI1_OPFN_H
+#define _HFI1_OPFN_H
+
+/**
+ * DOC: Omni Path Feature Negotiation (OPFN)
+ *
+ * OPFN is a discovery protocol for Intel Omni-Path fabric that
+ * allows two RC QPs to negotiate a common feature that both QPs
+ * can support. Currently, the only OPA feature that OPFN
+ * supports is TID RDMA.
+ *
+ * Architecture
+ *
+ * OPFN involves the communication between two QPs on the HFI
+ * level on an Omni-Path fabric, and ULPs have no knowledge of
+ * OPFN at all.
+ *
+ * Implementation
+ *
+ * OPFN extends the existing IB RC protocol with the following
+ * changes:
+ * -- Uses Bit 24 (reserved) of DWORD 1 of Base Transport
+ * Header (BTH1) to indicate that the RC QP supports OPFN;
+ * -- Uses a combination of RC COMPARE_SWAP opcode (0x13) and
+ * the address U64_MAX (0xFFFFFFFFFFFFFFFF) as an OPFN
+ * request; The 64-bit data carried with the request/response
+ * contains the parameters for negotiation and will be
+ * defined in tid_rdma.c file;
+ * -- Defines IB_WR_RESERVED3 as IB_WR_OPFN.
+ *
+ * The OPFN communication will be triggered when an RC QP
+ * receives a request with Bit 24 of BTH1 set. The responder QP
+ * will then post send an OPFN request with its local
+ * parameters, which will be sent to the requester QP once all
+ * existing requests on the responder QP side have been sent.
+ * Once the requester QP receives the OPFN request, it will
+ * keep a copy of the responder QP's parameters, and return a
+ * response packet with its own local parameters. The responder
+ * QP receives the response packet and keeps a copy of the requester
+ * QP's parameters. After this exchange, each side has the parameters
+ * for both sides and therefore can select the right parameters
+ * for future transactions.
+ */
+
+/* STL Verbs Extended */
+#define IB_BTHE_E_SHIFT 24
+#define HFI1_VERBS_E_ATOMIC_VADDR U64_MAX
+
+struct ib_atomic_eth;
+
+enum hfi1_opfn_codes {
+ STL_VERBS_EXTD_NONE = 0,
+ STL_VERBS_EXTD_TID_RDMA,
+ STL_VERBS_EXTD_MAX
+};
+
+struct hfi1_opfn_data {
+ u8 extended;
+ u16 requested;
+ u16 completed;
+ enum hfi1_opfn_codes curr;
+ /* serialize opfn function calls */
+ spinlock_t lock;
+ struct work_struct opfn_work;
+};
+
+/* WR opcode for OPFN */
+#define IB_WR_OPFN IB_WR_RESERVED3
+
+void opfn_send_conn_request(struct work_struct *work);
+void opfn_conn_response(struct rvt_qp *qp, struct rvt_ack_entry *e,
+ struct ib_atomic_eth *ateth);
+void opfn_conn_reply(struct rvt_qp *qp, u64 data);
+void opfn_conn_error(struct rvt_qp *qp);
+void opfn_qp_init(struct rvt_qp *qp, struct ib_qp_attr *attr, int attr_mask);
+void opfn_trigger_conn_request(struct rvt_qp *qp, u32 bth1);
+int opfn_init(void);
+void opfn_exit(void);
+
+#endif /* _HFI1_OPFN_H */
diff --git a/drivers/infiniband/hw/hfi1/pio.c b/drivers/infiniband/hw/hfi1/pio.c
index 9ab50d2308dc..a1de566fe95e 100644
--- a/drivers/infiniband/hw/hfi1/pio.c
+++ b/drivers/infiniband/hw/hfi1/pio.c
@@ -742,6 +742,7 @@ struct send_context *sc_alloc(struct hfi1_devdata *dd, int type,
spin_lock_init(&sc->alloc_lock);
spin_lock_init(&sc->release_lock);
spin_lock_init(&sc->credit_ctrl_lock);
+ seqlock_init(&sc->waitlock);
INIT_LIST_HEAD(&sc->piowait);
INIT_WORK(&sc->halt_work, sc_halted);
init_waitqueue_head(&sc->halt_wait);
@@ -1593,14 +1594,12 @@ void hfi1_sc_wantpiobuf_intr(struct send_context *sc, u32 needint)
static void sc_piobufavail(struct send_context *sc)
{
struct hfi1_devdata *dd = sc->dd;
- struct hfi1_ibdev *dev = &dd->verbs_dev;
struct list_head *list;
struct rvt_qp *qps[PIO_WAIT_BATCH_SIZE];
struct rvt_qp *qp;
struct hfi1_qp_priv *priv;
unsigned long flags;
- uint i, n = 0, max_idx = 0;
- u8 max_starved_cnt = 0;
+ uint i, n = 0, top_idx = 0;
if (dd->send_contexts[sc->sw_index].type != SC_KERNEL &&
dd->send_contexts[sc->sw_index].type != SC_VL15)
@@ -1612,18 +1611,25 @@ static void sc_piobufavail(struct send_context *sc)
* could end up with QPs on the wait list with the interrupt
* disabled.
*/
- write_seqlock_irqsave(&dev->iowait_lock, flags);
+ write_seqlock_irqsave(&sc->waitlock, flags);
while (!list_empty(list)) {
struct iowait *wait;
if (n == ARRAY_SIZE(qps))
break;
wait = list_first_entry(list, struct iowait, list);
+ iowait_get_priority(wait);
qp = iowait_to_qp(wait);
priv = qp->priv;
list_del_init(&priv->s_iowait.list);
priv->s_iowait.lock = NULL;
- iowait_starve_find_max(wait, &max_starved_cnt, n, &max_idx);
+ if (n) {
+ priv = qps[top_idx]->priv;
+ top_idx = iowait_priority_update_top(wait,
+ &priv->s_iowait,
+ n, top_idx);
+ }
+
/* refcount held until actual wake up */
qps[n++] = qp;
}
@@ -1636,14 +1642,14 @@ static void sc_piobufavail(struct send_context *sc)
if (!list_empty(list))
hfi1_sc_wantpiobuf_intr(sc, 1);
}
- write_sequnlock_irqrestore(&dev->iowait_lock, flags);
+ write_sequnlock_irqrestore(&sc->waitlock, flags);
- /* Wake up the most starved one first */
+ /* Wake up the top-priority one first */
if (n)
- hfi1_qp_wakeup(qps[max_idx],
+ hfi1_qp_wakeup(qps[top_idx],
RVT_S_WAIT_PIO | HFI1_S_WAIT_PIO_DRAIN);
for (i = 0; i < n; i++)
- if (i != max_idx)
+ if (i != top_idx)
hfi1_qp_wakeup(qps[i],
RVT_S_WAIT_PIO | HFI1_S_WAIT_PIO_DRAIN);
}
@@ -2098,11 +2104,10 @@ int init_credit_return(struct hfi1_devdata *dd)
int bytes = TXE_NUM_CONTEXTS * sizeof(struct credit_return);
set_dev_node(&dd->pcidev->dev, i);
- dd->cr_base[i].va = dma_zalloc_coherent(
- &dd->pcidev->dev,
- bytes,
- &dd->cr_base[i].dma,
- GFP_KERNEL);
+ dd->cr_base[i].va = dma_alloc_coherent(&dd->pcidev->dev,
+ bytes,
+ &dd->cr_base[i].dma,
+ GFP_KERNEL);
if (!dd->cr_base[i].va) {
set_dev_node(&dd->pcidev->dev, dd->node);
dd_dev_err(dd,
@@ -2137,3 +2142,28 @@ void free_credit_return(struct hfi1_devdata *dd)
kfree(dd->cr_base);
dd->cr_base = NULL;
}
+
+void seqfile_dump_sci(struct seq_file *s, u32 i,
+ struct send_context_info *sci)
+{
+ struct send_context *sc = sci->sc;
+ u64 reg;
+
+ seq_printf(s, "SCI %u: type %u base %u credits %u\n",
+ i, sci->type, sci->base, sci->credits);
+ seq_printf(s, " flags 0x%x sw_inx %u hw_ctxt %u grp %u\n",
+ sc->flags, sc->sw_index, sc->hw_context, sc->group);
+ seq_printf(s, " sr_size %u credits %u sr_head %u sr_tail %u\n",
+ sc->sr_size, sc->credits, sc->sr_head, sc->sr_tail);
+ seq_printf(s, " fill %lu free %lu fill_wrap %u alloc_free %lu\n",
+ sc->fill, sc->free, sc->fill_wrap, sc->alloc_free);
+ seq_printf(s, " credit_intr_count %u credit_ctrl 0x%llx\n",
+ sc->credit_intr_count, sc->credit_ctrl);
+ reg = read_kctxt_csr(sc->dd, sc->hw_context, SC(CREDIT_STATUS));
+ seq_printf(s, " *hw_free %llu CurrentFree %llu LastReturned %llu\n",
+ (le64_to_cpu(*sc->hw_free) & CR_COUNTER_SMASK) >>
+ CR_COUNTER_SHIFT,
+ (reg >> SC(CREDIT_STATUS_CURRENT_FREE_COUNTER_SHIFT)) &
+ SC(CREDIT_STATUS_CURRENT_FREE_COUNTER_MASK),
+ reg & SC(CREDIT_STATUS_LAST_RETURNED_COUNTER_SMASK));
+}
diff --git a/drivers/infiniband/hw/hfi1/pio.h b/drivers/infiniband/hw/hfi1/pio.h
index aaf372c3e5d6..c9a58b642bdd 100644
--- a/drivers/infiniband/hw/hfi1/pio.h
+++ b/drivers/infiniband/hw/hfi1/pio.h
@@ -127,6 +127,8 @@ struct send_context {
volatile __le64 *hw_free; /* HW free counter */
/* list for PIO waiters */
struct list_head piowait ____cacheline_aligned_in_smp;
+ seqlock_t waitlock;
+
spinlock_t credit_ctrl_lock ____cacheline_aligned_in_smp;
u32 credit_intr_count; /* count of credit intr users */
u64 credit_ctrl; /* cache for credit control */
@@ -329,4 +331,7 @@ void seg_pio_copy_start(struct pio_buf *pbuf, u64 pbc,
void seg_pio_copy_mid(struct pio_buf *pbuf, const void *from, size_t nbytes);
void seg_pio_copy_end(struct pio_buf *pbuf);
+void seqfile_dump_sci(struct seq_file *s, u32 i,
+ struct send_context_info *sci);
+
#endif /* _PIO_H */
diff --git a/drivers/infiniband/hw/hfi1/qp.c b/drivers/infiniband/hw/hfi1/qp.c
index 1a016248039f..eba300330a02 100644
--- a/drivers/infiniband/hw/hfi1/qp.c
+++ b/drivers/infiniband/hw/hfi1/qp.c
@@ -132,6 +132,18 @@ const struct rvt_operation_params hfi1_post_parms[RVT_OPERATION_MAX] = {
.qpt_support = BIT(IB_QPT_RC),
},
+[IB_WR_OPFN] = {
+ .length = sizeof(struct ib_atomic_wr),
+ .qpt_support = BIT(IB_QPT_RC),
+ .flags = RVT_OPERATION_USE_RESERVE,
+},
+
+[IB_WR_TID_RDMA_WRITE] = {
+ .length = sizeof(struct ib_rdma_wr),
+ .qpt_support = BIT(IB_QPT_RC),
+ .flags = RVT_OPERATION_IGN_RNR_CNT,
+},
+
};
static void flush_list_head(struct list_head *l)
@@ -285,6 +297,8 @@ void hfi1_modify_qp(struct rvt_qp *qp, struct ib_qp_attr *attr,
priv->s_sendcontext = qp_to_send_context(qp, priv->s_sc);
qp_set_16b(qp);
}
+
+ opfn_qp_init(qp, attr, attr_mask);
}
/**
@@ -311,6 +325,8 @@ int hfi1_setup_wqe(struct rvt_qp *qp, struct rvt_swqe *wqe, bool *call_send)
switch (qp->ibqp.qp_type) {
case IB_QPT_RC:
+ hfi1_setup_tid_rdma_wqe(qp, wqe);
+ /* fall through */
case IB_QPT_UC:
if (wqe->length > 0x80000000U)
return -EINVAL;
@@ -375,20 +391,18 @@ bool _hfi1_schedule_send(struct rvt_qp *qp)
static void qp_pio_drain(struct rvt_qp *qp)
{
- struct hfi1_ibdev *dev;
struct hfi1_qp_priv *priv = qp->priv;
if (!priv->s_sendcontext)
return;
- dev = to_idev(qp->ibqp.device);
while (iowait_pio_pending(&priv->s_iowait)) {
- write_seqlock_irq(&dev->iowait_lock);
+ write_seqlock_irq(&priv->s_sendcontext->waitlock);
hfi1_sc_wantpiobuf_intr(priv->s_sendcontext, 1);
- write_sequnlock_irq(&dev->iowait_lock);
+ write_sequnlock_irq(&priv->s_sendcontext->waitlock);
iowait_pio_drain(&priv->s_iowait);
- write_seqlock_irq(&dev->iowait_lock);
+ write_seqlock_irq(&priv->s_sendcontext->waitlock);
hfi1_sc_wantpiobuf_intr(priv->s_sendcontext, 0);
- write_sequnlock_irq(&dev->iowait_lock);
+ write_sequnlock_irq(&priv->s_sendcontext->waitlock);
}
}
@@ -424,6 +438,11 @@ static void hfi1_qp_schedule(struct rvt_qp *qp)
if (ret)
iowait_clear_flag(&priv->s_iowait, IOWAIT_PENDING_IB);
}
+ if (iowait_flag_set(&priv->s_iowait, IOWAIT_PENDING_TID)) {
+ ret = hfi1_schedule_tid_send(qp);
+ if (ret)
+ iowait_clear_flag(&priv->s_iowait, IOWAIT_PENDING_TID);
+ }
}
void hfi1_qp_wakeup(struct rvt_qp *qp, u32 flag)
@@ -443,8 +462,27 @@ void hfi1_qp_wakeup(struct rvt_qp *qp, u32 flag)
void hfi1_qp_unbusy(struct rvt_qp *qp, struct iowait_work *wait)
{
- if (iowait_set_work_flag(wait) == IOWAIT_IB_SE)
+ struct hfi1_qp_priv *priv = qp->priv;
+
+ if (iowait_set_work_flag(wait) == IOWAIT_IB_SE) {
qp->s_flags &= ~RVT_S_BUSY;
+ /*
+ * If we are sending a first-leg packet from the second leg,
+ * we need to clear the busy flag from priv->s_flags to
+ * avoid a race condition when the qp wakes up before
+ * the call to hfi1_verbs_send() returns to the second
+ * leg. In that case, the second leg will terminate without
+ * being re-scheduled, resulting in failure to send TID RDMA
+ * WRITE DATA and TID RDMA ACK packets.
+ */
+ if (priv->s_flags & HFI1_S_TID_BUSY_SET) {
+ priv->s_flags &= ~(HFI1_S_TID_BUSY_SET |
+ RVT_S_BUSY);
+ iowait_set_flag(&priv->s_iowait, IOWAIT_PENDING_TID);
+ }
+ } else {
+ priv->s_flags &= ~RVT_S_BUSY;
+ }
}
static int iowait_sleep(
@@ -459,7 +497,6 @@ static int iowait_sleep(
struct hfi1_qp_priv *priv;
unsigned long flags;
int ret = 0;
- struct hfi1_ibdev *dev;
qp = tx->qp;
priv = qp->priv;
@@ -472,9 +509,8 @@ static int iowait_sleep(
* buffer and undoing the side effects of the copy.
*/
/* Make a common routine? */
- dev = &sde->dd->verbs_dev;
list_add_tail(&stx->list, &wait->tx_head);
- write_seqlock(&dev->iowait_lock);
+ write_seqlock(&sde->waitlock);
if (sdma_progress(sde, seq, stx))
goto eagain;
if (list_empty(&priv->s_iowait.list)) {
@@ -483,13 +519,14 @@ static int iowait_sleep(
ibp->rvp.n_dmawait++;
qp->s_flags |= RVT_S_WAIT_DMA_DESC;
+ iowait_get_priority(&priv->s_iowait);
iowait_queue(pkts_sent, &priv->s_iowait,
&sde->dmawait);
- priv->s_iowait.lock = &dev->iowait_lock;
+ priv->s_iowait.lock = &sde->waitlock;
trace_hfi1_qpsleep(qp, RVT_S_WAIT_DMA_DESC);
rvt_get_qp(qp);
}
- write_sequnlock(&dev->iowait_lock);
+ write_sequnlock(&sde->waitlock);
hfi1_qp_unbusy(qp, wait);
spin_unlock_irqrestore(&qp->s_lock, flags);
ret = -EBUSY;
@@ -499,7 +536,7 @@ static int iowait_sleep(
}
return ret;
eagain:
- write_sequnlock(&dev->iowait_lock);
+ write_sequnlock(&sde->waitlock);
spin_unlock_irqrestore(&qp->s_lock, flags);
list_del_init(&stx->list);
return -EAGAIN;
@@ -532,6 +569,17 @@ static void iowait_sdma_drained(struct iowait *wait)
spin_unlock_irqrestore(&qp->s_lock, flags);
}
+static void hfi1_init_priority(struct iowait *w)
+{
+ struct rvt_qp *qp = iowait_to_qp(w);
+ struct hfi1_qp_priv *priv = qp->priv;
+
+ if (qp->s_flags & RVT_S_ACK_PENDING)
+ w->priority++;
+ if (priv->s_flags & RVT_S_ACK_PENDING)
+ w->priority++;
+}
+
/**
* qp_to_sdma_engine - map a qp to a send engine
* @qp: the QP
@@ -689,10 +737,11 @@ void *qp_priv_alloc(struct rvt_dev_info *rdi, struct rvt_qp *qp)
&priv->s_iowait,
1,
_hfi1_do_send,
- NULL,
+ _hfi1_do_tid_send,
iowait_sleep,
iowait_wakeup,
- iowait_sdma_drained);
+ iowait_sdma_drained,
+ hfi1_init_priority);
return priv;
}
@@ -700,6 +749,7 @@ void qp_priv_free(struct rvt_dev_info *rdi, struct rvt_qp *qp)
{
struct hfi1_qp_priv *priv = qp->priv;
+ hfi1_qp_priv_tid_free(rdi, qp);
kfree(priv->s_ahg);
kfree(priv);
}
@@ -733,6 +783,7 @@ void flush_qp_waiters(struct rvt_qp *qp)
{
lockdep_assert_held(&qp->s_lock);
flush_iowait(qp);
+ hfi1_tid_rdma_flush_wait(qp);
}
void stop_send_queue(struct rvt_qp *qp)
@@ -740,12 +791,16 @@ void stop_send_queue(struct rvt_qp *qp)
struct hfi1_qp_priv *priv = qp->priv;
iowait_cancel_work(&priv->s_iowait);
+ if (cancel_work_sync(&priv->tid_rdma.trigger_work))
+ rvt_put_qp(qp);
}
void quiesce_qp(struct rvt_qp *qp)
{
struct hfi1_qp_priv *priv = qp->priv;
+ hfi1_del_tid_reap_timer(qp);
+ hfi1_del_tid_retry_timer(qp);
iowait_sdma_drain(&priv->s_iowait);
qp_pio_drain(qp);
flush_tx_list(qp);
@@ -753,8 +808,13 @@ void quiesce_qp(struct rvt_qp *qp)
void notify_qp_reset(struct rvt_qp *qp)
{
+ hfi1_qp_kern_exp_rcv_clear_all(qp);
qp->r_adefered = 0;
clear_ahg(qp);
+
+ /* Clear any OPFN state */
+ if (qp->ibqp.qp_type == IB_QPT_RC)
+ opfn_conn_error(qp);
}
/*
@@ -836,8 +896,11 @@ void notify_error_qp(struct rvt_qp *qp)
if (lock) {
write_seqlock(lock);
if (!list_empty(&priv->s_iowait.list) &&
- !(qp->s_flags & RVT_S_BUSY)) {
- qp->s_flags &= ~RVT_S_ANY_WAIT_IO;
+ !(qp->s_flags & RVT_S_BUSY) &&
+ !(priv->s_flags & RVT_S_BUSY)) {
+ qp->s_flags &= ~HFI1_S_ANY_WAIT_IO;
+ iowait_clear_flag(&priv->s_iowait, IOWAIT_PENDING_IB);
+ iowait_clear_flag(&priv->s_iowait, IOWAIT_PENDING_TID);
list_del_init(&priv->s_iowait.list);
priv->s_iowait.lock = NULL;
rvt_put_qp(qp);
@@ -845,7 +908,8 @@ void notify_error_qp(struct rvt_qp *qp)
write_sequnlock(lock);
}
- if (!(qp->s_flags & RVT_S_BUSY)) {
+ if (!(qp->s_flags & RVT_S_BUSY) && !(priv->s_flags & RVT_S_BUSY)) {
+ qp->s_hdrwords = 0;
if (qp->s_rdma_mr) {
rvt_put_mr(qp->s_rdma_mr);
qp->s_rdma_mr = NULL;
diff --git a/drivers/infiniband/hw/hfi1/qp.h b/drivers/infiniband/hw/hfi1/qp.h
index 7adb6dff6813..b670321365d3 100644
--- a/drivers/infiniband/hw/hfi1/qp.h
+++ b/drivers/infiniband/hw/hfi1/qp.h
@@ -63,11 +63,17 @@ extern const struct rvt_operation_params hfi1_post_parms[];
* HFI1_S_AHG_VALID - ahg header valid on chip
* HFI1_S_AHG_CLEAR - have send engine clear ahg state
* HFI1_S_WAIT_PIO_DRAIN - qp waiting for PIOs to drain
+ * HFI1_S_WAIT_TID_SPACE - a QP is waiting for TID resource
+ * HFI1_S_WAIT_TID_RESP - waiting for a TID RDMA WRITE response
+ * HFI1_S_WAIT_HALT - halt the first leg send engine
* HFI1_S_MIN_BIT_MASK - the lowest bit that can be used by hfi1
*/
#define HFI1_S_AHG_VALID 0x80000000
#define HFI1_S_AHG_CLEAR 0x40000000
#define HFI1_S_WAIT_PIO_DRAIN 0x20000000
+#define HFI1_S_WAIT_TID_SPACE 0x10000000
+#define HFI1_S_WAIT_TID_RESP 0x08000000
+#define HFI1_S_WAIT_HALT 0x04000000
#define HFI1_S_MIN_BIT_MASK 0x01000000
/*
@@ -76,6 +82,7 @@ extern const struct rvt_operation_params hfi1_post_parms[];
#define HFI1_S_ANY_WAIT_IO (RVT_S_ANY_WAIT_IO | HFI1_S_WAIT_PIO_DRAIN)
#define HFI1_S_ANY_WAIT (HFI1_S_ANY_WAIT_IO | RVT_S_ANY_WAIT_SEND)
+#define HFI1_S_ANY_TID_WAIT_SEND (RVT_S_WAIT_SSN_CREDIT | RVT_S_WAIT_DMA)
/*
* Send if not busy or waiting for I/O and either
diff --git a/drivers/infiniband/hw/hfi1/rc.c b/drivers/infiniband/hw/hfi1/rc.c
index 188aa4f686a0..5991211d72bd 100644
--- a/drivers/infiniband/hw/hfi1/rc.c
+++ b/drivers/infiniband/hw/hfi1/rc.c
@@ -51,24 +51,48 @@
#include "hfi.h"
#include "qp.h"
+#include "rc.h"
#include "verbs_txreq.h"
#include "trace.h"
-/* cut down ridiculously long IB macro names */
-#define OP(x) RC_OP(x)
-
-static u32 restart_sge(struct rvt_sge_state *ss, struct rvt_swqe *wqe,
- u32 psn, u32 pmtu)
+struct rvt_ack_entry *find_prev_entry(struct rvt_qp *qp, u32 psn, u8 *prev,
+ u8 *prev_ack, bool *scheduled)
+ __must_hold(&qp->s_lock)
{
- u32 len;
-
- len = delta_psn(psn, wqe->psn) * pmtu;
- ss->sge = wqe->sg_list[0];
- ss->sg_list = wqe->sg_list + 1;
- ss->num_sge = wqe->wr.num_sge;
- ss->total_len = wqe->length;
- rvt_skip_sge(ss, len, false);
- return wqe->length - len;
+ struct rvt_ack_entry *e = NULL;
+ u8 i, p;
+ bool s = true;
+
+ for (i = qp->r_head_ack_queue; ; i = p) {
+ if (i == qp->s_tail_ack_queue)
+ s = false;
+ if (i)
+ p = i - 1;
+ else
+ p = rvt_size_atomic(ib_to_rvt(qp->ibqp.device));
+ if (p == qp->r_head_ack_queue) {
+ e = NULL;
+ break;
+ }
+ e = &qp->s_ack_queue[p];
+ if (!e->opcode) {
+ e = NULL;
+ break;
+ }
+ if (cmp_psn(psn, e->psn) >= 0) {
+ if (p == qp->s_tail_ack_queue &&
+ cmp_psn(psn, e->lpsn) <= 0)
+ s = false;
+ break;
+ }
+ }
+ if (prev)
+ *prev = p;
+ if (prev_ack)
+ *prev_ack = i;
+ if (scheduled)
+ *scheduled = s;
+ return e;
}
/**
@@ -87,20 +111,25 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp,
struct hfi1_pkt_state *ps)
{
struct rvt_ack_entry *e;
- u32 hwords;
- u32 len;
- u32 bth0;
- u32 bth2;
+ u32 hwords, hdrlen;
+ u32 len = 0;
+ u32 bth0 = 0, bth2 = 0;
+ u32 bth1 = qp->remote_qpn | (HFI1_CAP_IS_KSET(OPFN) << IB_BTHE_E_SHIFT);
int middle = 0;
u32 pmtu = qp->pmtu;
- struct hfi1_qp_priv *priv = qp->priv;
+ struct hfi1_qp_priv *qpriv = qp->priv;
+ bool last_pkt;
+ u32 delta;
+ u8 next = qp->s_tail_ack_queue;
+ struct tid_rdma_request *req;
+ trace_hfi1_rsp_make_rc_ack(qp, 0);
lockdep_assert_held(&qp->s_lock);
/* Don't send an ACK if we aren't supposed to. */
if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK))
goto bail;
- if (priv->hdr_type == HFI1_PKT_TYPE_9B)
+ if (qpriv->hdr_type == HFI1_PKT_TYPE_9B)
/* header size in 32-bit words LRH+BTH = (8+12)/4. */
hwords = 5;
else
@@ -122,8 +151,18 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp,
* response has been sent instead of only being
* constructed.
*/
- if (++qp->s_tail_ack_queue > HFI1_MAX_RDMA_ATOMIC)
- qp->s_tail_ack_queue = 0;
+ if (++next > rvt_size_atomic(&dev->rdi))
+ next = 0;
+ /*
+ * Only advance the s_acked_ack_queue pointer if there
+ * have been no TID RDMA requests.
+ */
+ e = &qp->s_ack_queue[qp->s_tail_ack_queue];
+ if (e->opcode != TID_OP(WRITE_REQ) &&
+ qp->s_acked_ack_queue == qp->s_tail_ack_queue)
+ qp->s_acked_ack_queue = next;
+ qp->s_tail_ack_queue = next;
+ trace_hfi1_rsp_make_rc_ack(qp, e->psn);
/* FALLTHROUGH */
case OP(SEND_ONLY):
case OP(ACKNOWLEDGE):
@@ -135,6 +174,12 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp,
}
e = &qp->s_ack_queue[qp->s_tail_ack_queue];
+ /* Check for tid write fence */
+ if ((qpriv->s_flags & HFI1_R_TID_WAIT_INTERLCK) ||
+ hfi1_tid_rdma_ack_interlock(qp, e)) {
+ iowait_set_flag(&qpriv->s_iowait, IOWAIT_PENDING_IB);
+ goto bail;
+ }
if (e->opcode == OP(RDMA_READ_REQUEST)) {
/*
* If a RDMA read response is being resent and
@@ -144,6 +189,10 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp,
*/
len = e->rdma_sge.sge_length;
if (len && !e->rdma_sge.mr) {
+ if (qp->s_acked_ack_queue ==
+ qp->s_tail_ack_queue)
+ qp->s_acked_ack_queue =
+ qp->r_head_ack_queue;
qp->s_tail_ack_queue = qp->r_head_ack_queue;
goto bail;
}
@@ -165,6 +214,45 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp,
hwords++;
qp->s_ack_rdma_psn = e->psn;
bth2 = mask_psn(qp->s_ack_rdma_psn++);
+ } else if (e->opcode == TID_OP(WRITE_REQ)) {
+ /*
+ * If a TID RDMA WRITE RESP is being resent, we have to
+ * wait for the actual request. All requests that are to
+ * be resent will have their state set to
+ * TID_REQUEST_RESEND. When the new request arrives, the
+ * state will be changed to TID_REQUEST_RESEND_ACTIVE.
+ */
+ req = ack_to_tid_req(e);
+ if (req->state == TID_REQUEST_RESEND ||
+ req->state == TID_REQUEST_INIT_RESEND)
+ goto bail;
+ qp->s_ack_state = TID_OP(WRITE_RESP);
+ qp->s_ack_rdma_psn = mask_psn(e->psn + req->cur_seg);
+ goto write_resp;
+ } else if (e->opcode == TID_OP(READ_REQ)) {
+ /*
+ * If a TID RDMA read response is being resent and
+ * we haven't seen the duplicate request yet,
+ * then stop sending the remaining responses the
+ * responder has seen until the requester re-sends it.
+ */
+ len = e->rdma_sge.sge_length;
+ if (len && !e->rdma_sge.mr) {
+ if (qp->s_acked_ack_queue ==
+ qp->s_tail_ack_queue)
+ qp->s_acked_ack_queue =
+ qp->r_head_ack_queue;
+ qp->s_tail_ack_queue = qp->r_head_ack_queue;
+ goto bail;
+ }
+ /* Copy SGE state in case we need to resend */
+ ps->s_txreq->mr = e->rdma_sge.mr;
+ if (ps->s_txreq->mr)
+ rvt_get_mr(ps->s_txreq->mr);
+ qp->s_ack_rdma_sge.sge = e->rdma_sge;
+ qp->s_ack_rdma_sge.num_sge = 1;
+ qp->s_ack_state = TID_OP(READ_RESP);
+ goto read_resp;
} else {
/* COMPARE_SWAP or FETCH_ADD */
ps->s_txreq->ss = NULL;
@@ -176,6 +264,7 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp,
bth2 = mask_psn(e->psn);
e->sent = 1;
}
+ trace_hfi1_tid_write_rsp_make_rc_ack(qp);
bth0 = qp->s_ack_state << 24;
break;
@@ -202,6 +291,83 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp,
bth2 = mask_psn(qp->s_ack_rdma_psn++);
break;
+ case TID_OP(WRITE_RESP):
+write_resp:
+ /*
+ * 1. Check if RVT_S_ACK_PENDING is set. If yes,
+ * goto normal.
+ * 2. Attempt to allocate TID resources.
+ * 3. Remove RVT_S_RESP_PENDING flags from s_flags
+ * 4. If resources not available:
+ * 4.1 Set HFI1_S_WAIT_TID_SPACE
+ * 4.2 Queue QP on RCD TID queue
+ * 4.3 Put QP on iowait list.
+ * 4.4 Build IB RNR NAK with appropriate timeout value
+ * 4.5 Return indication progress made.
+ * 5. If resources are available:
+ * 5.1 Program HW flow CSRs
+ * 5.2 Build TID RDMA WRITE RESP packet
+ * 5.3 If more resources needed, do 4.1 - 4.3.
+ * 5.4 Wake up next QP on RCD TID queue.
+ * 5.5 Return indication progress made.
+ */
+
+ e = &qp->s_ack_queue[qp->s_tail_ack_queue];
+ req = ack_to_tid_req(e);
+
+ /*
+ * Send scheduled RNR NAK's. RNR NAK's need to be sent at
+ * segment boundaries, not at request boundaries. Don't change
+ * s_ack_state because we are still in the middle of a request
+ */
+ if (qpriv->rnr_nak_state == TID_RNR_NAK_SEND &&
+ qp->s_tail_ack_queue == qpriv->r_tid_alloc &&
+ req->cur_seg == req->alloc_seg) {
+ qpriv->rnr_nak_state = TID_RNR_NAK_SENT;
+ goto normal_no_state;
+ }
+
+ bth2 = mask_psn(qp->s_ack_rdma_psn);
+ hdrlen = hfi1_build_tid_rdma_write_resp(qp, e, ohdr, &bth1,
+ bth2, &len,
+ &ps->s_txreq->ss);
+ if (!hdrlen)
+ return 0;
+
+ hwords += hdrlen;
+ bth0 = qp->s_ack_state << 24;
+ qp->s_ack_rdma_psn++;
+ trace_hfi1_tid_req_make_rc_ack_write(qp, 0, e->opcode, e->psn,
+ e->lpsn, req);
+ if (req->cur_seg != req->total_segs)
+ break;
+
+ e->sent = 1;
+ qp->s_ack_state = OP(RDMA_READ_RESPONSE_LAST);
+ break;
+
+ case TID_OP(READ_RESP):
+read_resp:
+ e = &qp->s_ack_queue[qp->s_tail_ack_queue];
+ ps->s_txreq->ss = &qp->s_ack_rdma_sge;
+ delta = hfi1_build_tid_rdma_read_resp(qp, e, ohdr, &bth0,
+ &bth1, &bth2, &len,
+ &last_pkt);
+ if (delta == 0)
+ goto error_qp;
+ hwords += delta;
+ if (last_pkt) {
+ e->sent = 1;
+ /*
+ * Increment qp->s_tail_ack_queue through s_ack_state
+ * transition.
+ */
+ qp->s_ack_state = OP(RDMA_READ_RESPONSE_LAST);
+ }
+ break;
+ case TID_OP(READ_REQ):
+ goto bail;
+
default:
normal:
/*
@@ -211,8 +377,7 @@ normal:
* (see above).
*/
qp->s_ack_state = OP(SEND_ONLY);
- qp->s_flags &= ~RVT_S_ACK_PENDING;
- ps->s_txreq->ss = NULL;
+normal_no_state:
if (qp->s_nak_state)
ohdr->u.aeth =
cpu_to_be32((qp->r_msn & IB_MSN_MASK) |
@@ -224,14 +389,24 @@ normal:
len = 0;
bth0 = OP(ACKNOWLEDGE) << 24;
bth2 = mask_psn(qp->s_ack_psn);
+ qp->s_flags &= ~RVT_S_ACK_PENDING;
+ ps->s_txreq->txreq.flags |= SDMA_TXREQ_F_VIP;
+ ps->s_txreq->ss = NULL;
}
qp->s_rdma_ack_cnt++;
- ps->s_txreq->sde = priv->s_sde;
+ ps->s_txreq->sde = qpriv->s_sde;
ps->s_txreq->s_cur_size = len;
ps->s_txreq->hdr_dwords = hwords;
- hfi1_make_ruc_header(qp, ohdr, bth0, bth2, middle, ps);
+ hfi1_make_ruc_header(qp, ohdr, bth0, bth1, bth2, middle, ps);
return 1;
-
+error_qp:
+ spin_unlock_irqrestore(&qp->s_lock, ps->flags);
+ spin_lock_irqsave(&qp->r_lock, ps->flags);
+ spin_lock(&qp->s_lock);
+ rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR);
+ spin_unlock(&qp->s_lock);
+ spin_unlock_irqrestore(&qp->r_lock, ps->flags);
+ spin_lock_irqsave(&qp->s_lock, ps->flags);
bail:
qp->s_ack_state = OP(ACKNOWLEDGE);
/*
@@ -258,17 +433,23 @@ int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
struct hfi1_qp_priv *priv = qp->priv;
struct hfi1_ibdev *dev = to_idev(qp->ibqp.device);
struct ib_other_headers *ohdr;
- struct rvt_sge_state *ss;
+ struct rvt_sge_state *ss = NULL;
struct rvt_swqe *wqe;
- u32 hwords;
- u32 len;
- u32 bth0 = 0;
- u32 bth2;
+ struct hfi1_swqe_priv *wpriv;
+ struct tid_rdma_request *req = NULL;
+ /* header size in 32-bit words LRH+BTH = (8+12)/4. */
+ u32 hwords = 5;
+ u32 len = 0;
+ u32 bth0 = 0, bth2 = 0;
+ u32 bth1 = qp->remote_qpn | (HFI1_CAP_IS_KSET(OPFN) << IB_BTHE_E_SHIFT);
u32 pmtu = qp->pmtu;
char newreq;
int middle = 0;
int delta;
+ struct tid_rdma_flow *flow = NULL;
+ struct tid_rdma_params *remote;
+ trace_hfi1_sender_make_rc_req(qp);
lockdep_assert_held(&qp->s_lock);
ps->s_txreq = get_txreq(ps->dev, qp);
if (!ps->s_txreq)
@@ -309,13 +490,13 @@ int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
}
clear_ahg(qp);
wqe = rvt_get_swqe_ptr(qp, qp->s_last);
- rvt_send_complete(qp, wqe, qp->s_last != qp->s_acked ?
- IB_WC_SUCCESS : IB_WC_WR_FLUSH_ERR);
+ hfi1_trdma_send_complete(qp, wqe, qp->s_last != qp->s_acked ?
+ IB_WC_SUCCESS : IB_WC_WR_FLUSH_ERR);
/* will get called again */
goto done_free_tx;
}
- if (qp->s_flags & (RVT_S_WAIT_RNR | RVT_S_WAIT_ACK))
+ if (qp->s_flags & (RVT_S_WAIT_RNR | RVT_S_WAIT_ACK | HFI1_S_WAIT_HALT))
goto bail;
if (cmp_psn(qp->s_psn, qp->s_sending_hpsn) <= 0) {
@@ -329,6 +510,7 @@ int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
/* Send a request. */
wqe = rvt_get_swqe_ptr(qp, qp->s_cur);
+check_s_state:
switch (qp->s_state) {
default:
if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_NEXT_SEND_OK))
@@ -350,9 +532,13 @@ int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
/*
* If a fence is requested, wait for previous
* RDMA read and atomic operations to finish.
+ * However, there is no need to guard against
+ * TID RDMA READ after TID RDMA READ.
*/
if ((wqe->wr.send_flags & IB_SEND_FENCE) &&
- qp->s_num_rd_atomic) {
+ qp->s_num_rd_atomic &&
+ (wqe->wr.opcode != IB_WR_TID_RDMA_READ ||
+ priv->pending_tid_r_segs < qp->s_num_rd_atomic)) {
qp->s_flags |= RVT_S_WAIT_FENCE;
goto bail;
}
@@ -397,6 +583,15 @@ int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
len = wqe->length;
ss = &qp->s_sge;
bth2 = mask_psn(qp->s_psn);
+
+ /*
+ * Interlock between various IB requests and TID RDMA
+ * if necessary.
+ */
+ if ((priv->s_flags & HFI1_S_TID_WAIT_INTERLCK) ||
+ hfi1_tid_rdma_wqe_interlock(qp, wqe))
+ goto bail;
+
switch (wqe->wr.opcode) {
case IB_WR_SEND:
case IB_WR_SEND_WITH_IMM:
@@ -473,21 +668,126 @@ no_flow_control:
qp->s_cur = 0;
break;
+ case IB_WR_TID_RDMA_WRITE:
+ if (newreq) {
+ /*
+ * Limit the number of TID RDMA WRITE requests.
+ */
+ if (atomic_read(&priv->n_tid_requests) >=
+ HFI1_TID_RDMA_WRITE_CNT)
+ goto bail;
+
+ if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT))
+ qp->s_lsn++;
+ }
+
+ hwords += hfi1_build_tid_rdma_write_req(qp, wqe, ohdr,
+ &bth1, &bth2,
+ &len);
+ ss = NULL;
+ if (priv->s_tid_cur == HFI1_QP_WQE_INVALID) {
+ priv->s_tid_cur = qp->s_cur;
+ if (priv->s_tid_tail == HFI1_QP_WQE_INVALID) {
+ priv->s_tid_tail = qp->s_cur;
+ priv->s_state = TID_OP(WRITE_RESP);
+ }
+ } else if (priv->s_tid_cur == priv->s_tid_head) {
+ struct rvt_swqe *__w;
+ struct tid_rdma_request *__r;
+
+ __w = rvt_get_swqe_ptr(qp, priv->s_tid_cur);
+ __r = wqe_to_tid_req(__w);
+
+ /*
+ * The s_tid_cur pointer is advanced to s_cur if
+ * any of the following conditions about the WQE
+ * to which s_tid_cur currently points are
+ * satisfied:
+ * 1. The request is not a TID RDMA WRITE
+ * request,
+ * 2. The request is in the INACTIVE or
+ * COMPLETE states (TID RDMA READ requests
+ * stay at INACTIVE and TID RDMA WRITE
+ * transition to COMPLETE when done),
+ * 3. The request is in the ACTIVE or SYNC
+ * state and the number of completed
+ * segments is equal to the total segment
+ * count.
+ * (If ACTIVE, the request is waiting for
+ * ACKs. If SYNC, the request has not
+ * received any responses because it's
+ * waiting on a sync point.)
+ */
+ if (__w->wr.opcode != IB_WR_TID_RDMA_WRITE ||
+ __r->state == TID_REQUEST_INACTIVE ||
+ __r->state == TID_REQUEST_COMPLETE ||
+ ((__r->state == TID_REQUEST_ACTIVE ||
+ __r->state == TID_REQUEST_SYNC) &&
+ __r->comp_seg == __r->total_segs)) {
+ if (priv->s_tid_tail ==
+ priv->s_tid_cur &&
+ priv->s_state ==
+ TID_OP(WRITE_DATA_LAST)) {
+ priv->s_tid_tail = qp->s_cur;
+ priv->s_state =
+ TID_OP(WRITE_RESP);
+ }
+ priv->s_tid_cur = qp->s_cur;
+ }
+ /*
+ * A corner case: when the last TID RDMA WRITE
+ * request was completed, s_tid_head,
+ * s_tid_cur, and s_tid_tail all point to the
+ * same location. Other requests are posted and
+ * s_cur wraps around to the same location,
+ * where a new TID RDMA WRITE is posted. In
+ * this case, none of the indices need to be
+ * updated. However, the priv->s_state should.
+ */
+ if (priv->s_tid_tail == qp->s_cur &&
+ priv->s_state == TID_OP(WRITE_DATA_LAST))
+ priv->s_state = TID_OP(WRITE_RESP);
+ }
+ req = wqe_to_tid_req(wqe);
+ if (newreq) {
+ priv->s_tid_head = qp->s_cur;
+ priv->pending_tid_w_resp += req->total_segs;
+ atomic_inc(&priv->n_tid_requests);
+ atomic_dec(&priv->n_requests);
+ } else {
+ req->state = TID_REQUEST_RESEND;
+ req->comp_seg = delta_psn(bth2, wqe->psn);
+ /*
+ * Pull back any segments since we are going
+ * to re-receive them.
+ */
+ req->setup_head = req->clear_tail;
+ priv->pending_tid_w_resp +=
+ delta_psn(wqe->lpsn, bth2) + 1;
+ }
+
+ trace_hfi1_tid_write_sender_make_req(qp, newreq);
+ trace_hfi1_tid_req_make_req_write(qp, newreq,
+ wqe->wr.opcode,
+ wqe->psn, wqe->lpsn,
+ req);
+ if (++qp->s_cur == qp->s_size)
+ qp->s_cur = 0;
+ break;
+
case IB_WR_RDMA_READ:
/*
* Don't allow more operations to be started
* than the QP limits allow.
*/
- if (newreq) {
- if (qp->s_num_rd_atomic >=
- qp->s_max_rd_atomic) {
- qp->s_flags |= RVT_S_WAIT_RDMAR;
- goto bail;
- }
- qp->s_num_rd_atomic++;
- if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT))
- qp->s_lsn++;
+ if (qp->s_num_rd_atomic >=
+ qp->s_max_rd_atomic) {
+ qp->s_flags |= RVT_S_WAIT_RDMAR;
+ goto bail;
}
+ qp->s_num_rd_atomic++;
+ if (newreq && !(qp->s_flags & RVT_S_UNLIMITED_CREDIT))
+ qp->s_lsn++;
put_ib_reth_vaddr(
wqe->rdma_wr.remote_addr,
&ohdr->u.rc.reth);
@@ -503,23 +803,99 @@ no_flow_control:
qp->s_cur = 0;
break;
+ case IB_WR_TID_RDMA_READ:
+ trace_hfi1_tid_read_sender_make_req(qp, newreq);
+ wpriv = wqe->priv;
+ req = wqe_to_tid_req(wqe);
+ trace_hfi1_tid_req_make_req_read(qp, newreq,
+ wqe->wr.opcode,
+ wqe->psn, wqe->lpsn,
+ req);
+ delta = cmp_psn(qp->s_psn, wqe->psn);
+
+ /*
+ * Don't allow more operations to be started
+ * than the QP limits allow. We could get here under
+ * three conditions: (1) It's a new request; (2) We are
+ * sending the second or later segment of a request,
+ * but the qp->s_state is set to OP(RDMA_READ_REQUEST)
+ * when the last segment of a previous request is
+ * received just before this; (3) We are re-sending a
+ * request.
+ */
+ if (qp->s_num_rd_atomic >= qp->s_max_rd_atomic) {
+ qp->s_flags |= RVT_S_WAIT_RDMAR;
+ goto bail;
+ }
+ if (newreq) {
+ struct tid_rdma_flow *flow =
+ &req->flows[req->setup_head];
+
+ /*
+ * Set up s_sge as it is needed for TID
+ * allocation. However, if the pages have been
+ * walked and mapped, skip it. An earlier try
+ * has failed to allocate the TID entries.
+ */
+ if (!flow->npagesets) {
+ qp->s_sge.sge = wqe->sg_list[0];
+ qp->s_sge.sg_list = wqe->sg_list + 1;
+ qp->s_sge.num_sge = wqe->wr.num_sge;
+ qp->s_sge.total_len = wqe->length;
+ qp->s_len = wqe->length;
+ req->isge = 0;
+ req->clear_tail = req->setup_head;
+ req->flow_idx = req->setup_head;
+ req->state = TID_REQUEST_ACTIVE;
+ }
+ } else if (delta == 0) {
+ /* Re-send a request */
+ req->cur_seg = 0;
+ req->comp_seg = 0;
+ req->ack_pending = 0;
+ req->flow_idx = req->clear_tail;
+ req->state = TID_REQUEST_RESEND;
+ }
+ req->s_next_psn = qp->s_psn;
+ /* Read one segment at a time */
+ len = min_t(u32, req->seg_len,
+ wqe->length - req->seg_len * req->cur_seg);
+ delta = hfi1_build_tid_rdma_read_req(qp, wqe, ohdr,
+ &bth1, &bth2,
+ &len);
+ if (delta <= 0) {
+ /* Wait for TID space */
+ goto bail;
+ }
+ if (newreq && !(qp->s_flags & RVT_S_UNLIMITED_CREDIT))
+ qp->s_lsn++;
+ hwords += delta;
+ ss = &wpriv->ss;
+ /* Check if this is the last segment */
+ if (req->cur_seg >= req->total_segs &&
+ ++qp->s_cur == qp->s_size)
+ qp->s_cur = 0;
+ break;
+
case IB_WR_ATOMIC_CMP_AND_SWP:
case IB_WR_ATOMIC_FETCH_AND_ADD:
/*
* Don't allow more operations to be started
* than the QP limits allow.
*/
- if (newreq) {
- if (qp->s_num_rd_atomic >=
- qp->s_max_rd_atomic) {
- qp->s_flags |= RVT_S_WAIT_RDMAR;
- goto bail;
- }
- qp->s_num_rd_atomic++;
- if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT))
- qp->s_lsn++;
+ if (qp->s_num_rd_atomic >=
+ qp->s_max_rd_atomic) {
+ qp->s_flags |= RVT_S_WAIT_RDMAR;
+ goto bail;
}
- if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP) {
+ qp->s_num_rd_atomic++;
+
+ /* FALLTHROUGH */
+ case IB_WR_OPFN:
+ if (newreq && !(qp->s_flags & RVT_S_UNLIMITED_CREDIT))
+ qp->s_lsn++;
+ if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
+ wqe->wr.opcode == IB_WR_OPFN) {
qp->s_state = OP(COMPARE_SWAP);
put_ib_ateth_swap(wqe->atomic_wr.swap,
&ohdr->u.atomic_eth);
@@ -546,18 +922,23 @@ no_flow_control:
default:
goto bail;
}
- qp->s_sge.sge = wqe->sg_list[0];
- qp->s_sge.sg_list = wqe->sg_list + 1;
- qp->s_sge.num_sge = wqe->wr.num_sge;
- qp->s_sge.total_len = wqe->length;
- qp->s_len = wqe->length;
+ if (wqe->wr.opcode != IB_WR_TID_RDMA_READ) {
+ qp->s_sge.sge = wqe->sg_list[0];
+ qp->s_sge.sg_list = wqe->sg_list + 1;
+ qp->s_sge.num_sge = wqe->wr.num_sge;
+ qp->s_sge.total_len = wqe->length;
+ qp->s_len = wqe->length;
+ }
if (newreq) {
qp->s_tail++;
if (qp->s_tail >= qp->s_size)
qp->s_tail = 0;
}
- if (wqe->wr.opcode == IB_WR_RDMA_READ)
+ if (wqe->wr.opcode == IB_WR_RDMA_READ ||
+ wqe->wr.opcode == IB_WR_TID_RDMA_WRITE)
qp->s_psn = wqe->lpsn + 1;
+ else if (wqe->wr.opcode == IB_WR_TID_RDMA_READ)
+ qp->s_psn = req->s_next_psn;
else
qp->s_psn++;
break;
@@ -674,10 +1055,137 @@ no_flow_control:
if (qp->s_cur == qp->s_size)
qp->s_cur = 0;
break;
+
+ case TID_OP(WRITE_RESP):
+ /*
+ * This value for s_state is used for restarting a TID RDMA
+ * WRITE request. See comment in
+ * OP(RDMA_READ_RESPONSE_MIDDLE) for more.
+ */
+ req = wqe_to_tid_req(wqe);
+ req->state = TID_REQUEST_RESEND;
+ rcu_read_lock();
+ remote = rcu_dereference(priv->tid_rdma.remote);
+ req->comp_seg = delta_psn(qp->s_psn, wqe->psn);
+ len = wqe->length - (req->comp_seg * remote->max_len);
+ rcu_read_unlock();
+
+ bth2 = mask_psn(qp->s_psn);
+ hwords += hfi1_build_tid_rdma_write_req(qp, wqe, ohdr, &bth1,
+ &bth2, &len);
+ qp->s_psn = wqe->lpsn + 1;
+ ss = NULL;
+ qp->s_state = TID_OP(WRITE_REQ);
+ priv->pending_tid_w_resp += delta_psn(wqe->lpsn, bth2) + 1;
+ priv->s_tid_cur = qp->s_cur;
+ if (++qp->s_cur == qp->s_size)
+ qp->s_cur = 0;
+ trace_hfi1_tid_req_make_req_write(qp, 0, wqe->wr.opcode,
+ wqe->psn, wqe->lpsn, req);
+ break;
+
+ case TID_OP(READ_RESP):
+ if (wqe->wr.opcode != IB_WR_TID_RDMA_READ)
+ goto bail;
+ /* This is used to restart a TID read request */
+ req = wqe_to_tid_req(wqe);
+ wpriv = wqe->priv;
+ /*
+ * Back down. The field qp->s_psn has been set to the psn with
+ * which the request should be restarted. It's OK to use
+ * division as this is on the retry path.
+ */
+ req->cur_seg = delta_psn(qp->s_psn, wqe->psn) / priv->pkts_ps;
+
+ /*
+ * The following function needs to be redefined to return the
+ * status to make sure that we find the flow. At the same
+ * time, we can use the req->state change to check if the
+ * call succeeds or not.
+ */
+ req->state = TID_REQUEST_RESEND;
+ hfi1_tid_rdma_restart_req(qp, wqe, &bth2);
+ if (req->state != TID_REQUEST_ACTIVE) {
+ /*
+ * Failed to find the flow. Release all allocated tid
+ * resources.
+ */
+ hfi1_kern_exp_rcv_clear_all(req);
+ hfi1_kern_clear_hw_flow(priv->rcd, qp);
+
+ hfi1_trdma_send_complete(qp, wqe, IB_WC_LOC_QP_OP_ERR);
+ goto bail;
+ }
+ req->state = TID_REQUEST_RESEND;
+ len = min_t(u32, req->seg_len,
+ wqe->length - req->seg_len * req->cur_seg);
+ flow = &req->flows[req->flow_idx];
+ len -= flow->sent;
+ req->s_next_psn = flow->flow_state.ib_lpsn + 1;
+ delta = hfi1_build_tid_rdma_read_packet(wqe, ohdr, &bth1,
+ &bth2, &len);
+ if (delta <= 0) {
+ /* Wait for TID space */
+ goto bail;
+ }
+ hwords += delta;
+ ss = &wpriv->ss;
+ /* Check if this is the last segment */
+ if (req->cur_seg >= req->total_segs &&
+ ++qp->s_cur == qp->s_size)
+ qp->s_cur = 0;
+ qp->s_psn = req->s_next_psn;
+ trace_hfi1_tid_req_make_req_read(qp, 0, wqe->wr.opcode,
+ wqe->psn, wqe->lpsn, req);
+ break;
+ case TID_OP(READ_REQ):
+ req = wqe_to_tid_req(wqe);
+ delta = cmp_psn(qp->s_psn, wqe->psn);
+ /*
+ * If the current WR is not TID RDMA READ, or this is the start
+ * of a new request, we need to change the qp->s_state so that
+ * the request can be set up properly.
+ */
+ if (wqe->wr.opcode != IB_WR_TID_RDMA_READ || delta == 0 ||
+ qp->s_cur == qp->s_tail) {
+ qp->s_state = OP(RDMA_READ_REQUEST);
+ if (delta == 0 || qp->s_cur == qp->s_tail)
+ goto check_s_state;
+ else
+ goto bail;
+ }
+
+ /* Rate limiting */
+ if (qp->s_num_rd_atomic >= qp->s_max_rd_atomic) {
+ qp->s_flags |= RVT_S_WAIT_RDMAR;
+ goto bail;
+ }
+
+ wpriv = wqe->priv;
+ /* Read one segment at a time */
+ len = min_t(u32, req->seg_len,
+ wqe->length - req->seg_len * req->cur_seg);
+ delta = hfi1_build_tid_rdma_read_req(qp, wqe, ohdr, &bth1,
+ &bth2, &len);
+ if (delta <= 0) {
+ /* Wait for TID space */
+ goto bail;
+ }
+ hwords += delta;
+ ss = &wpriv->ss;
+ /* Check if this is the last segment */
+ if (req->cur_seg >= req->total_segs &&
+ ++qp->s_cur == qp->s_size)
+ qp->s_cur = 0;
+ qp->s_psn = req->s_next_psn;
+ trace_hfi1_tid_req_make_req_read(qp, 0, wqe->wr.opcode,
+ wqe->psn, wqe->lpsn, req);
+ break;
}
qp->s_sending_hpsn = bth2;
delta = delta_psn(bth2, wqe->psn);
- if (delta && delta % HFI1_PSN_CREDIT == 0)
+ if (delta && delta % HFI1_PSN_CREDIT == 0 &&
+ wqe->wr.opcode != IB_WR_TID_RDMA_WRITE)
bth2 |= IB_BTH_REQ_ACK;
if (qp->s_flags & RVT_S_SEND_ONE) {
qp->s_flags &= ~RVT_S_SEND_ONE;
@@ -693,6 +1201,7 @@ no_flow_control:
qp,
ohdr,
bth0 | (qp->s_state << 24),
+ bth1,
bth2,
middle,
ps);
@@ -709,6 +1218,12 @@ bail:
bail_no_tx:
ps->s_txreq = NULL;
qp->s_flags &= ~RVT_S_BUSY;
+ /*
+ * If we didn't get a txreq, the QP will be woken up later to try
+ * again. Set the flags to indicate which work item to wake
+ * up.
+ */
+ iowait_set_flag(&priv->s_iowait, IOWAIT_PENDING_IB);
return 0;
}
@@ -796,6 +1311,11 @@ static inline void hfi1_make_rc_ack_9B(struct hfi1_packet *packet,
if (qp->s_mig_state == IB_MIG_MIGRATED)
bth0 |= IB_BTH_MIG_REQ;
bth1 = (!!is_fecn) << IB_BECN_SHIFT;
+ /*
+ * Inline ACKs go out without the use of the Verbs send engine, so
+ * we need to set the STL Verbs Extended bit here
+ */
+ bth1 |= HFI1_CAP_IS_KSET(OPFN) << IB_BTHE_E_SHIFT;
hfi1_make_bth_aeth(qp, ohdr, bth0, bth1);
}
@@ -936,6 +1456,43 @@ void hfi1_send_rc_ack(struct hfi1_packet *packet, bool is_fecn)
}
/**
+ * update_num_rd_atomic - update the qp->s_num_rd_atomic
+ * @qp: the QP
+ * @psn: the packet sequence number to restart at
+ * @wqe: the wqe
+ *
+ * This is called from reset_psn() to update qp->s_num_rd_atomic
+ * for the current wqe.
+ * Called at interrupt level with the QP s_lock held.
+ *
+ * reset_psn() zeroes qp->s_num_rd_atomic and priv->pending_tid_r_segs
+ * before it walks the send queue, and calls this helper for each wqe
+ * it steps over so that the counters are rebuilt from scratch.
+ *
+ * Accounting per opcode:
+ *  - IB_WR_RDMA_READ, IB_WR_ATOMIC_CMP_AND_SWP and
+ *    IB_WR_ATOMIC_FETCH_AND_ADD each add one to qp->s_num_rd_atomic.
+ *  - IB_WR_TID_RDMA_READ is counted per segment.  When @psn does not
+ *    compare past wqe->lpsn, the segment index of @psn is derived from
+ *    its offset from wqe->psn divided by priv->pkts_ps;
+ *    req->ack_pending is set to that index minus req->comp_seg and is
+ *    added to both priv->pending_tid_r_segs and qp->s_num_rd_atomic.
+ *    Otherwise req->total_segs is added to both counters and
+ *    req->ack_pending is left untouched.
+ *  - Any other opcode leaves the counters unchanged.
+ *
+ * NOTE(review): the segment index uses a plain (psn - wqe->psn)
+ * subtraction rather than delta_psn(); presumably callers guarantee
+ * that @psn is not before wqe->psn here - confirm.
+ */
+static void update_num_rd_atomic(struct rvt_qp *qp, u32 psn,
+ struct rvt_swqe *wqe)
+{
+ u32 opcode = wqe->wr.opcode;
+
+ if (opcode == IB_WR_RDMA_READ ||
+ opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
+ opcode == IB_WR_ATOMIC_FETCH_AND_ADD) {
+ qp->s_num_rd_atomic++;
+ } else if (opcode == IB_WR_TID_RDMA_READ) {
+ struct tid_rdma_request *req = wqe_to_tid_req(wqe);
+ struct hfi1_qp_priv *priv = qp->priv;
+
+ if (cmp_psn(psn, wqe->lpsn) <= 0) {
+ u32 cur_seg;
+
+ cur_seg = (psn - wqe->psn) / priv->pkts_ps;
+ req->ack_pending = cur_seg - req->comp_seg;
+ priv->pending_tid_r_segs += req->ack_pending;
+ qp->s_num_rd_atomic += req->ack_pending;
+ } else {
+ priv->pending_tid_r_segs += req->total_segs;
+ qp->s_num_rd_atomic += req->total_segs;
+ }
+ }
+}
+
+/**
* reset_psn - reset the QP state to send starting from PSN
* @qp: the QP
* @psn: the packet sequence number to restart at
@@ -949,9 +1506,13 @@ static void reset_psn(struct rvt_qp *qp, u32 psn)
u32 n = qp->s_acked;
struct rvt_swqe *wqe = rvt_get_swqe_ptr(qp, n);
u32 opcode;
+ struct hfi1_qp_priv *priv = qp->priv;
lockdep_assert_held(&qp->s_lock);
qp->s_cur = n;
+ priv->pending_tid_r_segs = 0;
+ priv->pending_tid_w_resp = 0;
+ qp->s_num_rd_atomic = 0;
/*
* If we are starting the request from the beginning,
@@ -961,9 +1522,9 @@ static void reset_psn(struct rvt_qp *qp, u32 psn)
qp->s_state = OP(SEND_LAST);
goto done;
}
+ update_num_rd_atomic(qp, psn, wqe);
/* Find the work request opcode corresponding to the given PSN. */
- opcode = wqe->wr.opcode;
for (;;) {
int diff;
@@ -973,8 +1534,11 @@ static void reset_psn(struct rvt_qp *qp, u32 psn)
break;
wqe = rvt_get_swqe_ptr(qp, n);
diff = cmp_psn(psn, wqe->psn);
- if (diff < 0)
+ if (diff < 0) {
+ /* Point wqe back to the previous one */
+ wqe = rvt_get_swqe_ptr(qp, qp->s_cur);
break;
+ }
qp->s_cur = n;
/*
* If we are starting the request from the beginning,
@@ -984,8 +1548,10 @@ static void reset_psn(struct rvt_qp *qp, u32 psn)
qp->s_state = OP(SEND_LAST);
goto done;
}
- opcode = wqe->wr.opcode;
+
+ update_num_rd_atomic(qp, psn, wqe);
}
+ opcode = wqe->wr.opcode;
/*
* Set the state to restart in the middle of a request.
@@ -1003,10 +1569,18 @@ static void reset_psn(struct rvt_qp *qp, u32 psn)
qp->s_state = OP(RDMA_READ_RESPONSE_LAST);
break;
+ case IB_WR_TID_RDMA_WRITE:
+ qp->s_state = TID_OP(WRITE_RESP);
+ break;
+
case IB_WR_RDMA_READ:
qp->s_state = OP(RDMA_READ_RESPONSE_MIDDLE);
break;
+ case IB_WR_TID_RDMA_READ:
+ qp->s_state = TID_OP(READ_RESP);
+ break;
+
default:
/*
* This case shouldn't happen since its only
@@ -1015,6 +1589,7 @@ static void reset_psn(struct rvt_qp *qp, u32 psn)
qp->s_state = OP(SEND_LAST);
}
done:
+ priv->s_flags &= ~HFI1_S_TID_WAIT_INTERLCK;
qp->s_psn = psn;
/*
* Set RVT_S_WAIT_PSN as rc_complete() may start the timer
@@ -1025,6 +1600,7 @@ done:
(cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) <= 0))
qp->s_flags |= RVT_S_WAIT_PSN;
qp->s_flags &= ~HFI1_S_AHG_VALID;
+ trace_hfi1_sender_reset_psn(qp);
}
/*
@@ -1033,18 +1609,47 @@ done:
*/
void hfi1_restart_rc(struct rvt_qp *qp, u32 psn, int wait)
{
+ struct hfi1_qp_priv *priv = qp->priv;
struct rvt_swqe *wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
struct hfi1_ibport *ibp;
lockdep_assert_held(&qp->r_lock);
lockdep_assert_held(&qp->s_lock);
+ trace_hfi1_sender_restart_rc(qp);
if (qp->s_retry == 0) {
if (qp->s_mig_state == IB_MIG_ARMED) {
hfi1_migrate_qp(qp);
qp->s_retry = qp->s_retry_cnt;
} else if (qp->s_last == qp->s_acked) {
- rvt_send_complete(qp, wqe, IB_WC_RETRY_EXC_ERR);
- rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR);
+ /*
+ * We need special handling for the OPFN request WQEs as
+ * they are not allowed to generate real user errors
+ */
+ if (wqe->wr.opcode == IB_WR_OPFN) {
+ struct hfi1_ibport *ibp =
+ to_iport(qp->ibqp.device, qp->port_num);
+ /*
+ * Call opfn_conn_reply() with capcode and
+ * remaining data as 0 to close out the
+ * current request
+ */
+ opfn_conn_reply(qp, priv->opfn.curr);
+ wqe = do_rc_completion(qp, wqe, ibp);
+ qp->s_flags &= ~RVT_S_WAIT_ACK;
+ } else {
+ trace_hfi1_tid_write_sender_restart_rc(qp, 0);
+ if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) {
+ struct tid_rdma_request *req;
+
+ req = wqe_to_tid_req(wqe);
+ hfi1_kern_exp_rcv_clear_all(req);
+ hfi1_kern_clear_hw_flow(priv->rcd, qp);
+ }
+
+ hfi1_trdma_send_complete(qp, wqe,
+ IB_WC_RETRY_EXC_ERR);
+ rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR);
+ }
return;
} else { /* need to handle delayed completion */
return;
@@ -1054,14 +1659,15 @@ void hfi1_restart_rc(struct rvt_qp *qp, u32 psn, int wait)
}
ibp = to_iport(qp->ibqp.device, qp->port_num);
- if (wqe->wr.opcode == IB_WR_RDMA_READ)
+ if (wqe->wr.opcode == IB_WR_RDMA_READ ||
+ wqe->wr.opcode == IB_WR_TID_RDMA_READ)
ibp->rvp.n_rc_resends++;
else
ibp->rvp.n_rc_resends += delta_psn(qp->s_psn, psn);
qp->s_flags &= ~(RVT_S_WAIT_FENCE | RVT_S_WAIT_RDMAR |
RVT_S_WAIT_SSN_CREDIT | RVT_S_WAIT_PSN |
- RVT_S_WAIT_ACK);
+ RVT_S_WAIT_ACK | HFI1_S_WAIT_TID_RESP);
if (wait)
qp->s_flags |= RVT_S_SEND_ONE;
reset_psn(qp, psn);
@@ -1069,7 +1675,8 @@ void hfi1_restart_rc(struct rvt_qp *qp, u32 psn, int wait)
/*
* Set qp->s_sending_psn to the next PSN after the given one.
- * This would be psn+1 except when RDMA reads are present.
+ * This would be psn+1 except when RDMA reads or TID RDMA ops
+ * are present.
*/
static void reset_sending_psn(struct rvt_qp *qp, u32 psn)
{
@@ -1081,7 +1688,9 @@ static void reset_sending_psn(struct rvt_qp *qp, u32 psn)
for (;;) {
wqe = rvt_get_swqe_ptr(qp, n);
if (cmp_psn(psn, wqe->lpsn) <= 0) {
- if (wqe->wr.opcode == IB_WR_RDMA_READ)
+ if (wqe->wr.opcode == IB_WR_RDMA_READ ||
+ wqe->wr.opcode == IB_WR_TID_RDMA_READ ||
+ wqe->wr.opcode == IB_WR_TID_RDMA_WRITE)
qp->s_sending_psn = wqe->lpsn + 1;
else
qp->s_sending_psn = psn + 1;
@@ -1104,8 +1713,9 @@ void hfi1_rc_send_complete(struct rvt_qp *qp, struct hfi1_opa_header *opah)
struct rvt_swqe *wqe;
struct ib_header *hdr = NULL;
struct hfi1_16b_header *hdr_16b = NULL;
- u32 opcode;
+ u32 opcode, head, tail;
u32 psn;
+ struct tid_rdma_request *req;
lockdep_assert_held(&qp->s_lock);
if (!(ib_rvt_state_ops[qp->state] & RVT_SEND_OR_FLUSH_OR_RECV_OK))
@@ -1130,25 +1740,85 @@ void hfi1_rc_send_complete(struct rvt_qp *qp, struct hfi1_opa_header *opah)
}
opcode = ib_bth_get_opcode(ohdr);
- if (opcode >= OP(RDMA_READ_RESPONSE_FIRST) &&
- opcode <= OP(ATOMIC_ACKNOWLEDGE)) {
+ if ((opcode >= OP(RDMA_READ_RESPONSE_FIRST) &&
+ opcode <= OP(ATOMIC_ACKNOWLEDGE)) ||
+ opcode == TID_OP(READ_RESP) ||
+ opcode == TID_OP(WRITE_RESP)) {
WARN_ON(!qp->s_rdma_ack_cnt);
qp->s_rdma_ack_cnt--;
return;
}
psn = ib_bth_get_psn(ohdr);
- reset_sending_psn(qp, psn);
+ /*
+ * Don't attempt to reset the sending PSN for packets in the
+ * KDETH PSN space since the PSN does not match anything.
+ */
+ if (opcode != TID_OP(WRITE_DATA) &&
+ opcode != TID_OP(WRITE_DATA_LAST) &&
+ opcode != TID_OP(ACK) && opcode != TID_OP(RESYNC))
+ reset_sending_psn(qp, psn);
+
+ /* Handle TID RDMA WRITE packets differently */
+ if (opcode >= TID_OP(WRITE_REQ) &&
+ opcode <= TID_OP(WRITE_DATA_LAST)) {
+ head = priv->s_tid_head;
+ tail = priv->s_tid_cur;
+ /*
+ * s_tid_cur is set to s_tid_head in the case where
+ * a new TID RDMA request is being started and all
+ * previous ones have been completed.
+ * Therefore, we need to do a secondary check in order
+ * to properly determine whether we should start the
+ * RC timer.
+ */
+ wqe = rvt_get_swqe_ptr(qp, tail);
+ req = wqe_to_tid_req(wqe);
+ if (head == tail && req->comp_seg < req->total_segs) {
+ if (tail == 0)
+ tail = qp->s_size - 1;
+ else
+ tail -= 1;
+ }
+ } else {
+ head = qp->s_tail;
+ tail = qp->s_acked;
+ }
/*
* Start timer after a packet requesting an ACK has been sent and
* there are still requests that haven't been acked.
*/
- if ((psn & IB_BTH_REQ_ACK) && qp->s_acked != qp->s_tail &&
+ if ((psn & IB_BTH_REQ_ACK) && tail != head &&
+ opcode != TID_OP(WRITE_DATA) && opcode != TID_OP(WRITE_DATA_LAST) &&
+ opcode != TID_OP(RESYNC) &&
!(qp->s_flags &
- (RVT_S_TIMER | RVT_S_WAIT_RNR | RVT_S_WAIT_PSN)) &&
- (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK))
- rvt_add_retry_timer(qp);
+ (RVT_S_TIMER | RVT_S_WAIT_RNR | RVT_S_WAIT_PSN)) &&
+ (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) {
+ if (opcode == TID_OP(READ_REQ))
+ rvt_add_retry_timer_ext(qp, priv->timeout_shift);
+ else
+ rvt_add_retry_timer(qp);
+ }
+
+ /* Start TID RDMA ACK timer */
+ if ((opcode == TID_OP(WRITE_DATA) ||
+ opcode == TID_OP(WRITE_DATA_LAST) ||
+ opcode == TID_OP(RESYNC)) &&
+ (psn & IB_BTH_REQ_ACK) &&
+ !(priv->s_flags & HFI1_S_TID_RETRY_TIMER) &&
+ (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) {
+ /*
+ * The TID RDMA ACK packet could be received before this
+ * function is called. Therefore, add the timer only if TID
+ * RDMA ACK packets are actually pending.
+ */
+ wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
+ req = wqe_to_tid_req(wqe);
+ if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE &&
+ req->ack_seg < req->cur_seg)
+ hfi1_add_tid_retry_timer(qp);
+ }
while (qp->s_last != qp->s_acked) {
u32 s_last;
@@ -1157,6 +1827,8 @@ void hfi1_rc_send_complete(struct rvt_qp *qp, struct hfi1_opa_header *opah)
if (cmp_psn(wqe->lpsn, qp->s_sending_psn) >= 0 &&
cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) <= 0)
break;
+ trdma_clean_swqe(qp, wqe);
+ rvt_qp_wqe_unreserve(qp, wqe);
s_last = qp->s_last;
trace_hfi1_qp_send_completion(qp, wqe, s_last);
if (++s_last >= qp->s_size)
@@ -1194,21 +1866,26 @@ static inline void update_last_psn(struct rvt_qp *qp, u32 psn)
* This is similar to hfi1_send_complete but has to check to be sure
* that the SGEs are not being referenced if the SWQE is being resent.
*/
-static struct rvt_swqe *do_rc_completion(struct rvt_qp *qp,
- struct rvt_swqe *wqe,
- struct hfi1_ibport *ibp)
+struct rvt_swqe *do_rc_completion(struct rvt_qp *qp,
+ struct rvt_swqe *wqe,
+ struct hfi1_ibport *ibp)
{
+ struct hfi1_qp_priv *priv = qp->priv;
+
lockdep_assert_held(&qp->s_lock);
/*
* Don't decrement refcount and don't generate a
* completion if the SWQE is being resent until the send
* is finished.
*/
+ trace_hfi1_rc_completion(qp, wqe->lpsn);
if (cmp_psn(wqe->lpsn, qp->s_sending_psn) < 0 ||
cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) > 0) {
u32 s_last;
+ trdma_clean_swqe(qp, wqe);
rvt_put_swqe(wqe);
+ rvt_qp_wqe_unreserve(qp, wqe);
s_last = qp->s_last;
trace_hfi1_qp_send_completion(qp, wqe, s_last);
if (++s_last >= qp->s_size)
@@ -1241,7 +1918,16 @@ static struct rvt_swqe *do_rc_completion(struct rvt_qp *qp,
}
qp->s_retry = qp->s_retry_cnt;
- update_last_psn(qp, wqe->lpsn);
+ /*
+ * Don't update the last PSN if the request being completed is
+ * a TID RDMA WRITE request.
+ * Completion of the TID RDMA WRITE requests is done by the
+ * TID RDMA ACKs and as such could be for a request that has
+ * already been ACKed as far as the IB state machine is
+ * concerned.
+ */
+ if (wqe->wr.opcode != IB_WR_TID_RDMA_WRITE)
+ update_last_psn(qp, wqe->lpsn);
/*
* If we are completing a request which is in the process of
@@ -1264,9 +1950,61 @@ static struct rvt_swqe *do_rc_completion(struct rvt_qp *qp,
qp->s_draining = 0;
wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
}
+ if (priv->s_flags & HFI1_S_TID_WAIT_INTERLCK) {
+ priv->s_flags &= ~HFI1_S_TID_WAIT_INTERLCK;
+ hfi1_schedule_send(qp);
+ }
return wqe;
}
+static void set_restart_qp(struct rvt_qp *qp, struct hfi1_ctxtdata *rcd)
+{
+ /*
+ * Retry this request.
+ *
+ * RVT_R_RDMAR_SEQ is tested and then set, so the restart below is
+ * issued at most once for as long as the flag remains set; further
+ * calls are no-ops.  The restart begins at the PSN that follows
+ * qp->s_last_psn and is requested with wait == 0.
+ *
+ * If the QP is not already linked on a response wait list
+ * (qp->rspwait is empty), RVT_R_RSP_SEND is set, rvt_get_qp() is
+ * called on the QP, and the QP is appended to rcd->qp_wait_list.
+ * NOTE(review): presumably whoever drains qp_wait_list drops the
+ * reference taken here - confirm against the list consumer.
+ */
+ if (!(qp->r_flags & RVT_R_RDMAR_SEQ)) {
+ qp->r_flags |= RVT_R_RDMAR_SEQ;
+ hfi1_restart_rc(qp, qp->s_last_psn + 1, 0);
+ if (list_empty(&qp->rspwait)) {
+ qp->r_flags |= RVT_R_RSP_SEND;
+ rvt_get_qp(qp);
+ list_add_tail(&qp->rspwait, &rcd->qp_wait_list);
+ }
+ }
+}
+
+/**
+ * update_qp_retry_state - Update qp retry state.
+ * @qp: the QP
+ * @psn: the packet sequence number of the TID RDMA WRITE RESP.
+ * @spsn: The start psn for the given TID RDMA WRITE swqe.
+ * @lpsn: The last psn for the given TID RDMA WRITE swqe.
+ *
+ * This function is called to update the qp retry state upon
+ * receiving a TID WRITE RESP after the qp is scheduled to retry
+ * a request.
+ *
+ * qp->s_psn is always advanced to @psn + 1.  Beyond that there are
+ * three outcomes:
+ *  - @psn is at or past @lpsn (last response): qp->s_cur moves to the
+ *    slot after qpriv->s_tid_cur, wrapping to 0 at qp->s_size, and
+ *    qp->s_state becomes TID_OP(WRITE_REQ).
+ *  - @psn equals @spsn (first response): qp->s_cur is set to
+ *    qpriv->s_tid_cur and qp->s_state becomes TID_OP(WRITE_RESP).
+ *  - otherwise (a response in the middle of the request): qp->s_cur
+ *    and qp->s_state are left as they are.
+ *
+ * The "last" test is evaluated first, so when @spsn and @lpsn are the
+ * same PSN the request is treated as finished rather than as just
+ * started.
+ */
+static void update_qp_retry_state(struct rvt_qp *qp, u32 psn, u32 spsn,
+ u32 lpsn)
+{
+ struct hfi1_qp_priv *qpriv = qp->priv;
+
+ qp->s_psn = psn + 1;
+ /*
+ * If this is the first TID RDMA WRITE RESP packet for the current
+ * request, change the s_state so that the retry will be processed
+ * correctly. Similarly, if this is the last TID RDMA WRITE RESP
+ * packet, change the s_state and advance the s_cur.
+ */
+ if (cmp_psn(psn, lpsn) >= 0) {
+ qp->s_cur = qpriv->s_tid_cur + 1;
+ if (qp->s_cur >= qp->s_size)
+ qp->s_cur = 0;
+ qp->s_state = TID_OP(WRITE_REQ);
+ } else if (!cmp_psn(psn, spsn)) {
+ qp->s_cur = qpriv->s_tid_cur;
+ qp->s_state = TID_OP(WRITE_RESP);
+ }
+}
+
/**
* do_rc_ack - process an incoming RC ACK
* @qp: the QP the ACK came in on
@@ -1278,15 +2016,17 @@ static struct rvt_swqe *do_rc_completion(struct rvt_qp *qp,
* May be called at interrupt level, with the QP s_lock held.
* Returns 1 if OK, 0 if current operation should be aborted (NAK).
*/
-static int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode,
- u64 val, struct hfi1_ctxtdata *rcd)
+int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode,
+ u64 val, struct hfi1_ctxtdata *rcd)
{
struct hfi1_ibport *ibp;
enum ib_wc_status status;
+ struct hfi1_qp_priv *qpriv = qp->priv;
struct rvt_swqe *wqe;
int ret = 0;
u32 ack_psn;
int diff;
+ struct rvt_dev_info *rdi;
lockdep_assert_held(&qp->s_lock);
/*
@@ -1329,20 +2069,14 @@ static int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode,
*/
if ((wqe->wr.opcode == IB_WR_RDMA_READ &&
(opcode != OP(RDMA_READ_RESPONSE_LAST) || diff != 0)) ||
+ (wqe->wr.opcode == IB_WR_TID_RDMA_READ &&
+ (opcode != TID_OP(READ_RESP) || diff != 0)) ||
((wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) &&
- (opcode != OP(ATOMIC_ACKNOWLEDGE) || diff != 0))) {
- /* Retry this request. */
- if (!(qp->r_flags & RVT_R_RDMAR_SEQ)) {
- qp->r_flags |= RVT_R_RDMAR_SEQ;
- hfi1_restart_rc(qp, qp->s_last_psn + 1, 0);
- if (list_empty(&qp->rspwait)) {
- qp->r_flags |= RVT_R_RSP_SEND;
- rvt_get_qp(qp);
- list_add_tail(&qp->rspwait,
- &rcd->qp_wait_list);
- }
- }
+ (opcode != OP(ATOMIC_ACKNOWLEDGE) || diff != 0)) ||
+ (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE &&
+ (delta_psn(psn, qp->s_last_psn) != 1))) {
+ set_restart_qp(qp, rcd);
/*
* No need to process the ACK/NAK since we are
* restarting an earlier request.
@@ -1354,6 +2088,9 @@ static int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode,
u64 *vaddr = wqe->sg_list[0].vaddr;
*vaddr = val;
}
+ if (wqe->wr.opcode == IB_WR_OPFN)
+ opfn_conn_reply(qp, val);
+
if (qp->s_num_rd_atomic &&
(wqe->wr.opcode == IB_WR_RDMA_READ ||
wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
@@ -1371,26 +2108,85 @@ static int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode,
hfi1_schedule_send(qp);
}
}
+
+ /*
+ * TID RDMA WRITE requests will be completed by the TID RDMA
+ * ACK packet handler (see tid_rdma.c).
+ */
+ if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE)
+ break;
+
wqe = do_rc_completion(qp, wqe, ibp);
if (qp->s_acked == qp->s_tail)
break;
}
+ trace_hfi1_rc_ack_do(qp, aeth, psn, wqe);
+ trace_hfi1_sender_do_rc_ack(qp);
switch (aeth >> IB_AETH_NAK_SHIFT) {
case 0: /* ACK */
this_cpu_inc(*ibp->rvp.rc_acks);
- if (qp->s_acked != qp->s_tail) {
- /*
- * We are expecting more ACKs so
- * mod the retry timer.
- */
- rvt_mod_retry_timer(qp);
+ if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) {
+ if (wqe_to_tid_req(wqe)->ack_pending)
+ rvt_mod_retry_timer_ext(qp,
+ qpriv->timeout_shift);
+ else
+ rvt_stop_rc_timers(qp);
+ } else if (qp->s_acked != qp->s_tail) {
+ struct rvt_swqe *__w = NULL;
+
+ if (qpriv->s_tid_cur != HFI1_QP_WQE_INVALID)
+ __w = rvt_get_swqe_ptr(qp, qpriv->s_tid_cur);
+
/*
- * We can stop re-sending the earlier packets and
- * continue with the next packet the receiver wants.
+ * Stop timers if we've received all of the TID RDMA
+ * WRITE responses.
*/
- if (cmp_psn(qp->s_psn, psn) <= 0)
- reset_psn(qp, psn + 1);
+ if (__w && __w->wr.opcode == IB_WR_TID_RDMA_WRITE &&
+ opcode == TID_OP(WRITE_RESP)) {
+ /*
+ * Normally, the loop above would correctly
+ * process all WQEs from s_acked onward and
+ * either complete them or check for correct
+ * PSN sequencing.
+ * However, for TID RDMA, due to pipelining,
+ * the response may not be for the request at
+ * s_acked so the above loop would just be
+ * skipped. This does not allow for checking
+ * the PSN sequencing. It has to be done
+ * separately.
+ */
+ if (cmp_psn(psn, qp->s_last_psn + 1)) {
+ set_restart_qp(qp, rcd);
+ goto bail_stop;
+ }
+ /*
+ * If the psn is being resent, stop the
+ * resending.
+ */
+ if (qp->s_cur != qp->s_tail &&
+ cmp_psn(qp->s_psn, psn) <= 0)
+ update_qp_retry_state(qp, psn,
+ __w->psn,
+ __w->lpsn);
+ else if (--qpriv->pending_tid_w_resp)
+ rvt_mod_retry_timer(qp);
+ else
+ rvt_stop_rc_timers(qp);
+ } else {
+ /*
+ * We are expecting more ACKs so
+ * mod the retry timer.
+ */
+ rvt_mod_retry_timer(qp);
+ /*
+ * We can stop re-sending the earlier packets
+ * and continue with the next packet the
+ * receiver wants.
+ */
+ if (cmp_psn(qp->s_psn, psn) <= 0)
+ reset_psn(qp, psn + 1);
+ }
} else {
/* No more acks - kill all timers */
rvt_stop_rc_timers(qp);
@@ -1406,6 +2202,15 @@ static int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode,
rvt_get_credit(qp, aeth);
qp->s_rnr_retry = qp->s_rnr_retry_cnt;
qp->s_retry = qp->s_retry_cnt;
+ /*
+ * If the current request is a TID RDMA WRITE request and the
+ * response is not a TID RDMA WRITE RESP packet, s_last_psn
+ * can't be advanced.
+ */
+ if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE &&
+ opcode != TID_OP(WRITE_RESP) &&
+ cmp_psn(psn, wqe->psn) >= 0)
+ return 1;
update_last_psn(qp, psn);
return 1;
@@ -1415,20 +2220,31 @@ static int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode,
goto bail_stop;
if (qp->s_flags & RVT_S_WAIT_RNR)
goto bail_stop;
- if (qp->s_rnr_retry == 0) {
+ rdi = ib_to_rvt(qp->ibqp.device);
+ if (qp->s_rnr_retry == 0 &&
+ !((rdi->post_parms[wqe->wr.opcode].flags &
+ RVT_OPERATION_IGN_RNR_CNT) &&
+ qp->s_rnr_retry_cnt == 0)) {
status = IB_WC_RNR_RETRY_EXC_ERR;
goto class_b;
}
- if (qp->s_rnr_retry_cnt < 7)
+ if (qp->s_rnr_retry_cnt < 7 && qp->s_rnr_retry_cnt > 0)
qp->s_rnr_retry--;
- /* The last valid PSN is the previous PSN. */
- update_last_psn(qp, psn - 1);
+ /*
+ * The last valid PSN is the previous PSN. For TID RDMA WRITE
+ * request, s_last_psn should be incremented only when a TID
+ * RDMA WRITE RESP is received to avoid skipping lost TID RDMA
+ * WRITE RESP packets.
+ */
+ if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE) {
+ reset_psn(qp, qp->s_last_psn + 1);
+ } else {
+ update_last_psn(qp, psn - 1);
+ reset_psn(qp, psn);
+ }
ibp->rvp.n_rc_resends += delta_psn(qp->s_psn, psn);
-
- reset_psn(qp, psn);
-
qp->s_flags &= ~(RVT_S_WAIT_SSN_CREDIT | RVT_S_WAIT_ACK);
rvt_stop_rc_timers(qp);
rvt_add_rnr_timer(qp, aeth);
@@ -1468,7 +2284,10 @@ static int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode,
ibp->rvp.n_other_naks++;
class_b:
if (qp->s_last == qp->s_acked) {
- rvt_send_complete(qp, wqe, status);
+ if (wqe->wr.opcode == IB_WR_TID_RDMA_READ)
+ hfi1_kern_read_tid_flow_free(qp);
+
+ hfi1_trdma_send_complete(qp, wqe, status);
rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR);
}
break;
@@ -1509,6 +2328,8 @@ static void rdma_seq_err(struct rvt_qp *qp, struct hfi1_ibport *ibp, u32 psn,
while (cmp_psn(psn, wqe->lpsn) > 0) {
if (wqe->wr.opcode == IB_WR_RDMA_READ ||
+ wqe->wr.opcode == IB_WR_TID_RDMA_READ ||
+ wqe->wr.opcode == IB_WR_TID_RDMA_WRITE ||
wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD)
break;
@@ -1715,16 +2536,6 @@ bail:
return;
}
-static inline void rc_defered_ack(struct hfi1_ctxtdata *rcd,
- struct rvt_qp *qp)
-{
- if (list_empty(&qp->rspwait)) {
- qp->r_flags |= RVT_R_RSP_NAK;
- rvt_get_qp(qp);
- list_add_tail(&qp->rspwait, &rcd->qp_wait_list);
- }
-}
-
static inline void rc_cancel_ack(struct rvt_qp *qp)
{
qp->r_adefered = 0;
@@ -1757,8 +2568,9 @@ static noinline int rc_rcv_error(struct ib_other_headers *ohdr, void *data,
struct hfi1_ibport *ibp = rcd_to_iport(rcd);
struct rvt_ack_entry *e;
unsigned long flags;
- u8 i, prev;
- int old_req;
+ u8 prev;
+ u8 mra; /* most recent ACK */
+ bool old_req;
trace_hfi1_rcv_error(qp, psn);
if (diff > 0) {
@@ -1804,29 +2616,8 @@ static noinline int rc_rcv_error(struct ib_other_headers *ohdr, void *data,
spin_lock_irqsave(&qp->s_lock, flags);
- for (i = qp->r_head_ack_queue; ; i = prev) {
- if (i == qp->s_tail_ack_queue)
- old_req = 0;
- if (i)
- prev = i - 1;
- else
- prev = HFI1_MAX_RDMA_ATOMIC;
- if (prev == qp->r_head_ack_queue) {
- e = NULL;
- break;
- }
- e = &qp->s_ack_queue[prev];
- if (!e->opcode) {
- e = NULL;
- break;
- }
- if (cmp_psn(psn, e->psn) >= 0) {
- if (prev == qp->s_tail_ack_queue &&
- cmp_psn(psn, e->lpsn) <= 0)
- old_req = 0;
- break;
- }
- }
+ e = find_prev_entry(qp, psn, &prev, &mra, &old_req);
+
switch (opcode) {
case OP(RDMA_READ_REQUEST): {
struct ib_reth *reth;
@@ -1873,6 +2664,8 @@ static noinline int rc_rcv_error(struct ib_other_headers *ohdr, void *data,
e->psn = psn;
if (old_req)
goto unlock_done;
+ if (qp->s_acked_ack_queue == qp->s_tail_ack_queue)
+ qp->s_acked_ack_queue = prev;
qp->s_tail_ack_queue = prev;
break;
}
@@ -1886,6 +2679,8 @@ static noinline int rc_rcv_error(struct ib_other_headers *ohdr, void *data,
*/
if (!e || e->opcode != (u8)opcode || old_req)
goto unlock_done;
+ if (qp->s_tail_ack_queue == qp->s_acked_ack_queue)
+ qp->s_acked_ack_queue = prev;
qp->s_tail_ack_queue = prev;
break;
}
@@ -1901,7 +2696,7 @@ static noinline int rc_rcv_error(struct ib_other_headers *ohdr, void *data,
* Resend the most recent ACK if this request is
* after all the previous RDMA reads and atomics.
*/
- if (i == qp->r_head_ack_queue) {
+ if (mra == qp->r_head_ack_queue) {
spin_unlock_irqrestore(&qp->s_lock, flags);
qp->r_nak_state = 0;
qp->r_ack_psn = qp->r_psn - 1;
@@ -1912,7 +2707,9 @@ static noinline int rc_rcv_error(struct ib_other_headers *ohdr, void *data,
* Resend the RDMA read or atomic op which
* ACKs this duplicate request.
*/
- qp->s_tail_ack_queue = i;
+ if (qp->s_tail_ack_queue == qp->s_acked_ack_queue)
+ qp->s_acked_ack_queue = mra;
+ qp->s_tail_ack_queue = mra;
break;
}
qp->s_ack_state = OP(ACKNOWLEDGE);
@@ -1929,17 +2726,6 @@ send_ack:
return 0;
}
-static inline void update_ack_queue(struct rvt_qp *qp, unsigned n)
-{
- unsigned next;
-
- next = n + 1;
- if (next > HFI1_MAX_RDMA_ATOMIC)
- next = 0;
- qp->s_tail_ack_queue = next;
- qp->s_ack_state = OP(ACKNOWLEDGE);
-}
-
static void log_cca_event(struct hfi1_pportdata *ppd, u8 sl, u32 rlid,
u32 lqpn, u32 rqpn, u8 svc_type)
{
@@ -2037,6 +2823,7 @@ void hfi1_rc_rcv(struct hfi1_packet *packet)
void *data = packet->payload;
u32 tlen = packet->tlen;
struct rvt_qp *qp = packet->qp;
+ struct hfi1_qp_priv *qpriv = qp->priv;
struct hfi1_ibport *ibp = rcd_to_iport(rcd);
struct ib_other_headers *ohdr = packet->ohdr;
u32 opcode = packet->opcode;
@@ -2049,8 +2836,7 @@ void hfi1_rc_rcv(struct hfi1_packet *packet)
struct ib_reth *reth;
unsigned long flags;
int ret;
- bool is_fecn = false;
- bool copy_last = false;
+ bool copy_last = false, fecn;
u32 rkey;
u8 extra_bytes = pad + packet->extra_byte + (SIZE_OF_CRC << 2);
@@ -2059,7 +2845,8 @@ void hfi1_rc_rcv(struct hfi1_packet *packet)
if (hfi1_ruc_check_hdr(ibp, packet))
return;
- is_fecn = process_ecn(qp, packet, false);
+ fecn = process_ecn(qp, packet);
+ opfn_trigger_conn_request(qp, be32_to_cpu(ohdr->bth[1]));
/*
* Process responses (ACKs) before anything else. Note that the
@@ -2070,8 +2857,6 @@ void hfi1_rc_rcv(struct hfi1_packet *packet)
if (opcode >= OP(RDMA_READ_RESPONSE_FIRST) &&
opcode <= OP(ATOMIC_ACKNOWLEDGE)) {
rc_rcv_resp(packet);
- if (is_fecn)
- goto send_ack;
return;
}
@@ -2293,17 +3078,17 @@ send_last:
if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_READ)))
goto nack_inv;
next = qp->r_head_ack_queue + 1;
- /* s_ack_queue is size HFI1_MAX_RDMA_ATOMIC+1 so use > not >= */
- if (next > HFI1_MAX_RDMA_ATOMIC)
+ /* s_ack_queue is size rvt_size_atomic()+1 so use > not >= */
+ if (next > rvt_size_atomic(ib_to_rvt(qp->ibqp.device)))
next = 0;
spin_lock_irqsave(&qp->s_lock, flags);
- if (unlikely(next == qp->s_tail_ack_queue)) {
+ if (unlikely(next == qp->s_acked_ack_queue)) {
if (!qp->s_ack_queue[next].sent)
goto nack_inv_unlck;
update_ack_queue(qp, next);
}
e = &qp->s_ack_queue[qp->r_head_ack_queue];
- if (e->opcode == OP(RDMA_READ_REQUEST) && e->rdma_sge.mr) {
+ if (e->rdma_sge.mr) {
rvt_put_mr(e->rdma_sge.mr);
e->rdma_sge.mr = NULL;
}
@@ -2344,45 +3129,52 @@ send_last:
qp->r_state = opcode;
qp->r_nak_state = 0;
qp->r_head_ack_queue = next;
+ qpriv->r_tid_alloc = qp->r_head_ack_queue;
/* Schedule the send engine. */
qp->s_flags |= RVT_S_RESP_PENDING;
+ if (fecn)
+ qp->s_flags |= RVT_S_ECN;
hfi1_schedule_send(qp);
spin_unlock_irqrestore(&qp->s_lock, flags);
- if (is_fecn)
- goto send_ack;
return;
}
case OP(COMPARE_SWAP):
case OP(FETCH_ADD): {
- struct ib_atomic_eth *ateth;
+ struct ib_atomic_eth *ateth = &ohdr->u.atomic_eth;
+ u64 vaddr = get_ib_ateth_vaddr(ateth);
+ bool opfn = opcode == OP(COMPARE_SWAP) &&
+ vaddr == HFI1_VERBS_E_ATOMIC_VADDR;
struct rvt_ack_entry *e;
- u64 vaddr;
atomic64_t *maddr;
u64 sdata;
u32 rkey;
u8 next;
- if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC)))
+ if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC) &&
+ !opfn))
goto nack_inv;
next = qp->r_head_ack_queue + 1;
- if (next > HFI1_MAX_RDMA_ATOMIC)
+ if (next > rvt_size_atomic(ib_to_rvt(qp->ibqp.device)))
next = 0;
spin_lock_irqsave(&qp->s_lock, flags);
- if (unlikely(next == qp->s_tail_ack_queue)) {
+ if (unlikely(next == qp->s_acked_ack_queue)) {
if (!qp->s_ack_queue[next].sent)
goto nack_inv_unlck;
update_ack_queue(qp, next);
}
e = &qp->s_ack_queue[qp->r_head_ack_queue];
- if (e->opcode == OP(RDMA_READ_REQUEST) && e->rdma_sge.mr) {
+ if (e->rdma_sge.mr) {
rvt_put_mr(e->rdma_sge.mr);
e->rdma_sge.mr = NULL;
}
- ateth = &ohdr->u.atomic_eth;
- vaddr = get_ib_ateth_vaddr(ateth);
+ /* Process OPFN special virtual address */
+ if (opfn) {
+ opfn_conn_response(qp, e, ateth);
+ goto ack;
+ }
if (unlikely(vaddr & (sizeof(u64) - 1)))
goto nack_inv_unlck;
rkey = be32_to_cpu(ateth->rkey);
@@ -2401,6 +3193,7 @@ send_last:
sdata);
rvt_put_mr(qp->r_sge.sge.mr);
qp->r_sge.num_sge = 0;
+ack:
e->opcode = opcode;
e->sent = 0;
e->psn = psn;
@@ -2410,14 +3203,15 @@ send_last:
qp->r_state = opcode;
qp->r_nak_state = 0;
qp->r_head_ack_queue = next;
+ qpriv->r_tid_alloc = qp->r_head_ack_queue;
/* Schedule the send engine. */
qp->s_flags |= RVT_S_RESP_PENDING;
+ if (fecn)
+ qp->s_flags |= RVT_S_ECN;
hfi1_schedule_send(qp);
spin_unlock_irqrestore(&qp->s_lock, flags);
- if (is_fecn)
- goto send_ack;
return;
}
@@ -2430,16 +3224,9 @@ send_last:
qp->r_ack_psn = psn;
qp->r_nak_state = 0;
/* Send an ACK if requested or required. */
- if (psn & IB_BTH_REQ_ACK) {
- if (packet->numpkt == 0) {
- rc_cancel_ack(qp);
- goto send_ack;
- }
- if (qp->r_adefered >= HFI1_PSN_CREDIT) {
- rc_cancel_ack(qp);
- goto send_ack;
- }
- if (unlikely(is_fecn)) {
+ if (psn & IB_BTH_REQ_ACK || fecn) {
+ if (packet->numpkt == 0 || fecn ||
+ qp->r_adefered >= HFI1_PSN_CREDIT) {
rc_cancel_ack(qp);
goto send_ack;
}
@@ -2480,7 +3267,7 @@ nack_acc:
qp->r_nak_state = IB_NAK_REMOTE_ACCESS_ERROR;
qp->r_ack_psn = qp->r_psn;
send_ack:
- hfi1_send_rc_ack(packet, is_fecn);
+ hfi1_send_rc_ack(packet, fecn);
}
void hfi1_rc_hdrerr(
diff --git a/drivers/infiniband/hw/hfi1/rc.h b/drivers/infiniband/hw/hfi1/rc.h
new file mode 100644
index 000000000000..8e0935b9bf2a
--- /dev/null
+++ b/drivers/infiniband/hw/hfi1/rc.h
@@ -0,0 +1,51 @@
+/* SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) */
+/*
+ * Copyright(c) 2018 Intel Corporation.
+ *
+ */
+
+#ifndef HFI1_RC_H
+#define HFI1_RC_H
+
+/* cut down ridiculously long IB macro names */
+#define OP(x) IB_OPCODE_RC_##x
+
+static inline void update_ack_queue(struct rvt_qp *qp, unsigned int n)
+{
+ unsigned int next;
+
+ next = n + 1;
+ if (next > rvt_size_atomic(ib_to_rvt(qp->ibqp.device)))
+ next = 0;
+ qp->s_tail_ack_queue = next;
+ qp->s_acked_ack_queue = next;
+ qp->s_ack_state = OP(ACKNOWLEDGE);
+}
+
+static inline void rc_defered_ack(struct hfi1_ctxtdata *rcd,
+ struct rvt_qp *qp)
+{
+ if (list_empty(&qp->rspwait)) {
+ qp->r_flags |= RVT_R_RSP_NAK;
+ rvt_get_qp(qp);
+ list_add_tail(&qp->rspwait, &rcd->qp_wait_list);
+ }
+}
+
+static inline u32 restart_sge(struct rvt_sge_state *ss, struct rvt_swqe *wqe,
+ u32 psn, u32 pmtu)
+{
+ u32 len;
+
+ len = delta_psn(psn, wqe->psn) * pmtu;
+ return rvt_restart_sge(ss, wqe, len);
+}
+
+struct rvt_ack_entry *find_prev_entry(struct rvt_qp *qp, u32 psn, u8 *prev,
+ u8 *prev_ack, bool *scheduled);
+int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode, u64 val,
+ struct hfi1_ctxtdata *rcd);
+struct rvt_swqe *do_rc_completion(struct rvt_qp *qp, struct rvt_swqe *wqe,
+ struct hfi1_ibport *ibp);
+
+#endif /* HFI1_RC_H */
diff --git a/drivers/infiniband/hw/hfi1/ruc.c b/drivers/infiniband/hw/hfi1/ruc.c
index 7fb317c711df..124a3ec1e15c 100644
--- a/drivers/infiniband/hw/hfi1/ruc.c
+++ b/drivers/infiniband/hw/hfi1/ruc.c
@@ -250,7 +250,6 @@ static inline void hfi1_make_ruc_bth(struct rvt_qp *qp,
struct ib_other_headers *ohdr,
u32 bth0, u32 bth1, u32 bth2)
{
- bth1 |= qp->remote_qpn;
ohdr->bth[0] = cpu_to_be32(bth0);
ohdr->bth[1] = cpu_to_be32(bth1);
ohdr->bth[2] = cpu_to_be32(bth2);
@@ -272,13 +271,13 @@ static inline void hfi1_make_ruc_bth(struct rvt_qp *qp,
*/
static inline void hfi1_make_ruc_header_16B(struct rvt_qp *qp,
struct ib_other_headers *ohdr,
- u32 bth0, u32 bth2, int middle,
+ u32 bth0, u32 bth1, u32 bth2,
+ int middle,
struct hfi1_pkt_state *ps)
{
struct hfi1_qp_priv *priv = qp->priv;
struct hfi1_ibport *ibp = ps->ibp;
struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
- u32 bth1 = 0;
u32 slid;
u16 pkey = hfi1_get_pkey(ibp, qp->s_pkey_index);
u8 l4 = OPA_16B_L4_IB_LOCAL;
@@ -360,12 +359,12 @@ static inline void hfi1_make_ruc_header_16B(struct rvt_qp *qp,
*/
static inline void hfi1_make_ruc_header_9B(struct rvt_qp *qp,
struct ib_other_headers *ohdr,
- u32 bth0, u32 bth2, int middle,
+ u32 bth0, u32 bth1, u32 bth2,
+ int middle,
struct hfi1_pkt_state *ps)
{
struct hfi1_qp_priv *priv = qp->priv;
struct hfi1_ibport *ibp = ps->ibp;
- u32 bth1 = 0;
u16 pkey = hfi1_get_pkey(ibp, qp->s_pkey_index);
u16 lrh0 = HFI1_LRH_BTH;
u8 extra_bytes = -ps->s_txreq->s_cur_size & 3;
@@ -415,7 +414,7 @@ static inline void hfi1_make_ruc_header_9B(struct rvt_qp *qp,
typedef void (*hfi1_make_ruc_hdr)(struct rvt_qp *qp,
struct ib_other_headers *ohdr,
- u32 bth0, u32 bth2, int middle,
+ u32 bth0, u32 bth1, u32 bth2, int middle,
struct hfi1_pkt_state *ps);
/* We support only two types - 9B and 16B for now */
@@ -425,7 +424,7 @@ static const hfi1_make_ruc_hdr hfi1_ruc_header_tbl[2] = {
};
void hfi1_make_ruc_header(struct rvt_qp *qp, struct ib_other_headers *ohdr,
- u32 bth0, u32 bth2, int middle,
+ u32 bth0, u32 bth1, u32 bth2, int middle,
struct hfi1_pkt_state *ps)
{
struct hfi1_qp_priv *priv = qp->priv;
@@ -446,18 +445,21 @@ void hfi1_make_ruc_header(struct rvt_qp *qp, struct ib_other_headers *ohdr,
priv->s_ahg->ahgidx = 0;
/* Make the appropriate header */
- hfi1_ruc_header_tbl[priv->hdr_type](qp, ohdr, bth0, bth2, middle, ps);
+ hfi1_ruc_header_tbl[priv->hdr_type](qp, ohdr, bth0, bth1, bth2, middle,
+ ps);
}
/* when sending, force a reschedule every one of these periods */
#define SEND_RESCHED_TIMEOUT (5 * HZ) /* 5s in jiffies */
/**
- * schedule_send_yield - test for a yield required for QP send engine
+ * hfi1_schedule_send_yield - test for a yield required for QP
+ * send engine
* @timeout: Final time for timeout slice for jiffies
* @qp: a pointer to QP
* @ps: a pointer to a structure with commonly lookup values for
* the send engine progress
+ * @tid: true if it is the tid leg
*
* This routine checks if the time slice for the QP has expired
* for RC QPs, if so an additional work entry is queued. At this
@@ -465,8 +467,8 @@ void hfi1_make_ruc_header(struct rvt_qp *qp, struct ib_other_headers *ohdr,
* returns true if a yield is required, otherwise, false
* is returned.
*/
-static bool schedule_send_yield(struct rvt_qp *qp,
- struct hfi1_pkt_state *ps)
+bool hfi1_schedule_send_yield(struct rvt_qp *qp, struct hfi1_pkt_state *ps,
+ bool tid)
{
ps->pkts_sent = true;
@@ -474,8 +476,24 @@ static bool schedule_send_yield(struct rvt_qp *qp,
if (!ps->in_thread ||
workqueue_congested(ps->cpu, ps->ppd->hfi1_wq)) {
spin_lock_irqsave(&qp->s_lock, ps->flags);
- qp->s_flags &= ~RVT_S_BUSY;
- hfi1_schedule_send(qp);
+ if (!tid) {
+ qp->s_flags &= ~RVT_S_BUSY;
+ hfi1_schedule_send(qp);
+ } else {
+ struct hfi1_qp_priv *priv = qp->priv;
+
+ if (priv->s_flags &
+ HFI1_S_TID_BUSY_SET) {
+ qp->s_flags &= ~RVT_S_BUSY;
+ priv->s_flags &=
+ ~(HFI1_S_TID_BUSY_SET |
+ RVT_S_BUSY);
+ } else {
+ priv->s_flags &= ~RVT_S_BUSY;
+ }
+ hfi1_schedule_tid_send(qp);
+ }
+
spin_unlock_irqrestore(&qp->s_lock, ps->flags);
this_cpu_inc(*ps->ppd->dd->send_schedule);
trace_hfi1_rc_expired_time_slice(qp, true);
@@ -576,6 +594,8 @@ void hfi1_do_send(struct rvt_qp *qp, bool in_thread)
do {
/* Check for a constructed packet to be sent. */
if (ps.s_txreq) {
+ if (priv->s_flags & HFI1_S_TID_BUSY_SET)
+ qp->s_flags |= RVT_S_BUSY;
spin_unlock_irqrestore(&qp->s_lock, ps.flags);
/*
* If the packet cannot be sent now, return and
@@ -585,7 +605,7 @@ void hfi1_do_send(struct rvt_qp *qp, bool in_thread)
return;
/* allow other tasks to run */
- if (schedule_send_yield(qp, &ps))
+ if (hfi1_schedule_send_yield(qp, &ps, false))
return;
spin_lock_irqsave(&qp->s_lock, ps.flags);
diff --git a/drivers/infiniband/hw/hfi1/sdma.c b/drivers/infiniband/hw/hfi1/sdma.c
index 891d2386d1ca..b0110728f541 100644
--- a/drivers/infiniband/hw/hfi1/sdma.c
+++ b/drivers/infiniband/hw/hfi1/sdma.c
@@ -1424,6 +1424,7 @@ int sdma_init(struct hfi1_devdata *dd, u8 port)
seqlock_init(&sde->head_lock);
spin_lock_init(&sde->senddmactrl_lock);
spin_lock_init(&sde->flushlist_lock);
+ seqlock_init(&sde->waitlock);
/* ensure there is always a zero bit */
sde->ahg_bits = 0xfffffffe00000000ULL;
@@ -1452,12 +1453,9 @@ int sdma_init(struct hfi1_devdata *dd, u8 port)
timer_setup(&sde->err_progress_check_timer,
sdma_err_progress_check, 0);
- sde->descq = dma_zalloc_coherent(
- &dd->pcidev->dev,
- descq_cnt * sizeof(u64[2]),
- &sde->descq_phys,
- GFP_KERNEL
- );
+ sde->descq = dma_alloc_coherent(&dd->pcidev->dev,
+ descq_cnt * sizeof(u64[2]),
+ &sde->descq_phys, GFP_KERNEL);
if (!sde->descq)
goto bail;
sde->tx_ring =
@@ -1470,24 +1468,18 @@ int sdma_init(struct hfi1_devdata *dd, u8 port)
dd->sdma_heads_size = L1_CACHE_BYTES * num_engines;
/* Allocate memory for DMA of head registers to memory */
- dd->sdma_heads_dma = dma_zalloc_coherent(
- &dd->pcidev->dev,
- dd->sdma_heads_size,
- &dd->sdma_heads_phys,
- GFP_KERNEL
- );
+ dd->sdma_heads_dma = dma_alloc_coherent(&dd->pcidev->dev,
+ dd->sdma_heads_size,
+ &dd->sdma_heads_phys,
+ GFP_KERNEL);
if (!dd->sdma_heads_dma) {
dd_dev_err(dd, "failed to allocate SendDMA head memory\n");
goto bail;
}
/* Allocate memory for pad */
- dd->sdma_pad_dma = dma_zalloc_coherent(
- &dd->pcidev->dev,
- sizeof(u32),
- &dd->sdma_pad_phys,
- GFP_KERNEL
- );
+ dd->sdma_pad_dma = dma_alloc_coherent(&dd->pcidev->dev, sizeof(u32),
+ &dd->sdma_pad_phys, GFP_KERNEL);
if (!dd->sdma_pad_dma) {
dd_dev_err(dd, "failed to allocate SendDMA pad memory\n");
goto bail;
@@ -1755,11 +1747,9 @@ retry:
*/
static void sdma_desc_avail(struct sdma_engine *sde, uint avail)
{
- struct iowait *wait, *nw;
+ struct iowait *wait, *nw, *twait;
struct iowait *waits[SDMA_WAIT_BATCH_SIZE];
- uint i, n = 0, seq, max_idx = 0;
- struct hfi1_ibdev *dev = &sde->dd->verbs_dev;
- u8 max_starved_cnt = 0;
+ uint i, n = 0, seq, tidx = 0;
#ifdef CONFIG_SDMA_VERBOSITY
dd_dev_err(sde->dd, "CONFIG SDMA(%u) %s:%d %s()\n", sde->this_idx,
@@ -1768,10 +1758,10 @@ static void sdma_desc_avail(struct sdma_engine *sde, uint avail)
#endif
do {
- seq = read_seqbegin(&dev->iowait_lock);
+ seq = read_seqbegin(&sde->waitlock);
if (!list_empty(&sde->dmawait)) {
/* at least one item */
- write_seqlock(&dev->iowait_lock);
+ write_seqlock(&sde->waitlock);
/* Harvest waiters wanting DMA descriptors */
list_for_each_entry_safe(
wait,
@@ -1784,27 +1774,34 @@ static void sdma_desc_avail(struct sdma_engine *sde, uint avail)
continue;
if (n == ARRAY_SIZE(waits))
break;
+ iowait_init_priority(wait);
num_desc = iowait_get_all_desc(wait);
if (num_desc > avail)
break;
avail -= num_desc;
- /* Find the most starved wait memeber */
- iowait_starve_find_max(wait, &max_starved_cnt,
- n, &max_idx);
+ /* Find the top-priority wait member */
+ if (n) {
+ twait = waits[tidx];
+ tidx =
+ iowait_priority_update_top(wait,
+ twait,
+ n,
+ tidx);
+ }
list_del_init(&wait->list);
waits[n++] = wait;
}
- write_sequnlock(&dev->iowait_lock);
+ write_sequnlock(&sde->waitlock);
break;
}
- } while (read_seqretry(&dev->iowait_lock, seq));
+ } while (read_seqretry(&sde->waitlock, seq));
- /* Schedule the most starved one first */
+ /* Schedule the top-priority entry first */
if (n)
- waits[max_idx]->wakeup(waits[max_idx], SDMA_AVAIL_REASON);
+ waits[tidx]->wakeup(waits[tidx], SDMA_AVAIL_REASON);
for (i = 0; i < n; i++)
- if (i != max_idx)
+ if (i != tidx)
waits[i]->wakeup(waits[i], SDMA_AVAIL_REASON);
}
diff --git a/drivers/infiniband/hw/hfi1/sdma.h b/drivers/infiniband/hw/hfi1/sdma.h
index 6dc63d7c5685..1e2e40f79cb2 100644
--- a/drivers/infiniband/hw/hfi1/sdma.h
+++ b/drivers/infiniband/hw/hfi1/sdma.h
@@ -382,6 +382,7 @@ struct sdma_engine {
u64 progress_int_cnt;
/* private: */
+ seqlock_t waitlock;
struct list_head dmawait;
/* CONFIG SDMA for now, just blindly duplicate */
diff --git a/drivers/infiniband/hw/hfi1/sdma_txreq.h b/drivers/infiniband/hw/hfi1/sdma_txreq.h
index bf7d777d756e..514a4784566b 100644
--- a/drivers/infiniband/hw/hfi1/sdma_txreq.h
+++ b/drivers/infiniband/hw/hfi1/sdma_txreq.h
@@ -91,6 +91,7 @@ struct sdma_desc {
#define SDMA_TXREQ_F_URGENT 0x0001
#define SDMA_TXREQ_F_AHG_COPY 0x0002
#define SDMA_TXREQ_F_USE_AHG 0x0004
+#define SDMA_TXREQ_F_VIP 0x0010
struct sdma_txreq;
typedef void (*callback_t)(struct sdma_txreq *, int);
diff --git a/drivers/infiniband/hw/hfi1/sysfs.c b/drivers/infiniband/hw/hfi1/sysfs.c
index 2be513d4c9da..90f62c4bddba 100644
--- a/drivers/infiniband/hw/hfi1/sysfs.c
+++ b/drivers/infiniband/hw/hfi1/sysfs.c
@@ -498,7 +498,7 @@ static ssize_t hw_rev_show(struct device *device, struct device_attribute *attr,
char *buf)
{
struct hfi1_ibdev *dev =
- container_of(device, struct hfi1_ibdev, rdi.ibdev.dev);
+ rdma_device_to_drv_device(device, struct hfi1_ibdev, rdi.ibdev);
return sprintf(buf, "%x\n", dd_from_dev(dev)->minrev);
}
@@ -508,7 +508,7 @@ static ssize_t board_id_show(struct device *device,
struct device_attribute *attr, char *buf)
{
struct hfi1_ibdev *dev =
- container_of(device, struct hfi1_ibdev, rdi.ibdev.dev);
+ rdma_device_to_drv_device(device, struct hfi1_ibdev, rdi.ibdev);
struct hfi1_devdata *dd = dd_from_dev(dev);
int ret;
@@ -524,7 +524,7 @@ static ssize_t boardversion_show(struct device *device,
struct device_attribute *attr, char *buf)
{
struct hfi1_ibdev *dev =
- container_of(device, struct hfi1_ibdev, rdi.ibdev.dev);
+ rdma_device_to_drv_device(device, struct hfi1_ibdev, rdi.ibdev);
struct hfi1_devdata *dd = dd_from_dev(dev);
/* The string printed here is already newline-terminated. */
@@ -536,7 +536,7 @@ static ssize_t nctxts_show(struct device *device,
struct device_attribute *attr, char *buf)
{
struct hfi1_ibdev *dev =
- container_of(device, struct hfi1_ibdev, rdi.ibdev.dev);
+ rdma_device_to_drv_device(device, struct hfi1_ibdev, rdi.ibdev);
struct hfi1_devdata *dd = dd_from_dev(dev);
/*
@@ -555,7 +555,7 @@ static ssize_t nfreectxts_show(struct device *device,
struct device_attribute *attr, char *buf)
{
struct hfi1_ibdev *dev =
- container_of(device, struct hfi1_ibdev, rdi.ibdev.dev);
+ rdma_device_to_drv_device(device, struct hfi1_ibdev, rdi.ibdev);
struct hfi1_devdata *dd = dd_from_dev(dev);
/* Return the number of free user ports (contexts) available. */
@@ -567,7 +567,7 @@ static ssize_t serial_show(struct device *device,
struct device_attribute *attr, char *buf)
{
struct hfi1_ibdev *dev =
- container_of(device, struct hfi1_ibdev, rdi.ibdev.dev);
+ rdma_device_to_drv_device(device, struct hfi1_ibdev, rdi.ibdev);
struct hfi1_devdata *dd = dd_from_dev(dev);
return scnprintf(buf, PAGE_SIZE, "%s", dd->serial);
@@ -579,7 +579,7 @@ static ssize_t chip_reset_store(struct device *device,
size_t count)
{
struct hfi1_ibdev *dev =
- container_of(device, struct hfi1_ibdev, rdi.ibdev.dev);
+ rdma_device_to_drv_device(device, struct hfi1_ibdev, rdi.ibdev);
struct hfi1_devdata *dd = dd_from_dev(dev);
int ret;
@@ -609,7 +609,7 @@ static ssize_t tempsense_show(struct device *device,
struct device_attribute *attr, char *buf)
{
struct hfi1_ibdev *dev =
- container_of(device, struct hfi1_ibdev, rdi.ibdev.dev);
+ rdma_device_to_drv_device(device, struct hfi1_ibdev, rdi.ibdev);
struct hfi1_devdata *dd = dd_from_dev(dev);
struct hfi1_temp temp;
int ret;
diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.c b/drivers/infiniband/hw/hfi1/tid_rdma.c
new file mode 100644
index 000000000000..43cbce7a19ea
--- /dev/null
+++ b/drivers/infiniband/hw/hfi1/tid_rdma.c
@@ -0,0 +1,5451 @@
+// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause)
+/*
+ * Copyright(c) 2018 Intel Corporation.
+ *
+ */
+
+#include "hfi.h"
+#include "qp.h"
+#include "rc.h"
+#include "verbs.h"
+#include "tid_rdma.h"
+#include "exp_rcv.h"
+#include "trace.h"
+
+/**
+ * DOC: TID RDMA READ protocol
+ *
+ * This is an end-to-end protocol at the hfi1 level between two nodes that
+ * improves performance by avoiding data copy on the requester side. It
+ * converts a qualified RDMA READ request into a TID RDMA READ request on
+ * the requester side and thereafter handles the request and response
+ * differently. To be qualified, the RDMA READ request should meet the
+ * following:
+ * -- The total data length should be greater than 256K;
+ * -- The total data length should be a multiple of 4K page size;
+ * -- Each local scatter-gather entry should be 4K page aligned;
+ * -- Each local scatter-gather entry should be a multiple of 4K page size;
+ */
+
+#define RCV_TID_FLOW_TABLE_CTRL_FLOW_VALID_SMASK BIT_ULL(32)
+#define RCV_TID_FLOW_TABLE_CTRL_HDR_SUPP_EN_SMASK BIT_ULL(33)
+#define RCV_TID_FLOW_TABLE_CTRL_KEEP_AFTER_SEQ_ERR_SMASK BIT_ULL(34)
+#define RCV_TID_FLOW_TABLE_CTRL_KEEP_ON_GEN_ERR_SMASK BIT_ULL(35)
+#define RCV_TID_FLOW_TABLE_STATUS_SEQ_MISMATCH_SMASK BIT_ULL(37)
+#define RCV_TID_FLOW_TABLE_STATUS_GEN_MISMATCH_SMASK BIT_ULL(38)
+
+/* Maximum number of packets within a flow generation. */
+#define MAX_TID_FLOW_PSN BIT(HFI1_KDETH_BTH_SEQ_SHIFT)
+
+#define GENERATION_MASK 0xFFFFF
+
+static u32 mask_generation(u32 a)
+{
+ return a & GENERATION_MASK;
+}
+
+/* Reserved generation value to set to unused flows for kernel contexts */
+#define KERN_GENERATION_RESERVED mask_generation(U32_MAX)
+
+/*
+ * J_KEY for kernel contexts when TID RDMA is used.
+ * See generate_jkey() in hfi.h for more information.
+ */
+#define TID_RDMA_JKEY 32
+#define HFI1_KERNEL_MIN_JKEY HFI1_ADMIN_JKEY_RANGE
+#define HFI1_KERNEL_MAX_JKEY (2 * HFI1_ADMIN_JKEY_RANGE - 1)
+
+/* Maximum number of segments in flight per QP request. */
+#define TID_RDMA_MAX_READ_SEGS_PER_REQ 6
+#define TID_RDMA_MAX_WRITE_SEGS_PER_REQ 4
+#define MAX_REQ max_t(u16, TID_RDMA_MAX_READ_SEGS_PER_REQ, \
+ TID_RDMA_MAX_WRITE_SEGS_PER_REQ)
+#define MAX_FLOWS roundup_pow_of_two(MAX_REQ + 1)
+
+#define MAX_EXPECTED_PAGES (MAX_EXPECTED_BUFFER / PAGE_SIZE)
+
+#define TID_RDMA_DESTQP_FLOW_SHIFT 11
+#define TID_RDMA_DESTQP_FLOW_MASK 0x1f
+
+#define TID_FLOW_SW_PSN BIT(0)
+
+#define TID_OPFN_QP_CTXT_MASK 0xff
+#define TID_OPFN_QP_CTXT_SHIFT 56
+#define TID_OPFN_QP_KDETH_MASK 0xff
+#define TID_OPFN_QP_KDETH_SHIFT 48
+#define TID_OPFN_MAX_LEN_MASK 0x7ff
+#define TID_OPFN_MAX_LEN_SHIFT 37
+#define TID_OPFN_TIMEOUT_MASK 0x1f
+#define TID_OPFN_TIMEOUT_SHIFT 32
+#define TID_OPFN_RESERVED_MASK 0x3f
+#define TID_OPFN_RESERVED_SHIFT 26
+#define TID_OPFN_URG_MASK 0x1
+#define TID_OPFN_URG_SHIFT 25
+#define TID_OPFN_VER_MASK 0x7
+#define TID_OPFN_VER_SHIFT 22
+#define TID_OPFN_JKEY_MASK 0x3f
+#define TID_OPFN_JKEY_SHIFT 16
+#define TID_OPFN_MAX_READ_MASK 0x3f
+#define TID_OPFN_MAX_READ_SHIFT 10
+#define TID_OPFN_MAX_WRITE_MASK 0x3f
+#define TID_OPFN_MAX_WRITE_SHIFT 4
+
+/*
+ * OPFN TID layout
+ *
+ * 63               47               31               15
+ * NNNNNNNNKKKKKKKK MMMMMMMMMMMTTTTT DDDDDDUVVVJJJJJJ RRRRRRWWWWWWCCCC
+ * 3210987654321098 7654321098765432 1098765432109876 5432109876543210
+ * N - the context Number
+ * K - the Kdeth_qp
+ * M - Max_len
+ * T - Timeout
+ * D - reserveD
+ * V - version
+ * U - Urg capable
+ * J - Jkey
+ * R - max_Read
+ * W - max_Write
+ * C - Capcode
+ */
+
+static u32 tid_rdma_flow_wt;
+
+static void tid_rdma_trigger_resume(struct work_struct *work);
+static void hfi1_kern_exp_rcv_free_flows(struct tid_rdma_request *req);
+static int hfi1_kern_exp_rcv_alloc_flows(struct tid_rdma_request *req,
+ gfp_t gfp);
+static void hfi1_init_trdma_req(struct rvt_qp *qp,
+ struct tid_rdma_request *req);
+static void hfi1_tid_write_alloc_resources(struct rvt_qp *qp, bool intr_ctx);
+static void hfi1_tid_timeout(struct timer_list *t);
+static void hfi1_add_tid_reap_timer(struct rvt_qp *qp);
+static void hfi1_mod_tid_reap_timer(struct rvt_qp *qp);
+static void hfi1_mod_tid_retry_timer(struct rvt_qp *qp);
+static int hfi1_stop_tid_retry_timer(struct rvt_qp *qp);
+static void hfi1_tid_retry_timeout(struct timer_list *t);
+static int make_tid_rdma_ack(struct rvt_qp *qp,
+ struct ib_other_headers *ohdr,
+ struct hfi1_pkt_state *ps);
+static void hfi1_do_tid_send(struct rvt_qp *qp);
+
+static u64 tid_rdma_opfn_encode(struct tid_rdma_params *p)
+{
+ return
+ (((u64)p->qp & TID_OPFN_QP_CTXT_MASK) <<
+ TID_OPFN_QP_CTXT_SHIFT) |
+ ((((u64)p->qp >> 16) & TID_OPFN_QP_KDETH_MASK) <<
+ TID_OPFN_QP_KDETH_SHIFT) |
+ (((u64)((p->max_len >> PAGE_SHIFT) - 1) &
+ TID_OPFN_MAX_LEN_MASK) << TID_OPFN_MAX_LEN_SHIFT) |
+ (((u64)p->timeout & TID_OPFN_TIMEOUT_MASK) <<
+ TID_OPFN_TIMEOUT_SHIFT) |
+ (((u64)p->urg & TID_OPFN_URG_MASK) << TID_OPFN_URG_SHIFT) |
+ (((u64)p->jkey & TID_OPFN_JKEY_MASK) << TID_OPFN_JKEY_SHIFT) |
+ (((u64)p->max_read & TID_OPFN_MAX_READ_MASK) <<
+ TID_OPFN_MAX_READ_SHIFT) |
+ (((u64)p->max_write & TID_OPFN_MAX_WRITE_MASK) <<
+ TID_OPFN_MAX_WRITE_SHIFT);
+}
+
+static void tid_rdma_opfn_decode(struct tid_rdma_params *p, u64 data)
+{
+ p->max_len = (((data >> TID_OPFN_MAX_LEN_SHIFT) &
+ TID_OPFN_MAX_LEN_MASK) + 1) << PAGE_SHIFT;
+ p->jkey = (data >> TID_OPFN_JKEY_SHIFT) & TID_OPFN_JKEY_MASK;
+ p->max_write = (data >> TID_OPFN_MAX_WRITE_SHIFT) &
+ TID_OPFN_MAX_WRITE_MASK;
+ p->max_read = (data >> TID_OPFN_MAX_READ_SHIFT) &
+ TID_OPFN_MAX_READ_MASK;
+ p->qp =
+ ((((data >> TID_OPFN_QP_KDETH_SHIFT) & TID_OPFN_QP_KDETH_MASK)
+ << 16) |
+ ((data >> TID_OPFN_QP_CTXT_SHIFT) & TID_OPFN_QP_CTXT_MASK));
+ p->urg = (data >> TID_OPFN_URG_SHIFT) & TID_OPFN_URG_MASK;
+ p->timeout = (data >> TID_OPFN_TIMEOUT_SHIFT) & TID_OPFN_TIMEOUT_MASK;
+}
+
+void tid_rdma_opfn_init(struct rvt_qp *qp, struct tid_rdma_params *p)
+{
+ struct hfi1_qp_priv *priv = qp->priv;
+
+ p->qp = (kdeth_qp << 16) | priv->rcd->ctxt;
+ p->max_len = TID_RDMA_MAX_SEGMENT_SIZE;
+ p->jkey = priv->rcd->jkey;
+ p->max_read = TID_RDMA_MAX_READ_SEGS_PER_REQ;
+ p->max_write = TID_RDMA_MAX_WRITE_SEGS_PER_REQ;
+ p->timeout = qp->timeout;
+ p->urg = is_urg_masked(priv->rcd);
+}
+
+bool tid_rdma_conn_req(struct rvt_qp *qp, u64 *data)
+{
+ struct hfi1_qp_priv *priv = qp->priv;
+
+ *data = tid_rdma_opfn_encode(&priv->tid_rdma.local);
+ return true;
+}
+
+bool tid_rdma_conn_reply(struct rvt_qp *qp, u64 data)
+{
+ struct hfi1_qp_priv *priv = qp->priv;
+ struct tid_rdma_params *remote, *old;
+ bool ret = true;
+
+ old = rcu_dereference_protected(priv->tid_rdma.remote,
+ lockdep_is_held(&priv->opfn.lock));
+ data &= ~0xfULL;
+ /*
+ * If data passed in is zero, return true so as not to continue the
+ * negotiation process
+ */
+ if (!data || !HFI1_CAP_IS_KSET(TID_RDMA))
+ goto null;
+ /*
+ * If kzalloc fails, return false. This will result in:
+ * * at the requester a new OPFN request being generated to retry
+ * the negotiation
+ * * at the responder, 0 being returned to the requester so as to
+ * disable TID RDMA at both the requester and the responder
+ */
+ remote = kzalloc(sizeof(*remote), GFP_ATOMIC);
+ if (!remote) {
+ ret = false;
+ goto null;
+ }
+
+ tid_rdma_opfn_decode(remote, data);
+ priv->tid_timer_timeout_jiffies =
+ usecs_to_jiffies((((4096UL * (1UL << remote->timeout)) /
+ 1000UL) << 3) * 7);
+ trace_hfi1_opfn_param(qp, 0, &priv->tid_rdma.local);
+ trace_hfi1_opfn_param(qp, 1, remote);
+ rcu_assign_pointer(priv->tid_rdma.remote, remote);
+ /*
+ * A TID RDMA READ request's segment size is not equal to
+ * remote->max_len only when the request's data length is smaller
+ * than remote->max_len. In that case, there will be only one segment.
+ * Therefore, when priv->pkts_ps is used to calculate req->cur_seg
+ * during retry, it will lead to req->cur_seg = 0, which is exactly
+ * what is expected.
+ */
+ priv->pkts_ps = (u16)rvt_div_mtu(qp, remote->max_len);
+ priv->timeout_shift = ilog2(priv->pkts_ps - 1) + 1;
+ goto free;
+null:
+ RCU_INIT_POINTER(priv->tid_rdma.remote, NULL);
+ priv->timeout_shift = 0;
+free:
+ if (old)
+ kfree_rcu(old, rcu_head);
+ return ret;
+}
+
+bool tid_rdma_conn_resp(struct rvt_qp *qp, u64 *data)
+{
+ bool ret;
+
+ ret = tid_rdma_conn_reply(qp, *data);
+ *data = 0;
+ /*
+ * If tid_rdma_conn_reply() returns error, set *data as 0 to indicate
+ * TID RDMA could not be enabled. This will result in TID RDMA being
+ * disabled at the requester too.
+ */
+ if (ret)
+ (void)tid_rdma_conn_req(qp, data);
+ return ret;
+}
+
+void tid_rdma_conn_error(struct rvt_qp *qp)
+{
+ struct hfi1_qp_priv *priv = qp->priv;
+ struct tid_rdma_params *old;
+
+ old = rcu_dereference_protected(priv->tid_rdma.remote,
+ lockdep_is_held(&priv->opfn.lock));
+ RCU_INIT_POINTER(priv->tid_rdma.remote, NULL);
+ if (old)
+ kfree_rcu(old, rcu_head);
+}
+
+/* This is called at context initialization time */
+int hfi1_kern_exp_rcv_init(struct hfi1_ctxtdata *rcd, int reinit)
+{
+ if (reinit)
+ return 0;
+
+ BUILD_BUG_ON(TID_RDMA_JKEY < HFI1_KERNEL_MIN_JKEY);
+ BUILD_BUG_ON(TID_RDMA_JKEY > HFI1_KERNEL_MAX_JKEY);
+ rcd->jkey = TID_RDMA_JKEY;
+ hfi1_set_ctxt_jkey(rcd->dd, rcd, rcd->jkey);
+ return hfi1_alloc_ctxt_rcv_groups(rcd);
+}
+
+/**
+ * qp_to_rcd - determine the receive context used by a qp
+ * @rdi: the rvt device info embedded in the hfi1_ibdev
+ * @qp: the qp
+ *
+ * This routine returns the receive context associated
+ * with a qp's qpn.
+ *
+ * Returns the context.
+ */
+static struct hfi1_ctxtdata *qp_to_rcd(struct rvt_dev_info *rdi,
+ struct rvt_qp *qp)
+{
+ struct hfi1_ibdev *verbs_dev = container_of(rdi,
+ struct hfi1_ibdev,
+ rdi);
+ struct hfi1_devdata *dd = container_of(verbs_dev,
+ struct hfi1_devdata,
+ verbs_dev);
+ unsigned int ctxt;
+
+ if (qp->ibqp.qp_num == 0)
+ ctxt = 0;
+ else
+ ctxt = ((qp->ibqp.qp_num >> dd->qos_shift) %
+ (dd->n_krcv_queues - 1)) + 1;
+
+ return dd->rcd[ctxt];
+}
+
+int hfi1_qp_priv_init(struct rvt_dev_info *rdi, struct rvt_qp *qp,
+ struct ib_qp_init_attr *init_attr)
+{
+ struct hfi1_qp_priv *qpriv = qp->priv;
+ int i, ret;
+
+ qpriv->rcd = qp_to_rcd(rdi, qp);
+
+ spin_lock_init(&qpriv->opfn.lock);
+ INIT_WORK(&qpriv->opfn.opfn_work, opfn_send_conn_request);
+ INIT_WORK(&qpriv->tid_rdma.trigger_work, tid_rdma_trigger_resume);
+ qpriv->flow_state.psn = 0;
+ qpriv->flow_state.index = RXE_NUM_TID_FLOWS;
+ qpriv->flow_state.last_index = RXE_NUM_TID_FLOWS;
+ qpriv->flow_state.generation = KERN_GENERATION_RESERVED;
+ qpriv->s_state = TID_OP(WRITE_RESP);
+ qpriv->s_tid_cur = HFI1_QP_WQE_INVALID;
+ qpriv->s_tid_head = HFI1_QP_WQE_INVALID;
+ qpriv->s_tid_tail = HFI1_QP_WQE_INVALID;
+ qpriv->rnr_nak_state = TID_RNR_NAK_INIT;
+ qpriv->r_tid_head = HFI1_QP_WQE_INVALID;
+ qpriv->r_tid_tail = HFI1_QP_WQE_INVALID;
+ qpriv->r_tid_ack = HFI1_QP_WQE_INVALID;
+ qpriv->r_tid_alloc = HFI1_QP_WQE_INVALID;
+ atomic_set(&qpriv->n_requests, 0);
+ atomic_set(&qpriv->n_tid_requests, 0);
+ timer_setup(&qpriv->s_tid_timer, hfi1_tid_timeout, 0);
+ timer_setup(&qpriv->s_tid_retry_timer, hfi1_tid_retry_timeout, 0);
+ INIT_LIST_HEAD(&qpriv->tid_wait);
+
+ if (init_attr->qp_type == IB_QPT_RC && HFI1_CAP_IS_KSET(TID_RDMA)) {
+ struct hfi1_devdata *dd = qpriv->rcd->dd;
+
+ qpriv->pages = kzalloc_node(TID_RDMA_MAX_PAGES *
+ sizeof(*qpriv->pages),
+ GFP_KERNEL, dd->node);
+ if (!qpriv->pages)
+ return -ENOMEM;
+ for (i = 0; i < qp->s_size; i++) {
+ struct hfi1_swqe_priv *priv;
+ struct rvt_swqe *wqe = rvt_get_swqe_ptr(qp, i);
+
+ priv = kzalloc_node(sizeof(*priv), GFP_KERNEL,
+ dd->node);
+ if (!priv)
+ return -ENOMEM;
+
+ hfi1_init_trdma_req(qp, &priv->tid_req);
+ priv->tid_req.e.swqe = wqe;
+ wqe->priv = priv;
+ }
+ for (i = 0; i < rvt_max_atomic(rdi); i++) {
+ struct hfi1_ack_priv *priv;
+
+ priv = kzalloc_node(sizeof(*priv), GFP_KERNEL,
+ dd->node);
+ if (!priv)
+ return -ENOMEM;
+
+ hfi1_init_trdma_req(qp, &priv->tid_req);
+ priv->tid_req.e.ack = &qp->s_ack_queue[i];
+
+ ret = hfi1_kern_exp_rcv_alloc_flows(&priv->tid_req,
+ GFP_KERNEL);
+ if (ret) {
+ kfree(priv);
+ return ret;
+ }
+ qp->s_ack_queue[i].priv = priv;
+ }
+ }
+
+ return 0;
+}
+
+void hfi1_qp_priv_tid_free(struct rvt_dev_info *rdi, struct rvt_qp *qp)
+{
+ struct hfi1_qp_priv *qpriv = qp->priv;
+ struct rvt_swqe *wqe;
+ u32 i;
+
+ if (qp->ibqp.qp_type == IB_QPT_RC && HFI1_CAP_IS_KSET(TID_RDMA)) {
+ for (i = 0; i < qp->s_size; i++) {
+ wqe = rvt_get_swqe_ptr(qp, i);
+ kfree(wqe->priv);
+ wqe->priv = NULL;
+ }
+ for (i = 0; i < rvt_max_atomic(rdi); i++) {
+ struct hfi1_ack_priv *priv = qp->s_ack_queue[i].priv;
+
+ if (priv)
+ hfi1_kern_exp_rcv_free_flows(&priv->tid_req);
+ kfree(priv);
+ qp->s_ack_queue[i].priv = NULL;
+ }
+ cancel_work_sync(&qpriv->opfn.opfn_work);
+ kfree(qpriv->pages);
+ qpriv->pages = NULL;
+ }
+}
+
+/* Flow and tid waiter functions */
+/**
+ * DOC: lock ordering
+ *
+ * There are two locks involved with the queuing
+ * routines: the qp s_lock and the exp_lock.
+ *
+ * Since the tid space allocation is called from
+ * the send engine, the qp s_lock is already held.
+ *
+ * The allocation routines will get the exp_lock.
+ *
+ * The first_qp() call is provided to allow the head of
+ * the rcd wait queue to be fetched under the exp_lock and
+ * followed by a drop of the exp_lock.
+ *
+ * Any qp in the wait list will have the qp reference count held
+ * to hold the qp in memory.
+ */
+
+/*
+ * first_qp - fetch the QP at the head of a tid wait queue
+ * @rcd: receive context owning the queue
+ * @queue: the wait queue to inspect
+ *
+ * Must hold the exp_lock.
+ *
+ * Returns NULL when the queue is empty.  Otherwise a reference is taken
+ * on the head QP before it is returned, so the QP stays in memory after
+ * the exp_lock is dropped.
+ *
+ * The caller owns that reference and must release it once the returned
+ * pointer is no longer being used.
+ */
+static struct rvt_qp *first_qp(struct hfi1_ctxtdata *rcd,
+			       struct tid_queue *queue)
+	__must_hold(&rcd->exp_lock)
+{
+	struct hfi1_qp_priv *head;
+	struct rvt_qp *qp = NULL;
+
+	lockdep_assert_held(&rcd->exp_lock);
+	head = list_first_entry_or_null(&queue->queue_head,
+					struct hfi1_qp_priv,
+					tid_wait);
+	if (head) {
+		rvt_get_qp(head->owner);
+		qp = head->owner;
+	}
+	return qp;
+}
+
+/**
+ * kernel_tid_waiters - determine whether @qp has to wait behind others
+ * @rcd: the receive context
+ * @queue: the wait queue to examine
+ * @qp: the qp being processed
+ *
+ * Must hold the qp s_lock and the exp_lock.
+ *
+ * The head of @queue is looked up through first_qp() and the reference
+ * obtained there is dropped again before returning.
+ *
+ * Return:
+ * false if either of the conditions below is satisfied:
+ *      1. The list is empty or
+ *      2. The indicated qp is at the head of the list and the
+ *         HFI1_S_WAIT_TID_SPACE bit is set in qp->s_flags.
+ * true is returned otherwise.
+ */
+static bool kernel_tid_waiters(struct hfi1_ctxtdata *rcd,
+			       struct tid_queue *queue, struct rvt_qp *qp)
+	__must_hold(&rcd->exp_lock) __must_hold(&qp->s_lock)
+{
+	struct rvt_qp *head;
+	bool must_wait;
+
+	lockdep_assert_held(&qp->s_lock);
+	lockdep_assert_held(&rcd->exp_lock);
+
+	head = first_qp(rcd, queue);
+	/* someone is queued, and it is not this qp already flagged as waiting */
+	must_wait = head &&
+		    (head != qp || !(qp->s_flags & HFI1_S_WAIT_TID_SPACE));
+	rvt_put_qp(head);
+
+	return must_wait;
+}
+
+/**
+ * dequeue_tid_waiter - take the qp off the wait list
+ * @rcd: the receive context
+ * @queue: the queue whose dequeue counter is advanced
+ * @qp: the qp to remove from the wait list
+ *
+ * If the qp is linked on a wait list it is unlinked, its
+ * HFI1_S_WAIT_TID_SPACE flag is cleared, the queue's dequeue counter is
+ * incremented and one qp reference is dropped.  A qp that is not linked
+ * is left untouched.
+ *
+ * This should be done after the hardware flow and
+ * tid array resources have been allocated.
+ *
+ * Must hold the qp s_lock and the rcd exp_lock.
+ *
+ * The s_lock is relied upon to protect the s_flags field and to make
+ * the HFI1_S_WAIT_TID_SPACE flag test reliable.
+ */
+static void dequeue_tid_waiter(struct hfi1_ctxtdata *rcd,
+			       struct tid_queue *queue, struct rvt_qp *qp)
+	__must_hold(&rcd->exp_lock) __must_hold(&qp->s_lock)
+{
+	struct hfi1_qp_priv *qpriv = qp->priv;
+
+	lockdep_assert_held(&qp->s_lock);
+	lockdep_assert_held(&rcd->exp_lock);
+
+	if (!list_empty(&qpriv->tid_wait)) {
+		list_del_init(&qpriv->tid_wait);
+		qp->s_flags &= ~HFI1_S_WAIT_TID_SPACE;
+		queue->dequeue++;
+		rvt_put_qp(qp);
+	}
+}
+
+/**
+ * queue_qp_for_tid_wait - suspend QP on tid space
+ * @rcd: the receive context
+ * @queue: the wait queue to append to
+ * @qp: the qp
+ *
+ * A qp that is not yet linked is appended to the tail of @queue with
+ * the HFI1_S_WAIT_TID_SPACE s_flag set, and a qp reference is taken on
+ * its behalf.  A qp that is already linked is left as it is.
+ *
+ * Must hold the qp s_lock and the exp_lock.
+ */
+static void queue_qp_for_tid_wait(struct hfi1_ctxtdata *rcd,
+				  struct tid_queue *queue, struct rvt_qp *qp)
+	__must_hold(&rcd->exp_lock) __must_hold(&qp->s_lock)
+{
+	struct hfi1_qp_priv *qpriv = qp->priv;
+
+	lockdep_assert_held(&qp->s_lock);
+	lockdep_assert_held(&rcd->exp_lock);
+
+	/* already waiting: nothing to do */
+	if (!list_empty(&qpriv->tid_wait))
+		return;
+
+	qp->s_flags |= HFI1_S_WAIT_TID_SPACE;
+	list_add_tail(&qpriv->tid_wait, &queue->queue_head);
+	qpriv->tid_enqueue = ++queue->enqueue;
+	rcd->dd->verbs_dev.n_tidwait++;
+	trace_hfi1_qpsleep(qp, HFI1_S_WAIT_TID_SPACE);
+	rvt_get_qp(qp);
+}
+
+/**
+ * __trigger_tid_waiter - trigger tid waiter
+ * @qp: the qp
+ *
+ * Private entry point for callers that already hold the qp->s_lock.
+ * The send engine is scheduled only when the qp is flagged with
+ * HFI1_S_WAIT_TID_SPACE; otherwise the call has no effect.
+ */
+static void __trigger_tid_waiter(struct rvt_qp *qp)
+	__must_hold(&qp->s_lock)
+{
+	lockdep_assert_held(&qp->s_lock);
+
+	if (qp->s_flags & HFI1_S_WAIT_TID_SPACE) {
+		trace_hfi1_qpwakeup(qp, HFI1_S_WAIT_TID_SPACE);
+		hfi1_schedule_send(qp);
+	}
+}
+
+/**
+ * tid_rdma_schedule_tid_wakeup - schedule wakeup for a qp
+ * @qp: the qp, may be NULL in which case nothing happens
+ *
+ * Trigger a schedule of a waiting qp in a deadlock
+ * safe manner by queueing its trigger work item.  The qp reference
+ * is held prior to this call via first_qp().
+ *
+ * The work is queued on the CPU of the qp's SDMA engine when one is
+ * assigned, otherwise on the first CPU of the device's NUMA node.
+ *
+ * If the trigger work was already queued (queue_work_on() returns
+ * false) the reference is dropped here, otherwise the resume
+ * or the destroy cancel will dispose of the reference.
+ */
+static void tid_rdma_schedule_tid_wakeup(struct rvt_qp *qp)
+{
+	struct hfi1_qp_priv *qpriv;
+	struct hfi1_pportdata *ppd;
+	struct hfi1_devdata *dd;
+	int cpu;
+
+	if (!qp)
+		return;
+
+	qpriv = qp->priv;
+	ppd = ppd_from_ibp(to_iport(qp->ibqp.device, qp->port_num));
+	dd = dd_from_ibdev(qp->ibqp.device);
+
+	if (qpriv->s_sde)
+		cpu = qpriv->s_sde->cpu;
+	else
+		cpu = cpumask_first(cpumask_of_node(dd->node));
+
+	if (!queue_work_on(cpu, ppd->hfi1_wq, &qpriv->tid_rdma.trigger_work))
+		rvt_put_qp(qp);
+}
+
+/**
+ * tid_rdma_trigger_resume - field a trigger work request
+ * @work: the work item embedded in the qp's tid_rdma parameters
+ *
+ * Complete the off qp trigger processing by directly
+ * calling the progress routine, provided the qp is still flagged
+ * as waiting for tid space when sampled under the s_lock.
+ *
+ * The qp reference that travelled with the work item is dropped on
+ * the way out in either case.
+ */
+static void tid_rdma_trigger_resume(struct work_struct *work)
+{
+	struct tid_rdma_qp_params *params =
+		container_of(work, struct tid_rdma_qp_params, trigger_work);
+	struct hfi1_qp_priv *qpriv =
+		container_of(params, struct hfi1_qp_priv, tid_rdma);
+	struct rvt_qp *qp = qpriv->owner;
+	bool waiting;
+
+	spin_lock_irq(&qp->s_lock);
+	waiting = !!(qp->s_flags & HFI1_S_WAIT_TID_SPACE);
+	spin_unlock_irq(&qp->s_lock);
+
+	/* the send engine is entered without the s_lock held */
+	if (waiting)
+		hfi1_do_send(qpriv->owner, true);
+
+	rvt_put_qp(qp);
+}
+
+/**
+ * _tid_rdma_flush_wait - unwind any tid space wait
+ * @qp: the qp to unlink; a NULL qp is ignored
+ * @queue: the wait queue whose dequeue counter is advanced on removal
+ *
+ * This is called when resetting a qp to
+ * allow a destroy or reset to get rid
+ * of any tid space linkage and reference counts.
+ *
+ * The caller must hold the qp s_lock; the rcd exp_lock is taken and
+ * released here.  HFI1_S_WAIT_TID_SPACE is always cleared.  If the qp
+ * is linked through priv->tid_wait it is unlinked, @queue's dequeue
+ * counter is incremented and one qp reference is dropped.
+ *
+ * Note that list membership is tested on priv->tid_wait only; @queue
+ * is used for the counter and is not searched.
+ */
+static void _tid_rdma_flush_wait(struct rvt_qp *qp, struct tid_queue *queue)
+ __must_hold(&qp->s_lock)
+{
+ struct hfi1_qp_priv *priv;
+
+ if (!qp)
+ return;
+ lockdep_assert_held(&qp->s_lock);
+ priv = qp->priv;
+ qp->s_flags &= ~HFI1_S_WAIT_TID_SPACE;
+ spin_lock(&priv->rcd->exp_lock);
+ if (!list_empty(&priv->tid_wait)) {
+ list_del_init(&priv->tid_wait);
+ qp->s_flags &= ~HFI1_S_WAIT_TID_SPACE; /* redundant: already cleared above */
+ queue->dequeue++;
+ rvt_put_qp(qp); /* drop the reference held for the wait list */
+ }
+ spin_unlock(&priv->rcd->exp_lock);
+}
+
+/**
+ * hfi1_tid_rdma_flush_wait - drop a qp from the tid space wait queues
+ * @qp: the qp being reset or destroyed
+ *
+ * Runs the flush helper once for the flow queue and once for the
+ * rarr queue of the qp's receive context.  The caller must hold the
+ * qp s_lock.
+ */
+void hfi1_tid_rdma_flush_wait(struct rvt_qp *qp)
+	__must_hold(&qp->s_lock)
+{
+	struct hfi1_qp_priv *qpriv = qp->priv;
+
+	_tid_rdma_flush_wait(qp, &qpriv->rcd->flow_queue);
+	_tid_rdma_flush_wait(qp, &qpriv->rcd->rarr_queue);
+}
+
+/* Flow functions */
+/**
+ * kern_reserve_flow - allocate a hardware flow
+ * @rcd: the context to use for allocation
+ * @last: the index of the preferred flow. Use RXE_NUM_TID_FLOWS to
+ *        signify "don't care".
+ *
+ * Use a bit mask based allocation to reserve a hardware
+ * flow for use in receiving KDETH data packets. If a preferred flow is
+ * specified the function will attempt to reserve that flow again, if
+ * available; failing that, the first zero bit of the mask is taken.
+ *
+ * The exp_lock must be held.
+ *
+ * Return:
+ * On success: a value between 0 and RXE_NUM_TID_FLOWS - 1
+ * On failure: -EAGAIN
+ */
+static int kern_reserve_flow(struct hfi1_ctxtdata *rcd, int last)
+	__must_hold(&rcd->exp_lock)
+{
+	int free_idx;
+
+	/* the mask must be wider than the number of flows */
+	BUILD_BUG_ON(RXE_NUM_TID_FLOWS >=
+		     (sizeof(rcd->flow_mask) * BITS_PER_BYTE));
+
+	/* Attempt to reserve the preferred flow index */
+	if (last >= 0 && last < RXE_NUM_TID_FLOWS &&
+	    !test_and_set_bit(last, &rcd->flow_mask))
+		return last;
+
+	free_idx = ffz(rcd->flow_mask);
+	if (free_idx > (RXE_NUM_TID_FLOWS - 1))
+		return -EAGAIN;
+
+	set_bit(free_idx, &rcd->flow_mask);
+	return free_idx;
+}
+
+/*
+ * Write one RcvTidFlowTable entry of a receive context.
+ *
+ * The register value carries @generation shifted into place together
+ * with the valid, keep-after-seq-err and keep-on-gen-err control bits
+ * and the seq/gen mismatch status bits.  The header suppression enable
+ * bit is added for every generation except KERN_GENERATION_RESERVED.
+ */
+static void kern_set_hw_flow(struct hfi1_ctxtdata *rcd, u32 generation,
+			     u32 flow_idx)
+{
+	u64 val = (u64)generation << HFI1_KDETH_BTH_SEQ_SHIFT;
+
+	val |= RCV_TID_FLOW_TABLE_CTRL_FLOW_VALID_SMASK |
+	       RCV_TID_FLOW_TABLE_CTRL_KEEP_AFTER_SEQ_ERR_SMASK |
+	       RCV_TID_FLOW_TABLE_CTRL_KEEP_ON_GEN_ERR_SMASK |
+	       RCV_TID_FLOW_TABLE_STATUS_SEQ_MISMATCH_SMASK |
+	       RCV_TID_FLOW_TABLE_STATUS_GEN_MISMATCH_SMASK;
+
+	if (generation != KERN_GENERATION_RESERVED)
+		val |= RCV_TID_FLOW_TABLE_CTRL_HDR_SUPP_EN_SMASK;
+
+	/* entries are 8 bytes apart */
+	write_uctxt_csr(rcd->dd, rcd->ctxt,
+			RCV_TID_FLOW_TABLE + 8 * flow_idx, val);
+}
+
+/*
+ * Program hardware flow @flow_idx with the generation currently
+ * recorded for it in @rcd and hand that generation back to the caller.
+ */
+static u32 kern_setup_hw_flow(struct hfi1_ctxtdata *rcd, u32 flow_idx)
+	__must_hold(&rcd->exp_lock)
+{
+	u32 gen = rcd->flows[flow_idx].generation;
+
+	kern_set_hw_flow(rcd, gen, flow_idx);
+
+	return gen;
+}
+
+/*
+ * Advance a flow generation by one (masked), stepping once more when
+ * the result would be KERN_GENERATION_RESERVED.
+ */
+static u32 kern_flow_generation_next(u32 gen)
+{
+	u32 next = mask_generation(gen + 1);
+
+	if (next != KERN_GENERATION_RESERVED)
+		return next;
+
+	return mask_generation(next + 1);
+}
+
+/*
+ * Retire hardware flow @flow_idx: bump the generation remembered for it
+ * in @rcd and reprogram the table entry with KERN_GENERATION_RESERVED.
+ */
+static void kern_clear_hw_flow(struct hfi1_ctxtdata *rcd, u32 flow_idx)
+	__must_hold(&rcd->exp_lock)
+{
+	u32 next_gen =
+		kern_flow_generation_next(rcd->flows[flow_idx].generation);
+
+	rcd->flows[flow_idx].generation = next_gen;
+	kern_set_hw_flow(rcd, KERN_GENERATION_RESERVED, flow_idx);
+}
+
+/**
+ * hfi1_kern_setup_hw_flow - reserve and program a hardware flow for a qp
+ * @rcd: the receive context to allocate from
+ * @qp: the qp that needs a flow
+ *
+ * A qp that already owns a flow (index != RXE_NUM_TID_FLOWS) returns
+ * immediately.  Otherwise, under the exp_lock, the qp is queued on the
+ * flow wait queue if other waiters are ahead of it or no flow can be
+ * reserved.  On success the flow state is initialised, the qp is taken
+ * off the wait queue and the next waiter, if any, is scheduled.
+ *
+ * Return: 0 on success, -EAGAIN when the qp has been queued to wait.
+ */
+int hfi1_kern_setup_hw_flow(struct hfi1_ctxtdata *rcd, struct rvt_qp *qp)
+{
+	struct hfi1_qp_priv *qpriv = qp->priv;
+	struct tid_flow_state *state = &qpriv->flow_state;
+	struct rvt_qp *next_waiter;
+	unsigned long irqflags;
+	int idx;
+
+	/* The QP already has an allocated flow */
+	if (state->index != RXE_NUM_TID_FLOWS)
+		return 0;
+
+	spin_lock_irqsave(&rcd->exp_lock, irqflags);
+	if (kernel_tid_waiters(rcd, &rcd->flow_queue, qp))
+		goto wait;
+
+	idx = kern_reserve_flow(rcd, state->last_index);
+	if (idx < 0)
+		goto wait;
+	state->index = idx;
+	state->last_index = state->index;
+
+	/* Generation received in a RESYNC overrides default flow generation */
+	if (state->generation != KERN_GENERATION_RESERVED)
+		rcd->flows[state->index].generation = state->generation;
+	state->generation = kern_setup_hw_flow(rcd, state->index);
+	state->psn = 0;
+	state->flags = 0;
+	dequeue_tid_waiter(rcd, &rcd->flow_queue, qp);
+	/* get head before dropping lock */
+	next_waiter = first_qp(rcd, &rcd->flow_queue);
+	spin_unlock_irqrestore(&rcd->exp_lock, irqflags);
+
+	tid_rdma_schedule_tid_wakeup(next_waiter);
+	return 0;
+
+wait:
+	queue_qp_for_tid_wait(rcd, &rcd->flow_queue, qp);
+	spin_unlock_irqrestore(&rcd->exp_lock, irqflags);
+	return -EAGAIN;
+}
+
+/**
+ * hfi1_kern_clear_hw_flow - give a qp's hardware flow back
+ * @rcd: the receive context the flow belongs to
+ * @qp: the qp releasing its flow
+ *
+ * Does nothing when the qp holds no flow.  Otherwise the hardware entry
+ * is retired, the mask bit is cleared and the qp's flow state is reset,
+ * all under the exp_lock.  Afterwards the head of the flow wait queue
+ * is woken: directly when it is @qp itself, through the trigger work
+ * item for any other qp.
+ */
+void hfi1_kern_clear_hw_flow(struct hfi1_ctxtdata *rcd, struct rvt_qp *qp)
+{
+	struct hfi1_qp_priv *qpriv = qp->priv;
+	struct tid_flow_state *state = &qpriv->flow_state;
+	struct rvt_qp *head;
+	unsigned long irqflags;
+
+	if (state->index >= RXE_NUM_TID_FLOWS)
+		return;
+
+	spin_lock_irqsave(&rcd->exp_lock, irqflags);
+	kern_clear_hw_flow(rcd, state->index);
+	clear_bit(state->index, &rcd->flow_mask);
+	state->index = RXE_NUM_TID_FLOWS;
+	state->psn = 0;
+	state->generation = KERN_GENERATION_RESERVED;
+
+	/* get head before dropping lock */
+	head = first_qp(rcd, &rcd->flow_queue);
+	spin_unlock_irqrestore(&rcd->exp_lock, irqflags);
+
+	if (head != qp) {
+		tid_rdma_schedule_tid_wakeup(head);
+		return;
+	}
+
+	__trigger_tid_waiter(head);
+	rvt_put_qp(head);
+}
+
+/**
+ * hfi1_kern_init_ctxt_generations - seed the flow generations of a context
+ * @rcd: the receive context
+ *
+ * Every flow gets a masked pseudo-random starting generation recorded in
+ * @rcd, while its hardware table entry is programmed with
+ * KERN_GENERATION_RESERVED.
+ */
+void hfi1_kern_init_ctxt_generations(struct hfi1_ctxtdata *rcd)
+{
+	int flow;
+
+	for (flow = 0; flow < RXE_NUM_TID_FLOWS; flow++) {
+		rcd->flows[flow].generation = mask_generation(prandom_u32());
+		kern_set_hw_flow(rcd, KERN_GENERATION_RESERVED, flow);
+	}
+}
+
+/* TID allocation functions */
+/*
+ * Order value handed to hfi1_put_tid() for a pageset: ilog2 of the
+ * page count, plus one.  The count is taken through a u8 first.
+ */
+static u8 trdma_pset_order(struct tid_rdma_pageset *s)
+{
+	u8 npages = s->count;
+
+	return ilog2(npages) + 1;
+}
+
+/**
+ * tid_rdma_find_phys_blocks_4k - get groups base on mr info
+ * @flow: flow being set up; used here only to feed the tracepoints
+ * @pages: pointer to an array of page structs
+ * @npages: number of pages
+ * @list: page set array to return
+ *
+ * This routine returns the number of groups associated with
+ * the current sge information. This implementation is based
+ * on the expected receive find_phys_blocks() adjusted to
+ * use the MR information vs. the pfn.
+ *
+ * Contiguity is judged by comparing the page_address() of consecutive
+ * pages.  Every contiguous run is cut into pagesets of at most
+ * MAX_EXPECTED_BUFFER bytes whose size is a power of two.  When the
+ * number of pagesets comes out odd, one filler entry with a count of
+ * zero is appended (only its count field is written).
+ *
+ * Return:
+ * the number of RcvArray entries, 0 when @npages is 0
+ */
+static u32 tid_rdma_find_phys_blocks_4k(struct tid_rdma_flow *flow,
+ struct page **pages,
+ u32 npages,
+ struct tid_rdma_pageset *list)
+{
+ u32 pagecount, pageidx, setcount = 0, i;
+ void *vaddr, *this_vaddr;
+
+ if (!npages)
+ return 0;
+
+ /*
+ * Look for sets of physically contiguous pages in the user buffer.
+ * This will allow us to optimize Expected RcvArray entry usage by
+ * using the bigger supported sizes.
+ */
+ vaddr = page_address(pages[0]);
+ trace_hfi1_tid_flow_page(flow->req->qp, flow, 0, 0, 0, vaddr);
+ for (pageidx = 0, pagecount = 1, i = 1; i <= npages; i++) { /* i == npages is the final flush pass */
+ this_vaddr = i < npages ? page_address(pages[i]) : NULL;
+ trace_hfi1_tid_flow_page(flow->req->qp, flow, i, 0, 0,
+ this_vaddr);
+ /*
+ * If the vaddr's are not sequential, pages are not physically
+ * contiguous.
+ */
+ if (this_vaddr != (vaddr + PAGE_SIZE)) {
+ /*
+ * At this point we have to loop over the set of
+ * physically contiguous pages and break them down into
+ * sizes supported by the HW.
+ * There are two main constraints:
+ * 1. The max buffer size is MAX_EXPECTED_BUFFER.
+ * If the total set size is bigger than that
+ * program only a MAX_EXPECTED_BUFFER chunk.
+ * 2. The buffer size has to be a power of two. If
+ * it is not, round down to the closest power of
+ * 2 and program that size.
+ */
+ while (pagecount) {
+ int maxpages = pagecount;
+ u32 bufsize = pagecount * PAGE_SIZE;
+
+ if (bufsize > MAX_EXPECTED_BUFFER)
+ maxpages =
+ MAX_EXPECTED_BUFFER >>
+ PAGE_SHIFT;
+ else if (!is_power_of_2(bufsize))
+ maxpages =
+ rounddown_pow_of_two(bufsize) >>
+ PAGE_SHIFT;
+
+ list[setcount].idx = pageidx;
+ list[setcount].count = maxpages;
+ trace_hfi1_tid_pageset(flow->req->qp, setcount,
+ list[setcount].idx,
+ list[setcount].count);
+ pagecount -= maxpages;
+ pageidx += maxpages;
+ setcount++;
+ }
+ pageidx = i; /* the next run starts at the page that broke contiguity */
+ pagecount = 1;
+ vaddr = this_vaddr;
+ } else {
+ vaddr += PAGE_SIZE;
+ pagecount++;
+ }
+ }
+ /* ensure we always return an even number of sets */
+ if (setcount & 1)
+ list[setcount++].count = 0;
+ return setcount;
+}
+
+/**
+ * tid_flush_pages - dump out pages into pagesets
+ * @list: list of pagesets
+ * @idx: pointer to current page index, advanced by the pages emitted
+ * @pages: number of pages to dump
+ * @sets: current number of pagesets
+ *
+ * This routine flushes out accumulated pages.  Each emitted pageset
+ * holds MAX_EXPECTED_PAGES pages when more than that remain, otherwise
+ * the largest power of two that fits in what remains.
+ *
+ * To ensure an even number of sets the code may add a filler entry
+ * with a count of zero.
+ *
+ * This can happen when pages is not
+ * a power of 2 or pages is a power of 2
+ * less than the maximum pages.
+ *
+ * Return:
+ * The new number of sets
+ */
+
+static u32 tid_flush_pages(struct tid_rdma_pageset *list,
+			   u32 *idx, u32 pages, u32 sets)
+{
+	u32 chunk;
+
+	for (; pages; pages -= chunk) {
+		if (pages > MAX_EXPECTED_PAGES)
+			chunk = MAX_EXPECTED_PAGES;
+		else if (is_power_of_2(pages))
+			chunk = pages;
+		else
+			chunk = rounddown_pow_of_two(pages);
+
+		list[sets].idx = *idx;
+		list[sets].count = chunk;
+		sets++;
+		*idx += chunk;
+	}
+
+	/* might need a filler */
+	if (sets & 1) {
+		list[sets].count = 0;
+		sets++;
+	}
+	return sets;
+}
+
+/**
+ * tid_rdma_find_phys_blocks_8k - get groups base on mr info
+ * @flow: flow being set up; used here only to feed the tracepoints
+ * @pages: pointer to an array of page structs
+ * @npages: number of pages
+ * @list: page set array to return
+ *
+ * This routine parses an array of pages to compute pagesets
+ * in an 8k compatible way.
+ *
+ * pages are tested two at a time, i, i + 1 for contiguous
+ * pages and i - 1 and i contiguous pages.
+ *
+ * If any condition is false, any accumulated pages are flushed and
+ * v0,v1 are emitted as separate PAGE_SIZE pagesets
+ *
+ * Otherwise, the current 8k is totaled for a future flush.
+ *
+ * When @npages is odd the final pair has no second page; a filler
+ * entry with a count of zero is emitted in its place.
+ *
+ * Return:
+ * The number of pagesets (0 when @npages is 0)
+ * list set with the returned number of pagesets
+ *
+ */
+static u32 tid_rdma_find_phys_blocks_8k(struct tid_rdma_flow *flow,
+ struct page **pages,
+ u32 npages,
+ struct tid_rdma_pageset *list)
+{
+ u32 idx, sets = 0, i;
+ u32 pagecnt = 0;
+ void *v0, *v1, *vm1;
+
+ if (!npages)
+ return 0;
+ for (idx = 0, i = 0, vm1 = NULL; i < npages; i += 2) {
+ /* get a new v0 */
+ v0 = page_address(pages[i]);
+ trace_hfi1_tid_flow_page(flow->req->qp, flow, i, 1, 0, v0);
+ v1 = i + 1 < npages ?
+ page_address(pages[i + 1]) : NULL;
+ trace_hfi1_tid_flow_page(flow->req->qp, flow, i, 1, 1, v1);
+ /* compare i, i + 1 vaddr */
+ if (v1 != (v0 + PAGE_SIZE)) {
+ /* flush out pages */
+ sets = tid_flush_pages(list, &idx, pagecnt, sets);
+ /* output v0,v1 as two pagesets */
+ list[sets].idx = idx++;
+ list[sets++].count = 1;
+ if (v1) {
+ list[sets].count = 1;
+ list[sets++].idx = idx++;
+ } else {
+ list[sets++].count = 0; /* no second page: zero-count filler */
+ }
+ vm1 = NULL;
+ pagecnt = 0;
+ continue;
+ }
+ /* i,i+1 consecutive, look at i-1,i */
+ if (vm1 && v0 != (vm1 + PAGE_SIZE)) {
+ /* flush out pages */
+ sets = tid_flush_pages(list, &idx, pagecnt, sets);
+ pagecnt = 0;
+ }
+ /* pages will always be a multiple of 8k */
+ pagecnt += 2;
+ /* save i-1 */
+ vm1 = v1;
+ /* move to next pair */
+ }
+ /* dump residual pages at end */
+ sets = tid_flush_pages(list, &idx, npages - idx, sets);
+ /* by design cannot be odd sets */
+ WARN_ON(sets & 1);
+ return sets;
+}
+
+/**
+ * kern_find_pages - collect the pages backing one segment of an sge array
+ * @flow: flow being set up; flow->req supplies seg_len and the isge cursor
+ * @pages: output array, one struct page pointer is stored per PAGE_SIZE step
+ * @ss: sge state to walk
+ * @last: output flag, see the end of this comment
+ *
+ * Find pages for one segment of a sge array represented by @ss. The function
+ * does not check the sge, the sge must have been checked for alignment with a
+ * prior call to hfi1_kern_trdma_ok. Other sge checking is done as part of
+ * rvt_lkey_ok and rvt_rkey_ok. Also, the function only modifies the local sge
+ * copy maintained in @ss->sge, the original sge is not modified.
+ *
+ * Unlike IB RDMA WRITE, we can't decrement ss->num_sge here because we are not
+ * releasing the MR reference count at the same time. Otherwise, we'll "leak"
+ * references to the MR. This difference requires that we keep track of progress
+ * into the sg_list. This is done by the isge cursor in the tid_rdma_request
+ * structure.
+ *
+ * On return flow->length holds the number of bytes walked.  *@last is written
+ * as false when req->isge has reached ss->num_sge and as true otherwise.
+ * NOTE(review): that polarity reads inverted relative to the name - confirm
+ * against the callers before relying on it.
+ *
+ * Return: the number of entries stored in @pages
+ */
+static u32 kern_find_pages(struct tid_rdma_flow *flow,
+ struct page **pages,
+ struct rvt_sge_state *ss, bool *last)
+{
+ struct tid_rdma_request *req = flow->req;
+ struct rvt_sge *sge = &ss->sge;
+ u32 length = flow->req->seg_len;
+ u32 len = PAGE_SIZE;
+ u32 i = 0;
+
+ while (length && req->isge < ss->num_sge) {
+ pages[i++] = virt_to_page(sge->vaddr);
+
+ sge->vaddr += len;
+ sge->length -= len;
+ sge->sge_length -= len;
+ if (!sge->sge_length) { /* current sge exhausted: step to the next one */
+ if (++req->isge < ss->num_sge)
+ *sge = ss->sg_list[req->isge - 1];
+ } else if (sge->length == 0 && sge->mr->lkey) { /* current MR segment exhausted */
+ if (++sge->n >= RVT_SEGSZ) {
+ ++sge->m;
+ sge->n = 0;
+ }
+ sge->vaddr = sge->mr->map[sge->m]->segs[sge->n].vaddr;
+ sge->length = sge->mr->map[sge->m]->segs[sge->n].length;
+ }
+ length -= len;
+ }
+
+ flow->length = flow->req->seg_len - length;
+ *last = req->isge == ss->num_sge ? false : true;
+ return i;
+}
+
+/*
+ * Undo the DMA mappings of a flow: every pageset that has both a page
+ * count and a DMA address is unmapped and its mapped flag cleared.
+ * Other pagesets are skipped untouched.
+ */
+static void dma_unmap_flow(struct tid_rdma_flow *flow)
+{
+	struct hfi1_devdata *dd = flow->req->rcd->dd;
+	int n;
+
+	for (n = 0; n < flow->npagesets; n++) {
+		struct tid_rdma_pageset *pset = &flow->pagesets[n];
+
+		if (!pset->count || !pset->addr)
+			continue;
+
+		dma_unmap_page(&dd->pcidev->dev,
+			       pset->addr,
+			       PAGE_SIZE * pset->count,
+			       DMA_FROM_DEVICE);
+		pset->mapped = 0;
+	}
+}
+
+/*
+ * dma_map_flow - DMA-map every pageset of a flow
+ * @flow: the flow whose pagesets are to be mapped
+ * @pages: page array the pageset indices refer to
+ *
+ * Each pageset with a non-zero count is mapped for DMA_FROM_DEVICE
+ * starting at pages[pset->idx] and covering PAGE_SIZE * count bytes;
+ * zero-count filler entries are skipped.
+ *
+ * If a mapping fails, only the pagesets mapped by this call are
+ * unmapped again.  The failing entry holds an error cookie, and entries
+ * after it have not been written by this call, so neither may be passed
+ * to dma_unmap_page().
+ *
+ * Return: 0 on success, -ENOMEM if any mapping failed.
+ */
+static int dma_map_flow(struct tid_rdma_flow *flow, struct page **pages)
+{
+	struct hfi1_devdata *dd = flow->req->rcd->dd;
+	struct tid_rdma_pageset *pset;
+	int i;
+
+	for (i = 0, pset = &flow->pagesets[0]; i < flow->npagesets;
+	     i++, pset++) {
+		if (!pset->count)
+			continue;
+
+		pset->addr = dma_map_page(&dd->pcidev->dev,
+					  pages[pset->idx],
+					  0,
+					  PAGE_SIZE * pset->count,
+					  DMA_FROM_DEVICE);
+		if (dma_mapping_error(&dd->pcidev->dev, pset->addr))
+			goto unwind;
+		pset->mapped = 1;
+	}
+	return 0;
+
+unwind:
+	/* the entry that failed was never mapped */
+	pset->mapped = 0;
+	/* walk back over the entries this call did map */
+	while (i--) {
+		pset--;
+		if (!pset->count)
+			continue;
+		dma_unmap_page(&dd->pcidev->dev,
+			       pset->addr,
+			       PAGE_SIZE * pset->count,
+			       DMA_FROM_DEVICE);
+		pset->mapped = 0;
+	}
+	return -ENOMEM;
+}
+
+/* A flow counts as DMA-mapped when its first pageset is flagged mapped. */
+static inline bool dma_mapped(struct tid_rdma_flow *flow)
+{
+	return flow->pagesets[0].mapped != 0;
+}
+
+/*
+ * Get pages pointers and identify contiguous physical memory chunks for a
+ * segment. All segments are of length flow->req->seg_len.
+ *
+ * If the flow still carries pagesets from an earlier attempt they are
+ * reused: they are only DMA-mapped again when not currently mapped, and
+ * neither @ss nor *@last is touched on that path.  Otherwise the pages
+ * are collected from @ss and split into pagesets with the 4k variant
+ * when the qp pmtu equals the 4096 MTU, with the 8k variant for any
+ * other pmtu, and then DMA-mapped.
+ *
+ * Returns 0 on success or the error from dma_map_flow().
+ */
+static int kern_get_phys_blocks(struct tid_rdma_flow *flow,
+				struct page **pages,
+				struct rvt_sge_state *ss, bool *last)
+{
+	/*
+	 * Same width as kern_find_pages()'s return value and as the npages
+	 * parameter of the find_phys_blocks helpers, so the page count is
+	 * never silently truncated on the way through.
+	 */
+	u32 npages;
+
+	/* Reuse previously computed pagesets, if any */
+	if (flow->npagesets) {
+		trace_hfi1_tid_flow_alloc(flow->req->qp, flow->req->setup_head,
+					  flow);
+		if (!dma_mapped(flow))
+			return dma_map_flow(flow, pages);
+		return 0;
+	}
+
+	npages = kern_find_pages(flow, pages, ss, last);
+
+	if (flow->req->qp->pmtu == enum_to_mtu(OPA_MTU_4096))
+		flow->npagesets =
+			tid_rdma_find_phys_blocks_4k(flow, pages, npages,
+						     flow->pagesets);
+	else
+		flow->npagesets =
+			tid_rdma_find_phys_blocks_8k(flow, pages, npages,
+						     flow->pagesets);
+
+	return dma_map_flow(flow, pages);
+}
+
+/*
+ * Append a tid node to the flow recording that @cnt entries are to be
+ * taken from @grp.  The group's map is snapshotted into the node; the
+ * group itself is not modified here.  @s is a label passed to the
+ * tracepoint only.
+ *
+ * Warns once when the node count reaches the per-segment page limit
+ * and, with an error message, when @cnt is odd.
+ */
+static inline void kern_add_tid_node(struct tid_rdma_flow *flow,
+				     struct hfi1_ctxtdata *rcd, char *s,
+				     struct tid_group *grp, u8 cnt)
+{
+	struct kern_tid_node *tnode = &flow->tnode[flow->tnode_cnt++];
+
+	WARN_ON_ONCE(flow->tnode_cnt >=
+		     (TID_RDMA_MAX_SEGMENT_SIZE >> PAGE_SHIFT));
+	if (WARN_ON_ONCE(cnt & 1))
+		dd_dev_err(rcd->dd,
+			   "unexpected odd allocation cnt %u map 0x%x used %u",
+			   cnt, grp->map, grp->used);
+
+	tnode->cnt = cnt;
+	tnode->map = grp->map;
+	tnode->grp = grp;
+	trace_hfi1_tid_node_add(flow->req->qp, s, flow->tnode_cnt - 1,
+				grp->base, grp->map, grp->used, cnt);
+}
+
+/*
+ * Try to allocate pageset_count TID's from TID groups for a context
+ *
+ * This function allocates TID's without moving groups between lists or
+ * modifying grp->map. This is done as follows, being cognizant of the lists
+ * between which the TID groups will move:
+ * 1. First allocate complete groups of 8 TID's since this is more efficient,
+ * these groups will move from group->full without affecting used
+ * 2. If more TID's are needed allocate from used (will move from used->full or
+ * stay in used)
+ * 3. If we still don't have the required number of TID's go back and look again
+ * at a complete group (will move from group->used)
+ *
+ * The selection is recorded as tid nodes in the flow (flow->tnode_cnt is
+ * reset to 0 on entry); nodes added before a failure are left in place.
+ *
+ * Returns 0 when enough TID's for flow->npagesets were found, -EAGAIN
+ * otherwise.
+ */
+static int kern_alloc_tids(struct tid_rdma_flow *flow)
+{
+ struct hfi1_ctxtdata *rcd = flow->req->rcd;
+ struct hfi1_devdata *dd = rcd->dd;
+ u32 ngroups, pageidx = 0;
+ struct tid_group *group = NULL, *used;
+ u8 use;
+
+ flow->tnode_cnt = 0;
+ ngroups = flow->npagesets / dd->rcv_entries.group_size;
+ if (!ngroups)
+ goto used_list;
+
+ /* First look at complete groups */
+ list_for_each_entry(group, &rcd->tid_group_list.list, list) {
+ kern_add_tid_node(flow, rcd, "complete groups", group,
+ group->size);
+
+ pageidx += group->size;
+ if (!--ngroups)
+ break;
+ }
+
+ if (pageidx >= flow->npagesets)
+ goto ok;
+
+used_list:
+ /* Now look at partially used groups */
+ list_for_each_entry(used, &rcd->tid_used_list.list, list) {
+ use = min_t(u32, flow->npagesets - pageidx,
+ used->size - used->used);
+ kern_add_tid_node(flow, rcd, "used groups", used, use);
+
+ pageidx += use;
+ if (pageidx >= flow->npagesets)
+ goto ok;
+ }
+
+ /*
+ * Look again at a complete group, continuing from where we left.
+ * However, if we are at the head, we have reached the end of the
+ * complete groups list from the first loop above
+ */
+ if (group && &group->list == &rcd->tid_group_list.list)
+ goto bail_eagain;
+ group = list_prepare_entry(group, &rcd->tid_group_list.list,
+ list);
+ if (list_is_last(&group->list, &rcd->tid_group_list.list))
+ goto bail_eagain;
+ group = list_next_entry(group, list);
+ use = min_t(u32, flow->npagesets - pageidx, group->size);
+ kern_add_tid_node(flow, rcd, "complete continue", group, use);
+ pageidx += use;
+ if (pageidx >= flow->npagesets)
+ goto ok;
+bail_eagain:
+ trace_hfi1_msg_alloc_tids(flow->req->qp, " insufficient tids: needed ",
+ (u64)flow->npagesets);
+ return -EAGAIN;
+ok:
+ return 0;
+}
+
+/*
+ * Program the RcvArray entries of one tid node of a flow.
+ *
+ * Every slot of the node's group is visited.  Slots that were already
+ * taken according to the node's map snapshot, and slots beyond the
+ * node's count, only get rcv_array_wc_fill().  Each remaining slot
+ * consumes the next pageset (*@pset_idx is advanced), is written to the
+ * hardware, and is accounted in the group (used count, map bit, list
+ * membership).  TID entries describing the programmed slots are
+ * appended to flow->tid_entry and flow->npkts is increased accordingly.
+ */
+static void kern_program_rcv_group(struct tid_rdma_flow *flow, int grp_num,
+				   u32 *pset_idx)
+{
+	struct hfi1_ctxtdata *rcd = flow->req->rcd;
+	struct hfi1_devdata *dd = rcd->dd;
+	struct kern_tid_node *tnode = &flow->tnode[grp_num];
+	struct tid_group *grp = tnode->grp;
+	struct tid_rdma_pageset *pset;
+	u32 pmtu_pg = flow->req->qp->pmtu >> PAGE_SHIFT;
+	u32 entry, run_pages = 0, paired = 0, ctrl;
+	u8 slot, done = 0;
+
+	for (slot = 0; slot < grp->size; slot++) {
+		entry = grp->base + slot;
+
+		if (tnode->map & BIT(slot) || done >= tnode->cnt) {
+			rcv_array_wc_fill(dd, entry);
+			continue;
+		}
+
+		pset = &flow->pagesets[(*pset_idx)++];
+		if (pset->count)
+			hfi1_put_tid(dd, entry, PT_EXPECTED,
+				     pset->addr, trdma_pset_order(pset));
+		else
+			hfi1_put_tid(dd, entry, PT_INVALID, 0, 0);
+		run_pages += pset->count;
+
+		entry -= rcd->expected_base;
+		if (paired)
+			ctrl = 0x3;
+		else if (entry & 0x1)
+			ctrl = 0x2;
+		else
+			ctrl = 0x1;
+		/*
+		 * One TID entry covers a rcvarray pair (tidctrl 0x3) only
+		 * if ALL of the following hold: (a) the slot position is
+		 * even, (b) the map snapshot shows this and the next slot
+		 * as free, i.e. two consecutive rcvarray entries are
+		 * available, and (c) at least two more entries are needed.
+		 */
+		paired = !(slot & 0x1) && !((tnode->map >> slot) & 0x3) &&
+			 tnode->cnt >= done + 2;
+		if (!paired) {
+			if (!pset->count)
+				ctrl = 0x1;
+			flow->tid_entry[flow->tidcnt++] =
+				EXP_TID_SET(IDX, entry >> 1) |
+				EXP_TID_SET(CTRL, ctrl) |
+				EXP_TID_SET(LEN, run_pages);
+			trace_hfi1_tid_entry_alloc(/* entry */
+			   flow->req->qp, flow->tidcnt - 1,
+			   flow->tid_entry[flow->tidcnt - 1]);
+
+			/* Efficient DIV_ROUND_UP(run_pages, pmtu_pg) */
+			flow->npkts += (run_pages + pmtu_pg - 1) >>
+				       ilog2(pmtu_pg);
+			run_pages = 0;
+		}
+
+		/* move the group between lists before bumping its count */
+		if (grp->used == grp->size - 1)
+			tid_group_move(grp, &rcd->tid_used_list,
+				       &rcd->tid_full_list);
+		else if (!grp->used)
+			tid_group_move(grp, &rcd->tid_group_list,
+				       &rcd->tid_used_list);
+
+		grp->used++;
+		grp->map |= BIT(slot);
+		done++;
+	}
+}
+
+/*
+ * Undo kern_program_rcv_group() for one tid node of a flow.
+ *
+ * The same slots are selected as when programming (free in the node's
+ * map snapshot, up to the node's count).  Each of them is invalidated
+ * in hardware and given back to the group, which is moved between the
+ * full/used/group lists as its used count changes.  All other slots
+ * only get rcv_array_wc_fill().  An odd number of released slots is
+ * reported once.
+ */
+static void kern_unprogram_rcv_group(struct tid_rdma_flow *flow, int grp_num)
+{
+	struct hfi1_ctxtdata *rcd = flow->req->rcd;
+	struct hfi1_devdata *dd = rcd->dd;
+	struct kern_tid_node *tnode = &flow->tnode[grp_num];
+	struct tid_group *grp = tnode->grp;
+	u32 entry;
+	u8 slot, released = 0;
+
+	for (slot = 0; slot < grp->size; slot++) {
+		entry = grp->base + slot;
+
+		if (tnode->map & BIT(slot) || released >= tnode->cnt) {
+			rcv_array_wc_fill(dd, entry);
+			continue;
+		}
+
+		hfi1_put_tid(dd, entry, PT_INVALID, 0, 0);
+
+		grp->used--;
+		grp->map &= ~BIT(slot);
+		released++;
+
+		if (grp->used == grp->size - 1)
+			tid_group_move(grp, &rcd->tid_full_list,
+				       &rcd->tid_used_list);
+		else if (!grp->used)
+			tid_group_move(grp, &rcd->tid_used_list,
+				       &rcd->tid_group_list);
+	}
+
+	if (WARN_ON_ONCE(released & 1))
+		dd_dev_err(dd, "unexpected odd free cnt %u map 0x%x used %u",
+			   released, grp->map, grp->used);
+}
+
+/*
+ * Program the RcvArray for every tid node selected for the flow.
+ * The flow's packet and tid entry counters are restarted from zero and
+ * the pagesets are consumed in order across the nodes.
+ */
+static void kern_program_rcvarray(struct tid_rdma_flow *flow)
+{
+	u32 next_pset = 0;
+	int node;
+
+	flow->npkts = 0;
+	flow->tidcnt = 0;
+
+	for (node = 0; node < flow->tnode_cnt; node++)
+		kern_program_rcv_group(flow, node, &next_pset);
+
+	trace_hfi1_tid_flow_alloc(flow->req->qp, flow->req->setup_head, flow);
+}
+
+/**
+ * hfi1_kern_exp_rcv_setup() - setup TID's and flow for one segment of a
+ * TID RDMA request
+ *
+ * @req: TID RDMA request for which the segment/flow is being set up
+ * @ss: sge state, maintains state across successive segments of a sge
+ * @last: set to true after the last sge segment has been processed
+ *
+ * This function
+ * (1) finds a free flow entry in the flow circular buffer
+ * (2) finds pages and contiguous physical chunks constituting one segment
+ * of an sge
+ * (3) allocates TID group entries for those chunks
+ * (4) programs rcvarray entries in the hardware corresponding to those
+ * TID's
+ * (5) computes a tidarray with formatted TID entries which can be sent
+ * to the sender
+ * (6) Reserves and programs HW flows.
+ * (7) It also manages queuing the QP when TID/flow resources are not
+ * available.
+ *
+ * @req points to struct tid_rdma_request of which the segments are a part. The
+ * function uses qp, rcd and seg_len members of @req. In the absence of errors,
+ * the flow prepared by this invocation is req->flows[] at the value
+ * req->setup_head had on entry; setup_head is advanced on success. With flow
+ * pointing at that entry,
+ * flow->tid_entry contains the TID array which the sender can use for TID RDMA
+ * sends and flow->npkts contains number of packets required to send the
+ * segment.
+ *
+ * hfi1_check_sge_align should be called prior to calling this function and if
+ * it signals error TID RDMA cannot be used for this sge and this function
+ * should not be called.
+ *
+ * For the queuing, caller must hold the flow->req->qp s_lock from the send
+ * engine and the function will procure the exp_lock.
+ *
+ * Return:
+ * 0 on success.
+ * -EAGAIN if sufficient number of TID/flow resources to map the segment could
+ * not be allocated, or other QPs are waiting ahead of this one; the QP has
+ * been queued and the function should be called again with previous arguments
+ * to retry the TID allocation.
+ * -EINVAL if the flow circular buffer has no space or already holds
+ * req->n_flows entries.
+ * -ENOMEM if the pages of the segment could not be DMA mapped; the QP is put
+ * to wait via hfi1_wait_kmem().
+ */
+int hfi1_kern_exp_rcv_setup(struct tid_rdma_request *req,
+ struct rvt_sge_state *ss, bool *last)
+ __must_hold(&req->qp->s_lock)
+{
+ struct tid_rdma_flow *flow = &req->flows[req->setup_head];
+ struct hfi1_ctxtdata *rcd = req->rcd;
+ struct hfi1_qp_priv *qpriv = req->qp->priv;
+ unsigned long flags;
+ struct rvt_qp *fqp;
+ u16 clear_tail = req->clear_tail;
+
+ lockdep_assert_held(&req->qp->s_lock);
+ /*
+ * We return error if either (a) we don't have space in the flow
+ * circular buffer, or (b) we already have max entries in the buffer.
+ * Max entries depend on the type of request we are processing and the
+ * negotiated TID RDMA parameters.
+ */
+ if (!CIRC_SPACE(req->setup_head, clear_tail, MAX_FLOWS) ||
+ CIRC_CNT(req->setup_head, clear_tail, MAX_FLOWS) >=
+ req->n_flows)
+ return -EINVAL;
+
+ /*
+ * Get pages, identify contiguous physical memory chunks for the segment
+ * If we can not determine a DMA address mapping we will treat it just
+ * like if we ran out of space above.
+ */
+ if (kern_get_phys_blocks(flow, qpriv->pages, ss, last)) {
+ hfi1_wait_kmem(flow->req->qp);
+ return -ENOMEM;
+ }
+
+ spin_lock_irqsave(&rcd->exp_lock, flags);
+ if (kernel_tid_waiters(rcd, &rcd->rarr_queue, flow->req->qp))
+ goto queue;
+
+ /*
+ * At this point we know the number of pagesets and hence the number of
+ * TID's to map the segment. Allocate the TID's from the TID groups. If
+ * we cannot allocate the required number we exit and try again later
+ */
+ if (kern_alloc_tids(flow))
+ goto queue;
+ /*
+ * Finally program the TID entries with the pagesets, compute the
+ * tidarray and enable the HW flow
+ */
+ kern_program_rcvarray(flow);
+
+ /*
+ * Setup the flow state with relevant information.
+ * This information is used for tracking the sequence of data packets
+ * for the segment.
+ * The flow is setup here as this is the most accurate time and place
+ * to do so. Doing at a later time runs the risk of the flow data in
+ * qpriv getting out of sync.
+ */
+ memset(&flow->flow_state, 0x0, sizeof(flow->flow_state));
+ flow->idx = qpriv->flow_state.index;
+ flow->flow_state.generation = qpriv->flow_state.generation;
+ flow->flow_state.spsn = qpriv->flow_state.psn;
+ flow->flow_state.lpsn = flow->flow_state.spsn + flow->npkts - 1;
+ flow->flow_state.r_next_psn =
+ full_flow_psn(flow, flow->flow_state.spsn);
+ qpriv->flow_state.psn += flow->npkts;
+
+ dequeue_tid_waiter(rcd, &rcd->rarr_queue, flow->req->qp);
+ /* get head before dropping lock */
+ fqp = first_qp(rcd, &rcd->rarr_queue);
+ spin_unlock_irqrestore(&rcd->exp_lock, flags);
+ tid_rdma_schedule_tid_wakeup(fqp);
+
+ req->setup_head = (req->setup_head + 1) & (MAX_FLOWS - 1);
+ return 0;
+queue:
+ queue_qp_for_tid_wait(rcd, &rcd->rarr_queue, flow->req->qp);
+ spin_unlock_irqrestore(&rcd->exp_lock, flags);
+ return -EAGAIN;
+}
+
+/*
+ * Forget the pagesets computed for a flow, so that the next setup of
+ * this flow entry computes them afresh instead of reusing them.
+ */
+static void hfi1_tid_rdma_reset_flow(struct tid_rdma_flow *flow)
+{
+	flow->npagesets = 0;
+}
+
+/*
+ * Release the flow and TID HW/SW resources of the oldest set-up segment
+ * of a TID RDMA request, i.e. the flow at req->clear_tail.  It is meant
+ * to be called once that segment has been successfully sent.  Segments
+ * are set up and cleared in FIFO order, managed through a circular
+ * buffer.
+ *
+ * The caller must hold the qp s_lock.  After the resources are released
+ * the head of the rarr wait queue is woken: directly when it is the
+ * request's own qp, through the trigger work item otherwise.
+ *
+ * Returns 0 on success, -EINVAL when the circular buffer is empty.
+ */
+int hfi1_kern_exp_rcv_clear(struct tid_rdma_request *req)
+	__must_hold(&req->qp->s_lock)
+{
+	struct tid_rdma_flow *flow = &req->flows[req->clear_tail];
+	struct hfi1_ctxtdata *rcd = req->rcd;
+	struct rvt_qp *head;
+	unsigned long irqflags;
+	int node;
+
+	lockdep_assert_held(&req->qp->s_lock);
+	/* Exit if we have nothing in the flow circular buffer */
+	if (!CIRC_CNT(req->setup_head, req->clear_tail, MAX_FLOWS))
+		return -EINVAL;
+
+	spin_lock_irqsave(&rcd->exp_lock, irqflags);
+
+	for (node = 0; node < flow->tnode_cnt; node++)
+		kern_unprogram_rcv_group(flow, node);
+	/* To prevent double unprogramming */
+	flow->tnode_cnt = 0;
+	/* get head before dropping lock */
+	head = first_qp(rcd, &rcd->rarr_queue);
+	spin_unlock_irqrestore(&rcd->exp_lock, irqflags);
+
+	dma_unmap_flow(flow);
+
+	hfi1_tid_rdma_reset_flow(flow);
+	req->clear_tail = (req->clear_tail + 1) & (MAX_FLOWS - 1);
+
+	if (head != req->qp) {
+		tid_rdma_schedule_tid_wakeup(head);
+		return 0;
+	}
+
+	__trigger_tid_waiter(head);
+	rvt_put_qp(head);
+	return 0;
+}
+
+/*
+ * Release the TID resources of every segment still held by @req.
+ *
+ * Segments are released one at a time through hfi1_kern_exp_rcv_clear()
+ * until the flow circular buffer is empty or that function reports an
+ * error.
+ */
+void hfi1_kern_exp_rcv_clear_all(struct tid_rdma_request *req)
+ __must_hold(&req->qp->s_lock)
+{
+ for (;;) {
+ /* Nothing left between clear_tail and setup_head */
+ if (!CIRC_CNT(req->setup_head, req->clear_tail, MAX_FLOWS))
+ return;
+ /* Stop at the first segment that cannot be released */
+ if (hfi1_kern_exp_rcv_clear(req))
+ return;
+ }
+}
+
+/**
+ * hfi1_kern_exp_rcv_free_flows - free previously allocated flow information
+ * @req: the tid rdma request to be cleaned
+ *
+ * The flow array is freed and the pointer is cleared, so calling this again
+ * on the same request passes NULL to kfree().
+ */
+static void hfi1_kern_exp_rcv_free_flows(struct tid_rdma_request *req)
+{
+ kfree(req->flows);
+ req->flows = NULL;
+}
+
+/**
+ * __trdma_clean_swqe - clean up for large sized QPs
+ * @qp: the queue pair
+ * @wqe: the send wqe
+ *
+ * Frees the flow array of the TID RDMA request embedded in the wqe's
+ * private data. @qp is not used by this function.
+ */
+void __trdma_clean_swqe(struct rvt_qp *qp, struct rvt_swqe *wqe)
+{
+ struct hfi1_swqe_priv *p = wqe->priv;
+
+ hfi1_kern_exp_rcv_free_flows(&p->tid_req);
+}
+
+/*
+ * Allocate the MAX_FLOWS-entry flow array of @req, if it does not have one
+ * yet. The memory is requested on the NUMA node of the request's receive
+ * context (req->rcd->numa_id) with the caller-supplied @gfp flags.
+ *
+ * This can be called at QP create time or in the data path.
+ *
+ * Return 0 if the array already exists or was allocated, -ENOMEM if the
+ * allocation failed.
+ */
+static int hfi1_kern_exp_rcv_alloc_flows(struct tid_rdma_request *req,
+ gfp_t gfp)
+{
+ struct tid_rdma_flow *flows;
+ int i;
+
+ if (likely(req->flows))
+ return 0;
+ /* Overflow-checked array allocation instead of an open-coded multiply */
+ flows = kmalloc_array_node(MAX_FLOWS, sizeof(*flows), gfp,
+ req->rcd->numa_id);
+ if (!flows)
+ return -ENOMEM;
+ /*
+ * mini init: the memory is not zeroed, so only the fields assigned
+ * below have defined values.
+ * NOTE(review): confirm that no other flow field is read before it
+ * has been written.
+ */
+ for (i = 0; i < MAX_FLOWS; i++) {
+ flows[i].req = req;
+ flows[i].npagesets = 0;
+ flows[i].pagesets[0].mapped = 0;
+ }
+ req->flows = flows;
+ return 0;
+}
+
+/*
+ * Bind a TID RDMA request to its QP and to the receive context of that QP.
+ */
+static void hfi1_init_trdma_req(struct rvt_qp *qp,
+ struct tid_rdma_request *req)
+{
+ /*
+ * These request variables are "static", so they can be filled in
+ * here before any WR has been submitted.
+ * Having them set does not mean that the WQE has been enabled for
+ * TID RDMA: drivers should check the WQE's opcode to find out
+ * whether a request is a TID RDMA one.
+ */
+ req->rcd = ((struct hfi1_qp_priv *)qp->priv)->rcd;
+ req->qp = qp;
+}
+
+/*
+ * Counter accessor returning the n_tidwait value kept in the device's
+ * verbs_dev. @context is the struct hfi1_devdata of the device; @entry,
+ * @vl, @mode and @data are not used.
+ */
+u64 hfi1_access_sw_tid_wait(const struct cntr_entry *entry,
+ void *context, int vl, int mode, u64 data)
+{
+ return ((struct hfi1_devdata *)context)->verbs_dev.n_tidwait;
+}
+
+/*
+ * Walk the flows of @req from clear_tail towards setup_head and return the
+ * first one whose IB PSN range [ib_spsn, ib_lpsn] contains @psn.
+ *
+ * When a flow is found and @fidx is not NULL, the flow's index is stored in
+ * *@fidx. Return NULL if no flow matches.
+ */
+static struct tid_rdma_flow *find_flow_ib(struct tid_rdma_request *req,
+ u32 psn, u16 *fidx)
+{
+ u16 end = req->setup_head;
+ u16 idx = req->clear_tail;
+
+ while (CIRC_CNT(end, idx, MAX_FLOWS)) {
+ struct tid_rdma_flow *cand = &req->flows[idx];
+
+ /* Outside this flow's IB PSN range: try the next slot */
+ if (cmp_psn(psn, cand->flow_state.ib_spsn) < 0 ||
+ cmp_psn(psn, cand->flow_state.ib_lpsn) > 0) {
+ idx = CIRC_NEXT(idx, MAX_FLOWS);
+ continue;
+ }
+ if (fidx)
+ *fidx = idx;
+ return cand;
+ }
+ return NULL;
+}
+
+/*
+ * Walk the flows of @req from @tail towards @head and return the first one
+ * whose flow PSN range, expanded with full_flow_psn(), contains @psn.
+ *
+ * When a flow is found and @fidx is not NULL, the flow's index is stored in
+ * *@fidx. Return NULL if no flow matches.
+ */
+static struct tid_rdma_flow *
+__find_flow_ranged(struct tid_rdma_request *req, u16 head, u16 tail,
+ u32 psn, u16 *fidx)
+{
+ while (CIRC_CNT(head, tail, MAX_FLOWS)) {
+ struct tid_rdma_flow *cand = &req->flows[tail];
+ u32 first = full_flow_psn(cand, cand->flow_state.spsn);
+ u32 last = full_flow_psn(cand, cand->flow_state.lpsn);
+
+ if (cmp_psn(psn, first) >= 0 && cmp_psn(psn, last) <= 0) {
+ if (fidx)
+ *fidx = tail;
+ return cand;
+ }
+ tail = CIRC_NEXT(tail, MAX_FLOWS);
+ }
+ return NULL;
+}
+
+/*
+ * Look up the flow of @req that contains @psn, searching the request's
+ * whole flow circular buffer (from clear_tail up to setup_head) with
+ * __find_flow_ranged(). @fidx, when not NULL, receives the flow index.
+ */
+static struct tid_rdma_flow *find_flow(struct tid_rdma_request *req,
+ u32 psn, u16 *fidx)
+{
+ return __find_flow_ranged(req, req->setup_head, req->clear_tail, psn,
+ fidx);
+}
+
+/* TID RDMA READ functions */
+/*
+ * Build the TID RDMA READ REQ header and payload description for the flow
+ * at req->flow_idx of the request embedded in @wqe.
+ *
+ * @ohdr: header to fill in (ohdr->u.tid_rdma.r_req)
+ * @bth1: destination QP bits are replaced with the remote TID RDMA QP
+ * @bth2: set to the IB PSN of the request, with the ACK request bit
+ * @len: on entry the data length to read for this segment; on exit the
+ * length in bytes of the request payload (the remaining TID entries)
+ *
+ * The payload is described by the single SGE in the wqe's private sge
+ * state, which points directly into the flow's tid_entry array.
+ *
+ * Return the size of the TID RDMA READ REQ header in 32-bit words.
+ */
+u32 hfi1_build_tid_rdma_read_packet(struct rvt_swqe *wqe,
+ struct ib_other_headers *ohdr, u32 *bth1,
+ u32 *bth2, u32 *len)
+{
+ struct tid_rdma_request *req = wqe_to_tid_req(wqe);
+ struct tid_rdma_flow *flow = &req->flows[req->flow_idx];
+ struct rvt_qp *qp = req->qp;
+ struct hfi1_qp_priv *qpriv = qp->priv;
+ struct hfi1_swqe_priv *wpriv = wqe->priv;
+ struct tid_rdma_read_req *rreq = &ohdr->u.tid_rdma.r_req;
+ struct tid_rdma_params *remote;
+ u32 req_len = 0;
+ void *req_addr = NULL;
+
+ /* This is the IB psn used to send the request */
+ *bth2 = mask_psn(flow->flow_state.ib_spsn + flow->pkt);
+ trace_hfi1_tid_flow_build_read_pkt(qp, req->flow_idx, flow);
+
+ /* TID Entries for TID RDMA READ payload */
+ req_addr = &flow->tid_entry[flow->tid_idx];
+ req_len = sizeof(*flow->tid_entry) *
+ (flow->tidcnt - flow->tid_idx);
+
+ memset(&ohdr->u.tid_rdma.r_req, 0, sizeof(ohdr->u.tid_rdma.r_req));
+ wpriv->ss.sge.vaddr = req_addr;
+ wpriv->ss.sge.sge_length = req_len;
+ wpriv->ss.sge.length = wpriv->ss.sge.sge_length;
+ /*
+ * We can safely zero these out. Since the first SGE covers the
+ * entire packet, nothing else should even look at the MR.
+ */
+ wpriv->ss.sge.mr = NULL;
+ wpriv->ss.sge.m = 0;
+ wpriv->ss.sge.n = 0;
+
+ wpriv->ss.sg_list = NULL;
+ wpriv->ss.total_len = wpriv->ss.sge.sge_length;
+ wpriv->ss.num_sge = 1;
+
+ /* Construct the TID RDMA READ REQ packet header */
+ rcu_read_lock();
+ /*
+ * NOTE(review): remote is dereferenced below without a NULL check,
+ * whereas hfi1_build_tid_rdma_read_resp() checks it - confirm that
+ * callers guarantee the remote parameters are set.
+ */
+ remote = rcu_dereference(qpriv->tid_rdma.remote);
+
+ KDETH_RESET(rreq->kdeth0, KVER, 0x1);
+ KDETH_RESET(rreq->kdeth1, JKEY, remote->jkey);
+ rreq->reth.vaddr = cpu_to_be64(wqe->rdma_wr.remote_addr +
+ req->cur_seg * req->seg_len + flow->sent);
+ rreq->reth.rkey = cpu_to_be32(wqe->rdma_wr.rkey);
+ rreq->reth.length = cpu_to_be32(*len);
+ /* Flow PSN: generation in the upper bits, sequence in the lower bits */
+ rreq->tid_flow_psn =
+ cpu_to_be32((flow->flow_state.generation <<
+ HFI1_KDETH_BTH_SEQ_SHIFT) |
+ ((flow->flow_state.spsn + flow->pkt) &
+ HFI1_KDETH_BTH_SEQ_MASK));
+ rreq->tid_flow_qp =
+ cpu_to_be32(qpriv->tid_rdma.local.qp |
+ ((flow->idx & TID_RDMA_DESTQP_FLOW_MASK) <<
+ TID_RDMA_DESTQP_FLOW_SHIFT) |
+ qpriv->rcd->ctxt);
+ rreq->verbs_qp = cpu_to_be32(qp->remote_qpn);
+ *bth1 &= ~RVT_QPN_MASK;
+ *bth1 |= remote->qp;
+ *bth2 |= IB_BTH_REQ_ACK;
+ rcu_read_unlock();
+
+ /* We are done with this segment */
+ flow->sent += *len;
+ req->cur_seg++;
+ qp->s_state = TID_OP(READ_REQ);
+ req->ack_pending++;
+ req->flow_idx = (req->flow_idx + 1) & (MAX_FLOWS - 1);
+ qpriv->pending_tid_r_segs++;
+ qp->s_num_rd_atomic++;
+
+ /* Set the TID RDMA READ request payload size */
+ *len = req_len;
+
+ return sizeof(ohdr->u.tid_rdma.r_req) / sizeof(u32);
+}
+
+/*
+ * Prepare and build the next TID RDMA READ REQ packet for @wqe.
+ *
+ * For a new segment (req->flow_idx == req->setup_head) the hardware flow
+ * and the TID entries are set up first; for a resent segment the resources
+ * that were set up earlier are reused.
+ *
+ * @len: contains the data length to read upon entry and the read request
+ * payload length upon exit.
+ *
+ * Return the header size in 32-bit words, or 0 when no packet was built
+ * (waiting at a sync point, or flow/TID resources are not available).
+ */
+u32 hfi1_build_tid_rdma_read_req(struct rvt_qp *qp, struct rvt_swqe *wqe,
+ struct ib_other_headers *ohdr, u32 *bth1,
+ u32 *bth2, u32 *len)
+ __must_hold(&qp->s_lock)
+{
+ struct hfi1_qp_priv *qpriv = qp->priv;
+ struct tid_rdma_request *req = wqe_to_tid_req(wqe);
+ struct tid_rdma_flow *flow = NULL;
+ u32 hdwords = 0;
+ bool last;
+ bool retry = true;
+ u32 npkts = rvt_div_round_up_mtu(qp, *len);
+
+ trace_hfi1_tid_req_build_read_req(qp, 0, wqe->wr.opcode, wqe->psn,
+ wqe->lpsn, req);
+ /*
+ * Check sync conditions. Make sure that there are no pending
+ * segments before freeing the flow.
+ */
+sync_check:
+ if (req->state == TID_REQUEST_SYNC) {
+ if (qpriv->pending_tid_r_segs)
+ goto done;
+
+ hfi1_kern_clear_hw_flow(req->rcd, qp);
+ req->state = TID_REQUEST_ACTIVE;
+ }
+
+ /*
+ * If the request for this segment is resent, the tid resources should
+ * have been allocated before. In this case, req->flow_idx should
+ * fall behind req->setup_head.
+ */
+ if (req->flow_idx == req->setup_head) {
+ retry = false;
+ if (req->state == TID_REQUEST_RESEND) {
+ /*
+ * This is the first new segment for a request whose
+ * earlier segments have been re-sent. We need to
+ * set up the sge pointer correctly.
+ */
+ restart_sge(&qp->s_sge, wqe, req->s_next_psn,
+ qp->pmtu);
+ req->isge = 0;
+ req->state = TID_REQUEST_ACTIVE;
+ }
+
+ /*
+ * Check sync. The last PSN of each generation is reserved for
+ * RESYNC.
+ */
+ if ((qpriv->flow_state.psn + npkts) > MAX_TID_FLOW_PSN - 1) {
+ req->state = TID_REQUEST_SYNC;
+ goto sync_check;
+ }
+
+ /* Allocate the flow if not yet */
+ if (hfi1_kern_setup_hw_flow(qpriv->rcd, qp))
+ goto done;
+
+ /*
+ * The following call will advance req->setup_head after
+ * allocating the tid entries.
+ */
+ if (hfi1_kern_exp_rcv_setup(req, &qp->s_sge, &last)) {
+ req->state = TID_REQUEST_QUEUED;
+
+ /*
+ * We don't have resources for this segment. The QP has
+ * already been queued.
+ */
+ goto done;
+ }
+ }
+
+ /* req->flow_idx should only be one slot behind req->setup_head */
+ flow = &req->flows[req->flow_idx];
+ /* Start (or restart) the segment from its first packet and TID entry */
+ flow->pkt = 0;
+ flow->tid_idx = 0;
+ flow->sent = 0;
+ if (!retry) {
+ /* Set the first and last IB PSN for the flow in use.*/
+ flow->flow_state.ib_spsn = req->s_next_psn;
+ flow->flow_state.ib_lpsn =
+ flow->flow_state.ib_spsn + flow->npkts - 1;
+ }
+
+ /* Calculate the next segment start psn.*/
+ req->s_next_psn += flow->npkts;
+
+ /* Build the packet header */
+ hdwords = hfi1_build_tid_rdma_read_packet(wqe, ohdr, bth1, bth2, len);
+done:
+ return hdwords;
+}
+
+/*
+ * Validate and accept the TID RDMA READ request parameters.
+ *
+ * The TID entries carried in the packet payload are copied into the flow at
+ * req->setup_head of the request embedded in ack entry @e, the flow and
+ * request state are (re)initialized for a single segment of @len bytes, and
+ * the PSN range of @e is set.
+ *
+ * The request is rejected when the payload is larger than the flow's
+ * tid_entry array, when a TID entry has a zero length, or when the TID
+ * entries together cover less than @len bytes.
+ * @vaddr is not used by this function.
+ *
+ * Return 0 if the request is accepted successfully;
+ * Return 1 otherwise.
+ */
+static int tid_rdma_rcv_read_request(struct rvt_qp *qp,
+ struct rvt_ack_entry *e,
+ struct hfi1_packet *packet,
+ struct ib_other_headers *ohdr,
+ u32 bth0, u32 psn, u64 vaddr, u32 len)
+{
+ struct hfi1_qp_priv *qpriv = qp->priv;
+ struct tid_rdma_request *req;
+ struct tid_rdma_flow *flow;
+ u32 flow_psn, i, tidlen = 0, pktlen, tlen;
+
+ req = ack_to_tid_req(e);
+
+ /* Validate the payload first */
+ flow = &req->flows[req->setup_head];
+
+ /* payload length = packet length - (header length + ICRC length) */
+ pktlen = packet->tlen - (packet->hlen + 4);
+ if (pktlen > sizeof(flow->tid_entry))
+ return 1;
+ memcpy(flow->tid_entry, packet->ebuf, pktlen);
+ flow->tidcnt = pktlen / sizeof(*flow->tid_entry);
+
+ /*
+ * Walk the TID_ENTRY list to make sure we have enough space for a
+ * complete segment. Also calculate the number of required packets.
+ */
+ flow->npkts = rvt_div_round_up_mtu(qp, len);
+ for (i = 0; i < flow->tidcnt; i++) {
+ trace_hfi1_tid_entry_rcv_read_req(qp, i,
+ flow->tid_entry[i]);
+ tlen = EXP_TID_GET(flow->tid_entry[i], LEN);
+ if (!tlen)
+ return 1;
+
+ /*
+ * For tid pair (tidctr == 3), the buffer size of the pair
+ * should be the sum of the buffer size described by each
+ * tid entry. However, only the first entry needs to be
+ * specified in the request (see WFR HAS Section 8.5.7.1).
+ */
+ tidlen += tlen;
+ }
+ /* tidlen is counted in pages; it must cover the requested length */
+ if (tidlen * PAGE_SIZE < len)
+ return 1;
+
+ /* Empty the flow array */
+ req->clear_tail = req->setup_head;
+ flow->pkt = 0;
+ flow->tid_idx = 0;
+ flow->tid_offset = 0;
+ flow->sent = 0;
+ flow->tid_qpn = be32_to_cpu(ohdr->u.tid_rdma.r_req.tid_flow_qp);
+ flow->idx = (flow->tid_qpn >> TID_RDMA_DESTQP_FLOW_SHIFT) &
+ TID_RDMA_DESTQP_FLOW_MASK;
+ /* Split the flow PSN from the header into generation and sequence */
+ flow_psn = mask_psn(be32_to_cpu(ohdr->u.tid_rdma.r_req.tid_flow_psn));
+ flow->flow_state.generation = flow_psn >> HFI1_KDETH_BTH_SEQ_SHIFT;
+ flow->flow_state.spsn = flow_psn & HFI1_KDETH_BTH_SEQ_MASK;
+ flow->length = len;
+
+ flow->flow_state.lpsn = flow->flow_state.spsn +
+ flow->npkts - 1;
+ flow->flow_state.ib_spsn = psn;
+ flow->flow_state.ib_lpsn = flow->flow_state.ib_spsn + flow->npkts - 1;
+
+ trace_hfi1_tid_flow_rcv_read_req(qp, req->setup_head, flow);
+ /* Set the initial flow index to the current flow. */
+ req->flow_idx = req->setup_head;
+
+ /* advance circular buffer head */
+ req->setup_head = (req->setup_head + 1) & (MAX_FLOWS - 1);
+
+ /*
+ * Compute last PSN for request.
+ */
+ e->opcode = (bth0 >> 24) & 0xff;
+ e->psn = psn;
+ e->lpsn = psn + flow->npkts - 1;
+ e->sent = 0;
+
+ req->n_flows = qpriv->tid_rdma.local.max_read;
+ req->state = TID_REQUEST_ACTIVE;
+ req->cur_seg = 0;
+ req->comp_seg = 0;
+ req->ack_seg = 0;
+ req->isge = 0;
+ req->seg_len = qpriv->tid_rdma.local.max_len;
+ req->total_len = len;
+ req->total_segs = 1;
+ req->r_flow_psn = e->psn;
+
+ trace_hfi1_tid_req_rcv_read_req(qp, 0, e->opcode, e->psn, e->lpsn,
+ req);
+ return 0;
+}
+
+/*
+ * Handle a TID RDMA request whose PSN does not match the expected
+ * qp->r_psn.
+ *
+ * @packet: the received packet
+ * @ohdr: the packet's IB header
+ * @qp: the QP the packet was received on
+ * @psn: the IB PSN of the packet
+ * @diff: difference between @psn and qp->r_psn
+ *
+ * diff > 0 is a sequence error: a PSN error NAK is queued unless a NAK is
+ * already pending.
+ * Otherwise the packet is a duplicate request. The matching ack queue entry
+ * is looked up; a duplicate TID RDMA READ request is re-validated and
+ * re-accepted, a duplicate TID RDMA WRITE request rewinds the flow state of
+ * this and the following requests so that the responses can be re-sent.
+ * Unless the request is dropped, the ack queue tail is moved back to the
+ * duplicate and the send engine is scheduled.
+ *
+ * Always returns 1.
+ */
+static int tid_rdma_rcv_error(struct hfi1_packet *packet,
+ struct ib_other_headers *ohdr,
+ struct rvt_qp *qp, u32 psn, int diff)
+{
+ struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
+ struct hfi1_ctxtdata *rcd = ((struct hfi1_qp_priv *)qp->priv)->rcd;
+ struct hfi1_ibdev *dev = to_idev(qp->ibqp.device);
+ struct hfi1_qp_priv *qpriv = qp->priv;
+ struct rvt_ack_entry *e;
+ struct tid_rdma_request *req;
+ unsigned long flags;
+ u8 prev;
+ bool old_req;
+
+ trace_hfi1_rsp_tid_rcv_error(qp, psn);
+ trace_hfi1_tid_rdma_rcv_err(qp, 0, psn, diff);
+ if (diff > 0) {
+ /* sequence error */
+ if (!qp->r_nak_state) {
+ ibp->rvp.n_rc_seqnak++;
+ qp->r_nak_state = IB_NAK_PSN_ERROR;
+ qp->r_ack_psn = qp->r_psn;
+ rc_defered_ack(rcd, qp);
+ }
+ goto done;
+ }
+
+ ibp->rvp.n_rc_dupreq++;
+
+ spin_lock_irqsave(&qp->s_lock, flags);
+ e = find_prev_entry(qp, psn, &prev, NULL, &old_req);
+ /* Only duplicates of TID RDMA READ/WRITE requests are handled here */
+ if (!e || (e->opcode != TID_OP(READ_REQ) &&
+ e->opcode != TID_OP(WRITE_REQ)))
+ goto unlock;
+
+ req = ack_to_tid_req(e);
+ req->r_flow_psn = psn;
+ trace_hfi1_tid_req_rcv_err(qp, 0, e->opcode, e->psn, e->lpsn, req);
+ if (e->opcode == TID_OP(READ_REQ)) {
+ struct ib_reth *reth;
+ u32 len;
+ u32 rkey;
+ u64 vaddr;
+ int ok;
+ u32 bth0;
+
+ reth = &ohdr->u.tid_rdma.r_req.reth;
+ /*
+ * The requester always restarts from the start of the original
+ * request.
+ */
+ len = be32_to_cpu(reth->length);
+ if (psn != e->psn || len != req->total_len)
+ goto unlock;
+
+ /* Drop the MR reference of the original request before re-checking */
+ if (e->rdma_sge.mr) {
+ rvt_put_mr(e->rdma_sge.mr);
+ e->rdma_sge.mr = NULL;
+ }
+
+ rkey = be32_to_cpu(reth->rkey);
+ vaddr = get_ib_reth_vaddr(reth);
+
+ qp->r_len = len;
+ ok = rvt_rkey_ok(qp, &e->rdma_sge, len, vaddr, rkey,
+ IB_ACCESS_REMOTE_READ);
+ if (unlikely(!ok))
+ goto unlock;
+
+ /*
+ * If all the response packets for the current request have
+ * been sent out and this request is complete (old_request
+ * == false) and the TID flow may be unusable (the
+ * req->clear_tail is advanced). However, when an earlier
+ * request is received, this request will not be complete any
+ * more (qp->s_tail_ack_queue is moved back, see below).
+ * Consequently, we need to update the TID flow info everytime
+ * a duplicate request is received.
+ */
+ bth0 = be32_to_cpu(ohdr->bth[0]);
+ if (tid_rdma_rcv_read_request(qp, e, packet, ohdr, bth0, psn,
+ vaddr, len))
+ goto unlock;
+
+ /*
+ * True if the request is already scheduled (between
+ * qp->s_tail_ack_queue and qp->r_head_ack_queue);
+ */
+ if (old_req)
+ goto unlock;
+ } else {
+ struct flow_state *fstate;
+ bool schedule = false;
+ u8 i;
+
+ if (req->state == TID_REQUEST_RESEND) {
+ req->state = TID_REQUEST_RESEND_ACTIVE;
+ } else if (req->state == TID_REQUEST_INIT_RESEND) {
+ req->state = TID_REQUEST_INIT;
+ schedule = true;
+ }
+
+ /*
+ * True if the request is already scheduled (between
+ * qp->s_tail_ack_queue and qp->r_head_ack_queue).
+ * Also, don't change requests, which are at the SYNC
+ * point and haven't generated any responses yet.
+ * There is nothing to retransmit for them yet.
+ */
+ if (old_req || req->state == TID_REQUEST_INIT ||
+ (req->state == TID_REQUEST_SYNC && !req->cur_seg)) {
+ for (i = prev + 1; ; i++) {
+ if (i > rvt_size_atomic(&dev->rdi))
+ i = 0;
+ if (i == qp->r_head_ack_queue)
+ break;
+ e = &qp->s_ack_queue[i];
+ req = ack_to_tid_req(e);
+ if (e->opcode == TID_OP(WRITE_REQ) &&
+ req->state == TID_REQUEST_INIT)
+ req->state = TID_REQUEST_INIT_RESEND;
+ }
+ /*
+ * If the state of the request has been changed,
+ * the first leg needs to get scheduled in order to
+ * pick up the change. Otherwise, normal response
+ * processing should take care of it.
+ */
+ if (!schedule)
+ goto unlock;
+ }
+
+ /*
+ * If there is no more allocated segment, just schedule the qp
+ * without changing any state.
+ */
+ if (req->clear_tail == req->setup_head)
+ goto schedule;
+ /*
+ * If this request has sent responses for segments, which have
+ * not received data yet (flow_idx != clear_tail), the flow_idx
+ * pointer needs to be adjusted so the same responses can be
+ * re-sent.
+ */
+ if (CIRC_CNT(req->flow_idx, req->clear_tail, MAX_FLOWS)) {
+ fstate = &req->flows[req->clear_tail].flow_state;
+ qpriv->pending_tid_w_segs -=
+ CIRC_CNT(req->flow_idx, req->clear_tail,
+ MAX_FLOWS);
+ req->flow_idx =
+ CIRC_ADD(req->clear_tail,
+ delta_psn(psn, fstate->resp_ib_psn),
+ MAX_FLOWS);
+ qpriv->pending_tid_w_segs +=
+ delta_psn(psn, fstate->resp_ib_psn);
+ /*
+ * When flow_idx == setup_head, we've gotten a duplicate
+ * request for a segment, which has not been allocated
+ * yet. In that case, don't adjust this request.
+ * However, we still want to go through the loop below
+ * to adjust all subsequent requests.
+ */
+ if (CIRC_CNT(req->setup_head, req->flow_idx,
+ MAX_FLOWS)) {
+ req->cur_seg = delta_psn(psn, e->psn);
+ req->state = TID_REQUEST_RESEND_ACTIVE;
+ }
+ }
+
+ for (i = prev + 1; ; i++) {
+ /*
+ * Look at everything up to and including
+ * s_tail_ack_queue
+ */
+ if (i > rvt_size_atomic(&dev->rdi))
+ i = 0;
+ if (i == qp->r_head_ack_queue)
+ break;
+ e = &qp->s_ack_queue[i];
+ req = ack_to_tid_req(e);
+ trace_hfi1_tid_req_rcv_err(qp, 0, e->opcode, e->psn,
+ e->lpsn, req);
+ if (e->opcode != TID_OP(WRITE_REQ) ||
+ req->cur_seg == req->comp_seg ||
+ req->state == TID_REQUEST_INIT ||
+ req->state == TID_REQUEST_INIT_RESEND) {
+ if (req->state == TID_REQUEST_INIT)
+ req->state = TID_REQUEST_INIT_RESEND;
+ continue;
+ }
+ /* Rewind this later request to its last completed segment */
+ qpriv->pending_tid_w_segs -=
+ CIRC_CNT(req->flow_idx,
+ req->clear_tail,
+ MAX_FLOWS);
+ req->flow_idx = req->clear_tail;
+ req->state = TID_REQUEST_RESEND;
+ req->cur_seg = req->comp_seg;
+ }
+ qpriv->s_flags &= ~HFI1_R_TID_WAIT_INTERLCK;
+ }
+ /* Re-process old requests.*/
+ if (qp->s_acked_ack_queue == qp->s_tail_ack_queue)
+ qp->s_acked_ack_queue = prev;
+ qp->s_tail_ack_queue = prev;
+ /*
+ * Since the qp->s_tail_ack_queue is modified, the
+ * qp->s_ack_state must be changed to re-initialize
+ * qp->s_ack_rdma_sge; Otherwise, we will end up in
+ * wrong memory region.
+ */
+ qp->s_ack_state = OP(ACKNOWLEDGE);
+schedule:
+ /*
+ * It's possible to receive a retry psn that is earlier than an RNRNAK
+ * psn. In this case, the rnrnak state should be cleared.
+ */
+ if (qpriv->rnr_nak_state) {
+ qp->s_nak_state = 0;
+ qpriv->rnr_nak_state = TID_RNR_NAK_INIT;
+ qp->r_psn = e->lpsn + 1;
+ hfi1_tid_write_alloc_resources(qp, true);
+ }
+
+ qp->r_state = e->opcode;
+ qp->r_nak_state = 0;
+ qp->s_flags |= RVT_S_RESP_PENDING;
+ hfi1_schedule_send(qp);
+unlock:
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+done:
+ return 1;
+}
+
+/*
+ * Receive handler for a TID RDMA READ REQ packet.
+ *
+ * A request that passes validation is placed in the QP's ack queue and the
+ * send engine is scheduled to generate the responses. A request with an
+ * unexpected PSN is handed to tid_rdma_rcv_error(). Invalid requests put
+ * the QP into error and a NAK is queued or sent.
+ */
+void hfi1_rc_rcv_tid_rdma_read_req(struct hfi1_packet *packet)
+{
+ /* HANDLER FOR TID RDMA READ REQUEST packet (Responder side)*/
+
+ /*
+ * 1. Verify TID RDMA READ REQ as per IB_OPCODE_RC_RDMA_READ
+ * (see hfi1_rc_rcv())
+ * 2. Put TID RDMA READ REQ into the response queue (s_ack_queue)
+ * - Setup struct tid_rdma_req with request info
+ * - Initialize struct tid_rdma_flow info;
+ * - Copy TID entries;
+ * 3. Set the qp->s_ack_state.
+ * 4. Set RVT_S_RESP_PENDING in s_flags.
+ * 5. Kick the send engine (hfi1_schedule_send())
+ */
+ struct hfi1_ctxtdata *rcd = packet->rcd;
+ struct rvt_qp *qp = packet->qp;
+ struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
+ struct ib_other_headers *ohdr = packet->ohdr;
+ struct rvt_ack_entry *e;
+ unsigned long flags;
+ struct ib_reth *reth;
+ struct hfi1_qp_priv *qpriv = qp->priv;
+ u32 bth0, psn, len, rkey;
+ bool is_fecn;
+ u8 next;
+ u64 vaddr;
+ int diff;
+ u8 nack_state = IB_NAK_INVALID_REQUEST;
+
+ bth0 = be32_to_cpu(ohdr->bth[0]);
+ if (hfi1_ruc_check_hdr(ibp, packet))
+ return;
+
+ is_fecn = process_ecn(qp, packet);
+ psn = mask_psn(be32_to_cpu(ohdr->bth[2]));
+ trace_hfi1_rsp_rcv_tid_read_req(qp, psn);
+
+ if (qp->state == IB_QPS_RTR && !(qp->r_flags & RVT_R_COMM_EST))
+ rvt_comm_est(qp);
+
+ if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_READ)))
+ goto nack_inv;
+
+ reth = &ohdr->u.tid_rdma.r_req.reth;
+ vaddr = be64_to_cpu(reth->vaddr);
+ len = be32_to_cpu(reth->length);
+ /* The length needs to be in multiples of PAGE_SIZE */
+ if (!len || len & ~PAGE_MASK || len > qpriv->tid_rdma.local.max_len)
+ goto nack_inv;
+
+ diff = delta_psn(psn, qp->r_psn);
+ if (unlikely(diff)) {
+ /* Unexpected PSN: sequence error or duplicate request */
+ if (tid_rdma_rcv_error(packet, ohdr, qp, psn, diff))
+ return;
+ goto send_ack;
+ }
+
+ /* We've verified the request, insert it into the ack queue. */
+ next = qp->r_head_ack_queue + 1;
+ if (next > rvt_size_atomic(ib_to_rvt(qp->ibqp.device)))
+ next = 0;
+ spin_lock_irqsave(&qp->s_lock, flags);
+ /* The ack queue is full: the tail entry can only be reused once sent */
+ if (unlikely(next == qp->s_tail_ack_queue)) {
+ if (!qp->s_ack_queue[next].sent) {
+ nack_state = IB_NAK_REMOTE_OPERATIONAL_ERROR;
+ goto nack_inv_unlock;
+ }
+ update_ack_queue(qp, next);
+ }
+ e = &qp->s_ack_queue[qp->r_head_ack_queue];
+ /* Release the MR still referenced by the entry being reused */
+ if (e->rdma_sge.mr) {
+ rvt_put_mr(e->rdma_sge.mr);
+ e->rdma_sge.mr = NULL;
+ }
+
+ rkey = be32_to_cpu(reth->rkey);
+ qp->r_len = len;
+
+ if (unlikely(!rvt_rkey_ok(qp, &e->rdma_sge, qp->r_len, vaddr,
+ rkey, IB_ACCESS_REMOTE_READ)))
+ goto nack_acc;
+
+ /* Accept the request parameters */
+ if (tid_rdma_rcv_read_request(qp, e, packet, ohdr, bth0, psn, vaddr,
+ len))
+ goto nack_inv_unlock;
+
+ qp->r_state = e->opcode;
+ qp->r_nak_state = 0;
+ /*
+ * We need to increment the MSN here instead of when we
+ * finish sending the result since a duplicate request would
+ * increment it more than once.
+ */
+ qp->r_msn++;
+ qp->r_psn += e->lpsn - e->psn + 1;
+
+ qp->r_head_ack_queue = next;
+
+ /*
+ * For all requests other than TID WRITE which are added to the ack
+ * queue, qpriv->r_tid_alloc follows qp->r_head_ack_queue. It is ok to
+ * do this because of interlocks between these and TID WRITE
+ * requests. The same change has also been made in hfi1_rc_rcv().
+ */
+ qpriv->r_tid_alloc = qp->r_head_ack_queue;
+
+ /* Schedule the send tasklet. */
+ qp->s_flags |= RVT_S_RESP_PENDING;
+ hfi1_schedule_send(qp);
+
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+ if (is_fecn)
+ goto send_ack;
+ return;
+
+nack_inv_unlock:
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+nack_inv:
+ rvt_rc_error(qp, IB_WC_LOC_QP_OP_ERR);
+ qp->r_nak_state = nack_state;
+ qp->r_ack_psn = qp->r_psn;
+ /* Queue NAK for later */
+ rc_defered_ack(rcd, qp);
+ return;
+nack_acc:
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+ rvt_rc_error(qp, IB_WC_LOC_PROT_ERR);
+ qp->r_nak_state = IB_NAK_REMOTE_ACCESS_ERROR;
+ qp->r_ack_psn = qp->r_psn;
+send_ack:
+ hfi1_send_rc_ack(packet, is_fecn);
+}
+
+/*
+ * Build the header of the next TID RDMA READ RESP packet for the flow at
+ * req->clear_tail of the request embedded in ack entry @e.
+ *
+ * @ohdr: header to fill in (ohdr->u.tid_rdma.r_rsp)
+ * @bth0: set to the TID RDMA READ RESP opcode
+ * @bth1: set to the flow's tid_qpn
+ * @bth2: set to the flow PSN (generation and sequence) of this packet
+ * @len: set to the payload length of this packet
+ * @last: set to true when this packet completes the segment
+ *
+ * Return the header size in 32-bit words, or 0 if the remote TID RDMA
+ * parameters are not available.
+ * NOTE(review): flow->sent has already been advanced when 0 is returned
+ * for a missing remote - confirm that callers tolerate this.
+ */
+u32 hfi1_build_tid_rdma_read_resp(struct rvt_qp *qp, struct rvt_ack_entry *e,
+ struct ib_other_headers *ohdr, u32 *bth0,
+ u32 *bth1, u32 *bth2, u32 *len, bool *last)
+{
+ struct hfi1_ack_priv *epriv = e->priv;
+ struct tid_rdma_request *req = &epriv->tid_req;
+ struct hfi1_qp_priv *qpriv = qp->priv;
+ struct tid_rdma_flow *flow = &req->flows[req->clear_tail];
+ u32 tidentry = flow->tid_entry[flow->tid_idx];
+ u32 tidlen = EXP_TID_GET(tidentry, LEN) << PAGE_SHIFT;
+ struct tid_rdma_read_resp *resp = &ohdr->u.tid_rdma.r_rsp;
+ u32 next_offset, om = KDETH_OM_LARGE;
+ bool last_pkt;
+ u32 hdwords = 0;
+ struct tid_rdma_params *remote;
+
+ /* One packet carries at most a PMTU, and never crosses a TID entry */
+ *len = min_t(u32, qp->pmtu, tidlen - flow->tid_offset);
+ flow->sent += *len;
+ next_offset = flow->tid_offset + *len;
+ last_pkt = (flow->sent >= flow->length);
+
+ trace_hfi1_tid_entry_build_read_resp(qp, flow->tid_idx, tidentry);
+ trace_hfi1_tid_flow_build_read_resp(qp, req->clear_tail, flow);
+
+ rcu_read_lock();
+ remote = rcu_dereference(qpriv->tid_rdma.remote);
+ if (!remote) {
+ rcu_read_unlock();
+ goto done;
+ }
+ KDETH_RESET(resp->kdeth0, KVER, 0x1);
+ KDETH_SET(resp->kdeth0, SH, !last_pkt);
+ KDETH_SET(resp->kdeth0, INTR, !!(!last_pkt && remote->urg));
+ KDETH_SET(resp->kdeth0, TIDCTRL, EXP_TID_GET(tidentry, CTRL));
+ KDETH_SET(resp->kdeth0, TID, EXP_TID_GET(tidentry, IDX));
+ KDETH_SET(resp->kdeth0, OM, om == KDETH_OM_LARGE);
+ KDETH_SET(resp->kdeth0, OFFSET, flow->tid_offset / om);
+ KDETH_RESET(resp->kdeth1, JKEY, remote->jkey);
+ resp->verbs_qp = cpu_to_be32(qp->remote_qpn);
+ rcu_read_unlock();
+
+ resp->aeth = rvt_compute_aeth(qp);
+ resp->verbs_psn = cpu_to_be32(mask_psn(flow->flow_state.ib_spsn +
+ flow->pkt));
+
+ *bth0 = TID_OP(READ_RESP) << 24;
+ *bth1 = flow->tid_qpn;
+ /* flow->pkt is advanced here, after being used for verbs_psn above */
+ *bth2 = mask_psn(((flow->flow_state.spsn + flow->pkt++) &
+ HFI1_KDETH_BTH_SEQ_MASK) |
+ (flow->flow_state.generation <<
+ HFI1_KDETH_BTH_SEQ_SHIFT));
+ *last = last_pkt;
+ if (last_pkt)
+ /* Advance to next flow */
+ req->clear_tail = (req->clear_tail + 1) &
+ (MAX_FLOWS - 1);
+
+ /* Move to the next TID entry once the current one is used up */
+ if (next_offset >= tidlen) {
+ flow->tid_offset = 0;
+ flow->tid_idx++;
+ } else {
+ flow->tid_offset = next_offset;
+ }
+
+ hdwords = sizeof(ohdr->u.tid_rdma.r_rsp) / sizeof(u32);
+
+done:
+ return hdwords;
+}
+
+/*
+ * Find the send WQE whose PSN range [psn, lpsn] contains @psn, scanning the
+ * send queue from qp->s_acked through qp->s_cur, and return its TID RDMA
+ * request.
+ *
+ * Return NULL if no WQE in that range contains @psn, or if the WQE that
+ * contains it does not have opcode @opcode.
+ */
+static inline struct tid_rdma_request *
+find_tid_request(struct rvt_qp *qp, u32 psn, enum ib_wr_opcode opcode)
+ __must_hold(&qp->s_lock)
+{
+ u32 stop = qp->s_cur + 1;
+ u32 idx = qp->s_acked;
+
+ /* The scan ends one slot past s_cur, wrapping at the queue size */
+ if (stop == qp->s_size)
+ stop = 0;
+
+ while (idx != stop) {
+ struct rvt_swqe *cur = rvt_get_swqe_ptr(qp, idx);
+
+ if (cmp_psn(psn, cur->psn) < 0 ||
+ cmp_psn(psn, cur->lpsn) > 0) {
+ idx++;
+ if (idx == qp->s_size)
+ idx = 0;
+ continue;
+ }
+ /* This WQE owns the PSN; only the opcode decides the result */
+ if (cur->wr.opcode != opcode)
+ return NULL;
+ return wqe_to_tid_req(cur);
+ }
+
+ return NULL;
+}
+
+/*
+ * Receive handler for a TID RDMA READ RESP packet.
+ *
+ * Only the packet carrying the last IB PSN of the flow at req->clear_tail
+ * is processed further; for it the pending counters are dropped, the TID
+ * resources of the segment are released and the response is acknowledged
+ * through do_rc_ack(). If no matching TID RDMA READ request is found, the
+ * QP may be put into the error state.
+ */
+void hfi1_rc_rcv_tid_rdma_read_resp(struct hfi1_packet *packet)
+{
+ /* HANDLER FOR TID RDMA READ RESPONSE packet (Requestor side) */
+
+ /*
+ * 1. Find matching SWQE
+ * 2. Check that the entire segment has been read.
+ * 3. Remove HFI1_S_WAIT_TID_RESP from s_flags.
+ * 4. Free the TID flow resources.
+ * 5. Kick the send engine (hfi1_schedule_send())
+ */
+ struct ib_other_headers *ohdr = packet->ohdr;
+ struct rvt_qp *qp = packet->qp;
+ struct hfi1_qp_priv *priv = qp->priv;
+ struct hfi1_ctxtdata *rcd = packet->rcd;
+ struct tid_rdma_request *req;
+ struct tid_rdma_flow *flow;
+ u32 opcode, aeth;
+ bool is_fecn;
+ unsigned long flags;
+ u32 kpsn, ipsn;
+
+ trace_hfi1_sender_rcv_tid_read_resp(qp);
+ is_fecn = process_ecn(qp, packet);
+ kpsn = mask_psn(be32_to_cpu(ohdr->bth[2]));
+ aeth = be32_to_cpu(ohdr->u.tid_rdma.r_rsp.aeth);
+ opcode = (be32_to_cpu(ohdr->bth[0]) >> 24) & 0xff;
+
+ spin_lock_irqsave(&qp->s_lock, flags);
+ /* The request is looked up by the IB PSN carried in the response */
+ ipsn = mask_psn(be32_to_cpu(ohdr->u.tid_rdma.r_rsp.verbs_psn));
+ req = find_tid_request(qp, ipsn, IB_WR_TID_RDMA_READ);
+ if (unlikely(!req))
+ goto ack_op_err;
+
+ flow = &req->flows[req->clear_tail];
+ /* When header suppression is disabled */
+ if (cmp_psn(ipsn, flow->flow_state.ib_lpsn))
+ goto ack_done;
+ req->ack_pending--;
+ priv->pending_tid_r_segs--;
+ qp->s_num_rd_atomic--;
+ if ((qp->s_flags & RVT_S_WAIT_FENCE) &&
+ !qp->s_num_rd_atomic) {
+ qp->s_flags &= ~(RVT_S_WAIT_FENCE |
+ RVT_S_WAIT_ACK);
+ hfi1_schedule_send(qp);
+ }
+ if (qp->s_flags & RVT_S_WAIT_RDMAR) {
+ qp->s_flags &= ~(RVT_S_WAIT_RDMAR | RVT_S_WAIT_ACK);
+ hfi1_schedule_send(qp);
+ }
+
+ trace_hfi1_ack(qp, ipsn);
+ trace_hfi1_tid_req_rcv_read_resp(qp, 0, req->e.swqe->wr.opcode,
+ req->e.swqe->psn, req->e.swqe->lpsn,
+ req);
+ trace_hfi1_tid_flow_rcv_read_resp(qp, req->clear_tail, flow);
+
+ /*
+ * Release the tid resources
+ * NOTE(review): the return value is ignored here - confirm that an
+ * empty flow buffer cannot occur at this point.
+ */
+ hfi1_kern_exp_rcv_clear(req);
+
+ if (!do_rc_ack(qp, aeth, ipsn, opcode, 0, rcd))
+ goto ack_done;
+
+ /* If not done yet, build next read request */
+ if (++req->comp_seg >= req->total_segs) {
+ priv->tid_r_comp++;
+ req->state = TID_REQUEST_COMPLETE;
+ }
+
+ /*
+ * Clear the hw flow under two conditions:
+ * 1. This request is a sync point and it is complete;
+ * 2. Current request is completed and there are no more requests.
+ */
+ if ((req->state == TID_REQUEST_SYNC &&
+ req->comp_seg == req->cur_seg) ||
+ priv->tid_r_comp == priv->tid_r_reqs) {
+ hfi1_kern_clear_hw_flow(priv->rcd, qp);
+ if (req->state == TID_REQUEST_SYNC)
+ req->state = TID_REQUEST_ACTIVE;
+ }
+
+ hfi1_schedule_send(qp);
+ goto ack_done;
+
+ack_op_err:
+ /*
+ * The test indicates that the send engine has finished its cleanup
+ * after sending the request and it's now safe to put the QP into error
+ * state. However, if the wqe queue is empty (qp->s_acked == qp->s_tail
+ * == qp->s_head), it would be unsafe to complete the wqe pointed by
+ * qp->s_acked here. Putting the qp into error state will safely flush
+ * all remaining requests.
+ */
+ if (qp->s_last == qp->s_acked)
+ rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR);
+
+ack_done:
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+ if (is_fecn)
+ hfi1_send_rc_ack(packet, is_fecn);
+}
+
+/*
+ * Release the TID resources of every TID RDMA READ request on the send
+ * queue between qp->s_acked and qp->s_tail, then clear the QP's hardware
+ * flow.
+ */
+void hfi1_kern_read_tid_flow_free(struct rvt_qp *qp)
+ __must_hold(&qp->s_lock)
+{
+ struct hfi1_qp_priv *qpriv = qp->priv;
+ u32 idx;
+
+ lockdep_assert_held(&qp->s_lock);
+ /* Free any TID entries */
+ for (idx = qp->s_acked; idx != qp->s_tail; ) {
+ struct rvt_swqe *cur = rvt_get_swqe_ptr(qp, idx);
+
+ if (cur->wr.opcode == IB_WR_TID_RDMA_READ)
+ hfi1_kern_exp_rcv_clear_all(wqe_to_tid_req(cur));
+
+ /* Advance with wrap-around at the send queue size */
+ idx++;
+ if (idx == qp->s_size)
+ idx = 0;
+ }
+ /* Free flow */
+ hfi1_kern_clear_hw_flow(qpriv->rcd, qp);
+}
+
+/*
+ * Handle a TID error reported for a received TID RDMA packet.
+ *
+ * Nothing is done for receive types of RHF_RCV_TYPE_IB and above. For an
+ * eager packet the outstanding request is restarted. For a TID RDMA READ
+ * response whose IB PSN lies strictly between qp->s_last_psn and qp->s_psn,
+ * the read TID resources are freed and the QP is put into error; other
+ * READ responses are ignored. For anything else the hardware flow and all
+ * TID RDMA WRITE request resources in the ack queue are released and the QP
+ * is put into error.
+ *
+ * Takes and releases qp->s_lock. @rcd is not used by this function.
+ *
+ * Always returns true.
+ */
+static bool tid_rdma_tid_err(struct hfi1_ctxtdata *rcd,
+ struct hfi1_packet *packet, u8 rcv_type,
+ u8 opcode)
+{
+ struct rvt_qp *qp = packet->qp;
+ struct hfi1_qp_priv *qpriv = qp->priv;
+ u32 ipsn;
+ struct ib_other_headers *ohdr = packet->ohdr;
+ struct rvt_ack_entry *e;
+ struct tid_rdma_request *req;
+ struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device);
+ u32 i;
+
+ if (rcv_type >= RHF_RCV_TYPE_IB)
+ goto done;
+
+ spin_lock(&qp->s_lock);
+
+ /*
+ * We've ran out of space in the eager buffer.
+ * Eagerly received KDETH packets which require space in the
+ * Eager buffer (packet that have payload) are TID RDMA WRITE
+ * response packets. In this case, we have to re-transmit the
+ * TID RDMA WRITE request.
+ */
+ if (rcv_type == RHF_RCV_TYPE_EAGER) {
+ hfi1_restart_rc(qp, qp->s_last_psn + 1, 1);
+ hfi1_schedule_send(qp);
+ goto done_unlock;
+ }
+
+ /*
+ * For TID READ response, error out QP after freeing the tid
+ * resources.
+ */
+ if (opcode == TID_OP(READ_RESP)) {
+ ipsn = mask_psn(be32_to_cpu(ohdr->u.tid_rdma.r_rsp.verbs_psn));
+ if (cmp_psn(ipsn, qp->s_last_psn) > 0 &&
+ cmp_psn(ipsn, qp->s_psn) < 0) {
+ hfi1_kern_read_tid_flow_free(qp);
+ /* s_lock is dropped before rvt_rc_error() is called */
+ spin_unlock(&qp->s_lock);
+ rvt_rc_error(qp, IB_WC_LOC_QP_OP_ERR);
+ goto done;
+ }
+ goto done_unlock;
+ }
+
+ /*
+ * Error out the qp for TID RDMA WRITE
+ */
+ hfi1_kern_clear_hw_flow(qpriv->rcd, qp);
+ for (i = 0; i < rvt_max_atomic(rdi); i++) {
+ e = &qp->s_ack_queue[i];
+ if (e->opcode == TID_OP(WRITE_REQ)) {
+ req = ack_to_tid_req(e);
+ hfi1_kern_exp_rcv_clear_all(req);
+ }
+ }
+ spin_unlock(&qp->s_lock);
+ rvt_rc_error(qp, IB_WC_LOC_LEN_ERR);
+ goto done;
+
+done_unlock:
+ spin_unlock(&qp->s_lock);
+done:
+ return true;
+}
+
+/*
+ * Restart a TID RDMA READ request from the first IB PSN of the flow at
+ * req->clear_tail, and put the QP on the receive context's wait list
+ * (taking a QP reference) if it is not on a response wait list already.
+ *
+ * RVT_R_RDMAR_SEQ is set in qp->r_flags.
+ */
+static void restart_tid_rdma_read_req(struct hfi1_ctxtdata *rcd,
+ struct rvt_qp *qp, struct rvt_swqe *wqe)
+{
+ struct tid_rdma_request *req;
+ struct tid_rdma_flow *flow;
+
+ /* Start from the right segment */
+ qp->r_flags |= RVT_R_RDMAR_SEQ;
+ req = wqe_to_tid_req(wqe);
+ flow = &req->flows[req->clear_tail];
+ hfi1_restart_rc(qp, flow->flow_state.ib_spsn, 0);
+ if (list_empty(&qp->rspwait)) {
+ qp->r_flags |= RVT_R_RSP_SEND;
+ rvt_get_qp(qp);
+ list_add_tail(&qp->rspwait, &rcd->qp_wait_list);
+ }
+}
+
+/*
+ * Handle the KDETH eflags for TID RDMA READ response.
+ *
+ * Return false if the last packet for a segment has been received and it is
+ * time to process the response normally; otherwise, return true.
+ *
+ * The caller must hold the packet->qp->r_lock and the rcu_read_lock.
+ * qp->s_lock is taken inside this function and released again on every
+ * exit path that is reached after it was taken.
+ */
+static bool handle_read_kdeth_eflags(struct hfi1_ctxtdata *rcd,
+ struct hfi1_packet *packet, u8 rcv_type,
+ u8 rte, u32 psn, u32 ibpsn)
+ __must_hold(&packet->qp->r_lock) __must_hold(RCU)
+{
+ struct hfi1_pportdata *ppd = rcd->ppd;
+ struct hfi1_devdata *dd = ppd->dd;
+ struct hfi1_ibport *ibp;
+ struct rvt_swqe *wqe;
+ struct tid_rdma_request *req;
+ struct tid_rdma_flow *flow;
+ u32 ack_psn;
+ struct rvt_qp *qp = packet->qp;
+ struct hfi1_qp_priv *priv = qp->priv;
+ bool ret = true;
+ int diff = 0;
+ u32 fpsn;
+
+ lockdep_assert_held(&qp->r_lock);
+ /* If the psn is out of valid range, drop the packet */
+ if (cmp_psn(ibpsn, qp->s_last_psn) < 0 ||
+ cmp_psn(ibpsn, qp->s_psn) > 0)
+ return ret; /* s_lock has not been taken yet */
+
+ spin_lock(&qp->s_lock);
+ /*
+ * Note that NAKs implicitly ACK outstanding SEND and RDMA write
+ * requests and implicitly NAK RDMA read and atomic requests issued
+ * before the NAK'ed request.
+ */
+ ack_psn = ibpsn - 1;
+ wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
+ ibp = to_iport(qp->ibqp.device, qp->port_num);
+
+ /* Complete WQEs that the PSN finishes. */
+ while ((int)delta_psn(ack_psn, wqe->lpsn) >= 0) {
+ /*
+ * If this request is a RDMA read or atomic, and the NACK is
+ * for a later operation, this NACK NAKs the RDMA read or
+ * atomic.
+ */
+ if (wqe->wr.opcode == IB_WR_RDMA_READ ||
+ wqe->wr.opcode == IB_WR_TID_RDMA_READ ||
+ wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
+ wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) {
+ /* Retry this request. */
+ if (!(qp->r_flags & RVT_R_RDMAR_SEQ)) {
+ qp->r_flags |= RVT_R_RDMAR_SEQ;
+ if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) {
+ restart_tid_rdma_read_req(rcd, qp,
+ wqe);
+ } else {
+ hfi1_restart_rc(qp, qp->s_last_psn + 1,
+ 0);
+ if (list_empty(&qp->rspwait)) {
+ qp->r_flags |= RVT_R_RSP_SEND;
+ rvt_get_qp(qp);
+ list_add_tail(/* wait */
+ &qp->rspwait,
+ &rcd->qp_wait_list);
+ }
+ }
+ }
+ /*
+ * No need to process the NAK since we are
+ * restarting an earlier request.
+ */
+ break;
+ }
+
+ wqe = do_rc_completion(qp, wqe, ibp);
+ if (qp->s_acked == qp->s_tail)
+ break;
+ }
+
+ /* Handle the eflags for the request */
+ if (wqe->wr.opcode != IB_WR_TID_RDMA_READ)
+ goto s_unlock;
+
+ req = wqe_to_tid_req(wqe);
+ switch (rcv_type) {
+ case RHF_RCV_TYPE_EXPECTED:
+ switch (rte) {
+ case RHF_RTE_EXPECTED_FLOW_SEQ_ERR:
+ /*
+ * On the first occurrence of a Flow Sequence error,
+ * the flag TID_FLOW_SW_PSN is set.
+ *
+ * After that, the flow is *not* reprogrammed and the
+ * protocol falls back to SW PSN checking. This is done
+ * to prevent continuous Flow Sequence errors for any
+ * packets that could be still in the fabric.
+ */
+ flow = find_flow(req, psn, NULL);
+ if (!flow) {
+ /*
+ * We can't find the IB PSN matching the
+ * received KDETH PSN. The only thing we can
+ * do at this point is report the error to
+ * the QP.
+ */
+ hfi1_kern_read_tid_flow_free(qp);
+ spin_unlock(&qp->s_lock);
+ rvt_rc_error(qp, IB_WC_LOC_QP_OP_ERR);
+ return ret;
+ }
+ if (priv->flow_state.flags & TID_FLOW_SW_PSN) {
+ diff = cmp_psn(psn,
+ priv->flow_state.r_next_psn);
+ if (diff > 0) {
+ if (!(qp->r_flags & RVT_R_RDMAR_SEQ))
+ restart_tid_rdma_read_req(rcd,
+ qp,
+ wqe);
+
+ /* Drop the packet.*/
+ goto s_unlock;
+ } else if (diff < 0) {
+ /*
+ * If a response packet for a restarted
+ * request has come back, reset the
+ * restart flag.
+ */
+ if (qp->r_flags & RVT_R_RDMAR_SEQ)
+ qp->r_flags &=
+ ~RVT_R_RDMAR_SEQ;
+
+ /* Drop the packet.*/
+ goto s_unlock;
+ }
+
+ /*
+ * If SW PSN verification is successful and
+ * this is the last packet in the segment, tell
+ * the caller to process it as a normal packet.
+ * This is the only place where the return
+ * value becomes false.
+ */
+ fpsn = full_flow_psn(flow,
+ flow->flow_state.lpsn);
+ if (cmp_psn(fpsn, psn) == 0) {
+ ret = false;
+ if (qp->r_flags & RVT_R_RDMAR_SEQ)
+ qp->r_flags &=
+ ~RVT_R_RDMAR_SEQ;
+ }
+ priv->flow_state.r_next_psn++;
+ } else {
+ u64 reg;
+ u32 last_psn;
+
+ /*
+ * The only sane way to get the amount of
+ * progress is to read the HW flow state.
+ */
+ reg = read_uctxt_csr(dd, rcd->ctxt,
+ RCV_TID_FLOW_TABLE +
+ (8 * flow->idx));
+ last_psn = mask_psn(reg);
+
+ priv->flow_state.r_next_psn = last_psn;
+ priv->flow_state.flags |= TID_FLOW_SW_PSN;
+ /*
+ * If no request has been restarted yet,
+ * restart the current one.
+ */
+ if (!(qp->r_flags & RVT_R_RDMAR_SEQ))
+ restart_tid_rdma_read_req(rcd, qp,
+ wqe);
+ }
+
+ break;
+
+ case RHF_RTE_EXPECTED_FLOW_GEN_ERR:
+ /*
+ * Since the TID flow is able to ride through
+ * generation mismatch, drop this stale packet.
+ */
+ break;
+
+ default:
+ break;
+ }
+ break;
+
+ case RHF_RCV_TYPE_ERROR:
+ switch (rte) {
+ case RHF_RTE_ERROR_OP_CODE_ERR:
+ case RHF_RTE_ERROR_KHDR_MIN_LEN_ERR:
+ case RHF_RTE_ERROR_KHDR_HCRC_ERR:
+ case RHF_RTE_ERROR_KHDR_KVER_ERR:
+ case RHF_RTE_ERROR_CONTEXT_ERR:
+ case RHF_RTE_ERROR_KHDR_TID_ERR:
+ default:
+ break;
+ } /* no break here: falls through into the outer default, which only breaks */
+ default:
+ break;
+ }
+s_unlock:
+ spin_unlock(&qp->s_lock);
+ return ret;
+}
+/*
+ * Handle a received packet whose RHF is being processed for KDETH errors.
+ *
+ * Packets flagged with VCRC/ICRC errors, or whose LID (lrh[1]) is in the
+ * multicast range, are ignored. Otherwise the destination QP is looked up
+ * under rcu_read_lock() and, with qp->r_lock held, TID RDMA READ responses
+ * are handed to handle_read_kdeth_eflags() while every other packet is
+ * checked against the TID RDMA WRITE flow at req->clear_tail with qp->s_lock
+ * held as well.
+ *
+ * Return false only when SW PSN verification accepted the last packet of a
+ * segment (the caller should then process it as a normal packet); return
+ * true in every other case.
+ */
+bool hfi1_handle_kdeth_eflags(struct hfi1_ctxtdata *rcd,
+ struct hfi1_pportdata *ppd,
+ struct hfi1_packet *packet)
+{
+ struct hfi1_ibport *ibp = &ppd->ibport_data;
+ struct hfi1_devdata *dd = ppd->dd;
+ struct rvt_dev_info *rdi = &dd->verbs_dev.rdi;
+ u8 rcv_type = rhf_rcv_type(packet->rhf);
+ u8 rte = rhf_rcv_type_err(packet->rhf);
+ struct ib_header *hdr = packet->hdr;
+ struct ib_other_headers *ohdr = NULL;
+ int lnh = be16_to_cpu(hdr->lrh[0]) & 3;
+ u16 lid = be16_to_cpu(hdr->lrh[1]);
+ u8 opcode;
+ u32 qp_num, psn, ibpsn;
+ struct rvt_qp *qp;
+ struct hfi1_qp_priv *qpriv;
+ unsigned long flags;
+ bool ret = true;
+ struct rvt_ack_entry *e;
+ struct tid_rdma_request *req;
+ struct tid_rdma_flow *flow;
+
+ trace_hfi1_msg_handle_kdeth_eflags(NULL, "Kdeth error: rhf ",
+ packet->rhf);
+ if (packet->rhf & (RHF_VCRC_ERR | RHF_ICRC_ERR))
+ return ret;
+
+ packet->ohdr = &hdr->u.oth;
+ ohdr = packet->ohdr;
+ trace_input_ibhdr(rcd->dd, packet, !!(rhf_dc_info(packet->rhf)));
+
+ /* Get the destination QP number. */
+ qp_num = be32_to_cpu(ohdr->u.tid_rdma.r_rsp.verbs_qp) &
+ RVT_QPN_MASK;
+ if (lid >= be16_to_cpu(IB_MULTICAST_LID_BASE))
+ goto drop;
+
+ psn = mask_psn(be32_to_cpu(ohdr->bth[2]));
+ opcode = (be32_to_cpu(ohdr->bth[0]) >> 24) & 0xff;
+
+ rcu_read_lock();
+ qp = rvt_lookup_qpn(rdi, &ibp->rvp, qp_num);
+ if (!qp)
+ goto rcu_unlock;
+
+ packet->qp = qp;
+
+ /* Check for valid receive state. */
+ spin_lock_irqsave(&qp->r_lock, flags);
+ if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) {
+ ibp->rvp.n_pkt_drops++;
+ goto r_unlock;
+ }
+
+ if (packet->rhf & RHF_TID_ERR) {
+ /* For TIDERR and RC QPs preemptively schedule a NAK */
+ u32 tlen = rhf_pkt_len(packet->rhf); /* in bytes */
+
+ /* Sanity check packet */
+ if (tlen < 24)
+ goto r_unlock;
+
+ /*
+ * Check for GRH. We should never get packets with GRH in this
+ * path.
+ */
+ if (lnh == HFI1_LRH_GRH)
+ goto r_unlock;
+
+ if (tid_rdma_tid_err(rcd, packet, rcv_type, opcode))
+ goto r_unlock;
+ }
+
+ /* handle TID RDMA READ */
+ if (opcode == TID_OP(READ_RESP)) {
+ ibpsn = be32_to_cpu(ohdr->u.tid_rdma.r_rsp.verbs_psn);
+ ibpsn = mask_psn(ibpsn);
+ ret = handle_read_kdeth_eflags(rcd, packet, rcv_type, rte, psn,
+ ibpsn);
+ goto r_unlock;
+ }
+
+ /*
+ * qp->s_tail_ack_queue points to the rvt_ack_entry currently being
+ * processed. These are completed sequentially so we can be sure that
+ * the pointer will not change until the entire request has completed.
+ */
+ spin_lock(&qp->s_lock);
+ qpriv = qp->priv;
+ e = &qp->s_ack_queue[qpriv->r_tid_tail];
+ req = ack_to_tid_req(e);
+ flow = &req->flows[req->clear_tail];
+ trace_hfi1_eflags_err_write(qp, rcv_type, rte, psn);
+ trace_hfi1_rsp_handle_kdeth_eflags(qp, psn);
+ trace_hfi1_tid_write_rsp_handle_kdeth_eflags(qp);
+ trace_hfi1_tid_req_handle_kdeth_eflags(qp, 0, e->opcode, e->psn,
+ e->lpsn, req);
+ trace_hfi1_tid_flow_handle_kdeth_eflags(qp, req->clear_tail, flow);
+
+ switch (rcv_type) {
+ case RHF_RCV_TYPE_EXPECTED:
+ switch (rte) {
+ case RHF_RTE_EXPECTED_FLOW_SEQ_ERR:
+ if (!(qpriv->s_flags & HFI1_R_TID_SW_PSN)) {
+ u64 reg;
+
+ qpriv->s_flags |= HFI1_R_TID_SW_PSN;
+ /*
+ * The only sane way to get the amount of
+ * progress is to read the HW flow state.
+ */
+ reg = read_uctxt_csr(dd, rcd->ctxt,
+ RCV_TID_FLOW_TABLE +
+ (8 * flow->idx));
+ flow->flow_state.r_next_psn = mask_psn(reg);
+ qpriv->r_next_psn_kdeth =
+ flow->flow_state.r_next_psn;
+ goto nak_psn;
+ } else {
+ /*
+ * If the received PSN does not match the next
+ * expected PSN, NAK the packet.
+ * However, only do that if we know that a
+ * NAK has already been sent. Otherwise, this
+ * mismatch could be due to packets that were
+ * already in flight.
+ */
+ if (psn != flow->flow_state.r_next_psn) {
+ psn = flow->flow_state.r_next_psn;
+ goto nak_psn;
+ }
+
+ qpriv->s_nak_state = 0;
+ /*
+ * If SW PSN verification is successful and this
+ * is the last packet in the segment, tell the
+ * caller to process it as a normal packet.
+ */
+ if (psn == full_flow_psn(flow,
+ flow->flow_state.lpsn))
+ ret = false;
+ qpriv->r_next_psn_kdeth =
+ ++flow->flow_state.r_next_psn;
+ }
+ break;
+
+ case RHF_RTE_EXPECTED_FLOW_GEN_ERR:
+ goto nak_psn;
+
+ default:
+ break;
+ }
+ break;
+
+ case RHF_RCV_TYPE_ERROR:
+ switch (rte) {
+ case RHF_RTE_ERROR_OP_CODE_ERR:
+ case RHF_RTE_ERROR_KHDR_MIN_LEN_ERR:
+ case RHF_RTE_ERROR_KHDR_HCRC_ERR:
+ case RHF_RTE_ERROR_KHDR_KVER_ERR:
+ case RHF_RTE_ERROR_CONTEXT_ERR:
+ case RHF_RTE_ERROR_KHDR_TID_ERR:
+ default:
+ break;
+ } /* no break here: falls through into the outer default, which only breaks */
+ default:
+ break;
+ }
+
+unlock:
+ spin_unlock(&qp->s_lock);
+r_unlock:
+ spin_unlock_irqrestore(&qp->r_lock, flags);
+rcu_unlock:
+ rcu_read_unlock();
+drop:
+ return ret;
+nak_psn: /* reached only via goto with s_lock held; rejoins the unlock path above */
+ ibp->rvp.n_rc_seqnak++;
+ if (!qpriv->s_nak_state) {
+ qpriv->s_nak_state = IB_NAK_PSN_ERROR;
+ /* We are NAK'ing the next expected PSN */
+ qpriv->s_nak_psn = mask_psn(flow->flow_state.r_next_psn);
+ qpriv->s_flags |= RVT_S_ACK_PENDING;
+ if (qpriv->r_tid_ack == HFI1_QP_WQE_INVALID)
+ qpriv->r_tid_ack = qpriv->r_tid_tail;
+ hfi1_schedule_tid_send(qp);
+ }
+ goto unlock;
+}
+
+/*
+ * "Rewind" the TID request information.
+ * This means that we reset the state back to ACTIVE,
+ * find the proper flow, set the flow index to that flow,
+ * and reset the flow information.
+ *
+ * @qp: the QP that owns @wqe
+ * @wqe: the TID RDMA request being restarted; IB_WR_TID_RDMA_READ takes the
+ *       first branch below, every other opcode the second
+ * @bth2: output; set to the masked PSN the restart begins at (qp->s_psn for
+ *        a READ, req->r_ack_psn otherwise)
+ *
+ * For a READ, if no flow matches the restart PSN the function only emits
+ * trace events and returns after setting *bth2, leaving the request as is.
+ */
+void hfi1_tid_rdma_restart_req(struct rvt_qp *qp, struct rvt_swqe *wqe,
+ u32 *bth2)
+{
+ struct tid_rdma_request *req = wqe_to_tid_req(wqe);
+ struct tid_rdma_flow *flow;
+ struct hfi1_qp_priv *qpriv = qp->priv;
+ int diff, delta_pkts;
+ u32 tididx = 0, i;
+ u16 fidx;
+
+ if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) {
+ *bth2 = mask_psn(qp->s_psn);
+ flow = find_flow_ib(req, *bth2, &fidx);
+ if (!flow) {
+ trace_hfi1_msg_tid_restart_req(/* msg */
+ qp, "!!!!!! Could not find flow to restart: bth2 ",
+ (u64)*bth2);
+ trace_hfi1_tid_req_restart_req(qp, 0, wqe->wr.opcode,
+ wqe->psn, wqe->lpsn,
+ req);
+ return;
+ }
+ } else {
+ fidx = req->acked_tail;
+ flow = &req->flows[fidx];
+ *bth2 = mask_psn(req->r_ack_psn);
+ }
+
+ if (wqe->wr.opcode == IB_WR_TID_RDMA_READ)
+ delta_pkts = delta_psn(*bth2, flow->flow_state.ib_spsn);
+ else
+ delta_pkts = delta_psn(*bth2,
+ full_flow_psn(flow,
+ flow->flow_state.spsn));
+
+ trace_hfi1_tid_flow_restart_req(qp, fidx, flow);
+ diff = delta_pkts + flow->resync_npkts; /* packets to skip over in this flow */
+
+ flow->sent = 0;
+ flow->pkt = 0;
+ flow->tid_idx = 0;
+ flow->tid_offset = 0;
+ if (diff) {
+ for (tididx = 0; tididx < flow->tidcnt; tididx++) {
+ u32 tidentry = flow->tid_entry[tididx], tidlen,
+ tidnpkts, npkts;
+
+ flow->tid_offset = 0;
+ tidlen = EXP_TID_GET(tidentry, LEN) * PAGE_SIZE;
+ tidnpkts = rvt_div_round_up_mtu(qp, tidlen);
+ npkts = min_t(u32, diff, tidnpkts);
+ flow->pkt += npkts;
+ flow->sent += (npkts == tidnpkts ? tidlen :
+ npkts * qp->pmtu);
+ flow->tid_offset += npkts * qp->pmtu;
+ diff -= npkts;
+ if (!diff)
+ break;
+ }
+ }
+ if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE) {
+ rvt_skip_sge(&qpriv->tid_ss, (req->cur_seg * req->seg_len) +
+ flow->sent, 0);
+ /*
+ * Packet PSN is based on flow_state.spsn + flow->pkt. However,
+ * during a RESYNC, the generation is incremented and the
+ * sequence is reset to 0. Since we've adjusted the npkts in the
+ * flow and the SGE has been sufficiently advanced, we have to
+ * adjust flow->pkt in order to calculate the correct PSN.
+ */
+ flow->pkt -= flow->resync_npkts;
+ }
+
+ if (flow->tid_offset ==
+ EXP_TID_GET(flow->tid_entry[tididx], LEN) * PAGE_SIZE) {
+ tididx++;
+ flow->tid_offset = 0;
+ }
+ flow->tid_idx = tididx;
+ if (wqe->wr.opcode == IB_WR_TID_RDMA_READ)
+ /* Move flow_idx to correct index */
+ req->flow_idx = fidx;
+ else
+ req->clear_tail = fidx;
+
+ trace_hfi1_tid_flow_restart_req(qp, fidx, flow);
+ trace_hfi1_tid_req_restart_req(qp, 0, wqe->wr.opcode, wqe->psn,
+ wqe->lpsn, req);
+ req->state = TID_REQUEST_ACTIVE;
+ if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE) {
+ /* Reset all the flows that we are going to resend */
+ fidx = CIRC_NEXT(fidx, MAX_FLOWS);
+ i = qpriv->s_tid_tail;
+ do {
+ for (; CIRC_CNT(req->setup_head, fidx, MAX_FLOWS);
+ fidx = CIRC_NEXT(fidx, MAX_FLOWS)) {
+ req->flows[fidx].sent = 0;
+ req->flows[fidx].pkt = 0;
+ req->flows[fidx].tid_idx = 0;
+ req->flows[fidx].tid_offset = 0;
+ req->flows[fidx].resync_npkts = 0;
+ }
+ if (i == qpriv->s_tid_cur)
+ break;
+ do {
+ i = (++i == qp->s_size ? 0 : i); /* advance i, wrapping to 0 at qp->s_size */
+ wqe = rvt_get_swqe_ptr(qp, i);
+ } while (wqe->wr.opcode != IB_WR_TID_RDMA_WRITE);
+ req = wqe_to_tid_req(wqe);
+ req->cur_seg = req->ack_seg;
+ fidx = req->acked_tail;
+ /* Pull req->clear_tail back */
+ req->clear_tail = fidx;
+ } while (1);
+ }
+}
+
+/*
+ * Release every locally allocated TID resource still attached to @qp.
+ *
+ * Does nothing unless the QP is of type RC and the TID_RDMA capability is
+ * set. Otherwise the hardware flow is cleared (when one is assigned), then
+ * the send queue from s_acked to s_head is walked for TID RDMA READ WQEs and
+ * the ack queue from s_acked_ack_queue to r_head_ack_queue for TID RDMA
+ * WRITE requests; hfi1_kern_exp_rcv_clear() is called on each such request
+ * until it returns non-zero.
+ */
+void hfi1_qp_kern_exp_rcv_clear_all(struct rvt_qp *qp)
+{
+	struct hfi1_qp_priv *qpriv = qp->priv;
+	int idx;
+
+	if (qp->ibqp.qp_type != IB_QPT_RC || !HFI1_CAP_IS_KSET(TID_RDMA))
+		return;
+
+	/*
+	 * First, clear the flow to help prevent any delayed packets from
+	 * being delivered.
+	 */
+	if (qpriv->flow_state.index != RXE_NUM_TID_FLOWS)
+		hfi1_kern_clear_hw_flow(qpriv->rcd, qp);
+
+	/* Send queue: requester-side TID RDMA READ requests */
+	idx = qp->s_acked;
+	while (idx != qp->s_head) {
+		struct rvt_swqe *wqe = rvt_get_swqe_ptr(qp, idx);
+
+		idx++;
+		if (idx == qp->s_size)
+			idx = 0;
+
+		/* Free only locally allocated TID entries */
+		if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) {
+			struct hfi1_swqe_priv *wpriv = wqe->priv;
+
+			while (!hfi1_kern_exp_rcv_clear(&wpriv->tid_req))
+				;
+		}
+	}
+
+	/* Ack queue: responder-side TID RDMA WRITE requests */
+	idx = qp->s_acked_ack_queue;
+	while (idx != qp->r_head_ack_queue) {
+		struct rvt_ack_entry *entry = &qp->s_ack_queue[idx];
+
+		idx++;
+		if (idx == rvt_max_atomic(ib_to_rvt(qp->ibqp.device)))
+			idx = 0;
+
+		/* Free only locally allocated TID entries */
+		if (entry->opcode == TID_OP(WRITE_REQ)) {
+			struct hfi1_ack_priv *apriv = entry->priv;
+
+			while (!hfi1_kern_exp_rcv_clear(&apriv->tid_req))
+				;
+		}
+	}
+}
+
+/*
+ * Decide whether @wqe has to wait behind the send WQE that precedes
+ * qp->s_cur.
+ *
+ * The decision depends on the opcode pair (@wqe, previous WQE):
+ *  - SEND*, ATOMIC* and RDMA WRITE wait while a preceding TID RDMA WRITE
+ *    still has unacknowledged segments (ack_seg != total_segs);
+ *  - RDMA READ is only considered when the previous WQE is a TID RDMA
+ *    WRITE, and then follows the same rule;
+ *  - TID RDMA READ waits behind a preceding RDMA READ while
+ *    qp->s_acked != qp->s_cur, or behind a TID RDMA WRITE with
+ *    unacknowledged segments.
+ *
+ * Return true, after setting HFI1_S_TID_WAIT_INTERLCK in the QP private
+ * s_flags, when @wqe must wait; return false otherwise.
+ */
+bool hfi1_tid_rdma_wqe_interlock(struct rvt_qp *qp, struct rvt_swqe *wqe)
+{
+	struct rvt_swqe *prev;
+	struct hfi1_qp_priv *priv = qp->priv;
+	u32 s_prev;
+	struct tid_rdma_request *req;
+
+	/* Index just before qp->s_cur, wrapping to the end of the ring */
+	s_prev = (qp->s_cur == 0 ? qp->s_size : qp->s_cur) - 1;
+	prev = rvt_get_swqe_ptr(qp, s_prev);
+
+	switch (wqe->wr.opcode) {
+	case IB_WR_SEND:
+	case IB_WR_SEND_WITH_IMM:
+	case IB_WR_SEND_WITH_INV:
+	case IB_WR_ATOMIC_CMP_AND_SWP:
+	case IB_WR_ATOMIC_FETCH_AND_ADD:
+	case IB_WR_RDMA_WRITE:
+		switch (prev->wr.opcode) {
+		case IB_WR_TID_RDMA_WRITE:
+			req = wqe_to_tid_req(prev);
+			if (req->ack_seg != req->total_segs)
+				goto interlock;
+			break;
+		default:
+			break;
+		}
+		break;
+	case IB_WR_RDMA_READ:
+		if (prev->wr.opcode != IB_WR_TID_RDMA_WRITE)
+			break;
+		/* fall through */
+	case IB_WR_TID_RDMA_READ:
+		switch (prev->wr.opcode) {
+		case IB_WR_RDMA_READ:
+			if (qp->s_acked != qp->s_cur)
+				goto interlock;
+			break;
+		case IB_WR_TID_RDMA_WRITE:
+			req = wqe_to_tid_req(prev);
+			if (req->ack_seg != req->total_segs)
+				goto interlock;
+			break;
+		default:
+			break;
+		}
+		break;
+	default:
+		break;
+	}
+	return false;
+
+interlock:
+	priv->s_flags |= HFI1_S_TID_WAIT_INTERLCK;
+	return true;
+}
+
+/*
+ * Does @sge meet the alignment requirements for tid rdma?
+ *
+ * Every one of the @num_sge entries starting at @sge must have both its
+ * vaddr and its sge_length free of bits outside PAGE_MASK. Each entry is
+ * traced before it is checked; the scan stops at the first failing entry.
+ *
+ * Return true when all entries pass (including when @num_sge <= 0).
+ */
+static inline bool hfi1_check_sge_align(struct rvt_qp *qp,
+					struct rvt_sge *sge, int num_sge)
+{
+	int idx;
+
+	for (idx = 0; idx < num_sge; idx++) {
+		struct rvt_sge *cur = &sge[idx];
+
+		trace_hfi1_sge_check_align(qp, idx, cur);
+		if ((u64)cur->vaddr & ~PAGE_MASK)
+			return false;
+		if (cur->sge_length & ~PAGE_MASK)
+			return false;
+	}
+
+	return true;
+}
+/*
+ * Convert an RDMA READ/WRITE work request into its TID RDMA counterpart
+ * when the request qualifies, and initialize the per-WQE TID request state.
+ *
+ * The WQE is left untouched when the remote DLID, masked by the port LMC,
+ * equals the local port LID, when the QP header type is not 9B, when no
+ * remote TID RDMA parameters are set, when the alignment checks below fail,
+ * or when hfi1_kern_exp_rcv_alloc_flows() fails.
+ *
+ * On conversion wqe->wr.opcode is rewritten, seg_len/total_segs and
+ * wqe->lpsn are computed, and the segment counters are reset.
+ */
+void setup_tid_rdma_wqe(struct rvt_qp *qp, struct rvt_swqe *wqe)
+{
+ struct hfi1_qp_priv *qpriv = (struct hfi1_qp_priv *)qp->priv;
+ struct hfi1_swqe_priv *priv = wqe->priv;
+ struct tid_rdma_params *remote;
+ enum ib_wr_opcode new_opcode;
+ bool do_tid_rdma = false;
+ struct hfi1_pportdata *ppd = qpriv->rcd->ppd;
+
+ if ((rdma_ah_get_dlid(&qp->remote_ah_attr) & ~((1 << ppd->lmc) - 1)) ==
+ ppd->lid)
+ return;
+ if (qpriv->hdr_type != HFI1_PKT_TYPE_9B)
+ return;
+
+ rcu_read_lock();
+ remote = rcu_dereference(qpriv->tid_rdma.remote);
+ /*
+ * If TID RDMA is disabled by the negotiation, don't
+ * use it.
+ */
+ if (!remote)
+ goto exit;
+
+ if (wqe->wr.opcode == IB_WR_RDMA_READ) {
+ if (hfi1_check_sge_align(qp, &wqe->sg_list[0],
+ wqe->wr.num_sge)) {
+ new_opcode = IB_WR_TID_RDMA_READ;
+ do_tid_rdma = true;
+ }
+ } else if (wqe->wr.opcode == IB_WR_RDMA_WRITE) {
+ /*
+ * TID RDMA is enabled for this RDMA WRITE request iff:
+ * 1. The remote address is page-aligned,
+ * 2. The length is larger than the minimum segment size,
+ * 3. The length is page-multiple.
+ *
+ * NOTE(review): the condition below only tests 1 and 3;
+ * nothing here compares the length against a minimum
+ * segment size. Confirm whether 2 is enforced elsewhere.
+ */
+ if (!(wqe->rdma_wr.remote_addr & ~PAGE_MASK) &&
+ !(wqe->length & ~PAGE_MASK)) {
+ new_opcode = IB_WR_TID_RDMA_WRITE;
+ do_tid_rdma = true;
+ }
+ }
+
+ if (do_tid_rdma) {
+ if (hfi1_kern_exp_rcv_alloc_flows(&priv->tid_req, GFP_ATOMIC))
+ goto exit;
+ wqe->wr.opcode = new_opcode;
+ priv->tid_req.seg_len =
+ min_t(u32, remote->max_len, wqe->length);
+ priv->tid_req.total_segs =
+ DIV_ROUND_UP(wqe->length, priv->tid_req.seg_len);
+ /*
+ * Compute the last PSN of the request: a READ spans one PSN
+ * per MTU-sized packet, a WRITE one PSN per segment.
+ */
+ wqe->lpsn = wqe->psn;
+ if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) {
+ priv->tid_req.n_flows = remote->max_read;
+ qpriv->tid_r_reqs++;
+ wqe->lpsn += rvt_div_round_up_mtu(qp, wqe->length) - 1;
+ } else {
+ wqe->lpsn += priv->tid_req.total_segs - 1;
+ atomic_inc(&qpriv->n_requests);
+ }
+
+ priv->tid_req.cur_seg = 0;
+ priv->tid_req.comp_seg = 0;
+ priv->tid_req.ack_seg = 0;
+ priv->tid_req.state = TID_REQUEST_INACTIVE;
+ /*
+ * Reset acked_tail.
+ * TID RDMA READ does not have ACKs so it does not
+ * update the pointer. We have to reset it so TID RDMA
+ * WRITE does not get confused.
+ */
+ priv->tid_req.acked_tail = priv->tid_req.setup_head;
+ trace_hfi1_tid_req_setup_tid_wqe(qp, 1, wqe->wr.opcode,
+ wqe->psn, wqe->lpsn,
+ &priv->tid_req);
+ }
+exit:
+ rcu_read_unlock();
+}
+
+/* TID RDMA WRITE functions */
+
+/*
+ * Fill in the TID RDMA WRITE REQ header for @wqe.
+ *
+ * @qp: the requester QP
+ * @wqe: the TID RDMA WRITE work request
+ * @ohdr: header to populate (u.tid_rdma.w_req)
+ * @bth1: in/out; its QPN field is replaced with the remote TID RDMA QP
+ * @bth2: in/out; IB_BTH_REQ_ACK is OR-ed in
+ * @len: in/out; on entry the number of bytes still to be requested, used
+ *       for the RETH length and to offset the remote address; set to 0 on
+ *       return
+ *
+ * The request is marked TID_REQUEST_ACTIVE with n_flows taken from the
+ * remote max_write parameter, qp->s_state becomes TID_OP(WRITE_REQ) and
+ * HFI1_S_WAIT_TID_RESP is set in qp->s_flags. The remote parameters are
+ * read under rcu_read_lock().
+ *
+ * Return: size of the w_req header in 32-bit words.
+ */
+u32 hfi1_build_tid_rdma_write_req(struct rvt_qp *qp, struct rvt_swqe *wqe,
+				  struct ib_other_headers *ohdr,
+				  u32 *bth1, u32 *bth2, u32 *len)
+{
+	struct tid_rdma_request *req = wqe_to_tid_req(wqe);
+	struct hfi1_qp_priv *qpriv = qp->priv;
+	struct tid_rdma_params *remote;
+	u32 done;
+
+	rcu_read_lock();
+	remote = rcu_dereference(qpriv->tid_rdma.remote);
+
+	/*
+	 * Set the number of flow to be used based on negotiated
+	 * parameters.
+	 */
+	req->n_flows = remote->max_write;
+	req->state = TID_REQUEST_ACTIVE;
+
+	/* KDETH fields */
+	KDETH_RESET(ohdr->u.tid_rdma.w_req.kdeth0, KVER, 0x1);
+	KDETH_RESET(ohdr->u.tid_rdma.w_req.kdeth1, JKEY, remote->jkey);
+
+	/* RETH: remote address advanced past what was already requested */
+	done = wqe->length - *len;
+	ohdr->u.tid_rdma.w_req.reth.vaddr =
+		cpu_to_be64(wqe->rdma_wr.remote_addr + done);
+	ohdr->u.tid_rdma.w_req.reth.rkey = cpu_to_be32(wqe->rdma_wr.rkey);
+	ohdr->u.tid_rdma.w_req.reth.length = cpu_to_be32(*len);
+	ohdr->u.tid_rdma.w_req.verbs_qp = cpu_to_be32(qp->remote_qpn);
+
+	/* BTH: swap in the remote TID RDMA QP and ask for an ACK */
+	*bth1 = (*bth1 & ~RVT_QPN_MASK) | remote->qp;
+	qp->s_state = TID_OP(WRITE_REQ);
+	qp->s_flags |= HFI1_S_WAIT_TID_RESP;
+	*bth2 |= IB_BTH_REQ_ACK;
+	*len = 0;
+
+	rcu_read_unlock();
+	return sizeof(ohdr->u.tid_rdma.w_req) / sizeof(u32);
+}
+
+void hfi1_compute_tid_rdma_flow_wt(void)
+{
+ /*
+ * Heuristic for computing the RNR timeout when waiting on the flow
+ * queue. Rather than a computationally expensive exact estimate of when
+ * a flow will be available, we assume that if a QP is at position N in
+ * the flow queue it has to wait approximately (N + 1) * (number of
+ * segments between two sync points), assuming PMTU of 4K. The rationale
+ * for this is that flows are released and recycled at each sync point.
+ *
+ * The value stored in tid_rdma_flow_wt is the per-position weight:
+ * MAX_TID_FLOW_PSN packets of 4K each, expressed in units of
+ * TID_RDMA_MAX_SEGMENT_SIZE.
+ */
+ tid_rdma_flow_wt = MAX_TID_FLOW_PSN * enum_to_mtu(OPA_MTU_4096) /
+ TID_RDMA_MAX_SEGMENT_SIZE;
+}
+
+/*
+ * Return qpriv->tid_enqueue minus queue->dequeue, which callers use as the
+ * QP's position in @queue when estimating an RNR timeout.
+ */
+static u32 position_in_queue(struct hfi1_qp_priv *qpriv,
+			     struct tid_queue *queue)
+{
+	u32 ahead = qpriv->tid_enqueue - queue->dequeue;
+
+	return ahead;
+}
+
+/*
+ * Map a wait expressed in segments onto an RNR timeout table index.
+ *
+ * @qp: points to rvt_qp context.
+ * @to_seg: desired RNR timeout in segments.
+ *
+ * The wait is converted to microseconds from the port's active egress rate
+ * (divided by 8 to get bytes per microsecond) and
+ * TID_RDMA_MAX_SEGMENT_SIZE bytes per segment.
+ *
+ * Return: index of the next highest timeout in the ib_hfi1_rnr_table[],
+ * searching indices 1..IB_AETH_CREDIT_MASK; 0 if none of them is large
+ * enough.
+ */
+static u32 hfi1_compute_tid_rnr_timeout(struct rvt_qp *qp, u32 to_seg)
+{
+	struct hfi1_qp_priv *qpriv = qp->priv;
+	u32 bytes_per_us = active_egress_rate(qpriv->rcd->ppd) / 8;
+	u64 timeout = (to_seg * TID_RDMA_MAX_SEGMENT_SIZE) / bytes_per_us;
+	u8 idx;
+
+	/*
+	 * Find the next highest value in the RNR table to the required
+	 * timeout. This gives the responder some padding.
+	 */
+	for (idx = 1; idx <= IB_AETH_CREDIT_MASK; idx++) {
+		if (rvt_rnr_tbl_to_usec(idx) >= timeout)
+			return idx;
+	}
+
+	return 0;
+}
+
+/**
+ * Central place for resource allocation at TID write responder,
+ * is called from write_req and write_data interrupt handlers as
+ * well as the send thread when a queued QP is scheduled for
+ * resource allocation.
+ *
+ * Iterates over (a) segments of a request and then (b) queued requests
+ * themselves to allocate resources for up to local->max_write
+ * segments across multiple requests. Stop allocating when we
+ * hit a sync point, resume allocating after data packets at
+ * sync point have been received.
+ *
+ * Resource allocation and sending of responses is decoupled. The
+ * request/segment which are being allocated and sent are as follows.
+ * Resources are allocated for:
+ * [request: qpriv->r_tid_alloc, segment: req->alloc_seg]
+ * The send thread sends:
+ * [request: qp->s_tail_ack_queue, segment:req->cur_seg]
+ *
+ * @qp: the responder QP; the caller must hold qp->s_lock
+ * @intr_ctx: true when called from the receive interrupt handlers. Only
+ * then may an RNR NAK be scheduled, and that path additionally
+ * requires qp->r_lock to be held.
+ */
+static void hfi1_tid_write_alloc_resources(struct rvt_qp *qp, bool intr_ctx)
+{
+ struct tid_rdma_request *req;
+ struct hfi1_qp_priv *qpriv = qp->priv;
+ struct hfi1_ctxtdata *rcd = qpriv->rcd;
+ struct tid_rdma_params *local = &qpriv->tid_rdma.local;
+ struct rvt_ack_entry *e;
+ u32 npkts, to_seg;
+ bool last;
+ int ret = 0;
+
+ lockdep_assert_held(&qp->s_lock);
+
+ while (1) {
+ trace_hfi1_rsp_tid_write_alloc_res(qp, 0);
+ trace_hfi1_tid_write_rsp_alloc_res(qp);
+ /*
+ * Don't allocate more segments if a RNR NAK has already been
+ * scheduled to avoid messing up qp->r_psn: the RNR NAK will
+ * be sent only when all allocated segments have been sent.
+ * However, if more segments are allocated before that, TID RDMA
+ * WRITE RESP packets will be sent out for these new segments
+ * before the RNR NAK packet. When the requester receives the
+ * RNR NAK packet, it will restart with qp->s_last_psn + 1,
+ * which does not match qp->r_psn and will be dropped.
+ * Consequently, the requester will exhaust its retries and
+ * put the qp into error state.
+ */
+ if (qpriv->rnr_nak_state == TID_RNR_NAK_SEND)
+ break;
+
+ /* No requests left to process */
+ if (qpriv->r_tid_alloc == qpriv->r_tid_head) {
+ /* If all data has been received, clear the flow */
+ if (qpriv->flow_state.index < RXE_NUM_TID_FLOWS &&
+ !qpriv->alloc_w_segs)
+ hfi1_kern_clear_hw_flow(rcd, qp);
+ break;
+ }
+
+ e = &qp->s_ack_queue[qpriv->r_tid_alloc];
+ if (e->opcode != TID_OP(WRITE_REQ))
+ goto next_req;
+ req = ack_to_tid_req(e);
+ trace_hfi1_tid_req_write_alloc_res(qp, 0, e->opcode, e->psn,
+ e->lpsn, req);
+ /* Finished allocating for all segments of this request */
+ if (req->alloc_seg >= req->total_segs)
+ goto next_req;
+
+ /* Can allocate only a maximum of local->max_write for a QP */
+ if (qpriv->alloc_w_segs >= local->max_write)
+ break;
+
+ /* Don't allocate at a sync point with data packets pending */
+ if (qpriv->sync_pt && qpriv->alloc_w_segs)
+ break;
+
+ /* All data received at the sync point, continue */
+ if (qpriv->sync_pt && !qpriv->alloc_w_segs) {
+ hfi1_kern_clear_hw_flow(rcd, qp);
+ qpriv->sync_pt = false;
+ if (qpriv->s_flags & HFI1_R_TID_SW_PSN)
+ qpriv->s_flags &= ~HFI1_R_TID_SW_PSN;
+ }
+
+ /* Allocate flow if we don't have one */
+ if (qpriv->flow_state.index >= RXE_NUM_TID_FLOWS) {
+ ret = hfi1_kern_setup_hw_flow(qpriv->rcd, qp);
+ if (ret) {
+ to_seg = tid_rdma_flow_wt *
+ position_in_queue(qpriv,
+ &rcd->flow_queue);
+ break;
+ }
+ }
+
+ npkts = rvt_div_round_up_mtu(qp, req->seg_len);
+
+ /*
+ * We are at a sync point if we run out of KDETH PSN space.
+ * Last PSN of every generation is reserved for RESYNC.
+ */
+ if (qpriv->flow_state.psn + npkts > MAX_TID_FLOW_PSN - 1) {
+ qpriv->sync_pt = true;
+ break;
+ }
+
+ /*
+ * If overtaking req->acked_tail, send an RNR NAK. Because the
+ * QP is not queued in this case, and the issue can only be
+ * caused by a delay in scheduling the second leg which we
+ * cannot estimate, we use a rather arbitrary RNR timeout of
+ * (MAX_FLOWS / 2) segments
+ */
+ if (!CIRC_SPACE(req->setup_head, req->acked_tail,
+ MAX_FLOWS)) {
+ ret = -EAGAIN;
+ to_seg = MAX_FLOWS >> 1;
+ qpriv->s_flags |= RVT_S_ACK_PENDING;
+ hfi1_schedule_tid_send(qp);
+ break;
+ }
+
+ /* Try to allocate rcv array / TID entries */
+ ret = hfi1_kern_exp_rcv_setup(req, &req->ss, &last);
+ if (ret == -EAGAIN)
+ to_seg = position_in_queue(qpriv, &rcd->rarr_queue);
+ if (ret)
+ break;
+
+ qpriv->alloc_w_segs++;
+ req->alloc_seg++;
+ continue;
+next_req:
+ /* Begin processing the next request */
+ if (++qpriv->r_tid_alloc >
+ rvt_size_atomic(ib_to_rvt(qp->ibqp.device)))
+ qpriv->r_tid_alloc = 0;
+ }
+
+ /*
+ * Schedule an RNR NAK to be sent if (a) flow or rcv array allocation
+ * has failed (b) we are called from the rcv handler interrupt context
+ * (c) an RNR NAK has not already been scheduled
+ *
+ * Every path that leaves the loop with ret == -EAGAIN does so after
+ * e and req were assigned in that iteration, and the visible ones
+ * also set to_seg first.
+ */
+ if (ret == -EAGAIN && intr_ctx && !qp->r_nak_state)
+ goto send_rnr_nak;
+
+ return;
+
+send_rnr_nak:
+ lockdep_assert_held(&qp->r_lock);
+
+ /* Set r_nak_state to prevent unrelated events from generating NAK's */
+ qp->r_nak_state = hfi1_compute_tid_rnr_timeout(qp, to_seg) | IB_RNR_NAK;
+
+ /* Pull back r_psn to the segment being RNR NAK'd */
+ qp->r_psn = e->psn + req->alloc_seg;
+ qp->r_ack_psn = qp->r_psn;
+ /*
+ * Pull back r_head_ack_queue to the ack entry following the request
+ * being RNR NAK'd. This allows resources to be allocated to the request
+ * if the queued QP is scheduled.
+ */
+ qp->r_head_ack_queue = qpriv->r_tid_alloc + 1;
+ if (qp->r_head_ack_queue > rvt_size_atomic(ib_to_rvt(qp->ibqp.device)))
+ qp->r_head_ack_queue = 0;
+ qpriv->r_tid_head = qp->r_head_ack_queue;
+ /*
+ * These send side fields are used in make_rc_ack(). They are set in
+ * hfi1_send_rc_ack() but must be set here before dropping qp->s_lock
+ * for consistency
+ */
+ qp->s_nak_state = qp->r_nak_state;
+ qp->s_ack_psn = qp->r_ack_psn;
+ /*
+ * Clear the ACK PENDING flag to prevent unwanted ACK because we
+ * have modified qp->s_ack_psn here.
+ */
+ qp->s_flags &= ~(RVT_S_ACK_PENDING);
+
+ trace_hfi1_rsp_tid_write_alloc_res(qp, qp->r_psn);
+ /*
+ * qpriv->rnr_nak_state is used to determine when the scheduled RNR NAK
+ * has actually been sent. qp->s_flags RVT_S_ACK_PENDING bit cannot be
+ * used for this because qp->s_lock is dropped before calling
+ * hfi1_send_rc_ack() leading to inconsistency between the receive
+ * interrupt handlers and the send thread in make_rc_ack()
+ */
+ qpriv->rnr_nak_state = TID_RNR_NAK_SEND;
+
+ /*
+ * Schedule RNR NAK to be sent. RNR NAK's are scheduled from the receive
+ * interrupt handlers but will be sent from the send engine behind any
+ * previous responses that may have been scheduled
+ */
+ rc_defered_ack(rcd, qp);
+}
+
+void hfi1_rc_rcv_tid_rdma_write_req(struct hfi1_packet *packet)
+{
+ /* HANDLER FOR TID RDMA WRITE REQUEST packet (Responder side)*/
+
+ /*
+ * 1. Verify TID RDMA WRITE REQ as per IB_OPCODE_RC_RDMA_WRITE_FIRST
+ * (see hfi1_rc_rcv())
+ * - Don't allow 0-length requests.
+ * 2. Put TID RDMA WRITE REQ into the response queue (s_ack_queue)
+ * - Setup struct tid_rdma_req with request info
+ * - Prepare struct tid_rdma_flow array?
+ * 3. Set the qp->s_ack_state as state diagram in design doc.
+ * 4. Set RVT_S_RESP_PENDING in s_flags.
+ * 5. Kick the send engine (hfi1_schedule_send())
+ *
+ * Exit paths: a header check failure returns silently; a PSN
+ * mismatch is handed to tid_rdma_rcv_error(); invalid requests put
+ * the QP into error and queue a NAK (nack_inv); an rkey failure
+ * puts the QP into error and sends a NAK immediately (nack_acc).
+ * qp->s_lock is taken with irqsave around the ack queue update and
+ * released on each of those paths.
+ */
+ struct hfi1_ctxtdata *rcd = packet->rcd;
+ struct rvt_qp *qp = packet->qp;
+ struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
+ struct ib_other_headers *ohdr = packet->ohdr;
+ struct rvt_ack_entry *e;
+ unsigned long flags;
+ struct ib_reth *reth;
+ struct hfi1_qp_priv *qpriv = qp->priv;
+ struct tid_rdma_request *req;
+ u32 bth0, psn, len, rkey, num_segs;
+ bool is_fecn;
+ u8 next;
+ u64 vaddr;
+ int diff;
+
+ bth0 = be32_to_cpu(ohdr->bth[0]);
+ if (hfi1_ruc_check_hdr(ibp, packet))
+ return;
+
+ is_fecn = process_ecn(qp, packet);
+ psn = mask_psn(be32_to_cpu(ohdr->bth[2]));
+ trace_hfi1_rsp_rcv_tid_write_req(qp, psn);
+
+ if (qp->state == IB_QPS_RTR && !(qp->r_flags & RVT_R_COMM_EST))
+ rvt_comm_est(qp);
+
+ if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE)))
+ goto nack_inv;
+
+ reth = &ohdr->u.tid_rdma.w_req.reth;
+ vaddr = be64_to_cpu(reth->vaddr);
+ len = be32_to_cpu(reth->length);
+
+ num_segs = DIV_ROUND_UP(len, qpriv->tid_rdma.local.max_len);
+ diff = delta_psn(psn, qp->r_psn);
+ if (unlikely(diff)) {
+ if (tid_rdma_rcv_error(packet, ohdr, qp, psn, diff))
+ return;
+ goto send_ack;
+ }
+
+ /*
+ * The resent request which was previously RNR NAK'd is inserted at the
+ * location of the original request, which is one entry behind
+ * r_head_ack_queue
+ */
+ if (qpriv->rnr_nak_state)
+ qp->r_head_ack_queue = qp->r_head_ack_queue ?
+ qp->r_head_ack_queue - 1 :
+ rvt_size_atomic(ib_to_rvt(qp->ibqp.device));
+
+ /* We've verified the request, insert it into the ack queue. */
+ next = qp->r_head_ack_queue + 1;
+ if (next > rvt_size_atomic(ib_to_rvt(qp->ibqp.device)))
+ next = 0;
+ spin_lock_irqsave(&qp->s_lock, flags);
+ if (unlikely(next == qp->s_acked_ack_queue)) {
+ if (!qp->s_ack_queue[next].sent)
+ goto nack_inv_unlock;
+ update_ack_queue(qp, next);
+ }
+ e = &qp->s_ack_queue[qp->r_head_ack_queue];
+ req = ack_to_tid_req(e);
+
+ /* Bring previously RNR NAK'd request back to life */
+ if (qpriv->rnr_nak_state) {
+ qp->r_nak_state = 0;
+ qp->s_nak_state = 0;
+ qpriv->rnr_nak_state = TID_RNR_NAK_INIT;
+ qp->r_psn = e->lpsn + 1;
+ req->state = TID_REQUEST_INIT;
+ goto update_head;
+ }
+
+ if (e->rdma_sge.mr) {
+ rvt_put_mr(e->rdma_sge.mr);
+ e->rdma_sge.mr = NULL;
+ }
+
+ /* The length needs to be in multiples of PAGE_SIZE */
+ if (!len || len & ~PAGE_MASK)
+ goto nack_inv_unlock;
+
+ rkey = be32_to_cpu(reth->rkey);
+ qp->r_len = len;
+
+ if (e->opcode == TID_OP(WRITE_REQ) &&
+ (req->setup_head != req->clear_tail ||
+ req->clear_tail != req->acked_tail))
+ goto nack_inv_unlock;
+
+ if (unlikely(!rvt_rkey_ok(qp, &e->rdma_sge, qp->r_len, vaddr,
+ rkey, IB_ACCESS_REMOTE_WRITE)))
+ goto nack_acc;
+
+ qp->r_psn += num_segs - 1; /* one PSN per segment; e->lpsn is the last of them */
+
+ e->opcode = (bth0 >> 24) & 0xff;
+ e->psn = psn;
+ e->lpsn = qp->r_psn;
+ e->sent = 0;
+
+ req->n_flows = min_t(u16, num_segs, qpriv->tid_rdma.local.max_write);
+ req->state = TID_REQUEST_INIT;
+ req->cur_seg = 0;
+ req->comp_seg = 0;
+ req->ack_seg = 0;
+ req->alloc_seg = 0;
+ req->isge = 0;
+ req->seg_len = qpriv->tid_rdma.local.max_len;
+ req->total_len = len;
+ req->total_segs = num_segs;
+ req->r_flow_psn = e->psn;
+ req->ss.sge = e->rdma_sge;
+ req->ss.num_sge = 1;
+
+ req->flow_idx = req->setup_head;
+ req->clear_tail = req->setup_head;
+ req->acked_tail = req->setup_head;
+
+ qp->r_state = e->opcode;
+ qp->r_nak_state = 0;
+ /*
+ * We need to increment the MSN here instead of when we
+ * finish sending the result since a duplicate request would
+ * increment it more than once.
+ */
+ qp->r_msn++;
+ qp->r_psn++;
+
+ trace_hfi1_tid_req_rcv_write_req(qp, 0, e->opcode, e->psn, e->lpsn,
+ req);
+
+ if (qpriv->r_tid_tail == HFI1_QP_WQE_INVALID) {
+ qpriv->r_tid_tail = qp->r_head_ack_queue;
+ } else if (qpriv->r_tid_tail == qpriv->r_tid_head) {
+ struct tid_rdma_request *ptr;
+
+ e = &qp->s_ack_queue[qpriv->r_tid_tail];
+ ptr = ack_to_tid_req(e);
+
+ if (e->opcode != TID_OP(WRITE_REQ) ||
+ ptr->comp_seg == ptr->total_segs) {
+ if (qpriv->r_tid_tail == qpriv->r_tid_ack)
+ qpriv->r_tid_ack = qp->r_head_ack_queue;
+ qpriv->r_tid_tail = qp->r_head_ack_queue;
+ }
+ }
+update_head:
+ qp->r_head_ack_queue = next;
+ qpriv->r_tid_head = qp->r_head_ack_queue;
+
+ hfi1_tid_write_alloc_resources(qp, true);
+ trace_hfi1_tid_write_rsp_rcv_req(qp);
+
+ /* Schedule the send tasklet. */
+ qp->s_flags |= RVT_S_RESP_PENDING;
+ hfi1_schedule_send(qp);
+
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+ if (is_fecn)
+ goto send_ack;
+ return;
+
+nack_inv_unlock:
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+nack_inv:
+ rvt_rc_error(qp, IB_WC_LOC_QP_OP_ERR);
+ qp->r_nak_state = IB_NAK_INVALID_REQUEST;
+ qp->r_ack_psn = qp->r_psn;
+ /* Queue NAK for later */
+ rc_defered_ack(rcd, qp);
+ return;
+nack_acc:
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+ rvt_rc_error(qp, IB_WC_LOC_PROT_ERR);
+ qp->r_nak_state = IB_NAK_REMOTE_ACCESS_ERROR;
+ qp->r_ack_psn = qp->r_psn;
+send_ack:
+ hfi1_send_rc_ack(packet, is_fecn);
+}
+/*
+ * hfi1_build_tid_rdma_write_resp - build a TID RDMA WRITE RESP packet
+ * @qp: the QP the response is built for
+ * @e: ack queue entry; its private data holds the TID RDMA request and
+ *     the SGE state used for the payload
+ * @ohdr: header being built; its w_rsp member is zeroed and filled in
+ * @bth1: set to the qp field of the remote TID RDMA parameters
+ * @bth2: stored as the flow's resp_ib_psn
+ * @len: set to the payload length (size of the flow's TID entry array)
+ * @ss: set to point at the SGE state describing the payload
+ *
+ * The payload is the flow's tid_entry array, described by a single SGE
+ * without an MR. The flow used is the one at req->flow_idx on entry;
+ * flow_idx is advanced before the packet is filled in.
+ *
+ * Return: size of the w_rsp header in 32-bit words, or 0 if no response
+ * was built (every allocated segment has already been sent, or
+ * rnr_nak_state is TID_RNR_NAK_SENT).
+ */
+u32 hfi1_build_tid_rdma_write_resp(struct rvt_qp *qp, struct rvt_ack_entry *e,
+ struct ib_other_headers *ohdr, u32 *bth1,
+ u32 bth2, u32 *len,
+ struct rvt_sge_state **ss)
+{
+ struct hfi1_ack_priv *epriv = e->priv;
+ struct tid_rdma_request *req = &epriv->tid_req;
+ struct hfi1_qp_priv *qpriv = qp->priv;
+ struct tid_rdma_flow *flow = NULL;
+ u32 resp_len = 0, hdwords = 0;
+ void *resp_addr = NULL;
+ struct tid_rdma_params *remote;
+
+ trace_hfi1_tid_req_build_write_resp(qp, 0, e->opcode, e->psn, e->lpsn,
+ req);
+ trace_hfi1_tid_write_rsp_build_resp(qp);
+ trace_hfi1_rsp_build_tid_write_resp(qp, bth2);
+ flow = &req->flows[req->flow_idx];
+ switch (req->state) {
+ default:
+ /*
+ * Try to allocate resources here in case QP was queued and was
+ * later scheduled when resources became available
+ */
+ hfi1_tid_write_alloc_resources(qp, false);
+
+ /* We've already sent everything which is ready */
+ if (req->cur_seg >= req->alloc_seg)
+ goto done;
+
+ /*
+ * Resources can be assigned but responses cannot be sent in
+ * rnr_nak state, till the resent request is received
+ */
+ if (qpriv->rnr_nak_state == TID_RNR_NAK_SENT)
+ goto done;
+
+ req->state = TID_REQUEST_ACTIVE;
+ trace_hfi1_tid_flow_build_write_resp(qp, req->flow_idx, flow);
+ req->flow_idx = CIRC_NEXT(req->flow_idx, MAX_FLOWS);
+ hfi1_add_tid_reap_timer(qp);
+ break;
+
+ case TID_REQUEST_RESEND_ACTIVE:
+ case TID_REQUEST_RESEND:
+ trace_hfi1_tid_flow_build_write_resp(qp, req->flow_idx, flow);
+ req->flow_idx = CIRC_NEXT(req->flow_idx, MAX_FLOWS);
+ if (!CIRC_CNT(req->setup_head, req->flow_idx, MAX_FLOWS))
+ req->state = TID_REQUEST_ACTIVE;
+
+ hfi1_mod_tid_reap_timer(qp);
+ break;
+ }
+ flow->flow_state.resp_ib_psn = bth2;
+ resp_addr = (void *)flow->tid_entry;
+ resp_len = sizeof(*flow->tid_entry) * flow->tidcnt;
+ req->cur_seg++;
+
+ memset(&ohdr->u.tid_rdma.w_rsp, 0, sizeof(ohdr->u.tid_rdma.w_rsp));
+ epriv->ss.sge.vaddr = resp_addr;
+ epriv->ss.sge.sge_length = resp_len;
+ epriv->ss.sge.length = epriv->ss.sge.sge_length;
+ /*
+ * We can safely zero these out. Since the first SGE covers the
+ * entire packet, nothing else should even look at the MR.
+ */
+ epriv->ss.sge.mr = NULL;
+ epriv->ss.sge.m = 0;
+ epriv->ss.sge.n = 0;
+
+ epriv->ss.sg_list = NULL;
+ epriv->ss.total_len = epriv->ss.sge.sge_length;
+ epriv->ss.num_sge = 1;
+
+ *ss = &epriv->ss;
+ *len = epriv->ss.total_len;
+
+ /* Construct the TID RDMA WRITE RESP packet header */
+ rcu_read_lock();
+ remote = rcu_dereference(qpriv->tid_rdma.remote);
+
+ KDETH_RESET(ohdr->u.tid_rdma.w_rsp.kdeth0, KVER, 0x1);
+ KDETH_RESET(ohdr->u.tid_rdma.w_rsp.kdeth1, JKEY, remote->jkey);
+ ohdr->u.tid_rdma.w_rsp.aeth = rvt_compute_aeth(qp);
+ ohdr->u.tid_rdma.w_rsp.tid_flow_psn =
+ cpu_to_be32((flow->flow_state.generation <<
+ HFI1_KDETH_BTH_SEQ_SHIFT) |
+ (flow->flow_state.spsn &
+ HFI1_KDETH_BTH_SEQ_MASK));
+ ohdr->u.tid_rdma.w_rsp.tid_flow_qp =
+ cpu_to_be32(qpriv->tid_rdma.local.qp |
+ ((flow->idx & TID_RDMA_DESTQP_FLOW_MASK) <<
+ TID_RDMA_DESTQP_FLOW_SHIFT) |
+ qpriv->rcd->ctxt);
+ ohdr->u.tid_rdma.w_rsp.verbs_qp = cpu_to_be32(qp->remote_qpn);
+ *bth1 = remote->qp;
+ rcu_read_unlock();
+ hdwords = sizeof(ohdr->u.tid_rdma.w_rsp) / sizeof(u32);
+ qpriv->pending_tid_w_segs++;
+done:
+ return hdwords;
+}
+
+/*
+ * Start the TID resource timer unless the HFI1_R_TID_RSC_TIMER flag is
+ * already set. The expiry is tid_timer_timeout_jiffies from now.
+ * The caller must hold qp->s_lock.
+ */
+static void hfi1_add_tid_reap_timer(struct rvt_qp *qp)
+{
+	struct hfi1_qp_priv *priv = qp->priv;
+
+	lockdep_assert_held(&qp->s_lock);
+	if (priv->s_flags & HFI1_R_TID_RSC_TIMER)
+		return;
+
+	priv->s_flags |= HFI1_R_TID_RSC_TIMER;
+	priv->s_tid_timer.expires = jiffies + priv->tid_timer_timeout_jiffies;
+	add_timer(&priv->s_tid_timer);
+}
+
+/*
+ * (Re)arm the TID resource timer: set HFI1_R_TID_RSC_TIMER and move the
+ * expiry to tid_timer_timeout_jiffies from now.
+ * The caller must hold qp->s_lock.
+ */
+static void hfi1_mod_tid_reap_timer(struct rvt_qp *qp)
+{
+	struct hfi1_qp_priv *priv = qp->priv;
+	unsigned long expires;
+
+	lockdep_assert_held(&qp->s_lock);
+	priv->s_flags |= HFI1_R_TID_RSC_TIMER;
+	expires = jiffies + priv->tid_timer_timeout_jiffies;
+	mod_timer(&priv->s_tid_timer, expires);
+}
+
+/*
+ * Stop the TID resource timer if HFI1_R_TID_RSC_TIMER is set and clear
+ * the flag. The caller must hold qp->s_lock.
+ *
+ * Return: the result of del_timer(), or 0 when the flag was not set.
+ */
+static int hfi1_stop_tid_reap_timer(struct rvt_qp *qp)
+{
+	struct hfi1_qp_priv *priv = qp->priv;
+	int ret;
+
+	lockdep_assert_held(&qp->s_lock);
+	if (!(priv->s_flags & HFI1_R_TID_RSC_TIMER))
+		return 0;
+
+	ret = del_timer(&priv->s_tid_timer);
+	priv->s_flags &= ~HFI1_R_TID_RSC_TIMER;
+	return ret;
+}
+
+/*
+ * Delete the TID resource timer with del_timer_sync() and then clear
+ * HFI1_R_TID_RSC_TIMER. Unlike hfi1_stop_tid_reap_timer(), this function
+ * does not assert that qp->s_lock is held.
+ */
+void hfi1_del_tid_reap_timer(struct rvt_qp *qp)
+{
+	struct hfi1_qp_priv *priv = qp->priv;
+
+	del_timer_sync(&priv->s_tid_timer);
+	priv->s_flags &= ~HFI1_R_TID_RSC_TIMER;
+}
+/*
+ * hfi1_tid_timeout - callback of the TID resource timer (s_tid_timer)
+ * @t: the timer embedded in struct hfi1_qp_priv
+ *
+ * Takes qp->r_lock (irqsave) and then qp->s_lock. If HFI1_R_TID_RSC_TIMER
+ * is still set, a warning is logged, the timer is stopped, and the QP's
+ * HW flow and the RcvArray resources of every ack queue entry are cleared.
+ * s_lock is then dropped and, still under r_lock, the QP's event handler
+ * (if any) is called with IB_EVENT_QP_FATAL and rvt_rc_error() is called
+ * with IB_WC_RESP_TIMEOUT_ERR. If the flag is not set, nothing is done.
+ */
+static void hfi1_tid_timeout(struct timer_list *t)
+{
+ struct hfi1_qp_priv *qpriv = from_timer(qpriv, t, s_tid_timer);
+ struct rvt_qp *qp = qpriv->owner;
+ struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device);
+ unsigned long flags;
+ u32 i;
+
+ spin_lock_irqsave(&qp->r_lock, flags);
+ spin_lock(&qp->s_lock);
+ if (qpriv->s_flags & HFI1_R_TID_RSC_TIMER) {
+ dd_dev_warn(dd_from_ibdev(qp->ibqp.device), "[QP%u] %s %d\n",
+ qp->ibqp.qp_num, __func__, __LINE__);
+ trace_hfi1_msg_tid_timeout(/* msg */
+ qp, "resource timeout = ",
+ (u64)qpriv->tid_timer_timeout_jiffies);
+ hfi1_stop_tid_reap_timer(qp);
+ /*
+ * Go through the entire ack queue and clear any outstanding
+ * HW flow and RcvArray resources.
+ */
+ hfi1_kern_clear_hw_flow(qpriv->rcd, qp);
+ for (i = 0; i < rvt_max_atomic(rdi); i++) {
+ struct tid_rdma_request *req =
+ ack_to_tid_req(&qp->s_ack_queue[i]);
+
+ hfi1_kern_exp_rcv_clear_all(req);
+ }
+ spin_unlock(&qp->s_lock);
+ if (qp->ibqp.event_handler) {
+ struct ib_event ev;
+
+ ev.device = qp->ibqp.device;
+ ev.element.qp = &qp->ibqp;
+ ev.event = IB_EVENT_QP_FATAL;
+ qp->ibqp.event_handler(&ev, qp->ibqp.qp_context);
+ }
+ rvt_rc_error(qp, IB_WC_RESP_TIMEOUT_ERR);
+ goto unlock_r_lock;
+ }
+ spin_unlock(&qp->s_lock);
+unlock_r_lock:
+ spin_unlock_irqrestore(&qp->r_lock, flags);
+}
+/*
+ * hfi1_rc_rcv_tid_rdma_write_resp - receive a TID RDMA WRITE RESP packet
+ * @packet: the received packet
+ *
+ * All processing is done under qp->s_lock. The response is silently
+ * dropped when its PSN is outside the expected range, when there is no
+ * outstanding request, when the request's flow ring has no free slot, or
+ * when do_rc_ack() rejects it. Otherwise the TID entries carried in the
+ * payload are copied into the flow at req->setup_head and validated.
+ *
+ * The QP is put into the error state with IB_WC_LOC_QP_OP_ERR if the WQE
+ * at s_tid_cur is not a TID RDMA WRITE, and with IB_WC_LOC_LEN_ERR if the
+ * payload does not fit the flow's tid_entry array, a TID entry has a zero
+ * length, or the TID entries do not cover the flow's length.
+ *
+ * After the lock is dropped an RC ACK is sent if process_ecn() returned
+ * true for the packet.
+ */
+void hfi1_rc_rcv_tid_rdma_write_resp(struct hfi1_packet *packet)
+{
+ /* HANDLER FOR TID RDMA WRITE RESPONSE packet (Requestor side) */
+
+ /*
+ * 1. Find matching SWQE
+ * 2. Check that TIDENTRY array has enough space for a complete
+ * segment. If not, put QP in error state.
+ * 3. Save response data in struct tid_rdma_req and struct tid_rdma_flow
+ * 4. Remove HFI1_S_WAIT_TID_RESP from s_flags.
+ * 5. Set qp->s_state
+ * 6. Kick the send engine (hfi1_schedule_send())
+ */
+ struct ib_other_headers *ohdr = packet->ohdr;
+ struct rvt_qp *qp = packet->qp;
+ struct hfi1_qp_priv *qpriv = qp->priv;
+ struct hfi1_ctxtdata *rcd = packet->rcd;
+ struct rvt_swqe *wqe;
+ struct tid_rdma_request *req;
+ struct tid_rdma_flow *flow;
+ enum ib_wc_status status;
+ u32 opcode, aeth, psn, flow_psn, i, tidlen = 0, pktlen;
+ bool is_fecn;
+ unsigned long flags;
+
+ is_fecn = process_ecn(qp, packet);
+ psn = mask_psn(be32_to_cpu(ohdr->bth[2]));
+ aeth = be32_to_cpu(ohdr->u.tid_rdma.w_rsp.aeth);
+ opcode = (be32_to_cpu(ohdr->bth[0]) >> 24) & 0xff;
+
+ spin_lock_irqsave(&qp->s_lock, flags);
+
+ /* Ignore invalid responses */
+ if (cmp_psn(psn, qp->s_next_psn) >= 0)
+ goto ack_done;
+
+ /* Ignore duplicate responses. */
+ if (unlikely(cmp_psn(psn, qp->s_last_psn) <= 0))
+ goto ack_done;
+
+ if (unlikely(qp->s_acked == qp->s_tail))
+ goto ack_done;
+
+ /*
+ * If we are waiting for a particular packet sequence number
+ * due to a request being resent, check for it. Otherwise,
+ * ensure that we haven't missed anything.
+ */
+ if (qp->r_flags & RVT_R_RDMAR_SEQ) {
+ if (cmp_psn(psn, qp->s_last_psn + 1) != 0)
+ goto ack_done;
+ qp->r_flags &= ~RVT_R_RDMAR_SEQ;
+ }
+
+ wqe = rvt_get_swqe_ptr(qp, qpriv->s_tid_cur);
+ if (unlikely(wqe->wr.opcode != IB_WR_TID_RDMA_WRITE))
+ goto ack_op_err;
+
+ req = wqe_to_tid_req(wqe);
+ /*
+ * If we've lost ACKs and our acked_tail pointer is too far
+ * behind, don't overwrite segments. Just drop the packet and
+ * let the reliability protocol take care of it.
+ */
+ if (!CIRC_SPACE(req->setup_head, req->acked_tail, MAX_FLOWS))
+ goto ack_done;
+
+ /*
+ * The call to do_rc_ack() should be last in the chain of
+ * packet checks because it will end up updating the QP state.
+ * Therefore, anything that would prevent the packet from
+ * being accepted as a successful response should be prior
+ * to it.
+ */
+ if (!do_rc_ack(qp, aeth, psn, opcode, 0, rcd))
+ goto ack_done;
+
+ trace_hfi1_ack(qp, psn);
+
+ flow = &req->flows[req->setup_head];
+ flow->pkt = 0;
+ flow->tid_idx = 0;
+ flow->tid_offset = 0;
+ flow->sent = 0;
+ flow->resync_npkts = 0;
+ flow->tid_qpn = be32_to_cpu(ohdr->u.tid_rdma.w_rsp.tid_flow_qp);
+ flow->idx = (flow->tid_qpn >> TID_RDMA_DESTQP_FLOW_SHIFT) &
+ TID_RDMA_DESTQP_FLOW_MASK;
+ flow_psn = mask_psn(be32_to_cpu(ohdr->u.tid_rdma.w_rsp.tid_flow_psn));
+ flow->flow_state.generation = flow_psn >> HFI1_KDETH_BTH_SEQ_SHIFT;
+ flow->flow_state.spsn = flow_psn & HFI1_KDETH_BTH_SEQ_MASK;
+ flow->flow_state.resp_ib_psn = psn;
+ flow->length = min_t(u32, req->seg_len,
+ (wqe->length - (req->comp_seg * req->seg_len)));
+
+ flow->npkts = rvt_div_round_up_mtu(qp, flow->length);
+ flow->flow_state.lpsn = flow->flow_state.spsn +
+ flow->npkts - 1;
+ /* payload length = packet length - (header length + ICRC length) */
+ pktlen = packet->tlen - (packet->hlen + 4);
+ if (pktlen > sizeof(flow->tid_entry)) {
+ status = IB_WC_LOC_LEN_ERR;
+ goto ack_err;
+ }
+ memcpy(flow->tid_entry, packet->ebuf, pktlen);
+ flow->tidcnt = pktlen / sizeof(*flow->tid_entry);
+ trace_hfi1_tid_flow_rcv_write_resp(qp, req->setup_head, flow);
+
+ req->comp_seg++;
+ trace_hfi1_tid_write_sender_rcv_resp(qp, 0);
+ /*
+ * Walk the TID_ENTRY list to make sure we have enough space for a
+ * complete segment.
+ */
+ for (i = 0; i < flow->tidcnt; i++) {
+ trace_hfi1_tid_entry_rcv_write_resp(/* entry */
+ qp, i, flow->tid_entry[i]);
+ if (!EXP_TID_GET(flow->tid_entry[i], LEN)) {
+ status = IB_WC_LOC_LEN_ERR;
+ goto ack_err;
+ }
+ tidlen += EXP_TID_GET(flow->tid_entry[i], LEN);
+ }
+ if (tidlen * PAGE_SIZE < flow->length) {
+ status = IB_WC_LOC_LEN_ERR;
+ goto ack_err;
+ }
+
+ trace_hfi1_tid_req_rcv_write_resp(qp, 0, wqe->wr.opcode, wqe->psn,
+ wqe->lpsn, req);
+ /*
+ * If this is the first response for this request, set the initial
+ * flow index to the current flow.
+ */
+ if (!cmp_psn(psn, wqe->psn)) {
+ req->r_last_acked = mask_psn(wqe->psn - 1);
+ /* Set acked flow index to head index */
+ req->acked_tail = req->setup_head;
+ }
+
+ /* advance circular buffer head */
+ req->setup_head = CIRC_NEXT(req->setup_head, MAX_FLOWS);
+ req->state = TID_REQUEST_ACTIVE;
+
+ /*
+ * If all responses for this TID RDMA WRITE request have been received
+ * advance the pointer to the next one.
+ * Since TID RDMA requests could be mixed in with regular IB requests,
+ * they might not appear sequentially in the queue. Therefore, the
+ * next request needs to be "found".
+ */
+ if (qpriv->s_tid_cur != qpriv->s_tid_head &&
+ req->comp_seg == req->total_segs) {
+ for (i = qpriv->s_tid_cur + 1; ; i++) {
+ if (i == qp->s_size)
+ i = 0;
+ wqe = rvt_get_swqe_ptr(qp, i);
+ if (i == qpriv->s_tid_head)
+ break;
+ if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE)
+ break;
+ }
+ qpriv->s_tid_cur = i;
+ }
+ qp->s_flags &= ~HFI1_S_WAIT_TID_RESP;
+
+ hfi1_schedule_tid_send(qp);
+ goto ack_done;
+
+ack_op_err:
+ status = IB_WC_LOC_QP_OP_ERR;
+ack_err:
+ rvt_error_qp(qp, status);
+ack_done:
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+ if (is_fecn)
+ hfi1_send_rc_ack(packet, is_fecn);
+}
+/*
+ * hfi1_build_tid_rdma_packet - build a TID RDMA WRITE DATA packet header
+ * @wqe: send WQE that carries the TID RDMA request
+ * @ohdr: header being built; its w_data member is filled in
+ * @bth1: set to the flow's tid_qpn
+ * @bth2: set to the packet's PSN (flow generation and sequence number);
+ *        IB_BTH_REQ_ACK is OR'ed in for the last packet
+ * @len: set to the payload length: the smaller of the QP's PMTU and the
+ *       bytes left in the current TID entry
+ *
+ * Works on the flow at req->clear_tail and advances that flow's sent,
+ * pkt, tid_offset and tid_idx. On the last packet the request state is
+ * set to TID_REQUEST_SYNC when another full segment would push the flow
+ * PSN past MAX_TID_FLOW_PSN.
+ *
+ * NOTE(review): when the current TID entry has a zero length the WQE is
+ * completed and the QP is put into error with IB_WC_REM_INV_RD_REQ_ERR,
+ * but execution then continues with tidlen == 0 - confirm this is
+ * intended.
+ *
+ * Return: true when this is the last packet of the flow, i.e. the last
+ * TID entry is used up or flow->sent has reached flow->length.
+ */
+bool hfi1_build_tid_rdma_packet(struct rvt_swqe *wqe,
+ struct ib_other_headers *ohdr,
+ u32 *bth1, u32 *bth2, u32 *len)
+{
+ struct tid_rdma_request *req = wqe_to_tid_req(wqe);
+ struct tid_rdma_flow *flow = &req->flows[req->clear_tail];
+ struct tid_rdma_params *remote;
+ struct rvt_qp *qp = req->qp;
+ struct hfi1_qp_priv *qpriv = qp->priv;
+ u32 tidentry = flow->tid_entry[flow->tid_idx];
+ u32 tidlen = EXP_TID_GET(tidentry, LEN) << PAGE_SHIFT;
+ struct tid_rdma_write_data *wd = &ohdr->u.tid_rdma.w_data;
+ u32 next_offset, om = KDETH_OM_LARGE;
+ bool last_pkt;
+
+ if (!tidlen) {
+ hfi1_trdma_send_complete(qp, wqe, IB_WC_REM_INV_RD_REQ_ERR);
+ rvt_error_qp(qp, IB_WC_REM_INV_RD_REQ_ERR);
+ }
+
+ *len = min_t(u32, qp->pmtu, tidlen - flow->tid_offset);
+ flow->sent += *len;
+ next_offset = flow->tid_offset + *len;
+ last_pkt = (flow->tid_idx == (flow->tidcnt - 1) &&
+ next_offset >= tidlen) || (flow->sent >= flow->length);
+ trace_hfi1_tid_entry_build_write_data(qp, flow->tid_idx, tidentry);
+ trace_hfi1_tid_flow_build_write_data(qp, req->clear_tail, flow);
+
+ rcu_read_lock();
+ remote = rcu_dereference(qpriv->tid_rdma.remote);
+ KDETH_RESET(wd->kdeth0, KVER, 0x1);
+ KDETH_SET(wd->kdeth0, SH, !last_pkt);
+ KDETH_SET(wd->kdeth0, INTR, !!(!last_pkt && remote->urg));
+ KDETH_SET(wd->kdeth0, TIDCTRL, EXP_TID_GET(tidentry, CTRL));
+ KDETH_SET(wd->kdeth0, TID, EXP_TID_GET(tidentry, IDX));
+ KDETH_SET(wd->kdeth0, OM, om == KDETH_OM_LARGE);
+ KDETH_SET(wd->kdeth0, OFFSET, flow->tid_offset / om);
+ KDETH_RESET(wd->kdeth1, JKEY, remote->jkey);
+ wd->verbs_qp = cpu_to_be32(qp->remote_qpn);
+ rcu_read_unlock();
+
+ *bth1 = flow->tid_qpn;
+ *bth2 = mask_psn(((flow->flow_state.spsn + flow->pkt++) &
+ HFI1_KDETH_BTH_SEQ_MASK) |
+ (flow->flow_state.generation <<
+ HFI1_KDETH_BTH_SEQ_SHIFT));
+ if (last_pkt) {
+ /* PSNs are zero-based, so +1 to count number of packets */
+ if (flow->flow_state.lpsn + 1 +
+ rvt_div_round_up_mtu(qp, req->seg_len) >
+ MAX_TID_FLOW_PSN)
+ req->state = TID_REQUEST_SYNC;
+ *bth2 |= IB_BTH_REQ_ACK;
+ }
+
+ if (next_offset >= tidlen) {
+ flow->tid_offset = 0;
+ flow->tid_idx++;
+ } else {
+ flow->tid_offset = next_offset;
+ }
+ return last_pkt;
+}
+/*
+ * hfi1_rc_rcv_tid_rdma_write_data - receive a TID RDMA WRITE DATA packet
+ * @packet: the received packet
+ *
+ * Under qp->s_lock, the packet is matched against the flow at clear_tail
+ * of the request in the ack queue entry at r_tid_tail.
+ *
+ * A packet whose PSN is not the flow's last PSN only advances the flow's
+ * r_next_psn when it is the expected one; otherwise a PSN-error NAK is
+ * recorded (unless one is already pending) and a TID ACK is scheduled.
+ *
+ * The packet carrying the flow's last PSN completes the segment: the
+ * segment's expected-receive resources are released, comp_seg is bumped,
+ * more resources are allocated, the TID resource timer is re-armed or
+ * stopped depending on pending_tid_w_segs, and a TID ACK is scheduled.
+ * For TID_OP(WRITE_DATA_LAST), r_tid_tail moves on to the next
+ * TID RDMA WRITE request in the ack queue (or to r_tid_head).
+ */
+void hfi1_rc_rcv_tid_rdma_write_data(struct hfi1_packet *packet)
+{
+ struct rvt_qp *qp = packet->qp;
+ struct hfi1_qp_priv *priv = qp->priv;
+ struct hfi1_ctxtdata *rcd = priv->rcd;
+ struct ib_other_headers *ohdr = packet->ohdr;
+ struct rvt_ack_entry *e;
+ struct tid_rdma_request *req;
+ struct tid_rdma_flow *flow;
+ struct hfi1_ibdev *dev = to_idev(qp->ibqp.device);
+ unsigned long flags;
+ u32 psn, next;
+ u8 opcode;
+
+ psn = mask_psn(be32_to_cpu(ohdr->bth[2]));
+ opcode = (be32_to_cpu(ohdr->bth[0]) >> 24) & 0xff;
+
+ /*
+ * All error handling should be done by now. If we are here, the packet
+ * is either good or been accepted by the error handler.
+ */
+ spin_lock_irqsave(&qp->s_lock, flags);
+ e = &qp->s_ack_queue[priv->r_tid_tail];
+ req = ack_to_tid_req(e);
+ flow = &req->flows[req->clear_tail];
+ if (cmp_psn(psn, full_flow_psn(flow, flow->flow_state.lpsn))) {
+ if (cmp_psn(psn, flow->flow_state.r_next_psn))
+ goto send_nak;
+ flow->flow_state.r_next_psn++;
+ goto exit;
+ }
+ flow->flow_state.r_next_psn = mask_psn(psn + 1);
+ hfi1_kern_exp_rcv_clear(req);
+ priv->alloc_w_segs--;
+ rcd->flows[flow->idx].psn = psn & HFI1_KDETH_BTH_SEQ_MASK;
+ req->comp_seg++;
+ priv->s_nak_state = 0;
+
+ /*
+ * Release the flow if one of the following conditions has been met:
+ * - The request has reached a sync point AND all outstanding
+ * segments have been completed, or
+ * - The entire request is complete and there are no more requests
+ * (of any kind) in the queue.
+ */
+ trace_hfi1_rsp_rcv_tid_write_data(qp, psn);
+ trace_hfi1_tid_req_rcv_write_data(qp, 0, e->opcode, e->psn, e->lpsn,
+ req);
+ trace_hfi1_tid_write_rsp_rcv_data(qp);
+ if (priv->r_tid_ack == HFI1_QP_WQE_INVALID)
+ priv->r_tid_ack = priv->r_tid_tail;
+
+ if (opcode == TID_OP(WRITE_DATA_LAST)) {
+ for (next = priv->r_tid_tail + 1; ; next++) {
+ if (next > rvt_size_atomic(&dev->rdi))
+ next = 0;
+ if (next == priv->r_tid_head)
+ break;
+ e = &qp->s_ack_queue[next];
+ if (e->opcode == TID_OP(WRITE_REQ))
+ break;
+ }
+ priv->r_tid_tail = next;
+ if (++qp->s_acked_ack_queue > rvt_size_atomic(&dev->rdi))
+ qp->s_acked_ack_queue = 0;
+ }
+
+ hfi1_tid_write_alloc_resources(qp, true);
+
+ /*
+ * If we need to generate more responses, schedule the
+ * send engine.
+ */
+ if (req->cur_seg < req->total_segs ||
+ qp->s_tail_ack_queue != qp->r_head_ack_queue) {
+ qp->s_flags |= RVT_S_RESP_PENDING;
+ hfi1_schedule_send(qp);
+ }
+
+ priv->pending_tid_w_segs--;
+ if (priv->s_flags & HFI1_R_TID_RSC_TIMER) {
+ if (priv->pending_tid_w_segs)
+ hfi1_mod_tid_reap_timer(req->qp);
+ else
+ hfi1_stop_tid_reap_timer(req->qp);
+ }
+
+done:
+ priv->s_flags |= RVT_S_ACK_PENDING;
+ hfi1_schedule_tid_send(qp);
+exit:
+ priv->r_next_psn_kdeth = flow->flow_state.r_next_psn;
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+ return;
+
+send_nak:
+ if (!priv->s_nak_state) {
+ priv->s_nak_state = IB_NAK_PSN_ERROR;
+ priv->s_nak_psn = flow->flow_state.r_next_psn;
+ priv->s_flags |= RVT_S_ACK_PENDING;
+ if (priv->r_tid_ack == HFI1_QP_WQE_INVALID)
+ priv->r_tid_ack = priv->r_tid_tail;
+ hfi1_schedule_tid_send(qp);
+ }
+ goto done;
+}
+
+/*
+ * A PSN is treated as a RESYNC PSN when every bit covered by
+ * HFI1_KDETH_BTH_SEQ_MASK is set in it.
+ */
+static bool hfi1_tid_rdma_is_resync_psn(u32 psn)
+{
+	u32 seq = psn & HFI1_KDETH_BTH_SEQ_MASK;
+
+	return seq == HFI1_KDETH_BTH_SEQ_MASK;
+}
+/*
+ * hfi1_build_tid_rdma_write_ack - build a TID RDMA ACK packet header
+ * @qp: the QP the ACK is built for
+ * @e: ack queue entry holding the TID RDMA request
+ * @ohdr: header being built; its ack member is filled in
+ * @iflow: index of the flow (within the request) being acknowledged
+ * @bth1: set to the qp field of the remote TID RDMA parameters
+ * @bth2: set to the PSN of the ACK, chosen as follows:
+ *        - resync pending: (fs->generation << HFI1_KDETH_BTH_SEQ_SHIFT) - 1
+ *        - NAK pending (s_nak_state != 0): s_nak_psn
+ *        - otherwise: the full PSN of the flow's lpsn
+ *
+ * In the NAK case the AETH is built from r_msn and s_nak_state; in the
+ * other two cases it comes from rvt_compute_aeth(). When a resync is
+ * pending, tid_flow_psn carries the saved next expected KDETH PSN and
+ * qpriv->resync is cleared before returning.
+ *
+ * Return: size of the ack header in 32-bit words.
+ */
+u32 hfi1_build_tid_rdma_write_ack(struct rvt_qp *qp, struct rvt_ack_entry *e,
+ struct ib_other_headers *ohdr, u16 iflow,
+ u32 *bth1, u32 *bth2)
+{
+ struct hfi1_qp_priv *qpriv = qp->priv;
+ struct tid_flow_state *fs = &qpriv->flow_state;
+ struct tid_rdma_request *req = ack_to_tid_req(e);
+ struct tid_rdma_flow *flow = &req->flows[iflow];
+ struct tid_rdma_params *remote;
+
+ rcu_read_lock();
+ remote = rcu_dereference(qpriv->tid_rdma.remote);
+ KDETH_RESET(ohdr->u.tid_rdma.ack.kdeth1, JKEY, remote->jkey);
+ ohdr->u.tid_rdma.ack.verbs_qp = cpu_to_be32(qp->remote_qpn);
+ *bth1 = remote->qp;
+ rcu_read_unlock();
+
+ if (qpriv->resync) {
+ *bth2 = mask_psn((fs->generation <<
+ HFI1_KDETH_BTH_SEQ_SHIFT) - 1);
+ ohdr->u.tid_rdma.ack.aeth = rvt_compute_aeth(qp);
+ } else if (qpriv->s_nak_state) {
+ *bth2 = mask_psn(qpriv->s_nak_psn);
+ ohdr->u.tid_rdma.ack.aeth =
+ cpu_to_be32((qp->r_msn & IB_MSN_MASK) |
+ (qpriv->s_nak_state <<
+ IB_AETH_CREDIT_SHIFT));
+ } else {
+ *bth2 = full_flow_psn(flow, flow->flow_state.lpsn);
+ ohdr->u.tid_rdma.ack.aeth = rvt_compute_aeth(qp);
+ }
+ KDETH_RESET(ohdr->u.tid_rdma.ack.kdeth0, KVER, 0x1);
+ ohdr->u.tid_rdma.ack.tid_flow_qp =
+ cpu_to_be32(qpriv->tid_rdma.local.qp |
+ ((flow->idx & TID_RDMA_DESTQP_FLOW_MASK) <<
+ TID_RDMA_DESTQP_FLOW_SHIFT) |
+ qpriv->rcd->ctxt);
+
+ ohdr->u.tid_rdma.ack.tid_flow_psn = 0;
+ ohdr->u.tid_rdma.ack.verbs_psn =
+ cpu_to_be32(flow->flow_state.resp_ib_psn);
+
+ if (qpriv->resync) {
+ /*
+ * If the PSN before the current expected KDETH PSN is the
+ * RESYNC PSN, then we never received a good TID RDMA WRITE
+ * DATA packet after a previous RESYNC.
+ * In this case, the next expected KDETH PSN stays the same.
+ */
+ if (hfi1_tid_rdma_is_resync_psn(qpriv->r_next_psn_kdeth - 1)) {
+ ohdr->u.tid_rdma.ack.tid_flow_psn =
+ cpu_to_be32(qpriv->r_next_psn_kdeth_save);
+ } else {
+ /*
+ * Because the KDETH PSNs jump during a RESYNC, it's
+ * not possible to infer (or compute) the previous value
+ * of r_next_psn_kdeth in the case of back-to-back
+ * RESYNC packets. Therefore, we save it.
+ */
+ qpriv->r_next_psn_kdeth_save =
+ qpriv->r_next_psn_kdeth - 1;
+ ohdr->u.tid_rdma.ack.tid_flow_psn =
+ cpu_to_be32(qpriv->r_next_psn_kdeth_save);
+ qpriv->r_next_psn_kdeth = mask_psn(*bth2 + 1);
+ }
+ qpriv->resync = false;
+ }
+
+ return sizeof(ohdr->u.tid_rdma.ack) / sizeof(u32);
+}
+/*
+ * hfi1_rc_rcv_tid_rdma_ack - receive a TID RDMA ACK packet
+ * @packet: the received packet
+ *
+ * All processing is done under qp->s_lock and is applied to the send WQE
+ * at qp->s_acked, which must be a TID RDMA WRITE. The packet is dropped
+ * if the QP is waiting for the ACK of a RESYNC (HFI1_S_WAIT_HALT) and the
+ * PSN is not s_resync_psn, or if the PSN is older than the start of the
+ * flow at acked_tail.
+ *
+ * Every flow whose last PSN is covered by the acknowledged KDETH PSN is
+ * retired (ack_seg and acked_tail advance); a request whose segments are
+ * all acknowledged is completed through do_rc_completion().
+ *
+ * The top three bits of the AETH then select the remaining work:
+ *  - 0 (ACK): for a regular ACK the TID retry timer is re-armed or
+ *    stopped; for the ACK of a RESYNC, HFI1_S_WAIT_HALT is cleared and the
+ *    PSN ranges of all remaining flows are renumbered for the new
+ *    generation before sending restarts from qp->s_acked.
+ *  - 3 (NAK) with a PSN sequence error: cur_seg is rewound to ack_seg and
+ *    sending restarts from qp->s_acked.
+ *
+ * NOTE(review): is_fecn is assigned from process_ecn() but not read
+ * again in this function.
+ */
+void hfi1_rc_rcv_tid_rdma_ack(struct hfi1_packet *packet)
+{
+ struct ib_other_headers *ohdr = packet->ohdr;
+ struct rvt_qp *qp = packet->qp;
+ struct hfi1_qp_priv *qpriv = qp->priv;
+ struct rvt_swqe *wqe;
+ struct tid_rdma_request *req;
+ struct tid_rdma_flow *flow;
+ u32 aeth, psn, req_psn, ack_psn, fspsn, resync_psn, ack_kpsn;
+ bool is_fecn;
+ unsigned long flags;
+ u16 fidx;
+
+ trace_hfi1_tid_write_sender_rcv_tid_ack(qp, 0);
+ is_fecn = process_ecn(qp, packet);
+ psn = mask_psn(be32_to_cpu(ohdr->bth[2]));
+ aeth = be32_to_cpu(ohdr->u.tid_rdma.ack.aeth);
+ req_psn = mask_psn(be32_to_cpu(ohdr->u.tid_rdma.ack.verbs_psn));
+ resync_psn = mask_psn(be32_to_cpu(ohdr->u.tid_rdma.ack.tid_flow_psn));
+
+ spin_lock_irqsave(&qp->s_lock, flags);
+ trace_hfi1_rcv_tid_ack(qp, aeth, psn, req_psn, resync_psn);
+
+ /* If we are waiting for an ACK to RESYNC, drop any other packets */
+ if ((qp->s_flags & HFI1_S_WAIT_HALT) &&
+ cmp_psn(psn, qpriv->s_resync_psn))
+ goto ack_op_err;
+
+ ack_psn = req_psn;
+ if (hfi1_tid_rdma_is_resync_psn(psn))
+ ack_kpsn = resync_psn;
+ else
+ ack_kpsn = psn;
+ if (aeth >> 29) {
+ ack_psn--;
+ ack_kpsn--;
+ }
+
+ wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
+
+ if (wqe->wr.opcode != IB_WR_TID_RDMA_WRITE)
+ goto ack_op_err;
+
+ req = wqe_to_tid_req(wqe);
+ trace_hfi1_tid_req_rcv_tid_ack(qp, 0, wqe->wr.opcode, wqe->psn,
+ wqe->lpsn, req);
+ flow = &req->flows[req->acked_tail];
+ trace_hfi1_tid_flow_rcv_tid_ack(qp, req->acked_tail, flow);
+
+ /* Drop stale ACK/NAK */
+ if (cmp_psn(psn, full_flow_psn(flow, flow->flow_state.spsn)) < 0)
+ goto ack_op_err;
+
+ while (cmp_psn(ack_kpsn,
+ full_flow_psn(flow, flow->flow_state.lpsn)) >= 0 &&
+ req->ack_seg < req->cur_seg) {
+ req->ack_seg++;
+ /* advance acked segment pointer */
+ req->acked_tail = CIRC_NEXT(req->acked_tail, MAX_FLOWS);
+ req->r_last_acked = flow->flow_state.resp_ib_psn;
+ trace_hfi1_tid_req_rcv_tid_ack(qp, 0, wqe->wr.opcode, wqe->psn,
+ wqe->lpsn, req);
+ if (req->ack_seg == req->total_segs) {
+ req->state = TID_REQUEST_COMPLETE;
+ wqe = do_rc_completion(qp, wqe,
+ to_iport(qp->ibqp.device,
+ qp->port_num));
+ trace_hfi1_sender_rcv_tid_ack(qp);
+ atomic_dec(&qpriv->n_tid_requests);
+ if (qp->s_acked == qp->s_tail)
+ break;
+ if (wqe->wr.opcode != IB_WR_TID_RDMA_WRITE)
+ break;
+ req = wqe_to_tid_req(wqe);
+ }
+ flow = &req->flows[req->acked_tail];
+ trace_hfi1_tid_flow_rcv_tid_ack(qp, req->acked_tail, flow);
+ }
+
+ trace_hfi1_tid_req_rcv_tid_ack(qp, 0, wqe->wr.opcode, wqe->psn,
+ wqe->lpsn, req);
+ switch (aeth >> 29) {
+ case 0: /* ACK */
+ if (qpriv->s_flags & RVT_S_WAIT_ACK)
+ qpriv->s_flags &= ~RVT_S_WAIT_ACK;
+ if (!hfi1_tid_rdma_is_resync_psn(psn)) {
+ /* Check if there is any pending TID ACK */
+ if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE &&
+ req->ack_seg < req->cur_seg)
+ hfi1_mod_tid_retry_timer(qp);
+ else
+ hfi1_stop_tid_retry_timer(qp);
+ hfi1_schedule_send(qp);
+ } else {
+ u32 spsn, fpsn, last_acked, generation;
+ struct tid_rdma_request *rptr;
+
+ /* ACK(RESYNC) */
+ hfi1_stop_tid_retry_timer(qp);
+ /* Allow new requests (see hfi1_make_tid_rdma_pkt) */
+ qp->s_flags &= ~HFI1_S_WAIT_HALT;
+ /*
+ * Clear RVT_S_SEND_ONE flag in case that the TID RDMA
+ * ACK is received after the TID retry timer is fired
+ * again. In this case, do not send any more TID
+ * RESYNC request or wait for any more TID ACK packet.
+ */
+ qpriv->s_flags &= ~RVT_S_SEND_ONE;
+ hfi1_schedule_send(qp);
+
+ if ((qp->s_acked == qpriv->s_tid_tail &&
+ req->ack_seg == req->total_segs) ||
+ qp->s_acked == qp->s_tail) {
+ qpriv->s_state = TID_OP(WRITE_DATA_LAST);
+ goto done;
+ }
+
+ if (req->ack_seg == req->comp_seg) {
+ qpriv->s_state = TID_OP(WRITE_DATA);
+ goto done;
+ }
+
+ /*
+ * The PSN to start with is the next PSN after the
+ * RESYNC PSN.
+ */
+ psn = mask_psn(psn + 1);
+ generation = psn >> HFI1_KDETH_BTH_SEQ_SHIFT;
+ spsn = 0;
+
+ /*
+ * Update to the correct WQE when we get an ACK(RESYNC)
+ * in the middle of a request.
+ */
+ if (delta_psn(ack_psn, wqe->lpsn))
+ wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
+ req = wqe_to_tid_req(wqe);
+ flow = &req->flows[req->acked_tail];
+ /*
+ * RESYNC re-numbers the PSN ranges of all remaining
+ * segments. Also, PSN's start from 0 in the middle of a
+ * segment and the first segment size is less than the
+ * default number of packets. flow->resync_npkts is used
+ * to track the number of packets from the start of the
+ * real segment to the point of 0 PSN after the RESYNC
+ * in order to later correctly rewind the SGE.
+ */
+ fpsn = full_flow_psn(flow, flow->flow_state.spsn);
+ req->r_ack_psn = psn;
+ flow->resync_npkts +=
+ delta_psn(mask_psn(resync_psn + 1), fpsn);
+ /*
+ * Renumber all packet sequence number ranges
+ * based on the new generation.
+ */
+ last_acked = qp->s_acked;
+ rptr = req;
+ while (1) {
+ /* start from last acked segment */
+ for (fidx = rptr->acked_tail;
+ CIRC_CNT(rptr->setup_head, fidx,
+ MAX_FLOWS);
+ fidx = CIRC_NEXT(fidx, MAX_FLOWS)) {
+ u32 lpsn;
+ u32 gen;
+
+ flow = &rptr->flows[fidx];
+ gen = flow->flow_state.generation;
+ if (WARN_ON(gen == generation &&
+ flow->flow_state.spsn !=
+ spsn))
+ continue;
+ lpsn = flow->flow_state.lpsn;
+ lpsn = full_flow_psn(flow, lpsn);
+ flow->npkts =
+ delta_psn(lpsn,
+ mask_psn(resync_psn)
+ );
+ flow->flow_state.generation =
+ generation;
+ flow->flow_state.spsn = spsn;
+ flow->flow_state.lpsn =
+ flow->flow_state.spsn +
+ flow->npkts - 1;
+ flow->pkt = 0;
+ spsn += flow->npkts;
+ resync_psn += flow->npkts;
+ trace_hfi1_tid_flow_rcv_tid_ack(qp,
+ fidx,
+ flow);
+ }
+ if (++last_acked == qpriv->s_tid_cur + 1)
+ break;
+ if (last_acked == qp->s_size)
+ last_acked = 0;
+ wqe = rvt_get_swqe_ptr(qp, last_acked);
+ rptr = wqe_to_tid_req(wqe);
+ }
+ req->cur_seg = req->ack_seg;
+ qpriv->s_tid_tail = qp->s_acked;
+ qpriv->s_state = TID_OP(WRITE_REQ);
+ hfi1_schedule_tid_send(qp);
+ }
+done:
+ qpriv->s_retry = qp->s_retry_cnt;
+ break;
+
+ case 3: /* NAK */
+ hfi1_stop_tid_retry_timer(qp);
+ switch ((aeth >> IB_AETH_CREDIT_SHIFT) &
+ IB_AETH_CREDIT_MASK) {
+ case 0: /* PSN sequence error */
+ flow = &req->flows[req->acked_tail];
+ fspsn = full_flow_psn(flow, flow->flow_state.spsn); /* NOTE(review): fspsn is not read afterwards */
+ trace_hfi1_tid_flow_rcv_tid_ack(qp, req->acked_tail,
+ flow);
+ req->r_ack_psn = mask_psn(be32_to_cpu(ohdr->bth[2]));
+ req->cur_seg = req->ack_seg;
+ qpriv->s_tid_tail = qp->s_acked;
+ qpriv->s_state = TID_OP(WRITE_REQ);
+ qpriv->s_retry = qp->s_retry_cnt;
+ hfi1_schedule_tid_send(qp);
+ break;
+
+ default:
+ break;
+ }
+ break;
+
+ default:
+ break;
+ }
+
+ack_op_err:
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+}
+
+/*
+ * Start the TID retry timer unless the HFI1_S_TID_RETRY_TIMER flag is
+ * already set. The expiry is tid_retry_timeout_jiffies plus the device's
+ * busy_jiffies from now. The caller must hold qp->s_lock.
+ */
+void hfi1_add_tid_retry_timer(struct rvt_qp *qp)
+{
+	struct hfi1_qp_priv *qpriv = qp->priv;
+	struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device);
+
+	lockdep_assert_held(&qp->s_lock);
+	if (qpriv->s_flags & HFI1_S_TID_RETRY_TIMER)
+		return;
+
+	qpriv->s_flags |= HFI1_S_TID_RETRY_TIMER;
+	qpriv->s_tid_retry_timer.expires = jiffies +
+		qpriv->tid_retry_timeout_jiffies + rdi->busy_jiffies;
+	add_timer(&qpriv->s_tid_retry_timer);
+}
+
+/*
+ * (Re)arm the TID retry timer: set HFI1_S_TID_RETRY_TIMER and move the
+ * expiry to tid_retry_timeout_jiffies plus the device's busy_jiffies from
+ * now. The caller must hold qp->s_lock.
+ */
+static void hfi1_mod_tid_retry_timer(struct rvt_qp *qp)
+{
+	struct hfi1_qp_priv *qpriv = qp->priv;
+	struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device);
+
+	lockdep_assert_held(&qp->s_lock);
+	qpriv->s_flags |= HFI1_S_TID_RETRY_TIMER;
+	mod_timer(&qpriv->s_tid_retry_timer,
+		  jiffies + qpriv->tid_retry_timeout_jiffies +
+		  rdi->busy_jiffies);
+}
+
+/*
+ * Stop the TID retry timer if HFI1_S_TID_RETRY_TIMER is set and clear the
+ * flag. The caller must hold qp->s_lock.
+ *
+ * Return: the result of del_timer(), or 0 when the flag was not set.
+ */
+static int hfi1_stop_tid_retry_timer(struct rvt_qp *qp)
+{
+	struct hfi1_qp_priv *qpriv = qp->priv;
+	int ret;
+
+	lockdep_assert_held(&qp->s_lock);
+	if (!(qpriv->s_flags & HFI1_S_TID_RETRY_TIMER))
+		return 0;
+
+	ret = del_timer(&qpriv->s_tid_retry_timer);
+	qpriv->s_flags &= ~HFI1_S_TID_RETRY_TIMER;
+	return ret;
+}
+
+/*
+ * Delete the TID retry timer with del_timer_sync() and then clear
+ * HFI1_S_TID_RETRY_TIMER. Unlike hfi1_stop_tid_retry_timer(), this
+ * function does not assert that qp->s_lock is held.
+ */
+void hfi1_del_tid_retry_timer(struct rvt_qp *qp)
+{
+	struct hfi1_qp_priv *qpriv = qp->priv;
+
+	del_timer_sync(&qpriv->s_tid_retry_timer);
+	qpriv->s_flags &= ~HFI1_S_TID_RETRY_TIMER;
+}
+/*
+ * hfi1_tid_retry_timeout - callback of the TID retry timer
+ * @t: the timer embedded in struct hfi1_qp_priv as s_tid_retry_timer
+ *
+ * Takes qp->r_lock (irqsave) and then qp->s_lock. Nothing is done unless
+ * HFI1_S_TID_RETRY_TIMER is still set. Otherwise the timer is stopped and:
+ *  - if s_retry is 0, the WQE at qp->s_acked is completed with
+ *    IB_WC_RETRY_EXC_ERR and the QP is put into error with
+ *    IB_WC_WR_FLUSH_ERR;
+ *  - else a RESYNC is set up: RVT_S_WAIT_ACK is cleared, RVT_S_SEND_ONE
+ *    and HFI1_S_WAIT_HALT are set, s_state becomes TID_OP(RESYNC),
+ *    s_retry is decremented and the TID send engine is scheduled.
+ */
+static void hfi1_tid_retry_timeout(struct timer_list *t)
+{
+ struct hfi1_qp_priv *priv = from_timer(priv, t, s_tid_retry_timer);
+ struct rvt_qp *qp = priv->owner;
+ struct rvt_swqe *wqe;
+ unsigned long flags;
+ struct tid_rdma_request *req;
+
+ spin_lock_irqsave(&qp->r_lock, flags);
+ spin_lock(&qp->s_lock);
+ trace_hfi1_tid_write_sender_retry_timeout(qp, 0);
+ if (priv->s_flags & HFI1_S_TID_RETRY_TIMER) {
+ hfi1_stop_tid_retry_timer(qp);
+ if (!priv->s_retry) {
+ trace_hfi1_msg_tid_retry_timeout(/* msg */
+ qp,
+ "Exhausted retries. Tid retry timeout = ",
+ (u64)priv->tid_retry_timeout_jiffies);
+
+ wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
+ hfi1_trdma_send_complete(qp, wqe, IB_WC_RETRY_EXC_ERR);
+ rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR);
+ } else {
+ wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
+ req = wqe_to_tid_req(wqe);
+ trace_hfi1_tid_req_tid_retry_timeout(/* req */
+ qp, 0, wqe->wr.opcode, wqe->psn, wqe->lpsn, req);
+
+ priv->s_flags &= ~RVT_S_WAIT_ACK;
+ /* Only send one packet (the RESYNC) */
+ priv->s_flags |= RVT_S_SEND_ONE;
+ /*
+ * No additional request shall be made by this QP until
+ * the RESYNC has been completed.
+ */
+ qp->s_flags |= HFI1_S_WAIT_HALT;
+ priv->s_state = TID_OP(RESYNC);
+ priv->s_retry--;
+ hfi1_schedule_tid_send(qp);
+ }
+ }
+ spin_unlock(&qp->s_lock);
+ spin_unlock_irqrestore(&qp->r_lock, flags);
+}
+/*
+ * hfi1_build_tid_rdma_resync - build a TID RDMA RESYNC packet header
+ * @qp: the QP the RESYNC is built for
+ * @wqe: send WQE carrying the TID RDMA request
+ * @ohdr: header being built
+ * @bth1: set to the qp field of the remote TID RDMA parameters
+ * @bth2: set to (generation << HFI1_KDETH_BTH_SEQ_SHIFT) - 1, where
+ *        generation is the one following the flow's current generation,
+ *        with IB_BTH_REQ_ACK OR'ed in
+ * @fidx: index of the flow (within the request) whose generation is used
+ *
+ * The PSN (without IB_BTH_REQ_ACK) is also saved in qpriv->s_resync_psn.
+ *
+ * NOTE(review): the header fields are written through the ack member of
+ * the tid_rdma union while the returned size is that of the resync
+ * member; presumably both members share the same layout - confirm.
+ *
+ * Return: size of the resync header in 32-bit words.
+ */
+u32 hfi1_build_tid_rdma_resync(struct rvt_qp *qp, struct rvt_swqe *wqe,
+ struct ib_other_headers *ohdr, u32 *bth1,
+ u32 *bth2, u16 fidx)
+{
+ struct hfi1_qp_priv *qpriv = qp->priv;
+ struct tid_rdma_params *remote;
+ struct tid_rdma_request *req = wqe_to_tid_req(wqe);
+ struct tid_rdma_flow *flow = &req->flows[fidx];
+ u32 generation;
+
+ rcu_read_lock();
+ remote = rcu_dereference(qpriv->tid_rdma.remote);
+ KDETH_RESET(ohdr->u.tid_rdma.ack.kdeth1, JKEY, remote->jkey);
+ ohdr->u.tid_rdma.ack.verbs_qp = cpu_to_be32(qp->remote_qpn);
+ *bth1 = remote->qp;
+ rcu_read_unlock();
+
+ generation = kern_flow_generation_next(flow->flow_state.generation);
+ *bth2 = mask_psn((generation << HFI1_KDETH_BTH_SEQ_SHIFT) - 1);
+ qpriv->s_resync_psn = *bth2;
+ *bth2 |= IB_BTH_REQ_ACK;
+ KDETH_RESET(ohdr->u.tid_rdma.ack.kdeth0, KVER, 0x1);
+
+ return sizeof(ohdr->u.tid_rdma.resync) / sizeof(u32);
+}
+/*
+ * hfi1_rc_rcv_tid_rdma_resync - receive a TID RDMA RESYNC packet
+ * @packet: the received packet
+ *
+ * Under qp->s_lock, the generation carried by the packet (derived from
+ * PSN + 1) is checked against the QP's flow state; the packet is ignored
+ * if the generation is neither the next one nor the one before it, or if
+ * a resync is already being processed.
+ *
+ * Otherwise, under rcd->exp_lock, the new generation is either saved in
+ * the QP's flow state (no HW flow allocated) or programmed into the HW
+ * flow, SW PSN checking is disabled, and every flow from clear_tail to
+ * setup_head of each TID RDMA WRITE request in the ack queue, from
+ * r_tid_tail up to s_tail_ack_queue, is renumbered for the new
+ * generation. Finally qpriv->resync is set and a TID RDMA ACK is
+ * scheduled.
+ *
+ * NOTE(review): is_fecn is assigned from process_ecn() but not read
+ * again in this function.
+ */
+void hfi1_rc_rcv_tid_rdma_resync(struct hfi1_packet *packet)
+{
+ struct ib_other_headers *ohdr = packet->ohdr;
+ struct rvt_qp *qp = packet->qp;
+ struct hfi1_qp_priv *qpriv = qp->priv;
+ struct hfi1_ctxtdata *rcd = qpriv->rcd;
+ struct hfi1_ibdev *dev = to_idev(qp->ibqp.device);
+ struct rvt_ack_entry *e;
+ struct tid_rdma_request *req;
+ struct tid_rdma_flow *flow;
+ struct tid_flow_state *fs = &qpriv->flow_state;
+ u32 psn, generation, idx, gen_next;
+ bool is_fecn;
+ unsigned long flags;
+
+ is_fecn = process_ecn(qp, packet);
+ psn = mask_psn(be32_to_cpu(ohdr->bth[2]));
+
+ generation = mask_psn(psn + 1) >> HFI1_KDETH_BTH_SEQ_SHIFT;
+ spin_lock_irqsave(&qp->s_lock, flags);
+
+ gen_next = (fs->generation == KERN_GENERATION_RESERVED) ?
+ generation : kern_flow_generation_next(fs->generation);
+ /*
+ * RESYNC packet contains the "next" generation and can only be
+ * from the current or previous generations
+ */
+ if (generation != mask_generation(gen_next - 1) &&
+ generation != gen_next)
+ goto bail;
+ /* Already processing a resync */
+ if (qpriv->resync)
+ goto bail;
+
+ spin_lock(&rcd->exp_lock);
+ if (fs->index >= RXE_NUM_TID_FLOWS) {
+ /*
+ * If we don't have a flow, save the generation so it can be
+ * applied when a new flow is allocated
+ */
+ fs->generation = generation;
+ } else {
+ /* Reprogram the QP flow with new generation */
+ rcd->flows[fs->index].generation = generation;
+ fs->generation = kern_setup_hw_flow(rcd, fs->index);
+ }
+ fs->psn = 0;
+ /*
+ * Disable SW PSN checking since a RESYNC is equivalent to a
+ * sync point and the flow has/will be reprogrammed
+ */
+ qpriv->s_flags &= ~HFI1_R_TID_SW_PSN;
+ trace_hfi1_tid_write_rsp_rcv_resync(qp);
+
+ /*
+ * Reset all TID flow information with the new generation.
+ * This is done for all requests and segments after the
+ * last received segment
+ */
+ for (idx = qpriv->r_tid_tail; ; idx++) {
+ u16 flow_idx;
+
+ if (idx > rvt_size_atomic(&dev->rdi))
+ idx = 0;
+ e = &qp->s_ack_queue[idx];
+ if (e->opcode == TID_OP(WRITE_REQ)) {
+ req = ack_to_tid_req(e);
+ trace_hfi1_tid_req_rcv_resync(qp, 0, e->opcode, e->psn,
+ e->lpsn, req);
+
+ /* start from last unacked segment */
+ for (flow_idx = req->clear_tail;
+ CIRC_CNT(req->setup_head, flow_idx,
+ MAX_FLOWS);
+ flow_idx = CIRC_NEXT(flow_idx, MAX_FLOWS)) {
+ u32 lpsn;
+ u32 next;
+
+ flow = &req->flows[flow_idx];
+ lpsn = full_flow_psn(flow,
+ flow->flow_state.lpsn);
+ next = flow->flow_state.r_next_psn;
+ flow->npkts = delta_psn(lpsn, next - 1);
+ flow->flow_state.generation = fs->generation;
+ flow->flow_state.spsn = fs->psn;
+ flow->flow_state.lpsn =
+ flow->flow_state.spsn + flow->npkts - 1;
+ flow->flow_state.r_next_psn =
+ full_flow_psn(flow,
+ flow->flow_state.spsn);
+ fs->psn += flow->npkts;
+ trace_hfi1_tid_flow_rcv_resync(qp, flow_idx,
+ flow);
+ }
+ }
+ if (idx == qp->s_tail_ack_queue)
+ break;
+ }
+
+ spin_unlock(&rcd->exp_lock);
+ qpriv->resync = true;
+ /* RESYNC request always gets a TID RDMA ACK. */
+ qpriv->s_nak_state = 0;
+ qpriv->s_flags |= RVT_S_ACK_PENDING;
+ hfi1_schedule_tid_send(qp);
+bail:
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+}
+
+/*
+ * Call this function when the last TID RDMA WRITE DATA packet for a request
+ * is built.
+ */
+static void update_tid_tail(struct rvt_qp *qp)
+ __must_hold(&qp->s_lock)
+{
+ struct hfi1_qp_priv *priv = qp->priv;
+ u32 i;
+ struct rvt_swqe *wqe;
+
+ lockdep_assert_held(&qp->s_lock);
+ /* Can't move beyond s_tid_cur */
+ if (priv->s_tid_tail == priv->s_tid_cur)
+ return;
+ for (i = priv->s_tid_tail + 1; ; i++) {
+ if (i == qp->s_size)
+ i = 0;
+
+ if (i == priv->s_tid_cur)
+ break;
+ wqe = rvt_get_swqe_ptr(qp, i);
+ if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE)
+ break;
+ }
+ priv->s_tid_tail = i;
+ priv->s_state = TID_OP(WRITE_RESP);
+}
+
+int hfi1_make_tid_rdma_pkt(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
+ __must_hold(&qp->s_lock)
+{
+ struct hfi1_qp_priv *priv = qp->priv;
+ struct rvt_swqe *wqe;
+ u32 bth1 = 0, bth2 = 0, hwords = 5, len, middle = 0;
+ struct ib_other_headers *ohdr;
+ struct rvt_sge_state *ss = &qp->s_sge;
+ struct rvt_ack_entry *e = &qp->s_ack_queue[qp->s_tail_ack_queue];
+ struct tid_rdma_request *req = ack_to_tid_req(e);
+ bool last = false;
+ u8 opcode = TID_OP(WRITE_DATA);
+
+ lockdep_assert_held(&qp->s_lock);
+ trace_hfi1_tid_write_sender_make_tid_pkt(qp, 0);
+ /*
+ * Prioritize the sending of the requests and responses over the
+ * sending of the TID RDMA data packets.
+ */
+ if (((atomic_read(&priv->n_tid_requests) < HFI1_TID_RDMA_WRITE_CNT) &&
+ atomic_read(&priv->n_requests) &&
+ !(qp->s_flags & (RVT_S_BUSY | RVT_S_WAIT_ACK |
+ HFI1_S_ANY_WAIT_IO))) ||
+ (e->opcode == TID_OP(WRITE_REQ) && req->cur_seg < req->alloc_seg &&
+ !(qp->s_flags & (RVT_S_BUSY | HFI1_S_ANY_WAIT_IO)))) {
+ struct iowait_work *iowork;
+
+ iowork = iowait_get_ib_work(&priv->s_iowait);
+ ps->s_txreq = get_waiting_verbs_txreq(iowork);
+ if (ps->s_txreq || hfi1_make_rc_req(qp, ps)) {
+ priv->s_flags |= HFI1_S_TID_BUSY_SET;
+ return 1;
+ }
+ }
+
+ ps->s_txreq = get_txreq(ps->dev, qp);
+ if (!ps->s_txreq)
+ goto bail_no_tx;
+
+ ohdr = &ps->s_txreq->phdr.hdr.ibh.u.oth;
+
+ if ((priv->s_flags & RVT_S_ACK_PENDING) &&
+ make_tid_rdma_ack(qp, ohdr, ps))
+ return 1;
+
+ /*
+ * Bail out if we can't send data.
+ * Be reminded that this check must be done after the call to
+ * make_tid_rdma_ack() because the responding QP could be in
+ * RTR state where it can send TID RDMA ACK, not TID RDMA WRITE DATA.
+ */
+ if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_SEND_OK))
+ goto bail;
+
+ if (priv->s_flags & RVT_S_WAIT_ACK)
+ goto bail;
+
+ /* Check whether there is anything to do. */
+ if (priv->s_tid_tail == HFI1_QP_WQE_INVALID)
+ goto bail;
+ wqe = rvt_get_swqe_ptr(qp, priv->s_tid_tail);
+ req = wqe_to_tid_req(wqe);
+ trace_hfi1_tid_req_make_tid_pkt(qp, 0, wqe->wr.opcode, wqe->psn,
+ wqe->lpsn, req);
+ switch (priv->s_state) {
+ case TID_OP(WRITE_REQ):
+ case TID_OP(WRITE_RESP):
+ priv->tid_ss.sge = wqe->sg_list[0];
+ priv->tid_ss.sg_list = wqe->sg_list + 1;
+ priv->tid_ss.num_sge = wqe->wr.num_sge;
+ priv->tid_ss.total_len = wqe->length;
+
+ if (priv->s_state == TID_OP(WRITE_REQ))
+ hfi1_tid_rdma_restart_req(qp, wqe, &bth2);
+ priv->s_state = TID_OP(WRITE_DATA);
+ /* fall through */
+
+ case TID_OP(WRITE_DATA):
+ /*
+ * 1. Check whether TID RDMA WRITE RESP available.
+ * 2. If no:
+ * 2.1 If have more segments and no TID RDMA WRITE RESP,
+ * set HFI1_S_WAIT_TID_RESP
+ * 2.2 Return indicating no progress made.
+ * 3. If yes:
+ * 3.1 Build TID RDMA WRITE DATA packet.
+ * 3.2 If last packet in segment:
+ * 3.2.1 Change KDETH header bits
+ * 3.2.2 Advance RESP pointers.
+ * 3.3 Return indicating progress made.
+ */
+ trace_hfi1_sender_make_tid_pkt(qp);
+ trace_hfi1_tid_write_sender_make_tid_pkt(qp, 0);
+ wqe = rvt_get_swqe_ptr(qp, priv->s_tid_tail);
+ req = wqe_to_tid_req(wqe);
+ len = wqe->length;
+
+ if (!req->comp_seg || req->cur_seg == req->comp_seg)
+ goto bail;
+
+ trace_hfi1_tid_req_make_tid_pkt(qp, 0, wqe->wr.opcode,
+ wqe->psn, wqe->lpsn, req);
+ last = hfi1_build_tid_rdma_packet(wqe, ohdr, &bth1, &bth2,
+ &len);
+
+ if (last) {
+ /* move pointer to next flow */
+ req->clear_tail = CIRC_NEXT(req->clear_tail,
+ MAX_FLOWS);
+ if (++req->cur_seg < req->total_segs) {
+ if (!CIRC_CNT(req->setup_head, req->clear_tail,
+ MAX_FLOWS))
+ qp->s_flags |= HFI1_S_WAIT_TID_RESP;
+ } else {
+ priv->s_state = TID_OP(WRITE_DATA_LAST);
+ opcode = TID_OP(WRITE_DATA_LAST);
+
+ /* Advance the s_tid_tail now */
+ update_tid_tail(qp);
+ }
+ }
+ hwords += sizeof(ohdr->u.tid_rdma.w_data) / sizeof(u32);
+ ss = &priv->tid_ss;
+ break;
+
+ case TID_OP(RESYNC):
+ trace_hfi1_sender_make_tid_pkt(qp);
+ /* Use generation from the most recently received response */
+ wqe = rvt_get_swqe_ptr(qp, priv->s_tid_cur);
+ req = wqe_to_tid_req(wqe);
+ /* If no responses for this WQE look at the previous one */
+ if (!req->comp_seg) {
+ wqe = rvt_get_swqe_ptr(qp,
+ (!priv->s_tid_cur ? qp->s_size :
+ priv->s_tid_cur) - 1);
+ req = wqe_to_tid_req(wqe);
+ }
+ hwords += hfi1_build_tid_rdma_resync(qp, wqe, ohdr, &bth1,
+ &bth2,
+ CIRC_PREV(req->setup_head,
+ MAX_FLOWS));
+ ss = NULL;
+ len = 0;
+ opcode = TID_OP(RESYNC);
+ break;
+
+ default:
+ goto bail;
+ }
+ if (priv->s_flags & RVT_S_SEND_ONE) {
+ priv->s_flags &= ~RVT_S_SEND_ONE;
+ priv->s_flags |= RVT_S_WAIT_ACK;
+ bth2 |= IB_BTH_REQ_ACK;
+ }
+ qp->s_len -= len;
+ ps->s_txreq->hdr_dwords = hwords;
+ ps->s_txreq->sde = priv->s_sde;
+ ps->s_txreq->ss = ss;
+ ps->s_txreq->s_cur_size = len;
+ hfi1_make_ruc_header(qp, ohdr, (opcode << 24), bth1, bth2,
+ middle, ps);
+ return 1;
+bail:
+ hfi1_put_txreq(ps->s_txreq);
+bail_no_tx:
+ ps->s_txreq = NULL;
+ priv->s_flags &= ~RVT_S_BUSY;
+ /*
+ * If we didn't get a txreq, the QP will be woken up later to try
+ * again; set the flag to indicate which work item to wake
+ * up.
+ * (A better algorithm should be found to do this and generalize the
+ * sleep/wakeup flags.)
+ */
+ iowait_set_flag(&priv->s_iowait, IOWAIT_PENDING_TID);
+ return 0;
+}
+
+static int make_tid_rdma_ack(struct rvt_qp *qp,
+ struct ib_other_headers *ohdr,
+ struct hfi1_pkt_state *ps)
+{
+ struct rvt_ack_entry *e;
+ struct hfi1_qp_priv *qpriv = qp->priv;
+ struct hfi1_ibdev *dev = to_idev(qp->ibqp.device);
+ u32 hwords, next;
+ u32 len = 0;
+ u32 bth1 = 0, bth2 = 0;
+ int middle = 0;
+ u16 flow;
+ struct tid_rdma_request *req, *nreq;
+
+ trace_hfi1_tid_write_rsp_make_tid_ack(qp);
+ /* Don't send an ACK if we aren't supposed to. */
+ if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK))
+ goto bail;
+
+ /* header size in 32-bit words LRH+BTH = (8+12)/4. */
+ hwords = 5;
+
+ e = &qp->s_ack_queue[qpriv->r_tid_ack];
+ req = ack_to_tid_req(e);
+ /*
+ * In the RESYNC case, we are exactly one segment past the
+ * previously sent ack or at the previously sent NAK. So to send
+ * the resync ack, we go back one segment (which might be part of
+ * the previous request) and let the do-while loop execute again.
+ * The advantage of executing the do-while loop is that any data
+ * received after the previous ack is automatically acked in the
+ * RESYNC ack. It turns out that for the do-while loop we only need
+ * to pull back qpriv->r_tid_ack, not the segment
+ * indices/counters. The scheme works even if the previous request
+ * was not a TID WRITE request.
+ */
+ if (qpriv->resync) {
+ if (!req->ack_seg || req->ack_seg == req->total_segs)
+ qpriv->r_tid_ack = !qpriv->r_tid_ack ?
+ rvt_size_atomic(&dev->rdi) :
+ qpriv->r_tid_ack - 1;
+ e = &qp->s_ack_queue[qpriv->r_tid_ack];
+ req = ack_to_tid_req(e);
+ }
+
+ trace_hfi1_rsp_make_tid_ack(qp, e->psn);
+ trace_hfi1_tid_req_make_tid_ack(qp, 0, e->opcode, e->psn, e->lpsn,
+ req);
+ /*
+ * If we've sent all the ACKs that we can, we are done
+ * until we get more segments...
+ */
+ if (!qpriv->s_nak_state && !qpriv->resync &&
+ req->ack_seg == req->comp_seg)
+ goto bail;
+
+ do {
+ /*
+ * To deal with coalesced ACKs, the acked_tail pointer
+ * into the flow array is used. The distance between it
+ * and the clear_tail is the number of flows that are
+ * being ACK'ed.
+ */
+ req->ack_seg +=
+ /* Get up-to-date value */
+ CIRC_CNT(req->clear_tail, req->acked_tail,
+ MAX_FLOWS);
+ /* Advance acked index */
+ req->acked_tail = req->clear_tail;
+
+ /*
+ * req->clear_tail points to the segment currently being
+ * received. So, when sending an ACK, the previous
+ * segment is being ACK'ed.
+ */
+ flow = CIRC_PREV(req->acked_tail, MAX_FLOWS);
+ if (req->ack_seg != req->total_segs)
+ break;
+ req->state = TID_REQUEST_COMPLETE;
+
+ next = qpriv->r_tid_ack + 1;
+ if (next > rvt_size_atomic(&dev->rdi))
+ next = 0;
+ qpriv->r_tid_ack = next;
+ if (qp->s_ack_queue[next].opcode != TID_OP(WRITE_REQ))
+ break;
+ nreq = ack_to_tid_req(&qp->s_ack_queue[next]);
+ if (!nreq->comp_seg || nreq->ack_seg == nreq->comp_seg)
+ break;
+
+ /* Move to the next ack entry now */
+ e = &qp->s_ack_queue[qpriv->r_tid_ack];
+ req = ack_to_tid_req(e);
+ } while (1);
+
+ /*
+ * At this point qpriv->r_tid_ack == qpriv->r_tid_tail but e and
+ * req could be pointing at the previous ack queue entry
+ */
+ if (qpriv->s_nak_state ||
+ (qpriv->resync &&
+ !hfi1_tid_rdma_is_resync_psn(qpriv->r_next_psn_kdeth - 1) &&
+ (cmp_psn(qpriv->r_next_psn_kdeth - 1,
+ full_flow_psn(&req->flows[flow],
+ req->flows[flow].flow_state.lpsn)) > 0))) {
+ /*
+ * A NAK will implicitly acknowledge all previous TID RDMA
+ * requests. Therefore, we NAK with the req->acked_tail
+ * segment for the request at qpriv->r_tid_ack (same at
+ * this point as the req->clear_tail segment for the
+ * qpriv->r_tid_tail request)
+ */
+ e = &qp->s_ack_queue[qpriv->r_tid_ack];
+ req = ack_to_tid_req(e);
+ flow = req->acked_tail;
+ } else if (req->ack_seg == req->total_segs &&
+ qpriv->s_flags & HFI1_R_TID_WAIT_INTERLCK)
+ qpriv->s_flags &= ~HFI1_R_TID_WAIT_INTERLCK;
+
+ trace_hfi1_tid_write_rsp_make_tid_ack(qp);
+ trace_hfi1_tid_req_make_tid_ack(qp, 0, e->opcode, e->psn, e->lpsn,
+ req);
+ hwords += hfi1_build_tid_rdma_write_ack(qp, e, ohdr, flow, &bth1,
+ &bth2);
+ len = 0;
+ qpriv->s_flags &= ~RVT_S_ACK_PENDING;
+ ps->s_txreq->hdr_dwords = hwords;
+ ps->s_txreq->sde = qpriv->s_sde;
+ ps->s_txreq->s_cur_size = len;
+ ps->s_txreq->ss = NULL;
+ hfi1_make_ruc_header(qp, ohdr, (TID_OP(ACK) << 24), bth1, bth2, middle,
+ ps);
+ ps->s_txreq->txreq.flags |= SDMA_TXREQ_F_VIP;
+ return 1;
+bail:
+ /*
+ * Ensure s_rdma_ack_cnt changes are committed prior to resetting
+ * RVT_S_ACK_PENDING
+ */
+ smp_wmb();
+ qpriv->s_flags &= ~RVT_S_ACK_PENDING;
+ return 0;
+}
+
+static int hfi1_send_tid_ok(struct rvt_qp *qp)
+{
+ struct hfi1_qp_priv *priv = qp->priv;
+
+ return !(priv->s_flags & RVT_S_BUSY ||
+ qp->s_flags & HFI1_S_ANY_WAIT_IO) &&
+ (verbs_txreq_queued(iowait_get_tid_work(&priv->s_iowait)) ||
+ (priv->s_flags & RVT_S_RESP_PENDING) ||
+ !(qp->s_flags & HFI1_S_ANY_TID_WAIT_SEND));
+}
+
+void _hfi1_do_tid_send(struct work_struct *work)
+{
+ struct iowait_work *w = container_of(work, struct iowait_work, iowork);
+ struct rvt_qp *qp = iowait_to_qp(w->iow);
+
+ hfi1_do_tid_send(qp);
+}
+
+static void hfi1_do_tid_send(struct rvt_qp *qp)
+{
+ struct hfi1_pkt_state ps;
+ struct hfi1_qp_priv *priv = qp->priv;
+
+ ps.dev = to_idev(qp->ibqp.device);
+ ps.ibp = to_iport(qp->ibqp.device, qp->port_num);
+ ps.ppd = ppd_from_ibp(ps.ibp);
+ ps.wait = iowait_get_tid_work(&priv->s_iowait);
+ ps.in_thread = false;
+ ps.timeout_int = qp->timeout_jiffies / 8;
+
+ trace_hfi1_rc_do_tid_send(qp, false);
+ spin_lock_irqsave(&qp->s_lock, ps.flags);
+
+ /* Return if we are already busy processing a work request. */
+ if (!hfi1_send_tid_ok(qp)) {
+ if (qp->s_flags & HFI1_S_ANY_WAIT_IO)
+ iowait_set_flag(&priv->s_iowait, IOWAIT_PENDING_TID);
+ spin_unlock_irqrestore(&qp->s_lock, ps.flags);
+ return;
+ }
+
+ priv->s_flags |= RVT_S_BUSY;
+
+ ps.timeout = jiffies + ps.timeout_int;
+ ps.cpu = priv->s_sde ? priv->s_sde->cpu :
+ cpumask_first(cpumask_of_node(ps.ppd->dd->node));
+ ps.pkts_sent = false;
+
+ /* ensure a pre-built packet is handled */
+ ps.s_txreq = get_waiting_verbs_txreq(ps.wait);
+ do {
+ /* Check for a constructed packet to be sent. */
+ if (ps.s_txreq) {
+ if (priv->s_flags & HFI1_S_TID_BUSY_SET) {
+ qp->s_flags |= RVT_S_BUSY;
+ ps.wait = iowait_get_ib_work(&priv->s_iowait);
+ }
+ spin_unlock_irqrestore(&qp->s_lock, ps.flags);
+
+ /*
+ * If the packet cannot be sent now, return and
+ * the send tasklet will be woken up later.
+ */
+ if (hfi1_verbs_send(qp, &ps))
+ return;
+
+ /* allow other tasks to run */
+ if (hfi1_schedule_send_yield(qp, &ps, true))
+ return;
+
+ spin_lock_irqsave(&qp->s_lock, ps.flags);
+ if (priv->s_flags & HFI1_S_TID_BUSY_SET) {
+ qp->s_flags &= ~RVT_S_BUSY;
+ priv->s_flags &= ~HFI1_S_TID_BUSY_SET;
+ ps.wait = iowait_get_tid_work(&priv->s_iowait);
+ if (iowait_flag_set(&priv->s_iowait,
+ IOWAIT_PENDING_IB))
+ hfi1_schedule_send(qp);
+ }
+ }
+ } while (hfi1_make_tid_rdma_pkt(qp, &ps));
+ iowait_starve_clear(ps.pkts_sent, &priv->s_iowait);
+ spin_unlock_irqrestore(&qp->s_lock, ps.flags);
+}
+
+static bool _hfi1_schedule_tid_send(struct rvt_qp *qp)
+{
+ struct hfi1_qp_priv *priv = qp->priv;
+ struct hfi1_ibport *ibp =
+ to_iport(qp->ibqp.device, qp->port_num);
+ struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+ struct hfi1_devdata *dd = dd_from_ibdev(qp->ibqp.device);
+
+ return iowait_tid_schedule(&priv->s_iowait, ppd->hfi1_wq,
+ priv->s_sde ?
+ priv->s_sde->cpu :
+ cpumask_first(cpumask_of_node(dd->node)));
+}
+
+/**
+ * hfi1_schedule_tid_send - schedule progress on TID RDMA state machine
+ * @qp: the QP
+ *
+ * This schedules qp progress on the TID RDMA state machine. Caller
+ * should hold the s_lock.
+ * Unlike hfi1_schedule_send(), this cannot use hfi1_send_ok() because
+ * the two state machines can step on each other with respect to the
+ * RVT_S_BUSY flag.
+ * Therefore, a modified test is used.
+ * Return: true if the second leg is scheduled;
+ * false if the second leg is not scheduled.
+ */
+bool hfi1_schedule_tid_send(struct rvt_qp *qp)
+{
+ lockdep_assert_held(&qp->s_lock);
+ if (hfi1_send_tid_ok(qp)) {
+ /*
+ * The following call returns true if the qp is not on the
+ * queue and false if the qp is already on the queue before
+ * this call. Either way, the qp will be on the queue when the
+ * call returns.
+ */
+ _hfi1_schedule_tid_send(qp);
+ return true;
+ }
+ if (qp->s_flags & HFI1_S_ANY_WAIT_IO)
+ iowait_set_flag(&((struct hfi1_qp_priv *)qp->priv)->s_iowait,
+ IOWAIT_PENDING_TID);
+ return false;
+}
+
+bool hfi1_tid_rdma_ack_interlock(struct rvt_qp *qp, struct rvt_ack_entry *e)
+{
+ struct rvt_ack_entry *prev;
+ struct tid_rdma_request *req;
+ struct hfi1_ibdev *dev = to_idev(qp->ibqp.device);
+ struct hfi1_qp_priv *priv = qp->priv;
+ u32 s_prev;
+
+ s_prev = qp->s_tail_ack_queue == 0 ? rvt_size_atomic(&dev->rdi) :
+ (qp->s_tail_ack_queue - 1);
+ prev = &qp->s_ack_queue[s_prev];
+
+ if ((e->opcode == TID_OP(READ_REQ) ||
+ e->opcode == OP(RDMA_READ_REQUEST)) &&
+ prev->opcode == TID_OP(WRITE_REQ)) {
+ req = ack_to_tid_req(prev);
+ if (req->ack_seg != req->total_segs) {
+ priv->s_flags |= HFI1_R_TID_WAIT_INTERLCK;
+ return true;
+ }
+ }
+ return false;
+}
diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.h b/drivers/infiniband/hw/hfi1/tid_rdma.h
new file mode 100644
index 000000000000..53ab24ef4f02
--- /dev/null
+++ b/drivers/infiniband/hw/hfi1/tid_rdma.h
@@ -0,0 +1,322 @@
+/* SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) */
+/*
+ * Copyright(c) 2018 Intel Corporation.
+ *
+ */
+#ifndef HFI1_TID_RDMA_H
+#define HFI1_TID_RDMA_H
+
+#include <linux/circ_buf.h>
+#include "common.h"
+
+/* Add a convenience helper */
+#define CIRC_ADD(val, add, size) (((val) + (add)) & ((size) - 1))
+#define CIRC_NEXT(val, size) CIRC_ADD(val, 1, size)
+#define CIRC_PREV(val, size) CIRC_ADD(val, -1, size)
+
+#define TID_RDMA_MIN_SEGMENT_SIZE BIT(18) /* 256 KiB (for now) */
+#define TID_RDMA_MAX_SEGMENT_SIZE BIT(18) /* 256 KiB (for now) */
+#define TID_RDMA_MAX_PAGES (BIT(18) >> PAGE_SHIFT)
+
+/*
+ * Bit definitions for priv->s_flags.
+ * These bit flags overload the bit flags defined for the QP's s_flags.
+ * Due to the fact that these bit fields are used only for the QP priv
+ * s_flags, there are no collisions.
+ *
+ * HFI1_S_TID_WAIT_INTERLCK - QP is waiting for requester interlock
+ * HFI1_R_TID_WAIT_INTERLCK - QP is waiting for responder interlock
+ */
+#define HFI1_S_TID_BUSY_SET BIT(0)
+/* BIT(1) reserved for RVT_S_BUSY. */
+#define HFI1_R_TID_RSC_TIMER BIT(2)
+/* BIT(3) reserved for RVT_S_RESP_PENDING. */
+/* BIT(4) reserved for RVT_S_ACK_PENDING. */
+#define HFI1_S_TID_WAIT_INTERLCK BIT(5)
+#define HFI1_R_TID_WAIT_INTERLCK BIT(6)
+/* BIT(7) - BIT(15) reserved for RVT_S_WAIT_*. */
+/* BIT(16) reserved for RVT_S_SEND_ONE */
+#define HFI1_S_TID_RETRY_TIMER BIT(17)
+/* BIT(18) reserved for RVT_S_ECN. */
+#define HFI1_R_TID_SW_PSN BIT(19)
+/* BIT(26) reserved for HFI1_S_WAIT_HALT */
+/* BIT(27) reserved for HFI1_S_WAIT_TID_RESP */
+/* BIT(28) reserved for HFI1_S_WAIT_TID_SPACE */
+
+/*
+ * Unlike regular IB RDMA VERBS, which do not require an entry
+ * in the s_ack_queue, TID RDMA WRITE requests do because they
+ * generate responses.
+ * Therefore, the s_ack_queue needs to be extended by a certain
+ * amount. The key point is that the queue needs to be extended
+ * without letting the "user" know so the user doesn't end up
+ * using these extra entries.
+ */
+#define HFI1_TID_RDMA_WRITE_CNT 8
+
+struct tid_rdma_params {
+ struct rcu_head rcu_head;
+ u32 qp;
+ u32 max_len;
+ u16 jkey;
+ u8 max_read;
+ u8 max_write;
+ u8 timeout;
+ u8 urg;
+ u8 version;
+};
+
+struct tid_rdma_qp_params {
+ struct work_struct trigger_work;
+ struct tid_rdma_params local;
+ struct tid_rdma_params __rcu *remote;
+};
+
+/* Track state for each hardware flow */
+struct tid_flow_state {
+ u32 generation;
+ u32 psn;
+ u32 r_next_psn; /* next PSN to be received (in TID space) */
+ u8 index;
+ u8 last_index;
+ u8 flags;
+};
+
+enum tid_rdma_req_state {
+ TID_REQUEST_INACTIVE = 0,
+ TID_REQUEST_INIT,
+ TID_REQUEST_INIT_RESEND,
+ TID_REQUEST_ACTIVE,
+ TID_REQUEST_RESEND,
+ TID_REQUEST_RESEND_ACTIVE,
+ TID_REQUEST_QUEUED,
+ TID_REQUEST_SYNC,
+ TID_REQUEST_RNR_NAK,
+ TID_REQUEST_COMPLETE,
+};
+
+struct tid_rdma_request {
+ struct rvt_qp *qp;
+ struct hfi1_ctxtdata *rcd;
+ union {
+ struct rvt_swqe *swqe;
+ struct rvt_ack_entry *ack;
+ } e;
+
+ struct tid_rdma_flow *flows; /* array of tid flows */
+ struct rvt_sge_state ss; /* SGE state for TID RDMA requests */
+ u16 n_flows; /* size of the flow buffer window */
+ u16 setup_head; /* flow index we are setting up */
+ u16 clear_tail; /* flow index we are clearing */
+ u16 flow_idx; /* flow index most recently set up */
+ u16 acked_tail;
+
+ u32 seg_len;
+ u32 total_len;
+ u32 r_ack_psn; /* next expected ack PSN */
+ u32 r_flow_psn; /* IB PSN of next segment start */
+ u32 r_last_acked; /* IB PSN of last ACK'ed packet */
+ u32 s_next_psn; /* IB PSN of next segment start for read */
+
+ u32 total_segs; /* segments required to complete a request */
+ u32 cur_seg; /* index of current segment */
+ u32 comp_seg; /* index of last completed segment */
+ u32 ack_seg; /* index of last ack'ed segment */
+ u32 alloc_seg; /* index of next segment to be allocated */
+ u32 isge; /* index of "current" sge */
+ u32 ack_pending; /* num acks pending for this request */
+
+ enum tid_rdma_req_state state;
+};
+
+/*
+ * When header suppression is used, PSNs associated with a "flow" are
+ * relevant (and not the PSNs maintained by verbs). Track per-flow
+ * PSNs here for a TID RDMA segment.
+ *
+ */
+struct flow_state {
+ u32 flags;
+ u32 resp_ib_psn; /* The IB PSN of the response for this flow */
+ u32 generation; /* generation of flow */
+ u32 spsn; /* starting PSN in TID space */
+ u32 lpsn; /* last PSN in TID space */
+ u32 r_next_psn; /* next PSN to be received (in TID space) */
+
+ /* For tid rdma read */
+ u32 ib_spsn; /* starting PSN in Verbs space */
+ u32 ib_lpsn; /* last PSN in Verbs space */
+};
+
+struct tid_rdma_pageset {
+ dma_addr_t addr : 48; /* Only needed for the first page */
+ u8 idx: 8;
+ u8 count : 7;
+ u8 mapped: 1;
+};
+
+/**
+ * struct kern_tid_node - used for managing TIDs in TID groups
+ *
+ * @grp: pointer to the tid_group this node refers to
+ * @map: grp->map captured prior to programming this TID group in HW
+ * @cnt: Only @cnt of available group entries are actually programmed
+ */
+struct kern_tid_node {
+ struct tid_group *grp;
+ u8 map;
+ u8 cnt;
+};
+
+/* Overall info for a TID RDMA segment */
+struct tid_rdma_flow {
+ /*
+ * While a TID RDMA segment is being transferred, it uses a QP number
+ * from the "KDETH section of QP numbers" (which is different from the
+ * QP number that originated the request). Bits 11-15 of these QP
+ * numbers identify the "TID flow" for the segment.
+ */
+ struct flow_state flow_state;
+ struct tid_rdma_request *req;
+ u32 tid_qpn;
+ u32 tid_offset;
+ u32 length;
+ u32 sent;
+ u8 tnode_cnt;
+ u8 tidcnt;
+ u8 tid_idx;
+ u8 idx;
+ u8 npagesets;
+ u8 npkts;
+ u8 pkt;
+ u8 resync_npkts;
+ struct kern_tid_node tnode[TID_RDMA_MAX_PAGES];
+ struct tid_rdma_pageset pagesets[TID_RDMA_MAX_PAGES];
+ u32 tid_entry[TID_RDMA_MAX_PAGES];
+};
+
+enum tid_rnr_nak_state {
+ TID_RNR_NAK_INIT = 0,
+ TID_RNR_NAK_SEND,
+ TID_RNR_NAK_SENT,
+};
+
+bool tid_rdma_conn_req(struct rvt_qp *qp, u64 *data);
+bool tid_rdma_conn_reply(struct rvt_qp *qp, u64 data);
+bool tid_rdma_conn_resp(struct rvt_qp *qp, u64 *data);
+void tid_rdma_conn_error(struct rvt_qp *qp);
+void tid_rdma_opfn_init(struct rvt_qp *qp, struct tid_rdma_params *p);
+
+int hfi1_kern_exp_rcv_init(struct hfi1_ctxtdata *rcd, int reinit);
+int hfi1_kern_exp_rcv_setup(struct tid_rdma_request *req,
+ struct rvt_sge_state *ss, bool *last);
+int hfi1_kern_exp_rcv_clear(struct tid_rdma_request *req);
+void hfi1_kern_exp_rcv_clear_all(struct tid_rdma_request *req);
+void __trdma_clean_swqe(struct rvt_qp *qp, struct rvt_swqe *wqe);
+
+/**
+ * trdma_clean_swqe - clean flows for swqe if large send queue
+ * @qp: the qp
+ * @wqe: the send wqe
+ */
+static inline void trdma_clean_swqe(struct rvt_qp *qp, struct rvt_swqe *wqe)
+{
+ if (!wqe->priv)
+ return;
+ __trdma_clean_swqe(qp, wqe);
+}
+
+void hfi1_kern_read_tid_flow_free(struct rvt_qp *qp);
+
+int hfi1_qp_priv_init(struct rvt_dev_info *rdi, struct rvt_qp *qp,
+ struct ib_qp_init_attr *init_attr);
+void hfi1_qp_priv_tid_free(struct rvt_dev_info *rdi, struct rvt_qp *qp);
+
+void hfi1_tid_rdma_flush_wait(struct rvt_qp *qp);
+
+int hfi1_kern_setup_hw_flow(struct hfi1_ctxtdata *rcd, struct rvt_qp *qp);
+void hfi1_kern_clear_hw_flow(struct hfi1_ctxtdata *rcd, struct rvt_qp *qp);
+void hfi1_kern_init_ctxt_generations(struct hfi1_ctxtdata *rcd);
+
+struct cntr_entry;
+u64 hfi1_access_sw_tid_wait(const struct cntr_entry *entry,
+ void *context, int vl, int mode, u64 data);
+
+u32 hfi1_build_tid_rdma_read_packet(struct rvt_swqe *wqe,
+ struct ib_other_headers *ohdr,
+ u32 *bth1, u32 *bth2, u32 *len);
+u32 hfi1_build_tid_rdma_read_req(struct rvt_qp *qp, struct rvt_swqe *wqe,
+ struct ib_other_headers *ohdr, u32 *bth1,
+ u32 *bth2, u32 *len);
+void hfi1_rc_rcv_tid_rdma_read_req(struct hfi1_packet *packet);
+u32 hfi1_build_tid_rdma_read_resp(struct rvt_qp *qp, struct rvt_ack_entry *e,
+ struct ib_other_headers *ohdr, u32 *bth0,
+ u32 *bth1, u32 *bth2, u32 *len, bool *last);
+void hfi1_rc_rcv_tid_rdma_read_resp(struct hfi1_packet *packet);
+bool hfi1_handle_kdeth_eflags(struct hfi1_ctxtdata *rcd,
+ struct hfi1_pportdata *ppd,
+ struct hfi1_packet *packet);
+void hfi1_tid_rdma_restart_req(struct rvt_qp *qp, struct rvt_swqe *wqe,
+ u32 *bth2);
+void hfi1_qp_kern_exp_rcv_clear_all(struct rvt_qp *qp);
+bool hfi1_tid_rdma_wqe_interlock(struct rvt_qp *qp, struct rvt_swqe *wqe);
+
+void setup_tid_rdma_wqe(struct rvt_qp *qp, struct rvt_swqe *wqe);
+static inline void hfi1_setup_tid_rdma_wqe(struct rvt_qp *qp,
+ struct rvt_swqe *wqe)
+{
+ if (wqe->priv &&
+ (wqe->wr.opcode == IB_WR_RDMA_READ ||
+ wqe->wr.opcode == IB_WR_RDMA_WRITE) &&
+ wqe->length >= TID_RDMA_MIN_SEGMENT_SIZE)
+ setup_tid_rdma_wqe(qp, wqe);
+}
+
+u32 hfi1_build_tid_rdma_write_req(struct rvt_qp *qp, struct rvt_swqe *wqe,
+ struct ib_other_headers *ohdr,
+ u32 *bth1, u32 *bth2, u32 *len);
+
+void hfi1_compute_tid_rdma_flow_wt(void);
+
+void hfi1_rc_rcv_tid_rdma_write_req(struct hfi1_packet *packet);
+
+u32 hfi1_build_tid_rdma_write_resp(struct rvt_qp *qp, struct rvt_ack_entry *e,
+ struct ib_other_headers *ohdr, u32 *bth1,
+ u32 bth2, u32 *len,
+ struct rvt_sge_state **ss);
+
+void hfi1_del_tid_reap_timer(struct rvt_qp *qp);
+
+void hfi1_rc_rcv_tid_rdma_write_resp(struct hfi1_packet *packet);
+
+bool hfi1_build_tid_rdma_packet(struct rvt_swqe *wqe,
+ struct ib_other_headers *ohdr,
+ u32 *bth1, u32 *bth2, u32 *len);
+
+void hfi1_rc_rcv_tid_rdma_write_data(struct hfi1_packet *packet);
+
+u32 hfi1_build_tid_rdma_write_ack(struct rvt_qp *qp, struct rvt_ack_entry *e,
+ struct ib_other_headers *ohdr, u16 iflow,
+ u32 *bth1, u32 *bth2);
+
+void hfi1_rc_rcv_tid_rdma_ack(struct hfi1_packet *packet);
+
+void hfi1_add_tid_retry_timer(struct rvt_qp *qp);
+void hfi1_del_tid_retry_timer(struct rvt_qp *qp);
+
+u32 hfi1_build_tid_rdma_resync(struct rvt_qp *qp, struct rvt_swqe *wqe,
+ struct ib_other_headers *ohdr, u32 *bth1,
+ u32 *bth2, u16 fidx);
+
+void hfi1_rc_rcv_tid_rdma_resync(struct hfi1_packet *packet);
+
+struct hfi1_pkt_state;
+int hfi1_make_tid_rdma_pkt(struct rvt_qp *qp, struct hfi1_pkt_state *ps);
+
+void _hfi1_do_tid_send(struct work_struct *work);
+
+bool hfi1_schedule_tid_send(struct rvt_qp *qp);
+
+bool hfi1_tid_rdma_ack_interlock(struct rvt_qp *qp, struct rvt_ack_entry *e);
+
+#endif /* HFI1_TID_RDMA_H */
diff --git a/drivers/infiniband/hw/hfi1/trace.c b/drivers/infiniband/hw/hfi1/trace.c
index 7c8aed0ffc07..9a3d236bcc88 100644
--- a/drivers/infiniband/hw/hfi1/trace.c
+++ b/drivers/infiniband/hw/hfi1/trace.c
@@ -46,6 +46,7 @@
*/
#define CREATE_TRACE_POINTS
#include "trace.h"
+#include "exp_rcv.h"
static u8 __get_ib_hdr_len(struct ib_header *hdr)
{
@@ -128,6 +129,15 @@ const char *hfi1_trace_get_packet_l2_str(u8 l2)
#define IETH_PRN "ieth rkey:0x%.8x"
#define ATOMICACKETH_PRN "origdata:%llx"
#define ATOMICETH_PRN "vaddr:0x%llx rkey:0x%.8x sdata:%llx cdata:%llx"
+#define TID_RDMA_KDETH "kdeth0 0x%x kdeth1 0x%x"
+#define TID_RDMA_KDETH_DATA "kdeth0 0x%x: kver %u sh %u intr %u tidctrl %u tid %x offset %x kdeth1 0x%x: jkey %x"
+#define TID_READ_REQ_PRN "tid_flow_psn 0x%x tid_flow_qp 0x%x verbs_qp 0x%x"
+#define TID_READ_RSP_PRN "verbs_qp 0x%x"
+#define TID_WRITE_REQ_PRN "original_qp 0x%x"
+#define TID_WRITE_RSP_PRN "tid_flow_psn 0x%x tid_flow_qp 0x%x verbs_qp 0x%x"
+#define TID_WRITE_DATA_PRN "verbs_qp 0x%x"
+#define TID_ACK_PRN "tid_flow_psn 0x%x verbs_psn 0x%x tid_flow_qp 0x%x verbs_qp 0x%x"
+#define TID_RESYNC_PRN "verbs_qp 0x%x"
#define OP(transport, op) IB_OPCODE_## transport ## _ ## op
@@ -322,6 +332,99 @@ const char *parse_everbs_hdrs(
parse_syndrome(be32_to_cpu(eh->aeth) >> 24),
be32_to_cpu(eh->aeth) & IB_MSN_MASK);
break;
+ case OP(TID_RDMA, WRITE_REQ):
+ trace_seq_printf(p, TID_RDMA_KDETH " " RETH_PRN " "
+ TID_WRITE_REQ_PRN,
+ le32_to_cpu(eh->tid_rdma.w_req.kdeth0),
+ le32_to_cpu(eh->tid_rdma.w_req.kdeth1),
+ ib_u64_get(&eh->tid_rdma.w_req.reth.vaddr),
+ be32_to_cpu(eh->tid_rdma.w_req.reth.rkey),
+ be32_to_cpu(eh->tid_rdma.w_req.reth.length),
+ be32_to_cpu(eh->tid_rdma.w_req.verbs_qp));
+ break;
+ case OP(TID_RDMA, WRITE_RESP):
+ trace_seq_printf(p, TID_RDMA_KDETH " " AETH_PRN " "
+ TID_WRITE_RSP_PRN,
+ le32_to_cpu(eh->tid_rdma.w_rsp.kdeth0),
+ le32_to_cpu(eh->tid_rdma.w_rsp.kdeth1),
+ be32_to_cpu(eh->tid_rdma.w_rsp.aeth) >> 24,
+ parse_syndrome(/* aeth */
+ be32_to_cpu(eh->tid_rdma.w_rsp.aeth)
+ >> 24),
+ (be32_to_cpu(eh->tid_rdma.w_rsp.aeth) &
+ IB_MSN_MASK),
+ be32_to_cpu(eh->tid_rdma.w_rsp.tid_flow_psn),
+ be32_to_cpu(eh->tid_rdma.w_rsp.tid_flow_qp),
+ be32_to_cpu(eh->tid_rdma.w_rsp.verbs_qp));
+ break;
+ case OP(TID_RDMA, WRITE_DATA_LAST):
+ case OP(TID_RDMA, WRITE_DATA):
+ trace_seq_printf(p, TID_RDMA_KDETH_DATA " " TID_WRITE_DATA_PRN,
+ le32_to_cpu(eh->tid_rdma.w_data.kdeth0),
+ KDETH_GET(eh->tid_rdma.w_data.kdeth0, KVER),
+ KDETH_GET(eh->tid_rdma.w_data.kdeth0, SH),
+ KDETH_GET(eh->tid_rdma.w_data.kdeth0, INTR),
+ KDETH_GET(eh->tid_rdma.w_data.kdeth0, TIDCTRL),
+ KDETH_GET(eh->tid_rdma.w_data.kdeth0, TID),
+ KDETH_GET(eh->tid_rdma.w_data.kdeth0, OFFSET),
+ le32_to_cpu(eh->tid_rdma.w_data.kdeth1),
+ KDETH_GET(eh->tid_rdma.w_data.kdeth1, JKEY),
+ be32_to_cpu(eh->tid_rdma.w_data.verbs_qp));
+ break;
+ case OP(TID_RDMA, READ_REQ):
+ trace_seq_printf(p, TID_RDMA_KDETH " " RETH_PRN " "
+ TID_READ_REQ_PRN,
+ le32_to_cpu(eh->tid_rdma.r_req.kdeth0),
+ le32_to_cpu(eh->tid_rdma.r_req.kdeth1),
+ ib_u64_get(&eh->tid_rdma.r_req.reth.vaddr),
+ be32_to_cpu(eh->tid_rdma.r_req.reth.rkey),
+ be32_to_cpu(eh->tid_rdma.r_req.reth.length),
+ be32_to_cpu(eh->tid_rdma.r_req.tid_flow_psn),
+ be32_to_cpu(eh->tid_rdma.r_req.tid_flow_qp),
+ be32_to_cpu(eh->tid_rdma.r_req.verbs_qp));
+ break;
+ case OP(TID_RDMA, READ_RESP):
+ trace_seq_printf(p, TID_RDMA_KDETH_DATA " " AETH_PRN " "
+ TID_READ_RSP_PRN,
+ le32_to_cpu(eh->tid_rdma.r_rsp.kdeth0),
+ KDETH_GET(eh->tid_rdma.r_rsp.kdeth0, KVER),
+ KDETH_GET(eh->tid_rdma.r_rsp.kdeth0, SH),
+ KDETH_GET(eh->tid_rdma.r_rsp.kdeth0, INTR),
+ KDETH_GET(eh->tid_rdma.r_rsp.kdeth0, TIDCTRL),
+ KDETH_GET(eh->tid_rdma.r_rsp.kdeth0, TID),
+ KDETH_GET(eh->tid_rdma.r_rsp.kdeth0, OFFSET),
+ le32_to_cpu(eh->tid_rdma.r_rsp.kdeth1),
+ KDETH_GET(eh->tid_rdma.r_rsp.kdeth1, JKEY),
+ be32_to_cpu(eh->tid_rdma.r_rsp.aeth) >> 24,
+ parse_syndrome(/* aeth */
+ be32_to_cpu(eh->tid_rdma.r_rsp.aeth)
+ >> 24),
+ (be32_to_cpu(eh->tid_rdma.r_rsp.aeth) &
+ IB_MSN_MASK),
+ be32_to_cpu(eh->tid_rdma.r_rsp.verbs_qp));
+ break;
+ case OP(TID_RDMA, ACK):
+ trace_seq_printf(p, TID_RDMA_KDETH " " AETH_PRN " "
+ TID_ACK_PRN,
+ le32_to_cpu(eh->tid_rdma.ack.kdeth0),
+ le32_to_cpu(eh->tid_rdma.ack.kdeth1),
+ be32_to_cpu(eh->tid_rdma.ack.aeth) >> 24,
+ parse_syndrome(/* aeth */
+ be32_to_cpu(eh->tid_rdma.ack.aeth)
+ >> 24),
+ (be32_to_cpu(eh->tid_rdma.ack.aeth) &
+ IB_MSN_MASK),
+ be32_to_cpu(eh->tid_rdma.ack.tid_flow_psn),
+ be32_to_cpu(eh->tid_rdma.ack.verbs_psn),
+ be32_to_cpu(eh->tid_rdma.ack.tid_flow_qp),
+ be32_to_cpu(eh->tid_rdma.ack.verbs_qp));
+ break;
+ case OP(TID_RDMA, RESYNC):
+ trace_seq_printf(p, TID_RDMA_KDETH " " TID_RESYNC_PRN,
+ le32_to_cpu(eh->tid_rdma.resync.kdeth0),
+ le32_to_cpu(eh->tid_rdma.resync.kdeth1),
+ be32_to_cpu(eh->tid_rdma.resync.verbs_qp));
+ break;
/* aeth + atomicacketh */
case OP(RC, ATOMIC_ACKNOWLEDGE):
trace_seq_printf(p, AETH_PRN " " ATOMICACKETH_PRN,
@@ -394,6 +497,21 @@ const char *print_u32_array(
return ret;
}
+u8 hfi1_trace_get_tid_ctrl(u32 ent)
+{
+ return EXP_TID_GET(ent, CTRL);
+}
+
+u16 hfi1_trace_get_tid_len(u32 ent)
+{
+ return EXP_TID_GET(ent, LEN);
+}
+
+u16 hfi1_trace_get_tid_idx(u32 ent)
+{
+ return EXP_TID_GET(ent, IDX);
+}
+
__hfi1_trace_fn(AFFINITY);
__hfi1_trace_fn(PKT);
__hfi1_trace_fn(PROC);
diff --git a/drivers/infiniband/hw/hfi1/trace.h b/drivers/infiniband/hw/hfi1/trace.h
index 84458f1325e1..1ce551864118 100644
--- a/drivers/infiniband/hw/hfi1/trace.h
+++ b/drivers/infiniband/hw/hfi1/trace.h
@@ -63,3 +63,4 @@ __print_symbolic(etype, \
#include "trace_tx.h"
#include "trace_mmu.h"
#include "trace_iowait.h"
+#include "trace_tid.h"
diff --git a/drivers/infiniband/hw/hfi1/trace_ibhdrs.h b/drivers/infiniband/hw/hfi1/trace_ibhdrs.h
index 1dc2c28fc96e..d1372cc66de6 100644
--- a/drivers/infiniband/hw/hfi1/trace_ibhdrs.h
+++ b/drivers/infiniband/hw/hfi1/trace_ibhdrs.h
@@ -79,6 +79,14 @@ __print_symbolic(opcode, \
ib_opcode_name(RC_ATOMIC_ACKNOWLEDGE), \
ib_opcode_name(RC_COMPARE_SWAP), \
ib_opcode_name(RC_FETCH_ADD), \
+ ib_opcode_name(TID_RDMA_WRITE_REQ), \
+ ib_opcode_name(TID_RDMA_WRITE_RESP), \
+ ib_opcode_name(TID_RDMA_WRITE_DATA), \
+ ib_opcode_name(TID_RDMA_WRITE_DATA_LAST), \
+ ib_opcode_name(TID_RDMA_READ_REQ), \
+ ib_opcode_name(TID_RDMA_READ_RESP), \
+ ib_opcode_name(TID_RDMA_RESYNC), \
+ ib_opcode_name(TID_RDMA_ACK), \
ib_opcode_name(UC_SEND_FIRST), \
ib_opcode_name(UC_SEND_MIDDLE), \
ib_opcode_name(UC_SEND_LAST), \
diff --git a/drivers/infiniband/hw/hfi1/trace_rc.h b/drivers/infiniband/hw/hfi1/trace_rc.h
index 8ce476570462..1ebca37862e0 100644
--- a/drivers/infiniband/hw/hfi1/trace_rc.h
+++ b/drivers/infiniband/hw/hfi1/trace_rc.h
@@ -109,6 +109,54 @@ DEFINE_EVENT(hfi1_rc_template, hfi1_rcv_error,
TP_ARGS(qp, psn)
);
+DEFINE_EVENT(/* rc_completion: instance of hfi1_rc_template */
+ hfi1_rc_template,
+ hfi1_rc_completion,
+ TP_PROTO(struct rvt_qp *qp, u32 psn), TP_ARGS(qp, psn)
+);
+
+DECLARE_EVENT_CLASS(/* rc_ack: qpn, aeth, psn and the wqe's opcode/psn/lpsn */
+ hfi1_rc_ack_template,
+ TP_PROTO(struct rvt_qp *qp, u32 aeth, u32 psn, struct rvt_swqe *wqe),
+ TP_ARGS(qp, aeth, psn, wqe),
+ TP_STRUCT__entry(/* record layout */
+ DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device))
+ __field(u32, qpn)
+ __field(u32, aeth)
+ __field(u32, psn)
+ __field(u8, opcode)
+ __field(u32, spsn)
+ __field(u32, lpsn)
+ ),
+ TP_fast_assign(/* dev string, caller arguments, then the wqe fields */
+ DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device))
+ __entry->aeth = aeth;
+ __entry->psn = psn;
+ __entry->qpn = qp->ibqp.qp_num;
+ __entry->lpsn = wqe->lpsn;
+ __entry->spsn = wqe->psn;
+ __entry->opcode = wqe->wr.opcode;
+ ),
+ TP_printk(/* every numeric field is printed in hex */
+ "[%s] qpn 0x%x aeth 0x%x psn 0x%x opcode 0x%x spsn 0x%x lpsn 0x%x",
+ __get_str(dev), __entry->qpn, __entry->aeth, __entry->psn,
+ __entry->opcode, __entry->spsn, __entry->lpsn
+ )
+);
+
+DEFINE_EVENT(/* rc_ack_do: instance of hfi1_rc_ack_template */
+ hfi1_rc_ack_template,
+ hfi1_rc_ack_do,
+ TP_PROTO(struct rvt_qp *qp, u32 aeth, u32 psn, struct rvt_swqe *wqe),
+ TP_ARGS(qp, aeth, psn, wqe)
+);
+
#endif /* __HFI1_TRACE_RC_H */
#undef TRACE_INCLUDE_PATH
diff --git a/drivers/infiniband/hw/hfi1/trace_rx.h b/drivers/infiniband/hw/hfi1/trace_rx.h
index 7eceb57e0415..3cec960e9674 100644
--- a/drivers/infiniband/hw/hfi1/trace_rx.h
+++ b/drivers/infiniband/hw/hfi1/trace_rx.h
@@ -1,5 +1,5 @@
/*
- * Copyright(c) 2015 - 2017 Intel Corporation.
+ * Copyright(c) 2015 - 2018 Intel Corporation.
*
* This file is provided under a dual BSD/GPLv2 license. When using or
* redistributing this file, you may do so under either license.
@@ -128,111 +128,6 @@ TRACE_EVENT(hfi1_receive_interrupt,
)
);
-DECLARE_EVENT_CLASS(
- hfi1_exp_tid_reg_unreg,
- TP_PROTO(unsigned int ctxt, u16 subctxt, u32 rarr,
- u32 npages, unsigned long va, unsigned long pa,
- dma_addr_t dma),
- TP_ARGS(ctxt, subctxt, rarr, npages, va, pa, dma),
- TP_STRUCT__entry(
- __field(unsigned int, ctxt)
- __field(u16, subctxt)
- __field(u32, rarr)
- __field(u32, npages)
- __field(unsigned long, va)
- __field(unsigned long, pa)
- __field(dma_addr_t, dma)
- ),
- TP_fast_assign(
- __entry->ctxt = ctxt;
- __entry->subctxt = subctxt;
- __entry->rarr = rarr;
- __entry->npages = npages;
- __entry->va = va;
- __entry->pa = pa;
- __entry->dma = dma;
- ),
- TP_printk("[%u:%u] entry:%u, %u pages @ 0x%lx, va:0x%lx dma:0x%llx",
- __entry->ctxt,
- __entry->subctxt,
- __entry->rarr,
- __entry->npages,
- __entry->pa,
- __entry->va,
- __entry->dma
- )
- );
-
-DEFINE_EVENT(
- hfi1_exp_tid_reg_unreg, hfi1_exp_tid_unreg,
- TP_PROTO(unsigned int ctxt, u16 subctxt, u32 rarr, u32 npages,
- unsigned long va, unsigned long pa, dma_addr_t dma),
- TP_ARGS(ctxt, subctxt, rarr, npages, va, pa, dma));
-
-DEFINE_EVENT(
- hfi1_exp_tid_reg_unreg, hfi1_exp_tid_reg,
- TP_PROTO(unsigned int ctxt, u16 subctxt, u32 rarr, u32 npages,
- unsigned long va, unsigned long pa, dma_addr_t dma),
- TP_ARGS(ctxt, subctxt, rarr, npages, va, pa, dma));
-
-TRACE_EVENT(
- hfi1_put_tid,
- TP_PROTO(struct hfi1_devdata *dd,
- u32 index, u32 type, unsigned long pa, u16 order),
- TP_ARGS(dd, index, type, pa, order),
- TP_STRUCT__entry(
- DD_DEV_ENTRY(dd)
- __field(unsigned long, pa);
- __field(u32, index);
- __field(u32, type);
- __field(u16, order);
- ),
- TP_fast_assign(
- DD_DEV_ASSIGN(dd);
- __entry->pa = pa;
- __entry->index = index;
- __entry->type = type;
- __entry->order = order;
- ),
- TP_printk("[%s] type %s pa %lx index %u order %u",
- __get_str(dev),
- show_tidtype(__entry->type),
- __entry->pa,
- __entry->index,
- __entry->order
- )
-);
-
-TRACE_EVENT(hfi1_exp_tid_inval,
- TP_PROTO(unsigned int ctxt, u16 subctxt, unsigned long va, u32 rarr,
- u32 npages, dma_addr_t dma),
- TP_ARGS(ctxt, subctxt, va, rarr, npages, dma),
- TP_STRUCT__entry(
- __field(unsigned int, ctxt)
- __field(u16, subctxt)
- __field(unsigned long, va)
- __field(u32, rarr)
- __field(u32, npages)
- __field(dma_addr_t, dma)
- ),
- TP_fast_assign(
- __entry->ctxt = ctxt;
- __entry->subctxt = subctxt;
- __entry->va = va;
- __entry->rarr = rarr;
- __entry->npages = npages;
- __entry->dma = dma;
- ),
- TP_printk("[%u:%u] entry:%u, %u pages @ 0x%lx dma: 0x%llx",
- __entry->ctxt,
- __entry->subctxt,
- __entry->rarr,
- __entry->npages,
- __entry->va,
- __entry->dma
- )
- );
-
TRACE_EVENT(hfi1_mmu_invalidate,
TP_PROTO(unsigned int ctxt, u16 subctxt, const char *type,
unsigned long start, unsigned long end),
diff --git a/drivers/infiniband/hw/hfi1/trace_tid.h b/drivers/infiniband/hw/hfi1/trace_tid.h
new file mode 100644
index 000000000000..548dfc45a407
--- /dev/null
+++ b/drivers/infiniband/hw/hfi1/trace_tid.h
@@ -0,0 +1,1610 @@
+/* SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) */
+/*
+ * Copyright(c) 2018 Intel Corporation.
+ *
+ */
+#if !defined(__HFI1_TRACE_TID_H) || defined(TRACE_HEADER_MULTI_READ)
+#define __HFI1_TRACE_TID_H
+
+#include <linux/tracepoint.h>
+#include <linux/trace_seq.h>
+
+#include "hfi.h"
+
+#define tidtype_name(type) { PT_##type, #type }
+/*
+ * Render a TID type value by name through __print_symbolic(), using the
+ * { PT_<name>, "<name>" } pairs built by tidtype_name(). The final line
+ * deliberately has no line continuation, so the macro ends here and cannot
+ * absorb whatever line follows it.
+ */
+#define show_tidtype(type) \
+__print_symbolic(type, \
+ tidtype_name(EXPECTED), \
+ tidtype_name(EAGER), \
+ tidtype_name(INVALID))
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM hfi1_tid
+
+u8 hfi1_trace_get_tid_ctrl(u32 ent);
+u16 hfi1_trace_get_tid_len(u32 ent);
+u16 hfi1_trace_get_tid_idx(u32 ent);
+
+#define OPFN_PARAM_PRN "[%s] qpn 0x%x %s OPFN: qp 0x%x, max read %u, " \
+ "max write %u, max length %u, jkey 0x%x timeout %u " \
+ "urg %u"
+
+#define TID_FLOW_PRN "[%s] qpn 0x%x flow %d: idx %d resp_ib_psn 0x%x " \
+ "generation 0x%x fpsn 0x%x-%x r_next_psn 0x%x " \
+ "ib_psn 0x%x-%x npagesets %u tnode_cnt %u " \
+ "tidcnt %u tid_idx %u tid_offset %u length %u sent %u"
+
+#define TID_NODE_PRN "[%s] qpn 0x%x %s idx %u grp base 0x%x map 0x%x " \
+ "used %u cnt %u"
+
+#define RSP_INFO_PRN "[%s] qpn 0x%x state 0x%x s_state 0x%x psn 0x%x " \
+ "r_psn 0x%x r_state 0x%x r_flags 0x%x " \
+ "r_head_ack_queue %u s_tail_ack_queue %u " \
+ "s_acked_ack_queue %u s_ack_state 0x%x " \
+ "s_nak_state 0x%x s_flags 0x%x ps_flags 0x%x " \
+ "iow_flags 0x%lx"
+
+#define SENDER_INFO_PRN "[%s] qpn 0x%x state 0x%x s_cur %u s_tail %u " \
+ "s_head %u s_acked %u s_last %u s_psn 0x%x " \
+ "s_last_psn 0x%x s_flags 0x%x ps_flags 0x%x " \
+ "iow_flags 0x%lx s_state 0x%x s_num_rd %u s_retry %u"
+
+#define TID_READ_SENDER_PRN "[%s] qpn 0x%x newreq %u tid_r_reqs %u " \
+ "tid_r_comp %u pending_tid_r_segs %u " \
+ "s_flags 0x%x ps_flags 0x%x iow_flags 0x%lx " \
+ "s_state 0x%x hw_flow_index %u generation 0x%x " \
+ "fpsn 0x%x flow_flags 0x%x"
+
+#define TID_REQ_PRN "[%s] qpn 0x%x newreq %u opcode 0x%x psn 0x%x lpsn 0x%x " \
+ "cur_seg %u comp_seg %u ack_seg %u alloc_seg %u " \
+ "total_segs %u setup_head %u clear_tail %u flow_idx %u " \
+ "acked_tail %u state %u r_ack_psn 0x%x r_flow_psn 0x%x " \
+ "r_last_ackd 0x%x s_next_psn 0x%x"
+
+#define RCV_ERR_PRN "[%s] qpn 0x%x s_flags 0x%x state 0x%x " \
+ "s_acked_ack_queue %u s_tail_ack_queue %u " \
+ "r_head_ack_queue %u opcode 0x%x psn 0x%x r_psn 0x%x " \
+ " diff %d"
+
+/*
+ * Format string for the TID write responder trace line. Every fragment but
+ * the last ends in a space so adjacent "name value" pairs stay separated
+ * once the string literals are concatenated.
+ */
+#define TID_WRITE_RSPDR_PRN "[%s] qpn 0x%x r_tid_head %u r_tid_tail %u " \
+ "r_tid_ack %u r_tid_alloc %u alloc_w_segs %u " \
+ "pending_tid_w_segs %u sync_pt %s " \
+ "ps_nak_psn 0x%x ps_nak_state 0x%x " \
+ "prnr_nak_state 0x%x hw_flow_index %u generation "\
+ "0x%x fpsn 0x%x flow_flags 0x%x resync %s " \
+ "r_next_psn_kdeth 0x%x"
+
+#define TID_WRITE_SENDER_PRN "[%s] qpn 0x%x newreq %u s_tid_cur %u " \
+ "s_tid_tail %u s_tid_head %u " \
+ "pending_tid_w_resp %u n_requests %u " \
+ "n_tid_requests %u s_flags 0x%x ps_flags 0x%x "\
+ "iow_flags 0x%lx s_state 0x%x s_retry %u"
+
+#define KDETH_EFLAGS_ERR_PRN "[%s] qpn 0x%x TID ERR: RcvType 0x%x " \
+ "RcvTypeError 0x%x PSN 0x%x"
+
+DECLARE_EVENT_CLASS(/* shared by hfi1_exp_tid_reg and hfi1_exp_tid_unreg */
+ hfi1_exp_tid_reg_unreg,
+ TP_PROTO(unsigned int ctxt, u16 subctxt, u32 rarr, u32 npages,
+ unsigned long va, unsigned long pa, dma_addr_t dma),
+ TP_ARGS(ctxt, subctxt, rarr, npages, va, pa, dma),
+ TP_STRUCT__entry(/* one field per argument, in argument order */
+ __field(unsigned int, ctxt)
+ __field(u16, subctxt)
+ __field(u32, rarr)
+ __field(u32, npages)
+ __field(unsigned long, va)
+ __field(unsigned long, pa)
+ __field(dma_addr_t, dma)
+ ),
+ TP_fast_assign(/* straight copy of every argument */
+ __entry->dma = dma;
+ __entry->pa = pa;
+ __entry->va = va;
+ __entry->npages = npages;
+ __entry->rarr = rarr;
+ __entry->subctxt = subctxt;
+ __entry->ctxt = ctxt;
+ ),
+ TP_printk(/* pa fills the "@" slot, va the "va:" slot */
+ "[%u:%u] entry:%u, %u pages @ 0x%lx, va:0x%lx dma:0x%llx",
+ __entry->ctxt, __entry->subctxt, __entry->rarr,
+ __entry->npages, __entry->pa, __entry->va, __entry->dma
+ )
+);
+
+DEFINE_EVENT(/* exp_tid_unreg: instance of hfi1_exp_tid_reg_unreg */
+ hfi1_exp_tid_reg_unreg,
+ hfi1_exp_tid_unreg,
+ TP_PROTO(unsigned int ctxt, u16 subctxt, u32 rarr, u32 npages,
+ unsigned long va, unsigned long pa, dma_addr_t dma),
+ TP_ARGS(ctxt, subctxt, rarr, npages, va, pa, dma)
+);
+
+DEFINE_EVENT(/* exp_tid_reg: instance of hfi1_exp_tid_reg_unreg */
+ hfi1_exp_tid_reg_unreg,
+ hfi1_exp_tid_reg,
+ TP_PROTO(unsigned int ctxt, u16 subctxt, u32 rarr, u32 npages,
+ unsigned long va, unsigned long pa, dma_addr_t dma),
+ TP_ARGS(ctxt, subctxt, rarr, npages, va, pa, dma)
+);
+
+TRACE_EVENT(/* put_tid: dev string plus the index/type/pa/order arguments */
+ hfi1_put_tid,
+ TP_PROTO(struct hfi1_devdata *dd,
+ u32 index, u32 type, unsigned long pa, u16 order),
+ TP_ARGS(dd, index, type, pa, order),
+ TP_STRUCT__entry(/* field macros take no trailing ';', as in the other events */
+ DD_DEV_ENTRY(dd)
+ __field(unsigned long, pa)
+ __field(u32, index)
+ __field(u32, type)
+ __field(u16, order)
+ ),
+ TP_fast_assign(/* straight copy of the arguments */
+ DD_DEV_ASSIGN(dd);
+ __entry->pa = pa;
+ __entry->index = index;
+ __entry->type = type;
+ __entry->order = order;
+ ),
+ TP_printk(/* type is rendered by name through show_tidtype() */
+ "[%s] type %s pa %lx index %u order %u",
+ __get_str(dev),
+ show_tidtype(__entry->type),
+ __entry->pa,
+ __entry->index,
+ __entry->order
+ )
+);
+
+TRACE_EVENT(/* exp_tid_inval: ctxt/subctxt, va, rarr, npages and dma */
+ hfi1_exp_tid_inval,
+ TP_PROTO(unsigned int ctxt, u16 subctxt, unsigned long va,
+ u32 rarr, u32 npages, dma_addr_t dma),
+ TP_ARGS(ctxt, subctxt, va, rarr, npages, dma),
+ TP_STRUCT__entry(/* one field per argument, in argument order */
+ __field(unsigned int, ctxt)
+ __field(u16, subctxt)
+ __field(unsigned long, va)
+ __field(u32, rarr)
+ __field(u32, npages)
+ __field(dma_addr_t, dma)
+ ),
+ TP_fast_assign(/* straight copy of every argument */
+ __entry->dma = dma;
+ __entry->npages = npages;
+ __entry->rarr = rarr;
+ __entry->va = va;
+ __entry->subctxt = subctxt;
+ __entry->ctxt = ctxt;
+ ),
+ TP_printk(/* va fills the "@" slot */
+ "[%u:%u] entry:%u, %u pages @ 0x%lx dma: 0x%llx",
+ __entry->ctxt, __entry->subctxt, __entry->rarr,
+ __entry->npages, __entry->va, __entry->dma
+ )
+);
+
+DECLARE_EVENT_CLASS(/* opfn_state: qpn plus priv->opfn requested/completed/curr */
+ hfi1_opfn_state_template,
+ TP_PROTO(struct rvt_qp *qp), TP_ARGS(qp),
+ TP_STRUCT__entry(/* record layout */
+ DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device))
+ __field(u32, qpn)
+ __field(u16, requested)
+ __field(u16, completed)
+ __field(u8, curr)
+ ),
+ TP_fast_assign(/* the opfn values are read through qp->priv */
+ struct hfi1_qp_priv *qpriv = qp->priv;
+
+ DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device));
+ __entry->curr = qpriv->opfn.curr;
+ __entry->completed = qpriv->opfn.completed;
+ __entry->requested = qpriv->opfn.requested;
+ __entry->qpn = qp->ibqp.qp_num;
+ ),
+ TP_printk(/* everything after the dev string is printed in hex */
+ "[%s] qpn 0x%x requested 0x%x completed 0x%x curr 0x%x",
+ __get_str(dev), __entry->qpn, __entry->requested,
+ __entry->completed, __entry->curr
+ )
+);
+
+DEFINE_EVENT(/* opfn_state: conn_request */
+ hfi1_opfn_state_template,
+ hfi1_opfn_state_conn_request,
+ TP_PROTO(struct rvt_qp *qp), TP_ARGS(qp)
+);
+
+DEFINE_EVENT(/* opfn_state: sched_conn_request */
+ hfi1_opfn_state_template,
+ hfi1_opfn_state_sched_conn_request,
+ TP_PROTO(struct rvt_qp *qp), TP_ARGS(qp)
+);
+
+DEFINE_EVENT(/* opfn_state: conn_response */
+ hfi1_opfn_state_template,
+ hfi1_opfn_state_conn_response,
+ TP_PROTO(struct rvt_qp *qp), TP_ARGS(qp)
+);
+
+DEFINE_EVENT(/* opfn_state: conn_reply */
+ hfi1_opfn_state_template,
+ hfi1_opfn_state_conn_reply,
+ TP_PROTO(struct rvt_qp *qp), TP_ARGS(qp)
+);
+
+DEFINE_EVENT(/* opfn_state: conn_error */
+ hfi1_opfn_state_template,
+ hfi1_opfn_state_conn_error,
+ TP_PROTO(struct rvt_qp *qp), TP_ARGS(qp)
+);
+
+DECLARE_EVENT_CLASS(/* opfn_data: qpn, qp->state and the capcode/data arguments */
+ hfi1_opfn_data_template,
+ TP_PROTO(struct rvt_qp *qp, u8 capcode, u64 data),
+ TP_ARGS(qp, capcode, data),
+ TP_STRUCT__entry(/* record layout */
+ DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device))
+ __field(u32, qpn)
+ __field(u32, state)
+ __field(u8, capcode)
+ __field(u64, data)
+ ),
+ TP_fast_assign(/* caller arguments first, then the QP fields */
+ DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device));
+ __entry->data = data;
+ __entry->capcode = capcode;
+ __entry->state = qp->state;
+ __entry->qpn = qp->ibqp.qp_num;
+ ),
+ TP_printk(/* capcode is the only decimal value */
+ "[%s] qpn 0x%x (state 0x%x) Capcode %u data 0x%llx",
+ __get_str(dev), __entry->qpn, __entry->state,
+ __entry->capcode, __entry->data
+ )
+);
+
+DEFINE_EVENT(/* opfn_data: conn_request */
+ hfi1_opfn_data_template,
+ hfi1_opfn_data_conn_request,
+ TP_PROTO(struct rvt_qp *qp, u8 capcode, u64 data),
+ TP_ARGS(qp, capcode, data)
+);
+
+DEFINE_EVENT(/* opfn_data: conn_response */
+ hfi1_opfn_data_template,
+ hfi1_opfn_data_conn_response,
+ TP_PROTO(struct rvt_qp *qp, u8 capcode, u64 data),
+ TP_ARGS(qp, capcode, data)
+);
+
+DEFINE_EVENT(/* opfn_data: conn_reply */
+ hfi1_opfn_data_template,
+ hfi1_opfn_data_conn_reply,
+ TP_PROTO(struct rvt_qp *qp, u8 capcode, u64 data),
+ TP_ARGS(qp, capcode, data)
+);
+
+DECLARE_EVENT_CLASS(/* opfn_param: qpn, the remote flag and a copy of *param */
+ hfi1_opfn_param_template,
+ TP_PROTO(struct rvt_qp *qp, char remote, struct tid_rdma_params *param),
+ TP_ARGS(qp, remote, param),
+ TP_STRUCT__entry(/* record layout */
+ DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device))
+ __field(u32, qpn)
+ __field(char, remote)
+ __field(u32, param_qp)
+ __field(u32, max_len)
+ __field(u16, jkey)
+ __field(u8, max_read)
+ __field(u8, max_write)
+ __field(u8, timeout)
+ __field(u8, urg)
+ ),
+ TP_fast_assign(/* QP and flag first, then each member read from param */
+ DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device));
+ __entry->remote = remote;
+ __entry->qpn = qp->ibqp.qp_num;
+ __entry->urg = param->urg;
+ __entry->timeout = param->timeout;
+ __entry->max_write = param->max_write;
+ __entry->max_read = param->max_read;
+ __entry->jkey = param->jkey;
+ __entry->max_len = param->max_len;
+ __entry->param_qp = param->qp;
+ ),
+ TP_printk(/* a non-zero remote flag prints "remote", zero prints "local" */
+ OPFN_PARAM_PRN,
+ __get_str(dev), __entry->qpn,
+ __entry->remote ? "remote" : "local",
+ __entry->param_qp, __entry->max_read, __entry->max_write,
+ __entry->max_len, __entry->jkey, __entry->timeout,
+ __entry->urg
+ )
+);
+
+DEFINE_EVENT(/* opfn_param: instance of hfi1_opfn_param_template */
+ hfi1_opfn_param_template,
+ hfi1_opfn_param,
+ TP_PROTO(struct rvt_qp *qp, char remote, struct tid_rdma_params *param),
+ TP_ARGS(qp, remote, param)
+);
+
+DECLARE_EVENT_CLASS(/* msg: qpn, a message string and one u64 value */
+ hfi1_msg_template,
+ TP_PROTO(struct rvt_qp *qp, const char *msg, u64 more),
+ TP_ARGS(qp, msg, more),
+ TP_STRUCT__entry(/* record layout */
+ __field(u32, qpn)
+ __string(msg, msg)
+ __field(u64, more)
+ ),
+ TP_fast_assign(/* a NULL qp is recorded as qpn 0 */
+ __entry->more = more;
+ __assign_str(msg, msg);
+ __entry->qpn = qp ? qp->ibqp.qp_num : 0;
+ ),
+ TP_printk(/* qpn and the u64 value are printed in hex */
+ "qpn 0x%x %s 0x%llx",
+ __entry->qpn, __get_str(msg), __entry->more
+ )
+);
+
+DEFINE_EVENT(/* msg: opfn_conn_request */
+ hfi1_msg_template,
+ hfi1_msg_opfn_conn_request,
+ TP_PROTO(struct rvt_qp *qp, const char *msg, u64 more),
+ TP_ARGS(qp, msg, more)
+);
+
+DEFINE_EVENT(/* msg: opfn_conn_error */
+ hfi1_msg_template,
+ hfi1_msg_opfn_conn_error,
+ TP_PROTO(struct rvt_qp *qp, const char *msg, u64 more),
+ TP_ARGS(qp, msg, more)
+);
+
+DEFINE_EVENT(/* msg: alloc_tids */
+ hfi1_msg_template,
+ hfi1_msg_alloc_tids,
+ TP_PROTO(struct rvt_qp *qp, const char *msg, u64 more),
+ TP_ARGS(qp, msg, more)
+);
+
+DEFINE_EVENT(/* msg: tid_restart_req */
+ hfi1_msg_template,
+ hfi1_msg_tid_restart_req,
+ TP_PROTO(struct rvt_qp *qp, const char *msg, u64 more),
+ TP_ARGS(qp, msg, more)
+);
+
+DEFINE_EVENT(/* msg: handle_kdeth_eflags */
+ hfi1_msg_template,
+ hfi1_msg_handle_kdeth_eflags,
+ TP_PROTO(struct rvt_qp *qp, const char *msg, u64 more),
+ TP_ARGS(qp, msg, more)
+);
+
+DEFINE_EVENT(/* msg: tid_timeout */
+ hfi1_msg_template,
+ hfi1_msg_tid_timeout,
+ TP_PROTO(struct rvt_qp *qp, const char *msg, u64 more),
+ TP_ARGS(qp, msg, more)
+);
+
+DEFINE_EVENT(/* msg: tid_retry_timeout */
+ hfi1_msg_template,
+ hfi1_msg_tid_retry_timeout,
+ TP_PROTO(struct rvt_qp *qp, const char *msg, u64 more),
+ TP_ARGS(qp, msg, more)
+);
+
+DECLARE_EVENT_CLASS(/* tid_flow_page: qpn, index, vaddr and its struct page pointer */
+ hfi1_tid_flow_page_template,
+ TP_PROTO(struct rvt_qp *qp, struct tid_rdma_flow *flow, u32 index,
+ char mtu8k, char v1, void *vaddr),
+ TP_ARGS(qp, flow, index, mtu8k, v1, vaddr),
+ TP_STRUCT__entry(/* record layout; flow itself is not recorded */
+ DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device))
+ __field(u32, qpn)
+ __field(char, mtu8k)
+ __field(char, v1)
+ __field(u32, index)
+ __field(u64, page)
+ __field(u64, vaddr)
+ ),
+ TP_fast_assign(/* page is 0 for a NULL vaddr, else virt_to_page(vaddr) */
+ DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device));
+ __entry->vaddr = (u64)vaddr;
+ __entry->page = vaddr ? (u64)virt_to_page(vaddr) : 0ULL;
+ __entry->index = index;
+ __entry->v1 = v1;
+ __entry->mtu8k = mtu8k;
+ __entry->qpn = qp->ibqp.qp_num;
+ ),
+ TP_printk(/* label is "v1"/"v0" when mtu8k is set, "vaddr" otherwise */
+ "[%s] qpn 0x%x page[%u]: page 0x%llx %s 0x%llx",
+ __get_str(dev), __entry->qpn, __entry->index, __entry->page,
+ __entry->mtu8k ? (__entry->v1 ? "v1" : "v0") : "vaddr",
+ __entry->vaddr
+ )
+);
+
+DEFINE_EVENT(/* tid_flow_page: instance of hfi1_tid_flow_page_template */
+ hfi1_tid_flow_page_template,
+ hfi1_tid_flow_page,
+ TP_PROTO(struct rvt_qp *qp, struct tid_rdma_flow *flow, u32 index,
+ char mtu8k, char v1, void *vaddr),
+ TP_ARGS(qp, flow, index, mtu8k, v1, vaddr)
+);
+
+DECLARE_EVENT_CLASS(/* tid_pageset: qpn plus the index/idx/count arguments */
+ hfi1_tid_pageset_template,
+ TP_PROTO(struct rvt_qp *qp, u32 index, u16 idx, u16 count),
+ TP_ARGS(qp, index, idx, count),
+ TP_STRUCT__entry(/* record layout */
+ DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device))
+ __field(u32, qpn)
+ __field(u32, index)
+ __field(u16, idx)
+ __field(u16, count)
+ ),
+ TP_fast_assign(/* caller arguments first, then the QP number */
+ DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device));
+ __entry->count = count;
+ __entry->idx = idx;
+ __entry->index = index;
+ __entry->qpn = qp->ibqp.qp_num;
+ ),
+ TP_printk(/* index selects the list[] slot shown in the output */
+ "[%s] qpn 0x%x list[%u]: idx %u count %u",
+ __get_str(dev), __entry->qpn, __entry->index,
+ __entry->idx, __entry->count
+ )
+);
+
+DEFINE_EVENT(/* tid_pageset: instance of hfi1_tid_pageset_template */
+ hfi1_tid_pageset_template,
+ hfi1_tid_pageset,
+ TP_PROTO(struct rvt_qp *qp, u32 index, u16 idx, u16 count),
+ TP_ARGS(qp, index, idx, count)
+);
+
+DECLARE_EVENT_CLASS(/* tid_flow: qpn, index and a copy of the flow's fields */
+ hfi1_tid_flow_template,
+ TP_PROTO(struct rvt_qp *qp, int index, struct tid_rdma_flow *flow),
+ TP_ARGS(qp, index, flow),
+ TP_STRUCT__entry(/* record layout */
+ DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device))
+ __field(u32, qpn)
+ __field(int, index)
+ __field(int, idx)
+ __field(u32, resp_ib_psn)
+ __field(u32, generation)
+ __field(u32, fspsn)
+ __field(u32, flpsn)
+ __field(u32, r_next_psn)
+ __field(u32, ib_spsn)
+ __field(u32, ib_lpsn)
+ __field(u32, npagesets)
+ __field(u32, tnode_cnt)
+ __field(u32, tidcnt)
+ __field(u32, tid_idx)
+ __field(u32, tid_offset)
+ __field(u32, length)
+ __field(u32, sent)
+ ),
+ TP_fast_assign(/* QP and index, flow members, then flow->flow_state */
+ DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device));
+ __entry->qpn = qp->ibqp.qp_num;
+ __entry->index = index;
+ __entry->idx = flow->idx;
+ __entry->npagesets = flow->npagesets;
+ __entry->tnode_cnt = flow->tnode_cnt;
+ __entry->tidcnt = flow->tidcnt;
+ __entry->tid_idx = flow->tid_idx;
+ __entry->tid_offset = flow->tid_offset;
+ __entry->length = flow->length;
+ __entry->sent = flow->sent;
+ __entry->resp_ib_psn = flow->flow_state.resp_ib_psn;
+ __entry->generation = flow->flow_state.generation;
+ __entry->r_next_psn = flow->flow_state.r_next_psn;
+ __entry->ib_spsn = flow->flow_state.ib_spsn;
+ __entry->ib_lpsn = flow->flow_state.ib_lpsn;
+ /* spsn and lpsn are passed through full_flow_psn() before storing */
+ __entry->fspsn = full_flow_psn(flow, flow->flow_state.spsn);
+ __entry->flpsn = full_flow_psn(flow, flow->flow_state.lpsn);
+ ),
+ TP_printk(/* argument order follows TID_FLOW_PRN */
+ TID_FLOW_PRN,
+ __get_str(dev), __entry->qpn, __entry->index, __entry->idx,
+ __entry->resp_ib_psn, __entry->generation,
+ __entry->fspsn, __entry->flpsn, __entry->r_next_psn,
+ __entry->ib_spsn, __entry->ib_lpsn,
+ __entry->npagesets, __entry->tnode_cnt, __entry->tidcnt,
+ __entry->tid_idx, __entry->tid_offset,
+ __entry->length, __entry->sent
+ )
+);
+
+DEFINE_EVENT(/* tid_flow: alloc */
+ hfi1_tid_flow_template,
+ hfi1_tid_flow_alloc,
+ TP_PROTO(struct rvt_qp *qp, int index, struct tid_rdma_flow *flow),
+ TP_ARGS(qp, index, flow)
+);
+
+DEFINE_EVENT(/* tid_flow: build_read_pkt */
+ hfi1_tid_flow_template,
+ hfi1_tid_flow_build_read_pkt,
+ TP_PROTO(struct rvt_qp *qp, int index, struct tid_rdma_flow *flow),
+ TP_ARGS(qp, index, flow)
+);
+
+DEFINE_EVENT(/* tid_flow: build_read_resp */
+ hfi1_tid_flow_template,
+ hfi1_tid_flow_build_read_resp,
+ TP_PROTO(struct rvt_qp *qp, int index, struct tid_rdma_flow *flow),
+ TP_ARGS(qp, index, flow)
+);
+
+DEFINE_EVENT(/* tid_flow: rcv_read_req */
+ hfi1_tid_flow_template,
+ hfi1_tid_flow_rcv_read_req,
+ TP_PROTO(struct rvt_qp *qp, int index, struct tid_rdma_flow *flow),
+ TP_ARGS(qp, index, flow)
+);
+
+DEFINE_EVENT(/* tid_flow: rcv_read_resp */
+ hfi1_tid_flow_template,
+ hfi1_tid_flow_rcv_read_resp,
+ TP_PROTO(struct rvt_qp *qp, int index, struct tid_rdma_flow *flow),
+ TP_ARGS(qp, index, flow)
+);
+
+DEFINE_EVENT(/* tid_flow: restart_req */
+ hfi1_tid_flow_template,
+ hfi1_tid_flow_restart_req,
+ TP_PROTO(struct rvt_qp *qp, int index, struct tid_rdma_flow *flow),
+ TP_ARGS(qp, index, flow)
+);
+
+DEFINE_EVENT(/* tid_flow: build_write_resp */
+ hfi1_tid_flow_template,
+ hfi1_tid_flow_build_write_resp,
+ TP_PROTO(struct rvt_qp *qp, int index, struct tid_rdma_flow *flow),
+ TP_ARGS(qp, index, flow)
+);
+
+DEFINE_EVENT(/* tid_flow: rcv_write_resp */
+ hfi1_tid_flow_template,
+ hfi1_tid_flow_rcv_write_resp,
+ TP_PROTO(struct rvt_qp *qp, int index, struct tid_rdma_flow *flow),
+ TP_ARGS(qp, index, flow)
+);
+
+DEFINE_EVENT(/* tid_flow: build_write_data */
+ hfi1_tid_flow_template,
+ hfi1_tid_flow_build_write_data,
+ TP_PROTO(struct rvt_qp *qp, int index, struct tid_rdma_flow *flow),
+ TP_ARGS(qp, index, flow)
+);
+
+DEFINE_EVENT(/* tid_flow: rcv_tid_ack */
+ hfi1_tid_flow_template,
+ hfi1_tid_flow_rcv_tid_ack,
+ TP_PROTO(struct rvt_qp *qp, int index, struct tid_rdma_flow *flow),
+ TP_ARGS(qp, index, flow)
+);
+
+DEFINE_EVENT(/* tid_flow: rcv_resync */
+ hfi1_tid_flow_template,
+ hfi1_tid_flow_rcv_resync,
+ TP_PROTO(struct rvt_qp *qp, int index, struct tid_rdma_flow *flow),
+ TP_ARGS(qp, index, flow)
+);
+
+DEFINE_EVENT(/* tid_flow: handle_kdeth_eflags */
+ hfi1_tid_flow_template,
+ hfi1_tid_flow_handle_kdeth_eflags,
+ TP_PROTO(struct rvt_qp *qp, int index, struct tid_rdma_flow *flow),
+ TP_ARGS(qp, index, flow)
+);
+
+DECLARE_EVENT_CLASS(/* tid_node: qpn, a message string and index/base/map/used/cnt */
+ hfi1_tid_node_template,
+ TP_PROTO(struct rvt_qp *qp, const char *msg, u32 index, u32 base,
+ u8 map, u8 used, u8 cnt),
+ TP_ARGS(qp, msg, index, base, map, used, cnt),
+ TP_STRUCT__entry(/* record layout */
+ DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device))
+ __field(u32, qpn)
+ __string(msg, msg)
+ __field(u32, index)
+ __field(u32, base)
+ __field(u8, map)
+ __field(u8, used)
+ __field(u8, cnt)
+ ),
+ TP_fast_assign(/* strings first, then the scalar arguments */
+ DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device));
+ __assign_str(msg, msg);
+ __entry->cnt = cnt;
+ __entry->used = used;
+ __entry->map = map;
+ __entry->base = base;
+ __entry->index = index;
+ __entry->qpn = qp->ibqp.qp_num;
+ ),
+ TP_printk(/* argument order follows TID_NODE_PRN */
+ TID_NODE_PRN,
+ __get_str(dev), __entry->qpn, __get_str(msg),
+ __entry->index, __entry->base, __entry->map,
+ __entry->used, __entry->cnt
+ )
+);
+
+DEFINE_EVENT(/* tid_node: add */
+ hfi1_tid_node_template,
+ hfi1_tid_node_add,
+ TP_PROTO(struct rvt_qp *qp, const char *msg, u32 index, u32 base,
+ u8 map, u8 used, u8 cnt),
+ TP_ARGS(qp, msg, index, base, map, used, cnt)
+);
+
+DECLARE_EVENT_CLASS(/* tid_entry: qpn, index and ent split into ctrl/idx/len */
+ hfi1_tid_entry_template,
+ TP_PROTO(struct rvt_qp *qp, int index, u32 ent),
+ TP_ARGS(qp, index, ent),
+ TP_STRUCT__entry(/* record layout */
+ DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device))
+ __field(u32, qpn)
+ __field(int, index)
+ __field(u8, ctrl)
+ __field(u16, idx)
+ __field(u16, len)
+ ),
+ TP_fast_assign(/* ent is decoded by the hfi1_trace_get_tid_*() helpers */
+ DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device));
+ __entry->len = hfi1_trace_get_tid_len(ent);
+ __entry->idx = hfi1_trace_get_tid_idx(ent);
+ __entry->ctrl = hfi1_trace_get_tid_ctrl(ent);
+ __entry->index = index;
+ __entry->qpn = qp->ibqp.qp_num;
+ ),
+ TP_printk(/* printed as idx, len, ctrl; stored as ctrl, idx, len */
+ "[%s] qpn 0x%x TID entry %d: idx %u len %u ctrl 0x%x",
+ __get_str(dev), __entry->qpn, __entry->index,
+ __entry->idx, __entry->len, __entry->ctrl
+ )
+);
+
+DEFINE_EVENT(/* tid_entry: alloc */
+ hfi1_tid_entry_template,
+ hfi1_tid_entry_alloc,
+ TP_PROTO(struct rvt_qp *qp, int index, u32 ent),
+ TP_ARGS(qp, index, ent)
+);
+
+DEFINE_EVENT(/* tid_entry: build_read_resp */
+ hfi1_tid_entry_template,
+ hfi1_tid_entry_build_read_resp,
+ TP_PROTO(struct rvt_qp *qp, int index, u32 ent),
+ TP_ARGS(qp, index, ent)
+);
+
+DEFINE_EVENT(/* tid_entry: rcv_read_req */
+ hfi1_tid_entry_template,
+ hfi1_tid_entry_rcv_read_req,
+ TP_PROTO(struct rvt_qp *qp, int index, u32 ent),
+ TP_ARGS(qp, index, ent)
+);
+
+DEFINE_EVENT(/* tid_entry: rcv_write_resp */
+ hfi1_tid_entry_template,
+ hfi1_tid_entry_rcv_write_resp,
+ TP_PROTO(struct rvt_qp *qp, int index, u32 ent),
+ TP_ARGS(qp, index, ent)
+);
+
+DEFINE_EVENT(/* tid_entry: build_write_data */
+ hfi1_tid_entry_template,
+ hfi1_tid_entry_build_write_data,
+ TP_PROTO(struct rvt_qp *qp, int index, u32 ent),
+ TP_ARGS(qp, index, ent)
+);
+
+DECLARE_EVENT_CLASS(/* rsp_info: responder-side QP fields plus the psn argument */
+ hfi1_responder_info_template,
+ TP_PROTO(struct rvt_qp *qp, u32 psn),
+ TP_ARGS(qp, psn),
+ TP_STRUCT__entry(/* record layout; r_nak_state is stored but not printed */
+ DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device))
+ __field(u32, qpn)
+ __field(u8, state)
+ __field(u8, s_state)
+ __field(u32, psn)
+ __field(u32, r_psn)
+ __field(u8, r_state)
+ __field(u8, r_flags)
+ __field(u8, r_head_ack_queue)
+ __field(u8, s_tail_ack_queue)
+ __field(u8, s_acked_ack_queue)
+ __field(u8, s_ack_state)
+ __field(u8, s_nak_state)
+ __field(u8, r_nak_state)
+ __field(u32, s_flags)
+ __field(u32, ps_flags)
+ __field(unsigned long, iow_flags)
+ ),
+ TP_fast_assign(/* every declared field gets a value */
+ struct hfi1_qp_priv *priv = qp->priv;
+
+ DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device));
+ __entry->qpn = qp->ibqp.qp_num;
+ __entry->state = qp->state;
+ __entry->s_state = qp->s_state;
+ __entry->psn = psn;
+ __entry->r_psn = qp->r_psn;
+ __entry->r_state = qp->r_state;
+ __entry->r_flags = qp->r_flags;
+ __entry->r_head_ack_queue = qp->r_head_ack_queue;
+ __entry->s_tail_ack_queue = qp->s_tail_ack_queue;
+ __entry->s_acked_ack_queue = qp->s_acked_ack_queue;
+ __entry->s_ack_state = qp->s_ack_state;
+ __entry->s_nak_state = qp->s_nak_state;
+ /* NOTE(review): assumes struct rvt_qp has r_nak_state - confirm */
+ __entry->r_nak_state = qp->r_nak_state;
+ __entry->s_flags = qp->s_flags;
+ __entry->ps_flags = priv->s_flags;
+ __entry->iow_flags = priv->s_iowait.flags;
+ ),
+ TP_printk(/* argument order follows RSP_INFO_PRN */
+ RSP_INFO_PRN,
+ __get_str(dev),
+ __entry->qpn,
+ __entry->state,
+ __entry->s_state,
+ __entry->psn,
+ __entry->r_psn,
+ __entry->r_state,
+ __entry->r_flags,
+ __entry->r_head_ack_queue,
+ __entry->s_tail_ack_queue,
+ __entry->s_acked_ack_queue,
+ __entry->s_ack_state,
+ __entry->s_nak_state,
+ __entry->s_flags,
+ __entry->ps_flags,
+ __entry->iow_flags
+ )
+);
+
+DEFINE_EVENT(/* rsp: make_rc_ack */
+ hfi1_responder_info_template,
+ hfi1_rsp_make_rc_ack,
+ TP_PROTO(struct rvt_qp *qp, u32 psn), TP_ARGS(qp, psn)
+);
+
+DEFINE_EVENT(/* rsp: rcv_tid_read_req */
+ hfi1_responder_info_template,
+ hfi1_rsp_rcv_tid_read_req,
+ TP_PROTO(struct rvt_qp *qp, u32 psn), TP_ARGS(qp, psn)
+);
+
+DEFINE_EVENT(/* rsp: tid_rcv_error */
+ hfi1_responder_info_template,
+ hfi1_rsp_tid_rcv_error,
+ TP_PROTO(struct rvt_qp *qp, u32 psn), TP_ARGS(qp, psn)
+);
+
+DEFINE_EVENT(/* rsp: tid_write_alloc_res */
+ hfi1_responder_info_template,
+ hfi1_rsp_tid_write_alloc_res,
+ TP_PROTO(struct rvt_qp *qp, u32 psn), TP_ARGS(qp, psn)
+);
+
+DEFINE_EVENT(/* rsp: rcv_tid_write_req */
+ hfi1_responder_info_template,
+ hfi1_rsp_rcv_tid_write_req,
+ TP_PROTO(struct rvt_qp *qp, u32 psn), TP_ARGS(qp, psn)
+);
+
+DEFINE_EVENT(/* rsp: build_tid_write_resp */
+ hfi1_responder_info_template,
+ hfi1_rsp_build_tid_write_resp,
+ TP_PROTO(struct rvt_qp *qp, u32 psn), TP_ARGS(qp, psn)
+);
+
+DEFINE_EVENT(/* rsp: rcv_tid_write_data */
+ hfi1_responder_info_template,
+ hfi1_rsp_rcv_tid_write_data,
+ TP_PROTO(struct rvt_qp *qp, u32 psn), TP_ARGS(qp, psn)
+);
+
+DEFINE_EVENT(/* rsp: make_tid_ack */
+ hfi1_responder_info_template,
+ hfi1_rsp_make_tid_ack,
+ TP_PROTO(struct rvt_qp *qp, u32 psn), TP_ARGS(qp, psn)
+);
+
+DEFINE_EVENT(/* rsp: handle_kdeth_eflags */
+ hfi1_responder_info_template,
+ hfi1_rsp_handle_kdeth_eflags,
+ TP_PROTO(struct rvt_qp *qp, u32 psn), TP_ARGS(qp, psn)
+);
+
+DECLARE_EVENT_CLASS(/* sender_info: sender-side QP fields plus priv flags */
+ hfi1_sender_info_template,
+ TP_PROTO(struct rvt_qp *qp), TP_ARGS(qp),
+ TP_STRUCT__entry(/* record layout */
+ DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device))
+ __field(u32, qpn)
+ __field(u8, state)
+ __field(u32, s_cur)
+ __field(u32, s_tail)
+ __field(u32, s_head)
+ __field(u32, s_acked)
+ __field(u32, s_last)
+ __field(u32, s_psn)
+ __field(u32, s_last_psn)
+ __field(u32, s_flags)
+ __field(u32, ps_flags)
+ __field(unsigned long, iow_flags)
+ __field(u8, s_state)
+ __field(u8, s_num_rd)
+ __field(u8, s_retry)
+ ),
+ TP_fast_assign(/* ps_flags and iow_flags are read through qp->priv */
+ struct hfi1_qp_priv *priv = qp->priv;
+
+ DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device));
+ __entry->qpn = qp->ibqp.qp_num;
+ __entry->state = qp->state;
+ __entry->s_state = qp->s_state;
+ __entry->s_cur = qp->s_cur;
+ __entry->s_tail = qp->s_tail;
+ __entry->s_head = qp->s_head;
+ __entry->s_acked = qp->s_acked;
+ __entry->s_last = qp->s_last;
+ __entry->s_psn = qp->s_psn;
+ __entry->s_last_psn = qp->s_last_psn;
+ __entry->s_num_rd = qp->s_num_rd_atomic;
+ __entry->s_retry = qp->s_retry;
+ __entry->s_flags = qp->s_flags;
+ __entry->ps_flags = priv->s_flags;
+ __entry->iow_flags = priv->s_iowait.flags;
+ ),
+ TP_printk(/* argument order follows SENDER_INFO_PRN */
+ SENDER_INFO_PRN,
+ __get_str(dev), __entry->qpn, __entry->state,
+ __entry->s_cur, __entry->s_tail, __entry->s_head,
+ __entry->s_acked, __entry->s_last,
+ __entry->s_psn, __entry->s_last_psn,
+ __entry->s_flags, __entry->ps_flags, __entry->iow_flags,
+ __entry->s_state, __entry->s_num_rd, __entry->s_retry
+ )
+);
+
+DEFINE_EVENT(/* sender: make_rc_req */
+ hfi1_sender_info_template,
+ hfi1_sender_make_rc_req,
+ TP_PROTO(struct rvt_qp *qp), TP_ARGS(qp)
+);
+
+DEFINE_EVENT(/* sender: reset_psn */
+ hfi1_sender_info_template,
+ hfi1_sender_reset_psn,
+ TP_PROTO(struct rvt_qp *qp), TP_ARGS(qp)
+);
+
+DEFINE_EVENT(/* sender: restart_rc */
+ hfi1_sender_info_template,
+ hfi1_sender_restart_rc,
+ TP_PROTO(struct rvt_qp *qp), TP_ARGS(qp)
+);
+
+DEFINE_EVENT(/* sender: do_rc_ack */
+ hfi1_sender_info_template,
+ hfi1_sender_do_rc_ack,
+ TP_PROTO(struct rvt_qp *qp), TP_ARGS(qp)
+);
+
+DEFINE_EVENT(/* sender: rcv_tid_read_resp */
+ hfi1_sender_info_template,
+ hfi1_sender_rcv_tid_read_resp,
+ TP_PROTO(struct rvt_qp *qp), TP_ARGS(qp)
+);
+
+DEFINE_EVENT(/* sender: rcv_tid_ack */
+ hfi1_sender_info_template,
+ hfi1_sender_rcv_tid_ack,
+ TP_PROTO(struct rvt_qp *qp), TP_ARGS(qp)
+);
+
+DEFINE_EVENT(/* sender: make_tid_pkt */
+ hfi1_sender_info_template,
+ hfi1_sender_make_tid_pkt,
+ TP_PROTO(struct rvt_qp *qp), TP_ARGS(qp)
+);
+
+DECLARE_EVENT_CLASS(/* tid_read_sender: newreq plus priv TID read and flow_state fields */
+ hfi1_tid_read_sender_template,
+ TP_PROTO(struct rvt_qp *qp, char newreq), TP_ARGS(qp, newreq),
+ TP_STRUCT__entry(/* record layout */
+ DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device))
+ __field(u32, qpn)
+ __field(char, newreq)
+ __field(u32, tid_r_reqs)
+ __field(u32, tid_r_comp)
+ __field(u32, pending_tid_r_segs)
+ __field(u32, s_flags)
+ __field(u32, ps_flags)
+ __field(unsigned long, iow_flags)
+ __field(u8, s_state)
+ __field(u32, hw_flow_index)
+ __field(u32, generation)
+ __field(u32, fpsn)
+ __field(u32, flow_flags)
+ ),
+ TP_fast_assign(/* only qpn and s_flags come from qp; the rest from qp->priv */
+ struct hfi1_qp_priv *qpriv = qp->priv;
+
+ DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device));
+ __entry->qpn = qp->ibqp.qp_num;
+ __entry->s_flags = qp->s_flags;
+ __entry->newreq = newreq;
+ __entry->tid_r_reqs = qpriv->tid_r_reqs;
+ __entry->tid_r_comp = qpriv->tid_r_comp;
+ __entry->pending_tid_r_segs = qpriv->pending_tid_r_segs;
+ __entry->ps_flags = qpriv->s_flags;
+ __entry->iow_flags = qpriv->s_iowait.flags;
+ __entry->s_state = qpriv->s_state;
+ __entry->hw_flow_index = qpriv->flow_state.index;
+ __entry->generation = qpriv->flow_state.generation;
+ __entry->fpsn = qpriv->flow_state.psn;
+ __entry->flow_flags = qpriv->flow_state.flags;
+ ),
+ TP_printk(/* argument order follows TID_READ_SENDER_PRN */
+ TID_READ_SENDER_PRN,
+ __get_str(dev), __entry->qpn, __entry->newreq,
+ __entry->tid_r_reqs, __entry->tid_r_comp,
+ __entry->pending_tid_r_segs,
+ __entry->s_flags, __entry->ps_flags, __entry->iow_flags,
+ __entry->s_state, __entry->hw_flow_index,
+ __entry->generation, __entry->fpsn, __entry->flow_flags
+ )
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_read_sender_template, hfi1_tid_read_sender_make_req,
+ TP_PROTO(struct rvt_qp *qp, char newreq),
+ TP_ARGS(qp, newreq)
+);
+
+DECLARE_EVENT_CLASS(/* tid_rdma_request */
+ hfi1_tid_rdma_request_template,
+ TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn,
+ struct tid_rdma_request *req),
+ TP_ARGS(qp, newreq, opcode, psn, lpsn, req),
+ TP_STRUCT__entry(/* entry */
+ DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device))
+ __field(u32, qpn)
+ __field(char, newreq)
+ __field(u8, opcode)
+ __field(u32, psn)
+ __field(u32, lpsn)
+ __field(u32, cur_seg)
+ __field(u32, comp_seg)
+ __field(u32, ack_seg)
+ __field(u32, alloc_seg)
+ __field(u32, total_segs)
+ __field(u16, setup_head)
+ __field(u16, clear_tail)
+ __field(u16, flow_idx)
+ __field(u16, acked_tail)
+ __field(u32, state)
+ __field(u32, r_ack_psn)
+ __field(u32, r_flow_psn)
+ __field(u32, r_last_acked)
+ __field(u32, s_next_psn)
+ ),
+ TP_fast_assign(/* assign */
+ DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device));
+ __entry->qpn = qp->ibqp.qp_num;
+ __entry->newreq = newreq;
+ __entry->opcode = opcode;
+ __entry->psn = psn;
+ __entry->lpsn = lpsn;
+ __entry->cur_seg = req->cur_seg;
+ __entry->comp_seg = req->comp_seg;
+ __entry->ack_seg = req->ack_seg;
+ __entry->alloc_seg = req->alloc_seg;
+ __entry->total_segs = req->total_segs;
+ __entry->setup_head = req->setup_head;
+ __entry->clear_tail = req->clear_tail;
+ __entry->flow_idx = req->flow_idx;
+ __entry->acked_tail = req->acked_tail;
+ __entry->state = req->state;
+ __entry->r_ack_psn = req->r_ack_psn;
+ __entry->r_flow_psn = req->r_flow_psn;
+ __entry->r_last_acked = req->r_last_acked;
+ __entry->s_next_psn = req->s_next_psn;
+ ),
+ TP_printk(/* print */
+ TID_REQ_PRN,
+ __get_str(dev),
+ __entry->qpn,
+ __entry->newreq,
+ __entry->opcode,
+ __entry->psn,
+ __entry->lpsn,
+ __entry->cur_seg,
+ __entry->comp_seg,
+ __entry->ack_seg,
+ __entry->alloc_seg,
+ __entry->total_segs,
+ __entry->setup_head,
+ __entry->clear_tail,
+ __entry->flow_idx,
+ __entry->acked_tail,
+ __entry->state,
+ __entry->r_ack_psn,
+ __entry->r_flow_psn,
+ __entry->r_last_acked,
+ __entry->s_next_psn
+ )
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_rdma_request_template, hfi1_tid_req_make_req_read,
+ TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn,
+ struct tid_rdma_request *req),
+ TP_ARGS(qp, newreq, opcode, psn, lpsn, req)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_rdma_request_template, hfi1_tid_req_build_read_req,
+ TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn,
+ struct tid_rdma_request *req),
+ TP_ARGS(qp, newreq, opcode, psn, lpsn, req)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_rdma_request_template, hfi1_tid_req_rcv_read_req,
+ TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn,
+ struct tid_rdma_request *req),
+ TP_ARGS(qp, newreq, opcode, psn, lpsn, req)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_rdma_request_template, hfi1_tid_req_rcv_read_resp,
+ TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn,
+ struct tid_rdma_request *req),
+ TP_ARGS(qp, newreq, opcode, psn, lpsn, req)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_rdma_request_template, hfi1_tid_req_rcv_err,
+ TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn,
+ struct tid_rdma_request *req),
+ TP_ARGS(qp, newreq, opcode, psn, lpsn, req)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_rdma_request_template, hfi1_tid_req_restart_req,
+ TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn,
+ struct tid_rdma_request *req),
+ TP_ARGS(qp, newreq, opcode, psn, lpsn, req)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_rdma_request_template, hfi1_tid_req_setup_tid_wqe,
+ TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn,
+ struct tid_rdma_request *req),
+ TP_ARGS(qp, newreq, opcode, psn, lpsn, req)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_rdma_request_template, hfi1_tid_req_write_alloc_res,
+ TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn,
+ struct tid_rdma_request *req),
+ TP_ARGS(qp, newreq, opcode, psn, lpsn, req)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_rdma_request_template, hfi1_tid_req_rcv_write_req,
+ TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn,
+ struct tid_rdma_request *req),
+ TP_ARGS(qp, newreq, opcode, psn, lpsn, req)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_rdma_request_template, hfi1_tid_req_build_write_resp,
+ TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn,
+ struct tid_rdma_request *req),
+ TP_ARGS(qp, newreq, opcode, psn, lpsn, req)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_rdma_request_template, hfi1_tid_req_rcv_write_resp,
+ TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn,
+ struct tid_rdma_request *req),
+ TP_ARGS(qp, newreq, opcode, psn, lpsn, req)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_rdma_request_template, hfi1_tid_req_rcv_write_data,
+ TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn,
+ struct tid_rdma_request *req),
+ TP_ARGS(qp, newreq, opcode, psn, lpsn, req)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_rdma_request_template, hfi1_tid_req_rcv_tid_ack,
+ TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn,
+ struct tid_rdma_request *req),
+ TP_ARGS(qp, newreq, opcode, psn, lpsn, req)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_rdma_request_template, hfi1_tid_req_tid_retry_timeout,
+ TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn,
+ struct tid_rdma_request *req),
+ TP_ARGS(qp, newreq, opcode, psn, lpsn, req)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_rdma_request_template, hfi1_tid_req_rcv_resync,
+ TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn,
+ struct tid_rdma_request *req),
+ TP_ARGS(qp, newreq, opcode, psn, lpsn, req)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_rdma_request_template, hfi1_tid_req_make_tid_pkt,
+ TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn,
+ struct tid_rdma_request *req),
+ TP_ARGS(qp, newreq, opcode, psn, lpsn, req)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_rdma_request_template, hfi1_tid_req_make_tid_ack,
+ TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn,
+ struct tid_rdma_request *req),
+ TP_ARGS(qp, newreq, opcode, psn, lpsn, req)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_rdma_request_template, hfi1_tid_req_handle_kdeth_eflags,
+ TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn,
+ struct tid_rdma_request *req),
+ TP_ARGS(qp, newreq, opcode, psn, lpsn, req)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_rdma_request_template, hfi1_tid_req_make_rc_ack_write,
+ TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn,
+ struct tid_rdma_request *req),
+ TP_ARGS(qp, newreq, opcode, psn, lpsn, req)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_rdma_request_template, hfi1_tid_req_make_req_write,
+ TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn,
+ struct tid_rdma_request *req),
+ TP_ARGS(qp, newreq, opcode, psn, lpsn, req)
+);
+
+DECLARE_EVENT_CLASS(/* rc_rcv_err */
+ hfi1_rc_rcv_err_template,
+ TP_PROTO(struct rvt_qp *qp, u32 opcode, u32 psn, int diff),
+ TP_ARGS(qp, opcode, psn, diff),
+ TP_STRUCT__entry(/* entry */
+ DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device))
+ __field(u32, qpn)
+ __field(u32, s_flags)
+ __field(u8, state)
+ __field(u8, s_acked_ack_queue)
+ __field(u8, s_tail_ack_queue)
+ __field(u8, r_head_ack_queue)
+ __field(u32, opcode)
+ __field(u32, psn)
+ __field(u32, r_psn)
+ __field(int, diff)
+ ),
+ TP_fast_assign(/* assign */
+ DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device))
+ __entry->qpn = qp->ibqp.qp_num;
+ __entry->s_flags = qp->s_flags;
+ __entry->state = qp->state;
+ __entry->s_acked_ack_queue = qp->s_acked_ack_queue;
+ __entry->s_tail_ack_queue = qp->s_tail_ack_queue;
+ __entry->r_head_ack_queue = qp->r_head_ack_queue;
+ __entry->opcode = opcode;
+ __entry->psn = psn;
+ __entry->r_psn = qp->r_psn;
+ __entry->diff = diff;
+ ),
+ TP_printk(/* print */
+ RCV_ERR_PRN,
+ __get_str(dev),
+ __entry->qpn,
+ __entry->s_flags,
+ __entry->state,
+ __entry->s_acked_ack_queue,
+ __entry->s_tail_ack_queue,
+ __entry->r_head_ack_queue,
+ __entry->opcode,
+ __entry->psn,
+ __entry->r_psn,
+ __entry->diff
+ )
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_rc_rcv_err_template, hfi1_tid_rdma_rcv_err,
+ TP_PROTO(struct rvt_qp *qp, u32 opcode, u32 psn, int diff),
+ TP_ARGS(qp, opcode, psn, diff)
+);
+
+DECLARE_EVENT_CLASS(/* sge */
+ hfi1_sge_template,
+ TP_PROTO(struct rvt_qp *qp, int index, struct rvt_sge *sge),
+ TP_ARGS(qp, index, sge),
+ TP_STRUCT__entry(/* entry */
+ DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device))
+ __field(u32, qpn)
+ __field(int, index)
+ __field(u64, vaddr)
+ __field(u32, sge_length)
+ ),
+ TP_fast_assign(/* assign */
+ DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device));
+ __entry->qpn = qp->ibqp.qp_num;
+ __entry->index = index;
+ __entry->vaddr = (u64)sge->vaddr;
+ __entry->sge_length = sge->sge_length;
+ ),
+ TP_printk(/* print */
+ "[%s] qpn 0x%x sge %d: vaddr 0x%llx sge_length %u",
+ __get_str(dev),
+ __entry->qpn,
+ __entry->index,
+ __entry->vaddr,
+ __entry->sge_length
+ )
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_sge_template, hfi1_sge_check_align,
+ TP_PROTO(struct rvt_qp *qp, int index, struct rvt_sge *sge),
+ TP_ARGS(qp, index, sge)
+);
+
+DECLARE_EVENT_CLASS(/* tid_write_rsp */
+ hfi1_tid_write_rsp_template,
+ TP_PROTO(struct rvt_qp *qp),
+ TP_ARGS(qp),
+ TP_STRUCT__entry(/* entry */
+ DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device))
+ __field(u32, qpn)
+ __field(u32, r_tid_head)
+ __field(u32, r_tid_tail)
+ __field(u32, r_tid_ack)
+ __field(u32, r_tid_alloc)
+ __field(u32, alloc_w_segs)
+ __field(u32, pending_tid_w_segs)
+ __field(bool, sync_pt)
+ __field(u32, ps_nak_psn)
+ __field(u8, ps_nak_state)
+ __field(u8, prnr_nak_state)
+ __field(u32, hw_flow_index)
+ __field(u32, generation)
+ __field(u32, fpsn)
+ __field(u32, flow_flags)
+ __field(bool, resync)
+ __field(u32, r_next_psn_kdeth)
+ ),
+ TP_fast_assign(/* assign */
+ struct hfi1_qp_priv *priv = qp->priv;
+
+ DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device));
+ __entry->qpn = qp->ibqp.qp_num;
+ __entry->r_tid_head = priv->r_tid_head;
+ __entry->r_tid_tail = priv->r_tid_tail;
+ __entry->r_tid_ack = priv->r_tid_ack;
+ __entry->r_tid_alloc = priv->r_tid_alloc;
+ __entry->alloc_w_segs = priv->alloc_w_segs;
+ __entry->pending_tid_w_segs = priv->pending_tid_w_segs;
+ __entry->sync_pt = priv->sync_pt;
+ __entry->ps_nak_psn = priv->s_nak_psn;
+ __entry->ps_nak_state = priv->s_nak_state;
+ __entry->prnr_nak_state = priv->rnr_nak_state;
+ __entry->hw_flow_index = priv->flow_state.index;
+ __entry->generation = priv->flow_state.generation;
+ __entry->fpsn = priv->flow_state.psn;
+ __entry->flow_flags = priv->flow_state.flags;
+ __entry->resync = priv->resync;
+ __entry->r_next_psn_kdeth = priv->r_next_psn_kdeth;
+ ),
+ TP_printk(/* print */
+ TID_WRITE_RSPDR_PRN,
+ __get_str(dev),
+ __entry->qpn,
+ __entry->r_tid_head,
+ __entry->r_tid_tail,
+ __entry->r_tid_ack,
+ __entry->r_tid_alloc,
+ __entry->alloc_w_segs,
+ __entry->pending_tid_w_segs,
+ __entry->sync_pt ? "yes" : "no",
+ __entry->ps_nak_psn,
+ __entry->ps_nak_state,
+ __entry->prnr_nak_state,
+ __entry->hw_flow_index,
+ __entry->generation,
+ __entry->fpsn,
+ __entry->flow_flags,
+ __entry->resync ? "yes" : "no",
+ __entry->r_next_psn_kdeth
+ )
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_write_rsp_template, hfi1_tid_write_rsp_alloc_res,
+ TP_PROTO(struct rvt_qp *qp),
+ TP_ARGS(qp)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_write_rsp_template, hfi1_tid_write_rsp_rcv_req,
+ TP_PROTO(struct rvt_qp *qp),
+ TP_ARGS(qp)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_write_rsp_template, hfi1_tid_write_rsp_build_resp,
+ TP_PROTO(struct rvt_qp *qp),
+ TP_ARGS(qp)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_write_rsp_template, hfi1_tid_write_rsp_rcv_data,
+ TP_PROTO(struct rvt_qp *qp),
+ TP_ARGS(qp)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_write_rsp_template, hfi1_tid_write_rsp_rcv_resync,
+ TP_PROTO(struct rvt_qp *qp),
+ TP_ARGS(qp)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_write_rsp_template, hfi1_tid_write_rsp_make_tid_ack,
+ TP_PROTO(struct rvt_qp *qp),
+ TP_ARGS(qp)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_write_rsp_template, hfi1_tid_write_rsp_handle_kdeth_eflags,
+ TP_PROTO(struct rvt_qp *qp),
+ TP_ARGS(qp)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_write_rsp_template, hfi1_tid_write_rsp_make_rc_ack,
+ TP_PROTO(struct rvt_qp *qp),
+ TP_ARGS(qp)
+);
+
+DECLARE_EVENT_CLASS(/* tid_write_sender */
+ hfi1_tid_write_sender_template,
+ TP_PROTO(struct rvt_qp *qp, char newreq),
+ TP_ARGS(qp, newreq),
+ TP_STRUCT__entry(/* entry */
+ DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device))
+ __field(u32, qpn)
+ __field(char, newreq)
+ __field(u32, s_tid_cur)
+ __field(u32, s_tid_tail)
+ __field(u32, s_tid_head)
+ __field(u32, pending_tid_w_resp)
+ __field(u32, n_requests)
+ __field(u32, n_tid_requests)
+ __field(u32, s_flags)
+ __field(u32, ps_flags)
+ __field(unsigned long, iow_flags)
+ __field(u8, s_state)
+ __field(u8, s_retry)
+ ),
+ TP_fast_assign(/* assign */
+ struct hfi1_qp_priv *priv = qp->priv;
+
+ DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device));
+ __entry->qpn = qp->ibqp.qp_num;
+ __entry->newreq = newreq;
+ __entry->s_tid_cur = priv->s_tid_cur;
+ __entry->s_tid_tail = priv->s_tid_tail;
+ __entry->s_tid_head = priv->s_tid_head;
+ __entry->pending_tid_w_resp = priv->pending_tid_w_resp;
+ __entry->n_requests = atomic_read(&priv->n_requests);
+ __entry->n_tid_requests = atomic_read(&priv->n_tid_requests);
+ __entry->s_flags = qp->s_flags;
+ __entry->ps_flags = priv->s_flags;
+ __entry->iow_flags = priv->s_iowait.flags;
+ __entry->s_state = priv->s_state;
+ __entry->s_retry = priv->s_retry;
+ ),
+ TP_printk(/* print */
+ TID_WRITE_SENDER_PRN,
+ __get_str(dev),
+ __entry->qpn,
+ __entry->newreq,
+ __entry->s_tid_cur,
+ __entry->s_tid_tail,
+ __entry->s_tid_head,
+ __entry->pending_tid_w_resp,
+ __entry->n_requests,
+ __entry->n_tid_requests,
+ __entry->s_flags,
+ __entry->ps_flags,
+ __entry->iow_flags,
+ __entry->s_state,
+ __entry->s_retry
+ )
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_write_sender_template, hfi1_tid_write_sender_rcv_resp,
+ TP_PROTO(struct rvt_qp *qp, char newreq),
+ TP_ARGS(qp, newreq)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_write_sender_template, hfi1_tid_write_sender_rcv_tid_ack,
+ TP_PROTO(struct rvt_qp *qp, char newreq),
+ TP_ARGS(qp, newreq)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_write_sender_template, hfi1_tid_write_sender_retry_timeout,
+ TP_PROTO(struct rvt_qp *qp, char newreq),
+ TP_ARGS(qp, newreq)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_write_sender_template, hfi1_tid_write_sender_make_tid_pkt,
+ TP_PROTO(struct rvt_qp *qp, char newreq),
+ TP_ARGS(qp, newreq)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_write_sender_template, hfi1_tid_write_sender_make_req,
+ TP_PROTO(struct rvt_qp *qp, char newreq),
+ TP_ARGS(qp, newreq)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_write_sender_template, hfi1_tid_write_sender_restart_rc,
+ TP_PROTO(struct rvt_qp *qp, char newreq),
+ TP_ARGS(qp, newreq)
+);
+
+DECLARE_EVENT_CLASS(/* tid_ack */
+ hfi1_tid_ack_template,
+ TP_PROTO(struct rvt_qp *qp, u32 aeth, u32 psn,
+ u32 req_psn, u32 resync_psn),
+ TP_ARGS(qp, aeth, psn, req_psn, resync_psn),
+ TP_STRUCT__entry(/* entry */
+ DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device))
+ __field(u32, qpn)
+ __field(u32, aeth)
+ __field(u32, psn)
+ __field(u32, req_psn)
+ __field(u32, resync_psn)
+ ),
+ TP_fast_assign(/* assign */
+ DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device))
+ __entry->qpn = qp->ibqp.qp_num;
+ __entry->aeth = aeth;
+ __entry->psn = psn;
+ __entry->req_psn = req_psn;
+ __entry->resync_psn = resync_psn;
+ ),
+ TP_printk(/* print */
+ "[%s] qpn 0x%x aeth 0x%x psn 0x%x req_psn 0x%x resync_psn 0x%x",
+ __get_str(dev),
+ __entry->qpn,
+ __entry->aeth,
+ __entry->psn,
+ __entry->req_psn,
+ __entry->resync_psn
+ )
+);
+
+DEFINE_EVENT(/* rcv_tid_ack */
+ hfi1_tid_ack_template, hfi1_rcv_tid_ack,
+ TP_PROTO(struct rvt_qp *qp, u32 aeth, u32 psn,
+ u32 req_psn, u32 resync_psn),
+ TP_ARGS(qp, aeth, psn, req_psn, resync_psn)
+);
+
+DECLARE_EVENT_CLASS(/* kdeth_eflags_error */
+ hfi1_kdeth_eflags_error_template,
+ TP_PROTO(struct rvt_qp *qp, u8 rcv_type, u8 rte, u32 psn),
+ TP_ARGS(qp, rcv_type, rte, psn),
+ TP_STRUCT__entry(/* entry */
+ DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device))
+ __field(u32, qpn)
+ __field(u8, rcv_type)
+ __field(u8, rte)
+ __field(u32, psn)
+ ),
+ TP_fast_assign(/* assign */
+ DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device));
+ __entry->qpn = qp->ibqp.qp_num;
+ __entry->rcv_type = rcv_type;
+ __entry->rte = rte;
+ __entry->psn = psn;
+ ),
+ TP_printk(/* print */
+ KDETH_EFLAGS_ERR_PRN,
+ __get_str(dev),
+ __entry->qpn,
+ __entry->rcv_type,
+ __entry->rte,
+ __entry->psn
+ )
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_kdeth_eflags_error_template, hfi1_eflags_err_write,
+ TP_PROTO(struct rvt_qp *qp, u8 rcv_type, u8 rte, u32 psn),
+ TP_ARGS(qp, rcv_type, rte, psn)
+);
+
+#endif /* __HFI1_TRACE_TID_H */
+
+#undef TRACE_INCLUDE_PATH
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_PATH .
+#define TRACE_INCLUDE_FILE trace_tid
+#include <trace/define_trace.h>
diff --git a/drivers/infiniband/hw/hfi1/trace_tx.h b/drivers/infiniband/hw/hfi1/trace_tx.h
index c57af3b31fe1..09eb0c9ada00 100644
--- a/drivers/infiniband/hw/hfi1/trace_tx.h
+++ b/drivers/infiniband/hw/hfi1/trace_tx.h
@@ -114,19 +114,27 @@ DECLARE_EVENT_CLASS(hfi1_qpsleepwakeup_template,
__field(u32, qpn)
__field(u32, flags)
__field(u32, s_flags)
+ __field(u32, ps_flags)
+ __field(unsigned long, iow_flags)
),
TP_fast_assign(
DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device))
__entry->flags = flags;
__entry->qpn = qp->ibqp.qp_num;
__entry->s_flags = qp->s_flags;
+ __entry->ps_flags =
+ ((struct hfi1_qp_priv *)qp->priv)->s_flags;
+ __entry->iow_flags =
+ ((struct hfi1_qp_priv *)qp->priv)->s_iowait.flags;
),
TP_printk(
- "[%s] qpn 0x%x flags 0x%x s_flags 0x%x",
+ "[%s] qpn 0x%x flags 0x%x s_flags 0x%x ps_flags 0x%x iow_flags 0x%lx",
__get_str(dev),
__entry->qpn,
__entry->flags,
- __entry->s_flags
+ __entry->s_flags,
+ __entry->ps_flags,
+ __entry->iow_flags
)
);
@@ -838,6 +846,12 @@ DEFINE_EVENT(
TP_ARGS(qp, flag)
);
+DEFINE_EVENT(/* event */
+ hfi1_do_send_template, hfi1_rc_do_tid_send,
+ TP_PROTO(struct rvt_qp *qp, bool flag),
+ TP_ARGS(qp, flag)
+);
+
DEFINE_EVENT(
hfi1_do_send_template, hfi1_rc_expired_time_slice,
TP_PROTO(struct rvt_qp *qp, bool flag),
diff --git a/drivers/infiniband/hw/hfi1/uc.c b/drivers/infiniband/hw/hfi1/uc.c
index 6aca0c5a7f97..4ed4fcfabd6c 100644
--- a/drivers/infiniband/hw/hfi1/uc.c
+++ b/drivers/infiniband/hw/hfi1/uc.c
@@ -271,7 +271,8 @@ int hfi1_make_uc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
ps->s_txreq->ss = &qp->s_sge;
ps->s_txreq->s_cur_size = len;
hfi1_make_ruc_header(qp, ohdr, bth0 | (qp->s_state << 24),
- mask_psn(qp->s_psn++), middle, ps);
+ qp->remote_qpn, mask_psn(qp->s_psn++),
+ middle, ps);
return 1;
done_free_tx:
@@ -321,7 +322,7 @@ void hfi1_uc_rcv(struct hfi1_packet *packet)
if (hfi1_ruc_check_hdr(ibp, packet))
return;
- process_ecn(qp, packet, true);
+ process_ecn(qp, packet);
psn = ib_bth_get_psn(ohdr);
/* Compare the PSN verses the expected PSN. */
diff --git a/drivers/infiniband/hw/hfi1/ud.c b/drivers/infiniband/hw/hfi1/ud.c
index 4baa8f4d49de..f88ad425664a 100644
--- a/drivers/infiniband/hw/hfi1/ud.c
+++ b/drivers/infiniband/hw/hfi1/ud.c
@@ -51,6 +51,7 @@
#include "hfi.h"
#include "mad.h"
#include "verbs_txreq.h"
+#include "trace_ibhdrs.h"
#include "qp.h"
/* We support only two types - 9B and 16B for now */
@@ -221,31 +222,11 @@ static void ud_loopback(struct rvt_qp *sqp, struct rvt_swqe *swqe)
ssge.num_sge = swqe->wr.num_sge;
sge = &ssge.sge;
while (length) {
- u32 len = sge->length;
+ u32 len = rvt_get_sge_length(sge, length);
- if (len > length)
- len = length;
- if (len > sge->sge_length)
- len = sge->sge_length;
WARN_ON_ONCE(len == 0);
rvt_copy_sge(qp, &qp->r_sge, sge->vaddr, len, true, false);
- sge->vaddr += len;
- sge->length -= len;
- sge->sge_length -= len;
- if (sge->sge_length == 0) {
- if (--ssge.num_sge)
- *sge = *ssge.sg_list++;
- } else if (sge->length == 0 && sge->mr->lkey) {
- if (++sge->n >= RVT_SEGSZ) {
- if (++sge->m >= sge->mr->mapsz)
- break;
- sge->n = 0;
- }
- sge->vaddr =
- sge->mr->map[sge->m]->segs[sge->n].vaddr;
- sge->length =
- sge->mr->map[sge->m]->segs[sge->n].length;
- }
+ rvt_update_sge(&ssge, len, false);
length -= len;
}
rvt_put_ss(&qp->r_sge);
@@ -656,18 +637,19 @@ void return_cnp_16B(struct hfi1_ibport *ibp, struct rvt_qp *qp,
u32 bth0, plen, vl, hwords = 7;
u16 len;
u8 l4;
- struct hfi1_16b_header hdr;
+ struct hfi1_opa_header hdr;
struct ib_other_headers *ohdr;
struct pio_buf *pbuf;
struct send_context *ctxt = qp_to_send_context(qp, sc5);
struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
u32 nwords;
+ hdr.hdr_type = HFI1_PKT_TYPE_16B;
/* Populate length */
nwords = ((hfi1_get_16b_padding(hwords << 2, 0) +
SIZE_OF_LT) >> 2) + SIZE_OF_CRC;
if (old_grh) {
- struct ib_grh *grh = &hdr.u.l.grh;
+ struct ib_grh *grh = &hdr.opah.u.l.grh;
grh->version_tclass_flow = old_grh->version_tclass_flow;
grh->paylen = cpu_to_be16(
@@ -675,11 +657,11 @@ void return_cnp_16B(struct hfi1_ibport *ibp, struct rvt_qp *qp,
grh->hop_limit = 0xff;
grh->sgid = old_grh->dgid;
grh->dgid = old_grh->sgid;
- ohdr = &hdr.u.l.oth;
+ ohdr = &hdr.opah.u.l.oth;
l4 = OPA_16B_L4_IB_GLOBAL;
hwords += sizeof(struct ib_grh) / sizeof(u32);
} else {
- ohdr = &hdr.u.oth;
+ ohdr = &hdr.opah.u.oth;
l4 = OPA_16B_L4_IB_LOCAL;
}
@@ -693,7 +675,7 @@ void return_cnp_16B(struct hfi1_ibport *ibp, struct rvt_qp *qp,
/* Convert dwords to flits */
len = (hwords + nwords) >> 1;
- hfi1_make_16b_hdr(&hdr, slid, dlid, len, pkey, 1, 0, l4, sc5);
+ hfi1_make_16b_hdr(&hdr.opah, slid, dlid, len, pkey, 1, 0, l4, sc5);
plen = 2 /* PBC */ + hwords + nwords;
pbc_flags |= PBC_PACKET_BYPASS | PBC_INSERT_BYPASS_ICRC;
@@ -701,9 +683,11 @@ void return_cnp_16B(struct hfi1_ibport *ibp, struct rvt_qp *qp,
pbc = create_pbc(ppd, pbc_flags, qp->srate_mbps, vl, plen);
if (ctxt) {
pbuf = sc_buffer_alloc(ctxt, plen, NULL, NULL);
- if (pbuf)
+ if (pbuf) {
+ trace_pio_output_ibhdr(ppd->dd, &hdr, sc5);
ppd->dd->pio_inline_send(ppd->dd, pbuf, pbc,
&hdr, hwords);
+ }
}
}
@@ -715,14 +699,15 @@ void return_cnp(struct hfi1_ibport *ibp, struct rvt_qp *qp, u32 remote_qpn,
u32 bth0, plen, vl, hwords = 5;
u16 lrh0;
u8 sl = ibp->sc_to_sl[sc5];
- struct ib_header hdr;
+ struct hfi1_opa_header hdr;
struct ib_other_headers *ohdr;
struct pio_buf *pbuf;
struct send_context *ctxt = qp_to_send_context(qp, sc5);
struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+ hdr.hdr_type = HFI1_PKT_TYPE_9B;
if (old_grh) {
- struct ib_grh *grh = &hdr.u.l.grh;
+ struct ib_grh *grh = &hdr.ibh.u.l.grh;
grh->version_tclass_flow = old_grh->version_tclass_flow;
grh->paylen = cpu_to_be16(
@@ -730,11 +715,11 @@ void return_cnp(struct hfi1_ibport *ibp, struct rvt_qp *qp, u32 remote_qpn,
grh->hop_limit = 0xff;
grh->sgid = old_grh->dgid;
grh->dgid = old_grh->sgid;
- ohdr = &hdr.u.l.oth;
+ ohdr = &hdr.ibh.u.l.oth;
lrh0 = HFI1_LRH_GRH;
hwords += sizeof(struct ib_grh) / sizeof(u32);
} else {
- ohdr = &hdr.u.oth;
+ ohdr = &hdr.ibh.u.oth;
lrh0 = HFI1_LRH_BTH;
}
@@ -746,16 +731,18 @@ void return_cnp(struct hfi1_ibport *ibp, struct rvt_qp *qp, u32 remote_qpn,
ohdr->bth[1] = cpu_to_be32(remote_qpn | (1 << IB_BECN_SHIFT));
ohdr->bth[2] = 0; /* PSN 0 */
- hfi1_make_ib_hdr(&hdr, lrh0, hwords + SIZE_OF_CRC, dlid, slid);
+ hfi1_make_ib_hdr(&hdr.ibh, lrh0, hwords + SIZE_OF_CRC, dlid, slid);
plen = 2 /* PBC */ + hwords;
pbc_flags |= (ib_is_sc5(sc5) << PBC_DC_INFO_SHIFT);
vl = sc_to_vlt(ppd->dd, sc5);
pbc = create_pbc(ppd, pbc_flags, qp->srate_mbps, vl, plen);
if (ctxt) {
pbuf = sc_buffer_alloc(ctxt, plen, NULL, NULL);
- if (pbuf)
+ if (pbuf) {
+ trace_pio_output_ibhdr(ppd->dd, &hdr, sc5);
ppd->dd->pio_inline_send(ppd->dd, pbuf, pbc,
&hdr, hwords);
+ }
}
}
@@ -912,7 +899,7 @@ void hfi1_ud_rcv(struct hfi1_packet *packet)
src_qp = hfi1_16B_get_src_qpn(packet->mgmt);
}
- process_ecn(qp, packet, (opcode != IB_OPCODE_CNP));
+ process_ecn(qp, packet);
/*
* Get the number of bytes the message was padded by
* and drop incomplete packets.
@@ -980,7 +967,6 @@ void hfi1_ud_rcv(struct hfi1_packet *packet)
opcode == IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE) {
wc.ex.imm_data = packet->ohdr->u.ud.imm_data;
wc.wc_flags = IB_WC_WITH_IMM;
- tlen -= sizeof(u32);
} else if (opcode == IB_OPCODE_UD_SEND_ONLY) {
wc.ex.imm_data = 0;
wc.wc_flags = 0;
diff --git a/drivers/infiniband/hw/hfi1/user_exp_rcv.c b/drivers/infiniband/hw/hfi1/user_exp_rcv.c
index dbe7d14a5c76..0cd71ce7cc71 100644
--- a/drivers/infiniband/hw/hfi1/user_exp_rcv.c
+++ b/drivers/infiniband/hw/hfi1/user_exp_rcv.c
@@ -232,7 +232,7 @@ static int pin_rcv_pages(struct hfi1_filedata *fd, struct tid_user_buf *tidbuf)
}
/* Verify that access is OK for the user buffer */
- if (!access_ok(VERIFY_WRITE, (void __user *)vaddr,
+ if (!access_ok((void __user *)vaddr,
npages * PAGE_SIZE)) {
dd_dev_err(dd, "Fail vaddr %p, %u pages, !access_ok\n",
(void *)vaddr, npages);
diff --git a/drivers/infiniband/hw/hfi1/user_exp_rcv.h b/drivers/infiniband/hw/hfi1/user_exp_rcv.h
index e383cc01a2bf..43b105de1d54 100644
--- a/drivers/infiniband/hw/hfi1/user_exp_rcv.h
+++ b/drivers/infiniband/hw/hfi1/user_exp_rcv.h
@@ -48,7 +48,6 @@
*/
#include "hfi.h"
-
#include "exp_rcv.h"
struct tid_pageset {
diff --git a/drivers/infiniband/hw/hfi1/user_pages.c b/drivers/infiniband/hw/hfi1/user_pages.c
index e341e6dcc388..24b592c6522e 100644
--- a/drivers/infiniband/hw/hfi1/user_pages.c
+++ b/drivers/infiniband/hw/hfi1/user_pages.c
@@ -91,9 +91,7 @@ bool hfi1_can_pin_pages(struct hfi1_devdata *dd, struct mm_struct *mm,
/* Convert to number of pages */
size = DIV_ROUND_UP(size, PAGE_SIZE);
- down_read(&mm->mmap_sem);
- pinned = mm->pinned_vm;
- up_read(&mm->mmap_sem);
+ pinned = atomic64_read(&mm->pinned_vm);
/* First, check the absolute limit against all pinned pages. */
if (pinned + npages >= ulimit && !can_lock)
@@ -111,9 +109,7 @@ int hfi1_acquire_user_pages(struct mm_struct *mm, unsigned long vaddr, size_t np
if (ret < 0)
return ret;
- down_write(&mm->mmap_sem);
- mm->pinned_vm += ret;
- up_write(&mm->mmap_sem);
+ atomic64_add(ret, &mm->pinned_vm);
return ret;
}
@@ -130,8 +126,6 @@ void hfi1_release_user_pages(struct mm_struct *mm, struct page **p,
}
if (mm) { /* during close after signal, mm can be NULL */
- down_write(&mm->mmap_sem);
- mm->pinned_vm -= npages;
- up_write(&mm->mmap_sem);
+ atomic64_sub(npages, &mm->pinned_vm);
}
}
diff --git a/drivers/infiniband/hw/hfi1/user_sdma.c b/drivers/infiniband/hw/hfi1/user_sdma.c
index 3f0aadccd9f6..8bfbc6d7ea34 100644
--- a/drivers/infiniband/hw/hfi1/user_sdma.c
+++ b/drivers/infiniband/hw/hfi1/user_sdma.c
@@ -130,7 +130,6 @@ static int defer_packet_queue(
{
struct hfi1_user_sdma_pkt_q *pq =
container_of(wait->iow, struct hfi1_user_sdma_pkt_q, busy);
- struct hfi1_ibdev *dev = &pq->dd->verbs_dev;
struct user_sdma_txreq *tx =
container_of(txreq, struct user_sdma_txreq, txreq);
@@ -144,10 +143,12 @@ static int defer_packet_queue(
* it is supposed to be enqueued.
*/
xchg(&pq->state, SDMA_PKT_Q_DEFERRED);
- write_seqlock(&dev->iowait_lock);
- if (list_empty(&pq->busy.list))
+ write_seqlock(&sde->waitlock);
+ if (list_empty(&pq->busy.list)) {
+ iowait_get_priority(&pq->busy);
iowait_queue(pkts_sent, &pq->busy, &sde->dmawait);
- write_sequnlock(&dev->iowait_lock);
+ }
+ write_sequnlock(&sde->waitlock);
return -EBUSY;
eagain:
return -EAGAIN;
@@ -192,7 +193,7 @@ int hfi1_user_sdma_alloc_queues(struct hfi1_ctxtdata *uctxt,
pq->mm = fd->mm;
iowait_init(&pq->busy, 0, NULL, NULL, defer_packet_queue,
- activate_packet_queue, NULL);
+ activate_packet_queue, NULL, NULL);
pq->reqidx = 0;
pq->reqs = kcalloc(hfi1_sdma_comp_ring_size,
@@ -1127,7 +1128,8 @@ static inline u32 set_pkt_bth_psn(__be32 bthpsn, u8 expct, u32 frags)
0xffffffull),
psn = val & mask;
if (expct)
- psn = (psn & ~BTH_SEQ_MASK) | ((psn + frags) & BTH_SEQ_MASK);
+ psn = (psn & ~HFI1_KDETH_BTH_SEQ_MASK) |
+ ((psn + frags) & HFI1_KDETH_BTH_SEQ_MASK);
else
psn = psn + frags;
return psn & mask;
diff --git a/drivers/infiniband/hw/hfi1/verbs.c b/drivers/infiniband/hw/hfi1/verbs.c
index a365089a9305..55a56b3d7f83 100644
--- a/drivers/infiniband/hw/hfi1/verbs.c
+++ b/drivers/infiniband/hw/hfi1/verbs.c
@@ -161,10 +161,12 @@ MODULE_PARM_DESC(wss_clean_period, "Count of verbs copies before an entry in the
*/
const enum ib_wc_opcode ib_hfi1_wc_opcode[] = {
[IB_WR_RDMA_WRITE] = IB_WC_RDMA_WRITE,
+ [IB_WR_TID_RDMA_WRITE] = IB_WC_RDMA_WRITE,
[IB_WR_RDMA_WRITE_WITH_IMM] = IB_WC_RDMA_WRITE,
[IB_WR_SEND] = IB_WC_SEND,
[IB_WR_SEND_WITH_IMM] = IB_WC_SEND,
[IB_WR_RDMA_READ] = IB_WC_RDMA_READ,
+ [IB_WR_TID_RDMA_READ] = IB_WC_RDMA_READ,
[IB_WR_ATOMIC_CMP_AND_SWP] = IB_WC_COMP_SWAP,
[IB_WR_ATOMIC_FETCH_AND_ADD] = IB_WC_FETCH_ADD,
[IB_WR_SEND_WITH_INV] = IB_WC_SEND,
@@ -200,6 +202,14 @@ const u8 hdr_len_by_opcode[256] = {
[IB_OPCODE_RC_FETCH_ADD] = 12 + 8 + 28,
[IB_OPCODE_RC_SEND_LAST_WITH_INVALIDATE] = 12 + 8 + 4,
[IB_OPCODE_RC_SEND_ONLY_WITH_INVALIDATE] = 12 + 8 + 4,
+ [IB_OPCODE_TID_RDMA_READ_REQ] = 12 + 8 + 36,
+ [IB_OPCODE_TID_RDMA_READ_RESP] = 12 + 8 + 36,
+ [IB_OPCODE_TID_RDMA_WRITE_REQ] = 12 + 8 + 36,
+ [IB_OPCODE_TID_RDMA_WRITE_RESP] = 12 + 8 + 36,
+ [IB_OPCODE_TID_RDMA_WRITE_DATA] = 12 + 8 + 36,
+ [IB_OPCODE_TID_RDMA_WRITE_DATA_LAST] = 12 + 8 + 36,
+ [IB_OPCODE_TID_RDMA_ACK] = 12 + 8 + 36,
+ [IB_OPCODE_TID_RDMA_RESYNC] = 12 + 8 + 36,
/* UC */
[IB_OPCODE_UC_SEND_FIRST] = 12 + 8,
[IB_OPCODE_UC_SEND_MIDDLE] = 12 + 8,
@@ -243,6 +253,17 @@ static const opcode_handler opcode_handler_tbl[256] = {
[IB_OPCODE_RC_FETCH_ADD] = &hfi1_rc_rcv,
[IB_OPCODE_RC_SEND_LAST_WITH_INVALIDATE] = &hfi1_rc_rcv,
[IB_OPCODE_RC_SEND_ONLY_WITH_INVALIDATE] = &hfi1_rc_rcv,
+
+ /* TID RDMA has separate handlers for different opcodes. */
+ [IB_OPCODE_TID_RDMA_WRITE_REQ] = &hfi1_rc_rcv_tid_rdma_write_req,
+ [IB_OPCODE_TID_RDMA_WRITE_RESP] = &hfi1_rc_rcv_tid_rdma_write_resp,
+ [IB_OPCODE_TID_RDMA_WRITE_DATA] = &hfi1_rc_rcv_tid_rdma_write_data,
+ [IB_OPCODE_TID_RDMA_WRITE_DATA_LAST] = &hfi1_rc_rcv_tid_rdma_write_data,
+ [IB_OPCODE_TID_RDMA_READ_REQ] = &hfi1_rc_rcv_tid_rdma_read_req,
+ [IB_OPCODE_TID_RDMA_READ_RESP] = &hfi1_rc_rcv_tid_rdma_read_resp,
+ [IB_OPCODE_TID_RDMA_RESYNC] = &hfi1_rc_rcv_tid_rdma_resync,
+ [IB_OPCODE_TID_RDMA_ACK] = &hfi1_rc_rcv_tid_rdma_ack,
+
/* UC */
[IB_OPCODE_UC_SEND_FIRST] = &hfi1_uc_rcv,
[IB_OPCODE_UC_SEND_MIDDLE] = &hfi1_uc_rcv,
@@ -308,7 +329,7 @@ static inline opcode_handler qp_ok(struct hfi1_packet *packet)
static u64 hfi1_fault_tx(struct rvt_qp *qp, u8 opcode, u64 pbc)
{
#ifdef CONFIG_FAULT_INJECTION
- if ((opcode & IB_OPCODE_MSP) == IB_OPCODE_MSP)
+ if ((opcode & IB_OPCODE_MSP) == IB_OPCODE_MSP) {
/*
* In order to drop non-IB traffic we
* set PbcInsertHrc to NONE (0x2).
@@ -319,8 +340,9 @@ static u64 hfi1_fault_tx(struct rvt_qp *qp, u8 opcode, u64 pbc)
* packet will not be delivered to the
* correct context.
*/
+ pbc &= ~PBC_INSERT_HCRC_SMASK;
pbc |= (u64)PBC_IHCRC_NONE << PBC_INSERT_HCRC_SHIFT;
- else
+ } else {
/*
* In order to drop regular verbs
* traffic we set the PbcTestEbp
@@ -330,10 +352,129 @@ static u64 hfi1_fault_tx(struct rvt_qp *qp, u8 opcode, u64 pbc)
* triggered and will be dropped.
*/
pbc |= PBC_TEST_EBP;
+ }
#endif
return pbc;
}
+static opcode_handler tid_qp_ok(int opcode, struct hfi1_packet *packet)
+{
+ if (packet->qp->ibqp.qp_type != IB_QPT_RC ||
+ !(ib_rvt_state_ops[packet->qp->state] & RVT_PROCESS_RECV_OK))
+ return NULL;
+ if ((opcode & RVT_OPCODE_QP_MASK) == IB_OPCODE_TID_RDMA)
+ return opcode_handler_tbl[opcode];
+ return NULL;
+}
+
+void hfi1_kdeth_eager_rcv(struct hfi1_packet *packet)
+{
+ struct hfi1_ctxtdata *rcd = packet->rcd;
+ struct ib_header *hdr = packet->hdr;
+ u32 tlen = packet->tlen;
+ struct hfi1_pportdata *ppd = rcd->ppd;
+ struct hfi1_ibport *ibp = &ppd->ibport_data;
+ struct rvt_dev_info *rdi = &ppd->dd->verbs_dev.rdi;
+ opcode_handler opcode_handler;
+ unsigned long flags;
+ u32 qp_num;
+ int lnh;
+ u8 opcode;
+
+ /* DW == LRH (2) + BTH (3) + KDETH (9) + CRC (1) */
+ if (unlikely(tlen < 15 * sizeof(u32)))
+ goto drop;
+
+ lnh = be16_to_cpu(hdr->lrh[0]) & 3;
+ if (lnh != HFI1_LRH_BTH)
+ goto drop;
+
+ packet->ohdr = &hdr->u.oth;
+ trace_input_ibhdr(rcd->dd, packet, !!(rhf_dc_info(packet->rhf)));
+
+ opcode = (be32_to_cpu(packet->ohdr->bth[0]) >> 24);
+ inc_opstats(tlen, &rcd->opstats->stats[opcode]);
+
+ /* verbs_qp can be picked up from any tid_rdma header struct */
+ qp_num = be32_to_cpu(packet->ohdr->u.tid_rdma.r_req.verbs_qp) &
+ RVT_QPN_MASK;
+
+ rcu_read_lock();
+ packet->qp = rvt_lookup_qpn(rdi, &ibp->rvp, qp_num);
+ if (!packet->qp)
+ goto drop_rcu;
+ spin_lock_irqsave(&packet->qp->r_lock, flags);
+ opcode_handler = tid_qp_ok(opcode, packet);
+ if (likely(opcode_handler))
+ opcode_handler(packet);
+ else
+ goto drop_unlock;
+ spin_unlock_irqrestore(&packet->qp->r_lock, flags);
+ rcu_read_unlock();
+
+ return;
+drop_unlock:
+ spin_unlock_irqrestore(&packet->qp->r_lock, flags);
+drop_rcu:
+ rcu_read_unlock();
+drop:
+ ibp->rvp.n_pkt_drops++;
+}
+
+void hfi1_kdeth_expected_rcv(struct hfi1_packet *packet)
+{
+ struct hfi1_ctxtdata *rcd = packet->rcd;
+ struct ib_header *hdr = packet->hdr;
+ u32 tlen = packet->tlen;
+ struct hfi1_pportdata *ppd = rcd->ppd;
+ struct hfi1_ibport *ibp = &ppd->ibport_data;
+ struct rvt_dev_info *rdi = &ppd->dd->verbs_dev.rdi;
+ opcode_handler opcode_handler;
+ unsigned long flags;
+ u32 qp_num;
+ int lnh;
+ u8 opcode;
+
+ /* DW == LRH (2) + BTH (3) + KDETH (9) + CRC (1) */
+ if (unlikely(tlen < 15 * sizeof(u32)))
+ goto drop;
+
+ lnh = be16_to_cpu(hdr->lrh[0]) & 3;
+ if (lnh != HFI1_LRH_BTH)
+ goto drop;
+
+ packet->ohdr = &hdr->u.oth;
+ trace_input_ibhdr(rcd->dd, packet, !!(rhf_dc_info(packet->rhf)));
+
+ opcode = (be32_to_cpu(packet->ohdr->bth[0]) >> 24);
+ inc_opstats(tlen, &rcd->opstats->stats[opcode]);
+
+ /* verbs_qp can be picked up from any tid_rdma header struct */
+ qp_num = be32_to_cpu(packet->ohdr->u.tid_rdma.r_rsp.verbs_qp) &
+ RVT_QPN_MASK;
+
+ rcu_read_lock();
+ packet->qp = rvt_lookup_qpn(rdi, &ibp->rvp, qp_num);
+ if (!packet->qp)
+ goto drop_rcu;
+ spin_lock_irqsave(&packet->qp->r_lock, flags);
+ opcode_handler = tid_qp_ok(opcode, packet);
+ if (likely(opcode_handler))
+ opcode_handler(packet);
+ else
+ goto drop_unlock;
+ spin_unlock_irqrestore(&packet->qp->r_lock, flags);
+ rcu_read_unlock();
+
+ return;
+drop_unlock:
+ spin_unlock_irqrestore(&packet->qp->r_lock, flags);
+drop_rcu:
+ rcu_read_unlock();
+drop:
+ ibp->rvp.n_pkt_drops++;
+}
+
static int hfi1_do_pkey_check(struct hfi1_packet *packet)
{
struct hfi1_ctxtdata *rcd = packet->rcd;
@@ -504,11 +645,28 @@ static void verbs_sdma_complete(
hfi1_put_txreq(tx);
}
+void hfi1_wait_kmem(struct rvt_qp *qp)
+{
+ struct hfi1_qp_priv *priv = qp->priv;
+ struct ib_qp *ibqp = &qp->ibqp;
+ struct ib_device *ibdev = ibqp->device;
+ struct hfi1_ibdev *dev = to_idev(ibdev);
+
+ if (list_empty(&priv->s_iowait.list)) {
+ if (list_empty(&dev->memwait))
+ mod_timer(&dev->mem_timer, jiffies + 1);
+ qp->s_flags |= RVT_S_WAIT_KMEM;
+ list_add_tail(&priv->s_iowait.list, &dev->memwait);
+ priv->s_iowait.lock = &dev->iowait_lock;
+ trace_hfi1_qpsleep(qp, RVT_S_WAIT_KMEM);
+ rvt_get_qp(qp);
+ }
+}
+
static int wait_kmem(struct hfi1_ibdev *dev,
struct rvt_qp *qp,
struct hfi1_pkt_state *ps)
{
- struct hfi1_qp_priv *priv = qp->priv;
unsigned long flags;
int ret = 0;
@@ -517,15 +675,7 @@ static int wait_kmem(struct hfi1_ibdev *dev,
write_seqlock(&dev->iowait_lock);
list_add_tail(&ps->s_txreq->txreq.list,
&ps->wait->tx_head);
- if (list_empty(&priv->s_iowait.list)) {
- if (list_empty(&dev->memwait))
- mod_timer(&dev->mem_timer, jiffies + 1);
- qp->s_flags |= RVT_S_WAIT_KMEM;
- list_add_tail(&priv->s_iowait.list, &dev->memwait);
- priv->s_iowait.lock = &dev->iowait_lock;
- trace_hfi1_qpsleep(qp, RVT_S_WAIT_KMEM);
- rvt_get_qp(qp);
- }
+ hfi1_wait_kmem(qp);
write_sequnlock(&dev->iowait_lock);
hfi1_qp_unbusy(qp, ps->wait);
ret = -EBUSY;
@@ -553,11 +703,7 @@ static noinline int build_verbs_ulp_payload(
int ret = 0;
while (length) {
- len = ss->sge.length;
- if (len > length)
- len = length;
- if (len > ss->sge.sge_length)
- len = ss->sge.sge_length;
+ len = rvt_get_sge_length(&ss->sge, length);
WARN_ON_ONCE(len == 0);
ret = sdma_txadd_kvaddr(
sde->dd,
@@ -678,6 +824,15 @@ bail_txadd:
return ret;
}
+static u64 update_hcrc(u8 opcode, u64 pbc)
+{
+ if ((opcode & IB_OPCODE_TID_RDMA) == IB_OPCODE_TID_RDMA) {
+ pbc &= ~PBC_INSERT_HCRC_SMASK;
+ pbc |= (u64)PBC_IHCRC_LKDETH << PBC_INSERT_HCRC_SHIFT;
+ }
+ return pbc;
+}
+
int hfi1_verbs_send_dma(struct rvt_qp *qp, struct hfi1_pkt_state *ps,
u64 pbc)
{
@@ -723,6 +878,9 @@ int hfi1_verbs_send_dma(struct rvt_qp *qp, struct hfi1_pkt_state *ps,
qp->srate_mbps,
vl,
plen);
+
+ /* Update HCRC based on packet opcode */
+ pbc = update_hcrc(ps->opcode, pbc);
}
tx->wqe = qp->s_wqe;
ret = build_verbs_tx_desc(tx->sde, len, tx, ahg_info, pbc);
@@ -765,7 +923,6 @@ static int pio_wait(struct rvt_qp *qp,
{
struct hfi1_qp_priv *priv = qp->priv;
struct hfi1_devdata *dd = sc->dd;
- struct hfi1_ibdev *dev = &dd->verbs_dev;
unsigned long flags;
int ret = 0;
@@ -777,7 +934,7 @@ static int pio_wait(struct rvt_qp *qp,
*/
spin_lock_irqsave(&qp->s_lock, flags);
if (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK) {
- write_seqlock(&dev->iowait_lock);
+ write_seqlock(&sc->waitlock);
list_add_tail(&ps->s_txreq->txreq.list,
&ps->wait->tx_head);
if (list_empty(&priv->s_iowait.list)) {
@@ -788,16 +945,17 @@ static int pio_wait(struct rvt_qp *qp,
dev->n_piodrain += !!(flag & HFI1_S_WAIT_PIO_DRAIN);
qp->s_flags |= flag;
was_empty = list_empty(&sc->piowait);
+ iowait_get_priority(&priv->s_iowait);
iowait_queue(ps->pkts_sent, &priv->s_iowait,
&sc->piowait);
- priv->s_iowait.lock = &dev->iowait_lock;
+ priv->s_iowait.lock = &sc->waitlock;
trace_hfi1_qpsleep(qp, RVT_S_WAIT_PIO);
rvt_get_qp(qp);
/* counting: only call wantpiobuf_intr if first user */
if (was_empty)
hfi1_sc_wantpiobuf_intr(sc, 1);
}
- write_sequnlock(&dev->iowait_lock);
+ write_sequnlock(&sc->waitlock);
hfi1_qp_unbusy(qp, ps->wait);
ret = -EBUSY;
}
@@ -872,6 +1030,9 @@ int hfi1_verbs_send_pio(struct rvt_qp *qp, struct hfi1_pkt_state *ps,
if (unlikely(hfi1_dbg_should_fault_tx(qp, ps->opcode)))
pbc = hfi1_fault_tx(qp, ps->opcode, pbc);
pbc = create_pbc(ppd, pbc, qp->srate_mbps, vl, plen);
+
+ /* Update HCRC based on packet opcode */
+ pbc = update_hcrc(ps->opcode, pbc);
}
if (cb)
iowait_pio_inc(&priv->s_iowait);
@@ -915,10 +1076,8 @@ int hfi1_verbs_send_pio(struct rvt_qp *qp, struct hfi1_pkt_state *ps,
if (ss) {
while (len) {
void *addr = ss->sge.vaddr;
- u32 slen = ss->sge.length;
+ u32 slen = rvt_get_sge_length(&ss->sge, len);
- if (slen > len)
- slen = len;
rvt_update_sge(ss, slen, false);
seg_pio_copy_mid(pbuf, addr, slen);
len -= slen;
@@ -1187,7 +1346,9 @@ static void hfi1_fill_device_attr(struct hfi1_devdata *dd)
rdi->dparms.props.max_mr_size = U64_MAX;
rdi->dparms.props.max_fast_reg_page_list_len = UINT_MAX;
rdi->dparms.props.max_qp = hfi1_max_qps;
- rdi->dparms.props.max_qp_wr = hfi1_max_qp_wrs;
+ rdi->dparms.props.max_qp_wr =
+ (hfi1_max_qp_wrs >= HFI1_QP_WQE_INVALID ?
+ HFI1_QP_WQE_INVALID - 1 : hfi1_max_qp_wrs);
rdi->dparms.props.max_send_sge = hfi1_max_sges;
rdi->dparms.props.max_recv_sge = hfi1_max_sges;
rdi->dparms.props.max_sge_rd = hfi1_max_sges;
@@ -1616,6 +1777,17 @@ static int get_hw_stats(struct ib_device *ibdev, struct rdma_hw_stats *stats,
return count;
}
+static const struct ib_device_ops hfi1_dev_ops = {
+ .alloc_hw_stats = alloc_hw_stats,
+ .alloc_rdma_netdev = hfi1_vnic_alloc_rn,
+ .get_dev_fw_str = hfi1_get_dev_fw_str,
+ .get_hw_stats = get_hw_stats,
+ .init_port = hfi1_create_port_files,
+ .modify_device = modify_device,
+ /* keep process mad in the driver */
+ .process_mad = hfi1_process_mad,
+};
+
/**
* hfi1_register_ib_device - register our device with the infiniband core
* @dd: the device data structure
@@ -1659,14 +1831,8 @@ int hfi1_register_ib_device(struct hfi1_devdata *dd)
ibdev->owner = THIS_MODULE;
ibdev->phys_port_cnt = dd->num_pports;
ibdev->dev.parent = &dd->pcidev->dev;
- ibdev->modify_device = modify_device;
- ibdev->alloc_hw_stats = alloc_hw_stats;
- ibdev->get_hw_stats = get_hw_stats;
- ibdev->alloc_rdma_netdev = hfi1_vnic_alloc_rn;
- /* keep process mad in the driver */
- ibdev->process_mad = hfi1_process_mad;
- ibdev->get_dev_fw_str = hfi1_get_dev_fw_str;
+ ib_set_device_ops(ibdev, &hfi1_dev_ops);
strlcpy(ibdev->node_desc, init_utsname()->nodename,
sizeof(ibdev->node_desc));
@@ -1674,7 +1840,6 @@ int hfi1_register_ib_device(struct hfi1_devdata *dd)
/*
* Fill in rvt info object.
*/
- dd->verbs_dev.rdi.driver_f.port_callback = hfi1_create_port_files;
dd->verbs_dev.rdi.driver_f.get_pci_dev = get_pci_dev;
dd->verbs_dev.rdi.driver_f.check_ah = hfi1_check_ah;
dd->verbs_dev.rdi.driver_f.notify_new_ah = hfi1_notify_new_ah;
@@ -1704,6 +1869,7 @@ int hfi1_register_ib_device(struct hfi1_devdata *dd)
dd->verbs_dev.rdi.dparms.max_mad_size = OPA_MGMT_MAD_SIZE;
dd->verbs_dev.rdi.driver_f.qp_priv_alloc = qp_priv_alloc;
+ dd->verbs_dev.rdi.driver_f.qp_priv_init = hfi1_qp_priv_init;
dd->verbs_dev.rdi.driver_f.qp_priv_free = qp_priv_free;
dd->verbs_dev.rdi.driver_f.free_all_qps = free_all_qps;
dd->verbs_dev.rdi.driver_f.notify_qp_reset = notify_qp_reset;
@@ -1737,6 +1903,8 @@ int hfi1_register_ib_device(struct hfi1_devdata *dd)
dd->verbs_dev.rdi.dparms.sge_copy_mode = sge_copy_mode;
dd->verbs_dev.rdi.dparms.wss_threshold = wss_threshold;
dd->verbs_dev.rdi.dparms.wss_clean_period = wss_clean_period;
+ dd->verbs_dev.rdi.dparms.reserved_operations = 1;
+ dd->verbs_dev.rdi.dparms.extra_rdma_atomic = HFI1_TID_RDMA_WRITE_CNT;
/* post send table */
dd->verbs_dev.rdi.post_parms = hfi1_post_parms;
diff --git a/drivers/infiniband/hw/hfi1/verbs.h b/drivers/infiniband/hw/hfi1/verbs.h
index 64c9054db5f3..62ace0b2d17a 100644
--- a/drivers/infiniband/hw/hfi1/verbs.h
+++ b/drivers/infiniband/hw/hfi1/verbs.h
@@ -71,6 +71,8 @@ struct hfi1_devdata;
struct hfi1_packet;
#include "iowait.h"
+#include "tid_rdma.h"
+#include "opfn.h"
#define HFI1_MAX_RDMA_ATOMIC 16
@@ -156,10 +158,69 @@ struct hfi1_qp_priv {
struct hfi1_ahg_info *s_ahg; /* ahg info for next header */
struct sdma_engine *s_sde; /* current sde */
struct send_context *s_sendcontext; /* current sendcontext */
+ struct hfi1_ctxtdata *rcd; /* QP's receive context */
+ struct page **pages; /* for TID page scan */
+ u32 tid_enqueue; /* saved when tid waited */
u8 s_sc; /* SC[0..4] for next packet */
struct iowait s_iowait;
+ struct timer_list s_tid_timer; /* for timing tid wait */
+ struct timer_list s_tid_retry_timer; /* for timing tid ack */
+ struct list_head tid_wait; /* for queueing tid space */
+ struct hfi1_opfn_data opfn;
+ struct tid_flow_state flow_state;
+ struct tid_rdma_qp_params tid_rdma;
struct rvt_qp *owner;
u8 hdr_type; /* 9B or 16B */
+ struct rvt_sge_state tid_ss; /* SGE state pointer for 2nd leg */
+ atomic_t n_requests; /* # of TID RDMA requests in the */
+ /* queue */
+ atomic_t n_tid_requests; /* # of sent TID RDMA requests */
+ unsigned long tid_timer_timeout_jiffies;
+ unsigned long tid_retry_timeout_jiffies;
+
+ /* variables for the TID RDMA SE state machine */
+ u8 s_state;
+ u8 s_retry;
+ u8 rnr_nak_state; /* RNR NAK state */
+ u8 s_nak_state;
+ u32 s_nak_psn;
+ u32 s_flags;
+ u32 s_tid_cur;
+ u32 s_tid_head;
+ u32 s_tid_tail;
+ u32 r_tid_head; /* Most recently added TID RDMA request */
+ u32 r_tid_tail; /* the last completed TID RDMA request */
+ u32 r_tid_ack; /* the TID RDMA request to be ACK'ed */
+ u32 r_tid_alloc; /* Request for which we are allocating resources */
+ u32 pending_tid_w_segs; /* Num of pending tid write segments */
+ u32 pending_tid_w_resp; /* Num of pending tid write responses */
+ u32 alloc_w_segs; /* Number of segments for which write */
+ /* resources have been allocated for this QP */
+
+ /* For TID RDMA READ */
+ u32 tid_r_reqs; /* Num of tid reads requested */
+ u32 tid_r_comp; /* Num of tid reads completed */
+ u32 pending_tid_r_segs; /* Num of pending tid read segments */
+ u16 pkts_ps; /* packets per segment */
+ u8 timeout_shift; /* account for number of packets per segment */
+
+ u32 r_next_psn_kdeth;
+ u32 r_next_psn_kdeth_save;
+ u32 s_resync_psn;
+ u8 sync_pt; /* Set when QP reaches sync point */
+ u8 resync;
+};
+
+#define HFI1_QP_WQE_INVALID ((u32)-1)
+
+struct hfi1_swqe_priv {
+ struct tid_rdma_request tid_req;
+ struct rvt_sge_state ss; /* Used for TID RDMA READ Request */
+};
+
+struct hfi1_ack_priv {
+ struct rvt_sge_state ss; /* used for TID WRITE RESP */
+ struct tid_rdma_request tid_req;
};
/*
@@ -223,6 +284,7 @@ struct hfi1_ibdev {
struct kmem_cache *verbs_txreq_cache;
u64 n_txwait;
u64 n_kmem_wait;
+ u64 n_tidwait;
/* protect iowait lists */
seqlock_t iowait_lock ____cacheline_aligned_in_smp;
@@ -310,6 +372,31 @@ static inline u32 delta_psn(u32 a, u32 b)
return (((int)a - (int)b) << PSN_SHIFT) >> PSN_SHIFT;
}
+static inline struct tid_rdma_request *wqe_to_tid_req(struct rvt_swqe *wqe)
+{
+ return &((struct hfi1_swqe_priv *)wqe->priv)->tid_req;
+}
+
+static inline struct tid_rdma_request *ack_to_tid_req(struct rvt_ack_entry *e)
+{
+ return &((struct hfi1_ack_priv *)e->priv)->tid_req;
+}
+
+/*
+ * Look through all the active flows for a TID RDMA request and find
+ * the one (if it exists) that contains the specified PSN.
+ */
+static inline u32 __full_flow_psn(struct flow_state *state, u32 psn)
+{
+ return mask_psn((state->generation << HFI1_KDETH_BTH_SEQ_SHIFT) |
+ (psn & HFI1_KDETH_BTH_SEQ_MASK));
+}
+
+static inline u32 full_flow_psn(struct tid_rdma_flow *flow, u32 psn)
+{
+ return __full_flow_psn(&flow->flow_state, psn);
+}
+
struct verbs_txreq;
void hfi1_put_txreq(struct verbs_txreq *tx);
@@ -354,9 +441,12 @@ u32 hfi1_make_grh(struct hfi1_ibport *ibp, struct ib_grh *hdr,
const struct ib_global_route *grh, u32 hwords, u32 nwords);
void hfi1_make_ruc_header(struct rvt_qp *qp, struct ib_other_headers *ohdr,
- u32 bth0, u32 bth2, int middle,
+ u32 bth0, u32 bth1, u32 bth2, int middle,
struct hfi1_pkt_state *ps);
+bool hfi1_schedule_send_yield(struct rvt_qp *qp, struct hfi1_pkt_state *ps,
+ bool tid);
+
void _hfi1_do_send(struct work_struct *work);
void hfi1_do_send_from_rvt(struct rvt_qp *qp);
@@ -375,6 +465,10 @@ int hfi1_register_ib_device(struct hfi1_devdata *);
void hfi1_unregister_ib_device(struct hfi1_devdata *);
+void hfi1_kdeth_eager_rcv(struct hfi1_packet *packet);
+
+void hfi1_kdeth_expected_rcv(struct hfi1_packet *packet);
+
void hfi1_ib_rcv(struct hfi1_packet *packet);
void hfi1_16B_rcv(struct hfi1_packet *packet);
@@ -392,6 +486,16 @@ static inline bool opa_bth_is_migration(struct ib_other_headers *ohdr)
return ohdr->bth[1] & cpu_to_be32(OPA_BTH_MIG_REQ);
}
+void hfi1_wait_kmem(struct rvt_qp *qp);
+
+static inline void hfi1_trdma_send_complete(struct rvt_qp *qp,
+ struct rvt_swqe *wqe,
+ enum ib_wc_status status)
+{
+ trdma_clean_swqe(qp, wqe);
+ rvt_send_complete(qp, wqe, status);
+}
+
extern const enum ib_wc_opcode ib_hfi1_wc_opcode[];
extern const u8 hdr_len_by_opcode[];
diff --git a/drivers/infiniband/hw/hfi1/verbs_txreq.h b/drivers/infiniband/hw/hfi1/verbs_txreq.h
index 2a77af26a231..b002e96eb335 100644
--- a/drivers/infiniband/hw/hfi1/verbs_txreq.h
+++ b/drivers/infiniband/hw/hfi1/verbs_txreq.h
@@ -94,6 +94,7 @@ static inline struct verbs_txreq *get_txreq(struct hfi1_ibdev *dev,
tx->txreq.num_desc = 0;
/* Set the header type */
tx->phdr.hdr.hdr_type = priv->hdr_type;
+ tx->txreq.flags = 0;
return tx;
}
diff --git a/drivers/infiniband/hw/hfi1/vnic_main.c b/drivers/infiniband/hw/hfi1/vnic_main.c
index c9876d9e3cb9..a922db58be14 100644
--- a/drivers/infiniband/hw/hfi1/vnic_main.c
+++ b/drivers/infiniband/hw/hfi1/vnic_main.c
@@ -816,14 +816,14 @@ struct net_device *hfi1_vnic_alloc_rn(struct ib_device *device,
size = sizeof(struct opa_vnic_rdma_netdev) + sizeof(*vinfo);
netdev = alloc_netdev_mqs(size, name, name_assign_type, setup,
- chip_sdma_engines(dd), dd->num_vnic_contexts);
+ dd->num_sdma, dd->num_vnic_contexts);
if (!netdev)
return ERR_PTR(-ENOMEM);
rn = netdev_priv(netdev);
vinfo = opa_vnic_dev_priv(netdev);
vinfo->dd = dd;
- vinfo->num_tx_q = chip_sdma_engines(dd);
+ vinfo->num_tx_q = dd->num_sdma;
vinfo->num_rx_q = dd->num_vnic_contexts;
vinfo->netdev = netdev;
rn->free_rdma_netdev = hfi1_vnic_free_rn;
diff --git a/drivers/infiniband/hw/hfi1/vnic_sdma.c b/drivers/infiniband/hw/hfi1/vnic_sdma.c
index 97bd940a056a..af1b1ffcb38e 100644
--- a/drivers/infiniband/hw/hfi1/vnic_sdma.c
+++ b/drivers/infiniband/hw/hfi1/vnic_sdma.c
@@ -57,7 +57,6 @@
#define HFI1_VNIC_TXREQ_NAME_LEN 32
#define HFI1_VNIC_SDMA_DESC_WTRMRK 64
-#define HFI1_VNIC_SDMA_RETRY_COUNT 1
/*
* struct vnic_txreq - VNIC transmit descriptor
@@ -67,7 +66,6 @@
* @pad: pad buffer
* @plen: pad length
* @pbc_val: pbc value
- * @retry_count: tx retry count
*/
struct vnic_txreq {
struct sdma_txreq txreq;
@@ -77,8 +75,6 @@ struct vnic_txreq {
unsigned char pad[HFI1_VNIC_MAX_PAD];
u16 plen;
__le64 pbc_val;
-
- u32 retry_count;
};
static void vnic_sdma_complete(struct sdma_txreq *txreq,
@@ -196,7 +192,6 @@ int hfi1_vnic_send_dma(struct hfi1_devdata *dd, u8 q_idx,
ret = build_vnic_tx_desc(sde, tx, pbc);
if (unlikely(ret))
goto free_desc;
- tx->retry_count = 0;
ret = sdma_send_txreq(sde, iowait_get_ib_work(&vnic_sdma->wait),
&tx->txreq, vnic_sdma->pkts_sent);
@@ -237,18 +232,19 @@ static int hfi1_vnic_sdma_sleep(struct sdma_engine *sde,
{
struct hfi1_vnic_sdma *vnic_sdma =
container_of(wait->iow, struct hfi1_vnic_sdma, wait);
- struct hfi1_ibdev *dev = &vnic_sdma->dd->verbs_dev;
- struct vnic_txreq *tx = container_of(txreq, struct vnic_txreq, txreq);
- if (sdma_progress(sde, seq, txreq))
- if (tx->retry_count++ < HFI1_VNIC_SDMA_RETRY_COUNT)
- return -EAGAIN;
+ write_seqlock(&sde->waitlock);
+ if (sdma_progress(sde, seq, txreq)) {
+ write_sequnlock(&sde->waitlock);
+ return -EAGAIN;
+ }
vnic_sdma->state = HFI1_VNIC_SDMA_Q_DEFERRED;
- write_seqlock(&dev->iowait_lock);
- if (list_empty(&vnic_sdma->wait.list))
+ if (list_empty(&vnic_sdma->wait.list)) {
+ iowait_get_priority(wait->iow);
iowait_queue(pkts_sent, wait->iow, &sde->dmawait);
- write_sequnlock(&dev->iowait_lock);
+ }
+ write_sequnlock(&sde->waitlock);
return -EBUSY;
}
@@ -287,7 +283,7 @@ void hfi1_vnic_sdma_init(struct hfi1_vnic_vport_info *vinfo)
iowait_init(&vnic_sdma->wait, 0, NULL, NULL,
hfi1_vnic_sdma_sleep,
- hfi1_vnic_sdma_wakeup, NULL);
+ hfi1_vnic_sdma_wakeup, NULL, NULL);
vnic_sdma->sde = &vinfo->dd->per_sdma[i];
vnic_sdma->dd = vinfo->dd;
vnic_sdma->vinfo = vinfo;
diff --git a/drivers/infiniband/hw/hns/Kconfig b/drivers/infiniband/hw/hns/Kconfig
index 21c2100b2ea9..fddb5fdf92de 100644
--- a/drivers/infiniband/hw/hns/Kconfig
+++ b/drivers/infiniband/hw/hns/Kconfig
@@ -1,7 +1,6 @@
config INFINIBAND_HNS
tristate "HNS RoCE Driver"
depends on NET_VENDOR_HISILICON
- depends on INFINIBAND_USER_ACCESS || !INFINIBAND_USER_ACCESS
depends on ARM64 || (COMPILE_TEST && 64BIT)
---help---
This is a RoCE/RDMA driver for the Hisilicon RoCE engine. The engine
diff --git a/drivers/infiniband/hw/hns/Makefile b/drivers/infiniband/hw/hns/Makefile
index cf03404b9d58..e2a7f1488f76 100644
--- a/drivers/infiniband/hw/hns/Makefile
+++ b/drivers/infiniband/hw/hns/Makefile
@@ -2,12 +2,12 @@
# Makefile for the Hisilicon RoCE drivers.
#
-ccflags-y := -Idrivers/net/ethernet/hisilicon/hns3
+ccflags-y := -I $(srctree)/drivers/net/ethernet/hisilicon/hns3
obj-$(CONFIG_INFINIBAND_HNS) += hns-roce.o
hns-roce-objs := hns_roce_main.o hns_roce_cmd.o hns_roce_pd.o \
hns_roce_ah.o hns_roce_hem.o hns_roce_mr.o hns_roce_qp.o \
- hns_roce_cq.o hns_roce_alloc.o hns_roce_db.o
+ hns_roce_cq.o hns_roce_alloc.o hns_roce_db.o hns_roce_srq.o
obj-$(CONFIG_INFINIBAND_HNS_HIP06) += hns-roce-hw-v1.o
hns-roce-hw-v1-objs := hns_roce_hw_v1.o
obj-$(CONFIG_INFINIBAND_HNS_HIP08) += hns-roce-hw-v2.o
diff --git a/drivers/infiniband/hw/hns/hns_roce_ah.c b/drivers/infiniband/hw/hns/hns_roce_ah.c
index 9990dc9eb96a..b3c8c45ec1e3 100644
--- a/drivers/infiniband/hw/hns/hns_roce_ah.c
+++ b/drivers/infiniband/hw/hns/hns_roce_ah.c
@@ -41,6 +41,7 @@
struct ib_ah *hns_roce_create_ah(struct ib_pd *ibpd,
struct rdma_ah_attr *ah_attr,
+ u32 flags,
struct ib_udata *udata)
{
struct hns_roce_dev *hr_dev = to_hr_dev(ibpd->device);
@@ -110,7 +111,7 @@ int hns_roce_query_ah(struct ib_ah *ibah, struct rdma_ah_attr *ah_attr)
return 0;
}
-int hns_roce_destroy_ah(struct ib_ah *ah)
+int hns_roce_destroy_ah(struct ib_ah *ah, u32 flags)
{
kfree(to_hr_ah(ah));
diff --git a/drivers/infiniband/hw/hns/hns_roce_alloc.c b/drivers/infiniband/hw/hns/hns_roce_alloc.c
index 46f65f9f59d0..dac058d3df53 100644
--- a/drivers/infiniband/hw/hns/hns_roce_alloc.c
+++ b/drivers/infiniband/hw/hns/hns_roce_alloc.c
@@ -197,8 +197,8 @@ int hns_roce_buf_alloc(struct hns_roce_dev *hr_dev, u32 size, u32 max_direct,
buf->npages = 1 << order;
buf->page_shift = page_shift;
/* MTT PA must be recorded in 4k alignment, t is 4k aligned */
- buf->direct.buf = dma_zalloc_coherent(dev,
- size, &t, GFP_KERNEL);
+ buf->direct.buf = dma_alloc_coherent(dev, size, &t,
+ GFP_KERNEL);
if (!buf->direct.buf)
return -ENOMEM;
@@ -219,9 +219,10 @@ int hns_roce_buf_alloc(struct hns_roce_dev *hr_dev, u32 size, u32 max_direct,
return -ENOMEM;
for (i = 0; i < buf->nbufs; ++i) {
- buf->page_list[i].buf = dma_zalloc_coherent(dev,
- page_size, &t,
- GFP_KERNEL);
+ buf->page_list[i].buf = dma_alloc_coherent(dev,
+ page_size,
+ &t,
+ GFP_KERNEL);
if (!buf->page_list[i].buf)
goto err_free;
@@ -239,6 +240,8 @@ err_free:
void hns_roce_cleanup_bitmap(struct hns_roce_dev *hr_dev)
{
+ if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_SRQ)
+ hns_roce_cleanup_srq_table(hr_dev);
hns_roce_cleanup_qp_table(hr_dev);
hns_roce_cleanup_cq_table(hr_dev);
hns_roce_cleanup_mr_table(hr_dev);
diff --git a/drivers/infiniband/hw/hns/hns_roce_cmd.c b/drivers/infiniband/hw/hns/hns_roce_cmd.c
index a0ba19d4a10e..2acf946d02e5 100644
--- a/drivers/infiniband/hw/hns/hns_roce_cmd.c
+++ b/drivers/infiniband/hw/hns/hns_roce_cmd.c
@@ -176,17 +176,33 @@ int hns_roce_cmd_mbox(struct hns_roce_dev *hr_dev, u64 in_param, u64 out_param,
unsigned long in_modifier, u8 op_modifier, u16 op,
unsigned long timeout)
{
- if (hr_dev->is_reset)
- return 0;
+ int ret;
+
+ if (hr_dev->hw->rst_prc_mbox) {
+ ret = hr_dev->hw->rst_prc_mbox(hr_dev);
+ if (ret == CMD_RST_PRC_SUCCESS)
+ return 0;
+ else if (ret == CMD_RST_PRC_EBUSY)
+ return -EBUSY;
+ }
if (hr_dev->cmd.use_events)
- return hns_roce_cmd_mbox_wait(hr_dev, in_param, out_param,
- in_modifier, op_modifier, op,
- timeout);
+ ret = hns_roce_cmd_mbox_wait(hr_dev, in_param, out_param,
+ in_modifier, op_modifier, op,
+ timeout);
else
- return hns_roce_cmd_mbox_poll(hr_dev, in_param, out_param,
- in_modifier, op_modifier, op,
- timeout);
+ ret = hns_roce_cmd_mbox_poll(hr_dev, in_param, out_param,
+ in_modifier, op_modifier, op,
+ timeout);
+
+ if (ret == CMD_RST_PRC_EBUSY)
+ return -EBUSY;
+
+ if (ret && (hr_dev->hw->rst_prc_mbox &&
+ hr_dev->hw->rst_prc_mbox(hr_dev) == CMD_RST_PRC_SUCCESS))
+ return 0;
+
+ return ret;
}
EXPORT_SYMBOL_GPL(hns_roce_cmd_mbox);
diff --git a/drivers/infiniband/hw/hns/hns_roce_cmd.h b/drivers/infiniband/hw/hns/hns_roce_cmd.h
index 9549ae51a0dd..059fd1da493e 100644
--- a/drivers/infiniband/hw/hns/hns_roce_cmd.h
+++ b/drivers/infiniband/hw/hns/hns_roce_cmd.h
@@ -75,6 +75,10 @@ enum {
HNS_ROCE_CMD_DESTROY_MPT_BT1 = 0x29,
HNS_ROCE_CMD_DESTROY_MPT_BT2 = 0x2a,
+ /* CQC TIMER commands */
+ HNS_ROCE_CMD_WRITE_CQC_TIMER_BT0 = 0x23,
+ HNS_ROCE_CMD_READ_CQC_TIMER_BT0 = 0x27,
+
/* MPT commands */
HNS_ROCE_CMD_QUERY_MPT = 0x62,
@@ -89,6 +93,10 @@ enum {
HNS_ROCE_CMD_DESTROY_SRQC_BT1 = 0x39,
HNS_ROCE_CMD_DESTROY_SRQC_BT2 = 0x3a,
+ /* QPC TIMER commands */
+ HNS_ROCE_CMD_WRITE_QPC_TIMER_BT0 = 0x33,
+ HNS_ROCE_CMD_READ_QPC_TIMER_BT0 = 0x37,
+
/* EQC commands */
HNS_ROCE_CMD_CREATE_AEQC = 0x80,
HNS_ROCE_CMD_MODIFY_AEQC = 0x81,
@@ -98,6 +106,10 @@ enum {
HNS_ROCE_CMD_MODIFY_CEQC = 0x91,
HNS_ROCE_CMD_QUERY_CEQC = 0x92,
HNS_ROCE_CMD_DESTROY_CEQC = 0x93,
+
+ /* SCC CTX BT commands */
+ HNS_ROCE_CMD_READ_SCCC_BT0 = 0xa4,
+ HNS_ROCE_CMD_WRITE_SCCC_BT0 = 0xa5,
};
enum {
@@ -120,6 +132,10 @@ enum {
HNS_ROCE_CMD_SQD2RTS_QP = 0x20,
HNS_ROCE_CMD_2RST_QP = 0x21,
HNS_ROCE_CMD_QUERY_QP = 0x22,
+ HNS_ROCE_CMD_SW2HW_SRQ = 0x70,
+ HNS_ROCE_CMD_MODIFY_SRQC = 0x72,
+ HNS_ROCE_CMD_QUERY_SRQC = 0x73,
+ HNS_ROCE_CMD_HW2SW_SRQ = 0x74,
};
int hns_roce_cmd_mbox(struct hns_roce_dev *hr_dev, u64 in_param, u64 out_param,
diff --git a/drivers/infiniband/hw/hns/hns_roce_common.h b/drivers/infiniband/hw/hns/hns_roce_common.h
index 93d4b4ec002d..f4c92a7ac1ce 100644
--- a/drivers/infiniband/hw/hns/hns_roce_common.h
+++ b/drivers/infiniband/hw/hns/hns_roce_common.h
@@ -376,9 +376,6 @@
#define ROCEE_RX_CMQ_TAIL_REG 0x07024
#define ROCEE_RX_CMQ_HEAD_REG 0x07028
-#define ROCEE_VF_MB_CFG0_REG 0x40
-#define ROCEE_VF_MB_STATUS_REG 0x58
-
#define ROCEE_VF_EQ_DB_CFG0_REG 0x238
#define ROCEE_VF_EQ_DB_CFG1_REG 0x23C
diff --git a/drivers/infiniband/hw/hns/hns_roce_cq.c b/drivers/infiniband/hw/hns/hns_roce_cq.c
index 3a485f50fede..1dfe5627006c 100644
--- a/drivers/infiniband/hw/hns/hns_roce_cq.c
+++ b/drivers/infiniband/hw/hns/hns_roce_cq.c
@@ -215,7 +215,7 @@ void hns_roce_free_cq(struct hns_roce_dev *hr_dev, struct hns_roce_cq *hr_cq)
EXPORT_SYMBOL_GPL(hns_roce_free_cq);
static int hns_roce_ib_get_cq_umem(struct hns_roce_dev *hr_dev,
- struct ib_ucontext *context,
+ struct ib_udata *udata,
struct hns_roce_cq_buf *buf,
struct ib_umem **umem, u64 buf_addr, int cqe)
{
@@ -223,7 +223,7 @@ static int hns_roce_ib_get_cq_umem(struct hns_roce_dev *hr_dev,
u32 page_shift;
u32 npages;
- *umem = ib_umem_get(context, buf_addr, cqe * hr_dev->caps.cq_entry_sz,
+ *umem = ib_umem_get(udata, buf_addr, cqe * hr_dev->caps.cq_entry_sz,
IB_ACCESS_LOCAL_WRITE, 1);
if (IS_ERR(*umem))
return PTR_ERR(*umem);
@@ -347,7 +347,7 @@ struct ib_cq *hns_roce_ib_create_cq(struct ib_device *ib_dev,
}
/* Get user space address, write it into mtt table */
- ret = hns_roce_ib_get_cq_umem(hr_dev, context, &hr_cq->hr_buf,
+ ret = hns_roce_ib_get_cq_umem(hr_dev, udata, &hr_cq->hr_buf,
&hr_cq->umem, ucmd.buf_addr,
cq_entries);
if (ret) {
@@ -358,7 +358,8 @@ struct ib_cq *hns_roce_ib_create_cq(struct ib_device *ib_dev,
if ((hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RECORD_DB) &&
(udata->outlen >= sizeof(resp))) {
ret = hns_roce_db_map_user(to_hr_ucontext(context),
- ucmd.db_addr, &hr_cq->db);
+ udata, ucmd.db_addr,
+ &hr_cq->db);
if (ret) {
dev_err(dev, "cq record doorbell map failed!\n");
goto err_mtt;
diff --git a/drivers/infiniband/hw/hns/hns_roce_db.c b/drivers/infiniband/hw/hns/hns_roce_db.c
index e2f93c1ce86a..0c6c1fe87705 100644
--- a/drivers/infiniband/hw/hns/hns_roce_db.c
+++ b/drivers/infiniband/hw/hns/hns_roce_db.c
@@ -8,7 +8,8 @@
#include <rdma/ib_umem.h>
#include "hns_roce_device.h"
-int hns_roce_db_map_user(struct hns_roce_ucontext *context, unsigned long virt,
+int hns_roce_db_map_user(struct hns_roce_ucontext *context,
+ struct ib_udata *udata, unsigned long virt,
struct hns_roce_db *db)
{
struct hns_roce_user_db_page *page;
@@ -28,8 +29,7 @@ int hns_roce_db_map_user(struct hns_roce_ucontext *context, unsigned long virt,
refcount_set(&page->refcount, 1);
page->user_virt = (virt & PAGE_MASK);
- page->umem = ib_umem_get(&context->ibucontext, virt & PAGE_MASK,
- PAGE_SIZE, 0, 0);
+ page->umem = ib_umem_get(udata, virt & PAGE_MASK, PAGE_SIZE, 0, 0);
if (IS_ERR(page->umem)) {
ret = PTR_ERR(page->umem);
kfree(page);
diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h
index d39bdfdb5de9..9ee86daf1700 100644
--- a/drivers/infiniband/hw/hns/hns_roce_device.h
+++ b/drivers/infiniband/hw/hns/hns_roce_device.h
@@ -111,6 +111,9 @@
#define PAGES_SHIFT_24 24
#define PAGES_SHIFT_32 32
+#define HNS_ROCE_IDX_QUE_ENTRY_SZ 4
+#define SRQ_DB_REG 0x230
+
enum {
HNS_ROCE_SUPPORT_RQ_RECORD_DB = 1 << 0,
HNS_ROCE_SUPPORT_SQ_RECORD_DB = 1 << 1,
@@ -196,20 +199,50 @@ enum {
HNS_ROCE_CAP_FLAG_RQ_INLINE = BIT(2),
HNS_ROCE_CAP_FLAG_RECORD_DB = BIT(3),
HNS_ROCE_CAP_FLAG_SQ_RECORD_DB = BIT(4),
+ HNS_ROCE_CAP_FLAG_SRQ = BIT(5),
HNS_ROCE_CAP_FLAG_MW = BIT(7),
HNS_ROCE_CAP_FLAG_FRMR = BIT(8),
+ HNS_ROCE_CAP_FLAG_QP_FLOW_CTRL = BIT(9),
HNS_ROCE_CAP_FLAG_ATOMIC = BIT(10),
};
enum hns_roce_mtt_type {
MTT_TYPE_WQE,
MTT_TYPE_CQE,
+ MTT_TYPE_SRQWQE,
+ MTT_TYPE_IDX
};
enum {
HNS_ROCE_DB_PER_PAGE = PAGE_SIZE / 4
};
+enum hns_roce_reset_stage {
+ HNS_ROCE_STATE_NON_RST,
+ HNS_ROCE_STATE_RST_BEF_DOWN,
+ HNS_ROCE_STATE_RST_DOWN,
+ HNS_ROCE_STATE_RST_UNINIT,
+ HNS_ROCE_STATE_RST_INIT,
+ HNS_ROCE_STATE_RST_INITED,
+};
+
+enum hns_roce_instance_state {
+ HNS_ROCE_STATE_NON_INIT,
+ HNS_ROCE_STATE_INIT,
+ HNS_ROCE_STATE_INITED,
+ HNS_ROCE_STATE_UNINIT,
+};
+
+enum {
+ HNS_ROCE_RST_DIRECT_RETURN = 0,
+};
+
+enum {
+ CMD_RST_PRC_OTHERS,
+ CMD_RST_PRC_SUCCESS,
+ CMD_RST_PRC_EBUSY,
+};
+
#define HNS_ROCE_CMD_SUCCESS 1
#define HNS_ROCE_PORT_DOWN 0
@@ -339,6 +372,10 @@ struct hns_roce_mr_table {
struct hns_roce_hem_table mtpt_table;
struct hns_roce_buddy mtt_cqe_buddy;
struct hns_roce_hem_table mtt_cqe_table;
+ struct hns_roce_buddy mtt_srqwqe_buddy;
+ struct hns_roce_hem_table mtt_srqwqe_table;
+ struct hns_roce_buddy mtt_idx_buddy;
+ struct hns_roce_hem_table mtt_idx_table;
};
struct hns_roce_wq {
@@ -429,9 +466,37 @@ struct hns_roce_cq {
struct completion free;
};
+struct hns_roce_idx_que {
+ struct hns_roce_buf idx_buf;
+ int entry_sz;
+ u32 buf_size;
+ struct ib_umem *umem;
+ struct hns_roce_mtt mtt;
+ u64 *bitmap;
+};
+
struct hns_roce_srq {
struct ib_srq ibsrq;
- int srqn;
+ void (*event)(struct hns_roce_srq *srq, enum hns_roce_event event);
+ unsigned long srqn;
+ int max;
+ int max_gs;
+ int wqe_shift;
+ void __iomem *db_reg_l;
+
+ atomic_t refcount;
+ struct completion free;
+
+ struct hns_roce_buf buf;
+ u64 *wrid;
+ struct ib_umem *umem;
+ struct hns_roce_mtt mtt;
+ struct hns_roce_idx_que idx_que;
+ spinlock_t lock;
+ int head;
+ int tail;
+ u16 wqe_ctr;
+ struct mutex mutex;
};
struct hns_roce_uar_table {
@@ -444,6 +509,8 @@ struct hns_roce_qp_table {
struct hns_roce_hem_table qp_table;
struct hns_roce_hem_table irrl_table;
struct hns_roce_hem_table trrl_table;
+ struct hns_roce_hem_table sccc_table;
+ struct mutex scc_mutex;
};
struct hns_roce_cq_table {
@@ -453,6 +520,12 @@ struct hns_roce_cq_table {
struct hns_roce_hem_table table;
};
+struct hns_roce_srq_table {
+ struct hns_roce_bitmap bitmap;
+ struct xarray xa;
+ struct hns_roce_hem_table table;
+};
+
struct hns_roce_raq_table {
struct hns_roce_buf_list *e_raq_buf;
};
@@ -603,6 +676,12 @@ struct hns_roce_aeqe {
} qp_event;
struct {
+ __le32 srq;
+ u32 rsv0;
+ u32 rsv1;
+ } srq_event;
+
+ struct {
__le32 cq;
u32 rsv0;
u32 rsv1;
@@ -679,7 +758,14 @@ struct hns_roce_caps {
u32 max_extend_sg;
int num_qps; /* 256k */
int reserved_qps;
+ int num_qpc_timer;
+ int num_cqc_timer;
+ u32 max_srq_sg;
+ int num_srqs;
u32 max_wqes; /* 16k */
+ u32 max_srqs;
+ u32 max_srq_wrs;
+ u32 max_srq_sges;
u32 max_sq_desc_sz; /* 64 */
u32 max_rq_desc_sz; /* 64 */
u32 max_srq_desc_sz;
@@ -690,12 +776,16 @@ struct hns_roce_caps {
int min_cqes;
u32 min_wqes;
int reserved_cqs;
+ int reserved_srqs;
+ u32 max_srqwqes;
int num_aeq_vectors; /* 1 */
int num_comp_vectors;
int num_other_vectors;
int num_mtpts;
u32 num_mtt_segs;
u32 num_cqe_segs;
+ u32 num_srqwqe_segs;
+ u32 num_idx_segs;
int reserved_mrws;
int reserved_uars;
int num_pds;
@@ -709,6 +799,11 @@ struct hns_roce_caps {
int irrl_entry_sz;
int trrl_entry_sz;
int cqc_entry_sz;
+ int sccc_entry_sz;
+ int qpc_timer_entry_sz;
+ int cqc_timer_entry_sz;
+ int srqc_entry_sz;
+ int idx_entry_sz;
u32 pbl_ba_pg_sz;
u32 pbl_buf_pg_sz;
u32 pbl_hop_num;
@@ -716,9 +811,12 @@ struct hns_roce_caps {
int ceqe_depth;
enum ib_mtu max_mtu;
u32 qpc_bt_num;
+ u32 qpc_timer_bt_num;
u32 srqc_bt_num;
u32 cqc_bt_num;
+ u32 cqc_timer_bt_num;
u32 mpt_bt_num;
+ u32 sccc_bt_num;
u32 qpc_ba_pg_sz;
u32 qpc_buf_pg_sz;
u32 qpc_hop_num;
@@ -734,9 +832,24 @@ struct hns_roce_caps {
u32 mtt_ba_pg_sz;
u32 mtt_buf_pg_sz;
u32 mtt_hop_num;
+ u32 sccc_ba_pg_sz;
+ u32 sccc_buf_pg_sz;
+ u32 sccc_hop_num;
+ u32 qpc_timer_ba_pg_sz;
+ u32 qpc_timer_buf_pg_sz;
+ u32 qpc_timer_hop_num;
+ u32 cqc_timer_ba_pg_sz;
+ u32 cqc_timer_buf_pg_sz;
+ u32 cqc_timer_hop_num;
u32 cqe_ba_pg_sz;
u32 cqe_buf_pg_sz;
u32 cqe_hop_num;
+ u32 srqwqe_ba_pg_sz;
+ u32 srqwqe_buf_pg_sz;
+ u32 srqwqe_hop_num;
+ u32 idx_ba_pg_sz;
+ u32 idx_buf_pg_sz;
+ u32 idx_hop_num;
u32 eqe_ba_pg_sz;
u32 eqe_buf_pg_sz;
u32 eqe_hop_num;
@@ -767,6 +880,7 @@ struct hns_roce_hw {
u64 out_param, u32 in_modifier, u8 op_modifier, u16 op,
u16 token, int event);
int (*chk_mbox)(struct hns_roce_dev *hr_dev, unsigned long timeout);
+ int (*rst_prc_mbox)(struct hns_roce_dev *hr_dev);
int (*set_gid)(struct hns_roce_dev *hr_dev, u8 port, int gid_index,
const union ib_gid *gid, const struct ib_gid_attr *attr);
int (*set_mac)(struct hns_roce_dev *hr_dev, u8 phy_port, u8 *addr);
@@ -794,6 +908,8 @@ struct hns_roce_hw {
int attr_mask, enum ib_qp_state cur_state,
enum ib_qp_state new_state);
int (*destroy_qp)(struct ib_qp *ibqp);
+ int (*qp_flow_control_init)(struct hns_roce_dev *hr_dev,
+ struct hns_roce_qp *hr_qp);
int (*post_send)(struct ib_qp *ibqp, const struct ib_send_wr *wr,
const struct ib_send_wr **bad_wr);
int (*post_recv)(struct ib_qp *qp, const struct ib_recv_wr *recv_wr,
@@ -805,6 +921,19 @@ struct hns_roce_hw {
int (*modify_cq)(struct ib_cq *cq, u16 cq_count, u16 cq_period);
int (*init_eq)(struct hns_roce_dev *hr_dev);
void (*cleanup_eq)(struct hns_roce_dev *hr_dev);
+ void (*write_srqc)(struct hns_roce_dev *hr_dev,
+ struct hns_roce_srq *srq, u32 pdn, u16 xrcd, u32 cqn,
+ void *mb_buf, u64 *mtts_wqe, u64 *mtts_idx,
+ dma_addr_t dma_handle_wqe,
+ dma_addr_t dma_handle_idx);
+ int (*modify_srq)(struct ib_srq *ibsrq, struct ib_srq_attr *srq_attr,
+ enum ib_srq_attr_mask srq_attr_mask,
+ struct ib_udata *udata);
+ int (*query_srq)(struct ib_srq *ibsrq, struct ib_srq_attr *attr);
+ int (*post_srq_recv)(struct ib_srq *ibsrq, const struct ib_recv_wr *wr,
+ const struct ib_recv_wr **bad_wr);
+ const struct ib_device_ops *hns_roce_dev_ops;
+ const struct ib_device_ops *hns_roce_dev_srq_ops;
};
struct hns_roce_dev {
@@ -818,6 +947,8 @@ struct hns_roce_dev {
spinlock_t bt_cmd_lock;
bool active;
bool is_reset;
+ bool dis_db;
+ unsigned long reset_cnt;
struct hns_roce_ib_iboe iboe;
struct list_head pgdir_list;
@@ -839,8 +970,11 @@ struct hns_roce_dev {
struct hns_roce_uar_table uar_table;
struct hns_roce_mr_table mr_table;
struct hns_roce_cq_table cq_table;
+ struct hns_roce_srq_table srq_table;
struct hns_roce_qp_table qp_table;
struct hns_roce_eq_table eq_table;
+ struct hns_roce_hem_table qpc_timer_table;
+ struct hns_roce_hem_table cqc_timer_table;
int cmd_mod;
int loop_idc;
@@ -951,12 +1085,14 @@ int hns_roce_init_mr_table(struct hns_roce_dev *hr_dev);
int hns_roce_init_eq_table(struct hns_roce_dev *hr_dev);
int hns_roce_init_cq_table(struct hns_roce_dev *hr_dev);
int hns_roce_init_qp_table(struct hns_roce_dev *hr_dev);
+int hns_roce_init_srq_table(struct hns_roce_dev *hr_dev);
void hns_roce_cleanup_pd_table(struct hns_roce_dev *hr_dev);
void hns_roce_cleanup_mr_table(struct hns_roce_dev *hr_dev);
void hns_roce_cleanup_eq_table(struct hns_roce_dev *hr_dev);
void hns_roce_cleanup_cq_table(struct hns_roce_dev *hr_dev);
void hns_roce_cleanup_qp_table(struct hns_roce_dev *hr_dev);
+void hns_roce_cleanup_srq_table(struct hns_roce_dev *hr_dev);
int hns_roce_bitmap_alloc(struct hns_roce_bitmap *bitmap, unsigned long *obj);
void hns_roce_bitmap_free(struct hns_roce_bitmap *bitmap, unsigned long obj,
@@ -973,14 +1109,14 @@ void hns_roce_bitmap_free_range(struct hns_roce_bitmap *bitmap,
struct ib_ah *hns_roce_create_ah(struct ib_pd *pd,
struct rdma_ah_attr *ah_attr,
+ u32 flags,
struct ib_udata *udata);
int hns_roce_query_ah(struct ib_ah *ibah, struct rdma_ah_attr *ah_attr);
-int hns_roce_destroy_ah(struct ib_ah *ah);
+int hns_roce_destroy_ah(struct ib_ah *ah, u32 flags);
-struct ib_pd *hns_roce_alloc_pd(struct ib_device *ib_dev,
- struct ib_ucontext *context,
- struct ib_udata *udata);
-int hns_roce_dealloc_pd(struct ib_pd *pd);
+int hns_roce_alloc_pd(struct ib_pd *pd, struct ib_ucontext *context,
+ struct ib_udata *udata);
+void hns_roce_dealloc_pd(struct ib_pd *pd);
struct ib_mr *hns_roce_get_dma_mr(struct ib_pd *pd, int acc);
struct ib_mr *hns_roce_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
@@ -1011,6 +1147,14 @@ int hns_roce_buf_alloc(struct hns_roce_dev *hr_dev, u32 size, u32 max_direct,
int hns_roce_ib_umem_write_mtt(struct hns_roce_dev *hr_dev,
struct hns_roce_mtt *mtt, struct ib_umem *umem);
+struct ib_srq *hns_roce_create_srq(struct ib_pd *pd,
+ struct ib_srq_init_attr *srq_init_attr,
+ struct ib_udata *udata);
+int hns_roce_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *srq_attr,
+ enum ib_srq_attr_mask srq_attr_mask,
+ struct ib_udata *udata);
+int hns_roce_destroy_srq(struct ib_srq *ibsrq);
+
struct ib_qp *hns_roce_create_qp(struct ib_pd *ib_pd,
struct ib_qp_init_attr *init_attr,
struct ib_udata *udata);
@@ -1041,7 +1185,8 @@ struct ib_cq *hns_roce_ib_create_cq(struct ib_device *ib_dev,
int hns_roce_ib_destroy_cq(struct ib_cq *ib_cq);
void hns_roce_free_cq(struct hns_roce_dev *hr_dev, struct hns_roce_cq *hr_cq);
-int hns_roce_db_map_user(struct hns_roce_ucontext *context, unsigned long virt,
+int hns_roce_db_map_user(struct hns_roce_ucontext *context,
+ struct ib_udata *udata, unsigned long virt,
struct hns_roce_db *db);
void hns_roce_db_unmap_user(struct hns_roce_ucontext *context,
struct hns_roce_db *db);
@@ -1052,6 +1197,7 @@ void hns_roce_free_db(struct hns_roce_dev *hr_dev, struct hns_roce_db *db);
void hns_roce_cq_completion(struct hns_roce_dev *hr_dev, u32 cqn);
void hns_roce_cq_event(struct hns_roce_dev *hr_dev, u32 cqn, int event_type);
void hns_roce_qp_event(struct hns_roce_dev *hr_dev, u32 qpn, int event_type);
+void hns_roce_srq_event(struct hns_roce_dev *hr_dev, u32 srqn, int event_type);
int hns_get_gid_index(struct hns_roce_dev *hr_dev, u8 port, int gid_index);
int hns_roce_init(struct hns_roce_dev *hr_dev);
void hns_roce_exit(struct hns_roce_dev *hr_dev);
diff --git a/drivers/infiniband/hw/hns/hns_roce_hem.c b/drivers/infiniband/hw/hns/hns_roce_hem.c
index f6faefed96e8..8e29dbb5b5fb 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hem.c
+++ b/drivers/infiniband/hw/hns/hns_roce_hem.c
@@ -45,8 +45,13 @@ bool hns_roce_check_whether_mhop(struct hns_roce_dev *hr_dev, u32 type)
(hr_dev->caps.mpt_hop_num && type == HEM_TYPE_MTPT) ||
(hr_dev->caps.cqc_hop_num && type == HEM_TYPE_CQC) ||
(hr_dev->caps.srqc_hop_num && type == HEM_TYPE_SRQC) ||
+ (hr_dev->caps.sccc_hop_num && type == HEM_TYPE_SCCC) ||
+ (hr_dev->caps.qpc_timer_hop_num && type == HEM_TYPE_QPC_TIMER) ||
+ (hr_dev->caps.cqc_timer_hop_num && type == HEM_TYPE_CQC_TIMER) ||
(hr_dev->caps.cqe_hop_num && type == HEM_TYPE_CQE) ||
- (hr_dev->caps.mtt_hop_num && type == HEM_TYPE_MTT))
+ (hr_dev->caps.mtt_hop_num && type == HEM_TYPE_MTT) ||
+ (hr_dev->caps.srqwqe_hop_num && type == HEM_TYPE_SRQWQE) ||
+ (hr_dev->caps.idx_hop_num && type == HEM_TYPE_IDX))
return true;
return false;
@@ -123,6 +128,30 @@ int hns_roce_calc_hem_mhop(struct hns_roce_dev *hr_dev,
mhop->ba_l0_num = hr_dev->caps.cqc_bt_num;
mhop->hop_num = hr_dev->caps.cqc_hop_num;
break;
+ case HEM_TYPE_SCCC:
+ mhop->buf_chunk_size = 1 << (hr_dev->caps.sccc_buf_pg_sz
+ + PAGE_SHIFT);
+ mhop->bt_chunk_size = 1 << (hr_dev->caps.sccc_ba_pg_sz
+ + PAGE_SHIFT);
+ mhop->ba_l0_num = hr_dev->caps.sccc_bt_num;
+ mhop->hop_num = hr_dev->caps.sccc_hop_num;
+ break;
+ case HEM_TYPE_QPC_TIMER:
+ mhop->buf_chunk_size = 1 << (hr_dev->caps.qpc_timer_buf_pg_sz
+ + PAGE_SHIFT);
+ mhop->bt_chunk_size = 1 << (hr_dev->caps.qpc_timer_ba_pg_sz
+ + PAGE_SHIFT);
+ mhop->ba_l0_num = hr_dev->caps.qpc_timer_bt_num;
+ mhop->hop_num = hr_dev->caps.qpc_timer_hop_num;
+ break;
+ case HEM_TYPE_CQC_TIMER:
+ mhop->buf_chunk_size = 1 << (hr_dev->caps.cqc_timer_buf_pg_sz
+ + PAGE_SHIFT);
+ mhop->bt_chunk_size = 1 << (hr_dev->caps.cqc_timer_ba_pg_sz
+ + PAGE_SHIFT);
+ mhop->ba_l0_num = hr_dev->caps.cqc_timer_bt_num;
+ mhop->hop_num = hr_dev->caps.cqc_timer_hop_num;
+ break;
case HEM_TYPE_SRQC:
mhop->buf_chunk_size = 1 << (hr_dev->caps.srqc_buf_pg_sz
+ PAGE_SHIFT);
@@ -147,6 +176,22 @@ int hns_roce_calc_hem_mhop(struct hns_roce_dev *hr_dev,
mhop->ba_l0_num = mhop->bt_chunk_size / 8;
mhop->hop_num = hr_dev->caps.cqe_hop_num;
break;
+ case HEM_TYPE_SRQWQE:
+ mhop->buf_chunk_size = 1 << (hr_dev->caps.srqwqe_buf_pg_sz
+ + PAGE_SHIFT);
+ mhop->bt_chunk_size = 1 << (hr_dev->caps.srqwqe_ba_pg_sz
+ + PAGE_SHIFT);
+ mhop->ba_l0_num = mhop->bt_chunk_size / 8;
+ mhop->hop_num = hr_dev->caps.srqwqe_hop_num;
+ break;
+ case HEM_TYPE_IDX:
+ mhop->buf_chunk_size = 1 << (hr_dev->caps.idx_buf_pg_sz
+ + PAGE_SHIFT);
+ mhop->bt_chunk_size = 1 << (hr_dev->caps.idx_ba_pg_sz
+ + PAGE_SHIFT);
+ mhop->ba_l0_num = mhop->bt_chunk_size / 8;
+ mhop->hop_num = hr_dev->caps.idx_hop_num;
+ break;
default:
dev_err(dev, "Table %d not support multi-hop addressing!\n",
table->type);
@@ -157,7 +202,7 @@ int hns_roce_calc_hem_mhop(struct hns_roce_dev *hr_dev,
return 0;
/*
- * QPC/MTPT/CQC/SRQC alloc hem for buffer pages.
+ * QPC/MTPT/CQC/SRQC/SCCC alloc hem for buffer pages.
* MTT/CQE alloc hem for bt pages.
*/
bt_num = hns_roce_get_bt_num(table->type, mhop->hop_num);
@@ -468,7 +513,7 @@ static int hns_roce_table_mhop_get(struct hns_roce_dev *hr_dev,
}
/*
- * alloc buffer space chunk for QPC/MTPT/CQC/SRQC.
+ * alloc buffer space chunk for QPC/MTPT/CQC/SRQC/SCCC.
* alloc bt space chunk for MTT/CQE.
*/
size = table->type < HEM_TYPE_MTT ? buf_chunk_size : bt_chunk_size;
@@ -575,6 +620,7 @@ out:
mutex_unlock(&table->mutex);
return ret;
}
+EXPORT_SYMBOL_GPL(hns_roce_table_get);
static void hns_roce_table_mhop_put(struct hns_roce_dev *hr_dev,
struct hns_roce_hem_table *table,
@@ -640,7 +686,7 @@ static void hns_roce_table_mhop_put(struct hns_roce_dev *hr_dev,
}
/*
- * free buffer space chunk for QPC/MTPT/CQC/SRQC.
+ * free buffer space chunk for QPC/MTPT/CQC/SRQC/SCCC.
* free bt space chunk for MTT/CQE.
*/
hns_roce_free_hem(hr_dev, table->hem[hem_idx]);
@@ -717,6 +763,7 @@ void hns_roce_table_put(struct hns_roce_dev *hr_dev,
mutex_unlock(&table->mutex);
}
+EXPORT_SYMBOL_GPL(hns_roce_table_put);
void *hns_roce_table_find(struct hns_roce_dev *hr_dev,
struct hns_roce_hem_table *table,
@@ -745,6 +792,8 @@ void *hns_roce_table_find(struct hns_roce_dev *hr_dev,
idx_offset = (obj & (table->num_obj - 1)) % obj_per_chunk;
dma_offset = offset = idx_offset * table->obj_size;
} else {
+ u32 seg_size = 64; /* 8 bytes per BA and 8 BA per segment */
+
hns_roce_calc_hem_mhop(hr_dev, table, &mhop_obj, &mhop);
/* mtt mhop */
i = mhop.l0_idx;
@@ -756,8 +805,8 @@ void *hns_roce_table_find(struct hns_roce_dev *hr_dev,
hem_idx = i;
hem = table->hem[hem_idx];
- dma_offset = offset = (obj & (table->num_obj - 1)) *
- table->obj_size % mhop.bt_chunk_size;
+ dma_offset = offset = (obj & (table->num_obj - 1)) * seg_size %
+ mhop.bt_chunk_size;
if (mhop.hop_num == 2)
dma_offset = offset = 0;
}
@@ -886,6 +935,30 @@ int hns_roce_init_hem_table(struct hns_roce_dev *hr_dev,
num_bt_l0 = hr_dev->caps.cqc_bt_num;
hop_num = hr_dev->caps.cqc_hop_num;
break;
+ case HEM_TYPE_SCCC:
+ buf_chunk_size = 1 << (hr_dev->caps.sccc_buf_pg_sz
+ + PAGE_SHIFT);
+ bt_chunk_size = 1 << (hr_dev->caps.sccc_ba_pg_sz
+ + PAGE_SHIFT);
+ num_bt_l0 = hr_dev->caps.sccc_bt_num;
+ hop_num = hr_dev->caps.sccc_hop_num;
+ break;
+ case HEM_TYPE_QPC_TIMER:
+ buf_chunk_size = 1 << (hr_dev->caps.qpc_timer_buf_pg_sz
+ + PAGE_SHIFT);
+ bt_chunk_size = 1 << (hr_dev->caps.qpc_timer_ba_pg_sz
+ + PAGE_SHIFT);
+ num_bt_l0 = hr_dev->caps.qpc_timer_bt_num;
+ hop_num = hr_dev->caps.qpc_timer_hop_num;
+ break;
+ case HEM_TYPE_CQC_TIMER:
+ buf_chunk_size = 1 << (hr_dev->caps.cqc_timer_buf_pg_sz
+ + PAGE_SHIFT);
+ bt_chunk_size = 1 << (hr_dev->caps.cqc_timer_ba_pg_sz
+ + PAGE_SHIFT);
+ num_bt_l0 = hr_dev->caps.cqc_timer_bt_num;
+ hop_num = hr_dev->caps.cqc_timer_hop_num;
+ break;
case HEM_TYPE_SRQC:
buf_chunk_size = 1 << (hr_dev->caps.srqc_buf_pg_sz
+ PAGE_SHIFT);
@@ -906,6 +979,18 @@ int hns_roce_init_hem_table(struct hns_roce_dev *hr_dev,
bt_chunk_size = buf_chunk_size;
hop_num = hr_dev->caps.cqe_hop_num;
break;
+ case HEM_TYPE_SRQWQE:
+ buf_chunk_size = 1 << (hr_dev->caps.srqwqe_ba_pg_sz
+ + PAGE_SHIFT);
+ bt_chunk_size = buf_chunk_size;
+ hop_num = hr_dev->caps.srqwqe_hop_num;
+ break;
+ case HEM_TYPE_IDX:
+ buf_chunk_size = 1 << (hr_dev->caps.idx_ba_pg_sz
+ + PAGE_SHIFT);
+ bt_chunk_size = buf_chunk_size;
+ hop_num = hr_dev->caps.idx_hop_num;
+ break;
default:
dev_err(dev,
"Table %d not support to init hem table here!\n",
@@ -1041,7 +1126,25 @@ void hns_roce_cleanup_hem_table(struct hns_roce_dev *hr_dev,
void hns_roce_cleanup_hem(struct hns_roce_dev *hr_dev)
{
+ if ((hr_dev->caps.num_idx_segs))
+ hns_roce_cleanup_hem_table(hr_dev,
+ &hr_dev->mr_table.mtt_idx_table);
+ if (hr_dev->caps.num_srqwqe_segs)
+ hns_roce_cleanup_hem_table(hr_dev,
+ &hr_dev->mr_table.mtt_srqwqe_table);
+ if (hr_dev->caps.srqc_entry_sz)
+ hns_roce_cleanup_hem_table(hr_dev,
+ &hr_dev->srq_table.table);
hns_roce_cleanup_hem_table(hr_dev, &hr_dev->cq_table.table);
+ if (hr_dev->caps.qpc_timer_entry_sz)
+ hns_roce_cleanup_hem_table(hr_dev,
+ &hr_dev->qpc_timer_table);
+ if (hr_dev->caps.cqc_timer_entry_sz)
+ hns_roce_cleanup_hem_table(hr_dev,
+ &hr_dev->cqc_timer_table);
+ if (hr_dev->caps.sccc_entry_sz)
+ hns_roce_cleanup_hem_table(hr_dev,
+ &hr_dev->qp_table.sccc_table);
if (hr_dev->caps.trrl_entry_sz)
hns_roce_cleanup_hem_table(hr_dev,
&hr_dev->qp_table.trrl_table);
diff --git a/drivers/infiniband/hw/hns/hns_roce_hem.h b/drivers/infiniband/hw/hns/hns_roce_hem.h
index e8850d59e780..d9d668992e49 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hem.h
+++ b/drivers/infiniband/hw/hns/hns_roce_hem.h
@@ -44,10 +44,15 @@ enum {
HEM_TYPE_MTPT,
HEM_TYPE_CQC,
HEM_TYPE_SRQC,
+ HEM_TYPE_SCCC,
+ HEM_TYPE_QPC_TIMER,
+ HEM_TYPE_CQC_TIMER,
/* UNMAP HEM */
HEM_TYPE_MTT,
HEM_TYPE_CQE,
+ HEM_TYPE_SRQWQE,
+ HEM_TYPE_IDX,
HEM_TYPE_IRRL,
HEM_TYPE_TRRL,
};
diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v1.c b/drivers/infiniband/hw/hns/hns_roce_hw_v1.c
index ca05810c92dc..97515c340134 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v1.c
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v1.c
@@ -711,13 +711,14 @@ static int hns_roce_v1_rsv_lp_qp(struct hns_roce_dev *hr_dev)
struct ib_qp_attr attr = { 0 };
struct hns_roce_v1_priv *priv;
struct hns_roce_qp *hr_qp;
+ struct ib_device *ibdev;
struct ib_cq *cq;
struct ib_pd *pd;
union ib_gid dgid;
u64 subnet_prefix;
int attr_mask = 0;
+ int ret = -ENOMEM;
int i, j;
- int ret;
u8 queue_en[HNS_ROCE_V1_RESV_QP] = { 0 };
u8 phy_port;
u8 port = 0;
@@ -742,12 +743,16 @@ static int hns_roce_v1_rsv_lp_qp(struct hns_roce_dev *hr_dev)
free_mr->mr_free_cq->ib_cq.cq_context = NULL;
atomic_set(&free_mr->mr_free_cq->ib_cq.usecnt, 0);
- pd = hns_roce_alloc_pd(&hr_dev->ib_dev, NULL, NULL);
- if (IS_ERR(pd)) {
- dev_err(dev, "Create pd for reserved loop qp failed!");
- ret = -ENOMEM;
+ ibdev = &hr_dev->ib_dev;
+ pd = rdma_zalloc_drv_obj(ibdev, ib_pd);
+ if (!pd)
+ goto alloc_mem_failed;
+
+ pd->device = ibdev;
+ ret = hns_roce_alloc_pd(pd, NULL, NULL);
+ if (ret)
goto alloc_pd_failed;
- }
+
free_mr->mr_free_pd = to_hr_pd(pd);
free_mr->mr_free_pd->ibpd.device = &hr_dev->ib_dev;
free_mr->mr_free_pd->ibpd.uobject = NULL;
@@ -854,10 +859,12 @@ create_lp_qp_failed:
dev_err(dev, "Destroy qp %d for mr free failed!\n", i);
}
- if (hns_roce_dealloc_pd(pd))
- dev_err(dev, "Destroy pd for create_lp_qp failed!\n");
+ hns_roce_dealloc_pd(pd);
alloc_pd_failed:
+ kfree(pd);
+
+alloc_mem_failed:
if (hns_roce_ib_destroy_cq(cq))
dev_err(dev, "Destroy cq for create_lp_qp failed!\n");
@@ -891,9 +898,7 @@ static void hns_roce_v1_release_lp_qp(struct hns_roce_dev *hr_dev)
if (ret)
dev_err(dev, "Destroy cq for mr_free failed(%d)!\n", ret);
- ret = hns_roce_dealloc_pd(&free_mr->mr_free_pd->ibpd);
- if (ret)
- dev_err(dev, "Destroy pd for mr_free failed(%d)!\n", ret);
+ hns_roce_dealloc_pd(&free_mr->mr_free_pd->ibpd);
}
static int hns_roce_db_init(struct hns_roce_dev *hr_dev)
@@ -1866,9 +1871,8 @@ static int hns_roce_v1_write_mtpt(void *mb_buf, struct hns_roce_mr *mr,
unsigned long mtpt_idx)
{
struct hns_roce_v1_mpt_entry *mpt_entry;
- struct scatterlist *sg;
+ struct sg_dma_page_iter sg_iter;
u64 *pages;
- int entry;
int i;
/* MPT filled into mailbox buf */
@@ -1923,8 +1927,8 @@ static int hns_roce_v1_write_mtpt(void *mb_buf, struct hns_roce_mr *mr,
return -ENOMEM;
i = 0;
- for_each_sg(mr->umem->sg_head.sgl, sg, mr->umem->nmap, entry) {
- pages[i] = ((u64)sg_dma_address(sg)) >> 12;
+ for_each_sg_dma_page(mr->umem->sg_head.sgl, &sg_iter, mr->umem->nmap, 0) {
+ pages[i] = ((u64)sg_page_iter_dma_address(&sg_iter)) >> 12;
/* Directly record to MTPT table firstly 7 entry */
if (i >= HNS_ROCE_MAX_INNER_MTPT_NUM)
@@ -3926,7 +3930,7 @@ int hns_roce_v1_destroy_qp(struct ib_qp *ibqp)
struct hns_roce_qp_work *qp_work;
struct hns_roce_v1_priv *priv;
struct hns_roce_cq *send_cq, *recv_cq;
- int is_user = !!ibqp->pd->uobject;
+ bool is_user = ibqp->uobject;
int is_timeout = 0;
int ret;
@@ -4793,6 +4797,16 @@ static void hns_roce_v1_cleanup_eq_table(struct hns_roce_dev *hr_dev)
kfree(eq_table->eq);
}
+/*
+ * ib_device_ops callbacks implemented by the v1 hardware layer. Published
+ * through the hns_roce_dev_ops member of hns_roce_hw_v1.
+ */
+static const struct ib_device_ops hns_roce_v1_dev_ops = {
+ .destroy_qp = hns_roce_v1_destroy_qp,
+ .modify_cq = hns_roce_v1_modify_cq,
+ .poll_cq = hns_roce_v1_poll_cq,
+ .post_recv = hns_roce_v1_post_recv,
+ .post_send = hns_roce_v1_post_send,
+ .query_qp = hns_roce_v1_query_qp,
+ .req_notify_cq = hns_roce_v1_req_notify_cq,
+};
+
static const struct hns_roce_hw hns_roce_hw_v1 = {
.reset = hns_roce_v1_reset,
.hw_profile = hns_roce_v1_profile,
@@ -4818,6 +4832,7 @@ static const struct hns_roce_hw hns_roce_hw_v1 = {
.destroy_cq = hns_roce_v1_destroy_cq,
.init_eq = hns_roce_v1_init_eq_table,
.cleanup_eq = hns_roce_v1_cleanup_eq_table,
+ .hns_roce_dev_ops = &hns_roce_v1_dev_ops,
};
static const struct of_device_id hns_roce_of_match[] = {
@@ -4991,7 +5006,7 @@ static int hns_roce_probe(struct platform_device *pdev)
struct hns_roce_dev *hr_dev;
struct device *dev = &pdev->dev;
- hr_dev = (struct hns_roce_dev *)ib_alloc_device(sizeof(*hr_dev));
+ hr_dev = ib_alloc_device(hns_roce_dev, ib_dev);
if (!hr_dev)
return -ENOMEM;
diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
index 3beb1523e17c..1c54390e1c85 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
@@ -587,7 +587,7 @@ out:
roce_set_field(sq_db.parameter, V2_DB_PARAMETER_SL_M,
V2_DB_PARAMETER_SL_S, qp->sl);
- hns_roce_write64_k((__le32 *)&sq_db, qp->sq.db_reg_l);
+ hns_roce_write64(hr_dev, (__le32 *)&sq_db, qp->sq.db_reg_l);
qp->sq_next_wqe = ind;
qp->next_sge = sge_ind;
@@ -712,6 +712,113 @@ out:
return ret;
}
+/*
+ * Reset verdict for the case where the completed-reset counter has changed
+ * (see hns_roce_v2_rst_process_cmd()).
+ *
+ * Unconditionally sets hr_dev->is_reset and hr_dev->dis_db.
+ *
+ * Return: CMD_RST_PRC_EBUSY when reset_stage is HNS_ROCE_STATE_RST_INIT or
+ * instance_stage is HNS_ROCE_STATE_INIT, otherwise CMD_RST_PRC_SUCCESS.
+ */
+static int hns_roce_v2_cmd_hw_reseted(struct hns_roce_dev *hr_dev,
+ unsigned long instance_stage,
+ unsigned long reset_stage)
+{
+ /* When hardware reset has been completed once or more, we should stop
+ * sending mailbox&cmq&doorbell to hardware. If now in .init_instance()
+ * function, we should exit with error. If now at HNAE3_INIT_CLIENT
+ * stage of soft reset process, we should exit with error, and then
+ * HNAE3_INIT_CLIENT related process can rollback the operation like
+ * notifying hardware to free resources, HNAE3_INIT_CLIENT related
+ * process will exit with error to notify NIC driver to reschedule soft
+ * reset process once again.
+ */
+ hr_dev->is_reset = true;
+ hr_dev->dis_db = true;
+
+ if (reset_stage == HNS_ROCE_STATE_RST_INIT ||
+ instance_stage == HNS_ROCE_STATE_INIT)
+ return CMD_RST_PRC_EBUSY;
+
+ return CMD_RST_PRC_SUCCESS;
+}
+
+/*
+ * Reset verdict for the case where the NIC driver reports that a hardware
+ * reset is in progress (see hns_roce_v2_rst_process_cmd()).
+ *
+ * Always sets hr_dev->dis_db. Sets hr_dev->is_reset only if a second query of
+ * ops->get_hw_reset_stat() returns false.
+ *
+ * Return: CMD_RST_PRC_EBUSY when is_reset is still false, or reset_stage is
+ * HNS_ROCE_STATE_RST_INIT, or instance_stage is HNS_ROCE_STATE_INIT;
+ * otherwise CMD_RST_PRC_SUCCESS.
+ */
+static int hns_roce_v2_cmd_hw_resetting(struct hns_roce_dev *hr_dev,
+ unsigned long instance_stage,
+ unsigned long reset_stage)
+{
+ struct hns_roce_v2_priv *priv = (struct hns_roce_v2_priv *)hr_dev->priv;
+ struct hnae3_handle *handle = priv->handle;
+ const struct hnae3_ae_ops *ops = handle->ae_algo->ops;
+
+ /* When hardware reset is detected, we should stop sending mailbox&cmq&
+ * doorbell to hardware. If now in .init_instance() function, we should
+ * exit with error. If now at HNAE3_INIT_CLIENT stage of soft reset
+ * process, we should exit with error, and then HNAE3_INIT_CLIENT
+ * related process can rollback the operation like notifying hardware to
+ * free resources, HNAE3_INIT_CLIENT related process will exit with
+ * error to notify NIC driver to reschedule soft reset process once
+ * again.
+ */
+ hr_dev->dis_db = true;
+ if (!ops->get_hw_reset_stat(handle))
+ hr_dev->is_reset = true;
+
+ if (!hr_dev->is_reset || reset_stage == HNS_ROCE_STATE_RST_INIT ||
+ instance_stage == HNS_ROCE_STATE_INIT)
+ return CMD_RST_PRC_EBUSY;
+
+ return CMD_RST_PRC_SUCCESS;
+}
+
+/*
+ * Reset verdict for the case where the NIC's software reset is running while
+ * the instance stage is HNS_ROCE_STATE_INIT (see
+ * hns_roce_v2_rst_process_cmd()).
+ *
+ * Always sets hr_dev->dis_db. Sets hr_dev->is_reset if the reset counter
+ * reported by ops->ae_dev_reset_cnt() differs from hr_dev->reset_cnt.
+ *
+ * Return: always CMD_RST_PRC_EBUSY.
+ */
+static int hns_roce_v2_cmd_sw_resetting(struct hns_roce_dev *hr_dev)
+{
+ struct hns_roce_v2_priv *priv = (struct hns_roce_v2_priv *)hr_dev->priv;
+ struct hnae3_handle *handle = priv->handle;
+ const struct hnae3_ae_ops *ops = handle->ae_algo->ops;
+
+ /* When software reset is detected at .init_instance() function, we
+ * should stop sending mailbox&cmq&doorbell to hardware, and exit
+ * with error.
+ */
+ hr_dev->dis_db = true;
+ if (ops->ae_dev_reset_cnt(handle) != hr_dev->reset_cnt)
+ hr_dev->is_reset = true;
+
+ return CMD_RST_PRC_EBUSY;
+}
+
+/*
+ * Decide whether a command may be sent to hardware given the current reset
+ * state reported by the NIC driver.
+ *
+ * Checks are made in priority order: completed reset (counter changed),
+ * hardware reset in progress, then software reset during instance init.
+ *
+ * Return: CMD_RST_PRC_SUCCESS or CMD_RST_PRC_EBUSY when a reset condition
+ * applies, or 0 (numerically CMD_RST_PRC_OTHERS) when none does.
+ */
+static int hns_roce_v2_rst_process_cmd(struct hns_roce_dev *hr_dev)
+{
+ struct hns_roce_v2_priv *priv = (struct hns_roce_v2_priv *)hr_dev->priv;
+ struct hnae3_handle *handle = priv->handle;
+ const struct hnae3_ae_ops *ops = handle->ae_algo->ops;
+ unsigned long instance_stage; /* the current instance stage */
+ unsigned long reset_stage; /* the current reset stage */
+ unsigned long reset_cnt;
+ bool sw_resetting;
+ bool hw_resetting;
+
+ /* A reset was already recorded: skip the NIC queries entirely. */
+ if (hr_dev->is_reset)
+ return CMD_RST_PRC_SUCCESS;
+
+ /* Get information about reset from NIC driver or RoCE driver itself,
+ * the meaning of the following variables from NIC driver are described
+ * as below:
+ * reset_cnt -- The count value of completed hardware reset.
+ * hw_resetting -- Whether hardware device is resetting now.
+ * sw_resetting -- Whether NIC's software reset process is running now.
+ */
+ instance_stage = handle->rinfo.instance_state;
+ reset_stage = handle->rinfo.reset_state;
+ reset_cnt = ops->ae_dev_reset_cnt(handle);
+ hw_resetting = ops->get_hw_reset_stat(handle);
+ sw_resetting = ops->ae_dev_resetting(handle);
+
+ if (reset_cnt != hr_dev->reset_cnt)
+ return hns_roce_v2_cmd_hw_reseted(hr_dev, instance_stage,
+ reset_stage);
+ else if (hw_resetting)
+ return hns_roce_v2_cmd_hw_resetting(hr_dev, instance_stage,
+ reset_stage);
+ else if (sw_resetting && instance_stage == HNS_ROCE_STATE_INIT)
+ return hns_roce_v2_cmd_sw_resetting(hr_dev);
+
+ return 0;
+}
+
static int hns_roce_cmq_space(struct hns_roce_v2_cmq_ring *ring)
{
int ntu = ring->next_to_use;
@@ -892,8 +999,8 @@ static int hns_roce_cmq_csq_clean(struct hns_roce_dev *hr_dev)
return clean;
}
-static int hns_roce_cmq_send(struct hns_roce_dev *hr_dev,
- struct hns_roce_cmq_desc *desc, int num)
+static int __hns_roce_cmq_send(struct hns_roce_dev *hr_dev,
+ struct hns_roce_cmq_desc *desc, int num)
{
struct hns_roce_v2_priv *priv = (struct hns_roce_v2_priv *)hr_dev->priv;
struct hns_roce_v2_cmq_ring *csq = &priv->cmq.csq;
@@ -905,9 +1012,6 @@ static int hns_roce_cmq_send(struct hns_roce_dev *hr_dev,
int ret = 0;
int ntc;
- if (hr_dev->is_reset)
- return 0;
-
spin_lock_bh(&csq->lock);
if (num > hns_roce_cmq_space(csq)) {
@@ -982,6 +1086,30 @@ static int hns_roce_cmq_send(struct hns_roce_dev *hr_dev,
return ret;
}
+/*
+ * Send @num command descriptors starting at @desc, unless a reset forbids it.
+ *
+ * The reset state is evaluated before sending and again if the send fails:
+ * a CMD_RST_PRC_SUCCESS verdict turns the call into a successful no-op, a
+ * CMD_RST_PRC_EBUSY verdict fails it with -EBUSY.
+ *
+ * Return: 0 on success (or when the command is skipped because the device has
+ * been reset), -EBUSY when a reset is in progress, otherwise the error
+ * returned by __hns_roce_cmq_send().
+ */
+int hns_roce_cmq_send(struct hns_roce_dev *hr_dev,
+ struct hns_roce_cmq_desc *desc, int num)
+{
+ int retval;
+ int ret;
+
+ ret = hns_roce_v2_rst_process_cmd(hr_dev);
+ if (ret == CMD_RST_PRC_SUCCESS)
+ return 0;
+ /* Translate the internal verdict into an errno; CMD_RST_PRC_EBUSY is a
+ * small positive enum value and must not leak to callers as an error
+ * code.
+ */
+ if (ret == CMD_RST_PRC_EBUSY)
+ return -EBUSY;
+
+ ret = __hns_roce_cmq_send(hr_dev, desc, num);
+ if (ret) {
+ /* The failure may itself be due to a reset: re-evaluate. */
+ retval = hns_roce_v2_rst_process_cmd(hr_dev);
+ if (retval == CMD_RST_PRC_SUCCESS)
+ return 0;
+ else if (retval == CMD_RST_PRC_EBUSY)
+ return -EBUSY;
+ }
+
+ return ret;
+}
+
static int hns_roce_cmq_query_hw_info(struct hns_roce_dev *hr_dev)
{
struct hns_roce_query_version *resp;
@@ -1078,10 +1206,75 @@ static int hns_roce_query_pf_resource(struct hns_roce_dev *hr_dev)
hr_dev->caps.sl_num = roce_get_field(req_b->qid_idx_sl_num,
PF_RES_DATA_3_PF_SL_NUM_M,
PF_RES_DATA_3_PF_SL_NUM_S);
+ hr_dev->caps.sccc_bt_num = roce_get_field(req_b->sccc_bt_idx_num,
+ PF_RES_DATA_4_PF_SCCC_BT_NUM_M,
+ PF_RES_DATA_4_PF_SCCC_BT_NUM_S);
+
+ return 0;
+}
+
+/*
+ * Query the PF timer resources with a two-descriptor
+ * HNS_ROCE_OPC_QUERY_PF_TIMER_RES command and store the QPC-timer and
+ * CQC-timer BT counts from the first descriptor in hr_dev->caps.
+ *
+ * Return: 0 on success, otherwise the error from hns_roce_cmq_send().
+ */
+static int hns_roce_query_pf_timer_resource(struct hns_roce_dev *hr_dev)
+{
+ struct hns_roce_pf_timer_res_a *req_a;
+ struct hns_roce_cmq_desc desc[2];
+ int ret, i;
+
+ for (i = 0; i < 2; i++) {
+ hns_roce_cmq_setup_basic_desc(&desc[i],
+ HNS_ROCE_OPC_QUERY_PF_TIMER_RES,
+ true);
+
+ /* Only the first descriptor is flagged as having a successor. */
+ if (i == 0)
+ desc[i].flag |= cpu_to_le16(HNS_ROCE_CMD_FLAG_NEXT);
+ else
+ desc[i].flag &= ~cpu_to_le16(HNS_ROCE_CMD_FLAG_NEXT);
+ }
+
+ ret = hns_roce_cmq_send(hr_dev, desc, 2);
+ if (ret)
+ return ret;
+
+ req_a = (struct hns_roce_pf_timer_res_a *)desc[0].data;
+
+ hr_dev->caps.qpc_timer_bt_num =
+ roce_get_field(req_a->qpc_timer_bt_idx_num,
+ PF_RES_DATA_1_PF_QPC_TIMER_BT_NUM_M,
+ PF_RES_DATA_1_PF_QPC_TIMER_BT_NUM_S);
+ hr_dev->caps.cqc_timer_bt_num =
+ roce_get_field(req_a->cqc_timer_bt_idx_num,
+ PF_RES_DATA_2_PF_CQC_TIMER_BT_NUM_M,
+ PF_RES_DATA_2_PF_CQC_TIMER_BT_NUM_S);
return 0;
}
+/*
+ * Configure the switch parameters of function @vf_id using two
+ * HNS_SWITCH_PARAMETER_CFG commands on the same descriptor: the first is sent
+ * as prepared by hns_roce_cmq_setup_basic_desc(); the second reuses the
+ * descriptor contents with rewritten flags and the ALW_LPBK, ALW_LCL_LPBK and
+ * ALW_DST_OVRD bits set.
+ * NOTE(review): the `true` argument presumably marks the first command as a
+ * read, making this a read-modify-write - confirm against
+ * hns_roce_cmq_setup_basic_desc().
+ *
+ * Return: 0 on success, otherwise the error from hns_roce_cmq_send().
+ */
+static int hns_roce_set_vf_switch_param(struct hns_roce_dev *hr_dev,
+ int vf_id)
+{
+ struct hns_roce_cmq_desc desc;
+ struct hns_roce_vf_switch *swt;
+ int ret;
+
+ swt = (struct hns_roce_vf_switch *)desc.data;
+ hns_roce_cmq_setup_basic_desc(&desc, HNS_SWITCH_PARAMETER_CFG, true);
+ swt->rocee_sel |= cpu_to_le16(HNS_ICL_SWITCH_CMD_ROCEE_SEL);
+ roce_set_field(swt->fun_id,
+ VF_SWITCH_DATA_FUN_ID_VF_ID_M,
+ VF_SWITCH_DATA_FUN_ID_VF_ID_S,
+ vf_id);
+ ret = hns_roce_cmq_send(hr_dev, &desc, 1);
+ if (ret)
+ return ret;
+ /* Second pass: replace the flags, then set the permission bits. */
+ desc.flag =
+ cpu_to_le16(HNS_ROCE_CMD_FLAG_NO_INTR | HNS_ROCE_CMD_FLAG_IN);
+ desc.flag &= cpu_to_le16(~HNS_ROCE_CMD_FLAG_WR);
+ roce_set_bit(swt->cfg, VF_SWITCH_DATA_CFG_ALW_LPBK_S, 1);
+ roce_set_bit(swt->cfg, VF_SWITCH_DATA_CFG_ALW_LCL_LPBK_S, 1);
+ roce_set_bit(swt->cfg, VF_SWITCH_DATA_CFG_ALW_DST_OVRD_S, 1);
+
+ return hns_roce_cmq_send(hr_dev, &desc, 1);
+}
+
static int hns_roce_alloc_vf_resource(struct hns_roce_dev *hr_dev)
{
struct hns_roce_cmq_desc desc[2];
@@ -1166,6 +1359,14 @@ static int hns_roce_alloc_vf_resource(struct hns_roce_dev *hr_dev)
VF_RES_B_DATA_3_VF_SL_NUM_M,
VF_RES_B_DATA_3_VF_SL_NUM_S,
HNS_ROCE_VF_SL_NUM);
+
+ roce_set_field(req_b->vf_sccc_idx_num,
+ VF_RES_B_DATA_4_VF_SCCC_BT_IDX_M,
+ VF_RES_B_DATA_4_VF_SCCC_BT_IDX_S, 0);
+ roce_set_field(req_b->vf_sccc_idx_num,
+ VF_RES_B_DATA_4_VF_SCCC_BT_NUM_M,
+ VF_RES_B_DATA_4_VF_SCCC_BT_NUM_S,
+ HNS_ROCE_VF_SCCC_BT_NUM);
}
}
@@ -1178,6 +1379,7 @@ static int hns_roce_v2_set_bt(struct hns_roce_dev *hr_dev)
u8 qpc_hop_num = hr_dev->caps.qpc_hop_num;
u8 cqc_hop_num = hr_dev->caps.cqc_hop_num;
u8 mpt_hop_num = hr_dev->caps.mpt_hop_num;
+ u8 sccc_hop_num = hr_dev->caps.sccc_hop_num;
struct hns_roce_cfg_bt_attr *req;
struct hns_roce_cmq_desc desc;
@@ -1225,6 +1427,20 @@ static int hns_roce_v2_set_bt(struct hns_roce_dev *hr_dev)
CFG_BT_ATTR_DATA_3_VF_MPT_HOPNUM_S,
mpt_hop_num == HNS_ROCE_HOP_NUM_0 ? 0 : mpt_hop_num);
+ roce_set_field(req->vf_sccc_cfg,
+ CFG_BT_ATTR_DATA_4_VF_SCCC_BA_PGSZ_M,
+ CFG_BT_ATTR_DATA_4_VF_SCCC_BA_PGSZ_S,
+ hr_dev->caps.sccc_ba_pg_sz + PG_SHIFT_OFFSET);
+ roce_set_field(req->vf_sccc_cfg,
+ CFG_BT_ATTR_DATA_4_VF_SCCC_BUF_PGSZ_M,
+ CFG_BT_ATTR_DATA_4_VF_SCCC_BUF_PGSZ_S,
+ hr_dev->caps.sccc_buf_pg_sz + PG_SHIFT_OFFSET);
+ roce_set_field(req->vf_sccc_cfg,
+ CFG_BT_ATTR_DATA_4_VF_SCCC_HOPNUM_M,
+ CFG_BT_ATTR_DATA_4_VF_SCCC_HOPNUM_S,
+ sccc_hop_num ==
+ HNS_ROCE_HOP_NUM_0 ? 0 : sccc_hop_num);
+
return hns_roce_cmq_send(hr_dev, &desc, 1);
}
@@ -1262,6 +1478,16 @@ static int hns_roce_v2_profile(struct hns_roce_dev *hr_dev)
return ret;
}
+ if (hr_dev->pci_dev->revision == 0x21) {
+ ret = hns_roce_query_pf_timer_resource(hr_dev);
+ if (ret) {
+ dev_err(hr_dev->dev,
+ "Query pf timer resource fail, ret = %d.\n",
+ ret);
+ return ret;
+ }
+ }
+
ret = hns_roce_alloc_vf_resource(hr_dev);
if (ret) {
dev_err(hr_dev->dev, "Allocate vf resource fail, ret = %d.\n",
@@ -1269,6 +1495,15 @@ static int hns_roce_v2_profile(struct hns_roce_dev *hr_dev)
return ret;
}
+ if (hr_dev->pci_dev->revision == 0x21) {
+ ret = hns_roce_set_vf_switch_param(hr_dev, 0);
+ if (ret) {
+ dev_err(hr_dev->dev,
+ "Set function switch param fail, ret = %d.\n",
+ ret);
+ return ret;
+ }
+ }
hr_dev->vendor_part_id = hr_dev->pci_dev->device;
hr_dev->sys_image_guid = be64_to_cpu(hr_dev->ib_dev.node_guid);
@@ -1276,11 +1511,15 @@ static int hns_roce_v2_profile(struct hns_roce_dev *hr_dev)
caps->num_qps = HNS_ROCE_V2_MAX_QP_NUM;
caps->max_wqes = HNS_ROCE_V2_MAX_WQE_NUM;
caps->num_cqs = HNS_ROCE_V2_MAX_CQ_NUM;
+ caps->num_srqs = HNS_ROCE_V2_MAX_SRQ_NUM;
+ caps->min_cqes = HNS_ROCE_MIN_CQE_NUM;
caps->max_cqes = HNS_ROCE_V2_MAX_CQE_NUM;
+ caps->max_srqwqes = HNS_ROCE_V2_MAX_SRQWQE_NUM;
caps->max_sq_sg = HNS_ROCE_V2_MAX_SQ_SGE_NUM;
caps->max_extend_sg = HNS_ROCE_V2_MAX_EXTEND_SGE_NUM;
caps->max_rq_sg = HNS_ROCE_V2_MAX_RQ_SGE_NUM;
caps->max_sq_inline = HNS_ROCE_V2_MAX_SQ_INLINE;
+ caps->max_srq_sg = HNS_ROCE_V2_MAX_SRQ_SGE_NUM;
caps->num_uars = HNS_ROCE_V2_UAR_NUM;
caps->phy_num_uars = HNS_ROCE_V2_PHY_UAR_NUM;
caps->num_aeq_vectors = HNS_ROCE_V2_AEQE_VEC_NUM;
@@ -1289,6 +1528,8 @@ static int hns_roce_v2_profile(struct hns_roce_dev *hr_dev)
caps->num_mtpts = HNS_ROCE_V2_MAX_MTPT_NUM;
caps->num_mtt_segs = HNS_ROCE_V2_MAX_MTT_SEGS;
caps->num_cqe_segs = HNS_ROCE_V2_MAX_CQE_SEGS;
+ caps->num_srqwqe_segs = HNS_ROCE_V2_MAX_SRQWQE_SEGS;
+ caps->num_idx_segs = HNS_ROCE_V2_MAX_IDX_SEGS;
caps->num_pds = HNS_ROCE_V2_MAX_PD_NUM;
caps->max_qp_init_rdma = HNS_ROCE_V2_MAX_QP_INIT_RDMA;
caps->max_qp_dest_rdma = HNS_ROCE_V2_MAX_QP_DEST_RDMA;
@@ -1299,8 +1540,10 @@ static int hns_roce_v2_profile(struct hns_roce_dev *hr_dev)
caps->irrl_entry_sz = HNS_ROCE_V2_IRRL_ENTRY_SZ;
caps->trrl_entry_sz = HNS_ROCE_V2_TRRL_ENTRY_SZ;
caps->cqc_entry_sz = HNS_ROCE_V2_CQC_ENTRY_SZ;
+ caps->srqc_entry_sz = HNS_ROCE_V2_SRQC_ENTRY_SZ;
caps->mtpt_entry_sz = HNS_ROCE_V2_MTPT_ENTRY_SZ;
caps->mtt_entry_sz = HNS_ROCE_V2_MTT_ENTRY_SZ;
+ caps->idx_entry_sz = 4;
caps->cq_entry_sz = HNS_ROCE_V2_CQE_ENTRY_SIZE;
caps->page_size_cap = HNS_ROCE_V2_PAGE_SIZE_SUPPORTED;
caps->reserved_lkey = 0;
@@ -1308,6 +1551,7 @@ static int hns_roce_v2_profile(struct hns_roce_dev *hr_dev)
caps->reserved_mrws = 1;
caps->reserved_uars = 0;
caps->reserved_cqs = 0;
+ caps->reserved_srqs = 0;
caps->reserved_qps = HNS_ROCE_V2_RSV_QPS;
caps->qpc_ba_pg_sz = 0;
@@ -1322,7 +1566,7 @@ static int hns_roce_v2_profile(struct hns_roce_dev *hr_dev)
caps->mpt_ba_pg_sz = 0;
caps->mpt_buf_pg_sz = 0;
caps->mpt_hop_num = HNS_ROCE_CONTEXT_HOP_NUM;
- caps->pbl_ba_pg_sz = 0;
+ caps->pbl_ba_pg_sz = 2;
caps->pbl_buf_pg_sz = 0;
caps->pbl_hop_num = HNS_ROCE_PBL_HOP_NUM;
caps->mtt_ba_pg_sz = 0;
@@ -1331,6 +1575,12 @@ static int hns_roce_v2_profile(struct hns_roce_dev *hr_dev)
caps->cqe_ba_pg_sz = 0;
caps->cqe_buf_pg_sz = 0;
caps->cqe_hop_num = HNS_ROCE_CQE_HOP_NUM;
+ caps->srqwqe_ba_pg_sz = 0;
+ caps->srqwqe_buf_pg_sz = 0;
+ caps->srqwqe_hop_num = HNS_ROCE_SRQWQE_HOP_NUM;
+ caps->idx_ba_pg_sz = 0;
+ caps->idx_buf_pg_sz = 0;
+ caps->idx_hop_num = HNS_ROCE_IDX_HOP_NUM;
caps->eqe_ba_pg_sz = 0;
caps->eqe_buf_pg_sz = 0;
caps->eqe_hop_num = HNS_ROCE_EQE_HOP_NUM;
@@ -1354,8 +1604,31 @@ static int hns_roce_v2_profile(struct hns_roce_dev *hr_dev)
caps->local_ca_ack_delay = 0;
caps->max_mtu = IB_MTU_4096;
- if (hr_dev->pci_dev->revision == 0x21)
- caps->flags |= HNS_ROCE_CAP_FLAG_ATOMIC;
+ caps->max_srqs = HNS_ROCE_V2_MAX_SRQ;
+ caps->max_srq_wrs = HNS_ROCE_V2_MAX_SRQ_WR;
+ caps->max_srq_sges = HNS_ROCE_V2_MAX_SRQ_SGE;
+
+ if (hr_dev->pci_dev->revision == 0x21) {
+ caps->flags |= HNS_ROCE_CAP_FLAG_ATOMIC |
+ HNS_ROCE_CAP_FLAG_SRQ |
+ HNS_ROCE_CAP_FLAG_QP_FLOW_CTRL;
+
+ caps->num_qpc_timer = HNS_ROCE_V2_MAX_QPC_TIMER_NUM;
+ caps->qpc_timer_entry_sz = HNS_ROCE_V2_QPC_TIMER_ENTRY_SZ;
+ caps->qpc_timer_ba_pg_sz = 0;
+ caps->qpc_timer_buf_pg_sz = 0;
+ caps->qpc_timer_hop_num = HNS_ROCE_HOP_NUM_0;
+ caps->num_cqc_timer = HNS_ROCE_V2_MAX_CQC_TIMER_NUM;
+ caps->cqc_timer_entry_sz = HNS_ROCE_V2_CQC_TIMER_ENTRY_SZ;
+ caps->cqc_timer_ba_pg_sz = 0;
+ caps->cqc_timer_buf_pg_sz = 0;
+ caps->cqc_timer_hop_num = HNS_ROCE_HOP_NUM_0;
+
+ caps->sccc_entry_sz = HNS_ROCE_V2_SCCC_ENTRY_SZ;
+ caps->sccc_ba_pg_sz = 0;
+ caps->sccc_buf_pg_sz = 0;
+ caps->sccc_hop_num = HNS_ROCE_SCCC_HOP_NUM;
+ }
ret = hns_roce_v2_set_bt(hr_dev);
if (ret)
@@ -1556,7 +1829,8 @@ static void hns_roce_free_link_table(struct hns_roce_dev *hr_dev,
static int hns_roce_v2_init(struct hns_roce_dev *hr_dev)
{
struct hns_roce_v2_priv *priv = hr_dev->priv;
- int ret;
+ int qpc_count, cqc_count;
+ int ret, i;
/* TSQ includes SQ doorbell and ack doorbell */
ret = hns_roce_init_link_table(hr_dev, TSQ_LINK_TABLE);
@@ -1571,8 +1845,40 @@ static int hns_roce_v2_init(struct hns_roce_dev *hr_dev)
goto err_tpq_init_failed;
}
+ /* Alloc memory for QPC Timer buffer space chunk*/
+ for (qpc_count = 0; qpc_count < hr_dev->caps.qpc_timer_bt_num;
+ qpc_count++) {
+ ret = hns_roce_table_get(hr_dev, &hr_dev->qpc_timer_table,
+ qpc_count);
+ if (ret) {
+ dev_err(hr_dev->dev, "QPC Timer get failed\n");
+ goto err_qpc_timer_failed;
+ }
+ }
+
+ /* Alloc memory for CQC Timer buffer space chunk*/
+ for (cqc_count = 0; cqc_count < hr_dev->caps.cqc_timer_bt_num;
+ cqc_count++) {
+ ret = hns_roce_table_get(hr_dev, &hr_dev->cqc_timer_table,
+ cqc_count);
+ if (ret) {
+ dev_err(hr_dev->dev, "CQC Timer get failed\n");
+ goto err_cqc_timer_failed;
+ }
+ }
+
return 0;
+err_cqc_timer_failed:
+ for (i = 0; i < cqc_count; i++)
+ hns_roce_table_put(hr_dev, &hr_dev->cqc_timer_table, i);
+
+err_qpc_timer_failed:
+ for (i = 0; i < qpc_count; i++)
+ hns_roce_table_put(hr_dev, &hr_dev->qpc_timer_table, i);
+
+ hns_roce_free_link_table(hr_dev, &priv->tpq);
+
err_tpq_init_failed:
hns_roce_free_link_table(hr_dev, &priv->tsq);
@@ -1587,30 +1893,62 @@ static void hns_roce_v2_exit(struct hns_roce_dev *hr_dev)
hns_roce_free_link_table(hr_dev, &priv->tsq);
}
+/*
+ * Query the mailbox status word through the command queue.
+ *
+ * Sends a single HNS_ROCE_OPC_QUERY_MB_ST read descriptor. If
+ * hns_roce_cmq_send() reports a failure, that non-zero value is returned
+ * unchanged; otherwise the mb_status_hw_run word from the descriptor
+ * payload is returned in CPU byte order, which is how the callers shift
+ * and mask it.
+ */
+static int hns_roce_query_mbox_status(struct hns_roce_dev *hr_dev)
+{
+	struct hns_roce_cmq_desc desc;
+	struct hns_roce_mbox_status *mb_st =
+				(struct hns_roce_mbox_status *)desc.data;
+	enum hns_roce_cmd_return_status status;
+
+	hns_roce_cmq_setup_basic_desc(&desc, HNS_ROCE_OPC_QUERY_MB_ST, true);
+
+	status = hns_roce_cmq_send(hr_dev, &desc, 1);
+	if (status)
+		return status;
+
+	/*
+	 * The word is being read out of the descriptor, so the conversion
+	 * goes from little-endian to CPU order, not the other way round.
+	 */
+	return le32_to_cpu(mb_st->mb_status_hw_run);
+}
+
static int hns_roce_v2_cmd_pending(struct hns_roce_dev *hr_dev)
{
- u32 status = readl(hr_dev->reg_base + ROCEE_VF_MB_STATUS_REG);
+ u32 status = hns_roce_query_mbox_status(hr_dev);
return status >> HNS_ROCE_HW_RUN_BIT_SHIFT;
}
static int hns_roce_v2_cmd_complete(struct hns_roce_dev *hr_dev)
{
- u32 status = readl(hr_dev->reg_base + ROCEE_VF_MB_STATUS_REG);
+ u32 status = hns_roce_query_mbox_status(hr_dev);
return status & HNS_ROCE_HW_MB_STATUS_MASK;
}
+/*
+ * Post one mailbox command by sending a HNS_ROCE_OPC_POST_MB descriptor
+ * on the command queue.
+ *
+ * @in_param and @out_param are each written as a low/high pair of words,
+ * @in_modifier and @op are packed into cmd_tag, and @event and @token into
+ * token_event_en. @op_modifier is accepted for signature compatibility
+ * with the caller but is not written into the descriptor.
+ *
+ * Returns the result of hns_roce_cmq_send().
+ */
+static int hns_roce_mbox_post(struct hns_roce_dev *hr_dev, u64 in_param,
+			      u64 out_param, u32 in_modifier, u8 op_modifier,
+			      u16 op, u16 token, int event)
+{
+	struct hns_roce_cmq_desc desc;
+	struct hns_roce_post_mbox *mb = (struct hns_roce_post_mbox *)desc.data;
+
+	hns_roce_cmq_setup_basic_desc(&desc, HNS_ROCE_OPC_POST_MB, false);
+
+	/*
+	 * Split the 64-bit values while they are still in CPU order and
+	 * convert each half on its own. Swapping the whole 64-bit value
+	 * first and then truncating/shifting it exchanges the two halves
+	 * on big-endian CPUs.
+	 * NOTE(review): assumes the *_l/*_h members are 32-bit little-endian
+	 * words, as their names and the former ">> 32" suggest - confirm
+	 * against struct hns_roce_post_mbox.
+	 */
+	mb->in_param_l = cpu_to_le32(lower_32_bits(in_param));
+	mb->in_param_h = cpu_to_le32(upper_32_bits(in_param));
+	mb->out_param_l = cpu_to_le32(lower_32_bits(out_param));
+	mb->out_param_h = cpu_to_le32(upper_32_bits(out_param));
+	mb->cmd_tag = cpu_to_le32(in_modifier << 8 | op);
+	mb->token_event_en = cpu_to_le32(event << 16 | token);
+
+	return hns_roce_cmq_send(hr_dev, &desc, 1);
+}
+
static int hns_roce_v2_post_mbox(struct hns_roce_dev *hr_dev, u64 in_param,
u64 out_param, u32 in_modifier, u8 op_modifier,
u16 op, u16 token, int event)
{
struct device *dev = hr_dev->dev;
- u32 __iomem *hcr = (u32 __iomem *)(hr_dev->reg_base +
- ROCEE_VF_MB_CFG0_REG);
unsigned long end;
- u32 val0 = 0;
- u32 val1 = 0;
+ int ret;
end = msecs_to_jiffies(HNS_ROCE_V2_GO_BIT_TIMEOUT_MSECS) + jiffies;
while (hns_roce_v2_cmd_pending(hr_dev)) {
@@ -1622,27 +1960,12 @@ static int hns_roce_v2_post_mbox(struct hns_roce_dev *hr_dev, u64 in_param,
cond_resched();
}
- roce_set_field(val0, HNS_ROCE_VF_MB4_TAG_MASK,
- HNS_ROCE_VF_MB4_TAG_SHIFT, in_modifier);
- roce_set_field(val0, HNS_ROCE_VF_MB4_CMD_MASK,
- HNS_ROCE_VF_MB4_CMD_SHIFT, op);
- roce_set_field(val1, HNS_ROCE_VF_MB5_EVENT_MASK,
- HNS_ROCE_VF_MB5_EVENT_SHIFT, event);
- roce_set_field(val1, HNS_ROCE_VF_MB5_TOKEN_MASK,
- HNS_ROCE_VF_MB5_TOKEN_SHIFT, token);
-
- writeq(in_param, hcr + 0);
- writeq(out_param, hcr + 2);
-
- /* Memory barrier */
- wmb();
-
- writel(val0, hcr + 4);
- writel(val1, hcr + 5);
-
- mmiowb();
+ ret = hns_roce_mbox_post(hr_dev, in_param, out_param, in_modifier,
+ op_modifier, op, token, event);
+ if (ret)
+ dev_err(dev, "Post mailbox fail(%d)\n", ret);
- return 0;
+ return ret;
}
static int hns_roce_v2_chk_mbox(struct hns_roce_dev *hr_dev,
@@ -1663,6 +1986,9 @@ static int hns_roce_v2_chk_mbox(struct hns_roce_dev *hr_dev,
status = hns_roce_v2_cmd_complete(hr_dev);
if (status != 0x1) {
+ if (status == CMD_RST_PRC_EBUSY)
+ return status;
+
dev_err(dev, "mailbox status 0x%x!\n", status);
return -EBUSY;
}
@@ -1759,12 +2085,10 @@ static int hns_roce_v2_set_mac(struct hns_roce_dev *hr_dev, u8 phy_port,
static int set_mtpt_pbl(struct hns_roce_v2_mpt_entry *mpt_entry,
struct hns_roce_mr *mr)
{
- struct scatterlist *sg;
+ struct sg_dma_page_iter sg_iter;
u64 page_addr;
u64 *pages;
- int i, j;
- int len;
- int entry;
+ int i;
mpt_entry->pbl_size = cpu_to_le32(mr->pbl_size);
mpt_entry->pbl_ba_l = cpu_to_le32(lower_32_bits(mr->pbl_ba >> 3));
@@ -1777,17 +2101,14 @@ static int set_mtpt_pbl(struct hns_roce_v2_mpt_entry *mpt_entry,
return -ENOMEM;
i = 0;
- for_each_sg(mr->umem->sg_head.sgl, sg, mr->umem->nmap, entry) {
- len = sg_dma_len(sg) >> PAGE_SHIFT;
- for (j = 0; j < len; ++j) {
- page_addr = sg_dma_address(sg) +
- (j << mr->umem->page_shift);
- pages[i] = page_addr >> 6;
- /* Record the first 2 entry directly to MTPT table */
- if (i >= HNS_ROCE_V2_MAX_INNER_MTPT_NUM - 1)
- goto found;
- i++;
- }
+ for_each_sg_dma_page(mr->umem->sg_head.sgl, &sg_iter, mr->umem->nmap, 0) {
+ page_addr = sg_page_iter_dma_address(&sg_iter);
+ pages[i] = page_addr >> 6;
+
+ /* Record the first 2 entry directly to MTPT table */
+ if (i >= HNS_ROCE_V2_MAX_INNER_MTPT_NUM - 1)
+ goto found;
+ i++;
}
found:
mpt_entry->pa0_l = cpu_to_le32(lower_32_bits(pages[0]));
@@ -1869,6 +2190,9 @@ static int hns_roce_v2_rereg_write_mtpt(struct hns_roce_dev *hr_dev,
struct hns_roce_v2_mpt_entry *mpt_entry = mb_buf;
int ret = 0;
+ roce_set_field(mpt_entry->byte_4_pd_hop_st, V2_MPT_BYTE_4_MPT_ST_M,
+ V2_MPT_BYTE_4_MPT_ST_S, V2_MPT_ST_VALID);
+
if (flags & IB_MR_REREG_PD) {
roce_set_field(mpt_entry->byte_4_pd_hop_st, V2_MPT_BYTE_4_PD_M,
V2_MPT_BYTE_4_PD_S, pdn);
@@ -2007,6 +2331,27 @@ static struct hns_roce_v2_cqe *next_cqe_sw_v2(struct hns_roce_cq *hr_cq)
return get_sw_cqe_v2(hr_cq, hr_cq->cons_index);
}
+/*
+ * Look up entry @n of the SRQ buffer: the entry index is scaled by
+ * 1 << srq->wqe_shift and the resulting offset is resolved through
+ * hns_roce_buf_offset().
+ */
+static void *get_srq_wqe(struct hns_roce_srq *srq, int n)
+{
+	int wqe_offset = n << srq->wqe_shift;
+
+	return hns_roce_buf_offset(&srq->buf, wqe_offset);
+}
+
+/*
+ * Release SRQ entry @wqe_index: set its bit in the index-queue bitmap
+ * (one bit per entry, packed into u64 words) and advance srq->tail.
+ * Both updates are made while holding srq->lock.
+ */
+static void hns_roce_free_srq_wqe(struct hns_roce_srq *srq, int wqe_index)
+{
+	const size_t bits_per_word = sizeof(u64) * 8;
+	u32 word = wqe_index / bits_per_word;
+	int bit = wqe_index % bits_per_word;
+
+	/* always called with interrupts disabled. */
+	spin_lock(&srq->lock);
+	srq->idx_que.bitmap[word] |= (1ULL << bit);
+	srq->tail++;
+	spin_unlock(&srq->lock);
+}
+
static void hns_roce_v2_cq_set_ci(struct hns_roce_cq *hr_cq, u32 cons_index)
{
*hr_cq->set_ci_db = cons_index & 0xffffff;
@@ -2018,6 +2363,7 @@ static void __hns_roce_v2_cq_clean(struct hns_roce_cq *hr_cq, u32 qpn,
struct hns_roce_v2_cqe *cqe, *dest;
u32 prod_index;
int nfreed = 0;
+ int wqe_index;
u8 owner_bit;
for (prod_index = hr_cq->cons_index; get_sw_cqe_v2(hr_cq, prod_index);
@@ -2035,7 +2381,13 @@ static void __hns_roce_v2_cq_clean(struct hns_roce_cq *hr_cq, u32 qpn,
if ((roce_get_field(cqe->byte_16, V2_CQE_BYTE_16_LCL_QPN_M,
V2_CQE_BYTE_16_LCL_QPN_S) &
HNS_ROCE_V2_CQE_QPN_MASK) == qpn) {
- /* In v1 engine, not support SRQ */
+ if (srq &&
+ roce_get_bit(cqe->byte_4, V2_CQE_BYTE_4_S_R_S)) {
+ wqe_index = roce_get_field(cqe->byte_4,
+ V2_CQE_BYTE_4_WQE_INDX_M,
+ V2_CQE_BYTE_4_WQE_INDX_S);
+ hns_roce_free_srq_wqe(srq, wqe_index);
+ }
++nfreed;
} else if (nfreed) {
dest = get_cqe_v2(hr_cq, (prod_index + nfreed) &
@@ -2145,6 +2497,7 @@ static void hns_roce_v2_write_cqc(struct hns_roce_dev *hr_dev,
static int hns_roce_v2_req_notify_cq(struct ib_cq *ibcq,
enum ib_cq_notify_flags flags)
{
+ struct hns_roce_dev *hr_dev = to_hr_dev(ibcq->device);
struct hns_roce_cq *hr_cq = to_hr_cq(ibcq);
u32 notification_flag;
u32 doorbell[2];
@@ -2170,7 +2523,7 @@ static int hns_roce_v2_req_notify_cq(struct ib_cq *ibcq,
roce_set_bit(doorbell[1], V2_CQ_DB_PARAMETER_NOTIFY_S,
notification_flag);
- hns_roce_write64_k(doorbell, hr_cq->cq_db_l);
+ hns_roce_write64(hr_dev, doorbell, hr_cq->cq_db_l);
return 0;
}
@@ -2212,6 +2565,7 @@ static int hns_roce_handle_recv_inl_wqe(struct hns_roce_v2_cqe *cqe,
static int hns_roce_v2_poll_one(struct hns_roce_cq *hr_cq,
struct hns_roce_qp **cur_qp, struct ib_wc *wc)
{
+ struct hns_roce_srq *srq = NULL;
struct hns_roce_dev *hr_dev;
struct hns_roce_v2_cqe *cqe;
struct hns_roce_qp *hr_qp;
@@ -2254,6 +2608,37 @@ static int hns_roce_v2_poll_one(struct hns_roce_cq *hr_cq,
wc->qp = &(*cur_qp)->ibqp;
wc->vendor_err = 0;
+ if (is_send) {
+ wq = &(*cur_qp)->sq;
+ if ((*cur_qp)->sq_signal_bits) {
+ /*
+ * If sg_signal_bit is 1,
+ * firstly tail pointer updated to wqe
+ * which current cqe correspond to
+ */
+ wqe_ctr = (u16)roce_get_field(cqe->byte_4,
+ V2_CQE_BYTE_4_WQE_INDX_M,
+ V2_CQE_BYTE_4_WQE_INDX_S);
+ wq->tail += (wqe_ctr - (u16)wq->tail) &
+ (wq->wqe_cnt - 1);
+ }
+
+ wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)];
+ ++wq->tail;
+ } else if ((*cur_qp)->ibqp.srq) {
+ srq = to_hr_srq((*cur_qp)->ibqp.srq);
+ wqe_ctr = le16_to_cpu(roce_get_field(cqe->byte_4,
+ V2_CQE_BYTE_4_WQE_INDX_M,
+ V2_CQE_BYTE_4_WQE_INDX_S));
+ wc->wr_id = srq->wrid[wqe_ctr];
+ hns_roce_free_srq_wqe(srq, wqe_ctr);
+ } else {
+ /* Update tail pointer, record wr_id */
+ wq = &(*cur_qp)->rq;
+ wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)];
+ ++wq->tail;
+ }
+
status = roce_get_field(cqe->byte_4, V2_CQE_BYTE_4_STATUS_M,
V2_CQE_BYTE_4_STATUS_S);
switch (status & HNS_ROCE_V2_CQE_STATUS_MASK) {
@@ -2373,23 +2758,6 @@ static int hns_roce_v2_poll_one(struct hns_roce_cq *hr_cq,
wc->status = IB_WC_GENERAL_ERR;
break;
}
-
- wq = &(*cur_qp)->sq;
- if ((*cur_qp)->sq_signal_bits) {
- /*
- * If sg_signal_bit is 1,
- * firstly tail pointer updated to wqe
- * which current cqe correspond to
- */
- wqe_ctr = (u16)roce_get_field(cqe->byte_4,
- V2_CQE_BYTE_4_WQE_INDX_M,
- V2_CQE_BYTE_4_WQE_INDX_S);
- wq->tail += (wqe_ctr - (u16)wq->tail) &
- (wq->wqe_cnt - 1);
- }
-
- wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)];
- ++wq->tail;
} else {
/* RQ correspond to CQE */
wc->byte_len = le32_to_cpu(cqe->byte_cnt);
@@ -2434,11 +2802,6 @@ static int hns_roce_v2_poll_one(struct hns_roce_cq *hr_cq,
return -EAGAIN;
}
- /* Update tail pointer, record wr_id */
- wq = &(*cur_qp)->rq;
- wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)];
- ++wq->tail;
-
wc->sl = (u8)roce_get_field(cqe->byte_32, V2_CQE_BYTE_32_SL_M,
V2_CQE_BYTE_32_SL_S);
wc->src_qp = (u8)roce_get_field(cqe->byte_32,
@@ -2553,17 +2916,33 @@ static int hns_roce_v2_set_hem(struct hns_roce_dev *hr_dev,
case HEM_TYPE_SRQC:
op = HNS_ROCE_CMD_WRITE_SRQC_BT0;
break;
+ case HEM_TYPE_SCCC:
+ op = HNS_ROCE_CMD_WRITE_SCCC_BT0;
+ break;
+ case HEM_TYPE_QPC_TIMER:
+ op = HNS_ROCE_CMD_WRITE_QPC_TIMER_BT0;
+ break;
+ case HEM_TYPE_CQC_TIMER:
+ op = HNS_ROCE_CMD_WRITE_CQC_TIMER_BT0;
+ break;
default:
dev_warn(dev, "Table %d not to be written by mailbox!\n",
table->type);
return 0;
}
+
+ if (table->type == HEM_TYPE_SCCC && step_idx)
+ return 0;
+
op += step_idx;
mailbox = hns_roce_alloc_cmd_mailbox(hr_dev);
if (IS_ERR(mailbox))
return PTR_ERR(mailbox);
+ if (table->type == HEM_TYPE_SCCC)
+ obj = mhop.l0_idx;
+
if (check_whether_last_step(hop_num, step_idx)) {
hem = table->hem[hem_idx];
for (hns_roce_hem_first(hem, &iter);
@@ -2612,6 +2991,10 @@ static int hns_roce_v2_clear_hem(struct hns_roce_dev *hr_dev,
case HEM_TYPE_CQC:
op = HNS_ROCE_CMD_DESTROY_CQC_BT0;
break;
+ case HEM_TYPE_SCCC:
+ case HEM_TYPE_QPC_TIMER:
+ case HEM_TYPE_CQC_TIMER:
+ break;
case HEM_TYPE_SRQC:
op = HNS_ROCE_CMD_DESTROY_SRQC_BT0;
break;
@@ -2620,6 +3003,12 @@ static int hns_roce_v2_clear_hem(struct hns_roce_dev *hr_dev,
table->type);
return 0;
}
+
+ if (table->type == HEM_TYPE_SCCC ||
+ table->type == HEM_TYPE_QPC_TIMER ||
+ table->type == HEM_TYPE_CQC_TIMER)
+ return 0;
+
op += step_idx;
mailbox = hns_roce_alloc_cmd_mailbox(hr_dev);
@@ -2747,6 +3136,8 @@ static void modify_qp_reset_to_init(struct ib_qp *ibqp,
roce_set_field(context->byte_20_smac_sgid_idx,
V2_QPC_BYTE_20_RQ_SHIFT_M, V2_QPC_BYTE_20_RQ_SHIFT_S,
+ (hr_qp->ibqp.qp_type == IB_QPT_XRC_INI ||
+ hr_qp->ibqp.qp_type == IB_QPT_XRC_TGT || ibqp->srq) ? 0 :
ilog2((unsigned int)hr_qp->rq.wqe_cnt));
roce_set_field(qpc_mask->byte_20_smac_sgid_idx,
V2_QPC_BYTE_20_RQ_SHIFT_M, V2_QPC_BYTE_20_RQ_SHIFT_S, 0);
@@ -3088,6 +3479,8 @@ static void modify_qp_init_to_init(struct ib_qp *ibqp,
roce_set_field(context->byte_20_smac_sgid_idx,
V2_QPC_BYTE_20_RQ_SHIFT_M, V2_QPC_BYTE_20_RQ_SHIFT_S,
+ (hr_qp->ibqp.qp_type == IB_QPT_XRC_INI ||
+ hr_qp->ibqp.qp_type == IB_QPT_XRC_TGT || ibqp->srq) ? 0 :
ilog2((unsigned int)hr_qp->rq.wqe_cnt));
roce_set_field(qpc_mask->byte_20_smac_sgid_idx,
V2_QPC_BYTE_20_RQ_SHIFT_M, V2_QPC_BYTE_20_RQ_SHIFT_S, 0);
@@ -3572,10 +3965,16 @@ static int modify_qp_rtr_to_rts(struct ib_qp *ibqp,
V2_QPC_BYTE_212_LSN_S, 0);
if (attr_mask & IB_QP_TIMEOUT) {
- roce_set_field(context->byte_28_at_fl, V2_QPC_BYTE_28_AT_M,
- V2_QPC_BYTE_28_AT_S, attr->timeout);
- roce_set_field(qpc_mask->byte_28_at_fl, V2_QPC_BYTE_28_AT_M,
- V2_QPC_BYTE_28_AT_S, 0);
+ if (attr->timeout < 31) {
+ roce_set_field(context->byte_28_at_fl,
+ V2_QPC_BYTE_28_AT_M, V2_QPC_BYTE_28_AT_S,
+ attr->timeout);
+ roce_set_field(qpc_mask->byte_28_at_fl,
+ V2_QPC_BYTE_28_AT_M, V2_QPC_BYTE_28_AT_S,
+ 0);
+ } else {
+ dev_warn(dev, "Local ACK timeout shall be 0 to 30.\n");
+ }
}
roce_set_field(context->byte_172_sq_psn, V2_QPC_BYTE_172_SQ_CUR_PSN_M,
@@ -3601,6 +4000,21 @@ static int modify_qp_rtr_to_rts(struct ib_qp *ibqp,
return 0;
}
+/*
+ * Classify a QP state transition.
+ *
+ * Returns true when the (cur_state, new_state) pair is one of:
+ *   - any state other than RESET moving to ERR or RESET,
+ *   - RTS or SQD moving to RTS or SQD,
+ *   - SQE moving to RTS;
+ * and false for every other pair.
+ */
+static inline bool hns_roce_v2_check_qp_stat(enum ib_qp_state cur_state,
+					     enum ib_qp_state new_state)
+{
+	bool to_err_or_reset = new_state == IB_QPS_ERR ||
+			       new_state == IB_QPS_RESET;
+	bool from_rts_or_sqd = cur_state == IB_QPS_RTS ||
+			       cur_state == IB_QPS_SQD;
+	bool to_rts_or_sqd = new_state == IB_QPS_RTS ||
+			     new_state == IB_QPS_SQD;
+
+	if (cur_state != IB_QPS_RESET && to_err_or_reset)
+		return true;
+
+	if (from_rts_or_sqd && to_rts_or_sqd)
+		return true;
+
+	return cur_state == IB_QPS_SQE && new_state == IB_QPS_RTS;
+}
+
static int hns_roce_v2_modify_qp(struct ib_qp *ibqp,
const struct ib_qp_attr *attr,
int attr_mask, enum ib_qp_state cur_state,
@@ -3613,7 +4027,7 @@ static int hns_roce_v2_modify_qp(struct ib_qp *ibqp,
struct device *dev = hr_dev->dev;
int ret = -EINVAL;
- context = kcalloc(2, sizeof(*context), GFP_KERNEL);
+ context = kcalloc(2, sizeof(*context), GFP_ATOMIC);
if (!context)
return -ENOMEM;
@@ -3626,6 +4040,7 @@ static int hns_roce_v2_modify_qp(struct ib_qp *ibqp,
*/
memset(qpc_mask, 0xff, sizeof(*qpc_mask));
if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) {
+ memset(qpc_mask, 0, sizeof(*qpc_mask));
modify_qp_reset_to_init(ibqp, attr, attr_mask, context,
qpc_mask);
} else if (cur_state == IB_QPS_INIT && new_state == IB_QPS_INIT) {
@@ -3641,21 +4056,7 @@ static int hns_roce_v2_modify_qp(struct ib_qp *ibqp,
qpc_mask);
if (ret)
goto out;
- } else if ((cur_state == IB_QPS_RTS && new_state == IB_QPS_RTS) ||
- (cur_state == IB_QPS_SQE && new_state == IB_QPS_RTS) ||
- (cur_state == IB_QPS_RTS && new_state == IB_QPS_SQD) ||
- (cur_state == IB_QPS_SQD && new_state == IB_QPS_SQD) ||
- (cur_state == IB_QPS_SQD && new_state == IB_QPS_RTS) ||
- (cur_state == IB_QPS_INIT && new_state == IB_QPS_RESET) ||
- (cur_state == IB_QPS_RTR && new_state == IB_QPS_RESET) ||
- (cur_state == IB_QPS_RTS && new_state == IB_QPS_RESET) ||
- (cur_state == IB_QPS_ERR && new_state == IB_QPS_RESET) ||
- (cur_state == IB_QPS_INIT && new_state == IB_QPS_ERR) ||
- (cur_state == IB_QPS_RTR && new_state == IB_QPS_ERR) ||
- (cur_state == IB_QPS_RTS && new_state == IB_QPS_ERR) ||
- (cur_state == IB_QPS_SQD && new_state == IB_QPS_ERR) ||
- (cur_state == IB_QPS_SQE && new_state == IB_QPS_ERR) ||
- (cur_state == IB_QPS_ERR && new_state == IB_QPS_ERR)) {
+ } else if (hns_roce_v2_check_qp_stat(cur_state, new_state)) {
/* Nothing */
;
} else {
@@ -3673,13 +4074,16 @@ static int hns_roce_v2_modify_qp(struct ib_qp *ibqp,
roce_set_field(qpc_mask->byte_160_sq_ci_pi,
V2_QPC_BYTE_160_SQ_PRODUCER_IDX_M,
V2_QPC_BYTE_160_SQ_PRODUCER_IDX_S, 0);
- roce_set_field(context->byte_84_rq_ci_pi,
+
+ if (!ibqp->srq) {
+ roce_set_field(context->byte_84_rq_ci_pi,
V2_QPC_BYTE_84_RQ_PRODUCER_IDX_M,
V2_QPC_BYTE_84_RQ_PRODUCER_IDX_S,
hr_qp->rq.head);
- roce_set_field(qpc_mask->byte_84_rq_ci_pi,
+ roce_set_field(qpc_mask->byte_84_rq_ci_pi,
V2_QPC_BYTE_84_RQ_PRODUCER_IDX_M,
V2_QPC_BYTE_84_RQ_PRODUCER_IDX_S, 0);
+ }
}
if (attr_mask & IB_QP_AV) {
@@ -3789,6 +4193,11 @@ static int hns_roce_v2_modify_qp(struct ib_qp *ibqp,
if (attr_mask & (IB_QP_ACCESS_FLAGS | IB_QP_MAX_DEST_RD_ATOMIC))
set_access_flags(hr_qp, context, qpc_mask, attr, attr_mask);
+ roce_set_bit(context->byte_108_rx_reqepsn, V2_QPC_BYTE_108_INV_CREDIT_S,
+ ibqp->srq ? 1 : 0);
+ roce_set_bit(qpc_mask->byte_108_rx_reqepsn,
+ V2_QPC_BYTE_108_INV_CREDIT_S, 0);
+
/* Every status migrate must change state */
roce_set_field(context->byte_60_qpst_tempid, V2_QPC_BYTE_60_QP_ST_M,
V2_QPC_BYTE_60_QP_ST_S, new_state);
@@ -4012,7 +4421,7 @@ out:
static int hns_roce_v2_destroy_qp_common(struct hns_roce_dev *hr_dev,
struct hns_roce_qp *hr_qp,
- int is_user)
+ bool is_user)
{
struct hns_roce_cq *send_cq, *recv_cq;
struct device *dev = hr_dev->dev;
@@ -4074,7 +4483,8 @@ static int hns_roce_v2_destroy_qp_common(struct hns_roce_dev *hr_dev,
hns_roce_free_db(hr_dev, &hr_qp->rdb);
}
- if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RQ_INLINE) {
+ if ((hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RQ_INLINE) &&
+ hr_qp->rq.wqe_cnt) {
kfree(hr_qp->rq_inl_buf.wqe_list[0].sg_list);
kfree(hr_qp->rq_inl_buf.wqe_list);
}
@@ -4088,7 +4498,7 @@ static int hns_roce_v2_destroy_qp(struct ib_qp *ibqp)
struct hns_roce_qp *hr_qp = to_hr_qp(ibqp);
int ret;
- ret = hns_roce_v2_destroy_qp_common(hr_dev, hr_qp, !!ibqp->pd->uobject);
+ ret = hns_roce_v2_destroy_qp_common(hr_dev, hr_qp, ibqp->uobject);
if (ret) {
dev_err(hr_dev->dev, "Destroy qp failed(%d)\n", ret);
return ret;
@@ -4102,6 +4512,59 @@ static int hns_roce_v2_destroy_qp(struct ib_qp *ibqp)
return 0;
}
+/*
+ * Clear the SCC context of @hr_qp using three command-queue requests,
+ * serialised by hr_dev->qp_table.scc_mutex:
+ *   1. HNS_ROCE_OPC_RESET_SCCC,
+ *   2. HNS_ROCE_OPC_CLR_SCCC carrying the QP number,
+ *   3. HNS_ROCE_OPC_QUERY_SCCC, repeated with a 20 ms sleep between
+ *      attempts until the reply's clr_done field is set.
+ *
+ * Returns 0 once clr_done is observed, the hns_roce_cmq_send() error if
+ * any request fails, or -ETIMEDOUT when clr_done is still clear after
+ * HNS_ROCE_CMQ_SCC_CLR_DONE_CNT + 1 queries.
+ */
+static int hns_roce_v2_qp_flow_control_init(struct hns_roce_dev *hr_dev,
+					    struct hns_roce_qp *hr_qp)
+{
+	struct hns_roce_cmq_desc desc;
+	/* Request and reply views of the same descriptor payload. */
+	struct hns_roce_sccc_clr *clr = (struct hns_roce_sccc_clr *)desc.data;
+	struct hns_roce_sccc_clr_done *resp =
+				(struct hns_roce_sccc_clr_done *)desc.data;
+	int attempt;
+	int ret;
+
+	mutex_lock(&hr_dev->qp_table.scc_mutex);
+
+	/* Step 1: reset the "clear done" flag. */
+	hns_roce_cmq_setup_basic_desc(&desc, HNS_ROCE_OPC_RESET_SCCC, false);
+	ret = hns_roce_cmq_send(hr_dev, &desc, 1);
+	if (ret) {
+		dev_err(hr_dev->dev, "Reset SCC ctx failed(%d)\n", ret);
+		goto unlock;
+	}
+
+	/* Step 2: request the clear for this QP. */
+	hns_roce_cmq_setup_basic_desc(&desc, HNS_ROCE_OPC_CLR_SCCC, false);
+	clr->qpn = cpu_to_le32(hr_qp->qpn);
+	ret = hns_roce_cmq_send(hr_dev, &desc, 1);
+	if (ret) {
+		dev_err(hr_dev->dev, "Clear SCC ctx failed(%d)\n", ret);
+		goto unlock;
+	}
+
+	/* Step 3: poll until the clear is reported as done. */
+	for (attempt = 0; attempt <= HNS_ROCE_CMQ_SCC_CLR_DONE_CNT;
+	     attempt++) {
+		hns_roce_cmq_setup_basic_desc(&desc, HNS_ROCE_OPC_QUERY_SCCC,
+					      true);
+		ret = hns_roce_cmq_send(hr_dev, &desc, 1);
+		if (ret) {
+			dev_err(hr_dev->dev, "Query clr cmq failed(%d)\n", ret);
+			goto unlock;
+		}
+
+		/* ret is 0 here, so a completed clear returns success. */
+		if (resp->clr_done)
+			goto unlock;
+
+		msleep(20);
+	}
+
+	dev_err(hr_dev->dev, "Query SCC clr done flag overtime.\n");
+	ret = -ETIMEDOUT;
+
+unlock:
+	mutex_unlock(&hr_dev->qp_table.scc_mutex);
+	return ret;
+}
+
static int hns_roce_v2_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period)
{
struct hns_roce_dev *hr_dev = to_hr_dev(cq->device);
@@ -4159,7 +4622,8 @@ static void hns_roce_set_qps_to_err(struct hns_roce_dev *hr_dev, u32 qpn)
if (hr_qp->ibqp.uobject) {
if (hr_qp->sdb_en == 1) {
hr_qp->sq.head = *(int *)(hr_qp->sdb.virt_addr);
- hr_qp->rq.head = *(int *)(hr_qp->rdb.virt_addr);
+ if (hr_qp->rdb_en == 1)
+ hr_qp->rq.head = *(int *)(hr_qp->rdb.virt_addr);
} else {
dev_warn(hr_dev->dev, "flush cqe is unsupported in userspace!\n");
return;
@@ -4197,64 +4661,19 @@ static void hns_roce_irq_work_handle(struct work_struct *work)
dev_warn(dev, "Send queue drained.\n");
break;
case HNS_ROCE_EVENT_TYPE_WQ_CATAS_ERROR:
- dev_err(dev, "Local work queue catastrophic error.\n");
+ dev_err(dev, "Local work queue 0x%x catas error, sub_type:%d\n",
+ qpn, irq_work->sub_type);
hns_roce_set_qps_to_err(irq_work->hr_dev, qpn);
- switch (irq_work->sub_type) {
- case HNS_ROCE_LWQCE_QPC_ERROR:
- dev_err(dev, "QP %d, QPC error.\n", qpn);
- break;
- case HNS_ROCE_LWQCE_MTU_ERROR:
- dev_err(dev, "QP %d, MTU error.\n", qpn);
- break;
- case HNS_ROCE_LWQCE_WQE_BA_ADDR_ERROR:
- dev_err(dev, "QP %d, WQE BA addr error.\n", qpn);
- break;
- case HNS_ROCE_LWQCE_WQE_ADDR_ERROR:
- dev_err(dev, "QP %d, WQE addr error.\n", qpn);
- break;
- case HNS_ROCE_LWQCE_SQ_WQE_SHIFT_ERROR:
- dev_err(dev, "QP %d, WQE shift error.\n", qpn);
- break;
- default:
- dev_err(dev, "Unhandled sub_event type %d.\n",
- irq_work->sub_type);
- break;
- }
break;
case HNS_ROCE_EVENT_TYPE_INV_REQ_LOCAL_WQ_ERROR:
- dev_err(dev, "Invalid request local work queue error.\n");
+ dev_err(dev, "Invalid request local work queue 0x%x error.\n",
+ qpn);
hns_roce_set_qps_to_err(irq_work->hr_dev, qpn);
break;
case HNS_ROCE_EVENT_TYPE_LOCAL_WQ_ACCESS_ERROR:
- dev_err(dev, "Local access violation work queue error.\n");
+ dev_err(dev, "Local access violation work queue 0x%x error, sub_type:%d\n",
+ qpn, irq_work->sub_type);
hns_roce_set_qps_to_err(irq_work->hr_dev, qpn);
- switch (irq_work->sub_type) {
- case HNS_ROCE_LAVWQE_R_KEY_VIOLATION:
- dev_err(dev, "QP %d, R_key violation.\n", qpn);
- break;
- case HNS_ROCE_LAVWQE_LENGTH_ERROR:
- dev_err(dev, "QP %d, length error.\n", qpn);
- break;
- case HNS_ROCE_LAVWQE_VA_ERROR:
- dev_err(dev, "QP %d, VA error.\n", qpn);
- break;
- case HNS_ROCE_LAVWQE_PD_ERROR:
- dev_err(dev, "QP %d, PD error.\n", qpn);
- break;
- case HNS_ROCE_LAVWQE_RW_ACC_ERROR:
- dev_err(dev, "QP %d, rw acc error.\n", qpn);
- break;
- case HNS_ROCE_LAVWQE_KEY_STATE_ERROR:
- dev_err(dev, "QP %d, key state error.\n", qpn);
- break;
- case HNS_ROCE_LAVWQE_MR_OPERATION_ERROR:
- dev_err(dev, "QP %d, MR operation error.\n", qpn);
- break;
- default:
- dev_err(dev, "Unhandled sub_event type %d.\n",
- irq_work->sub_type);
- break;
- }
break;
case HNS_ROCE_EVENT_TYPE_SRQ_LIMIT_REACH:
dev_warn(dev, "SRQ limit reach.\n");
@@ -4305,6 +4724,7 @@ static void hns_roce_v2_init_irq_work(struct hns_roce_dev *hr_dev,
static void set_eq_cons_index_v2(struct hns_roce_eq *eq)
{
+ struct hns_roce_dev *hr_dev = eq->hr_dev;
u32 doorbell[2];
doorbell[0] = 0;
@@ -4331,7 +4751,7 @@ static void set_eq_cons_index_v2(struct hns_roce_eq *eq)
HNS_ROCE_V2_EQ_DB_PARA_S,
(eq->cons_index & HNS_ROCE_V2_CONS_IDX_M));
- hns_roce_write64_k(doorbell, eq->doorbell);
+ hns_roce_write64(hr_dev, doorbell, eq->doorbell);
}
static struct hns_roce_aeqe *get_aeqe_v2(struct hns_roce_eq *eq, u32 entry)
@@ -4384,6 +4804,7 @@ static int hns_roce_v2_aeq_int(struct hns_roce_dev *hr_dev,
int aeqe_found = 0;
int event_type;
int sub_type;
+ u32 srqn;
u32 qpn;
u32 cqn;
@@ -4406,6 +4827,9 @@ static int hns_roce_v2_aeq_int(struct hns_roce_dev *hr_dev,
cqn = roce_get_field(aeqe->event.cq_event.cq,
HNS_ROCE_V2_AEQE_EVENT_QUEUE_NUM_M,
HNS_ROCE_V2_AEQE_EVENT_QUEUE_NUM_S);
+ srqn = roce_get_field(aeqe->event.srq_event.srq,
+ HNS_ROCE_V2_AEQE_EVENT_QUEUE_NUM_M,
+ HNS_ROCE_V2_AEQE_EVENT_QUEUE_NUM_S);
switch (event_type) {
case HNS_ROCE_EVENT_TYPE_PATH_MIG:
@@ -4413,13 +4837,14 @@ static int hns_roce_v2_aeq_int(struct hns_roce_dev *hr_dev,
case HNS_ROCE_EVENT_TYPE_COMM_EST:
case HNS_ROCE_EVENT_TYPE_SQ_DRAINED:
case HNS_ROCE_EVENT_TYPE_WQ_CATAS_ERROR:
+ case HNS_ROCE_EVENT_TYPE_SRQ_LAST_WQE_REACH:
case HNS_ROCE_EVENT_TYPE_INV_REQ_LOCAL_WQ_ERROR:
case HNS_ROCE_EVENT_TYPE_LOCAL_WQ_ACCESS_ERROR:
hns_roce_qp_event(hr_dev, qpn, event_type);
break;
case HNS_ROCE_EVENT_TYPE_SRQ_LIMIT_REACH:
- case HNS_ROCE_EVENT_TYPE_SRQ_LAST_WQE_REACH:
case HNS_ROCE_EVENT_TYPE_SRQ_CATAS_ERROR:
+ hns_roce_srq_event(hr_dev, srqn, event_type);
break;
case HNS_ROCE_EVENT_TYPE_CQ_ACCESS_ERROR:
case HNS_ROCE_EVENT_TYPE_CQ_OVERFLOW:
@@ -4441,7 +4866,7 @@ static int hns_roce_v2_aeq_int(struct hns_roce_dev *hr_dev,
dev_err(dev, "Unhandled event %d on EQ %d at idx %u.\n",
event_type, eq->eqn, eq->cons_index);
break;
- };
+ }
eq->event_type = event_type;
eq->sub_type = sub_type;
@@ -4565,11 +4990,22 @@ static irqreturn_t hns_roce_v2_msix_interrupt_abn(int irq, void *dev_id)
int_en = roce_read(hr_dev, ROCEE_VF_ABN_INT_EN_REG);
if (roce_get_bit(int_st, HNS_ROCE_V2_VF_INT_ST_AEQ_OVERFLOW_S)) {
+ struct pci_dev *pdev = hr_dev->pci_dev;
+ struct hnae3_ae_dev *ae_dev = pci_get_drvdata(pdev);
+ const struct hnae3_ae_ops *ops = ae_dev->ops;
+
dev_err(dev, "AEQ overflow!\n");
roce_set_bit(int_st, HNS_ROCE_V2_VF_INT_ST_AEQ_OVERFLOW_S, 1);
roce_write(hr_dev, ROCEE_VF_ABN_INT_ST_REG, int_st);
+ /* Set reset level for reset_event() */
+ if (ops->set_default_reset_request)
+ ops->set_default_reset_request(ae_dev,
+ HNAE3_FUNC_RESET);
+ if (ops->reset_event)
+ ops->reset_event(pdev, NULL);
+
roce_set_bit(int_en, HNS_ROCE_V2_VF_ABN_INT_EN_S, 1);
roce_write(hr_dev, ROCEE_VF_ABN_INT_EN_REG, int_en);
@@ -4970,7 +5406,6 @@ static int hns_roce_mhop_alloc_eq(struct hns_roce_dev *hr_dev,
if (!eq->buf[i])
goto err_dma_alloc_buf;
- memset(eq->buf[i], 0, size);
*(eq->bt_l0 + i) = eq->buf_dma[i];
eq_buf_cnt++;
@@ -5001,12 +5436,11 @@ static int hns_roce_mhop_alloc_eq(struct hns_roce_dev *hr_dev,
* eq->eqe_size;
}
eq->buf[idx] = dma_alloc_coherent(dev, size,
- &(eq->buf_dma[idx]),
- GFP_KERNEL);
+ &(eq->buf_dma[idx]),
+ GFP_KERNEL);
if (!eq->buf[idx])
goto err_dma_alloc_buf;
- memset(eq->buf[idx], 0, size);
*(eq->bt_l1[i] + j) = eq->buf_dma[idx];
eq_buf_cnt++;
@@ -5124,7 +5558,6 @@ static int hns_roce_v2_create_eq(struct hns_roce_dev *hr_dev,
goto err_alloc_buf;
}
- memset(eq->buf_list->buf, 0, buf_chk_sz);
} else {
ret = hns_roce_mhop_alloc_eq(hr_dev, eq);
if (ret) {
@@ -5332,6 +5765,301 @@ static void hns_roce_v2_cleanup_eq_table(struct hns_roce_dev *hr_dev)
destroy_workqueue(hr_dev->irq_workq);
}
+static void hns_roce_v2_write_srqc(struct hns_roce_dev *hr_dev,
+ struct hns_roce_srq *srq, u32 pdn, u16 xrcd,
+ u32 cqn, void *mb_buf, u64 *mtts_wqe,
+ u64 *mtts_idx, dma_addr_t dma_handle_wqe,
+ dma_addr_t dma_handle_idx)
+{
+ struct hns_roce_srq_context *srq_context;
+
+ srq_context = mb_buf;
+ memset(srq_context, 0, sizeof(*srq_context));
+
+ roce_set_field(srq_context->byte_4_srqn_srqst, SRQC_BYTE_4_SRQ_ST_M,
+ SRQC_BYTE_4_SRQ_ST_S, 1);
+
+ roce_set_field(srq_context->byte_4_srqn_srqst,
+ SRQC_BYTE_4_SRQ_WQE_HOP_NUM_M,
+ SRQC_BYTE_4_SRQ_WQE_HOP_NUM_S,
+ (hr_dev->caps.srqwqe_hop_num == HNS_ROCE_HOP_NUM_0 ? 0 :
+ hr_dev->caps.srqwqe_hop_num));
+ roce_set_field(srq_context->byte_4_srqn_srqst,
+ SRQC_BYTE_4_SRQ_SHIFT_M, SRQC_BYTE_4_SRQ_SHIFT_S,
+ ilog2(srq->max));
+
+ roce_set_field(srq_context->byte_4_srqn_srqst, SRQC_BYTE_4_SRQN_M,
+ SRQC_BYTE_4_SRQN_S, srq->srqn);
+
+ roce_set_field(srq_context->byte_8_limit_wl, SRQC_BYTE_8_SRQ_LIMIT_WL_M,
+ SRQC_BYTE_8_SRQ_LIMIT_WL_S, 0);
+
+ roce_set_field(srq_context->byte_12_xrcd, SRQC_BYTE_12_SRQ_XRCD_M,
+ SRQC_BYTE_12_SRQ_XRCD_S, xrcd);
+
+ srq_context->wqe_bt_ba = cpu_to_le32((u32)(dma_handle_wqe >> 3));
+
+ roce_set_field(srq_context->byte_24_wqe_bt_ba,
+ SRQC_BYTE_24_SRQ_WQE_BT_BA_M,
+ SRQC_BYTE_24_SRQ_WQE_BT_BA_S,
+ cpu_to_le32(dma_handle_wqe >> 35));
+
+ roce_set_field(srq_context->byte_28_rqws_pd, SRQC_BYTE_28_PD_M,
+ SRQC_BYTE_28_PD_S, pdn);
+ roce_set_field(srq_context->byte_28_rqws_pd, SRQC_BYTE_28_RQWS_M,
+ SRQC_BYTE_28_RQWS_S, srq->max_gs <= 0 ? 0 :
+ fls(srq->max_gs - 1));
+
+ srq_context->idx_bt_ba = (u32)(dma_handle_idx >> 3);
+ srq_context->idx_bt_ba = cpu_to_le32(srq_context->idx_bt_ba);
+ roce_set_field(srq_context->rsv_idx_bt_ba,
+ SRQC_BYTE_36_SRQ_IDX_BT_BA_M,
+ SRQC_BYTE_36_SRQ_IDX_BT_BA_S,
+ cpu_to_le32(dma_handle_idx >> 35));
+
+ srq_context->idx_cur_blk_addr = (u32)(mtts_idx[0] >> PAGE_ADDR_SHIFT);
+ srq_context->idx_cur_blk_addr =
+ cpu_to_le32(srq_context->idx_cur_blk_addr);
+ roce_set_field(srq_context->byte_44_idxbufpgsz_addr,
+ SRQC_BYTE_44_SRQ_IDX_CUR_BLK_ADDR_M,
+ SRQC_BYTE_44_SRQ_IDX_CUR_BLK_ADDR_S,
+ cpu_to_le32((mtts_idx[0]) >> (32 + PAGE_ADDR_SHIFT)));
+ roce_set_field(srq_context->byte_44_idxbufpgsz_addr,
+ SRQC_BYTE_44_SRQ_IDX_HOP_NUM_M,
+ SRQC_BYTE_44_SRQ_IDX_HOP_NUM_S,
+ hr_dev->caps.idx_hop_num == HNS_ROCE_HOP_NUM_0 ? 0 :
+ hr_dev->caps.idx_hop_num);
+
+ roce_set_field(srq_context->byte_44_idxbufpgsz_addr,
+ SRQC_BYTE_44_SRQ_IDX_BA_PG_SZ_M,
+ SRQC_BYTE_44_SRQ_IDX_BA_PG_SZ_S,
+ hr_dev->caps.idx_ba_pg_sz);
+ roce_set_field(srq_context->byte_44_idxbufpgsz_addr,
+ SRQC_BYTE_44_SRQ_IDX_BUF_PG_SZ_M,
+ SRQC_BYTE_44_SRQ_IDX_BUF_PG_SZ_S,
+ hr_dev->caps.idx_buf_pg_sz);
+
+ srq_context->idx_nxt_blk_addr = (u32)(mtts_idx[1] >> PAGE_ADDR_SHIFT);
+ srq_context->idx_nxt_blk_addr =
+ cpu_to_le32(srq_context->idx_nxt_blk_addr);
+ roce_set_field(srq_context->rsv_idxnxtblkaddr,
+ SRQC_BYTE_52_SRQ_IDX_NXT_BLK_ADDR_M,
+ SRQC_BYTE_52_SRQ_IDX_NXT_BLK_ADDR_S,
+ cpu_to_le32((mtts_idx[1]) >> (32 + PAGE_ADDR_SHIFT)));
+ roce_set_field(srq_context->byte_56_xrc_cqn,
+ SRQC_BYTE_56_SRQ_XRC_CQN_M, SRQC_BYTE_56_SRQ_XRC_CQN_S,
+ cqn);
+ roce_set_field(srq_context->byte_56_xrc_cqn,
+ SRQC_BYTE_56_SRQ_WQE_BA_PG_SZ_M,
+ SRQC_BYTE_56_SRQ_WQE_BA_PG_SZ_S,
+ hr_dev->caps.srqwqe_ba_pg_sz + PG_SHIFT_OFFSET);
+ roce_set_field(srq_context->byte_56_xrc_cqn,
+ SRQC_BYTE_56_SRQ_WQE_BUF_PG_SZ_M,
+ SRQC_BYTE_56_SRQ_WQE_BUF_PG_SZ_S,
+ hr_dev->caps.srqwqe_buf_pg_sz + PG_SHIFT_OFFSET);
+
+ roce_set_bit(srq_context->db_record_addr_record_en,
+ SRQC_BYTE_60_SRQ_RECORD_EN_S, 0);
+}
+
+static int hns_roce_v2_modify_srq(struct ib_srq *ibsrq,
+ struct ib_srq_attr *srq_attr,
+ enum ib_srq_attr_mask srq_attr_mask,
+ struct ib_udata *udata)
+{
+ struct hns_roce_dev *hr_dev = to_hr_dev(ibsrq->device);
+ struct hns_roce_srq *srq = to_hr_srq(ibsrq);
+ struct hns_roce_srq_context *srq_context;
+ struct hns_roce_srq_context *srqc_mask;
+ struct hns_roce_cmd_mailbox *mailbox;
+ int ret;
+
+ if (srq_attr_mask & IB_SRQ_LIMIT) {
+ if (srq_attr->srq_limit >= srq->max)
+ return -EINVAL;
+
+ mailbox = hns_roce_alloc_cmd_mailbox(hr_dev);
+ if (IS_ERR(mailbox))
+ return PTR_ERR(mailbox);
+
+ srq_context = mailbox->buf;
+ srqc_mask = (struct hns_roce_srq_context *)mailbox->buf + 1;
+
+ memset(srqc_mask, 0xff, sizeof(*srqc_mask));
+
+ roce_set_field(srq_context->byte_8_limit_wl,
+ SRQC_BYTE_8_SRQ_LIMIT_WL_M,
+ SRQC_BYTE_8_SRQ_LIMIT_WL_S, srq_attr->srq_limit);
+ roce_set_field(srqc_mask->byte_8_limit_wl,
+ SRQC_BYTE_8_SRQ_LIMIT_WL_M,
+ SRQC_BYTE_8_SRQ_LIMIT_WL_S, 0);
+
+ ret = hns_roce_cmd_mbox(hr_dev, mailbox->dma, 0, srq->srqn, 0,
+ HNS_ROCE_CMD_MODIFY_SRQC,
+ HNS_ROCE_CMD_TIMEOUT_MSECS);
+ hns_roce_free_cmd_mailbox(hr_dev, mailbox);
+ if (ret) {
+ dev_err(hr_dev->dev,
+ "MODIFY SRQ Failed to cmd mailbox.\n");
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
+static int hns_roce_v2_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr)
+{
+ struct hns_roce_dev *hr_dev = to_hr_dev(ibsrq->device);
+ struct hns_roce_srq *srq = to_hr_srq(ibsrq);
+ struct hns_roce_srq_context *srq_context;
+ struct hns_roce_cmd_mailbox *mailbox;
+ int limit_wl;
+ int ret;
+
+ mailbox = hns_roce_alloc_cmd_mailbox(hr_dev);
+ if (IS_ERR(mailbox))
+ return PTR_ERR(mailbox);
+
+ srq_context = mailbox->buf;
+ ret = hns_roce_cmd_mbox(hr_dev, 0, mailbox->dma, srq->srqn, 0,
+ HNS_ROCE_CMD_QUERY_SRQC,
+ HNS_ROCE_CMD_TIMEOUT_MSECS);
+ if (ret) {
+ dev_err(hr_dev->dev, "QUERY SRQ cmd process error\n");
+ goto out;
+ }
+
+ limit_wl = roce_get_field(srq_context->byte_8_limit_wl,
+ SRQC_BYTE_8_SRQ_LIMIT_WL_M,
+ SRQC_BYTE_8_SRQ_LIMIT_WL_S);
+
+ attr->srq_limit = limit_wl;
+ attr->max_wr = srq->max - 1;
+ attr->max_sge = srq->max_gs;
+
+ memcpy(srq_context, mailbox->buf, sizeof(*srq_context));
+
+out:
+ hns_roce_free_cmd_mailbox(hr_dev, mailbox);
+ return ret;
+}
+
+static int find_empty_entry(struct hns_roce_idx_que *idx_que)
+{
+ int bit_num;
+ int i;
+
+ /* bitmap[i] is set zero if all bits are allocated */
+ for (i = 0; idx_que->bitmap[i] == 0; ++i)
+ ;
+ bit_num = ffs(idx_que->bitmap[i]);
+ idx_que->bitmap[i] &= ~(1ULL << (bit_num - 1));
+
+ return i * sizeof(u64) * 8 + (bit_num - 1);
+}
+
+static void fill_idx_queue(struct hns_roce_idx_que *idx_que,
+ int cur_idx, int wqe_idx)
+{
+ unsigned int *addr;
+
+ addr = (unsigned int *)hns_roce_buf_offset(&idx_que->idx_buf,
+ cur_idx * idx_que->entry_sz);
+ *addr = wqe_idx;
+}
+
+static int hns_roce_v2_post_srq_recv(struct ib_srq *ibsrq,
+ const struct ib_recv_wr *wr,
+ const struct ib_recv_wr **bad_wr)
+{
+ struct hns_roce_dev *hr_dev = to_hr_dev(ibsrq->device);
+ struct hns_roce_srq *srq = to_hr_srq(ibsrq);
+ struct hns_roce_v2_wqe_data_seg *dseg;
+ struct hns_roce_v2_db srq_db;
+ unsigned long flags;
+ int ret = 0;
+ int wqe_idx;
+ void *wqe;
+ int nreq;
+ int ind;
+ int i;
+
+ spin_lock_irqsave(&srq->lock, flags);
+
+ ind = srq->head & (srq->max - 1);
+
+ for (nreq = 0; wr; ++nreq, wr = wr->next) {
+ if (unlikely(wr->num_sge > srq->max_gs)) {
+ ret = -EINVAL;
+ *bad_wr = wr;
+ break;
+ }
+
+ if (unlikely(srq->head == srq->tail)) {
+ ret = -ENOMEM;
+ *bad_wr = wr;
+ break;
+ }
+
+ wqe_idx = find_empty_entry(&srq->idx_que);
+ fill_idx_queue(&srq->idx_que, ind, wqe_idx);
+ wqe = get_srq_wqe(srq, wqe_idx);
+ dseg = (struct hns_roce_v2_wqe_data_seg *)wqe;
+
+ for (i = 0; i < wr->num_sge; ++i) {
+ dseg[i].len = cpu_to_le32(wr->sg_list[i].length);
+ dseg[i].lkey = cpu_to_le32(wr->sg_list[i].lkey);
+ dseg[i].addr = cpu_to_le64(wr->sg_list[i].addr);
+ }
+
+ if (i < srq->max_gs) {
+ dseg->len = 0;
+ dseg->lkey = cpu_to_le32(0x100);
+ dseg->addr = 0;
+ }
+
+ srq->wrid[wqe_idx] = wr->wr_id;
+ ind = (ind + 1) & (srq->max - 1);
+ }
+
+ if (likely(nreq)) {
+ srq->head += nreq;
+
+ /*
+ * Make sure that descriptors are written before
+ * doorbell record.
+ */
+ wmb();
+
+ srq_db.byte_4 = HNS_ROCE_V2_SRQ_DB << 24 | srq->srqn;
+ srq_db.parameter = srq->head;
+
+ hns_roce_write64(hr_dev, (__le32 *)&srq_db, srq->db_reg_l);
+
+ }
+
+ spin_unlock_irqrestore(&srq->lock, flags);
+
+ return ret;
+}
+
+static const struct ib_device_ops hns_roce_v2_dev_ops = {
+ .destroy_qp = hns_roce_v2_destroy_qp,
+ .modify_cq = hns_roce_v2_modify_cq,
+ .poll_cq = hns_roce_v2_poll_cq,
+ .post_recv = hns_roce_v2_post_recv,
+ .post_send = hns_roce_v2_post_send,
+ .query_qp = hns_roce_v2_query_qp,
+ .req_notify_cq = hns_roce_v2_req_notify_cq,
+};
+
+static const struct ib_device_ops hns_roce_v2_dev_srq_ops = {
+ .modify_srq = hns_roce_v2_modify_srq,
+ .post_srq_recv = hns_roce_v2_post_srq_recv,
+ .query_srq = hns_roce_v2_query_srq,
+};
+
static const struct hns_roce_hw hns_roce_hw_v2 = {
.cmq_init = hns_roce_v2_cmq_init,
.cmq_exit = hns_roce_v2_cmq_exit,
@@ -5340,6 +6068,7 @@ static const struct hns_roce_hw hns_roce_hw_v2 = {
.hw_exit = hns_roce_v2_exit,
.post_mbox = hns_roce_v2_post_mbox,
.chk_mbox = hns_roce_v2_chk_mbox,
+ .rst_prc_mbox = hns_roce_v2_rst_process_cmd,
.set_gid = hns_roce_v2_set_gid,
.set_mac = hns_roce_v2_set_mac,
.write_mtpt = hns_roce_v2_write_mtpt,
@@ -5352,6 +6081,7 @@ static const struct hns_roce_hw hns_roce_hw_v2 = {
.modify_qp = hns_roce_v2_modify_qp,
.query_qp = hns_roce_v2_query_qp,
.destroy_qp = hns_roce_v2_destroy_qp,
+ .qp_flow_control_init = hns_roce_v2_qp_flow_control_init,
.modify_cq = hns_roce_v2_modify_cq,
.post_send = hns_roce_v2_post_send,
.post_recv = hns_roce_v2_post_recv,
@@ -5359,6 +6089,12 @@ static const struct hns_roce_hw hns_roce_hw_v2 = {
.poll_cq = hns_roce_v2_poll_cq,
.init_eq = hns_roce_v2_init_eq_table,
.cleanup_eq = hns_roce_v2_cleanup_eq_table,
+ .write_srqc = hns_roce_v2_write_srqc,
+ .modify_srq = hns_roce_v2_modify_srq,
+ .query_srq = hns_roce_v2_query_srq,
+ .post_srq_recv = hns_roce_v2_post_srq_recv,
+ .hns_roce_dev_ops = &hns_roce_v2_dev_ops,
+ .hns_roce_dev_srq_ops = &hns_roce_v2_dev_srq_ops,
};
static const struct pci_device_id hns_roce_hw_v2_pci_tbl[] = {
@@ -5376,6 +6112,7 @@ MODULE_DEVICE_TABLE(pci, hns_roce_hw_v2_pci_tbl);
static int hns_roce_hw_v2_get_cfg(struct hns_roce_dev *hr_dev,
struct hnae3_handle *handle)
{
+ struct hns_roce_v2_priv *priv = hr_dev->priv;
const struct pci_device_id *id;
int i;
@@ -5406,15 +6143,18 @@ static int hns_roce_hw_v2_get_cfg(struct hns_roce_dev *hr_dev,
hr_dev->cmd_mod = 1;
hr_dev->loop_idc = 0;
+ hr_dev->reset_cnt = handle->ae_algo->ops->ae_dev_reset_cnt(handle);
+ priv->handle = handle;
+
return 0;
}
-static int hns_roce_hw_v2_init_instance(struct hnae3_handle *handle)
+static int __hns_roce_hw_v2_init_instance(struct hnae3_handle *handle)
{
struct hns_roce_dev *hr_dev;
int ret;
- hr_dev = (struct hns_roce_dev *)ib_alloc_device(sizeof(*hr_dev));
+ hr_dev = ib_alloc_device(hns_roce_dev, ib_dev);
if (!hr_dev)
return -ENOMEM;
@@ -5426,7 +6166,6 @@ static int hns_roce_hw_v2_init_instance(struct hnae3_handle *handle)
hr_dev->pci_dev = handle->pdev;
hr_dev->dev = &handle->pdev->dev;
- handle->priv = hr_dev;
ret = hns_roce_hw_v2_get_cfg(hr_dev, handle);
if (ret) {
@@ -5440,6 +6179,8 @@ static int hns_roce_hw_v2_init_instance(struct hnae3_handle *handle)
goto error_failed_get_cfg;
}
+ handle->priv = hr_dev;
+
return 0;
error_failed_get_cfg:
@@ -5451,7 +6192,7 @@ error_failed_kzalloc:
return ret;
}
-static void hns_roce_hw_v2_uninit_instance(struct hnae3_handle *handle,
+static void __hns_roce_hw_v2_uninit_instance(struct hnae3_handle *handle,
bool reset)
{
struct hns_roce_dev *hr_dev = (struct hns_roce_dev *)handle->priv;
@@ -5459,24 +6200,79 @@ static void hns_roce_hw_v2_uninit_instance(struct hnae3_handle *handle,
if (!hr_dev)
return;
+ handle->priv = NULL;
hns_roce_exit(hr_dev);
kfree(hr_dev->priv);
ib_dealloc_device(&hr_dev->ib_dev);
}
+static int hns_roce_hw_v2_init_instance(struct hnae3_handle *handle)
+{
+ const struct hnae3_ae_ops *ops = handle->ae_algo->ops;
+ struct device *dev = &handle->pdev->dev;
+ int ret;
+
+ handle->rinfo.instance_state = HNS_ROCE_STATE_INIT;
+
+ if (ops->ae_dev_resetting(handle) || ops->get_hw_reset_stat(handle)) {
+ handle->rinfo.instance_state = HNS_ROCE_STATE_NON_INIT;
+ goto reset_chk_err;
+ }
+
+ ret = __hns_roce_hw_v2_init_instance(handle);
+ if (ret) {
+ handle->rinfo.instance_state = HNS_ROCE_STATE_NON_INIT;
+ dev_err(dev, "RoCE instance init failed! ret = %d\n", ret);
+ if (ops->ae_dev_resetting(handle) ||
+ ops->get_hw_reset_stat(handle))
+ goto reset_chk_err;
+ else
+ return ret;
+ }
+
+ handle->rinfo.instance_state = HNS_ROCE_STATE_INITED;
+
+
+ return 0;
+
+reset_chk_err:
+ dev_err(dev, "Device is busy in resetting state.\n"
+ "please retry later.\n");
+
+ return -EBUSY;
+}
+
+static void hns_roce_hw_v2_uninit_instance(struct hnae3_handle *handle,
+ bool reset)
+{
+ if (handle->rinfo.instance_state != HNS_ROCE_STATE_INITED)
+ return;
+
+ handle->rinfo.instance_state = HNS_ROCE_STATE_UNINIT;
+
+ __hns_roce_hw_v2_uninit_instance(handle, reset);
+
+ handle->rinfo.instance_state = HNS_ROCE_STATE_NON_INIT;
+}
static int hns_roce_hw_v2_reset_notify_down(struct hnae3_handle *handle)
{
- struct hns_roce_dev *hr_dev = (struct hns_roce_dev *)handle->priv;
+ struct hns_roce_dev *hr_dev;
struct ib_event event;
- if (!hr_dev) {
- dev_err(&handle->pdev->dev,
- "Input parameter handle->priv is NULL!\n");
- return -EINVAL;
+ if (handle->rinfo.instance_state != HNS_ROCE_STATE_INITED) {
+ set_bit(HNS_ROCE_RST_DIRECT_RETURN, &handle->rinfo.state);
+ return 0;
}
+ handle->rinfo.reset_state = HNS_ROCE_STATE_RST_DOWN;
+ clear_bit(HNS_ROCE_RST_DIRECT_RETURN, &handle->rinfo.state);
+
+ hr_dev = (struct hns_roce_dev *)handle->priv;
+ if (!hr_dev)
+ return 0;
+
hr_dev->active = false;
- hr_dev->is_reset = true;
+ hr_dev->dis_db = true;
event.event = IB_EVENT_DEVICE_FATAL;
event.device = &hr_dev->ib_dev;
@@ -5488,17 +6284,29 @@ static int hns_roce_hw_v2_reset_notify_down(struct hnae3_handle *handle)
static int hns_roce_hw_v2_reset_notify_init(struct hnae3_handle *handle)
{
+ struct device *dev = &handle->pdev->dev;
int ret;
- ret = hns_roce_hw_v2_init_instance(handle);
+ if (test_and_clear_bit(HNS_ROCE_RST_DIRECT_RETURN,
+ &handle->rinfo.state)) {
+ handle->rinfo.reset_state = HNS_ROCE_STATE_RST_INITED;
+ return 0;
+ }
+
+ handle->rinfo.reset_state = HNS_ROCE_STATE_RST_INIT;
+
+ dev_info(&handle->pdev->dev, "In reset process RoCE client reinit.\n");
+ ret = __hns_roce_hw_v2_init_instance(handle);
if (ret) {
/* when reset notify type is HNAE3_INIT_CLIENT In reset notify
* callback function, RoCE Engine reinitialize. If RoCE reinit
* failed, we should inform NIC driver.
*/
handle->priv = NULL;
- dev_err(&handle->pdev->dev,
- "In reset process RoCE reinit failed %d.\n", ret);
+ dev_err(dev, "In reset process RoCE reinit failed %d.\n", ret);
+ } else {
+ handle->rinfo.reset_state = HNS_ROCE_STATE_RST_INITED;
+ dev_info(dev, "Reset done, RoCE client reinit finished.\n");
}
return ret;
@@ -5506,8 +6314,14 @@ static int hns_roce_hw_v2_reset_notify_init(struct hnae3_handle *handle)
static int hns_roce_hw_v2_reset_notify_uninit(struct hnae3_handle *handle)
{
+ if (test_bit(HNS_ROCE_RST_DIRECT_RETURN, &handle->rinfo.state))
+ return 0;
+
+ handle->rinfo.reset_state = HNS_ROCE_STATE_RST_UNINIT;
+ dev_info(&handle->pdev->dev, "In reset process RoCE client uninit.\n");
msleep(100);
- hns_roce_hw_v2_uninit_instance(handle, false);
+ __hns_roce_hw_v2_uninit_instance(handle, false);
+
return 0;
}
diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h
index 8bc820635bbd..f1f1b75812f9 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h
@@ -36,6 +36,7 @@
#include <linux/bitops.h>
#define HNS_ROCE_VF_QPC_BT_NUM 256
+#define HNS_ROCE_VF_SCCC_BT_NUM 64
#define HNS_ROCE_VF_SRQC_BT_NUM 64
#define HNS_ROCE_VF_CQC_BT_NUM 64
#define HNS_ROCE_VF_MPT_BT_NUM 64
@@ -44,12 +45,20 @@
#define HNS_ROCE_VF_SGID_NUM 32
#define HNS_ROCE_VF_SL_NUM 8
-#define HNS_ROCE_V2_MAX_QP_NUM 0x2000
+#define HNS_ROCE_V2_MAX_QP_NUM 0x100000
+#define HNS_ROCE_V2_MAX_QPC_TIMER_NUM 0x200
#define HNS_ROCE_V2_MAX_WQE_NUM 0x8000
-#define HNS_ROCE_V2_MAX_CQ_NUM 0x8000
+#define HNS_ROCE_V2_MAX_SRQ 0x100000
+#define HNS_ROCE_V2_MAX_SRQ_WR 0x8000
+#define HNS_ROCE_V2_MAX_SRQ_SGE 0x100
+#define HNS_ROCE_V2_MAX_CQ_NUM 0x100000
+#define HNS_ROCE_V2_MAX_CQC_TIMER_NUM 0x100
+#define HNS_ROCE_V2_MAX_SRQ_NUM 0x100000
#define HNS_ROCE_V2_MAX_CQE_NUM 0x10000
+#define HNS_ROCE_V2_MAX_SRQWQE_NUM 0x8000
#define HNS_ROCE_V2_MAX_RQ_SGE_NUM 0x100
#define HNS_ROCE_V2_MAX_SQ_SGE_NUM 0xff
+#define HNS_ROCE_V2_MAX_SRQ_SGE_NUM 0x100
#define HNS_ROCE_V2_MAX_EXTEND_SGE_NUM 0x200000
#define HNS_ROCE_V2_MAX_SQ_INLINE 0x20
#define HNS_ROCE_V2_UAR_NUM 256
@@ -58,9 +67,11 @@
#define HNS_ROCE_V2_COMP_VEC_NUM 63
#define HNS_ROCE_V2_AEQE_VEC_NUM 1
#define HNS_ROCE_V2_ABNORMAL_VEC_NUM 1
-#define HNS_ROCE_V2_MAX_MTPT_NUM 0x8000
+#define HNS_ROCE_V2_MAX_MTPT_NUM 0x100000
#define HNS_ROCE_V2_MAX_MTT_SEGS 0x1000000
#define HNS_ROCE_V2_MAX_CQE_SEGS 0x1000000
+#define HNS_ROCE_V2_MAX_SRQWQE_SEGS 0x1000000
+#define HNS_ROCE_V2_MAX_IDX_SEGS 0x1000000
#define HNS_ROCE_V2_MAX_PD_NUM 0x1000000
#define HNS_ROCE_V2_MAX_QP_INIT_RDMA 128
#define HNS_ROCE_V2_MAX_QP_DEST_RDMA 128
@@ -71,9 +82,13 @@
#define HNS_ROCE_V2_IRRL_ENTRY_SZ 64
#define HNS_ROCE_V2_TRRL_ENTRY_SZ 48
#define HNS_ROCE_V2_CQC_ENTRY_SZ 64
+#define HNS_ROCE_V2_SRQC_ENTRY_SZ 64
#define HNS_ROCE_V2_MTPT_ENTRY_SZ 64
#define HNS_ROCE_V2_MTT_ENTRY_SZ 64
#define HNS_ROCE_V2_CQE_ENTRY_SIZE 32
+#define HNS_ROCE_V2_SCCC_ENTRY_SZ 32
+#define HNS_ROCE_V2_QPC_TIMER_ENTRY_SZ 4096
+#define HNS_ROCE_V2_CQC_TIMER_ENTRY_SZ 4096
#define HNS_ROCE_V2_PAGE_SIZE_SUPPORTED 0xFFFFF000
#define HNS_ROCE_V2_MAX_INNER_MTPT_NUM 2
#define HNS_ROCE_INVALID_LKEY 0x100
@@ -81,11 +96,16 @@
#define HNS_ROCE_V2_UC_RC_SGE_NUM_IN_WQE 2
#define HNS_ROCE_V2_RSV_QPS 8
+#define HNS_ROCE_V2_HW_RST_TIMEOUT 1000
+
#define HNS_ROCE_CONTEXT_HOP_NUM 1
+#define HNS_ROCE_SCCC_HOP_NUM 1
#define HNS_ROCE_MTT_HOP_NUM 1
#define HNS_ROCE_CQE_HOP_NUM 1
+#define HNS_ROCE_SRQWQE_HOP_NUM 1
#define HNS_ROCE_PBL_HOP_NUM 2
#define HNS_ROCE_EQE_HOP_NUM 2
+#define HNS_ROCE_IDX_HOP_NUM 1
#define HNS_ROCE_V2_GID_INDEX_NUM 256
@@ -109,10 +129,14 @@
#define HNS_ROCE_CMQ_EN_B 16
#define HNS_ROCE_CMQ_ENABLE BIT(HNS_ROCE_CMQ_EN_B)
+#define HNS_ROCE_CMQ_SCC_CLR_DONE_CNT 5
+
#define check_whether_last_step(hop_num, step_idx) \
((step_idx == 0 && hop_num == HNS_ROCE_HOP_NUM_0) || \
(step_idx == 1 && hop_num == 1) || \
(step_idx == 2 && hop_num == 2))
+#define HNS_ICL_SWITCH_CMD_ROCEE_SEL_SHIFT 0
+#define HNS_ICL_SWITCH_CMD_ROCEE_SEL BIT(HNS_ICL_SWITCH_CMD_ROCEE_SEL_SHIFT)
#define CMD_CSQ_DESC_NUM 1024
#define CMD_CRQ_DESC_NUM 1024
@@ -211,9 +235,16 @@ enum hns_roce_opcode_type {
HNS_ROCE_OPC_ALLOC_VF_RES = 0x8401,
HNS_ROCE_OPC_CFG_EXT_LLM = 0x8403,
HNS_ROCE_OPC_CFG_TMOUT_LLM = 0x8404,
+ HNS_ROCE_OPC_QUERY_PF_TIMER_RES = 0x8406,
HNS_ROCE_OPC_CFG_SGID_TB = 0x8500,
HNS_ROCE_OPC_CFG_SMAC_TB = 0x8501,
+ HNS_ROCE_OPC_POST_MB = 0x8504,
+ HNS_ROCE_OPC_QUERY_MB_ST = 0x8505,
HNS_ROCE_OPC_CFG_BT_ATTR = 0x8506,
+ HNS_ROCE_OPC_CLR_SCCC = 0x8509,
+ HNS_ROCE_OPC_QUERY_SCCC = 0x850a,
+ HNS_ROCE_OPC_RESET_SCCC = 0x850b,
+ HNS_SWITCH_PARAMETER_CFG = 0x1033,
};
enum {
@@ -325,6 +356,90 @@ struct hns_roce_v2_cq_context {
#define V2_CQC_BYTE_64_SE_CQE_IDX_S 0
#define V2_CQC_BYTE_64_SE_CQE_IDX_M GENMASK(23, 0)
+struct hns_roce_srq_context {
+ __le32 byte_4_srqn_srqst;
+ __le32 byte_8_limit_wl;
+ __le32 byte_12_xrcd;
+ __le32 byte_16_pi_ci;
+ __le32 wqe_bt_ba;
+ __le32 byte_24_wqe_bt_ba;
+ __le32 byte_28_rqws_pd;
+ __le32 idx_bt_ba;
+ __le32 rsv_idx_bt_ba;
+ __le32 idx_cur_blk_addr;
+ __le32 byte_44_idxbufpgsz_addr;
+ __le32 idx_nxt_blk_addr;
+ __le32 rsv_idxnxtblkaddr;
+ __le32 byte_56_xrc_cqn;
+ __le32 db_record_addr_record_en;
+ __le32 db_record_addr;
+};
+
+#define SRQC_BYTE_4_SRQ_ST_S 0
+#define SRQC_BYTE_4_SRQ_ST_M GENMASK(1, 0)
+
+#define SRQC_BYTE_4_SRQ_WQE_HOP_NUM_S 2
+#define SRQC_BYTE_4_SRQ_WQE_HOP_NUM_M GENMASK(3, 2)
+
+#define SRQC_BYTE_4_SRQ_SHIFT_S 4
+#define SRQC_BYTE_4_SRQ_SHIFT_M GENMASK(7, 4)
+
+#define SRQC_BYTE_4_SRQN_S 8
+#define SRQC_BYTE_4_SRQN_M GENMASK(31, 8)
+
+#define SRQC_BYTE_8_SRQ_LIMIT_WL_S 0
+#define SRQC_BYTE_8_SRQ_LIMIT_WL_M GENMASK(15, 0)
+
+#define SRQC_BYTE_12_SRQ_XRCD_S 0
+#define SRQC_BYTE_12_SRQ_XRCD_M GENMASK(23, 0)
+
+#define SRQC_BYTE_16_SRQ_PRODUCER_IDX_S 0
+#define SRQC_BYTE_16_SRQ_PRODUCER_IDX_M GENMASK(15, 0)
+
+#define SRQC_BYTE_16_SRQ_CONSUMER_IDX_S 0
+#define SRQC_BYTE_16_SRQ_CONSUMER_IDX_M GENMASK(31, 16)
+
+#define SRQC_BYTE_24_SRQ_WQE_BT_BA_S 0
+#define SRQC_BYTE_24_SRQ_WQE_BT_BA_M GENMASK(28, 0)
+
+#define SRQC_BYTE_28_PD_S 0
+#define SRQC_BYTE_28_PD_M GENMASK(23, 0)
+
+#define SRQC_BYTE_28_RQWS_S 24
+#define SRQC_BYTE_28_RQWS_M GENMASK(27, 24)
+
+#define SRQC_BYTE_36_SRQ_IDX_BT_BA_S 0
+#define SRQC_BYTE_36_SRQ_IDX_BT_BA_M GENMASK(28, 0)
+
+#define SRQC_BYTE_44_SRQ_IDX_CUR_BLK_ADDR_S 0
+#define SRQC_BYTE_44_SRQ_IDX_CUR_BLK_ADDR_M GENMASK(19, 0)
+
+#define SRQC_BYTE_44_SRQ_IDX_HOP_NUM_S 22
+#define SRQC_BYTE_44_SRQ_IDX_HOP_NUM_M GENMASK(23, 22)
+
+#define SRQC_BYTE_44_SRQ_IDX_BA_PG_SZ_S 24
+#define SRQC_BYTE_44_SRQ_IDX_BA_PG_SZ_M GENMASK(27, 24)
+
+#define SRQC_BYTE_44_SRQ_IDX_BUF_PG_SZ_S 28
+#define SRQC_BYTE_44_SRQ_IDX_BUF_PG_SZ_M GENMASK(31, 28)
+
+#define SRQC_BYTE_52_SRQ_IDX_NXT_BLK_ADDR_S 0
+#define SRQC_BYTE_52_SRQ_IDX_NXT_BLK_ADDR_M GENMASK(19, 0)
+
+#define SRQC_BYTE_56_SRQ_XRC_CQN_S 0
+#define SRQC_BYTE_56_SRQ_XRC_CQN_M GENMASK(23, 0)
+
+#define SRQC_BYTE_56_SRQ_WQE_BA_PG_SZ_S 24
+#define SRQC_BYTE_56_SRQ_WQE_BA_PG_SZ_M GENMASK(27, 24)
+
+#define SRQC_BYTE_56_SRQ_WQE_BUF_PG_SZ_S 28
+#define SRQC_BYTE_56_SRQ_WQE_BUF_PG_SZ_M GENMASK(31, 28)
+
+#define SRQC_BYTE_60_SRQ_RECORD_EN_S 0
+
+#define SRQC_BYTE_60_SRQ_DB_RECORD_ADDR_S 1
+#define SRQC_BYTE_60_SRQ_DB_RECORD_ADDR_M GENMASK(31, 1)
+
enum{
V2_MPT_ST_VALID = 0x1,
V2_MPT_ST_FREE = 0x2,
@@ -1200,7 +1315,8 @@ struct hns_roce_pf_res_b {
__le32 smac_idx_num;
__le32 sgid_idx_num;
__le32 qid_idx_sl_num;
- __le32 rsv[2];
+ __le32 sccc_bt_idx_num;
+ __le32 rsv;
};
#define PF_RES_DATA_1_PF_SMAC_IDX_S 0
@@ -1221,6 +1337,31 @@ struct hns_roce_pf_res_b {
#define PF_RES_DATA_3_PF_SL_NUM_S 16
#define PF_RES_DATA_3_PF_SL_NUM_M GENMASK(26, 16)
+#define PF_RES_DATA_4_PF_SCCC_BT_IDX_S 0
+#define PF_RES_DATA_4_PF_SCCC_BT_IDX_M GENMASK(8, 0)
+
+#define PF_RES_DATA_4_PF_SCCC_BT_NUM_S 9
+#define PF_RES_DATA_4_PF_SCCC_BT_NUM_M GENMASK(17, 9)
+
+struct hns_roce_pf_timer_res_a {
+ __le32 rsv0;
+ __le32 qpc_timer_bt_idx_num;
+ __le32 cqc_timer_bt_idx_num;
+ __le32 rsv[3];
+};
+
+#define PF_RES_DATA_1_PF_QPC_TIMER_BT_IDX_S 0
+#define PF_RES_DATA_1_PF_QPC_TIMER_BT_IDX_M GENMASK(11, 0)
+
+#define PF_RES_DATA_1_PF_QPC_TIMER_BT_NUM_S 16
+#define PF_RES_DATA_1_PF_QPC_TIMER_BT_NUM_M GENMASK(28, 16)
+
+#define PF_RES_DATA_2_PF_CQC_TIMER_BT_IDX_S 0
+#define PF_RES_DATA_2_PF_CQC_TIMER_BT_IDX_M GENMASK(10, 0)
+
+#define PF_RES_DATA_2_PF_CQC_TIMER_BT_NUM_S 16
+#define PF_RES_DATA_2_PF_CQC_TIMER_BT_NUM_M GENMASK(27, 16)
+
struct hns_roce_vf_res_a {
__le32 vf_id;
__le32 vf_qpc_bt_idx_num;
@@ -1265,7 +1406,8 @@ struct hns_roce_vf_res_b {
__le32 vf_smac_idx_num;
__le32 vf_sgid_idx_num;
__le32 vf_qid_idx_sl_num;
- __le32 rsv[2];
+ __le32 vf_sccc_idx_num;
+ __le32 rsv1;
};
#define VF_RES_B_DATA_0_VF_ID_S 0
@@ -1289,12 +1431,49 @@ struct hns_roce_vf_res_b {
#define VF_RES_B_DATA_3_VF_SL_NUM_S 16
#define VF_RES_B_DATA_3_VF_SL_NUM_M GENMASK(19, 16)
+#define VF_RES_B_DATA_4_VF_SCCC_BT_IDX_S 0
+#define VF_RES_B_DATA_4_VF_SCCC_BT_IDX_M GENMASK(8, 0)
+
+#define VF_RES_B_DATA_4_VF_SCCC_BT_NUM_S 9
+#define VF_RES_B_DATA_4_VF_SCCC_BT_NUM_M GENMASK(17, 9)
+
+struct hns_roce_vf_switch {
+ __le32 rocee_sel;
+ __le32 fun_id;
+ __le32 cfg;
+ __le32 resv1;
+ __le32 resv2;
+ __le32 resv3;
+};
+
+#define VF_SWITCH_DATA_FUN_ID_VF_ID_S 3
+#define VF_SWITCH_DATA_FUN_ID_VF_ID_M GENMASK(10, 3)
+
+#define VF_SWITCH_DATA_CFG_ALW_LPBK_S 1
+#define VF_SWITCH_DATA_CFG_ALW_LCL_LPBK_S 2
+#define VF_SWITCH_DATA_CFG_ALW_DST_OVRD_S 3
+
+struct hns_roce_post_mbox {
+ __le32 in_param_l;
+ __le32 in_param_h;
+ __le32 out_param_l;
+ __le32 out_param_h;
+ __le32 cmd_tag;
+ __le32 token_event_en;
+};
+
+struct hns_roce_mbox_status {
+ __le32 mb_status_hw_run;
+ __le32 rsv[5];
+};
+
struct hns_roce_cfg_bt_attr {
__le32 vf_qpc_cfg;
__le32 vf_srqc_cfg;
__le32 vf_cqc_cfg;
__le32 vf_mpt_cfg;
- __le32 rsv[2];
+ __le32 vf_sccc_cfg;
+ __le32 rsv;
};
#define CFG_BT_ATTR_DATA_0_VF_QPC_BA_PGSZ_S 0
@@ -1333,6 +1512,15 @@ struct hns_roce_cfg_bt_attr {
#define CFG_BT_ATTR_DATA_3_VF_MPT_HOPNUM_S 8
#define CFG_BT_ATTR_DATA_3_VF_MPT_HOPNUM_M GENMASK(9, 8)
+#define CFG_BT_ATTR_DATA_4_VF_SCCC_BA_PGSZ_S 0
+#define CFG_BT_ATTR_DATA_4_VF_SCCC_BA_PGSZ_M GENMASK(3, 0)
+
+#define CFG_BT_ATTR_DATA_4_VF_SCCC_BUF_PGSZ_S 4
+#define CFG_BT_ATTR_DATA_4_VF_SCCC_BUF_PGSZ_M GENMASK(7, 4)
+
+#define CFG_BT_ATTR_DATA_4_VF_SCCC_HOPNUM_S 8
+#define CFG_BT_ATTR_DATA_4_VF_SCCC_HOPNUM_M GENMASK(9, 8)
+
struct hns_roce_cfg_sgid_tb {
__le32 table_idx_rsv;
__le32 vf_sgid_l;
@@ -1372,18 +1560,6 @@ struct hns_roce_cmq_desc {
#define HNS_ROCE_HW_RUN_BIT_SHIFT 31
#define HNS_ROCE_HW_MB_STATUS_MASK 0xFF
-#define HNS_ROCE_VF_MB4_TAG_MASK 0xFFFFFF00
-#define HNS_ROCE_VF_MB4_TAG_SHIFT 8
-
-#define HNS_ROCE_VF_MB4_CMD_MASK 0xFF
-#define HNS_ROCE_VF_MB4_CMD_SHIFT 0
-
-#define HNS_ROCE_VF_MB5_EVENT_MASK 0x10000
-#define HNS_ROCE_VF_MB5_EVENT_SHIFT 16
-
-#define HNS_ROCE_VF_MB5_TOKEN_MASK 0xFFFF
-#define HNS_ROCE_VF_MB5_TOKEN_SHIFT 0
-
struct hns_roce_v2_cmq_ring {
dma_addr_t desc_dma_addr;
struct hns_roce_cmq_desc *desc;
@@ -1428,6 +1604,7 @@ struct hns_roce_link_table_entry {
#define HNS_ROCE_LINK_TABLE_NXT_PTR_M GENMASK(31, 20)
struct hns_roce_v2_priv {
+ struct hnae3_handle *handle;
struct hns_roce_v2_cmq cmq;
struct hns_roce_link_table tsq;
struct hns_roce_link_table tpq;
@@ -1612,4 +1789,25 @@ struct hns_roce_wqe_atomic_seg {
__le64 cmp_data;
};
+struct hns_roce_sccc_clr {
+ __le32 qpn;
+ __le32 rsv[5];
+};
+
+struct hns_roce_sccc_clr_done {
+ __le32 clr_done;
+ __le32 rsv[5];
+};
+
+static inline void hns_roce_write64(struct hns_roce_dev *hr_dev, __le32 val[2],
+ void __iomem *dest)
+{
+ struct hns_roce_v2_priv *priv = (struct hns_roce_v2_priv *)hr_dev->priv;
+ struct hnae3_handle *handle = priv->handle;
+ const struct hnae3_ae_ops *ops = handle->ae_algo->ops;
+
+ if (!hr_dev->dis_db && !ops->get_hw_reset_stat(handle))
+ hns_roce_write64_k(val, dest);
+}
+
#endif
diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c
index 1b3ee514f2ef..c929125da84b 100644
--- a/drivers/infiniband/hw/hns/hns_roce_main.c
+++ b/drivers/infiniband/hw/hns/hns_roce_main.c
@@ -220,6 +220,16 @@ static int hns_roce_query_device(struct ib_device *ib_dev,
IB_ATOMIC_HCA : IB_ATOMIC_NONE;
props->max_pkeys = 1;
props->local_ca_ack_delay = hr_dev->caps.local_ca_ack_delay;
+ if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_SRQ) {
+ props->max_srq = hr_dev->caps.max_srqs;
+ props->max_srq_wr = hr_dev->caps.max_srq_wrs;
+ props->max_srq_sge = hr_dev->caps.max_srq_sges;
+ }
+
+ if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_FRMR) {
+ props->device_cap_flags |= IB_DEVICE_MEM_MGT_EXTENSIONS;
+ props->max_fast_reg_page_list_len = HNS_ROCE_FRMR_MAX_PA;
+ }
return 0;
}
@@ -325,23 +335,19 @@ static int hns_roce_modify_port(struct ib_device *ib_dev, u8 port_num, int mask,
return 0;
}
-static struct ib_ucontext *hns_roce_alloc_ucontext(struct ib_device *ib_dev,
- struct ib_udata *udata)
+static int hns_roce_alloc_ucontext(struct ib_ucontext *uctx,
+ struct ib_udata *udata)
{
int ret = 0;
- struct hns_roce_ucontext *context;
+ struct hns_roce_ucontext *context = to_hr_ucontext(uctx);
struct hns_roce_ib_alloc_ucontext_resp resp = {};
- struct hns_roce_dev *hr_dev = to_hr_dev(ib_dev);
+ struct hns_roce_dev *hr_dev = to_hr_dev(uctx->device);
if (!hr_dev->active)
- return ERR_PTR(-EAGAIN);
+ return -EAGAIN;
resp.qp_tab_size = hr_dev->caps.num_qps;
- context = kmalloc(sizeof(*context), GFP_KERNEL);
- if (!context)
- return ERR_PTR(-ENOMEM);
-
ret = hns_roce_uar_alloc(hr_dev, &context->uar);
if (ret)
goto error_fail_uar_alloc;
@@ -355,25 +361,20 @@ static struct ib_ucontext *hns_roce_alloc_ucontext(struct ib_device *ib_dev,
if (ret)
goto error_fail_copy_to_udata;
- return &context->ibucontext;
+ return 0;
error_fail_copy_to_udata:
hns_roce_uar_free(hr_dev, &context->uar);
error_fail_uar_alloc:
- kfree(context);
-
- return ERR_PTR(ret);
+ return ret;
}
-static int hns_roce_dealloc_ucontext(struct ib_ucontext *ibcontext)
+static void hns_roce_dealloc_ucontext(struct ib_ucontext *ibcontext)
{
struct hns_roce_ucontext *context = to_hr_ucontext(ibcontext);
hns_roce_uar_free(to_hr_dev(ibcontext->device), &context->uar);
- kfree(context);
-
- return 0;
}
static int hns_roce_mmap(struct ib_ucontext *context,
@@ -440,6 +441,56 @@ static void hns_roce_unregister_device(struct hns_roce_dev *hr_dev)
ib_unregister_device(&hr_dev->ib_dev);
}
+static const struct ib_device_ops hns_roce_dev_ops = {
+ .add_gid = hns_roce_add_gid,
+ .alloc_pd = hns_roce_alloc_pd,
+ .alloc_ucontext = hns_roce_alloc_ucontext,
+ .create_ah = hns_roce_create_ah,
+ .create_cq = hns_roce_ib_create_cq,
+ .create_qp = hns_roce_create_qp,
+ .dealloc_pd = hns_roce_dealloc_pd,
+ .dealloc_ucontext = hns_roce_dealloc_ucontext,
+ .del_gid = hns_roce_del_gid,
+ .dereg_mr = hns_roce_dereg_mr,
+ .destroy_ah = hns_roce_destroy_ah,
+ .destroy_cq = hns_roce_ib_destroy_cq,
+ .disassociate_ucontext = hns_roce_disassociate_ucontext,
+ .get_dma_mr = hns_roce_get_dma_mr,
+ .get_link_layer = hns_roce_get_link_layer,
+ .get_netdev = hns_roce_get_netdev,
+ .get_port_immutable = hns_roce_port_immutable,
+ .mmap = hns_roce_mmap,
+ .modify_device = hns_roce_modify_device,
+ .modify_port = hns_roce_modify_port,
+ .modify_qp = hns_roce_modify_qp,
+ .query_ah = hns_roce_query_ah,
+ .query_device = hns_roce_query_device,
+ .query_pkey = hns_roce_query_pkey,
+ .query_port = hns_roce_query_port,
+ .reg_user_mr = hns_roce_reg_user_mr,
+ INIT_RDMA_OBJ_SIZE(ib_pd, hns_roce_pd, ibpd),
+ INIT_RDMA_OBJ_SIZE(ib_ucontext, hns_roce_ucontext, ibucontext),
+};
+
+static const struct ib_device_ops hns_roce_dev_mr_ops = {
+ .rereg_user_mr = hns_roce_rereg_user_mr,
+};
+
+static const struct ib_device_ops hns_roce_dev_mw_ops = {
+ .alloc_mw = hns_roce_alloc_mw,
+ .dealloc_mw = hns_roce_dealloc_mw,
+};
+
+static const struct ib_device_ops hns_roce_dev_frmr_ops = {
+ .alloc_mr = hns_roce_alloc_mr,
+ .map_mr_sg = hns_roce_map_mr_sg,
+};
+
+static const struct ib_device_ops hns_roce_dev_srq_ops = {
+ .create_srq = hns_roce_create_srq,
+ .destroy_srq = hns_roce_destroy_srq,
+};
+
static int hns_roce_register_device(struct hns_roce_dev *hr_dev)
{
int ret;
@@ -479,74 +530,39 @@ static int hns_roce_register_device(struct hns_roce_dev *hr_dev)
ib_dev->uverbs_ex_cmd_mask |=
(1ULL << IB_USER_VERBS_EX_CMD_MODIFY_CQ);
- /* HCA||device||port */
- ib_dev->modify_device = hns_roce_modify_device;
- ib_dev->query_device = hns_roce_query_device;
- ib_dev->query_port = hns_roce_query_port;
- ib_dev->modify_port = hns_roce_modify_port;
- ib_dev->get_link_layer = hns_roce_get_link_layer;
- ib_dev->get_netdev = hns_roce_get_netdev;
- ib_dev->add_gid = hns_roce_add_gid;
- ib_dev->del_gid = hns_roce_del_gid;
- ib_dev->query_pkey = hns_roce_query_pkey;
- ib_dev->alloc_ucontext = hns_roce_alloc_ucontext;
- ib_dev->dealloc_ucontext = hns_roce_dealloc_ucontext;
- ib_dev->mmap = hns_roce_mmap;
-
- /* PD */
- ib_dev->alloc_pd = hns_roce_alloc_pd;
- ib_dev->dealloc_pd = hns_roce_dealloc_pd;
-
- /* AH */
- ib_dev->create_ah = hns_roce_create_ah;
- ib_dev->query_ah = hns_roce_query_ah;
- ib_dev->destroy_ah = hns_roce_destroy_ah;
-
- /* QP */
- ib_dev->create_qp = hns_roce_create_qp;
- ib_dev->modify_qp = hns_roce_modify_qp;
- ib_dev->query_qp = hr_dev->hw->query_qp;
- ib_dev->destroy_qp = hr_dev->hw->destroy_qp;
- ib_dev->post_send = hr_dev->hw->post_send;
- ib_dev->post_recv = hr_dev->hw->post_recv;
-
- /* CQ */
- ib_dev->create_cq = hns_roce_ib_create_cq;
- ib_dev->modify_cq = hr_dev->hw->modify_cq;
- ib_dev->destroy_cq = hns_roce_ib_destroy_cq;
- ib_dev->req_notify_cq = hr_dev->hw->req_notify_cq;
- ib_dev->poll_cq = hr_dev->hw->poll_cq;
-
- /* MR */
- ib_dev->get_dma_mr = hns_roce_get_dma_mr;
- ib_dev->reg_user_mr = hns_roce_reg_user_mr;
- ib_dev->dereg_mr = hns_roce_dereg_mr;
if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_REREG_MR) {
- ib_dev->rereg_user_mr = hns_roce_rereg_user_mr;
ib_dev->uverbs_cmd_mask |= (1ULL << IB_USER_VERBS_CMD_REREG_MR);
+ ib_set_device_ops(ib_dev, &hns_roce_dev_mr_ops);
}
/* MW */
if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_MW) {
- ib_dev->alloc_mw = hns_roce_alloc_mw;
- ib_dev->dealloc_mw = hns_roce_dealloc_mw;
ib_dev->uverbs_cmd_mask |=
(1ULL << IB_USER_VERBS_CMD_ALLOC_MW) |
(1ULL << IB_USER_VERBS_CMD_DEALLOC_MW);
+ ib_set_device_ops(ib_dev, &hns_roce_dev_mw_ops);
}
/* FRMR */
- if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_FRMR) {
- ib_dev->alloc_mr = hns_roce_alloc_mr;
- ib_dev->map_mr_sg = hns_roce_map_mr_sg;
- }
+ if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_FRMR)
+ ib_set_device_ops(ib_dev, &hns_roce_dev_frmr_ops);
- /* OTHERS */
- ib_dev->get_port_immutable = hns_roce_port_immutable;
- ib_dev->disassociate_ucontext = hns_roce_disassociate_ucontext;
+ /* SRQ */
+ if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_SRQ) {
+ ib_dev->uverbs_cmd_mask |=
+ (1ULL << IB_USER_VERBS_CMD_CREATE_SRQ) |
+ (1ULL << IB_USER_VERBS_CMD_MODIFY_SRQ) |
+ (1ULL << IB_USER_VERBS_CMD_QUERY_SRQ) |
+ (1ULL << IB_USER_VERBS_CMD_DESTROY_SRQ) |
+ (1ULL << IB_USER_VERBS_CMD_POST_SRQ_RECV);
+ ib_set_device_ops(ib_dev, &hns_roce_dev_srq_ops);
+ ib_set_device_ops(ib_dev, hr_dev->hw->hns_roce_dev_srq_ops);
+ }
ib_dev->driver_id = RDMA_DRIVER_HNS;
- ret = ib_register_device(ib_dev, "hns_%d", NULL);
+ ib_set_device_ops(ib_dev, hr_dev->hw->hns_roce_dev_ops);
+ ib_set_device_ops(ib_dev, &hns_roce_dev_ops);
+ ret = ib_register_device(ib_dev, "hns_%d");
if (ret) {
dev_err(dev, "ib_register_device failed!\n");
return ret;
@@ -646,8 +662,112 @@ static int hns_roce_init_hem(struct hns_roce_dev *hr_dev)
goto err_unmap_trrl;
}
+ if (hr_dev->caps.srqc_entry_sz) {
+ ret = hns_roce_init_hem_table(hr_dev, &hr_dev->srq_table.table,
+ HEM_TYPE_SRQC,
+ hr_dev->caps.srqc_entry_sz,
+ hr_dev->caps.num_srqs, 1);
+ if (ret) {
+ dev_err(dev,
+ "Failed to init SRQ context memory, aborting.\n");
+ goto err_unmap_cq;
+ }
+ }
+
+ if (hr_dev->caps.num_srqwqe_segs) {
+ ret = hns_roce_init_hem_table(hr_dev,
+ &hr_dev->mr_table.mtt_srqwqe_table,
+ HEM_TYPE_SRQWQE,
+ hr_dev->caps.mtt_entry_sz,
+ hr_dev->caps.num_srqwqe_segs, 1);
+ if (ret) {
+ dev_err(dev,
+ "Failed to init MTT srqwqe memory, aborting.\n");
+ goto err_unmap_srq;
+ }
+ }
+
+ if (hr_dev->caps.num_idx_segs) {
+ ret = hns_roce_init_hem_table(hr_dev,
+ &hr_dev->mr_table.mtt_idx_table,
+ HEM_TYPE_IDX,
+ hr_dev->caps.idx_entry_sz,
+ hr_dev->caps.num_idx_segs, 1);
+ if (ret) {
+ dev_err(dev,
+ "Failed to init MTT idx memory, aborting.\n");
+ goto err_unmap_srqwqe;
+ }
+ }
+
+ if (hr_dev->caps.sccc_entry_sz) {
+ ret = hns_roce_init_hem_table(hr_dev,
+ &hr_dev->qp_table.sccc_table,
+ HEM_TYPE_SCCC,
+ hr_dev->caps.sccc_entry_sz,
+ hr_dev->caps.num_qps, 1);
+ if (ret) {
+ dev_err(dev,
+ "Failed to init SCC context memory, aborting.\n");
+ goto err_unmap_idx;
+ }
+ }
+
+ if (hr_dev->caps.qpc_timer_entry_sz) {
+ ret = hns_roce_init_hem_table(hr_dev,
+ &hr_dev->qpc_timer_table,
+ HEM_TYPE_QPC_TIMER,
+ hr_dev->caps.qpc_timer_entry_sz,
+ hr_dev->caps.num_qpc_timer, 1);
+ if (ret) {
+ dev_err(dev,
+ "Failed to init QPC timer memory, aborting.\n");
+ goto err_unmap_ctx;
+ }
+ }
+
+ if (hr_dev->caps.cqc_timer_entry_sz) {
+ ret = hns_roce_init_hem_table(hr_dev,
+ &hr_dev->cqc_timer_table,
+ HEM_TYPE_CQC_TIMER,
+ hr_dev->caps.cqc_timer_entry_sz,
+ hr_dev->caps.num_cqc_timer, 1);
+ if (ret) {
+ dev_err(dev,
+ "Failed to init CQC timer memory, aborting.\n");
+ goto err_unmap_qpc_timer;
+ }
+ }
+
return 0;
+err_unmap_qpc_timer:
+ if (hr_dev->caps.qpc_timer_entry_sz)
+ hns_roce_cleanup_hem_table(hr_dev,
+ &hr_dev->qpc_timer_table);
+
+err_unmap_ctx:
+ if (hr_dev->caps.sccc_entry_sz)
+ hns_roce_cleanup_hem_table(hr_dev,
+ &hr_dev->qp_table.sccc_table);
+
+err_unmap_idx:
+ if (hr_dev->caps.num_idx_segs)
+ hns_roce_cleanup_hem_table(hr_dev,
+ &hr_dev->mr_table.mtt_idx_table);
+
+err_unmap_srqwqe:
+ if (hr_dev->caps.num_srqwqe_segs)
+ hns_roce_cleanup_hem_table(hr_dev,
+ &hr_dev->mr_table.mtt_srqwqe_table);
+
+err_unmap_srq:
+ if (hr_dev->caps.srqc_entry_sz)
+ hns_roce_cleanup_hem_table(hr_dev, &hr_dev->srq_table.table);
+
+err_unmap_cq:
+ hns_roce_cleanup_hem_table(hr_dev, &hr_dev->cq_table.table);
+
err_unmap_trrl:
if (hr_dev->caps.trrl_entry_sz)
hns_roce_cleanup_hem_table(hr_dev,
@@ -727,8 +847,21 @@ static int hns_roce_setup_hca(struct hns_roce_dev *hr_dev)
goto err_cq_table_free;
}
+ if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_SRQ) {
+ ret = hns_roce_init_srq_table(hr_dev);
+ if (ret) {
+ dev_err(dev,
+ "Failed to init share receive queue table.\n");
+ goto err_qp_table_free;
+ }
+ }
+
return 0;
+err_qp_table_free:
+ if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_SRQ)
+ hns_roce_cleanup_qp_table(hr_dev);
+
err_cq_table_free:
hns_roce_cleanup_cq_table(hr_dev);
diff --git a/drivers/infiniband/hw/hns/hns_roce_mr.c b/drivers/infiniband/hw/hns/hns_roce_mr.c
index 521ad2aa3a4e..08be0e4eabcd 100644
--- a/drivers/infiniband/hw/hns/hns_roce_mr.c
+++ b/drivers/infiniband/hw/hns/hns_roce_mr.c
@@ -184,12 +184,27 @@ static int hns_roce_alloc_mtt_range(struct hns_roce_dev *hr_dev, int order,
struct hns_roce_buddy *buddy;
int ret;
- if (mtt_type == MTT_TYPE_WQE) {
+ switch (mtt_type) {
+ case MTT_TYPE_WQE:
buddy = &mr_table->mtt_buddy;
table = &mr_table->mtt_table;
- } else {
+ break;
+ case MTT_TYPE_CQE:
buddy = &mr_table->mtt_cqe_buddy;
table = &mr_table->mtt_cqe_table;
+ break;
+ case MTT_TYPE_SRQWQE:
+ buddy = &mr_table->mtt_srqwqe_buddy;
+ table = &mr_table->mtt_srqwqe_table;
+ break;
+ case MTT_TYPE_IDX:
+ buddy = &mr_table->mtt_idx_buddy;
+ table = &mr_table->mtt_idx_table;
+ break;
+ default:
+ dev_err(hr_dev->dev, "Unsupport MTT table type: %d\n",
+ mtt_type);
+ return -EINVAL;
}
ret = hns_roce_buddy_alloc(buddy, order, seg);
@@ -242,18 +257,40 @@ void hns_roce_mtt_cleanup(struct hns_roce_dev *hr_dev, struct hns_roce_mtt *mtt)
if (mtt->order < 0)
return;
- if (mtt->mtt_type == MTT_TYPE_WQE) {
+ switch (mtt->mtt_type) {
+ case MTT_TYPE_WQE:
hns_roce_buddy_free(&mr_table->mtt_buddy, mtt->first_seg,
mtt->order);
hns_roce_table_put_range(hr_dev, &mr_table->mtt_table,
mtt->first_seg,
mtt->first_seg + (1 << mtt->order) - 1);
- } else {
+ break;
+ case MTT_TYPE_CQE:
hns_roce_buddy_free(&mr_table->mtt_cqe_buddy, mtt->first_seg,
mtt->order);
hns_roce_table_put_range(hr_dev, &mr_table->mtt_cqe_table,
mtt->first_seg,
mtt->first_seg + (1 << mtt->order) - 1);
+ break;
+ case MTT_TYPE_SRQWQE:
+ hns_roce_buddy_free(&mr_table->mtt_srqwqe_buddy, mtt->first_seg,
+ mtt->order);
+ hns_roce_table_put_range(hr_dev, &mr_table->mtt_srqwqe_table,
+ mtt->first_seg,
+ mtt->first_seg + (1 << mtt->order) - 1);
+ break;
+ case MTT_TYPE_IDX:
+ hns_roce_buddy_free(&mr_table->mtt_idx_buddy, mtt->first_seg,
+ mtt->order);
+ hns_roce_table_put_range(hr_dev, &mr_table->mtt_idx_table,
+ mtt->first_seg,
+ mtt->first_seg + (1 << mtt->order) - 1);
+ break;
+ default:
+ dev_err(hr_dev->dev,
+ "Unsupport mtt type %d, clean mtt failed\n",
+ mtt->mtt_type);
+ break;
}
}
EXPORT_SYMBOL_GPL(hns_roce_mtt_cleanup);
@@ -709,14 +746,29 @@ static int hns_roce_write_mtt_chunk(struct hns_roce_dev *hr_dev,
struct hns_roce_hem_table *table;
dma_addr_t dma_handle;
__le64 *mtts;
- u32 s = start_index * sizeof(u64);
u32 bt_page_size;
u32 i;
- if (mtt->mtt_type == MTT_TYPE_WQE)
+ switch (mtt->mtt_type) {
+ case MTT_TYPE_WQE:
+ table = &hr_dev->mr_table.mtt_table;
bt_page_size = 1 << (hr_dev->caps.mtt_ba_pg_sz + PAGE_SHIFT);
- else
+ break;
+ case MTT_TYPE_CQE:
+ table = &hr_dev->mr_table.mtt_cqe_table;
bt_page_size = 1 << (hr_dev->caps.cqe_ba_pg_sz + PAGE_SHIFT);
+ break;
+ case MTT_TYPE_SRQWQE:
+ table = &hr_dev->mr_table.mtt_srqwqe_table;
+ bt_page_size = 1 << (hr_dev->caps.srqwqe_ba_pg_sz + PAGE_SHIFT);
+ break;
+ case MTT_TYPE_IDX:
+ table = &hr_dev->mr_table.mtt_idx_table;
+ bt_page_size = 1 << (hr_dev->caps.idx_ba_pg_sz + PAGE_SHIFT);
+ break;
+ default:
+ return -EINVAL;
+ }
/* All MTTs must fit in the same page */
if (start_index / (bt_page_size / sizeof(u64)) !=
@@ -726,13 +778,9 @@ static int hns_roce_write_mtt_chunk(struct hns_roce_dev *hr_dev,
if (start_index & (HNS_ROCE_MTT_ENTRY_PER_SEG - 1))
return -EINVAL;
- if (mtt->mtt_type == MTT_TYPE_WQE)
- table = &hr_dev->mr_table.mtt_table;
- else
- table = &hr_dev->mr_table.mtt_cqe_table;
-
mtts = hns_roce_table_find(hr_dev, table,
- mtt->first_seg + s / hr_dev->caps.mtt_entry_sz,
+ mtt->first_seg +
+ start_index / HNS_ROCE_MTT_ENTRY_PER_SEG,
&dma_handle);
if (!mtts)
return -ENOMEM;
@@ -759,10 +807,25 @@ static int hns_roce_write_mtt(struct hns_roce_dev *hr_dev,
if (mtt->order < 0)
return -EINVAL;
- if (mtt->mtt_type == MTT_TYPE_WQE)
+ switch (mtt->mtt_type) {
+ case MTT_TYPE_WQE:
bt_page_size = 1 << (hr_dev->caps.mtt_ba_pg_sz + PAGE_SHIFT);
- else
+ break;
+ case MTT_TYPE_CQE:
bt_page_size = 1 << (hr_dev->caps.cqe_ba_pg_sz + PAGE_SHIFT);
+ break;
+ case MTT_TYPE_SRQWQE:
+ bt_page_size = 1 << (hr_dev->caps.srqwqe_ba_pg_sz + PAGE_SHIFT);
+ break;
+ case MTT_TYPE_IDX:
+ bt_page_size = 1 << (hr_dev->caps.idx_ba_pg_sz + PAGE_SHIFT);
+ break;
+ default:
+ dev_err(hr_dev->dev,
+ "Unsupport mtt type %d, write mtt failed\n",
+ mtt->mtt_type);
+ return -EINVAL;
+ }
while (npages > 0) {
chunk = min_t(int, bt_page_size / sizeof(u64), npages);
@@ -828,8 +891,31 @@ int hns_roce_init_mr_table(struct hns_roce_dev *hr_dev)
if (ret)
goto err_buddy_cqe;
}
+
+ if (hr_dev->caps.num_srqwqe_segs) {
+ ret = hns_roce_buddy_init(&mr_table->mtt_srqwqe_buddy,
+ ilog2(hr_dev->caps.num_srqwqe_segs));
+ if (ret)
+ goto err_buddy_srqwqe;
+ }
+
+ if (hr_dev->caps.num_idx_segs) {
+ ret = hns_roce_buddy_init(&mr_table->mtt_idx_buddy,
+ ilog2(hr_dev->caps.num_idx_segs));
+ if (ret)
+ goto err_buddy_idx;
+ }
+
return 0;
+err_buddy_idx:
+ if (hr_dev->caps.num_srqwqe_segs)
+ hns_roce_buddy_cleanup(&mr_table->mtt_srqwqe_buddy);
+
+err_buddy_srqwqe:
+ if (hns_roce_check_whether_mhop(hr_dev, HEM_TYPE_CQE))
+ hns_roce_buddy_cleanup(&mr_table->mtt_cqe_buddy);
+
err_buddy_cqe:
hns_roce_buddy_cleanup(&mr_table->mtt_buddy);
@@ -842,6 +928,10 @@ void hns_roce_cleanup_mr_table(struct hns_roce_dev *hr_dev)
{
struct hns_roce_mr_table *mr_table = &hr_dev->mr_table;
+ if (hr_dev->caps.num_idx_segs)
+ hns_roce_buddy_cleanup(&mr_table->mtt_idx_buddy);
+ if (hr_dev->caps.num_srqwqe_segs)
+ hns_roce_buddy_cleanup(&mr_table->mtt_srqwqe_buddy);
hns_roce_buddy_cleanup(&mr_table->mtt_buddy);
if (hns_roce_check_whether_mhop(hr_dev, HEM_TYPE_CQE))
hns_roce_buddy_cleanup(&mr_table->mtt_cqe_buddy);
@@ -886,19 +976,35 @@ int hns_roce_ib_umem_write_mtt(struct hns_roce_dev *hr_dev,
struct hns_roce_mtt *mtt, struct ib_umem *umem)
{
struct device *dev = hr_dev->dev;
- struct scatterlist *sg;
+ struct sg_dma_page_iter sg_iter;
unsigned int order;
- int i, k, entry;
int npage = 0;
int ret = 0;
- int len;
+ int i;
u64 page_addr;
u64 *pages;
u32 bt_page_size;
u32 n;
- order = mtt->mtt_type == MTT_TYPE_WQE ? hr_dev->caps.mtt_ba_pg_sz :
- hr_dev->caps.cqe_ba_pg_sz;
+ switch (mtt->mtt_type) {
+ case MTT_TYPE_WQE:
+ order = hr_dev->caps.mtt_ba_pg_sz;
+ break;
+ case MTT_TYPE_CQE:
+ order = hr_dev->caps.cqe_ba_pg_sz;
+ break;
+ case MTT_TYPE_SRQWQE:
+ order = hr_dev->caps.srqwqe_ba_pg_sz;
+ break;
+ case MTT_TYPE_IDX:
+ order = hr_dev->caps.idx_ba_pg_sz;
+ break;
+ default:
+ dev_err(dev, "Unsupport mtt type %d, write mtt failed\n",
+ mtt->mtt_type);
+ return -EINVAL;
+ }
+
bt_page_size = 1 << (order + PAGE_SHIFT);
pages = (u64 *) __get_free_pages(GFP_KERNEL, order);
@@ -907,29 +1013,25 @@ int hns_roce_ib_umem_write_mtt(struct hns_roce_dev *hr_dev,
i = n = 0;
- for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) {
- len = sg_dma_len(sg) >> PAGE_SHIFT;
- for (k = 0; k < len; ++k) {
- page_addr =
- sg_dma_address(sg) + (k << umem->page_shift);
- if (!(npage % (1 << (mtt->page_shift - PAGE_SHIFT)))) {
- if (page_addr & ((1 << mtt->page_shift) - 1)) {
- dev_err(dev, "page_addr 0x%llx is not page_shift %d alignment!\n",
- page_addr, mtt->page_shift);
- ret = -EINVAL;
- goto out;
- }
- pages[i++] = page_addr;
- }
- npage++;
- if (i == bt_page_size / sizeof(u64)) {
- ret = hns_roce_write_mtt(hr_dev, mtt, n, i,
- pages);
- if (ret)
- goto out;
- n += i;
- i = 0;
+ for_each_sg_dma_page(umem->sg_head.sgl, &sg_iter, umem->nmap, 0) {
+ page_addr = sg_page_iter_dma_address(&sg_iter);
+ if (!(npage % (1 << (mtt->page_shift - PAGE_SHIFT)))) {
+ if (page_addr & ((1 << mtt->page_shift) - 1)) {
+ dev_err(dev,
+ "page_addr 0x%llx is not page_shift %d alignment!\n",
+ page_addr, mtt->page_shift);
+ ret = -EINVAL;
+ goto out;
}
+ pages[i++] = page_addr;
+ }
+ npage++;
+ if (i == bt_page_size / sizeof(u64)) {
+ ret = hns_roce_write_mtt(hr_dev, mtt, n, i, pages);
+ if (ret)
+ goto out;
+ n += i;
+ i = 0;
}
}
@@ -945,10 +1047,8 @@ static int hns_roce_ib_umem_write_mr(struct hns_roce_dev *hr_dev,
struct hns_roce_mr *mr,
struct ib_umem *umem)
{
- struct scatterlist *sg;
- int i = 0, j = 0, k;
- int entry;
- int len;
+ struct sg_dma_page_iter sg_iter;
+ int i = 0, j = 0;
u64 page_addr;
u32 pbl_bt_sz;
@@ -956,27 +1056,22 @@ static int hns_roce_ib_umem_write_mr(struct hns_roce_dev *hr_dev,
return 0;
pbl_bt_sz = 1 << (hr_dev->caps.pbl_ba_pg_sz + PAGE_SHIFT);
- for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) {
- len = sg_dma_len(sg) >> PAGE_SHIFT;
- for (k = 0; k < len; ++k) {
- page_addr = sg_dma_address(sg) +
- (k << umem->page_shift);
-
- if (!hr_dev->caps.pbl_hop_num) {
- mr->pbl_buf[i++] = page_addr >> 12;
- } else if (hr_dev->caps.pbl_hop_num == 1) {
- mr->pbl_buf[i++] = page_addr;
- } else {
- if (hr_dev->caps.pbl_hop_num == 2)
- mr->pbl_bt_l1[i][j] = page_addr;
- else if (hr_dev->caps.pbl_hop_num == 3)
- mr->pbl_bt_l2[i][j] = page_addr;
-
- j++;
- if (j >= (pbl_bt_sz / 8)) {
- i++;
- j = 0;
- }
+ for_each_sg_dma_page(umem->sg_head.sgl, &sg_iter, umem->nmap, 0) {
+ page_addr = sg_page_iter_dma_address(&sg_iter);
+ if (!hr_dev->caps.pbl_hop_num) {
+ mr->pbl_buf[i++] = page_addr >> 12;
+ } else if (hr_dev->caps.pbl_hop_num == 1) {
+ mr->pbl_buf[i++] = page_addr;
+ } else {
+ if (hr_dev->caps.pbl_hop_num == 2)
+ mr->pbl_bt_l1[i][j] = page_addr;
+ else if (hr_dev->caps.pbl_hop_num == 3)
+ mr->pbl_bt_l2[i][j] = page_addr;
+
+ j++;
+ if (j >= (pbl_bt_sz / 8)) {
+ i++;
+ j = 0;
}
}
}
@@ -1003,8 +1098,7 @@ struct ib_mr *hns_roce_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
if (!mr)
return ERR_PTR(-ENOMEM);
- mr->umem = ib_umem_get(pd->uobject->context, start, length,
- access_flags, 0);
+ mr->umem = ib_umem_get(udata, start, length, access_flags, 0);
if (IS_ERR(mr->umem)) {
ret = PTR_ERR(mr->umem);
goto err_free;
@@ -1021,14 +1115,14 @@ struct ib_mr *hns_roce_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
goto err_umem;
}
} else {
- int pbl_size = 1;
+ u64 pbl_size = 1;
bt_size = (1 << (hr_dev->caps.pbl_ba_pg_sz + PAGE_SHIFT)) / 8;
for (i = 0; i < hr_dev->caps.pbl_hop_num; i++)
pbl_size *= bt_size;
if (n > pbl_size) {
dev_err(dev,
- " MR len %lld err. MR page num is limited to %d!\n",
+ " MR len %lld err. MR page num is limited to %lld!\n",
length, pbl_size);
ret = -EINVAL;
goto err_umem;
@@ -1113,8 +1207,8 @@ int hns_roce_rereg_user_mr(struct ib_mr *ibmr, int flags, u64 start, u64 length,
}
ib_umem_release(mr->umem);
- mr->umem = ib_umem_get(ibmr->uobject->context, start, length,
- mr_access_flags, 0);
+ mr->umem =
+ ib_umem_get(udata, start, length, mr_access_flags, 0);
if (IS_ERR(mr->umem)) {
ret = PTR_ERR(mr->umem);
mr->umem = NULL;
diff --git a/drivers/infiniband/hw/hns/hns_roce_pd.c b/drivers/infiniband/hw/hns/hns_roce_pd.c
index e11c149da04d..b9b97c5e97e6 100644
--- a/drivers/infiniband/hw/hns/hns_roce_pd.c
+++ b/drivers/infiniband/hw/hns/hns_roce_pd.c
@@ -57,24 +57,19 @@ void hns_roce_cleanup_pd_table(struct hns_roce_dev *hr_dev)
hns_roce_bitmap_cleanup(&hr_dev->pd_bitmap);
}
-struct ib_pd *hns_roce_alloc_pd(struct ib_device *ib_dev,
- struct ib_ucontext *context,
- struct ib_udata *udata)
+int hns_roce_alloc_pd(struct ib_pd *ibpd, struct ib_ucontext *context,
+ struct ib_udata *udata)
{
+ struct ib_device *ib_dev = ibpd->device;
struct hns_roce_dev *hr_dev = to_hr_dev(ib_dev);
struct device *dev = hr_dev->dev;
- struct hns_roce_pd *pd;
+ struct hns_roce_pd *pd = to_hr_pd(ibpd);
int ret;
- pd = kmalloc(sizeof(*pd), GFP_KERNEL);
- if (!pd)
- return ERR_PTR(-ENOMEM);
-
ret = hns_roce_pd_alloc(to_hr_dev(ib_dev), &pd->pdn);
if (ret) {
- kfree(pd);
dev_err(dev, "[alloc_pd]hns_roce_pd_alloc failed!\n");
- return ERR_PTR(ret);
+ return ret;
}
if (context) {
@@ -83,21 +78,17 @@ struct ib_pd *hns_roce_alloc_pd(struct ib_device *ib_dev,
if (ib_copy_to_udata(udata, &uresp, sizeof(uresp))) {
hns_roce_pd_free(to_hr_dev(ib_dev), pd->pdn);
dev_err(dev, "[alloc_pd]ib_copy_to_udata failed!\n");
- kfree(pd);
- return ERR_PTR(-EFAULT);
+ return -EFAULT;
}
}
- return &pd->ibpd;
+ return 0;
}
EXPORT_SYMBOL_GPL(hns_roce_alloc_pd);
-int hns_roce_dealloc_pd(struct ib_pd *pd)
+void hns_roce_dealloc_pd(struct ib_pd *pd)
{
hns_roce_pd_free(to_hr_dev(pd->device), to_hr_pd(pd)->pdn);
- kfree(to_hr_pd(pd));
-
- return 0;
}
EXPORT_SYMBOL_GPL(hns_roce_dealloc_pd);
diff --git a/drivers/infiniband/hw/hns/hns_roce_qp.c b/drivers/infiniband/hw/hns/hns_roce_qp.c
index 5ebf481a39d9..60cf9f03e941 100644
--- a/drivers/infiniband/hw/hns/hns_roce_qp.c
+++ b/drivers/infiniband/hw/hns/hns_roce_qp.c
@@ -35,6 +35,7 @@
#include <linux/platform_device.h>
#include <rdma/ib_addr.h>
#include <rdma/ib_umem.h>
+#include <rdma/uverbs_ioctl.h>
#include "hns_roce_common.h"
#include "hns_roce_device.h"
#include "hns_roce_hem.h"
@@ -209,13 +210,23 @@ static int hns_roce_qp_alloc(struct hns_roce_dev *hr_dev, unsigned long qpn,
}
}
+ if (hr_dev->caps.sccc_entry_sz) {
+ /* Alloc memory for SCC CTX */
+ ret = hns_roce_table_get(hr_dev, &qp_table->sccc_table,
+ hr_qp->qpn);
+ if (ret) {
+ dev_err(dev, "SCC CTX table get failed\n");
+ goto err_put_trrl;
+ }
+ }
+
spin_lock_irq(&qp_table->lock);
ret = radix_tree_insert(&hr_dev->qp_table_tree,
hr_qp->qpn & (hr_dev->caps.num_qps - 1), hr_qp);
spin_unlock_irq(&qp_table->lock);
if (ret) {
dev_err(dev, "QPC radix_tree_insert failed\n");
- goto err_put_trrl;
+ goto err_put_sccc;
}
atomic_set(&hr_qp->refcount, 1);
@@ -223,6 +234,11 @@ static int hns_roce_qp_alloc(struct hns_roce_dev *hr_dev, unsigned long qpn,
return 0;
+err_put_sccc:
+ if (hr_dev->caps.sccc_entry_sz)
+ hns_roce_table_put(hr_dev, &qp_table->sccc_table,
+ hr_qp->qpn);
+
err_put_trrl:
if (hr_dev->caps.trrl_entry_sz)
hns_roce_table_put(hr_dev, &qp_table->trrl_table, hr_qp->qpn);
@@ -280,7 +296,7 @@ void hns_roce_release_range_qp(struct hns_roce_dev *hr_dev, int base_qpn,
EXPORT_SYMBOL_GPL(hns_roce_release_range_qp);
static int hns_roce_set_rq_size(struct hns_roce_dev *hr_dev,
- struct ib_qp_cap *cap, int is_user, int has_srq,
+ struct ib_qp_cap *cap, bool is_user, int has_rq,
struct hns_roce_qp *hr_qp)
{
struct device *dev = hr_dev->dev;
@@ -294,14 +310,12 @@ static int hns_roce_set_rq_size(struct hns_roce_dev *hr_dev,
return -EINVAL;
}
- /* If srq exit, set zero for relative number of rq */
- if (has_srq) {
- if (cap->max_recv_wr) {
- dev_dbg(dev, "srq no need config max_recv_wr\n");
- return -EINVAL;
- }
-
- hr_qp->rq.wqe_cnt = hr_qp->rq.max_gs = 0;
+ /* If srq exist, set zero for relative number of rq */
+ if (!has_rq) {
+ hr_qp->rq.wqe_cnt = 0;
+ hr_qp->rq.max_gs = 0;
+ cap->max_recv_wr = 0;
+ cap->max_recv_sge = 0;
} else {
if (is_user && (!cap->max_recv_wr || !cap->max_recv_sge)) {
dev_err(dev, "user space no need config max_recv_wr max_recv_sge\n");
@@ -519,7 +533,7 @@ static int hns_roce_set_kernel_sq_size(struct hns_roce_dev *hr_dev,
static int hns_roce_qp_has_sq(struct ib_qp_init_attr *attr)
{
- if (attr->qp_type == IB_QPT_XRC_TGT)
+ if (attr->qp_type == IB_QPT_XRC_TGT || !attr->cap.max_send_wr)
return 0;
return 1;
@@ -528,7 +542,8 @@ static int hns_roce_qp_has_sq(struct ib_qp_init_attr *attr)
static int hns_roce_qp_has_rq(struct ib_qp_init_attr *attr)
{
if (attr->qp_type == IB_QPT_XRC_INI ||
- attr->qp_type == IB_QPT_XRC_TGT || attr->srq)
+ attr->qp_type == IB_QPT_XRC_TGT || attr->srq ||
+ !attr->cap.max_recv_wr)
return 0;
return 1;
@@ -543,6 +558,8 @@ static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev,
struct device *dev = hr_dev->dev;
struct hns_roce_ib_create_qp ucmd;
struct hns_roce_ib_create_qp_resp resp = {};
+ struct hns_roce_ucontext *uctx = rdma_udata_to_drv_context(
+ udata, struct hns_roce_ucontext, ibucontext);
unsigned long qpn = 0;
int ret = 0;
u32 page_shift;
@@ -562,14 +579,15 @@ static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev,
else
hr_qp->sq_signal_bits = cpu_to_le32(IB_SIGNAL_REQ_WR);
- ret = hns_roce_set_rq_size(hr_dev, &init_attr->cap, !!ib_pd->uobject,
- !!init_attr->srq, hr_qp);
+ ret = hns_roce_set_rq_size(hr_dev, &init_attr->cap, udata,
+ hns_roce_qp_has_rq(init_attr), hr_qp);
if (ret) {
dev_err(dev, "hns_roce_set_rq_size failed\n");
goto err_out;
}
- if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RQ_INLINE) {
+ if ((hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RQ_INLINE) &&
+ hns_roce_qp_has_rq(init_attr)) {
/* allocate recv inline buf */
hr_qp->rq_inl_buf.wqe_list = kcalloc(hr_qp->rq.wqe_cnt,
sizeof(struct hns_roce_rinl_wqe),
@@ -599,7 +617,7 @@ static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev,
init_attr->cap.max_recv_sge];
}
- if (ib_pd->uobject) {
+ if (udata) {
if (ib_copy_from_udata(&ucmd, udata, sizeof(ucmd))) {
dev_err(dev, "ib_copy_from_udata error for create qp\n");
ret = -EFAULT;
@@ -613,9 +631,8 @@ static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev,
goto err_rq_sge_list;
}
- hr_qp->umem = ib_umem_get(ib_pd->uobject->context,
- ucmd.buf_addr, hr_qp->buff_size, 0,
- 0);
+ hr_qp->umem = ib_umem_get(udata, ucmd.buf_addr,
+ hr_qp->buff_size, 0, 0);
if (IS_ERR(hr_qp->umem)) {
dev_err(dev, "ib_umem_get error for create qp\n");
ret = PTR_ERR(hr_qp->umem);
@@ -623,19 +640,19 @@ static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev,
}
hr_qp->mtt.mtt_type = MTT_TYPE_WQE;
+ page_shift = PAGE_SHIFT;
if (hr_dev->caps.mtt_buf_pg_sz) {
npages = (ib_umem_page_count(hr_qp->umem) +
(1 << hr_dev->caps.mtt_buf_pg_sz) - 1) /
- (1 << hr_dev->caps.mtt_buf_pg_sz);
- page_shift = PAGE_SHIFT + hr_dev->caps.mtt_buf_pg_sz;
+ (1 << hr_dev->caps.mtt_buf_pg_sz);
+ page_shift += hr_dev->caps.mtt_buf_pg_sz;
ret = hns_roce_mtt_init(hr_dev, npages,
page_shift,
&hr_qp->mtt);
} else {
ret = hns_roce_mtt_init(hr_dev,
- ib_umem_page_count(hr_qp->umem),
- hr_qp->umem->page_shift,
- &hr_qp->mtt);
+ ib_umem_page_count(hr_qp->umem),
+ page_shift, &hr_qp->mtt);
}
if (ret) {
dev_err(dev, "hns_roce_mtt_init error for create qp\n");
@@ -653,9 +670,8 @@ static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev,
(udata->inlen >= sizeof(ucmd)) &&
(udata->outlen >= sizeof(resp)) &&
hns_roce_qp_has_sq(init_attr)) {
- ret = hns_roce_db_map_user(
- to_hr_ucontext(ib_pd->uobject->context),
- ucmd.sdb_addr, &hr_qp->sdb);
+ ret = hns_roce_db_map_user(uctx, udata, ucmd.sdb_addr,
+ &hr_qp->sdb);
if (ret) {
dev_err(dev, "sq record doorbell map failed!\n");
goto err_mtt;
@@ -669,13 +685,16 @@ static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev,
if ((hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RECORD_DB) &&
(udata->outlen >= sizeof(resp)) &&
hns_roce_qp_has_rq(init_attr)) {
- ret = hns_roce_db_map_user(
- to_hr_ucontext(ib_pd->uobject->context),
- ucmd.db_addr, &hr_qp->rdb);
+ ret = hns_roce_db_map_user(uctx, udata, ucmd.db_addr,
+ &hr_qp->rdb);
if (ret) {
dev_err(dev, "rq record doorbell map failed!\n");
goto err_sq_dbmap;
}
+
+ /* indicate kernel supports rq record db */
+ resp.cap_flags |= HNS_ROCE_SUPPORT_RQ_RECORD_DB;
+ hr_qp->rdb_en = 1;
}
} else {
if (init_attr->create_flags &
@@ -742,10 +761,10 @@ static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev,
goto err_mtt;
}
- hr_qp->sq.wrid = kmalloc_array(hr_qp->sq.wqe_cnt, sizeof(u64),
- GFP_KERNEL);
- hr_qp->rq.wrid = kmalloc_array(hr_qp->rq.wqe_cnt, sizeof(u64),
- GFP_KERNEL);
+ hr_qp->sq.wrid = kcalloc(hr_qp->sq.wqe_cnt, sizeof(u64),
+ GFP_KERNEL);
+ hr_qp->rq.wrid = kcalloc(hr_qp->rq.wqe_cnt, sizeof(u64),
+ GFP_KERNEL);
if (!hr_qp->sq.wrid || !hr_qp->rq.wrid) {
ret = -ENOMEM;
goto err_wrid;
@@ -784,17 +803,19 @@ static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev,
else
hr_qp->doorbell_qpn = cpu_to_le64(hr_qp->qpn);
- if (ib_pd->uobject && (udata->outlen >= sizeof(resp)) &&
- (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RECORD_DB)) {
-
- /* indicate kernel supports rq record db */
- resp.cap_flags |= HNS_ROCE_SUPPORT_RQ_RECORD_DB;
- ret = ib_copy_to_udata(udata, &resp, sizeof(resp));
+ if (udata) {
+ ret = ib_copy_to_udata(udata, &resp,
+ min(udata->outlen, sizeof(resp)));
if (ret)
goto err_qp;
+ }
- hr_qp->rdb_en = 1;
+ if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_QP_FLOW_CTRL) {
+ ret = hr_dev->hw->qp_flow_control_init(hr_dev, hr_qp);
+ if (ret)
+ goto err_qp;
}
+
hr_qp->event = hns_roce_ib_qp_event;
return 0;
@@ -811,39 +832,35 @@ err_qpn:
hns_roce_release_range_qp(hr_dev, qpn, 1);
err_wrid:
- if (ib_pd->uobject) {
+ if (udata) {
if ((hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RECORD_DB) &&
(udata->outlen >= sizeof(resp)) &&
hns_roce_qp_has_rq(init_attr))
- hns_roce_db_unmap_user(
- to_hr_ucontext(ib_pd->uobject->context),
- &hr_qp->rdb);
+ hns_roce_db_unmap_user(uctx, &hr_qp->rdb);
} else {
kfree(hr_qp->sq.wrid);
kfree(hr_qp->rq.wrid);
}
err_sq_dbmap:
- if (ib_pd->uobject)
+ if (udata)
if ((hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_SQ_RECORD_DB) &&
(udata->inlen >= sizeof(ucmd)) &&
(udata->outlen >= sizeof(resp)) &&
hns_roce_qp_has_sq(init_attr))
- hns_roce_db_unmap_user(
- to_hr_ucontext(ib_pd->uobject->context),
- &hr_qp->sdb);
+ hns_roce_db_unmap_user(uctx, &hr_qp->sdb);
err_mtt:
hns_roce_mtt_cleanup(hr_dev, &hr_qp->mtt);
err_buf:
- if (ib_pd->uobject)
+ if (hr_qp->umem)
ib_umem_release(hr_qp->umem);
else
hns_roce_buf_free(hr_dev, hr_qp->buff_size, &hr_qp->hr_buf);
err_db:
- if (!ib_pd->uobject && hns_roce_qp_has_rq(init_attr) &&
+ if (!udata && hns_roce_qp_has_rq(init_attr) &&
(hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RECORD_DB))
hns_roce_free_db(hr_dev, &hr_qp->rdb);
@@ -889,7 +906,7 @@ struct ib_qp *hns_roce_create_qp(struct ib_pd *pd,
}
case IB_QPT_GSI: {
/* Userspace is not allowed to create special QPs: */
- if (pd->uobject) {
+ if (udata) {
dev_err(dev, "not support usr space GSI\n");
return ERR_PTR(-EINVAL);
}
@@ -970,7 +987,9 @@ int hns_roce_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
(attr_mask & IB_QP_STATE) && new_state == IB_QPS_ERR) {
if (hr_qp->sdb_en == 1) {
hr_qp->sq.head = *(int *)(hr_qp->sdb.virt_addr);
- hr_qp->rq.head = *(int *)(hr_qp->rdb.virt_addr);
+
+ if (hr_qp->rdb_en == 1)
+ hr_qp->rq.head = *(int *)(hr_qp->rdb.virt_addr);
} else {
dev_warn(dev, "flush cqe is not supported in userspace!\n");
goto out;
@@ -1134,6 +1153,7 @@ int hns_roce_init_qp_table(struct hns_roce_dev *hr_dev)
int reserved_from_bot;
int ret;
+ mutex_init(&qp_table->scc_mutex);
spin_lock_init(&qp_table->lock);
INIT_RADIX_TREE(&hr_dev->qp_table_tree, GFP_ATOMIC);
diff --git a/drivers/infiniband/hw/hns/hns_roce_srq.c b/drivers/infiniband/hw/hns/hns_roce_srq.c
new file mode 100644
index 000000000000..a8ee2f6da967
--- /dev/null
+++ b/drivers/infiniband/hw/hns/hns_roce_srq.c
@@ -0,0 +1,463 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/*
+ * Copyright (c) 2018 Hisilicon Limited.
+ */
+
+#include <rdma/ib_umem.h>
+#include <rdma/hns-abi.h>
+#include "hns_roce_device.h"
+#include "hns_roce_cmd.h"
+#include "hns_roce_hem.h"
+
+void hns_roce_srq_event(struct hns_roce_dev *hr_dev, u32 srqn, int event_type)
+{
+ struct hns_roce_srq_table *srq_table = &hr_dev->srq_table;
+ struct hns_roce_srq *srq;
+
+ xa_lock(&srq_table->xa);
+ srq = xa_load(&srq_table->xa, srqn & (hr_dev->caps.num_srqs - 1));
+ if (srq)
+ atomic_inc(&srq->refcount);
+ xa_unlock(&srq_table->xa);
+
+ if (!srq) {
+ dev_warn(hr_dev->dev, "Async event for bogus SRQ %08x\n", srqn);
+ return;
+ }
+
+ srq->event(srq, event_type);
+
+ if (atomic_dec_and_test(&srq->refcount))
+ complete(&srq->free);
+}
+EXPORT_SYMBOL_GPL(hns_roce_srq_event);
+
+static void hns_roce_ib_srq_event(struct hns_roce_srq *srq,
+ enum hns_roce_event event_type)
+{
+ struct hns_roce_dev *hr_dev = to_hr_dev(srq->ibsrq.device);
+ struct ib_srq *ibsrq = &srq->ibsrq;
+ struct ib_event event;
+
+ if (ibsrq->event_handler) {
+ event.device = ibsrq->device;
+ event.element.srq = ibsrq;
+ switch (event_type) {
+ case HNS_ROCE_EVENT_TYPE_SRQ_LIMIT_REACH:
+ event.event = IB_EVENT_SRQ_LIMIT_REACHED;
+ break;
+ case HNS_ROCE_EVENT_TYPE_SRQ_CATAS_ERROR:
+ event.event = IB_EVENT_SRQ_ERR;
+ break;
+ default:
+ dev_err(hr_dev->dev,
+ "hns_roce:Unexpected event type 0x%x on SRQ %06lx\n",
+ event_type, srq->srqn);
+ return;
+ }
+
+ ibsrq->event_handler(&event, ibsrq->srq_context);
+ }
+}
+
+static int hns_roce_sw2hw_srq(struct hns_roce_dev *dev,
+ struct hns_roce_cmd_mailbox *mailbox,
+ unsigned long srq_num)
+{
+ return hns_roce_cmd_mbox(dev, mailbox->dma, 0, srq_num, 0,
+ HNS_ROCE_CMD_SW2HW_SRQ,
+ HNS_ROCE_CMD_TIMEOUT_MSECS);
+}
+
+static int hns_roce_hw2sw_srq(struct hns_roce_dev *dev,
+ struct hns_roce_cmd_mailbox *mailbox,
+ unsigned long srq_num)
+{
+ return hns_roce_cmd_mbox(dev, 0, mailbox ? mailbox->dma : 0, srq_num,
+ mailbox ? 0 : 1, HNS_ROCE_CMD_HW2SW_SRQ,
+ HNS_ROCE_CMD_TIMEOUT_MSECS);
+}
+
+static int hns_roce_srq_alloc(struct hns_roce_dev *hr_dev, u32 pdn, u32 cqn,
+ u16 xrcd, struct hns_roce_mtt *hr_mtt,
+ u64 db_rec_addr, struct hns_roce_srq *srq)
+{
+ struct hns_roce_srq_table *srq_table = &hr_dev->srq_table;
+ struct hns_roce_cmd_mailbox *mailbox;
+ dma_addr_t dma_handle_wqe;
+ dma_addr_t dma_handle_idx;
+ u64 *mtts_wqe;
+ u64 *mtts_idx;
+ int ret;
+
+ /* Get the physical address of srq buf */
+ mtts_wqe = hns_roce_table_find(hr_dev,
+ &hr_dev->mr_table.mtt_srqwqe_table,
+ srq->mtt.first_seg,
+ &dma_handle_wqe);
+ if (!mtts_wqe) {
+ dev_err(hr_dev->dev,
+ "SRQ alloc.Failed to find srq buf addr.\n");
+ return -EINVAL;
+ }
+
+ /* Get physical address of idx que buf */
+ mtts_idx = hns_roce_table_find(hr_dev, &hr_dev->mr_table.mtt_idx_table,
+ srq->idx_que.mtt.first_seg,
+ &dma_handle_idx);
+ if (!mtts_idx) {
+ dev_err(hr_dev->dev,
+ "SRQ alloc.Failed to find idx que buf addr.\n");
+ return -EINVAL;
+ }
+
+ ret = hns_roce_bitmap_alloc(&srq_table->bitmap, &srq->srqn);
+ if (ret == -1) {
+ dev_err(hr_dev->dev, "SRQ alloc.Failed to alloc index.\n");
+ return -ENOMEM;
+ }
+
+ ret = hns_roce_table_get(hr_dev, &srq_table->table, srq->srqn);
+ if (ret)
+ goto err_out;
+
+ ret = xa_err(xa_store(&srq_table->xa, srq->srqn, srq, GFP_KERNEL));
+ if (ret)
+ goto err_put;
+
+ mailbox = hns_roce_alloc_cmd_mailbox(hr_dev);
+ if (IS_ERR(mailbox)) {
+ ret = PTR_ERR(mailbox);
+ goto err_xa;
+ }
+
+ hr_dev->hw->write_srqc(hr_dev, srq, pdn, xrcd, cqn, mailbox->buf,
+ mtts_wqe, mtts_idx, dma_handle_wqe,
+ dma_handle_idx);
+
+ ret = hns_roce_sw2hw_srq(hr_dev, mailbox, srq->srqn);
+ hns_roce_free_cmd_mailbox(hr_dev, mailbox);
+ if (ret)
+ goto err_xa;
+
+ atomic_set(&srq->refcount, 1);
+ init_completion(&srq->free);
+ return ret;
+
+err_xa:
+ xa_erase(&srq_table->xa, srq->srqn);
+
+err_put:
+ hns_roce_table_put(hr_dev, &srq_table->table, srq->srqn);
+
+err_out:
+ hns_roce_bitmap_free(&srq_table->bitmap, srq->srqn, BITMAP_NO_RR);
+ return ret;
+}
+
+static void hns_roce_srq_free(struct hns_roce_dev *hr_dev,
+ struct hns_roce_srq *srq)
+{
+ struct hns_roce_srq_table *srq_table = &hr_dev->srq_table;
+ int ret;
+
+ ret = hns_roce_hw2sw_srq(hr_dev, NULL, srq->srqn);
+ if (ret)
+ dev_err(hr_dev->dev, "HW2SW_SRQ failed (%d) for CQN %06lx\n",
+ ret, srq->srqn);
+
+ xa_erase(&srq_table->xa, srq->srqn);
+
+ if (atomic_dec_and_test(&srq->refcount))
+ complete(&srq->free);
+ wait_for_completion(&srq->free);
+
+ hns_roce_table_put(hr_dev, &srq_table->table, srq->srqn);
+ hns_roce_bitmap_free(&srq_table->bitmap, srq->srqn, BITMAP_NO_RR);
+}
+
+static int hns_roce_create_idx_que(struct ib_pd *pd, struct hns_roce_srq *srq,
+ u32 page_shift)
+{
+ struct hns_roce_dev *hr_dev = to_hr_dev(pd->device);
+ struct hns_roce_idx_que *idx_que = &srq->idx_que;
+ u32 bitmap_num;
+ int i;
+
+ bitmap_num = HNS_ROCE_ALOGN_UP(srq->max, 8 * sizeof(u64));
+
+ idx_que->bitmap = kcalloc(1, bitmap_num / 8, GFP_KERNEL);
+ if (!idx_que->bitmap)
+ return -ENOMEM;
+
+ bitmap_num = bitmap_num / (8 * sizeof(u64));
+
+ idx_que->buf_size = srq->idx_que.buf_size;
+
+ if (hns_roce_buf_alloc(hr_dev, idx_que->buf_size, (1 << page_shift) * 2,
+ &idx_que->idx_buf, page_shift)) {
+ kfree(idx_que->bitmap);
+ return -ENOMEM;
+ }
+
+ for (i = 0; i < bitmap_num; i++)
+ idx_que->bitmap[i] = ~(0UL);
+
+ return 0;
+}
+
+struct ib_srq *hns_roce_create_srq(struct ib_pd *pd,
+ struct ib_srq_init_attr *srq_init_attr,
+ struct ib_udata *udata)
+{
+ struct hns_roce_dev *hr_dev = to_hr_dev(pd->device);
+ struct hns_roce_ib_create_srq_resp resp = {};
+ struct hns_roce_srq *srq;
+ int srq_desc_size;
+ int srq_buf_size;
+ u32 page_shift;
+ int ret = 0;
+ u32 npages;
+ u32 cqn;
+
+ /* Check the actual SRQ wqe and SRQ sge num */
+ if (srq_init_attr->attr.max_wr >= hr_dev->caps.max_srq_wrs ||
+ srq_init_attr->attr.max_sge > hr_dev->caps.max_srq_sges)
+ return ERR_PTR(-EINVAL);
+
+ srq = kzalloc(sizeof(*srq), GFP_KERNEL);
+ if (!srq)
+ return ERR_PTR(-ENOMEM);
+
+ mutex_init(&srq->mutex);
+ spin_lock_init(&srq->lock);
+
+ srq->max = roundup_pow_of_two(srq_init_attr->attr.max_wr + 1);
+ srq->max_gs = srq_init_attr->attr.max_sge;
+
+ srq_desc_size = max(16, 16 * srq->max_gs);
+
+ srq->wqe_shift = ilog2(srq_desc_size);
+
+ srq_buf_size = srq->max * srq_desc_size;
+
+ srq->idx_que.entry_sz = HNS_ROCE_IDX_QUE_ENTRY_SZ;
+ srq->idx_que.buf_size = srq->max * srq->idx_que.entry_sz;
+ srq->mtt.mtt_type = MTT_TYPE_SRQWQE;
+ srq->idx_que.mtt.mtt_type = MTT_TYPE_IDX;
+
+ if (udata) {
+ struct hns_roce_ib_create_srq ucmd;
+
+ if (ib_copy_from_udata(&ucmd, udata, sizeof(ucmd))) {
+ ret = -EFAULT;
+ goto err_srq;
+ }
+
+ srq->umem =
+ ib_umem_get(udata, ucmd.buf_addr, srq_buf_size, 0, 0);
+ if (IS_ERR(srq->umem)) {
+ ret = PTR_ERR(srq->umem);
+ goto err_srq;
+ }
+
+ if (hr_dev->caps.srqwqe_buf_pg_sz) {
+ npages = (ib_umem_page_count(srq->umem) +
+ (1 << hr_dev->caps.srqwqe_buf_pg_sz) - 1) /
+ (1 << hr_dev->caps.srqwqe_buf_pg_sz);
+ page_shift = PAGE_SHIFT + hr_dev->caps.srqwqe_buf_pg_sz;
+ ret = hns_roce_mtt_init(hr_dev, npages,
+ page_shift,
+ &srq->mtt);
+ } else
+ ret = hns_roce_mtt_init(hr_dev,
+ ib_umem_page_count(srq->umem),
+ srq->umem->page_shift,
+ &srq->mtt);
+ if (ret)
+ goto err_buf;
+
+ ret = hns_roce_ib_umem_write_mtt(hr_dev, &srq->mtt, srq->umem);
+ if (ret)
+ goto err_srq_mtt;
+
+ /* config index queue BA */
+ srq->idx_que.umem = ib_umem_get(udata, ucmd.que_addr,
+ srq->idx_que.buf_size, 0, 0);
+ if (IS_ERR(srq->idx_que.umem)) {
+ dev_err(hr_dev->dev,
+ "ib_umem_get error for index queue\n");
+ ret = PTR_ERR(srq->idx_que.umem);
+ goto err_srq_mtt;
+ }
+
+ if (hr_dev->caps.idx_buf_pg_sz) {
+ npages = (ib_umem_page_count(srq->idx_que.umem) +
+ (1 << hr_dev->caps.idx_buf_pg_sz) - 1) /
+ (1 << hr_dev->caps.idx_buf_pg_sz);
+ page_shift = PAGE_SHIFT + hr_dev->caps.idx_buf_pg_sz;
+ ret = hns_roce_mtt_init(hr_dev, npages,
+ page_shift, &srq->idx_que.mtt);
+ } else {
+ ret = hns_roce_mtt_init(hr_dev,
+ ib_umem_page_count(srq->idx_que.umem),
+ srq->idx_que.umem->page_shift,
+ &srq->idx_que.mtt);
+ }
+
+ if (ret) {
+ dev_err(hr_dev->dev,
+ "hns_roce_mtt_init error for idx que\n");
+ goto err_idx_mtt;
+ }
+
+ ret = hns_roce_ib_umem_write_mtt(hr_dev, &srq->idx_que.mtt,
+ srq->idx_que.umem);
+ if (ret) {
+ dev_err(hr_dev->dev,
+ "hns_roce_ib_umem_write_mtt error for idx que\n");
+ goto err_idx_buf;
+ }
+ } else {
+ page_shift = PAGE_SHIFT + hr_dev->caps.srqwqe_buf_pg_sz;
+ if (hns_roce_buf_alloc(hr_dev, srq_buf_size,
+ (1 << page_shift) * 2,
+ &srq->buf, page_shift)) {
+ ret = -ENOMEM;
+ goto err_srq;
+ }
+
+ srq->head = 0;
+ srq->tail = srq->max - 1;
+
+ ret = hns_roce_mtt_init(hr_dev, srq->buf.npages,
+ srq->buf.page_shift, &srq->mtt);
+ if (ret)
+ goto err_buf;
+
+ ret = hns_roce_buf_write_mtt(hr_dev, &srq->mtt, &srq->buf);
+ if (ret)
+ goto err_srq_mtt;
+
+ page_shift = PAGE_SHIFT + hr_dev->caps.idx_buf_pg_sz;
+ ret = hns_roce_create_idx_que(pd, srq, page_shift);
+ if (ret) {
+ dev_err(hr_dev->dev, "Create idx queue fail(%d)!\n",
+ ret);
+ goto err_srq_mtt;
+ }
+
+ /* Init mtt table for idx_que */
+ ret = hns_roce_mtt_init(hr_dev, srq->idx_que.idx_buf.npages,
+ srq->idx_que.idx_buf.page_shift,
+ &srq->idx_que.mtt);
+ if (ret)
+ goto err_create_idx;
+
+ /* Write buffer address into the mtt table */
+ ret = hns_roce_buf_write_mtt(hr_dev, &srq->idx_que.mtt,
+ &srq->idx_que.idx_buf);
+ if (ret)
+ goto err_idx_buf;
+
+ srq->wrid = kvmalloc_array(srq->max, sizeof(u64), GFP_KERNEL);
+ if (!srq->wrid) {
+ ret = -ENOMEM;
+ goto err_idx_buf;
+ }
+ }
+
+ cqn = ib_srq_has_cq(srq_init_attr->srq_type) ?
+ to_hr_cq(srq_init_attr->ext.cq)->cqn : 0;
+
+ srq->db_reg_l = hr_dev->reg_base + SRQ_DB_REG;
+
+ ret = hns_roce_srq_alloc(hr_dev, to_hr_pd(pd)->pdn, cqn, 0,
+ &srq->mtt, 0, srq);
+ if (ret)
+ goto err_wrid;
+
+ srq->event = hns_roce_ib_srq_event;
+ srq->ibsrq.ext.xrc.srq_num = srq->srqn;
+ resp.srqn = srq->srqn;
+
+ if (udata) {
+ if (ib_copy_to_udata(udata, &resp,
+ min(udata->outlen, sizeof(resp)))) {
+ ret = -EFAULT;
+ goto err_srqc_alloc;
+ }
+ }
+
+ return &srq->ibsrq;
+
+err_srqc_alloc:
+ hns_roce_srq_free(hr_dev, srq);
+
+err_wrid:
+ kvfree(srq->wrid);
+
+err_idx_buf:
+ hns_roce_mtt_cleanup(hr_dev, &srq->idx_que.mtt);
+
+err_idx_mtt:
+ if (udata)
+ ib_umem_release(srq->idx_que.umem);
+
+err_create_idx:
+ hns_roce_buf_free(hr_dev, srq->idx_que.buf_size,
+ &srq->idx_que.idx_buf);
+ kfree(srq->idx_que.bitmap);
+
+err_srq_mtt:
+ hns_roce_mtt_cleanup(hr_dev, &srq->mtt);
+
+err_buf:
+ if (udata)
+ ib_umem_release(srq->umem);
+ else
+ hns_roce_buf_free(hr_dev, srq_buf_size, &srq->buf);
+
+err_srq:
+ kfree(srq);
+ return ERR_PTR(ret);
+}
+
+int hns_roce_destroy_srq(struct ib_srq *ibsrq)
+{
+ struct hns_roce_dev *hr_dev = to_hr_dev(ibsrq->device);
+ struct hns_roce_srq *srq = to_hr_srq(ibsrq);
+
+ hns_roce_srq_free(hr_dev, srq);
+ hns_roce_mtt_cleanup(hr_dev, &srq->mtt);
+
+ if (ibsrq->uobject) {
+ hns_roce_mtt_cleanup(hr_dev, &srq->idx_que.mtt);
+ ib_umem_release(srq->idx_que.umem);
+ ib_umem_release(srq->umem);
+ } else {
+ kvfree(srq->wrid);
+ hns_roce_buf_free(hr_dev, srq->max << srq->wqe_shift,
+ &srq->buf);
+ }
+
+ kfree(srq);
+
+ return 0;
+}
+
+int hns_roce_init_srq_table(struct hns_roce_dev *hr_dev)
+{
+ struct hns_roce_srq_table *srq_table = &hr_dev->srq_table;
+
+ xa_init(&srq_table->xa);
+
+ return hns_roce_bitmap_init(&srq_table->bitmap, hr_dev->caps.num_srqs,
+ hr_dev->caps.num_srqs - 1,
+ hr_dev->caps.reserved_srqs, 0);
+}
+
+void hns_roce_cleanup_srq_table(struct hns_roce_dev *hr_dev)
+{
+ hns_roce_bitmap_cleanup(&hr_dev->srq_table.bitmap);
+}
diff --git a/drivers/infiniband/hw/i40iw/Makefile b/drivers/infiniband/hw/i40iw/Makefile
index 5a8a7a3f28ae..8942f8229945 100644
--- a/drivers/infiniband/hw/i40iw/Makefile
+++ b/drivers/infiniband/hw/i40iw/Makefile
@@ -1,5 +1,5 @@
# SPDX-License-Identifier: GPL-2.0
-ccflags-y := -Idrivers/net/ethernet/intel/i40e
+ccflags-y := -I $(srctree)/drivers/net/ethernet/intel/i40e
obj-$(CONFIG_INFINIBAND_I40IW) += i40iw.o
diff --git a/drivers/infiniband/hw/i40iw/i40iw_cm.c b/drivers/infiniband/hw/i40iw/i40iw_cm.c
index 771eb6bd0785..206cfb0016f8 100644
--- a/drivers/infiniband/hw/i40iw/i40iw_cm.c
+++ b/drivers/infiniband/hw/i40iw/i40iw_cm.c
@@ -404,7 +404,7 @@ static struct i40iw_puda_buf *i40iw_form_cm_frame(struct i40iw_cm_node *cm_node,
if (pdata)
pd_len = pdata->size;
- if (cm_node->vlan_id < VLAN_TAG_PRESENT)
+ if (cm_node->vlan_id <= VLAN_VID_MASK)
eth_hlen += 4;
if (cm_node->ipv4)
@@ -433,7 +433,7 @@ static struct i40iw_puda_buf *i40iw_form_cm_frame(struct i40iw_cm_node *cm_node,
ether_addr_copy(ethh->h_dest, cm_node->rem_mac);
ether_addr_copy(ethh->h_source, cm_node->loc_mac);
- if (cm_node->vlan_id < VLAN_TAG_PRESENT) {
+ if (cm_node->vlan_id <= VLAN_VID_MASK) {
((struct vlan_ethhdr *)ethh)->h_vlan_proto = htons(ETH_P_8021Q);
vtag = (cm_node->user_pri << VLAN_PRIO_SHIFT) | cm_node->vlan_id;
((struct vlan_ethhdr *)ethh)->h_vlan_TCI = htons(vtag);
@@ -463,7 +463,7 @@ static struct i40iw_puda_buf *i40iw_form_cm_frame(struct i40iw_cm_node *cm_node,
ether_addr_copy(ethh->h_dest, cm_node->rem_mac);
ether_addr_copy(ethh->h_source, cm_node->loc_mac);
- if (cm_node->vlan_id < VLAN_TAG_PRESENT) {
+ if (cm_node->vlan_id <= VLAN_VID_MASK) {
((struct vlan_ethhdr *)ethh)->h_vlan_proto = htons(ETH_P_8021Q);
vtag = (cm_node->user_pri << VLAN_PRIO_SHIFT) | cm_node->vlan_id;
((struct vlan_ethhdr *)ethh)->h_vlan_TCI = htons(vtag);
@@ -3323,7 +3323,7 @@ static void i40iw_init_tcp_ctx(struct i40iw_cm_node *cm_node,
tcp_info->flow_label = 0;
tcp_info->snd_mss = cpu_to_le32(((u32)cm_node->tcp_cntxt.mss));
- if (cm_node->vlan_id < VLAN_TAG_PRESENT) {
+ if (cm_node->vlan_id <= VLAN_VID_MASK) {
tcp_info->insert_vlan_tag = true;
tcp_info->vlan_tag = cpu_to_le16(((u16)cm_node->user_pri << I40IW_VLAN_PRIO_SHIFT) |
cm_node->vlan_id);
@@ -3478,7 +3478,7 @@ static void i40iw_qp_disconnect(struct i40iw_qp *iwqp)
/* Need to free the Last Streaming Mode Message */
if (iwqp->ietf_mem.va) {
if (iwqp->lsmm_mr)
- iwibdev->ibdev.dereg_mr(iwqp->lsmm_mr);
+ iwibdev->ibdev.ops.dereg_mr(iwqp->lsmm_mr);
i40iw_free_dma_mem(iwdev->sc_dev.hw, &iwqp->ietf_mem);
}
}
diff --git a/drivers/infiniband/hw/i40iw/i40iw_utils.c b/drivers/infiniband/hw/i40iw/i40iw_utils.c
index a9ea966877f2..337410f40860 100644
--- a/drivers/infiniband/hw/i40iw/i40iw_utils.c
+++ b/drivers/infiniband/hw/i40iw/i40iw_utils.c
@@ -173,7 +173,12 @@ int i40iw_inetaddr_event(struct notifier_block *notifier,
rcu_read_lock();
in = __in_dev_get_rcu(upper_dev);
- local_ipaddr = ntohl(in->ifa_list->ifa_address);
+
+ if (!in->ifa_list)
+ local_ipaddr = 0;
+ else
+ local_ipaddr = ntohl(in->ifa_list->ifa_address);
+
rcu_read_unlock();
} else {
local_ipaddr = ntohl(ifa->ifa_address);
@@ -185,6 +190,11 @@ int i40iw_inetaddr_event(struct notifier_block *notifier,
case NETDEV_UP:
/* Fall through */
case NETDEV_CHANGEADDR:
+
+ /* Just skip if no need to handle ARP cache */
+ if (!local_ipaddr)
+ break;
+
i40iw_manage_arp_cache(iwdev,
netdev->dev_addr,
&local_ipaddr,
@@ -601,7 +611,6 @@ void i40iw_rem_pdusecount(struct i40iw_pd *iwpd, struct i40iw_device *iwdev)
if (!atomic_dec_and_test(&iwpd->usecount))
return;
i40iw_free_resource(iwdev, iwdev->allocated_pds, iwpd->sc_pd.pd_id);
- kfree(iwpd);
}
/**
@@ -745,8 +754,8 @@ enum i40iw_status_code i40iw_allocate_dma_mem(struct i40iw_hw *hw,
if (!mem)
return I40IW_ERR_PARAM;
mem->size = ALIGN(size, alignment);
- mem->va = dma_zalloc_coherent(&pcidev->dev, mem->size,
- (dma_addr_t *)&mem->pa, GFP_KERNEL);
+ mem->va = dma_alloc_coherent(&pcidev->dev, mem->size,
+ (dma_addr_t *)&mem->pa, GFP_KERNEL);
if (!mem->va)
return I40IW_ERR_NO_MEMORY;
return 0;
diff --git a/drivers/infiniband/hw/i40iw/i40iw_verbs.c b/drivers/infiniband/hw/i40iw/i40iw_verbs.c
index 102875872bea..a8352e3ca23d 100644
--- a/drivers/infiniband/hw/i40iw/i40iw_verbs.c
+++ b/drivers/infiniband/hw/i40iw/i40iw_verbs.c
@@ -45,6 +45,7 @@
#include <rdma/iw_cm.h>
#include <rdma/ib_user_verbs.h>
#include <rdma/ib_umem.h>
+#include <rdma/uverbs_ioctl.h>
#include "i40iw.h"
/**
@@ -120,78 +121,55 @@ static int i40iw_query_port(struct ib_device *ibdev,
/**
* i40iw_alloc_ucontext - Allocate the user context data structure
- * @ibdev: device pointer from stack
+ * @uctx: Uverbs context pointer from stack
* @udata: user data
*
* This keeps track of all objects associated with a particular
* user-mode client.
*/
-static struct ib_ucontext *i40iw_alloc_ucontext(struct ib_device *ibdev,
- struct ib_udata *udata)
+static int i40iw_alloc_ucontext(struct ib_ucontext *uctx,
+ struct ib_udata *udata)
{
+ struct ib_device *ibdev = uctx->device;
struct i40iw_device *iwdev = to_iwdev(ibdev);
struct i40iw_alloc_ucontext_req req;
- struct i40iw_alloc_ucontext_resp uresp;
- struct i40iw_ucontext *ucontext;
+ struct i40iw_alloc_ucontext_resp uresp = {};
+ struct i40iw_ucontext *ucontext = to_ucontext(uctx);
if (ib_copy_from_udata(&req, udata, sizeof(req)))
- return ERR_PTR(-EINVAL);
+ return -EINVAL;
if (req.userspace_ver < 4 || req.userspace_ver > I40IW_ABI_VER) {
i40iw_pr_err("Unsupported provider library version %u.\n", req.userspace_ver);
- return ERR_PTR(-EINVAL);
+ return -EINVAL;
}
- memset(&uresp, 0, sizeof(uresp));
uresp.max_qps = iwdev->max_qp;
uresp.max_pds = iwdev->max_pd;
uresp.wq_size = iwdev->max_qp_wr * 2;
uresp.kernel_ver = req.userspace_ver;
- ucontext = kzalloc(sizeof(*ucontext), GFP_KERNEL);
- if (!ucontext)
- return ERR_PTR(-ENOMEM);
-
ucontext->iwdev = iwdev;
ucontext->abi_ver = req.userspace_ver;
- if (ib_copy_to_udata(udata, &uresp, sizeof(uresp))) {
- kfree(ucontext);
- return ERR_PTR(-EFAULT);
- }
+ if (ib_copy_to_udata(udata, &uresp, sizeof(uresp)))
+ return -EFAULT;
INIT_LIST_HEAD(&ucontext->cq_reg_mem_list);
spin_lock_init(&ucontext->cq_reg_mem_list_lock);
INIT_LIST_HEAD(&ucontext->qp_reg_mem_list);
spin_lock_init(&ucontext->qp_reg_mem_list_lock);
- return &ucontext->ibucontext;
+ return 0;
}
/**
* i40iw_dealloc_ucontext - deallocate the user context data structure
* @context: user context created during alloc
*/
-static int i40iw_dealloc_ucontext(struct ib_ucontext *context)
+static void i40iw_dealloc_ucontext(struct ib_ucontext *context)
{
- struct i40iw_ucontext *ucontext = to_ucontext(context);
- unsigned long flags;
-
- spin_lock_irqsave(&ucontext->cq_reg_mem_list_lock, flags);
- if (!list_empty(&ucontext->cq_reg_mem_list)) {
- spin_unlock_irqrestore(&ucontext->cq_reg_mem_list_lock, flags);
- return -EBUSY;
- }
- spin_unlock_irqrestore(&ucontext->cq_reg_mem_list_lock, flags);
- spin_lock_irqsave(&ucontext->qp_reg_mem_list_lock, flags);
- if (!list_empty(&ucontext->qp_reg_mem_list)) {
- spin_unlock_irqrestore(&ucontext->qp_reg_mem_list_lock, flags);
- return -EBUSY;
- }
- spin_unlock_irqrestore(&ucontext->qp_reg_mem_list_lock, flags);
-
- kfree(ucontext);
- return 0;
+ return;
}
/**
@@ -312,16 +290,15 @@ static void i40iw_dealloc_push_page(struct i40iw_device *iwdev, struct i40iw_sc_
/**
* i40iw_alloc_pd - allocate protection domain
- * @ibdev: device pointer from stack
+ * @pd: PD pointer
* @context: user context created during alloc
* @udata: user data
*/
-static struct ib_pd *i40iw_alloc_pd(struct ib_device *ibdev,
- struct ib_ucontext *context,
- struct ib_udata *udata)
+static int i40iw_alloc_pd(struct ib_pd *pd, struct ib_ucontext *context,
+ struct ib_udata *udata)
{
- struct i40iw_pd *iwpd;
- struct i40iw_device *iwdev = to_iwdev(ibdev);
+ struct i40iw_pd *iwpd = to_iwpd(pd);
+ struct i40iw_device *iwdev = to_iwdev(pd->device);
struct i40iw_sc_dev *dev = &iwdev->sc_dev;
struct i40iw_alloc_pd_resp uresp;
struct i40iw_sc_pd *sc_pd;
@@ -330,19 +307,13 @@ static struct ib_pd *i40iw_alloc_pd(struct ib_device *ibdev,
int err;
if (iwdev->closing)
- return ERR_PTR(-ENODEV);
+ return -ENODEV;
err = i40iw_alloc_resource(iwdev, iwdev->allocated_pds,
iwdev->max_pd, &pd_id, &iwdev->next_pd);
if (err) {
i40iw_pr_err("alloc resource failed\n");
- return ERR_PTR(err);
- }
-
- iwpd = kzalloc(sizeof(*iwpd), GFP_KERNEL);
- if (!iwpd) {
- err = -ENOMEM;
- goto free_res;
+ return err;
}
sc_pd = &iwpd->sc_pd;
@@ -361,25 +332,23 @@ static struct ib_pd *i40iw_alloc_pd(struct ib_device *ibdev,
}
i40iw_add_pdusecount(iwpd);
- return &iwpd->ibpd;
+ return 0;
+
error:
- kfree(iwpd);
-free_res:
i40iw_free_resource(iwdev, iwdev->allocated_pds, pd_id);
- return ERR_PTR(err);
+ return err;
}
/**
* i40iw_dealloc_pd - deallocate pd
* @ibpd: ptr of pd to be deallocated
*/
-static int i40iw_dealloc_pd(struct ib_pd *ibpd)
+static void i40iw_dealloc_pd(struct ib_pd *ibpd)
{
struct i40iw_pd *iwpd = to_iwpd(ibpd);
struct i40iw_device *iwdev = to_iwdev(ibpd->device);
i40iw_rem_pdusecount(iwpd, iwdev);
- return 0;
}
/**
@@ -565,7 +534,8 @@ static struct ib_qp *i40iw_create_qp(struct ib_pd *ibpd,
struct i40iw_device *iwdev = to_iwdev(ibpd->device);
struct i40iw_cqp *iwcqp = &iwdev->cqp;
struct i40iw_qp *iwqp;
- struct i40iw_ucontext *ucontext;
+ struct i40iw_ucontext *ucontext = rdma_udata_to_drv_context(
+ udata, struct i40iw_ucontext, ibucontext);
struct i40iw_create_qp_req req;
struct i40iw_create_qp_resp uresp;
u32 qp_num = 0;
@@ -673,28 +643,25 @@ static struct ib_qp *i40iw_create_qp(struct ib_pd *ibpd,
goto error;
}
iwqp->ctx_info.qp_compl_ctx = req.user_compl_ctx;
- if (ibpd->uobject && ibpd->uobject->context) {
- iwqp->user_mode = 1;
- ucontext = to_ucontext(ibpd->uobject->context);
-
- if (req.user_wqe_buffers) {
- struct i40iw_pbl *iwpbl;
-
- spin_lock_irqsave(
- &ucontext->qp_reg_mem_list_lock, flags);
- iwpbl = i40iw_get_pbl(
- (unsigned long)req.user_wqe_buffers,
- &ucontext->qp_reg_mem_list);
- spin_unlock_irqrestore(
- &ucontext->qp_reg_mem_list_lock, flags);
-
- if (!iwpbl) {
- err_code = -ENODATA;
- i40iw_pr_err("no pbl info\n");
- goto error;
- }
- memcpy(&iwqp->iwpbl, iwpbl, sizeof(iwqp->iwpbl));
+ iwqp->user_mode = 1;
+
+ if (req.user_wqe_buffers) {
+ struct i40iw_pbl *iwpbl;
+
+ spin_lock_irqsave(
+ &ucontext->qp_reg_mem_list_lock, flags);
+ iwpbl = i40iw_get_pbl(
+ (unsigned long)req.user_wqe_buffers,
+ &ucontext->qp_reg_mem_list);
+ spin_unlock_irqrestore(
+ &ucontext->qp_reg_mem_list_lock, flags);
+
+ if (!iwpbl) {
+ err_code = -ENODATA;
+ i40iw_pr_err("no pbl info\n");
+ goto error;
}
+ memcpy(&iwqp->iwpbl, iwpbl, sizeof(iwqp->iwpbl));
}
err_code = i40iw_setup_virt_qp(iwdev, iwqp, &init_info);
} else {
@@ -768,7 +735,7 @@ static struct ib_qp *i40iw_create_qp(struct ib_pd *ibpd,
iwdev->qp_table[qp_num] = iwqp;
i40iw_add_pdusecount(iwqp->iwpd);
i40iw_add_devusecount(iwdev);
- if (ibpd->uobject && udata) {
+ if (udata) {
memset(&uresp, 0, sizeof(uresp));
uresp.actual_sq_size = sq_size;
uresp.actual_rq_size = rq_size;
@@ -1371,32 +1338,29 @@ static void i40iw_copy_user_pgaddrs(struct i40iw_mr *iwmr,
{
struct ib_umem *region = iwmr->region;
struct i40iw_pbl *iwpbl = &iwmr->iwpbl;
- int chunk_pages, entry, i;
struct i40iw_pble_alloc *palloc = &iwpbl->pble_alloc;
struct i40iw_pble_info *pinfo;
- struct scatterlist *sg;
+ struct sg_dma_page_iter sg_iter;
u64 pg_addr = 0;
u32 idx = 0;
+ bool first_pg = true;
pinfo = (level == I40IW_LEVEL_1) ? NULL : palloc->level2.leaf;
- for_each_sg(region->sg_head.sgl, sg, region->nmap, entry) {
- chunk_pages = sg_dma_len(sg) >> region->page_shift;
- if ((iwmr->type == IW_MEMREG_TYPE_QP) &&
- !iwpbl->qp_mr.sq_page)
- iwpbl->qp_mr.sq_page = sg_page(sg);
- for (i = 0; i < chunk_pages; i++) {
- pg_addr = sg_dma_address(sg) +
- (i << region->page_shift);
-
- if ((entry + i) == 0)
- *pbl = cpu_to_le64(pg_addr & iwmr->page_msk);
- else if (!(pg_addr & ~iwmr->page_msk))
- *pbl = cpu_to_le64(pg_addr);
- else
- continue;
- pbl = i40iw_next_pbl_addr(pbl, &pinfo, &idx);
- }
+ if (iwmr->type == IW_MEMREG_TYPE_QP)
+ iwpbl->qp_mr.sq_page = sg_page(region->sg_head.sgl);
+
+ for_each_sg_dma_page (region->sg_head.sgl, &sg_iter, region->nmap, 0) {
+ pg_addr = sg_page_iter_dma_address(&sg_iter);
+ if (first_pg)
+ *pbl = cpu_to_le64(pg_addr & iwmr->page_msk);
+ else if (!(pg_addr & ~iwmr->page_msk))
+ *pbl = cpu_to_le64(pg_addr);
+ else
+ continue;
+
+ first_pg = false;
+ pbl = i40iw_next_pbl_addr(pbl, &pinfo, &idx);
}
}
@@ -1833,7 +1797,8 @@ static struct ib_mr *i40iw_reg_user_mr(struct ib_pd *pd,
{
struct i40iw_pd *iwpd = to_iwpd(pd);
struct i40iw_device *iwdev = to_iwdev(pd->device);
- struct i40iw_ucontext *ucontext;
+ struct i40iw_ucontext *ucontext = rdma_udata_to_drv_context(
+ udata, struct i40iw_ucontext, ibucontext);
struct i40iw_pble_alloc *palloc;
struct i40iw_pbl *iwpbl;
struct i40iw_mr *iwmr;
@@ -1854,7 +1819,7 @@ static struct ib_mr *i40iw_reg_user_mr(struct ib_pd *pd,
if (length > I40IW_MAX_MR_SIZE)
return ERR_PTR(-EINVAL);
- region = ib_umem_get(pd->uobject->context, start, length, acc, 0);
+ region = ib_umem_get(udata, start, length, acc, 0);
if (IS_ERR(region))
return (struct ib_mr *)region;
@@ -1874,7 +1839,6 @@ static struct ib_mr *i40iw_reg_user_mr(struct ib_pd *pd,
iwmr->region = region;
iwmr->ibmr.pd = pd;
iwmr->ibmr.device = pd->device;
- ucontext = to_ucontext(pd->uobject->context);
iwmr->page_size = PAGE_SIZE;
iwmr->page_msk = PAGE_MASK;
@@ -2092,7 +2056,8 @@ static int i40iw_dereg_mr(struct ib_mr *ib_mr)
ib_umem_release(iwmr->region);
if (iwmr->type != IW_MEMREG_TYPE_MEM) {
- if (ibpd->uobject) {
+ /* region is released. only test for userness. */
+ if (iwmr->region) {
struct i40iw_ucontext *ucontext;
ucontext = to_ucontext(ibpd->uobject->context);
@@ -2140,9 +2105,8 @@ static int i40iw_dereg_mr(struct ib_mr *ib_mr)
static ssize_t hw_rev_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
- struct i40iw_ib_device *iwibdev = container_of(dev,
- struct i40iw_ib_device,
- ibdev.dev);
+ struct i40iw_ib_device *iwibdev =
+ rdma_device_to_drv_device(dev, struct i40iw_ib_device, ibdev);
u32 hw_rev = iwibdev->iwdev->sc_dev.hw_rev;
return sprintf(buf, "%x\n", hw_rev);
@@ -2721,24 +2685,40 @@ static int i40iw_query_pkey(struct ib_device *ibdev,
return 0;
}
-/**
- * i40iw_get_vector_affinity - report IRQ affinity mask
- * @ibdev: IB device
- * @comp_vector: completion vector index
- */
-static const struct cpumask *i40iw_get_vector_affinity(struct ib_device *ibdev,
- int comp_vector)
-{
- struct i40iw_device *iwdev = to_iwdev(ibdev);
- struct i40iw_msix_vector *msix_vec;
-
- if (iwdev->msix_shared)
- msix_vec = &iwdev->iw_msixtbl[comp_vector];
- else
- msix_vec = &iwdev->iw_msixtbl[comp_vector + 1];
-
- return irq_get_affinity_mask(msix_vec->irq);
-}
+static const struct ib_device_ops i40iw_dev_ops = {
+ .alloc_hw_stats = i40iw_alloc_hw_stats,
+ .alloc_mr = i40iw_alloc_mr,
+ .alloc_pd = i40iw_alloc_pd,
+ .alloc_ucontext = i40iw_alloc_ucontext,
+ .create_cq = i40iw_create_cq,
+ .create_qp = i40iw_create_qp,
+ .dealloc_pd = i40iw_dealloc_pd,
+ .dealloc_ucontext = i40iw_dealloc_ucontext,
+ .dereg_mr = i40iw_dereg_mr,
+ .destroy_cq = i40iw_destroy_cq,
+ .destroy_qp = i40iw_destroy_qp,
+ .drain_rq = i40iw_drain_rq,
+ .drain_sq = i40iw_drain_sq,
+ .get_dev_fw_str = i40iw_get_dev_fw_str,
+ .get_dma_mr = i40iw_get_dma_mr,
+ .get_hw_stats = i40iw_get_hw_stats,
+ .get_port_immutable = i40iw_port_immutable,
+ .map_mr_sg = i40iw_map_mr_sg,
+ .mmap = i40iw_mmap,
+ .modify_qp = i40iw_modify_qp,
+ .poll_cq = i40iw_poll_cq,
+ .post_recv = i40iw_post_recv,
+ .post_send = i40iw_post_send,
+ .query_device = i40iw_query_device,
+ .query_gid = i40iw_query_gid,
+ .query_pkey = i40iw_query_pkey,
+ .query_port = i40iw_query_port,
+ .query_qp = i40iw_query_qp,
+ .reg_user_mr = i40iw_reg_user_mr,
+ .req_notify_cq = i40iw_req_notify_cq,
+ INIT_RDMA_OBJ_SIZE(ib_pd, i40iw_pd, ibpd),
+ INIT_RDMA_OBJ_SIZE(ib_ucontext, i40iw_ucontext, ibucontext),
+};
/**
* i40iw_init_rdma_device - initialization of iwarp device
@@ -2750,7 +2730,7 @@ static struct i40iw_ib_device *i40iw_init_rdma_device(struct i40iw_device *iwdev
struct net_device *netdev = iwdev->netdev;
struct pci_dev *pcidev = (struct pci_dev *)iwdev->hw.dev_context;
- iwibdev = (struct i40iw_ib_device *)ib_alloc_device(sizeof(*iwibdev));
+ iwibdev = ib_alloc_device(i40iw_ib_device, ibdev);
if (!iwibdev) {
i40iw_pr_err("iwdev == NULL\n");
return NULL;
@@ -2786,30 +2766,6 @@ static struct i40iw_ib_device *i40iw_init_rdma_device(struct i40iw_device *iwdev
iwibdev->ibdev.phys_port_cnt = 1;
iwibdev->ibdev.num_comp_vectors = iwdev->ceqs_count;
iwibdev->ibdev.dev.parent = &pcidev->dev;
- iwibdev->ibdev.query_port = i40iw_query_port;
- iwibdev->ibdev.query_pkey = i40iw_query_pkey;
- iwibdev->ibdev.query_gid = i40iw_query_gid;
- iwibdev->ibdev.alloc_ucontext = i40iw_alloc_ucontext;
- iwibdev->ibdev.dealloc_ucontext = i40iw_dealloc_ucontext;
- iwibdev->ibdev.mmap = i40iw_mmap;
- iwibdev->ibdev.alloc_pd = i40iw_alloc_pd;
- iwibdev->ibdev.dealloc_pd = i40iw_dealloc_pd;
- iwibdev->ibdev.create_qp = i40iw_create_qp;
- iwibdev->ibdev.modify_qp = i40iw_modify_qp;
- iwibdev->ibdev.query_qp = i40iw_query_qp;
- iwibdev->ibdev.destroy_qp = i40iw_destroy_qp;
- iwibdev->ibdev.create_cq = i40iw_create_cq;
- iwibdev->ibdev.destroy_cq = i40iw_destroy_cq;
- iwibdev->ibdev.get_dma_mr = i40iw_get_dma_mr;
- iwibdev->ibdev.reg_user_mr = i40iw_reg_user_mr;
- iwibdev->ibdev.dereg_mr = i40iw_dereg_mr;
- iwibdev->ibdev.alloc_hw_stats = i40iw_alloc_hw_stats;
- iwibdev->ibdev.get_hw_stats = i40iw_get_hw_stats;
- iwibdev->ibdev.query_device = i40iw_query_device;
- iwibdev->ibdev.drain_sq = i40iw_drain_sq;
- iwibdev->ibdev.drain_rq = i40iw_drain_rq;
- iwibdev->ibdev.alloc_mr = i40iw_alloc_mr;
- iwibdev->ibdev.map_mr_sg = i40iw_map_mr_sg;
iwibdev->ibdev.iwcm = kzalloc(sizeof(*iwibdev->ibdev.iwcm), GFP_KERNEL);
if (!iwibdev->ibdev.iwcm) {
ib_dealloc_device(&iwibdev->ibdev);
@@ -2826,13 +2782,7 @@ static struct i40iw_ib_device *i40iw_init_rdma_device(struct i40iw_device *iwdev
iwibdev->ibdev.iwcm->destroy_listen = i40iw_destroy_listen;
memcpy(iwibdev->ibdev.iwcm->ifname, netdev->name,
sizeof(iwibdev->ibdev.iwcm->ifname));
- iwibdev->ibdev.get_port_immutable = i40iw_port_immutable;
- iwibdev->ibdev.get_dev_fw_str = i40iw_get_dev_fw_str;
- iwibdev->ibdev.poll_cq = i40iw_poll_cq;
- iwibdev->ibdev.req_notify_cq = i40iw_req_notify_cq;
- iwibdev->ibdev.post_send = i40iw_post_send;
- iwibdev->ibdev.post_recv = i40iw_post_recv;
- iwibdev->ibdev.get_vector_affinity = i40iw_get_vector_affinity;
+ ib_set_device_ops(&iwibdev->ibdev, &i40iw_dev_ops);
return iwibdev;
}
@@ -2885,7 +2835,7 @@ int i40iw_register_rdma_device(struct i40iw_device *iwdev)
iwibdev = iwdev->iwibdev;
rdma_set_device_sysfs_group(&iwibdev->ibdev, &i40iw_attr_group);
iwibdev->ibdev.driver_id = RDMA_DRIVER_I40IW;
- ret = ib_register_device(&iwibdev->ibdev, "i40iw%d", NULL);
+ ret = ib_register_device(&iwibdev->ibdev, "i40iw%d");
if (ret)
goto error;
diff --git a/drivers/infiniband/hw/mlx4/Kconfig b/drivers/infiniband/hw/mlx4/Kconfig
index d1de3285fd88..fc01deac1d3c 100644
--- a/drivers/infiniband/hw/mlx4/Kconfig
+++ b/drivers/infiniband/hw/mlx4/Kconfig
@@ -1,8 +1,6 @@
config MLX4_INFINIBAND
tristate "Mellanox ConnectX HCA support"
depends on NETDEVICES && ETHERNET && PCI && INET
- depends on INFINIBAND_USER_ACCESS || !INFINIBAND_USER_ACCESS
- depends on MAY_USE_DEVLINK
select NET_VENDOR_MELLANOX
select MLX4_CORE
---help---
diff --git a/drivers/infiniband/hw/mlx4/ah.c b/drivers/infiniband/hw/mlx4/ah.c
index e9e3a6f390db..1672808262ba 100644
--- a/drivers/infiniband/hw/mlx4/ah.c
+++ b/drivers/infiniband/hw/mlx4/ah.c
@@ -144,7 +144,7 @@ static struct ib_ah *create_iboe_ah(struct ib_pd *pd,
}
struct ib_ah *mlx4_ib_create_ah(struct ib_pd *pd, struct rdma_ah_attr *ah_attr,
- struct ib_udata *udata)
+ u32 flags, struct ib_udata *udata)
{
struct mlx4_ib_ah *ah;
@@ -189,7 +189,7 @@ struct ib_ah *mlx4_ib_create_ah_slave(struct ib_pd *pd,
slave_attr.grh.sgid_attr = NULL;
slave_attr.grh.sgid_index = slave_sgid_index;
- ah = mlx4_ib_create_ah(pd, &slave_attr, NULL);
+ ah = mlx4_ib_create_ah(pd, &slave_attr, 0, NULL);
if (IS_ERR(ah))
return ah;
@@ -250,7 +250,7 @@ int mlx4_ib_query_ah(struct ib_ah *ibah, struct rdma_ah_attr *ah_attr)
return 0;
}
-int mlx4_ib_destroy_ah(struct ib_ah *ah)
+int mlx4_ib_destroy_ah(struct ib_ah *ah, u32 flags)
{
kfree(to_mah(ah));
return 0;
diff --git a/drivers/infiniband/hw/mlx4/alias_GUID.c b/drivers/infiniband/hw/mlx4/alias_GUID.c
index 155b4dfc0ae8..2a0b59a4b6eb 100644
--- a/drivers/infiniband/hw/mlx4/alias_GUID.c
+++ b/drivers/infiniband/hw/mlx4/alias_GUID.c
@@ -804,8 +804,8 @@ void mlx4_ib_destroy_alias_guid_service(struct mlx4_ib_dev *dev)
unsigned long flags;
for (i = 0 ; i < dev->num_ports; i++) {
- cancel_delayed_work(&dev->sriov.alias_guid.ports_guid[i].alias_guid_work);
det = &sriov->alias_guid.ports_guid[i];
+ cancel_delayed_work_sync(&det->alias_guid_work);
spin_lock_irqsave(&sriov->alias_guid.ag_work_lock, flags);
while (!list_empty(&det->cb_list)) {
cb_ctx = list_entry(det->cb_list.next,
@@ -849,7 +849,7 @@ int mlx4_ib_init_alias_guid_service(struct mlx4_ib_dev *dev)
spin_lock_init(&dev->sriov.alias_guid.ag_work_lock);
for (i = 1; i <= dev->num_ports; ++i) {
- if (dev->ib_dev.query_gid(&dev->ib_dev , i, 0, &gid)) {
+ if (dev->ib_dev.ops.query_gid(&dev->ib_dev, i, 0, &gid)) {
ret = -EFAULT;
goto err_unregister;
}
diff --git a/drivers/infiniband/hw/mlx4/cm.c b/drivers/infiniband/hw/mlx4/cm.c
index fedaf8260105..8c79a480f2b7 100644
--- a/drivers/infiniband/hw/mlx4/cm.c
+++ b/drivers/infiniband/hw/mlx4/cm.c
@@ -39,7 +39,7 @@
#include "mlx4_ib.h"
-#define CM_CLEANUP_CACHE_TIMEOUT (5 * HZ)
+#define CM_CLEANUP_CACHE_TIMEOUT (30 * HZ)
struct id_map_entry {
struct rb_node node;
diff --git a/drivers/infiniband/hw/mlx4/cq.c b/drivers/infiniband/hw/mlx4/cq.c
index 82adc0d1d30e..03ac72339dd2 100644
--- a/drivers/infiniband/hw/mlx4/cq.c
+++ b/drivers/infiniband/hw/mlx4/cq.c
@@ -134,16 +134,16 @@ static void mlx4_ib_free_cq_buf(struct mlx4_ib_dev *dev, struct mlx4_ib_cq_buf *
mlx4_buf_free(dev->dev, (cqe + 1) * buf->entry_size, &buf->buf);
}
-static int mlx4_ib_get_cq_umem(struct mlx4_ib_dev *dev, struct ib_ucontext *context,
- struct mlx4_ib_cq_buf *buf, struct ib_umem **umem,
- u64 buf_addr, int cqe)
+static int mlx4_ib_get_cq_umem(struct mlx4_ib_dev *dev, struct ib_udata *udata,
+ struct mlx4_ib_cq_buf *buf,
+ struct ib_umem **umem, u64 buf_addr, int cqe)
{
int err;
int cqe_size = dev->dev->caps.cqe_size;
int shift;
int n;
- *umem = ib_umem_get(context, buf_addr, cqe * cqe_size,
+ *umem = ib_umem_get(udata, buf_addr, cqe * cqe_size,
IB_ACCESS_LOCAL_WRITE, 1);
if (IS_ERR(*umem))
return PTR_ERR(*umem);
@@ -181,6 +181,7 @@ struct ib_cq *mlx4_ib_create_cq(struct ib_device *ibdev,
struct mlx4_ib_dev *dev = to_mdev(ibdev);
struct mlx4_ib_cq *cq;
struct mlx4_uar *uar;
+ void *buf_addr;
int err;
if (entries < 1 || entries > dev->dev->caps.max_cqes)
@@ -189,7 +190,7 @@ struct ib_cq *mlx4_ib_create_cq(struct ib_device *ibdev,
if (attr->flags & ~CQ_CREATE_FLAGS_SUPPORTED)
return ERR_PTR(-EINVAL);
- cq = kmalloc(sizeof *cq, GFP_KERNEL);
+ cq = kzalloc(sizeof(*cq), GFP_KERNEL);
if (!cq)
return ERR_PTR(-ENOMEM);
@@ -211,13 +212,14 @@ struct ib_cq *mlx4_ib_create_cq(struct ib_device *ibdev,
goto err_cq;
}
- err = mlx4_ib_get_cq_umem(dev, context, &cq->buf, &cq->umem,
+ buf_addr = (void *)(unsigned long)ucmd.buf_addr;
+ err = mlx4_ib_get_cq_umem(dev, udata, &cq->buf, &cq->umem,
ucmd.buf_addr, entries);
if (err)
goto err_cq;
- err = mlx4_ib_db_map_user(to_mucontext(context), ucmd.db_addr,
- &cq->db);
+ err = mlx4_ib_db_map_user(to_mucontext(context), udata,
+ ucmd.db_addr, &cq->db);
if (err)
goto err_mtt;
@@ -237,6 +239,8 @@ struct ib_cq *mlx4_ib_create_cq(struct ib_device *ibdev,
if (err)
goto err_db;
+ buf_addr = &cq->buf.buf;
+
uar = &dev->priv_uar;
cq->mcq.usage = MLX4_RES_USAGE_DRIVER;
}
@@ -246,7 +250,9 @@ struct ib_cq *mlx4_ib_create_cq(struct ib_device *ibdev,
err = mlx4_cq_alloc(dev->dev, entries, &cq->buf.mtt, uar,
cq->db.dma, &cq->mcq, vector, 0,
- !!(cq->create_flags & IB_UVERBS_CQ_FLAGS_TIMESTAMP_COMPLETION));
+ !!(cq->create_flags &
+ IB_UVERBS_CQ_FLAGS_TIMESTAMP_COMPLETION),
+ buf_addr, !!context);
if (err)
goto err_dbmap;
@@ -329,7 +335,7 @@ static int mlx4_alloc_resize_umem(struct mlx4_ib_dev *dev, struct mlx4_ib_cq *cq
if (!cq->resize_buf)
return -ENOMEM;
- err = mlx4_ib_get_cq_umem(dev, cq->umem->context, &cq->resize_buf->buf,
+ err = mlx4_ib_get_cq_umem(dev, udata, &cq->resize_buf->buf,
&cq->resize_umem, ucmd.buf_addr, entries);
if (err) {
kfree(cq->resize_buf);
diff --git a/drivers/infiniband/hw/mlx4/doorbell.c b/drivers/infiniband/hw/mlx4/doorbell.c
index c51740986367..3aab71b29ce8 100644
--- a/drivers/infiniband/hw/mlx4/doorbell.c
+++ b/drivers/infiniband/hw/mlx4/doorbell.c
@@ -41,7 +41,8 @@ struct mlx4_ib_user_db_page {
int refcnt;
};
-int mlx4_ib_db_map_user(struct mlx4_ib_ucontext *context, unsigned long virt,
+int mlx4_ib_db_map_user(struct mlx4_ib_ucontext *context,
+ struct ib_udata *udata, unsigned long virt,
struct mlx4_db *db)
{
struct mlx4_ib_user_db_page *page;
@@ -61,8 +62,7 @@ int mlx4_ib_db_map_user(struct mlx4_ib_ucontext *context, unsigned long virt,
page->user_virt = (virt & PAGE_MASK);
page->refcnt = 0;
- page->umem = ib_umem_get(&context->ibucontext, virt & PAGE_MASK,
- PAGE_SIZE, 0, 0);
+ page->umem = ib_umem_get(udata, virt & PAGE_MASK, PAGE_SIZE, 0, 0);
if (IS_ERR(page->umem)) {
err = PTR_ERR(page->umem);
kfree(page);
diff --git a/drivers/infiniband/hw/mlx4/mad.c b/drivers/infiniband/hw/mlx4/mad.c
index 8942f5f7f04d..936ee1314bcd 100644
--- a/drivers/infiniband/hw/mlx4/mad.c
+++ b/drivers/infiniband/hw/mlx4/mad.c
@@ -202,13 +202,13 @@ static void update_sm_ah(struct mlx4_ib_dev *dev, u8 port_num, u16 lid, u8 sl)
rdma_ah_set_port_num(&ah_attr, port_num);
new_ah = rdma_create_ah(dev->send_agent[port_num - 1][0]->qp->pd,
- &ah_attr);
+ &ah_attr, 0);
if (IS_ERR(new_ah))
return;
spin_lock_irqsave(&dev->sm_lock, flags);
if (dev->sm_ah[port_num - 1])
- rdma_destroy_ah(dev->sm_ah[port_num - 1]);
+ rdma_destroy_ah(dev->sm_ah[port_num - 1], 0);
dev->sm_ah[port_num - 1] = new_ah;
spin_unlock_irqrestore(&dev->sm_lock, flags);
}
@@ -567,7 +567,7 @@ int mlx4_ib_send_to_slave(struct mlx4_ib_dev *dev, int slave, u8 port,
return -EINVAL;
rdma_ah_set_grh(&attr, &dgid, 0, 0, 0, 0);
}
- ah = rdma_create_ah(tun_ctx->pd, &attr);
+ ah = rdma_create_ah(tun_ctx->pd, &attr, 0);
if (IS_ERR(ah))
return -ENOMEM;
@@ -584,7 +584,7 @@ int mlx4_ib_send_to_slave(struct mlx4_ib_dev *dev, int slave, u8 port,
tun_mad = (struct mlx4_rcv_tunnel_mad *) (tun_qp->tx_ring[tun_tx_ix].buf.addr);
if (tun_qp->tx_ring[tun_tx_ix].ah)
- rdma_destroy_ah(tun_qp->tx_ring[tun_tx_ix].ah);
+ rdma_destroy_ah(tun_qp->tx_ring[tun_tx_ix].ah, 0);
tun_qp->tx_ring[tun_tx_ix].ah = ah;
ib_dma_sync_single_for_cpu(&dev->ib_dev,
tun_qp->tx_ring[tun_tx_ix].buf.map,
@@ -657,7 +657,7 @@ int mlx4_ib_send_to_slave(struct mlx4_ib_dev *dev, int slave, u8 port,
spin_unlock(&tun_qp->tx_lock);
tun_qp->tx_ring[tun_tx_ix].ah = NULL;
end:
- rdma_destroy_ah(ah);
+ rdma_destroy_ah(ah, 0);
return ret;
}
@@ -1024,7 +1024,7 @@ static void send_handler(struct ib_mad_agent *agent,
struct ib_mad_send_wc *mad_send_wc)
{
if (mad_send_wc->send_buf->context[0])
- rdma_destroy_ah(mad_send_wc->send_buf->context[0]);
+ rdma_destroy_ah(mad_send_wc->send_buf->context[0], 0);
ib_free_send_mad(mad_send_wc->send_buf);
}
@@ -1079,7 +1079,7 @@ void mlx4_ib_mad_cleanup(struct mlx4_ib_dev *dev)
}
if (dev->sm_ah[p])
- rdma_destroy_ah(dev->sm_ah[p]);
+ rdma_destroy_ah(dev->sm_ah[p], 0);
}
}
@@ -1411,7 +1411,7 @@ int mlx4_ib_send_to_wire(struct mlx4_ib_dev *dev, int slave, u8 port,
sqp_mad = (struct mlx4_mad_snd_buf *) (sqp->tx_ring[wire_tx_ix].buf.addr);
if (sqp->tx_ring[wire_tx_ix].ah)
- rdma_destroy_ah(sqp->tx_ring[wire_tx_ix].ah);
+ mlx4_ib_destroy_ah(sqp->tx_ring[wire_tx_ix].ah, 0);
sqp->tx_ring[wire_tx_ix].ah = ah;
ib_dma_sync_single_for_cpu(&dev->ib_dev,
sqp->tx_ring[wire_tx_ix].buf.map,
@@ -1450,7 +1450,7 @@ int mlx4_ib_send_to_wire(struct mlx4_ib_dev *dev, int slave, u8 port,
spin_unlock(&sqp->tx_lock);
sqp->tx_ring[wire_tx_ix].ah = NULL;
out:
- mlx4_ib_destroy_ah(ah);
+ mlx4_ib_destroy_ah(ah, 0);
return ret;
}
@@ -1716,7 +1716,7 @@ static void mlx4_ib_free_pv_qp_bufs(struct mlx4_ib_demux_pv_ctx *ctx,
tx_buf_size, DMA_TO_DEVICE);
kfree(tun_qp->tx_ring[i].buf.addr);
if (tun_qp->tx_ring[i].ah)
- rdma_destroy_ah(tun_qp->tx_ring[i].ah);
+ rdma_destroy_ah(tun_qp->tx_ring[i].ah, 0);
}
kfree(tun_qp->tx_ring);
kfree(tun_qp->ring);
@@ -1749,7 +1749,7 @@ static void mlx4_ib_tunnel_comp_worker(struct work_struct *work)
"wrid=0x%llx, status=0x%x\n",
wc.wr_id, wc.status);
rdma_destroy_ah(tun_qp->tx_ring[wc.wr_id &
- (MLX4_NUM_TUNNEL_BUFS - 1)].ah);
+ (MLX4_NUM_TUNNEL_BUFS - 1)].ah, 0);
tun_qp->tx_ring[wc.wr_id & (MLX4_NUM_TUNNEL_BUFS - 1)].ah
= NULL;
spin_lock(&tun_qp->tx_lock);
@@ -1766,7 +1766,7 @@ static void mlx4_ib_tunnel_comp_worker(struct work_struct *work)
ctx->slave, wc.status, wc.wr_id);
if (!MLX4_TUN_IS_RECV(wc.wr_id)) {
rdma_destroy_ah(tun_qp->tx_ring[wc.wr_id &
- (MLX4_NUM_TUNNEL_BUFS - 1)].ah);
+ (MLX4_NUM_TUNNEL_BUFS - 1)].ah, 0);
tun_qp->tx_ring[wc.wr_id & (MLX4_NUM_TUNNEL_BUFS - 1)].ah
= NULL;
spin_lock(&tun_qp->tx_lock);
@@ -1902,8 +1902,8 @@ static void mlx4_ib_sqp_comp_worker(struct work_struct *work)
if (wc.status == IB_WC_SUCCESS) {
switch (wc.opcode) {
case IB_WC_SEND:
- rdma_destroy_ah(sqp->tx_ring[wc.wr_id &
- (MLX4_NUM_TUNNEL_BUFS - 1)].ah);
+ mlx4_ib_destroy_ah(sqp->tx_ring[wc.wr_id &
+ (MLX4_NUM_TUNNEL_BUFS - 1)].ah, 0);
sqp->tx_ring[wc.wr_id & (MLX4_NUM_TUNNEL_BUFS - 1)].ah
= NULL;
spin_lock(&sqp->tx_lock);
@@ -1931,8 +1931,8 @@ static void mlx4_ib_sqp_comp_worker(struct work_struct *work)
" status = %d, wrid = 0x%llx\n",
ctx->slave, wc.status, wc.wr_id);
if (!MLX4_TUN_IS_RECV(wc.wr_id)) {
- rdma_destroy_ah(sqp->tx_ring[wc.wr_id &
- (MLX4_NUM_TUNNEL_BUFS - 1)].ah);
+ mlx4_ib_destroy_ah(sqp->tx_ring[wc.wr_id &
+ (MLX4_NUM_TUNNEL_BUFS - 1)].ah, 0);
sqp->tx_ring[wc.wr_id & (MLX4_NUM_TUNNEL_BUFS - 1)].ah
= NULL;
spin_lock(&sqp->tx_lock);
diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c
index 0def2323459c..733f7bbd5901 100644
--- a/drivers/infiniband/hw/mlx4/main.c
+++ b/drivers/infiniband/hw/mlx4/main.c
@@ -1076,17 +1076,18 @@ out:
return err;
}
-static struct ib_ucontext *mlx4_ib_alloc_ucontext(struct ib_device *ibdev,
- struct ib_udata *udata)
+static int mlx4_ib_alloc_ucontext(struct ib_ucontext *uctx,
+ struct ib_udata *udata)
{
+ struct ib_device *ibdev = uctx->device;
struct mlx4_ib_dev *dev = to_mdev(ibdev);
- struct mlx4_ib_ucontext *context;
+ struct mlx4_ib_ucontext *context = to_mucontext(uctx);
struct mlx4_ib_alloc_ucontext_resp_v3 resp_v3;
struct mlx4_ib_alloc_ucontext_resp resp;
int err;
if (!dev->ib_active)
- return ERR_PTR(-EAGAIN);
+ return -EAGAIN;
if (ibdev->uverbs_abi_ver == MLX4_IB_UVERBS_NO_DEV_CAPS_ABI_VERSION) {
resp_v3.qp_tab_size = dev->dev->caps.num_qps;
@@ -1100,15 +1101,9 @@ static struct ib_ucontext *mlx4_ib_alloc_ucontext(struct ib_device *ibdev,
resp.cqe_size = dev->dev->caps.cqe_size;
}
- context = kzalloc(sizeof(*context), GFP_KERNEL);
- if (!context)
- return ERR_PTR(-ENOMEM);
-
err = mlx4_uar_alloc(to_mdev(ibdev)->dev, &context->uar);
- if (err) {
- kfree(context);
- return ERR_PTR(err);
- }
+ if (err)
+ return err;
INIT_LIST_HEAD(&context->db_page_list);
mutex_init(&context->db_page_mutex);
@@ -1123,21 +1118,17 @@ static struct ib_ucontext *mlx4_ib_alloc_ucontext(struct ib_device *ibdev,
if (err) {
mlx4_uar_free(to_mdev(ibdev)->dev, &context->uar);
- kfree(context);
- return ERR_PTR(-EFAULT);
+ return -EFAULT;
}
- return &context->ibucontext;
+ return err;
}
-static int mlx4_ib_dealloc_ucontext(struct ib_ucontext *ibcontext)
+static void mlx4_ib_dealloc_ucontext(struct ib_ucontext *ibcontext)
{
struct mlx4_ib_ucontext *context = to_mucontext(ibcontext);
mlx4_uar_free(to_mdev(ibcontext->device)->dev, &context->uar);
- kfree(context);
-
- return 0;
}
static void mlx4_ib_disassociate_ucontext(struct ib_ucontext *ibcontext)
@@ -1186,38 +1177,27 @@ static int mlx4_ib_mmap(struct ib_ucontext *context, struct vm_area_struct *vma)
}
}
-static struct ib_pd *mlx4_ib_alloc_pd(struct ib_device *ibdev,
- struct ib_ucontext *context,
- struct ib_udata *udata)
+static int mlx4_ib_alloc_pd(struct ib_pd *ibpd, struct ib_ucontext *context,
+ struct ib_udata *udata)
{
- struct mlx4_ib_pd *pd;
+ struct mlx4_ib_pd *pd = to_mpd(ibpd);
+ struct ib_device *ibdev = ibpd->device;
int err;
- pd = kzalloc(sizeof(*pd), GFP_KERNEL);
- if (!pd)
- return ERR_PTR(-ENOMEM);
-
err = mlx4_pd_alloc(to_mdev(ibdev)->dev, &pd->pdn);
- if (err) {
- kfree(pd);
- return ERR_PTR(err);
- }
+ if (err)
+ return err;
- if (context)
- if (ib_copy_to_udata(udata, &pd->pdn, sizeof (__u32))) {
- mlx4_pd_free(to_mdev(ibdev)->dev, pd->pdn);
- kfree(pd);
- return ERR_PTR(-EFAULT);
- }
- return &pd->ibpd;
+ if (context && ib_copy_to_udata(udata, &pd->pdn, sizeof(__u32))) {
+ mlx4_pd_free(to_mdev(ibdev)->dev, pd->pdn);
+ return -EFAULT;
+ }
+ return 0;
}
-static int mlx4_ib_dealloc_pd(struct ib_pd *pd)
+static void mlx4_ib_dealloc_pd(struct ib_pd *pd)
{
mlx4_pd_free(to_mdev(pd->device)->dev, to_mpd(pd)->pdn);
- kfree(pd);
-
- return 0;
}
static struct ib_xrcd *mlx4_ib_alloc_xrcd(struct ib_device *ibdev,
@@ -2043,7 +2023,7 @@ static ssize_t hca_type_show(struct device *device,
struct device_attribute *attr, char *buf)
{
struct mlx4_ib_dev *dev =
- container_of(device, struct mlx4_ib_dev, ib_dev.dev);
+ rdma_device_to_drv_device(device, struct mlx4_ib_dev, ib_dev);
return sprintf(buf, "MT%d\n", dev->dev->persist->pdev->device);
}
static DEVICE_ATTR_RO(hca_type);
@@ -2052,7 +2032,7 @@ static ssize_t hw_rev_show(struct device *device,
struct device_attribute *attr, char *buf)
{
struct mlx4_ib_dev *dev =
- container_of(device, struct mlx4_ib_dev, ib_dev.dev);
+ rdma_device_to_drv_device(device, struct mlx4_ib_dev, ib_dev);
return sprintf(buf, "%x\n", dev->dev->rev_id);
}
static DEVICE_ATTR_RO(hw_rev);
@@ -2061,7 +2041,8 @@ static ssize_t board_id_show(struct device *device,
struct device_attribute *attr, char *buf)
{
struct mlx4_ib_dev *dev =
- container_of(device, struct mlx4_ib_dev, ib_dev.dev);
+ rdma_device_to_drv_device(device, struct mlx4_ib_dev, ib_dev);
+
return sprintf(buf, "%.*s\n", MLX4_BOARD_ID_LEN,
dev->dev->board_id);
}
@@ -2220,6 +2201,11 @@ static void mlx4_ib_fill_diag_counters(struct mlx4_ib_dev *ibdev,
}
}
+static const struct ib_device_ops mlx4_ib_hw_stats_ops = {
+ .alloc_hw_stats = mlx4_ib_alloc_hw_stats,
+ .get_hw_stats = mlx4_ib_get_hw_stats,
+};
+
static int mlx4_ib_alloc_diag_counters(struct mlx4_ib_dev *ibdev)
{
struct mlx4_ib_diag_counters *diag = ibdev->diag_counters;
@@ -2246,8 +2232,7 @@ static int mlx4_ib_alloc_diag_counters(struct mlx4_ib_dev *ibdev)
diag[i].offset, i);
}
- ibdev->ib_dev.get_hw_stats = mlx4_ib_get_hw_stats;
- ibdev->ib_dev.alloc_hw_stats = mlx4_ib_alloc_hw_stats;
+ ib_set_device_ops(&ibdev->ib_dev, &mlx4_ib_hw_stats_ops);
return 0;
@@ -2352,6 +2337,32 @@ static void mlx4_ib_scan_netdevs(struct mlx4_ib_dev *ibdev,
event == NETDEV_UP || event == NETDEV_CHANGE))
update_qps_port = port;
+ if (dev == iboe->netdevs[port - 1] &&
+ (event == NETDEV_UP || event == NETDEV_DOWN)) {
+ enum ib_port_state port_state;
+ struct ib_event ibev = { };
+
+ if (ib_get_cached_port_state(&ibdev->ib_dev, port,
+ &port_state))
+ continue;
+
+ if (event == NETDEV_UP &&
+ (port_state != IB_PORT_ACTIVE ||
+ iboe->last_port_state[port - 1] != IB_PORT_DOWN))
+ continue;
+ if (event == NETDEV_DOWN &&
+ (port_state != IB_PORT_DOWN ||
+ iboe->last_port_state[port - 1] != IB_PORT_ACTIVE))
+ continue;
+ iboe->last_port_state[port - 1] = port_state;
+
+ ibev.device = &ibdev->ib_dev;
+ ibev.element.port_num = port;
+ ibev.event = event == NETDEV_UP ? IB_EVENT_PORT_ACTIVE :
+ IB_EVENT_PORT_ERR;
+ ib_dispatch_event(&ibev);
+ }
+
}
spin_unlock_bh(&iboe->lock);
@@ -2499,6 +2510,90 @@ static void get_fw_ver_str(struct ib_device *device, char *str)
(int) dev->dev->caps.fw_ver & 0xffff);
}
+static const struct ib_device_ops mlx4_ib_dev_ops = {
+ .add_gid = mlx4_ib_add_gid,
+ .alloc_mr = mlx4_ib_alloc_mr,
+ .alloc_pd = mlx4_ib_alloc_pd,
+ .alloc_ucontext = mlx4_ib_alloc_ucontext,
+ .attach_mcast = mlx4_ib_mcg_attach,
+ .create_ah = mlx4_ib_create_ah,
+ .create_cq = mlx4_ib_create_cq,
+ .create_qp = mlx4_ib_create_qp,
+ .create_srq = mlx4_ib_create_srq,
+ .dealloc_pd = mlx4_ib_dealloc_pd,
+ .dealloc_ucontext = mlx4_ib_dealloc_ucontext,
+ .del_gid = mlx4_ib_del_gid,
+ .dereg_mr = mlx4_ib_dereg_mr,
+ .destroy_ah = mlx4_ib_destroy_ah,
+ .destroy_cq = mlx4_ib_destroy_cq,
+ .destroy_qp = mlx4_ib_destroy_qp,
+ .destroy_srq = mlx4_ib_destroy_srq,
+ .detach_mcast = mlx4_ib_mcg_detach,
+ .disassociate_ucontext = mlx4_ib_disassociate_ucontext,
+ .drain_rq = mlx4_ib_drain_rq,
+ .drain_sq = mlx4_ib_drain_sq,
+ .get_dev_fw_str = get_fw_ver_str,
+ .get_dma_mr = mlx4_ib_get_dma_mr,
+ .get_link_layer = mlx4_ib_port_link_layer,
+ .get_netdev = mlx4_ib_get_netdev,
+ .get_port_immutable = mlx4_port_immutable,
+ .map_mr_sg = mlx4_ib_map_mr_sg,
+ .mmap = mlx4_ib_mmap,
+ .modify_cq = mlx4_ib_modify_cq,
+ .modify_device = mlx4_ib_modify_device,
+ .modify_port = mlx4_ib_modify_port,
+ .modify_qp = mlx4_ib_modify_qp,
+ .modify_srq = mlx4_ib_modify_srq,
+ .poll_cq = mlx4_ib_poll_cq,
+ .post_recv = mlx4_ib_post_recv,
+ .post_send = mlx4_ib_post_send,
+ .post_srq_recv = mlx4_ib_post_srq_recv,
+ .process_mad = mlx4_ib_process_mad,
+ .query_ah = mlx4_ib_query_ah,
+ .query_device = mlx4_ib_query_device,
+ .query_gid = mlx4_ib_query_gid,
+ .query_pkey = mlx4_ib_query_pkey,
+ .query_port = mlx4_ib_query_port,
+ .query_qp = mlx4_ib_query_qp,
+ .query_srq = mlx4_ib_query_srq,
+ .reg_user_mr = mlx4_ib_reg_user_mr,
+ .req_notify_cq = mlx4_ib_arm_cq,
+ .rereg_user_mr = mlx4_ib_rereg_user_mr,
+ .resize_cq = mlx4_ib_resize_cq,
+ INIT_RDMA_OBJ_SIZE(ib_pd, mlx4_ib_pd, ibpd),
+ INIT_RDMA_OBJ_SIZE(ib_ucontext, mlx4_ib_ucontext, ibucontext),
+};
+
+static const struct ib_device_ops mlx4_ib_dev_wq_ops = {
+ .create_rwq_ind_table = mlx4_ib_create_rwq_ind_table,
+ .create_wq = mlx4_ib_create_wq,
+ .destroy_rwq_ind_table = mlx4_ib_destroy_rwq_ind_table,
+ .destroy_wq = mlx4_ib_destroy_wq,
+ .modify_wq = mlx4_ib_modify_wq,
+};
+
+static const struct ib_device_ops mlx4_ib_dev_fmr_ops = {
+ .alloc_fmr = mlx4_ib_fmr_alloc,
+ .dealloc_fmr = mlx4_ib_fmr_dealloc,
+ .map_phys_fmr = mlx4_ib_map_phys_fmr,
+ .unmap_fmr = mlx4_ib_unmap_fmr,
+};
+
+static const struct ib_device_ops mlx4_ib_dev_mw_ops = {
+ .alloc_mw = mlx4_ib_alloc_mw,
+ .dealloc_mw = mlx4_ib_dealloc_mw,
+};
+
+static const struct ib_device_ops mlx4_ib_dev_xrc_ops = {
+ .alloc_xrcd = mlx4_ib_alloc_xrcd,
+ .dealloc_xrcd = mlx4_ib_dealloc_xrcd,
+};
+
+static const struct ib_device_ops mlx4_ib_dev_fs_ops = {
+ .create_flow = mlx4_ib_create_flow,
+ .destroy_flow = mlx4_ib_destroy_flow,
+};
+
static void *mlx4_ib_add(struct mlx4_dev *dev)
{
struct mlx4_ib_dev *ibdev;
@@ -2522,7 +2617,7 @@ static void *mlx4_ib_add(struct mlx4_dev *dev)
if (num_ports == 0)
return NULL;
- ibdev = (struct mlx4_ib_dev *) ib_alloc_device(sizeof *ibdev);
+ ibdev = ib_alloc_device(mlx4_ib_dev, ib_dev);
if (!ibdev) {
dev_err(&dev->persist->pdev->dev,
"Device struct alloc failed\n");
@@ -2554,9 +2649,6 @@ static void *mlx4_ib_add(struct mlx4_dev *dev)
1 : ibdev->num_ports;
ibdev->ib_dev.num_comp_vectors = dev->caps.num_comp_vectors;
ibdev->ib_dev.dev.parent = &dev->persist->pdev->dev;
- ibdev->ib_dev.get_netdev = mlx4_ib_get_netdev;
- ibdev->ib_dev.add_gid = mlx4_ib_add_gid;
- ibdev->ib_dev.del_gid = mlx4_ib_del_gid;
if (dev->caps.userspace_caps)
ibdev->ib_dev.uverbs_abi_ver = MLX4_IB_UVERBS_ABI_VERSION;
@@ -2589,116 +2681,53 @@ static void *mlx4_ib_add(struct mlx4_dev *dev)
(1ull << IB_USER_VERBS_CMD_CREATE_XSRQ) |
(1ull << IB_USER_VERBS_CMD_OPEN_QP);
- ibdev->ib_dev.query_device = mlx4_ib_query_device;
- ibdev->ib_dev.query_port = mlx4_ib_query_port;
- ibdev->ib_dev.get_link_layer = mlx4_ib_port_link_layer;
- ibdev->ib_dev.query_gid = mlx4_ib_query_gid;
- ibdev->ib_dev.query_pkey = mlx4_ib_query_pkey;
- ibdev->ib_dev.modify_device = mlx4_ib_modify_device;
- ibdev->ib_dev.modify_port = mlx4_ib_modify_port;
- ibdev->ib_dev.alloc_ucontext = mlx4_ib_alloc_ucontext;
- ibdev->ib_dev.dealloc_ucontext = mlx4_ib_dealloc_ucontext;
- ibdev->ib_dev.mmap = mlx4_ib_mmap;
- ibdev->ib_dev.alloc_pd = mlx4_ib_alloc_pd;
- ibdev->ib_dev.dealloc_pd = mlx4_ib_dealloc_pd;
- ibdev->ib_dev.create_ah = mlx4_ib_create_ah;
- ibdev->ib_dev.query_ah = mlx4_ib_query_ah;
- ibdev->ib_dev.destroy_ah = mlx4_ib_destroy_ah;
- ibdev->ib_dev.create_srq = mlx4_ib_create_srq;
- ibdev->ib_dev.modify_srq = mlx4_ib_modify_srq;
- ibdev->ib_dev.query_srq = mlx4_ib_query_srq;
- ibdev->ib_dev.destroy_srq = mlx4_ib_destroy_srq;
- ibdev->ib_dev.post_srq_recv = mlx4_ib_post_srq_recv;
- ibdev->ib_dev.create_qp = mlx4_ib_create_qp;
- ibdev->ib_dev.modify_qp = mlx4_ib_modify_qp;
- ibdev->ib_dev.query_qp = mlx4_ib_query_qp;
- ibdev->ib_dev.destroy_qp = mlx4_ib_destroy_qp;
- ibdev->ib_dev.drain_sq = mlx4_ib_drain_sq;
- ibdev->ib_dev.drain_rq = mlx4_ib_drain_rq;
- ibdev->ib_dev.post_send = mlx4_ib_post_send;
- ibdev->ib_dev.post_recv = mlx4_ib_post_recv;
- ibdev->ib_dev.create_cq = mlx4_ib_create_cq;
- ibdev->ib_dev.modify_cq = mlx4_ib_modify_cq;
- ibdev->ib_dev.resize_cq = mlx4_ib_resize_cq;
- ibdev->ib_dev.destroy_cq = mlx4_ib_destroy_cq;
- ibdev->ib_dev.poll_cq = mlx4_ib_poll_cq;
- ibdev->ib_dev.req_notify_cq = mlx4_ib_arm_cq;
- ibdev->ib_dev.get_dma_mr = mlx4_ib_get_dma_mr;
- ibdev->ib_dev.reg_user_mr = mlx4_ib_reg_user_mr;
- ibdev->ib_dev.rereg_user_mr = mlx4_ib_rereg_user_mr;
- ibdev->ib_dev.dereg_mr = mlx4_ib_dereg_mr;
- ibdev->ib_dev.alloc_mr = mlx4_ib_alloc_mr;
- ibdev->ib_dev.map_mr_sg = mlx4_ib_map_mr_sg;
- ibdev->ib_dev.attach_mcast = mlx4_ib_mcg_attach;
- ibdev->ib_dev.detach_mcast = mlx4_ib_mcg_detach;
- ibdev->ib_dev.process_mad = mlx4_ib_process_mad;
- ibdev->ib_dev.get_port_immutable = mlx4_port_immutable;
- ibdev->ib_dev.get_dev_fw_str = get_fw_ver_str;
- ibdev->ib_dev.disassociate_ucontext = mlx4_ib_disassociate_ucontext;
-
+ ib_set_device_ops(&ibdev->ib_dev, &mlx4_ib_dev_ops);
ibdev->ib_dev.uverbs_ex_cmd_mask |=
- (1ull << IB_USER_VERBS_EX_CMD_MODIFY_CQ);
+ (1ull << IB_USER_VERBS_EX_CMD_MODIFY_CQ) |
+ (1ull << IB_USER_VERBS_EX_CMD_QUERY_DEVICE) |
+ (1ull << IB_USER_VERBS_EX_CMD_CREATE_CQ) |
+ (1ull << IB_USER_VERBS_EX_CMD_CREATE_QP);
if ((dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_RSS) &&
((mlx4_ib_port_link_layer(&ibdev->ib_dev, 1) ==
IB_LINK_LAYER_ETHERNET) ||
(mlx4_ib_port_link_layer(&ibdev->ib_dev, 2) ==
IB_LINK_LAYER_ETHERNET))) {
- ibdev->ib_dev.create_wq = mlx4_ib_create_wq;
- ibdev->ib_dev.modify_wq = mlx4_ib_modify_wq;
- ibdev->ib_dev.destroy_wq = mlx4_ib_destroy_wq;
- ibdev->ib_dev.create_rwq_ind_table =
- mlx4_ib_create_rwq_ind_table;
- ibdev->ib_dev.destroy_rwq_ind_table =
- mlx4_ib_destroy_rwq_ind_table;
ibdev->ib_dev.uverbs_ex_cmd_mask |=
(1ull << IB_USER_VERBS_EX_CMD_CREATE_WQ) |
(1ull << IB_USER_VERBS_EX_CMD_MODIFY_WQ) |
(1ull << IB_USER_VERBS_EX_CMD_DESTROY_WQ) |
(1ull << IB_USER_VERBS_EX_CMD_CREATE_RWQ_IND_TBL) |
(1ull << IB_USER_VERBS_EX_CMD_DESTROY_RWQ_IND_TBL);
+ ib_set_device_ops(&ibdev->ib_dev, &mlx4_ib_dev_wq_ops);
}
- if (!mlx4_is_slave(ibdev->dev)) {
- ibdev->ib_dev.alloc_fmr = mlx4_ib_fmr_alloc;
- ibdev->ib_dev.map_phys_fmr = mlx4_ib_map_phys_fmr;
- ibdev->ib_dev.unmap_fmr = mlx4_ib_unmap_fmr;
- ibdev->ib_dev.dealloc_fmr = mlx4_ib_fmr_dealloc;
- }
+ if (!mlx4_is_slave(ibdev->dev))
+ ib_set_device_ops(&ibdev->ib_dev, &mlx4_ib_dev_fmr_ops);
if (dev->caps.flags & MLX4_DEV_CAP_FLAG_MEM_WINDOW ||
dev->caps.bmme_flags & MLX4_BMME_FLAG_TYPE_2_WIN) {
- ibdev->ib_dev.alloc_mw = mlx4_ib_alloc_mw;
- ibdev->ib_dev.dealloc_mw = mlx4_ib_dealloc_mw;
-
ibdev->ib_dev.uverbs_cmd_mask |=
(1ull << IB_USER_VERBS_CMD_ALLOC_MW) |
(1ull << IB_USER_VERBS_CMD_DEALLOC_MW);
+ ib_set_device_ops(&ibdev->ib_dev, &mlx4_ib_dev_mw_ops);
}
if (dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC) {
- ibdev->ib_dev.alloc_xrcd = mlx4_ib_alloc_xrcd;
- ibdev->ib_dev.dealloc_xrcd = mlx4_ib_dealloc_xrcd;
ibdev->ib_dev.uverbs_cmd_mask |=
(1ull << IB_USER_VERBS_CMD_OPEN_XRCD) |
(1ull << IB_USER_VERBS_CMD_CLOSE_XRCD);
+ ib_set_device_ops(&ibdev->ib_dev, &mlx4_ib_dev_xrc_ops);
}
if (check_flow_steering_support(dev)) {
ibdev->steering_support = MLX4_STEERING_MODE_DEVICE_MANAGED;
- ibdev->ib_dev.create_flow = mlx4_ib_create_flow;
- ibdev->ib_dev.destroy_flow = mlx4_ib_destroy_flow;
-
ibdev->ib_dev.uverbs_ex_cmd_mask |=
(1ull << IB_USER_VERBS_EX_CMD_CREATE_FLOW) |
(1ull << IB_USER_VERBS_EX_CMD_DESTROY_FLOW);
+ ib_set_device_ops(&ibdev->ib_dev, &mlx4_ib_dev_fs_ops);
}
- ibdev->ib_dev.uverbs_ex_cmd_mask |=
- (1ull << IB_USER_VERBS_EX_CMD_QUERY_DEVICE) |
- (1ull << IB_USER_VERBS_EX_CMD_CREATE_CQ) |
- (1ull << IB_USER_VERBS_EX_CMD_CREATE_QP);
-
mlx4_ib_alloc_eqs(dev, ibdev);
spin_lock_init(&iboe->lock);
@@ -2710,6 +2739,7 @@ static void *mlx4_ib_add(struct mlx4_dev *dev)
for (i = 0; i < ibdev->num_ports; ++i) {
mutex_init(&ibdev->counters_table[i].mutex);
INIT_LIST_HEAD(&ibdev->counters_table[i].counters_list);
+ iboe->last_port_state[i] = IB_PORT_DOWN;
}
num_req_counters = mlx4_is_bonded(dev) ? 1 : ibdev->num_ports;
@@ -2809,7 +2839,7 @@ static void *mlx4_ib_add(struct mlx4_dev *dev)
rdma_set_device_sysfs_group(&ibdev->ib_dev, &mlx4_attr_group);
ibdev->ib_dev.driver_id = RDMA_DRIVER_MLX4;
- if (ib_register_device(&ibdev->ib_dev, "mlx4_%d", NULL))
+ if (ib_register_device(&ibdev->ib_dev, "mlx4_%d"))
goto err_diag_counters;
if (mlx4_ib_mad_init(ibdev))
diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h
index 8850dfc3826d..60dc1347c5ab 100644
--- a/drivers/infiniband/hw/mlx4/mlx4_ib.h
+++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h
@@ -519,6 +519,7 @@ struct mlx4_ib_iboe {
atomic64_t mac[MLX4_MAX_PORTS];
struct notifier_block nb;
struct mlx4_port_gid_table gids[MLX4_MAX_PORTS];
+ enum ib_port_state last_port_state[MLX4_MAX_PORTS];
};
struct pkey_mgt {
@@ -721,7 +722,8 @@ static inline u8 mlx4_ib_bond_next_port(struct mlx4_ib_dev *dev)
int mlx4_ib_init_sriov(struct mlx4_ib_dev *dev);
void mlx4_ib_close_sriov(struct mlx4_ib_dev *dev);
-int mlx4_ib_db_map_user(struct mlx4_ib_ucontext *context, unsigned long virt,
+int mlx4_ib_db_map_user(struct mlx4_ib_ucontext *context,
+ struct ib_udata *udata, unsigned long virt,
struct mlx4_db *db);
void mlx4_ib_db_unmap_user(struct mlx4_ib_ucontext *context, struct mlx4_db *db);
@@ -753,13 +755,13 @@ void __mlx4_ib_cq_clean(struct mlx4_ib_cq *cq, u32 qpn, struct mlx4_ib_srq *srq)
void mlx4_ib_cq_clean(struct mlx4_ib_cq *cq, u32 qpn, struct mlx4_ib_srq *srq);
struct ib_ah *mlx4_ib_create_ah(struct ib_pd *pd, struct rdma_ah_attr *ah_attr,
- struct ib_udata *udata);
+ u32 flags, struct ib_udata *udata);
struct ib_ah *mlx4_ib_create_ah_slave(struct ib_pd *pd,
struct rdma_ah_attr *ah_attr,
int slave_sgid_index, u8 *s_mac,
u16 vlan_tag);
int mlx4_ib_query_ah(struct ib_ah *ibah, struct rdma_ah_attr *ah_attr);
-int mlx4_ib_destroy_ah(struct ib_ah *ah);
+int mlx4_ib_destroy_ah(struct ib_ah *ah, u32 flags);
struct ib_srq *mlx4_ib_create_srq(struct ib_pd *pd,
struct ib_srq_init_attr *init_attr,
diff --git a/drivers/infiniband/hw/mlx4/mr.c b/drivers/infiniband/hw/mlx4/mr.c
index c7c85c22e4e3..395379a480cb 100644
--- a/drivers/infiniband/hw/mlx4/mr.c
+++ b/drivers/infiniband/hw/mlx4/mr.c
@@ -367,7 +367,7 @@ end:
return block_shift;
}
-static struct ib_umem *mlx4_get_umem_mr(struct ib_ucontext *context, u64 start,
+static struct ib_umem *mlx4_get_umem_mr(struct ib_udata *udata, u64 start,
u64 length, u64 virt_addr,
int access_flags)
{
@@ -398,7 +398,7 @@ static struct ib_umem *mlx4_get_umem_mr(struct ib_ucontext *context, u64 start,
up_read(&current->mm->mmap_sem);
}
- return ib_umem_get(context, start, length, access_flags, 0);
+ return ib_umem_get(udata, start, length, access_flags, 0);
}
struct ib_mr *mlx4_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
@@ -415,8 +415,8 @@ struct ib_mr *mlx4_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
if (!mr)
return ERR_PTR(-ENOMEM);
- mr->umem = mlx4_get_umem_mr(pd->uobject->context, start, length,
- virt_addr, access_flags);
+ mr->umem =
+ mlx4_get_umem_mr(udata, start, length, virt_addr, access_flags);
if (IS_ERR(mr->umem)) {
err = PTR_ERR(mr->umem);
goto err_free;
@@ -505,9 +505,8 @@ int mlx4_ib_rereg_user_mr(struct ib_mr *mr, int flags,
mlx4_mr_rereg_mem_cleanup(dev->dev, &mmr->mmr);
ib_umem_release(mmr->umem);
- mmr->umem =
- mlx4_get_umem_mr(mr->uobject->context, start, length,
- virt_addr, mr_access_flags);
+ mmr->umem = mlx4_get_umem_mr(udata, start, length, virt_addr,
+ mr_access_flags);
if (IS_ERR(mmr->umem)) {
err = PTR_ERR(mmr->umem);
/* Prevent mlx4_ib_dereg_mr from free'ing invalid pointer */
diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c
index 0711ca1dfb8f..429a59c5801c 100644
--- a/drivers/infiniband/hw/mlx4/qp.c
+++ b/drivers/infiniband/hw/mlx4/qp.c
@@ -41,6 +41,7 @@
#include <rdma/ib_pack.h>
#include <rdma/ib_addr.h>
#include <rdma/ib_mad.h>
+#include <rdma/uverbs_ioctl.h>
#include <linux/mlx4/driver.h>
#include <linux/mlx4/qp.h>
@@ -52,7 +53,8 @@ static void mlx4_ib_lock_cqs(struct mlx4_ib_cq *send_cq,
struct mlx4_ib_cq *recv_cq);
static void mlx4_ib_unlock_cqs(struct mlx4_ib_cq *send_cq,
struct mlx4_ib_cq *recv_cq);
-static int _mlx4_ib_modify_wq(struct ib_wq *ibwq, enum ib_wq_state new_state);
+static int _mlx4_ib_modify_wq(struct ib_wq *ibwq, enum ib_wq_state new_state,
+ struct ib_udata *udata);
enum {
MLX4_IB_ACK_REQ_FREQ = 8,
@@ -323,7 +325,7 @@ static int send_wqe_overhead(enum mlx4_ib_qp_type type, u32 flags)
}
static int set_rq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap,
- int is_user, int has_rq, struct mlx4_ib_qp *qp,
+ bool is_user, int has_rq, struct mlx4_ib_qp *qp,
u32 inl_recv_sz)
{
/* Sanity check RQ size before proceeding */
@@ -401,7 +403,7 @@ static int set_kernel_sq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap,
* We need to leave 2 KB + 1 WR of headroom in the SQ to
* allow HW to prefetch.
*/
- qp->sq_spare_wqes = (2048 >> qp->sq.wqe_shift) + 1;
+ qp->sq_spare_wqes = MLX4_IB_SQ_HEADROOM(qp->sq.wqe_shift);
qp->sq.wqe_cnt = roundup_pow_of_two(cap->max_send_wr +
qp->sq_spare_wqes);
@@ -863,6 +865,8 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd,
int err;
struct mlx4_ib_sqp *sqp = NULL;
struct mlx4_ib_qp *qp;
+ struct mlx4_ib_ucontext *context = rdma_udata_to_drv_context(
+ udata, struct mlx4_ib_ucontext, ibucontext);
enum mlx4_ib_qp_type qp_type = (enum mlx4_ib_qp_type) init_attr->qp_type;
struct mlx4_ib_cq *mcq;
unsigned long flags;
@@ -942,7 +946,7 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd,
qp->sq_signal_bits = cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE);
- if (pd->uobject) {
+ if (udata) {
union {
struct mlx4_ib_create_qp qp;
struct mlx4_ib_create_wq wq;
@@ -991,7 +995,7 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd,
qp->flags |= MLX4_IB_QP_SCATTER_FCS;
}
- err = set_rq_size(dev, &init_attr->cap, !!pd->uobject,
+ err = set_rq_size(dev, &init_attr->cap, udata,
qp_has_rq(init_attr), qp, qp->inl_recv_sz);
if (err)
goto err;
@@ -1015,9 +1019,11 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd,
(qp->sq.wqe_cnt << qp->sq.wqe_shift);
}
- qp->umem = ib_umem_get(pd->uobject->context,
- (src == MLX4_IB_QP_SRC) ? ucmd.qp.buf_addr :
- ucmd.wq.buf_addr, qp->buf_size, 0, 0);
+ qp->umem =
+ ib_umem_get(udata,
+ (src == MLX4_IB_QP_SRC) ? ucmd.qp.buf_addr :
+ ucmd.wq.buf_addr,
+ qp->buf_size, 0, 0);
if (IS_ERR(qp->umem)) {
err = PTR_ERR(qp->umem);
goto err;
@@ -1035,15 +1041,17 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd,
goto err_mtt;
if (qp_has_rq(init_attr)) {
- err = mlx4_ib_db_map_user(to_mucontext(pd->uobject->context),
+ err = mlx4_ib_db_map_user(
+ context, udata,
(src == MLX4_IB_QP_SRC) ? ucmd.qp.db_addr :
- ucmd.wq.db_addr, &qp->db);
+ ucmd.wq.db_addr,
+ &qp->db);
if (err)
goto err_mtt;
}
qp->mqp.usage = MLX4_RES_USAGE_USER_VERBS;
} else {
- err = set_rq_size(dev, &init_attr->cap, !!pd->uobject,
+ err = set_rq_size(dev, &init_attr->cap, udata,
qp_has_rq(init_attr), qp, 0);
if (err)
goto err;
@@ -1108,8 +1116,7 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd,
}
}
} else if (src == MLX4_IB_RWQ_SRC) {
- err = mlx4_ib_alloc_wqn(to_mucontext(pd->uobject->context), qp,
- range_size, &qpn);
+ err = mlx4_ib_alloc_wqn(context, qp, range_size, &qpn);
if (err)
goto err_wrid;
} else {
@@ -1180,8 +1187,7 @@ err_qpn:
if (qp->flags & MLX4_IB_QP_NETIF)
mlx4_ib_steer_qp_free(dev, qpn, 1);
else if (src == MLX4_IB_RWQ_SRC)
- mlx4_ib_release_wqn(to_mucontext(pd->uobject->context),
- qp, 0);
+ mlx4_ib_release_wqn(context, qp, 0);
else
mlx4_qp_release_range(dev->dev, qpn, 1);
}
@@ -1189,9 +1195,9 @@ err_proxy:
if (qp->mlx4_ib_qp_type == MLX4_IB_QPT_PROXY_GSI)
free_proxy_bufs(pd->device, qp);
err_wrid:
- if (pd->uobject) {
+ if (udata) {
if (qp_has_rq(init_attr))
- mlx4_ib_db_unmap_user(to_mucontext(pd->uobject->context), &qp->db);
+ mlx4_ib_db_unmap_user(context, &qp->db);
} else {
kvfree(qp->sq.wrid);
kvfree(qp->rq.wrid);
@@ -1201,20 +1207,20 @@ err_mtt:
mlx4_mtt_cleanup(dev->dev, &qp->mtt);
err_buf:
- if (pd->uobject)
+ if (qp->umem)
ib_umem_release(qp->umem);
else
mlx4_buf_free(dev->dev, qp->buf_size, &qp->buf);
err_db:
- if (!pd->uobject && qp_has_rq(init_attr))
+ if (!udata && qp_has_rq(init_attr))
mlx4_db_free(dev->dev, &qp->db);
err:
- if (sqp)
- kfree(sqp);
- else if (!*caller_qp)
+ if (!sqp && !*caller_qp)
kfree(qp);
+ kfree(sqp);
+
return err;
}
@@ -1332,7 +1338,7 @@ static void destroy_qp_rss(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp)
}
static void destroy_qp_common(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp,
- enum mlx4_ib_source_type src, int is_user)
+ enum mlx4_ib_source_type src, bool is_user)
{
struct mlx4_ib_cq *send_cq, *recv_cq;
unsigned long flags;
@@ -1609,10 +1615,7 @@ static int _mlx4_ib_destroy_qp(struct ib_qp *qp)
if (qp->rwq_ind_tbl) {
destroy_qp_rss(dev, mqp);
} else {
- struct mlx4_ib_pd *pd;
-
- pd = get_pd(mqp);
- destroy_qp_common(dev, mqp, MLX4_IB_QP_SRC, !!pd->ibpd.uobject);
+ destroy_qp_common(dev, mqp, MLX4_IB_QP_SRC, qp->uobject);
}
if (is_sqp(dev, mqp))
@@ -1941,7 +1944,8 @@ static u8 gid_type_to_qpc(enum ib_gid_type gid_type)
* Go over all RSS QP's childes (WQs) and apply their HW state according to
* their logic state if the RSS QP is the first RSS QP associated for the WQ.
*/
-static int bringup_rss_rwqs(struct ib_rwq_ind_table *ind_tbl, u8 port_num)
+static int bringup_rss_rwqs(struct ib_rwq_ind_table *ind_tbl, u8 port_num,
+ struct ib_udata *udata)
{
int err = 0;
int i;
@@ -1965,7 +1969,7 @@ static int bringup_rss_rwqs(struct ib_rwq_ind_table *ind_tbl, u8 port_num)
}
wq->port = port_num;
if ((wq->rss_usecnt == 0) && (ibwq->state == IB_WQS_RDY)) {
- err = _mlx4_ib_modify_wq(ibwq, IB_WQS_RDY);
+ err = _mlx4_ib_modify_wq(ibwq, IB_WQS_RDY, udata);
if (err) {
mutex_unlock(&wq->mutex);
break;
@@ -1987,7 +1991,8 @@ static int bringup_rss_rwqs(struct ib_rwq_ind_table *ind_tbl, u8 port_num)
if ((wq->rss_usecnt == 1) &&
(ibwq->state == IB_WQS_RDY))
- if (_mlx4_ib_modify_wq(ibwq, IB_WQS_RESET))
+ if (_mlx4_ib_modify_wq(ibwq, IB_WQS_RESET,
+ udata))
pr_warn("failed to reverse WQN=0x%06x\n",
ibwq->wq_num);
wq->rss_usecnt--;
@@ -1999,7 +2004,8 @@ static int bringup_rss_rwqs(struct ib_rwq_ind_table *ind_tbl, u8 port_num)
return err;
}
-static void bring_down_rss_rwqs(struct ib_rwq_ind_table *ind_tbl)
+static void bring_down_rss_rwqs(struct ib_rwq_ind_table *ind_tbl,
+ struct ib_udata *udata)
{
int i;
@@ -2010,7 +2016,7 @@ static void bring_down_rss_rwqs(struct ib_rwq_ind_table *ind_tbl)
mutex_lock(&wq->mutex);
if ((wq->rss_usecnt == 1) && (ibwq->state == IB_WQS_RDY))
- if (_mlx4_ib_modify_wq(ibwq, IB_WQS_RESET))
+ if (_mlx4_ib_modify_wq(ibwq, IB_WQS_RESET, udata))
pr_warn("failed to reverse WQN=%x\n",
ibwq->wq_num);
wq->rss_usecnt--;
@@ -2042,9 +2048,10 @@ static void fill_qp_rss_context(struct mlx4_qp_context *context,
static int __mlx4_ib_modify_qp(void *src, enum mlx4_ib_source_type src_type,
const struct ib_qp_attr *attr, int attr_mask,
- enum ib_qp_state cur_state, enum ib_qp_state new_state)
+ enum ib_qp_state cur_state,
+ enum ib_qp_state new_state,
+ struct ib_udata *udata)
{
- struct ib_uobject *ibuobject;
struct ib_srq *ibsrq;
const struct ib_gid_attr *gid_attr = NULL;
struct ib_rwq_ind_table *rwq_ind_tbl;
@@ -2053,6 +2060,8 @@ static int __mlx4_ib_modify_qp(void *src, enum mlx4_ib_source_type src_type,
struct mlx4_ib_qp *qp;
struct mlx4_ib_pd *pd;
struct mlx4_ib_cq *send_cq, *recv_cq;
+ struct mlx4_ib_ucontext *ucontext = rdma_udata_to_drv_context(
+ udata, struct mlx4_ib_ucontext, ibucontext);
struct mlx4_qp_context *context;
enum mlx4_qp_optpar optpar = 0;
int sqd_event;
@@ -2064,7 +2073,6 @@ static int __mlx4_ib_modify_qp(void *src, enum mlx4_ib_source_type src_type,
struct ib_wq *ibwq;
ibwq = (struct ib_wq *)src;
- ibuobject = ibwq->uobject;
ibsrq = NULL;
rwq_ind_tbl = NULL;
qp_type = IB_QPT_RAW_PACKET;
@@ -2075,7 +2083,6 @@ static int __mlx4_ib_modify_qp(void *src, enum mlx4_ib_source_type src_type,
struct ib_qp *ibqp;
ibqp = (struct ib_qp *)src;
- ibuobject = ibqp->uobject;
ibsrq = ibqp->srq;
rwq_ind_tbl = ibqp->rwq_ind_tbl;
qp_type = ibqp->qp_type;
@@ -2160,11 +2167,9 @@ static int __mlx4_ib_modify_qp(void *src, enum mlx4_ib_source_type src_type,
context->param3 |= cpu_to_be32(1 << 30);
}
- if (ibuobject)
+ if (ucontext)
context->usr_page = cpu_to_be32(
- mlx4_to_hw_uar_index(dev->dev,
- to_mucontext(ibuobject->context)
- ->uar.index));
+ mlx4_to_hw_uar_index(dev->dev, ucontext->uar.index));
else
context->usr_page = cpu_to_be32(
mlx4_to_hw_uar_index(dev->dev, dev->priv_uar.index));
@@ -2296,7 +2301,7 @@ static int __mlx4_ib_modify_qp(void *src, enum mlx4_ib_source_type src_type,
context->cqn_recv = cpu_to_be32(recv_cq->mcq.cqn);
/* Set "fast registration enabled" for all kernel QPs */
- if (!ibuobject)
+ if (!ucontext)
context->params1 |= cpu_to_be32(1 << 11);
if (attr_mask & IB_QP_RNR_RETRY) {
@@ -2433,7 +2438,7 @@ static int __mlx4_ib_modify_qp(void *src, enum mlx4_ib_source_type src_type,
else
sqd_event = 0;
- if (!ibuobject &&
+ if (!ucontext &&
cur_state == IB_QPS_RESET &&
new_state == IB_QPS_INIT)
context->rlkey_roce_mode |= (1 << 4);
@@ -2444,7 +2449,7 @@ static int __mlx4_ib_modify_qp(void *src, enum mlx4_ib_source_type src_type,
* headroom is stamped so that the hardware doesn't start
* processing stale work requests.
*/
- if (!ibuobject &&
+ if (!ucontext &&
cur_state == IB_QPS_RESET &&
new_state == IB_QPS_INIT) {
struct mlx4_wqe_ctrl_seg *ctrl;
@@ -2508,7 +2513,7 @@ static int __mlx4_ib_modify_qp(void *src, enum mlx4_ib_source_type src_type,
* entries and reinitialize the QP.
*/
if (new_state == IB_QPS_RESET) {
- if (!ibuobject) {
+ if (!ucontext) {
mlx4_ib_cq_clean(recv_cq, qp->mqp.qpn,
ibsrq ? to_msrq(ibsrq) : NULL);
if (send_cq != recv_cq)
@@ -2734,16 +2739,17 @@ static int _mlx4_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
}
if (ibqp->rwq_ind_tbl && (new_state == IB_QPS_INIT)) {
- err = bringup_rss_rwqs(ibqp->rwq_ind_tbl, attr->port_num);
+ err = bringup_rss_rwqs(ibqp->rwq_ind_tbl, attr->port_num,
+ udata);
if (err)
goto out;
}
err = __mlx4_ib_modify_qp(ibqp, MLX4_IB_QP_SRC, attr, attr_mask,
- cur_state, new_state);
+ cur_state, new_state, udata);
if (ibqp->rwq_ind_tbl && err)
- bring_down_rss_rwqs(ibqp->rwq_ind_tbl);
+ bring_down_rss_rwqs(ibqp->rwq_ind_tbl, udata);
if (mlx4_is_bonded(dev->dev) && (attr_mask & IB_QP_PORT))
attr->port_num = 1;
@@ -4044,7 +4050,7 @@ struct ib_wq *mlx4_ib_create_wq(struct ib_pd *pd,
struct mlx4_ib_create_wq ucmd;
int err, required_cmd_sz;
- if (!(udata && pd->uobject))
+ if (!udata)
return ERR_PTR(-EINVAL);
required_cmd_sz = offsetof(typeof(ucmd), comp_mask) +
@@ -4121,7 +4127,8 @@ static int ib_wq2qp_state(enum ib_wq_state state)
}
}
-static int _mlx4_ib_modify_wq(struct ib_wq *ibwq, enum ib_wq_state new_state)
+static int _mlx4_ib_modify_wq(struct ib_wq *ibwq, enum ib_wq_state new_state,
+ struct ib_udata *udata)
{
struct mlx4_ib_qp *qp = to_mqp((struct ib_qp *)ibwq);
enum ib_qp_state qp_cur_state;
@@ -4145,7 +4152,8 @@ static int _mlx4_ib_modify_wq(struct ib_wq *ibwq, enum ib_wq_state new_state)
attr_mask = IB_QP_PORT;
err = __mlx4_ib_modify_qp(ibwq, MLX4_IB_RWQ_SRC, &attr,
- attr_mask, IB_QPS_RESET, IB_QPS_INIT);
+ attr_mask, IB_QPS_RESET, IB_QPS_INIT,
+ udata);
if (err) {
pr_debug("WQN=0x%06x failed to apply RST->INIT on the HW QP\n",
ibwq->wq_num);
@@ -4157,12 +4165,13 @@ static int _mlx4_ib_modify_wq(struct ib_wq *ibwq, enum ib_wq_state new_state)
attr_mask = 0;
err = __mlx4_ib_modify_qp(ibwq, MLX4_IB_RWQ_SRC, NULL, attr_mask,
- qp_cur_state, qp_new_state);
+ qp_cur_state, qp_new_state, udata);
if (err && (qp_cur_state == IB_QPS_INIT)) {
qp_new_state = IB_QPS_RESET;
if (__mlx4_ib_modify_qp(ibwq, MLX4_IB_RWQ_SRC, NULL,
- attr_mask, IB_QPS_INIT, IB_QPS_RESET)) {
+ attr_mask, IB_QPS_INIT, IB_QPS_RESET,
+ udata)) {
pr_warn("WQN=0x%06x failed with reverting HW's resources failure\n",
ibwq->wq_num);
qp_new_state = IB_QPS_INIT;
@@ -4225,7 +4234,7 @@ int mlx4_ib_modify_wq(struct ib_wq *ibwq, struct ib_wq_attr *wq_attr,
* WQ, so we can apply its port on the WQ.
*/
if (qp->rss_usecnt)
- err = _mlx4_ib_modify_wq(ibwq, new_state);
+ err = _mlx4_ib_modify_wq(ibwq, new_state, udata);
if (!err)
ibwq->state = new_state;
diff --git a/drivers/infiniband/hw/mlx4/srq.c b/drivers/infiniband/hw/mlx4/srq.c
index 3731b31c3653..381cf899bcef 100644
--- a/drivers/infiniband/hw/mlx4/srq.c
+++ b/drivers/infiniband/hw/mlx4/srq.c
@@ -37,6 +37,7 @@
#include "mlx4_ib.h"
#include <rdma/mlx4-abi.h>
+#include <rdma/uverbs_ioctl.h>
static void *get_wqe(struct mlx4_ib_srq *srq, int n)
{
@@ -73,6 +74,8 @@ struct ib_srq *mlx4_ib_create_srq(struct ib_pd *pd,
struct ib_udata *udata)
{
struct mlx4_ib_dev *dev = to_mdev(pd->device);
+ struct mlx4_ib_ucontext *ucontext = rdma_udata_to_drv_context(
+ udata, struct mlx4_ib_ucontext, ibucontext);
struct mlx4_ib_srq *srq;
struct mlx4_wqe_srq_next_seg *next;
struct mlx4_wqe_data_seg *scatter;
@@ -105,7 +108,7 @@ struct ib_srq *mlx4_ib_create_srq(struct ib_pd *pd,
buf_size = srq->msrq.max * desc_size;
- if (pd->uobject) {
+ if (udata) {
struct mlx4_ib_create_srq ucmd;
if (ib_copy_from_udata(&ucmd, udata, sizeof ucmd)) {
@@ -113,8 +116,7 @@ struct ib_srq *mlx4_ib_create_srq(struct ib_pd *pd,
goto err_srq;
}
- srq->umem = ib_umem_get(pd->uobject->context, ucmd.buf_addr,
- buf_size, 0, 0);
+ srq->umem = ib_umem_get(udata, ucmd.buf_addr, buf_size, 0, 0);
if (IS_ERR(srq->umem)) {
err = PTR_ERR(srq->umem);
goto err_srq;
@@ -129,8 +131,8 @@ struct ib_srq *mlx4_ib_create_srq(struct ib_pd *pd,
if (err)
goto err_mtt;
- err = mlx4_ib_db_map_user(to_mucontext(pd->uobject->context),
- ucmd.db_addr, &srq->db);
+ err = mlx4_ib_db_map_user(ucontext, udata, ucmd.db_addr,
+ &srq->db);
if (err)
goto err_mtt;
} else {
@@ -191,7 +193,7 @@ struct ib_srq *mlx4_ib_create_srq(struct ib_pd *pd,
srq->msrq.event = mlx4_ib_srq_event;
srq->ibsrq.ext.xrc.srq_num = srq->msrq.srqn;
- if (pd->uobject)
+ if (udata)
if (ib_copy_to_udata(udata, &srq->msrq.srqn, sizeof (__u32))) {
err = -EFAULT;
goto err_wrid;
@@ -202,8 +204,8 @@ struct ib_srq *mlx4_ib_create_srq(struct ib_pd *pd,
return &srq->ibsrq;
err_wrid:
- if (pd->uobject)
- mlx4_ib_db_unmap_user(to_mucontext(pd->uobject->context), &srq->db);
+ if (udata)
+ mlx4_ib_db_unmap_user(ucontext, &srq->db);
else
kvfree(srq->wrid);
@@ -211,13 +213,13 @@ err_mtt:
mlx4_mtt_cleanup(dev->dev, &srq->mtt);
err_buf:
- if (pd->uobject)
+ if (srq->umem)
ib_umem_release(srq->umem);
else
mlx4_buf_free(dev->dev, buf_size, &srq->buf);
err_db:
- if (!pd->uobject)
+ if (!udata)
mlx4_db_free(dev->dev, &srq->db);
err_srq:
diff --git a/drivers/infiniband/hw/mlx4/sysfs.c b/drivers/infiniband/hw/mlx4/sysfs.c
index 752bdd536130..ea1f3a081b05 100644
--- a/drivers/infiniband/hw/mlx4/sysfs.c
+++ b/drivers/infiniband/hw/mlx4/sysfs.c
@@ -353,16 +353,12 @@ err:
static void get_name(struct mlx4_ib_dev *dev, char *name, int i, int max)
{
- char base_name[9];
-
- /* pci_name format is: bus:dev:func -> xxxx:yy:zz.n */
- strlcpy(name, pci_name(dev->dev->persist->pdev), max);
- strncpy(base_name, name, 8); /*till xxxx:yy:*/
- base_name[8] = '\0';
- /* with no ARI only 3 last bits are used so when the fn is higher than 8
+ /* pci_name format is: bus:dev:func -> xxxx:yy:zz.n
+ * with no ARI only 3 last bits are used so when the fn is higher than 8
* need to add it to the dev num, so count in the last number will be
* modulo 8 */
- sprintf(name, "%s%.2d.%d", base_name, (i/8), (i%8));
+ snprintf(name, max, "%.8s%.2d.%d", pci_name(dev->dev->persist->pdev),
+ i / 8, i % 8);
}
struct mlx4_port {
diff --git a/drivers/infiniband/hw/mlx5/Kconfig b/drivers/infiniband/hw/mlx5/Kconfig
index 0440966bc6ec..8d651c05de62 100644
--- a/drivers/infiniband/hw/mlx5/Kconfig
+++ b/drivers/infiniband/hw/mlx5/Kconfig
@@ -1,7 +1,6 @@
config MLX5_INFINIBAND
tristate "Mellanox 5th generation network adapters (ConnectX series) support"
depends on NETDEVICES && ETHERNET && PCI && MLX5_CORE
- depends on INFINIBAND_USER_ACCESS || INFINIBAND_USER_ACCESS=n
---help---
This driver provides low-level InfiniBand support for
Mellanox Connect-IB PCI Express host channel adapters (HCAs).
diff --git a/drivers/infiniband/hw/mlx5/Makefile b/drivers/infiniband/hw/mlx5/Makefile
index b8e4b15e2674..33f5adb14e4e 100644
--- a/drivers/infiniband/hw/mlx5/Makefile
+++ b/drivers/infiniband/hw/mlx5/Makefile
@@ -1,6 +1,8 @@
obj-$(CONFIG_MLX5_INFINIBAND) += mlx5_ib.o
-mlx5_ib-y := main.o cq.o doorbell.o qp.o mem.o srq.o mr.o ah.o mad.o gsi.o ib_virt.o cmd.o cong.o
+mlx5_ib-y := main.o cq.o doorbell.o qp.o mem.o srq_cmd.o \
+ srq.o mr.o ah.o mad.o gsi.o ib_virt.o cmd.o \
+ cong.o
mlx5_ib-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += odp.o
mlx5_ib-$(CONFIG_MLX5_ESWITCH) += ib_rep.o
mlx5_ib-$(CONFIG_INFINIBAND_USER_ACCESS) += devx.o
diff --git a/drivers/infiniband/hw/mlx5/ah.c b/drivers/infiniband/hw/mlx5/ah.c
index ffd03bf1a71e..420ae0897333 100644
--- a/drivers/infiniband/hw/mlx5/ah.c
+++ b/drivers/infiniband/hw/mlx5/ah.c
@@ -72,7 +72,7 @@ static struct ib_ah *create_ib_ah(struct mlx5_ib_dev *dev,
}
struct ib_ah *mlx5_ib_create_ah(struct ib_pd *pd, struct rdma_ah_attr *ah_attr,
- struct ib_udata *udata)
+ u32 flags, struct ib_udata *udata)
{
struct mlx5_ib_ah *ah;
@@ -131,7 +131,7 @@ int mlx5_ib_query_ah(struct ib_ah *ibah, struct rdma_ah_attr *ah_attr)
return 0;
}
-int mlx5_ib_destroy_ah(struct ib_ah *ah)
+int mlx5_ib_destroy_ah(struct ib_ah *ah, u32 flags)
{
kfree(to_mah(ah));
return 0;
diff --git a/drivers/infiniband/hw/mlx5/cmd.c b/drivers/infiniband/hw/mlx5/cmd.c
index ca060a2e2b36..6bcc63aaa50b 100644
--- a/drivers/infiniband/hw/mlx5/cmd.c
+++ b/drivers/infiniband/hw/mlx5/cmd.c
@@ -240,6 +240,7 @@ int mlx5_cmd_alloc_transport_domain(struct mlx5_core_dev *dev, u32 *tdn,
MLX5_SET(alloc_transport_domain_in, in, opcode,
MLX5_CMD_OP_ALLOC_TRANSPORT_DOMAIN);
+ MLX5_SET(alloc_transport_domain_in, in, uid, uid);
err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
if (!err)
@@ -257,6 +258,7 @@ void mlx5_cmd_dealloc_transport_domain(struct mlx5_core_dev *dev, u32 tdn,
MLX5_SET(dealloc_transport_domain_in, in, opcode,
MLX5_CMD_OP_DEALLOC_TRANSPORT_DOMAIN);
+ MLX5_SET(dealloc_transport_domain_in, in, uid, uid);
MLX5_SET(dealloc_transport_domain_in, in, transport_domain, tdn);
mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
}
@@ -326,3 +328,57 @@ int mlx5_cmd_xrcd_dealloc(struct mlx5_core_dev *dev, u32 xrcdn, u16 uid)
MLX5_SET(dealloc_xrcd_in, in, uid, uid);
return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
}
+
+int mlx5_cmd_alloc_q_counter(struct mlx5_core_dev *dev, u16 *counter_id,
+ u16 uid)
+{
+ u32 in[MLX5_ST_SZ_DW(alloc_q_counter_in)] = {0};
+ u32 out[MLX5_ST_SZ_DW(alloc_q_counter_out)] = {0};
+ int err;
+
+ MLX5_SET(alloc_q_counter_in, in, opcode, MLX5_CMD_OP_ALLOC_Q_COUNTER);
+ MLX5_SET(alloc_q_counter_in, in, uid, uid);
+
+ err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
+ if (!err)
+ *counter_id = MLX5_GET(alloc_q_counter_out, out,
+ counter_set_id);
+ return err;
+}
+
+int mlx5_cmd_mad_ifc(struct mlx5_core_dev *dev, const void *inb, void *outb,
+ u16 opmod, u8 port)
+{
+ int outlen = MLX5_ST_SZ_BYTES(mad_ifc_out);
+ int inlen = MLX5_ST_SZ_BYTES(mad_ifc_in);
+ int err = -ENOMEM;
+ void *data;
+ void *resp;
+ u32 *out;
+ u32 *in;
+
+ in = kzalloc(inlen, GFP_KERNEL);
+ out = kzalloc(outlen, GFP_KERNEL);
+ if (!in || !out)
+ goto out;
+
+ MLX5_SET(mad_ifc_in, in, opcode, MLX5_CMD_OP_MAD_IFC);
+ MLX5_SET(mad_ifc_in, in, op_mod, opmod);
+ MLX5_SET(mad_ifc_in, in, port, port);
+
+ data = MLX5_ADDR_OF(mad_ifc_in, in, mad);
+ memcpy(data, inb, MLX5_FLD_SZ_BYTES(mad_ifc_in, mad));
+
+ err = mlx5_cmd_exec(dev, in, inlen, out, outlen);
+ if (err)
+ goto out;
+
+ resp = MLX5_ADDR_OF(mad_ifc_out, out, response_mad_packet);
+ memcpy(outb, resp,
+ MLX5_FLD_SZ_BYTES(mad_ifc_out, response_mad_packet));
+
+out:
+ kfree(out);
+ kfree(in);
+ return err;
+}
diff --git a/drivers/infiniband/hw/mlx5/cmd.h b/drivers/infiniband/hw/mlx5/cmd.h
index c03c56455534..923a7b93f507 100644
--- a/drivers/infiniband/hw/mlx5/cmd.h
+++ b/drivers/infiniband/hw/mlx5/cmd.h
@@ -61,4 +61,8 @@ int mlx5_cmd_detach_mcg(struct mlx5_core_dev *dev, union ib_gid *mgid,
u32 qpn, u16 uid);
int mlx5_cmd_xrcd_alloc(struct mlx5_core_dev *dev, u32 *xrcdn, u16 uid);
int mlx5_cmd_xrcd_dealloc(struct mlx5_core_dev *dev, u32 xrcdn, u16 uid);
+int mlx5_cmd_alloc_q_counter(struct mlx5_core_dev *dev, u16 *counter_id,
+ u16 uid);
+int mlx5_cmd_mad_ifc(struct mlx5_core_dev *dev, const void *inb, void *outb,
+ u16 opmod, u8 port);
#endif /* MLX5_IB_CMD_H */
diff --git a/drivers/infiniband/hw/mlx5/cong.c b/drivers/infiniband/hw/mlx5/cong.c
index 7e4e358a4fd8..8ba439fabf7f 100644
--- a/drivers/infiniband/hw/mlx5/cong.c
+++ b/drivers/infiniband/hw/mlx5/cong.c
@@ -389,19 +389,19 @@ void mlx5_ib_cleanup_cong_debugfs(struct mlx5_ib_dev *dev, u8 port_num)
dev->port[port_num].dbg_cc_params = NULL;
}
-int mlx5_ib_init_cong_debugfs(struct mlx5_ib_dev *dev, u8 port_num)
+void mlx5_ib_init_cong_debugfs(struct mlx5_ib_dev *dev, u8 port_num)
{
struct mlx5_ib_dbg_cc_params *dbg_cc_params;
struct mlx5_core_dev *mdev;
int i;
if (!mlx5_debugfs_root)
- goto out;
+ return;
/* Takes a 1-based port number */
mdev = mlx5_ib_get_native_port_mdev(dev, port_num + 1, NULL);
if (!mdev)
- goto out;
+ return;
if (!MLX5_CAP_GEN(mdev, cc_query_allowed) ||
!MLX5_CAP_GEN(mdev, cc_modify_allowed))
@@ -415,8 +415,6 @@ int mlx5_ib_init_cong_debugfs(struct mlx5_ib_dev *dev, u8 port_num)
dbg_cc_params->root = debugfs_create_dir("cc_params",
mdev->priv.dbg_root);
- if (!dbg_cc_params->root)
- goto err;
for (i = 0; i < MLX5_IB_DBG_CC_MAX; i++) {
dbg_cc_params->params[i].offset = i;
@@ -427,14 +425,11 @@ int mlx5_ib_init_cong_debugfs(struct mlx5_ib_dev *dev, u8 port_num)
0600, dbg_cc_params->root,
&dbg_cc_params->params[i],
&dbg_cc_fops);
- if (!dbg_cc_params->params[i].dentry)
- goto err;
}
put_mdev:
mlx5_ib_put_native_port_mdev(dev, port_num + 1);
-out:
- return 0;
+ return;
err:
mlx5_ib_warn(dev, "cong debugfs failure\n");
@@ -445,5 +440,5 @@ err:
* We don't want to fail driver if debugfs failed to initialize,
* so we are not forwarding error to the user.
*/
- return 0;
+ return;
}
diff --git a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c
index 7d769b5538b4..18704e503508 100644
--- a/drivers/infiniband/hw/mlx5/cq.c
+++ b/drivers/infiniband/hw/mlx5/cq.c
@@ -35,6 +35,7 @@
#include <rdma/ib_user_verbs.h>
#include <rdma/ib_cache.h>
#include "mlx5_ib.h"
+#include "srq.h"
static void mlx5_ib_cq_comp(struct mlx5_core_cq *cq)
{
@@ -81,7 +82,7 @@ static void *get_sw_cqe(struct mlx5_ib_cq *cq, int n)
cqe64 = (cq->mcq.cqe_sz == 64) ? cqe : cqe + 64;
- if (likely((cqe64->op_own) >> 4 != MLX5_CQE_INVALID) &&
+ if (likely(get_cqe_opcode(cqe64) != MLX5_CQE_INVALID) &&
!((cqe64->op_own & MLX5_CQE_OWNER_MASK) ^ !!(n & (cq->ibcq.cqe + 1)))) {
return cqe;
} else {
@@ -177,8 +178,7 @@ static void handle_responder(struct ib_wc *wc, struct mlx5_cqe64 *cqe,
struct mlx5_core_srq *msrq = NULL;
if (qp->ibqp.xrcd) {
- msrq = mlx5_core_get_srq(dev->mdev,
- be32_to_cpu(cqe->srqn));
+ msrq = mlx5_cmd_get_srq(dev, be32_to_cpu(cqe->srqn));
srq = to_mibsrq(msrq);
} else {
srq = to_msrq(qp->ibqp.srq);
@@ -187,8 +187,8 @@ static void handle_responder(struct ib_wc *wc, struct mlx5_cqe64 *cqe,
wqe_ctr = be16_to_cpu(cqe->wqe_counter);
wc->wr_id = srq->wrid[wqe_ctr];
mlx5_ib_free_srq_wqe(srq, wqe_ctr);
- if (msrq && atomic_dec_and_test(&msrq->refcount))
- complete(&msrq->free);
+ if (msrq)
+ mlx5_core_res_put(&msrq->common);
}
} else {
wq = &qp->rq;
@@ -197,7 +197,7 @@ static void handle_responder(struct ib_wc *wc, struct mlx5_cqe64 *cqe,
}
wc->byte_len = be32_to_cpu(cqe->byte_cnt);
- switch (cqe->op_own >> 4) {
+ switch (get_cqe_opcode(cqe)) {
case MLX5_CQE_RESP_WR_IMM:
wc->opcode = IB_WC_RECV_RDMA_WITH_IMM;
wc->wc_flags = IB_WC_WITH_IMM;
@@ -330,67 +330,6 @@ static void mlx5_handle_error_cqe(struct mlx5_ib_dev *dev,
dump_cqe(dev, cqe);
}
-static int is_atomic_response(struct mlx5_ib_qp *qp, uint16_t idx)
-{
- /* TBD: waiting decision
- */
- return 0;
-}
-
-static void *mlx5_get_atomic_laddr(struct mlx5_ib_qp *qp, uint16_t idx)
-{
- struct mlx5_wqe_data_seg *dpseg;
- void *addr;
-
- dpseg = mlx5_get_send_wqe(qp, idx) + sizeof(struct mlx5_wqe_ctrl_seg) +
- sizeof(struct mlx5_wqe_raddr_seg) +
- sizeof(struct mlx5_wqe_atomic_seg);
- addr = (void *)(unsigned long)be64_to_cpu(dpseg->addr);
- return addr;
-}
-
-static void handle_atomic(struct mlx5_ib_qp *qp, struct mlx5_cqe64 *cqe64,
- uint16_t idx)
-{
- void *addr;
- int byte_count;
- int i;
-
- if (!is_atomic_response(qp, idx))
- return;
-
- byte_count = be32_to_cpu(cqe64->byte_cnt);
- addr = mlx5_get_atomic_laddr(qp, idx);
-
- if (byte_count == 4) {
- *(uint32_t *)addr = be32_to_cpu(*((__be32 *)addr));
- } else {
- for (i = 0; i < byte_count; i += 8) {
- *(uint64_t *)addr = be64_to_cpu(*((__be64 *)addr));
- addr += 8;
- }
- }
-
- return;
-}
-
-static void handle_atomics(struct mlx5_ib_qp *qp, struct mlx5_cqe64 *cqe64,
- u16 tail, u16 head)
-{
- u16 idx;
-
- do {
- idx = tail & (qp->sq.wqe_cnt - 1);
- handle_atomic(qp, cqe64, idx);
- if (idx == head)
- break;
-
- tail = qp->sq.w_list[idx].next;
- } while (1);
- tail = qp->sq.w_list[idx].next;
- qp->sq.last_poll = tail;
-}
-
static void free_cq_buf(struct mlx5_ib_dev *dev, struct mlx5_ib_cq_buf *buf)
{
mlx5_frag_buf_free(dev->mdev, &buf->frag_buf);
@@ -428,45 +367,15 @@ static void get_sig_err_item(struct mlx5_sig_err_cqe *cqe,
item->key = be32_to_cpu(cqe->mkey);
}
-static void sw_send_comp(struct mlx5_ib_qp *qp, int num_entries,
- struct ib_wc *wc, int *npolled)
-{
- struct mlx5_ib_wq *wq;
- unsigned int cur;
- unsigned int idx;
- int np;
- int i;
-
- wq = &qp->sq;
- cur = wq->head - wq->tail;
- np = *npolled;
-
- if (cur == 0)
- return;
-
- for (i = 0; i < cur && np < num_entries; i++) {
- idx = wq->last_poll & (wq->wqe_cnt - 1);
- wc->wr_id = wq->wrid[idx];
- wc->status = IB_WC_WR_FLUSH_ERR;
- wc->vendor_err = MLX5_CQE_SYNDROME_WR_FLUSH_ERR;
- wq->tail++;
- np++;
- wc->qp = &qp->ibqp;
- wc++;
- wq->last_poll = wq->w_list[idx].next;
- }
- *npolled = np;
-}
-
-static void sw_recv_comp(struct mlx5_ib_qp *qp, int num_entries,
- struct ib_wc *wc, int *npolled)
+static void sw_comp(struct mlx5_ib_qp *qp, int num_entries, struct ib_wc *wc,
+ int *npolled, int is_send)
{
struct mlx5_ib_wq *wq;
unsigned int cur;
int np;
int i;
- wq = &qp->rq;
+ wq = (is_send) ? &qp->sq : &qp->rq;
cur = wq->head - wq->tail;
np = *npolled;
@@ -493,13 +402,13 @@ static void mlx5_ib_poll_sw_comp(struct mlx5_ib_cq *cq, int num_entries,
*npolled = 0;
/* Find uncompleted WQEs belonging to that cq and return mmics ones */
list_for_each_entry(qp, &cq->list_send_qp, cq_send_list) {
- sw_send_comp(qp, num_entries, wc + *npolled, npolled);
+ sw_comp(qp, num_entries, wc + *npolled, npolled, true);
if (*npolled >= num_entries)
return;
}
list_for_each_entry(qp, &cq->list_recv_qp, cq_recv_list) {
- sw_recv_comp(qp, num_entries, wc + *npolled, npolled);
+ sw_comp(qp, num_entries, wc + *npolled, npolled, false);
if (*npolled >= num_entries)
return;
}
@@ -537,7 +446,7 @@ repoll:
*/
rmb();
- opcode = cqe64->op_own >> 4;
+ opcode = get_cqe_opcode(cqe64);
if (unlikely(opcode == MLX5_CQE_RESIZE_CQ)) {
if (likely(cq->resize_buf)) {
free_cq_buf(dev, &cq->buf);
@@ -567,7 +476,6 @@ repoll:
wqe_ctr = be16_to_cpu(cqe64->wqe_counter);
idx = wqe_ctr & (wq->wqe_cnt - 1);
handle_good_req(wc, cqe64, wq, idx);
- handle_atomics(*cur_qp, cqe64, wq->last_poll, idx);
wc->wr_id = wq->wrid[idx];
wq->tail = wq->wqe_head[idx] + 1;
wc->status = IB_WC_SUCCESS;
@@ -799,15 +707,15 @@ static int create_cq_user(struct mlx5_ib_dev *dev, struct ib_udata *udata,
*cqe_size = ucmd.cqe_size;
- cq->buf.umem = ib_umem_get(context, ucmd.buf_addr,
- entries * ucmd.cqe_size,
- IB_ACCESS_LOCAL_WRITE, 1);
+ cq->buf.umem =
+ ib_umem_get(udata, ucmd.buf_addr, entries * ucmd.cqe_size,
+ IB_ACCESS_LOCAL_WRITE, 1);
if (IS_ERR(cq->buf.umem)) {
err = PTR_ERR(cq->buf.umem);
return err;
}
- err = mlx5_ib_db_map_user(to_mucontext(context), ucmd.db_addr,
+ err = mlx5_ib_db_map_user(to_mucontext(context), udata, ucmd.db_addr,
&cq->db);
if (err)
goto err_umem;
@@ -1203,7 +1111,6 @@ static int resize_user(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *cq,
struct ib_umem *umem;
int err;
int npages;
- struct ib_ucontext *context = cq->buf.umem->context;
err = ib_copy_from_udata(&ucmd, udata, sizeof(ucmd));
if (err)
@@ -1216,7 +1123,7 @@ static int resize_user(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *cq,
if (ucmd.cqe_size && SIZE_MAX / ucmd.cqe_size <= entries - 1)
return -EINVAL;
- umem = ib_umem_get(context, ucmd.buf_addr,
+ umem = ib_umem_get(udata, ucmd.buf_addr,
(size_t)ucmd.cqe_size * entries,
IB_ACCESS_LOCAL_WRITE, 1);
if (IS_ERR(umem)) {
@@ -1295,7 +1202,7 @@ static int copy_resize_cqes(struct mlx5_ib_cq *cq)
return -EINVAL;
}
- while ((scqe64->op_own >> 4) != MLX5_CQE_RESIZE_CQ) {
+ while (get_cqe_opcode(scqe64) != MLX5_CQE_RESIZE_CQ) {
dcqe = mlx5_frag_buf_get_wqe(&cq->resize_buf->fbc,
(i + 1) & cq->resize_buf->nent);
dcqe64 = dsize == 64 ? dcqe : dcqe + 64;
diff --git a/drivers/infiniband/hw/mlx5/devx.c b/drivers/infiniband/hw/mlx5/devx.c
index 45c421c87100..9e08df7914aa 100644
--- a/drivers/infiniband/hw/mlx5/devx.c
+++ b/drivers/infiniband/hw/mlx5/devx.c
@@ -8,7 +8,9 @@
#include <rdma/uverbs_types.h>
#include <rdma/uverbs_ioctl.h>
#include <rdma/mlx5_user_ioctl_cmds.h>
+#include <rdma/mlx5_user_ioctl_verbs.h>
#include <rdma/ib_umem.h>
+#include <rdma/uverbs_std_types.h>
#include <linux/mlx5/driver.h>
#include <linux/mlx5/fs.h>
#include "mlx5_ib.h"
@@ -16,12 +18,32 @@
#define UVERBS_MODULE_NAME mlx5_ib
#include <rdma/uverbs_named_ioctl.h>
+enum devx_obj_flags {
+ DEVX_OBJ_FLAGS_INDIRECT_MKEY = 1 << 0,
+ DEVX_OBJ_FLAGS_DCT = 1 << 1,
+};
+
+struct devx_async_data {
+ struct mlx5_ib_dev *mdev;
+ struct list_head list;
+ struct ib_uobject *fd_uobj;
+ struct mlx5_async_work cb_work;
+ u16 cmd_out_len;
+ /* must be last field in this structure */
+ struct mlx5_ib_uapi_devx_async_cmd_hdr hdr;
+};
+
#define MLX5_MAX_DESTROY_INBOX_SIZE_DW MLX5_ST_SZ_DW(delete_fte_in)
struct devx_obj {
struct mlx5_core_dev *mdev;
u64 obj_id;
u32 dinlen; /* destroy inbox length */
u32 dinbox[MLX5_MAX_DESTROY_INBOX_SIZE_DW];
+ u32 flags;
+ union {
+ struct mlx5_ib_devx_mr devx_mr;
+ struct mlx5_core_dct core_dct;
+ };
};
struct devx_umem {
@@ -40,29 +62,32 @@ struct devx_umem_reg_cmd {
u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)];
};
-static struct mlx5_ib_ucontext *devx_ufile2uctx(struct ib_uverbs_file *file)
+static struct mlx5_ib_ucontext *
+devx_ufile2uctx(const struct uverbs_attr_bundle *attrs)
{
- return to_mucontext(ib_uverbs_get_ucontext(file));
+ return to_mucontext(ib_uverbs_get_ucontext(attrs));
}
-int mlx5_ib_devx_create(struct mlx5_ib_dev *dev)
+int mlx5_ib_devx_create(struct mlx5_ib_dev *dev, bool is_user)
{
u32 in[MLX5_ST_SZ_DW(create_uctx_in)] = {0};
u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)] = {0};
- u64 general_obj_types;
- void *hdr;
+ void *uctx;
int err;
u16 uid;
+ u32 cap = 0;
- hdr = MLX5_ADDR_OF(create_uctx_in, in, hdr);
-
- general_obj_types = MLX5_CAP_GEN_64(dev->mdev, general_obj_types);
- if (!(general_obj_types & MLX5_GENERAL_OBJ_TYPES_CAP_UCTX) ||
- !(general_obj_types & MLX5_GENERAL_OBJ_TYPES_CAP_UMEM))
+ /* 0 means not supported */
+ if (!MLX5_CAP_GEN(dev->mdev, log_max_uctx))
return -EINVAL;
- MLX5_SET(general_obj_in_cmd_hdr, hdr, opcode, MLX5_CMD_OP_CREATE_GENERAL_OBJECT);
- MLX5_SET(general_obj_in_cmd_hdr, hdr, obj_type, MLX5_OBJ_TYPE_UCTX);
+ uctx = MLX5_ADDR_OF(create_uctx_in, in, uctx);
+ if (is_user && capable(CAP_NET_RAW) &&
+ (MLX5_CAP_GEN(dev->mdev, uctx_cap) & MLX5_UCTX_CAP_RAW_TX))
+ cap |= MLX5_UCTX_CAP_RAW_TX;
+
+ MLX5_SET(create_uctx_in, in, opcode, MLX5_CMD_OP_CREATE_UCTX);
+ MLX5_SET(uctx, uctx, cap, cap);
err = mlx5_cmd_exec(dev->mdev, in, sizeof(in), out, sizeof(out));
if (err)
@@ -74,12 +99,11 @@ int mlx5_ib_devx_create(struct mlx5_ib_dev *dev)
void mlx5_ib_devx_destroy(struct mlx5_ib_dev *dev, u16 uid)
{
- u32 in[MLX5_ST_SZ_DW(general_obj_in_cmd_hdr)] = {0};
+ u32 in[MLX5_ST_SZ_DW(destroy_uctx_in)] = {0};
u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)] = {0};
- MLX5_SET(general_obj_in_cmd_hdr, in, opcode, MLX5_CMD_OP_DESTROY_GENERAL_OBJECT);
- MLX5_SET(general_obj_in_cmd_hdr, in, obj_type, MLX5_OBJ_TYPE_UCTX);
- MLX5_SET(general_obj_in_cmd_hdr, in, obj_id, uid);
+ MLX5_SET(destroy_uctx_in, in, opcode, MLX5_CMD_OP_DESTROY_UCTX);
+ MLX5_SET(destroy_uctx_in, in, uid, uid);
mlx5_cmd_exec(dev->mdev, in, sizeof(in), out, sizeof(out));
}
@@ -106,6 +130,21 @@ bool mlx5_ib_devx_is_flow_dest(void *obj, int *dest_id, int *dest_type)
}
}
+bool mlx5_ib_devx_is_flow_counter(void *obj, u32 *counter_id)
+{
+ struct devx_obj *devx_obj = obj;
+ u16 opcode = MLX5_GET(general_obj_in_cmd_hdr, devx_obj->dinbox, opcode);
+
+ if (opcode == MLX5_CMD_OP_DEALLOC_FLOW_COUNTER) {
+ *counter_id = MLX5_GET(dealloc_flow_counter_in,
+ devx_obj->dinbox,
+ flow_counter_id);
+ return true;
+ }
+
+ return false;
+}
+
/*
* As the obj_id in the firmware is not globally unique the object type
* must be considered upon checking for a valid object id.
@@ -116,7 +155,7 @@ static u64 get_enc_obj_id(u16 opcode, u32 obj_id)
return ((u64)opcode << 32) | obj_id;
}
-static int devx_is_valid_obj_id(struct devx_obj *obj, const void *in)
+static u64 devx_get_obj_id(const void *in)
{
u16 opcode = MLX5_GET(general_obj_in_cmd_hdr, in, opcode);
u64 obj_id;
@@ -290,6 +329,8 @@ static int devx_is_valid_obj_id(struct devx_obj *obj, const void *in)
MLX5_GET(query_dct_in, in, dctn));
break;
case MLX5_CMD_OP_QUERY_XRQ:
+ case MLX5_CMD_OP_QUERY_XRQ_DC_PARAMS_ENTRY:
+ case MLX5_CMD_OP_QUERY_XRQ_ERROR_PARAMS:
obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_XRQ,
MLX5_GET(query_xrq_in, in, xrqn));
break;
@@ -310,23 +351,112 @@ static int devx_is_valid_obj_id(struct devx_obj *obj, const void *in)
obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_RQ,
MLX5_GET(arm_rq_in, in, srq_number));
break;
- case MLX5_CMD_OP_DRAIN_DCT:
case MLX5_CMD_OP_ARM_DCT_FOR_KEY_VIOLATION:
obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_DCT,
MLX5_GET(drain_dct_in, in, dctn));
break;
case MLX5_CMD_OP_ARM_XRQ:
+ case MLX5_CMD_OP_SET_XRQ_DC_PARAMS_ENTRY:
obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_XRQ,
MLX5_GET(arm_xrq_in, in, xrqn));
break;
+ case MLX5_CMD_OP_QUERY_PACKET_REFORMAT_CONTEXT:
+ obj_id = get_enc_obj_id
+ (MLX5_CMD_OP_ALLOC_PACKET_REFORMAT_CONTEXT,
+ MLX5_GET(query_packet_reformat_context_in,
+ in, packet_reformat_id));
+ break;
default:
+ obj_id = 0;
+ }
+
+ return obj_id;
+}
+
+static bool devx_is_valid_obj_id(struct ib_uobject *uobj, const void *in)
+{
+ u64 obj_id = devx_get_obj_id(in);
+
+ if (!obj_id)
return false;
+
+ switch (uobj_get_object_id(uobj)) {
+ case UVERBS_OBJECT_CQ:
+ return get_enc_obj_id(MLX5_CMD_OP_CREATE_CQ,
+ to_mcq(uobj->object)->mcq.cqn) ==
+ obj_id;
+
+ case UVERBS_OBJECT_SRQ:
+ {
+ struct mlx5_core_srq *srq = &(to_msrq(uobj->object)->msrq);
+ struct mlx5_ib_dev *dev = to_mdev(uobj->context->device);
+ u16 opcode;
+
+ switch (srq->common.res) {
+ case MLX5_RES_XSRQ:
+ opcode = MLX5_CMD_OP_CREATE_XRC_SRQ;
+ break;
+ case MLX5_RES_XRQ:
+ opcode = MLX5_CMD_OP_CREATE_XRQ;
+ break;
+ default:
+ if (!dev->mdev->issi)
+ opcode = MLX5_CMD_OP_CREATE_SRQ;
+ else
+ opcode = MLX5_CMD_OP_CREATE_RMP;
+ }
+
+ return get_enc_obj_id(opcode,
+ to_msrq(uobj->object)->msrq.srqn) ==
+ obj_id;
}
- if (obj_id == obj->obj_id)
- return true;
+ case UVERBS_OBJECT_QP:
+ {
+ struct mlx5_ib_qp *qp = to_mqp(uobj->object);
+ enum ib_qp_type qp_type = qp->ibqp.qp_type;
+
+ if (qp_type == IB_QPT_RAW_PACKET ||
+ (qp->flags & MLX5_IB_QP_UNDERLAY)) {
+ struct mlx5_ib_raw_packet_qp *raw_packet_qp =
+ &qp->raw_packet_qp;
+ struct mlx5_ib_rq *rq = &raw_packet_qp->rq;
+ struct mlx5_ib_sq *sq = &raw_packet_qp->sq;
+
+ return (get_enc_obj_id(MLX5_CMD_OP_CREATE_RQ,
+ rq->base.mqp.qpn) == obj_id ||
+ get_enc_obj_id(MLX5_CMD_OP_CREATE_SQ,
+ sq->base.mqp.qpn) == obj_id ||
+ get_enc_obj_id(MLX5_CMD_OP_CREATE_TIR,
+ rq->tirn) == obj_id ||
+ get_enc_obj_id(MLX5_CMD_OP_CREATE_TIS,
+ sq->tisn) == obj_id);
+ }
+
+ if (qp_type == MLX5_IB_QPT_DCT)
+ return get_enc_obj_id(MLX5_CMD_OP_CREATE_DCT,
+ qp->dct.mdct.mqp.qpn) == obj_id;
+
+ return get_enc_obj_id(MLX5_CMD_OP_CREATE_QP,
+ qp->ibqp.qp_num) == obj_id;
+ }
- return false;
+ case UVERBS_OBJECT_WQ:
+ return get_enc_obj_id(MLX5_CMD_OP_CREATE_RQ,
+ to_mrwq(uobj->object)->core_qp.qpn) ==
+ obj_id;
+
+ case UVERBS_OBJECT_RWQ_IND_TBL:
+ return get_enc_obj_id(MLX5_CMD_OP_CREATE_RQT,
+ to_mrwq_ind_table(uobj->object)->rqtn) ==
+ obj_id;
+
+ case MLX5_IB_OBJECT_DEVX_OBJ:
+ return ((struct devx_obj *)uobj->object)->obj_id == obj_id;
+
+ default:
+ return false;
+ }
}
static void devx_set_umem_valid(const void *in)
@@ -491,9 +621,9 @@ static bool devx_is_obj_modify_cmd(const void *in)
case MLX5_CMD_OP_2RST_QP:
case MLX5_CMD_OP_ARM_XRC_SRQ:
case MLX5_CMD_OP_ARM_RQ:
- case MLX5_CMD_OP_DRAIN_DCT:
case MLX5_CMD_OP_ARM_DCT_FOR_KEY_VIOLATION:
case MLX5_CMD_OP_ARM_XRQ:
+ case MLX5_CMD_OP_SET_XRQ_DC_PARAMS_ENTRY:
return true;
case MLX5_CMD_OP_SET_FLOW_TABLE_ENTRY:
{
@@ -535,6 +665,9 @@ static bool devx_is_obj_query_cmd(const void *in)
case MLX5_CMD_OP_QUERY_XRC_SRQ:
case MLX5_CMD_OP_QUERY_DCT:
case MLX5_CMD_OP_QUERY_XRQ:
+ case MLX5_CMD_OP_QUERY_XRQ_DC_PARAMS_ENTRY:
+ case MLX5_CMD_OP_QUERY_XRQ_ERROR_PARAMS:
+ case MLX5_CMD_OP_QUERY_PACKET_REFORMAT_CONTEXT:
return true;
default:
return false;
@@ -572,15 +705,16 @@ static int devx_get_uid(struct mlx5_ib_ucontext *c, void *cmd_in)
if (!c->devx_uid)
return -EINVAL;
- if (!capable(CAP_NET_RAW))
- return -EPERM;
-
return c->devx_uid;
}
static bool devx_is_general_cmd(void *in)
{
u16 opcode = MLX5_GET(general_obj_in_cmd_hdr, in, opcode);
+ if (opcode >= MLX5_CMD_OP_GENERAL_START &&
+ opcode < MLX5_CMD_OP_GENERAL_END)
+ return true;
+
switch (opcode) {
case MLX5_CMD_OP_QUERY_HCA_CAP:
case MLX5_CMD_OP_QUERY_HCA_VPORT_CONTEXT:
@@ -603,7 +737,7 @@ static bool devx_is_general_cmd(void *in)
}
static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_QUERY_EQN)(
- struct ib_uverbs_file *file, struct uverbs_attr_bundle *attrs)
+ struct uverbs_attr_bundle *attrs)
{
struct mlx5_ib_ucontext *c;
struct mlx5_ib_dev *dev;
@@ -616,7 +750,7 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_QUERY_EQN)(
MLX5_IB_ATTR_DEVX_QUERY_EQN_USER_VEC))
return -EFAULT;
- c = devx_ufile2uctx(file);
+ c = devx_ufile2uctx(attrs);
if (IS_ERR(c))
return PTR_ERR(c);
dev = to_mdev(c->ibucontext.device);
@@ -653,14 +787,14 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_QUERY_EQN)(
* queue or arm its CQ for event generation), no further harm is expected.
*/
static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_QUERY_UAR)(
- struct ib_uverbs_file *file, struct uverbs_attr_bundle *attrs)
+ struct uverbs_attr_bundle *attrs)
{
struct mlx5_ib_ucontext *c;
struct mlx5_ib_dev *dev;
u32 user_idx;
s32 dev_idx;
- c = devx_ufile2uctx(file);
+ c = devx_ufile2uctx(attrs);
if (IS_ERR(c))
return PTR_ERR(c);
dev = to_mdev(c->ibucontext.device);
@@ -681,7 +815,7 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_QUERY_UAR)(
}
static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_OTHER)(
- struct ib_uverbs_file *file, struct uverbs_attr_bundle *attrs)
+ struct uverbs_attr_bundle *attrs)
{
struct mlx5_ib_ucontext *c;
struct mlx5_ib_dev *dev;
@@ -693,7 +827,7 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_OTHER)(
int err;
int uid;
- c = devx_ufile2uctx(file);
+ c = devx_ufile2uctx(attrs);
if (IS_ERR(c))
return PTR_ERR(c);
dev = to_mdev(c->ibucontext.device);
@@ -740,6 +874,10 @@ static void devx_obj_build_destroy_cmd(void *in, void *out, void *din,
MLX5_SET(general_obj_in_cmd_hdr, din, obj_type, obj_type);
break;
+ case MLX5_CMD_OP_CREATE_UMEM:
+ MLX5_SET(general_obj_in_cmd_hdr, din, opcode,
+ MLX5_CMD_OP_DESTROY_UMEM);
+ break;
case MLX5_CMD_OP_CREATE_MKEY:
MLX5_SET(general_obj_in_cmd_hdr, din, opcode, MLX5_CMD_OP_DESTROY_MKEY);
break;
@@ -892,6 +1030,92 @@ static void devx_obj_build_destroy_cmd(void *in, void *out, void *din,
}
}
+static int devx_handle_mkey_indirect(struct devx_obj *obj,
+ struct mlx5_ib_dev *dev,
+ void *in, void *out)
+{
+ struct mlx5_mkey_table *table = &dev->mdev->priv.mkey_table;
+ struct mlx5_ib_devx_mr *devx_mr = &obj->devx_mr;
+ unsigned long flags;
+ struct mlx5_core_mkey *mkey;
+ void *mkc;
+ u8 key;
+ int err;
+
+ mkey = &devx_mr->mmkey;
+ mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
+ key = MLX5_GET(mkc, mkc, mkey_7_0);
+ mkey->key = mlx5_idx_to_mkey(
+ MLX5_GET(create_mkey_out, out, mkey_index)) | key;
+ mkey->type = MLX5_MKEY_INDIRECT_DEVX;
+ mkey->iova = MLX5_GET64(mkc, mkc, start_addr);
+ mkey->size = MLX5_GET64(mkc, mkc, len);
+ mkey->pd = MLX5_GET(mkc, mkc, pd);
+ devx_mr->ndescs = MLX5_GET(mkc, mkc, translations_octword_size);
+
+ write_lock_irqsave(&table->lock, flags);
+ err = radix_tree_insert(&table->tree, mlx5_base_mkey(mkey->key),
+ mkey);
+ write_unlock_irqrestore(&table->lock, flags);
+ return err;
+}
+
+static int devx_handle_mkey_create(struct mlx5_ib_dev *dev,
+ struct devx_obj *obj,
+ void *in, int in_len)
+{
+ int min_len = MLX5_BYTE_OFF(create_mkey_in, memory_key_mkey_entry) +
+ MLX5_FLD_SZ_BYTES(create_mkey_in,
+ memory_key_mkey_entry);
+ void *mkc;
+ u8 access_mode;
+
+ if (in_len < min_len)
+ return -EINVAL;
+
+ mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
+
+ access_mode = MLX5_GET(mkc, mkc, access_mode_1_0);
+ access_mode |= MLX5_GET(mkc, mkc, access_mode_4_2) << 2;
+
+ if (access_mode == MLX5_MKC_ACCESS_MODE_KLMS ||
+ access_mode == MLX5_MKC_ACCESS_MODE_KSM) {
+ if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING))
+ obj->flags |= DEVX_OBJ_FLAGS_INDIRECT_MKEY;
+ return 0;
+ }
+
+ MLX5_SET(create_mkey_in, in, mkey_umem_valid, 1);
+ return 0;
+}
+
+static void devx_free_indirect_mkey(struct rcu_head *rcu)
+{
+ kfree(container_of(rcu, struct devx_obj, devx_mr.rcu));
+}
+
+/* This function to delete from the radix tree needs to be called before
+ * destroying the underlying mkey. Otherwise a race might occur in case that
+ * other thread will get the same mkey before this one will be deleted,
+ * in that case it will fail via inserting to the tree its own data.
+ *
+ * Note:
+ * An error in the destroy is not expected unless there is some other indirect
+ * mkey which points to this one. In a kernel cleanup flow it will be just
+ * destroyed in the iterative destruction call. In a user flow, in case
+ * the application didn't close in the expected order it's its own problem,
+ * the mkey won't be part of the tree, in both cases the kernel is safe.
+ */
+static void devx_cleanup_mkey(struct devx_obj *obj)
+{
+ struct mlx5_mkey_table *table = &obj->mdev->priv.mkey_table;
+ unsigned long flags;
+
+ write_lock_irqsave(&table->lock, flags);
+ radix_tree_delete(&table->tree, mlx5_base_mkey(obj->devx_mr.mmkey.key));
+ write_unlock_irqrestore(&table->lock, flags);
+}
+
static int devx_obj_cleanup(struct ib_uobject *uobject,
enum rdma_remove_reason why)
{
@@ -899,24 +1123,42 @@ static int devx_obj_cleanup(struct ib_uobject *uobject,
struct devx_obj *obj = uobject->object;
int ret;
- ret = mlx5_cmd_exec(obj->mdev, obj->dinbox, obj->dinlen, out, sizeof(out));
+ if (obj->flags & DEVX_OBJ_FLAGS_INDIRECT_MKEY)
+ devx_cleanup_mkey(obj);
+
+ if (obj->flags & DEVX_OBJ_FLAGS_DCT)
+ ret = mlx5_core_destroy_dct(obj->mdev, &obj->core_dct);
+ else
+ ret = mlx5_cmd_exec(obj->mdev, obj->dinbox, obj->dinlen, out,
+ sizeof(out));
if (ib_is_destroy_retryable(ret, why, uobject))
return ret;
+ if (obj->flags & DEVX_OBJ_FLAGS_INDIRECT_MKEY) {
+ struct mlx5_ib_dev *dev = to_mdev(uobject->context->device);
+
+ call_srcu(&dev->mr_srcu, &obj->devx_mr.rcu,
+ devx_free_indirect_mkey);
+ return ret;
+ }
+
kfree(obj);
return ret;
}
static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_OBJ_CREATE)(
- struct ib_uverbs_file *file, struct uverbs_attr_bundle *attrs)
+ struct uverbs_attr_bundle *attrs)
{
void *cmd_in = uverbs_attr_get_alloced_ptr(attrs, MLX5_IB_ATTR_DEVX_OBJ_CREATE_CMD_IN);
int cmd_out_len = uverbs_attr_get_len(attrs,
MLX5_IB_ATTR_DEVX_OBJ_CREATE_CMD_OUT);
+ int cmd_in_len = uverbs_attr_get_len(attrs,
+ MLX5_IB_ATTR_DEVX_OBJ_CREATE_CMD_IN);
void *cmd_out;
struct ib_uobject *uobj = uverbs_attr_get_uobject(
attrs, MLX5_IB_ATTR_DEVX_OBJ_CREATE_HANDLE);
- struct mlx5_ib_ucontext *c = to_mucontext(uobj->context);
+ struct mlx5_ib_ucontext *c = rdma_udata_to_drv_context(
+ &attrs->driver_udata, struct mlx5_ib_ucontext, ibucontext);
struct mlx5_ib_dev *dev = to_mdev(c->ibucontext.device);
u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)];
struct devx_obj *obj;
@@ -941,11 +1183,25 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_OBJ_CREATE)(
return -ENOMEM;
MLX5_SET(general_obj_in_cmd_hdr, cmd_in, uid, uid);
- devx_set_umem_valid(cmd_in);
+ if (opcode == MLX5_CMD_OP_CREATE_MKEY) {
+ err = devx_handle_mkey_create(dev, obj, cmd_in, cmd_in_len);
+ if (err)
+ goto obj_free;
+ } else {
+ devx_set_umem_valid(cmd_in);
+ }
+
+ if (opcode == MLX5_CMD_OP_CREATE_DCT) {
+ obj->flags |= DEVX_OBJ_FLAGS_DCT;
+ err = mlx5_core_create_dct(dev->mdev, &obj->core_dct,
+ cmd_in, cmd_in_len,
+ cmd_out, cmd_out_len);
+ } else {
+ err = mlx5_cmd_exec(dev->mdev, cmd_in,
+ cmd_in_len,
+ cmd_out, cmd_out_len);
+ }
- err = mlx5_cmd_exec(dev->mdev, cmd_in,
- uverbs_attr_get_len(attrs, MLX5_IB_ATTR_DEVX_OBJ_CREATE_CMD_IN),
- cmd_out, cmd_out_len);
if (err)
goto obj_free;
@@ -955,30 +1211,44 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_OBJ_CREATE)(
&obj_id);
WARN_ON(obj->dinlen > MLX5_MAX_DESTROY_INBOX_SIZE_DW * sizeof(u32));
+ if (obj->flags & DEVX_OBJ_FLAGS_INDIRECT_MKEY) {
+ err = devx_handle_mkey_indirect(obj, dev, cmd_in, cmd_out);
+ if (err)
+ goto obj_destroy;
+ }
+
err = uverbs_copy_to(attrs, MLX5_IB_ATTR_DEVX_OBJ_CREATE_CMD_OUT, cmd_out, cmd_out_len);
if (err)
- goto obj_destroy;
+ goto err_copy;
obj->obj_id = get_enc_obj_id(opcode, obj_id);
return 0;
+err_copy:
+ if (obj->flags & DEVX_OBJ_FLAGS_INDIRECT_MKEY)
+ devx_cleanup_mkey(obj);
obj_destroy:
- mlx5_cmd_exec(obj->mdev, obj->dinbox, obj->dinlen, out, sizeof(out));
+ if (obj->flags & DEVX_OBJ_FLAGS_DCT)
+ mlx5_core_destroy_dct(obj->mdev, &obj->core_dct);
+ else
+ mlx5_cmd_exec(obj->mdev, obj->dinbox, obj->dinlen, out,
+ sizeof(out));
obj_free:
kfree(obj);
return err;
}
static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_OBJ_MODIFY)(
- struct ib_uverbs_file *file, struct uverbs_attr_bundle *attrs)
+ struct uverbs_attr_bundle *attrs)
{
void *cmd_in = uverbs_attr_get_alloced_ptr(attrs, MLX5_IB_ATTR_DEVX_OBJ_MODIFY_CMD_IN);
int cmd_out_len = uverbs_attr_get_len(attrs,
MLX5_IB_ATTR_DEVX_OBJ_MODIFY_CMD_OUT);
struct ib_uobject *uobj = uverbs_attr_get_uobject(attrs,
MLX5_IB_ATTR_DEVX_OBJ_MODIFY_HANDLE);
- struct mlx5_ib_ucontext *c = to_mucontext(uobj->context);
- struct devx_obj *obj = uobj->object;
+ struct mlx5_ib_ucontext *c = rdma_udata_to_drv_context(
+ &attrs->driver_udata, struct mlx5_ib_ucontext, ibucontext);
+ struct mlx5_ib_dev *mdev = to_mdev(c->ibucontext.device);
void *cmd_out;
int err;
int uid;
@@ -990,7 +1260,7 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_OBJ_MODIFY)(
if (!devx_is_obj_modify_cmd(cmd_in))
return -EINVAL;
- if (!devx_is_valid_obj_id(obj, cmd_in))
+ if (!devx_is_valid_obj_id(uobj, cmd_in))
return -EINVAL;
cmd_out = uverbs_zalloc(attrs, cmd_out_len);
@@ -1000,7 +1270,7 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_OBJ_MODIFY)(
MLX5_SET(general_obj_in_cmd_hdr, cmd_in, uid, uid);
devx_set_umem_valid(cmd_in);
- err = mlx5_cmd_exec(obj->mdev, cmd_in,
+ err = mlx5_cmd_exec(mdev->mdev, cmd_in,
uverbs_attr_get_len(attrs, MLX5_IB_ATTR_DEVX_OBJ_MODIFY_CMD_IN),
cmd_out, cmd_out_len);
if (err)
@@ -1011,18 +1281,19 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_OBJ_MODIFY)(
}
static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_OBJ_QUERY)(
- struct ib_uverbs_file *file, struct uverbs_attr_bundle *attrs)
+ struct uverbs_attr_bundle *attrs)
{
void *cmd_in = uverbs_attr_get_alloced_ptr(attrs, MLX5_IB_ATTR_DEVX_OBJ_QUERY_CMD_IN);
int cmd_out_len = uverbs_attr_get_len(attrs,
MLX5_IB_ATTR_DEVX_OBJ_QUERY_CMD_OUT);
struct ib_uobject *uobj = uverbs_attr_get_uobject(attrs,
MLX5_IB_ATTR_DEVX_OBJ_QUERY_HANDLE);
- struct mlx5_ib_ucontext *c = to_mucontext(uobj->context);
- struct devx_obj *obj = uobj->object;
+ struct mlx5_ib_ucontext *c = rdma_udata_to_drv_context(
+ &attrs->driver_udata, struct mlx5_ib_ucontext, ibucontext);
void *cmd_out;
int err;
int uid;
+ struct mlx5_ib_dev *mdev = to_mdev(c->ibucontext.device);
uid = devx_get_uid(c, cmd_in);
if (uid < 0)
@@ -1031,7 +1302,7 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_OBJ_QUERY)(
if (!devx_is_obj_query_cmd(cmd_in))
return -EINVAL;
- if (!devx_is_valid_obj_id(obj, cmd_in))
+ if (!devx_is_valid_obj_id(uobj, cmd_in))
return -EINVAL;
cmd_out = uverbs_zalloc(attrs, cmd_out_len);
@@ -1039,7 +1310,7 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_OBJ_QUERY)(
return PTR_ERR(cmd_out);
MLX5_SET(general_obj_in_cmd_hdr, cmd_in, uid, uid);
- err = mlx5_cmd_exec(obj->mdev, cmd_in,
+ err = mlx5_cmd_exec(mdev->mdev, cmd_in,
uverbs_attr_get_len(attrs, MLX5_IB_ATTR_DEVX_OBJ_QUERY_CMD_IN),
cmd_out, cmd_out_len);
if (err)
@@ -1049,6 +1320,154 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_OBJ_QUERY)(
cmd_out, cmd_out_len);
}
+struct devx_async_event_queue {
+ spinlock_t lock;
+ wait_queue_head_t poll_wait;
+ struct list_head event_list;
+ atomic_t bytes_in_use;
+ u8 is_destroyed:1;
+};
+
+struct devx_async_cmd_event_file {
+ struct ib_uobject uobj;
+ struct devx_async_event_queue ev_queue;
+ struct mlx5_async_ctx async_ctx;
+};
+
+static void devx_init_event_queue(struct devx_async_event_queue *ev_queue)
+{
+ spin_lock_init(&ev_queue->lock);
+ INIT_LIST_HEAD(&ev_queue->event_list);
+ init_waitqueue_head(&ev_queue->poll_wait);
+ atomic_set(&ev_queue->bytes_in_use, 0);
+ ev_queue->is_destroyed = 0;
+}
+
+static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_ASYNC_CMD_FD_ALLOC)(
+ struct uverbs_attr_bundle *attrs)
+{
+ struct devx_async_cmd_event_file *ev_file;
+
+ struct ib_uobject *uobj = uverbs_attr_get_uobject(
+ attrs, MLX5_IB_ATTR_DEVX_ASYNC_CMD_FD_ALLOC_HANDLE);
+ struct mlx5_ib_dev *mdev = to_mdev(uobj->context->device);
+
+ ev_file = container_of(uobj, struct devx_async_cmd_event_file,
+ uobj);
+ devx_init_event_queue(&ev_file->ev_queue);
+ mlx5_cmd_init_async_ctx(mdev->mdev, &ev_file->async_ctx);
+ return 0;
+}
+
+static void devx_query_callback(int status, struct mlx5_async_work *context)
+{
+ struct devx_async_data *async_data =
+ container_of(context, struct devx_async_data, cb_work);
+ struct ib_uobject *fd_uobj = async_data->fd_uobj;
+ struct devx_async_cmd_event_file *ev_file;
+ struct devx_async_event_queue *ev_queue;
+ unsigned long flags;
+
+ ev_file = container_of(fd_uobj, struct devx_async_cmd_event_file,
+ uobj);
+ ev_queue = &ev_file->ev_queue;
+
+ spin_lock_irqsave(&ev_queue->lock, flags);
+ list_add_tail(&async_data->list, &ev_queue->event_list);
+ spin_unlock_irqrestore(&ev_queue->lock, flags);
+
+ wake_up_interruptible(&ev_queue->poll_wait);
+ fput(fd_uobj->object);
+}
+
+#define MAX_ASYNC_BYTES_IN_USE (1024 * 1024) /* 1MB */
+
+static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_OBJ_ASYNC_QUERY)(
+ struct uverbs_attr_bundle *attrs)
+{
+ void *cmd_in = uverbs_attr_get_alloced_ptr(attrs,
+ MLX5_IB_ATTR_DEVX_OBJ_QUERY_ASYNC_CMD_IN);
+ struct ib_uobject *uobj = uverbs_attr_get_uobject(
+ attrs,
+ MLX5_IB_ATTR_DEVX_OBJ_QUERY_ASYNC_HANDLE);
+ u16 cmd_out_len;
+ struct mlx5_ib_ucontext *c = rdma_udata_to_drv_context(
+ &attrs->driver_udata, struct mlx5_ib_ucontext, ibucontext);
+ struct ib_uobject *fd_uobj;
+ int err;
+ int uid;
+ struct mlx5_ib_dev *mdev = to_mdev(c->ibucontext.device);
+ struct devx_async_cmd_event_file *ev_file;
+ struct devx_async_data *async_data;
+
+ uid = devx_get_uid(c, cmd_in);
+ if (uid < 0)
+ return uid;
+
+ if (!devx_is_obj_query_cmd(cmd_in))
+ return -EINVAL;
+
+ err = uverbs_get_const(&cmd_out_len, attrs,
+ MLX5_IB_ATTR_DEVX_OBJ_QUERY_ASYNC_OUT_LEN);
+ if (err)
+ return err;
+
+ if (!devx_is_valid_obj_id(uobj, cmd_in))
+ return -EINVAL;
+
+ fd_uobj = uverbs_attr_get_uobject(attrs,
+ MLX5_IB_ATTR_DEVX_OBJ_QUERY_ASYNC_FD);
+ if (IS_ERR(fd_uobj))
+ return PTR_ERR(fd_uobj);
+
+ ev_file = container_of(fd_uobj, struct devx_async_cmd_event_file,
+ uobj);
+
+ if (atomic_add_return(cmd_out_len, &ev_file->ev_queue.bytes_in_use) >
+ MAX_ASYNC_BYTES_IN_USE) {
+ atomic_sub(cmd_out_len, &ev_file->ev_queue.bytes_in_use);
+ return -EAGAIN;
+ }
+
+ async_data = kvzalloc(struct_size(async_data, hdr.out_data,
+ cmd_out_len), GFP_KERNEL);
+ if (!async_data) {
+ err = -ENOMEM;
+ goto sub_bytes;
+ }
+
+ err = uverbs_copy_from(&async_data->hdr.wr_id, attrs,
+ MLX5_IB_ATTR_DEVX_OBJ_QUERY_ASYNC_WR_ID);
+ if (err)
+ goto free_async;
+
+ async_data->cmd_out_len = cmd_out_len;
+ async_data->mdev = mdev;
+ async_data->fd_uobj = fd_uobj;
+
+ get_file(fd_uobj->object);
+ MLX5_SET(general_obj_in_cmd_hdr, cmd_in, uid, uid);
+ err = mlx5_cmd_exec_cb(&ev_file->async_ctx, cmd_in,
+ uverbs_attr_get_len(attrs,
+ MLX5_IB_ATTR_DEVX_OBJ_QUERY_ASYNC_CMD_IN),
+ async_data->hdr.out_data,
+ async_data->cmd_out_len,
+ devx_query_callback, &async_data->cb_work);
+
+ if (err)
+ goto cb_err;
+
+ return 0;
+
+cb_err:
+ fput(fd_uobj->object);
+free_async:
+ kvfree(async_data);
+sub_bytes:
+ atomic_sub(cmd_out_len, &ev_file->ev_queue.bytes_in_use);
+ return err;
+}
+
static int devx_umem_get(struct mlx5_ib_dev *dev, struct ib_ucontext *ucontext,
struct uverbs_attr_bundle *attrs,
struct devx_umem *obj)
@@ -1076,7 +1495,7 @@ static int devx_umem_get(struct mlx5_ib_dev *dev, struct ib_ucontext *ucontext,
if (err)
return err;
- obj->umem = ib_umem_get(ucontext, addr, size, access, 0);
+ obj->umem = ib_umem_get(&attrs->driver_udata, addr, size, access, 0);
if (IS_ERR(obj->umem))
return PTR_ERR(obj->umem);
@@ -1115,8 +1534,7 @@ static void devx_umem_reg_cmd_build(struct mlx5_ib_dev *dev,
umem = MLX5_ADDR_OF(create_umem_in, cmd->in, umem);
mtt = (__be64 *)MLX5_ADDR_OF(umem, umem, mtt);
- MLX5_SET(general_obj_in_cmd_hdr, cmd->in, opcode, MLX5_CMD_OP_CREATE_GENERAL_OBJECT);
- MLX5_SET(general_obj_in_cmd_hdr, cmd->in, obj_type, MLX5_OBJ_TYPE_UMEM);
+ MLX5_SET(create_umem_in, cmd->in, opcode, MLX5_CMD_OP_CREATE_UMEM);
MLX5_SET64(umem, umem, num_of_mtt, obj->ncont);
MLX5_SET(umem, umem, log_page_size, obj->page_shift -
MLX5_ADAPTER_PAGE_SHIFT);
@@ -1127,23 +1545,21 @@ static void devx_umem_reg_cmd_build(struct mlx5_ib_dev *dev,
}
static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_UMEM_REG)(
- struct ib_uverbs_file *file, struct uverbs_attr_bundle *attrs)
+ struct uverbs_attr_bundle *attrs)
{
struct devx_umem_reg_cmd cmd;
struct devx_umem *obj;
struct ib_uobject *uobj = uverbs_attr_get_uobject(
attrs, MLX5_IB_ATTR_DEVX_UMEM_REG_HANDLE);
u32 obj_id;
- struct mlx5_ib_ucontext *c = to_mucontext(uobj->context);
+ struct mlx5_ib_ucontext *c = rdma_udata_to_drv_context(
+ &attrs->driver_udata, struct mlx5_ib_ucontext, ibucontext);
struct mlx5_ib_dev *dev = to_mdev(c->ibucontext.device);
int err;
if (!c->devx_uid)
return -EINVAL;
- if (!capable(CAP_NET_RAW))
- return -EPERM;
-
obj = kzalloc(sizeof(struct devx_umem), GFP_KERNEL);
if (!obj)
return -ENOMEM;
@@ -1158,7 +1574,7 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_UMEM_REG)(
devx_umem_reg_cmd_build(dev, obj, &cmd);
- MLX5_SET(general_obj_in_cmd_hdr, cmd.in, uid, c->devx_uid);
+ MLX5_SET(create_umem_in, cmd.in, uid, c->devx_uid);
err = mlx5_cmd_exec(dev->mdev, cmd.in, cmd.inlen, cmd.out,
sizeof(cmd.out));
if (err)
@@ -1198,6 +1614,123 @@ static int devx_umem_cleanup(struct ib_uobject *uobject,
return 0;
}
+static ssize_t devx_async_cmd_event_read(struct file *filp, char __user *buf,
+ size_t count, loff_t *pos)
+{
+ struct devx_async_cmd_event_file *comp_ev_file = filp->private_data;
+ struct devx_async_event_queue *ev_queue = &comp_ev_file->ev_queue;
+ struct devx_async_data *event;
+ int ret = 0;
+ size_t eventsz;
+
+ spin_lock_irq(&ev_queue->lock);
+
+ while (list_empty(&ev_queue->event_list)) {
+ spin_unlock_irq(&ev_queue->lock);
+
+ if (filp->f_flags & O_NONBLOCK)
+ return -EAGAIN;
+
+ if (wait_event_interruptible(
+ ev_queue->poll_wait,
+ (!list_empty(&ev_queue->event_list) ||
+ ev_queue->is_destroyed))) {
+ return -ERESTARTSYS;
+ }
+
+ if (list_empty(&ev_queue->event_list) &&
+ ev_queue->is_destroyed)
+ return -EIO;
+
+ spin_lock_irq(&ev_queue->lock);
+ }
+
+ event = list_entry(ev_queue->event_list.next,
+ struct devx_async_data, list);
+ eventsz = event->cmd_out_len +
+ sizeof(struct mlx5_ib_uapi_devx_async_cmd_hdr);
+
+ if (eventsz > count) {
+ spin_unlock_irq(&ev_queue->lock);
+ return -ENOSPC;
+ }
+
+ list_del(ev_queue->event_list.next);
+ spin_unlock_irq(&ev_queue->lock);
+
+ if (copy_to_user(buf, &event->hdr, eventsz))
+ ret = -EFAULT;
+ else
+ ret = eventsz;
+
+ atomic_sub(event->cmd_out_len, &ev_queue->bytes_in_use);
+ kvfree(event);
+ return ret;
+}
+
+static int devx_async_cmd_event_close(struct inode *inode, struct file *filp)
+{
+ struct ib_uobject *uobj = filp->private_data;
+ struct devx_async_cmd_event_file *comp_ev_file = container_of(
+ uobj, struct devx_async_cmd_event_file, uobj);
+ struct devx_async_data *entry, *tmp;
+
+ spin_lock_irq(&comp_ev_file->ev_queue.lock);
+ list_for_each_entry_safe(entry, tmp,
+ &comp_ev_file->ev_queue.event_list, list)
+ kvfree(entry);
+ spin_unlock_irq(&comp_ev_file->ev_queue.lock);
+
+ uverbs_close_fd(filp);
+ return 0;
+}
+
+static __poll_t devx_async_cmd_event_poll(struct file *filp,
+ struct poll_table_struct *wait)
+{
+ struct devx_async_cmd_event_file *comp_ev_file = filp->private_data;
+ struct devx_async_event_queue *ev_queue = &comp_ev_file->ev_queue;
+ __poll_t pollflags = 0;
+
+ poll_wait(filp, &ev_queue->poll_wait, wait);
+
+ spin_lock_irq(&ev_queue->lock);
+ if (ev_queue->is_destroyed)
+ pollflags = EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;
+ else if (!list_empty(&ev_queue->event_list))
+ pollflags = EPOLLIN | EPOLLRDNORM;
+ spin_unlock_irq(&ev_queue->lock);
+
+ return pollflags;
+}
+
+const struct file_operations devx_async_cmd_event_fops = {
+ .owner = THIS_MODULE,
+ .read = devx_async_cmd_event_read,
+ .poll = devx_async_cmd_event_poll,
+ .release = devx_async_cmd_event_close,
+ .llseek = no_llseek,
+};
+
+static int devx_hot_unplug_async_cmd_event_file(struct ib_uobject *uobj,
+ enum rdma_remove_reason why)
+{
+ struct devx_async_cmd_event_file *comp_ev_file =
+ container_of(uobj, struct devx_async_cmd_event_file,
+ uobj);
+ struct devx_async_event_queue *ev_queue = &comp_ev_file->ev_queue;
+
+ spin_lock_irq(&ev_queue->lock);
+ ev_queue->is_destroyed = 1;
+ spin_unlock_irq(&ev_queue->lock);
+
+ if (why == RDMA_REMOVE_DRIVER_REMOVE)
+ wake_up_interruptible(&ev_queue->poll_wait);
+
+ mlx5_cmd_cleanup_async_ctx(&comp_ev_file->async_ctx);
+ return 0;
+};
+
DECLARE_UVERBS_NAMED_METHOD(
MLX5_IB_METHOD_DEVX_UMEM_REG,
UVERBS_ATTR_IDR(MLX5_IB_ATTR_DEVX_UMEM_REG_HANDLE,
@@ -1279,7 +1812,7 @@ DECLARE_UVERBS_NAMED_METHOD_DESTROY(
DECLARE_UVERBS_NAMED_METHOD(
MLX5_IB_METHOD_DEVX_OBJ_MODIFY,
UVERBS_ATTR_IDR(MLX5_IB_ATTR_DEVX_OBJ_MODIFY_HANDLE,
- MLX5_IB_OBJECT_DEVX_OBJ,
+ UVERBS_IDR_ANY_OBJECT,
UVERBS_ACCESS_WRITE,
UA_MANDATORY),
UVERBS_ATTR_PTR_IN(
@@ -1295,7 +1828,7 @@ DECLARE_UVERBS_NAMED_METHOD(
DECLARE_UVERBS_NAMED_METHOD(
MLX5_IB_METHOD_DEVX_OBJ_QUERY,
UVERBS_ATTR_IDR(MLX5_IB_ATTR_DEVX_OBJ_QUERY_HANDLE,
- MLX5_IB_OBJECT_DEVX_OBJ,
+ UVERBS_IDR_ANY_OBJECT,
UVERBS_ACCESS_READ,
UA_MANDATORY),
UVERBS_ATTR_PTR_IN(
@@ -1308,6 +1841,27 @@ DECLARE_UVERBS_NAMED_METHOD(
UVERBS_ATTR_MIN_SIZE(MLX5_ST_SZ_BYTES(general_obj_out_cmd_hdr)),
UA_MANDATORY));
+DECLARE_UVERBS_NAMED_METHOD(
+ MLX5_IB_METHOD_DEVX_OBJ_ASYNC_QUERY,
+ UVERBS_ATTR_IDR(MLX5_IB_ATTR_DEVX_OBJ_QUERY_HANDLE,
+ UVERBS_IDR_ANY_OBJECT,
+ UVERBS_ACCESS_READ,
+ UA_MANDATORY),
+ UVERBS_ATTR_PTR_IN(
+ MLX5_IB_ATTR_DEVX_OBJ_QUERY_CMD_IN,
+ UVERBS_ATTR_MIN_SIZE(MLX5_ST_SZ_BYTES(general_obj_in_cmd_hdr)),
+ UA_MANDATORY,
+ UA_ALLOC_AND_COPY),
+ UVERBS_ATTR_CONST_IN(MLX5_IB_ATTR_DEVX_OBJ_QUERY_ASYNC_OUT_LEN,
+ u16, UA_MANDATORY),
+ UVERBS_ATTR_FD(MLX5_IB_ATTR_DEVX_OBJ_QUERY_ASYNC_FD,
+ MLX5_IB_OBJECT_DEVX_ASYNC_CMD_FD,
+ UVERBS_ACCESS_READ,
+ UA_MANDATORY),
+ UVERBS_ATTR_PTR_IN(MLX5_IB_ATTR_DEVX_OBJ_QUERY_ASYNC_WR_ID,
+ UVERBS_ATTR_TYPE(u64),
+ UA_MANDATORY));
+
DECLARE_UVERBS_GLOBAL_METHODS(MLX5_IB_OBJECT_DEVX,
&UVERBS_METHOD(MLX5_IB_METHOD_DEVX_OTHER),
&UVERBS_METHOD(MLX5_IB_METHOD_DEVX_QUERY_UAR),
@@ -1318,19 +1872,49 @@ DECLARE_UVERBS_NAMED_OBJECT(MLX5_IB_OBJECT_DEVX_OBJ,
&UVERBS_METHOD(MLX5_IB_METHOD_DEVX_OBJ_CREATE),
&UVERBS_METHOD(MLX5_IB_METHOD_DEVX_OBJ_DESTROY),
&UVERBS_METHOD(MLX5_IB_METHOD_DEVX_OBJ_MODIFY),
- &UVERBS_METHOD(MLX5_IB_METHOD_DEVX_OBJ_QUERY));
+ &UVERBS_METHOD(MLX5_IB_METHOD_DEVX_OBJ_QUERY),
+ &UVERBS_METHOD(MLX5_IB_METHOD_DEVX_OBJ_ASYNC_QUERY));
DECLARE_UVERBS_NAMED_OBJECT(MLX5_IB_OBJECT_DEVX_UMEM,
UVERBS_TYPE_ALLOC_IDR(devx_umem_cleanup),
&UVERBS_METHOD(MLX5_IB_METHOD_DEVX_UMEM_REG),
&UVERBS_METHOD(MLX5_IB_METHOD_DEVX_UMEM_DEREG));
-DECLARE_UVERBS_OBJECT_TREE(devx_objects,
- &UVERBS_OBJECT(MLX5_IB_OBJECT_DEVX),
- &UVERBS_OBJECT(MLX5_IB_OBJECT_DEVX_OBJ),
- &UVERBS_OBJECT(MLX5_IB_OBJECT_DEVX_UMEM));
-const struct uverbs_object_tree_def *mlx5_ib_get_devx_tree(void)
+DECLARE_UVERBS_NAMED_METHOD(
+ MLX5_IB_METHOD_DEVX_ASYNC_CMD_FD_ALLOC,
+ UVERBS_ATTR_FD(MLX5_IB_ATTR_DEVX_ASYNC_CMD_FD_ALLOC_HANDLE,
+ MLX5_IB_OBJECT_DEVX_ASYNC_CMD_FD,
+ UVERBS_ACCESS_NEW,
+ UA_MANDATORY));
+
+DECLARE_UVERBS_NAMED_OBJECT(
+ MLX5_IB_OBJECT_DEVX_ASYNC_CMD_FD,
+ UVERBS_TYPE_ALLOC_FD(sizeof(struct devx_async_cmd_event_file),
+ devx_hot_unplug_async_cmd_event_file,
+ &devx_async_cmd_event_fops, "[devx_async_cmd]",
+ O_RDONLY),
+ &UVERBS_METHOD(MLX5_IB_METHOD_DEVX_ASYNC_CMD_FD_ALLOC));
+
+static bool devx_is_supported(struct ib_device *device)
{
- return &devx_objects;
+ struct mlx5_ib_dev *dev = to_mdev(device);
+
+ return !dev->rep && MLX5_CAP_GEN(dev->mdev, log_max_uctx);
}
+
+const struct uapi_definition mlx5_ib_devx_defs[] = {
+ UAPI_DEF_CHAIN_OBJ_TREE_NAMED(
+ MLX5_IB_OBJECT_DEVX,
+ UAPI_DEF_IS_OBJ_SUPPORTED(devx_is_supported)),
+ UAPI_DEF_CHAIN_OBJ_TREE_NAMED(
+ MLX5_IB_OBJECT_DEVX_OBJ,
+ UAPI_DEF_IS_OBJ_SUPPORTED(devx_is_supported)),
+ UAPI_DEF_CHAIN_OBJ_TREE_NAMED(
+ MLX5_IB_OBJECT_DEVX_UMEM,
+ UAPI_DEF_IS_OBJ_SUPPORTED(devx_is_supported)),
+ UAPI_DEF_CHAIN_OBJ_TREE_NAMED(
+ MLX5_IB_OBJECT_DEVX_ASYNC_CMD_FD,
+ UAPI_DEF_IS_OBJ_SUPPORTED(devx_is_supported)),
+ {},
+};
diff --git a/drivers/infiniband/hw/mlx5/doorbell.c b/drivers/infiniband/hw/mlx5/doorbell.c
index a0e4e6ddb71a..8f4e5f22b84c 100644
--- a/drivers/infiniband/hw/mlx5/doorbell.c
+++ b/drivers/infiniband/hw/mlx5/doorbell.c
@@ -43,7 +43,8 @@ struct mlx5_ib_user_db_page {
int refcnt;
};
-int mlx5_ib_db_map_user(struct mlx5_ib_ucontext *context, unsigned long virt,
+int mlx5_ib_db_map_user(struct mlx5_ib_ucontext *context,
+ struct ib_udata *udata, unsigned long virt,
struct mlx5_db *db)
{
struct mlx5_ib_user_db_page *page;
@@ -63,8 +64,7 @@ int mlx5_ib_db_map_user(struct mlx5_ib_ucontext *context, unsigned long virt,
page->user_virt = (virt & PAGE_MASK);
page->refcnt = 0;
- page->umem = ib_umem_get(&context->ibucontext, virt & PAGE_MASK,
- PAGE_SIZE, 0, 0);
+ page->umem = ib_umem_get(udata, virt & PAGE_MASK, PAGE_SIZE, 0, 0);
if (IS_ERR(page->umem)) {
err = PTR_ERR(page->umem);
kfree(page);
diff --git a/drivers/infiniband/hw/mlx5/flow.c b/drivers/infiniband/hw/mlx5/flow.c
index f86cdcafdafc..798591a18484 100644
--- a/drivers/infiniband/hw/mlx5/flow.c
+++ b/drivers/infiniband/hw/mlx5/flow.c
@@ -60,7 +60,7 @@ static const struct uverbs_attr_spec mlx5_ib_flow_type[] = {
#define MLX5_IB_CREATE_FLOW_MAX_FLOW_ACTIONS 2
static int UVERBS_HANDLER(MLX5_IB_METHOD_CREATE_FLOW)(
- struct ib_uverbs_file *file, struct uverbs_attr_bundle *attrs)
+ struct uverbs_attr_bundle *attrs)
{
struct mlx5_flow_act flow_act = {.flow_tag = MLX5_FS_DEFAULT_FLOW_TAG};
struct mlx5_ib_flow_handler *flow_handler;
@@ -77,6 +77,7 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_CREATE_FLOW)(
uverbs_attr_get_uobject(attrs, MLX5_IB_ATTR_CREATE_FLOW_HANDLE);
struct mlx5_ib_dev *dev = to_mdev(uobj->context->device);
int len, ret, i;
+ u32 counter_id = 0;
if (!capable(CAP_NET_RAW))
return -EPERM;
@@ -92,10 +93,6 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_CREATE_FLOW)(
((dest_devx && dest_qp) || (!dest_devx && !dest_qp)))
return -EINVAL;
- if (fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_EGRESS &&
- (dest_devx || dest_qp))
- return -EINVAL;
-
if (dest_devx) {
devx_obj = uverbs_attr_get_obj(
attrs, MLX5_IB_ATTR_CREATE_FLOW_DEST_DEVX);
@@ -128,8 +125,19 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_CREATE_FLOW)(
dest_type = MLX5_FLOW_DESTINATION_TYPE_PORT;
}
- if (dev->rep)
- return -ENOTSUPP;
+ len = uverbs_attr_get_uobjs_arr(attrs,
+ MLX5_IB_ATTR_CREATE_FLOW_ARR_COUNTERS_DEVX, &arr_flow_actions);
+ if (len) {
+ devx_obj = arr_flow_actions[0]->object;
+
+ if (!mlx5_ib_devx_is_flow_counter(devx_obj, &counter_id))
+ return -EINVAL;
+ flow_act.action |= MLX5_FLOW_CONTEXT_ACTION_COUNT;
+ }
+
+ if (dest_type == MLX5_FLOW_DESTINATION_TYPE_TIR &&
+ fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_EGRESS)
+ return -EINVAL;
cmd_in = uverbs_attr_get_alloced_ptr(
attrs, MLX5_IB_ATTR_CREATE_FLOW_MATCH_VALUE);
@@ -164,6 +172,7 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_CREATE_FLOW)(
}
flow_handler = mlx5_ib_raw_fs_rule_add(dev, fs_matcher, &flow_act,
+ counter_id,
cmd_in, inlen,
dest_id, dest_type);
if (IS_ERR(flow_handler)) {
@@ -194,7 +203,7 @@ static int flow_matcher_cleanup(struct ib_uobject *uobject,
}
static int UVERBS_HANDLER(MLX5_IB_METHOD_FLOW_MATCHER_CREATE)(
- struct ib_uverbs_file *file, struct uverbs_attr_bundle *attrs)
+ struct uverbs_attr_bundle *attrs)
{
struct ib_uobject *uobj = uverbs_attr_get_uobject(
attrs, MLX5_IB_ATTR_FLOW_MATCHER_CREATE_HANDLE);
@@ -313,7 +322,6 @@ static bool mlx5_ib_modify_header_supported(struct mlx5_ib_dev *dev)
}
static int UVERBS_HANDLER(MLX5_IB_METHOD_FLOW_ACTION_CREATE_MODIFY_HEADER)(
- struct ib_uverbs_file *file,
struct uverbs_attr_bundle *attrs)
{
struct ib_uobject *uobj = uverbs_attr_get_uobject(
@@ -321,9 +329,8 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_FLOW_ACTION_CREATE_MODIFY_HEADER)(
struct mlx5_ib_dev *mdev = to_mdev(uobj->context->device);
enum mlx5_ib_uapi_flow_table_type ft_type;
struct ib_flow_action *action;
- size_t num_actions;
+ int num_actions;
void *in;
- int len;
int ret;
if (!mlx5_ib_modify_header_supported(mdev))
@@ -331,18 +338,17 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_FLOW_ACTION_CREATE_MODIFY_HEADER)(
in = uverbs_attr_get_alloced_ptr(attrs,
MLX5_IB_ATTR_CREATE_MODIFY_HEADER_ACTIONS_PRM);
- len = uverbs_attr_get_len(attrs,
- MLX5_IB_ATTR_CREATE_MODIFY_HEADER_ACTIONS_PRM);
- if (len % MLX5_UN_SZ_BYTES(set_action_in_add_action_in_auto))
- return -EINVAL;
+ num_actions = uverbs_attr_ptr_get_array_size(
+ attrs, MLX5_IB_ATTR_CREATE_MODIFY_HEADER_ACTIONS_PRM,
+ MLX5_UN_SZ_BYTES(set_action_in_add_action_in_auto));
+ if (num_actions < 0)
+ return num_actions;
ret = uverbs_get_const(&ft_type, attrs,
MLX5_IB_ATTR_CREATE_MODIFY_HEADER_FT_TYPE);
if (ret)
return ret;
-
- num_actions = len / MLX5_UN_SZ_BYTES(set_action_in_add_action_in_auto),
action = mlx5_ib_create_modify_header(mdev, ft_type, num_actions, in);
if (IS_ERR(action))
return PTR_ERR(action);
@@ -435,7 +441,6 @@ static int mlx5_ib_flow_action_create_packet_reformat_ctx(
}
static int UVERBS_HANDLER(MLX5_IB_METHOD_FLOW_ACTION_CREATE_PACKET_REFORMAT)(
- struct ib_uverbs_file *file,
struct uverbs_attr_bundle *attrs)
{
struct ib_uobject *uobj = uverbs_attr_get_uobject(attrs,
@@ -526,7 +531,11 @@ DECLARE_UVERBS_NAMED_METHOD(
UA_OPTIONAL),
UVERBS_ATTR_PTR_IN(MLX5_IB_ATTR_CREATE_FLOW_TAG,
UVERBS_ATTR_TYPE(u32),
- UA_OPTIONAL));
+ UA_OPTIONAL),
+ UVERBS_ATTR_IDRS_ARR(MLX5_IB_ATTR_CREATE_FLOW_ARR_COUNTERS_DEVX,
+ MLX5_IB_OBJECT_DEVX_OBJ,
+ UVERBS_ACCESS_READ, 1, 1,
+ UA_OPTIONAL));
DECLARE_UVERBS_NAMED_METHOD_DESTROY(
MLX5_IB_METHOD_DESTROY_FLOW,
@@ -610,16 +619,19 @@ DECLARE_UVERBS_NAMED_OBJECT(MLX5_IB_OBJECT_FLOW_MATCHER,
&UVERBS_METHOD(MLX5_IB_METHOD_FLOW_MATCHER_CREATE),
&UVERBS_METHOD(MLX5_IB_METHOD_FLOW_MATCHER_DESTROY));
-DECLARE_UVERBS_OBJECT_TREE(flow_objects,
- &UVERBS_OBJECT(MLX5_IB_OBJECT_FLOW_MATCHER));
-
-int mlx5_ib_get_flow_trees(const struct uverbs_object_tree_def **root)
+static bool flow_is_supported(struct ib_device *device)
{
- int i = 0;
-
- root[i++] = &flow_objects;
- root[i++] = &mlx5_ib_fs;
- root[i++] = &mlx5_ib_flow_actions;
-
- return i;
+ return !to_mdev(device)->rep;
}
+
+const struct uapi_definition mlx5_ib_flow_defs[] = {
+ UAPI_DEF_CHAIN_OBJ_TREE_NAMED(
+ MLX5_IB_OBJECT_FLOW_MATCHER,
+ UAPI_DEF_IS_OBJ_SUPPORTED(flow_is_supported)),
+ UAPI_DEF_CHAIN_OBJ_TREE(
+ UVERBS_OBJECT_FLOW,
+ &mlx5_ib_fs),
+ UAPI_DEF_CHAIN_OBJ_TREE(UVERBS_OBJECT_FLOW_ACTION,
+ &mlx5_ib_flow_actions),
+ {},
+};
diff --git a/drivers/infiniband/hw/mlx5/ib_rep.c b/drivers/infiniband/hw/mlx5/ib_rep.c
index 584ff2ea7810..b8639ac71336 100644
--- a/drivers/infiniband/hw/mlx5/ib_rep.c
+++ b/drivers/infiniband/hw/mlx5/ib_rep.c
@@ -3,9 +3,11 @@
* Copyright (c) 2018 Mellanox Technologies. All rights reserved.
*/
+#include <linux/mlx5/vport.h>
#include "ib_rep.h"
+#include "srq.h"
-static const struct mlx5_ib_profile rep_profile = {
+static const struct mlx5_ib_profile vf_rep_profile = {
STAGE_CREATE(MLX5_IB_STAGE_INIT,
mlx5_ib_stage_init_init,
mlx5_ib_stage_init_cleanup),
@@ -21,6 +23,9 @@ static const struct mlx5_ib_profile rep_profile = {
STAGE_CREATE(MLX5_IB_STAGE_ROCE,
mlx5_ib_stage_rep_roce_init,
mlx5_ib_stage_rep_roce_cleanup),
+ STAGE_CREATE(MLX5_IB_STAGE_SRQ,
+ mlx5_init_srq_table,
+ mlx5_cleanup_srq_table),
STAGE_CREATE(MLX5_IB_STAGE_DEVICE_RESOURCES,
mlx5_ib_stage_dev_res_init,
mlx5_ib_stage_dev_res_cleanup),
@@ -42,23 +47,17 @@ static const struct mlx5_ib_profile rep_profile = {
};
static int
-mlx5_ib_nic_rep_load(struct mlx5_core_dev *dev, struct mlx5_eswitch_rep *rep)
-{
- return 0;
-}
-
-static void
-mlx5_ib_nic_rep_unload(struct mlx5_eswitch_rep *rep)
-{
- rep->rep_if[REP_IB].priv = NULL;
-}
-
-static int
mlx5_ib_vport_rep_load(struct mlx5_core_dev *dev, struct mlx5_eswitch_rep *rep)
{
+ const struct mlx5_ib_profile *profile;
struct mlx5_ib_dev *ibdev;
- ibdev = (struct mlx5_ib_dev *)ib_alloc_device(sizeof(*ibdev));
+ if (rep->vport == MLX5_VPORT_UPLINK)
+ profile = &uplink_rep_profile;
+ else
+ profile = &vf_rep_profile;
+
+ ibdev = ib_alloc_device(mlx5_ib_dev, ib_dev);
if (!ibdev)
return -ENOMEM;
@@ -66,8 +65,10 @@ mlx5_ib_vport_rep_load(struct mlx5_core_dev *dev, struct mlx5_eswitch_rep *rep)
ibdev->mdev = dev;
ibdev->num_ports = max(MLX5_CAP_GEN(dev, num_ports),
MLX5_CAP_GEN(dev, num_vhca_ports));
- if (!__mlx5_ib_add(ibdev, &rep_profile))
+ if (!__mlx5_ib_add(ibdev, profile)) {
+ ib_dealloc_device(&ibdev->ib_dev);
return -EINVAL;
+ }
rep->rep_if[REP_IB].priv = ibdev;
@@ -85,6 +86,7 @@ mlx5_ib_vport_rep_unload(struct mlx5_eswitch_rep *rep)
dev = mlx5_ib_rep_to_dev(rep);
__mlx5_ib_remove(dev, dev->profile, MLX5_IB_STAGE_MAX);
rep->rep_if[REP_IB].priv = NULL;
+ ib_dealloc_device(&dev->ib_dev);
}
static void *mlx5_ib_vport_get_proto_dev(struct mlx5_eswitch_rep *rep)
@@ -92,53 +94,23 @@ static void *mlx5_ib_vport_get_proto_dev(struct mlx5_eswitch_rep *rep)
return mlx5_ib_rep_to_dev(rep);
}
-static void mlx5_ib_rep_register_vf_vports(struct mlx5_ib_dev *dev)
+void mlx5_ib_register_vport_reps(struct mlx5_core_dev *mdev)
{
- struct mlx5_eswitch *esw = dev->mdev->priv.eswitch;
- int total_vfs = MLX5_TOTAL_VPORTS(dev->mdev);
- int vport;
-
- for (vport = 1; vport < total_vfs; vport++) {
- struct mlx5_eswitch_rep_if rep_if = {};
-
- rep_if.load = mlx5_ib_vport_rep_load;
- rep_if.unload = mlx5_ib_vport_rep_unload;
- rep_if.get_proto_dev = mlx5_ib_vport_get_proto_dev;
- mlx5_eswitch_register_vport_rep(esw, vport, &rep_if, REP_IB);
- }
-}
-
-static void mlx5_ib_rep_unregister_vf_vports(struct mlx5_ib_dev *dev)
-{
- struct mlx5_eswitch *esw = dev->mdev->priv.eswitch;
- int total_vfs = MLX5_TOTAL_VPORTS(dev->mdev);
- int vport;
-
- for (vport = 1; vport < total_vfs; vport++)
- mlx5_eswitch_unregister_vport_rep(esw, vport, REP_IB);
-}
-
-void mlx5_ib_register_vport_reps(struct mlx5_ib_dev *dev)
-{
- struct mlx5_eswitch *esw = dev->mdev->priv.eswitch;
+ struct mlx5_eswitch *esw = mdev->priv.eswitch;
struct mlx5_eswitch_rep_if rep_if = {};
- rep_if.load = mlx5_ib_nic_rep_load;
- rep_if.unload = mlx5_ib_nic_rep_unload;
+ rep_if.load = mlx5_ib_vport_rep_load;
+ rep_if.unload = mlx5_ib_vport_rep_unload;
rep_if.get_proto_dev = mlx5_ib_vport_get_proto_dev;
- rep_if.priv = dev;
-
- mlx5_eswitch_register_vport_rep(esw, 0, &rep_if, REP_IB);
- mlx5_ib_rep_register_vf_vports(dev);
+ mlx5_eswitch_register_vport_reps(esw, &rep_if, REP_IB);
}
-void mlx5_ib_unregister_vport_reps(struct mlx5_ib_dev *dev)
+void mlx5_ib_unregister_vport_reps(struct mlx5_core_dev *mdev)
{
- struct mlx5_eswitch *esw = dev->mdev->priv.eswitch;
+ struct mlx5_eswitch *esw = mdev->priv.eswitch;
- mlx5_ib_rep_unregister_vf_vports(dev); /* VFs vports */
- mlx5_eswitch_unregister_vport_rep(esw, 0, REP_IB); /* UPLINK PF*/
+ mlx5_eswitch_unregister_vport_reps(esw, REP_IB);
}
u8 mlx5_ib_eswitch_mode(struct mlx5_eswitch *esw)
diff --git a/drivers/infiniband/hw/mlx5/ib_rep.h b/drivers/infiniband/hw/mlx5/ib_rep.h
index 2ba73636a2fb..798d41e61fb4 100644
--- a/drivers/infiniband/hw/mlx5/ib_rep.h
+++ b/drivers/infiniband/hw/mlx5/ib_rep.h
@@ -10,14 +10,16 @@
#include "mlx5_ib.h"
#ifdef CONFIG_MLX5_ESWITCH
+extern const struct mlx5_ib_profile uplink_rep_profile;
+
u8 mlx5_ib_eswitch_mode(struct mlx5_eswitch *esw);
struct mlx5_ib_dev *mlx5_ib_get_rep_ibdev(struct mlx5_eswitch *esw,
int vport_index);
struct mlx5_ib_dev *mlx5_ib_get_uplink_ibdev(struct mlx5_eswitch *esw);
struct mlx5_eswitch_rep *mlx5_ib_vport_rep(struct mlx5_eswitch *esw,
int vport_index);
-void mlx5_ib_register_vport_reps(struct mlx5_ib_dev *dev);
-void mlx5_ib_unregister_vport_reps(struct mlx5_ib_dev *dev);
+void mlx5_ib_register_vport_reps(struct mlx5_core_dev *mdev);
+void mlx5_ib_unregister_vport_reps(struct mlx5_core_dev *mdev);
int create_flow_rule_vport_sq(struct mlx5_ib_dev *dev,
struct mlx5_ib_sq *sq);
struct net_device *mlx5_ib_get_rep_netdev(struct mlx5_eswitch *esw,
@@ -48,8 +50,8 @@ struct mlx5_eswitch_rep *mlx5_ib_vport_rep(struct mlx5_eswitch *esw,
return NULL;
}
-static inline void mlx5_ib_register_vport_reps(struct mlx5_ib_dev *dev) {}
-static inline void mlx5_ib_unregister_vport_reps(struct mlx5_ib_dev *dev) {}
+static inline void mlx5_ib_register_vport_reps(struct mlx5_core_dev *mdev) {}
+static inline void mlx5_ib_unregister_vport_reps(struct mlx5_core_dev *mdev) {}
static inline int create_flow_rule_vport_sq(struct mlx5_ib_dev *dev,
struct mlx5_ib_sq *sq)
{
diff --git a/drivers/infiniband/hw/mlx5/mad.c b/drivers/infiniband/hw/mlx5/mad.c
index 32a9e9228b13..6c529e6f3a01 100644
--- a/drivers/infiniband/hw/mlx5/mad.c
+++ b/drivers/infiniband/hw/mlx5/mad.c
@@ -36,6 +36,7 @@
#include <rdma/ib_smi.h>
#include <rdma/ib_pma.h>
#include "mlx5_ib.h"
+#include "cmd.h"
enum {
MLX5_IB_VENDOR_CLASS1 = 0x9,
@@ -51,9 +52,10 @@ static bool can_do_mad_ifc(struct mlx5_ib_dev *dev, u8 port_num,
return dev->mdev->port_caps[port_num - 1].has_smi;
}
-int mlx5_MAD_IFC(struct mlx5_ib_dev *dev, int ignore_mkey, int ignore_bkey,
- u8 port, const struct ib_wc *in_wc, const struct ib_grh *in_grh,
- const void *in_mad, void *response_mad)
+static int mlx5_MAD_IFC(struct mlx5_ib_dev *dev, int ignore_mkey,
+ int ignore_bkey, u8 port, const struct ib_wc *in_wc,
+ const struct ib_grh *in_grh, const void *in_mad,
+ void *response_mad)
{
u8 op_modifier = 0;
@@ -68,7 +70,8 @@ int mlx5_MAD_IFC(struct mlx5_ib_dev *dev, int ignore_mkey, int ignore_bkey,
if (ignore_bkey || !in_wc)
op_modifier |= 0x2;
- return mlx5_core_mad_ifc(dev->mdev, in_mad, response_mad, op_modifier, port);
+ return mlx5_cmd_mad_ifc(dev->mdev, in_mad, response_mad, op_modifier,
+ port);
}
static int process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num,
@@ -526,11 +529,6 @@ int mlx5_query_mad_ifc_port(struct ib_device *ibdev, u8 port,
int ext_active_speed;
int err = -ENOMEM;
- if (port < 1 || port > dev->num_ports) {
- mlx5_ib_warn(dev, "invalid port number %d\n", port);
- return -EINVAL;
- }
-
in_mad = kzalloc(sizeof(*in_mad), GFP_KERNEL);
out_mad = kmalloc(sizeof(*out_mad), GFP_KERNEL);
if (!in_mad || !out_mad)
@@ -568,6 +566,14 @@ int mlx5_query_mad_ifc_port(struct ib_device *ibdev, u8 port,
props->max_vl_num = out_mad->data[37] >> 4;
props->init_type_reply = out_mad->data[41] >> 4;
+ if (props->port_cap_flags & IB_PORT_CAP_MASK2_SUP) {
+ props->port_cap_flags2 =
+ be16_to_cpup((__be16 *)(out_mad->data + 60));
+
+ if (props->port_cap_flags2 & IB_PORT_LINK_WIDTH_2X_SUP)
+ props->active_width = out_mad->data[31] & 0x1f;
+ }
+
/* Check if extended speeds (EDR/FDR/...) are supported */
if (props->port_cap_flags & IB_PORT_EXTENDED_SPEEDS_SUP) {
ext_active_speed = out_mad->data[62] >> 4;
@@ -579,6 +585,11 @@ int mlx5_query_mad_ifc_port(struct ib_device *ibdev, u8 port,
case 2:
props->active_speed = 32; /* EDR */
break;
+ case 4:
+ if (props->port_cap_flags & IB_PORT_CAP_MASK2_SUP &&
+ props->port_cap_flags2 & IB_PORT_LINK_SPEED_HDR_SUP)
+ props->active_speed = IB_SPEED_HDR;
+ break;
}
}
diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index 3569fda07e07..d3dd290ae1b1 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -60,6 +60,7 @@
#include "mlx5_ib.h"
#include "ib_rep.h"
#include "cmd.h"
+#include "srq.h"
#include <linux/mlx5/fs_helpers.h>
#include <linux/mlx5/accel.h>
#include <rdma/uverbs_std_types.h>
@@ -82,10 +83,13 @@ static char mlx5_version[] =
struct mlx5_ib_event_work {
struct work_struct work;
- struct mlx5_core_dev *dev;
- void *context;
- enum mlx5_dev_event event;
- unsigned long param;
+ union {
+ struct mlx5_ib_dev *dev;
+ struct mlx5_ib_multiport_info *mpi;
+ };
+ bool is_slave;
+ unsigned int event;
+ void *param;
};
enum {
@@ -146,7 +150,7 @@ static int get_port_state(struct ib_device *ibdev,
int ret;
memset(&attr, 0, sizeof(attr));
- ret = ibdev->query_port(ibdev, port_num, &attr);
+ ret = ibdev->ops.query_port(ibdev, port_num, &attr);
if (!ret)
*state = attr.state;
return ret;
@@ -168,7 +172,6 @@ static int mlx5_netdev_event(struct notifier_block *this,
switch (event) {
case NETDEV_REGISTER:
- case NETDEV_UNREGISTER:
write_lock(&roce->netdev_lock);
if (ibdev->rep) {
struct mlx5_eswitch *esw = ibdev->mdev->priv.eswitch;
@@ -177,15 +180,20 @@ static int mlx5_netdev_event(struct notifier_block *this,
rep_ndev = mlx5_ib_get_rep_netdev(esw,
ibdev->rep->vport);
if (rep_ndev == ndev)
- roce->netdev = (event == NETDEV_UNREGISTER) ?
- NULL : ndev;
+ roce->netdev = ndev;
} else if (ndev->dev.parent == &mdev->pdev->dev) {
- roce->netdev = (event == NETDEV_UNREGISTER) ?
- NULL : ndev;
+ roce->netdev = ndev;
}
write_unlock(&roce->netdev_lock);
break;
+ case NETDEV_UNREGISTER:
+ write_lock(&roce->netdev_lock);
+ if (roce->netdev == ndev)
+ roce->netdev = NULL;
+ write_unlock(&roce->netdev_lock);
+ break;
+
case NETDEV_CHANGE:
case NETDEV_UP:
case NETDEV_DOWN: {
@@ -323,8 +331,8 @@ out:
spin_unlock(&port->mp.mpi_lock);
}
-static int translate_eth_proto_oper(u32 eth_proto_oper, u8 *active_speed,
- u8 *active_width)
+static int translate_eth_legacy_proto_oper(u32 eth_proto_oper, u8 *active_speed,
+ u8 *active_width)
{
switch (eth_proto_oper) {
case MLX5E_PROT_MASK(MLX5E_1000BASE_CX_SGMII):
@@ -381,10 +389,73 @@ static int translate_eth_proto_oper(u32 eth_proto_oper, u8 *active_speed,
return 0;
}
+static int translate_eth_ext_proto_oper(u32 eth_proto_oper, u8 *active_speed,
+ u8 *active_width)
+{
+ switch (eth_proto_oper) {
+ case MLX5E_PROT_MASK(MLX5E_SGMII_100M):
+ case MLX5E_PROT_MASK(MLX5E_1000BASE_X_SGMII):
+ *active_width = IB_WIDTH_1X;
+ *active_speed = IB_SPEED_SDR;
+ break;
+ case MLX5E_PROT_MASK(MLX5E_5GBASE_R):
+ *active_width = IB_WIDTH_1X;
+ *active_speed = IB_SPEED_DDR;
+ break;
+ case MLX5E_PROT_MASK(MLX5E_10GBASE_XFI_XAUI_1):
+ *active_width = IB_WIDTH_1X;
+ *active_speed = IB_SPEED_QDR;
+ break;
+ case MLX5E_PROT_MASK(MLX5E_40GBASE_XLAUI_4_XLPPI_4):
+ *active_width = IB_WIDTH_4X;
+ *active_speed = IB_SPEED_QDR;
+ break;
+ case MLX5E_PROT_MASK(MLX5E_25GAUI_1_25GBASE_CR_KR):
+ *active_width = IB_WIDTH_1X;
+ *active_speed = IB_SPEED_EDR;
+ break;
+ case MLX5E_PROT_MASK(MLX5E_50GAUI_2_LAUI_2_50GBASE_CR2_KR2):
+ *active_width = IB_WIDTH_2X;
+ *active_speed = IB_SPEED_EDR;
+ break;
+ case MLX5E_PROT_MASK(MLX5E_50GAUI_1_LAUI_1_50GBASE_CR_KR):
+ *active_width = IB_WIDTH_1X;
+ *active_speed = IB_SPEED_HDR;
+ break;
+ case MLX5E_PROT_MASK(MLX5E_CAUI_4_100GBASE_CR4_KR4):
+ *active_width = IB_WIDTH_4X;
+ *active_speed = IB_SPEED_EDR;
+ break;
+ case MLX5E_PROT_MASK(MLX5E_100GAUI_2_100GBASE_CR2_KR2):
+ *active_width = IB_WIDTH_2X;
+ *active_speed = IB_SPEED_HDR;
+ break;
+ case MLX5E_PROT_MASK(MLX5E_200GAUI_4_200GBASE_CR4_KR4):
+ *active_width = IB_WIDTH_4X;
+ *active_speed = IB_SPEED_HDR;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int translate_eth_proto_oper(u32 eth_proto_oper, u8 *active_speed,
+ u8 *active_width, bool ext)
+{
+ return ext ?
+ translate_eth_ext_proto_oper(eth_proto_oper, active_speed,
+ active_width) :
+ translate_eth_legacy_proto_oper(eth_proto_oper, active_speed,
+ active_width);
+}
+
static int mlx5_query_port_roce(struct ib_device *device, u8 port_num,
struct ib_port_attr *props)
{
struct mlx5_ib_dev *dev = to_mdev(device);
+ u32 out[MLX5_ST_SZ_DW(ptys_reg)] = {0};
struct mlx5_core_dev *mdev;
struct net_device *ndev, *upper;
enum ib_mtu ndev_ib_mtu;
@@ -392,6 +463,7 @@ static int mlx5_query_port_roce(struct ib_device *device, u8 port_num,
u16 qkey_viol_cntr;
u32 eth_prot_oper;
u8 mdev_port_num;
+ bool ext;
int err;
mdev = mlx5_ib_get_native_port_mdev(dev, port_num, &mdev_port_num);
@@ -408,16 +480,18 @@ static int mlx5_query_port_roce(struct ib_device *device, u8 port_num,
/* Possible bad flows are checked before filling out props so in case
* of an error it will still be zeroed out.
*/
- err = mlx5_query_port_eth_proto_oper(mdev, &eth_prot_oper,
- mdev_port_num);
+ err = mlx5_query_port_ptys(mdev, out, sizeof(out), MLX5_PTYS_EN,
+ mdev_port_num);
if (err)
goto out;
+ ext = MLX5_CAP_PCAM_FEATURE(dev->mdev, ptys_extended_ethernet);
+ eth_prot_oper = MLX5_GET_ETH_PROTO(ptys_reg, out, ext, eth_proto_oper);
props->active_width = IB_WIDTH_4X;
props->active_speed = IB_SPEED_QDR;
translate_eth_proto_oper(eth_prot_oper, &props->active_speed,
- &props->active_width);
+ &props->active_width, ext);
props->port_cap_flags |= IB_PORT_CM_SUP;
props->ip_gids = true;
@@ -441,7 +515,7 @@ static int mlx5_query_port_roce(struct ib_device *device, u8 port_num,
if (!ndev)
goto out;
- if (mlx5_lag_is_active(dev->mdev)) {
+ if (dev->lag_active) {
rcu_read_lock();
upper = netdev_master_upper_dev_get_rcu(ndev);
if (upper) {
@@ -468,24 +542,51 @@ out:
return err;
}
+struct mlx5_ib_vlan_info {
+ u16 vlan_id;
+ bool vlan;
+};
+
+static int get_lower_dev_vlan(struct net_device *lower_dev, void *data)
+{
+ struct mlx5_ib_vlan_info *vlan_info = data;
+
+ if (is_vlan_dev(lower_dev)) {
+ vlan_info->vlan = true;
+ vlan_info->vlan_id = vlan_dev_vlan_id(lower_dev);
+ }
+ /* We are interested only in first level vlan device, so
+ * always return 1 to stop iterating over next level devices.
+ */
+ return 1;
+}
+
static int set_roce_addr(struct mlx5_ib_dev *dev, u8 port_num,
unsigned int index, const union ib_gid *gid,
const struct ib_gid_attr *attr)
{
enum ib_gid_type gid_type = IB_GID_TYPE_IB;
+ struct mlx5_ib_vlan_info vlan_info = { };
u8 roce_version = 0;
u8 roce_l3_type = 0;
- bool vlan = false;
u8 mac[ETH_ALEN];
- u16 vlan_id = 0;
if (gid) {
gid_type = attr->gid_type;
ether_addr_copy(mac, attr->ndev->dev_addr);
if (is_vlan_dev(attr->ndev)) {
- vlan = true;
- vlan_id = vlan_dev_vlan_id(attr->ndev);
+ vlan_info.vlan = true;
+ vlan_info.vlan_id = vlan_dev_vlan_id(attr->ndev);
+ } else {
+ /* If the netdev is an upper device and its lower
+ * device is a vlan device, consider the vlan id of
+ * the lower vlan device for this gid entry.
+ */
+ rcu_read_lock();
+ netdev_walk_all_lower_dev_rcu(attr->ndev,
+ get_lower_dev_vlan, &vlan_info);
+ rcu_read_unlock();
}
}
@@ -506,8 +607,9 @@ static int set_roce_addr(struct mlx5_ib_dev *dev, u8 port_num,
}
return mlx5_core_roce_gid_set(dev->mdev, index, roce_version,
- roce_l3_type, gid->raw, mac, vlan,
- vlan_id, port_num);
+ roce_l3_type, gid->raw, mac,
+ vlan_info.vlan, vlan_info.vlan_id,
+ port_num);
}
static int mlx5_ib_add_gid(const struct ib_gid_attr *attr,
@@ -915,11 +1017,11 @@ static int mlx5_ib_query_device(struct ib_device *ibdev,
props->hca_core_clock = MLX5_CAP_GEN(mdev, device_frequency_khz);
props->timestamp_mask = 0x7FFFFFFFFFFFFFFFULL;
-#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
- if (MLX5_CAP_GEN(mdev, pg))
- props->device_cap_flags |= IB_DEVICE_ON_DEMAND_PAGING;
- props->odp_caps = dev->odp_caps;
-#endif
+ if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) {
+ if (MLX5_CAP_GEN(mdev, pg))
+ props->device_cap_flags |= IB_DEVICE_ON_DEMAND_PAGING;
+ props->odp_caps = dev->odp_caps;
+ }
if (MLX5_CAP_GEN(mdev, cd))
props->device_cap_flags |= IB_DEVICE_CROSS_CHANNEL;
@@ -1014,6 +1116,11 @@ static int mlx5_ib_query_device(struct ib_device *ibdev,
if (MLX5_CAP_GEN(mdev, cqe_128_always))
resp.flags |= MLX5_IB_QUERY_DEV_RESP_FLAGS_CQE_128B_PAD;
+ if (MLX5_CAP_GEN(mdev, qp_packet_based))
+ resp.flags |=
+ MLX5_IB_QUERY_DEV_RESP_PACKET_BASED_CREDIT_MODE;
+
+ resp.flags |= MLX5_IB_QUERY_DEV_RESP_FLAGS_SCAT2CQE_DCT;
}
if (field_avail(typeof(resp), sw_parsing_caps,
@@ -1101,6 +1208,8 @@ static void translate_active_width(struct ib_device *ibdev, u8 active_width,
if (active_width & MLX5_IB_WIDTH_1X)
*ib_width = IB_WIDTH_1X;
+ else if (active_width & MLX5_IB_WIDTH_2X)
+ *ib_width = IB_WIDTH_2X;
else if (active_width & MLX5_IB_WIDTH_4X)
*ib_width = IB_WIDTH_4X;
else if (active_width & MLX5_IB_WIDTH_8X)
@@ -1216,6 +1325,9 @@ static int mlx5_query_hca_port(struct ib_device *ibdev, u8 port,
props->subnet_timeout = rep->subnet_timeout;
props->init_type_reply = rep->init_type_reply;
+ if (props->port_cap_flags & IB_PORT_CAP_MASK2_SUP)
+ props->port_cap_flags2 = rep->cap_mask2;
+
err = mlx5_query_port_link_width_oper(mdev, &ib_link_width_oper, port);
if (err)
goto out;
@@ -1642,14 +1754,15 @@ static void mlx5_ib_dealloc_transport_domain(struct mlx5_ib_dev *dev, u32 tdn,
mlx5_ib_disable_lb(dev, true, false);
}
-static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev,
- struct ib_udata *udata)
+static int mlx5_ib_alloc_ucontext(struct ib_ucontext *uctx,
+ struct ib_udata *udata)
{
+ struct ib_device *ibdev = uctx->device;
struct mlx5_ib_dev *dev = to_mdev(ibdev);
struct mlx5_ib_alloc_ucontext_req_v2 req = {};
struct mlx5_ib_alloc_ucontext_resp resp = {};
struct mlx5_core_dev *mdev = dev->mdev;
- struct mlx5_ib_ucontext *context;
+ struct mlx5_ib_ucontext *context = to_mucontext(uctx);
struct mlx5_bfreg_info *bfregi;
int ver;
int err;
@@ -1659,29 +1772,29 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev,
bool lib_uar_4k;
if (!dev->ib_active)
- return ERR_PTR(-EAGAIN);
+ return -EAGAIN;
if (udata->inlen == sizeof(struct mlx5_ib_alloc_ucontext_req))
ver = 0;
else if (udata->inlen >= min_req_v2)
ver = 2;
else
- return ERR_PTR(-EINVAL);
+ return -EINVAL;
err = ib_copy_from_udata(&req, udata, min(udata->inlen, sizeof(req)));
if (err)
- return ERR_PTR(err);
+ return err;
if (req.flags & ~MLX5_IB_ALLOC_UCTX_DEVX)
- return ERR_PTR(-EOPNOTSUPP);
+ return -EOPNOTSUPP;
if (req.comp_mask || req.reserved0 || req.reserved1 || req.reserved2)
- return ERR_PTR(-EOPNOTSUPP);
+ return -EOPNOTSUPP;
req.total_num_bfregs = ALIGN(req.total_num_bfregs,
MLX5_NON_FP_BFREGS_PER_UAR);
if (req.num_low_latency_bfregs > req.total_num_bfregs - 1)
- return ERR_PTR(-EINVAL);
+ return -EINVAL;
resp.qp_tab_size = 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp);
if (mlx5_core_is_pf(dev->mdev) && MLX5_CAP_GEN(dev->mdev, bf))
@@ -1714,10 +1827,6 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev,
/* MLX5_USER_ALLOC_UCONTEXT_FLOW_ACTION_FLAGS_ESP_AES_GCM_FULL_OFFLOAD is currently always 0 */
}
- context = kzalloc(sizeof(*context), GFP_KERNEL);
- if (!context)
- return ERR_PTR(-ENOMEM);
-
lib_uar_4k = req.lib_caps & MLX5_LIB_CAP_4K_UAR;
bfregi = &context->bfregi;
@@ -1747,12 +1856,12 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev,
if (err)
goto out_sys_pages;
-#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
- context->ibucontext.invalidate_range = &mlx5_ib_invalidate_range;
-#endif
+ if (ibdev->attrs.device_cap_flags & IB_DEVICE_ON_DEMAND_PAGING)
+ context->ibucontext.invalidate_range =
+ &mlx5_ib_invalidate_range;
if (req.flags & MLX5_IB_ALLOC_UCTX_DEVX) {
- err = mlx5_ib_devx_create(dev);
+ err = mlx5_ib_devx_create(dev, true);
if (err < 0)
goto out_uars;
context->devx_uid = err;
@@ -1844,7 +1953,7 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev,
context->lib_caps = req.lib_caps;
print_lib_caps(dev, context->lib_caps);
- if (mlx5_lag_is_active(dev->mdev)) {
+ if (dev->lag_active) {
u8 port = mlx5_core_native_port_num(dev->mdev);
atomic_set(&context->tx_port_affinity,
@@ -1852,7 +1961,7 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev,
1, &dev->roce[port].tx_port_affinity));
}
- return &context->ibucontext;
+ return 0;
out_mdev:
mlx5_ib_dealloc_transport_domain(dev, context->tdn, context->devx_uid);
@@ -1870,23 +1979,19 @@ out_count:
kfree(bfregi->count);
out_ctx:
- kfree(context);
-
- return ERR_PTR(err);
+ return err;
}
-static int mlx5_ib_dealloc_ucontext(struct ib_ucontext *ibcontext)
+static void mlx5_ib_dealloc_ucontext(struct ib_ucontext *ibcontext)
{
struct mlx5_ib_ucontext *context = to_mucontext(ibcontext);
struct mlx5_ib_dev *dev = to_mdev(ibcontext->device);
struct mlx5_bfreg_info *bfregi;
-#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
/* All umem's must be destroyed before destroying the ucontext. */
mutex_lock(&ibcontext->per_mm_list_lock);
WARN_ON(!list_empty(&ibcontext->per_mm_list));
mutex_unlock(&ibcontext->per_mm_list_lock);
-#endif
bfregi = &context->bfregi;
mlx5_ib_dealloc_transport_domain(dev, context->tdn, context->devx_uid);
@@ -1897,9 +2002,6 @@ static int mlx5_ib_dealloc_ucontext(struct ib_ucontext *ibcontext)
deallocate_uars(dev, context);
kfree(bfregi->sys_pages);
kfree(bfregi->count);
- kfree(context);
-
- return 0;
}
static phys_addr_t uar_index2pfn(struct mlx5_ib_dev *dev,
@@ -1966,6 +2068,7 @@ static int mlx5_ib_mmap_clock_info_page(struct mlx5_ib_dev *dev,
if (vma->vm_flags & VM_WRITE)
return -EPERM;
+ vma->vm_flags &= ~VM_MAYWRITE;
if (!dev->mdev->clock_info_page)
return -EOPNOTSUPP;
@@ -2131,19 +2234,18 @@ static int mlx5_ib_mmap(struct ib_ucontext *ibcontext, struct vm_area_struct *vm
if (vma->vm_flags & VM_WRITE)
return -EPERM;
+ vma->vm_flags &= ~VM_MAYWRITE;
/* Don't expose to user-space information it shouldn't have */
if (PAGE_SIZE > 4096)
return -EOPNOTSUPP;
- vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
pfn = (dev->mdev->iseg_base +
offsetof(struct mlx5_init_seg, internal_timer_h)) >>
PAGE_SHIFT;
- if (io_remap_pfn_range(vma, vma->vm_start, pfn,
- PAGE_SIZE, vma->vm_page_prot))
- return -EAGAIN;
- break;
+ return rdma_user_mmap_io(&context->ibucontext, vma, pfn,
+ PAGE_SIZE,
+ pgprot_noncached(vma->vm_page_prot));
case MLX5_IB_MMAP_CLOCK_INFO:
return mlx5_ib_mmap_clock_info_page(dev, vma, context);
@@ -2238,30 +2340,24 @@ int mlx5_ib_dealloc_dm(struct ib_dm *ibdm)
return 0;
}
-static struct ib_pd *mlx5_ib_alloc_pd(struct ib_device *ibdev,
- struct ib_ucontext *context,
- struct ib_udata *udata)
+static int mlx5_ib_alloc_pd(struct ib_pd *ibpd, struct ib_ucontext *context,
+ struct ib_udata *udata)
{
+ struct mlx5_ib_pd *pd = to_mpd(ibpd);
+ struct ib_device *ibdev = ibpd->device;
struct mlx5_ib_alloc_pd_resp resp;
- struct mlx5_ib_pd *pd;
int err;
u32 out[MLX5_ST_SZ_DW(alloc_pd_out)] = {};
u32 in[MLX5_ST_SZ_DW(alloc_pd_in)] = {};
u16 uid = 0;
- pd = kmalloc(sizeof(*pd), GFP_KERNEL);
- if (!pd)
- return ERR_PTR(-ENOMEM);
-
uid = context ? to_mucontext(context)->devx_uid : 0;
MLX5_SET(alloc_pd_in, in, opcode, MLX5_CMD_OP_ALLOC_PD);
MLX5_SET(alloc_pd_in, in, uid, uid);
err = mlx5_cmd_exec(to_mdev(ibdev)->mdev, in, sizeof(in),
out, sizeof(out));
- if (err) {
- kfree(pd);
- return ERR_PTR(err);
- }
+ if (err)
+ return err;
pd->pdn = MLX5_GET(alloc_pd_out, out, pd);
pd->uid = uid;
@@ -2269,23 +2365,19 @@ static struct ib_pd *mlx5_ib_alloc_pd(struct ib_device *ibdev,
resp.pdn = pd->pdn;
if (ib_copy_to_udata(udata, &resp, sizeof(resp))) {
mlx5_cmd_dealloc_pd(to_mdev(ibdev)->mdev, pd->pdn, uid);
- kfree(pd);
- return ERR_PTR(-EFAULT);
+ return -EFAULT;
}
}
- return &pd->ibpd;
+ return 0;
}
-static int mlx5_ib_dealloc_pd(struct ib_pd *pd)
+static void mlx5_ib_dealloc_pd(struct ib_pd *pd)
{
struct mlx5_ib_dev *mdev = to_mdev(pd->device);
struct mlx5_ib_pd *mpd = to_mpd(pd);
mlx5_cmd_dealloc_pd(mdev->mdev, mpd->pdn, mpd->uid);
- kfree(mpd);
-
- return 0;
}
enum {
@@ -2319,10 +2411,29 @@ static u8 get_match_criteria_enable(u32 *match_criteria)
return match_criteria_enable;
}
-static void set_proto(void *outer_c, void *outer_v, u8 mask, u8 val)
+static int set_proto(void *outer_c, void *outer_v, u8 mask, u8 val)
{
- MLX5_SET(fte_match_set_lyr_2_4, outer_c, ip_protocol, mask);
- MLX5_SET(fte_match_set_lyr_2_4, outer_v, ip_protocol, val);
+ u8 entry_mask;
+ u8 entry_val;
+ int err = 0;
+
+ if (!mask)
+ goto out;
+
+ entry_mask = MLX5_GET(fte_match_set_lyr_2_4, outer_c,
+ ip_protocol);
+ entry_val = MLX5_GET(fte_match_set_lyr_2_4, outer_v,
+ ip_protocol);
+ if (!entry_mask) {
+ MLX5_SET(fte_match_set_lyr_2_4, outer_c, ip_protocol, mask);
+ MLX5_SET(fte_match_set_lyr_2_4, outer_v, ip_protocol, val);
+ goto out;
+ }
+ /* Don't override existing ip protocol */
+ if (mask != entry_mask || val != entry_val)
+ err = -EINVAL;
+out:
+ return err;
}
static void set_flow_label(void *misc_c, void *misc_v, u32 mask, u32 val,
@@ -2556,8 +2667,10 @@ static int parse_flow_attr(struct mlx5_core_dev *mdev, u32 *match_c,
set_tos(headers_c, headers_v,
ib_spec->ipv4.mask.tos, ib_spec->ipv4.val.tos);
- set_proto(headers_c, headers_v,
- ib_spec->ipv4.mask.proto, ib_spec->ipv4.val.proto);
+ if (set_proto(headers_c, headers_v,
+ ib_spec->ipv4.mask.proto,
+ ib_spec->ipv4.val.proto))
+ return -EINVAL;
break;
case IB_FLOW_SPEC_IPV6:
if (FIELDS_NOT_SUPPORTED(ib_spec->ipv6.mask, LAST_IPV6_FIELD))
@@ -2596,9 +2709,10 @@ static int parse_flow_attr(struct mlx5_core_dev *mdev, u32 *match_c,
ib_spec->ipv6.mask.traffic_class,
ib_spec->ipv6.val.traffic_class);
- set_proto(headers_c, headers_v,
- ib_spec->ipv6.mask.next_hdr,
- ib_spec->ipv6.val.next_hdr);
+ if (set_proto(headers_c, headers_v,
+ ib_spec->ipv6.mask.next_hdr,
+ ib_spec->ipv6.val.next_hdr))
+ return -EINVAL;
set_flow_label(misc_params_c, misc_params_v,
ntohl(ib_spec->ipv6.mask.flow_label),
@@ -2619,10 +2733,8 @@ static int parse_flow_attr(struct mlx5_core_dev *mdev, u32 *match_c,
LAST_TCP_UDP_FIELD))
return -EOPNOTSUPP;
- MLX5_SET(fte_match_set_lyr_2_4, headers_c, ip_protocol,
- 0xff);
- MLX5_SET(fte_match_set_lyr_2_4, headers_v, ip_protocol,
- IPPROTO_TCP);
+ if (set_proto(headers_c, headers_v, 0xff, IPPROTO_TCP))
+ return -EINVAL;
MLX5_SET(fte_match_set_lyr_2_4, headers_c, tcp_sport,
ntohs(ib_spec->tcp_udp.mask.src_port));
@@ -2639,10 +2751,8 @@ static int parse_flow_attr(struct mlx5_core_dev *mdev, u32 *match_c,
LAST_TCP_UDP_FIELD))
return -EOPNOTSUPP;
- MLX5_SET(fte_match_set_lyr_2_4, headers_c, ip_protocol,
- 0xff);
- MLX5_SET(fte_match_set_lyr_2_4, headers_v, ip_protocol,
- IPPROTO_UDP);
+ if (set_proto(headers_c, headers_v, 0xff, IPPROTO_UDP))
+ return -EINVAL;
MLX5_SET(fte_match_set_lyr_2_4, headers_c, udp_sport,
ntohs(ib_spec->tcp_udp.mask.src_port));
@@ -2658,6 +2768,9 @@ static int parse_flow_attr(struct mlx5_core_dev *mdev, u32 *match_c,
if (ib_spec->gre.mask.c_ks_res0_ver)
return -EOPNOTSUPP;
+ if (set_proto(headers_c, headers_v, 0xff, IPPROTO_GRE))
+ return -EINVAL;
+
MLX5_SET(fte_match_set_lyr_2_4, headers_c, ip_protocol,
0xff);
MLX5_SET(fte_match_set_lyr_2_4, headers_v, ip_protocol,
@@ -2669,11 +2782,11 @@ static int parse_flow_attr(struct mlx5_core_dev *mdev, u32 *match_c,
ntohs(ib_spec->gre.val.protocol));
memcpy(MLX5_ADDR_OF(fte_match_set_misc, misc_params_c,
- gre_key_h),
+ gre_key.nvgre.hi),
&ib_spec->gre.mask.key,
sizeof(ib_spec->gre.mask.key));
memcpy(MLX5_ADDR_OF(fte_match_set_misc, misc_params_v,
- gre_key_h),
+ gre_key.nvgre.hi),
&ib_spec->gre.val.key,
sizeof(ib_spec->gre.val.key));
break;
@@ -3706,7 +3819,8 @@ _create_raw_flow_rule(struct mlx5_ib_dev *dev,
struct mlx5_flow_destination *dst,
struct mlx5_ib_flow_matcher *fs_matcher,
struct mlx5_flow_act *flow_act,
- void *cmd_in, int inlen)
+ void *cmd_in, int inlen,
+ int dst_num)
{
struct mlx5_ib_flow_handler *handler;
struct mlx5_flow_spec *spec;
@@ -3728,7 +3842,7 @@ _create_raw_flow_rule(struct mlx5_ib_dev *dev,
spec->match_criteria_enable = fs_matcher->match_criteria_enable;
handler->rule = mlx5_add_flow_rules(ft, spec,
- flow_act, dst, 1);
+ flow_act, dst, dst_num);
if (IS_ERR(handler->rule)) {
err = PTR_ERR(handler->rule);
@@ -3791,12 +3905,14 @@ struct mlx5_ib_flow_handler *
mlx5_ib_raw_fs_rule_add(struct mlx5_ib_dev *dev,
struct mlx5_ib_flow_matcher *fs_matcher,
struct mlx5_flow_act *flow_act,
+ u32 counter_id,
void *cmd_in, int inlen, int dest_id,
int dest_type)
{
struct mlx5_flow_destination *dst;
struct mlx5_ib_flow_prio *ft_prio;
struct mlx5_ib_flow_handler *handler;
+ int dst_num = 0;
bool mcast;
int err;
@@ -3806,7 +3922,7 @@ mlx5_ib_raw_fs_rule_add(struct mlx5_ib_dev *dev,
if (fs_matcher->priority > MLX5_IB_FLOW_LAST_PRIO)
return ERR_PTR(-ENOMEM);
- dst = kzalloc(sizeof(*dst), GFP_KERNEL);
+ dst = kcalloc(2, sizeof(*dst), GFP_KERNEL);
if (!dst)
return ERR_PTR(-ENOMEM);
@@ -3820,20 +3936,28 @@ mlx5_ib_raw_fs_rule_add(struct mlx5_ib_dev *dev,
}
if (dest_type == MLX5_FLOW_DESTINATION_TYPE_TIR) {
- dst->type = dest_type;
- dst->tir_num = dest_id;
+ dst[dst_num].type = dest_type;
+ dst[dst_num].tir_num = dest_id;
flow_act->action |= MLX5_FLOW_CONTEXT_ACTION_FWD_DEST;
} else if (dest_type == MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE) {
- dst->type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE_NUM;
- dst->ft_num = dest_id;
+ dst[dst_num].type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE_NUM;
+ dst[dst_num].ft_num = dest_id;
flow_act->action |= MLX5_FLOW_CONTEXT_ACTION_FWD_DEST;
} else {
- dst->type = MLX5_FLOW_DESTINATION_TYPE_PORT;
+ dst[dst_num].type = MLX5_FLOW_DESTINATION_TYPE_PORT;
flow_act->action |= MLX5_FLOW_CONTEXT_ACTION_ALLOW;
}
+ dst_num++;
+
+ if (flow_act->action & MLX5_FLOW_CONTEXT_ACTION_COUNT) {
+ dst[dst_num].type = MLX5_FLOW_DESTINATION_TYPE_COUNTER;
+ dst[dst_num].counter_id = counter_id;
+ dst_num++;
+ }
+
handler = _create_raw_flow_rule(dev, ft_prio, dst, fs_matcher, flow_act,
- cmd_in, inlen);
+ cmd_in, inlen, dst_num);
if (IS_ERR(handler)) {
err = PTR_ERR(handler);
@@ -4079,7 +4203,7 @@ static ssize_t fw_pages_show(struct device *device,
struct device_attribute *attr, char *buf)
{
struct mlx5_ib_dev *dev =
- container_of(device, struct mlx5_ib_dev, ib_dev.dev);
+ rdma_device_to_drv_device(device, struct mlx5_ib_dev, ib_dev);
return sprintf(buf, "%d\n", dev->mdev->priv.fw_pages);
}
@@ -4089,7 +4213,7 @@ static ssize_t reg_pages_show(struct device *device,
struct device_attribute *attr, char *buf)
{
struct mlx5_ib_dev *dev =
- container_of(device, struct mlx5_ib_dev, ib_dev.dev);
+ rdma_device_to_drv_device(device, struct mlx5_ib_dev, ib_dev);
return sprintf(buf, "%d\n", atomic_read(&dev->mdev->priv.reg_pages));
}
@@ -4099,7 +4223,8 @@ static ssize_t hca_type_show(struct device *device,
struct device_attribute *attr, char *buf)
{
struct mlx5_ib_dev *dev =
- container_of(device, struct mlx5_ib_dev, ib_dev.dev);
+ rdma_device_to_drv_device(device, struct mlx5_ib_dev, ib_dev);
+
return sprintf(buf, "MT%d\n", dev->mdev->pdev->device);
}
static DEVICE_ATTR_RO(hca_type);
@@ -4108,7 +4233,8 @@ static ssize_t hw_rev_show(struct device *device,
struct device_attribute *attr, char *buf)
{
struct mlx5_ib_dev *dev =
- container_of(device, struct mlx5_ib_dev, ib_dev.dev);
+ rdma_device_to_drv_device(device, struct mlx5_ib_dev, ib_dev);
+
return sprintf(buf, "%x\n", dev->mdev->rev_id);
}
static DEVICE_ATTR_RO(hw_rev);
@@ -4117,7 +4243,8 @@ static ssize_t board_id_show(struct device *device,
struct device_attribute *attr, char *buf)
{
struct mlx5_ib_dev *dev =
- container_of(device, struct mlx5_ib_dev, ib_dev.dev);
+ rdma_device_to_drv_device(device, struct mlx5_ib_dev, ib_dev);
+
return sprintf(buf, "%.*s\n", MLX5_BOARD_ID_LEN,
dev->mdev->board_id);
}
@@ -4226,6 +4353,63 @@ static void delay_drop_handler(struct work_struct *work)
mutex_unlock(&delay_drop->lock);
}
+static void handle_general_event(struct mlx5_ib_dev *ibdev, struct mlx5_eqe *eqe,
+ struct ib_event *ibev)
+{
+ switch (eqe->sub_type) {
+ case MLX5_GENERAL_SUBTYPE_DELAY_DROP_TIMEOUT:
+ schedule_work(&ibdev->delay_drop.delay_drop_work);
+ break;
+ default: /* do nothing */
+ return;
+ }
+}
+
+static int handle_port_change(struct mlx5_ib_dev *ibdev, struct mlx5_eqe *eqe,
+ struct ib_event *ibev)
+{
+ u8 port = (eqe->data.port.port >> 4) & 0xf;
+
+ ibev->element.port_num = port;
+
+ switch (eqe->sub_type) {
+ case MLX5_PORT_CHANGE_SUBTYPE_ACTIVE:
+ case MLX5_PORT_CHANGE_SUBTYPE_DOWN:
+ case MLX5_PORT_CHANGE_SUBTYPE_INITIALIZED:
+ /* In RoCE, port up/down events are handled in
+ * mlx5_netdev_event().
+ */
+ if (mlx5_ib_port_link_layer(&ibdev->ib_dev, port) ==
+ IB_LINK_LAYER_ETHERNET)
+ return -EINVAL;
+
+ ibev->event = (eqe->sub_type == MLX5_PORT_CHANGE_SUBTYPE_ACTIVE) ?
+ IB_EVENT_PORT_ACTIVE : IB_EVENT_PORT_ERR;
+ break;
+
+ case MLX5_PORT_CHANGE_SUBTYPE_LID:
+ ibev->event = IB_EVENT_LID_CHANGE;
+ break;
+
+ case MLX5_PORT_CHANGE_SUBTYPE_PKEY:
+ ibev->event = IB_EVENT_PKEY_CHANGE;
+ schedule_work(&ibdev->devr.ports[port - 1].pkey_change_work);
+ break;
+
+ case MLX5_PORT_CHANGE_SUBTYPE_GUID:
+ ibev->event = IB_EVENT_GID_CHANGE;
+ break;
+
+ case MLX5_PORT_CHANGE_SUBTYPE_CLIENT_REREG:
+ ibev->event = IB_EVENT_CLIENT_REREGISTER;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
static void mlx5_ib_handle_event(struct work_struct *_work)
{
struct mlx5_ib_event_work *work =
@@ -4233,65 +4417,37 @@ static void mlx5_ib_handle_event(struct work_struct *_work)
struct mlx5_ib_dev *ibdev;
struct ib_event ibev;
bool fatal = false;
- u8 port = (u8)work->param;
- if (mlx5_core_is_mp_slave(work->dev)) {
- ibdev = mlx5_ib_get_ibdev_from_mpi(work->context);
+ if (work->is_slave) {
+ ibdev = mlx5_ib_get_ibdev_from_mpi(work->mpi);
if (!ibdev)
goto out;
} else {
- ibdev = work->context;
+ ibdev = work->dev;
}
switch (work->event) {
case MLX5_DEV_EVENT_SYS_ERROR:
ibev.event = IB_EVENT_DEVICE_FATAL;
mlx5_ib_handle_internal_error(ibdev);
+ ibev.element.port_num = (u8)(unsigned long)work->param;
fatal = true;
break;
-
- case MLX5_DEV_EVENT_PORT_UP:
- case MLX5_DEV_EVENT_PORT_DOWN:
- case MLX5_DEV_EVENT_PORT_INITIALIZED:
- /* In RoCE, port up/down events are handled in
- * mlx5_netdev_event().
- */
- if (mlx5_ib_port_link_layer(&ibdev->ib_dev, port) ==
- IB_LINK_LAYER_ETHERNET)
+ case MLX5_EVENT_TYPE_PORT_CHANGE:
+ if (handle_port_change(ibdev, work->param, &ibev))
goto out;
-
- ibev.event = (work->event == MLX5_DEV_EVENT_PORT_UP) ?
- IB_EVENT_PORT_ACTIVE : IB_EVENT_PORT_ERR;
- break;
-
- case MLX5_DEV_EVENT_LID_CHANGE:
- ibev.event = IB_EVENT_LID_CHANGE;
- break;
-
- case MLX5_DEV_EVENT_PKEY_CHANGE:
- ibev.event = IB_EVENT_PKEY_CHANGE;
- schedule_work(&ibdev->devr.ports[port - 1].pkey_change_work);
- break;
-
- case MLX5_DEV_EVENT_GUID_CHANGE:
- ibev.event = IB_EVENT_GID_CHANGE;
- break;
-
- case MLX5_DEV_EVENT_CLIENT_REREG:
- ibev.event = IB_EVENT_CLIENT_REREGISTER;
break;
- case MLX5_DEV_EVENT_DELAY_DROP_TIMEOUT:
- schedule_work(&ibdev->delay_drop.delay_drop_work);
- goto out;
+ case MLX5_EVENT_TYPE_GENERAL_EVENT:
+ handle_general_event(ibdev, work->param, &ibev);
+ /* fall through */
default:
goto out;
}
- ibev.device = &ibdev->ib_dev;
- ibev.element.port_num = port;
+ ibev.device = &ibdev->ib_dev;
- if (!rdma_is_port_valid(&ibdev->ib_dev, port)) {
- mlx5_ib_warn(ibdev, "warning: event on port %d\n", port);
+ if (!rdma_is_port_valid(&ibdev->ib_dev, ibev.element.port_num)) {
+ mlx5_ib_warn(ibdev, "warning: event on port %d\n", ibev.element.port_num);
goto out;
}
@@ -4304,22 +4460,43 @@ out:
kfree(work);
}
-static void mlx5_ib_event(struct mlx5_core_dev *dev, void *context,
- enum mlx5_dev_event event, unsigned long param)
+static int mlx5_ib_event(struct notifier_block *nb,
+ unsigned long event, void *param)
{
struct mlx5_ib_event_work *work;
work = kmalloc(sizeof(*work), GFP_ATOMIC);
if (!work)
- return;
+ return NOTIFY_DONE;
INIT_WORK(&work->work, mlx5_ib_handle_event);
- work->dev = dev;
+ work->dev = container_of(nb, struct mlx5_ib_dev, mdev_events);
+ work->is_slave = false;
work->param = param;
- work->context = context;
work->event = event;
queue_work(mlx5_ib_event_wq, &work->work);
+
+ return NOTIFY_OK;
+}
+
+static int mlx5_ib_event_slave_port(struct notifier_block *nb,
+ unsigned long event, void *param)
+{
+ struct mlx5_ib_event_work *work;
+
+ work = kmalloc(sizeof(*work), GFP_ATOMIC);
+ if (!work)
+ return NOTIFY_DONE;
+
+ INIT_WORK(&work->work, mlx5_ib_handle_event);
+ work->mpi = container_of(nb, struct mlx5_ib_multiport_info, mdev_events);
+ work->is_slave = true;
+ work->param = param;
+ work->event = event;
+ queue_work(mlx5_ib_event_wq, &work->work);
+
+ return NOTIFY_OK;
}
static int set_has_smi_cap(struct mlx5_ib_dev *dev)
@@ -4553,23 +4730,28 @@ static int create_dev_resources(struct mlx5_ib_resources *devr)
{
struct ib_srq_init_attr attr;
struct mlx5_ib_dev *dev;
+ struct ib_device *ibdev;
struct ib_cq_init_attr cq_attr = {.cqe = 1};
int port;
int ret = 0;
dev = container_of(devr, struct mlx5_ib_dev, devr);
+ ibdev = &dev->ib_dev;
mutex_init(&devr->mutex);
- devr->p0 = mlx5_ib_alloc_pd(&dev->ib_dev, NULL, NULL);
- if (IS_ERR(devr->p0)) {
- ret = PTR_ERR(devr->p0);
- goto error0;
- }
- devr->p0->device = &dev->ib_dev;
+ devr->p0 = rdma_zalloc_drv_obj(ibdev, ib_pd);
+ if (!devr->p0)
+ return -ENOMEM;
+
+ devr->p0->device = ibdev;
devr->p0->uobject = NULL;
atomic_set(&devr->p0->usecnt, 0);
+ ret = mlx5_ib_alloc_pd(devr->p0, NULL, NULL);
+ if (ret)
+ goto error0;
+
devr->c0 = mlx5_ib_create_cq(&dev->ib_dev, &cq_attr, NULL, NULL);
if (IS_ERR(devr->c0)) {
ret = PTR_ERR(devr->c0);
@@ -4667,6 +4849,7 @@ error2:
error1:
mlx5_ib_dealloc_pd(devr->p0);
error0:
+ kfree(devr->p0);
return ret;
}
@@ -4682,6 +4865,7 @@ static void destroy_dev_resources(struct mlx5_ib_resources *devr)
mlx5_ib_dealloc_xrcd(devr->x1);
mlx5_ib_destroy_cq(devr->c0);
mlx5_ib_dealloc_pd(devr->p0);
+ kfree(devr->p0);
/* Make sure no change P_Key work items are still executing */
for (port = 0; port < dev->num_ports; ++port)
@@ -4787,7 +4971,7 @@ static int mlx5_eth_lag_init(struct mlx5_ib_dev *dev)
struct mlx5_flow_table *ft;
int err;
- if (!ns || !mlx5_lag_is_active(mdev))
+ if (!ns || !mlx5_lag_is_roce(mdev))
return 0;
err = mlx5_cmd_create_vport_lag(mdev);
@@ -4801,6 +4985,7 @@ static int mlx5_eth_lag_init(struct mlx5_ib_dev *dev)
}
dev->flow_db->lag_demux_ft = ft;
+ dev->lag_active = true;
return 0;
err_destroy_vport_lag:
@@ -4812,7 +4997,9 @@ static void mlx5_eth_lag_cleanup(struct mlx5_ib_dev *dev)
{
struct mlx5_core_dev *mdev = dev->mdev;
- if (dev->flow_db->lag_demux_ft) {
+ if (dev->lag_active) {
+ dev->lag_active = false;
+
mlx5_destroy_flow_table(dev->flow_db->lag_demux_ft);
dev->flow_db->lag_demux_ft = NULL;
@@ -5038,6 +5225,9 @@ static int mlx5_ib_alloc_counters(struct mlx5_ib_dev *dev)
{
int err = 0;
int i;
+ bool is_shared;
+
+ is_shared = MLX5_CAP_GEN(dev->mdev, log_max_uctx) != 0;
for (i = 0; i < dev->num_ports; i++) {
err = __mlx5_ib_alloc_counters(dev, &dev->port[i].cnts);
@@ -5047,8 +5237,10 @@ static int mlx5_ib_alloc_counters(struct mlx5_ib_dev *dev)
mlx5_ib_fill_counters(dev, dev->port[i].cnts.names,
dev->port[i].cnts.offsets);
- err = mlx5_core_alloc_q_counter(dev->mdev,
- &dev->port[i].cnts.set_id);
+ err = mlx5_cmd_alloc_q_counter(dev->mdev,
+ &dev->port[i].cnts.set_id,
+ is_shared ?
+ MLX5_SHARED_RESOURCE_UID : 0);
if (err) {
mlx5_ib_warn(dev,
"couldn't allocate queue counter for port %d, err %d\n",
@@ -5325,14 +5517,6 @@ static void init_delay_drop(struct mlx5_ib_dev *dev)
mlx5_ib_warn(dev, "Failed to init delay drop debugfs\n");
}
-static const struct cpumask *
-mlx5_ib_get_vector_affinity(struct ib_device *ibdev, int comp_vector)
-{
- struct mlx5_ib_dev *dev = to_mdev(ibdev);
-
- return mlx5_get_vector_affinity_hint(dev->mdev, comp_vector);
-}
-
/* The mlx5_ib_multiport_mutex should be held when calling this function */
static void mlx5_ib_unbind_slave_port(struct mlx5_ib_dev *ibdev,
struct mlx5_ib_multiport_info *mpi)
@@ -5350,6 +5534,11 @@ static void mlx5_ib_unbind_slave_port(struct mlx5_ib_dev *ibdev,
spin_unlock(&port->mp.mpi_lock);
return;
}
+
+ if (mpi->mdev_events.notifier_call)
+ mlx5_notifier_unregister(mpi->mdev, &mpi->mdev_events);
+ mpi->mdev_events.notifier_call = NULL;
+
mpi->ibdev = NULL;
spin_unlock(&port->mp.mpi_lock);
@@ -5405,6 +5594,7 @@ static bool mlx5_ib_bind_slave_port(struct mlx5_ib_dev *ibdev,
ibdev->port[port_num].mp.mpi = mpi;
mpi->ibdev = ibdev;
+ mpi->mdev_events.notifier_call = NULL;
spin_unlock(&ibdev->port[port_num].mp.mpi_lock);
err = mlx5_nic_vport_affiliate_multiport(ibdev->mdev, mpi->mdev);
@@ -5422,9 +5612,10 @@ static bool mlx5_ib_bind_slave_port(struct mlx5_ib_dev *ibdev,
goto unbind;
}
- err = mlx5_ib_init_cong_debugfs(ibdev, port_num);
- if (err)
- goto unbind;
+ mpi->mdev_events.notifier_call = mlx5_ib_event_slave_port;
+ mlx5_notifier_register(mpi->mdev, &mpi->mdev_events);
+
+ mlx5_ib_init_cong_debugfs(ibdev, port_num);
return true;
@@ -5551,30 +5742,17 @@ ADD_UVERBS_ATTRIBUTES_SIMPLE(
UVERBS_ATTR_FLAGS_IN(MLX5_IB_ATTR_CREATE_FLOW_ACTION_FLAGS,
enum mlx5_ib_uapi_flow_action_flags));
-static int populate_specs_root(struct mlx5_ib_dev *dev)
-{
- const struct uverbs_object_tree_def **trees = dev->driver_trees;
- size_t num_trees = 0;
-
- if (mlx5_accel_ipsec_device_caps(dev->mdev) &
- MLX5_ACCEL_IPSEC_CAP_DEVICE)
- trees[num_trees++] = &mlx5_ib_flow_action;
-
- if (MLX5_CAP_DEV_MEM(dev->mdev, memic))
- trees[num_trees++] = &mlx5_ib_dm;
-
- if (MLX5_CAP_GEN_64(dev->mdev, general_obj_types) &
- MLX5_GENERAL_OBJ_TYPES_CAP_UCTX)
- trees[num_trees++] = mlx5_ib_get_devx_tree();
-
- num_trees += mlx5_ib_get_flow_trees(trees + num_trees);
-
- WARN_ON(num_trees >= ARRAY_SIZE(dev->driver_trees));
- trees[num_trees] = NULL;
- dev->ib_dev.driver_specs = trees;
+static const struct uapi_definition mlx5_ib_defs[] = {
+#if IS_ENABLED(CONFIG_INFINIBAND_USER_ACCESS)
+ UAPI_DEF_CHAIN(mlx5_ib_devx_defs),
+ UAPI_DEF_CHAIN(mlx5_ib_flow_defs),
+#endif
- return 0;
-}
+ UAPI_DEF_CHAIN_OBJ_TREE(UVERBS_OBJECT_FLOW_ACTION,
+ &mlx5_ib_flow_action),
+ UAPI_DEF_CHAIN_OBJ_TREE(UVERBS_OBJECT_DM, &mlx5_ib_dm),
+ {}
+};
static int mlx5_ib_read_counters(struct ib_counters *counters,
struct ib_counters_read_attr *read_attr,
@@ -5649,9 +5827,10 @@ static struct ib_counters *mlx5_ib_create_counters(struct ib_device *device,
void mlx5_ib_stage_init_cleanup(struct mlx5_ib_dev *dev)
{
mlx5_ib_cleanup_multiport_master(dev);
-#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
- cleanup_srcu_struct(&dev->mr_srcu);
-#endif
+ if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) {
+ srcu_barrier(&dev->mr_srcu);
+ cleanup_srcu_struct(&dev->mr_srcu);
+ }
kfree(dev->port);
}
@@ -5694,8 +5873,7 @@ int mlx5_ib_stage_init_init(struct mlx5_ib_dev *dev)
dev->ib_dev.node_type = RDMA_NODE_IB_CA;
dev->ib_dev.local_dma_lkey = 0 /* not supported for now */;
dev->ib_dev.phys_port_cnt = dev->num_ports;
- dev->ib_dev.num_comp_vectors =
- dev->mdev->priv.eq_table.num_comp_vectors;
+ dev->ib_dev.num_comp_vectors = mlx5_comp_vectors_count(mdev);
dev->ib_dev.dev.parent = &mdev->pdev->dev;
mutex_init(&dev->cap_mask_mutex);
@@ -5705,11 +5883,11 @@ int mlx5_ib_stage_init_init(struct mlx5_ib_dev *dev)
spin_lock_init(&dev->memic.memic_lock);
dev->memic.dev = mdev;
-#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
- err = init_srcu_struct(&dev->mr_srcu);
- if (err)
- goto err_free_port;
-#endif
+ if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) {
+ err = init_srcu_struct(&dev->mr_srcu);
+ if (err)
+ goto err_mp;
+ }
return 0;
err_mp:
@@ -5752,6 +5930,96 @@ static void mlx5_ib_stage_flow_db_cleanup(struct mlx5_ib_dev *dev)
kfree(dev->flow_db);
}
+static const struct ib_device_ops mlx5_ib_dev_ops = {
+ .add_gid = mlx5_ib_add_gid,
+ .alloc_mr = mlx5_ib_alloc_mr,
+ .alloc_pd = mlx5_ib_alloc_pd,
+ .alloc_ucontext = mlx5_ib_alloc_ucontext,
+ .attach_mcast = mlx5_ib_mcg_attach,
+ .check_mr_status = mlx5_ib_check_mr_status,
+ .create_ah = mlx5_ib_create_ah,
+ .create_counters = mlx5_ib_create_counters,
+ .create_cq = mlx5_ib_create_cq,
+ .create_flow = mlx5_ib_create_flow,
+ .create_qp = mlx5_ib_create_qp,
+ .create_srq = mlx5_ib_create_srq,
+ .dealloc_pd = mlx5_ib_dealloc_pd,
+ .dealloc_ucontext = mlx5_ib_dealloc_ucontext,
+ .del_gid = mlx5_ib_del_gid,
+ .dereg_mr = mlx5_ib_dereg_mr,
+ .destroy_ah = mlx5_ib_destroy_ah,
+ .destroy_counters = mlx5_ib_destroy_counters,
+ .destroy_cq = mlx5_ib_destroy_cq,
+ .destroy_flow = mlx5_ib_destroy_flow,
+ .destroy_flow_action = mlx5_ib_destroy_flow_action,
+ .destroy_qp = mlx5_ib_destroy_qp,
+ .destroy_srq = mlx5_ib_destroy_srq,
+ .detach_mcast = mlx5_ib_mcg_detach,
+ .disassociate_ucontext = mlx5_ib_disassociate_ucontext,
+ .drain_rq = mlx5_ib_drain_rq,
+ .drain_sq = mlx5_ib_drain_sq,
+ .get_dev_fw_str = get_dev_fw_str,
+ .get_dma_mr = mlx5_ib_get_dma_mr,
+ .get_link_layer = mlx5_ib_port_link_layer,
+ .map_mr_sg = mlx5_ib_map_mr_sg,
+ .mmap = mlx5_ib_mmap,
+ .modify_cq = mlx5_ib_modify_cq,
+ .modify_device = mlx5_ib_modify_device,
+ .modify_port = mlx5_ib_modify_port,
+ .modify_qp = mlx5_ib_modify_qp,
+ .modify_srq = mlx5_ib_modify_srq,
+ .poll_cq = mlx5_ib_poll_cq,
+ .post_recv = mlx5_ib_post_recv,
+ .post_send = mlx5_ib_post_send,
+ .post_srq_recv = mlx5_ib_post_srq_recv,
+ .process_mad = mlx5_ib_process_mad,
+ .query_ah = mlx5_ib_query_ah,
+ .query_device = mlx5_ib_query_device,
+ .query_gid = mlx5_ib_query_gid,
+ .query_pkey = mlx5_ib_query_pkey,
+ .query_qp = mlx5_ib_query_qp,
+ .query_srq = mlx5_ib_query_srq,
+ .read_counters = mlx5_ib_read_counters,
+ .reg_user_mr = mlx5_ib_reg_user_mr,
+ .req_notify_cq = mlx5_ib_arm_cq,
+ .rereg_user_mr = mlx5_ib_rereg_user_mr,
+ .resize_cq = mlx5_ib_resize_cq,
+ INIT_RDMA_OBJ_SIZE(ib_pd, mlx5_ib_pd, ibpd),
+ INIT_RDMA_OBJ_SIZE(ib_ucontext, mlx5_ib_ucontext, ibucontext),
+};
+
+static const struct ib_device_ops mlx5_ib_dev_flow_ipsec_ops = {
+ .create_flow_action_esp = mlx5_ib_create_flow_action_esp,
+ .modify_flow_action_esp = mlx5_ib_modify_flow_action_esp,
+};
+
+static const struct ib_device_ops mlx5_ib_dev_ipoib_enhanced_ops = {
+ .rdma_netdev_get_params = mlx5_ib_rn_get_params,
+};
+
+static const struct ib_device_ops mlx5_ib_dev_sriov_ops = {
+ .get_vf_config = mlx5_ib_get_vf_config,
+ .get_vf_stats = mlx5_ib_get_vf_stats,
+ .set_vf_guid = mlx5_ib_set_vf_guid,
+ .set_vf_link_state = mlx5_ib_set_vf_link_state,
+};
+
+static const struct ib_device_ops mlx5_ib_dev_mw_ops = {
+ .alloc_mw = mlx5_ib_alloc_mw,
+ .dealloc_mw = mlx5_ib_dealloc_mw,
+};
+
+static const struct ib_device_ops mlx5_ib_dev_xrc_ops = {
+ .alloc_xrcd = mlx5_ib_alloc_xrcd,
+ .dealloc_xrcd = mlx5_ib_dealloc_xrcd,
+};
+
+static const struct ib_device_ops mlx5_ib_dev_dm_ops = {
+ .alloc_dm = mlx5_ib_alloc_dm,
+ .dealloc_dm = mlx5_ib_dealloc_dm,
+ .reg_dm_mr = mlx5_ib_reg_dm_mr,
+};
+
int mlx5_ib_stage_caps_init(struct mlx5_ib_dev *dev)
{
struct mlx5_core_dev *mdev = dev->mdev;
@@ -5790,104 +6058,45 @@ int mlx5_ib_stage_caps_init(struct mlx5_ib_dev *dev)
(1ull << IB_USER_VERBS_EX_CMD_CREATE_CQ) |
(1ull << IB_USER_VERBS_EX_CMD_CREATE_QP) |
(1ull << IB_USER_VERBS_EX_CMD_MODIFY_QP) |
- (1ull << IB_USER_VERBS_EX_CMD_MODIFY_CQ);
-
- dev->ib_dev.query_device = mlx5_ib_query_device;
- dev->ib_dev.get_link_layer = mlx5_ib_port_link_layer;
- dev->ib_dev.query_gid = mlx5_ib_query_gid;
- dev->ib_dev.add_gid = mlx5_ib_add_gid;
- dev->ib_dev.del_gid = mlx5_ib_del_gid;
- dev->ib_dev.query_pkey = mlx5_ib_query_pkey;
- dev->ib_dev.modify_device = mlx5_ib_modify_device;
- dev->ib_dev.modify_port = mlx5_ib_modify_port;
- dev->ib_dev.alloc_ucontext = mlx5_ib_alloc_ucontext;
- dev->ib_dev.dealloc_ucontext = mlx5_ib_dealloc_ucontext;
- dev->ib_dev.mmap = mlx5_ib_mmap;
- dev->ib_dev.alloc_pd = mlx5_ib_alloc_pd;
- dev->ib_dev.dealloc_pd = mlx5_ib_dealloc_pd;
- dev->ib_dev.create_ah = mlx5_ib_create_ah;
- dev->ib_dev.query_ah = mlx5_ib_query_ah;
- dev->ib_dev.destroy_ah = mlx5_ib_destroy_ah;
- dev->ib_dev.create_srq = mlx5_ib_create_srq;
- dev->ib_dev.modify_srq = mlx5_ib_modify_srq;
- dev->ib_dev.query_srq = mlx5_ib_query_srq;
- dev->ib_dev.destroy_srq = mlx5_ib_destroy_srq;
- dev->ib_dev.post_srq_recv = mlx5_ib_post_srq_recv;
- dev->ib_dev.create_qp = mlx5_ib_create_qp;
- dev->ib_dev.modify_qp = mlx5_ib_modify_qp;
- dev->ib_dev.query_qp = mlx5_ib_query_qp;
- dev->ib_dev.destroy_qp = mlx5_ib_destroy_qp;
- dev->ib_dev.drain_sq = mlx5_ib_drain_sq;
- dev->ib_dev.drain_rq = mlx5_ib_drain_rq;
- dev->ib_dev.post_send = mlx5_ib_post_send;
- dev->ib_dev.post_recv = mlx5_ib_post_recv;
- dev->ib_dev.create_cq = mlx5_ib_create_cq;
- dev->ib_dev.modify_cq = mlx5_ib_modify_cq;
- dev->ib_dev.resize_cq = mlx5_ib_resize_cq;
- dev->ib_dev.destroy_cq = mlx5_ib_destroy_cq;
- dev->ib_dev.poll_cq = mlx5_ib_poll_cq;
- dev->ib_dev.req_notify_cq = mlx5_ib_arm_cq;
- dev->ib_dev.get_dma_mr = mlx5_ib_get_dma_mr;
- dev->ib_dev.reg_user_mr = mlx5_ib_reg_user_mr;
- dev->ib_dev.rereg_user_mr = mlx5_ib_rereg_user_mr;
- dev->ib_dev.dereg_mr = mlx5_ib_dereg_mr;
- dev->ib_dev.attach_mcast = mlx5_ib_mcg_attach;
- dev->ib_dev.detach_mcast = mlx5_ib_mcg_detach;
- dev->ib_dev.process_mad = mlx5_ib_process_mad;
- dev->ib_dev.alloc_mr = mlx5_ib_alloc_mr;
- dev->ib_dev.map_mr_sg = mlx5_ib_map_mr_sg;
- dev->ib_dev.check_mr_status = mlx5_ib_check_mr_status;
- dev->ib_dev.get_dev_fw_str = get_dev_fw_str;
- dev->ib_dev.get_vector_affinity = mlx5_ib_get_vector_affinity;
+ (1ull << IB_USER_VERBS_EX_CMD_MODIFY_CQ) |
+ (1ull << IB_USER_VERBS_EX_CMD_CREATE_FLOW) |
+ (1ull << IB_USER_VERBS_EX_CMD_DESTROY_FLOW);
+
if (MLX5_CAP_GEN(mdev, ipoib_enhanced_offloads) &&
IS_ENABLED(CONFIG_MLX5_CORE_IPOIB))
- dev->ib_dev.rdma_netdev_get_params = mlx5_ib_rn_get_params;
-
- if (mlx5_core_is_pf(mdev)) {
- dev->ib_dev.get_vf_config = mlx5_ib_get_vf_config;
- dev->ib_dev.set_vf_link_state = mlx5_ib_set_vf_link_state;
- dev->ib_dev.get_vf_stats = mlx5_ib_get_vf_stats;
- dev->ib_dev.set_vf_guid = mlx5_ib_set_vf_guid;
- }
+ ib_set_device_ops(&dev->ib_dev,
+ &mlx5_ib_dev_ipoib_enhanced_ops);
- dev->ib_dev.disassociate_ucontext = mlx5_ib_disassociate_ucontext;
+ if (mlx5_core_is_pf(mdev))
+ ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_sriov_ops);
dev->umr_fence = mlx5_get_umr_fence(MLX5_CAP_GEN(mdev, umr_fence));
if (MLX5_CAP_GEN(mdev, imaicl)) {
- dev->ib_dev.alloc_mw = mlx5_ib_alloc_mw;
- dev->ib_dev.dealloc_mw = mlx5_ib_dealloc_mw;
dev->ib_dev.uverbs_cmd_mask |=
(1ull << IB_USER_VERBS_CMD_ALLOC_MW) |
(1ull << IB_USER_VERBS_CMD_DEALLOC_MW);
+ ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_mw_ops);
}
if (MLX5_CAP_GEN(mdev, xrc)) {
- dev->ib_dev.alloc_xrcd = mlx5_ib_alloc_xrcd;
- dev->ib_dev.dealloc_xrcd = mlx5_ib_dealloc_xrcd;
dev->ib_dev.uverbs_cmd_mask |=
(1ull << IB_USER_VERBS_CMD_OPEN_XRCD) |
(1ull << IB_USER_VERBS_CMD_CLOSE_XRCD);
+ ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_xrc_ops);
}
- if (MLX5_CAP_DEV_MEM(mdev, memic)) {
- dev->ib_dev.alloc_dm = mlx5_ib_alloc_dm;
- dev->ib_dev.dealloc_dm = mlx5_ib_dealloc_dm;
- dev->ib_dev.reg_dm_mr = mlx5_ib_reg_dm_mr;
- }
+ if (MLX5_CAP_DEV_MEM(mdev, memic))
+ ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_dm_ops);
- dev->ib_dev.create_flow = mlx5_ib_create_flow;
- dev->ib_dev.destroy_flow = mlx5_ib_destroy_flow;
- dev->ib_dev.uverbs_ex_cmd_mask |=
- (1ull << IB_USER_VERBS_EX_CMD_CREATE_FLOW) |
- (1ull << IB_USER_VERBS_EX_CMD_DESTROY_FLOW);
- dev->ib_dev.create_flow_action_esp = mlx5_ib_create_flow_action_esp;
- dev->ib_dev.destroy_flow_action = mlx5_ib_destroy_flow_action;
- dev->ib_dev.modify_flow_action_esp = mlx5_ib_modify_flow_action_esp;
+ if (mlx5_accel_ipsec_device_caps(dev->mdev) &
+ MLX5_ACCEL_IPSEC_CAP_DEVICE)
+ ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_flow_ipsec_ops);
dev->ib_dev.driver_id = RDMA_DRIVER_MLX5;
- dev->ib_dev.create_counters = mlx5_ib_create_counters;
- dev->ib_dev.destroy_counters = mlx5_ib_destroy_counters;
- dev->ib_dev.read_counters = mlx5_ib_read_counters;
+ ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_ops);
+
+ if (IS_ENABLED(CONFIG_INFINIBAND_USER_ACCESS))
+ dev->ib_dev.driver_def = mlx5_ib_defs;
err = init_node_data(dev);
if (err)
@@ -5901,22 +6110,37 @@ int mlx5_ib_stage_caps_init(struct mlx5_ib_dev *dev)
return 0;
}
+static const struct ib_device_ops mlx5_ib_dev_port_ops = {
+ .get_port_immutable = mlx5_port_immutable,
+ .query_port = mlx5_ib_query_port,
+};
+
static int mlx5_ib_stage_non_default_cb(struct mlx5_ib_dev *dev)
{
- dev->ib_dev.get_port_immutable = mlx5_port_immutable;
- dev->ib_dev.query_port = mlx5_ib_query_port;
-
+ ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_port_ops);
return 0;
}
+static const struct ib_device_ops mlx5_ib_dev_port_rep_ops = {
+ .get_port_immutable = mlx5_port_rep_immutable,
+ .query_port = mlx5_ib_rep_query_port,
+};
+
int mlx5_ib_stage_rep_non_default_cb(struct mlx5_ib_dev *dev)
{
- dev->ib_dev.get_port_immutable = mlx5_port_rep_immutable;
- dev->ib_dev.query_port = mlx5_ib_rep_query_port;
-
+ ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_port_rep_ops);
return 0;
}
+static const struct ib_device_ops mlx5_ib_dev_common_roce_ops = {
+ .create_rwq_ind_table = mlx5_ib_create_rwq_ind_table,
+ .create_wq = mlx5_ib_create_wq,
+ .destroy_rwq_ind_table = mlx5_ib_destroy_rwq_ind_table,
+ .destroy_wq = mlx5_ib_destroy_wq,
+ .get_netdev = mlx5_ib_get_netdev,
+ .modify_wq = mlx5_ib_modify_wq,
+};
+
static int mlx5_ib_stage_common_roce_init(struct mlx5_ib_dev *dev)
{
u8 port_num;
@@ -5928,19 +6152,13 @@ static int mlx5_ib_stage_common_roce_init(struct mlx5_ib_dev *dev)
dev->roce[i].last_port_state = IB_PORT_DOWN;
}
- dev->ib_dev.get_netdev = mlx5_ib_get_netdev;
- dev->ib_dev.create_wq = mlx5_ib_create_wq;
- dev->ib_dev.modify_wq = mlx5_ib_modify_wq;
- dev->ib_dev.destroy_wq = mlx5_ib_destroy_wq;
- dev->ib_dev.create_rwq_ind_table = mlx5_ib_create_rwq_ind_table;
- dev->ib_dev.destroy_rwq_ind_table = mlx5_ib_destroy_rwq_ind_table;
-
dev->ib_dev.uverbs_ex_cmd_mask |=
(1ull << IB_USER_VERBS_EX_CMD_CREATE_WQ) |
(1ull << IB_USER_VERBS_EX_CMD_MODIFY_WQ) |
(1ull << IB_USER_VERBS_EX_CMD_DESTROY_WQ) |
(1ull << IB_USER_VERBS_EX_CMD_CREATE_RWQ_IND_TBL) |
(1ull << IB_USER_VERBS_EX_CMD_DESTROY_RWQ_IND_TBL);
+ ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_common_roce_ops);
port_num = mlx5_core_native_port_num(dev->mdev) - 1;
@@ -6034,11 +6252,20 @@ static int mlx5_ib_stage_odp_init(struct mlx5_ib_dev *dev)
return mlx5_ib_odp_init_one(dev);
}
+static void mlx5_ib_stage_odp_cleanup(struct mlx5_ib_dev *dev)
+{
+ mlx5_ib_odp_cleanup_one(dev);
+}
+
+static const struct ib_device_ops mlx5_ib_dev_hw_stats_ops = {
+ .alloc_hw_stats = mlx5_ib_alloc_hw_stats,
+ .get_hw_stats = mlx5_ib_get_hw_stats,
+};
+
int mlx5_ib_stage_counters_init(struct mlx5_ib_dev *dev)
{
if (MLX5_CAP_GEN(dev->mdev, max_qp_cnt)) {
- dev->ib_dev.get_hw_stats = mlx5_ib_get_hw_stats;
- dev->ib_dev.alloc_hw_stats = mlx5_ib_alloc_hw_stats;
+ ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_hw_stats_ops);
return mlx5_ib_alloc_counters(dev);
}
@@ -6054,8 +6281,9 @@ void mlx5_ib_stage_counters_cleanup(struct mlx5_ib_dev *dev)
static int mlx5_ib_stage_cong_debugfs_init(struct mlx5_ib_dev *dev)
{
- return mlx5_ib_init_cong_debugfs(dev,
- mlx5_core_native_port_num(dev->mdev) - 1);
+ mlx5_ib_init_cong_debugfs(dev,
+ mlx5_core_native_port_num(dev->mdev) - 1);
+ return 0;
}
static void mlx5_ib_stage_cong_debugfs_cleanup(struct mlx5_ib_dev *dev)
@@ -6096,21 +6324,16 @@ void mlx5_ib_stage_bfrag_cleanup(struct mlx5_ib_dev *dev)
mlx5_free_bfreg(dev->mdev, &dev->bfreg);
}
-static int mlx5_ib_stage_populate_specs(struct mlx5_ib_dev *dev)
-{
- return populate_specs_root(dev);
-}
-
int mlx5_ib_stage_ib_reg_init(struct mlx5_ib_dev *dev)
{
const char *name;
rdma_set_device_sysfs_group(&dev->ib_dev, &mlx5_attr_group);
- if (!mlx5_lag_is_active(dev->mdev))
+ if (!mlx5_lag_is_roce(dev->mdev))
name = "mlx5_%d";
else
name = "mlx5_bond_%d";
- return ib_register_device(&dev->ib_dev, name, NULL);
+ return ib_register_device(&dev->ib_dev, name);
}
void mlx5_ib_stage_pre_ib_reg_umr_cleanup(struct mlx5_ib_dev *dev)
@@ -6140,16 +6363,32 @@ static void mlx5_ib_stage_delay_drop_cleanup(struct mlx5_ib_dev *dev)
cancel_delay_drop(dev);
}
-static int mlx5_ib_stage_rep_reg_init(struct mlx5_ib_dev *dev)
+static int mlx5_ib_stage_dev_notifier_init(struct mlx5_ib_dev *dev)
{
- mlx5_ib_register_vport_reps(dev);
-
+ dev->mdev_events.notifier_call = mlx5_ib_event;
+ mlx5_notifier_register(dev->mdev, &dev->mdev_events);
return 0;
}
-static void mlx5_ib_stage_rep_reg_cleanup(struct mlx5_ib_dev *dev)
+static void mlx5_ib_stage_dev_notifier_cleanup(struct mlx5_ib_dev *dev)
{
- mlx5_ib_unregister_vport_reps(dev);
+ mlx5_notifier_unregister(dev->mdev, &dev->mdev_events);
+}
+
+static int mlx5_ib_stage_devx_init(struct mlx5_ib_dev *dev)
+{
+ int uid;
+
+ uid = mlx5_ib_devx_create(dev, false);
+ if (uid > 0)
+ dev->devx_whitelist_uid = uid;
+
+ return 0;
+}
+static void mlx5_ib_stage_devx_cleanup(struct mlx5_ib_dev *dev)
+{
+ if (dev->devx_whitelist_uid)
+ mlx5_ib_devx_destroy(dev, dev->devx_whitelist_uid);
}
void __mlx5_ib_remove(struct mlx5_ib_dev *dev,
@@ -6162,10 +6401,6 @@ void __mlx5_ib_remove(struct mlx5_ib_dev *dev,
if (profile->stage[stage].cleanup)
profile->stage[stage].cleanup(dev);
}
-
- if (dev->devx_whitelist_uid)
- mlx5_ib_devx_destroy(dev, dev->devx_whitelist_uid);
- ib_dealloc_device((struct ib_device *)dev);
}
void *__mlx5_ib_add(struct mlx5_ib_dev *dev,
@@ -6173,7 +6408,6 @@ void *__mlx5_ib_add(struct mlx5_ib_dev *dev,
{
int err;
int i;
- int uid;
for (i = 0; i < MLX5_IB_STAGE_MAX; i++) {
if (profile->stage[i].init) {
@@ -6183,10 +6417,6 @@ void *__mlx5_ib_add(struct mlx5_ib_dev *dev,
}
}
- uid = mlx5_ib_devx_create(dev);
- if (uid > 0)
- dev->devx_whitelist_uid = uid;
-
dev->profile = profile;
dev->ib_active = true;
@@ -6214,12 +6444,18 @@ static const struct mlx5_ib_profile pf_profile = {
STAGE_CREATE(MLX5_IB_STAGE_ROCE,
mlx5_ib_stage_roce_init,
mlx5_ib_stage_roce_cleanup),
+ STAGE_CREATE(MLX5_IB_STAGE_SRQ,
+ mlx5_init_srq_table,
+ mlx5_cleanup_srq_table),
STAGE_CREATE(MLX5_IB_STAGE_DEVICE_RESOURCES,
mlx5_ib_stage_dev_res_init,
mlx5_ib_stage_dev_res_cleanup),
+ STAGE_CREATE(MLX5_IB_STAGE_DEVICE_NOTIFIER,
+ mlx5_ib_stage_dev_notifier_init,
+ mlx5_ib_stage_dev_notifier_cleanup),
STAGE_CREATE(MLX5_IB_STAGE_ODP,
mlx5_ib_stage_odp_init,
- NULL),
+ mlx5_ib_stage_odp_cleanup),
STAGE_CREATE(MLX5_IB_STAGE_COUNTERS,
mlx5_ib_stage_counters_init,
mlx5_ib_stage_counters_cleanup),
@@ -6235,9 +6471,9 @@ static const struct mlx5_ib_profile pf_profile = {
STAGE_CREATE(MLX5_IB_STAGE_PRE_IB_REG_UMR,
NULL,
mlx5_ib_stage_pre_ib_reg_umr_cleanup),
- STAGE_CREATE(MLX5_IB_STAGE_SPECS,
- mlx5_ib_stage_populate_specs,
- NULL),
+ STAGE_CREATE(MLX5_IB_STAGE_WHITELIST_UID,
+ mlx5_ib_stage_devx_init,
+ mlx5_ib_stage_devx_cleanup),
STAGE_CREATE(MLX5_IB_STAGE_IB_REG,
mlx5_ib_stage_ib_reg_init,
mlx5_ib_stage_ib_reg_cleanup),
@@ -6249,7 +6485,7 @@ static const struct mlx5_ib_profile pf_profile = {
mlx5_ib_stage_delay_drop_cleanup),
};
-static const struct mlx5_ib_profile nic_rep_profile = {
+const struct mlx5_ib_profile uplink_rep_profile = {
STAGE_CREATE(MLX5_IB_STAGE_INIT,
mlx5_ib_stage_init_init,
mlx5_ib_stage_init_cleanup),
@@ -6265,9 +6501,15 @@ static const struct mlx5_ib_profile nic_rep_profile = {
STAGE_CREATE(MLX5_IB_STAGE_ROCE,
mlx5_ib_stage_rep_roce_init,
mlx5_ib_stage_rep_roce_cleanup),
+ STAGE_CREATE(MLX5_IB_STAGE_SRQ,
+ mlx5_init_srq_table,
+ mlx5_cleanup_srq_table),
STAGE_CREATE(MLX5_IB_STAGE_DEVICE_RESOURCES,
mlx5_ib_stage_dev_res_init,
mlx5_ib_stage_dev_res_cleanup),
+ STAGE_CREATE(MLX5_IB_STAGE_DEVICE_NOTIFIER,
+ mlx5_ib_stage_dev_notifier_init,
+ mlx5_ib_stage_dev_notifier_cleanup),
STAGE_CREATE(MLX5_IB_STAGE_COUNTERS,
mlx5_ib_stage_counters_init,
mlx5_ib_stage_counters_cleanup),
@@ -6280,18 +6522,12 @@ static const struct mlx5_ib_profile nic_rep_profile = {
STAGE_CREATE(MLX5_IB_STAGE_PRE_IB_REG_UMR,
NULL,
mlx5_ib_stage_pre_ib_reg_umr_cleanup),
- STAGE_CREATE(MLX5_IB_STAGE_SPECS,
- mlx5_ib_stage_populate_specs,
- NULL),
STAGE_CREATE(MLX5_IB_STAGE_IB_REG,
mlx5_ib_stage_ib_reg_init,
mlx5_ib_stage_ib_reg_cleanup),
STAGE_CREATE(MLX5_IB_STAGE_POST_IB_REG_UMR,
mlx5_ib_stage_post_ib_reg_umr_init,
NULL),
- STAGE_CREATE(MLX5_IB_STAGE_REP_REG,
- mlx5_ib_stage_rep_reg_init,
- mlx5_ib_stage_rep_reg_cleanup),
};
static void *mlx5_ib_add_slave_port(struct mlx5_core_dev *mdev)
@@ -6342,13 +6578,19 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev)
printk_once(KERN_INFO "%s", mlx5_version);
+ if (MLX5_ESWITCH_MANAGER(mdev) &&
+ mlx5_ib_eswitch_mode(mdev->priv.eswitch) == SRIOV_OFFLOADS) {
+ mlx5_ib_register_vport_reps(mdev);
+ return mdev;
+ }
+
port_type_cap = MLX5_CAP_GEN(mdev, port_type);
ll = mlx5_port_type_cap_to_rdma_ll(port_type_cap);
if (mlx5_core_is_mp_slave(mdev) && ll == IB_LINK_LAYER_ETHERNET)
return mlx5_ib_add_slave_port(mdev);
- dev = (struct mlx5_ib_dev *)ib_alloc_device(sizeof(*dev));
+ dev = ib_alloc_device(mlx5_ib_dev, ib_dev);
if (!dev)
return NULL;
@@ -6356,13 +6598,6 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev)
dev->num_ports = max(MLX5_CAP_GEN(mdev, num_ports),
MLX5_CAP_GEN(mdev, num_vhca_ports));
- if (MLX5_ESWITCH_MANAGER(mdev) &&
- mlx5_ib_eswitch_mode(mdev->priv.eswitch) == SRIOV_OFFLOADS) {
- dev->rep = mlx5_ib_vport_rep(mdev->priv.eswitch, 0);
-
- return __mlx5_ib_add(dev, &nic_rep_profile);
- }
-
return __mlx5_ib_add(dev, &pf_profile);
}
@@ -6371,6 +6606,11 @@ static void mlx5_ib_remove(struct mlx5_core_dev *mdev, void *context)
struct mlx5_ib_multiport_info *mpi;
struct mlx5_ib_dev *dev;
+ if (MLX5_ESWITCH_MANAGER(mdev) && context == mdev) {
+ mlx5_ib_unregister_vport_reps(mdev);
+ return;
+ }
+
if (mlx5_core_is_mp_slave(mdev)) {
mpi = context;
mutex_lock(&mlx5_ib_multiport_mutex);
@@ -6383,15 +6623,13 @@ static void mlx5_ib_remove(struct mlx5_core_dev *mdev, void *context)
dev = context;
__mlx5_ib_remove(dev, dev->profile, MLX5_IB_STAGE_MAX);
+
+ ib_dealloc_device((struct ib_device *)dev);
}
static struct mlx5_interface mlx5_ib_interface = {
.add = mlx5_ib_add,
.remove = mlx5_ib_remove,
- .event = mlx5_ib_event,
-#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
- .pfault = mlx5_ib_pfault,
-#endif
.protocol = MLX5_INTERFACE_PROTOCOL_IB,
};
diff --git a/drivers/infiniband/hw/mlx5/mem.c b/drivers/infiniband/hw/mlx5/mem.c
index 549234988bb4..9f90be296ee0 100644
--- a/drivers/infiniband/hw/mlx5/mem.c
+++ b/drivers/infiniband/hw/mlx5/mem.c
@@ -111,7 +111,6 @@ void mlx5_ib_cont_pages(struct ib_umem *umem, u64 addr,
*count = i;
}
-#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
static u64 umem_dma_to_mtt(dma_addr_t umem_dma)
{
u64 mtt_entry = umem_dma & ODP_DMA_ADDR_MASK;
@@ -123,7 +122,6 @@ static u64 umem_dma_to_mtt(dma_addr_t umem_dma)
return mtt_entry;
}
-#endif
/*
* Populate the given array with bus addresses from the umem.
@@ -151,7 +149,7 @@ void __mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem,
int len;
struct scatterlist *sg;
int entry;
-#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
+
if (umem->is_odp) {
WARN_ON(shift != 0);
WARN_ON(access_flags != (MLX5_IB_MTT_READ | MLX5_IB_MTT_WRITE));
@@ -164,7 +162,6 @@ void __mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem,
}
return;
}
-#endif
i = 0;
for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) {
diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index b651a7a6fde9..4a617d78eae1 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -36,13 +36,12 @@
#include <linux/kernel.h>
#include <linux/sched.h>
#include <rdma/ib_verbs.h>
+#include <rdma/ib_umem.h>
#include <rdma/ib_smi.h>
#include <linux/mlx5/driver.h>
#include <linux/mlx5/cq.h>
#include <linux/mlx5/fs.h>
#include <linux/mlx5/qp.h>
-#include <linux/mlx5/srq.h>
-#include <linux/mlx5/fs.h>
#include <linux/types.h>
#include <linux/mlx5/transobj.h>
#include <rdma/ib_user_verbs.h>
@@ -50,6 +49,8 @@
#include <rdma/uverbs_ioctl.h>
#include <rdma/mlx5_user_ioctl_cmds.h>
+#include "srq.h"
+
#define mlx5_ib_dbg(_dev, format, arg...) \
dev_dbg(&(_dev)->ib_dev.dev, "%s:%d:(pid %d): " format, __func__, \
__LINE__, current->pid, ##arg)
@@ -257,6 +258,7 @@ enum mlx5_ib_rq_flags {
};
struct mlx5_ib_wq {
+ struct mlx5_frag_buf_ctrl fbc;
u64 *wrid;
u32 *wr_data;
struct wr_list *w_list;
@@ -274,8 +276,7 @@ struct mlx5_ib_wq {
unsigned head;
unsigned tail;
u16 cur_post;
- u16 last_poll;
- void *qend;
+ void *cur_edge;
};
enum mlx5_ib_wq_flags {
@@ -460,6 +461,7 @@ enum mlx5_ib_qp_flags {
MLX5_IB_QP_UNDERLAY = 1 << 10,
MLX5_IB_QP_PCI_WRITE_END_PADDING = 1 << 11,
MLX5_IB_QP_TUNNEL_OFFLOAD = 1 << 12,
+ MLX5_IB_QP_PACKET_BASED_CREDIT = 1 << 13,
};
struct mlx5_umr_wr {
@@ -523,6 +525,7 @@ struct mlx5_ib_srq {
struct mlx5_core_srq msrq;
struct mlx5_frag_buf buf;
struct mlx5_db db;
+ struct mlx5_frag_buf_ctrl fbc;
u64 *wrid;
/* protect SRQ hanlding
*/
@@ -540,7 +543,6 @@ struct mlx5_ib_srq {
struct mlx5_ib_xrcd {
struct ib_xrcd ibxrcd;
u32 xrcdn;
- u16 uid;
};
enum mlx5_ib_mtt_access_flags {
@@ -586,14 +588,28 @@ struct mlx5_ib_mr {
struct mlx5_ib_mr *parent;
atomic_t num_leaf_free;
wait_queue_head_t q_leaf_free;
+ struct mlx5_async_work cb_work;
+ atomic_t num_pending_prefetch;
};
+static inline bool is_odp_mr(struct mlx5_ib_mr *mr)
+{
+ return IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING) && mr->umem &&
+ mr->umem->is_odp;
+}
+
struct mlx5_ib_mw {
struct ib_mw ibmw;
struct mlx5_core_mkey mmkey;
int ndescs;
};
+struct mlx5_ib_devx_mr {
+ struct mlx5_core_mkey mmkey;
+ int ndescs;
+ struct rcu_head rcu;
+};
+
struct mlx5_ib_umr_context {
struct ib_cqe cqe;
enum ib_wc_status status;
@@ -622,7 +638,6 @@ struct mlx5_cache_ent {
spinlock_t lock;
- struct dentry *dir;
char name[4];
u32 order;
u32 xlt;
@@ -634,11 +649,6 @@ struct mlx5_cache_ent {
u32 miss;
u32 limit;
- struct dentry *fsize;
- struct dentry *fcur;
- struct dentry *fmiss;
- struct dentry *flimit;
-
struct mlx5_ib_dev *dev;
struct work_struct work;
struct delayed_work dwork;
@@ -774,19 +784,20 @@ enum mlx5_ib_stages {
MLX5_IB_STAGE_CAPS,
MLX5_IB_STAGE_NON_DEFAULT_CB,
MLX5_IB_STAGE_ROCE,
+ MLX5_IB_STAGE_SRQ,
MLX5_IB_STAGE_DEVICE_RESOURCES,
+ MLX5_IB_STAGE_DEVICE_NOTIFIER,
MLX5_IB_STAGE_ODP,
MLX5_IB_STAGE_COUNTERS,
MLX5_IB_STAGE_CONG_DEBUGFS,
MLX5_IB_STAGE_UAR,
MLX5_IB_STAGE_BFREG,
MLX5_IB_STAGE_PRE_IB_REG_UMR,
- MLX5_IB_STAGE_SPECS,
+ MLX5_IB_STAGE_WHITELIST_UID,
MLX5_IB_STAGE_IB_REG,
MLX5_IB_STAGE_POST_IB_REG_UMR,
MLX5_IB_STAGE_DELAY_DROP,
MLX5_IB_STAGE_CLASS_ATTR,
- MLX5_IB_STAGE_REP_REG,
MLX5_IB_STAGE_MAX,
};
@@ -806,6 +817,7 @@ struct mlx5_ib_multiport_info {
struct list_head list;
struct mlx5_ib_dev *ibdev;
struct mlx5_core_dev *mdev;
+ struct notifier_block mdev_events;
struct completion unref_comp;
u64 sys_image_guid;
u32 mdev_refcnt;
@@ -880,10 +892,19 @@ struct mlx5_ib_lb_state {
bool enabled;
};
+struct mlx5_ib_pf_eq {
+ struct mlx5_ib_dev *dev;
+ struct mlx5_eq *core;
+ struct work_struct work;
+ spinlock_t lock; /* Pagefaults spinlock */
+ struct workqueue_struct *wq;
+ mempool_t *pool;
+};
+
struct mlx5_ib_dev {
struct ib_device ib_dev;
- const struct uverbs_object_tree_def *driver_trees[7];
struct mlx5_core_dev *mdev;
+ struct notifier_block mdev_events;
struct mlx5_roce roce[MLX5_MAX_PORTS];
int num_ports;
/* serialize update of capability mask
@@ -899,16 +920,16 @@ struct mlx5_ib_dev {
/* Prevents soft lock on massive reg MRs */
struct mutex slow_path_mutex;
int fill_delay;
-#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
struct ib_odp_caps odp_caps;
u64 odp_max_size;
+ struct mlx5_ib_pf_eq odp_pf_eq;
+
/*
* Sleepable RCU that prevents destruction of MRs while they are still
* being used by a page fault handler.
*/
struct srcu_struct mr_srcu;
u32 null_mkey;
-#endif
struct mlx5_ib_flow_db *flow_db;
/* protect resources needed as part of reset flow */
spinlock_t reset_flow_resource_lock;
@@ -920,6 +941,7 @@ struct mlx5_ib_dev {
struct mlx5_ib_delay_drop delay_drop;
const struct mlx5_ib_profile *profile;
struct mlx5_eswitch_rep *rep;
+ int lag_active;
struct mlx5_ib_lb_state lb;
u8 umr_fence;
@@ -927,6 +949,8 @@ struct mlx5_ib_dev {
u64 sys_image_guid;
struct mlx5_memic memic;
u16 devx_whitelist_uid;
+ struct mlx5_srq_table srq_table;
+ struct mlx5_async_ctx async_ctx;
};
static inline struct mlx5_ib_cq *to_mibcq(struct mlx5_core_cq *mcq)
@@ -1015,19 +1039,17 @@ to_mflow_act(struct ib_flow_action *ibact)
return container_of(ibact, struct mlx5_ib_flow_action, ib_action);
}
-int mlx5_ib_db_map_user(struct mlx5_ib_ucontext *context, unsigned long virt,
+int mlx5_ib_db_map_user(struct mlx5_ib_ucontext *context,
+ struct ib_udata *udata, unsigned long virt,
struct mlx5_db *db);
void mlx5_ib_db_unmap_user(struct mlx5_ib_ucontext *context, struct mlx5_db *db);
void __mlx5_ib_cq_clean(struct mlx5_ib_cq *cq, u32 qpn, struct mlx5_ib_srq *srq);
void mlx5_ib_cq_clean(struct mlx5_ib_cq *cq, u32 qpn, struct mlx5_ib_srq *srq);
void mlx5_ib_free_srq_wqe(struct mlx5_ib_srq *srq, int wqe_index);
-int mlx5_MAD_IFC(struct mlx5_ib_dev *dev, int ignore_mkey, int ignore_bkey,
- u8 port, const struct ib_wc *in_wc, const struct ib_grh *in_grh,
- const void *in_mad, void *response_mad);
struct ib_ah *mlx5_ib_create_ah(struct ib_pd *pd, struct rdma_ah_attr *ah_attr,
- struct ib_udata *udata);
+ u32 flags, struct ib_udata *udata);
int mlx5_ib_query_ah(struct ib_ah *ibah, struct rdma_ah_attr *ah_attr);
-int mlx5_ib_destroy_ah(struct ib_ah *ah);
+int mlx5_ib_destroy_ah(struct ib_ah *ah, u32 flags);
struct ib_srq *mlx5_ib_create_srq(struct ib_pd *pd,
struct ib_srq_init_attr *init_attr,
struct ib_udata *udata);
@@ -1053,10 +1075,12 @@ int mlx5_ib_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr,
const struct ib_send_wr **bad_wr);
int mlx5_ib_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr,
const struct ib_recv_wr **bad_wr);
-void *mlx5_get_send_wqe(struct mlx5_ib_qp *qp, int n);
-int mlx5_ib_read_user_wqe(struct mlx5_ib_qp *qp, int send, int wqe_index,
- void *buffer, u32 length,
- struct mlx5_ib_qp_base *base);
+int mlx5_ib_read_user_wqe_sq(struct mlx5_ib_qp *qp, int wqe_index, void *buffer,
+ int buflen, size_t *bc);
+int mlx5_ib_read_user_wqe_rq(struct mlx5_ib_qp *qp, int wqe_index, void *buffer,
+ int buflen, size_t *bc);
+int mlx5_ib_read_user_wqe_srq(struct mlx5_ib_srq *srq, int wqe_index,
+ void *buffer, int buflen, size_t *bc);
struct ib_cq *mlx5_ib_create_cq(struct ib_device *ibdev,
const struct ib_cq_init_attr *attr,
struct ib_ucontext *context,
@@ -1070,12 +1094,19 @@ struct ib_mr *mlx5_ib_get_dma_mr(struct ib_pd *pd, int acc);
struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
u64 virt_addr, int access_flags,
struct ib_udata *udata);
+int mlx5_ib_advise_mr(struct ib_pd *pd,
+ enum ib_uverbs_advise_mr_advice advice,
+ u32 flags,
+ struct ib_sge *sg_list,
+ u32 num_sge,
+ struct uverbs_attr_bundle *attrs);
struct ib_mw *mlx5_ib_alloc_mw(struct ib_pd *pd, enum ib_mw_type type,
struct ib_udata *udata);
int mlx5_ib_dealloc_mw(struct ib_mw *mw);
int mlx5_ib_update_xlt(struct mlx5_ib_mr *mr, u64 idx, int npages,
int page_shift, int flags);
struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd,
+ struct ib_udata *udata,
int access_flags);
void mlx5_ib_free_implicit_mr(struct mlx5_ib_mr *mr);
int mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start,
@@ -1158,9 +1189,8 @@ struct ib_mr *mlx5_ib_reg_dm_mr(struct ib_pd *pd, struct ib_dm *dm,
#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
void mlx5_ib_internal_fill_odp_caps(struct mlx5_ib_dev *dev);
-void mlx5_ib_pfault(struct mlx5_core_dev *mdev, void *context,
- struct mlx5_pagefault *pfault);
int mlx5_ib_odp_init_one(struct mlx5_ib_dev *ibdev);
+void mlx5_ib_odp_cleanup_one(struct mlx5_ib_dev *ibdev);
int __init mlx5_ib_odp_init(void);
void mlx5_ib_odp_cleanup(void);
void mlx5_ib_invalidate_range(struct ib_umem_odp *umem_odp, unsigned long start,
@@ -1168,6 +1198,10 @@ void mlx5_ib_invalidate_range(struct ib_umem_odp *umem_odp, unsigned long start,
void mlx5_odp_init_mr_cache_entry(struct mlx5_cache_ent *ent);
void mlx5_odp_populate_klm(struct mlx5_klm *pklm, size_t offset,
size_t nentries, struct mlx5_ib_mr *mr, int flags);
+
+int mlx5_ib_advise_mr_prefetch(struct ib_pd *pd,
+ enum ib_uverbs_advise_mr_advice advice,
+ u32 flags, struct ib_sge *sg_list, u32 num_sge);
#else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */
static inline void mlx5_ib_internal_fill_odp_caps(struct mlx5_ib_dev *dev)
{
@@ -1175,6 +1209,7 @@ static inline void mlx5_ib_internal_fill_odp_caps(struct mlx5_ib_dev *dev)
}
static inline int mlx5_ib_odp_init_one(struct mlx5_ib_dev *ibdev) { return 0; }
+static inline void mlx5_ib_odp_cleanup_one(struct mlx5_ib_dev *ibdev) {}
static inline int mlx5_ib_odp_init(void) { return 0; }
static inline void mlx5_ib_odp_cleanup(void) {}
static inline void mlx5_odp_init_mr_cache_entry(struct mlx5_cache_ent *ent) {}
@@ -1182,6 +1217,16 @@ static inline void mlx5_odp_populate_klm(struct mlx5_klm *pklm, size_t offset,
size_t nentries, struct mlx5_ib_mr *mr,
int flags) {}
+static inline int
+mlx5_ib_advise_mr_prefetch(struct ib_pd *pd,
+ enum ib_uverbs_advise_mr_advice advice, u32 flags,
+ struct ib_sge *sg_list, u32 num_sge)
+{
+ return -EOPNOTSUPP;
+}
+static inline void mlx5_ib_invalidate_range(struct ib_umem_odp *umem_odp,
+ unsigned long start,
+ unsigned long end){};
#endif /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */
/* Needed for rep profile */
@@ -1221,7 +1266,7 @@ __be16 mlx5_get_roce_udp_sport(struct mlx5_ib_dev *dev,
const struct ib_gid_attr *attr);
void mlx5_ib_cleanup_cong_debugfs(struct mlx5_ib_dev *dev, u8 port_num);
-int mlx5_ib_init_cong_debugfs(struct mlx5_ib_dev *dev, u8 port_num);
+void mlx5_ib_init_cong_debugfs(struct mlx5_ib_dev *dev, u8 port_num);
/* GSI QP helper functions */
struct ib_qp *mlx5_ib_gsi_create_qp(struct ib_pd *pd,
@@ -1250,32 +1295,29 @@ void mlx5_ib_put_native_port_mdev(struct mlx5_ib_dev *dev,
u8 port_num);
#if IS_ENABLED(CONFIG_INFINIBAND_USER_ACCESS)
-int mlx5_ib_devx_create(struct mlx5_ib_dev *dev);
+int mlx5_ib_devx_create(struct mlx5_ib_dev *dev, bool is_user);
void mlx5_ib_devx_destroy(struct mlx5_ib_dev *dev, u16 uid);
const struct uverbs_object_tree_def *mlx5_ib_get_devx_tree(void);
+extern const struct uapi_definition mlx5_ib_devx_defs[];
+extern const struct uapi_definition mlx5_ib_flow_defs[];
struct mlx5_ib_flow_handler *mlx5_ib_raw_fs_rule_add(
struct mlx5_ib_dev *dev, struct mlx5_ib_flow_matcher *fs_matcher,
- struct mlx5_flow_act *flow_act, void *cmd_in, int inlen,
- int dest_id, int dest_type);
+ struct mlx5_flow_act *flow_act, u32 counter_id,
+ void *cmd_in, int inlen, int dest_id, int dest_type);
bool mlx5_ib_devx_is_flow_dest(void *obj, int *dest_id, int *dest_type);
+bool mlx5_ib_devx_is_flow_counter(void *obj, u32 *counter_id);
int mlx5_ib_get_flow_trees(const struct uverbs_object_tree_def **root);
void mlx5_ib_destroy_flow_action_raw(struct mlx5_ib_flow_action *maction);
#else
static inline int
-mlx5_ib_devx_create(struct mlx5_ib_dev *dev) { return -EOPNOTSUPP; };
+mlx5_ib_devx_create(struct mlx5_ib_dev *dev,
+ bool is_user) { return -EOPNOTSUPP; }
static inline void mlx5_ib_devx_destroy(struct mlx5_ib_dev *dev, u16 uid) {}
-static inline const struct uverbs_object_tree_def *
-mlx5_ib_get_devx_tree(void) { return NULL; }
static inline bool mlx5_ib_devx_is_flow_dest(void *obj, int *dest_id,
int *dest_type)
{
return false;
}
-static inline int
-mlx5_ib_get_flow_trees(const struct uverbs_object_tree_def **root)
-{
- return 0;
-}
static inline void
mlx5_ib_destroy_flow_action_raw(struct mlx5_ib_flow_action *maction)
{
diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
index 9b195d65a13e..c85f00255884 100644
--- a/drivers/infiniband/hw/mlx5/mr.c
+++ b/drivers/infiniband/hw/mlx5/mr.c
@@ -71,10 +71,9 @@ static int destroy_mkey(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
{
int err = mlx5_core_destroy_mkey(dev->mdev, &mr->mmkey);
-#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
- /* Wait until all page fault handlers using the mr complete. */
- synchronize_srcu(&dev->mr_srcu);
-#endif
+ if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING))
+ /* Wait until all page fault handlers using the mr complete. */
+ synchronize_srcu(&dev->mr_srcu);
return err;
}
@@ -95,10 +94,9 @@ static bool use_umr_mtt_update(struct mlx5_ib_mr *mr, u64 start, u64 length)
length + (start & (MLX5_ADAPTER_PAGE_SIZE - 1));
}
-#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
static void update_odp_mr(struct mlx5_ib_mr *mr)
{
- if (mr->umem->is_odp) {
+ if (is_odp_mr(mr)) {
/*
* This barrier prevents the compiler from moving the
* setting of umem->odp_data->private to point to our
@@ -121,11 +119,11 @@ static void update_odp_mr(struct mlx5_ib_mr *mr)
smp_wmb();
}
}
-#endif
-static void reg_mr_callback(int status, void *context)
+static void reg_mr_callback(int status, struct mlx5_async_work *context)
{
- struct mlx5_ib_mr *mr = context;
+ struct mlx5_ib_mr *mr =
+ container_of(context, struct mlx5_ib_mr, cb_work);
struct mlx5_ib_dev *dev = mr->dev;
struct mlx5_mr_cache *cache = &dev->cache;
int c = order2idx(dev, mr->order);
@@ -216,9 +214,9 @@ static int add_keys(struct mlx5_ib_dev *dev, int c, int num)
ent->pending++;
spin_unlock_irq(&ent->lock);
err = mlx5_core_create_mkey_cb(dev->mdev, &mr->mmkey,
- in, inlen,
+ &dev->async_ctx, in, inlen,
mr->out, sizeof(mr->out),
- reg_mr_callback, mr);
+ reg_mr_callback, &mr->cb_work);
if (err) {
spin_lock_irq(&ent->lock);
ent->pending--;
@@ -256,9 +254,8 @@ static void remove_keys(struct mlx5_ib_dev *dev, int c, int num)
mlx5_core_destroy_mkey(dev->mdev, &mr->mmkey);
}
-#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
- synchronize_srcu(&dev->mr_srcu);
-#endif
+ if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING))
+ synchronize_srcu(&dev->mr_srcu);
list_for_each_entry_safe(mr, tmp_mr, &del_list, list) {
list_del(&mr->list);
@@ -610,52 +607,27 @@ static void mlx5_mr_cache_debugfs_cleanup(struct mlx5_ib_dev *dev)
dev->cache.root = NULL;
}
-static int mlx5_mr_cache_debugfs_init(struct mlx5_ib_dev *dev)
+static void mlx5_mr_cache_debugfs_init(struct mlx5_ib_dev *dev)
{
struct mlx5_mr_cache *cache = &dev->cache;
struct mlx5_cache_ent *ent;
+ struct dentry *dir;
int i;
if (!mlx5_debugfs_root || dev->rep)
- return 0;
+ return;
cache->root = debugfs_create_dir("mr_cache", dev->mdev->priv.dbg_root);
- if (!cache->root)
- return -ENOMEM;
for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) {
ent = &cache->ent[i];
sprintf(ent->name, "%d", ent->order);
- ent->dir = debugfs_create_dir(ent->name, cache->root);
- if (!ent->dir)
- goto err;
-
- ent->fsize = debugfs_create_file("size", 0600, ent->dir, ent,
- &size_fops);
- if (!ent->fsize)
- goto err;
-
- ent->flimit = debugfs_create_file("limit", 0600, ent->dir, ent,
- &limit_fops);
- if (!ent->flimit)
- goto err;
-
- ent->fcur = debugfs_create_u32("cur", 0400, ent->dir,
- &ent->cur);
- if (!ent->fcur)
- goto err;
-
- ent->fmiss = debugfs_create_u32("miss", 0600, ent->dir,
- &ent->miss);
- if (!ent->fmiss)
- goto err;
+ dir = debugfs_create_dir(ent->name, cache->root);
+ debugfs_create_file("size", 0600, dir, ent, &size_fops);
+ debugfs_create_file("limit", 0600, dir, ent, &limit_fops);
+ debugfs_create_u32("cur", 0400, dir, &ent->cur);
+ debugfs_create_u32("miss", 0600, dir, &ent->miss);
}
-
- return 0;
-err:
- mlx5_mr_cache_debugfs_cleanup(dev);
-
- return -ENOMEM;
}
static void delay_time_func(struct timer_list *t)
@@ -669,7 +641,6 @@ int mlx5_mr_cache_init(struct mlx5_ib_dev *dev)
{
struct mlx5_mr_cache *cache = &dev->cache;
struct mlx5_cache_ent *ent;
- int err;
int i;
mutex_init(&dev->slow_path_mutex);
@@ -679,6 +650,7 @@ int mlx5_mr_cache_init(struct mlx5_ib_dev *dev)
return -ENOMEM;
}
+ mlx5_cmd_init_async_ctx(dev->mdev, &dev->async_ctx);
timer_setup(&dev->delay_timer, delay_time_func, 0);
for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) {
ent = &cache->ent[i];
@@ -713,45 +685,11 @@ int mlx5_mr_cache_init(struct mlx5_ib_dev *dev)
queue_work(cache->wq, &ent->work);
}
- err = mlx5_mr_cache_debugfs_init(dev);
- if (err)
- mlx5_ib_warn(dev, "cache debugfs failure\n");
-
- /*
- * We don't want to fail driver if debugfs failed to initialize,
- * so we are not forwarding error to the user.
- */
+ mlx5_mr_cache_debugfs_init(dev);
return 0;
}
-static void wait_for_async_commands(struct mlx5_ib_dev *dev)
-{
- struct mlx5_mr_cache *cache = &dev->cache;
- struct mlx5_cache_ent *ent;
- int total = 0;
- int i;
- int j;
-
- for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) {
- ent = &cache->ent[i];
- for (j = 0 ; j < 1000; j++) {
- if (!ent->pending)
- break;
- msleep(50);
- }
- }
- for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) {
- ent = &cache->ent[i];
- total += ent->pending;
- }
-
- if (total)
- mlx5_ib_warn(dev, "aborted while there are %d pending mr requests\n", total);
- else
- mlx5_ib_warn(dev, "done with all pending requests\n");
-}
-
int mlx5_mr_cache_cleanup(struct mlx5_ib_dev *dev)
{
int i;
@@ -763,12 +701,12 @@ int mlx5_mr_cache_cleanup(struct mlx5_ib_dev *dev)
flush_workqueue(dev->cache.wq);
mlx5_mr_cache_debugfs_cleanup(dev);
+ mlx5_cmd_cleanup_async_ctx(&dev->async_ctx);
for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++)
clean_keys(dev, i);
destroy_workqueue(dev->cache.wq);
- wait_for_async_commands(dev);
del_timer_sync(&dev->delay_timer);
return 0;
@@ -847,18 +785,17 @@ static int mr_cache_max_order(struct mlx5_ib_dev *dev)
return MLX5_MAX_UMR_SHIFT;
}
-static int mr_umem_get(struct ib_pd *pd, u64 start, u64 length,
- int access_flags, struct ib_umem **umem,
- int *npages, int *page_shift, int *ncont,
- int *order)
+static int mr_umem_get(struct mlx5_ib_dev *dev, struct ib_udata *udata,
+ u64 start, u64 length, int access_flags,
+ struct ib_umem **umem, int *npages, int *page_shift,
+ int *ncont, int *order)
{
- struct mlx5_ib_dev *dev = to_mdev(pd->device);
struct ib_umem *u;
int err;
*umem = NULL;
- u = ib_umem_get(pd->uobject->context, start, length, access_flags, 0);
+ u = ib_umem_get(udata, start, length, access_flags, 0);
err = PTR_ERR_OR_ZERO(u);
if (err) {
mlx5_ib_dbg(dev, "umem get failed (%d)\n", err);
@@ -1211,7 +1148,7 @@ err_1:
return ERR_PTR(err);
}
-static void set_mr_fileds(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr,
+static void set_mr_fields(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr,
int npages, u64 length, int access_flags)
{
mr->npages = npages;
@@ -1267,7 +1204,7 @@ static struct ib_mr *mlx5_ib_get_memic_mr(struct ib_pd *pd, u64 memic_addr,
kfree(in);
mr->umem = NULL;
- set_mr_fileds(dev, mr, 0, length, acc);
+ set_mr_fields(dev, mr, 0, length, acc);
return &mr->ibmr;
@@ -1280,6 +1217,21 @@ err_free:
return ERR_PTR(err);
}
+int mlx5_ib_advise_mr(struct ib_pd *pd,
+ enum ib_uverbs_advise_mr_advice advice,
+ u32 flags,
+ struct ib_sge *sg_list,
+ u32 num_sge,
+ struct uverbs_attr_bundle *attrs)
+{
+ if (advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH &&
+ advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_WRITE)
+ return -EOPNOTSUPP;
+
+ return mlx5_ib_advise_mr_prefetch(pd, advice, flags,
+ sg_list, num_sge);
+}
+
struct ib_mr *mlx5_ib_reg_dm_mr(struct ib_pd *pd, struct ib_dm *dm,
struct ib_dm_mr_attr *attr,
struct uverbs_attr_bundle *attrs)
@@ -1316,21 +1268,20 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
mlx5_ib_dbg(dev, "start 0x%llx, virt_addr 0x%llx, length 0x%llx, access_flags 0x%x\n",
start, virt_addr, length, access_flags);
-#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
- if (!start && length == U64_MAX) {
+ if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING) && !start &&
+ length == U64_MAX) {
if (!(access_flags & IB_ACCESS_ON_DEMAND) ||
!(dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT))
return ERR_PTR(-EINVAL);
- mr = mlx5_ib_alloc_implicit_mr(to_mpd(pd), access_flags);
+ mr = mlx5_ib_alloc_implicit_mr(to_mpd(pd), udata, access_flags);
if (IS_ERR(mr))
return ERR_CAST(mr);
return &mr->ibmr;
}
-#endif
- err = mr_umem_get(pd, start, length, access_flags, &umem, &npages,
- &page_shift, &ncont, &order);
+ err = mr_umem_get(dev, udata, start, length, access_flags, &umem,
+ &npages, &page_shift, &ncont, &order);
if (err < 0)
return ERR_PTR(err);
@@ -1369,11 +1320,9 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
mlx5_ib_dbg(dev, "mkey 0x%x\n", mr->mmkey.key);
mr->umem = umem;
- set_mr_fileds(dev, mr, npages, length, access_flags);
+ set_mr_fields(dev, mr, npages, length, access_flags);
-#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
update_odp_mr(mr);
-#endif
if (!populate_mtts) {
int update_xlt_flags = MLX5_IB_UPD_XLT_ENABLE;
@@ -1390,9 +1339,11 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
}
}
-#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
- mr->live = 1;
-#endif
+ if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) {
+ mr->live = 1;
+ atomic_set(&mr->num_pending_prefetch, 0);
+ }
+
return &mr->ibmr;
error:
ib_umem_release(umem);
@@ -1480,8 +1431,9 @@ int mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start,
flags |= IB_MR_REREG_TRANS;
ib_umem_release(mr->umem);
mr->umem = NULL;
- err = mr_umem_get(pd, addr, len, access_flags, &mr->umem,
- &npages, &page_shift, &ncont, &order);
+ err = mr_umem_get(dev, udata, addr, len, access_flags,
+ &mr->umem, &npages, &page_shift, &ncont,
+ &order);
if (err)
goto err;
}
@@ -1507,9 +1459,8 @@ int mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start,
}
mr->allocated_from_cache = 0;
-#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
- mr->live = 1;
-#endif
+ if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING))
+ mr->live = 1;
} else {
/*
* Send a UMR WQE
@@ -1536,11 +1487,9 @@ int mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start,
goto err;
}
- set_mr_fileds(dev, mr, npages, len, access_flags);
+ set_mr_fields(dev, mr, npages, len, access_flags);
-#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
update_odp_mr(mr);
-#endif
return 0;
err:
@@ -1626,12 +1575,19 @@ static void dereg_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
int npages = mr->npages;
struct ib_umem *umem = mr->umem;
-#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
- if (umem && umem->is_odp) {
+ if (is_odp_mr(mr)) {
struct ib_umem_odp *umem_odp = to_ib_umem_odp(umem);
- /* Prevent new page faults from succeeding */
+ /* Prevent new page faults and
+ * prefetch requests from succeeding
+ */
mr->live = 0;
+
+ /* dequeue pending prefetch requests for the mr */
+ if (atomic_read(&mr->num_pending_prefetch))
+ flush_workqueue(system_unbound_wq);
+ WARN_ON(atomic_read(&mr->num_pending_prefetch));
+
/* Wait for all running page-fault handlers to finish. */
synchronize_srcu(&dev->mr_srcu);
/* Destroy all page mappings */
@@ -1651,7 +1607,7 @@ static void dereg_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
/* Avoid double-freeing the umem. */
umem = NULL;
}
-#endif
+
clean_mr(dev, mr);
/*
diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c
index 4dc6cc640ce0..0aa10ebda5d9 100644
--- a/drivers/infiniband/hw/mlx5/odp.c
+++ b/drivers/infiniband/hw/mlx5/odp.c
@@ -37,6 +37,46 @@
#include "mlx5_ib.h"
#include "cmd.h"
+#include <linux/mlx5/eq.h>
+
+/* Contains the details of a pagefault. */
+struct mlx5_pagefault {
+ u32 bytes_committed;
+ u32 token;
+ u8 event_subtype;
+ u8 type;
+ union {
+ /* Initiator or send message responder pagefault details. */
+ struct {
+ /* Received packet size, only valid for responders. */
+ u32 packet_size;
+ /*
+ * Number of resource holding WQE, depends on type.
+ */
+ u32 wq_num;
+ /*
+ * WQE index. Refers to either the send queue or
+ * receive queue, according to event_subtype.
+ */
+ u16 wqe_index;
+ } wqe;
+ /* RDMA responder pagefault details */
+ struct {
+ u32 r_key;
+ /*
+ * Received packet size, minimal size page fault
+ * resolution required for forward progress.
+ */
+ u32 packet_size;
+ u32 rdma_op_len;
+ u64 rdma_va;
+ } rdma;
+ };
+
+ struct mlx5_ib_pf_eq *eq;
+ struct work_struct work;
+};
+
#define MAX_PREFETCH_LEN (4*1024*1024U)
/* Timeout in ms to wait for an active mmu notifier to complete when handling
@@ -61,9 +101,9 @@ static int check_parent(struct ib_umem_odp *odp,
return mr && mr->parent == parent && !odp->dying;
}
-struct ib_ucontext_per_mm *mr_to_per_mm(struct mlx5_ib_mr *mr)
+static struct ib_ucontext_per_mm *mr_to_per_mm(struct mlx5_ib_mr *mr)
{
- if (WARN_ON(!mr || !mr->umem || !mr->umem->is_odp))
+ if (WARN_ON(!mr || !is_odp_mr(mr)))
return NULL;
return to_ib_umem_odp(mr->umem)->per_mm;
@@ -275,6 +315,9 @@ void mlx5_ib_internal_fill_odp_caps(struct mlx5_ib_dev *dev)
if (MLX5_CAP_ODP(dev->mdev, ud_odp_caps.send))
caps->per_transport_caps.ud_odp_caps |= IB_ODP_SUPPORT_SEND;
+ if (MLX5_CAP_ODP(dev->mdev, ud_odp_caps.srq_receive))
+ caps->per_transport_caps.ud_odp_caps |= IB_ODP_SUPPORT_SRQ_RECV;
+
if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.send))
caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_SEND;
@@ -290,6 +333,27 @@ void mlx5_ib_internal_fill_odp_caps(struct mlx5_ib_dev *dev)
if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.atomic))
caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_ATOMIC;
+ if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.srq_receive))
+ caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_SRQ_RECV;
+
+ if (MLX5_CAP_ODP(dev->mdev, xrc_odp_caps.send))
+ caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_SEND;
+
+ if (MLX5_CAP_ODP(dev->mdev, xrc_odp_caps.receive))
+ caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_RECV;
+
+ if (MLX5_CAP_ODP(dev->mdev, xrc_odp_caps.write))
+ caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_WRITE;
+
+ if (MLX5_CAP_ODP(dev->mdev, xrc_odp_caps.read))
+ caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_READ;
+
+ if (MLX5_CAP_ODP(dev->mdev, xrc_odp_caps.atomic))
+ caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_ATOMIC;
+
+ if (MLX5_CAP_ODP(dev->mdev, xrc_odp_caps.srq_receive))
+ caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_SRQ_RECV;
+
if (MLX5_CAP_GEN(dev->mdev, fixed_buffer_size) &&
MLX5_CAP_GEN(dev->mdev, null_mkey) &&
MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset))
@@ -304,14 +368,20 @@ static void mlx5_ib_page_fault_resume(struct mlx5_ib_dev *dev,
{
int wq_num = pfault->event_subtype == MLX5_PFAULT_SUBTYPE_WQE ?
pfault->wqe.wq_num : pfault->token;
- int ret = mlx5_core_page_fault_resume(dev->mdev,
- pfault->token,
- wq_num,
- pfault->type,
- error);
- if (ret)
- mlx5_ib_err(dev, "Failed to resolve the page fault on WQ 0x%x\n",
- wq_num);
+ u32 out[MLX5_ST_SZ_DW(page_fault_resume_out)] = { };
+ u32 in[MLX5_ST_SZ_DW(page_fault_resume_in)] = { };
+ int err;
+
+ MLX5_SET(page_fault_resume_in, in, opcode, MLX5_CMD_OP_PAGE_FAULT_RESUME);
+ MLX5_SET(page_fault_resume_in, in, page_fault_type, pfault->type);
+ MLX5_SET(page_fault_resume_in, in, token, pfault->token);
+ MLX5_SET(page_fault_resume_in, in, wq_number, wq_num);
+ MLX5_SET(page_fault_resume_in, in, error, !!error);
+
+ err = mlx5_cmd_exec(dev->mdev, in, sizeof(in), out, sizeof(out));
+ if (err)
+ mlx5_ib_err(dev, "Failed to resolve the page fault on WQ 0x%x err %d\n",
+ wq_num, err);
}
static struct mlx5_ib_mr *implicit_mr_alloc(struct ib_pd *pd,
@@ -393,7 +463,7 @@ next_mr:
if (nentries)
nentries++;
} else {
- odp = ib_alloc_odp_umem(odp_mr->per_mm, addr,
+ odp = ib_alloc_odp_umem(odp_mr, addr,
MLX5_IMR_MTT_SIZE);
if (IS_ERR(odp)) {
mutex_unlock(&odp_mr->umem_mutex);
@@ -446,13 +516,13 @@ next_mr:
}
struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd,
+ struct ib_udata *udata,
int access_flags)
{
- struct ib_ucontext *ctx = pd->ibpd.uobject->context;
struct mlx5_ib_mr *imr;
struct ib_umem *umem;
- umem = ib_umem_get(ctx, 0, 0, IB_ACCESS_ON_DEMAND, 0);
+ umem = ib_umem_get(udata, 0, 0, access_flags, 0);
if (IS_ERR(umem))
return ERR_CAST(umem);
@@ -465,6 +535,7 @@ struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd,
imr->umem = umem;
init_waitqueue_head(&imr->q_leaf_free);
atomic_set(&imr->num_leaf_free, 0);
+ atomic_set(&imr->num_pending_prefetch, 0);
return imr;
}
@@ -503,13 +574,18 @@ void mlx5_ib_free_implicit_mr(struct mlx5_ib_mr *imr)
wait_event(imr->q_leaf_free, !atomic_read(&imr->num_leaf_free));
}
+#define MLX5_PF_FLAGS_PREFETCH BIT(0)
+#define MLX5_PF_FLAGS_DOWNGRADE BIT(1)
static int pagefault_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr,
- u64 io_virt, size_t bcnt, u32 *bytes_mapped)
+ u64 io_virt, size_t bcnt, u32 *bytes_mapped,
+ u32 flags)
{
int npages = 0, current_seq, page_shift, ret, np;
bool implicit = false;
struct ib_umem_odp *odp_mr = to_ib_umem_odp(mr->umem);
- u64 access_mask = ODP_READ_ALLOWED_BIT;
+ bool downgrade = flags & MLX5_PF_FLAGS_DOWNGRADE;
+ bool prefetch = flags & MLX5_PF_FLAGS_PREFETCH;
+ u64 access_mask;
u64 start_idx, page_mask;
struct ib_umem_odp *odp;
size_t size;
@@ -531,8 +607,17 @@ next_mr:
page_shift = mr->umem->page_shift;
page_mask = ~(BIT(page_shift) - 1);
start_idx = (io_virt - (mr->mmkey.iova & page_mask)) >> page_shift;
+ access_mask = ODP_READ_ALLOWED_BIT;
- if (mr->umem->writable)
+ if (prefetch && !downgrade && !mr->umem->writable) {
+ /* prefetch with write-access must
+ * be supported by the MR
+ */
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (mr->umem->writable && !downgrade)
access_mask |= ODP_WRITE_ALLOWED_BIT;
current_seq = READ_ONCE(odp->notifiers_seq);
@@ -606,8 +691,8 @@ out:
if (!wait_for_completion_timeout(
&odp->notifier_completion,
timeout)) {
- mlx5_ib_warn(dev, "timeout waiting for mmu notifier. seq %d against %d\n",
- current_seq, odp->notifiers_seq);
+ mlx5_ib_warn(dev, "timeout waiting for mmu notifier. seq %d against %d. notifiers_count=%d\n",
+ current_seq, odp->notifiers_seq, odp->notifiers_count);
}
} else {
/* The MR is being killed, kill the QP as well. */
@@ -626,6 +711,21 @@ struct pf_frame {
int depth;
};
+static int get_indirect_num_descs(struct mlx5_core_mkey *mmkey)
+{
+ struct mlx5_ib_mw *mw;
+ struct mlx5_ib_devx_mr *devx_mr;
+
+ if (mmkey->type == MLX5_MKEY_MW) {
+ mw = container_of(mmkey, struct mlx5_ib_mw, mmkey);
+ return mw->ndescs;
+ }
+
+ devx_mr = container_of(mmkey, struct mlx5_ib_devx_mr,
+ mmkey);
+ return devx_mr->ndescs;
+}
+
/*
* Handle a single data segment in a page-fault WQE or RDMA region.
*
@@ -638,18 +738,20 @@ struct pf_frame {
* abort the page fault handling.
*/
static int pagefault_single_data_segment(struct mlx5_ib_dev *dev,
- u32 key, u64 io_virt, size_t bcnt,
+ struct ib_pd *pd, u32 key,
+ u64 io_virt, size_t bcnt,
u32 *bytes_committed,
- u32 *bytes_mapped)
+ u32 *bytes_mapped, u32 flags)
{
int npages = 0, srcu_key, ret, i, outlen, cur_outlen = 0, depth = 0;
+ bool prefetch = flags & MLX5_PF_FLAGS_PREFETCH;
struct pf_frame *head = NULL, *frame;
struct mlx5_core_mkey *mmkey;
- struct mlx5_ib_mw *mw;
struct mlx5_ib_mr *mr;
struct mlx5_klm *pklm;
u32 *out = NULL;
size_t offset;
+ int ndescs;
srcu_key = srcu_read_lock(&dev->mr_srcu);
@@ -664,6 +766,12 @@ next_mr:
goto srcu_unlock;
}
+ if (prefetch && mmkey->type != MLX5_MKEY_MR) {
+ mlx5_ib_dbg(dev, "prefetch is allowed only for MR\n");
+ ret = -EINVAL;
+ goto srcu_unlock;
+ }
+
switch (mmkey->type) {
case MLX5_MKEY_MR:
mr = container_of(mmkey, struct mlx5_ib_mr, mmkey);
@@ -673,7 +781,18 @@ next_mr:
goto srcu_unlock;
}
- if (!mr->umem->is_odp) {
+ if (prefetch) {
+ if (!is_odp_mr(mr) ||
+ mr->ibmr.pd != pd) {
+ mlx5_ib_dbg(dev, "Invalid prefetch request: %s\n",
+ is_odp_mr(mr) ? "MR is not ODP" :
+ "PD is not of the MR");
+ ret = -EINVAL;
+ goto srcu_unlock;
+ }
+ }
+
+ if (!is_odp_mr(mr)) {
mlx5_ib_dbg(dev, "skipping non ODP MR (lkey=0x%06x) in page fault handler.\n",
key);
if (bytes_mapped)
@@ -682,7 +801,7 @@ next_mr:
goto srcu_unlock;
}
- ret = pagefault_mr(dev, mr, io_virt, bcnt, bytes_mapped);
+ ret = pagefault_mr(dev, mr, io_virt, bcnt, bytes_mapped, flags);
if (ret < 0)
goto srcu_unlock;
@@ -691,7 +810,8 @@ next_mr:
break;
case MLX5_MKEY_MW:
- mw = container_of(mmkey, struct mlx5_ib_mw, mmkey);
+ case MLX5_MKEY_INDIRECT_DEVX:
+ ndescs = get_indirect_num_descs(mmkey);
if (depth >= MLX5_CAP_GEN(dev->mdev, max_indirection)) {
mlx5_ib_dbg(dev, "indirection level exceeded\n");
@@ -700,7 +820,7 @@ next_mr:
}
outlen = MLX5_ST_SZ_BYTES(query_mkey_out) +
- sizeof(*pklm) * (mw->ndescs - 2);
+ sizeof(*pklm) * (ndescs - 2);
if (outlen > cur_outlen) {
kfree(out);
@@ -715,14 +835,14 @@ next_mr:
pklm = (struct mlx5_klm *)MLX5_ADDR_OF(query_mkey_out, out,
bsf0_klm0_pas_mtt0_1);
- ret = mlx5_core_query_mkey(dev->mdev, &mw->mmkey, out, outlen);
+ ret = mlx5_core_query_mkey(dev->mdev, mmkey, out, outlen);
if (ret)
goto srcu_unlock;
offset = io_virt - MLX5_GET64(query_mkey_out, out,
memory_key_mkey_entry.start_addr);
- for (i = 0; bcnt && i < mw->ndescs; i++, pklm++) {
+ for (i = 0; bcnt && i < ndescs; i++, pklm++) {
if (offset >= be32_to_cpu(pklm->bcount)) {
offset -= be32_to_cpu(pklm->bcount);
continue;
@@ -782,7 +902,6 @@ srcu_unlock:
/**
* Parse a series of data segments for page fault handling.
*
- * @qp the QP on which the fault occurred.
* @pfault contains page fault information.
* @wqe points at the first data segment in the WQE.
* @wqe_end points after the end of the WQE.
@@ -799,7 +918,7 @@ srcu_unlock:
*/
static int pagefault_data_segments(struct mlx5_ib_dev *dev,
struct mlx5_pagefault *pfault,
- struct mlx5_ib_qp *qp, void *wqe,
+ void *wqe,
void *wqe_end, u32 *bytes_mapped,
u32 *total_wqe_bytes, int receive_queue)
{
@@ -810,10 +929,6 @@ static int pagefault_data_segments(struct mlx5_ib_dev *dev,
size_t bcnt;
int inline_segment;
- /* Skip SRQ next-WQE segment. */
- if (receive_queue && qp->ibqp.srq)
- wqe += sizeof(struct mlx5_wqe_srq_next_seg);
-
if (bytes_mapped)
*bytes_mapped = 0;
if (total_wqe_bytes)
@@ -857,9 +972,10 @@ static int pagefault_data_segments(struct mlx5_ib_dev *dev,
continue;
}
- ret = pagefault_single_data_segment(dev, key, io_virt, bcnt,
+ ret = pagefault_single_data_segment(dev, NULL, key,
+ io_virt, bcnt,
&pfault->bytes_committed,
- bytes_mapped);
+ bytes_mapped, 0);
if (ret < 0)
break;
npages += ret;
@@ -938,6 +1054,10 @@ static int mlx5_ib_mr_initiator_pfault_handler(
MLX5_WQE_CTRL_OPCODE_MASK;
switch (qp->ibqp.qp_type) {
+ case IB_QPT_XRC_INI:
+ *wqe += sizeof(struct mlx5_wqe_xrc_seg);
+ transport_caps = dev->odp_caps.per_transport_caps.xrc_odp_caps;
+ break;
case IB_QPT_RC:
transport_caps = dev->odp_caps.per_transport_caps.rc_odp_caps;
break;
@@ -957,7 +1077,7 @@ static int mlx5_ib_mr_initiator_pfault_handler(
return -EFAULT;
}
- if (qp->ibqp.qp_type != IB_QPT_RC) {
+ if (qp->ibqp.qp_type == IB_QPT_UD) {
av = *wqe;
if (av->dqp_dct & cpu_to_be32(MLX5_EXTENDED_UD_AV))
*wqe += sizeof(struct mlx5_av);
@@ -982,21 +1102,34 @@ static int mlx5_ib_mr_initiator_pfault_handler(
}
/*
- * Parse responder WQE. Advances the wqe pointer to point at the
- * scatter-gather list, and set wqe_end to the end of the WQE.
+ * Parse responder WQE and set wqe_end to the end of the WQE.
*/
-static int mlx5_ib_mr_responder_pfault_handler(
- struct mlx5_ib_dev *dev, struct mlx5_pagefault *pfault,
- struct mlx5_ib_qp *qp, void **wqe, void **wqe_end, int wqe_length)
+static int mlx5_ib_mr_responder_pfault_handler_srq(struct mlx5_ib_dev *dev,
+ struct mlx5_ib_srq *srq,
+ void **wqe, void **wqe_end,
+ int wqe_length)
{
- struct mlx5_ib_wq *wq = &qp->rq;
- int wqe_size = 1 << wq->wqe_shift;
+ int wqe_size = 1 << srq->msrq.wqe_shift;
- if (qp->ibqp.srq) {
- mlx5_ib_err(dev, "ODP fault on SRQ is not supported\n");
+ if (wqe_size > wqe_length) {
+ mlx5_ib_err(dev, "Couldn't read all of the receive WQE's content\n");
return -EFAULT;
}
+ *wqe_end = *wqe + wqe_size;
+ *wqe += sizeof(struct mlx5_wqe_srq_next_seg);
+
+ return 0;
+}
+
+static int mlx5_ib_mr_responder_pfault_handler_rq(struct mlx5_ib_dev *dev,
+ struct mlx5_ib_qp *qp,
+ void *wqe, void **wqe_end,
+ int wqe_length)
+{
+ struct mlx5_ib_wq *wq = &qp->rq;
+ int wqe_size = 1 << wq->wqe_shift;
+
if (qp->wq_sig) {
mlx5_ib_err(dev, "ODP fault with WQE signatures is not supported\n");
return -EFAULT;
@@ -1020,24 +1153,50 @@ invalid_transport_or_opcode:
return -EFAULT;
}
- *wqe_end = *wqe + wqe_size;
+ *wqe_end = wqe + wqe_size;
return 0;
}
-static struct mlx5_ib_qp *mlx5_ib_odp_find_qp(struct mlx5_ib_dev *dev,
- u32 wq_num)
+static inline struct mlx5_core_rsc_common *odp_get_rsc(struct mlx5_ib_dev *dev,
+ u32 wq_num, int pf_type)
{
- struct mlx5_core_qp *mqp = __mlx5_qp_lookup(dev->mdev, wq_num);
-
- if (!mqp) {
- mlx5_ib_err(dev, "QPN 0x%6x not found\n", wq_num);
- return NULL;
+ struct mlx5_core_rsc_common *common = NULL;
+ struct mlx5_core_srq *srq;
+
+ switch (pf_type) {
+ case MLX5_WQE_PF_TYPE_RMP:
+ srq = mlx5_cmd_get_srq(dev, wq_num);
+ if (srq)
+ common = &srq->common;
+ break;
+ case MLX5_WQE_PF_TYPE_REQ_SEND_OR_WRITE:
+ case MLX5_WQE_PF_TYPE_RESP:
+ case MLX5_WQE_PF_TYPE_REQ_READ_OR_ATOMIC:
+ common = mlx5_core_res_hold(dev->mdev, wq_num, MLX5_RES_QP);
+ break;
+ default:
+ break;
}
+ return common;
+}
+
+static inline struct mlx5_ib_qp *res_to_qp(struct mlx5_core_rsc_common *res)
+{
+ struct mlx5_core_qp *mqp = (struct mlx5_core_qp *)res;
+
return to_mibqp(mqp);
}
+static inline struct mlx5_ib_srq *res_to_srq(struct mlx5_core_rsc_common *res)
+{
+ struct mlx5_core_srq *msrq =
+ container_of(res, struct mlx5_core_srq, common);
+
+ return to_mibsrq(msrq);
+}
+
static void mlx5_ib_mr_wqe_pfault_handler(struct mlx5_ib_dev *dev,
struct mlx5_pagefault *pfault)
{
@@ -1048,7 +1207,29 @@ static void mlx5_ib_mr_wqe_pfault_handler(struct mlx5_ib_dev *dev,
int resume_with_error = 1;
u16 wqe_index = pfault->wqe.wqe_index;
int requestor = pfault->type & MLX5_PFAULT_REQUESTOR;
- struct mlx5_ib_qp *qp;
+ struct mlx5_core_rsc_common *res = NULL;
+ struct mlx5_ib_qp *qp = NULL;
+ struct mlx5_ib_srq *srq = NULL;
+ size_t bytes_copied;
+
+ res = odp_get_rsc(dev, pfault->wqe.wq_num, pfault->type);
+ if (!res) {
+ mlx5_ib_dbg(dev, "wqe page fault for missing resource %d\n", pfault->wqe.wq_num);
+ return;
+ }
+
+ switch (res->res) {
+ case MLX5_RES_QP:
+ qp = res_to_qp(res);
+ break;
+ case MLX5_RES_SRQ:
+ case MLX5_RES_XSRQ:
+ srq = res_to_srq(res);
+ break;
+ default:
+ mlx5_ib_err(dev, "wqe page fault for unsupported type %d\n", pfault->type);
+ goto resolve_page_fault;
+ }
buffer = (char *)__get_free_page(GFP_KERNEL);
if (!buffer) {
@@ -1056,13 +1237,23 @@ static void mlx5_ib_mr_wqe_pfault_handler(struct mlx5_ib_dev *dev,
goto resolve_page_fault;
}
- qp = mlx5_ib_odp_find_qp(dev, pfault->wqe.wq_num);
- if (!qp)
- goto resolve_page_fault;
+ if (qp) {
+ if (requestor) {
+ ret = mlx5_ib_read_user_wqe_sq(qp, wqe_index,
+ buffer, PAGE_SIZE,
+ &bytes_copied);
+ } else {
+ ret = mlx5_ib_read_user_wqe_rq(qp, wqe_index,
+ buffer, PAGE_SIZE,
+ &bytes_copied);
+ }
+ } else {
+ ret = mlx5_ib_read_user_wqe_srq(srq, wqe_index,
+ buffer, PAGE_SIZE,
+ &bytes_copied);
+ }
- ret = mlx5_ib_read_user_wqe(qp, requestor, wqe_index, buffer,
- PAGE_SIZE, &qp->trans_qp.base);
- if (ret < 0) {
+ if (ret) {
mlx5_ib_err(dev, "Failed reading a WQE following page fault, error=%d, wqe_index=%x, qpn=%x\n",
ret, wqe_index, pfault->token);
goto resolve_page_fault;
@@ -1070,11 +1261,18 @@ static void mlx5_ib_mr_wqe_pfault_handler(struct mlx5_ib_dev *dev,
wqe = buffer;
if (requestor)
- ret = mlx5_ib_mr_initiator_pfault_handler(dev, pfault, qp, &wqe,
- &wqe_end, ret);
+ ret = mlx5_ib_mr_initiator_pfault_handler(dev, pfault, qp,
+ &wqe, &wqe_end,
+ bytes_copied);
+ else if (qp)
+ ret = mlx5_ib_mr_responder_pfault_handler_rq(dev, qp,
+ wqe, &wqe_end,
+ bytes_copied);
else
- ret = mlx5_ib_mr_responder_pfault_handler(dev, pfault, qp, &wqe,
- &wqe_end, ret);
+ ret = mlx5_ib_mr_responder_pfault_handler_srq(dev, srq,
+ &wqe, &wqe_end,
+ bytes_copied);
+
if (ret < 0)
goto resolve_page_fault;
@@ -1083,7 +1281,7 @@ static void mlx5_ib_mr_wqe_pfault_handler(struct mlx5_ib_dev *dev,
goto resolve_page_fault;
}
- ret = pagefault_data_segments(dev, pfault, qp, wqe, wqe_end,
+ ret = pagefault_data_segments(dev, pfault, wqe, wqe_end,
&bytes_mapped, &total_wqe_bytes,
!requestor);
if (ret == -EAGAIN) {
@@ -1099,6 +1297,7 @@ resolve_page_fault:
mlx5_ib_dbg(dev, "PAGE FAULT completed. QP 0x%x resume_with_error=%d, type: 0x%x\n",
pfault->wqe.wq_num, resume_with_error,
pfault->type);
+ mlx5_core_res_put(res);
free_page((unsigned long)buffer);
}
@@ -1141,8 +1340,9 @@ static void mlx5_ib_mr_rdma_pfault_handler(struct mlx5_ib_dev *dev,
prefetch_len = min(MAX_PREFETCH_LEN, prefetch_len);
}
- ret = pagefault_single_data_segment(dev, rkey, address, length,
- &pfault->bytes_committed, NULL);
+ ret = pagefault_single_data_segment(dev, NULL, rkey, address, length,
+ &pfault->bytes_committed, NULL,
+ 0);
if (ret == -EAGAIN) {
/* We're racing with an invalidation, don't prefetch */
prefetch_activated = 0;
@@ -1167,9 +1367,10 @@ static void mlx5_ib_mr_rdma_pfault_handler(struct mlx5_ib_dev *dev,
if (prefetch_activated) {
u32 bytes_committed = 0;
- ret = pagefault_single_data_segment(dev, rkey, address,
+ ret = pagefault_single_data_segment(dev, NULL, rkey, address,
prefetch_len,
- &bytes_committed, NULL);
+ &bytes_committed, NULL,
+ 0);
if (ret < 0 && ret != -EAGAIN) {
mlx5_ib_dbg(dev, "Prefetch failed. ret: %d, QP 0x%x, address: 0x%.16llx, length = 0x%.16x\n",
ret, pfault->token, address, prefetch_len);
@@ -1177,10 +1378,8 @@ static void mlx5_ib_mr_rdma_pfault_handler(struct mlx5_ib_dev *dev,
}
}
-void mlx5_ib_pfault(struct mlx5_core_dev *mdev, void *context,
- struct mlx5_pagefault *pfault)
+static void mlx5_ib_pfault(struct mlx5_ib_dev *dev, struct mlx5_pagefault *pfault)
{
- struct mlx5_ib_dev *dev = context;
u8 event_subtype = pfault->event_subtype;
switch (event_subtype) {
@@ -1197,6 +1396,203 @@ void mlx5_ib_pfault(struct mlx5_core_dev *mdev, void *context,
}
}
+static void mlx5_ib_eqe_pf_action(struct work_struct *work)
+{
+ struct mlx5_pagefault *pfault = container_of(work,
+ struct mlx5_pagefault,
+ work);
+ struct mlx5_ib_pf_eq *eq = pfault->eq;
+
+ mlx5_ib_pfault(eq->dev, pfault);
+ mempool_free(pfault, eq->pool);
+}
+
+static void mlx5_ib_eq_pf_process(struct mlx5_ib_pf_eq *eq)
+{
+ struct mlx5_eqe_page_fault *pf_eqe;
+ struct mlx5_pagefault *pfault;
+ struct mlx5_eqe *eqe;
+ int cc = 0;
+
+ while ((eqe = mlx5_eq_get_eqe(eq->core, cc))) {
+ pfault = mempool_alloc(eq->pool, GFP_ATOMIC);
+ if (!pfault) {
+ schedule_work(&eq->work);
+ break;
+ }
+
+ pf_eqe = &eqe->data.page_fault;
+ pfault->event_subtype = eqe->sub_type;
+ pfault->bytes_committed = be32_to_cpu(pf_eqe->bytes_committed);
+
+ mlx5_ib_dbg(eq->dev,
+ "PAGE_FAULT: subtype: 0x%02x, bytes_committed: 0x%06x\n",
+ eqe->sub_type, pfault->bytes_committed);
+
+ switch (eqe->sub_type) {
+ case MLX5_PFAULT_SUBTYPE_RDMA:
+ /* RDMA based event */
+ pfault->type =
+ be32_to_cpu(pf_eqe->rdma.pftype_token) >> 24;
+ pfault->token =
+ be32_to_cpu(pf_eqe->rdma.pftype_token) &
+ MLX5_24BIT_MASK;
+ pfault->rdma.r_key =
+ be32_to_cpu(pf_eqe->rdma.r_key);
+ pfault->rdma.packet_size =
+ be16_to_cpu(pf_eqe->rdma.packet_length);
+ pfault->rdma.rdma_op_len =
+ be32_to_cpu(pf_eqe->rdma.rdma_op_len);
+ pfault->rdma.rdma_va =
+ be64_to_cpu(pf_eqe->rdma.rdma_va);
+ mlx5_ib_dbg(eq->dev,
+ "PAGE_FAULT: type:0x%x, token: 0x%06x, r_key: 0x%08x\n",
+ pfault->type, pfault->token,
+ pfault->rdma.r_key);
+ mlx5_ib_dbg(eq->dev,
+ "PAGE_FAULT: rdma_op_len: 0x%08x, rdma_va: 0x%016llx\n",
+ pfault->rdma.rdma_op_len,
+ pfault->rdma.rdma_va);
+ break;
+
+ case MLX5_PFAULT_SUBTYPE_WQE:
+ /* WQE based event */
+ pfault->type =
+ (be32_to_cpu(pf_eqe->wqe.pftype_wq) >> 24) & 0x7;
+ pfault->token =
+ be32_to_cpu(pf_eqe->wqe.token);
+ pfault->wqe.wq_num =
+ be32_to_cpu(pf_eqe->wqe.pftype_wq) &
+ MLX5_24BIT_MASK;
+ pfault->wqe.wqe_index =
+ be16_to_cpu(pf_eqe->wqe.wqe_index);
+ pfault->wqe.packet_size =
+ be16_to_cpu(pf_eqe->wqe.packet_length);
+ mlx5_ib_dbg(eq->dev,
+ "PAGE_FAULT: type:0x%x, token: 0x%06x, wq_num: 0x%06x, wqe_index: 0x%04x\n",
+ pfault->type, pfault->token,
+ pfault->wqe.wq_num,
+ pfault->wqe.wqe_index);
+ break;
+
+ default:
+ mlx5_ib_warn(eq->dev,
+ "Unsupported page fault event sub-type: 0x%02hhx\n",
+ eqe->sub_type);
+ /* Unsupported page faults should still be
+ * resolved by the page fault handler
+ */
+ }
+
+ pfault->eq = eq;
+ INIT_WORK(&pfault->work, mlx5_ib_eqe_pf_action);
+ queue_work(eq->wq, &pfault->work);
+
+ cc = mlx5_eq_update_cc(eq->core, ++cc);
+ }
+
+ mlx5_eq_update_ci(eq->core, cc, 1);
+}
+
+static irqreturn_t mlx5_ib_eq_pf_int(int irq, void *eq_ptr)
+{
+ struct mlx5_ib_pf_eq *eq = eq_ptr;
+ unsigned long flags;
+
+ if (spin_trylock_irqsave(&eq->lock, flags)) {
+ mlx5_ib_eq_pf_process(eq);
+ spin_unlock_irqrestore(&eq->lock, flags);
+ } else {
+ schedule_work(&eq->work);
+ }
+
+ return IRQ_HANDLED;
+}
+
+/* mempool_refill() was proposed but unfortunately wasn't accepted
+ * http://lkml.iu.edu/hypermail/linux/kernel/1512.1/05073.html
+ * Cheap workaround.
+ */
+static void mempool_refill(mempool_t *pool)
+{
+ while (pool->curr_nr < pool->min_nr)
+ mempool_free(mempool_alloc(pool, GFP_KERNEL), pool);
+}
+
+static void mlx5_ib_eq_pf_action(struct work_struct *work)
+{
+ struct mlx5_ib_pf_eq *eq =
+ container_of(work, struct mlx5_ib_pf_eq, work);
+
+ mempool_refill(eq->pool);
+
+ spin_lock_irq(&eq->lock);
+ mlx5_ib_eq_pf_process(eq);
+ spin_unlock_irq(&eq->lock);
+}
+
+enum {
+ MLX5_IB_NUM_PF_EQE = 0x1000,
+ MLX5_IB_NUM_PF_DRAIN = 64,
+};
+
+static int
+mlx5_ib_create_pf_eq(struct mlx5_ib_dev *dev, struct mlx5_ib_pf_eq *eq)
+{
+ struct mlx5_eq_param param = {};
+ int err;
+
+ INIT_WORK(&eq->work, mlx5_ib_eq_pf_action);
+ spin_lock_init(&eq->lock);
+ eq->dev = dev;
+
+ eq->pool = mempool_create_kmalloc_pool(MLX5_IB_NUM_PF_DRAIN,
+ sizeof(struct mlx5_pagefault));
+ if (!eq->pool)
+ return -ENOMEM;
+
+ eq->wq = alloc_workqueue("mlx5_ib_page_fault",
+ WQ_HIGHPRI | WQ_UNBOUND | WQ_MEM_RECLAIM,
+ MLX5_NUM_CMD_EQE);
+ if (!eq->wq) {
+ err = -ENOMEM;
+ goto err_mempool;
+ }
+
+ param = (struct mlx5_eq_param) {
+ .index = MLX5_EQ_PFAULT_IDX,
+ .mask = 1 << MLX5_EVENT_TYPE_PAGE_FAULT,
+ .nent = MLX5_IB_NUM_PF_EQE,
+ .context = eq,
+ .handler = mlx5_ib_eq_pf_int
+ };
+ eq->core = mlx5_eq_create_generic(dev->mdev, "mlx5_ib_page_fault_eq", &param);
+ if (IS_ERR(eq->core)) {
+ err = PTR_ERR(eq->core);
+ goto err_wq;
+ }
+
+ return 0;
+err_wq:
+ destroy_workqueue(eq->wq);
+err_mempool:
+ mempool_destroy(eq->pool);
+ return err;
+}
+
+static int
+mlx5_ib_destroy_pf_eq(struct mlx5_ib_dev *dev, struct mlx5_ib_pf_eq *eq)
+{
+ int err;
+
+ err = mlx5_eq_destroy_generic(dev->mdev, eq->core);
+ cancel_work_sync(&eq->work);
+ destroy_workqueue(eq->wq);
+ mempool_destroy(eq->pool);
+
+ return err;
+}
+
void mlx5_odp_init_mr_cache_entry(struct mlx5_cache_ent *ent)
{
if (!(ent->dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT))
@@ -1223,9 +1619,16 @@ void mlx5_odp_init_mr_cache_entry(struct mlx5_cache_ent *ent)
}
}
+static const struct ib_device_ops mlx5_ib_dev_odp_ops = {
+ .advise_mr = mlx5_ib_advise_mr,
+};
+
int mlx5_ib_odp_init_one(struct mlx5_ib_dev *dev)
{
- int ret;
+ int ret = 0;
+
+ if (dev->odp_caps.general_caps & IB_ODP_SUPPORT)
+ ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_odp_ops);
if (dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT) {
ret = mlx5_cmd_null_mkey(dev->mdev, &dev->null_mkey);
@@ -1235,7 +1638,20 @@ int mlx5_ib_odp_init_one(struct mlx5_ib_dev *dev)
}
}
- return 0;
+ if (!MLX5_CAP_GEN(dev->mdev, pg))
+ return ret;
+
+ ret = mlx5_ib_create_pf_eq(dev, &dev->odp_pf_eq);
+
+ return ret;
+}
+
+void mlx5_ib_odp_cleanup_one(struct mlx5_ib_dev *dev)
+{
+ if (!MLX5_CAP_GEN(dev->mdev, pg))
+ return;
+
+ mlx5_ib_destroy_pf_eq(dev, &dev->odp_pf_eq);
}
int mlx5_ib_odp_init(void)
@@ -1246,3 +1662,160 @@ int mlx5_ib_odp_init(void)
return 0;
}
+struct prefetch_mr_work {
+ struct work_struct work;
+ struct ib_pd *pd;
+ u32 pf_flags;
+ u32 num_sge;
+ struct ib_sge sg_list[0];
+};
+
+static void num_pending_prefetch_dec(struct mlx5_ib_dev *dev,
+ struct ib_sge *sg_list, u32 num_sge,
+ u32 from)
+{
+ u32 i;
+ int srcu_key;
+
+ srcu_key = srcu_read_lock(&dev->mr_srcu);
+
+ for (i = from; i < num_sge; ++i) {
+ struct mlx5_core_mkey *mmkey;
+ struct mlx5_ib_mr *mr;
+
+ mmkey = __mlx5_mr_lookup(dev->mdev,
+ mlx5_base_mkey(sg_list[i].lkey));
+ mr = container_of(mmkey, struct mlx5_ib_mr, mmkey);
+ atomic_dec(&mr->num_pending_prefetch);
+ }
+
+ srcu_read_unlock(&dev->mr_srcu, srcu_key);
+}
+
+static bool num_pending_prefetch_inc(struct ib_pd *pd,
+ struct ib_sge *sg_list, u32 num_sge)
+{
+ struct mlx5_ib_dev *dev = to_mdev(pd->device);
+ bool ret = true;
+ u32 i;
+
+ for (i = 0; i < num_sge; ++i) {
+ struct mlx5_core_mkey *mmkey;
+ struct mlx5_ib_mr *mr;
+
+ mmkey = __mlx5_mr_lookup(dev->mdev,
+ mlx5_base_mkey(sg_list[i].lkey));
+ if (!mmkey || mmkey->key != sg_list[i].lkey) {
+ ret = false;
+ break;
+ }
+
+ if (mmkey->type != MLX5_MKEY_MR) {
+ ret = false;
+ break;
+ }
+
+ mr = container_of(mmkey, struct mlx5_ib_mr, mmkey);
+
+ if (mr->ibmr.pd != pd) {
+ ret = false;
+ break;
+ }
+
+ if (!mr->live) {
+ ret = false;
+ break;
+ }
+
+ atomic_inc(&mr->num_pending_prefetch);
+ }
+
+ if (!ret)
+ num_pending_prefetch_dec(dev, sg_list, i, 0);
+
+ return ret;
+}
+
+static int mlx5_ib_prefetch_sg_list(struct ib_pd *pd, u32 pf_flags,
+ struct ib_sge *sg_list, u32 num_sge)
+{
+ u32 i;
+ int ret = 0;
+ struct mlx5_ib_dev *dev = to_mdev(pd->device);
+
+ for (i = 0; i < num_sge; ++i) {
+ struct ib_sge *sg = &sg_list[i];
+ int bytes_committed = 0;
+
+ ret = pagefault_single_data_segment(dev, pd, sg->lkey, sg->addr,
+ sg->length,
+ &bytes_committed, NULL,
+ pf_flags);
+ if (ret < 0)
+ break;
+ }
+
+ return ret < 0 ? ret : 0;
+}
+
+static void mlx5_ib_prefetch_mr_work(struct work_struct *work)
+{
+ struct prefetch_mr_work *w =
+ container_of(work, struct prefetch_mr_work, work);
+
+ if (ib_device_try_get(w->pd->device)) {
+ mlx5_ib_prefetch_sg_list(w->pd, w->pf_flags, w->sg_list,
+ w->num_sge);
+ ib_device_put(w->pd->device);
+ }
+
+ num_pending_prefetch_dec(to_mdev(w->pd->device), w->sg_list,
+ w->num_sge, 0);
+ kfree(w);
+}
+
+int mlx5_ib_advise_mr_prefetch(struct ib_pd *pd,
+ enum ib_uverbs_advise_mr_advice advice,
+ u32 flags, struct ib_sge *sg_list, u32 num_sge)
+{
+ struct mlx5_ib_dev *dev = to_mdev(pd->device);
+ u32 pf_flags = MLX5_PF_FLAGS_PREFETCH;
+ struct prefetch_mr_work *work;
+ bool valid_req;
+ int srcu_key;
+
+ if (advice == IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH)
+ pf_flags |= MLX5_PF_FLAGS_DOWNGRADE;
+
+ if (flags & IB_UVERBS_ADVISE_MR_FLAG_FLUSH)
+ return mlx5_ib_prefetch_sg_list(pd, pf_flags, sg_list,
+ num_sge);
+
+ work = kvzalloc(struct_size(work, sg_list, num_sge), GFP_KERNEL);
+ if (!work)
+ return -ENOMEM;
+
+ memcpy(work->sg_list, sg_list, num_sge * sizeof(struct ib_sge));
+
+ /* It is guaranteed that the pd when work is executed is the pd when
+ * work was queued since pd can't be destroyed while it holds MRs and
+ * destroying a MR leads to flushing the workqueue
+ */
+ work->pd = pd;
+ work->pf_flags = pf_flags;
+ work->num_sge = num_sge;
+
+ INIT_WORK(&work->work, mlx5_ib_prefetch_mr_work);
+
+ srcu_key = srcu_read_lock(&dev->mr_srcu);
+
+ valid_req = num_pending_prefetch_inc(pd, sg_list, num_sge);
+ if (valid_req)
+ queue_work(system_unbound_wq, &work->work);
+ else
+ kfree(work);
+
+ srcu_read_unlock(&dev->mr_srcu, srcu_key);
+
+ return valid_req ? 0 : -EINVAL;
+}
diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c
index 3747cc681b18..8870c350fda0 100644
--- a/drivers/infiniband/hw/mlx5/qp.c
+++ b/drivers/infiniband/hw/mlx5/qp.c
@@ -108,91 +108,174 @@ static int is_sqp(enum ib_qp_type qp_type)
return is_qp0(qp_type) || is_qp1(qp_type);
}
-static void *get_wqe(struct mlx5_ib_qp *qp, int offset)
-{
- return mlx5_buf_offset(&qp->buf, offset);
-}
-
-static void *get_recv_wqe(struct mlx5_ib_qp *qp, int n)
-{
- return get_wqe(qp, qp->rq.offset + (n << qp->rq.wqe_shift));
-}
-
-void *mlx5_get_send_wqe(struct mlx5_ib_qp *qp, int n)
-{
- return get_wqe(qp, qp->sq.offset + (n << MLX5_IB_SQ_STRIDE));
-}
-
/**
- * mlx5_ib_read_user_wqe() - Copy a user-space WQE to kernel space.
+ * mlx5_ib_read_user_wqe_common() - Copy a WQE (or part of) from user WQ
+ * to kernel buffer
*
- * @qp: QP to copy from.
- * @send: copy from the send queue when non-zero, use the receive queue
- * otherwise.
- * @wqe_index: index to start copying from. For send work queues, the
- * wqe_index is in units of MLX5_SEND_WQE_BB.
- * For receive work queue, it is the number of work queue
- * element in the queue.
- * @buffer: destination buffer.
- * @length: maximum number of bytes to copy.
+ * @umem: User space memory where the WQ is
+ * @buffer: buffer to copy to
+ * @buflen: buffer length
+ * @wqe_index: index of WQE to copy from
+ * @wq_offset: offset to start of WQ
+ * @wq_wqe_cnt: number of WQEs in WQ
+ * @wq_wqe_shift: log2 of WQE size
+ * @bcnt: number of bytes to copy
+ * @bytes_copied: number of bytes actually copied (return value)
*
- * Copies at least a single WQE, but may copy more data.
+ * Copies from start of WQE bcnt or less bytes.
+ * Does not guarantee to copy the entire WQE.
*
- * Return: the number of bytes copied, or an error code.
+ * Return: zero on success, or an error code.
*/
-int mlx5_ib_read_user_wqe(struct mlx5_ib_qp *qp, int send, int wqe_index,
- void *buffer, u32 length,
- struct mlx5_ib_qp_base *base)
+static int mlx5_ib_read_user_wqe_common(struct ib_umem *umem,
+ void *buffer,
+ u32 buflen,
+ int wqe_index,
+ int wq_offset,
+ int wq_wqe_cnt,
+ int wq_wqe_shift,
+ int bcnt,
+ size_t *bytes_copied)
+{
+ size_t offset = wq_offset + ((wqe_index % wq_wqe_cnt) << wq_wqe_shift);
+ size_t wq_end = wq_offset + (wq_wqe_cnt << wq_wqe_shift);
+ size_t copy_length;
+ int ret;
+
+ /* don't copy more than requested, more than buffer length or
+ * beyond WQ end
+ */
+ copy_length = min_t(u32, buflen, wq_end - offset);
+ copy_length = min_t(u32, copy_length, bcnt);
+
+ ret = ib_umem_copy_from(buffer, umem, offset, copy_length);
+ if (ret)
+ return ret;
+
+ if (!ret && bytes_copied)
+ *bytes_copied = copy_length;
+
+ return 0;
+}
+
+int mlx5_ib_read_user_wqe_sq(struct mlx5_ib_qp *qp,
+ int wqe_index,
+ void *buffer,
+ int buflen,
+ size_t *bc)
{
- struct ib_device *ibdev = qp->ibqp.device;
- struct mlx5_ib_dev *dev = to_mdev(ibdev);
- struct mlx5_ib_wq *wq = send ? &qp->sq : &qp->rq;
- size_t offset;
- size_t wq_end;
+ struct mlx5_ib_qp_base *base = &qp->trans_qp.base;
struct ib_umem *umem = base->ubuffer.umem;
- u32 first_copy_length;
- int wqe_length;
+ struct mlx5_ib_wq *wq = &qp->sq;
+ struct mlx5_wqe_ctrl_seg *ctrl;
+ size_t bytes_copied;
+ size_t bytes_copied2;
+ size_t wqe_length;
int ret;
+ int ds;
- if (wq->wqe_cnt == 0) {
- mlx5_ib_dbg(dev, "mlx5_ib_read_user_wqe for a QP with wqe_cnt == 0. qp_type: 0x%x\n",
- qp->ibqp.qp_type);
+ if (buflen < sizeof(*ctrl))
return -EINVAL;
- }
- offset = wq->offset + ((wqe_index % wq->wqe_cnt) << wq->wqe_shift);
- wq_end = wq->offset + (wq->wqe_cnt << wq->wqe_shift);
+ /* at first read as much as possible */
+ ret = mlx5_ib_read_user_wqe_common(umem,
+ buffer,
+ buflen,
+ wqe_index,
+ wq->offset,
+ wq->wqe_cnt,
+ wq->wqe_shift,
+ buflen,
+ &bytes_copied);
+ if (ret)
+ return ret;
- if (send && length < sizeof(struct mlx5_wqe_ctrl_seg))
+ /* we need at least control segment size to proceed */
+ if (bytes_copied < sizeof(*ctrl))
return -EINVAL;
- if (offset > umem->length ||
- (send && offset + sizeof(struct mlx5_wqe_ctrl_seg) > umem->length))
- return -EINVAL;
+ ctrl = buffer;
+ ds = be32_to_cpu(ctrl->qpn_ds) & MLX5_WQE_CTRL_DS_MASK;
+ wqe_length = ds * MLX5_WQE_DS_UNITS;
+
+ /* if we copied enough then we are done */
+ if (bytes_copied >= wqe_length) {
+ *bc = bytes_copied;
+ return 0;
+ }
+
+ /* otherwise this is a wrapped around wqe
+ * so read the remaining bytes starting
+ * from wqe_index 0
+ */
+ ret = mlx5_ib_read_user_wqe_common(umem,
+ buffer + bytes_copied,
+ buflen - bytes_copied,
+ 0,
+ wq->offset,
+ wq->wqe_cnt,
+ wq->wqe_shift,
+ wqe_length - bytes_copied,
+ &bytes_copied2);
- first_copy_length = min_t(u32, offset + length, wq_end) - offset;
- ret = ib_umem_copy_from(buffer, umem, offset, first_copy_length);
if (ret)
return ret;
+ *bc = bytes_copied + bytes_copied2;
+ return 0;
+}
- if (send) {
- struct mlx5_wqe_ctrl_seg *ctrl = buffer;
- int ds = be32_to_cpu(ctrl->qpn_ds) & MLX5_WQE_CTRL_DS_MASK;
-
- wqe_length = ds * MLX5_WQE_DS_UNITS;
- } else {
- wqe_length = 1 << wq->wqe_shift;
- }
+int mlx5_ib_read_user_wqe_rq(struct mlx5_ib_qp *qp,
+ int wqe_index,
+ void *buffer,
+ int buflen,
+ size_t *bc)
+{
+ struct mlx5_ib_qp_base *base = &qp->trans_qp.base;
+ struct ib_umem *umem = base->ubuffer.umem;
+ struct mlx5_ib_wq *wq = &qp->rq;
+ size_t bytes_copied;
+ int ret;
- if (wqe_length <= first_copy_length)
- return first_copy_length;
+ ret = mlx5_ib_read_user_wqe_common(umem,
+ buffer,
+ buflen,
+ wqe_index,
+ wq->offset,
+ wq->wqe_cnt,
+ wq->wqe_shift,
+ buflen,
+ &bytes_copied);
- ret = ib_umem_copy_from(buffer + first_copy_length, umem, wq->offset,
- wqe_length - first_copy_length);
if (ret)
return ret;
+ *bc = bytes_copied;
+ return 0;
+}
+
+int mlx5_ib_read_user_wqe_srq(struct mlx5_ib_srq *srq,
+ int wqe_index,
+ void *buffer,
+ int buflen,
+ size_t *bc)
+{
+ struct ib_umem *umem = srq->umem;
+ size_t bytes_copied;
+ int ret;
+
+ ret = mlx5_ib_read_user_wqe_common(umem,
+ buffer,
+ buflen,
+ wqe_index,
+ 0,
+ srq->msrq.max,
+ srq->msrq.wqe_shift,
+ buflen,
+ &bytes_copied);
- return wqe_length;
+ if (ret)
+ return ret;
+ *bc = bytes_copied;
+ return 0;
}
static void mlx5_ib_qp_event(struct mlx5_core_qp *qp, int type)
@@ -450,9 +533,9 @@ static int set_user_buf_size(struct mlx5_ib_dev *dev,
return -EINVAL;
}
- if (ucmd->sq_wqe_count && ((1 << ilog2(ucmd->sq_wqe_count)) != ucmd->sq_wqe_count)) {
- mlx5_ib_warn(dev, "sq_wqe_count %d, sq_wqe_count %d\n",
- ucmd->sq_wqe_count, ucmd->sq_wqe_count);
+ if (ucmd->sq_wqe_count && !is_power_of_2(ucmd->sq_wqe_count)) {
+ mlx5_ib_warn(dev, "sq_wqe_count %d is not a power of two\n",
+ ucmd->sq_wqe_count);
return -EINVAL;
}
@@ -660,16 +743,14 @@ int bfregn_to_uar_index(struct mlx5_ib_dev *dev,
return bfregi->sys_pages[index_of_sys_page] + offset;
}
-static int mlx5_ib_umem_get(struct mlx5_ib_dev *dev,
- struct ib_pd *pd,
+static int mlx5_ib_umem_get(struct mlx5_ib_dev *dev, struct ib_udata *udata,
unsigned long addr, size_t size,
- struct ib_umem **umem,
- int *npages, int *page_shift, int *ncont,
- u32 *offset)
+ struct ib_umem **umem, int *npages, int *page_shift,
+ int *ncont, u32 *offset)
{
int err;
- *umem = ib_umem_get(pd->uobject->context, addr, size, 0, 0);
+ *umem = ib_umem_get(udata, addr, size, 0, 0);
if (IS_ERR(*umem)) {
mlx5_ib_dbg(dev, "umem_get failed\n");
return PTR_ERR(*umem);
@@ -710,10 +791,11 @@ static void destroy_user_rq(struct mlx5_ib_dev *dev, struct ib_pd *pd,
}
static int create_user_rq(struct mlx5_ib_dev *dev, struct ib_pd *pd,
- struct mlx5_ib_rwq *rwq,
+ struct ib_udata *udata, struct mlx5_ib_rwq *rwq,
struct mlx5_ib_create_wq *ucmd)
{
- struct mlx5_ib_ucontext *context;
+ struct mlx5_ib_ucontext *ucontext = rdma_udata_to_drv_context(
+ udata, struct mlx5_ib_ucontext, ibucontext);
int page_shift = 0;
int npages;
u32 offset = 0;
@@ -723,9 +805,7 @@ static int create_user_rq(struct mlx5_ib_dev *dev, struct ib_pd *pd,
if (!ucmd->buf_addr)
return -EINVAL;
- context = to_mucontext(pd->uobject->context);
- rwq->umem = ib_umem_get(pd->uobject->context, ucmd->buf_addr,
- rwq->buf_size, 0, 0);
+ rwq->umem = ib_umem_get(udata, ucmd->buf_addr, rwq->buf_size, 0, 0);
if (IS_ERR(rwq->umem)) {
mlx5_ib_dbg(dev, "umem_get failed\n");
err = PTR_ERR(rwq->umem);
@@ -750,7 +830,7 @@ static int create_user_rq(struct mlx5_ib_dev *dev, struct ib_pd *pd,
(unsigned long long)ucmd->buf_addr, rwq->buf_size,
npages, page_shift, ncont, offset);
- err = mlx5_ib_db_map_user(context, ucmd->db_addr, &rwq->db);
+ err = mlx5_ib_db_map_user(ucontext, udata, ucmd->db_addr, &rwq->db);
if (err) {
mlx5_ib_dbg(dev, "map failed\n");
goto err_umem;
@@ -790,6 +870,7 @@ static int create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd,
__be64 *pas;
void *qpc;
int err;
+ u16 uid;
err = ib_copy_from_udata(&ucmd, udata, sizeof(ucmd));
if (err) {
@@ -797,7 +878,8 @@ static int create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd,
return err;
}
- context = to_mucontext(pd->uobject->context);
+ context = rdma_udata_to_drv_context(udata, struct mlx5_ib_ucontext,
+ ibucontext);
if (ucmd.flags & MLX5_QP_FLAG_BFREG_INDEX) {
uar_index = bfregn_to_uar_index(dev, &context->bfregi,
ucmd.bfreg_index, true);
@@ -833,10 +915,9 @@ static int create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd,
if (ucmd.buf_addr && ubuffer->buf_size) {
ubuffer->buf_addr = ucmd.buf_addr;
- err = mlx5_ib_umem_get(dev, pd, ubuffer->buf_addr,
- ubuffer->buf_size,
- &ubuffer->umem, &npages, &page_shift,
- &ncont, &offset);
+ err = mlx5_ib_umem_get(dev, udata, ubuffer->buf_addr,
+ ubuffer->buf_size, &ubuffer->umem,
+ &npages, &page_shift, &ncont, &offset);
if (err)
goto err_bfreg;
} else {
@@ -851,7 +932,9 @@ static int create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd,
goto err_umem;
}
- MLX5_SET(create_qp_in, *in, uid, to_mpd(pd)->uid);
+ uid = (attr->qp_type != IB_QPT_XRC_TGT &&
+ attr->qp_type != IB_QPT_XRC_INI) ? to_mpd(pd)->uid : 0;
+ MLX5_SET(create_qp_in, *in, uid, uid);
pas = (__be64 *)MLX5_ADDR_OF(create_qp_in, *in, pas);
if (ubuffer->umem)
mlx5_ib_populate_pas(dev, ubuffer->umem, page_shift, pas, 0);
@@ -868,7 +951,7 @@ static int create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd,
resp->bfreg_index = MLX5_IB_INVALID_BFREG;
qp->bfregn = bfregn;
- err = mlx5_ib_db_map_user(context, ucmd.db_addr, &qp->db);
+ err = mlx5_ib_db_map_user(context, udata, ucmd.db_addr, &qp->db);
if (err) {
mlx5_ib_dbg(dev, "map failed\n");
goto err_free;
@@ -917,6 +1000,30 @@ static void destroy_qp_user(struct mlx5_ib_dev *dev, struct ib_pd *pd,
mlx5_ib_free_bfreg(dev, &context->bfregi, qp->bfregn);
}
+/* get_sq_edge - Get the next nearby edge.
+ *
+ * An 'edge' is defined as the first following address after the end
+ * of the fragment or the SQ. Accordingly, during the WQE construction
+ * which repetitively increases the pointer to write the next data, it
+ * simply should check if it gets to an edge.
+ *
+ * @sq - SQ buffer.
+ * @idx - Stride index in the SQ buffer.
+ *
+ * Return:
+ * The new edge.
+ */
+static void *get_sq_edge(struct mlx5_ib_wq *sq, u32 idx)
+{
+ void *fragment_end;
+
+ fragment_end = mlx5_frag_buf_get_wqe
+ (&sq->fbc,
+ mlx5_frag_buf_get_idx_last_contig_stride(&sq->fbc, idx));
+
+ return fragment_end + MLX5_SEND_WQE_BB;
+}
+
static int create_kernel_qp(struct mlx5_ib_dev *dev,
struct ib_qp_init_attr *init_attr,
struct mlx5_ib_qp *qp,
@@ -955,13 +1062,29 @@ static int create_kernel_qp(struct mlx5_ib_dev *dev,
qp->sq.offset = qp->rq.wqe_cnt << qp->rq.wqe_shift;
base->ubuffer.buf_size = err + (qp->rq.wqe_cnt << qp->rq.wqe_shift);
- err = mlx5_buf_alloc(dev->mdev, base->ubuffer.buf_size, &qp->buf);
+ err = mlx5_frag_buf_alloc_node(dev->mdev, base->ubuffer.buf_size,
+ &qp->buf, dev->mdev->priv.numa_node);
if (err) {
mlx5_ib_dbg(dev, "err %d\n", err);
return err;
}
- qp->sq.qend = mlx5_get_send_wqe(qp, qp->sq.wqe_cnt);
+ if (qp->rq.wqe_cnt)
+ mlx5_init_fbc(qp->buf.frags, qp->rq.wqe_shift,
+ ilog2(qp->rq.wqe_cnt), &qp->rq.fbc);
+
+ if (qp->sq.wqe_cnt) {
+ int sq_strides_offset = (qp->sq.offset & (PAGE_SIZE - 1)) /
+ MLX5_SEND_WQE_BB;
+ mlx5_init_fbc_offset(qp->buf.frags +
+ (qp->sq.offset / PAGE_SIZE),
+ ilog2(MLX5_SEND_WQE_BB),
+ ilog2(qp->sq.wqe_cnt),
+ sq_strides_offset, &qp->sq.fbc);
+
+ qp->sq.cur_edge = get_sq_edge(&qp->sq, 0);
+ }
+
*inlen = MLX5_ST_SZ_BYTES(create_qp_in) +
MLX5_FLD_SZ_BYTES(create_qp_in, pas[0]) * qp->buf.npages;
*in = kvzalloc(*inlen, GFP_KERNEL);
@@ -983,8 +1106,9 @@ static int create_kernel_qp(struct mlx5_ib_dev *dev,
qp->flags |= MLX5_IB_QP_SQPN_QP1;
}
- mlx5_fill_page_array(&qp->buf,
- (__be64 *)MLX5_ADDR_OF(create_qp_in, *in, pas));
+ mlx5_fill_page_frag_array(&qp->buf,
+ (__be64 *)MLX5_ADDR_OF(create_qp_in,
+ *in, pas));
err = mlx5_db_alloc(dev->mdev, &qp->db);
if (err) {
@@ -1024,7 +1148,7 @@ err_free:
kvfree(*in);
err_buf:
- mlx5_buf_free(dev->mdev, &qp->buf);
+ mlx5_frag_buf_free(dev->mdev, &qp->buf);
return err;
}
@@ -1036,7 +1160,7 @@ static void destroy_qp_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp)
kvfree(qp->sq.wr_data);
kvfree(qp->rq.wrid);
mlx5_db_free(dev->mdev, &qp->db);
- mlx5_buf_free(dev->mdev, &qp->buf);
+ mlx5_frag_buf_free(dev->mdev, &qp->buf);
}
static u32 get_rx_type(struct mlx5_ib_qp *qp, struct ib_qp_init_attr *attr)
@@ -1090,6 +1214,7 @@ static void destroy_flow_rule_vport_sq(struct mlx5_ib_dev *dev,
}
static int create_raw_packet_qp_sq(struct mlx5_ib_dev *dev,
+ struct ib_udata *udata,
struct mlx5_ib_sq *sq, void *qpin,
struct ib_pd *pd)
{
@@ -1106,9 +1231,9 @@ static int create_raw_packet_qp_sq(struct mlx5_ib_dev *dev,
int ncont = 0;
u32 offset = 0;
- err = mlx5_ib_umem_get(dev, pd, ubuffer->buf_addr, ubuffer->buf_size,
- &sq->ubuffer.umem, &npages, &page_shift,
- &ncont, &offset);
+ err = mlx5_ib_umem_get(dev, udata, ubuffer->buf_addr, ubuffer->buf_size,
+ &sq->ubuffer.umem, &npages, &page_shift, &ncont,
+ &offset);
if (err)
return err;
@@ -1333,9 +1458,8 @@ static int create_raw_packet_qp(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp,
struct mlx5_ib_raw_packet_qp *raw_packet_qp = &qp->raw_packet_qp;
struct mlx5_ib_sq *sq = &raw_packet_qp->sq;
struct mlx5_ib_rq *rq = &raw_packet_qp->rq;
- struct ib_uobject *uobj = pd->uobject;
- struct ib_ucontext *ucontext = uobj->context;
- struct mlx5_ib_ucontext *mucontext = to_mucontext(ucontext);
+ struct mlx5_ib_ucontext *mucontext = rdma_udata_to_drv_context(
+ udata, struct mlx5_ib_ucontext, ibucontext);
int err;
u32 tdn = mucontext->tdn;
u16 uid = to_mpd(pd)->uid;
@@ -1345,7 +1469,7 @@ static int create_raw_packet_qp(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp,
if (err)
return err;
- err = create_raw_packet_qp_sq(dev, sq, in, pd);
+ err = create_raw_packet_qp_sq(dev, udata, sq, in, pd);
if (err)
goto err_destroy_tis;
@@ -1449,9 +1573,8 @@ static int create_rss_raw_qp_tir(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp,
struct ib_qp_init_attr *init_attr,
struct ib_udata *udata)
{
- struct ib_uobject *uobj = pd->uobject;
- struct ib_ucontext *ucontext = uobj->context;
- struct mlx5_ib_ucontext *mucontext = to_mucontext(ucontext);
+ struct mlx5_ib_ucontext *mucontext = rdma_udata_to_drv_context(
+ udata, struct mlx5_ib_ucontext, ibucontext);
struct mlx5_ib_create_qp_resp resp = {};
int inlen;
int err;
@@ -1695,13 +1818,16 @@ static void configure_responder_scat_cqe(struct ib_qp_init_attr *init_attr,
rcqe_sz = mlx5_ib_get_cqe_size(init_attr->recv_cq);
- if (rcqe_sz == 128) {
- MLX5_SET(qpc, qpc, cs_res, MLX5_RES_SCAT_DATA64_CQE);
+ if (init_attr->qp_type == MLX5_IB_QPT_DCT) {
+ if (rcqe_sz == 128)
+ MLX5_SET(dctc, qpc, cs_res, MLX5_RES_SCAT_DATA64_CQE);
+
return;
}
- if (init_attr->qp_type != MLX5_IB_QPT_DCT)
- MLX5_SET(qpc, qpc, cs_res, MLX5_RES_SCAT_DATA32_CQE);
+ MLX5_SET(qpc, qpc, cs_res,
+ rcqe_sz == 128 ? MLX5_RES_SCAT_DATA64_CQE :
+ MLX5_RES_SCAT_DATA32_CQE);
}
static void configure_requester_scat_cqe(struct mlx5_ib_dev *dev,
@@ -1793,6 +1919,8 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd,
int inlen = MLX5_ST_SZ_BYTES(create_qp_in);
struct mlx5_core_dev *mdev = dev->mdev;
struct mlx5_ib_create_qp_resp resp = {};
+ struct mlx5_ib_ucontext *ucontext = rdma_udata_to_drv_context(
+ udata, struct mlx5_ib_ucontext, ibucontext);
struct mlx5_ib_cq *send_cq;
struct mlx5_ib_cq *recv_cq;
unsigned long flags;
@@ -1876,24 +2004,26 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd,
qp->flags |= MLX5_IB_QP_CVLAN_STRIPPING;
}
- if (pd && pd->uobject) {
+ if (udata) {
if (ib_copy_from_udata(&ucmd, udata, sizeof(ucmd))) {
mlx5_ib_dbg(dev, "copy failed\n");
return -EFAULT;
}
if (!check_flags_mask(ucmd.flags,
+ MLX5_QP_FLAG_ALLOW_SCATTER_CQE |
+ MLX5_QP_FLAG_BFREG_INDEX |
+ MLX5_QP_FLAG_PACKET_BASED_CREDIT_MODE |
+ MLX5_QP_FLAG_SCATTER_CQE |
MLX5_QP_FLAG_SIGNATURE |
- MLX5_QP_FLAG_SCATTER_CQE |
- MLX5_QP_FLAG_TUNNEL_OFFLOADS |
- MLX5_QP_FLAG_BFREG_INDEX |
- MLX5_QP_FLAG_TYPE_DCT |
- MLX5_QP_FLAG_TYPE_DCI |
- MLX5_QP_FLAG_ALLOW_SCATTER_CQE))
+ MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_MC |
+ MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_UC |
+ MLX5_QP_FLAG_TUNNEL_OFFLOADS |
+ MLX5_QP_FLAG_TYPE_DCI |
+ MLX5_QP_FLAG_TYPE_DCT))
return -EINVAL;
- err = get_qp_user_index(to_mucontext(pd->uobject->context),
- &ucmd, udata->inlen, &uidx);
+ err = get_qp_user_index(ucontext, &ucmd, udata->inlen, &uidx);
if (err)
return err;
@@ -1925,6 +2055,15 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd,
qp->flags_en |= MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_MC;
}
+ if (ucmd.flags & MLX5_QP_FLAG_PACKET_BASED_CREDIT_MODE) {
+ if (init_attr->qp_type != IB_QPT_RC ||
+ !MLX5_CAP_GEN(dev->mdev, qp_packet_based)) {
+ mlx5_ib_dbg(dev, "packet based credit mode isn't supported\n");
+ return -EOPNOTSUPP;
+ }
+ qp->flags |= MLX5_IB_QP_PACKET_BASED_CREDIT;
+ }
+
if (init_attr->create_flags & IB_QP_CREATE_SOURCE_QPN) {
if (init_attr->qp_type != IB_QPT_UD ||
(MLX5_CAP_GEN(dev->mdev, port_type) !=
@@ -1948,14 +2087,14 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd,
qp->has_rq = qp_has_rq(init_attr);
err = set_rq_size(dev, &init_attr->cap, qp->has_rq,
- qp, (pd && pd->uobject) ? &ucmd : NULL);
+ qp, udata ? &ucmd : NULL);
if (err) {
mlx5_ib_dbg(dev, "err %d\n", err);
return err;
}
if (pd) {
- if (pd->uobject) {
+ if (udata) {
__u32 max_wqes =
1 << MLX5_CAP_GEN(mdev, log_max_qp_sz);
mlx5_ib_dbg(dev, "requested sq_wqe_count (%d)\n", ucmd.sq_wqe_count);
@@ -2021,11 +2160,12 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd,
MLX5_SET(qpc, qpc, cd_slave_send, 1);
if (qp->flags & MLX5_IB_QP_MANAGED_RECV)
MLX5_SET(qpc, qpc, cd_slave_receive, 1);
-
+ if (qp->flags & MLX5_IB_QP_PACKET_BASED_CREDIT)
+ MLX5_SET(qpc, qpc, req_e2e_credit_mode, 1);
if (qp->scat_cqe && is_connected(init_attr->qp_type)) {
configure_responder_scat_cqe(init_attr, qpc);
configure_requester_scat_cqe(dev, init_attr,
- (pd && pd->uobject) ? &ucmd : NULL,
+ udata ? &ucmd : NULL,
qpc);
}
@@ -2367,8 +2507,11 @@ static const char *ib_qp_type_str(enum ib_qp_type type)
static struct ib_qp *mlx5_ib_create_dct(struct ib_pd *pd,
struct ib_qp_init_attr *attr,
- struct mlx5_ib_create_qp *ucmd)
+ struct mlx5_ib_create_qp *ucmd,
+ struct ib_udata *udata)
{
+ struct mlx5_ib_ucontext *ucontext = rdma_udata_to_drv_context(
+ udata, struct mlx5_ib_ucontext, ibucontext);
struct mlx5_ib_qp *qp;
int err = 0;
u32 uidx = MLX5_IB_DEFAULT_UIDX;
@@ -2377,8 +2520,7 @@ static struct ib_qp *mlx5_ib_create_dct(struct ib_pd *pd,
if (!attr->srq || !attr->recv_cq)
return ERR_PTR(-EINVAL);
- err = get_qp_user_index(to_mucontext(pd->uobject->context),
- ucmd, sizeof(*ucmd), &uidx);
+ err = get_qp_user_index(ucontext, ucmd, sizeof(*ucmd), &uidx);
if (err)
return ERR_PTR(err);
@@ -2460,15 +2602,17 @@ struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd,
int err;
struct ib_qp_init_attr mlx_init_attr;
struct ib_qp_init_attr *init_attr = verbs_init_attr;
+ struct mlx5_ib_ucontext *ucontext = rdma_udata_to_drv_context(
+ udata, struct mlx5_ib_ucontext, ibucontext);
if (pd) {
dev = to_mdev(pd->device);
if (init_attr->qp_type == IB_QPT_RAW_PACKET) {
- if (!pd->uobject) {
+ if (!ucontext) {
mlx5_ib_dbg(dev, "Raw Packet QP is not supported for kernel consumers\n");
return ERR_PTR(-EINVAL);
- } else if (!to_mucontext(pd->uobject->context)->cqe_version) {
+ } else if (!ucontext->cqe_version) {
mlx5_ib_dbg(dev, "Raw Packet QP is only supported for CQE version > 0\n");
return ERR_PTR(-EINVAL);
}
@@ -2500,7 +2644,7 @@ struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd,
return ERR_PTR(-EINVAL);
}
} else {
- return mlx5_ib_create_dct(pd, init_attr, &ucmd);
+ return mlx5_ib_create_dct(pd, init_attr, &ucmd, udata);
}
}
@@ -2611,10 +2755,10 @@ int mlx5_ib_destroy_qp(struct ib_qp *qp)
static int to_mlx5_access_flags(struct mlx5_ib_qp *qp,
const struct ib_qp_attr *attr,
- int attr_mask, __be32 *hw_access_flags)
+ int attr_mask, __be32 *hw_access_flags_be)
{
u8 dest_rd_atomic;
- u32 access_flags;
+ u32 access_flags, hw_access_flags = 0;
struct mlx5_ib_dev *dev = to_mdev(qp->ibqp.device);
@@ -2632,7 +2776,7 @@ static int to_mlx5_access_flags(struct mlx5_ib_qp *qp,
access_flags &= IB_ACCESS_REMOTE_WRITE;
if (access_flags & IB_ACCESS_REMOTE_READ)
- *hw_access_flags |= MLX5_QP_BIT_RRE;
+ hw_access_flags |= MLX5_QP_BIT_RRE;
if (access_flags & IB_ACCESS_REMOTE_ATOMIC) {
int atomic_mode;
@@ -2640,14 +2784,14 @@ static int to_mlx5_access_flags(struct mlx5_ib_qp *qp,
if (atomic_mode < 0)
return -EOPNOTSUPP;
- *hw_access_flags |= MLX5_QP_BIT_RAE;
- *hw_access_flags |= atomic_mode << MLX5_ATOMIC_MODE_OFFSET;
+ hw_access_flags |= MLX5_QP_BIT_RAE;
+ hw_access_flags |= atomic_mode << MLX5_ATOMIC_MODE_OFFSET;
}
if (access_flags & IB_ACCESS_REMOTE_WRITE)
- *hw_access_flags |= MLX5_QP_BIT_RWE;
+ hw_access_flags |= MLX5_QP_BIT_RWE;
- *hw_access_flags = cpu_to_be32(*hw_access_flags);
+ *hw_access_flags_be = cpu_to_be32(hw_access_flags);
return 0;
}
@@ -2663,7 +2807,7 @@ static int ib_rate_to_mlx5(struct mlx5_ib_dev *dev, u8 rate)
if (rate == IB_RATE_PORT_CURRENT)
return 0;
- if (rate < IB_RATE_2_5_GBPS || rate > IB_RATE_300_GBPS)
+ if (rate < IB_RATE_2_5_GBPS || rate > IB_RATE_600_GBPS)
return -EINVAL;
while (rate != IB_RATE_PORT_CURRENT &&
@@ -3138,14 +3282,12 @@ static int modify_raw_packet_qp(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp,
static unsigned int get_tx_affinity(struct mlx5_ib_dev *dev,
struct mlx5_ib_pd *pd,
struct mlx5_ib_qp_base *qp_base,
- u8 port_num)
+ u8 port_num, struct ib_udata *udata)
{
- struct mlx5_ib_ucontext *ucontext = NULL;
+ struct mlx5_ib_ucontext *ucontext = rdma_udata_to_drv_context(
+ udata, struct mlx5_ib_ucontext, ibucontext);
unsigned int tx_port_affinity;
- if (pd && pd->ibpd.uobject && pd->ibpd.uobject->context)
- ucontext = to_mucontext(pd->ibpd.uobject->context);
-
if (ucontext) {
tx_port_affinity = (unsigned int)atomic_add_return(
1, &ucontext->tx_port_affinity) %
@@ -3168,8 +3310,10 @@ static unsigned int get_tx_affinity(struct mlx5_ib_dev *dev,
static int __mlx5_ib_modify_qp(struct ib_qp *ibqp,
const struct ib_qp_attr *attr, int attr_mask,
- enum ib_qp_state cur_state, enum ib_qp_state new_state,
- const struct mlx5_ib_modify_qp *ucmd)
+ enum ib_qp_state cur_state,
+ enum ib_qp_state new_state,
+ const struct mlx5_ib_modify_qp *ucmd,
+ struct ib_udata *udata)
{
static const u16 optab[MLX5_QP_NUM_STATE][MLX5_QP_NUM_STATE] = {
[MLX5_QP_STATE_RST] = {
@@ -3258,9 +3402,10 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp,
(ibqp->qp_type == IB_QPT_RAW_PACKET) ||
(ibqp->qp_type == IB_QPT_XRC_INI) ||
(ibqp->qp_type == IB_QPT_XRC_TGT)) {
- if (mlx5_lag_is_active(dev->mdev)) {
+ if (dev->lag_active) {
u8 p = mlx5_core_native_port_num(dev->mdev);
- tx_affinity = get_tx_affinity(dev, pd, base, p);
+ tx_affinity = get_tx_affinity(dev, pd, base, p,
+ udata);
context->flags |= cpu_to_be32(tx_affinity << 24);
}
}
@@ -3348,7 +3493,7 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp,
}
if (attr_mask & (IB_QP_ACCESS_FLAGS | IB_QP_MAX_DEST_RD_ATOMIC)) {
- __be32 access_flags = 0;
+ __be32 access_flags;
err = to_mlx5_access_flags(qp, attr, attr_mask, &access_flags);
if (err)
@@ -3475,7 +3620,8 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp,
qp->sq.head = 0;
qp->sq.tail = 0;
qp->sq.cur_post = 0;
- qp->sq.last_poll = 0;
+ if (qp->sq.wqe_cnt)
+ qp->sq.cur_edge = get_sq_edge(&qp->sq, 0);
qp->db.db[MLX5_RCV_DBR] = 0;
qp->db.db[MLX5_SND_DBR] = 0;
}
@@ -3515,7 +3661,7 @@ static bool modify_dci_qp_is_ok(enum ib_qp_state cur_state, enum ib_qp_state new
return is_valid_mask(attr_mask, req, opt);
} else if (cur_state == IB_QPS_INIT && new_state == IB_QPS_RTR) {
req |= IB_QP_PATH_MTU;
- opt = IB_QP_PKEY_INDEX;
+ opt = IB_QP_PKEY_INDEX | IB_QP_AV;
return is_valid_mask(attr_mask, req, opt);
} else if (cur_state == IB_QPS_RTR && new_state == IB_QPS_RTS) {
req |= IB_QP_TIMEOUT | IB_QP_RETRY_CNT | IB_QP_RNR_RETRY |
@@ -3586,6 +3732,7 @@ static int mlx5_ib_modify_dct(struct ib_qp *ibqp, struct ib_qp_attr *attr,
} else if (cur_state == IB_QPS_INIT && new_state == IB_QPS_RTR) {
struct mlx5_ib_modify_qp_resp resp = {};
+ u32 out[MLX5_ST_SZ_DW(create_dct_out)] = {0};
u32 min_resp_len = offsetof(typeof(resp), dctn) +
sizeof(resp.dctn);
@@ -3604,7 +3751,8 @@ static int mlx5_ib_modify_dct(struct ib_qp *ibqp, struct ib_qp_attr *attr,
MLX5_SET(dctc, dctc, hop_limit, attr->ah_attr.grh.hop_limit);
err = mlx5_core_create_dct(dev->mdev, &qp->dct.mdct, qp->dct.in,
- MLX5_ST_SZ_BYTES(create_dct_in));
+ MLX5_ST_SZ_BYTES(create_dct_in), out,
+ sizeof(out));
if (err)
return err;
resp.dctn = qp->dct.mdct.mqp.qpn;
@@ -3742,13 +3890,69 @@ int mlx5_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
}
err = __mlx5_ib_modify_qp(ibqp, attr, attr_mask, cur_state,
- new_state, &ucmd);
+ new_state, &ucmd, udata);
out:
mutex_unlock(&qp->mutex);
return err;
}
+static void _handle_post_send_edge(struct mlx5_ib_wq *sq, void **seg,
+ u32 wqe_sz, void **cur_edge)
+{
+ u32 idx;
+
+ idx = (sq->cur_post + (wqe_sz >> 2)) & (sq->wqe_cnt - 1);
+ *cur_edge = get_sq_edge(sq, idx);
+
+ *seg = mlx5_frag_buf_get_wqe(&sq->fbc, idx);
+}
+
+/* handle_post_send_edge - Check if we get to SQ edge. If yes, update to the
+ * next nearby edge and get new address translation for current WQE position.
+ * @sq - SQ buffer.
+ * @seg: Current WQE position (16B aligned).
+ * @wqe_sz: Total current WQE size [16B].
+ * @cur_edge: Updated current edge.
+ */
+static inline void handle_post_send_edge(struct mlx5_ib_wq *sq, void **seg,
+ u32 wqe_sz, void **cur_edge)
+{
+ if (likely(*seg != *cur_edge))
+ return;
+
+ _handle_post_send_edge(sq, seg, wqe_sz, cur_edge);
+}
+
+/* memcpy_send_wqe - copy data from src to WQE and update the relevant WQ's
+ * pointers. At the end @seg is aligned to 16B regardless the copied size.
+ * @sq - SQ buffer.
+ * @cur_edge: Updated current edge.
+ * @seg: Current WQE position (16B aligned).
+ * @wqe_sz: Total current WQE size [16B].
+ * @src: Pointer to copy from.
+ * @n: Number of bytes to copy.
+ */
+static inline void memcpy_send_wqe(struct mlx5_ib_wq *sq, void **cur_edge,
+ void **seg, u32 *wqe_sz, const void *src,
+ size_t n)
+{
+ while (likely(n)) {
+ size_t leftlen = *cur_edge - *seg;
+ size_t copysz = min_t(size_t, leftlen, n);
+ size_t stride;
+
+ memcpy(*seg, src, copysz);
+
+ n -= copysz;
+ src += copysz;
+ stride = !n ? ALIGN(copysz, 16) : copysz;
+ *seg += stride;
+ *wqe_sz += stride >> 4;
+ handle_post_send_edge(sq, seg, *wqe_sz, cur_edge);
+ }
+}
+
static int mlx5_wq_overflow(struct mlx5_ib_wq *wq, int nreq, struct ib_cq *ib_cq)
{
struct mlx5_ib_cq *cq;
@@ -3774,11 +3978,10 @@ static __always_inline void set_raddr_seg(struct mlx5_wqe_raddr_seg *rseg,
rseg->reserved = 0;
}
-static void *set_eth_seg(struct mlx5_wqe_eth_seg *eseg,
- const struct ib_send_wr *wr, void *qend,
- struct mlx5_ib_qp *qp, int *size)
+static void set_eth_seg(const struct ib_send_wr *wr, struct mlx5_ib_qp *qp,
+ void **seg, int *size, void **cur_edge)
{
- void *seg = eseg;
+ struct mlx5_wqe_eth_seg *eseg = *seg;
memset(eseg, 0, sizeof(struct mlx5_wqe_eth_seg));
@@ -3786,45 +3989,41 @@ static void *set_eth_seg(struct mlx5_wqe_eth_seg *eseg,
eseg->cs_flags = MLX5_ETH_WQE_L3_CSUM |
MLX5_ETH_WQE_L4_CSUM;
- seg += sizeof(struct mlx5_wqe_eth_seg);
- *size += sizeof(struct mlx5_wqe_eth_seg) / 16;
-
if (wr->opcode == IB_WR_LSO) {
struct ib_ud_wr *ud_wr = container_of(wr, struct ib_ud_wr, wr);
- int size_of_inl_hdr_start = sizeof(eseg->inline_hdr.start);
- u64 left, leftlen, copysz;
+ size_t left, copysz;
void *pdata = ud_wr->header;
+ size_t stride;
left = ud_wr->hlen;
eseg->mss = cpu_to_be16(ud_wr->mss);
eseg->inline_hdr.sz = cpu_to_be16(left);
- /*
- * check if there is space till the end of queue, if yes,
- * copy all in one shot, otherwise copy till the end of queue,
- * rollback and than the copy the left
+ /* memcpy_send_wqe should get a 16B align address. Hence, we
+ * first copy up to the current edge and then, if needed,
+ * fall-through to memcpy_send_wqe.
*/
- leftlen = qend - (void *)eseg->inline_hdr.start;
- copysz = min_t(u64, leftlen, left);
-
- memcpy(seg - size_of_inl_hdr_start, pdata, copysz);
-
- if (likely(copysz > size_of_inl_hdr_start)) {
- seg += ALIGN(copysz - size_of_inl_hdr_start, 16);
- *size += ALIGN(copysz - size_of_inl_hdr_start, 16) / 16;
- }
-
- if (unlikely(copysz < left)) { /* the last wqe in the queue */
- seg = mlx5_get_send_wqe(qp, 0);
+ copysz = min_t(u64, *cur_edge - (void *)eseg->inline_hdr.start,
+ left);
+ memcpy(eseg->inline_hdr.start, pdata, copysz);
+ stride = ALIGN(sizeof(struct mlx5_wqe_eth_seg) -
+ sizeof(eseg->inline_hdr.start) + copysz, 16);
+ *size += stride / 16;
+ *seg += stride;
+
+ if (copysz < left) {
+ handle_post_send_edge(&qp->sq, seg, *size, cur_edge);
left -= copysz;
pdata += copysz;
- memcpy(seg, pdata, left);
- seg += ALIGN(left, 16);
- *size += ALIGN(left, 16) / 16;
+ memcpy_send_wqe(&qp->sq, cur_edge, seg, size, pdata,
+ left);
}
+
+ return;
}
- return seg;
+ *seg += sizeof(struct mlx5_wqe_eth_seg);
+ *size += sizeof(struct mlx5_wqe_eth_seg) / 16;
}
static void set_datagram_seg(struct mlx5_wqe_datagram_seg *dseg,
@@ -4083,24 +4282,6 @@ static void set_reg_data_seg(struct mlx5_wqe_data_seg *dseg,
dseg->lkey = cpu_to_be32(pd->ibpd.local_dma_lkey);
}
-static void set_reg_umr_inline_seg(void *seg, struct mlx5_ib_qp *qp,
- struct mlx5_ib_mr *mr, int mr_list_size)
-{
- void *qend = qp->sq.qend;
- void *addr = mr->descs;
- int copy;
-
- if (unlikely(seg + mr_list_size > qend)) {
- copy = qend - seg;
- memcpy(seg, addr, copy);
- addr += copy;
- mr_list_size -= copy;
- seg = mlx5_get_send_wqe(qp, 0);
- }
- memcpy(seg, addr, mr_list_size);
- seg += mr_list_size;
-}
-
static __be32 send_ieth(const struct ib_send_wr *wr)
{
switch (wr->opcode) {
@@ -4134,40 +4315,48 @@ static u8 wq_sig(void *wqe)
}
static int set_data_inl_seg(struct mlx5_ib_qp *qp, const struct ib_send_wr *wr,
- void *wqe, int *sz)
+ void **wqe, int *wqe_sz, void **cur_edge)
{
struct mlx5_wqe_inline_seg *seg;
- void *qend = qp->sq.qend;
- void *addr;
+ size_t offset;
int inl = 0;
- int copy;
- int len;
int i;
- seg = wqe;
- wqe += sizeof(*seg);
+ seg = *wqe;
+ *wqe += sizeof(*seg);
+ offset = sizeof(*seg);
+
for (i = 0; i < wr->num_sge; i++) {
- addr = (void *)(unsigned long)(wr->sg_list[i].addr);
- len = wr->sg_list[i].length;
+ size_t len = wr->sg_list[i].length;
+ void *addr = (void *)(unsigned long)(wr->sg_list[i].addr);
+
inl += len;
if (unlikely(inl > qp->max_inline_data))
return -ENOMEM;
- if (unlikely(wqe + len > qend)) {
- copy = qend - wqe;
- memcpy(wqe, addr, copy);
- addr += copy;
- len -= copy;
- wqe = mlx5_get_send_wqe(qp, 0);
+ while (likely(len)) {
+ size_t leftlen;
+ size_t copysz;
+
+ handle_post_send_edge(&qp->sq, wqe,
+ *wqe_sz + (offset >> 4),
+ cur_edge);
+
+ leftlen = *cur_edge - *wqe;
+ copysz = min_t(size_t, leftlen, len);
+
+ memcpy(*wqe, addr, copysz);
+ len -= copysz;
+ addr += copysz;
+ *wqe += copysz;
+ offset += copysz;
}
- memcpy(wqe, addr, len);
- wqe += len;
}
seg->byte_count = cpu_to_be32(inl | MLX5_INLINE_SEG);
- *sz = ALIGN(inl + sizeof(seg->byte_count), 16) / 16;
+ *wqe_sz += ALIGN(inl + sizeof(seg->byte_count), 16) / 16;
return 0;
}
@@ -4280,7 +4469,8 @@ static int mlx5_set_bsf(struct ib_mr *sig_mr,
}
static int set_sig_data_segment(const struct ib_sig_handover_wr *wr,
- struct mlx5_ib_qp *qp, void **seg, int *size)
+ struct mlx5_ib_qp *qp, void **seg,
+ int *size, void **cur_edge)
{
struct ib_sig_attrs *sig_attrs = wr->sig_attrs;
struct ib_mr *sig_mr = wr->sig_mr;
@@ -4364,8 +4554,7 @@ static int set_sig_data_segment(const struct ib_sig_handover_wr *wr,
*seg += wqe_size;
*size += wqe_size / 16;
- if (unlikely((*seg == qp->sq.qend)))
- *seg = mlx5_get_send_wqe(qp, 0);
+ handle_post_send_edge(&qp->sq, seg, *size, cur_edge);
bsf = *seg;
ret = mlx5_set_bsf(sig_mr, sig_attrs, bsf, data_len);
@@ -4374,8 +4563,7 @@ static int set_sig_data_segment(const struct ib_sig_handover_wr *wr,
*seg += sizeof(*bsf);
*size += sizeof(*bsf) / 16;
- if (unlikely((*seg == qp->sq.qend)))
- *seg = mlx5_get_send_wqe(qp, 0);
+ handle_post_send_edge(&qp->sq, seg, *size, cur_edge);
return 0;
}
@@ -4413,7 +4601,8 @@ static void set_sig_umr_segment(struct mlx5_wqe_umr_ctrl_seg *umr,
static int set_sig_umr_wr(const struct ib_send_wr *send_wr,
- struct mlx5_ib_qp *qp, void **seg, int *size)
+ struct mlx5_ib_qp *qp, void **seg, int *size,
+ void **cur_edge)
{
const struct ib_sig_handover_wr *wr = sig_handover_wr(send_wr);
struct mlx5_ib_mr *sig_mr = to_mmr(wr->sig_mr);
@@ -4445,16 +4634,14 @@ static int set_sig_umr_wr(const struct ib_send_wr *send_wr,
set_sig_umr_segment(*seg, xlt_size);
*seg += sizeof(struct mlx5_wqe_umr_ctrl_seg);
*size += sizeof(struct mlx5_wqe_umr_ctrl_seg) / 16;
- if (unlikely((*seg == qp->sq.qend)))
- *seg = mlx5_get_send_wqe(qp, 0);
+ handle_post_send_edge(&qp->sq, seg, *size, cur_edge);
set_sig_mkey_segment(*seg, wr, xlt_size, region_len, pdn);
*seg += sizeof(struct mlx5_mkey_seg);
*size += sizeof(struct mlx5_mkey_seg) / 16;
- if (unlikely((*seg == qp->sq.qend)))
- *seg = mlx5_get_send_wqe(qp, 0);
+ handle_post_send_edge(&qp->sq, seg, *size, cur_edge);
- ret = set_sig_data_segment(wr, qp, seg, size);
+ ret = set_sig_data_segment(wr, qp, seg, size, cur_edge);
if (ret)
return ret;
@@ -4491,11 +4678,11 @@ static int set_psv_wr(struct ib_sig_domain *domain,
static int set_reg_wr(struct mlx5_ib_qp *qp,
const struct ib_reg_wr *wr,
- void **seg, int *size)
+ void **seg, int *size, void **cur_edge)
{
struct mlx5_ib_mr *mr = to_mmr(wr->mr);
struct mlx5_ib_pd *pd = to_mpd(qp->ibqp.pd);
- int mr_list_size = mr->ndescs * mr->desc_size;
+ size_t mr_list_size = mr->ndescs * mr->desc_size;
bool umr_inline = mr_list_size <= MLX5_IB_SQ_UMR_INLINE_THRESHOLD;
if (unlikely(wr->wr.send_flags & IB_SEND_INLINE)) {
@@ -4507,18 +4694,17 @@ static int set_reg_wr(struct mlx5_ib_qp *qp,
set_reg_umr_seg(*seg, mr, umr_inline);
*seg += sizeof(struct mlx5_wqe_umr_ctrl_seg);
*size += sizeof(struct mlx5_wqe_umr_ctrl_seg) / 16;
- if (unlikely((*seg == qp->sq.qend)))
- *seg = mlx5_get_send_wqe(qp, 0);
+ handle_post_send_edge(&qp->sq, seg, *size, cur_edge);
set_reg_mkey_seg(*seg, mr, wr->key, wr->access);
*seg += sizeof(struct mlx5_mkey_seg);
*size += sizeof(struct mlx5_mkey_seg) / 16;
- if (unlikely((*seg == qp->sq.qend)))
- *seg = mlx5_get_send_wqe(qp, 0);
+ handle_post_send_edge(&qp->sq, seg, *size, cur_edge);
if (umr_inline) {
- set_reg_umr_inline_seg(*seg, qp, mr, mr_list_size);
- *size += get_xlt_octo(mr_list_size);
+ memcpy_send_wqe(&qp->sq, cur_edge, seg, size, mr->descs,
+ mr_list_size);
+ *size = ALIGN(*size, MLX5_SEND_WQE_BB >> 4);
} else {
set_reg_data_seg(*seg, mr, pd);
*seg += sizeof(struct mlx5_wqe_data_seg);
@@ -4527,32 +4713,31 @@ static int set_reg_wr(struct mlx5_ib_qp *qp,
return 0;
}
-static void set_linv_wr(struct mlx5_ib_qp *qp, void **seg, int *size)
+static void set_linv_wr(struct mlx5_ib_qp *qp, void **seg, int *size,
+ void **cur_edge)
{
set_linv_umr_seg(*seg);
*seg += sizeof(struct mlx5_wqe_umr_ctrl_seg);
*size += sizeof(struct mlx5_wqe_umr_ctrl_seg) / 16;
- if (unlikely((*seg == qp->sq.qend)))
- *seg = mlx5_get_send_wqe(qp, 0);
+ handle_post_send_edge(&qp->sq, seg, *size, cur_edge);
set_linv_mkey_seg(*seg);
*seg += sizeof(struct mlx5_mkey_seg);
*size += sizeof(struct mlx5_mkey_seg) / 16;
- if (unlikely((*seg == qp->sq.qend)))
- *seg = mlx5_get_send_wqe(qp, 0);
+ handle_post_send_edge(&qp->sq, seg, *size, cur_edge);
}
-static void dump_wqe(struct mlx5_ib_qp *qp, int idx, int size_16)
+static void dump_wqe(struct mlx5_ib_qp *qp, u32 idx, int size_16)
{
__be32 *p = NULL;
- int tidx = idx;
+ u32 tidx = idx;
int i, j;
- pr_debug("dump wqe at %p\n", mlx5_get_send_wqe(qp, tidx));
+ pr_debug("dump WQE index %u:\n", idx);
for (i = 0, j = 0; i < size_16 * 4; i += 4, j += 4) {
if ((i & 0xf) == 0) {
- void *buf = mlx5_get_send_wqe(qp, tidx);
tidx = (tidx + 1) & (qp->sq.wqe_cnt - 1);
- p = buf;
+ p = mlx5_frag_buf_get_wqe(&qp->sq.fbc, tidx);
+ pr_debug("WQBB at %p:\n", (void *)p);
j = 0;
}
pr_debug("%08x %08x %08x %08x\n", be32_to_cpu(p[j]),
@@ -4562,15 +4747,16 @@ static void dump_wqe(struct mlx5_ib_qp *qp, int idx, int size_16)
}
static int __begin_wqe(struct mlx5_ib_qp *qp, void **seg,
- struct mlx5_wqe_ctrl_seg **ctrl,
- const struct ib_send_wr *wr, unsigned *idx,
- int *size, int nreq, bool send_signaled, bool solicited)
+ struct mlx5_wqe_ctrl_seg **ctrl,
+ const struct ib_send_wr *wr, unsigned int *idx,
+ int *size, void **cur_edge, int nreq,
+ bool send_signaled, bool solicited)
{
if (unlikely(mlx5_wq_overflow(&qp->sq, nreq, qp->ibqp.send_cq)))
return -ENOMEM;
*idx = qp->sq.cur_post & (qp->sq.wqe_cnt - 1);
- *seg = mlx5_get_send_wqe(qp, *idx);
+ *seg = mlx5_frag_buf_get_wqe(&qp->sq.fbc, *idx);
*ctrl = *seg;
*(uint32_t *)(*seg + 8) = 0;
(*ctrl)->imm = send_ieth(wr);
@@ -4580,6 +4766,7 @@ static int __begin_wqe(struct mlx5_ib_qp *qp, void **seg,
*seg += sizeof(**ctrl);
*size = sizeof(**ctrl) / 16;
+ *cur_edge = qp->sq.cur_edge;
return 0;
}
@@ -4587,17 +4774,18 @@ static int __begin_wqe(struct mlx5_ib_qp *qp, void **seg,
static int begin_wqe(struct mlx5_ib_qp *qp, void **seg,
struct mlx5_wqe_ctrl_seg **ctrl,
const struct ib_send_wr *wr, unsigned *idx,
- int *size, int nreq)
+ int *size, void **cur_edge, int nreq)
{
- return __begin_wqe(qp, seg, ctrl, wr, idx, size, nreq,
+ return __begin_wqe(qp, seg, ctrl, wr, idx, size, cur_edge, nreq,
wr->send_flags & IB_SEND_SIGNALED,
wr->send_flags & IB_SEND_SOLICITED);
}
static void finish_wqe(struct mlx5_ib_qp *qp,
struct mlx5_wqe_ctrl_seg *ctrl,
- u8 size, unsigned idx, u64 wr_id,
- int nreq, u8 fence, u32 mlx5_opcode)
+ void *seg, u8 size, void *cur_edge,
+ unsigned int idx, u64 wr_id, int nreq, u8 fence,
+ u32 mlx5_opcode)
{
u8 opmod = 0;
@@ -4613,6 +4801,15 @@ static void finish_wqe(struct mlx5_ib_qp *qp,
qp->sq.wqe_head[idx] = qp->sq.head + nreq;
qp->sq.cur_post += DIV_ROUND_UP(size * 16, MLX5_SEND_WQE_BB);
qp->sq.w_list[idx].next = qp->sq.cur_post;
+
+ /* We save the edge which was possibly updated during the WQE
+ * construction, into SQ's cache.
+ */
+ seg = PTR_ALIGN(seg, MLX5_SEND_WQE_BB);
+ qp->sq.cur_edge = (unlikely(seg == cur_edge)) ?
+ get_sq_edge(&qp->sq, qp->sq.cur_post &
+ (qp->sq.wqe_cnt - 1)) :
+ cur_edge;
}
static int _mlx5_ib_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr,
@@ -4623,11 +4820,10 @@ static int _mlx5_ib_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr,
struct mlx5_core_dev *mdev = dev->mdev;
struct mlx5_ib_qp *qp;
struct mlx5_ib_mr *mr;
- struct mlx5_wqe_data_seg *dpseg;
struct mlx5_wqe_xrc_seg *xrc;
struct mlx5_bf *bf;
+ void *cur_edge;
int uninitialized_var(size);
- void *qend;
unsigned long flags;
unsigned idx;
int err = 0;
@@ -4649,7 +4845,6 @@ static int _mlx5_ib_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr,
qp = to_mqp(ibqp);
bf = &qp->bf;
- qend = qp->sq.qend;
spin_lock_irqsave(&qp->sq.lock, flags);
@@ -4669,7 +4864,8 @@ static int _mlx5_ib_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr,
goto out;
}
- err = begin_wqe(qp, &seg, &ctrl, wr, &idx, &size, nreq);
+ err = begin_wqe(qp, &seg, &ctrl, wr, &idx, &size, &cur_edge,
+ nreq);
if (err) {
mlx5_ib_warn(dev, "\n");
err = -ENOMEM;
@@ -4719,14 +4915,15 @@ static int _mlx5_ib_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr,
case IB_WR_LOCAL_INV:
qp->sq.wr_data[idx] = IB_WR_LOCAL_INV;
ctrl->imm = cpu_to_be32(wr->ex.invalidate_rkey);
- set_linv_wr(qp, &seg, &size);
+ set_linv_wr(qp, &seg, &size, &cur_edge);
num_sge = 0;
break;
case IB_WR_REG_MR:
qp->sq.wr_data[idx] = IB_WR_REG_MR;
ctrl->imm = cpu_to_be32(reg_wr(wr)->key);
- err = set_reg_wr(qp, reg_wr(wr), &seg, &size);
+ err = set_reg_wr(qp, reg_wr(wr), &seg, &size,
+ &cur_edge);
if (err) {
*bad_wr = wr;
goto out;
@@ -4739,21 +4936,24 @@ static int _mlx5_ib_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr,
mr = to_mmr(sig_handover_wr(wr)->sig_mr);
ctrl->imm = cpu_to_be32(mr->ibmr.rkey);
- err = set_sig_umr_wr(wr, qp, &seg, &size);
+ err = set_sig_umr_wr(wr, qp, &seg, &size,
+ &cur_edge);
if (err) {
mlx5_ib_warn(dev, "\n");
*bad_wr = wr;
goto out;
}
- finish_wqe(qp, ctrl, size, idx, wr->wr_id, nreq,
- fence, MLX5_OPCODE_UMR);
+ finish_wqe(qp, ctrl, seg, size, cur_edge, idx,
+ wr->wr_id, nreq, fence,
+ MLX5_OPCODE_UMR);
/*
* SET_PSV WQEs are not signaled and solicited
* on error
*/
err = __begin_wqe(qp, &seg, &ctrl, wr, &idx,
- &size, nreq, false, true);
+ &size, &cur_edge, nreq, false,
+ true);
if (err) {
mlx5_ib_warn(dev, "\n");
err = -ENOMEM;
@@ -4770,10 +4970,12 @@ static int _mlx5_ib_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr,
goto out;
}
- finish_wqe(qp, ctrl, size, idx, wr->wr_id, nreq,
- fence, MLX5_OPCODE_SET_PSV);
+ finish_wqe(qp, ctrl, seg, size, cur_edge, idx,
+ wr->wr_id, nreq, fence,
+ MLX5_OPCODE_SET_PSV);
err = __begin_wqe(qp, &seg, &ctrl, wr, &idx,
- &size, nreq, false, true);
+ &size, &cur_edge, nreq, false,
+ true);
if (err) {
mlx5_ib_warn(dev, "\n");
err = -ENOMEM;
@@ -4790,8 +4992,9 @@ static int _mlx5_ib_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr,
goto out;
}
- finish_wqe(qp, ctrl, size, idx, wr->wr_id, nreq,
- fence, MLX5_OPCODE_SET_PSV);
+ finish_wqe(qp, ctrl, seg, size, cur_edge, idx,
+ wr->wr_id, nreq, fence,
+ MLX5_OPCODE_SET_PSV);
qp->next_fence = MLX5_FENCE_MODE_INITIATOR_SMALL;
num_sge = 0;
goto skip_psv;
@@ -4828,16 +5031,14 @@ static int _mlx5_ib_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr,
set_datagram_seg(seg, wr);
seg += sizeof(struct mlx5_wqe_datagram_seg);
size += sizeof(struct mlx5_wqe_datagram_seg) / 16;
- if (unlikely((seg == qend)))
- seg = mlx5_get_send_wqe(qp, 0);
+ handle_post_send_edge(&qp->sq, &seg, size, &cur_edge);
+
break;
case IB_QPT_UD:
set_datagram_seg(seg, wr);
seg += sizeof(struct mlx5_wqe_datagram_seg);
size += sizeof(struct mlx5_wqe_datagram_seg) / 16;
-
- if (unlikely((seg == qend)))
- seg = mlx5_get_send_wqe(qp, 0);
+ handle_post_send_edge(&qp->sq, &seg, size, &cur_edge);
/* handle qp that supports ud offload */
if (qp->flags & IB_QP_CREATE_IPOIB_UD_LSO) {
@@ -4847,11 +5048,9 @@ static int _mlx5_ib_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr,
memset(pad, 0, sizeof(struct mlx5_wqe_eth_pad));
seg += sizeof(struct mlx5_wqe_eth_pad);
size += sizeof(struct mlx5_wqe_eth_pad) / 16;
-
- seg = set_eth_seg(seg, wr, qend, qp, &size);
-
- if (unlikely((seg == qend)))
- seg = mlx5_get_send_wqe(qp, 0);
+ set_eth_seg(wr, qp, &seg, &size, &cur_edge);
+ handle_post_send_edge(&qp->sq, &seg, size,
+ &cur_edge);
}
break;
case MLX5_IB_QPT_REG_UMR:
@@ -4867,13 +5066,11 @@ static int _mlx5_ib_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr,
goto out;
seg += sizeof(struct mlx5_wqe_umr_ctrl_seg);
size += sizeof(struct mlx5_wqe_umr_ctrl_seg) / 16;
- if (unlikely((seg == qend)))
- seg = mlx5_get_send_wqe(qp, 0);
+ handle_post_send_edge(&qp->sq, &seg, size, &cur_edge);
set_reg_mkey_segment(seg, wr);
seg += sizeof(struct mlx5_mkey_seg);
size += sizeof(struct mlx5_mkey_seg) / 16;
- if (unlikely((seg == qend)))
- seg = mlx5_get_send_wqe(qp, 0);
+ handle_post_send_edge(&qp->sq, &seg, size, &cur_edge);
break;
default:
@@ -4881,33 +5078,29 @@ static int _mlx5_ib_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr,
}
if (wr->send_flags & IB_SEND_INLINE && num_sge) {
- int uninitialized_var(sz);
-
- err = set_data_inl_seg(qp, wr, seg, &sz);
+ err = set_data_inl_seg(qp, wr, &seg, &size, &cur_edge);
if (unlikely(err)) {
mlx5_ib_warn(dev, "\n");
*bad_wr = wr;
goto out;
}
- size += sz;
} else {
- dpseg = seg;
for (i = 0; i < num_sge; i++) {
- if (unlikely(dpseg == qend)) {
- seg = mlx5_get_send_wqe(qp, 0);
- dpseg = seg;
- }
+ handle_post_send_edge(&qp->sq, &seg, size,
+ &cur_edge);
if (likely(wr->sg_list[i].length)) {
- set_data_ptr_seg(dpseg, wr->sg_list + i);
+ set_data_ptr_seg
+ ((struct mlx5_wqe_data_seg *)seg,
+ wr->sg_list + i);
size += sizeof(struct mlx5_wqe_data_seg) / 16;
- dpseg++;
+ seg += sizeof(struct mlx5_wqe_data_seg);
}
}
}
qp->next_fence = next_fence;
- finish_wqe(qp, ctrl, size, idx, wr->wr_id, nreq, fence,
- mlx5_ib_opcode[wr->opcode]);
+ finish_wqe(qp, ctrl, seg, size, cur_edge, idx, wr->wr_id, nreq,
+ fence, mlx5_ib_opcode[wr->opcode]);
skip_psv:
if (0)
dump_wqe(qp, idx, size);
@@ -4993,7 +5186,7 @@ static int _mlx5_ib_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr,
goto out;
}
- scat = get_recv_wqe(qp, ind);
+ scat = mlx5_frag_buf_get_wqe(&qp->rq.fbc, ind);
if (qp->wq_sig)
scat++;
@@ -5441,7 +5634,6 @@ struct ib_xrcd *mlx5_ib_alloc_xrcd(struct ib_device *ibdev,
struct mlx5_ib_dev *dev = to_mdev(ibdev);
struct mlx5_ib_xrcd *xrcd;
int err;
- u16 uid;
if (!MLX5_CAP_GEN(dev->mdev, xrc))
return ERR_PTR(-ENOSYS);
@@ -5450,14 +5642,12 @@ struct ib_xrcd *mlx5_ib_alloc_xrcd(struct ib_device *ibdev,
if (!xrcd)
return ERR_PTR(-ENOMEM);
- uid = context ? to_mucontext(context)->devx_uid : 0;
- err = mlx5_cmd_xrcd_alloc(dev->mdev, &xrcd->xrcdn, uid);
+ err = mlx5_cmd_xrcd_alloc(dev->mdev, &xrcd->xrcdn, 0);
if (err) {
kfree(xrcd);
return ERR_PTR(-ENOMEM);
}
- xrcd->uid = uid;
return &xrcd->ibxrcd;
}
@@ -5465,10 +5655,9 @@ int mlx5_ib_dealloc_xrcd(struct ib_xrcd *xrcd)
{
struct mlx5_ib_dev *dev = to_mdev(xrcd->device);
u32 xrcdn = to_mxrcd(xrcd)->xrcdn;
- u16 uid = to_mxrcd(xrcd)->uid;
int err;
- err = mlx5_cmd_xrcd_dealloc(dev->mdev, xrcdn, uid);
+ err = mlx5_cmd_xrcd_dealloc(dev->mdev, xrcdn, 0);
if (err)
mlx5_ib_warn(dev, "failed to dealloc xrcdn 0x%x\n", xrcdn);
@@ -5711,7 +5900,7 @@ static int prepare_user_rq(struct ib_pd *pd,
return err;
}
- err = create_user_rq(dev, pd, rwq, &ucmd);
+ err = create_user_rq(dev, pd, udata, rwq, &ucmd);
if (err) {
mlx5_ib_dbg(dev, "err %d\n", err);
return err;
diff --git a/drivers/infiniband/hw/mlx5/srq.c b/drivers/infiniband/hw/mlx5/srq.c
index d012e7dbcc38..1ec1beb1296b 100644
--- a/drivers/infiniband/hw/mlx5/srq.c
+++ b/drivers/infiniband/hw/mlx5/srq.c
@@ -1,50 +1,19 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
/*
- * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses. You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- * Redistribution and use in source and binary forms, with or
- * without modification, are permitted provided that the following
- * conditions are met:
- *
- * - Redistributions of source code must retain the above
- * copyright notice, this list of conditions and the following
- * disclaimer.
- *
- * - Redistributions in binary form must reproduce the above
- * copyright notice, this list of conditions and the following
- * disclaimer in the documentation and/or other materials
- * provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
+ * Copyright (c) 2013-2018, Mellanox Technologies inc. All rights reserved.
*/
#include <linux/module.h>
#include <linux/mlx5/qp.h>
-#include <linux/mlx5/srq.h>
#include <linux/slab.h>
#include <rdma/ib_umem.h>
#include <rdma/ib_user_verbs.h>
-
#include "mlx5_ib.h"
-
-/* not supported currently */
-static int srq_signature;
+#include "srq.h"
static void *get_wqe(struct mlx5_ib_srq *srq, int n)
{
- return mlx5_buf_offset(&srq->buf, n << srq->msrq.wqe_shift);
+ return mlx5_frag_buf_get_wqe(&srq->fbc, n);
}
static void mlx5_ib_srq_event(struct mlx5_core_srq *srq, enum mlx5_event type)
@@ -78,6 +47,8 @@ static int create_srq_user(struct ib_pd *pd, struct mlx5_ib_srq *srq,
{
struct mlx5_ib_dev *dev = to_mdev(pd->device);
struct mlx5_ib_create_srq ucmd = {};
+ struct mlx5_ib_ucontext *ucontext = rdma_udata_to_drv_context(
+ udata, struct mlx5_ib_ucontext, ibucontext);
size_t ucmdlen;
int err;
int npages;
@@ -102,16 +73,14 @@ static int create_srq_user(struct ib_pd *pd, struct mlx5_ib_srq *srq,
return -EINVAL;
if (in->type != IB_SRQT_BASIC) {
- err = get_srq_user_index(to_mucontext(pd->uobject->context),
- &ucmd, udata->inlen, &uidx);
+ err = get_srq_user_index(ucontext, &ucmd, udata->inlen, &uidx);
if (err)
return err;
}
srq->wq_sig = !!(ucmd.flags & MLX5_SRQ_FLAG_SIGNATURE);
- srq->umem = ib_umem_get(pd->uobject->context, ucmd.buf_addr, buf_size,
- 0, 0);
+ srq->umem = ib_umem_get(udata, ucmd.buf_addr, buf_size, 0, 0);
if (IS_ERR(srq->umem)) {
mlx5_ib_dbg(dev, "failed umem get, size %d\n", buf_size);
err = PTR_ERR(srq->umem);
@@ -135,8 +104,7 @@ static int create_srq_user(struct ib_pd *pd, struct mlx5_ib_srq *srq,
mlx5_ib_populate_pas(dev, srq->umem, page_shift, in->pas, 0);
- err = mlx5_ib_db_map_user(to_mucontext(pd->uobject->context),
- ucmd.db_addr, &srq->db);
+ err = mlx5_ib_db_map_user(ucontext, udata, ucmd.db_addr, &srq->db);
if (err) {
mlx5_ib_dbg(dev, "map doorbell failed\n");
goto err_in;
@@ -144,7 +112,7 @@ static int create_srq_user(struct ib_pd *pd, struct mlx5_ib_srq *srq,
in->log_page_size = page_shift - MLX5_ADAPTER_PAGE_SHIFT;
in->page_offset = offset;
- in->uid = to_mpd(pd)->uid;
+ in->uid = (in->type != IB_SRQT_XRC) ? to_mpd(pd)->uid : 0;
if (MLX5_CAP_GEN(dev->mdev, cqe_version) == MLX5_CQE_VERSION_V1 &&
in->type != IB_SRQT_BASIC)
in->user_index = uidx;
@@ -173,12 +141,16 @@ static int create_srq_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_srq *srq,
return err;
}
- if (mlx5_buf_alloc(dev->mdev, buf_size, &srq->buf)) {
+ if (mlx5_frag_buf_alloc_node(dev->mdev, buf_size, &srq->buf,
+ dev->mdev->priv.numa_node)) {
mlx5_ib_dbg(dev, "buf alloc failed\n");
err = -ENOMEM;
goto err_db;
}
+ mlx5_init_fbc(srq->buf.frags, srq->msrq.wqe_shift, ilog2(srq->msrq.max),
+ &srq->fbc);
+
srq->head = 0;
srq->tail = srq->msrq.max - 1;
srq->wqe_ctr = 0;
@@ -195,14 +167,14 @@ static int create_srq_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_srq *srq,
err = -ENOMEM;
goto err_buf;
}
- mlx5_fill_page_array(&srq->buf, in->pas);
+ mlx5_fill_page_frag_array(&srq->buf, in->pas);
srq->wrid = kvmalloc_array(srq->msrq.max, sizeof(u64), GFP_KERNEL);
if (!srq->wrid) {
err = -ENOMEM;
goto err_in;
}
- srq->wq_sig = !!srq_signature;
+ srq->wq_sig = 0;
in->log_page_size = srq->buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT;
if (MLX5_CAP_GEN(dev->mdev, cqe_version) == MLX5_CQE_VERSION_V1 &&
@@ -215,7 +187,7 @@ err_in:
kvfree(in->pas);
err_buf:
- mlx5_buf_free(dev->mdev, &srq->buf);
+ mlx5_frag_buf_free(dev->mdev, &srq->buf);
err_db:
mlx5_db_free(dev->mdev, &srq->db);
@@ -232,7 +204,7 @@ static void destroy_srq_user(struct ib_pd *pd, struct mlx5_ib_srq *srq)
static void destroy_srq_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_srq *srq)
{
kvfree(srq->wrid);
- mlx5_buf_free(dev->mdev, &srq->buf);
+ mlx5_frag_buf_free(dev->mdev, &srq->buf);
mlx5_db_free(dev->mdev, &srq->db);
}
@@ -287,14 +259,14 @@ struct ib_srq *mlx5_ib_create_srq(struct ib_pd *pd,
}
in.type = init_attr->srq_type;
- if (pd->uobject)
+ if (udata)
err = create_srq_user(pd, srq, &in, udata, buf_size);
else
err = create_srq_kernel(dev, srq, &in, buf_size);
if (err) {
mlx5_ib_warn(dev, "create srq %s failed, err %d\n",
- pd->uobject ? "user" : "kernel", err);
+ udata ? "user" : "kernel", err);
goto err_srq;
}
@@ -327,7 +299,7 @@ struct ib_srq *mlx5_ib_create_srq(struct ib_pd *pd,
in.pd = to_mpd(pd)->pdn;
in.db_record = srq->db.dma;
- err = mlx5_core_create_srq(dev->mdev, &srq->msrq, &in);
+ err = mlx5_cmd_create_srq(dev, &srq->msrq, &in);
kvfree(in.pas);
if (err) {
mlx5_ib_dbg(dev, "create SRQ failed, err %d\n", err);
@@ -339,7 +311,7 @@ struct ib_srq *mlx5_ib_create_srq(struct ib_pd *pd,
srq->msrq.event = mlx5_ib_srq_event;
srq->ibsrq.ext.xrc.srq_num = srq->msrq.srqn;
- if (pd->uobject)
+ if (udata)
if (ib_copy_to_udata(udata, &srq->msrq.srqn, sizeof(__u32))) {
mlx5_ib_dbg(dev, "copy to user failed\n");
err = -EFAULT;
@@ -351,10 +323,10 @@ struct ib_srq *mlx5_ib_create_srq(struct ib_pd *pd,
return &srq->ibsrq;
err_core:
- mlx5_core_destroy_srq(dev->mdev, &srq->msrq);
+ mlx5_cmd_destroy_srq(dev, &srq->msrq);
err_usr_kern_srq:
- if (pd->uobject)
+ if (udata)
destroy_srq_user(pd, srq);
else
destroy_srq_kernel(dev, srq);
@@ -381,7 +353,7 @@ int mlx5_ib_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr,
return -EINVAL;
mutex_lock(&srq->mutex);
- ret = mlx5_core_arm_srq(dev->mdev, &srq->msrq, attr->srq_limit, 1);
+ ret = mlx5_cmd_arm_srq(dev, &srq->msrq, attr->srq_limit, 1);
mutex_unlock(&srq->mutex);
if (ret)
@@ -402,7 +374,7 @@ int mlx5_ib_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *srq_attr)
if (!out)
return -ENOMEM;
- ret = mlx5_core_query_srq(dev->mdev, &srq->msrq, out);
+ ret = mlx5_cmd_query_srq(dev, &srq->msrq, out);
if (ret)
goto out_box;
@@ -420,7 +392,7 @@ int mlx5_ib_destroy_srq(struct ib_srq *srq)
struct mlx5_ib_dev *dev = to_mdev(srq->device);
struct mlx5_ib_srq *msrq = to_msrq(srq);
- mlx5_core_destroy_srq(dev->mdev, &msrq->msrq);
+ mlx5_cmd_destroy_srq(dev, &msrq->msrq);
if (srq->uobject) {
mlx5_ib_db_unmap_user(to_mucontext(srq->uobject->context), &msrq->db);
diff --git a/drivers/infiniband/hw/mlx5/srq.h b/drivers/infiniband/hw/mlx5/srq.h
new file mode 100644
index 000000000000..c330af35ff10
--- /dev/null
+++ b/drivers/infiniband/hw/mlx5/srq.h
@@ -0,0 +1,71 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/*
+ * Copyright (c) 2013-2018, Mellanox Technologies. All rights reserved.
+ */
+
+#ifndef MLX5_IB_SRQ_H
+#define MLX5_IB_SRQ_H
+
+enum {
+ MLX5_SRQ_FLAG_ERR = (1 << 0),
+ MLX5_SRQ_FLAG_WQ_SIG = (1 << 1),
+ MLX5_SRQ_FLAG_RNDV = (1 << 2),
+};
+
+struct mlx5_srq_attr {
+ u32 type;
+ u32 flags;
+ u32 log_size;
+ u32 wqe_shift;
+ u32 log_page_size;
+ u32 wqe_cnt;
+ u32 srqn;
+ u32 xrcd;
+ u32 page_offset;
+ u32 cqn;
+ u32 pd;
+ u32 lwm;
+ u32 user_index;
+ u64 db_record;
+ __be64 *pas;
+ u32 tm_log_list_size;
+ u32 tm_next_tag;
+ u32 tm_hw_phase_cnt;
+ u32 tm_sw_phase_cnt;
+ u16 uid;
+};
+
+struct mlx5_ib_dev;
+
+struct mlx5_core_srq {
+ struct mlx5_core_rsc_common common; /* must be first */
+ u32 srqn;
+ int max;
+ size_t max_gs;
+ size_t max_avail_gather;
+ int wqe_shift;
+ void (*event)(struct mlx5_core_srq *srq, enum mlx5_event e);
+
+ u16 uid;
+};
+
+struct mlx5_srq_table {
+ struct notifier_block nb;
+ /* protect radix tree
+ */
+ spinlock_t lock;
+ struct radix_tree_root tree;
+};
+
+int mlx5_cmd_create_srq(struct mlx5_ib_dev *dev, struct mlx5_core_srq *srq,
+ struct mlx5_srq_attr *in);
+int mlx5_cmd_destroy_srq(struct mlx5_ib_dev *dev, struct mlx5_core_srq *srq);
+int mlx5_cmd_query_srq(struct mlx5_ib_dev *dev, struct mlx5_core_srq *srq,
+ struct mlx5_srq_attr *out);
+int mlx5_cmd_arm_srq(struct mlx5_ib_dev *dev, struct mlx5_core_srq *srq,
+ u16 lwm, int is_srq);
+struct mlx5_core_srq *mlx5_cmd_get_srq(struct mlx5_ib_dev *dev, u32 srqn);
+
+int mlx5_init_srq_table(struct mlx5_ib_dev *dev);
+void mlx5_cleanup_srq_table(struct mlx5_ib_dev *dev);
+#endif /* MLX5_IB_SRQ_H */
diff --git a/drivers/infiniband/hw/mlx5/srq_cmd.c b/drivers/infiniband/hw/mlx5/srq_cmd.c
new file mode 100644
index 000000000000..63ac38bb3498
--- /dev/null
+++ b/drivers/infiniband/hw/mlx5/srq_cmd.c
@@ -0,0 +1,720 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/*
+ * Copyright (c) 2013-2018, Mellanox Technologies inc. All rights reserved.
+ */
+
+#include <linux/kernel.h>
+#include <linux/mlx5/driver.h>
+#include <linux/mlx5/cmd.h>
+#include "mlx5_ib.h"
+#include "srq.h"
+
+static int get_pas_size(struct mlx5_srq_attr *in)
+{
+ u32 log_page_size = in->log_page_size + 12;
+ u32 log_srq_size = in->log_size;
+ u32 log_rq_stride = in->wqe_shift;
+ u32 page_offset = in->page_offset;
+ u32 po_quanta = 1 << (log_page_size - 6);
+ u32 rq_sz = 1 << (log_srq_size + 4 + log_rq_stride);
+ u32 page_size = 1 << log_page_size;
+ u32 rq_sz_po = rq_sz + (page_offset * po_quanta);
+ u32 rq_num_pas = DIV_ROUND_UP(rq_sz_po, page_size);
+
+ return rq_num_pas * sizeof(u64);
+}
+
+static void set_wq(void *wq, struct mlx5_srq_attr *in)
+{
+ MLX5_SET(wq, wq, wq_signature, !!(in->flags
+ & MLX5_SRQ_FLAG_WQ_SIG));
+ MLX5_SET(wq, wq, log_wq_pg_sz, in->log_page_size);
+ MLX5_SET(wq, wq, log_wq_stride, in->wqe_shift + 4);
+ MLX5_SET(wq, wq, log_wq_sz, in->log_size);
+ MLX5_SET(wq, wq, page_offset, in->page_offset);
+ MLX5_SET(wq, wq, lwm, in->lwm);
+ MLX5_SET(wq, wq, pd, in->pd);
+ MLX5_SET64(wq, wq, dbr_addr, in->db_record);
+}
+
+static void set_srqc(void *srqc, struct mlx5_srq_attr *in)
+{
+ MLX5_SET(srqc, srqc, wq_signature, !!(in->flags
+ & MLX5_SRQ_FLAG_WQ_SIG));
+ MLX5_SET(srqc, srqc, log_page_size, in->log_page_size);
+ MLX5_SET(srqc, srqc, log_rq_stride, in->wqe_shift);
+ MLX5_SET(srqc, srqc, log_srq_size, in->log_size);
+ MLX5_SET(srqc, srqc, page_offset, in->page_offset);
+ MLX5_SET(srqc, srqc, lwm, in->lwm);
+ MLX5_SET(srqc, srqc, pd, in->pd);
+ MLX5_SET64(srqc, srqc, dbr_addr, in->db_record);
+ MLX5_SET(srqc, srqc, xrcd, in->xrcd);
+ MLX5_SET(srqc, srqc, cqn, in->cqn);
+}
+
+static void get_wq(void *wq, struct mlx5_srq_attr *in)
+{
+ if (MLX5_GET(wq, wq, wq_signature))
+ in->flags &= MLX5_SRQ_FLAG_WQ_SIG;
+ in->log_page_size = MLX5_GET(wq, wq, log_wq_pg_sz);
+ in->wqe_shift = MLX5_GET(wq, wq, log_wq_stride) - 4;
+ in->log_size = MLX5_GET(wq, wq, log_wq_sz);
+ in->page_offset = MLX5_GET(wq, wq, page_offset);
+ in->lwm = MLX5_GET(wq, wq, lwm);
+ in->pd = MLX5_GET(wq, wq, pd);
+ in->db_record = MLX5_GET64(wq, wq, dbr_addr);
+}
+
+static void get_srqc(void *srqc, struct mlx5_srq_attr *in)
+{
+ if (MLX5_GET(srqc, srqc, wq_signature))
+ in->flags &= MLX5_SRQ_FLAG_WQ_SIG;
+ in->log_page_size = MLX5_GET(srqc, srqc, log_page_size);
+ in->wqe_shift = MLX5_GET(srqc, srqc, log_rq_stride);
+ in->log_size = MLX5_GET(srqc, srqc, log_srq_size);
+ in->page_offset = MLX5_GET(srqc, srqc, page_offset);
+ in->lwm = MLX5_GET(srqc, srqc, lwm);
+ in->pd = MLX5_GET(srqc, srqc, pd);
+ in->db_record = MLX5_GET64(srqc, srqc, dbr_addr);
+}
+
+struct mlx5_core_srq *mlx5_cmd_get_srq(struct mlx5_ib_dev *dev, u32 srqn)
+{
+ struct mlx5_srq_table *table = &dev->srq_table;
+ struct mlx5_core_srq *srq;
+
+ spin_lock(&table->lock);
+
+ srq = radix_tree_lookup(&table->tree, srqn);
+ if (srq)
+ atomic_inc(&srq->common.refcount);
+
+ spin_unlock(&table->lock);
+
+ return srq;
+}
+
+static int create_srq_cmd(struct mlx5_ib_dev *dev, struct mlx5_core_srq *srq,
+ struct mlx5_srq_attr *in)
+{
+ u32 create_out[MLX5_ST_SZ_DW(create_srq_out)] = {0};
+ void *create_in;
+ void *srqc;
+ void *pas;
+ int pas_size;
+ int inlen;
+ int err;
+
+ pas_size = get_pas_size(in);
+ inlen = MLX5_ST_SZ_BYTES(create_srq_in) + pas_size;
+ create_in = kvzalloc(inlen, GFP_KERNEL);
+ if (!create_in)
+ return -ENOMEM;
+
+ MLX5_SET(create_srq_in, create_in, uid, in->uid);
+ srqc = MLX5_ADDR_OF(create_srq_in, create_in, srq_context_entry);
+ pas = MLX5_ADDR_OF(create_srq_in, create_in, pas);
+
+ set_srqc(srqc, in);
+ memcpy(pas, in->pas, pas_size);
+
+ MLX5_SET(create_srq_in, create_in, opcode,
+ MLX5_CMD_OP_CREATE_SRQ);
+
+ err = mlx5_cmd_exec(dev->mdev, create_in, inlen, create_out,
+ sizeof(create_out));
+ kvfree(create_in);
+ if (!err) {
+ srq->srqn = MLX5_GET(create_srq_out, create_out, srqn);
+ srq->uid = in->uid;
+ }
+
+ return err;
+}
+
+static int destroy_srq_cmd(struct mlx5_ib_dev *dev, struct mlx5_core_srq *srq)
+{
+ u32 srq_in[MLX5_ST_SZ_DW(destroy_srq_in)] = {0};
+ u32 srq_out[MLX5_ST_SZ_DW(destroy_srq_out)] = {0};
+
+ MLX5_SET(destroy_srq_in, srq_in, opcode,
+ MLX5_CMD_OP_DESTROY_SRQ);
+ MLX5_SET(destroy_srq_in, srq_in, srqn, srq->srqn);
+ MLX5_SET(destroy_srq_in, srq_in, uid, srq->uid);
+
+ return mlx5_cmd_exec(dev->mdev, srq_in, sizeof(srq_in), srq_out,
+ sizeof(srq_out));
+}
+
+static int arm_srq_cmd(struct mlx5_ib_dev *dev, struct mlx5_core_srq *srq,
+ u16 lwm, int is_srq)
+{
+ u32 srq_in[MLX5_ST_SZ_DW(arm_rq_in)] = {0};
+ u32 srq_out[MLX5_ST_SZ_DW(arm_rq_out)] = {0};
+
+ MLX5_SET(arm_rq_in, srq_in, opcode, MLX5_CMD_OP_ARM_RQ);
+ MLX5_SET(arm_rq_in, srq_in, op_mod, MLX5_ARM_RQ_IN_OP_MOD_SRQ);
+ MLX5_SET(arm_rq_in, srq_in, srq_number, srq->srqn);
+ MLX5_SET(arm_rq_in, srq_in, lwm, lwm);
+ MLX5_SET(arm_rq_in, srq_in, uid, srq->uid);
+
+ return mlx5_cmd_exec(dev->mdev, srq_in, sizeof(srq_in), srq_out,
+ sizeof(srq_out));
+}
+
+static int query_srq_cmd(struct mlx5_ib_dev *dev, struct mlx5_core_srq *srq,
+ struct mlx5_srq_attr *out)
+{
+ u32 srq_in[MLX5_ST_SZ_DW(query_srq_in)] = {0};
+ u32 *srq_out;
+ void *srqc;
+ int err;
+
+ srq_out = kvzalloc(MLX5_ST_SZ_BYTES(query_srq_out), GFP_KERNEL);
+ if (!srq_out)
+ return -ENOMEM;
+
+ MLX5_SET(query_srq_in, srq_in, opcode,
+ MLX5_CMD_OP_QUERY_SRQ);
+ MLX5_SET(query_srq_in, srq_in, srqn, srq->srqn);
+ err = mlx5_cmd_exec(dev->mdev, srq_in, sizeof(srq_in), srq_out,
+ MLX5_ST_SZ_BYTES(query_srq_out));
+ if (err)
+ goto out;
+
+ srqc = MLX5_ADDR_OF(query_srq_out, srq_out, srq_context_entry);
+ get_srqc(srqc, out);
+ if (MLX5_GET(srqc, srqc, state) != MLX5_SRQC_STATE_GOOD)
+ out->flags |= MLX5_SRQ_FLAG_ERR;
+out:
+ kvfree(srq_out);
+ return err;
+}
+
+static int create_xrc_srq_cmd(struct mlx5_ib_dev *dev,
+ struct mlx5_core_srq *srq,
+ struct mlx5_srq_attr *in)
+{
+ u32 create_out[MLX5_ST_SZ_DW(create_xrc_srq_out)];
+ void *create_in;
+ void *xrc_srqc;
+ void *pas;
+ int pas_size;
+ int inlen;
+ int err;
+
+ pas_size = get_pas_size(in);
+ inlen = MLX5_ST_SZ_BYTES(create_xrc_srq_in) + pas_size;
+ create_in = kvzalloc(inlen, GFP_KERNEL);
+ if (!create_in)
+ return -ENOMEM;
+
+ MLX5_SET(create_xrc_srq_in, create_in, uid, in->uid);
+ xrc_srqc = MLX5_ADDR_OF(create_xrc_srq_in, create_in,
+ xrc_srq_context_entry);
+ pas = MLX5_ADDR_OF(create_xrc_srq_in, create_in, pas);
+
+ set_srqc(xrc_srqc, in);
+ MLX5_SET(xrc_srqc, xrc_srqc, user_index, in->user_index);
+ memcpy(pas, in->pas, pas_size);
+ MLX5_SET(create_xrc_srq_in, create_in, opcode,
+ MLX5_CMD_OP_CREATE_XRC_SRQ);
+
+ memset(create_out, 0, sizeof(create_out));
+ err = mlx5_cmd_exec(dev->mdev, create_in, inlen, create_out,
+ sizeof(create_out));
+ if (err)
+ goto out;
+
+ srq->srqn = MLX5_GET(create_xrc_srq_out, create_out, xrc_srqn);
+ srq->uid = in->uid;
+out:
+ kvfree(create_in);
+ return err;
+}
+
+static int destroy_xrc_srq_cmd(struct mlx5_ib_dev *dev,
+ struct mlx5_core_srq *srq)
+{
+ u32 xrcsrq_in[MLX5_ST_SZ_DW(destroy_xrc_srq_in)] = {0};
+ u32 xrcsrq_out[MLX5_ST_SZ_DW(destroy_xrc_srq_out)] = {0};
+
+ MLX5_SET(destroy_xrc_srq_in, xrcsrq_in, opcode,
+ MLX5_CMD_OP_DESTROY_XRC_SRQ);
+ MLX5_SET(destroy_xrc_srq_in, xrcsrq_in, xrc_srqn, srq->srqn);
+ MLX5_SET(destroy_xrc_srq_in, xrcsrq_in, uid, srq->uid);
+
+ return mlx5_cmd_exec(dev->mdev, xrcsrq_in, sizeof(xrcsrq_in),
+ xrcsrq_out, sizeof(xrcsrq_out));
+}
+
+static int arm_xrc_srq_cmd(struct mlx5_ib_dev *dev, struct mlx5_core_srq *srq,
+ u16 lwm)
+{
+ u32 xrcsrq_in[MLX5_ST_SZ_DW(arm_xrc_srq_in)] = {0};
+ u32 xrcsrq_out[MLX5_ST_SZ_DW(arm_xrc_srq_out)] = {0};
+
+ MLX5_SET(arm_xrc_srq_in, xrcsrq_in, opcode, MLX5_CMD_OP_ARM_XRC_SRQ);
+ MLX5_SET(arm_xrc_srq_in, xrcsrq_in, op_mod, MLX5_ARM_XRC_SRQ_IN_OP_MOD_XRC_SRQ);
+ MLX5_SET(arm_xrc_srq_in, xrcsrq_in, xrc_srqn, srq->srqn);
+ MLX5_SET(arm_xrc_srq_in, xrcsrq_in, lwm, lwm);
+ MLX5_SET(arm_xrc_srq_in, xrcsrq_in, uid, srq->uid);
+
+ return mlx5_cmd_exec(dev->mdev, xrcsrq_in, sizeof(xrcsrq_in),
+ xrcsrq_out, sizeof(xrcsrq_out));
+}
+
+static int query_xrc_srq_cmd(struct mlx5_ib_dev *dev,
+ struct mlx5_core_srq *srq,
+ struct mlx5_srq_attr *out)
+{
+ u32 xrcsrq_in[MLX5_ST_SZ_DW(query_xrc_srq_in)];
+ u32 *xrcsrq_out;
+ void *xrc_srqc;
+ int err;
+
+ xrcsrq_out = kvzalloc(MLX5_ST_SZ_BYTES(query_xrc_srq_out), GFP_KERNEL);
+ if (!xrcsrq_out)
+ return -ENOMEM;
+ memset(xrcsrq_in, 0, sizeof(xrcsrq_in));
+
+ MLX5_SET(query_xrc_srq_in, xrcsrq_in, opcode,
+ MLX5_CMD_OP_QUERY_XRC_SRQ);
+ MLX5_SET(query_xrc_srq_in, xrcsrq_in, xrc_srqn, srq->srqn);
+
+ err = mlx5_cmd_exec(dev->mdev, xrcsrq_in, sizeof(xrcsrq_in),
+ xrcsrq_out, MLX5_ST_SZ_BYTES(query_xrc_srq_out));
+ if (err)
+ goto out;
+
+ xrc_srqc = MLX5_ADDR_OF(query_xrc_srq_out, xrcsrq_out,
+ xrc_srq_context_entry);
+ get_srqc(xrc_srqc, out);
+ if (MLX5_GET(xrc_srqc, xrc_srqc, state) != MLX5_XRC_SRQC_STATE_GOOD)
+ out->flags |= MLX5_SRQ_FLAG_ERR;
+
+out:
+ kvfree(xrcsrq_out);
+ return err;
+}
+
+static int create_rmp_cmd(struct mlx5_ib_dev *dev, struct mlx5_core_srq *srq,
+ struct mlx5_srq_attr *in)
+{
+ void *create_out = NULL;
+ void *create_in = NULL;
+ void *rmpc;
+ void *wq;
+ int pas_size;
+ int outlen;
+ int inlen;
+ int err;
+
+ pas_size = get_pas_size(in);
+ inlen = MLX5_ST_SZ_BYTES(create_rmp_in) + pas_size;
+ outlen = MLX5_ST_SZ_BYTES(create_rmp_out);
+ create_in = kvzalloc(inlen, GFP_KERNEL);
+ create_out = kvzalloc(outlen, GFP_KERNEL);
+ if (!create_in || !create_out) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ rmpc = MLX5_ADDR_OF(create_rmp_in, create_in, ctx);
+ wq = MLX5_ADDR_OF(rmpc, rmpc, wq);
+
+ MLX5_SET(rmpc, rmpc, state, MLX5_RMPC_STATE_RDY);
+ MLX5_SET(create_rmp_in, create_in, uid, in->uid);
+ set_wq(wq, in);
+ memcpy(MLX5_ADDR_OF(rmpc, rmpc, wq.pas), in->pas, pas_size);
+
+ MLX5_SET(create_rmp_in, create_in, opcode, MLX5_CMD_OP_CREATE_RMP);
+ err = mlx5_cmd_exec(dev->mdev, create_in, inlen, create_out, outlen);
+ if (!err) {
+ srq->srqn = MLX5_GET(create_rmp_out, create_out, rmpn);
+ srq->uid = in->uid;
+ }
+
+out:
+ kvfree(create_in);
+ kvfree(create_out);
+ return err;
+}
+
+static int destroy_rmp_cmd(struct mlx5_ib_dev *dev, struct mlx5_core_srq *srq)
+{
+ u32 in[MLX5_ST_SZ_DW(destroy_rmp_in)] = {};
+ u32 out[MLX5_ST_SZ_DW(destroy_rmp_out)] = {};
+
+ MLX5_SET(destroy_rmp_in, in, opcode, MLX5_CMD_OP_DESTROY_RMP);
+ MLX5_SET(destroy_rmp_in, in, rmpn, srq->srqn);
+ MLX5_SET(destroy_rmp_in, in, uid, srq->uid);
+ return mlx5_cmd_exec(dev->mdev, in, sizeof(in), out, sizeof(out));
+}
+
+static int arm_rmp_cmd(struct mlx5_ib_dev *dev, struct mlx5_core_srq *srq,
+ u16 lwm)
+{
+ void *out = NULL;
+ void *in = NULL;
+ void *rmpc;
+ void *wq;
+ void *bitmask;
+ int outlen;
+ int inlen;
+ int err;
+
+ inlen = MLX5_ST_SZ_BYTES(modify_rmp_in);
+ outlen = MLX5_ST_SZ_BYTES(modify_rmp_out);
+
+ in = kvzalloc(inlen, GFP_KERNEL);
+ out = kvzalloc(outlen, GFP_KERNEL);
+ if (!in || !out) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ rmpc = MLX5_ADDR_OF(modify_rmp_in, in, ctx);
+ bitmask = MLX5_ADDR_OF(modify_rmp_in, in, bitmask);
+ wq = MLX5_ADDR_OF(rmpc, rmpc, wq);
+
+ MLX5_SET(modify_rmp_in, in, rmp_state, MLX5_RMPC_STATE_RDY);
+ MLX5_SET(modify_rmp_in, in, rmpn, srq->srqn);
+ MLX5_SET(modify_rmp_in, in, uid, srq->uid);
+ MLX5_SET(wq, wq, lwm, lwm);
+ MLX5_SET(rmp_bitmask, bitmask, lwm, 1);
+ MLX5_SET(rmpc, rmpc, state, MLX5_RMPC_STATE_RDY);
+ MLX5_SET(modify_rmp_in, in, opcode, MLX5_CMD_OP_MODIFY_RMP);
+
+ err = mlx5_cmd_exec(dev->mdev, in, inlen, out, outlen);
+
+out:
+ kvfree(in);
+ kvfree(out);
+ return err;
+}
+
+static int query_rmp_cmd(struct mlx5_ib_dev *dev, struct mlx5_core_srq *srq,
+ struct mlx5_srq_attr *out)
+{
+ u32 *rmp_out = NULL;
+ u32 *rmp_in = NULL;
+ void *rmpc;
+ int outlen;
+ int inlen;
+ int err;
+
+ outlen = MLX5_ST_SZ_BYTES(query_rmp_out);
+ inlen = MLX5_ST_SZ_BYTES(query_rmp_in);
+
+ rmp_out = kvzalloc(outlen, GFP_KERNEL);
+ rmp_in = kvzalloc(inlen, GFP_KERNEL);
+ if (!rmp_out || !rmp_in) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ MLX5_SET(query_rmp_in, rmp_in, opcode, MLX5_CMD_OP_QUERY_RMP);
+ MLX5_SET(query_rmp_in, rmp_in, rmpn, srq->srqn);
+ err = mlx5_cmd_exec(dev->mdev, rmp_in, inlen, rmp_out, outlen);
+ if (err)
+ goto out;
+
+ rmpc = MLX5_ADDR_OF(query_rmp_out, rmp_out, rmp_context);
+ get_wq(MLX5_ADDR_OF(rmpc, rmpc, wq), out);
+ if (MLX5_GET(rmpc, rmpc, state) != MLX5_RMPC_STATE_RDY)
+ out->flags |= MLX5_SRQ_FLAG_ERR;
+
+out:
+ kvfree(rmp_out);
+ kvfree(rmp_in);
+ return err;
+}
+
+static int create_xrq_cmd(struct mlx5_ib_dev *dev, struct mlx5_core_srq *srq,
+ struct mlx5_srq_attr *in)
+{
+ u32 create_out[MLX5_ST_SZ_DW(create_xrq_out)] = {0};
+ void *create_in;
+ void *xrqc;
+ void *wq;
+ int pas_size;
+ int inlen;
+ int err;
+
+ pas_size = get_pas_size(in);
+ inlen = MLX5_ST_SZ_BYTES(create_xrq_in) + pas_size;
+ create_in = kvzalloc(inlen, GFP_KERNEL);
+ if (!create_in)
+ return -ENOMEM;
+
+ xrqc = MLX5_ADDR_OF(create_xrq_in, create_in, xrq_context);
+ wq = MLX5_ADDR_OF(xrqc, xrqc, wq);
+
+ set_wq(wq, in);
+ memcpy(MLX5_ADDR_OF(xrqc, xrqc, wq.pas), in->pas, pas_size);
+
+ if (in->type == IB_SRQT_TM) {
+ MLX5_SET(xrqc, xrqc, topology, MLX5_XRQC_TOPOLOGY_TAG_MATCHING);
+ if (in->flags & MLX5_SRQ_FLAG_RNDV)
+ MLX5_SET(xrqc, xrqc, offload, MLX5_XRQC_OFFLOAD_RNDV);
+ MLX5_SET(xrqc, xrqc,
+ tag_matching_topology_context.log_matching_list_sz,
+ in->tm_log_list_size);
+ }
+ MLX5_SET(xrqc, xrqc, user_index, in->user_index);
+ MLX5_SET(xrqc, xrqc, cqn, in->cqn);
+ MLX5_SET(create_xrq_in, create_in, opcode, MLX5_CMD_OP_CREATE_XRQ);
+ MLX5_SET(create_xrq_in, create_in, uid, in->uid);
+ err = mlx5_cmd_exec(dev->mdev, create_in, inlen, create_out,
+ sizeof(create_out));
+ kvfree(create_in);
+ if (!err) {
+ srq->srqn = MLX5_GET(create_xrq_out, create_out, xrqn);
+ srq->uid = in->uid;
+ }
+
+ return err;
+}
+
+static int destroy_xrq_cmd(struct mlx5_ib_dev *dev, struct mlx5_core_srq *srq)
+{
+ u32 in[MLX5_ST_SZ_DW(destroy_xrq_in)] = {0};
+ u32 out[MLX5_ST_SZ_DW(destroy_xrq_out)] = {0};
+
+ MLX5_SET(destroy_xrq_in, in, opcode, MLX5_CMD_OP_DESTROY_XRQ);
+ MLX5_SET(destroy_xrq_in, in, xrqn, srq->srqn);
+ MLX5_SET(destroy_xrq_in, in, uid, srq->uid);
+
+ return mlx5_cmd_exec(dev->mdev, in, sizeof(in), out, sizeof(out));
+}
+
+static int arm_xrq_cmd(struct mlx5_ib_dev *dev,
+ struct mlx5_core_srq *srq,
+ u16 lwm)
+{
+ u32 out[MLX5_ST_SZ_DW(arm_rq_out)] = {0};
+ u32 in[MLX5_ST_SZ_DW(arm_rq_in)] = {0};
+
+ MLX5_SET(arm_rq_in, in, opcode, MLX5_CMD_OP_ARM_RQ);
+ MLX5_SET(arm_rq_in, in, op_mod, MLX5_ARM_RQ_IN_OP_MOD_XRQ);
+ MLX5_SET(arm_rq_in, in, srq_number, srq->srqn);
+ MLX5_SET(arm_rq_in, in, lwm, lwm);
+ MLX5_SET(arm_rq_in, in, uid, srq->uid);
+
+ return mlx5_cmd_exec(dev->mdev, in, sizeof(in), out, sizeof(out));
+}
+
+static int query_xrq_cmd(struct mlx5_ib_dev *dev, struct mlx5_core_srq *srq,
+ struct mlx5_srq_attr *out)
+{
+ u32 in[MLX5_ST_SZ_DW(query_xrq_in)] = {0};
+ u32 *xrq_out;
+ int outlen = MLX5_ST_SZ_BYTES(query_xrq_out);
+ void *xrqc;
+ int err;
+
+ xrq_out = kvzalloc(outlen, GFP_KERNEL);
+ if (!xrq_out)
+ return -ENOMEM;
+
+ MLX5_SET(query_xrq_in, in, opcode, MLX5_CMD_OP_QUERY_XRQ);
+ MLX5_SET(query_xrq_in, in, xrqn, srq->srqn);
+
+ err = mlx5_cmd_exec(dev->mdev, in, sizeof(in), xrq_out, outlen);
+ if (err)
+ goto out;
+
+ xrqc = MLX5_ADDR_OF(query_xrq_out, xrq_out, xrq_context);
+ get_wq(MLX5_ADDR_OF(xrqc, xrqc, wq), out);
+ if (MLX5_GET(xrqc, xrqc, state) != MLX5_XRQC_STATE_GOOD)
+ out->flags |= MLX5_SRQ_FLAG_ERR;
+ out->tm_next_tag =
+ MLX5_GET(xrqc, xrqc,
+ tag_matching_topology_context.append_next_index);
+ out->tm_hw_phase_cnt =
+ MLX5_GET(xrqc, xrqc,
+ tag_matching_topology_context.hw_phase_cnt);
+ out->tm_sw_phase_cnt =
+ MLX5_GET(xrqc, xrqc,
+ tag_matching_topology_context.sw_phase_cnt);
+
+out:
+ kvfree(xrq_out);
+ return err;
+}
+
+static int create_srq_split(struct mlx5_ib_dev *dev, struct mlx5_core_srq *srq,
+ struct mlx5_srq_attr *in)
+{
+ if (!dev->mdev->issi)
+ return create_srq_cmd(dev, srq, in);
+ switch (srq->common.res) {
+ case MLX5_RES_XSRQ:
+ return create_xrc_srq_cmd(dev, srq, in);
+ case MLX5_RES_XRQ:
+ return create_xrq_cmd(dev, srq, in);
+ default:
+ return create_rmp_cmd(dev, srq, in);
+ }
+}
+
+static int destroy_srq_split(struct mlx5_ib_dev *dev, struct mlx5_core_srq *srq)
+{
+ if (!dev->mdev->issi)
+ return destroy_srq_cmd(dev, srq);
+ switch (srq->common.res) {
+ case MLX5_RES_XSRQ:
+ return destroy_xrc_srq_cmd(dev, srq);
+ case MLX5_RES_XRQ:
+ return destroy_xrq_cmd(dev, srq);
+ default:
+ return destroy_rmp_cmd(dev, srq);
+ }
+}
+
+int mlx5_cmd_create_srq(struct mlx5_ib_dev *dev, struct mlx5_core_srq *srq,
+ struct mlx5_srq_attr *in)
+{
+ struct mlx5_srq_table *table = &dev->srq_table;
+ int err;
+
+ switch (in->type) {
+ case IB_SRQT_XRC:
+ srq->common.res = MLX5_RES_XSRQ;
+ break;
+ case IB_SRQT_TM:
+ srq->common.res = MLX5_RES_XRQ;
+ break;
+ default:
+ srq->common.res = MLX5_RES_SRQ;
+ }
+
+ err = create_srq_split(dev, srq, in);
+ if (err)
+ return err;
+
+ atomic_set(&srq->common.refcount, 1);
+ init_completion(&srq->common.free);
+
+ spin_lock_irq(&table->lock);
+ err = radix_tree_insert(&table->tree, srq->srqn, srq);
+ spin_unlock_irq(&table->lock);
+ if (err)
+ goto err_destroy_srq_split;
+
+ return 0;
+
+err_destroy_srq_split:
+ destroy_srq_split(dev, srq);
+
+ return err;
+}
+
+int mlx5_cmd_destroy_srq(struct mlx5_ib_dev *dev, struct mlx5_core_srq *srq)
+{
+ struct mlx5_srq_table *table = &dev->srq_table;
+ struct mlx5_core_srq *tmp;
+ int err;
+
+ spin_lock_irq(&table->lock);
+ tmp = radix_tree_delete(&table->tree, srq->srqn);
+ spin_unlock_irq(&table->lock);
+ if (!tmp || tmp != srq)
+ return -EINVAL;
+
+ err = destroy_srq_split(dev, srq);
+ if (err)
+ return err;
+
+ mlx5_core_res_put(&srq->common);
+ wait_for_completion(&srq->common.free);
+
+ return 0;
+}
+
+int mlx5_cmd_query_srq(struct mlx5_ib_dev *dev, struct mlx5_core_srq *srq,
+ struct mlx5_srq_attr *out)
+{
+ if (!dev->mdev->issi)
+ return query_srq_cmd(dev, srq, out);
+ switch (srq->common.res) {
+ case MLX5_RES_XSRQ:
+ return query_xrc_srq_cmd(dev, srq, out);
+ case MLX5_RES_XRQ:
+ return query_xrq_cmd(dev, srq, out);
+ default:
+ return query_rmp_cmd(dev, srq, out);
+ }
+}
+
+int mlx5_cmd_arm_srq(struct mlx5_ib_dev *dev, struct mlx5_core_srq *srq,
+ u16 lwm, int is_srq)
+{
+ if (!dev->mdev->issi)
+ return arm_srq_cmd(dev, srq, lwm, is_srq);
+ switch (srq->common.res) {
+ case MLX5_RES_XSRQ:
+ return arm_xrc_srq_cmd(dev, srq, lwm);
+ case MLX5_RES_XRQ:
+ return arm_xrq_cmd(dev, srq, lwm);
+ default:
+ return arm_rmp_cmd(dev, srq, lwm);
+ }
+}
+
+static int srq_event_notifier(struct notifier_block *nb,
+ unsigned long type, void *data)
+{
+ struct mlx5_srq_table *table;
+ struct mlx5_core_srq *srq;
+ struct mlx5_eqe *eqe;
+ u32 srqn;
+
+ if (type != MLX5_EVENT_TYPE_SRQ_CATAS_ERROR &&
+ type != MLX5_EVENT_TYPE_SRQ_RQ_LIMIT)
+ return NOTIFY_DONE;
+
+ table = container_of(nb, struct mlx5_srq_table, nb);
+
+ eqe = data;
+ srqn = be32_to_cpu(eqe->data.qp_srq.qp_srq_n) & 0xffffff;
+
+ spin_lock(&table->lock);
+
+ srq = radix_tree_lookup(&table->tree, srqn);
+ if (srq)
+ atomic_inc(&srq->common.refcount);
+
+ spin_unlock(&table->lock);
+
+ if (!srq)
+ return NOTIFY_OK;
+
+ srq->event(srq, eqe->type);
+
+ mlx5_core_res_put(&srq->common);
+
+ return NOTIFY_OK;
+}
+
+int mlx5_init_srq_table(struct mlx5_ib_dev *dev)
+{
+ struct mlx5_srq_table *table = &dev->srq_table;
+
+ memset(table, 0, sizeof(*table));
+ spin_lock_init(&table->lock);
+ INIT_RADIX_TREE(&table->tree, GFP_ATOMIC);
+
+ table->nb.notifier_call = srq_event_notifier;
+ mlx5_notifier_register(dev->mdev, &table->nb);
+
+ return 0;
+}
+
+void mlx5_cleanup_srq_table(struct mlx5_ib_dev *dev)
+{
+ struct mlx5_srq_table *table = &dev->srq_table;
+
+ mlx5_notifier_unregister(dev->mdev, &table->nb);
+}
diff --git a/drivers/infiniband/hw/mthca/mthca_dev.h b/drivers/infiniband/hw/mthca/mthca_dev.h
index 220a3e4717a3..bfd4eebc1182 100644
--- a/drivers/infiniband/hw/mthca/mthca_dev.h
+++ b/drivers/infiniband/hw/mthca/mthca_dev.h
@@ -510,7 +510,8 @@ int mthca_alloc_cq_buf(struct mthca_dev *dev, struct mthca_cq_buf *buf, int nent
void mthca_free_cq_buf(struct mthca_dev *dev, struct mthca_cq_buf *buf, int cqe);
int mthca_alloc_srq(struct mthca_dev *dev, struct mthca_pd *pd,
- struct ib_srq_attr *attr, struct mthca_srq *srq);
+ struct ib_srq_attr *attr, struct mthca_srq *srq,
+ struct ib_udata *udata);
void mthca_free_srq(struct mthca_dev *dev, struct mthca_srq *srq);
int mthca_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr,
enum ib_srq_attr_mask attr_mask, struct ib_udata *udata);
@@ -547,7 +548,8 @@ int mthca_alloc_qp(struct mthca_dev *dev,
enum ib_qp_type type,
enum ib_sig_type send_policy,
struct ib_qp_cap *cap,
- struct mthca_qp *qp);
+ struct mthca_qp *qp,
+ struct ib_udata *udata);
int mthca_alloc_sqp(struct mthca_dev *dev,
struct mthca_pd *pd,
struct mthca_cq *send_cq,
@@ -556,7 +558,8 @@ int mthca_alloc_sqp(struct mthca_dev *dev,
struct ib_qp_cap *cap,
int qpn,
int port,
- struct mthca_sqp *sqp);
+ struct mthca_sqp *sqp,
+ struct ib_udata *udata);
void mthca_free_qp(struct mthca_dev *dev, struct mthca_qp *qp);
int mthca_create_ah(struct mthca_dev *dev,
struct mthca_pd *pd,
diff --git a/drivers/infiniband/hw/mthca/mthca_mad.c b/drivers/infiniband/hw/mthca/mthca_mad.c
index 2e5dc0a67cfc..7ad517da4917 100644
--- a/drivers/infiniband/hw/mthca/mthca_mad.c
+++ b/drivers/infiniband/hw/mthca/mthca_mad.c
@@ -89,13 +89,13 @@ static void update_sm_ah(struct mthca_dev *dev,
rdma_ah_set_port_num(&ah_attr, port_num);
new_ah = rdma_create_ah(dev->send_agent[port_num - 1][0]->qp->pd,
- &ah_attr);
+ &ah_attr, 0);
if (IS_ERR(new_ah))
return;
spin_lock_irqsave(&dev->sm_lock, flags);
if (dev->sm_ah[port_num - 1])
- rdma_destroy_ah(dev->sm_ah[port_num - 1]);
+ rdma_destroy_ah(dev->sm_ah[port_num - 1], 0);
dev->sm_ah[port_num - 1] = new_ah;
spin_unlock_irqrestore(&dev->sm_lock, flags);
}
@@ -347,6 +347,7 @@ void mthca_free_agents(struct mthca_dev *dev)
}
if (dev->sm_ah[p])
- rdma_destroy_ah(dev->sm_ah[p]);
+ rdma_destroy_ah(dev->sm_ah[p],
+ RDMA_DESTROY_AH_SLEEPABLE);
}
}
diff --git a/drivers/infiniband/hw/mthca/mthca_main.c b/drivers/infiniband/hw/mthca/mthca_main.c
index 92c49bff22bc..fe9654a7af71 100644
--- a/drivers/infiniband/hw/mthca/mthca_main.c
+++ b/drivers/infiniband/hw/mthca/mthca_main.c
@@ -961,7 +961,7 @@ static int __mthca_init_one(struct pci_dev *pdev, int hca_type)
/* We can handle large RDMA requests, so allow larger segments. */
dma_set_max_seg_size(&pdev->dev, 1024 * 1024 * 1024);
- mdev = (struct mthca_dev *) ib_alloc_device(sizeof *mdev);
+ mdev = ib_alloc_device(mthca_dev, ib_dev);
if (!mdev) {
dev_err(&pdev->dev, "Device struct alloc failed, "
"aborting.\n");
diff --git a/drivers/infiniband/hw/mthca/mthca_memfree.c b/drivers/infiniband/hw/mthca/mthca_memfree.c
index cc9c0c8ccba3..112d2f38e0de 100644
--- a/drivers/infiniband/hw/mthca/mthca_memfree.c
+++ b/drivers/infiniband/hw/mthca/mthca_memfree.c
@@ -623,8 +623,9 @@ int mthca_alloc_db(struct mthca_dev *dev, enum mthca_db_type type,
page = dev->db_tab->page + end;
alloc:
- page->db_rec = dma_zalloc_coherent(&dev->pdev->dev, MTHCA_ICM_PAGE_SIZE,
- &page->mapping, GFP_KERNEL);
+ page->db_rec = dma_alloc_coherent(&dev->pdev->dev,
+ MTHCA_ICM_PAGE_SIZE, &page->mapping,
+ GFP_KERNEL);
if (!page->db_rec) {
ret = -ENOMEM;
goto out;
diff --git a/drivers/infiniband/hw/mthca/mthca_provider.c b/drivers/infiniband/hw/mthca/mthca_provider.c
index 691c6f048938..d063d7a37762 100644
--- a/drivers/infiniband/hw/mthca/mthca_provider.c
+++ b/drivers/infiniband/hw/mthca/mthca_provider.c
@@ -37,6 +37,7 @@
#include <rdma/ib_smi.h>
#include <rdma/ib_umem.h>
#include <rdma/ib_user_verbs.h>
+#include <rdma/uverbs_ioctl.h>
#include <linux/sched.h>
#include <linux/slab.h>
@@ -300,17 +301,16 @@ static int mthca_query_gid(struct ib_device *ibdev, u8 port,
return err;
}
-static struct ib_ucontext *mthca_alloc_ucontext(struct ib_device *ibdev,
- struct ib_udata *udata)
+static int mthca_alloc_ucontext(struct ib_ucontext *uctx,
+ struct ib_udata *udata)
{
- struct mthca_alloc_ucontext_resp uresp;
- struct mthca_ucontext *context;
+ struct ib_device *ibdev = uctx->device;
+ struct mthca_alloc_ucontext_resp uresp = {};
+ struct mthca_ucontext *context = to_mucontext(uctx);
int err;
if (!(to_mdev(ibdev)->active))
- return ERR_PTR(-EAGAIN);
-
- memset(&uresp, 0, sizeof uresp);
+ return -EAGAIN;
uresp.qp_tab_size = to_mdev(ibdev)->limits.num_qps;
if (mthca_is_memfree(to_mdev(ibdev)))
@@ -318,44 +318,33 @@ static struct ib_ucontext *mthca_alloc_ucontext(struct ib_device *ibdev,
else
uresp.uarc_size = 0;
- context = kmalloc(sizeof *context, GFP_KERNEL);
- if (!context)
- return ERR_PTR(-ENOMEM);
-
err = mthca_uar_alloc(to_mdev(ibdev), &context->uar);
- if (err) {
- kfree(context);
- return ERR_PTR(err);
- }
+ if (err)
+ return err;
context->db_tab = mthca_init_user_db_tab(to_mdev(ibdev));
if (IS_ERR(context->db_tab)) {
err = PTR_ERR(context->db_tab);
mthca_uar_free(to_mdev(ibdev), &context->uar);
- kfree(context);
- return ERR_PTR(err);
+ return err;
}
- if (ib_copy_to_udata(udata, &uresp, sizeof uresp)) {
+ if (ib_copy_to_udata(udata, &uresp, sizeof(uresp))) {
mthca_cleanup_user_db_tab(to_mdev(ibdev), &context->uar, context->db_tab);
mthca_uar_free(to_mdev(ibdev), &context->uar);
- kfree(context);
- return ERR_PTR(-EFAULT);
+ return -EFAULT;
}
context->reg_mr_warned = 0;
- return &context->ibucontext;
+ return 0;
}
-static int mthca_dealloc_ucontext(struct ib_ucontext *context)
+static void mthca_dealloc_ucontext(struct ib_ucontext *context)
{
mthca_cleanup_user_db_tab(to_mdev(context->device), &to_mucontext(context)->uar,
to_mucontext(context)->db_tab);
mthca_uar_free(to_mdev(context->device), &to_mucontext(context)->uar);
- kfree(to_mucontext(context));
-
- return 0;
}
static int mthca_mmap_uar(struct ib_ucontext *context,
@@ -374,44 +363,35 @@ static int mthca_mmap_uar(struct ib_ucontext *context,
return 0;
}
-static struct ib_pd *mthca_alloc_pd(struct ib_device *ibdev,
- struct ib_ucontext *context,
- struct ib_udata *udata)
+static int mthca_alloc_pd(struct ib_pd *ibpd, struct ib_ucontext *context,
+ struct ib_udata *udata)
{
- struct mthca_pd *pd;
+ struct ib_device *ibdev = ibpd->device;
+ struct mthca_pd *pd = to_mpd(ibpd);
int err;
- pd = kmalloc(sizeof *pd, GFP_KERNEL);
- if (!pd)
- return ERR_PTR(-ENOMEM);
-
err = mthca_pd_alloc(to_mdev(ibdev), !context, pd);
- if (err) {
- kfree(pd);
- return ERR_PTR(err);
- }
+ if (err)
+ return err;
if (context) {
if (ib_copy_to_udata(udata, &pd->pd_num, sizeof (__u32))) {
mthca_pd_free(to_mdev(ibdev), pd);
- kfree(pd);
- return ERR_PTR(-EFAULT);
+ return -EFAULT;
}
}
- return &pd->ibpd;
+ return 0;
}
-static int mthca_dealloc_pd(struct ib_pd *pd)
+static void mthca_dealloc_pd(struct ib_pd *pd)
{
mthca_pd_free(to_mdev(pd->device), to_mpd(pd));
- kfree(pd);
-
- return 0;
}
static struct ib_ah *mthca_ah_create(struct ib_pd *pd,
struct rdma_ah_attr *ah_attr,
+ u32 flags,
struct ib_udata *udata)
{
@@ -431,7 +411,7 @@ static struct ib_ah *mthca_ah_create(struct ib_pd *pd,
return &ah->ibah;
}
-static int mthca_ah_destroy(struct ib_ah *ah)
+static int mthca_ah_destroy(struct ib_ah *ah, u32 flags)
{
mthca_destroy_ah(to_mdev(ah->device), to_mah(ah));
kfree(ah);
@@ -444,7 +424,8 @@ static struct ib_srq *mthca_create_srq(struct ib_pd *pd,
struct ib_udata *udata)
{
struct mthca_create_srq ucmd;
- struct mthca_ucontext *context = NULL;
+ struct mthca_ucontext *context = rdma_udata_to_drv_context(
+ udata, struct mthca_ucontext, ibucontext);
struct mthca_srq *srq;
int err;
@@ -455,9 +436,7 @@ static struct ib_srq *mthca_create_srq(struct ib_pd *pd,
if (!srq)
return ERR_PTR(-ENOMEM);
- if (pd->uobject) {
- context = to_mucontext(pd->uobject->context);
-
+ if (udata) {
if (ib_copy_from_udata(&ucmd, udata, sizeof ucmd)) {
err = -EFAULT;
goto err_free;
@@ -475,9 +454,9 @@ static struct ib_srq *mthca_create_srq(struct ib_pd *pd,
}
err = mthca_alloc_srq(to_mdev(pd->device), to_mpd(pd),
- &init_attr->attr, srq);
+ &init_attr->attr, srq, udata);
- if (err && pd->uobject)
+ if (err && udata)
mthca_unmap_user_db(to_mdev(pd->device), &context->uar,
context->db_tab, ucmd.db_index);
@@ -519,6 +498,8 @@ static struct ib_qp *mthca_create_qp(struct ib_pd *pd,
struct ib_qp_init_attr *init_attr,
struct ib_udata *udata)
{
+ struct mthca_ucontext *context = rdma_udata_to_drv_context(
+ udata, struct mthca_ucontext, ibucontext);
struct mthca_create_qp ucmd;
struct mthca_qp *qp;
int err;
@@ -531,15 +512,11 @@ static struct ib_qp *mthca_create_qp(struct ib_pd *pd,
case IB_QPT_UC:
case IB_QPT_UD:
{
- struct mthca_ucontext *context;
-
- qp = kmalloc(sizeof *qp, GFP_KERNEL);
+ qp = kzalloc(sizeof(*qp), GFP_KERNEL);
if (!qp)
return ERR_PTR(-ENOMEM);
- if (pd->uobject) {
- context = to_mucontext(pd->uobject->context);
-
+ if (udata) {
if (ib_copy_from_udata(&ucmd, udata, sizeof ucmd)) {
kfree(qp);
return ERR_PTR(-EFAULT);
@@ -574,11 +551,9 @@ static struct ib_qp *mthca_create_qp(struct ib_pd *pd,
to_mcq(init_attr->send_cq),
to_mcq(init_attr->recv_cq),
init_attr->qp_type, init_attr->sq_sig_type,
- &init_attr->cap, qp);
-
- if (err && pd->uobject) {
- context = to_mucontext(pd->uobject->context);
+ &init_attr->cap, qp, udata);
+ if (err && udata) {
mthca_unmap_user_db(to_mdev(pd->device),
&context->uar,
context->db_tab,
@@ -596,10 +571,10 @@ static struct ib_qp *mthca_create_qp(struct ib_pd *pd,
case IB_QPT_GSI:
{
/* Don't allow userspace to create special QPs */
- if (pd->uobject)
+ if (udata)
return ERR_PTR(-EINVAL);
- qp = kmalloc(sizeof (struct mthca_sqp), GFP_KERNEL);
+ qp = kzalloc(sizeof(struct mthca_sqp), GFP_KERNEL);
if (!qp)
return ERR_PTR(-ENOMEM);
@@ -610,7 +585,7 @@ static struct ib_qp *mthca_create_qp(struct ib_pd *pd,
to_mcq(init_attr->recv_cq),
init_attr->sq_sig_type, &init_attr->cap,
qp->ibqp.qp_num, init_attr->port_num,
- to_msqp(qp));
+ to_msqp(qp), udata);
break;
}
default:
@@ -683,7 +658,7 @@ static struct ib_cq *mthca_create_cq(struct ib_device *ibdev,
goto err_unmap_set;
}
- cq = kmalloc(sizeof *cq, GFP_KERNEL);
+ cq = kzalloc(sizeof(*cq), GFP_KERNEL);
if (!cq) {
err = -ENOMEM;
goto err_unmap_arm;
@@ -906,22 +881,23 @@ static struct ib_mr *mthca_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
u64 virt, int acc, struct ib_udata *udata)
{
struct mthca_dev *dev = to_mdev(pd->device);
- struct scatterlist *sg;
+ struct sg_dma_page_iter sg_iter;
+ struct mthca_ucontext *context = rdma_udata_to_drv_context(
+ udata, struct mthca_ucontext, ibucontext);
struct mthca_mr *mr;
struct mthca_reg_mr ucmd;
u64 *pages;
- int shift, n, len;
- int i, k, entry;
+ int n, i;
int err = 0;
int write_mtt_size;
if (udata->inlen < sizeof ucmd) {
- if (!to_mucontext(pd->uobject->context)->reg_mr_warned) {
+ if (!context->reg_mr_warned) {
mthca_warn(dev, "Process '%s' did not pass in MR attrs.\n",
current->comm);
mthca_warn(dev, " Update libmthca to fix this.\n");
}
- ++to_mucontext(pd->uobject->context)->reg_mr_warned;
+ ++context->reg_mr_warned;
ucmd.mr_attrs = 0;
} else if (ib_copy_from_udata(&ucmd, udata, sizeof ucmd))
return ERR_PTR(-EFAULT);
@@ -930,7 +906,7 @@ static struct ib_mr *mthca_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
if (!mr)
return ERR_PTR(-ENOMEM);
- mr->umem = ib_umem_get(pd->uobject->context, start, length, acc,
+ mr->umem = ib_umem_get(udata, start, length, acc,
ucmd.mr_attrs & MTHCA_MR_DMASYNC);
if (IS_ERR(mr->umem)) {
@@ -938,7 +914,6 @@ static struct ib_mr *mthca_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
goto err;
}
- shift = mr->umem->page_shift;
n = mr->umem->nmap;
mr->mtt = mthca_alloc_mtt(dev, n);
@@ -957,21 +932,19 @@ static struct ib_mr *mthca_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
write_mtt_size = min(mthca_write_mtt_size(dev), (int) (PAGE_SIZE / sizeof *pages));
- for_each_sg(mr->umem->sg_head.sgl, sg, mr->umem->nmap, entry) {
- len = sg_dma_len(sg) >> shift;
- for (k = 0; k < len; ++k) {
- pages[i++] = sg_dma_address(sg) + (k << shift);
- /*
- * Be friendly to write_mtt and pass it chunks
- * of appropriate size.
- */
- if (i == write_mtt_size) {
- err = mthca_write_mtt(dev, mr->mtt, n, pages, i);
- if (err)
- goto mtt_done;
- n += i;
- i = 0;
- }
+ for_each_sg_dma_page(mr->umem->sg_head.sgl, &sg_iter, mr->umem->nmap, 0) {
+ pages[i++] = sg_page_iter_dma_address(&sg_iter);
+
+ /*
+ * Be friendly to write_mtt and pass it chunks
+ * of appropriate size.
+ */
+ if (i == write_mtt_size) {
+ err = mthca_write_mtt(dev, mr->mtt, n, pages, i);
+ if (err)
+ goto mtt_done;
+ n += i;
+ i = 0;
}
}
@@ -982,7 +955,7 @@ mtt_done:
if (err)
goto err_mtt;
- err = mthca_mr_alloc(dev, to_mpd(pd)->pd_num, shift, virt, length,
+ err = mthca_mr_alloc(dev, to_mpd(pd)->pd_num, PAGE_SHIFT, virt, length,
convert_access(acc), mr);
if (err)
@@ -1080,7 +1053,8 @@ static ssize_t hw_rev_show(struct device *device,
struct device_attribute *attr, char *buf)
{
struct mthca_dev *dev =
- container_of(device, struct mthca_dev, ib_dev.dev);
+ rdma_device_to_drv_device(device, struct mthca_dev, ib_dev);
+
return sprintf(buf, "%x\n", dev->rev_id);
}
static DEVICE_ATTR_RO(hw_rev);
@@ -1089,7 +1063,8 @@ static ssize_t hca_type_show(struct device *device,
struct device_attribute *attr, char *buf)
{
struct mthca_dev *dev =
- container_of(device, struct mthca_dev, ib_dev.dev);
+ rdma_device_to_drv_device(device, struct mthca_dev, ib_dev);
+
switch (dev->pdev->device) {
case PCI_DEVICE_ID_MELLANOX_TAVOR:
return sprintf(buf, "MT23108\n");
@@ -1110,7 +1085,8 @@ static ssize_t board_id_show(struct device *device,
struct device_attribute *attr, char *buf)
{
struct mthca_dev *dev =
- container_of(device, struct mthca_dev, ib_dev.dev);
+ rdma_device_to_drv_device(device, struct mthca_dev, ib_dev);
+
return sprintf(buf, "%.*s\n", MTHCA_BOARD_ID_LEN, dev->board_id);
}
static DEVICE_ATTR_RO(board_id);
@@ -1193,6 +1169,83 @@ static void get_dev_fw_str(struct ib_device *device, char *str)
(int) dev->fw_ver & 0xffff);
}
+static const struct ib_device_ops mthca_dev_ops = {
+ .alloc_pd = mthca_alloc_pd,
+ .alloc_ucontext = mthca_alloc_ucontext,
+ .attach_mcast = mthca_multicast_attach,
+ .create_ah = mthca_ah_create,
+ .create_cq = mthca_create_cq,
+ .create_qp = mthca_create_qp,
+ .dealloc_pd = mthca_dealloc_pd,
+ .dealloc_ucontext = mthca_dealloc_ucontext,
+ .dereg_mr = mthca_dereg_mr,
+ .destroy_ah = mthca_ah_destroy,
+ .destroy_cq = mthca_destroy_cq,
+ .destroy_qp = mthca_destroy_qp,
+ .detach_mcast = mthca_multicast_detach,
+ .get_dev_fw_str = get_dev_fw_str,
+ .get_dma_mr = mthca_get_dma_mr,
+ .get_port_immutable = mthca_port_immutable,
+ .mmap = mthca_mmap_uar,
+ .modify_device = mthca_modify_device,
+ .modify_port = mthca_modify_port,
+ .modify_qp = mthca_modify_qp,
+ .poll_cq = mthca_poll_cq,
+ .process_mad = mthca_process_mad,
+ .query_ah = mthca_ah_query,
+ .query_device = mthca_query_device,
+ .query_gid = mthca_query_gid,
+ .query_pkey = mthca_query_pkey,
+ .query_port = mthca_query_port,
+ .query_qp = mthca_query_qp,
+ .reg_user_mr = mthca_reg_user_mr,
+ .resize_cq = mthca_resize_cq,
+ INIT_RDMA_OBJ_SIZE(ib_pd, mthca_pd, ibpd),
+ INIT_RDMA_OBJ_SIZE(ib_ucontext, mthca_ucontext, ibucontext),
+};
+
+static const struct ib_device_ops mthca_dev_arbel_srq_ops = {
+ .create_srq = mthca_create_srq,
+ .destroy_srq = mthca_destroy_srq,
+ .modify_srq = mthca_modify_srq,
+ .post_srq_recv = mthca_arbel_post_srq_recv,
+ .query_srq = mthca_query_srq,
+};
+
+static const struct ib_device_ops mthca_dev_tavor_srq_ops = {
+ .create_srq = mthca_create_srq,
+ .destroy_srq = mthca_destroy_srq,
+ .modify_srq = mthca_modify_srq,
+ .post_srq_recv = mthca_tavor_post_srq_recv,
+ .query_srq = mthca_query_srq,
+};
+
+static const struct ib_device_ops mthca_dev_arbel_fmr_ops = {
+ .alloc_fmr = mthca_alloc_fmr,
+ .dealloc_fmr = mthca_dealloc_fmr,
+ .map_phys_fmr = mthca_arbel_map_phys_fmr,
+ .unmap_fmr = mthca_unmap_fmr,
+};
+
+static const struct ib_device_ops mthca_dev_tavor_fmr_ops = {
+ .alloc_fmr = mthca_alloc_fmr,
+ .dealloc_fmr = mthca_dealloc_fmr,
+ .map_phys_fmr = mthca_tavor_map_phys_fmr,
+ .unmap_fmr = mthca_unmap_fmr,
+};
+
+static const struct ib_device_ops mthca_dev_arbel_ops = {
+ .post_recv = mthca_arbel_post_receive,
+ .post_send = mthca_arbel_post_send,
+ .req_notify_cq = mthca_arbel_arm_cq,
+};
+
+static const struct ib_device_ops mthca_dev_tavor_ops = {
+ .post_recv = mthca_tavor_post_receive,
+ .post_send = mthca_tavor_post_send,
+ .req_notify_cq = mthca_tavor_arm_cq,
+};
+
int mthca_register_device(struct mthca_dev *dev)
{
int ret;
@@ -1226,26 +1279,8 @@ int mthca_register_device(struct mthca_dev *dev)
dev->ib_dev.phys_port_cnt = dev->limits.num_ports;
dev->ib_dev.num_comp_vectors = 1;
dev->ib_dev.dev.parent = &dev->pdev->dev;
- dev->ib_dev.query_device = mthca_query_device;
- dev->ib_dev.query_port = mthca_query_port;
- dev->ib_dev.modify_device = mthca_modify_device;
- dev->ib_dev.modify_port = mthca_modify_port;
- dev->ib_dev.query_pkey = mthca_query_pkey;
- dev->ib_dev.query_gid = mthca_query_gid;
- dev->ib_dev.alloc_ucontext = mthca_alloc_ucontext;
- dev->ib_dev.dealloc_ucontext = mthca_dealloc_ucontext;
- dev->ib_dev.mmap = mthca_mmap_uar;
- dev->ib_dev.alloc_pd = mthca_alloc_pd;
- dev->ib_dev.dealloc_pd = mthca_dealloc_pd;
- dev->ib_dev.create_ah = mthca_ah_create;
- dev->ib_dev.query_ah = mthca_ah_query;
- dev->ib_dev.destroy_ah = mthca_ah_destroy;
if (dev->mthca_flags & MTHCA_FLAG_SRQ) {
- dev->ib_dev.create_srq = mthca_create_srq;
- dev->ib_dev.modify_srq = mthca_modify_srq;
- dev->ib_dev.query_srq = mthca_query_srq;
- dev->ib_dev.destroy_srq = mthca_destroy_srq;
dev->ib_dev.uverbs_cmd_mask |=
(1ull << IB_USER_VERBS_CMD_CREATE_SRQ) |
(1ull << IB_USER_VERBS_CMD_MODIFY_SRQ) |
@@ -1253,54 +1288,34 @@ int mthca_register_device(struct mthca_dev *dev)
(1ull << IB_USER_VERBS_CMD_DESTROY_SRQ);
if (mthca_is_memfree(dev))
- dev->ib_dev.post_srq_recv = mthca_arbel_post_srq_recv;
+ ib_set_device_ops(&dev->ib_dev,
+ &mthca_dev_arbel_srq_ops);
else
- dev->ib_dev.post_srq_recv = mthca_tavor_post_srq_recv;
+ ib_set_device_ops(&dev->ib_dev,
+ &mthca_dev_tavor_srq_ops);
}
- dev->ib_dev.create_qp = mthca_create_qp;
- dev->ib_dev.modify_qp = mthca_modify_qp;
- dev->ib_dev.query_qp = mthca_query_qp;
- dev->ib_dev.destroy_qp = mthca_destroy_qp;
- dev->ib_dev.create_cq = mthca_create_cq;
- dev->ib_dev.resize_cq = mthca_resize_cq;
- dev->ib_dev.destroy_cq = mthca_destroy_cq;
- dev->ib_dev.poll_cq = mthca_poll_cq;
- dev->ib_dev.get_dma_mr = mthca_get_dma_mr;
- dev->ib_dev.reg_user_mr = mthca_reg_user_mr;
- dev->ib_dev.dereg_mr = mthca_dereg_mr;
- dev->ib_dev.get_port_immutable = mthca_port_immutable;
- dev->ib_dev.get_dev_fw_str = get_dev_fw_str;
-
if (dev->mthca_flags & MTHCA_FLAG_FMR) {
- dev->ib_dev.alloc_fmr = mthca_alloc_fmr;
- dev->ib_dev.unmap_fmr = mthca_unmap_fmr;
- dev->ib_dev.dealloc_fmr = mthca_dealloc_fmr;
if (mthca_is_memfree(dev))
- dev->ib_dev.map_phys_fmr = mthca_arbel_map_phys_fmr;
+ ib_set_device_ops(&dev->ib_dev,
+ &mthca_dev_arbel_fmr_ops);
else
- dev->ib_dev.map_phys_fmr = mthca_tavor_map_phys_fmr;
+ ib_set_device_ops(&dev->ib_dev,
+ &mthca_dev_tavor_fmr_ops);
}
- dev->ib_dev.attach_mcast = mthca_multicast_attach;
- dev->ib_dev.detach_mcast = mthca_multicast_detach;
- dev->ib_dev.process_mad = mthca_process_mad;
+ ib_set_device_ops(&dev->ib_dev, &mthca_dev_ops);
- if (mthca_is_memfree(dev)) {
- dev->ib_dev.req_notify_cq = mthca_arbel_arm_cq;
- dev->ib_dev.post_send = mthca_arbel_post_send;
- dev->ib_dev.post_recv = mthca_arbel_post_receive;
- } else {
- dev->ib_dev.req_notify_cq = mthca_tavor_arm_cq;
- dev->ib_dev.post_send = mthca_tavor_post_send;
- dev->ib_dev.post_recv = mthca_tavor_post_receive;
- }
+ if (mthca_is_memfree(dev))
+ ib_set_device_ops(&dev->ib_dev, &mthca_dev_arbel_ops);
+ else
+ ib_set_device_ops(&dev->ib_dev, &mthca_dev_tavor_ops);
mutex_init(&dev->cap_mask_mutex);
rdma_set_device_sysfs_group(&dev->ib_dev, &mthca_attr_group);
dev->ib_dev.driver_id = RDMA_DRIVER_MTHCA;
- ret = ib_register_device(&dev->ib_dev, "mthca%d", NULL);
+ ret = ib_register_device(&dev->ib_dev, "mthca%d");
if (ret)
return ret;
diff --git a/drivers/infiniband/hw/mthca/mthca_qp.c b/drivers/infiniband/hw/mthca/mthca_qp.c
index 9d178ee3c96a..7a5b25d13faa 100644
--- a/drivers/infiniband/hw/mthca/mthca_qp.c
+++ b/drivers/infiniband/hw/mthca/mthca_qp.c
@@ -42,6 +42,7 @@
#include <rdma/ib_verbs.h>
#include <rdma/ib_cache.h>
#include <rdma/ib_pack.h>
+#include <rdma/uverbs_ioctl.h>
#include "mthca_dev.h"
#include "mthca_cmd.h"
@@ -554,10 +555,14 @@ static int mthca_path_set(struct mthca_dev *dev, const struct rdma_ah_attr *ah,
static int __mthca_modify_qp(struct ib_qp *ibqp,
const struct ib_qp_attr *attr, int attr_mask,
- enum ib_qp_state cur_state, enum ib_qp_state new_state)
+ enum ib_qp_state cur_state,
+ enum ib_qp_state new_state,
+ struct ib_udata *udata)
{
struct mthca_dev *dev = to_mdev(ibqp->device);
struct mthca_qp *qp = to_mqp(ibqp);
+ struct mthca_ucontext *context = rdma_udata_to_drv_context(
+ udata, struct mthca_ucontext, ibucontext);
struct mthca_mailbox *mailbox;
struct mthca_qp_param *qp_param;
struct mthca_qp_context *qp_context;
@@ -619,8 +624,7 @@ static int __mthca_modify_qp(struct ib_qp *ibqp,
/* leave arbel_sched_queue as 0 */
if (qp->ibqp.uobject)
- qp_context->usr_page =
- cpu_to_be32(to_mucontext(qp->ibqp.uobject->context)->uar.index);
+ qp_context->usr_page = cpu_to_be32(context->uar.index);
else
qp_context->usr_page = cpu_to_be32(dev->driver_uar.index);
qp_context->local_qpn = cpu_to_be32(qp->qpn);
@@ -913,7 +917,8 @@ int mthca_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask,
goto out;
}
- err = __mthca_modify_qp(ibqp, attr, attr_mask, cur_state, new_state);
+ err = __mthca_modify_qp(ibqp, attr, attr_mask, cur_state, new_state,
+ udata);
out:
mutex_unlock(&qp->mutex);
@@ -981,7 +986,8 @@ static void mthca_adjust_qp_caps(struct mthca_dev *dev,
*/
static int mthca_alloc_wqe_buf(struct mthca_dev *dev,
struct mthca_pd *pd,
- struct mthca_qp *qp)
+ struct mthca_qp *qp,
+ struct ib_udata *udata)
{
int size;
int err = -ENOMEM;
@@ -1048,7 +1054,7 @@ static int mthca_alloc_wqe_buf(struct mthca_dev *dev,
* allocate anything. All we need is to calculate the WQE
* sizes and the send_wqe_offset, so we're done now.
*/
- if (pd->ibpd.uobject)
+ if (udata)
return 0;
size = PAGE_ALIGN(qp->send_wqe_offset +
@@ -1155,7 +1161,8 @@ static int mthca_alloc_qp_common(struct mthca_dev *dev,
struct mthca_cq *send_cq,
struct mthca_cq *recv_cq,
enum ib_sig_type send_policy,
- struct mthca_qp *qp)
+ struct mthca_qp *qp,
+ struct ib_udata *udata)
{
int ret;
int i;
@@ -1178,7 +1185,7 @@ static int mthca_alloc_qp_common(struct mthca_dev *dev,
if (ret)
return ret;
- ret = mthca_alloc_wqe_buf(dev, pd, qp);
+ ret = mthca_alloc_wqe_buf(dev, pd, qp, udata);
if (ret) {
mthca_unmap_memfree(dev, qp);
return ret;
@@ -1191,7 +1198,7 @@ static int mthca_alloc_qp_common(struct mthca_dev *dev,
* will be allocated and buffers will be initialized in
* userspace.
*/
- if (pd->ibpd.uobject)
+ if (udata)
return 0;
ret = mthca_alloc_memfree(dev, qp);
@@ -1285,7 +1292,8 @@ int mthca_alloc_qp(struct mthca_dev *dev,
enum ib_qp_type type,
enum ib_sig_type send_policy,
struct ib_qp_cap *cap,
- struct mthca_qp *qp)
+ struct mthca_qp *qp,
+ struct ib_udata *udata)
{
int err;
@@ -1308,7 +1316,7 @@ int mthca_alloc_qp(struct mthca_dev *dev,
qp->port = 0;
err = mthca_alloc_qp_common(dev, pd, send_cq, recv_cq,
- send_policy, qp);
+ send_policy, qp, udata);
if (err) {
mthca_free(&dev->qp_table.alloc, qp->qpn);
return err;
@@ -1360,7 +1368,8 @@ int mthca_alloc_sqp(struct mthca_dev *dev,
struct ib_qp_cap *cap,
int qpn,
int port,
- struct mthca_sqp *sqp)
+ struct mthca_sqp *sqp,
+ struct ib_udata *udata)
{
u32 mqpn = qpn * 2 + dev->qp_table.sqp_start + port - 1;
int err;
@@ -1391,7 +1400,7 @@ int mthca_alloc_sqp(struct mthca_dev *dev,
sqp->qp.transport = MLX;
err = mthca_alloc_qp_common(dev, pd, send_cq, recv_cq,
- send_policy, &sqp->qp);
+ send_policy, &sqp->qp, udata);
if (err)
goto err_out_free;
diff --git a/drivers/infiniband/hw/mthca/mthca_srq.c b/drivers/infiniband/hw/mthca/mthca_srq.c
index 9a3fc6fb0d7e..06b920385512 100644
--- a/drivers/infiniband/hw/mthca/mthca_srq.c
+++ b/drivers/infiniband/hw/mthca/mthca_srq.c
@@ -36,6 +36,8 @@
#include <asm/io.h>
+#include <rdma/uverbs_ioctl.h>
+
#include "mthca_dev.h"
#include "mthca_cmd.h"
#include "mthca_memfree.h"
@@ -95,17 +97,20 @@ static inline int *wqe_to_link(void *wqe)
static void mthca_tavor_init_srq_context(struct mthca_dev *dev,
struct mthca_pd *pd,
struct mthca_srq *srq,
- struct mthca_tavor_srq_context *context)
+ struct mthca_tavor_srq_context *context,
+ struct ib_udata *udata)
{
+ struct mthca_ucontext *ucontext = rdma_udata_to_drv_context(
+ udata, struct mthca_ucontext, ibucontext);
+
memset(context, 0, sizeof *context);
context->wqe_base_ds = cpu_to_be64(1 << (srq->wqe_shift - 4));
context->state_pd = cpu_to_be32(pd->pd_num);
context->lkey = cpu_to_be32(srq->mr.ibmr.lkey);
- if (pd->ibpd.uobject)
- context->uar =
- cpu_to_be32(to_mucontext(pd->ibpd.uobject->context)->uar.index);
+ if (udata)
+ context->uar = cpu_to_be32(ucontext->uar.index);
else
context->uar = cpu_to_be32(dev->driver_uar.index);
}
@@ -113,8 +118,11 @@ static void mthca_tavor_init_srq_context(struct mthca_dev *dev,
static void mthca_arbel_init_srq_context(struct mthca_dev *dev,
struct mthca_pd *pd,
struct mthca_srq *srq,
- struct mthca_arbel_srq_context *context)
+ struct mthca_arbel_srq_context *context,
+ struct ib_udata *udata)
{
+ struct mthca_ucontext *ucontext = rdma_udata_to_drv_context(
+ udata, struct mthca_ucontext, ibucontext);
int logsize, max;
memset(context, 0, sizeof *context);
@@ -129,9 +137,8 @@ static void mthca_arbel_init_srq_context(struct mthca_dev *dev,
context->lkey = cpu_to_be32(srq->mr.ibmr.lkey);
context->db_index = cpu_to_be32(srq->db_index);
context->logstride_usrpage = cpu_to_be32((srq->wqe_shift - 4) << 29);
- if (pd->ibpd.uobject)
- context->logstride_usrpage |=
- cpu_to_be32(to_mucontext(pd->ibpd.uobject->context)->uar.index);
+ if (udata)
+ context->logstride_usrpage |= cpu_to_be32(ucontext->uar.index);
else
context->logstride_usrpage |= cpu_to_be32(dev->driver_uar.index);
context->eq_pd = cpu_to_be32(MTHCA_EQ_ASYNC << 24 | pd->pd_num);
@@ -145,14 +152,14 @@ static void mthca_free_srq_buf(struct mthca_dev *dev, struct mthca_srq *srq)
}
static int mthca_alloc_srq_buf(struct mthca_dev *dev, struct mthca_pd *pd,
- struct mthca_srq *srq)
+ struct mthca_srq *srq, struct ib_udata *udata)
{
struct mthca_data_seg *scatter;
void *wqe;
int err;
int i;
- if (pd->ibpd.uobject)
+ if (udata)
return 0;
srq->wrid = kmalloc_array(srq->max, sizeof(u64), GFP_KERNEL);
@@ -197,7 +204,8 @@ static int mthca_alloc_srq_buf(struct mthca_dev *dev, struct mthca_pd *pd,
}
int mthca_alloc_srq(struct mthca_dev *dev, struct mthca_pd *pd,
- struct ib_srq_attr *attr, struct mthca_srq *srq)
+ struct ib_srq_attr *attr, struct mthca_srq *srq,
+ struct ib_udata *udata)
{
struct mthca_mailbox *mailbox;
int ds;
@@ -235,7 +243,7 @@ int mthca_alloc_srq(struct mthca_dev *dev, struct mthca_pd *pd,
if (err)
goto err_out;
- if (!pd->ibpd.uobject) {
+ if (!udata) {
srq->db_index = mthca_alloc_db(dev, MTHCA_DB_TYPE_SRQ,
srq->srqn, &srq->db);
if (srq->db_index < 0) {
@@ -251,7 +259,7 @@ int mthca_alloc_srq(struct mthca_dev *dev, struct mthca_pd *pd,
goto err_out_db;
}
- err = mthca_alloc_srq_buf(dev, pd, srq);
+ err = mthca_alloc_srq_buf(dev, pd, srq, udata);
if (err)
goto err_out_mailbox;
@@ -261,9 +269,9 @@ int mthca_alloc_srq(struct mthca_dev *dev, struct mthca_pd *pd,
mutex_init(&srq->mutex);
if (mthca_is_memfree(dev))
- mthca_arbel_init_srq_context(dev, pd, srq, mailbox->buf);
+ mthca_arbel_init_srq_context(dev, pd, srq, mailbox->buf, udata);
else
- mthca_tavor_init_srq_context(dev, pd, srq, mailbox->buf);
+ mthca_tavor_init_srq_context(dev, pd, srq, mailbox->buf, udata);
err = mthca_SW2HW_SRQ(dev, mailbox, srq->srqn);
@@ -297,14 +305,14 @@ err_out_free_srq:
mthca_warn(dev, "HW2SW_SRQ failed (%d)\n", err);
err_out_free_buf:
- if (!pd->ibpd.uobject)
+ if (!udata)
mthca_free_srq_buf(dev, srq);
err_out_mailbox:
mthca_free_mailbox(dev, mailbox);
err_out_db:
- if (!pd->ibpd.uobject && mthca_is_memfree(dev))
+ if (!udata && mthca_is_memfree(dev))
mthca_free_db(dev, MTHCA_DB_TYPE_SRQ, srq->db_index);
err_out_icm:
diff --git a/drivers/infiniband/hw/nes/Kconfig b/drivers/infiniband/hw/nes/Kconfig
index 7964eba8e7ed..52caae954e4a 100644
--- a/drivers/infiniband/hw/nes/Kconfig
+++ b/drivers/infiniband/hw/nes/Kconfig
@@ -1,6 +1,6 @@
config INFINIBAND_NES
tristate "NetEffect RNIC Driver"
- depends on PCI && INET && INFINIBAND
+ depends on PCI && INET
select LIBCRC32C
---help---
This is the RDMA Network Interface Card (RNIC) driver for
diff --git a/drivers/infiniband/hw/nes/nes_cm.c b/drivers/infiniband/hw/nes/nes_cm.c
index 2b67ace5b614..032883180f65 100644
--- a/drivers/infiniband/hw/nes/nes_cm.c
+++ b/drivers/infiniband/hw/nes/nes_cm.c
@@ -3033,7 +3033,7 @@ static int nes_disconnect(struct nes_qp *nesqp, int abrupt)
/* Need to free the Last Streaming Mode Message */
if (nesqp->ietf_frame) {
if (nesqp->lsmm_mr)
- nesibdev->ibdev.dereg_mr(nesqp->lsmm_mr);
+ nesibdev->ibdev.ops.dereg_mr(nesqp->lsmm_mr);
pci_free_consistent(nesdev->pcidev,
nesqp->private_data_len + nesqp->ietf_frame_size,
nesqp->ietf_frame, nesqp->ietf_frame_pbase);
diff --git a/drivers/infiniband/hw/nes/nes_mgt.c b/drivers/infiniband/hw/nes/nes_mgt.c
index e96ffff61c3a..cc4dce5c3e5f 100644
--- a/drivers/infiniband/hw/nes/nes_mgt.c
+++ b/drivers/infiniband/hw/nes/nes_mgt.c
@@ -223,11 +223,11 @@ static struct sk_buff *nes_get_next_skb(struct nes_device *nesdev, struct nes_qp
}
old_skb = skb;
- skb = skb->next;
+ skb = skb_peek_next(skb, &nesqp->pau_list);
skb_unlink(old_skb, &nesqp->pau_list);
nes_mgt_free_skb(nesdev, old_skb, PCI_DMA_TODEVICE);
nes_rem_ref_cm_node(nesqp->cm_node);
- if (skb == (struct sk_buff *)&nesqp->pau_list)
+ if (!skb)
goto out;
}
return skb;
@@ -551,14 +551,14 @@ static void queue_fpdus(struct sk_buff *skb, struct nes_vnic *nesvnic, struct ne
/* Queue skb by sequence number */
if (skb_queue_len(&nesqp->pau_list) == 0) {
- skb_queue_head(&nesqp->pau_list, skb);
+ __skb_queue_head(&nesqp->pau_list, skb);
} else {
skb_queue_walk(&nesqp->pau_list, tmpskb) {
cb = (struct nes_rskb_cb *)&tmpskb->cb[0];
if (before(seqnum, cb->seqnum))
break;
}
- skb_insert(tmpskb, skb, &nesqp->pau_list);
+ __skb_insert(skb, tmpskb->prev, tmpskb, &nesqp->pau_list);
}
if (nesqp->pau_state == PAU_READY)
process_it = true;
diff --git a/drivers/infiniband/hw/nes/nes_verbs.c b/drivers/infiniband/hw/nes/nes_verbs.c
index 92d1cadd4cfd..828e4af3f951 100644
--- a/drivers/infiniband/hw/nes/nes_verbs.c
+++ b/drivers/infiniband/hw/nes/nes_verbs.c
@@ -41,6 +41,7 @@
#include <rdma/ib_verbs.h>
#include <rdma/iw_cm.h>
#include <rdma/ib_user_verbs.h>
+#include <rdma/uverbs_ioctl.h>
#include "nes.h"
@@ -528,42 +529,36 @@ static int nes_query_gid(struct ib_device *ibdev, u8 port,
* nes_alloc_ucontext - Allocate the user context data structure. This keeps track
* of all objects associated with a particular user-mode client.
*/
-static struct ib_ucontext *nes_alloc_ucontext(struct ib_device *ibdev,
- struct ib_udata *udata)
+static int nes_alloc_ucontext(struct ib_ucontext *uctx, struct ib_udata *udata)
{
+ struct ib_device *ibdev = uctx->device;
struct nes_vnic *nesvnic = to_nesvnic(ibdev);
struct nes_device *nesdev = nesvnic->nesdev;
struct nes_adapter *nesadapter = nesdev->nesadapter;
struct nes_alloc_ucontext_req req;
- struct nes_alloc_ucontext_resp uresp;
- struct nes_ucontext *nes_ucontext;
+ struct nes_alloc_ucontext_resp uresp = {};
+ struct nes_ucontext *nes_ucontext = to_nesucontext(uctx);
struct nes_ib_device *nesibdev = nesvnic->nesibdev;
if (ib_copy_from_udata(&req, udata, sizeof(struct nes_alloc_ucontext_req))) {
printk(KERN_ERR PFX "Invalid structure size on allocate user context.\n");
- return ERR_PTR(-EINVAL);
+ return -EINVAL;
}
if (req.userspace_ver != NES_ABI_USERSPACE_VER) {
printk(KERN_ERR PFX "Invalid userspace driver version detected. Detected version %d, should be %d\n",
req.userspace_ver, NES_ABI_USERSPACE_VER);
- return ERR_PTR(-EINVAL);
+ return -EINVAL;
}
- memset(&uresp, 0, sizeof uresp);
-
uresp.max_qps = nesibdev->max_qp;
uresp.max_pds = nesibdev->max_pd;
uresp.wq_size = nesdev->nesadapter->max_qp_wr * 2;
uresp.virtwq = nesadapter->virtwq;
uresp.kernel_ver = NES_ABI_KERNEL_VER;
- nes_ucontext = kzalloc(sizeof *nes_ucontext, GFP_KERNEL);
- if (!nes_ucontext)
- return ERR_PTR(-ENOMEM);
-
nes_ucontext->nesdev = nesdev;
nes_ucontext->mmap_wq_offset = uresp.max_pds;
nes_ucontext->mmap_cq_offset = nes_ucontext->mmap_wq_offset +
@@ -571,34 +566,22 @@ static struct ib_ucontext *nes_alloc_ucontext(struct ib_device *ibdev,
PAGE_SIZE;
- if (ib_copy_to_udata(udata, &uresp, sizeof uresp)) {
- kfree(nes_ucontext);
- return ERR_PTR(-EFAULT);
- }
+ if (ib_copy_to_udata(udata, &uresp, sizeof(uresp)))
+ return -EFAULT;
INIT_LIST_HEAD(&nes_ucontext->cq_reg_mem_list);
INIT_LIST_HEAD(&nes_ucontext->qp_reg_mem_list);
- atomic_set(&nes_ucontext->usecnt, 1);
- return &nes_ucontext->ibucontext;
+ return 0;
}
-
/**
* nes_dealloc_ucontext
*/
-static int nes_dealloc_ucontext(struct ib_ucontext *context)
+static void nes_dealloc_ucontext(struct ib_ucontext *context)
{
- /* struct nes_vnic *nesvnic = to_nesvnic(context->device); */
- /* struct nes_device *nesdev = nesvnic->nesdev; */
- struct nes_ucontext *nes_ucontext = to_nesucontext(context);
-
- if (!atomic_dec_and_test(&nes_ucontext->usecnt))
- return 0;
- kfree(nes_ucontext);
- return 0;
+ return;
}
-
/**
* nes_mmap
*/
@@ -658,10 +641,11 @@ static int nes_mmap(struct ib_ucontext *context, struct vm_area_struct *vma)
/**
* nes_alloc_pd
*/
-static struct ib_pd *nes_alloc_pd(struct ib_device *ibdev,
- struct ib_ucontext *context, struct ib_udata *udata)
+static int nes_alloc_pd(struct ib_pd *pd, struct ib_ucontext *context,
+ struct ib_udata *udata)
{
- struct nes_pd *nespd;
+ struct ib_device *ibdev = pd->device;
+ struct nes_pd *nespd = to_nespd(pd);
struct nes_vnic *nesvnic = to_nesvnic(ibdev);
struct nes_device *nesdev = nesvnic->nesdev;
struct nes_adapter *nesadapter = nesdev->nesadapter;
@@ -676,15 +660,8 @@ static struct ib_pd *nes_alloc_pd(struct ib_device *ibdev,
err = nes_alloc_resource(nesadapter, nesadapter->allocated_pds,
nesadapter->max_pd, &pd_num, &nesadapter->next_pd, NES_RESOURCE_PD);
- if (err) {
- return ERR_PTR(err);
- }
-
- nespd = kzalloc(sizeof (struct nes_pd), GFP_KERNEL);
- if (!nespd) {
- nes_free_resource(nesadapter, nesadapter->allocated_pds, pd_num);
- return ERR_PTR(-ENOMEM);
- }
+ if (err)
+ return err;
nes_debug(NES_DBG_PD, "Allocating PD (%p) for ib device %s\n",
nespd, dev_name(&nesvnic->nesibdev->ibdev.dev));
@@ -700,16 +677,14 @@ static struct ib_pd *nes_alloc_pd(struct ib_device *ibdev,
if (nespd->mmap_db_index >= NES_MAX_USER_DB_REGIONS) {
nes_debug(NES_DBG_PD, "mmap_db_index > MAX\n");
nes_free_resource(nesadapter, nesadapter->allocated_pds, pd_num);
- kfree(nespd);
- return ERR_PTR(-ENOMEM);
+ return -ENOMEM;
}
uresp.pd_id = nespd->pd_id;
uresp.mmap_db_index = nespd->mmap_db_index;
if (ib_copy_to_udata(udata, &uresp, sizeof (struct nes_alloc_pd_resp))) {
nes_free_resource(nesadapter, nesadapter->allocated_pds, pd_num);
- kfree(nespd);
- return ERR_PTR(-EFAULT);
+ return -EFAULT;
}
set_bit(nespd->mmap_db_index, nesucontext->allocated_doorbells);
@@ -718,14 +693,14 @@ static struct ib_pd *nes_alloc_pd(struct ib_device *ibdev,
}
nes_debug(NES_DBG_PD, "PD%u structure located @%p.\n", nespd->pd_id, nespd);
- return &nespd->ibpd;
+ return 0;
}
/**
* nes_dealloc_pd
*/
-static int nes_dealloc_pd(struct ib_pd *ibpd)
+static void nes_dealloc_pd(struct ib_pd *ibpd)
{
struct nes_ucontext *nesucontext;
struct nes_pd *nespd = to_nespd(ibpd);
@@ -748,9 +723,6 @@ static int nes_dealloc_pd(struct ib_pd *ibpd)
nespd->pd_id, nespd);
nes_free_resource(nesadapter, nesadapter->allocated_pds,
(nespd->pd_id-nesadapter->base_pd)>>(PAGE_SHIFT-12));
- kfree(nespd);
-
- return 0;
}
@@ -985,7 +957,8 @@ static struct ib_qp *nes_create_qp(struct ib_pd *ibpd,
struct nes_adapter *nesadapter = nesdev->nesadapter;
struct nes_qp *nesqp;
struct nes_cq *nescq;
- struct nes_ucontext *nes_ucontext;
+ struct nes_ucontext *nes_ucontext = rdma_udata_to_drv_context(
+ udata, struct nes_ucontext, ibucontext);
struct nes_hw_cqp_wqe *cqp_wqe;
struct nes_cqp_request *cqp_request;
struct nes_create_qp_req req;
@@ -1066,9 +1039,8 @@ static struct ib_qp *nes_create_qp(struct ib_pd *ibpd,
}
if (req.user_qp_buffer)
nesqp->nesuqp_addr = req.user_qp_buffer;
- if ((ibpd->uobject) && (ibpd->uobject->context)) {
+ if (udata) {
nesqp->user_mode = 1;
- nes_ucontext = to_nesucontext(ibpd->uobject->context);
if (virt_wqs) {
err = 1;
list_for_each_entry(nespbl, &nes_ucontext->qp_reg_mem_list, list) {
@@ -1089,7 +1061,6 @@ static struct ib_qp *nes_create_qp(struct ib_pd *ibpd,
}
}
- nes_ucontext = to_nesucontext(ibpd->uobject->context);
nesqp->mmap_sq_db_index =
find_next_zero_bit(nes_ucontext->allocated_wqs,
NES_MAX_USER_WQ_REGIONS, nes_ucontext->first_free_wq);
@@ -1257,7 +1228,7 @@ static struct ib_qp *nes_create_qp(struct ib_pd *ibpd,
nes_put_cqp_request(nesdev, cqp_request);
- if (ibpd->uobject) {
+ if (udata) {
uresp.mmap_sq_db_index = nesqp->mmap_sq_db_index;
uresp.mmap_rq_db_index = 0;
uresp.actual_sq_size = sq_size;
@@ -2109,18 +2080,18 @@ static struct ib_mr *nes_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
struct nes_device *nesdev = nesvnic->nesdev;
struct nes_adapter *nesadapter = nesdev->nesadapter;
struct ib_mr *ibmr = ERR_PTR(-EINVAL);
- struct scatterlist *sg;
- struct nes_ucontext *nes_ucontext;
+ struct sg_dma_page_iter dma_iter;
+ struct nes_ucontext *nes_ucontext = rdma_udata_to_drv_context(
+ udata, struct nes_ucontext, ibucontext);
struct nes_pbl *nespbl;
struct nes_mr *nesmr;
struct ib_umem *region;
struct nes_mem_reg_req req;
struct nes_vpbl vpbl;
struct nes_root_vpbl root_vpbl;
- int entry, page_index;
+ int page_index;
int page_count = 0;
int err, pbl_depth = 0;
- int chunk_pages;
int ret;
u32 stag;
u32 stag_index = 0;
@@ -2132,9 +2103,8 @@ static struct ib_mr *nes_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
u16 pbl_count;
u8 single_page = 1;
u8 stag_key;
- int first_page = 1;
- region = ib_umem_get(pd->uobject->context, start, length, acc, 0);
+ region = ib_umem_get(udata, start, length, acc, 0);
if (IS_ERR(region)) {
return (struct ib_mr *)region;
}
@@ -2183,127 +2153,99 @@ static struct ib_mr *nes_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
}
nesmr->region = region;
- for_each_sg(region->sg_head.sgl, sg, region->nmap, entry) {
- if (sg_dma_address(sg) & ~PAGE_MASK) {
- ib_umem_release(region);
- nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index);
- nes_debug(NES_DBG_MR, "Unaligned Memory Buffer: 0x%x\n",
- (unsigned int) sg_dma_address(sg));
- ibmr = ERR_PTR(-EINVAL);
- kfree(nesmr);
- goto reg_user_mr_err;
- }
-
- if (!sg_dma_len(sg)) {
- ib_umem_release(region);
- nes_free_resource(nesadapter, nesadapter->allocated_mrs,
- stag_index);
- nes_debug(NES_DBG_MR, "Invalid Buffer Size\n");
- ibmr = ERR_PTR(-EINVAL);
- kfree(nesmr);
- goto reg_user_mr_err;
- }
+ for_each_sg_dma_page (region->sg_head.sgl, &dma_iter, region->nmap, 0) {
- region_length += sg_dma_len(sg);
- chunk_pages = sg_dma_len(sg) >> 12;
+ region_length += PAGE_SIZE;
region_length -= skip_pages << 12;
- for (page_index = skip_pages; page_index < chunk_pages; page_index++) {
- skip_pages = 0;
- if ((page_count != 0) && (page_count << 12) - (ib_umem_offset(region) & (4096 - 1)) >= region->length)
- goto enough_pages;
- if ((page_count&0x01FF) == 0) {
- if (page_count >= 1024 * 512) {
+ skip_pages = 0;
+ if ((page_count != 0) && (page_count << 12) - (ib_umem_offset(region) & (4096 - 1)) >= region->length)
+ goto enough_pages;
+ if ((page_count & 0x01FF) == 0) {
+ if (page_count >= 1024 * 512) {
+ ib_umem_release(region);
+ nes_free_resource(nesadapter,
+ nesadapter->allocated_mrs, stag_index);
+ kfree(nesmr);
+ ibmr = ERR_PTR(-E2BIG);
+ goto reg_user_mr_err;
+ }
+ if (root_pbl_index == 1) {
+ root_vpbl.pbl_vbase = pci_alloc_consistent(nesdev->pcidev,
+ 8192, &root_vpbl.pbl_pbase);
+ nes_debug(NES_DBG_MR, "Allocating root PBL, va = %p, pa = 0x%08X\n",
+ root_vpbl.pbl_vbase, (unsigned int)root_vpbl.pbl_pbase);
+ if (!root_vpbl.pbl_vbase) {
ib_umem_release(region);
- nes_free_resource(nesadapter,
- nesadapter->allocated_mrs, stag_index);
+ pci_free_consistent(nesdev->pcidev, 4096, vpbl.pbl_vbase,
+ vpbl.pbl_pbase);
+ nes_free_resource(nesadapter, nesadapter->allocated_mrs,
+ stag_index);
kfree(nesmr);
- ibmr = ERR_PTR(-E2BIG);
+ ibmr = ERR_PTR(-ENOMEM);
goto reg_user_mr_err;
}
- if (root_pbl_index == 1) {
- root_vpbl.pbl_vbase = pci_alloc_consistent(nesdev->pcidev,
- 8192, &root_vpbl.pbl_pbase);
- nes_debug(NES_DBG_MR, "Allocating root PBL, va = %p, pa = 0x%08X\n",
- root_vpbl.pbl_vbase, (unsigned int)root_vpbl.pbl_pbase);
- if (!root_vpbl.pbl_vbase) {
- ib_umem_release(region);
- pci_free_consistent(nesdev->pcidev, 4096, vpbl.pbl_vbase,
- vpbl.pbl_pbase);
- nes_free_resource(nesadapter, nesadapter->allocated_mrs,
- stag_index);
- kfree(nesmr);
- ibmr = ERR_PTR(-ENOMEM);
- goto reg_user_mr_err;
- }
- root_vpbl.leaf_vpbl = kcalloc(1024,
- sizeof(*root_vpbl.leaf_vpbl),
- GFP_KERNEL);
- if (!root_vpbl.leaf_vpbl) {
- ib_umem_release(region);
- pci_free_consistent(nesdev->pcidev, 8192, root_vpbl.pbl_vbase,
- root_vpbl.pbl_pbase);
- pci_free_consistent(nesdev->pcidev, 4096, vpbl.pbl_vbase,
- vpbl.pbl_pbase);
- nes_free_resource(nesadapter, nesadapter->allocated_mrs,
- stag_index);
- kfree(nesmr);
- ibmr = ERR_PTR(-ENOMEM);
- goto reg_user_mr_err;
- }
- root_vpbl.pbl_vbase[0].pa_low =
- cpu_to_le32((u32)vpbl.pbl_pbase);
- root_vpbl.pbl_vbase[0].pa_high =
- cpu_to_le32((u32)((((u64)vpbl.pbl_pbase) >> 32)));
- root_vpbl.leaf_vpbl[0] = vpbl;
- }
- vpbl.pbl_vbase = pci_alloc_consistent(nesdev->pcidev, 4096,
- &vpbl.pbl_pbase);
- nes_debug(NES_DBG_MR, "Allocating leaf PBL, va = %p, pa = 0x%08X\n",
- vpbl.pbl_vbase, (unsigned int)vpbl.pbl_pbase);
- if (!vpbl.pbl_vbase) {
+ root_vpbl.leaf_vpbl = kcalloc(1024,
+ sizeof(*root_vpbl.leaf_vpbl),
+ GFP_KERNEL);
+ if (!root_vpbl.leaf_vpbl) {
ib_umem_release(region);
- nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index);
- ibmr = ERR_PTR(-ENOMEM);
+ pci_free_consistent(nesdev->pcidev, 8192, root_vpbl.pbl_vbase,
+ root_vpbl.pbl_pbase);
+ pci_free_consistent(nesdev->pcidev, 4096, vpbl.pbl_vbase,
+ vpbl.pbl_pbase);
+ nes_free_resource(nesadapter, nesadapter->allocated_mrs,
+ stag_index);
kfree(nesmr);
+ ibmr = ERR_PTR(-ENOMEM);
goto reg_user_mr_err;
}
- if (1 <= root_pbl_index) {
- root_vpbl.pbl_vbase[root_pbl_index].pa_low =
- cpu_to_le32((u32)vpbl.pbl_pbase);
- root_vpbl.pbl_vbase[root_pbl_index].pa_high =
- cpu_to_le32((u32)((((u64)vpbl.pbl_pbase)>>32)));
- root_vpbl.leaf_vpbl[root_pbl_index] = vpbl;
- }
- root_pbl_index++;
- cur_pbl_index = 0;
+ root_vpbl.pbl_vbase[0].pa_low =
+ cpu_to_le32((u32)vpbl.pbl_pbase);
+ root_vpbl.pbl_vbase[0].pa_high =
+ cpu_to_le32((u32)((((u64)vpbl.pbl_pbase) >> 32)));
+ root_vpbl.leaf_vpbl[0] = vpbl;
}
- if (single_page) {
- if (page_count != 0) {
- if ((last_dma_addr+4096) !=
- (sg_dma_address(sg)+
- (page_index*4096)))
- single_page = 0;
- last_dma_addr = sg_dma_address(sg)+
- (page_index*4096);
- } else {
- first_dma_addr = sg_dma_address(sg)+
- (page_index*4096);
- last_dma_addr = first_dma_addr;
- }
+ vpbl.pbl_vbase = pci_alloc_consistent(nesdev->pcidev, 4096,
+ &vpbl.pbl_pbase);
+ nes_debug(NES_DBG_MR, "Allocating leaf PBL, va = %p, pa = 0x%08X\n",
+ vpbl.pbl_vbase, (unsigned int)vpbl.pbl_pbase);
+ if (!vpbl.pbl_vbase) {
+ ib_umem_release(region);
+ nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index);
+ ibmr = ERR_PTR(-ENOMEM);
+ kfree(nesmr);
+ goto reg_user_mr_err;
+ }
+ if (1 <= root_pbl_index) {
+ root_vpbl.pbl_vbase[root_pbl_index].pa_low =
+ cpu_to_le32((u32)vpbl.pbl_pbase);
+ root_vpbl.pbl_vbase[root_pbl_index].pa_high =
+ cpu_to_le32((u32)((((u64)vpbl.pbl_pbase) >> 32)));
+ root_vpbl.leaf_vpbl[root_pbl_index] = vpbl;
+ }
+ root_pbl_index++;
+ cur_pbl_index = 0;
+ }
+ if (single_page) {
+ if (page_count != 0) {
+ if ((last_dma_addr + 4096) != sg_page_iter_dma_address(&dma_iter))
+ single_page = 0;
+ last_dma_addr = sg_page_iter_dma_address(&dma_iter);
+ } else {
+ first_dma_addr = sg_page_iter_dma_address(&dma_iter);
+ last_dma_addr = first_dma_addr;
}
-
- vpbl.pbl_vbase[cur_pbl_index].pa_low =
- cpu_to_le32((u32)(sg_dma_address(sg)+
- (page_index*4096)));
- vpbl.pbl_vbase[cur_pbl_index].pa_high =
- cpu_to_le32((u32)((((u64)(sg_dma_address(sg)+
- (page_index*4096))) >> 32)));
- cur_pbl_index++;
- page_count++;
}
+
+ vpbl.pbl_vbase[cur_pbl_index].pa_low =
+ cpu_to_le32((u32)(sg_page_iter_dma_address(&dma_iter)));
+ vpbl.pbl_vbase[cur_pbl_index].pa_high =
+ cpu_to_le32((u32)((u64)(sg_page_iter_dma_address(&dma_iter))));
+ cur_pbl_index++;
+ page_count++;
}
- enough_pages:
+enough_pages:
nes_debug(NES_DBG_MR, "calculating stag, stag_index=0x%08x, driver_key=0x%08x,"
" stag_key=0x%08x\n",
stag_index, driver_key, stag_key);
@@ -2345,7 +2287,7 @@ static struct ib_mr *nes_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
ibmr = ERR_PTR(-ENOMEM);
}
- reg_user_mr_err:
+reg_user_mr_err:
/* free the resources */
if (root_pbl_index == 1) {
pci_free_consistent(nesdev->pcidev, 4096, vpbl.pbl_vbase,
@@ -2383,7 +2325,6 @@ static struct ib_mr *nes_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
return ERR_PTR(-ENOMEM);
}
nesmr->region = region;
- nes_ucontext = to_nesucontext(pd->uobject->context);
pbl_depth = region->length >> 12;
pbl_depth += (region->length & (4096-1)) ? 1 : 0;
nespbl->pbl_size = pbl_depth*sizeof(u64);
@@ -2412,26 +2353,14 @@ static struct ib_mr *nes_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
nespbl->pbl_size, (unsigned long) nespbl->pbl_pbase,
(void *) nespbl->pbl_vbase, nespbl->user_base);
- for_each_sg(region->sg_head.sgl, sg, region->nmap, entry) {
- chunk_pages = sg_dma_len(sg) >> 12;
- chunk_pages += (sg_dma_len(sg) & (4096-1)) ? 1 : 0;
- if (first_page) {
- nespbl->page = sg_page(sg);
- first_page = 0;
- }
-
- for (page_index = 0; page_index < chunk_pages; page_index++) {
- ((__le32 *)pbl)[0] = cpu_to_le32((u32)
- (sg_dma_address(sg)+
- (page_index*4096)));
- ((__le32 *)pbl)[1] = cpu_to_le32(((u64)
- (sg_dma_address(sg)+
- (page_index*4096)))>>32);
- nes_debug(NES_DBG_MR, "pbl=%p, *pbl=0x%016llx, 0x%08x%08x\n", pbl,
- (unsigned long long)*pbl,
- le32_to_cpu(((__le32 *)pbl)[1]), le32_to_cpu(((__le32 *)pbl)[0]));
- pbl++;
- }
+ nespbl->page = sg_page(region->sg_head.sgl);
+ for_each_sg_dma_page(region->sg_head.sgl, &dma_iter, region->nmap, 0) {
+ ((__le32 *)pbl)[0] = cpu_to_le32((u32)(sg_page_iter_dma_address(&dma_iter)));
+ ((__le32 *)pbl)[1] = cpu_to_le32(((u64)(sg_page_iter_dma_address(&dma_iter)))>>32);
+ nes_debug(NES_DBG_MR, "pbl=%p, *pbl=0x%016llx, 0x%08x%08x\n", pbl,
+ (unsigned long long)*pbl,
+ le32_to_cpu(((__le32 *)pbl)[1]), le32_to_cpu(((__le32 *)pbl)[0]));
+ pbl++;
}
if (req.reg_type == IWNES_MEMREG_TYPE_QP) {
@@ -2560,7 +2489,7 @@ static ssize_t hw_rev_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct nes_ib_device *nesibdev =
- container_of(dev, struct nes_ib_device, ibdev.dev);
+ rdma_device_to_drv_device(dev, struct nes_ib_device, ibdev);
struct nes_vnic *nesvnic = nesibdev->nesvnic;
nes_debug(NES_DBG_INIT, "\n");
@@ -3627,6 +3556,41 @@ static void get_dev_fw_str(struct ib_device *dev, char *str)
(nesvnic->nesdev->nesadapter->firmware_version & 0x000000ff));
}
+static const struct ib_device_ops nes_dev_ops = {
+ .alloc_mr = nes_alloc_mr,
+ .alloc_mw = nes_alloc_mw,
+ .alloc_pd = nes_alloc_pd,
+ .alloc_ucontext = nes_alloc_ucontext,
+ .create_cq = nes_create_cq,
+ .create_qp = nes_create_qp,
+ .dealloc_mw = nes_dealloc_mw,
+ .dealloc_pd = nes_dealloc_pd,
+ .dealloc_ucontext = nes_dealloc_ucontext,
+ .dereg_mr = nes_dereg_mr,
+ .destroy_cq = nes_destroy_cq,
+ .destroy_qp = nes_destroy_qp,
+ .drain_rq = nes_drain_rq,
+ .drain_sq = nes_drain_sq,
+ .get_dev_fw_str = get_dev_fw_str,
+ .get_dma_mr = nes_get_dma_mr,
+ .get_port_immutable = nes_port_immutable,
+ .map_mr_sg = nes_map_mr_sg,
+ .mmap = nes_mmap,
+ .modify_qp = nes_modify_qp,
+ .poll_cq = nes_poll_cq,
+ .post_recv = nes_post_recv,
+ .post_send = nes_post_send,
+ .query_device = nes_query_device,
+ .query_gid = nes_query_gid,
+ .query_pkey = nes_query_pkey,
+ .query_port = nes_query_port,
+ .query_qp = nes_query_qp,
+ .reg_user_mr = nes_reg_user_mr,
+ .req_notify_cq = nes_req_notify_cq,
+ INIT_RDMA_OBJ_SIZE(ib_pd, nes_pd, ibpd),
+ INIT_RDMA_OBJ_SIZE(ib_ucontext, nes_ucontext, ibucontext),
+};
+
/**
* nes_init_ofa_device
*/
@@ -3636,7 +3600,7 @@ struct nes_ib_device *nes_init_ofa_device(struct net_device *netdev)
struct nes_vnic *nesvnic = netdev_priv(netdev);
struct nes_device *nesdev = nesvnic->nesdev;
- nesibdev = (struct nes_ib_device *)ib_alloc_device(sizeof(struct nes_ib_device));
+ nesibdev = ib_alloc_device(nes_ib_device, ibdev);
if (nesibdev == NULL) {
return NULL;
}
@@ -3673,36 +3637,6 @@ struct nes_ib_device *nes_init_ofa_device(struct net_device *netdev)
nesibdev->ibdev.phys_port_cnt = 1;
nesibdev->ibdev.num_comp_vectors = 1;
nesibdev->ibdev.dev.parent = &nesdev->pcidev->dev;
- nesibdev->ibdev.query_device = nes_query_device;
- nesibdev->ibdev.query_port = nes_query_port;
- nesibdev->ibdev.query_pkey = nes_query_pkey;
- nesibdev->ibdev.query_gid = nes_query_gid;
- nesibdev->ibdev.alloc_ucontext = nes_alloc_ucontext;
- nesibdev->ibdev.dealloc_ucontext = nes_dealloc_ucontext;
- nesibdev->ibdev.mmap = nes_mmap;
- nesibdev->ibdev.alloc_pd = nes_alloc_pd;
- nesibdev->ibdev.dealloc_pd = nes_dealloc_pd;
- nesibdev->ibdev.create_qp = nes_create_qp;
- nesibdev->ibdev.modify_qp = nes_modify_qp;
- nesibdev->ibdev.query_qp = nes_query_qp;
- nesibdev->ibdev.destroy_qp = nes_destroy_qp;
- nesibdev->ibdev.create_cq = nes_create_cq;
- nesibdev->ibdev.destroy_cq = nes_destroy_cq;
- nesibdev->ibdev.poll_cq = nes_poll_cq;
- nesibdev->ibdev.get_dma_mr = nes_get_dma_mr;
- nesibdev->ibdev.reg_user_mr = nes_reg_user_mr;
- nesibdev->ibdev.dereg_mr = nes_dereg_mr;
- nesibdev->ibdev.alloc_mw = nes_alloc_mw;
- nesibdev->ibdev.dealloc_mw = nes_dealloc_mw;
-
- nesibdev->ibdev.alloc_mr = nes_alloc_mr;
- nesibdev->ibdev.map_mr_sg = nes_map_mr_sg;
-
- nesibdev->ibdev.req_notify_cq = nes_req_notify_cq;
- nesibdev->ibdev.post_send = nes_post_send;
- nesibdev->ibdev.post_recv = nes_post_recv;
- nesibdev->ibdev.drain_sq = nes_drain_sq;
- nesibdev->ibdev.drain_rq = nes_drain_rq;
nesibdev->ibdev.iwcm = kzalloc(sizeof(*nesibdev->ibdev.iwcm), GFP_KERNEL);
if (nesibdev->ibdev.iwcm == NULL) {
@@ -3717,8 +3651,8 @@ struct nes_ib_device *nes_init_ofa_device(struct net_device *netdev)
nesibdev->ibdev.iwcm->reject = nes_reject;
nesibdev->ibdev.iwcm->create_listen = nes_create_listen;
nesibdev->ibdev.iwcm->destroy_listen = nes_destroy_listen;
- nesibdev->ibdev.get_port_immutable = nes_port_immutable;
- nesibdev->ibdev.get_dev_fw_str = get_dev_fw_str;
+
+ ib_set_device_ops(&nesibdev->ibdev, &nes_dev_ops);
memcpy(nesibdev->ibdev.iwcm->ifname, netdev->name,
sizeof(nesibdev->ibdev.iwcm->ifname));
@@ -3798,7 +3732,7 @@ int nes_register_ofa_device(struct nes_ib_device *nesibdev)
rdma_set_device_sysfs_group(&nesvnic->nesibdev->ibdev, &nes_attr_group);
nesvnic->nesibdev->ibdev.driver_id = RDMA_DRIVER_NES;
- ret = ib_register_device(&nesvnic->nesibdev->ibdev, "nes%d", NULL);
+ ret = ib_register_device(&nesvnic->nesibdev->ibdev, "nes%d");
if (ret) {
return ret;
}
diff --git a/drivers/infiniband/hw/nes/nes_verbs.h b/drivers/infiniband/hw/nes/nes_verbs.h
index e02a5662dc20..114a9b59fefd 100644
--- a/drivers/infiniband/hw/nes/nes_verbs.h
+++ b/drivers/infiniband/hw/nes/nes_verbs.h
@@ -59,7 +59,6 @@ struct nes_ucontext {
struct list_head cq_reg_mem_list;
struct list_head qp_reg_mem_list;
u32 mcrqf;
- atomic_t usecnt;
};
struct nes_pd {
diff --git a/drivers/infiniband/hw/ocrdma/Makefile b/drivers/infiniband/hw/ocrdma/Makefile
index d1bfd4f4cdde..e3f20ca15462 100644
--- a/drivers/infiniband/hw/ocrdma/Makefile
+++ b/drivers/infiniband/hw/ocrdma/Makefile
@@ -1,4 +1,4 @@
-ccflags-y := -Idrivers/net/ethernet/emulex/benet
+ccflags-y := -I $(srctree)/drivers/net/ethernet/emulex/benet
obj-$(CONFIG_INFINIBAND_OCRDMA) += ocrdma.o
diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_ah.c b/drivers/infiniband/hw/ocrdma/ocrdma_ah.c
index 58188fe5aed2..a7295322efbc 100644
--- a/drivers/infiniband/hw/ocrdma/ocrdma_ah.c
+++ b/drivers/infiniband/hw/ocrdma/ocrdma_ah.c
@@ -157,7 +157,7 @@ static inline int set_av_attr(struct ocrdma_dev *dev, struct ocrdma_ah *ah,
}
struct ib_ah *ocrdma_create_ah(struct ib_pd *ibpd, struct rdma_ah_attr *attr,
- struct ib_udata *udata)
+ u32 flags, struct ib_udata *udata)
{
u32 *ahid_addr;
int status;
@@ -219,7 +219,7 @@ av_err:
return ERR_PTR(status);
}
-int ocrdma_destroy_ah(struct ib_ah *ibah)
+int ocrdma_destroy_ah(struct ib_ah *ibah, u32 flags)
{
struct ocrdma_ah *ah = get_ocrdma_ah(ibah);
struct ocrdma_dev *dev = get_ocrdma_dev(ibah->device);
diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_ah.h b/drivers/infiniband/hw/ocrdma/ocrdma_ah.h
index c0c32c9b80ae..eb996e14b520 100644
--- a/drivers/infiniband/hw/ocrdma/ocrdma_ah.h
+++ b/drivers/infiniband/hw/ocrdma/ocrdma_ah.h
@@ -52,8 +52,8 @@ enum {
};
struct ib_ah *ocrdma_create_ah(struct ib_pd *pd, struct rdma_ah_attr *ah_attr,
- struct ib_udata *udata);
-int ocrdma_destroy_ah(struct ib_ah *ah);
+ u32 flags, struct ib_udata *udata);
+int ocrdma_destroy_ah(struct ib_ah *ah, u32 flags);
int ocrdma_query_ah(struct ib_ah *ah, struct rdma_ah_attr *ah_attr);
int ocrdma_process_mad(struct ib_device *,
diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_hw.c b/drivers/infiniband/hw/ocrdma/ocrdma_hw.c
index 241a57a07485..097e5ab2a19f 100644
--- a/drivers/infiniband/hw/ocrdma/ocrdma_hw.c
+++ b/drivers/infiniband/hw/ocrdma/ocrdma_hw.c
@@ -380,8 +380,8 @@ static int ocrdma_alloc_q(struct ocrdma_dev *dev,
q->len = len;
q->entry_size = entry_size;
q->size = len * entry_size;
- q->va = dma_zalloc_coherent(&dev->nic_info.pdev->dev, q->size,
- &q->dma, GFP_KERNEL);
+ q->va = dma_alloc_coherent(&dev->nic_info.pdev->dev, q->size, &q->dma,
+ GFP_KERNEL);
if (!q->va)
return -ENOMEM;
return 0;
@@ -1819,7 +1819,7 @@ int ocrdma_mbx_create_cq(struct ocrdma_dev *dev, struct ocrdma_cq *cq,
return -ENOMEM;
ocrdma_init_mch(&cmd->cmd.req, OCRDMA_CMD_CREATE_CQ,
OCRDMA_SUBSYS_COMMON, sizeof(*cmd));
- cq->va = dma_zalloc_coherent(&pdev->dev, cq->len, &cq->pa, GFP_KERNEL);
+ cq->va = dma_alloc_coherent(&pdev->dev, cq->len, &cq->pa, GFP_KERNEL);
if (!cq->va) {
status = -ENOMEM;
goto mem_err;
@@ -2209,7 +2209,7 @@ static int ocrdma_set_create_qp_sq_cmd(struct ocrdma_create_qp_req *cmd,
qp->sq.max_cnt = max_wqe_allocated;
len = (hw_pages * hw_page_size);
- qp->sq.va = dma_zalloc_coherent(&pdev->dev, len, &pa, GFP_KERNEL);
+ qp->sq.va = dma_alloc_coherent(&pdev->dev, len, &pa, GFP_KERNEL);
if (!qp->sq.va)
return -EINVAL;
qp->sq.len = len;
@@ -2259,7 +2259,7 @@ static int ocrdma_set_create_qp_rq_cmd(struct ocrdma_create_qp_req *cmd,
qp->rq.max_cnt = max_rqe_allocated;
len = (hw_pages * hw_page_size);
- qp->rq.va = dma_zalloc_coherent(&pdev->dev, len, &pa, GFP_KERNEL);
+ qp->rq.va = dma_alloc_coherent(&pdev->dev, len, &pa, GFP_KERNEL);
if (!qp->rq.va)
return -ENOMEM;
qp->rq.pa = pa;
@@ -2315,8 +2315,8 @@ static int ocrdma_set_create_qp_ird_cmd(struct ocrdma_create_qp_req *cmd,
if (dev->attr.ird == 0)
return 0;
- qp->ird_q_va = dma_zalloc_coherent(&pdev->dev, ird_q_len, &pa,
- GFP_KERNEL);
+ qp->ird_q_va = dma_alloc_coherent(&pdev->dev, ird_q_len, &pa,
+ GFP_KERNEL);
if (!qp->ird_q_va)
return -ENOMEM;
ocrdma_build_q_pages(&cmd->ird_addr[0], dev->attr.num_ird_pages,
diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_main.c b/drivers/infiniband/hw/ocrdma/ocrdma_main.c
index 873cc7f6fe61..b9e10d55a58e 100644
--- a/drivers/infiniband/hw/ocrdma/ocrdma_main.c
+++ b/drivers/infiniband/hw/ocrdma/ocrdma_main.c
@@ -118,7 +118,8 @@ static void get_dev_fw_str(struct ib_device *device, char *str)
static ssize_t hw_rev_show(struct device *device,
struct device_attribute *attr, char *buf)
{
- struct ocrdma_dev *dev = dev_get_drvdata(device);
+ struct ocrdma_dev *dev =
+ rdma_device_to_drv_device(device, struct ocrdma_dev, ibdev);
return scnprintf(buf, PAGE_SIZE, "0x%x\n", dev->nic_info.pdev->vendor);
}
@@ -127,7 +128,8 @@ static DEVICE_ATTR_RO(hw_rev);
static ssize_t hca_type_show(struct device *device,
struct device_attribute *attr, char *buf)
{
- struct ocrdma_dev *dev = dev_get_drvdata(device);
+ struct ocrdma_dev *dev =
+ rdma_device_to_drv_device(device, struct ocrdma_dev, ibdev);
return scnprintf(buf, PAGE_SIZE, "%s\n", &dev->model_number[0]);
}
@@ -143,6 +145,52 @@ static const struct attribute_group ocrdma_attr_group = {
.attrs = ocrdma_attributes,
};
+static const struct ib_device_ops ocrdma_dev_ops = {
+ .alloc_mr = ocrdma_alloc_mr,
+ .alloc_pd = ocrdma_alloc_pd,
+ .alloc_ucontext = ocrdma_alloc_ucontext,
+ .create_ah = ocrdma_create_ah,
+ .create_cq = ocrdma_create_cq,
+ .create_qp = ocrdma_create_qp,
+ .dealloc_pd = ocrdma_dealloc_pd,
+ .dealloc_ucontext = ocrdma_dealloc_ucontext,
+ .dereg_mr = ocrdma_dereg_mr,
+ .destroy_ah = ocrdma_destroy_ah,
+ .destroy_cq = ocrdma_destroy_cq,
+ .destroy_qp = ocrdma_destroy_qp,
+ .get_dev_fw_str = get_dev_fw_str,
+ .get_dma_mr = ocrdma_get_dma_mr,
+ .get_link_layer = ocrdma_link_layer,
+ .get_netdev = ocrdma_get_netdev,
+ .get_port_immutable = ocrdma_port_immutable,
+ .map_mr_sg = ocrdma_map_mr_sg,
+ .mmap = ocrdma_mmap,
+ .modify_port = ocrdma_modify_port,
+ .modify_qp = ocrdma_modify_qp,
+ .poll_cq = ocrdma_poll_cq,
+ .post_recv = ocrdma_post_recv,
+ .post_send = ocrdma_post_send,
+ .process_mad = ocrdma_process_mad,
+ .query_ah = ocrdma_query_ah,
+ .query_device = ocrdma_query_device,
+ .query_pkey = ocrdma_query_pkey,
+ .query_port = ocrdma_query_port,
+ .query_qp = ocrdma_query_qp,
+ .reg_user_mr = ocrdma_reg_user_mr,
+ .req_notify_cq = ocrdma_arm_cq,
+ .resize_cq = ocrdma_resize_cq,
+ INIT_RDMA_OBJ_SIZE(ib_pd, ocrdma_pd, ibpd),
+ INIT_RDMA_OBJ_SIZE(ib_ucontext, ocrdma_ucontext, ibucontext),
+};
+
+static const struct ib_device_ops ocrdma_dev_srq_ops = {
+ .create_srq = ocrdma_create_srq,
+ .destroy_srq = ocrdma_destroy_srq,
+ .modify_srq = ocrdma_modify_srq,
+ .post_srq_recv = ocrdma_post_srq_recv,
+ .query_srq = ocrdma_query_srq,
+};
+
static int ocrdma_register_device(struct ocrdma_dev *dev)
{
ocrdma_get_guid(dev, (u8 *)&dev->ibdev.node_guid);
@@ -182,50 +230,10 @@ static int ocrdma_register_device(struct ocrdma_dev *dev)
dev->ibdev.phys_port_cnt = 1;
dev->ibdev.num_comp_vectors = dev->eq_cnt;
- /* mandatory verbs. */
- dev->ibdev.query_device = ocrdma_query_device;
- dev->ibdev.query_port = ocrdma_query_port;
- dev->ibdev.modify_port = ocrdma_modify_port;
- dev->ibdev.get_netdev = ocrdma_get_netdev;
- dev->ibdev.get_link_layer = ocrdma_link_layer;
- dev->ibdev.alloc_pd = ocrdma_alloc_pd;
- dev->ibdev.dealloc_pd = ocrdma_dealloc_pd;
-
- dev->ibdev.create_cq = ocrdma_create_cq;
- dev->ibdev.destroy_cq = ocrdma_destroy_cq;
- dev->ibdev.resize_cq = ocrdma_resize_cq;
-
- dev->ibdev.create_qp = ocrdma_create_qp;
- dev->ibdev.modify_qp = ocrdma_modify_qp;
- dev->ibdev.query_qp = ocrdma_query_qp;
- dev->ibdev.destroy_qp = ocrdma_destroy_qp;
-
- dev->ibdev.query_pkey = ocrdma_query_pkey;
- dev->ibdev.create_ah = ocrdma_create_ah;
- dev->ibdev.destroy_ah = ocrdma_destroy_ah;
- dev->ibdev.query_ah = ocrdma_query_ah;
-
- dev->ibdev.poll_cq = ocrdma_poll_cq;
- dev->ibdev.post_send = ocrdma_post_send;
- dev->ibdev.post_recv = ocrdma_post_recv;
- dev->ibdev.req_notify_cq = ocrdma_arm_cq;
-
- dev->ibdev.get_dma_mr = ocrdma_get_dma_mr;
- dev->ibdev.dereg_mr = ocrdma_dereg_mr;
- dev->ibdev.reg_user_mr = ocrdma_reg_user_mr;
-
- dev->ibdev.alloc_mr = ocrdma_alloc_mr;
- dev->ibdev.map_mr_sg = ocrdma_map_mr_sg;
-
/* mandatory to support user space verbs consumer. */
- dev->ibdev.alloc_ucontext = ocrdma_alloc_ucontext;
- dev->ibdev.dealloc_ucontext = ocrdma_dealloc_ucontext;
- dev->ibdev.mmap = ocrdma_mmap;
dev->ibdev.dev.parent = &dev->nic_info.pdev->dev;
- dev->ibdev.process_mad = ocrdma_process_mad;
- dev->ibdev.get_port_immutable = ocrdma_port_immutable;
- dev->ibdev.get_dev_fw_str = get_dev_fw_str;
+ ib_set_device_ops(&dev->ibdev, &ocrdma_dev_ops);
if (ocrdma_get_asic_type(dev) == OCRDMA_ASIC_GEN_SKH_R) {
dev->ibdev.uverbs_cmd_mask |=
@@ -235,15 +243,11 @@ static int ocrdma_register_device(struct ocrdma_dev *dev)
OCRDMA_UVERBS(DESTROY_SRQ) |
OCRDMA_UVERBS(POST_SRQ_RECV);
- dev->ibdev.create_srq = ocrdma_create_srq;
- dev->ibdev.modify_srq = ocrdma_modify_srq;
- dev->ibdev.query_srq = ocrdma_query_srq;
- dev->ibdev.destroy_srq = ocrdma_destroy_srq;
- dev->ibdev.post_srq_recv = ocrdma_post_srq_recv;
+ ib_set_device_ops(&dev->ibdev, &ocrdma_dev_srq_ops);
}
rdma_set_device_sysfs_group(&dev->ibdev, &ocrdma_attr_group);
dev->ibdev.driver_id = RDMA_DRIVER_OCRDMA;
- return ib_register_device(&dev->ibdev, "ocrdma%d", NULL);
+ return ib_register_device(&dev->ibdev, "ocrdma%d");
}
static int ocrdma_alloc_resources(struct ocrdma_dev *dev)
@@ -295,7 +299,7 @@ static struct ocrdma_dev *ocrdma_add(struct be_dev_info *dev_info)
u8 lstate = 0;
struct ocrdma_dev *dev;
- dev = (struct ocrdma_dev *)ib_alloc_device(sizeof(struct ocrdma_dev));
+ dev = ib_alloc_device(ocrdma_dev, ibdev);
if (!dev) {
pr_err("Unable to allocate ib device\n");
return NULL;
diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_stats.c b/drivers/infiniband/hw/ocrdma/ocrdma_stats.c
index 290d776edf48..a902942adb5d 100644
--- a/drivers/infiniband/hw/ocrdma/ocrdma_stats.c
+++ b/drivers/infiniband/hw/ocrdma/ocrdma_stats.c
@@ -73,8 +73,8 @@ bool ocrdma_alloc_stats_resources(struct ocrdma_dev *dev)
mem->size = max_t(u32, sizeof(struct ocrdma_rdma_stats_req),
sizeof(struct ocrdma_rdma_stats_resp));
- mem->va = dma_zalloc_coherent(&dev->nic_info.pdev->dev, mem->size,
- &mem->pa, GFP_KERNEL);
+ mem->va = dma_alloc_coherent(&dev->nic_info.pdev->dev, mem->size,
+ &mem->pa, GFP_KERNEL);
if (!mem->va) {
pr_err("%s: stats mbox allocation failed\n", __func__);
return false;
@@ -760,94 +760,72 @@ static const struct file_operations ocrdma_dbg_ops = {
void ocrdma_add_port_stats(struct ocrdma_dev *dev)
{
+ const struct pci_dev *pdev = dev->nic_info.pdev;
+
if (!ocrdma_dbgfs_dir)
return;
/* Create post stats base dir */
- dev->dir =
- debugfs_create_dir(dev_name(&dev->ibdev.dev), ocrdma_dbgfs_dir);
- if (!dev->dir)
- goto err;
+ dev->dir = debugfs_create_dir(pci_name(pdev), ocrdma_dbgfs_dir);
dev->rsrc_stats.type = OCRDMA_RSRC_STATS;
dev->rsrc_stats.dev = dev;
- if (!debugfs_create_file("resource_stats", S_IRUSR, dev->dir,
- &dev->rsrc_stats, &ocrdma_dbg_ops))
- goto err;
+ debugfs_create_file("resource_stats", S_IRUSR, dev->dir,
+ &dev->rsrc_stats, &ocrdma_dbg_ops);
dev->rx_stats.type = OCRDMA_RXSTATS;
dev->rx_stats.dev = dev;
- if (!debugfs_create_file("rx_stats", S_IRUSR, dev->dir,
- &dev->rx_stats, &ocrdma_dbg_ops))
- goto err;
+ debugfs_create_file("rx_stats", S_IRUSR, dev->dir, &dev->rx_stats,
+ &ocrdma_dbg_ops);
dev->wqe_stats.type = OCRDMA_WQESTATS;
dev->wqe_stats.dev = dev;
- if (!debugfs_create_file("wqe_stats", S_IRUSR, dev->dir,
- &dev->wqe_stats, &ocrdma_dbg_ops))
- goto err;
+ debugfs_create_file("wqe_stats", S_IRUSR, dev->dir, &dev->wqe_stats,
+ &ocrdma_dbg_ops);
dev->tx_stats.type = OCRDMA_TXSTATS;
dev->tx_stats.dev = dev;
- if (!debugfs_create_file("tx_stats", S_IRUSR, dev->dir,
- &dev->tx_stats, &ocrdma_dbg_ops))
- goto err;
+ debugfs_create_file("tx_stats", S_IRUSR, dev->dir, &dev->tx_stats,
+ &ocrdma_dbg_ops);
dev->db_err_stats.type = OCRDMA_DB_ERRSTATS;
dev->db_err_stats.dev = dev;
- if (!debugfs_create_file("db_err_stats", S_IRUSR, dev->dir,
- &dev->db_err_stats, &ocrdma_dbg_ops))
- goto err;
-
+ debugfs_create_file("db_err_stats", S_IRUSR, dev->dir,
+ &dev->db_err_stats, &ocrdma_dbg_ops);
dev->tx_qp_err_stats.type = OCRDMA_TXQP_ERRSTATS;
dev->tx_qp_err_stats.dev = dev;
- if (!debugfs_create_file("tx_qp_err_stats", S_IRUSR, dev->dir,
- &dev->tx_qp_err_stats, &ocrdma_dbg_ops))
- goto err;
+ debugfs_create_file("tx_qp_err_stats", S_IRUSR, dev->dir,
+ &dev->tx_qp_err_stats, &ocrdma_dbg_ops);
dev->rx_qp_err_stats.type = OCRDMA_RXQP_ERRSTATS;
dev->rx_qp_err_stats.dev = dev;
- if (!debugfs_create_file("rx_qp_err_stats", S_IRUSR, dev->dir,
- &dev->rx_qp_err_stats, &ocrdma_dbg_ops))
- goto err;
-
+ debugfs_create_file("rx_qp_err_stats", S_IRUSR, dev->dir,
+ &dev->rx_qp_err_stats, &ocrdma_dbg_ops);
dev->tx_dbg_stats.type = OCRDMA_TX_DBG_STATS;
dev->tx_dbg_stats.dev = dev;
- if (!debugfs_create_file("tx_dbg_stats", S_IRUSR, dev->dir,
- &dev->tx_dbg_stats, &ocrdma_dbg_ops))
- goto err;
+ debugfs_create_file("tx_dbg_stats", S_IRUSR, dev->dir,
+ &dev->tx_dbg_stats, &ocrdma_dbg_ops);
dev->rx_dbg_stats.type = OCRDMA_RX_DBG_STATS;
dev->rx_dbg_stats.dev = dev;
- if (!debugfs_create_file("rx_dbg_stats", S_IRUSR, dev->dir,
- &dev->rx_dbg_stats, &ocrdma_dbg_ops))
- goto err;
+ debugfs_create_file("rx_dbg_stats", S_IRUSR, dev->dir,
+ &dev->rx_dbg_stats, &ocrdma_dbg_ops);
dev->driver_stats.type = OCRDMA_DRV_STATS;
dev->driver_stats.dev = dev;
- if (!debugfs_create_file("driver_dbg_stats", S_IRUSR, dev->dir,
- &dev->driver_stats, &ocrdma_dbg_ops))
- goto err;
+ debugfs_create_file("driver_dbg_stats", S_IRUSR, dev->dir,
+ &dev->driver_stats, &ocrdma_dbg_ops);
dev->reset_stats.type = OCRDMA_RESET_STATS;
dev->reset_stats.dev = dev;
- if (!debugfs_create_file("reset_stats", 0200, dev->dir,
- &dev->reset_stats, &ocrdma_dbg_ops))
- goto err;
-
-
- return;
-err:
- debugfs_remove_recursive(dev->dir);
- dev->dir = NULL;
+ debugfs_create_file("reset_stats", 0200, dev->dir, &dev->reset_stats,
+ &ocrdma_dbg_ops);
}
void ocrdma_rem_port_stats(struct ocrdma_dev *dev)
{
- if (!dev->dir)
- return;
debugfs_remove_recursive(dev->dir);
}
diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c
index 06d2a7f3304c..b4e1777c2c97 100644
--- a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c
+++ b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c
@@ -55,7 +55,7 @@
int ocrdma_query_pkey(struct ib_device *ibdev, u8 port, u16 index, u16 *pkey)
{
- if (index > 1)
+ if (index > 0)
return -EINVAL;
*pkey = 0xffff;
@@ -177,11 +177,6 @@ int ocrdma_query_port(struct ib_device *ibdev,
/* props being zeroed by the caller, avoid zeroing it here */
dev = get_ocrdma_dev(ibdev);
- if (port > 1) {
- pr_err("%s(%d) invalid_port=0x%x\n", __func__,
- dev->id, port);
- return -EINVAL;
- }
netdev = dev->nic_info.netdev;
if (netif_running(netdev) && netif_oper_up(netdev)) {
port_state = IB_PORT_ACTIVE;
@@ -215,13 +210,6 @@ int ocrdma_query_port(struct ib_device *ibdev,
int ocrdma_modify_port(struct ib_device *ibdev, u8 port, int mask,
struct ib_port_modify *props)
{
- struct ocrdma_dev *dev;
-
- dev = get_ocrdma_dev(ibdev);
- if (port > 1) {
- pr_err("%s(%d) invalid_port=0x%x\n", __func__, dev->id, port);
- return -EINVAL;
- }
return 0;
}
@@ -379,17 +367,12 @@ static int ocrdma_get_pd_num(struct ocrdma_dev *dev, struct ocrdma_pd *pd)
return status;
}
-static struct ocrdma_pd *_ocrdma_alloc_pd(struct ocrdma_dev *dev,
- struct ocrdma_ucontext *uctx,
- struct ib_udata *udata)
+static int _ocrdma_alloc_pd(struct ocrdma_dev *dev, struct ocrdma_pd *pd,
+ struct ocrdma_ucontext *uctx,
+ struct ib_udata *udata)
{
- struct ocrdma_pd *pd = NULL;
int status;
- pd = kzalloc(sizeof(*pd), GFP_KERNEL);
- if (!pd)
- return ERR_PTR(-ENOMEM);
-
if (udata && uctx && dev->attr.max_dpp_pds) {
pd->dpp_enabled =
ocrdma_get_asic_type(dev) == OCRDMA_ASIC_GEN_SKH_R;
@@ -398,15 +381,8 @@ static struct ocrdma_pd *_ocrdma_alloc_pd(struct ocrdma_dev *dev,
dev->attr.wqe_size) : 0;
}
- if (dev->pd_mgr->pd_prealloc_valid) {
- status = ocrdma_get_pd_num(dev, pd);
- if (status == 0) {
- return pd;
- } else {
- kfree(pd);
- return ERR_PTR(status);
- }
- }
+ if (dev->pd_mgr->pd_prealloc_valid)
+ return ocrdma_get_pd_num(dev, pd);
retry:
status = ocrdma_mbx_alloc_pd(dev, pd);
@@ -415,13 +391,11 @@ retry:
pd->dpp_enabled = false;
pd->num_dpp_qp = 0;
goto retry;
- } else {
- kfree(pd);
- return ERR_PTR(status);
}
+ return status;
}
- return pd;
+ return 0;
}
static inline int is_ucontext_pd(struct ocrdma_ucontext *uctx,
@@ -430,30 +404,33 @@ static inline int is_ucontext_pd(struct ocrdma_ucontext *uctx,
return (uctx->cntxt_pd == pd);
}
-static int _ocrdma_dealloc_pd(struct ocrdma_dev *dev,
+static void _ocrdma_dealloc_pd(struct ocrdma_dev *dev,
struct ocrdma_pd *pd)
{
- int status;
-
if (dev->pd_mgr->pd_prealloc_valid)
- status = ocrdma_put_pd_num(dev, pd->id, pd->dpp_enabled);
+ ocrdma_put_pd_num(dev, pd->id, pd->dpp_enabled);
else
- status = ocrdma_mbx_dealloc_pd(dev, pd);
-
- kfree(pd);
- return status;
+ ocrdma_mbx_dealloc_pd(dev, pd);
}
static int ocrdma_alloc_ucontext_pd(struct ocrdma_dev *dev,
struct ocrdma_ucontext *uctx,
struct ib_udata *udata)
{
- int status = 0;
+ struct ib_device *ibdev = &dev->ibdev;
+ struct ib_pd *pd;
+ int status;
- uctx->cntxt_pd = _ocrdma_alloc_pd(dev, uctx, udata);
- if (IS_ERR(uctx->cntxt_pd)) {
- status = PTR_ERR(uctx->cntxt_pd);
- uctx->cntxt_pd = NULL;
+ pd = rdma_zalloc_drv_obj(ibdev, ib_pd);
+ if (!pd)
+ return -ENOMEM;
+
+ pd->device = ibdev;
+ uctx->cntxt_pd = get_ocrdma_pd(pd);
+
+ status = _ocrdma_alloc_pd(dev, uctx->cntxt_pd, uctx, udata);
+ if (status) {
+ kfree(uctx->cntxt_pd);
goto err;
}
@@ -463,7 +440,7 @@ err:
return status;
}
-static int ocrdma_dealloc_ucontext_pd(struct ocrdma_ucontext *uctx)
+static void ocrdma_dealloc_ucontext_pd(struct ocrdma_ucontext *uctx)
{
struct ocrdma_pd *pd = uctx->cntxt_pd;
struct ocrdma_dev *dev = get_ocrdma_dev(pd->ibpd.device);
@@ -472,9 +449,9 @@ static int ocrdma_dealloc_ucontext_pd(struct ocrdma_ucontext *uctx)
pr_err("%s(%d) Freeing in use pdid=0x%x.\n",
__func__, dev->id, pd->id);
}
+ kfree(uctx->cntxt_pd);
uctx->cntxt_pd = NULL;
- (void)_ocrdma_dealloc_pd(dev, pd);
- return 0;
+ _ocrdma_dealloc_pd(dev, pd);
}
static struct ocrdma_pd *ocrdma_get_ucontext_pd(struct ocrdma_ucontext *uctx)
@@ -498,33 +475,28 @@ static void ocrdma_release_ucontext_pd(struct ocrdma_ucontext *uctx)
mutex_unlock(&uctx->mm_list_lock);
}
-struct ib_ucontext *ocrdma_alloc_ucontext(struct ib_device *ibdev,
- struct ib_udata *udata)
+int ocrdma_alloc_ucontext(struct ib_ucontext *uctx, struct ib_udata *udata)
{
+ struct ib_device *ibdev = uctx->device;
int status;
- struct ocrdma_ucontext *ctx;
- struct ocrdma_alloc_ucontext_resp resp;
+ struct ocrdma_ucontext *ctx = get_ocrdma_ucontext(uctx);
+ struct ocrdma_alloc_ucontext_resp resp = {};
struct ocrdma_dev *dev = get_ocrdma_dev(ibdev);
struct pci_dev *pdev = dev->nic_info.pdev;
u32 map_len = roundup(sizeof(u32) * 2048, PAGE_SIZE);
if (!udata)
- return ERR_PTR(-EFAULT);
- ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
- if (!ctx)
- return ERR_PTR(-ENOMEM);
+ return -EFAULT;
INIT_LIST_HEAD(&ctx->mm_head);
mutex_init(&ctx->mm_list_lock);
- ctx->ah_tbl.va = dma_zalloc_coherent(&pdev->dev, map_len,
- &ctx->ah_tbl.pa, GFP_KERNEL);
- if (!ctx->ah_tbl.va) {
- kfree(ctx);
- return ERR_PTR(-ENOMEM);
- }
+ ctx->ah_tbl.va = dma_alloc_coherent(&pdev->dev, map_len,
+ &ctx->ah_tbl.pa, GFP_KERNEL);
+ if (!ctx->ah_tbl.va)
+ return -ENOMEM;
+
ctx->ah_tbl.len = map_len;
- memset(&resp, 0, sizeof(resp));
resp.ah_tbl_len = ctx->ah_tbl.len;
resp.ah_tbl_page = virt_to_phys(ctx->ah_tbl.va);
@@ -546,27 +518,26 @@ struct ib_ucontext *ocrdma_alloc_ucontext(struct ib_device *ibdev,
status = ib_copy_to_udata(udata, &resp, sizeof(resp));
if (status)
goto cpy_err;
- return &ctx->ibucontext;
+ return 0;
cpy_err:
+ ocrdma_dealloc_ucontext_pd(ctx);
pd_err:
ocrdma_del_mmap(ctx, ctx->ah_tbl.pa, ctx->ah_tbl.len);
map_err:
dma_free_coherent(&pdev->dev, ctx->ah_tbl.len, ctx->ah_tbl.va,
ctx->ah_tbl.pa);
- kfree(ctx);
- return ERR_PTR(status);
+ return status;
}
-int ocrdma_dealloc_ucontext(struct ib_ucontext *ibctx)
+void ocrdma_dealloc_ucontext(struct ib_ucontext *ibctx)
{
- int status;
struct ocrdma_mm *mm, *tmp;
struct ocrdma_ucontext *uctx = get_ocrdma_ucontext(ibctx);
struct ocrdma_dev *dev = get_ocrdma_dev(ibctx->device);
struct pci_dev *pdev = dev->nic_info.pdev;
- status = ocrdma_dealloc_ucontext_pd(uctx);
+ ocrdma_dealloc_ucontext_pd(uctx);
ocrdma_del_mmap(uctx, uctx->ah_tbl.pa, uctx->ah_tbl.len);
dma_free_coherent(&pdev->dev, uctx->ah_tbl.len, uctx->ah_tbl.va,
@@ -576,8 +547,6 @@ int ocrdma_dealloc_ucontext(struct ib_ucontext *ibctx)
list_del(&mm->entry);
kfree(mm);
}
- kfree(uctx);
- return status;
}
int ocrdma_mmap(struct ib_ucontext *context, struct vm_area_struct *vma)
@@ -670,10 +639,10 @@ dpp_map_err:
return status;
}
-struct ib_pd *ocrdma_alloc_pd(struct ib_device *ibdev,
- struct ib_ucontext *context,
- struct ib_udata *udata)
+int ocrdma_alloc_pd(struct ib_pd *ibpd, struct ib_ucontext *context,
+ struct ib_udata *udata)
{
+ struct ib_device *ibdev = ibpd->device;
struct ocrdma_dev *dev = get_ocrdma_dev(ibdev);
struct ocrdma_pd *pd;
struct ocrdma_ucontext *uctx = NULL;
@@ -689,11 +658,10 @@ struct ib_pd *ocrdma_alloc_pd(struct ib_device *ibdev,
}
}
- pd = _ocrdma_alloc_pd(dev, uctx, udata);
- if (IS_ERR(pd)) {
- status = PTR_ERR(pd);
+ pd = get_ocrdma_pd(ibpd);
+ status = _ocrdma_alloc_pd(dev, pd, uctx, udata);
+ if (status)
goto exit;
- }
pd_mapping:
if (udata && context) {
@@ -701,25 +669,22 @@ pd_mapping:
if (status)
goto err;
}
- return &pd->ibpd;
+ return 0;
err:
- if (is_uctx_pd) {
+ if (is_uctx_pd)
ocrdma_release_ucontext_pd(uctx);
- } else {
- if (_ocrdma_dealloc_pd(dev, pd))
- pr_err("%s: _ocrdma_dealloc_pd() failed\n", __func__);
- }
+ else
+ _ocrdma_dealloc_pd(dev, pd);
exit:
- return ERR_PTR(status);
+ return status;
}
-int ocrdma_dealloc_pd(struct ib_pd *ibpd)
+void ocrdma_dealloc_pd(struct ib_pd *ibpd)
{
struct ocrdma_pd *pd = get_ocrdma_pd(ibpd);
struct ocrdma_dev *dev = get_ocrdma_dev(ibpd->device);
struct ocrdma_ucontext *uctx = NULL;
- int status = 0;
u64 usr_db;
uctx = pd->uctx;
@@ -733,11 +698,10 @@ int ocrdma_dealloc_pd(struct ib_pd *ibpd)
if (is_ucontext_pd(uctx, pd)) {
ocrdma_release_ucontext_pd(uctx);
- return status;
+ return;
}
}
- status = _ocrdma_dealloc_pd(dev, pd);
- return status;
+ _ocrdma_dealloc_pd(dev, pd);
}
static int ocrdma_alloc_lkey(struct ocrdma_dev *dev, struct ocrdma_mr *mr,
@@ -850,7 +814,7 @@ static int ocrdma_build_pbl_tbl(struct ocrdma_dev *dev, struct ocrdma_hw_mr *mr)
return -ENOMEM;
for (i = 0; i < mr->num_pbls; i++) {
- va = dma_zalloc_coherent(&pdev->dev, dma_len, &pa, GFP_KERNEL);
+ va = dma_alloc_coherent(&pdev->dev, dma_len, &pa, GFP_KERNEL);
if (!va) {
ocrdma_free_mr_pbl_tbl(dev, mr);
status = -ENOMEM;
@@ -866,10 +830,11 @@ static void build_user_pbes(struct ocrdma_dev *dev, struct ocrdma_mr *mr,
u32 num_pbes)
{
struct ocrdma_pbe *pbe;
- struct scatterlist *sg;
+ struct sg_dma_page_iter sg_iter;
struct ocrdma_pbl *pbl_tbl = mr->hwmr.pbl_table;
struct ib_umem *umem = mr->umem;
- int shift, pg_cnt, pages, pbe_cnt, entry, total_num_pbes = 0;
+ int pbe_cnt, total_num_pbes = 0;
+ u64 pg_addr;
if (!mr->hwmr.num_pbes)
return;
@@ -877,36 +842,26 @@ static void build_user_pbes(struct ocrdma_dev *dev, struct ocrdma_mr *mr,
pbe = (struct ocrdma_pbe *)pbl_tbl->va;
pbe_cnt = 0;
- shift = umem->page_shift;
-
- for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) {
- pages = sg_dma_len(sg) >> shift;
- for (pg_cnt = 0; pg_cnt < pages; pg_cnt++) {
- /* store the page address in pbe */
- pbe->pa_lo =
- cpu_to_le32(sg_dma_address(sg) +
- (pg_cnt << shift));
- pbe->pa_hi =
- cpu_to_le32(upper_32_bits(sg_dma_address(sg) +
- (pg_cnt << shift)));
- pbe_cnt += 1;
- total_num_pbes += 1;
- pbe++;
-
- /* if done building pbes, issue the mbx cmd. */
- if (total_num_pbes == num_pbes)
- return;
-
- /* if the given pbl is full storing the pbes,
- * move to next pbl.
- */
- if (pbe_cnt ==
- (mr->hwmr.pbl_size / sizeof(u64))) {
- pbl_tbl++;
- pbe = (struct ocrdma_pbe *)pbl_tbl->va;
- pbe_cnt = 0;
- }
+ for_each_sg_dma_page (umem->sg_head.sgl, &sg_iter, umem->nmap, 0) {
+ /* store the page address in pbe */
+ pg_addr = sg_page_iter_dma_address(&sg_iter);
+ pbe->pa_lo = cpu_to_le32(pg_addr);
+ pbe->pa_hi = cpu_to_le32(upper_32_bits(pg_addr));
+ pbe_cnt += 1;
+ total_num_pbes += 1;
+ pbe++;
+ /* if done building pbes, issue the mbx cmd. */
+ if (total_num_pbes == num_pbes)
+ return;
+
+ /* if the given pbl is full storing the pbes,
+ * move to next pbl.
+ */
+ if (pbe_cnt == (mr->hwmr.pbl_size / sizeof(u64))) {
+ pbl_tbl++;
+ pbe = (struct ocrdma_pbe *)pbl_tbl->va;
+ pbe_cnt = 0;
}
}
}
@@ -928,7 +883,7 @@ struct ib_mr *ocrdma_reg_user_mr(struct ib_pd *ibpd, u64 start, u64 len,
mr = kzalloc(sizeof(*mr), GFP_KERNEL);
if (!mr)
return ERR_PTR(status);
- mr->umem = ib_umem_get(ibpd->uobject->context, start, len, acc, 0);
+ mr->umem = ib_umem_get(udata, start, len, acc, 0);
if (IS_ERR(mr->umem)) {
status = -EFAULT;
goto umem_err;
@@ -938,7 +893,7 @@ struct ib_mr *ocrdma_reg_user_mr(struct ib_pd *ibpd, u64 start, u64 len,
if (status)
goto umem_err;
- mr->hwmr.pbe_size = BIT(mr->umem->page_shift);
+ mr->hwmr.pbe_size = PAGE_SIZE;
mr->hwmr.fbo = ib_umem_offset(mr->umem);
mr->hwmr.va = usr_addr;
mr->hwmr.len = len;
@@ -1169,7 +1124,8 @@ static void ocrdma_del_qpn_map(struct ocrdma_dev *dev, struct ocrdma_qp *qp)
}
static int ocrdma_check_qp_params(struct ib_pd *ibpd, struct ocrdma_dev *dev,
- struct ib_qp_init_attr *attrs)
+ struct ib_qp_init_attr *attrs,
+ struct ib_udata *udata)
{
if ((attrs->qp_type != IB_QPT_GSI) &&
(attrs->qp_type != IB_QPT_RC) &&
@@ -1217,7 +1173,7 @@ static int ocrdma_check_qp_params(struct ib_pd *ibpd, struct ocrdma_dev *dev,
return -EINVAL;
}
/* unprivileged user space cannot create special QP */
- if (ibpd->uobject && attrs->qp_type == IB_QPT_GSI) {
+ if (udata && attrs->qp_type == IB_QPT_GSI) {
pr_err
("%s(%d) Userspace can't create special QPs of type=0x%x\n",
__func__, dev->id, attrs->qp_type);
@@ -1374,7 +1330,7 @@ struct ib_qp *ocrdma_create_qp(struct ib_pd *ibpd,
struct ocrdma_create_qp_ureq ureq;
u16 dpp_credit_lmt, dpp_offset;
- status = ocrdma_check_qp_params(ibpd, dev, attrs);
+ status = ocrdma_check_qp_params(ibpd, dev, attrs, udata);
if (status)
goto gen_err;
diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h
index b69cfdce7970..4c04ab40798e 100644
--- a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h
+++ b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h
@@ -64,15 +64,14 @@ void ocrdma_get_guid(struct ocrdma_dev *, u8 *guid);
struct net_device *ocrdma_get_netdev(struct ib_device *device, u8 port_num);
int ocrdma_query_pkey(struct ib_device *, u8 port, u16 index, u16 *pkey);
-struct ib_ucontext *ocrdma_alloc_ucontext(struct ib_device *,
- struct ib_udata *);
-int ocrdma_dealloc_ucontext(struct ib_ucontext *);
+int ocrdma_alloc_ucontext(struct ib_ucontext *uctx, struct ib_udata *udata);
+void ocrdma_dealloc_ucontext(struct ib_ucontext *uctx);
int ocrdma_mmap(struct ib_ucontext *, struct vm_area_struct *vma);
-struct ib_pd *ocrdma_alloc_pd(struct ib_device *,
- struct ib_ucontext *, struct ib_udata *);
-int ocrdma_dealloc_pd(struct ib_pd *pd);
+int ocrdma_alloc_pd(struct ib_pd *pd, struct ib_ucontext *uctx,
+ struct ib_udata *udata);
+void ocrdma_dealloc_pd(struct ib_pd *pd);
struct ib_cq *ocrdma_create_cq(struct ib_device *ibdev,
const struct ib_cq_init_attr *attr,
diff --git a/drivers/infiniband/hw/qedr/main.c b/drivers/infiniband/hw/qedr/main.c
index 8d6ff9df49fe..996d9ecd93e0 100644
--- a/drivers/infiniband/hw/qedr/main.c
+++ b/drivers/infiniband/hw/qedr/main.c
@@ -137,7 +137,8 @@ static int qedr_iw_port_immutable(struct ib_device *ibdev, u8 port_num,
static ssize_t hw_rev_show(struct device *device, struct device_attribute *attr,
char *buf)
{
- struct qedr_dev *dev = dev_get_drvdata(device);
+ struct qedr_dev *dev =
+ rdma_device_to_drv_device(device, struct qedr_dev, ibdev);
return scnprintf(buf, PAGE_SIZE, "0x%x\n", dev->pdev->vendor);
}
@@ -160,12 +161,16 @@ static const struct attribute_group qedr_attr_group = {
.attrs = qedr_attributes,
};
+static const struct ib_device_ops qedr_iw_dev_ops = {
+ .get_port_immutable = qedr_iw_port_immutable,
+ .query_gid = qedr_iw_query_gid,
+};
+
static int qedr_iw_register_device(struct qedr_dev *dev)
{
dev->ibdev.node_type = RDMA_NODE_RNIC;
- dev->ibdev.query_gid = qedr_iw_query_gid;
- dev->ibdev.get_port_immutable = qedr_iw_port_immutable;
+ ib_set_device_ops(&dev->ibdev, &qedr_iw_dev_ops);
dev->ibdev.iwcm = kzalloc(sizeof(*dev->ibdev.iwcm), GFP_KERNEL);
if (!dev->ibdev.iwcm)
@@ -186,13 +191,58 @@ static int qedr_iw_register_device(struct qedr_dev *dev)
return 0;
}
+static const struct ib_device_ops qedr_roce_dev_ops = {
+ .get_port_immutable = qedr_roce_port_immutable,
+};
+
static void qedr_roce_register_device(struct qedr_dev *dev)
{
dev->ibdev.node_type = RDMA_NODE_IB_CA;
- dev->ibdev.get_port_immutable = qedr_roce_port_immutable;
+ ib_set_device_ops(&dev->ibdev, &qedr_roce_dev_ops);
}
+static const struct ib_device_ops qedr_dev_ops = {
+ .alloc_mr = qedr_alloc_mr,
+ .alloc_pd = qedr_alloc_pd,
+ .alloc_ucontext = qedr_alloc_ucontext,
+ .create_ah = qedr_create_ah,
+ .create_cq = qedr_create_cq,
+ .create_qp = qedr_create_qp,
+ .create_srq = qedr_create_srq,
+ .dealloc_pd = qedr_dealloc_pd,
+ .dealloc_ucontext = qedr_dealloc_ucontext,
+ .dereg_mr = qedr_dereg_mr,
+ .destroy_ah = qedr_destroy_ah,
+ .destroy_cq = qedr_destroy_cq,
+ .destroy_qp = qedr_destroy_qp,
+ .destroy_srq = qedr_destroy_srq,
+ .get_dev_fw_str = qedr_get_dev_fw_str,
+ .get_dma_mr = qedr_get_dma_mr,
+ .get_link_layer = qedr_link_layer,
+ .get_netdev = qedr_get_netdev,
+ .map_mr_sg = qedr_map_mr_sg,
+ .mmap = qedr_mmap,
+ .modify_port = qedr_modify_port,
+ .modify_qp = qedr_modify_qp,
+ .modify_srq = qedr_modify_srq,
+ .poll_cq = qedr_poll_cq,
+ .post_recv = qedr_post_recv,
+ .post_send = qedr_post_send,
+ .post_srq_recv = qedr_post_srq_recv,
+ .process_mad = qedr_process_mad,
+ .query_device = qedr_query_device,
+ .query_pkey = qedr_query_pkey,
+ .query_port = qedr_query_port,
+ .query_qp = qedr_query_qp,
+ .query_srq = qedr_query_srq,
+ .reg_user_mr = qedr_reg_user_mr,
+ .req_notify_cq = qedr_arm_cq,
+ .resize_cq = qedr_resize_cq,
+ INIT_RDMA_OBJ_SIZE(ib_pd, qedr_pd, ibpd),
+ INIT_RDMA_OBJ_SIZE(ib_ucontext, qedr_ucontext, ibucontext),
+};
+
static int qedr_register_device(struct qedr_dev *dev)
{
int rc;
@@ -237,59 +287,13 @@ static int qedr_register_device(struct qedr_dev *dev)
dev->ibdev.phys_port_cnt = 1;
dev->ibdev.num_comp_vectors = dev->num_cnq;
-
- dev->ibdev.query_device = qedr_query_device;
- dev->ibdev.query_port = qedr_query_port;
- dev->ibdev.modify_port = qedr_modify_port;
-
- dev->ibdev.alloc_ucontext = qedr_alloc_ucontext;
- dev->ibdev.dealloc_ucontext = qedr_dealloc_ucontext;
- dev->ibdev.mmap = qedr_mmap;
-
- dev->ibdev.alloc_pd = qedr_alloc_pd;
- dev->ibdev.dealloc_pd = qedr_dealloc_pd;
-
- dev->ibdev.create_cq = qedr_create_cq;
- dev->ibdev.destroy_cq = qedr_destroy_cq;
- dev->ibdev.resize_cq = qedr_resize_cq;
- dev->ibdev.req_notify_cq = qedr_arm_cq;
-
- dev->ibdev.create_qp = qedr_create_qp;
- dev->ibdev.modify_qp = qedr_modify_qp;
- dev->ibdev.query_qp = qedr_query_qp;
- dev->ibdev.destroy_qp = qedr_destroy_qp;
-
- dev->ibdev.create_srq = qedr_create_srq;
- dev->ibdev.destroy_srq = qedr_destroy_srq;
- dev->ibdev.modify_srq = qedr_modify_srq;
- dev->ibdev.query_srq = qedr_query_srq;
- dev->ibdev.post_srq_recv = qedr_post_srq_recv;
- dev->ibdev.query_pkey = qedr_query_pkey;
-
- dev->ibdev.create_ah = qedr_create_ah;
- dev->ibdev.destroy_ah = qedr_destroy_ah;
-
- dev->ibdev.get_dma_mr = qedr_get_dma_mr;
- dev->ibdev.dereg_mr = qedr_dereg_mr;
- dev->ibdev.reg_user_mr = qedr_reg_user_mr;
- dev->ibdev.alloc_mr = qedr_alloc_mr;
- dev->ibdev.map_mr_sg = qedr_map_mr_sg;
-
- dev->ibdev.poll_cq = qedr_poll_cq;
- dev->ibdev.post_send = qedr_post_send;
- dev->ibdev.post_recv = qedr_post_recv;
-
- dev->ibdev.process_mad = qedr_process_mad;
-
- dev->ibdev.get_netdev = qedr_get_netdev;
-
dev->ibdev.dev.parent = &dev->pdev->dev;
- dev->ibdev.get_link_layer = qedr_link_layer;
- dev->ibdev.get_dev_fw_str = qedr_get_dev_fw_str;
rdma_set_device_sysfs_group(&dev->ibdev, &qedr_attr_group);
+ ib_set_device_ops(&dev->ibdev, &qedr_dev_ops);
+
dev->ibdev.driver_id = RDMA_DRIVER_QEDR;
- return ib_register_device(&dev->ibdev, "qedr%d", NULL);
+ return ib_register_device(&dev->ibdev, "qedr%d");
}
/* This function allocates fast-path status block memory */
@@ -851,7 +855,7 @@ static struct qedr_dev *qedr_add(struct qed_dev *cdev, struct pci_dev *pdev,
struct qedr_dev *dev;
int rc = 0;
- dev = (struct qedr_dev *)ib_alloc_device(sizeof(*dev));
+ dev = ib_alloc_device(qedr_dev, ibdev);
if (!dev) {
pr_err("Unable to allocate ib device\n");
return NULL;
diff --git a/drivers/infiniband/hw/qedr/qedr_iw_cm.c b/drivers/infiniband/hw/qedr/qedr_iw_cm.c
index 505fa3648762..0555e5a8c9ed 100644
--- a/drivers/infiniband/hw/qedr/qedr_iw_cm.c
+++ b/drivers/infiniband/hw/qedr/qedr_iw_cm.c
@@ -349,7 +349,7 @@ qedr_iw_event_handler(void *context, struct qed_iwarp_cm_event_params *params)
default:
DP_NOTICE(dev, "Unknown event received %d\n", params->event);
break;
- };
+ }
return 0;
}
@@ -492,6 +492,8 @@ int qedr_iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)
int i;
qp = idr_find(&dev->qpidr.idr, conn_param->qpn);
+ if (unlikely(!qp))
+ return -EINVAL;
laddr = (struct sockaddr_in *)&cm_id->m_local_addr;
raddr = (struct sockaddr_in *)&cm_id->m_remote_addr;
diff --git a/drivers/infiniband/hw/qedr/verbs.c b/drivers/infiniband/hw/qedr/verbs.c
index 82ee4b4a7084..59ad4202422c 100644
--- a/drivers/infiniband/hw/qedr/verbs.c
+++ b/drivers/infiniband/hw/qedr/verbs.c
@@ -67,7 +67,7 @@ static inline int qedr_ib_copy_to_udata(struct ib_udata *udata, void *src,
int qedr_query_pkey(struct ib_device *ibdev, u8 port, u16 index, u16 *pkey)
{
- if (index > QEDR_ROCE_PKEY_TABLE_LEN)
+ if (index >= QEDR_ROCE_PKEY_TABLE_LEN)
return -EINVAL;
*pkey = QEDR_ROCE_PKEY_DEFAULT;
@@ -216,10 +216,6 @@ int qedr_query_port(struct ib_device *ibdev, u8 port, struct ib_port_attr *attr)
struct qed_rdma_port *rdma_port;
dev = get_qedr_dev(ibdev);
- if (port > 1) {
- DP_ERR(dev, "invalid_port=0x%x\n", port);
- return -EINVAL;
- }
if (!dev->rdma_ctx) {
DP_ERR(dev, "rdma_ctx is NULL\n");
@@ -263,14 +259,6 @@ int qedr_query_port(struct ib_device *ibdev, u8 port, struct ib_port_attr *attr)
int qedr_modify_port(struct ib_device *ibdev, u8 port, int mask,
struct ib_port_modify *props)
{
- struct qedr_dev *dev;
-
- dev = get_qedr_dev(ibdev);
- if (port > 1) {
- DP_ERR(dev, "invalid_port=0x%x\n", port);
- return -EINVAL;
- }
-
return 0;
}
@@ -328,28 +316,24 @@ static bool qedr_search_mmap(struct qedr_ucontext *uctx, u64 phy_addr,
return found;
}
-struct ib_ucontext *qedr_alloc_ucontext(struct ib_device *ibdev,
- struct ib_udata *udata)
+int qedr_alloc_ucontext(struct ib_ucontext *uctx, struct ib_udata *udata)
{
+ struct ib_device *ibdev = uctx->device;
int rc;
- struct qedr_ucontext *ctx;
- struct qedr_alloc_ucontext_resp uresp;
+ struct qedr_ucontext *ctx = get_qedr_ucontext(uctx);
+ struct qedr_alloc_ucontext_resp uresp = {};
struct qedr_dev *dev = get_qedr_dev(ibdev);
struct qed_rdma_add_user_out_params oparams;
if (!udata)
- return ERR_PTR(-EFAULT);
-
- ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
- if (!ctx)
- return ERR_PTR(-ENOMEM);
+ return -EFAULT;
rc = dev->ops->rdma_add_user(dev->rdma_ctx, &oparams);
if (rc) {
DP_ERR(dev,
"failed to allocate a DPI for a new RoCE application, rc=%d. To overcome this consider to increase the number of DPIs, increase the doorbell BAR size or just close unnecessary RoCE applications. In order to increase the number of DPIs consult the qedr readme\n",
rc);
- goto err;
+ return rc;
}
ctx->dpi = oparams.dpi;
@@ -359,8 +343,6 @@ struct ib_ucontext *qedr_alloc_ucontext(struct ib_device *ibdev,
INIT_LIST_HEAD(&ctx->mm_head);
mutex_init(&ctx->mm_list_lock);
- memset(&uresp, 0, sizeof(uresp));
-
uresp.dpm_enabled = dev->user_dpm_enabled;
uresp.wids_enabled = 1;
uresp.wid_count = oparams.wid_count;
@@ -376,28 +358,23 @@ struct ib_ucontext *qedr_alloc_ucontext(struct ib_device *ibdev,
rc = qedr_ib_copy_to_udata(udata, &uresp, sizeof(uresp));
if (rc)
- goto err;
+ return rc;
ctx->dev = dev;
rc = qedr_add_mmap(ctx, ctx->dpi_phys_addr, ctx->dpi_size);
if (rc)
- goto err;
+ return rc;
DP_DEBUG(dev, QEDR_MSG_INIT, "Allocating user context %p\n",
&ctx->ibucontext);
- return &ctx->ibucontext;
-
-err:
- kfree(ctx);
- return ERR_PTR(rc);
+ return 0;
}
-int qedr_dealloc_ucontext(struct ib_ucontext *ibctx)
+void qedr_dealloc_ucontext(struct ib_ucontext *ibctx)
{
struct qedr_ucontext *uctx = get_qedr_ucontext(ibctx);
struct qedr_mm *mm, *tmp;
- int status = 0;
DP_DEBUG(uctx->dev, QEDR_MSG_INIT, "Deallocating user context %p\n",
uctx);
@@ -410,9 +387,6 @@ int qedr_dealloc_ucontext(struct ib_ucontext *ibctx)
list_del(&mm->entry);
kfree(mm);
}
-
- kfree(uctx);
- return status;
}
int qedr_mmap(struct ib_ucontext *context, struct vm_area_struct *vma)
@@ -462,11 +436,12 @@ int qedr_mmap(struct ib_ucontext *context, struct vm_area_struct *vma)
vma->vm_page_prot);
}
-struct ib_pd *qedr_alloc_pd(struct ib_device *ibdev,
- struct ib_ucontext *context, struct ib_udata *udata)
+int qedr_alloc_pd(struct ib_pd *ibpd, struct ib_ucontext *context,
+ struct ib_udata *udata)
{
+ struct ib_device *ibdev = ibpd->device;
struct qedr_dev *dev = get_qedr_dev(ibdev);
- struct qedr_pd *pd;
+ struct qedr_pd *pd = get_qedr_pd(ibpd);
u16 pd_id;
int rc;
@@ -475,16 +450,12 @@ struct ib_pd *qedr_alloc_pd(struct ib_device *ibdev,
if (!dev->rdma_ctx) {
DP_ERR(dev, "invalid RDMA context\n");
- return ERR_PTR(-EINVAL);
+ return -EINVAL;
}
- pd = kzalloc(sizeof(*pd), GFP_KERNEL);
- if (!pd)
- return ERR_PTR(-ENOMEM);
-
rc = dev->ops->rdma_alloc_pd(dev->rdma_ctx, &pd_id);
if (rc)
- goto err;
+ return rc;
pd->pd_id = pd_id;
@@ -497,36 +468,23 @@ struct ib_pd *qedr_alloc_pd(struct ib_device *ibdev,
if (rc) {
DP_ERR(dev, "copy error pd_id=0x%x.\n", pd_id);
dev->ops->rdma_dealloc_pd(dev->rdma_ctx, pd_id);
- goto err;
+ return rc;
}
pd->uctx = get_qedr_ucontext(context);
pd->uctx->pd = pd;
}
- return &pd->ibpd;
-
-err:
- kfree(pd);
- return ERR_PTR(rc);
+ return 0;
}
-int qedr_dealloc_pd(struct ib_pd *ibpd)
+void qedr_dealloc_pd(struct ib_pd *ibpd)
{
struct qedr_dev *dev = get_qedr_dev(ibpd->device);
struct qedr_pd *pd = get_qedr_pd(ibpd);
- if (!pd) {
- pr_err("Invalid PD received in dealloc_pd\n");
- return -EINVAL;
- }
-
DP_DEBUG(dev, QEDR_MSG_INIT, "Deallocating PD %d\n", pd->pd_id);
dev->ops->rdma_dealloc_pd(dev->rdma_ctx, pd->pd_id);
-
- kfree(pd);
-
- return 0;
}
static void qedr_free_pbl(struct qedr_dev *dev,
@@ -568,8 +526,8 @@ static struct qedr_pbl *qedr_alloc_pbl_tbl(struct qedr_dev *dev,
return ERR_PTR(-ENOMEM);
for (i = 0; i < pbl_info->num_pbls; i++) {
- va = dma_zalloc_coherent(&pdev->dev, pbl_info->pbl_size,
- &pa, flags);
+ va = dma_alloc_coherent(&pdev->dev, pbl_info->pbl_size, &pa,
+ flags);
if (!va)
goto err;
@@ -648,13 +606,12 @@ static void qedr_populate_pbls(struct qedr_dev *dev, struct ib_umem *umem,
struct qedr_pbl *pbl,
struct qedr_pbl_info *pbl_info, u32 pg_shift)
{
- int shift, pg_cnt, pages, pbe_cnt, total_num_pbes = 0;
+ int pbe_cnt, total_num_pbes = 0;
u32 fw_pg_cnt, fw_pg_per_umem_pg;
struct qedr_pbl *pbl_tbl;
- struct scatterlist *sg;
+ struct sg_dma_page_iter sg_iter;
struct regpair *pbe;
u64 pg_addr;
- int entry;
if (!pbl_info->num_pbes)
return;
@@ -675,38 +632,32 @@ static void qedr_populate_pbls(struct qedr_dev *dev, struct ib_umem *umem,
pbe_cnt = 0;
- shift = umem->page_shift;
-
- fw_pg_per_umem_pg = BIT(umem->page_shift - pg_shift);
-
- for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) {
- pages = sg_dma_len(sg) >> shift;
- pg_addr = sg_dma_address(sg);
- for (pg_cnt = 0; pg_cnt < pages; pg_cnt++) {
- for (fw_pg_cnt = 0; fw_pg_cnt < fw_pg_per_umem_pg;) {
- pbe->lo = cpu_to_le32(pg_addr);
- pbe->hi = cpu_to_le32(upper_32_bits(pg_addr));
-
- pg_addr += BIT(pg_shift);
- pbe_cnt++;
- total_num_pbes++;
- pbe++;
-
- if (total_num_pbes == pbl_info->num_pbes)
- return;
-
- /* If the given pbl is full storing the pbes,
- * move to next pbl.
- */
- if (pbe_cnt ==
- (pbl_info->pbl_size / sizeof(u64))) {
- pbl_tbl++;
- pbe = (struct regpair *)pbl_tbl->va;
- pbe_cnt = 0;
- }
+ fw_pg_per_umem_pg = BIT(PAGE_SHIFT - pg_shift);
+
+ for_each_sg_dma_page (umem->sg_head.sgl, &sg_iter, umem->nmap, 0) {
+ pg_addr = sg_page_iter_dma_address(&sg_iter);
+ for (fw_pg_cnt = 0; fw_pg_cnt < fw_pg_per_umem_pg;) {
+ pbe->lo = cpu_to_le32(pg_addr);
+ pbe->hi = cpu_to_le32(upper_32_bits(pg_addr));
+
+ pg_addr += BIT(pg_shift);
+ pbe_cnt++;
+ total_num_pbes++;
+ pbe++;
+
+ if (total_num_pbes == pbl_info->num_pbes)
+ return;
- fw_pg_cnt++;
+ /* If the given pbl is full storing the pbes,
+ * move to next pbl.
+ */
+ if (pbe_cnt == (pbl_info->pbl_size / sizeof(u64))) {
+ pbl_tbl++;
+ pbe = (struct regpair *)pbl_tbl->va;
+ pbe_cnt = 0;
}
+
+ fw_pg_cnt++;
}
}
}
@@ -748,11 +699,10 @@ static inline int qedr_align_cq_entries(int entries)
return aligned_size / QEDR_CQE_SIZE;
}
-static inline int qedr_init_user_queue(struct ib_ucontext *ib_ctx,
+static inline int qedr_init_user_queue(struct ib_udata *udata,
struct qedr_dev *dev,
- struct qedr_userq *q,
- u64 buf_addr, size_t buf_len,
- int access, int dmasync,
+ struct qedr_userq *q, u64 buf_addr,
+ size_t buf_len, int access, int dmasync,
int alloc_and_init)
{
u32 fw_pages;
@@ -760,7 +710,7 @@ static inline int qedr_init_user_queue(struct ib_ucontext *ib_ctx,
q->buf_addr = buf_addr;
q->buf_len = buf_len;
- q->umem = ib_umem_get(ib_ctx, q->buf_addr, q->buf_len, access, dmasync);
+ q->umem = ib_umem_get(udata, q->buf_addr, q->buf_len, access, dmasync);
if (IS_ERR(q->umem)) {
DP_ERR(dev, "create user queue: failed ib_umem_get, got %ld\n",
PTR_ERR(q->umem));
@@ -768,7 +718,7 @@ static inline int qedr_init_user_queue(struct ib_ucontext *ib_ctx,
}
fw_pages = ib_umem_page_count(q->umem) <<
- (q->umem->page_shift - FW_PAGE_SHIFT);
+ (PAGE_SHIFT - FW_PAGE_SHIFT);
rc = qedr_prepare_pbl_tbl(dev, &q->pbl_info, fw_pages, 0);
if (rc)
@@ -917,9 +867,9 @@ struct ib_cq *qedr_create_cq(struct ib_device *ibdev,
cq->cq_type = QEDR_CQ_TYPE_USER;
- rc = qedr_init_user_queue(ib_ctx, dev, &cq->q, ureq.addr,
- ureq.len, IB_ACCESS_LOCAL_WRITE,
- 1, 1);
+ rc = qedr_init_user_queue(udata, dev, &cq->q, ureq.addr,
+ ureq.len, IB_ACCESS_LOCAL_WRITE, 1,
+ 1);
if (rc)
goto err0;
@@ -1148,7 +1098,8 @@ static inline int get_gid_info_from_table(struct ib_qp *ibqp,
}
static int qedr_check_qp_attrs(struct ib_pd *ibpd, struct qedr_dev *dev,
- struct ib_qp_init_attr *attrs)
+ struct ib_qp_init_attr *attrs,
+ struct ib_udata *udata)
{
struct qedr_device_attr *qattr = &dev->attr;
@@ -1189,7 +1140,7 @@ static int qedr_check_qp_attrs(struct ib_pd *ibpd, struct qedr_dev *dev,
}
/* Unprivileged user space cannot create special QP */
- if (ibpd->uobject && attrs->qp_type == IB_QPT_GSI) {
+ if (udata && attrs->qp_type == IB_QPT_GSI) {
DP_ERR(dev,
"create qp: userspace can't create special QPs of type=0x%x\n",
attrs->qp_type);
@@ -1355,7 +1306,7 @@ static void qedr_free_srq_kernel_params(struct qedr_srq *srq)
hw_srq->phy_prod_pair_addr);
}
-static int qedr_init_srq_user_params(struct ib_ucontext *ib_ctx,
+static int qedr_init_srq_user_params(struct ib_udata *udata,
struct qedr_srq *srq,
struct qedr_create_srq_ureq *ureq,
int access, int dmasync)
@@ -1363,14 +1314,14 @@ static int qedr_init_srq_user_params(struct ib_ucontext *ib_ctx,
struct scatterlist *sg;
int rc;
- rc = qedr_init_user_queue(ib_ctx, srq->dev, &srq->usrq, ureq->srq_addr,
+ rc = qedr_init_user_queue(udata, srq->dev, &srq->usrq, ureq->srq_addr,
ureq->srq_len, access, dmasync, 1);
if (rc)
return rc;
- srq->prod_umem = ib_umem_get(ib_ctx, ureq->prod_pair_addr,
- sizeof(struct rdma_srq_producers),
- access, dmasync);
+ srq->prod_umem =
+ ib_umem_get(udata, ureq->prod_pair_addr,
+ sizeof(struct rdma_srq_producers), access, dmasync);
if (IS_ERR(srq->prod_umem)) {
qedr_free_pbl(srq->dev, &srq->usrq.pbl_info, srq->usrq.pbl_tbl);
ib_umem_release(srq->usrq.umem);
@@ -1445,7 +1396,6 @@ struct ib_srq *qedr_create_srq(struct ib_pd *ibpd,
struct qedr_pd *pd = get_qedr_pd(ibpd);
struct qedr_create_srq_ureq ureq = {};
u64 pbl_base_addr, phy_prod_pair_addr;
- struct ib_ucontext *ib_ctx = NULL;
struct qedr_srq_hwq_info *hw_srq;
u32 page_cnt, page_size;
struct qedr_srq *srq;
@@ -1470,23 +1420,21 @@ struct ib_srq *qedr_create_srq(struct ib_pd *ibpd,
hw_srq->max_wr = init_attr->attr.max_wr;
hw_srq->max_sges = init_attr->attr.max_sge;
- if (udata && ibpd->uobject && ibpd->uobject->context) {
- ib_ctx = ibpd->uobject->context;
-
+ if (udata) {
if (ib_copy_from_udata(&ureq, udata, sizeof(ureq))) {
DP_ERR(dev,
"create srq: problem copying data from user space\n");
goto err0;
}
- rc = qedr_init_srq_user_params(ib_ctx, srq, &ureq, 0, 0);
+ rc = qedr_init_srq_user_params(udata, srq, &ureq, 0, 0);
if (rc)
goto err0;
page_cnt = srq->usrq.pbl_info.num_pbes;
pbl_base_addr = srq->usrq.pbl_tbl->pa;
phy_prod_pair_addr = hw_srq->phy_prod_pair_addr;
- page_size = BIT(srq->usrq.umem->page_shift);
+ page_size = PAGE_SIZE;
} else {
struct qed_chain *pbl;
@@ -1552,7 +1500,7 @@ int qedr_destroy_srq(struct ib_srq *ibsrq)
in_params.srq_id = srq->srq_id;
dev->ops->rdma_destroy_srq(dev->rdma_ctx, &in_params);
- if (ibsrq->pd->uobject)
+ if (ibsrq->uobject)
qedr_free_srq_user_params(srq);
else
qedr_free_srq_kernel_params(srq);
@@ -1710,13 +1658,10 @@ static int qedr_create_user_qp(struct qedr_dev *dev,
struct qed_rdma_create_qp_in_params in_params;
struct qed_rdma_create_qp_out_params out_params;
struct qedr_pd *pd = get_qedr_pd(ibpd);
- struct ib_ucontext *ib_ctx = NULL;
struct qedr_create_qp_ureq ureq;
int alloc_and_init = rdma_protocol_roce(&dev->ibdev, 1);
int rc = -EINVAL;
- ib_ctx = ibpd->uobject->context;
-
memset(&ureq, 0, sizeof(ureq));
rc = ib_copy_from_udata(&ureq, udata, sizeof(ureq));
if (rc) {
@@ -1725,14 +1670,14 @@ static int qedr_create_user_qp(struct qedr_dev *dev,
}
/* SQ - read access only (0), dma sync not required (0) */
- rc = qedr_init_user_queue(ib_ctx, dev, &qp->usq, ureq.sq_addr,
+ rc = qedr_init_user_queue(udata, dev, &qp->usq, ureq.sq_addr,
ureq.sq_len, 0, 0, alloc_and_init);
if (rc)
return rc;
if (!qp->srq) {
/* RQ - read access only (0), dma sync not required (0) */
- rc = qedr_init_user_queue(ib_ctx, dev, &qp->urq, ureq.rq_addr,
+ rc = qedr_init_user_queue(udata, dev, &qp->urq, ureq.rq_addr,
ureq.rq_len, 0, 0, alloc_and_init);
if (rc)
return rc;
@@ -2005,7 +1950,7 @@ struct ib_qp *qedr_create_qp(struct ib_pd *ibpd,
DP_DEBUG(dev, QEDR_MSG_QP, "create qp: called from %s, pd=%p\n",
udata ? "user library" : "kernel", pd);
- rc = qedr_check_qp_attrs(ibpd, dev, attrs);
+ rc = qedr_check_qp_attrs(ibpd, dev, attrs, udata);
if (rc)
return ERR_PTR(rc);
@@ -2128,7 +2073,7 @@ static int qedr_update_qp_state(struct qedr_dev *dev,
default:
status = -EINVAL;
break;
- };
+ }
break;
case QED_ROCE_QP_STATE_INIT:
switch (new_state) {
@@ -2149,7 +2094,7 @@ static int qedr_update_qp_state(struct qedr_dev *dev,
/* Invalid state change. */
status = -EINVAL;
break;
- };
+ }
break;
case QED_ROCE_QP_STATE_RTR:
/* RTR->XXX */
@@ -2162,7 +2107,7 @@ static int qedr_update_qp_state(struct qedr_dev *dev,
/* Invalid state change. */
status = -EINVAL;
break;
- };
+ }
break;
case QED_ROCE_QP_STATE_RTS:
/* RTS->XXX */
@@ -2175,7 +2120,7 @@ static int qedr_update_qp_state(struct qedr_dev *dev,
/* Invalid state change. */
status = -EINVAL;
break;
- };
+ }
break;
case QED_ROCE_QP_STATE_SQD:
/* SQD->XXX */
@@ -2187,7 +2132,7 @@ static int qedr_update_qp_state(struct qedr_dev *dev,
/* Invalid state change. */
status = -EINVAL;
break;
- };
+ }
break;
case QED_ROCE_QP_STATE_ERR:
/* ERR->XXX */
@@ -2205,12 +2150,12 @@ static int qedr_update_qp_state(struct qedr_dev *dev,
default:
status = -EINVAL;
break;
- };
+ }
break;
default:
status = -EINVAL;
break;
- };
+ }
return status;
}
@@ -2626,7 +2571,7 @@ int qedr_destroy_qp(struct ib_qp *ibqp)
}
struct ib_ah *qedr_create_ah(struct ib_pd *ibpd, struct rdma_ah_attr *attr,
- struct ib_udata *udata)
+ u32 flags, struct ib_udata *udata)
{
struct qedr_ah *ah;
@@ -2639,7 +2584,7 @@ struct ib_ah *qedr_create_ah(struct ib_pd *ibpd, struct rdma_ah_attr *attr,
return &ah->ibah;
}
-int qedr_destroy_ah(struct ib_ah *ibah)
+int qedr_destroy_ah(struct ib_ah *ibah, u32 flags)
{
struct qedr_ah *ah = get_qedr_ah(ibah);
@@ -2730,7 +2675,7 @@ struct ib_mr *qedr_reg_user_mr(struct ib_pd *ibpd, u64 start, u64 len,
mr->type = QEDR_MR_USER;
- mr->umem = ib_umem_get(ibpd->uobject->context, start, len, acc, 0);
+ mr->umem = ib_umem_get(udata, start, len, acc, 0);
if (IS_ERR(mr->umem)) {
rc = -EFAULT;
goto err0;
@@ -2741,7 +2686,7 @@ struct ib_mr *qedr_reg_user_mr(struct ib_pd *ibpd, u64 start, u64 len,
goto err1;
qedr_populate_pbls(dev, mr->umem, mr->info.pbl_table,
- &mr->info.pbl_info, mr->umem->page_shift);
+ &mr->info.pbl_info, PAGE_SHIFT);
rc = dev->ops->rdma_alloc_tid(dev->rdma_ctx, &mr->hw_mr.itid);
if (rc) {
@@ -2762,7 +2707,7 @@ struct ib_mr *qedr_reg_user_mr(struct ib_pd *ibpd, u64 start, u64 len,
mr->hw_mr.pbl_ptr = mr->info.pbl_table[0].pa;
mr->hw_mr.pbl_two_level = mr->info.pbl_info.two_layered;
mr->hw_mr.pbl_page_size_log = ilog2(mr->info.pbl_info.pbl_size);
- mr->hw_mr.page_size_log = mr->umem->page_shift;
+ mr->hw_mr.page_size_log = PAGE_SHIFT;
mr->hw_mr.fbo = ib_umem_offset(mr->umem);
mr->hw_mr.length = len;
mr->hw_mr.vaddr = usr_addr;
diff --git a/drivers/infiniband/hw/qedr/verbs.h b/drivers/infiniband/hw/qedr/verbs.h
index 0b7d0124b16c..f0c05f4771ac 100644
--- a/drivers/infiniband/hw/qedr/verbs.h
+++ b/drivers/infiniband/hw/qedr/verbs.h
@@ -43,13 +43,13 @@ int qedr_iw_query_gid(struct ib_device *ibdev, u8 port,
int qedr_query_pkey(struct ib_device *, u8 port, u16 index, u16 *pkey);
-struct ib_ucontext *qedr_alloc_ucontext(struct ib_device *, struct ib_udata *);
-int qedr_dealloc_ucontext(struct ib_ucontext *);
+int qedr_alloc_ucontext(struct ib_ucontext *uctx, struct ib_udata *udata);
+void qedr_dealloc_ucontext(struct ib_ucontext *uctx);
int qedr_mmap(struct ib_ucontext *, struct vm_area_struct *vma);
-struct ib_pd *qedr_alloc_pd(struct ib_device *,
- struct ib_ucontext *, struct ib_udata *);
-int qedr_dealloc_pd(struct ib_pd *pd);
+int qedr_alloc_pd(struct ib_pd *pd, struct ib_ucontext *uctx,
+ struct ib_udata *udata);
+void qedr_dealloc_pd(struct ib_pd *pd);
struct ib_cq *qedr_create_cq(struct ib_device *ibdev,
const struct ib_cq_init_attr *attr,
@@ -76,8 +76,8 @@ int qedr_destroy_srq(struct ib_srq *ibsrq);
int qedr_post_srq_recv(struct ib_srq *ibsrq, const struct ib_recv_wr *wr,
const struct ib_recv_wr **bad_recv_wr);
struct ib_ah *qedr_create_ah(struct ib_pd *ibpd, struct rdma_ah_attr *attr,
- struct ib_udata *udata);
-int qedr_destroy_ah(struct ib_ah *ibah);
+ u32 flags, struct ib_udata *udata);
+int qedr_destroy_ah(struct ib_ah *ibah, u32 flags);
int qedr_dereg_mr(struct ib_mr *);
struct ib_mr *qedr_get_dma_mr(struct ib_pd *, int acc);
diff --git a/drivers/infiniband/hw/qib/qib_debugfs.c b/drivers/infiniband/hw/qib/qib_debugfs.c
index 5ed1ed93380f..caeb77d07a58 100644
--- a/drivers/infiniband/hw/qib/qib_debugfs.c
+++ b/drivers/infiniband/hw/qib/qib_debugfs.c
@@ -66,15 +66,6 @@ static const struct file_operations _##name##_file_ops = { \
.release = seq_release \
};
-#define DEBUGFS_FILE_CREATE(name) \
-do { \
- struct dentry *ent; \
- ent = debugfs_create_file(#name , 0400, ibd->qib_ibdev_dbg, \
- ibd, &_##name##_file_ops); \
- if (!ent) \
- pr_warn("create of " #name " failed\n"); \
-} while (0)
-
static void *_opcode_stats_seq_start(struct seq_file *s, loff_t *pos)
{
struct qib_opcode_stats_perctx *opstats;
@@ -249,17 +240,17 @@ DEBUGFS_FILE(qp_stats)
void qib_dbg_ibdev_init(struct qib_ibdev *ibd)
{
+ struct dentry *root;
char name[10];
snprintf(name, sizeof(name), "qib%d", dd_from_dev(ibd)->unit);
- ibd->qib_ibdev_dbg = debugfs_create_dir(name, qib_dbg_root);
- if (!ibd->qib_ibdev_dbg) {
- pr_warn("create of %s failed\n", name);
- return;
- }
- DEBUGFS_FILE_CREATE(opcode_stats);
- DEBUGFS_FILE_CREATE(ctx_stats);
- DEBUGFS_FILE_CREATE(qp_stats);
+ root = debugfs_create_dir(name, qib_dbg_root);
+ ibd->qib_ibdev_dbg = root;
+
+ debugfs_create_file("opcode_stats", 0400, root, ibd,
+ &_opcode_stats_file_ops);
+ debugfs_create_file("ctx_stats", 0400, root, ibd, &_ctx_stats_file_ops);
+ debugfs_create_file("qp_stats", 0400, root, ibd, &_qp_stats_file_ops);
}
void qib_dbg_ibdev_exit(struct qib_ibdev *ibd)
@@ -274,8 +265,6 @@ out:
void qib_dbg_init(void)
{
qib_dbg_root = debugfs_create_dir(QIB_DRV_NAME, NULL);
- if (!qib_dbg_root)
- pr_warn("init of debugfs failed\n");
}
void qib_dbg_exit(void)
diff --git a/drivers/infiniband/hw/qib/qib_file_ops.c b/drivers/infiniband/hw/qib/qib_file_ops.c
index 98e1ce14fa2a..78fa634de98a 100644
--- a/drivers/infiniband/hw/qib/qib_file_ops.c
+++ b/drivers/infiniband/hw/qib/qib_file_ops.c
@@ -343,7 +343,7 @@ static int qib_tid_update(struct qib_ctxtdata *rcd, struct file *fp,
/* virtual address of first page in transfer */
vaddr = ti->tidvaddr;
- if (!access_ok(VERIFY_WRITE, (void __user *) vaddr,
+ if (!access_ok((void __user *) vaddr,
cnt * PAGE_SIZE)) {
ret = -EFAULT;
goto done;
diff --git a/drivers/infiniband/hw/qib/qib_iba6120.c b/drivers/infiniband/hw/qib/qib_iba6120.c
index fb1ff59f40bd..cdbf707fa267 100644
--- a/drivers/infiniband/hw/qib/qib_iba6120.c
+++ b/drivers/infiniband/hw/qib/qib_iba6120.c
@@ -3237,7 +3237,6 @@ static int init_6120_variables(struct qib_devdata *dd)
/* we always allocate at least 2048 bytes for eager buffers */
ret = ib_mtu_enum_to_int(qib_ibmtu);
dd->rcvegrbufsize = ret != -1 ? max(ret, 2048) : QIB_DEFAULT_MTU;
- BUG_ON(!is_power_of_2(dd->rcvegrbufsize));
dd->rcvegrbufsize_shift = ilog2(dd->rcvegrbufsize);
qib_6120_tidtemplate(dd);
diff --git a/drivers/infiniband/hw/qib/qib_iba7220.c b/drivers/infiniband/hw/qib/qib_iba7220.c
index 163a57a88742..9fde45538f6e 100644
--- a/drivers/infiniband/hw/qib/qib_iba7220.c
+++ b/drivers/infiniband/hw/qib/qib_iba7220.c
@@ -4043,7 +4043,6 @@ static int qib_init_7220_variables(struct qib_devdata *dd)
/* we always allocate at least 2048 bytes for eager buffers */
ret = ib_mtu_enum_to_int(qib_ibmtu);
dd->rcvegrbufsize = ret != -1 ? max(ret, 2048) : QIB_DEFAULT_MTU;
- BUG_ON(!is_power_of_2(dd->rcvegrbufsize));
dd->rcvegrbufsize_shift = ilog2(dd->rcvegrbufsize);
qib_7220_tidtemplate(dd);
@@ -4252,7 +4251,6 @@ static int init_sdma_7220_regs(struct qib_pportdata *ppd)
unsigned word = i / 64;
unsigned bit = i & 63;
- BUG_ON(word >= 3);
senddmabufmask[word] |= 1ULL << bit;
}
qib_write_kreg(dd, kr_senddmabufmask0, senddmabufmask[0]);
diff --git a/drivers/infiniband/hw/qib/qib_iba7322.c b/drivers/infiniband/hw/qib/qib_iba7322.c
index bf5e222eed8e..17d6b24b3473 100644
--- a/drivers/infiniband/hw/qib/qib_iba7322.c
+++ b/drivers/infiniband/hw/qib/qib_iba7322.c
@@ -1382,7 +1382,6 @@ static void err_decode(char *msg, size_t len, u64 errs,
*msg++ = ',';
len--;
}
- BUG_ON(!msp->sz);
/* msp->sz counts the nul */
took = min_t(size_t, msp->sz - (size_t)1, len);
memcpy(msg, msp->msg, took);
@@ -6599,7 +6598,6 @@ static int qib_init_7322_variables(struct qib_devdata *dd)
/* we always allocate at least 2048 bytes for eager buffers */
dd->rcvegrbufsize = max(mtu, 2048);
- BUG_ON(!is_power_of_2(dd->rcvegrbufsize));
dd->rcvegrbufsize_shift = ilog2(dd->rcvegrbufsize);
qib_7322_tidtemplate(dd);
@@ -6904,7 +6902,6 @@ static int init_sdma_7322_regs(struct qib_pportdata *ppd)
unsigned word = erstbuf / BITS_PER_LONG;
unsigned bit = erstbuf & (BITS_PER_LONG - 1);
- BUG_ON(word >= 3);
senddmabufmask[word] |= 1ULL << bit;
}
qib_write_kreg_port(ppd, krp_senddmabufmask0, senddmabufmask[0]);
diff --git a/drivers/infiniband/hw/qib/qib_init.c b/drivers/infiniband/hw/qib/qib_init.c
index d7cdc77d6306..9fd69903ca57 100644
--- a/drivers/infiniband/hw/qib/qib_init.c
+++ b/drivers/infiniband/hw/qib/qib_init.c
@@ -209,7 +209,6 @@ struct qib_ctxtdata *qib_create_ctxtdata(struct qib_pportdata *ppd, u32 ctxt,
rcd->rcvegrbuf_chunks = (rcd->rcvegrcnt +
rcd->rcvegrbufs_perchunk - 1) /
rcd->rcvegrbufs_perchunk;
- BUG_ON(!is_power_of_2(rcd->rcvegrbufs_perchunk));
rcd->rcvegrbufs_perchunk_shift =
ilog2(rcd->rcvegrbufs_perchunk);
}
diff --git a/drivers/infiniband/hw/qib/qib_mad.c b/drivers/infiniband/hw/qib/qib_mad.c
index 4845d000c22f..f92faf5ec369 100644
--- a/drivers/infiniband/hw/qib/qib_mad.c
+++ b/drivers/infiniband/hw/qib/qib_mad.c
@@ -2494,5 +2494,6 @@ void qib_notify_free_mad_agent(struct rvt_dev_info *rdi, int port_idx)
del_timer_sync(&dd->pport[port_idx].cong_stats.timer);
if (dd->pport[port_idx].ibport_data.smi_ah)
- rdma_destroy_ah(&dd->pport[port_idx].ibport_data.smi_ah->ibah);
+ rdma_destroy_ah(&dd->pport[port_idx].ibport_data.smi_ah->ibah,
+ RDMA_DESTROY_AH_SLEEPABLE);
}
diff --git a/drivers/infiniband/hw/qib/qib_pcie.c b/drivers/infiniband/hw/qib/qib_pcie.c
index 30595b358d8f..864f2af171f7 100644
--- a/drivers/infiniband/hw/qib/qib_pcie.c
+++ b/drivers/infiniband/hw/qib/qib_pcie.c
@@ -387,7 +387,7 @@ void qib_pcie_reenable(struct qib_devdata *dd, u16 cmd, u8 iline, u8 cline)
static int qib_pcie_coalesce;
module_param_named(pcie_coalesce, qib_pcie_coalesce, int, S_IRUGO);
-MODULE_PARM_DESC(pcie_coalesce, "tune PCIe colescing on some Intel chipsets");
+MODULE_PARM_DESC(pcie_coalesce, "tune PCIe coalescing on some Intel chipsets");
/*
* Enable PCIe completion and data coalescing, on Intel 5x00 and 7300
diff --git a/drivers/infiniband/hw/qib/qib_rc.c b/drivers/infiniband/hw/qib/qib_rc.c
index 6fa002940451..50dd9811b088 100644
--- a/drivers/infiniband/hw/qib/qib_rc.c
+++ b/drivers/infiniband/hw/qib/qib_rc.c
@@ -45,12 +45,7 @@ static u32 restart_sge(struct rvt_sge_state *ss, struct rvt_swqe *wqe,
u32 len;
len = ((psn - wqe->psn) & QIB_PSN_MASK) * pmtu;
- ss->sge = wqe->sg_list[0];
- ss->sg_list = wqe->sg_list + 1;
- ss->num_sge = wqe->wr.num_sge;
- ss->total_len = wqe->length;
- rvt_skip_sge(ss, len, false);
- return wqe->length - len;
+ return rvt_restart_sge(ss, wqe, len);
}
/**
diff --git a/drivers/infiniband/hw/qib/qib_sdma.c b/drivers/infiniband/hw/qib/qib_sdma.c
index 757d4c9d713d..99e11c347130 100644
--- a/drivers/infiniband/hw/qib/qib_sdma.c
+++ b/drivers/infiniband/hw/qib/qib_sdma.c
@@ -565,19 +565,15 @@ retry:
sge = &ss->sge;
while (dwords) {
u32 dw;
- u32 len;
-
- len = dwords << 2;
- if (len > sge->length)
- len = sge->length;
- if (len > sge->sge_length)
- len = sge->sge_length;
- BUG_ON(len == 0);
+ u32 len = rvt_get_sge_length(sge, dwords << 2);
+
dw = (len + 3) >> 2;
addr = dma_map_single(&ppd->dd->pcidev->dev, sge->vaddr,
dw << 2, DMA_TO_DEVICE);
- if (dma_mapping_error(&ppd->dd->pcidev->dev, addr))
+ if (dma_mapping_error(&ppd->dd->pcidev->dev, addr)) {
+ ret = -ENOMEM;
goto unmap;
+ }
sdmadesc[0] = 0;
make_sdma_desc(ppd, sdmadesc, (u64) addr, dw, dwoffset);
/* SDmaUseLargeBuf has to be set in every descriptor */
@@ -593,24 +589,7 @@ retry:
descqp = &ppd->sdma_descq[0].qw[0];
++ppd->sdma_generation;
}
- sge->vaddr += len;
- sge->length -= len;
- sge->sge_length -= len;
- if (sge->sge_length == 0) {
- if (--ss->num_sge)
- *sge = *ss->sg_list++;
- } else if (sge->length == 0 && sge->mr->lkey) {
- if (++sge->n >= RVT_SEGSZ) {
- if (++sge->m >= sge->mr->mapsz)
- break;
- sge->n = 0;
- }
- sge->vaddr =
- sge->mr->map[sge->m]->segs[sge->n].vaddr;
- sge->length =
- sge->mr->map[sge->m]->segs[sge->n].length;
- }
-
+ rvt_update_sge(ss, len, false);
dwoffset += dw;
dwords -= dw;
}
diff --git a/drivers/infiniband/hw/qib/qib_sysfs.c b/drivers/infiniband/hw/qib/qib_sysfs.c
index 1cf4ca3f23e3..905206a0c2d5 100644
--- a/drivers/infiniband/hw/qib/qib_sysfs.c
+++ b/drivers/infiniband/hw/qib/qib_sysfs.c
@@ -555,7 +555,7 @@ static ssize_t hw_rev_show(struct device *device, struct device_attribute *attr,
char *buf)
{
struct qib_ibdev *dev =
- container_of(device, struct qib_ibdev, rdi.ibdev.dev);
+ rdma_device_to_drv_device(device, struct qib_ibdev, rdi.ibdev);
return sprintf(buf, "%x\n", dd_from_dev(dev)->minrev);
}
@@ -565,7 +565,7 @@ static ssize_t hca_type_show(struct device *device,
struct device_attribute *attr, char *buf)
{
struct qib_ibdev *dev =
- container_of(device, struct qib_ibdev, rdi.ibdev.dev);
+ rdma_device_to_drv_device(device, struct qib_ibdev, rdi.ibdev);
struct qib_devdata *dd = dd_from_dev(dev);
int ret;
@@ -590,7 +590,7 @@ static ssize_t boardversion_show(struct device *device,
struct device_attribute *attr, char *buf)
{
struct qib_ibdev *dev =
- container_of(device, struct qib_ibdev, rdi.ibdev.dev);
+ rdma_device_to_drv_device(device, struct qib_ibdev, rdi.ibdev);
struct qib_devdata *dd = dd_from_dev(dev);
/* The string printed here is already newline-terminated. */
@@ -602,7 +602,7 @@ static ssize_t localbus_info_show(struct device *device,
struct device_attribute *attr, char *buf)
{
struct qib_ibdev *dev =
- container_of(device, struct qib_ibdev, rdi.ibdev.dev);
+ rdma_device_to_drv_device(device, struct qib_ibdev, rdi.ibdev);
struct qib_devdata *dd = dd_from_dev(dev);
/* The string printed here is already newline-terminated. */
@@ -614,7 +614,7 @@ static ssize_t nctxts_show(struct device *device,
struct device_attribute *attr, char *buf)
{
struct qib_ibdev *dev =
- container_of(device, struct qib_ibdev, rdi.ibdev.dev);
+ rdma_device_to_drv_device(device, struct qib_ibdev, rdi.ibdev);
struct qib_devdata *dd = dd_from_dev(dev);
/* Return the number of user ports (contexts) available. */
@@ -630,7 +630,7 @@ static ssize_t nfreectxts_show(struct device *device,
struct device_attribute *attr, char *buf)
{
struct qib_ibdev *dev =
- container_of(device, struct qib_ibdev, rdi.ibdev.dev);
+ rdma_device_to_drv_device(device, struct qib_ibdev, rdi.ibdev);
struct qib_devdata *dd = dd_from_dev(dev);
/* Return the number of free user ports (contexts) available. */
@@ -642,7 +642,7 @@ static ssize_t serial_show(struct device *device,
struct device_attribute *attr, char *buf)
{
struct qib_ibdev *dev =
- container_of(device, struct qib_ibdev, rdi.ibdev.dev);
+ rdma_device_to_drv_device(device, struct qib_ibdev, rdi.ibdev);
struct qib_devdata *dd = dd_from_dev(dev);
buf[sizeof(dd->serial)] = '\0';
@@ -657,7 +657,7 @@ static ssize_t chip_reset_store(struct device *device,
size_t count)
{
struct qib_ibdev *dev =
- container_of(device, struct qib_ibdev, rdi.ibdev.dev);
+ rdma_device_to_drv_device(device, struct qib_ibdev, rdi.ibdev);
struct qib_devdata *dd = dd_from_dev(dev);
int ret;
@@ -679,7 +679,7 @@ static ssize_t tempsense_show(struct device *device,
struct device_attribute *attr, char *buf)
{
struct qib_ibdev *dev =
- container_of(device, struct qib_ibdev, rdi.ibdev.dev);
+ rdma_device_to_drv_device(device, struct qib_ibdev, rdi.ibdev);
struct qib_devdata *dd = dd_from_dev(dev);
int ret;
int idx;
diff --git a/drivers/infiniband/hw/qib/qib_ud.c b/drivers/infiniband/hw/qib/qib_ud.c
index 4d4c31ea4e2d..5cdedba2d164 100644
--- a/drivers/infiniband/hw/qib/qib_ud.c
+++ b/drivers/infiniband/hw/qib/qib_ud.c
@@ -172,13 +172,8 @@ static void qib_ud_loopback(struct rvt_qp *sqp, struct rvt_swqe *swqe)
ssge.num_sge = swqe->wr.num_sge;
sge = &ssge.sge;
while (length) {
- u32 len = sge->length;
+ u32 len = rvt_get_sge_length(sge, length);
- if (len > length)
- len = length;
- if (len > sge->sge_length)
- len = sge->sge_length;
- BUG_ON(len == 0);
rvt_copy_sge(qp, &qp->r_sge, sge->vaddr, len, true, false);
sge->vaddr += len;
sge->length -= len;
@@ -513,7 +508,6 @@ void qib_ud_rcv(struct qib_ibport *ibp, struct ib_header *hdr,
opcode == IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE) {
wc.ex.imm_data = ohdr->u.ud.imm_data;
wc.wc_flags = IB_WC_WITH_IMM;
- tlen -= sizeof(u32);
} else if (opcode == IB_OPCODE_UD_SEND_ONLY) {
wc.ex.imm_data = 0;
wc.wc_flags = 0;
diff --git a/drivers/infiniband/hw/qib/qib_user_pages.c b/drivers/infiniband/hw/qib/qib_user_pages.c
index 16543d5e80c3..123ca8f64f75 100644
--- a/drivers/infiniband/hw/qib/qib_user_pages.c
+++ b/drivers/infiniband/hw/qib/qib_user_pages.c
@@ -49,43 +49,6 @@ static void __qib_release_user_pages(struct page **p, size_t num_pages,
}
}
-/*
- * Call with current->mm->mmap_sem held.
- */
-static int __qib_get_user_pages(unsigned long start_page, size_t num_pages,
- struct page **p)
-{
- unsigned long lock_limit;
- size_t got;
- int ret;
-
- lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
-
- if (num_pages > lock_limit && !capable(CAP_IPC_LOCK)) {
- ret = -ENOMEM;
- goto bail;
- }
-
- for (got = 0; got < num_pages; got += ret) {
- ret = get_user_pages(start_page + got * PAGE_SIZE,
- num_pages - got,
- FOLL_WRITE | FOLL_FORCE,
- p + got, NULL);
- if (ret < 0)
- goto bail_release;
- }
-
- current->mm->pinned_vm += num_pages;
-
- ret = 0;
- goto bail;
-
-bail_release:
- __qib_release_user_pages(p, got, 0);
-bail:
- return ret;
-}
-
/**
* qib_map_page - a safety wrapper around pci_map_page()
*
@@ -137,26 +100,44 @@ int qib_map_page(struct pci_dev *hwdev, struct page *page, dma_addr_t *daddr)
int qib_get_user_pages(unsigned long start_page, size_t num_pages,
struct page **p)
{
+ unsigned long locked, lock_limit;
+ size_t got;
int ret;
- down_write(&current->mm->mmap_sem);
+ lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+ locked = atomic64_add_return(num_pages, &current->mm->pinned_vm);
- ret = __qib_get_user_pages(start_page, num_pages, p);
+ if (locked > lock_limit && !capable(CAP_IPC_LOCK)) {
+ ret = -ENOMEM;
+ goto bail;
+ }
- up_write(&current->mm->mmap_sem);
+ down_read(&current->mm->mmap_sem);
+ for (got = 0; got < num_pages; got += ret) {
+ ret = get_user_pages_longterm(start_page + got * PAGE_SIZE,
+ num_pages - got,
+ FOLL_WRITE | FOLL_FORCE,
+ p + got, NULL);
+ if (ret < 0) {
+ up_read(&current->mm->mmap_sem);
+ goto bail_release;
+ }
+ }
+ up_read(&current->mm->mmap_sem);
+ return 0;
+bail_release:
+ __qib_release_user_pages(p, got, 0);
+bail:
+ atomic64_sub(num_pages, &current->mm->pinned_vm);
return ret;
}
void qib_release_user_pages(struct page **p, size_t num_pages)
{
- if (current->mm) /* during close after signal, mm can be NULL */
- down_write(&current->mm->mmap_sem);
-
__qib_release_user_pages(p, num_pages, 1);
- if (current->mm) {
- current->mm->pinned_vm -= num_pages;
- up_write(&current->mm->mmap_sem);
- }
+ /* during close after signal, mm can be NULL */
+ if (current->mm)
+ atomic64_sub(num_pages, &current->mm->pinned_vm);
}
diff --git a/drivers/infiniband/hw/qib/qib_user_sdma.c b/drivers/infiniband/hw/qib/qib_user_sdma.c
index 926f3c8eba69..31c523b2a9f5 100644
--- a/drivers/infiniband/hw/qib/qib_user_sdma.c
+++ b/drivers/infiniband/hw/qib/qib_user_sdma.c
@@ -237,7 +237,6 @@ qib_user_sdma_queue_create(struct device *dev, int unit, int ctxt, int sctxt)
ret = qib_user_sdma_rb_insert(&qib_user_sdma_rb_root,
sdma_rb_node);
- BUG_ON(ret == 0);
}
pq->sdma_rb_node = sdma_rb_node;
diff --git a/drivers/infiniband/hw/qib/qib_verbs.c b/drivers/infiniband/hw/qib/qib_verbs.c
index 4b0f5761a646..5ff32d32c61c 100644
--- a/drivers/infiniband/hw/qib/qib_verbs.c
+++ b/drivers/infiniband/hw/qib/qib_verbs.c
@@ -144,13 +144,8 @@ static u32 qib_count_sge(struct rvt_sge_state *ss, u32 length)
u32 ndesc = 1; /* count the header */
while (length) {
- u32 len = sge.length;
+ u32 len = rvt_get_sge_length(&sge, length);
- if (len > length)
- len = length;
- if (len > sge.sge_length)
- len = sge.sge_length;
- BUG_ON(len == 0);
if (((long) sge.vaddr & (sizeof(u32) - 1)) ||
(len != length && (len & (sizeof(u32) - 1)))) {
ndesc = 0;
@@ -187,13 +182,8 @@ static void qib_copy_from_sge(void *data, struct rvt_sge_state *ss, u32 length)
struct rvt_sge *sge = &ss->sge;
while (length) {
- u32 len = sge->length;
+ u32 len = rvt_get_sge_length(sge, length);
- if (len > length)
- len = length;
- if (len > sge->sge_length)
- len = sge->sge_length;
- BUG_ON(len == 0);
memcpy(data, sge->vaddr, len);
sge->vaddr += len;
sge->length -= len;
@@ -442,14 +432,9 @@ static void copy_io(u32 __iomem *piobuf, struct rvt_sge_state *ss,
u32 last;
while (1) {
- u32 len = ss->sge.length;
+ u32 len = rvt_get_sge_length(&ss->sge, length);
u32 off;
- if (len > length)
- len = length;
- if (len > ss->sge.sge_length)
- len = ss->sge.sge_length;
- BUG_ON(len == 0);
/* If the source address is not aligned, try to align it. */
off = (unsigned long)ss->sge.vaddr & (sizeof(u32) - 1);
if (off) {
@@ -1365,7 +1350,7 @@ struct ib_ah *qib_create_qp0_ah(struct qib_ibport *ibp, u16 dlid)
rcu_read_lock();
qp0 = rcu_dereference(ibp->rvp.qp[0]);
if (qp0)
- ah = rdma_create_ah(qp0->ibqp.pd, &attr);
+ ah = rdma_create_ah(qp0->ibqp.pd, &attr, 0);
rcu_read_unlock();
return ah;
}
@@ -1496,6 +1481,12 @@ static void qib_fill_device_attr(struct qib_devdata *dd)
dd->verbs_dev.rdi.wc_opcode = ib_qib_wc_opcode;
}
+static const struct ib_device_ops qib_dev_ops = {
+ .init_port = qib_create_port_files,
+ .modify_device = qib_modify_device,
+ .process_mad = qib_process_mad,
+};
+
/**
* qib_register_ib_device - register our device with the infiniband core
* @dd: the device data structure
@@ -1558,8 +1549,6 @@ int qib_register_ib_device(struct qib_devdata *dd)
ibdev->node_guid = ppd->guid;
ibdev->phys_port_cnt = dd->num_pports;
ibdev->dev.parent = &dd->pcidev->dev;
- ibdev->modify_device = qib_modify_device;
- ibdev->process_mad = qib_process_mad;
snprintf(ibdev->node_desc, sizeof(ibdev->node_desc),
"Intel Infiniband HCA %s", init_utsname()->nodename);
@@ -1567,7 +1556,6 @@ int qib_register_ib_device(struct qib_devdata *dd)
/*
* Fill in rvt info object.
*/
- dd->verbs_dev.rdi.driver_f.port_callback = qib_create_port_files;
dd->verbs_dev.rdi.driver_f.get_pci_dev = qib_get_pci_dev;
dd->verbs_dev.rdi.driver_f.check_ah = qib_check_ah;
dd->verbs_dev.rdi.driver_f.setup_wqe = qib_check_send_wqe;
@@ -1627,6 +1615,7 @@ int qib_register_ib_device(struct qib_devdata *dd)
}
rdma_set_device_sysfs_group(&dd->verbs_dev.rdi.ibdev, &qib_attr_group);
+ ib_set_device_ops(ibdev, &qib_dev_ops);
ret = rvt_register_device(&dd->verbs_dev.rdi, RDMA_DRIVER_QIB);
if (ret)
goto err_tx;
diff --git a/drivers/infiniband/hw/usnic/Makefile b/drivers/infiniband/hw/usnic/Makefile
index 94ae7a1a6950..f12a4938ffd2 100644
--- a/drivers/infiniband/hw/usnic/Makefile
+++ b/drivers/infiniband/hw/usnic/Makefile
@@ -1,5 +1,5 @@
# SPDX-License-Identifier: GPL-2.0
-ccflags-y := -Idrivers/net/ethernet/cisco/enic
+ccflags-y := -I $(srctree)/drivers/net/ethernet/cisco/enic
obj-$(CONFIG_INFINIBAND_USNIC)+= usnic_verbs.o
diff --git a/drivers/infiniband/hw/usnic/usnic_debugfs.c b/drivers/infiniband/hw/usnic/usnic_debugfs.c
index a3115709fb03..e5a3f02fb078 100644
--- a/drivers/infiniband/hw/usnic/usnic_debugfs.c
+++ b/drivers/infiniband/hw/usnic/usnic_debugfs.c
@@ -113,42 +113,21 @@ static const struct file_operations flowinfo_ops = {
void usnic_debugfs_init(void)
{
debugfs_root = debugfs_create_dir(DRV_NAME, NULL);
- if (IS_ERR(debugfs_root)) {
- usnic_err("Failed to create debugfs root dir, check if debugfs is enabled in kernel configuration\n");
- goto out_clear_root;
- }
flows_dentry = debugfs_create_dir("flows", debugfs_root);
- if (IS_ERR_OR_NULL(flows_dentry)) {
- usnic_err("Failed to create debugfs flow dir with err %ld\n",
- PTR_ERR(flows_dentry));
- goto out_free_root;
- }
debugfs_create_file("build-info", S_IRUGO, debugfs_root,
NULL, &usnic_debugfs_buildinfo_ops);
- return;
-
-out_free_root:
- debugfs_remove_recursive(debugfs_root);
-out_clear_root:
- debugfs_root = NULL;
}
void usnic_debugfs_exit(void)
{
- if (!debugfs_root)
- return;
-
debugfs_remove_recursive(debugfs_root);
debugfs_root = NULL;
}
void usnic_debugfs_flow_add(struct usnic_ib_qp_grp_flow *qp_flow)
{
- if (IS_ERR_OR_NULL(flows_dentry))
- return;
-
scnprintf(qp_flow->dentry_name, sizeof(qp_flow->dentry_name),
"%u", qp_flow->flow->flow_id);
qp_flow->dbgfs_dentry = debugfs_create_file(qp_flow->dentry_name,
@@ -156,11 +135,6 @@ void usnic_debugfs_flow_add(struct usnic_ib_qp_grp_flow *qp_flow)
flows_dentry,
qp_flow,
&flowinfo_ops);
- if (IS_ERR_OR_NULL(qp_flow->dbgfs_dentry)) {
- usnic_err("Failed to create dbg fs entry for flow %u with error %ld\n",
- qp_flow->flow->flow_id,
- PTR_ERR(qp_flow->dbgfs_dentry));
- }
}
void usnic_debugfs_flow_remove(struct usnic_ib_qp_grp_flow *qp_flow)
diff --git a/drivers/infiniband/hw/usnic/usnic_ib_main.c b/drivers/infiniband/hw/usnic/usnic_ib_main.c
index 73bd00f8d2c8..d88d9f8a7f9a 100644
--- a/drivers/infiniband/hw/usnic/usnic_ib_main.c
+++ b/drivers/infiniband/hw/usnic/usnic_ib_main.c
@@ -216,18 +216,17 @@ static int usnic_ib_netdevice_event(struct notifier_block *notifier,
unsigned long event, void *ptr)
{
struct usnic_ib_dev *us_ibdev;
+ struct ib_device *ibdev;
struct net_device *netdev = netdev_notifier_info_to_dev(ptr);
- mutex_lock(&usnic_ib_ibdev_list_lock);
- list_for_each_entry(us_ibdev, &usnic_ib_ibdev_list, ib_dev_link) {
- if (us_ibdev->netdev == netdev) {
- usnic_ib_handle_usdev_event(us_ibdev, event);
- break;
- }
- }
- mutex_unlock(&usnic_ib_ibdev_list_lock);
+ ibdev = ib_device_get_by_netdev(netdev, RDMA_DRIVER_USNIC);
+ if (!ibdev)
+ return NOTIFY_DONE;
+ us_ibdev = container_of(ibdev, struct usnic_ib_dev, ib_dev);
+ usnic_ib_handle_usdev_event(us_ibdev, event);
+ ib_device_put(ibdev);
return NOTIFY_DONE;
}
@@ -282,16 +281,15 @@ static int usnic_ib_inetaddr_event(struct notifier_block *notifier,
struct usnic_ib_dev *us_ibdev;
struct in_ifaddr *ifa = ptr;
struct net_device *netdev = ifa->ifa_dev->dev;
+ struct ib_device *ibdev;
- mutex_lock(&usnic_ib_ibdev_list_lock);
- list_for_each_entry(us_ibdev, &usnic_ib_ibdev_list, ib_dev_link) {
- if (us_ibdev->netdev == netdev) {
- usnic_ib_handle_inet_event(us_ibdev, event, ptr);
- break;
- }
- }
- mutex_unlock(&usnic_ib_ibdev_list_lock);
+ ibdev = ib_device_get_by_netdev(netdev, RDMA_DRIVER_USNIC);
+ if (!ibdev)
+ return NOTIFY_DONE;
+ us_ibdev = container_of(ibdev, struct usnic_ib_dev, ib_dev);
+ usnic_ib_handle_inet_event(us_ibdev, event, ptr);
+ ib_device_put(ibdev);
return NOTIFY_DONE;
}
static struct notifier_block usnic_ib_inetaddr_notifier = {
@@ -330,6 +328,31 @@ static void usnic_get_dev_fw_str(struct ib_device *device, char *str)
snprintf(str, IB_FW_VERSION_NAME_MAX, "%s", info.fw_version);
}
+static const struct ib_device_ops usnic_dev_ops = {
+ .alloc_pd = usnic_ib_alloc_pd,
+ .alloc_ucontext = usnic_ib_alloc_ucontext,
+ .create_cq = usnic_ib_create_cq,
+ .create_qp = usnic_ib_create_qp,
+ .dealloc_pd = usnic_ib_dealloc_pd,
+ .dealloc_ucontext = usnic_ib_dealloc_ucontext,
+ .dereg_mr = usnic_ib_dereg_mr,
+ .destroy_cq = usnic_ib_destroy_cq,
+ .destroy_qp = usnic_ib_destroy_qp,
+ .get_dev_fw_str = usnic_get_dev_fw_str,
+ .get_link_layer = usnic_ib_port_link_layer,
+ .get_port_immutable = usnic_port_immutable,
+ .mmap = usnic_ib_mmap,
+ .modify_qp = usnic_ib_modify_qp,
+ .query_device = usnic_ib_query_device,
+ .query_gid = usnic_ib_query_gid,
+ .query_pkey = usnic_ib_query_pkey,
+ .query_port = usnic_ib_query_port,
+ .query_qp = usnic_ib_query_qp,
+ .reg_user_mr = usnic_ib_reg_mr,
+ INIT_RDMA_OBJ_SIZE(ib_pd, usnic_ib_pd, ibpd),
+ INIT_RDMA_OBJ_SIZE(ib_ucontext, usnic_ib_ucontext, ibucontext),
+};
+
/* Start of PF discovery section */
static void *usnic_ib_device_add(struct pci_dev *dev)
{
@@ -337,11 +360,12 @@ static void *usnic_ib_device_add(struct pci_dev *dev)
union ib_gid gid;
struct in_device *ind;
struct net_device *netdev;
+ int ret;
usnic_dbg("\n");
netdev = pci_get_drvdata(dev);
- us_ibdev = (struct usnic_ib_dev *)ib_alloc_device(sizeof(*us_ibdev));
+ us_ibdev = ib_alloc_device(usnic_ib_dev, ib_dev);
if (!us_ibdev) {
usnic_err("Device %s context alloc failed\n",
netdev_name(pci_get_drvdata(dev)));
@@ -386,40 +410,16 @@ static void *usnic_ib_device_add(struct pci_dev *dev)
(1ull << IB_USER_VERBS_CMD_DETACH_MCAST) |
(1ull << IB_USER_VERBS_CMD_OPEN_QP);
- us_ibdev->ib_dev.query_device = usnic_ib_query_device;
- us_ibdev->ib_dev.query_port = usnic_ib_query_port;
- us_ibdev->ib_dev.query_pkey = usnic_ib_query_pkey;
- us_ibdev->ib_dev.query_gid = usnic_ib_query_gid;
- us_ibdev->ib_dev.get_netdev = usnic_get_netdev;
- us_ibdev->ib_dev.get_link_layer = usnic_ib_port_link_layer;
- us_ibdev->ib_dev.alloc_pd = usnic_ib_alloc_pd;
- us_ibdev->ib_dev.dealloc_pd = usnic_ib_dealloc_pd;
- us_ibdev->ib_dev.create_qp = usnic_ib_create_qp;
- us_ibdev->ib_dev.modify_qp = usnic_ib_modify_qp;
- us_ibdev->ib_dev.query_qp = usnic_ib_query_qp;
- us_ibdev->ib_dev.destroy_qp = usnic_ib_destroy_qp;
- us_ibdev->ib_dev.create_cq = usnic_ib_create_cq;
- us_ibdev->ib_dev.destroy_cq = usnic_ib_destroy_cq;
- us_ibdev->ib_dev.reg_user_mr = usnic_ib_reg_mr;
- us_ibdev->ib_dev.dereg_mr = usnic_ib_dereg_mr;
- us_ibdev->ib_dev.alloc_ucontext = usnic_ib_alloc_ucontext;
- us_ibdev->ib_dev.dealloc_ucontext = usnic_ib_dealloc_ucontext;
- us_ibdev->ib_dev.mmap = usnic_ib_mmap;
- us_ibdev->ib_dev.create_ah = usnic_ib_create_ah;
- us_ibdev->ib_dev.destroy_ah = usnic_ib_destroy_ah;
- us_ibdev->ib_dev.post_send = usnic_ib_post_send;
- us_ibdev->ib_dev.post_recv = usnic_ib_post_recv;
- us_ibdev->ib_dev.poll_cq = usnic_ib_poll_cq;
- us_ibdev->ib_dev.req_notify_cq = usnic_ib_req_notify_cq;
- us_ibdev->ib_dev.get_dma_mr = usnic_ib_get_dma_mr;
- us_ibdev->ib_dev.get_port_immutable = usnic_port_immutable;
- us_ibdev->ib_dev.get_dev_fw_str = usnic_get_dev_fw_str;
-
+ ib_set_device_ops(&us_ibdev->ib_dev, &usnic_dev_ops);
us_ibdev->ib_dev.driver_id = RDMA_DRIVER_USNIC;
rdma_set_device_sysfs_group(&us_ibdev->ib_dev, &usnic_attr_group);
- if (ib_register_device(&us_ibdev->ib_dev, "usnic_%d", NULL))
+ ret = ib_device_set_netdev(&us_ibdev->ib_dev, us_ibdev->netdev, 1);
+ if (ret)
+ goto err_fwd_dealloc;
+
+ if (ib_register_device(&us_ibdev->ib_dev, "usnic_%d"))
goto err_fwd_dealloc;
usnic_fwd_set_mtu(us_ibdev->ufdev, us_ibdev->netdev->mtu);
@@ -474,15 +474,17 @@ static void usnic_ib_undiscover_pf(struct kref *kref)
&usnic_ib_ibdev_list, ib_dev_link) {
if (us_ibdev->pdev == dev) {
list_del(&us_ibdev->ib_dev_link);
- usnic_ib_device_remove(us_ibdev);
found = true;
break;
}
}
- WARN(!found, "Failed to remove PF %s\n", pci_name(dev));
mutex_unlock(&usnic_ib_ibdev_list_lock);
+ if (found)
+ usnic_ib_device_remove(us_ibdev);
+ else
+ WARN(1, "Failed to remove PF %s\n", pci_name(dev));
}
static struct usnic_ib_dev *usnic_ib_discover_pf(struct usnic_vnic *vnic)
@@ -649,7 +651,7 @@ static int __init usnic_ib_init(void)
err = usnic_uiom_init(DRV_NAME);
if (err) {
- usnic_err("Unable to initalize umem with err %d\n", err);
+ usnic_err("Unable to initialize umem with err %d\n", err);
return err;
}
@@ -688,7 +690,6 @@ out_unreg_netdev_notifier:
out_pci_unreg:
pci_unregister_driver(&usnic_ib_pci_driver);
out_umem_fini:
- usnic_uiom_fini();
return err;
}
@@ -701,7 +702,6 @@ static void __exit usnic_ib_destroy(void)
unregister_inetaddr_notifier(&usnic_ib_inetaddr_notifier);
unregister_netdevice_notifier(&usnic_ib_netdevice_notifier);
pci_unregister_driver(&usnic_ib_pci_driver);
- usnic_uiom_fini();
}
MODULE_DESCRIPTION("Cisco VIC (usNIC) Verbs Driver");
diff --git a/drivers/infiniband/hw/usnic/usnic_ib_qp_grp.c b/drivers/infiniband/hw/usnic/usnic_ib_qp_grp.c
index bf5136533d49..0cdb156e165e 100644
--- a/drivers/infiniband/hw/usnic/usnic_ib_qp_grp.c
+++ b/drivers/infiniband/hw/usnic/usnic_ib_qp_grp.c
@@ -681,7 +681,7 @@ usnic_ib_qp_grp_create(struct usnic_fwd_dev *ufdev, struct usnic_ib_vf *vf,
err = usnic_vnic_res_spec_satisfied(&min_transport_spec[transport],
res_spec);
if (err) {
- usnic_err("Spec does not meet miniumum req for transport %d\n",
+ usnic_err("Spec does not meet minimum req for transport %d\n",
transport);
log_spec(res_spec);
return ERR_PTR(err);
diff --git a/drivers/infiniband/hw/usnic/usnic_ib_sysfs.c b/drivers/infiniband/hw/usnic/usnic_ib_sysfs.c
index a7e4b2ccfaf8..c85d48ae7442 100644
--- a/drivers/infiniband/hw/usnic/usnic_ib_sysfs.c
+++ b/drivers/infiniband/hw/usnic/usnic_ib_sysfs.c
@@ -50,7 +50,7 @@ static ssize_t board_id_show(struct device *device,
struct device_attribute *attr, char *buf)
{
struct usnic_ib_dev *us_ibdev =
- container_of(device, struct usnic_ib_dev, ib_dev.dev);
+ rdma_device_to_drv_device(device, struct usnic_ib_dev, ib_dev);
unsigned short subsystem_device_id;
mutex_lock(&us_ibdev->usdev_lock);
@@ -67,14 +67,13 @@ static DEVICE_ATTR_RO(board_id);
static ssize_t
config_show(struct device *device, struct device_attribute *attr, char *buf)
{
- struct usnic_ib_dev *us_ibdev;
+ struct usnic_ib_dev *us_ibdev =
+ rdma_device_to_drv_device(device, struct usnic_ib_dev, ib_dev);
char *ptr;
unsigned left;
unsigned n;
enum usnic_vnic_res_type res_type;
- us_ibdev = container_of(device, struct usnic_ib_dev, ib_dev.dev);
-
/* Buffer space limit is 1 page */
ptr = buf;
left = PAGE_SIZE;
@@ -130,9 +129,8 @@ static DEVICE_ATTR_RO(config);
static ssize_t
iface_show(struct device *device, struct device_attribute *attr, char *buf)
{
- struct usnic_ib_dev *us_ibdev;
-
- us_ibdev = container_of(device, struct usnic_ib_dev, ib_dev.dev);
+ struct usnic_ib_dev *us_ibdev =
+ rdma_device_to_drv_device(device, struct usnic_ib_dev, ib_dev);
return scnprintf(buf, PAGE_SIZE, "%s\n",
netdev_name(us_ibdev->netdev));
@@ -142,9 +140,8 @@ static DEVICE_ATTR_RO(iface);
static ssize_t
max_vf_show(struct device *device, struct device_attribute *attr, char *buf)
{
- struct usnic_ib_dev *us_ibdev;
-
- us_ibdev = container_of(device, struct usnic_ib_dev, ib_dev.dev);
+ struct usnic_ib_dev *us_ibdev =
+ rdma_device_to_drv_device(device, struct usnic_ib_dev, ib_dev);
return scnprintf(buf, PAGE_SIZE, "%u\n",
kref_read(&us_ibdev->vf_cnt));
@@ -154,10 +151,10 @@ static DEVICE_ATTR_RO(max_vf);
static ssize_t
qp_per_vf_show(struct device *device, struct device_attribute *attr, char *buf)
{
- struct usnic_ib_dev *us_ibdev;
+ struct usnic_ib_dev *us_ibdev =
+ rdma_device_to_drv_device(device, struct usnic_ib_dev, ib_dev);
int qp_per_vf;
- us_ibdev = container_of(device, struct usnic_ib_dev, ib_dev.dev);
qp_per_vf = max(us_ibdev->vf_res_cnt[USNIC_VNIC_RES_TYPE_WQ],
us_ibdev->vf_res_cnt[USNIC_VNIC_RES_TYPE_RQ]);
@@ -169,9 +166,8 @@ static DEVICE_ATTR_RO(qp_per_vf);
static ssize_t
cq_per_vf_show(struct device *device, struct device_attribute *attr, char *buf)
{
- struct usnic_ib_dev *us_ibdev;
-
- us_ibdev = container_of(device, struct usnic_ib_dev, ib_dev.dev);
+ struct usnic_ib_dev *us_ibdev =
+ rdma_device_to_drv_device(device, struct usnic_ib_dev, ib_dev);
return scnprintf(buf, PAGE_SIZE, "%d\n",
us_ibdev->vf_res_cnt[USNIC_VNIC_RES_TYPE_CQ]);
diff --git a/drivers/infiniband/hw/usnic/usnic_ib_verbs.c b/drivers/infiniband/hw/usnic/usnic_ib_verbs.c
index 0b91ff36768a..bd4521b2cc5f 100644
--- a/drivers/infiniband/hw/usnic/usnic_ib_verbs.c
+++ b/drivers/infiniband/hw/usnic/usnic_ib_verbs.c
@@ -37,6 +37,7 @@
#include <rdma/ib_user_verbs.h>
#include <rdma/ib_addr.h>
+#include <rdma/uverbs_ioctl.h>
#include "usnic_abi.h"
#include "usnic_ib.h"
@@ -336,13 +337,16 @@ int usnic_ib_query_port(struct ib_device *ibdev, u8 port,
usnic_dbg("\n");
- mutex_lock(&us_ibdev->usdev_lock);
if (ib_get_eth_speed(ibdev, port, &props->active_speed,
- &props->active_width)) {
- mutex_unlock(&us_ibdev->usdev_lock);
+ &props->active_width))
return -EINVAL;
- }
+ /*
+ * usdev_lock is acquired after (and not before) ib_get_eth_speed call
+ * because acquiring rtnl_lock in ib_get_eth_speed, while holding
+ * usdev_lock could lead to a deadlock.
+ */
+ mutex_lock(&us_ibdev->usdev_lock);
/* props being zeroed by the caller, avoid zeroing it here */
props->lid = 0;
@@ -433,57 +437,33 @@ int usnic_ib_query_gid(struct ib_device *ibdev, u8 port, int index,
return 0;
}
-struct net_device *usnic_get_netdev(struct ib_device *device, u8 port_num)
-{
- struct usnic_ib_dev *us_ibdev = to_usdev(device);
-
- if (us_ibdev->netdev)
- dev_hold(us_ibdev->netdev);
-
- return us_ibdev->netdev;
-}
-
int usnic_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index,
u16 *pkey)
{
- if (index > 1)
+ if (index > 0)
return -EINVAL;
*pkey = 0xffff;
return 0;
}
-struct ib_pd *usnic_ib_alloc_pd(struct ib_device *ibdev,
- struct ib_ucontext *context,
- struct ib_udata *udata)
+int usnic_ib_alloc_pd(struct ib_pd *ibpd, struct ib_ucontext *context,
+ struct ib_udata *udata)
{
- struct usnic_ib_pd *pd;
+ struct usnic_ib_pd *pd = to_upd(ibpd);
void *umem_pd;
- usnic_dbg("\n");
-
- pd = kzalloc(sizeof(*pd), GFP_KERNEL);
- if (!pd)
- return ERR_PTR(-ENOMEM);
-
umem_pd = pd->umem_pd = usnic_uiom_alloc_pd();
if (IS_ERR_OR_NULL(umem_pd)) {
- kfree(pd);
- return ERR_PTR(umem_pd ? PTR_ERR(umem_pd) : -ENOMEM);
+ return umem_pd ? PTR_ERR(umem_pd) : -ENOMEM;
}
- usnic_info("domain 0x%p allocated for context 0x%p and device %s\n",
- pd, context, dev_name(&ibdev->dev));
- return &pd->ibpd;
+ return 0;
}
-int usnic_ib_dealloc_pd(struct ib_pd *pd)
+void usnic_ib_dealloc_pd(struct ib_pd *pd)
{
- usnic_info("freeing domain 0x%p\n", pd);
-
usnic_uiom_dealloc_pd((to_upd(pd))->umem_pd);
- kfree(pd);
- return 0;
}
struct ib_qp *usnic_ib_create_qp(struct ib_pd *pd,
@@ -493,7 +473,8 @@ struct ib_qp *usnic_ib_create_qp(struct ib_pd *pd,
int err;
struct usnic_ib_dev *us_ibdev;
struct usnic_ib_qp_grp *qp_grp;
- struct usnic_ib_ucontext *ucontext;
+ struct usnic_ib_ucontext *ucontext = rdma_udata_to_drv_context(
+ udata, struct usnic_ib_ucontext, ibucontext);
int cq_cnt;
struct usnic_vnic_res_spec res_spec;
struct usnic_ib_create_qp_cmd cmd;
@@ -501,7 +482,6 @@ struct ib_qp *usnic_ib_create_qp(struct ib_pd *pd,
usnic_dbg("\n");
- ucontext = to_uucontext(pd->uobject->context);
us_ibdev = to_usdev(pd->device);
if (init_attr->create_flags)
@@ -673,37 +653,31 @@ int usnic_ib_dereg_mr(struct ib_mr *ibmr)
return 0;
}
-struct ib_ucontext *usnic_ib_alloc_ucontext(struct ib_device *ibdev,
- struct ib_udata *udata)
+int usnic_ib_alloc_ucontext(struct ib_ucontext *uctx, struct ib_udata *udata)
{
- struct usnic_ib_ucontext *context;
+ struct ib_device *ibdev = uctx->device;
+ struct usnic_ib_ucontext *context = to_ucontext(uctx);
struct usnic_ib_dev *us_ibdev = to_usdev(ibdev);
usnic_dbg("\n");
- context = kmalloc(sizeof(*context), GFP_KERNEL);
- if (!context)
- return ERR_PTR(-ENOMEM);
-
INIT_LIST_HEAD(&context->qp_grp_list);
mutex_lock(&us_ibdev->usdev_lock);
list_add_tail(&context->link, &us_ibdev->ctx_list);
mutex_unlock(&us_ibdev->usdev_lock);
- return &context->ibucontext;
+ return 0;
}
-int usnic_ib_dealloc_ucontext(struct ib_ucontext *ibcontext)
+void usnic_ib_dealloc_ucontext(struct ib_ucontext *ibcontext)
{
struct usnic_ib_ucontext *context = to_uucontext(ibcontext);
struct usnic_ib_dev *us_ibdev = to_usdev(ibcontext->device);
usnic_dbg("\n");
mutex_lock(&us_ibdev->usdev_lock);
- BUG_ON(!list_empty(&context->qp_grp_list));
+ WARN_ON_ONCE(!list_empty(&context->qp_grp_list));
list_del(&context->link);
mutex_unlock(&us_ibdev->usdev_lock);
- kfree(context);
- return 0;
}
int usnic_ib_mmap(struct ib_ucontext *context,
@@ -757,56 +731,4 @@ int usnic_ib_mmap(struct ib_ucontext *context,
return -EINVAL;
}
-/* In ib callbacks section - Start of stub funcs */
-struct ib_ah *usnic_ib_create_ah(struct ib_pd *pd,
- struct rdma_ah_attr *ah_attr,
- struct ib_udata *udata)
-
-{
- usnic_dbg("\n");
- return ERR_PTR(-EPERM);
-}
-
-int usnic_ib_destroy_ah(struct ib_ah *ah)
-{
- usnic_dbg("\n");
- return -EINVAL;
-}
-
-int usnic_ib_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr,
- const struct ib_send_wr **bad_wr)
-{
- usnic_dbg("\n");
- return -EINVAL;
-}
-
-int usnic_ib_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr,
- const struct ib_recv_wr **bad_wr)
-{
- usnic_dbg("\n");
- return -EINVAL;
-}
-
-int usnic_ib_poll_cq(struct ib_cq *ibcq, int num_entries,
- struct ib_wc *wc)
-{
- usnic_dbg("\n");
- return -EINVAL;
-}
-
-int usnic_ib_req_notify_cq(struct ib_cq *cq,
- enum ib_cq_notify_flags flags)
-{
- usnic_dbg("\n");
- return -EINVAL;
-}
-
-struct ib_mr *usnic_ib_get_dma_mr(struct ib_pd *pd, int acc)
-{
- usnic_dbg("\n");
- return ERR_PTR(-ENOMEM);
-}
-
-
-/* In ib callbacks section - End of stub funcs */
/* End of ib callbacks section */
diff --git a/drivers/infiniband/hw/usnic/usnic_ib_verbs.h b/drivers/infiniband/hw/usnic/usnic_ib_verbs.h
index 2a2c9beb715f..c40e89b6246f 100644
--- a/drivers/infiniband/hw/usnic/usnic_ib_verbs.h
+++ b/drivers/infiniband/hw/usnic/usnic_ib_verbs.h
@@ -48,13 +48,11 @@ int usnic_ib_query_qp(struct ib_qp *qp, struct ib_qp_attr *qp_attr,
struct ib_qp_init_attr *qp_init_attr);
int usnic_ib_query_gid(struct ib_device *ibdev, u8 port, int index,
union ib_gid *gid);
-struct net_device *usnic_get_netdev(struct ib_device *device, u8 port_num);
int usnic_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index,
u16 *pkey);
-struct ib_pd *usnic_ib_alloc_pd(struct ib_device *ibdev,
- struct ib_ucontext *context,
- struct ib_udata *udata);
-int usnic_ib_dealloc_pd(struct ib_pd *pd);
+int usnic_ib_alloc_pd(struct ib_pd *ibpd, struct ib_ucontext *context,
+ struct ib_udata *udata);
+void usnic_ib_dealloc_pd(struct ib_pd *pd);
struct ib_qp *usnic_ib_create_qp(struct ib_pd *pd,
struct ib_qp_init_attr *init_attr,
struct ib_udata *udata);
@@ -70,23 +68,8 @@ struct ib_mr *usnic_ib_reg_mr(struct ib_pd *pd, u64 start, u64 length,
u64 virt_addr, int access_flags,
struct ib_udata *udata);
int usnic_ib_dereg_mr(struct ib_mr *ibmr);
-struct ib_ucontext *usnic_ib_alloc_ucontext(struct ib_device *ibdev,
- struct ib_udata *udata);
-int usnic_ib_dealloc_ucontext(struct ib_ucontext *ibcontext);
+int usnic_ib_alloc_ucontext(struct ib_ucontext *uctx, struct ib_udata *udata);
+void usnic_ib_dealloc_ucontext(struct ib_ucontext *ibcontext);
int usnic_ib_mmap(struct ib_ucontext *context,
struct vm_area_struct *vma);
-struct ib_ah *usnic_ib_create_ah(struct ib_pd *pd,
- struct rdma_ah_attr *ah_attr,
- struct ib_udata *udata);
-
-int usnic_ib_destroy_ah(struct ib_ah *ah);
-int usnic_ib_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr,
- const struct ib_send_wr **bad_wr);
-int usnic_ib_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr,
- const struct ib_recv_wr **bad_wr);
-int usnic_ib_poll_cq(struct ib_cq *ibcq, int num_entries,
- struct ib_wc *wc);
-int usnic_ib_req_notify_cq(struct ib_cq *cq,
- enum ib_cq_notify_flags flags);
-struct ib_mr *usnic_ib_get_dma_mr(struct ib_pd *pd, int acc);
#endif /* !USNIC_IB_VERBS_H */
diff --git a/drivers/infiniband/hw/usnic/usnic_uiom.c b/drivers/infiniband/hw/usnic/usnic_uiom.c
index 49275a548751..06862a6af185 100644
--- a/drivers/infiniband/hw/usnic/usnic_uiom.c
+++ b/drivers/infiniband/hw/usnic/usnic_uiom.c
@@ -47,8 +47,6 @@
#include "usnic_uiom.h"
#include "usnic_uiom_interval_tree.h"
-static struct workqueue_struct *usnic_uiom_wq;
-
#define USNIC_UIOM_PAGE_CHUNK \
((PAGE_SIZE - offsetof(struct usnic_uiom_chunk, page_list)) /\
((void *) &((struct usnic_uiom_chunk *) 0)->page_list[1] - \
@@ -127,9 +125,9 @@ static int usnic_uiom_get_pages(unsigned long addr, size_t size, int writable,
npages = PAGE_ALIGN(size + (addr & ~PAGE_MASK)) >> PAGE_SHIFT;
uiomr->owning_mm = mm = current->mm;
- down_write(&mm->mmap_sem);
+ down_read(&mm->mmap_sem);
- locked = npages + current->mm->pinned_vm;
+ locked = atomic64_add_return(npages, &current->mm->pinned_vm);
lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
if ((locked > lock_limit) && !capable(CAP_IPC_LOCK)) {
@@ -157,9 +155,8 @@ static int usnic_uiom_get_pages(unsigned long addr, size_t size, int writable,
off = 0;
while (ret) {
- chunk = kmalloc(sizeof(*chunk) +
- sizeof(struct scatterlist) *
- min_t(int, ret, USNIC_UIOM_PAGE_CHUNK),
+ chunk = kmalloc(struct_size(chunk, page_list,
+ min_t(int, ret, USNIC_UIOM_PAGE_CHUNK)),
GFP_KERNEL);
if (!chunk) {
ret = -ENOMEM;
@@ -185,14 +182,13 @@ static int usnic_uiom_get_pages(unsigned long addr, size_t size, int writable,
}
out:
- if (ret < 0)
+ if (ret < 0) {
usnic_uiom_put_pages(chunk_list, 0);
- else {
- mm->pinned_vm = locked;
+ atomic64_sub(npages, &current->mm->pinned_vm);
+ } else
mmgrab(uiomr->owning_mm);
- }
- up_write(&mm->mmap_sem);
+ up_read(&mm->mmap_sem);
free_page((unsigned long) page_list);
return ret;
}
@@ -436,43 +432,12 @@ static inline size_t usnic_uiom_num_pages(struct usnic_uiom_reg *uiomr)
return PAGE_ALIGN(uiomr->length + uiomr->offset) >> PAGE_SHIFT;
}
-static void usnic_uiom_release_defer(struct work_struct *work)
-{
- struct usnic_uiom_reg *uiomr =
- container_of(work, struct usnic_uiom_reg, work);
-
- down_write(&uiomr->owning_mm->mmap_sem);
- uiomr->owning_mm->pinned_vm -= usnic_uiom_num_pages(uiomr);
- up_write(&uiomr->owning_mm->mmap_sem);
-
- __usnic_uiom_release_tail(uiomr);
-}
-
void usnic_uiom_reg_release(struct usnic_uiom_reg *uiomr,
struct ib_ucontext *context)
{
__usnic_uiom_reg_release(uiomr->pd, uiomr, 1);
- /*
- * We may be called with the mm's mmap_sem already held. This
- * can happen when a userspace munmap() is the call that drops
- * the last reference to our file and calls our release
- * method. If there are memory regions to destroy, we'll end
- * up here and not be able to take the mmap_sem. In that case
- * we defer the vm_locked accounting to a workqueue.
- */
- if (context->closing) {
- if (!down_write_trylock(&uiomr->owning_mm->mmap_sem)) {
- INIT_WORK(&uiomr->work, usnic_uiom_release_defer);
- queue_work(usnic_uiom_wq, &uiomr->work);
- return;
- }
- } else {
- down_write(&uiomr->owning_mm->mmap_sem);
- }
- uiomr->owning_mm->pinned_vm -= usnic_uiom_num_pages(uiomr);
- up_write(&uiomr->owning_mm->mmap_sem);
-
+ atomic64_sub(usnic_uiom_num_pages(uiomr), &uiomr->owning_mm->pinned_vm);
__usnic_uiom_release_tail(uiomr);
}
@@ -601,17 +566,5 @@ int usnic_uiom_init(char *drv_name)
return -EPERM;
}
- usnic_uiom_wq = create_workqueue(drv_name);
- if (!usnic_uiom_wq) {
- usnic_err("Unable to alloc wq for drv %s\n", drv_name);
- return -ENOMEM;
- }
-
return 0;
}
-
-void usnic_uiom_fini(void)
-{
- flush_workqueue(usnic_uiom_wq);
- destroy_workqueue(usnic_uiom_wq);
-}
diff --git a/drivers/infiniband/hw/usnic/usnic_uiom.h b/drivers/infiniband/hw/usnic/usnic_uiom.h
index b86a9731071b..c88cfa087e3a 100644
--- a/drivers/infiniband/hw/usnic/usnic_uiom.h
+++ b/drivers/infiniband/hw/usnic/usnic_uiom.h
@@ -93,5 +93,4 @@ struct usnic_uiom_reg *usnic_uiom_reg_get(struct usnic_uiom_pd *pd,
void usnic_uiom_reg_release(struct usnic_uiom_reg *uiomr,
struct ib_ucontext *ucontext);
int usnic_uiom_init(char *drv_name);
-void usnic_uiom_fini(void);
#endif /* USNIC_UIOM_H_ */
diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma.h b/drivers/infiniband/hw/vmw_pvrdma/pvrdma.h
index 42b8685c997e..3c633ab58052 100644
--- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma.h
+++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma.h
@@ -427,7 +427,40 @@ static inline enum ib_qp_state pvrdma_qp_state_to_ib(enum pvrdma_qp_state state)
static inline enum pvrdma_wr_opcode ib_wr_opcode_to_pvrdma(enum ib_wr_opcode op)
{
- return (enum pvrdma_wr_opcode)op;
+ switch (op) {
+ case IB_WR_RDMA_WRITE:
+ return PVRDMA_WR_RDMA_WRITE;
+ case IB_WR_RDMA_WRITE_WITH_IMM:
+ return PVRDMA_WR_RDMA_WRITE_WITH_IMM;
+ case IB_WR_SEND:
+ return PVRDMA_WR_SEND;
+ case IB_WR_SEND_WITH_IMM:
+ return PVRDMA_WR_SEND_WITH_IMM;
+ case IB_WR_RDMA_READ:
+ return PVRDMA_WR_RDMA_READ;
+ case IB_WR_ATOMIC_CMP_AND_SWP:
+ return PVRDMA_WR_ATOMIC_CMP_AND_SWP;
+ case IB_WR_ATOMIC_FETCH_AND_ADD:
+ return PVRDMA_WR_ATOMIC_FETCH_AND_ADD;
+ case IB_WR_LSO:
+ return PVRDMA_WR_LSO;
+ case IB_WR_SEND_WITH_INV:
+ return PVRDMA_WR_SEND_WITH_INV;
+ case IB_WR_RDMA_READ_WITH_INV:
+ return PVRDMA_WR_RDMA_READ_WITH_INV;
+ case IB_WR_LOCAL_INV:
+ return PVRDMA_WR_LOCAL_INV;
+ case IB_WR_REG_MR:
+ return PVRDMA_WR_FAST_REG_MR;
+ case IB_WR_MASKED_ATOMIC_CMP_AND_SWP:
+ return PVRDMA_WR_MASKED_ATOMIC_CMP_AND_SWP;
+ case IB_WR_MASKED_ATOMIC_FETCH_AND_ADD:
+ return PVRDMA_WR_MASKED_ATOMIC_FETCH_AND_ADD;
+ case IB_WR_REG_SIG_MR:
+ return PVRDMA_WR_REG_SIG_MR;
+ default:
+ return PVRDMA_WR_ERROR;
+ }
}
static inline enum ib_wc_status pvrdma_wc_status_to_ib(
diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c
index 0f004c737620..104c7db4704f 100644
--- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c
+++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c
@@ -141,7 +141,7 @@ struct ib_cq *pvrdma_create_cq(struct ib_device *ibdev,
goto err_cq;
}
- cq->umem = ib_umem_get(context, ucmd.buf_addr, ucmd.buf_size,
+ cq->umem = ib_umem_get(udata, ucmd.buf_addr, ucmd.buf_size,
IB_ACCESS_LOCAL_WRITE, 1);
if (IS_ERR(cq->umem)) {
ret = PTR_ERR(cq->umem);
diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_dev_api.h b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_dev_api.h
index 6fd5a8f4e2f6..8f9749d54688 100644
--- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_dev_api.h
+++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_dev_api.h
@@ -57,7 +57,8 @@
#define PVRDMA_ROCEV1_VERSION 17
#define PVRDMA_ROCEV2_VERSION 18
-#define PVRDMA_VERSION PVRDMA_ROCEV2_VERSION
+#define PVRDMA_PPN64_VERSION 19
+#define PVRDMA_VERSION PVRDMA_PPN64_VERSION
#define PVRDMA_BOARD_ID 1
#define PVRDMA_REV_ID 1
@@ -279,8 +280,10 @@ struct pvrdma_device_shared_region {
/* W: Async ring page info. */
struct pvrdma_ring_page_info cq_ring_pages;
/* W: CQ ring page info. */
- u32 uar_pfn; /* W: UAR pageframe. */
- u32 pad2; /* Pad to 8-byte align. */
+ union {
+ u32 uar_pfn; /* W: UAR pageframe. */
+ u64 uar_pfn64; /* W: 64-bit UAR page frame. */
+ };
struct pvrdma_device_caps caps; /* R: Device capabilities. */
};
@@ -411,8 +414,10 @@ struct pvrdma_cmd_query_pkey_resp {
struct pvrdma_cmd_create_uc {
struct pvrdma_cmd_hdr hdr;
- u32 pfn; /* UAR page frame number */
- u8 reserved[4];
+ union {
+ u32 pfn; /* UAR page frame number */
+ u64 pfn64; /* 64-bit UAR page frame number */
+ };
};
struct pvrdma_cmd_create_uc_resp {
diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c
index 398443f43dc3..ec41400fec0c 100644
--- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c
+++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c
@@ -161,6 +161,51 @@ static struct net_device *pvrdma_get_netdev(struct ib_device *ibdev,
return netdev;
}
+static const struct ib_device_ops pvrdma_dev_ops = {
+ .add_gid = pvrdma_add_gid,
+ .alloc_mr = pvrdma_alloc_mr,
+ .alloc_pd = pvrdma_alloc_pd,
+ .alloc_ucontext = pvrdma_alloc_ucontext,
+ .create_ah = pvrdma_create_ah,
+ .create_cq = pvrdma_create_cq,
+ .create_qp = pvrdma_create_qp,
+ .dealloc_pd = pvrdma_dealloc_pd,
+ .dealloc_ucontext = pvrdma_dealloc_ucontext,
+ .del_gid = pvrdma_del_gid,
+ .dereg_mr = pvrdma_dereg_mr,
+ .destroy_ah = pvrdma_destroy_ah,
+ .destroy_cq = pvrdma_destroy_cq,
+ .destroy_qp = pvrdma_destroy_qp,
+ .get_dev_fw_str = pvrdma_get_fw_ver_str,
+ .get_dma_mr = pvrdma_get_dma_mr,
+ .get_link_layer = pvrdma_port_link_layer,
+ .get_netdev = pvrdma_get_netdev,
+ .get_port_immutable = pvrdma_port_immutable,
+ .map_mr_sg = pvrdma_map_mr_sg,
+ .mmap = pvrdma_mmap,
+ .modify_port = pvrdma_modify_port,
+ .modify_qp = pvrdma_modify_qp,
+ .poll_cq = pvrdma_poll_cq,
+ .post_recv = pvrdma_post_recv,
+ .post_send = pvrdma_post_send,
+ .query_device = pvrdma_query_device,
+ .query_gid = pvrdma_query_gid,
+ .query_pkey = pvrdma_query_pkey,
+ .query_port = pvrdma_query_port,
+ .query_qp = pvrdma_query_qp,
+ .reg_user_mr = pvrdma_reg_user_mr,
+ .req_notify_cq = pvrdma_req_notify_cq,
+ INIT_RDMA_OBJ_SIZE(ib_pd, pvrdma_pd, ibpd),
+ INIT_RDMA_OBJ_SIZE(ib_ucontext, pvrdma_ucontext, ibucontext),
+};
+
+static const struct ib_device_ops pvrdma_dev_srq_ops = {
+ .create_srq = pvrdma_create_srq,
+ .destroy_srq = pvrdma_destroy_srq,
+ .modify_srq = pvrdma_modify_srq,
+ .query_srq = pvrdma_query_srq,
+};
+
static int pvrdma_register_device(struct pvrdma_dev *dev)
{
int ret = -1;
@@ -197,39 +242,7 @@ static int pvrdma_register_device(struct pvrdma_dev *dev)
dev->ib_dev.node_type = RDMA_NODE_IB_CA;
dev->ib_dev.phys_port_cnt = dev->dsr->caps.phys_port_cnt;
- dev->ib_dev.query_device = pvrdma_query_device;
- dev->ib_dev.query_port = pvrdma_query_port;
- dev->ib_dev.query_gid = pvrdma_query_gid;
- dev->ib_dev.query_pkey = pvrdma_query_pkey;
- dev->ib_dev.modify_port = pvrdma_modify_port;
- dev->ib_dev.alloc_ucontext = pvrdma_alloc_ucontext;
- dev->ib_dev.dealloc_ucontext = pvrdma_dealloc_ucontext;
- dev->ib_dev.mmap = pvrdma_mmap;
- dev->ib_dev.alloc_pd = pvrdma_alloc_pd;
- dev->ib_dev.dealloc_pd = pvrdma_dealloc_pd;
- dev->ib_dev.create_ah = pvrdma_create_ah;
- dev->ib_dev.destroy_ah = pvrdma_destroy_ah;
- dev->ib_dev.create_qp = pvrdma_create_qp;
- dev->ib_dev.modify_qp = pvrdma_modify_qp;
- dev->ib_dev.query_qp = pvrdma_query_qp;
- dev->ib_dev.destroy_qp = pvrdma_destroy_qp;
- dev->ib_dev.post_send = pvrdma_post_send;
- dev->ib_dev.post_recv = pvrdma_post_recv;
- dev->ib_dev.create_cq = pvrdma_create_cq;
- dev->ib_dev.destroy_cq = pvrdma_destroy_cq;
- dev->ib_dev.poll_cq = pvrdma_poll_cq;
- dev->ib_dev.req_notify_cq = pvrdma_req_notify_cq;
- dev->ib_dev.get_dma_mr = pvrdma_get_dma_mr;
- dev->ib_dev.reg_user_mr = pvrdma_reg_user_mr;
- dev->ib_dev.dereg_mr = pvrdma_dereg_mr;
- dev->ib_dev.alloc_mr = pvrdma_alloc_mr;
- dev->ib_dev.map_mr_sg = pvrdma_map_mr_sg;
- dev->ib_dev.add_gid = pvrdma_add_gid;
- dev->ib_dev.del_gid = pvrdma_del_gid;
- dev->ib_dev.get_netdev = pvrdma_get_netdev;
- dev->ib_dev.get_port_immutable = pvrdma_port_immutable;
- dev->ib_dev.get_link_layer = pvrdma_port_link_layer;
- dev->ib_dev.get_dev_fw_str = pvrdma_get_fw_ver_str;
+ ib_set_device_ops(&dev->ib_dev, &pvrdma_dev_ops);
mutex_init(&dev->port_mutex);
spin_lock_init(&dev->desc_lock);
@@ -255,10 +268,7 @@ static int pvrdma_register_device(struct pvrdma_dev *dev)
(1ull << IB_USER_VERBS_CMD_DESTROY_SRQ) |
(1ull << IB_USER_VERBS_CMD_POST_SRQ_RECV);
- dev->ib_dev.create_srq = pvrdma_create_srq;
- dev->ib_dev.modify_srq = pvrdma_modify_srq;
- dev->ib_dev.query_srq = pvrdma_query_srq;
- dev->ib_dev.destroy_srq = pvrdma_destroy_srq;
+ ib_set_device_ops(&dev->ib_dev, &pvrdma_dev_srq_ops);
dev->srq_tbl = kcalloc(dev->dsr->caps.max_srq,
sizeof(struct pvrdma_srq *),
@@ -270,7 +280,7 @@ static int pvrdma_register_device(struct pvrdma_dev *dev)
spin_lock_init(&dev->srq_tbl_lock);
rdma_set_device_sysfs_group(&dev->ib_dev, &pvrdma_attr_group);
- ret = ib_register_device(&dev->ib_dev, "vmw_pvrdma%d", NULL);
+ ret = ib_register_device(&dev->ib_dev, "vmw_pvrdma%d");
if (ret)
goto err_srq_free;
@@ -787,7 +797,7 @@ static int pvrdma_pci_probe(struct pci_dev *pdev,
dev_dbg(&pdev->dev, "initializing driver %s\n", pci_name(pdev));
/* Allocate zero-out device */
- dev = (struct pvrdma_dev *)ib_alloc_device(sizeof(*dev));
+ dev = ib_alloc_device(pvrdma_dev, ib_dev);
if (!dev) {
dev_err(&pdev->dev, "failed to allocate IB device\n");
return -ENOMEM;
@@ -882,8 +892,8 @@ static int pvrdma_pci_probe(struct pci_dev *pdev,
dev_info(&pdev->dev, "device version %d, driver version %d\n",
dev->dsr_version, PVRDMA_VERSION);
- dev->dsr = dma_zalloc_coherent(&pdev->dev, sizeof(*dev->dsr),
- &dev->dsrbase, GFP_KERNEL);
+ dev->dsr = dma_alloc_coherent(&pdev->dev, sizeof(*dev->dsr),
+ &dev->dsrbase, GFP_KERNEL);
if (!dev->dsr) {
dev_err(&pdev->dev, "failed to allocate shared region\n");
ret = -ENOMEM;
@@ -897,7 +907,11 @@ static int pvrdma_pci_probe(struct pci_dev *pdev,
PVRDMA_GOS_BITS_64;
dev->dsr->gos_info.gos_type = PVRDMA_GOS_TYPE_LINUX;
dev->dsr->gos_info.gos_ver = 1;
- dev->dsr->uar_pfn = dev->driver_uar.pfn;
+
+ if (dev->dsr_version < PVRDMA_PPN64_VERSION)
+ dev->dsr->uar_pfn = dev->driver_uar.pfn;
+ else
+ dev->dsr->uar_pfn64 = dev->driver_uar.pfn;
/* Command slot. */
dev->cmd_slot = dma_alloc_coherent(&pdev->dev, PAGE_SIZE,
@@ -1117,6 +1131,8 @@ static void pvrdma_pci_remove(struct pci_dev *pdev)
pvrdma_page_dir_cleanup(dev, &dev->cq_pdir);
pvrdma_page_dir_cleanup(dev, &dev->async_pdir);
pvrdma_free_slots(dev);
+ dma_free_coherent(&pdev->dev, sizeof(*dev->dsr), dev->dsr,
+ dev->dsrbase);
iounmap(dev->regs);
kfree(dev->sgid_tbl);
diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_misc.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_misc.c
index fb0c5c0976b3..7944c58ded0e 100644
--- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_misc.c
+++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_misc.c
@@ -183,25 +183,20 @@ int pvrdma_page_dir_insert_umem(struct pvrdma_page_dir *pdir,
struct ib_umem *umem, u64 offset)
{
u64 i = offset;
- int j, entry;
- int ret = 0, len = 0;
- struct scatterlist *sg;
+ int ret = 0;
+ struct sg_dma_page_iter sg_iter;
if (offset >= pdir->npages)
return -EINVAL;
- for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) {
- len = sg_dma_len(sg) >> PAGE_SHIFT;
- for (j = 0; j < len; j++) {
- dma_addr_t addr = sg_dma_address(sg) +
- (j << umem->page_shift);
+ for_each_sg_dma_page(umem->sg_head.sgl, &sg_iter, umem->nmap, 0) {
+ dma_addr_t addr = sg_page_iter_dma_address(&sg_iter);
- ret = pvrdma_page_dir_insert_dma(pdir, i, addr);
- if (ret)
- goto exit;
+ ret = pvrdma_page_dir_insert_dma(pdir, i, addr);
+ if (ret)
+ goto exit;
- i++;
- }
+ i++;
}
exit:
diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_mr.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_mr.c
index fa96fa4fb829..a85884e90e84 100644
--- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_mr.c
+++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_mr.c
@@ -126,8 +126,7 @@ struct ib_mr *pvrdma_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
return ERR_PTR(-EINVAL);
}
- umem = ib_umem_get(pd->uobject->context, start,
- length, access_flags, 0);
+ umem = ib_umem_get(udata, start, length, access_flags, 0);
if (IS_ERR(umem)) {
dev_warn(&dev->pdev->dev,
"could not get umem for mem region\n");
diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_qp.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_qp.c
index cf22f57a9f0d..08f4257169bd 100644
--- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_qp.c
+++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_qp.c
@@ -249,7 +249,7 @@ struct ib_qp *pvrdma_create_qp(struct ib_pd *pd,
init_completion(&qp->free);
qp->state = IB_QPS_RESET;
- qp->is_kernel = !(pd->uobject && udata);
+ qp->is_kernel = !udata;
if (!qp->is_kernel) {
dev_dbg(&dev->pdev->dev,
@@ -262,8 +262,7 @@ struct ib_qp *pvrdma_create_qp(struct ib_pd *pd,
if (!is_srq) {
/* set qp->sq.wqe_cnt, shift, buf_size.. */
- qp->rumem = ib_umem_get(pd->uobject->context,
- ucmd.rbuf_addr,
+ qp->rumem = ib_umem_get(udata, ucmd.rbuf_addr,
ucmd.rbuf_size, 0, 0);
if (IS_ERR(qp->rumem)) {
ret = PTR_ERR(qp->rumem);
@@ -275,8 +274,7 @@ struct ib_qp *pvrdma_create_qp(struct ib_pd *pd,
qp->srq = to_vsrq(init_attr->srq);
}
- qp->sumem = ib_umem_get(pd->uobject->context,
- ucmd.sbuf_addr,
+ qp->sumem = ib_umem_get(udata, ucmd.sbuf_addr,
ucmd.sbuf_size, 0, 0);
if (IS_ERR(qp->sumem)) {
if (!is_srq)
@@ -721,6 +719,12 @@ int pvrdma_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr,
wr->opcode == IB_WR_RDMA_WRITE_WITH_IMM)
wqe_hdr->ex.imm_data = wr->ex.imm_data;
+ if (unlikely(wqe_hdr->opcode == PVRDMA_WR_ERROR)) {
+ *bad_wr = wr;
+ ret = -EINVAL;
+ goto out;
+ }
+
switch (qp->ibqp.qp_type) {
case IB_QPT_GSI:
case IB_QPT_UD:
diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_srq.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_srq.c
index dc0ce877c7a3..951d9d68107a 100644
--- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_srq.c
+++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_srq.c
@@ -111,7 +111,7 @@ struct ib_srq *pvrdma_create_srq(struct ib_pd *pd,
unsigned long flags;
int ret;
- if (!(pd->uobject && udata)) {
+ if (!udata) {
/* No support for kernel clients. */
dev_warn(&dev->pdev->dev,
"no shared receive queue support for kernel client\n");
@@ -153,9 +153,7 @@ struct ib_srq *pvrdma_create_srq(struct ib_pd *pd,
goto err_srq;
}
- srq->umem = ib_umem_get(pd->uobject->context,
- ucmd.buf_addr,
- ucmd.buf_size, 0, 0);
+ srq->umem = ib_umem_get(udata, ucmd.buf_addr, ucmd.buf_size, 0, 0);
if (IS_ERR(srq->umem)) {
ret = PTR_ERR(srq->umem);
goto err_srq;
diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.c
index b65d10b0a875..42fe821f8d58 100644
--- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.c
+++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.c
@@ -306,47 +306,42 @@ out:
/**
* pvrdma_alloc_ucontext - allocate ucontext
- * @ibdev: the IB device
+ * @uctx: the uverbs countext
* @udata: user data
*
- * @return: the ib_ucontext pointer on success, otherwise errno.
+ * @return: zero on success, otherwise errno.
*/
-struct ib_ucontext *pvrdma_alloc_ucontext(struct ib_device *ibdev,
- struct ib_udata *udata)
+int pvrdma_alloc_ucontext(struct ib_ucontext *uctx, struct ib_udata *udata)
{
+ struct ib_device *ibdev = uctx->device;
struct pvrdma_dev *vdev = to_vdev(ibdev);
- struct pvrdma_ucontext *context;
- union pvrdma_cmd_req req;
- union pvrdma_cmd_resp rsp;
+ struct pvrdma_ucontext *context = to_vucontext(uctx);
+ union pvrdma_cmd_req req = {};
+ union pvrdma_cmd_resp rsp = {};
struct pvrdma_cmd_create_uc *cmd = &req.create_uc;
struct pvrdma_cmd_create_uc_resp *resp = &rsp.create_uc_resp;
- struct pvrdma_alloc_ucontext_resp uresp = {0};
+ struct pvrdma_alloc_ucontext_resp uresp = {};
int ret;
- void *ptr;
if (!vdev->ib_active)
- return ERR_PTR(-EAGAIN);
-
- context = kmalloc(sizeof(*context), GFP_KERNEL);
- if (!context)
- return ERR_PTR(-ENOMEM);
+ return -EAGAIN;
context->dev = vdev;
ret = pvrdma_uar_alloc(vdev, &context->uar);
- if (ret) {
- kfree(context);
- return ERR_PTR(-ENOMEM);
- }
+ if (ret)
+ return -ENOMEM;
/* get ctx_handle from host */
- memset(cmd, 0, sizeof(*cmd));
- cmd->pfn = context->uar.pfn;
+ if (vdev->dsr_version < PVRDMA_PPN64_VERSION)
+ cmd->pfn = context->uar.pfn;
+ else
+ cmd->pfn64 = context->uar.pfn;
+
cmd->hdr.cmd = PVRDMA_CMD_CREATE_UC;
ret = pvrdma_cmd_post(vdev, &req, &rsp, PVRDMA_CMD_CREATE_UC_RESP);
if (ret < 0) {
dev_warn(&vdev->pdev->dev,
"could not create ucontext, error: %d\n", ret);
- ptr = ERR_PTR(ret);
goto err;
}
@@ -357,33 +352,28 @@ struct ib_ucontext *pvrdma_alloc_ucontext(struct ib_device *ibdev,
ret = ib_copy_to_udata(udata, &uresp, sizeof(uresp));
if (ret) {
pvrdma_uar_free(vdev, &context->uar);
- context->ibucontext.device = ibdev;
pvrdma_dealloc_ucontext(&context->ibucontext);
- return ERR_PTR(-EFAULT);
+ return -EFAULT;
}
- return &context->ibucontext;
+ return 0;
err:
pvrdma_uar_free(vdev, &context->uar);
- kfree(context);
- return ptr;
+ return ret;
}
/**
* pvrdma_dealloc_ucontext - deallocate ucontext
* @ibcontext: the ucontext
- *
- * @return: 0 on success, otherwise errno.
*/
-int pvrdma_dealloc_ucontext(struct ib_ucontext *ibcontext)
+void pvrdma_dealloc_ucontext(struct ib_ucontext *ibcontext)
{
struct pvrdma_ucontext *context = to_vucontext(ibcontext);
- union pvrdma_cmd_req req;
+ union pvrdma_cmd_req req = {};
struct pvrdma_cmd_destroy_uc *cmd = &req.destroy_uc;
int ret;
- memset(cmd, 0, sizeof(*cmd));
cmd->hdr.cmd = PVRDMA_CMD_DESTROY_UC;
cmd->ctx_handle = context->ctx_handle;
@@ -394,9 +384,6 @@ int pvrdma_dealloc_ucontext(struct ib_ucontext *ibcontext)
/* Free the UAR even if the device command failed */
pvrdma_uar_free(to_vdev(ibcontext->device), &context->uar);
- kfree(context);
-
- return ret;
}
/**
@@ -433,37 +420,29 @@ int pvrdma_mmap(struct ib_ucontext *ibcontext, struct vm_area_struct *vma)
/**
* pvrdma_alloc_pd - allocate protection domain
- * @ibdev: the IB device
+ * @ibpd: PD pointer
* @context: user context
* @udata: user data
*
* @return: the ib_pd protection domain pointer on success, otherwise errno.
*/
-struct ib_pd *pvrdma_alloc_pd(struct ib_device *ibdev,
- struct ib_ucontext *context,
- struct ib_udata *udata)
+int pvrdma_alloc_pd(struct ib_pd *ibpd, struct ib_ucontext *context,
+ struct ib_udata *udata)
{
- struct pvrdma_pd *pd;
+ struct ib_device *ibdev = ibpd->device;
+ struct pvrdma_pd *pd = to_vpd(ibpd);
struct pvrdma_dev *dev = to_vdev(ibdev);
- union pvrdma_cmd_req req;
- union pvrdma_cmd_resp rsp;
+ union pvrdma_cmd_req req = {};
+ union pvrdma_cmd_resp rsp = {};
struct pvrdma_cmd_create_pd *cmd = &req.create_pd;
struct pvrdma_cmd_create_pd_resp *resp = &rsp.create_pd_resp;
struct pvrdma_alloc_pd_resp pd_resp = {0};
int ret;
- void *ptr;
/* Check allowed max pds */
if (!atomic_add_unless(&dev->num_pds, 1, dev->dsr->caps.max_pd))
- return ERR_PTR(-ENOMEM);
-
- pd = kmalloc(sizeof(*pd), GFP_KERNEL);
- if (!pd) {
- ptr = ERR_PTR(-ENOMEM);
- goto err;
- }
+ return -ENOMEM;
- memset(cmd, 0, sizeof(*cmd));
cmd->hdr.cmd = PVRDMA_CMD_CREATE_PD;
cmd->ctx_handle = (context) ? to_vucontext(context)->ctx_handle : 0;
ret = pvrdma_cmd_post(dev, &req, &rsp, PVRDMA_CMD_CREATE_PD_RESP);
@@ -471,8 +450,7 @@ struct ib_pd *pvrdma_alloc_pd(struct ib_device *ibdev,
dev_warn(&dev->pdev->dev,
"failed to allocate protection domain, error: %d\n",
ret);
- ptr = ERR_PTR(ret);
- goto freepd;
+ goto err;
}
pd->privileged = !context;
@@ -485,18 +463,16 @@ struct ib_pd *pvrdma_alloc_pd(struct ib_device *ibdev,
dev_warn(&dev->pdev->dev,
"failed to copy back protection domain\n");
pvrdma_dealloc_pd(&pd->ibpd);
- return ERR_PTR(-EFAULT);
+ return -EFAULT;
}
}
/* u32 pd handle */
- return &pd->ibpd;
+ return 0;
-freepd:
- kfree(pd);
err:
atomic_dec(&dev->num_pds);
- return ptr;
+ return ret;
}
/**
@@ -505,14 +481,13 @@ err:
*
* @return: 0 on success, otherwise errno.
*/
-int pvrdma_dealloc_pd(struct ib_pd *pd)
+void pvrdma_dealloc_pd(struct ib_pd *pd)
{
struct pvrdma_dev *dev = to_vdev(pd->device);
- union pvrdma_cmd_req req;
+ union pvrdma_cmd_req req = {};
struct pvrdma_cmd_destroy_pd *cmd = &req.destroy_pd;
int ret;
- memset(cmd, 0, sizeof(*cmd));
cmd->hdr.cmd = PVRDMA_CMD_DESTROY_PD;
cmd->pd_handle = to_vpd(pd)->pd_handle;
@@ -522,10 +497,7 @@ int pvrdma_dealloc_pd(struct ib_pd *pd)
"could not dealloc protection domain, error: %d\n",
ret);
- kfree(to_vpd(pd));
atomic_dec(&dev->num_pds);
-
- return 0;
}
/**
@@ -533,11 +505,12 @@ int pvrdma_dealloc_pd(struct ib_pd *pd)
* @pd: the protection domain
* @ah_attr: the attributes of the AH
* @udata: user data blob
+ * @flags: create address handle flags (see enum rdma_create_ah_flags)
*
* @return: the ib_ah pointer on success, otherwise errno.
*/
struct ib_ah *pvrdma_create_ah(struct ib_pd *pd, struct rdma_ah_attr *ah_attr,
- struct ib_udata *udata)
+ u32 flags, struct ib_udata *udata)
{
struct pvrdma_dev *dev = to_vdev(pd->device);
struct pvrdma_ah *ah;
@@ -555,7 +528,7 @@ struct ib_ah *pvrdma_create_ah(struct ib_pd *pd, struct rdma_ah_attr *ah_attr,
if (!atomic_add_unless(&dev->num_ahs, 1, dev->dsr->caps.max_ah))
return ERR_PTR(-ENOMEM);
- ah = kzalloc(sizeof(*ah), GFP_KERNEL);
+ ah = kzalloc(sizeof(*ah), GFP_ATOMIC);
if (!ah) {
atomic_dec(&dev->num_ahs);
return ERR_PTR(-ENOMEM);
@@ -581,10 +554,11 @@ struct ib_ah *pvrdma_create_ah(struct ib_pd *pd, struct rdma_ah_attr *ah_attr,
/**
* pvrdma_destroy_ah - destroy an address handle
* @ah: the address handle to destroyed
+ * @flags: destroy address handle flags (see enum rdma_destroy_ah_flags)
*
* @return: 0 on success.
*/
-int pvrdma_destroy_ah(struct ib_ah *ah)
+int pvrdma_destroy_ah(struct ib_ah *ah, u32 flags)
{
struct pvrdma_dev *dev = to_vdev(ah->device);
diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.h b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.h
index b2e3ab50cb08..607aa131d67c 100644
--- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.h
+++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.h
@@ -396,13 +396,11 @@ int pvrdma_modify_device(struct ib_device *ibdev, int mask,
int pvrdma_modify_port(struct ib_device *ibdev, u8 port,
int mask, struct ib_port_modify *props);
int pvrdma_mmap(struct ib_ucontext *context, struct vm_area_struct *vma);
-struct ib_ucontext *pvrdma_alloc_ucontext(struct ib_device *ibdev,
- struct ib_udata *udata);
-int pvrdma_dealloc_ucontext(struct ib_ucontext *context);
-struct ib_pd *pvrdma_alloc_pd(struct ib_device *ibdev,
- struct ib_ucontext *context,
- struct ib_udata *udata);
-int pvrdma_dealloc_pd(struct ib_pd *ibpd);
+int pvrdma_alloc_ucontext(struct ib_ucontext *uctx, struct ib_udata *udata);
+void pvrdma_dealloc_ucontext(struct ib_ucontext *context);
+int pvrdma_alloc_pd(struct ib_pd *pd, struct ib_ucontext *context,
+ struct ib_udata *udata);
+void pvrdma_dealloc_pd(struct ib_pd *ibpd);
struct ib_mr *pvrdma_get_dma_mr(struct ib_pd *pd, int acc);
struct ib_mr *pvrdma_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
u64 virt_addr, int access_flags,
@@ -420,8 +418,8 @@ int pvrdma_destroy_cq(struct ib_cq *cq);
int pvrdma_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc);
int pvrdma_req_notify_cq(struct ib_cq *cq, enum ib_cq_notify_flags flags);
struct ib_ah *pvrdma_create_ah(struct ib_pd *pd, struct rdma_ah_attr *ah_attr,
- struct ib_udata *udata);
-int pvrdma_destroy_ah(struct ib_ah *ah);
+ u32 flags, struct ib_udata *udata);
+int pvrdma_destroy_ah(struct ib_ah *ah, u32 flags);
struct ib_srq *pvrdma_create_srq(struct ib_pd *pd,
struct ib_srq_init_attr *init_attr,
diff --git a/drivers/infiniband/sw/rdmavt/ah.c b/drivers/infiniband/sw/rdmavt/ah.c
index 084bb4baebb5..fc10e4e26ca7 100644
--- a/drivers/infiniband/sw/rdmavt/ah.c
+++ b/drivers/infiniband/sw/rdmavt/ah.c
@@ -91,6 +91,7 @@ EXPORT_SYMBOL(rvt_check_ah);
* rvt_create_ah - create an address handle
* @pd: the protection domain
* @ah_attr: the attributes of the AH
+ * @create_flags: create address handle flags (see enum rdma_create_ah_flags)
* @udata: pointer to user's input output buffer information.
*
* This may be called from interrupt context.
@@ -99,6 +100,7 @@ EXPORT_SYMBOL(rvt_check_ah);
*/
struct ib_ah *rvt_create_ah(struct ib_pd *pd,
struct rdma_ah_attr *ah_attr,
+ u32 create_flags,
struct ib_udata *udata)
{
struct rvt_ah *ah;
@@ -135,10 +137,11 @@ struct ib_ah *rvt_create_ah(struct ib_pd *pd,
/**
* rvt_destory_ah - Destory an address handle
* @ibah: address handle
+ * @destroy_flags: destroy address handle flags (see enum rdma_destroy_ah_flags)
*
* Return: 0 on success
*/
-int rvt_destroy_ah(struct ib_ah *ibah)
+int rvt_destroy_ah(struct ib_ah *ibah, u32 destroy_flags)
{
struct rvt_dev_info *dev = ib_to_rvt(ibah->device);
struct rvt_ah *ah = ibah_to_rvtah(ibah);
diff --git a/drivers/infiniband/sw/rdmavt/ah.h b/drivers/infiniband/sw/rdmavt/ah.h
index 25271b48a683..72431a618d5d 100644
--- a/drivers/infiniband/sw/rdmavt/ah.h
+++ b/drivers/infiniband/sw/rdmavt/ah.h
@@ -52,8 +52,9 @@
struct ib_ah *rvt_create_ah(struct ib_pd *pd,
struct rdma_ah_attr *ah_attr,
+ u32 create_flags,
struct ib_udata *udata);
-int rvt_destroy_ah(struct ib_ah *ibah);
+int rvt_destroy_ah(struct ib_ah *ibah, u32 destroy_flags);
int rvt_modify_ah(struct ib_ah *ibah, struct rdma_ah_attr *ah_attr);
int rvt_query_ah(struct ib_ah *ibah, struct rdma_ah_attr *ah_attr);
diff --git a/drivers/infiniband/sw/rdmavt/mad.c b/drivers/infiniband/sw/rdmavt/mad.c
index d6981dc04adb..108c71e3ac23 100644
--- a/drivers/infiniband/sw/rdmavt/mad.c
+++ b/drivers/infiniband/sw/rdmavt/mad.c
@@ -160,7 +160,8 @@ void rvt_free_mad_agents(struct rvt_dev_info *rdi)
ib_unregister_mad_agent(agent);
}
if (rvp->sm_ah) {
- rdma_destroy_ah(&rvp->sm_ah->ibah);
+ rdma_destroy_ah(&rvp->sm_ah->ibah,
+ RDMA_DESTROY_AH_SLEEPABLE);
rvp->sm_ah = NULL;
}
diff --git a/drivers/infiniband/sw/rdmavt/mr.c b/drivers/infiniband/sw/rdmavt/mr.c
index 49c9541050d4..0bb6e39dd03a 100644
--- a/drivers/infiniband/sw/rdmavt/mr.c
+++ b/drivers/infiniband/sw/rdmavt/mr.c
@@ -381,15 +381,14 @@ struct ib_mr *rvt_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
{
struct rvt_mr *mr;
struct ib_umem *umem;
- struct scatterlist *sg;
- int n, m, entry;
+ struct sg_page_iter sg_iter;
+ int n, m;
struct ib_mr *ret;
if (length == 0)
return ERR_PTR(-EINVAL);
- umem = ib_umem_get(pd->uobject->context, start, length,
- mr_access_flags, 0);
+ umem = ib_umem_get(udata, start, length, mr_access_flags, 0);
if (IS_ERR(umem))
return (void *)umem;
@@ -408,23 +407,21 @@ struct ib_mr *rvt_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
mr->mr.access_flags = mr_access_flags;
mr->umem = umem;
- mr->mr.page_shift = umem->page_shift;
+ mr->mr.page_shift = PAGE_SHIFT;
m = 0;
n = 0;
- for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) {
+ for_each_sg_page (umem->sg_head.sgl, &sg_iter, umem->nmap, 0) {
void *vaddr;
- vaddr = page_address(sg_page(sg));
+ vaddr = page_address(sg_page_iter_page(&sg_iter));
if (!vaddr) {
ret = ERR_PTR(-EINVAL);
goto bail_inval;
}
mr->mr.map[m]->segs[n].vaddr = vaddr;
- mr->mr.map[m]->segs[n].length = BIT(umem->page_shift);
- trace_rvt_mr_user_seg(&mr->mr, m, n, vaddr,
- BIT(umem->page_shift));
- n++;
- if (n == RVT_SEGSZ) {
+ mr->mr.map[m]->segs[n].length = PAGE_SIZE;
+ trace_rvt_mr_user_seg(&mr->mr, m, n, vaddr, PAGE_SIZE);
+ if (++n == RVT_SEGSZ) {
m++;
n = 0;
}
@@ -611,11 +608,6 @@ static int rvt_set_page(struct ib_mr *ibmr, u64 addr)
if (unlikely(mapped_segs == mr->mr.max_segs))
return -ENOMEM;
- if (mr->mr.length == 0) {
- mr->mr.user_base = addr;
- mr->mr.iova = addr;
- }
-
m = mapped_segs / RVT_SEGSZ;
n = mapped_segs % RVT_SEGSZ;
mr->mr.map[m]->segs[n].vaddr = (void *)addr;
@@ -633,17 +625,24 @@ static int rvt_set_page(struct ib_mr *ibmr, u64 addr)
* @sg_nents: number of entries in sg
* @sg_offset: offset in bytes into sg
*
+ * Overwrite rvt_mr length with mr length calculated by ib_sg_to_pages.
+ *
* Return: number of sg elements mapped to the memory region
*/
int rvt_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg,
int sg_nents, unsigned int *sg_offset)
{
struct rvt_mr *mr = to_imr(ibmr);
+ int ret;
mr->mr.length = 0;
mr->mr.page_shift = PAGE_SHIFT;
- return ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset,
- rvt_set_page);
+ ret = ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset, rvt_set_page);
+ mr->mr.user_base = ibmr->iova;
+ mr->mr.iova = ibmr->iova;
+ mr->mr.offset = ibmr->iova - (u64)mr->mr.map[0]->segs[0].vaddr;
+ mr->mr.length = (size_t)ibmr->length;
+ return ret;
}
/**
@@ -674,6 +673,7 @@ int rvt_fast_reg_mr(struct rvt_qp *qp, struct ib_mr *ibmr, u32 key,
ibmr->rkey = key;
mr->mr.lkey = key;
mr->mr.access_flags = access;
+ mr->mr.iova = ibmr->iova;
atomic_set(&mr->mr.lkey_invalid, 0);
return 0;
diff --git a/drivers/infiniband/sw/rdmavt/pd.c b/drivers/infiniband/sw/rdmavt/pd.c
index 8a89afff3363..6033054b22fa 100644
--- a/drivers/infiniband/sw/rdmavt/pd.c
+++ b/drivers/infiniband/sw/rdmavt/pd.c
@@ -50,7 +50,7 @@
/**
* rvt_alloc_pd - allocate a protection domain
- * @ibdev: ib device
+ * @ibpd: PD
* @context: optional user context
* @udata: optional user data
*
@@ -58,19 +58,14 @@
*
* Return: 0 on success
*/
-struct ib_pd *rvt_alloc_pd(struct ib_device *ibdev,
- struct ib_ucontext *context,
- struct ib_udata *udata)
+int rvt_alloc_pd(struct ib_pd *ibpd, struct ib_ucontext *context,
+ struct ib_udata *udata)
{
+ struct ib_device *ibdev = ibpd->device;
struct rvt_dev_info *dev = ib_to_rvt(ibdev);
- struct rvt_pd *pd;
- struct ib_pd *ret;
+ struct rvt_pd *pd = ibpd_to_rvtpd(ibpd);
+ int ret = 0;
- pd = kmalloc(sizeof(*pd), GFP_KERNEL);
- if (!pd) {
- ret = ERR_PTR(-ENOMEM);
- goto bail;
- }
/*
* While we could continue allocating protecetion domains, being
* constrained only by system resources. The IBTA spec defines that
@@ -81,8 +76,7 @@ struct ib_pd *rvt_alloc_pd(struct ib_device *ibdev,
spin_lock(&dev->n_pds_lock);
if (dev->n_pds_allocated == dev->dparms.props.max_pd) {
spin_unlock(&dev->n_pds_lock);
- kfree(pd);
- ret = ERR_PTR(-ENOMEM);
+ ret = -ENOMEM;
goto bail;
}
@@ -92,8 +86,6 @@ struct ib_pd *rvt_alloc_pd(struct ib_device *ibdev,
/* ib_alloc_pd() will initialize pd->ibpd. */
pd->user = !!udata;
- ret = &pd->ibpd;
-
bail:
return ret;
}
@@ -104,16 +96,11 @@ bail:
*
* Return: always 0
*/
-int rvt_dealloc_pd(struct ib_pd *ibpd)
+void rvt_dealloc_pd(struct ib_pd *ibpd)
{
- struct rvt_pd *pd = ibpd_to_rvtpd(ibpd);
struct rvt_dev_info *dev = ib_to_rvt(ibpd->device);
spin_lock(&dev->n_pds_lock);
dev->n_pds_allocated--;
spin_unlock(&dev->n_pds_lock);
-
- kfree(pd);
-
- return 0;
}
diff --git a/drivers/infiniband/sw/rdmavt/pd.h b/drivers/infiniband/sw/rdmavt/pd.h
index 1892ca4a9746..7a887e4a45e7 100644
--- a/drivers/infiniband/sw/rdmavt/pd.h
+++ b/drivers/infiniband/sw/rdmavt/pd.h
@@ -50,9 +50,8 @@
#include <rdma/rdma_vt.h>
-struct ib_pd *rvt_alloc_pd(struct ib_device *ibdev,
- struct ib_ucontext *context,
- struct ib_udata *udata);
-int rvt_dealloc_pd(struct ib_pd *ibpd);
+int rvt_alloc_pd(struct ib_pd *pd, struct ib_ucontext *context,
+ struct ib_udata *udata);
+void rvt_dealloc_pd(struct ib_pd *ibpd);
#endif /* DEF_RDMAVTPD_H */
diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c
index 1735deb1a9d4..a34b9a2a32b6 100644
--- a/drivers/infiniband/sw/rdmavt/qp.c
+++ b/drivers/infiniband/sw/rdmavt/qp.c
@@ -1,5 +1,5 @@
/*
- * Copyright(c) 2016, 2017 Intel Corporation.
+ * Copyright(c) 2016 - 2018 Intel Corporation.
*
* This file is provided under a dual BSD/GPLv2 license. When using or
* redistributing this file, you may do so under either license.
@@ -53,6 +53,7 @@
#include <rdma/ib_verbs.h>
#include <rdma/ib_hdrs.h>
#include <rdma/opa_addr.h>
+#include <rdma/uverbs_ioctl.h>
#include "qp.h"
#include "vt.h"
#include "trace.h"
@@ -854,6 +855,7 @@ static void rvt_init_qp(struct rvt_dev_info *rdi, struct rvt_qp *qp,
qp->s_mig_state = IB_MIG_MIGRATED;
qp->r_head_ack_queue = 0;
qp->s_tail_ack_queue = 0;
+ qp->s_acked_ack_queue = 0;
qp->s_num_rd_atomic = 0;
if (qp->r_rq.wq) {
qp->r_rq.wq->head = 0;
@@ -955,6 +957,8 @@ struct ib_qp *rvt_create_qp(struct ib_pd *ibpd,
size_t sg_list_sz;
struct ib_qp *ret = ERR_PTR(-ENOMEM);
struct rvt_dev_info *rdi = ib_to_rvt(ibpd->device);
+ struct rvt_ucontext *ucontext = rdma_udata_to_drv_context(
+ udata, struct rvt_ucontext, ibucontext);
void *priv = NULL;
size_t sqsize;
@@ -1094,6 +1098,13 @@ struct ib_qp *rvt_create_qp(struct ib_pd *ibpd,
qp->ibqp.qp_num = err;
qp->port_num = init_attr->port_num;
rvt_init_qp(rdi, qp, init_attr->qp_type);
+ if (rdi->driver_f.qp_priv_init) {
+ err = rdi->driver_f.qp_priv_init(rdi, qp, init_attr);
+ if (err) {
+ ret = ERR_PTR(err);
+ goto bail_rq_wq;
+ }
+ }
break;
default:
@@ -1121,7 +1132,7 @@ struct ib_qp *rvt_create_qp(struct ib_pd *ibpd,
u32 s = sizeof(struct rvt_rwq) + qp->r_rq.size * sz;
qp->ip = rvt_create_mmap_info(rdi, s,
- ibpd->uobject->context,
+ &ucontext->ibucontext,
qp->r_rq.wq);
if (!qp->ip) {
ret = ERR_PTR(-ENOMEM);
@@ -1635,11 +1646,11 @@ int rvt_destroy_qp(struct ib_qp *ibqp)
kref_put(&qp->ip->ref, rvt_release_mmap_info);
else
vfree(qp->r_rq.wq);
- vfree(qp->s_wq);
rdi->driver_f.qp_priv_free(rdi, qp);
kfree(qp->s_ack_queue);
rdma_destroy_ah_attr(&qp->remote_ah_attr);
rdma_destroy_ah_attr(&qp->alt_ah_attr);
+ vfree(qp->s_wq);
kfree(qp);
return 0;
}
@@ -2386,11 +2397,12 @@ static inline unsigned long rvt_aeth_to_usec(u32 aeth)
}
/*
- * rvt_add_retry_timer - add/start a retry timer
+ * rvt_add_retry_timer_ext - add/start a retry timer
* @qp - the QP
+ * @shift - timeout shift to wait for multiple packets
* add a retry timer on the QP
*/
-void rvt_add_retry_timer(struct rvt_qp *qp)
+void rvt_add_retry_timer_ext(struct rvt_qp *qp, u8 shift)
{
struct ib_qp *ibqp = &qp->ibqp;
struct rvt_dev_info *rdi = ib_to_rvt(ibqp->device);
@@ -2398,11 +2410,11 @@ void rvt_add_retry_timer(struct rvt_qp *qp)
lockdep_assert_held(&qp->s_lock);
qp->s_flags |= RVT_S_TIMER;
/* 4.096 usec. * (1 << qp->timeout) */
- qp->s_timer.expires = jiffies + qp->timeout_jiffies +
- rdi->busy_jiffies;
+ qp->s_timer.expires = jiffies + rdi->busy_jiffies +
+ (qp->timeout_jiffies << shift);
add_timer(&qp->s_timer);
}
-EXPORT_SYMBOL(rvt_add_retry_timer);
+EXPORT_SYMBOL(rvt_add_retry_timer_ext);
/**
* rvt_add_rnr_timer - add/start an rnr timer
@@ -2778,6 +2790,18 @@ again:
}
EXPORT_SYMBOL(rvt_copy_sge);
+static enum ib_wc_status loopback_qp_drop(struct rvt_ibport *rvp,
+ struct rvt_qp *sqp)
+{
+ rvp->n_pkt_drops++;
+ /*
+ * For RC, the requester would timeout and retry so
+ * shortcut the timeouts and just signal too many retries.
+ */
+ return sqp->ibqp.qp_type == IB_QPT_RC ?
+ IB_WC_RETRY_EXC_ERR : IB_WC_SUCCESS;
+}
+
/**
* ruc_loopback - handle UC and RC loopback requests
* @sqp: the sending QP
@@ -2850,17 +2874,14 @@ again:
}
spin_unlock_irqrestore(&sqp->s_lock, flags);
- if (!qp || !(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK) ||
+ if (!qp) {
+ send_status = loopback_qp_drop(rvp, sqp);
+ goto serr_no_r_lock;
+ }
+ spin_lock_irqsave(&qp->r_lock, flags);
+ if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK) ||
qp->ibqp.qp_type != sqp->ibqp.qp_type) {
- rvp->n_pkt_drops++;
- /*
- * For RC, the requester would timeout and retry so
- * shortcut the timeouts and just signal too many retries.
- */
- if (sqp->ibqp.qp_type == IB_QPT_RC)
- send_status = IB_WC_RETRY_EXC_ERR;
- else
- send_status = IB_WC_SUCCESS;
+ send_status = loopback_qp_drop(rvp, sqp);
goto serr;
}
@@ -2886,23 +2907,31 @@ again:
goto send_comp;
case IB_WR_SEND_WITH_INV:
- if (!rvt_invalidate_rkey(qp, wqe->wr.ex.invalidate_rkey)) {
- wc.wc_flags = IB_WC_WITH_INVALIDATE;
- wc.ex.invalidate_rkey = wqe->wr.ex.invalidate_rkey;
- }
- goto send;
-
case IB_WR_SEND_WITH_IMM:
- wc.wc_flags = IB_WC_WITH_IMM;
- wc.ex.imm_data = wqe->wr.ex.imm_data;
- /* FALLTHROUGH */
case IB_WR_SEND:
-send:
ret = rvt_get_rwqe(qp, false);
if (ret < 0)
goto op_err;
if (!ret)
goto rnr_nak;
+ if (wqe->length > qp->r_len)
+ goto inv_err;
+ switch (wqe->wr.opcode) {
+ case IB_WR_SEND_WITH_INV:
+ if (!rvt_invalidate_rkey(qp,
+ wqe->wr.ex.invalidate_rkey)) {
+ wc.wc_flags = IB_WC_WITH_INVALIDATE;
+ wc.ex.invalidate_rkey =
+ wqe->wr.ex.invalidate_rkey;
+ }
+ break;
+ case IB_WR_SEND_WITH_IMM:
+ wc.wc_flags = IB_WC_WITH_IMM;
+ wc.ex.imm_data = wqe->wr.ex.imm_data;
+ break;
+ default:
+ break;
+ }
break;
case IB_WR_RDMA_WRITE_WITH_IMM:
@@ -2979,34 +3008,12 @@ do_write:
sge = &sqp->s_sge.sge;
while (sqp->s_len) {
- u32 len = sqp->s_len;
+ u32 len = rvt_get_sge_length(sge, sqp->s_len);
- if (len > sge->length)
- len = sge->length;
- if (len > sge->sge_length)
- len = sge->sge_length;
WARN_ON_ONCE(len == 0);
rvt_copy_sge(qp, &qp->r_sge, sge->vaddr,
len, release, copy_last);
- sge->vaddr += len;
- sge->length -= len;
- sge->sge_length -= len;
- if (sge->sge_length == 0) {
- if (!release)
- rvt_put_mr(sge->mr);
- if (--sqp->s_sge.num_sge)
- *sge = *sqp->s_sge.sg_list++;
- } else if (sge->length == 0 && sge->mr->lkey) {
- if (++sge->n >= RVT_SEGSZ) {
- if (++sge->m >= sge->mr->mapsz)
- break;
- sge->n = 0;
- }
- sge->vaddr =
- sge->mr->map[sge->m]->segs[sge->n].vaddr;
- sge->length =
- sge->mr->map[sge->m]->segs[sge->n].length;
- }
+ rvt_update_sge(&sqp->s_sge, len, !release);
sqp->s_len -= len;
}
if (release)
@@ -3032,6 +3039,7 @@ do_write:
wqe->wr.send_flags & IB_SEND_SOLICITED);
send_comp:
+ spin_unlock_irqrestore(&qp->r_lock, flags);
spin_lock_irqsave(&sqp->s_lock, flags);
rvp->n_loop_pkts++;
flush_send:
@@ -3058,6 +3066,7 @@ rnr_nak:
}
if (sqp->s_rnr_retry_cnt < 7)
sqp->s_rnr_retry--;
+ spin_unlock_irqrestore(&qp->r_lock, flags);
spin_lock_irqsave(&sqp->s_lock, flags);
if (!(ib_rvt_state_ops[sqp->state] & RVT_PROCESS_RECV_OK))
goto clr_busy;
@@ -3071,7 +3080,10 @@ op_err:
goto err;
inv_err:
- send_status = IB_WC_REM_INV_REQ_ERR;
+ send_status =
+ sqp->ibqp.qp_type == IB_QPT_RC ?
+ IB_WC_REM_INV_REQ_ERR :
+ IB_WC_SUCCESS;
wc.status = IB_WC_LOC_QP_OP_ERR;
goto err;
@@ -3083,6 +3095,8 @@ err:
rvt_rc_error(qp, wc.status);
serr:
+ spin_unlock_irqrestore(&qp->r_lock, flags);
+serr_no_r_lock:
spin_lock_irqsave(&sqp->s_lock, flags);
rvt_send_complete(sqp, wqe, send_status);
if (sqp->ibqp.qp_type == IB_QPT_RC) {
diff --git a/drivers/infiniband/sw/rdmavt/rc.c b/drivers/infiniband/sw/rdmavt/rc.c
index 6131cc558bdb..8d71647820a8 100644
--- a/drivers/infiniband/sw/rdmavt/rc.c
+++ b/drivers/infiniband/sw/rdmavt/rc.c
@@ -187,3 +187,16 @@ void rvt_get_credit(struct rvt_qp *qp, u32 aeth)
}
}
EXPORT_SYMBOL(rvt_get_credit);
+
+/* rvt_restart_sge - rewind the sge state for a wqe */
+u32 rvt_restart_sge(struct rvt_sge_state *ss, struct rvt_swqe *wqe, u32 len)
+{
+ ss->sge = wqe->sg_list[0];
+ ss->sg_list = wqe->sg_list + 1;
+ ss->num_sge = wqe->wr.num_sge;
+ ss->total_len = wqe->length;
+ rvt_skip_sge(ss, len, false);
+ return wqe->length - len;
+}
+EXPORT_SYMBOL(rvt_restart_sge);
+
diff --git a/drivers/infiniband/sw/rdmavt/srq.c b/drivers/infiniband/sw/rdmavt/srq.c
index 78e06fc456c5..895b3fabd0bf 100644
--- a/drivers/infiniband/sw/rdmavt/srq.c
+++ b/drivers/infiniband/sw/rdmavt/srq.c
@@ -48,6 +48,7 @@
#include <linux/err.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
+#include <rdma/uverbs_ioctl.h>
#include "srq.h"
#include "vt.h"
@@ -77,6 +78,8 @@ struct ib_srq *rvt_create_srq(struct ib_pd *ibpd,
struct ib_udata *udata)
{
struct rvt_dev_info *dev = ib_to_rvt(ibpd->device);
+ struct rvt_ucontext *ucontext = rdma_udata_to_drv_context(
+ udata, struct rvt_ucontext, ibucontext);
struct rvt_srq *srq;
u32 sz;
struct ib_srq *ret;
@@ -119,7 +122,7 @@ struct ib_srq *rvt_create_srq(struct ib_pd *ibpd,
u32 s = sizeof(struct rvt_rwq) + srq->rq.size * sz;
srq->ip =
- rvt_create_mmap_info(dev, s, ibpd->uobject->context,
+ rvt_create_mmap_info(dev, s, &ucontext->ibucontext,
srq->rq.wq);
if (!srq->ip) {
ret = ERR_PTR(-ENOMEM);
diff --git a/drivers/infiniband/sw/rdmavt/trace_cq.h b/drivers/infiniband/sw/rdmavt/trace_cq.h
index df8e1adbef9d..e3c416c6f900 100644
--- a/drivers/infiniband/sw/rdmavt/trace_cq.h
+++ b/drivers/infiniband/sw/rdmavt/trace_cq.h
@@ -105,7 +105,7 @@ DEFINE_EVENT(rvt_cq_template, rvt_create_cq,
TP_ARGS(cq, attr));
#define CQ_PRN \
-"[%s] idx %u wr_id %llx status %u opcode %u,%s length %u qpn %x"
+"[%s] idx %u wr_id %llx status %u opcode %u,%s length %u qpn %x flags %x imm %x"
DECLARE_EVENT_CLASS(
rvt_cq_entry_template,
@@ -119,6 +119,8 @@ DECLARE_EVENT_CLASS(
__field(u32, qpn)
__field(u32, length)
__field(u32, idx)
+ __field(u32, flags)
+ __field(u32, imm)
),
TP_fast_assign(
RDI_DEV_ASSIGN(cq->rdi)
@@ -128,6 +130,8 @@ DECLARE_EVENT_CLASS(
__entry->length = wc->byte_len;
__entry->qpn = wc->qp->qp_num;
__entry->idx = idx;
+ __entry->flags = wc->wc_flags;
+ __entry->imm = be32_to_cpu(wc->ex.imm_data);
),
TP_printk(
CQ_PRN,
@@ -137,7 +141,9 @@ DECLARE_EVENT_CLASS(
__entry->status,
__entry->opcode, show_wc_opcode(__entry->opcode),
__entry->length,
- __entry->qpn
+ __entry->qpn,
+ __entry->flags,
+ __entry->imm
)
);
diff --git a/drivers/infiniband/sw/rdmavt/vt.c b/drivers/infiniband/sw/rdmavt/vt.c
index 723d3daf2eba..42c9d35f832d 100644
--- a/drivers/infiniband/sw/rdmavt/vt.c
+++ b/drivers/infiniband/sw/rdmavt/vt.c
@@ -91,7 +91,7 @@ struct rvt_dev_info *rvt_alloc_device(size_t size, int nports)
{
struct rvt_dev_info *rdi;
- rdi = (struct rvt_dev_info *)ib_alloc_device(size);
+ rdi = container_of(_ib_alloc_device(size), struct rvt_dev_info, ibdev);
if (!rdi)
return rdi;
@@ -284,10 +284,6 @@ static int rvt_query_gid(struct ib_device *ibdev, u8 port_num,
&gid->global.interface_id);
}
-struct rvt_ucontext {
- struct ib_ucontext ibucontext;
-};
-
static inline struct rvt_ucontext *to_iucontext(struct ib_ucontext
*ibucontext)
{
@@ -296,28 +292,21 @@ static inline struct rvt_ucontext *to_iucontext(struct ib_ucontext
/**
* rvt_alloc_ucontext - Allocate a user context
- * @ibdev: Verbs IB dev
+ * @uctx: Verbs context
* @udata: User data allocated
*/
-static struct ib_ucontext *rvt_alloc_ucontext(struct ib_device *ibdev,
- struct ib_udata *udata)
+static int rvt_alloc_ucontext(struct ib_ucontext *uctx, struct ib_udata *udata)
{
- struct rvt_ucontext *context;
-
- context = kmalloc(sizeof(*context), GFP_KERNEL);
- if (!context)
- return ERR_PTR(-ENOMEM);
- return &context->ibucontext;
+ return 0;
}
/**
- *rvt_dealloc_ucontext - Free a user context
- *@context - Free this
+ * rvt_dealloc_ucontext - Free a user context
+ * @context - Free this
*/
-static int rvt_dealloc_ucontext(struct ib_ucontext *context)
+static void rvt_dealloc_ucontext(struct ib_ucontext *context)
{
- kfree(to_iucontext(context));
- return 0;
+ return;
}
static int rvt_get_port_immutable(struct ib_device *ibdev, u8 port_num,
@@ -392,16 +381,53 @@ enum {
_VERB_IDX_MAX /* Must always be last! */
};
-static inline int check_driver_override(struct rvt_dev_info *rdi,
- size_t offset, void *func)
-{
- if (!*(void **)((void *)&rdi->ibdev + offset)) {
- *(void **)((void *)&rdi->ibdev + offset) = func;
- return 0;
- }
-
- return 1;
-}
+static const struct ib_device_ops rvt_dev_ops = {
+ .alloc_fmr = rvt_alloc_fmr,
+ .alloc_mr = rvt_alloc_mr,
+ .alloc_pd = rvt_alloc_pd,
+ .alloc_ucontext = rvt_alloc_ucontext,
+ .attach_mcast = rvt_attach_mcast,
+ .create_ah = rvt_create_ah,
+ .create_cq = rvt_create_cq,
+ .create_qp = rvt_create_qp,
+ .create_srq = rvt_create_srq,
+ .dealloc_fmr = rvt_dealloc_fmr,
+ .dealloc_pd = rvt_dealloc_pd,
+ .dealloc_ucontext = rvt_dealloc_ucontext,
+ .dereg_mr = rvt_dereg_mr,
+ .destroy_ah = rvt_destroy_ah,
+ .destroy_cq = rvt_destroy_cq,
+ .destroy_qp = rvt_destroy_qp,
+ .destroy_srq = rvt_destroy_srq,
+ .detach_mcast = rvt_detach_mcast,
+ .get_dma_mr = rvt_get_dma_mr,
+ .get_port_immutable = rvt_get_port_immutable,
+ .map_mr_sg = rvt_map_mr_sg,
+ .map_phys_fmr = rvt_map_phys_fmr,
+ .mmap = rvt_mmap,
+ .modify_ah = rvt_modify_ah,
+ .modify_device = rvt_modify_device,
+ .modify_port = rvt_modify_port,
+ .modify_qp = rvt_modify_qp,
+ .modify_srq = rvt_modify_srq,
+ .poll_cq = rvt_poll_cq,
+ .post_recv = rvt_post_recv,
+ .post_send = rvt_post_send,
+ .post_srq_recv = rvt_post_srq_recv,
+ .query_ah = rvt_query_ah,
+ .query_device = rvt_query_device,
+ .query_gid = rvt_query_gid,
+ .query_pkey = rvt_query_pkey,
+ .query_port = rvt_query_port,
+ .query_qp = rvt_query_qp,
+ .query_srq = rvt_query_srq,
+ .reg_user_mr = rvt_reg_user_mr,
+ .req_notify_cq = rvt_req_notify_cq,
+ .resize_cq = rvt_resize_cq,
+ .unmap_fmr = rvt_unmap_fmr,
+ INIT_RDMA_OBJ_SIZE(ib_pd, rvt_pd, ibpd),
+ INIT_RDMA_OBJ_SIZE(ib_ucontext, rvt_ucontext, ibucontext),
+};
static noinline int check_support(struct rvt_dev_info *rdi, int verb)
{
@@ -411,81 +437,41 @@ static noinline int check_support(struct rvt_dev_info *rdi, int verb)
* These functions are not part of verbs specifically but are
* required for rdmavt to function.
*/
- if ((!rdi->driver_f.port_callback) ||
+ if ((!rdi->ibdev.ops.init_port) ||
(!rdi->driver_f.get_pci_dev))
return -EINVAL;
break;
- case QUERY_DEVICE:
- check_driver_override(rdi, offsetof(struct ib_device,
- query_device),
- rvt_query_device);
- break;
-
case MODIFY_DEVICE:
/*
* rdmavt does not support modify device currently drivers must
* provide.
*/
- if (!check_driver_override(rdi, offsetof(struct ib_device,
- modify_device),
- rvt_modify_device))
+ if (!rdi->ibdev.ops.modify_device)
return -EOPNOTSUPP;
break;
case QUERY_PORT:
- if (!check_driver_override(rdi, offsetof(struct ib_device,
- query_port),
- rvt_query_port))
+ if (!rdi->ibdev.ops.query_port)
if (!rdi->driver_f.query_port_state)
return -EINVAL;
break;
case MODIFY_PORT:
- if (!check_driver_override(rdi, offsetof(struct ib_device,
- modify_port),
- rvt_modify_port))
+ if (!rdi->ibdev.ops.modify_port)
if (!rdi->driver_f.cap_mask_chg ||
!rdi->driver_f.shut_down_port)
return -EINVAL;
break;
- case QUERY_PKEY:
- check_driver_override(rdi, offsetof(struct ib_device,
- query_pkey),
- rvt_query_pkey);
- break;
-
case QUERY_GID:
- if (!check_driver_override(rdi, offsetof(struct ib_device,
- query_gid),
- rvt_query_gid))
+ if (!rdi->ibdev.ops.query_gid)
if (!rdi->driver_f.get_guid_be)
return -EINVAL;
break;
- case ALLOC_UCONTEXT:
- check_driver_override(rdi, offsetof(struct ib_device,
- alloc_ucontext),
- rvt_alloc_ucontext);
- break;
-
- case DEALLOC_UCONTEXT:
- check_driver_override(rdi, offsetof(struct ib_device,
- dealloc_ucontext),
- rvt_dealloc_ucontext);
- break;
-
- case GET_PORT_IMMUTABLE:
- check_driver_override(rdi, offsetof(struct ib_device,
- get_port_immutable),
- rvt_get_port_immutable);
- break;
-
case CREATE_QP:
- if (!check_driver_override(rdi, offsetof(struct ib_device,
- create_qp),
- rvt_create_qp))
+ if (!rdi->ibdev.ops.create_qp)
if (!rdi->driver_f.qp_priv_alloc ||
!rdi->driver_f.qp_priv_free ||
!rdi->driver_f.notify_qp_reset ||
@@ -496,9 +482,7 @@ static noinline int check_support(struct rvt_dev_info *rdi, int verb)
break;
case MODIFY_QP:
- if (!check_driver_override(rdi, offsetof(struct ib_device,
- modify_qp),
- rvt_modify_qp))
+ if (!rdi->ibdev.ops.modify_qp)
if (!rdi->driver_f.notify_qp_reset ||
!rdi->driver_f.schedule_send ||
!rdi->driver_f.get_pmtu_from_attr ||
@@ -512,9 +496,7 @@ static noinline int check_support(struct rvt_dev_info *rdi, int verb)
break;
case DESTROY_QP:
- if (!check_driver_override(rdi, offsetof(struct ib_device,
- destroy_qp),
- rvt_destroy_qp))
+ if (!rdi->ibdev.ops.destroy_qp)
if (!rdi->driver_f.qp_priv_free ||
!rdi->driver_f.notify_qp_reset ||
!rdi->driver_f.flush_qp_waiters ||
@@ -523,197 +505,14 @@ static noinline int check_support(struct rvt_dev_info *rdi, int verb)
return -EINVAL;
break;
- case QUERY_QP:
- check_driver_override(rdi, offsetof(struct ib_device,
- query_qp),
- rvt_query_qp);
- break;
-
case POST_SEND:
- if (!check_driver_override(rdi, offsetof(struct ib_device,
- post_send),
- rvt_post_send))
+ if (!rdi->ibdev.ops.post_send)
if (!rdi->driver_f.schedule_send ||
!rdi->driver_f.do_send ||
!rdi->post_parms)
return -EINVAL;
break;
- case POST_RECV:
- check_driver_override(rdi, offsetof(struct ib_device,
- post_recv),
- rvt_post_recv);
- break;
- case POST_SRQ_RECV:
- check_driver_override(rdi, offsetof(struct ib_device,
- post_srq_recv),
- rvt_post_srq_recv);
- break;
-
- case CREATE_AH:
- check_driver_override(rdi, offsetof(struct ib_device,
- create_ah),
- rvt_create_ah);
- break;
-
- case DESTROY_AH:
- check_driver_override(rdi, offsetof(struct ib_device,
- destroy_ah),
- rvt_destroy_ah);
- break;
-
- case MODIFY_AH:
- check_driver_override(rdi, offsetof(struct ib_device,
- modify_ah),
- rvt_modify_ah);
- break;
-
- case QUERY_AH:
- check_driver_override(rdi, offsetof(struct ib_device,
- query_ah),
- rvt_query_ah);
- break;
-
- case CREATE_SRQ:
- check_driver_override(rdi, offsetof(struct ib_device,
- create_srq),
- rvt_create_srq);
- break;
-
- case MODIFY_SRQ:
- check_driver_override(rdi, offsetof(struct ib_device,
- modify_srq),
- rvt_modify_srq);
- break;
-
- case DESTROY_SRQ:
- check_driver_override(rdi, offsetof(struct ib_device,
- destroy_srq),
- rvt_destroy_srq);
- break;
-
- case QUERY_SRQ:
- check_driver_override(rdi, offsetof(struct ib_device,
- query_srq),
- rvt_query_srq);
- break;
-
- case ATTACH_MCAST:
- check_driver_override(rdi, offsetof(struct ib_device,
- attach_mcast),
- rvt_attach_mcast);
- break;
-
- case DETACH_MCAST:
- check_driver_override(rdi, offsetof(struct ib_device,
- detach_mcast),
- rvt_detach_mcast);
- break;
-
- case GET_DMA_MR:
- check_driver_override(rdi, offsetof(struct ib_device,
- get_dma_mr),
- rvt_get_dma_mr);
- break;
-
- case REG_USER_MR:
- check_driver_override(rdi, offsetof(struct ib_device,
- reg_user_mr),
- rvt_reg_user_mr);
- break;
-
- case DEREG_MR:
- check_driver_override(rdi, offsetof(struct ib_device,
- dereg_mr),
- rvt_dereg_mr);
- break;
-
- case ALLOC_FMR:
- check_driver_override(rdi, offsetof(struct ib_device,
- alloc_fmr),
- rvt_alloc_fmr);
- break;
-
- case ALLOC_MR:
- check_driver_override(rdi, offsetof(struct ib_device,
- alloc_mr),
- rvt_alloc_mr);
- break;
-
- case MAP_MR_SG:
- check_driver_override(rdi, offsetof(struct ib_device,
- map_mr_sg),
- rvt_map_mr_sg);
- break;
-
- case MAP_PHYS_FMR:
- check_driver_override(rdi, offsetof(struct ib_device,
- map_phys_fmr),
- rvt_map_phys_fmr);
- break;
-
- case UNMAP_FMR:
- check_driver_override(rdi, offsetof(struct ib_device,
- unmap_fmr),
- rvt_unmap_fmr);
- break;
-
- case DEALLOC_FMR:
- check_driver_override(rdi, offsetof(struct ib_device,
- dealloc_fmr),
- rvt_dealloc_fmr);
- break;
-
- case MMAP:
- check_driver_override(rdi, offsetof(struct ib_device,
- mmap),
- rvt_mmap);
- break;
-
- case CREATE_CQ:
- check_driver_override(rdi, offsetof(struct ib_device,
- create_cq),
- rvt_create_cq);
- break;
-
- case DESTROY_CQ:
- check_driver_override(rdi, offsetof(struct ib_device,
- destroy_cq),
- rvt_destroy_cq);
- break;
-
- case POLL_CQ:
- check_driver_override(rdi, offsetof(struct ib_device,
- poll_cq),
- rvt_poll_cq);
- break;
-
- case REQ_NOTFIY_CQ:
- check_driver_override(rdi, offsetof(struct ib_device,
- req_notify_cq),
- rvt_req_notify_cq);
- break;
-
- case RESIZE_CQ:
- check_driver_override(rdi, offsetof(struct ib_device,
- resize_cq),
- rvt_resize_cq);
- break;
-
- case ALLOC_PD:
- check_driver_override(rdi, offsetof(struct ib_device,
- alloc_pd),
- rvt_alloc_pd);
- break;
-
- case DEALLOC_PD:
- check_driver_override(rdi, offsetof(struct ib_device,
- dealloc_pd),
- rvt_dealloc_pd);
- break;
-
- default:
- return -EINVAL;
}
return 0;
@@ -745,6 +544,7 @@ int rvt_register_device(struct rvt_dev_info *rdi, u32 driver_id)
return -EINVAL;
}
+ ib_set_device_ops(&rdi->ibdev, &rvt_dev_ops);
/* Once we get past here we can use rvt_pr macros and tracepoints */
trace_rvt_dbg(rdi, "Driver attempting registration");
@@ -835,8 +635,7 @@ int rvt_register_device(struct rvt_dev_info *rdi, u32 driver_id)
rdi->ibdev.driver_id = driver_id;
/* We are now good to announce we exist */
- ret = ib_register_device(&rdi->ibdev, dev_name(&rdi->ibdev.dev),
- rdi->driver_f.port_callback);
+ ret = ib_register_device(&rdi->ibdev, dev_name(&rdi->ibdev.dev));
if (ret) {
rvt_pr_err(rdi, "Failed to register driver with ib core.\n");
goto bail_wss;
diff --git a/drivers/infiniband/sw/rxe/rxe.c b/drivers/infiniband/sw/rxe/rxe.c
index 383e65c7bbc0..a8c11b5e1e94 100644
--- a/drivers/infiniband/sw/rxe/rxe.c
+++ b/drivers/infiniband/sw/rxe/rxe.c
@@ -31,6 +31,7 @@
* SOFTWARE.
*/
+#include <rdma/rdma_netlink.h>
#include <net/addrconf.h>
#include "rxe.h"
#include "rxe_loc.h"
@@ -50,8 +51,10 @@ static void rxe_cleanup_ports(struct rxe_dev *rxe)
/* free resources for a rxe device all objects created for this device must
* have been destroyed
*/
-static void rxe_cleanup(struct rxe_dev *rxe)
+void rxe_dealloc(struct ib_device *ib_dev)
{
+ struct rxe_dev *rxe = container_of(ib_dev, struct rxe_dev, ib_dev);
+
rxe_pool_cleanup(&rxe->uc_pool);
rxe_pool_cleanup(&rxe->pd_pool);
rxe_pool_cleanup(&rxe->ah_pool);
@@ -65,16 +68,8 @@ static void rxe_cleanup(struct rxe_dev *rxe)
rxe_cleanup_ports(rxe);
- crypto_free_shash(rxe->tfm);
-}
-
-/* called when all references have been dropped */
-void rxe_release(struct kref *kref)
-{
- struct rxe_dev *rxe = container_of(kref, struct rxe_dev, ref_cnt);
-
- rxe_cleanup(rxe);
- ib_dealloc_device(&rxe->ib_dev);
+ if (rxe->tfm)
+ crypto_free_shash(rxe->tfm);
}
/* initialize rxe device parameters */
@@ -279,7 +274,6 @@ static int rxe_init(struct rxe_dev *rxe)
spin_lock_init(&rxe->mmap_offset_lock);
spin_lock_init(&rxe->pending_lock);
INIT_LIST_HEAD(&rxe->pending_mmaps);
- INIT_LIST_HEAD(&rxe->list);
mutex_init(&rxe->usdev_lock);
@@ -308,37 +302,46 @@ void rxe_set_mtu(struct rxe_dev *rxe, unsigned int ndev_mtu)
/* called by ifc layer to create new rxe device.
* The caller should allocate memory for rxe by calling ib_alloc_device.
*/
-int rxe_add(struct rxe_dev *rxe, unsigned int mtu)
+int rxe_add(struct rxe_dev *rxe, unsigned int mtu, const char *ibdev_name)
{
int err;
- kref_init(&rxe->ref_cnt);
-
err = rxe_init(rxe);
if (err)
- goto err1;
+ return err;
rxe_set_mtu(rxe, mtu);
- err = rxe_register_device(rxe);
- if (err)
- goto err1;
-
- return 0;
-
-err1:
- rxe_dev_put(rxe);
- return err;
+ return rxe_register_device(rxe, ibdev_name);
}
-/* called by the ifc layer to remove a device */
-void rxe_remove(struct rxe_dev *rxe)
+static int rxe_newlink(const char *ibdev_name, struct net_device *ndev)
{
- rxe_unregister_device(rxe);
+ struct rxe_dev *exists;
+ int err = 0;
+
+ exists = rxe_get_dev_from_net(ndev);
+ if (exists) {
+ ib_device_put(&exists->ib_dev);
+ pr_err("already configured on %s\n", ndev->name);
+ err = -EEXIST;
+ goto err;
+ }
- rxe_dev_put(rxe);
+ err = rxe_net_add(ibdev_name, ndev);
+ if (err) {
+ pr_err("failed to add %s\n", ndev->name);
+ goto err;
+ }
+err:
+ return err;
}
+static struct rdma_link_ops rxe_link_ops = {
+ .type = "rxe",
+ .newlink = rxe_newlink,
+};
+
static int __init rxe_module_init(void)
{
int err;
@@ -354,13 +357,15 @@ static int __init rxe_module_init(void)
if (err)
return err;
+ rdma_link_register(&rxe_link_ops);
pr_info("loaded\n");
return 0;
}
static void __exit rxe_module_exit(void)
{
- rxe_remove_all();
+ rdma_link_unregister(&rxe_link_ops);
+ ib_unregister_driver(RDMA_DRIVER_RXE);
rxe_net_exit();
rxe_cache_exit();
@@ -369,3 +374,5 @@ static void __exit rxe_module_exit(void)
late_initcall(rxe_module_init);
module_exit(rxe_module_exit);
+
+MODULE_ALIAS_RDMA_LINK("rxe");
diff --git a/drivers/infiniband/sw/rxe/rxe.h b/drivers/infiniband/sw/rxe/rxe.h
index d9ec2de68738..2e2dff478833 100644
--- a/drivers/infiniband/sw/rxe/rxe.h
+++ b/drivers/infiniband/sw/rxe/rxe.h
@@ -65,8 +65,9 @@
*/
#define RXE_UVERBS_ABI_VERSION 2
-#define IB_PHYS_STATE_LINK_UP (5)
-#define IB_PHYS_STATE_LINK_DOWN (3)
+#define RDMA_LINK_PHYS_STATE_LINK_UP (5)
+#define RDMA_LINK_PHYS_STATE_DISABLED (3)
+#define RDMA_LINK_PHYS_STATE_POLLING (2)
#define RXE_ROCE_V2_SPORT (0xc000)
@@ -94,20 +95,23 @@ static inline u32 rxe_crc32(struct rxe_dev *rxe,
void rxe_set_mtu(struct rxe_dev *rxe, unsigned int dev_mtu);
-int rxe_add(struct rxe_dev *rxe, unsigned int mtu);
-void rxe_remove(struct rxe_dev *rxe);
-void rxe_remove_all(void);
+int rxe_add(struct rxe_dev *rxe, unsigned int mtu, const char *ibdev_name);
void rxe_rcv(struct sk_buff *skb);
-static inline void rxe_dev_put(struct rxe_dev *rxe)
+/* The caller must do a matching ib_device_put(&dev->ib_dev) */
+static inline struct rxe_dev *rxe_get_dev_from_net(struct net_device *ndev)
{
- kref_put(&rxe->ref_cnt, rxe_release);
+ struct ib_device *ibdev =
+ ib_device_get_by_netdev(ndev, RDMA_DRIVER_RXE);
+
+ if (!ibdev)
+ return NULL;
+ return container_of(ibdev, struct rxe_dev, ib_dev);
}
-struct rxe_dev *net_to_rxe(struct net_device *ndev);
-struct rxe_dev *get_rxe_by_name(const char *name);
void rxe_port_up(struct rxe_dev *rxe);
void rxe_port_down(struct rxe_dev *rxe);
+void rxe_set_port_state(struct rxe_dev *rxe);
#endif /* RXE_H */
diff --git a/drivers/infiniband/sw/rxe/rxe_av.c b/drivers/infiniband/sw/rxe/rxe_av.c
index 26fe8d7dbc55..81ee756c19b8 100644
--- a/drivers/infiniband/sw/rxe/rxe_av.c
+++ b/drivers/infiniband/sw/rxe/rxe_av.c
@@ -34,6 +34,13 @@
#include "rxe.h"
#include "rxe_loc.h"
+void rxe_init_av(struct rdma_ah_attr *attr, struct rxe_av *av)
+{
+ rxe_av_from_attr(rdma_ah_get_port_num(attr), av, attr);
+ rxe_av_fill_ip_info(av, attr);
+ memcpy(av->dmac, attr->roce.dmac, ETH_ALEN);
+}
+
int rxe_av_chk_attr(struct rxe_dev *rxe, struct rdma_ah_attr *attr)
{
struct rxe_port *port;
diff --git a/drivers/infiniband/sw/rxe/rxe_comp.c b/drivers/infiniband/sw/rxe/rxe_comp.c
index ea089cb091ad..00eb99d3df86 100644
--- a/drivers/infiniband/sw/rxe/rxe_comp.c
+++ b/drivers/infiniband/sw/rxe/rxe_comp.c
@@ -146,8 +146,7 @@ void retransmit_timer(struct timer_list *t)
}
}
-void rxe_comp_queue_pkt(struct rxe_dev *rxe, struct rxe_qp *qp,
- struct sk_buff *skb)
+void rxe_comp_queue_pkt(struct rxe_qp *qp, struct sk_buff *skb)
{
int must_sched;
@@ -155,7 +154,8 @@ void rxe_comp_queue_pkt(struct rxe_dev *rxe, struct rxe_qp *qp,
must_sched = skb_queue_len(&qp->resp_pkts) > 1;
if (must_sched != 0)
- rxe_counter_inc(rxe, RXE_CNT_COMPLETER_SCHED);
+ rxe_counter_inc(SKB_TO_PKT(skb)->rxe, RXE_CNT_COMPLETER_SCHED);
+
rxe_run_task(&qp->comp.task, must_sched);
}
@@ -439,6 +439,7 @@ static void make_send_cqe(struct rxe_qp *qp, struct rxe_send_wqe *wqe,
*/
static void do_complete(struct rxe_qp *qp, struct rxe_send_wqe *wqe)
{
+ struct rxe_dev *rxe = to_rdev(qp->ibqp.device);
struct rxe_cqe cqe;
if ((qp->sq_sig_type == IB_SIGNAL_ALL_WR) ||
@@ -451,6 +452,11 @@ static void do_complete(struct rxe_qp *qp, struct rxe_send_wqe *wqe)
advance_consumer(qp->sq.queue);
}
+ if (wqe->wr.opcode == IB_WR_SEND ||
+ wqe->wr.opcode == IB_WR_SEND_WITH_IMM ||
+ wqe->wr.opcode == IB_WR_SEND_WITH_INV)
+ rxe_counter_inc(rxe, RXE_CNT_RDMA_SEND);
+
/*
* we completed something so let req run again
* if it is trying to fence
diff --git a/drivers/infiniband/sw/rxe/rxe_hw_counters.c b/drivers/infiniband/sw/rxe/rxe_hw_counters.c
index 6aeb7a165e46..636edb5f4cf4 100644
--- a/drivers/infiniband/sw/rxe/rxe_hw_counters.c
+++ b/drivers/infiniband/sw/rxe/rxe_hw_counters.c
@@ -37,15 +37,18 @@ static const char * const rxe_counter_name[] = {
[RXE_CNT_SENT_PKTS] = "sent_pkts",
[RXE_CNT_RCVD_PKTS] = "rcvd_pkts",
[RXE_CNT_DUP_REQ] = "duplicate_request",
- [RXE_CNT_OUT_OF_SEQ_REQ] = "out_of_sequence",
+ [RXE_CNT_OUT_OF_SEQ_REQ] = "out_of_seq_request",
[RXE_CNT_RCV_RNR] = "rcvd_rnr_err",
[RXE_CNT_SND_RNR] = "send_rnr_err",
[RXE_CNT_RCV_SEQ_ERR] = "rcvd_seq_err",
- [RXE_CNT_COMPLETER_SCHED] = "ack_deffered",
+ [RXE_CNT_COMPLETER_SCHED] = "ack_deferred",
[RXE_CNT_RETRY_EXCEEDED] = "retry_exceeded_err",
[RXE_CNT_RNR_RETRY_EXCEEDED] = "retry_rnr_exceeded_err",
[RXE_CNT_COMP_RETRY] = "completer_retry_err",
[RXE_CNT_SEND_ERR] = "send_err",
+ [RXE_CNT_LINK_DOWNED] = "link_downed",
+ [RXE_CNT_RDMA_SEND] = "rdma_sends",
+ [RXE_CNT_RDMA_RECV] = "rdma_recvs",
};
int rxe_ib_get_hw_stats(struct ib_device *ibdev,
@@ -59,7 +62,7 @@ int rxe_ib_get_hw_stats(struct ib_device *ibdev,
return -EINVAL;
for (cnt = 0; cnt < ARRAY_SIZE(rxe_counter_name); cnt++)
- stats->value[cnt] = dev->stats_counters[cnt];
+ stats->value[cnt] = atomic64_read(&dev->stats_counters[cnt]);
return ARRAY_SIZE(rxe_counter_name);
}
diff --git a/drivers/infiniband/sw/rxe/rxe_hw_counters.h b/drivers/infiniband/sw/rxe/rxe_hw_counters.h
index f44df1b76742..72c0d63c79e0 100644
--- a/drivers/infiniband/sw/rxe/rxe_hw_counters.h
+++ b/drivers/infiniband/sw/rxe/rxe_hw_counters.h
@@ -50,6 +50,9 @@ enum rxe_counters {
RXE_CNT_RNR_RETRY_EXCEEDED,
RXE_CNT_COMP_RETRY,
RXE_CNT_SEND_ERR,
+ RXE_CNT_LINK_DOWNED,
+ RXE_CNT_RDMA_SEND,
+ RXE_CNT_RDMA_RECV,
RXE_NUM_OF_COUNTERS
};
diff --git a/drivers/infiniband/sw/rxe/rxe_loc.h b/drivers/infiniband/sw/rxe/rxe_loc.h
index afd53f57a62b..3d8cef836f0d 100644
--- a/drivers/infiniband/sw/rxe/rxe_loc.h
+++ b/drivers/infiniband/sw/rxe/rxe_loc.h
@@ -35,6 +35,7 @@
#define RXE_LOC_H
/* rxe_av.c */
+void rxe_init_av(struct rdma_ah_attr *attr, struct rxe_av *av);
int rxe_av_chk_attr(struct rxe_dev *rxe, struct rdma_ah_attr *attr);
@@ -157,7 +158,7 @@ int rxe_qp_chk_init(struct rxe_dev *rxe, struct ib_qp_init_attr *init);
int rxe_qp_from_init(struct rxe_dev *rxe, struct rxe_qp *qp, struct rxe_pd *pd,
struct ib_qp_init_attr *init,
struct rxe_create_qp_resp __user *uresp,
- struct ib_pd *ibpd);
+ struct ib_pd *ibpd, struct ib_udata *udata);
int rxe_qp_to_init(struct rxe_qp *qp, struct ib_qp_init_attr *init);
@@ -231,7 +232,7 @@ int rxe_srq_from_attr(struct rxe_dev *rxe, struct rxe_srq *srq,
struct ib_srq_attr *attr, enum ib_srq_attr_mask mask,
struct rxe_modify_srq_cmd *ucmd);
-void rxe_release(struct kref *kref);
+void rxe_dealloc(struct ib_device *ib_dev);
int rxe_completer(void *arg);
int rxe_requester(void *arg);
@@ -239,22 +240,21 @@ int rxe_responder(void *arg);
u32 rxe_icrc_hdr(struct rxe_pkt_info *pkt, struct sk_buff *skb);
-void rxe_resp_queue_pkt(struct rxe_dev *rxe,
- struct rxe_qp *qp, struct sk_buff *skb);
+void rxe_resp_queue_pkt(struct rxe_qp *qp, struct sk_buff *skb);
-void rxe_comp_queue_pkt(struct rxe_dev *rxe,
- struct rxe_qp *qp, struct sk_buff *skb);
+void rxe_comp_queue_pkt(struct rxe_qp *qp, struct sk_buff *skb);
static inline unsigned int wr_opcode_mask(int opcode, struct rxe_qp *qp)
{
return rxe_wr_opcode_info[opcode].mask[qp->ibqp.qp_type];
}
-static inline int rxe_xmit_packet(struct rxe_dev *rxe, struct rxe_qp *qp,
- struct rxe_pkt_info *pkt, struct sk_buff *skb)
+static inline int rxe_xmit_packet(struct rxe_qp *qp, struct rxe_pkt_info *pkt,
+ struct sk_buff *skb)
{
int err;
int is_request = pkt->mask & RXE_REQ_MASK;
+ struct rxe_dev *rxe = to_rdev(qp->ibqp.device);
if ((is_request && (qp->req.state != QP_STATE_READY)) ||
(!is_request && (qp->resp.state != QP_STATE_READY))) {
diff --git a/drivers/infiniband/sw/rxe/rxe_mr.c b/drivers/infiniband/sw/rxe/rxe_mr.c
index 9d3916b93f23..42f0f25e396c 100644
--- a/drivers/infiniband/sw/rxe/rxe_mr.c
+++ b/drivers/infiniband/sw/rxe/rxe_mr.c
@@ -162,16 +162,15 @@ int rxe_mem_init_user(struct rxe_pd *pd, u64 start,
u64 length, u64 iova, int access, struct ib_udata *udata,
struct rxe_mem *mem)
{
- int entry;
struct rxe_map **map;
struct rxe_phys_buf *buf = NULL;
struct ib_umem *umem;
- struct scatterlist *sg;
+ struct sg_page_iter sg_iter;
int num_buf;
void *vaddr;
int err;
- umem = ib_umem_get(pd->ibpd.uobject->context, start, length, access, 0);
+ umem = ib_umem_get(udata, start, length, access, 0);
if (IS_ERR(umem)) {
pr_warn("err %d from rxe_umem_get\n",
(int)PTR_ERR(umem));
@@ -191,16 +190,16 @@ int rxe_mem_init_user(struct rxe_pd *pd, u64 start,
goto err1;
}
- mem->page_shift = umem->page_shift;
- mem->page_mask = BIT(umem->page_shift) - 1;
+ mem->page_shift = PAGE_SHIFT;
+ mem->page_mask = PAGE_SIZE - 1;
num_buf = 0;
map = mem->map;
if (length > 0) {
buf = map[0]->buf;
- for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) {
- vaddr = page_address(sg_page(sg));
+ for_each_sg_page(umem->sg_head.sgl, &sg_iter, umem->nmap, 0) {
+ vaddr = page_address(sg_page_iter_page(&sg_iter));
if (!vaddr) {
pr_warn("null vaddr\n");
err = -ENOMEM;
@@ -208,7 +207,7 @@ int rxe_mem_init_user(struct rxe_pd *pd, u64 start,
}
buf->addr = (uintptr_t)vaddr;
- buf->size = BIT(umem->page_shift);
+ buf->size = PAGE_SIZE;
num_buf++;
buf++;
diff --git a/drivers/infiniband/sw/rxe/rxe_net.c b/drivers/infiniband/sw/rxe/rxe_net.c
index 40e82e0f6c2d..753cabcd441c 100644
--- a/drivers/infiniband/sw/rxe/rxe_net.c
+++ b/drivers/infiniband/sw/rxe/rxe_net.c
@@ -45,43 +45,6 @@
#include "rxe_net.h"
#include "rxe_loc.h"
-static LIST_HEAD(rxe_dev_list);
-static DEFINE_SPINLOCK(dev_list_lock); /* spinlock for device list */
-
-struct rxe_dev *net_to_rxe(struct net_device *ndev)
-{
- struct rxe_dev *rxe;
- struct rxe_dev *found = NULL;
-
- spin_lock_bh(&dev_list_lock);
- list_for_each_entry(rxe, &rxe_dev_list, list) {
- if (rxe->ndev == ndev) {
- found = rxe;
- break;
- }
- }
- spin_unlock_bh(&dev_list_lock);
-
- return found;
-}
-
-struct rxe_dev *get_rxe_by_name(const char *name)
-{
- struct rxe_dev *rxe;
- struct rxe_dev *found = NULL;
-
- spin_lock_bh(&dev_list_lock);
- list_for_each_entry(rxe, &rxe_dev_list, list) {
- if (!strcmp(name, dev_name(&rxe->ib_dev.dev))) {
- found = rxe;
- break;
- }
- }
- spin_unlock_bh(&dev_list_lock);
- return found;
-}
-
-
static struct rxe_recv_sockets recv_sockets;
struct device *rxe_dma_device(struct rxe_dev *rxe)
@@ -229,18 +192,19 @@ static int rxe_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
struct udphdr *udph;
struct net_device *ndev = skb->dev;
struct net_device *rdev = ndev;
- struct rxe_dev *rxe = net_to_rxe(ndev);
+ struct rxe_dev *rxe = rxe_get_dev_from_net(ndev);
struct rxe_pkt_info *pkt = SKB_TO_PKT(skb);
if (!rxe && is_vlan_dev(rdev)) {
rdev = vlan_dev_real_dev(ndev);
- rxe = net_to_rxe(rdev);
+ rxe = rxe_get_dev_from_net(rdev);
}
if (!rxe)
goto drop;
if (skb_linearize(skb)) {
pr_err("skb_linearize failed\n");
+ ib_device_put(&rxe->ib_dev);
goto drop;
}
@@ -253,6 +217,12 @@ static int rxe_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
rxe_rcv(skb);
+ /*
+ * FIXME: this is in the wrong place, it needs to be done when pkt is
+ * destroyed
+ */
+ ib_device_put(&rxe->ib_dev);
+
return 0;
drop:
kfree_skb(skb);
@@ -384,9 +354,6 @@ static int prepare4(struct rxe_pkt_info *pkt, struct sk_buff *skb,
return -EHOSTUNREACH;
}
- if (!memcmp(saddr, daddr, sizeof(*daddr)))
- pkt->mask |= RXE_LOOPBACK_MASK;
-
prepare_udp_hdr(skb, cpu_to_be16(qp->src_port),
cpu_to_be16(ROCE_V2_UDP_DPORT));
@@ -411,9 +378,6 @@ static int prepare6(struct rxe_pkt_info *pkt, struct sk_buff *skb,
return -EHOSTUNREACH;
}
- if (!memcmp(saddr, daddr, sizeof(*daddr)))
- pkt->mask |= RXE_LOOPBACK_MASK;
-
prepare_udp_hdr(skb, cpu_to_be16(qp->src_port),
cpu_to_be16(ROCE_V2_UDP_DPORT));
@@ -437,6 +401,9 @@ int rxe_prepare(struct rxe_pkt_info *pkt, struct sk_buff *skb, u32 *crc)
*crc = rxe_icrc_hdr(pkt, skb);
+ if (ether_addr_equal(skb->dev->dev_addr, av->dmac))
+ pkt->mask |= RXE_LOOPBACK_MASK;
+
return err;
}
@@ -550,42 +517,24 @@ enum rdma_link_layer rxe_link_layer(struct rxe_dev *rxe, unsigned int port_num)
return IB_LINK_LAYER_ETHERNET;
}
-struct rxe_dev *rxe_net_add(struct net_device *ndev)
+int rxe_net_add(const char *ibdev_name, struct net_device *ndev)
{
int err;
struct rxe_dev *rxe = NULL;
- rxe = (struct rxe_dev *)ib_alloc_device(sizeof(*rxe));
+ rxe = ib_alloc_device(rxe_dev, ib_dev);
if (!rxe)
- return NULL;
+ return -ENOMEM;
rxe->ndev = ndev;
- err = rxe_add(rxe, ndev->mtu);
+ err = rxe_add(rxe, ndev->mtu, ibdev_name);
if (err) {
ib_dealloc_device(&rxe->ib_dev);
- return NULL;
+ return err;
}
- spin_lock_bh(&dev_list_lock);
- list_add_tail(&rxe->list, &rxe_dev_list);
- spin_unlock_bh(&dev_list_lock);
- return rxe;
-}
-
-void rxe_remove_all(void)
-{
- spin_lock_bh(&dev_list_lock);
- while (!list_empty(&rxe_dev_list)) {
- struct rxe_dev *rxe =
- list_first_entry(&rxe_dev_list, struct rxe_dev, list);
-
- list_del(&rxe->list);
- spin_unlock_bh(&dev_list_lock);
- rxe_remove(rxe);
- spin_lock_bh(&dev_list_lock);
- }
- spin_unlock_bh(&dev_list_lock);
+ return 0;
}
static void rxe_port_event(struct rxe_dev *rxe,
@@ -607,7 +556,6 @@ void rxe_port_up(struct rxe_dev *rxe)
port = &rxe->port;
port->attr.state = IB_PORT_ACTIVE;
- port->attr.phys_state = IB_PHYS_STATE_LINK_UP;
rxe_port_event(rxe, IB_EVENT_PORT_ACTIVE);
dev_info(&rxe->ib_dev.dev, "set active\n");
@@ -620,26 +568,33 @@ void rxe_port_down(struct rxe_dev *rxe)
port = &rxe->port;
port->attr.state = IB_PORT_DOWN;
- port->attr.phys_state = IB_PHYS_STATE_LINK_DOWN;
rxe_port_event(rxe, IB_EVENT_PORT_ERR);
+ rxe_counter_inc(rxe, RXE_CNT_LINK_DOWNED);
dev_info(&rxe->ib_dev.dev, "set down\n");
}
+void rxe_set_port_state(struct rxe_dev *rxe)
+{
+ if (netif_running(rxe->ndev) && netif_carrier_ok(rxe->ndev))
+ rxe_port_up(rxe);
+ else
+ rxe_port_down(rxe);
+}
+
static int rxe_notify(struct notifier_block *not_blk,
unsigned long event,
void *arg)
{
struct net_device *ndev = netdev_notifier_info_to_dev(arg);
- struct rxe_dev *rxe = net_to_rxe(ndev);
+ struct rxe_dev *rxe = rxe_get_dev_from_net(ndev);
if (!rxe)
- goto out;
+ return NOTIFY_OK;
switch (event) {
case NETDEV_UNREGISTER:
- list_del(&rxe->list);
- rxe_remove(rxe);
+ ib_unregister_device_queued(&rxe->ib_dev);
break;
case NETDEV_UP:
rxe_port_up(rxe);
@@ -652,10 +607,7 @@ static int rxe_notify(struct notifier_block *not_blk,
rxe_set_mtu(rxe, ndev->mtu);
break;
case NETDEV_CHANGE:
- if (netif_running(ndev) && netif_carrier_ok(ndev))
- rxe_port_up(rxe);
- else
- rxe_port_down(rxe);
+ rxe_set_port_state(rxe);
break;
case NETDEV_REBOOT:
case NETDEV_GOING_DOWN:
@@ -667,7 +619,8 @@ static int rxe_notify(struct notifier_block *not_blk,
event, ndev->name);
break;
}
-out:
+
+ ib_device_put(&rxe->ib_dev);
return NOTIFY_OK;
}
diff --git a/drivers/infiniband/sw/rxe/rxe_net.h b/drivers/infiniband/sw/rxe/rxe_net.h
index 106c586dbb26..2ca71d3d245c 100644
--- a/drivers/infiniband/sw/rxe/rxe_net.h
+++ b/drivers/infiniband/sw/rxe/rxe_net.h
@@ -43,7 +43,7 @@ struct rxe_recv_sockets {
struct socket *sk6;
};
-struct rxe_dev *rxe_net_add(struct net_device *ndev);
+int rxe_net_add(const char *ibdev_name, struct net_device *ndev);
int rxe_net_init(void);
void rxe_net_exit(void);
diff --git a/drivers/infiniband/sw/rxe/rxe_param.h b/drivers/infiniband/sw/rxe/rxe_param.h
index bdea899a58ac..1abed47ca221 100644
--- a/drivers/infiniband/sw/rxe/rxe_param.h
+++ b/drivers/infiniband/sw/rxe/rxe_param.h
@@ -78,7 +78,8 @@ enum rxe_device_param {
| IB_DEVICE_SYS_IMAGE_GUID
| IB_DEVICE_RC_RNR_NAK_GEN
| IB_DEVICE_SRQ_RESIZE
- | IB_DEVICE_MEM_MGT_EXTENSIONS,
+ | IB_DEVICE_MEM_MGT_EXTENSIONS
+ | IB_DEVICE_ALLOW_USER_UNREG,
RXE_MAX_SGE = 32,
RXE_MAX_SGE_RD = 32,
RXE_MAX_CQ = 16384,
diff --git a/drivers/infiniband/sw/rxe/rxe_pool.c b/drivers/infiniband/sw/rxe/rxe_pool.c
index 36b53fb94a49..120fa9005954 100644
--- a/drivers/infiniband/sw/rxe/rxe_pool.c
+++ b/drivers/infiniband/sw/rxe/rxe_pool.c
@@ -42,10 +42,12 @@ struct rxe_type_info rxe_type_info[RXE_NUM_TYPES] = {
[RXE_TYPE_UC] = {
.name = "rxe-uc",
.size = sizeof(struct rxe_ucontext),
+ .flags = RXE_POOL_NO_ALLOC,
},
[RXE_TYPE_PD] = {
.name = "rxe-pd",
.size = sizeof(struct rxe_pd),
+ .flags = RXE_POOL_NO_ALLOC,
},
[RXE_TYPE_AH] = {
.name = "rxe-ah",
@@ -112,6 +114,20 @@ static inline struct kmem_cache *pool_cache(struct rxe_pool *pool)
return rxe_type_info[pool->type].cache;
}
+static void rxe_cache_clean(size_t cnt)
+{
+ int i;
+ struct rxe_type_info *type;
+
+ for (i = 0; i < cnt; i++) {
+ type = &rxe_type_info[i];
+ if (!(type->flags & RXE_POOL_NO_ALLOC)) {
+ kmem_cache_destroy(type->cache);
+ type->cache = NULL;
+ }
+ }
+}
+
int rxe_cache_init(void)
{
int err;
@@ -122,38 +138,31 @@ int rxe_cache_init(void)
for (i = 0; i < RXE_NUM_TYPES; i++) {
type = &rxe_type_info[i];
size = ALIGN(type->size, RXE_POOL_ALIGN);
- type->cache = kmem_cache_create(type->name, size,
- RXE_POOL_ALIGN,
- RXE_POOL_CACHE_FLAGS, NULL);
- if (!type->cache) {
- pr_err("Unable to init kmem cache for %s\n",
- type->name);
- err = -ENOMEM;
- goto err1;
+ if (!(type->flags & RXE_POOL_NO_ALLOC)) {
+ type->cache =
+ kmem_cache_create(type->name, size,
+ RXE_POOL_ALIGN,
+ RXE_POOL_CACHE_FLAGS, NULL);
+ if (!type->cache) {
+ pr_err("Unable to init kmem cache for %s\n",
+ type->name);
+ err = -ENOMEM;
+ goto err1;
+ }
}
}
return 0;
err1:
- while (--i >= 0) {
- kmem_cache_destroy(type->cache);
- type->cache = NULL;
- }
+ rxe_cache_clean(i);
return err;
}
void rxe_cache_exit(void)
{
- int i;
- struct rxe_type_info *type;
-
- for (i = 0; i < RXE_NUM_TYPES; i++) {
- type = &rxe_type_info[i];
- kmem_cache_destroy(type->cache);
- type->cache = NULL;
- }
+ rxe_cache_clean(RXE_NUM_TYPES);
}
static int rxe_pool_init_index(struct rxe_pool *pool, u32 max, u32 min)
@@ -241,7 +250,7 @@ static void rxe_pool_put(struct rxe_pool *pool)
kref_put(&pool->ref_cnt, rxe_pool_release);
}
-int rxe_pool_cleanup(struct rxe_pool *pool)
+void rxe_pool_cleanup(struct rxe_pool *pool)
{
unsigned long flags;
@@ -253,8 +262,6 @@ int rxe_pool_cleanup(struct rxe_pool *pool)
write_unlock_irqrestore(&pool->pool_lock, flags);
rxe_pool_put(pool);
-
- return 0;
}
static u32 alloc_index(struct rxe_pool *pool)
@@ -392,29 +399,64 @@ void *rxe_alloc(struct rxe_pool *pool)
kref_get(&pool->ref_cnt);
read_unlock_irqrestore(&pool->pool_lock, flags);
- kref_get(&pool->rxe->ref_cnt);
+ if (!ib_device_try_get(&pool->rxe->ib_dev))
+ goto out_put_pool;
if (atomic_inc_return(&pool->num_elem) > pool->max_elem)
- goto out_put_pool;
+ goto out_cnt;
elem = kmem_cache_zalloc(pool_cache(pool),
(pool->flags & RXE_POOL_ATOMIC) ?
GFP_ATOMIC : GFP_KERNEL);
if (!elem)
- goto out_put_pool;
+ goto out_cnt;
elem->pool = pool;
kref_init(&elem->ref_cnt);
return elem;
-out_put_pool:
+out_cnt:
atomic_dec(&pool->num_elem);
- rxe_dev_put(pool->rxe);
+ ib_device_put(&pool->rxe->ib_dev);
+out_put_pool:
rxe_pool_put(pool);
return NULL;
}
+int rxe_add_to_pool(struct rxe_pool *pool, struct rxe_pool_entry *elem)
+{
+ unsigned long flags;
+
+ might_sleep_if(!(pool->flags & RXE_POOL_ATOMIC));
+
+ read_lock_irqsave(&pool->pool_lock, flags);
+ if (pool->state != RXE_POOL_STATE_VALID) {
+ read_unlock_irqrestore(&pool->pool_lock, flags);
+ return -EINVAL;
+ }
+ kref_get(&pool->ref_cnt);
+ read_unlock_irqrestore(&pool->pool_lock, flags);
+
+ if (!ib_device_try_get(&pool->rxe->ib_dev))
+ goto out_put_pool;
+
+ if (atomic_inc_return(&pool->num_elem) > pool->max_elem)
+ goto out_cnt;
+
+ elem->pool = pool;
+ kref_init(&elem->ref_cnt);
+
+ return 0;
+
+out_cnt:
+ atomic_dec(&pool->num_elem);
+ ib_device_put(&pool->rxe->ib_dev);
+out_put_pool:
+ rxe_pool_put(pool);
+ return -EINVAL;
+}
+
void rxe_elem_release(struct kref *kref)
{
struct rxe_pool_entry *elem =
@@ -424,9 +466,10 @@ void rxe_elem_release(struct kref *kref)
if (pool->cleanup)
pool->cleanup(elem);
- kmem_cache_free(pool_cache(pool), elem);
+ if (!(pool->flags & RXE_POOL_NO_ALLOC))
+ kmem_cache_free(pool_cache(pool), elem);
atomic_dec(&pool->num_elem);
- rxe_dev_put(pool->rxe);
+ ib_device_put(&pool->rxe->ib_dev);
rxe_pool_put(pool);
}
diff --git a/drivers/infiniband/sw/rxe/rxe_pool.h b/drivers/infiniband/sw/rxe/rxe_pool.h
index aa4ba307097b..2f2cff1cbe43 100644
--- a/drivers/infiniband/sw/rxe/rxe_pool.h
+++ b/drivers/infiniband/sw/rxe/rxe_pool.h
@@ -41,6 +41,7 @@ enum rxe_pool_flags {
RXE_POOL_ATOMIC = BIT(0),
RXE_POOL_INDEX = BIT(1),
RXE_POOL_KEY = BIT(2),
+ RXE_POOL_NO_ALLOC = BIT(4),
};
enum rxe_elem_type {
@@ -126,11 +127,14 @@ int rxe_pool_init(struct rxe_dev *rxe, struct rxe_pool *pool,
enum rxe_elem_type type, u32 max_elem);
/* free resources from object pool */
-int rxe_pool_cleanup(struct rxe_pool *pool);
+void rxe_pool_cleanup(struct rxe_pool *pool);
/* allocate an object from pool */
void *rxe_alloc(struct rxe_pool *pool);
+/* connect already allocated object to pool */
+int rxe_add_to_pool(struct rxe_pool *pool, struct rxe_pool_entry *elem);
+
/* assign an index to an indexed object and insert object into
* pool's rb tree
*/
diff --git a/drivers/infiniband/sw/rxe/rxe_qp.c b/drivers/infiniband/sw/rxe/rxe_qp.c
index b9710907dac2..09ede70dc1e8 100644
--- a/drivers/infiniband/sw/rxe/rxe_qp.c
+++ b/drivers/infiniband/sw/rxe/rxe_qp.c
@@ -35,6 +35,7 @@
#include <linux/delay.h>
#include <linux/sched.h>
#include <linux/vmalloc.h>
+#include <rdma/uverbs_ioctl.h>
#include "rxe.h"
#include "rxe_loc.h"
@@ -97,7 +98,7 @@ int rxe_qp_chk_init(struct rxe_dev *rxe, struct ib_qp_init_attr *init)
goto err1;
if (init->qp_type == IB_QPT_SMI || init->qp_type == IB_QPT_GSI) {
- if (port_num != 1) {
+ if (!rdma_is_port_valid(&rxe->ib_dev, port_num)) {
pr_warn("invalid port = %d\n", port_num);
goto err1;
}
@@ -336,13 +337,15 @@ static int rxe_qp_init_resp(struct rxe_dev *rxe, struct rxe_qp *qp,
int rxe_qp_from_init(struct rxe_dev *rxe, struct rxe_qp *qp, struct rxe_pd *pd,
struct ib_qp_init_attr *init,
struct rxe_create_qp_resp __user *uresp,
- struct ib_pd *ibpd)
+ struct ib_pd *ibpd,
+ struct ib_udata *udata)
{
int err;
struct rxe_cq *rcq = to_rcq(init->recv_cq);
struct rxe_cq *scq = to_rcq(init->send_cq);
struct rxe_srq *srq = init->srq ? to_rsrq(init->srq) : NULL;
- struct ib_ucontext *context = ibpd->uobject ? ibpd->uobject->context : NULL;
+ struct rxe_ucontext *ucontext =
+ rdma_udata_to_drv_context(udata, struct rxe_ucontext, ibuc);
rxe_add_ref(pd);
rxe_add_ref(rcq);
@@ -357,11 +360,11 @@ int rxe_qp_from_init(struct rxe_dev *rxe, struct rxe_qp *qp, struct rxe_pd *pd,
rxe_qp_init_misc(rxe, qp, init);
- err = rxe_qp_init_req(rxe, qp, init, context, uresp);
+ err = rxe_qp_init_req(rxe, qp, init, &ucontext->ibuc, uresp);
if (err)
goto err1;
- err = rxe_qp_init_resp(rxe, qp, init, context, uresp);
+ err = rxe_qp_init_resp(rxe, qp, init, &ucontext->ibuc, uresp);
if (err)
goto err2;
@@ -433,7 +436,7 @@ int rxe_qp_chk_attr(struct rxe_dev *rxe, struct rxe_qp *qp,
}
if (mask & IB_QP_PORT) {
- if (attr->port_num != 1) {
+ if (!rdma_is_port_valid(&rxe->ib_dev, attr->port_num)) {
pr_warn("invalid port %d\n", attr->port_num);
goto err1;
}
@@ -448,7 +451,7 @@ int rxe_qp_chk_attr(struct rxe_dev *rxe, struct rxe_qp *qp,
if (mask & IB_QP_ALT_PATH) {
if (rxe_av_chk_attr(rxe, &attr->alt_ah_attr))
goto err1;
- if (attr->alt_port_num != 1) {
+ if (!rdma_is_port_valid(&rxe->ib_dev, attr->alt_port_num)) {
pr_warn("invalid alt port %d\n", attr->alt_port_num);
goto err1;
}
@@ -630,14 +633,11 @@ int rxe_qp_from_attr(struct rxe_qp *qp, struct ib_qp_attr *attr, int mask,
qp->attr.qkey = attr->qkey;
if (mask & IB_QP_AV) {
- rxe_av_from_attr(attr->port_num, &qp->pri_av, &attr->ah_attr);
- rxe_av_fill_ip_info(&qp->pri_av, &attr->ah_attr);
+ rxe_init_av(&attr->ah_attr, &qp->pri_av);
}
if (mask & IB_QP_ALT_PATH) {
- rxe_av_from_attr(attr->alt_port_num, &qp->alt_av,
- &attr->alt_ah_attr);
- rxe_av_fill_ip_info(&qp->alt_av, &attr->alt_ah_attr);
+ rxe_init_av(&attr->alt_ah_attr, &qp->alt_av);
qp->attr.alt_port_num = attr->alt_port_num;
qp->attr.alt_pkey_index = attr->alt_pkey_index;
qp->attr.alt_timeout = attr->alt_timeout;
diff --git a/drivers/infiniband/sw/rxe/rxe_recv.c b/drivers/infiniband/sw/rxe/rxe_recv.c
index 5c29a1bb575a..f9a492ed900b 100644
--- a/drivers/infiniband/sw/rxe/rxe_recv.c
+++ b/drivers/infiniband/sw/rxe/rxe_recv.c
@@ -266,14 +266,12 @@ err1:
return -EINVAL;
}
-static inline void rxe_rcv_pkt(struct rxe_dev *rxe,
- struct rxe_pkt_info *pkt,
- struct sk_buff *skb)
+static inline void rxe_rcv_pkt(struct rxe_pkt_info *pkt, struct sk_buff *skb)
{
if (pkt->mask & RXE_REQ_MASK)
- rxe_resp_queue_pkt(rxe, pkt->qp, skb);
+ rxe_resp_queue_pkt(pkt->qp, skb);
else
- rxe_comp_queue_pkt(rxe, pkt->qp, skb);
+ rxe_comp_queue_pkt(pkt->qp, skb);
}
static void rxe_rcv_mcast_pkt(struct rxe_dev *rxe, struct sk_buff *skb)
@@ -319,7 +317,7 @@ static void rxe_rcv_mcast_pkt(struct rxe_dev *rxe, struct sk_buff *skb)
pkt->qp = qp;
rxe_add_ref(qp);
- rxe_rcv_pkt(rxe, pkt, skb);
+ rxe_rcv_pkt(pkt, skb);
}
spin_unlock_bh(&mcg->mcg_lock);
@@ -411,7 +409,7 @@ void rxe_rcv(struct sk_buff *skb)
if (unlikely(bth_qpn(pkt) == IB_MULTICAST_QPN))
rxe_rcv_mcast_pkt(rxe, skb);
else
- rxe_rcv_pkt(rxe, pkt, skb);
+ rxe_rcv_pkt(pkt, skb);
return;
diff --git a/drivers/infiniband/sw/rxe/rxe_req.c b/drivers/infiniband/sw/rxe/rxe_req.c
index 6c361d70d7cd..c5d9b558fa90 100644
--- a/drivers/infiniband/sw/rxe/rxe_req.c
+++ b/drivers/infiniband/sw/rxe/rxe_req.c
@@ -643,6 +643,7 @@ next_wqe:
rmr->access = wqe->wr.wr.reg.access;
rmr->lkey = wqe->wr.wr.reg.key;
rmr->rkey = wqe->wr.wr.reg.key;
+ rmr->iova = wqe->wr.wr.reg.mr->iova;
wqe->state = wqe_state_done;
wqe->status = IB_WC_SUCCESS;
} else {
@@ -728,7 +729,7 @@ next_wqe:
save_state(wqe, qp, &rollback_wqe, &rollback_psn);
update_wqe_state(qp, wqe, &pkt);
update_wqe_psn(qp, wqe, &pkt, payload);
- ret = rxe_xmit_packet(to_rdev(qp->ibqp.device), qp, &pkt, skb);
+ ret = rxe_xmit_packet(qp, &pkt, skb);
if (ret) {
qp->need_req_skb = 1;
diff --git a/drivers/infiniband/sw/rxe/rxe_resp.c b/drivers/infiniband/sw/rxe/rxe_resp.c
index c962160292f4..aca9f60f9b21 100644
--- a/drivers/infiniband/sw/rxe/rxe_resp.c
+++ b/drivers/infiniband/sw/rxe/rxe_resp.c
@@ -104,8 +104,7 @@ static char *resp_state_name[] = {
};
/* rxe_recv calls here to add a request packet to the input queue */
-void rxe_resp_queue_pkt(struct rxe_dev *rxe, struct rxe_qp *qp,
- struct sk_buff *skb)
+void rxe_resp_queue_pkt(struct rxe_qp *qp, struct sk_buff *skb)
{
int must_sched;
struct rxe_pkt_info *pkt = SKB_TO_PKT(skb);
@@ -124,12 +123,9 @@ static inline enum resp_states get_req(struct rxe_qp *qp,
struct sk_buff *skb;
if (qp->resp.state == QP_STATE_ERROR) {
- skb = skb_dequeue(&qp->req_pkts);
- if (skb) {
- /* drain request packet queue */
+ while ((skb = skb_dequeue(&qp->req_pkts))) {
rxe_drop_ref(qp);
kfree_skb(skb);
- return RESPST_GET_REQ;
}
/* go drain recv wr queue */
@@ -660,7 +656,6 @@ static struct sk_buff *prepare_ack_packet(struct rxe_qp *qp,
static enum resp_states read_reply(struct rxe_qp *qp,
struct rxe_pkt_info *req_pkt)
{
- struct rxe_dev *rxe = to_rdev(qp->ibqp.device);
struct rxe_pkt_info ack_pkt;
struct sk_buff *skb;
int mtu = qp->mtu;
@@ -739,7 +734,7 @@ static enum resp_states read_reply(struct rxe_qp *qp,
p = payload_addr(&ack_pkt) + payload + bth_pad(&ack_pkt);
*p = ~icrc;
- err = rxe_xmit_packet(rxe, qp, &ack_pkt, skb);
+ err = rxe_xmit_packet(qp, &ack_pkt, skb);
if (err) {
pr_err("Failed sending RDMA reply.\n");
return RESPST_ERR_RNR;
@@ -838,18 +833,25 @@ static enum resp_states do_complete(struct rxe_qp *qp,
struct ib_wc *wc = &cqe.ibwc;
struct ib_uverbs_wc *uwc = &cqe.uibwc;
struct rxe_recv_wqe *wqe = qp->resp.wqe;
+ struct rxe_dev *rxe = to_rdev(qp->ibqp.device);
if (unlikely(!wqe))
return RESPST_CLEANUP;
memset(&cqe, 0, sizeof(cqe));
- wc->wr_id = wqe->wr_id;
- wc->status = qp->resp.status;
- wc->qp = &qp->ibqp;
+ if (qp->rcq->is_user) {
+ uwc->status = qp->resp.status;
+ uwc->qp_num = qp->ibqp.qp_num;
+ uwc->wr_id = wqe->wr_id;
+ } else {
+ wc->status = qp->resp.status;
+ wc->qp = &qp->ibqp;
+ wc->wr_id = wqe->wr_id;
+ }
- /* fields after status are not required for errors */
if (wc->status == IB_WC_SUCCESS) {
+ rxe_counter_inc(rxe, RXE_CNT_RDMA_RECV);
wc->opcode = (pkt->mask & RXE_IMMDT_MASK &&
pkt->mask & RXE_WRITE_MASK) ?
IB_WC_RECV_RDMA_WITH_IMM : IB_WC_RECV;
@@ -898,7 +900,6 @@ static enum resp_states do_complete(struct rxe_qp *qp,
}
if (pkt->mask & RXE_IETH_MASK) {
- struct rxe_dev *rxe = to_rdev(qp->ibqp.device);
struct rxe_mem *rmr;
wc->wc_flags |= IB_WC_WITH_INVALIDATE;
@@ -950,7 +951,6 @@ static int send_ack(struct rxe_qp *qp, struct rxe_pkt_info *pkt,
int err = 0;
struct rxe_pkt_info ack_pkt;
struct sk_buff *skb;
- struct rxe_dev *rxe = to_rdev(qp->ibqp.device);
skb = prepare_ack_packet(qp, pkt, &ack_pkt, IB_OPCODE_RC_ACKNOWLEDGE,
0, psn, syndrome, NULL);
@@ -959,7 +959,7 @@ static int send_ack(struct rxe_qp *qp, struct rxe_pkt_info *pkt,
goto err1;
}
- err = rxe_xmit_packet(rxe, qp, &ack_pkt, skb);
+ err = rxe_xmit_packet(qp, &ack_pkt, skb);
if (err)
pr_err_ratelimited("Failed sending ack\n");
@@ -973,7 +973,6 @@ static int send_atomic_ack(struct rxe_qp *qp, struct rxe_pkt_info *pkt,
int rc = 0;
struct rxe_pkt_info ack_pkt;
struct sk_buff *skb;
- struct rxe_dev *rxe = to_rdev(qp->ibqp.device);
struct resp_res *res;
skb = prepare_ack_packet(qp, pkt, &ack_pkt,
@@ -1001,7 +1000,7 @@ static int send_atomic_ack(struct rxe_qp *qp, struct rxe_pkt_info *pkt,
res->last_psn = ack_pkt.psn;
res->cur_psn = ack_pkt.psn;
- rc = rxe_xmit_packet(rxe, qp, &ack_pkt, skb);
+ rc = rxe_xmit_packet(qp, &ack_pkt, skb);
if (rc) {
pr_err_ratelimited("Failed sending ack\n");
rxe_drop_ref(qp);
@@ -1131,8 +1130,7 @@ static enum resp_states duplicate_request(struct rxe_qp *qp,
if (res) {
skb_get(res->atomic.skb);
/* Resend the result. */
- rc = rxe_xmit_packet(to_rdev(qp->ibqp.device), qp,
- pkt, res->atomic.skb);
+ rc = rxe_xmit_packet(qp, pkt, res->atomic.skb);
if (rc) {
pr_err("Failed resending result. This flow is not handled - skb ignored\n");
rc = RESPST_CLEANUP;
diff --git a/drivers/infiniband/sw/rxe/rxe_sysfs.c b/drivers/infiniband/sw/rxe/rxe_sysfs.c
index 73a19f808e1b..ccda5f5a3bc0 100644
--- a/drivers/infiniband/sw/rxe/rxe_sysfs.c
+++ b/drivers/infiniband/sw/rxe/rxe_sysfs.c
@@ -53,62 +53,42 @@ static int sanitize_arg(const char *val, char *intf, int intf_len)
return len;
}
-static void rxe_set_port_state(struct net_device *ndev)
-{
- struct rxe_dev *rxe = net_to_rxe(ndev);
- bool is_up = netif_running(ndev) && netif_carrier_ok(ndev);
-
- if (!rxe)
- goto out;
-
- if (is_up)
- rxe_port_up(rxe);
- else
- rxe_port_down(rxe); /* down for unknown state */
-out:
- return;
-}
-
static int rxe_param_set_add(const char *val, const struct kernel_param *kp)
{
int len;
int err = 0;
char intf[32];
- struct net_device *ndev = NULL;
- struct rxe_dev *rxe;
+ struct net_device *ndev;
+ struct rxe_dev *exists;
len = sanitize_arg(val, intf, sizeof(intf));
if (!len) {
pr_err("add: invalid interface name\n");
- err = -EINVAL;
- goto err;
+ return -EINVAL;
}
ndev = dev_get_by_name(&init_net, intf);
if (!ndev) {
pr_err("interface %s not found\n", intf);
- err = -EINVAL;
- goto err;
+ return -EINVAL;
}
- if (net_to_rxe(ndev)) {
+ exists = rxe_get_dev_from_net(ndev);
+ if (exists) {
+ ib_device_put(&exists->ib_dev);
pr_err("already configured on %s\n", intf);
err = -EINVAL;
goto err;
}
- rxe = rxe_net_add(ndev);
- if (!rxe) {
+ err = rxe_net_add("rxe%d", ndev);
+ if (err) {
pr_err("failed to add %s\n", intf);
- err = -EINVAL;
goto err;
}
- rxe_set_port_state(ndev);
- dev_info(&rxe->ib_dev.dev, "added %s\n", intf);
err:
- if (ndev)
- dev_put(ndev);
+ dev_put(ndev);
return err;
}
@@ -116,7 +96,7 @@ static int rxe_param_set_remove(const char *val, const struct kernel_param *kp)
{
int len;
char intf[32];
- struct rxe_dev *rxe;
+ struct ib_device *ib_dev;
len = sanitize_arg(val, intf, sizeof(intf));
if (!len) {
@@ -126,19 +106,17 @@ static int rxe_param_set_remove(const char *val, const struct kernel_param *kp)
if (strncmp("all", intf, len) == 0) {
pr_info("rxe_sys: remove all");
- rxe_remove_all();
+ ib_unregister_driver(RDMA_DRIVER_RXE);
return 0;
}
- rxe = get_rxe_by_name(intf);
-
- if (!rxe) {
+ ib_dev = ib_device_get_by_name(intf, RDMA_DRIVER_RXE);
+ if (!ib_dev) {
pr_err("not configured on %s\n", intf);
return -EINVAL;
}
- list_del(&rxe->list);
- rxe_remove(rxe);
+ ib_unregister_device_and_put(ib_dev);
return 0;
}
@@ -152,6 +130,6 @@ static const struct kernel_param_ops rxe_remove_ops = {
};
module_param_cb(add, &rxe_add_ops, NULL, 0200);
-MODULE_PARM_DESC(add, "Create RXE device over network interface");
+MODULE_PARM_DESC(add, "DEPRECATED. Create RXE device over network interface");
module_param_cb(remove, &rxe_remove_ops, NULL, 0200);
-MODULE_PARM_DESC(remove, "Remove RXE device over network interface");
+MODULE_PARM_DESC(remove, "DEPRECATED. Remove RXE device over network interface");
diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c
index 9c19f2027511..6ecf28570ff0 100644
--- a/drivers/infiniband/sw/rxe/rxe_verbs.c
+++ b/drivers/infiniband/sw/rxe/rxe_verbs.c
@@ -33,6 +33,7 @@
#include <linux/dma-mapping.h>
#include <net/addrconf.h>
+#include <rdma/uverbs_ioctl.h>
#include "rxe.h"
#include "rxe_loc.h"
#include "rxe_queue.h"
@@ -56,12 +57,7 @@ static int rxe_query_port(struct ib_device *dev,
{
struct rxe_dev *rxe = to_rdev(dev);
struct rxe_port *port;
- int rc = -EINVAL;
-
- if (unlikely(port_num != 1)) {
- pr_warn("invalid port_number %d\n", port_num);
- goto out;
- }
+ int rc;
port = &rxe->port;
@@ -71,23 +67,17 @@ static int rxe_query_port(struct ib_device *dev,
mutex_lock(&rxe->usdev_lock);
rc = ib_get_eth_speed(dev, port_num, &attr->active_speed,
&attr->active_width);
- mutex_unlock(&rxe->usdev_lock);
-out:
- return rc;
-}
-
-static struct net_device *rxe_get_netdev(struct ib_device *device,
- u8 port_num)
-{
- struct rxe_dev *rxe = to_rdev(device);
+ if (attr->state == IB_PORT_ACTIVE)
+ attr->phys_state = RDMA_LINK_PHYS_STATE_LINK_UP;
+ else if (dev_get_flags(rxe->ndev) & IFF_UP)
+ attr->phys_state = RDMA_LINK_PHYS_STATE_POLLING;
+ else
+ attr->phys_state = RDMA_LINK_PHYS_STATE_DISABLED;
- if (rxe->ndev) {
- dev_hold(rxe->ndev);
- return rxe->ndev;
- }
+ mutex_unlock(&rxe->usdev_lock);
- return NULL;
+ return rc;
}
static int rxe_query_pkey(struct ib_device *device,
@@ -96,12 +86,6 @@ static int rxe_query_pkey(struct ib_device *device,
struct rxe_dev *rxe = to_rdev(device);
struct rxe_port *port;
- if (unlikely(port_num != 1)) {
- dev_warn(device->dev.parent, "invalid port_num = %d\n",
- port_num);
- goto err1;
- }
-
port = &rxe->port;
if (unlikely(index >= port->attr.pkey_tbl_len)) {
@@ -139,11 +123,6 @@ static int rxe_modify_port(struct ib_device *dev,
struct rxe_dev *rxe = to_rdev(dev);
struct rxe_port *port;
- if (unlikely(port_num != 1)) {
- pr_warn("invalid port_num = %d\n", port_num);
- goto err1;
- }
-
port = &rxe->port;
port->attr.port_cap_flags |= attr->set_port_cap_mask;
@@ -153,9 +132,6 @@ static int rxe_modify_port(struct ib_device *dev,
port->attr.qkey_viol_cntr = 0;
return 0;
-
-err1:
- return -EINVAL;
}
static enum rdma_link_layer rxe_get_link_layer(struct ib_device *dev,
@@ -166,22 +142,19 @@ static enum rdma_link_layer rxe_get_link_layer(struct ib_device *dev,
return rxe_link_layer(rxe, port_num);
}
-static struct ib_ucontext *rxe_alloc_ucontext(struct ib_device *dev,
- struct ib_udata *udata)
+static int rxe_alloc_ucontext(struct ib_ucontext *uctx, struct ib_udata *udata)
{
- struct rxe_dev *rxe = to_rdev(dev);
- struct rxe_ucontext *uc;
+ struct rxe_dev *rxe = to_rdev(uctx->device);
+ struct rxe_ucontext *uc = to_ruc(uctx);
- uc = rxe_alloc(&rxe->uc_pool);
- return uc ? &uc->ibuc : ERR_PTR(-ENOMEM);
+ return rxe_add_to_pool(&rxe->uc_pool, &uc->pelem);
}
-static int rxe_dealloc_ucontext(struct ib_ucontext *ibuc)
+static void rxe_dealloc_ucontext(struct ib_ucontext *ibuc)
{
struct rxe_ucontext *uc = to_ruc(ibuc);
rxe_drop_ref(uc);
- return 0;
}
static int rxe_port_immutable(struct ib_device *dev, u8 port_num,
@@ -203,34 +176,25 @@ static int rxe_port_immutable(struct ib_device *dev, u8 port_num,
return 0;
}
-static struct ib_pd *rxe_alloc_pd(struct ib_device *dev,
- struct ib_ucontext *context,
- struct ib_udata *udata)
+static int rxe_alloc_pd(struct ib_pd *ibpd, struct ib_ucontext *context,
+ struct ib_udata *udata)
{
- struct rxe_dev *rxe = to_rdev(dev);
- struct rxe_pd *pd;
+ struct rxe_dev *rxe = to_rdev(ibpd->device);
+ struct rxe_pd *pd = to_rpd(ibpd);
- pd = rxe_alloc(&rxe->pd_pool);
- return pd ? &pd->ibpd : ERR_PTR(-ENOMEM);
+ return rxe_add_to_pool(&rxe->pd_pool, &pd->pelem);
}
-static int rxe_dealloc_pd(struct ib_pd *ibpd)
+static void rxe_dealloc_pd(struct ib_pd *ibpd)
{
struct rxe_pd *pd = to_rpd(ibpd);
rxe_drop_ref(pd);
- return 0;
-}
-
-static void rxe_init_av(struct rxe_dev *rxe, struct rdma_ah_attr *attr,
- struct rxe_av *av)
-{
- rxe_av_from_attr(rdma_ah_get_port_num(attr), av, attr);
- rxe_av_fill_ip_info(av, attr);
}
static struct ib_ah *rxe_create_ah(struct ib_pd *ibpd,
struct rdma_ah_attr *attr,
+ u32 flags,
struct ib_udata *udata)
{
@@ -250,7 +214,7 @@ static struct ib_ah *rxe_create_ah(struct ib_pd *ibpd,
rxe_add_ref(pd);
ah->pd = pd;
- rxe_init_av(rxe, attr, &ah->av);
+ rxe_init_av(attr, &ah->av);
return &ah->ibah;
}
@@ -264,7 +228,7 @@ static int rxe_modify_ah(struct ib_ah *ibah, struct rdma_ah_attr *attr)
if (err)
return err;
- rxe_init_av(rxe, attr, &ah->av);
+ rxe_init_av(attr, &ah->av);
return 0;
}
@@ -278,7 +242,7 @@ static int rxe_query_ah(struct ib_ah *ibah, struct rdma_ah_attr *attr)
return 0;
}
-static int rxe_destroy_ah(struct ib_ah *ibah)
+static int rxe_destroy_ah(struct ib_ah *ibah, u32 flags)
{
struct rxe_ah *ah = to_rah(ibah);
@@ -341,8 +305,9 @@ static struct ib_srq *rxe_create_srq(struct ib_pd *ibpd,
int err;
struct rxe_dev *rxe = to_rdev(ibpd->device);
struct rxe_pd *pd = to_rpd(ibpd);
+ struct rxe_ucontext *ucontext =
+ rdma_udata_to_drv_context(udata, struct rxe_ucontext, ibuc);
struct rxe_srq *srq;
- struct ib_ucontext *context = udata ? ibpd->uobject->context : NULL;
struct rxe_create_srq_resp __user *uresp = NULL;
if (udata) {
@@ -365,7 +330,7 @@ static struct ib_srq *rxe_create_srq(struct ib_pd *ibpd,
rxe_add_ref(pd);
srq->pd = pd;
- err = rxe_srq_from_init(rxe, srq, init, context, uresp);
+ err = rxe_srq_from_init(rxe, srq, init, &ucontext->ibuc, uresp);
if (err)
goto err2;
@@ -498,7 +463,7 @@ static struct ib_qp *rxe_create_qp(struct ib_pd *ibpd,
rxe_add_index(qp);
- err = rxe_qp_from_init(rxe, qp, pd, init, uresp, ibpd);
+ err = rxe_qp_from_init(rxe, qp, pd, init, uresp, ibpd, udata);
if (err)
goto err3;
@@ -1140,8 +1105,8 @@ static int rxe_detach_mcast(struct ib_qp *ibqp, union ib_gid *mgid, u16 mlid)
static ssize_t parent_show(struct device *device,
struct device_attribute *attr, char *buf)
{
- struct rxe_dev *rxe = container_of(device, struct rxe_dev,
- ib_dev.dev);
+ struct rxe_dev *rxe =
+ rdma_device_to_drv_device(device, struct rxe_dev, ib_dev);
return snprintf(buf, 16, "%s\n", rxe_parent_name(rxe, 1));
}
@@ -1157,7 +1122,65 @@ static const struct attribute_group rxe_attr_group = {
.attrs = rxe_dev_attributes,
};
-int rxe_register_device(struct rxe_dev *rxe)
+static int rxe_enable_driver(struct ib_device *ib_dev)
+{
+ struct rxe_dev *rxe = container_of(ib_dev, struct rxe_dev, ib_dev);
+
+ rxe_set_port_state(rxe);
+ dev_info(&rxe->ib_dev.dev, "added %s\n", netdev_name(rxe->ndev));
+ return 0;
+}
+
+static const struct ib_device_ops rxe_dev_ops = {
+ .alloc_hw_stats = rxe_ib_alloc_hw_stats,
+ .alloc_mr = rxe_alloc_mr,
+ .alloc_pd = rxe_alloc_pd,
+ .alloc_ucontext = rxe_alloc_ucontext,
+ .attach_mcast = rxe_attach_mcast,
+ .create_ah = rxe_create_ah,
+ .create_cq = rxe_create_cq,
+ .create_qp = rxe_create_qp,
+ .create_srq = rxe_create_srq,
+ .dealloc_driver = rxe_dealloc,
+ .dealloc_pd = rxe_dealloc_pd,
+ .dealloc_ucontext = rxe_dealloc_ucontext,
+ .dereg_mr = rxe_dereg_mr,
+ .destroy_ah = rxe_destroy_ah,
+ .destroy_cq = rxe_destroy_cq,
+ .destroy_qp = rxe_destroy_qp,
+ .destroy_srq = rxe_destroy_srq,
+ .detach_mcast = rxe_detach_mcast,
+ .enable_driver = rxe_enable_driver,
+ .get_dma_mr = rxe_get_dma_mr,
+ .get_hw_stats = rxe_ib_get_hw_stats,
+ .get_link_layer = rxe_get_link_layer,
+ .get_port_immutable = rxe_port_immutable,
+ .map_mr_sg = rxe_map_mr_sg,
+ .mmap = rxe_mmap,
+ .modify_ah = rxe_modify_ah,
+ .modify_device = rxe_modify_device,
+ .modify_port = rxe_modify_port,
+ .modify_qp = rxe_modify_qp,
+ .modify_srq = rxe_modify_srq,
+ .peek_cq = rxe_peek_cq,
+ .poll_cq = rxe_poll_cq,
+ .post_recv = rxe_post_recv,
+ .post_send = rxe_post_send,
+ .post_srq_recv = rxe_post_srq_recv,
+ .query_ah = rxe_query_ah,
+ .query_device = rxe_query_device,
+ .query_pkey = rxe_query_pkey,
+ .query_port = rxe_query_port,
+ .query_qp = rxe_query_qp,
+ .query_srq = rxe_query_srq,
+ .reg_user_mr = rxe_reg_user_mr,
+ .req_notify_cq = rxe_req_notify_cq,
+ .resize_cq = rxe_resize_cq,
+ INIT_RDMA_OBJ_SIZE(ib_pd, rxe_pd, ibpd),
+ INIT_RDMA_OBJ_SIZE(ib_ucontext, rxe_ucontext, ibuc),
+};
+
+int rxe_register_device(struct rxe_dev *rxe, const char *ibdev_name)
{
int err;
struct ib_device *dev = &rxe->ib_dev;
@@ -1211,49 +1234,10 @@ int rxe_register_device(struct rxe_dev *rxe)
| BIT_ULL(IB_USER_VERBS_CMD_DETACH_MCAST)
;
- dev->query_device = rxe_query_device;
- dev->modify_device = rxe_modify_device;
- dev->query_port = rxe_query_port;
- dev->modify_port = rxe_modify_port;
- dev->get_link_layer = rxe_get_link_layer;
- dev->get_netdev = rxe_get_netdev;
- dev->query_pkey = rxe_query_pkey;
- dev->alloc_ucontext = rxe_alloc_ucontext;
- dev->dealloc_ucontext = rxe_dealloc_ucontext;
- dev->mmap = rxe_mmap;
- dev->get_port_immutable = rxe_port_immutable;
- dev->alloc_pd = rxe_alloc_pd;
- dev->dealloc_pd = rxe_dealloc_pd;
- dev->create_ah = rxe_create_ah;
- dev->modify_ah = rxe_modify_ah;
- dev->query_ah = rxe_query_ah;
- dev->destroy_ah = rxe_destroy_ah;
- dev->create_srq = rxe_create_srq;
- dev->modify_srq = rxe_modify_srq;
- dev->query_srq = rxe_query_srq;
- dev->destroy_srq = rxe_destroy_srq;
- dev->post_srq_recv = rxe_post_srq_recv;
- dev->create_qp = rxe_create_qp;
- dev->modify_qp = rxe_modify_qp;
- dev->query_qp = rxe_query_qp;
- dev->destroy_qp = rxe_destroy_qp;
- dev->post_send = rxe_post_send;
- dev->post_recv = rxe_post_recv;
- dev->create_cq = rxe_create_cq;
- dev->destroy_cq = rxe_destroy_cq;
- dev->resize_cq = rxe_resize_cq;
- dev->poll_cq = rxe_poll_cq;
- dev->peek_cq = rxe_peek_cq;
- dev->req_notify_cq = rxe_req_notify_cq;
- dev->get_dma_mr = rxe_get_dma_mr;
- dev->reg_user_mr = rxe_reg_user_mr;
- dev->dereg_mr = rxe_dereg_mr;
- dev->alloc_mr = rxe_alloc_mr;
- dev->map_mr_sg = rxe_map_mr_sg;
- dev->attach_mcast = rxe_attach_mcast;
- dev->detach_mcast = rxe_detach_mcast;
- dev->get_hw_stats = rxe_ib_get_hw_stats;
- dev->alloc_hw_stats = rxe_ib_alloc_hw_stats;
+ ib_set_device_ops(dev, &rxe_dev_ops);
+ err = ib_device_set_netdev(&rxe->ib_dev, rxe->ndev, 1);
+ if (err)
+ return err;
tfm = crypto_alloc_shash("crc32", 0, 0);
if (IS_ERR(tfm)) {
@@ -1265,25 +1249,13 @@ int rxe_register_device(struct rxe_dev *rxe)
rdma_set_device_sysfs_group(dev, &rxe_attr_group);
dev->driver_id = RDMA_DRIVER_RXE;
- err = ib_register_device(dev, "rxe%d", NULL);
- if (err) {
+ err = ib_register_device(dev, ibdev_name);
+ if (err)
pr_warn("%s failed with error %d\n", __func__, err);
- goto err1;
- }
-
- return 0;
-
-err1:
- crypto_free_shash(rxe->tfm);
+ /*
+ * Note that rxe may be invalid at this point if another thread
+ * unregistered it.
+ */
return err;
}
-
-int rxe_unregister_device(struct rxe_dev *rxe)
-{
- struct ib_device *dev = &rxe->ib_dev;
-
- ib_unregister_device(dev);
-
- return 0;
-}
diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.h b/drivers/infiniband/sw/rxe/rxe_verbs.h
index 82e670d6eeea..157e51aeb1e1 100644
--- a/drivers/infiniband/sw/rxe/rxe_verbs.h
+++ b/drivers/infiniband/sw/rxe/rxe_verbs.h
@@ -61,13 +61,13 @@ static inline int psn_compare(u32 psn_a, u32 psn_b)
}
struct rxe_ucontext {
+ struct ib_ucontext ibuc;
struct rxe_pool_entry pelem;
- struct ib_ucontext ibuc;
};
struct rxe_pd {
+ struct ib_pd ibpd;
struct rxe_pool_entry pelem;
- struct ib_pd ibpd;
};
struct rxe_ah {
@@ -385,7 +385,6 @@ struct rxe_dev {
struct ib_device_attr attr;
int max_ucontext;
int max_inline_data;
- struct kref ref_cnt;
struct mutex usdev_lock;
struct net_device *ndev;
@@ -409,16 +408,15 @@ struct rxe_dev {
spinlock_t mmap_offset_lock; /* guard mmap_offset */
int mmap_offset;
- u64 stats_counters[RXE_NUM_OF_COUNTERS];
+ atomic64_t stats_counters[RXE_NUM_OF_COUNTERS];
struct rxe_port port;
- struct list_head list;
struct crypto_shash *tfm;
};
-static inline void rxe_counter_inc(struct rxe_dev *rxe, enum rxe_counters cnt)
+static inline void rxe_counter_inc(struct rxe_dev *rxe, enum rxe_counters index)
{
- rxe->stats_counters[cnt]++;
+ atomic64_inc(&rxe->stats_counters[index]);
}
static inline struct rxe_dev *to_rdev(struct ib_device *dev)
@@ -466,8 +464,7 @@ static inline struct rxe_mem *to_rmw(struct ib_mw *mw)
return mw ? container_of(mw, struct rxe_mem, ibmw) : NULL;
}
-int rxe_register_device(struct rxe_dev *rxe);
-int rxe_unregister_device(struct rxe_dev *rxe);
+int rxe_register_device(struct rxe_dev *rxe, const char *ibdev_name);
void rxe_mc_cleanup(struct rxe_pool_entry *arg);
diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h
index 1da119d901a9..2aa3457a30ce 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib.h
+++ b/drivers/infiniband/ulp/ipoib/ipoib.h
@@ -248,7 +248,6 @@ struct ipoib_cm_tx {
struct list_head list;
struct net_device *dev;
struct ipoib_neigh *neigh;
- struct ipoib_path *path;
struct ipoib_tx_buf *tx_ring;
unsigned int tx_head;
unsigned int tx_tail;
@@ -781,12 +780,12 @@ static inline void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ib_wc *w
#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
void ipoib_create_debug_files(struct net_device *dev);
void ipoib_delete_debug_files(struct net_device *dev);
-int ipoib_register_debugfs(void);
+void ipoib_register_debugfs(void);
void ipoib_unregister_debugfs(void);
#else
static inline void ipoib_create_debug_files(struct net_device *dev) { }
static inline void ipoib_delete_debug_files(struct net_device *dev) { }
-static inline int ipoib_register_debugfs(void) { return 0; }
+static inline void ipoib_register_debugfs(void) { }
static inline void ipoib_unregister_debugfs(void) { }
#endif
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
index 0428e01e8f69..aa9dcfc36cd3 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
@@ -1312,7 +1312,6 @@ struct ipoib_cm_tx *ipoib_cm_create_tx(struct net_device *dev, struct ipoib_path
neigh->cm = tx;
tx->neigh = neigh;
- tx->path = path;
tx->dev = dev;
list_add(&tx->list, &priv->cm.start_list);
set_bit(IPOIB_FLAG_INITIALIZED, &tx->flags);
@@ -1371,7 +1370,7 @@ static void ipoib_cm_tx_start(struct work_struct *work)
neigh->daddr + QPN_AND_OPTIONS_OFFSET);
goto free_neigh;
}
- memcpy(&pathrec, &p->path->pathrec, sizeof(pathrec));
+ memcpy(&pathrec, &path->pathrec, sizeof(pathrec));
spin_unlock_irqrestore(&priv->lock, flags);
netif_tx_unlock_bh(dev);
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_fs.c b/drivers/infiniband/ulp/ipoib/ipoib_fs.c
index 178488028734..64c19f6fa931 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_fs.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_fs.c
@@ -267,14 +267,10 @@ void ipoib_create_debug_files(struct net_device *dev)
snprintf(name, sizeof(name), "%s_mcg", dev->name);
priv->mcg_dentry = debugfs_create_file(name, S_IFREG | S_IRUGO,
ipoib_root, dev, &ipoib_mcg_fops);
- if (!priv->mcg_dentry)
- ipoib_warn(priv, "failed to create mcg debug file\n");
snprintf(name, sizeof(name), "%s_path", dev->name);
priv->path_dentry = debugfs_create_file(name, S_IFREG | S_IRUGO,
ipoib_root, dev, &ipoib_path_fops);
- if (!priv->path_dentry)
- ipoib_warn(priv, "failed to create path debug file\n");
}
void ipoib_delete_debug_files(struct net_device *dev)
@@ -286,10 +282,9 @@ void ipoib_delete_debug_files(struct net_device *dev)
priv->mcg_dentry = priv->path_dentry = NULL;
}
-int ipoib_register_debugfs(void)
+void ipoib_register_debugfs(void)
{
ipoib_root = debugfs_create_dir("ipoib", NULL);
- return ipoib_root ? 0 : -ENOMEM;
}
void ipoib_unregister_debugfs(void)
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
index 9006a13af1de..78fa777c87b1 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
@@ -66,7 +66,7 @@ struct ipoib_ah *ipoib_create_ah(struct net_device *dev,
ah->last_send = 0;
kref_init(&ah->ref);
- vah = rdma_create_ah(pd, attr);
+ vah = rdma_create_ah(pd, attr, RDMA_CREATE_AH_SLEEPABLE);
if (IS_ERR(vah)) {
kfree(ah);
ah = (struct ipoib_ah *)vah;
@@ -669,7 +669,6 @@ static void __ipoib_reap_ah(struct net_device *dev)
{
struct ipoib_dev_priv *priv = ipoib_priv(dev);
struct ipoib_ah *ah, *tah;
- LIST_HEAD(remove_list);
unsigned long flags;
netif_tx_lock_bh(dev);
@@ -678,7 +677,7 @@ static void __ipoib_reap_ah(struct net_device *dev)
list_for_each_entry_safe(ah, tah, &priv->dead_ahs, list)
if ((int) priv->tx_tail - (int) ah->last_send >= 0) {
list_del(&ah->list);
- rdma_destroy_ah(ah->ah);
+ rdma_destroy_ah(ah->ah, 0);
kfree(ah);
}
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c
index 8710214594d8..48eda16db1a7 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_main.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
@@ -167,7 +167,7 @@ int ipoib_open(struct net_device *dev)
if (flags & IFF_UP)
continue;
- dev_change_flags(cpriv->dev, flags | IFF_UP);
+ dev_change_flags(cpriv->dev, flags | IFF_UP, NULL);
}
up_read(&priv->vlan_rwsem);
}
@@ -207,7 +207,7 @@ static int ipoib_stop(struct net_device *dev)
if (!(flags & IFF_UP))
continue;
- dev_change_flags(cpriv->dev, flags & ~IFF_UP);
+ dev_change_flags(cpriv->dev, flags & ~IFF_UP, NULL);
}
up_read(&priv->vlan_rwsem);
}
@@ -613,7 +613,7 @@ static void path_free(struct net_device *dev, struct ipoib_path *path)
while ((skb = __skb_dequeue(&path->queue)))
dev_kfree_skb_irq(skb);
- ipoib_dbg(ipoib_priv(dev), "path_free\n");
+ ipoib_dbg(ipoib_priv(dev), "%s\n", __func__);
/* remove all neigh connected to this path */
ipoib_del_neighs_by_gid(dev, path->pathrec.dgid.raw);
@@ -1641,7 +1641,7 @@ static void ipoib_neigh_hash_uninit(struct net_device *dev)
{
struct ipoib_dev_priv *priv = ipoib_priv(dev);
- ipoib_dbg(priv, "ipoib_neigh_hash_uninit\n");
+ ipoib_dbg(priv, "%s\n", __func__);
init_completion(&priv->ntbl.deleted);
cancel_delayed_work_sync(&priv->neigh_reap_task);
@@ -1823,7 +1823,7 @@ static void ipoib_parent_unregister_pre(struct net_device *ndev)
* running ensures the it will not add more work.
*/
rtnl_lock();
- dev_change_flags(priv->dev, priv->dev->flags & ~IFF_UP);
+ dev_change_flags(priv->dev, priv->dev->flags & ~IFF_UP, NULL);
rtnl_unlock();
/* ipoib_event() cannot be running once this returns */
@@ -2411,7 +2411,7 @@ static ssize_t dev_id_show(struct device *dev,
}
static DEVICE_ATTR_RO(dev_id);
-int ipoib_intercept_dev_id_attr(struct net_device *dev)
+static int ipoib_intercept_dev_id_attr(struct net_device *dev)
{
device_remove_file(&dev->dev, &dev_attr_dev_id);
return device_create_file(&dev->dev, &dev_attr_dev_id);
@@ -2453,8 +2453,8 @@ static struct net_device *ipoib_add_port(const char *format,
return ERR_PTR(result);
}
- if (hca->rdma_netdev_get_params) {
- int rc = hca->rdma_netdev_get_params(hca, port,
+ if (hca->ops.rdma_netdev_get_params) {
+ int rc = hca->ops.rdma_netdev_get_params(hca, port,
RDMA_NETDEV_IPOIB,
&params);
@@ -2495,7 +2495,7 @@ static void ipoib_add_one(struct ib_device *device)
struct list_head *dev_list;
struct net_device *dev;
struct ipoib_dev_priv *priv;
- int p;
+ unsigned int p;
int count = 0;
dev_list = kmalloc(sizeof(*dev_list), GFP_KERNEL);
@@ -2504,7 +2504,7 @@ static void ipoib_add_one(struct ib_device *device)
INIT_LIST_HEAD(dev_list);
- for (p = rdma_start_port(device); p <= rdma_end_port(device); ++p) {
+ rdma_for_each_port (device, p) {
if (!rdma_protocol_ib(device, p))
continue;
dev = ipoib_add_port("ib%d", device, p);
@@ -2577,9 +2577,7 @@ static int __init ipoib_init_module(void)
*/
BUILD_BUG_ON(IPOIB_CM_COPYBREAK > IPOIB_CM_HEAD_SIZE);
- ret = ipoib_register_debugfs();
- if (ret)
- return ret;
+ ipoib_register_debugfs();
/*
* We create a global workqueue here that is used for all flush
diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.c b/drivers/infiniband/ulp/iser/iscsi_iser.c
index 3fecd87c9f2b..8c707accd148 100644
--- a/drivers/infiniband/ulp/iser/iscsi_iser.c
+++ b/drivers/infiniband/ulp/iser/iscsi_iser.c
@@ -997,7 +997,6 @@ static struct scsi_host_template iscsi_iser_sht = {
.eh_device_reset_handler= iscsi_eh_device_reset,
.eh_target_reset_handler = iscsi_eh_recover_target,
.target_alloc = iscsi_target_alloc,
- .use_clustering = ENABLE_CLUSTERING,
.slave_alloc = iscsi_iser_slave_alloc,
.proc_name = "iscsi_iser",
.this_id = -1,
diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.h b/drivers/infiniband/ulp/iser/iscsi_iser.h
index 120b40829560..a7aeaa0c6fbc 100644
--- a/drivers/infiniband/ulp/iser/iscsi_iser.h
+++ b/drivers/infiniband/ulp/iser/iscsi_iser.h
@@ -197,7 +197,7 @@ struct iser_data_buf {
struct scatterlist *sg;
int size;
unsigned long data_len;
- unsigned int dma_nents;
+ int dma_nents;
};
/* fwd declarations */
diff --git a/drivers/infiniband/ulp/iser/iser_memory.c b/drivers/infiniband/ulp/iser/iser_memory.c
index 009be8889d71..2ba70729d7b0 100644
--- a/drivers/infiniband/ulp/iser/iser_memory.c
+++ b/drivers/infiniband/ulp/iser/iser_memory.c
@@ -77,8 +77,8 @@ int iser_assign_reg_ops(struct iser_device *device)
struct ib_device *ib_dev = device->ib_device;
/* Assign function handles - based on FMR support */
- if (ib_dev->alloc_fmr && ib_dev->dealloc_fmr &&
- ib_dev->map_phys_fmr && ib_dev->unmap_fmr) {
+ if (ib_dev->ops.alloc_fmr && ib_dev->ops.dealloc_fmr &&
+ ib_dev->ops.map_phys_fmr && ib_dev->ops.unmap_fmr) {
iser_info("FMR supported, using FMR for registration\n");
device->reg_ops = &fmr_ops;
} else if (ib_dev->attrs.device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) {
@@ -145,9 +145,8 @@ static void iser_data_buf_dump(struct iser_data_buf *data,
for_each_sg(data->sg, sg, data->dma_nents, i)
iser_dbg("sg[%d] dma_addr:0x%lX page:0x%p "
"off:0x%x sz:0x%x dma_len:0x%x\n",
- i, (unsigned long)ib_sg_dma_address(ibdev, sg),
- sg_page(sg), sg->offset,
- sg->length, ib_sg_dma_len(ibdev, sg));
+ i, (unsigned long)sg_dma_address(sg),
+ sg_page(sg), sg->offset, sg->length, sg_dma_len(sg));
}
static void iser_dump_page_vec(struct iser_page_vec *page_vec)
@@ -204,8 +203,8 @@ iser_reg_dma(struct iser_device *device, struct iser_data_buf *mem,
reg->rkey = device->pd->unsafe_global_rkey;
else
reg->rkey = 0;
- reg->sge.addr = ib_sg_dma_address(device->ib_device, &sg[0]);
- reg->sge.length = ib_sg_dma_len(device->ib_device, &sg[0]);
+ reg->sge.addr = sg_dma_address(&sg[0]);
+ reg->sge.length = sg_dma_len(&sg[0]);
iser_dbg("Single DMA entry: lkey=0x%x, rkey=0x%x, addr=0x%llx,"
" length=0x%x\n", reg->sge.lkey, reg->rkey,
@@ -240,8 +239,8 @@ int iser_fast_reg_fmr(struct iscsi_iser_task *iser_task,
page_vec->npages = 0;
page_vec->fake_mr.page_size = SIZE_4K;
plen = ib_sg_to_pages(&page_vec->fake_mr, mem->sg,
- mem->size, NULL, iser_set_page);
- if (unlikely(plen < mem->size)) {
+ mem->dma_nents, NULL, iser_set_page);
+ if (unlikely(plen < mem->dma_nents)) {
iser_err("page vec too short to hold this SG\n");
iser_data_buf_dump(mem, device->ib_device);
iser_dump_page_vec(page_vec);
@@ -277,16 +276,13 @@ void iser_unreg_mem_fmr(struct iscsi_iser_task *iser_task,
enum iser_data_dir cmd_dir)
{
struct iser_mem_reg *reg = &iser_task->rdma_reg[cmd_dir];
- int ret;
if (!reg->mem_h)
return;
iser_dbg("PHYSICAL Mem.Unregister mem_h %p\n", reg->mem_h);
- ret = ib_fmr_pool_unmap((struct ib_pool_fmr *)reg->mem_h);
- if (ret)
- iser_err("ib_fmr_pool_unmap failed %d\n", ret);
+ ib_fmr_pool_unmap((struct ib_pool_fmr *)reg->mem_h);
reg->mem_h = NULL;
}
@@ -451,10 +447,10 @@ static int iser_fast_reg_mr(struct iscsi_iser_task *iser_task,
ib_update_fast_reg_key(mr, ib_inc_rkey(mr->rkey));
- n = ib_map_mr_sg(mr, mem->sg, mem->size, NULL, SIZE_4K);
- if (unlikely(n != mem->size)) {
+ n = ib_map_mr_sg(mr, mem->sg, mem->dma_nents, NULL, SIZE_4K);
+ if (unlikely(n != mem->dma_nents)) {
iser_err("failed to map sg (%d/%d)\n",
- n, mem->size);
+ n, mem->dma_nents);
return n < 0 ? n : -EINVAL;
}
diff --git a/drivers/infiniband/ulp/isert/Makefile b/drivers/infiniband/ulp/isert/Makefile
index c8bf2421f5bc..a4a4766e3e18 100644
--- a/drivers/infiniband/ulp/isert/Makefile
+++ b/drivers/infiniband/ulp/isert/Makefile
@@ -1,2 +1 @@
-ccflags-y := -Idrivers/target -Idrivers/target/iscsi
obj-$(CONFIG_INFINIBAND_ISERT) += ib_isert.o
diff --git a/drivers/infiniband/ulp/isert/ib_isert.c b/drivers/infiniband/ulp/isert/ib_isert.c
index e3dd13798d79..989f1ac4245c 100644
--- a/drivers/infiniband/ulp/isert/ib_isert.c
+++ b/drivers/infiniband/ulp/isert/ib_isert.c
@@ -1186,7 +1186,7 @@ sequence_cmd:
rc = iscsit_sequence_cmd(conn, cmd, buf, hdr->cmdsn);
if (!rc && dump_payload == false && unsol_data)
- iscsit_set_unsoliticed_dataout(cmd);
+ iscsit_set_unsolicited_dataout(cmd);
else if (dump_payload && imm_data)
target_put_sess_cmd(&cmd->se_cmd);
diff --git a/drivers/infiniband/ulp/opa_vnic/opa_vnic_netdev.c b/drivers/infiniband/ulp/opa_vnic/opa_vnic_netdev.c
index 61558788b3fa..ae70cd18903e 100644
--- a/drivers/infiniband/ulp/opa_vnic/opa_vnic_netdev.c
+++ b/drivers/infiniband/ulp/opa_vnic/opa_vnic_netdev.c
@@ -330,10 +330,10 @@ struct opa_vnic_adapter *opa_vnic_add_netdev(struct ib_device *ibdev,
struct rdma_netdev *rn;
int rc;
- netdev = ibdev->alloc_rdma_netdev(ibdev, port_num,
- RDMA_NETDEV_OPA_VNIC,
- "veth%d", NET_NAME_UNKNOWN,
- ether_setup);
+ netdev = ibdev->ops.alloc_rdma_netdev(ibdev, port_num,
+ RDMA_NETDEV_OPA_VNIC,
+ "veth%d", NET_NAME_UNKNOWN,
+ ether_setup);
if (!netdev)
return ERR_PTR(-ENOMEM);
else if (IS_ERR(netdev))
diff --git a/drivers/infiniband/ulp/opa_vnic/opa_vnic_vema.c b/drivers/infiniband/ulp/opa_vnic/opa_vnic_vema.c
index d119d9afa845..560e4f2d466e 100644
--- a/drivers/infiniband/ulp/opa_vnic/opa_vnic_vema.c
+++ b/drivers/infiniband/ulp/opa_vnic/opa_vnic_vema.c
@@ -606,7 +606,7 @@ static void vema_set(struct opa_vnic_vema_port *port,
static void vema_send(struct ib_mad_agent *mad_agent,
struct ib_mad_send_wc *mad_wc)
{
- rdma_destroy_ah(mad_wc->send_buf->ah);
+ rdma_destroy_ah(mad_wc->send_buf->ah, RDMA_DESTROY_AH_SLEEPABLE);
ib_free_send_mad(mad_wc->send_buf);
}
@@ -680,7 +680,7 @@ static void vema_recv(struct ib_mad_agent *mad_agent,
ib_free_send_mad(rsp);
err_rsp:
- rdma_destroy_ah(ah);
+ rdma_destroy_ah(ah, RDMA_DESTROY_AH_SLEEPABLE);
free_recv_mad:
ib_free_recv_mad(mad_wc);
}
@@ -777,7 +777,7 @@ void opa_vnic_vema_send_trap(struct opa_vnic_adapter *adapter,
}
rdma_ah_set_dlid(&ah_attr, trap_lid);
- ah = rdma_create_ah(port->mad_agent->qp->pd, &ah_attr);
+ ah = rdma_create_ah(port->mad_agent->qp->pd, &ah_attr, 0);
if (IS_ERR(ah)) {
c_err("%s:Couldn't create new AH = %p\n", __func__, ah);
c_err("%s:dlid = %d, sl = %d, port = %d\n", __func__,
@@ -848,7 +848,7 @@ void opa_vnic_vema_send_trap(struct opa_vnic_adapter *adapter,
}
err_sndbuf:
- rdma_destroy_ah(ah);
+ rdma_destroy_ah(ah, 0);
err_exit:
v_err("Aborting trap\n");
}
diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c
index eed0eb3bb04c..be9ddcad8f28 100644
--- a/drivers/infiniband/ulp/srp/ib_srp.c
+++ b/drivers/infiniband/ulp/srp/ib_srp.c
@@ -132,6 +132,15 @@ MODULE_PARM_DESC(dev_loss_tmo,
" if fast_io_fail_tmo has not been set. \"off\" means that"
" this functionality is disabled.");
+static bool srp_use_imm_data = true;
+module_param_named(use_imm_data, srp_use_imm_data, bool, 0644);
+MODULE_PARM_DESC(use_imm_data,
+ "Whether or not to request permission to use immediate data during SRP login.");
+
+static unsigned int srp_max_imm_data = 8 * 1024;
+module_param_named(max_imm_data, srp_max_imm_data, uint, 0644);
+MODULE_PARM_DESC(max_imm_data, "Maximum immediate data size.");
+
static unsigned ch_count;
module_param(ch_count, uint, 0444);
MODULE_PARM_DESC(ch_count,
@@ -434,8 +443,7 @@ static struct srp_fr_pool *srp_create_fr_pool(struct ib_device *device,
if (pool_size <= 0)
goto err;
ret = -ENOMEM;
- pool = kzalloc(sizeof(struct srp_fr_pool) +
- pool_size * sizeof(struct srp_fr_desc), GFP_KERNEL);
+ pool = kzalloc(struct_size(pool, desc, pool_size), GFP_KERNEL);
if (!pool)
goto err;
pool->size = pool_size;
@@ -573,7 +581,7 @@ static int srp_create_ch_ib(struct srp_rdma_ch *ch)
init_attr->cap.max_send_wr = m * target->queue_size;
init_attr->cap.max_recv_wr = target->queue_size + 1;
init_attr->cap.max_recv_sge = 1;
- init_attr->cap.max_send_sge = 1;
+ init_attr->cap.max_send_sge = SRP_MAX_SGE;
init_attr->sq_sig_type = IB_SIGNAL_REQ_WR;
init_attr->qp_type = IB_QPT_RC;
init_attr->send_cq = send_cq;
@@ -823,7 +831,8 @@ static u8 srp_get_subnet_timeout(struct srp_host *host)
return subnet_timeout;
}
-static int srp_send_req(struct srp_rdma_ch *ch, bool multich)
+static int srp_send_req(struct srp_rdma_ch *ch, uint32_t max_iu_len,
+ bool multich)
{
struct srp_target_port *target = ch->target;
struct {
@@ -852,11 +861,15 @@ static int srp_send_req(struct srp_rdma_ch *ch, bool multich)
req->ib_req.opcode = SRP_LOGIN_REQ;
req->ib_req.tag = 0;
- req->ib_req.req_it_iu_len = cpu_to_be32(target->max_iu_len);
+ req->ib_req.req_it_iu_len = cpu_to_be32(max_iu_len);
req->ib_req.req_buf_fmt = cpu_to_be16(SRP_BUF_FORMAT_DIRECT |
SRP_BUF_FORMAT_INDIRECT);
req->ib_req.req_flags = (multich ? SRP_MULTICHAN_MULTI :
SRP_MULTICHAN_SINGLE);
+ if (srp_use_imm_data) {
+ req->ib_req.req_flags |= SRP_IMMED_REQUESTED;
+ req->ib_req.imm_data_offset = cpu_to_be16(SRP_IMM_DATA_OFFSET);
+ }
if (target->using_rdma_cm) {
req->rdma_param.flow_control = req->ib_param.flow_control;
@@ -873,6 +886,7 @@ static int srp_send_req(struct srp_rdma_ch *ch, bool multich)
req->rdma_req.req_it_iu_len = req->ib_req.req_it_iu_len;
req->rdma_req.req_buf_fmt = req->ib_req.req_buf_fmt;
req->rdma_req.req_flags = req->ib_req.req_flags;
+ req->rdma_req.imm_data_offset = req->ib_req.imm_data_offset;
ipi = req->rdma_req.initiator_port_id;
tpi = req->rdma_req.target_port_id;
@@ -1145,7 +1159,8 @@ static int srp_connected_ch(struct srp_target_port *target)
return c;
}
-static int srp_connect_ch(struct srp_rdma_ch *ch, bool multich)
+static int srp_connect_ch(struct srp_rdma_ch *ch, uint32_t max_iu_len,
+ bool multich)
{
struct srp_target_port *target = ch->target;
int ret;
@@ -1158,7 +1173,7 @@ static int srp_connect_ch(struct srp_rdma_ch *ch, bool multich)
while (1) {
init_completion(&ch->done);
- ret = srp_send_req(ch, multich);
+ ret = srp_send_req(ch, max_iu_len, multich);
if (ret)
goto out;
ret = wait_for_completion_interruptible(&ch->done);
@@ -1344,6 +1359,20 @@ static void srp_terminate_io(struct srp_rport *rport)
}
}
+/* Calculate maximum initiator to target information unit length. */
+static uint32_t srp_max_it_iu_len(int cmd_sg_cnt, bool use_imm_data)
+{
+ uint32_t max_iu_len = sizeof(struct srp_cmd) + SRP_MAX_ADD_CDB_LEN +
+ sizeof(struct srp_indirect_buf) +
+ cmd_sg_cnt * sizeof(struct srp_direct_buf);
+
+ if (use_imm_data)
+ max_iu_len = max(max_iu_len, SRP_IMM_DATA_OFFSET +
+ srp_max_imm_data);
+
+ return max_iu_len;
+}
+
/*
* It is up to the caller to ensure that srp_rport_reconnect() calls are
* serialized and that no concurrent srp_queuecommand(), srp_abort(),
@@ -1357,6 +1386,8 @@ static int srp_rport_reconnect(struct srp_rport *rport)
{
struct srp_target_port *target = rport->lld_data;
struct srp_rdma_ch *ch;
+ uint32_t max_iu_len = srp_max_it_iu_len(target->cmd_sg_cnt,
+ srp_use_imm_data);
int i, j, ret = 0;
bool multich = false;
@@ -1402,7 +1433,7 @@ static int srp_rport_reconnect(struct srp_rport *rport)
ch = &target->ch[i];
if (ret)
break;
- ret = srp_connect_ch(ch, multich);
+ ret = srp_connect_ch(ch, max_iu_len, multich);
multich = true;
}
@@ -1569,9 +1600,8 @@ static int srp_map_sg_entry(struct srp_map_state *state,
{
struct srp_target_port *target = ch->target;
struct srp_device *dev = target->srp_host->srp_dev;
- struct ib_device *ibdev = dev->dev;
- dma_addr_t dma_addr = ib_sg_dma_address(ibdev, sg);
- unsigned int dma_len = ib_sg_dma_len(ibdev, sg);
+ dma_addr_t dma_addr = sg_dma_address(sg);
+ unsigned int dma_len = sg_dma_len(sg);
unsigned int len = 0;
int ret;
@@ -1665,13 +1695,11 @@ static int srp_map_sg_dma(struct srp_map_state *state, struct srp_rdma_ch *ch,
int count)
{
struct srp_target_port *target = ch->target;
- struct srp_device *dev = target->srp_host->srp_dev;
struct scatterlist *sg;
int i;
for_each_sg(scat, sg, count, i) {
- srp_map_desc(state, ib_sg_dma_address(dev->dev, sg),
- ib_sg_dma_len(dev->dev, sg),
+ srp_map_desc(state, sg_dma_address(sg), sg_dma_len(sg),
target->global_rkey);
}
@@ -1764,25 +1792,29 @@ static void srp_check_mapping(struct srp_map_state *state,
* @req: SRP request
*
* Returns the length in bytes of the SRP_CMD IU or a negative value if
- * mapping failed.
+ * mapping failed. The size of any immediate data is not included in the
+ * return value.
*/
static int srp_map_data(struct scsi_cmnd *scmnd, struct srp_rdma_ch *ch,
struct srp_request *req)
{
struct srp_target_port *target = ch->target;
- struct scatterlist *scat;
+ struct scatterlist *scat, *sg;
struct srp_cmd *cmd = req->cmd->buf;
- int len, nents, count, ret;
+ int i, len, nents, count, ret;
struct srp_device *dev;
struct ib_device *ibdev;
struct srp_map_state state;
struct srp_indirect_buf *indirect_hdr;
+ u64 data_len;
u32 idb_len, table_len;
__be32 idb_rkey;
u8 fmt;
+ req->cmd->num_sge = 1;
+
if (!scsi_sglist(scmnd) || scmnd->sc_data_direction == DMA_NONE)
- return sizeof (struct srp_cmd);
+ return sizeof(struct srp_cmd) + cmd->add_cdb_len;
if (scmnd->sc_data_direction != DMA_FROM_DEVICE &&
scmnd->sc_data_direction != DMA_TO_DEVICE) {
@@ -1794,6 +1826,7 @@ static int srp_map_data(struct scsi_cmnd *scmnd, struct srp_rdma_ch *ch,
nents = scsi_sg_count(scmnd);
scat = scsi_sglist(scmnd);
+ data_len = scsi_bufflen(scmnd);
dev = target->srp_host->srp_dev;
ibdev = dev->dev;
@@ -1802,8 +1835,31 @@ static int srp_map_data(struct scsi_cmnd *scmnd, struct srp_rdma_ch *ch,
if (unlikely(count == 0))
return -EIO;
+ if (ch->use_imm_data &&
+ count <= SRP_MAX_IMM_SGE &&
+ SRP_IMM_DATA_OFFSET + data_len <= ch->max_it_iu_len &&
+ scmnd->sc_data_direction == DMA_TO_DEVICE) {
+ struct srp_imm_buf *buf;
+ struct ib_sge *sge = &req->cmd->sge[1];
+
+ fmt = SRP_DATA_DESC_IMM;
+ len = SRP_IMM_DATA_OFFSET;
+ req->nmdesc = 0;
+ buf = (void *)cmd->add_data + cmd->add_cdb_len;
+ buf->len = cpu_to_be32(data_len);
+ WARN_ON_ONCE((void *)(buf + 1) > (void *)cmd + len);
+ for_each_sg(scat, sg, count, i) {
+ sge[i].addr = sg_dma_address(sg);
+ sge[i].length = sg_dma_len(sg);
+ sge[i].lkey = target->lkey;
+ }
+ req->cmd->num_sge += count;
+ goto map_complete;
+ }
+
fmt = SRP_DATA_DESC_DIRECT;
- len = sizeof (struct srp_cmd) + sizeof (struct srp_direct_buf);
+ len = sizeof(struct srp_cmd) + cmd->add_cdb_len +
+ sizeof(struct srp_direct_buf);
if (count == 1 && target->global_rkey) {
/*
@@ -1812,11 +1868,12 @@ static int srp_map_data(struct scsi_cmnd *scmnd, struct srp_rdma_ch *ch,
* single entry. So a direct descriptor along with
* the DMA MR suffices.
*/
- struct srp_direct_buf *buf = (void *) cmd->add_data;
+ struct srp_direct_buf *buf;
- buf->va = cpu_to_be64(ib_sg_dma_address(ibdev, scat));
+ buf = (void *)cmd->add_data + cmd->add_cdb_len;
+ buf->va = cpu_to_be64(sg_dma_address(scat));
buf->key = cpu_to_be32(target->global_rkey);
- buf->len = cpu_to_be32(ib_sg_dma_len(ibdev, scat));
+ buf->len = cpu_to_be32(sg_dma_len(scat));
req->nmdesc = 0;
goto map_complete;
@@ -1826,7 +1883,7 @@ static int srp_map_data(struct scsi_cmnd *scmnd, struct srp_rdma_ch *ch,
* We have more than one scatter/gather entry, so build our indirect
* descriptor table, trying to merge as many entries as we can.
*/
- indirect_hdr = (void *) cmd->add_data;
+ indirect_hdr = (void *)cmd->add_data + cmd->add_cdb_len;
ib_dma_sync_single_for_cpu(ibdev, req->indirect_dma_addr,
target->indirect_size, DMA_TO_DEVICE);
@@ -1861,8 +1918,9 @@ static int srp_map_data(struct scsi_cmnd *scmnd, struct srp_rdma_ch *ch,
* Memory registration collapsed the sg-list into one entry,
* so use a direct descriptor.
*/
- struct srp_direct_buf *buf = (void *) cmd->add_data;
+ struct srp_direct_buf *buf;
+ buf = (void *)cmd->add_data + cmd->add_cdb_len;
*buf = req->indirect_desc[0];
goto map_complete;
}
@@ -1880,7 +1938,8 @@ static int srp_map_data(struct scsi_cmnd *scmnd, struct srp_rdma_ch *ch,
idb_len = sizeof(struct srp_indirect_buf) + table_len;
fmt = SRP_DATA_DESC_INDIRECT;
- len = sizeof(struct srp_cmd) + sizeof (struct srp_indirect_buf);
+ len = sizeof(struct srp_cmd) + cmd->add_cdb_len +
+ sizeof(struct srp_indirect_buf);
len += count * sizeof (struct srp_direct_buf);
memcpy(indirect_hdr->desc_list, req->indirect_desc,
@@ -2001,22 +2060,30 @@ static void srp_send_done(struct ib_cq *cq, struct ib_wc *wc)
list_add(&iu->list, &ch->free_tx);
}
+/**
+ * srp_post_send() - send an SRP information unit
+ * @ch: RDMA channel over which to send the information unit.
+ * @iu: Information unit to send.
+ * @len: Length of the information unit excluding immediate data.
+ */
static int srp_post_send(struct srp_rdma_ch *ch, struct srp_iu *iu, int len)
{
struct srp_target_port *target = ch->target;
- struct ib_sge list;
struct ib_send_wr wr;
- list.addr = iu->dma;
- list.length = len;
- list.lkey = target->lkey;
+ if (WARN_ON_ONCE(iu->num_sge > SRP_MAX_SGE))
+ return -EINVAL;
+
+ iu->sge[0].addr = iu->dma;
+ iu->sge[0].length = len;
+ iu->sge[0].lkey = target->lkey;
iu->cqe.done = srp_send_done;
wr.next = NULL;
wr.wr_cqe = &iu->cqe;
- wr.sg_list = &list;
- wr.num_sge = 1;
+ wr.sg_list = &iu->sge[0];
+ wr.num_sge = iu->num_sge;
wr.opcode = IB_WR_SEND;
wr.send_flags = IB_SEND_SIGNALED;
@@ -2129,6 +2196,7 @@ static int srp_response_common(struct srp_rdma_ch *ch, s32 req_delta,
return 1;
}
+ iu->num_sge = 1;
ib_dma_sync_single_for_cpu(dev, iu->dma, len, DMA_TO_DEVICE);
memcpy(iu->buf, rsp, len);
ib_dma_sync_single_for_device(dev, iu->dma, len, DMA_TO_DEVICE);
@@ -2312,7 +2380,7 @@ static int srp_queuecommand(struct Scsi_Host *shost, struct scsi_cmnd *scmnd)
req = &ch->req_ring[idx];
dev = target->srp_host->srp_dev->dev;
- ib_dma_sync_single_for_cpu(dev, iu->dma, target->max_iu_len,
+ ib_dma_sync_single_for_cpu(dev, iu->dma, ch->max_it_iu_len,
DMA_TO_DEVICE);
scmnd->host_scribble = (void *) req;
@@ -2324,6 +2392,12 @@ static int srp_queuecommand(struct Scsi_Host *shost, struct scsi_cmnd *scmnd)
int_to_scsilun(scmnd->device->lun, &cmd->lun);
cmd->tag = tag;
memcpy(cmd->cdb, scmnd->cmnd, scmnd->cmd_len);
+ if (unlikely(scmnd->cmd_len > sizeof(cmd->cdb))) {
+ cmd->add_cdb_len = round_up(scmnd->cmd_len - sizeof(cmd->cdb),
+ 4);
+ if (WARN_ON_ONCE(cmd->add_cdb_len > SRP_MAX_ADD_CDB_LEN))
+ goto err_iu;
+ }
req->scmnd = scmnd;
req->cmd = iu;
@@ -2343,11 +2417,12 @@ static int srp_queuecommand(struct Scsi_Host *shost, struct scsi_cmnd *scmnd)
goto err_iu;
}
- ib_dma_sync_single_for_device(dev, iu->dma, target->max_iu_len,
+ ib_dma_sync_single_for_device(dev, iu->dma, ch->max_it_iu_len,
DMA_TO_DEVICE);
if (srp_post_send(ch, iu, len)) {
shost_printk(KERN_ERR, target->scsi_host, PFX "Send failed\n");
+ scmnd->result = DID_ERROR << 16;
goto err_unmap;
}
@@ -2410,7 +2485,7 @@ static int srp_alloc_iu_bufs(struct srp_rdma_ch *ch)
for (i = 0; i < target->queue_size; ++i) {
ch->tx_ring[i] = srp_alloc_iu(target->srp_host,
- target->max_iu_len,
+ ch->max_it_iu_len,
GFP_KERNEL, DMA_TO_DEVICE);
if (!ch->tx_ring[i])
goto err;
@@ -2476,6 +2551,15 @@ static void srp_cm_rep_handler(struct ib_cm_id *cm_id,
if (lrsp->opcode == SRP_LOGIN_RSP) {
ch->max_ti_iu_len = be32_to_cpu(lrsp->max_ti_iu_len);
ch->req_lim = be32_to_cpu(lrsp->req_lim_delta);
+ ch->use_imm_data = lrsp->rsp_flags & SRP_LOGIN_RSP_IMMED_SUPP;
+ ch->max_it_iu_len = srp_max_it_iu_len(target->cmd_sg_cnt,
+ ch->use_imm_data);
+ WARN_ON_ONCE(ch->max_it_iu_len >
+ be32_to_cpu(lrsp->max_it_iu_len));
+
+ if (ch->use_imm_data)
+ shost_printk(KERN_DEBUG, target->scsi_host,
+ PFX "using immediate data\n");
/*
* Reserve credits for task management so we don't
@@ -2864,6 +2948,8 @@ static int srp_send_tsk_mgmt(struct srp_rdma_ch *ch, u64 req_tag, u64 lun,
return -1;
}
+ iu->num_sge = 1;
+
ib_dma_sync_single_for_cpu(dev, iu->dma, sizeof *tsk_mgmt,
DMA_TO_DEVICE);
tsk_mgmt = iu->buf;
@@ -2942,7 +3028,6 @@ static int srp_reset_device(struct scsi_cmnd *scmnd)
{
struct srp_target_port *target = host_to_target(scmnd->device->host);
struct srp_rdma_ch *ch;
- int i, j;
u8 status;
shost_printk(KERN_ERR, target->scsi_host, "SRP reset_device called\n");
@@ -2954,15 +3039,6 @@ static int srp_reset_device(struct scsi_cmnd *scmnd)
if (status)
return FAILED;
- for (i = 0; i < target->ch_count; i++) {
- ch = &target->ch[i];
- for (j = 0; j < target->req_ring_size; ++j) {
- struct srp_request *req = &ch->req_ring[j];
-
- srp_finish_req(ch, req, scmnd->device, DID_RESET << 16);
- }
- }
-
return SUCCESS;
}
@@ -3215,7 +3291,6 @@ static struct scsi_host_template srp_template = {
.can_queue = SRP_DEFAULT_CMD_SQ_SIZE,
.this_id = -1,
.cmd_per_lun = SRP_DEFAULT_CMD_SQ_SIZE,
- .use_clustering = ENABLE_CLUSTERING,
.shost_attrs = srp_host_attrs,
.track_queue_depth = 1,
};
@@ -3403,6 +3478,9 @@ static const match_table_t srp_opt_tokens = {
/**
* srp_parse_in - parse an IP address and port number combination
+ * @net: [in] Network namespace.
+ * @sa: [out] Address family, IP address and port number.
+ * @addr_port_str: [in] IP address and port number.
*
* Parse the following address formats:
* - IPv4: <ip_address>:<port>, e.g. 1.2.3.4:5.
@@ -3720,6 +3798,7 @@ static ssize_t srp_create_target(struct device *dev,
int ret, node_idx, node, cpu, i;
unsigned int max_sectors_per_mr, mr_per_cmd = 0;
bool multich = false;
+ uint32_t max_iu_len;
target_host = scsi_host_alloc(&srp_template,
sizeof (struct srp_target_port));
@@ -3731,6 +3810,7 @@ static ssize_t srp_create_target(struct device *dev,
target_host->max_id = 1;
target_host->max_lun = -1LL;
target_host->max_cmd_len = sizeof ((struct srp_cmd *) (void *) 0L)->cdb;
+ target_host->max_segment_size = ib_dma_max_seg_size(ibdev);
target = host_to_target(target_host);
@@ -3825,9 +3905,7 @@ static ssize_t srp_create_target(struct device *dev,
target->mr_per_cmd = mr_per_cmd;
target->indirect_size = target->sg_tablesize *
sizeof (struct srp_direct_buf);
- target->max_iu_len = sizeof (struct srp_cmd) +
- sizeof (struct srp_indirect_buf) +
- target->cmd_sg_cnt * sizeof (struct srp_direct_buf);
+ max_iu_len = srp_max_it_iu_len(target->cmd_sg_cnt, srp_use_imm_data);
INIT_WORK(&target->tl_err_work, srp_tl_err_work);
INIT_WORK(&target->remove_work, srp_remove_work);
@@ -3882,7 +3960,7 @@ static ssize_t srp_create_target(struct device *dev,
if (ret)
goto err_disconnect;
- ret = srp_connect_ch(ch, multich);
+ ret = srp_connect_ch(ch, max_iu_len, multich);
if (ret) {
char dst[64];
@@ -4039,7 +4117,8 @@ static void srp_add_one(struct ib_device *device)
struct srp_device *srp_dev;
struct ib_device_attr *attr = &device->attrs;
struct srp_host *host;
- int mr_page_shift, p;
+ int mr_page_shift;
+ unsigned int p;
u64 max_pages_per_mr;
unsigned int flags = 0;
@@ -4063,8 +4142,10 @@ static void srp_add_one(struct ib_device *device)
srp_dev->max_pages_per_mr = min_t(u64, SRP_MAX_PAGES_PER_MR,
max_pages_per_mr);
- srp_dev->has_fmr = (device->alloc_fmr && device->dealloc_fmr &&
- device->map_phys_fmr && device->unmap_fmr);
+ srp_dev->has_fmr = (device->ops.alloc_fmr &&
+ device->ops.dealloc_fmr &&
+ device->ops.map_phys_fmr &&
+ device->ops.unmap_fmr);
srp_dev->has_fr = (attr->device_cap_flags &
IB_DEVICE_MEM_MGT_EXTENSIONS);
if (!never_register && !srp_dev->has_fmr && !srp_dev->has_fr) {
@@ -4104,7 +4185,7 @@ static void srp_add_one(struct ib_device *device)
WARN_ON_ONCE(srp_dev->global_rkey == 0);
}
- for (p = rdma_start_port(device); p <= rdma_end_port(device); ++p) {
+ rdma_for_each_port (device, p) {
host = srp_add_port(srp_dev, p);
if (host)
list_add_tail(&host->list, &srp_dev->dev_list);
@@ -4172,6 +4253,11 @@ static int __init srp_init_module(void)
{
int ret;
+ BUILD_BUG_ON(sizeof(struct srp_imm_buf) != 4);
+ BUILD_BUG_ON(sizeof(struct srp_login_req) != 64);
+ BUILD_BUG_ON(sizeof(struct srp_login_req_rdma) != 56);
+ BUILD_BUG_ON(sizeof(struct srp_cmd) != 48);
+
if (srp_sg_tablesize) {
pr_warn("srp_sg_tablesize is deprecated, please use cmd_sg_entries\n");
if (!cmd_sg_entries)
diff --git a/drivers/infiniband/ulp/srp/ib_srp.h b/drivers/infiniband/ulp/srp/ib_srp.h
index a2706086b9c7..b2861cd2087a 100644
--- a/drivers/infiniband/ulp/srp/ib_srp.h
+++ b/drivers/infiniband/ulp/srp/ib_srp.h
@@ -67,6 +67,17 @@ enum {
SRP_TAG_TSK_MGMT = 1U << 31,
SRP_MAX_PAGES_PER_MR = 512,
+
+ SRP_MAX_ADD_CDB_LEN = 16,
+
+ SRP_MAX_IMM_SGE = 2,
+ SRP_MAX_SGE = SRP_MAX_IMM_SGE + 1,
+ /*
+ * Choose the immediate data offset such that a 32 byte CDB still fits.
+ */
+ SRP_IMM_DATA_OFFSET = sizeof(struct srp_cmd) +
+ SRP_MAX_ADD_CDB_LEN +
+ sizeof(struct srp_imm_buf),
};
enum srp_target_state {
@@ -130,6 +141,8 @@ struct srp_request {
/**
* struct srp_rdma_ch
* @comp_vector: Completion vector used by this RDMA channel.
+ * @max_it_iu_len: Maximum initiator-to-target information unit length.
+ * @max_ti_iu_len: Maximum target-to-initiator information unit length.
*/
struct srp_rdma_ch {
/* These are RW in the hot path, and commonly used together */
@@ -146,6 +159,9 @@ struct srp_rdma_ch {
struct ib_fmr_pool *fmr_pool;
struct srp_fr_pool *fr_pool;
};
+ uint32_t max_it_iu_len;
+ uint32_t max_ti_iu_len;
+ bool use_imm_data;
/* Everything above this point is used in the hot path of
* command processing. Try to keep them packed into cachelines.
@@ -169,7 +185,6 @@ struct srp_rdma_ch {
struct srp_iu **tx_ring;
struct srp_iu **rx_ring;
struct srp_request *req_ring;
- int max_ti_iu_len;
int comp_vector;
u64 tsk_mgmt_tag;
@@ -194,7 +209,6 @@ struct srp_target_port {
u32 ch_count;
u32 lkey;
enum srp_target_state state;
- unsigned int max_iu_len;
unsigned int cmd_sg_cnt;
unsigned int indirect_size;
bool allow_ext_sg;
@@ -259,6 +273,8 @@ struct srp_iu {
void *buf;
size_t size;
enum dma_data_direction direction;
+ u32 num_sge;
+ struct ib_sge sge[SRP_MAX_SGE];
struct ib_cqe cqe;
};
diff --git a/drivers/infiniband/ulp/srpt/Makefile b/drivers/infiniband/ulp/srpt/Makefile
index e3ee4bdfffa5..43fbde42c58b 100644
--- a/drivers/infiniband/ulp/srpt/Makefile
+++ b/drivers/infiniband/ulp/srpt/Makefile
@@ -1,2 +1 @@
-ccflags-y := -Idrivers/target
obj-$(CONFIG_INFINIBAND_SRPT) += ib_srpt.o
diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.c b/drivers/infiniband/ulp/srpt/ib_srpt.c
index 2357aa727dcf..1a039f16d315 100644
--- a/drivers/infiniband/ulp/srpt/ib_srpt.c
+++ b/drivers/infiniband/ulp/srpt/ib_srpt.c
@@ -51,8 +51,6 @@
/* Name of this kernel module. */
#define DRV_NAME "ib_srpt"
-#define DRV_VERSION "2.0.0"
-#define DRV_RELDATE "2011-02-14"
#define SRPT_ID_STRING "Linux SRP target"
@@ -60,8 +58,7 @@
#define pr_fmt(fmt) DRV_NAME " " fmt
MODULE_AUTHOR("Vu Pham and Bart Van Assche");
-MODULE_DESCRIPTION("InfiniBand SCSI RDMA Protocol target "
- "v" DRV_VERSION " (" DRV_RELDATE ")");
+MODULE_DESCRIPTION("SCSI RDMA Protocol target driver");
MODULE_LICENSE("Dual BSD/GPL");
/*
@@ -89,8 +86,7 @@ static int srpt_get_u64_x(char *buffer, const struct kernel_param *kp)
module_param_call(srpt_service_guid, NULL, srpt_get_u64_x, &srpt_service_guid,
0444);
MODULE_PARM_DESC(srpt_service_guid,
- "Using this value for ioc_guid, id_ext, and cm_listen_id"
- " instead of using the node_guid of the first HCA.");
+ "Using this value for ioc_guid, id_ext, and cm_listen_id instead of using the node_guid of the first HCA.");
static struct ib_client srpt_client;
/* Protects both rdma_cm_port and rdma_cm_id. */
@@ -462,7 +458,7 @@ static void srpt_mgmt_method_get(struct srpt_port *sp, struct ib_mad *rq_mad,
static void srpt_mad_send_handler(struct ib_mad_agent *mad_agent,
struct ib_mad_send_wc *mad_wc)
{
- rdma_destroy_ah(mad_wc->send_buf->ah);
+ rdma_destroy_ah(mad_wc->send_buf->ah, RDMA_DESTROY_AH_SLEEPABLE);
ib_free_send_mad(mad_wc->send_buf);
}
@@ -529,7 +525,7 @@ static void srpt_mad_recv_handler(struct ib_mad_agent *mad_agent,
ib_free_send_mad(rsp);
err_rsp:
- rdma_destroy_ah(ah);
+ rdma_destroy_ah(ah, RDMA_DESTROY_AH_SLEEPABLE);
err:
ib_free_recv_mad(mad_wc);
}
@@ -652,31 +648,33 @@ static void srpt_unregister_mad_agent(struct srpt_device *sdev)
* srpt_alloc_ioctx - allocate a SRPT I/O context structure
* @sdev: SRPT HCA pointer.
* @ioctx_size: I/O context size.
- * @dma_size: Size of I/O context DMA buffer.
+ * @buf_cache: I/O buffer cache.
* @dir: DMA data direction.
*/
static struct srpt_ioctx *srpt_alloc_ioctx(struct srpt_device *sdev,
- int ioctx_size, int dma_size,
+ int ioctx_size,
+ struct kmem_cache *buf_cache,
enum dma_data_direction dir)
{
struct srpt_ioctx *ioctx;
- ioctx = kmalloc(ioctx_size, GFP_KERNEL);
+ ioctx = kzalloc(ioctx_size, GFP_KERNEL);
if (!ioctx)
goto err;
- ioctx->buf = kmalloc(dma_size, GFP_KERNEL);
+ ioctx->buf = kmem_cache_alloc(buf_cache, GFP_KERNEL);
if (!ioctx->buf)
goto err_free_ioctx;
- ioctx->dma = ib_dma_map_single(sdev->device, ioctx->buf, dma_size, dir);
+ ioctx->dma = ib_dma_map_single(sdev->device, ioctx->buf,
+ kmem_cache_size(buf_cache), dir);
if (ib_dma_mapping_error(sdev->device, ioctx->dma))
goto err_free_buf;
return ioctx;
err_free_buf:
- kfree(ioctx->buf);
+ kmem_cache_free(buf_cache, ioctx->buf);
err_free_ioctx:
kfree(ioctx);
err:
@@ -687,17 +685,19 @@ err:
* srpt_free_ioctx - free a SRPT I/O context structure
* @sdev: SRPT HCA pointer.
* @ioctx: I/O context pointer.
- * @dma_size: Size of I/O context DMA buffer.
+ * @buf_cache: I/O buffer cache.
* @dir: DMA data direction.
*/
static void srpt_free_ioctx(struct srpt_device *sdev, struct srpt_ioctx *ioctx,
- int dma_size, enum dma_data_direction dir)
+ struct kmem_cache *buf_cache,
+ enum dma_data_direction dir)
{
if (!ioctx)
return;
- ib_dma_unmap_single(sdev->device, ioctx->dma, dma_size, dir);
- kfree(ioctx->buf);
+ ib_dma_unmap_single(sdev->device, ioctx->dma,
+ kmem_cache_size(buf_cache), dir);
+ kmem_cache_free(buf_cache, ioctx->buf);
kfree(ioctx);
}
@@ -706,33 +706,38 @@ static void srpt_free_ioctx(struct srpt_device *sdev, struct srpt_ioctx *ioctx,
* @sdev: Device to allocate the I/O context ring for.
* @ring_size: Number of elements in the I/O context ring.
* @ioctx_size: I/O context size.
- * @dma_size: DMA buffer size.
+ * @buf_cache: I/O buffer cache.
+ * @alignment_offset: Offset in each ring buffer at which the SRP information
+ * unit starts.
* @dir: DMA data direction.
*/
static struct srpt_ioctx **srpt_alloc_ioctx_ring(struct srpt_device *sdev,
int ring_size, int ioctx_size,
- int dma_size, enum dma_data_direction dir)
+ struct kmem_cache *buf_cache,
+ int alignment_offset,
+ enum dma_data_direction dir)
{
struct srpt_ioctx **ring;
int i;
- WARN_ON(ioctx_size != sizeof(struct srpt_recv_ioctx)
- && ioctx_size != sizeof(struct srpt_send_ioctx));
+ WARN_ON(ioctx_size != sizeof(struct srpt_recv_ioctx) &&
+ ioctx_size != sizeof(struct srpt_send_ioctx));
ring = kvmalloc_array(ring_size, sizeof(ring[0]), GFP_KERNEL);
if (!ring)
goto out;
for (i = 0; i < ring_size; ++i) {
- ring[i] = srpt_alloc_ioctx(sdev, ioctx_size, dma_size, dir);
+ ring[i] = srpt_alloc_ioctx(sdev, ioctx_size, buf_cache, dir);
if (!ring[i])
goto err;
ring[i]->index = i;
+ ring[i]->offset = alignment_offset;
}
goto out;
err:
while (--i >= 0)
- srpt_free_ioctx(sdev, ring[i], dma_size, dir);
+ srpt_free_ioctx(sdev, ring[i], buf_cache, dir);
kvfree(ring);
ring = NULL;
out:
@@ -744,12 +749,13 @@ out:
* @ioctx_ring: I/O context ring to be freed.
* @sdev: SRPT HCA pointer.
* @ring_size: Number of ring elements.
- * @dma_size: Size of I/O context DMA buffer.
+ * @buf_cache: I/O buffer cache.
* @dir: DMA data direction.
*/
static void srpt_free_ioctx_ring(struct srpt_ioctx **ioctx_ring,
struct srpt_device *sdev, int ring_size,
- int dma_size, enum dma_data_direction dir)
+ struct kmem_cache *buf_cache,
+ enum dma_data_direction dir)
{
int i;
@@ -757,7 +763,7 @@ static void srpt_free_ioctx_ring(struct srpt_ioctx **ioctx_ring,
return;
for (i = 0; i < ring_size; ++i)
- srpt_free_ioctx(sdev, ioctx_ring[i], dma_size, dir);
+ srpt_free_ioctx(sdev, ioctx_ring[i], buf_cache, dir);
kvfree(ioctx_ring);
}
@@ -819,7 +825,7 @@ static int srpt_post_recv(struct srpt_device *sdev, struct srpt_rdma_ch *ch,
struct ib_recv_wr wr;
BUG_ON(!sdev);
- list.addr = ioctx->ioctx.dma;
+ list.addr = ioctx->ioctx.dma + ioctx->ioctx.offset;
list.length = srp_max_req_size;
list.lkey = sdev->lkey;
@@ -985,23 +991,28 @@ static inline void *srpt_get_desc_buf(struct srp_cmd *srp_cmd)
/**
* srpt_get_desc_tbl - parse the data descriptors of a SRP_CMD request
- * @ioctx: Pointer to the I/O context associated with the request.
+ * @recv_ioctx: I/O context associated with the received command @srp_cmd.
+ * @ioctx: I/O context that will be used for responding to the initiator.
* @srp_cmd: Pointer to the SRP_CMD request data.
* @dir: Pointer to the variable to which the transfer direction will be
* written.
- * @sg: [out] scatterlist allocated for the parsed SRP_CMD.
+ * @sg: [out] scatterlist for the parsed SRP_CMD.
* @sg_cnt: [out] length of @sg.
* @data_len: Pointer to the variable to which the total data length of all
* descriptors in the SRP_CMD request will be written.
+ * @imm_data_offset: [in] Offset in SRP_CMD requests at which immediate data
+ * starts.
*
* This function initializes ioctx->nrbuf and ioctx->r_bufs.
*
* Returns -EINVAL when the SRP_CMD request contains inconsistent descriptors;
* -ENOMEM when memory allocation fails and zero upon success.
*/
-static int srpt_get_desc_tbl(struct srpt_send_ioctx *ioctx,
+static int srpt_get_desc_tbl(struct srpt_recv_ioctx *recv_ioctx,
+ struct srpt_send_ioctx *ioctx,
struct srp_cmd *srp_cmd, enum dma_data_direction *dir,
- struct scatterlist **sg, unsigned *sg_cnt, u64 *data_len)
+ struct scatterlist **sg, unsigned int *sg_cnt, u64 *data_len,
+ u16 imm_data_offset)
{
BUG_ON(!dir);
BUG_ON(!data_len);
@@ -1025,7 +1036,7 @@ static int srpt_get_desc_tbl(struct srpt_send_ioctx *ioctx,
if (((srp_cmd->buf_fmt & 0xf) == SRP_DATA_DESC_DIRECT) ||
((srp_cmd->buf_fmt >> 4) == SRP_DATA_DESC_DIRECT)) {
- struct srp_direct_buf *db = srpt_get_desc_buf(srp_cmd);
+ struct srp_direct_buf *db = srpt_get_desc_buf(srp_cmd);
*data_len = be32_to_cpu(db->len);
return srpt_alloc_rw_ctxs(ioctx, db, 1, sg, sg_cnt);
@@ -1037,8 +1048,7 @@ static int srpt_get_desc_tbl(struct srpt_send_ioctx *ioctx,
if (nbufs >
(srp_cmd->data_out_desc_cnt + srp_cmd->data_in_desc_cnt)) {
- pr_err("received unsupported SRP_CMD request"
- " type (%u out + %u in != %u / %zu)\n",
+ pr_err("received unsupported SRP_CMD request type (%u out + %u in != %u / %zu)\n",
srp_cmd->data_out_desc_cnt,
srp_cmd->data_in_desc_cnt,
be32_to_cpu(idb->table_desc.len),
@@ -1049,6 +1059,40 @@ static int srpt_get_desc_tbl(struct srpt_send_ioctx *ioctx,
*data_len = be32_to_cpu(idb->len);
return srpt_alloc_rw_ctxs(ioctx, idb->desc_list, nbufs,
sg, sg_cnt);
+ } else if ((srp_cmd->buf_fmt >> 4) == SRP_DATA_DESC_IMM) {
+ struct srp_imm_buf *imm_buf = srpt_get_desc_buf(srp_cmd);
+ void *data = (void *)srp_cmd + imm_data_offset;
+ uint32_t len = be32_to_cpu(imm_buf->len);
+ uint32_t req_size = imm_data_offset + len;
+
+ if (req_size > srp_max_req_size) {
+ pr_err("Immediate data (length %d + %d) exceeds request size %d\n",
+ imm_data_offset, len, srp_max_req_size);
+ return -EINVAL;
+ }
+ if (recv_ioctx->byte_len < req_size) {
+ pr_err("Received too few data - %d < %d\n",
+ recv_ioctx->byte_len, req_size);
+ return -EIO;
+ }
+ /*
+ * The immediate data buffer descriptor must occur before the
+ * immediate data itself.
+ */
+ if ((void *)(imm_buf + 1) > (void *)data) {
+ pr_err("Received invalid write request\n");
+ return -EINVAL;
+ }
+ *data_len = len;
+ ioctx->recv_ioctx = recv_ioctx;
+ if ((uintptr_t)data & 511) {
+ pr_warn_once("Internal error - the receive buffers are not aligned properly.\n");
+ return -EINVAL;
+ }
+ sg_init_one(&ioctx->imm_sg, data, len);
+ *sg = &ioctx->imm_sg;
+ *sg_cnt = 1;
+ return 0;
} else {
*data_len = 0;
return 0;
@@ -1173,24 +1217,18 @@ static int srpt_ch_qp_err(struct srpt_rdma_ch *ch)
static struct srpt_send_ioctx *srpt_get_send_ioctx(struct srpt_rdma_ch *ch)
{
struct srpt_send_ioctx *ioctx;
- unsigned long flags;
+ int tag, cpu;
BUG_ON(!ch);
- ioctx = NULL;
- spin_lock_irqsave(&ch->spinlock, flags);
- if (!list_empty(&ch->free_list)) {
- ioctx = list_first_entry(&ch->free_list,
- struct srpt_send_ioctx, free_list);
- list_del(&ioctx->free_list);
- }
- spin_unlock_irqrestore(&ch->spinlock, flags);
-
- if (!ioctx)
- return ioctx;
+ tag = sbitmap_queue_get(&ch->sess->sess_tag_pool, &cpu);
+ if (tag < 0)
+ return NULL;
+ ioctx = ch->ioctx_ring[tag];
BUG_ON(ioctx->ch != ch);
ioctx->state = SRPT_STATE_NEW;
+ WARN_ON_ONCE(ioctx->recv_ioctx);
ioctx->n_rdma = 0;
ioctx->n_rw_ctx = 0;
ioctx->queue_status_only = false;
@@ -1200,6 +1238,8 @@ static struct srpt_send_ioctx *srpt_get_send_ioctx(struct srpt_rdma_ch *ch)
*/
memset(&ioctx->cmd, 0, sizeof(ioctx->cmd));
memset(&ioctx->sense_data, 0, sizeof(ioctx->sense_data));
+ ioctx->cmd.map_tag = tag;
+ ioctx->cmd.map_cpu = cpu;
return ioctx;
}
@@ -1352,8 +1392,8 @@ static int srpt_build_cmd_rsp(struct srpt_rdma_ch *ch,
BUILD_BUG_ON(MIN_MAX_RSP_SIZE <= sizeof(*srp_rsp));
max_sense_len = ch->max_ti_iu_len - sizeof(*srp_rsp);
if (sense_data_len > max_sense_len) {
- pr_warn("truncated sense data from %d to %d"
- " bytes\n", sense_data_len, max_sense_len);
+ pr_warn("truncated sense data from %d to %d bytes\n",
+ sense_data_len, max_sense_len);
sense_data_len = max_sense_len;
}
@@ -1433,7 +1473,7 @@ static void srpt_handle_cmd(struct srpt_rdma_ch *ch,
BUG_ON(!send_ioctx);
- srp_cmd = recv_ioctx->ioctx.buf;
+ srp_cmd = recv_ioctx->ioctx.buf + recv_ioctx->ioctx.offset;
cmd = &send_ioctx->cmd;
cmd->tag = srp_cmd->tag;
@@ -1453,14 +1493,14 @@ static void srpt_handle_cmd(struct srpt_rdma_ch *ch,
break;
}
- rc = srpt_get_desc_tbl(send_ioctx, srp_cmd, &dir, &sg, &sg_cnt,
- &data_len);
+ rc = srpt_get_desc_tbl(recv_ioctx, send_ioctx, srp_cmd, &dir,
+ &sg, &sg_cnt, &data_len, ch->imm_data_offset);
if (rc) {
if (rc != -EAGAIN) {
pr_err("0x%llx: parsing SRP descriptor table failed.\n",
srp_cmd->tag);
}
- goto release_ioctx;
+ goto busy;
}
rc = target_submit_cmd_map_sgls(cmd, ch->sess, srp_cmd->cdb,
@@ -1471,13 +1511,12 @@ static void srpt_handle_cmd(struct srpt_rdma_ch *ch,
if (rc != 0) {
pr_debug("target_submit_cmd() returned %d for tag %#llx\n", rc,
srp_cmd->tag);
- goto release_ioctx;
+ goto busy;
}
return;
-release_ioctx:
- send_ioctx->state = SRPT_STATE_DONE;
- srpt_release_cmd(cmd);
+busy:
+ target_send_busy(cmd);
}
static int srp_tmr_to_tcm(int fn)
@@ -1521,7 +1560,7 @@ static void srpt_handle_tsk_mgmt(struct srpt_rdma_ch *ch,
BUG_ON(!send_ioctx);
- srp_tsk = recv_ioctx->ioctx.buf;
+ srp_tsk = recv_ioctx->ioctx.buf + recv_ioctx->ioctx.offset;
cmd = &send_ioctx->cmd;
pr_debug("recv tsk_mgmt fn %d for task_tag %lld and cmd tag %lld ch %p sess %p\n",
@@ -1537,11 +1576,9 @@ static void srpt_handle_tsk_mgmt(struct srpt_rdma_ch *ch,
TARGET_SCF_ACK_KREF);
if (rc != 0) {
send_ioctx->cmd.se_tmr_req->response = TMR_FUNCTION_REJECTED;
- goto fail;
+ cmd->se_tfo->queue_tm_rsp(cmd);
}
return;
-fail:
- transport_send_check_condition_and_sense(cmd, 0, 0); // XXX:
}
/**
@@ -1564,10 +1601,11 @@ srpt_handle_new_iu(struct srpt_rdma_ch *ch, struct srpt_recv_ioctx *recv_ioctx)
goto push;
ib_dma_sync_single_for_cpu(ch->sport->sdev->device,
- recv_ioctx->ioctx.dma, srp_max_req_size,
+ recv_ioctx->ioctx.dma,
+ recv_ioctx->ioctx.offset + srp_max_req_size,
DMA_FROM_DEVICE);
- srp_cmd = recv_ioctx->ioctx.buf;
+ srp_cmd = recv_ioctx->ioctx.buf + recv_ioctx->ioctx.offset;
opcode = srp_cmd->opcode;
if (opcode == SRP_CMD || opcode == SRP_TSK_MGMT) {
send_ioctx = srpt_get_send_ioctx(ch);
@@ -1604,7 +1642,8 @@ srpt_handle_new_iu(struct srpt_rdma_ch *ch, struct srpt_recv_ioctx *recv_ioctx)
break;
}
- srpt_post_recv(ch->sport->sdev, ch, recv_ioctx);
+ if (!send_ioctx || !send_ioctx->recv_ioctx)
+ srpt_post_recv(ch->sport->sdev, ch, recv_ioctx);
res = true;
out:
@@ -1630,6 +1669,7 @@ static void srpt_recv_done(struct ib_cq *cq, struct ib_wc *wc)
req_lim = atomic_dec_return(&ch->req_lim);
if (unlikely(req_lim < 0))
pr_err("req_lim = %d < 0\n", req_lim);
+ ioctx->byte_len = wc->byte_len;
srpt_handle_new_iu(ch, ioctx);
} else {
pr_info_ratelimited("receiving failed for ioctx %p with status %d\n",
@@ -1693,14 +1733,14 @@ static void srpt_send_done(struct ib_cq *cq, struct ib_wc *wc)
atomic_add(1 + ioctx->n_rdma, &ch->sq_wr_avail);
if (wc->status != IB_WC_SUCCESS)
- pr_info("sending response for ioctx 0x%p failed"
- " with status %d\n", ioctx, wc->status);
+ pr_info("sending response for ioctx 0x%p failed with status %d\n",
+ ioctx, wc->status);
if (state != SRPT_STATE_DONE) {
transport_generic_free_cmd(&ioctx->cmd, 0);
} else {
- pr_err("IB completion has been received too late for"
- " wr_id = %u.\n", ioctx->ioctx.index);
+ pr_err("IB completion has been received too late for wr_id = %u.\n",
+ ioctx->ioctx.index);
}
srpt_process_wait_list(ch);
@@ -1754,6 +1794,8 @@ retry:
qp_init->cap.max_rdma_ctxs = sq_size / 2;
qp_init->cap.max_send_sge = min(attrs->max_send_sge,
SRPT_MAX_SG_PER_WQE);
+ qp_init->cap.max_recv_sge = min(attrs->max_recv_sge,
+ SRPT_MAX_SG_PER_WQE);
qp_init->port_num = ch->sport->port;
if (sdev->use_srq) {
qp_init->srq = sdev->srq;
@@ -2010,6 +2052,14 @@ static void srpt_free_ch(struct kref *kref)
kfree_rcu(ch, rcu);
}
+/*
+ * Shut down the SCSI target session, tell the connection manager to
+ * disconnect the associated RDMA channel, transition the QP to the error
+ * state and remove the channel from the channel list. This function is
+ * typically called from inside srpt_zerolength_write_done(). Concurrent
+ * srpt_zerolength_write() calls from inside srpt_close_ch() are possible
+ * as long as the channel is on sport->nexus_list.
+ */
static void srpt_release_channel_work(struct work_struct *w)
{
struct srpt_rdma_ch *ch;
@@ -2037,20 +2087,24 @@ static void srpt_release_channel_work(struct work_struct *w)
else
ib_destroy_cm_id(ch->ib_cm.cm_id);
+ sport = ch->sport;
+ mutex_lock(&sport->mutex);
+ list_del_rcu(&ch->list);
+ mutex_unlock(&sport->mutex);
+
srpt_destroy_ch_ib(ch);
srpt_free_ioctx_ring((struct srpt_ioctx **)ch->ioctx_ring,
ch->sport->sdev, ch->rq_size,
- ch->max_rsp_size, DMA_TO_DEVICE);
+ ch->rsp_buf_cache, DMA_TO_DEVICE);
+
+ kmem_cache_destroy(ch->rsp_buf_cache);
srpt_free_ioctx_ring((struct srpt_ioctx **)ch->ioctx_recv_ring,
sdev, ch->rq_size,
- srp_max_req_size, DMA_FROM_DEVICE);
+ ch->req_buf_cache, DMA_FROM_DEVICE);
- sport = ch->sport;
- mutex_lock(&sport->mutex);
- list_del_rcu(&ch->list);
- mutex_unlock(&sport->mutex);
+ kmem_cache_destroy(ch->req_buf_cache);
wake_up(&sport->ch_releaseQ);
@@ -2089,7 +2143,7 @@ static int srpt_cm_req_recv(struct srpt_device *const sdev,
struct srpt_rdma_ch *ch = NULL;
char i_port_id[36];
u32 it_iu_len;
- int i, ret;
+ int i, tag_num, tag_size, ret;
WARN_ON_ONCE(irqs_disabled());
@@ -2174,32 +2228,57 @@ static int srpt_cm_req_recv(struct srpt_device *const sdev,
INIT_LIST_HEAD(&ch->cmd_wait_list);
ch->max_rsp_size = ch->sport->port_attrib.srp_max_rsp_size;
+ ch->rsp_buf_cache = kmem_cache_create("srpt-rsp-buf", ch->max_rsp_size,
+ 512, 0, NULL);
+ if (!ch->rsp_buf_cache)
+ goto free_ch;
+
ch->ioctx_ring = (struct srpt_send_ioctx **)
srpt_alloc_ioctx_ring(ch->sport->sdev, ch->rq_size,
sizeof(*ch->ioctx_ring[0]),
- ch->max_rsp_size, DMA_TO_DEVICE);
+ ch->rsp_buf_cache, 0, DMA_TO_DEVICE);
if (!ch->ioctx_ring) {
pr_err("rejected SRP_LOGIN_REQ because creating a new QP SQ ring failed.\n");
rej->reason = cpu_to_be32(SRP_LOGIN_REJ_INSUFFICIENT_RESOURCES);
- goto free_ch;
+ goto free_rsp_cache;
}
- INIT_LIST_HEAD(&ch->free_list);
- for (i = 0; i < ch->rq_size; i++) {
+ for (i = 0; i < ch->rq_size; i++)
ch->ioctx_ring[i]->ch = ch;
- list_add_tail(&ch->ioctx_ring[i]->free_list, &ch->free_list);
- }
if (!sdev->use_srq) {
+ u16 imm_data_offset = req->req_flags & SRP_IMMED_REQUESTED ?
+ be16_to_cpu(req->imm_data_offset) : 0;
+ u16 alignment_offset;
+ u32 req_sz;
+
+ if (req->req_flags & SRP_IMMED_REQUESTED)
+ pr_debug("imm_data_offset = %d\n",
+ be16_to_cpu(req->imm_data_offset));
+ if (imm_data_offset >= sizeof(struct srp_cmd)) {
+ ch->imm_data_offset = imm_data_offset;
+ rsp->rsp_flags |= SRP_LOGIN_RSP_IMMED_SUPP;
+ } else {
+ ch->imm_data_offset = 0;
+ }
+ alignment_offset = round_up(imm_data_offset, 512) -
+ imm_data_offset;
+ req_sz = alignment_offset + imm_data_offset + srp_max_req_size;
+ ch->req_buf_cache = kmem_cache_create("srpt-req-buf", req_sz,
+ 512, 0, NULL);
+ if (!ch->req_buf_cache)
+ goto free_rsp_ring;
+
ch->ioctx_recv_ring = (struct srpt_recv_ioctx **)
srpt_alloc_ioctx_ring(ch->sport->sdev, ch->rq_size,
sizeof(*ch->ioctx_recv_ring[0]),
- srp_max_req_size,
+ ch->req_buf_cache,
+ alignment_offset,
DMA_FROM_DEVICE);
if (!ch->ioctx_recv_ring) {
pr_err("rejected SRP_LOGIN_REQ because creating a new QP RQ ring failed.\n");
rej->reason =
cpu_to_be32(SRP_LOGIN_REJ_INSUFFICIENT_RESOURCES);
- goto free_ring;
+ goto free_recv_cache;
}
for (i = 0; i < ch->rq_size; i++)
INIT_LIST_HEAD(&ch->ioctx_recv_ring[i]->wait_list);
@@ -2219,18 +2298,20 @@ static int srpt_cm_req_recv(struct srpt_device *const sdev,
pr_debug("registering session %s\n", ch->sess_name);
+ tag_num = ch->rq_size;
+ tag_size = 1; /* ib_srpt does not use se_sess->sess_cmd_map */
if (sport->port_guid_tpg.se_tpg_wwn)
- ch->sess = target_setup_session(&sport->port_guid_tpg, 0, 0,
- TARGET_PROT_NORMAL,
+ ch->sess = target_setup_session(&sport->port_guid_tpg, tag_num,
+ tag_size, TARGET_PROT_NORMAL,
ch->sess_name, ch, NULL);
if (sport->port_gid_tpg.se_tpg_wwn && IS_ERR_OR_NULL(ch->sess))
- ch->sess = target_setup_session(&sport->port_gid_tpg, 0, 0,
- TARGET_PROT_NORMAL, i_port_id, ch,
- NULL);
+ ch->sess = target_setup_session(&sport->port_gid_tpg, tag_num,
+ tag_size, TARGET_PROT_NORMAL, i_port_id,
+ ch, NULL);
/* Retry without leading "0x" */
if (sport->port_gid_tpg.se_tpg_wwn && IS_ERR_OR_NULL(ch->sess))
- ch->sess = target_setup_session(&sport->port_gid_tpg, 0, 0,
- TARGET_PROT_NORMAL,
+ ch->sess = target_setup_session(&sport->port_gid_tpg, tag_num,
+ tag_size, TARGET_PROT_NORMAL,
i_port_id + 2, ch, NULL);
if (IS_ERR_OR_NULL(ch->sess)) {
WARN_ON_ONCE(ch->sess == NULL);
@@ -2249,17 +2330,15 @@ static int srpt_cm_req_recv(struct srpt_device *const sdev,
if ((req->req_flags & SRP_MTCH_ACTION) == SRP_MULTICHAN_SINGLE) {
struct srpt_rdma_ch *ch2;
- rsp->rsp_flags = SRP_LOGIN_RSP_MULTICHAN_NO_CHAN;
-
list_for_each_entry(ch2, &nexus->ch_list, list) {
if (srpt_disconnect_ch(ch2) < 0)
continue;
pr_info("Relogin - closed existing channel %s\n",
ch2->sess_name);
- rsp->rsp_flags = SRP_LOGIN_RSP_MULTICHAN_TERMINATED;
+ rsp->rsp_flags |= SRP_LOGIN_RSP_MULTICHAN_TERMINATED;
}
} else {
- rsp->rsp_flags = SRP_LOGIN_RSP_MULTICHAN_MAINTAINED;
+ rsp->rsp_flags |= SRP_LOGIN_RSP_MULTICHAN_MAINTAINED;
}
list_add_tail_rcu(&ch->list, &nexus->ch_list);
@@ -2289,7 +2368,7 @@ static int srpt_cm_req_recv(struct srpt_device *const sdev,
/* create srp_login_response */
rsp->opcode = SRP_LOGIN_RSP;
rsp->tag = req->tag;
- rsp->max_it_iu_len = req->req_it_iu_len;
+ rsp->max_it_iu_len = cpu_to_be32(srp_max_req_size);
rsp->max_ti_iu_len = req->req_it_iu_len;
ch->max_ti_iu_len = it_iu_len;
rsp->buf_fmt = cpu_to_be16(SRP_BUF_FORMAT_DIRECT |
@@ -2353,12 +2432,18 @@ destroy_ib:
free_recv_ring:
srpt_free_ioctx_ring((struct srpt_ioctx **)ch->ioctx_recv_ring,
ch->sport->sdev, ch->rq_size,
- srp_max_req_size, DMA_FROM_DEVICE);
+ ch->req_buf_cache, DMA_FROM_DEVICE);
-free_ring:
+free_recv_cache:
+ kmem_cache_destroy(ch->req_buf_cache);
+
+free_rsp_ring:
srpt_free_ioctx_ring((struct srpt_ioctx **)ch->ioctx_ring,
ch->sport->sdev, ch->rq_size,
- ch->max_rsp_size, DMA_TO_DEVICE);
+ ch->rsp_buf_cache, DMA_TO_DEVICE);
+
+free_rsp_cache:
+ kmem_cache_destroy(ch->rsp_buf_cache);
free_ch:
if (rdma_cm_id)
@@ -2439,6 +2524,7 @@ static int srpt_rdma_cm_req_recv(struct rdma_cm_id *cm_id,
req.req_flags = req_rdma->req_flags;
memcpy(req.initiator_port_id, req_rdma->initiator_port_id, 16);
memcpy(req.target_port_id, req_rdma->target_port_id, 16);
+ req.imm_data_offset = req_rdma->imm_data_offset;
snprintf(src_addr, sizeof(src_addr), "%pIS",
&cm_id->route.addr.src_addr);
@@ -2608,14 +2694,6 @@ static int srpt_rdma_cm_handler(struct rdma_cm_id *cm_id,
return ret;
}
-static int srpt_write_pending_status(struct se_cmd *se_cmd)
-{
- struct srpt_send_ioctx *ioctx;
-
- ioctx = container_of(se_cmd, struct srpt_send_ioctx, cmd);
- return ioctx->state == SRPT_STATE_NEED_DATA;
-}
-
/*
* srpt_write_pending - Start data transfer from initiator to target (write).
*/
@@ -2629,6 +2707,12 @@ static int srpt_write_pending(struct se_cmd *se_cmd)
enum srpt_command_state new_state;
int ret, i;
+ if (ioctx->recv_ioctx) {
+ srpt_set_cmd_state(ioctx, SRPT_STATE_DATA_IN);
+ target_execute_cmd(&ioctx->cmd);
+ return 0;
+ }
+
new_state = srpt_set_cmd_state(ioctx, SRPT_STATE_NEED_DATA);
WARN_ON(new_state == SRPT_STATE_DONE);
@@ -2786,8 +2870,19 @@ static void srpt_queue_tm_rsp(struct se_cmd *cmd)
srpt_queue_response(cmd);
}
+/*
+ * This function is called for aborted commands if no response is sent to the
+ * initiator. Make sure that the credits freed by aborting a command are
+ * returned to the initiator the next time a response is sent by incrementing
+ * ch->req_lim_delta.
+ */
static void srpt_aborted_task(struct se_cmd *cmd)
{
+ struct srpt_send_ioctx *ioctx = container_of(cmd,
+ struct srpt_send_ioctx, cmd);
+ struct srpt_rdma_ch *ch = ioctx->ch;
+
+ atomic_inc(&ch->req_lim_delta);
}
static int srpt_queue_status(struct se_cmd *cmd)
@@ -2908,7 +3003,9 @@ static void srpt_free_srq(struct srpt_device *sdev)
ib_destroy_srq(sdev->srq);
srpt_free_ioctx_ring((struct srpt_ioctx **)sdev->ioctx_ring, sdev,
- sdev->srq_size, srp_max_req_size, DMA_FROM_DEVICE);
+ sdev->srq_size, sdev->req_buf_cache,
+ DMA_FROM_DEVICE);
+ kmem_cache_destroy(sdev->req_buf_cache);
sdev->srq = NULL;
}
@@ -2935,14 +3032,17 @@ static int srpt_alloc_srq(struct srpt_device *sdev)
pr_debug("create SRQ #wr= %d max_allow=%d dev= %s\n", sdev->srq_size,
sdev->device->attrs.max_srq_wr, dev_name(&device->dev));
+ sdev->req_buf_cache = kmem_cache_create("srpt-srq-req-buf",
+ srp_max_req_size, 0, 0, NULL);
+ if (!sdev->req_buf_cache)
+ goto free_srq;
+
sdev->ioctx_ring = (struct srpt_recv_ioctx **)
srpt_alloc_ioctx_ring(sdev, sdev->srq_size,
sizeof(*sdev->ioctx_ring[0]),
- srp_max_req_size, DMA_FROM_DEVICE);
- if (!sdev->ioctx_ring) {
- ib_destroy_srq(srq);
- return -ENOMEM;
- }
+ sdev->req_buf_cache, 0, DMA_FROM_DEVICE);
+ if (!sdev->ioctx_ring)
+ goto free_cache;
sdev->use_srq = true;
sdev->srq = srq;
@@ -2953,6 +3053,13 @@ static int srpt_alloc_srq(struct srpt_device *sdev)
}
return 0;
+
+free_cache:
+ kmem_cache_destroy(sdev->req_buf_cache);
+
+free_srq:
+ ib_destroy_srq(srq);
+ return -ENOMEM;
}
static int srpt_use_srq(struct srpt_device *sdev, bool use_srq)
@@ -3015,9 +3122,8 @@ static void srpt_add_one(struct ib_device *device)
}
/* print out target login information */
- pr_debug("Target login info: id_ext=%016llx,ioc_guid=%016llx,"
- "pkey=ffff,service_id=%016llx\n", srpt_service_guid,
- srpt_service_guid, srpt_service_guid);
+ pr_debug("Target login info: id_ext=%016llx,ioc_guid=%016llx,pkey=ffff,service_id=%016llx\n",
+ srpt_service_guid, srpt_service_guid, srpt_service_guid);
/*
* We do not have a consistent service_id (ie. also id_ext of target_id)
@@ -3147,11 +3253,6 @@ static int srpt_check_false(struct se_portal_group *se_tpg)
return 0;
}
-static char *srpt_get_fabric_name(void)
-{
- return "srpt";
-}
-
static struct srpt_port *srpt_tpg_to_sport(struct se_portal_group *tpg)
{
return tpg->se_tpg_wwn->priv;
@@ -3182,19 +3283,23 @@ static void srpt_release_cmd(struct se_cmd *se_cmd)
struct srpt_send_ioctx *ioctx = container_of(se_cmd,
struct srpt_send_ioctx, cmd);
struct srpt_rdma_ch *ch = ioctx->ch;
- unsigned long flags;
+ struct srpt_recv_ioctx *recv_ioctx = ioctx->recv_ioctx;
WARN_ON_ONCE(ioctx->state != SRPT_STATE_DONE &&
!(ioctx->cmd.transport_state & CMD_T_ABORTED));
+ if (recv_ioctx) {
+ WARN_ON_ONCE(!list_empty(&recv_ioctx->wait_list));
+ ioctx->recv_ioctx = NULL;
+ srpt_post_recv(ch->sport->sdev, ch, recv_ioctx);
+ }
+
if (ioctx->n_rw_ctx) {
srpt_free_rw_ctxs(ch, ioctx);
ioctx->n_rw_ctx = 0;
}
- spin_lock_irqsave(&ch->spinlock, flags);
- list_add(&ioctx->free_list, &ch->free_list);
- spin_unlock_irqrestore(&ch->spinlock, flags);
+ target_free_tag(se_cmd->se_sess, se_cmd);
}
/**
@@ -3572,7 +3677,7 @@ static ssize_t srpt_tpg_enable_show(struct config_item *item, char *page)
struct se_portal_group *se_tpg = to_tpg(item);
struct srpt_port *sport = srpt_tpg_to_sport(se_tpg);
- return snprintf(page, PAGE_SIZE, "%d\n", (sport->enabled) ? 1: 0);
+ return snprintf(page, PAGE_SIZE, "%d\n", sport->enabled);
}
static ssize_t srpt_tpg_enable_store(struct config_item *item,
@@ -3581,7 +3686,7 @@ static ssize_t srpt_tpg_enable_store(struct config_item *item,
struct se_portal_group *se_tpg = to_tpg(item);
struct srpt_port *sport = srpt_tpg_to_sport(se_tpg);
unsigned long tmp;
- int ret;
+ int ret;
ret = kstrtoul(page, 0, &tmp);
if (ret < 0) {
@@ -3617,7 +3722,7 @@ static struct se_portal_group *srpt_make_tpg(struct se_wwn *wwn,
const char *name)
{
struct srpt_port *sport = wwn->priv;
- static struct se_portal_group *tpg;
+ struct se_portal_group *tpg;
int res;
WARN_ON_ONCE(wwn != &sport->port_guid_wwn &&
@@ -3666,7 +3771,7 @@ static void srpt_drop_tport(struct se_wwn *wwn)
static ssize_t srpt_wwn_version_show(struct config_item *item, char *buf)
{
- return scnprintf(buf, PAGE_SIZE, "%s\n", DRV_VERSION);
+ return scnprintf(buf, PAGE_SIZE, "\n");
}
CONFIGFS_ATTR_RO(srpt_wwn_, version);
@@ -3678,8 +3783,7 @@ static struct configfs_attribute *srpt_wwn_attrs[] = {
static const struct target_core_fabric_ops srpt_template = {
.module = THIS_MODULE,
- .name = "srpt",
- .get_fabric_name = srpt_get_fabric_name,
+ .fabric_name = "srpt",
.tpg_get_wwn = srpt_get_fabric_wwn,
.tpg_get_tag = srpt_get_tag,
.tpg_check_demo_mode = srpt_check_false,
@@ -3693,7 +3797,6 @@ static const struct target_core_fabric_ops srpt_template = {
.sess_get_index = srpt_sess_get_index,
.sess_get_initiator_sid = NULL,
.write_pending = srpt_write_pending,
- .write_pending_status = srpt_write_pending_status,
.set_default_node_attributes = srpt_set_default_node_attrs,
.get_cmd_state = srpt_get_tcm_cmd_state,
.queue_data_in = srpt_queue_data_in,
@@ -3730,16 +3833,14 @@ static int __init srpt_init_module(void)
ret = -EINVAL;
if (srp_max_req_size < MIN_MAX_REQ_SIZE) {
- pr_err("invalid value %d for kernel module parameter"
- " srp_max_req_size -- must be at least %d.\n",
+ pr_err("invalid value %d for kernel module parameter srp_max_req_size -- must be at least %d.\n",
srp_max_req_size, MIN_MAX_REQ_SIZE);
goto out;
}
if (srpt_srq_size < MIN_SRPT_SRQ_SIZE
|| srpt_srq_size > MAX_SRPT_SRQ_SIZE) {
- pr_err("invalid value %d for kernel module parameter"
- " srpt_srq_size -- must be in the range [%d..%d].\n",
+ pr_err("invalid value %d for kernel module parameter srpt_srq_size -- must be in the range [%d..%d].\n",
srpt_srq_size, MIN_SRPT_SRQ_SIZE, MAX_SRPT_SRQ_SIZE);
goto out;
}
diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.h b/drivers/infiniband/ulp/srpt/ib_srpt.h
index 444dfd7281b5..ee9f20e9177a 100644
--- a/drivers/infiniband/ulp/srpt/ib_srpt.h
+++ b/drivers/infiniband/ulp/srpt/ib_srpt.h
@@ -104,10 +104,6 @@ enum {
SRP_CMD_ORDERED_Q = 0x2,
SRP_CMD_ACA = 0x4,
- SRP_LOGIN_RSP_MULTICHAN_NO_CHAN = 0x0,
- SRP_LOGIN_RSP_MULTICHAN_TERMINATED = 0x1,
- SRP_LOGIN_RSP_MULTICHAN_MAINTAINED = 0x2,
-
SRPT_DEF_SG_TABLESIZE = 128,
/*
* An experimentally determined value that avoids that QP creation
@@ -124,11 +120,18 @@ enum {
MAX_SRPT_RDMA_SIZE = 1U << 24,
MAX_SRPT_RSP_SIZE = 1024,
+ SRP_MAX_ADD_CDB_LEN = 16,
+ SRP_MAX_IMM_DATA_OFFSET = 80,
+ SRP_MAX_IMM_DATA = 8 * 1024,
MIN_MAX_REQ_SIZE = 996,
- DEFAULT_MAX_REQ_SIZE
- = sizeof(struct srp_cmd)/*48*/
- + sizeof(struct srp_indirect_buf)/*20*/
- + 128 * sizeof(struct srp_direct_buf)/*16*/,
+ DEFAULT_MAX_REQ_SIZE_1 = sizeof(struct srp_cmd)/*48*/ +
+ SRP_MAX_ADD_CDB_LEN +
+ sizeof(struct srp_indirect_buf)/*20*/ +
+ 128 * sizeof(struct srp_direct_buf)/*16*/,
+ DEFAULT_MAX_REQ_SIZE_2 = SRP_MAX_IMM_DATA_OFFSET +
+ sizeof(struct srp_imm_buf) + SRP_MAX_IMM_DATA,
+ DEFAULT_MAX_REQ_SIZE = DEFAULT_MAX_REQ_SIZE_1 > DEFAULT_MAX_REQ_SIZE_2 ?
+ DEFAULT_MAX_REQ_SIZE_1 : DEFAULT_MAX_REQ_SIZE_2,
MIN_MAX_RSP_SIZE = sizeof(struct srp_rsp)/*36*/ + 4,
DEFAULT_MAX_RSP_SIZE = 256, /* leaves 220 bytes for sense data */
@@ -165,12 +168,14 @@ enum srpt_command_state {
* @cqe: Completion queue element.
* @buf: Pointer to the buffer.
* @dma: DMA address of the buffer.
+ * @offset: Offset of the first byte in @buf and @dma that is actually used.
* @index: Index of the I/O context in its ioctx_ring array.
*/
struct srpt_ioctx {
struct ib_cqe cqe;
void *buf;
dma_addr_t dma;
+ uint32_t offset;
uint32_t index;
};
@@ -178,12 +183,14 @@ struct srpt_ioctx {
* struct srpt_recv_ioctx - SRPT receive I/O context
* @ioctx: See above.
* @wait_list: Node for insertion in srpt_rdma_ch.cmd_wait_list.
+ * @byte_len: Number of bytes in @ioctx.buf.
*/
struct srpt_recv_ioctx {
struct srpt_ioctx ioctx;
struct list_head wait_list;
+ int byte_len;
};
-
+
struct srpt_rw_ctx {
struct rdma_rw_ctx rw;
struct scatterlist *sg;
@@ -194,10 +201,12 @@ struct srpt_rw_ctx {
* struct srpt_send_ioctx - SRPT send I/O context
* @ioctx: See above.
* @ch: Channel pointer.
+ * @recv_ioctx: Receive I/O context associated with this send I/O context.
+ * Only used for processing immediate data.
* @s_rw_ctx: @rw_ctxs points here if only a single rw_ctx is needed.
* @rw_ctxs: RDMA read/write contexts.
+ * @imm_sg: Scatterlist for immediate data.
* @rdma_cqe: RDMA completion queue element.
- * @free_list: Node in srpt_rdma_ch.free_list.
* @state: I/O context state.
* @cmd: Target core command data structure.
* @sense_data: SCSI sense data.
@@ -209,12 +218,14 @@ struct srpt_rw_ctx {
struct srpt_send_ioctx {
struct srpt_ioctx ioctx;
struct srpt_rdma_ch *ch;
+ struct srpt_recv_ioctx *recv_ioctx;
struct srpt_rw_ctx s_rw_ctx;
struct srpt_rw_ctx *rw_ctxs;
+ struct scatterlist imm_sg;
+
struct ib_cqe rdma_cqe;
- struct list_head free_list;
enum srpt_command_state state;
struct se_cmd cmd;
u8 n_rdma;
@@ -245,7 +256,10 @@ enum rdma_ch_state {
* struct srpt_rdma_ch - RDMA channel
* @nexus: I_T nexus this channel is associated with.
* @qp: IB queue pair used for communicating over this channel.
- * @cm_id: IB CM ID associated with the channel.
+ * @ib_cm: See below.
+ * @ib_cm.cm_id: IB CM ID associated with the channel.
+ * @rdma_cm: See below.
+ * @rdma_cm.cm_id: RDMA CM ID associated with the channel.
* @cq: IB completion queue for this channel.
* @zw_cqe: Zero-length write CQE.
* @rcu: RCU head.
@@ -259,12 +273,14 @@ enum rdma_ch_state {
* @req_lim: request limit: maximum number of requests that may be sent
* by the initiator without having received a response.
* @req_lim_delta: Number of credits not yet sent back to the initiator.
+ * @imm_data_offset: Offset from start of SRP_CMD for immediate data.
* @spinlock: Protects free_list and state.
- * @free_list: Head of list with free send I/O contexts.
* @state: channel state. See also enum rdma_ch_state.
* @using_rdma_cm: Whether the RDMA/CM or IB/CM is used for this channel.
* @processing_wait_list: Whether or not cmd_wait_list is being processed.
+ * @rsp_buf_cache: kmem_cache for @ioctx_ring.
* @ioctx_ring: Send ring.
+ * @req_buf_cache: kmem_cache for @ioctx_recv_ring.
* @ioctx_recv_ring: Receive I/O context ring.
* @list: Node in srpt_nexus.ch_list.
* @cmd_wait_list: List of SCSI commands that arrived before the RTU event. This
@@ -297,10 +313,12 @@ struct srpt_rdma_ch {
int max_ti_iu_len;
atomic_t req_lim;
atomic_t req_lim_delta;
+ u16 imm_data_offset;
spinlock_t spinlock;
- struct list_head free_list;
enum rdma_ch_state state;
+ struct kmem_cache *rsp_buf_cache;
struct srpt_send_ioctx **ioctx_ring;
+ struct kmem_cache *req_buf_cache;
struct srpt_recv_ioctx **ioctx_recv_ring;
struct list_head list;
struct list_head cmd_wait_list;
@@ -395,6 +413,7 @@ struct srpt_port {
* @srq_size: SRQ size.
* @sdev_mutex: Serializes use_srq changes.
* @use_srq: Whether or not to use SRQ.
+ * @req_buf_cache: kmem_cache for @ioctx_ring buffers.
* @ioctx_ring: Per-HCA SRQ.
* @event_handler: Per-HCA asynchronous IB event handler.
* @list: Node in srpt_dev_list.
@@ -409,6 +428,7 @@ struct srpt_device {
int srq_size;
struct mutex sdev_mutex;
bool use_srq;
+ struct kmem_cache *req_buf_cache;
struct srpt_recv_ioctx **ioctx_ring;
struct ib_event_handler event_handler;
struct list_head list;