From 44e43d91ad4731d9e2e70c60eecc5982d6671e8c Mon Sep 17 00:00:00 2001 From: Mitko Haralanov Date: Thu, 24 Jan 2019 06:09:46 -0800 Subject: IB/hfi1: OPFN support discovery OPFN (Omni Path Feature Negotiation) support discovery allows a RC QP to announce that it supports OPFN and also discover if OPFN is supported by the peer QP. OPFN parameter negotiation is skipped unless OPFN support is first discovered. OPFN support is announced by claiming what was the reserved bit in dword 1 of OmniPath modified base transport header in requests and responses. Reviewed-by: Mike Marciniszyn Signed-off-by: Ashutosh Dixit Signed-off-by: Mitko Haralanov Signed-off-by: Kaike Wan Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/rc.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) (limited to 'drivers/infiniband/hw/hfi1/rc.c') diff --git a/drivers/infiniband/hw/hfi1/rc.c b/drivers/infiniband/hw/hfi1/rc.c index be603f35d7e4..940e9236c328 100644 --- a/drivers/infiniband/hw/hfi1/rc.c +++ b/drivers/infiniband/hw/hfi1/rc.c @@ -89,8 +89,8 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp, struct rvt_ack_entry *e; u32 hwords; u32 len; - u32 bth0; - u32 bth2; + u32 bth0, bth2; + u32 bth1 = qp->remote_qpn | (HFI1_CAP_IS_KSET(OPFN) << IB_BTHE_E_SHIFT); int middle = 0; u32 pmtu = qp->pmtu; struct hfi1_qp_priv *priv = qp->priv; @@ -229,7 +229,7 @@ normal: ps->s_txreq->sde = priv->s_sde; ps->s_txreq->s_cur_size = len; ps->s_txreq->hdr_dwords = hwords; - hfi1_make_ruc_header(qp, ohdr, bth0, bth2, middle, ps); + hfi1_make_ruc_header(qp, ohdr, bth0, bth1, bth2, middle, ps); return 1; bail: @@ -262,8 +262,8 @@ int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) struct rvt_swqe *wqe; u32 hwords; u32 len; - u32 bth0 = 0; - u32 bth2; + u32 bth0 = 0, bth2; + u32 bth1 = qp->remote_qpn | (HFI1_CAP_IS_KSET(OPFN) << IB_BTHE_E_SHIFT); u32 pmtu = qp->pmtu; char newreq; int middle = 0; @@ -693,6 +693,7 @@ no_flow_control: qp, ohdr, bth0 | (qp->s_state << 24), + bth1, bth2, middle, ps); @@ -796,6 +797,11 @@ static inline void hfi1_make_rc_ack_9B(struct hfi1_packet *packet, if (qp->s_mig_state == IB_MIG_MIGRATED) bth0 |= IB_BTH_MIG_REQ; bth1 = (!!is_fecn) << IB_BECN_SHIFT; + /* + * Inline ACKs go out without the use of the Verbs send engine, so + * we need to set the STL Verbs Extended bit here + */ + bth1 |= HFI1_CAP_IS_KSET(OPFN) << IB_BTHE_E_SHIFT; hfi1_make_bth_aeth(qp, ohdr, bth0, bth1); } -- cgit v1.2.3-59-g8ed1b From ddf922c31fedd19c5b89a269c35e5c8b68c64327 Mon Sep 17 00:00:00 2001 From: Kaike Wan Date: Wed, 23 Jan 2019 19:21:01 -0800 Subject: IB/hfi1, IB/rdmavt: Allow for extending of QP's s_ack_queue The OPFN protocol uses the COMPARE_SWAP request to exchange data between the requester and the responder and therefore needs to be stored in the QP's s_ack_queue when the request is received on the responder side. However, because the user does not know anything about the OPFN protocol, this extra entry in the queue cannot be advertised to the user. This patch adds an extra entry in a QP's s_ack_queue. Reviewed-by: Mike Marciniszyn Signed-off-by: Mitko Haralanov Signed-off-by: Kaike Wan Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/rc.c | 13 +++++++------ drivers/infiniband/hw/hfi1/verbs.c | 1 + include/rdma/rdma_vt.h | 10 +++++++++- 3 files changed, 17 insertions(+), 7 deletions(-) (limited to 'drivers/infiniband/hw/hfi1/rc.c') diff --git a/drivers/infiniband/hw/hfi1/rc.c b/drivers/infiniband/hw/hfi1/rc.c index 940e9236c328..8970fc7ffd4b 100644 --- a/drivers/infiniband/hw/hfi1/rc.c +++ b/drivers/infiniband/hw/hfi1/rc.c @@ -122,7 +122,8 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp, * response has been sent instead of only being * constructed. */ - if (++qp->s_tail_ack_queue > HFI1_MAX_RDMA_ATOMIC) + if (++qp->s_tail_ack_queue > + rvt_size_atomic(ib_to_rvt(qp->ibqp.device))) qp->s_tail_ack_queue = 0; /* FALLTHROUGH */ case OP(SEND_ONLY): @@ -1818,7 +1819,7 @@ static noinline int rc_rcv_error(struct ib_other_headers *ohdr, void *data, if (i) prev = i - 1; else - prev = HFI1_MAX_RDMA_ATOMIC; + prev = rvt_size_atomic(ib_to_rvt(qp->ibqp.device)); if (prev == qp->r_head_ack_queue) { e = NULL; break; @@ -1942,7 +1943,7 @@ static inline void update_ack_queue(struct rvt_qp *qp, unsigned n) unsigned next; next = n + 1; - if (next > HFI1_MAX_RDMA_ATOMIC) + if (next > rvt_size_atomic(ib_to_rvt(qp->ibqp.device))) next = 0; qp->s_tail_ack_queue = next; qp->s_ack_state = OP(ACKNOWLEDGE); @@ -2298,8 +2299,8 @@ send_last: if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_READ))) goto nack_inv; next = qp->r_head_ack_queue + 1; - /* s_ack_queue is size HFI1_MAX_RDMA_ATOMIC+1 so use > not >= */ - if (next > HFI1_MAX_RDMA_ATOMIC) + /* s_ack_queue is size rvt_size_atomic()+1 so use > not >= */ + if (next > rvt_size_atomic(ib_to_rvt(qp->ibqp.device))) next = 0; spin_lock_irqsave(&qp->s_lock, flags); if (unlikely(next == qp->s_tail_ack_queue)) { @@ -2373,7 +2374,7 @@ send_last: if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC))) goto nack_inv; next = qp->r_head_ack_queue + 1; - if (next > HFI1_MAX_RDMA_ATOMIC) + if (next > rvt_size_atomic(ib_to_rvt(qp->ibqp.device))) next = 0; spin_lock_irqsave(&qp->s_lock, flags); if (unlikely(next == qp->s_tail_ack_queue)) { diff --git a/drivers/infiniband/hw/hfi1/verbs.c b/drivers/infiniband/hw/hfi1/verbs.c index c980345cf1e1..ec3899c0874c 100644 --- a/drivers/infiniband/hw/hfi1/verbs.c +++ b/drivers/infiniband/hw/hfi1/verbs.c @@ -1735,6 +1735,7 @@ int hfi1_register_ib_device(struct hfi1_devdata *dd) dd->verbs_dev.rdi.dparms.sge_copy_mode = sge_copy_mode; dd->verbs_dev.rdi.dparms.wss_threshold = wss_threshold; dd->verbs_dev.rdi.dparms.wss_clean_period = wss_clean_period; + dd->verbs_dev.rdi.dparms.extra_rdma_atomic = 1; /* post send table */ dd->verbs_dev.rdi.post_parms = hfi1_post_parms; diff --git a/include/rdma/rdma_vt.h b/include/rdma/rdma_vt.h index acb3bc96dfa7..168e40be183c 100644 --- a/include/rdma/rdma_vt.h +++ b/include/rdma/rdma_vt.h @@ -182,6 +182,7 @@ struct rvt_driver_params { u32 max_mad_size; u8 qos_shift; u8 max_rdma_atomic; + u8 extra_rdma_atomic; u8 reserved_operations; }; @@ -519,7 +520,14 @@ static inline unsigned rvt_get_npkeys(struct rvt_dev_info *rdi) */ static inline unsigned int rvt_max_atomic(struct rvt_dev_info *rdi) { - return rdi->dparms.max_rdma_atomic + 1; + return rdi->dparms.max_rdma_atomic + + rdi->dparms.extra_rdma_atomic + 1; +} + +static inline unsigned int rvt_size_atomic(struct rvt_dev_info *rdi) +{ + return rdi->dparms.max_rdma_atomic + + rdi->dparms.extra_rdma_atomic; } /* -- cgit v1.2.3-59-g8ed1b From 48a615dc00aed68d58244b835b10eb3244aae31d Mon Sep 17 00:00:00 2001 From: Kaike Wan Date: Wed, 23 Jan 2019 19:21:11 -0800 Subject: IB/hfi1: Integrate OPFN into RC transactions OPFN parameter negotiation allows a pair of connected RC QPs to exchange a set of parameters in succession. This negotiation does not commence till the first ULP request. Because OPFN operations are operations private to the driver, they do not generate user completions or put the QP into error when they run out of retries. This patch integrates the OPFN protocol into the transactions of an RC QP. Reviewed-by: Mike Marciniszyn Signed-off-by: Ashutosh Dixit Signed-off-by: Mitko Haralanov Signed-off-by: Kaike Wan Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/init.c | 7 +++++ drivers/infiniband/hw/hfi1/qp.c | 13 ++++++++ drivers/infiniband/hw/hfi1/rc.c | 58 +++++++++++++++++++++++++++++------ drivers/infiniband/hw/hfi1/tid_rdma.c | 11 +++++++ drivers/infiniband/hw/hfi1/tid_rdma.h | 1 + drivers/infiniband/hw/hfi1/verbs.c | 1 + 6 files changed, 81 insertions(+), 10 deletions(-) (limited to 'drivers/infiniband/hw/hfi1/rc.c') diff --git a/drivers/infiniband/hw/hfi1/init.c b/drivers/infiniband/hw/hfi1/init.c index 09c898d0975c..a8dbd0f191f5 100644 --- a/drivers/infiniband/hw/hfi1/init.c +++ b/drivers/infiniband/hw/hfi1/init.c @@ -1498,6 +1498,12 @@ static int __init hfi1_mod_init(void) /* sanitize link CRC options */ link_crc_mask &= SUPPORTED_CRCS; + ret = opfn_init(); + if (ret < 0) { + pr_err("Failed to allocate opfn_wq"); + goto bail_dev; + } + /* * These must be called before the driver is registered with * the PCI subsystem. @@ -1528,6 +1534,7 @@ module_init(hfi1_mod_init); static void __exit hfi1_mod_cleanup(void) { pci_unregister_driver(&hfi1_pci_driver); + opfn_exit(); node_affinity_destroy_all(); hfi1_dbg_exit(); diff --git a/drivers/infiniband/hw/hfi1/qp.c b/drivers/infiniband/hw/hfi1/qp.c index 5344e8993b28..f822f92b415f 100644 --- a/drivers/infiniband/hw/hfi1/qp.c +++ b/drivers/infiniband/hw/hfi1/qp.c @@ -132,6 +132,12 @@ const struct rvt_operation_params hfi1_post_parms[RVT_OPERATION_MAX] = { .qpt_support = BIT(IB_QPT_RC), }, +[IB_WR_OPFN] = { + .length = sizeof(struct ib_atomic_wr), + .qpt_support = BIT(IB_QPT_RC), + .flags = RVT_OPERATION_USE_RESERVE, +}, + }; static void flush_list_head(struct list_head *l) @@ -285,6 +291,8 @@ void hfi1_modify_qp(struct rvt_qp *qp, struct ib_qp_attr *attr, priv->s_sendcontext = qp_to_send_context(qp, priv->s_sc); qp_set_16b(qp); } + + opfn_qp_init(qp, attr, attr_mask); } /** @@ -696,6 +704,7 @@ void qp_priv_free(struct rvt_dev_info *rdi, struct rvt_qp *qp) { struct hfi1_qp_priv *priv = qp->priv; + hfi1_qp_priv_tid_free(rdi, qp); kfree(priv->s_ahg); kfree(priv); } @@ -751,6 +760,10 @@ void notify_qp_reset(struct rvt_qp *qp) { qp->r_adefered = 0; clear_ahg(qp); + + /* Clear any OPFN state */ + if (qp->ibqp.qp_type == IB_QPT_RC) + opfn_conn_error(qp); } /* diff --git a/drivers/infiniband/hw/hfi1/rc.c b/drivers/infiniband/hw/hfi1/rc.c index 8970fc7ffd4b..092d5eba980f 100644 --- a/drivers/infiniband/hw/hfi1/rc.c +++ b/drivers/infiniband/hw/hfi1/rc.c @@ -57,6 +57,10 @@ /* cut down ridiculously long IB macro names */ #define OP(x) RC_OP(x) +static struct rvt_swqe *do_rc_completion(struct rvt_qp *qp, + struct rvt_swqe *wqe, + struct hfi1_ibport *ibp); + static u32 restart_sge(struct rvt_sge_state *ss, struct rvt_swqe *wqe, u32 psn, u32 pmtu) { @@ -517,10 +521,14 @@ no_flow_control: goto bail; } qp->s_num_rd_atomic++; - if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT)) - qp->s_lsn++; } - if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP) { + + /* FALLTHROUGH */ + case IB_WR_OPFN: + if (newreq && !(qp->s_flags & RVT_S_UNLIMITED_CREDIT)) + qp->s_lsn++; + if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP || + wqe->wr.opcode == IB_WR_OPFN) { qp->s_state = OP(COMPARE_SWAP); put_ib_ateth_swap(wqe->atomic_wr.swap, &ohdr->u.atomic_eth); @@ -1040,6 +1048,7 @@ done: */ void hfi1_restart_rc(struct rvt_qp *qp, u32 psn, int wait) { + struct hfi1_qp_priv *priv = qp->priv; struct rvt_swqe *wqe = rvt_get_swqe_ptr(qp, qp->s_acked); struct hfi1_ibport *ibp; @@ -1050,8 +1059,26 @@ void hfi1_restart_rc(struct rvt_qp *qp, u32 psn, int wait) hfi1_migrate_qp(qp); qp->s_retry = qp->s_retry_cnt; } else if (qp->s_last == qp->s_acked) { - rvt_send_complete(qp, wqe, IB_WC_RETRY_EXC_ERR); - rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR); + /* + * We need special handling for the OPFN request WQEs as + * they are not allowed to generate real user errors + */ + if (wqe->wr.opcode == IB_WR_OPFN) { + struct hfi1_ibport *ibp = + to_iport(qp->ibqp.device, qp->port_num); + /* + * Call opfn_conn_reply() with capcode and + * remaining data as 0 to close out the + * current request + */ + opfn_conn_reply(qp, priv->opfn.curr); + wqe = do_rc_completion(qp, wqe, ibp); + qp->s_flags &= ~RVT_S_WAIT_ACK; + } else { + rvt_send_complete(qp, wqe, + IB_WC_RETRY_EXC_ERR); + rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR); + } return; } else { /* need to handle delayed completion */ return; @@ -1363,6 +1390,9 @@ static int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode, u64 *vaddr = wqe->sg_list[0].vaddr; *vaddr = val; } + if (wqe->wr.opcode == IB_WR_OPFN) + opfn_conn_reply(qp, val); + if (qp->s_num_rd_atomic && (wqe->wr.opcode == IB_WR_RDMA_READ || wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP || @@ -2068,6 +2098,7 @@ void hfi1_rc_rcv(struct hfi1_packet *packet) return; fecn = process_ecn(qp, packet); + opfn_trigger_conn_request(qp, be32_to_cpu(ohdr->bth[1])); /* * Process responses (ACKs) before anything else. Note that the @@ -2363,15 +2394,18 @@ send_last: case OP(COMPARE_SWAP): case OP(FETCH_ADD): { - struct ib_atomic_eth *ateth; + struct ib_atomic_eth *ateth = &ohdr->u.atomic_eth; + u64 vaddr = get_ib_ateth_vaddr(ateth); + bool opfn = opcode == OP(COMPARE_SWAP) && + vaddr == HFI1_VERBS_E_ATOMIC_VADDR; struct rvt_ack_entry *e; - u64 vaddr; atomic64_t *maddr; u64 sdata; u32 rkey; u8 next; - if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC))) + if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC) && + !opfn)) goto nack_inv; next = qp->r_head_ack_queue + 1; if (next > rvt_size_atomic(ib_to_rvt(qp->ibqp.device))) @@ -2387,8 +2421,11 @@ send_last: rvt_put_mr(e->rdma_sge.mr); e->rdma_sge.mr = NULL; } - ateth = &ohdr->u.atomic_eth; - vaddr = get_ib_ateth_vaddr(ateth); + /* Process OPFN special virtual address */ + if (opfn) { + opfn_conn_response(qp, e, ateth); + goto ack; + } if (unlikely(vaddr & (sizeof(u64) - 1))) goto nack_inv_unlck; rkey = be32_to_cpu(ateth->rkey); @@ -2407,6 +2444,7 @@ send_last: sdata); rvt_put_mr(qp->r_sge.sge.mr); qp->r_sge.num_sge = 0; +ack: e->opcode = opcode; e->sent = 0; e->psn = psn; diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.c b/drivers/infiniband/hw/hfi1/tid_rdma.c index a8fd66f31fee..0c9f313d6229 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.c +++ b/drivers/infiniband/hw/hfi1/tid_rdma.c @@ -246,5 +246,16 @@ int hfi1_qp_priv_init(struct rvt_dev_info *rdi, struct rvt_qp *qp, qpriv->rcd = qp_to_rcd(rdi, qp); + spin_lock_init(&qpriv->opfn.lock); + INIT_WORK(&qpriv->opfn.opfn_work, opfn_send_conn_request); + return 0; } + +void hfi1_qp_priv_tid_free(struct rvt_dev_info *rdi, struct rvt_qp *qp) +{ + struct hfi1_qp_priv *priv = qp->priv; + + if (qp->ibqp.qp_type == IB_QPT_RC && HFI1_CAP_IS_KSET(TID_RDMA)) + cancel_work_sync(&priv->opfn.opfn_work); +} diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.h b/drivers/infiniband/hw/hfi1/tid_rdma.h index 18c6d4333f1e..ee8151558e3f 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.h +++ b/drivers/infiniband/hw/hfi1/tid_rdma.h @@ -35,5 +35,6 @@ int hfi1_kern_exp_rcv_init(struct hfi1_ctxtdata *rcd, int reinit); int hfi1_qp_priv_init(struct rvt_dev_info *rdi, struct rvt_qp *qp, struct ib_qp_init_attr *init_attr); +void hfi1_qp_priv_tid_free(struct rvt_dev_info *rdi, struct rvt_qp *qp); #endif /* HFI1_TID_RDMA_H */ diff --git a/drivers/infiniband/hw/hfi1/verbs.c b/drivers/infiniband/hw/hfi1/verbs.c index ec3899c0874c..571bfd549c2a 100644 --- a/drivers/infiniband/hw/hfi1/verbs.c +++ b/drivers/infiniband/hw/hfi1/verbs.c @@ -1735,6 +1735,7 @@ int hfi1_register_ib_device(struct hfi1_devdata *dd) dd->verbs_dev.rdi.dparms.sge_copy_mode = sge_copy_mode; dd->verbs_dev.rdi.dparms.wss_threshold = wss_threshold; dd->verbs_dev.rdi.dparms.wss_clean_period = wss_clean_period; + dd->verbs_dev.rdi.dparms.reserved_operations = 1; dd->verbs_dev.rdi.dparms.extra_rdma_atomic = 1; /* post send table */ -- cgit v1.2.3-59-g8ed1b From 385156c5f2a61834666f079ee66338f177c65c28 Mon Sep 17 00:00:00 2001 From: Kaike Wan Date: Wed, 23 Jan 2019 19:29:44 -0800 Subject: IB/hfi: Move RC functions into a header file This patch moves some RC helper functions into a header file so that they can be called from both RC and TID RDMA functions. In addition, a common function for rewinding a request is created in rdmavt so that it can be shared between qib and hfi1 driver. Reviewed-by: Mike Marciniszyn Signed-off-by: Mitko Haralanov Signed-off-by: Kaike Wan Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/rc.c | 119 +++++++++++++++---------------------- drivers/infiniband/hw/hfi1/rc.h | 50 ++++++++++++++++ drivers/infiniband/hw/qib/qib_rc.c | 7 +-- drivers/infiniband/sw/rdmavt/rc.c | 13 ++++ include/rdma/rdmavt_qp.h | 10 ++++ 5 files changed, 123 insertions(+), 76 deletions(-) create mode 100644 drivers/infiniband/hw/hfi1/rc.h (limited to 'drivers/infiniband/hw/hfi1/rc.c') diff --git a/drivers/infiniband/hw/hfi1/rc.c b/drivers/infiniband/hw/hfi1/rc.c index 092d5eba980f..6e74cd3814b8 100644 --- a/drivers/infiniband/hw/hfi1/rc.c +++ b/drivers/infiniband/hw/hfi1/rc.c @@ -51,28 +51,48 @@ #include "hfi.h" #include "qp.h" +#include "rc.h" #include "verbs_txreq.h" #include "trace.h" -/* cut down ridiculously long IB macro names */ -#define OP(x) RC_OP(x) - -static struct rvt_swqe *do_rc_completion(struct rvt_qp *qp, - struct rvt_swqe *wqe, - struct hfi1_ibport *ibp); - -static u32 restart_sge(struct rvt_sge_state *ss, struct rvt_swqe *wqe, - u32 psn, u32 pmtu) +struct rvt_ack_entry *find_prev_entry(struct rvt_qp *qp, u32 psn, u8 *prev, + u8 *prev_ack, bool *scheduled) + __must_hold(&qp->s_lock) { - u32 len; + struct rvt_ack_entry *e = NULL; + u8 i, p; + bool s = true; - len = delta_psn(psn, wqe->psn) * pmtu; - ss->sge = wqe->sg_list[0]; - ss->sg_list = wqe->sg_list + 1; - ss->num_sge = wqe->wr.num_sge; - ss->total_len = wqe->length; - rvt_skip_sge(ss, len, false); - return wqe->length - len; + for (i = qp->r_head_ack_queue; ; i = p) { + if (i == qp->s_tail_ack_queue) + s = false; + if (i) + p = i - 1; + else + p = rvt_size_atomic(ib_to_rvt(qp->ibqp.device)); + if (p == qp->r_head_ack_queue) { + e = NULL; + break; + } + e = &qp->s_ack_queue[p]; + if (!e->opcode) { + e = NULL; + break; + } + if (cmp_psn(psn, e->psn) >= 0) { + if (p == qp->s_tail_ack_queue && + cmp_psn(psn, e->lpsn) <= 0) + s = false; + break; + } + } + if (prev) + *prev = p; + if (prev_ack) + *prev_ack = i; + if (scheduled) + *scheduled = s; + return e; } /** @@ -1229,9 +1249,9 @@ static inline void update_last_psn(struct rvt_qp *qp, u32 psn) * This is similar to hfi1_send_complete but has to check to be sure * that the SGEs are not being referenced if the SWQE is being resent. */ -static struct rvt_swqe *do_rc_completion(struct rvt_qp *qp, - struct rvt_swqe *wqe, - struct hfi1_ibport *ibp) +struct rvt_swqe *do_rc_completion(struct rvt_qp *qp, + struct rvt_swqe *wqe, + struct hfi1_ibport *ibp) { lockdep_assert_held(&qp->s_lock); /* @@ -1314,8 +1334,8 @@ static struct rvt_swqe *do_rc_completion(struct rvt_qp *qp, * May be called at interrupt level, with the QP s_lock held. * Returns 1 if OK, 0 if current operation should be aborted (NAK). */ -static int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode, - u64 val, struct hfi1_ctxtdata *rcd) +int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode, + u64 val, struct hfi1_ctxtdata *rcd) { struct hfi1_ibport *ibp; enum ib_wc_status status; @@ -1754,16 +1774,6 @@ bail: return; } -static inline void rc_defered_ack(struct hfi1_ctxtdata *rcd, - struct rvt_qp *qp) -{ - if (list_empty(&qp->rspwait)) { - qp->r_flags |= RVT_R_RSP_NAK; - rvt_get_qp(qp); - list_add_tail(&qp->rspwait, &rcd->qp_wait_list); - } -} - static inline void rc_cancel_ack(struct rvt_qp *qp) { qp->r_adefered = 0; @@ -1796,8 +1806,9 @@ static noinline int rc_rcv_error(struct ib_other_headers *ohdr, void *data, struct hfi1_ibport *ibp = rcd_to_iport(rcd); struct rvt_ack_entry *e; unsigned long flags; - u8 i, prev; - int old_req; + u8 prev; + u8 mra; /* most recent ACK */ + bool old_req; trace_hfi1_rcv_error(qp, psn); if (diff > 0) { @@ -1843,29 +1854,8 @@ static noinline int rc_rcv_error(struct ib_other_headers *ohdr, void *data, spin_lock_irqsave(&qp->s_lock, flags); - for (i = qp->r_head_ack_queue; ; i = prev) { - if (i == qp->s_tail_ack_queue) - old_req = 0; - if (i) - prev = i - 1; - else - prev = rvt_size_atomic(ib_to_rvt(qp->ibqp.device)); - if (prev == qp->r_head_ack_queue) { - e = NULL; - break; - } - e = &qp->s_ack_queue[prev]; - if (!e->opcode) { - e = NULL; - break; - } - if (cmp_psn(psn, e->psn) >= 0) { - if (prev == qp->s_tail_ack_queue && - cmp_psn(psn, e->lpsn) <= 0) - old_req = 0; - break; - } - } + e = find_prev_entry(qp, psn, &prev, &mra, &old_req); + switch (opcode) { case OP(RDMA_READ_REQUEST): { struct ib_reth *reth; @@ -1940,7 +1930,7 @@ static noinline int rc_rcv_error(struct ib_other_headers *ohdr, void *data, * Resend the most recent ACK if this request is * after all the previous RDMA reads and atomics. */ - if (i == qp->r_head_ack_queue) { + if (mra == qp->r_head_ack_queue) { spin_unlock_irqrestore(&qp->s_lock, flags); qp->r_nak_state = 0; qp->r_ack_psn = qp->r_psn - 1; @@ -1951,7 +1941,7 @@ static noinline int rc_rcv_error(struct ib_other_headers *ohdr, void *data, * Resend the RDMA read or atomic op which * ACKs this duplicate request. */ - qp->s_tail_ack_queue = i; + qp->s_tail_ack_queue = mra; break; } qp->s_ack_state = OP(ACKNOWLEDGE); @@ -1968,17 +1958,6 @@ send_ack: return 0; } -static inline void update_ack_queue(struct rvt_qp *qp, unsigned n) -{ - unsigned next; - - next = n + 1; - if (next > rvt_size_atomic(ib_to_rvt(qp->ibqp.device))) - next = 0; - qp->s_tail_ack_queue = next; - qp->s_ack_state = OP(ACKNOWLEDGE); -} - static void log_cca_event(struct hfi1_pportdata *ppd, u8 sl, u32 rlid, u32 lqpn, u32 rqpn, u8 svc_type) { diff --git a/drivers/infiniband/hw/hfi1/rc.h b/drivers/infiniband/hw/hfi1/rc.h new file mode 100644 index 000000000000..4329eadcb3df --- /dev/null +++ b/drivers/infiniband/hw/hfi1/rc.h @@ -0,0 +1,50 @@ +/* SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) */ +/* + * Copyright(c) 2018 Intel Corporation. + * + */ + +#ifndef HFI1_RC_H +#define HFI1_RC_H + +/* cut down ridiculously long IB macro names */ +#define OP(x) IB_OPCODE_RC_##x + +static inline void update_ack_queue(struct rvt_qp *qp, unsigned int n) +{ + unsigned int next; + + next = n + 1; + if (next > rvt_size_atomic(ib_to_rvt(qp->ibqp.device))) + next = 0; + qp->s_tail_ack_queue = next; + qp->s_ack_state = OP(ACKNOWLEDGE); +} + +static inline void rc_defered_ack(struct hfi1_ctxtdata *rcd, + struct rvt_qp *qp) +{ + if (list_empty(&qp->rspwait)) { + qp->r_flags |= RVT_R_RSP_NAK; + rvt_get_qp(qp); + list_add_tail(&qp->rspwait, &rcd->qp_wait_list); + } +} + +static inline u32 restart_sge(struct rvt_sge_state *ss, struct rvt_swqe *wqe, + u32 psn, u32 pmtu) +{ + u32 len; + + len = delta_psn(psn, wqe->psn) * pmtu; + return rvt_restart_sge(ss, wqe, len); +} + +struct rvt_ack_entry *find_prev_entry(struct rvt_qp *qp, u32 psn, u8 *prev, + u8 *prev_ack, bool *scheduled); +int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode, u64 val, + struct hfi1_ctxtdata *rcd); +struct rvt_swqe *do_rc_completion(struct rvt_qp *qp, struct rvt_swqe *wqe, + struct hfi1_ibport *ibp); + +#endif /* HFI1_RC_H */ diff --git a/drivers/infiniband/hw/qib/qib_rc.c b/drivers/infiniband/hw/qib/qib_rc.c index 6fa002940451..50dd9811b088 100644 --- a/drivers/infiniband/hw/qib/qib_rc.c +++ b/drivers/infiniband/hw/qib/qib_rc.c @@ -45,12 +45,7 @@ static u32 restart_sge(struct rvt_sge_state *ss, struct rvt_swqe *wqe, u32 len; len = ((psn - wqe->psn) & QIB_PSN_MASK) * pmtu; - ss->sge = wqe->sg_list[0]; - ss->sg_list = wqe->sg_list + 1; - ss->num_sge = wqe->wr.num_sge; - ss->total_len = wqe->length; - rvt_skip_sge(ss, len, false); - return wqe->length - len; + return rvt_restart_sge(ss, wqe, len); } /** diff --git a/drivers/infiniband/sw/rdmavt/rc.c b/drivers/infiniband/sw/rdmavt/rc.c index 6131cc558bdb..8d71647820a8 100644 --- a/drivers/infiniband/sw/rdmavt/rc.c +++ b/drivers/infiniband/sw/rdmavt/rc.c @@ -187,3 +187,16 @@ void rvt_get_credit(struct rvt_qp *qp, u32 aeth) } } EXPORT_SYMBOL(rvt_get_credit); + +/* rvt_restart_sge - rewind the sge state for a wqe */ +u32 rvt_restart_sge(struct rvt_sge_state *ss, struct rvt_swqe *wqe, u32 len) +{ + ss->sge = wqe->sg_list[0]; + ss->sg_list = wqe->sg_list + 1; + ss->num_sge = wqe->wr.num_sge; + ss->total_len = wqe->length; + rvt_skip_sge(ss, len, false); + return wqe->length - len; +} +EXPORT_SYMBOL(rvt_restart_sge); + diff --git a/include/rdma/rdmavt_qp.h b/include/rdma/rdmavt_qp.h index cbafb1878669..56a9221378d9 100644 --- a/include/rdma/rdmavt_qp.h +++ b/include/rdma/rdmavt_qp.h @@ -628,6 +628,16 @@ __be32 rvt_compute_aeth(struct rvt_qp *qp); */ void rvt_get_credit(struct rvt_qp *qp, u32 aeth); +/** + * rvt_restart_sge - rewind the sge state for a wqe + * @ss: the sge state pointer + * @wqe: the wqe to rewind + * @len: the data length from the start of the wqe in bytes + * + * Returns the remaining data length. + */ +u32 rvt_restart_sge(struct rvt_sge_state *ss, struct rvt_swqe *wqe, u32 len); + /** * @qp - the qp pair * @len - the length -- cgit v1.2.3-59-g8ed1b From b126078e8957f3aea4a44b8916f2f3752b5c392d Mon Sep 17 00:00:00 2001 From: Kaike Wan Date: Wed, 23 Jan 2019 19:31:46 -0800 Subject: IB/hfi1: Add functions for restarting TID RDMA READ request This patch adds functions to retry TID RDMA READ request. Since TID RDMA READ request could be retried from any segment boundary, it requires a number of tracking fields in various structures and those fields should be reset properly. The qp->s_num_rd_atomic field is reset before retry and therefore should be incremented for each new or retried RDMA READ or atomic request. Reviewed-by: Mike Marciniszyn Signed-off-by: Kaike Wan Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/rc.c | 94 +++++++++++++++++++++++++++-------- drivers/infiniband/hw/hfi1/tid_rdma.c | 82 ++++++++++++++++++++++++++++++ drivers/infiniband/hw/hfi1/tid_rdma.h | 2 + 3 files changed, 158 insertions(+), 20 deletions(-) (limited to 'drivers/infiniband/hw/hfi1/rc.c') diff --git a/drivers/infiniband/hw/hfi1/rc.c b/drivers/infiniband/hw/hfi1/rc.c index 6e74cd3814b8..e478a0b93eb9 100644 --- a/drivers/infiniband/hw/hfi1/rc.c +++ b/drivers/infiniband/hw/hfi1/rc.c @@ -503,16 +503,14 @@ no_flow_control: * Don't allow more operations to be started * than the QP limits allow. */ - if (newreq) { - if (qp->s_num_rd_atomic >= - qp->s_max_rd_atomic) { - qp->s_flags |= RVT_S_WAIT_RDMAR; - goto bail; - } - qp->s_num_rd_atomic++; - if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT)) - qp->s_lsn++; + if (qp->s_num_rd_atomic >= + qp->s_max_rd_atomic) { + qp->s_flags |= RVT_S_WAIT_RDMAR; + goto bail; } + qp->s_num_rd_atomic++; + if (newreq && !(qp->s_flags & RVT_S_UNLIMITED_CREDIT)) + qp->s_lsn++; put_ib_reth_vaddr( wqe->rdma_wr.remote_addr, &ohdr->u.rc.reth); @@ -534,14 +532,12 @@ no_flow_control: * Don't allow more operations to be started * than the QP limits allow. */ - if (newreq) { - if (qp->s_num_rd_atomic >= - qp->s_max_rd_atomic) { - qp->s_flags |= RVT_S_WAIT_RDMAR; - goto bail; - } - qp->s_num_rd_atomic++; + if (qp->s_num_rd_atomic >= + qp->s_max_rd_atomic) { + qp->s_flags |= RVT_S_WAIT_RDMAR; + goto bail; } + qp->s_num_rd_atomic++; /* FALLTHROUGH */ case IB_WR_OPFN: @@ -970,6 +966,43 @@ void hfi1_send_rc_ack(struct hfi1_packet *packet, bool is_fecn) return; } +/** + * update_num_rd_atomic - update the qp->s_num_rd_atomic + * @qp: the QP + * @psn: the packet sequence number to restart at + * @wqe: the wqe + * + * This is called from reset_psn() to update qp->s_num_rd_atomic + * for the current wqe. + * Called at interrupt level with the QP s_lock held. + */ +static void update_num_rd_atomic(struct rvt_qp *qp, u32 psn, + struct rvt_swqe *wqe) +{ + u32 opcode = wqe->wr.opcode; + + if (opcode == IB_WR_RDMA_READ || + opcode == IB_WR_ATOMIC_CMP_AND_SWP || + opcode == IB_WR_ATOMIC_FETCH_AND_ADD) { + qp->s_num_rd_atomic++; + } else if (opcode == IB_WR_TID_RDMA_READ) { + struct tid_rdma_request *req = wqe_to_tid_req(wqe); + struct hfi1_qp_priv *priv = qp->priv; + + if (cmp_psn(psn, wqe->lpsn) <= 0) { + u32 cur_seg; + + cur_seg = (psn - wqe->psn) / priv->pkts_ps; + req->ack_pending = cur_seg - req->comp_seg; + priv->pending_tid_r_segs += req->ack_pending; + qp->s_num_rd_atomic += req->ack_pending; + } else { + priv->pending_tid_r_segs += req->total_segs; + qp->s_num_rd_atomic += req->total_segs; + } + } +} + /** * reset_psn - reset the QP state to send starting from PSN * @qp: the QP @@ -984,9 +1017,12 @@ static void reset_psn(struct rvt_qp *qp, u32 psn) u32 n = qp->s_acked; struct rvt_swqe *wqe = rvt_get_swqe_ptr(qp, n); u32 opcode; + struct hfi1_qp_priv *priv = qp->priv; lockdep_assert_held(&qp->s_lock); qp->s_cur = n; + priv->pending_tid_r_segs = 0; + qp->s_num_rd_atomic = 0; /* * If we are starting the request from the beginning, @@ -996,9 +1032,9 @@ static void reset_psn(struct rvt_qp *qp, u32 psn) qp->s_state = OP(SEND_LAST); goto done; } + update_num_rd_atomic(qp, psn, wqe); /* Find the work request opcode corresponding to the given PSN. */ - opcode = wqe->wr.opcode; for (;;) { int diff; @@ -1008,8 +1044,11 @@ static void reset_psn(struct rvt_qp *qp, u32 psn) break; wqe = rvt_get_swqe_ptr(qp, n); diff = cmp_psn(psn, wqe->psn); - if (diff < 0) + if (diff < 0) { + /* Point wqe back to the previous one*/ + wqe = rvt_get_swqe_ptr(qp, qp->s_cur); break; + } qp->s_cur = n; /* * If we are starting the request from the beginning, @@ -1019,8 +1058,10 @@ static void reset_psn(struct rvt_qp *qp, u32 psn) qp->s_state = OP(SEND_LAST); goto done; } - opcode = wqe->wr.opcode; + + update_num_rd_atomic(qp, psn, wqe); } + opcode = wqe->wr.opcode; /* * Set the state to restart in the middle of a request. @@ -1042,6 +1083,10 @@ static void reset_psn(struct rvt_qp *qp, u32 psn) qp->s_state = OP(RDMA_READ_RESPONSE_MIDDLE); break; + case IB_WR_TID_RDMA_READ: + qp->s_state = TID_OP(READ_RESP); + break; + default: /* * This case shouldn't happen since its only @@ -1095,6 +1140,14 @@ void hfi1_restart_rc(struct rvt_qp *qp, u32 psn, int wait) wqe = do_rc_completion(qp, wqe, ibp); qp->s_flags &= ~RVT_S_WAIT_ACK; } else { + if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) { + struct tid_rdma_request *req; + + req = wqe_to_tid_req(wqe); + hfi1_kern_exp_rcv_clear_all(req); + hfi1_kern_clear_hw_flow(priv->rcd, qp); + } + rvt_send_complete(qp, wqe, IB_WC_RETRY_EXC_ERR); rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR); @@ -1108,7 +1161,8 @@ void hfi1_restart_rc(struct rvt_qp *qp, u32 psn, int wait) } ibp = to_iport(qp->ibqp.device, qp->port_num); - if (wqe->wr.opcode == IB_WR_RDMA_READ) + if (wqe->wr.opcode == IB_WR_RDMA_READ || + wqe->wr.opcode == IB_WR_TID_RDMA_READ) ibp->rvp.n_rc_resends++; else ibp->rvp.n_rc_resends += delta_psn(qp->s_psn, psn); diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.c b/drivers/infiniband/hw/hfi1/tid_rdma.c index da8b63ec0f8d..f767c5c20566 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.c +++ b/drivers/infiniband/hw/hfi1/tid_rdma.c @@ -1622,6 +1622,27 @@ u64 hfi1_access_sw_tid_wait(const struct cntr_entry *entry, return dd->verbs_dev.n_tidwait; } +static struct tid_rdma_flow *find_flow_ib(struct tid_rdma_request *req, + u32 psn, u16 *fidx) +{ + u16 head, tail; + struct tid_rdma_flow *flow; + + head = req->setup_head; + tail = req->clear_tail; + for ( ; CIRC_CNT(head, tail, MAX_FLOWS); + tail = CIRC_NEXT(tail, MAX_FLOWS)) { + flow = &req->flows[tail]; + if (cmp_psn(psn, flow->flow_state.ib_spsn) >= 0 && + cmp_psn(psn, flow->flow_state.ib_lpsn) <= 0) { + if (fidx) + *fidx = tail; + return flow; + } + } + return NULL; +} + static struct tid_rdma_flow * __find_flow_ranged(struct tid_rdma_request *req, u16 head, u16 tail, u32 psn, u16 *fidx) @@ -2714,3 +2735,64 @@ rcu_unlock: drop: return ret; } + +/* + * "Rewind" the TID request information. + * This means that we reset the state back to ACTIVE, + * find the proper flow, set the flow index to that flow, + * and reset the flow information. + */ +void hfi1_tid_rdma_restart_req(struct rvt_qp *qp, struct rvt_swqe *wqe, + u32 *bth2) +{ + struct tid_rdma_request *req = wqe_to_tid_req(wqe); + struct tid_rdma_flow *flow; + int diff; + u32 tididx = 0; + u16 fidx; + + if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) { + *bth2 = mask_psn(qp->s_psn); + flow = find_flow_ib(req, *bth2, &fidx); + if (!flow) + return; + } else { + return; + } + + diff = delta_psn(*bth2, flow->flow_state.ib_spsn); + + flow->sent = 0; + flow->pkt = 0; + flow->tid_idx = 0; + flow->tid_offset = 0; + if (diff) { + for (tididx = 0; tididx < flow->tidcnt; tididx++) { + u32 tidentry = flow->tid_entry[tididx], tidlen, + tidnpkts, npkts; + + flow->tid_offset = 0; + tidlen = EXP_TID_GET(tidentry, LEN) * PAGE_SIZE; + tidnpkts = rvt_div_round_up_mtu(qp, tidlen); + npkts = min_t(u32, diff, tidnpkts); + flow->pkt += npkts; + flow->sent += (npkts == tidnpkts ? tidlen : + npkts * qp->pmtu); + flow->tid_offset += npkts * qp->pmtu; + diff -= npkts; + if (!diff) + break; + } + } + + if (flow->tid_offset == + EXP_TID_GET(flow->tid_entry[tididx], LEN) * PAGE_SIZE) { + tididx++; + flow->tid_offset = 0; + } + flow->tid_idx = tididx; + /* Move flow_idx to correct index */ + req->flow_idx = fidx; + + req->state = TID_REQUEST_ACTIVE; +} diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.h b/drivers/infiniband/hw/hfi1/tid_rdma.h index d428236aef68..beb5982ce6ad 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.h +++ b/drivers/infiniband/hw/hfi1/tid_rdma.h @@ -207,5 +207,7 @@ void hfi1_rc_rcv_tid_rdma_read_resp(struct hfi1_packet *packet); bool hfi1_handle_kdeth_eflags(struct hfi1_ctxtdata *rcd, struct hfi1_pportdata *ppd, struct hfi1_packet *packet); +void hfi1_tid_rdma_restart_req(struct rvt_qp *qp, struct rvt_swqe *wqe, + u32 *bth2); #endif /* HFI1_TID_RDMA_H */ -- cgit v1.2.3-59-g8ed1b From 24b11923da4c7dbf5690d3ac74710affaf564196 Mon Sep 17 00:00:00 2001 From: Kaike Wan Date: Wed, 23 Jan 2019 19:32:09 -0800 Subject: IB/hfi1: Integrate TID RDMA READ protocol into RC protocol This patch integrates the TID RDMA READ protocol into the IB RC protocol. This protocol is an end-to-end protocol between the hfi1 drivers on two OPA nodes that converts a qualified RDMA READ request into a TID RDMA READ request to avoid data copying on the requester side. The following codes are added in this patch: - Send the TID RDMA READ request; - Complete the TID RDMA READ send request; - Send the TID RDMA READ response; - Complete the TID RDMA READ request; Reviewed-by: Mike Marciniszyn Signed-off-by: Kaike Wan Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/qp.c | 1 + drivers/infiniband/hw/hfi1/rc.c | 295 +++++++++++++++++++++++++++++++--- drivers/infiniband/hw/hfi1/tid_rdma.c | 33 ++++ drivers/infiniband/hw/hfi1/tid_rdma.h | 1 + drivers/infiniband/hw/hfi1/verbs.c | 1 + 5 files changed, 307 insertions(+), 24 deletions(-) (limited to 'drivers/infiniband/hw/hfi1/rc.c') diff --git a/drivers/infiniband/hw/hfi1/qp.c b/drivers/infiniband/hw/hfi1/qp.c index 69c38af49492..5fea7319167e 100644 --- a/drivers/infiniband/hw/hfi1/qp.c +++ b/drivers/infiniband/hw/hfi1/qp.c @@ -761,6 +761,7 @@ void quiesce_qp(struct rvt_qp *qp) void notify_qp_reset(struct rvt_qp *qp) { + hfi1_qp_kern_exp_rcv_clear_all(qp); qp->r_adefered = 0; clear_ahg(qp); diff --git a/drivers/infiniband/hw/hfi1/rc.c b/drivers/infiniband/hw/hfi1/rc.c index e478a0b93eb9..a5aacf8e5b93 100644 --- a/drivers/infiniband/hw/hfi1/rc.c +++ b/drivers/infiniband/hw/hfi1/rc.c @@ -112,12 +112,14 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp, { struct rvt_ack_entry *e; u32 hwords; - u32 len; - u32 bth0, bth2; + u32 len = 0; + u32 bth0 = 0, bth2 = 0; u32 bth1 = qp->remote_qpn | (HFI1_CAP_IS_KSET(OPFN) << IB_BTHE_E_SHIFT); int middle = 0; u32 pmtu = qp->pmtu; struct hfi1_qp_priv *priv = qp->priv; + bool last_pkt; + u32 delta; lockdep_assert_held(&qp->s_lock); /* Don't send an ACK if we aren't supposed to. */ @@ -190,6 +192,26 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp, hwords++; qp->s_ack_rdma_psn = e->psn; bth2 = mask_psn(qp->s_ack_rdma_psn++); + } else if (e->opcode == TID_OP(READ_REQ)) { + /* + * If a TID RDMA read response is being resent and + * we haven't seen the duplicate request yet, + * then stop sending the remaining responses the + * responder has seen until the requester re-sends it. + */ + len = e->rdma_sge.sge_length; + if (len && !e->rdma_sge.mr) { + qp->s_tail_ack_queue = qp->r_head_ack_queue; + goto bail; + } + /* Copy SGE state in case we need to resend */ + ps->s_txreq->mr = e->rdma_sge.mr; + if (ps->s_txreq->mr) + rvt_get_mr(ps->s_txreq->mr); + qp->s_ack_rdma_sge.sge = e->rdma_sge; + qp->s_ack_rdma_sge.num_sge = 1; + qp->s_ack_state = TID_OP(READ_RESP); + goto read_resp; } else { /* COMPARE_SWAP or FETCH_ADD */ ps->s_txreq->ss = NULL; @@ -227,6 +249,28 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp, bth2 = mask_psn(qp->s_ack_rdma_psn++); break; + case TID_OP(READ_RESP): +read_resp: + e = &qp->s_ack_queue[qp->s_tail_ack_queue]; + ps->s_txreq->ss = &qp->s_ack_rdma_sge; + delta = hfi1_build_tid_rdma_read_resp(qp, e, ohdr, &bth0, + &bth1, &bth2, &len, + &last_pkt); + if (delta == 0) + goto error_qp; + hwords += delta; + if (last_pkt) { + e->sent = 1; + /* + * Increment qp->s_tail_ack_queue through s_ack_state + * transition. + */ + qp->s_ack_state = OP(RDMA_READ_RESPONSE_LAST); + } + break; + case TID_OP(READ_REQ): + goto bail; + default: normal: /* @@ -256,7 +300,14 @@ normal: ps->s_txreq->hdr_dwords = hwords; hfi1_make_ruc_header(qp, ohdr, bth0, bth1, bth2, middle, ps); return 1; - +error_qp: + spin_unlock_irqrestore(&qp->s_lock, ps->flags); + spin_lock_irqsave(&qp->r_lock, ps->flags); + spin_lock(&qp->s_lock); + rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR); + spin_unlock(&qp->s_lock); + spin_unlock_irqrestore(&qp->r_lock, ps->flags); + spin_lock_irqsave(&qp->s_lock, ps->flags); bail: qp->s_ack_state = OP(ACKNOWLEDGE); /* @@ -283,16 +334,20 @@ int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) struct hfi1_qp_priv *priv = qp->priv; struct hfi1_ibdev *dev = to_idev(qp->ibqp.device); struct ib_other_headers *ohdr; - struct rvt_sge_state *ss; + struct rvt_sge_state *ss = NULL; struct rvt_swqe *wqe; - u32 hwords; - u32 len; - u32 bth0 = 0, bth2; + struct hfi1_swqe_priv *wpriv; + struct tid_rdma_request *req = NULL; + /* header size in 32-bit words LRH+BTH = (8+12)/4. */ + u32 hwords = 5; + u32 len = 0; + u32 bth0 = 0, bth2 = 0; u32 bth1 = qp->remote_qpn | (HFI1_CAP_IS_KSET(OPFN) << IB_BTHE_E_SHIFT); u32 pmtu = qp->pmtu; char newreq; int middle = 0; int delta; + struct tid_rdma_flow *flow = NULL; lockdep_assert_held(&qp->s_lock); ps->s_txreq = get_txreq(ps->dev, qp); @@ -334,8 +389,8 @@ int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) } clear_ahg(qp); wqe = rvt_get_swqe_ptr(qp, qp->s_last); - rvt_send_complete(qp, wqe, qp->s_last != qp->s_acked ? - IB_WC_SUCCESS : IB_WC_WR_FLUSH_ERR); + hfi1_trdma_send_complete(qp, wqe, qp->s_last != qp->s_acked ? + IB_WC_SUCCESS : IB_WC_WR_FLUSH_ERR); /* will get called again */ goto done_free_tx; } @@ -354,6 +409,7 @@ int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) /* Send a request. */ wqe = rvt_get_swqe_ptr(qp, qp->s_cur); +check_s_state: switch (qp->s_state) { default: if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_NEXT_SEND_OK)) @@ -375,9 +431,13 @@ int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) /* * If a fence is requested, wait for previous * RDMA read and atomic operations to finish. + * However, there is no need to guard against + * TID RDMA READ after TID RDMA READ. */ if ((wqe->wr.send_flags & IB_SEND_FENCE) && - qp->s_num_rd_atomic) { + qp->s_num_rd_atomic && + (wqe->wr.opcode != IB_WR_TID_RDMA_READ || + priv->pending_tid_r_segs < qp->s_num_rd_atomic)) { qp->s_flags |= RVT_S_WAIT_FENCE; goto bail; } @@ -526,6 +586,75 @@ no_flow_control: qp->s_cur = 0; break; + case IB_WR_TID_RDMA_READ: + wpriv = wqe->priv; + req = wqe_to_tid_req(wqe); + delta = cmp_psn(qp->s_psn, wqe->psn); + + /* + * Don't allow more operations to be started + * than the QP limits allow. We could get here under + * three conditions; (1) It's a new request; (2) We are + * sending the second or later segment of a request, + * but the qp->s_state is set to OP(RDMA_READ_REQUEST) + * when the last segment of a previous request is + * received just before this; (3) We are re-sending a + * request. + */ + if (qp->s_num_rd_atomic >= qp->s_max_rd_atomic) { + qp->s_flags |= RVT_S_WAIT_RDMAR; + goto bail; + } + if (newreq) { + struct tid_rdma_flow *flow = + &req->flows[req->setup_head]; + + /* + * Set up s_sge as it is needed for TID + * allocation. However, if the pages have been + * walked and mapped, skip it. An earlier try + * has failed to allocate the TID entries. + */ + if (!flow->npagesets) { + qp->s_sge.sge = wqe->sg_list[0]; + qp->s_sge.sg_list = wqe->sg_list + 1; + qp->s_sge.num_sge = wqe->wr.num_sge; + qp->s_sge.total_len = wqe->length; + qp->s_len = wqe->length; + req->isge = 0; + req->clear_tail = req->setup_head; + req->flow_idx = req->setup_head; + req->state = TID_REQUEST_ACTIVE; + } + } else if (delta == 0) { + /* Re-send a request */ + req->cur_seg = 0; + req->comp_seg = 0; + req->ack_pending = 0; + req->flow_idx = req->clear_tail; + req->state = TID_REQUEST_RESEND; + } + req->s_next_psn = qp->s_psn; + /* Read one segment at a time */ + len = min_t(u32, req->seg_len, + wqe->length - req->seg_len * req->cur_seg); + delta = hfi1_build_tid_rdma_read_req(qp, wqe, ohdr, + &bth1, &bth2, + &len); + if (delta <= 0) { + /* Wait for TID space */ + goto bail; + } + if (newreq && !(qp->s_flags & RVT_S_UNLIMITED_CREDIT)) + qp->s_lsn++; + hwords += delta; + ss = &wpriv->ss; + /* Check if this is the last segment */ + if (req->cur_seg >= req->total_segs && + ++qp->s_cur == qp->s_size) + qp->s_cur = 0; + break; + case IB_WR_ATOMIC_CMP_AND_SWP: case IB_WR_ATOMIC_FETCH_AND_ADD: /* @@ -571,11 +700,13 @@ no_flow_control: default: goto bail; } - qp->s_sge.sge = wqe->sg_list[0]; - qp->s_sge.sg_list = wqe->sg_list + 1; - qp->s_sge.num_sge = wqe->wr.num_sge; - qp->s_sge.total_len = wqe->length; - qp->s_len = wqe->length; + if (wqe->wr.opcode != IB_WR_TID_RDMA_READ) { + qp->s_sge.sge = wqe->sg_list[0]; + qp->s_sge.sg_list = wqe->sg_list + 1; + qp->s_sge.num_sge = wqe->wr.num_sge; + qp->s_sge.total_len = wqe->length; + qp->s_len = wqe->length; + } if (newreq) { qp->s_tail++; if (qp->s_tail >= qp->s_size) @@ -583,6 +714,8 @@ no_flow_control: } if (wqe->wr.opcode == IB_WR_RDMA_READ) qp->s_psn = wqe->lpsn + 1; + else if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) + qp->s_psn = req->s_next_psn; else qp->s_psn++; break; @@ -699,6 +832,99 @@ no_flow_control: if (qp->s_cur == qp->s_size) qp->s_cur = 0; break; + case TID_OP(READ_RESP): + if (wqe->wr.opcode != IB_WR_TID_RDMA_READ) + goto bail; + /* This is used to restart a TID read request */ + req = wqe_to_tid_req(wqe); + wpriv = wqe->priv; + /* + * Back down. The field qp->s_psn has been set to the psn with + * which the request should be restart. It's OK to use division + * as this is on the retry path. + */ + req->cur_seg = delta_psn(qp->s_psn, wqe->psn) / priv->pkts_ps; + + /* + * The following function need to be redefined to return the + * status to make sure that we find the flow. At the same + * time, we can use the req->state change to check if the + * call succeeds or not. + */ + req->state = TID_REQUEST_RESEND; + hfi1_tid_rdma_restart_req(qp, wqe, &bth2); + if (req->state != TID_REQUEST_ACTIVE) { + /* + * Failed to find the flow. Release all allocated tid + * resources. + */ + hfi1_kern_exp_rcv_clear_all(req); + hfi1_kern_clear_hw_flow(priv->rcd, qp); + + hfi1_trdma_send_complete(qp, wqe, IB_WC_LOC_QP_OP_ERR); + goto bail; + } + req->state = TID_REQUEST_RESEND; + len = min_t(u32, req->seg_len, + wqe->length - req->seg_len * req->cur_seg); + flow = &req->flows[req->flow_idx]; + len -= flow->sent; + req->s_next_psn = flow->flow_state.ib_lpsn + 1; + delta = hfi1_build_tid_rdma_read_packet(wqe, ohdr, &bth1, + &bth2, &len); + if (delta <= 0) { + /* Wait for TID space */ + goto bail; + } + hwords += delta; + ss = &wpriv->ss; + /* Check if this is the last segment */ + if (req->cur_seg >= req->total_segs && + ++qp->s_cur == qp->s_size) + qp->s_cur = 0; + qp->s_psn = req->s_next_psn; + break; + case TID_OP(READ_REQ): + req = wqe_to_tid_req(wqe); + delta = cmp_psn(qp->s_psn, wqe->psn); + /* + * If the current WR is not TID RDMA READ, or this is the start + * of a new request, we need to change the qp->s_state so that + * the request can be set up properly. + */ + if (wqe->wr.opcode != IB_WR_TID_RDMA_READ || delta == 0 || + qp->s_cur == qp->s_tail) { + qp->s_state = OP(RDMA_READ_REQUEST); + if (delta == 0 || qp->s_cur == qp->s_tail) + goto check_s_state; + else + goto bail; + } + + /* Rate limiting */ + if (qp->s_num_rd_atomic >= qp->s_max_rd_atomic) { + qp->s_flags |= RVT_S_WAIT_RDMAR; + goto bail; + } + + wpriv = wqe->priv; + /* Read one segment at a time */ + len = min_t(u32, req->seg_len, + wqe->length - req->seg_len * req->cur_seg); + delta = hfi1_build_tid_rdma_read_req(qp, wqe, ohdr, &bth1, + &bth2, &len); + if (delta <= 0) { + /* Wait for TID space */ + goto bail; + } + hwords += delta; + ss = &wpriv->ss; + /* Check if this is the last segment */ + if (req->cur_seg >= req->total_segs && + ++qp->s_cur == qp->s_size) + qp->s_cur = 0; + qp->s_psn = req->s_next_psn; + break; } qp->s_sending_hpsn = bth2; delta = delta_psn(bth2, wqe->psn); @@ -1148,8 +1374,8 @@ void hfi1_restart_rc(struct rvt_qp *qp, u32 psn, int wait) hfi1_kern_clear_hw_flow(priv->rcd, qp); } - rvt_send_complete(qp, wqe, - IB_WC_RETRY_EXC_ERR); + hfi1_trdma_send_complete(qp, wqe, + IB_WC_RETRY_EXC_ERR); rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR); } return; @@ -1189,7 +1415,8 @@ static void reset_sending_psn(struct rvt_qp *qp, u32 psn) for (;;) { wqe = rvt_get_swqe_ptr(qp, n); if (cmp_psn(psn, wqe->lpsn) <= 0) { - if (wqe->wr.opcode == IB_WR_RDMA_READ) + if (wqe->wr.opcode == IB_WR_RDMA_READ || + wqe->wr.opcode == IB_WR_TID_RDMA_READ) qp->s_sending_psn = wqe->lpsn + 1; else qp->s_sending_psn = psn + 1; @@ -1238,8 +1465,9 @@ void hfi1_rc_send_complete(struct rvt_qp *qp, struct hfi1_opa_header *opah) } opcode = ib_bth_get_opcode(ohdr); - if (opcode >= OP(RDMA_READ_RESPONSE_FIRST) && - opcode <= OP(ATOMIC_ACKNOWLEDGE)) { + if ((opcode >= OP(RDMA_READ_RESPONSE_FIRST) && + opcode <= OP(ATOMIC_ACKNOWLEDGE)) || + opcode == TID_OP(READ_RESP)) { WARN_ON(!qp->s_rdma_ack_cnt); qp->s_rdma_ack_cnt--; return; @@ -1255,8 +1483,12 @@ void hfi1_rc_send_complete(struct rvt_qp *qp, struct hfi1_opa_header *opah) if ((psn & IB_BTH_REQ_ACK) && qp->s_acked != qp->s_tail && !(qp->s_flags & (RVT_S_TIMER | RVT_S_WAIT_RNR | RVT_S_WAIT_PSN)) && - (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) - rvt_add_retry_timer(qp); + (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) { + if (opcode == TID_OP(READ_REQ)) + rvt_add_retry_timer_ext(qp, priv->timeout_shift); + else + rvt_add_retry_timer(qp); + } while (qp->s_last != qp->s_acked) { u32 s_last; @@ -1265,6 +1497,7 @@ void hfi1_rc_send_complete(struct rvt_qp *qp, struct hfi1_opa_header *opah) if (cmp_psn(wqe->lpsn, qp->s_sending_psn) >= 0 && cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) <= 0) break; + trdma_clean_swqe(qp, wqe); rvt_qp_wqe_unreserve(qp, wqe); s_last = qp->s_last; trace_hfi1_qp_send_completion(qp, wqe, s_last); @@ -1317,6 +1550,7 @@ struct rvt_swqe *do_rc_completion(struct rvt_qp *qp, cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) > 0) { u32 s_last; + trdma_clean_swqe(qp, wqe); rvt_put_swqe(wqe); rvt_qp_wqe_unreserve(qp, wqe); s_last = qp->s_last; @@ -1393,6 +1627,7 @@ int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode, { struct hfi1_ibport *ibp; enum ib_wc_status status; + struct hfi1_qp_priv *qpriv = qp->priv; struct rvt_swqe *wqe; int ret = 0; u32 ack_psn; @@ -1439,6 +1674,8 @@ int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode, */ if ((wqe->wr.opcode == IB_WR_RDMA_READ && (opcode != OP(RDMA_READ_RESPONSE_LAST) || diff != 0)) || + (wqe->wr.opcode == IB_WR_TID_RDMA_READ && + (opcode != TID_OP(READ_RESP) || diff != 0)) || ((wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP || wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) && (opcode != OP(ATOMIC_ACKNOWLEDGE) || diff != 0))) { @@ -1492,7 +1729,13 @@ int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode, switch (aeth >> IB_AETH_NAK_SHIFT) { case 0: /* ACK */ this_cpu_inc(*ibp->rvp.rc_acks); - if (qp->s_acked != qp->s_tail) { + if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) { + if (wqe_to_tid_req(wqe)->ack_pending) + rvt_mod_retry_timer_ext(qp, + qpriv->timeout_shift); + else + rvt_stop_rc_timers(qp); + } else if (qp->s_acked != qp->s_tail) { /* * We are expecting more ACKs so * mod the retry timer. @@ -1581,7 +1824,10 @@ int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode, ibp->rvp.n_other_naks++; class_b: if (qp->s_last == qp->s_acked) { - rvt_send_complete(qp, wqe, status); + if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) + hfi1_kern_read_tid_flow_free(qp); + + hfi1_trdma_send_complete(qp, wqe, status); rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR); } break; @@ -1622,6 +1868,7 @@ static void rdma_seq_err(struct rvt_qp *qp, struct hfi1_ibport *ibp, u32 psn, while (cmp_psn(psn, wqe->lpsn) > 0) { if (wqe->wr.opcode == IB_WR_RDMA_READ || + wqe->wr.opcode == IB_WR_TID_RDMA_READ || wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP || wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) break; diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.c b/drivers/infiniband/hw/hfi1/tid_rdma.c index f767c5c20566..f6d9e2717106 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.c +++ b/drivers/infiniband/hw/hfi1/tid_rdma.c @@ -2796,3 +2796,36 @@ void hfi1_tid_rdma_restart_req(struct rvt_qp *qp, struct rvt_swqe *wqe, req->state = TID_REQUEST_ACTIVE; } + +void hfi1_qp_kern_exp_rcv_clear_all(struct rvt_qp *qp) +{ + int i, ret; + struct hfi1_qp_priv *qpriv = qp->priv; + struct tid_flow_state *fs; + + if (qp->ibqp.qp_type != IB_QPT_RC || !HFI1_CAP_IS_KSET(TID_RDMA)) + return; + + /* + * First, clear the flow to help prevent any delayed packets from + * being delivered. + */ + fs = &qpriv->flow_state; + if (fs->index != RXE_NUM_TID_FLOWS) + hfi1_kern_clear_hw_flow(qpriv->rcd, qp); + + for (i = qp->s_acked; i != qp->s_head;) { + struct rvt_swqe *wqe = rvt_get_swqe_ptr(qp, i); + + if (++i == qp->s_size) + i = 0; + /* Free only locally allocated TID entries */ + if (wqe->wr.opcode != IB_WR_TID_RDMA_READ) + continue; + do { + struct hfi1_swqe_priv *priv = wqe->priv; + + ret = hfi1_kern_exp_rcv_clear(&priv->tid_req); + } while (!ret); + } +} diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.h b/drivers/infiniband/hw/hfi1/tid_rdma.h index beb5982ce6ad..4f85b7ea5cf3 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.h +++ b/drivers/infiniband/hw/hfi1/tid_rdma.h @@ -209,5 +209,6 @@ bool hfi1_handle_kdeth_eflags(struct hfi1_ctxtdata *rcd, struct hfi1_packet *packet); void hfi1_tid_rdma_restart_req(struct rvt_qp *qp, struct rvt_swqe *wqe, u32 *bth2); +void hfi1_qp_kern_exp_rcv_clear_all(struct rvt_qp *qp); #endif /* HFI1_TID_RDMA_H */ diff --git a/drivers/infiniband/hw/hfi1/verbs.c b/drivers/infiniband/hw/hfi1/verbs.c index 2d59fcde4db6..88676ca79fda 100644 --- a/drivers/infiniband/hw/hfi1/verbs.c +++ b/drivers/infiniband/hw/hfi1/verbs.c @@ -165,6 +165,7 @@ const enum ib_wc_opcode ib_hfi1_wc_opcode[] = { [IB_WR_SEND] = IB_WC_SEND, [IB_WR_SEND_WITH_IMM] = IB_WC_SEND, [IB_WR_RDMA_READ] = IB_WC_RDMA_READ, + [IB_WR_TID_RDMA_READ] = IB_WC_RDMA_READ, [IB_WR_ATOMIC_CMP_AND_SWP] = IB_WC_COMP_SWAP, [IB_WR_ATOMIC_FETCH_AND_ADD] = IB_WC_FETCH_ADD, [IB_WR_SEND_WITH_INV] = IB_WC_SEND, -- cgit v1.2.3-59-g8ed1b From a0b34f75ec209e40f06912380533ec525691544f Mon Sep 17 00:00:00 2001 From: Kaike Wan Date: Thu, 24 Jan 2019 06:36:48 -0800 Subject: IB/hfi1: Add interlock between a TID RDMA request and other requests This locking mechanism is designed to provent vavious memory corruption scenarios from occurring when requests are pipelined, especially when RDMA READ/WRITE requests are interleaved with TID RDMA READ/WRITE requests: 1. READ-AFTER-READ; 2. READ-AFTER-WRITE; 3. WRITE-AFTER-READ; When memory corruption is likely, a request will be held back until previous requests have been completed. Reviewed-by: Mike Marciniszyn Signed-off-by: Mitko Haralanov Signed-off-by: Kaike Wan Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/rc.c | 16 +++++++++++++++ drivers/infiniband/hw/hfi1/tid_rdma.c | 37 +++++++++++++++++++++++++++++++++++ drivers/infiniband/hw/hfi1/tid_rdma.h | 11 +++++++++++ drivers/infiniband/hw/hfi1/verbs.h | 3 +++ 4 files changed, 67 insertions(+) (limited to 'drivers/infiniband/hw/hfi1/rc.c') diff --git a/drivers/infiniband/hw/hfi1/rc.c b/drivers/infiniband/hw/hfi1/rc.c index a5aacf8e5b93..349751cb8b47 100644 --- a/drivers/infiniband/hw/hfi1/rc.c +++ b/drivers/infiniband/hw/hfi1/rc.c @@ -482,6 +482,15 @@ check_s_state: len = wqe->length; ss = &qp->s_sge; bth2 = mask_psn(qp->s_psn); + + /* + * Interlock between various IB requests and TID RDMA + * if necessary. + */ + if ((priv->s_flags & HFI1_S_TID_WAIT_INTERLCK) || + hfi1_tid_rdma_wqe_interlock(qp, wqe)) + goto bail; + switch (wqe->wr.opcode) { case IB_WR_SEND: case IB_WR_SEND_WITH_IMM: @@ -1321,6 +1330,7 @@ static void reset_psn(struct rvt_qp *qp, u32 psn) qp->s_state = OP(SEND_LAST); } done: + priv->s_flags &= ~HFI1_S_TID_WAIT_INTERLCK; qp->s_psn = psn; /* * Set RVT_S_WAIT_PSN as rc_complete() may start the timer @@ -1540,6 +1550,8 @@ struct rvt_swqe *do_rc_completion(struct rvt_qp *qp, struct rvt_swqe *wqe, struct hfi1_ibport *ibp) { + struct hfi1_qp_priv *priv = qp->priv; + lockdep_assert_held(&qp->s_lock); /* * Don't decrement refcount and don't generate a @@ -1608,6 +1620,10 @@ struct rvt_swqe *do_rc_completion(struct rvt_qp *qp, qp->s_draining = 0; wqe = rvt_get_swqe_ptr(qp, qp->s_acked); } + if (priv->s_flags & HFI1_S_TID_WAIT_INTERLCK) { + priv->s_flags &= ~HFI1_S_TID_WAIT_INTERLCK; + hfi1_schedule_send(qp); + } return wqe; } diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.c b/drivers/infiniband/hw/hfi1/tid_rdma.c index f6d9e2717106..ccf15c9e76f8 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.c +++ b/drivers/infiniband/hw/hfi1/tid_rdma.c @@ -2829,3 +2829,40 @@ void hfi1_qp_kern_exp_rcv_clear_all(struct rvt_qp *qp) } while (!ret); } } + +bool hfi1_tid_rdma_wqe_interlock(struct rvt_qp *qp, struct rvt_swqe *wqe) +{ + struct rvt_swqe *prev; + struct hfi1_qp_priv *priv = qp->priv; + u32 s_prev; + + s_prev = (qp->s_cur == 0 ? qp->s_size : qp->s_cur) - 1; + prev = rvt_get_swqe_ptr(qp, s_prev); + + switch (wqe->wr.opcode) { + case IB_WR_SEND: + case IB_WR_SEND_WITH_IMM: + case IB_WR_SEND_WITH_INV: + case IB_WR_ATOMIC_CMP_AND_SWP: + case IB_WR_ATOMIC_FETCH_AND_ADD: + case IB_WR_RDMA_WRITE: + case IB_WR_RDMA_READ: + break; + case IB_WR_TID_RDMA_READ: + switch (prev->wr.opcode) { + case IB_WR_RDMA_READ: + if (qp->s_acked != qp->s_cur) + goto interlock; + break; + default: + break; + } + default: + break; + } + return false; + +interlock: + priv->s_flags |= HFI1_S_TID_WAIT_INTERLCK; + return true; +} diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.h b/drivers/infiniband/hw/hfi1/tid_rdma.h index 4f85b7ea5cf3..689a5490432f 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.h +++ b/drivers/infiniband/hw/hfi1/tid_rdma.h @@ -17,6 +17,16 @@ #define TID_RDMA_MAX_SEGMENT_SIZE BIT(18) /* 256 KiB (for now) */ #define TID_RDMA_MAX_PAGES (BIT(18) >> PAGE_SHIFT) +/* + * Bit definitions for priv->s_flags. + * These bit flags overload the bit flags defined for the QP's s_flags. + * Due to the fact that these bit fields are used only for the QP priv + * s_flags, there are no collisions. + * + * HFI1_S_TID_WAIT_INTERLCK - QP is waiting for requester interlock + */ +#define HFI1_S_TID_WAIT_INTERLCK BIT(5) + struct tid_rdma_params { struct rcu_head rcu_head; u32 qp; @@ -210,5 +220,6 @@ bool hfi1_handle_kdeth_eflags(struct hfi1_ctxtdata *rcd, void hfi1_tid_rdma_restart_req(struct rvt_qp *qp, struct rvt_swqe *wqe, u32 *bth2); void hfi1_qp_kern_exp_rcv_clear_all(struct rvt_qp *qp); +bool hfi1_tid_rdma_wqe_interlock(struct rvt_qp *qp, struct rvt_swqe *wqe); #endif /* HFI1_TID_RDMA_H */ diff --git a/drivers/infiniband/hw/hfi1/verbs.h b/drivers/infiniband/hw/hfi1/verbs.h index 7642b59ad5d2..841727a684d5 100644 --- a/drivers/infiniband/hw/hfi1/verbs.h +++ b/drivers/infiniband/hw/hfi1/verbs.h @@ -171,6 +171,9 @@ struct hfi1_qp_priv { u8 hdr_type; /* 9B or 16B */ unsigned long tid_timer_timeout_jiffies; + /* variables for the TID RDMA SE state machine */ + u32 s_flags; + /* For TID RDMA READ */ u32 tid_r_reqs; /* Num of tid reads requested */ u32 tid_r_comp; /* Num of tid reads completed */ -- cgit v1.2.3-59-g8ed1b From 3ce5daa2c1798a530db9a01cd35122e0958538ad Mon Sep 17 00:00:00 2001 From: Kaike Wan Date: Wed, 23 Jan 2019 19:32:44 -0800 Subject: IB/hfi1: Add static trace for TID RDMA READ protocol This patch makes the following changes to the static trace: 1. Adds the decoding of TID RDMA READ packets in IB header trace; 2. Tracks qpriv->s_flags and iow_flags in qpsleepwakeup trace; 3. Adds a new event to track RC ACK receiving; 4. Adds trace events for various stages of the TID RDMA READ protocol. These events provide a fine-grained control for monitoring and debugging the hfi1 driver in the filed. Reviewed-by: Mike Marciniszyn Signed-off-by: Kaike Wan Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/rc.c | 16 + drivers/infiniband/hw/hfi1/tid_rdma.c | 51 ++- drivers/infiniband/hw/hfi1/trace.c | 36 ++ drivers/infiniband/hw/hfi1/trace_ibhdrs.h | 2 + drivers/infiniband/hw/hfi1/trace_rc.h | 48 +++ drivers/infiniband/hw/hfi1/trace_tid.h | 528 +++++++++++++++++++++++++++++- drivers/infiniband/hw/hfi1/trace_tx.h | 12 +- 7 files changed, 684 insertions(+), 9 deletions(-) (limited to 'drivers/infiniband/hw/hfi1/rc.c') diff --git a/drivers/infiniband/hw/hfi1/rc.c b/drivers/infiniband/hw/hfi1/rc.c index 349751cb8b47..6c9ef572fc69 100644 --- a/drivers/infiniband/hw/hfi1/rc.c +++ b/drivers/infiniband/hw/hfi1/rc.c @@ -121,6 +121,7 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp, bool last_pkt; u32 delta; + trace_hfi1_rsp_make_rc_ack(qp, 0); lockdep_assert_held(&qp->s_lock); /* Don't send an ACK if we aren't supposed to. */ if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) @@ -349,6 +350,7 @@ int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) int delta; struct tid_rdma_flow *flow = NULL; + trace_hfi1_sender_make_rc_req(qp); lockdep_assert_held(&qp->s_lock); ps->s_txreq = get_txreq(ps->dev, qp); if (!ps->s_txreq) @@ -596,8 +598,13 @@ no_flow_control: break; case IB_WR_TID_RDMA_READ: + trace_hfi1_tid_read_sender_make_req(qp, newreq); wpriv = wqe->priv; req = wqe_to_tid_req(wqe); + trace_hfi1_tid_req_make_req_read(qp, newreq, + wqe->wr.opcode, + wqe->psn, wqe->lpsn, + req); delta = cmp_psn(qp->s_psn, wqe->psn); /* @@ -892,6 +899,8 @@ no_flow_control: ++qp->s_cur == qp->s_size) qp->s_cur = 0; qp->s_psn = req->s_next_psn; + trace_hfi1_tid_req_make_req_read(qp, 0, wqe->wr.opcode, + wqe->psn, wqe->lpsn, req); break; case TID_OP(READ_REQ): req = wqe_to_tid_req(wqe); @@ -933,6 +942,8 @@ no_flow_control: ++qp->s_cur == qp->s_size) qp->s_cur = 0; qp->s_psn = req->s_next_psn; + trace_hfi1_tid_req_make_req_read(qp, 0, wqe->wr.opcode, + wqe->psn, wqe->lpsn, req); break; } qp->s_sending_hpsn = bth2; @@ -1341,6 +1352,7 @@ done: (cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) <= 0)) qp->s_flags |= RVT_S_WAIT_PSN; qp->s_flags &= ~HFI1_S_AHG_VALID; + trace_hfi1_sender_reset_psn(qp); } /* @@ -1355,6 +1367,7 @@ void hfi1_restart_rc(struct rvt_qp *qp, u32 psn, int wait) lockdep_assert_held(&qp->r_lock); lockdep_assert_held(&qp->s_lock); + trace_hfi1_sender_restart_rc(qp); if (qp->s_retry == 0) { if (qp->s_mig_state == IB_MIG_ARMED) { hfi1_migrate_qp(qp); @@ -1558,6 +1571,7 @@ struct rvt_swqe *do_rc_completion(struct rvt_qp *qp, * completion if the SWQE is being resent until the send * is finished. */ + trace_hfi1_rc_completion(qp, wqe->lpsn); if (cmp_psn(wqe->lpsn, qp->s_sending_psn) < 0 || cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) > 0) { u32 s_last; @@ -1742,6 +1756,8 @@ int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode, break; } + trace_hfi1_rc_ack_do(qp, aeth, psn, wqe); + trace_hfi1_sender_do_rc_ack(qp); switch (aeth >> IB_AETH_NAK_SHIFT) { case 0: /* ACK */ this_cpu_inc(*ibp->rvp.rc_acks); diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.c b/drivers/infiniband/hw/hfi1/tid_rdma.c index d65c030911c9..0ee79403acaf 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.c +++ b/drivers/infiniband/hw/hfi1/tid_rdma.c @@ -1688,6 +1688,7 @@ u32 hfi1_build_tid_rdma_read_packet(struct rvt_swqe *wqe, /* This is the IB psn used to send the request */ *bth2 = mask_psn(flow->flow_state.ib_spsn + flow->pkt); + trace_hfi1_tid_flow_build_read_pkt(qp, req->flow_idx, flow); /* TID Entries for TID RDMA READ payload */ req_addr = &flow->tid_entry[flow->tid_idx]; @@ -1768,6 +1769,8 @@ u32 hfi1_build_tid_rdma_read_req(struct rvt_qp *qp, struct rvt_swqe *wqe, bool retry = true; u32 npkts = rvt_div_round_up_mtu(qp, *len); + trace_hfi1_tid_req_build_read_req(qp, 0, wqe->wr.opcode, wqe->psn, + wqe->lpsn, req); /* * Check sync conditions. Make sure that there are no pending * segments before freeing the flow. @@ -1883,6 +1886,8 @@ static int tid_rdma_rcv_read_request(struct rvt_qp *qp, */ flow->npkts = rvt_div_round_up_mtu(qp, len); for (i = 0; i < flow->tidcnt; i++) { + trace_hfi1_tid_entry_rcv_read_req(qp, i, + flow->tid_entry[i]); tlen = EXP_TID_GET(flow->tid_entry[i], LEN); if (!tlen) return 1; @@ -1917,6 +1922,7 @@ static int tid_rdma_rcv_read_request(struct rvt_qp *qp, flow->flow_state.ib_spsn = psn; flow->flow_state.ib_lpsn = flow->flow_state.ib_spsn + flow->npkts - 1; + trace_hfi1_tid_flow_rcv_read_req(qp, req->setup_head, flow); /* Set the initial flow index to the current flow. */ req->flow_idx = req->setup_head; @@ -1942,6 +1948,8 @@ static int tid_rdma_rcv_read_request(struct rvt_qp *qp, req->total_segs = 1; req->r_flow_psn = e->psn; + trace_hfi1_tid_req_rcv_read_req(qp, 0, e->opcode, e->psn, e->lpsn, + req); return 0; } @@ -1957,6 +1965,8 @@ static int tid_rdma_rcv_error(struct hfi1_packet *packet, u8 prev; bool old_req; + trace_hfi1_rsp_tid_rcv_error(qp, psn); + trace_hfi1_tid_rdma_rcv_err(qp, 0, psn, diff); if (diff > 0) { /* sequence error */ if (!qp->r_nak_state) { @@ -1977,7 +1987,7 @@ static int tid_rdma_rcv_error(struct hfi1_packet *packet, req = ack_to_tid_req(e); req->r_flow_psn = psn; - + trace_hfi1_tid_req_rcv_err(qp, 0, e->opcode, e->psn, e->lpsn, req); if (e->opcode == TID_OP(READ_REQ)) { struct ib_reth *reth; u32 offset; @@ -2088,6 +2098,7 @@ void hfi1_rc_rcv_tid_rdma_read_req(struct hfi1_packet *packet) is_fecn = process_ecn(qp, packet); psn = mask_psn(be32_to_cpu(ohdr->bth[2])); + trace_hfi1_rsp_rcv_tid_read_req(qp, psn); if (qp->state == IB_QPS_RTR && !(qp->r_flags & RVT_R_COMM_EST)) rvt_comm_est(qp); @@ -2199,6 +2210,9 @@ u32 hfi1_build_tid_rdma_read_resp(struct rvt_qp *qp, struct rvt_ack_entry *e, next_offset = flow->tid_offset + *len; last_pkt = (flow->sent >= flow->length); + trace_hfi1_tid_entry_build_read_resp(qp, flow->tid_idx, tidentry); + trace_hfi1_tid_flow_build_read_resp(qp, req->clear_tail, flow); + rcu_read_lock(); remote = rcu_dereference(qpriv->tid_rdma.remote); if (!remote) { @@ -2293,6 +2307,7 @@ void hfi1_rc_rcv_tid_rdma_read_resp(struct hfi1_packet *packet) unsigned long flags; u32 kpsn, ipsn; + trace_hfi1_sender_rcv_tid_read_resp(qp); is_fecn = process_ecn(qp, packet); kpsn = mask_psn(be32_to_cpu(ohdr->bth[2])); aeth = be32_to_cpu(ohdr->u.tid_rdma.r_rsp.aeth); @@ -2322,6 +2337,12 @@ void hfi1_rc_rcv_tid_rdma_read_resp(struct hfi1_packet *packet) hfi1_schedule_send(qp); } + trace_hfi1_ack(qp, ipsn); + trace_hfi1_tid_req_rcv_read_resp(qp, 0, req->e.swqe->wr.opcode, + req->e.swqe->psn, req->e.swqe->lpsn, + req); + trace_hfi1_tid_flow_rcv_read_resp(qp, req->clear_tail, flow); + /* Release the tid resources */ hfi1_kern_exp_rcv_clear(req); @@ -2671,6 +2692,8 @@ bool hfi1_handle_kdeth_eflags(struct hfi1_ctxtdata *rcd, unsigned long flags; bool ret = true; + trace_hfi1_msg_handle_kdeth_eflags(NULL, "Kdeth error: rhf ", + packet->rhf); if (packet->rhf & (RHF_VCRC_ERR | RHF_ICRC_ERR)) return ret; @@ -2754,12 +2777,20 @@ void hfi1_tid_rdma_restart_req(struct rvt_qp *qp, struct rvt_swqe *wqe, if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) { *bth2 = mask_psn(qp->s_psn); flow = find_flow_ib(req, *bth2, &fidx); - if (!flow) + if (!flow) { + trace_hfi1_msg_tid_restart_req(/* msg */ + qp, "!!!!!! Could not find flow to restart: bth2 ", + (u64)*bth2); + trace_hfi1_tid_req_restart_req(qp, 0, wqe->wr.opcode, + wqe->psn, wqe->lpsn, + req); return; + } } else { return; } + trace_hfi1_tid_flow_restart_req(qp, fidx, flow); diff = delta_psn(*bth2, flow->flow_state.ib_spsn); flow->sent = 0; @@ -2794,6 +2825,9 @@ void hfi1_tid_rdma_restart_req(struct rvt_qp *qp, struct rvt_swqe *wqe, /* Move flow_idx to correct index */ req->flow_idx = fidx; + trace_hfi1_tid_flow_restart_req(qp, fidx, flow); + trace_hfi1_tid_req_restart_req(qp, 0, wqe->wr.opcode, wqe->psn, + wqe->lpsn, req); req->state = TID_REQUEST_ACTIVE; } @@ -2868,14 +2902,17 @@ interlock: } /* Does @sge meet the alignment requirements for tid rdma? */ -static inline bool hfi1_check_sge_align(struct rvt_sge *sge, int num_sge) +static inline bool hfi1_check_sge_align(struct rvt_qp *qp, + struct rvt_sge *sge, int num_sge) { int i; - for (i = 0; i < num_sge; i++, sge++) + for (i = 0; i < num_sge; i++, sge++) { + trace_hfi1_sge_check_align(qp, i, sge); if ((u64)sge->vaddr & ~PAGE_MASK || sge->sge_length & ~PAGE_MASK) return false; + } return true; } @@ -2904,7 +2941,8 @@ void setup_tid_rdma_wqe(struct rvt_qp *qp, struct rvt_swqe *wqe) goto exit; if (wqe->wr.opcode == IB_WR_RDMA_READ) { - if (hfi1_check_sge_align(&wqe->sg_list[0], wqe->wr.num_sge)) { + if (hfi1_check_sge_align(qp, &wqe->sg_list[0], + wqe->wr.num_sge)) { new_opcode = IB_WR_TID_RDMA_READ; do_tid_rdma = true; } @@ -2930,6 +2968,9 @@ void setup_tid_rdma_wqe(struct rvt_qp *qp, struct rvt_swqe *wqe) priv->tid_req.comp_seg = 0; priv->tid_req.ack_seg = 0; priv->tid_req.state = TID_REQUEST_INACTIVE; + trace_hfi1_tid_req_setup_tid_wqe(qp, 1, wqe->wr.opcode, + wqe->psn, wqe->lpsn, + &priv->tid_req); } exit: rcu_read_unlock(); diff --git a/drivers/infiniband/hw/hfi1/trace.c b/drivers/infiniband/hw/hfi1/trace.c index f1154c3c294e..28181d711fed 100644 --- a/drivers/infiniband/hw/hfi1/trace.c +++ b/drivers/infiniband/hw/hfi1/trace.c @@ -129,6 +129,10 @@ const char *hfi1_trace_get_packet_l2_str(u8 l2) #define IETH_PRN "ieth rkey:0x%.8x" #define ATOMICACKETH_PRN "origdata:%llx" #define ATOMICETH_PRN "vaddr:0x%llx rkey:0x%.8x sdata:%llx cdata:%llx" +#define TID_RDMA_KDETH "kdeth0 0x%x kdeth1 0x%x" +#define TID_RDMA_KDETH_DATA "kdeth0 0x%x: kver %u sh %u intr %u tidctrl %u tid %x offset %x kdeth1 0x%x: jkey %x" +#define TID_READ_REQ_PRN "tid_flow_psn 0x%x tid_flow_qp 0x%x verbs_qp 0x%x" +#define TID_READ_RSP_PRN "verbs_qp 0x%x" #define OP(transport, op) IB_OPCODE_## transport ## _ ## op @@ -323,6 +327,38 @@ const char *parse_everbs_hdrs( parse_syndrome(be32_to_cpu(eh->aeth) >> 24), be32_to_cpu(eh->aeth) & IB_MSN_MASK); break; + case OP(TID_RDMA, READ_REQ): + trace_seq_printf(p, TID_RDMA_KDETH " " RETH_PRN " " + TID_READ_REQ_PRN, + le32_to_cpu(eh->tid_rdma.r_req.kdeth0), + le32_to_cpu(eh->tid_rdma.r_req.kdeth1), + ib_u64_get(&eh->tid_rdma.r_req.reth.vaddr), + be32_to_cpu(eh->tid_rdma.r_req.reth.rkey), + be32_to_cpu(eh->tid_rdma.r_req.reth.length), + be32_to_cpu(eh->tid_rdma.r_req.tid_flow_psn), + be32_to_cpu(eh->tid_rdma.r_req.tid_flow_qp), + be32_to_cpu(eh->tid_rdma.r_req.verbs_qp)); + break; + case OP(TID_RDMA, READ_RESP): + trace_seq_printf(p, TID_RDMA_KDETH_DATA " " AETH_PRN " " + TID_READ_RSP_PRN, + le32_to_cpu(eh->tid_rdma.r_rsp.kdeth0), + KDETH_GET(eh->tid_rdma.r_rsp.kdeth0, KVER), + KDETH_GET(eh->tid_rdma.r_rsp.kdeth0, SH), + KDETH_GET(eh->tid_rdma.r_rsp.kdeth0, INTR), + KDETH_GET(eh->tid_rdma.r_rsp.kdeth0, TIDCTRL), + KDETH_GET(eh->tid_rdma.r_rsp.kdeth0, TID), + KDETH_GET(eh->tid_rdma.r_rsp.kdeth0, OFFSET), + le32_to_cpu(eh->tid_rdma.r_rsp.kdeth1), + KDETH_GET(eh->tid_rdma.r_rsp.kdeth1, JKEY), + be32_to_cpu(eh->tid_rdma.r_rsp.aeth) >> 24, + parse_syndrome(/* aeth */ + be32_to_cpu(eh->tid_rdma.r_rsp.aeth) + >> 24), + (be32_to_cpu(eh->tid_rdma.r_rsp.aeth) & + IB_MSN_MASK), + be32_to_cpu(eh->tid_rdma.r_rsp.verbs_qp)); + break; /* aeth + atomicacketh */ case OP(RC, ATOMIC_ACKNOWLEDGE): trace_seq_printf(p, AETH_PRN " " ATOMICACKETH_PRN, diff --git a/drivers/infiniband/hw/hfi1/trace_ibhdrs.h b/drivers/infiniband/hw/hfi1/trace_ibhdrs.h index 1dc2c28fc96e..1116238bf24d 100644 --- a/drivers/infiniband/hw/hfi1/trace_ibhdrs.h +++ b/drivers/infiniband/hw/hfi1/trace_ibhdrs.h @@ -79,6 +79,8 @@ __print_symbolic(opcode, \ ib_opcode_name(RC_ATOMIC_ACKNOWLEDGE), \ ib_opcode_name(RC_COMPARE_SWAP), \ ib_opcode_name(RC_FETCH_ADD), \ + ib_opcode_name(TID_RDMA_READ_REQ), \ + ib_opcode_name(TID_RDMA_READ_RESP), \ ib_opcode_name(UC_SEND_FIRST), \ ib_opcode_name(UC_SEND_MIDDLE), \ ib_opcode_name(UC_SEND_LAST), \ diff --git a/drivers/infiniband/hw/hfi1/trace_rc.h b/drivers/infiniband/hw/hfi1/trace_rc.h index 8ce476570462..1ebca37862e0 100644 --- a/drivers/infiniband/hw/hfi1/trace_rc.h +++ b/drivers/infiniband/hw/hfi1/trace_rc.h @@ -109,6 +109,54 @@ DEFINE_EVENT(hfi1_rc_template, hfi1_rcv_error, TP_ARGS(qp, psn) ); +DEFINE_EVENT(/* event */ + hfi1_rc_template, hfi1_rc_completion, + TP_PROTO(struct rvt_qp *qp, u32 psn), + TP_ARGS(qp, psn) +); + +DECLARE_EVENT_CLASS(/* rc_ack */ + hfi1_rc_ack_template, + TP_PROTO(struct rvt_qp *qp, u32 aeth, u32 psn, + struct rvt_swqe *wqe), + TP_ARGS(qp, aeth, psn, wqe), + TP_STRUCT__entry(/* entry */ + DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device)) + __field(u32, qpn) + __field(u32, aeth) + __field(u32, psn) + __field(u8, opcode) + __field(u32, spsn) + __field(u32, lpsn) + ), + TP_fast_assign(/* assign */ + DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device)) + __entry->qpn = qp->ibqp.qp_num; + __entry->aeth = aeth; + __entry->psn = psn; + __entry->opcode = wqe->wr.opcode; + __entry->spsn = wqe->psn; + __entry->lpsn = wqe->lpsn; + ), + TP_printk(/* print */ + "[%s] qpn 0x%x aeth 0x%x psn 0x%x opcode 0x%x spsn 0x%x lpsn 0x%x", + __get_str(dev), + __entry->qpn, + __entry->aeth, + __entry->psn, + __entry->opcode, + __entry->spsn, + __entry->lpsn + ) +); + +DEFINE_EVENT(/* do_rc_ack */ + hfi1_rc_ack_template, hfi1_rc_ack_do, + TP_PROTO(struct rvt_qp *qp, u32 aeth, u32 psn, + struct rvt_swqe *wqe), + TP_ARGS(qp, aeth, psn, wqe) +); + #endif /* __HFI1_TRACE_RC_H */ #undef TRACE_INCLUDE_PATH diff --git a/drivers/infiniband/hw/hfi1/trace_tid.h b/drivers/infiniband/hw/hfi1/trace_tid.h index c1da744f44a5..b71638c22d4b 100644 --- a/drivers/infiniband/hw/hfi1/trace_tid.h +++ b/drivers/infiniband/hw/hfi1/trace_tid.h @@ -31,11 +31,41 @@ u16 hfi1_trace_get_tid_idx(u32 ent); #define TID_FLOW_PRN "[%s] qpn 0x%x flow %d: idx %d resp_ib_psn 0x%x " \ "generation 0x%x fpsn 0x%x-%x r_next_psn 0x%x " \ - "npagesets %u tnode_cnt %u tidcnt %u length %u" + "ib_psn 0x%x-%x npagesets %u tnode_cnt %u " \ + "tidcnt %u tid_idx %u tid_offset %u length %u sent %u" #define TID_NODE_PRN "[%s] qpn 0x%x %s idx %u grp base 0x%x map 0x%x " \ "used %u cnt %u" +#define RSP_INFO_PRN "[%s] qpn 0x%x state 0x%x s_state 0x%x psn 0x%x " \ + "r_psn 0x%x r_state 0x%x r_flags 0x%x " \ + "r_head_ack_queue %u s_tail_ack_queue %u " \ + "s_ack_state 0x%x " \ + "s_nak_state 0x%x s_flags 0x%x ps_flags 0x%x " \ + "iow_flags 0x%lx" + +#define SENDER_INFO_PRN "[%s] qpn 0x%x state 0x%x s_cur %u s_tail %u " \ + "s_head %u s_acked %u s_last %u s_psn 0x%x " \ + "s_last_psn 0x%x s_flags 0x%x ps_flags 0x%x " \ + "iow_flags 0x%lx s_state 0x%x s_num_rd %u s_retry %u" + +#define TID_READ_SENDER_PRN "[%s] qpn 0x%x newreq %u tid_r_reqs %u " \ + "tid_r_comp %u pending_tid_r_segs %u " \ + "s_flags 0x%x ps_flags 0x%x iow_flags 0x%lx " \ + "hw_flow_index %u generation 0x%x " \ + "fpsn 0x%x flow_flags 0x%x" + +#define TID_REQ_PRN "[%s] qpn 0x%x newreq %u opcode 0x%x psn 0x%x lpsn 0x%x " \ + "cur_seg %u comp_seg %u ack_seg %u " \ + "total_segs %u setup_head %u clear_tail %u flow_idx %u " \ + "state %u r_flow_psn 0x%x " \ + "s_next_psn 0x%x" + +#define RCV_ERR_PRN "[%s] qpn 0x%x s_flags 0x%x state 0x%x " \ + "s_tail_ack_queue %u " \ + "r_head_ack_queue %u opcode 0x%x psn 0x%x r_psn 0x%x " \ + " diff %d" + DECLARE_EVENT_CLASS(/* class */ hfi1_exp_tid_reg_unreg, TP_PROTO(unsigned int ctxt, u16 subctxt, u32 rarr, u32 npages, @@ -340,6 +370,18 @@ DEFINE_EVENT(/* event */ TP_ARGS(qp, msg, more) ); +DEFINE_EVENT(/* event */ + hfi1_msg_template, hfi1_msg_tid_restart_req, + TP_PROTO(struct rvt_qp *qp, const char *msg, u64 more), + TP_ARGS(qp, msg, more) +); + +DEFINE_EVENT(/* event */ + hfi1_msg_template, hfi1_msg_handle_kdeth_eflags, + TP_PROTO(struct rvt_qp *qp, const char *msg, u64 more), + TP_ARGS(qp, msg, more) +); + DECLARE_EVENT_CLASS(/* tid_flow_page */ hfi1_tid_flow_page_template, TP_PROTO(struct rvt_qp *qp, struct tid_rdma_flow *flow, u32 index, @@ -429,10 +471,15 @@ DECLARE_EVENT_CLASS(/* tid_fow */ __field(u32, fspsn) __field(u32, flpsn) __field(u32, r_next_psn) + __field(u32, ib_spsn) + __field(u32, ib_lpsn) __field(u32, npagesets) __field(u32, tnode_cnt) __field(u32, tidcnt) + __field(u32, tid_idx) + __field(u32, tid_offset) __field(u32, length) + __field(u32, sent) ), TP_fast_assign(/* assign */ DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device)); @@ -446,10 +493,15 @@ DECLARE_EVENT_CLASS(/* tid_fow */ __entry->flpsn = full_flow_psn(flow, flow->flow_state.lpsn); __entry->r_next_psn = flow->flow_state.r_next_psn; + __entry->ib_spsn = flow->flow_state.ib_spsn; + __entry->ib_lpsn = flow->flow_state.ib_lpsn; __entry->npagesets = flow->npagesets; __entry->tnode_cnt = flow->tnode_cnt; __entry->tidcnt = flow->tidcnt; + __entry->tid_idx = flow->tid_idx; + __entry->tid_offset = flow->tid_offset; __entry->length = flow->length; + __entry->sent = flow->sent; ), TP_printk(/* print */ TID_FLOW_PRN, @@ -462,10 +514,15 @@ DECLARE_EVENT_CLASS(/* tid_fow */ __entry->fspsn, __entry->flpsn, __entry->r_next_psn, + __entry->ib_spsn, + __entry->ib_lpsn, __entry->npagesets, __entry->tnode_cnt, __entry->tidcnt, - __entry->length + __entry->tid_idx, + __entry->tid_offset, + __entry->length, + __entry->sent ) ); @@ -475,6 +532,36 @@ DEFINE_EVENT(/* event */ TP_ARGS(qp, index, flow) ); +DEFINE_EVENT(/* event */ + hfi1_tid_flow_template, hfi1_tid_flow_build_read_pkt, + TP_PROTO(struct rvt_qp *qp, int index, struct tid_rdma_flow *flow), + TP_ARGS(qp, index, flow) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_flow_template, hfi1_tid_flow_build_read_resp, + TP_PROTO(struct rvt_qp *qp, int index, struct tid_rdma_flow *flow), + TP_ARGS(qp, index, flow) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_flow_template, hfi1_tid_flow_rcv_read_req, + TP_PROTO(struct rvt_qp *qp, int index, struct tid_rdma_flow *flow), + TP_ARGS(qp, index, flow) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_flow_template, hfi1_tid_flow_rcv_read_resp, + TP_PROTO(struct rvt_qp *qp, int index, struct tid_rdma_flow *flow), + TP_ARGS(qp, index, flow) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_flow_template, hfi1_tid_flow_restart_req, + TP_PROTO(struct rvt_qp *qp, int index, struct tid_rdma_flow *flow), + TP_ARGS(qp, index, flow) +); + DECLARE_EVENT_CLASS(/* tid_node */ hfi1_tid_node_template, TP_PROTO(struct rvt_qp *qp, const char *msg, u32 index, u32 base, @@ -557,6 +644,443 @@ DEFINE_EVENT(/* event */ TP_ARGS(qp, index, entry) ); +DEFINE_EVENT(/* event */ + hfi1_tid_entry_template, hfi1_tid_entry_build_read_resp, + TP_PROTO(struct rvt_qp *qp, int index, u32 ent), + TP_ARGS(qp, index, ent) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_entry_template, hfi1_tid_entry_rcv_read_req, + TP_PROTO(struct rvt_qp *qp, int index, u32 ent), + TP_ARGS(qp, index, ent) +); + +DECLARE_EVENT_CLASS(/* rsp_info */ + hfi1_responder_info_template, + TP_PROTO(struct rvt_qp *qp, u32 psn), + TP_ARGS(qp, psn), + TP_STRUCT__entry(/* entry */ + DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device)) + __field(u32, qpn) + __field(u8, state) + __field(u8, s_state) + __field(u32, psn) + __field(u32, r_psn) + __field(u8, r_state) + __field(u8, r_flags) + __field(u8, r_head_ack_queue) + __field(u8, s_tail_ack_queue) + __field(u8, s_ack_state) + __field(u8, s_nak_state) + __field(u8, r_nak_state) + __field(u32, s_flags) + __field(u32, ps_flags) + __field(unsigned long, iow_flags) + ), + TP_fast_assign(/* assign */ + struct hfi1_qp_priv *priv = qp->priv; + + DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device)); + __entry->qpn = qp->ibqp.qp_num; + __entry->state = qp->state; + __entry->s_state = qp->s_state; + __entry->psn = psn; + __entry->r_psn = qp->r_psn; + __entry->r_state = qp->r_state; + __entry->r_flags = qp->r_flags; + __entry->r_head_ack_queue = qp->r_head_ack_queue; + __entry->s_tail_ack_queue = qp->s_tail_ack_queue; + __entry->s_ack_state = qp->s_ack_state; + __entry->s_nak_state = qp->s_nak_state; + __entry->s_flags = qp->s_flags; + __entry->ps_flags = priv->s_flags; + __entry->iow_flags = priv->s_iowait.flags; + ), + TP_printk(/* print */ + RSP_INFO_PRN, + __get_str(dev), + __entry->qpn, + __entry->state, + __entry->s_state, + __entry->psn, + __entry->r_psn, + __entry->r_state, + __entry->r_flags, + __entry->r_head_ack_queue, + __entry->s_tail_ack_queue, + __entry->s_ack_state, + __entry->s_nak_state, + __entry->s_flags, + __entry->ps_flags, + __entry->iow_flags + ) +); + +DEFINE_EVENT(/* event */ + hfi1_responder_info_template, hfi1_rsp_make_rc_ack, + TP_PROTO(struct rvt_qp *qp, u32 psn), + TP_ARGS(qp, psn) +); + +DEFINE_EVENT(/* event */ + hfi1_responder_info_template, hfi1_rsp_rcv_tid_read_req, + TP_PROTO(struct rvt_qp *qp, u32 psn), + TP_ARGS(qp, psn) +); + +DEFINE_EVENT(/* event */ + hfi1_responder_info_template, hfi1_rsp_tid_rcv_error, + TP_PROTO(struct rvt_qp *qp, u32 psn), + TP_ARGS(qp, psn) +); + +DECLARE_EVENT_CLASS(/* sender_info */ + hfi1_sender_info_template, + TP_PROTO(struct rvt_qp *qp), + TP_ARGS(qp), + TP_STRUCT__entry(/* entry */ + DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device)) + __field(u32, qpn) + __field(u8, state) + __field(u32, s_cur) + __field(u32, s_tail) + __field(u32, s_head) + __field(u32, s_acked) + __field(u32, s_last) + __field(u32, s_psn) + __field(u32, s_last_psn) + __field(u32, s_flags) + __field(u32, ps_flags) + __field(unsigned long, iow_flags) + __field(u8, s_state) + __field(u8, s_num_rd) + __field(u8, s_retry) + ), + TP_fast_assign(/* assign */ + DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device)) + __entry->qpn = qp->ibqp.qp_num; + __entry->state = qp->state; + __entry->s_cur = qp->s_cur; + __entry->s_tail = qp->s_tail; + __entry->s_head = qp->s_head; + __entry->s_acked = qp->s_acked; + __entry->s_last = qp->s_last; + __entry->s_psn = qp->s_psn; + __entry->s_last_psn = qp->s_last_psn; + __entry->s_flags = qp->s_flags; + __entry->ps_flags = ((struct hfi1_qp_priv *)qp->priv)->s_flags; + __entry->iow_flags = + ((struct hfi1_qp_priv *)qp->priv)->s_iowait.flags; + __entry->s_state = qp->s_state; + __entry->s_num_rd = qp->s_num_rd_atomic; + __entry->s_retry = qp->s_retry; + ), + TP_printk(/* print */ + SENDER_INFO_PRN, + __get_str(dev), + __entry->qpn, + __entry->state, + __entry->s_cur, + __entry->s_tail, + __entry->s_head, + __entry->s_acked, + __entry->s_last, + __entry->s_psn, + __entry->s_last_psn, + __entry->s_flags, + __entry->ps_flags, + __entry->iow_flags, + __entry->s_state, + __entry->s_num_rd, + __entry->s_retry + ) +); + +DEFINE_EVENT(/* event */ + hfi1_sender_info_template, hfi1_sender_make_rc_req, + TP_PROTO(struct rvt_qp *qp), + TP_ARGS(qp) +); + +DEFINE_EVENT(/* event */ + hfi1_sender_info_template, hfi1_sender_reset_psn, + TP_PROTO(struct rvt_qp *qp), + TP_ARGS(qp) +); + +DEFINE_EVENT(/* event */ + hfi1_sender_info_template, hfi1_sender_restart_rc, + TP_PROTO(struct rvt_qp *qp), + TP_ARGS(qp) +); + +DEFINE_EVENT(/* event */ + hfi1_sender_info_template, hfi1_sender_do_rc_ack, + TP_PROTO(struct rvt_qp *qp), + TP_ARGS(qp) +); + +DEFINE_EVENT(/* event */ + hfi1_sender_info_template, hfi1_sender_rcv_tid_read_resp, + TP_PROTO(struct rvt_qp *qp), + TP_ARGS(qp) +); + +DECLARE_EVENT_CLASS(/* tid_read_sender */ + hfi1_tid_read_sender_template, + TP_PROTO(struct rvt_qp *qp, char newreq), + TP_ARGS(qp, newreq), + TP_STRUCT__entry(/* entry */ + DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device)) + __field(u32, qpn) + __field(char, newreq) + __field(u32, tid_r_reqs) + __field(u32, tid_r_comp) + __field(u32, pending_tid_r_segs) + __field(u32, s_flags) + __field(u32, ps_flags) + __field(unsigned long, iow_flags) + __field(u32, hw_flow_index) + __field(u32, generation) + __field(u32, fpsn) + __field(u32, flow_flags) + ), + TP_fast_assign(/* assign */ + struct hfi1_qp_priv *priv = qp->priv; + + DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device)); + __entry->qpn = qp->ibqp.qp_num; + __entry->newreq = newreq; + __entry->tid_r_reqs = priv->tid_r_reqs; + __entry->tid_r_comp = priv->tid_r_comp; + __entry->pending_tid_r_segs = priv->pending_tid_r_segs; + __entry->s_flags = qp->s_flags; + __entry->ps_flags = priv->s_flags; + __entry->iow_flags = priv->s_iowait.flags; + __entry->hw_flow_index = priv->flow_state.index; + __entry->generation = priv->flow_state.generation; + __entry->fpsn = priv->flow_state.psn; + __entry->flow_flags = priv->flow_state.flags; + ), + TP_printk(/* print */ + TID_READ_SENDER_PRN, + __get_str(dev), + __entry->qpn, + __entry->newreq, + __entry->tid_r_reqs, + __entry->tid_r_comp, + __entry->pending_tid_r_segs, + __entry->s_flags, + __entry->ps_flags, + __entry->iow_flags, + __entry->hw_flow_index, + __entry->generation, + __entry->fpsn, + __entry->flow_flags + ) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_read_sender_template, hfi1_tid_read_sender_make_req, + TP_PROTO(struct rvt_qp *qp, char newreq), + TP_ARGS(qp, newreq) +); + +DECLARE_EVENT_CLASS(/* tid_rdma_request */ + hfi1_tid_rdma_request_template, + TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn, + struct tid_rdma_request *req), + TP_ARGS(qp, newreq, opcode, psn, lpsn, req), + TP_STRUCT__entry(/* entry */ + DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device)) + __field(u32, qpn) + __field(char, newreq) + __field(u8, opcode) + __field(u32, psn) + __field(u32, lpsn) + __field(u32, cur_seg) + __field(u32, comp_seg) + __field(u32, ack_seg) + __field(u32, total_segs) + __field(u16, setup_head) + __field(u16, clear_tail) + __field(u16, flow_idx) + __field(u32, state) + __field(u32, r_flow_psn) + __field(u32, s_next_psn) + ), + TP_fast_assign(/* assign */ + DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device)); + __entry->qpn = qp->ibqp.qp_num; + __entry->newreq = newreq; + __entry->opcode = opcode; + __entry->psn = psn; + __entry->lpsn = lpsn; + __entry->cur_seg = req->cur_seg; + __entry->comp_seg = req->comp_seg; + __entry->ack_seg = req->ack_seg; + __entry->total_segs = req->total_segs; + __entry->setup_head = req->setup_head; + __entry->clear_tail = req->clear_tail; + __entry->flow_idx = req->flow_idx; + __entry->state = req->state; + __entry->r_flow_psn = req->r_flow_psn; + __entry->s_next_psn = req->s_next_psn; + ), + TP_printk(/* print */ + TID_REQ_PRN, + __get_str(dev), + __entry->qpn, + __entry->newreq, + __entry->opcode, + __entry->psn, + __entry->lpsn, + __entry->cur_seg, + __entry->comp_seg, + __entry->ack_seg, + __entry->total_segs, + __entry->setup_head, + __entry->clear_tail, + __entry->flow_idx, + __entry->state, + __entry->r_flow_psn, + __entry->s_next_psn + ) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_rdma_request_template, hfi1_tid_req_make_req_read, + TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn, + struct tid_rdma_request *req), + TP_ARGS(qp, newreq, opcode, psn, lpsn, req) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_rdma_request_template, hfi1_tid_req_build_read_req, + TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn, + struct tid_rdma_request *req), + TP_ARGS(qp, newreq, opcode, psn, lpsn, req) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_rdma_request_template, hfi1_tid_req_rcv_read_req, + TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn, + struct tid_rdma_request *req), + TP_ARGS(qp, newreq, opcode, psn, lpsn, req) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_rdma_request_template, hfi1_tid_req_rcv_read_resp, + TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn, + struct tid_rdma_request *req), + TP_ARGS(qp, newreq, opcode, psn, lpsn, req) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_rdma_request_template, hfi1_tid_req_rcv_err, + TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn, + struct tid_rdma_request *req), + TP_ARGS(qp, newreq, opcode, psn, lpsn, req) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_rdma_request_template, hfi1_tid_req_restart_req, + TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn, + struct tid_rdma_request *req), + TP_ARGS(qp, newreq, opcode, psn, lpsn, req) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_rdma_request_template, hfi1_tid_req_setup_tid_wqe, + TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn, + struct tid_rdma_request *req), + TP_ARGS(qp, newreq, opcode, psn, lpsn, req) +); + +DECLARE_EVENT_CLASS(/* rc_rcv_err */ + hfi1_rc_rcv_err_template, + TP_PROTO(struct rvt_qp *qp, u32 opcode, u32 psn, int diff), + TP_ARGS(qp, opcode, psn, diff), + TP_STRUCT__entry(/* entry */ + DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device)) + __field(u32, qpn) + __field(u32, s_flags) + __field(u8, state) + __field(u8, s_tail_ack_queue) + __field(u8, r_head_ack_queue) + __field(u32, opcode) + __field(u32, psn) + __field(u32, r_psn) + __field(int, diff) + ), + TP_fast_assign(/* assign */ + DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device)) + __entry->qpn = qp->ibqp.qp_num; + __entry->s_flags = qp->s_flags; + __entry->state = qp->state; + __entry->s_tail_ack_queue = qp->s_tail_ack_queue; + __entry->r_head_ack_queue = qp->r_head_ack_queue; + __entry->opcode = opcode; + __entry->psn = psn; + __entry->r_psn = qp->r_psn; + __entry->diff = diff; + ), + TP_printk(/* print */ + RCV_ERR_PRN, + __get_str(dev), + __entry->qpn, + __entry->s_flags, + __entry->state, + __entry->s_tail_ack_queue, + __entry->r_head_ack_queue, + __entry->opcode, + __entry->psn, + __entry->r_psn, + __entry->diff + ) +); + +DEFINE_EVENT(/* event */ + hfi1_rc_rcv_err_template, hfi1_tid_rdma_rcv_err, + TP_PROTO(struct rvt_qp *qp, u32 opcode, u32 psn, int diff), + TP_ARGS(qp, opcode, psn, diff) +); + +DECLARE_EVENT_CLASS(/* sge */ + hfi1_sge_template, + TP_PROTO(struct rvt_qp *qp, int index, struct rvt_sge *sge), + TP_ARGS(qp, index, sge), + TP_STRUCT__entry(/* entry */ + DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device)) + __field(u32, qpn) + __field(int, index) + __field(u64, vaddr) + __field(u32, sge_length) + ), + TP_fast_assign(/* assign */ + DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device)); + __entry->qpn = qp->ibqp.qp_num; + __entry->index = index; + __entry->vaddr = (u64)sge->vaddr; + __entry->sge_length = sge->sge_length; + ), + TP_printk(/* print */ + "[%s] qpn 0x%x sge %d: vaddr 0x%llx sge_length %u", + __get_str(dev), + __entry->qpn, + __entry->index, + __entry->vaddr, + __entry->sge_length + ) +); + +DEFINE_EVENT(/* event */ + hfi1_sge_template, hfi1_sge_check_align, + TP_PROTO(struct rvt_qp *qp, int index, struct rvt_sge *sge), + TP_ARGS(qp, index, sge) +); + #endif /* __HFI1_TRACE_TID_H */ #undef TRACE_INCLUDE_PATH diff --git a/drivers/infiniband/hw/hfi1/trace_tx.h b/drivers/infiniband/hw/hfi1/trace_tx.h index c57af3b31fe1..37dbb3e599c3 100644 --- a/drivers/infiniband/hw/hfi1/trace_tx.h +++ b/drivers/infiniband/hw/hfi1/trace_tx.h @@ -114,19 +114,27 @@ DECLARE_EVENT_CLASS(hfi1_qpsleepwakeup_template, __field(u32, qpn) __field(u32, flags) __field(u32, s_flags) + __field(u32, ps_flags) + __field(unsigned long, iow_flags) ), TP_fast_assign( DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device)) __entry->flags = flags; __entry->qpn = qp->ibqp.qp_num; __entry->s_flags = qp->s_flags; + __entry->ps_flags = + ((struct hfi1_qp_priv *)qp->priv)->s_flags; + __entry->iow_flags = + ((struct hfi1_qp_priv *)qp->priv)->s_iowait.flags; ), TP_printk( - "[%s] qpn 0x%x flags 0x%x s_flags 0x%x", + "[%s] qpn 0x%x flags 0x%x s_flags 0x%x ps_flags 0x%x iow_flags 0x%lx", __get_str(dev), __entry->qpn, __entry->flags, - __entry->s_flags + __entry->s_flags, + __entry->ps_flags, + __entry->iow_flags ) ); -- cgit v1.2.3-59-g8ed1b From 4f9264d156dc6c154a8a6cfae780730bad45c6f8 Mon Sep 17 00:00:00 2001 From: Kaike Wan Date: Wed, 23 Jan 2019 21:48:48 -0800 Subject: IB/hfi1: Add an s_acked_ack_queue pointer The s_ack_queue is managed by two pointers into the ring: r_head_ack_queue and s_tail_ack_queue. r_head_ack_queue is the index of where the next received request is going to be placed and s_tail_ack_queue is the entry of the request currently being processed. This works perfectly fine for normal Verbs as the requests are processed one at a time and the s_tail_ack_queue is not moved until the request that it points to is fully completed. In this fashion, s_tail_ack_queue constantly chases r_head_ack_queue and the two pointers can easily be used to determine "queue full" and "queue empty" conditions. The detection of these two conditions are imported in determining when an old entry can safely be overwritten with a new received request and the resources associated with the old request be safely released. When pipelined TID RDMA WRITE is introduced into this mix, things look very different. r_head_ack_queue is still the point at which a newly received request will be inserted, s_tail_ack_queue is still the currently processed request. However, with pipelined TID RDMA WRITE requests, s_tail_ack_queue moves to the next request once all TID RDMA WRITE responses for that request have been sent. The rest of the protocol for a particular request is managed by other pointers specific to TID RDMA - r_tid_tail and r_tid_ack - which point to the entries for which the next TID RDMA DATA packets are going to arrive and the request for which the next TID RDMA ACK packets are to be generated, respectively. What this means is that entries in the ring, which are "behind" s_tail_ack_queue (entries which s_tail_ack_queue has gone past) are no longer considered complete. This is where the problem is - a newly received request could potentially overwrite a still active TID RDMA WRITE request. The reason why the TID RDMA pointers trail s_tail_ack_queue is that the normal Verbs send engine uses s_tail_ack_queue as the pointer for the next response. Since TID RDMA WRITE responses are processed by the normal Verbs send engine, s_tail_ack_queue had to be moved to the next entry once all TID RDMA WRITE response packets were sent to get the desired pipelining between requests. Doing otherwise would mean that the normal Verbs send engine would not be able to send the TID RDMA WRITE responses for the next TID RDMA request until the current one is fully completed. This patch introduces the s_acked_ack_queue index to point to the next request to complete on the responder side. For requests other than TID RDMA WRITE, s_acked_ack_queue should always be kept in sync with s_tail_ack_queue. For TID RDMA WRITE request, it may fall behind s_tail_ack_queue. Reviewed-by: Mike Marciniszyn Signed-off-by: Mitko Haralanov Signed-off-by: Kaike Wan Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/rc.c | 33 ++++++++++++++++++++++++++++----- drivers/infiniband/hw/hfi1/rc.h | 1 + drivers/infiniband/hw/hfi1/tid_rdma.c | 2 ++ drivers/infiniband/hw/hfi1/trace_tid.h | 10 ++++++++-- drivers/infiniband/sw/rdmavt/qp.c | 1 + include/rdma/rdmavt_qp.h | 1 + 6 files changed, 41 insertions(+), 7 deletions(-) (limited to 'drivers/infiniband/hw/hfi1/rc.c') diff --git a/drivers/infiniband/hw/hfi1/rc.c b/drivers/infiniband/hw/hfi1/rc.c index 6c9ef572fc69..9dc8e524510e 100644 --- a/drivers/infiniband/hw/hfi1/rc.c +++ b/drivers/infiniband/hw/hfi1/rc.c @@ -120,6 +120,7 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp, struct hfi1_qp_priv *priv = qp->priv; bool last_pkt; u32 delta; + u8 next = qp->s_tail_ack_queue; trace_hfi1_rsp_make_rc_ack(qp, 0); lockdep_assert_held(&qp->s_lock); @@ -149,9 +150,17 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp, * response has been sent instead of only being * constructed. */ - if (++qp->s_tail_ack_queue > - rvt_size_atomic(ib_to_rvt(qp->ibqp.device))) - qp->s_tail_ack_queue = 0; + if (++next > rvt_size_atomic(&dev->rdi)) + next = 0; + /* + * Only advance the s_acked_ack_queue pointer if there + * have been no TID RDMA requests. + */ + e = &qp->s_ack_queue[qp->s_tail_ack_queue]; + if (e->opcode != TID_OP(WRITE_REQ) && + qp->s_acked_ack_queue == qp->s_tail_ack_queue) + qp->s_acked_ack_queue = next; + qp->s_tail_ack_queue = next; /* FALLTHROUGH */ case OP(SEND_ONLY): case OP(ACKNOWLEDGE): @@ -172,6 +181,10 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp, */ len = e->rdma_sge.sge_length; if (len && !e->rdma_sge.mr) { + if (qp->s_acked_ack_queue == + qp->s_tail_ack_queue) + qp->s_acked_ack_queue = + qp->r_head_ack_queue; qp->s_tail_ack_queue = qp->r_head_ack_queue; goto bail; } @@ -202,6 +215,10 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp, */ len = e->rdma_sge.sge_length; if (len && !e->rdma_sge.mr) { + if (qp->s_acked_ack_queue == + qp->s_tail_ack_queue) + qp->s_acked_ack_queue = + qp->r_head_ack_queue; qp->s_tail_ack_queue = qp->r_head_ack_queue; goto bail; } @@ -2235,6 +2252,8 @@ static noinline int rc_rcv_error(struct ib_other_headers *ohdr, void *data, e->psn = psn; if (old_req) goto unlock_done; + if (qp->s_acked_ack_queue == qp->s_tail_ack_queue) + qp->s_acked_ack_queue = prev; qp->s_tail_ack_queue = prev; break; } @@ -2248,6 +2267,8 @@ static noinline int rc_rcv_error(struct ib_other_headers *ohdr, void *data, */ if (!e || e->opcode != (u8)opcode || old_req) goto unlock_done; + if (qp->s_tail_ack_queue == qp->s_acked_ack_queue) + qp->s_acked_ack_queue = prev; qp->s_tail_ack_queue = prev; break; } @@ -2274,6 +2295,8 @@ static noinline int rc_rcv_error(struct ib_other_headers *ohdr, void *data, * Resend the RDMA read or atomic op which * ACKs this duplicate request. */ + if (qp->s_tail_ack_queue == qp->s_acked_ack_queue) + qp->s_acked_ack_queue = mra; qp->s_tail_ack_queue = mra; break; } @@ -2646,7 +2669,7 @@ send_last: if (next > rvt_size_atomic(ib_to_rvt(qp->ibqp.device))) next = 0; spin_lock_irqsave(&qp->s_lock, flags); - if (unlikely(next == qp->s_tail_ack_queue)) { + if (unlikely(next == qp->s_acked_ack_queue)) { if (!qp->s_ack_queue[next].sent) goto nack_inv_unlck; update_ack_queue(qp, next); @@ -2723,7 +2746,7 @@ send_last: if (next > rvt_size_atomic(ib_to_rvt(qp->ibqp.device))) next = 0; spin_lock_irqsave(&qp->s_lock, flags); - if (unlikely(next == qp->s_tail_ack_queue)) { + if (unlikely(next == qp->s_acked_ack_queue)) { if (!qp->s_ack_queue[next].sent) goto nack_inv_unlck; update_ack_queue(qp, next); diff --git a/drivers/infiniband/hw/hfi1/rc.h b/drivers/infiniband/hw/hfi1/rc.h index 4329eadcb3df..8e0935b9bf2a 100644 --- a/drivers/infiniband/hw/hfi1/rc.h +++ b/drivers/infiniband/hw/hfi1/rc.h @@ -18,6 +18,7 @@ static inline void update_ack_queue(struct rvt_qp *qp, unsigned int n) if (next > rvt_size_atomic(ib_to_rvt(qp->ibqp.device))) next = 0; qp->s_tail_ack_queue = next; + qp->s_acked_ack_queue = next; qp->s_ack_state = OP(ACKNOWLEDGE); } diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.c b/drivers/infiniband/hw/hfi1/tid_rdma.c index 089e301d9bcd..c320a99afb35 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.c +++ b/drivers/infiniband/hw/hfi1/tid_rdma.c @@ -2044,6 +2044,8 @@ static int tid_rdma_rcv_error(struct hfi1_packet *packet, goto unlock; } /* Re-process old requests.*/ + if (qp->s_acked_ack_queue == qp->s_tail_ack_queue) + qp->s_acked_ack_queue = prev; qp->s_tail_ack_queue = prev; /* * Since the qp->s_tail_ack_queue is modified, the diff --git a/drivers/infiniband/hw/hfi1/trace_tid.h b/drivers/infiniband/hw/hfi1/trace_tid.h index b71638c22d4b..51f5b0e8da71 100644 --- a/drivers/infiniband/hw/hfi1/trace_tid.h +++ b/drivers/infiniband/hw/hfi1/trace_tid.h @@ -40,7 +40,7 @@ u16 hfi1_trace_get_tid_idx(u32 ent); #define RSP_INFO_PRN "[%s] qpn 0x%x state 0x%x s_state 0x%x psn 0x%x " \ "r_psn 0x%x r_state 0x%x r_flags 0x%x " \ "r_head_ack_queue %u s_tail_ack_queue %u " \ - "s_ack_state 0x%x " \ + "s_acked_ack_queue %u s_ack_state 0x%x " \ "s_nak_state 0x%x s_flags 0x%x ps_flags 0x%x " \ "iow_flags 0x%lx" @@ -62,7 +62,7 @@ u16 hfi1_trace_get_tid_idx(u32 ent); "s_next_psn 0x%x" #define RCV_ERR_PRN "[%s] qpn 0x%x s_flags 0x%x state 0x%x " \ - "s_tail_ack_queue %u " \ + "s_acked_ack_queue %u s_tail_ack_queue %u " \ "r_head_ack_queue %u opcode 0x%x psn 0x%x r_psn 0x%x " \ " diff %d" @@ -671,6 +671,7 @@ DECLARE_EVENT_CLASS(/* rsp_info */ __field(u8, r_flags) __field(u8, r_head_ack_queue) __field(u8, s_tail_ack_queue) + __field(u8, s_acked_ack_queue) __field(u8, s_ack_state) __field(u8, s_nak_state) __field(u8, r_nak_state) @@ -691,6 +692,7 @@ DECLARE_EVENT_CLASS(/* rsp_info */ __entry->r_flags = qp->r_flags; __entry->r_head_ack_queue = qp->r_head_ack_queue; __entry->s_tail_ack_queue = qp->s_tail_ack_queue; + __entry->s_acked_ack_queue = qp->s_acked_ack_queue; __entry->s_ack_state = qp->s_ack_state; __entry->s_nak_state = qp->s_nak_state; __entry->s_flags = qp->s_flags; @@ -709,6 +711,7 @@ DECLARE_EVENT_CLASS(/* rsp_info */ __entry->r_flags, __entry->r_head_ack_queue, __entry->s_tail_ack_queue, + __entry->s_acked_ack_queue, __entry->s_ack_state, __entry->s_nak_state, __entry->s_flags, @@ -1007,6 +1010,7 @@ DECLARE_EVENT_CLASS(/* rc_rcv_err */ __field(u32, qpn) __field(u32, s_flags) __field(u8, state) + __field(u8, s_acked_ack_queue) __field(u8, s_tail_ack_queue) __field(u8, r_head_ack_queue) __field(u32, opcode) @@ -1019,6 +1023,7 @@ DECLARE_EVENT_CLASS(/* rc_rcv_err */ __entry->qpn = qp->ibqp.qp_num; __entry->s_flags = qp->s_flags; __entry->state = qp->state; + __entry->s_acked_ack_queue = qp->s_acked_ack_queue; __entry->s_tail_ack_queue = qp->s_tail_ack_queue; __entry->r_head_ack_queue = qp->r_head_ack_queue; __entry->opcode = opcode; @@ -1032,6 +1037,7 @@ DECLARE_EVENT_CLASS(/* rc_rcv_err */ __entry->qpn, __entry->s_flags, __entry->state, + __entry->s_acked_ack_queue, __entry->s_tail_ack_queue, __entry->r_head_ack_queue, __entry->opcode, diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c index 2769ebdf89fb..14ec2577bcaa 100644 --- a/drivers/infiniband/sw/rdmavt/qp.c +++ b/drivers/infiniband/sw/rdmavt/qp.c @@ -854,6 +854,7 @@ static void rvt_init_qp(struct rvt_dev_info *rdi, struct rvt_qp *qp, qp->s_mig_state = IB_MIG_MIGRATED; qp->r_head_ack_queue = 0; qp->s_tail_ack_queue = 0; + qp->s_acked_ack_queue = 0; qp->s_num_rd_atomic = 0; if (qp->r_rq.wq) { qp->r_rq.wq->head = 0; diff --git a/include/rdma/rdmavt_qp.h b/include/rdma/rdmavt_qp.h index d8d88d023092..4ee612ab6cb4 100644 --- a/include/rdma/rdmavt_qp.h +++ b/include/rdma/rdmavt_qp.h @@ -375,6 +375,7 @@ struct rvt_qp { u8 s_rnr_retry; /* requester RNR retry counter */ u8 s_num_rd_atomic; /* number of RDMA read/atomic pending */ u8 s_tail_ack_queue; /* index into s_ack_queue[] */ + u8 s_acked_ack_queue; /* index into s_ack_queue[] */ struct rvt_sge_state s_ack_rdma_sge; struct timer_list s_timer; -- cgit v1.2.3-59-g8ed1b From 07b923701e38f93b4725e64318e6483f890c1c1d Mon Sep 17 00:00:00 2001 From: Kaike Wan Date: Wed, 23 Jan 2019 21:48:59 -0800 Subject: IB/hfi1: Add functions to receive TID RDMA WRITE request This patch adds the functions to receive TID RDMA WRITE request. The request will be stored in the QP's s_ack_queue. This patch also adds code to handle duplicate TID RDMA WRITE request and a function to allocate TID resources for data receiving on the responder side. Signed-off-by: Mitko Haralanov Signed-off-by: Mike Marciniszyn Signed-off-by: Ashutosh Dixit Signed-off-by: Kaike Wan Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/init.c | 1 + drivers/infiniband/hw/hfi1/rc.c | 3 + drivers/infiniband/hw/hfi1/tid_rdma.c | 570 +++++++++++++++++++++++++++++++++- drivers/infiniband/hw/hfi1/tid_rdma.h | 16 + drivers/infiniband/hw/hfi1/verbs.h | 12 + 5 files changed, 601 insertions(+), 1 deletion(-) (limited to 'drivers/infiniband/hw/hfi1/rc.c') diff --git a/drivers/infiniband/hw/hfi1/init.c b/drivers/infiniband/hw/hfi1/init.c index d13304f7340d..7841a0ad7cb6 100644 --- a/drivers/infiniband/hw/hfi1/init.c +++ b/drivers/infiniband/hw/hfi1/init.c @@ -1512,6 +1512,7 @@ static int __init hfi1_mod_init(void) goto bail_dev; } + hfi1_compute_tid_rdma_flow_wt(); /* * These must be called before the driver is registered with * the PCI subsystem. diff --git a/drivers/infiniband/hw/hfi1/rc.c b/drivers/infiniband/hw/hfi1/rc.c index 9dc8e524510e..fcb733ea8dfb 100644 --- a/drivers/infiniband/hw/hfi1/rc.c +++ b/drivers/infiniband/hw/hfi1/rc.c @@ -2411,6 +2411,7 @@ void hfi1_rc_rcv(struct hfi1_packet *packet) void *data = packet->payload; u32 tlen = packet->tlen; struct rvt_qp *qp = packet->qp; + struct hfi1_qp_priv *qpriv = qp->priv; struct hfi1_ibport *ibp = rcd_to_iport(rcd); struct ib_other_headers *ohdr = packet->ohdr; u32 opcode = packet->opcode; @@ -2716,6 +2717,7 @@ send_last: qp->r_state = opcode; qp->r_nak_state = 0; qp->r_head_ack_queue = next; + qpriv->r_tid_alloc = qp->r_head_ack_queue; /* Schedule the send engine. */ qp->s_flags |= RVT_S_RESP_PENDING; @@ -2789,6 +2791,7 @@ ack: qp->r_state = opcode; qp->r_nak_state = 0; qp->r_head_ack_queue = next; + qpriv->r_tid_alloc = qp->r_head_ack_queue; /* Schedule the send engine. */ qp->s_flags |= RVT_S_RESP_PENDING; diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.c b/drivers/infiniband/hw/hfi1/tid_rdma.c index c320a99afb35..516dca9f497e 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.c +++ b/drivers/infiniband/hw/hfi1/tid_rdma.c @@ -109,12 +109,15 @@ static u32 mask_generation(u32 a) * C - Capcode */ +static u32 tid_rdma_flow_wt; + static void tid_rdma_trigger_resume(struct work_struct *work); static void hfi1_kern_exp_rcv_free_flows(struct tid_rdma_request *req); static int hfi1_kern_exp_rcv_alloc_flows(struct tid_rdma_request *req, gfp_t gfp); static void hfi1_init_trdma_req(struct rvt_qp *qp, struct tid_rdma_request *req); +static void hfi1_tid_write_alloc_resources(struct rvt_qp *qp, bool intr_ctx); static u64 tid_rdma_opfn_encode(struct tid_rdma_params *p) { @@ -313,6 +316,11 @@ int hfi1_qp_priv_init(struct rvt_dev_info *rdi, struct rvt_qp *qp, qpriv->flow_state.index = RXE_NUM_TID_FLOWS; qpriv->flow_state.last_index = RXE_NUM_TID_FLOWS; qpriv->flow_state.generation = KERN_GENERATION_RESERVED; + qpriv->rnr_nak_state = TID_RNR_NAK_INIT; + qpriv->r_tid_head = HFI1_QP_WQE_INVALID; + qpriv->r_tid_tail = HFI1_QP_WQE_INVALID; + qpriv->r_tid_ack = HFI1_QP_WQE_INVALID; + qpriv->r_tid_alloc = HFI1_QP_WQE_INVALID; INIT_LIST_HEAD(&qpriv->tid_wait); if (init_attr->qp_type == IB_QPT_RC && HFI1_CAP_IS_KSET(TID_RDMA)) { @@ -1959,6 +1967,8 @@ static int tid_rdma_rcv_error(struct hfi1_packet *packet, { struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num); struct hfi1_ctxtdata *rcd = ((struct hfi1_qp_priv *)qp->priv)->rcd; + struct hfi1_ibdev *dev = to_idev(qp->ibqp.device); + struct hfi1_qp_priv *qpriv = qp->priv; struct rvt_ack_entry *e; struct tid_rdma_request *req; unsigned long flags; @@ -1982,7 +1992,8 @@ static int tid_rdma_rcv_error(struct hfi1_packet *packet, spin_lock_irqsave(&qp->s_lock, flags); e = find_prev_entry(qp, psn, &prev, NULL, &old_req); - if (!e || e->opcode != TID_OP(READ_REQ)) + if (!e || (e->opcode != TID_OP(READ_REQ) && + e->opcode != TID_OP(WRITE_REQ))) goto unlock; req = ack_to_tid_req(e); @@ -2042,6 +2053,114 @@ static int tid_rdma_rcv_error(struct hfi1_packet *packet, */ if (old_req) goto unlock; + } else { + struct flow_state *fstate; + bool schedule = false; + u8 i; + + if (req->state == TID_REQUEST_RESEND) { + req->state = TID_REQUEST_RESEND_ACTIVE; + } else if (req->state == TID_REQUEST_INIT_RESEND) { + req->state = TID_REQUEST_INIT; + schedule = true; + } + + /* + * True if the request is already scheduled (between + * qp->s_tail_ack_queue and qp->r_head_ack_queue). + * Also, don't change requests, which are at the SYNC + * point and haven't generated any responses yet. + * There is nothing to retransmit for them yet. + */ + if (old_req || req->state == TID_REQUEST_INIT || + (req->state == TID_REQUEST_SYNC && !req->cur_seg)) { + for (i = prev + 1; ; i++) { + if (i > rvt_size_atomic(&dev->rdi)) + i = 0; + if (i == qp->r_head_ack_queue) + break; + e = &qp->s_ack_queue[i]; + req = ack_to_tid_req(e); + if (e->opcode == TID_OP(WRITE_REQ) && + req->state == TID_REQUEST_INIT) + req->state = TID_REQUEST_INIT_RESEND; + } + /* + * If the state of the request has been changed, + * the first leg needs to get scheduled in order to + * pick up the change. Otherwise, normal response + * processing should take care of it. + */ + if (!schedule) + goto unlock; + } + + /* + * If there is no more allocated segment, just schedule the qp + * without changing any state. + */ + if (req->clear_tail == req->setup_head) + goto schedule; + /* + * If this request has sent responses for segments, which have + * not received data yet (flow_idx != clear_tail), the flow_idx + * pointer needs to be adjusted so the same responses can be + * re-sent. + */ + if (CIRC_CNT(req->flow_idx, req->clear_tail, MAX_FLOWS)) { + fstate = &req->flows[req->clear_tail].flow_state; + qpriv->pending_tid_w_segs -= + CIRC_CNT(req->flow_idx, req->clear_tail, + MAX_FLOWS); + req->flow_idx = + CIRC_ADD(req->clear_tail, + delta_psn(psn, fstate->resp_ib_psn), + MAX_FLOWS); + qpriv->pending_tid_w_segs += + delta_psn(psn, fstate->resp_ib_psn); + /* + * When flow_idx == setup_head, we've gotten a duplicate + * request for a segment, which has not been allocated + * yet. In that case, don't adjust this request. + * However, we still want to go through the loop below + * to adjust all subsequent requests. + */ + if (CIRC_CNT(req->setup_head, req->flow_idx, + MAX_FLOWS)) { + req->cur_seg = delta_psn(psn, e->psn); + req->state = TID_REQUEST_RESEND_ACTIVE; + } + } + + for (i = prev + 1; ; i++) { + /* + * Look at everything up to and including + * s_tail_ack_queue + */ + if (i > rvt_size_atomic(&dev->rdi)) + i = 0; + if (i == qp->r_head_ack_queue) + break; + e = &qp->s_ack_queue[i]; + req = ack_to_tid_req(e); + trace_hfi1_tid_req_rcv_err(qp, 0, e->opcode, e->psn, + e->lpsn, req); + if (e->opcode != TID_OP(WRITE_REQ) || + req->cur_seg == req->comp_seg || + req->state == TID_REQUEST_INIT || + req->state == TID_REQUEST_INIT_RESEND) { + if (req->state == TID_REQUEST_INIT) + req->state = TID_REQUEST_INIT_RESEND; + continue; + } + qpriv->pending_tid_w_segs -= + CIRC_CNT(req->flow_idx, + req->clear_tail, + MAX_FLOWS); + req->flow_idx = req->clear_tail; + req->state = TID_REQUEST_RESEND; + req->cur_seg = req->comp_seg; + } } /* Re-process old requests.*/ if (qp->s_acked_ack_queue == qp->s_tail_ack_queue) @@ -2054,6 +2173,18 @@ static int tid_rdma_rcv_error(struct hfi1_packet *packet, * wrong memory region. */ qp->s_ack_state = OP(ACKNOWLEDGE); +schedule: + /* + * It's possible to receive a retry psn that is earlier than an RNRNAK + * psn. In this case, the rnrnak state should be cleared. + */ + if (qpriv->rnr_nak_state) { + qp->s_nak_state = 0; + qpriv->rnr_nak_state = TID_RNR_NAK_INIT; + qp->r_psn = e->lpsn + 1; + hfi1_tid_write_alloc_resources(qp, true); + } + qp->r_state = e->opcode; qp->r_nak_state = 0; qp->s_flags |= RVT_S_RESP_PENDING; @@ -2164,6 +2295,14 @@ void hfi1_rc_rcv_tid_rdma_read_req(struct hfi1_packet *packet) qp->r_head_ack_queue = next; + /* + * For all requests other than TID WRITE which are added to the ack + * queue, qpriv->r_tid_alloc follows qp->r_head_ack_queue. It is ok to + * do this because of interlocks between these and TID WRITE + * requests. The same change has also been made in hfi1_rc_rcv(). + */ + qpriv->r_tid_alloc = qp->r_head_ack_queue; + /* Schedule the send tasklet. */ qp->s_flags |= RVT_S_RESP_PENDING; hfi1_schedule_send(qp); @@ -3015,3 +3154,432 @@ u32 hfi1_build_tid_rdma_write_req(struct rvt_qp *qp, struct rvt_swqe *wqe, rcu_read_unlock(); return sizeof(ohdr->u.tid_rdma.w_req) / sizeof(u32); } + +void hfi1_compute_tid_rdma_flow_wt(void) +{ + /* + * Heuristic for computing the RNR timeout when waiting on the flow + * queue. Rather than a computationaly expensive exact estimate of when + * a flow will be available, we assume that if a QP is at position N in + * the flow queue it has to wait approximately (N + 1) * (number of + * segments between two sync points), assuming PMTU of 4K. The rationale + * for this is that flows are released and recycled at each sync point. + */ + tid_rdma_flow_wt = MAX_TID_FLOW_PSN * enum_to_mtu(OPA_MTU_4096) / + TID_RDMA_MAX_SEGMENT_SIZE; +} + +static u32 position_in_queue(struct hfi1_qp_priv *qpriv, + struct tid_queue *queue) +{ + return qpriv->tid_enqueue - queue->dequeue; +} + +/* + * @qp: points to rvt_qp context. + * @to_seg: desired RNR timeout in segments. + * Return: index of the next highest timeout in the ib_hfi1_rnr_table[] + */ +static u32 hfi1_compute_tid_rnr_timeout(struct rvt_qp *qp, u32 to_seg) +{ + struct hfi1_qp_priv *qpriv = qp->priv; + u64 timeout; + u32 bytes_per_us; + u8 i; + + bytes_per_us = active_egress_rate(qpriv->rcd->ppd) / 8; + timeout = (to_seg * TID_RDMA_MAX_SEGMENT_SIZE) / bytes_per_us; + /* + * Find the next highest value in the RNR table to the required + * timeout. This gives the responder some padding. + */ + for (i = 1; i <= IB_AETH_CREDIT_MASK; i++) + if (rvt_rnr_tbl_to_usec(i) >= timeout) + return i; + return 0; +} + +/** + * Central place for resource allocation at TID write responder, + * is called from write_req and write_data interrupt handlers as + * well as the send thread when a queued QP is scheduled for + * resource allocation. + * + * Iterates over (a) segments of a request and then (b) queued requests + * themselves to allocate resources for up to local->max_write + * segments across multiple requests. Stop allocating when we + * hit a sync point, resume allocating after data packets at + * sync point have been received. + * + * Resource allocation and sending of responses is decoupled. The + * request/segment which are being allocated and sent are as follows. + * Resources are allocated for: + * [request: qpriv->r_tid_alloc, segment: req->alloc_seg] + * The send thread sends: + * [request: qp->s_tail_ack_queue, segment:req->cur_seg] + */ +static void hfi1_tid_write_alloc_resources(struct rvt_qp *qp, bool intr_ctx) +{ + struct tid_rdma_request *req; + struct hfi1_qp_priv *qpriv = qp->priv; + struct hfi1_ctxtdata *rcd = qpriv->rcd; + struct tid_rdma_params *local = &qpriv->tid_rdma.local; + struct rvt_ack_entry *e; + u32 npkts, to_seg; + bool last; + int ret = 0; + + lockdep_assert_held(&qp->s_lock); + + while (1) { + /* + * Don't allocate more segments if a RNR NAK has already been + * scheduled to avoid messing up qp->r_psn: the RNR NAK will + * be sent only when all allocated segments have been sent. + * However, if more segments are allocated before that, TID RDMA + * WRITE RESP packets will be sent out for these new segments + * before the RNR NAK packet. When the requester receives the + * RNR NAK packet, it will restart with qp->s_last_psn + 1, + * which does not match qp->r_psn and will be dropped. + * Consequently, the requester will exhaust its retries and + * put the qp into error state. + */ + if (qpriv->rnr_nak_state == TID_RNR_NAK_SEND) + break; + + /* No requests left to process */ + if (qpriv->r_tid_alloc == qpriv->r_tid_head) { + /* If all data has been received, clear the flow */ + if (qpriv->flow_state.index < RXE_NUM_TID_FLOWS && + !qpriv->alloc_w_segs) + hfi1_kern_clear_hw_flow(rcd, qp); + break; + } + + e = &qp->s_ack_queue[qpriv->r_tid_alloc]; + if (e->opcode != TID_OP(WRITE_REQ)) + goto next_req; + req = ack_to_tid_req(e); + /* Finished allocating for all segments of this request */ + if (req->alloc_seg >= req->total_segs) + goto next_req; + + /* Can allocate only a maximum of local->max_write for a QP */ + if (qpriv->alloc_w_segs >= local->max_write) + break; + + /* Don't allocate at a sync point with data packets pending */ + if (qpriv->sync_pt && qpriv->alloc_w_segs) + break; + + /* All data received at the sync point, continue */ + if (qpriv->sync_pt && !qpriv->alloc_w_segs) { + hfi1_kern_clear_hw_flow(rcd, qp); + qpriv->sync_pt = false; + if (qpriv->s_flags & HFI1_R_TID_SW_PSN) + qpriv->s_flags &= ~HFI1_R_TID_SW_PSN; + } + + /* Allocate flow if we don't have one */ + if (qpriv->flow_state.index >= RXE_NUM_TID_FLOWS) { + ret = hfi1_kern_setup_hw_flow(qpriv->rcd, qp); + if (ret) { + to_seg = tid_rdma_flow_wt * + position_in_queue(qpriv, + &rcd->flow_queue); + break; + } + } + + npkts = rvt_div_round_up_mtu(qp, req->seg_len); + + /* + * We are at a sync point if we run out of KDETH PSN space. + * Last PSN of every generation is reserved for RESYNC. + */ + if (qpriv->flow_state.psn + npkts > MAX_TID_FLOW_PSN - 1) { + qpriv->sync_pt = true; + break; + } + + /* + * If overtaking req->acked_tail, send an RNR NAK. Because the + * QP is not queued in this case, and the issue can only be + * caused due a delay in scheduling the second leg which we + * cannot estimate, we use a rather arbitrary RNR timeout of + * (MAX_FLOWS / 2) segments + */ + if (!CIRC_SPACE(req->setup_head, req->acked_tail, + MAX_FLOWS)) { + ret = -EAGAIN; + to_seg = MAX_FLOWS >> 1; + qpriv->s_flags |= RVT_S_ACK_PENDING; + break; + } + + /* Try to allocate rcv array / TID entries */ + ret = hfi1_kern_exp_rcv_setup(req, &req->ss, &last); + if (ret == -EAGAIN) + to_seg = position_in_queue(qpriv, &rcd->rarr_queue); + if (ret) + break; + + qpriv->alloc_w_segs++; + req->alloc_seg++; + continue; +next_req: + /* Begin processing the next request */ + if (++qpriv->r_tid_alloc > + rvt_size_atomic(ib_to_rvt(qp->ibqp.device))) + qpriv->r_tid_alloc = 0; + } + + /* + * Schedule an RNR NAK to be sent if (a) flow or rcv array allocation + * has failed (b) we are called from the rcv handler interrupt context + * (c) an RNR NAK has not already been scheduled + */ + if (ret == -EAGAIN && intr_ctx && !qp->r_nak_state) + goto send_rnr_nak; + + return; + +send_rnr_nak: + lockdep_assert_held(&qp->r_lock); + + /* Set r_nak_state to prevent unrelated events from generating NAK's */ + qp->r_nak_state = hfi1_compute_tid_rnr_timeout(qp, to_seg) | IB_RNR_NAK; + + /* Pull back r_psn to the segment being RNR NAK'd */ + qp->r_psn = e->psn + req->alloc_seg; + qp->r_ack_psn = qp->r_psn; + /* + * Pull back r_head_ack_queue to the ack entry following the request + * being RNR NAK'd. This allows resources to be allocated to the request + * if the queued QP is scheduled. + */ + qp->r_head_ack_queue = qpriv->r_tid_alloc + 1; + if (qp->r_head_ack_queue > rvt_size_atomic(ib_to_rvt(qp->ibqp.device))) + qp->r_head_ack_queue = 0; + qpriv->r_tid_head = qp->r_head_ack_queue; + /* + * These send side fields are used in make_rc_ack(). They are set in + * hfi1_send_rc_ack() but must be set here before dropping qp->s_lock + * for consistency + */ + qp->s_nak_state = qp->r_nak_state; + qp->s_ack_psn = qp->r_ack_psn; + /* + * Clear the ACK PENDING flag to prevent unwanted ACK because we + * have modified qp->s_ack_psn here. + */ + qp->s_flags &= ~(RVT_S_ACK_PENDING); + + /* + * qpriv->rnr_nak_state is used to determine when the scheduled RNR NAK + * has actually been sent. qp->s_flags RVT_S_ACK_PENDING bit cannot be + * used for this because qp->s_lock is dropped before calling + * hfi1_send_rc_ack() leading to inconsistency between the receive + * interrupt handlers and the send thread in make_rc_ack() + */ + qpriv->rnr_nak_state = TID_RNR_NAK_SEND; + + /* + * Schedule RNR NAK to be sent. RNR NAK's are scheduled from the receive + * interrupt handlers but will be sent from the send engine behind any + * previous responses that may have been scheduled + */ + rc_defered_ack(rcd, qp); +} + +void hfi1_rc_rcv_tid_rdma_write_req(struct hfi1_packet *packet) +{ + /* HANDLER FOR TID RDMA WRITE REQUEST packet (Responder side)*/ + + /* + * 1. Verify TID RDMA WRITE REQ as per IB_OPCODE_RC_RDMA_WRITE_FIRST + * (see hfi1_rc_rcv()) + * - Don't allow 0-length requests. + * 2. Put TID RDMA WRITE REQ into the response queueu (s_ack_queue) + * - Setup struct tid_rdma_req with request info + * - Prepare struct tid_rdma_flow array? + * 3. Set the qp->s_ack_state as state diagram in design doc. + * 4. Set RVT_S_RESP_PENDING in s_flags. + * 5. Kick the send engine (hfi1_schedule_send()) + */ + struct hfi1_ctxtdata *rcd = packet->rcd; + struct rvt_qp *qp = packet->qp; + struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num); + struct ib_other_headers *ohdr = packet->ohdr; + struct rvt_ack_entry *e; + unsigned long flags; + struct ib_reth *reth; + struct hfi1_qp_priv *qpriv = qp->priv; + struct tid_rdma_request *req; + u32 bth0, psn, len, rkey, num_segs; + bool is_fecn; + u8 next; + u64 vaddr; + int diff; + + bth0 = be32_to_cpu(ohdr->bth[0]); + if (hfi1_ruc_check_hdr(ibp, packet)) + return; + + is_fecn = process_ecn(qp, packet); + psn = mask_psn(be32_to_cpu(ohdr->bth[2])); + + if (qp->state == IB_QPS_RTR && !(qp->r_flags & RVT_R_COMM_EST)) + rvt_comm_est(qp); + + if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE))) + goto nack_inv; + + reth = &ohdr->u.tid_rdma.w_req.reth; + vaddr = be64_to_cpu(reth->vaddr); + len = be32_to_cpu(reth->length); + + num_segs = DIV_ROUND_UP(len, qpriv->tid_rdma.local.max_len); + diff = delta_psn(psn, qp->r_psn); + if (unlikely(diff)) { + if (tid_rdma_rcv_error(packet, ohdr, qp, psn, diff)) + return; + goto send_ack; + } + + /* + * The resent request which was previously RNR NAK'd is inserted at the + * location of the original request, which is one entry behind + * r_head_ack_queue + */ + if (qpriv->rnr_nak_state) + qp->r_head_ack_queue = qp->r_head_ack_queue ? + qp->r_head_ack_queue - 1 : + rvt_size_atomic(ib_to_rvt(qp->ibqp.device)); + + /* We've verified the request, insert it into the ack queue. */ + next = qp->r_head_ack_queue + 1; + if (next > rvt_size_atomic(ib_to_rvt(qp->ibqp.device))) + next = 0; + spin_lock_irqsave(&qp->s_lock, flags); + if (unlikely(next == qp->s_acked_ack_queue)) { + if (!qp->s_ack_queue[next].sent) + goto nack_inv_unlock; + update_ack_queue(qp, next); + } + e = &qp->s_ack_queue[qp->r_head_ack_queue]; + req = ack_to_tid_req(e); + + /* Bring previously RNR NAK'd request back to life */ + if (qpriv->rnr_nak_state) { + qp->r_nak_state = 0; + qp->s_nak_state = 0; + qpriv->rnr_nak_state = TID_RNR_NAK_INIT; + qp->r_psn = e->lpsn + 1; + req->state = TID_REQUEST_INIT; + goto update_head; + } + + if (e->rdma_sge.mr) { + rvt_put_mr(e->rdma_sge.mr); + e->rdma_sge.mr = NULL; + } + + /* The length needs to be in multiples of PAGE_SIZE */ + if (!len || len & ~PAGE_MASK) + goto nack_inv_unlock; + + rkey = be32_to_cpu(reth->rkey); + qp->r_len = len; + + if (e->opcode == TID_OP(WRITE_REQ) && + (req->setup_head != req->clear_tail || + req->clear_tail != req->acked_tail)) + goto nack_inv_unlock; + + if (unlikely(!rvt_rkey_ok(qp, &e->rdma_sge, qp->r_len, vaddr, + rkey, IB_ACCESS_REMOTE_WRITE))) + goto nack_acc; + + qp->r_psn += num_segs - 1; + + e->opcode = (bth0 >> 24) & 0xff; + e->psn = psn; + e->lpsn = qp->r_psn; + e->sent = 0; + + req->n_flows = min_t(u16, num_segs, qpriv->tid_rdma.local.max_write); + req->state = TID_REQUEST_INIT; + req->cur_seg = 0; + req->comp_seg = 0; + req->ack_seg = 0; + req->alloc_seg = 0; + req->isge = 0; + req->seg_len = qpriv->tid_rdma.local.max_len; + req->total_len = len; + req->total_segs = num_segs; + req->r_flow_psn = e->psn; + req->ss.sge = e->rdma_sge; + req->ss.num_sge = 1; + + req->flow_idx = req->setup_head; + req->clear_tail = req->setup_head; + req->acked_tail = req->setup_head; + + qp->r_state = e->opcode; + qp->r_nak_state = 0; + /* + * We need to increment the MSN here instead of when we + * finish sending the result since a duplicate request would + * increment it more than once. + */ + qp->r_msn++; + qp->r_psn++; + + if (qpriv->r_tid_tail == HFI1_QP_WQE_INVALID) { + qpriv->r_tid_tail = qp->r_head_ack_queue; + } else if (qpriv->r_tid_tail == qpriv->r_tid_head) { + struct tid_rdma_request *ptr; + + e = &qp->s_ack_queue[qpriv->r_tid_tail]; + ptr = ack_to_tid_req(e); + + if (e->opcode != TID_OP(WRITE_REQ) || + ptr->comp_seg == ptr->total_segs) { + if (qpriv->r_tid_tail == qpriv->r_tid_ack) + qpriv->r_tid_ack = qp->r_head_ack_queue; + qpriv->r_tid_tail = qp->r_head_ack_queue; + } + } +update_head: + qp->r_head_ack_queue = next; + qpriv->r_tid_head = qp->r_head_ack_queue; + + hfi1_tid_write_alloc_resources(qp, true); + + /* Schedule the send tasklet. */ + qp->s_flags |= RVT_S_RESP_PENDING; + hfi1_schedule_send(qp); + + spin_unlock_irqrestore(&qp->s_lock, flags); + if (is_fecn) + goto send_ack; + return; + +nack_inv_unlock: + spin_unlock_irqrestore(&qp->s_lock, flags); +nack_inv: + rvt_rc_error(qp, IB_WC_LOC_QP_OP_ERR); + qp->r_nak_state = IB_NAK_INVALID_REQUEST; + qp->r_ack_psn = qp->r_psn; + /* Queue NAK for later */ + rc_defered_ack(rcd, qp); + return; +nack_acc: + spin_unlock_irqrestore(&qp->s_lock, flags); + rvt_rc_error(qp, IB_WC_LOC_PROT_ERR); + qp->r_nak_state = IB_NAK_REMOTE_ACCESS_ERROR; + qp->r_ack_psn = qp->r_psn; +send_ack: + hfi1_send_rc_ack(packet, is_fecn); +} diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.h b/drivers/infiniband/hw/hfi1/tid_rdma.h index 9b952351f072..7780a28db316 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.h +++ b/drivers/infiniband/hw/hfi1/tid_rdma.h @@ -26,7 +26,9 @@ * * HFI1_S_TID_WAIT_INTERLCK - QP is waiting for requester interlock */ +/* BIT(4) reserved for RVT_S_ACK_PENDING. */ #define HFI1_S_TID_WAIT_INTERLCK BIT(5) +#define HFI1_R_TID_SW_PSN BIT(19) /* * Unlike regular IB RDMA VERBS, which do not require an entry @@ -89,10 +91,12 @@ struct tid_rdma_request { } e; struct tid_rdma_flow *flows; /* array of tid flows */ + struct rvt_sge_state ss; /* SGE state for TID RDMA requests */ u16 n_flows; /* size of the flow buffer window */ u16 setup_head; /* flow index we are setting up */ u16 clear_tail; /* flow index we are clearing */ u16 flow_idx; /* flow index most recently set up */ + u16 acked_tail; u32 seg_len; u32 total_len; @@ -103,6 +107,7 @@ struct tid_rdma_request { u32 cur_seg; /* index of current segment */ u32 comp_seg; /* index of last completed segment */ u32 ack_seg; /* index of last ack'ed segment */ + u32 alloc_seg; /* index of next segment to be allocated */ u32 isge; /* index of "current" sge */ u32 ack_pending; /* num acks pending for this request */ @@ -174,6 +179,12 @@ struct tid_rdma_flow { u32 tid_entry[TID_RDMA_MAX_PAGES]; }; +enum tid_rnr_nak_state { + TID_RNR_NAK_INIT = 0, + TID_RNR_NAK_SEND, + TID_RNR_NAK_SENT, +}; + bool tid_rdma_conn_req(struct rvt_qp *qp, u64 *data); bool tid_rdma_conn_reply(struct rvt_qp *qp, u64 data); bool tid_rdma_conn_resp(struct rvt_qp *qp, u64 *data); @@ -247,4 +258,9 @@ static inline void hfi1_setup_tid_rdma_wqe(struct rvt_qp *qp, u32 hfi1_build_tid_rdma_write_req(struct rvt_qp *qp, struct rvt_swqe *wqe, struct ib_other_headers *ohdr, u32 *bth1, u32 *bth2, u32 *len); + +void hfi1_compute_tid_rdma_flow_wt(void); + +void hfi1_rc_rcv_tid_rdma_write_req(struct hfi1_packet *packet); + #endif /* HFI1_TID_RDMA_H */ diff --git a/drivers/infiniband/hw/hfi1/verbs.h b/drivers/infiniband/hw/hfi1/verbs.h index 841727a684d5..9ced8a4a7b76 100644 --- a/drivers/infiniband/hw/hfi1/verbs.h +++ b/drivers/infiniband/hw/hfi1/verbs.h @@ -172,7 +172,15 @@ struct hfi1_qp_priv { unsigned long tid_timer_timeout_jiffies; /* variables for the TID RDMA SE state machine */ + u8 rnr_nak_state; /* RNR NAK state */ u32 s_flags; + u32 r_tid_head; /* Most recently added TID RDMA request */ + u32 r_tid_tail; /* the last completed TID RDMA request */ + u32 r_tid_ack; /* the TID RDMA request to be ACK'ed */ + u32 r_tid_alloc; /* Request for which we are allocating resources */ + u32 pending_tid_w_segs; /* Num of pending tid write segments */ + u32 alloc_w_segs; /* Number of segments for which write */ + /* resources have been allocated for this QP */ /* For TID RDMA READ */ u32 tid_r_reqs; /* Num of tid reads requested */ @@ -180,8 +188,12 @@ struct hfi1_qp_priv { u32 pending_tid_r_segs; /* Num of pending tid read segments */ u16 pkts_ps; /* packets per segment */ u8 timeout_shift; /* account for number of packets per segment */ + + u8 sync_pt; /* Set when QP reaches sync point */ }; +#define HFI1_QP_WQE_INVALID ((u32)-1) + struct hfi1_swqe_priv { struct tid_rdma_request tid_req; struct rvt_sge_state ss; /* Used for TID RDMA READ Request */ -- cgit v1.2.3-59-g8ed1b From 3c6cb20a0d17d7a75778fb0935d6fa427c8177af Mon Sep 17 00:00:00 2001 From: Kaike Wan Date: Wed, 23 Jan 2019 21:51:39 -0800 Subject: IB/hfi1: Add TID RDMA WRITE functionality into RDMA verbs This patch integrates TID RDMA WRITE protocol into normal RDMA verbs framework. The TID RDMA WRITE protocol is an end-to-end protocol between the hfi1 drivers on two OPA nodes that converts a qualified RDMA WRITE request into a TID RDMA WRITE request to avoid data copying on the responder side. Reviewed-by: Mike Marciniszyn Signed-off-by: Mitko Haralanov Signed-off-by: Kaike Wan Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/qp.c | 7 + drivers/infiniband/hw/hfi1/rc.c | 487 +++++++++++++++++++++++++++++---- drivers/infiniband/hw/hfi1/tid_rdma.c | 14 + drivers/infiniband/hw/hfi1/user_sdma.c | 3 +- drivers/infiniband/hw/hfi1/verbs.c | 17 +- drivers/infiniband/hw/hfi1/verbs.h | 1 + include/rdma/rdmavt_qp.h | 1 + 7 files changed, 481 insertions(+), 49 deletions(-) (limited to 'drivers/infiniband/hw/hfi1/rc.c') diff --git a/drivers/infiniband/hw/hfi1/qp.c b/drivers/infiniband/hw/hfi1/qp.c index 96632c77f36f..cfd598e4b303 100644 --- a/drivers/infiniband/hw/hfi1/qp.c +++ b/drivers/infiniband/hw/hfi1/qp.c @@ -138,6 +138,12 @@ const struct rvt_operation_params hfi1_post_parms[RVT_OPERATION_MAX] = { .flags = RVT_OPERATION_USE_RESERVE, }, +[IB_WR_TID_RDMA_WRITE] = { + .length = sizeof(struct ib_rdma_wr), + .qpt_support = BIT(IB_QPT_RC), + .flags = RVT_OPERATION_IGN_RNR_CNT, +}, + }; static void flush_list_head(struct list_head *l) @@ -780,6 +786,7 @@ void quiesce_qp(struct rvt_qp *qp) struct hfi1_qp_priv *priv = qp->priv; hfi1_del_tid_reap_timer(qp); + hfi1_del_tid_retry_timer(qp); iowait_sdma_drain(&priv->s_iowait); qp_pio_drain(qp); flush_tx_list(qp); diff --git a/drivers/infiniband/hw/hfi1/rc.c b/drivers/infiniband/hw/hfi1/rc.c index fcb733ea8dfb..6d2abea896e5 100644 --- a/drivers/infiniband/hw/hfi1/rc.c +++ b/drivers/infiniband/hw/hfi1/rc.c @@ -111,16 +111,17 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp, struct hfi1_pkt_state *ps) { struct rvt_ack_entry *e; - u32 hwords; + u32 hwords, hdrlen; u32 len = 0; u32 bth0 = 0, bth2 = 0; u32 bth1 = qp->remote_qpn | (HFI1_CAP_IS_KSET(OPFN) << IB_BTHE_E_SHIFT); int middle = 0; u32 pmtu = qp->pmtu; - struct hfi1_qp_priv *priv = qp->priv; + struct hfi1_qp_priv *qpriv = qp->priv; bool last_pkt; u32 delta; u8 next = qp->s_tail_ack_queue; + struct tid_rdma_request *req; trace_hfi1_rsp_make_rc_ack(qp, 0); lockdep_assert_held(&qp->s_lock); @@ -128,7 +129,7 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp, if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) goto bail; - if (priv->hdr_type == HFI1_PKT_TYPE_9B) + if (qpriv->hdr_type == HFI1_PKT_TYPE_9B) /* header size in 32-bit words LRH+BTH = (8+12)/4. */ hwords = 5; else @@ -206,6 +207,21 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp, hwords++; qp->s_ack_rdma_psn = e->psn; bth2 = mask_psn(qp->s_ack_rdma_psn++); + } else if (e->opcode == TID_OP(WRITE_REQ)) { + /* + * If a TID RDMA WRITE RESP is being resent, we have to + * wait for the actual request. All requests that are to + * be resent will have their state set to + * TID_REQUEST_RESEND. When the new request arrives, the + * state will be changed to TID_REQUEST_RESEND_ACTIVE. + */ + req = ack_to_tid_req(e); + if (req->state == TID_REQUEST_RESEND || + req->state == TID_REQUEST_INIT_RESEND) + goto bail; + qp->s_ack_state = TID_OP(WRITE_RESP); + qp->s_ack_rdma_psn = mask_psn(e->psn + req->cur_seg); + goto write_resp; } else if (e->opcode == TID_OP(READ_REQ)) { /* * If a TID RDMA read response is being resent and @@ -267,6 +283,59 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp, bth2 = mask_psn(qp->s_ack_rdma_psn++); break; + case TID_OP(WRITE_RESP): +write_resp: + /* + * 1. Check if RVT_S_ACK_PENDING is set. If yes, + * goto normal. + * 2. Attempt to allocate TID resources. + * 3. Remove RVT_S_RESP_PENDING flags from s_flags + * 4. If resources not available: + * 4.1 Set RVT_S_WAIT_TID_SPACE + * 4.2 Queue QP on RCD TID queue + * 4.3 Put QP on iowait list. + * 4.4 Build IB RNR NAK with appropriate timeout value + * 4.5 Return indication progress made. + * 5. If resources are available: + * 5.1 Program HW flow CSRs + * 5.2 Build TID RDMA WRITE RESP packet + * 5.3 If more resources needed, do 2.1 - 2.3. + * 5.4 Wake up next QP on RCD TID queue. + * 5.5 Return indication progress made. + */ + + e = &qp->s_ack_queue[qp->s_tail_ack_queue]; + req = ack_to_tid_req(e); + + /* + * Send scheduled RNR NAK's. RNR NAK's need to be sent at + * segment boundaries, not at request boundaries. Don't change + * s_ack_state because we are still in the middle of a request + */ + if (qpriv->rnr_nak_state == TID_RNR_NAK_SEND && + qp->s_tail_ack_queue == qpriv->r_tid_alloc && + req->cur_seg == req->alloc_seg) { + qpriv->rnr_nak_state = TID_RNR_NAK_SENT; + goto normal_no_state; + } + + bth2 = mask_psn(qp->s_ack_rdma_psn); + hdrlen = hfi1_build_tid_rdma_write_resp(qp, e, ohdr, &bth1, + bth2, &len, + &ps->s_txreq->ss); + if (!hdrlen) + return 0; + + hwords += hdrlen; + bth0 = qp->s_ack_state << 24; + qp->s_ack_rdma_psn++; + if (req->cur_seg != req->total_segs) + break; + + e->sent = 1; + qp->s_ack_state = OP(RDMA_READ_RESPONSE_LAST); + break; + case TID_OP(READ_RESP): read_resp: e = &qp->s_ack_queue[qp->s_tail_ack_queue]; @@ -298,8 +367,7 @@ normal: * (see above). */ qp->s_ack_state = OP(SEND_ONLY); - qp->s_flags &= ~RVT_S_ACK_PENDING; - ps->s_txreq->ss = NULL; +normal_no_state: if (qp->s_nak_state) ohdr->u.aeth = cpu_to_be32((qp->r_msn & IB_MSN_MASK) | @@ -311,9 +379,11 @@ normal: len = 0; bth0 = OP(ACKNOWLEDGE) << 24; bth2 = mask_psn(qp->s_ack_psn); + qp->s_flags &= ~RVT_S_ACK_PENDING; + ps->s_txreq->ss = NULL; } qp->s_rdma_ack_cnt++; - ps->s_txreq->sde = priv->s_sde; + ps->s_txreq->sde = qpriv->s_sde; ps->s_txreq->s_cur_size = len; ps->s_txreq->hdr_dwords = hwords; hfi1_make_ruc_header(qp, ohdr, bth0, bth1, bth2, middle, ps); @@ -366,6 +436,7 @@ int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) int middle = 0; int delta; struct tid_rdma_flow *flow = NULL; + struct tid_rdma_params *remote; trace_hfi1_sender_make_rc_req(qp); lockdep_assert_held(&qp->s_lock); @@ -414,7 +485,7 @@ int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) goto done_free_tx; } - if (qp->s_flags & (RVT_S_WAIT_RNR | RVT_S_WAIT_ACK)) + if (qp->s_flags & (RVT_S_WAIT_RNR | RVT_S_WAIT_ACK | HFI1_S_WAIT_HALT)) goto bail; if (cmp_psn(qp->s_psn, qp->s_sending_hpsn) <= 0) { @@ -586,6 +657,108 @@ no_flow_control: qp->s_cur = 0; break; + case IB_WR_TID_RDMA_WRITE: + if (newreq) { + /* + * Limit the number of TID RDMA WRITE requests. + */ + if (atomic_read(&priv->n_tid_requests) >= + HFI1_TID_RDMA_WRITE_CNT) + goto bail; + + if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT)) + qp->s_lsn++; + } + + hwords += hfi1_build_tid_rdma_write_req(qp, wqe, ohdr, + &bth1, &bth2, + &len); + ss = NULL; + if (priv->s_tid_cur == HFI1_QP_WQE_INVALID) { + priv->s_tid_cur = qp->s_cur; + if (priv->s_tid_tail == HFI1_QP_WQE_INVALID) { + priv->s_tid_tail = qp->s_cur; + priv->s_state = TID_OP(WRITE_RESP); + } + } else if (priv->s_tid_cur == priv->s_tid_head) { + struct rvt_swqe *__w; + struct tid_rdma_request *__r; + + __w = rvt_get_swqe_ptr(qp, priv->s_tid_cur); + __r = wqe_to_tid_req(__w); + + /* + * The s_tid_cur pointer is advanced to s_cur if + * any of the following conditions about the WQE + * to which s_ti_cur currently points to are + * satisfied: + * 1. The request is not a TID RDMA WRITE + * request, + * 2. The request is in the INACTIVE or + * COMPLETE states (TID RDMA READ requests + * stay at INACTIVE and TID RDMA WRITE + * transition to COMPLETE when done), + * 3. The request is in the ACTIVE or SYNC + * state and the number of completed + * segments is equal to the total segment + * count. + * (If ACTIVE, the request is waiting for + * ACKs. If SYNC, the request has not + * received any responses because it's + * waiting on a sync point.) + */ + if (__w->wr.opcode != IB_WR_TID_RDMA_WRITE || + __r->state == TID_REQUEST_INACTIVE || + __r->state == TID_REQUEST_COMPLETE || + ((__r->state == TID_REQUEST_ACTIVE || + __r->state == TID_REQUEST_SYNC) && + __r->comp_seg == __r->total_segs)) { + if (priv->s_tid_tail == + priv->s_tid_cur && + priv->s_state == + TID_OP(WRITE_DATA_LAST)) { + priv->s_tid_tail = qp->s_cur; + priv->s_state = + TID_OP(WRITE_RESP); + } + priv->s_tid_cur = qp->s_cur; + } + /* + * A corner case: when the last TID RDMA WRITE + * request was completed, s_tid_head, + * s_tid_cur, and s_tid_tail all point to the + * same location. Other requests are posted and + * s_cur wraps around to the same location, + * where a new TID RDMA WRITE is posted. In + * this case, none of the indices need to be + * updated. However, the priv->s_state should. + */ + if (priv->s_tid_tail == qp->s_cur && + priv->s_state == TID_OP(WRITE_DATA_LAST)) + priv->s_state = TID_OP(WRITE_RESP); + } + req = wqe_to_tid_req(wqe); + if (newreq) { + priv->s_tid_head = qp->s_cur; + priv->pending_tid_w_resp += req->total_segs; + atomic_inc(&priv->n_tid_requests); + atomic_dec(&priv->n_requests); + } else { + req->state = TID_REQUEST_RESEND; + req->comp_seg = delta_psn(bth2, wqe->psn); + /* + * Pull back any segments since we are going + * to re-receive them. + */ + req->setup_head = req->clear_tail; + priv->pending_tid_w_resp += + delta_psn(wqe->lpsn, bth2) + 1; + } + + if (++qp->s_cur == qp->s_size) + qp->s_cur = 0; + break; + case IB_WR_RDMA_READ: /* * Don't allow more operations to be started @@ -745,7 +918,8 @@ no_flow_control: if (qp->s_tail >= qp->s_size) qp->s_tail = 0; } - if (wqe->wr.opcode == IB_WR_RDMA_READ) + if (wqe->wr.opcode == IB_WR_RDMA_READ || + wqe->wr.opcode == IB_WR_TID_RDMA_WRITE) qp->s_psn = wqe->lpsn + 1; else if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) qp->s_psn = req->s_next_psn; @@ -865,6 +1039,33 @@ no_flow_control: if (qp->s_cur == qp->s_size) qp->s_cur = 0; break; + + case TID_OP(WRITE_RESP): + /* + * This value for s_state is used for restarting a TID RDMA + * WRITE request. See comment in OP(RDMA_READ_RESPONSE_MIDDLE + * for more). + */ + req = wqe_to_tid_req(wqe); + req->state = TID_REQUEST_RESEND; + rcu_read_lock(); + remote = rcu_dereference(priv->tid_rdma.remote); + req->comp_seg = delta_psn(qp->s_psn, wqe->psn); + len = wqe->length - (req->comp_seg * remote->max_len); + rcu_read_unlock(); + + bth2 = mask_psn(qp->s_psn); + hwords += hfi1_build_tid_rdma_write_req(qp, wqe, ohdr, &bth1, + &bth2, &len); + qp->s_psn = wqe->lpsn + 1; + ss = NULL; + qp->s_state = TID_OP(WRITE_REQ); + priv->pending_tid_w_resp += delta_psn(wqe->lpsn, bth2) + 1; + priv->s_tid_cur = qp->s_cur; + if (++qp->s_cur == qp->s_size) + qp->s_cur = 0; + break; + case TID_OP(READ_RESP): if (wqe->wr.opcode != IB_WR_TID_RDMA_READ) goto bail; @@ -965,7 +1166,8 @@ no_flow_control: } qp->s_sending_hpsn = bth2; delta = delta_psn(bth2, wqe->psn); - if (delta && delta % HFI1_PSN_CREDIT == 0) + if (delta && delta % HFI1_PSN_CREDIT == 0 && + wqe->wr.opcode != IB_WR_TID_RDMA_WRITE) bth2 |= IB_BTH_REQ_ACK; if (qp->s_flags & RVT_S_SEND_ONE) { qp->s_flags &= ~RVT_S_SEND_ONE; @@ -998,6 +1200,12 @@ bail: bail_no_tx: ps->s_txreq = NULL; qp->s_flags &= ~RVT_S_BUSY; + /* + * If we didn't get a txreq, the QP will be woken up later to try + * again. Set the flags to indicate which work item to wake + * up. + */ + iowait_set_flag(&priv->s_iowait, IOWAIT_PENDING_IB); return 0; } @@ -1285,6 +1493,7 @@ static void reset_psn(struct rvt_qp *qp, u32 psn) lockdep_assert_held(&qp->s_lock); qp->s_cur = n; priv->pending_tid_r_segs = 0; + priv->pending_tid_w_resp = 0; qp->s_num_rd_atomic = 0; /* @@ -1342,6 +1551,10 @@ static void reset_psn(struct rvt_qp *qp, u32 psn) qp->s_state = OP(RDMA_READ_RESPONSE_LAST); break; + case IB_WR_TID_RDMA_WRITE: + qp->s_state = TID_OP(WRITE_RESP); + break; + case IB_WR_RDMA_READ: qp->s_state = OP(RDMA_READ_RESPONSE_MIDDLE); break; @@ -1435,7 +1648,7 @@ void hfi1_restart_rc(struct rvt_qp *qp, u32 psn, int wait) qp->s_flags &= ~(RVT_S_WAIT_FENCE | RVT_S_WAIT_RDMAR | RVT_S_WAIT_SSN_CREDIT | RVT_S_WAIT_PSN | - RVT_S_WAIT_ACK); + RVT_S_WAIT_ACK | HFI1_S_WAIT_TID_RESP); if (wait) qp->s_flags |= RVT_S_SEND_ONE; reset_psn(qp, psn); @@ -1443,7 +1656,8 @@ void hfi1_restart_rc(struct rvt_qp *qp, u32 psn, int wait) /* * Set qp->s_sending_psn to the next PSN after the given one. - * This would be psn+1 except when RDMA reads are present. + * This would be psn+1 except when RDMA reads or TID RDMA ops + * are present. */ static void reset_sending_psn(struct rvt_qp *qp, u32 psn) { @@ -1456,7 +1670,8 @@ static void reset_sending_psn(struct rvt_qp *qp, u32 psn) wqe = rvt_get_swqe_ptr(qp, n); if (cmp_psn(psn, wqe->lpsn) <= 0) { if (wqe->wr.opcode == IB_WR_RDMA_READ || - wqe->wr.opcode == IB_WR_TID_RDMA_READ) + wqe->wr.opcode == IB_WR_TID_RDMA_READ || + wqe->wr.opcode == IB_WR_TID_RDMA_WRITE) qp->s_sending_psn = wqe->lpsn + 1; else qp->s_sending_psn = psn + 1; @@ -1479,8 +1694,9 @@ void hfi1_rc_send_complete(struct rvt_qp *qp, struct hfi1_opa_header *opah) struct rvt_swqe *wqe; struct ib_header *hdr = NULL; struct hfi1_16b_header *hdr_16b = NULL; - u32 opcode; + u32 opcode, head, tail; u32 psn; + struct tid_rdma_request *req; lockdep_assert_held(&qp->s_lock); if (!(ib_rvt_state_ops[qp->state] & RVT_SEND_OR_FLUSH_OR_RECV_OK)) @@ -1507,29 +1723,84 @@ void hfi1_rc_send_complete(struct rvt_qp *qp, struct hfi1_opa_header *opah) opcode = ib_bth_get_opcode(ohdr); if ((opcode >= OP(RDMA_READ_RESPONSE_FIRST) && opcode <= OP(ATOMIC_ACKNOWLEDGE)) || - opcode == TID_OP(READ_RESP)) { + opcode == TID_OP(READ_RESP) || + opcode == TID_OP(WRITE_RESP)) { WARN_ON(!qp->s_rdma_ack_cnt); qp->s_rdma_ack_cnt--; return; } psn = ib_bth_get_psn(ohdr); - reset_sending_psn(qp, psn); + /* + * Don't attempt to reset the sending PSN for packets in the + * KDETH PSN space since the PSN does not match anything. + */ + if (opcode != TID_OP(WRITE_DATA) && + opcode != TID_OP(WRITE_DATA_LAST) && + opcode != TID_OP(ACK) && opcode != TID_OP(RESYNC)) + reset_sending_psn(qp, psn); + + /* Handle TID RDMA WRITE packets differently */ + if (opcode >= TID_OP(WRITE_REQ) && + opcode <= TID_OP(WRITE_DATA_LAST)) { + head = priv->s_tid_head; + tail = priv->s_tid_cur; + /* + * s_tid_cur is set to s_tid_head in the case, where + * a new TID RDMA request is being started and all + * previous ones have been completed. + * Therefore, we need to do a secondary check in order + * to properly determine whether we should start the + * RC timer. + */ + wqe = rvt_get_swqe_ptr(qp, tail); + req = wqe_to_tid_req(wqe); + if (head == tail && req->comp_seg < req->total_segs) { + if (tail == 0) + tail = qp->s_size - 1; + else + tail -= 1; + } + } else { + head = qp->s_tail; + tail = qp->s_acked; + } /* * Start timer after a packet requesting an ACK has been sent and * there are still requests that haven't been acked. */ - if ((psn & IB_BTH_REQ_ACK) && qp->s_acked != qp->s_tail && + if ((psn & IB_BTH_REQ_ACK) && tail != head && + opcode != TID_OP(WRITE_DATA) && opcode != TID_OP(WRITE_DATA_LAST) && + opcode != TID_OP(RESYNC) && !(qp->s_flags & - (RVT_S_TIMER | RVT_S_WAIT_RNR | RVT_S_WAIT_PSN)) && - (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) { + (RVT_S_TIMER | RVT_S_WAIT_RNR | RVT_S_WAIT_PSN)) && + (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) { if (opcode == TID_OP(READ_REQ)) rvt_add_retry_timer_ext(qp, priv->timeout_shift); else rvt_add_retry_timer(qp); } + /* Start TID RDMA ACK timer */ + if ((opcode == TID_OP(WRITE_DATA) || + opcode == TID_OP(WRITE_DATA_LAST) || + opcode == TID_OP(RESYNC)) && + (psn & IB_BTH_REQ_ACK) && + !(priv->s_flags & HFI1_S_TID_RETRY_TIMER) && + (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) { + /* + * The TID RDMA ACK packet could be received before this + * function is called. Therefore, add the timer only if TID + * RDMA ACK packets are actually pending. + */ + wqe = rvt_get_swqe_ptr(qp, qp->s_acked); + req = wqe_to_tid_req(wqe); + if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE && + req->ack_seg < req->cur_seg) + hfi1_add_tid_retry_timer(qp); + } + while (qp->s_last != qp->s_acked) { u32 s_last; @@ -1628,7 +1899,16 @@ struct rvt_swqe *do_rc_completion(struct rvt_qp *qp, } qp->s_retry = qp->s_retry_cnt; - update_last_psn(qp, wqe->lpsn); + /* + * Don't update the last PSN if the request being completed is + * a TID RDMA WRITE request. + * Completion of the TID RDMA WRITE requests are done by the + * TID RDMA ACKs and as such could be for a request that has + * already been ACKed as far as the IB state machine is + * concerned. + */ + if (wqe->wr.opcode != IB_WR_TID_RDMA_WRITE) + update_last_psn(qp, wqe->lpsn); /* * If we are completing a request which is in the process of @@ -1658,6 +1938,54 @@ struct rvt_swqe *do_rc_completion(struct rvt_qp *qp, return wqe; } +static void set_restart_qp(struct rvt_qp *qp, struct hfi1_ctxtdata *rcd) +{ + /* Retry this request. */ + if (!(qp->r_flags & RVT_R_RDMAR_SEQ)) { + qp->r_flags |= RVT_R_RDMAR_SEQ; + hfi1_restart_rc(qp, qp->s_last_psn + 1, 0); + if (list_empty(&qp->rspwait)) { + qp->r_flags |= RVT_R_RSP_SEND; + rvt_get_qp(qp); + list_add_tail(&qp->rspwait, &rcd->qp_wait_list); + } + } +} + +/** + * update_qp_retry_state - Update qp retry state. + * @qp: the QP + * @psn: the packet sequence number of the TID RDMA WRITE RESP. + * @spsn: The start psn for the given TID RDMA WRITE swqe. + * @lpsn: The last psn for the given TID RDMA WRITE swqe. + * + * This function is called to update the qp retry state upon + * receiving a TID WRITE RESP after the qp is scheduled to retry + * a request. + */ +static void update_qp_retry_state(struct rvt_qp *qp, u32 psn, u32 spsn, + u32 lpsn) +{ + struct hfi1_qp_priv *qpriv = qp->priv; + + qp->s_psn = psn + 1; + /* + * If this is the first TID RDMA WRITE RESP packet for the current + * request, change the s_state so that the retry will be processed + * correctly. Similarly, if this is the last TID RDMA WRITE RESP + * packet, change the s_state and advance the s_cur. + */ + if (cmp_psn(psn, lpsn) >= 0) { + qp->s_cur = qpriv->s_tid_cur + 1; + if (qp->s_cur >= qp->s_size) + qp->s_cur = 0; + qp->s_state = TID_OP(WRITE_REQ); + } else if (!cmp_psn(psn, spsn)) { + qp->s_cur = qpriv->s_tid_cur; + qp->s_state = TID_OP(WRITE_RESP); + } +} + /** * do_rc_ack - process an incoming RC ACK * @qp: the QP the ACK came in on @@ -1679,6 +2007,7 @@ int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode, int ret = 0; u32 ack_psn; int diff; + struct rvt_dev_info *rdi; lockdep_assert_held(&qp->s_lock); /* @@ -1725,18 +2054,10 @@ int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode, (opcode != TID_OP(READ_RESP) || diff != 0)) || ((wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP || wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) && - (opcode != OP(ATOMIC_ACKNOWLEDGE) || diff != 0))) { - /* Retry this request. */ - if (!(qp->r_flags & RVT_R_RDMAR_SEQ)) { - qp->r_flags |= RVT_R_RDMAR_SEQ; - hfi1_restart_rc(qp, qp->s_last_psn + 1, 0); - if (list_empty(&qp->rspwait)) { - qp->r_flags |= RVT_R_RSP_SEND; - rvt_get_qp(qp); - list_add_tail(&qp->rspwait, - &rcd->qp_wait_list); - } - } + (opcode != OP(ATOMIC_ACKNOWLEDGE) || diff != 0)) || + (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE && + (delta_psn(psn, qp->s_last_psn) != 1))) { + set_restart_qp(qp, rcd); /* * No need to process the ACK/NAK since we are * restarting an earlier request. @@ -1768,6 +2089,14 @@ int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode, hfi1_schedule_send(qp); } } + + /* + * TID RDMA WRITE requests will be completed by the TID RDMA + * ACK packet handler (see tid_rdma.c). + */ + if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE) + break; + wqe = do_rc_completion(qp, wqe, ibp); if (qp->s_acked == qp->s_tail) break; @@ -1785,17 +2114,60 @@ int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode, else rvt_stop_rc_timers(qp); } else if (qp->s_acked != qp->s_tail) { + struct rvt_swqe *__w = NULL; + + if (qpriv->s_tid_cur != HFI1_QP_WQE_INVALID) + __w = rvt_get_swqe_ptr(qp, qpriv->s_tid_cur); + /* - * We are expecting more ACKs so - * mod the retry timer. - */ - rvt_mod_retry_timer(qp); - /* - * We can stop re-sending the earlier packets and - * continue with the next packet the receiver wants. + * Stop timers if we've received all of the TID RDMA + * WRITE * responses. */ - if (cmp_psn(qp->s_psn, psn) <= 0) - reset_psn(qp, psn + 1); + if (__w && __w->wr.opcode == IB_WR_TID_RDMA_WRITE && + opcode == TID_OP(WRITE_RESP)) { + /* + * Normally, the loop above would correctly + * process all WQEs from s_acked onward and + * either complete them or check for correct + * PSN sequencing. + * However, for TID RDMA, due to pipelining, + * the response may not be for the request at + * s_acked so the above look would just be + * skipped. This does not allow for checking + * the PSN sequencing. It has to be done + * separately. + */ + if (cmp_psn(psn, qp->s_last_psn + 1)) { + set_restart_qp(qp, rcd); + goto bail_stop; + } + /* + * If the psn is being resent, stop the + * resending. + */ + if (qp->s_cur != qp->s_tail && + cmp_psn(qp->s_psn, psn) <= 0) + update_qp_retry_state(qp, psn, + __w->psn, + __w->lpsn); + else if (--qpriv->pending_tid_w_resp) + rvt_mod_retry_timer(qp); + else + rvt_stop_rc_timers(qp); + } else { + /* + * We are expecting more ACKs so + * mod the retry timer. + */ + rvt_mod_retry_timer(qp); + /* + * We can stop re-sending the earlier packets + * and continue with the next packet the + * receiver wants. + */ + if (cmp_psn(qp->s_psn, psn) <= 0) + reset_psn(qp, psn + 1); + } } else { /* No more acks - kill all timers */ rvt_stop_rc_timers(qp); @@ -1811,6 +2183,15 @@ int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode, rvt_get_credit(qp, aeth); qp->s_rnr_retry = qp->s_rnr_retry_cnt; qp->s_retry = qp->s_retry_cnt; + /* + * If the current request is a TID RDMA WRITE request and the + * response is not a TID RDMA WRITE RESP packet, s_last_psn + * can't be advanced. + */ + if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE && + opcode != TID_OP(WRITE_RESP) && + cmp_psn(psn, wqe->psn) >= 0) + return 1; update_last_psn(qp, psn); return 1; @@ -1820,20 +2201,31 @@ int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode, goto bail_stop; if (qp->s_flags & RVT_S_WAIT_RNR) goto bail_stop; - if (qp->s_rnr_retry == 0) { + rdi = ib_to_rvt(qp->ibqp.device); + if (qp->s_rnr_retry == 0 && + !((rdi->post_parms[wqe->wr.opcode].flags & + RVT_OPERATION_IGN_RNR_CNT) && + qp->s_rnr_retry_cnt == 0)) { status = IB_WC_RNR_RETRY_EXC_ERR; goto class_b; } - if (qp->s_rnr_retry_cnt < 7) + if (qp->s_rnr_retry_cnt < 7 && qp->s_rnr_retry_cnt > 0) qp->s_rnr_retry--; - /* The last valid PSN is the previous PSN. */ - update_last_psn(qp, psn - 1); + /* + * The last valid PSN is the previous PSN. For TID RDMA WRITE + * request, s_last_psn should be incremented only when a TID + * RDMA WRITE RESP is received to avoid skipping lost TID RDMA + * WRITE RESP packets. + */ + if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE) { + reset_psn(qp, qp->s_last_psn + 1); + } else { + update_last_psn(qp, psn - 1); + reset_psn(qp, psn); + } ibp->rvp.n_rc_resends += delta_psn(qp->s_psn, psn); - - reset_psn(qp, psn); - qp->s_flags &= ~(RVT_S_WAIT_SSN_CREDIT | RVT_S_WAIT_ACK); rvt_stop_rc_timers(qp); rvt_add_rnr_timer(qp, aeth); @@ -1918,6 +2310,7 @@ static void rdma_seq_err(struct rvt_qp *qp, struct hfi1_ibport *ibp, u32 psn, while (cmp_psn(psn, wqe->lpsn) > 0) { if (wqe->wr.opcode == IB_WR_RDMA_READ || wqe->wr.opcode == IB_WR_TID_RDMA_READ || + wqe->wr.opcode == IB_WR_TID_RDMA_WRITE || wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP || wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) break; diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.c b/drivers/infiniband/hw/hfi1/tid_rdma.c index 80111dd1d876..490e47a0f68b 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.c +++ b/drivers/infiniband/hw/hfi1/tid_rdma.c @@ -3205,6 +3205,20 @@ void hfi1_qp_kern_exp_rcv_clear_all(struct rvt_qp *qp) do { struct hfi1_swqe_priv *priv = wqe->priv; + ret = hfi1_kern_exp_rcv_clear(&priv->tid_req); + } while (!ret); + } + for (i = qp->s_acked_ack_queue; i != qp->r_head_ack_queue;) { + struct rvt_ack_entry *e = &qp->s_ack_queue[i]; + + if (++i == rvt_max_atomic(ib_to_rvt(qp->ibqp.device))) + i = 0; + /* Free only locally allocated TID entries */ + if (e->opcode != TID_OP(WRITE_REQ)) + continue; + do { + struct hfi1_ack_priv *priv = e->priv; + ret = hfi1_kern_exp_rcv_clear(&priv->tid_req); } while (!ret); } diff --git a/drivers/infiniband/hw/hfi1/user_sdma.c b/drivers/infiniband/hw/hfi1/user_sdma.c index e5e7fad09f32..6764114b886c 100644 --- a/drivers/infiniband/hw/hfi1/user_sdma.c +++ b/drivers/infiniband/hw/hfi1/user_sdma.c @@ -1126,7 +1126,8 @@ static inline u32 set_pkt_bth_psn(__be32 bthpsn, u8 expct, u32 frags) 0xffffffull), psn = val & mask; if (expct) - psn = (psn & ~BTH_SEQ_MASK) | ((psn + frags) & BTH_SEQ_MASK); + psn = (psn & ~HFI1_KDETH_BTH_SEQ_MASK) | + ((psn + frags) & HFI1_KDETH_BTH_SEQ_MASK); else psn = psn + frags; return psn & mask; diff --git a/drivers/infiniband/hw/hfi1/verbs.c b/drivers/infiniband/hw/hfi1/verbs.c index 7b87b77582bd..ab97d71cdd92 100644 --- a/drivers/infiniband/hw/hfi1/verbs.c +++ b/drivers/infiniband/hw/hfi1/verbs.c @@ -161,6 +161,7 @@ MODULE_PARM_DESC(wss_clean_period, "Count of verbs copies before an entry in the */ const enum ib_wc_opcode ib_hfi1_wc_opcode[] = { [IB_WR_RDMA_WRITE] = IB_WC_RDMA_WRITE, + [IB_WR_TID_RDMA_WRITE] = IB_WC_RDMA_WRITE, [IB_WR_RDMA_WRITE_WITH_IMM] = IB_WC_RDMA_WRITE, [IB_WR_SEND] = IB_WC_SEND, [IB_WR_SEND_WITH_IMM] = IB_WC_SEND, @@ -203,6 +204,12 @@ const u8 hdr_len_by_opcode[256] = { [IB_OPCODE_RC_SEND_ONLY_WITH_INVALIDATE] = 12 + 8 + 4, [IB_OPCODE_TID_RDMA_READ_REQ] = 12 + 8 + 36, [IB_OPCODE_TID_RDMA_READ_RESP] = 12 + 8 + 36, + [IB_OPCODE_TID_RDMA_WRITE_REQ] = 12 + 8 + 36, + [IB_OPCODE_TID_RDMA_WRITE_RESP] = 12 + 8 + 36, + [IB_OPCODE_TID_RDMA_WRITE_DATA] = 12 + 8 + 36, + [IB_OPCODE_TID_RDMA_WRITE_DATA_LAST] = 12 + 8 + 36, + [IB_OPCODE_TID_RDMA_ACK] = 12 + 8 + 36, + [IB_OPCODE_TID_RDMA_RESYNC] = 12 + 8 + 36, /* UC */ [IB_OPCODE_UC_SEND_FIRST] = 12 + 8, [IB_OPCODE_UC_SEND_MIDDLE] = 12 + 8, @@ -248,8 +255,14 @@ static const opcode_handler opcode_handler_tbl[256] = { [IB_OPCODE_RC_SEND_ONLY_WITH_INVALIDATE] = &hfi1_rc_rcv, /* TID RDMA has separate handlers for different opcodes.*/ + [IB_OPCODE_TID_RDMA_WRITE_REQ] = &hfi1_rc_rcv_tid_rdma_write_req, + [IB_OPCODE_TID_RDMA_WRITE_RESP] = &hfi1_rc_rcv_tid_rdma_write_resp, + [IB_OPCODE_TID_RDMA_WRITE_DATA] = &hfi1_rc_rcv_tid_rdma_write_data, + [IB_OPCODE_TID_RDMA_WRITE_DATA_LAST] = &hfi1_rc_rcv_tid_rdma_write_data, [IB_OPCODE_TID_RDMA_READ_REQ] = &hfi1_rc_rcv_tid_rdma_read_req, [IB_OPCODE_TID_RDMA_READ_RESP] = &hfi1_rc_rcv_tid_rdma_read_resp, + [IB_OPCODE_TID_RDMA_RESYNC] = &hfi1_rc_rcv_tid_rdma_resync, + [IB_OPCODE_TID_RDMA_ACK] = &hfi1_rc_rcv_tid_rdma_ack, /* UC */ [IB_OPCODE_UC_SEND_FIRST] = &hfi1_uc_rcv, @@ -1332,7 +1345,9 @@ static void hfi1_fill_device_attr(struct hfi1_devdata *dd) rdi->dparms.props.max_mr_size = U64_MAX; rdi->dparms.props.max_fast_reg_page_list_len = UINT_MAX; rdi->dparms.props.max_qp = hfi1_max_qps; - rdi->dparms.props.max_qp_wr = hfi1_max_qp_wrs; + rdi->dparms.props.max_qp_wr = + (hfi1_max_qp_wrs >= HFI1_QP_WQE_INVALID ? + HFI1_QP_WQE_INVALID - 1 : hfi1_max_qp_wrs); rdi->dparms.props.max_send_sge = hfi1_max_sges; rdi->dparms.props.max_recv_sge = hfi1_max_sges; rdi->dparms.props.max_sge_rd = hfi1_max_sges; diff --git a/drivers/infiniband/hw/hfi1/verbs.h b/drivers/infiniband/hw/hfi1/verbs.h index bee3d21a548e..62ace0b2d17a 100644 --- a/drivers/infiniband/hw/hfi1/verbs.h +++ b/drivers/infiniband/hw/hfi1/verbs.h @@ -193,6 +193,7 @@ struct hfi1_qp_priv { u32 r_tid_ack; /* the TID RDMA request to be ACK'ed */ u32 r_tid_alloc; /* Request for which we are allocating resources */ u32 pending_tid_w_segs; /* Num of pending tid write segments */ + u32 pending_tid_w_resp; /* Num of pending tid write responses */ u32 alloc_w_segs; /* Number of segments for which write */ /* resources have been allocated for this QP */ diff --git a/include/rdma/rdmavt_qp.h b/include/rdma/rdmavt_qp.h index 4ee612ab6cb4..f0fbd4063fef 100644 --- a/include/rdma/rdmavt_qp.h +++ b/include/rdma/rdmavt_qp.h @@ -246,6 +246,7 @@ struct rvt_ack_entry { #define RVT_OPERATION_ATOMIC_SGE 0x00000004 #define RVT_OPERATION_LOCAL 0x00000008 #define RVT_OPERATION_USE_RESERVE 0x00000010 +#define RVT_OPERATION_IGN_RNR_CNT 0x00000020 #define RVT_OPERATION_MAX (IB_WR_RESERVED10 + 1) -- cgit v1.2.3-59-g8ed1b From c6c231175ccdf188d443c27e5456b9e2f65e44d4 Mon Sep 17 00:00:00 2001 From: Kaike Wan Date: Wed, 23 Jan 2019 21:51:49 -0800 Subject: IB/hfi1: Add interlock between TID RDMA WRITE and other requests This locking mechanism is designed to provent vavious memory corruption scenarios from occurring when requests are pipelined, especially when RDMA WRITE requests are interleaved with TID RDMA READ requests: 1. READ-AFTER-READ; 2. READ-AFTER-WRITE; 3. WRITE-AFTER-READ; 4. WRITE-AFTER-WRITE. When memory corruption is likely, a request will be held back until previous requests have been completed. Reviewed-by: Mike Marciniszyn Signed-off-by: Mitko Haralanov Signed-off-by: Kaike Wan Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/rc.c | 6 +++++ drivers/infiniband/hw/hfi1/tid_rdma.c | 46 +++++++++++++++++++++++++++++++++-- drivers/infiniband/hw/hfi1/tid_rdma.h | 9 +++++++ 3 files changed, 59 insertions(+), 2 deletions(-) (limited to 'drivers/infiniband/hw/hfi1/rc.c') diff --git a/drivers/infiniband/hw/hfi1/rc.c b/drivers/infiniband/hw/hfi1/rc.c index 6d2abea896e5..cfb863364f50 100644 --- a/drivers/infiniband/hw/hfi1/rc.c +++ b/drivers/infiniband/hw/hfi1/rc.c @@ -173,6 +173,12 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp, } e = &qp->s_ack_queue[qp->s_tail_ack_queue]; + /* Check for tid write fence */ + if ((qpriv->s_flags & HFI1_R_TID_WAIT_INTERLCK) || + hfi1_tid_rdma_ack_interlock(qp, e)) { + iowait_set_flag(&qpriv->s_iowait, IOWAIT_PENDING_IB); + goto bail; + } if (e->opcode == OP(RDMA_READ_REQUEST)) { /* * If a RDMA read response is being resent and diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.c b/drivers/infiniband/hw/hfi1/tid_rdma.c index 490e47a0f68b..286752011f25 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.c +++ b/drivers/infiniband/hw/hfi1/tid_rdma.c @@ -2179,6 +2179,7 @@ static int tid_rdma_rcv_error(struct hfi1_packet *packet, req->state = TID_REQUEST_RESEND; req->cur_seg = req->comp_seg; } + qpriv->s_flags &= ~HFI1_R_TID_WAIT_INTERLCK; } /* Re-process old requests.*/ if (qp->s_acked_ack_queue == qp->s_tail_ack_queue) @@ -3229,6 +3230,7 @@ bool hfi1_tid_rdma_wqe_interlock(struct rvt_qp *qp, struct rvt_swqe *wqe) struct rvt_swqe *prev; struct hfi1_qp_priv *priv = qp->priv; u32 s_prev; + struct tid_rdma_request *req; s_prev = (qp->s_cur == 0 ? qp->s_size : qp->s_cur) - 1; prev = rvt_get_swqe_ptr(qp, s_prev); @@ -3240,14 +3242,28 @@ bool hfi1_tid_rdma_wqe_interlock(struct rvt_qp *qp, struct rvt_swqe *wqe) case IB_WR_ATOMIC_CMP_AND_SWP: case IB_WR_ATOMIC_FETCH_AND_ADD: case IB_WR_RDMA_WRITE: + switch (prev->wr.opcode) { + case IB_WR_TID_RDMA_WRITE: + req = wqe_to_tid_req(prev); + if (req->ack_seg != req->total_segs) + goto interlock; + default: + break; + } case IB_WR_RDMA_READ: - break; + if (prev->wr.opcode != IB_WR_TID_RDMA_WRITE) + break; + /* fall through */ case IB_WR_TID_RDMA_READ: switch (prev->wr.opcode) { case IB_WR_RDMA_READ: if (qp->s_acked != qp->s_cur) goto interlock; break; + case IB_WR_TID_RDMA_WRITE: + req = wqe_to_tid_req(prev); + if (req->ack_seg != req->total_segs) + goto interlock; default: break; } @@ -5157,7 +5173,9 @@ static int make_tid_rdma_ack(struct rvt_qp *qp, e = &qp->s_ack_queue[qpriv->r_tid_ack]; req = ack_to_tid_req(e); flow = req->acked_tail; - } + } else if (req->ack_seg == req->total_segs && + qpriv->s_flags & HFI1_R_TID_WAIT_INTERLCK) + qpriv->s_flags &= ~HFI1_R_TID_WAIT_INTERLCK; hwords += hfi1_build_tid_rdma_write_ack(qp, e, ohdr, flow, &bth1, &bth2); @@ -5310,3 +5328,27 @@ bool hfi1_schedule_tid_send(struct rvt_qp *qp) IOWAIT_PENDING_TID); return false; } + +bool hfi1_tid_rdma_ack_interlock(struct rvt_qp *qp, struct rvt_ack_entry *e) +{ + struct rvt_ack_entry *prev; + struct tid_rdma_request *req; + struct hfi1_ibdev *dev = to_idev(qp->ibqp.device); + struct hfi1_qp_priv *priv = qp->priv; + u32 s_prev; + + s_prev = qp->s_tail_ack_queue == 0 ? rvt_size_atomic(&dev->rdi) : + (qp->s_tail_ack_queue - 1); + prev = &qp->s_ack_queue[s_prev]; + + if ((e->opcode == TID_OP(READ_REQ) || + e->opcode == OP(RDMA_READ_REQUEST)) && + prev->opcode == TID_OP(WRITE_REQ)) { + req = ack_to_tid_req(prev); + if (req->ack_seg != req->total_segs) { + priv->s_flags |= HFI1_R_TID_WAIT_INTERLCK; + return true; + } + } + return false; +} diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.h b/drivers/infiniband/hw/hfi1/tid_rdma.h index 7f8f17ba6c14..44468188a374 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.h +++ b/drivers/infiniband/hw/hfi1/tid_rdma.h @@ -25,6 +25,7 @@ * s_flags, there are no collisions. * * HFI1_S_TID_WAIT_INTERLCK - QP is waiting for requester interlock + * HFI1_R_TID_WAIT_INTERLCK - QP is waiting for responder interlock */ #define HFI1_S_TID_BUSY_SET BIT(0) /* BIT(1) reserved for RVT_S_BUSY. */ @@ -32,9 +33,15 @@ /* BIT(3) reserved for RVT_S_RESP_PENDING. */ /* BIT(4) reserved for RVT_S_ACK_PENDING. */ #define HFI1_S_TID_WAIT_INTERLCK BIT(5) +#define HFI1_R_TID_WAIT_INTERLCK BIT(6) /* BIT(7) - BIT(15) reserved for RVT_S_WAIT_*. */ +/* BIT(16) reserved for RVT_S_SEND_ONE */ #define HFI1_S_TID_RETRY_TIMER BIT(17) +/* BIT(18) reserved for RVT_S_ECN. */ #define HFI1_R_TID_SW_PSN BIT(19) +/* BIT(26) reserved for HFI1_S_WAIT_HALT */ +/* BIT(27) reserved for HFI1_S_WAIT_TID_RESP */ +/* BIT(28) reserved for HFI1_S_WAIT_TID_SPACE */ /* * Unlike regular IB RDMA VERBS, which do not require an entry @@ -309,4 +316,6 @@ void _hfi1_do_tid_send(struct work_struct *work); bool hfi1_schedule_tid_send(struct rvt_qp *qp); +bool hfi1_tid_rdma_ack_interlock(struct rvt_qp *qp, struct rvt_ack_entry *e); + #endif /* HFI1_TID_RDMA_H */ -- cgit v1.2.3-59-g8ed1b From a05c9bdcfd16cec3a004cca339ab45de4cdf4799 Mon Sep 17 00:00:00 2001 From: Kaike Wan Date: Wed, 23 Jan 2019 21:52:09 -0800 Subject: IB/hfi1: Add static trace for TID RDMA WRITE protocol This patch makes the following changes to the static trace: 1. Adds the decoding of TID RDMA WRITE packets in IB header trace; 2. Adds trace events for various stages of the TID RDMA WRITE protocol. These events provide a fine-grained control for monitoring and debugging the hfi1 driver in the filed. Reviewed-by: Mike Marciniszyn Signed-off-by: Kaike Wan Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/rc.c | 12 + drivers/infiniband/hw/hfi1/tid_rdma.c | 88 +++++ drivers/infiniband/hw/hfi1/trace.c | 66 ++++ drivers/infiniband/hw/hfi1/trace_ibhdrs.h | 6 + drivers/infiniband/hw/hfi1/trace_tid.h | 517 +++++++++++++++++++++++++++++- drivers/infiniband/hw/hfi1/trace_tx.h | 6 + 6 files changed, 692 insertions(+), 3 deletions(-) (limited to 'drivers/infiniband/hw/hfi1/rc.c') diff --git a/drivers/infiniband/hw/hfi1/rc.c b/drivers/infiniband/hw/hfi1/rc.c index cfb863364f50..82afa7736be7 100644 --- a/drivers/infiniband/hw/hfi1/rc.c +++ b/drivers/infiniband/hw/hfi1/rc.c @@ -162,6 +162,7 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp, qp->s_acked_ack_queue == qp->s_tail_ack_queue) qp->s_acked_ack_queue = next; qp->s_tail_ack_queue = next; + trace_hfi1_rsp_make_rc_ack(qp, e->psn); /* FALLTHROUGH */ case OP(SEND_ONLY): case OP(ACKNOWLEDGE): @@ -263,6 +264,7 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp, bth2 = mask_psn(e->psn); e->sent = 1; } + trace_hfi1_tid_write_rsp_make_rc_ack(qp); bth0 = qp->s_ack_state << 24; break; @@ -335,6 +337,8 @@ write_resp: hwords += hdrlen; bth0 = qp->s_ack_state << 24; qp->s_ack_rdma_psn++; + trace_hfi1_tid_req_make_rc_ack_write(qp, 0, e->opcode, e->psn, + e->lpsn, req); if (req->cur_seg != req->total_segs) break; @@ -761,6 +765,11 @@ no_flow_control: delta_psn(wqe->lpsn, bth2) + 1; } + trace_hfi1_tid_write_sender_make_req(qp, newreq); + trace_hfi1_tid_req_make_req_write(qp, newreq, + wqe->wr.opcode, + wqe->psn, wqe->lpsn, + req); if (++qp->s_cur == qp->s_size) qp->s_cur = 0; break; @@ -1070,6 +1079,8 @@ no_flow_control: priv->s_tid_cur = qp->s_cur; if (++qp->s_cur == qp->s_size) qp->s_cur = 0; + trace_hfi1_tid_req_make_req_write(qp, 0, wqe->wr.opcode, + wqe->psn, wqe->lpsn, req); break; case TID_OP(READ_RESP): @@ -1625,6 +1636,7 @@ void hfi1_restart_rc(struct rvt_qp *qp, u32 psn, int wait) wqe = do_rc_completion(qp, wqe, ibp); qp->s_flags &= ~RVT_S_WAIT_ACK; } else { + trace_hfi1_tid_write_sender_restart_rc(qp, 0); if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) { struct tid_rdma_request *req; diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.c b/drivers/infiniband/hw/hfi1/tid_rdma.c index db3188f66dba..a49eb3d9b5b9 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.c +++ b/drivers/infiniband/hw/hfi1/tid_rdma.c @@ -2962,6 +2962,12 @@ bool hfi1_handle_kdeth_eflags(struct hfi1_ctxtdata *rcd, e = &qp->s_ack_queue[qpriv->r_tid_tail]; req = ack_to_tid_req(e); flow = &req->flows[req->clear_tail]; + trace_hfi1_eflags_err_write(qp, rcv_type, rte, psn); + trace_hfi1_rsp_handle_kdeth_eflags(qp, psn); + trace_hfi1_tid_write_rsp_handle_kdeth_eflags(qp); + trace_hfi1_tid_req_handle_kdeth_eflags(qp, 0, e->opcode, e->psn, + e->lpsn, req); + trace_hfi1_tid_flow_handle_kdeth_eflags(qp, req->clear_tail, flow); switch (rcv_type) { case RHF_RCV_TYPE_EXPECTED: @@ -3489,6 +3495,8 @@ static void hfi1_tid_write_alloc_resources(struct rvt_qp *qp, bool intr_ctx) lockdep_assert_held(&qp->s_lock); while (1) { + trace_hfi1_rsp_tid_write_alloc_res(qp, 0); + trace_hfi1_tid_write_rsp_alloc_res(qp); /* * Don't allocate more segments if a RNR NAK has already been * scheduled to avoid messing up qp->r_psn: the RNR NAK will @@ -3517,6 +3525,8 @@ static void hfi1_tid_write_alloc_resources(struct rvt_qp *qp, bool intr_ctx) if (e->opcode != TID_OP(WRITE_REQ)) goto next_req; req = ack_to_tid_req(e); + trace_hfi1_tid_req_write_alloc_res(qp, 0, e->opcode, e->psn, + e->lpsn, req); /* Finished allocating for all segments of this request */ if (req->alloc_seg >= req->total_segs) goto next_req; @@ -3633,6 +3643,7 @@ send_rnr_nak: */ qp->s_flags &= ~(RVT_S_ACK_PENDING); + trace_hfi1_rsp_tid_write_alloc_res(qp, qp->r_psn); /* * qpriv->rnr_nak_state is used to determine when the scheduled RNR NAK * has actually been sent. qp->s_flags RVT_S_ACK_PENDING bit cannot be @@ -3686,6 +3697,7 @@ void hfi1_rc_rcv_tid_rdma_write_req(struct hfi1_packet *packet) is_fecn = process_ecn(qp, packet); psn = mask_psn(be32_to_cpu(ohdr->bth[2])); + trace_hfi1_rsp_rcv_tid_write_req(qp, psn); if (qp->state == IB_QPS_RTR && !(qp->r_flags & RVT_R_COMM_EST)) rvt_comm_est(qp); @@ -3794,6 +3806,9 @@ void hfi1_rc_rcv_tid_rdma_write_req(struct hfi1_packet *packet) qp->r_msn++; qp->r_psn++; + trace_hfi1_tid_req_rcv_write_req(qp, 0, e->opcode, e->psn, e->lpsn, + req); + if (qpriv->r_tid_tail == HFI1_QP_WQE_INVALID) { qpriv->r_tid_tail = qp->r_head_ack_queue; } else if (qpriv->r_tid_tail == qpriv->r_tid_head) { @@ -3814,6 +3829,7 @@ update_head: qpriv->r_tid_head = qp->r_head_ack_queue; hfi1_tid_write_alloc_resources(qp, true); + trace_hfi1_tid_write_rsp_rcv_req(qp); /* Schedule the send tasklet. */ qp->s_flags |= RVT_S_RESP_PENDING; @@ -3855,6 +3871,10 @@ u32 hfi1_build_tid_rdma_write_resp(struct rvt_qp *qp, struct rvt_ack_entry *e, void *resp_addr = NULL; struct tid_rdma_params *remote; + trace_hfi1_tid_req_build_write_resp(qp, 0, e->opcode, e->psn, e->lpsn, + req); + trace_hfi1_tid_write_rsp_build_resp(qp); + trace_hfi1_rsp_build_tid_write_resp(qp, bth2); flow = &req->flows[req->flow_idx]; switch (req->state) { default: @@ -3876,12 +3896,14 @@ u32 hfi1_build_tid_rdma_write_resp(struct rvt_qp *qp, struct rvt_ack_entry *e, goto done; req->state = TID_REQUEST_ACTIVE; + trace_hfi1_tid_flow_build_write_resp(qp, req->flow_idx, flow); req->flow_idx = CIRC_NEXT(req->flow_idx, MAX_FLOWS); hfi1_add_tid_reap_timer(qp); break; case TID_REQUEST_RESEND_ACTIVE: case TID_REQUEST_RESEND: + trace_hfi1_tid_flow_build_write_resp(qp, req->flow_idx, flow); req->flow_idx = CIRC_NEXT(req->flow_idx, MAX_FLOWS); if (!CIRC_CNT(req->setup_head, req->flow_idx, MAX_FLOWS)) req->state = TID_REQUEST_ACTIVE; @@ -3996,6 +4018,9 @@ static void hfi1_tid_timeout(struct timer_list *t) if (qpriv->s_flags & HFI1_R_TID_RSC_TIMER) { dd_dev_warn(dd_from_ibdev(qp->ibqp.device), "[QP%u] %s %d\n", qp->ibqp.qp_num, __func__, __LINE__); + trace_hfi1_msg_tid_timeout(/* msg */ + qp, "resource timeout = ", + (u64)qpriv->tid_timer_timeout_jiffies); hfi1_stop_tid_reap_timer(qp); /* * Go though the entire ack queue and clear any outstanding @@ -4102,6 +4127,8 @@ void hfi1_rc_rcv_tid_rdma_write_resp(struct hfi1_packet *packet) if (!do_rc_ack(qp, aeth, psn, opcode, 0, rcd)) goto ack_done; + trace_hfi1_ack(qp, psn); + flow = &req->flows[req->setup_head]; flow->pkt = 0; flow->tid_idx = 0; @@ -4129,13 +4156,17 @@ void hfi1_rc_rcv_tid_rdma_write_resp(struct hfi1_packet *packet) } memcpy(flow->tid_entry, packet->ebuf, pktlen); flow->tidcnt = pktlen / sizeof(*flow->tid_entry); + trace_hfi1_tid_flow_rcv_write_resp(qp, req->setup_head, flow); req->comp_seg++; + trace_hfi1_tid_write_sender_rcv_resp(qp, 0); /* * Walk the TID_ENTRY list to make sure we have enough space for a * complete segment. */ for (i = 0; i < flow->tidcnt; i++) { + trace_hfi1_tid_entry_rcv_write_resp(/* entry */ + qp, i, flow->tid_entry[i]); if (!EXP_TID_GET(flow->tid_entry[i], LEN)) { status = IB_WC_LOC_LEN_ERR; goto ack_err; @@ -4147,6 +4178,8 @@ void hfi1_rc_rcv_tid_rdma_write_resp(struct hfi1_packet *packet) goto ack_err; } + trace_hfi1_tid_req_rcv_write_resp(qp, 0, wqe->wr.opcode, wqe->psn, + wqe->lpsn, req); /* * If this is the first response for this request, set the initial * flow index to the current flow. @@ -4221,6 +4254,8 @@ bool hfi1_build_tid_rdma_packet(struct rvt_swqe *wqe, next_offset = flow->tid_offset + *len; last_pkt = (flow->tid_idx == (flow->tidcnt - 1) && next_offset >= tidlen) || (flow->sent >= flow->length); + trace_hfi1_tid_entry_build_write_data(qp, flow->tid_idx, tidentry); + trace_hfi1_tid_flow_build_write_data(qp, req->clear_tail, flow); rcu_read_lock(); remote = rcu_dereference(qpriv->tid_rdma.remote); @@ -4303,6 +4338,10 @@ void hfi1_rc_rcv_tid_rdma_write_data(struct hfi1_packet *packet) * - The entire request is complete and there are no more requests * (of any kind) in the queue. */ + trace_hfi1_rsp_rcv_tid_write_data(qp, psn); + trace_hfi1_tid_req_rcv_write_data(qp, 0, e->opcode, e->psn, e->lpsn, + req); + trace_hfi1_tid_write_rsp_rcv_data(qp); if (priv->r_tid_ack == HFI1_QP_WQE_INVALID) priv->r_tid_ack = priv->r_tid_tail; @@ -4451,6 +4490,7 @@ void hfi1_rc_rcv_tid_rdma_ack(struct hfi1_packet *packet) unsigned long flags; u16 fidx; + trace_hfi1_tid_write_sender_rcv_tid_ack(qp, 0); is_fecn = process_ecn(qp, packet); psn = mask_psn(be32_to_cpu(ohdr->bth[2])); aeth = be32_to_cpu(ohdr->u.tid_rdma.ack.aeth); @@ -4458,6 +4498,7 @@ void hfi1_rc_rcv_tid_rdma_ack(struct hfi1_packet *packet) resync_psn = mask_psn(be32_to_cpu(ohdr->u.tid_rdma.ack.tid_flow_psn)); spin_lock_irqsave(&qp->s_lock, flags); + trace_hfi1_rcv_tid_ack(qp, aeth, psn, req_psn, resync_psn); /* If we are waiting for an ACK to RESYNC, drop any other packets */ if ((qp->s_flags & HFI1_S_WAIT_HALT) && @@ -4480,7 +4521,10 @@ void hfi1_rc_rcv_tid_rdma_ack(struct hfi1_packet *packet) goto ack_op_err; req = wqe_to_tid_req(wqe); + trace_hfi1_tid_req_rcv_tid_ack(qp, 0, wqe->wr.opcode, wqe->psn, + wqe->lpsn, req); flow = &req->flows[req->acked_tail]; + trace_hfi1_tid_flow_rcv_tid_ack(qp, req->acked_tail, flow); /* Drop stale ACK/NAK */ if (cmp_psn(psn, full_flow_psn(flow, flow->flow_state.spsn)) < 0) @@ -4493,11 +4537,14 @@ void hfi1_rc_rcv_tid_rdma_ack(struct hfi1_packet *packet) /* advance acked segment pointer */ req->acked_tail = CIRC_NEXT(req->acked_tail, MAX_FLOWS); req->r_last_acked = flow->flow_state.resp_ib_psn; + trace_hfi1_tid_req_rcv_tid_ack(qp, 0, wqe->wr.opcode, wqe->psn, + wqe->lpsn, req); if (req->ack_seg == req->total_segs) { req->state = TID_REQUEST_COMPLETE; wqe = do_rc_completion(qp, wqe, to_iport(qp->ibqp.device, qp->port_num)); + trace_hfi1_sender_rcv_tid_ack(qp); atomic_dec(&qpriv->n_tid_requests); if (qp->s_acked == qp->s_tail) break; @@ -4506,8 +4553,11 @@ void hfi1_rc_rcv_tid_rdma_ack(struct hfi1_packet *packet) req = wqe_to_tid_req(wqe); } flow = &req->flows[req->acked_tail]; + trace_hfi1_tid_flow_rcv_tid_ack(qp, req->acked_tail, flow); } + trace_hfi1_tid_req_rcv_tid_ack(qp, 0, wqe->wr.opcode, wqe->psn, + wqe->lpsn, req); switch (aeth >> 29) { case 0: /* ACK */ if (qpriv->s_flags & RVT_S_WAIT_ACK) @@ -4614,6 +4664,9 @@ void hfi1_rc_rcv_tid_rdma_ack(struct hfi1_packet *packet) flow->pkt = 0; spsn += flow->npkts; resync_psn += flow->npkts; + trace_hfi1_tid_flow_rcv_tid_ack(qp, + fidx, + flow); } if (++last_acked == qpriv->s_tid_cur + 1) break; @@ -4638,6 +4691,8 @@ done: case 0: /* PSN sequence error */ flow = &req->flows[req->acked_tail]; fspsn = full_flow_psn(flow, flow->flow_state.spsn); + trace_hfi1_tid_flow_rcv_tid_ack(qp, req->acked_tail, + flow); req->r_ack_psn = mask_psn(be32_to_cpu(ohdr->bth[2])); req->cur_seg = req->ack_seg; qpriv->s_tid_tail = qp->s_acked; @@ -4713,16 +4768,28 @@ static void hfi1_tid_retry_timeout(struct timer_list *t) struct rvt_qp *qp = priv->owner; struct rvt_swqe *wqe; unsigned long flags; + struct tid_rdma_request *req; spin_lock_irqsave(&qp->r_lock, flags); spin_lock(&qp->s_lock); + trace_hfi1_tid_write_sender_retry_timeout(qp, 0); if (priv->s_flags & HFI1_S_TID_RETRY_TIMER) { hfi1_stop_tid_retry_timer(qp); if (!priv->s_retry) { + trace_hfi1_msg_tid_retry_timeout(/* msg */ + qp, + "Exhausted retries. Tid retry timeout = ", + (u64)priv->tid_retry_timeout_jiffies); + wqe = rvt_get_swqe_ptr(qp, qp->s_acked); hfi1_trdma_send_complete(qp, wqe, IB_WC_RETRY_EXC_ERR); rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR); } else { + wqe = rvt_get_swqe_ptr(qp, qp->s_acked); + req = wqe_to_tid_req(wqe); + trace_hfi1_tid_req_tid_retry_timeout(/* req */ + qp, 0, wqe->wr.opcode, wqe->psn, wqe->lpsn, req); + priv->s_flags &= ~RVT_S_WAIT_ACK; /* Only send one packet (the RESYNC) */ priv->s_flags |= RVT_S_SEND_ONE; @@ -4818,6 +4885,7 @@ void hfi1_rc_rcv_tid_rdma_resync(struct hfi1_packet *packet) * sync point and the flow has/will be reprogrammed */ qpriv->s_flags &= ~HFI1_R_TID_SW_PSN; + trace_hfi1_tid_write_rsp_rcv_resync(qp); /* * Reset all TID flow information with the new generation. @@ -4832,6 +4900,8 @@ void hfi1_rc_rcv_tid_rdma_resync(struct hfi1_packet *packet) e = &qp->s_ack_queue[idx]; if (e->opcode == TID_OP(WRITE_REQ)) { req = ack_to_tid_req(e); + trace_hfi1_tid_req_rcv_resync(qp, 0, e->opcode, e->psn, + e->lpsn, req); /* start from last unacked segment */ for (flow_idx = req->clear_tail; @@ -4854,6 +4924,8 @@ void hfi1_rc_rcv_tid_rdma_resync(struct hfi1_packet *packet) full_flow_psn(flow, flow->flow_state.spsn); fs->psn += flow->npkts; + trace_hfi1_tid_flow_rcv_resync(qp, flow_idx, + flow); } } if (idx == qp->s_tail_ack_queue) @@ -4913,6 +4985,7 @@ int hfi1_make_tid_rdma_pkt(struct rvt_qp *qp, struct hfi1_pkt_state *ps) u8 opcode = TID_OP(WRITE_DATA); lockdep_assert_held(&qp->s_lock); + trace_hfi1_tid_write_sender_make_tid_pkt(qp, 0); /* * Prioritize the sending of the requests and responses over the * sending of the TID RDMA data packets. @@ -4970,6 +5043,8 @@ int hfi1_make_tid_rdma_pkt(struct rvt_qp *qp, struct hfi1_pkt_state *ps) goto bail; wqe = rvt_get_swqe_ptr(qp, priv->s_tid_tail); req = wqe_to_tid_req(wqe); + trace_hfi1_tid_req_make_tid_pkt(qp, 0, wqe->wr.opcode, wqe->psn, + wqe->lpsn, req); switch (priv->s_state) { case TID_OP(WRITE_REQ): case TID_OP(WRITE_RESP): @@ -4997,6 +5072,8 @@ int hfi1_make_tid_rdma_pkt(struct rvt_qp *qp, struct hfi1_pkt_state *ps) * 3.2.2 Advance RESP pointers. * 3.3 Return indicating progress made. */ + trace_hfi1_sender_make_tid_pkt(qp); + trace_hfi1_tid_write_sender_make_tid_pkt(qp, 0); wqe = rvt_get_swqe_ptr(qp, priv->s_tid_tail); req = wqe_to_tid_req(wqe); len = wqe->length; @@ -5004,6 +5081,8 @@ int hfi1_make_tid_rdma_pkt(struct rvt_qp *qp, struct hfi1_pkt_state *ps) if (!req->comp_seg || req->cur_seg == req->comp_seg) goto bail; + trace_hfi1_tid_req_make_tid_pkt(qp, 0, wqe->wr.opcode, + wqe->psn, wqe->lpsn, req); last = hfi1_build_tid_rdma_packet(wqe, ohdr, &bth1, &bth2, &len); @@ -5028,6 +5107,7 @@ int hfi1_make_tid_rdma_pkt(struct rvt_qp *qp, struct hfi1_pkt_state *ps) break; case TID_OP(RESYNC): + trace_hfi1_sender_make_tid_pkt(qp); /* Use generation from the most recently received response */ wqe = rvt_get_swqe_ptr(qp, priv->s_tid_cur); req = wqe_to_tid_req(wqe); @@ -5098,6 +5178,7 @@ static int make_tid_rdma_ack(struct rvt_qp *qp, u16 flow; struct tid_rdma_request *req, *nreq; + trace_hfi1_tid_write_rsp_make_tid_ack(qp); /* Don't send an ACK if we aren't supposed to. */ if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) goto bail; @@ -5128,6 +5209,9 @@ static int make_tid_rdma_ack(struct rvt_qp *qp, req = ack_to_tid_req(e); } + trace_hfi1_rsp_make_tid_ack(qp, e->psn); + trace_hfi1_tid_req_make_tid_ack(qp, 0, e->opcode, e->psn, e->lpsn, + req); /* * If we've sent all the ACKs that we can, we are done * until we get more segments... @@ -5199,6 +5283,9 @@ static int make_tid_rdma_ack(struct rvt_qp *qp, qpriv->s_flags & HFI1_R_TID_WAIT_INTERLCK) qpriv->s_flags &= ~HFI1_R_TID_WAIT_INTERLCK; + trace_hfi1_tid_write_rsp_make_tid_ack(qp); + trace_hfi1_tid_req_make_tid_ack(qp, 0, e->opcode, e->psn, e->lpsn, + req); hwords += hfi1_build_tid_rdma_write_ack(qp, e, ohdr, flow, &bth1, &bth2); len = 0; @@ -5251,6 +5338,7 @@ static void hfi1_do_tid_send(struct rvt_qp *qp) ps.in_thread = false; ps.timeout_int = qp->timeout_jiffies / 8; + trace_hfi1_rc_do_tid_send(qp, false); spin_lock_irqsave(&qp->s_lock, ps.flags); /* Return if we are already busy processing a work request. */ diff --git a/drivers/infiniband/hw/hfi1/trace.c b/drivers/infiniband/hw/hfi1/trace.c index 28181d711fed..9a3d236bcc88 100644 --- a/drivers/infiniband/hw/hfi1/trace.c +++ b/drivers/infiniband/hw/hfi1/trace.c @@ -133,6 +133,11 @@ const char *hfi1_trace_get_packet_l2_str(u8 l2) #define TID_RDMA_KDETH_DATA "kdeth0 0x%x: kver %u sh %u intr %u tidctrl %u tid %x offset %x kdeth1 0x%x: jkey %x" #define TID_READ_REQ_PRN "tid_flow_psn 0x%x tid_flow_qp 0x%x verbs_qp 0x%x" #define TID_READ_RSP_PRN "verbs_qp 0x%x" +#define TID_WRITE_REQ_PRN "original_qp 0x%x" +#define TID_WRITE_RSP_PRN "tid_flow_psn 0x%x tid_flow_qp 0x%x verbs_qp 0x%x" +#define TID_WRITE_DATA_PRN "verbs_qp 0x%x" +#define TID_ACK_PRN "tid_flow_psn 0x%x verbs_psn 0x%x tid_flow_qp 0x%x verbs_qp 0x%x" +#define TID_RESYNC_PRN "verbs_qp 0x%x" #define OP(transport, op) IB_OPCODE_## transport ## _ ## op @@ -327,6 +332,45 @@ const char *parse_everbs_hdrs( parse_syndrome(be32_to_cpu(eh->aeth) >> 24), be32_to_cpu(eh->aeth) & IB_MSN_MASK); break; + case OP(TID_RDMA, WRITE_REQ): + trace_seq_printf(p, TID_RDMA_KDETH " " RETH_PRN " " + TID_WRITE_REQ_PRN, + le32_to_cpu(eh->tid_rdma.w_req.kdeth0), + le32_to_cpu(eh->tid_rdma.w_req.kdeth1), + ib_u64_get(&eh->tid_rdma.w_req.reth.vaddr), + be32_to_cpu(eh->tid_rdma.w_req.reth.rkey), + be32_to_cpu(eh->tid_rdma.w_req.reth.length), + be32_to_cpu(eh->tid_rdma.w_req.verbs_qp)); + break; + case OP(TID_RDMA, WRITE_RESP): + trace_seq_printf(p, TID_RDMA_KDETH " " AETH_PRN " " + TID_WRITE_RSP_PRN, + le32_to_cpu(eh->tid_rdma.w_rsp.kdeth0), + le32_to_cpu(eh->tid_rdma.w_rsp.kdeth1), + be32_to_cpu(eh->tid_rdma.w_rsp.aeth) >> 24, + parse_syndrome(/* aeth */ + be32_to_cpu(eh->tid_rdma.w_rsp.aeth) + >> 24), + (be32_to_cpu(eh->tid_rdma.w_rsp.aeth) & + IB_MSN_MASK), + be32_to_cpu(eh->tid_rdma.w_rsp.tid_flow_psn), + be32_to_cpu(eh->tid_rdma.w_rsp.tid_flow_qp), + be32_to_cpu(eh->tid_rdma.w_rsp.verbs_qp)); + break; + case OP(TID_RDMA, WRITE_DATA_LAST): + case OP(TID_RDMA, WRITE_DATA): + trace_seq_printf(p, TID_RDMA_KDETH_DATA " " TID_WRITE_DATA_PRN, + le32_to_cpu(eh->tid_rdma.w_data.kdeth0), + KDETH_GET(eh->tid_rdma.w_data.kdeth0, KVER), + KDETH_GET(eh->tid_rdma.w_data.kdeth0, SH), + KDETH_GET(eh->tid_rdma.w_data.kdeth0, INTR), + KDETH_GET(eh->tid_rdma.w_data.kdeth0, TIDCTRL), + KDETH_GET(eh->tid_rdma.w_data.kdeth0, TID), + KDETH_GET(eh->tid_rdma.w_data.kdeth0, OFFSET), + le32_to_cpu(eh->tid_rdma.w_data.kdeth1), + KDETH_GET(eh->tid_rdma.w_data.kdeth1, JKEY), + be32_to_cpu(eh->tid_rdma.w_data.verbs_qp)); + break; case OP(TID_RDMA, READ_REQ): trace_seq_printf(p, TID_RDMA_KDETH " " RETH_PRN " " TID_READ_REQ_PRN, @@ -359,6 +403,28 @@ const char *parse_everbs_hdrs( IB_MSN_MASK), be32_to_cpu(eh->tid_rdma.r_rsp.verbs_qp)); break; + case OP(TID_RDMA, ACK): + trace_seq_printf(p, TID_RDMA_KDETH " " AETH_PRN " " + TID_ACK_PRN, + le32_to_cpu(eh->tid_rdma.ack.kdeth0), + le32_to_cpu(eh->tid_rdma.ack.kdeth1), + be32_to_cpu(eh->tid_rdma.ack.aeth) >> 24, + parse_syndrome(/* aeth */ + be32_to_cpu(eh->tid_rdma.ack.aeth) + >> 24), + (be32_to_cpu(eh->tid_rdma.ack.aeth) & + IB_MSN_MASK), + be32_to_cpu(eh->tid_rdma.ack.tid_flow_psn), + be32_to_cpu(eh->tid_rdma.ack.verbs_psn), + be32_to_cpu(eh->tid_rdma.ack.tid_flow_qp), + be32_to_cpu(eh->tid_rdma.ack.verbs_qp)); + break; + case OP(TID_RDMA, RESYNC): + trace_seq_printf(p, TID_RDMA_KDETH " " TID_RESYNC_PRN, + le32_to_cpu(eh->tid_rdma.resync.kdeth0), + le32_to_cpu(eh->tid_rdma.resync.kdeth1), + be32_to_cpu(eh->tid_rdma.resync.verbs_qp)); + break; /* aeth + atomicacketh */ case OP(RC, ATOMIC_ACKNOWLEDGE): trace_seq_printf(p, AETH_PRN " " ATOMICACKETH_PRN, diff --git a/drivers/infiniband/hw/hfi1/trace_ibhdrs.h b/drivers/infiniband/hw/hfi1/trace_ibhdrs.h index 1116238bf24d..d1372cc66de6 100644 --- a/drivers/infiniband/hw/hfi1/trace_ibhdrs.h +++ b/drivers/infiniband/hw/hfi1/trace_ibhdrs.h @@ -79,8 +79,14 @@ __print_symbolic(opcode, \ ib_opcode_name(RC_ATOMIC_ACKNOWLEDGE), \ ib_opcode_name(RC_COMPARE_SWAP), \ ib_opcode_name(RC_FETCH_ADD), \ + ib_opcode_name(TID_RDMA_WRITE_REQ), \ + ib_opcode_name(TID_RDMA_WRITE_RESP), \ + ib_opcode_name(TID_RDMA_WRITE_DATA), \ + ib_opcode_name(TID_RDMA_WRITE_DATA_LAST), \ ib_opcode_name(TID_RDMA_READ_REQ), \ ib_opcode_name(TID_RDMA_READ_RESP), \ + ib_opcode_name(TID_RDMA_RESYNC), \ + ib_opcode_name(TID_RDMA_ACK), \ ib_opcode_name(UC_SEND_FIRST), \ ib_opcode_name(UC_SEND_MIDDLE), \ ib_opcode_name(UC_SEND_LAST), \ diff --git a/drivers/infiniband/hw/hfi1/trace_tid.h b/drivers/infiniband/hw/hfi1/trace_tid.h index a45b5257d6c4..548dfc45a407 100644 --- a/drivers/infiniband/hw/hfi1/trace_tid.h +++ b/drivers/infiniband/hw/hfi1/trace_tid.h @@ -56,16 +56,33 @@ u16 hfi1_trace_get_tid_idx(u32 ent); "fpsn 0x%x flow_flags 0x%x" #define TID_REQ_PRN "[%s] qpn 0x%x newreq %u opcode 0x%x psn 0x%x lpsn 0x%x " \ - "cur_seg %u comp_seg %u ack_seg %u " \ + "cur_seg %u comp_seg %u ack_seg %u alloc_seg %u " \ "total_segs %u setup_head %u clear_tail %u flow_idx %u " \ - "state %u r_flow_psn 0x%x " \ - "s_next_psn 0x%x" + "acked_tail %u state %u r_ack_psn 0x%x r_flow_psn 0x%x " \ + "r_last_ackd 0x%x s_next_psn 0x%x" #define RCV_ERR_PRN "[%s] qpn 0x%x s_flags 0x%x state 0x%x " \ "s_acked_ack_queue %u s_tail_ack_queue %u " \ "r_head_ack_queue %u opcode 0x%x psn 0x%x r_psn 0x%x " \ " diff %d" +#define TID_WRITE_RSPDR_PRN "[%s] qpn 0x%x r_tid_head %u r_tid_tail %u " \ + "r_tid_ack %u r_tid_alloc %u alloc_w_segs %u " \ + "pending_tid_w_segs %u sync_pt %s " \ + "ps_nak_psn 0x%x ps_nak_state 0x%x " \ + "prnr_nak_state 0x%x hw_flow_index %u generation "\ + "0x%x fpsn 0x%x flow_flags 0x%x resync %s" \ + "r_next_psn_kdeth 0x%x" + +#define TID_WRITE_SENDER_PRN "[%s] qpn 0x%x newreq %u s_tid_cur %u " \ + "s_tid_tail %u s_tid_head %u " \ + "pending_tid_w_resp %u n_requests %u " \ + "n_tid_requests %u s_flags 0x%x ps_flags 0x%x "\ + "iow_flags 0x%lx s_state 0x%x s_retry %u" + +#define KDETH_EFLAGS_ERR_PRN "[%s] qpn 0x%x TID ERR: RcvType 0x%x " \ + "RcvTypeError 0x%x PSN 0x%x" + DECLARE_EVENT_CLASS(/* class */ hfi1_exp_tid_reg_unreg, TP_PROTO(unsigned int ctxt, u16 subctxt, u32 rarr, u32 npages, @@ -382,6 +399,18 @@ DEFINE_EVENT(/* event */ TP_ARGS(qp, msg, more) ); +DEFINE_EVENT(/* event */ + hfi1_msg_template, hfi1_msg_tid_timeout, + TP_PROTO(struct rvt_qp *qp, const char *msg, u64 more), + TP_ARGS(qp, msg, more) +); + +DEFINE_EVENT(/* event */ + hfi1_msg_template, hfi1_msg_tid_retry_timeout, + TP_PROTO(struct rvt_qp *qp, const char *msg, u64 more), + TP_ARGS(qp, msg, more) +); + DECLARE_EVENT_CLASS(/* tid_flow_page */ hfi1_tid_flow_page_template, TP_PROTO(struct rvt_qp *qp, struct tid_rdma_flow *flow, u32 index, @@ -562,6 +591,42 @@ DEFINE_EVENT(/* event */ TP_ARGS(qp, index, flow) ); +DEFINE_EVENT(/* event */ + hfi1_tid_flow_template, hfi1_tid_flow_build_write_resp, + TP_PROTO(struct rvt_qp *qp, int index, struct tid_rdma_flow *flow), + TP_ARGS(qp, index, flow) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_flow_template, hfi1_tid_flow_rcv_write_resp, + TP_PROTO(struct rvt_qp *qp, int index, struct tid_rdma_flow *flow), + TP_ARGS(qp, index, flow) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_flow_template, hfi1_tid_flow_build_write_data, + TP_PROTO(struct rvt_qp *qp, int index, struct tid_rdma_flow *flow), + TP_ARGS(qp, index, flow) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_flow_template, hfi1_tid_flow_rcv_tid_ack, + TP_PROTO(struct rvt_qp *qp, int index, struct tid_rdma_flow *flow), + TP_ARGS(qp, index, flow) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_flow_template, hfi1_tid_flow_rcv_resync, + TP_PROTO(struct rvt_qp *qp, int index, struct tid_rdma_flow *flow), + TP_ARGS(qp, index, flow) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_flow_template, hfi1_tid_flow_handle_kdeth_eflags, + TP_PROTO(struct rvt_qp *qp, int index, struct tid_rdma_flow *flow), + TP_ARGS(qp, index, flow) +); + DECLARE_EVENT_CLASS(/* tid_node */ hfi1_tid_node_template, TP_PROTO(struct rvt_qp *qp, const char *msg, u32 index, u32 base, @@ -656,6 +721,18 @@ DEFINE_EVENT(/* event */ TP_ARGS(qp, index, ent) ); +DEFINE_EVENT(/* event */ + hfi1_tid_entry_template, hfi1_tid_entry_rcv_write_resp, + TP_PROTO(struct rvt_qp *qp, int index, u32 entry), + TP_ARGS(qp, index, entry) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_entry_template, hfi1_tid_entry_build_write_data, + TP_PROTO(struct rvt_qp *qp, int index, u32 entry), + TP_ARGS(qp, index, entry) +); + DECLARE_EVENT_CLASS(/* rsp_info */ hfi1_responder_info_template, TP_PROTO(struct rvt_qp *qp, u32 psn), @@ -738,6 +815,42 @@ DEFINE_EVENT(/* event */ TP_ARGS(qp, psn) ); +DEFINE_EVENT(/* event */ + hfi1_responder_info_template, hfi1_rsp_tid_write_alloc_res, + TP_PROTO(struct rvt_qp *qp, u32 psn), + TP_ARGS(qp, psn) +); + +DEFINE_EVENT(/* event */ + hfi1_responder_info_template, hfi1_rsp_rcv_tid_write_req, + TP_PROTO(struct rvt_qp *qp, u32 psn), + TP_ARGS(qp, psn) +); + +DEFINE_EVENT(/* event */ + hfi1_responder_info_template, hfi1_rsp_build_tid_write_resp, + TP_PROTO(struct rvt_qp *qp, u32 psn), + TP_ARGS(qp, psn) +); + +DEFINE_EVENT(/* event */ + hfi1_responder_info_template, hfi1_rsp_rcv_tid_write_data, + TP_PROTO(struct rvt_qp *qp, u32 psn), + TP_ARGS(qp, psn) +); + +DEFINE_EVENT(/* event */ + hfi1_responder_info_template, hfi1_rsp_make_tid_ack, + TP_PROTO(struct rvt_qp *qp, u32 psn), + TP_ARGS(qp, psn) +); + +DEFINE_EVENT(/* event */ + hfi1_responder_info_template, hfi1_rsp_handle_kdeth_eflags, + TP_PROTO(struct rvt_qp *qp, u32 psn), + TP_ARGS(qp, psn) +); + DECLARE_EVENT_CLASS(/* sender_info */ hfi1_sender_info_template, TP_PROTO(struct rvt_qp *qp), @@ -830,6 +943,18 @@ DEFINE_EVENT(/* event */ TP_ARGS(qp) ); +DEFINE_EVENT(/* event */ + hfi1_sender_info_template, hfi1_sender_rcv_tid_ack, + TP_PROTO(struct rvt_qp *qp), + TP_ARGS(qp) +); + +DEFINE_EVENT(/* event */ + hfi1_sender_info_template, hfi1_sender_make_tid_pkt, + TP_PROTO(struct rvt_qp *qp), + TP_ARGS(qp) +); + DECLARE_EVENT_CLASS(/* tid_read_sender */ hfi1_tid_read_sender_template, TP_PROTO(struct rvt_qp *qp, char newreq), @@ -908,12 +1033,16 @@ DECLARE_EVENT_CLASS(/* tid_rdma_request */ __field(u32, cur_seg) __field(u32, comp_seg) __field(u32, ack_seg) + __field(u32, alloc_seg) __field(u32, total_segs) __field(u16, setup_head) __field(u16, clear_tail) __field(u16, flow_idx) + __field(u16, acked_tail) __field(u32, state) + __field(u32, r_ack_psn) __field(u32, r_flow_psn) + __field(u32, r_last_acked) __field(u32, s_next_psn) ), TP_fast_assign(/* assign */ @@ -926,12 +1055,16 @@ DECLARE_EVENT_CLASS(/* tid_rdma_request */ __entry->cur_seg = req->cur_seg; __entry->comp_seg = req->comp_seg; __entry->ack_seg = req->ack_seg; + __entry->alloc_seg = req->alloc_seg; __entry->total_segs = req->total_segs; __entry->setup_head = req->setup_head; __entry->clear_tail = req->clear_tail; __entry->flow_idx = req->flow_idx; + __entry->acked_tail = req->acked_tail; __entry->state = req->state; + __entry->r_ack_psn = req->r_ack_psn; __entry->r_flow_psn = req->r_flow_psn; + __entry->r_last_acked = req->r_last_acked; __entry->s_next_psn = req->s_next_psn; ), TP_printk(/* print */ @@ -945,12 +1078,16 @@ DECLARE_EVENT_CLASS(/* tid_rdma_request */ __entry->cur_seg, __entry->comp_seg, __entry->ack_seg, + __entry->alloc_seg, __entry->total_segs, __entry->setup_head, __entry->clear_tail, __entry->flow_idx, + __entry->acked_tail, __entry->state, + __entry->r_ack_psn, __entry->r_flow_psn, + __entry->r_last_acked, __entry->s_next_psn ) ); @@ -1004,6 +1141,97 @@ DEFINE_EVENT(/* event */ TP_ARGS(qp, newreq, opcode, psn, lpsn, req) ); +DEFINE_EVENT(/* event */ + hfi1_tid_rdma_request_template, hfi1_tid_req_write_alloc_res, + TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn, + struct tid_rdma_request *req), + TP_ARGS(qp, newreq, opcode, psn, lpsn, req) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_rdma_request_template, hfi1_tid_req_rcv_write_req, + TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn, + struct tid_rdma_request *req), + TP_ARGS(qp, newreq, opcode, psn, lpsn, req) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_rdma_request_template, hfi1_tid_req_build_write_resp, + TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn, + struct tid_rdma_request *req), + TP_ARGS(qp, newreq, opcode, psn, lpsn, req) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_rdma_request_template, hfi1_tid_req_rcv_write_resp, + TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn, + struct tid_rdma_request *req), + TP_ARGS(qp, newreq, opcode, psn, lpsn, req) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_rdma_request_template, hfi1_tid_req_rcv_write_data, + TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn, + struct tid_rdma_request *req), + TP_ARGS(qp, newreq, opcode, psn, lpsn, req) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_rdma_request_template, hfi1_tid_req_rcv_tid_ack, + TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn, + struct tid_rdma_request *req), + TP_ARGS(qp, newreq, opcode, psn, lpsn, req) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_rdma_request_template, hfi1_tid_req_tid_retry_timeout, + TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn, + struct tid_rdma_request *req), + TP_ARGS(qp, newreq, opcode, psn, lpsn, req) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_rdma_request_template, hfi1_tid_req_rcv_resync, + TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn, + struct tid_rdma_request *req), + TP_ARGS(qp, newreq, opcode, psn, lpsn, req) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_rdma_request_template, hfi1_tid_req_make_tid_pkt, + TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn, + struct tid_rdma_request *req), + TP_ARGS(qp, newreq, opcode, psn, lpsn, req) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_rdma_request_template, hfi1_tid_req_make_tid_ack, + TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn, + struct tid_rdma_request *req), + TP_ARGS(qp, newreq, opcode, psn, lpsn, req) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_rdma_request_template, hfi1_tid_req_handle_kdeth_eflags, + TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn, + struct tid_rdma_request *req), + TP_ARGS(qp, newreq, opcode, psn, lpsn, req) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_rdma_request_template, hfi1_tid_req_make_rc_ack_write, + TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn, + struct tid_rdma_request *req), + TP_ARGS(qp, newreq, opcode, psn, lpsn, req) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_rdma_request_template, hfi1_tid_req_make_req_write, + TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn, + struct tid_rdma_request *req), + TP_ARGS(qp, newreq, opcode, psn, lpsn, req) +); + DECLARE_EVENT_CLASS(/* rc_rcv_err */ hfi1_rc_rcv_err_template, TP_PROTO(struct rvt_qp *qp, u32 opcode, u32 psn, int diff), @@ -1090,6 +1318,289 @@ DEFINE_EVENT(/* event */ TP_ARGS(qp, index, sge) ); +DECLARE_EVENT_CLASS(/* tid_write_sp */ + hfi1_tid_write_rsp_template, + TP_PROTO(struct rvt_qp *qp), + TP_ARGS(qp), + TP_STRUCT__entry(/* entry */ + DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device)) + __field(u32, qpn) + __field(u32, r_tid_head) + __field(u32, r_tid_tail) + __field(u32, r_tid_ack) + __field(u32, r_tid_alloc) + __field(u32, alloc_w_segs) + __field(u32, pending_tid_w_segs) + __field(bool, sync_pt) + __field(u32, ps_nak_psn) + __field(u8, ps_nak_state) + __field(u8, prnr_nak_state) + __field(u32, hw_flow_index) + __field(u32, generation) + __field(u32, fpsn) + __field(u32, flow_flags) + __field(bool, resync) + __field(u32, r_next_psn_kdeth) + ), + TP_fast_assign(/* assign */ + struct hfi1_qp_priv *priv = qp->priv; + + DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device)); + __entry->qpn = qp->ibqp.qp_num; + __entry->r_tid_head = priv->r_tid_head; + __entry->r_tid_tail = priv->r_tid_tail; + __entry->r_tid_ack = priv->r_tid_ack; + __entry->r_tid_alloc = priv->r_tid_alloc; + __entry->alloc_w_segs = priv->alloc_w_segs; + __entry->pending_tid_w_segs = priv->pending_tid_w_segs; + __entry->sync_pt = priv->sync_pt; + __entry->ps_nak_psn = priv->s_nak_psn; + __entry->ps_nak_state = priv->s_nak_state; + __entry->prnr_nak_state = priv->rnr_nak_state; + __entry->hw_flow_index = priv->flow_state.index; + __entry->generation = priv->flow_state.generation; + __entry->fpsn = priv->flow_state.psn; + __entry->flow_flags = priv->flow_state.flags; + __entry->resync = priv->resync; + __entry->r_next_psn_kdeth = priv->r_next_psn_kdeth; + ), + TP_printk(/* print */ + TID_WRITE_RSPDR_PRN, + __get_str(dev), + __entry->qpn, + __entry->r_tid_head, + __entry->r_tid_tail, + __entry->r_tid_ack, + __entry->r_tid_alloc, + __entry->alloc_w_segs, + __entry->pending_tid_w_segs, + __entry->sync_pt ? "yes" : "no", + __entry->ps_nak_psn, + __entry->ps_nak_state, + __entry->prnr_nak_state, + __entry->hw_flow_index, + __entry->generation, + __entry->fpsn, + __entry->flow_flags, + __entry->resync ? "yes" : "no", + __entry->r_next_psn_kdeth + ) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_write_rsp_template, hfi1_tid_write_rsp_alloc_res, + TP_PROTO(struct rvt_qp *qp), + TP_ARGS(qp) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_write_rsp_template, hfi1_tid_write_rsp_rcv_req, + TP_PROTO(struct rvt_qp *qp), + TP_ARGS(qp) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_write_rsp_template, hfi1_tid_write_rsp_build_resp, + TP_PROTO(struct rvt_qp *qp), + TP_ARGS(qp) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_write_rsp_template, hfi1_tid_write_rsp_rcv_data, + TP_PROTO(struct rvt_qp *qp), + TP_ARGS(qp) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_write_rsp_template, hfi1_tid_write_rsp_rcv_resync, + TP_PROTO(struct rvt_qp *qp), + TP_ARGS(qp) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_write_rsp_template, hfi1_tid_write_rsp_make_tid_ack, + TP_PROTO(struct rvt_qp *qp), + TP_ARGS(qp) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_write_rsp_template, hfi1_tid_write_rsp_handle_kdeth_eflags, + TP_PROTO(struct rvt_qp *qp), + TP_ARGS(qp) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_write_rsp_template, hfi1_tid_write_rsp_make_rc_ack, + TP_PROTO(struct rvt_qp *qp), + TP_ARGS(qp) +); + +DECLARE_EVENT_CLASS(/* tid_write_sender */ + hfi1_tid_write_sender_template, + TP_PROTO(struct rvt_qp *qp, char newreq), + TP_ARGS(qp, newreq), + TP_STRUCT__entry(/* entry */ + DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device)) + __field(u32, qpn) + __field(char, newreq) + __field(u32, s_tid_cur) + __field(u32, s_tid_tail) + __field(u32, s_tid_head) + __field(u32, pending_tid_w_resp) + __field(u32, n_requests) + __field(u32, n_tid_requests) + __field(u32, s_flags) + __field(u32, ps_flags) + __field(unsigned long, iow_flags) + __field(u8, s_state) + __field(u8, s_retry) + ), + TP_fast_assign(/* assign */ + struct hfi1_qp_priv *priv = qp->priv; + + DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device)); + __entry->qpn = qp->ibqp.qp_num; + __entry->newreq = newreq; + __entry->s_tid_cur = priv->s_tid_cur; + __entry->s_tid_tail = priv->s_tid_tail; + __entry->s_tid_head = priv->s_tid_head; + __entry->pending_tid_w_resp = priv->pending_tid_w_resp; + __entry->n_requests = atomic_read(&priv->n_requests); + __entry->n_tid_requests = atomic_read(&priv->n_tid_requests); + __entry->s_flags = qp->s_flags; + __entry->ps_flags = priv->s_flags; + __entry->iow_flags = priv->s_iowait.flags; + __entry->s_state = priv->s_state; + __entry->s_retry = priv->s_retry; + ), + TP_printk(/* print */ + TID_WRITE_SENDER_PRN, + __get_str(dev), + __entry->qpn, + __entry->newreq, + __entry->s_tid_cur, + __entry->s_tid_tail, + __entry->s_tid_head, + __entry->pending_tid_w_resp, + __entry->n_requests, + __entry->n_tid_requests, + __entry->s_flags, + __entry->ps_flags, + __entry->iow_flags, + __entry->s_state, + __entry->s_retry + ) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_write_sender_template, hfi1_tid_write_sender_rcv_resp, + TP_PROTO(struct rvt_qp *qp, char newreq), + TP_ARGS(qp, newreq) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_write_sender_template, hfi1_tid_write_sender_rcv_tid_ack, + TP_PROTO(struct rvt_qp *qp, char newreq), + TP_ARGS(qp, newreq) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_write_sender_template, hfi1_tid_write_sender_retry_timeout, + TP_PROTO(struct rvt_qp *qp, char newreq), + TP_ARGS(qp, newreq) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_write_sender_template, hfi1_tid_write_sender_make_tid_pkt, + TP_PROTO(struct rvt_qp *qp, char newreq), + TP_ARGS(qp, newreq) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_write_sender_template, hfi1_tid_write_sender_make_req, + TP_PROTO(struct rvt_qp *qp, char newreq), + TP_ARGS(qp, newreq) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_write_sender_template, hfi1_tid_write_sender_restart_rc, + TP_PROTO(struct rvt_qp *qp, char newreq), + TP_ARGS(qp, newreq) +); + +DECLARE_EVENT_CLASS(/* tid_ack */ + hfi1_tid_ack_template, + TP_PROTO(struct rvt_qp *qp, u32 aeth, u32 psn, + u32 req_psn, u32 resync_psn), + TP_ARGS(qp, aeth, psn, req_psn, resync_psn), + TP_STRUCT__entry(/* entry */ + DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device)) + __field(u32, qpn) + __field(u32, aeth) + __field(u32, psn) + __field(u32, req_psn) + __field(u32, resync_psn) + ), + TP_fast_assign(/* assign */ + DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device)) + __entry->qpn = qp->ibqp.qp_num; + __entry->aeth = aeth; + __entry->psn = psn; + __entry->req_psn = req_psn; + __entry->resync_psn = resync_psn; + ), + TP_printk(/* print */ + "[%s] qpn 0x%x aeth 0x%x psn 0x%x req_psn 0x%x resync_psn 0x%x", + __get_str(dev), + __entry->qpn, + __entry->aeth, + __entry->psn, + __entry->req_psn, + __entry->resync_psn + ) +); + +DEFINE_EVENT(/* rcv_tid_ack */ + hfi1_tid_ack_template, hfi1_rcv_tid_ack, + TP_PROTO(struct rvt_qp *qp, u32 aeth, u32 psn, + u32 req_psn, u32 resync_psn), + TP_ARGS(qp, aeth, psn, req_psn, resync_psn) +); + +DECLARE_EVENT_CLASS(/* kdeth_eflags_error */ + hfi1_kdeth_eflags_error_template, + TP_PROTO(struct rvt_qp *qp, u8 rcv_type, u8 rte, u32 psn), + TP_ARGS(qp, rcv_type, rte, psn), + TP_STRUCT__entry(/* entry */ + DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device)) + __field(u32, qpn) + __field(u8, rcv_type) + __field(u8, rte) + __field(u32, psn) + ), + TP_fast_assign(/* assign */ + DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device)); + __entry->qpn = qp->ibqp.qp_num; + __entry->rcv_type = rcv_type; + __entry->rte = rte; + __entry->psn = psn; + ), + TP_printk(/* print */ + KDETH_EFLAGS_ERR_PRN, + __get_str(dev), + __entry->qpn, + __entry->rcv_type, + __entry->rte, + __entry->psn + ) +); + +DEFINE_EVENT(/* event */ + hfi1_kdeth_eflags_error_template, hfi1_eflags_err_write, + TP_PROTO(struct rvt_qp *qp, u8 rcv_type, u8 rte, u32 psn), + TP_ARGS(qp, rcv_type, rte, psn) +); + #endif /* __HFI1_TRACE_TID_H */ #undef TRACE_INCLUDE_PATH diff --git a/drivers/infiniband/hw/hfi1/trace_tx.h b/drivers/infiniband/hw/hfi1/trace_tx.h index 37dbb3e599c3..09eb0c9ada00 100644 --- a/drivers/infiniband/hw/hfi1/trace_tx.h +++ b/drivers/infiniband/hw/hfi1/trace_tx.h @@ -846,6 +846,12 @@ DEFINE_EVENT( TP_ARGS(qp, flag) ); +DEFINE_EVENT(/* event */ + hfi1_do_send_template, hfi1_rc_do_tid_send, + TP_PROTO(struct rvt_qp *qp, bool flag), + TP_ARGS(qp, flag) +); + DEFINE_EVENT( hfi1_do_send_template, hfi1_rc_expired_time_slice, TP_PROTO(struct rvt_qp *qp, bool flag), -- cgit v1.2.3-59-g8ed1b From 34025fb0c4c9d6b2e294f8f8f0a82491a13c83a2 Mon Sep 17 00:00:00 2001 From: Kaike Wan Date: Wed, 23 Jan 2019 21:52:19 -0800 Subject: IB/hfi1: Prioritize the sending of ACK packets ACK packets are generally associated with request completion and resource release and therefore should be sent first. This patch optimizes the send engine by using the following policies: (1) QPs with RVT_S_ACK_PENDING bit set in qp->s_flags or qpriv->s_flags should have their priority incremented; (2) QPs with ACK or TID-ACK packet queued should have their priority incremented; (3) When a QP is queued to the wait list due to resource constraints, it will be queued to the head if it has ACK packet to send; (4) When selecting qps to run from the wait list, the one with the highest priority and starve_cnt will be selected; each priority will be equivalent to a fixed number of starve_cnt (16). Reviewed-by: Mitko Haralanov Signed-off-by: Mike Marciniszyn Signed-off-by: Kaike Wan Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/iowait.c | 34 ++++++++++++- drivers/infiniband/hw/hfi1/iowait.h | 87 +++++++++++++++++++++----------- drivers/infiniband/hw/hfi1/pio.c | 18 ++++--- drivers/infiniband/hw/hfi1/qp.c | 15 +++++- drivers/infiniband/hw/hfi1/rc.c | 1 + drivers/infiniband/hw/hfi1/sdma.c | 24 +++++---- drivers/infiniband/hw/hfi1/sdma_txreq.h | 1 + drivers/infiniband/hw/hfi1/tid_rdma.c | 1 + drivers/infiniband/hw/hfi1/user_sdma.c | 6 ++- drivers/infiniband/hw/hfi1/verbs.c | 1 + drivers/infiniband/hw/hfi1/verbs_txreq.h | 1 + drivers/infiniband/hw/hfi1/vnic_sdma.c | 6 ++- 12 files changed, 144 insertions(+), 51 deletions(-) (limited to 'drivers/infiniband/hw/hfi1/rc.c') diff --git a/drivers/infiniband/hw/hfi1/iowait.c b/drivers/infiniband/hw/hfi1/iowait.c index 582f1ba136ff..adb4a1ba921b 100644 --- a/drivers/infiniband/hw/hfi1/iowait.c +++ b/drivers/infiniband/hw/hfi1/iowait.c @@ -6,6 +6,9 @@ #include "iowait.h" #include "trace_iowait.h" +/* 1 priority == 16 starve_cnt */ +#define IOWAIT_PRIORITY_STARVE_SHIFT 4 + void iowait_set_flag(struct iowait *wait, u32 flag) { trace_hfi1_iowait_set(wait, flag); @@ -44,7 +47,8 @@ void iowait_init(struct iowait *wait, u32 tx_limit, uint seq, bool pkts_sent), void (*wakeup)(struct iowait *wait, int reason), - void (*sdma_drained)(struct iowait *wait)) + void (*sdma_drained)(struct iowait *wait), + void (*init_priority)(struct iowait *wait)) { int i; @@ -58,6 +62,7 @@ void iowait_init(struct iowait *wait, u32 tx_limit, wait->sleep = sleep; wait->wakeup = wakeup; wait->sdma_drained = sdma_drained; + wait->init_priority = init_priority; wait->flags = 0; for (i = 0; i < IOWAIT_SES; i++) { wait->wait[i].iow = wait; @@ -92,3 +97,30 @@ int iowait_set_work_flag(struct iowait_work *w) iowait_set_flag(w->iow, IOWAIT_PENDING_TID); return IOWAIT_TID_SE; } + +/** + * iowait_priority_update_top - update the top priority entry + * @w: the iowait struct + * @top: a pointer to the top priority entry + * @idx: the index of the current iowait in an array + * @top_idx: the array index for the iowait entry that has the top priority + * + * This function is called to compare the priority of a given + * iowait with the given top priority entry. The top index will + * be returned. + */ +uint iowait_priority_update_top(struct iowait *w, + struct iowait *top, + uint idx, uint top_idx) +{ + u8 cnt, tcnt; + + /* Convert priority into starve_cnt and compare the total.*/ + cnt = (w->priority << IOWAIT_PRIORITY_STARVE_SHIFT) + w->starved_cnt; + tcnt = (top->priority << IOWAIT_PRIORITY_STARVE_SHIFT) + + top->starved_cnt; + if (cnt > tcnt) + return idx; + else + return top_idx; +} diff --git a/drivers/infiniband/hw/hfi1/iowait.h b/drivers/infiniband/hw/hfi1/iowait.h index bd913701761d..07847cb72169 100644 --- a/drivers/infiniband/hw/hfi1/iowait.h +++ b/drivers/infiniband/hw/hfi1/iowait.h @@ -100,6 +100,7 @@ struct iowait_work { * @sleep: no space callback * @wakeup: space callback wakeup * @sdma_drained: sdma count drained + * @init_priority: callback to manipulate priority * @lock: lock protected head of wait queue * @iowork: workqueue overhead * @wait_dma: wait for sdma_busy == 0 @@ -109,7 +110,7 @@ struct iowait_work { * @tx_limit: limit for overflow queuing * @tx_count: number of tx entry's in tx_head'ed list * @flags: wait flags (one per QP) - * @wait: SE array + * @wait: SE array for multiple legs * * This is to be embedded in user's state structure * (QP or PQ). @@ -120,10 +121,13 @@ struct iowait_work { * are callbacks for the ULP to implement * what ever queuing/dequeuing of * the embedded iowait and its containing struct - * when a resource shortage like SDMA ring space is seen. + * when a resource shortage like SDMA ring space + * or PIO credit space is seen. * * Both potentially have locks help - * so sleeping is not allowed. + * so sleeping is not allowed and it is not + * supported to submit txreqs from the wakeup + * call directly because of lock conflicts. * * The wait_dma member along with the iow * @@ -143,6 +147,7 @@ struct iowait { ); void (*wakeup)(struct iowait *wait, int reason); void (*sdma_drained)(struct iowait *wait); + void (*init_priority)(struct iowait *wait); seqlock_t *lock; wait_queue_head_t wait_dma; wait_queue_head_t wait_pio; @@ -152,6 +157,7 @@ struct iowait { u32 tx_limit; u32 tx_count; u8 starved_cnt; + u8 priority; unsigned long flags; struct iowait_work wait[IOWAIT_SES]; }; @@ -171,7 +177,8 @@ void iowait_init(struct iowait *wait, u32 tx_limit, uint seq, bool pkts_sent), void (*wakeup)(struct iowait *wait, int reason), - void (*sdma_drained)(struct iowait *wait)); + void (*sdma_drained)(struct iowait *wait), + void (*init_priority)(struct iowait *wait)); /** * iowait_schedule() - schedule the default send engine work @@ -339,6 +346,8 @@ static inline u16 iowait_get_desc(struct iowait_work *w) tx = list_first_entry(&w->tx_head, struct sdma_txreq, list); num_desc = tx->num_desc; + if (tx->flags & SDMA_TXREQ_F_VIP) + w->iow->priority++; } return num_desc; } @@ -352,6 +361,37 @@ static inline u32 iowait_get_all_desc(struct iowait *w) return num_desc; } +static inline void iowait_update_priority(struct iowait_work *w) +{ + struct sdma_txreq *tx = NULL; + + if (!list_empty(&w->tx_head)) { + tx = list_first_entry(&w->tx_head, struct sdma_txreq, + list); + if (tx->flags & SDMA_TXREQ_F_VIP) + w->iow->priority++; + } +} + +static inline void iowait_update_all_priority(struct iowait *w) +{ + iowait_update_priority(&w->wait[IOWAIT_IB_SE]); + iowait_update_priority(&w->wait[IOWAIT_TID_SE]); +} + +static inline void iowait_init_priority(struct iowait *w) +{ + w->priority = 0; + if (w->init_priority) + w->init_priority(w); +} + +static inline void iowait_get_priority(struct iowait *w) +{ + iowait_init_priority(w); + iowait_update_all_priority(w); +} + /** * iowait_queue - Put the iowait on a wait queue * @pkts_sent: have some packets been sent before queuing? @@ -368,14 +408,18 @@ static inline void iowait_queue(bool pkts_sent, struct iowait *w, /* * To play fair, insert the iowait at the tail of the wait queue if it * has already sent some packets; Otherwise, put it at the head. + * However, if it has priority packets to send, also put it at the + * head. */ - if (pkts_sent) { - list_add_tail(&w->list, wait_head); + if (pkts_sent) w->starved_cnt = 0; - } else { - list_add(&w->list, wait_head); + else w->starved_cnt++; - } + + if (w->priority > 0 || !pkts_sent) + list_add(&w->list, wait_head); + else + list_add_tail(&w->list, wait_head); } /** @@ -392,27 +436,10 @@ static inline void iowait_starve_clear(bool pkts_sent, struct iowait *w) w->starved_cnt = 0; } -/** - * iowait_starve_find_max - Find the maximum of the starve count - * @w: the iowait struct - * @max: a variable containing the max starve count - * @idx: the index of the current iowait in an array - * @max_idx: a variable containing the array index for the - * iowait entry that has the max starve count - * - * This function is called to compare the starve count of a - * given iowait with the given max starve count. The max starve - * count and the index will be updated if the iowait's start - * count is larger. - */ -static inline void iowait_starve_find_max(struct iowait *w, u8 *max, - uint idx, uint *max_idx) -{ - if (w->starved_cnt > *max) { - *max = w->starved_cnt; - *max_idx = idx; - } -} +/* Update the top priority index */ +uint iowait_priority_update_top(struct iowait *w, + struct iowait *top, + uint idx, uint top_idx); /** * iowait_packet_queued() - determine if a packet is queued diff --git a/drivers/infiniband/hw/hfi1/pio.c b/drivers/infiniband/hw/hfi1/pio.c index 04126d7e318d..a1de566fe95e 100644 --- a/drivers/infiniband/hw/hfi1/pio.c +++ b/drivers/infiniband/hw/hfi1/pio.c @@ -1599,8 +1599,7 @@ static void sc_piobufavail(struct send_context *sc) struct rvt_qp *qp; struct hfi1_qp_priv *priv; unsigned long flags; - uint i, n = 0, max_idx = 0; - u8 max_starved_cnt = 0; + uint i, n = 0, top_idx = 0; if (dd->send_contexts[sc->sw_index].type != SC_KERNEL && dd->send_contexts[sc->sw_index].type != SC_VL15) @@ -1619,11 +1618,18 @@ static void sc_piobufavail(struct send_context *sc) if (n == ARRAY_SIZE(qps)) break; wait = list_first_entry(list, struct iowait, list); + iowait_get_priority(wait); qp = iowait_to_qp(wait); priv = qp->priv; list_del_init(&priv->s_iowait.list); priv->s_iowait.lock = NULL; - iowait_starve_find_max(wait, &max_starved_cnt, n, &max_idx); + if (n) { + priv = qps[top_idx]->priv; + top_idx = iowait_priority_update_top(wait, + &priv->s_iowait, + n, top_idx); + } + /* refcount held until actual wake up */ qps[n++] = qp; } @@ -1638,12 +1644,12 @@ static void sc_piobufavail(struct send_context *sc) } write_sequnlock_irqrestore(&sc->waitlock, flags); - /* Wake up the most starved one first */ + /* Wake up the top-priority one first */ if (n) - hfi1_qp_wakeup(qps[max_idx], + hfi1_qp_wakeup(qps[top_idx], RVT_S_WAIT_PIO | HFI1_S_WAIT_PIO_DRAIN); for (i = 0; i < n; i++) - if (i != max_idx) + if (i != top_idx) hfi1_qp_wakeup(qps[i], RVT_S_WAIT_PIO | HFI1_S_WAIT_PIO_DRAIN); } diff --git a/drivers/infiniband/hw/hfi1/qp.c b/drivers/infiniband/hw/hfi1/qp.c index cfd598e4b303..d8f7add935df 100644 --- a/drivers/infiniband/hw/hfi1/qp.c +++ b/drivers/infiniband/hw/hfi1/qp.c @@ -518,6 +518,7 @@ static int iowait_sleep( ibp->rvp.n_dmawait++; qp->s_flags |= RVT_S_WAIT_DMA_DESC; + iowait_get_priority(&priv->s_iowait); iowait_queue(pkts_sent, &priv->s_iowait, &sde->dmawait); priv->s_iowait.lock = &sde->waitlock; @@ -567,6 +568,17 @@ static void iowait_sdma_drained(struct iowait *wait) spin_unlock_irqrestore(&qp->s_lock, flags); } +static void hfi1_init_priority(struct iowait *w) +{ + struct rvt_qp *qp = iowait_to_qp(w); + struct hfi1_qp_priv *priv = qp->priv; + + if (qp->s_flags & RVT_S_ACK_PENDING) + w->priority++; + if (priv->s_flags & RVT_S_ACK_PENDING) + w->priority++; +} + /** * qp_to_sdma_engine - map a qp to a send engine * @qp: the QP @@ -727,7 +739,8 @@ void *qp_priv_alloc(struct rvt_dev_info *rdi, struct rvt_qp *qp) _hfi1_do_tid_send, iowait_sleep, iowait_wakeup, - iowait_sdma_drained); + iowait_sdma_drained, + hfi1_init_priority); return priv; } diff --git a/drivers/infiniband/hw/hfi1/rc.c b/drivers/infiniband/hw/hfi1/rc.c index 82afa7736be7..e6726c1ab866 100644 --- a/drivers/infiniband/hw/hfi1/rc.c +++ b/drivers/infiniband/hw/hfi1/rc.c @@ -390,6 +390,7 @@ normal_no_state: bth0 = OP(ACKNOWLEDGE) << 24; bth2 = mask_psn(qp->s_ack_psn); qp->s_flags &= ~RVT_S_ACK_PENDING; + ps->s_txreq->txreq.flags |= SDMA_TXREQ_F_VIP; ps->s_txreq->ss = NULL; } qp->s_rdma_ack_cnt++; diff --git a/drivers/infiniband/hw/hfi1/sdma.c b/drivers/infiniband/hw/hfi1/sdma.c index 96897a91fb0a..b0110728f541 100644 --- a/drivers/infiniband/hw/hfi1/sdma.c +++ b/drivers/infiniband/hw/hfi1/sdma.c @@ -1747,10 +1747,9 @@ retry: */ static void sdma_desc_avail(struct sdma_engine *sde, uint avail) { - struct iowait *wait, *nw; + struct iowait *wait, *nw, *twait; struct iowait *waits[SDMA_WAIT_BATCH_SIZE]; - uint i, n = 0, seq, max_idx = 0; - u8 max_starved_cnt = 0; + uint i, n = 0, seq, tidx = 0; #ifdef CONFIG_SDMA_VERBOSITY dd_dev_err(sde->dd, "CONFIG SDMA(%u) %s:%d %s()\n", sde->this_idx, @@ -1775,13 +1774,20 @@ static void sdma_desc_avail(struct sdma_engine *sde, uint avail) continue; if (n == ARRAY_SIZE(waits)) break; + iowait_init_priority(wait); num_desc = iowait_get_all_desc(wait); if (num_desc > avail) break; avail -= num_desc; - /* Find the most starved wait memeber */ - iowait_starve_find_max(wait, &max_starved_cnt, - n, &max_idx); + /* Find the top-priority wait memeber */ + if (n) { + twait = waits[tidx]; + tidx = + iowait_priority_update_top(wait, + twait, + n, + tidx); + } list_del_init(&wait->list); waits[n++] = wait; } @@ -1790,12 +1796,12 @@ static void sdma_desc_avail(struct sdma_engine *sde, uint avail) } } while (read_seqretry(&sde->waitlock, seq)); - /* Schedule the most starved one first */ + /* Schedule the top-priority entry first */ if (n) - waits[max_idx]->wakeup(waits[max_idx], SDMA_AVAIL_REASON); + waits[tidx]->wakeup(waits[tidx], SDMA_AVAIL_REASON); for (i = 0; i < n; i++) - if (i != max_idx) + if (i != tidx) waits[i]->wakeup(waits[i], SDMA_AVAIL_REASON); } diff --git a/drivers/infiniband/hw/hfi1/sdma_txreq.h b/drivers/infiniband/hw/hfi1/sdma_txreq.h index bf7d777d756e..514a4784566b 100644 --- a/drivers/infiniband/hw/hfi1/sdma_txreq.h +++ b/drivers/infiniband/hw/hfi1/sdma_txreq.h @@ -91,6 +91,7 @@ struct sdma_desc { #define SDMA_TXREQ_F_URGENT 0x0001 #define SDMA_TXREQ_F_AHG_COPY 0x0002 #define SDMA_TXREQ_F_USE_AHG 0x0004 +#define SDMA_TXREQ_F_VIP 0x0010 struct sdma_txreq; typedef void (*callback_t)(struct sdma_txreq *, int); diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.c b/drivers/infiniband/hw/hfi1/tid_rdma.c index a49eb3d9b5b9..bc2ff83026f7 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.c +++ b/drivers/infiniband/hw/hfi1/tid_rdma.c @@ -5296,6 +5296,7 @@ static int make_tid_rdma_ack(struct rvt_qp *qp, ps->s_txreq->ss = NULL; hfi1_make_ruc_header(qp, ohdr, (TID_OP(ACK) << 24), bth1, bth2, middle, ps); + ps->s_txreq->txreq.flags |= SDMA_TXREQ_F_VIP; return 1; bail: /* diff --git a/drivers/infiniband/hw/hfi1/user_sdma.c b/drivers/infiniband/hw/hfi1/user_sdma.c index 6764114b886c..8bfbc6d7ea34 100644 --- a/drivers/infiniband/hw/hfi1/user_sdma.c +++ b/drivers/infiniband/hw/hfi1/user_sdma.c @@ -144,8 +144,10 @@ static int defer_packet_queue( */ xchg(&pq->state, SDMA_PKT_Q_DEFERRED); write_seqlock(&sde->waitlock); - if (list_empty(&pq->busy.list)) + if (list_empty(&pq->busy.list)) { + iowait_get_priority(&pq->busy); iowait_queue(pkts_sent, &pq->busy, &sde->dmawait); + } write_sequnlock(&sde->waitlock); return -EBUSY; eagain: @@ -191,7 +193,7 @@ int hfi1_user_sdma_alloc_queues(struct hfi1_ctxtdata *uctxt, pq->mm = fd->mm; iowait_init(&pq->busy, 0, NULL, NULL, defer_packet_queue, - activate_packet_queue, NULL); + activate_packet_queue, NULL, NULL); pq->reqidx = 0; pq->reqs = kcalloc(hfi1_sdma_comp_ring_size, diff --git a/drivers/infiniband/hw/hfi1/verbs.c b/drivers/infiniband/hw/hfi1/verbs.c index ab97d71cdd92..55a56b3d7f83 100644 --- a/drivers/infiniband/hw/hfi1/verbs.c +++ b/drivers/infiniband/hw/hfi1/verbs.c @@ -945,6 +945,7 @@ static int pio_wait(struct rvt_qp *qp, dev->n_piodrain += !!(flag & HFI1_S_WAIT_PIO_DRAIN); qp->s_flags |= flag; was_empty = list_empty(&sc->piowait); + iowait_get_priority(&priv->s_iowait); iowait_queue(ps->pkts_sent, &priv->s_iowait, &sc->piowait); priv->s_iowait.lock = &sc->waitlock; diff --git a/drivers/infiniband/hw/hfi1/verbs_txreq.h b/drivers/infiniband/hw/hfi1/verbs_txreq.h index 2a77af26a231..b002e96eb335 100644 --- a/drivers/infiniband/hw/hfi1/verbs_txreq.h +++ b/drivers/infiniband/hw/hfi1/verbs_txreq.h @@ -94,6 +94,7 @@ static inline struct verbs_txreq *get_txreq(struct hfi1_ibdev *dev, tx->txreq.num_desc = 0; /* Set the header type */ tx->phdr.hdr.hdr_type = priv->hdr_type; + tx->txreq.flags = 0; return tx; } diff --git a/drivers/infiniband/hw/hfi1/vnic_sdma.c b/drivers/infiniband/hw/hfi1/vnic_sdma.c index 1f81c480e028..af1b1ffcb38e 100644 --- a/drivers/infiniband/hw/hfi1/vnic_sdma.c +++ b/drivers/infiniband/hw/hfi1/vnic_sdma.c @@ -240,8 +240,10 @@ static int hfi1_vnic_sdma_sleep(struct sdma_engine *sde, } vnic_sdma->state = HFI1_VNIC_SDMA_Q_DEFERRED; - if (list_empty(&vnic_sdma->wait.list)) + if (list_empty(&vnic_sdma->wait.list)) { + iowait_get_priority(wait->iow); iowait_queue(pkts_sent, wait->iow, &sde->dmawait); + } write_sequnlock(&sde->waitlock); return -EBUSY; } @@ -281,7 +283,7 @@ void hfi1_vnic_sdma_init(struct hfi1_vnic_vport_info *vinfo) iowait_init(&vnic_sdma->wait, 0, NULL, NULL, hfi1_vnic_sdma_sleep, - hfi1_vnic_sdma_wakeup, NULL); + hfi1_vnic_sdma_wakeup, NULL, NULL); vnic_sdma->sde = &vinfo->dd->per_sdma[i]; vnic_sdma->dd = vinfo->dd; vnic_sdma->vinfo = vinfo; -- cgit v1.2.3-59-g8ed1b