diff options
author | Jason Gunthorpe <jgg@mellanox.com> | 2019-04-03 15:28:05 -0300 |
---|---|---|
committer | Jason Gunthorpe <jgg@mellanox.com> | 2019-04-03 15:28:05 -0300 |
commit | 1c726c44210f8d2185cf61adfea850d1b87e75ab (patch) | |
tree | e61c0a41244d18a693f78103dc01f05f8ed45c8e /drivers | |
parent | RDMA/cma: Set proper port number as index (diff) | |
parent | IB/hfi1: Implement CCA for TID RDMA protocol (diff) | |
download | linux-dev-1c726c44210f8d2185cf61adfea850d1b87e75ab.tar.xz linux-dev-1c726c44210f8d2185cf61adfea850d1b87e75ab.zip |
Merge HFI1 updates into k.o/for-next
Based on rdma.git for-rc for dependencies.
From Dennis Dalessandro:
====================
Here are some code improvement patches and fixes for less serious bugs to
TID RDMA than we sent for RC.
====================
* HFI1 updates:
IB/hfi1: Implement CCA for TID RDMA protocol
IB/hfi1: Remove WARN_ON when freeing expected receive groups
IB/hfi1: Unify the software PSN check for TID RDMA READ/WRITE
IB/hfi1: Add a function to read next expected psn from hardware flow
IB/hfi1: Delay the release of destination mr for TID RDMA WRITE DATA
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
Diffstat (limited to 'drivers')
-rw-r--r-- | drivers/infiniband/hw/hfi1/chip.c | 55 | ||||
-rw-r--r-- | drivers/infiniband/hw/hfi1/driver.c | 4 | ||||
-rw-r--r-- | drivers/infiniband/hw/hfi1/exp_rcv.c | 3 | ||||
-rw-r--r-- | drivers/infiniband/hw/hfi1/qp.c | 4 | ||||
-rw-r--r-- | drivers/infiniband/hw/hfi1/rc.c | 23 | ||||
-rw-r--r-- | drivers/infiniband/hw/hfi1/rc.h | 8 | ||||
-rw-r--r-- | drivers/infiniband/hw/hfi1/tid_rdma.c | 272 | ||||
-rw-r--r-- | drivers/infiniband/hw/hfi1/tid_rdma.h | 2 | ||||
-rw-r--r-- | drivers/infiniband/hw/hfi1/trace_tid.h | 12 | ||||
-rw-r--r-- | drivers/infiniband/hw/mlx5/odp.c | 3 |
10 files changed, 255 insertions, 131 deletions
diff --git a/drivers/infiniband/hw/hfi1/chip.c b/drivers/infiniband/hw/hfi1/chip.c index 6150567c0b51..229d5d4cafe8 100644 --- a/drivers/infiniband/hw/hfi1/chip.c +++ b/drivers/infiniband/hw/hfi1/chip.c @@ -13232,7 +13232,7 @@ static int set_up_context_variables(struct hfi1_devdata *dd) int total_contexts; int ret; unsigned ngroups; - int qos_rmt_count; + int rmt_count; int user_rmt_reduced; u32 n_usr_ctxts; u32 send_contexts = chip_send_contexts(dd); @@ -13294,10 +13294,23 @@ static int set_up_context_variables(struct hfi1_devdata *dd) n_usr_ctxts = rcv_contexts - total_contexts; } - /* each user context requires an entry in the RMT */ - qos_rmt_count = qos_rmt_entries(dd, NULL, NULL); - if (qos_rmt_count + n_usr_ctxts > NUM_MAP_ENTRIES) { - user_rmt_reduced = NUM_MAP_ENTRIES - qos_rmt_count; + /* + * The RMT entries are currently allocated as shown below: + * 1. QOS (0 to 128 entries); + * 2. FECN (num_kernel_context - 1 + num_user_contexts + + * num_vnic_contexts); + * 3. VNIC (num_vnic_contexts). + * It should be noted that FECN oversubscribe num_vnic_contexts + * entries of RMT because both VNIC and PSM could allocate any receive + * context between dd->first_dyn_alloc_text and dd->num_rcv_contexts, + * and PSM FECN must reserve an RMT entry for each possible PSM receive + * context. + */ + rmt_count = qos_rmt_entries(dd, NULL, NULL) + (num_vnic_contexts * 2); + if (HFI1_CAP_IS_KSET(TID_RDMA)) + rmt_count += num_kernel_contexts - 1; + if (rmt_count + n_usr_ctxts > NUM_MAP_ENTRIES) { + user_rmt_reduced = NUM_MAP_ENTRIES - rmt_count; dd_dev_err(dd, "RMT size is reducing the number of user receive contexts from %u to %d\n", n_usr_ctxts, @@ -14278,35 +14291,43 @@ bail: init_qpmap_table(dd, FIRST_KERNEL_KCTXT, dd->n_krcv_queues - 1); } -static void init_user_fecn_handling(struct hfi1_devdata *dd, - struct rsm_map_table *rmt) +static void init_fecn_handling(struct hfi1_devdata *dd, + struct rsm_map_table *rmt) { struct rsm_rule_data rrd; u64 reg; - int i, idx, regoff, regidx; + int i, idx, regoff, regidx, start; u8 offset; + u32 total_cnt; + + if (HFI1_CAP_IS_KSET(TID_RDMA)) + /* Exclude context 0 */ + start = 1; + else + start = dd->first_dyn_alloc_ctxt; + + total_cnt = dd->num_rcv_contexts - start; /* there needs to be enough room in the map table */ - if (rmt->used + dd->num_user_contexts >= NUM_MAP_ENTRIES) { - dd_dev_err(dd, "User FECN handling disabled - too many user contexts allocated\n"); + if (rmt->used + total_cnt >= NUM_MAP_ENTRIES) { + dd_dev_err(dd, "FECN handling disabled - too many contexts allocated\n"); return; } /* * RSM will extract the destination context as an index into the * map table. The destination contexts are a sequential block - * in the range first_dyn_alloc_ctxt...num_rcv_contexts-1 (inclusive). + * in the range start...num_rcv_contexts-1 (inclusive). * Map entries are accessed as offset + extracted value. Adjust * the added offset so this sequence can be placed anywhere in * the table - as long as the entries themselves do not wrap. * There are only enough bits in offset for the table size, so * start with that to allow for a "negative" offset. */ - offset = (u8)(NUM_MAP_ENTRIES + (int)rmt->used - - (int)dd->first_dyn_alloc_ctxt); + offset = (u8)(NUM_MAP_ENTRIES + rmt->used - start); - for (i = dd->first_dyn_alloc_ctxt, idx = rmt->used; - i < dd->num_rcv_contexts; i++, idx++) { + for (i = start, idx = rmt->used; i < dd->num_rcv_contexts; + i++, idx++) { /* replace with identity mapping */ regoff = (idx % 8) * 8; regidx = idx / 8; @@ -14341,7 +14362,7 @@ static void init_user_fecn_handling(struct hfi1_devdata *dd, /* add rule 1 */ add_rsm_rule(dd, RSM_INS_FECN, &rrd); - rmt->used += dd->num_user_contexts; + rmt->used += total_cnt; } /* Initialize RSM for VNIC */ @@ -14428,7 +14449,7 @@ static void init_rxe(struct hfi1_devdata *dd) rmt = alloc_rsm_map_table(dd); /* set up QOS, including the QPN map table */ init_qos(dd, rmt); - init_user_fecn_handling(dd, rmt); + init_fecn_handling(dd, rmt); complete_rsm_map_table(dd, rmt); /* record number of used rsm map entries for vnic */ dd->vnic.rmt_start = rmt->used; diff --git a/drivers/infiniband/hw/hfi1/driver.c b/drivers/infiniband/hw/hfi1/driver.c index 867b4e10018f..129e48ec9ee0 100644 --- a/drivers/infiniband/hw/hfi1/driver.c +++ b/drivers/infiniband/hw/hfi1/driver.c @@ -514,7 +514,9 @@ bool hfi1_process_ecn_slowpath(struct rvt_qp *qp, struct hfi1_packet *pkt, */ do_cnp = prescan || (opcode >= IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST && - opcode <= IB_OPCODE_RC_ATOMIC_ACKNOWLEDGE); + opcode <= IB_OPCODE_RC_ATOMIC_ACKNOWLEDGE) || + opcode == TID_OP(READ_RESP) || + opcode == TID_OP(ACK); /* Call appropriate CNP handler */ if (!ignore_fecn && do_cnp && fecn) diff --git a/drivers/infiniband/hw/hfi1/exp_rcv.c b/drivers/infiniband/hw/hfi1/exp_rcv.c index 1be49a0d9c11..e9d5cc8b771a 100644 --- a/drivers/infiniband/hw/hfi1/exp_rcv.c +++ b/drivers/infiniband/hw/hfi1/exp_rcv.c @@ -112,9 +112,6 @@ int hfi1_alloc_ctxt_rcv_groups(struct hfi1_ctxtdata *rcd) */ void hfi1_free_ctxt_rcv_groups(struct hfi1_ctxtdata *rcd) { - WARN_ON(!EXP_TID_SET_EMPTY(rcd->tid_full_list)); - WARN_ON(!EXP_TID_SET_EMPTY(rcd->tid_used_list)); - kfree(rcd->groups); rcd->groups = NULL; hfi1_exp_tid_group_init(rcd); diff --git a/drivers/infiniband/hw/hfi1/qp.c b/drivers/infiniband/hw/hfi1/qp.c index 1390172b488e..4e0e9fc0a777 100644 --- a/drivers/infiniband/hw/hfi1/qp.c +++ b/drivers/infiniband/hw/hfi1/qp.c @@ -900,7 +900,9 @@ void notify_error_qp(struct rvt_qp *qp) if (!list_empty(&priv->s_iowait.list) && !(qp->s_flags & RVT_S_BUSY) && !(priv->s_flags & RVT_S_BUSY)) { - qp->s_flags &= ~RVT_S_ANY_WAIT_IO; + qp->s_flags &= ~HFI1_S_ANY_WAIT_IO; + iowait_clear_flag(&priv->s_iowait, IOWAIT_PENDING_IB); + iowait_clear_flag(&priv->s_iowait, IOWAIT_PENDING_TID); list_del_init(&priv->s_iowait.list); priv->s_iowait.lock = NULL; rvt_put_qp(qp); diff --git a/drivers/infiniband/hw/hfi1/rc.c b/drivers/infiniband/hw/hfi1/rc.c index e6726c1ab866..5ba39a9f65ad 100644 --- a/drivers/infiniband/hw/hfi1/rc.c +++ b/drivers/infiniband/hw/hfi1/rc.c @@ -140,10 +140,7 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp, case OP(RDMA_READ_RESPONSE_LAST): case OP(RDMA_READ_RESPONSE_ONLY): e = &qp->s_ack_queue[qp->s_tail_ack_queue]; - if (e->rdma_sge.mr) { - rvt_put_mr(e->rdma_sge.mr); - e->rdma_sge.mr = NULL; - } + release_rdma_sge_mr(e); /* FALLTHROUGH */ case OP(ATOMIC_ACKNOWLEDGE): /* @@ -343,7 +340,8 @@ write_resp: break; e->sent = 1; - qp->s_ack_state = OP(RDMA_READ_RESPONSE_LAST); + /* Do not free e->rdma_sge until all data are received */ + qp->s_ack_state = OP(ATOMIC_ACKNOWLEDGE); break; case TID_OP(READ_RESP): @@ -2643,10 +2641,7 @@ static noinline int rc_rcv_error(struct ib_other_headers *ohdr, void *data, len = be32_to_cpu(reth->length); if (unlikely(offset + len != e->rdma_sge.sge_length)) goto unlock_done; - if (e->rdma_sge.mr) { - rvt_put_mr(e->rdma_sge.mr); - e->rdma_sge.mr = NULL; - } + release_rdma_sge_mr(e); if (len != 0) { u32 rkey = be32_to_cpu(reth->rkey); u64 vaddr = get_ib_reth_vaddr(reth); @@ -3088,10 +3083,7 @@ send_last: update_ack_queue(qp, next); } e = &qp->s_ack_queue[qp->r_head_ack_queue]; - if (e->opcode == OP(RDMA_READ_REQUEST) && e->rdma_sge.mr) { - rvt_put_mr(e->rdma_sge.mr); - e->rdma_sge.mr = NULL; - } + release_rdma_sge_mr(e); reth = &ohdr->u.rc.reth; len = be32_to_cpu(reth->length); if (len) { @@ -3166,10 +3158,7 @@ send_last: update_ack_queue(qp, next); } e = &qp->s_ack_queue[qp->r_head_ack_queue]; - if (e->opcode == OP(RDMA_READ_REQUEST) && e->rdma_sge.mr) { - rvt_put_mr(e->rdma_sge.mr); - e->rdma_sge.mr = NULL; - } + release_rdma_sge_mr(e); /* Process OPFN special virtual address */ if (opfn) { opfn_conn_response(qp, e, ateth); diff --git a/drivers/infiniband/hw/hfi1/rc.h b/drivers/infiniband/hw/hfi1/rc.h index 8e0935b9bf2a..5ed5e85d5841 100644 --- a/drivers/infiniband/hw/hfi1/rc.h +++ b/drivers/infiniband/hw/hfi1/rc.h @@ -41,6 +41,14 @@ static inline u32 restart_sge(struct rvt_sge_state *ss, struct rvt_swqe *wqe, return rvt_restart_sge(ss, wqe, len); } +static inline void release_rdma_sge_mr(struct rvt_ack_entry *e) +{ + if (e->rdma_sge.mr) { + rvt_put_mr(e->rdma_sge.mr); + e->rdma_sge.mr = NULL; + } +} + struct rvt_ack_entry *find_prev_entry(struct rvt_qp *qp, u32 psn, u8 *prev, u8 *prev_ack, bool *scheduled); int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode, u64 val, diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.c b/drivers/infiniband/hw/hfi1/tid_rdma.c index fdda33aca77f..eae6f05ca2fa 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.c +++ b/drivers/infiniband/hw/hfi1/tid_rdma.c @@ -67,8 +67,6 @@ static u32 mask_generation(u32 a) #define TID_RDMA_DESTQP_FLOW_SHIFT 11 #define TID_RDMA_DESTQP_FLOW_MASK 0x1f -#define TID_FLOW_SW_PSN BIT(0) - #define TID_OPFN_QP_CTXT_MASK 0xff #define TID_OPFN_QP_CTXT_SHIFT 56 #define TID_OPFN_QP_KDETH_MASK 0xff @@ -128,6 +126,15 @@ static int make_tid_rdma_ack(struct rvt_qp *qp, struct ib_other_headers *ohdr, struct hfi1_pkt_state *ps); static void hfi1_do_tid_send(struct rvt_qp *qp); +static u32 read_r_next_psn(struct hfi1_devdata *dd, u8 ctxt, u8 fidx); +static void tid_rdma_rcv_err(struct hfi1_packet *packet, + struct ib_other_headers *ohdr, + struct rvt_qp *qp, u32 psn, int diff, bool fecn); +static void update_r_next_psn_fecn(struct hfi1_packet *packet, + struct hfi1_qp_priv *priv, + struct hfi1_ctxtdata *rcd, + struct tid_rdma_flow *flow, + bool fecn); static u64 tid_rdma_opfn_encode(struct tid_rdma_params *p) { @@ -776,7 +783,6 @@ int hfi1_kern_setup_hw_flow(struct hfi1_ctxtdata *rcd, struct rvt_qp *qp) rcd->flows[fs->index].generation = fs->generation; fs->generation = kern_setup_hw_flow(rcd, fs->index); fs->psn = 0; - fs->flags = 0; dequeue_tid_waiter(rcd, &rcd->flow_queue, qp); /* get head before dropping lock */ fqp = first_qp(rcd, &rcd->flow_queue); @@ -1807,6 +1813,7 @@ sync_check: goto done; hfi1_kern_clear_hw_flow(req->rcd, qp); + qpriv->s_flags &= ~HFI1_R_TID_SW_PSN; req->state = TID_REQUEST_ACTIVE; } @@ -2036,10 +2043,7 @@ static int tid_rdma_rcv_error(struct hfi1_packet *packet, if (psn != e->psn || len != req->total_len) goto unlock; - if (e->rdma_sge.mr) { - rvt_put_mr(e->rdma_sge.mr); - e->rdma_sge.mr = NULL; - } + release_rdma_sge_mr(e); rkey = be32_to_cpu(reth->rkey); vaddr = get_ib_reth_vaddr(reth); @@ -2238,7 +2242,7 @@ void hfi1_rc_rcv_tid_rdma_read_req(struct hfi1_packet *packet) struct ib_reth *reth; struct hfi1_qp_priv *qpriv = qp->priv; u32 bth0, psn, len, rkey; - bool is_fecn; + bool fecn; u8 next; u64 vaddr; int diff; @@ -2248,7 +2252,7 @@ void hfi1_rc_rcv_tid_rdma_read_req(struct hfi1_packet *packet) if (hfi1_ruc_check_hdr(ibp, packet)) return; - is_fecn = process_ecn(qp, packet); + fecn = process_ecn(qp, packet); psn = mask_psn(be32_to_cpu(ohdr->bth[2])); trace_hfi1_rsp_rcv_tid_read_req(qp, psn); @@ -2267,9 +2271,8 @@ void hfi1_rc_rcv_tid_rdma_read_req(struct hfi1_packet *packet) diff = delta_psn(psn, qp->r_psn); if (unlikely(diff)) { - if (tid_rdma_rcv_error(packet, ohdr, qp, psn, diff)) - return; - goto send_ack; + tid_rdma_rcv_err(packet, ohdr, qp, psn, diff, fecn); + return; } /* We've verified the request, insert it into the ack queue. */ @@ -2285,10 +2288,7 @@ void hfi1_rc_rcv_tid_rdma_read_req(struct hfi1_packet *packet) update_ack_queue(qp, next); } e = &qp->s_ack_queue[qp->r_head_ack_queue]; - if (e->rdma_sge.mr) { - rvt_put_mr(e->rdma_sge.mr); - e->rdma_sge.mr = NULL; - } + release_rdma_sge_mr(e); rkey = be32_to_cpu(reth->rkey); qp->r_len = len; @@ -2324,11 +2324,11 @@ void hfi1_rc_rcv_tid_rdma_read_req(struct hfi1_packet *packet) /* Schedule the send tasklet. */ qp->s_flags |= RVT_S_RESP_PENDING; + if (fecn) + qp->s_flags |= RVT_S_ECN; hfi1_schedule_send(qp); spin_unlock_irqrestore(&qp->s_lock, flags); - if (is_fecn) - goto send_ack; return; nack_inv_unlock: @@ -2345,8 +2345,6 @@ nack_acc: rvt_rc_error(qp, IB_WC_LOC_PROT_ERR); qp->r_nak_state = IB_NAK_REMOTE_ACCESS_ERROR; qp->r_ack_psn = qp->r_psn; -send_ack: - hfi1_send_rc_ack(packet, is_fecn); } u32 hfi1_build_tid_rdma_read_resp(struct rvt_qp *qp, struct rvt_ack_entry *e, @@ -2463,12 +2461,12 @@ void hfi1_rc_rcv_tid_rdma_read_resp(struct hfi1_packet *packet) struct tid_rdma_request *req; struct tid_rdma_flow *flow; u32 opcode, aeth; - bool is_fecn; + bool fecn; unsigned long flags; u32 kpsn, ipsn; trace_hfi1_sender_rcv_tid_read_resp(qp); - is_fecn = process_ecn(qp, packet); + fecn = process_ecn(qp, packet); kpsn = mask_psn(be32_to_cpu(ohdr->bth[2])); aeth = be32_to_cpu(ohdr->u.tid_rdma.r_rsp.aeth); opcode = (be32_to_cpu(ohdr->bth[0]) >> 24) & 0xff; @@ -2481,8 +2479,43 @@ void hfi1_rc_rcv_tid_rdma_read_resp(struct hfi1_packet *packet) flow = &req->flows[req->clear_tail]; /* When header suppression is disabled */ - if (cmp_psn(ipsn, flow->flow_state.ib_lpsn)) + if (cmp_psn(ipsn, flow->flow_state.ib_lpsn)) { + update_r_next_psn_fecn(packet, priv, rcd, flow, fecn); + + if (cmp_psn(kpsn, flow->flow_state.r_next_psn)) + goto ack_done; + flow->flow_state.r_next_psn = mask_psn(kpsn + 1); + /* + * Copy the payload to destination buffer if this packet is + * delivered as an eager packet due to RSM rule and FECN. + * The RSM rule selects FECN bit in BTH and SH bit in + * KDETH header and therefore will not match the last + * packet of each segment that has SH bit cleared. + */ + if (fecn && packet->etype == RHF_RCV_TYPE_EAGER) { + struct rvt_sge_state ss; + u32 len; + u32 tlen = packet->tlen; + u16 hdrsize = packet->hlen; + u8 pad = packet->pad; + u8 extra_bytes = pad + packet->extra_byte + + (SIZE_OF_CRC << 2); + u32 pmtu = qp->pmtu; + + if (unlikely(tlen != (hdrsize + pmtu + extra_bytes))) + goto ack_op_err; + len = restart_sge(&ss, req->e.swqe, ipsn, pmtu); + if (unlikely(len < pmtu)) + goto ack_op_err; + rvt_copy_sge(qp, &ss, packet->payload, pmtu, false, + false); + /* Raise the sw sequence check flag for next packet */ + priv->s_flags |= HFI1_R_TID_SW_PSN; + } + goto ack_done; + } + flow->flow_state.r_next_psn = mask_psn(kpsn + 1); req->ack_pending--; priv->pending_tid_r_segs--; qp->s_num_rd_atomic--; @@ -2524,6 +2557,7 @@ void hfi1_rc_rcv_tid_rdma_read_resp(struct hfi1_packet *packet) req->comp_seg == req->cur_seg) || priv->tid_r_comp == priv->tid_r_reqs) { hfi1_kern_clear_hw_flow(priv->rcd, qp); + priv->s_flags &= ~HFI1_R_TID_SW_PSN; if (req->state == TID_REQUEST_SYNC) req->state = TID_REQUEST_ACTIVE; } @@ -2545,8 +2579,6 @@ ack_op_err: ack_done: spin_unlock_irqrestore(&qp->s_lock, flags); - if (is_fecn) - hfi1_send_rc_ack(packet, is_fecn); } void hfi1_kern_read_tid_flow_free(struct rvt_qp *qp) @@ -2773,9 +2805,9 @@ static bool handle_read_kdeth_eflags(struct hfi1_ctxtdata *rcd, rvt_rc_error(qp, IB_WC_LOC_QP_OP_ERR); return ret; } - if (priv->flow_state.flags & TID_FLOW_SW_PSN) { + if (priv->s_flags & HFI1_R_TID_SW_PSN) { diff = cmp_psn(psn, - priv->flow_state.r_next_psn); + flow->flow_state.r_next_psn); if (diff > 0) { if (!(qp->r_flags & RVT_R_RDMAR_SEQ)) restart_tid_rdma_read_req(rcd, @@ -2811,22 +2843,15 @@ static bool handle_read_kdeth_eflags(struct hfi1_ctxtdata *rcd, qp->r_flags &= ~RVT_R_RDMAR_SEQ; } - priv->flow_state.r_next_psn++; + flow->flow_state.r_next_psn = + mask_psn(psn + 1); } else { - u64 reg; u32 last_psn; - /* - * The only sane way to get the amount of - * progress is to read the HW flow state. - */ - reg = read_uctxt_csr(dd, rcd->ctxt, - RCV_TID_FLOW_TABLE + - (8 * flow->idx)); - last_psn = mask_psn(reg); - - priv->flow_state.r_next_psn = last_psn; - priv->flow_state.flags |= TID_FLOW_SW_PSN; + last_psn = read_r_next_psn(dd, rcd->ctxt, + flow->idx); + flow->flow_state.r_next_psn = last_psn; + priv->s_flags |= HFI1_R_TID_SW_PSN; /* * If no request has been restarted yet, * restart the current one. @@ -2891,6 +2916,7 @@ bool hfi1_handle_kdeth_eflags(struct hfi1_ctxtdata *rcd, struct rvt_ack_entry *e; struct tid_rdma_request *req; struct tid_rdma_flow *flow; + int diff = 0; trace_hfi1_msg_handle_kdeth_eflags(NULL, "Kdeth error: rhf ", packet->rhf); @@ -2974,17 +3000,10 @@ bool hfi1_handle_kdeth_eflags(struct hfi1_ctxtdata *rcd, switch (rte) { case RHF_RTE_EXPECTED_FLOW_SEQ_ERR: if (!(qpriv->s_flags & HFI1_R_TID_SW_PSN)) { - u64 reg; - qpriv->s_flags |= HFI1_R_TID_SW_PSN; - /* - * The only sane way to get the amount of - * progress is to read the HW flow state. - */ - reg = read_uctxt_csr(dd, rcd->ctxt, - RCV_TID_FLOW_TABLE + - (8 * flow->idx)); - flow->flow_state.r_next_psn = mask_psn(reg); + flow->flow_state.r_next_psn = + read_r_next_psn(dd, rcd->ctxt, + flow->idx); qpriv->r_next_psn_kdeth = flow->flow_state.r_next_psn; goto nak_psn; @@ -2997,10 +3016,12 @@ bool hfi1_handle_kdeth_eflags(struct hfi1_ctxtdata *rcd, * mismatch could be due to packets that were * already in flight. */ - if (psn != flow->flow_state.r_next_psn) { - psn = flow->flow_state.r_next_psn; + diff = cmp_psn(psn, + flow->flow_state.r_next_psn); + if (diff > 0) goto nak_psn; - } + else if (diff < 0) + break; qpriv->s_nak_state = 0; /* @@ -3011,8 +3032,10 @@ bool hfi1_handle_kdeth_eflags(struct hfi1_ctxtdata *rcd, if (psn == full_flow_psn(flow, flow->flow_state.lpsn)) ret = false; + flow->flow_state.r_next_psn = + mask_psn(psn + 1); qpriv->r_next_psn_kdeth = - ++flow->flow_state.r_next_psn; + flow->flow_state.r_next_psn; } break; @@ -3517,8 +3540,10 @@ static void hfi1_tid_write_alloc_resources(struct rvt_qp *qp, bool intr_ctx) if (qpriv->r_tid_alloc == qpriv->r_tid_head) { /* If all data has been received, clear the flow */ if (qpriv->flow_state.index < RXE_NUM_TID_FLOWS && - !qpriv->alloc_w_segs) + !qpriv->alloc_w_segs) { hfi1_kern_clear_hw_flow(rcd, qp); + qpriv->s_flags &= ~HFI1_R_TID_SW_PSN; + } break; } @@ -3544,8 +3569,7 @@ static void hfi1_tid_write_alloc_resources(struct rvt_qp *qp, bool intr_ctx) if (qpriv->sync_pt && !qpriv->alloc_w_segs) { hfi1_kern_clear_hw_flow(rcd, qp); qpriv->sync_pt = false; - if (qpriv->s_flags & HFI1_R_TID_SW_PSN) - qpriv->s_flags &= ~HFI1_R_TID_SW_PSN; + qpriv->s_flags &= ~HFI1_R_TID_SW_PSN; } /* Allocate flow if we don't have one */ @@ -3687,7 +3711,7 @@ void hfi1_rc_rcv_tid_rdma_write_req(struct hfi1_packet *packet) struct hfi1_qp_priv *qpriv = qp->priv; struct tid_rdma_request *req; u32 bth0, psn, len, rkey, num_segs; - bool is_fecn; + bool fecn; u8 next; u64 vaddr; int diff; @@ -3696,7 +3720,7 @@ void hfi1_rc_rcv_tid_rdma_write_req(struct hfi1_packet *packet) if (hfi1_ruc_check_hdr(ibp, packet)) return; - is_fecn = process_ecn(qp, packet); + fecn = process_ecn(qp, packet); psn = mask_psn(be32_to_cpu(ohdr->bth[2])); trace_hfi1_rsp_rcv_tid_write_req(qp, psn); @@ -3713,9 +3737,8 @@ void hfi1_rc_rcv_tid_rdma_write_req(struct hfi1_packet *packet) num_segs = DIV_ROUND_UP(len, qpriv->tid_rdma.local.max_len); diff = delta_psn(psn, qp->r_psn); if (unlikely(diff)) { - if (tid_rdma_rcv_error(packet, ohdr, qp, psn, diff)) - return; - goto send_ack; + tid_rdma_rcv_err(packet, ohdr, qp, psn, diff, fecn); + return; } /* @@ -3751,10 +3774,7 @@ void hfi1_rc_rcv_tid_rdma_write_req(struct hfi1_packet *packet) goto update_head; } - if (e->rdma_sge.mr) { - rvt_put_mr(e->rdma_sge.mr); - e->rdma_sge.mr = NULL; - } + release_rdma_sge_mr(e); /* The length needs to be in multiples of PAGE_SIZE */ if (!len || len & ~PAGE_MASK) @@ -3834,11 +3854,11 @@ update_head: /* Schedule the send tasklet. */ qp->s_flags |= RVT_S_RESP_PENDING; + if (fecn) + qp->s_flags |= RVT_S_ECN; hfi1_schedule_send(qp); spin_unlock_irqrestore(&qp->s_lock, flags); - if (is_fecn) - goto send_ack; return; nack_inv_unlock: @@ -3855,8 +3875,6 @@ nack_acc: rvt_rc_error(qp, IB_WC_LOC_PROT_ERR); qp->r_nak_state = IB_NAK_REMOTE_ACCESS_ERROR; qp->r_ack_psn = qp->r_psn; -send_ack: - hfi1_send_rc_ack(packet, is_fecn); } u32 hfi1_build_tid_rdma_write_resp(struct rvt_qp *qp, struct rvt_ack_entry *e, @@ -4073,10 +4091,10 @@ void hfi1_rc_rcv_tid_rdma_write_resp(struct hfi1_packet *packet) struct tid_rdma_flow *flow; enum ib_wc_status status; u32 opcode, aeth, psn, flow_psn, i, tidlen = 0, pktlen; - bool is_fecn; + bool fecn; unsigned long flags; - is_fecn = process_ecn(qp, packet); + fecn = process_ecn(qp, packet); psn = mask_psn(be32_to_cpu(ohdr->bth[2])); aeth = be32_to_cpu(ohdr->u.tid_rdma.w_rsp.aeth); opcode = (be32_to_cpu(ohdr->bth[0]) >> 24) & 0xff; @@ -4216,7 +4234,6 @@ void hfi1_rc_rcv_tid_rdma_write_resp(struct hfi1_packet *packet) qpriv->s_tid_cur = i; } qp->s_flags &= ~HFI1_S_WAIT_TID_RESP; - hfi1_schedule_tid_send(qp); goto ack_done; @@ -4225,9 +4242,9 @@ ack_op_err: ack_err: rvt_error_qp(qp, status); ack_done: + if (fecn) + qp->s_flags |= RVT_S_ECN; spin_unlock_irqrestore(&qp->s_lock, flags); - if (is_fecn) - hfi1_send_rc_ack(packet, is_fecn); } bool hfi1_build_tid_rdma_packet(struct rvt_swqe *wqe, @@ -4307,7 +4324,9 @@ void hfi1_rc_rcv_tid_rdma_write_data(struct hfi1_packet *packet) unsigned long flags; u32 psn, next; u8 opcode; + bool fecn; + fecn = process_ecn(qp, packet); psn = mask_psn(be32_to_cpu(ohdr->bth[2])); opcode = (be32_to_cpu(ohdr->bth[0]) >> 24) & 0xff; @@ -4320,9 +4339,53 @@ void hfi1_rc_rcv_tid_rdma_write_data(struct hfi1_packet *packet) req = ack_to_tid_req(e); flow = &req->flows[req->clear_tail]; if (cmp_psn(psn, full_flow_psn(flow, flow->flow_state.lpsn))) { + update_r_next_psn_fecn(packet, priv, rcd, flow, fecn); + if (cmp_psn(psn, flow->flow_state.r_next_psn)) goto send_nak; - flow->flow_state.r_next_psn++; + + flow->flow_state.r_next_psn = mask_psn(psn + 1); + /* + * Copy the payload to destination buffer if this packet is + * delivered as an eager packet due to RSM rule and FECN. + * The RSM rule selects FECN bit in BTH and SH bit in + * KDETH header and therefore will not match the last + * packet of each segment that has SH bit cleared. + */ + if (fecn && packet->etype == RHF_RCV_TYPE_EAGER) { + struct rvt_sge_state ss; + u32 len; + u32 tlen = packet->tlen; + u16 hdrsize = packet->hlen; + u8 pad = packet->pad; + u8 extra_bytes = pad + packet->extra_byte + + (SIZE_OF_CRC << 2); + u32 pmtu = qp->pmtu; + + if (unlikely(tlen != (hdrsize + pmtu + extra_bytes))) + goto send_nak; + len = req->comp_seg * req->seg_len; + len += delta_psn(psn, + full_flow_psn(flow, flow->flow_state.spsn)) * + pmtu; + if (unlikely(req->total_len - len < pmtu)) + goto send_nak; + + /* + * The e->rdma_sge field is set when TID RDMA WRITE REQ + * is first received and is never modified thereafter. + */ + ss.sge = e->rdma_sge; + ss.sg_list = NULL; + ss.num_sge = 1; + ss.total_len = req->total_len; + rvt_skip_sge(&ss, len, false); + rvt_copy_sge(qp, &ss, packet->payload, pmtu, false, + false); + /* Raise the sw sequence check flag for next packet */ + priv->r_next_psn_kdeth = mask_psn(psn + 1); + priv->s_flags |= HFI1_R_TID_SW_PSN; + } goto exit; } flow->flow_state.r_next_psn = mask_psn(psn + 1); @@ -4347,6 +4410,7 @@ void hfi1_rc_rcv_tid_rdma_write_data(struct hfi1_packet *packet) priv->r_tid_ack = priv->r_tid_tail; if (opcode == TID_OP(WRITE_DATA_LAST)) { + release_rdma_sge_mr(e); for (next = priv->r_tid_tail + 1; ; next++) { if (next > rvt_size_atomic(&dev->rdi)) next = 0; @@ -4386,6 +4450,8 @@ done: hfi1_schedule_tid_send(qp); exit: priv->r_next_psn_kdeth = flow->flow_state.r_next_psn; + if (fecn) + qp->s_flags |= RVT_S_ECN; spin_unlock_irqrestore(&qp->s_lock, flags); return; @@ -4487,12 +4553,11 @@ void hfi1_rc_rcv_tid_rdma_ack(struct hfi1_packet *packet) struct tid_rdma_request *req; struct tid_rdma_flow *flow; u32 aeth, psn, req_psn, ack_psn, fspsn, resync_psn, ack_kpsn; - bool is_fecn; unsigned long flags; u16 fidx; trace_hfi1_tid_write_sender_rcv_tid_ack(qp, 0); - is_fecn = process_ecn(qp, packet); + process_ecn(qp, packet); psn = mask_psn(be32_to_cpu(ohdr->bth[2])); aeth = be32_to_cpu(ohdr->u.tid_rdma.ack.aeth); req_psn = mask_psn(be32_to_cpu(ohdr->u.tid_rdma.ack.verbs_psn)); @@ -4846,10 +4911,10 @@ void hfi1_rc_rcv_tid_rdma_resync(struct hfi1_packet *packet) struct tid_rdma_flow *flow; struct tid_flow_state *fs = &qpriv->flow_state; u32 psn, generation, idx, gen_next; - bool is_fecn; + bool fecn; unsigned long flags; - is_fecn = process_ecn(qp, packet); + fecn = process_ecn(qp, packet); psn = mask_psn(be32_to_cpu(ohdr->bth[2])); generation = mask_psn(psn + 1) >> HFI1_KDETH_BTH_SEQ_SHIFT; @@ -4940,6 +5005,8 @@ void hfi1_rc_rcv_tid_rdma_resync(struct hfi1_packet *packet) qpriv->s_flags |= RVT_S_ACK_PENDING; hfi1_schedule_tid_send(qp); bail: + if (fecn) + qp->s_flags |= RVT_S_ECN; spin_unlock_irqrestore(&qp->s_lock, flags); } @@ -5464,3 +5531,48 @@ bool hfi1_tid_rdma_ack_interlock(struct rvt_qp *qp, struct rvt_ack_entry *e) } return false; } + +static u32 read_r_next_psn(struct hfi1_devdata *dd, u8 ctxt, u8 fidx) +{ + u64 reg; + + /* + * The only sane way to get the amount of + * progress is to read the HW flow state. + */ + reg = read_uctxt_csr(dd, ctxt, RCV_TID_FLOW_TABLE + (8 * fidx)); + return mask_psn(reg); +} + +static void tid_rdma_rcv_err(struct hfi1_packet *packet, + struct ib_other_headers *ohdr, + struct rvt_qp *qp, u32 psn, int diff, bool fecn) +{ + unsigned long flags; + + tid_rdma_rcv_error(packet, ohdr, qp, psn, diff); + if (fecn) { + spin_lock_irqsave(&qp->s_lock, flags); + qp->s_flags |= RVT_S_ECN; + spin_unlock_irqrestore(&qp->s_lock, flags); + } +} + +static void update_r_next_psn_fecn(struct hfi1_packet *packet, + struct hfi1_qp_priv *priv, + struct hfi1_ctxtdata *rcd, + struct tid_rdma_flow *flow, + bool fecn) +{ + /* + * If a start/middle packet is delivered here due to + * RSM rule and FECN, we need to update the r_next_psn. + */ + if (fecn && packet->etype == RHF_RCV_TYPE_EAGER && + !(priv->s_flags & HFI1_R_TID_SW_PSN)) { + struct hfi1_devdata *dd = rcd->dd; + + flow->flow_state.r_next_psn = + read_r_next_psn(dd, rcd->ctxt, flow->idx); + } +} diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.h b/drivers/infiniband/hw/hfi1/tid_rdma.h index 53ab24ef4f02..1c536185261e 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.h +++ b/drivers/infiniband/hw/hfi1/tid_rdma.h @@ -76,10 +76,8 @@ struct tid_rdma_qp_params { struct tid_flow_state { u32 generation; u32 psn; - u32 r_next_psn; /* next PSN to be received (in TID space) */ u8 index; u8 last_index; - u8 flags; }; enum tid_rdma_req_state { diff --git a/drivers/infiniband/hw/hfi1/trace_tid.h b/drivers/infiniband/hw/hfi1/trace_tid.h index 548dfc45a407..4388b594ed1b 100644 --- a/drivers/infiniband/hw/hfi1/trace_tid.h +++ b/drivers/infiniband/hw/hfi1/trace_tid.h @@ -53,7 +53,7 @@ u16 hfi1_trace_get_tid_idx(u32 ent); "tid_r_comp %u pending_tid_r_segs %u " \ "s_flags 0x%x ps_flags 0x%x iow_flags 0x%lx " \ "s_state 0x%x hw_flow_index %u generation 0x%x " \ - "fpsn 0x%x flow_flags 0x%x" + "fpsn 0x%x" #define TID_REQ_PRN "[%s] qpn 0x%x newreq %u opcode 0x%x psn 0x%x lpsn 0x%x " \ "cur_seg %u comp_seg %u ack_seg %u alloc_seg %u " \ @@ -71,7 +71,7 @@ u16 hfi1_trace_get_tid_idx(u32 ent); "pending_tid_w_segs %u sync_pt %s " \ "ps_nak_psn 0x%x ps_nak_state 0x%x " \ "prnr_nak_state 0x%x hw_flow_index %u generation "\ - "0x%x fpsn 0x%x flow_flags 0x%x resync %s" \ + "0x%x fpsn 0x%x resync %s" \ "r_next_psn_kdeth 0x%x" #define TID_WRITE_SENDER_PRN "[%s] qpn 0x%x newreq %u s_tid_cur %u " \ @@ -973,7 +973,6 @@ DECLARE_EVENT_CLASS(/* tid_read_sender */ __field(u32, hw_flow_index) __field(u32, generation) __field(u32, fpsn) - __field(u32, flow_flags) ), TP_fast_assign(/* assign */ struct hfi1_qp_priv *priv = qp->priv; @@ -991,7 +990,6 @@ DECLARE_EVENT_CLASS(/* tid_read_sender */ __entry->hw_flow_index = priv->flow_state.index; __entry->generation = priv->flow_state.generation; __entry->fpsn = priv->flow_state.psn; - __entry->flow_flags = priv->flow_state.flags; ), TP_printk(/* print */ TID_READ_SENDER_PRN, @@ -1007,8 +1005,7 @@ DECLARE_EVENT_CLASS(/* tid_read_sender */ __entry->s_state, __entry->hw_flow_index, __entry->generation, - __entry->fpsn, - __entry->flow_flags + __entry->fpsn ) ); @@ -1338,7 +1335,6 @@ DECLARE_EVENT_CLASS(/* tid_write_sp */ __field(u32, hw_flow_index) __field(u32, generation) __field(u32, fpsn) - __field(u32, flow_flags) __field(bool, resync) __field(u32, r_next_psn_kdeth) ), @@ -1360,7 +1356,6 @@ DECLARE_EVENT_CLASS(/* tid_write_sp */ __entry->hw_flow_index = priv->flow_state.index; __entry->generation = priv->flow_state.generation; __entry->fpsn = priv->flow_state.psn; - __entry->flow_flags = priv->flow_state.flags; __entry->resync = priv->resync; __entry->r_next_psn_kdeth = priv->r_next_psn_kdeth; ), @@ -1381,7 +1376,6 @@ DECLARE_EVENT_CLASS(/* tid_write_sp */ __entry->hw_flow_index, __entry->generation, __entry->fpsn, - __entry->flow_flags, __entry->resync ? "yes" : "no", __entry->r_next_psn_kdeth ) diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c index 2bc4d67b3e42..91669e35c6ca 100644 --- a/drivers/infiniband/hw/mlx5/odp.c +++ b/drivers/infiniband/hw/mlx5/odp.c @@ -585,7 +585,7 @@ static int pagefault_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr, struct ib_umem_odp *odp_mr = to_ib_umem_odp(mr->umem); bool downgrade = flags & MLX5_PF_FLAGS_DOWNGRADE; bool prefetch = flags & MLX5_PF_FLAGS_PREFETCH; - u64 access_mask = ODP_READ_ALLOWED_BIT; + u64 access_mask; u64 start_idx, page_mask; struct ib_umem_odp *odp; size_t size; @@ -607,6 +607,7 @@ next_mr: page_shift = mr->umem->page_shift; page_mask = ~(BIT(page_shift) - 1); start_idx = (io_virt - (mr->mmkey.iova & page_mask)) >> page_shift; + access_mask = ODP_READ_ALLOWED_BIT; if (prefetch && !downgrade && !mr->umem->writable) { /* prefetch with write-access must |