From 662d66466637862ef955f7f6e78a286d8cf0ebef Mon Sep 17 00:00:00 2001
From: Kaike Wan <kaike.wan@intel.com>
Date: Mon, 18 Mar 2019 09:55:19 -0700
Subject: IB/hfi1: Failed to drain send queue when QP is put into error state

When a QP is put into error state, all pending requests in the send work
queue should be drained. The following sequence of events could lead to a
failure, causing a request to hang:

(1) The QP builds a packet and tries to send through SDMA engine.
    However, PIO engine is still busy. Consequently, this packet is put on
    the QP's tx list and the QP is put on the PIO waiting list. The field
    qp->s_flags is set with HFI1_S_WAIT_PIO_DRAIN;

(2) The QP is put into error state by the user application and
    notify_error_qp() is called, which removes the QP from the PIO waiting
    list and the packet from the QP's tx list. In addition, qp->s_flags is
    cleared of RVT_S_ANY_WAIT_IO bits, which does not include
    HFI1_S_WAIT_PIO_DRAIN bit;

(3) The hfi1_schdule_send() function is called to drain the QP's send
    queue. Subsequently, hfi1_do_send() is called. Since the flag bit
    HFI1_S_WAIT_PIO_DRAIN is set in qp->s_flags, hfi1_send_ok() fails.  As
    a result, hfi1_do_send() bails out without draining any request from
    the send queue;

(4) The PIO engine completes the sending and tries to wake up any QP on
    its waiting list. But the QP has been removed from the PIO waiting
    list and therefore is kept in sleep forever.

The fix is to clear qp->s_flags of HFI1_S_ANY_WAIT_IO bits in step (2).
HFI1_S_ANY_WAIT_IO includes RVT_S_ANY_WAIT_IO and HFI1_S_WAIT_PIO_DRAIN.

Fixes: 2e2ba09e48b7 ("IB/rdmavt, IB/hfi1: Create device dependent s_flags")
Cc: <stable@vger.kernel.org> # 4.19.x+
Reviewed-by: Mike Marciniszyn <mike.marciniszyn@intel.com>
Reviewed-by: Alex Estrin <alex.estrin@intel.com>
Signed-off-by: Kaike Wan <kaike.wan@intel.com>
Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/hfi1/qp.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'drivers/infiniband/hw/hfi1')

diff --git a/drivers/infiniband/hw/hfi1/qp.c b/drivers/infiniband/hw/hfi1/qp.c
index 9b643c2409cf..64889eda1735 100644
--- a/drivers/infiniband/hw/hfi1/qp.c
+++ b/drivers/infiniband/hw/hfi1/qp.c
@@ -898,7 +898,7 @@ void notify_error_qp(struct rvt_qp *qp)
 		if (!list_empty(&priv->s_iowait.list) &&
 		    !(qp->s_flags & RVT_S_BUSY) &&
 		    !(priv->s_flags & RVT_S_BUSY)) {
-			qp->s_flags &= ~RVT_S_ANY_WAIT_IO;
+			qp->s_flags &= ~HFI1_S_ANY_WAIT_IO;
 			list_del_init(&priv->s_iowait.list);
 			priv->s_iowait.lock = NULL;
 			rvt_put_qp(qp);
-- 
cgit v1.2.3-59-g8ed1b


From 93b289b9aff66eca7575b09f36f5abbeca8e6167 Mon Sep 17 00:00:00 2001
From: Kaike Wan <kaike.wan@intel.com>
Date: Mon, 18 Mar 2019 09:55:29 -0700
Subject: IB/hfi1: Clear the IOWAIT pending bits when QP is put into error
 state

When a QP is put into error state, it may be waiting for send engine
resources. In this case, the QP will be removed from the send engine's
waiting list, but its IOWAIT pending bits are not cleared. This will
normally not have any major impact as the QP is being destroyed.  However,
the QP still needs to wind down its operations, such as draining the send
queue by scheduling the send engine. Clearing the pending bits will avoid
any potential complications. In addition, if the QP will eventually hang,
clearing the pending bits can help debugging by presenting a consistent
picture if the user dumps the qp_stats.

This patch clears a QP's IOWAIT_PENDING_IB and IO_PENDING_TID bits in
priv->s_iowait.flags in this case.

Fixes: 5da0fc9dbf89 ("IB/hfi1: Prepare resource waits for dual leg")
Reviewed-by: Mike Marciniszyn <mike.marciniszyn@intel.com>
Reviewed-by: Alex Estrin <alex.estrin@intel.com>
Signed-off-by: Kaike Wan <kaike.wan@intel.com>
Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/hfi1/qp.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'drivers/infiniband/hw/hfi1')

diff --git a/drivers/infiniband/hw/hfi1/qp.c b/drivers/infiniband/hw/hfi1/qp.c
index 64889eda1735..eba300330a02 100644
--- a/drivers/infiniband/hw/hfi1/qp.c
+++ b/drivers/infiniband/hw/hfi1/qp.c
@@ -899,6 +899,8 @@ void notify_error_qp(struct rvt_qp *qp)
 		    !(qp->s_flags & RVT_S_BUSY) &&
 		    !(priv->s_flags & RVT_S_BUSY)) {
 			qp->s_flags &= ~HFI1_S_ANY_WAIT_IO;
+			iowait_clear_flag(&priv->s_iowait, IOWAIT_PENDING_IB);
+			iowait_clear_flag(&priv->s_iowait, IOWAIT_PENDING_TID);
 			list_del_init(&priv->s_iowait.list);
 			priv->s_iowait.lock = NULL;
 			rvt_put_qp(qp);
-- 
cgit v1.2.3-59-g8ed1b


From a8639a79e85c18c16c10089edd589c7948f19bbd Mon Sep 17 00:00:00 2001
From: Kaike Wan <kaike.wan@intel.com>
Date: Mon, 18 Mar 2019 09:55:39 -0700
Subject: IB/hfi1: Eliminate opcode tests on mr deref

When an old ack_queue entry is used to store an incoming request, it may
need to clean up the old entry if it is still referencing the
MR. Originally only RDMA READ request needed to reference MR on the
responder side and therefore the opcode was tested when cleaning up the
old entry. The introduction of tid rdma specific operations in the
ack_queue makes the specific opcode tests wrong.  Multiple opcodes (RDMA
READ, TID RDMA READ, and TID RDMA WRITE) may need MR ref cleanup.

Remove the opcode specific tests associated with the ack_queue.

Fixes: f48ad614c100 ("IB/hfi1: Move driver out of staging")
Signed-off-by: Mike Marciniszyn <mike.marciniszyn@intel.com>
Signed-off-by: Kaike Wan <kaike.wan@intel.com>
Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/hfi1/rc.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'drivers/infiniband/hw/hfi1')

diff --git a/drivers/infiniband/hw/hfi1/rc.c b/drivers/infiniband/hw/hfi1/rc.c
index e6726c1ab866..5991211d72bd 100644
--- a/drivers/infiniband/hw/hfi1/rc.c
+++ b/drivers/infiniband/hw/hfi1/rc.c
@@ -3088,7 +3088,7 @@ send_last:
 			update_ack_queue(qp, next);
 		}
 		e = &qp->s_ack_queue[qp->r_head_ack_queue];
-		if (e->opcode == OP(RDMA_READ_REQUEST) && e->rdma_sge.mr) {
+		if (e->rdma_sge.mr) {
 			rvt_put_mr(e->rdma_sge.mr);
 			e->rdma_sge.mr = NULL;
 		}
@@ -3166,7 +3166,7 @@ send_last:
 			update_ack_queue(qp, next);
 		}
 		e = &qp->s_ack_queue[qp->r_head_ack_queue];
-		if (e->opcode == OP(RDMA_READ_REQUEST) && e->rdma_sge.mr) {
+		if (e->rdma_sge.mr) {
 			rvt_put_mr(e->rdma_sge.mr);
 			e->rdma_sge.mr = NULL;
 		}
-- 
cgit v1.2.3-59-g8ed1b


From d0294344470e6b52d097aa7369173f32d11f2f52 Mon Sep 17 00:00:00 2001
From: Kaike Wan <kaike.wan@intel.com>
Date: Mon, 18 Mar 2019 09:55:49 -0700
Subject: IB/hfi1: Fix the allocation of RSM table

The receive side mapping (RSM) on hfi1 hardware is a special
matching mechanism to direct an incoming packet to a given
hardware receive context. It has 4 instances of matching capabilities
(RSM0 - RSM3) that share the same RSM table (RMT). The RMT has a total of
256 entries, each of which points to a receive context.

Currently, three instances of RSM have been used:
1. RSM0 by QOS;
2. RSM1 by PSM FECN;
3. RSM2 by VNIC.

Each RSM instance should reserve enough entries in RMT to function
properly. Since both PSM and VNIC could allocate any receive context
between dd->first_dyn_alloc_ctxt and dd->num_rcv_contexts, PSM FECN must
reserve enough RMT entries to cover the entire receive context index
range (dd->num_rcv_contexts - dd->first_dyn_alloc_ctxt) instead of only
the user receive contexts allocated for PSM
(dd->num_user_contexts). Consequently, the sizing of
dd->num_user_contexts in set_up_context_variables is incorrect.

Fixes: 2280740f01ae ("IB/hfi1: Virtual Network Interface Controller (VNIC) HW support")
Reviewed-by: Mike Marciniszyn <mike.marciniszyn@intel.com>
Reviewed-by: Michael J. Ruhl <michael.j.ruhl@intel.com>
Signed-off-by: Kaike Wan <kaike.wan@intel.com>
Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/hfi1/chip.c | 26 +++++++++++++++++++-------
 1 file changed, 19 insertions(+), 7 deletions(-)

(limited to 'drivers/infiniband/hw/hfi1')

diff --git a/drivers/infiniband/hw/hfi1/chip.c b/drivers/infiniband/hw/hfi1/chip.c
index 612f04190ed8..9784c6c0d2ec 100644
--- a/drivers/infiniband/hw/hfi1/chip.c
+++ b/drivers/infiniband/hw/hfi1/chip.c
@@ -13232,7 +13232,7 @@ static int set_up_context_variables(struct hfi1_devdata *dd)
 	int total_contexts;
 	int ret;
 	unsigned ngroups;
-	int qos_rmt_count;
+	int rmt_count;
 	int user_rmt_reduced;
 	u32 n_usr_ctxts;
 	u32 send_contexts = chip_send_contexts(dd);
@@ -13294,10 +13294,20 @@ static int set_up_context_variables(struct hfi1_devdata *dd)
 		n_usr_ctxts = rcv_contexts - total_contexts;
 	}
 
-	/* each user context requires an entry in the RMT */
-	qos_rmt_count = qos_rmt_entries(dd, NULL, NULL);
-	if (qos_rmt_count + n_usr_ctxts > NUM_MAP_ENTRIES) {
-		user_rmt_reduced = NUM_MAP_ENTRIES - qos_rmt_count;
+	/*
+	 * The RMT entries are currently allocated as shown below:
+	 * 1. QOS (0 to 128 entries);
+	 * 2. FECN for PSM (num_user_contexts + num_vnic_contexts);
+	 * 3. VNIC (num_vnic_contexts).
+	 * It should be noted that PSM FECN oversubscribe num_vnic_contexts
+	 * entries of RMT because both VNIC and PSM could allocate any receive
+	 * context between dd->first_dyn_alloc_text and dd->num_rcv_contexts,
+	 * and PSM FECN must reserve an RMT entry for each possible PSM receive
+	 * context.
+	 */
+	rmt_count = qos_rmt_entries(dd, NULL, NULL) + (num_vnic_contexts * 2);
+	if (rmt_count + n_usr_ctxts > NUM_MAP_ENTRIES) {
+		user_rmt_reduced = NUM_MAP_ENTRIES - rmt_count;
 		dd_dev_err(dd,
 			   "RMT size is reducing the number of user receive contexts from %u to %d\n",
 			   n_usr_ctxts,
@@ -14285,9 +14295,11 @@ static void init_user_fecn_handling(struct hfi1_devdata *dd,
 	u64 reg;
 	int i, idx, regoff, regidx;
 	u8 offset;
+	u32 total_cnt;
 
 	/* there needs to be enough room in the map table */
-	if (rmt->used + dd->num_user_contexts >= NUM_MAP_ENTRIES) {
+	total_cnt = dd->num_rcv_contexts - dd->first_dyn_alloc_ctxt;
+	if (rmt->used + total_cnt >= NUM_MAP_ENTRIES) {
 		dd_dev_err(dd, "User FECN handling disabled - too many user contexts allocated\n");
 		return;
 	}
@@ -14341,7 +14353,7 @@ static void init_user_fecn_handling(struct hfi1_devdata *dd,
 	/* add rule 1 */
 	add_rsm_rule(dd, RSM_INS_FECN, &rrd);
 
-	rmt->used += dd->num_user_contexts;
+	rmt->used += total_cnt;
 }
 
 /* Initialize RSM for VNIC */
-- 
cgit v1.2.3-59-g8ed1b


From f6f3f532556e4fcaa2d259fd04a800bfb4f9670d Mon Sep 17 00:00:00 2001
From: Kaike Wan <kaike.wan@intel.com>
Date: Mon, 18 Mar 2019 09:58:30 -0700
Subject: IB/hfi1: Delay the release of destination mr for TID RDMA WRITE DATA

The reference of destination memory region is first obtained when TID RDMA
WRITE request is first received on the responder side. This reference is
released once all TID RDMA WRITE RESP packets are sent to the requester
side, even though not all TID RDMA WRITE DATA packets may have been
received. This early release will especially be undesired if the software
needs to access the destination memory before the last data packet is
received.

This patch delays the release of the MR until all TID RDMA DATA packets
have been received. A helper function to release the reference is also
created to simplify the code.

Reviewed-by: Mike Marciniszyn <mike.marciniszyn@intel.com>
Reviewed-by: Dennis Dalessandro <dennis.dalessandro@intel.com>
Reviewed-by: Michael J. Ruhl <michael.j.ruhl@intel.com>
Signed-off-by: Kaike Wan <kaike.wan@intel.com>
Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/hfi1/rc.c       | 23 ++++++-----------------
 drivers/infiniband/hw/hfi1/rc.h       |  8 ++++++++
 drivers/infiniband/hw/hfi1/tid_rdma.c | 16 ++++------------
 3 files changed, 18 insertions(+), 29 deletions(-)

(limited to 'drivers/infiniband/hw/hfi1')

diff --git a/drivers/infiniband/hw/hfi1/rc.c b/drivers/infiniband/hw/hfi1/rc.c
index 5991211d72bd..5ba39a9f65ad 100644
--- a/drivers/infiniband/hw/hfi1/rc.c
+++ b/drivers/infiniband/hw/hfi1/rc.c
@@ -140,10 +140,7 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp,
 	case OP(RDMA_READ_RESPONSE_LAST):
 	case OP(RDMA_READ_RESPONSE_ONLY):
 		e = &qp->s_ack_queue[qp->s_tail_ack_queue];
-		if (e->rdma_sge.mr) {
-			rvt_put_mr(e->rdma_sge.mr);
-			e->rdma_sge.mr = NULL;
-		}
+		release_rdma_sge_mr(e);
 		/* FALLTHROUGH */
 	case OP(ATOMIC_ACKNOWLEDGE):
 		/*
@@ -343,7 +340,8 @@ write_resp:
 			break;
 
 		e->sent = 1;
-		qp->s_ack_state = OP(RDMA_READ_RESPONSE_LAST);
+		/* Do not free e->rdma_sge until all data are received */
+		qp->s_ack_state = OP(ATOMIC_ACKNOWLEDGE);
 		break;
 
 	case TID_OP(READ_RESP):
@@ -2643,10 +2641,7 @@ static noinline int rc_rcv_error(struct ib_other_headers *ohdr, void *data,
 		len = be32_to_cpu(reth->length);
 		if (unlikely(offset + len != e->rdma_sge.sge_length))
 			goto unlock_done;
-		if (e->rdma_sge.mr) {
-			rvt_put_mr(e->rdma_sge.mr);
-			e->rdma_sge.mr = NULL;
-		}
+		release_rdma_sge_mr(e);
 		if (len != 0) {
 			u32 rkey = be32_to_cpu(reth->rkey);
 			u64 vaddr = get_ib_reth_vaddr(reth);
@@ -3088,10 +3083,7 @@ send_last:
 			update_ack_queue(qp, next);
 		}
 		e = &qp->s_ack_queue[qp->r_head_ack_queue];
-		if (e->rdma_sge.mr) {
-			rvt_put_mr(e->rdma_sge.mr);
-			e->rdma_sge.mr = NULL;
-		}
+		release_rdma_sge_mr(e);
 		reth = &ohdr->u.rc.reth;
 		len = be32_to_cpu(reth->length);
 		if (len) {
@@ -3166,10 +3158,7 @@ send_last:
 			update_ack_queue(qp, next);
 		}
 		e = &qp->s_ack_queue[qp->r_head_ack_queue];
-		if (e->rdma_sge.mr) {
-			rvt_put_mr(e->rdma_sge.mr);
-			e->rdma_sge.mr = NULL;
-		}
+		release_rdma_sge_mr(e);
 		/* Process OPFN special virtual address */
 		if (opfn) {
 			opfn_conn_response(qp, e, ateth);
diff --git a/drivers/infiniband/hw/hfi1/rc.h b/drivers/infiniband/hw/hfi1/rc.h
index 8e0935b9bf2a..5ed5e85d5841 100644
--- a/drivers/infiniband/hw/hfi1/rc.h
+++ b/drivers/infiniband/hw/hfi1/rc.h
@@ -41,6 +41,14 @@ static inline u32 restart_sge(struct rvt_sge_state *ss, struct rvt_swqe *wqe,
 	return rvt_restart_sge(ss, wqe, len);
 }
 
+static inline void release_rdma_sge_mr(struct rvt_ack_entry *e)
+{
+	if (e->rdma_sge.mr) {
+		rvt_put_mr(e->rdma_sge.mr);
+		e->rdma_sge.mr = NULL;
+	}
+}
+
 struct rvt_ack_entry *find_prev_entry(struct rvt_qp *qp, u32 psn, u8 *prev,
 				      u8 *prev_ack, bool *scheduled);
 int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode, u64 val,
diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.c b/drivers/infiniband/hw/hfi1/tid_rdma.c
index fdda33aca77f..145dd3348fdd 100644
--- a/drivers/infiniband/hw/hfi1/tid_rdma.c
+++ b/drivers/infiniband/hw/hfi1/tid_rdma.c
@@ -2036,10 +2036,7 @@ static int tid_rdma_rcv_error(struct hfi1_packet *packet,
 		if (psn != e->psn || len != req->total_len)
 			goto unlock;
 
-		if (e->rdma_sge.mr) {
-			rvt_put_mr(e->rdma_sge.mr);
-			e->rdma_sge.mr = NULL;
-		}
+		release_rdma_sge_mr(e);
 
 		rkey = be32_to_cpu(reth->rkey);
 		vaddr = get_ib_reth_vaddr(reth);
@@ -2285,10 +2282,7 @@ void hfi1_rc_rcv_tid_rdma_read_req(struct hfi1_packet *packet)
 		update_ack_queue(qp, next);
 	}
 	e = &qp->s_ack_queue[qp->r_head_ack_queue];
-	if (e->rdma_sge.mr) {
-		rvt_put_mr(e->rdma_sge.mr);
-		e->rdma_sge.mr = NULL;
-	}
+	release_rdma_sge_mr(e);
 
 	rkey = be32_to_cpu(reth->rkey);
 	qp->r_len = len;
@@ -3751,10 +3745,7 @@ void hfi1_rc_rcv_tid_rdma_write_req(struct hfi1_packet *packet)
 		goto update_head;
 	}
 
-	if (e->rdma_sge.mr) {
-		rvt_put_mr(e->rdma_sge.mr);
-		e->rdma_sge.mr = NULL;
-	}
+	release_rdma_sge_mr(e);
 
 	/* The length needs to be in multiples of PAGE_SIZE */
 	if (!len || len & ~PAGE_MASK)
@@ -4347,6 +4338,7 @@ void hfi1_rc_rcv_tid_rdma_write_data(struct hfi1_packet *packet)
 		priv->r_tid_ack = priv->r_tid_tail;
 
 	if (opcode == TID_OP(WRITE_DATA_LAST)) {
+		release_rdma_sge_mr(e);
 		for (next = priv->r_tid_tail + 1; ; next++) {
 			if (next > rvt_size_atomic(&dev->rdi))
 				next = 0;
-- 
cgit v1.2.3-59-g8ed1b


From 6a40693a884dacae68c1771d369ad3be0594ba1c Mon Sep 17 00:00:00 2001
From: Kaike Wan <kaike.wan@intel.com>
Date: Mon, 18 Mar 2019 09:58:40 -0700
Subject: IB/hfi1: Add a function to read next expected psn from hardware flow

This patch adds a function to read next expected KDETH PSN from hardware
flow to simplify the code.

Reviewed-by: Mike Marciniszyn <mike.marciniszyn@intel.com>
Reviewed-by: Dennis Dalessandro <dennis.dalessandro@intel.com>
Reviewed-by: Michael J. Ruhl <michael.j.ruhl@intel.com>
Signed-off-by: Kaike Wan <kaike.wan@intel.com>
Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/hfi1/tid_rdma.c | 38 +++++++++++++++++------------------
 1 file changed, 18 insertions(+), 20 deletions(-)

(limited to 'drivers/infiniband/hw/hfi1')

diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.c b/drivers/infiniband/hw/hfi1/tid_rdma.c
index 145dd3348fdd..d540e9cffa37 100644
--- a/drivers/infiniband/hw/hfi1/tid_rdma.c
+++ b/drivers/infiniband/hw/hfi1/tid_rdma.c
@@ -128,6 +128,7 @@ static int make_tid_rdma_ack(struct rvt_qp *qp,
 			     struct ib_other_headers *ohdr,
 			     struct hfi1_pkt_state *ps);
 static void hfi1_do_tid_send(struct rvt_qp *qp);
+static u32 read_r_next_psn(struct hfi1_devdata *dd, u8 ctxt, u8 fidx);
 
 static u64 tid_rdma_opfn_encode(struct tid_rdma_params *p)
 {
@@ -2807,18 +2808,10 @@ static bool handle_read_kdeth_eflags(struct hfi1_ctxtdata *rcd,
 				}
 				priv->flow_state.r_next_psn++;
 			} else {
-				u64 reg;
 				u32 last_psn;
 
-				/*
-				 * The only sane way to get the amount of
-				 * progress is to read the HW flow state.
-				 */
-				reg = read_uctxt_csr(dd, rcd->ctxt,
-						     RCV_TID_FLOW_TABLE +
-						     (8 * flow->idx));
-				last_psn = mask_psn(reg);
-
+				last_psn = read_r_next_psn(dd, rcd->ctxt,
+							   flow->idx);
 				priv->flow_state.r_next_psn = last_psn;
 				priv->flow_state.flags |= TID_FLOW_SW_PSN;
 				/*
@@ -2968,17 +2961,10 @@ bool hfi1_handle_kdeth_eflags(struct hfi1_ctxtdata *rcd,
 		switch (rte) {
 		case RHF_RTE_EXPECTED_FLOW_SEQ_ERR:
 			if (!(qpriv->s_flags & HFI1_R_TID_SW_PSN)) {
-				u64 reg;
-
 				qpriv->s_flags |= HFI1_R_TID_SW_PSN;
-				/*
-				 * The only sane way to get the amount of
-				 * progress is to read the HW flow state.
-				 */
-				reg = read_uctxt_csr(dd, rcd->ctxt,
-						     RCV_TID_FLOW_TABLE +
-						     (8 * flow->idx));
-				flow->flow_state.r_next_psn = mask_psn(reg);
+				flow->flow_state.r_next_psn =
+					read_r_next_psn(dd, rcd->ctxt,
+							flow->idx);
 				qpriv->r_next_psn_kdeth =
 					flow->flow_state.r_next_psn;
 				goto nak_psn;
@@ -5456,3 +5442,15 @@ bool hfi1_tid_rdma_ack_interlock(struct rvt_qp *qp, struct rvt_ack_entry *e)
 	}
 	return false;
 }
+
+static u32 read_r_next_psn(struct hfi1_devdata *dd, u8 ctxt, u8 fidx)
+{
+	u64 reg;
+
+	/*
+	 * The only sane way to get the amount of
+	 * progress is to read the HW flow state.
+	 */
+	reg = read_uctxt_csr(dd, ctxt, RCV_TID_FLOW_TABLE + (8 * fidx));
+	return mask_psn(reg);
+}
-- 
cgit v1.2.3-59-g8ed1b


From b885d5be9ca10dff6110a8738c45eb4b3fb5a40a Mon Sep 17 00:00:00 2001
From: Kaike Wan <kaike.wan@intel.com>
Date: Mon, 18 Mar 2019 09:58:50 -0700
Subject: IB/hfi1: Unify the software PSN check for TID RDMA READ/WRITE

For expected packet receiving, the hfi1 hardware checks the KDETH PSN
automatically. However, when sequence error occurs, the hfi1 driver can
check the sequence instead until the hardware flow generation is reloaded.

TID RDMA READ and WRITE protocols implement similar software checking
mechanisms, but with different flags and different local variables to
store next expected PSN.

Unify the handling by using only one set of flag and local variable for
both TID RDMA READ and WRITE protocols.

Reviewed-by: Mike Marciniszyn <mike.marciniszyn@intel.com>
Reviewed-by: Dennis Dalessandro <dennis.dalessandro@intel.com>
Reviewed-by: Michael J. Ruhl <michael.j.ruhl@intel.com>
Signed-off-by: Kaike Wan <kaike.wan@intel.com>
Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/hfi1/tid_rdma.c  | 45 +++++++++++++++++++++-------------
 drivers/infiniband/hw/hfi1/tid_rdma.h  |  2 --
 drivers/infiniband/hw/hfi1/trace_tid.h | 12 +++------
 3 files changed, 31 insertions(+), 28 deletions(-)

(limited to 'drivers/infiniband/hw/hfi1')

diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.c b/drivers/infiniband/hw/hfi1/tid_rdma.c
index d540e9cffa37..9ade7d3954b0 100644
--- a/drivers/infiniband/hw/hfi1/tid_rdma.c
+++ b/drivers/infiniband/hw/hfi1/tid_rdma.c
@@ -67,8 +67,6 @@ static u32 mask_generation(u32 a)
 #define TID_RDMA_DESTQP_FLOW_SHIFT      11
 #define TID_RDMA_DESTQP_FLOW_MASK       0x1f
 
-#define TID_FLOW_SW_PSN BIT(0)
-
 #define TID_OPFN_QP_CTXT_MASK 0xff
 #define TID_OPFN_QP_CTXT_SHIFT 56
 #define TID_OPFN_QP_KDETH_MASK 0xff
@@ -777,7 +775,6 @@ int hfi1_kern_setup_hw_flow(struct hfi1_ctxtdata *rcd, struct rvt_qp *qp)
 		rcd->flows[fs->index].generation = fs->generation;
 	fs->generation = kern_setup_hw_flow(rcd, fs->index);
 	fs->psn = 0;
-	fs->flags = 0;
 	dequeue_tid_waiter(rcd, &rcd->flow_queue, qp);
 	/* get head before dropping lock */
 	fqp = first_qp(rcd, &rcd->flow_queue);
@@ -1808,6 +1805,7 @@ sync_check:
 			goto done;
 
 		hfi1_kern_clear_hw_flow(req->rcd, qp);
+		qpriv->s_flags &= ~HFI1_R_TID_SW_PSN;
 		req->state = TID_REQUEST_ACTIVE;
 	}
 
@@ -2476,8 +2474,13 @@ void hfi1_rc_rcv_tid_rdma_read_resp(struct hfi1_packet *packet)
 
 	flow = &req->flows[req->clear_tail];
 	/* When header suppression is disabled */
-	if (cmp_psn(ipsn, flow->flow_state.ib_lpsn))
+	if (cmp_psn(ipsn, flow->flow_state.ib_lpsn)) {
+		if (cmp_psn(kpsn, flow->flow_state.r_next_psn))
+			goto ack_done;
+		flow->flow_state.r_next_psn = mask_psn(kpsn + 1);
 		goto ack_done;
+	}
+	flow->flow_state.r_next_psn = mask_psn(kpsn + 1);
 	req->ack_pending--;
 	priv->pending_tid_r_segs--;
 	qp->s_num_rd_atomic--;
@@ -2519,6 +2522,7 @@ void hfi1_rc_rcv_tid_rdma_read_resp(struct hfi1_packet *packet)
 	     req->comp_seg == req->cur_seg) ||
 	    priv->tid_r_comp == priv->tid_r_reqs) {
 		hfi1_kern_clear_hw_flow(priv->rcd, qp);
+		priv->s_flags &= ~HFI1_R_TID_SW_PSN;
 		if (req->state == TID_REQUEST_SYNC)
 			req->state = TID_REQUEST_ACTIVE;
 	}
@@ -2768,9 +2772,9 @@ static bool handle_read_kdeth_eflags(struct hfi1_ctxtdata *rcd,
 				rvt_rc_error(qp, IB_WC_LOC_QP_OP_ERR);
 				return ret;
 			}
-			if (priv->flow_state.flags & TID_FLOW_SW_PSN) {
+			if (priv->s_flags & HFI1_R_TID_SW_PSN) {
 				diff = cmp_psn(psn,
-					       priv->flow_state.r_next_psn);
+					       flow->flow_state.r_next_psn);
 				if (diff > 0) {
 					if (!(qp->r_flags & RVT_R_RDMAR_SEQ))
 						restart_tid_rdma_read_req(rcd,
@@ -2806,14 +2810,15 @@ static bool handle_read_kdeth_eflags(struct hfi1_ctxtdata *rcd,
 						qp->r_flags &=
 							~RVT_R_RDMAR_SEQ;
 				}
-				priv->flow_state.r_next_psn++;
+				flow->flow_state.r_next_psn =
+					mask_psn(psn + 1);
 			} else {
 				u32 last_psn;
 
 				last_psn = read_r_next_psn(dd, rcd->ctxt,
 							   flow->idx);
-				priv->flow_state.r_next_psn = last_psn;
-				priv->flow_state.flags |= TID_FLOW_SW_PSN;
+				flow->flow_state.r_next_psn = last_psn;
+				priv->s_flags |= HFI1_R_TID_SW_PSN;
 				/*
 				 * If no request has been restarted yet,
 				 * restart the current one.
@@ -2878,6 +2883,7 @@ bool hfi1_handle_kdeth_eflags(struct hfi1_ctxtdata *rcd,
 	struct rvt_ack_entry *e;
 	struct tid_rdma_request *req;
 	struct tid_rdma_flow *flow;
+	int diff = 0;
 
 	trace_hfi1_msg_handle_kdeth_eflags(NULL, "Kdeth error: rhf ",
 					   packet->rhf);
@@ -2977,10 +2983,12 @@ bool hfi1_handle_kdeth_eflags(struct hfi1_ctxtdata *rcd,
 				 * mismatch could be due to packets that were
 				 * already in flight.
 				 */
-				if (psn != flow->flow_state.r_next_psn) {
-					psn = flow->flow_state.r_next_psn;
+				diff = cmp_psn(psn,
+					       flow->flow_state.r_next_psn);
+				if (diff > 0)
 					goto nak_psn;
-				}
+				else if (diff < 0)
+					break;
 
 				qpriv->s_nak_state = 0;
 				/*
@@ -2991,8 +2999,10 @@ bool hfi1_handle_kdeth_eflags(struct hfi1_ctxtdata *rcd,
 				if (psn == full_flow_psn(flow,
 							 flow->flow_state.lpsn))
 					ret = false;
+				flow->flow_state.r_next_psn =
+					mask_psn(psn + 1);
 				qpriv->r_next_psn_kdeth =
-					++flow->flow_state.r_next_psn;
+					flow->flow_state.r_next_psn;
 			}
 			break;
 
@@ -3497,8 +3507,10 @@ static void hfi1_tid_write_alloc_resources(struct rvt_qp *qp, bool intr_ctx)
 		if (qpriv->r_tid_alloc == qpriv->r_tid_head) {
 			/* If all data has been received, clear the flow */
 			if (qpriv->flow_state.index < RXE_NUM_TID_FLOWS &&
-			    !qpriv->alloc_w_segs)
+			    !qpriv->alloc_w_segs) {
 				hfi1_kern_clear_hw_flow(rcd, qp);
+				qpriv->s_flags &= ~HFI1_R_TID_SW_PSN;
+			}
 			break;
 		}
 
@@ -3524,8 +3536,7 @@ static void hfi1_tid_write_alloc_resources(struct rvt_qp *qp, bool intr_ctx)
 		if (qpriv->sync_pt && !qpriv->alloc_w_segs) {
 			hfi1_kern_clear_hw_flow(rcd, qp);
 			qpriv->sync_pt = false;
-			if (qpriv->s_flags & HFI1_R_TID_SW_PSN)
-				qpriv->s_flags &= ~HFI1_R_TID_SW_PSN;
+			qpriv->s_flags &= ~HFI1_R_TID_SW_PSN;
 		}
 
 		/* Allocate flow if we don't have one */
@@ -4299,7 +4310,7 @@ void hfi1_rc_rcv_tid_rdma_write_data(struct hfi1_packet *packet)
 	if (cmp_psn(psn, full_flow_psn(flow, flow->flow_state.lpsn))) {
 		if (cmp_psn(psn, flow->flow_state.r_next_psn))
 			goto send_nak;
-		flow->flow_state.r_next_psn++;
+		flow->flow_state.r_next_psn = mask_psn(psn + 1);
 		goto exit;
 	}
 	flow->flow_state.r_next_psn = mask_psn(psn + 1);
diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.h b/drivers/infiniband/hw/hfi1/tid_rdma.h
index 53ab24ef4f02..1c536185261e 100644
--- a/drivers/infiniband/hw/hfi1/tid_rdma.h
+++ b/drivers/infiniband/hw/hfi1/tid_rdma.h
@@ -76,10 +76,8 @@ struct tid_rdma_qp_params {
 struct tid_flow_state {
 	u32 generation;
 	u32 psn;
-	u32 r_next_psn;      /* next PSN to be received (in TID space) */
 	u8 index;
 	u8 last_index;
-	u8 flags;
 };
 
 enum tid_rdma_req_state {
diff --git a/drivers/infiniband/hw/hfi1/trace_tid.h b/drivers/infiniband/hw/hfi1/trace_tid.h
index 548dfc45a407..4388b594ed1b 100644
--- a/drivers/infiniband/hw/hfi1/trace_tid.h
+++ b/drivers/infiniband/hw/hfi1/trace_tid.h
@@ -53,7 +53,7 @@ u16 hfi1_trace_get_tid_idx(u32 ent);
 			    "tid_r_comp %u pending_tid_r_segs %u " \
 			    "s_flags 0x%x ps_flags 0x%x iow_flags 0x%lx " \
 			    "s_state 0x%x hw_flow_index %u generation 0x%x " \
-			    "fpsn 0x%x flow_flags 0x%x"
+			    "fpsn 0x%x"
 
 #define TID_REQ_PRN "[%s] qpn 0x%x newreq %u opcode 0x%x psn 0x%x lpsn 0x%x " \
 		    "cur_seg %u comp_seg %u ack_seg %u alloc_seg %u " \
@@ -71,7 +71,7 @@ u16 hfi1_trace_get_tid_idx(u32 ent);
 			    "pending_tid_w_segs %u sync_pt %s " \
 			    "ps_nak_psn 0x%x ps_nak_state 0x%x " \
 			    "prnr_nak_state 0x%x hw_flow_index %u generation "\
-			    "0x%x fpsn 0x%x flow_flags 0x%x resync %s" \
+			    "0x%x fpsn 0x%x resync %s" \
 			    "r_next_psn_kdeth 0x%x"
 
 #define TID_WRITE_SENDER_PRN "[%s] qpn 0x%x newreq %u s_tid_cur %u " \
@@ -973,7 +973,6 @@ DECLARE_EVENT_CLASS(/* tid_read_sender */
 		__field(u32, hw_flow_index)
 		__field(u32, generation)
 		__field(u32, fpsn)
-		__field(u32, flow_flags)
 	),
 	TP_fast_assign(/* assign */
 		struct hfi1_qp_priv *priv = qp->priv;
@@ -991,7 +990,6 @@ DECLARE_EVENT_CLASS(/* tid_read_sender */
 		__entry->hw_flow_index = priv->flow_state.index;
 		__entry->generation = priv->flow_state.generation;
 		__entry->fpsn = priv->flow_state.psn;
-		__entry->flow_flags = priv->flow_state.flags;
 	),
 	TP_printk(/* print */
 		TID_READ_SENDER_PRN,
@@ -1007,8 +1005,7 @@ DECLARE_EVENT_CLASS(/* tid_read_sender */
 		__entry->s_state,
 		__entry->hw_flow_index,
 		__entry->generation,
-		__entry->fpsn,
-		__entry->flow_flags
+		__entry->fpsn
 	)
 );
 
@@ -1338,7 +1335,6 @@ DECLARE_EVENT_CLASS(/* tid_write_sp */
 		__field(u32, hw_flow_index)
 		__field(u32, generation)
 		__field(u32, fpsn)
-		__field(u32, flow_flags)
 		__field(bool, resync)
 		__field(u32, r_next_psn_kdeth)
 	),
@@ -1360,7 +1356,6 @@ DECLARE_EVENT_CLASS(/* tid_write_sp */
 		__entry->hw_flow_index = priv->flow_state.index;
 		__entry->generation = priv->flow_state.generation;
 		__entry->fpsn = priv->flow_state.psn;
-		__entry->flow_flags = priv->flow_state.flags;
 		__entry->resync = priv->resync;
 		__entry->r_next_psn_kdeth = priv->r_next_psn_kdeth;
 	),
@@ -1381,7 +1376,6 @@ DECLARE_EVENT_CLASS(/* tid_write_sp */
 		__entry->hw_flow_index,
 		__entry->generation,
 		__entry->fpsn,
-		__entry->flow_flags,
 		__entry->resync ? "yes" : "no",
 		__entry->r_next_psn_kdeth
 	)
-- 
cgit v1.2.3-59-g8ed1b


From 8da0f0f26f80612efadc23beb72d5b66a498a386 Mon Sep 17 00:00:00 2001
From: Kaike Wan <kaike.wan@intel.com>
Date: Mon, 18 Mar 2019 09:59:00 -0700
Subject: IB/hfi1: Remove WARN_ON when freeing expected receive groups

When PSM user receive context is freed, the expected receive groups
allocated by the receive context will also been freed. However, if there
are still TID entries in use, the receive groups rcd->tid_full_list or
rcd->tid_used_list will not be empty, and thus triggering the WARN_ONs in
the function hfi1_free_ctxt_rcv_groups().  Even if the two lists may not
be empty, the hfi1 driver will free all TID entries and receive groups
associated with the receive context to prevent any resource leakage. Since
a clean user application exit is not controlled by the hfi1 driver, this
patch will remove the WARN_ONs in hfi1_free_ctxt_rcv_groups().

Reviewed-by: Mike Marciniszyn <mike.marciniszyn@intel.com>
Reviewed-by: Michael J. Ruhl <michael.j.ruhl@intel.com>
Reviewed-by: Dennis Dalessandro <dennis.dalessandro@intel.com>
Signed-off-by: Kaike Wan <kaike.wan@intel.com>
Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/hfi1/exp_rcv.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'drivers/infiniband/hw/hfi1')

diff --git a/drivers/infiniband/hw/hfi1/exp_rcv.c b/drivers/infiniband/hw/hfi1/exp_rcv.c
index 1be49a0d9c11..e9d5cc8b771a 100644
--- a/drivers/infiniband/hw/hfi1/exp_rcv.c
+++ b/drivers/infiniband/hw/hfi1/exp_rcv.c
@@ -112,9 +112,6 @@ int hfi1_alloc_ctxt_rcv_groups(struct hfi1_ctxtdata *rcd)
  */
 void hfi1_free_ctxt_rcv_groups(struct hfi1_ctxtdata *rcd)
 {
-	WARN_ON(!EXP_TID_SET_EMPTY(rcd->tid_full_list));
-	WARN_ON(!EXP_TID_SET_EMPTY(rcd->tid_used_list));
-
 	kfree(rcd->groups);
 	rcd->groups = NULL;
 	hfi1_exp_tid_group_init(rcd);
-- 
cgit v1.2.3-59-g8ed1b


From 747b931fbe2362366dee30617f816501f3126882 Mon Sep 17 00:00:00 2001
From: Kaike Wan <kaike.wan@intel.com>
Date: Mon, 18 Mar 2019 12:20:59 -0700
Subject: IB/hfi1: Implement CCA for TID RDMA protocol

Currently, FECN handling is not implemented on TID RDMA expected receive
packets and therefore CCA can't be turned on when TID RDMA is
enabled. This patch adds the CCA support to TID RDMA protocol by:

- modifying FECN RSM rule to include kernel receive contexts

- For TID_RDMA READ RESP or TID RDMA ACK packet, a CNP will be sent out if
  the FECN bit is set. For other TID RDMA packets that generate at least
  one response packet, the BECN bit will be set in the first response
  packet

- Copying expected packet data to destination buffer when FECN bit is set
  in the TID RDMA READ RESP or TID RDMA WRITE DATA packet. In this case,
  the expected packet is received as an eager packet

- Handling the TID sequence error for subsequent normal expected packets.

Reviewed-by: Mike Marciniszyn <mike.marciniszyn@intel.com>
Reviewed-by: Dennis Dalessandro <dennis.dalessandro@intel.com>
Reviewed-by: Michael J. Ruhl <michael.j.ruhl@intel.com>
Signed-off-by: Kaike Wan <kaike.wan@intel.com>
Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/hfi1/chip.c     |  35 ++++---
 drivers/infiniband/hw/hfi1/driver.c   |   4 +-
 drivers/infiniband/hw/hfi1/tid_rdma.c | 173 ++++++++++++++++++++++++++++------
 3 files changed, 167 insertions(+), 45 deletions(-)

(limited to 'drivers/infiniband/hw/hfi1')

diff --git a/drivers/infiniband/hw/hfi1/chip.c b/drivers/infiniband/hw/hfi1/chip.c
index 9784c6c0d2ec..16861d9ba1b7 100644
--- a/drivers/infiniband/hw/hfi1/chip.c
+++ b/drivers/infiniband/hw/hfi1/chip.c
@@ -13297,15 +13297,18 @@ static int set_up_context_variables(struct hfi1_devdata *dd)
 	/*
 	 * The RMT entries are currently allocated as shown below:
 	 * 1. QOS (0 to 128 entries);
-	 * 2. FECN for PSM (num_user_contexts + num_vnic_contexts);
+	 * 2. FECN (num_kernel_context - 1 + num_user_contexts +
+	 *    num_vnic_contexts);
 	 * 3. VNIC (num_vnic_contexts).
-	 * It should be noted that PSM FECN oversubscribe num_vnic_contexts
+	 * It should be noted that FECN oversubscribe num_vnic_contexts
 	 * entries of RMT because both VNIC and PSM could allocate any receive
 	 * context between dd->first_dyn_alloc_text and dd->num_rcv_contexts,
 	 * and PSM FECN must reserve an RMT entry for each possible PSM receive
 	 * context.
 	 */
 	rmt_count = qos_rmt_entries(dd, NULL, NULL) + (num_vnic_contexts * 2);
+	if (HFI1_CAP_IS_KSET(TID_RDMA))
+		rmt_count += num_kernel_contexts - 1;
 	if (rmt_count + n_usr_ctxts > NUM_MAP_ENTRIES) {
 		user_rmt_reduced = NUM_MAP_ENTRIES - rmt_count;
 		dd_dev_err(dd,
@@ -14288,37 +14291,43 @@ bail:
 	init_qpmap_table(dd, FIRST_KERNEL_KCTXT, dd->n_krcv_queues - 1);
 }
 
-static void init_user_fecn_handling(struct hfi1_devdata *dd,
-				    struct rsm_map_table *rmt)
+static void init_fecn_handling(struct hfi1_devdata *dd,
+			       struct rsm_map_table *rmt)
 {
 	struct rsm_rule_data rrd;
 	u64 reg;
-	int i, idx, regoff, regidx;
+	int i, idx, regoff, regidx, start;
 	u8 offset;
 	u32 total_cnt;
 
+	if (HFI1_CAP_IS_KSET(TID_RDMA))
+		/* Exclude context 0 */
+		start = 1;
+	else
+		start = dd->first_dyn_alloc_ctxt;
+
+	total_cnt = dd->num_rcv_contexts - start;
+
 	/* there needs to be enough room in the map table */
-	total_cnt = dd->num_rcv_contexts - dd->first_dyn_alloc_ctxt;
 	if (rmt->used + total_cnt >= NUM_MAP_ENTRIES) {
-		dd_dev_err(dd, "User FECN handling disabled - too many user contexts allocated\n");
+		dd_dev_err(dd, "FECN handling disabled - too many contexts allocated\n");
 		return;
 	}
 
 	/*
 	 * RSM will extract the destination context as an index into the
 	 * map table.  The destination contexts are a sequential block
-	 * in the range first_dyn_alloc_ctxt...num_rcv_contexts-1 (inclusive).
+	 * in the range start...num_rcv_contexts-1 (inclusive).
 	 * Map entries are accessed as offset + extracted value.  Adjust
 	 * the added offset so this sequence can be placed anywhere in
 	 * the table - as long as the entries themselves do not wrap.
 	 * There are only enough bits in offset for the table size, so
 	 * start with that to allow for a "negative" offset.
 	 */
-	offset = (u8)(NUM_MAP_ENTRIES + (int)rmt->used -
-						(int)dd->first_dyn_alloc_ctxt);
+	offset = (u8)(NUM_MAP_ENTRIES + rmt->used - start);
 
-	for (i = dd->first_dyn_alloc_ctxt, idx = rmt->used;
-				i < dd->num_rcv_contexts; i++, idx++) {
+	for (i = start, idx = rmt->used; i < dd->num_rcv_contexts;
+	     i++, idx++) {
 		/* replace with identity mapping */
 		regoff = (idx % 8) * 8;
 		regidx = idx / 8;
@@ -14440,7 +14449,7 @@ static void init_rxe(struct hfi1_devdata *dd)
 	rmt = alloc_rsm_map_table(dd);
 	/* set up QOS, including the QPN map table */
 	init_qos(dd, rmt);
-	init_user_fecn_handling(dd, rmt);
+	init_fecn_handling(dd, rmt);
 	complete_rsm_map_table(dd, rmt);
 	/* record number of used rsm map entries for vnic */
 	dd->vnic.rmt_start = rmt->used;
diff --git a/drivers/infiniband/hw/hfi1/driver.c b/drivers/infiniband/hw/hfi1/driver.c
index 2a9d2912f5db..1f4fe0269b51 100644
--- a/drivers/infiniband/hw/hfi1/driver.c
+++ b/drivers/infiniband/hw/hfi1/driver.c
@@ -516,7 +516,9 @@ bool hfi1_process_ecn_slowpath(struct rvt_qp *qp, struct hfi1_packet *pkt,
 	 */
 	do_cnp = prescan ||
 		(opcode >= IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST &&
-		 opcode <= IB_OPCODE_RC_ATOMIC_ACKNOWLEDGE);
+		 opcode <= IB_OPCODE_RC_ATOMIC_ACKNOWLEDGE) ||
+		opcode == TID_OP(READ_RESP) ||
+		opcode == TID_OP(ACK);
 
 	/* Call appropriate CNP handler */
 	if (!ignore_fecn && do_cnp && fecn)
diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.c b/drivers/infiniband/hw/hfi1/tid_rdma.c
index 9ade7d3954b0..eae6f05ca2fa 100644
--- a/drivers/infiniband/hw/hfi1/tid_rdma.c
+++ b/drivers/infiniband/hw/hfi1/tid_rdma.c
@@ -127,6 +127,14 @@ static int make_tid_rdma_ack(struct rvt_qp *qp,
 			     struct hfi1_pkt_state *ps);
 static void hfi1_do_tid_send(struct rvt_qp *qp);
 static u32 read_r_next_psn(struct hfi1_devdata *dd, u8 ctxt, u8 fidx);
+static void tid_rdma_rcv_err(struct hfi1_packet *packet,
+			     struct ib_other_headers *ohdr,
+			     struct rvt_qp *qp, u32 psn, int diff, bool fecn);
+static void update_r_next_psn_fecn(struct hfi1_packet *packet,
+				   struct hfi1_qp_priv *priv,
+				   struct hfi1_ctxtdata *rcd,
+				   struct tid_rdma_flow *flow,
+				   bool fecn);
 
 static u64 tid_rdma_opfn_encode(struct tid_rdma_params *p)
 {
@@ -2234,7 +2242,7 @@ void hfi1_rc_rcv_tid_rdma_read_req(struct hfi1_packet *packet)
 	struct ib_reth *reth;
 	struct hfi1_qp_priv *qpriv = qp->priv;
 	u32 bth0, psn, len, rkey;
-	bool is_fecn;
+	bool fecn;
 	u8 next;
 	u64 vaddr;
 	int diff;
@@ -2244,7 +2252,7 @@ void hfi1_rc_rcv_tid_rdma_read_req(struct hfi1_packet *packet)
 	if (hfi1_ruc_check_hdr(ibp, packet))
 		return;
 
-	is_fecn = process_ecn(qp, packet);
+	fecn = process_ecn(qp, packet);
 	psn = mask_psn(be32_to_cpu(ohdr->bth[2]));
 	trace_hfi1_rsp_rcv_tid_read_req(qp, psn);
 
@@ -2263,9 +2271,8 @@ void hfi1_rc_rcv_tid_rdma_read_req(struct hfi1_packet *packet)
 
 	diff = delta_psn(psn, qp->r_psn);
 	if (unlikely(diff)) {
-		if (tid_rdma_rcv_error(packet, ohdr, qp, psn, diff))
-			return;
-		goto send_ack;
+		tid_rdma_rcv_err(packet, ohdr, qp, psn, diff, fecn);
+		return;
 	}
 
 	/* We've verified the request, insert it into the ack queue. */
@@ -2317,11 +2324,11 @@ void hfi1_rc_rcv_tid_rdma_read_req(struct hfi1_packet *packet)
 
 	/* Schedule the send tasklet. */
 	qp->s_flags |= RVT_S_RESP_PENDING;
+	if (fecn)
+		qp->s_flags |= RVT_S_ECN;
 	hfi1_schedule_send(qp);
 
 	spin_unlock_irqrestore(&qp->s_lock, flags);
-	if (is_fecn)
-		goto send_ack;
 	return;
 
 nack_inv_unlock:
@@ -2338,8 +2345,6 @@ nack_acc:
 	rvt_rc_error(qp, IB_WC_LOC_PROT_ERR);
 	qp->r_nak_state = IB_NAK_REMOTE_ACCESS_ERROR;
 	qp->r_ack_psn = qp->r_psn;
-send_ack:
-	hfi1_send_rc_ack(packet, is_fecn);
 }
 
 u32 hfi1_build_tid_rdma_read_resp(struct rvt_qp *qp, struct rvt_ack_entry *e,
@@ -2456,12 +2461,12 @@ void hfi1_rc_rcv_tid_rdma_read_resp(struct hfi1_packet *packet)
 	struct tid_rdma_request *req;
 	struct tid_rdma_flow *flow;
 	u32 opcode, aeth;
-	bool is_fecn;
+	bool fecn;
 	unsigned long flags;
 	u32 kpsn, ipsn;
 
 	trace_hfi1_sender_rcv_tid_read_resp(qp);
-	is_fecn = process_ecn(qp, packet);
+	fecn = process_ecn(qp, packet);
 	kpsn = mask_psn(be32_to_cpu(ohdr->bth[2]));
 	aeth = be32_to_cpu(ohdr->u.tid_rdma.r_rsp.aeth);
 	opcode = (be32_to_cpu(ohdr->bth[0]) >> 24) & 0xff;
@@ -2475,9 +2480,39 @@ void hfi1_rc_rcv_tid_rdma_read_resp(struct hfi1_packet *packet)
 	flow = &req->flows[req->clear_tail];
 	/* When header suppression is disabled */
 	if (cmp_psn(ipsn, flow->flow_state.ib_lpsn)) {
+		update_r_next_psn_fecn(packet, priv, rcd, flow, fecn);
+
 		if (cmp_psn(kpsn, flow->flow_state.r_next_psn))
 			goto ack_done;
 		flow->flow_state.r_next_psn = mask_psn(kpsn + 1);
+		/*
+		 * Copy the payload to destination buffer if this packet is
+		 * delivered as an eager packet due to RSM rule and FECN.
+		 * The RSM rule selects FECN bit in BTH and SH bit in
+		 * KDETH header and therefore will not match the last
+		 * packet of each segment that has SH bit cleared.
+		 */
+		if (fecn && packet->etype == RHF_RCV_TYPE_EAGER) {
+			struct rvt_sge_state ss;
+			u32 len;
+			u32 tlen = packet->tlen;
+			u16 hdrsize = packet->hlen;
+			u8 pad = packet->pad;
+			u8 extra_bytes = pad + packet->extra_byte +
+				(SIZE_OF_CRC << 2);
+			u32 pmtu = qp->pmtu;
+
+			if (unlikely(tlen != (hdrsize + pmtu + extra_bytes)))
+				goto ack_op_err;
+			len = restart_sge(&ss, req->e.swqe, ipsn, pmtu);
+			if (unlikely(len < pmtu))
+				goto ack_op_err;
+			rvt_copy_sge(qp, &ss, packet->payload, pmtu, false,
+				     false);
+			/* Raise the sw sequence check flag for next packet */
+			priv->s_flags |= HFI1_R_TID_SW_PSN;
+		}
+
 		goto ack_done;
 	}
 	flow->flow_state.r_next_psn = mask_psn(kpsn + 1);
@@ -2544,8 +2579,6 @@ ack_op_err:
 
 ack_done:
 	spin_unlock_irqrestore(&qp->s_lock, flags);
-	if (is_fecn)
-		hfi1_send_rc_ack(packet, is_fecn);
 }
 
 void hfi1_kern_read_tid_flow_free(struct rvt_qp *qp)
@@ -3678,7 +3711,7 @@ void hfi1_rc_rcv_tid_rdma_write_req(struct hfi1_packet *packet)
 	struct hfi1_qp_priv *qpriv = qp->priv;
 	struct tid_rdma_request *req;
 	u32 bth0, psn, len, rkey, num_segs;
-	bool is_fecn;
+	bool fecn;
 	u8 next;
 	u64 vaddr;
 	int diff;
@@ -3687,7 +3720,7 @@ void hfi1_rc_rcv_tid_rdma_write_req(struct hfi1_packet *packet)
 	if (hfi1_ruc_check_hdr(ibp, packet))
 		return;
 
-	is_fecn = process_ecn(qp, packet);
+	fecn = process_ecn(qp, packet);
 	psn = mask_psn(be32_to_cpu(ohdr->bth[2]));
 	trace_hfi1_rsp_rcv_tid_write_req(qp, psn);
 
@@ -3704,9 +3737,8 @@ void hfi1_rc_rcv_tid_rdma_write_req(struct hfi1_packet *packet)
 	num_segs = DIV_ROUND_UP(len, qpriv->tid_rdma.local.max_len);
 	diff = delta_psn(psn, qp->r_psn);
 	if (unlikely(diff)) {
-		if (tid_rdma_rcv_error(packet, ohdr, qp, psn, diff))
-			return;
-		goto send_ack;
+		tid_rdma_rcv_err(packet, ohdr, qp, psn, diff, fecn);
+		return;
 	}
 
 	/*
@@ -3822,11 +3854,11 @@ update_head:
 
 	/* Schedule the send tasklet. */
 	qp->s_flags |= RVT_S_RESP_PENDING;
+	if (fecn)
+		qp->s_flags |= RVT_S_ECN;
 	hfi1_schedule_send(qp);
 
 	spin_unlock_irqrestore(&qp->s_lock, flags);
-	if (is_fecn)
-		goto send_ack;
 	return;
 
 nack_inv_unlock:
@@ -3843,8 +3875,6 @@ nack_acc:
 	rvt_rc_error(qp, IB_WC_LOC_PROT_ERR);
 	qp->r_nak_state = IB_NAK_REMOTE_ACCESS_ERROR;
 	qp->r_ack_psn = qp->r_psn;
-send_ack:
-	hfi1_send_rc_ack(packet, is_fecn);
 }
 
 u32 hfi1_build_tid_rdma_write_resp(struct rvt_qp *qp, struct rvt_ack_entry *e,
@@ -4061,10 +4091,10 @@ void hfi1_rc_rcv_tid_rdma_write_resp(struct hfi1_packet *packet)
 	struct tid_rdma_flow *flow;
 	enum ib_wc_status status;
 	u32 opcode, aeth, psn, flow_psn, i, tidlen = 0, pktlen;
-	bool is_fecn;
+	bool fecn;
 	unsigned long flags;
 
-	is_fecn = process_ecn(qp, packet);
+	fecn = process_ecn(qp, packet);
 	psn = mask_psn(be32_to_cpu(ohdr->bth[2]));
 	aeth = be32_to_cpu(ohdr->u.tid_rdma.w_rsp.aeth);
 	opcode = (be32_to_cpu(ohdr->bth[0]) >> 24) & 0xff;
@@ -4204,7 +4234,6 @@ void hfi1_rc_rcv_tid_rdma_write_resp(struct hfi1_packet *packet)
 		qpriv->s_tid_cur = i;
 	}
 	qp->s_flags &= ~HFI1_S_WAIT_TID_RESP;
-
 	hfi1_schedule_tid_send(qp);
 	goto ack_done;
 
@@ -4213,9 +4242,9 @@ ack_op_err:
 ack_err:
 	rvt_error_qp(qp, status);
 ack_done:
+	if (fecn)
+		qp->s_flags |= RVT_S_ECN;
 	spin_unlock_irqrestore(&qp->s_lock, flags);
-	if (is_fecn)
-		hfi1_send_rc_ack(packet, is_fecn);
 }
 
 bool hfi1_build_tid_rdma_packet(struct rvt_swqe *wqe,
@@ -4295,7 +4324,9 @@ void hfi1_rc_rcv_tid_rdma_write_data(struct hfi1_packet *packet)
 	unsigned long flags;
 	u32 psn, next;
 	u8 opcode;
+	bool fecn;
 
+	fecn = process_ecn(qp, packet);
 	psn = mask_psn(be32_to_cpu(ohdr->bth[2]));
 	opcode = (be32_to_cpu(ohdr->bth[0]) >> 24) & 0xff;
 
@@ -4308,9 +4339,53 @@ void hfi1_rc_rcv_tid_rdma_write_data(struct hfi1_packet *packet)
 	req = ack_to_tid_req(e);
 	flow = &req->flows[req->clear_tail];
 	if (cmp_psn(psn, full_flow_psn(flow, flow->flow_state.lpsn))) {
+		update_r_next_psn_fecn(packet, priv, rcd, flow, fecn);
+
 		if (cmp_psn(psn, flow->flow_state.r_next_psn))
 			goto send_nak;
+
 		flow->flow_state.r_next_psn = mask_psn(psn + 1);
+		/*
+		 * Copy the payload to destination buffer if this packet is
+		 * delivered as an eager packet due to RSM rule and FECN.
+		 * The RSM rule selects FECN bit in BTH and SH bit in
+		 * KDETH header and therefore will not match the last
+		 * packet of each segment that has SH bit cleared.
+		 */
+		if (fecn && packet->etype == RHF_RCV_TYPE_EAGER) {
+			struct rvt_sge_state ss;
+			u32 len;
+			u32 tlen = packet->tlen;
+			u16 hdrsize = packet->hlen;
+			u8 pad = packet->pad;
+			u8 extra_bytes = pad + packet->extra_byte +
+				(SIZE_OF_CRC << 2);
+			u32 pmtu = qp->pmtu;
+
+			if (unlikely(tlen != (hdrsize + pmtu + extra_bytes)))
+				goto send_nak;
+			len = req->comp_seg * req->seg_len;
+			len += delta_psn(psn,
+				full_flow_psn(flow, flow->flow_state.spsn)) *
+				pmtu;
+			if (unlikely(req->total_len - len < pmtu))
+				goto send_nak;
+
+			/*
+			 * The e->rdma_sge field is set when TID RDMA WRITE REQ
+			 * is first received and is never modified thereafter.
+			 */
+			ss.sge = e->rdma_sge;
+			ss.sg_list = NULL;
+			ss.num_sge = 1;
+			ss.total_len = req->total_len;
+			rvt_skip_sge(&ss, len, false);
+			rvt_copy_sge(qp, &ss, packet->payload, pmtu, false,
+				     false);
+			/* Raise the sw sequence check flag for next packet */
+			priv->r_next_psn_kdeth = mask_psn(psn + 1);
+			priv->s_flags |= HFI1_R_TID_SW_PSN;
+		}
 		goto exit;
 	}
 	flow->flow_state.r_next_psn = mask_psn(psn + 1);
@@ -4375,6 +4450,8 @@ done:
 	hfi1_schedule_tid_send(qp);
 exit:
 	priv->r_next_psn_kdeth = flow->flow_state.r_next_psn;
+	if (fecn)
+		qp->s_flags |= RVT_S_ECN;
 	spin_unlock_irqrestore(&qp->s_lock, flags);
 	return;
 
@@ -4476,12 +4553,11 @@ void hfi1_rc_rcv_tid_rdma_ack(struct hfi1_packet *packet)
 	struct tid_rdma_request *req;
 	struct tid_rdma_flow *flow;
 	u32 aeth, psn, req_psn, ack_psn, fspsn, resync_psn, ack_kpsn;
-	bool is_fecn;
 	unsigned long flags;
 	u16 fidx;
 
 	trace_hfi1_tid_write_sender_rcv_tid_ack(qp, 0);
-	is_fecn = process_ecn(qp, packet);
+	process_ecn(qp, packet);
 	psn = mask_psn(be32_to_cpu(ohdr->bth[2]));
 	aeth = be32_to_cpu(ohdr->u.tid_rdma.ack.aeth);
 	req_psn = mask_psn(be32_to_cpu(ohdr->u.tid_rdma.ack.verbs_psn));
@@ -4835,10 +4911,10 @@ void hfi1_rc_rcv_tid_rdma_resync(struct hfi1_packet *packet)
 	struct tid_rdma_flow *flow;
 	struct tid_flow_state *fs = &qpriv->flow_state;
 	u32 psn, generation, idx, gen_next;
-	bool is_fecn;
+	bool fecn;
 	unsigned long flags;
 
-	is_fecn = process_ecn(qp, packet);
+	fecn = process_ecn(qp, packet);
 	psn = mask_psn(be32_to_cpu(ohdr->bth[2]));
 
 	generation = mask_psn(psn + 1) >> HFI1_KDETH_BTH_SEQ_SHIFT;
@@ -4929,6 +5005,8 @@ void hfi1_rc_rcv_tid_rdma_resync(struct hfi1_packet *packet)
 	qpriv->s_flags |= RVT_S_ACK_PENDING;
 	hfi1_schedule_tid_send(qp);
 bail:
+	if (fecn)
+		qp->s_flags |= RVT_S_ECN;
 	spin_unlock_irqrestore(&qp->s_lock, flags);
 }
 
@@ -5465,3 +5543,36 @@ static u32 read_r_next_psn(struct hfi1_devdata *dd, u8 ctxt, u8 fidx)
 	reg = read_uctxt_csr(dd, ctxt, RCV_TID_FLOW_TABLE + (8 * fidx));
 	return mask_psn(reg);
 }
+
+static void tid_rdma_rcv_err(struct hfi1_packet *packet,
+			     struct ib_other_headers *ohdr,
+			     struct rvt_qp *qp, u32 psn, int diff, bool fecn)
+{
+	unsigned long flags;
+
+	tid_rdma_rcv_error(packet, ohdr, qp, psn, diff);
+	if (fecn) {
+		spin_lock_irqsave(&qp->s_lock, flags);
+		qp->s_flags |= RVT_S_ECN;
+		spin_unlock_irqrestore(&qp->s_lock, flags);
+	}
+}
+
+static void update_r_next_psn_fecn(struct hfi1_packet *packet,
+				   struct hfi1_qp_priv *priv,
+				   struct hfi1_ctxtdata *rcd,
+				   struct tid_rdma_flow *flow,
+				   bool fecn)
+{
+	/*
+	 * If a start/middle packet is delivered here due to
+	 * RSM rule and FECN, we need to update the r_next_psn.
+	 */
+	if (fecn && packet->etype == RHF_RCV_TYPE_EAGER &&
+	    !(priv->s_flags & HFI1_R_TID_SW_PSN)) {
+		struct hfi1_devdata *dd = rcd->dd;
+
+		flow->flow_state.r_next_psn =
+			read_r_next_psn(dd, rcd->ctxt, flow->idx);
+	}
+}
-- 
cgit v1.2.3-59-g8ed1b