Merge branch 'tid-read' into hfi1-tid

This is the series for adding TID RDMA read. Kaike put in a lot of effort into making this more consumable for review so special thanks to him. Allocating resources and tracing are separated out followed by patches which build up the read request. Then we have the patches to receive incoming TID RDMA read requests and handle integration with the RC protocol. See the cover letter of the original posting for more of a detailed overview of TID. https://www.spinics.net/lists/linux-rdma/msg66611.html * tid-read: IB/hfi1: Add static trace for TID RDMA READ protocol IB/hfi1: Enable TID RDMA READ protocol IB/hfi1: Add interlock between a TID RDMA request and other requests IB/hfi1: Integrate TID RDMA READ protocol into RC protocol IB/hfi1: Increment the retry timeout value for TID RDMA READ request IB/hfi1: Add functions for restarting TID RDMA READ request IB/hfi1: Add TID RDMA handlers IB/hfi1: Add functions to receive TID RDMA READ response IB/hfi1: Add a function to build TID RDMA READ response IB/hfi1: Add functions to receive TID RDMA READ request IB/hfi1: Set PbcInsertHcrc for TID RDMA packets IB/hfi1: Add functions to build TID RDMA READ request IB/hfi1: Add static trace for flow and TID management functions IB/hfi1: Add the counter n_tidwait IB/hfi1: TID RDMA RcvArray programming and TID allocation IB/hfi1: TID RDMA flow allocation IB/hfi: Move RC functions into a header file Signed-off-by: Doug Ledford <dledford@redhat.com>
author: Doug Ledford <dledford@redhat.com> 2019-02-05 17:59:43 -0500
committer: Doug Ledford <dledford@redhat.com> 2019-02-05 17:59:43 -0500
commit: a2f3bde88174e5c8edf7d6f1b3403a9214439d50 (patch)
tree: 2b516e3b90cf2ec0eb03b160a8b498486599b848
parent: Merge branch 'opfn' into hfi1-tid (diff)
parent: IB/hfi1: Add static trace for TID RDMA READ protocol (diff)
download: linux-dev-a2f3bde88174e5c8edf7d6f1b3403a9214439d50.tar.xz
linux-dev-a2f3bde88174e5c8edf7d6f1b3403a9214439d50.zip
27 files changed, 4669 insertions, 173 deletions
diff --git a/drivers/infiniband/hw/hfi1/chip.c b/drivers/infiniband/hw/hfi1/chip.c
index 4d40311f082e..612f04190ed8 100644
--- a/drivers/infiniband/hw/hfi1/chip.c
+++ b/drivers/infiniband/hw/hfi1/chip.c
@@ -4253,6 +4253,8 @@ static struct cntr_entry dev_cntrs[DEV_CNTR_LAST] = {
 			    access_sw_pio_drain),
 [C_SW_KMEM_WAIT] = CNTR_ELEM("KmemWait", 0, 0, CNTR_NORMAL,
 			    access_sw_kmem_wait),
+[C_SW_TID_WAIT] = CNTR_ELEM("TidWait", 0, 0, CNTR_NORMAL,
+			    hfi1_access_sw_tid_wait),
 [C_SW_SEND_SCHED] = CNTR_ELEM("SendSched", 0, 0, CNTR_NORMAL,
 			    access_sw_send_schedule),
 [C_SDMA_DESC_FETCHED_CNT] = CNTR_ELEM("SDEDscFdCn",
diff --git a/drivers/infiniband/hw/hfi1/chip.h b/drivers/infiniband/hw/hfi1/chip.h
index ba3d99e6e33b..6c27c1c6a868 100644
--- a/drivers/infiniband/hw/hfi1/chip.h
+++ b/drivers/infiniband/hw/hfi1/chip.h
@@ -927,6 +927,7 @@ enum {
 	C_SW_PIO_WAIT,
 	C_SW_PIO_DRAIN,
 	C_SW_KMEM_WAIT,
+	C_SW_TID_WAIT,
 	C_SW_SEND_SCHED,
 	C_SDMA_DESC_FETCHED_CNT,
 	C_SDMA_INT_CNT,
diff --git a/drivers/infiniband/hw/hfi1/common.h b/drivers/infiniband/hw/hfi1/common.h
index 40d3cfb58bd1..7310a5dba420 100644
--- a/drivers/infiniband/hw/hfi1/common.h
+++ b/drivers/infiniband/hw/hfi1/common.h
@@ -340,6 +340,10 @@ struct diag_pkt {
 
 #define HFI1_PSM_IOC_BASE_SEQ 0x0
 
+/* Number of BTH.PSN bits used for sequence number in expected rcvs */
+#define HFI1_KDETH_BTH_SEQ_SHIFT 11
+#define HFI1_KDETH_BTH_SEQ_MASK (BIT(HFI1_KDETH_BTH_SEQ_SHIFT) - 1)
+
 static inline __u64 rhf_to_cpu(const __le32 *rbuf)
 {
 	return __le64_to_cpu(*((__le64 *)rbuf));
diff --git a/drivers/infiniband/hw/hfi1/driver.c b/drivers/infiniband/hw/hfi1/driver.c
index a8ad70730203..2a9d2912f5db 100644
--- a/drivers/infiniband/hw/hfi1/driver.c
+++ b/drivers/infiniband/hw/hfi1/driver.c
@@ -1575,25 +1575,32 @@ drop:
 	return -EINVAL;
 }
 
-void handle_eflags(struct hfi1_packet *packet)
+static void show_eflags_errs(struct hfi1_packet *packet)
 {
 	struct hfi1_ctxtdata *rcd = packet->rcd;
 	u32 rte = rhf_rcv_type_err(packet->rhf);
 
+	dd_dev_err(rcd->dd,
+		   "receive context %d: rhf 0x%016llx, errs [ %s%s%s%s%s%s%s%s] rte 0x%x\n",
+		   rcd->ctxt, packet->rhf,
+		   packet->rhf & RHF_K_HDR_LEN_ERR ? "k_hdr_len " : "",
+		   packet->rhf & RHF_DC_UNC_ERR ? "dc_unc " : "",
+		   packet->rhf & RHF_DC_ERR ? "dc " : "",
+		   packet->rhf & RHF_TID_ERR ? "tid " : "",
+		   packet->rhf & RHF_LEN_ERR ? "len " : "",
+		   packet->rhf & RHF_ECC_ERR ? "ecc " : "",
+		   packet->rhf & RHF_VCRC_ERR ? "vcrc " : "",
+		   packet->rhf & RHF_ICRC_ERR ? "icrc " : "",
+		   rte);
+}
+
+void handle_eflags(struct hfi1_packet *packet)
+{
+	struct hfi1_ctxtdata *rcd = packet->rcd;
+
 	rcv_hdrerr(rcd, rcd->ppd, packet);
 	if (rhf_err_flags(packet->rhf))
-		dd_dev_err(rcd->dd,
-			   "receive context %d: rhf 0x%016llx, errs [ %s%s%s%s%s%s%s%s] rte 0x%x\n",
-			   rcd->ctxt, packet->rhf,
-			   packet->rhf & RHF_K_HDR_LEN_ERR ? "k_hdr_len " : "",
-			   packet->rhf & RHF_DC_UNC_ERR ? "dc_unc " : "",
-			   packet->rhf & RHF_DC_ERR ? "dc " : "",
-			   packet->rhf & RHF_TID_ERR ? "tid " : "",
-			   packet->rhf & RHF_LEN_ERR ? "len " : "",
-			   packet->rhf & RHF_ECC_ERR ? "ecc " : "",
-			   packet->rhf & RHF_VCRC_ERR ? "vcrc " : "",
-			   packet->rhf & RHF_ICRC_ERR ? "icrc " : "",
-			   rte);
+		show_eflags_errs(packet);
 }
 
 /*
@@ -1699,11 +1706,14 @@ static int kdeth_process_expected(struct hfi1_packet *packet)
 	if (unlikely(hfi1_dbg_should_fault_rx(packet)))
 		return RHF_RCV_CONTINUE;
 
-	if (unlikely(rhf_err_flags(packet->rhf)))
-		handle_eflags(packet);
+	if (unlikely(rhf_err_flags(packet->rhf))) {
+		struct hfi1_ctxtdata *rcd = packet->rcd;
 
-	dd_dev_err(packet->rcd->dd,
-		   "Unhandled expected packet received. Dropping.\n");
+		if (hfi1_handle_kdeth_eflags(rcd, rcd->ppd, packet))
+			return RHF_RCV_CONTINUE;
+	}
+
+	hfi1_kdeth_expected_rcv(packet);
 	return RHF_RCV_CONTINUE;
 }
 
@@ -1712,11 +1722,17 @@ static int kdeth_process_eager(struct hfi1_packet *packet)
 	hfi1_setup_9B_packet(packet);
 	if (unlikely(hfi1_dbg_should_fault_rx(packet)))
 		return RHF_RCV_CONTINUE;
-	if (unlikely(rhf_err_flags(packet->rhf)))
-		handle_eflags(packet);
 
-	dd_dev_err(packet->rcd->dd,
-		   "Unhandled eager packet received. Dropping.\n");
+	trace_hfi1_rcvhdr(packet);
+	if (unlikely(rhf_err_flags(packet->rhf))) {
+		struct hfi1_ctxtdata *rcd = packet->rcd;
+
+		show_eflags_errs(packet);
+		if (hfi1_handle_kdeth_eflags(rcd, rcd->ppd, packet))
+			return RHF_RCV_CONTINUE;
+	}
+
+	hfi1_kdeth_eager_rcv(packet);
 	return RHF_RCV_CONTINUE;
 }
 
diff --git a/drivers/infiniband/hw/hfi1/hfi.h b/drivers/infiniband/hw/hfi1/hfi.h
index 9aa0357e17b7..6582184cc985 100644
--- a/drivers/infiniband/hw/hfi1/hfi.h
+++ b/drivers/infiniband/hw/hfi1/hfi.h
@@ -198,6 +198,14 @@ struct exp_tid_set {
 };
 
 typedef int (*rhf_rcv_function_ptr)(struct hfi1_packet *packet);
+
+struct tid_queue {
+	struct list_head queue_head;
+			/* queue head for QP TID resource waiters */
+	u32 enqueue;	/* count of tid enqueues */
+	u32 dequeue;	/* count of tid dequeues */
+};
+
 struct hfi1_ctxtdata {
 	/* rcvhdrq base, needs mmap before useful */
 	void *rcvhdrq;
@@ -291,6 +299,12 @@ struct hfi1_ctxtdata {
 	/* PSM Specific fields */
 	/* lock protecting all Expected TID data */
 	struct mutex exp_mutex;
+	/* lock protecting all Expected TID data of kernel contexts */
+	spinlock_t exp_lock;
+	/* Queue for QP's waiting for HW TID flows */
+	struct tid_queue flow_queue;
+	/* Queue for QP's waiting for HW receive array entries */
+	struct tid_queue rarr_queue;
 	/* when waiting for rcv or pioavail */
 	wait_queue_head_t wait;
 	/* uuid from PSM */
@@ -323,6 +337,9 @@ struct hfi1_ctxtdata {
 	 */
 	u8 subctxt_cnt;
 
+	/* Bit mask to track free TID RDMA HW flows */
+	unsigned long flow_mask;
+	struct tid_flow_state flows[RXE_NUM_TID_FLOWS];
 };
 
 /**
@@ -2103,7 +2120,7 @@ static inline u64 hfi1_pkt_default_send_ctxt_mask(struct hfi1_devdata *dd,
 			SEND_CTXT_CHECK_ENABLE_DISALLOW_PBC_TEST_SMASK |
 #endif
 			HFI1_PKT_USER_SC_INTEGRITY;
-	else
+	else if (ctxt_type != SC_KERNEL)
 		base_sc_integrity |= HFI1_PKT_KERNEL_SC_INTEGRITY;
 
 	/* turn on send-side job key checks if !A0 */
diff --git a/drivers/infiniband/hw/hfi1/init.c b/drivers/infiniband/hw/hfi1/init.c
index a8dbd0f191f5..d13304f7340d 100644
--- a/drivers/infiniband/hw/hfi1/init.c
+++ b/drivers/infiniband/hw/hfi1/init.c
@@ -370,6 +370,9 @@ int hfi1_create_ctxtdata(struct hfi1_pportdata *ppd, int numa,
 		rcd->rhf_rcv_function_map = normal_rhf_rcv_functions;
 
 		mutex_init(&rcd->exp_mutex);
+		spin_lock_init(&rcd->exp_lock);
+		INIT_LIST_HEAD(&rcd->flow_queue.queue_head);
+		INIT_LIST_HEAD(&rcd->rarr_queue.queue_head);
 
 		hfi1_cdbg(PROC, "setting up context %u\n", rcd->ctxt);
 
@@ -472,6 +475,9 @@ int hfi1_create_ctxtdata(struct hfi1_pportdata *ppd, int numa,
 						    GFP_KERNEL, numa);
 			if (!rcd->opstats)
 				goto bail;
+
+			/* Initialize TID flow generations for the context */
+			hfi1_kern_init_ctxt_generations(rcd);
 		}
 
 		*context = rcd;
@@ -771,6 +777,8 @@ static void enable_chip(struct hfi1_devdata *dd)
 			rcvmask |= HFI1_RCVCTRL_NO_RHQ_DROP_ENB;
 		if (HFI1_CAP_KGET_MASK(rcd->flags, NODROP_EGR_FULL))
 			rcvmask |= HFI1_RCVCTRL_NO_EGR_DROP_ENB;
+		if (HFI1_CAP_IS_KSET(TID_RDMA))
+			rcvmask |= HFI1_RCVCTRL_TIDFLOW_ENB;
 		hfi1_rcvctrl(dd, rcvmask, rcd);
 		sc_enable(rcd->sc);
 		hfi1_rcd_put(rcd);
@@ -1589,7 +1597,7 @@ static void cleanup_device_data(struct hfi1_devdata *dd)
 		struct hfi1_ctxtdata *rcd = dd->rcd[ctxt];
 
 		if (rcd) {
-			hfi1_clear_tids(rcd);
+			hfi1_free_ctxt_rcv_groups(rcd);
 			hfi1_free_ctxt(rcd);
 		}
 	}
diff --git a/drivers/infiniband/hw/hfi1/qp.c b/drivers/infiniband/hw/hfi1/qp.c
index f822f92b415f..acdd9eba189b 100644
--- a/drivers/infiniband/hw/hfi1/qp.c
+++ b/drivers/infiniband/hw/hfi1/qp.c
@@ -319,6 +319,7 @@ int hfi1_setup_wqe(struct rvt_qp *qp, struct rvt_swqe *wqe, bool *call_send)
 
 	switch (qp->ibqp.qp_type) {
 	case IB_QPT_RC:
+		hfi1_setup_tid_rdma_wqe(qp, wqe);
 	case IB_QPT_UC:
 		if (wqe->length > 0x80000000U)
 			return -EINVAL;
@@ -738,6 +739,7 @@ void flush_qp_waiters(struct rvt_qp *qp)
 {
 	lockdep_assert_held(&qp->s_lock);
 	flush_iowait(qp);
+	hfi1_tid_rdma_flush_wait(qp);
 }
 
 void stop_send_queue(struct rvt_qp *qp)
@@ -745,6 +747,8 @@ void stop_send_queue(struct rvt_qp *qp)
 	struct hfi1_qp_priv *priv = qp->priv;
 
 	iowait_cancel_work(&priv->s_iowait);
+	if (cancel_work_sync(&priv->tid_rdma.trigger_work))
+		rvt_put_qp(qp);
 }
 
 void quiesce_qp(struct rvt_qp *qp)
@@ -758,6 +762,7 @@ void quiesce_qp(struct rvt_qp *qp)
 
 void notify_qp_reset(struct rvt_qp *qp)
 {
+	hfi1_qp_kern_exp_rcv_clear_all(qp);
 	qp->r_adefered = 0;
 	clear_ahg(qp);
 
diff --git a/drivers/infiniband/hw/hfi1/qp.h b/drivers/infiniband/hw/hfi1/qp.h
index 7adb6dff6813..ce25a27aa4a1 100644
--- a/drivers/infiniband/hw/hfi1/qp.h
+++ b/drivers/infiniband/hw/hfi1/qp.h
@@ -63,11 +63,13 @@ extern const struct rvt_operation_params hfi1_post_parms[];
  * HFI1_S_AHG_VALID - ahg header valid on chip
  * HFI1_S_AHG_CLEAR - have send engine clear ahg state
  * HFI1_S_WAIT_PIO_DRAIN - qp waiting for PIOs to drain
+ * HFI1_S_WAIT_TID_SPACE - a QP is waiting for TID resource
  * HFI1_S_MIN_BIT_MASK - the lowest bit that can be used by hfi1
  */
 #define HFI1_S_AHG_VALID         0x80000000
 #define HFI1_S_AHG_CLEAR         0x40000000
 #define HFI1_S_WAIT_PIO_DRAIN    0x20000000
+#define HFI1_S_WAIT_TID_SPACE    0x10000000
 #define HFI1_S_MIN_BIT_MASK      0x01000000
 
 /*
diff --git a/drivers/infiniband/hw/hfi1/rc.c b/drivers/infiniband/hw/hfi1/rc.c
index 092d5eba980f..6c9ef572fc69 100644
--- a/drivers/infiniband/hw/hfi1/rc.c
+++ b/drivers/infiniband/hw/hfi1/rc.c
@@ -51,28 +51,48 @@
 
 #include "hfi.h"
 #include "qp.h"
+#include "rc.h"
 #include "verbs_txreq.h"
 #include "trace.h"
 
-/* cut down ridiculously long IB macro names */
-#define OP(x) RC_OP(x)
-
-static struct rvt_swqe *do_rc_completion(struct rvt_qp *qp,
-					 struct rvt_swqe *wqe,
-					 struct hfi1_ibport *ibp);
-
-static u32 restart_sge(struct rvt_sge_state *ss, struct rvt_swqe *wqe,
-		       u32 psn, u32 pmtu)
+struct rvt_ack_entry *find_prev_entry(struct rvt_qp *qp, u32 psn, u8 *prev,
+				      u8 *prev_ack, bool *scheduled)
+	__must_hold(&qp->s_lock)
 {
-	u32 len;
-
-	len = delta_psn(psn, wqe->psn) * pmtu;
-	ss->sge = wqe->sg_list[0];
-	ss->sg_list = wqe->sg_list + 1;
-	ss->num_sge = wqe->wr.num_sge;
-	ss->total_len = wqe->length;
-	rvt_skip_sge(ss, len, false);
-	return wqe->length - len;
+	struct rvt_ack_entry *e = NULL;
+	u8 i, p;
+	bool s = true;
+
+	for (i = qp->r_head_ack_queue; ; i = p) {
+		if (i == qp->s_tail_ack_queue)
+			s = false;
+		if (i)
+			p = i - 1;
+		else
+			p = rvt_size_atomic(ib_to_rvt(qp->ibqp.device));
+		if (p == qp->r_head_ack_queue) {
+			e = NULL;
+			break;
+		}
+		e = &qp->s_ack_queue[p];
+		if (!e->opcode) {
+			e = NULL;
+			break;
+		}
+		if (cmp_psn(psn, e->psn) >= 0) {
+			if (p == qp->s_tail_ack_queue &&
+			    cmp_psn(psn, e->lpsn) <= 0)
+				s = false;
+			break;
+		}
+	}
+	if (prev)
+		*prev = p;
+	if (prev_ack)
+		*prev_ack = i;
+	if (scheduled)
+		*scheduled = s;
+	return e;
 }
 
 /**
@@ -92,13 +112,16 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp,
 {
 	struct rvt_ack_entry *e;
 	u32 hwords;
-	u32 len;
-	u32 bth0, bth2;
+	u32 len = 0;
+	u32 bth0 = 0, bth2 = 0;
 	u32 bth1 = qp->remote_qpn | (HFI1_CAP_IS_KSET(OPFN) << IB_BTHE_E_SHIFT);
 	int middle = 0;
 	u32 pmtu = qp->pmtu;
 	struct hfi1_qp_priv *priv = qp->priv;
+	bool last_pkt;
+	u32 delta;
 
+	trace_hfi1_rsp_make_rc_ack(qp, 0);
 	lockdep_assert_held(&qp->s_lock);
 	/* Don't send an ACK if we aren't supposed to. */
 	if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK))
@@ -170,6 +193,26 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp,
 			hwords++;
 			qp->s_ack_rdma_psn = e->psn;
 			bth2 = mask_psn(qp->s_ack_rdma_psn++);
+		} else if (e->opcode == TID_OP(READ_REQ)) {
+			/*
+			 * If a TID RDMA read response is being resent and
+			 * we haven't seen the duplicate request yet,
+			 * then stop sending the remaining responses the
+			 * responder has seen until the requester re-sends it.
+			 */
+			len = e->rdma_sge.sge_length;
+			if (len && !e->rdma_sge.mr) {
+				qp->s_tail_ack_queue = qp->r_head_ack_queue;
+				goto bail;
+			}
+			/* Copy SGE state in case we need to resend */
+			ps->s_txreq->mr = e->rdma_sge.mr;
+			if (ps->s_txreq->mr)
+				rvt_get_mr(ps->s_txreq->mr);
+			qp->s_ack_rdma_sge.sge = e->rdma_sge;
+			qp->s_ack_rdma_sge.num_sge = 1;
+			qp->s_ack_state = TID_OP(READ_RESP);
+			goto read_resp;
 		} else {
 			/* COMPARE_SWAP or FETCH_ADD */
 			ps->s_txreq->ss = NULL;
@@ -207,6 +250,28 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp,
 		bth2 = mask_psn(qp->s_ack_rdma_psn++);
 		break;
 
+	case TID_OP(READ_RESP):
+read_resp:
+		e = &qp->s_ack_queue[qp->s_tail_ack_queue];
+		ps->s_txreq->ss = &qp->s_ack_rdma_sge;
+		delta = hfi1_build_tid_rdma_read_resp(qp, e, ohdr, &bth0,
+						      &bth1, &bth2, &len,
+						      &last_pkt);
+		if (delta == 0)
+			goto error_qp;
+		hwords += delta;
+		if (last_pkt) {
+			e->sent = 1;
+			/*
+			 * Increment qp->s_tail_ack_queue through s_ack_state
+			 * transition.
+			 */
+			qp->s_ack_state = OP(RDMA_READ_RESPONSE_LAST);
+		}
+		break;
+	case TID_OP(READ_REQ):
+		goto bail;
+
 	default:
 normal:
 		/*
@@ -236,7 +301,14 @@ normal:
 	ps->s_txreq->hdr_dwords = hwords;
 	hfi1_make_ruc_header(qp, ohdr, bth0, bth1, bth2, middle, ps);
 	return 1;
-
+error_qp:
+	spin_unlock_irqrestore(&qp->s_lock, ps->flags);
+	spin_lock_irqsave(&qp->r_lock, ps->flags);
+	spin_lock(&qp->s_lock);
+	rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR);
+	spin_unlock(&qp->s_lock);
+	spin_unlock_irqrestore(&qp->r_lock, ps->flags);
+	spin_lock_irqsave(&qp->s_lock, ps->flags);
 bail:
 	qp->s_ack_state = OP(ACKNOWLEDGE);
 	/*
@@ -263,17 +335,22 @@ int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
 	struct hfi1_qp_priv *priv = qp->priv;
 	struct hfi1_ibdev *dev = to_idev(qp->ibqp.device);
 	struct ib_other_headers *ohdr;
-	struct rvt_sge_state *ss;
+	struct rvt_sge_state *ss = NULL;
 	struct rvt_swqe *wqe;
-	u32 hwords;
-	u32 len;
-	u32 bth0 = 0, bth2;
+	struct hfi1_swqe_priv *wpriv;
+	struct tid_rdma_request *req = NULL;
+	/* header size in 32-bit words LRH+BTH = (8+12)/4. */
+	u32 hwords = 5;
+	u32 len = 0;
+	u32 bth0 = 0, bth2 = 0;
 	u32 bth1 = qp->remote_qpn | (HFI1_CAP_IS_KSET(OPFN) << IB_BTHE_E_SHIFT);
 	u32 pmtu = qp->pmtu;
 	char newreq;
 	int middle = 0;
 	int delta;
+	struct tid_rdma_flow *flow = NULL;
 
+	trace_hfi1_sender_make_rc_req(qp);
 	lockdep_assert_held(&qp->s_lock);
 	ps->s_txreq = get_txreq(ps->dev, qp);
 	if (!ps->s_txreq)
@@ -314,8 +391,8 @@ int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
 		}
 		clear_ahg(qp);
 		wqe = rvt_get_swqe_ptr(qp, qp->s_last);
-		rvt_send_complete(qp, wqe, qp->s_last != qp->s_acked ?
-			IB_WC_SUCCESS : IB_WC_WR_FLUSH_ERR);
+		hfi1_trdma_send_complete(qp, wqe, qp->s_last != qp->s_acked ?
+					 IB_WC_SUCCESS : IB_WC_WR_FLUSH_ERR);
 		/* will get called again */
 		goto done_free_tx;
 	}
@@ -334,6 +411,7 @@ int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
 
 	/* Send a request. */
 	wqe = rvt_get_swqe_ptr(qp, qp->s_cur);
+check_s_state:
 	switch (qp->s_state) {
 	default:
 		if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_NEXT_SEND_OK))
@@ -355,9 +433,13 @@ int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
 			/*
 			 * If a fence is requested, wait for previous
 			 * RDMA read and atomic operations to finish.
+			 * However, there is no need to guard against
+			 * TID RDMA READ after TID RDMA READ.
 			 */
 			if ((wqe->wr.send_flags & IB_SEND_FENCE) &&
-			    qp->s_num_rd_atomic) {
+			    qp->s_num_rd_atomic &&
+			    (wqe->wr.opcode != IB_WR_TID_RDMA_READ ||
+			     priv->pending_tid_r_segs < qp->s_num_rd_atomic)) {
 				qp->s_flags |= RVT_S_WAIT_FENCE;
 				goto bail;
 			}
@@ -402,6 +484,15 @@ int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
 		len = wqe->length;
 		ss = &qp->s_sge;
 		bth2 = mask_psn(qp->s_psn);
+
+		/*
+		 * Interlock between various IB requests and TID RDMA
+		 * if necessary.
+		 */
+		if ((priv->s_flags & HFI1_S_TID_WAIT_INTERLCK) ||
+		    hfi1_tid_rdma_wqe_interlock(qp, wqe))
+			goto bail;
+
 		switch (wqe->wr.opcode) {
 		case IB_WR_SEND:
 		case IB_WR_SEND_WITH_IMM:
@@ -483,16 +574,14 @@ no_flow_control:
 			 * Don't allow more operations to be started
 			 * than the QP limits allow.
 			 */
-			if (newreq) {
-				if (qp->s_num_rd_atomic >=
-				    qp->s_max_rd_atomic) {
-					qp->s_flags |= RVT_S_WAIT_RDMAR;
-					goto bail;
-				}
-				qp->s_num_rd_atomic++;
-				if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT))
-					qp->s_lsn++;
+			if (qp->s_num_rd_atomic >=
+			    qp->s_max_rd_atomic) {
+				qp->s_flags |= RVT_S_WAIT_RDMAR;
+				goto bail;
 			}
+			qp->s_num_rd_atomic++;
+			if (newreq && !(qp->s_flags & RVT_S_UNLIMITED_CREDIT))
+				qp->s_lsn++;
 			put_ib_reth_vaddr(
 				wqe->rdma_wr.remote_addr,
 				&ohdr->u.rc.reth);
@@ -508,20 +597,92 @@ no_flow_control:
 				qp->s_cur = 0;
 			break;
 
+		case IB_WR_TID_RDMA_READ:
+			trace_hfi1_tid_read_sender_make_req(qp, newreq);
+			wpriv = wqe->priv;
+			req = wqe_to_tid_req(wqe);
+			trace_hfi1_tid_req_make_req_read(qp, newreq,
+							 wqe->wr.opcode,
+							 wqe->psn, wqe->lpsn,
+							 req);
+			delta = cmp_psn(qp->s_psn, wqe->psn);
+
+			/*
+			 * Don't allow more operations to be started
+			 * than the QP limits allow. We could get here under
+			 * three conditions; (1) It's a new request; (2) We are
+			 * sending the second or later segment of a request,
+			 * but the qp->s_state is set to OP(RDMA_READ_REQUEST)
+			 * when the last segment of a previous request is
+			 * received just before this; (3) We are re-sending a
+			 * request.
+			 */
+			if (qp->s_num_rd_atomic >= qp->s_max_rd_atomic) {
+				qp->s_flags |= RVT_S_WAIT_RDMAR;
+				goto bail;
+			}
+			if (newreq) {
+				struct tid_rdma_flow *flow =
+					&req->flows[req->setup_head];
+
+				/*
+				 * Set up s_sge as it is needed for TID
+				 * allocation. However, if the pages have been
+				 * walked and mapped, skip it. An earlier try
+				 * has failed to allocate the TID entries.
+				 */
+				if (!flow->npagesets) {
+					qp->s_sge.sge = wqe->sg_list[0];
+					qp->s_sge.sg_list = wqe->sg_list + 1;
+					qp->s_sge.num_sge = wqe->wr.num_sge;
+					qp->s_sge.total_len = wqe->length;
+					qp->s_len = wqe->length;
+					req->isge = 0;
+					req->clear_tail = req->setup_head;
+					req->flow_idx = req->setup_head;
+					req->state = TID_REQUEST_ACTIVE;
+				}
+			} else if (delta == 0) {
+				/* Re-send a request */
+				req->cur_seg = 0;
+				req->comp_seg = 0;
+				req->ack_pending = 0;
+				req->flow_idx = req->clear_tail;
+				req->state = TID_REQUEST_RESEND;
+			}
+			req->s_next_psn = qp->s_psn;
+			/* Read one segment at a time */
+			len = min_t(u32, req->seg_len,
+				    wqe->length - req->seg_len * req->cur_seg);
+			delta = hfi1_build_tid_rdma_read_req(qp, wqe, ohdr,
+							     &bth1, &bth2,
+							     &len);
+			if (delta <= 0) {
+				/* Wait for TID space */
+				goto bail;
+			}
+			if (newreq && !(qp->s_flags & RVT_S_UNLIMITED_CREDIT))
+				qp->s_lsn++;
+			hwords += delta;
+			ss = &wpriv->ss;
+			/* Check if this is the last segment */
+			if (req->cur_seg >= req->total_segs &&
+			    ++qp->s_cur == qp->s_size)
+				qp->s_cur = 0;
+			break;
+
 		case IB_WR_ATOMIC_CMP_AND_SWP:
 		case IB_WR_ATOMIC_FETCH_AND_ADD:
 			/*
 			 * Don't allow more operations to be started
 			 * than the QP limits allow.
 			 */
-			if (newreq) {
-				if (qp->s_num_rd_atomic >=
-				    qp->s_max_rd_atomic) {
-					qp->s_flags |= RVT_S_WAIT_RDMAR;
-					goto bail;
-				}
-				qp->s_num_rd_atomic++;
+			if (qp->s_num_rd_atomic >=
+			    qp->s_max_rd_atomic) {
+				qp->s_flags |= RVT_S_WAIT_RDMAR;
+				goto bail;
 			}
+			qp->s_num_rd_atomic++;
 
 			/* FALLTHROUGH */
 		case IB_WR_OPFN:
@@ -555,11 +716,13 @@ no_flow_control:
 		default:
 			goto bail;
 		}
-		qp->s_sge.sge = wqe->sg_list[0];
-		qp->s_sge.sg_list = wqe->sg_list + 1;
-		qp->s_sge.num_sge = wqe->wr.num_sge;
-		qp->s_sge.total_len = wqe->length;
-		qp->s_len = wqe->length;
+		if (wqe->wr.opcode != IB_WR_TID_RDMA_READ) {
+			qp->s_sge.sge = wqe->sg_list[0];
+			qp->s_sge.sg_list = wqe->sg_list + 1;
+			qp->s_sge.num_sge = wqe->wr.num_sge;
+			qp->s_sge.total_len = wqe->length;
+			qp->s_len = wqe->length;
+		}
 		if (newreq) {
 			qp->s_tail++;
 			if (qp->s_tail >= qp->s_size)
@@ -567,6 +730,8 @@ no_flow_control:
 		}
 		if (wqe->wr.opcode == IB_WR_RDMA_READ)
 			qp->s_psn = wqe->lpsn + 1;
+		else if (wqe->wr.opcode == IB_WR_TID_RDMA_READ)
+			qp->s_psn = req->s_next_psn;
 		else
 			qp->s_psn++;
 		break;
@@ -683,6 +848,103 @@ no_flow_control:
 		if (qp->s_cur == qp->s_size)
 			qp->s_cur = 0;
 		break;
+	case TID_OP(READ_RESP):
+		if (wqe->wr.opcode != IB_WR_TID_RDMA_READ)
+			goto bail;
+		/* This is used to restart a TID read request */
+		req = wqe_to_tid_req(wqe);
+		wpriv = wqe->priv;
+		/*
+		 * Back down. The field qp->s_psn has been set to the psn with
+		 * which the request should be restart. It's OK to use division
+		 * as this is on the retry path.
+		 */
+		req->cur_seg = delta_psn(qp->s_psn, wqe->psn) / priv->pkts_ps;
+
+		/*
+		 * The following function need to be redefined to return the
+		 * status to make sure that we find the flow. At the same
+		 * time, we can use the req->state change to check if the
+		 * call succeeds or not.
+		 */
+		req->state = TID_REQUEST_RESEND;
+		hfi1_tid_rdma_restart_req(qp, wqe, &bth2);
+		if (req->state != TID_REQUEST_ACTIVE) {
+			/*
+			 * Failed to find the flow. Release all allocated tid
+			 * resources.
+			 */
+			hfi1_kern_exp_rcv_clear_all(req);
+			hfi1_kern_clear_hw_flow(priv->rcd, qp);
+
+			hfi1_trdma_send_complete(qp, wqe, IB_WC_LOC_QP_OP_ERR);
+			goto bail;
+		}
+		req->state = TID_REQUEST_RESEND;
+		len = min_t(u32, req->seg_len,
+			    wqe->length - req->seg_len * req->cur_seg);
+		flow = &req->flows[req->flow_idx];
+		len -= flow->sent;
+		req->s_next_psn = flow->flow_state.ib_lpsn + 1;
+		delta = hfi1_build_tid_rdma_read_packet(wqe, ohdr, &bth1,
+							&bth2, &len);
+		if (delta <= 0) {
+			/* Wait for TID space */
+			goto bail;
+		}
+		hwords += delta;
+		ss = &wpriv->ss;
+		/* Check if this is the last segment */
+		if (req->cur_seg >= req->total_segs &&
+		    ++qp->s_cur == qp->s_size)
+			qp->s_cur = 0;
+		qp->s_psn = req->s_next_psn;
+		trace_hfi1_tid_req_make_req_read(qp, 0, wqe->wr.opcode,
+						 wqe->psn, wqe->lpsn, req);
+		break;
+	case TID_OP(READ_REQ):
+		req = wqe_to_tid_req(wqe);
+		delta = cmp_psn(qp->s_psn, wqe->psn);
+		/*
+		 * If the current WR is not TID RDMA READ, or this is the start
+		 * of a new request, we need to change the qp->s_state so that
+		 * the request can be set up properly.
+		 */
+		if (wqe->wr.opcode != IB_WR_TID_RDMA_READ || delta == 0 ||
+		    qp->s_cur == qp->s_tail) {
+			qp->s_state = OP(RDMA_READ_REQUEST);
+			if (delta == 0 || qp->s_cur == qp->s_tail)
+				goto check_s_state;
+			else
+				goto bail;
+		}
+
+		/* Rate limiting */
+		if (qp->s_num_rd_atomic >= qp->s_max_rd_atomic) {
+			qp->s_flags |= RVT_S_WAIT_RDMAR;
+			goto bail;
+		}
+
+		wpriv = wqe->priv;
+		/* Read one segment at a time */
+		len = min_t(u32, req->seg_len,
+			    wqe->length - req->seg_len * req->cur_seg);
+		delta = hfi1_build_tid_rdma_read_req(qp, wqe, ohdr, &bth1,
+						     &bth2, &len);
+		if (delta <= 0) {
+			/* Wait for TID space */
+			goto bail;
+		}
+		hwords += delta;
+		ss = &wpriv->ss;
+		/* Check if this is the last segment */
+		if (req->cur_seg >= req->total_segs &&
+		    ++qp->s_cur == qp->s_size)
+			qp->s_cur = 0;
+		qp->s_psn = req->s_next_psn;
+		trace_hfi1_tid_req_make_req_read(qp, 0, wqe->wr.opcode,
+						 wqe->psn, wqe->lpsn, req);
+		break;
 	}
 	qp->s_sending_hpsn = bth2;
 	delta = delta_psn(bth2, wqe->psn);
@@ -951,6 +1213,43 @@ void hfi1_send_rc_ack(struct hfi1_packet *packet, bool is_fecn)
 }
 
 /**
+ * update_num_rd_atomic - update the qp->s_num_rd_atomic
+ * @qp: the QP
+ * @psn: the packet sequence number to restart at
+ * @wqe: the wqe
+ *
+ * This is called from reset_psn() to update qp->s_num_rd_atomic
+ * for the current wqe.
+ * Called at interrupt level with the QP s_lock held.
+ */
+static void update_num_rd_atomic(struct rvt_qp *qp, u32 psn,
+				 struct rvt_swqe *wqe)
+{
+	u32 opcode = wqe->wr.opcode;
+
+	if (opcode == IB_WR_RDMA_READ ||
+	    opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
+	    opcode == IB_WR_ATOMIC_FETCH_AND_ADD) {
+		qp->s_num_rd_atomic++;
+	} else if (opcode == IB_WR_TID_RDMA_READ) {
+		struct tid_rdma_request *req = wqe_to_tid_req(wqe);
+		struct hfi1_qp_priv *priv = qp->priv;
+
+		if (cmp_psn(psn, wqe->lpsn) <= 0) {
+			u32 cur_seg;
+
+			cur_seg = (psn - wqe->psn) / priv->pkts_ps;
+			req->ack_pending = cur_seg - req->comp_seg;
+			priv->pending_tid_r_segs += req->ack_pending;
+			qp->s_num_rd_atomic += req->ack_pending;
+		} else {
+			priv->pending_tid_r_segs += req->total_segs;
+			qp->s_num_rd_atomic += req->total_segs;
+		}
+	}
+}
+
+/**
  * reset_psn - reset the QP state to send starting from PSN
  * @qp: the QP
  * @psn: the packet sequence number to restart at
@@ -964,9 +1263,12 @@ static void reset_psn(struct rvt_qp *qp, u32 psn)
 	u32 n = qp->s_acked;
 	struct rvt_swqe *wqe = rvt_get_swqe_ptr(qp, n);
 	u32 opcode;
+	struct hfi1_qp_priv *priv = qp->priv;
 
 	lockdep_assert_held(&qp->s_lock);
 	qp->s_cur = n;
+	priv->pending_tid_r_segs = 0;
+	qp->s_num_rd_atomic = 0;
 
 	/*
 	 * If we are starting the request from the beginning,
@@ -976,9 +1278,9 @@ static void reset_psn(struct rvt_qp *qp, u32 psn)
 		qp->s_state = OP(SEND_LAST);
 		goto done;
 	}
+	update_num_rd_atomic(qp, psn, wqe);
 
 	/* Find the work request opcode corresponding to the given PSN. */
-	opcode = wqe->wr.opcode;
 	for (;;) {
 		int diff;
 
@@ -988,8 +1290,11 @@ static void reset_psn(struct rvt_qp *qp, u32 psn)
 			break;
 		wqe = rvt_get_swqe_ptr(qp, n);
 		diff = cmp_psn(psn, wqe->psn);
-		if (diff < 0)
+		if (diff < 0) {
+			/* Point wqe back to the previous one*/
+			wqe = rvt_get_swqe_ptr(qp, qp->s_cur);
 			break;
+		}
 		qp->s_cur = n;
 		/*
 		 * If we are starting the request from the beginning,
@@ -999,8 +1304,10 @@ static void reset_psn(struct rvt_qp *qp, u32 psn)
 			qp->s_state = OP(SEND_LAST);
 			goto done;
 		}
-		opcode = wqe->wr.opcode;
+
+		update_num_rd_atomic(qp, psn, wqe);
 	}
+	opcode = wqe->wr.opcode;
 
 	/*
 	 * Set the state to restart in the middle of a request.
@@ -1022,6 +1329,10 @@ static void reset_psn(struct rvt_qp *qp, u32 psn)
 		qp->s_state = OP(RDMA_READ_RESPONSE_MIDDLE);
 		break;
 
+	case IB_WR_TID_RDMA_READ:
+		qp->s_state = TID_OP(READ_RESP);
+		break;
+
 	default:
 		/*
 		 * This case shouldn't happen since its only
@@ -1030,6 +1341,7 @@ static void reset_psn(struct rvt_qp *qp, u32 psn)
 		qp->s_state = OP(SEND_LAST);
 	}
 done:
+	priv->s_flags &= ~HFI1_S_TID_WAIT_INTERLCK;
 	qp->s_psn = psn;
 	/*
 	 * Set RVT_S_WAIT_PSN as rc_complete() may start the timer
@@ -1040,6 +1352,7 @@ done:
 	    (cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) <= 0))
 		qp->s_flags |= RVT_S_WAIT_PSN;
 	qp->s_flags &= ~HFI1_S_AHG_VALID;
+	trace_hfi1_sender_reset_psn(qp);
 }
 
 /*
@@ -1054,6 +1367,7 @@ void hfi1_restart_rc(struct rvt_qp *qp, u32 psn, int wait)
 
 	lockdep_assert_held(&qp->r_lock);
 	lockdep_assert_held(&qp->s_lock);
+	trace_hfi1_sender_restart_rc(qp);
 	if (qp->s_retry == 0) {
 		if (qp->s_mig_state == IB_MIG_ARMED) {
 			hfi1_migrate_qp(qp);
@@ -1075,8 +1389,16 @@ void hfi1_restart_rc(struct rvt_qp *qp, u32 psn, int wait)
 				wqe = do_rc_completion(qp, wqe, ibp);
 				qp->s_flags &= ~RVT_S_WAIT_ACK;
 			} else {
-				rvt_send_complete(qp, wqe,
-						  IB_WC_RETRY_EXC_ERR);
+				if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) {
+					struct tid_rdma_request *req;
+
+					req = wqe_to_tid_req(wqe);
+					hfi1_kern_exp_rcv_clear_all(req);
+					hfi1_kern_clear_hw_flow(priv->rcd, qp);
+				}
+
+				hfi1_trdma_send_complete(qp, wqe,
+							 IB_WC_RETRY_EXC_ERR);
 				rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR);
 			}
 			return;
@@ -1088,7 +1410,8 @@ void hfi1_restart_rc(struct rvt_qp *qp, u32 psn, int wait)
 	}
 
 	ibp = to_iport(qp->ibqp.device, qp->port_num);
-	if (wqe->wr.opcode == IB_WR_RDMA_READ)
+	if (wqe->wr.opcode == IB_WR_RDMA_READ ||
+	    wqe->wr.opcode == IB_WR_TID_RDMA_READ)
 		ibp->rvp.n_rc_resends++;
 	else
 		ibp->rvp.n_rc_resends += delta_psn(qp->s_psn, psn);
@@ -1115,7 +1438,8 @@ static void reset_sending_psn(struct rvt_qp *qp, u32 psn)
 	for (;;) {
 		wqe = rvt_get_swqe_ptr(qp, n);
 		if (cmp_psn(psn, wqe->lpsn) <= 0) {
-			if (wqe->wr.opcode == IB_WR_RDMA_READ)
+			if (wqe->wr.opcode == IB_WR_RDMA_READ ||
+			    wqe->wr.opcode == IB_WR_TID_RDMA_READ)
 				qp->s_sending_psn = wqe->lpsn + 1;
 			else
 				qp->s_sending_psn = psn + 1;
@@ -1164,8 +1488,9 @@ void hfi1_rc_send_complete(struct rvt_qp *qp, struct hfi1_opa_header *opah)
 	}
 
 	opcode = ib_bth_get_opcode(ohdr);
-	if (opcode >= OP(RDMA_READ_RESPONSE_FIRST) &&
-	    opcode <= OP(ATOMIC_ACKNOWLEDGE)) {
+	if ((opcode >= OP(RDMA_READ_RESPONSE_FIRST) &&
+	     opcode <= OP(ATOMIC_ACKNOWLEDGE)) ||
+	    opcode == TID_OP(READ_RESP)) {
 		WARN_ON(!qp->s_rdma_ack_cnt);
 		qp->s_rdma_ack_cnt--;
 		return;
@@ -1181,8 +1506,12 @@ void hfi1_rc_send_complete(struct rvt_qp *qp, struct hfi1_opa_header *opah)
 	if ((psn & IB_BTH_REQ_ACK) && qp->s_acked != qp->s_tail &&
 	    !(qp->s_flags &
 		(RVT_S_TIMER | RVT_S_WAIT_RNR | RVT_S_WAIT_PSN)) &&
-		(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK))
-		rvt_add_retry_timer(qp);
+		(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) {
+		if (opcode == TID_OP(READ_REQ))
+			rvt_add_retry_timer_ext(qp, priv->timeout_shift);
+		else
+			rvt_add_retry_timer(qp);
+	}
 
 	while (qp->s_last != qp->s_acked) {
 		u32 s_last;
@@ -1191,6 +1520,7 @@ void hfi1_rc_send_complete(struct rvt_qp *qp, struct hfi1_opa_header *opah)
 		if (cmp_psn(wqe->lpsn, qp->s_sending_psn) >= 0 &&
 		    cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) <= 0)
 			break;
+		trdma_clean_swqe(qp, wqe);
 		rvt_qp_wqe_unreserve(qp, wqe);
 		s_last = qp->s_last;
 		trace_hfi1_qp_send_completion(qp, wqe, s_last);
@@ -1229,20 +1559,24 @@ static inline void update_last_psn(struct rvt_qp *qp, u32 psn)
  * This is similar to hfi1_send_complete but has to check to be sure
  * that the SGEs are not being referenced if the SWQE is being resent.
  */
-static struct rvt_swqe *do_rc_completion(struct rvt_qp *qp,
-					 struct rvt_swqe *wqe,
-					 struct hfi1_ibport *ibp)
+struct rvt_swqe *do_rc_completion(struct rvt_qp *qp,
+				  struct rvt_swqe *wqe,
+				  struct hfi1_ibport *ibp)
 {
+	struct hfi1_qp_priv *priv = qp->priv;
+
 	lockdep_assert_held(&qp->s_lock);
 	/*
 	 * Don't decrement refcount and don't generate a
 	 * completion if the SWQE is being resent until the send
 	 * is finished.
 	 */
+	trace_hfi1_rc_completion(qp, wqe->lpsn);
 	if (cmp_psn(wqe->lpsn, qp->s_sending_psn) < 0 ||
 	    cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) > 0) {
 		u32 s_last;
 
+		trdma_clean_swqe(qp, wqe);
 		rvt_put_swqe(wqe);
 		rvt_qp_wqe_unreserve(qp, wqe);
 		s_last = qp->s_last;
@@ -1300,6 +1634,10 @@ static struct rvt_swqe *do_rc_completion(struct rvt_qp *qp,
 			qp->s_draining = 0;
 		wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
 	}
+	if (priv->s_flags & HFI1_S_TID_WAIT_INTERLCK) {
+		priv->s_flags &= ~HFI1_S_TID_WAIT_INTERLCK;
+		hfi1_schedule_send(qp);
+	}
 	return wqe;
 }
 
@@ -1314,11 +1652,12 @@ static struct rvt_swqe *do_rc_completion(struct rvt_qp *qp,
  * May be called at interrupt level, with the QP s_lock held.
  * Returns 1 if OK, 0 if current operation should be aborted (NAK).
  */
-static int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode,
-		     u64 val, struct hfi1_ctxtdata *rcd)
+int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode,
+	      u64 val, struct hfi1_ctxtdata *rcd)
 {
 	struct hfi1_ibport *ibp;
 	enum ib_wc_status status;
+	struct hfi1_qp_priv *qpriv = qp->priv;
 	struct rvt_swqe *wqe;
 	int ret = 0;
 	u32 ack_psn;
@@ -1365,6 +1704,8 @@ static int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode,
 		 */
 		if ((wqe->wr.opcode == IB_WR_RDMA_READ &&
 		     (opcode != OP(RDMA_READ_RESPONSE_LAST) || diff != 0)) ||
+		    (wqe->wr.opcode == IB_WR_TID_RDMA_READ &&
+		     (opcode != TID_OP(READ_RESP) || diff != 0)) ||
 		    ((wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
 		      wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) &&
 		     (opcode != OP(ATOMIC_ACKNOWLEDGE) || diff != 0))) {
@@ -1415,10 +1756,18 @@ static int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode,
 			break;
 	}
 
+	trace_hfi1_rc_ack_do(qp, aeth, psn, wqe);
+	trace_hfi1_sender_do_rc_ack(qp);
 	switch (aeth >> IB_AETH_NAK_SHIFT) {
 	case 0:         /* ACK */
 		this_cpu_inc(*ibp->rvp.rc_acks);
-		if (qp->s_acked != qp->s_tail) {
+		if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) {
+			if (wqe_to_tid_req(wqe)->ack_pending)
+				rvt_mod_retry_timer_ext(qp,
+							qpriv->timeout_shift);
+			else
+				rvt_stop_rc_timers(qp);
+		} else if (qp->s_acked != qp->s_tail) {
 			/*
 			 * We are expecting more ACKs so
 			 * mod the retry timer.
@@ -1507,7 +1856,10 @@ static int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode,
 			ibp->rvp.n_other_naks++;
 class_b:
 			if (qp->s_last == qp->s_acked) {
-				rvt_send_complete(qp, wqe, status);
+				if (wqe->wr.opcode == IB_WR_TID_RDMA_READ)
+					hfi1_kern_read_tid_flow_free(qp);
+
+				hfi1_trdma_send_complete(qp, wqe, status);
 				rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR);
 			}
 			break;
@@ -1548,6 +1900,7 @@ static void rdma_seq_err(struct rvt_qp *qp, struct hfi1_ibport *ibp, u32 psn,
 
 	while (cmp_psn(psn, wqe->lpsn) > 0) {
 		if (wqe->wr.opcode == IB_WR_RDMA_READ ||
+		    wqe->wr.opcode == IB_WR_TID_RDMA_READ ||
 		    wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
 		    wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD)
 			break;
@@ -1754,16 +2107,6 @@ bail:
 	return;
 }
 
-static inline void rc_defered_ack(struct hfi1_ctxtdata *rcd,
-				  struct rvt_qp *qp)
-{
-	if (list_empty(&qp->rspwait)) {
-		qp->r_flags |= RVT_R_RSP_NAK;
-		rvt_get_qp(qp);
-		list_add_tail(&qp->rspwait, &rcd->qp_wait_list);
-	}
-}
-
 static inline void rc_cancel_ack(struct rvt_qp *qp)
 {
 	qp->r_adefered = 0;
@@ -1796,8 +2139,9 @@ static noinline int rc_rcv_error(struct ib_other_headers *ohdr, void *data,
 	struct hfi1_ibport *ibp = rcd_to_iport(rcd);
 	struct rvt_ack_entry *e;
 	unsigned long flags;
-	u8 i, prev;
-	int old_req;
+	u8 prev;
+	u8 mra; /* most recent ACK */
+	bool old_req;
 
 	trace_hfi1_rcv_error(qp, psn);
 	if (diff > 0) {
@@ -1843,29 +2187,8 @@ static noinline int rc_rcv_error(struct ib_other_headers *ohdr, void *data,
 
 	spin_lock_irqsave(&qp->s_lock, flags);
 
-	for (i = qp->r_head_ack_queue; ; i = prev) {
-		if (i == qp->s_tail_ack_queue)
-			old_req = 0;
-		if (i)
-			prev = i - 1;
-		else
-			prev = rvt_size_atomic(ib_to_rvt(qp->ibqp.device));
-		if (prev == qp->r_head_ack_queue) {
-			e = NULL;
-			break;
-		}
-		e = &qp->s_ack_queue[prev];
-		if (!e->opcode) {
-			e = NULL;
-			break;
-		}
-		if (cmp_psn(psn, e->psn) >= 0) {
-			if (prev == qp->s_tail_ack_queue &&
-			    cmp_psn(psn, e->lpsn) <= 0)
-				old_req = 0;
-			break;
-		}
-	}
+	e = find_prev_entry(qp, psn, &prev, &mra, &old_req);
+
 	switch (opcode) {
 	case OP(RDMA_READ_REQUEST): {
 		struct ib_reth *reth;
@@ -1940,7 +2263,7 @@ static noinline int rc_rcv_error(struct ib_other_headers *ohdr, void *data,
 		 * Resend the most recent ACK if this request is
 		 * after all the previous RDMA reads and atomics.
 		 */
-		if (i == qp->r_head_ack_queue) {
+		if (mra == qp->r_head_ack_queue) {
 			spin_unlock_irqrestore(&qp->s_lock, flags);
 			qp->r_nak_state = 0;
 			qp->r_ack_psn = qp->r_psn - 1;
@@ -1951,7 +2274,7 @@ static noinline int rc_rcv_error(struct ib_other_headers *ohdr, void *data,
 		 * Resend the RDMA read or atomic op which
 		 * ACKs this duplicate request.
 		 */
-		qp->s_tail_ack_queue = i;
+		qp->s_tail_ack_queue = mra;
 		break;
 	}
 	qp->s_ack_state = OP(ACKNOWLEDGE);
@@ -1968,17 +2291,6 @@ send_ack:
 	return 0;
 }
 
-static inline void update_ack_queue(struct rvt_qp *qp, unsigned n)
-{
-	unsigned next;
-
-	next = n + 1;
-	if (next > rvt_size_atomic(ib_to_rvt(qp->ibqp.device)))
-		next = 0;
-	qp->s_tail_ack_queue = next;
-	qp->s_ack_state = OP(ACKNOWLEDGE);
-}
-
 static void log_cca_event(struct hfi1_pportdata *ppd, u8 sl, u32 rlid,
 			  u32 lqpn, u32 rqpn, u8 svc_type)
 {
diff --git a/drivers/infiniband/hw/hfi1/rc.h b/drivers/infiniband/hw/hfi1/rc.h
new file mode 100644
index 000000000000..4329eadcb3df
--- /dev/null
+++ b/drivers/infiniband/hw/hfi1/rc.h
@@ -0,0 +1,50 @@
+/* SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) */
+/*
+ * Copyright(c) 2018 Intel Corporation.
+ *
+ */
+
+#ifndef HFI1_RC_H
+#define HFI1_RC_H
+
+/* cut down ridiculously long IB macro names */
+#define OP(x) IB_OPCODE_RC_##x
+
+static inline void update_ack_queue(struct rvt_qp *qp, unsigned int n)
+{
+	unsigned int next;
+
+	next = n + 1;
+	if (next > rvt_size_atomic(ib_to_rvt(qp->ibqp.device)))
+		next = 0;
+	qp->s_tail_ack_queue = next;
+	qp->s_ack_state = OP(ACKNOWLEDGE);
+}
+
+static inline void rc_defered_ack(struct hfi1_ctxtdata *rcd,
+				  struct rvt_qp *qp)
+{
+	if (list_empty(&qp->rspwait)) {
+		qp->r_flags |= RVT_R_RSP_NAK;
+		rvt_get_qp(qp);
+		list_add_tail(&qp->rspwait, &rcd->qp_wait_list);
+	}
+}
+
+static inline u32 restart_sge(struct rvt_sge_state *ss, struct rvt_swqe *wqe,
+			      u32 psn, u32 pmtu)
+{
+	u32 len;
+
+	len = delta_psn(psn, wqe->psn) * pmtu;
+	return rvt_restart_sge(ss, wqe, len);
+}
+
+struct rvt_ack_entry *find_prev_entry(struct rvt_qp *qp, u32 psn, u8 *prev,
+				      u8 *prev_ack, bool *scheduled);
+int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode, u64 val,
+	      struct hfi1_ctxtdata *rcd);
+struct rvt_swqe *do_rc_completion(struct rvt_qp *qp, struct rvt_swqe *wqe,
+				  struct hfi1_ibport *ibp);
+
+#endif /* HFI1_RC_H */
diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.c b/drivers/infiniband/hw/hfi1/tid_rdma.c
index e8f57c0cd8bc..0ee79403acaf 100644
--- a/drivers/infiniband/hw/hfi1/tid_rdma.c
+++ b/drivers/infiniband/hw/hfi1/tid_rdma.c
@@ -5,10 +5,48 @@
  */
 
 #include "hfi.h"
+#include "qp.h"
+#include "rc.h"
 #include "verbs.h"
 #include "tid_rdma.h"
+#include "exp_rcv.h"
 #include "trace.h"
 
+/**
+ * DOC: TID RDMA READ protocol
+ *
+ * This is an end-to-end protocol at the hfi1 level between two nodes that
+ * improves performance by avoiding data copy on the requester side. It
+ * converts a qualified RDMA READ request into a TID RDMA READ request on
+ * the requester side and thereafter handles the request and response
+ * differently. To be qualified, the RDMA READ request should meet the
+ * following:
+ * -- The total data length should be greater than 256K;
+ * -- The total data length should be a multiple of 4K page size;
+ * -- Each local scatter-gather entry should be 4K page aligned;
+ * -- Each local scatter-gather entry should be a multiple of 4K page size;
+ */
+
+#define RCV_TID_FLOW_TABLE_CTRL_FLOW_VALID_SMASK BIT_ULL(32)
+#define RCV_TID_FLOW_TABLE_CTRL_HDR_SUPP_EN_SMASK BIT_ULL(33)
+#define RCV_TID_FLOW_TABLE_CTRL_KEEP_AFTER_SEQ_ERR_SMASK BIT_ULL(34)
+#define RCV_TID_FLOW_TABLE_CTRL_KEEP_ON_GEN_ERR_SMASK BIT_ULL(35)
+#define RCV_TID_FLOW_TABLE_STATUS_SEQ_MISMATCH_SMASK BIT_ULL(37)
+#define RCV_TID_FLOW_TABLE_STATUS_GEN_MISMATCH_SMASK BIT_ULL(38)
+
+/* Maximum number of packets within a flow generation. */
+#define MAX_TID_FLOW_PSN BIT(HFI1_KDETH_BTH_SEQ_SHIFT)
+
+#define GENERATION_MASK 0xFFFFF
+
+static u32 mask_generation(u32 a)
+{
+	return a & GENERATION_MASK;
+}
+
+/* Reserved generation value to set to unused flows for kernel contexts */
+#define KERN_GENERATION_RESERVED mask_generation(U32_MAX)
+
 /*
  * J_KEY for kernel contexts when TID RDMA is used.
  * See generate_jkey() in hfi.h for more information.
@@ -17,8 +55,19 @@
 #define HFI1_KERNEL_MIN_JKEY HFI1_ADMIN_JKEY_RANGE
 #define HFI1_KERNEL_MAX_JKEY (2 * HFI1_ADMIN_JKEY_RANGE - 1)
 
+/* Maximum number of segments in flight per QP request. */
 #define TID_RDMA_MAX_READ_SEGS_PER_REQ  6
 #define TID_RDMA_MAX_WRITE_SEGS_PER_REQ 4
+#define MAX_REQ max_t(u16, TID_RDMA_MAX_READ_SEGS_PER_REQ, \
+			TID_RDMA_MAX_WRITE_SEGS_PER_REQ)
+#define MAX_FLOWS roundup_pow_of_two(MAX_REQ + 1)
+
+#define MAX_EXPECTED_PAGES     (MAX_EXPECTED_BUFFER / PAGE_SIZE)
+
+#define TID_RDMA_DESTQP_FLOW_SHIFT      11
+#define TID_RDMA_DESTQP_FLOW_MASK       0x1f
+
+#define TID_FLOW_SW_PSN BIT(0)
 
 #define TID_OPFN_QP_CTXT_MASK 0xff
 #define TID_OPFN_QP_CTXT_SHIFT 56
@@ -60,6 +109,13 @@
  * C - Capcode
  */
 
+static void tid_rdma_trigger_resume(struct work_struct *work);
+static void hfi1_kern_exp_rcv_free_flows(struct tid_rdma_request *req);
+static int hfi1_kern_exp_rcv_alloc_flows(struct tid_rdma_request *req,
+					 gfp_t gfp);
+static void hfi1_init_trdma_req(struct rvt_qp *qp,
+				struct tid_rdma_request *req);
+
 static u64 tid_rdma_opfn_encode(struct tid_rdma_params *p)
 {
 	return
@@ -210,7 +266,7 @@ int hfi1_kern_exp_rcv_init(struct hfi1_ctxtdata *rcd, int reinit)
 	BUILD_BUG_ON(TID_RDMA_JKEY > HFI1_KERNEL_MAX_JKEY);
 	rcd->jkey = TID_RDMA_JKEY;
 	hfi1_set_ctxt_jkey(rcd->dd, rcd, rcd->jkey);
-	return 0;
+	return hfi1_alloc_ctxt_rcv_groups(rcd);
 }
 
 /**
@@ -246,19 +302,2676 @@ int hfi1_qp_priv_init(struct rvt_dev_info *rdi, struct rvt_qp *qp,
 		      struct ib_qp_init_attr *init_attr)
 {
 	struct hfi1_qp_priv *qpriv = qp->priv;
+	int i, ret;
 
 	qpriv->rcd = qp_to_rcd(rdi, qp);
 
 	spin_lock_init(&qpriv->opfn.lock);
 	INIT_WORK(&qpriv->opfn.opfn_work, opfn_send_conn_request);
+	INIT_WORK(&qpriv->tid_rdma.trigger_work, tid_rdma_trigger_resume);
+	qpriv->flow_state.psn = 0;
+	qpriv->flow_state.index = RXE_NUM_TID_FLOWS;
+	qpriv->flow_state.last_index = RXE_NUM_TID_FLOWS;
+	qpriv->flow_state.generation = KERN_GENERATION_RESERVED;
+	INIT_LIST_HEAD(&qpriv->tid_wait);
+
+	if (init_attr->qp_type == IB_QPT_RC && HFI1_CAP_IS_KSET(TID_RDMA)) {
+		struct hfi1_devdata *dd = qpriv->rcd->dd;
+
+		qpriv->pages = kzalloc_node(TID_RDMA_MAX_PAGES *
+						sizeof(*qpriv->pages),
+					    GFP_KERNEL, dd->node);
+		if (!qpriv->pages)
+			return -ENOMEM;
+		for (i = 0; i < qp->s_size; i++) {
+			struct hfi1_swqe_priv *priv;
+			struct rvt_swqe *wqe = rvt_get_swqe_ptr(qp, i);
+
+			priv = kzalloc_node(sizeof(*priv), GFP_KERNEL,
+					    dd->node);
+			if (!priv)
+				return -ENOMEM;
+
+			hfi1_init_trdma_req(qp, &priv->tid_req);
+			priv->tid_req.e.swqe = wqe;
+			wqe->priv = priv;
+		}
+		for (i = 0; i < rvt_max_atomic(rdi); i++) {
+			struct hfi1_ack_priv *priv;
+
+			priv = kzalloc_node(sizeof(*priv), GFP_KERNEL,
+					    dd->node);
+			if (!priv)
+				return -ENOMEM;
+
+			hfi1_init_trdma_req(qp, &priv->tid_req);
+			priv->tid_req.e.ack = &qp->s_ack_queue[i];
+
+			ret = hfi1_kern_exp_rcv_alloc_flows(&priv->tid_req,
+							    GFP_KERNEL);
+			if (ret) {
+				kfree(priv);
+				return ret;
+			}
+			qp->s_ack_queue[i].priv = priv;
+		}
+	}
 
 	return 0;
 }
 
 void hfi1_qp_priv_tid_free(struct rvt_dev_info *rdi, struct rvt_qp *qp)
 {
+	struct hfi1_qp_priv *qpriv = qp->priv;
+	struct rvt_swqe *wqe;
+	u32 i;
+
+	if (qp->ibqp.qp_type == IB_QPT_RC && HFI1_CAP_IS_KSET(TID_RDMA)) {
+		for (i = 0; i < qp->s_size; i++) {
+			wqe = rvt_get_swqe_ptr(qp, i);
+			kfree(wqe->priv);
+			wqe->priv = NULL;
+		}
+		for (i = 0; i < rvt_max_atomic(rdi); i++) {
+			struct hfi1_ack_priv *priv = qp->s_ack_queue[i].priv;
+
+			if (priv)
+				hfi1_kern_exp_rcv_free_flows(&priv->tid_req);
+			kfree(priv);
+			qp->s_ack_queue[i].priv = NULL;
+		}
+		cancel_work_sync(&qpriv->opfn.opfn_work);
+		kfree(qpriv->pages);
+		qpriv->pages = NULL;
+	}
+}
+
+/* Flow and tid waiter functions */
+/**
+ * DOC: lock ordering
+ *
+ * There are two locks involved with the queuing
+ * routines: the qp s_lock and the exp_lock.
+ *
+ * Since the tid space allocation is called from
+ * the send engine, the qp s_lock is already held.
+ *
+ * The allocation routines will get the exp_lock.
+ *
+ * The first_qp() call is provided to allow the head of
+ * the rcd wait queue to be fetched under the exp_lock and
+ * followed by a drop of the exp_lock.
+ *
+ * Any qp in the wait list will have the qp reference count held
+ * to hold the qp in memory.
+ */
+
+/*
+ * return head of rcd wait list
+ *
+ * Must hold the exp_lock.
+ *
+ * Get a reference to the QP to hold the QP in memory.
+ *
+ * The caller must release the reference when the local
+ * is no longer being used.
+ */
+static struct rvt_qp *first_qp(struct hfi1_ctxtdata *rcd,
+			       struct tid_queue *queue)
+	__must_hold(&rcd->exp_lock)
+{
+	struct hfi1_qp_priv *priv;
+
+	lockdep_assert_held(&rcd->exp_lock);
+	priv = list_first_entry_or_null(&queue->queue_head,
+					struct hfi1_qp_priv,
+					tid_wait);
+	if (!priv)
+		return NULL;
+	rvt_get_qp(priv->owner);
+	return priv->owner;
+}
+
+/**
+ * kernel_tid_waiters - determine rcd wait
+ * @rcd: the receive context
+ * @qp: the head of the qp being processed
+ *
+ * This routine will return false IFF
+ * the list is NULL or the head of the
+ * list is the indicated qp.
+ *
+ * Must hold the qp s_lock and the exp_lock.
+ *
+ * Return:
+ * false if either of the conditions below are statisfied:
+ * 1. The list is empty or
+ * 2. The indicated qp is at the head of the list and the
+ *    HFI1_S_WAIT_TID_SPACE bit is set in qp->s_flags.
+ * true is returned otherwise.
+ */
+static bool kernel_tid_waiters(struct hfi1_ctxtdata *rcd,
+			       struct tid_queue *queue, struct rvt_qp *qp)
+	__must_hold(&rcd->exp_lock) __must_hold(&qp->s_lock)
+{
+	struct rvt_qp *fqp;
+	bool ret = true;
+
+	lockdep_assert_held(&qp->s_lock);
+	lockdep_assert_held(&rcd->exp_lock);
+	fqp = first_qp(rcd, queue);
+	if (!fqp || (fqp == qp && (qp->s_flags & HFI1_S_WAIT_TID_SPACE)))
+		ret = false;
+	rvt_put_qp(fqp);
+	return ret;
+}
+
+/**
+ * dequeue_tid_waiter - dequeue the qp from the list
+ * @qp - the qp to remove the wait list
+ *
+ * This routine removes the indicated qp from the
+ * wait list if it is there.
+ *
+ * This should be done after the hardware flow and
+ * tid array resources have been allocated.
+ *
+ * Must hold the qp s_lock and the rcd exp_lock.
+ *
+ * It assumes the s_lock to protect the s_flags
+ * field and to reliably test the HFI1_S_WAIT_TID_SPACE flag.
+ */
+static void dequeue_tid_waiter(struct hfi1_ctxtdata *rcd,
+			       struct tid_queue *queue, struct rvt_qp *qp)
+	__must_hold(&rcd->exp_lock) __must_hold(&qp->s_lock)
+{
+	struct hfi1_qp_priv *priv = qp->priv;
+
+	lockdep_assert_held(&qp->s_lock);
+	lockdep_assert_held(&rcd->exp_lock);
+	if (list_empty(&priv->tid_wait))
+		return;
+	list_del_init(&priv->tid_wait);
+	qp->s_flags &= ~HFI1_S_WAIT_TID_SPACE;
+	queue->dequeue++;
+	rvt_put_qp(qp);
+}
+
+/**
+ * queue_qp_for_tid_wait - suspend QP on tid space
+ * @rcd: the receive context
+ * @qp: the qp
+ *
+ * The qp is inserted at the tail of the rcd
+ * wait queue and the HFI1_S_WAIT_TID_SPACE s_flag is set.
+ *
+ * Must hold the qp s_lock and the exp_lock.
+ */
+static void queue_qp_for_tid_wait(struct hfi1_ctxtdata *rcd,
+				  struct tid_queue *queue, struct rvt_qp *qp)
+	__must_hold(&rcd->exp_lock) __must_hold(&qp->s_lock)
+{
+	struct hfi1_qp_priv *priv = qp->priv;
+
+	lockdep_assert_held(&qp->s_lock);
+	lockdep_assert_held(&rcd->exp_lock);
+	if (list_empty(&priv->tid_wait)) {
+		qp->s_flags |= HFI1_S_WAIT_TID_SPACE;
+		list_add_tail(&priv->tid_wait, &queue->queue_head);
+		priv->tid_enqueue = ++queue->enqueue;
+		rcd->dd->verbs_dev.n_tidwait++;
+		trace_hfi1_qpsleep(qp, HFI1_S_WAIT_TID_SPACE);
+		rvt_get_qp(qp);
+	}
+}
+
+/**
+ * __trigger_tid_waiter - trigger tid waiter
+ * @qp: the qp
+ *
+ * This is a private entrance to schedule the qp
+ * assuming the caller is holding the qp->s_lock.
+ */
+static void __trigger_tid_waiter(struct rvt_qp *qp)
+	__must_hold(&qp->s_lock)
+{
+	lockdep_assert_held(&qp->s_lock);
+	if (!(qp->s_flags & HFI1_S_WAIT_TID_SPACE))
+		return;
+	trace_hfi1_qpwakeup(qp, HFI1_S_WAIT_TID_SPACE);
+	hfi1_schedule_send(qp);
+}
+
+/**
+ * tid_rdma_schedule_tid_wakeup - schedule wakeup for a qp
+ * @qp - the qp
+ *
+ * trigger a schedule or a waiting qp in a deadlock
+ * safe manner.  The qp reference is held prior
+ * to this call via first_qp().
+ *
+ * If the qp trigger was already scheduled (!rval)
+ * the the reference is dropped, otherwise the resume
+ * or the destroy cancel will dispatch the reference.
+ */
+static void tid_rdma_schedule_tid_wakeup(struct rvt_qp *qp)
+{
+	struct hfi1_qp_priv *priv;
+	struct hfi1_ibport *ibp;
+	struct hfi1_pportdata *ppd;
+	struct hfi1_devdata *dd;
+	bool rval;
+
+	if (!qp)
+		return;
+
+	priv = qp->priv;
+	ibp = to_iport(qp->ibqp.device, qp->port_num);
+	ppd = ppd_from_ibp(ibp);
+	dd = dd_from_ibdev(qp->ibqp.device);
+
+	rval = queue_work_on(priv->s_sde ?
+			     priv->s_sde->cpu :
+			     cpumask_first(cpumask_of_node(dd->node)),
+			     ppd->hfi1_wq,
+			     &priv->tid_rdma.trigger_work);
+	if (!rval)
+		rvt_put_qp(qp);
+}
+
+/**
+ * tid_rdma_trigger_resume - field a trigger work request
+ * @work - the work item
+ *
+ * Complete the off qp trigger processing by directly
+ * calling the progress routine.
+ */
+static void tid_rdma_trigger_resume(struct work_struct *work)
+{
+	struct tid_rdma_qp_params *tr;
+	struct hfi1_qp_priv *priv;
+	struct rvt_qp *qp;
+
+	tr = container_of(work, struct tid_rdma_qp_params, trigger_work);
+	priv = container_of(tr, struct hfi1_qp_priv, tid_rdma);
+	qp = priv->owner;
+	spin_lock_irq(&qp->s_lock);
+	if (qp->s_flags & HFI1_S_WAIT_TID_SPACE) {
+		spin_unlock_irq(&qp->s_lock);
+		hfi1_do_send(priv->owner, true);
+	} else {
+		spin_unlock_irq(&qp->s_lock);
+	}
+	rvt_put_qp(qp);
+}
+
+/**
+ * tid_rdma_flush_wait - unwind any tid space wait
+ *
+ * This is called when resetting a qp to
+ * allow a destroy or reset to get rid
+ * of any tid space linkage and reference counts.
+ */
+static void _tid_rdma_flush_wait(struct rvt_qp *qp, struct tid_queue *queue)
+	__must_hold(&qp->s_lock)
+{
+	struct hfi1_qp_priv *priv;
+
+	if (!qp)
+		return;
+	lockdep_assert_held(&qp->s_lock);
+	priv = qp->priv;
+	qp->s_flags &= ~HFI1_S_WAIT_TID_SPACE;
+	spin_lock(&priv->rcd->exp_lock);
+	if (!list_empty(&priv->tid_wait)) {
+		list_del_init(&priv->tid_wait);
+		qp->s_flags &= ~HFI1_S_WAIT_TID_SPACE;
+		queue->dequeue++;
+		rvt_put_qp(qp);
+	}
+	spin_unlock(&priv->rcd->exp_lock);
+}
+
+void hfi1_tid_rdma_flush_wait(struct rvt_qp *qp)
+	__must_hold(&qp->s_lock)
+{
+	struct hfi1_qp_priv *priv = qp->priv;
+
+	_tid_rdma_flush_wait(qp, &priv->rcd->flow_queue);
+	_tid_rdma_flush_wait(qp, &priv->rcd->rarr_queue);
+}
+
+/* Flow functions */
+/**
+ * kern_reserve_flow - allocate a hardware flow
+ * @rcd - the context to use for allocation
+ * @last - the index of the preferred flow. Use RXE_NUM_TID_FLOWS to
+ *         signify "don't care".
+ *
+ * Use a bit mask based allocation to reserve a hardware
+ * flow for use in receiving KDETH data packets. If a preferred flow is
+ * specified the function will attempt to reserve that flow again, if
+ * available.
+ *
+ * The exp_lock must be held.
+ *
+ * Return:
+ * On success: a value postive value between 0 and RXE_NUM_TID_FLOWS - 1
+ * On failure: -EAGAIN
+ */
+static int kern_reserve_flow(struct hfi1_ctxtdata *rcd, int last)
+	__must_hold(&rcd->exp_lock)
+{
+	int nr;
+
+	/* Attempt to reserve the preferred flow index */
+	if (last >= 0 && last < RXE_NUM_TID_FLOWS &&
+	    !test_and_set_bit(last, &rcd->flow_mask))
+		return last;
+
+	nr = ffz(rcd->flow_mask);
+	BUILD_BUG_ON(RXE_NUM_TID_FLOWS >=
+		     (sizeof(rcd->flow_mask) * BITS_PER_BYTE));
+	if (nr > (RXE_NUM_TID_FLOWS - 1))
+		return -EAGAIN;
+	set_bit(nr, &rcd->flow_mask);
+	return nr;
+}
+
+static void kern_set_hw_flow(struct hfi1_ctxtdata *rcd, u32 generation,
+			     u32 flow_idx)
+{
+	u64 reg;
+
+	reg = ((u64)generation << HFI1_KDETH_BTH_SEQ_SHIFT) |
+		RCV_TID_FLOW_TABLE_CTRL_FLOW_VALID_SMASK |
+		RCV_TID_FLOW_TABLE_CTRL_KEEP_AFTER_SEQ_ERR_SMASK |
+		RCV_TID_FLOW_TABLE_CTRL_KEEP_ON_GEN_ERR_SMASK |
+		RCV_TID_FLOW_TABLE_STATUS_SEQ_MISMATCH_SMASK |
+		RCV_TID_FLOW_TABLE_STATUS_GEN_MISMATCH_SMASK;
+
+	if (generation != KERN_GENERATION_RESERVED)
+		reg |= RCV_TID_FLOW_TABLE_CTRL_HDR_SUPP_EN_SMASK;
+
+	write_uctxt_csr(rcd->dd, rcd->ctxt,
+			RCV_TID_FLOW_TABLE + 8 * flow_idx, reg);
+}
+
+static u32 kern_setup_hw_flow(struct hfi1_ctxtdata *rcd, u32 flow_idx)
+	__must_hold(&rcd->exp_lock)
+{
+	u32 generation = rcd->flows[flow_idx].generation;
+
+	kern_set_hw_flow(rcd, generation, flow_idx);
+	return generation;
+}
+
+static u32 kern_flow_generation_next(u32 gen)
+{
+	u32 generation = mask_generation(gen + 1);
+
+	if (generation == KERN_GENERATION_RESERVED)
+		generation = mask_generation(generation + 1);
+	return generation;
+}
+
+static void kern_clear_hw_flow(struct hfi1_ctxtdata *rcd, u32 flow_idx)
+	__must_hold(&rcd->exp_lock)
+{
+	rcd->flows[flow_idx].generation =
+		kern_flow_generation_next(rcd->flows[flow_idx].generation);
+	kern_set_hw_flow(rcd, KERN_GENERATION_RESERVED, flow_idx);
+}
+
+int hfi1_kern_setup_hw_flow(struct hfi1_ctxtdata *rcd, struct rvt_qp *qp)
+{
+	struct hfi1_qp_priv *qpriv = (struct hfi1_qp_priv *)qp->priv;
+	struct tid_flow_state *fs = &qpriv->flow_state;
+	struct rvt_qp *fqp;
+	unsigned long flags;
+	int ret = 0;
+
+	/* The QP already has an allocated flow */
+	if (fs->index != RXE_NUM_TID_FLOWS)
+		return ret;
+
+	spin_lock_irqsave(&rcd->exp_lock, flags);
+	if (kernel_tid_waiters(rcd, &rcd->flow_queue, qp))
+		goto queue;
+
+	ret = kern_reserve_flow(rcd, fs->last_index);
+	if (ret < 0)
+		goto queue;
+	fs->index = ret;
+	fs->last_index = fs->index;
+
+	/* Generation received in a RESYNC overrides default flow generation */
+	if (fs->generation != KERN_GENERATION_RESERVED)
+		rcd->flows[fs->index].generation = fs->generation;
+	fs->generation = kern_setup_hw_flow(rcd, fs->index);
+	fs->psn = 0;
+	fs->flags = 0;
+	dequeue_tid_waiter(rcd, &rcd->flow_queue, qp);
+	/* get head before dropping lock */
+	fqp = first_qp(rcd, &rcd->flow_queue);
+	spin_unlock_irqrestore(&rcd->exp_lock, flags);
+
+	tid_rdma_schedule_tid_wakeup(fqp);
+	return 0;
+queue:
+	queue_qp_for_tid_wait(rcd, &rcd->flow_queue, qp);
+	spin_unlock_irqrestore(&rcd->exp_lock, flags);
+	return -EAGAIN;
+}
+
+void hfi1_kern_clear_hw_flow(struct hfi1_ctxtdata *rcd, struct rvt_qp *qp)
+{
+	struct hfi1_qp_priv *qpriv = (struct hfi1_qp_priv *)qp->priv;
+	struct tid_flow_state *fs = &qpriv->flow_state;
+	struct rvt_qp *fqp;
+	unsigned long flags;
+
+	if (fs->index >= RXE_NUM_TID_FLOWS)
+		return;
+	spin_lock_irqsave(&rcd->exp_lock, flags);
+	kern_clear_hw_flow(rcd, fs->index);
+	clear_bit(fs->index, &rcd->flow_mask);
+	fs->index = RXE_NUM_TID_FLOWS;
+	fs->psn = 0;
+	fs->generation = KERN_GENERATION_RESERVED;
+
+	/* get head before dropping lock */
+	fqp = first_qp(rcd, &rcd->flow_queue);
+	spin_unlock_irqrestore(&rcd->exp_lock, flags);
+
+	if (fqp == qp) {
+		__trigger_tid_waiter(fqp);
+		rvt_put_qp(fqp);
+	} else {
+		tid_rdma_schedule_tid_wakeup(fqp);
+	}
+}
+
+void hfi1_kern_init_ctxt_generations(struct hfi1_ctxtdata *rcd)
+{
+	int i;
+
+	for (i = 0; i < RXE_NUM_TID_FLOWS; i++) {
+		rcd->flows[i].generation = mask_generation(prandom_u32());
+		kern_set_hw_flow(rcd, KERN_GENERATION_RESERVED, i);
+	}
+}
+
+/* TID allocation functions */
+static u8 trdma_pset_order(struct tid_rdma_pageset *s)
+{
+	u8 count = s->count;
+
+	return ilog2(count) + 1;
+}
+
+/**
+ * tid_rdma_find_phys_blocks_4k - get groups base on mr info
+ * @npages - number of pages
+ * @pages - pointer to an array of page structs
+ * @list - page set array to return
+ *
+ * This routine returns the number of groups associated with
+ * the current sge information.  This implementation is based
+ * on the expected receive find_phys_blocks() adjusted to
+ * use the MR information vs. the pfn.
+ *
+ * Return:
+ * the number of RcvArray entries
+ */
+static u32 tid_rdma_find_phys_blocks_4k(struct tid_rdma_flow *flow,
+					struct page **pages,
+					u32 npages,
+					struct tid_rdma_pageset *list)
+{
+	u32 pagecount, pageidx, setcount = 0, i;
+	void *vaddr, *this_vaddr;
+
+	if (!npages)
+		return 0;
+
+	/*
+	 * Look for sets of physically contiguous pages in the user buffer.
+	 * This will allow us to optimize Expected RcvArray entry usage by
+	 * using the bigger supported sizes.
+	 */
+	vaddr = page_address(pages[0]);
+	trace_hfi1_tid_flow_page(flow->req->qp, flow, 0, 0, 0, vaddr);
+	for (pageidx = 0, pagecount = 1, i = 1; i <= npages; i++) {
+		this_vaddr = i < npages ? page_address(pages[i]) : NULL;
+		trace_hfi1_tid_flow_page(flow->req->qp, flow, i, 0, 0,
+					 this_vaddr);
+		/*
+		 * If the vaddr's are not sequential, pages are not physically
+		 * contiguous.
+		 */
+		if (this_vaddr != (vaddr + PAGE_SIZE)) {
+			/*
+			 * At this point we have to loop over the set of
+			 * physically contiguous pages and break them down it
+			 * sizes supported by the HW.
+			 * There are two main constraints:
+			 *     1. The max buffer size is MAX_EXPECTED_BUFFER.
+			 *        If the total set size is bigger than that
+			 *        program only a MAX_EXPECTED_BUFFER chunk.
+			 *     2. The buffer size has to be a power of two. If
+			 *        it is not, round down to the closes power of
+			 *        2 and program that size.
+			 */
+			while (pagecount) {
+				int maxpages = pagecount;
+				u32 bufsize = pagecount * PAGE_SIZE;
+
+				if (bufsize > MAX_EXPECTED_BUFFER)
+					maxpages =
+						MAX_EXPECTED_BUFFER >>
+						PAGE_SHIFT;
+				else if (!is_power_of_2(bufsize))
+					maxpages =
+						rounddown_pow_of_two(bufsize) >>
+						PAGE_SHIFT;
+
+				list[setcount].idx = pageidx;
+				list[setcount].count = maxpages;
+				trace_hfi1_tid_pageset(flow->req->qp, setcount,
+						       list[setcount].idx,
+						       list[setcount].count);
+				pagecount -= maxpages;
+				pageidx += maxpages;
+				setcount++;
+			}
+			pageidx = i;
+			pagecount = 1;
+			vaddr = this_vaddr;
+		} else {
+			vaddr += PAGE_SIZE;
+			pagecount++;
+		}
+	}
+	/* insure we always return an even number of sets */
+	if (setcount & 1)
+		list[setcount++].count = 0;
+	return setcount;
+}
+
+/**
+ * tid_flush_pages - dump out pages into pagesets
+ * @list - list of pagesets
+ * @idx - pointer to current page index
+ * @pages - number of pages to dump
+ * @sets - current number of pagesset
+ *
+ * This routine flushes out accumuated pages.
+ *
+ * To insure an even number of sets the
+ * code may add a filler.
+ *
+ * This can happen with when pages is not
+ * a power of 2 or pages is a power of 2
+ * less than the maximum pages.
+ *
+ * Return:
+ * The new number of sets
+ */
+
+static u32 tid_flush_pages(struct tid_rdma_pageset *list,
+			   u32 *idx, u32 pages, u32 sets)
+{
+	while (pages) {
+		u32 maxpages = pages;
+
+		if (maxpages > MAX_EXPECTED_PAGES)
+			maxpages = MAX_EXPECTED_PAGES;
+		else if (!is_power_of_2(maxpages))
+			maxpages = rounddown_pow_of_two(maxpages);
+		list[sets].idx = *idx;
+		list[sets++].count = maxpages;
+		*idx += maxpages;
+		pages -= maxpages;
+	}
+	/* might need a filler */
+	if (sets & 1)
+		list[sets++].count = 0;
+	return sets;
+}
+
+/**
+ * tid_rdma_find_phys_blocks_8k - get groups base on mr info
+ * @pages - pointer to an array of page structs
+ * @npages - number of pages
+ * @list - page set array to return
+ *
+ * This routine parses an array of pages to compute pagesets
+ * in an 8k compatible way.
+ *
+ * pages are tested two at a time, i, i + 1 for contiguous
+ * pages and i - 1 and i contiguous pages.
+ *
+ * If any condition is false, any accumlated pages are flushed and
+ * v0,v1 are emitted as separate PAGE_SIZE pagesets
+ *
+ * Otherwise, the current 8k is totaled for a future flush.
+ *
+ * Return:
+ * The number of pagesets
+ * list set with the returned number of pagesets
+ *
+ */
+static u32 tid_rdma_find_phys_blocks_8k(struct tid_rdma_flow *flow,
+					struct page **pages,
+					u32 npages,
+					struct tid_rdma_pageset *list)
+{
+	u32 idx, sets = 0, i;
+	u32 pagecnt = 0;
+	void *v0, *v1, *vm1;
+
+	if (!npages)
+		return 0;
+	for (idx = 0, i = 0, vm1 = NULL; i < npages; i += 2) {
+		/* get a new v0 */
+		v0 = page_address(pages[i]);
+		trace_hfi1_tid_flow_page(flow->req->qp, flow, i, 1, 0, v0);
+		v1 = i + 1 < npages ?
+				page_address(pages[i + 1]) : NULL;
+		trace_hfi1_tid_flow_page(flow->req->qp, flow, i, 1, 1, v1);
+		/* compare i, i + 1 vaddr */
+		if (v1 != (v0 + PAGE_SIZE)) {
+			/* flush out pages */
+			sets = tid_flush_pages(list, &idx, pagecnt, sets);
+			/* output v0,v1 as two pagesets */
+			list[sets].idx = idx++;
+			list[sets++].count = 1;
+			if (v1) {
+				list[sets].count = 1;
+				list[sets++].idx = idx++;
+			} else {
+				list[sets++].count = 0;
+			}
+			vm1 = NULL;
+			pagecnt = 0;
+			continue;
+		}
+		/* i,i+1 consecutive, look at i-1,i */
+		if (vm1 && v0 != (vm1 + PAGE_SIZE)) {
+			/* flush out pages */
+			sets = tid_flush_pages(list, &idx, pagecnt, sets);
+			pagecnt = 0;
+		}
+		/* pages will always be a multiple of 8k */
+		pagecnt += 2;
+		/* save i-1 */
+		vm1 = v1;
+		/* move to next pair */
+	}
+	/* dump residual pages at end */
+	sets = tid_flush_pages(list, &idx, npages - idx, sets);
+	/* by design cannot be odd sets */
+	WARN_ON(sets & 1);
+	return sets;
+}
+
+/**
+ * Find pages for one segment of a sge array represented by @ss. The function
+ * does not check the sge, the sge must have been checked for alignment with a
+ * prior call to hfi1_kern_trdma_ok. Other sge checking is done as part of
+ * rvt_lkey_ok and rvt_rkey_ok. Also, the function only modifies the local sge
+ * copy maintained in @ss->sge, the original sge is not modified.
+ *
+ * Unlike IB RDMA WRITE, we can't decrement ss->num_sge here because we are not
+ * releasing the MR reference count at the same time. Otherwise, we'll "leak"
+ * references to the MR. This difference requires that we keep track of progress
+ * into the sg_list. This is done by the cur_seg cursor in the tid_rdma_request
+ * structure.
+ */
+static u32 kern_find_pages(struct tid_rdma_flow *flow,
+			   struct page **pages,
+			   struct rvt_sge_state *ss, bool *last)
+{
+	struct tid_rdma_request *req = flow->req;
+	struct rvt_sge *sge = &ss->sge;
+	u32 length = flow->req->seg_len;
+	u32 len = PAGE_SIZE;
+	u32 i = 0;
+
+	while (length && req->isge < ss->num_sge) {
+		pages[i++] = virt_to_page(sge->vaddr);
+
+		sge->vaddr += len;
+		sge->length -= len;
+		sge->sge_length -= len;
+		if (!sge->sge_length) {
+			if (++req->isge < ss->num_sge)
+				*sge = ss->sg_list[req->isge - 1];
+		} else if (sge->length == 0 && sge->mr->lkey) {
+			if (++sge->n >= RVT_SEGSZ) {
+				++sge->m;
+				sge->n = 0;
+			}
+			sge->vaddr = sge->mr->map[sge->m]->segs[sge->n].vaddr;
+			sge->length = sge->mr->map[sge->m]->segs[sge->n].length;
+		}
+		length -= len;
+	}
+
+	flow->length = flow->req->seg_len - length;
+	*last = req->isge == ss->num_sge ? false : true;
+	return i;
+}
+
+static void dma_unmap_flow(struct tid_rdma_flow *flow)
+{
+	struct hfi1_devdata *dd;
+	int i;
+	struct tid_rdma_pageset *pset;
+
+	dd = flow->req->rcd->dd;
+	for (i = 0, pset = &flow->pagesets[0]; i < flow->npagesets;
+			i++, pset++) {
+		if (pset->count && pset->addr) {
+			dma_unmap_page(&dd->pcidev->dev,
+				       pset->addr,
+				       PAGE_SIZE * pset->count,
+				       DMA_FROM_DEVICE);
+			pset->mapped = 0;
+		}
+	}
+}
+
+static int dma_map_flow(struct tid_rdma_flow *flow, struct page **pages)
+{
+	int i;
+	struct hfi1_devdata *dd = flow->req->rcd->dd;
+	struct tid_rdma_pageset *pset;
+
+	for (i = 0, pset = &flow->pagesets[0]; i < flow->npagesets;
+			i++, pset++) {
+		if (pset->count) {
+			pset->addr = dma_map_page(&dd->pcidev->dev,
+						  pages[pset->idx],
+						  0,
+						  PAGE_SIZE * pset->count,
+						  DMA_FROM_DEVICE);
+
+			if (dma_mapping_error(&dd->pcidev->dev, pset->addr)) {
+				dma_unmap_flow(flow);
+				return -ENOMEM;
+			}
+			pset->mapped = 1;
+		}
+	}
+	return 0;
+}
+
+static inline bool dma_mapped(struct tid_rdma_flow *flow)
+{
+	return !!flow->pagesets[0].mapped;
+}
+
+/*
+ * Get pages pointers and identify contiguous physical memory chunks for a
+ * segment. All segments are of length flow->req->seg_len.
+ */
+static int kern_get_phys_blocks(struct tid_rdma_flow *flow,
+				struct page **pages,
+				struct rvt_sge_state *ss, bool *last)
+{
+	u8 npages;
+
+	/* Reuse previously computed pagesets, if any */
+	if (flow->npagesets) {
+		trace_hfi1_tid_flow_alloc(flow->req->qp, flow->req->setup_head,
+					  flow);
+		if (!dma_mapped(flow))
+			return dma_map_flow(flow, pages);
+		return 0;
+	}
+
+	npages = kern_find_pages(flow, pages, ss, last);
+
+	if (flow->req->qp->pmtu == enum_to_mtu(OPA_MTU_4096))
+		flow->npagesets =
+			tid_rdma_find_phys_blocks_4k(flow, pages, npages,
+						     flow->pagesets);
+	else
+		flow->npagesets =
+			tid_rdma_find_phys_blocks_8k(flow, pages, npages,
+						     flow->pagesets);
+
+	return dma_map_flow(flow, pages);
+}
+
+static inline void kern_add_tid_node(struct tid_rdma_flow *flow,
+				     struct hfi1_ctxtdata *rcd, char *s,
+				     struct tid_group *grp, u8 cnt)
+{
+	struct kern_tid_node *node = &flow->tnode[flow->tnode_cnt++];
+
+	WARN_ON_ONCE(flow->tnode_cnt >=
+		     (TID_RDMA_MAX_SEGMENT_SIZE >> PAGE_SHIFT));
+	if (WARN_ON_ONCE(cnt & 1))
+		dd_dev_err(rcd->dd,
+			   "unexpected odd allocation cnt %u map 0x%x used %u",
+			   cnt, grp->map, grp->used);
+
+	node->grp = grp;
+	node->map = grp->map;
+	node->cnt = cnt;
+	trace_hfi1_tid_node_add(flow->req->qp, s, flow->tnode_cnt - 1,
+				grp->base, grp->map, grp->used, cnt);
+}
+
+/*
+ * Try to allocate pageset_count TID's from TID groups for a context
+ *
+ * This function allocates TID's without moving groups between lists or
+ * modifying grp->map. This is done as follows, being cogizant of the lists
+ * between which the TID groups will move:
+ * 1. First allocate complete groups of 8 TID's since this is more efficient,
+ *    these groups will move from group->full without affecting used
+ * 2. If more TID's are needed allocate from used (will move from used->full or
+ *    stay in used)
+ * 3. If we still don't have the required number of TID's go back and look again
+ *    at a complete group (will move from group->used)
+ */
+static int kern_alloc_tids(struct tid_rdma_flow *flow)
+{
+	struct hfi1_ctxtdata *rcd = flow->req->rcd;
+	struct hfi1_devdata *dd = rcd->dd;
+	u32 ngroups, pageidx = 0;
+	struct tid_group *group = NULL, *used;
+	u8 use;
+
+	flow->tnode_cnt = 0;
+	ngroups = flow->npagesets / dd->rcv_entries.group_size;
+	if (!ngroups)
+		goto used_list;
+
+	/* First look at complete groups */
+	list_for_each_entry(group,  &rcd->tid_group_list.list, list) {
+		kern_add_tid_node(flow, rcd, "complete groups", group,
+				  group->size);
+
+		pageidx += group->size;
+		if (!--ngroups)
+			break;
+	}
+
+	if (pageidx >= flow->npagesets)
+		goto ok;
+
+used_list:
+	/* Now look at partially used groups */
+	list_for_each_entry(used, &rcd->tid_used_list.list, list) {
+		use = min_t(u32, flow->npagesets - pageidx,
+			    used->size - used->used);
+		kern_add_tid_node(flow, rcd, "used groups", used, use);
+
+		pageidx += use;
+		if (pageidx >= flow->npagesets)
+			goto ok;
+	}
+
+	/*
+	 * Look again at a complete group, continuing from where we left.
+	 * However, if we are at the head, we have reached the end of the
+	 * complete groups list from the first loop above
+	 */
+	if (group && &group->list == &rcd->tid_group_list.list)
+		goto bail_eagain;
+	group = list_prepare_entry(group, &rcd->tid_group_list.list,
+				   list);
+	if (list_is_last(&group->list, &rcd->tid_group_list.list))
+		goto bail_eagain;
+	group = list_next_entry(group, list);
+	use = min_t(u32, flow->npagesets - pageidx, group->size);
+	kern_add_tid_node(flow, rcd, "complete continue", group, use);
+	pageidx += use;
+	if (pageidx >= flow->npagesets)
+		goto ok;
+bail_eagain:
+	trace_hfi1_msg_alloc_tids(flow->req->qp, " insufficient tids: needed ",
+				  (u64)flow->npagesets);
+	return -EAGAIN;
+ok:
+	return 0;
+}
+
+static void kern_program_rcv_group(struct tid_rdma_flow *flow, int grp_num,
+				   u32 *pset_idx)
+{
+	struct hfi1_ctxtdata *rcd = flow->req->rcd;
+	struct hfi1_devdata *dd = rcd->dd;
+	struct kern_tid_node *node = &flow->tnode[grp_num];
+	struct tid_group *grp = node->grp;
+	struct tid_rdma_pageset *pset;
+	u32 pmtu_pg = flow->req->qp->pmtu >> PAGE_SHIFT;
+	u32 rcventry, npages = 0, pair = 0, tidctrl;
+	u8 i, cnt = 0;
+
+	for (i = 0; i < grp->size; i++) {
+		rcventry = grp->base + i;
+
+		if (node->map & BIT(i) || cnt >= node->cnt) {
+			rcv_array_wc_fill(dd, rcventry);
+			continue;
+		}
+		pset = &flow->pagesets[(*pset_idx)++];
+		if (pset->count) {
+			hfi1_put_tid(dd, rcventry, PT_EXPECTED,
+				     pset->addr, trdma_pset_order(pset));
+		} else {
+			hfi1_put_tid(dd, rcventry, PT_INVALID, 0, 0);
+		}
+		npages += pset->count;
+
+		rcventry -= rcd->expected_base;
+		tidctrl = pair ? 0x3 : rcventry & 0x1 ? 0x2 : 0x1;
+		/*
+		 * A single TID entry will be used to use a rcvarr pair (with
+		 * tidctrl 0x3), if ALL these are true (a) the bit pos is even
+		 * (b) the group map shows current and the next bits as free
+		 * indicating two consecutive rcvarry entries are available (c)
+		 * we actually need 2 more entries
+		 */
+		pair = !(i & 0x1) && !((node->map >> i) & 0x3) &&
+			node->cnt >= cnt + 2;
+		if (!pair) {
+			if (!pset->count)
+				tidctrl = 0x1;
+			flow->tid_entry[flow->tidcnt++] =
+				EXP_TID_SET(IDX, rcventry >> 1) |
+				EXP_TID_SET(CTRL, tidctrl) |
+				EXP_TID_SET(LEN, npages);
+			trace_hfi1_tid_entry_alloc(/* entry */
+			   flow->req->qp, flow->tidcnt - 1,
+			   flow->tid_entry[flow->tidcnt - 1]);
+
+			/* Efficient DIV_ROUND_UP(npages, pmtu_pg) */
+			flow->npkts += (npages + pmtu_pg - 1) >> ilog2(pmtu_pg);
+			npages = 0;
+		}
+
+		if (grp->used == grp->size - 1)
+			tid_group_move(grp, &rcd->tid_used_list,
+				       &rcd->tid_full_list);
+		else if (!grp->used)
+			tid_group_move(grp, &rcd->tid_group_list,
+				       &rcd->tid_used_list);
+
+		grp->used++;
+		grp->map |= BIT(i);
+		cnt++;
+	}
+}
+
+static void kern_unprogram_rcv_group(struct tid_rdma_flow *flow, int grp_num)
+{
+	struct hfi1_ctxtdata *rcd = flow->req->rcd;
+	struct hfi1_devdata *dd = rcd->dd;
+	struct kern_tid_node *node = &flow->tnode[grp_num];
+	struct tid_group *grp = node->grp;
+	u32 rcventry;
+	u8 i, cnt = 0;
+
+	for (i = 0; i < grp->size; i++) {
+		rcventry = grp->base + i;
+
+		if (node->map & BIT(i) || cnt >= node->cnt) {
+			rcv_array_wc_fill(dd, rcventry);
+			continue;
+		}
+
+		hfi1_put_tid(dd, rcventry, PT_INVALID, 0, 0);
+
+		grp->used--;
+		grp->map &= ~BIT(i);
+		cnt++;
+
+		if (grp->used == grp->size - 1)
+			tid_group_move(grp, &rcd->tid_full_list,
+				       &rcd->tid_used_list);
+		else if (!grp->used)
+			tid_group_move(grp, &rcd->tid_used_list,
+				       &rcd->tid_group_list);
+	}
+	if (WARN_ON_ONCE(cnt & 1)) {
+		struct hfi1_ctxtdata *rcd = flow->req->rcd;
+		struct hfi1_devdata *dd = rcd->dd;
+
+		dd_dev_err(dd, "unexpected odd free cnt %u map 0x%x used %u",
+			   cnt, grp->map, grp->used);
+	}
+}
+
+static void kern_program_rcvarray(struct tid_rdma_flow *flow)
+{
+	u32 pset_idx = 0;
+	int i;
+
+	flow->npkts = 0;
+	flow->tidcnt = 0;
+	for (i = 0; i < flow->tnode_cnt; i++)
+		kern_program_rcv_group(flow, i, &pset_idx);
+	trace_hfi1_tid_flow_alloc(flow->req->qp, flow->req->setup_head, flow);
+}
+
+/**
+ * hfi1_kern_exp_rcv_setup() - setup TID's and flow for one segment of a
+ * TID RDMA request
+ *
+ * @req: TID RDMA request for which the segment/flow is being set up
+ * @ss: sge state, maintains state across successive segments of a sge
+ * @last: set to true after the last sge segment has been processed
+ *
+ * This function
+ * (1) finds a free flow entry in the flow circular buffer
+ * (2) finds pages and continuous physical chunks constituing one segment
+ *     of an sge
+ * (3) allocates TID group entries for those chunks
+ * (4) programs rcvarray entries in the hardware corresponding to those
+ *     TID's
+ * (5) computes a tidarray with formatted TID entries which can be sent
+ *     to the sender
+ * (6) Reserves and programs HW flows.
+ * (7) It also manages queing the QP when TID/flow resources are not
+ *     available.
+ *
+ * @req points to struct tid_rdma_request of which the segments are a part. The
+ * function uses qp, rcd and seg_len members of @req. In the absence of errors,
+ * req->flow_idx is the index of the flow which has been prepared in this
+ * invocation of function call. With flow = &req->flows[req->flow_idx],
+ * flow->tid_entry contains the TID array which the sender can use for TID RDMA
+ * sends and flow->npkts contains number of packets required to send the
+ * segment.
+ *
+ * hfi1_check_sge_align should be called prior to calling this function and if
+ * it signals error TID RDMA cannot be used for this sge and this function
+ * should not be called.
+ *
+ * For the queuing, caller must hold the flow->req->qp s_lock from the send
+ * engine and the function will procure the exp_lock.
+ *
+ * Return:
+ * The function returns -EAGAIN if sufficient number of TID/flow resources to
+ * map the segment could not be allocated. In this case the function should be
+ * called again with previous arguments to retry the TID allocation. There are
+ * no other error returns. The function returns 0 on success.
+ */
+int hfi1_kern_exp_rcv_setup(struct tid_rdma_request *req,
+			    struct rvt_sge_state *ss, bool *last)
+	__must_hold(&req->qp->s_lock)
+{
+	struct tid_rdma_flow *flow = &req->flows[req->setup_head];
+	struct hfi1_ctxtdata *rcd = req->rcd;
+	struct hfi1_qp_priv *qpriv = req->qp->priv;
+	unsigned long flags;
+	struct rvt_qp *fqp;
+	u16 clear_tail = req->clear_tail;
+
+	lockdep_assert_held(&req->qp->s_lock);
+	/*
+	 * We return error if either (a) we don't have space in the flow
+	 * circular buffer, or (b) we already have max entries in the buffer.
+	 * Max entries depend on the type of request we are processing and the
+	 * negotiated TID RDMA parameters.
+	 */
+	if (!CIRC_SPACE(req->setup_head, clear_tail, MAX_FLOWS) ||
+	    CIRC_CNT(req->setup_head, clear_tail, MAX_FLOWS) >=
+	    req->n_flows)
+		return -EINVAL;
+
+	/*
+	 * Get pages, identify contiguous physical memory chunks for the segment
+	 * If we can not determine a DMA address mapping we will treat it just
+	 * like if we ran out of space above.
+	 */
+	if (kern_get_phys_blocks(flow, qpriv->pages, ss, last)) {
+		hfi1_wait_kmem(flow->req->qp);
+		return -ENOMEM;
+	}
+
+	spin_lock_irqsave(&rcd->exp_lock, flags);
+	if (kernel_tid_waiters(rcd, &rcd->rarr_queue, flow->req->qp))
+		goto queue;
+
+	/*
+	 * At this point we know the number of pagesets and hence the number of
+	 * TID's to map the segment. Allocate the TID's from the TID groups. If
+	 * we cannot allocate the required number we exit and try again later
+	 */
+	if (kern_alloc_tids(flow))
+		goto queue;
+	/*
+	 * Finally program the TID entries with the pagesets, compute the
+	 * tidarray and enable the HW flow
+	 */
+	kern_program_rcvarray(flow);
+
+	/*
+	 * Setup the flow state with relevant information.
+	 * This information is used for tracking the sequence of data packets
+	 * for the segment.
+	 * The flow is setup here as this is the most accurate time and place
+	 * to do so. Doing at a later time runs the risk of the flow data in
+	 * qpriv getting out of sync.
+	 */
+	memset(&flow->flow_state, 0x0, sizeof(flow->flow_state));
+	flow->idx = qpriv->flow_state.index;
+	flow->flow_state.generation = qpriv->flow_state.generation;
+	flow->flow_state.spsn = qpriv->flow_state.psn;
+	flow->flow_state.lpsn = flow->flow_state.spsn + flow->npkts - 1;
+	flow->flow_state.r_next_psn =
+		full_flow_psn(flow, flow->flow_state.spsn);
+	qpriv->flow_state.psn += flow->npkts;
+
+	dequeue_tid_waiter(rcd, &rcd->rarr_queue, flow->req->qp);
+	/* get head before dropping lock */
+	fqp = first_qp(rcd, &rcd->rarr_queue);
+	spin_unlock_irqrestore(&rcd->exp_lock, flags);
+	tid_rdma_schedule_tid_wakeup(fqp);
+
+	req->setup_head = (req->setup_head + 1) & (MAX_FLOWS - 1);
+	return 0;
+queue:
+	queue_qp_for_tid_wait(rcd, &rcd->rarr_queue, flow->req->qp);
+	spin_unlock_irqrestore(&rcd->exp_lock, flags);
+	return -EAGAIN;
+}
+
+static void hfi1_tid_rdma_reset_flow(struct tid_rdma_flow *flow)
+{
+	flow->npagesets = 0;
+}
+
+/*
+ * This function is called after one segment has been successfully sent to
+ * release the flow and TID HW/SW resources for that segment. The segments for a
+ * TID RDMA request are setup and cleared in FIFO order which is managed using a
+ * circular buffer.
+ */
+int hfi1_kern_exp_rcv_clear(struct tid_rdma_request *req)
+	__must_hold(&req->qp->s_lock)
+{
+	struct tid_rdma_flow *flow = &req->flows[req->clear_tail];
+	struct hfi1_ctxtdata *rcd = req->rcd;
+	unsigned long flags;
+	int i;
+	struct rvt_qp *fqp;
+
+	lockdep_assert_held(&req->qp->s_lock);
+	/* Exit if we have nothing in the flow circular buffer */
+	if (!CIRC_CNT(req->setup_head, req->clear_tail, MAX_FLOWS))
+		return -EINVAL;
+
+	spin_lock_irqsave(&rcd->exp_lock, flags);
+
+	for (i = 0; i < flow->tnode_cnt; i++)
+		kern_unprogram_rcv_group(flow, i);
+	/* To prevent double unprogramming */
+	flow->tnode_cnt = 0;
+	/* get head before dropping lock */
+	fqp = first_qp(rcd, &rcd->rarr_queue);
+	spin_unlock_irqrestore(&rcd->exp_lock, flags);
+
+	dma_unmap_flow(flow);
+
+	hfi1_tid_rdma_reset_flow(flow);
+	req->clear_tail = (req->clear_tail + 1) & (MAX_FLOWS - 1);
+
+	if (fqp == req->qp) {
+		__trigger_tid_waiter(fqp);
+		rvt_put_qp(fqp);
+	} else {
+		tid_rdma_schedule_tid_wakeup(fqp);
+	}
+
+	return 0;
+}
+
+/*
+ * This function is called to release all the tid entries for
+ * a request.
+ */
+void hfi1_kern_exp_rcv_clear_all(struct tid_rdma_request *req)
+	__must_hold(&req->qp->s_lock)
+{
+	/* Use memory barrier for proper ordering */
+	while (CIRC_CNT(req->setup_head, req->clear_tail, MAX_FLOWS)) {
+		if (hfi1_kern_exp_rcv_clear(req))
+			break;
+	}
+}
+
+/**
+ * hfi1_kern_exp_rcv_free_flows - free priviously allocated flow information
+ * @req - the tid rdma request to be cleaned
+ */
+static void hfi1_kern_exp_rcv_free_flows(struct tid_rdma_request *req)
+{
+	kfree(req->flows);
+	req->flows = NULL;
+}
+
+/**
+ * __trdma_clean_swqe - clean up for large sized QPs
+ * @qp: the queue patch
+ * @wqe: the send wqe
+ */
+void __trdma_clean_swqe(struct rvt_qp *qp, struct rvt_swqe *wqe)
+{
+	struct hfi1_swqe_priv *p = wqe->priv;
+
+	hfi1_kern_exp_rcv_free_flows(&p->tid_req);
+}
+
+/*
+ * This can be called at QP create time or in the data path.
+ */
+static int hfi1_kern_exp_rcv_alloc_flows(struct tid_rdma_request *req,
+					 gfp_t gfp)
+{
+	struct tid_rdma_flow *flows;
+	int i;
+
+	if (likely(req->flows))
+		return 0;
+	flows = kmalloc_node(MAX_FLOWS * sizeof(*flows), gfp,
+			     req->rcd->numa_id);
+	if (!flows)
+		return -ENOMEM;
+	/* mini init */
+	for (i = 0; i < MAX_FLOWS; i++) {
+		flows[i].req = req;
+		flows[i].npagesets = 0;
+		flows[i].pagesets[0].mapped =  0;
+	}
+	req->flows = flows;
+	return 0;
+}
+
+static void hfi1_init_trdma_req(struct rvt_qp *qp,
+				struct tid_rdma_request *req)
+{
+	struct hfi1_qp_priv *qpriv = qp->priv;
+
+	/*
+	 * Initialize various TID RDMA request variables.
+	 * These variables are "static", which is why they
+	 * can be pre-initialized here before the WRs has
+	 * even been submitted.
+	 * However, non-NULL values for these variables do not
+	 * imply that this WQE has been enabled for TID RDMA.
+	 * Drivers should check the WQE's opcode to determine
+	 * if a request is a TID RDMA one or not.
+	 */
+	req->qp = qp;
+	req->rcd = qpriv->rcd;
+}
+
+u64 hfi1_access_sw_tid_wait(const struct cntr_entry *entry,
+			    void *context, int vl, int mode, u64 data)
+{
+	struct hfi1_devdata *dd = context;
+
+	return dd->verbs_dev.n_tidwait;
+}
+
+static struct tid_rdma_flow *find_flow_ib(struct tid_rdma_request *req,
+					  u32 psn, u16 *fidx)
+{
+	u16 head, tail;
+	struct tid_rdma_flow *flow;
+
+	head = req->setup_head;
+	tail = req->clear_tail;
+	for ( ; CIRC_CNT(head, tail, MAX_FLOWS);
+	     tail = CIRC_NEXT(tail, MAX_FLOWS)) {
+		flow = &req->flows[tail];
+		if (cmp_psn(psn, flow->flow_state.ib_spsn) >= 0 &&
+		    cmp_psn(psn, flow->flow_state.ib_lpsn) <= 0) {
+			if (fidx)
+				*fidx = tail;
+			return flow;
+		}
+	}
+	return NULL;
+}
+
+static struct tid_rdma_flow *
+__find_flow_ranged(struct tid_rdma_request *req, u16 head, u16 tail,
+		   u32 psn, u16 *fidx)
+{
+	for ( ; CIRC_CNT(head, tail, MAX_FLOWS);
+	      tail = CIRC_NEXT(tail, MAX_FLOWS)) {
+		struct tid_rdma_flow *flow = &req->flows[tail];
+		u32 spsn, lpsn;
+
+		spsn = full_flow_psn(flow, flow->flow_state.spsn);
+		lpsn = full_flow_psn(flow, flow->flow_state.lpsn);
+
+		if (cmp_psn(psn, spsn) >= 0 && cmp_psn(psn, lpsn) <= 0) {
+			if (fidx)
+				*fidx = tail;
+			return flow;
+		}
+	}
+	return NULL;
+}
+
+static struct tid_rdma_flow *find_flow(struct tid_rdma_request *req,
+				       u32 psn, u16 *fidx)
+{
+	return __find_flow_ranged(req, req->setup_head, req->clear_tail, psn,
+				  fidx);
+}
+
+/* TID RDMA READ functions */
+u32 hfi1_build_tid_rdma_read_packet(struct rvt_swqe *wqe,
+				    struct ib_other_headers *ohdr, u32 *bth1,
+				    u32 *bth2, u32 *len)
+{
+	struct tid_rdma_request *req = wqe_to_tid_req(wqe);
+	struct tid_rdma_flow *flow = &req->flows[req->flow_idx];
+	struct rvt_qp *qp = req->qp;
+	struct hfi1_qp_priv *qpriv = qp->priv;
+	struct hfi1_swqe_priv *wpriv = wqe->priv;
+	struct tid_rdma_read_req *rreq = &ohdr->u.tid_rdma.r_req;
+	struct tid_rdma_params *remote;
+	u32 req_len = 0;
+	void *req_addr = NULL;
+
+	/* This is the IB psn used to send the request */
+	*bth2 = mask_psn(flow->flow_state.ib_spsn + flow->pkt);
+	trace_hfi1_tid_flow_build_read_pkt(qp, req->flow_idx, flow);
+
+	/* TID Entries for TID RDMA READ payload */
+	req_addr = &flow->tid_entry[flow->tid_idx];
+	req_len = sizeof(*flow->tid_entry) *
+			(flow->tidcnt - flow->tid_idx);
+
+	memset(&ohdr->u.tid_rdma.r_req, 0, sizeof(ohdr->u.tid_rdma.r_req));
+	wpriv->ss.sge.vaddr = req_addr;
+	wpriv->ss.sge.sge_length = req_len;
+	wpriv->ss.sge.length = wpriv->ss.sge.sge_length;
+	/*
+	 * We can safely zero these out. Since the first SGE covers the
+	 * entire packet, nothing else should even look at the MR.
+	 */
+	wpriv->ss.sge.mr = NULL;
+	wpriv->ss.sge.m = 0;
+	wpriv->ss.sge.n = 0;
+
+	wpriv->ss.sg_list = NULL;
+	wpriv->ss.total_len = wpriv->ss.sge.sge_length;
+	wpriv->ss.num_sge = 1;
+
+	/* Construct the TID RDMA READ REQ packet header */
+	rcu_read_lock();
+	remote = rcu_dereference(qpriv->tid_rdma.remote);
+
+	KDETH_RESET(rreq->kdeth0, KVER, 0x1);
+	KDETH_RESET(rreq->kdeth1, JKEY, remote->jkey);
+	rreq->reth.vaddr = cpu_to_be64(wqe->rdma_wr.remote_addr +
+			   req->cur_seg * req->seg_len + flow->sent);
+	rreq->reth.rkey = cpu_to_be32(wqe->rdma_wr.rkey);
+	rreq->reth.length = cpu_to_be32(*len);
+	rreq->tid_flow_psn =
+		cpu_to_be32((flow->flow_state.generation <<
+			     HFI1_KDETH_BTH_SEQ_SHIFT) |
+			    ((flow->flow_state.spsn + flow->pkt) &
+			     HFI1_KDETH_BTH_SEQ_MASK));
+	rreq->tid_flow_qp =
+		cpu_to_be32(qpriv->tid_rdma.local.qp |
+			    ((flow->idx & TID_RDMA_DESTQP_FLOW_MASK) <<
+			     TID_RDMA_DESTQP_FLOW_SHIFT) |
+			    qpriv->rcd->ctxt);
+	rreq->verbs_qp = cpu_to_be32(qp->remote_qpn);
+	*bth1 &= ~RVT_QPN_MASK;
+	*bth1 |= remote->qp;
+	*bth2 |= IB_BTH_REQ_ACK;
+	rcu_read_unlock();
+
+	/* We are done with this segment */
+	flow->sent += *len;
+	req->cur_seg++;
+	qp->s_state = TID_OP(READ_REQ);
+	req->ack_pending++;
+	req->flow_idx = (req->flow_idx + 1) & (MAX_FLOWS - 1);
+	qpriv->pending_tid_r_segs++;
+	qp->s_num_rd_atomic++;
+
+	/* Set the TID RDMA READ request payload size */
+	*len = req_len;
+
+	return sizeof(ohdr->u.tid_rdma.r_req) / sizeof(u32);
+}
+
+/*
+ * @len: contains the data length to read upon entry and the read request
+ *       payload length upon exit.
+ */
+u32 hfi1_build_tid_rdma_read_req(struct rvt_qp *qp, struct rvt_swqe *wqe,
+				 struct ib_other_headers *ohdr, u32 *bth1,
+				 u32 *bth2, u32 *len)
+	__must_hold(&qp->s_lock)
+{
+	struct hfi1_qp_priv *qpriv = qp->priv;
+	struct tid_rdma_request *req = wqe_to_tid_req(wqe);
+	struct tid_rdma_flow *flow = NULL;
+	u32 hdwords = 0;
+	bool last;
+	bool retry = true;
+	u32 npkts = rvt_div_round_up_mtu(qp, *len);
+
+	trace_hfi1_tid_req_build_read_req(qp, 0, wqe->wr.opcode, wqe->psn,
+					  wqe->lpsn, req);
+	/*
+	 * Check sync conditions. Make sure that there are no pending
+	 * segments before freeing the flow.
+	 */
+sync_check:
+	if (req->state == TID_REQUEST_SYNC) {
+		if (qpriv->pending_tid_r_segs)
+			goto done;
+
+		hfi1_kern_clear_hw_flow(req->rcd, qp);
+		req->state = TID_REQUEST_ACTIVE;
+	}
+
+	/*
+	 * If the request for this segment is resent, the tid resources should
+	 * have been allocated before. In this case, req->flow_idx should
+	 * fall behind req->setup_head.
+	 */
+	if (req->flow_idx == req->setup_head) {
+		retry = false;
+		if (req->state == TID_REQUEST_RESEND) {
+			/*
+			 * This is the first new segment for a request whose
+			 * earlier segments have been re-sent. We need to
+			 * set up the sge pointer correctly.
+			 */
+			restart_sge(&qp->s_sge, wqe, req->s_next_psn,
+				    qp->pmtu);
+			req->isge = 0;
+			req->state = TID_REQUEST_ACTIVE;
+		}
+
+		/*
+		 * Check sync. The last PSN of each generation is reserved for
+		 * RESYNC.
+		 */
+		if ((qpriv->flow_state.psn + npkts) > MAX_TID_FLOW_PSN - 1) {
+			req->state = TID_REQUEST_SYNC;
+			goto sync_check;
+		}
+
+		/* Allocate the flow if not yet */
+		if (hfi1_kern_setup_hw_flow(qpriv->rcd, qp))
+			goto done;
+
+		/*
+		 * The following call will advance req->setup_head after
+		 * allocating the tid entries.
+		 */
+		if (hfi1_kern_exp_rcv_setup(req, &qp->s_sge, &last)) {
+			req->state = TID_REQUEST_QUEUED;
+
+			/*
+			 * We don't have resources for this segment. The QP has
+			 * already been queued.
+			 */
+			goto done;
+		}
+	}
+
+	/* req->flow_idx should only be one slot behind req->setup_head */
+	flow = &req->flows[req->flow_idx];
+	flow->pkt = 0;
+	flow->tid_idx = 0;
+	flow->sent = 0;
+	if (!retry) {
+		/* Set the first and last IB PSN for the flow in use.*/
+		flow->flow_state.ib_spsn = req->s_next_psn;
+		flow->flow_state.ib_lpsn =
+			flow->flow_state.ib_spsn + flow->npkts - 1;
+	}
+
+	/* Calculate the next segment start psn.*/
+	req->s_next_psn += flow->npkts;
+
+	/* Build the packet header */
+	hdwords = hfi1_build_tid_rdma_read_packet(wqe, ohdr, bth1, bth2, len);
+done:
+	return hdwords;
+}
+
+/*
+ * Validate and accept the TID RDMA READ request parameters.
+ * Return 0 if the request is accepted successfully;
+ * Return 1 otherwise.
+ */
+static int tid_rdma_rcv_read_request(struct rvt_qp *qp,
+				     struct rvt_ack_entry *e,
+				     struct hfi1_packet *packet,
+				     struct ib_other_headers *ohdr,
+				     u32 bth0, u32 psn, u64 vaddr, u32 len)
+{
+	struct hfi1_qp_priv *qpriv = qp->priv;
+	struct tid_rdma_request *req;
+	struct tid_rdma_flow *flow;
+	u32 flow_psn, i, tidlen = 0, pktlen, tlen;
+
+	req = ack_to_tid_req(e);
+
+	/* Validate the payload first */
+	flow = &req->flows[req->setup_head];
+
+	/* payload length = packet length - (header length + ICRC length) */
+	pktlen = packet->tlen - (packet->hlen + 4);
+	if (pktlen > sizeof(flow->tid_entry))
+		return 1;
+	memcpy(flow->tid_entry, packet->ebuf, pktlen);
+	flow->tidcnt = pktlen / sizeof(*flow->tid_entry);
+
+	/*
+	 * Walk the TID_ENTRY list to make sure we have enough space for a
+	 * complete segment. Also calculate the number of required packets.
+	 */
+	flow->npkts = rvt_div_round_up_mtu(qp, len);
+	for (i = 0; i < flow->tidcnt; i++) {
+		trace_hfi1_tid_entry_rcv_read_req(qp, i,
+						  flow->tid_entry[i]);
+		tlen = EXP_TID_GET(flow->tid_entry[i], LEN);
+		if (!tlen)
+			return 1;
+
+		/*
+		 * For tid pair (tidctr == 3), the buffer size of the pair
+		 * should be the sum of the buffer size described by each
+		 * tid entry. However, only the first entry needs to be
+		 * specified in the request (see WFR HAS Section 8.5.7.1).
+		 */
+		tidlen += tlen;
+	}
+	if (tidlen * PAGE_SIZE < len)
+		return 1;
+
+	/* Empty the flow array */
+	req->clear_tail = req->setup_head;
+	flow->pkt = 0;
+	flow->tid_idx = 0;
+	flow->tid_offset = 0;
+	flow->sent = 0;
+	flow->tid_qpn = be32_to_cpu(ohdr->u.tid_rdma.r_req.tid_flow_qp);
+	flow->idx = (flow->tid_qpn >> TID_RDMA_DESTQP_FLOW_SHIFT) &
+		    TID_RDMA_DESTQP_FLOW_MASK;
+	flow_psn = mask_psn(be32_to_cpu(ohdr->u.tid_rdma.r_req.tid_flow_psn));
+	flow->flow_state.generation = flow_psn >> HFI1_KDETH_BTH_SEQ_SHIFT;
+	flow->flow_state.spsn = flow_psn & HFI1_KDETH_BTH_SEQ_MASK;
+	flow->length = len;
+
+	flow->flow_state.lpsn = flow->flow_state.spsn +
+		flow->npkts - 1;
+	flow->flow_state.ib_spsn = psn;
+	flow->flow_state.ib_lpsn = flow->flow_state.ib_spsn + flow->npkts - 1;
+
+	trace_hfi1_tid_flow_rcv_read_req(qp, req->setup_head, flow);
+	/* Set the initial flow index to the current flow. */
+	req->flow_idx = req->setup_head;
+
+	/* advance circular buffer head */
+	req->setup_head = (req->setup_head + 1) & (MAX_FLOWS - 1);
+
+	/*
+	 * Compute last PSN for request.
+	 */
+	e->opcode = (bth0 >> 24) & 0xff;
+	e->psn = psn;
+	e->lpsn = psn + flow->npkts - 1;
+	e->sent = 0;
+
+	req->n_flows = qpriv->tid_rdma.local.max_read;
+	req->state = TID_REQUEST_ACTIVE;
+	req->cur_seg = 0;
+	req->comp_seg = 0;
+	req->ack_seg = 0;
+	req->isge = 0;
+	req->seg_len = qpriv->tid_rdma.local.max_len;
+	req->total_len = len;
+	req->total_segs = 1;
+	req->r_flow_psn = e->psn;
+
+	trace_hfi1_tid_req_rcv_read_req(qp, 0, e->opcode, e->psn, e->lpsn,
+					req);
+	return 0;
+}
+
+static int tid_rdma_rcv_error(struct hfi1_packet *packet,
+			      struct ib_other_headers *ohdr,
+			      struct rvt_qp *qp, u32 psn, int diff)
+{
+	struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
+	struct hfi1_ctxtdata *rcd = ((struct hfi1_qp_priv *)qp->priv)->rcd;
+	struct rvt_ack_entry *e;
+	struct tid_rdma_request *req;
+	unsigned long flags;
+	u8 prev;
+	bool old_req;
+
+	trace_hfi1_rsp_tid_rcv_error(qp, psn);
+	trace_hfi1_tid_rdma_rcv_err(qp, 0, psn, diff);
+	if (diff > 0) {
+		/* sequence error */
+		if (!qp->r_nak_state) {
+			ibp->rvp.n_rc_seqnak++;
+			qp->r_nak_state = IB_NAK_PSN_ERROR;
+			qp->r_ack_psn = qp->r_psn;
+			rc_defered_ack(rcd, qp);
+		}
+		goto done;
+	}
+
+	ibp->rvp.n_rc_dupreq++;
+
+	spin_lock_irqsave(&qp->s_lock, flags);
+	e = find_prev_entry(qp, psn, &prev, NULL, &old_req);
+	if (!e || e->opcode != TID_OP(READ_REQ))
+		goto unlock;
+
+	req = ack_to_tid_req(e);
+	req->r_flow_psn = psn;
+	trace_hfi1_tid_req_rcv_err(qp, 0, e->opcode, e->psn, e->lpsn, req);
+	if (e->opcode == TID_OP(READ_REQ)) {
+		struct ib_reth *reth;
+		u32 offset;
+		u32 len;
+		u32 rkey;
+		u64 vaddr;
+		int ok;
+		u32 bth0;
+
+		reth = &ohdr->u.tid_rdma.r_req.reth;
+		/*
+		 * The requester always restarts from the start of the original
+		 * request.
+		 */
+		offset = delta_psn(psn, e->psn) * qp->pmtu;
+		len = be32_to_cpu(reth->length);
+		if (psn != e->psn || len != req->total_len)
+			goto unlock;
+
+		if (e->rdma_sge.mr) {
+			rvt_put_mr(e->rdma_sge.mr);
+			e->rdma_sge.mr = NULL;
+		}
+
+		rkey = be32_to_cpu(reth->rkey);
+		vaddr = get_ib_reth_vaddr(reth);
+
+		qp->r_len = len;
+		ok = rvt_rkey_ok(qp, &e->rdma_sge, len, vaddr, rkey,
+				 IB_ACCESS_REMOTE_READ);
+		if (unlikely(!ok))
+			goto unlock;
+
+		/*
+		 * If all the response packets for the current request have
+		 * been sent out and this request is complete (old_request
+		 * == false) and the TID flow may be unusable (the
+		 * req->clear_tail is advanced). However, when an earlier
+		 * request is received, this request will not be complete any
+		 * more (qp->s_tail_ack_queue is moved back, see below).
+		 * Consequently, we need to update the TID flow info everytime
+		 * a duplicate request is received.
+		 */
+		bth0 = be32_to_cpu(ohdr->bth[0]);
+		if (tid_rdma_rcv_read_request(qp, e, packet, ohdr, bth0, psn,
+					      vaddr, len))
+			goto unlock;
+
+		/*
+		 * True if the request is already scheduled (between
+		 * qp->s_tail_ack_queue and qp->r_head_ack_queue);
+		 */
+		if (old_req)
+			goto unlock;
+	}
+	/* Re-process old requests.*/
+	qp->s_tail_ack_queue = prev;
+	/*
+	 * Since the qp->s_tail_ack_queue is modified, the
+	 * qp->s_ack_state must be changed to re-initialize
+	 * qp->s_ack_rdma_sge; Otherwise, we will end up in
+	 * wrong memory region.
+	 */
+	qp->s_ack_state = OP(ACKNOWLEDGE);
+	qp->r_state = e->opcode;
+	qp->r_nak_state = 0;
+	qp->s_flags |= RVT_S_RESP_PENDING;
+	hfi1_schedule_send(qp);
+unlock:
+	spin_unlock_irqrestore(&qp->s_lock, flags);
+done:
+	return 1;
+}
+
+void hfi1_rc_rcv_tid_rdma_read_req(struct hfi1_packet *packet)
+{
+	/* HANDLER FOR TID RDMA READ REQUEST packet (Responder side)*/
+
+	/*
+	 * 1. Verify TID RDMA READ REQ as per IB_OPCODE_RC_RDMA_READ
+	 *    (see hfi1_rc_rcv())
+	 * 2. Put TID RDMA READ REQ into the response queueu (s_ack_queue)
+	 *     - Setup struct tid_rdma_req with request info
+	 *     - Initialize struct tid_rdma_flow info;
+	 *     - Copy TID entries;
+	 * 3. Set the qp->s_ack_state.
+	 * 4. Set RVT_S_RESP_PENDING in s_flags.
+	 * 5. Kick the send engine (hfi1_schedule_send())
+	 */
+	struct hfi1_ctxtdata *rcd = packet->rcd;
+	struct rvt_qp *qp = packet->qp;
+	struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
+	struct ib_other_headers *ohdr = packet->ohdr;
+	struct rvt_ack_entry *e;
+	unsigned long flags;
+	struct ib_reth *reth;
+	struct hfi1_qp_priv *qpriv = qp->priv;
+	u32 bth0, psn, len, rkey;
+	bool is_fecn;
+	u8 next;
+	u64 vaddr;
+	int diff;
+	u8 nack_state = IB_NAK_INVALID_REQUEST;
+
+	bth0 = be32_to_cpu(ohdr->bth[0]);
+	if (hfi1_ruc_check_hdr(ibp, packet))
+		return;
+
+	is_fecn = process_ecn(qp, packet);
+	psn = mask_psn(be32_to_cpu(ohdr->bth[2]));
+	trace_hfi1_rsp_rcv_tid_read_req(qp, psn);
+
+	if (qp->state == IB_QPS_RTR && !(qp->r_flags & RVT_R_COMM_EST))
+		rvt_comm_est(qp);
+
+	if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_READ)))
+		goto nack_inv;
+
+	reth = &ohdr->u.tid_rdma.r_req.reth;
+	vaddr = be64_to_cpu(reth->vaddr);
+	len = be32_to_cpu(reth->length);
+	/* The length needs to be in multiples of PAGE_SIZE */
+	if (!len || len & ~PAGE_MASK || len > qpriv->tid_rdma.local.max_len)
+		goto nack_inv;
+
+	diff = delta_psn(psn, qp->r_psn);
+	if (unlikely(diff)) {
+		if (tid_rdma_rcv_error(packet, ohdr, qp, psn, diff))
+			return;
+		goto send_ack;
+	}
+
+	/* We've verified the request, insert it into the ack queue. */
+	next = qp->r_head_ack_queue + 1;
+	if (next > rvt_size_atomic(ib_to_rvt(qp->ibqp.device)))
+		next = 0;
+	spin_lock_irqsave(&qp->s_lock, flags);
+	if (unlikely(next == qp->s_tail_ack_queue)) {
+		if (!qp->s_ack_queue[next].sent) {
+			nack_state = IB_NAK_REMOTE_OPERATIONAL_ERROR;
+			goto nack_inv_unlock;
+		}
+		update_ack_queue(qp, next);
+	}
+	e = &qp->s_ack_queue[qp->r_head_ack_queue];
+	if (e->rdma_sge.mr) {
+		rvt_put_mr(e->rdma_sge.mr);
+		e->rdma_sge.mr = NULL;
+	}
+
+	rkey = be32_to_cpu(reth->rkey);
+	qp->r_len = len;
+
+	if (unlikely(!rvt_rkey_ok(qp, &e->rdma_sge, qp->r_len, vaddr,
+				  rkey, IB_ACCESS_REMOTE_READ)))
+		goto nack_acc;
+
+	/* Accept the request parameters */
+	if (tid_rdma_rcv_read_request(qp, e, packet, ohdr, bth0, psn, vaddr,
+				      len))
+		goto nack_inv_unlock;
+
+	qp->r_state = e->opcode;
+	qp->r_nak_state = 0;
+	/*
+	 * We need to increment the MSN here instead of when we
+	 * finish sending the result since a duplicate request would
+	 * increment it more than once.
+	 */
+	qp->r_msn++;
+	qp->r_psn += e->lpsn - e->psn + 1;
+
+	qp->r_head_ack_queue = next;
+
+	/* Schedule the send tasklet. */
+	qp->s_flags |= RVT_S_RESP_PENDING;
+	hfi1_schedule_send(qp);
+
+	spin_unlock_irqrestore(&qp->s_lock, flags);
+	if (is_fecn)
+		goto send_ack;
+	return;
+
+nack_inv_unlock:
+	spin_unlock_irqrestore(&qp->s_lock, flags);
+nack_inv:
+	rvt_rc_error(qp, IB_WC_LOC_QP_OP_ERR);
+	qp->r_nak_state = nack_state;
+	qp->r_ack_psn = qp->r_psn;
+	/* Queue NAK for later */
+	rc_defered_ack(rcd, qp);
+	return;
+nack_acc:
+	spin_unlock_irqrestore(&qp->s_lock, flags);
+	rvt_rc_error(qp, IB_WC_LOC_PROT_ERR);
+	qp->r_nak_state = IB_NAK_REMOTE_ACCESS_ERROR;
+	qp->r_ack_psn = qp->r_psn;
+send_ack:
+	hfi1_send_rc_ack(packet, is_fecn);
+}
+
+u32 hfi1_build_tid_rdma_read_resp(struct rvt_qp *qp, struct rvt_ack_entry *e,
+				  struct ib_other_headers *ohdr, u32 *bth0,
+				  u32 *bth1, u32 *bth2, u32 *len, bool *last)
+{
+	struct hfi1_ack_priv *epriv = e->priv;
+	struct tid_rdma_request *req = &epriv->tid_req;
+	struct hfi1_qp_priv *qpriv = qp->priv;
+	struct tid_rdma_flow *flow = &req->flows[req->clear_tail];
+	u32 tidentry = flow->tid_entry[flow->tid_idx];
+	u32 tidlen = EXP_TID_GET(tidentry, LEN) << PAGE_SHIFT;
+	struct tid_rdma_read_resp *resp = &ohdr->u.tid_rdma.r_rsp;
+	u32 next_offset, om = KDETH_OM_LARGE;
+	bool last_pkt;
+	u32 hdwords = 0;
+	struct tid_rdma_params *remote;
+
+	*len = min_t(u32, qp->pmtu, tidlen - flow->tid_offset);
+	flow->sent += *len;
+	next_offset = flow->tid_offset + *len;
+	last_pkt = (flow->sent >= flow->length);
+
+	trace_hfi1_tid_entry_build_read_resp(qp, flow->tid_idx, tidentry);
+	trace_hfi1_tid_flow_build_read_resp(qp, req->clear_tail, flow);
+
+	rcu_read_lock();
+	remote = rcu_dereference(qpriv->tid_rdma.remote);
+	if (!remote) {
+		rcu_read_unlock();
+		goto done;
+	}
+	KDETH_RESET(resp->kdeth0, KVER, 0x1);
+	KDETH_SET(resp->kdeth0, SH, !last_pkt);
+	KDETH_SET(resp->kdeth0, INTR, !!(!last_pkt && remote->urg));
+	KDETH_SET(resp->kdeth0, TIDCTRL, EXP_TID_GET(tidentry, CTRL));
+	KDETH_SET(resp->kdeth0, TID, EXP_TID_GET(tidentry, IDX));
+	KDETH_SET(resp->kdeth0, OM, om == KDETH_OM_LARGE);
+	KDETH_SET(resp->kdeth0, OFFSET, flow->tid_offset / om);
+	KDETH_RESET(resp->kdeth1, JKEY, remote->jkey);
+	resp->verbs_qp = cpu_to_be32(qp->remote_qpn);
+	rcu_read_unlock();
+
+	resp->aeth = rvt_compute_aeth(qp);
+	resp->verbs_psn = cpu_to_be32(mask_psn(flow->flow_state.ib_spsn +
+					       flow->pkt));
+
+	*bth0 = TID_OP(READ_RESP) << 24;
+	*bth1 = flow->tid_qpn;
+	*bth2 = mask_psn(((flow->flow_state.spsn + flow->pkt++) &
+			  HFI1_KDETH_BTH_SEQ_MASK) |
+			 (flow->flow_state.generation <<
+			  HFI1_KDETH_BTH_SEQ_SHIFT));
+	*last = last_pkt;
+	if (last_pkt)
+		/* Advance to next flow */
+		req->clear_tail = (req->clear_tail + 1) &
+				  (MAX_FLOWS - 1);
+
+	if (next_offset >= tidlen) {
+		flow->tid_offset = 0;
+		flow->tid_idx++;
+	} else {
+		flow->tid_offset = next_offset;
+	}
+
+	hdwords = sizeof(ohdr->u.tid_rdma.r_rsp) / sizeof(u32);
+
+done:
+	return hdwords;
+}
+
+static inline struct tid_rdma_request *
+find_tid_request(struct rvt_qp *qp, u32 psn, enum ib_wr_opcode opcode)
+	__must_hold(&qp->s_lock)
+{
+	struct rvt_swqe *wqe;
+	struct tid_rdma_request *req = NULL;
+	u32 i, end;
+
+	end = qp->s_cur + 1;
+	if (end == qp->s_size)
+		end = 0;
+	for (i = qp->s_acked; i != end;) {
+		wqe = rvt_get_swqe_ptr(qp, i);
+		if (cmp_psn(psn, wqe->psn) >= 0 &&
+		    cmp_psn(psn, wqe->lpsn) <= 0) {
+			if (wqe->wr.opcode == opcode)
+				req = wqe_to_tid_req(wqe);
+			break;
+		}
+		if (++i == qp->s_size)
+			i = 0;
+	}
+
+	return req;
+}
+
+void hfi1_rc_rcv_tid_rdma_read_resp(struct hfi1_packet *packet)
+{
+	/* HANDLER FOR TID RDMA READ RESPONSE packet (Requestor side */
+
+	/*
+	 * 1. Find matching SWQE
+	 * 2. Check that the entire segment has been read.
+	 * 3. Remove HFI1_S_WAIT_TID_RESP from s_flags.
+	 * 4. Free the TID flow resources.
+	 * 5. Kick the send engine (hfi1_schedule_send())
+	 */
+	struct ib_other_headers *ohdr = packet->ohdr;
+	struct rvt_qp *qp = packet->qp;
+	struct hfi1_qp_priv *priv = qp->priv;
+	struct hfi1_ctxtdata *rcd = packet->rcd;
+	struct tid_rdma_request *req;
+	struct tid_rdma_flow *flow;
+	u32 opcode, aeth;
+	bool is_fecn;
+	unsigned long flags;
+	u32 kpsn, ipsn;
+
+	trace_hfi1_sender_rcv_tid_read_resp(qp);
+	is_fecn = process_ecn(qp, packet);
+	kpsn = mask_psn(be32_to_cpu(ohdr->bth[2]));
+	aeth = be32_to_cpu(ohdr->u.tid_rdma.r_rsp.aeth);
+	opcode = (be32_to_cpu(ohdr->bth[0]) >> 24) & 0xff;
+
+	spin_lock_irqsave(&qp->s_lock, flags);
+	ipsn = mask_psn(be32_to_cpu(ohdr->u.tid_rdma.r_rsp.verbs_psn));
+	req = find_tid_request(qp, ipsn, IB_WR_TID_RDMA_READ);
+	if (unlikely(!req))
+		goto ack_op_err;
+
+	flow = &req->flows[req->clear_tail];
+	/* When header suppression is disabled */
+	if (cmp_psn(ipsn, flow->flow_state.ib_lpsn))
+		goto ack_done;
+	req->ack_pending--;
+	priv->pending_tid_r_segs--;
+	qp->s_num_rd_atomic--;
+	if ((qp->s_flags & RVT_S_WAIT_FENCE) &&
+	    !qp->s_num_rd_atomic) {
+		qp->s_flags &= ~(RVT_S_WAIT_FENCE |
+				 RVT_S_WAIT_ACK);
+		hfi1_schedule_send(qp);
+	}
+	if (qp->s_flags & RVT_S_WAIT_RDMAR) {
+		qp->s_flags &= ~(RVT_S_WAIT_RDMAR | RVT_S_WAIT_ACK);
+		hfi1_schedule_send(qp);
+	}
+
+	trace_hfi1_ack(qp, ipsn);
+	trace_hfi1_tid_req_rcv_read_resp(qp, 0, req->e.swqe->wr.opcode,
+					 req->e.swqe->psn, req->e.swqe->lpsn,
+					 req);
+	trace_hfi1_tid_flow_rcv_read_resp(qp, req->clear_tail, flow);
+
+	/* Release the tid resources */
+	hfi1_kern_exp_rcv_clear(req);
+
+	if (!do_rc_ack(qp, aeth, ipsn, opcode, 0, rcd))
+		goto ack_done;
+
+	/* If not done yet, build next read request */
+	if (++req->comp_seg >= req->total_segs) {
+		priv->tid_r_comp++;
+		req->state = TID_REQUEST_COMPLETE;
+	}
+
+	/*
+	 * Clear the hw flow under two conditions:
+	 * 1. This request is a sync point and it is complete;
+	 * 2. Current request is completed and there are no more requests.
+	 */
+	if ((req->state == TID_REQUEST_SYNC &&
+	     req->comp_seg == req->cur_seg) ||
+	    priv->tid_r_comp == priv->tid_r_reqs) {
+		hfi1_kern_clear_hw_flow(priv->rcd, qp);
+		if (req->state == TID_REQUEST_SYNC)
+			req->state = TID_REQUEST_ACTIVE;
+	}
+
+	hfi1_schedule_send(qp);
+	goto ack_done;
+
+ack_op_err:
+	/*
+	 * The test indicates that the send engine has finished its cleanup
+	 * after sending the request and it's now safe to put the QP into error
+	 * state. However, if the wqe queue is empty (qp->s_acked == qp->s_tail
+	 * == qp->s_head), it would be unsafe to complete the wqe pointed by
+	 * qp->s_acked here. Putting the qp into error state will safely flush
+	 * all remaining requests.
+	 */
+	if (qp->s_last == qp->s_acked)
+		rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR);
+
+ack_done:
+	spin_unlock_irqrestore(&qp->s_lock, flags);
+	if (is_fecn)
+		hfi1_send_rc_ack(packet, is_fecn);
+}
+
+void hfi1_kern_read_tid_flow_free(struct rvt_qp *qp)
+	__must_hold(&qp->s_lock)
+{
+	u32 n = qp->s_acked;
+	struct rvt_swqe *wqe;
+	struct tid_rdma_request *req;
+	struct hfi1_qp_priv *priv = qp->priv;
+
+	lockdep_assert_held(&qp->s_lock);
+	/* Free any TID entries */
+	while (n != qp->s_tail) {
+		wqe = rvt_get_swqe_ptr(qp, n);
+		if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) {
+			req = wqe_to_tid_req(wqe);
+			hfi1_kern_exp_rcv_clear_all(req);
+		}
+
+		if (++n == qp->s_size)
+			n = 0;
+	}
+	/* Free flow */
+	hfi1_kern_clear_hw_flow(priv->rcd, qp);
+}
+
+static bool tid_rdma_tid_err(struct hfi1_ctxtdata *rcd,
+			     struct hfi1_packet *packet, u8 rcv_type,
+			     u8 opcode)
+{
+	struct rvt_qp *qp = packet->qp;
+	u32 ipsn;
+	struct ib_other_headers *ohdr = packet->ohdr;
+
+	if (rcv_type >= RHF_RCV_TYPE_IB)
+		goto done;
+
+	spin_lock(&qp->s_lock);
+	/*
+	 * For TID READ response, error out QP after freeing the tid
+	 * resources.
+	 */
+	if (opcode == TID_OP(READ_RESP)) {
+		ipsn = mask_psn(be32_to_cpu(ohdr->u.tid_rdma.r_rsp.verbs_psn));
+		if (cmp_psn(ipsn, qp->s_last_psn) > 0 &&
+		    cmp_psn(ipsn, qp->s_psn) < 0) {
+			hfi1_kern_read_tid_flow_free(qp);
+			spin_unlock(&qp->s_lock);
+			rvt_rc_error(qp, IB_WC_LOC_QP_OP_ERR);
+			goto done;
+		}
+	}
+
+	spin_unlock(&qp->s_lock);
+done:
+	return true;
+}
+
+static void restart_tid_rdma_read_req(struct hfi1_ctxtdata *rcd,
+				      struct rvt_qp *qp, struct rvt_swqe *wqe)
+{
+	struct tid_rdma_request *req;
+	struct tid_rdma_flow *flow;
+
+	/* Start from the right segment */
+	qp->r_flags |= RVT_R_RDMAR_SEQ;
+	req = wqe_to_tid_req(wqe);
+	flow = &req->flows[req->clear_tail];
+	hfi1_restart_rc(qp, flow->flow_state.ib_spsn, 0);
+	if (list_empty(&qp->rspwait)) {
+		qp->r_flags |= RVT_R_RSP_SEND;
+		rvt_get_qp(qp);
+		list_add_tail(&qp->rspwait, &rcd->qp_wait_list);
+	}
+}
+
+/*
+ * Handle the KDETH eflags for TID RDMA READ response.
+ *
+ * Return true if the last packet for a segment has been received and it is
+ * time to process the response normally; otherwise, return true.
+ *
+ * The caller must hold the packet->qp->r_lock and the rcu_read_lock.
+ */
+static bool handle_read_kdeth_eflags(struct hfi1_ctxtdata *rcd,
+				     struct hfi1_packet *packet, u8 rcv_type,
+				     u8 rte, u32 psn, u32 ibpsn)
+	__must_hold(&packet->qp->r_lock) __must_hold(RCU)
+{
+	struct hfi1_pportdata *ppd = rcd->ppd;
+	struct hfi1_devdata *dd = ppd->dd;
+	struct hfi1_ibport *ibp;
+	struct rvt_swqe *wqe;
+	struct tid_rdma_request *req;
+	struct tid_rdma_flow *flow;
+	u32 ack_psn;
+	struct rvt_qp *qp = packet->qp;
 	struct hfi1_qp_priv *priv = qp->priv;
+	bool ret = true;
+	int diff = 0;
+	u32 fpsn;
+
+	lockdep_assert_held(&qp->r_lock);
+	/* If the psn is out of valid range, drop the packet */
+	if (cmp_psn(ibpsn, qp->s_last_psn) < 0 ||
+	    cmp_psn(ibpsn, qp->s_psn) > 0)
+		return ret;
+
+	spin_lock(&qp->s_lock);
+	/*
+	 * Note that NAKs implicitly ACK outstanding SEND and RDMA write
+	 * requests and implicitly NAK RDMA read and atomic requests issued
+	 * before the NAK'ed request.
+	 */
+	ack_psn = ibpsn - 1;
+	wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
+	ibp = to_iport(qp->ibqp.device, qp->port_num);
+
+	/* Complete WQEs that the PSN finishes. */
+	while ((int)delta_psn(ack_psn, wqe->lpsn) >= 0) {
+		/*
+		 * If this request is a RDMA read or atomic, and the NACK is
+		 * for a later operation, this NACK NAKs the RDMA read or
+		 * atomic.
+		 */
+		if (wqe->wr.opcode == IB_WR_RDMA_READ ||
+		    wqe->wr.opcode == IB_WR_TID_RDMA_READ ||
+		    wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
+		    wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) {
+			/* Retry this request. */
+			if (!(qp->r_flags & RVT_R_RDMAR_SEQ)) {
+				qp->r_flags |= RVT_R_RDMAR_SEQ;
+				if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) {
+					restart_tid_rdma_read_req(rcd, qp,
+								  wqe);
+				} else {
+					hfi1_restart_rc(qp, qp->s_last_psn + 1,
+							0);
+					if (list_empty(&qp->rspwait)) {
+						qp->r_flags |= RVT_R_RSP_SEND;
+						rvt_get_qp(qp);
+						list_add_tail(/* wait */
+						   &qp->rspwait,
+						   &rcd->qp_wait_list);
+					}
+				}
+			}
+			/*
+			 * No need to process the NAK since we are
+			 * restarting an earlier request.
+			 */
+			break;
+		}
+
+		wqe = do_rc_completion(qp, wqe, ibp);
+		if (qp->s_acked == qp->s_tail)
+			break;
+	}
+
+	/* Handle the eflags for the request */
+	if (wqe->wr.opcode != IB_WR_TID_RDMA_READ)
+		goto s_unlock;
+
+	req = wqe_to_tid_req(wqe);
+	switch (rcv_type) {
+	case RHF_RCV_TYPE_EXPECTED:
+		switch (rte) {
+		case RHF_RTE_EXPECTED_FLOW_SEQ_ERR:
+			/*
+			 * On the first occurrence of a Flow Sequence error,
+			 * the flag TID_FLOW_SW_PSN is set.
+			 *
+			 * After that, the flow is *not* reprogrammed and the
+			 * protocol falls back to SW PSN checking. This is done
+			 * to prevent continuous Flow Sequence errors for any
+			 * packets that could be still in the fabric.
+			 */
+			flow = find_flow(req, psn, NULL);
+			if (!flow) {
+				/*
+				 * We can't find the IB PSN matching the
+				 * received KDETH PSN. The only thing we can
+				 * do at this point is report the error to
+				 * the QP.
+				 */
+				hfi1_kern_read_tid_flow_free(qp);
+				spin_unlock(&qp->s_lock);
+				rvt_rc_error(qp, IB_WC_LOC_QP_OP_ERR);
+				return ret;
+			}
+			if (priv->flow_state.flags & TID_FLOW_SW_PSN) {
+				diff = cmp_psn(psn,
+					       priv->flow_state.r_next_psn);
+				if (diff > 0) {
+					if (!(qp->r_flags & RVT_R_RDMAR_SEQ))
+						restart_tid_rdma_read_req(rcd,
+									  qp,
+									  wqe);
+
+					/* Drop the packet.*/
+					goto s_unlock;
+				} else if (diff < 0) {
+					/*
+					 * If a response packet for a restarted
+					 * request has come back, reset the
+					 * restart flag.
+					 */
+					if (qp->r_flags & RVT_R_RDMAR_SEQ)
+						qp->r_flags &=
+							~RVT_R_RDMAR_SEQ;
 
-	if (qp->ibqp.qp_type == IB_QPT_RC && HFI1_CAP_IS_KSET(TID_RDMA))
-		cancel_work_sync(&priv->opfn.opfn_work);
+					/* Drop the packet.*/
+					goto s_unlock;
+				}
+
+				/*
+				 * If SW PSN verification is successful and
+				 * this is the last packet in the segment, tell
+				 * the caller to process it as a normal packet.
+				 */
+				fpsn = full_flow_psn(flow,
+						     flow->flow_state.lpsn);
+				if (cmp_psn(fpsn, psn) == 0) {
+					ret = false;
+					if (qp->r_flags & RVT_R_RDMAR_SEQ)
+						qp->r_flags &=
+							~RVT_R_RDMAR_SEQ;
+				}
+				priv->flow_state.r_next_psn++;
+			} else {
+				u64 reg;
+				u32 last_psn;
+
+				/*
+				 * The only sane way to get the amount of
+				 * progress is to read the HW flow state.
+				 */
+				reg = read_uctxt_csr(dd, rcd->ctxt,
+						     RCV_TID_FLOW_TABLE +
+						     (8 * flow->idx));
+				last_psn = mask_psn(reg);
+
+				priv->flow_state.r_next_psn = last_psn;
+				priv->flow_state.flags |= TID_FLOW_SW_PSN;
+				/*
+				 * If no request has been restarted yet,
+				 * restart the current one.
+				 */
+				if (!(qp->r_flags & RVT_R_RDMAR_SEQ))
+					restart_tid_rdma_read_req(rcd, qp,
+								  wqe);
+			}
+
+			break;
+
+		case RHF_RTE_EXPECTED_FLOW_GEN_ERR:
+			/*
+			 * Since the TID flow is able to ride through
+			 * generation mismatch, drop this stale packet.
+			 */
+			break;
+
+		default:
+			break;
+		}
+		break;
+
+	case RHF_RCV_TYPE_ERROR:
+		switch (rte) {
+		case RHF_RTE_ERROR_OP_CODE_ERR:
+		case RHF_RTE_ERROR_KHDR_MIN_LEN_ERR:
+		case RHF_RTE_ERROR_KHDR_HCRC_ERR:
+		case RHF_RTE_ERROR_KHDR_KVER_ERR:
+		case RHF_RTE_ERROR_CONTEXT_ERR:
+		case RHF_RTE_ERROR_KHDR_TID_ERR:
+		default:
+			break;
+		}
+	default:
+		break;
+	}
+s_unlock:
+	spin_unlock(&qp->s_lock);
+	return ret;
+}
+
+bool hfi1_handle_kdeth_eflags(struct hfi1_ctxtdata *rcd,
+			      struct hfi1_pportdata *ppd,
+			      struct hfi1_packet *packet)
+{
+	struct hfi1_ibport *ibp = &ppd->ibport_data;
+	struct hfi1_devdata *dd = ppd->dd;
+	struct rvt_dev_info *rdi = &dd->verbs_dev.rdi;
+	u8 rcv_type = rhf_rcv_type(packet->rhf);
+	u8 rte = rhf_rcv_type_err(packet->rhf);
+	struct ib_header *hdr = packet->hdr;
+	struct ib_other_headers *ohdr = NULL;
+	int lnh = be16_to_cpu(hdr->lrh[0]) & 3;
+	u16 lid  = be16_to_cpu(hdr->lrh[1]);
+	u8 opcode;
+	u32 qp_num, psn, ibpsn;
+	struct rvt_qp *qp;
+	unsigned long flags;
+	bool ret = true;
+
+	trace_hfi1_msg_handle_kdeth_eflags(NULL, "Kdeth error: rhf ",
+					   packet->rhf);
+	if (packet->rhf & (RHF_VCRC_ERR | RHF_ICRC_ERR))
+		return ret;
+
+	packet->ohdr = &hdr->u.oth;
+	ohdr = packet->ohdr;
+	trace_input_ibhdr(rcd->dd, packet, !!(rhf_dc_info(packet->rhf)));
+
+	/* Get the destination QP number. */
+	qp_num = be32_to_cpu(ohdr->u.tid_rdma.r_rsp.verbs_qp) &
+		RVT_QPN_MASK;
+	if (lid >= be16_to_cpu(IB_MULTICAST_LID_BASE))
+		goto drop;
+
+	psn = mask_psn(be32_to_cpu(ohdr->bth[2]));
+	opcode = (be32_to_cpu(ohdr->bth[0]) >> 24) & 0xff;
+
+	rcu_read_lock();
+	qp = rvt_lookup_qpn(rdi, &ibp->rvp, qp_num);
+	if (!qp)
+		goto rcu_unlock;
+
+	packet->qp = qp;
+
+	/* Check for valid receive state. */
+	spin_lock_irqsave(&qp->r_lock, flags);
+	if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) {
+		ibp->rvp.n_pkt_drops++;
+		goto r_unlock;
+	}
+
+	if (packet->rhf & RHF_TID_ERR) {
+		/* For TIDERR and RC QPs preemptively schedule a NAK */
+		u32 tlen = rhf_pkt_len(packet->rhf); /* in bytes */
+
+		/* Sanity check packet */
+		if (tlen < 24)
+			goto r_unlock;
+
+		/*
+		 * Check for GRH. We should never get packets with GRH in this
+		 * path.
+		 */
+		if (lnh == HFI1_LRH_GRH)
+			goto r_unlock;
+
+		if (tid_rdma_tid_err(rcd, packet, rcv_type, opcode))
+			goto r_unlock;
+	}
+
+	/* handle TID RDMA READ */
+	if (opcode == TID_OP(READ_RESP)) {
+		ibpsn = be32_to_cpu(ohdr->u.tid_rdma.r_rsp.verbs_psn);
+		ibpsn = mask_psn(ibpsn);
+		ret = handle_read_kdeth_eflags(rcd, packet, rcv_type, rte, psn,
+					       ibpsn);
+	}
+
+r_unlock:
+	spin_unlock_irqrestore(&qp->r_lock, flags);
+rcu_unlock:
+	rcu_read_unlock();
+drop:
+	return ret;
+}
+
+/*
+ * "Rewind" the TID request information.
+ * This means that we reset the state back to ACTIVE,
+ * find the proper flow, set the flow index to that flow,
+ * and reset the flow information.
+ */
+void hfi1_tid_rdma_restart_req(struct rvt_qp *qp, struct rvt_swqe *wqe,
+			       u32 *bth2)
+{
+	struct tid_rdma_request *req = wqe_to_tid_req(wqe);
+	struct tid_rdma_flow *flow;
+	int diff;
+	u32 tididx = 0;
+	u16 fidx;
+
+	if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) {
+		*bth2 = mask_psn(qp->s_psn);
+		flow = find_flow_ib(req, *bth2, &fidx);
+		if (!flow) {
+			trace_hfi1_msg_tid_restart_req(/* msg */
+			   qp, "!!!!!! Could not find flow to restart: bth2 ",
+			   (u64)*bth2);
+			trace_hfi1_tid_req_restart_req(qp, 0, wqe->wr.opcode,
+						       wqe->psn, wqe->lpsn,
+						       req);
+			return;
+		}
+	} else {
+		return;
+	}
+
+	trace_hfi1_tid_flow_restart_req(qp, fidx, flow);
+	diff = delta_psn(*bth2, flow->flow_state.ib_spsn);
+
+	flow->sent = 0;
+	flow->pkt = 0;
+	flow->tid_idx = 0;
+	flow->tid_offset = 0;
+	if (diff) {
+		for (tididx = 0; tididx < flow->tidcnt; tididx++) {
+			u32 tidentry = flow->tid_entry[tididx], tidlen,
+				tidnpkts, npkts;
+
+			flow->tid_offset = 0;
+			tidlen = EXP_TID_GET(tidentry, LEN) * PAGE_SIZE;
+			tidnpkts = rvt_div_round_up_mtu(qp, tidlen);
+			npkts = min_t(u32, diff, tidnpkts);
+			flow->pkt += npkts;
+			flow->sent += (npkts == tidnpkts ? tidlen :
+				       npkts * qp->pmtu);
+			flow->tid_offset += npkts * qp->pmtu;
+			diff -= npkts;
+			if (!diff)
+				break;
+		}
+	}
+
+	if (flow->tid_offset ==
+	    EXP_TID_GET(flow->tid_entry[tididx], LEN) * PAGE_SIZE) {
+		tididx++;
+		flow->tid_offset = 0;
+	}
+	flow->tid_idx = tididx;
+	/* Move flow_idx to correct index */
+	req->flow_idx = fidx;
+
+	trace_hfi1_tid_flow_restart_req(qp, fidx, flow);
+	trace_hfi1_tid_req_restart_req(qp, 0, wqe->wr.opcode, wqe->psn,
+				       wqe->lpsn, req);
+	req->state = TID_REQUEST_ACTIVE;
+}
+
+void hfi1_qp_kern_exp_rcv_clear_all(struct rvt_qp *qp)
+{
+	int i, ret;
+	struct hfi1_qp_priv *qpriv = qp->priv;
+	struct tid_flow_state *fs;
+
+	if (qp->ibqp.qp_type != IB_QPT_RC || !HFI1_CAP_IS_KSET(TID_RDMA))
+		return;
+
+	/*
+	 * First, clear the flow to help prevent any delayed packets from
+	 * being delivered.
+	 */
+	fs = &qpriv->flow_state;
+	if (fs->index != RXE_NUM_TID_FLOWS)
+		hfi1_kern_clear_hw_flow(qpriv->rcd, qp);
+
+	for (i = qp->s_acked; i != qp->s_head;) {
+		struct rvt_swqe *wqe = rvt_get_swqe_ptr(qp, i);
+
+		if (++i == qp->s_size)
+			i = 0;
+		/* Free only locally allocated TID entries */
+		if (wqe->wr.opcode != IB_WR_TID_RDMA_READ)
+			continue;
+		do {
+			struct hfi1_swqe_priv *priv = wqe->priv;
+
+			ret = hfi1_kern_exp_rcv_clear(&priv->tid_req);
+		} while (!ret);
+	}
+}
+
+bool hfi1_tid_rdma_wqe_interlock(struct rvt_qp *qp, struct rvt_swqe *wqe)
+{
+	struct rvt_swqe *prev;
+	struct hfi1_qp_priv *priv = qp->priv;
+	u32 s_prev;
+
+	s_prev = (qp->s_cur == 0 ? qp->s_size : qp->s_cur) - 1;
+	prev = rvt_get_swqe_ptr(qp, s_prev);
+
+	switch (wqe->wr.opcode) {
+	case IB_WR_SEND:
+	case IB_WR_SEND_WITH_IMM:
+	case IB_WR_SEND_WITH_INV:
+	case IB_WR_ATOMIC_CMP_AND_SWP:
+	case IB_WR_ATOMIC_FETCH_AND_ADD:
+	case IB_WR_RDMA_WRITE:
+	case IB_WR_RDMA_READ:
+		break;
+	case IB_WR_TID_RDMA_READ:
+		switch (prev->wr.opcode) {
+		case IB_WR_RDMA_READ:
+			if (qp->s_acked != qp->s_cur)
+				goto interlock;
+			break;
+		default:
+			break;
+		}
+	default:
+		break;
+	}
+	return false;
+
+interlock:
+	priv->s_flags |= HFI1_S_TID_WAIT_INTERLCK;
+	return true;
+}
+
+/* Does @sge meet the alignment requirements for tid rdma? */
+static inline bool hfi1_check_sge_align(struct rvt_qp *qp,
+					struct rvt_sge *sge, int num_sge)
+{
+	int i;
+
+	for (i = 0; i < num_sge; i++, sge++) {
+		trace_hfi1_sge_check_align(qp, i, sge);
+		if ((u64)sge->vaddr & ~PAGE_MASK ||
+		    sge->sge_length & ~PAGE_MASK)
+			return false;
+	}
+	return true;
+}
+
+void setup_tid_rdma_wqe(struct rvt_qp *qp, struct rvt_swqe *wqe)
+{
+	struct hfi1_qp_priv *qpriv = (struct hfi1_qp_priv *)qp->priv;
+	struct hfi1_swqe_priv *priv = wqe->priv;
+	struct tid_rdma_params *remote;
+	enum ib_wr_opcode new_opcode;
+	bool do_tid_rdma = false;
+	struct hfi1_pportdata *ppd = qpriv->rcd->ppd;
+
+	if ((rdma_ah_get_dlid(&qp->remote_ah_attr) & ~((1 << ppd->lmc) - 1)) ==
+				ppd->lid)
+		return;
+	if (qpriv->hdr_type != HFI1_PKT_TYPE_9B)
+		return;
+
+	rcu_read_lock();
+	remote = rcu_dereference(qpriv->tid_rdma.remote);
+	/*
+	 * If TID RDMA is disabled by the negotiation, don't
+	 * use it.
+	 */
+	if (!remote)
+		goto exit;
+
+	if (wqe->wr.opcode == IB_WR_RDMA_READ) {
+		if (hfi1_check_sge_align(qp, &wqe->sg_list[0],
+					 wqe->wr.num_sge)) {
+			new_opcode = IB_WR_TID_RDMA_READ;
+			do_tid_rdma = true;
+		}
+	}
+
+	if (do_tid_rdma) {
+		if (hfi1_kern_exp_rcv_alloc_flows(&priv->tid_req, GFP_ATOMIC))
+			goto exit;
+		wqe->wr.opcode = new_opcode;
+		priv->tid_req.seg_len =
+			min_t(u32, remote->max_len, wqe->length);
+		priv->tid_req.total_segs =
+			DIV_ROUND_UP(wqe->length, priv->tid_req.seg_len);
+		/* Compute the last PSN of the request */
+		wqe->lpsn = wqe->psn;
+		if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) {
+			priv->tid_req.n_flows = remote->max_read;
+			qpriv->tid_r_reqs++;
+			wqe->lpsn += rvt_div_round_up_mtu(qp, wqe->length) - 1;
+		}
+
+		priv->tid_req.cur_seg = 0;
+		priv->tid_req.comp_seg = 0;
+		priv->tid_req.ack_seg = 0;
+		priv->tid_req.state = TID_REQUEST_INACTIVE;
+		trace_hfi1_tid_req_setup_tid_wqe(qp, 1, wqe->wr.opcode,
+						 wqe->psn, wqe->lpsn,
+						 &priv->tid_req);
+	}
+exit:
+	rcu_read_unlock();
 }
diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.h b/drivers/infiniband/hw/hfi1/tid_rdma.h
index ee8151558e3f..a53598ce45b2 100644
--- a/drivers/infiniband/hw/hfi1/tid_rdma.h
+++ b/drivers/infiniband/hw/hfi1/tid_rdma.h
@@ -6,7 +6,27 @@
 #ifndef HFI1_TID_RDMA_H
 #define HFI1_TID_RDMA_H
 
+#include <linux/circ_buf.h>
+#include "common.h"
+
+/* Add a convenience helper */
+#define CIRC_ADD(val, add, size) (((val) + (add)) & ((size) - 1))
+#define CIRC_NEXT(val, size) CIRC_ADD(val, 1, size)
+#define CIRC_PREV(val, size) CIRC_ADD(val, -1, size)
+
+#define TID_RDMA_MIN_SEGMENT_SIZE       BIT(18)   /* 256 KiB (for now) */
 #define TID_RDMA_MAX_SEGMENT_SIZE       BIT(18)   /* 256 KiB (for now) */
+#define TID_RDMA_MAX_PAGES              (BIT(18) >> PAGE_SHIFT)
+
+/*
+ * Bit definitions for priv->s_flags.
+ * These bit flags overload the bit flags defined for the QP's s_flags.
+ * Due to the fact that these bit fields are used only for the QP priv
+ * s_flags, there are no collisions.
+ *
+ * HFI1_S_TID_WAIT_INTERLCK - QP is waiting for requester interlock
+ */
+#define HFI1_S_TID_WAIT_INTERLCK  BIT(5)
 
 struct tid_rdma_params {
 	struct rcu_head rcu_head;
@@ -21,10 +41,128 @@ struct tid_rdma_params {
 };
 
 struct tid_rdma_qp_params {
+	struct work_struct trigger_work;
 	struct tid_rdma_params local;
 	struct tid_rdma_params __rcu *remote;
 };
 
+/* Track state for each hardware flow */
+struct tid_flow_state {
+	u32 generation;
+	u32 psn;
+	u32 r_next_psn;      /* next PSN to be received (in TID space) */
+	u8 index;
+	u8 last_index;
+	u8 flags;
+};
+
+enum tid_rdma_req_state {
+	TID_REQUEST_INACTIVE = 0,
+	TID_REQUEST_INIT,
+	TID_REQUEST_INIT_RESEND,
+	TID_REQUEST_ACTIVE,
+	TID_REQUEST_RESEND,
+	TID_REQUEST_RESEND_ACTIVE,
+	TID_REQUEST_QUEUED,
+	TID_REQUEST_SYNC,
+	TID_REQUEST_RNR_NAK,
+	TID_REQUEST_COMPLETE,
+};
+
+struct tid_rdma_request {
+	struct rvt_qp *qp;
+	struct hfi1_ctxtdata *rcd;
+	union {
+		struct rvt_swqe *swqe;
+		struct rvt_ack_entry *ack;
+	} e;
+
+	struct tid_rdma_flow *flows;	/* array of tid flows */
+	u16 n_flows;		/* size of the flow buffer window */
+	u16 setup_head;		/* flow index we are setting up */
+	u16 clear_tail;		/* flow index we are clearing */
+	u16 flow_idx;		/* flow index most recently set up */
+
+	u32 seg_len;
+	u32 total_len;
+	u32 r_flow_psn;         /* IB PSN of next segment start */
+	u32 s_next_psn;		/* IB PSN of next segment start for read */
+
+	u32 total_segs;		/* segments required to complete a request */
+	u32 cur_seg;		/* index of current segment */
+	u32 comp_seg;           /* index of last completed segment */
+	u32 ack_seg;            /* index of last ack'ed segment */
+	u32 isge;		/* index of "current" sge */
+	u32 ack_pending;        /* num acks pending for this request */
+
+	enum tid_rdma_req_state state;
+};
+
+/*
+ * When header suppression is used, PSNs associated with a "flow" are
+ * relevant (and not the PSNs maintained by verbs). Track per-flow
+ * PSNs here for a TID RDMA segment.
+ *
+ */
+struct flow_state {
+	u32 flags;
+	u32 resp_ib_psn;     /* The IB PSN of the response for this flow */
+	u32 generation;      /* generation of flow */
+	u32 spsn;            /* starting PSN in TID space */
+	u32 lpsn;            /* last PSN in TID space */
+	u32 r_next_psn;      /* next PSN to be received (in TID space) */
+
+	/* For tid rdma read */
+	u32 ib_spsn;         /* starting PSN in Verbs space */
+	u32 ib_lpsn;         /* last PSn in Verbs space */
+};
+
+struct tid_rdma_pageset {
+	dma_addr_t addr : 48; /* Only needed for the first page */
+	u8 idx: 8;
+	u8 count : 7;
+	u8 mapped: 1;
+};
+
+/**
+ * kern_tid_node - used for managing TID's in TID groups
+ *
+ * @grp_idx: rcd relative index to tid_group
+ * @map: grp->map captured prior to programming this TID group in HW
+ * @cnt: Only @cnt of available group entries are actually programmed
+ */
+struct kern_tid_node {
+	struct tid_group *grp;
+	u8 map;
+	u8 cnt;
+};
+
+/* Overall info for a TID RDMA segment */
+struct tid_rdma_flow {
+	/*
+	 * While a TID RDMA segment is being transferred, it uses a QP number
+	 * from the "KDETH section of QP numbers" (which is different from the
+	 * QP number that originated the request). Bits 11-15 of these QP
+	 * numbers identify the "TID flow" for the segment.
+	 */
+	struct flow_state flow_state;
+	struct tid_rdma_request *req;
+	u32 tid_qpn;
+	u32 tid_offset;
+	u32 length;
+	u32 sent;
+	u8 tnode_cnt;
+	u8 tidcnt;
+	u8 tid_idx;
+	u8 idx;
+	u8 npagesets;
+	u8 npkts;
+	u8 pkt;
+	struct kern_tid_node tnode[TID_RDMA_MAX_PAGES];
+	struct tid_rdma_pageset pagesets[TID_RDMA_MAX_PAGES];
+	u32 tid_entry[TID_RDMA_MAX_PAGES];
+};
+
 bool tid_rdma_conn_req(struct rvt_qp *qp, u64 *data);
 bool tid_rdma_conn_reply(struct rvt_qp *qp, u64 data);
 bool tid_rdma_conn_resp(struct rvt_qp *qp, u64 *data);
@@ -32,9 +170,67 @@ void tid_rdma_conn_error(struct rvt_qp *qp);
 void tid_rdma_opfn_init(struct rvt_qp *qp, struct tid_rdma_params *p);
 
 int hfi1_kern_exp_rcv_init(struct hfi1_ctxtdata *rcd, int reinit);
+int hfi1_kern_exp_rcv_setup(struct tid_rdma_request *req,
+			    struct rvt_sge_state *ss, bool *last);
+int hfi1_kern_exp_rcv_clear(struct tid_rdma_request *req);
+void hfi1_kern_exp_rcv_clear_all(struct tid_rdma_request *req);
+void __trdma_clean_swqe(struct rvt_qp *qp, struct rvt_swqe *wqe);
+
+/**
+ * trdma_clean_swqe - clean flows for swqe if large send queue
+ * @qp: the qp
+ * @wqe: the send wqe
+ */
+static inline void trdma_clean_swqe(struct rvt_qp *qp, struct rvt_swqe *wqe)
+{
+	if (!wqe->priv)
+		return;
+	__trdma_clean_swqe(qp, wqe);
+}
+
+void hfi1_kern_read_tid_flow_free(struct rvt_qp *qp);
 
 int hfi1_qp_priv_init(struct rvt_dev_info *rdi, struct rvt_qp *qp,
 		      struct ib_qp_init_attr *init_attr);
 void hfi1_qp_priv_tid_free(struct rvt_dev_info *rdi, struct rvt_qp *qp);
 
+void hfi1_tid_rdma_flush_wait(struct rvt_qp *qp);
+
+int hfi1_kern_setup_hw_flow(struct hfi1_ctxtdata *rcd, struct rvt_qp *qp);
+void hfi1_kern_clear_hw_flow(struct hfi1_ctxtdata *rcd, struct rvt_qp *qp);
+void hfi1_kern_init_ctxt_generations(struct hfi1_ctxtdata *rcd);
+
+struct cntr_entry;
+u64 hfi1_access_sw_tid_wait(const struct cntr_entry *entry,
+			    void *context, int vl, int mode, u64 data);
+
+u32 hfi1_build_tid_rdma_read_packet(struct rvt_swqe *wqe,
+				    struct ib_other_headers *ohdr,
+				    u32 *bth1, u32 *bth2, u32 *len);
+u32 hfi1_build_tid_rdma_read_req(struct rvt_qp *qp, struct rvt_swqe *wqe,
+				 struct ib_other_headers *ohdr, u32 *bth1,
+				 u32 *bth2, u32 *len);
+void hfi1_rc_rcv_tid_rdma_read_req(struct hfi1_packet *packet);
+u32 hfi1_build_tid_rdma_read_resp(struct rvt_qp *qp, struct rvt_ack_entry *e,
+				  struct ib_other_headers *ohdr, u32 *bth0,
+				  u32 *bth1, u32 *bth2, u32 *len, bool *last);
+void hfi1_rc_rcv_tid_rdma_read_resp(struct hfi1_packet *packet);
+bool hfi1_handle_kdeth_eflags(struct hfi1_ctxtdata *rcd,
+			      struct hfi1_pportdata *ppd,
+			      struct hfi1_packet *packet);
+void hfi1_tid_rdma_restart_req(struct rvt_qp *qp, struct rvt_swqe *wqe,
+			       u32 *bth2);
+void hfi1_qp_kern_exp_rcv_clear_all(struct rvt_qp *qp);
+bool hfi1_tid_rdma_wqe_interlock(struct rvt_qp *qp, struct rvt_swqe *wqe);
+
+void setup_tid_rdma_wqe(struct rvt_qp *qp, struct rvt_swqe *wqe);
+static inline void hfi1_setup_tid_rdma_wqe(struct rvt_qp *qp,
+					   struct rvt_swqe *wqe)
+{
+	if (wqe->priv &&
+	    wqe->wr.opcode == IB_WR_RDMA_READ &&
+	    wqe->length >= TID_RDMA_MIN_SEGMENT_SIZE)
+		setup_tid_rdma_wqe(qp, wqe);
+}
+
 #endif /* HFI1_TID_RDMA_H */
diff --git a/drivers/infiniband/hw/hfi1/trace.c b/drivers/infiniband/hw/hfi1/trace.c
index 7c8aed0ffc07..28181d711fed 100644
--- a/drivers/infiniband/hw/hfi1/trace.c
+++ b/drivers/infiniband/hw/hfi1/trace.c
@@ -46,6 +46,7 @@
  */
 #define CREATE_TRACE_POINTS
 #include "trace.h"
+#include "exp_rcv.h"
 
 static u8 __get_ib_hdr_len(struct ib_header *hdr)
 {
@@ -128,6 +129,10 @@ const char *hfi1_trace_get_packet_l2_str(u8 l2)
 #define IETH_PRN "ieth rkey:0x%.8x"
 #define ATOMICACKETH_PRN "origdata:%llx"
 #define ATOMICETH_PRN "vaddr:0x%llx rkey:0x%.8x sdata:%llx cdata:%llx"
+#define TID_RDMA_KDETH "kdeth0 0x%x kdeth1 0x%x"
+#define TID_RDMA_KDETH_DATA "kdeth0 0x%x: kver %u sh %u intr %u tidctrl %u tid %x offset %x kdeth1 0x%x: jkey %x"
+#define TID_READ_REQ_PRN "tid_flow_psn 0x%x tid_flow_qp 0x%x verbs_qp 0x%x"
+#define TID_READ_RSP_PRN "verbs_qp 0x%x"
 
 #define OP(transport, op) IB_OPCODE_## transport ## _ ## op
 
@@ -322,6 +327,38 @@ const char *parse_everbs_hdrs(
 				 parse_syndrome(be32_to_cpu(eh->aeth) >> 24),
 				 be32_to_cpu(eh->aeth) & IB_MSN_MASK);
 		break;
+	case OP(TID_RDMA, READ_REQ):
+		trace_seq_printf(p, TID_RDMA_KDETH " " RETH_PRN " "
+				 TID_READ_REQ_PRN,
+				 le32_to_cpu(eh->tid_rdma.r_req.kdeth0),
+				 le32_to_cpu(eh->tid_rdma.r_req.kdeth1),
+				 ib_u64_get(&eh->tid_rdma.r_req.reth.vaddr),
+				 be32_to_cpu(eh->tid_rdma.r_req.reth.rkey),
+				 be32_to_cpu(eh->tid_rdma.r_req.reth.length),
+				 be32_to_cpu(eh->tid_rdma.r_req.tid_flow_psn),
+				 be32_to_cpu(eh->tid_rdma.r_req.tid_flow_qp),
+				 be32_to_cpu(eh->tid_rdma.r_req.verbs_qp));
+		break;
+	case OP(TID_RDMA, READ_RESP):
+		trace_seq_printf(p, TID_RDMA_KDETH_DATA " " AETH_PRN " "
+				 TID_READ_RSP_PRN,
+				 le32_to_cpu(eh->tid_rdma.r_rsp.kdeth0),
+				 KDETH_GET(eh->tid_rdma.r_rsp.kdeth0, KVER),
+				 KDETH_GET(eh->tid_rdma.r_rsp.kdeth0, SH),
+				 KDETH_GET(eh->tid_rdma.r_rsp.kdeth0, INTR),
+				 KDETH_GET(eh->tid_rdma.r_rsp.kdeth0, TIDCTRL),
+				 KDETH_GET(eh->tid_rdma.r_rsp.kdeth0, TID),
+				 KDETH_GET(eh->tid_rdma.r_rsp.kdeth0, OFFSET),
+				 le32_to_cpu(eh->tid_rdma.r_rsp.kdeth1),
+				 KDETH_GET(eh->tid_rdma.r_rsp.kdeth1, JKEY),
+				 be32_to_cpu(eh->tid_rdma.r_rsp.aeth) >> 24,
+				 parse_syndrome(/* aeth */
+					 be32_to_cpu(eh->tid_rdma.r_rsp.aeth)
+					 >> 24),
+				 (be32_to_cpu(eh->tid_rdma.r_rsp.aeth) &
+				  IB_MSN_MASK),
+				 be32_to_cpu(eh->tid_rdma.r_rsp.verbs_qp));
+		break;
 	/* aeth + atomicacketh */
 	case OP(RC, ATOMIC_ACKNOWLEDGE):
 		trace_seq_printf(p, AETH_PRN " " ATOMICACKETH_PRN,
@@ -394,6 +431,21 @@ const char *print_u32_array(
 	return ret;
 }
 
+u8 hfi1_trace_get_tid_ctrl(u32 ent)
+{
+	return EXP_TID_GET(ent, CTRL);
+}
+
+u16 hfi1_trace_get_tid_len(u32 ent)
+{
+	return EXP_TID_GET(ent, LEN);
+}
+
+u16 hfi1_trace_get_tid_idx(u32 ent)
+{
+	return EXP_TID_GET(ent, IDX);
+}
+
 __hfi1_trace_fn(AFFINITY);
 __hfi1_trace_fn(PKT);
 __hfi1_trace_fn(PROC);
diff --git a/drivers/infiniband/hw/hfi1/trace_ibhdrs.h b/drivers/infiniband/hw/hfi1/trace_ibhdrs.h
index 1dc2c28fc96e..1116238bf24d 100644
--- a/drivers/infiniband/hw/hfi1/trace_ibhdrs.h
+++ b/drivers/infiniband/hw/hfi1/trace_ibhdrs.h
@@ -79,6 +79,8 @@ __print_symbolic(opcode,                                   \
 	ib_opcode_name(RC_ATOMIC_ACKNOWLEDGE),             \
 	ib_opcode_name(RC_COMPARE_SWAP),                   \
 	ib_opcode_name(RC_FETCH_ADD),                      \
+	ib_opcode_name(TID_RDMA_READ_REQ),	           \
+	ib_opcode_name(TID_RDMA_READ_RESP),	           \
 	ib_opcode_name(UC_SEND_FIRST),                     \
 	ib_opcode_name(UC_SEND_MIDDLE),                    \
 	ib_opcode_name(UC_SEND_LAST),                      \
diff --git a/drivers/infiniband/hw/hfi1/trace_rc.h b/drivers/infiniband/hw/hfi1/trace_rc.h
index 8ce476570462..1ebca37862e0 100644
--- a/drivers/infiniband/hw/hfi1/trace_rc.h
+++ b/drivers/infiniband/hw/hfi1/trace_rc.h
@@ -109,6 +109,54 @@ DEFINE_EVENT(hfi1_rc_template, hfi1_rcv_error,
 	     TP_ARGS(qp, psn)
 );
 
+DEFINE_EVENT(/* event */
+	hfi1_rc_template, hfi1_rc_completion,
+	TP_PROTO(struct rvt_qp *qp, u32 psn),
+	TP_ARGS(qp, psn)
+);
+
+DECLARE_EVENT_CLASS(/* rc_ack */
+	hfi1_rc_ack_template,
+	TP_PROTO(struct rvt_qp *qp, u32 aeth, u32 psn,
+		 struct rvt_swqe *wqe),
+	TP_ARGS(qp, aeth, psn, wqe),
+	TP_STRUCT__entry(/* entry */
+		DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device))
+		__field(u32, qpn)
+		__field(u32, aeth)
+		__field(u32, psn)
+		__field(u8, opcode)
+		__field(u32, spsn)
+		__field(u32, lpsn)
+	),
+	TP_fast_assign(/* assign */
+		DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device))
+		__entry->qpn = qp->ibqp.qp_num;
+		__entry->aeth = aeth;
+		__entry->psn = psn;
+		__entry->opcode = wqe->wr.opcode;
+		__entry->spsn = wqe->psn;
+		__entry->lpsn = wqe->lpsn;
+	),
+	TP_printk(/* print */
+		"[%s] qpn 0x%x aeth 0x%x psn 0x%x opcode 0x%x spsn 0x%x lpsn 0x%x",
+		__get_str(dev),
+		__entry->qpn,
+		__entry->aeth,
+		__entry->psn,
+		__entry->opcode,
+		__entry->spsn,
+		__entry->lpsn
+	)
+);
+
+DEFINE_EVENT(/* do_rc_ack */
+	hfi1_rc_ack_template, hfi1_rc_ack_do,
+	TP_PROTO(struct rvt_qp *qp, u32 aeth, u32 psn,
+		 struct rvt_swqe *wqe),
+	TP_ARGS(qp, aeth, psn, wqe)
+);
+
 #endif /* __HFI1_TRACE_RC_H */
 
 #undef TRACE_INCLUDE_PATH
diff --git a/drivers/infiniband/hw/hfi1/trace_tid.h b/drivers/infiniband/hw/hfi1/trace_tid.h
index 57a973c97cde..b71638c22d4b 100644
--- a/drivers/infiniband/hw/hfi1/trace_tid.h
+++ b/drivers/infiniband/hw/hfi1/trace_tid.h
@@ -21,10 +21,51 @@ __print_symbolic(type,                       \
 #undef TRACE_SYSTEM
 #define TRACE_SYSTEM hfi1_tid
 
+u8 hfi1_trace_get_tid_ctrl(u32 ent);
+u16 hfi1_trace_get_tid_len(u32 ent);
+u16 hfi1_trace_get_tid_idx(u32 ent);
+
 #define OPFN_PARAM_PRN "[%s] qpn 0x%x %s OPFN: qp 0x%x, max read %u, " \
 		       "max write %u, max length %u, jkey 0x%x timeout %u " \
 		       "urg %u"
 
+#define TID_FLOW_PRN "[%s] qpn 0x%x flow %d: idx %d resp_ib_psn 0x%x " \
+		     "generation 0x%x fpsn 0x%x-%x r_next_psn 0x%x " \
+		     "ib_psn 0x%x-%x npagesets %u tnode_cnt %u " \
+		     "tidcnt %u tid_idx %u tid_offset %u length %u sent %u"
+
+#define TID_NODE_PRN "[%s] qpn 0x%x  %s idx %u grp base 0x%x map 0x%x " \
+		     "used %u cnt %u"
+
+#define RSP_INFO_PRN "[%s] qpn 0x%x state 0x%x s_state 0x%x psn 0x%x " \
+		     "r_psn 0x%x r_state 0x%x r_flags 0x%x " \
+		     "r_head_ack_queue %u s_tail_ack_queue %u " \
+		     "s_ack_state 0x%x " \
+		     "s_nak_state 0x%x s_flags 0x%x ps_flags 0x%x " \
+		     "iow_flags 0x%lx"
+
+#define SENDER_INFO_PRN "[%s] qpn 0x%x state 0x%x s_cur %u s_tail %u " \
+			"s_head %u s_acked %u s_last %u s_psn 0x%x " \
+			"s_last_psn 0x%x s_flags 0x%x ps_flags 0x%x " \
+			"iow_flags 0x%lx s_state 0x%x s_num_rd %u s_retry %u"
+
+#define TID_READ_SENDER_PRN "[%s] qpn 0x%x newreq %u tid_r_reqs %u " \
+			    "tid_r_comp %u pending_tid_r_segs %u " \
+			    "s_flags 0x%x ps_flags 0x%x iow_flags 0x%lx " \
+			    "hw_flow_index %u generation 0x%x " \
+			    "fpsn 0x%x flow_flags 0x%x"
+
+#define TID_REQ_PRN "[%s] qpn 0x%x newreq %u opcode 0x%x psn 0x%x lpsn 0x%x " \
+		    "cur_seg %u comp_seg %u ack_seg %u " \
+		    "total_segs %u setup_head %u clear_tail %u flow_idx %u " \
+		    "state %u r_flow_psn 0x%x " \
+		    "s_next_psn 0x%x"
+
+#define RCV_ERR_PRN "[%s] qpn 0x%x s_flags 0x%x state 0x%x " \
+		    "s_tail_ack_queue %u " \
+		    "r_head_ack_queue %u opcode 0x%x psn 0x%x r_psn 0x%x " \
+		    " diff %d"
+
 DECLARE_EVENT_CLASS(/* class */
 	hfi1_exp_tid_reg_unreg,
 	TP_PROTO(unsigned int ctxt, u16 subctxt, u32 rarr, u32 npages,
@@ -323,6 +364,723 @@ DEFINE_EVENT(/* event */
 	TP_ARGS(qp, msg, more)
 );
 
+DEFINE_EVENT(/* event */
+	hfi1_msg_template, hfi1_msg_alloc_tids,
+	TP_PROTO(struct rvt_qp *qp, const char *msg, u64 more),
+	TP_ARGS(qp, msg, more)
+);
+
+DEFINE_EVENT(/* event */
+	hfi1_msg_template, hfi1_msg_tid_restart_req,
+	TP_PROTO(struct rvt_qp *qp, const char *msg, u64 more),
+	TP_ARGS(qp, msg, more)
+);
+
+DEFINE_EVENT(/* event */
+	hfi1_msg_template, hfi1_msg_handle_kdeth_eflags,
+	TP_PROTO(struct rvt_qp *qp, const char *msg, u64 more),
+	TP_ARGS(qp, msg, more)
+);
+
+DECLARE_EVENT_CLASS(/* tid_flow_page */
+	hfi1_tid_flow_page_template,
+	TP_PROTO(struct rvt_qp *qp, struct tid_rdma_flow *flow, u32 index,
+		 char mtu8k, char v1, void *vaddr),
+	TP_ARGS(qp, flow, index, mtu8k, v1, vaddr),
+	TP_STRUCT__entry(/* entry */
+		DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device))
+		__field(u32, qpn)
+		__field(char, mtu8k)
+		__field(char, v1)
+		__field(u32, index)
+		__field(u64, page)
+		__field(u64, vaddr)
+	),
+	TP_fast_assign(/* assign */
+		DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device));
+		__entry->qpn = qp->ibqp.qp_num;
+		__entry->mtu8k = mtu8k;
+		__entry->v1 = v1;
+		__entry->index = index;
+		__entry->page = vaddr ? (u64)virt_to_page(vaddr) : 0ULL;
+		__entry->vaddr = (u64)vaddr;
+	),
+	TP_printk(/* print */
+		"[%s] qpn 0x%x page[%u]: page 0x%llx %s 0x%llx",
+		__get_str(dev),
+		__entry->qpn,
+		__entry->index,
+		__entry->page,
+		__entry->mtu8k ? (__entry->v1 ? "v1" : "v0") : "vaddr",
+		__entry->vaddr
+	)
+);
+
+DEFINE_EVENT(/* event */
+	hfi1_tid_flow_page_template, hfi1_tid_flow_page,
+	TP_PROTO(struct rvt_qp *qp, struct tid_rdma_flow *flow, u32 index,
+		 char mtu8k, char v1, void *vaddr),
+	TP_ARGS(qp, flow, index, mtu8k, v1, vaddr)
+);
+
+DECLARE_EVENT_CLASS(/* tid_pageset */
+	hfi1_tid_pageset_template,
+	TP_PROTO(struct rvt_qp *qp, u32 index, u16 idx, u16 count),
+	TP_ARGS(qp, index, idx, count),
+	TP_STRUCT__entry(/* entry */
+		DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device))
+		__field(u32, qpn)
+		__field(u32, index)
+		__field(u16, idx)
+		__field(u16, count)
+	),
+	TP_fast_assign(/* assign */
+		DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device));
+		__entry->qpn = qp->ibqp.qp_num;
+		__entry->index = index;
+		__entry->idx = idx;
+		__entry->count = count;
+	),
+	TP_printk(/* print */
+		"[%s] qpn 0x%x list[%u]: idx %u count %u",
+		__get_str(dev),
+		__entry->qpn,
+		__entry->index,
+		__entry->idx,
+		__entry->count
+	)
+);
+
+DEFINE_EVENT(/* event */
+	hfi1_tid_pageset_template, hfi1_tid_pageset,
+	TP_PROTO(struct rvt_qp *qp, u32 index, u16 idx, u16 count),
+	TP_ARGS(qp, index, idx, count)
+);
+
+DECLARE_EVENT_CLASS(/* tid_fow */
+	hfi1_tid_flow_template,
+	TP_PROTO(struct rvt_qp *qp, int index, struct tid_rdma_flow *flow),
+	TP_ARGS(qp, index, flow),
+	TP_STRUCT__entry(/* entry */
+		DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device))
+		__field(u32, qpn)
+		__field(int, index)
+		__field(int, idx)
+		__field(u32, resp_ib_psn)
+		__field(u32, generation)
+		__field(u32, fspsn)
+		__field(u32, flpsn)
+		__field(u32, r_next_psn)
+		__field(u32, ib_spsn)
+		__field(u32, ib_lpsn)
+		__field(u32, npagesets)
+		__field(u32, tnode_cnt)
+		__field(u32, tidcnt)
+		__field(u32, tid_idx)
+		__field(u32, tid_offset)
+		__field(u32, length)
+		__field(u32, sent)
+	),
+	TP_fast_assign(/* assign */
+		DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device));
+		__entry->qpn = qp->ibqp.qp_num;
+		__entry->index = index;
+		__entry->idx = flow->idx;
+		__entry->resp_ib_psn = flow->flow_state.resp_ib_psn;
+		__entry->generation = flow->flow_state.generation;
+		__entry->fspsn = full_flow_psn(flow,
+					       flow->flow_state.spsn);
+		__entry->flpsn = full_flow_psn(flow,
+					       flow->flow_state.lpsn);
+		__entry->r_next_psn = flow->flow_state.r_next_psn;
+		__entry->ib_spsn = flow->flow_state.ib_spsn;
+		__entry->ib_lpsn = flow->flow_state.ib_lpsn;
+		__entry->npagesets = flow->npagesets;
+		__entry->tnode_cnt = flow->tnode_cnt;
+		__entry->tidcnt = flow->tidcnt;
+		__entry->tid_idx = flow->tid_idx;
+		__entry->tid_offset =  flow->tid_offset;
+		__entry->length = flow->length;
+		__entry->sent = flow->sent;
+	),
+	TP_printk(/* print */
+		TID_FLOW_PRN,
+		__get_str(dev),
+		__entry->qpn,
+		__entry->index,
+		__entry->idx,
+		__entry->resp_ib_psn,
+		__entry->generation,
+		__entry->fspsn,
+		__entry->flpsn,
+		__entry->r_next_psn,
+		__entry->ib_spsn,
+		__entry->ib_lpsn,
+		__entry->npagesets,
+		__entry->tnode_cnt,
+		__entry->tidcnt,
+		__entry->tid_idx,
+		__entry->tid_offset,
+		__entry->length,
+		__entry->sent
+	)
+);
+
+DEFINE_EVENT(/* event */
+	hfi1_tid_flow_template, hfi1_tid_flow_alloc,
+	TP_PROTO(struct rvt_qp *qp, int index, struct tid_rdma_flow *flow),
+	TP_ARGS(qp, index, flow)
+);
+
+DEFINE_EVENT(/* event */
+	hfi1_tid_flow_template, hfi1_tid_flow_build_read_pkt,
+	TP_PROTO(struct rvt_qp *qp, int index, struct tid_rdma_flow *flow),
+	TP_ARGS(qp, index, flow)
+);
+
+DEFINE_EVENT(/* event */
+	hfi1_tid_flow_template, hfi1_tid_flow_build_read_resp,
+	TP_PROTO(struct rvt_qp *qp, int index, struct tid_rdma_flow *flow),
+	TP_ARGS(qp, index, flow)
+);
+
+DEFINE_EVENT(/* event */
+	hfi1_tid_flow_template, hfi1_tid_flow_rcv_read_req,
+	TP_PROTO(struct rvt_qp *qp, int index, struct tid_rdma_flow *flow),
+	TP_ARGS(qp, index, flow)
+);
+
+DEFINE_EVENT(/* event */
+	hfi1_tid_flow_template, hfi1_tid_flow_rcv_read_resp,
+	TP_PROTO(struct rvt_qp *qp, int index, struct tid_rdma_flow *flow),
+	TP_ARGS(qp, index, flow)
+);
+
+DEFINE_EVENT(/* event */
+	hfi1_tid_flow_template, hfi1_tid_flow_restart_req,
+	TP_PROTO(struct rvt_qp *qp, int index, struct tid_rdma_flow *flow),
+	TP_ARGS(qp, index, flow)
+);
+
+DECLARE_EVENT_CLASS(/* tid_node */
+	hfi1_tid_node_template,
+	TP_PROTO(struct rvt_qp *qp, const char *msg, u32 index, u32 base,
+		 u8 map, u8 used, u8 cnt),
+	TP_ARGS(qp, msg, index, base, map, used, cnt),
+	TP_STRUCT__entry(/* entry */
+		DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device))
+		__field(u32, qpn)
+		__string(msg, msg)
+		__field(u32, index)
+		__field(u32, base)
+		__field(u8, map)
+		__field(u8, used)
+		__field(u8, cnt)
+	),
+	TP_fast_assign(/* assign */
+		DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device));
+		__entry->qpn = qp->ibqp.qp_num;
+		__assign_str(msg, msg);
+		__entry->index = index;
+		__entry->base = base;
+		__entry->map = map;
+		__entry->used = used;
+		__entry->cnt = cnt;
+	),
+	TP_printk(/* print */
+		TID_NODE_PRN,
+		__get_str(dev),
+		__entry->qpn,
+		__get_str(msg),
+		__entry->index,
+		__entry->base,
+		__entry->map,
+		__entry->used,
+		__entry->cnt
+	)
+);
+
+DEFINE_EVENT(/* event */
+	hfi1_tid_node_template, hfi1_tid_node_add,
+	TP_PROTO(struct rvt_qp *qp, const char *msg, u32 index, u32 base,
+		 u8 map, u8 used, u8 cnt),
+	TP_ARGS(qp, msg, index, base, map, used, cnt)
+);
+
+DECLARE_EVENT_CLASS(/* tid_entry */
+	hfi1_tid_entry_template,
+	TP_PROTO(struct rvt_qp *qp, int index, u32 ent),
+	TP_ARGS(qp, index, ent),
+	TP_STRUCT__entry(/* entry */
+		DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device))
+		__field(u32, qpn)
+		__field(int, index)
+		__field(u8, ctrl)
+		__field(u16, idx)
+		__field(u16, len)
+	),
+	TP_fast_assign(/* assign */
+		DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device));
+		__entry->qpn = qp->ibqp.qp_num;
+		__entry->index = index;
+		__entry->ctrl = hfi1_trace_get_tid_ctrl(ent);
+		__entry->idx = hfi1_trace_get_tid_idx(ent);
+		__entry->len = hfi1_trace_get_tid_len(ent);
+	),
+	TP_printk(/* print */
+		"[%s] qpn 0x%x TID entry %d: idx %u len %u ctrl 0x%x",
+		__get_str(dev),
+		__entry->qpn,
+		__entry->index,
+		__entry->idx,
+		__entry->len,
+		__entry->ctrl
+	)
+);
+
+DEFINE_EVENT(/* event */
+	hfi1_tid_entry_template, hfi1_tid_entry_alloc,
+	TP_PROTO(struct rvt_qp *qp, int index, u32 entry),
+	TP_ARGS(qp, index, entry)
+);
+
+DEFINE_EVENT(/* event */
+	hfi1_tid_entry_template, hfi1_tid_entry_build_read_resp,
+	TP_PROTO(struct rvt_qp *qp, int index, u32 ent),
+	TP_ARGS(qp, index, ent)
+);
+
+DEFINE_EVENT(/* event */
+	hfi1_tid_entry_template, hfi1_tid_entry_rcv_read_req,
+	TP_PROTO(struct rvt_qp *qp, int index, u32 ent),
+	TP_ARGS(qp, index, ent)
+);
+
+DECLARE_EVENT_CLASS(/* rsp_info */
+	hfi1_responder_info_template,
+	TP_PROTO(struct rvt_qp *qp, u32 psn),
+	TP_ARGS(qp, psn),
+	TP_STRUCT__entry(/* entry */
+		DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device))
+		__field(u32, qpn)
+		__field(u8, state)
+		__field(u8, s_state)
+		__field(u32, psn)
+		__field(u32, r_psn)
+		__field(u8, r_state)
+		__field(u8, r_flags)
+		__field(u8, r_head_ack_queue)
+		__field(u8, s_tail_ack_queue)
+		__field(u8, s_ack_state)
+		__field(u8, s_nak_state)
+		__field(u8, r_nak_state)
+		__field(u32, s_flags)
+		__field(u32, ps_flags)
+		__field(unsigned long, iow_flags)
+	),
+	TP_fast_assign(/* assign */
+		struct hfi1_qp_priv *priv = qp->priv;
+
+		DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device));
+		__entry->qpn = qp->ibqp.qp_num;
+		__entry->state = qp->state;
+		__entry->s_state = qp->s_state;
+		__entry->psn = psn;
+		__entry->r_psn = qp->r_psn;
+		__entry->r_state = qp->r_state;
+		__entry->r_flags = qp->r_flags;
+		__entry->r_head_ack_queue = qp->r_head_ack_queue;
+		__entry->s_tail_ack_queue = qp->s_tail_ack_queue;
+		__entry->s_ack_state = qp->s_ack_state;
+		__entry->s_nak_state = qp->s_nak_state;
+		__entry->s_flags = qp->s_flags;
+		__entry->ps_flags = priv->s_flags;
+		__entry->iow_flags = priv->s_iowait.flags;
+	),
+	TP_printk(/* print */
+		RSP_INFO_PRN,
+		__get_str(dev),
+		__entry->qpn,
+		__entry->state,
+		__entry->s_state,
+		__entry->psn,
+		__entry->r_psn,
+		__entry->r_state,
+		__entry->r_flags,
+		__entry->r_head_ack_queue,
+		__entry->s_tail_ack_queue,
+		__entry->s_ack_state,
+		__entry->s_nak_state,
+		__entry->s_flags,
+		__entry->ps_flags,
+		__entry->iow_flags
+	)
+);
+
+DEFINE_EVENT(/* event */
+	hfi1_responder_info_template, hfi1_rsp_make_rc_ack,
+	TP_PROTO(struct rvt_qp *qp, u32 psn),
+	TP_ARGS(qp, psn)
+);
+
+DEFINE_EVENT(/* event */
+	hfi1_responder_info_template, hfi1_rsp_rcv_tid_read_req,
+	TP_PROTO(struct rvt_qp *qp, u32 psn),
+	TP_ARGS(qp, psn)
+);
+
+DEFINE_EVENT(/* event */
+	hfi1_responder_info_template, hfi1_rsp_tid_rcv_error,
+	TP_PROTO(struct rvt_qp *qp, u32 psn),
+	TP_ARGS(qp, psn)
+);
+
+DECLARE_EVENT_CLASS(/* sender_info */
+	hfi1_sender_info_template,
+	TP_PROTO(struct rvt_qp *qp),
+	TP_ARGS(qp),
+	TP_STRUCT__entry(/* entry */
+		DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device))
+		__field(u32, qpn)
+		__field(u8, state)
+		__field(u32, s_cur)
+		__field(u32, s_tail)
+		__field(u32, s_head)
+		__field(u32, s_acked)
+		__field(u32, s_last)
+		__field(u32, s_psn)
+		__field(u32, s_last_psn)
+		__field(u32, s_flags)
+		__field(u32, ps_flags)
+		__field(unsigned long, iow_flags)
+		__field(u8, s_state)
+		__field(u8, s_num_rd)
+		__field(u8, s_retry)
+	),
+	TP_fast_assign(/* assign */
+		DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device))
+		__entry->qpn = qp->ibqp.qp_num;
+		__entry->state = qp->state;
+		__entry->s_cur = qp->s_cur;
+		__entry->s_tail = qp->s_tail;
+		__entry->s_head = qp->s_head;
+		__entry->s_acked = qp->s_acked;
+		__entry->s_last = qp->s_last;
+		__entry->s_psn = qp->s_psn;
+		__entry->s_last_psn = qp->s_last_psn;
+		__entry->s_flags = qp->s_flags;
+		__entry->ps_flags = ((struct hfi1_qp_priv *)qp->priv)->s_flags;
+		__entry->iow_flags =
+			((struct hfi1_qp_priv *)qp->priv)->s_iowait.flags;
+		__entry->s_state = qp->s_state;
+		__entry->s_num_rd = qp->s_num_rd_atomic;
+		__entry->s_retry = qp->s_retry;
+	),
+	TP_printk(/* print */
+		SENDER_INFO_PRN,
+		__get_str(dev),
+		__entry->qpn,
+		__entry->state,
+		__entry->s_cur,
+		__entry->s_tail,
+		__entry->s_head,
+		__entry->s_acked,
+		__entry->s_last,
+		__entry->s_psn,
+		__entry->s_last_psn,
+		__entry->s_flags,
+		__entry->ps_flags,
+		__entry->iow_flags,
+		__entry->s_state,
+		__entry->s_num_rd,
+		__entry->s_retry
+	)
+);
+
+DEFINE_EVENT(/* event */
+	hfi1_sender_info_template, hfi1_sender_make_rc_req,
+	TP_PROTO(struct rvt_qp *qp),
+	TP_ARGS(qp)
+);
+
+DEFINE_EVENT(/* event */
+	hfi1_sender_info_template, hfi1_sender_reset_psn,
+	TP_PROTO(struct rvt_qp *qp),
+	TP_ARGS(qp)
+);
+
+DEFINE_EVENT(/* event */
+	hfi1_sender_info_template, hfi1_sender_restart_rc,
+	TP_PROTO(struct rvt_qp *qp),
+	TP_ARGS(qp)
+);
+
+DEFINE_EVENT(/* event */
+	hfi1_sender_info_template, hfi1_sender_do_rc_ack,
+	TP_PROTO(struct rvt_qp *qp),
+	TP_ARGS(qp)
+);
+
+DEFINE_EVENT(/* event */
+	hfi1_sender_info_template, hfi1_sender_rcv_tid_read_resp,
+	TP_PROTO(struct rvt_qp *qp),
+	TP_ARGS(qp)
+);
+
+DECLARE_EVENT_CLASS(/* tid_read_sender */
+	hfi1_tid_read_sender_template,
+	TP_PROTO(struct rvt_qp *qp, char newreq),
+	TP_ARGS(qp, newreq),
+	TP_STRUCT__entry(/* entry */
+		DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device))
+		__field(u32, qpn)
+		__field(char, newreq)
+		__field(u32, tid_r_reqs)
+		__field(u32, tid_r_comp)
+		__field(u32, pending_tid_r_segs)
+		__field(u32, s_flags)
+		__field(u32, ps_flags)
+		__field(unsigned long, iow_flags)
+		__field(u32, hw_flow_index)
+		__field(u32, generation)
+		__field(u32, fpsn)
+		__field(u32, flow_flags)
+	),
+	TP_fast_assign(/* assign */
+		struct hfi1_qp_priv *priv = qp->priv;
+
+		DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device));
+		__entry->qpn = qp->ibqp.qp_num;
+		__entry->newreq = newreq;
+		__entry->tid_r_reqs = priv->tid_r_reqs;
+		__entry->tid_r_comp = priv->tid_r_comp;
+		__entry->pending_tid_r_segs = priv->pending_tid_r_segs;
+		__entry->s_flags = qp->s_flags;
+		__entry->ps_flags = priv->s_flags;
+		__entry->iow_flags = priv->s_iowait.flags;
+		__entry->hw_flow_index = priv->flow_state.index;
+		__entry->generation = priv->flow_state.generation;
+		__entry->fpsn = priv->flow_state.psn;
+		__entry->flow_flags = priv->flow_state.flags;
+	),
+	TP_printk(/* print */
+		TID_READ_SENDER_PRN,
+		__get_str(dev),
+		__entry->qpn,
+		__entry->newreq,
+		__entry->tid_r_reqs,
+		__entry->tid_r_comp,
+		__entry->pending_tid_r_segs,
+		__entry->s_flags,
+		__entry->ps_flags,
+		__entry->iow_flags,
+		__entry->hw_flow_index,
+		__entry->generation,
+		__entry->fpsn,
+		__entry->flow_flags
+	)
+);
+
+DEFINE_EVENT(/* event */
+	hfi1_tid_read_sender_template, hfi1_tid_read_sender_make_req,
+	TP_PROTO(struct rvt_qp *qp, char newreq),
+	TP_ARGS(qp, newreq)
+);
+
+DECLARE_EVENT_CLASS(/* tid_rdma_request */
+	hfi1_tid_rdma_request_template,
+	TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn,
+		 struct tid_rdma_request *req),
+	TP_ARGS(qp, newreq, opcode, psn, lpsn, req),
+	TP_STRUCT__entry(/* entry */
+		DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device))
+		__field(u32, qpn)
+		__field(char, newreq)
+		__field(u8, opcode)
+		__field(u32, psn)
+		__field(u32, lpsn)
+		__field(u32, cur_seg)
+		__field(u32, comp_seg)
+		__field(u32, ack_seg)
+		__field(u32, total_segs)
+		__field(u16, setup_head)
+		__field(u16, clear_tail)
+		__field(u16, flow_idx)
+		__field(u32, state)
+		__field(u32, r_flow_psn)
+		__field(u32, s_next_psn)
+	),
+	TP_fast_assign(/* assign */
+		DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device));
+		__entry->qpn = qp->ibqp.qp_num;
+		__entry->newreq = newreq;
+		__entry->opcode = opcode;
+		__entry->psn = psn;
+		__entry->lpsn = lpsn;
+		__entry->cur_seg = req->cur_seg;
+		__entry->comp_seg = req->comp_seg;
+		__entry->ack_seg = req->ack_seg;
+		__entry->total_segs = req->total_segs;
+		__entry->setup_head = req->setup_head;
+		__entry->clear_tail = req->clear_tail;
+		__entry->flow_idx = req->flow_idx;
+		__entry->state = req->state;
+		__entry->r_flow_psn = req->r_flow_psn;
+		__entry->s_next_psn = req->s_next_psn;
+	),
+	TP_printk(/* print */
+		TID_REQ_PRN,
+		__get_str(dev),
+		__entry->qpn,
+		__entry->newreq,
+		__entry->opcode,
+		__entry->psn,
+		__entry->lpsn,
+		__entry->cur_seg,
+		__entry->comp_seg,
+		__entry->ack_seg,
+		__entry->total_segs,
+		__entry->setup_head,
+		__entry->clear_tail,
+		__entry->flow_idx,
+		__entry->state,
+		__entry->r_flow_psn,
+		__entry->s_next_psn
+	)
+);
+
+DEFINE_EVENT(/* event */
+	hfi1_tid_rdma_request_template, hfi1_tid_req_make_req_read,
+	TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn,
+		 struct tid_rdma_request *req),
+	TP_ARGS(qp, newreq, opcode, psn, lpsn, req)
+);
+
+DEFINE_EVENT(/* event */
+	hfi1_tid_rdma_request_template, hfi1_tid_req_build_read_req,
+	TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn,
+		 struct tid_rdma_request *req),
+	TP_ARGS(qp, newreq, opcode, psn, lpsn, req)
+);
+
+DEFINE_EVENT(/* event */
+	hfi1_tid_rdma_request_template, hfi1_tid_req_rcv_read_req,
+	TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn,
+		 struct tid_rdma_request *req),
+	TP_ARGS(qp, newreq, opcode, psn, lpsn, req)
+);
+
+DEFINE_EVENT(/* event */
+	hfi1_tid_rdma_request_template, hfi1_tid_req_rcv_read_resp,
+	TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn,
+		 struct tid_rdma_request *req),
+	TP_ARGS(qp, newreq, opcode, psn, lpsn, req)
+);
+
+DEFINE_EVENT(/* event */
+	hfi1_tid_rdma_request_template, hfi1_tid_req_rcv_err,
+	TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn,
+		 struct tid_rdma_request *req),
+	TP_ARGS(qp, newreq, opcode, psn, lpsn, req)
+);
+
+DEFINE_EVENT(/* event */
+	hfi1_tid_rdma_request_template, hfi1_tid_req_restart_req,
+	TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn,
+		 struct tid_rdma_request *req),
+	TP_ARGS(qp, newreq, opcode, psn, lpsn, req)
+);
+
+DEFINE_EVENT(/* event */
+	hfi1_tid_rdma_request_template, hfi1_tid_req_setup_tid_wqe,
+	TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn,
+		 struct tid_rdma_request *req),
+	TP_ARGS(qp, newreq, opcode, psn, lpsn, req)
+);
+
+DECLARE_EVENT_CLASS(/* rc_rcv_err */
+	hfi1_rc_rcv_err_template,
+	TP_PROTO(struct rvt_qp *qp, u32 opcode, u32 psn, int diff),
+	TP_ARGS(qp, opcode, psn, diff),
+	TP_STRUCT__entry(/* entry */
+		DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device))
+		__field(u32, qpn)
+		__field(u32, s_flags)
+		__field(u8, state)
+		__field(u8, s_tail_ack_queue)
+		__field(u8, r_head_ack_queue)
+		__field(u32, opcode)
+		__field(u32, psn)
+		__field(u32, r_psn)
+		__field(int, diff)
+	),
+	TP_fast_assign(/* assign */
+		DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device))
+		__entry->qpn = qp->ibqp.qp_num;
+		__entry->s_flags = qp->s_flags;
+		__entry->state = qp->state;
+		__entry->s_tail_ack_queue = qp->s_tail_ack_queue;
+		__entry->r_head_ack_queue = qp->r_head_ack_queue;
+		__entry->opcode = opcode;
+		__entry->psn = psn;
+		__entry->r_psn = qp->r_psn;
+		__entry->diff = diff;
+	),
+	TP_printk(/* print */
+		RCV_ERR_PRN,
+		__get_str(dev),
+		__entry->qpn,
+		__entry->s_flags,
+		__entry->state,
+		__entry->s_tail_ack_queue,
+		__entry->r_head_ack_queue,
+		__entry->opcode,
+		__entry->psn,
+		__entry->r_psn,
+		__entry->diff
+	)
+);
+
+DEFINE_EVENT(/* event */
+	hfi1_rc_rcv_err_template, hfi1_tid_rdma_rcv_err,
+	TP_PROTO(struct rvt_qp *qp, u32 opcode, u32 psn, int diff),
+	TP_ARGS(qp, opcode, psn, diff)
+);
+
+DECLARE_EVENT_CLASS(/* sge  */
+	hfi1_sge_template,
+	TP_PROTO(struct rvt_qp *qp, int index, struct rvt_sge *sge),
+	TP_ARGS(qp, index, sge),
+	TP_STRUCT__entry(/* entry */
+		DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device))
+		__field(u32, qpn)
+		__field(int, index)
+		__field(u64, vaddr)
+		__field(u32, sge_length)
+	),
+	TP_fast_assign(/* assign */
+		DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device));
+		__entry->qpn = qp->ibqp.qp_num;
+		__entry->index = index;
+		__entry->vaddr = (u64)sge->vaddr;
+		__entry->sge_length = sge->sge_length;
+	),
+	TP_printk(/* print */
+		"[%s] qpn 0x%x sge %d: vaddr 0x%llx sge_length %u",
+		__get_str(dev),
+		__entry->qpn,
+		__entry->index,
+		__entry->vaddr,
+		__entry->sge_length
+	)
+);
+
+DEFINE_EVENT(/* event */
+	hfi1_sge_template, hfi1_sge_check_align,
+	TP_PROTO(struct rvt_qp *qp, int index, struct rvt_sge *sge),
+	TP_ARGS(qp, index, sge)
+);
+
 #endif /* __HFI1_TRACE_TID_H */
 
 #undef TRACE_INCLUDE_PATH
diff --git a/drivers/infiniband/hw/hfi1/trace_tx.h b/drivers/infiniband/hw/hfi1/trace_tx.h
index c57af3b31fe1..37dbb3e599c3 100644
--- a/drivers/infiniband/hw/hfi1/trace_tx.h
+++ b/drivers/infiniband/hw/hfi1/trace_tx.h
@@ -114,19 +114,27 @@ DECLARE_EVENT_CLASS(hfi1_qpsleepwakeup_template,
 		    __field(u32, qpn)
 		    __field(u32, flags)
 		    __field(u32, s_flags)
+		    __field(u32, ps_flags)
+		    __field(unsigned long, iow_flags)
 		    ),
 		    TP_fast_assign(
 		    DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device))
 		    __entry->flags = flags;
 		    __entry->qpn = qp->ibqp.qp_num;
 		    __entry->s_flags = qp->s_flags;
+		    __entry->ps_flags =
+			((struct hfi1_qp_priv *)qp->priv)->s_flags;
+		    __entry->iow_flags =
+			((struct hfi1_qp_priv *)qp->priv)->s_iowait.flags;
 		    ),
 		    TP_printk(
-		    "[%s] qpn 0x%x flags 0x%x s_flags 0x%x",
+		    "[%s] qpn 0x%x flags 0x%x s_flags 0x%x ps_flags 0x%x iow_flags 0x%lx",
 		    __get_str(dev),
 		    __entry->qpn,
 		    __entry->flags,
-		    __entry->s_flags
+		    __entry->s_flags,
+		    __entry->ps_flags,
+		    __entry->iow_flags
 		    )
 );
 
diff --git a/drivers/infiniband/hw/hfi1/user_exp_rcv.h b/drivers/infiniband/hw/hfi1/user_exp_rcv.h
index e383cc01a2bf..43b105de1d54 100644
--- a/drivers/infiniband/hw/hfi1/user_exp_rcv.h
+++ b/drivers/infiniband/hw/hfi1/user_exp_rcv.h
@@ -48,7 +48,6 @@
  */
 
 #include "hfi.h"
-
 #include "exp_rcv.h"
 
 struct tid_pageset {
diff --git a/drivers/infiniband/hw/hfi1/verbs.c b/drivers/infiniband/hw/hfi1/verbs.c
index 571bfd549c2a..88676ca79fda 100644
--- a/drivers/infiniband/hw/hfi1/verbs.c
+++ b/drivers/infiniband/hw/hfi1/verbs.c
@@ -165,6 +165,7 @@ const enum ib_wc_opcode ib_hfi1_wc_opcode[] = {
 	[IB_WR_SEND] = IB_WC_SEND,
 	[IB_WR_SEND_WITH_IMM] = IB_WC_SEND,
 	[IB_WR_RDMA_READ] = IB_WC_RDMA_READ,
+	[IB_WR_TID_RDMA_READ] = IB_WC_RDMA_READ,
 	[IB_WR_ATOMIC_CMP_AND_SWP] = IB_WC_COMP_SWAP,
 	[IB_WR_ATOMIC_FETCH_AND_ADD] = IB_WC_FETCH_ADD,
 	[IB_WR_SEND_WITH_INV] = IB_WC_SEND,
@@ -200,6 +201,8 @@ const u8 hdr_len_by_opcode[256] = {
 	[IB_OPCODE_RC_FETCH_ADD]                      = 12 + 8 + 28,
 	[IB_OPCODE_RC_SEND_LAST_WITH_INVALIDATE]      = 12 + 8 + 4,
 	[IB_OPCODE_RC_SEND_ONLY_WITH_INVALIDATE]      = 12 + 8 + 4,
+	[IB_OPCODE_TID_RDMA_READ_REQ]                 = 12 + 8 + 36,
+	[IB_OPCODE_TID_RDMA_READ_RESP]                = 12 + 8 + 36,
 	/* UC */
 	[IB_OPCODE_UC_SEND_FIRST]                     = 12 + 8,
 	[IB_OPCODE_UC_SEND_MIDDLE]                    = 12 + 8,
@@ -243,6 +246,11 @@ static const opcode_handler opcode_handler_tbl[256] = {
 	[IB_OPCODE_RC_FETCH_ADD]                      = &hfi1_rc_rcv,
 	[IB_OPCODE_RC_SEND_LAST_WITH_INVALIDATE]      = &hfi1_rc_rcv,
 	[IB_OPCODE_RC_SEND_ONLY_WITH_INVALIDATE]      = &hfi1_rc_rcv,
+
+	/* TID RDMA has separate handlers for different opcodes.*/
+	[IB_OPCODE_TID_RDMA_READ_REQ]        = &hfi1_rc_rcv_tid_rdma_read_req,
+	[IB_OPCODE_TID_RDMA_READ_RESP]       = &hfi1_rc_rcv_tid_rdma_read_resp,
+
 	/* UC */
 	[IB_OPCODE_UC_SEND_FIRST]                     = &hfi1_uc_rcv,
 	[IB_OPCODE_UC_SEND_MIDDLE]                    = &hfi1_uc_rcv,
@@ -308,7 +316,7 @@ static inline opcode_handler qp_ok(struct hfi1_packet *packet)
 static u64 hfi1_fault_tx(struct rvt_qp *qp, u8 opcode, u64 pbc)
 {
 #ifdef CONFIG_FAULT_INJECTION
-	if ((opcode & IB_OPCODE_MSP) == IB_OPCODE_MSP)
+	if ((opcode & IB_OPCODE_MSP) == IB_OPCODE_MSP) {
 		/*
 		 * In order to drop non-IB traffic we
 		 * set PbcInsertHrc to NONE (0x2).
@@ -319,8 +327,9 @@ static u64 hfi1_fault_tx(struct rvt_qp *qp, u8 opcode, u64 pbc)
 		 * packet will not be delivered to the
 		 * correct context.
 		 */
+		pbc &= ~PBC_INSERT_HCRC_SMASK;
 		pbc |= (u64)PBC_IHCRC_NONE << PBC_INSERT_HCRC_SHIFT;
-	else
+	} else {
 		/*
 		 * In order to drop regular verbs
 		 * traffic we set the PbcTestEbp
@@ -330,10 +339,129 @@ static u64 hfi1_fault_tx(struct rvt_qp *qp, u8 opcode, u64 pbc)
 		 * triggered and will be dropped.
 		 */
 		pbc |= PBC_TEST_EBP;
+	}
 #endif
 	return pbc;
 }
 
+static opcode_handler tid_qp_ok(int opcode, struct hfi1_packet *packet)
+{
+	if (packet->qp->ibqp.qp_type != IB_QPT_RC ||
+	    !(ib_rvt_state_ops[packet->qp->state] & RVT_PROCESS_RECV_OK))
+		return NULL;
+	if ((opcode & RVT_OPCODE_QP_MASK) == IB_OPCODE_TID_RDMA)
+		return opcode_handler_tbl[opcode];
+	return NULL;
+}
+
+void hfi1_kdeth_eager_rcv(struct hfi1_packet *packet)
+{
+	struct hfi1_ctxtdata *rcd = packet->rcd;
+	struct ib_header *hdr = packet->hdr;
+	u32 tlen = packet->tlen;
+	struct hfi1_pportdata *ppd = rcd->ppd;
+	struct hfi1_ibport *ibp = &ppd->ibport_data;
+	struct rvt_dev_info *rdi = &ppd->dd->verbs_dev.rdi;
+	opcode_handler opcode_handler;
+	unsigned long flags;
+	u32 qp_num;
+	int lnh;
+	u8 opcode;
+
+	/* DW == LRH (2) + BTH (3) + KDETH (9) + CRC (1) */
+	if (unlikely(tlen < 15 * sizeof(u32)))
+		goto drop;
+
+	lnh = be16_to_cpu(hdr->lrh[0]) & 3;
+	if (lnh != HFI1_LRH_BTH)
+		goto drop;
+
+	packet->ohdr = &hdr->u.oth;
+	trace_input_ibhdr(rcd->dd, packet, !!(rhf_dc_info(packet->rhf)));
+
+	opcode = (be32_to_cpu(packet->ohdr->bth[0]) >> 24);
+	inc_opstats(tlen, &rcd->opstats->stats[opcode]);
+
+	/* verbs_qp can be picked up from any tid_rdma header struct */
+	qp_num = be32_to_cpu(packet->ohdr->u.tid_rdma.r_req.verbs_qp) &
+		RVT_QPN_MASK;
+
+	rcu_read_lock();
+	packet->qp = rvt_lookup_qpn(rdi, &ibp->rvp, qp_num);
+	if (!packet->qp)
+		goto drop_rcu;
+	spin_lock_irqsave(&packet->qp->r_lock, flags);
+	opcode_handler = tid_qp_ok(opcode, packet);
+	if (likely(opcode_handler))
+		opcode_handler(packet);
+	else
+		goto drop_unlock;
+	spin_unlock_irqrestore(&packet->qp->r_lock, flags);
+	rcu_read_unlock();
+
+	return;
+drop_unlock:
+	spin_unlock_irqrestore(&packet->qp->r_lock, flags);
+drop_rcu:
+	rcu_read_unlock();
+drop:
+	ibp->rvp.n_pkt_drops++;
+}
+
+void hfi1_kdeth_expected_rcv(struct hfi1_packet *packet)
+{
+	struct hfi1_ctxtdata *rcd = packet->rcd;
+	struct ib_header *hdr = packet->hdr;
+	u32 tlen = packet->tlen;
+	struct hfi1_pportdata *ppd = rcd->ppd;
+	struct hfi1_ibport *ibp = &ppd->ibport_data;
+	struct rvt_dev_info *rdi = &ppd->dd->verbs_dev.rdi;
+	opcode_handler opcode_handler;
+	unsigned long flags;
+	u32 qp_num;
+	int lnh;
+	u8 opcode;
+
+	/* DW == LRH (2) + BTH (3) + KDETH (9) + CRC (1) */
+	if (unlikely(tlen < 15 * sizeof(u32)))
+		goto drop;
+
+	lnh = be16_to_cpu(hdr->lrh[0]) & 3;
+	if (lnh != HFI1_LRH_BTH)
+		goto drop;
+
+	packet->ohdr = &hdr->u.oth;
+	trace_input_ibhdr(rcd->dd, packet, !!(rhf_dc_info(packet->rhf)));
+
+	opcode = (be32_to_cpu(packet->ohdr->bth[0]) >> 24);
+	inc_opstats(tlen, &rcd->opstats->stats[opcode]);
+
+	/* verbs_qp can be picked up from any tid_rdma header struct */
+	qp_num = be32_to_cpu(packet->ohdr->u.tid_rdma.r_rsp.verbs_qp) &
+		RVT_QPN_MASK;
+
+	rcu_read_lock();
+	packet->qp = rvt_lookup_qpn(rdi, &ibp->rvp, qp_num);
+	if (!packet->qp)
+		goto drop_rcu;
+	spin_lock_irqsave(&packet->qp->r_lock, flags);
+	opcode_handler = tid_qp_ok(opcode, packet);
+	if (likely(opcode_handler))
+		opcode_handler(packet);
+	else
+		goto drop_unlock;
+	spin_unlock_irqrestore(&packet->qp->r_lock, flags);
+	rcu_read_unlock();
+
+	return;
+drop_unlock:
+	spin_unlock_irqrestore(&packet->qp->r_lock, flags);
+drop_rcu:
+	rcu_read_unlock();
+drop:
+	ibp->rvp.n_pkt_drops++;
+}
+
 static int hfi1_do_pkey_check(struct hfi1_packet *packet)
 {
 	struct hfi1_ctxtdata *rcd = packet->rcd;
@@ -504,11 +632,28 @@ static void verbs_sdma_complete(
 	hfi1_put_txreq(tx);
 }
 
+void hfi1_wait_kmem(struct rvt_qp *qp)
+{
+	struct hfi1_qp_priv *priv = qp->priv;
+	struct ib_qp *ibqp = &qp->ibqp;
+	struct ib_device *ibdev = ibqp->device;
+	struct hfi1_ibdev *dev = to_idev(ibdev);
+
+	if (list_empty(&priv->s_iowait.list)) {
+		if (list_empty(&dev->memwait))
+			mod_timer(&dev->mem_timer, jiffies + 1);
+		qp->s_flags |= RVT_S_WAIT_KMEM;
+		list_add_tail(&priv->s_iowait.list, &dev->memwait);
+		priv->s_iowait.lock = &dev->iowait_lock;
+		trace_hfi1_qpsleep(qp, RVT_S_WAIT_KMEM);
+		rvt_get_qp(qp);
+	}
+}
+
 static int wait_kmem(struct hfi1_ibdev *dev,
 		     struct rvt_qp *qp,
 		     struct hfi1_pkt_state *ps)
 {
-	struct hfi1_qp_priv *priv = qp->priv;
 	unsigned long flags;
 	int ret = 0;
 
@@ -517,15 +662,7 @@ static int wait_kmem(struct hfi1_ibdev *dev,
 		write_seqlock(&dev->iowait_lock);
 		list_add_tail(&ps->s_txreq->txreq.list,
 			      &ps->wait->tx_head);
-		if (list_empty(&priv->s_iowait.list)) {
-			if (list_empty(&dev->memwait))
-				mod_timer(&dev->mem_timer, jiffies + 1);
-			qp->s_flags |= RVT_S_WAIT_KMEM;
-			list_add_tail(&priv->s_iowait.list, &dev->memwait);
-			priv->s_iowait.lock = &dev->iowait_lock;
-			trace_hfi1_qpsleep(qp, RVT_S_WAIT_KMEM);
-			rvt_get_qp(qp);
-		}
+		hfi1_wait_kmem(qp);
 		write_sequnlock(&dev->iowait_lock);
 		hfi1_qp_unbusy(qp, ps->wait);
 		ret = -EBUSY;
@@ -674,6 +811,15 @@ bail_txadd:
 	return ret;
 }
 
+static u64 update_hcrc(u8 opcode, u64 pbc)
+{
+	if ((opcode & IB_OPCODE_TID_RDMA) == IB_OPCODE_TID_RDMA) {
+		pbc &= ~PBC_INSERT_HCRC_SMASK;
+		pbc |= (u64)PBC_IHCRC_LKDETH << PBC_INSERT_HCRC_SHIFT;
+	}
+	return pbc;
+}
+
 int hfi1_verbs_send_dma(struct rvt_qp *qp, struct hfi1_pkt_state *ps,
 			u64 pbc)
 {
@@ -719,6 +865,9 @@ int hfi1_verbs_send_dma(struct rvt_qp *qp, struct hfi1_pkt_state *ps,
 					 qp->srate_mbps,
 					 vl,
 					 plen);
+
+			/* Update HCRC based on packet opcode */
+			pbc = update_hcrc(ps->opcode, pbc);
 		}
 		tx->wqe = qp->s_wqe;
 		ret = build_verbs_tx_desc(tx->sde, len, tx, ahg_info, pbc);
@@ -867,6 +1016,9 @@ int hfi1_verbs_send_pio(struct rvt_qp *qp, struct hfi1_pkt_state *ps,
 		if (unlikely(hfi1_dbg_should_fault_tx(qp, ps->opcode)))
 			pbc = hfi1_fault_tx(qp, ps->opcode, pbc);
 		pbc = create_pbc(ppd, pbc, qp->srate_mbps, vl, plen);
+
+		/* Update HCRC based on packet opcode */
+		pbc = update_hcrc(ps->opcode, pbc);
 	}
 	if (cb)
 		iowait_pio_inc(&priv->s_iowait);
diff --git a/drivers/infiniband/hw/hfi1/verbs.h b/drivers/infiniband/hw/hfi1/verbs.h
index c8baa1e38ff6..841727a684d5 100644
--- a/drivers/infiniband/hw/hfi1/verbs.h
+++ b/drivers/infiniband/hw/hfi1/verbs.h
@@ -159,17 +159,38 @@ struct hfi1_qp_priv {
 	struct sdma_engine *s_sde;                /* current sde */
 	struct send_context *s_sendcontext;       /* current sendcontext */
 	struct hfi1_ctxtdata *rcd;                /* QP's receive context */
+	struct page **pages;                      /* for TID page scan */
+	u32 tid_enqueue;                          /* saved when tid waited */
 	u8 s_sc;		                  /* SC[0..4] for next packet */
 	struct iowait s_iowait;
+	struct list_head tid_wait;                /* for queueing tid space */
 	struct hfi1_opfn_data opfn;
+	struct tid_flow_state flow_state;
 	struct tid_rdma_qp_params tid_rdma;
 	struct rvt_qp *owner;
 	u8 hdr_type; /* 9B or 16B */
 	unsigned long tid_timer_timeout_jiffies;
+
+	/* variables for the TID RDMA SE state machine */
+	u32 s_flags;
+
+	/* For TID RDMA READ */
+	u32 tid_r_reqs;         /* Num of tid reads requested */
+	u32 tid_r_comp;         /* Num of tid reads completed */
+	u32 pending_tid_r_segs; /* Num of pending tid read segments */
 	u16 pkts_ps;            /* packets per segment */
 	u8 timeout_shift;       /* account for number of packets per segment */
 };
 
+struct hfi1_swqe_priv {
+	struct tid_rdma_request tid_req;
+	struct rvt_sge_state ss;  /* Used for TID RDMA READ Request */
+};
+
+struct hfi1_ack_priv {
+	struct tid_rdma_request tid_req;
+};
+
 /*
  * This structure is used to hold commonly lookedup and computed values during
  * the send engine progress.
@@ -231,6 +252,7 @@ struct hfi1_ibdev {
 	struct kmem_cache *verbs_txreq_cache;
 	u64 n_txwait;
 	u64 n_kmem_wait;
+	u64 n_tidwait;
 
 	/* protect iowait lists */
 	seqlock_t iowait_lock ____cacheline_aligned_in_smp;
@@ -318,6 +340,31 @@ static inline u32 delta_psn(u32 a, u32 b)
 	return (((int)a - (int)b) << PSN_SHIFT) >> PSN_SHIFT;
 }
 
+static inline struct tid_rdma_request *wqe_to_tid_req(struct rvt_swqe *wqe)
+{
+	return &((struct hfi1_swqe_priv *)wqe->priv)->tid_req;
+}
+
+static inline struct tid_rdma_request *ack_to_tid_req(struct rvt_ack_entry *e)
+{
+	return &((struct hfi1_ack_priv *)e->priv)->tid_req;
+}
+
+/*
+ * Look through all the active flows for a TID RDMA request and find
+ * the one (if it exists) that contains the specified PSN.
+ */
+static inline u32 __full_flow_psn(struct flow_state *state, u32 psn)
+{
+	return mask_psn((state->generation << HFI1_KDETH_BTH_SEQ_SHIFT) |
+			(psn & HFI1_KDETH_BTH_SEQ_MASK));
+}
+
+static inline u32 full_flow_psn(struct tid_rdma_flow *flow, u32 psn)
+{
+	return __full_flow_psn(&flow->flow_state, psn);
+}
+
 struct verbs_txreq;
 void hfi1_put_txreq(struct verbs_txreq *tx);
 
@@ -383,6 +430,10 @@ int hfi1_register_ib_device(struct hfi1_devdata *);
 
 void hfi1_unregister_ib_device(struct hfi1_devdata *);
 
+void hfi1_kdeth_eager_rcv(struct hfi1_packet *packet);
+
+void hfi1_kdeth_expected_rcv(struct hfi1_packet *packet);
+
 void hfi1_ib_rcv(struct hfi1_packet *packet);
 
 void hfi1_16B_rcv(struct hfi1_packet *packet);
@@ -400,6 +451,16 @@ static inline bool opa_bth_is_migration(struct ib_other_headers *ohdr)
 	return ohdr->bth[1] & cpu_to_be32(OPA_BTH_MIG_REQ);
 }
 
+void hfi1_wait_kmem(struct rvt_qp *qp);
+
+static inline void hfi1_trdma_send_complete(struct rvt_qp *qp,
+					    struct rvt_swqe *wqe,
+					    enum ib_wc_status status)
+{
+	trdma_clean_swqe(qp, wqe);
+	rvt_send_complete(qp, wqe, status);
+}
+
 extern const enum ib_wc_opcode ib_hfi1_wc_opcode[];
 
 extern const u8 hdr_len_by_opcode[];
diff --git a/drivers/infiniband/hw/qib/qib_rc.c b/drivers/infiniband/hw/qib/qib_rc.c
index 6fa002940451..50dd9811b088 100644
--- a/drivers/infiniband/hw/qib/qib_rc.c
+++ b/drivers/infiniband/hw/qib/qib_rc.c
@@ -45,12 +45,7 @@ static u32 restart_sge(struct rvt_sge_state *ss, struct rvt_swqe *wqe,
 	u32 len;
 
 	len = ((psn - wqe->psn) & QIB_PSN_MASK) * pmtu;
-	ss->sge = wqe->sg_list[0];
-	ss->sg_list = wqe->sg_list + 1;
-	ss->num_sge = wqe->wr.num_sge;
-	ss->total_len = wqe->length;
-	rvt_skip_sge(ss, len, false);
-	return wqe->length - len;
+	return rvt_restart_sge(ss, wqe, len);
 }
 
 /**
diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c
index 16247d2a671d..2769ebdf89fb 100644
--- a/drivers/infiniband/sw/rdmavt/qp.c
+++ b/drivers/infiniband/sw/rdmavt/qp.c
@@ -1642,11 +1642,11 @@ int rvt_destroy_qp(struct ib_qp *ibqp)
 		kref_put(&qp->ip->ref, rvt_release_mmap_info);
 	else
 		vfree(qp->r_rq.wq);
-	vfree(qp->s_wq);
 	rdi->driver_f.qp_priv_free(rdi, qp);
 	kfree(qp->s_ack_queue);
 	rdma_destroy_ah_attr(&qp->remote_ah_attr);
 	rdma_destroy_ah_attr(&qp->alt_ah_attr);
+	vfree(qp->s_wq);
 	kfree(qp);
 	return 0;
 }
@@ -2393,11 +2393,12 @@ static inline unsigned long rvt_aeth_to_usec(u32 aeth)
 }
 
 /*
- *  rvt_add_retry_timer - add/start a retry timer
+ *  rvt_add_retry_timer_ext - add/start a retry timer
  *  @qp - the QP
+ *  @shift - timeout shift to wait for multiple packets
  *  add a retry timer on the QP
  */
-void rvt_add_retry_timer(struct rvt_qp *qp)
+void rvt_add_retry_timer_ext(struct rvt_qp *qp, u8 shift)
 {
 	struct ib_qp *ibqp = &qp->ibqp;
 	struct rvt_dev_info *rdi = ib_to_rvt(ibqp->device);
@@ -2405,11 +2406,11 @@ void rvt_add_retry_timer(struct rvt_qp *qp)
 	lockdep_assert_held(&qp->s_lock);
 	qp->s_flags |= RVT_S_TIMER;
        /* 4.096 usec. * (1 << qp->timeout) */
-	qp->s_timer.expires = jiffies + qp->timeout_jiffies +
-			     rdi->busy_jiffies;
+	qp->s_timer.expires = jiffies + rdi->busy_jiffies +
+			      (qp->timeout_jiffies << shift);
 	add_timer(&qp->s_timer);
 }
-EXPORT_SYMBOL(rvt_add_retry_timer);
+EXPORT_SYMBOL(rvt_add_retry_timer_ext);
 
 /**
  * rvt_add_rnr_timer - add/start an rnr timer
diff --git a/drivers/infiniband/sw/rdmavt/rc.c b/drivers/infiniband/sw/rdmavt/rc.c
index 6131cc558bdb..8d71647820a8 100644
--- a/drivers/infiniband/sw/rdmavt/rc.c
+++ b/drivers/infiniband/sw/rdmavt/rc.c
@@ -187,3 +187,16 @@ void rvt_get_credit(struct rvt_qp *qp, u32 aeth)
 	}
 }
 EXPORT_SYMBOL(rvt_get_credit);
+
+/* rvt_restart_sge - rewind the sge state for a wqe */
+u32 rvt_restart_sge(struct rvt_sge_state *ss, struct rvt_swqe *wqe, u32 len)
+{
+	ss->sge = wqe->sg_list[0];
+	ss->sg_list = wqe->sg_list + 1;
+	ss->num_sge = wqe->wr.num_sge;
+	ss->total_len = wqe->length;
+	rvt_skip_sge(ss, len, false);
+	return wqe->length - len;
+}
+EXPORT_SYMBOL(rvt_restart_sge);
+
diff --git a/include/rdma/ib_hdrs.h b/include/rdma/ib_hdrs.h
index 6e35416170a3..58a0a0f99e7f 100644
--- a/include/rdma/ib_hdrs.h
+++ b/include/rdma/ib_hdrs.h
@@ -1,5 +1,5 @@
 /*
- * Copyright(c) 2016 Intel Corporation.
+ * Copyright(c) 2016 - 2018 Intel Corporation.
  *
  * This file is provided under a dual BSD/GPLv2 license.  When using or
  * redistributing this file, you may do so under either license.
@@ -100,6 +100,8 @@ struct ib_atomic_eth {
 	__be64 compare_data; /* potentially unaligned */
 } __packed;
 
+#include <rdma/tid_rdma_defs.h>
+
 union ib_ehdrs {
 	struct {
 		__be32 deth[2];
@@ -117,6 +119,11 @@ union ib_ehdrs {
 	__be32 aeth;
 	__be32 ieth;
 	struct ib_atomic_eth atomic_eth;
+	/* TID RDMA headers */
+	union {
+		struct tid_rdma_read_req r_req;
+		struct tid_rdma_read_resp r_rsp;
+	} tid_rdma;
 }  __packed;
 
 struct ib_other_headers {
diff --git a/include/rdma/rdma_vt.h b/include/rdma/rdma_vt.h
index 168e40be183c..87d66c9630d7 100644
--- a/include/rdma/rdma_vt.h
+++ b/include/rdma/rdma_vt.h
@@ -574,9 +574,10 @@ static inline struct rvt_qp *rvt_lookup_qpn(struct rvt_dev_info *rdi,
 /**
  * rvt_mod_retry_timer - mod a retry timer
  * @qp - the QP
+ * @shift - timeout shift to wait for multiple packets
  * Modify a potentially already running retry timer
  */
-static inline void rvt_mod_retry_timer(struct rvt_qp *qp)
+static inline void rvt_mod_retry_timer_ext(struct rvt_qp *qp, u8 shift)
 {
 	struct ib_qp *ibqp = &qp->ibqp;
 	struct rvt_dev_info *rdi = ib_to_rvt(ibqp->device);
@@ -584,8 +585,13 @@ static inline void rvt_mod_retry_timer(struct rvt_qp *qp)
 	lockdep_assert_held(&qp->s_lock);
 	qp->s_flags |= RVT_S_TIMER;
 	/* 4.096 usec. * (1 << qp->timeout) */
-	mod_timer(&qp->s_timer, jiffies + qp->timeout_jiffies +
-		  rdi->busy_jiffies);
+	mod_timer(&qp->s_timer, jiffies + rdi->busy_jiffies +
+		  (qp->timeout_jiffies << shift));
+}
+
+static inline void rvt_mod_retry_timer(struct rvt_qp *qp)
+{
+	return rvt_mod_retry_timer_ext(qp, 0);
 }
 
 struct rvt_dev_info *rvt_alloc_device(size_t size, int nports);
diff --git a/include/rdma/rdmavt_qp.h b/include/rdma/rdmavt_qp.h
index cbafb1878669..d8d88d023092 100644
--- a/include/rdma/rdmavt_qp.h
+++ b/include/rdma/rdmavt_qp.h
@@ -174,6 +174,7 @@ struct rvt_swqe {
 	u32 lpsn;               /* last packet sequence number */
 	u32 ssn;                /* send sequence number */
 	u32 length;             /* total length of data in sg_list */
+	void *priv;             /* driver dependent field */
 	struct rvt_sge sg_list[0];
 };
 
@@ -235,6 +236,7 @@ struct rvt_ack_entry {
 	u32 lpsn;
 	u8 opcode;
 	u8 sent;
+	void *priv;
 };
 
 #define	RC_QP_SCALING_INTERVAL	5
@@ -629,6 +631,16 @@ __be32 rvt_compute_aeth(struct rvt_qp *qp);
 void rvt_get_credit(struct rvt_qp *qp, u32 aeth);
 
 /**
+ * rvt_restart_sge - rewind the sge state for a wqe
+ * @ss: the sge state pointer
+ * @wqe: the wqe to rewind
+ * @len: the data length from the start of the wqe in bytes
+ *
+ * Returns the remaining data length.
+ */
+u32 rvt_restart_sge(struct rvt_sge_state *ss, struct rvt_swqe *wqe, u32 len);
+
+/**
  * @qp - the qp pair
  * @len - the length
  *
@@ -676,7 +688,11 @@ enum hrtimer_restart rvt_rc_rnr_retry(struct hrtimer *t);
 void rvt_add_rnr_timer(struct rvt_qp *qp, u32 aeth);
 void rvt_del_timers_sync(struct rvt_qp *qp);
 void rvt_stop_rc_timers(struct rvt_qp *qp);
-void rvt_add_retry_timer(struct rvt_qp *qp);
+void rvt_add_retry_timer_ext(struct rvt_qp *qp, u8 shift);
+static inline void rvt_add_retry_timer(struct rvt_qp *qp)
+{
+	rvt_add_retry_timer_ext(qp, 0);
+}
 
 void rvt_copy_sge(struct rvt_qp *qp, struct rvt_sge_state *ss,
 		  void *data, u32 length,
diff --git a/include/rdma/tid_rdma_defs.h b/include/rdma/tid_rdma_defs.h
new file mode 100644
index 000000000000..1c431ea32b52
--- /dev/null
+++ b/include/rdma/tid_rdma_defs.h
@@ -0,0 +1,52 @@
+/* SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) */
+/*
+ * Copyright(c) 2018 Intel Corporation.
+ *
+ */
+
+#ifndef TID_RDMA_DEFS_H
+#define TID_RDMA_DEFS_H
+
+#include <rdma/ib_pack.h>
+
+struct tid_rdma_read_req {
+	__le32 kdeth0;
+	__le32 kdeth1;
+	struct ib_reth reth;
+	__be32 tid_flow_psn;
+	__be32 tid_flow_qp;
+	__be32 verbs_qp;
+};
+
+struct tid_rdma_read_resp {
+	__le32 kdeth0;
+	__le32 kdeth1;
+	__be32 aeth;
+	__be32 reserved[4];
+	__be32 verbs_psn;
+	__be32 verbs_qp;
+};
+
+/*
+ * TID RDMA Opcodes
+ */
+#define IB_OPCODE_TID_RDMA 0xe0
+enum {
+	IB_OPCODE_READ_REQ        = 0x4,
+	IB_OPCODE_READ_RESP       = 0x5,
+
+	IB_OPCODE(TID_RDMA, READ_REQ),
+	IB_OPCODE(TID_RDMA, READ_RESP),
+};
+
+#define TID_OP(x) IB_OPCODE_TID_RDMA_##x
+
+/*
+ * Define TID RDMA specific WR opcodes. The ib_wr_opcode
+ * enum already provides some reserved values for use by
+ * low level drivers. Two of those are used but renamed
+ * to be more descriptive.
+ */
+#define IB_WR_TID_RDMA_READ  IB_WR_RESERVED2
+
+#endif /* TID_RDMA_DEFS_H */
author	Doug Ledford <dledford@redhat.com>	2019-02-05 17:59:43 -0500
committer	Doug Ledford <dledford@redhat.com>	2019-02-05 17:59:43 -0500
commit	a2f3bde88174e5c8edf7d6f1b3403a9214439d50 (patch)
tree	2b516e3b90cf2ec0eb03b160a8b498486599b848
parent	Merge branch 'opfn' into hfi1-tid (diff)
parent	IB/hfi1: Add static trace for TID RDMA READ protocol (diff)
download	linux-dev-a2f3bde88174e5c8edf7d6f1b3403a9214439d50.tar.xz linux-dev-a2f3bde88174e5c8edf7d6f1b3403a9214439d50.zip