aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/infiniband/hw/hfi1
diff options
context:
space:
mode:
authorKaike Wan <kaike.wan@intel.com>2019-01-23 21:51:07 -0800
committerDoug Ledford <dledford@redhat.com>2019-02-05 18:07:44 -0500
commit70dcb2e3dc6aa827d74e09c830ea06c660274880 (patch)
tree0f813d35bcf93a49e6a511cc44531fcb8b2e3c42 /drivers/infiniband/hw/hfi1
parentIB/hfi1: Resend the TID RDMA WRITE DATA packets (diff)
downloadlinux-dev-70dcb2e3dc6aa827d74e09c830ea06c660274880.tar.xz
linux-dev-70dcb2e3dc6aa827d74e09c830ea06c660274880.zip
IB/hfi1: Add the TID second leg send packet builder
To improve performance, the TID RDMA WRITE protocol is designed to own a second leg to send data and ack packets in the KDETH PSN space. This patch adds the packet builder for the requester side, which contains the state machine to build TID RDMA WRITE DATA and TID RDMA RESYNC packet. Reviewed-by: Mike Marciniszyn <mike.marciniszyn@intel.com> Signed-off-by: Mitko Haralanov <mitko.haralanov@intel.com> Signed-off-by: Kaike Wan <kaike.wan@intel.com> Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com> Signed-off-by: Doug Ledford <dledford@redhat.com>
Diffstat (limited to 'drivers/infiniband/hw/hfi1')
-rw-r--r--drivers/infiniband/hw/hfi1/tid_rdma.c211
-rw-r--r--drivers/infiniband/hw/hfi1/tid_rdma.h7
-rw-r--r--drivers/infiniband/hw/hfi1/verbs.h2
3 files changed, 220 insertions, 0 deletions
diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.c b/drivers/infiniband/hw/hfi1/tid_rdma.c
index cb6321b0d2c9..44c5c0010888 100644
--- a/drivers/infiniband/hw/hfi1/tid_rdma.c
+++ b/drivers/infiniband/hw/hfi1/tid_rdma.c
@@ -331,6 +331,7 @@ int hfi1_qp_priv_init(struct rvt_dev_info *rdi, struct rvt_qp *qp,
qpriv->r_tid_tail = HFI1_QP_WQE_INVALID;
qpriv->r_tid_ack = HFI1_QP_WQE_INVALID;
qpriv->r_tid_alloc = HFI1_QP_WQE_INVALID;
+ atomic_set(&qpriv->n_requests, 0);
atomic_set(&qpriv->n_tid_requests, 0);
timer_setup(&qpriv->s_tid_timer, hfi1_tid_timeout, 0);
timer_setup(&qpriv->s_tid_retry_timer, hfi1_tid_retry_timeout, 0);
@@ -4803,3 +4804,213 @@ void hfi1_rc_rcv_tid_rdma_resync(struct hfi1_packet *packet)
bail:
spin_unlock_irqrestore(&qp->s_lock, flags);
}
+
+/*
+ * Call this function when the last TID RDMA WRITE DATA packet for a request
+ * is built.
+ */
+static void update_tid_tail(struct rvt_qp *qp)
+ __must_hold(&qp->s_lock)
+{
+ struct hfi1_qp_priv *priv = qp->priv;
+ u32 i;
+ struct rvt_swqe *wqe;
+
+ lockdep_assert_held(&qp->s_lock);
+ /* Can't move beyond s_tid_cur */
+ if (priv->s_tid_tail == priv->s_tid_cur)
+ return;
+ for (i = priv->s_tid_tail + 1; ; i++) {
+ if (i == qp->s_size)
+ i = 0;
+
+ if (i == priv->s_tid_cur)
+ break;
+ wqe = rvt_get_swqe_ptr(qp, i);
+ if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE)
+ break;
+ }
+ priv->s_tid_tail = i;
+ priv->s_state = TID_OP(WRITE_RESP);
+}
+
+int hfi1_make_tid_rdma_pkt(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
+ __must_hold(&qp->s_lock)
+{
+ struct hfi1_qp_priv *priv = qp->priv;
+ struct rvt_swqe *wqe;
+ u32 bth1 = 0, bth2 = 0, hwords = 5, len, middle = 0;
+ struct ib_other_headers *ohdr;
+ struct rvt_sge_state *ss = &qp->s_sge;
+ struct rvt_ack_entry *e = &qp->s_ack_queue[qp->s_tail_ack_queue];
+ struct tid_rdma_request *req = ack_to_tid_req(e);
+ bool last = false;
+ u8 opcode = TID_OP(WRITE_DATA);
+
+ lockdep_assert_held(&qp->s_lock);
+ /*
+ * Prioritize the sending of the requests and responses over the
+ * sending of the TID RDMA data packets.
+ */
+ if (((atomic_read(&priv->n_tid_requests) < HFI1_TID_RDMA_WRITE_CNT) &&
+ atomic_read(&priv->n_requests) &&
+ !(qp->s_flags & (RVT_S_BUSY | RVT_S_WAIT_ACK |
+ HFI1_S_ANY_WAIT_IO))) ||
+ (e->opcode == TID_OP(WRITE_REQ) && req->cur_seg < req->alloc_seg &&
+ !(qp->s_flags & (RVT_S_BUSY | HFI1_S_ANY_WAIT_IO)))) {
+ struct iowait_work *iowork;
+
+ iowork = iowait_get_ib_work(&priv->s_iowait);
+ ps->s_txreq = get_waiting_verbs_txreq(iowork);
+ if (ps->s_txreq || hfi1_make_rc_req(qp, ps)) {
+ priv->s_flags |= HFI1_S_TID_BUSY_SET;
+ return 1;
+ }
+ }
+
+ ps->s_txreq = get_txreq(ps->dev, qp);
+ if (!ps->s_txreq)
+ goto bail_no_tx;
+
+ ohdr = &ps->s_txreq->phdr.hdr.ibh.u.oth;
+
+ if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_SEND_OK)) {
+ if (!(ib_rvt_state_ops[qp->state] & RVT_FLUSH_SEND))
+ goto bail;
+ /* We are in the error state, flush the work request. */
+ if (qp->s_last == READ_ONCE(qp->s_head))
+ goto bail;
+ /* If DMAs are in progress, we can't flush immediately. */
+ if (iowait_sdma_pending(&priv->s_iowait)) {
+ qp->s_flags |= RVT_S_WAIT_DMA;
+ goto bail;
+ }
+ clear_ahg(qp);
+ wqe = rvt_get_swqe_ptr(qp, qp->s_last);
+ hfi1_trdma_send_complete(qp, wqe, qp->s_last != qp->s_acked ?
+ IB_WC_SUCCESS : IB_WC_WR_FLUSH_ERR);
+ /* will get called again */
+ goto done_free_tx;
+ }
+
+ if (priv->s_flags & RVT_S_WAIT_ACK)
+ goto bail;
+
+ /* Check whether there is anything to do. */
+ if (priv->s_tid_tail == HFI1_QP_WQE_INVALID)
+ goto bail;
+ wqe = rvt_get_swqe_ptr(qp, priv->s_tid_tail);
+ req = wqe_to_tid_req(wqe);
+ switch (priv->s_state) {
+ case TID_OP(WRITE_REQ):
+ case TID_OP(WRITE_RESP):
+ priv->tid_ss.sge = wqe->sg_list[0];
+ priv->tid_ss.sg_list = wqe->sg_list + 1;
+ priv->tid_ss.num_sge = wqe->wr.num_sge;
+ priv->tid_ss.total_len = wqe->length;
+
+ if (priv->s_state == TID_OP(WRITE_REQ))
+ hfi1_tid_rdma_restart_req(qp, wqe, &bth2);
+ priv->s_state = TID_OP(WRITE_DATA);
+ /* fall through */
+
+ case TID_OP(WRITE_DATA):
+ /*
+ * 1. Check whether TID RDMA WRITE RESP available.
+ * 2. If no:
+ * 2.1 If have more segments and no TID RDMA WRITE RESP,
+ * set HFI1_S_WAIT_TID_RESP
+ * 2.2 Return indicating no progress made.
+ * 3. If yes:
+ * 3.1 Build TID RDMA WRITE DATA packet.
+ * 3.2 If last packet in segment:
+ * 3.2.1 Change KDETH header bits
+ * 3.2.2 Advance RESP pointers.
+ * 3.3 Return indicating progress made.
+ */
+ wqe = rvt_get_swqe_ptr(qp, priv->s_tid_tail);
+ req = wqe_to_tid_req(wqe);
+ len = wqe->length;
+
+ if (!req->comp_seg || req->cur_seg == req->comp_seg)
+ goto bail;
+
+ last = hfi1_build_tid_rdma_packet(wqe, ohdr, &bth1, &bth2,
+ &len);
+
+ if (last) {
+ /* move pointer to next flow */
+ req->clear_tail = CIRC_NEXT(req->clear_tail,
+ MAX_FLOWS);
+ if (++req->cur_seg < req->total_segs) {
+ if (!CIRC_CNT(req->setup_head, req->clear_tail,
+ MAX_FLOWS))
+ qp->s_flags |= HFI1_S_WAIT_TID_RESP;
+ } else {
+ priv->s_state = TID_OP(WRITE_DATA_LAST);
+ opcode = TID_OP(WRITE_DATA_LAST);
+
+ /* Advance the s_tid_tail now */
+ update_tid_tail(qp);
+ }
+ }
+ hwords += sizeof(ohdr->u.tid_rdma.w_data) / sizeof(u32);
+ ss = &priv->tid_ss;
+ break;
+
+ case TID_OP(RESYNC):
+ /* Use generation from the most recently received response */
+ wqe = rvt_get_swqe_ptr(qp, priv->s_tid_cur);
+ req = wqe_to_tid_req(wqe);
+ /* If no responses for this WQE look at the previous one */
+ if (!req->comp_seg) {
+ wqe = rvt_get_swqe_ptr(qp,
+ (!priv->s_tid_cur ? qp->s_size :
+ priv->s_tid_cur) - 1);
+ req = wqe_to_tid_req(wqe);
+ }
+ hwords += hfi1_build_tid_rdma_resync(qp, wqe, ohdr, &bth1,
+ &bth2,
+ CIRC_PREV(req->setup_head,
+ MAX_FLOWS));
+ ss = NULL;
+ len = 0;
+ opcode = TID_OP(RESYNC);
+ break;
+
+ default:
+ goto bail;
+ }
+ if (priv->s_flags & RVT_S_SEND_ONE) {
+ priv->s_flags &= ~RVT_S_SEND_ONE;
+ priv->s_flags |= RVT_S_WAIT_ACK;
+ bth2 |= IB_BTH_REQ_ACK;
+ }
+ qp->s_len -= len;
+ ps->s_txreq->hdr_dwords = hwords;
+ ps->s_txreq->sde = priv->s_sde;
+ ps->s_txreq->ss = ss;
+ ps->s_txreq->s_cur_size = len;
+ hfi1_make_ruc_header(qp, ohdr, (opcode << 24), bth1, bth2,
+ middle, ps);
+ return 1;
+done_free_tx:
+ hfi1_put_txreq(ps->s_txreq);
+ ps->s_txreq = NULL;
+ return 1;
+
+bail:
+ hfi1_put_txreq(ps->s_txreq);
+bail_no_tx:
+ ps->s_txreq = NULL;
+ priv->s_flags &= ~RVT_S_BUSY;
+ /*
+ * If we didn't get a txreq, the QP will be woken up later to try
+ * again, set the flags to the the wake up which work item to wake
+ * up.
+ * (A better algorithm should be found to do this and generalize the
+ * sleep/wakeup flags.)
+ */
+ iowait_set_flag(&priv->s_iowait, IOWAIT_PENDING_TID);
+ return 0;
+}
diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.h b/drivers/infiniband/hw/hfi1/tid_rdma.h
index bdcf18455d9d..0ce0ef6d60f2 100644
--- a/drivers/infiniband/hw/hfi1/tid_rdma.h
+++ b/drivers/infiniband/hw/hfi1/tid_rdma.h
@@ -26,9 +26,13 @@
*
* HFI1_S_TID_WAIT_INTERLCK - QP is waiting for requester interlock
*/
+#define HFI1_S_TID_BUSY_SET BIT(0)
+/* BIT(1) reserved for RVT_S_BUSY. */
#define HFI1_R_TID_RSC_TIMER BIT(2)
+/* BIT(3) reserved for RVT_S_RESP_PENDING. */
/* BIT(4) reserved for RVT_S_ACK_PENDING. */
#define HFI1_S_TID_WAIT_INTERLCK BIT(5)
+/* BIT(7) - BIT(15) reserved for RVT_S_WAIT_*. */
#define HFI1_S_TID_RETRY_TIMER BIT(17)
#define HFI1_R_TID_SW_PSN BIT(19)
@@ -298,4 +302,7 @@ u32 hfi1_build_tid_rdma_resync(struct rvt_qp *qp, struct rvt_swqe *wqe,
void hfi1_rc_rcv_tid_rdma_resync(struct hfi1_packet *packet);
+struct hfi1_pkt_state;
+int hfi1_make_tid_rdma_pkt(struct rvt_qp *qp, struct hfi1_pkt_state *ps);
+
#endif /* HFI1_TID_RDMA_H */
diff --git a/drivers/infiniband/hw/hfi1/verbs.h b/drivers/infiniband/hw/hfi1/verbs.h
index ce40ea9f43c3..3e45149d22c4 100644
--- a/drivers/infiniband/hw/hfi1/verbs.h
+++ b/drivers/infiniband/hw/hfi1/verbs.h
@@ -172,6 +172,8 @@ struct hfi1_qp_priv {
struct rvt_qp *owner;
u8 hdr_type; /* 9B or 16B */
struct rvt_sge_state tid_ss; /* SGE state pointer for 2nd leg */
+ atomic_t n_requests; /* # of TID RDMA requests in the */
+ /* queue */
atomic_t n_tid_requests; /* # of sent TID RDMA requests */
unsigned long tid_timer_timeout_jiffies;
unsigned long tid_retry_timeout_jiffies;