// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause /* Authors: Bernard Metzler */ /* Copyright (c) 2008-2019, IBM Corporation */ #include #include #include #include #include #include #include #include "siw.h" #include "siw_verbs.h" #include "siw_mem.h" /* * siw_rx_umem() * * Receive data of @len into target referenced by @dest_addr. * * @srx: Receive Context * @umem: siw representation of target memory * @dest_addr: user virtual address * @len: number of bytes to place */ static int siw_rx_umem(struct siw_rx_stream *srx, struct siw_umem *umem, u64 dest_addr, int len) { int copied = 0; while (len) { struct page *p; int pg_off, bytes, rv; void *dest; p = siw_get_upage(umem, dest_addr); if (unlikely(!p)) { pr_warn("siw: %s: [QP %u]: bogus addr: %p, %p\n", __func__, qp_id(rx_qp(srx)), (void *)dest_addr, (void *)umem->fp_addr); /* siw internal error */ srx->skb_copied += copied; srx->skb_new -= copied; return -EFAULT; } pg_off = dest_addr & ~PAGE_MASK; bytes = min(len, (int)PAGE_SIZE - pg_off); siw_dbg_qp(rx_qp(srx), "page %p, bytes=%u\n", p, bytes); dest = kmap_atomic(p); rv = skb_copy_bits(srx->skb, srx->skb_offset, dest + pg_off, bytes); if (unlikely(rv)) { kunmap_atomic(dest); srx->skb_copied += copied; srx->skb_new -= copied; pr_warn("siw: [QP %u]: %s, len %d, page %p, rv %d\n", qp_id(rx_qp(srx)), __func__, len, p, rv); return -EFAULT; } if (srx->mpa_crc_hd) { if (rx_qp(srx)->kernel_verbs) { crypto_shash_update(srx->mpa_crc_hd, (u8 *)(dest + pg_off), bytes); kunmap_atomic(dest); } else { kunmap_atomic(dest); /* * Do CRC on original, not target buffer. * Some user land applications may * concurrently write the target buffer, * which would yield a broken CRC. * Walking the skb twice is very ineffcient. * Folding the CRC into skb_copy_bits() * would be much better, but is currently * not supported. 
*/ siw_crc_skb(srx, bytes); } } else { kunmap_atomic(dest); } srx->skb_offset += bytes; copied += bytes; len -= bytes; dest_addr += bytes; pg_off = 0; } srx->skb_copied += copied; srx->skb_new -= copied; return copied; } static int siw_rx_kva(struct siw_rx_stream *srx, void *kva, int len) { int rv; siw_dbg_qp(rx_qp(srx), "kva: 0x%p, len: %u\n", kva, len); rv = skb_copy_bits(srx->skb, srx->skb_offset, kva, len); if (unlikely(rv)) { pr_warn("siw: [QP %u]: %s, len %d, kva 0x%p, rv %d\n", qp_id(rx_qp(srx)), __func__, len, kva, rv); return rv; } if (srx->mpa_crc_hd) crypto_shash_update(srx->mpa_crc_hd, (u8 *)kva, len); srx->skb_offset += len; srx->skb_copied += len; srx->skb_new -= len; return len; } static int siw_rx_pbl(struct siw_rx_stream *srx, int *pbl_idx, struct siw_mem *mem, u64 addr, int len) { struct siw_pbl *pbl = mem->pbl; u64 offset = addr - mem->va; int copied = 0; while (len) { int bytes; u64 buf_addr = siw_pbl_get_buffer(pbl, offset, &bytes, pbl_idx); if (!buf_addr) break; bytes = min(bytes, len); if (siw_rx_kva(srx, (void *)buf_addr, bytes) == bytes) { copied += bytes; offset += bytes; len -= bytes; } else { break; } } return copied; } /* * siw_rresp_check_ntoh() * * Check incoming RRESP fragment header against expected * header values and update expected values for potential next * fragment. * * NOTE: This function must be called only if a RRESP DDP segment * starts but not for fragmented consecutive pieces of an * already started DDP segment. 
*/
static int siw_rresp_check_ntoh(struct siw_rx_stream *srx,
				struct siw_rx_fpdu *frx)
{
	struct siw_wqe *rd_wqe = &frx->wqe_active;
	struct iwarp_rdma_rresp *hdr = &srx->hdr.rresp;
	u64 wire_to = be64_to_cpu(hdr->sink_to);
	u32 wire_stag = be32_to_cpu(hdr->sink_stag);
	enum ddp_ecode term_code;

	/*
	 * At the first segment of a response, the expected STag and
	 * target offset are taken from the first SGE of the active
	 * work queue element.
	 */
	if (frx->first_ddp_seg) {
		srx->ddp_stag = rd_wqe->sqe.sge[0].lkey;
		srx->ddp_to = rd_wqe->sqe.sge[0].laddr;
		frx->pbl_idx = 0;
	}
	/*
	 * The checks below go beyond plain DDP semantics and reach
	 * into RDMAP: the read response must match exactly the read
	 * request sent to the peer. RFC5040/5041 do not provide a
	 * dedicated error code for each mismatch, so 'base or bounds
	 * error' is reported when the inbound STag is valid but offset
	 * or length disagree with the local receive state.
	 */
	if (unlikely(wire_stag != srx->ddp_stag)) {
		pr_warn("siw: [QP %u]: rresp stag: %08x != %08x\n",
			qp_id(rx_qp(srx)), wire_stag, srx->ddp_stag);
		term_code = DDP_ECODE_T_INVALID_STAG;
	} else if (unlikely(wire_to != srx->ddp_to)) {
		pr_warn("siw: [QP %u]: rresp off: %016llx != %016llx\n",
			qp_id(rx_qp(srx)), (unsigned long long)wire_to,
			(unsigned long long)srx->ddp_to);
		term_code = DDP_ECODE_T_BASE_BOUNDS;
	} else if (unlikely(!frx->more_ddp_segs &&
			    (rd_wqe->processed + srx->fpdu_part_rem !=
			     rd_wqe->bytes))) {
		/* last segment must complete the requested length */
		pr_warn("siw: [QP %u]: rresp len: %d != %d\n",
			qp_id(rx_qp(srx)),
			rd_wqe->processed + srx->fpdu_part_rem,
			rd_wqe->bytes);
		term_code = DDP_ECODE_T_BASE_BOUNDS;
	} else {
		return 0;
	}

	siw_init_terminate(rx_qp(srx), TERM_ERROR_LAYER_DDP,
			   DDP_ETYPE_TAGGED_BUF, term_code, 0);
	return -EINVAL;
}

/*
 * siw_write_check_ntoh()
 *
 * Check incoming WRITE fragment header against expected
 * header values and update expected values for potential next
 * fragment
 *
 * NOTE: This function must be called only if a WRITE DDP segment
 *       starts but not for fragmented consecutive pieces of an
 *       already started DDP segment.
*/
static int siw_write_check_ntoh(struct siw_rx_stream *srx,
				struct siw_rx_fpdu *frx)
{
	struct iwarp_rdma_write *hdr = &srx->hdr.rwrite;
	u64 wire_to = be64_to_cpu(hdr->sink_to);
	u32 wire_stag = be32_to_cpu(hdr->sink_stag);
	enum ddp_ecode term_code;

	/*
	 * The first segment defines STag and target offset of the
	 * message; there is nothing to compare against yet.
	 */
	if (frx->first_ddp_seg) {
		srx->ddp_stag = wire_stag;
		srx->ddp_to = wire_to;
		frx->pbl_idx = 0;

		return 0;
	}
	/* Any later segment must continue where the last one ended. */
	if (unlikely(wire_stag != srx->ddp_stag)) {
		pr_warn("siw: [QP %u]: write stag: %08x != %08x\n",
			qp_id(rx_qp(srx)), wire_stag, srx->ddp_stag);
		term_code = DDP_ECODE_T_INVALID_STAG;
	} else if (unlikely(wire_to != srx->ddp_to)) {
		pr_warn("siw: [QP %u]: write off: %016llx != %016llx\n",
			qp_id(rx_qp(srx)), (unsigned long long)wire_to,
			(unsigned long long)srx->ddp_to);
		term_code = DDP_ECODE_T_BASE_BOUNDS;
	} else {
		return 0;
	}

	siw_init_terminate(rx_qp(srx), TERM_ERROR_LAYER_DDP,
			   DDP_ETYPE_TAGGED_BUF, term_code, 0);
	return -EINVAL;
}

/*
 * siw_send_check_ntoh()
 *
 * Check incoming SEND fragment header against expected
 * header values and update expected MSN if no next
 * fragment expected
 *
 * NOTE: This function must be called only if a SEND DDP segment
 *       starts but not for fragmented consecutive pieces of an
 *       already started DDP segment.
*/
/*
 * Returns 0 if the header is acceptable. Otherwise a TERMINATE message
 * of DDP layer, untagged buffer type, is initialized with the matching
 * error code and -EINVAL is returned.
 */
static int siw_send_check_ntoh(struct siw_rx_stream *srx,
			       struct siw_rx_fpdu *frx)
{
	struct iwarp_send_inv *send = &srx->hdr.send_inv;
	struct siw_wqe *wqe = &frx->wqe_active;
	enum ddp_ecode ecode;

	/* header fields arrive in network byte order */
	u32 ddp_msn = be32_to_cpu(send->ddp_msn);
	u32 ddp_mo = be32_to_cpu(send->ddp_mo);
	u32 ddp_qn = be32_to_cpu(send->ddp_qn);

	if (unlikely(ddp_qn != RDMAP_UNTAGGED_QN_SEND)) {
		pr_warn("siw: [QP %u]: invalid ddp qn %d for send\n",
			qp_id(rx_qp(srx)), ddp_qn);
		ecode = DDP_ECODE_UT_INVALID_QN;
		goto error;
	}
	if (unlikely(ddp_msn != srx->ddp_msn[RDMAP_UNTAGGED_QN_SEND])) {
		pr_warn("siw: [QP %u]: send msn: %u != %u\n",
			qp_id(rx_qp(srx)), ddp_msn,
			srx->ddp_msn[RDMAP_UNTAGGED_QN_SEND]);
		ecode = DDP_ECODE_UT_INVALID_MSN_RANGE;
		goto error;
	}
	/* message offset must equal the number of bytes placed so far */
	if (unlikely(ddp_mo != wqe->processed)) {
		pr_warn("siw: [QP %u], send mo: %u != %u\n",
			qp_id(rx_qp(srx)), ddp_mo, wqe->processed);
		ecode = DDP_ECODE_UT_INVALID_MO;
		goto error;
	}
	if (frx->first_ddp_seg) {
		/* initialize user memory write position */
		frx->sge_idx = 0;
		frx->sge_off = 0;
		frx->pbl_idx = 0;

		/* only valid for SEND_INV and SEND_SE_INV operations */
		srx->inval_stag = be32_to_cpu(send->inval_stag);
	}
	/* remaining payload of this FPDU must fit into the receive buffer */
	if (unlikely(wqe->bytes < wqe->processed + srx->fpdu_part_rem)) {
		siw_dbg_qp(rx_qp(srx), "receive space short: %d - %d < %d\n",
			   wqe->bytes, wqe->processed, srx->fpdu_part_rem);
		wqe->wc_status = SIW_WC_LOC_LEN_ERR;
		ecode = DDP_ECODE_UT_INVALID_MSN_NOBUF;
		goto error;
	}
	return 0;
error:
	siw_init_terminate(rx_qp(srx), TERM_ERROR_LAYER_DDP,
			   DDP_ETYPE_UNTAGGED_BUF, ecode, 0);
	return -EINVAL;
}

/*
 * siw_rqe_get()
 *
 * Fetch the next receive queue element, from the SRQ if the QP has one
 * and from the QP's own receive queue otherwise, and copy it into the
 * untagged receive WQE of the QP. The SRQ lock is held while the SRQ
 * is accessed.
 *
 * Returns the initialized WQE, or NULL if there is no receive queue,
 * the current element is not marked valid, or the element carries more
 * than SIW_MAX_SGE SGEs.
 */
static struct siw_wqe *siw_rqe_get(struct siw_qp *qp)
{
	struct siw_rqe *rqe;
	struct siw_srq *srq;
	struct siw_wqe *wqe = NULL;
	bool srq_event = false;
	unsigned long flags;

	srq = qp->srq;
	if (srq) {
		spin_lock_irqsave(&srq->lock, flags);
		if (unlikely(!srq->num_rqe))
			goto out;

		rqe = &srq->recvq[srq->rq_get % srq->num_rqe];
	} else {
		if (unlikely(!qp->recvq))
			goto out;

		rqe = &qp->recvq[qp->rq_get % qp->attrs.rq_size];
	}
	if (likely(rqe->flags == SIW_WQE_VALID)) {
		/* work on a local copy of the SGE count */
		int num_sge = rqe->num_sge;

		if (likely(num_sge <= SIW_MAX_SGE)) {
			int i = 0;

			wqe = rx_wqe(&qp->rx_untagged);
			rx_type(wqe) = SIW_OP_RECEIVE;
			wqe->wr_status = SIW_WR_INPROGRESS;
			wqe->bytes = 0;
			wqe->processed = 0;

			wqe->rqe.id = rqe->id;
			wqe->rqe.num_sge = num_sge;

			/* wqe->bytes accumulates the total receive space */
			while (i < num_sge) {
				wqe->rqe.sge[i].laddr = rqe->sge[i].laddr;
				wqe->rqe.sge[i].lkey = rqe->sge[i].lkey;
				wqe->rqe.sge[i].length = rqe->sge[i].length;
				wqe->bytes += wqe->rqe.sge[i].length;
				wqe->mem[i] = NULL;
				i++;
			}
			/* can be re-used by appl */
			smp_store_mb(rqe->flags, 0);
		} else {
			siw_dbg_qp(qp, "too many sge's: %d\n", rqe->num_sge);
			/* early return: drop the SRQ lock taken above */
			if (srq)
				spin_unlock_irqrestore(&srq->lock, flags);
			return NULL;
		}
		if (!srq) {
			qp->rq_get++;
		} else {
			if (srq->armed) {
				/* Test SRQ limit */
				u32 off = (srq->rq_get + srq->limit) %
					  srq->num_rqe;
				struct siw_rqe *rqe2 = &srq->recvq[off];

				if (!(rqe2->flags & SIW_WQE_VALID)) {
					srq->armed = 0;
					srq_event = true;
				}
			}
			srq->rq_get++;
		}
	}
out:
	if (srq) {
		spin_unlock_irqrestore(&srq->lock, flags);
		/* limit event is raised only after the lock is released */
		if (srq_event)
			siw_srq_event(srq, IB_EVENT_SRQ_LIMIT_REACHED);
	}
	return wqe;
}

/*
 * siw_proc_send:
 *
 * Process one incoming SEND and place data into memory referenced by
 * receive wqe.
* * Function supports partially received sends (suspending/resuming * current receive wqe processing) * * return value: * 0: reached the end of a DDP segment * -EAGAIN: to be called again to finish the DDP segment */ int siw_proc_send(struct siw_qp *qp) { struct siw_rx_stream *srx = &qp->rx_stream; struct siw_rx_fpdu *frx = &qp->rx_untagged; struct siw_wqe *wqe; u32 data_bytes; /* all data bytes available */ u32 rcvd_bytes; /* sum of data bytes rcvd */ int rv = 0; if (frx->first_ddp_seg) { wqe = siw_rqe_get(qp); if (unlikely(!wqe)) { siw_init_terminate(qp, TERM_ERROR_LAYER_DDP, DDP_ETYPE_UNTAGGED_BUF, DDP_ECODE_UT_INVALID_MSN_NOBUF, 0); return -ENOENT; } } else { wqe = rx_wqe(frx); } if (srx->state == SIW_GET_DATA_START) { rv = siw_send_check_ntoh(srx, frx); if (unlikely(rv)) { siw_qp_event(qp, IB_EVENT_QP_FATAL); return rv; } if (!srx->fpdu_part_rem) /* zero length SEND */ return 0; } data_bytes = min(srx->fpdu_part_rem, srx->skb_new); rcvd_bytes = 0; /* A zero length SEND will skip below loop */ while (data_bytes) { struct ib_pd *pd; struct siw_mem **mem, *mem_p; struct siw_sge *sge; u32 sge_bytes; /* data bytes avail for SGE */ sge = &wqe->rqe.sge[frx->sge_idx]; if (!sge->length) { /* just skip empty sge's */ frx->sge_idx++; frx->sge_off = 0; frx->pbl_idx = 0; continue; } sge_bytes = min(data_bytes, sge->length - frx->sge_off); mem = &wqe->mem[frx->sge_idx]; /* * check with QP's PD if no SRQ present, SRQ's PD otherwise */ pd = qp->srq == NULL ? 
qp->pd : qp->srq->base_srq.pd; rv = siw_check_sge(pd, sge, mem, IB_ACCESS_LOCAL_WRITE, frx->sge_off, sge_bytes); if (unlikely(rv)) { siw_init_terminate(qp, TERM_ERROR_LAYER_DDP, DDP_ETYPE_CATASTROPHIC, DDP_ECODE_CATASTROPHIC, 0); siw_qp_event(qp, IB_EVENT_QP_ACCESS_ERR); break; } mem_p = *mem; if (mem_p->mem_obj == NULL) rv = siw_rx_kva(srx, (void *)(sge->laddr + frx->sge_off), sge_bytes); else if (!mem_p->is_pbl) rv = siw_rx_umem(srx, mem_p->umem, sge->laddr + frx->sge_off, sge_bytes); else rv = siw_rx_pbl(srx, &frx->pbl_idx, mem_p, sge->laddr + frx->sge_off, sge_bytes); if (unlikely(rv != sge_bytes)) { wqe->processed += rcvd_bytes; siw_init_terminate(qp, TERM_ERROR_LAYER_DDP, DDP_ETYPE_CATASTROPHIC, DDP_ECODE_CATASTROPHIC, 0); return -EINVAL; } frx->sge_off += rv; if (frx->sge_off == sge->length) { frx->sge_idx++; frx->sge_off = 0; frx->pbl_idx = 0; } data_bytes -= rv; rcvd_bytes += rv; srx->fpdu_part_rem -= rv; srx->fpdu_part_rcvd += rv; } wqe->processed += rcvd_bytes; if (!srx->fpdu_part_rem) return 0; return (rv < 0) ? 
rv : -EAGAIN; } /* * siw_proc_write: * * Place incoming WRITE after referencing and checking target buffer * Function supports partially received WRITEs (suspending/resuming * current receive processing) * * return value: * 0: reached the end of a DDP segment * -EAGAIN: to be called again to finish the DDP segment */ int siw_proc_write(struct siw_qp *qp) { struct siw_rx_stream *srx = &qp->rx_stream; struct siw_rx_fpdu *frx = &qp->rx_tagged; struct siw_mem *mem; int bytes, rv; if (srx->state == SIW_GET_DATA_START) { if (!srx->fpdu_part_rem) /* zero length WRITE */ return 0; rv = siw_write_check_ntoh(srx, frx); if (unlikely(rv)) { siw_qp_event(qp, IB_EVENT_QP_FATAL); return rv; } } bytes = min(srx->fpdu_part_rem, srx->skb_new); if (frx->first_ddp_seg) { struct siw_wqe *wqe = rx_wqe(frx); rx_mem(frx) = siw_mem_id2obj(qp->sdev, srx->ddp_stag >> 8); if (unlikely(!rx_mem(frx))) { siw_dbg_qp(qp, "sink stag not found/invalid, stag 0x%08x\n", srx->ddp_stag); siw_init_terminate(qp, TERM_ERROR_LAYER_DDP, DDP_ETYPE_TAGGED_BUF, DDP_ECODE_T_INVALID_STAG, 0); return -EINVAL; } wqe->rqe.num_sge = 1; rx_type(wqe) = SIW_OP_WRITE; wqe->wr_status = SIW_WR_INPROGRESS; } mem = rx_mem(frx); /* * Check if application re-registered memory with different * key field of STag. 
*/ if (unlikely(mem->stag != srx->ddp_stag)) { siw_init_terminate(qp, TERM_ERROR_LAYER_DDP, DDP_ETYPE_TAGGED_BUF, DDP_ECODE_T_INVALID_STAG, 0); return -EINVAL; } rv = siw_check_mem(qp->pd, mem, srx->ddp_to + srx->fpdu_part_rcvd, IB_ACCESS_REMOTE_WRITE, bytes); if (unlikely(rv)) { siw_init_terminate(qp, TERM_ERROR_LAYER_DDP, DDP_ETYPE_TAGGED_BUF, siw_tagged_error(-rv), 0); siw_qp_event(qp, IB_EVENT_QP_ACCESS_ERR); return -EINVAL; } if (mem->mem_obj == NULL) rv = siw_rx_kva(srx, (void *)(srx->ddp_to + srx->fpdu_part_rcvd), bytes); else if (!mem->is_pbl) rv = siw_rx_umem(srx, mem->umem, srx->ddp_to + srx->fpdu_part_rcvd, bytes); else rv = siw_rx_pbl(srx, &frx->pbl_idx, mem, srx->ddp_to + srx->fpdu_part_rcvd, bytes); if (unlikely(rv != bytes)) { siw_init_terminate(qp, TERM_ERROR_LAYER_DDP, DDP_ETYPE_CATASTROPHIC, DDP_ECODE_CATASTROPHIC, 0); return -EINVAL; } srx->fpdu_part_rem -= rv; srx->fpdu_part_rcvd += rv; if (!srx->fpdu_part_rem) { srx->ddp_to += srx->fpdu_part_rcvd; return 0; } return -EAGAIN; } /* * Inbound RREQ's cannot carry user data. */ int siw_proc_rreq(struct siw_qp *qp) { struct siw_rx_stream *srx = &qp->rx_stream; if (!srx->fpdu_part_rem) return 0; pr_warn("siw: [QP %u]: rreq with mpa len %d\n", qp_id(qp), be16_to_cpu(srx->hdr.ctrl.mpa_len)); return -EPROTO; } /* * siw_init_rresp: * * Process inbound RDMA READ REQ. Produce a pseudo READ RESPONSE WQE. * Put it at the tail of the IRQ, if there is another WQE currently in * transmit processing. If not, make it the current WQE to be processed * and schedule transmit processing. * * Can be called from softirq context and from process * context (RREAD socket loopback case!) 
*
 * return value:
 *	0:      success,
 *	failure code otherwise
 */
static int siw_init_rresp(struct siw_qp *qp, struct siw_rx_stream *srx)
{
	struct siw_wqe *tx_work = tx_wqe(qp);
	struct siw_sqe *resp;

	/* all READ request header fields arrive in network byte order */
	uint64_t raddr = be64_to_cpu(srx->hdr.rreq.sink_to),
		 laddr = be64_to_cpu(srx->hdr.rreq.source_to);
	uint32_t length = be32_to_cpu(srx->hdr.rreq.read_size),
		 lkey = be32_to_cpu(srx->hdr.rreq.source_stag),
		 rkey = be32_to_cpu(srx->hdr.rreq.sink_stag),
		 msn = be32_to_cpu(srx->hdr.rreq.ddp_msn);

	int run_sq = 1, rv = 0;
	unsigned long flags;

	if (unlikely(msn != srx->ddp_msn[RDMAP_UNTAGGED_QN_RDMA_READ])) {
		siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
				   DDP_ETYPE_UNTAGGED_BUF,
				   DDP_ECODE_UT_INVALID_MSN_RANGE, 0);
		return -EPROTO;
	}
	spin_lock_irqsave(&qp->sq_lock, flags);

	if (tx_work->wr_status == SIW_WR_IDLE) {
		/*
		 * immediately schedule READ response w/o
		 * consuming IRQ entry: IRQ must be empty.
		 */
		tx_work->processed = 0;
		tx_work->mem[0] = NULL;
		tx_work->wr_status = SIW_WR_QUEUED;
		resp = &tx_work->sqe;
	} else {
		/* transmit path busy: queue the response, do not kick SQ */
		resp = irq_alloc_free(qp);
		run_sq = 0;
	}
	if (likely(resp)) {
		resp->opcode = SIW_OP_READ_RESPONSE;

		resp->sge[0].length = length;
		resp->sge[0].laddr = laddr;
		resp->sge[0].lkey = lkey;

		/* Keep aside message sequence number for potential
		 * error reporting during Read Response generation.
		 */
		resp->sge[1].length = msn;

		resp->raddr = raddr;
		resp->rkey = rkey;
		resp->num_sge = length ? 1 : 0;

		/* RRESP now valid as current TX wqe or placed into IRQ */
		smp_store_mb(resp->flags, SIW_WQE_VALID);
	} else {
		pr_warn("siw: [QP %u]: irq %d exceeded %d\n", qp_id(qp),
			qp->irq_put % qp->attrs.irq_size, qp->attrs.irq_size);

		siw_init_terminate(qp, TERM_ERROR_LAYER_RDMAP,
				   RDMAP_ETYPE_REMOTE_OPERATION,
				   RDMAP_ECODE_CATASTROPHIC_STREAM, 0);
		rv = -EPROTO;
	}

	spin_unlock_irqrestore(&qp->sq_lock, flags);

	/* run_sq is only set if the response became the current TX wqe */
	if (run_sq)
		rv = siw_sq_start(qp);

	return rv;
}

/*
 * Only called at start of Read.Response processing.
* Transfer pending Read from tip of ORQ into currrent rx wqe, * but keep ORQ entry valid until Read.Response processing done. * No Queue locking needed. */ static int siw_orqe_start_rx(struct siw_qp *qp) { struct siw_sqe *orqe; struct siw_wqe *wqe = NULL; /* make sure ORQ indices are current */ smp_mb(); orqe = orq_get_current(qp); if (READ_ONCE(orqe->flags) & SIW_WQE_VALID) { /* RRESP is a TAGGED RDMAP operation */ wqe = rx_wqe(&qp->rx_tagged); wqe->sqe.id = orqe->id; wqe->sqe.opcode = orqe->opcode; wqe->sqe.sge[0].laddr = orqe->sge[0].laddr; wqe->sqe.sge[0].lkey = orqe->sge[0].lkey; wqe->sqe.sge[0].length = orqe->sge[0].length; wqe->sqe.flags = orqe->flags; wqe->sqe.num_sge = 1; wqe->bytes = orqe->sge[0].length; wqe->processed = 0; wqe->mem[0] = NULL; /* make sure WQE is completely written before valid */ smp_wmb(); wqe->wr_status = SIW_WR_INPROGRESS; return 0; } return -EPROTO; } /* * siw_proc_rresp: * * Place incoming RRESP data into memory referenced by RREQ WQE * which is at the tip of the ORQ * * Function supports partially received RRESP's (suspending/resuming * current receive processing) */ int siw_proc_rresp(struct siw_qp *qp) { struct siw_rx_stream *srx = &qp->rx_stream; struct siw_rx_fpdu *frx = &qp->rx_tagged; struct siw_wqe *wqe = rx_wqe(frx); struct siw_mem **mem, *mem_p; struct siw_sge *sge; int bytes, rv; if (frx->first_ddp_seg) { if (unlikely(wqe->wr_status != SIW_WR_IDLE)) { pr_warn("siw: [QP %u]: proc RRESP: status %d, op %d\n", qp_id(qp), wqe->wr_status, wqe->sqe.opcode); rv = -EPROTO; goto error_term; } /* * fetch pending RREQ from orq */ rv = siw_orqe_start_rx(qp); if (rv) { pr_warn("siw: [QP %u]: ORQ empty at idx %d\n", qp_id(qp), qp->orq_get % qp->attrs.orq_size); goto error_term; } rv = siw_rresp_check_ntoh(srx, frx); if (unlikely(rv)) { siw_qp_event(qp, IB_EVENT_QP_FATAL); return rv; } } else { if (unlikely(wqe->wr_status != SIW_WR_INPROGRESS)) { pr_warn("siw: [QP %u]: resume RRESP: status %d\n", qp_id(qp), wqe->wr_status); rv = -EPROTO; 
goto error_term; } } if (!srx->fpdu_part_rem) /* zero length RRESPONSE */ return 0; sge = wqe->sqe.sge; /* there is only one */ mem = &wqe->mem[0]; if (!(*mem)) { /* * check target memory which resolves memory on first fragment */ rv = siw_check_sge(qp->pd, sge, mem, IB_ACCESS_LOCAL_WRITE, 0, wqe->bytes); if (unlikely(rv)) { siw_dbg_qp(qp, "target mem check: %d\n", rv); wqe->wc_status = SIW_WC_LOC_PROT_ERR; siw_init_terminate(qp, TERM_ERROR_LAYER_DDP, DDP_ETYPE_TAGGED_BUF, siw_tagged_error(-rv), 0); siw_qp_event(qp, IB_EVENT_QP_ACCESS_ERR); return -EINVAL; } } mem_p = *mem; bytes = min(srx->fpdu_part_rem, srx->skb_new); if (mem_p->mem_obj == NULL) rv = siw_rx_kva(srx, (void *)(sge->laddr + wqe->processed), bytes); else if (!mem_p->is_pbl) rv = siw_rx_umem(srx, mem_p->umem, sge->laddr + wqe->processed, bytes); else rv = siw_rx_pbl(srx, &frx->pbl_idx, mem_p, sge->laddr + wqe->processed, bytes); if (rv != bytes) { wqe->wc_status = SIW_WC_GENERAL_ERR; rv = -EINVAL; goto error_term; } srx->fpdu_part_rem -= rv; srx->fpdu_part_rcvd += rv; wqe->processed += rv; if (!srx->fpdu_part_rem) { srx->ddp_to += srx->fpdu_part_rcvd; return 0; } return -EAGAIN; error_term: siw_init_terminate(qp, TERM_ERROR_LAYER_DDP, DDP_ETYPE_CATASTROPHIC, DDP_ECODE_CATASTROPHIC, 0); return rv; } int siw_proc_terminate(struct siw_qp *qp) { struct siw_rx_stream *srx = &qp->rx_stream; struct sk_buff *skb = srx->skb; struct iwarp_terminate *term = &srx->hdr.terminate; union iwarp_hdr term_info; u8 *infop = (u8 *)&term_info; enum rdma_opcode op; u16 to_copy = sizeof(struct iwarp_ctrl); pr_warn("siw: got TERMINATE. 
layer %d, type %d, code %d\n", __rdmap_term_layer(term), __rdmap_term_etype(term), __rdmap_term_ecode(term)); if (be32_to_cpu(term->ddp_qn) != RDMAP_UNTAGGED_QN_TERMINATE || be32_to_cpu(term->ddp_msn) != qp->rx_stream.ddp_msn[RDMAP_UNTAGGED_QN_TERMINATE] || be32_to_cpu(term->ddp_mo) != 0) { pr_warn("siw: rx bogus TERM [QN x%08x, MSN x%08x, MO x%08x]\n", be32_to_cpu(term->ddp_qn), be32_to_cpu(term->ddp_msn), be32_to_cpu(term->ddp_mo)); return -ECONNRESET; } /* * Receive remaining pieces of TERM if indicated */ if (!term->flag_m) return -ECONNRESET; /* Do not take the effort to reassemble a network fragmented * TERM message */ if (srx->skb_new < sizeof(struct iwarp_ctrl_tagged)) return -ECONNRESET; memset(infop, 0, sizeof(term_info)); skb_copy_bits(skb, srx->skb_offset, infop, to_copy); op = __rdmap_get_opcode(&term_info.ctrl); if (op >= RDMAP_TERMINATE) goto out; infop += to_copy; srx->skb_offset += to_copy; srx->skb_new -= to_copy; srx->skb_copied += to_copy; srx->fpdu_part_rcvd += to_copy; srx->fpdu_part_rem -= to_copy; to_copy = iwarp_pktinfo[op].hdr_len - to_copy; /* Again, no network fragmented TERM's */ if (to_copy + MPA_CRC_SIZE > srx->skb_new) return -ECONNRESET; skb_copy_bits(skb, srx->skb_offset, infop, to_copy); if (term->flag_r) { siw_dbg_qp(qp, "TERM reports RDMAP hdr type %u, len %u (%s)\n", op, be16_to_cpu(term_info.ctrl.mpa_len), term->flag_m ? "valid" : "invalid"); } else if (term->flag_d) { siw_dbg_qp(qp, "TERM reports DDP hdr type %u, len %u (%s)\n", op, be16_to_cpu(term_info.ctrl.mpa_len), term->flag_m ? 
"valid" : "invalid"); } out: srx->skb_new -= to_copy; srx->skb_offset += to_copy; srx->skb_copied += to_copy; srx->fpdu_part_rcvd += to_copy; srx->fpdu_part_rem -= to_copy; return -ECONNRESET; } static int siw_get_trailer(struct siw_qp *qp, struct siw_rx_stream *srx) { struct sk_buff *skb = srx->skb; u8 *tbuf = (u8 *)&srx->trailer.crc - srx->pad; __wsum crc_in, crc_own = 0; siw_dbg_qp(qp, "expected %d, available %d, pad %u\n", srx->fpdu_part_rem, srx->skb_new, srx->pad); if (srx->skb_new < srx->fpdu_part_rem) return -EAGAIN; skb_copy_bits(skb, srx->skb_offset, tbuf, srx->fpdu_part_rem); if (srx->mpa_crc_hd && srx->pad) crypto_shash_update(srx->mpa_crc_hd, tbuf, srx->pad); srx->skb_new -= srx->fpdu_part_rem; srx->skb_offset += srx->fpdu_part_rem; srx->skb_copied += srx->fpdu_part_rem; if (!srx->mpa_crc_hd) return 0; /* * CRC32 is computed, transmitted and received directly in NBO, * so there's never a reason to convert byte order. */ crypto_shash_final(srx->mpa_crc_hd, (u8 *)&crc_own); crc_in = (__force __wsum)srx->trailer.crc; if (unlikely(crc_in != crc_own)) { pr_warn("siw: crc error. 
in: %08x, own %08x, op %u\n", crc_in, crc_own, qp->rx_stream.rdmap_op); siw_init_terminate(qp, TERM_ERROR_LAYER_LLP, LLP_ETYPE_MPA, LLP_ECODE_RECEIVED_CRC, 0); return -EINVAL; } return 0; } #define MIN_DDP_HDR sizeof(struct iwarp_ctrl_tagged) static int siw_get_hdr(struct siw_rx_stream *srx) { struct sk_buff *skb = srx->skb; struct siw_qp *qp = rx_qp(srx); struct iwarp_ctrl *c_hdr = &srx->hdr.ctrl; struct siw_rx_fpdu *frx; u8 opcode; int bytes; if (srx->fpdu_part_rcvd < MIN_DDP_HDR) { /* * copy a mimimum sized (tagged) DDP frame control part */ bytes = min_t(int, srx->skb_new, MIN_DDP_HDR - srx->fpdu_part_rcvd); skb_copy_bits(skb, srx->skb_offset, (char *)c_hdr + srx->fpdu_part_rcvd, bytes); srx->fpdu_part_rcvd += bytes; srx->skb_new -= bytes; srx->skb_offset += bytes; srx->skb_copied += bytes; if (srx->fpdu_part_rcvd < MIN_DDP_HDR) return -EAGAIN; if (unlikely(__ddp_get_version(c_hdr) != DDP_VERSION)) { enum ddp_etype etype; enum ddp_ecode ecode; pr_warn("siw: received ddp version unsupported %d\n", __ddp_get_version(c_hdr)); if (c_hdr->ddp_rdmap_ctrl & DDP_FLAG_TAGGED) { etype = DDP_ETYPE_TAGGED_BUF; ecode = DDP_ECODE_T_VERSION; } else { etype = DDP_ETYPE_UNTAGGED_BUF; ecode = DDP_ECODE_UT_VERSION; } siw_init_terminate(rx_qp(srx), TERM_ERROR_LAYER_DDP, etype, ecode, 0); return -EINVAL; } if (unlikely(__rdmap_get_version(c_hdr) != RDMAP_VERSION)) { pr_warn("siw: received rdmap version unsupported %d\n", __rdmap_get_version(c_hdr)); siw_init_terminate(rx_qp(srx), TERM_ERROR_LAYER_RDMAP, RDMAP_ETYPE_REMOTE_OPERATION, RDMAP_ECODE_VERSION, 0); return -EINVAL; } opcode = __rdmap_get_opcode(c_hdr); if (opcode > RDMAP_TERMINATE) { pr_warn("siw: received unknown packet type %u\n", opcode); siw_init_terminate(rx_qp(srx), TERM_ERROR_LAYER_RDMAP, RDMAP_ETYPE_REMOTE_OPERATION, RDMAP_ECODE_OPCODE, 0); return -EINVAL; } siw_dbg_qp(rx_qp(srx), "new header, opcode %u\n", opcode); } else { opcode = __rdmap_get_opcode(c_hdr); } set_rx_fpdu_context(qp, opcode); frx = qp->rx_fpdu; /* 
* Figure out len of current hdr: variable length of * iwarp hdr may force us to copy hdr information in * two steps. Only tagged DDP messages are already * completely received. */ if (iwarp_pktinfo[opcode].hdr_len > sizeof(struct iwarp_ctrl_tagged)) { bytes = iwarp_pktinfo[opcode].hdr_len - MIN_DDP_HDR; if (srx->skb_new < bytes) return -EAGAIN; skb_copy_bits(skb, srx->skb_offset, (char *)c_hdr + srx->fpdu_part_rcvd, bytes); srx->fpdu_part_rcvd += bytes; srx->skb_new -= bytes; srx->skb_offset += bytes; srx->skb_copied += bytes; } /* * DDP/RDMAP header receive completed. Check if the current * DDP segment starts a new RDMAP message or continues a previously * started RDMAP message. * * Alternating reception of DDP segments (or FPDUs) from incomplete * tagged and untagged RDMAP messages is supported, as long as * the current tagged or untagged message gets eventually completed * w/o intersection from another message of the same type * (tagged/untagged). E.g., a WRITE can get intersected by a SEND, * but not by a READ RESPONSE etc. */ if (srx->mpa_crc_hd) { /* * Restart CRC computation */ crypto_shash_init(srx->mpa_crc_hd); crypto_shash_update(srx->mpa_crc_hd, (u8 *)c_hdr, srx->fpdu_part_rcvd); } if (frx->more_ddp_segs) { frx->first_ddp_seg = 0; if (frx->prev_rdmap_op != opcode) { pr_warn("siw: packet intersection: %u : %u\n", frx->prev_rdmap_op, opcode); /* * The last inbound RDMA operation of same type * (tagged or untagged) is left unfinished. * To complete it in error, make it the current * operation again, even with the header already * overwritten. For error handling, only the opcode * and current rx context are relevant. */ set_rx_fpdu_context(qp, frx->prev_rdmap_op); __rdmap_set_opcode(c_hdr, frx->prev_rdmap_op); return -EPROTO; } } else { frx->prev_rdmap_op = opcode; frx->first_ddp_seg = 1; } frx->more_ddp_segs = c_hdr->ddp_rdmap_ctrl & DDP_FLAG_LAST ? 
0 : 1; return 0; } static int siw_check_tx_fence(struct siw_qp *qp) { struct siw_wqe *tx_waiting = tx_wqe(qp); struct siw_sqe *rreq; int resume_tx = 0, rv = 0; unsigned long flags; spin_lock_irqsave(&qp->orq_lock, flags); rreq = orq_get_current(qp); /* free current orq entry */ WRITE_ONCE(rreq->flags, 0); if (qp->tx_ctx.orq_fence) { if (unlikely(tx_waiting->wr_status != SIW_WR_QUEUED)) { pr_warn("siw: [QP %u]: fence resume: bad status %d\n", qp_id(qp), tx_waiting->wr_status); rv = -EPROTO; goto out; } /* resume SQ processing */ if (tx_waiting->sqe.opcode == SIW_OP_READ || tx_waiting->sqe.opcode == SIW_OP_READ_LOCAL_INV) { rreq = orq_get_tail(qp); if (unlikely(!rreq)) { pr_warn("siw: [QP %u]: no ORQE\n", qp_id(qp)); rv = -EPROTO; goto out; } siw_read_to_orq(rreq, &tx_waiting->sqe); qp->orq_put++; qp->tx_ctx.orq_fence = 0; resume_tx = 1; } else if (siw_orq_empty(qp)) { qp->tx_ctx.orq_fence = 0; resume_tx = 1; } else { pr_warn("siw: [QP %u]: fence resume: orq idx: %d:%d\n", qp_id(qp), qp->orq_get, qp->orq_put); rv = -EPROTO; } } qp->orq_get++; out: spin_unlock_irqrestore(&qp->orq_lock, flags); if (resume_tx) rv = siw_sq_start(qp); return rv; } /* * siw_rdmap_complete() * * Complete processing of an RDMA message after receiving all * DDP segmens or ABort processing after encountering error case. * * o SENDs + RRESPs will need for completion, * o RREQs need for READ RESPONSE initialization * o WRITEs need memory dereferencing * * TODO: Failed WRITEs need local error to be surfaced. 
*/
/*
 * @qp:    queue pair whose current receive context gets completed
 * @error: 0 for regular completion, otherwise the error which aborted
 *         receive processing
 *
 * The receive WQE is set to SIW_WR_IDLE in all cases. Returns the
 * result of the opcode specific completion step, 0 if there was none.
 */
static int siw_rdmap_complete(struct siw_qp *qp, int error)
{
	struct siw_rx_stream *srx = &qp->rx_stream;
	struct siw_wqe *wqe = rx_wqe(qp->rx_fpdu);
	enum siw_wc_status wc_status = wqe->wc_status;
	u8 opcode = __rdmap_get_opcode(&srx->hdr.ctrl);
	int rv = 0;

	switch (opcode) {
	case RDMAP_SEND_SE:
	case RDMAP_SEND_SE_INVAL:
		wqe->rqe.flags |= SIW_WQE_SOLICITED;
		/* Fall through */

	case RDMAP_SEND:
	case RDMAP_SEND_INVAL:
		/* nothing to complete if no receive WQE is active */
		if (wqe->wr_status == SIW_WR_IDLE)
			break;

		srx->ddp_msn[RDMAP_UNTAGGED_QN_SEND]++;

		if (error != 0 && wc_status == SIW_WC_SUCCESS)
			wc_status = SIW_WC_GENERAL_ERR;
		/*
		 * Handle STag invalidation request
		 */
		if (wc_status == SIW_WC_SUCCESS &&
		    (opcode == RDMAP_SEND_INVAL ||
		     opcode == RDMAP_SEND_SE_INVAL)) {
			rv = siw_invalidate_stag(qp->pd, srx->inval_stag);
			if (rv) {
				siw_init_terminate(
					qp, TERM_ERROR_LAYER_RDMAP,
					rv == -EACCES ?
						RDMAP_ETYPE_REMOTE_PROTECTION :
						RDMAP_ETYPE_REMOTE_OPERATION,
					RDMAP_ECODE_CANNOT_INVALIDATE, 0);

				wc_status = SIW_WC_REM_INV_REQ_ERR;
			}
			/* report the STag only if invalidation succeeded */
			rv = siw_rqe_complete(qp, &wqe->rqe, wqe->processed,
					      rv ? 0 : srx->inval_stag,
					      wc_status);
		} else {
			rv = siw_rqe_complete(qp, &wqe->rqe, wqe->processed,
					      0, wc_status);
		}
		siw_wqe_put_mem(wqe, SIW_OP_RECEIVE);
		break;

	case RDMAP_RDMA_READ_RESP:
		if (wqe->wr_status == SIW_WR_IDLE)
			break;

		if (error != 0) {
			if ((srx->state == SIW_GET_HDR &&
			     qp->rx_fpdu->first_ddp_seg) || error == -ENODATA)
				/* possible RREQ in ORQ left untouched */
				break;

			if (wc_status == SIW_WC_SUCCESS)
				wc_status = SIW_WC_GENERAL_ERR;
		} else if (qp->kernel_verbs &&
			   rx_type(wqe) == SIW_OP_READ_LOCAL_INV) {
			/*
			 * Handle any STag invalidation request
			 */
			rv = siw_invalidate_stag(qp->pd, wqe->sqe.sge[0].lkey);
			if (rv) {
				siw_init_terminate(qp, TERM_ERROR_LAYER_RDMAP,
						   RDMAP_ETYPE_CATASTROPHIC,
						   RDMAP_ECODE_UNSPECIFIED, 0);

				if (wc_status == SIW_WC_SUCCESS) {
					wc_status = SIW_WC_GENERAL_ERR;
					error = rv;
				}
			}
		}
		/*
		 * All errors turn the wqe into signalled.
		 */
		if ((wqe->sqe.flags & SIW_WQE_SIGNALLED) || error != 0)
			rv = siw_sqe_complete(qp, &wqe->sqe, wqe->processed,
					      wc_status);
		siw_wqe_put_mem(wqe, SIW_OP_READ);

		if (!error)
			rv = siw_check_tx_fence(qp);
		else
			/* Disable current ORQ element */
			WRITE_ONCE(orq_get_current(qp)->flags, 0);
		break;

	case RDMAP_RDMA_READ_REQ:
		/* set up the READ RESPONSE, then expect the next MSN */
		if (!error) {
			rv = siw_init_rresp(qp, srx);
			srx->ddp_msn[RDMAP_UNTAGGED_QN_RDMA_READ]++;
		}
		break;

	case RDMAP_RDMA_WRITE:
		if (wqe->wr_status == SIW_WR_IDLE)
			break;

		/*
		 * Free References from memory object if
		 * attached to receive context (inbound WRITE).
		 * While a zero-length WRITE is allowed,
		 * no memory reference got created.
		 */
		if (rx_mem(&qp->rx_tagged)) {
			siw_mem_put(rx_mem(&qp->rx_tagged));
			rx_mem(&qp->rx_tagged) = NULL;
		}
		break;

	default:
		break;
	}
	wqe->wr_status = SIW_WR_IDLE;

	return rv;
}

/*
 * siw_tcp_rx_data()
 *
 * Main routine to consume inbound TCP payload
 *
 * Runs the receive state machine (header, data, trailer) over the
 * bytes of the skb until all bytes are consumed, more data is needed
 * to proceed, or an error occurs. On error, the current message gets
 * completed in error where applicable and siw_qp_cm_drop() is called.
 *
 * @rd_desc:	read descriptor
 * @skb:	socket buffer
 * @off:	offset in skb
 * @len:	skb->len - offset : payload in skb
 *
 * Returns the number of bytes consumed from the skb.
 */
int siw_tcp_rx_data(read_descriptor_t *rd_desc, struct sk_buff *skb,
		    unsigned int off, size_t len)
{
	struct siw_qp *qp = rd_desc->arg.data;
	struct siw_rx_stream *srx = &qp->rx_stream;
	int rv;

	srx->skb = skb;
	srx->skb_new = skb->len - off;
	srx->skb_offset = off;
	srx->skb_copied = 0;

	siw_dbg_qp(qp, "new data, len %d\n", srx->skb_new);

	while (srx->skb_new) {
		/* cleared where completing the message must be skipped */
		int run_completion = 1;

		if (unlikely(srx->rx_suspend)) {
			/* Do not process any more data */
			srx->skb_copied += srx->skb_new;
			break;
		}
		switch (srx->state) {
		case SIW_GET_HDR:
			rv = siw_get_hdr(srx);
			if (!rv) {
				/* bytes of this FPDU left after the header */
				srx->fpdu_part_rem =
					be16_to_cpu(srx->hdr.ctrl.mpa_len) -
					srx->fpdu_part_rcvd + MPA_HDR_SIZE;

				if (srx->fpdu_part_rem)
					srx->pad = -srx->fpdu_part_rem & 0x3;
				else
					srx->pad = 0;

				srx->state = SIW_GET_DATA_START;
				srx->fpdu_part_rcvd = 0;
			}
			break;

		case SIW_GET_DATA_MORE:
			/*
			 * Another data fragment of the same DDP segment.
			 * Setting first_ddp_seg = 0 avoids repeating
			 * initializations that shall occur only once per
			 * DDP segment.
			 */
			qp->rx_fpdu->first_ddp_seg = 0;
			/* Fall through */

		case SIW_GET_DATA_START:
			/*
			 * Headers will be checked by the opcode-specific
			 * data receive function below.
			 */
			rv = iwarp_pktinfo[qp->rx_stream.rdmap_op].rx_data(qp);
			if (!rv) {
				int mpa_len =
					be16_to_cpu(srx->hdr.ctrl.mpa_len)
					+ MPA_HDR_SIZE;

				/* trailer: padding to 4 bytes plus CRC */
				srx->fpdu_part_rem = (-mpa_len & 0x3)
						      + MPA_CRC_SIZE;
				srx->fpdu_part_rcvd = 0;
				srx->state = SIW_GET_TRAILER;
			} else {
				if (unlikely(rv == -ECONNRESET))
					run_completion = 0;
				else
					srx->state = SIW_GET_DATA_MORE;
			}
			break;

		case SIW_GET_TRAILER:
			/*
			 * read CRC + any padding
			 */
			rv = siw_get_trailer(qp, srx);
			if (likely(!rv)) {
				/*
				 * FPDU completed.
				 * complete RDMAP message if last fragment
				 */
				srx->state = SIW_GET_HDR;
				srx->fpdu_part_rcvd = 0;

				if (!(srx->hdr.ctrl.ddp_rdmap_ctrl &
				      DDP_FLAG_LAST))
					/* more frags */
					break;

				rv = siw_rdmap_complete(qp, 0);
				run_completion = 0;
			}
			break;

		default:
			pr_warn("QP[%u]: RX out of state\n", qp_id(qp));
			rv = -EPROTO;
			run_completion = 0;
		}
		/* any error except -EAGAIN ends receive processing */
		if (unlikely(rv != 0 && rv != -EAGAIN)) {
			if ((srx->state > SIW_GET_HDR ||
			     qp->rx_fpdu->more_ddp_segs) && run_completion)
				siw_rdmap_complete(qp, rv);

			siw_dbg_qp(qp, "rx error %d, rx state %d\n", rv,
				   srx->state);

			siw_qp_cm_drop(qp, 1);

			break;
		}
		/* -EAGAIN: wait for more data to arrive */
		if (rv) {
			siw_dbg_qp(qp, "fpdu fragment, state %d, missing %d\n",
				   srx->state, srx->fpdu_part_rem);
			break;
		}
	}
	return srx->skb_copied;
}