/*
 * siw_qp_prepare_tx()
 *
 * Prepare TX state for sending out one FPDU. Builds a complete packet
 * if no user data or only immediate data is present.
 *
 * Returns PKT_COMPLETE if a complete packet was built, PKT_FRAGMENTED
 * otherwise.
 */
static int siw_qp_prepare_tx(struct siw_iwarp_tx *c_tx)
{
	struct siw_wqe	*wqe = c_tx->wqe;
	u32		*crc = NULL;

	dprint(DBG_TX, "(QP%d):\n", TX_QPID(c_tx));

	switch (wr_type(wqe)) {

	case SIW_WR_RDMA_READ_REQ:
		memcpy(&c_tx->pkt.ctrl,
		       &iwarp_pktinfo[RDMAP_RDMA_READ_REQ].ctrl,
		       sizeof(struct iwarp_ctrl));

		c_tx->pkt.rreq.rsvd = 0;
		c_tx->pkt.rreq.ddp_qn = htonl(RDMAP_UNTAGGED_QN_RDMA_READ);
		c_tx->pkt.rreq.ddp_msn =
			htonl(++c_tx->ddp_msn[RDMAP_UNTAGGED_QN_RDMA_READ]);
		c_tx->pkt.rreq.ddp_mo = 0;
		c_tx->pkt.rreq.sink_stag = htonl(wqe->wr.rread.sge[0].lkey);
		c_tx->pkt.rreq.sink_to =
			cpu_to_be64(wqe->wr.rread.sge[0].addr); /* abs addr! */
		c_tx->pkt.rreq.source_stag = htonl(wqe->wr.rread.rtag);
		c_tx->pkt.rreq.source_to = cpu_to_be64(wqe->wr.rread.raddr);
		c_tx->pkt.rreq.read_size = htonl(wqe->bytes);

		dprint(DBG_TX, ": RREQ: Sink: %x, 0x%016llx\n",
			wqe->wr.rread.sge[0].lkey, wqe->wr.rread.sge[0].addr);

		c_tx->ctrl_len = sizeof(struct iwarp_rdma_rreq);
		crc = &c_tx->pkt.rreq_pkt.crc;
		break;

	case SIW_WR_SEND:
		if (wr_flags(wqe) & IB_SEND_SOLICITED)
			memcpy(&c_tx->pkt.ctrl,
			       &iwarp_pktinfo[RDMAP_SEND_SE].ctrl,
			       sizeof(struct iwarp_ctrl));
		else
			memcpy(&c_tx->pkt.ctrl, &iwarp_pktinfo[RDMAP_SEND].ctrl,
			       sizeof(struct iwarp_ctrl));

		c_tx->pkt.send.ddp_qn = RDMAP_UNTAGGED_QN_SEND;
		c_tx->pkt.send.ddp_msn =
			htonl(++c_tx->ddp_msn[RDMAP_UNTAGGED_QN_SEND]);
		c_tx->pkt.send.ddp_mo = 0;

		c_tx->pkt.send.rsvd = 0;

		c_tx->ctrl_len = sizeof(struct iwarp_send);

		if (!wqe->bytes)
			crc = &c_tx->pkt.send_pkt.crc;
		break;

	case SIW_WR_RDMA_WRITE:
		memcpy(&c_tx->pkt.ctrl, &iwarp_pktinfo[RDMAP_RDMA_WRITE].ctrl,
		       sizeof(struct iwarp_ctrl));

		c_tx->pkt.rwrite.sink_stag = htonl(wqe->wr.write.rtag);
		c_tx->pkt.rwrite.sink_to = cpu_to_be64(wqe->wr.write.raddr);
		c_tx->ctrl_len = sizeof(struct iwarp_rdma_write);

		if (!wqe->bytes)
			crc = &c_tx->pkt.write_pkt.crc;
		break;

	case SIW_WR_RDMA_READ_RESP:
		memcpy(&c_tx->pkt.ctrl,
		       &iwarp_pktinfo[RDMAP_RDMA_READ_RESP].ctrl,
		       sizeof(struct iwarp_ctrl));

		c_tx->pkt.rresp.sink_stag = wqe->wr.rresp.rtag; /* NBO */
		c_tx->pkt.rresp.sink_to = cpu_to_be64(wqe->wr.rresp.raddr);

		c_tx->ctrl_len = sizeof(struct iwarp_rdma_rresp);

		dprint(DBG_TX, ": RRESP: Sink: %x, 0x%016llx\n",
			wqe->wr.rresp.rtag, wqe->wr.rresp.raddr);

		if (!wqe->bytes)
			crc = &c_tx->pkt.rresp_pkt.crc;
		break;

	default:
		dprint(DBG_ON, "Unsupported WQE type %d\n", wr_type(wqe));
		BUG();
		break;
	}
	c_tx->ctrl_sent = 0;
	c_tx->sge_idx = 0;
	c_tx->sge_off = 0;
	c_tx->pg_idx = 0;
	c_tx->umem_chunk = NULL;

	/*
	 * Compute the complete CRC now if CRC is enabled and the packet is
	 * short (header only).
	 */
	if (crc) {
		*crc = 0;
		if (c_tx->crc_enabled) {
			if (siw_crc_txhdr(c_tx) != 0)
				return -EINVAL;
			crypto_hash_final(&c_tx->mpa_crc_hd, (u8 *)crc);
		}
	}
	c_tx->ctrl_len += MPA_CRC_SIZE;

	/*
	 * Allow direct sending out of the user buffer if the WR is not
	 * signalled, the payload is over the threshold, and CRC is not
	 * enabled.
	 * Per RDMA verbs, the application should not change the send buffer
	 * until the work has completed. In iWARP, work completion only means
	 * local delivery to TCP; TCP may still reuse the buffer for
	 * retransmission. Changing unsent data would also break the CRC,
	 * if applied.
	 * Inline buffers are already out of user control and can be sent
	 * zero-copy.
	 */
	if (zcopy_tx &&
	    (!(wr_flags(wqe) & IB_SEND_SIGNALED) || SIW_INLINED_DATA(wqe)) &&
	    wqe->bytes > SENDPAGE_THRESH &&
	    wr_type(wqe) != SIW_WR_RDMA_READ_REQ)
		c_tx->use_sendpage = 1;
	else
		c_tx->use_sendpage = 0;

	return crc == NULL ? PKT_FRAGMENTED : PKT_COMPLETE;
}
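
/*
 * Illustrative sketch only, not part of the original driver: the zero-copy
 * eligibility test from siw_qp_prepare_tx() factored into a predicate. The
 * helper name siw_example_may_zcopy() is hypothetical; it simply mirrors the
 * condition above, assuming zcopy_tx and SENDPAGE_THRESH keep their meaning
 * from this file.
 */
static inline int siw_example_may_zcopy(struct siw_wqe *wqe)
{
	/*
	 * Non-signalled or inline payloads above the sendpage threshold may
	 * go out directly from the user buffer; RDMA READ requests carry no
	 * payload and are excluded.
	 */
	return zcopy_tx &&
	       (!(wr_flags(wqe) & IB_SEND_SIGNALED) ||
		SIW_INLINED_DATA(wqe)) &&
	       wqe->bytes > SENDPAGE_THRESH &&
	       wr_type(wqe) != SIW_WR_RDMA_READ_REQ;
}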
/*
 * siw_post_send()
 *
 * Post a list of send WRs to an SQ.
 *
 * @ofa_qp:	OFA QP contained in siw QP
 * @wr:		NULL-terminated list of user WRs
 * @bad_wr:	Points to the failing WR in case of synchronous failure.
 */
int siw_post_send(struct ib_qp *ofa_qp, struct ib_send_wr *wr,
		  struct ib_send_wr **bad_wr)
{
	struct siw_wqe	*wqe = NULL;
	struct siw_qp	*qp = siw_qp_ofa2siw(ofa_qp);
	unsigned long	flags;
	int rv = 0;

	dprint(DBG_WR|DBG_TX, "(QP%d): state=%d\n",
		QP_ID(qp), qp->attrs.state);

	/*
	 * Try to acquire the QP state lock. Must be non-blocking
	 * to accommodate kernel clients' needs.
	 */
	if (!down_read_trylock(&qp->state_lock)) {
		*bad_wr = wr;
		return -ENOTCONN;
	}

	if (qp->attrs.state != SIW_QP_STATE_RTS) {
		dprint(DBG_WR|DBG_ON, "(QP%d): state=%d\n",
			QP_ID(qp), qp->attrs.state);
		up_read(&qp->state_lock);
		*bad_wr = wr;
		return -ENOTCONN;
	}
	dprint(DBG_WR|DBG_TX, "(QP%d): sq_space(#1)=%d\n",
		QP_ID(qp), atomic_read(&qp->sq_space));

	while (wr) {
		wqe = siw_wqe_alloc(qp, opcode_ofa2siw(wr->opcode));
		if (!wqe) {
			dprint(DBG_ON, " siw_wqe_alloc\n");
			rv = -ENOMEM;
			break;
		}
		wr_type(wqe) = opcode_ofa2siw(wr->opcode);
		wr_id(wqe) = wr->wr_id;

		wr_flags(wqe) = wr->send_flags;
		if (qp->attrs.flags & SIW_SIGNAL_ALL_WR)
			wr_flags(wqe) |= IB_SEND_SIGNALED;

		if (wr->num_sge > qp->attrs.sq_max_sges) {
			/*
			 * NOTE: we allow for zero-length WRs here.
			 */
			dprint(DBG_WR, "(QP%d): Num SGE: %d\n",
				QP_ID(qp), wr->num_sge);
			rv = -EINVAL;
			break;
		}

		switch (wr->opcode) {

		case IB_WR_SEND:
			if (!SIW_INLINED_DATA(wqe)) {
				rv = siw_copy_sgl(wr->sg_list,
						  wqe->wr.send.sge,
						  wr->num_sge);
				wqe->wr.send.num_sge = wr->num_sge;
			} else
				rv = siw_copy_inline_sgl(wr, wqe);

			if (rv < 0) {
				rv = -EINVAL;
				break;
			}
			wqe->bytes = rv;
			break;

		case IB_WR_RDMA_READ:
			/*
			 * The OFED WR restricts the RREAD sink to an SGL
			 * containing one SGE only. We could relax this to an
			 * SGL with multiple elements referring to the SAME
			 * ltag, or even send a private per-RREQ tag referring
			 * to a checked local SGL with MULTIPLE ltags. This
			 * would be easy to do...
			 */
			if (wr->num_sge != 1) {
				rv = -EINVAL;
				break;
			}
			rv = siw_copy_sgl(wr->sg_list, wqe->wr.rread.sge, 1);
			/*
			 * NOTE: zero-length RREAD is allowed!
			 */
			wqe->wr.rread.raddr = wr->wr.rdma.remote_addr;
			wqe->wr.rread.rtag = wr->wr.rdma.rkey;
			wqe->wr.rread.num_sge = 1;
			wqe->bytes = rv;
			break;

		case IB_WR_RDMA_WRITE:
			if (!SIW_INLINED_DATA(wqe)) {
				rv = siw_copy_sgl(wr->sg_list,
						  wqe->wr.write.sge,
						  wr->num_sge);
				wqe->wr.write.num_sge = wr->num_sge;
			} else
				rv = siw_copy_inline_sgl(wr, wqe);
			/*
			 * NOTE: zero-length WRITE is allowed!
			 */
			if (rv < 0) {
				rv = -EINVAL;
				break;
			}
			wqe->wr.write.raddr = wr->wr.rdma.remote_addr;
			wqe->wr.write.rtag = wr->wr.rdma.rkey;
			wqe->bytes = rv;
			break;

		default:
			dprint(DBG_WR|DBG_TX|DBG_ON,
				"(QP%d): Opcode %d not yet implemented\n",
				QP_ID(qp), wr->opcode);
			wqe->wr.sgl.num_sge = 0;
			rv = -ENOSYS;
			break;
		}
		dprint(DBG_WR|DBG_TX, "(QP%d): opcode %d, bytes %d, "
			"flags 0x%x\n",
			QP_ID(qp), wr_type(wqe), wqe->bytes, wr_flags(wqe));

		if (rv < 0)
			break;

		wqe->wr_status = SR_WR_QUEUED;

		lock_sq_rxsave(qp, flags);
		list_add_tail(&wqe->list, &qp->sq);
		unlock_sq_rxsave(qp, flags);

		wr = wr->next;
	}
	/*
	 * Send directly if SQ processing is not in progress.
	 * Possible immediate errors (rv < 0) do not affect the involved
	 * RI resources (Verbs, 8.3.1) and thus do not prevent SQ processing
	 * if new work is already pending. But rv must still be passed back
	 * to the caller.
	 */
	lock_sq_rxsave(qp, flags);

	if (tx_wqe(qp) == NULL) {
		struct siw_wqe *next = siw_next_tx_wqe(qp);

		if (next != NULL) {
			if (wr_type(next) != SIW_WR_RDMA_READ_REQ ||
			    !ORD_SUSPEND_SQ(qp)) {
				tx_wqe(qp) = next;
				if (wr_type(next) != SIW_WR_RDMA_READ_REQ)
					list_del_init(&next->list);
				else
					siw_rreq_queue(next, qp);

				unlock_sq_rxsave(qp, flags);

				dprint(DBG_WR|DBG_TX,
					"(QP%d): Direct sending...\n",
					QP_ID(qp));

				if (qp->attrs.flags & SIW_KERNEL_VERBS)
					siw_sq_queue_work(qp);
				else if (siw_qp_sq_process(qp, 1) != 0 &&
					 !(qp->tx_ctx.tx_suspend))
					siw_qp_cm_drop(qp, 0);
			} else
				unlock_sq_rxsave(qp, flags);
		} else
			unlock_sq_rxsave(qp, flags);
	} else
		unlock_sq_rxsave(qp, flags);

	up_read(&qp->state_lock);

	dprint(DBG_WR|DBG_TX, "(QP%d): sq_space(#2)=%d\n",
		QP_ID(qp), atomic_read(&qp->sq_space));

	if (rv >= 0)
		return 0;
	/*
	 * Immediate error
	 */
	dprint(DBG_WR|DBG_ON, "(QP%d): error=%d\n", QP_ID(qp), rv);

	if (wqe != NULL)
		siw_wqe_put(wqe);
	*bad_wr = wr;
	return rv;
}
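
/*
 * Illustrative sketch only, not part of the original driver: how a kernel
 * ULP might hand a single signalled SEND to siw_post_send() through the
 * generic ib_post_send() verb. The function name and all parameters
 * (DMA address, length, lkey, wr_id) are placeholders assumed for the
 * example.
 */
static inline int siw_example_post_one_send(struct ib_qp *qp, u64 dma_addr,
					    u32 length, u32 lkey, u64 wr_id)
{
	struct ib_sge sge = {
		.addr	= dma_addr,	/* DMA-mapped source buffer */
		.length	= length,
		.lkey	= lkey,		/* from a registered MR */
	};
	struct ib_send_wr wr, *bad_wr;

	memset(&wr, 0, sizeof(wr));
	wr.wr_id	= wr_id;	/* reported in the work completion */
	wr.opcode	= IB_WR_SEND;
	wr.send_flags	= IB_SEND_SIGNALED;
	wr.sg_list	= &sge;
	wr.num_sge	= 1;

	/* Fails with -ENOTCONN if the QP is not in RTS, see siw_post_send() */
	return ib_post_send(qp, &wr, &bad_wr);
}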