int VwritetoSender::send_payload () { std::uint8_t *buf_ptr = buf; std::uint8_t *buf_end = buf + buf_sz; off_t roff_idx = roff; //First cacheline reserved for scif_fence_signal int err = 0, bytes; int mark; while (buf_ptr < buf_end) { bytes = std::min (block_sz, (int)(buf_end - buf_ptr)); err = scif_vwriteto (epd, buf_ptr, bytes, roff_idx, 0); if (err < 0) { std::cerr << "ERROR: scif_vwriteto: " << std::strerror (errno) << std::endl; break; } buf_ptr += bytes; roff_idx += bytes; } /*synchronize */ scif_fence_signal (epd, 0, 0, roff + buf_sz, 0xff, SCIF_FENCE_INIT_SELF | SCIF_SIGNAL_REMOTE); return buf_ptr - buf; }
/* called with rxlock, process all RR's up to signal marker at wr_last */ static void m_pi_post_writeto(struct mcm_qp *m_qp, struct mcm_wr_rx *wr_sig, struct ibv_wc *wc) { mcm_scif_dev_t *smd = m_qp->smd; struct mcm_wr_rx *wr_rx; struct mcm_sr *m_sr = NULL; off_t l_off, l_off_wr, r_off; int ret, i, l_start, l_end, l_len, sg_len, w_len, num_sge, wr_idx, wr_cnt = 0; int wt_flag; wr_idx = m_qp->wr_tl_r_wt; /* from WT tail, process RR's posted until reaching wr_last */ while (m_qp->pi_rr_cnt) { /* RR's pending */ wr_rx = (struct mcm_wr_rx *)(m_qp->wrc.wr_addr + (m_qp->wrc.wr_sz * wr_idx)); if (!(wr_rx->flags & M_READ_POSTED)) { /* reached RR signaled marker, or head pointer */ if (wr_idx == wr_sig->w_idx || wr_idx == m_qp->wr_hd_r) break; wr_idx = (wr_idx + 1) & m_qp->wrc.wr_end; /* next WR */ continue; } wr_cnt++; #if MCM_PROFILE if (wr_rx == wr_sig) mcm_qp_prof_ts(m_qp, MCM_QP_IB_RR, wr_rx->time, wr_rx->qcnt, wr_cnt); #endif mlog(4, " WR_rx[%d-%d] %p m_qp %p wc %p wc->op %x wr_rx->wr.op %x\n", wr_rx->w_idx, wr_sig->w_idx, wr_rx, m_qp, wc, wc->opcode, wr_rx->wr.opcode); m_qp->pi_rr_cnt--; /* rdma read complete */ MCNTR(smd->md, MCM_QP_READ_DONE); /* if SR or RW_imm, need a posted receive */ if ((wr_rx->wr.opcode & IBV_WR_SEND) || (wr_rx->wr.opcode & IBV_WR_RDMA_WRITE_WITH_IMM)) { m_sr = m_pi_get_sr(m_qp, wr_rx->w_idx); if (!m_sr) { mlog(0, " WARNING: SR stalled, no RCV messages posted" " m_qp %p, sr_tl %d sr_hd %d\n", m_qp, m_qp->sr_tl, m_qp->sr_hd); wr_rx->flags |= M_RECV_PAUSED; return; } wr_rx->s_idx = m_sr->s_idx; /* link WR_RX and SR */ m_sr->len = 0; num_sge = m_sr->num_sge; sg_len = m_sr->sg[0].length; r_off = m_sr->sg[0].addr; /* post recv buffer address */ mlog(4, " WR SR or RW_IMM: m_sr[%d] %p -> scif r_off %Lx ln %d\n", m_sr->s_idx, m_sr, r_off, sg_len); } /* need to translate to rdma write dst */ if (!(wr_rx->wr.opcode & IBV_WR_SEND)) { num_sge = 1; sg_len = wr_rx->sg[2].length; r_off = m_pi_mr_trans(smd, wr_rx->wr.wr.rdma.remote_addr, wr_rx->wr.wr.rdma.rkey, sg_len); if (!r_off) goto bail; mlog(4, " RDMA_WRITE op: wr_rx[%d] %p -> scif r_off %Lx len %d\n", wr_rx->w_idx, wr_rx, r_off, sg_len, 0); } /* sg[0] entry == proxy-out buffer, src for IB RR */ /* sg[1] entry == proxy-in buffer, dst for IB RR */ /* sg[2] entry == proxy-in buffer src for scif_sendto */ /* wr.rdma.remote_addr, wr.rdma.rkey, dst for scif_sento - TPT to sci_off */ wr_rx->wr.wr_id = 0; l_off_wr = (uint64_t) (m_qp->wr_off_r + (wr_rx->w_idx * m_qp->wrc.wr_sz)); l_off = wr_rx->sg[2].addr; l_len = wr_rx->sg[2].length; l_start = l_off - (uint64_t)smd->m_offset_r; l_end = l_start + l_len; for (i=0; (i<num_sge && l_len); i++) { w_len = min(sg_len, l_len); wt_flag = 0; mlog(4, " WR_rx[%d] %p writeto l_off %Lx r_off %Lx rb_off 0x%x-0x%x ln %d org_id %Lx tl %d hd %d\n", wr_rx->w_idx, wr_rx, l_off, r_off, l_start, l_end, w_len, wr_rx->org_id, m_qp->wr_tl_r, m_qp->wr_hd_r); #if MCM_PROFILE wr_rx->time = mcm_ts_us(); wr_rx->qcnt = m_qp->post_cnt_wt; #endif if (w_len < 256) wt_flag = SCIF_RMA_USECPU; ret = scif_writeto(smd->scif_tx_ep, l_off, w_len, r_off, wt_flag); if (ret) { mlog(0, " ERR: scif_sendto, ret %d err: %d %s\n", ret, errno, strerror(errno)); goto bail; } MCNTR(smd->md, MCM_SCIF_WRITE_TO); /* adjust for multiple SG entries on post_recv */ l_off += w_len; l_len = l_len - w_len; if (m_sr) { m_sr->len += w_len; r_off = m_sr->sg[i].addr; /* next SR segment */ sg_len = m_sr->sg[i].length; } } if (l_len) { mlog(0, " ERR: RX overrun: written %d remaining %d sge's %d\n", wr_rx->sg[2].length, l_len, num_sge); goto bail; } /* signal last segment */ mlog(4, " SCIF_fence_signal: l_off_wr %p, wr_rx %p wr_idx %d\n", l_off_wr, wr_rx, wr_rx->w_idx); ret = scif_fence_signal(smd->scif_tx_ep, l_off_wr, wr_rx->org_id, 0, 0, SCIF_FENCE_INIT_SELF | SCIF_SIGNAL_LOCAL); if (ret) { mlog(0," ERR: scif_fence_signal, ret %d %s\n", ret, strerror(errno)); goto bail; } MCNTR(smd->md, MCM_SCIF_SIGNAL); wr_rx->flags &= ~M_READ_POSTED; /* reset READ_POSTED */ wr_rx->flags |= M_READ_DONE; wr_rx->flags |= M_READ_WRITE_TO; m_qp->post_cnt_wt++; /* reached RR signaled marker, or head */ if (wr_idx == wr_sig->w_idx || wr_idx == m_qp->wr_hd_r) break; wr_idx = (wr_idx + 1) & m_qp->wrc.wr_end; /* next WR */ } write(smd->md->mc->rx_pipe[1], "w", sizeof "w"); /* signal rx_thread */ return; bail: /* report error via WC back to proxy-out */ mlog(0, " ERR: writeto: wr_rx[%d] %p -> raddr %Lx rkey %x (scif r_off %Lx) len %d\n", wr_rx->w_idx, wr_rx, wr_rx->wr.wr.rdma.remote_addr, wr_rx->wr.wr.rdma.rkey, r_off, sg_len); return; }