/**
 * Push the local buffer `buf` (buf_sz bytes) to the peer with scif_vwriteto(),
 * at most `block_sz` bytes per call, starting at remote offset `roff`.
 *
 * After the write loop a fence signal is issued so that the value 0xff is
 * written at remote offset `roff + buf_sz` (directly behind the payload)
 * once the RMAs initiated on this endpoint have completed.  The fence is
 * issued even when the write loop stopped early on an error, as before.
 *
 * Errors from SCIF are reported on std::cerr; they are not propagated other
 * than through a short byte count.
 *
 * @return number of bytes for which scif_vwriteto() returned success
 *         (equals buf_sz when every chunk was accepted).
 */
int VwritetoSender::send_payload ()
{
	std::uint8_t *buf_ptr = buf;
	std::uint8_t *buf_end = buf + buf_sz;
	off_t roff_idx = roff; /* remote offset of the next chunk */

	while (buf_ptr < buf_end) {
		const int bytes = std::min (block_sz, (int)(buf_end - buf_ptr));

		/* A non-positive chunk would never advance buf_ptr: stop instead of spinning. */
		if (bytes <= 0) {
			std::cerr << "ERROR: send_payload: invalid block size " << block_sz << std::endl;
			break;
		}

		if (scif_vwriteto (epd, buf_ptr, bytes, roff_idx, 0) < 0) {
			std::cerr << "ERROR: scif_vwriteto: " << std::strerror (errno) << std::endl;
			break;
		}
		buf_ptr += bytes;
		roff_idx += bytes;
	}

	/* synchronize: remote side sees 0xff at roff + buf_sz after the writes above */
	if (scif_fence_signal (epd, 0, 0, roff + buf_sz, 0xff,
			       SCIF_FENCE_INIT_SELF | SCIF_SIGNAL_REMOTE) < 0)
		std::cerr << "ERROR: scif_fence_signal: " << std::strerror (errno) << std::endl;

	return static_cast<int> (buf_ptr - buf);
}
/* Example #2 */
/* 0 */
/*
 * m_pi_post_writeto - forward completed RDMA reads (RR) to their destination
 * with scif_writeto().
 *
 * Called with rxlock held.  Starting at the write-to tail (wr_tl_r_wt), walks
 * the receive WR ring and processes every entry flagged M_READ_POSTED until
 * the signaled marker (wr_sig) or the ring head (wr_hd_r) is reached, or no
 * RR's remain pending (pi_rr_cnt == 0).
 *
 * For each such WR:
 *   - SEND / RDMA_WRITE_WITH_IMM opcodes take a posted receive (m_sr); if
 *     none is available the WR is flagged M_RECV_PAUSED and the function
 *     returns immediately.
 *   - non-SEND opcodes translate remote_addr/rkey to a SCIF offset with
 *     m_pi_mr_trans() and use a single destination segment.
 *   - the data in sg[2] is written to the destination, spread across the
 *     posted receive segments when there are several.
 *   - a local fence signal writes org_id into the WR entry (l_off_wr) and
 *     the WR flags move from M_READ_POSTED to M_READ_DONE | M_READ_WRITE_TO.
 *
 * m_qp:   queue pair whose RX work requests are processed
 * wr_sig: signaled WR marking the last entry to process
 * wc:     work completion that triggered the call (only logged here)
 *
 * On any failure the error is logged and the function returns without
 * signaling the rx thread.
 */
static void m_pi_post_writeto(struct mcm_qp *m_qp, struct mcm_wr_rx *wr_sig, struct ibv_wc *wc)
{
	mcm_scif_dev_t *smd = m_qp->smd;
	struct mcm_wr_rx *wr_rx;
	struct mcm_sr *m_sr;
	off_t l_off, l_off_wr, r_off;
	int ret, i, l_start, l_end, l_len, sg_len, w_len, num_sge, wr_idx, wr_cnt = 0;
	int wt_flag;

	wr_idx = m_qp->wr_tl_r_wt; /* from WT tail, process RR's posted until reaching wr_last */

	while (m_qp->pi_rr_cnt) { /* RR's pending */
		wr_rx = (struct mcm_wr_rx *)(m_qp->wrc.wr_addr + (m_qp->wrc.wr_sz * wr_idx));

		if (!(wr_rx->flags & M_READ_POSTED)) {
			/* reached RR signaled marker, or head pointer */
			if (wr_idx == wr_sig->w_idx || wr_idx == m_qp->wr_hd_r)
				break;

			wr_idx = (wr_idx + 1) & m_qp->wrc.wr_end; /* next WR */
			continue;
		}
		wr_cnt++;
#if MCM_PROFILE
		if (wr_rx == wr_sig)
			mcm_qp_prof_ts(m_qp, MCM_QP_IB_RR, wr_rx->time, wr_rx->qcnt, wr_cnt);
#endif
		mlog(4, " WR_rx[%d-%d] %p m_qp %p wc %p wc->op %x wr_rx->wr.op %x\n",
			wr_rx->w_idx, wr_sig->w_idx, wr_rx, m_qp, wc,
			wc->opcode, wr_rx->wr.opcode);

		m_qp->pi_rr_cnt--; /* rdma read complete */
		MCNTR(smd->md, MCM_QP_READ_DONE);

		/*
		 * The SR link is per WR: clear it so a posted receive taken for an
		 * earlier WR in this pass is not updated again by a later plain
		 * RDMA write.
		 */
		m_sr = NULL;

		/* if SR or RW_imm, need a posted receive */
		if ((wr_rx->wr.opcode & IBV_WR_SEND) ||
		    (wr_rx->wr.opcode & IBV_WR_RDMA_WRITE_WITH_IMM)) {
			m_sr = m_pi_get_sr(m_qp, wr_rx->w_idx);
			if (!m_sr) {
				mlog(0, " WARNING: SR stalled, no RCV messages posted"
					" m_qp %p, sr_tl %d sr_hd %d\n",
					m_qp, m_qp->sr_tl, m_qp->sr_hd);
				wr_rx->flags |= M_RECV_PAUSED;
				return;
			}
			wr_rx->s_idx = m_sr->s_idx; /* link WR_RX and SR */
			m_sr->len = 0;
			num_sge = m_sr->num_sge;
			sg_len = m_sr->sg[0].length;
			r_off = m_sr->sg[0].addr; /* post recv buffer address */
			mlog(4, " WR SR or RW_IMM: m_sr[%d] %p -> scif r_off %Lx ln %d\n",
				m_sr->s_idx, m_sr, r_off, sg_len);
		}
		/* need to translate to rdma write dst */
		if (!(wr_rx->wr.opcode & IBV_WR_SEND)) {
			num_sge = 1;
			sg_len = wr_rx->sg[2].length;
			r_off = m_pi_mr_trans(smd, wr_rx->wr.wr.rdma.remote_addr,
					      wr_rx->wr.wr.rdma.rkey, sg_len);
			if (!r_off)
				goto bail;

			mlog(4, " RDMA_WRITE op: wr_rx[%d] %p -> scif r_off %Lx len %d\n",
				 wr_rx->w_idx, wr_rx, r_off, sg_len);
		}

		/* sg[0] entry == proxy-out buffer, src for IB RR */
		/* sg[1] entry == proxy-in buffer, dst for IB RR */
		/* sg[2] entry == proxy-in buffer src for scif_writeto */
		/* wr.rdma.remote_addr, wr.rdma.rkey, dst for scif_writeto - TPT to scif offset */
		wr_rx->wr.wr_id = 0;
		l_off_wr = (uint64_t) (m_qp->wr_off_r + (wr_rx->w_idx * m_qp->wrc.wr_sz));
		l_off = wr_rx->sg[2].addr;
		l_len = wr_rx->sg[2].length;
		l_start = l_off - (uint64_t)smd->m_offset_r; /* l_start/l_end are only used for logging */
		l_end = l_start + l_len;

		for (i=0; (i<num_sge && l_len); i++) {
			w_len = min(sg_len, l_len);
			wt_flag = 0;
			mlog(4, " WR_rx[%d] %p writeto l_off %Lx r_off %Lx rb_off 0x%x-0x%x ln %d org_id %Lx tl %d hd %d\n",
				wr_rx->w_idx, wr_rx, l_off, r_off, l_start, l_end, w_len, wr_rx->org_id,
				m_qp->wr_tl_r, m_qp->wr_hd_r);
#if MCM_PROFILE
			wr_rx->time = mcm_ts_us();
			wr_rx->qcnt = m_qp->post_cnt_wt;
#endif
			/* small transfers use the CPU copy path */
			if (w_len < 256)
				wt_flag = SCIF_RMA_USECPU;

			ret = scif_writeto(smd->scif_tx_ep, l_off, w_len, r_off, wt_flag);

			if (ret) {
				mlog(0, " ERR: scif_writeto, ret %d err: %d %s\n",
					ret, errno, strerror(errno));
				goto bail;
			}
			MCNTR(smd->md, MCM_SCIF_WRITE_TO);

			/* adjust for multiple SG entries on post_recv */
			l_off += w_len;
			l_len = l_len - w_len;
			if (m_sr) {
				m_sr->len += w_len;
				/*
				 * Advance to the NEXT receive segment; entry i was just
				 * consumed.  Stay inside the num_sge entries this loop
				 * is allowed to use.
				 */
				if ((i + 1) < num_sge) {
					r_off = m_sr->sg[i + 1].addr;
					sg_len = m_sr->sg[i + 1].length;
				}
			}
		}
		if (l_len) {
			mlog(0, " ERR: RX overrun: written %d remaining %d sge's %d\n",
				wr_rx->sg[2].length, l_len, num_sge);
			goto bail;
		}

		/* signal last segment */
		mlog(4, " SCIF_fence_signal: l_off_wr %p, wr_rx %p wr_idx %d\n",
			l_off_wr, wr_rx, wr_rx->w_idx);

		/* once the writes above complete, org_id is written locally at l_off_wr */
		ret = scif_fence_signal(smd->scif_tx_ep, l_off_wr, wr_rx->org_id, 0, 0,
					SCIF_FENCE_INIT_SELF | SCIF_SIGNAL_LOCAL);
		if (ret) {
			mlog(0," ERR: scif_fence_signal, ret %d %s\n", ret, strerror(errno));
			goto bail;
		}
		MCNTR(smd->md, MCM_SCIF_SIGNAL);
		wr_rx->flags &= ~M_READ_POSTED; /* reset READ_POSTED */
		wr_rx->flags |= M_READ_DONE;
		wr_rx->flags |= M_READ_WRITE_TO;
		m_qp->post_cnt_wt++;

		/* reached RR signaled marker, or head */
		if (wr_idx == wr_sig->w_idx || wr_idx == m_qp->wr_hd_r)
			break;

		wr_idx = (wr_idx + 1) & m_qp->wrc.wr_end; /* next WR */
	}
	/* wake the rx thread; note sizeof "w" is 2, so the NUL terminator is sent too */
	write(smd->md->mc->rx_pipe[1], "w", sizeof "w"); /* signal rx_thread */
	return;
bail:
	/* NOTE(review): the failure is only logged here; no WC is sent back to proxy-out */
	mlog(0, " ERR: writeto: wr_rx[%d] %p -> raddr %Lx rkey %x (scif r_off %Lx) len %d\n",
		wr_rx->w_idx, wr_rx, wr_rx->wr.wr.rdma.remote_addr,
		wr_rx->wr.wr.rdma.rkey, r_off, sg_len);

	return;
}