/*
 * send_sreq_data - describe sreq's user buffer as an iov and pass it to
 * do_dma_send().
 *
 * vc        - virtual connection forwarded to do_dma_send()
 * sreq      - send request; its dev.iov / dev.iov_count (and, for
 *             noncontiguous data, the dev.segment_* fields) are filled in here
 * s_cookiep - knem cookie pointer forwarded to do_dma_send()
 *
 * Opens the knem device first when knem_fd is still negative.
 * Returns MPI_SUCCESS or the error code produced by a failing step.
 */
static int send_sreq_data(MPIDI_VC_t *vc, MPID_Request *sreq, knem_cookie_t *s_cookiep)
{
    int mpi_errno = MPI_SUCCESS;
    int is_contig;
    MPI_Aint true_lb;
    MPIDI_msg_sz_t nbytes;
    MPID_Datatype *dtype_ptr;

    /* MT: this code assumes only one thread can be at this point at a time */
    if (knem_fd < 0) {
        mpi_errno = open_knem_dev();
        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
    }

    /* query contiguity, total size and true lower bound of the datatype */
    MPIDI_Datatype_get_info(sreq->dev.user_count, sreq->dev.datatype,
                            is_contig, nbytes, dtype_ptr, true_lb);

    if (is_contig) {
        /* contiguous data: a single iov entry built by hand is enough */
        sreq->dev.iov_count = 1;
        sreq->dev.iov[0].MPID_IOV_BUF = (char *)sreq->dev.user_buf + true_lb;
        sreq->dev.iov[0].MPID_IOV_LEN = nbytes;
    }
    else if (sreq->dev.segment_ptr == NULL) {
        /* Noncontiguous data, first pass: let the segment routines build the
           iov.  segment_ptr may be non-null when this is a continuation of a
           many-part message that we couldn't fit in one single flight of
           iovs; in that case nothing is reloaded here. */
        sreq->dev.iov_offset = 0;
        sreq->dev.iov_count = MPID_IOV_LIMIT;

        sreq->dev.segment_ptr = MPID_Segment_alloc();
        MPIU_ERR_CHKANDJUMP1((sreq->dev.segment_ptr == NULL), mpi_errno, MPI_ERR_OTHER,
                             "**nomem", "**nomem %s", "MPID_Segment_alloc");

        MPID_Segment_init(sreq->dev.user_buf, sreq->dev.user_count,
                          sreq->dev.datatype, sreq->dev.segment_ptr, 0);
        sreq->dev.segment_size = nbytes;
        sreq->dev.segment_first = 0;

        /* FIXME we should write our own function that isn't dependent on
           the in-request iov array.  This will let us use IOVs that are
           larger than MPID_IOV_LIMIT. */
        mpi_errno = MPIDI_CH3U_Request_load_send_iov(sreq, &sreq->dev.iov[0],
                                                     &sreq->dev.iov_count);
        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
    }

    mpi_errno = do_dma_send(vc, sreq, sreq->dev.iov_count, sreq->dev.iov, s_cookiep);
    if (mpi_errno) MPIU_ERR_POP(mpi_errno);

fn_exit:
    return mpi_errno;
fn_fail:
    goto fn_exit;
}
/* MPIDI_CH3_SendNoncontig_iov - Sends a message by loading an IOV and calling
   iSendv.  The caller must initialize sreq->dev.segment as well as
   segment_first and segment_size.

   vc     - virtual connection to send on
   sreq   - send request whose data is loaded into the IOV
   header - packet header, placed in IOV slot 0
   hdr_sz - length of the header in bytes

   On either failure path the request is released before the error code is
   set.  Returns MPI_SUCCESS or an MPI error code. */
int MPIDI_CH3_SendNoncontig_iov( MPIDI_VC_t *vc, MPID_Request *sreq,
                                 void *header, MPIDI_msg_sz_t hdr_sz )
{
    int mpi_errno = MPI_SUCCESS;
    int iov_n;
    MPL_IOV iov[MPL_IOV_LIMIT];
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3_SENDNONCONTIG_IOV);

    MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3_SENDNONCONTIG_IOV);

    /* slot 0 always carries the header */
    iov[0].MPL_IOV_BUF = header;
    iov[0].MPL_IOV_LEN = hdr_sz;

    /* One slot is taken by the header.  When an extended packet header
     * exists, one more slot is left free before loading data, so that there
     * will be enough IOVs for hdr/ext_hdr/data. */
    iov_n = (sreq->dev.ext_hdr_sz > 0) ? MPL_IOV_LIMIT - 2 : MPL_IOV_LIMIT - 1;

    mpi_errno = MPIDI_CH3U_Request_load_send_iov(sreq, &iov[1], &iov_n);
    /* --BEGIN ERROR HANDLING-- */
    if (mpi_errno != MPI_SUCCESS) {
        MPID_Request_release(sreq);
        MPIR_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**ch3|loadsendiov");
    }
    /* --END ERROR HANDLING-- */

    /* account for the header slot */
    ++iov_n;

    /* Note this routine is invoked within a CH3 critical section, so the
       per-object vc->pobj_mutex is not taken around the send. */
    mpi_errno = MPIDI_CH3_iSendv(vc, sreq, iov, iov_n);
    /* --BEGIN ERROR HANDLING-- */
    if (mpi_errno != MPI_SUCCESS) {
        MPID_Request_release(sreq);
        MPIR_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**ch3|eagermsg");
    }
    /* --END ERROR HANDLING-- */

    /* Note that in the non-blocking case, we need to add a ref to the
       datatypes */

 fn_exit:
    MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3_SENDNONCONTIG_IOV);
    return mpi_errno;
 fn_fail:
    goto fn_exit;
}
/* Fills in req->dev.iov{,_offset,_count} based on the datatype info in the
   request, creating a segment if necessary.

   Contiguous data gets a single hand-built iov entry; noncontiguous data
   allocates and initializes a segment and loads the iov from it (the request
   must not already own a segment).  Returns MPI_SUCCESS or an MPI error
   code. */
static int populate_iov_from_req(MPID_Request *req)
{
    int mpi_errno = MPI_SUCCESS;
    int is_contig;
    MPI_Aint true_lb;
    MPIDI_msg_sz_t nbytes;
    MPID_Datatype *dtype_ptr;

    /* query contiguity, total size and true lower bound of the datatype */
    MPIDI_Datatype_get_info(req->dev.user_count, req->dev.datatype,
                            is_contig, nbytes, dtype_ptr, true_lb);

    if (!is_contig) {
        /* noncontiguous: the segment routines build the iovec */
        MPIU_Assert(req->dev.segment_ptr == NULL);

        req->dev.iov_offset = 0;
        req->dev.iov_count = MPL_IOV_LIMIT;

        /* XXX DJG FIXME where is this segment freed? */
        req->dev.segment_ptr = MPID_Segment_alloc();
        MPIR_ERR_CHKANDJUMP1((req->dev.segment_ptr == NULL), mpi_errno, MPI_ERR_OTHER,
                             "**nomem", "**nomem %s", "MPID_Segment_alloc");

        MPID_Segment_init(req->dev.user_buf, req->dev.user_count,
                          req->dev.datatype, req->dev.segment_ptr, 0);
        req->dev.segment_size = nbytes;
        req->dev.segment_first = 0;

        /* FIXME we should write our own function that isn't dependent on
           the in-request iov array.  This will let us use IOVs that are
           larger than MPL_IOV_LIMIT. */
        mpi_errno = MPIDI_CH3U_Request_load_send_iov(req, &req->dev.iov[0],
                                                     &req->dev.iov_count);
        if (mpi_errno) MPIR_ERR_POP(mpi_errno);
    }
    else {
        /* contiguous: one entry covering the whole buffer */
        req->dev.iov_count = 1;
        req->dev.iov[0].MPL_IOV_BUF = (char *)req->dev.user_buf + true_lb;
        req->dev.iov[0].MPL_IOV_LEN = nbytes;
    }

fn_fail:
    return mpi_errno;
}
/*
 * MPIDI_CH3I_MRAIL_Prepare_rndv_transfer - set up sreq for the rendezvous
 * protocol announced in rndv.
 *
 * sreq - send request; contains local info
 * rndv - rendezvous info supplying protocol, buf_addr and memhandle
 *
 * For VAPI_PROTOCOL_R3 any registration entry and any allocated rendezvous
 * buffer held by the request are released and the remote fields are cleared.
 * Otherwise the remote address/handle are recorded and, when the request has
 * an allocated rendezvous buffer, the data described by the request's iov is
 * copied into it.  Always returns MPI_SUCCESS; a failed iov reload goes
 * through udapl_error_abort().
 */
int MPIDI_CH3I_MRAIL_Prepare_rndv_transfer (MPID_Request * sreq,    /* contains local info */
                                            MPIDI_CH3I_MRAILI_Rndv_info_t * rndv)
{
    int mpi_errno = MPI_SUCCESS;
    int idx;
    aint_t cursor;

    if (rndv->protocol == VAPI_PROTOCOL_R3) {
        if (sreq->mrail.d_entry != NULL) {
            dreg_unregister (sreq->mrail.d_entry);
            sreq->mrail.d_entry = NULL;
        }
        if (1 == sreq->mrail.rndv_buf_alloc && NULL != sreq->mrail.rndv_buf) {
            MPIU_Free (sreq->mrail.rndv_buf);
            sreq->mrail.rndv_buf_alloc = 0;
            sreq->mrail.rndv_buf = NULL;
        }
        sreq->mrail.remote_addr = NULL;
        sreq->mrail.remote_handle.hndl = DAT_HANDLE_NULL;
        sreq->mrail.protocol = VAPI_PROTOCOL_R3;
        return MPI_SUCCESS;
    }

    sreq->mrail.remote_addr = rndv->buf_addr;
    sreq->mrail.remote_handle = rndv->memhandle;
    DEBUG_PRINT ("[add rndv list] addr %p, key %p\n",
                 sreq->mrail.remote_addr, sreq->mrail.remote_handle.rkey);

    if (1 != sreq->mrail.rndv_buf_alloc) {
        /* no allocated rendezvous buffer to fill */
        return MPI_SUCCESS;
    }

    /* cursor walks through rndv_buf as iov entries are copied in */
    cursor = (aint_t) sreq->mrail.rndv_buf;
    for (;;) {
        /* copy the currently loaded flight of iov entries */
        for (idx = 0; idx < sreq->dev.iov_count; idx++) {
            MPIU_Memcpy ((void *) cursor,
                         sreq->dev.iov[idx].MPID_IOV_BUF,
                         sreq->dev.iov[idx].MPID_IOV_LEN);
            cursor += sreq->dev.iov[idx].MPID_IOV_LEN;
        }

        /* TODO: Following part is a workaround to deal with datatype with
         * large number of segments. We check if the datatype has finished
         * loading and reload if not.  May be better interface with upper
         * layer should be considered */
        if (sreq->dev.OnDataAvail != MPIDI_CH3_ReqHandler_SendReloadIOV) {
            break;
        }

        sreq->dev.iov_count = MPID_IOV_LIMIT;
        mpi_errno = MPIDI_CH3U_Request_load_send_iov (sreq, sreq->dev.iov,
                                                      &sreq->dev.iov_count);
        /* --BEGIN ERROR HANDLING-- */
        if (mpi_errno != MPI_SUCCESS) {
            udapl_error_abort (UDAPL_STATUS_ERR, "Reload iov error");
        }
        /* --END ERROR HANDLING-- */
    }

    return MPI_SUCCESS;
}
/*
 * MPID_nem_scif_SendNoncontig - send a header plus the request's
 * noncontiguous data over the SCIF connection of vc.
 *
 * vc     - virtual connection to send on
 * sreq   - send request; any unsent part of the iov is stored in
 *          sreq->dev.iov and the request is queued on the vc's send queue
 * header - packet header; always sent as sizeof(MPIDI_CH3_Pkt_t) bytes
 * hdr_sz - header size, asserted to be at most sizeof(MPIDI_CH3_Pkt_t)
 *
 * An immediate write is attempted only when the send queue is empty and
 * MPID_nem_scif_poll_send() reports the connection ready.  On failure the
 * request's reference count is forced to 0 and the request is destroyed.
 * Returns MPI_SUCCESS or an MPI error code.
 */
int MPID_nem_scif_SendNoncontig(MPIDI_VC_t * vc, MPID_Request * sreq, void *header,
                                MPIDI_msg_sz_t hdr_sz)
{
    int mpi_errno = MPI_SUCCESS;
    int iov_n;
    int idx;
    int queue_was_empty;
    MPID_IOV iov[MPID_IOV_LIMIT];
    MPIDI_msg_sz_t sent = 0;
    int complete;
    uint64_t seqno = 0;
    MPID_nem_scif_vc_area *vc_scif = VC_SCIF(vc);
    MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_SCIF_SENDNONCONTIG);

    MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_SCIF_SENDNONCONTIG);
    MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, "scif_SendNoncontig");
    MPIU_Assert(hdr_sz <= sizeof(MPIDI_CH3_Pkt_t));

    /* slot 0 is the header; the data iov is loaded behind it */
    iov[0].MPID_IOV_BUF = header;
    iov[0].MPID_IOV_LEN = sizeof(MPIDI_CH3_Pkt_t);

    iov_n = MPID_IOV_LIMIT - 1;
    mpi_errno = MPIDI_CH3U_Request_load_send_iov(sreq, &iov[1], &iov_n);
    MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**loadsendiov");
    ++iov_n;    /* count the header slot */

    /* try to write right away; otherwise `sent` stays 0 */
    if (MPIDI_CH3I_Sendq_empty(vc_scif->send_queue) &&
        MPID_nem_scif_poll_send(vc_scif->sc->fd, &vc_scif->sc->csend)) {
        sent = MPID_nem_scif_writev(vc_scif->sc->fd, &vc_scif->sc->csend, iov, iov_n, &seqno);
        MPIU_ERR_CHKANDJUMP1(sent <= 0, mpi_errno, MPI_ERR_OTHER,
                             "**scif_writev", "**scif_writev %s", MPIU_Strerror(errno));
        MPIU_DBG_MSG_D(CH3_CHANNEL, VERBOSE, "scif_send noncontig " MPIDI_MSG_SZ_FMT, sent);
    }

    if (sent < iov[0].MPID_IOV_LEN) {
        /* header was not yet (fully) sent: keep a copy in the request and
         * point slot 0 at that copy */
        sreq->dev.pending_pkt = *(MPIDI_CH3_Pkt_t *) header;
        iov[0].MPID_IOV_BUF = (MPID_IOV_BUF_CAST) & sreq->dev.pending_pkt;
        iov[0].MPID_IOV_LEN = sizeof(MPIDI_CH3_Pkt_t);
    }

    /* Walk the iov, consuming `sent` bytes; whatever is left over is saved
     * in the request's own iov array. */
    complete = 1;
    sreq->dev.iov_count = 0;
    for (idx = 0; idx < iov_n; ++idx) {
        if (sent < iov[idx].MPID_IOV_LEN) {
            sreq->dev.iov[sreq->dev.iov_count].MPID_IOV_BUF =
                (MPID_IOV_BUF_CAST) ((char *) iov[idx].MPID_IOV_BUF + sent);
            sreq->dev.iov[sreq->dev.iov_count].MPID_IOV_LEN = iov[idx].MPID_IOV_LEN - sent;
            sent = 0;
            ++sreq->dev.iov_count;
            complete = 0;
            seqno = 0;
        }
        else {
            sent -= iov[idx].MPID_IOV_LEN;
        }
    }

    /* everything was handed to writev: completion depends on the seqno check */
    if (seqno)
        complete = MPID_nem_scif_chk_seqno(&vc_scif->sc->csend, seqno);

    if (complete) {
        /* sent whole iov */
        int (*on_data_avail) (MPIDI_VC_t *, MPID_Request *, int *);

        on_data_avail = sreq->dev.OnDataAvail;
        if (!on_data_avail) {
            MPIDI_CH3U_Request_complete(sreq);
            MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, ".... complete");
            goto fn_exit;
        }

        complete = 0;
        mpi_errno = on_data_avail(vc, sreq, &complete);
        if (mpi_errno) MPIU_ERR_POP(mpi_errno);

        if (complete) {
            MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, ".... complete");
            goto fn_exit;
        }
        seqno = 0;
    }

    /* enqueue request */
    MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, "enqueuing");
    MPIU_Assert(seqno || (sreq->dev.iov_count >= 1 && sreq->dev.iov[0].MPID_IOV_LEN > 0));

    RQ_SCIF(sreq)->seqno = seqno;
    sreq->ch.vc = vc;
    sreq->dev.iov_offset = 0;

    /* Sample the queue state before enqueuing: if other sends were already
     * queued ahead of this one, try to send from the queue afterwards. */
    queue_was_empty = MPIDI_CH3I_Sendq_empty(vc_scif->send_queue);
    MPIDI_CH3I_Sendq_enqueue(&vc_scif->send_queue, sreq);
    if (!queue_was_empty) {
        mpi_errno = MPID_nem_scif_send_queued(vc, &vc_scif->send_queue);
        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
    }

  fn_exit:
    MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_SCIF_SENDNONCONTIG);
    return mpi_errno;
  fn_fail:
    MPIU_Object_set_ref(sreq, 0);
    MPIDI_CH3_Request_destroy(sreq);
    goto fn_exit;
}