/* MPIDI_CH3_EagerNoncontigSend - Eagerly send noncontiguous data */
int MPIDI_CH3_EagerNoncontigSend( MPID_Request **sreq_p,
                                  MPIDI_CH3_Pkt_type_t reqtype,
                                  const void * buf, MPI_Aint count,
                                  MPI_Datatype datatype, MPIDI_msg_sz_t data_sz,
                                  int rank, int tag, MPID_Comm * comm,
                                  int context_offset )
{
    int mpi_errno = MPI_SUCCESS;
    MPIDI_VC_t * vc;
    MPID_Request *sreq = *sreq_p;
    MPIDI_CH3_Pkt_t upkt;
    MPIDI_CH3_Pkt_eager_send_t * const eager_pkt = &upkt.eager_send;
#if defined(MPID_USE_SEQUENCE_NUMBERS)
    MPID_Seqnum_t seqnum;
#endif

    MPIU_DBG_MSG_FMT(CH3_OTHER,VERBOSE,(MPIU_DBG_FDEST,
                     "sending non-contiguous eager message, data_sz=" MPIDI_MSG_SZ_FMT,
                     data_sz));
    sreq->dev.OnDataAvail = 0;
    sreq->dev.OnFinal = 0;

    MPIDI_Pkt_init(eager_pkt, reqtype);
    eager_pkt->match.parts.rank = comm->rank;
    eager_pkt->match.parts.tag = tag;
    eager_pkt->match.parts.context_id = comm->context_id + context_offset;
    eager_pkt->sender_req_id = MPI_REQUEST_NULL;
    eager_pkt->data_sz = data_sz;

    MPIDI_Comm_get_vc_set_active(comm, rank, &vc);

    MPIDI_VC_FAI_send_seqnum(vc, seqnum);
    MPIDI_Pkt_set_seqnum(eager_pkt, seqnum);
    MPIDI_Request_set_seqnum(sreq, seqnum);

    MPIU_DBG_MSGPKT(vc,tag,eager_pkt->match.parts.context_id,rank,data_sz,
                    "Eager");

    sreq->dev.segment_ptr = MPID_Segment_alloc( );
    MPIR_ERR_CHKANDJUMP1((sreq->dev.segment_ptr == NULL), mpi_errno,
                         MPI_ERR_OTHER, "**nomem", "**nomem %s",
                         "MPID_Segment_alloc");

    MPID_Segment_init(buf, count, datatype, sreq->dev.segment_ptr, 0);
    sreq->dev.segment_first = 0;
    sreq->dev.segment_size = data_sz;

    MPID_THREAD_CS_ENTER(POBJ, vc->pobj_mutex);
    mpi_errno = vc->sendNoncontig_fn(vc, sreq, eager_pkt,
                                     sizeof(MPIDI_CH3_Pkt_eager_send_t));
    MPID_THREAD_CS_EXIT(POBJ, vc->pobj_mutex);
    if (mpi_errno) MPIR_ERR_POP(mpi_errno);

 fn_exit:
    return mpi_errno;
 fn_fail:
    *sreq_p = NULL;
    goto fn_exit;
}
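/* MPIDI_CH3_iStartRndvTransfer - Build and send the clear-to-send (CTS)
   packet that lets the sender start the rendezvous data transfer for rreq */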
int MPIDI_CH3_iStartRndvTransfer(MPIDI_VC_t * vc, MPID_Request * rreq)
{
    MPIDI_CH3_Pkt_t upkt;
    MPIDI_CH3_Pkt_rndv_clr_to_send_t *cts_pkt = &upkt.rndv_clr_to_send;
    MPID_Request *cts_req;
    MPID_Seqnum_t seqnum;
    int mpi_errno = MPI_SUCCESS;
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3I_ISTARTRNDVTRANSFER);

    MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3I_ISTARTRNDVTRANSFER);

#ifdef CKPT
    MPIDI_CH3I_CR_lock();
#endif

    MPIDI_Pkt_init(cts_pkt, MPIDI_CH3_PKT_RNDV_CLR_TO_SEND);
    if (rreq->dev.iov_count == 1 && rreq->dev.OnDataAvail == NULL)
        cts_pkt->recv_sz = rreq->dev.iov[0].MPID_IOV_LEN;
    else
        cts_pkt->recv_sz = rreq->dev.segment_size;

    cts_pkt->sender_req_id = rreq->dev.sender_req_id;
    cts_pkt->receiver_req_id = rreq->handle;

    MPIDI_VC_FAI_send_seqnum(vc, seqnum);
    MPIDI_Pkt_set_seqnum(cts_pkt, seqnum);

    mpi_errno = MPIDI_CH3_Prepare_rndv_cts(vc, cts_pkt, rreq);
    if (mpi_errno != MPI_SUCCESS) {
        mpi_errno = MPIR_Err_create_code(mpi_errno, MPIR_ERR_FATAL, FCNAME,
                                         __LINE__, MPI_ERR_OTHER,
                                         "**ch3|ctspkt", 0);
        goto fn_exit;
    }

    mpi_errno = MPIDI_CH3_iStartMsg(vc, cts_pkt, sizeof(*cts_pkt), &cts_req);
    /* --BEGIN ERROR HANDLING-- */
    if (mpi_errno != MPI_SUCCESS) {
        mpi_errno = MPIR_Err_create_code(mpi_errno, MPIR_ERR_FATAL, FCNAME,
                                         __LINE__, MPI_ERR_OTHER,
                                         "**ch3|ctspkt", 0);
        goto fn_exit;
    }
    /* --END ERROR HANDLING-- */

    if (cts_req != NULL) {
        MPID_Request_release(cts_req);
    }

 fn_exit:
#ifdef CKPT
    MPIDI_CH3I_CR_unlock();
#endif
    MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3I_ISTARTRNDVTRANSFER);
    return mpi_errno;
}
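/* MPIDI_CH3_EagerContigSend - Eagerly send contiguous data using a
   two-element IOV (packet header followed by the user buffer) */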
int MPIDI_CH3_EagerContigSend( MPID_Request **sreq_p,
                               MPIDI_CH3_Pkt_type_t reqtype,
                               const void * buf, MPIDI_msg_sz_t data_sz,
                               int rank, int tag, MPID_Comm * comm,
                               int context_offset )
{
    int mpi_errno = MPI_SUCCESS;
    MPIDI_VC_t * vc;
    MPIDI_CH3_Pkt_t upkt;
    MPIDI_CH3_Pkt_eager_send_t * const eager_pkt = &upkt.eager_send;
    MPID_Request *sreq = *sreq_p;
    MPL_IOV iov[2];
#if defined(MPID_USE_SEQUENCE_NUMBERS)
    MPID_Seqnum_t seqnum;
#endif

    MPIDI_Pkt_init(eager_pkt, reqtype);
    eager_pkt->match.parts.rank = comm->rank;
    eager_pkt->match.parts.tag = tag;
    eager_pkt->match.parts.context_id = comm->context_id + context_offset;
    eager_pkt->sender_req_id = MPI_REQUEST_NULL;
    eager_pkt->data_sz = data_sz;

    iov[0].MPL_IOV_BUF = (MPL_IOV_BUF_CAST)eager_pkt;
    iov[0].MPL_IOV_LEN = sizeof(*eager_pkt);

    MPIU_DBG_MSG_FMT(CH3_OTHER,VERBOSE,(MPIU_DBG_FDEST,
                     "sending contiguous eager message, data_sz=" MPIDI_MSG_SZ_FMT,
                     data_sz));

    iov[1].MPL_IOV_BUF = (MPL_IOV_BUF_CAST) buf;
    iov[1].MPL_IOV_LEN = data_sz;

    MPIDI_Comm_get_vc_set_active(comm, rank, &vc);

    MPIDI_VC_FAI_send_seqnum(vc, seqnum);
    MPIDI_Pkt_set_seqnum(eager_pkt, seqnum);

    MPIU_DBG_MSGPKT(vc,tag,eager_pkt->match.parts.context_id,rank,data_sz,
                    "EagerContig");

    MPID_THREAD_CS_ENTER(POBJ, vc->pobj_mutex);
    mpi_errno = MPIDI_CH3_iStartMsgv(vc, iov, 2, sreq_p);
    MPID_THREAD_CS_EXIT(POBJ, vc->pobj_mutex);
    if (mpi_errno != MPI_SUCCESS) {
        MPIR_ERR_SETANDJUMP(mpi_errno,MPI_ERR_OTHER,"**ch3|eagermsg");
    }

    sreq = *sreq_p;
    if (sreq != NULL) {
        MPIDI_Request_set_seqnum(sreq, seqnum);
        MPIDI_Request_set_type(sreq, MPIDI_REQUEST_TYPE_SEND);
    }

 fn_fail:
    return mpi_errno;
}
/* Send a zero-sized message with eager synchronous.  This is a temporary
   routine, as we may want to replace this with a counterpart to the
   Eager Short message */
int MPIDI_CH3_EagerSyncZero(MPIR_Request **sreq_p, int rank, int tag,
                            MPIR_Comm * comm, int context_offset )
{
    int mpi_errno = MPI_SUCCESS;
    MPIDI_CH3_Pkt_t upkt;
    MPIDI_CH3_Pkt_eager_sync_send_t * const es_pkt = &upkt.eager_sync_send;
    MPIDI_VC_t * vc;
    MPIR_Request *sreq = *sreq_p;
#if defined(MPID_USE_SEQUENCE_NUMBERS)
    MPID_Seqnum_t seqnum;
#endif

    MPL_DBG_MSG(MPIDI_CH3_DBG_OTHER,VERBOSE,"sending zero length message");

    /* MT FIXME what are the two operations we are waiting for?  the send and
     * the sync response? */
    MPIR_cc_set(&sreq->cc, 2);
    MPIDI_Request_set_msg_type(sreq, MPIDI_REQUEST_EAGER_MSG);
    sreq->dev.OnDataAvail = 0;

    MPIDI_Pkt_init(es_pkt, MPIDI_CH3_PKT_EAGER_SYNC_SEND);
    es_pkt->match.parts.rank = comm->rank;
    es_pkt->match.parts.tag = tag;
    es_pkt->match.parts.context_id = comm->context_id + context_offset;
    es_pkt->sender_req_id = sreq->handle;
    es_pkt->data_sz = 0;

    MPIDI_Comm_get_vc_set_active(comm, rank, &vc);

    MPIDI_VC_FAI_send_seqnum(vc, seqnum);
    MPIDI_Pkt_set_seqnum(es_pkt, seqnum);
    MPIDI_Request_set_seqnum(sreq, seqnum);

    MPL_DBG_MSGPKT(vc,tag,es_pkt->match.parts.context_id,rank,(intptr_t)0,
                   "EagerSync0");

    MPID_THREAD_CS_ENTER(POBJ, vc->pobj_mutex);
    mpi_errno = MPIDI_CH3_iSend(vc, sreq, es_pkt, sizeof(*es_pkt));
    MPID_THREAD_CS_EXIT(POBJ, vc->pobj_mutex);
    /* --BEGIN ERROR HANDLING-- */
    if (mpi_errno != MPI_SUCCESS) {
        MPIR_Request_free(sreq);
        *sreq_p = NULL;
        MPIR_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**ch3|eagermsg");
    }
    /* --END ERROR HANDLING-- */

 fn_exit:
    return mpi_errno;
 fn_fail:
    goto fn_exit;
}
/* MPIDI_CH3_RndvSend - Send a request to perform a rendezvous send */
int MPIDI_CH3_RndvSend( MPIR_Request **sreq_p, const void * buf,
                        MPI_Aint count, MPI_Datatype datatype, int dt_contig,
                        intptr_t data_sz, MPI_Aint dt_true_lb, int rank,
                        int tag, MPIR_Comm * comm, int context_offset )
{
    MPIDI_CH3_Pkt_t upkt;
    MPIDI_CH3_Pkt_rndv_req_to_send_t * const rts_pkt = &upkt.rndv_req_to_send;
    MPIDI_VC_t * vc;
    MPIR_Request * rts_sreq;
    MPIR_Request *sreq = *sreq_p;
    int mpi_errno = MPI_SUCCESS;
#if defined(MPID_USE_SEQUENCE_NUMBERS)
    MPID_Seqnum_t seqnum;
#endif

    MPL_DBG_MSG_D(MPIDI_CH3_DBG_OTHER,VERBOSE,
                  "sending rndv RTS, data_sz=%" PRIdPTR, data_sz);

    sreq->dev.OnDataAvail = 0;
    sreq->dev.partner_request = NULL;

    MPIDI_Pkt_init(rts_pkt, MPIDI_CH3_PKT_RNDV_REQ_TO_SEND);
    rts_pkt->match.parts.rank = comm->rank;
    rts_pkt->match.parts.tag = tag;
    rts_pkt->match.parts.context_id = comm->context_id + context_offset;
    rts_pkt->sender_req_id = sreq->handle;
    rts_pkt->data_sz = data_sz;

    MPIDI_Comm_get_vc_set_active(comm, rank, &vc);

    MPIDI_VC_FAI_send_seqnum(vc, seqnum);
    MPIDI_Pkt_set_seqnum(rts_pkt, seqnum);
    MPIDI_Request_set_seqnum(sreq, seqnum);

    MPL_DBG_MSGPKT(vc,tag,rts_pkt->match.parts.context_id,rank,data_sz,"Rndv");

    MPID_THREAD_CS_ENTER(POBJ, vc->pobj_mutex);
    mpi_errno = MPIDI_CH3_iStartMsg(vc, rts_pkt, sizeof(*rts_pkt), &rts_sreq);
    MPID_THREAD_CS_EXIT(POBJ, vc->pobj_mutex);
    /* --BEGIN ERROR HANDLING-- */
    if (mpi_errno != MPI_SUCCESS) {
        MPIR_Request_free(sreq);
        *sreq_p = NULL;
        MPIR_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**ch3|rtspkt");
    }
    /* --END ERROR HANDLING-- */

    if (rts_sreq != NULL) {
        if (rts_sreq->status.MPI_ERROR != MPI_SUCCESS) {
            MPIR_Request_free(sreq);
            *sreq_p = NULL;
            mpi_errno = rts_sreq->status.MPI_ERROR;
            MPIR_Request_free(rts_sreq);
            MPIR_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**ch3|rtspkt");
        }
        MPIR_Request_free(rts_sreq);
    }

    /* FIXME: fill temporary IOV or pack temporary buffer after send to hide
       some latency.  This requires synchronization because the CTS packet
       could arrive and be processed before the above iStartmsg completes
       (depending on the progress engine, threads, etc.). */

 fn_exit:
    return mpi_errno;
 fn_fail:
    goto fn_exit;
}
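/* MPID_Isend - Nonblocking send (OSU PSM/MVAPICH variant): handles self
   sends, zero-size messages, eager contiguous/noncontiguous sends, and
   falls back to rendezvous above the per-VC eager limit */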
int MPID_Isend(const void * buf, int count, MPI_Datatype datatype, int rank,
               int tag, MPID_Comm * comm, int context_offset,
               MPID_Request ** request)
{
    MPIDI_msg_sz_t data_sz;
    int dt_contig;
    MPI_Aint dt_true_lb;
    MPID_Datatype * dt_ptr;
    MPID_Request * sreq;
    MPIDI_VC_t * vc=0;
#if defined(MPID_USE_SEQUENCE_NUMBERS)
    MPID_Seqnum_t seqnum;
#endif
    int mpi_errno = MPI_SUCCESS;
    MPIDI_STATE_DECL(MPID_STATE_MPID_ISEND);

    MPIDI_FUNC_ENTER(MPID_STATE_MPID_ISEND);

    MPIU_DBG_MSG_FMT(CH3_OTHER,VERBOSE,(MPIU_DBG_FDEST,
                     "rank=%d, tag=%d, context=%d",
                     rank, tag, comm->context_id + context_offset));

    if (rank == comm->rank && comm->comm_kind != MPID_INTERCOMM)
    {
#if defined (_OSU_PSM_)
        goto skip_self_send;    /* psm will internally do self-send, no
                                   special handling is needed here */
#endif /* _OSU_PSM_ */
        mpi_errno = MPIDI_Isend_self(buf, count, datatype, rank, tag, comm,
                                     context_offset, MPIDI_REQUEST_TYPE_SEND,
                                     &sreq);
        goto fn_exit;
    }
#if defined (_OSU_PSM_)
skip_self_send:
#endif

    if (rank != MPI_PROC_NULL)
    {
        MPIDI_Comm_get_vc_set_active(comm, rank, &vc);
#ifdef ENABLE_COMM_OVERRIDES
        /* this needs to come before the sreq is created, since the override
         * function is responsible for creating its own request */
        if (vc->comm_ops && vc->comm_ops->isend)
        {
            mpi_errno = vc->comm_ops->isend( vc, buf, count, datatype, rank,
                                             tag, comm, context_offset, &sreq);
            goto fn_exit;
        }
#endif
    }

    MPIDI_Request_create_sreq(sreq, mpi_errno, goto fn_exit);
    MPIDI_Request_set_type(sreq, MPIDI_REQUEST_TYPE_SEND);

    if (rank == MPI_PROC_NULL)
    {
        MPIU_Object_set_ref(sreq, 1);
        MPID_cc_set(&sreq->cc, 0);
        goto fn_exit;
    }

    MPIDI_Datatype_get_info(count, datatype, dt_contig, data_sz, dt_ptr,
                            dt_true_lb);

    if (data_sz == 0)
    {
#if defined (_OSU_PSM_)
        goto eager_send;
#endif /* _OSU_PSM_ */
        MPIDI_CH3_Pkt_t upkt;
        MPIDI_CH3_Pkt_eager_send_t * const eager_pkt = &upkt.eager_send;

        MPIDI_Request_set_msg_type(sreq, MPIDI_REQUEST_EAGER_MSG);
        sreq->dev.OnDataAvail = 0;

        MPIU_DBG_MSG(CH3_OTHER,VERBOSE,"sending zero length message");
        MPIDI_Pkt_init(eager_pkt, MPIDI_CH3_PKT_EAGER_SEND);
        eager_pkt->match.parts.rank = comm->rank;
        eager_pkt->match.parts.tag = tag;
        eager_pkt->match.parts.context_id = comm->context_id + context_offset;
        eager_pkt->sender_req_id = sreq->handle;
        eager_pkt->data_sz = 0;

        MPIDI_VC_FAI_send_seqnum(vc, seqnum);
        MPIDI_Pkt_set_seqnum(eager_pkt, seqnum);
        MPIDI_Request_set_seqnum(sreq, seqnum);

        MPIU_THREAD_CS_ENTER(CH3COMM,vc);
        mpi_errno = MPIU_CALL(MPIDI_CH3,iSend(vc, sreq, eager_pkt,
                                              sizeof(*eager_pkt)));
        MPIU_THREAD_CS_EXIT(CH3COMM,vc);
        /* --BEGIN ERROR HANDLING-- */
        if (mpi_errno != MPI_SUCCESS)
        {
            MPIU_Object_set_ref(sreq, 0);
            MPIDI_CH3_Request_destroy(sreq);
            sreq = NULL;
            MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**ch3|eagermsg");
            goto fn_exit;
        }
        /* --END ERROR HANDLING-- */

        goto fn_exit;
    }

#if defined (_OSU_PSM_)
    if (HANDLE_GET_KIND(datatype) != HANDLE_KIND_BUILTIN) {
        sreq->dev.datatype_ptr = dt_ptr;
        MPID_Datatype_add_ref(dt_ptr);
        sreq->psm_flags |= PSM_NEED_DTYPE_RELEASE;
    }
    if (vc->force_eager)
        goto eager_send;
#endif /* _OSU_PSM_ */

#if defined(_OSU_MVAPICH_)
    int i;
    for (i = 0 ; i < rdma_num_extra_polls; i++)
    {
        if (rdma_global_ext_sendq_size > 1)
            MPID_Progress_test();
    }
#endif

    /* FIXME: flow control: limit number of outstanding eager messages
       containing data and need to be buffered by the receiver */
#if defined(_OSU_MVAPICH_)
    if (data_sz + sizeof(MPIDI_CH3_Pkt_eager_send_t) <= vc->eager_max_msg_sz
        && !vc->force_rndv)
#else /* defined(_OSU_MVAPICH_) */
    if (data_sz + sizeof(MPIDI_CH3_Pkt_eager_send_t) <= vc->eager_max_msg_sz)
#endif /* defined(_OSU_MVAPICH_) */
    {
#if defined (_OSU_PSM_)
eager_send:
#endif /* _OSU_PSM */
        if (dt_contig)
        {
            mpi_errno = MPIDI_CH3_EagerContigIsend( &sreq,
                                                    MPIDI_CH3_PKT_EAGER_SEND,
                                                    (char*)buf + dt_true_lb,
                                                    data_sz, rank, tag, comm,
                                                    context_offset );
        }
        else
        {
#if defined (_OSU_PSM_)
            sreq->psm_flags |= PSM_NON_BLOCKING_SEND;
#endif
            mpi_errno = MPIDI_CH3_EagerNoncontigSend( &sreq,
                                                      MPIDI_CH3_PKT_EAGER_SEND,
                                                      buf, count, datatype,
                                                      data_sz, rank, tag, comm,
                                                      context_offset );
#if defined (_OSU_PSM_)
            goto fn_exit;
#endif
            /* If we're not complete, then add a reference to the datatype */
            if (sreq && sreq->dev.OnDataAvail) {
                sreq->dev.datatype_ptr = dt_ptr;
                MPID_Datatype_add_ref(dt_ptr);
            }
        }
    }
    else
    {
        /* Note that the sreq was created above */
        MPIDI_Request_set_msg_type( sreq, MPIDI_REQUEST_RNDV_MSG );
        mpi_errno = vc->rndvSend_fn( &sreq, buf, count, datatype, dt_contig,
                                     data_sz, dt_true_lb, rank, tag, comm,
                                     context_offset );
        /* FIXME: fill temporary IOV or pack temporary buffer after send to
           hide some latency.  This requires synchronization because the CTS
           packet could arrive and be processed before the above iStartmsg
           completes (depending on the progress engine, threads, etc.). */
#if defined(_OSU_MVAPICH_)
        /* rndv transfers need to process CTS packet to initiate the actual
           RDMA transfer */
        MPID_Progress_test();
#endif /* defined(_OSU_MVAPICH_) */

        if (sreq && dt_ptr != NULL) {
            sreq->dev.datatype_ptr = dt_ptr;
            MPID_Datatype_add_ref(dt_ptr);
        }
    }

 fn_exit:
    *request = sreq;
#if defined(_OSU_MVAPICH_)
    for (i = 0 ; i < rdma_num_extra_polls; i++)
    {
        if (rdma_global_ext_sendq_size > 1)
            MPID_Progress_test();
    }
#endif
    MPIU_DBG_STMT(CH3_OTHER,VERBOSE,
    {
        if (sreq != NULL) {
            MPIU_DBG_MSG_P(CH3_OTHER,VERBOSE,
                           "request allocated, handle=0x%08x", sreq->handle);
        }
    });
    MPIDI_FUNC_EXIT(MPID_STATE_MPID_ISEND);
    return mpi_errno;
}
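/* MPIDI_CH3_Packetized_send - Send a large message as a packetized stream:
   a SEND_START packet carrying the original header, followed by SEND_DATA
   packets, reloading the request IOV as needed */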
int MPIDI_CH3_Packetized_send(MPIDI_VC_t * vc, MPID_Request * sreq)
{
    MPIDI_CH3_Pkt_packetized_send_start_t send_start;
    MPIDI_CH3_Pkt_packetized_send_data_t pkt_head;
    MPID_IOV iov[MPID_IOV_LIMIT + 1];
    vbuf *buf;
    int mpi_errno = MPI_SUCCESS;
    int n_iov;
    int msg_buffered = 0;
    int nb;
    int complete;
    int pkt_len;
    int seqnum;

    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3_SENDV);
    MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3_SENDV);
    MPIU_DBG_PRINTF(("ch3_isendv\n"));
    MPIDI_DBG_PRINTF((50, FCNAME, "entering"));

    MPIDI_Pkt_init(&send_start, MPIDI_CH3_PKT_PACKETIZED_SEND_START);
    iov[0].MPID_IOV_LEN = sizeof(MPIDI_CH3_Pkt_packetized_send_start_t);
    iov[0].MPID_IOV_BUF = (void*) &send_start;
    MPIU_Memcpy(&iov[1], sreq->dev.iov,
                sreq->dev.iov_count * sizeof(MPID_IOV));
    n_iov = 1 + sreq->dev.iov_count;

    GET_SEQ_NUM(sreq->dev.iov[0].MPID_IOV_BUF, seqnum);
    if (-1 == seqnum) {
        MPIDI_VC_FAI_send_seqnum(vc, seqnum);
    }
    MPIDI_Pkt_set_seqnum(&send_start, seqnum);
    MPIDI_Request_set_seqnum(sreq, seqnum);

    send_start.origin_head_size = sreq->dev.iov[0].MPID_IOV_LEN;

    Calculate_IOV_len(iov, n_iov, pkt_len);

    mpi_errno = MPIDI_CH3I_MRAILI_Eager_send(vc, iov, n_iov, pkt_len, &nb, &buf);
    DEBUG_PRINT("[pkt send] mpierr %d, nb %d\n", mpi_errno, nb);

    if (MPI_SUCCESS != mpi_errno && MPI_MRAIL_MSG_QUEUED != mpi_errno) {
        vc->ch.state = MPIDI_CH3I_VC_STATE_FAILED;
        sreq->status.MPI_ERROR = MPI_ERR_INTERN;
        MPIDI_CH3U_Request_complete(sreq);
        goto fn_exit;
    } else if (MPI_MRAIL_MSG_QUEUED == mpi_errno) {
        msg_buffered = 1;
    }
    nb -= sizeof(MPIDI_CH3_Pkt_packetized_send_start_t);

    MPIDI_Pkt_init(&pkt_head, MPIDI_CH3_PKT_PACKETIZED_SEND_DATA);
    iov[0].MPID_IOV_LEN = sizeof(MPIDI_CH3_Pkt_packetized_send_data_t);
    iov[0].MPID_IOV_BUF = (void*) &pkt_head;

    do {
        while (!MPIDI_CH3I_Request_adjust_iov(sreq, nb)) {
            MPIDI_VC_FAI_send_seqnum(vc, seqnum);
            MPIDI_Pkt_set_seqnum(&pkt_head, seqnum);
            MPIDI_Request_set_seqnum(sreq, seqnum);

            MPIU_Memcpy((void *) &iov[1],
                        &sreq->dev.iov[sreq->dev.iov_offset],
                        (sreq->dev.iov_count - sreq->dev.iov_offset) *
                            sizeof(MPID_IOV));
            n_iov = sreq->dev.iov_count - sreq->dev.iov_offset + 1;

            Calculate_IOV_len(iov, n_iov, pkt_len);

            mpi_errno = MPIDI_CH3I_MRAILI_Eager_send(vc, iov, n_iov, pkt_len,
                                                     &nb, &buf);
            DEBUG_PRINT("[istartmsgv] mpierr %d, nb %d\n", mpi_errno, nb);
            MPIU_Assert(NULL == buf->sreq);

            if (MPI_SUCCESS != mpi_errno && MPI_MRAIL_MSG_QUEUED != mpi_errno) {
                vc->ch.state = MPIDI_CH3I_VC_STATE_FAILED;
                sreq->status.MPI_ERROR = MPI_ERR_INTERN;
                MPIDI_CH3U_Request_complete(sreq);
                goto fn_exit;
            } else if (MPI_MRAIL_MSG_QUEUED == mpi_errno) {
                msg_buffered = 1;
            }
            nb -= sizeof(MPIDI_CH3_Pkt_packetized_send_data_t);
        }

        if (sreq->dev.OnDataAvail == MPIDI_CH3_ReqHandler_SendReloadIOV) {
            MPIDI_CH3U_Handle_send_req(vc, sreq, &complete);
            nb = 0;
            complete = 0;
        } else {
            complete = 1;
        }
    } while (!complete);

    if (msg_buffered) {
        mpi_errno = MPI_MRAIL_MSG_QUEUED;
        buf->sreq = (void *) sreq;
    } else {
        MPIDI_CH3U_Handle_send_req(vc, sreq, &complete);
    }

 fn_exit:
    MPIDI_DBG_PRINTF((50, FCNAME, "exiting"));
    MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3_SENDV);
    return mpi_errno;
}
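/* MPID_Send - Blocking send: self sends, zero-size messages, eager
   short/contiguous/noncontiguous sends, with a separate path for messages
   above the per-VC eager limit */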
int MPID_Send(const void * buf, int count, MPI_Datatype datatype, int rank,
              int tag, MPID_Comm * comm, int context_offset,
              MPID_Request ** request)
{
    MPIDI_msg_sz_t data_sz;
    int dt_contig;
    MPI_Aint dt_true_lb;
    MPID_Datatype * dt_ptr;
    MPID_Request * sreq = NULL;
    MPIDI_VC_t * vc;
#if defined(MPID_USE_SEQUENCE_NUMBERS)
    MPID_Seqnum_t seqnum;
#endif
    int mpi_errno = MPI_SUCCESS;
    MPIDI_STATE_DECL(MPID_STATE_MPID_SEND);

    MPIDI_FUNC_ENTER(MPID_STATE_MPID_SEND);

    MPIU_DBG_MSG_FMT(CH3_OTHER,VERBOSE,(MPIU_DBG_FDEST,
                     "rank=%d, tag=%d, context=%d",
                     rank, tag, comm->context_id + context_offset));

    if (rank == comm->rank && comm->comm_kind != MPID_INTERCOMM)
    {
        mpi_errno = MPIDI_Isend_self(buf, count, datatype, rank, tag, comm,
                                     context_offset, MPIDI_REQUEST_TYPE_SEND,
                                     &sreq);

        /* In the single threaded case, sending to yourself will cause
           deadlock.  Note that in the runtime-thread case, this check
           will not be made (long-term FIXME) */
#ifndef MPICH_IS_THREADED
        {
            if (sreq != NULL && sreq->cc != 0) {
                MPIU_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER,
                                    "**dev|selfsenddeadlock");
            }
        }
#endif
        if (mpi_errno != MPI_SUCCESS) { MPIU_ERR_POP(mpi_errno); }
        goto fn_exit;
    }

    if (rank == MPI_PROC_NULL)
    {
        goto fn_exit;
    }

    MPIDI_Comm_get_vc_set_active(comm, rank, &vc);

#ifdef ENABLE_COMM_OVERRIDES
    if (vc->comm_ops && vc->comm_ops->send)
    {
        mpi_errno = vc->comm_ops->send( vc, buf, count, datatype, rank, tag,
                                        comm, context_offset, &sreq);
        goto fn_exit;
    }
#endif

    MPIDI_Datatype_get_info(count, datatype, dt_contig, data_sz, dt_ptr,
                            dt_true_lb);

    if (data_sz == 0)
    {
        MPIDI_CH3_Pkt_t upkt;
        MPIDI_CH3_Pkt_eager_send_t * const eager_pkt = &upkt.eager_send;

        MPIU_DBG_MSG(CH3_OTHER,VERBOSE,"sending zero length message");
        MPIDI_Pkt_init(eager_pkt, MPIDI_CH3_PKT_EAGER_SEND);
        eager_pkt->match.parts.rank = comm->rank;
        eager_pkt->match.parts.tag = tag;
        eager_pkt->match.parts.context_id = comm->context_id + context_offset;
        eager_pkt->sender_req_id = MPI_REQUEST_NULL;
        eager_pkt->data_sz = 0;

        MPIDI_VC_FAI_send_seqnum(vc, seqnum);
        MPIDI_Pkt_set_seqnum(eager_pkt, seqnum);

        MPIU_THREAD_CS_ENTER(CH3COMM,vc);
        mpi_errno = MPIDI_CH3_iStartMsg(vc, eager_pkt, sizeof(*eager_pkt),
                                        &sreq);
        MPIU_THREAD_CS_EXIT(CH3COMM,vc);
        /* --BEGIN ERROR HANDLING-- */
        if (mpi_errno != MPI_SUCCESS)
        {
            MPIU_ERR_SETANDJUMP(mpi_errno,MPI_ERR_OTHER,"**ch3|eagermsg");
        }
        /* --END ERROR HANDLING-- */
        if (sreq != NULL)
        {
            MPIDI_Request_set_seqnum(sreq, seqnum);
            MPIDI_Request_set_type(sreq, MPIDI_REQUEST_TYPE_SEND);
            /* sreq->comm = comm;
               MPIR_Comm_add_ref(comm); -- not necessary for blocking
               functions */
        }
        goto fn_exit;
    }

    /* FIXME: flow control: limit number of outstanding eager messages
       containing data and need to be buffered by the receiver */
#ifdef USE_EAGER_SHORT
    if (dt_contig && data_sz <= MPIDI_EAGER_SHORT_SIZE) {
        mpi_errno = MPIDI_CH3_EagerContigShortSend( &sreq,
                                                    MPIDI_CH3_PKT_EAGERSHORT_SEND,
                                                    (char *)buf + dt_true_lb,
                                                    data_sz, rank, tag, comm,
                                                    context_offset );
    }
    else
#endif
    if (data_sz + sizeof(MPIDI_CH3_Pkt_eager_send_t) <= vc->eager_max_msg_sz)
    {
        if (dt_contig)
        {
            mpi_errno = MPIDI_CH3_EagerContigSend( &sreq,
                                                   MPIDI_CH3_PKT_EAGER_SEND,
                                                   (char *)buf + dt_true_lb,
                                                   data_sz, rank, tag, comm,
                                                   context_offset );
        }
        else
        {
            MPIDI_Request_create_sreq(sreq, mpi_errno, goto fn_exit);
            MPIDI_Request_set_type(sreq, MPIDI_REQUEST_TYPE_SEND);
            mpi_errno = MPIDI_CH3_EagerNoncontigSend( &sreq,
                                                      MPIDI_CH3_PKT_EAGER_SEND,
                                                      buf, count, datatype,
                                                      data_sz, rank, tag, comm,
                                                      context_offset );
        }
    }
    else
    {
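/* MPIDI_CH3_EagerContigShortSend - Send a very short contiguous message
   with the payload copied directly into the packet header */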
int MPIDI_CH3_EagerContigShortSend( MPID_Request **sreq_p,
                                    MPIDI_CH3_Pkt_type_t reqtype,
                                    const void * buf, MPIDI_msg_sz_t data_sz,
                                    int rank, int tag, MPID_Comm * comm,
                                    int context_offset )
{
    int mpi_errno = MPI_SUCCESS;
    MPIDI_VC_t * vc;
    MPIDI_CH3_Pkt_t upkt;
    MPIDI_CH3_Pkt_eagershort_send_t * const eagershort_pkt =
        &upkt.eagershort_send;
    MPID_Request *sreq = *sreq_p;
#if defined(MPID_USE_SEQUENCE_NUMBERS)
    MPID_Seqnum_t seqnum;
#endif

    /* printf( "Sending short eager\n"); fflush(stdout); */
    MPIDI_Pkt_init(eagershort_pkt, reqtype);
    eagershort_pkt->match.parts.rank = comm->rank;
    eagershort_pkt->match.parts.tag = tag;
    eagershort_pkt->match.parts.context_id = comm->context_id + context_offset;
    eagershort_pkt->data_sz = data_sz;

    MPIU_DBG_MSG_FMT(CH3_OTHER,VERBOSE,(MPIU_DBG_FDEST,
                     "sending contiguous short eager message, data_sz=" MPIDI_MSG_SZ_FMT,
                     data_sz));

    MPIDI_Comm_get_vc_set_active(comm, rank, &vc);
    MPIDI_VC_FAI_send_seqnum(vc, seqnum);
    MPIDI_Pkt_set_seqnum(eagershort_pkt, seqnum);

    /* Copy the payload.  We could optimize this if data_sz & 0x3 == 0
       (copy (data_sz >> 2) ints, inline that since data size is currently
       limited to 4 ints) */
    {
        unsigned char * restrict p = (unsigned char *)eagershort_pkt->data;
        unsigned char const * restrict bufp = (unsigned char *)buf;
        int i;
        for (i = 0; i < data_sz; i++) {
            *p++ = *bufp++;
        }
    }

    MPIU_DBG_MSGPKT(vc,tag,eagershort_pkt->match.parts.context_id,rank,data_sz,
                    "EagerShort");

    MPID_THREAD_CS_ENTER(POBJ, vc->pobj_mutex);
    mpi_errno = MPIDI_CH3_iStartMsg(vc, eagershort_pkt,
                                    sizeof(*eagershort_pkt), sreq_p);
    MPID_THREAD_CS_EXIT(POBJ, vc->pobj_mutex);
    if (mpi_errno != MPI_SUCCESS) {
        MPIR_ERR_SETANDJUMP(mpi_errno,MPI_ERR_OTHER,"**ch3|eagermsg");
    }

    sreq = *sreq_p;
    if (sreq != NULL) {
        /* printf( "Surprise, did not complete send of eagershort (starting
           connection?)\n" ); fflush(stdout); */
        /* MT FIXME setting fields in the request after it has been given to
         * the progress engine is racy.  The start call above is protected by
         * vc CS, but the progress engine is protected by MPIDCOMM.  So
         * we can't just extend the CS type below this point... what's the
         * fix? */
        MPIDI_Request_set_seqnum(sreq, seqnum);
        MPIDI_Request_set_type(sreq, MPIDI_REQUEST_TYPE_SEND);
    }

 fn_fail:
    return mpi_errno;
}
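/* MPID_Send - Blocking send (FINEGRAIN_MPI-aware variant) that uses a
   per-communicator/per-VC eager threshold and checks for revoked
   communicators and failed (moribund) VCs */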
int MPID_Send(const void * buf, MPI_Aint count, MPI_Datatype datatype,
              int rank, int tag, MPID_Comm * comm, int context_offset,
              MPID_Request ** request)
{
    MPIDI_msg_sz_t data_sz;
    int dt_contig;
    MPI_Aint dt_true_lb;
    MPID_Datatype * dt_ptr;
    MPID_Request * sreq = NULL;
    MPIDI_VC_t * vc;
#if defined(MPID_USE_SEQUENCE_NUMBERS)
    MPID_Seqnum_t seqnum;
#endif
    int eager_threshold = -1;
    int mpi_errno = MPI_SUCCESS;
#if defined(FINEGRAIN_MPI)
    int destpid = -1, destworldrank = -1;
#endif
    MPIDI_STATE_DECL(MPID_STATE_MPID_SEND);

    MPIDI_FUNC_ENTER(MPID_STATE_MPID_SEND);

    MPIU_DBG_MSG_FMT(CH3_OTHER,VERBOSE,(MPIU_DBG_FDEST,
                     "rank=%d, tag=%d, context=%d",
                     rank, tag, comm->context_id + context_offset));

    /* Check to make sure the communicator hasn't already been revoked */
    if (comm->revoked &&
        MPIR_AGREE_TAG != MPIR_TAG_MASK_ERROR_BITS(tag & ~MPIR_Process.tagged_coll_mask) &&
        MPIR_SHRINK_TAG != MPIR_TAG_MASK_ERROR_BITS(tag & ~MPIR_Process.tagged_coll_mask)) {
        MPIR_ERR_SETANDJUMP(mpi_errno,MPIX_ERR_REVOKED,"**revoked");
    }

#if defined(FINEGRAIN_MPI)
    MPIDI_Comm_get_pid_worldrank(comm, rank, &destpid, &destworldrank);

    if (COMPARE_RANKS(rank,comm,destpid) && comm->comm_kind != MPID_INTERCOMM)
    {
        mpi_errno = MPIDI_Isend_self(&buf, count, datatype, rank, tag, comm,
                                     context_offset, MPIDI_REQUEST_TYPE_SEND,
                                     &sreq);

        if (rank == comm->rank)
        {
            printf("my_fgrank=%d: %s, self send DEADLOCK\n", my_fgrank,
                   __FUNCTION__);
            if (sreq != NULL && sreq->cc != 0) {
                MPIR_ERR_SETANDJUMP(mpi_errno,MPI_ERR_OTHER,
                                    "**dev|selfsenddeadlock");
            }
        }
#else
    if (rank == comm->rank && comm->comm_kind != MPID_INTERCOMM)
    {
        mpi_errno = MPIDI_Isend_self(buf, count, datatype, rank, tag, comm,
                                     context_offset, MPIDI_REQUEST_TYPE_SEND,
                                     &sreq);

        /* In the single threaded case, sending to yourself will cause
           deadlock.  Note that in the runtime-thread case, this check
           will not be made (long-term FIXME) */
#ifndef MPICH_IS_THREADED
        {
            if (sreq != NULL && MPID_cc_get(sreq->cc) != 0) {
                MPIR_ERR_SETANDJUMP(mpi_errno,MPI_ERR_OTHER,
                                    "**dev|selfsenddeadlock");
            }
        }
#endif
#endif
        if (mpi_errno != MPI_SUCCESS) { MPIR_ERR_POP(mpi_errno); }
        goto fn_exit;
    }

    if (rank == MPI_PROC_NULL)
    {
        goto fn_exit;
    }

#if defined(FINEGRAIN_MPI)
    MPIDI_Comm_get_vc_set_active_direct(comm, destpid, &vc);
#else
    MPIDI_Comm_get_vc_set_active(comm, rank, &vc);
#endif
    MPIR_ERR_CHKANDJUMP1(vc->state == MPIDI_VC_STATE_MORIBUND, mpi_errno,
                         MPIX_ERR_PROC_FAILED, "**comm_fail",
                         "**comm_fail %d", rank);

#ifdef ENABLE_COMM_OVERRIDES
    if (vc->comm_ops && vc->comm_ops->send)
    {
        mpi_errno = vc->comm_ops->send( vc, buf, count, datatype, rank, tag,
                                        comm, context_offset, &sreq);
        goto fn_exit;
    }
#endif

    MPIDI_Datatype_get_info(count, datatype, dt_contig, data_sz, dt_ptr,
                            dt_true_lb);

    if (data_sz == 0)
    {
        MPIDI_CH3_Pkt_t upkt;
        MPIDI_CH3_Pkt_eager_send_t * const eager_pkt = &upkt.eager_send;

        MPIU_DBG_MSG(CH3_OTHER,VERBOSE,"sending zero length message");
        MPIDI_Pkt_init(eager_pkt, MPIDI_CH3_PKT_EAGER_SEND);
#if defined(FINEGRAIN_MPI)
        eager_pkt->match.parts.dest_rank = destworldrank;
#endif
        eager_pkt->match.parts.rank = comm->rank;
        eager_pkt->match.parts.tag = tag;
        eager_pkt->match.parts.context_id = comm->context_id + context_offset;
        eager_pkt->sender_req_id = MPI_REQUEST_NULL;
        eager_pkt->data_sz = 0;

        MPIDI_VC_FAI_send_seqnum(vc, seqnum);
        MPIDI_Pkt_set_seqnum(eager_pkt, seqnum);

        MPID_THREAD_CS_ENTER(POBJ, vc->pobj_mutex);
        mpi_errno = MPIDI_CH3_iStartMsg(vc, eager_pkt, sizeof(*eager_pkt),
                                        &sreq);
        MPID_THREAD_CS_EXIT(POBJ, vc->pobj_mutex);
        /* --BEGIN ERROR HANDLING-- */
        if (mpi_errno != MPI_SUCCESS)
        {
            MPIR_ERR_SETANDJUMP(mpi_errno,MPI_ERR_OTHER,"**ch3|eagermsg");
        }
        /* --END ERROR HANDLING-- */
        if (sreq != NULL)
        {
            MPIDI_Request_set_seqnum(sreq, seqnum);
            MPIDI_Request_set_type(sreq, MPIDI_REQUEST_TYPE_SEND);
            /* sreq->comm = comm;
               MPIR_Comm_add_ref(comm); -- not necessary for blocking
               functions */
        }
        goto fn_exit;
    }

    MPIDI_CH3_GET_EAGER_THRESHOLD(&eager_threshold, comm, vc);

    /* FIXME: flow control: limit number of outstanding eager messages
       containing data and need to be buffered by the receiver */
#ifdef USE_EAGER_SHORT
    if (dt_contig && data_sz <= MPIDI_EAGER_SHORT_SIZE) {
        mpi_errno = MPIDI_CH3_EagerContigShortSend( &sreq,
                                                    MPIDI_CH3_PKT_EAGERSHORT_SEND,
                                                    (char *)buf + dt_true_lb,
                                                    data_sz, rank, tag, comm,
                                                    context_offset );
    }
    else
#endif
    if (data_sz + sizeof(MPIDI_CH3_Pkt_eager_send_t) <= eager_threshold)
    {
        if (dt_contig)
        {
            mpi_errno = MPIDI_CH3_EagerContigSend( &sreq,
                                                   MPIDI_CH3_PKT_EAGER_SEND,
                                                   (char *)buf + dt_true_lb,
                                                   data_sz, rank, tag, comm,
                                                   context_offset );
        }
        else
        {
            MPIDI_Request_create_sreq(sreq, mpi_errno, goto fn_exit);
            MPIDI_Request_set_type(sreq, MPIDI_REQUEST_TYPE_SEND);
            mpi_errno = MPIDI_CH3_EagerNoncontigSend( &sreq,
                                                      MPIDI_CH3_PKT_EAGER_SEND,
                                                      buf, count, datatype,
                                                      data_sz, rank, tag, comm,
                                                      context_offset );
        }
    }
    else
    {
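/* MPIDI_CH3_Rendezvous_r3_push - Push rendezvous data over the R3
   (send/recv) protocol, throttling when too much R3 data is pending at the
   receiver */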
void MPIDI_CH3_Rendezvous_r3_push(MPIDI_VC_t * vc, MPID_Request * sreq)
{
    vbuf *buf;
    MPID_IOV iov[MPID_IOV_LIMIT + 1];
    int n_iov;
    int msg_buffered = 0;
    int nb;
    int complete = 0;
    int seqnum;
    int finished = 0;
    int mpi_errno;
    int wait_for_rndv_r3_ack = 0;
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3I_RNDV_R3_PUSH);
    MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3I_RNDV_R3_PUSH);

    MPIDI_CH3_Pkt_rndv_r3_data_t pkt_head;

    MPIDI_Pkt_init(&pkt_head, MPIDI_CH3_PKT_RNDV_R3_DATA);
    iov[0].MPID_IOV_LEN = sizeof(MPIDI_CH3_Pkt_rndv_r3_data_t);
    iov[0].MPID_IOV_BUF = (void*) &pkt_head;
    pkt_head.receiver_req_id = sreq->mrail.partner_id;

    do {
        do {
#ifndef DAPL_DEFAULT_PROVIDER
            /* stop sending more R3 data to avoid SRQ flooding at receiver */
            if (MPIDI_CH3I_RDMA_Process.has_srq) {
                if (vc->ch.pending_r3_data >= rdma_max_r3_pending_data) {
                    wait_for_rndv_r3_ack = 1;
                    break;
                }
            }
#endif
            MPIDI_VC_FAI_send_seqnum(vc, seqnum);
            MPIDI_Pkt_set_seqnum(&pkt_head, seqnum);
            MPIDI_Request_set_seqnum(sreq, seqnum);

            MPIU_Memcpy((void *) &iov[1],
                        &sreq->dev.iov[sreq->dev.iov_offset],
                        (sreq->dev.iov_count - sreq->dev.iov_offset) *
                            sizeof(MPID_IOV));
            n_iov = sreq->dev.iov_count - sreq->dev.iov_offset + 1;

            DEBUG_PRINT("iov count (sreq): %d, offset %d, len[1] %d\n",
                        sreq->dev.iov_count, sreq->dev.iov_offset,
                        sreq->dev.iov[0].MPID_IOV_LEN);

            {
                int i = 0;
                size_t total_len = 0;
                for (i = 0; i < n_iov; i++) {
                    total_len += (iov[i].MPID_IOV_LEN);
                }
                mpi_errno = MPIDI_CH3I_MRAILI_Eager_send(vc, iov, n_iov,
                                                         total_len, &nb, &buf);
            }
            DEBUG_PRINT("[istartmsgv] mpierr %d, nb %d\n", mpi_errno, nb);

            if (MPI_SUCCESS != mpi_errno && MPI_MRAIL_MSG_QUEUED != mpi_errno) {
                vc->ch.state = MPIDI_CH3I_VC_STATE_FAILED;
                sreq->status.MPI_ERROR = MPI_ERR_INTERN;
                MPIDI_CH3U_Request_complete(sreq);
                return;
            } else if (MPI_MRAIL_MSG_QUEUED == mpi_errno) {
                msg_buffered = 1;
            }
            nb -= sizeof(MPIDI_CH3_Pkt_rndv_r3_data_t);
            finished = MPIDI_CH3I_Request_adjust_iov(sreq, nb);
            DEBUG_PRINT("adjust iov finish: %d\n", finished);
            vc->ch.pending_r3_data += nb;
        } while (!finished /* && !msg_buffered */);

        if (wait_for_rndv_r3_ack) {
            break;
        }
        if (finished &&
            sreq->dev.OnDataAvail == MPIDI_CH3_ReqHandler_SendReloadIOV) {
            MPIDI_CH3U_Handle_send_req(vc, sreq, &complete);
            nb = 0;
            complete = 0;
        } else if (finished) {
            complete = 1;
        }
    } while (/* 1 != msg_buffered && */ 0 == complete);

    DEBUG_PRINT("exit loop with complete %d, msg_buffered %d wait %d pending data:%d \n",
                complete, msg_buffered, wait_for_rndv_r3_ack,
                vc->ch.pending_r3_data);

    if (wait_for_rndv_r3_ack) { /* || (0 == complete && 1 == msg_buffered) */
        sreq->mrail.nearly_complete = 0;
    } else if (1 == msg_buffered) {
        buf->sreq = (void *) sreq;
        sreq->mrail.nearly_complete = 1;
    } else {
        buf->sreq = NULL;
        MPIDI_CH3U_Handle_send_req(vc, sreq, &complete);
        sreq->mrail.nearly_complete = 1;
    }

    if (sreq->mrail.nearly_complete) {
        DEBUG_PRINT("R3 PUSH completed\n");
    } else {
        DEBUG_PRINT("Send Max R3 Pending Data. waiting for ACK\n");
    }

    MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3I_RNDV_R3_PUSH);
}
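/* MPID_Rsend - Ready-mode blocking send */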
int MPID_Rsend(const void * buf, int count, MPI_Datatype datatype, int rank,
               int tag, MPID_Comm * comm, int context_offset,
               MPID_Request ** request)
{
    MPIDI_msg_sz_t data_sz;
    int dt_contig;
    MPI_Aint dt_true_lb;
    MPID_Datatype * dt_ptr;
    MPID_Request * sreq = NULL;
    MPIDI_VC_t * vc;
#if defined(MPID_USE_SEQUENCE_NUMBERS)
    MPID_Seqnum_t seqnum;
#endif
    int mpi_errno = MPI_SUCCESS;
    MPIDI_STATE_DECL(MPID_STATE_MPID_RSEND);

    MPIDI_FUNC_ENTER(MPID_STATE_MPID_RSEND);

    MPIU_DBG_MSG_FMT(CH3_OTHER,VERBOSE,(MPIU_DBG_FDEST,
                     "rank=%d, tag=%d, context=%d",
                     rank, tag, comm->context_id + context_offset));

    /* Check to make sure the communicator hasn't already been revoked */
    if (comm->revoked &&
        MPIR_AGREE_TAG != MPIR_TAG_MASK_ERROR_BITS(tag & ~MPIR_Process.tagged_coll_mask) &&
        MPIR_SHRINK_TAG != MPIR_TAG_MASK_ERROR_BITS(tag & ~MPIR_Process.tagged_coll_mask)) {
        MPIR_ERR_SETANDJUMP(mpi_errno,MPIX_ERR_REVOKED,"**revoked");
    }

    if (rank == comm->rank && comm->comm_kind != MPID_INTERCOMM)
    {
        mpi_errno = MPIDI_Isend_self(buf, count, datatype, rank, tag, comm,
                                     context_offset, MPIDI_REQUEST_TYPE_RSEND,
                                     &sreq);
        goto fn_exit;
    }

    if (rank == MPI_PROC_NULL)
    {
        goto fn_exit;
    }

    MPIDI_Comm_get_vc_set_active(comm, rank, &vc);

#ifdef ENABLE_COMM_OVERRIDES
    if (vc->comm_ops && vc->comm_ops->rsend)
    {
        mpi_errno = vc->comm_ops->rsend( vc, buf, count, datatype, rank, tag,
                                         comm, context_offset, &sreq);
        goto fn_exit;
    }
#endif

    MPIDI_Datatype_get_info(count, datatype, dt_contig, data_sz, dt_ptr,
                            dt_true_lb);

    if (data_sz == 0)
    {
        MPIDI_CH3_Pkt_t upkt;
        MPIDI_CH3_Pkt_ready_send_t * const ready_pkt = &upkt.ready_send;

        MPIU_DBG_MSG(CH3_OTHER,VERBOSE,"sending zero length message");
        MPIDI_Pkt_init(ready_pkt, MPIDI_CH3_PKT_READY_SEND);
        ready_pkt->match.parts.rank = comm->rank;
        ready_pkt->match.parts.tag = tag;
        ready_pkt->match.parts.context_id = comm->context_id + context_offset;
        ready_pkt->sender_req_id = MPI_REQUEST_NULL;
        ready_pkt->data_sz = data_sz;

        MPIDI_VC_FAI_send_seqnum(vc, seqnum);
        MPIDI_Pkt_set_seqnum(ready_pkt, seqnum);

        MPID_THREAD_CS_ENTER(POBJ, vc->pobj_mutex);
        mpi_errno = MPIDI_CH3_iStartMsg(vc, ready_pkt, sizeof(*ready_pkt),
                                        &sreq);
        MPID_THREAD_CS_EXIT(POBJ, vc->pobj_mutex);
        /* --BEGIN ERROR HANDLING-- */
        if (mpi_errno != MPI_SUCCESS)
        {
            mpi_errno = MPIR_Err_create_code(mpi_errno, MPIR_ERR_RECOVERABLE,
                                             FCNAME, __LINE__, MPI_ERR_OTHER,
                                             "**ch3|eagermsg", 0);
            goto fn_exit;
        }
        /* --END ERROR HANDLING-- */
        if (sreq != NULL)
        {
            MPIDI_Request_set_seqnum(sreq, seqnum);
            MPIDI_Request_set_type(sreq, MPIDI_REQUEST_TYPE_RSEND);
            /* sreq->comm = comm;
               MPIR_Comm_add_ref(comm); -- not needed for blocking
               operations */
        }
        goto fn_exit;
    }

    if (vc->ready_eager_max_msg_sz < 0 ||
        data_sz + sizeof(MPIDI_CH3_Pkt_ready_send_t) <= vc->ready_eager_max_msg_sz)
    {
        if (dt_contig)
        {
            mpi_errno = MPIDI_CH3_EagerContigSend( &sreq,
                                                   MPIDI_CH3_PKT_READY_SEND,
                                                   (char *)buf + dt_true_lb,
                                                   data_sz, rank, tag, comm,
                                                   context_offset );
        }
        else
        {
            MPIDI_Request_create_sreq(sreq, mpi_errno, goto fn_exit);
            MPIDI_Request_set_type(sreq, MPIDI_REQUEST_TYPE_SEND);
            mpi_errno = MPIDI_CH3_EagerNoncontigSend( &sreq,
                                                      MPIDI_CH3_PKT_READY_SEND,
                                                      buf, count, datatype,
                                                      data_sz, rank, tag, comm,
                                                      context_offset );
        }
    }
    else
    {
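/* MPIDI_Isend_self - Send to self: match against the posted-receive queue
   and copy the data directly, or leave the message queued as unexpected */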
int MPIDI_Isend_self(const void * buf, int count, MPI_Datatype datatype,
                     int rank, int tag, MPID_Comm * comm, int context_offset,
                     int type, MPID_Request ** request)
{
    MPIDI_Message_match match;
    MPID_Request * sreq;
    MPID_Request * rreq;
    MPIDI_VC_t * vc;
#if defined(MPID_USE_SEQUENCE_NUMBERS)
    MPID_Seqnum_t seqnum;
#endif
    int found;
    int mpi_errno = MPI_SUCCESS;

    MPIU_DBG_MSG(CH3_OTHER,VERBOSE,"sending message to self");

    MPIDI_Request_create_sreq(sreq, mpi_errno, goto fn_exit);
    MPIDI_Request_set_type(sreq, type);
    MPIDI_Request_set_msg_type(sreq, MPIDI_REQUEST_SELF_MSG);

    match.parts.rank = rank;
    match.parts.tag = tag;
    match.parts.context_id = comm->context_id + context_offset;

    MPIU_THREAD_CS_ENTER(MSGQUEUE,);

    rreq = MPIDI_CH3U_Recvq_FDP_or_AEU(&match, &found);
    /* --BEGIN ERROR HANDLING-- */
    if (rreq == NULL)
    {
        MPIU_Object_set_ref(sreq, 0);
        MPIDI_CH3_Request_destroy(sreq);
        sreq = NULL;
        MPIU_ERR_SET1(mpi_errno, MPI_ERR_OTHER, "**nomem",
                      "**nomemuereq %d", MPIDI_CH3U_Recvq_count_unexp());
        goto fn_exit;
    }
    /* --END ERROR HANDLING-- */

    MPIDI_Comm_get_vc_set_active(comm, rank, &vc);
    MPIDI_VC_FAI_send_seqnum(vc, seqnum);
    MPIDI_Request_set_seqnum(sreq, seqnum);
    MPIDI_Request_set_seqnum(rreq, seqnum);

    rreq->status.MPI_SOURCE = rank;
    rreq->status.MPI_TAG = tag;

    if (found)
    {
        MPIDI_msg_sz_t data_sz;

        /* we found a posted req, which we now own, so we can release the CS */
        MPIU_THREAD_CS_EXIT(MSGQUEUE,);

        MPIU_DBG_MSG(CH3_OTHER,VERBOSE,
                     "found posted receive request; copying data");

        MPIDI_CH3U_Buffer_copy(buf, count, datatype, &sreq->status.MPI_ERROR,
                               rreq->dev.user_buf, rreq->dev.user_count,
                               rreq->dev.datatype, &data_sz,
                               &rreq->status.MPI_ERROR);
        rreq->status.count = (int)data_sz;
        MPID_REQUEST_SET_COMPLETED(rreq);
        MPID_Request_release(rreq);
        /* sreq has never been seen by the user or outside this thread, so it
           is safe to reset ref_count and cc */
        MPIU_Object_set_ref(sreq, 1);
        MPID_cc_set(&sreq->cc, 0);
    }
    else
    {
        if (type != MPIDI_REQUEST_TYPE_RSEND)
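/* MPID_Irsend - Nonblocking ready-mode send */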
int MPID_Irsend(const void * buf, int count, MPI_Datatype datatype, int rank,
                int tag, MPID_Comm * comm, int context_offset,
                MPID_Request ** request)
{
    MPIDI_CH3_Pkt_t upkt;
    MPIDI_CH3_Pkt_ready_send_t * const ready_pkt = &upkt.ready_send;
    MPIDI_msg_sz_t data_sz;
    int dt_contig;
    MPI_Aint dt_true_lb;
    MPID_Datatype * dt_ptr;
    MPID_Request * sreq;
    MPIDI_VC_t * vc;
#if defined(MPID_USE_SEQUENCE_NUMBERS)
    MPID_Seqnum_t seqnum;
#endif
    int mpi_errno = MPI_SUCCESS;
    MPIDI_STATE_DECL(MPID_STATE_MPID_IRSEND);

    MPIDI_FUNC_ENTER(MPID_STATE_MPID_IRSEND);

    MPIU_DBG_MSG_FMT(CH3_OTHER,VERBOSE,(MPIU_DBG_FDEST,
                     "rank=%d, tag=%d, context=%d",
                     rank, tag, comm->context_id + context_offset));

    if (rank == comm->rank && comm->comm_kind != MPID_INTERCOMM)
    {
        mpi_errno = MPIDI_Isend_self(buf, count, datatype, rank, tag, comm,
                                     context_offset, MPIDI_REQUEST_TYPE_RSEND,
                                     &sreq);
        goto fn_exit;
    }

    MPIDI_Request_create_sreq(sreq, mpi_errno, goto fn_exit);
    MPIDI_Request_set_type(sreq, MPIDI_REQUEST_TYPE_RSEND);
    MPIDI_Request_set_msg_type(sreq, MPIDI_REQUEST_EAGER_MSG);

    if (rank == MPI_PROC_NULL)
    {
        MPIU_Object_set_ref(sreq, 1);
        MPID_cc_set(&sreq->cc, 0);
        goto fn_exit;
    }

    MPIDI_Comm_get_vc_set_active(comm, rank, &vc);

#ifdef ENABLE_COMM_OVERRIDES
    if (vc->comm_ops && vc->comm_ops->irsend)
    {
        mpi_errno = vc->comm_ops->irsend( vc, buf, count, datatype, rank, tag,
                                          comm, context_offset, &sreq);
        goto fn_exit;
    }
#endif

    MPIDI_Datatype_get_info(count, datatype, dt_contig, data_sz, dt_ptr,
                            dt_true_lb);

    MPIDI_Pkt_init(ready_pkt, MPIDI_CH3_PKT_READY_SEND);
    ready_pkt->match.parts.rank = comm->rank;
    ready_pkt->match.parts.tag = tag;
    ready_pkt->match.parts.context_id = comm->context_id + context_offset;
    ready_pkt->sender_req_id = MPI_REQUEST_NULL;
    ready_pkt->data_sz = data_sz;

    if (data_sz == 0)
    {
        MPIU_DBG_MSG(CH3_OTHER,VERBOSE,"sending zero length message");

        sreq->dev.OnDataAvail = 0;

        MPIDI_VC_FAI_send_seqnum(vc, seqnum);
        MPIDI_Pkt_set_seqnum(ready_pkt, seqnum);
        MPIDI_Request_set_seqnum(sreq, seqnum);

        MPIU_THREAD_CS_ENTER(CH3COMM,vc);
        mpi_errno = MPIU_CALL(MPIDI_CH3,iSend(vc, sreq, ready_pkt,
                                              sizeof(*ready_pkt)));
        MPIU_THREAD_CS_EXIT(CH3COMM,vc);
        /* --BEGIN ERROR HANDLING-- */
        if (mpi_errno != MPI_SUCCESS)
        {
            MPIU_Object_set_ref(sreq, 0);
            MPIDI_CH3_Request_destroy(sreq);
            sreq = NULL;
            MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**ch3|eagermsg");
            goto fn_exit;
        }
        /* --END ERROR HANDLING-- */
        goto fn_exit;
    }

    if (dt_contig)
    {
        mpi_errno = MPIDI_CH3_EagerContigIsend( &sreq,
                                                MPIDI_CH3_PKT_READY_SEND,
                                                (char*)buf + dt_true_lb,
                                                data_sz, rank, tag, comm,
                                                context_offset );
    }
    else
    {
        mpi_errno = MPIDI_CH3_EagerNoncontigSend( &sreq,
                                                  MPIDI_CH3_PKT_READY_SEND,
                                                  buf, count, datatype,
                                                  data_sz, rank, tag, comm,
                                                  context_offset );
        /* If we're not complete, then add a reference to the datatype */
        if (sreq && sreq->dev.OnDataAvail) {
            sreq->dev.datatype_ptr = dt_ptr;
            MPID_Datatype_add_ref(dt_ptr);
        }
    }

 fn_exit:
    *request = sreq;
    MPIU_DBG_STMT(CH3_OTHER,VERBOSE,
    {
        if (sreq != NULL) {
            MPIU_DBG_MSG_P(CH3_OTHER,VERBOSE,
                           "request allocated, handle=0x%08x", sreq->handle);
        }
    });
    MPIDI_FUNC_EXIT(MPID_STATE_MPID_IRSEND);
    return mpi_errno;
}
/* MPIDI_CH3_EagerSyncNoncontigSend - Eagerly send noncontiguous data in
   synchronous mode.

   Some implementations may choose to use Rendezvous sends (see ch3u_rndv.c)
   for all Synchronous sends (MPI_Issend and MPI_Ssend).  An eager synchronous
   send eliminates one of the handshake messages, but most application codes
   should not be using synchronous sends in performance-critical operations. */
int MPIDI_CH3_EagerSyncNoncontigSend( MPIR_Request **sreq_p,
                                      const void * buf, int count,
                                      MPI_Datatype datatype, intptr_t data_sz,
                                      int dt_contig, MPI_Aint dt_true_lb,
                                      int rank, int tag, MPIR_Comm * comm,
                                      int context_offset )
{
    int mpi_errno = MPI_SUCCESS;
    MPIDI_CH3_Pkt_t upkt;
    MPIDI_CH3_Pkt_eager_sync_send_t * const es_pkt = &upkt.eager_sync_send;
    MPIDI_VC_t * vc;
    MPIR_Request *sreq = *sreq_p;
#if defined(MPID_USE_SEQUENCE_NUMBERS)
    MPID_Seqnum_t seqnum;
#endif

    /* MT FIXME what are the two operations we are waiting for?  the send and
     * the sync response? */
    MPIR_cc_set(&sreq->cc, 2);
    sreq->dev.OnDataAvail = 0;
    sreq->dev.OnFinal = 0;

    MPIDI_Pkt_init(es_pkt, MPIDI_CH3_PKT_EAGER_SYNC_SEND);
    es_pkt->match.parts.rank = comm->rank;
    es_pkt->match.parts.tag = tag;
    es_pkt->match.parts.context_id = comm->context_id + context_offset;
    es_pkt->sender_req_id = sreq->handle;
    es_pkt->data_sz = data_sz;

    MPIDI_Comm_get_vc_set_active(comm, rank, &vc);

    MPIDI_VC_FAI_send_seqnum(vc, seqnum);
    MPIDI_Pkt_set_seqnum(es_pkt, seqnum);
    MPIDI_Request_set_seqnum(sreq, seqnum);

    MPL_DBG_MSGPKT(vc,tag,es_pkt->match.parts.context_id,rank,data_sz,
                   "EagerSync");

    if (dt_contig)
    {
        MPL_IOV iov[2];
        MPL_DBG_MSG_FMT(MPIDI_CH3_DBG_OTHER,VERBOSE,(MPL_DBG_FDEST,
                        "sending contiguous sync eager message, data_sz=%" PRIdPTR,
                        data_sz));

        iov[0].MPL_IOV_BUF = (MPL_IOV_BUF_CAST)es_pkt;
        iov[0].MPL_IOV_LEN = sizeof(*es_pkt);
        iov[1].MPL_IOV_BUF = (MPL_IOV_BUF_CAST) ((char *)buf + dt_true_lb);
        iov[1].MPL_IOV_LEN = data_sz;

        MPID_THREAD_CS_ENTER(POBJ, vc->pobj_mutex);
        mpi_errno = MPIDI_CH3_iSendv(vc, sreq, iov, 2);
        MPID_THREAD_CS_EXIT(POBJ, vc->pobj_mutex);
        /* --BEGIN ERROR HANDLING-- */
        if (mpi_errno != MPI_SUCCESS) {
            /* Make sure to destroy the request before setting the pointer to
             * NULL, otherwise we lose the handle on the request */
            MPIR_Request_free(sreq);
            *sreq_p = NULL;
            MPIR_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**ch3|eagermsg");
        }
        /* --END ERROR HANDLING-- */
    }
    else
    {
        MPL_DBG_MSG_D(MPIDI_CH3_DBG_OTHER,VERBOSE,
                      "sending non-contiguous sync eager message, data_sz=%" PRIdPTR,
                      data_sz);

        sreq->dev.segment_ptr = MPIDU_Segment_alloc( );
        MPIR_ERR_CHKANDJUMP1((sreq->dev.segment_ptr == NULL), mpi_errno,
                             MPI_ERR_OTHER, "**nomem", "**nomem %s",
                             "MPIDU_Segment_alloc");
        MPIDU_Segment_init(buf, count, datatype, sreq->dev.segment_ptr, 0);
        sreq->dev.segment_first = 0;
        sreq->dev.segment_size = data_sz;

        MPID_THREAD_CS_ENTER(POBJ, vc->pobj_mutex);
        mpi_errno = vc->sendNoncontig_fn(vc, sreq, es_pkt,
                                         sizeof(MPIDI_CH3_Pkt_eager_sync_send_t));
        MPID_THREAD_CS_EXIT(POBJ, vc->pobj_mutex);
        if (mpi_errno) MPIR_ERR_POP(mpi_errno);
    }

 fn_exit:
    return mpi_errno;
 fn_fail:
    *sreq_p = NULL;
    goto fn_exit;
}
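/* MPIDI_CH3_SMP_Rendezvous_push - Push rendezvous data to a shared-memory
   peer, optionally using LiMIC2 for contiguous data */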
static int MPIDI_CH3_SMP_Rendezvous_push(MPIDI_VC_t * vc, MPID_Request * sreq)
{
    int nb;
    int complete = 0;
    int seqnum;
    int mpi_errno;
    MPIDI_CH3_Pkt_rndv_r3_data_t pkt_head;
    MPID_Request * send_req;

    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3I_SMP_RNDV_PUSH);
    MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3I_SMP_RNDV_PUSH);

    MPIDI_Pkt_init(&pkt_head, MPIDI_CH3_PKT_RNDV_R3_DATA);
    pkt_head.receiver_req_id = sreq->mrail.partner_id;

    MPIDI_VC_FAI_send_seqnum(vc, seqnum);
    MPIDI_Pkt_set_seqnum(&pkt_head, seqnum);
    MPIDI_Request_set_seqnum(sreq, seqnum);

#if defined(_SMP_LIMIC_)
    /* Use limic2 for contiguous data
     * Use shared memory for non-contiguous data */
    if (!g_smp_use_limic2 ||
        sreq->dev.OnDataAvail == MPIDI_CH3_ReqHandler_SendReloadIOV ||
        sreq->dev.iov_count > 1) {
        pkt_head.send_req_id = NULL;
    } else {
        pkt_head.send_req_id = sreq;
    }
#endif

    mpi_errno = MPIDI_CH3_iStartMsg(vc, &pkt_head,
                                    sizeof(MPIDI_CH3_Pkt_rndv_r3_data_t),
                                    &send_req);
    if (mpi_errno != MPI_SUCCESS) {
        MPIU_Object_set_ref(sreq, 0);
        MPIDI_CH3_Request_destroy(sreq);
        sreq = NULL;
        mpi_errno = MPIR_Err_create_code(mpi_errno, MPIR_ERR_FATAL, FCNAME,
                                         __LINE__, MPI_ERR_OTHER,
                                         "**ch3|rtspkt", 0);
        return mpi_errno;
    }
    /* --END ERROR HANDLING-- */
    if (send_req != NULL) {
        DEBUG_PRINT("r3 packet not sent \n");
        MPID_Request_release(send_req);
    }

#if defined(_SMP_LIMIC_)
    if (pkt_head.send_req_id) {
        sreq->mrail.nearly_complete = 1;
        return MPI_SUCCESS;
    }
#endif

    vc->smp.send_current_pkt_type = SMP_RNDV_MSG;

    DEBUG_PRINT("r3 sent req is %p\n", sreq);
    if (MPIDI_CH3I_SMP_SendQ_empty(vc)) {
        for (;;) {
            DEBUG_PRINT("iov count (sreq): %d, offset %d, len[1] %d\n",
                        sreq->dev.iov_count, sreq->dev.iov_offset,
                        sreq->dev.iov[0].MPID_IOV_LEN);

            if (vc->smp.send_current_pkt_type == SMP_RNDV_MSG) {
                mpi_errno = MPIDI_CH3I_SMP_writev_rndv_data(vc,
                                &sreq->dev.iov[sreq->dev.iov_offset],
                                sreq->dev.iov_count - sreq->dev.iov_offset,
                                &nb);
            } else {
                MPIU_Assert(vc->smp.send_current_pkt_type == SMP_RNDV_MSG_CONT);
                MPIDI_CH3I_SMP_writev_rndv_data_cont(vc,
                                &sreq->dev.iov[sreq->dev.iov_offset],
                                sreq->dev.iov_count - sreq->dev.iov_offset,
                                &nb);
            }

            if (MPI_SUCCESS != mpi_errno) {
                vc->ch.state = MPIDI_CH3I_VC_STATE_FAILED;
                sreq->status.MPI_ERROR = MPI_ERR_INTERN;
                MPIDI_CH3U_Request_complete(sreq);
                return mpi_errno;
            }

            if (nb >= 0) {
                if (MPIDI_CH3I_Request_adjust_iov(sreq, nb)) {
                    MPIDI_CH3U_Handle_send_req(vc, sreq, &complete);
                    if (complete) {
                        sreq->mrail.nearly_complete = 1;
                        break;
                    } else {
                        vc->smp.send_current_pkt_type = SMP_RNDV_MSG_CONT;
                    }
                } else {
                    sreq->ch.reqtype = REQUEST_RNDV_R3_DATA;
                    MPIDI_CH3I_SMP_SendQ_enqueue_head(vc, sreq);
                    vc->smp.send_active = sreq;
                    sreq->mrail.nearly_complete = 1;
                    vc->smp.send_current_pkt_type = SMP_RNDV_MSG_CONT;
                    break;
                }
            } else {
                MPIDI_CH3I_SMP_SendQ_enqueue_head(vc, sreq);
                vc->smp.send_active = sreq;
                sreq->mrail.nearly_complete = 1;
                break;
            }
        }
    } else {
        sreq->ch.reqtype = REQUEST_RNDV_R3_DATA;
        MPIDI_CH3I_SMP_SendQ_enqueue(vc, sreq);
        sreq->mrail.nearly_complete = 1;
        vc->smp.send_current_pkt_type = SMP_RNDV_MSG;
        DEBUG_PRINT("Enqueue sreq %p", sreq);
    }

    MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3I_SMP_RNDV_PUSH);
    return MPI_SUCCESS;
}
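/* MPID_Isend - Nonblocking send using the per-communicator/per-VC eager
   threshold returned by MPIDI_CH3_GET_EAGER_THRESHOLD */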
int MPID_Isend(const void * buf, int count, MPI_Datatype datatype, int rank,
               int tag, MPID_Comm * comm, int context_offset,
               MPID_Request ** request)
{
    MPIDI_msg_sz_t data_sz;
    int dt_contig;
    MPI_Aint dt_true_lb;
    MPID_Datatype * dt_ptr;
    MPID_Request * sreq;
    MPIDI_VC_t * vc=0;
#if defined(MPID_USE_SEQUENCE_NUMBERS)
    MPID_Seqnum_t seqnum;
#endif
    int eager_threshold = -1;
    int mpi_errno = MPI_SUCCESS;
    MPIDI_STATE_DECL(MPID_STATE_MPID_ISEND);

    MPIDI_FUNC_ENTER(MPID_STATE_MPID_ISEND);

    MPIU_DBG_MSG_FMT(CH3_OTHER,VERBOSE,(MPIU_DBG_FDEST,
                     "rank=%d, tag=%d, context=%d",
                     rank, tag, comm->context_id + context_offset));

    if (rank == comm->rank && comm->comm_kind != MPID_INTERCOMM)
    {
        mpi_errno = MPIDI_Isend_self(buf, count, datatype, rank, tag, comm,
                                     context_offset, MPIDI_REQUEST_TYPE_SEND,
                                     &sreq);
        goto fn_exit;
    }

    if (rank != MPI_PROC_NULL)
    {
        MPIDI_Comm_get_vc_set_active(comm, rank, &vc);
#ifdef ENABLE_COMM_OVERRIDES
        /* this needs to come before the sreq is created, since the override
         * function is responsible for creating its own request */
        if (vc->comm_ops && vc->comm_ops->isend)
        {
            mpi_errno = vc->comm_ops->isend( vc, buf, count, datatype, rank,
                                             tag, comm, context_offset, &sreq);
            goto fn_exit;
        }
#endif
    }

    MPIDI_Request_create_sreq(sreq, mpi_errno, goto fn_exit);
    MPIDI_Request_set_type(sreq, MPIDI_REQUEST_TYPE_SEND);

    if (rank == MPI_PROC_NULL)
    {
        MPIU_Object_set_ref(sreq, 1);
        MPID_cc_set(&sreq->cc, 0);
        goto fn_exit;
    }

    MPIDI_Datatype_get_info(count, datatype, dt_contig, data_sz, dt_ptr,
                            dt_true_lb);

    if (data_sz == 0)
    {
        MPIDI_CH3_Pkt_t upkt;
        MPIDI_CH3_Pkt_eager_send_t * const eager_pkt = &upkt.eager_send;

        MPIDI_Request_set_msg_type(sreq, MPIDI_REQUEST_EAGER_MSG);
        sreq->dev.OnDataAvail = 0;

        MPIU_DBG_MSG(CH3_OTHER,VERBOSE,"sending zero length message");
        MPIDI_Pkt_init(eager_pkt, MPIDI_CH3_PKT_EAGER_SEND);
        eager_pkt->match.parts.rank = comm->rank;
        eager_pkt->match.parts.tag = tag;
        eager_pkt->match.parts.context_id = comm->context_id + context_offset;
        eager_pkt->sender_req_id = sreq->handle;
        eager_pkt->data_sz = 0;

        MPIDI_VC_FAI_send_seqnum(vc, seqnum);
        MPIDI_Pkt_set_seqnum(eager_pkt, seqnum);
        MPIDI_Request_set_seqnum(sreq, seqnum);

        MPIU_THREAD_CS_ENTER(CH3COMM,vc);
        mpi_errno = MPIDI_CH3_iSend(vc, sreq, eager_pkt, sizeof(*eager_pkt));
        MPIU_THREAD_CS_EXIT(CH3COMM,vc);
        /* --BEGIN ERROR HANDLING-- */
        if (mpi_errno != MPI_SUCCESS)
        {
            MPIU_Object_set_ref(sreq, 0);
            MPIDI_CH3_Request_destroy(sreq);
            sreq = NULL;
            MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**ch3|eagermsg");
            goto fn_exit;
        }
        /* --END ERROR HANDLING-- */
        goto fn_exit;
    }

    MPIDI_CH3_GET_EAGER_THRESHOLD(&eager_threshold, comm, vc);

    /* FIXME: flow control: limit number of outstanding eager messages
       containing data and need to be buffered by the receiver */
    if (data_sz + sizeof(MPIDI_CH3_Pkt_eager_send_t) <= eager_threshold)
    {
        if (dt_contig)
        {
            mpi_errno = MPIDI_CH3_EagerContigIsend( &sreq,
                                                    MPIDI_CH3_PKT_EAGER_SEND,
                                                    (char*)buf + dt_true_lb,
                                                    data_sz, rank, tag, comm,
                                                    context_offset );
        }
        else
        {
            mpi_errno = MPIDI_CH3_EagerNoncontigSend( &sreq,
                                                      MPIDI_CH3_PKT_EAGER_SEND,
                                                      buf, count, datatype,
                                                      data_sz, rank, tag, comm,
                                                      context_offset );
            /* If we're not complete, then add a reference to the datatype */
            if (sreq && sreq->dev.OnDataAvail) {
                sreq->dev.datatype_ptr = dt_ptr;
                MPID_Datatype_add_ref(dt_ptr);
            }
        }
    }
    else
    {
        /* Note that the sreq was created above */
        MPIDI_Request_set_msg_type( sreq, MPIDI_REQUEST_RNDV_MSG );
        mpi_errno = vc->rndvSend_fn( &sreq, buf, count, datatype, dt_contig,
                                     data_sz, dt_true_lb, rank, tag, comm,
                                     context_offset );
        /* FIXME: fill temporary IOV or pack temporary buffer after send to
           hide some latency.  This requires synchronization because the CTS
           packet could arrive and be processed before the above iStartmsg
           completes (depending on the progress engine, threads, etc.). */

        if (sreq && dt_ptr != NULL) {
            sreq->dev.datatype_ptr = dt_ptr;
            MPID_Datatype_add_ref(dt_ptr);
        }
    }

 fn_exit:
    *request = sreq;
    MPIU_DBG_STMT(CH3_OTHER,VERBOSE,
    {
        if (sreq != NULL) {
            MPIU_DBG_MSG_P(CH3_OTHER,VERBOSE,
                           "request allocated, handle=0x%08x", sreq->handle);
        }
    });
    MPIDI_FUNC_EXIT(MPID_STATE_MPID_ISEND);
    return mpi_errno;
}
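/* MPID_Irsend - Nonblocking ready-mode send (ready-eager threshold variant);
   large messages fall back to a regular rendezvous send */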
int MPID_Irsend(const void * buf, int count, MPI_Datatype datatype, int rank,
                int tag, MPID_Comm * comm, int context_offset,
                MPID_Request ** request)
{
    MPIDI_CH3_Pkt_t upkt;
    MPIDI_CH3_Pkt_ready_send_t * const ready_pkt = &upkt.ready_send;
    MPIDI_msg_sz_t data_sz;
    int dt_contig;
    MPI_Aint dt_true_lb;
    MPID_Datatype * dt_ptr;
    MPID_Request * sreq;
    MPIDI_VC_t * vc;
#if defined(MPID_USE_SEQUENCE_NUMBERS)
    MPID_Seqnum_t seqnum;
#endif
    int mpi_errno = MPI_SUCCESS;
    MPIDI_STATE_DECL(MPID_STATE_MPID_IRSEND);

    MPIDI_FUNC_ENTER(MPID_STATE_MPID_IRSEND);

    MPIU_DBG_MSG_FMT(CH3_OTHER,VERBOSE,(MPIU_DBG_FDEST,
                     "rank=%d, tag=%d, context=%d",
                     rank, tag, comm->context_id + context_offset));

    /* Check to make sure the communicator hasn't already been revoked */
    if (comm->revoked &&
        MPIR_AGREE_TAG != MPIR_TAG_MASK_ERROR_BITS(tag & ~MPIR_Process.tagged_coll_mask) &&
        MPIR_SHRINK_TAG != MPIR_TAG_MASK_ERROR_BITS(tag & ~MPIR_Process.tagged_coll_mask)) {
        MPIR_ERR_SETANDJUMP(mpi_errno,MPIX_ERR_REVOKED,"**revoked");
    }

    if (rank == comm->rank && comm->comm_kind != MPID_INTERCOMM)
    {
        mpi_errno = MPIDI_Isend_self(buf, count, datatype, rank, tag, comm,
                                     context_offset, MPIDI_REQUEST_TYPE_RSEND,
                                     &sreq);
        goto fn_exit;
    }

    if (rank != MPI_PROC_NULL)
    {
        MPIDI_Comm_get_vc_set_active(comm, rank, &vc);
#ifdef ENABLE_COMM_OVERRIDES
        /* this needs to come before the sreq is created, since the override
         * function is responsible for creating its own request */
        if (vc->comm_ops && vc->comm_ops->irsend)
        {
            mpi_errno = vc->comm_ops->irsend( vc, buf, count, datatype, rank,
                                              tag, comm, context_offset,
                                              &sreq);
            goto fn_exit;
        }
#endif
    }

    MPIDI_Request_create_sreq(sreq, mpi_errno, goto fn_exit);
    MPIDI_Request_set_type(sreq, MPIDI_REQUEST_TYPE_RSEND);
    MPIDI_Request_set_msg_type(sreq, MPIDI_REQUEST_EAGER_MSG);

    if (rank == MPI_PROC_NULL)
    {
        MPIU_Object_set_ref(sreq, 1);
        MPID_cc_set(&sreq->cc, 0);
        goto fn_exit;
    }

    MPIDI_Datatype_get_info(count, datatype, dt_contig, data_sz, dt_ptr,
                            dt_true_lb);

    MPIDI_Pkt_init(ready_pkt, MPIDI_CH3_PKT_READY_SEND);
    ready_pkt->match.parts.rank = comm->rank;
    ready_pkt->match.parts.tag = tag;
    ready_pkt->match.parts.context_id = comm->context_id + context_offset;
    ready_pkt->sender_req_id = MPI_REQUEST_NULL;
    ready_pkt->data_sz = data_sz;

    if (data_sz == 0)
    {
        MPIU_DBG_MSG(CH3_OTHER,VERBOSE,"sending zero length message");

        sreq->dev.OnDataAvail = 0;

        MPIDI_VC_FAI_send_seqnum(vc, seqnum);
        MPIDI_Pkt_set_seqnum(ready_pkt, seqnum);
        MPIDI_Request_set_seqnum(sreq, seqnum);

        MPID_THREAD_CS_ENTER(POBJ, vc->pobj_mutex);
        mpi_errno = MPIDI_CH3_iSend(vc, sreq, ready_pkt, sizeof(*ready_pkt));
        MPID_THREAD_CS_EXIT(POBJ, vc->pobj_mutex);
        /* --BEGIN ERROR HANDLING-- */
        if (mpi_errno != MPI_SUCCESS)
        {
            MPID_Request_release(sreq);
            sreq = NULL;
            MPIR_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**ch3|eagermsg");
            goto fn_exit;
        }
        /* --END ERROR HANDLING-- */
        goto fn_exit;
    }

    if (vc->ready_eager_max_msg_sz < 0 ||
        data_sz + sizeof(MPIDI_CH3_Pkt_ready_send_t) <= vc->ready_eager_max_msg_sz)
    {
        if (dt_contig)
        {
            mpi_errno = MPIDI_CH3_EagerContigIsend( &sreq,
                                                    MPIDI_CH3_PKT_READY_SEND,
                                                    (char*)buf + dt_true_lb,
                                                    data_sz, rank, tag, comm,
                                                    context_offset );
        }
        else
        {
            mpi_errno = MPIDI_CH3_EagerNoncontigSend( &sreq,
                                                      MPIDI_CH3_PKT_READY_SEND,
                                                      buf, count, datatype,
                                                      data_sz, rank, tag, comm,
                                                      context_offset );
            /* If we're not complete, then add a reference to the datatype */
            if (sreq && sreq->dev.OnDataAvail) {
                sreq->dev.datatype_ptr = dt_ptr;
                MPID_Datatype_add_ref(dt_ptr);
            }
        }
    }
    else
    {
        /* Do rendezvous.  This will be sent as a regular send not as a ready
           send, so the receiver won't know to send an error if the receive
           has not been posted */
        MPIDI_Request_set_msg_type( sreq, MPIDI_REQUEST_RNDV_MSG );
        mpi_errno = vc->rndvSend_fn( &sreq, buf, count, datatype, dt_contig,
                                     data_sz, dt_true_lb, rank, tag, comm,
                                     context_offset );
        if (sreq && dt_ptr != NULL) {
            sreq->dev.datatype_ptr = dt_ptr;
            MPID_Datatype_add_ref(dt_ptr);
        }
    }

 fn_exit:
    *request = sreq;
    MPIU_DBG_STMT(CH3_OTHER,VERBOSE,
    {
        if (sreq != NULL) {
            MPIU_DBG_MSG_P(CH3_OTHER,VERBOSE,
                           "request allocated, handle=0x%08x", sreq->handle);
        }
    });
    MPIDI_FUNC_EXIT(MPID_STATE_MPID_IRSEND);
    return mpi_errno;
}
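/* MRAILI_RDMA_Put_finish - Send the RPUT_FINISH packet that tells the
   receiver the RDMA put for sreq has completed */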
void MRAILI_RDMA_Put_finish(MPIDI_VC_t * vc, MPID_Request * sreq,
                            MRAILI_Channel_info * subchannel)
{
    MPIDI_CH3_Pkt_rput_finish_t rput_pkt;
    MPID_IOV iov;
    int n_iov = 1;
    int nb, rdma_ok;
    int mpi_errno = MPI_SUCCESS;
    int seqnum;
    vbuf *buf;

    rput_pkt.type = MPIDI_CH3_PKT_RPUT_FINISH;
    rput_pkt.receiver_req_id = sreq->mrail.partner_id;
    iov.MPID_IOV_BUF = (void *)&rput_pkt;
    iov.MPID_IOV_LEN = sizeof(MPIDI_CH3_Pkt_rput_finish_t);

    MPIDI_VC_FAI_send_seqnum(vc, seqnum);
    MPIDI_Pkt_set_seqnum(&rput_pkt, seqnum);

    if (MPIDI_CH3I_RDMA_Process.has_rdma_fast_path)
    {
        rdma_ok = MPIDI_CH3I_MRAILI_Fast_rdma_ok(vc,
                      sizeof(MPIDI_CH3_Pkt_rput_finish_t));
        if (rdma_ok)
        {
            /* the packet header and the data now is in rdma fast buffer */
            mpi_errno = MPIDI_CH3I_MRAILI_Fast_rdma_send_complete(vc, &iov,
                                                                  n_iov, &nb,
                                                                  &buf);
            if (mpi_errno != MPI_SUCCESS && mpi_errno != MPI_MRAIL_MSG_QUEUED)
            {
                udapl_error_abort(UDAPL_STATUS_ERR,
                                  "Cannot send rput through rdma fast path");
            }
        }
        else
        {
            mpi_errno = MPIDI_CH3I_MRAILI_Eager_send(vc, &iov, n_iov,
                            sizeof(MPIDI_CH3_Pkt_rput_finish_t), &nb, &buf);
            if (mpi_errno != MPI_SUCCESS && mpi_errno != MPI_MRAIL_MSG_QUEUED)
            {
                udapl_error_abort(UDAPL_STATUS_ERR,
                                  "Cannot send rput through send/recv path");
            }
        }
    }
    else
    {
        mpi_errno = MPIDI_CH3I_MRAILI_Eager_send(vc, &iov, n_iov,
                        sizeof(MPIDI_CH3_Pkt_rput_finish_t), &nb, &buf);
        if (mpi_errno != MPI_SUCCESS && mpi_errno != MPI_MRAIL_MSG_QUEUED)
        {
            udapl_error_abort(UDAPL_STATUS_ERR,
                              "Cannot send rput through send/recv path");
        }
    }

    /* mark MPI send complete when VIA send completes */
    buf->sreq = (void *) sreq;
}
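/* MPID_nem_mxm_issend - Nonblocking synchronous send over the MXM netmod;
   noncontiguous data is packed into a temporary buffer before posting */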
int MPID_nem_mxm_issend(MPIDI_VC_t * vc, const void *buf, int count,
                        MPI_Datatype datatype, int rank, int tag,
                        MPID_Comm * comm, int context_offset,
                        MPID_Request ** sreq_ptr)
{
    int mpi_errno = MPI_SUCCESS;
    MPID_Request *sreq = NULL;
    MPID_Datatype *dt_ptr;
    int dt_contig;
    MPIDI_msg_sz_t data_sz;
    MPI_Aint dt_true_lb;
    MPID_nem_mxm_vc_area *vc_area = NULL;
    MPID_nem_mxm_req_area *req_area = NULL;
#if defined(MPID_USE_SEQUENCE_NUMBERS)
    MPID_Seqnum_t seqnum;
#endif
    MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_MXM_ISSEND);

    MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_MXM_ISSEND);

    MPIDI_Datatype_get_info(count, datatype, dt_contig, data_sz, dt_ptr,
                            dt_true_lb);

    /* create a request */
    MPIDI_Request_create_sreq(sreq, mpi_errno, goto fn_exit);
    MPIU_Assert(sreq != NULL);
    MPIDI_Request_set_type(sreq, MPIDI_REQUEST_TYPE_SEND);
    MPIDI_VC_FAI_send_seqnum(vc, seqnum);
    MPIDI_Request_set_seqnum(sreq, seqnum);
    if (HANDLE_GET_KIND(datatype) != HANDLE_KIND_BUILTIN) {
        MPID_Datatype_get_ptr(datatype, sreq->dev.datatype_ptr);
        MPID_Datatype_add_ref(sreq->dev.datatype_ptr);
    }
    sreq->partner_request = NULL;
    sreq->dev.OnDataAvail = NULL;
    sreq->dev.tmpbuf = NULL;
    sreq->ch.vc = vc;
    sreq->ch.noncontig = FALSE;

    _dbg_mxm_output(5,
        "isSend ========> Sending USER msg for req %p (context %d to %d tag %d size %d) \n",
        sreq, comm->context_id + context_offset, rank, tag, data_sz);

    vc_area = VC_BASE(vc);
    req_area = REQ_BASE(sreq);

    req_area->ctx = sreq;
    req_area->iov_buf = req_area->tmp_buf;
    req_area->iov_count = 0;
    req_area->iov_buf[0].ptr = NULL;
    req_area->iov_buf[0].length = 0;

    if (data_sz) {
        if (dt_contig) {
            req_area->iov_count = 1;
            req_area->iov_buf[0].ptr = (char *) (buf) + dt_true_lb;
            req_area->iov_buf[0].length = data_sz;
        } else {
            MPIDI_msg_sz_t last;
            MPI_Aint packsize = 0;

            sreq->ch.noncontig = TRUE;
            sreq->dev.segment_ptr = MPID_Segment_alloc();
            MPIU_ERR_CHKANDJUMP1((sreq->dev.segment_ptr == NULL), mpi_errno,
                                 MPI_ERR_OTHER, "**nomem", "**nomem %s",
                                 "MPID_Segment_alloc");
            MPIR_Pack_size_impl(count, datatype, &packsize);

            last = data_sz;
            if (packsize > 0) {
                sreq->dev.tmpbuf = MPIU_Malloc((size_t) packsize);
                MPIU_Assert(sreq->dev.tmpbuf);
                MPID_Segment_init(buf, count, datatype, sreq->dev.segment_ptr, 0);
                MPID_Segment_pack(sreq->dev.segment_ptr, 0, &last,
                                  sreq->dev.tmpbuf);

                req_area->iov_count = 1;
                req_area->iov_buf[0].ptr = sreq->dev.tmpbuf;
                req_area->iov_buf[0].length = last;
            }
        }
    }

    vc_area->pending_sends += 1;

    mpi_errno = _mxm_isend(vc_area->mxm_ep, req_area, MXM_MPICH_ISEND_SYNC,
                           (mxm_mq_h) comm->dev.ch.netmod_priv, comm->rank,
                           tag,
                           _mxm_tag_mpi2mxm(tag, comm->context_id + context_offset),
                           0);
    if (mpi_errno)
        MPIU_ERR_POP(mpi_errno);

    _dbg_mxm_out_req(sreq);

  fn_exit:
    *sreq_ptr = sreq;
    MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_MXM_ISSEND);
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}
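/* MPIDI_CH3_Connect_to_root - Connect to the process named by port_name
   (dynamic process support) and send a CM_ESTABLISH packet on the new VC */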
int MPIDI_CH3_Connect_to_root(const char* port_name, MPIDI_VC_t** new_vc)
{
    int mpi_errno = MPI_SUCCESS;
    int str_errno;
    char ifname[MAX_HOST_DESCRIPTION_LEN];
    MPIDI_VC_t *vc;
    MPIDI_CH3_Pkt_cm_establish_t pkt;
    MPID_Request * sreq;
    int seqnum;

    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3_CONNECT_TO_ROOT);
    MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3_CONNECT_TO_ROOT);

    *new_vc = NULL;
    if (!MPIDI_CH3I_Process.has_dpm)
        return MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_FATAL, FCNAME,
                                    __LINE__, MPI_ERR_OTHER, "**notimpl", 0);

    str_errno = MPIU_Str_get_string_arg(port_name,
                                        MPIDI_CH3I_HOST_DESCRIPTION_KEY,
                                        ifname, MAX_HOST_DESCRIPTION_LEN);
    if (str_errno != MPIU_STR_SUCCESS) {
        /* --BEGIN ERROR HANDLING-- */
        if (str_errno == MPIU_STR_FAIL) {
            MPIU_ERR_SETANDJUMP(mpi_errno,MPI_ERR_OTHER,
                                "**argstr_missinghost");
        } else {
            /* MPIU_STR_TRUNCATED or MPIU_STR_NONEM */
            MPIU_ERR_SETANDJUMP(mpi_errno,MPI_ERR_OTHER, "**argstr_hostd");
        }
        /* --END ERROR HANDLING-- */
    }

    vc = MPIU_Malloc(sizeof(MPIDI_VC_t));
    if (!vc) {
        MPIU_ERR_SETANDJUMP(mpi_errno,MPI_ERR_OTHER, "**nomem");
    }
    MPIDI_VC_Init(vc, NULL, 0);

    mpi_errno = MPIDI_CH3I_CM_Connect_raw_vc(vc, ifname);
    if (mpi_errno) {
        MPIU_ERR_POP(mpi_errno);
    }

    while (vc->ch.state != MPIDI_CH3I_VC_STATE_IDLE) {
        mpi_errno = MPID_Progress_test();
        /* --BEGIN ERROR HANDLING-- */
        if (mpi_errno != MPI_SUCCESS) {
            MPIU_ERR_POP(mpi_errno);
        }
    }

    /* fprintf(stderr, "[###] vc state to idle, now send cm_establish msg\n") */
    /* Now a connection is created, send a cm_establish message */
    /* FIXME: vc->mrail.remote_vc_addr is used to find remote vc
     * A more elegant way is needed */
    MPIDI_Pkt_init(&pkt, MPIDI_CH3_PKT_CM_ESTABLISH);
    MPIDI_VC_FAI_send_seqnum(vc, seqnum);
    MPIDI_Pkt_set_seqnum(&pkt, seqnum);
    pkt.vc_addr = vc->mrail.remote_vc_addr;
    mpi_errno = MPIDI_GetTagFromPort(port_name, &pkt.port_name_tag);
    if (mpi_errno != MPIU_STR_SUCCESS) {
        MPIU_ERR_SETANDJUMP(mpi_errno,MPI_ERR_OTHER,
                            "**argstr_port_name_tag");
    }

    mpi_errno = MPIDI_CH3_iStartMsg(vc, &pkt, sizeof(pkt), &sreq);
    if (mpi_errno != MPI_SUCCESS) {
        MPIU_ERR_SETANDJUMP1(mpi_errno,MPI_ERR_OTHER,"**fail", "**fail %s",
                             "Failed to send cm establish message");
    }

    if (sreq != NULL) {
        if (sreq->status.MPI_ERROR != MPI_SUCCESS) {
            mpi_errno = MPIR_Err_create_code(sreq->status.MPI_ERROR,
                                             MPIR_ERR_FATAL, FCNAME, __LINE__,
                                             MPI_ERR_OTHER, "**fail", 0);
            MPID_Request_release(sreq);
            goto fn_fail;
        }
        MPID_Request_release(sreq);
    }

    *new_vc = vc;

 fn_fail:
    MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3_CONNECT_TO_ROOT);
    return mpi_errno;
}