/* Prepares the send-side IOV for a knem LMT transfer and hands it to
 * do_dma_send().  For a contiguous datatype the IOV is built directly from
 * the user buffer; otherwise the segment code generates it.  On success the
 * knem cookie for the receiver is returned in *s_cookiep.
 *
 * Returns MPI_SUCCESS or an MPI error code (via the fn_fail path). */
static int send_sreq_data(MPIDI_VC_t *vc, MPID_Request *sreq, knem_cookie_t *s_cookiep)
{
    int mpi_errno = MPI_SUCCESS;
    int dt_contig;
    MPI_Aint dt_true_lb;
    MPIDI_msg_sz_t data_sz;
    MPID_Datatype * dt_ptr;

    /* MT: this code assumes only one thread can be at this point at a time */
    if (knem_fd < 0) {
        /* lazily open the knem device on first use */
        mpi_errno = open_knem_dev();
        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
    }

    /* find out contig/noncontig, size, and lb for the datatype */
    MPIDI_Datatype_get_info(sreq->dev.user_count, sreq->dev.datatype,
                            dt_contig, data_sz, dt_ptr, dt_true_lb);

    if (dt_contig) {
        /* handle the iov creation ourselves: one entry pointing straight at
           the user buffer (offset by the true lower bound) */
        sreq->dev.iov[0].MPID_IOV_BUF = (char *)sreq->dev.user_buf + dt_true_lb;
        sreq->dev.iov[0].MPID_IOV_LEN = data_sz;
        sreq->dev.iov_count = 1;
    }
    else {
        /* use the segment routines to handle the iovec creation */
        if (sreq->dev.segment_ptr == NULL) {
            sreq->dev.iov_count = MPID_IOV_LIMIT;
            sreq->dev.iov_offset = 0;

            /* segment_ptr may be non-null when this is a continuation of a
               many-part message that we couldn't fit in one single flight of
               iovs. */
            sreq->dev.segment_ptr = MPID_Segment_alloc();
            MPIU_ERR_CHKANDJUMP1((sreq->dev.segment_ptr == NULL), mpi_errno, MPI_ERR_OTHER, "**nomem", "**nomem %s", "MPID_Segment_alloc");
            MPID_Segment_init(sreq->dev.user_buf, sreq->dev.user_count, sreq->dev.datatype, sreq->dev.segment_ptr, 0);
            sreq->dev.segment_first = 0;
            sreq->dev.segment_size = data_sz;

            /* FIXME we should write our own function that isn't dependent on
               the in-request iov array.  This will let us use IOVs that are
               larger than MPID_IOV_LIMIT. */
            mpi_errno = MPIDI_CH3U_Request_load_send_iov(sreq, &sreq->dev.iov[0], &sreq->dev.iov_count);
            if (mpi_errno) MPIU_ERR_POP(mpi_errno);
        }
    }

    mpi_errno = do_dma_send(vc, sreq, sreq->dev.iov_count, sreq->dev.iov, s_cookiep);
    if (mpi_errno) MPIU_ERR_POP(mpi_errno);

 fn_exit:
    return mpi_errno;
 fn_fail:
    goto fn_exit;
}
/* Build the contiguous-chunk map for an RMA window datatype.
 * Contiguous types get a single inline map entry; noncontiguous types get a
 * heap-allocated DLOOP_VECTOR filled in by the segment pack-vector code.
 * On return dt->map and dt->num_contig describe the chunks. */
void MPIDI_Win_datatype_map(MPIDI_Datatype * dt)
{
  if (dt->contig) {
    /* single chunk: reuse the inline __map slot, offset = true_lb */
    dt->num_contig = 1;
    dt->map = &dt->__map;
    dt->map[0].DLOOP_VECTOR_BUF = (void *) (size_t) dt->true_lb;
    dt->map[0].DLOOP_VECTOR_LEN = dt->size;
    return;
  }

  /* upper bound on the number of contiguous blocks in the full buffer */
  unsigned max_entries = dt->pointer->max_contig_blocks * dt->count + 1;
  dt->num_contig = max_entries;
  dt->map = (DLOOP_VECTOR *) MPIU_Malloc(max_entries * sizeof(DLOOP_VECTOR));
  MPID_assert(dt->map != NULL);

  /* walk the type with a segment; pack_vector shrinks num_contig to the
     actual number of chunks produced */
  MPID_Segment segment;
  DLOOP_Offset stream_end = dt->pointer->size * dt->count;
  MPID_Segment_init(NULL, dt->count, dt->type, &segment, 0);
  MPID_Segment_pack_vector(&segment, 0, &stream_end, dt->map, &dt->num_contig);
  MPID_assert((unsigned) dt->num_contig <= max_entries);

#ifdef TRACE_ON
  TRACE_ERR("dt->pointer->size=%d num_contig: orig=%u new=%d\n", dt->pointer->size, max_entries, dt->num_contig);
  int i;
  for (i = 0; i < dt->num_contig; ++i)
    TRACE_ERR(" %d: BUF=%zu LEN=%zu\n", i, (size_t) dt->map[i].DLOOP_VECTOR_BUF, (size_t) dt->map[i].DLOOP_VECTOR_LEN);
#endif
}
/* MPIDI_CH3_EagerNoncontigSend - Eagerly send noncontiguous data.
 *
 * Builds an eager-send packet header, attaches a segment describing the
 * (noncontiguous) user buffer to the send request, and hands both to the
 * VC's sendNoncontig function.  On failure *sreq_p is set to NULL.
 *
 * sreq_p         in/out: pointer to the send request (nulled on failure)
 * reqtype        packet type to initialize the eager header with
 * buf/count/datatype  the user send buffer description
 * data_sz        total number of bytes to send
 * rank/tag/comm/context_offset  the message envelope */
int MPIDI_CH3_EagerNoncontigSend( MPID_Request **sreq_p, MPIDI_CH3_Pkt_type_t reqtype, const void * buf, MPI_Aint count, MPI_Datatype datatype, MPIDI_msg_sz_t data_sz, int rank, int tag, MPID_Comm * comm, int context_offset )
{
    int mpi_errno = MPI_SUCCESS;
    MPIDI_VC_t * vc;
    MPID_Request *sreq = *sreq_p;
    MPIDI_CH3_Pkt_t upkt;
    MPIDI_CH3_Pkt_eager_send_t * const eager_pkt = &upkt.eager_send;

    MPIU_DBG_MSG_FMT(CH3_OTHER,VERBOSE,(MPIU_DBG_FDEST, "sending non-contiguous eager message, data_sz=" MPIDI_MSG_SZ_FMT, data_sz));
    /* no completion handlers needed for the eager protocol */
    sreq->dev.OnDataAvail = 0;
    sreq->dev.OnFinal = 0;

    /* fill in the message envelope in the packet header */
    MPIDI_Pkt_init(eager_pkt, reqtype);
    eager_pkt->match.parts.rank = comm->rank;
    eager_pkt->match.parts.tag = tag;
    eager_pkt->match.parts.context_id = comm->context_id + context_offset;
    eager_pkt->sender_req_id = MPI_REQUEST_NULL;
    eager_pkt->data_sz = data_sz;

    MPIDI_Comm_get_vc_set_active(comm, rank, &vc);

    /* stamp packet and request with the next send sequence number */
    MPIDI_VC_FAI_send_seqnum(vc, seqnum);
    MPIDI_Pkt_set_seqnum(eager_pkt, seqnum);
    MPIDI_Request_set_seqnum(sreq, seqnum);

    MPIU_DBG_MSGPKT(vc,tag,eager_pkt->match.parts.context_id,rank,data_sz, "Eager");

    /* attach a segment describing the noncontiguous user buffer; the channel
       send function walks it to generate iovs */
    sreq->dev.segment_ptr = MPID_Segment_alloc( );
    MPIR_ERR_CHKANDJUMP1((sreq->dev.segment_ptr == NULL), mpi_errno, MPI_ERR_OTHER, "**nomem", "**nomem %s", "MPID_Segment_alloc");
    MPID_Segment_init(buf, count, datatype, sreq->dev.segment_ptr, 0);
    sreq->dev.segment_first = 0;
    sreq->dev.segment_size = data_sz;

    MPID_THREAD_CS_ENTER(POBJ, vc->pobj_mutex);
    mpi_errno = vc->sendNoncontig_fn(vc, sreq, eager_pkt, sizeof(MPIDI_CH3_Pkt_eager_send_t));
    MPID_THREAD_CS_EXIT(POBJ, vc->pobj_mutex);
    if (mpi_errno) MPIR_ERR_POP(mpi_errno);

 fn_exit:
    return mpi_errno;
 fn_fail:
    *sreq_p = NULL;
    goto fn_exit;
}
/* Receiver-side completion of an IB LMT transfer.  For noncontiguous
 * datatypes the data arrived in a packed staging buffer
 * (REQ_FIELD(rreq, lmt_pack_buf)); unpack it into the user buffer, flag a
 * type error if the stream could not be fully consumed, free the staging
 * buffer, then complete the receive request. */
int MPID_nem_ib_lmt_done_recv(struct MPIDI_VC *vc, struct MPID_Request *rreq)
{
    int mpi_errno = MPI_SUCCESS;
    MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_LMT_DONE_RECV);

    MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_LMT_DONE_RECV);

    dprintf("lmt_done_recv,enter,rreq=%p,head=%p\n", rreq, MPID_nem_ib_lmtq.head);

    int is_contig;
    MPID_Datatype_is_contig(rreq->dev.datatype, &is_contig);
    if (!is_contig) {
        dprintf("lmt_done_recv,copying noncontiguous data to user buffer\n");

        /* see MPIDI_CH3U_Request_unpack_uebuf (in /src/mpid/ch3/src/ch3u_request.c) */
        /* or MPIDI_CH3U_Receive_data_found (in src/mpid/ch3/src/ch3u_handle_recv_pkt.c) */
        MPIDI_msg_sz_t unpack_sz = rreq->ch.lmt_data_sz;
        MPID_Segment seg;
        MPI_Aint last;

        /* unpack the staged bytes into the user's (noncontiguous) buffer */
        MPID_Segment_init(rreq->dev.user_buf, rreq->dev.user_count, rreq->dev.datatype, &seg, 0);
        last = unpack_sz;
        MPID_Segment_unpack(&seg, 0, &last, REQ_FIELD(rreq, lmt_pack_buf));
        if (last != unpack_sz) {
            /* --BEGIN ERROR HANDLING-- */
            /* received data was not entirely consumed by unpack() because too
               few bytes remained to fill the next basic datatype */
            MPIR_STATUS_SET_COUNT(rreq->status, last);
            rreq->status.MPI_ERROR = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, FCNAME, __LINE__, MPI_ERR_TYPE, "**MPID_nem_ib_lmt_done_recv", 0);
            /* --END ERROR HANDLING-- */
        }
        /* staging buffer came from the ib channel's free-list allocator, not
           MPIU_Malloc, so release it there */
        //MPIU_Free(REQ_FIELD(rreq, lmt_pack_buf));
        MPID_nem_ib_stfree(REQ_FIELD(rreq, lmt_pack_buf), (size_t) rreq->ch.lmt_data_sz);
    }
    dprintf("lmt_done_recv,1,req=%p,pcc=%d\n", rreq, MPIDI_CH3I_progress_completion_count.v);
    MPIDI_CH3U_Request_complete(rreq);
    dprintf("lmt_done_recv,complete,req=%p\n", rreq);
    dprintf("lmt_done_recv,2,pcc=%d\n", MPIDI_CH3I_progress_completion_count.v);

  fn_exit:
    MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_IB_LMT_DONE_RECV);
    return mpi_errno;
    /* NOTE(review): fn_fail is commented out, leaving this goto unreachable
       after the return above — presumably kept for symmetry with the other
       lmt handlers */
    //fn_fail:
    goto fn_exit;
}
/* fills in req->dev.iov{,_offset,_count} based on the datatype info in the
   request, creating a segment if necessary */
static int populate_iov_from_req(MPID_Request *req)
{
    int mpi_errno = MPI_SUCCESS;
    int is_contig;
    MPI_Aint true_lb;
    MPIDI_msg_sz_t nbytes;
    MPID_Datatype *dtp;

    /* contig/noncontig flag, total byte count, and true lower bound */
    MPIDI_Datatype_get_info(req->dev.user_count, req->dev.datatype,
                            is_contig, nbytes, dtp, true_lb);

    if (!is_contig) {
        /* noncontiguous: let the segment code generate the iov entries */
        MPIU_Assert(req->dev.segment_ptr == NULL);
        req->dev.iov_count = MPL_IOV_LIMIT;
        req->dev.iov_offset = 0;

        /* XXX DJG FIXME where is this segment freed? */
        req->dev.segment_ptr = MPID_Segment_alloc();
        MPIR_ERR_CHKANDJUMP1((req->dev.segment_ptr == NULL), mpi_errno, MPI_ERR_OTHER, "**nomem", "**nomem %s", "MPID_Segment_alloc");
        MPID_Segment_init(req->dev.user_buf, req->dev.user_count,
                          req->dev.datatype, req->dev.segment_ptr, 0);
        req->dev.segment_first = 0;
        req->dev.segment_size = nbytes;

        /* FIXME we should write our own function that isn't dependent on the
           in-request iov array; that would allow IOVs larger than
           MPL_IOV_LIMIT. */
        mpi_errno = MPIDI_CH3U_Request_load_send_iov(req, &req->dev.iov[0], &req->dev.iov_count);
        if (mpi_errno) MPIR_ERR_POP(mpi_errno);
    }
    else {
        /* contiguous: one iov entry aimed straight at the user buffer */
        req->dev.iov[0].MPL_IOV_BUF = (char *) req->dev.user_buf + true_lb;
        req->dev.iov[0].MPL_IOV_LEN = nbytes;
        req->dev.iov_count = 1;
    }

fn_fail:
    return mpi_errno;
}
static int _mxm_process_rdtype(MPID_Request ** rreq_p, MPI_Datatype datatype, MPID_Datatype * dt_ptr, MPIDI_msg_sz_t data_sz, const void *buf, int count, mxm_req_buffer_t ** iov_buf, int *iov_count) { int mpi_errno = MPI_SUCCESS; MPID_Request *rreq = *rreq_p; MPIDI_msg_sz_t last; MPL_IOV *iov; int n_iov = 0; int index; if (rreq->dev.segment_ptr == NULL) { rreq->dev.segment_ptr = MPID_Segment_alloc(); MPIR_ERR_CHKANDJUMP1((rreq->dev.segment_ptr == NULL), mpi_errno, MPI_ERR_OTHER, "**nomem", "**nomem %s", "MPID_Segment_alloc"); } MPID_Segment_init(buf, count, datatype, rreq->dev.segment_ptr, 0); rreq->dev.segment_first = 0; rreq->dev.segment_size = data_sz; last = rreq->dev.segment_size; MPID_Segment_count_contig_blocks(rreq->dev.segment_ptr, rreq->dev.segment_first, &last, (MPI_Aint *) & n_iov); MPIU_Assert(n_iov > 0); iov = MPIU_Malloc(n_iov * sizeof(*iov)); MPIU_Assert(iov); last = rreq->dev.segment_size; MPID_Segment_unpack_vector(rreq->dev.segment_ptr, rreq->dev.segment_first, &last, iov, &n_iov); MPIU_Assert(last == rreq->dev.segment_size); #if defined(MXM_DEBUG) && (MXM_DEBUG > 0) _dbg_mxm_output(7, "Recv Noncontiguous data vector %i entries (free slots : %i)\n", n_iov, MXM_REQ_DATA_MAX_IOV); for (index = 0; index < n_iov; index++) { _dbg_mxm_output(7, "======= Recv iov[%i] = ptr : %p, len : %i \n", index, iov[index].MPL_IOV_BUF, iov[index].MPL_IOV_LEN); } #endif if (n_iov <= MXM_REQ_DATA_MAX_IOV) { if (n_iov > MXM_MPICH_MAX_IOV) { *iov_buf = (mxm_req_buffer_t *) MPIU_Malloc(n_iov * sizeof(**iov_buf)); MPIU_Assert(*iov_buf); } for (index = 0; index < n_iov; index++) { (*iov_buf)[index].ptr = iov[index].MPL_IOV_BUF; (*iov_buf)[index].length = iov[index].MPL_IOV_LEN; } rreq->dev.tmpbuf = NULL; rreq->dev.tmpbuf_sz = 0; *iov_count = n_iov; } else { MPI_Aint packsize = 0; MPIR_Pack_size_impl(rreq->dev.user_count, rreq->dev.datatype, &packsize); rreq->dev.tmpbuf = MPIU_Malloc((size_t) packsize); MPIU_Assert(rreq->dev.tmpbuf); rreq->dev.tmpbuf_sz = packsize; 
(*iov_buf)[0].ptr = rreq->dev.tmpbuf; (*iov_buf)[0].length = (size_t) packsize; *iov_count = 1; } MPIU_Free(iov); fn_exit: return mpi_errno; fn_fail: goto fn_exit; }
/* Simple PAMI-level allgather.  Noncontiguous send data is packed into a
 * temporary buffer; noncontiguous receive data is gathered into a packed
 * temporary buffer and unpacked afterwards.  Returns MPI_SUCCESS or an
 * MPICH fallback result; aborts on allocation failure (matching the other
 * MPIDO_* collectives). */
int MPIDO_Allgather_simple(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
                           void *recvbuf, int recvcount, MPI_Datatype recvtype,
                           MPID_Comm * comm_ptr, int *mpierrno)
{
#ifndef HAVE_PAMI_IN_PLACE
  if (sendbuf == MPI_IN_PLACE)
  {
    MPID_Abort (NULL, 0, 1, "'MPI_IN_PLACE' requries support for `PAMI_IN_PLACE`");
    return -1;
  }
#endif
   /* *********************************
    * Check the nature of the buffers
    * ********************************* */
   const struct MPIDI_Comm* const mpid = &(comm_ptr->mpid);
   MPID_Datatype * dt_null = NULL;
   void *snd_noncontig_buff = NULL, *rcv_noncontig_buff = NULL;
   MPI_Aint send_true_lb = 0;
   MPI_Aint recv_true_lb = 0;
   int snd_data_contig = 1, rcv_data_contig = 1;
   size_t send_size = 0;
   size_t recv_size = 0;
   MPID_Segment segment;
   volatile unsigned allgather_active = 1;
   const int rank = comm_ptr->rank;
   const int size = comm_ptr->local_size;
#if ASSERT_LEVEL==0
   /* We can't afford the tracing in ndebug/performance libraries */
   const unsigned verbose = 0;
#else
   const unsigned verbose = (MPIDI_Process.verbose >= MPIDI_VERBOSE_DETAILS_ALL) && (rank == 0);
#endif
   const pami_metadata_t *my_md;
   char *rbuf = NULL, *sbuf = NULL;

   if ((sendcount < 1 && sendbuf != MPI_IN_PLACE) || recvcount < 1)
      return MPI_SUCCESS;

   /* Gather datatype information (per-rank receive size in bytes) */
   MPIDI_Datatype_get_info(recvcount, recvtype, rcv_data_contig, recv_size,
                           dt_null, recv_true_lb);
   send_size = recv_size;

   /* optionally consult the collective-selection advisor */
   if(MPIDI_Pamix_collsel_advise != NULL && mpid->collsel_fast_query != NULL)
   {
     advisor_algorithm_t advisor_algorithms[1];
     int num_algorithms = MPIDI_Pamix_collsel_advise(mpid->collsel_fast_query, PAMI_XFER_ALLGATHER, send_size, advisor_algorithms, 1);
     if(num_algorithms)
     {
       if(advisor_algorithms[0].algorithm_type == COLLSEL_EXTERNAL_ALGO)
       {
         return MPIR_Allgather(sendbuf, sendcount, sendtype, recvbuf, recvcount, recvtype, comm_ptr, mpierrno);
       }
       else if(advisor_algorithms[0].metadata && advisor_algorithms[0].metadata->check_correct.values.asyncflowctl && !(--(comm_ptr->mpid.num_requests)))
       {
         /* flow control: periodically barrier to drain outstanding requests */
         comm_ptr->mpid.num_requests = MPIDI_Process.optimized.num_requests;
         int tmpmpierrno;
         if(unlikely(verbose))
           fprintf(stderr,"Query barrier required for %s\n", advisor_algorithms[0].metadata->name);
         MPIDO_Barrier(comm_ptr, &tmpmpierrno);
       }
     }
   }

   rbuf = (char *)recvbuf+recv_true_lb;

   if(!rcv_data_contig)
   {
      /* receive into a packed staging buffer (size bytes per rank) */
      rcv_noncontig_buff = MPL_malloc(recv_size * size);
      rbuf = rcv_noncontig_buff;
      if(rcv_noncontig_buff == NULL)
      {
         MPID_Abort(NULL, MPI_ERR_NO_SPACE, 1, "Fatal: Cannot allocate pack buffer");
      }
      if(sendbuf == MPI_IN_PLACE)
      {
        /* our own contribution must be staged into the pack buffer too */
        sbuf = PAMI_IN_PLACE;
        size_t extent;
        MPID_Datatype_get_extent_macro(recvtype,extent);
        MPIR_Localcopy(recvbuf + (rank*recvcount*extent), recvcount, recvtype,
                       rcv_noncontig_buff + (rank*recv_size), recv_size,MPI_CHAR);
      }
   }

   if(sendbuf != MPI_IN_PLACE)
   {
     MPIDI_Datatype_get_info(sendcount, sendtype, snd_data_contig, send_size,
                             dt_null, send_true_lb);
     sbuf = (char *)sendbuf+send_true_lb;
     if(!snd_data_contig)
     {
        /* pack noncontiguous send data */
        snd_noncontig_buff = MPL_malloc(send_size);
        sbuf = snd_noncontig_buff;
        if(snd_noncontig_buff == NULL)
        {
           MPID_Abort(NULL, MPI_ERR_NO_SPACE, 1, "Fatal: Cannot allocate pack buffer");
        }
        DLOOP_Offset last = send_size;
        MPID_Segment_init(sendbuf, sendcount, sendtype, &segment, 0);
        MPID_Segment_pack(&segment, 0, &last, snd_noncontig_buff);
     }
   }
   else
     sbuf = PAMI_IN_PLACE;

   TRACE_ERR("Using PAMI-level allgather protocol\n");
   pami_xfer_t allgather;
   allgather.cb_done = allgather_cb_done;
   allgather.cookie = (void *)&allgather_active;
   allgather.cmd.xfer_allgather.rcvbuf = rbuf;
   allgather.cmd.xfer_allgather.sndbuf = sbuf;
   allgather.cmd.xfer_allgather.stype = PAMI_TYPE_BYTE;/* stype is ignored when sndbuf == PAMI_IN_PLACE */
   allgather.cmd.xfer_allgather.rtype = PAMI_TYPE_BYTE;
   allgather.cmd.xfer_allgather.stypecount = send_size;
   allgather.cmd.xfer_allgather.rtypecount = recv_size;
   allgather.algorithm = mpid->coll_algorithm[PAMI_XFER_ALLGATHER][0][0];
   my_md = &mpid->coll_metadata[PAMI_XFER_ALLGATHER][0][0];

   TRACE_ERR("Calling PAMI_Collective with allgather structure\n");
   MPIDI_Post_coll_t allgather_post;
   MPIDI_Context_post(MPIDI_Context[0], &allgather_post.state, MPIDI_Pami_post_wrapper, (void *)&allgather);
   TRACE_ERR("Allgather %s\n", MPIDI_Process.context_post.active>0?"posted":"invoked");

   MPIDI_Update_last_algorithm(comm_ptr, my_md->name);
   MPID_PROGRESS_WAIT_WHILE(allgather_active);

   if(!rcv_data_contig)
   {
      /* FIX: the staging buffer holds recv_size*size bytes (one chunk per
         rank), so the destination count must cover all ranks.  The previous
         code passed only `recvcount`, unpacking just one rank's worth. */
      MPIR_Localcopy(rcv_noncontig_buff, recv_size * size, MPI_CHAR,
                     recvbuf, recvcount * size, recvtype);
      MPL_free(rcv_noncontig_buff);
   }
   if(!snd_data_contig)  MPL_free(snd_noncontig_buff);

   TRACE_ERR("Allgather done\n");
   return MPI_SUCCESS;
}
int MPID_nem_ib_lmt_switch_send(struct MPIDI_VC *vc, struct MPID_Request *req) { int mpi_errno = MPI_SUCCESS; int dt_contig; MPIDI_msg_sz_t data_sz; MPID_Datatype *dt_ptr; MPI_Aint dt_true_lb; MPID_IOV r_cookie = req->ch.lmt_tmp_cookie; MPID_nem_ib_lmt_cookie_t *r_cookie_buf = r_cookie.iov_base; MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_LMT_SWITCH_SEND); MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_LMT_SWITCH_SEND); MPIDI_Datatype_get_info(req->dev.user_count, req->dev.datatype, dt_contig, data_sz, dt_ptr, dt_true_lb); void *write_from_buf; if (dt_contig) { write_from_buf = req->dev.user_buf; } else { /* see MPIDI_CH3_EagerNoncontigSend (in ch3u_eager.c) */ req->dev.segment_ptr = MPID_Segment_alloc(); MPIU_ERR_CHKANDJUMP((req->dev.segment_ptr == NULL), mpi_errno, MPI_ERR_OTHER, "**outofmemory"); MPID_Segment_init(req->dev.user_buf, req->dev.user_count, req->dev.datatype, req->dev.segment_ptr, 0); req->dev.segment_first = 0; req->dev.segment_size = data_sz; MPIDI_msg_sz_t last; last = req->dev.segment_size; /* segment_size is byte offset */ MPIU_Assert(last > 0); REQ_FIELD(req, lmt_pack_buf) = MPIU_Malloc(data_sz); MPIU_ERR_CHKANDJUMP(!REQ_FIELD(req, lmt_pack_buf), mpi_errno, MPI_ERR_OTHER, "**outofmemory"); MPID_Segment_pack(req->dev.segment_ptr, req->dev.segment_first, &last, (char *) (REQ_FIELD(req, lmt_pack_buf))); MPIU_Assert(last == req->dev.segment_size); write_from_buf = REQ_FIELD(req, lmt_pack_buf); } //assert(dt_true_lb == 0); uint8_t *tailp = (uint8_t *) ((uint8_t *) write_from_buf /*+ dt_true_lb */ + data_sz - sizeof(uint8_t)); #if 0 *is_end_flag_same = (r_cookie_buf->tail == *tailp) ? 
1 : 0; #else REQ_FIELD(req, lmt_receiver_tail) = r_cookie_buf->tail; REQ_FIELD(req, lmt_sender_tail) = *tailp; dprintf("lmt_switch_send,tail on sender=%02x,tail onreceiver=%02x,req=%p\n", *tailp, r_cookie_buf->tail, req); #ifdef MPID_NEM_IB_DEBUG_LMT uint8_t *tail_wordp = (uint8_t *) ((uint8_t *) write_from_buf + data_sz - sizeof(uint32_t) * 2); #endif dprintf("lmt_switch_send,tail on sender=%d\n", *tail_wordp); fflush(stdout); #endif fn_exit: MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_IB_LMT_SWITCH_SEND); return mpi_errno; fn_fail: goto fn_exit; }
/* Sender side of the TCP-based LMT rendezvous: connects to the receiver's
 * side-channel socket (hostname/port carried in r_cookie) if not already
 * connected, then streams the message data over it with writev, walking the
 * request's segment/iov.  Truncation (receiver posted fewer bytes) is
 * recorded in req->status.  Completes the request when all data is sent. */
int MPID_nem_tcp_module_lmt_start_send (MPIDI_VC_t *vc, MPID_Request *req, MPID_IOV r_cookie)
{
    int mpi_errno = MPI_SUCCESS;
    int ret;
    MPIDI_msg_sz_t data_sz;
    int dt_contig;
    MPI_Aint dt_true_lb;
    MPID_Datatype * dt_ptr;
    MPIDI_msg_sz_t last;
    int nb;
    int s_len = 0;
    int r_len;
    int r_port;
    char *r_hostname;
    MPIDI_CH3I_VC *vc_ch = (MPIDI_CH3I_VC *)vc->channel_private;
    MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_TCP_MODULE_LMT_START_SEND);

    MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_TCP_MODULE_LMT_START_SEND);

    /* decode the receiver's cookie: where to connect and how many bytes the
       receiver is prepared to accept */
    mpi_errno = read_r_cookie (r_cookie, &r_hostname, &r_port, &r_len);
    if (mpi_errno) MPIU_ERR_POP (mpi_errno);

    free_cookie (vc_ch->net.tcp.lmt_cookie);

    if (!vc_ch->net.tcp.lmt_connected)
    {
        struct sockaddr_in saddr;
        struct hostent *hp;

        vc_ch->net.tcp.lmt_desc = socket (AF_INET, SOCK_STREAM, 0);
        MPIU_ERR_CHKANDJUMP2 (vc_ch->net.tcp.lmt_desc == -1, mpi_errno, MPI_ERR_OTHER, "**sock_create", "**sock_create %s %d", strerror (errno), errno);

//         ret = fcntl (vc_ch->net.tcp.lmt_desc, F_SETFL, O_NONBLOCK);
//         MPIU_ERR_CHKANDJUMP2 (ret == -1, mpi_errno, MPI_ERR_OTHER, "**fail", "**fail %s %d", strerror (errno), errno);

        hp = gethostbyname (r_hostname);
        MPIU_ERR_CHKANDJUMP2 (hp == NULL, mpi_errno, MPI_ERR_OTHER, "**gethostbyname", "**gethostbyname %s %d", hstrerror (h_errno), h_errno);

        /* FIX: the arguments were swapped — memset(&saddr, sizeof(saddr), 0)
           fills zero bytes, leaving saddr uninitialized.  Zero the struct
           before populating it. */
        memset (&saddr, 0, sizeof(saddr));
        saddr.sin_family = AF_INET;
        saddr.sin_port = htons (r_port);
        MPIU_Memcpy (&saddr.sin_addr, hp->h_addr, hp->h_length);

        set_sockopts (vc_ch->net.tcp.lmt_desc);

        ret = connect (vc_ch->net.tcp.lmt_desc, (struct sockaddr *)&saddr, sizeof(saddr));
        MPIU_ERR_CHKANDJUMP2 (ret == -1, mpi_errno, MPI_ERR_OTHER, "**fail", "**fail %s %d", strerror (errno), errno);
        vc_ch->net.tcp.lmt_connected = 1;
    }

    MPIDI_Datatype_get_info (req->dev.user_count, req->dev.datatype, dt_contig, data_sz, dt_ptr, dt_true_lb);

    if (r_len < data_sz)
    {
        /* message will be truncated: send only what the receiver can take
           and record the truncation in the request status */
        s_len = data_sz;
        data_sz = r_len;
        req->status.MPI_ERROR = MPIU_ERR_SET2 (mpi_errno, MPI_ERR_TRUNCATE, "**truncate", "**truncate %d %d", s_len, r_len);
    }

    MPID_Segment_init (req->dev.user_buf, req->dev.user_count, req->dev.datatype, &req->dev.segment, 0);
    req->dev.segment_first = 0;
    req->dev.segment_size = data_sz;
    req->dev.iov_count = MPID_IOV_LIMIT;
    req->dev.iov_offset = 0;

    last = data_sz;
    do
    {
        int iov_offset;
        int left_to_send;

        /* generate the next flight of iovs from the segment */
        MPID_Segment_pack_vector (&req->dev.segment, req->dev.segment_first, &last, req->dev.iov, &req->dev.iov_count);
        left_to_send = last - req->dev.segment_first;
        iov_offset = 0;
#ifdef TESTING_CHUNKING
        {
            /* debug mode: send in fixed CHUNK-sized write()s instead of writev */
            char *buf = req->dev.iov[0].MPID_IOV_BUF;
            int l;
            while (left_to_send)
            {
                if (left_to_send > CHUNK)
                    l = CHUNK;
                else
                    l = left_to_send;
                do
                    nb = write (vc_ch->net.tcp.lmt_desc, buf, l);
                while (nb == -1 && errno == EINTR);
                MPIU_ERR_CHKANDJUMP (nb == -1, mpi_errno, MPI_ERR_OTHER, "**sock_writev");
                left_to_send -= nb;
                buf += nb;
            }
            MPIDI_CH3U_Request_complete (req);
            goto fn_exit;
        }
#endif
        do
            nb = writev (vc_ch->net.tcp.lmt_desc, &req->dev.iov[iov_offset], req->dev.iov_count - iov_offset);
        while (nb == -1 && errno == EINTR);
        MPIU_ERR_CHKANDJUMP (nb == -1, mpi_errno, MPI_ERR_OTHER, "**sock_writev");

        left_to_send -= nb;

        while (left_to_send)
        {
            /* send rest of iov */
            while (nb >= req->dev.iov[iov_offset].MPID_IOV_LEN)
            {
                /* update iov to reflect sent bytes */
                nb -= req->dev.iov[iov_offset].MPID_IOV_LEN;
                ++iov_offset;
            }
            req->dev.iov[iov_offset].MPID_IOV_BUF = (char *)req->dev.iov[iov_offset].MPID_IOV_BUF + nb;
            req->dev.iov[iov_offset].MPID_IOV_LEN -= nb;

            do
                nb = writev (vc_ch->net.tcp.lmt_desc, &req->dev.iov[iov_offset], req->dev.iov_count - iov_offset);
            while (nb == -1 && errno == EINTR);
            MPIU_ERR_CHKANDJUMP (nb == -1, mpi_errno, MPI_ERR_OTHER, "**sock_writev");

            left_to_send -= nb;
        }
    }
    while (last < data_sz);

    MPIDI_CH3U_Request_complete (req);

 fn_exit:
    MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_TCP_MODULE_LMT_START_SEND);
    return mpi_errno;
 fn_fail:
    goto fn_exit;
}
/* Simple PAMI-level gatherv.  Noncontiguous send data is packed before the
 * collective; if the root's receive datatype cannot be expressed as a PAMI
 * type, the root gathers into a packed staging buffer using byte
 * counts/displacements (rcounts/rdispls) and unpacks afterwards.  Returns
 * MPI_SUCCESS or delegates to MPIR_Gatherv; aborts on allocation failure
 * (matching the other MPIDO_* collectives). */
int MPIDO_Gatherv_simple(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
                  void *recvbuf, const int *recvcounts, const int *displs, MPI_Datatype recvtype,
                  int root, MPID_Comm * comm_ptr, int *mpierrno)
{
#ifndef HAVE_PAMI_IN_PLACE
  if (sendbuf == MPI_IN_PLACE)
  {
    MPID_Abort (NULL, 0, 1, "'MPI_IN_PLACE' requries support for `PAMI_IN_PLACE`");
    return -1;
  }
#endif
   TRACE_ERR("Entering MPIDO_Gatherv_optimized\n");
   int snd_contig = 1, rcv_contig = 1;
   void *snd_noncontig_buff = NULL, *rcv_noncontig_buff = NULL;
   void *sbuf = NULL, *rbuf = NULL;
   int  *rcounts = NULL;
   int  *rdispls = NULL;
   int send_size = 0;
   int recv_size = 0;
   int rcvlen    = 0;
   int totalrecvcount  = 0;
   pami_type_t rtype = PAMI_TYPE_NULL;
   MPID_Segment segment;
   MPID_Datatype *data_ptr = NULL;
   int send_true_lb, recv_true_lb = 0;
   int i, tmp;
   volatile unsigned gatherv_active = 1;
   const int rank = comm_ptr->rank;
   const int size = comm_ptr->local_size;
#if ASSERT_LEVEL==0
   /* We can't afford the tracing in ndebug/performance libraries */
   const unsigned verbose = 0;
#else
   const unsigned verbose = (MPIDI_Process.verbose >= MPIDI_VERBOSE_DETAILS_ALL) && (rank == 0);
#endif
   const struct MPIDI_Comm* const mpid = &(comm_ptr->mpid);

   int recvok=PAMI_SUCCESS, recvcontinuous=0;

   if(sendbuf != MPI_IN_PLACE)
   {
     MPIDI_Datatype_get_info(sendcount, sendtype, snd_contig, send_size, data_ptr, send_true_lb);
     /* optionally consult the collective-selection advisor */
     if(MPIDI_Pamix_collsel_advise != NULL && mpid->collsel_fast_query != NULL)
     {
       advisor_algorithm_t advisor_algorithms[1];
       int num_algorithms = MPIDI_Pamix_collsel_advise(mpid->collsel_fast_query, PAMI_XFER_GATHERV_INT, 64, advisor_algorithms, 1);
       if(num_algorithms)
       {
         if(advisor_algorithms[0].algorithm_type == COLLSEL_EXTERNAL_ALGO)
         {
           return MPIR_Gatherv(sendbuf, sendcount, sendtype, recvbuf, recvcounts, displs, recvtype, root, comm_ptr, mpierrno);
         }
         else if(advisor_algorithms[0].metadata && advisor_algorithms[0].metadata->check_correct.values.asyncflowctl && !(--(comm_ptr->mpid.num_requests)))
         {
           /* flow control: periodically barrier to drain outstanding requests */
           comm_ptr->mpid.num_requests = MPIDI_Process.optimized.num_requests;
           int tmpmpierrno;
           if(unlikely(verbose))
             fprintf(stderr,"Query barrier required for %s\n", advisor_algorithms[0].metadata->name);
           MPIDO_Barrier(comm_ptr, &tmpmpierrno);
         }
       }
     }

     sbuf = (char *)sendbuf + send_true_lb;
     if(!snd_contig)
     {
        /* pack noncontiguous send data */
        snd_noncontig_buff = MPL_malloc(send_size);
        sbuf = snd_noncontig_buff;
        if(snd_noncontig_buff == NULL)
        {
           MPID_Abort(NULL, MPI_ERR_NO_SPACE, 1, "Fatal: Cannot allocate pack buffer");
        }
        DLOOP_Offset last = send_size;
        MPID_Segment_init(sendbuf, sendcount, sendtype, &segment, 0);
        MPID_Segment_pack(&segment, 0, &last, snd_noncontig_buff);
     }
   }
   else
   {
     MPIDI_Datatype_get_info(1, recvtype, rcv_contig, rcvlen, data_ptr, recv_true_lb);
     if(MPIDI_Pamix_collsel_advise != NULL && mpid->collsel_fast_query != NULL)
     {
       advisor_algorithm_t advisor_algorithms[1];
       int num_algorithms = MPIDI_Pamix_collsel_advise(mpid->collsel_fast_query, PAMI_XFER_GATHERV_INT, 64, advisor_algorithms, 1);
       if(num_algorithms)
       {
         if(advisor_algorithms[0].algorithm_type == COLLSEL_EXTERNAL_ALGO)
         {
           return MPIR_Gatherv(sendbuf, sendcount, sendtype, recvbuf, recvcounts, displs, recvtype, root, comm_ptr, mpierrno);
         }
         else if(advisor_algorithms[0].metadata && advisor_algorithms[0].metadata->check_correct.values.asyncflowctl && !(--(comm_ptr->mpid.num_requests)))
         {
           comm_ptr->mpid.num_requests = MPIDI_Process.optimized.num_requests;
           int tmpmpierrno;
           if(unlikely(verbose))
             fprintf(stderr,"Query barrier required for %s\n", advisor_algorithms[0].metadata->name);
           MPIDO_Barrier(comm_ptr, &tmpmpierrno);
         }
       }
     }
   }

   pami_xfer_t gatherv;

   rbuf = (char *)recvbuf + recv_true_lb;
   rcounts = (int*)recvcounts;
   rdispls = (int*)displs;
   if(rank == root)
   {
      if((recvok = MPIDI_Datatype_to_pami(recvtype, &rtype, -1, NULL, &tmp)) != MPI_SUCCESS)
      {
        /* receive type has no PAMI equivalent: gather bytes into a packed
           staging buffer with byte counts/displacements, unpack afterwards */
        MPIDI_Datatype_get_info(1, recvtype, rcv_contig, rcvlen, data_ptr, recv_true_lb);
        totalrecvcount = recvcounts[0];
        recvcontinuous = displs[0] == 0? 1 : 0 ;
        /* FIX: the previous code allocated `size` BYTES for `size` ints
           (MPL_malloc(size)), overflowing the heap for any communicator
           larger than sizeof(int) ranks.  Allocate size * sizeof(int). */
        rcounts = (int*)MPL_malloc(size * sizeof(int));
        rdispls = (int*)MPL_malloc(size * sizeof(int));
        rdispls[0] = 0;
        rcounts[0] = rcvlen * recvcounts[0];
        for(i = 1; i < size; i++)
        {
          rdispls[i]= rcvlen * totalrecvcount;
          totalrecvcount += recvcounts[i];
          if(displs[i] != (displs[i-1] + recvcounts[i-1]))
            recvcontinuous = 0;
          rcounts[i] = rcvlen * recvcounts[i];
        }
        recv_size = rcvlen * totalrecvcount;

        rcv_noncontig_buff = MPL_malloc(recv_size);
        rbuf = rcv_noncontig_buff;
        rtype = PAMI_TYPE_BYTE;
        if(rcv_noncontig_buff == NULL)
        {
           MPID_Abort(NULL, MPI_ERR_NO_SPACE, 1, "Fatal: Cannot allocate pack buffer");
        }
        if(sendbuf == MPI_IN_PLACE)
        {
          /* stage the root's own contribution into the pack buffer */
          size_t extent;
          MPID_Datatype_get_extent_macro(recvtype,extent);
          MPIR_Localcopy(recvbuf + displs[rank]*extent, recvcounts[rank], recvtype,
                         rcv_noncontig_buff + rdispls[rank], rcounts[rank],MPI_CHAR);
        }
      }
      if(sendbuf == MPI_IN_PLACE)
      {
        gatherv.cmd.xfer_gatherv_int.sndbuf = PAMI_IN_PLACE;
      }
      else
      {
        gatherv.cmd.xfer_gatherv_int.sndbuf = sbuf;
      }
      gatherv.cmd.xfer_gatherv_int.stype = PAMI_TYPE_BYTE;/* stype is ignored when sndbuf == PAMI_IN_PLACE */
      gatherv.cmd.xfer_gatherv_int.stypecount = send_size;
   }
   else
   {
      gatherv.cmd.xfer_gatherv_int.sndbuf = sbuf;
      gatherv.cmd.xfer_gatherv_int.stype = PAMI_TYPE_BYTE;
      gatherv.cmd.xfer_gatherv_int.stypecount = send_size;
   }

   gatherv.cb_done = cb_gatherv;
   gatherv.cookie = (void *)&gatherv_active;
   gatherv.cmd.xfer_gatherv_int.root = MPIDI_Task_to_endpoint(MPID_VCR_GET_LPID(comm_ptr->vcr, root), 0);
   gatherv.cmd.xfer_gatherv_int.rcvbuf = rbuf;
   gatherv.cmd.xfer_gatherv_int.rtype = rtype;
   gatherv.cmd.xfer_gatherv_int.rtypecounts = (int *) rcounts;
   gatherv.cmd.xfer_gatherv_int.rdispls = (int *) rdispls;

   const pami_metadata_t *my_gatherv_md;

   gatherv.algorithm = mpid->coll_algorithm[PAMI_XFER_GATHERV_INT][0][0];
   my_gatherv_md = &mpid->coll_metadata[PAMI_XFER_GATHERV_INT][0][0];

   MPIDI_Update_last_algorithm(comm_ptr, my_gatherv_md->name);

   MPIDI_Post_coll_t gatherv_post;
   TRACE_ERR("%s gatherv\n", MPIDI_Process.context_post.active>0?"Posting":"Invoking");
   MPIDI_Context_post(MPIDI_Context[0], &gatherv_post.state, MPIDI_Pami_post_wrapper, (void *)&gatherv);
   TRACE_ERR("Gatherv %s\n", MPIDI_Process.context_post.active>0?"posted":"invoked");

   TRACE_ERR("Waiting on active %d\n", gatherv_active);
   MPID_PROGRESS_WAIT_WHILE(gatherv_active);

   if(!rcv_contig || recvok != PAMI_SUCCESS)
   {
      /* unpack the byte-gathered data into the user's receive buffer */
      if(recvcontinuous)
      {
         MPIR_Localcopy(rcv_noncontig_buff, recv_size, MPI_CHAR, recvbuf, totalrecvcount, recvtype);
      }
      else
      {
         size_t extent;
         MPID_Datatype_get_extent_macro(recvtype,extent);
         for(i=0; i<size; ++i)
         {
            char* scbuf = (char*)rcv_noncontig_buff+ rdispls[i];
            char* rcbuf = (char*)recvbuf + displs[i]*extent;
            MPIR_Localcopy(scbuf, rcounts[i], MPI_CHAR, rcbuf, recvcounts[i], recvtype);
            /* FIX: these traces referenced nonexistent identifiers
               precvdispls/precvcounts, breaking TRACE_ON builds; use the
               actual rdispls/rcounts arrays. */
            TRACE_ERR("Pack recv src  extent %zu, displ[%zu]=%zu, count[%zu]=%zu buf[%zu]=%u\n", (size_t)extent, (size_t)i,(size_t)rdispls[i],(size_t)i,(size_t)rcounts[i],(size_t)rdispls[i], *(int*)scbuf);
            TRACE_ERR("Pack recv dest extent %zu, displ[%zu]=%zu, count[%zu]=%zu buf[%zu]=%u\n", (size_t)extent, (size_t)i,(size_t)displs[i],(size_t)i,(size_t)recvcounts[i],(size_t)displs[i], *(int*)rcbuf);
         }
      }
      MPL_free(rcv_noncontig_buff);
      if(rank == root)
      {
         MPL_free(rcounts);
         MPL_free(rdispls);
      }
   }
   if(!snd_contig)  MPL_free(snd_noncontig_buff);

   TRACE_ERR("Leaving MPIDO_Gatherv_optimized\n");
   return MPI_SUCCESS;
}
/* Receiver side of a knem DMA LMT: builds the receive IOV (directly for
 * contiguous datatypes, via the segment code otherwise), starts the DMA copy
 * described by the sender's cookie, handles immediate completion/failure,
 * and queues the transfer on the outstanding list for later progress. */
int MPID_nem_lmt_dma_start_recv(MPIDI_VC_t *vc, MPID_Request *rreq, MPID_IOV s_cookie)
{
    int mpi_errno = MPI_SUCCESS;
    int nodma;
    int dt_contig;
    MPI_Aint dt_true_lb;
    MPIDI_msg_sz_t data_sz;
    MPID_Datatype * dt_ptr;
    volatile knem_status_t *status;
    knem_status_t current_status;
    struct lmt_dma_node *node = NULL;
    MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_LMT_DMA_START_RECV);

    MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_LMT_DMA_START_RECV);

    /* MT: this code assumes only one thread can be at this point at a time */
    if (knem_fd < 0) {
        mpi_errno = open_knem_dev();
        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
    }

    /* find out contig/noncontig, size, and lb for the datatype */
    MPIDI_Datatype_get_info(rreq->dev.user_count, rreq->dev.datatype,
                            dt_contig, data_sz, dt_ptr, dt_true_lb);

    /* fall back to copying (no DMA) for small messages or when knem lacks DMA */
    nodma = !knem_has_dma || data_sz < MPIR_CVAR_NEMESIS_LMT_DMA_THRESHOLD;

    if (dt_contig) {
        /* handle the iov creation ourselves */
        rreq->dev.iov[0].MPID_IOV_BUF = (char *)rreq->dev.user_buf + dt_true_lb;
        rreq->dev.iov[0].MPID_IOV_LEN = data_sz;
        rreq->dev.iov_count = 1;
    }
    else {
        if (rreq->dev.segment_ptr == NULL) {
            /* segment_ptr may be non-null when this is a continuation of a
               many-part message that we couldn't fit in one single flight of
               iovs. */
            rreq->dev.segment_ptr = MPID_Segment_alloc();
            MPIU_ERR_CHKANDJUMP1((rreq->dev.segment_ptr == NULL), mpi_errno, MPI_ERR_OTHER, "**nomem", "**nomem %s", "MPID_Segment_alloc");
            MPID_Segment_init(rreq->dev.user_buf, rreq->dev.user_count,
                              rreq->dev.datatype, rreq->dev.segment_ptr, 0);
            rreq->dev.segment_first = 0;
            rreq->dev.segment_size = data_sz;

            /* see load_send_iov FIXME above */
            mpi_errno = MPIDI_CH3U_Request_load_recv_iov(rreq);
            if (mpi_errno) MPIU_ERR_POP(mpi_errno);
        }
    }

    MPIU_Assert(s_cookie.MPID_IOV_LEN == sizeof(knem_cookie_t));
    MPIU_Assert(s_cookie.MPID_IOV_BUF != NULL);
    mpi_errno = do_dma_recv(rreq->dev.iov_count, rreq->dev.iov,
                            *((knem_cookie_t *)s_cookie.MPID_IOV_BUF), nodma,
                            &status, &current_status);
    if (mpi_errno) MPIU_ERR_POP(mpi_errno);

    /* TODO refactor this block and MPID_nem_lmt_dma_progress (and anywhere
     * else) to share a common function.  This advancement/completion code is
     * duplication. */
    if (current_status != KNEM_STATUS_PENDING) {
        /* complete the request if all data has been sent, remove it from the list */
        int complete = 0;

        MPIU_ERR_CHKANDJUMP1(current_status == KNEM_STATUS_FAILED, mpi_errno, MPI_ERR_OTHER,
                             "**recv_status", "**recv_status %d", current_status);

        mpi_errno = check_req_complete(vc, rreq, &complete);
        if (mpi_errno) MPIU_ERR_POP(mpi_errno);

        free_status_index(status - knem_status);

        if (complete) {
            /* request was completed by the OnDataAvail fn */
            MPID_nem_lmt_send_DONE(vc, rreq); /* tell the other side to complete its request */
            MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, ".... complete");
        }
        else {
            /* There is more data to send.  We must inform the sender that we
               have completely received the current batch and that the next
               batch should be sent. */
            MPID_nem_lmt_send_COOKIE(vc, rreq, NULL, 0);
        }
    }

    /* XXX DJG FIXME this looks like it always pushes! */
    /* push request if not complete for progress checks later */
    node = MPIU_Malloc(sizeof(struct lmt_dma_node));
    /* FIX: the allocation result was dereferenced without a NULL check */
    MPIU_ERR_CHKANDJUMP1(node == NULL, mpi_errno, MPI_ERR_OTHER, "**nomem", "**nomem %s", "lmt_dma_node");
    node->vc = vc;
    node->req = rreq;
    node->status_p = status;
    node->next = outstanding_head;
    outstanding_head = node;
    ++MPID_nem_local_lmt_pending;

 fn_exit:
    MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_LMT_DMA_START_RECV);
    return mpi_errno;
 fn_fail:
    goto fn_exit;
}
/**
 * \brief PAMI-optimized MPI_Bcast.
 *
 * Selects a PAMI broadcast protocol (user-selected or optimizer-selected),
 * packs non-contiguous data into a temporary byte buffer, posts the PAMI
 * transfer, and waits for completion.  Messages larger than BCAST_LIMIT are
 * split into multiple recursive broadcasts because the PAMI typecount field
 * is an int.
 *
 * \param[in,out] buffer    send buffer at root, receive buffer elsewhere
 * \param[in]     count     number of elements
 * \param[in]     datatype  element datatype
 * \param[in]     root      broadcast root rank
 * \param[in]     comm_ptr  communicator
 * \param[out]    mpierrno  error return for the MPICH fallback path
 * \return MPI_SUCCESS (0) or the error code from a fallback/recursive call
 *
 * Fix applied: the check_fn query branch previously declared a second local
 * `result`, shadowing the outer one; the protocol's check_fn verdict was
 * discarded and the bitmask test below always saw 0.  The shadowing
 * declaration has been removed so the query result is honored.
 */
int MPIDO_Bcast(void *buffer,
                int count,
                MPI_Datatype datatype,
                int root,
                MPID_Comm *comm_ptr,
                int *mpierrno)
{
   TRACE_ERR("in mpido_bcast\n");
   /* PAMI typecount is an int; chunk anything bigger than 1 GiB. */
   const size_t BCAST_LIMIT = 0x40000000;
   int data_contig, rc;
   void *data_buffer = NULL, *noncontig_buff = NULL;
   volatile unsigned active = 1;           /* cleared by cb_bcast on completion */
   MPI_Aint data_true_lb = 0;
   MPID_Datatype *data_ptr;
   MPID_Segment segment;
   MPIDI_Post_coll_t bcast_post;
   const struct MPIDI_Comm* const mpid = &(comm_ptr->mpid);
   const int rank = comm_ptr->rank;
#if ASSERT_LEVEL==0
   /* We can't afford the tracing in ndebug/performance libraries */
   const unsigned verbose = 0;
#else
   const unsigned verbose = (MPIDI_Process.verbose >= MPIDI_VERBOSE_DETAILS_ALL) && (rank == 0);
#endif
   const int selected_type = mpid->user_selected_type[PAMI_XFER_BROADCAST];

   /* Must calculate data_size based on count=1 in case it's total size is > integer */
   int data_size_one;
   MPIDI_Datatype_get_info(1, datatype,
                           data_contig, data_size_one, data_ptr, data_true_lb);
   /* do this calculation once and use twice */
   const size_t data_size_sz = (size_t)data_size_one*(size_t)count;
   if(unlikely(verbose))
      fprintf(stderr,"bcast count %d, size %d (%#zX), root %d, buffer %p\n",
              count,data_size_one,(size_t)data_size_one*(size_t)count, root,buffer);

   /* Oversized message: recurse in BCAST_LIMIT-sized pieces, then once more
      for the remainder (count of 0 is a no-op). */
   if(unlikely( data_size_sz > BCAST_LIMIT) )
   {
      void *new_buffer=buffer;
      int c, new_count = (int)BCAST_LIMIT/data_size_one;
      MPID_assert(new_count > 0);

      for(c=1; ((size_t)c*(size_t)new_count) <= (size_t)count; ++c)
      {
         if ((rc = MPIDO_Bcast(new_buffer, new_count, datatype, root,
                               comm_ptr, mpierrno)) != MPI_SUCCESS)
            return rc;
         new_buffer = (char*)new_buffer + (size_t)data_size_one*(size_t)new_count;
      }
      new_count = count % new_count; /* 0 is ok, just returns no-op */
      return MPIDO_Bcast(new_buffer, new_count, datatype, root, comm_ptr, mpierrno);
   }

   /* Must use data_size based on count for byte bcast processing.
      Previously calculated as a size_t but large data_sizes were
      handled above so this cast to int should be fine here. */
   const int data_size = (int)data_size_sz;

   /* Explicit MPICH selection, or nothing to do. */
   if(selected_type == MPID_COLL_USE_MPICH || data_size == 0)
   {
      if(unlikely(verbose))
         fprintf(stderr,"Using MPICH bcast algorithm\n");
      MPIDI_Update_last_algorithm(comm_ptr,"BCAST_MPICH");
      return MPIR_Bcast_intra(buffer, count, datatype, root, comm_ptr, mpierrno);
   }

   data_buffer = (char *)buffer + data_true_lb;

   /* Non-contiguous data is broadcast as packed bytes; only the root needs
      to pack, everyone else unpacks after the transfer. */
   if(!data_contig)
   {
      noncontig_buff = MPIU_Malloc(data_size);
      data_buffer = noncontig_buff;
      if(noncontig_buff == NULL)
      {
         MPID_Abort(NULL, MPI_ERR_NO_SPACE, 1,
                    "Fatal: Cannot allocate pack buffer");
      }
      if(rank == root)
      {
         DLOOP_Offset last = data_size;
         MPID_Segment_init(buffer, count, datatype, &segment, 0);
         MPID_Segment_pack(&segment, 0, &last, noncontig_buff);
      }
   }

   pami_xfer_t bcast;
   pami_algorithm_t my_bcast;
   const pami_metadata_t *my_md = (pami_metadata_t *)NULL;
   int queryreq = 0;

   bcast.cb_done = cb_bcast;
   bcast.cookie = (void *)&active;
   bcast.cmd.xfer_broadcast.root = MPIDI_Task_to_endpoint(MPID_VCR_GET_LPID(comm_ptr->vcr, root), 0);
   bcast.algorithm = mpid->user_selected[PAMI_XFER_BROADCAST];
   bcast.cmd.xfer_broadcast.buf = data_buffer;
   bcast.cmd.xfer_broadcast.type = PAMI_TYPE_BYTE;
   /* Needs to be sizeof(type)*count since we are using bytes as
    * the generic type */
   bcast.cmd.xfer_broadcast.typecount = data_size;

   if(selected_type == MPID_COLL_OPTIMIZED)
   {
      TRACE_ERR("Optimized bcast (%s) and (%s) were pre-selected\n",
                mpid->opt_protocol_md[PAMI_XFER_BROADCAST][0].name,
                mpid->opt_protocol_md[PAMI_XFER_BROADCAST][1].name);

      if(mpid->cutoff_size[PAMI_XFER_BROADCAST][1] != 0)/* SSS: There is FCA cutoff (FCA only sets cutoff for [PAMI_XFER_BROADCAST][1]) */
      {
         if(data_size <= mpid->cutoff_size[PAMI_XFER_BROADCAST][1])
         {
            my_bcast = mpid->opt_protocol[PAMI_XFER_BROADCAST][1];
            my_md = &mpid->opt_protocol_md[PAMI_XFER_BROADCAST][1];
            queryreq = mpid->must_query[PAMI_XFER_BROADCAST][1];
         }
         else
         {
            return MPIR_Bcast_intra(buffer, count, datatype, root, comm_ptr, mpierrno);
         }
      }

      /* Two pre-selected protocols with a size cutoff between them. */
      if(data_size > mpid->cutoff_size[PAMI_XFER_BROADCAST][0])
      {
         my_bcast = mpid->opt_protocol[PAMI_XFER_BROADCAST][1];
         my_md = &mpid->opt_protocol_md[PAMI_XFER_BROADCAST][1];
         queryreq = mpid->must_query[PAMI_XFER_BROADCAST][1];
      }
      else
      {
         my_bcast = mpid->opt_protocol[PAMI_XFER_BROADCAST][0];
         my_md = &mpid->opt_protocol_md[PAMI_XFER_BROADCAST][0];
         queryreq = mpid->must_query[PAMI_XFER_BROADCAST][0];
      }
   }
   else
   {
      TRACE_ERR("Bcast (%s) was specified by user\n",
                mpid->user_metadata[PAMI_XFER_BROADCAST].name);
      my_bcast = mpid->user_selected[PAMI_XFER_BROADCAST];
      my_md = &mpid->user_metadata[PAMI_XFER_BROADCAST];
      queryreq = selected_type;
   }

   bcast.algorithm = my_bcast;

   /* Some protocols must be queried (via check_fn or metadata fields)
      before use; fall back to MPICH if the query rejects the message. */
   if(unlikely(queryreq == MPID_COLL_ALWAYS_QUERY ||
               queryreq == MPID_COLL_CHECK_FN_REQUIRED))
   {
      metadata_result_t result = {0};
      TRACE_ERR("querying bcast protocol %s, type was: %d\n",
                my_md->name, queryreq);
      if(my_md->check_fn != NULL) /* calling the check fn is sufficient */
      {
         /* BUGFIX: do not redeclare 'result' here; a shadowing local used to
            swallow the check_fn verdict so the bitmask test below never saw it. */
         result = my_md->check_fn(&bcast);
         result.check.nonlocal = 0; /* #warning REMOVE THIS WHEN IMPLEMENTED */
      }
      else /* no check_fn, manually look at the metadata fields */
      {
         TRACE_ERR("Optimzed selection line %d\n",__LINE__);
         /* Check if the message range if restricted */
         if(my_md->check_correct.values.rangeminmax)
         {
            if((my_md->range_lo <= data_size) &&
               (my_md->range_hi >= data_size))
               ; /* ok, algorithm selected */
            else
            {
               result.check.range = 1;
               if(unlikely(verbose))
               {
                  fprintf(stderr,"message size (%u) outside range (%zu<->%zu) for %s.\n",
                          data_size,
                          my_md->range_lo,
                          my_md->range_hi,
                          my_md->name);
               }
            }
         }
         /* \todo check the rest of the metadata */
      }
      TRACE_ERR("bitmask: %#X\n", result.bitmask);
      if(result.bitmask)
      {
         if(unlikely(verbose))
            fprintf(stderr,"Using MPICH bcast algorithm - query fn failed\n");
         MPIDI_Update_last_algorithm(comm_ptr,"BCAST_MPICH");
         return MPIR_Bcast_intra(buffer, count, datatype, root, comm_ptr, mpierrno);
      }
      /* Async-flow-control protocols require an occasional barrier to
         drain outstanding requests. */
      if(my_md->check_correct.values.asyncflowctl && !(--(comm_ptr->mpid.num_requests)))
      {
         comm_ptr->mpid.num_requests = MPIDI_Process.optimized.num_requests;
         int tmpmpierrno;
         if(unlikely(verbose))
            fprintf(stderr,"Query barrier required for %s\n", my_md->name);
         MPIDO_Barrier(comm_ptr, &tmpmpierrno);
      }
   }

   if(unlikely(verbose))
   {
      unsigned long long int threadID;
      MPIU_Thread_id_t tid;
      MPIU_Thread_self(&tid);
      threadID = (unsigned long long int)tid;
      fprintf(stderr,"<%llx> Using protocol %s for bcast on %u\n",
              threadID,
              my_md->name,
              (unsigned) comm_ptr->context_id);
   }

   /* Post the transfer on the PAMI context and spin until cb_bcast fires. */
   MPIDI_Context_post(MPIDI_Context[0], &bcast_post.state,
                      MPIDI_Pami_post_wrapper, (void *)&bcast);
   MPIDI_Update_last_algorithm(comm_ptr, my_md->name);
   MPID_PROGRESS_WAIT_WHILE(active);
   TRACE_ERR("bcast done\n");

   /* Non-root ranks unpack the received bytes back into the user buffer. */
   if(!data_contig)
   {
      if(rank != root)
         MPIR_Localcopy(noncontig_buff, data_size, MPI_CHAR,
                        buffer, count, datatype);
      MPIU_Free(noncontig_buff);
   }
   TRACE_ERR("leaving bcast\n");
   return 0;
}
/* Build a NewMadeleine iovec describing the receive buffer of a
 * non-contiguous request.  Uses the dataloop segment code to enumerate the
 * contiguous pieces of (user_buf, user_count, datatype).  If the pieces fit
 * within NMAD_IOV_MAX_DEPTH entries, they are copied into *newmad_iov and
 * rreq->dev.tmpbuf is left NULL; otherwise a single packed bounce buffer of
 * MPIR_Pack_size_impl() bytes is allocated in rreq->dev.tmpbuf and exposed
 * as one iovec entry (the caller is then responsible for unpacking).
 *
 * rreq_p      [in]  address of the receive request (dereferenced once)
 * dt_ptr      [in]  datatype object pointer (unused here; kept for interface)
 * data_sz     [in]  total payload size in bytes
 * newmad_iov  [out] pointer to the caller's iovec array to fill in
 * num_iov     [out] number of iovec entries actually used
 * returns MPI_SUCCESS or an MPI error code (via the CHKANDJUMP macro)
 */
int MPID_nem_newmad_process_rdtype(MPID_Request **rreq_p, MPID_Datatype * dt_ptr,
                                   MPIDI_msg_sz_t data_sz, struct iovec *newmad_iov[],
                                   int *num_iov)
{
    MPID_Request *rreq = *rreq_p;
    MPIDI_msg_sz_t last;
    MPID_IOV *iov;
    int n_iov = 0;
    int mpi_errno = MPI_SUCCESS;
    int index;
    MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_NEWMAD_PROCESS_RDTYPE);
    MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_NEWMAD_PROCESS_RDTYPE);

    /* Lazily allocate the segment object; it may already exist if this
       request was partially processed before. */
    if (rreq->dev.segment_ptr == NULL)
    {
        rreq->dev.segment_ptr = MPID_Segment_alloc( );
        MPIU_ERR_CHKANDJUMP1((rreq->dev.segment_ptr == NULL), mpi_errno, MPI_ERR_OTHER,
                             "**nomem", "**nomem %s", "MPID_Segment_alloc");
    }
    MPID_Segment_init(rreq->dev.user_buf, rreq->dev.user_count, rreq->dev.datatype,
                      rreq->dev.segment_ptr, 0);
    rreq->dev.segment_first = 0;
    rreq->dev.segment_size = data_sz;

    /* First pass: count the contiguous blocks covering the whole message. */
    last = rreq->dev.segment_size;
    MPID_Segment_count_contig_blocks(rreq->dev.segment_ptr,rreq->dev.segment_first,
                                     &last,&n_iov);
    MPIU_Assert(n_iov > 0);
    iov = MPIU_Malloc(n_iov*sizeof(MPID_IOV));

    /* Second pass: materialize the blocks as an MPID_IOV vector; 'last'
       must come back equal to segment_size (entire message described). */
    MPID_Segment_unpack_vector(rreq->dev.segment_ptr, rreq->dev.segment_first,
                               &last, iov, &n_iov);
    MPIU_Assert(last == rreq->dev.segment_size);

#ifdef DEBUG
    for(index = 0; index < n_iov ; index++)
    {
        fprintf(stdout,"======================\n");
        fprintf(stdout,"RECV iov[%i]: [base %p][len %i]\n",index,
                iov[index].MPID_IOV_BUF,iov[index].MPID_IOV_LEN);
    }
#endif

    if(n_iov <= NMAD_IOV_MAX_DEPTH)
    {
        /* Fits in the caller's iovec: translate MPID_IOV -> struct iovec. */
        for(index=0; index < n_iov ; index++)
        {
            (*newmad_iov)[index].iov_base = iov[index].MPID_IOV_BUF;
            (*newmad_iov)[index].iov_len  = iov[index].MPID_IOV_LEN;
        }
        rreq->dev.tmpbuf = NULL;
        *num_iov = n_iov;
    }
    else
    {
        /* Too many pieces: fall back to a single packed bounce buffer. */
        int packsize = 0;
        MPIR_Pack_size_impl(rreq->dev.user_count, rreq->dev.datatype, &packsize);
        rreq->dev.tmpbuf = MPIU_Malloc((size_t) packsize);
        MPIU_Assert(rreq->dev.tmpbuf);
        rreq->dev.tmpbuf_sz = packsize;
        (*newmad_iov)[0].iov_base = (char *) rreq->dev.tmpbuf;
        (*newmad_iov)[0].iov_len  = (uint32_t) packsize;
        *num_iov = 1 ;
    }
    MPIU_Free(iov);
 fn_exit:
    MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_NEWMAD_PROCESS_RDTYPE);
    return mpi_errno;
 fn_fail: ATTRIBUTE((unused))
    goto fn_exit;
}
/* Copy data between two local buffers described by (count, datatype) pairs,
 * handling non-contiguous datatypes on either side.
 *
 * Fast paths: builtin/builtin -> plain memcpy; contig/contig -> memcpy at
 * true_lb offsets; one side contig -> single segment pack or unpack.
 * General case: pipeline through a COPY_BUFFER_SZ stack buffer, packing
 * from the send segment and unpacking into the receive segment until all
 * copy_sz bytes have moved.
 *
 * Returns MPI_SUCCESS, MPI_ERR_TRUNCATE if the send data is larger than the
 * receive buffer (copy is truncated to the receive size under
 * HAVE_ERROR_CHECKING), or MPI_ERR_TYPE if the two type signatures do not
 * agree ("**dtypemismatch").
 */
int MPIR_Localcopy(const void *sendbuf, MPI_Aint sendcount, MPI_Datatype sendtype,
                   void *recvbuf, MPI_Aint recvcount, MPI_Datatype recvtype)
{
    int mpi_errno = MPI_SUCCESS;
    int sendtype_iscontig, recvtype_iscontig;
    MPI_Aint sendsize, recvsize, sdata_sz, rdata_sz, copy_sz;
    MPI_Aint true_extent, sendtype_true_lb, recvtype_true_lb;
    MPIU_CHKLMEM_DECL(1);
    MPID_MPI_STATE_DECL(MPID_STATE_MPIR_LOCALCOPY);

    MPID_MPI_FUNC_ENTER(MPID_STATE_MPIR_LOCALCOPY);

    MPID_Datatype_get_size_macro(sendtype, sendsize);
    MPID_Datatype_get_size_macro(recvtype, recvsize);

    sdata_sz = sendsize * sendcount;
    rdata_sz = recvsize * recvcount;

    /* if there is no data to copy, bail out */
    if (!sdata_sz || !rdata_sz)
        goto fn_exit;

#if defined(HAVE_ERROR_CHECKING)
    /* Sender larger than receiver: report truncation but still copy what fits. */
    if (sdata_sz > rdata_sz)
    {
        MPIU_ERR_SET2(mpi_errno, MPI_ERR_TRUNCATE, "**truncate",
                      "**truncate %d %d", sdata_sz, rdata_sz);
        copy_sz = rdata_sz;
    }
    else
#endif /* HAVE_ERROR_CHECKING */
        copy_sz = sdata_sz;

    /* Builtin types is the common case; optimize for it */
    if ((HANDLE_GET_KIND(sendtype) == HANDLE_KIND_BUILTIN) &&
        HANDLE_GET_KIND(recvtype) == HANDLE_KIND_BUILTIN)
    {
        MPIU_Memcpy(recvbuf, sendbuf, copy_sz);
        goto fn_exit;
    }

    MPIR_Datatype_iscontig(sendtype, &sendtype_iscontig);
    MPIR_Datatype_iscontig(recvtype, &recvtype_iscontig);

    /* true_extent is discarded; only the true lower bounds are needed here. */
    MPIR_Type_get_true_extent_impl(sendtype, &sendtype_true_lb, &true_extent);
    MPIR_Type_get_true_extent_impl(recvtype, &recvtype_true_lb, &true_extent);

    if (sendtype_iscontig && recvtype_iscontig)
    {
        /* Both contiguous (possibly with nonzero true_lb): direct memcpy. */
#if defined(HAVE_ERROR_CHECKING)
        MPIU_ERR_CHKMEMCPYANDJUMP(mpi_errno,
                                  ((char *)recvbuf + recvtype_true_lb),
                                  ((char *)sendbuf + sendtype_true_lb),
                                  copy_sz);
#endif
        MPIU_Memcpy(((char *) recvbuf + recvtype_true_lb),
                    ((char *) sendbuf + sendtype_true_lb),
                    copy_sz);
    }
    else if (sendtype_iscontig)
    {
        /* Contiguous source: unpack it straight into the receive layout. */
        MPID_Segment seg;
        MPI_Aint last;

        MPID_Segment_init(recvbuf, recvcount, recvtype, &seg, 0);
        last = copy_sz;
        MPID_Segment_unpack(&seg, 0, &last, (char*)sendbuf + sendtype_true_lb);
        /* 'last' falling short means the receive type could not absorb copy_sz bytes. */
        MPIU_ERR_CHKANDJUMP(last != copy_sz, mpi_errno, MPI_ERR_TYPE,
                            "**dtypemismatch");
    }
    else if (recvtype_iscontig)
    {
        /* Contiguous destination: pack the send layout straight into it. */
        MPID_Segment seg;
        MPI_Aint last;

        MPID_Segment_init(sendbuf, sendcount, sendtype, &seg, 0);
        last = copy_sz;
        MPID_Segment_pack(&seg, 0, &last, (char*)recvbuf + recvtype_true_lb);
        MPIU_ERR_CHKANDJUMP(last != copy_sz, mpi_errno, MPI_ERR_TYPE,
                            "**dtypemismatch");
    }
    else
    {
        /* Both sides non-contiguous: pipeline through a bounded temp buffer.
           sfirst tracks bytes packed from the source, rfirst bytes unpacked
           into the destination; buf_off carries bytes packed but not yet
           unpacked across iterations. */
        char * buf;
        MPIDI_msg_sz_t buf_off;
        MPID_Segment sseg;
        MPIDI_msg_sz_t sfirst;
        MPID_Segment rseg;
        MPIDI_msg_sz_t rfirst;

        MPIU_CHKLMEM_MALLOC(buf, char *, COPY_BUFFER_SZ, mpi_errno, "buf");

        MPID_Segment_init(sendbuf, sendcount, sendtype, &sseg, 0);
        MPID_Segment_init(recvbuf, recvcount, recvtype, &rseg, 0);

        sfirst = 0;
        rfirst = 0;
        buf_off = 0;

        while (1)
        {
            MPI_Aint last;
            char * buf_end;

            /* Pack at most the free space remaining in the temp buffer. */
            if (copy_sz - sfirst > COPY_BUFFER_SZ - buf_off)
            {
                last = sfirst + (COPY_BUFFER_SZ - buf_off);
            }
            else
            {
                last = copy_sz;
            }

            MPID_Segment_pack(&sseg, sfirst, &last, buf + buf_off);
            MPIU_Assert(last > sfirst);

            buf_end = buf + buf_off + (last - sfirst);
            sfirst = last;

            /* Unpack as much of the buffered data as the receive type accepts. */
            MPID_Segment_unpack(&rseg, rfirst, &last, buf);
            MPIU_Assert(last > rfirst);

            rfirst = last;

            if (rfirst == copy_sz)
            {
                /* successful completion */
                break;
            }

            /* if the send side finished, but the recv side couldn't unpack it,
               there's a datatype mismatch */
            MPIU_ERR_CHKANDJUMP(sfirst == copy_sz, mpi_errno, MPI_ERR_TYPE,
                                "**dtypemismatch");

            /* if not all data was unpacked, copy it to the front of the
               buffer for next time */
            buf_off = sfirst - rfirst;
            if (buf_off > 0)
            {
                memmove(buf, buf_end - buf_off, buf_off);
            }
        }
    }

 fn_exit:
    MPIU_CHKLMEM_FREEALL();
    MPID_MPI_FUNC_EXIT(MPID_STATE_MPIR_LOCALCOPY);
    return mpi_errno;
 fn_fail:
    goto fn_exit;
}
/**
 * \brief Simplified PAMI broadcast: always uses the first registered
 * collective algorithm (coll_algorithm[PAMI_XFER_BROADCAST][0][0]),
 * optionally deferring to MPICH when the collective-selection advisor
 * recommends an external algorithm.  Non-contiguous data travels as
 * packed bytes; non-root ranks unpack after completion.
 *
 * Returns 0 on the PAMI path, or the MPIR_Bcast_intra result on fallback.
 */
int MPIDO_Bcast_simple(void *buffer,
                       int count,
                       MPI_Datatype datatype,
                       int root,
                       MPID_Comm *comm_ptr,
                       int *mpierrno)
{
   TRACE_ERR("Entering MPIDO_Bcast_optimized\n");

   int is_contig;
   void *xfer_buf = NULL, *pack_buf = NULL;
   volatile unsigned coll_active = 1;     /* flipped to 0 by cb_bcast */
   MPI_Aint true_lb = 0;
   MPID_Datatype *dt_obj;
   MPID_Segment seg;
   MPIDI_Post_coll_t post;
   const struct MPIDI_Comm* const mpid = &(comm_ptr->mpid);
   const int my_rank = comm_ptr->rank;

   /* Query the size of a single element; the full message size may not
      fit in an int, so keep count separate. */
   int elem_bytes;
   MPIDI_Datatype_get_info(1, datatype,
                           is_contig, elem_bytes, dt_obj, true_lb);

   /* Ask the collective-selection advisor; an external recommendation
      means we should hand this off to the MPICH implementation. */
   if(MPIDI_Pamix_collsel_advise != NULL && mpid->collsel_fast_query != NULL)
   {
      advisor_algorithm_t advisor_algorithms[1];
      int num_algorithms = MPIDI_Pamix_collsel_advise(mpid->collsel_fast_query,
                                                      PAMI_XFER_BROADCAST,
                                                      elem_bytes * count,
                                                      advisor_algorithms, 1);
      if(num_algorithms)
      {
         if(advisor_algorithms[0].algorithm_type == COLLSEL_EXTERNAL_ALGO)
         {
            return MPIR_Bcast_intra(buffer, count, datatype, root, comm_ptr, mpierrno);
         }
      }
   }

   const int total_bytes = elem_bytes*(size_t)count;

   xfer_buf = (char *)buffer + true_lb;

   /* Non-contiguous case: broadcast a packed byte image instead.  Only the
      root packs; other ranks unpack once the transfer completes. */
   if(!is_contig)
   {
      pack_buf = MPIU_Malloc(total_bytes);
      xfer_buf = pack_buf;
      if(pack_buf == NULL)
      {
         MPID_Abort(NULL, MPI_ERR_NO_SPACE, 1,
                    "Fatal: Cannot allocate pack buffer");
      }
      if(my_rank == root)
      {
         DLOOP_Offset last = total_bytes;
         MPID_Segment_init(buffer, count, datatype, &seg, 0);
         MPID_Segment_pack(&seg, 0, &last, pack_buf);
      }
   }

   pami_xfer_t bcast;
   const pami_metadata_t *bcast_md;
   int queryreq = 0;

   bcast.cb_done = cb_bcast;
   bcast.cookie = (void *)&coll_active;
   bcast.cmd.xfer_broadcast.root = MPIDI_Task_to_endpoint(MPID_VCR_GET_LPID(comm_ptr->vcr, root), 0);
   bcast.algorithm = mpid->coll_algorithm[PAMI_XFER_BROADCAST][0][0];
   bcast.cmd.xfer_broadcast.buf = xfer_buf;
   bcast.cmd.xfer_broadcast.type = PAMI_TYPE_BYTE;
   /* Byte-based transfer, so typecount is sizeof(type)*count. */
   bcast.cmd.xfer_broadcast.typecount = total_bytes;
   bcast_md = &mpid->coll_metadata[PAMI_XFER_BROADCAST][0][0];

   /* Post on the PAMI context and spin until the done callback fires. */
   MPIDI_Context_post(MPIDI_Context[0], &post.state,
                      MPIDI_Pami_post_wrapper, (void *)&bcast);
   MPIDI_Update_last_algorithm(comm_ptr, bcast_md->name);
   MPID_PROGRESS_WAIT_WHILE(coll_active);
   TRACE_ERR("bcast done\n");

   if(!is_contig)
   {
      if(my_rank != root)
         MPIR_Localcopy(pack_buf, total_bytes, MPI_CHAR,
                        buffer, count, datatype);
      MPIU_Free(pack_buf);
   }

   TRACE_ERR("Exiting MPIDO_Bcast_optimized\n");
   return 0;
}
int MPIDI_Get_accumulate(const void *origin_addr, int origin_count, MPI_Datatype origin_datatype, void *result_addr, int result_count, MPI_Datatype result_datatype, int target_rank, MPI_Aint target_disp, int target_count, MPI_Datatype target_datatype, MPI_Op op, MPID_Win *win_ptr) { int mpi_errno = MPI_SUCCESS; MPIDI_msg_sz_t data_sz; int rank, origin_predefined, result_predefined, target_predefined; int shm_locked = 0; int dt_contig ATTRIBUTE((unused)); MPI_Aint dt_true_lb ATTRIBUTE((unused)); MPID_Datatype *dtp; MPIU_CHKLMEM_DECL(2); MPIDI_STATE_DECL(MPID_STATE_MPIDI_GET_ACCUMULATE); MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_GET_ACCUMULATE); if (target_rank == MPI_PROC_NULL) { goto fn_exit; } if (win_ptr->epoch_state == MPIDI_EPOCH_NONE && win_ptr->fence_issued) { win_ptr->epoch_state = MPIDI_EPOCH_FENCE; } MPIU_ERR_CHKANDJUMP(win_ptr->epoch_state == MPIDI_EPOCH_NONE, mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync"); MPIDI_Datatype_get_info(target_count, target_datatype, dt_contig, data_sz, dtp, dt_true_lb); if (data_sz == 0) { goto fn_exit; } rank = win_ptr->myrank; origin_predefined = TRUE; /* quiet uninitialized warnings (b/c goto) */ if (op != MPI_NO_OP) { MPIDI_CH3I_DATATYPE_IS_PREDEFINED(origin_datatype, origin_predefined); } MPIDI_CH3I_DATATYPE_IS_PREDEFINED(result_datatype, result_predefined); MPIDI_CH3I_DATATYPE_IS_PREDEFINED(target_datatype, target_predefined); /* Do =! rank first (most likely branch?) 
*/ if (target_rank == rank || win_ptr->create_flavor == MPI_WIN_FLAVOR_SHARED) { MPI_User_function *uop; void *base; int disp_unit; if (win_ptr->create_flavor == MPI_WIN_FLAVOR_SHARED) { base = win_ptr->shm_base_addrs[target_rank]; disp_unit = win_ptr->disp_units[target_rank]; MPIDI_CH3I_SHM_MUTEX_LOCK(win_ptr); shm_locked = 1; } else { base = win_ptr->base; disp_unit = win_ptr->disp_unit; } /* Perform the local get first, then the accumulate */ mpi_errno = MPIR_Localcopy((char *) base + disp_unit * target_disp, target_count, target_datatype, result_addr, result_count, result_datatype); if (mpi_errno) { MPIU_ERR_POP(mpi_errno); } /* NO_OP: Don't perform the accumulate */ if (op == MPI_NO_OP) { if (shm_locked) { MPIDI_CH3I_SHM_MUTEX_UNLOCK(win_ptr); shm_locked = 0; } goto fn_exit; } if (op == MPI_REPLACE) { mpi_errno = MPIR_Localcopy(origin_addr, origin_count, origin_datatype, (char *) base + disp_unit * target_disp, target_count, target_datatype); if (mpi_errno) { MPIU_ERR_POP(mpi_errno); } if (shm_locked) { MPIDI_CH3I_SHM_MUTEX_UNLOCK(win_ptr); shm_locked = 0; } goto fn_exit; } MPIU_ERR_CHKANDJUMP1((HANDLE_GET_KIND(op) != HANDLE_KIND_BUILTIN), mpi_errno, MPI_ERR_OP, "**opnotpredefined", "**opnotpredefined %d", op ); /* get the function by indexing into the op table */ uop = MPIR_OP_HDL_TO_FN(op); if (origin_predefined && target_predefined) { /* Cast away const'ness for origin_address in order to * avoid changing the prototype for MPI_User_function */ (*uop)((void *) origin_addr, (char *) base + disp_unit*target_disp, &target_count, &target_datatype); } else { /* derived datatype */ MPID_Segment *segp; DLOOP_VECTOR *dloop_vec; MPI_Aint first, last; int vec_len, i, type_size, count; MPI_Datatype type; MPI_Aint true_lb, true_extent, extent; void *tmp_buf=NULL, *target_buf; const void *source_buf; if (origin_datatype != target_datatype) { /* first copy the data into a temporary buffer with the same datatype as the target. Then do the accumulate operation. 
*/ MPIR_Type_get_true_extent_impl(target_datatype, &true_lb, &true_extent); MPID_Datatype_get_extent_macro(target_datatype, extent); MPIU_CHKLMEM_MALLOC(tmp_buf, void *, target_count * (MPIR_MAX(extent,true_extent)), mpi_errno, "temporary buffer"); /* adjust for potential negative lower bound in datatype */ tmp_buf = (void *)((char*)tmp_buf - true_lb); mpi_errno = MPIR_Localcopy(origin_addr, origin_count, origin_datatype, tmp_buf, target_count, target_datatype); if (mpi_errno) { MPIU_ERR_POP(mpi_errno); } } if (target_predefined) { /* target predefined type, origin derived datatype */ (*uop)(tmp_buf, (char *) base + disp_unit * target_disp, &target_count, &target_datatype); } else { segp = MPID_Segment_alloc(); MPIU_ERR_CHKANDJUMP1((!segp), mpi_errno, MPI_ERR_OTHER, "**nomem","**nomem %s","MPID_Segment_alloc"); MPID_Segment_init(NULL, target_count, target_datatype, segp, 0); first = 0; last = SEGMENT_IGNORE_LAST; MPID_Datatype_get_ptr(target_datatype, dtp); vec_len = dtp->max_contig_blocks * target_count + 1; /* +1 needed because Rob says so */ MPIU_CHKLMEM_MALLOC(dloop_vec, DLOOP_VECTOR *, vec_len * sizeof(DLOOP_VECTOR), mpi_errno, "dloop vector"); MPID_Segment_pack_vector(segp, first, &last, dloop_vec, &vec_len); source_buf = (tmp_buf != NULL) ? tmp_buf : origin_addr; target_buf = (char *) base + disp_unit * target_disp; type = dtp->eltype; type_size = MPID_Datatype_get_basic_size(type); for (i=0; i<vec_len; i++) { count = (dloop_vec[i].DLOOP_VECTOR_LEN)/type_size; (*uop)((char *)source_buf + MPIU_PtrToAint(dloop_vec[i].DLOOP_VECTOR_BUF), (char *)target_buf + MPIU_PtrToAint(dloop_vec[i].DLOOP_VECTOR_BUF), &count, &type); } MPID_Segment_free(segp); } }
/* Packet handler for a rendezvous clear-to-send (CTS) packet.
 *
 * Looks up the local send request named by the CTS, releases any pending
 * RTS request, then ships the data: contiguous payloads go as a two-entry
 * iov (RNDV_SEND header + user buffer) via iSendv; non-contiguous payloads
 * set up a segment on the request and go through vc->sendNoncontig_fn.
 *
 * vc      [in]  connection the CTS arrived on
 * pkt     [in]  the received packet (rndv_clr_to_send member is used)
 * buflen  [out] set to sizeof(MPIDI_CH3_Pkt_t) — this handler consumes
 *               exactly the packet header, no trailing payload
 * rreqp   [out] set to NULL: no receive request continues after this packet
 * returns MPI_SUCCESS or an error code from the send path
 */
int MPIDI_CH3_PktHandler_RndvClrToSend( MPIDI_VC_t *vc, MPIDI_CH3_Pkt_t *pkt,
                                        MPIDI_msg_sz_t *buflen, MPID_Request **rreqp )
{
    MPIDI_CH3_Pkt_rndv_clr_to_send_t * cts_pkt = &pkt->rndv_clr_to_send;
    MPID_Request * sreq;
    MPID_Request * rts_sreq;
    MPIDI_CH3_Pkt_t upkt;                       /* stack space for the outgoing RNDV_SEND header */
    MPIDI_CH3_Pkt_rndv_send_t * rs_pkt = &upkt.rndv_send;
    int dt_contig;
    MPI_Aint dt_true_lb;
    MPIDI_msg_sz_t data_sz;
    MPID_Datatype * dt_ptr;
    int mpi_errno = MPI_SUCCESS;

    MPIU_DBG_MSG(CH3_OTHER,VERBOSE,"received rndv CTS pkt");

    MPID_Request_get_ptr(cts_pkt->sender_req_id, sreq);
    MPIU_DBG_PRINTF(("received cts, count=%d\n", sreq->dev.user_count));

    /* No completion handlers needed: the payload send finishes the request. */
    sreq->dev.OnDataAvail = 0;
    sreq->dev.OnFinal = 0;

    /* Release the RTS request if one exists.
       MPID_Request_fetch_and_clear_rts_sreq() needs to be atomic to
       prevent cancel send from cancelling the wrong (future) request.
       If MPID_Request_fetch_and_clear_rts_sreq() returns a NULL rts_sreq,
       then MPID_Cancel_send() is responsible for releasing the RTS
       request object. */
    MPIDI_Request_fetch_and_clear_rts_sreq(sreq, &rts_sreq);
    if (rts_sreq != NULL)
    {
        MPID_Request_release(rts_sreq);
    }

    *buflen = sizeof(MPIDI_CH3_Pkt_t);

    MPIDI_Pkt_init(rs_pkt, MPIDI_CH3_PKT_RNDV_SEND);
    rs_pkt->receiver_req_id = cts_pkt->receiver_req_id;

    MPIDI_Datatype_get_info(sreq->dev.user_count, sreq->dev.datatype,
                            dt_contig, data_sz, dt_ptr, dt_true_lb);

    if (dt_contig)
    {
        /* Contiguous: header + user buffer as one iov pair, sent under the
           per-VC critical section. */
        MPID_IOV iov[MPID_IOV_LIMIT];

        MPIU_DBG_MSG_FMT(CH3_OTHER,VERBOSE,(MPIU_DBG_FDEST,
                         "sending contiguous rndv data, data_sz=" MPIDI_MSG_SZ_FMT,
                         data_sz));

        iov[0].MPID_IOV_BUF = (MPID_IOV_BUF_CAST)rs_pkt;
        iov[0].MPID_IOV_LEN = sizeof(*rs_pkt);

        iov[1].MPID_IOV_BUF = (MPID_IOV_BUF_CAST)((char *)sreq->dev.user_buf + dt_true_lb);
        iov[1].MPID_IOV_LEN = data_sz;

        MPIU_THREAD_CS_ENTER(CH3COMM,vc);
        mpi_errno = MPIU_CALL(MPIDI_CH3,iSendv(vc, sreq, iov, 2));
        MPIU_THREAD_CS_EXIT(CH3COMM,vc);
        MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**ch3|senddata");
    }
    else
    {
        /* Non-contiguous: attach a segment to the request and let the VC's
           noncontig send routine stream it out. */
        sreq->dev.segment_ptr = MPID_Segment_alloc( );
        MPIU_ERR_CHKANDJUMP1((sreq->dev.segment_ptr == NULL), mpi_errno, MPI_ERR_OTHER,
                             "**nomem", "**nomem %s", "MPID_Segment_alloc");
        MPID_Segment_init(sreq->dev.user_buf, sreq->dev.user_count,
                          sreq->dev.datatype, sreq->dev.segment_ptr, 0);
        sreq->dev.segment_first = 0;
        sreq->dev.segment_size = data_sz;
        MPIU_THREAD_CS_ENTER(CH3COMM,vc);
        mpi_errno = vc->sendNoncontig_fn(vc, sreq, rs_pkt, sizeof(*rs_pkt));
        MPIU_THREAD_CS_EXIT(CH3COMM,vc);
        MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**ch3|senddata");
    }
    *rreqp = NULL;

 fn_fail:
    /* single exit: error macros jump here; success falls through */
    return mpi_errno;
}
static int send_msg(ptl_hdr_data_t ssend_flag, struct MPIDI_VC *vc, const void *buf, MPI_Aint count, MPI_Datatype datatype, int dest, int tag, MPID_Comm *comm, int context_offset, struct MPID_Request **request) { int mpi_errno = MPI_SUCCESS; MPID_nem_ptl_vc_area *const vc_ptl = VC_PTL(vc); int ret; MPIDI_msg_sz_t data_sz; int dt_contig; MPI_Aint dt_true_lb; MPID_Datatype *dt_ptr; MPID_Request *sreq = NULL; ptl_me_t me; int initial_iov_count, remaining_iov_count; ptl_md_t md; MPI_Aint last; MPIU_CHKPMEM_DECL(2); MPIDI_STATE_DECL(MPID_STATE_SEND_MSG); MPIDI_FUNC_ENTER(MPID_STATE_SEND_MSG); MPID_nem_ptl_request_create_sreq(sreq, mpi_errno, comm); sreq->dev.match.parts.rank = dest; sreq->dev.match.parts.tag = tag; sreq->dev.match.parts.context_id = comm->context_id + context_offset; sreq->ch.vc = vc; if (!vc_ptl->id_initialized) { mpi_errno = MPID_nem_ptl_init_id(vc); if (mpi_errno) MPIR_ERR_POP(mpi_errno); } MPIDI_Datatype_get_info(count, datatype, dt_contig, data_sz, dt_ptr, dt_true_lb); MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "count="MPI_AINT_FMT_DEC_SPEC" datatype=%#x contig=%d data_sz=%lu", count, datatype, dt_contig, data_sz)); if (data_sz <= PTL_LARGE_THRESHOLD) { /* Small message. 
Send all data eagerly */ if (dt_contig) { void *start = (char *)buf + dt_true_lb; MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, "Small contig message"); REQ_PTL(sreq)->event_handler = handler_send; MPIU_DBG_MSG_P(CH3_CHANNEL, VERBOSE, "&REQ_PTL(sreq)->event_handler = %p", &(REQ_PTL(sreq)->event_handler)); if (start == NULL) ret = MPID_nem_ptl_rptl_put(MPIDI_nem_ptl_global_md, (ptl_size_t)&dummy, data_sz, PTL_NO_ACK_REQ, vc_ptl->id, vc_ptl->pt, NPTL_MATCH(tag, comm->context_id + context_offset, comm->rank), 0, sreq, NPTL_HEADER(ssend_flag, data_sz)); else ret = MPID_nem_ptl_rptl_put(MPIDI_nem_ptl_global_md, (ptl_size_t)start, data_sz, PTL_NO_ACK_REQ, vc_ptl->id, vc_ptl->pt, NPTL_MATCH(tag, comm->context_id + context_offset, comm->rank), 0, sreq, NPTL_HEADER(ssend_flag, data_sz)); MPIR_ERR_CHKANDJUMP1(ret, mpi_errno, MPI_ERR_OTHER, "**ptlput", "**ptlput %s", MPID_nem_ptl_strerror(ret)); DBG_MSG_PUT("global", data_sz, vc->pg_rank, NPTL_MATCH(tag, comm->context_id + context_offset, comm->rank), NPTL_HEADER(ssend_flag, data_sz)); MPIU_DBG_MSG_D(CH3_CHANNEL, VERBOSE, "id.nid = %#x", vc_ptl->id.phys.nid); MPIU_DBG_MSG_D(CH3_CHANNEL, VERBOSE, "id.pid = %#x", vc_ptl->id.phys.pid); MPIU_DBG_MSG_P(CH3_CHANNEL, VERBOSE, "sreq = %p", sreq); MPIU_DBG_MSG_D(CH3_CHANNEL, VERBOSE, "vc_ptl->pt = %d", vc_ptl->pt); MPIU_DBG_MSG_P(CH3_CHANNEL, VERBOSE, "REQ_PTL(sreq)->event_handler = %p", REQ_PTL(sreq)->event_handler); goto fn_exit; } /* noncontig data */ MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, "Small noncontig message"); sreq->dev.segment_ptr = MPID_Segment_alloc(); MPIR_ERR_CHKANDJUMP1(sreq->dev.segment_ptr == NULL, mpi_errno, MPI_ERR_OTHER, "**nomem", "**nomem %s", "MPID_Segment_alloc"); MPID_Segment_init(buf, count, datatype, sreq->dev.segment_ptr, 0); sreq->dev.segment_first = 0; sreq->dev.segment_size = data_sz; last = sreq->dev.segment_size; sreq->dev.iov_count = MPL_IOV_LIMIT; MPID_Segment_pack_vector(sreq->dev.segment_ptr, sreq->dev.segment_first, &last, sreq->dev.iov, &sreq->dev.iov_count); 
if (last == sreq->dev.segment_size) { /* IOV is able to describe entire message */ MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, " entire message fits in IOV"); md.start = sreq->dev.iov; md.length = sreq->dev.iov_count; md.options = PTL_IOVEC; md.eq_handle = MPIDI_nem_ptl_origin_eq; md.ct_handle = PTL_CT_NONE; ret = PtlMDBind(MPIDI_nem_ptl_ni, &md, &REQ_PTL(sreq)->md); MPIR_ERR_CHKANDJUMP1(ret, mpi_errno, MPI_ERR_OTHER, "**ptlmdbind", "**ptlmdbind %s", MPID_nem_ptl_strerror(ret)); REQ_PTL(sreq)->event_handler = handler_send; ret = MPID_nem_ptl_rptl_put(REQ_PTL(sreq)->md, 0, data_sz, PTL_NO_ACK_REQ, vc_ptl->id, vc_ptl->pt, NPTL_MATCH(tag, comm->context_id + context_offset, comm->rank), 0, sreq, NPTL_HEADER(ssend_flag, data_sz)); MPIR_ERR_CHKANDJUMP1(ret, mpi_errno, MPI_ERR_OTHER, "**ptlput", "**ptlput %s", MPID_nem_ptl_strerror(ret)); DBG_MSG_PUT("sreq", data_sz, vc->pg_rank, NPTL_MATCH(tag, comm->context_id + context_offset, comm->rank), NPTL_HEADER(ssend_flag, data_sz)); goto fn_exit; } /* IOV is not long enough to describe entire message */ MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, " IOV too long: using bounce buffer"); MPIU_CHKPMEM_MALLOC(REQ_PTL(sreq)->chunk_buffer[0], void *, data_sz, mpi_errno, "chunk_buffer"); MPID_Segment_init(buf, count, datatype, sreq->dev.segment_ptr, 0); sreq->dev.segment_first = 0; last = data_sz; MPID_Segment_pack(sreq->dev.segment_ptr, sreq->dev.segment_first, &last, REQ_PTL(sreq)->chunk_buffer[0]); MPIU_Assert(last == sreq->dev.segment_size); REQ_PTL(sreq)->event_handler = handler_send; ret = MPID_nem_ptl_rptl_put(MPIDI_nem_ptl_global_md, (ptl_size_t)REQ_PTL(sreq)->chunk_buffer[0], data_sz, PTL_NO_ACK_REQ, vc_ptl->id, vc_ptl->pt, NPTL_MATCH(tag, comm->context_id + context_offset, comm->rank), 0, sreq, NPTL_HEADER(ssend_flag, data_sz)); MPIR_ERR_CHKANDJUMP1(ret, mpi_errno, MPI_ERR_OTHER, "**ptlput", "**ptlput %s", MPID_nem_ptl_strerror(ret)); DBG_MSG_PUT("global", data_sz, vc->pg_rank, NPTL_MATCH(tag, comm->context_id + context_offset, comm->rank), 
NPTL_HEADER(ssend_flag, data_sz)); goto fn_exit; } /* Large message. Send first chunk of data and let receiver get the rest */ if (dt_contig) { /* create ME for buffer so receiver can issue a GET for the data */ MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, "Large contig message"); big_meappend((char *)buf + dt_true_lb + PTL_LARGE_THRESHOLD, data_sz - PTL_LARGE_THRESHOLD, vc, NPTL_MATCH(tag, comm->context_id + context_offset, comm->rank), sreq); REQ_PTL(sreq)->event_handler = handler_send; ret = MPID_nem_ptl_rptl_put(MPIDI_nem_ptl_global_md, (ptl_size_t)((char *)buf + dt_true_lb), PTL_LARGE_THRESHOLD, PTL_NO_ACK_REQ, vc_ptl->id, vc_ptl->pt, NPTL_MATCH(tag, comm->context_id + context_offset, comm->rank), 0, sreq, NPTL_HEADER(ssend_flag | NPTL_LARGE, data_sz)); MPIR_ERR_CHKANDJUMP1(ret, mpi_errno, MPI_ERR_OTHER, "**ptlput", "**ptlput %s", MPID_nem_ptl_strerror(ret)); DBG_MSG_PUT("global", PTL_LARGE_THRESHOLD, vc->pg_rank, NPTL_MATCH(tag, comm->context_id + context_offset, comm->rank), NPTL_HEADER(ssend_flag | NPTL_LARGE, data_sz)); goto fn_exit; } /* Large noncontig data */ MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, "Large noncontig message"); sreq->dev.segment_ptr = MPID_Segment_alloc(); MPIR_ERR_CHKANDJUMP1(sreq->dev.segment_ptr == NULL, mpi_errno, MPI_ERR_OTHER, "**nomem", "**nomem %s", "MPID_Segment_alloc"); MPID_Segment_init(buf, count, datatype, sreq->dev.segment_ptr, 0); sreq->dev.segment_first = 0; sreq->dev.segment_size = data_sz; last = PTL_LARGE_THRESHOLD; sreq->dev.iov_count = MPL_IOV_LIMIT; MPID_Segment_pack_vector(sreq->dev.segment_ptr, sreq->dev.segment_first, &last, sreq->dev.iov, &sreq->dev.iov_count); initial_iov_count = sreq->dev.iov_count; sreq->dev.segment_first = last; if (last == PTL_LARGE_THRESHOLD) { /* first chunk of message fits into IOV */ MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, " first chunk fits in IOV"); if (initial_iov_count < MPL_IOV_LIMIT) { /* There may be space for the rest of the message in this IOV */ sreq->dev.iov_count = MPL_IOV_LIMIT - 
sreq->dev.iov_count; last = sreq->dev.segment_size; MPID_Segment_pack_vector(sreq->dev.segment_ptr, sreq->dev.segment_first, &last, &sreq->dev.iov[initial_iov_count], &sreq->dev.iov_count); remaining_iov_count = sreq->dev.iov_count; if (last == sreq->dev.segment_size && last <= MPIDI_nem_ptl_ni_limits.max_msg_size + PTL_LARGE_THRESHOLD) { /* Entire message fit in one IOV */ int was_incomplete; MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, " rest of message fits in one IOV"); /* Create ME for remaining data */ me.start = &sreq->dev.iov[initial_iov_count]; me.length = remaining_iov_count; me.ct_handle = PTL_CT_NONE; me.uid = PTL_UID_ANY; me.options = ( PTL_ME_OP_PUT | PTL_ME_OP_GET | PTL_ME_USE_ONCE | PTL_ME_IS_ACCESSIBLE | PTL_ME_EVENT_LINK_DISABLE | PTL_ME_EVENT_UNLINK_DISABLE | PTL_IOVEC ); me.match_id = vc_ptl->id; me.match_bits = NPTL_MATCH(tag, comm->context_id + context_offset, comm->rank); me.ignore_bits = 0; me.min_free = 0; MPIU_CHKPMEM_MALLOC(REQ_PTL(sreq)->get_me_p, ptl_handle_me_t *, sizeof(ptl_handle_me_t), mpi_errno, "get_me_p"); ret = PtlMEAppend(MPIDI_nem_ptl_ni, MPIDI_nem_ptl_get_pt, &me, PTL_PRIORITY_LIST, sreq, &REQ_PTL(sreq)->get_me_p[0]); MPIR_ERR_CHKANDJUMP1(ret, mpi_errno, MPI_ERR_OTHER, "**ptlmeappend", "**ptlmeappend %s", MPID_nem_ptl_strerror(ret)); DBG_MSG_MEAPPEND("CTL", vc->pg_rank, me, sreq); /* increment the cc for the get operation */ MPIDI_CH3U_Request_increment_cc(sreq, &was_incomplete); MPIU_Assert(was_incomplete); /* Create MD for first chunk */ md.start = sreq->dev.iov; md.length = initial_iov_count; md.options = PTL_IOVEC; md.eq_handle = MPIDI_nem_ptl_origin_eq; md.ct_handle = PTL_CT_NONE; ret = PtlMDBind(MPIDI_nem_ptl_ni, &md, &REQ_PTL(sreq)->md); MPIR_ERR_CHKANDJUMP1(ret, mpi_errno, MPI_ERR_OTHER, "**ptlmdbind", "**ptlmdbind %s", MPID_nem_ptl_strerror(ret)); REQ_PTL(sreq)->event_handler = handler_send; ret = MPID_nem_ptl_rptl_put(REQ_PTL(sreq)->md, 0, PTL_LARGE_THRESHOLD, PTL_NO_ACK_REQ, vc_ptl->id, vc_ptl->pt, NPTL_MATCH(tag, 
comm->context_id + context_offset, comm->rank), 0, sreq, NPTL_HEADER(ssend_flag | NPTL_LARGE, data_sz)); MPIR_ERR_CHKANDJUMP1(ret, mpi_errno, MPI_ERR_OTHER, "**ptlput", "**ptlput %s", MPID_nem_ptl_strerror(ret)); DBG_MSG_PUT("req", PTL_LARGE_THRESHOLD, vc->pg_rank, NPTL_MATCH(tag, comm->context_id + context_offset, comm->rank), NPTL_HEADER(ssend_flag | NPTL_LARGE, data_sz)); goto fn_exit; }
/* MPIR_Pack_impl - pack 'incount' elements of 'datatype' from 'inbuf' into
 * 'outbuf' starting at byte offset '*position', advancing '*position' past
 * the packed bytes.  Returns MPI_SUCCESS or an MPI error code.
 *
 * Fixes relative to the previous revision:
 *  - the segment is freed if MPID_Segment_init() fails (it used to leak);
 *  - the contiguous fast path now performs the same fits-in-int check on the
 *    updated position that the noncontiguous path already performed.
 */
int MPIR_Pack_impl(const void *inbuf,
                   int incount,
                   MPI_Datatype datatype,
                   void *outbuf,
                   MPI_Aint outsize,
                   MPI_Aint *position)
{
    int mpi_errno = MPI_SUCCESS;
    MPI_Aint first, last;
    MPID_Segment *segp;
    int contig;
    MPI_Aint dt_true_lb;
    MPI_Aint data_sz;

    /* nothing to pack */
    if (incount == 0) {
        goto fn_exit;
    }

    /* Handle contig case quickly */
    if (HANDLE_GET_KIND(datatype) == HANDLE_KIND_BUILTIN) {
        contig = TRUE;
        dt_true_lb = 0;
        data_sz = incount * MPID_Datatype_get_basic_size(datatype);
    }
    else {
        MPID_Datatype *dt_ptr;
        MPID_Datatype_get_ptr(datatype, dt_ptr);
        contig = dt_ptr->is_contig;
        dt_true_lb = dt_ptr->true_lb;
        data_sz = incount * dt_ptr->size;
    }

    if (contig) {
        MPIU_Memcpy((char *) outbuf + *position, (char *)inbuf + dt_true_lb, data_sz);
        /* Ensure that the updated position fits into an int (same check as
         * the noncontiguous path below performs before its (int) cast). */
        MPID_Ensure_Aint_fits_in_int((MPI_Aint)*position + data_sz);
        *position = (int)((MPI_Aint)*position + data_sz);
        goto fn_exit;
    }

    /* non-contig case */

    /* TODO: CHECK RETURN VALUES?? */
    /* TODO: SHOULD THIS ALL BE IN A MPID_PACK??? */
    segp = MPID_Segment_alloc();
    MPIU_ERR_CHKANDJUMP1(segp == NULL, mpi_errno, MPI_ERR_OTHER, "**nomem", "**nomem %s", "MPID_Segment");

    mpi_errno = MPID_Segment_init(inbuf, incount, datatype, segp, 0);
    if (mpi_errno) {
        /* don't leak the segment if initialization fails */
        MPID_Segment_free(segp);
        MPIU_ERR_POP(mpi_errno);
    }

    /* NOTE: the use of buffer values and positions in MPI_Pack and in
     * MPID_Segment_pack are quite different.  See code or docs or something. */
    first = 0;
    last = SEGMENT_IGNORE_LAST;

    /* Ensure that pointer increment fits in a pointer */
    MPID_Ensure_Aint_fits_in_pointer((MPI_VOID_PTR_CAST_TO_MPI_AINT outbuf) + (MPI_Aint) *position);

    MPID_Segment_pack(segp, first, &last, (void *) ((char *) outbuf + *position));

    /* Ensure that calculation fits into an int datatype. */
    MPID_Ensure_Aint_fits_in_int((MPI_Aint)*position + last);

    *position = (int)((MPI_Aint)*position + last);

    MPID_Segment_free(segp);

 fn_exit:
    return mpi_errno;
 fn_fail:
    goto fn_exit;
}
/**
 * \brief MPID buffer copy
 *
 * Implements non-contiguous buffers correctly.
 *
 * Copies up to min(send size, recv size) bytes from (sbuf, scount, sdt) to
 * (rbuf, rcount, rdt), flagging MPI_ERR_TRUNCATE on the receive side when the
 * source data is larger than the destination.
 *
 * \param[in]  sbuf       The address of the input buffer
 * \param[in]  scount     The number of elements in that buffer
 * \param[in]  sdt        The datatype of those elements
 * \param[out] smpi_errno Returns errors
 * \param[in]  rbuf       The address of the output buffer
 * \param[in]  rcount     The number of elements in that buffer
 * \param[in]  rdt        The datatype of those elements
 * \param[out] rsz        The size of the output data
 * \param[out] rmpi_errno Returns errors
 */
void MPIDI_Buffer_copy(
    const void * const sbuf, MPI_Aint scount, MPI_Datatype sdt, int * smpi_errno,
    void * const rbuf, MPI_Aint rcount, MPI_Datatype rdt, MPIDI_msg_sz_t * rsz,
    int * rmpi_errno)
{
    int sdt_contig;
    int rdt_contig;
    MPI_Aint sdt_true_lb, rdt_true_lb;
    MPIDI_msg_sz_t sdata_sz;
    MPIDI_msg_sz_t rdata_sz;
    MPID_Datatype * sdt_ptr;
    MPID_Datatype * rdt_ptr;
    MPI_Aint sdt_extent;
    MPI_Aint rdt_extent;

    *smpi_errno = MPI_SUCCESS;
    *rmpi_errno = MPI_SUCCESS;
    /* printf("bufcopy: src count=%d dt=%d\n", scount, sdt); */
    /* printf("bufcopy: dst count=%d dt=%d\n", rcount, rdt); */

    /* Contiguity, total byte size, and true lower bound of each side. */
    MPIDI_Datatype_get_info(scount, sdt, sdt_contig, sdata_sz, sdt_ptr, sdt_true_lb);
    MPIDI_Datatype_get_info(rcount, rdt, rdt_contig, rdata_sz, rdt_ptr, rdt_true_lb);

    /* --BEGIN ERROR HANDLING-- */
    if (sdata_sz > rdata_sz) {
        /* receive side is smaller: flag truncation, copy only what fits */
        *rmpi_errno = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
                                           __FUNCTION__, __LINE__, MPI_ERR_TRUNCATE,
                                           "**truncate", "**truncate %d %d", sdata_sz, rdata_sz );
        sdata_sz = rdata_sz;
    }
    /* --END ERROR HANDLING-- */

    if (sdata_sz == 0) {
        *rsz = 0;
        goto fn_exit;
    }

    if (sdt_contig && rdt_contig) {
        /* contig -> contig: plain byte copy (device copy for CUDA buffers) */
#if CUDA_AWARE_SUPPORT
        if(MPIDI_Process.cuda_aware_support_on && MPIDI_cuda_is_device_buf(rbuf))
        {
            cudaError_t cudaerr = CudaMemcpy(rbuf + rdt_true_lb, sbuf + sdt_true_lb, sdata_sz, cudaMemcpyHostToDevice);
        }
        else
#endif
        memcpy((char*)rbuf + rdt_true_lb, (const char *)sbuf + sdt_true_lb, sdata_sz);
        *rsz = sdata_sz;
    }
    else if (sdt_contig) {
        /* contig source -> noncontig destination: unpack through a segment */
#if CUDA_AWARE_SUPPORT
        // This will need to be done in two steps:
        // 1 - Allocate a temp buffer which is the same size as user buffer and unpack in it.
        // 2 - Copy unpacked data into user buffer from temp buffer.
        if(MPIDI_Process.cuda_aware_support_on && MPIDI_cuda_is_device_buf(rbuf))
        {
            MPID_Datatype_get_extent_macro(rdt, rdt_extent);
            char *buf = MPL_malloc(rdt_extent * rcount);
            memset(buf, 0, rdt_extent * rcount);
            MPID_Segment seg;
            DLOOP_Offset last;

            MPID_Segment_init(buf, rcount, rdt, &seg, 0);
            last = sdata_sz;
            MPID_Segment_unpack(&seg, 0, &last, (char*)sbuf + sdt_true_lb);
            /* --BEGIN ERROR HANDLING-- */
            if (last != sdata_sz) {
                /* not all bytes consumed: receive datatype mismatch */
                *rmpi_errno = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
                                                   __FUNCTION__, __LINE__, MPI_ERR_TYPE,
                                                   "**dtypemismatch", 0);
            }
            /* --END ERROR HANDLING-- */

            *rsz = last;

            cudaError_t cudaerr = CudaMemcpy(rbuf + rdt_true_lb, buf, rdt_extent * rcount, cudaMemcpyHostToDevice);

            MPL_free(buf);
            goto fn_exit;
        }
#endif

        MPID_Segment seg;
        DLOOP_Offset last;

        MPID_Segment_init(rbuf, rcount, rdt, &seg, 0);
        last = sdata_sz;
        MPID_Segment_unpack(&seg, 0, &last, (char*)sbuf + sdt_true_lb);
        /* --BEGIN ERROR HANDLING-- */
        if (last != sdata_sz) {
            /* not all bytes consumed: receive datatype mismatch */
            *rmpi_errno = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
                                               __FUNCTION__, __LINE__, MPI_ERR_TYPE,
                                               "**dtypemismatch", 0);
        }
        /* --END ERROR HANDLING-- */
        *rsz = last;
    }
    else if (rdt_contig) {
        /* noncontig source -> contig destination: pack directly into rbuf */
        MPID_Segment seg;
        DLOOP_Offset last;

        MPID_Segment_init(sbuf, scount, sdt, &seg, 0);
        last = sdata_sz;
        MPID_Segment_pack(&seg, 0, &last, (char*)rbuf + rdt_true_lb);
        /* --BEGIN ERROR HANDLING-- */
        if (last != sdata_sz) {
            *rmpi_errno = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
                                               __FUNCTION__, __LINE__, MPI_ERR_TYPE,
                                               "**dtypemismatch", 0);
        }
        /* --END ERROR HANDLING-- */
        *rsz = last;
    }
    else {
        /* noncontig -> noncontig: pipeline pack/unpack through a bounded
         * staging buffer of MPIDI_COPY_BUFFER_SZ bytes */
        char * buf;
        MPIDI_msg_sz_t buf_off;
        MPID_Segment sseg;
        MPIDI_msg_sz_t sfirst;
        MPID_Segment rseg;
        MPIDI_msg_sz_t rfirst;

        buf = MPL_malloc(MPIDI_COPY_BUFFER_SZ);
        /* --BEGIN ERROR HANDLING-- */
        if (buf == NULL) {
            *smpi_errno = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_FATAL,
                                               __FUNCTION__, __LINE__, MPI_ERR_OTHER,
                                               "**nomem", 0);
            *rmpi_errno = *smpi_errno;
            *rsz = 0;
            goto fn_exit;
        }
        /* --END ERROR HANDLING-- */

        MPID_Segment_init(sbuf, scount, sdt, &sseg, 0);
        MPID_Segment_init(rbuf, rcount, rdt, &rseg, 0);

        sfirst = 0;
        rfirst = 0;
        buf_off = 0;

        for(;;) {
            DLOOP_Offset last;
            char * buf_end;

            /* next chunk: as much source data as fits behind leftover bytes */
            if (sdata_sz - sfirst > MPIDI_COPY_BUFFER_SZ - buf_off) {
                last = sfirst + (MPIDI_COPY_BUFFER_SZ - buf_off);
            }
            else {
                last = sdata_sz;
            }

            MPID_Segment_pack(&sseg, sfirst, &last, buf + buf_off);
            /* --BEGIN ERROR HANDLING-- */
            MPID_assert(last > sfirst);
            /* --END ERROR HANDLING-- */

            buf_end = buf + buf_off + (last - sfirst);
            sfirst = last;

            /* unpack as much staged data as the receive type will consume */
            MPID_Segment_unpack(&rseg, rfirst, &last, buf);
            /* --BEGIN ERROR HANDLING-- */
            MPID_assert(last > rfirst);
            /* --END ERROR HANDLING-- */

            rfirst = last;

            if (rfirst == sdata_sz) {
                /* successful completion */
                break;
            }

            /* --BEGIN ERROR HANDLING-- */
            if (sfirst == sdata_sz) {
                /* datatype mismatch -- remaining bytes could not be unpacked */
                *rmpi_errno = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
                                                   __FUNCTION__, __LINE__, MPI_ERR_TYPE,
                                                   "**dtypemismatch", 0);
                break;
            }
            /* --END ERROR HANDLING-- */

            /* slide the not-yet-unpacked tail to the front of the buffer */
            buf_off = sfirst - rfirst;
            if (buf_off > 0) {
                memmove(buf, buf_end - buf_off, buf_off);
            }
        }

        *rsz = rfirst;
        MPL_free(buf);
    }

  fn_exit:
    return;
}
/* MPIDO_Scatterv_simple - simple PAMI scatterv.
 *
 * The root packs a noncontiguous send buffer (and scales counts/displs to
 * bytes) when the datatype cannot be expressed as a PAMI type; noncontiguous
 * receivers stage into a temporary buffer and Localcopy out afterwards.
 *
 * Fixes relative to the previous revision:
 *  - scounts/sdispls were allocated with MPIU_Malloc(size) -- 'size' BYTES
 *    for 'size' ints -- a heap overflow; now size * sizeof(int);
 *  - the send-side cleanup used to run whenever the send type was
 *    noncontiguous, freeing the caller's sendcounts/displs arrays when
 *    MPIDI_Datatype_to_pami() had succeeded (no allocation was made); the
 *    frees are now guarded by snd_noncontig_buff != NULL;
 *  - the receive-side copy-out is guarded by rcv_noncontig_buff != NULL so
 *    a noncontiguous MPI_IN_PLACE root does not Localcopy from NULL.
 */
int MPIDO_Scatterv_simple(const void *sendbuf, const int *sendcounts, const int *displs,
                          MPI_Datatype sendtype, void *recvbuf, int recvcount,
                          MPI_Datatype recvtype, int root, MPID_Comm *comm_ptr, int *mpierrno)
{
#ifndef HAVE_PAMI_IN_PLACE
  if (sendbuf == MPI_IN_PLACE)
  {
    MPID_Abort (NULL, 0, 1, "'MPI_IN_PLACE' requries support for `PAMI_IN_PLACE`");
    return -1;
  }
#endif
  int snd_contig = 1;
  int rcv_contig = 1;
  int send_size = 0, recv_size = 0;
  int ssize = 0;
  MPID_Datatype *dt_ptr = NULL;
  MPI_Aint send_true_lb=0, recv_true_lb=0;
  void *snd_noncontig_buff = NULL, *rcv_noncontig_buff = NULL;
  void *sbuf = NULL, *rbuf = NULL;
  int *sdispls = NULL, *scounts = NULL;
  int sndcount = 0;
  MPID_Segment segment;
  int tmp, i;
  pami_type_t stype = PAMI_TYPE_NULL;

  const int rank = comm_ptr->rank;
  const int size = comm_ptr->local_size;
  const struct MPIDI_Comm* const mpid = &(comm_ptr->mpid);

  if (rank == root && sendtype != MPI_DATATYPE_NULL && sendcounts[0] >= 0)
  {
    MPIDI_Datatype_get_info(1, sendtype, snd_contig, ssize, dt_ptr, send_true_lb);
    if(MPIDI_Pamix_collsel_advise != NULL && mpid->collsel_fast_query != NULL)
    {
      /* ask the collective-selection advisor whether to fall back to MPICH */
      advisor_algorithm_t advisor_algorithms[1];
      int num_algorithms = MPIDI_Pamix_collsel_advise(mpid->collsel_fast_query, PAMI_XFER_SCATTERV_INT, 64, advisor_algorithms, 1);
      if(num_algorithms)
      {
        if(advisor_algorithms[0].algorithm_type == COLLSEL_EXTERNAL_ALGO)
        {
          return MPIR_Scatterv(sendbuf, sendcounts, displs, sendtype,
                               recvbuf, recvcount, recvtype,
                               root, comm_ptr, mpierrno);
        }
        else if(advisor_algorithms[0].metadata && advisor_algorithms[0].metadata->check_correct.values.asyncflowctl && !(--(comm_ptr->mpid.num_requests)))
        {
          /* flow-control: periodically barrier to drain outstanding requests */
          comm_ptr->mpid.num_requests = MPIDI_Process.optimized.num_requests;
          int tmpmpierrno;
          MPIDO_Barrier(comm_ptr, &tmpmpierrno);
        }
      }
    }
  }

  if (recvtype != MPI_DATATYPE_NULL && recvcount >= 0)
  {
    MPIDI_Datatype_get_info(recvcount, recvtype, rcv_contig, recv_size, dt_ptr, recv_true_lb);
    if(MPIDI_Pamix_collsel_advise != NULL && mpid->collsel_fast_query != NULL)
    {
      advisor_algorithm_t advisor_algorithms[1];
      int num_algorithms = MPIDI_Pamix_collsel_advise(mpid->collsel_fast_query, PAMI_XFER_SCATTERV_INT, 64, advisor_algorithms, 1);
      if(num_algorithms)
      {
        if(advisor_algorithms[0].algorithm_type == COLLSEL_EXTERNAL_ALGO)
        {
          return MPIR_Scatterv(sendbuf, sendcounts, displs, sendtype,
                               recvbuf, recvcount, recvtype,
                               root, comm_ptr, mpierrno);
        }
        else if(advisor_algorithms[0].metadata && advisor_algorithms[0].metadata->check_correct.values.asyncflowctl && !(--(comm_ptr->mpid.num_requests)))
        {
          comm_ptr->mpid.num_requests = MPIDI_Process.optimized.num_requests;
          int tmpmpierrno;
          MPIDO_Barrier(comm_ptr, &tmpmpierrno);
        }
      }
    }
  }

  pami_xfer_t scatterv;
  const pami_metadata_t *my_scatterv_md;
  volatile unsigned scatterv_active = 1;

  sbuf = (char *)sendbuf + send_true_lb;
  rbuf = (char *)recvbuf + recv_true_lb;
  scounts = (int*)sendcounts;
  sdispls = (int*)displs;

  if(rank == root)
  {
    if(MPIDI_Datatype_to_pami(sendtype, &stype, -1, NULL, &tmp) != MPI_SUCCESS)
    {
      if (!snd_contig)
      {
        /* pack the noncontiguous send buffer and rescale counts/displs to
         * bytes.  NOTE: allocate size ints, not size bytes (old bug). */
        scounts = (int*)MPIU_Malloc(size * sizeof(int));
        sdispls = (int*)MPIU_Malloc(size * sizeof(int));
        if(scounts == NULL || sdispls == NULL)
        {
          MPID_Abort(NULL, MPI_ERR_NO_SPACE, 1, "Fatal: Cannot allocate pack buffer");
        }
        for(i = 0; i < size; i++)
        {
          scounts[i] = ssize * sendcounts[i];
          sdispls[i] = ssize * displs[i];
          send_size += scounts[i];
          sndcount += sendcounts[i];
        }
        snd_noncontig_buff = MPIU_Malloc(send_size);
        sbuf = snd_noncontig_buff;
        stype = PAMI_TYPE_BYTE;
        if(snd_noncontig_buff == NULL)
        {
          MPID_Abort(NULL, MPI_ERR_NO_SPACE, 1, "Fatal: Cannot allocate pack buffer");
        }
        DLOOP_Offset last = send_size;
        MPID_Segment_init(sendbuf, sndcount, sendtype, &segment, 0);
        MPID_Segment_pack(&segment, 0, &last, snd_noncontig_buff);
      }
    }
    if(recvbuf == MPI_IN_PLACE)
    {
      rbuf = PAMI_IN_PLACE;
    }
  }

  if(recvbuf != MPI_IN_PLACE)
  {
    if (!rcv_contig)
    {
      /* stage into a contiguous buffer; copied out after the collective */
      rcv_noncontig_buff = MPIU_Malloc(recv_size);
      rbuf = rcv_noncontig_buff;
      if(rcv_noncontig_buff == NULL)
      {
        MPID_Abort(NULL, MPI_ERR_NO_SPACE, 1, "Fatal: Cannot allocate pack buffer");
      }
    }
  }

  scatterv.cb_done = cb_scatterv;
  scatterv.cookie = (void *)&scatterv_active;
  scatterv.cmd.xfer_scatterv_int.root = MPIDI_Task_to_endpoint(MPID_VCR_GET_LPID(comm_ptr->vcr, root), 0);
  scatterv.algorithm = mpid->coll_algorithm[PAMI_XFER_SCATTERV_INT][0][0];
  my_scatterv_md = &mpid->coll_metadata[PAMI_XFER_SCATTERV_INT][0][0];
  scatterv.cmd.xfer_scatterv_int.rcvbuf = rbuf;
  scatterv.cmd.xfer_scatterv_int.sndbuf = sbuf;
  scatterv.cmd.xfer_scatterv_int.stype = stype;
  scatterv.cmd.xfer_scatterv_int.rtype = PAMI_TYPE_BYTE;/* rtype is ignored when rcvbuf == PAMI_IN_PLACE */
  scatterv.cmd.xfer_scatterv_int.stypecounts = (int *) scounts;
  scatterv.cmd.xfer_scatterv_int.rtypecount = recv_size;
  scatterv.cmd.xfer_scatterv_int.sdispls = (int *) sdispls;

  MPIDI_Update_last_algorithm(comm_ptr, my_scatterv_md->name);

  MPIDI_Post_coll_t scatterv_post;
  TRACE_ERR("%s scatterv\n", MPIDI_Process.context_post.active>0?"Posting":"Invoking");
  MPIDI_Context_post(MPIDI_Context[0], &scatterv_post.state,
                     MPIDI_Pami_post_wrapper, (void *)&scatterv);

  TRACE_ERR("Waiting on active %d\n", scatterv_active);
  MPID_PROGRESS_WAIT_WHILE(scatterv_active);

  /* copy staged receive data out; only when we actually staged it */
  if(!rcv_contig && rcv_noncontig_buff != NULL)
  {
    MPIR_Localcopy(rcv_noncontig_buff, recv_size, MPI_CHAR,
                   recvbuf, recvcount, recvtype);
    MPIU_Free(rcv_noncontig_buff);
  }
  /* free the pack buffer and scaled arrays only if we allocated them;
   * otherwise scounts/sdispls still alias the caller's arrays */
  if(!snd_contig && snd_noncontig_buff != NULL)
  {
    MPIU_Free(snd_noncontig_buff);
    MPIU_Free(scounts);
    MPIU_Free(sdispls);
  }

  TRACE_ERR("Leaving MPIDO_Scatterv_optimized\n");
  return MPI_SUCCESS;
}
    /* create a request */
    MPIDI_Request_create_sreq(sreq, mpi_errno, goto fn_exit);
    MPIU_Assert(sreq != NULL);
    MPIDI_Request_set_type(sreq, MPIDI_REQUEST_TYPE_SEND);
    MPIDI_VC_FAI_send_seqnum(vc, seqnum);
    MPIDI_Request_set_seqnum(sreq, seqnum);
    /* keep a reference on derived datatypes for the lifetime of the request */
    if (HANDLE_GET_KIND(datatype) != HANDLE_KIND_BUILTIN) {
        MPID_Datatype_get_ptr(datatype, sreq->dev.datatype_ptr);
        MPID_Datatype_add_ref(sreq->dev.datatype_ptr);
    }
    sreq->partner_request = NULL;
    sreq->dev.OnDataAvail = NULL;
    sreq->dev.tmpbuf = NULL;
    sreq->ch.vc = vc;
    sreq->ch.noncontig = FALSE;

    _dbg_mxm_output(5, "isSend ========> Sending USER msg for req %p (context %d to %d tag %d size %d) \n", sreq, comm->context_id + context_offset, rank, tag, data_sz);

    vc_area = VC_BASE(vc);
    req_area = REQ_BASE(sreq);

    req_area->ctx = sreq;
    /* start out pointing at the inline iov storage; no payload yet */
    req_area->iov_buf = req_area->tmp_buf;
    req_area->iov_count = 0;
    req_area->iov_buf[0].ptr = NULL;
    req_area->iov_buf[0].length = 0;

    if (data_sz) {
        if (dt_contig) {
            /* contiguous payload: point straight at the user buffer */
            req_area->iov_count = 1;
            req_area->iov_buf[0].ptr = (char *) (buf) + dt_true_lb;
            req_area->iov_buf[0].length = data_sz;
        }
        else {
            /* noncontiguous payload: pack into a temporary buffer */
            MPIDI_msg_sz_t last;
            MPI_Aint packsize = 0;

            sreq->ch.noncontig = TRUE;
            sreq->dev.segment_ptr = MPID_Segment_alloc();
            MPIU_ERR_CHKANDJUMP1((sreq->dev.segment_ptr == NULL), mpi_errno, MPI_ERR_OTHER,
                                 "**nomem", "**nomem %s", "MPID_Segment_alloc");
            MPIR_Pack_size_impl(count, datatype, &packsize);

            last = data_sz;
            if (packsize > 0) {
                sreq->dev.tmpbuf = MPIU_Malloc((size_t) packsize);
                MPIU_Assert(sreq->dev.tmpbuf);
                MPID_Segment_init(buf, count, datatype, sreq->dev.segment_ptr, 0);
                MPID_Segment_pack(sreq->dev.segment_ptr, 0, &last, sreq->dev.tmpbuf);

                req_area->iov_count = 1;
                req_area->iov_buf[0].ptr = sreq->dev.tmpbuf;
                req_area->iov_buf[0].length = last;
            }
        }
    }

    vc_area->pending_sends += 1;

    /* hand the iov off to the MXM layer as a synchronous-mode send */
    mpi_errno = _mxm_isend(vc_area->mxm_ep, req_area, MXM_MPICH_ISEND_SYNC,
                           (mxm_mq_h) comm->dev.ch.netmod_priv, comm->rank, tag,
                           _mxm_tag_mpi2mxm(tag, comm->context_id + context_offset), 0);
    if (mpi_errno)
        MPIU_ERR_POP(mpi_errno);

    _dbg_mxm_out_req(sreq);

  fn_exit:
    *sreq_ptr = sreq;
    MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_MXM_ISSEND);
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}

/* Complete a finished MXM send request: release packed temp buffers and any
 * oversized iov array, then run the request's completion action.
 * Returns TRUE when the request was completed. */
static int _mxm_handle_sreq(MPID_Request * req)
{
    int complete = FALSE;
    int (*reqFn) (MPIDI_VC_t *, MPID_Request *, int *);
    MPID_nem_mxm_vc_area *vc_area = NULL;
    MPID_nem_mxm_req_area *req_area = NULL;

    vc_area = VC_BASE(req->ch.vc);
    req_area = REQ_BASE(req);

    _dbg_mxm_out_buf(req_area->iov_buf[0].ptr,
                     (req_area->iov_buf[0].length > 16 ? 16 : req_area->iov_buf[0].length));

    vc_area->pending_sends -= 1;
    /* tmpbuf was the pack buffer for a noncontiguous (derived-type) send */
    if (((req->dev.datatype_ptr != NULL) && (req->dev.tmpbuf != NULL))) {
        MPIU_Free(req->dev.tmpbuf);
    }

    /* iov_buf was heap-allocated only when it outgrew the inline tmp_buf */
    if (req_area->iov_count > MXM_MPICH_MAX_IOV) {
        MPIU_Free(req_area->iov_buf);
        req_area->iov_buf = req_area->tmp_buf;
        req_area->iov_count = 0;
    }

    reqFn = req->dev.OnDataAvail;
    if (!reqFn) {
        MPIDI_CH3U_Request_complete(req);
        MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, ".... complete");
    }
    else {
        MPIDI_VC_t *vc = req->ch.vc;

        reqFn(vc, req, &complete);
        if (!complete) {
            MPIU_Assert(complete == TRUE);
        }
    }

    return complete;
}

/* MXM completion callback for sends: convert the MXM status into an MPI
 * status, recycle the mxm request, and finish the MPID request unless the
 * send was cancelled. */
static void _mxm_send_completion_cb(void *context)
{
    MPID_Request *req = (MPID_Request *) context;
    MPID_nem_mxm_vc_area *vc_area = NULL;
    MPID_nem_mxm_req_area *req_area = NULL;

    MPIU_Assert(req);
    _dbg_mxm_out_req(req);

    vc_area = VC_BASE(req->ch.vc);
    req_area = REQ_BASE(req);

    _mxm_to_mpi_status(req_area->mxm_req->item.base.error, &req->status);

    /* return the mxm request object to the endpoint free list */
    list_enqueue(&vc_area->mxm_ep->free_queue, &req_area->mxm_req->queue);

    _dbg_mxm_output(5, "========> %s SEND req %p status %d\n",
                    (MPIR_STATUS_GET_CANCEL_BIT(req->status) ? "Canceling" : "Completing"),
                    req, req->status.MPI_ERROR);

    if (likely(!MPIR_STATUS_GET_CANCEL_BIT(req->status))) {
        _mxm_handle_sreq(req);
    }
}

/* Post a send through MXM.  'type' selects AM / synchronous / standard mode;
 * 'block' waits for local completion before returning.
 * Returns MPI_SUCCESS or MPI_ERR_OTHER. */
static int _mxm_isend(MPID_nem_mxm_ep_t * ep, MPID_nem_mxm_req_area * req,
                      int type, mxm_mq_h mxm_mq, int mxm_rank, int id,
                      mxm_tag_t mxm_tag, int block)
{
    int mpi_errno = MPI_SUCCESS;
    mxm_error_t ret = MXM_OK;
    mxm_send_req_t *mxm_sreq;
    list_head_t *free_queue = NULL;

    MPIU_Assert(ep);
    MPIU_Assert(req);

    /* grab an mxm request from the free list, growing it once if empty */
    free_queue = &ep->free_queue;
    req->mxm_req = list_dequeue_mxm_req(free_queue);
    if (!req->mxm_req) {
        list_grow_mxm_req(free_queue);
        req->mxm_req = list_dequeue_mxm_req(free_queue);
        if (!req->mxm_req) {
            MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, "empty free queue");
            mpi_errno = MPI_ERR_OTHER;
            goto fn_fail;
        }
    }
    mxm_sreq = &(req->mxm_req->item.send);

    mxm_sreq->base.state = MXM_REQ_NEW;
    mxm_sreq->base.mq = mxm_mq;
    mxm_sreq->base.conn = ep->mxm_conn;
    mxm_sreq->base.completed_cb = _mxm_send_completion_cb;
    mxm_sreq->base.context = req->ctx;

    if (type == MXM_MPICH_ISEND_AM) {
        mxm_sreq->opcode = MXM_REQ_OP_AM;
        mxm_sreq->flags = 0;

        mxm_sreq->op.am.hid = id;
        mxm_sreq->op.am.imm_data = mxm_rank;
    }
    else if (type == MXM_MPICH_ISEND_SYNC) {
        mxm_sreq->opcode = MXM_REQ_OP_SEND_SYNC;
        mxm_sreq->flags = 0;

        mxm_sreq->op.send.tag = mxm_tag;
        mxm_sreq->op.send.imm_data = mxm_rank;
    }
    else {
        mxm_sreq->opcode = MXM_REQ_OP_SEND;
        mxm_sreq->flags = 0;

        mxm_sreq->op.send.tag = mxm_tag;
        mxm_sreq->op.send.imm_data = mxm_rank;
    }

    /* single buffer fast path vs. iov descriptor */
    if (likely(req->iov_count == 1)) {
        mxm_sreq->base.data_type = MXM_REQ_DATA_BUFFER;
        mxm_sreq->base.data.buffer.ptr = req->iov_buf[0].ptr;
        mxm_sreq->base.data.buffer.length = req->iov_buf[0].length;
    }
    else {
        mxm_sreq->base.data_type = MXM_REQ_DATA_IOV;
        mxm_sreq->base.data.iov.vector = req->iov_buf;
        mxm_sreq->base.data.iov.count = req->iov_count;
    }

    ret = mxm_req_send(mxm_sreq);
    if (MXM_OK != ret) {
        /* send failed: return the mxm request before reporting the error */
        list_enqueue(free_queue, &req->mxm_req->queue);
        mpi_errno = MPI_ERR_OTHER;
        goto fn_fail;
    }

    if (block)
        _mxm_req_wait(&mxm_sreq->base);

  fn_exit:
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}

#if 0

/* Consider using this function in case non contiguous data */
/* Builds an iov list for a noncontiguous send; entries past
 * MXM_REQ_DATA_MAX_IOV-1 are coalesced into one packed tmpbuf entry. */
static int _mxm_process_sdtype(MPID_Request ** sreq_p, MPI_Datatype datatype,
                               MPID_Datatype * dt_ptr, MPIDI_msg_sz_t data_sz,
                               const void *buf, int count,
                               mxm_req_buffer_t ** iov_buf, int *iov_count)
{
    int mpi_errno = MPI_SUCCESS;
    MPID_Request *sreq = *sreq_p;
    MPIDI_msg_sz_t last;
    MPID_IOV *iov;
    int n_iov = 0;
    int index;
    int size_to_copy = 0;

    sreq->dev.segment_ptr = MPID_Segment_alloc();
    MPIU_ERR_CHKANDJUMP1((sreq->dev.segment_ptr == NULL), mpi_errno, MPI_ERR_OTHER,
                         "**nomem", "**nomem %s", "MPID_Segment_alloc");

    MPID_Segment_init(buf, count, datatype, sreq->dev.segment_ptr, 0);
    sreq->dev.segment_first = 0;
    sreq->dev.segment_size = data_sz;

    /* count the contiguous pieces, then materialize them as an iov */
    last = sreq->dev.segment_size;
    MPID_Segment_count_contig_blocks(sreq->dev.segment_ptr, sreq->dev.segment_first, &last,
                                     (MPI_Aint *) & n_iov);
    MPIU_Assert(n_iov > 0);
    iov = MPIU_Malloc(n_iov * sizeof(*iov));
    MPIU_Assert(iov);

    last = sreq->dev.segment_size;
    MPID_Segment_pack_vector(sreq->dev.segment_ptr, sreq->dev.segment_first, &last, iov, &n_iov);
    MPIU_Assert(last == sreq->dev.segment_size);

#if defined(MXM_DEBUG) && (MXM_DEBUG > 0)
    _dbg_mxm_output(7, "Send Noncontiguous data vector %i entries (free slots : %i)\n", n_iov, MXM_REQ_DATA_MAX_IOV);
    for(index = 0; index < n_iov; index++) {
        _dbg_mxm_output(7, "======= Recv iov[%i] = ptr : %p, len : %i \n",
                        index, iov[index].MPID_IOV_BUF, iov[index].MPID_IOV_LEN);
    }
#endif

    if (n_iov > MXM_MPICH_MAX_IOV) {
        *iov_buf = (mxm_req_buffer_t *) MPIU_Malloc(n_iov * sizeof(**iov_buf));
        MPIU_Assert(*iov_buf);
    }

    /* first MXM_REQ_DATA_MAX_IOV-1 pieces go out directly; the rest are
     * summed here and packed into one trailing buffer below */
    for (index = 0; index < n_iov; index++) {
        if (index < (MXM_REQ_DATA_MAX_IOV - 1)) {
            (*iov_buf)[index].ptr = iov[index].MPID_IOV_BUF;
            (*iov_buf)[index].length = iov[index].MPID_IOV_LEN;
        }
        else {
            size_to_copy += iov[index].MPID_IOV_LEN;
        }
    }

    if (size_to_copy == 0) {
        sreq->dev.tmpbuf = NULL;
        sreq->dev.tmpbuf_sz = 0;
        *iov_count = n_iov;
    }
    else {
        int offset = 0;

        sreq->dev.tmpbuf = MPIU_Malloc(size_to_copy);
        sreq->dev.tmpbuf_sz = size_to_copy;
        MPIU_Assert(sreq->dev.tmpbuf);
        for (index = (MXM_REQ_DATA_MAX_IOV - 1); index < n_iov; index++) {
            MPIU_Memcpy((char *) (sreq->dev.tmpbuf) + offset,
                        iov[index].MPID_IOV_BUF, iov[index].MPID_IOV_LEN);
            offset += iov[index].MPID_IOV_LEN;
        }
        (*iov_buf)[MXM_REQ_DATA_MAX_IOV - 1].ptr = sreq->dev.tmpbuf;
        (*iov_buf)[MXM_REQ_DATA_MAX_IOV - 1].length = size_to_copy;
        *iov_count = MXM_REQ_DATA_MAX_IOV;
    }
    MPIU_Free(iov);

  fn_exit:
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}
/* MPIDI_CH3U_Request_unpack_uebuf - copy data from the request's unexpected
 * buffer (rreq->dev.tmpbuf) into the user's receive buffer, truncating and
 * setting MPI_ERR_TRUNCATE in the request status when the message is larger
 * than the posted buffer.  Returns MPI_SUCCESS (errors are recorded in
 * rreq->status). */
int MPIDI_CH3U_Request_unpack_uebuf(MPID_Request * rreq)
{
    int dt_contig;
    MPI_Aint dt_true_lb;
    MPIDI_msg_sz_t userbuf_sz;
    MPID_Datatype * dt_ptr;
    MPIDI_msg_sz_t unpack_sz;
    int mpi_errno = MPI_SUCCESS;
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3U_REQUEST_UNPACK_UEBUF);
    MPIDI_STATE_DECL(MPID_STATE_MEMCPY);

    MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3U_REQUEST_UNPACK_UEBUF);

    MPIDI_Datatype_get_info(rreq->dev.user_count, rreq->dev.datatype,
                            dt_contig, userbuf_sz, dt_ptr, dt_true_lb);

    /* unpack no more than the posted buffer can hold */
    if (rreq->dev.recv_data_sz <= userbuf_sz) {
        unpack_sz = rreq->dev.recv_data_sz;
    }
    else {
        /* --BEGIN ERROR HANDLING-- */
        MPIU_DBG_MSG_FMT(CH3_CHANNEL,VERBOSE,(MPIU_DBG_FDEST,
            "receive buffer overflow; message truncated, msg_sz=" MPIDI_MSG_SZ_FMT ", buf_sz=" MPIDI_MSG_SZ_FMT,
            rreq->dev.recv_data_sz, userbuf_sz));
        unpack_sz = userbuf_sz;
        MPIR_STATUS_SET_COUNT(rreq->status, userbuf_sz);
        rreq->status.MPI_ERROR = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
                                                      FCNAME, __LINE__, MPI_ERR_TRUNCATE,
                                                      "**truncate", "**truncate %d %d",
                                                      rreq->dev.recv_data_sz, userbuf_sz);
        /* --END ERROR HANDLING-- */
    }

    if (unpack_sz > 0) {
        if (dt_contig) {
            /* TODO - check that amount of data is consistent with datatype.
               In other words, if we were to use Segment_unpack()
               would last = unpack?  If not we should return an error
               (unless configured with --enable-fast) */
            MPIDI_FUNC_ENTER(MPID_STATE_MEMCPY);
            MPIU_Memcpy((char *)rreq->dev.user_buf + dt_true_lb, rreq->dev.tmpbuf, unpack_sz);
            MPIDI_FUNC_EXIT(MPID_STATE_MEMCPY);
        }
        else {
            /* noncontiguous receive type: unpack through a segment */
            MPID_Segment seg;
            MPI_Aint last;

            MPID_Segment_init(rreq->dev.user_buf, rreq->dev.user_count,
                              rreq->dev.datatype, &seg, 0);
            last = unpack_sz;
            MPID_Segment_unpack(&seg, 0, &last, rreq->dev.tmpbuf);
            if (last != unpack_sz) {
                /* --BEGIN ERROR HANDLING-- */
                /* received data was not entirely consumed by unpack()
                   because too few bytes remained to fill the next basic
                   datatype */
                MPIR_STATUS_SET_COUNT(rreq->status, last);
                rreq->status.MPI_ERROR = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
                                                              FCNAME, __LINE__, MPI_ERR_TYPE,
                                                              "**dtypemismatch", 0);
                /* --END ERROR HANDLING-- */
            }
        }
    }

    MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3U_REQUEST_UNPACK_UEBUF);
    return mpi_errno;
}
/* MPID_nem_tcp_module_lmt_start_recv - receive a large message over the
 * dedicated LMT TCP connection, accepting that connection on first use.
 * Builds iovs from the request's datatype segment and drains the socket with
 * readv() until all data_sz bytes have been received, then completes 'req'.
 *
 * NOTE(review): accept() is passed an 'int len'; POSIX specifies socklen_t --
 * confirm these match on the supported platforms. */
int MPID_nem_tcp_module_lmt_start_recv (MPIDI_VC_t *vc, MPID_Request *req)
{
    int mpi_errno = MPI_SUCCESS;
    int ret;
    MPIDI_msg_sz_t data_sz;
    int dt_contig;
    MPI_Aint dt_true_lb;
    MPID_Datatype * dt_ptr;
    MPIDI_msg_sz_t last;
    int nb;
    int r_len;
    MPIDI_CH3I_VC *vc_ch = (MPIDI_CH3I_VC *)vc->channel_private;
    MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_TCP_MODULE_LMT_START_RECV);

    MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_TCP_MODULE_LMT_START_RECV);

    free_cookie (vc_ch->net.tcp.lmt_cookie);

    if (!vc_ch->net.tcp.lmt_connected)
    {
        /* first LMT on this VC: accept the sender's connection and replace
           the listening descriptor with the connected one */
        int len;
        struct sockaddr_in saddr;
        int connfd;

        len = sizeof (saddr);
        connfd = accept (vc_ch->net.tcp.lmt_desc, (struct sockaddr *)&saddr, &len);
        MPIU_ERR_CHKANDJUMP2 (connfd == -1, mpi_errno, MPI_ERR_OTHER, "**sock|poll|accept", "**sock|poll|accept %d %s", errno, strerror (errno));

        /* close listen fd */
        do
            ret = close (vc_ch->net.tcp.lmt_desc);
        while (ret == -1 && errno == EINTR);
        MPIU_ERR_CHKANDJUMP2 (ret == -1, mpi_errno, MPI_ERR_OTHER, "**closesocket", "**closesocket %s %d", strerror (errno), errno);

        /* set lmt_desc to new connected fd */
        vc_ch->net.tcp.lmt_desc = connfd;
        vc_ch->net.tcp.lmt_connected = 1;

//         ret = fcntl (vc_ch->net.tcp.lmt_desc, F_SETFL, O_NONBLOCK);
//         MPIU_ERR_CHKANDJUMP2 (ret == -1, mpi_errno, MPI_ERR_OTHER, "**fail", "**fail %s %d", strerror (errno), errno);
    }

    MPIDI_Datatype_get_info (req->dev.user_count, req->dev.datatype, dt_contig, data_sz, dt_ptr, dt_true_lb);

    /* reconcile local buffer size with the sender's announced length */
    if (data_sz > vc_ch->net.tcp.lmt_s_len)
    {
        data_sz = vc_ch->net.tcp.lmt_s_len;
    }
    else if (data_sz < vc_ch->net.tcp.lmt_s_len)
    {
        /* message will be truncated */
        r_len = data_sz;
        req->status.MPI_ERROR = MPIU_ERR_SET2 (mpi_errno, MPI_ERR_TRUNCATE, "**truncate", "**truncate %d %d", vc_ch->net.tcp.lmt_s_len, r_len);
    }

    MPID_Segment_init (req->dev.user_buf, req->dev.user_count, req->dev.datatype, &req->dev.segment, 0);
    req->dev.segment_first = 0;
    req->dev.segment_size = data_sz;
    req->dev.iov_count = MPID_IOV_LIMIT;
    req->dev.iov_offset = 0;

    last = data_sz;

    /* NOTE(review): segment_first and iov_count are not refreshed between
       iterations of this outer loop; verify messages needing more than one
       flight of MPID_IOV_LIMIT iovs make progress. */
    do
    {
        int iov_offset;
        int left_to_recv;

        /* build the next batch of iovs covering [segment_first, last) */
        MPID_Segment_unpack_vector (&req->dev.segment, req->dev.segment_first, &last, req->dev.iov, &req->dev.iov_count);

        left_to_recv = last - req->dev.segment_first;
        iov_offset = 0;
#ifdef TESTING_CHUNKING
        {
            /* test mode: read in fixed CHUNK-sized pieces into the first iov */
            char *buf = req->dev.iov[0].MPID_IOV_BUF;
            int l;
            while (left_to_recv)
            {
                if (left_to_recv > CHUNK)
                    l = CHUNK;
                else
                    l = left_to_recv;
                do
                    nb = read (vc_ch->net.tcp.lmt_desc, buf, l);
                while (nb == -1 && errno == EINTR);
                MPIU_ERR_CHKANDJUMP (nb == -1, mpi_errno, MPI_ERR_OTHER, "**sock_writev");
                left_to_recv -= nb;
                buf += nb;
            }
            MPIDI_CH3U_Request_complete (req);
            goto fn_exit;
        }
#endif
        /* first read attempt over the whole iov batch; retry on EINTR */
        do
            nb = readv (vc_ch->net.tcp.lmt_desc, &req->dev.iov[iov_offset], req->dev.iov_count - iov_offset);
        while (nb == -1 && errno == EINTR);
        MPIU_ERR_CHKANDJUMP2 (nb == -1, mpi_errno, MPI_ERR_OTHER, "**fail", "**fail %s %d", strerror (errno), errno);
        MPIU_ERR_CHKANDJUMP (nb == 0, mpi_errno, MPI_ERR_OTHER, "**fail");

        left_to_recv -= nb;
        while (left_to_recv)
        {
            /* recv rest of iov */
            while (nb >= req->dev.iov[iov_offset].MPID_IOV_LEN)
            {
                /* update iov to reflect sent bytes */
                nb -= req->dev.iov[iov_offset].MPID_IOV_LEN;
                ++iov_offset;
            }
            /* advance partially-filled iov entry past the bytes received */
            req->dev.iov[iov_offset].MPID_IOV_BUF = (char *)req->dev.iov[iov_offset].MPID_IOV_BUF + nb;
            req->dev.iov[iov_offset].MPID_IOV_LEN -= nb;

            do
                nb = readv (vc_ch->net.tcp.lmt_desc, &req->dev.iov[iov_offset], req->dev.iov_count - iov_offset);
            while (nb == -1 && errno == EINTR);
            MPIU_ERR_CHKANDJUMP2 (nb == -1, mpi_errno, MPI_ERR_OTHER, "**fail", "**fail %s %d", strerror (errno), errno);
            MPIU_ERR_CHKANDJUMP (nb == 0, mpi_errno, MPI_ERR_OTHER, "**fail");
            left_to_recv -= nb;
        }
    }
    while (last < data_sz);

    MPIDI_CH3U_Request_complete (req);

 fn_exit:
    MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_TCP_MODULE_LMT_START_RECV);
    return mpi_errno;
 fn_fail:
    goto fn_exit;
}
/*@
   MPI_Unpack_external - Unpack a buffer (packed with MPI_Pack_external)
   according to a datatype into contiguous memory

Input Parameters:
+ datarep - data representation (string)
. inbuf - input buffer start (choice)
. insize - input buffer size, in bytes (address integer)
. outcount - number of output data items (integer)
. datatype - datatype of output data item (handle)

Input/Output Parameters:
. position - current position in buffer, in bytes (address integer)

Output Parameters:
. outbuf - output buffer start (choice)

.N ThreadSafe

.N Fortran

.N Errors
.N MPI_SUCCESS
.N MPI_ERR_TYPE
.N MPI_ERR_ARG
@*/
int MPI_Unpack_external(const char datarep[], const void *inbuf, MPI_Aint insize,
                        MPI_Aint *position, void *outbuf, int outcount,
                        MPI_Datatype datatype)
{
    static const char FCNAME[] = "MPI_Unpack_external";
    int mpi_errno = MPI_SUCCESS;
    MPI_Aint first, last;
    MPID_Segment *segp;
    MPIR_FUNC_TERSE_STATE_DECL(MPID_STATE_MPI_UNPACK_EXTERNAL);

    MPIR_ERRTEST_INITIALIZED_ORDIE();

    MPIR_FUNC_TERSE_ENTER(MPID_STATE_MPI_UNPACK_EXTERNAL);

    /* Validate parameters, especially handles needing to be converted */
#   ifdef HAVE_ERROR_CHECKING
    {
        MPID_BEGIN_ERROR_CHECKS;
        {
            if (insize > 0) {
                MPIR_ERRTEST_ARGNULL(inbuf, "input buffer", mpi_errno);
            }
            /* NOTE: outbuf could be MPI_BOTTOM; don't test for NULL */
            MPIR_ERRTEST_COUNT(insize, mpi_errno);
            MPIR_ERRTEST_COUNT(outcount, mpi_errno);

            MPIR_ERRTEST_DATATYPE(datatype, "datatype", mpi_errno);

            if (datatype != MPI_DATATYPE_NULL && HANDLE_GET_KIND(datatype) != HANDLE_KIND_BUILTIN) {
                MPIR_Datatype *datatype_ptr = NULL;

                MPID_Datatype_get_ptr(datatype, datatype_ptr);
                MPIR_Datatype_valid_ptr(datatype_ptr, mpi_errno);
                MPID_Datatype_committed_ptr(datatype_ptr, mpi_errno);
            }

            /* If datatype_ptr is not valid, it will be reset to null */
            if (mpi_errno) goto fn_fail;
        }
        MPID_END_ERROR_CHECKS;
    }
#   endif /* HAVE_ERROR_CHECKING */

    /* ... body of routine ... */

    /* nothing to unpack from an empty input buffer */
    if (insize == 0) {
        goto fn_exit;
    }

    segp = MPID_Segment_alloc();
    MPIR_ERR_CHKANDJUMP1((segp == NULL), mpi_errno, MPI_ERR_OTHER, "**nomem", "**nomem %s", "MPID_Segment_alloc");

    /* final argument 1 selects heterogeneous (external32) segment handling */
    mpi_errno = MPID_Segment_init(outbuf, outcount, datatype, segp, 1);
    if (mpi_errno != MPI_SUCCESS) goto fn_fail;

    /* NOTE: buffer values and positions in MPI_Unpack_external are used very
     * differently from use in MPID_Segment_unpack_external... */
    first = 0;
    last = SEGMENT_IGNORE_LAST;

    /* Ensure that pointer increment fits in a pointer */
    MPIR_Ensure_Aint_fits_in_pointer((MPIR_VOID_PTR_CAST_TO_MPI_AINT inbuf) + *position);

    MPID_Segment_unpack_external32(segp, first, &last, (void *) ((char *) inbuf + *position));

    /* 'last' is the number of bytes consumed from the input buffer */
    *position += last;

    MPID_Segment_free(segp);

    if (mpi_errno != MPI_SUCCESS) goto fn_fail;

    /* ... end of body of routine ... */

  fn_exit:
    MPIR_FUNC_TERSE_EXIT(MPID_STATE_MPI_UNPACK_EXTERNAL);
    return mpi_errno;

  fn_fail:
    /* --BEGIN ERROR HANDLING-- */
#   ifdef HAVE_ERROR_CHECKING
    {
        mpi_errno = MPIR_Err_create_code(
            mpi_errno, MPIR_ERR_RECOVERABLE, FCNAME, __LINE__, MPI_ERR_OTHER,
            "**mpi_unpack_external",
            "**mpi_unpack_external %s %p %d %p %p %d %D",
            datarep, inbuf, insize, position, outbuf, outcount, datatype);
    }
#   endif
    mpi_errno = MPIR_Err_return_comm(NULL, FCNAME, mpi_errno);
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}
int MPIDI_Accumulate(void *origin_addr, int origin_count, MPI_Datatype origin_datatype, int target_rank, MPI_Aint target_disp, int target_count, MPI_Datatype target_datatype, MPI_Op op, MPID_Win *win_ptr) { int mpi_errno=MPI_SUCCESS; MPIDI_msg_sz_t data_sz; int dt_contig, rank, origin_predefined, target_predefined; MPI_Aint dt_true_lb; MPIDI_RMA_ops *new_ptr; MPID_Datatype *dtp; MPIU_CHKLMEM_DECL(2); MPIU_CHKPMEM_DECL(1); MPIDI_STATE_DECL(MPID_STATE_MPIDI_ACCUMULATE); MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_ACCUMULATE); MPIDI_Datatype_get_info(origin_count, origin_datatype, dt_contig, data_sz, dtp, dt_true_lb); if ((data_sz == 0) || (target_rank == MPI_PROC_NULL)) { goto fn_exit; } rank = win_ptr->myrank; MPIDI_CH3I_DATATYPE_IS_PREDEFINED(origin_datatype, origin_predefined); MPIDI_CH3I_DATATYPE_IS_PREDEFINED(target_datatype, target_predefined); /* Do =! rank first (most likely branch?) */ if (target_rank == rank) { MPI_User_function *uop; if (op == MPI_REPLACE) { mpi_errno = MPIR_Localcopy(origin_addr, origin_count, origin_datatype, (char *) win_ptr->base + win_ptr->disp_unit * target_disp, target_count, target_datatype); goto fn_exit; } MPIU_ERR_CHKANDJUMP1((HANDLE_GET_KIND(op) != HANDLE_KIND_BUILTIN), mpi_errno, MPI_ERR_OP, "**opnotpredefined", "**opnotpredefined %d", op ); /* get the function by indexing into the op table */ uop = MPIR_Op_table[((op)&0xf) - 1]; if (origin_predefined && target_predefined) { (*uop)(origin_addr, (char *) win_ptr->base + win_ptr->disp_unit * target_disp, &target_count, &target_datatype); } else { /* derived datatype */ MPID_Segment *segp; DLOOP_VECTOR *dloop_vec; MPI_Aint first, last; int vec_len, i, type_size, count; MPI_Datatype type; MPI_Aint true_lb, true_extent, extent; void *tmp_buf=NULL, *source_buf, *target_buf; if (origin_datatype != target_datatype) { /* first copy the data into a temporary buffer with the same datatype as the target. Then do the accumulate operation. 
*/ MPIR_Type_get_true_extent_impl(target_datatype, &true_lb, &true_extent); MPID_Datatype_get_extent_macro(target_datatype, extent); MPIU_CHKLMEM_MALLOC(tmp_buf, void *, target_count * (MPIR_MAX(extent,true_extent)), mpi_errno, "temporary buffer"); /* adjust for potential negative lower bound in datatype */ tmp_buf = (void *)((char*)tmp_buf - true_lb); mpi_errno = MPIR_Localcopy(origin_addr, origin_count, origin_datatype, tmp_buf, target_count, target_datatype); if (mpi_errno) { MPIU_ERR_POP(mpi_errno); } } if (target_predefined) { /* target predefined type, origin derived datatype */ (*uop)(tmp_buf, (char *) win_ptr->base + win_ptr->disp_unit * target_disp, &target_count, &target_datatype); } else { segp = MPID_Segment_alloc(); MPIU_ERR_CHKANDJUMP1((!segp), mpi_errno, MPI_ERR_OTHER, "**nomem","**nomem %s","MPID_Segment_alloc"); MPID_Segment_init(NULL, target_count, target_datatype, segp, 0); first = 0; last = SEGMENT_IGNORE_LAST; MPID_Datatype_get_ptr(target_datatype, dtp); vec_len = dtp->max_contig_blocks * target_count + 1; /* +1 needed because Rob says so */ MPIU_CHKLMEM_MALLOC(dloop_vec, DLOOP_VECTOR *, vec_len * sizeof(DLOOP_VECTOR), mpi_errno, "dloop vector"); MPID_Segment_pack_vector(segp, first, &last, dloop_vec, &vec_len); source_buf = (tmp_buf != NULL) ? tmp_buf : origin_addr; target_buf = (char *) win_ptr->base + win_ptr->disp_unit * target_disp; type = dtp->eltype; type_size = MPID_Datatype_get_basic_size(type); for (i=0; i<vec_len; i++) { count = (dloop_vec[i].DLOOP_VECTOR_LEN)/type_size; (*uop)((char *)source_buf + MPIU_PtrToAint(dloop_vec[i].DLOOP_VECTOR_BUF), (char *)target_buf + MPIU_PtrToAint(dloop_vec[i].DLOOP_VECTOR_BUF), &count, &type); } MPID_Segment_free(segp); } }
/* Start a synchronous-mode nonblocking send (issend) over the MXM netmod.
 *
 * Creates and initializes a send request, builds a one-entry iov for the
 * payload (packing noncontiguous data into a temporary buffer first), and
 * hands the message to the MXM layer via _mxm_isend with the
 * MXM_MPICH_ISEND_SYNC flag.
 *
 * Parameters: standard CH3 netmod send signature; the created request is
 * returned through sreq_ptr (also on the error path, where it may be NULL).
 * Returns MPI_SUCCESS or an MPI error code. */
int MPID_nem_mxm_issend(MPIDI_VC_t * vc, const void *buf, int count, MPI_Datatype datatype,
                        int rank, int tag, MPID_Comm * comm, int context_offset,
                        MPID_Request ** sreq_ptr)
{
    int mpi_errno = MPI_SUCCESS;
    MPID_Request *sreq = NULL;
    MPID_Datatype *dt_ptr;
    int dt_contig;
    MPIDI_msg_sz_t data_sz;
    MPI_Aint dt_true_lb;
    MPID_nem_mxm_vc_area *vc_area = NULL;
    MPID_nem_mxm_req_area *req_area = NULL;
    MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_MXM_ISSEND);

    MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_MXM_ISSEND);

    MPIDI_Datatype_get_info(count, datatype, dt_contig, data_sz, dt_ptr, dt_true_lb);

    /* create a request */
    MPIDI_Request_create_sreq(sreq, mpi_errno, goto fn_exit);
    MPIU_Assert(sreq != NULL);
    MPIDI_Request_set_type(sreq, MPIDI_REQUEST_TYPE_SEND);
    /* NOTE(review): `seqnum` is presumably declared/bound by the seqnum macro
       machinery (or compiled out) -- confirm against mpidimpl.h. */
    MPIDI_VC_FAI_send_seqnum(vc, seqnum);
    MPIDI_Request_set_seqnum(sreq, seqnum);
    if (HANDLE_GET_KIND(datatype) != HANDLE_KIND_BUILTIN) {
        /* keep a reference on the derived datatype for the request's lifetime */
        MPID_Datatype_get_ptr(datatype, sreq->dev.datatype_ptr);
        MPID_Datatype_add_ref(sreq->dev.datatype_ptr);
    }
    sreq->partner_request = NULL;
    sreq->dev.OnDataAvail = NULL;
    sreq->dev.tmpbuf = NULL;
    sreq->ch.vc = vc;
    sreq->ch.noncontig = FALSE;

    _dbg_mxm_output(5,
                    "isSend ========> Sending USER msg for req %p (context %d to %d tag %d size %d) \n",
                    sreq, comm->context_id + context_offset, rank, tag, data_sz);

    vc_area = VC_BASE(vc);
    req_area = REQ_BASE(sreq);

    /* initialize the netmod-private request area: empty iov by default */
    req_area-> ctx = sreq;
    req_area->iov_buf = req_area->tmp_buf;
    req_area->iov_count = 0;
    req_area->iov_buf[0].ptr = NULL;
    req_area->iov_buf[0].length = 0;

    if (data_sz) {
        if (dt_contig) {
            /* contiguous payload: point the single iov entry at the user buffer */
            req_area->iov_count = 1;
            req_area->iov_buf[0].ptr = (char *) (buf) + dt_true_lb;
            req_area->iov_buf[0].length = data_sz;
        }
        else {
            /* noncontiguous payload: pack into a temporary buffer owned by the
               request (sreq->dev.tmpbuf) and send that */
            MPIDI_msg_sz_t last;
            MPI_Aint packsize = 0;

            sreq->ch.noncontig = TRUE;
            sreq->dev.segment_ptr = MPID_Segment_alloc();
            MPIU_ERR_CHKANDJUMP1((sreq->dev.segment_ptr == NULL), mpi_errno, MPI_ERR_OTHER,
                                 "**nomem", "**nomem %s", "MPID_Segment_alloc");
            MPIR_Pack_size_impl(count, datatype, &packsize);

            last = data_sz;
            if (packsize > 0) {
                sreq->dev.tmpbuf = MPIU_Malloc((size_t) packsize);
                MPIU_Assert(sreq->dev.tmpbuf);
                MPID_Segment_init(buf, count, datatype, sreq->dev.segment_ptr, 0);
                MPID_Segment_pack(sreq->dev.segment_ptr, 0, &last, sreq->dev.tmpbuf);

                /* `last` now holds the number of bytes actually packed */
                req_area->iov_count = 1;
                req_area->iov_buf[0].ptr = sreq->dev.tmpbuf;
                req_area->iov_buf[0].length = last;
            }
        }
    }

    vc_area->pending_sends += 1;

    mpi_errno = _mxm_isend(vc_area->mxm_ep, req_area, MXM_MPICH_ISEND_SYNC,
                           (mxm_mq_h) comm->dev.ch.netmod_priv, comm->rank, tag,
                           _mxm_tag_mpi2mxm(tag, comm->context_id + context_offset), 0);
    if (mpi_errno)
        MPIU_ERR_POP(mpi_errno);

    _dbg_mxm_out_req(sreq);

  fn_exit:
    *sreq_ptr = sreq;
    MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_MXM_ISSEND);
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}
/* Post a receive to the Portals4 netmod by building and (eventually)
 * appending a match-list entry (ME) for the expected message.
 *
 * vc may be NULL for an MPI_ANY_SOURCE receive, in which case the ME matches
 * any initiator.  Small messages (<= PTL_LARGE_THRESHOLD) land directly in
 * the user buffer (contig), in an iov (noncontig that fits MPL_IOV_LIMIT),
 * or in a bounce buffer to be unpacked on completion.
 *
 * NOTE(review): this definition is truncated in this chunk -- it ends inside
 * the large-message branch at `if (dt_contig) {`; the remainder (large
 * message handling, ME append, fn_exit/fn_fail) is not visible here. */
int MPID_nem_ptl_recv_posted(MPIDI_VC_t *vc, MPID_Request *rreq)
{
    int mpi_errno = MPI_SUCCESS;
    MPID_nem_ptl_vc_area *const vc_ptl = VC_PTL(vc);
    ptl_me_t me;
    int dt_contig;
    MPIDI_msg_sz_t data_sz;
    MPID_Datatype *dt_ptr;
    MPI_Aint dt_true_lb;
    MPI_Aint last;
    ptl_process_t id_any;
    int ret;
    MPIU_CHKPMEM_DECL(1);
    MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_PTL_RECV_POSTED);

    MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_PTL_RECV_POSTED);

    id_any.phys.nid = PTL_NID_ANY;
    id_any.phys.pid = PTL_PID_ANY;

    MPID_nem_ptl_init_req(rreq);

    /* build the match-list entry; USE_ONCE makes it one-shot */
    me.ct_handle = PTL_CT_NONE;
    me.uid = PTL_UID_ANY;
    me.options = ( PTL_ME_OP_PUT | PTL_ME_IS_ACCESSIBLE | PTL_ME_EVENT_LINK_DISABLE |
                   PTL_ME_EVENT_UNLINK_DISABLE | PTL_ME_USE_ONCE );
    if (vc == NULL) {
        /* MPI_ANY_SOURCE receive */
        me.match_id = id_any;
    }
    else {
        /* lazily resolve the peer's portals id on first use */
        if (!vc_ptl->id_initialized) {
            mpi_errno = MPID_nem_ptl_init_id(vc);
            if (mpi_errno) MPIR_ERR_POP(mpi_errno);
        }
        me.match_id = vc_ptl->id;
    }

    MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE,
                     (MPIU_DBG_FDEST, "tag=%#x ctx=%#x rank=%#x",
                      rreq->dev.match.parts.tag, rreq->dev.match.parts.context_id,
                      rreq->dev.match.parts.rank));
    /* encode (tag, context_id, rank) into the match bits; wildcard tags are
       handled through the ignore mask */
    me.match_bits = NPTL_MATCH(rreq->dev.match.parts.tag, rreq->dev.match.parts.context_id,
                               rreq->dev.match.parts.rank);
    if (rreq->dev.match.parts.tag == MPI_ANY_TAG)
        me.ignore_bits = NPTL_MATCH_IGNORE_ANY_TAG;
    else
        me.ignore_bits = NPTL_MATCH_IGNORE;
    me.min_free = 0;

    MPIDI_Datatype_get_info(rreq->dev.user_count, rreq->dev.datatype, dt_contig, data_sz,
                            dt_ptr, dt_true_lb);
    MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE,
                     (MPIU_DBG_FDEST,
                      "count="MPI_AINT_FMT_DEC_SPEC" datatype=%#x contig=%d data_sz=%lu",
                      rreq->dev.user_count, rreq->dev.datatype, dt_contig, data_sz));

    if (data_sz <= PTL_LARGE_THRESHOLD) {
        if (dt_contig) {
            /* small contig message */
            void *start = (char *)rreq->dev.user_buf + dt_true_lb;
            MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, "Small contig message");
            /* a NULL buffer (e.g. zero-byte recv) still needs a valid address */
            if (start == NULL)
                me.start = &dummy;
            else
                me.start = start;
            me.length = data_sz;
            REQ_PTL(rreq)->event_handler = handler_recv_dequeue_complete;
        }
        else {
            /* small noncontig */
            MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, "Small noncontig message");
            rreq->dev.segment_ptr = MPID_Segment_alloc();
            MPIR_ERR_CHKANDJUMP1(rreq->dev.segment_ptr == NULL, mpi_errno, MPI_ERR_OTHER,
                                 "**nomem", "**nomem %s", "MPID_Segment_alloc");
            MPID_Segment_init(rreq->dev.user_buf, rreq->dev.user_count, rreq->dev.datatype,
                              rreq->dev.segment_ptr, 0);
            rreq->dev.segment_first = 0;
            rreq->dev.segment_size = data_sz;

            last = rreq->dev.segment_size;
            rreq->dev.iov_count = MPL_IOV_LIMIT;
            MPID_Segment_pack_vector(rreq->dev.segment_ptr, rreq->dev.segment_first, &last,
                                     rreq->dev.iov, &rreq->dev.iov_count);

            if (last == rreq->dev.segment_size) {
                /* entire message fits in IOV */
                MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, " entire message fits in IOV");
                me.start = rreq->dev.iov;
                me.length = rreq->dev.iov_count;
                me.options |= PTL_IOVEC;
                REQ_PTL(rreq)->event_handler = handler_recv_dequeue_complete;
            }
            else {
                /* IOV is not long enough to describe entire message: recv into
                   buffer and unpack later */
                MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, " IOV too long: using bounce buffer");
                MPIU_CHKPMEM_MALLOC(REQ_PTL(rreq)->chunk_buffer[0], void *, data_sz,
                                    mpi_errno, "chunk_buffer");
                me.start = REQ_PTL(rreq)->chunk_buffer[0];
                me.length = data_sz;
                REQ_PTL(rreq)->event_handler = handler_recv_dequeue_unpack_complete;
            }
        }
    }
    else {
        /* Large message: Create an ME for the first chunk of data, then do a GET
           for the rest */
        if (dt_contig) {
/*@
   MPI_Pack_external - Packs a datatype into contiguous memory, using the
     external32 format

Input Parameters:
+ datarep - data representation (string)
. inbuf - input buffer start (choice)
. incount - number of input data items (integer)
. datatype - datatype of each input data item (handle)
- outsize - output buffer size, in bytes (address integer)

Output Parameters:
. outbuf - output buffer start (choice)

Input/Output Parameters:
. position - current position in buffer, in bytes (address integer)

.N ThreadSafe

.N Fortran

.N Errors
.N MPI_SUCCESS
.N MPI_ERR_TYPE
.N MPI_ERR_ARG
.N MPI_ERR_COUNT
@*/
int MPI_Pack_external(const char datarep[], const void *inbuf, int incount,
                      MPI_Datatype datatype, void *outbuf, MPI_Aint outsize,
                      MPI_Aint *position)
{
    static const char FCNAME[] = "MPI_Pack_external";
    int mpi_errno = MPI_SUCCESS;
    MPI_Aint pack_first, pack_last;
    MPID_Segment *seg_ptr;
    MPID_MPI_STATE_DECL(MPID_STATE_MPI_PACK_EXTERNAL);

    MPIR_ERRTEST_INITIALIZED_ORDIE();

    MPID_MPI_FUNC_ENTER(MPID_STATE_MPI_PACK_EXTERNAL);

    /* Validate parameters and objects (post conversion) */
#   ifdef HAVE_ERROR_CHECKING
    {
        MPID_BEGIN_ERROR_CHECKS;
        {
            MPIR_ERRTEST_COUNT(incount, mpi_errno);
            MPIR_ERRTEST_COUNT(outsize, mpi_errno);
            /* NOTE: inbuf could be null (MPI_BOTTOM) */
            if (incount > 0) {
                MPIR_ERRTEST_ARGNULL(outbuf, "output buffer", mpi_errno);
            }
            MPIR_ERRTEST_ARGNULL(position, "position", mpi_errno);
            MPIR_ERRTEST_DATATYPE(datatype, "datatype", mpi_errno);
            if (HANDLE_GET_KIND(datatype) != HANDLE_KIND_BUILTIN) {
                /* derived types must be valid and committed before packing */
                MPID_Datatype *datatype_ptr = NULL;
                MPID_Datatype_get_ptr(datatype, datatype_ptr);
                MPID_Datatype_valid_ptr(datatype_ptr, mpi_errno);
                MPID_Datatype_committed_ptr(datatype_ptr, mpi_errno);
                if (mpi_errno != MPI_SUCCESS) goto fn_fail;
            }
        }
        MPID_END_ERROR_CHECKS;
    }
#   endif /* HAVE_ERROR_CHECKING */

    /* ... body of routine ... */

    /* Nothing to pack -- leave *position untouched. */
    if (incount == 0) goto fn_exit;

    seg_ptr = MPID_Segment_alloc();
    /* --BEGIN ERROR HANDLING-- */
    if (seg_ptr == NULL) {
        mpi_errno = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
                                         FCNAME, __LINE__, MPI_ERR_OTHER,
                                         "**nomem", "**nomem %s", "MPID_Segment");
        goto fn_fail;
    }
    /* --END ERROR HANDLING-- */

    /* flag=1 selects the heterogeneous/external32 segment configuration */
    mpi_errno = MPID_Segment_init(inbuf, incount, datatype, seg_ptr, 1);
    if (mpi_errno != MPI_SUCCESS) goto fn_fail;

    /* NOTE: the use of buffer values and positions in MPI_Pack_external and
     * in MPID_Segment_pack_external are quite different.  See code or docs
     * or something.
     */

    pack_first = 0;
    pack_last = SEGMENT_IGNORE_LAST;

    /* Ensure that pointer increment fits in a pointer */
    MPID_Ensure_Aint_fits_in_pointer((MPI_VOID_PTR_CAST_TO_MPI_AINT outbuf) + *position);

    /* Pack and advance *position by the number of bytes actually written. */
    MPID_Segment_pack_external32(seg_ptr, pack_first, &pack_last,
                                 (void *)((char *) outbuf + *position));
    *position += pack_last;

    MPID_Segment_free(seg_ptr);

    /* ... end of body of routine ... */

  fn_exit:
    MPID_MPI_FUNC_EXIT(MPID_STATE_MPI_PACK_EXTERNAL);
    return mpi_errno;

  fn_fail:
    /* --BEGIN ERROR HANDLING-- */
#   ifdef HAVE_ERROR_CHECKING
    {
        mpi_errno = MPIR_Err_create_code(
            mpi_errno, MPIR_ERR_RECOVERABLE, FCNAME, __LINE__, MPI_ERR_OTHER,
            "**mpi_pack_external",
            "**mpi_pack_external %s %p %d %D %p %d %p",
            datarep, inbuf, incount, datatype, outbuf, outsize, position);
    }
#   endif
    mpi_errno = MPIR_Err_return_comm(0, FCNAME, mpi_errno);
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}
/* Initiate the sender side of a large-message transfer (LMT) over the
 * InfiniBand netmod: build a cookie describing where the receiver should
 * RDMA-read the data from (address + rkey + segmentation info) and send it
 * together with the RTS packet via MPID_nem_lmt_send_RTS.
 *
 * Noncontiguous data is packed into a request-owned bounce buffer first.
 * The transfer is split into `post_num` segments of at most max_msg_sz
 * bytes; only the first segment's memory region is registered here. */
int MPID_nem_ib_lmt_initiate_lmt(struct MPIDI_VC *vc, union MPIDI_CH3_Pkt *rts_pkt,
                                 struct MPID_Request *req)
{
    int mpi_errno = MPI_SUCCESS;
    int dt_contig;
    MPIDI_msg_sz_t data_sz;
    MPID_Datatype *dt_ptr;
    MPI_Aint dt_true_lb;
#if 0
    MPID_nem_ib_vc_area *vc_ib = VC_IB(vc);
#endif
    MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_LMT_INITIATE_LMT);
    MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_LMT_INITIATE_LMT);

    dprintf("lmt_initiate_lmt,enter,%d->%d,req=%p\n", MPID_nem_ib_myrank, vc->pg_rank, req);

    /* obtain dt_true_lb */
    /* see MPIDI_Datatype_get_info(in, in, out, out, out, out) (in src/mpid/ch3/include/mpidimpl.h) */
    MPIDI_Datatype_get_info(req->dev.user_count, req->dev.datatype, dt_contig, data_sz,
                            dt_ptr, dt_true_lb);

    /* FIXME: who frees s_cookie_buf? */
    /* malloc memory area for cookie. auto variable is NG because isend does not copy payload */
    /* NOTE(review): MPIU_Malloc return value is not checked here -- unlike the
       allocations below; confirm whether an OOM check is needed. */
    MPID_nem_ib_lmt_cookie_t *s_cookie_buf =
        (MPID_nem_ib_lmt_cookie_t *) MPIU_Malloc(sizeof(MPID_nem_ib_lmt_cookie_t));
    /* remember address to "free" when receiving DONE from receiver */
    req->ch.s_cookie = s_cookie_buf;

    /* see MPIDI_CH3_PktHandler_RndvClrToSend (in src/mpid/ch3/src/ch3u_rndv.c) */
    //assert(dt_true_lb == 0);
    void *write_from_buf;
    if (dt_contig) {
        /* contiguous: RDMA directly out of the user buffer */
        write_from_buf = (void *) ((char *) req->dev.user_buf + dt_true_lb);
    }
    else {
        /* noncontiguous: pack the whole message into lmt_pack_buf and RDMA
           out of that instead */
        /* see MPIDI_CH3_EagerNoncontigSend (in ch3u_eager.c) */
        req->dev.segment_ptr = MPID_Segment_alloc();
        MPIU_ERR_CHKANDJUMP((req->dev.segment_ptr == NULL), mpi_errno, MPI_ERR_OTHER,
                            "**outofmemory");
        MPID_Segment_init(req->dev.user_buf, req->dev.user_count, req->dev.datatype,
                          req->dev.segment_ptr, 0);
        req->dev.segment_first = 0;
        req->dev.segment_size = data_sz;

        MPIDI_msg_sz_t last;
        last = req->dev.segment_size;   /* segment_size is byte offset */
        MPIU_Assert(last > 0);
        REQ_FIELD(req, lmt_pack_buf) = MPIU_Malloc((size_t) req->dev.segment_size);
        MPIU_ERR_CHKANDJUMP(!REQ_FIELD(req, lmt_pack_buf), mpi_errno, MPI_ERR_OTHER,
                            "**outofmemory");
        MPID_Segment_pack(req->dev.segment_ptr, req->dev.segment_first, &last,
                          (char *) (REQ_FIELD(req, lmt_pack_buf)));
        MPIU_Assert(last == req->dev.segment_size);
        write_from_buf = REQ_FIELD(req, lmt_pack_buf);
    }

    dprintf
        ("lmt_initate_lmt,dt_contig=%d,write_from_buf=%p,req->dev.user_buf=%p,REQ_FIELD(req, lmt_pack_buf)=%p\n",
         dt_contig, write_from_buf, req->dev.user_buf, REQ_FIELD(req, lmt_pack_buf));

#ifdef HAVE_LIBDCFA
#else
    /* for plain verbs, the remote side reads from the virtual address;
       under DCFA the host_addr from the MR is used instead (set below) */
    s_cookie_buf->addr = write_from_buf;
#endif
    /* put sz, see MPID_nem_lmt_RndvSend (in src/mpid/ch3/channels/nemesis/src/mpid_nem_lmt.c) */
    /* TODO remove sz field
     * pkt_RTS_handler (in src/mpid/ch3/channels/nemesis/src/mpid_nem_lmt.c)
     * rreq->ch.lmt_data_sz = rts_pkt->data_sz;
     */
    //s_cookie_buf->sz = (uint32_t)((MPID_nem_pkt_lmt_rts_t*)rts_pkt)->data_sz;

    /* preserve and put tail, because tail magic is written on the tail of payload
     * because we don't want to add another SGE or RDMA command */
    MPIU_Assert(((MPID_nem_pkt_lmt_rts_t *) rts_pkt)->data_sz == data_sz);
    s_cookie_buf->tail = *((uint8_t *) ((uint8_t *) write_from_buf + data_sz - sizeof(uint8_t)));

    /* prepare magic */
    //*((uint32_t*)(write_from_buf + data_sz - sizeof(tailmagic_t))) = MPID_NEM_IB_COM_MAGIC;

#if 0   /* moving to packet header */
    /* embed RDMA-write-to buffer occupancy information */
    dprintf("lmt_initiate_lmt,rsr_seq_num_tail=%d\n", vc_ib->ibcom->rsr_seq_num_tail);
    /* embed RDMA-write-to buffer occupancy information */
    s_cookie_buf->seq_num_tail = vc_ib->ibcom->rsr_seq_num_tail;
    /* remember the last one sent */
    vc_ib->ibcom->rsr_seq_num_tail_last_sent = vc_ib->ibcom->rsr_seq_num_tail;
#endif

    int post_num;
    uint32_t max_msg_sz;
    MPID_nem_ib_vc_area *vc_ib = VC_IB(vc);
    /* query the connection's maximum message size to size the segments */
    MPID_nem_ib_com_get_info_conn(vc_ib->sc->fd, MPID_NEM_IB_COM_INFOKEY_PATTR_MAX_MSG_SZ,
                                  &max_msg_sz, sizeof(uint32_t));

    /* Type of max_msg_sz is uint32_t. */
    /* ceil(data_sz / max_msg_sz) segments are needed */
    post_num = (data_sz + (long) max_msg_sz - 1) / (long) max_msg_sz;

    s_cookie_buf->max_msg_sz = max_msg_sz;
    s_cookie_buf->seg_seq_num = 1;
    s_cookie_buf->seg_num = post_num;
    REQ_FIELD(req, buf.from) = write_from_buf;
    REQ_FIELD(req, data_sz) = data_sz;
    REQ_FIELD(req, seg_seq_num) = 1;    // only send 1st-segment, even if there are some segments.
    REQ_FIELD(req, seg_num) = post_num;
    REQ_FIELD(req, max_msg_sz) = max_msg_sz;

    /* register only the first segment's worth of memory here */
    long length;
    if (post_num > 1) {
        length = max_msg_sz;
    }
    else {
        length = data_sz;
    }
    /* put IB rkey */
    struct MPID_nem_ib_com_reg_mr_cache_entry_t *mr_cache =
        MPID_nem_ib_com_reg_mr_fetch(write_from_buf, length, 0, MPID_NEM_IB_COM_REG_MR_GLOBAL);
    MPIU_ERR_CHKANDJUMP(!mr_cache, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_reg_mr_fetch");
    struct ibv_mr *mr = mr_cache->mr;
    REQ_FIELD(req, lmt_mr_cache) = (void *) mr_cache;
#ifdef HAVE_LIBDCFA
    s_cookie_buf->addr = (void *) mr->host_addr;
    dprintf("lmt_initiate_lmt,s_cookie_buf->addr=%p\n", s_cookie_buf->addr);
#endif
    s_cookie_buf->rkey = mr->rkey;
    /* NOTE(review): arithmetic on a void* (write_from_buf + data_sz) below is a
       GNU extension, not standard C -- verify the build requires GCC/Clang. */
    dprintf("lmt_initiate_lmt,tail=%02x,mem-tail=%p,%02x,sz=%ld,raddr=%p,rkey=%08x\n",
            s_cookie_buf->tail, write_from_buf + data_sz - sizeof(uint8_t),
            *((uint8_t *) (write_from_buf + data_sz - sizeof(uint8_t))), data_sz,
            s_cookie_buf->addr, s_cookie_buf->rkey);

    /* send cookie. rts_pkt as the MPI-header, s_cookie_buf as the payload */
    MPID_nem_lmt_send_RTS(vc, (MPID_nem_pkt_lmt_rts_t *) rts_pkt, s_cookie_buf,
                          sizeof(MPID_nem_ib_lmt_cookie_t));

  fn_exit:
    MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_IB_LMT_INITIATE_LMT);
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}