static int send_sreq_data(MPIDI_VC_t *vc, MPID_Request *sreq, knem_cookie_t *s_cookiep) { int mpi_errno = MPI_SUCCESS; int dt_contig; MPI_Aint dt_true_lb; MPIDI_msg_sz_t data_sz; MPID_Datatype * dt_ptr; /* MT: this code assumes only one thread can be at this point at a time */ if (knem_fd < 0) { mpi_errno = open_knem_dev(); if (mpi_errno) MPIU_ERR_POP(mpi_errno); } /* find out contig/noncontig, size, and lb for the datatype */ MPIDI_Datatype_get_info(sreq->dev.user_count, sreq->dev.datatype, dt_contig, data_sz, dt_ptr, dt_true_lb); if (dt_contig) { /* handle the iov creation ourselves */ sreq->dev.iov[0].MPID_IOV_BUF = (char *)sreq->dev.user_buf + dt_true_lb; sreq->dev.iov[0].MPID_IOV_LEN = data_sz; sreq->dev.iov_count = 1; } else { /* use the segment routines to handle the iovec creation */ if (sreq->dev.segment_ptr == NULL) { sreq->dev.iov_count = MPID_IOV_LIMIT; sreq->dev.iov_offset = 0; /* segment_ptr may be non-null when this is a continuation of a many-part message that we couldn't fit in one single flight of iovs. */ sreq->dev.segment_ptr = MPID_Segment_alloc(); MPIU_ERR_CHKANDJUMP1((sreq->dev.segment_ptr == NULL), mpi_errno, MPI_ERR_OTHER, "**nomem", "**nomem %s", "MPID_Segment_alloc"); MPID_Segment_init(sreq->dev.user_buf, sreq->dev.user_count, sreq->dev.datatype, sreq->dev.segment_ptr, 0); sreq->dev.segment_first = 0; sreq->dev.segment_size = data_sz; /* FIXME we should write our own function that isn't dependent on the in-request iov array. This will let us use IOVs that are larger than MPID_IOV_LIMIT. */ mpi_errno = MPIDI_CH3U_Request_load_send_iov(sreq, &sreq->dev.iov[0], &sreq->dev.iov_count); if (mpi_errno) MPIU_ERR_POP(mpi_errno); } } mpi_errno = do_dma_send(vc, sreq, sreq->dev.iov_count, sreq->dev.iov, s_cookiep); if (mpi_errno) MPIU_ERR_POP(mpi_errno); fn_exit: return mpi_errno; fn_fail: goto fn_exit; }
/* MPIDI_CH3_EagerNoncontigSend - Eagerly send noncontiguous data */ int MPIDI_CH3_EagerNoncontigSend( MPID_Request **sreq_p, MPIDI_CH3_Pkt_type_t reqtype, const void * buf, MPI_Aint count, MPI_Datatype datatype, MPIDI_msg_sz_t data_sz, int rank, int tag, MPID_Comm * comm, int context_offset ) { int mpi_errno = MPI_SUCCESS; MPIDI_VC_t * vc; MPID_Request *sreq = *sreq_p; MPIDI_CH3_Pkt_t upkt; MPIDI_CH3_Pkt_eager_send_t * const eager_pkt = &upkt.eager_send; MPIU_DBG_MSG_FMT(CH3_OTHER,VERBOSE,(MPIU_DBG_FDEST, "sending non-contiguous eager message, data_sz=" MPIDI_MSG_SZ_FMT, data_sz)); sreq->dev.OnDataAvail = 0; sreq->dev.OnFinal = 0; MPIDI_Pkt_init(eager_pkt, reqtype); eager_pkt->match.parts.rank = comm->rank; eager_pkt->match.parts.tag = tag; eager_pkt->match.parts.context_id = comm->context_id + context_offset; eager_pkt->sender_req_id = MPI_REQUEST_NULL; eager_pkt->data_sz = data_sz; MPIDI_Comm_get_vc_set_active(comm, rank, &vc); MPIDI_VC_FAI_send_seqnum(vc, seqnum); MPIDI_Pkt_set_seqnum(eager_pkt, seqnum); MPIDI_Request_set_seqnum(sreq, seqnum); MPIU_DBG_MSGPKT(vc,tag,eager_pkt->match.parts.context_id,rank,data_sz, "Eager"); sreq->dev.segment_ptr = MPID_Segment_alloc( ); MPIR_ERR_CHKANDJUMP1((sreq->dev.segment_ptr == NULL), mpi_errno, MPI_ERR_OTHER, "**nomem", "**nomem %s", "MPID_Segment_alloc"); MPID_Segment_init(buf, count, datatype, sreq->dev.segment_ptr, 0); sreq->dev.segment_first = 0; sreq->dev.segment_size = data_sz; MPID_THREAD_CS_ENTER(POBJ, vc->pobj_mutex); mpi_errno = vc->sendNoncontig_fn(vc, sreq, eager_pkt, sizeof(MPIDI_CH3_Pkt_eager_send_t)); MPID_THREAD_CS_EXIT(POBJ, vc->pobj_mutex); if (mpi_errno) MPIR_ERR_POP(mpi_errno); fn_exit: return mpi_errno; fn_fail: *sreq_p = NULL; goto fn_exit; }
/* fills in req->dev.iov{,_offset,_count} based on the datatype info in the request, creating a segment if necessary */ static int populate_iov_from_req(MPID_Request *req) { int mpi_errno = MPI_SUCCESS; int dt_contig; MPI_Aint dt_true_lb; MPIDI_msg_sz_t data_sz; MPID_Datatype * dt_ptr; /* find out contig/noncontig, size, and lb for the datatype */ MPIDI_Datatype_get_info(req->dev.user_count, req->dev.datatype, dt_contig, data_sz, dt_ptr, dt_true_lb); if (dt_contig) { /* handle the iov creation ourselves */ req->dev.iov[0].MPL_IOV_BUF = (char *)req->dev.user_buf + dt_true_lb; req->dev.iov[0].MPL_IOV_LEN = data_sz; req->dev.iov_count = 1; } else { /* use the segment routines to handle the iovec creation */ MPIU_Assert(req->dev.segment_ptr == NULL); req->dev.iov_count = MPL_IOV_LIMIT; req->dev.iov_offset = 0; /* XXX DJG FIXME where is this segment freed? */ req->dev.segment_ptr = MPID_Segment_alloc(); MPIR_ERR_CHKANDJUMP1((req->dev.segment_ptr == NULL), mpi_errno, MPI_ERR_OTHER, "**nomem", "**nomem %s", "MPID_Segment_alloc"); MPID_Segment_init(req->dev.user_buf, req->dev.user_count, req->dev.datatype, req->dev.segment_ptr, 0); req->dev.segment_first = 0; req->dev.segment_size = data_sz; /* FIXME we should write our own function that isn't dependent on the in-request iov array. This will let us use IOVs that are larger than MPL_IOV_LIMIT. */ mpi_errno = MPIDI_CH3U_Request_load_send_iov(req, &req->dev.iov[0], &req->dev.iov_count); if (mpi_errno) MPIR_ERR_POP(mpi_errno); } fn_fail: return mpi_errno; }
static int _mxm_process_rdtype(MPID_Request ** rreq_p, MPI_Datatype datatype, MPID_Datatype * dt_ptr, MPIDI_msg_sz_t data_sz, const void *buf, int count, mxm_req_buffer_t ** iov_buf, int *iov_count) { int mpi_errno = MPI_SUCCESS; MPID_Request *rreq = *rreq_p; MPIDI_msg_sz_t last; MPL_IOV *iov; int n_iov = 0; int index; if (rreq->dev.segment_ptr == NULL) { rreq->dev.segment_ptr = MPID_Segment_alloc(); MPIR_ERR_CHKANDJUMP1((rreq->dev.segment_ptr == NULL), mpi_errno, MPI_ERR_OTHER, "**nomem", "**nomem %s", "MPID_Segment_alloc"); } MPID_Segment_init(buf, count, datatype, rreq->dev.segment_ptr, 0); rreq->dev.segment_first = 0; rreq->dev.segment_size = data_sz; last = rreq->dev.segment_size; MPID_Segment_count_contig_blocks(rreq->dev.segment_ptr, rreq->dev.segment_first, &last, (MPI_Aint *) & n_iov); MPIU_Assert(n_iov > 0); iov = MPIU_Malloc(n_iov * sizeof(*iov)); MPIU_Assert(iov); last = rreq->dev.segment_size; MPID_Segment_unpack_vector(rreq->dev.segment_ptr, rreq->dev.segment_first, &last, iov, &n_iov); MPIU_Assert(last == rreq->dev.segment_size); #if defined(MXM_DEBUG) && (MXM_DEBUG > 0) _dbg_mxm_output(7, "Recv Noncontiguous data vector %i entries (free slots : %i)\n", n_iov, MXM_REQ_DATA_MAX_IOV); for (index = 0; index < n_iov; index++) { _dbg_mxm_output(7, "======= Recv iov[%i] = ptr : %p, len : %i \n", index, iov[index].MPL_IOV_BUF, iov[index].MPL_IOV_LEN); } #endif if (n_iov <= MXM_REQ_DATA_MAX_IOV) { if (n_iov > MXM_MPICH_MAX_IOV) { *iov_buf = (mxm_req_buffer_t *) MPIU_Malloc(n_iov * sizeof(**iov_buf)); MPIU_Assert(*iov_buf); } for (index = 0; index < n_iov; index++) { (*iov_buf)[index].ptr = iov[index].MPL_IOV_BUF; (*iov_buf)[index].length = iov[index].MPL_IOV_LEN; } rreq->dev.tmpbuf = NULL; rreq->dev.tmpbuf_sz = 0; *iov_count = n_iov; } else { MPI_Aint packsize = 0; MPIR_Pack_size_impl(rreq->dev.user_count, rreq->dev.datatype, &packsize); rreq->dev.tmpbuf = MPIU_Malloc((size_t) packsize); MPIU_Assert(rreq->dev.tmpbuf); rreq->dev.tmpbuf_sz = packsize; (*iov_buf)[0].ptr = rreq->dev.tmpbuf; (*iov_buf)[0].length = (size_t) packsize; *iov_count = 1; } MPIU_Free(iov); fn_exit: return mpi_errno; fn_fail: goto fn_exit; }
int MPID_nem_ib_lmt_switch_send(struct MPIDI_VC *vc, struct MPID_Request *req) { int mpi_errno = MPI_SUCCESS; int dt_contig; MPIDI_msg_sz_t data_sz; MPID_Datatype *dt_ptr; MPI_Aint dt_true_lb; MPID_IOV r_cookie = req->ch.lmt_tmp_cookie; MPID_nem_ib_lmt_cookie_t *r_cookie_buf = r_cookie.iov_base; MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_LMT_SWITCH_SEND); MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_LMT_SWITCH_SEND); MPIDI_Datatype_get_info(req->dev.user_count, req->dev.datatype, dt_contig, data_sz, dt_ptr, dt_true_lb); void *write_from_buf; if (dt_contig) { write_from_buf = req->dev.user_buf; } else { /* see MPIDI_CH3_EagerNoncontigSend (in ch3u_eager.c) */ req->dev.segment_ptr = MPID_Segment_alloc(); MPIU_ERR_CHKANDJUMP((req->dev.segment_ptr == NULL), mpi_errno, MPI_ERR_OTHER, "**outofmemory"); MPID_Segment_init(req->dev.user_buf, req->dev.user_count, req->dev.datatype, req->dev.segment_ptr, 0); req->dev.segment_first = 0; req->dev.segment_size = data_sz; MPIDI_msg_sz_t last; last = req->dev.segment_size; /* segment_size is byte offset */ MPIU_Assert(last > 0); REQ_FIELD(req, lmt_pack_buf) = MPIU_Malloc(data_sz); MPIU_ERR_CHKANDJUMP(!REQ_FIELD(req, lmt_pack_buf), mpi_errno, MPI_ERR_OTHER, "**outofmemory"); MPID_Segment_pack(req->dev.segment_ptr, req->dev.segment_first, &last, (char *) (REQ_FIELD(req, lmt_pack_buf))); MPIU_Assert(last == req->dev.segment_size); write_from_buf = REQ_FIELD(req, lmt_pack_buf); } //assert(dt_true_lb == 0); uint8_t *tailp = (uint8_t *) ((uint8_t *) write_from_buf /*+ dt_true_lb */ + data_sz - sizeof(uint8_t)); #if 0 *is_end_flag_same = (r_cookie_buf->tail == *tailp) ? 1 : 0; #else REQ_FIELD(req, lmt_receiver_tail) = r_cookie_buf->tail; REQ_FIELD(req, lmt_sender_tail) = *tailp; dprintf("lmt_switch_send,tail on sender=%02x,tail onreceiver=%02x,req=%p\n", *tailp, r_cookie_buf->tail, req); #ifdef MPID_NEM_IB_DEBUG_LMT uint8_t *tail_wordp = (uint8_t *) ((uint8_t *) write_from_buf + data_sz - sizeof(uint32_t) * 2); #endif dprintf("lmt_switch_send,tail on sender=%d\n", *tail_wordp); fflush(stdout); #endif fn_exit: MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_IB_LMT_SWITCH_SEND); return mpi_errno; fn_fail: goto fn_exit; }
int MPID_nem_ib_lmt_initiate_lmt(struct MPIDI_VC *vc, union MPIDI_CH3_Pkt *rts_pkt, struct MPID_Request *req) { int mpi_errno = MPI_SUCCESS; int dt_contig; MPIDI_msg_sz_t data_sz; MPID_Datatype *dt_ptr; MPI_Aint dt_true_lb; #if 0 MPID_nem_ib_vc_area *vc_ib = VC_IB(vc); #endif MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_LMT_INITIATE_LMT); MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_LMT_INITIATE_LMT); dprintf("lmt_initiate_lmt,enter,%d->%d,req=%p\n", MPID_nem_ib_myrank, vc->pg_rank, req); /* obtain dt_true_lb */ /* see MPIDI_Datatype_get_info(in, in, out, out, out, out) (in src/mpid/ch3/include/mpidimpl.h) */ MPIDI_Datatype_get_info(req->dev.user_count, req->dev.datatype, dt_contig, data_sz, dt_ptr, dt_true_lb); /* FIXME: who frees s_cookie_buf? */ /* malloc memory area for cookie. auto variable is NG because isend does not copy payload */ MPID_nem_ib_lmt_cookie_t *s_cookie_buf = (MPID_nem_ib_lmt_cookie_t *) MPIU_Malloc(sizeof(MPID_nem_ib_lmt_cookie_t)); /* remember address to "free" when receiving DONE from receiver */ req->ch.s_cookie = s_cookie_buf; /* see MPIDI_CH3_PktHandler_RndvClrToSend (in src/mpid/ch3/src/ch3u_rndv.c) */ //assert(dt_true_lb == 0); void *write_from_buf; if (dt_contig) { write_from_buf = (void *) ((char *) req->dev.user_buf + dt_true_lb); } else { /* see MPIDI_CH3_EagerNoncontigSend (in ch3u_eager.c) */ req->dev.segment_ptr = MPID_Segment_alloc(); MPIU_ERR_CHKANDJUMP((req->dev.segment_ptr == NULL), mpi_errno, MPI_ERR_OTHER, "**outofmemory"); MPID_Segment_init(req->dev.user_buf, req->dev.user_count, req->dev.datatype, req->dev.segment_ptr, 0); req->dev.segment_first = 0; req->dev.segment_size = data_sz; MPIDI_msg_sz_t last; last = req->dev.segment_size; /* segment_size is byte offset */ MPIU_Assert(last > 0); REQ_FIELD(req, lmt_pack_buf) = MPIU_Malloc((size_t) req->dev.segment_size); MPIU_ERR_CHKANDJUMP(!REQ_FIELD(req, lmt_pack_buf), mpi_errno, MPI_ERR_OTHER, "**outofmemory"); MPID_Segment_pack(req->dev.segment_ptr, req->dev.segment_first, &last, (char *) (REQ_FIELD(req, lmt_pack_buf))); MPIU_Assert(last == req->dev.segment_size); write_from_buf = REQ_FIELD(req, lmt_pack_buf); } dprintf ("lmt_initate_lmt,dt_contig=%d,write_from_buf=%p,req->dev.user_buf=%p,REQ_FIELD(req, lmt_pack_buf)=%p\n", dt_contig, write_from_buf, req->dev.user_buf, REQ_FIELD(req, lmt_pack_buf)); #ifdef HAVE_LIBDCFA #else s_cookie_buf->addr = write_from_buf; #endif /* put sz, see MPID_nem_lmt_RndvSend (in src/mpid/ch3/channels/nemesis/src/mpid_nem_lmt.c) */ /* TODO remove sz field * pkt_RTS_handler (in src/mpid/ch3/channels/nemesis/src/mpid_nem_lmt.c) * rreq->ch.lmt_data_sz = rts_pkt->data_sz; */ //s_cookie_buf->sz = (uint32_t)((MPID_nem_pkt_lmt_rts_t*)rts_pkt)->data_sz; /* preserve and put tail, because tail magic is written on the tail of payload * because we don't want to add another SGE or RDMA command */ MPIU_Assert(((MPID_nem_pkt_lmt_rts_t *) rts_pkt)->data_sz == data_sz); s_cookie_buf->tail = *((uint8_t *) ((uint8_t *) write_from_buf + data_sz - sizeof(uint8_t))); /* prepare magic */ //*((uint32_t*)(write_from_buf + data_sz - sizeof(tailmagic_t))) = MPID_NEM_IB_COM_MAGIC; #if 0 /* moving to packet header */ /* embed RDMA-write-to buffer occupancy information */ dprintf("lmt_initiate_lmt,rsr_seq_num_tail=%d\n", vc_ib->ibcom->rsr_seq_num_tail); /* embed RDMA-write-to buffer occupancy information */ s_cookie_buf->seq_num_tail = vc_ib->ibcom->rsr_seq_num_tail; /* remember the last one sent */ vc_ib->ibcom->rsr_seq_num_tail_last_sent = vc_ib->ibcom->rsr_seq_num_tail; #endif int post_num; uint32_t max_msg_sz; MPID_nem_ib_vc_area *vc_ib = VC_IB(vc); MPID_nem_ib_com_get_info_conn(vc_ib->sc->fd, MPID_NEM_IB_COM_INFOKEY_PATTR_MAX_MSG_SZ, &max_msg_sz, sizeof(uint32_t)); /* Type of max_msg_sz is uint32_t. */ post_num = (data_sz + (long) max_msg_sz - 1) / (long) max_msg_sz; s_cookie_buf->max_msg_sz = max_msg_sz; s_cookie_buf->seg_seq_num = 1; s_cookie_buf->seg_num = post_num; REQ_FIELD(req, buf.from) = write_from_buf; REQ_FIELD(req, data_sz) = data_sz; REQ_FIELD(req, seg_seq_num) = 1; // only send 1st-segment, even if there are some segments. REQ_FIELD(req, seg_num) = post_num; REQ_FIELD(req, max_msg_sz) = max_msg_sz; long length; if (post_num > 1) { length = max_msg_sz; } else { length = data_sz; } /* put IB rkey */ struct MPID_nem_ib_com_reg_mr_cache_entry_t *mr_cache = MPID_nem_ib_com_reg_mr_fetch(write_from_buf, length, 0, MPID_NEM_IB_COM_REG_MR_GLOBAL); MPIU_ERR_CHKANDJUMP(!mr_cache, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_reg_mr_fetch"); struct ibv_mr *mr = mr_cache->mr; REQ_FIELD(req, lmt_mr_cache) = (void *) mr_cache; #ifdef HAVE_LIBDCFA s_cookie_buf->addr = (void *) mr->host_addr; dprintf("lmt_initiate_lmt,s_cookie_buf->addr=%p\n", s_cookie_buf->addr); #endif s_cookie_buf->rkey = mr->rkey; dprintf("lmt_initiate_lmt,tail=%02x,mem-tail=%p,%02x,sz=%ld,raddr=%p,rkey=%08x\n", s_cookie_buf->tail, write_from_buf + data_sz - sizeof(uint8_t), *((uint8_t *) (write_from_buf + data_sz - sizeof(uint8_t))), data_sz, s_cookie_buf->addr, s_cookie_buf->rkey); /* send cookie. rts_pkt as the MPI-header, s_cookie_buf as the payload */ MPID_nem_lmt_send_RTS(vc, (MPID_nem_pkt_lmt_rts_t *) rts_pkt, s_cookie_buf, sizeof(MPID_nem_ib_lmt_cookie_t)); fn_exit: MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_IB_LMT_INITIATE_LMT); return mpi_errno; fn_fail: goto fn_exit; }
int MPID_nem_ptl_recv_posted(MPIDI_VC_t *vc, MPID_Request *rreq) { int mpi_errno = MPI_SUCCESS; MPID_nem_ptl_vc_area *const vc_ptl = VC_PTL(vc); ptl_me_t me; int dt_contig; MPIDI_msg_sz_t data_sz; MPID_Datatype *dt_ptr; MPI_Aint dt_true_lb; MPI_Aint last; ptl_process_t id_any; int ret; MPIU_CHKPMEM_DECL(1); MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_PTL_RECV_POSTED); MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_PTL_RECV_POSTED); id_any.phys.nid = PTL_NID_ANY; id_any.phys.pid = PTL_PID_ANY; MPID_nem_ptl_init_req(rreq); me.ct_handle = PTL_CT_NONE; me.uid = PTL_UID_ANY; me.options = ( PTL_ME_OP_PUT | PTL_ME_IS_ACCESSIBLE | PTL_ME_EVENT_LINK_DISABLE | PTL_ME_EVENT_UNLINK_DISABLE | PTL_ME_USE_ONCE ); if (vc == NULL) { /* MPI_ANY_SOURCE receive */ me.match_id = id_any; } else { if (!vc_ptl->id_initialized) { mpi_errno = MPID_nem_ptl_init_id(vc); if (mpi_errno) MPIR_ERR_POP(mpi_errno); } me.match_id = vc_ptl->id; } MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "tag=%#x ctx=%#x rank=%#x", rreq->dev.match.parts.tag, rreq->dev.match.parts.context_id, rreq->dev.match.parts.rank)); me.match_bits = NPTL_MATCH(rreq->dev.match.parts.tag, rreq->dev.match.parts.context_id, rreq->dev.match.parts.rank); if (rreq->dev.match.parts.tag == MPI_ANY_TAG) me.ignore_bits = NPTL_MATCH_IGNORE_ANY_TAG; else me.ignore_bits = NPTL_MATCH_IGNORE; me.min_free = 0; MPIDI_Datatype_get_info(rreq->dev.user_count, rreq->dev.datatype, dt_contig, data_sz, dt_ptr, dt_true_lb); MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "count="MPI_AINT_FMT_DEC_SPEC" datatype=%#x contig=%d data_sz=%lu", rreq->dev.user_count, rreq->dev.datatype, dt_contig, data_sz)); if (data_sz <= PTL_LARGE_THRESHOLD) { if (dt_contig) { /* small contig message */ void *start = (char *)rreq->dev.user_buf + dt_true_lb; MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, "Small contig message"); if (start == NULL) me.start = &dummy; else me.start = start; me.length = data_sz; REQ_PTL(rreq)->event_handler = handler_recv_dequeue_complete; } else { /* small noncontig */ MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, "Small noncontig message"); rreq->dev.segment_ptr = MPID_Segment_alloc(); MPIR_ERR_CHKANDJUMP1(rreq->dev.segment_ptr == NULL, mpi_errno, MPI_ERR_OTHER, "**nomem", "**nomem %s", "MPID_Segment_alloc"); MPID_Segment_init(rreq->dev.user_buf, rreq->dev.user_count, rreq->dev.datatype, rreq->dev.segment_ptr, 0); rreq->dev.segment_first = 0; rreq->dev.segment_size = data_sz; last = rreq->dev.segment_size; rreq->dev.iov_count = MPL_IOV_LIMIT; MPID_Segment_pack_vector(rreq->dev.segment_ptr, rreq->dev.segment_first, &last, rreq->dev.iov, &rreq->dev.iov_count); if (last == rreq->dev.segment_size) { /* entire message fits in IOV */ MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, " entire message fits in IOV"); me.start = rreq->dev.iov; me.length = rreq->dev.iov_count; me.options |= PTL_IOVEC; REQ_PTL(rreq)->event_handler = handler_recv_dequeue_complete; } else { /* IOV is not long enough to describe entire message: recv into buffer and unpack later */ MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, " IOV too long: using bounce buffer"); MPIU_CHKPMEM_MALLOC(REQ_PTL(rreq)->chunk_buffer[0], void *, data_sz, mpi_errno, "chunk_buffer"); me.start = REQ_PTL(rreq)->chunk_buffer[0]; me.length = data_sz; REQ_PTL(rreq)->event_handler = handler_recv_dequeue_unpack_complete; } } } else { /* Large message: Create an ME for the first chunk of data, then do a GET for the rest */ if (dt_contig) {
int MPIDI_CH3U_Post_data_receive_found(MPID_Request * rreq) { int mpi_errno = MPI_SUCCESS; int dt_contig; MPI_Aint dt_true_lb; MPIDI_msg_sz_t userbuf_sz; MPID_Datatype * dt_ptr = NULL; MPIDI_msg_sz_t data_sz; MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3U_POST_DATA_RECEIVE_FOUND); MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3U_POST_DATA_RECEIVE_FOUND); MPIU_DBG_MSG(CH3_OTHER,VERBOSE,"posted request found"); MPIDI_Datatype_get_info(rreq->dev.user_count, rreq->dev.datatype, dt_contig, userbuf_sz, dt_ptr, dt_true_lb); if (rreq->dev.recv_data_sz <= userbuf_sz) { data_sz = rreq->dev.recv_data_sz; } else { MPIU_DBG_MSG_FMT(CH3_OTHER,VERBOSE,(MPIU_DBG_FDEST, "receive buffer too small; message truncated, msg_sz=" MPIDI_MSG_SZ_FMT ", userbuf_sz=" MPIDI_MSG_SZ_FMT, rreq->dev.recv_data_sz, userbuf_sz)); rreq->status.MPI_ERROR = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, FCNAME, __LINE__, MPI_ERR_TRUNCATE, "**truncate", "**truncate %d %d %d %d", rreq->status.MPI_SOURCE, rreq->status.MPI_TAG, rreq->dev.recv_data_sz, userbuf_sz ); rreq->status.count = userbuf_sz; data_sz = userbuf_sz; } if (dt_contig && data_sz == rreq->dev.recv_data_sz) { /* user buffer is contiguous and large enough to store the entire message. However, we haven't yet *read* the data (this code describes how to read the data into the destination) */ MPIU_DBG_MSG(CH3_OTHER,VERBOSE,"IOV loaded for contiguous read"); rreq->dev.iov[0].MPID_IOV_BUF = (MPID_IOV_BUF_CAST)((char*)(rreq->dev.user_buf) + dt_true_lb); rreq->dev.iov[0].MPID_IOV_LEN = data_sz; rreq->dev.iov_count = 1; /* FIXME: We want to set the OnDataAvail to the appropriate function, which depends on whether this is an RMA request or a pt-to-pt request. */ rreq->dev.OnDataAvail = 0; } else { /* user buffer is not contiguous or is too small to hold the entire message */ MPIU_DBG_MSG(CH3_OTHER,VERBOSE,"IOV loaded for non-contiguous read"); rreq->dev.segment_ptr = MPID_Segment_alloc( ); MPIU_ERR_CHKANDJUMP1((rreq->dev.segment_ptr == NULL), mpi_errno, MPI_ERR_OTHER, "**nomem", "**nomem %s", "MPID_Segment_alloc"); MPID_Segment_init(rreq->dev.user_buf, rreq->dev.user_count, rreq->dev.datatype, rreq->dev.segment_ptr, 0); rreq->dev.segment_first = 0; rreq->dev.segment_size = data_sz; mpi_errno = MPIDI_CH3U_Request_load_recv_iov(rreq); if (mpi_errno != MPI_SUCCESS) { MPIU_ERR_SETFATALANDJUMP(mpi_errno,MPI_ERR_OTHER, "**ch3|loadrecviov"); } } fn_exit: MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3U_POST_DATA_RECEIVE_FOUND); return mpi_errno; fn_fail: goto fn_exit; }
/* create a request */ MPIDI_Request_create_sreq(sreq, mpi_errno, goto fn_exit); MPIU_Assert(sreq != NULL); MPIDI_Request_set_type(sreq, MPIDI_REQUEST_TYPE_SEND); MPIDI_VC_FAI_send_seqnum(vc, seqnum); MPIDI_Request_set_seqnum(sreq, seqnum); if (HANDLE_GET_KIND(datatype) != HANDLE_KIND_BUILTIN) { MPID_Datatype_get_ptr(datatype, sreq->dev.datatype_ptr); MPID_Datatype_add_ref(sreq->dev.datatype_ptr); } sreq->partner_request = NULL; sreq->dev.OnDataAvail = NULL; sreq->dev.tmpbuf = NULL; sreq->ch.vc = vc; sreq->ch.noncontig = FALSE; _dbg_mxm_output(5, "isSend ========> Sending USER msg for req %p (context %d to %d tag %d size %d) \n", sreq, comm->context_id + context_offset, rank, tag, data_sz); vc_area = VC_BASE(vc); req_area = REQ_BASE(sreq); req_area-> ctx = sreq; req_area->iov_buf = req_area->tmp_buf; req_area->iov_count = 0; req_area->iov_buf[0].ptr = NULL; req_area->iov_buf[0].length = 0; if (data_sz) { if (dt_contig) { req_area->iov_count = 1; req_area->iov_buf[0].ptr = (char *) (buf) + dt_true_lb; req_area->iov_buf[0].length = data_sz; } else { MPIDI_msg_sz_t last; MPI_Aint packsize = 0; sreq->ch.noncontig = TRUE; sreq->dev.segment_ptr = MPID_Segment_alloc(); MPIU_ERR_CHKANDJUMP1((sreq->dev.segment_ptr == NULL), mpi_errno, MPI_ERR_OTHER, "**nomem", "**nomem %s", "MPID_Segment_alloc"); MPIR_Pack_size_impl(count, datatype, &packsize); last = data_sz; if (packsize > 0) { sreq->dev.tmpbuf = MPIU_Malloc((size_t) packsize); MPIU_Assert(sreq->dev.tmpbuf); MPID_Segment_init(buf, count, datatype, sreq->dev.segment_ptr, 0); MPID_Segment_pack(sreq->dev.segment_ptr, 0, &last, sreq->dev.tmpbuf); req_area->iov_count = 1; req_area->iov_buf[0].ptr = sreq->dev.tmpbuf; req_area->iov_buf[0].length = last; } } } vc_area->pending_sends += 1; mpi_errno = _mxm_isend(vc_area->mxm_ep, req_area, MXM_MPICH_ISEND_SYNC, (mxm_mq_h) comm->dev.ch.netmod_priv, comm->rank, tag, _mxm_tag_mpi2mxm(tag, comm->context_id + context_offset), 0); if (mpi_errno) MPIU_ERR_POP(mpi_errno); _dbg_mxm_out_req(sreq); fn_exit: *sreq_ptr = sreq; MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_MXM_ISSEND); return mpi_errno; fn_fail: goto fn_exit; } static int _mxm_handle_sreq(MPID_Request * req) { int complete = FALSE; int (*reqFn) (MPIDI_VC_t *, MPID_Request *, int *); MPID_nem_mxm_vc_area *vc_area = NULL; MPID_nem_mxm_req_area *req_area = NULL; vc_area = VC_BASE(req->ch.vc); req_area = REQ_BASE(req); _dbg_mxm_out_buf(req_area->iov_buf[0].ptr, (req_area->iov_buf[0].length > 16 ? 16 : req_area->iov_buf[0].length)); vc_area->pending_sends -= 1; if (((req->dev.datatype_ptr != NULL) && (req->dev.tmpbuf != NULL))) { MPIU_Free(req->dev.tmpbuf); } if (req_area->iov_count > MXM_MPICH_MAX_IOV) { MPIU_Free(req_area->iov_buf); req_area->iov_buf = req_area->tmp_buf; req_area->iov_count = 0; } reqFn = req->dev.OnDataAvail; if (!reqFn) { MPIDI_CH3U_Request_complete(req); MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, ".... complete"); } else { MPIDI_VC_t *vc = req->ch.vc; reqFn(vc, req, &complete); if (!complete) { MPIU_Assert(complete == TRUE); } } return complete; } static void _mxm_send_completion_cb(void *context) { MPID_Request *req = (MPID_Request *) context; MPID_nem_mxm_vc_area *vc_area = NULL; MPID_nem_mxm_req_area *req_area = NULL; MPIU_Assert(req); _dbg_mxm_out_req(req); vc_area = VC_BASE(req->ch.vc); req_area = REQ_BASE(req); _mxm_to_mpi_status(req_area->mxm_req->item.base.error, &req->status); list_enqueue(&vc_area->mxm_ep->free_queue, &req_area->mxm_req->queue); _dbg_mxm_output(5, "========> %s SEND req %p status %d\n", (MPIR_STATUS_GET_CANCEL_BIT(req->status) ? "Canceling" : "Completing"), req, req->status.MPI_ERROR); if (likely(!MPIR_STATUS_GET_CANCEL_BIT(req->status))) { _mxm_handle_sreq(req); } } static int _mxm_isend(MPID_nem_mxm_ep_t * ep, MPID_nem_mxm_req_area * req, int type, mxm_mq_h mxm_mq, int mxm_rank, int id, mxm_tag_t mxm_tag, int block) { int mpi_errno = MPI_SUCCESS; mxm_error_t ret = MXM_OK; mxm_send_req_t *mxm_sreq; list_head_t *free_queue = NULL; MPIU_Assert(ep); MPIU_Assert(req); free_queue = &ep->free_queue; req->mxm_req = list_dequeue_mxm_req(free_queue); if (!req->mxm_req) { list_grow_mxm_req(free_queue); req->mxm_req = list_dequeue_mxm_req(free_queue); if (!req->mxm_req) { MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, "empty free queue"); mpi_errno = MPI_ERR_OTHER; goto fn_fail; } } mxm_sreq = &(req->mxm_req->item.send); mxm_sreq->base.state = MXM_REQ_NEW; mxm_sreq->base.mq = mxm_mq; mxm_sreq->base.conn = ep->mxm_conn; mxm_sreq->base.completed_cb = _mxm_send_completion_cb; mxm_sreq->base.context = req->ctx; if (type == MXM_MPICH_ISEND_AM) { mxm_sreq->opcode = MXM_REQ_OP_AM; mxm_sreq->flags = 0; mxm_sreq->op.am.hid = id; mxm_sreq->op.am.imm_data = mxm_rank; } else if (type == MXM_MPICH_ISEND_SYNC) { mxm_sreq->opcode = MXM_REQ_OP_SEND_SYNC; mxm_sreq->flags = 0; mxm_sreq->op.send.tag = mxm_tag; mxm_sreq->op.send.imm_data = mxm_rank; } else { mxm_sreq->opcode = MXM_REQ_OP_SEND; mxm_sreq->flags = 0; mxm_sreq->op.send.tag = mxm_tag; mxm_sreq->op.send.imm_data = mxm_rank; } if (likely(req->iov_count == 1)) { mxm_sreq->base.data_type = MXM_REQ_DATA_BUFFER; mxm_sreq->base.data.buffer.ptr = req->iov_buf[0].ptr; mxm_sreq->base.data.buffer.length = req->iov_buf[0].length; } else { mxm_sreq->base.data_type = MXM_REQ_DATA_IOV; mxm_sreq->base.data.iov.vector = req->iov_buf; mxm_sreq->base.data.iov.count = req->iov_count; } ret = mxm_req_send(mxm_sreq); if (MXM_OK != ret) { list_enqueue(free_queue, &req->mxm_req->queue); mpi_errno = MPI_ERR_OTHER; goto fn_fail; } if (block) _mxm_req_wait(&mxm_sreq->base); fn_exit: return mpi_errno; fn_fail: goto fn_exit; } #if 0 /* Consider using this function in case non contiguous data */ static int _mxm_process_sdtype(MPID_Request ** sreq_p, MPI_Datatype datatype, MPID_Datatype * dt_ptr, MPIDI_msg_sz_t data_sz, const void *buf, int count, mxm_req_buffer_t ** iov_buf, int *iov_count) { int mpi_errno = MPI_SUCCESS; MPID_Request *sreq = *sreq_p; MPIDI_msg_sz_t last; MPID_IOV *iov; int n_iov = 0; int index; int size_to_copy = 0; sreq->dev.segment_ptr = MPID_Segment_alloc(); MPIU_ERR_CHKANDJUMP1((sreq->dev.segment_ptr == NULL), mpi_errno, MPI_ERR_OTHER, "**nomem", "**nomem %s", "MPID_Segment_alloc"); MPID_Segment_init(buf, count, datatype, sreq->dev.segment_ptr, 0); sreq->dev.segment_first = 0; sreq->dev.segment_size = data_sz; last = sreq->dev.segment_size; MPID_Segment_count_contig_blocks(sreq->dev.segment_ptr, sreq->dev.segment_first, &last, (MPI_Aint *) & n_iov); MPIU_Assert(n_iov > 0); iov = MPIU_Malloc(n_iov * sizeof(*iov)); MPIU_Assert(iov); last = sreq->dev.segment_size; MPID_Segment_pack_vector(sreq->dev.segment_ptr, sreq->dev.segment_first, &last, iov, &n_iov); MPIU_Assert(last == sreq->dev.segment_size); #if defined(MXM_DEBUG) && (MXM_DEBUG > 0) _dbg_mxm_output(7, "Send Noncontiguous data vector %i entries (free slots : %i)\n", n_iov, MXM_REQ_DATA_MAX_IOV); for(index = 0; index < n_iov; index++) { _dbg_mxm_output(7, "======= Recv iov[%i] = ptr : %p, len : %i \n", index, iov[index].MPID_IOV_BUF, iov[index].MPID_IOV_LEN); } #endif if (n_iov > MXM_MPICH_MAX_IOV) { *iov_buf = (mxm_req_buffer_t *) MPIU_Malloc(n_iov * sizeof(**iov_buf)); MPIU_Assert(*iov_buf); } for (index = 0; index < n_iov; index++) { if (index < (MXM_REQ_DATA_MAX_IOV - 1)) { (*iov_buf)[index].ptr = iov[index].MPID_IOV_BUF; (*iov_buf)[index].length = iov[index].MPID_IOV_LEN; } else { size_to_copy += iov[index].MPID_IOV_LEN; } } if (size_to_copy == 0) { sreq->dev.tmpbuf = NULL; sreq->dev.tmpbuf_sz = 0; *iov_count = n_iov; } else { int offset = 0; sreq->dev.tmpbuf = MPIU_Malloc(size_to_copy); sreq->dev.tmpbuf_sz = size_to_copy; MPIU_Assert(sreq->dev.tmpbuf); for (index = (MXM_REQ_DATA_MAX_IOV - 1); index < n_iov; index++) { MPIU_Memcpy((char *) (sreq->dev.tmpbuf) + offset, iov[index].MPID_IOV_BUF, iov[index].MPID_IOV_LEN); offset += iov[index].MPID_IOV_LEN; } (*iov_buf)[MXM_REQ_DATA_MAX_IOV - 1].ptr = sreq->dev.tmpbuf; (*iov_buf)[MXM_REQ_DATA_MAX_IOV - 1].length = size_to_copy; *iov_count = MXM_REQ_DATA_MAX_IOV; } MPIU_Free(iov); fn_exit: return mpi_errno; fn_fail: goto fn_exit; }
static int send_msg(ptl_hdr_data_t ssend_flag, struct MPIDI_VC *vc, const void *buf, MPI_Aint count, MPI_Datatype datatype, int dest, int tag, MPID_Comm *comm, int context_offset, struct MPID_Request **request) { int mpi_errno = MPI_SUCCESS; MPID_nem_ptl_vc_area *const vc_ptl = VC_PTL(vc); int ret; MPIDI_msg_sz_t data_sz; int dt_contig; MPI_Aint dt_true_lb; MPID_Datatype *dt_ptr; MPID_Request *sreq = NULL; ptl_me_t me; int initial_iov_count, remaining_iov_count; ptl_md_t md; MPI_Aint last; MPIU_CHKPMEM_DECL(2); MPIDI_STATE_DECL(MPID_STATE_SEND_MSG); MPIDI_FUNC_ENTER(MPID_STATE_SEND_MSG); MPID_nem_ptl_request_create_sreq(sreq, mpi_errno, comm); sreq->dev.match.parts.rank = dest; sreq->dev.match.parts.tag = tag; sreq->dev.match.parts.context_id = comm->context_id + context_offset; sreq->ch.vc = vc; if (!vc_ptl->id_initialized) { mpi_errno = MPID_nem_ptl_init_id(vc); if (mpi_errno) MPIR_ERR_POP(mpi_errno); } MPIDI_Datatype_get_info(count, datatype, dt_contig, data_sz, dt_ptr, dt_true_lb); MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "count="MPI_AINT_FMT_DEC_SPEC" datatype=%#x contig=%d data_sz=%lu", count, datatype, dt_contig, data_sz)); if (data_sz <= PTL_LARGE_THRESHOLD) { /* Small message. Send all data eagerly */ if (dt_contig) { void *start = (char *)buf + dt_true_lb; MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, "Small contig message"); REQ_PTL(sreq)->event_handler = handler_send; MPIU_DBG_MSG_P(CH3_CHANNEL, VERBOSE, "&REQ_PTL(sreq)->event_handler = %p", &(REQ_PTL(sreq)->event_handler)); if (start == NULL) ret = MPID_nem_ptl_rptl_put(MPIDI_nem_ptl_global_md, (ptl_size_t)&dummy, data_sz, PTL_NO_ACK_REQ, vc_ptl->id, vc_ptl->pt, NPTL_MATCH(tag, comm->context_id + context_offset, comm->rank), 0, sreq, NPTL_HEADER(ssend_flag, data_sz)); else ret = MPID_nem_ptl_rptl_put(MPIDI_nem_ptl_global_md, (ptl_size_t)start, data_sz, PTL_NO_ACK_REQ, vc_ptl->id, vc_ptl->pt, NPTL_MATCH(tag, comm->context_id + context_offset, comm->rank), 0, sreq, NPTL_HEADER(ssend_flag, data_sz)); MPIR_ERR_CHKANDJUMP1(ret, mpi_errno, MPI_ERR_OTHER, "**ptlput", "**ptlput %s", MPID_nem_ptl_strerror(ret)); DBG_MSG_PUT("global", data_sz, vc->pg_rank, NPTL_MATCH(tag, comm->context_id + context_offset, comm->rank), NPTL_HEADER(ssend_flag, data_sz)); MPIU_DBG_MSG_D(CH3_CHANNEL, VERBOSE, "id.nid = %#x", vc_ptl->id.phys.nid); MPIU_DBG_MSG_D(CH3_CHANNEL, VERBOSE, "id.pid = %#x", vc_ptl->id.phys.pid); MPIU_DBG_MSG_P(CH3_CHANNEL, VERBOSE, "sreq = %p", sreq); MPIU_DBG_MSG_D(CH3_CHANNEL, VERBOSE, "vc_ptl->pt = %d", vc_ptl->pt); MPIU_DBG_MSG_P(CH3_CHANNEL, VERBOSE, "REQ_PTL(sreq)->event_handler = %p", REQ_PTL(sreq)->event_handler); goto fn_exit; } /* noncontig data */ MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, "Small noncontig message"); sreq->dev.segment_ptr = MPID_Segment_alloc(); MPIR_ERR_CHKANDJUMP1(sreq->dev.segment_ptr == NULL, mpi_errno, MPI_ERR_OTHER, "**nomem", "**nomem %s", "MPID_Segment_alloc"); MPID_Segment_init(buf, count, datatype, sreq->dev.segment_ptr, 0); sreq->dev.segment_first = 0; sreq->dev.segment_size = data_sz; last = sreq->dev.segment_size; sreq->dev.iov_count = MPL_IOV_LIMIT; MPID_Segment_pack_vector(sreq->dev.segment_ptr, sreq->dev.segment_first, &last, sreq->dev.iov, &sreq->dev.iov_count); if (last == sreq->dev.segment_size) { /* IOV is able to describe entire message */ MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, " entire message fits in IOV"); md.start = sreq->dev.iov; md.length = sreq->dev.iov_count; md.options = PTL_IOVEC; md.eq_handle = MPIDI_nem_ptl_origin_eq; md.ct_handle = PTL_CT_NONE; ret = PtlMDBind(MPIDI_nem_ptl_ni, &md, &REQ_PTL(sreq)->md); MPIR_ERR_CHKANDJUMP1(ret, mpi_errno, MPI_ERR_OTHER, "**ptlmdbind", "**ptlmdbind %s", MPID_nem_ptl_strerror(ret)); REQ_PTL(sreq)->event_handler = handler_send; ret = MPID_nem_ptl_rptl_put(REQ_PTL(sreq)->md, 0, data_sz, PTL_NO_ACK_REQ, vc_ptl->id, vc_ptl->pt, NPTL_MATCH(tag, comm->context_id + context_offset, comm->rank), 0, sreq, NPTL_HEADER(ssend_flag, data_sz)); MPIR_ERR_CHKANDJUMP1(ret, mpi_errno, MPI_ERR_OTHER, "**ptlput", "**ptlput %s", MPID_nem_ptl_strerror(ret)); DBG_MSG_PUT("sreq", data_sz, vc->pg_rank, NPTL_MATCH(tag, comm->context_id + context_offset, comm->rank), NPTL_HEADER(ssend_flag, data_sz)); goto fn_exit; } /* IOV is not long enough to describe entire message */ MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, " IOV too long: using bounce buffer"); MPIU_CHKPMEM_MALLOC(REQ_PTL(sreq)->chunk_buffer[0], void *, data_sz, mpi_errno, "chunk_buffer"); MPID_Segment_init(buf, count, datatype, sreq->dev.segment_ptr, 0); sreq->dev.segment_first = 0; last = data_sz; MPID_Segment_pack(sreq->dev.segment_ptr, sreq->dev.segment_first, &last, REQ_PTL(sreq)->chunk_buffer[0]); MPIU_Assert(last == sreq->dev.segment_size); REQ_PTL(sreq)->event_handler = handler_send; ret = MPID_nem_ptl_rptl_put(MPIDI_nem_ptl_global_md, (ptl_size_t)REQ_PTL(sreq)->chunk_buffer[0], data_sz, PTL_NO_ACK_REQ, vc_ptl->id, vc_ptl->pt, NPTL_MATCH(tag, comm->context_id + context_offset, comm->rank), 0, sreq, NPTL_HEADER(ssend_flag, data_sz)); MPIR_ERR_CHKANDJUMP1(ret, mpi_errno, MPI_ERR_OTHER, "**ptlput", "**ptlput %s", MPID_nem_ptl_strerror(ret)); DBG_MSG_PUT("global", data_sz, vc->pg_rank, NPTL_MATCH(tag, comm->context_id + context_offset, comm->rank), NPTL_HEADER(ssend_flag, data_sz)); goto fn_exit; } /* Large message. Send first chunk of data and let receiver get the rest */ if (dt_contig) { /* create ME for buffer so receiver can issue a GET for the data */ MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, "Large contig message"); big_meappend((char *)buf + dt_true_lb + PTL_LARGE_THRESHOLD, data_sz - PTL_LARGE_THRESHOLD, vc, NPTL_MATCH(tag, comm->context_id + context_offset, comm->rank), sreq); REQ_PTL(sreq)->event_handler = handler_send; ret = MPID_nem_ptl_rptl_put(MPIDI_nem_ptl_global_md, (ptl_size_t)((char *)buf + dt_true_lb), PTL_LARGE_THRESHOLD, PTL_NO_ACK_REQ, vc_ptl->id, vc_ptl->pt, NPTL_MATCH(tag, comm->context_id + context_offset, comm->rank), 0, sreq, NPTL_HEADER(ssend_flag | NPTL_LARGE, data_sz)); MPIR_ERR_CHKANDJUMP1(ret, mpi_errno, MPI_ERR_OTHER, "**ptlput", "**ptlput %s", MPID_nem_ptl_strerror(ret)); DBG_MSG_PUT("global", PTL_LARGE_THRESHOLD, vc->pg_rank, NPTL_MATCH(tag, comm->context_id + context_offset, comm->rank), NPTL_HEADER(ssend_flag | NPTL_LARGE, data_sz)); goto fn_exit; } /* Large noncontig data */ MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, "Large noncontig message"); sreq->dev.segment_ptr = MPID_Segment_alloc(); MPIR_ERR_CHKANDJUMP1(sreq->dev.segment_ptr == NULL, mpi_errno, MPI_ERR_OTHER, "**nomem", "**nomem %s", "MPID_Segment_alloc"); MPID_Segment_init(buf, count, datatype, sreq->dev.segment_ptr, 0); sreq->dev.segment_first = 0; sreq->dev.segment_size = data_sz; last = PTL_LARGE_THRESHOLD; sreq->dev.iov_count = MPL_IOV_LIMIT; MPID_Segment_pack_vector(sreq->dev.segment_ptr, sreq->dev.segment_first, &last, sreq->dev.iov, &sreq->dev.iov_count); initial_iov_count = sreq->dev.iov_count; sreq->dev.segment_first = last; if (last == PTL_LARGE_THRESHOLD) { /* first chunk of message fits into IOV */ MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, " first chunk fits in IOV"); if (initial_iov_count < MPL_IOV_LIMIT) { /* There may be space for the rest of the message in this IOV */ sreq->dev.iov_count = MPL_IOV_LIMIT - sreq->dev.iov_count; last = sreq->dev.segment_size; MPID_Segment_pack_vector(sreq->dev.segment_ptr, sreq->dev.segment_first, &last, &sreq->dev.iov[initial_iov_count], &sreq->dev.iov_count); remaining_iov_count = sreq->dev.iov_count; if (last == sreq->dev.segment_size && last <= MPIDI_nem_ptl_ni_limits.max_msg_size + PTL_LARGE_THRESHOLD) { /* Entire message fit in one IOV */ int was_incomplete; MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, " rest of message fits in one IOV"); /* Create ME for remaining data */ me.start = &sreq->dev.iov[initial_iov_count]; me.length = remaining_iov_count; me.ct_handle = PTL_CT_NONE; me.uid = PTL_UID_ANY; me.options = ( PTL_ME_OP_PUT | PTL_ME_OP_GET | PTL_ME_USE_ONCE | PTL_ME_IS_ACCESSIBLE | PTL_ME_EVENT_LINK_DISABLE | PTL_ME_EVENT_UNLINK_DISABLE | PTL_IOVEC ); me.match_id = vc_ptl->id; me.match_bits = NPTL_MATCH(tag, comm->context_id + context_offset, comm->rank); me.ignore_bits = 0; me.min_free = 0; MPIU_CHKPMEM_MALLOC(REQ_PTL(sreq)->get_me_p, ptl_handle_me_t *, sizeof(ptl_handle_me_t), mpi_errno, "get_me_p"); ret = PtlMEAppend(MPIDI_nem_ptl_ni, MPIDI_nem_ptl_get_pt, &me, PTL_PRIORITY_LIST, sreq, &REQ_PTL(sreq)->get_me_p[0]); MPIR_ERR_CHKANDJUMP1(ret, mpi_errno, MPI_ERR_OTHER, "**ptlmeappend", "**ptlmeappend %s", MPID_nem_ptl_strerror(ret)); DBG_MSG_MEAPPEND("CTL", vc->pg_rank, me, sreq); /* increment the cc for the get operation */ MPIDI_CH3U_Request_increment_cc(sreq, &was_incomplete); MPIU_Assert(was_incomplete); /* Create MD for first chunk */ md.start = sreq->dev.iov; md.length = initial_iov_count; md.options = PTL_IOVEC; md.eq_handle = MPIDI_nem_ptl_origin_eq; md.ct_handle = PTL_CT_NONE; ret = PtlMDBind(MPIDI_nem_ptl_ni, &md, &REQ_PTL(sreq)->md); MPIR_ERR_CHKANDJUMP1(ret, mpi_errno, MPI_ERR_OTHER, "**ptlmdbind", "**ptlmdbind %s", MPID_nem_ptl_strerror(ret)); REQ_PTL(sreq)->event_handler = handler_send; ret = MPID_nem_ptl_rptl_put(REQ_PTL(sreq)->md, 0, PTL_LARGE_THRESHOLD, PTL_NO_ACK_REQ, vc_ptl->id, vc_ptl->pt, NPTL_MATCH(tag, comm->context_id + context_offset, comm->rank), 0, sreq, NPTL_HEADER(ssend_flag | NPTL_LARGE, data_sz)); MPIR_ERR_CHKANDJUMP1(ret, mpi_errno, MPI_ERR_OTHER, "**ptlput", "**ptlput %s", MPID_nem_ptl_strerror(ret)); DBG_MSG_PUT("req", PTL_LARGE_THRESHOLD, vc->pg_rank, NPTL_MATCH(tag, comm->context_id + context_offset, comm->rank), NPTL_HEADER(ssend_flag | NPTL_LARGE, data_sz)); goto fn_exit; }
int MPID_nem_lmt_dma_start_recv(MPIDI_VC_t *vc, MPID_Request *rreq, MPID_IOV s_cookie) { int mpi_errno = MPI_SUCCESS; int nodma; int dt_contig; MPI_Aint dt_true_lb; MPIDI_msg_sz_t data_sz; MPID_Datatype * dt_ptr; volatile knem_status_t *status; knem_status_t current_status; struct lmt_dma_node *node = NULL; MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_LMT_DMA_START_RECV); MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_LMT_DMA_START_RECV); /* MT: this code assumes only one thread can be at this point at a time */ if (knem_fd < 0) { mpi_errno = open_knem_dev(); if (mpi_errno) MPIU_ERR_POP(mpi_errno); } /* find out contig/noncontig, size, and lb for the datatype */ MPIDI_Datatype_get_info(rreq->dev.user_count, rreq->dev.datatype, dt_contig, data_sz, dt_ptr, dt_true_lb); nodma = !knem_has_dma || data_sz < MPIR_CVAR_NEMESIS_LMT_DMA_THRESHOLD; if (dt_contig) { /* handle the iov creation ourselves */ rreq->dev.iov[0].MPID_IOV_BUF = (char *)rreq->dev.user_buf + dt_true_lb; rreq->dev.iov[0].MPID_IOV_LEN = data_sz; rreq->dev.iov_count = 1; } else { if (rreq->dev.segment_ptr == NULL) { /* segment_ptr may be non-null when this is a continuation of a many-part message that we couldn't fit in one single flight of iovs. */ MPIU_Assert(rreq->dev.segment_ptr == NULL); rreq->dev.segment_ptr = MPID_Segment_alloc(); MPIU_ERR_CHKANDJUMP1((rreq->dev.segment_ptr == NULL), mpi_errno, MPI_ERR_OTHER, "**nomem", "**nomem %s", "MPID_Segment_alloc"); MPID_Segment_init(rreq->dev.user_buf, rreq->dev.user_count, rreq->dev.datatype, rreq->dev.segment_ptr, 0); rreq->dev.segment_first = 0; rreq->dev.segment_size = data_sz; /* see load_send_iov FIXME above */ mpi_errno = MPIDI_CH3U_Request_load_recv_iov(rreq); if (mpi_errno) MPIU_ERR_POP(mpi_errno); } } MPIU_Assert(s_cookie.MPID_IOV_LEN == sizeof(knem_cookie_t)); MPIU_Assert(s_cookie.MPID_IOV_BUF != NULL); mpi_errno = do_dma_recv(rreq->dev.iov_count, rreq->dev.iov, *((knem_cookie_t *)s_cookie.MPID_IOV_BUF), nodma, &status, ¤t_status); if (mpi_errno) MPIU_ERR_POP(mpi_errno); /* TODO refactor this block and MPID_nem_lmt_dma_progress (and anywhere * else) to share a common function. This advancement/completion code is * duplication. */ if (current_status != KNEM_STATUS_PENDING) { /* complete the request if all data has been sent, remove it from the list */ int complete = 0; MPIU_ERR_CHKANDJUMP1(current_status == KNEM_STATUS_FAILED, mpi_errno, MPI_ERR_OTHER, "**recv_status", "**recv_status %d", current_status); mpi_errno = check_req_complete(vc, rreq, &complete); if (mpi_errno) MPIU_ERR_POP(mpi_errno); free_status_index(status - knem_status); if (complete) { /* request was completed by the OnDataAvail fn */ MPID_nem_lmt_send_DONE(vc, rreq); /* tell the other side to complete its request */ MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, ".... complete"); } else { /* There is more data to send. We must inform the sender that we have completely received the current batch and that the next batch should be sent. */ MPID_nem_lmt_send_COOKIE(vc, rreq, NULL, 0); } } /* XXX DJG FIXME this looks like it always pushes! */ /* push request if not complete for progress checks later */ node = MPIU_Malloc(sizeof(struct lmt_dma_node)); node->vc = vc; node->req = rreq; node->status_p = status; node->next = outstanding_head; outstanding_head = node; ++MPID_nem_local_lmt_pending; fn_exit: MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_LMT_DMA_START_RECV); return mpi_errno; fn_fail: goto fn_exit; }
int MPIDI_Accumulate(void *origin_addr, int origin_count, MPI_Datatype origin_datatype, int target_rank, MPI_Aint target_disp, int target_count, MPI_Datatype target_datatype, MPI_Op op, MPID_Win *win_ptr) { int mpi_errno=MPI_SUCCESS; MPIDI_msg_sz_t data_sz; int dt_contig, rank, origin_predefined, target_predefined; MPI_Aint dt_true_lb; MPIDI_RMA_ops *new_ptr; MPID_Datatype *dtp; MPIU_CHKLMEM_DECL(2); MPIU_CHKPMEM_DECL(1); MPIDI_STATE_DECL(MPID_STATE_MPIDI_ACCUMULATE); MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_ACCUMULATE); MPIDI_Datatype_get_info(origin_count, origin_datatype, dt_contig, data_sz, dtp, dt_true_lb); if ((data_sz == 0) || (target_rank == MPI_PROC_NULL)) { goto fn_exit; } rank = win_ptr->myrank; MPIDI_CH3I_DATATYPE_IS_PREDEFINED(origin_datatype, origin_predefined); MPIDI_CH3I_DATATYPE_IS_PREDEFINED(target_datatype, target_predefined); /* Do =! rank first (most likely branch?) */ if (target_rank == rank) { MPI_User_function *uop; if (op == MPI_REPLACE) { mpi_errno = MPIR_Localcopy(origin_addr, origin_count, origin_datatype, (char *) win_ptr->base + win_ptr->disp_unit * target_disp, target_count, target_datatype); goto fn_exit; } MPIU_ERR_CHKANDJUMP1((HANDLE_GET_KIND(op) != HANDLE_KIND_BUILTIN), mpi_errno, MPI_ERR_OP, "**opnotpredefined", "**opnotpredefined %d", op ); /* get the function by indexing into the op table */ uop = MPIR_Op_table[((op)&0xf) - 1]; if (origin_predefined && target_predefined) { (*uop)(origin_addr, (char *) win_ptr->base + win_ptr->disp_unit * target_disp, &target_count, &target_datatype); } else { /* derived datatype */ MPID_Segment *segp; DLOOP_VECTOR *dloop_vec; MPI_Aint first, last; int vec_len, i, type_size, count; MPI_Datatype type; MPI_Aint true_lb, true_extent, extent; void *tmp_buf=NULL, *source_buf, *target_buf; if (origin_datatype != target_datatype) { /* first copy the data into a temporary buffer with the same datatype as the target. Then do the accumulate operation. */ MPIR_Type_get_true_extent_impl(target_datatype, &true_lb, &true_extent); MPID_Datatype_get_extent_macro(target_datatype, extent); MPIU_CHKLMEM_MALLOC(tmp_buf, void *, target_count * (MPIR_MAX(extent,true_extent)), mpi_errno, "temporary buffer"); /* adjust for potential negative lower bound in datatype */ tmp_buf = (void *)((char*)tmp_buf - true_lb); mpi_errno = MPIR_Localcopy(origin_addr, origin_count, origin_datatype, tmp_buf, target_count, target_datatype); if (mpi_errno) { MPIU_ERR_POP(mpi_errno); } } if (target_predefined) { /* target predefined type, origin derived datatype */ (*uop)(tmp_buf, (char *) win_ptr->base + win_ptr->disp_unit * target_disp, &target_count, &target_datatype); } else { segp = MPID_Segment_alloc(); MPIU_ERR_CHKANDJUMP1((!segp), mpi_errno, MPI_ERR_OTHER, "**nomem","**nomem %s","MPID_Segment_alloc"); MPID_Segment_init(NULL, target_count, target_datatype, segp, 0); first = 0; last = SEGMENT_IGNORE_LAST; MPID_Datatype_get_ptr(target_datatype, dtp); vec_len = dtp->max_contig_blocks * target_count + 1; /* +1 needed because Rob says so */ MPIU_CHKLMEM_MALLOC(dloop_vec, DLOOP_VECTOR *, vec_len * sizeof(DLOOP_VECTOR), mpi_errno, "dloop vector"); MPID_Segment_pack_vector(segp, first, &last, dloop_vec, &vec_len); source_buf = (tmp_buf != NULL) ? tmp_buf : origin_addr; target_buf = (char *) win_ptr->base + win_ptr->disp_unit * target_disp; type = dtp->eltype; type_size = MPID_Datatype_get_basic_size(type); for (i=0; i<vec_len; i++) { count = (dloop_vec[i].DLOOP_VECTOR_LEN)/type_size; (*uop)((char *)source_buf + MPIU_PtrToAint(dloop_vec[i].DLOOP_VECTOR_BUF), (char *)target_buf + MPIU_PtrToAint(dloop_vec[i].DLOOP_VECTOR_BUF), &count, &type); } MPID_Segment_free(segp); } }
function, which depends on whether this is an RMA request or a pt-to-pt request. */ rreq->dev.OnDataAvail = 0; /* The recv_pending_count must be one here (!) because of the way the pending count is queried. We may want to fix this, but it will require a sweep of the code */ } else { MPIDI_msg_sz_t recv_data_sz; MPI_Aint last; /* user buffer is not contiguous. Use the segment code to unpack it, handling various errors and exceptional cases */ /* FIXME: The MPICH tests do not exercise this branch */ /* printf( "Surprise!\n" ); fflush(stdout);*/ rreq->dev.segment_ptr = MPID_Segment_alloc( ); MPIR_ERR_CHKANDJUMP1((rreq->dev.segment_ptr == NULL), mpi_errno, MPI_ERR_OTHER, "**nomem", "**nomem %s", "MPID_Segment_alloc"); MPID_Segment_init(rreq->dev.user_buf, rreq->dev.user_count, rreq->dev.datatype, rreq->dev.segment_ptr, 0); recv_data_sz = rreq->dev.recv_data_sz; last = recv_data_sz; MPID_Segment_unpack( rreq->dev.segment_ptr, 0, &last, eagershort_pkt->data ); if (last != recv_data_sz) { /* --BEGIN ERROR HANDLING-- */ /* There are two cases: a datatype mismatch (could not consume all data) or a too-short buffer. We need to distinguish between these two types. */ MPIR_STATUS_SET_COUNT(rreq->status, last);
/* MPIDI_CH3_EagerSyncNoncontigSend - Eagerly send noncontiguous data in synchronous mode. Some implementations may choose to use Rendezvous sends (see ch3u_rndv.c) for all Synchronous sends (MPI_Issend and MPI_Ssend). An eager synchronous send eliminates one of the handshake messages, but most application codes should not be using synchronous sends in performance-critical operations. */ int MPIDI_CH3_EagerSyncNoncontigSend( MPID_Request **sreq_p, const void * buf, int count, MPI_Datatype datatype, MPIDI_msg_sz_t data_sz, int dt_contig, MPI_Aint dt_true_lb, int rank, int tag, MPID_Comm * comm, int context_offset ) { int mpi_errno = MPI_SUCCESS; MPIDI_CH3_Pkt_t upkt; MPIDI_CH3_Pkt_eager_sync_send_t * const es_pkt = &upkt.eager_sync_send; MPIDI_VC_t * vc; MPID_Request *sreq = *sreq_p; /* MT FIXME what are the two operations we are waiting for? the send and * the sync response? */ MPID_cc_set(&sreq->cc, 2); sreq->dev.OnDataAvail = 0; sreq->dev.OnFinal = 0; MPIDI_Pkt_init(es_pkt, MPIDI_CH3_PKT_EAGER_SYNC_SEND); es_pkt->match.parts.rank = comm->rank; es_pkt->match.parts.tag = tag; es_pkt->match.parts.context_id = comm->context_id + context_offset; es_pkt->sender_req_id = sreq->handle; es_pkt->data_sz = data_sz; MPIDI_Comm_get_vc_set_active(comm, rank, &vc); MPIDI_VC_FAI_send_seqnum(vc, seqnum); MPIDI_Pkt_set_seqnum(es_pkt, seqnum); MPIDI_Request_set_seqnum(sreq, seqnum); MPIU_DBG_MSGPKT(vc,tag,es_pkt->match.parts.context_id,rank,data_sz,"EagerSync"); if (dt_contig) { MPL_IOV iov[2]; MPIU_DBG_MSG_FMT(CH3_OTHER,VERBOSE,(MPIU_DBG_FDEST, "sending contiguous sync eager message, data_sz=" MPIDI_MSG_SZ_FMT, data_sz)); iov[0].MPL_IOV_BUF = (MPL_IOV_BUF_CAST)es_pkt; iov[0].MPL_IOV_LEN = sizeof(*es_pkt); iov[1].MPL_IOV_BUF = (MPL_IOV_BUF_CAST) ((char *)buf + dt_true_lb); iov[1].MPL_IOV_LEN = data_sz; MPID_THREAD_CS_ENTER(POBJ, vc->pobj_mutex); mpi_errno = MPIDI_CH3_iSendv(vc, sreq, iov, 2); MPID_THREAD_CS_EXIT(POBJ, vc->pobj_mutex); /* --BEGIN ERROR HANDLING-- */ if (mpi_errno != MPI_SUCCESS) { /* Make sure to destroy the request before setting the pointer to * NULL, otherwise we lose the handle on the request */ MPID_Request_release(sreq); *sreq_p = NULL; MPIR_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**ch3|eagermsg"); } /* --END ERROR HANDLING-- */ } else { MPIU_DBG_MSG_D(CH3_OTHER,VERBOSE, "sending non-contiguous sync eager message, data_sz=" MPIDI_MSG_SZ_FMT, data_sz); sreq->dev.segment_ptr = MPID_Segment_alloc( ); MPIR_ERR_CHKANDJUMP1((sreq->dev.segment_ptr == NULL), mpi_errno, MPI_ERR_OTHER, "**nomem", "**nomem %s", "MPID_Segment_alloc"); MPID_Segment_init(buf, count, datatype, sreq->dev.segment_ptr, 0); sreq->dev.segment_first = 0; sreq->dev.segment_size = data_sz; MPID_THREAD_CS_ENTER(POBJ, vc->pobj_mutex); mpi_errno = vc->sendNoncontig_fn(vc, sreq, es_pkt, sizeof(MPIDI_CH3_Pkt_eager_sync_send_t)); MPID_THREAD_CS_EXIT(POBJ, vc->pobj_mutex); if (mpi_errno) MPIR_ERR_POP(mpi_errno); } fn_exit: return mpi_errno; fn_fail: *sreq_p = NULL; goto fn_exit; }
int MPIR_Pack_impl(const void *inbuf, int incount, MPI_Datatype datatype, void *outbuf, MPI_Aint outsize, MPI_Aint *position) { int mpi_errno = MPI_SUCCESS; MPI_Aint first, last; MPID_Segment *segp; int contig; MPI_Aint dt_true_lb; MPI_Aint data_sz; if (incount == 0) { goto fn_exit; } /* Handle contig case quickly */ if (HANDLE_GET_KIND(datatype) == HANDLE_KIND_BUILTIN) { contig = TRUE; dt_true_lb = 0; data_sz = incount * MPID_Datatype_get_basic_size(datatype); } else { MPID_Datatype *dt_ptr; MPID_Datatype_get_ptr(datatype, dt_ptr); contig = dt_ptr->is_contig; dt_true_lb = dt_ptr->true_lb; data_sz = incount * dt_ptr->size; } if (contig) { MPIU_Memcpy((char *) outbuf + *position, (char *)inbuf + dt_true_lb, data_sz); *position = (int)((MPI_Aint)*position + data_sz); goto fn_exit; } /* non-contig case */ /* TODO: CHECK RETURN VALUES?? */ /* TODO: SHOULD THIS ALL BE IN A MPID_PACK??? */ segp = MPID_Segment_alloc(); MPIU_ERR_CHKANDJUMP1(segp == NULL, mpi_errno, MPI_ERR_OTHER, "**nomem", "**nomem %s", "MPID_Segment"); mpi_errno = MPID_Segment_init(inbuf, incount, datatype, segp, 0); if (mpi_errno) MPIU_ERR_POP(mpi_errno); /* NOTE: the use of buffer values and positions in MPI_Pack and in * MPID_Segment_pack are quite different. See code or docs or something. */ first = 0; last = SEGMENT_IGNORE_LAST; /* Ensure that pointer increment fits in a pointer */ MPID_Ensure_Aint_fits_in_pointer((MPI_VOID_PTR_CAST_TO_MPI_AINT outbuf) + (MPI_Aint) *position); MPID_Segment_pack(segp, first, &last, (void *) ((char *) outbuf + *position)); /* Ensure that calculation fits into an int datatype. */ MPID_Ensure_Aint_fits_in_int((MPI_Aint)*position + last); *position = (int)((MPI_Aint)*position + last); MPID_Segment_free(segp); fn_exit: return mpi_errno; fn_fail: goto fn_exit; }
int MPIDI_CH3_PktHandler_RndvClrToSend( MPIDI_VC_t *vc, MPIDI_CH3_Pkt_t *pkt, MPIDI_msg_sz_t *buflen, MPID_Request **rreqp ) { MPIDI_CH3_Pkt_rndv_clr_to_send_t * cts_pkt = &pkt->rndv_clr_to_send; MPID_Request * sreq; MPID_Request * rts_sreq; MPIDI_CH3_Pkt_t upkt; MPIDI_CH3_Pkt_rndv_send_t * rs_pkt = &upkt.rndv_send; int dt_contig; MPI_Aint dt_true_lb; MPIDI_msg_sz_t data_sz; MPID_Datatype * dt_ptr; int mpi_errno = MPI_SUCCESS; MPIU_DBG_MSG(CH3_OTHER,VERBOSE,"received rndv CTS pkt"); MPID_Request_get_ptr(cts_pkt->sender_req_id, sreq); MPIU_DBG_PRINTF(("received cts, count=%d\n", sreq->dev.user_count)); sreq->dev.OnDataAvail = 0; sreq->dev.OnFinal = 0; /* Release the RTS request if one exists. MPID_Request_fetch_and_clear_rts_sreq() needs to be atomic to prevent cancel send from cancelling the wrong (future) request. If MPID_Request_fetch_and_clear_rts_sreq() returns a NULL rts_sreq, then MPID_Cancel_send() is responsible for releasing the RTS request object. */ MPIDI_Request_fetch_and_clear_rts_sreq(sreq, &rts_sreq); if (rts_sreq != NULL) { MPID_Request_release(rts_sreq); } *buflen = sizeof(MPIDI_CH3_Pkt_t); MPIDI_Pkt_init(rs_pkt, MPIDI_CH3_PKT_RNDV_SEND); rs_pkt->receiver_req_id = cts_pkt->receiver_req_id; MPIDI_Datatype_get_info(sreq->dev.user_count, sreq->dev.datatype, dt_contig, data_sz, dt_ptr, dt_true_lb); if (dt_contig) { MPID_IOV iov[MPID_IOV_LIMIT]; MPIU_DBG_MSG_FMT(CH3_OTHER,VERBOSE,(MPIU_DBG_FDEST, "sending contiguous rndv data, data_sz=" MPIDI_MSG_SZ_FMT, data_sz)); iov[0].MPID_IOV_BUF = (MPID_IOV_BUF_CAST)rs_pkt; iov[0].MPID_IOV_LEN = sizeof(*rs_pkt); iov[1].MPID_IOV_BUF = (MPID_IOV_BUF_CAST)((char *)sreq->dev.user_buf + dt_true_lb); iov[1].MPID_IOV_LEN = data_sz; MPIU_THREAD_CS_ENTER(CH3COMM,vc); mpi_errno = MPIU_CALL(MPIDI_CH3,iSendv(vc, sreq, iov, 2)); MPIU_THREAD_CS_EXIT(CH3COMM,vc); MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**ch3|senddata"); } else { sreq->dev.segment_ptr = MPID_Segment_alloc( ); MPIU_ERR_CHKANDJUMP1((sreq->dev.segment_ptr == NULL), mpi_errno, MPI_ERR_OTHER, "**nomem", "**nomem %s", "MPID_Segment_alloc"); MPID_Segment_init(sreq->dev.user_buf, sreq->dev.user_count, sreq->dev.datatype, sreq->dev.segment_ptr, 0); sreq->dev.segment_first = 0; sreq->dev.segment_size = data_sz; MPIU_THREAD_CS_ENTER(CH3COMM,vc); mpi_errno = vc->sendNoncontig_fn(vc, sreq, rs_pkt, sizeof(*rs_pkt)); MPIU_THREAD_CS_EXIT(CH3COMM,vc); MPIU_ERR_CHKANDJUMP(mpi_errno, mpi_errno, MPI_ERR_OTHER, "**ch3|senddata"); } *rreqp = NULL; fn_fail: return mpi_errno; }
int MPID_nem_newmad_process_rdtype(MPID_Request **rreq_p, MPID_Datatype * dt_ptr, MPIDI_msg_sz_t data_sz, struct iovec *newmad_iov[], int *num_iov) { MPID_Request *rreq = *rreq_p; MPIDI_msg_sz_t last; MPID_IOV *iov; int n_iov = 0; int mpi_errno = MPI_SUCCESS; int index; MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_NEWMAD_PROCESS_RDTYPE); MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_NEWMAD_PROCESS_RDTYPE); if (rreq->dev.segment_ptr == NULL) { rreq->dev.segment_ptr = MPID_Segment_alloc( ); MPIU_ERR_CHKANDJUMP1((rreq->dev.segment_ptr == NULL), mpi_errno, MPI_ERR_OTHER, "**nomem", "**nomem %s", "MPID_Segment_alloc"); } MPID_Segment_init(rreq->dev.user_buf, rreq->dev.user_count, rreq->dev.datatype, rreq->dev.segment_ptr, 0); rreq->dev.segment_first = 0; rreq->dev.segment_size = data_sz; last = rreq->dev.segment_size; MPID_Segment_count_contig_blocks(rreq->dev.segment_ptr,rreq->dev.segment_first,&last,&n_iov); MPIU_Assert(n_iov > 0); iov = MPIU_Malloc(n_iov*sizeof(MPID_IOV)); MPID_Segment_unpack_vector(rreq->dev.segment_ptr, rreq->dev.segment_first, &last,iov, &n_iov); MPIU_Assert(last == rreq->dev.segment_size); #ifdef DEBUG for(index = 0; index < n_iov ; index++) { fprintf(stdout,"======================\n"); fprintf(stdout,"RECV iov[%i]: [base %p][len %i]\n",index, iov[index].MPID_IOV_BUF,iov[index].MPID_IOV_LEN); } #endif if(n_iov <= NMAD_IOV_MAX_DEPTH) { for(index=0; index < n_iov ; index++) { (*newmad_iov)[index].iov_base = iov[index].MPID_IOV_BUF; (*newmad_iov)[index].iov_len = iov[index].MPID_IOV_LEN; } rreq->dev.tmpbuf = NULL; *num_iov = n_iov; } else { int packsize = 0; MPIR_Pack_size_impl(rreq->dev.user_count, rreq->dev.datatype, &packsize); rreq->dev.tmpbuf = MPIU_Malloc((size_t) packsize); MPIU_Assert(rreq->dev.tmpbuf); rreq->dev.tmpbuf_sz = packsize; (*newmad_iov)[0].iov_base = (char *) rreq->dev.tmpbuf; (*newmad_iov)[0].iov_len = (uint32_t) packsize; *num_iov = 1 ; } MPIU_Free(iov); fn_exit: MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_NEWMAD_PROCESS_RDTYPE); return mpi_errno; fn_fail: ATTRIBUTE((unused)) goto fn_exit; }
int MPIDI_Get_accumulate(const void *origin_addr, int origin_count, MPI_Datatype origin_datatype, void *result_addr, int result_count, MPI_Datatype result_datatype, int target_rank, MPI_Aint target_disp, int target_count, MPI_Datatype target_datatype, MPI_Op op, MPID_Win *win_ptr) { int mpi_errno = MPI_SUCCESS; MPIDI_msg_sz_t data_sz; int rank, origin_predefined, result_predefined, target_predefined; int shm_locked = 0; int dt_contig ATTRIBUTE((unused)); MPI_Aint dt_true_lb ATTRIBUTE((unused)); MPID_Datatype *dtp; MPIU_CHKLMEM_DECL(2); MPIDI_STATE_DECL(MPID_STATE_MPIDI_GET_ACCUMULATE); MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_GET_ACCUMULATE); if (target_rank == MPI_PROC_NULL) { goto fn_exit; } if (win_ptr->epoch_state == MPIDI_EPOCH_NONE && win_ptr->fence_issued) { win_ptr->epoch_state = MPIDI_EPOCH_FENCE; } MPIU_ERR_CHKANDJUMP(win_ptr->epoch_state == MPIDI_EPOCH_NONE, mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync"); MPIDI_Datatype_get_info(target_count, target_datatype, dt_contig, data_sz, dtp, dt_true_lb); if (data_sz == 0) { goto fn_exit; } rank = win_ptr->myrank; origin_predefined = TRUE; /* quiet uninitialized warnings (b/c goto) */ if (op != MPI_NO_OP) { MPIDI_CH3I_DATATYPE_IS_PREDEFINED(origin_datatype, origin_predefined); } MPIDI_CH3I_DATATYPE_IS_PREDEFINED(result_datatype, result_predefined); MPIDI_CH3I_DATATYPE_IS_PREDEFINED(target_datatype, target_predefined); /* Do =! rank first (most likely branch?) */ if (target_rank == rank || win_ptr->create_flavor == MPI_WIN_FLAVOR_SHARED) { MPI_User_function *uop; void *base; int disp_unit; if (win_ptr->create_flavor == MPI_WIN_FLAVOR_SHARED) { base = win_ptr->shm_base_addrs[target_rank]; disp_unit = win_ptr->disp_units[target_rank]; MPIDI_CH3I_SHM_MUTEX_LOCK(win_ptr); shm_locked = 1; } else { base = win_ptr->base; disp_unit = win_ptr->disp_unit; } /* Perform the local get first, then the accumulate */ mpi_errno = MPIR_Localcopy((char *) base + disp_unit * target_disp, target_count, target_datatype, result_addr, result_count, result_datatype); if (mpi_errno) { MPIU_ERR_POP(mpi_errno); } /* NO_OP: Don't perform the accumulate */ if (op == MPI_NO_OP) { if (shm_locked) { MPIDI_CH3I_SHM_MUTEX_UNLOCK(win_ptr); shm_locked = 0; } goto fn_exit; } if (op == MPI_REPLACE) { mpi_errno = MPIR_Localcopy(origin_addr, origin_count, origin_datatype, (char *) base + disp_unit * target_disp, target_count, target_datatype); if (mpi_errno) { MPIU_ERR_POP(mpi_errno); } if (shm_locked) { MPIDI_CH3I_SHM_MUTEX_UNLOCK(win_ptr); shm_locked = 0; } goto fn_exit; } MPIU_ERR_CHKANDJUMP1((HANDLE_GET_KIND(op) != HANDLE_KIND_BUILTIN), mpi_errno, MPI_ERR_OP, "**opnotpredefined", "**opnotpredefined %d", op ); /* get the function by indexing into the op table */ uop = MPIR_OP_HDL_TO_FN(op); if (origin_predefined && target_predefined) { /* Cast away const'ness for origin_address in order to * avoid changing the prototype for MPI_User_function */ (*uop)((void *) origin_addr, (char *) base + disp_unit*target_disp, &target_count, &target_datatype); } else { /* derived datatype */ MPID_Segment *segp; DLOOP_VECTOR *dloop_vec; MPI_Aint first, last; int vec_len, i, type_size, count; MPI_Datatype type; MPI_Aint true_lb, true_extent, extent; void *tmp_buf=NULL, *target_buf; const void *source_buf; if (origin_datatype != target_datatype) { /* first copy the data into a temporary buffer with the same datatype as the target. Then do the accumulate operation. */ MPIR_Type_get_true_extent_impl(target_datatype, &true_lb, &true_extent); MPID_Datatype_get_extent_macro(target_datatype, extent); MPIU_CHKLMEM_MALLOC(tmp_buf, void *, target_count * (MPIR_MAX(extent,true_extent)), mpi_errno, "temporary buffer"); /* adjust for potential negative lower bound in datatype */ tmp_buf = (void *)((char*)tmp_buf - true_lb); mpi_errno = MPIR_Localcopy(origin_addr, origin_count, origin_datatype, tmp_buf, target_count, target_datatype); if (mpi_errno) { MPIU_ERR_POP(mpi_errno); } } if (target_predefined) { /* target predefined type, origin derived datatype */ (*uop)(tmp_buf, (char *) base + disp_unit * target_disp, &target_count, &target_datatype); } else { segp = MPID_Segment_alloc(); MPIU_ERR_CHKANDJUMP1((!segp), mpi_errno, MPI_ERR_OTHER, "**nomem","**nomem %s","MPID_Segment_alloc"); MPID_Segment_init(NULL, target_count, target_datatype, segp, 0); first = 0; last = SEGMENT_IGNORE_LAST; MPID_Datatype_get_ptr(target_datatype, dtp); vec_len = dtp->max_contig_blocks * target_count + 1; /* +1 needed because Rob says so */ MPIU_CHKLMEM_MALLOC(dloop_vec, DLOOP_VECTOR *, vec_len * sizeof(DLOOP_VECTOR), mpi_errno, "dloop vector"); MPID_Segment_pack_vector(segp, first, &last, dloop_vec, &vec_len); source_buf = (tmp_buf != NULL) ? tmp_buf : origin_addr; target_buf = (char *) base + disp_unit * target_disp; type = dtp->eltype; type_size = MPID_Datatype_get_basic_size(type); for (i=0; i<vec_len; i++) { count = (dloop_vec[i].DLOOP_VECTOR_LEN)/type_size; (*uop)((char *)source_buf + MPIU_PtrToAint(dloop_vec[i].DLOOP_VECTOR_BUF), (char *)target_buf + MPIU_PtrToAint(dloop_vec[i].DLOOP_VECTOR_BUF), &count, &type); } MPID_Segment_free(segp); } }
int MPID_nem_mxm_issend(MPIDI_VC_t * vc, const void *buf, int count, MPI_Datatype datatype, int rank, int tag, MPID_Comm * comm, int context_offset, MPID_Request ** sreq_ptr) { int mpi_errno = MPI_SUCCESS; MPID_Request *sreq = NULL; MPID_Datatype *dt_ptr; int dt_contig; MPIDI_msg_sz_t data_sz; MPI_Aint dt_true_lb; MPID_nem_mxm_vc_area *vc_area = NULL; MPID_nem_mxm_req_area *req_area = NULL; MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_MXM_ISSEND); MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_MXM_ISSEND); MPIDI_Datatype_get_info(count, datatype, dt_contig, data_sz, dt_ptr, dt_true_lb); /* create a request */ MPIDI_Request_create_sreq(sreq, mpi_errno, goto fn_exit); MPIU_Assert(sreq != NULL); MPIDI_Request_set_type(sreq, MPIDI_REQUEST_TYPE_SEND); MPIDI_VC_FAI_send_seqnum(vc, seqnum); MPIDI_Request_set_seqnum(sreq, seqnum); if (HANDLE_GET_KIND(datatype) != HANDLE_KIND_BUILTIN) { MPID_Datatype_get_ptr(datatype, sreq->dev.datatype_ptr); MPID_Datatype_add_ref(sreq->dev.datatype_ptr); } sreq->partner_request = NULL; sreq->dev.OnDataAvail = NULL; sreq->dev.tmpbuf = NULL; sreq->ch.vc = vc; sreq->ch.noncontig = FALSE; _dbg_mxm_output(5, "isSend ========> Sending USER msg for req %p (context %d to %d tag %d size %d) \n", sreq, comm->context_id + context_offset, rank, tag, data_sz); vc_area = VC_BASE(vc); req_area = REQ_BASE(sreq); req_area-> ctx = sreq; req_area->iov_buf = req_area->tmp_buf; req_area->iov_count = 0; req_area->iov_buf[0].ptr = NULL; req_area->iov_buf[0].length = 0; if (data_sz) { if (dt_contig) { req_area->iov_count = 1; req_area->iov_buf[0].ptr = (char *) (buf) + dt_true_lb; req_area->iov_buf[0].length = data_sz; } else { MPIDI_msg_sz_t last; MPI_Aint packsize = 0; sreq->ch.noncontig = TRUE; sreq->dev.segment_ptr = MPID_Segment_alloc(); MPIU_ERR_CHKANDJUMP1((sreq->dev.segment_ptr == NULL), mpi_errno, MPI_ERR_OTHER, "**nomem", "**nomem %s", "MPID_Segment_alloc"); MPIR_Pack_size_impl(count, datatype, &packsize); last = data_sz; if (packsize > 0) { sreq->dev.tmpbuf = MPIU_Malloc((size_t) packsize); MPIU_Assert(sreq->dev.tmpbuf); MPID_Segment_init(buf, count, datatype, sreq->dev.segment_ptr, 0); MPID_Segment_pack(sreq->dev.segment_ptr, 0, &last, sreq->dev.tmpbuf); req_area->iov_count = 1; req_area->iov_buf[0].ptr = sreq->dev.tmpbuf; req_area->iov_buf[0].length = last; } } } vc_area->pending_sends += 1; mpi_errno = _mxm_isend(vc_area->mxm_ep, req_area, MXM_MPICH_ISEND_SYNC, (mxm_mq_h) comm->dev.ch.netmod_priv, comm->rank, tag, _mxm_tag_mpi2mxm(tag, comm->context_id + context_offset), 0); if (mpi_errno) MPIU_ERR_POP(mpi_errno); _dbg_mxm_out_req(sreq); fn_exit: *sreq_ptr = sreq; MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_MXM_ISSEND); return mpi_errno; fn_fail: goto fn_exit; }
/*@ MPI_Unpack_external - Unpack a buffer (packed with MPI_Pack_external) according to a datatype into contiguous memory Input Parameters: + datarep - data representation (string) . inbuf - input buffer start (choice) . insize - input buffer size, in bytes (address integer) . outcount - number of output data items (integer) . datatype - datatype of output data item (handle) Input/Output Parameters: . position - current position in buffer, in bytes (address integer) Output Parameters: . outbuf - output buffer start (choice) .N ThreadSafe .N Fortran .N Errors .N MPI_SUCCESS .N MPI_ERR_TYPE .N MPI_ERR_ARG @*/ int MPI_Unpack_external(const char datarep[], const void *inbuf, MPI_Aint insize, MPI_Aint *position, void *outbuf, int outcount, MPI_Datatype datatype) { static const char FCNAME[] = "MPI_Unpack_external"; int mpi_errno = MPI_SUCCESS; MPI_Aint first, last; MPID_Segment *segp; MPIR_FUNC_TERSE_STATE_DECL(MPID_STATE_MPI_UNPACK_EXTERNAL); MPIR_ERRTEST_INITIALIZED_ORDIE(); MPIR_FUNC_TERSE_ENTER(MPID_STATE_MPI_UNPACK_EXTERNAL); /* Validate parameters, especially handles needing to be converted */ # ifdef HAVE_ERROR_CHECKING { MPID_BEGIN_ERROR_CHECKS; { if (insize > 0) { MPIR_ERRTEST_ARGNULL(inbuf, "input buffer", mpi_errno); } /* NOTE: outbuf could be MPI_BOTTOM; don't test for NULL */ MPIR_ERRTEST_COUNT(insize, mpi_errno); MPIR_ERRTEST_COUNT(outcount, mpi_errno); MPIR_ERRTEST_DATATYPE(datatype, "datatype", mpi_errno); if (datatype != MPI_DATATYPE_NULL && HANDLE_GET_KIND(datatype) != HANDLE_KIND_BUILTIN) { MPIR_Datatype *datatype_ptr = NULL; MPID_Datatype_get_ptr(datatype, datatype_ptr); MPIR_Datatype_valid_ptr(datatype_ptr, mpi_errno); MPID_Datatype_committed_ptr(datatype_ptr, mpi_errno); } /* If datatye_ptr is not valid, it will be reset to null */ if (mpi_errno) goto fn_fail; } MPID_END_ERROR_CHECKS; } # endif /* HAVE_ERROR_CHECKING */ /* ... body of routine ... */ if (insize == 0) { goto fn_exit; } segp = MPID_Segment_alloc(); MPIR_ERR_CHKANDJUMP1((segp == NULL), mpi_errno, MPI_ERR_OTHER, "**nomem", "**nomem %s", "MPID_Segment_alloc"); mpi_errno = MPID_Segment_init(outbuf, outcount, datatype, segp, 1); if (mpi_errno != MPI_SUCCESS) goto fn_fail; /* NOTE: buffer values and positions in MPI_Unpack_external are used very * differently from use in MPID_Segment_unpack_external... */ first = 0; last = SEGMENT_IGNORE_LAST; /* Ensure that pointer increment fits in a pointer */ MPIR_Ensure_Aint_fits_in_pointer((MPIR_VOID_PTR_CAST_TO_MPI_AINT inbuf) + *position); MPID_Segment_unpack_external32(segp, first, &last, (void *) ((char *) inbuf + *position)); *position += last; MPID_Segment_free(segp); if (mpi_errno != MPI_SUCCESS) goto fn_fail; /* ... end of body of routine ... */ fn_exit: MPIR_FUNC_TERSE_EXIT(MPID_STATE_MPI_UNPACK_EXTERNAL); return mpi_errno; fn_fail: /* --BEGIN ERROR HANDLING-- */ # ifdef HAVE_ERROR_CHECKING { mpi_errno = MPIR_Err_create_code( mpi_errno, MPIR_ERR_RECOVERABLE, FCNAME, __LINE__, MPI_ERR_OTHER, "**mpi_unpack_external", "**mpi_unpack_external %s %p %d %p %p %d %D", datarep, inbuf, insize, position, outbuf, outcount, datatype); } # endif mpi_errno = MPIR_Err_return_comm(NULL, FCNAME, mpi_errno); goto fn_exit; /* --END ERROR HANDLING-- */ }
/*@ MPI_Pack_external - Packs a datatype into contiguous memory, using the external32 format Input Parameters: + datarep - data representation (string) . inbuf - input buffer start (choice) . incount - number of input data items (integer) . datatype - datatype of each input data item (handle) - outsize - output buffer size, in bytes (address integer) Output Parameters: . outbuf - output buffer start (choice) Input/Output Parameters: . position - current position in buffer, in bytes (address integer) .N ThreadSafe .N Fortran .N Errors .N MPI_SUCCESS .N MPI_ERR_TYPE .N MPI_ERR_ARG .N MPI_ERR_COUNT @*/ int MPI_Pack_external(const char datarep[], const void *inbuf, int incount, MPI_Datatype datatype, void *outbuf, MPI_Aint outsize, MPI_Aint *position) { static const char FCNAME[] = "MPI_Pack_external"; int mpi_errno = MPI_SUCCESS; MPI_Aint first, last; MPID_Segment *segp; MPID_MPI_STATE_DECL(MPID_STATE_MPI_PACK_EXTERNAL); MPIR_ERRTEST_INITIALIZED_ORDIE(); MPID_MPI_FUNC_ENTER(MPID_STATE_MPI_PACK_EXTERNAL); /* Validate parameters and objects (post conversion) */ # ifdef HAVE_ERROR_CHECKING { MPID_BEGIN_ERROR_CHECKS; { MPIR_ERRTEST_COUNT(incount, mpi_errno); MPIR_ERRTEST_COUNT(outsize, mpi_errno); /* NOTE: inbuf could be null (MPI_BOTTOM) */ if (incount > 0) { MPIR_ERRTEST_ARGNULL(outbuf, "output buffer", mpi_errno); } MPIR_ERRTEST_ARGNULL(position, "position", mpi_errno); MPIR_ERRTEST_DATATYPE(datatype, "datatype", mpi_errno); if (HANDLE_GET_KIND(datatype) != HANDLE_KIND_BUILTIN) { MPID_Datatype *datatype_ptr = NULL; MPID_Datatype_get_ptr(datatype, datatype_ptr); MPID_Datatype_valid_ptr(datatype_ptr, mpi_errno); MPID_Datatype_committed_ptr(datatype_ptr, mpi_errno); if (mpi_errno != MPI_SUCCESS) goto fn_fail; } } MPID_END_ERROR_CHECKS; } # endif /* HAVE_ERROR_CHECKING */ /* ... body of routine ... */ if (incount == 0) { goto fn_exit; } segp = MPID_Segment_alloc(); /* --BEGIN ERROR HANDLING-- */ if (segp == NULL) { mpi_errno = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, FCNAME, __LINE__, MPI_ERR_OTHER, "**nomem", "**nomem %s", "MPID_Segment"); goto fn_fail; } /* --END ERROR HANDLING-- */ mpi_errno = MPID_Segment_init(inbuf, incount, datatype, segp, 1); if (mpi_errno != MPI_SUCCESS) goto fn_fail; /* NOTE: the use of buffer values and positions in MPI_Pack_external and * in MPID_Segment_pack_external are quite different. See code or docs * or something. */ first = 0; last = SEGMENT_IGNORE_LAST; /* Ensure that pointer increment fits in a pointer */ MPID_Ensure_Aint_fits_in_pointer((MPI_VOID_PTR_CAST_TO_MPI_AINT outbuf) + *position); MPID_Segment_pack_external32(segp, first, &last, (void *)((char *) outbuf + *position)); *position += last; MPID_Segment_free(segp); /* ... end of body of routine ... */ fn_exit: MPID_MPI_FUNC_EXIT(MPID_STATE_MPI_PACK_EXTERNAL); return mpi_errno; fn_fail: /* --BEGIN ERROR HANDLING-- */ # ifdef HAVE_ERROR_CHECKING { mpi_errno = MPIR_Err_create_code( mpi_errno, MPIR_ERR_RECOVERABLE, FCNAME, __LINE__, MPI_ERR_OTHER, "**mpi_pack_external", "**mpi_pack_external %s %p %d %D %p %d %p", datarep, inbuf, incount, datatype, outbuf, outsize, position); } # endif mpi_errno = MPIR_Err_return_comm(0, FCNAME, mpi_errno); goto fn_exit; /* --END ERROR HANDLING-- */ }
int MPIDI_CH3U_Receive_data_found(MPID_Request *rreq, char *buf, MPIDI_msg_sz_t *buflen, int *complete) { int dt_contig; MPI_Aint dt_true_lb; MPIDI_msg_sz_t userbuf_sz; MPID_Datatype * dt_ptr = NULL; MPIDI_msg_sz_t data_sz; int mpi_errno = MPI_SUCCESS; MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3U_RECEIVE_DATA_FOUND); MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3U_RECEIVE_DATA_FOUND); MPIU_DBG_MSG(CH3_OTHER,VERBOSE,"posted request found"); MPIDI_Datatype_get_info(rreq->dev.user_count, rreq->dev.datatype, dt_contig, userbuf_sz, dt_ptr, dt_true_lb); if (rreq->dev.recv_data_sz <= userbuf_sz) { data_sz = rreq->dev.recv_data_sz; } else { MPIU_DBG_MSG_FMT(CH3_OTHER,VERBOSE,(MPIU_DBG_FDEST, "receive buffer too small; message truncated, msg_sz=" MPIDI_MSG_SZ_FMT ", userbuf_sz=" MPIDI_MSG_SZ_FMT, rreq->dev.recv_data_sz, userbuf_sz)); rreq->status.MPI_ERROR = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, FCNAME, __LINE__, MPI_ERR_TRUNCATE, "**truncate", "**truncate %d %d %d %d", rreq->status.MPI_SOURCE, rreq->status.MPI_TAG, rreq->dev.recv_data_sz, userbuf_sz ); rreq->status.count = userbuf_sz; data_sz = userbuf_sz; } if (dt_contig && data_sz == rreq->dev.recv_data_sz) { /* user buffer is contiguous and large enough to store the entire message. However, we haven't yet *read* the data (this code describes how to read the data into the destination) */ /* if all of the data has already been received, unpack it now, otherwise build an iov and let the channel unpack */ if (*buflen >= data_sz) { MPIU_DBG_MSG(CH3_OTHER,VERBOSE,"Copying contiguous data to user buffer"); /* copy data out of the receive buffer */ MPIU_Memcpy((char*)(rreq->dev.user_buf) + dt_true_lb, buf, data_sz); *buflen = data_sz; *complete = TRUE; } else { MPIU_DBG_MSG(CH3_OTHER,VERBOSE,"IOV loaded for contiguous read"); rreq->dev.iov[0].MPID_IOV_BUF = (MPID_IOV_BUF_CAST)((char*)(rreq->dev.user_buf) + dt_true_lb); rreq->dev.iov[0].MPID_IOV_LEN = data_sz; rreq->dev.iov_count = 1; *buflen = 0; *complete = FALSE; } /* FIXME: We want to set the OnDataAvail to the appropriate function, which depends on whether this is an RMA request or a pt-to-pt request. */ rreq->dev.OnDataAvail = 0; } else { /* user buffer is not contiguous or is too small to hold the entire message */ rreq->dev.segment_ptr = MPID_Segment_alloc( ); MPIU_ERR_CHKANDJUMP1((rreq->dev.segment_ptr == NULL), mpi_errno, MPI_ERR_OTHER, "**nomem", "**nomem %s", "MPID_Segment_alloc"); MPID_Segment_init(rreq->dev.user_buf, rreq->dev.user_count, rreq->dev.datatype, rreq->dev.segment_ptr, 0); rreq->dev.segment_first = 0; rreq->dev.segment_size = data_sz; /* if all of the data has already been received, and the message is not truncated, unpack it now, otherwise build an iov and let the channel unpack */ if (data_sz == rreq->dev.recv_data_sz && *buflen >= data_sz) { MPIDI_msg_sz_t last; MPIU_DBG_MSG(CH3_OTHER,VERBOSE,"Copying noncontiguous data to user buffer"); last = data_sz; MPID_Segment_unpack(rreq->dev.segment_ptr, rreq->dev.segment_first, &last, buf); /* --BEGIN ERROR HANDLING-- */ if (last != data_sz) { /* If the data can't be unpacked, the we have a mismatch between the datatype and the amount of data received. Throw away received data. */ MPIU_ERR_SET(rreq->status.MPI_ERROR, MPI_ERR_TYPE, "**dtypemismatch"); rreq->status.count = (int)rreq->dev.segment_first; *buflen = data_sz; *complete = TRUE; /* FIXME: Set OnDataAvail to 0? If not, why not? */ goto fn_exit; } /* --END ERROR HANDLING-- */ *buflen = data_sz; rreq->dev.OnDataAvail = 0; *complete = TRUE; } else { MPIU_DBG_MSG(CH3_OTHER,VERBOSE,"IOV loaded for non-contiguous read"); mpi_errno = MPIDI_CH3U_Request_load_recv_iov(rreq); if (mpi_errno != MPI_SUCCESS) { MPIU_ERR_SETFATALANDJUMP(mpi_errno,MPI_ERR_OTHER, "**ch3|loadrecviov"); } *buflen = 0; *complete = FALSE; } } fn_exit: MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3U_RECEIVE_DATA_FOUND); return mpi_errno; fn_fail: goto fn_exit; }