/**
 * Fill in dt->map / dt->num_contig with the list of contiguous pieces that
 * make up the datatype described by dt.
 *
 * Contiguous types use the single embedded entry dt->__map, holding true_lb
 * (stored as a pointer-sized offset) and the type size.  Non-contiguous
 * types get a heap array sized max_contig_blocks * count + 1, which
 * MPID_Segment_pack_vector() then fills; on return dt->num_contig holds the
 * number of entries actually written.
 *
 * \param[in,out] dt  Datatype descriptor to populate.
 */
void MPIDI_Win_datatype_map(MPIDI_Datatype * dt)
{
    unsigned capacity;
    DLOOP_Offset end_offset;
    MPID_Segment segment;

    if (dt->contig) {
        /* One block only: no allocation, use the slot embedded in dt. */
        dt->map = &dt->__map;
        dt->num_contig = 1;
        dt->map[0].DLOOP_VECTOR_BUF = (void *) (size_t) dt->true_lb;
        dt->map[0].DLOOP_VECTOR_LEN = dt->size;
        return;
    }

    /* Upper bound on the number of vector entries the segment code may emit. */
    capacity = dt->pointer->max_contig_blocks * dt->count + 1;
    dt->num_contig = capacity;
    dt->map = (DLOOP_VECTOR *) MPIU_Malloc(capacity * sizeof(DLOOP_VECTOR));
    MPID_assert(dt->map != NULL);

    /* The segment is initialized with a NULL buffer, so the BUF fields
     * written by pack_vector are offsets rather than real addresses. */
    end_offset = dt->pointer->size * dt->count;
    MPID_Segment_init(NULL, dt->count, dt->type, &segment, 0);
    MPID_Segment_pack_vector(&segment, 0, &end_offset, dt->map, &dt->num_contig);
    MPID_assert((unsigned) dt->num_contig <= capacity);

#ifdef TRACE_ON
    TRACE_ERR("dt->pointer->size=%d num_contig: orig=%u new=%d\n", dt->pointer->size, capacity, dt->num_contig);
    int idx;
    for (idx = 0; idx < dt->num_contig; ++idx)
        TRACE_ERR(" %d: BUF=%zu LEN=%zu\n", idx, (size_t) dt->map[idx].DLOOP_VECTOR_BUF, (size_t) dt->map[idx].DLOOP_VECTOR_LEN);
#endif
}
/**
 * Build an IOV-style vector describing where data of a segment lives, for
 * use on the unpack (receive) side.
 *
 * This is a thin wrapper: all arguments are forwarded unchanged to
 * MPID_Segment_pack_vector(); the only addition is the function enter/exit
 * state tracing.
 *
 * \param segp     Segment to describe.
 * \param first    Forwarded as the `first` argument of the pack routine.
 * \param lastp    Forwarded in/out `last` argument.
 * \param vectorp  Output vector array.
 * \param lengthp  Forwarded in/out vector length argument.
 */
void MPID_Segment_unpack_vector(struct DLOOP_Segment *segp,
                                DLOOP_Offset first,
                                DLOOP_Offset *lastp,
                                DLOOP_VECTOR *vectorp,
                                int *lengthp)
{
    MPIDI_STATE_DECL(MPID_STATE_MPID_SEGMENT_UNPACK_VECTOR);

    MPIDI_FUNC_ENTER(MPID_STATE_MPID_SEGMENT_UNPACK_VECTOR);

    /* The pack routine does all the work. */
    MPID_Segment_pack_vector(segp, first, lastp, vectorp, lengthp);

    MPIDI_FUNC_EXIT(MPID_STATE_MPID_SEGMENT_UNPACK_VECTOR);
}
/* NOTE(review): the statements up to the first closing brace are the tail of
 * a send routine whose signature and local declarations start before this
 * excerpt; its exit macro names it MPID_NEM_MXM_ISSEND.  Comments below only
 * describe what the visible statements do. */

    /* create a request */
    MPIDI_Request_create_sreq(sreq, mpi_errno, goto fn_exit);
    MPIU_Assert(sreq != NULL);
    MPIDI_Request_set_type(sreq, MPIDI_REQUEST_TYPE_SEND);
    MPIDI_VC_FAI_send_seqnum(vc, seqnum);
    MPIDI_Request_set_seqnum(sreq, seqnum);

    /* Non-builtin datatypes: look up the datatype object and take a
     * reference on it for this request. */
    if (HANDLE_GET_KIND(datatype) != HANDLE_KIND_BUILTIN) {
        MPID_Datatype_get_ptr(datatype, sreq->dev.datatype_ptr);
        MPID_Datatype_add_ref(sreq->dev.datatype_ptr);
    }
    sreq->partner_request = NULL;
    sreq->dev.OnDataAvail = NULL;
    sreq->dev.tmpbuf = NULL;
    sreq->ch.vc = vc;
    sreq->ch.noncontig = FALSE;

    _dbg_mxm_output(5,
                    "isSend ========> Sending USER msg for req %p (context %d to %d tag %d size %d) \n",
                    sreq, comm->context_id + context_offset, rank, tag, data_sz);

    vc_area = VC_BASE(vc);
    req_area = REQ_BASE(sreq);

    /* Start from an empty IOV that points at the request's built-in
     * tmp_buf array; ctx is what the completion callback later receives. */
    req_area-> ctx = sreq;
    req_area->iov_buf = req_area->tmp_buf;
    req_area->iov_count = 0;
    req_area->iov_buf[0].ptr = NULL;
    req_area->iov_buf[0].length = 0;

    if (data_sz) {
        if (dt_contig) {
            /* Contiguous data is sent in place from the user buffer. */
            req_area->iov_count = 1;
            req_area->iov_buf[0].ptr = (char *) (buf) + dt_true_lb;
            req_area->iov_buf[0].length = data_sz;
        } else {
            /* Non-contiguous data is packed into a temporary buffer of
             * packsize bytes and sent as a single IOV entry. */
            MPIDI_msg_sz_t last;
            MPI_Aint packsize = 0;

            sreq->ch.noncontig = TRUE;
            sreq->dev.segment_ptr = MPID_Segment_alloc();
            MPIU_ERR_CHKANDJUMP1((sreq->dev.segment_ptr == NULL), mpi_errno, MPI_ERR_OTHER,
                                 "**nomem", "**nomem %s", "MPID_Segment_alloc");
            MPIR_Pack_size_impl(count, datatype, &packsize);
            last = data_sz;
            if (packsize > 0) {
                sreq->dev.tmpbuf = MPIU_Malloc((size_t) packsize);
                MPIU_Assert(sreq->dev.tmpbuf);
                MPID_Segment_init(buf, count, datatype, sreq->dev.segment_ptr, 0);
                MPID_Segment_pack(sreq->dev.segment_ptr, 0, &last, sreq->dev.tmpbuf);
                req_area->iov_count = 1;
                req_area->iov_buf[0].ptr = sreq->dev.tmpbuf;
                /* last now holds the value written back by MPID_Segment_pack. */
                req_area->iov_buf[0].length = last;
            }
        }
    }

    /* NOTE(review): pending_sends is bumped before the send is issued and is
     * not decremented on this path if _mxm_isend() fails; likewise tmpbuf is
     * not released here on failure.  Confirm whether callers compensate. */
    vc_area->pending_sends += 1;
    mpi_errno = _mxm_isend(vc_area->mxm_ep, req_area, MXM_MPICH_ISEND_SYNC,
                           (mxm_mq_h) comm->dev.ch.netmod_priv, comm->rank, tag,
                           _mxm_tag_mpi2mxm(tag, comm->context_id + context_offset), 0);
    if (mpi_errno) MPIU_ERR_POP(mpi_errno);

    _dbg_mxm_out_req(sreq);

  fn_exit:
    /* The request pointer is handed back on both success and failure paths. */
    *sreq_ptr = sreq;
    MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_MXM_ISSEND);
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}

/*
 * Bookkeeping performed once a send request has finished on the wire.
 *
 * Decrements the VC's pending_sends counter, frees the pack buffer (only when
 * the request has both a datatype_ptr and a tmpbuf), releases iov_buf when
 * iov_count exceeds MXM_MPICH_MAX_IOV and points it back at tmp_buf, and then
 * either completes the request or calls its OnDataAvail handler.
 *
 * Returns the `complete` flag: FALSE when no OnDataAvail handler is installed
 * (even though MPIDI_CH3U_Request_complete() was called), otherwise whatever
 * the handler stored.
 */
static int _mxm_handle_sreq(MPID_Request * req)
{
    int complete = FALSE;
    int (*reqFn) (MPIDI_VC_t *, MPID_Request *, int *);
    MPID_nem_mxm_vc_area *vc_area = NULL;
    MPID_nem_mxm_req_area *req_area = NULL;

    vc_area = VC_BASE(req->ch.vc);
    req_area = REQ_BASE(req);

    /* Debug dump of at most the first 16 bytes of the first IOV entry. */
    _dbg_mxm_out_buf(req_area->iov_buf[0].ptr,
                     (req_area->iov_buf[0].length > 16 ? 16 : req_area->iov_buf[0].length));

    vc_area->pending_sends -= 1;

    /* NOTE(review): tmpbuf is freed but not reset to NULL here. */
    if (((req->dev.datatype_ptr != NULL) && (req->dev.tmpbuf != NULL))) {
        MPIU_Free(req->dev.tmpbuf);
    }

    if (req_area->iov_count > MXM_MPICH_MAX_IOV) {
        MPIU_Free(req_area->iov_buf);
        req_area->iov_buf = req_area->tmp_buf;
        req_area->iov_count = 0;
    }

    reqFn = req->dev.OnDataAvail;
    if (!reqFn) {
        MPIDI_CH3U_Request_complete(req);
        MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, ".... complete");
    } else {
        MPIDI_VC_t *vc = req->ch.vc;

        reqFn(vc, req, &complete);
        /* The handler is expected to finish the request; anything else
         * trips the assertion. */
        if (!complete) {
            MPIU_Assert(complete == TRUE);
        }
    }

    return complete;
}

/*
 * Completion callback installed on every MXM send request by _mxm_isend().
 *
 * `context` is the value stored in base.context, i.e. the req_area ctx field,
 * and is treated as the owning MPID_Request.  The MXM error code is
 * translated into req->status, the mxm_req is put back on the endpoint's
 * free_queue, and, unless the status carries the cancel bit, the request is
 * passed to _mxm_handle_sreq().
 */
static void _mxm_send_completion_cb(void *context)
{
    MPID_Request *req = (MPID_Request *) context;
    MPID_nem_mxm_vc_area *vc_area = NULL;
    MPID_nem_mxm_req_area *req_area = NULL;

    MPIU_Assert(req);
    _dbg_mxm_out_req(req);

    vc_area = VC_BASE(req->ch.vc);
    req_area = REQ_BASE(req);

    _mxm_to_mpi_status(req_area->mxm_req->item.base.error, &req->status);

    list_enqueue(&vc_area->mxm_ep->free_queue, &req_area->mxm_req->queue);

    _dbg_mxm_output(5, "========> %s SEND req %p status %d\n",
                    (MPIR_STATUS_GET_CANCEL_BIT(req->status) ? "Canceling" : "Completing"),
                    req, req->status.MPI_ERROR);

    if (likely(!MPIR_STATUS_GET_CANCEL_BIT(req->status))) {
        _mxm_handle_sreq(req);
    }
}

/*
 * Fill in and post one MXM send request.
 *
 * ep        endpoint supplying the free_queue of mxm requests and the
 *           connection (mxm_conn)
 * req       request area; its iov_buf/iov_count describe the payload and its
 *           ctx is passed to the completion callback
 * type      MXM_MPICH_ISEND_AM, MXM_MPICH_ISEND_SYNC, or anything else for a
 *           plain send
 * mxm_mq    message queue stored in base.mq
 * mxm_rank  stored as imm_data
 * id        used only for the AM type, as op.am.hid
 * mxm_tag   used only for the non-AM types, as op.send.tag
 * block     non-zero: wait for the request via _mxm_req_wait() before returning
 *
 * Returns MPI_SUCCESS, or MPI_ERR_OTHER when no mxm request could be obtained
 * from the free queue (even after growing it) or mxm_req_send() fails; in the
 * latter case the mxm request is returned to the free queue.
 */
static int _mxm_isend(MPID_nem_mxm_ep_t * ep, MPID_nem_mxm_req_area * req,
                      int type, mxm_mq_h mxm_mq, int mxm_rank, int id, mxm_tag_t mxm_tag, int block)
{
    int mpi_errno = MPI_SUCCESS;
    mxm_error_t ret = MXM_OK;
    mxm_send_req_t *mxm_sreq;
    list_head_t *free_queue = NULL;

    MPIU_Assert(ep);
    MPIU_Assert(req);

    /* Take an mxm request from the free queue, growing the queue once if it
     * is empty. */
    free_queue = &ep->free_queue;
    req->mxm_req = list_dequeue_mxm_req(free_queue);
    if (!req->mxm_req) {
        list_grow_mxm_req(free_queue);
        req->mxm_req = list_dequeue_mxm_req(free_queue);
        if (!req->mxm_req) {
            MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, "empty free queue");
            mpi_errno = MPI_ERR_OTHER;
            goto fn_fail;
        }
    }
    mxm_sreq = &(req->mxm_req->item.send);

    mxm_sreq->base.state = MXM_REQ_NEW;
    mxm_sreq->base.mq = mxm_mq;
    mxm_sreq->base.conn = ep->mxm_conn;
    mxm_sreq->base.completed_cb = _mxm_send_completion_cb;
    mxm_sreq->base.context = req->ctx;

    /* Operation-specific fields. */
    if (type == MXM_MPICH_ISEND_AM) {
        mxm_sreq->opcode = MXM_REQ_OP_AM;
        mxm_sreq->flags = 0;

        mxm_sreq->op.am.hid = id;
        mxm_sreq->op.am.imm_data = mxm_rank;
    } else if (type == MXM_MPICH_ISEND_SYNC) {
        mxm_sreq->opcode = MXM_REQ_OP_SEND_SYNC;
        mxm_sreq->flags = 0;

        mxm_sreq->op.send.tag = mxm_tag;
        mxm_sreq->op.send.imm_data = mxm_rank;
    } else {
        mxm_sreq->opcode = MXM_REQ_OP_SEND;
        mxm_sreq->flags = 0;

        mxm_sreq->op.send.tag = mxm_tag;
        mxm_sreq->op.send.imm_data = mxm_rank;
    }

    /* A single entry is sent as a plain buffer; any other count (including
     * zero) goes through the IOV description. */
    if (likely(req->iov_count == 1)) {
        mxm_sreq->base.data_type = MXM_REQ_DATA_BUFFER;
        mxm_sreq->base.data.buffer.ptr = req->iov_buf[0].ptr;
        mxm_sreq->base.data.buffer.length = req->iov_buf[0].length;
    } else {
        mxm_sreq->base.data_type = MXM_REQ_DATA_IOV;
        mxm_sreq->base.data.iov.vector = req->iov_buf;
        mxm_sreq->base.data.iov.count = req->iov_count;
    }

    ret = mxm_req_send(mxm_sreq);
    if (MXM_OK != ret) {
        /* Posting failed: give the mxm request back to the pool. */
        list_enqueue(free_queue, &req->mxm_req->queue);
        mpi_errno = MPI_ERR_OTHER;
        goto fn_fail;
    }

    if (block) _mxm_req_wait(&mxm_sreq->base);

  fn_exit:
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}

/* NOTE(review): the #endif matching the following #if 0 is not part of this
 * excerpt. */
#if 0   /* Consider using this function in case non contiguous data */
/*
 * (Compiled out.)  Describe a non-contiguous send buffer as an MXM IOV.
 *
 * Counts the contiguous blocks of the datatype, builds an MPID_IOV for them,
 * and copies the first MXM_REQ_DATA_MAX_IOV - 1 entries into *iov_buf.  Any
 * remaining entries are copied back to back into a freshly allocated
 * sreq->dev.tmpbuf, which becomes the last IOV entry.  *iov_count receives
 * the number of entries in use.
 *
 * NOTE(review): *iov_buf is reallocated only when n_iov exceeds
 * MXM_MPICH_MAX_IOV; otherwise the caller-supplied array is written.  dt_ptr
 * is not referenced in the body.
 */
static int _mxm_process_sdtype(MPID_Request ** sreq_p, MPI_Datatype datatype,
                               MPID_Datatype * dt_ptr, MPIDI_msg_sz_t data_sz,
                               const void *buf, int count,
                               mxm_req_buffer_t ** iov_buf, int *iov_count)
{
    int mpi_errno = MPI_SUCCESS;
    MPID_Request *sreq = *sreq_p;
    MPIDI_msg_sz_t last;
    MPID_IOV *iov;
    int n_iov = 0;
    int index;
    int size_to_copy = 0;

    sreq->dev.segment_ptr = MPID_Segment_alloc();
    MPIU_ERR_CHKANDJUMP1((sreq->dev.segment_ptr == NULL), mpi_errno, MPI_ERR_OTHER,
                         "**nomem", "**nomem %s", "MPID_Segment_alloc");

    MPID_Segment_init(buf, count, datatype, sreq->dev.segment_ptr, 0);
    sreq->dev.segment_first = 0;
    sreq->dev.segment_size = data_sz;

    /* First pass: how many contiguous blocks are there? */
    last = sreq->dev.segment_size;
    MPID_Segment_count_contig_blocks(sreq->dev.segment_ptr, sreq->dev.segment_first, &last,
                                     (MPI_Aint *) & n_iov);
    MPIU_Assert(n_iov > 0);
    iov = MPIU_Malloc(n_iov * sizeof(*iov));
    MPIU_Assert(iov);

    /* Second pass: fill the temporary IOV. */
    last = sreq->dev.segment_size;
    MPID_Segment_pack_vector(sreq->dev.segment_ptr, sreq->dev.segment_first, &last, iov, &n_iov);
    MPIU_Assert(last == sreq->dev.segment_size);

#if defined(MXM_DEBUG) && (MXM_DEBUG > 0)
    _dbg_mxm_output(7, "Send Noncontiguous data vector %i entries (free slots : %i)\n", n_iov, MXM_REQ_DATA_MAX_IOV);
    for(index = 0; index < n_iov; index++) {
        _dbg_mxm_output(7, "======= Recv iov[%i] = ptr : %p, len : %i \n", index, iov[index].MPID_IOV_BUF, iov[index].MPID_IOV_LEN);
    }
#endif

    if (n_iov > MXM_MPICH_MAX_IOV) {
        *iov_buf = (mxm_req_buffer_t *) MPIU_Malloc(n_iov * sizeof(**iov_buf));
        MPIU_Assert(*iov_buf);
    }

    /* Entries that fit are referenced directly; the rest are only summed up
     * here and copied below. */
    for (index = 0; index < n_iov; index++) {
        if (index < (MXM_REQ_DATA_MAX_IOV - 1)) {
            (*iov_buf)[index].ptr = iov[index].MPID_IOV_BUF;
            (*iov_buf)[index].length = iov[index].MPID_IOV_LEN;
        } else {
            size_to_copy += iov[index].MPID_IOV_LEN;
        }
    }

    if (size_to_copy == 0) {
        sreq->dev.tmpbuf = NULL;
        sreq->dev.tmpbuf_sz = 0;
        *iov_count = n_iov;
    } else {
        int offset = 0;

        sreq->dev.tmpbuf = MPIU_Malloc(size_to_copy);
        sreq->dev.tmpbuf_sz = size_to_copy;
        MPIU_Assert(sreq->dev.tmpbuf);
        for (index = (MXM_REQ_DATA_MAX_IOV - 1); index < n_iov; index++) {
            MPIU_Memcpy((char *) (sreq->dev.tmpbuf) + offset, iov[index].MPID_IOV_BUF, iov[index].MPID_IOV_LEN);
            offset += iov[index].MPID_IOV_LEN;
        }
        /* The overflow entries travel as one trailing IOV element. */
        (*iov_buf)[MXM_REQ_DATA_MAX_IOV - 1].ptr = sreq->dev.tmpbuf;
        (*iov_buf)[MXM_REQ_DATA_MAX_IOV - 1].length = size_to_copy;
        *iov_count = MXM_REQ_DATA_MAX_IOV;
    }
    MPIU_Free(iov);

  fn_exit:
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}
/**
 * Load the next portion of a send request's segment into an IOV.
 *
 * On entry *iov_n is the capacity of iov (1..MPID_IOV_LIMIT); on return it is
 * the number of entries in use.  Three outcomes are possible after the
 * segment has been described with MPID_Segment_pack_vector():
 *
 *  1. Everything up to segment_size was described: OnDataAvail is set to the
 *     request's OnFinal handler.
 *  2. Only part was described, but the average entry length is at least
 *     MPIDI_IOV_DENSITY_MIN: segment_first advances and OnDataAvail is set to
 *     MPIDI_CH3_ReqHandler_SendReloadIOV.
 *  3. Otherwise (low density) the described pieces are copied into the
 *     send/receive buffer (tmpbuf), further data is packed behind them, and
 *     the IOV is collapsed to the single entry {tmpbuf, bytes}.
 *
 * \param sreq   Send request; dev.segment_ptr must be set.
 * \param iov    IOV array to fill.
 * \param iov_n  In: capacity of iov.  Out: entries used.
 * \return MPI_SUCCESS, or an error code (also stored in
 *         sreq->status.MPI_ERROR) when the SRBuf allocation yields size 0.
 */
int MPIDI_CH3U_Request_load_send_iov(MPID_Request * const sreq,
                                     MPID_IOV * const iov, int * const iov_n)
{
    MPI_Aint seg_last;
    MPIDI_msg_sz_t remaining;
    int k, copied;
    int mpi_errno = MPI_SUCCESS;
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3U_REQUEST_LOAD_SEND_IOV);

    MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3U_REQUEST_LOAD_SEND_IOV);
    MPIU_Assert(sreq->dev.segment_ptr != NULL);

    seg_last = sreq->dev.segment_size;
    MPIU_DBG_MSG_FMT(CH3_CHANNEL,VERBOSE,(MPIU_DBG_FDEST,
        "pre-pv: first=" MPIDI_MSG_SZ_FMT ", last=" MPIDI_MSG_SZ_FMT ", iov_n=%d",
        sreq->dev.segment_first, seg_last, *iov_n));
    MPIU_Assert(sreq->dev.segment_first < seg_last);
    MPIU_Assert(seg_last > 0);
    MPIU_Assert(*iov_n > 0 && *iov_n <= MPID_IOV_LIMIT);

    MPID_Segment_pack_vector(sreq->dev.segment_ptr, sreq->dev.segment_first,
                             &seg_last, iov, iov_n);

    MPIU_DBG_MSG_FMT(CH3_CHANNEL,VERBOSE,(MPIU_DBG_FDEST,
        "post-pv: first=" MPIDI_MSG_SZ_FMT ", last=" MPIDI_MSG_SZ_FMT ", iov_n=%d",
        sreq->dev.segment_first, seg_last, *iov_n));
    MPIU_Assert(*iov_n > 0 && *iov_n <= MPID_IOV_LIMIT);

    /* Case 1: the IOV reaches the end of the segment. */
    if (seg_last == sreq->dev.segment_size) {
        MPIU_DBG_MSG(CH3_CHANNEL,VERBOSE,"remaining data loaded into IOV");
        sreq->dev.OnDataAvail = sreq->dev.OnFinal;
        goto fn_exit;
    }

    /* Case 2: partial, but dense enough to send straight from the IOV. */
    if ((seg_last - sreq->dev.segment_first) / *iov_n >= MPIDI_IOV_DENSITY_MIN) {
        MPIU_DBG_MSG(CH3_CHANNEL,VERBOSE,"more data loaded into IOV");
        sreq->dev.segment_first = seg_last;
        sreq->dev.OnDataAvail = MPIDI_CH3_ReqHandler_SendReloadIOV;
        goto fn_exit;
    }

    /* Case 3: too many small pieces; stage the data in the SRBuf instead. */
    MPIU_DBG_MSG(CH3_CHANNEL,VERBOSE,"low density. using SRBuf.");

    /* Measured before segment_first is advanced below. */
    remaining = sreq->dev.segment_size - sreq->dev.segment_first;
    if (!MPIDI_Request_get_srbuf_flag(sreq)) {
        MPIDI_CH3U_SRBuf_alloc(sreq, remaining);
        /* --BEGIN ERROR HANDLING-- */
        if (sreq->dev.tmpbuf_sz == 0) {
            MPIU_DBG_MSG(CH3_CHANNEL,TYPICAL,"SRBuf allocation failure");
            mpi_errno = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_FATAL,
                                             FCNAME, __LINE__, MPI_ERR_OTHER,
                                             "**nomem", "**nomem %d", remaining);
            sreq->status.MPI_ERROR = mpi_errno;
            goto fn_exit;
        }
        /* --END ERROR HANDLING-- */
    }

    /* Copy what the IOV already describes to the front of the buffer. */
    copied = 0;
    for (k = 0; k < *iov_n; k++) {
        MPIU_Memcpy((char*) sreq->dev.tmpbuf + copied,
                    iov[k].MPID_IOV_BUF, iov[k].MPID_IOV_LEN);
        copied += iov[k].MPID_IOV_LEN;
    }
    sreq->dev.segment_first = seg_last;

    /* Pack either to the end of the segment or as far as the buffer's
     * remaining space allows. */
    seg_last = (remaining <= sreq->dev.tmpbuf_sz - copied) ?
        sreq->dev.segment_size :
        sreq->dev.segment_first + sreq->dev.tmpbuf_sz - copied;

    MPIU_DBG_MSG_FMT(CH3_CHANNEL,VERBOSE,(MPIU_DBG_FDEST,
        "pre-pack: first=" MPIDI_MSG_SZ_FMT ", last=" MPIDI_MSG_SZ_FMT,
        sreq->dev.segment_first, seg_last));
    MPID_Segment_pack(sreq->dev.segment_ptr, sreq->dev.segment_first,
                      &seg_last, (char*) sreq->dev.tmpbuf + copied);
    MPIU_DBG_MSG_FMT(CH3_CHANNEL,VERBOSE,(MPIU_DBG_FDEST,
        "post-pack: first=" MPIDI_MSG_SZ_FMT ", last=" MPIDI_MSG_SZ_FMT,
        sreq->dev.segment_first, seg_last));

    /* The whole staged region goes out as one IOV entry. */
    iov[0].MPID_IOV_BUF = (MPID_IOV_BUF_CAST)sreq->dev.tmpbuf;
    iov[0].MPID_IOV_LEN = seg_last - sreq->dev.segment_first + copied;
    *iov_n = 1;

    if (seg_last != sreq->dev.segment_size) {
        MPIU_DBG_MSG(CH3_CHANNEL,VERBOSE,"more data packed into SRBuf");
        sreq->dev.segment_first = seg_last;
        sreq->dev.OnDataAvail = MPIDI_CH3_ReqHandler_SendReloadIOV;
    } else {
        MPIU_DBG_MSG(CH3_CHANNEL,VERBOSE,"remaining data packed into SRBuf");
        sreq->dev.OnDataAvail = sreq->dev.OnFinal;
    }

  fn_exit:
    MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3U_REQUEST_LOAD_SEND_IOV);
    return mpi_errno;
}
int MPID_nem_ptl_recv_posted(MPIDI_VC_t *vc, MPID_Request *rreq) { int mpi_errno = MPI_SUCCESS; MPID_nem_ptl_vc_area *const vc_ptl = VC_PTL(vc); ptl_me_t me; int dt_contig; MPIDI_msg_sz_t data_sz; MPID_Datatype *dt_ptr; MPI_Aint dt_true_lb; MPI_Aint last; ptl_process_t id_any; int ret; MPIU_CHKPMEM_DECL(1); MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_PTL_RECV_POSTED); MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_PTL_RECV_POSTED); id_any.phys.nid = PTL_NID_ANY; id_any.phys.pid = PTL_PID_ANY; MPID_nem_ptl_init_req(rreq); me.ct_handle = PTL_CT_NONE; me.uid = PTL_UID_ANY; me.options = ( PTL_ME_OP_PUT | PTL_ME_IS_ACCESSIBLE | PTL_ME_EVENT_LINK_DISABLE | PTL_ME_EVENT_UNLINK_DISABLE | PTL_ME_USE_ONCE ); if (vc == NULL) { /* MPI_ANY_SOURCE receive */ me.match_id = id_any; } else { if (!vc_ptl->id_initialized) { mpi_errno = MPID_nem_ptl_init_id(vc); if (mpi_errno) MPIR_ERR_POP(mpi_errno); } me.match_id = vc_ptl->id; } MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "tag=%#x ctx=%#x rank=%#x", rreq->dev.match.parts.tag, rreq->dev.match.parts.context_id, rreq->dev.match.parts.rank)); me.match_bits = NPTL_MATCH(rreq->dev.match.parts.tag, rreq->dev.match.parts.context_id, rreq->dev.match.parts.rank); if (rreq->dev.match.parts.tag == MPI_ANY_TAG) me.ignore_bits = NPTL_MATCH_IGNORE_ANY_TAG; else me.ignore_bits = NPTL_MATCH_IGNORE; me.min_free = 0; MPIDI_Datatype_get_info(rreq->dev.user_count, rreq->dev.datatype, dt_contig, data_sz, dt_ptr, dt_true_lb); MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "count="MPI_AINT_FMT_DEC_SPEC" datatype=%#x contig=%d data_sz=%lu", rreq->dev.user_count, rreq->dev.datatype, dt_contig, data_sz)); if (data_sz <= PTL_LARGE_THRESHOLD) { if (dt_contig) { /* small contig message */ void *start = (char *)rreq->dev.user_buf + dt_true_lb; MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, "Small contig message"); if (start == NULL) me.start = &dummy; else me.start = start; me.length = data_sz; REQ_PTL(rreq)->event_handler = handler_recv_dequeue_complete; } else { /* 
small noncontig */ MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, "Small noncontig message"); rreq->dev.segment_ptr = MPID_Segment_alloc(); MPIR_ERR_CHKANDJUMP1(rreq->dev.segment_ptr == NULL, mpi_errno, MPI_ERR_OTHER, "**nomem", "**nomem %s", "MPID_Segment_alloc"); MPID_Segment_init(rreq->dev.user_buf, rreq->dev.user_count, rreq->dev.datatype, rreq->dev.segment_ptr, 0); rreq->dev.segment_first = 0; rreq->dev.segment_size = data_sz; last = rreq->dev.segment_size; rreq->dev.iov_count = MPL_IOV_LIMIT; MPID_Segment_pack_vector(rreq->dev.segment_ptr, rreq->dev.segment_first, &last, rreq->dev.iov, &rreq->dev.iov_count); if (last == rreq->dev.segment_size) { /* entire message fits in IOV */ MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, " entire message fits in IOV"); me.start = rreq->dev.iov; me.length = rreq->dev.iov_count; me.options |= PTL_IOVEC; REQ_PTL(rreq)->event_handler = handler_recv_dequeue_complete; } else { /* IOV is not long enough to describe entire message: recv into buffer and unpack later */ MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, " IOV too long: using bounce buffer"); MPIU_CHKPMEM_MALLOC(REQ_PTL(rreq)->chunk_buffer[0], void *, data_sz, mpi_errno, "chunk_buffer"); me.start = REQ_PTL(rreq)->chunk_buffer[0]; me.length = data_sz; REQ_PTL(rreq)->event_handler = handler_recv_dequeue_unpack_complete; } } } else { /* Large message: Create an ME for the first chunk of data, then do a GET for the rest */ if (dt_contig) {
static int handler_recv_dequeue_large(const ptl_event_t *e) { int mpi_errno = MPI_SUCCESS; MPID_Request *const rreq = e->user_ptr; MPIDI_VC_t *vc; MPID_nem_ptl_vc_area *vc_ptl; int ret; int dt_contig; MPIDI_msg_sz_t data_sz; MPID_Datatype *dt_ptr; MPI_Aint dt_true_lb; MPI_Aint last; MPIU_CHKPMEM_DECL(1); MPIDI_STATE_DECL(MPID_STATE_HANDLER_RECV_DEQUEUE_LARGE); MPIDI_FUNC_ENTER(MPID_STATE_HANDLER_RECV_DEQUEUE_LARGE); MPIU_Assert(e->type == PTL_EVENT_PUT || e->type == PTL_EVENT_PUT_OVERFLOW); MPIDI_Comm_get_vc(rreq->comm, NPTL_MATCH_GET_RANK(e->match_bits), &vc); vc_ptl = VC_PTL(vc); dequeue_req(e); MPIDI_Datatype_get_info(rreq->dev.user_count, rreq->dev.datatype, dt_contig, data_sz, dt_ptr, dt_true_lb); /* unpack data from unexpected buffer first */ if (e->type == PTL_EVENT_PUT_OVERFLOW) { if (dt_contig) { MPIU_Memcpy((char *)rreq->dev.user_buf + dt_true_lb, e->start, e->mlength); } else { last = e->mlength; MPID_Segment_unpack(rreq->dev.segment_ptr, 0, &last, e->start); MPIU_Assert(last == e->mlength); rreq->dev.segment_first = e->mlength; } } if (!(e->hdr_data & NPTL_LARGE)) { /* all data has already been received; we're done */ mpi_errno = handler_recv_complete(e); if (mpi_errno) MPIR_ERR_POP(mpi_errno); goto fn_exit; } MPIU_Assert (e->mlength == PTL_LARGE_THRESHOLD); /* we need to GET the rest of the data from the sender's buffer */ if (dt_contig) { big_get((char *)rreq->dev.user_buf + dt_true_lb + PTL_LARGE_THRESHOLD, data_sz - PTL_LARGE_THRESHOLD, vc, e->match_bits, rreq); goto fn_exit; } /* noncontig recv buffer */ last = rreq->dev.segment_size; rreq->dev.iov_count = MPL_IOV_LIMIT; MPID_Segment_pack_vector(rreq->dev.segment_ptr, rreq->dev.segment_first, &last, rreq->dev.iov, &rreq->dev.iov_count); if (last == rreq->dev.segment_size && rreq->dev.segment_size <= MPIDI_nem_ptl_ni_limits.max_msg_size + PTL_LARGE_THRESHOLD) { /* Rest of message fits in one IOV */ ptl_md_t md; md.start = rreq->dev.iov; md.length = rreq->dev.iov_count; md.options = PTL_IOVEC; 
md.eq_handle = MPIDI_nem_ptl_origin_eq; md.ct_handle = PTL_CT_NONE; ret = PtlMDBind(MPIDI_nem_ptl_ni, &md, &REQ_PTL(rreq)->md); MPIR_ERR_CHKANDJUMP1(ret, mpi_errno, MPI_ERR_OTHER, "**ptlmdbind", "**ptlmdbind %s", MPID_nem_ptl_strerror(ret)); REQ_PTL(rreq)->event_handler = handler_recv_complete; ret = MPID_nem_ptl_rptl_get(REQ_PTL(rreq)->md, 0, rreq->dev.segment_size - rreq->dev.segment_first, vc_ptl->id, vc_ptl->ptg, e->match_bits, 0, rreq); MPIR_ERR_CHKANDJUMP1(ret, mpi_errno, MPI_ERR_OTHER, "**ptlget", "**ptlget %s", MPID_nem_ptl_strerror(ret)); goto fn_exit; } /* message won't fit in a single IOV, allocate buffer and unpack when received */ /* FIXME: For now, allocate a single large buffer to hold entire message */ MPIU_CHKPMEM_MALLOC(REQ_PTL(rreq)->chunk_buffer[0], void *, data_sz - PTL_LARGE_THRESHOLD, mpi_errno, "chunk_buffer"); big_get(REQ_PTL(rreq)->chunk_buffer[0], data_sz - PTL_LARGE_THRESHOLD, vc, e->match_bits, rreq); fn_exit: MPIU_CHKPMEM_COMMIT(); fn_exit2: MPIDI_FUNC_EXIT(MPID_STATE_HANDLER_RECV_DEQUEUE_LARGE); return mpi_errno; fn_fail: MPIU_CHKPMEM_REAP(); goto fn_exit2; }
int MPIDI_Get_accumulate(const void *origin_addr, int origin_count, MPI_Datatype origin_datatype, void *result_addr, int result_count, MPI_Datatype result_datatype, int target_rank, MPI_Aint target_disp, int target_count, MPI_Datatype target_datatype, MPI_Op op, MPID_Win *win_ptr) { int mpi_errno = MPI_SUCCESS; MPIDI_msg_sz_t data_sz; int rank, origin_predefined, result_predefined, target_predefined; int shm_locked = 0; int dt_contig ATTRIBUTE((unused)); MPI_Aint dt_true_lb ATTRIBUTE((unused)); MPID_Datatype *dtp; MPIU_CHKLMEM_DECL(2); MPIDI_STATE_DECL(MPID_STATE_MPIDI_GET_ACCUMULATE); MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_GET_ACCUMULATE); if (target_rank == MPI_PROC_NULL) { goto fn_exit; } if (win_ptr->epoch_state == MPIDI_EPOCH_NONE && win_ptr->fence_issued) { win_ptr->epoch_state = MPIDI_EPOCH_FENCE; } MPIU_ERR_CHKANDJUMP(win_ptr->epoch_state == MPIDI_EPOCH_NONE, mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync"); MPIDI_Datatype_get_info(target_count, target_datatype, dt_contig, data_sz, dtp, dt_true_lb); if (data_sz == 0) { goto fn_exit; } rank = win_ptr->myrank; origin_predefined = TRUE; /* quiet uninitialized warnings (b/c goto) */ if (op != MPI_NO_OP) { MPIDI_CH3I_DATATYPE_IS_PREDEFINED(origin_datatype, origin_predefined); } MPIDI_CH3I_DATATYPE_IS_PREDEFINED(result_datatype, result_predefined); MPIDI_CH3I_DATATYPE_IS_PREDEFINED(target_datatype, target_predefined); /* Do =! rank first (most likely branch?) 
*/ if (target_rank == rank || win_ptr->create_flavor == MPI_WIN_FLAVOR_SHARED) { MPI_User_function *uop; void *base; int disp_unit; if (win_ptr->create_flavor == MPI_WIN_FLAVOR_SHARED) { base = win_ptr->shm_base_addrs[target_rank]; disp_unit = win_ptr->disp_units[target_rank]; MPIDI_CH3I_SHM_MUTEX_LOCK(win_ptr); shm_locked = 1; } else { base = win_ptr->base; disp_unit = win_ptr->disp_unit; } /* Perform the local get first, then the accumulate */ mpi_errno = MPIR_Localcopy((char *) base + disp_unit * target_disp, target_count, target_datatype, result_addr, result_count, result_datatype); if (mpi_errno) { MPIU_ERR_POP(mpi_errno); } /* NO_OP: Don't perform the accumulate */ if (op == MPI_NO_OP) { if (shm_locked) { MPIDI_CH3I_SHM_MUTEX_UNLOCK(win_ptr); shm_locked = 0; } goto fn_exit; } if (op == MPI_REPLACE) { mpi_errno = MPIR_Localcopy(origin_addr, origin_count, origin_datatype, (char *) base + disp_unit * target_disp, target_count, target_datatype); if (mpi_errno) { MPIU_ERR_POP(mpi_errno); } if (shm_locked) { MPIDI_CH3I_SHM_MUTEX_UNLOCK(win_ptr); shm_locked = 0; } goto fn_exit; } MPIU_ERR_CHKANDJUMP1((HANDLE_GET_KIND(op) != HANDLE_KIND_BUILTIN), mpi_errno, MPI_ERR_OP, "**opnotpredefined", "**opnotpredefined %d", op ); /* get the function by indexing into the op table */ uop = MPIR_OP_HDL_TO_FN(op); if (origin_predefined && target_predefined) { /* Cast away const'ness for origin_address in order to * avoid changing the prototype for MPI_User_function */ (*uop)((void *) origin_addr, (char *) base + disp_unit*target_disp, &target_count, &target_datatype); } else { /* derived datatype */ MPID_Segment *segp; DLOOP_VECTOR *dloop_vec; MPI_Aint first, last; int vec_len, i, type_size, count; MPI_Datatype type; MPI_Aint true_lb, true_extent, extent; void *tmp_buf=NULL, *target_buf; const void *source_buf; if (origin_datatype != target_datatype) { /* first copy the data into a temporary buffer with the same datatype as the target. Then do the accumulate operation. 
*/ MPIR_Type_get_true_extent_impl(target_datatype, &true_lb, &true_extent); MPID_Datatype_get_extent_macro(target_datatype, extent); MPIU_CHKLMEM_MALLOC(tmp_buf, void *, target_count * (MPIR_MAX(extent,true_extent)), mpi_errno, "temporary buffer"); /* adjust for potential negative lower bound in datatype */ tmp_buf = (void *)((char*)tmp_buf - true_lb); mpi_errno = MPIR_Localcopy(origin_addr, origin_count, origin_datatype, tmp_buf, target_count, target_datatype); if (mpi_errno) { MPIU_ERR_POP(mpi_errno); } } if (target_predefined) { /* target predefined type, origin derived datatype */ (*uop)(tmp_buf, (char *) base + disp_unit * target_disp, &target_count, &target_datatype); } else { segp = MPID_Segment_alloc(); MPIU_ERR_CHKANDJUMP1((!segp), mpi_errno, MPI_ERR_OTHER, "**nomem","**nomem %s","MPID_Segment_alloc"); MPID_Segment_init(NULL, target_count, target_datatype, segp, 0); first = 0; last = SEGMENT_IGNORE_LAST; MPID_Datatype_get_ptr(target_datatype, dtp); vec_len = dtp->max_contig_blocks * target_count + 1; /* +1 needed because Rob says so */ MPIU_CHKLMEM_MALLOC(dloop_vec, DLOOP_VECTOR *, vec_len * sizeof(DLOOP_VECTOR), mpi_errno, "dloop vector"); MPID_Segment_pack_vector(segp, first, &last, dloop_vec, &vec_len); source_buf = (tmp_buf != NULL) ? tmp_buf : origin_addr; target_buf = (char *) base + disp_unit * target_disp; type = dtp->eltype; type_size = MPID_Datatype_get_basic_size(type); for (i=0; i<vec_len; i++) { count = (dloop_vec[i].DLOOP_VECTOR_LEN)/type_size; (*uop)((char *)source_buf + MPIU_PtrToAint(dloop_vec[i].DLOOP_VECTOR_BUF), (char *)target_buf + MPIU_PtrToAint(dloop_vec[i].DLOOP_VECTOR_BUF), &count, &type); } MPID_Segment_free(segp); } }
static int send_msg(ptl_hdr_data_t ssend_flag, struct MPIDI_VC *vc, const void *buf, MPI_Aint count, MPI_Datatype datatype, int dest, int tag, MPID_Comm *comm, int context_offset, struct MPID_Request **request) { int mpi_errno = MPI_SUCCESS; MPID_nem_ptl_vc_area *const vc_ptl = VC_PTL(vc); int ret; MPIDI_msg_sz_t data_sz; int dt_contig; MPI_Aint dt_true_lb; MPID_Datatype *dt_ptr; MPID_Request *sreq = NULL; ptl_me_t me; int initial_iov_count, remaining_iov_count; ptl_md_t md; MPI_Aint last; MPIU_CHKPMEM_DECL(2); MPIDI_STATE_DECL(MPID_STATE_SEND_MSG); MPIDI_FUNC_ENTER(MPID_STATE_SEND_MSG); MPID_nem_ptl_request_create_sreq(sreq, mpi_errno, comm); sreq->dev.match.parts.rank = dest; sreq->dev.match.parts.tag = tag; sreq->dev.match.parts.context_id = comm->context_id + context_offset; sreq->ch.vc = vc; if (!vc_ptl->id_initialized) { mpi_errno = MPID_nem_ptl_init_id(vc); if (mpi_errno) MPIR_ERR_POP(mpi_errno); } MPIDI_Datatype_get_info(count, datatype, dt_contig, data_sz, dt_ptr, dt_true_lb); MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "count="MPI_AINT_FMT_DEC_SPEC" datatype=%#x contig=%d data_sz=%lu", count, datatype, dt_contig, data_sz)); if (data_sz <= PTL_LARGE_THRESHOLD) { /* Small message. 
Send all data eagerly */ if (dt_contig) { void *start = (char *)buf + dt_true_lb; MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, "Small contig message"); REQ_PTL(sreq)->event_handler = handler_send; MPIU_DBG_MSG_P(CH3_CHANNEL, VERBOSE, "&REQ_PTL(sreq)->event_handler = %p", &(REQ_PTL(sreq)->event_handler)); if (start == NULL) ret = MPID_nem_ptl_rptl_put(MPIDI_nem_ptl_global_md, (ptl_size_t)&dummy, data_sz, PTL_NO_ACK_REQ, vc_ptl->id, vc_ptl->pt, NPTL_MATCH(tag, comm->context_id + context_offset, comm->rank), 0, sreq, NPTL_HEADER(ssend_flag, data_sz)); else ret = MPID_nem_ptl_rptl_put(MPIDI_nem_ptl_global_md, (ptl_size_t)start, data_sz, PTL_NO_ACK_REQ, vc_ptl->id, vc_ptl->pt, NPTL_MATCH(tag, comm->context_id + context_offset, comm->rank), 0, sreq, NPTL_HEADER(ssend_flag, data_sz)); MPIR_ERR_CHKANDJUMP1(ret, mpi_errno, MPI_ERR_OTHER, "**ptlput", "**ptlput %s", MPID_nem_ptl_strerror(ret)); DBG_MSG_PUT("global", data_sz, vc->pg_rank, NPTL_MATCH(tag, comm->context_id + context_offset, comm->rank), NPTL_HEADER(ssend_flag, data_sz)); MPIU_DBG_MSG_D(CH3_CHANNEL, VERBOSE, "id.nid = %#x", vc_ptl->id.phys.nid); MPIU_DBG_MSG_D(CH3_CHANNEL, VERBOSE, "id.pid = %#x", vc_ptl->id.phys.pid); MPIU_DBG_MSG_P(CH3_CHANNEL, VERBOSE, "sreq = %p", sreq); MPIU_DBG_MSG_D(CH3_CHANNEL, VERBOSE, "vc_ptl->pt = %d", vc_ptl->pt); MPIU_DBG_MSG_P(CH3_CHANNEL, VERBOSE, "REQ_PTL(sreq)->event_handler = %p", REQ_PTL(sreq)->event_handler); goto fn_exit; } /* noncontig data */ MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, "Small noncontig message"); sreq->dev.segment_ptr = MPID_Segment_alloc(); MPIR_ERR_CHKANDJUMP1(sreq->dev.segment_ptr == NULL, mpi_errno, MPI_ERR_OTHER, "**nomem", "**nomem %s", "MPID_Segment_alloc"); MPID_Segment_init(buf, count, datatype, sreq->dev.segment_ptr, 0); sreq->dev.segment_first = 0; sreq->dev.segment_size = data_sz; last = sreq->dev.segment_size; sreq->dev.iov_count = MPL_IOV_LIMIT; MPID_Segment_pack_vector(sreq->dev.segment_ptr, sreq->dev.segment_first, &last, sreq->dev.iov, &sreq->dev.iov_count); 
if (last == sreq->dev.segment_size) { /* IOV is able to describe entire message */ MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, " entire message fits in IOV"); md.start = sreq->dev.iov; md.length = sreq->dev.iov_count; md.options = PTL_IOVEC; md.eq_handle = MPIDI_nem_ptl_origin_eq; md.ct_handle = PTL_CT_NONE; ret = PtlMDBind(MPIDI_nem_ptl_ni, &md, &REQ_PTL(sreq)->md); MPIR_ERR_CHKANDJUMP1(ret, mpi_errno, MPI_ERR_OTHER, "**ptlmdbind", "**ptlmdbind %s", MPID_nem_ptl_strerror(ret)); REQ_PTL(sreq)->event_handler = handler_send; ret = MPID_nem_ptl_rptl_put(REQ_PTL(sreq)->md, 0, data_sz, PTL_NO_ACK_REQ, vc_ptl->id, vc_ptl->pt, NPTL_MATCH(tag, comm->context_id + context_offset, comm->rank), 0, sreq, NPTL_HEADER(ssend_flag, data_sz)); MPIR_ERR_CHKANDJUMP1(ret, mpi_errno, MPI_ERR_OTHER, "**ptlput", "**ptlput %s", MPID_nem_ptl_strerror(ret)); DBG_MSG_PUT("sreq", data_sz, vc->pg_rank, NPTL_MATCH(tag, comm->context_id + context_offset, comm->rank), NPTL_HEADER(ssend_flag, data_sz)); goto fn_exit; } /* IOV is not long enough to describe entire message */ MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, " IOV too long: using bounce buffer"); MPIU_CHKPMEM_MALLOC(REQ_PTL(sreq)->chunk_buffer[0], void *, data_sz, mpi_errno, "chunk_buffer"); MPID_Segment_init(buf, count, datatype, sreq->dev.segment_ptr, 0); sreq->dev.segment_first = 0; last = data_sz; MPID_Segment_pack(sreq->dev.segment_ptr, sreq->dev.segment_first, &last, REQ_PTL(sreq)->chunk_buffer[0]); MPIU_Assert(last == sreq->dev.segment_size); REQ_PTL(sreq)->event_handler = handler_send; ret = MPID_nem_ptl_rptl_put(MPIDI_nem_ptl_global_md, (ptl_size_t)REQ_PTL(sreq)->chunk_buffer[0], data_sz, PTL_NO_ACK_REQ, vc_ptl->id, vc_ptl->pt, NPTL_MATCH(tag, comm->context_id + context_offset, comm->rank), 0, sreq, NPTL_HEADER(ssend_flag, data_sz)); MPIR_ERR_CHKANDJUMP1(ret, mpi_errno, MPI_ERR_OTHER, "**ptlput", "**ptlput %s", MPID_nem_ptl_strerror(ret)); DBG_MSG_PUT("global", data_sz, vc->pg_rank, NPTL_MATCH(tag, comm->context_id + context_offset, comm->rank), 
NPTL_HEADER(ssend_flag, data_sz)); goto fn_exit; } /* Large message. Send first chunk of data and let receiver get the rest */ if (dt_contig) { /* create ME for buffer so receiver can issue a GET for the data */ MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, "Large contig message"); big_meappend((char *)buf + dt_true_lb + PTL_LARGE_THRESHOLD, data_sz - PTL_LARGE_THRESHOLD, vc, NPTL_MATCH(tag, comm->context_id + context_offset, comm->rank), sreq); REQ_PTL(sreq)->event_handler = handler_send; ret = MPID_nem_ptl_rptl_put(MPIDI_nem_ptl_global_md, (ptl_size_t)((char *)buf + dt_true_lb), PTL_LARGE_THRESHOLD, PTL_NO_ACK_REQ, vc_ptl->id, vc_ptl->pt, NPTL_MATCH(tag, comm->context_id + context_offset, comm->rank), 0, sreq, NPTL_HEADER(ssend_flag | NPTL_LARGE, data_sz)); MPIR_ERR_CHKANDJUMP1(ret, mpi_errno, MPI_ERR_OTHER, "**ptlput", "**ptlput %s", MPID_nem_ptl_strerror(ret)); DBG_MSG_PUT("global", PTL_LARGE_THRESHOLD, vc->pg_rank, NPTL_MATCH(tag, comm->context_id + context_offset, comm->rank), NPTL_HEADER(ssend_flag | NPTL_LARGE, data_sz)); goto fn_exit; } /* Large noncontig data */ MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, "Large noncontig message"); sreq->dev.segment_ptr = MPID_Segment_alloc(); MPIR_ERR_CHKANDJUMP1(sreq->dev.segment_ptr == NULL, mpi_errno, MPI_ERR_OTHER, "**nomem", "**nomem %s", "MPID_Segment_alloc"); MPID_Segment_init(buf, count, datatype, sreq->dev.segment_ptr, 0); sreq->dev.segment_first = 0; sreq->dev.segment_size = data_sz; last = PTL_LARGE_THRESHOLD; sreq->dev.iov_count = MPL_IOV_LIMIT; MPID_Segment_pack_vector(sreq->dev.segment_ptr, sreq->dev.segment_first, &last, sreq->dev.iov, &sreq->dev.iov_count); initial_iov_count = sreq->dev.iov_count; sreq->dev.segment_first = last; if (last == PTL_LARGE_THRESHOLD) { /* first chunk of message fits into IOV */ MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, " first chunk fits in IOV"); if (initial_iov_count < MPL_IOV_LIMIT) { /* There may be space for the rest of the message in this IOV */ sreq->dev.iov_count = MPL_IOV_LIMIT - 
sreq->dev.iov_count; last = sreq->dev.segment_size; MPID_Segment_pack_vector(sreq->dev.segment_ptr, sreq->dev.segment_first, &last, &sreq->dev.iov[initial_iov_count], &sreq->dev.iov_count); remaining_iov_count = sreq->dev.iov_count; if (last == sreq->dev.segment_size && last <= MPIDI_nem_ptl_ni_limits.max_msg_size + PTL_LARGE_THRESHOLD) { /* Entire message fit in one IOV */ int was_incomplete; MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, " rest of message fits in one IOV"); /* Create ME for remaining data */ me.start = &sreq->dev.iov[initial_iov_count]; me.length = remaining_iov_count; me.ct_handle = PTL_CT_NONE; me.uid = PTL_UID_ANY; me.options = ( PTL_ME_OP_PUT | PTL_ME_OP_GET | PTL_ME_USE_ONCE | PTL_ME_IS_ACCESSIBLE | PTL_ME_EVENT_LINK_DISABLE | PTL_ME_EVENT_UNLINK_DISABLE | PTL_IOVEC ); me.match_id = vc_ptl->id; me.match_bits = NPTL_MATCH(tag, comm->context_id + context_offset, comm->rank); me.ignore_bits = 0; me.min_free = 0; MPIU_CHKPMEM_MALLOC(REQ_PTL(sreq)->get_me_p, ptl_handle_me_t *, sizeof(ptl_handle_me_t), mpi_errno, "get_me_p"); ret = PtlMEAppend(MPIDI_nem_ptl_ni, MPIDI_nem_ptl_get_pt, &me, PTL_PRIORITY_LIST, sreq, &REQ_PTL(sreq)->get_me_p[0]); MPIR_ERR_CHKANDJUMP1(ret, mpi_errno, MPI_ERR_OTHER, "**ptlmeappend", "**ptlmeappend %s", MPID_nem_ptl_strerror(ret)); DBG_MSG_MEAPPEND("CTL", vc->pg_rank, me, sreq); /* increment the cc for the get operation */ MPIDI_CH3U_Request_increment_cc(sreq, &was_incomplete); MPIU_Assert(was_incomplete); /* Create MD for first chunk */ md.start = sreq->dev.iov; md.length = initial_iov_count; md.options = PTL_IOVEC; md.eq_handle = MPIDI_nem_ptl_origin_eq; md.ct_handle = PTL_CT_NONE; ret = PtlMDBind(MPIDI_nem_ptl_ni, &md, &REQ_PTL(sreq)->md); MPIR_ERR_CHKANDJUMP1(ret, mpi_errno, MPI_ERR_OTHER, "**ptlmdbind", "**ptlmdbind %s", MPID_nem_ptl_strerror(ret)); REQ_PTL(sreq)->event_handler = handler_send; ret = MPID_nem_ptl_rptl_put(REQ_PTL(sreq)->md, 0, PTL_LARGE_THRESHOLD, PTL_NO_ACK_REQ, vc_ptl->id, vc_ptl->pt, NPTL_MATCH(tag, 
comm->context_id + context_offset, comm->rank), 0, sreq, NPTL_HEADER(ssend_flag | NPTL_LARGE, data_sz)); MPIR_ERR_CHKANDJUMP1(ret, mpi_errno, MPI_ERR_OTHER, "**ptlput", "**ptlput %s", MPID_nem_ptl_strerror(ret)); DBG_MSG_PUT("req", PTL_LARGE_THRESHOLD, vc->pg_rank, NPTL_MATCH(tag, comm->context_id + context_offset, comm->rank), NPTL_HEADER(ssend_flag | NPTL_LARGE, data_sz)); goto fn_exit; }
/*
 * MPIDI_Accumulate - combine origin data into a location of an RMA window
 * using the reduction operation `op`.
 *
 * Behavior visible in this excerpt:
 *   - If the origin data size is zero or target_rank is MPI_PROC_NULL the
 *     function jumps straight to fn_exit.
 *   - If target_rank equals win_ptr->myrank the operation is applied locally
 *     at (char *) win_ptr->base + win_ptr->disp_unit * target_disp:
 *       * MPI_REPLACE is carried out with MPIR_Localcopy;
 *       * any other op must be a builtin handle (checked with
 *         MPIU_ERR_CHKANDJUMP1 / MPI_ERR_OP) and is looked up in
 *         MPIR_Op_table.
 *
 * NOTE(review): the excerpt stops inside the local-target branch; the
 * remote-target path and the fn_exit / fn_fail labels are not visible here.
 */
int MPIDI_Accumulate(void *origin_addr, int origin_count, MPI_Datatype origin_datatype, int target_rank, MPI_Aint target_disp, int target_count, MPI_Datatype target_datatype, MPI_Op op, MPID_Win *win_ptr)
{
    int mpi_errno=MPI_SUCCESS;
    MPIDI_msg_sz_t data_sz;
    int dt_contig, rank, origin_predefined, target_predefined;
    MPI_Aint dt_true_lb;
    MPIDI_RMA_ops *new_ptr;
    MPID_Datatype *dtp;
    MPIU_CHKLMEM_DECL(2);
    MPIU_CHKPMEM_DECL(1);
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_ACCUMULATE);
    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_ACCUMULATE);
    /* Fills dt_contig, data_sz, dtp and dt_true_lb for the origin data. */
    MPIDI_Datatype_get_info(origin_count, origin_datatype, dt_contig, data_sz, dtp, dt_true_lb);
    /* Nothing to do for empty data or a null target. */
    if ((data_sz == 0) || (target_rank == MPI_PROC_NULL)) { goto fn_exit; }
    rank = win_ptr->myrank;
    MPIDI_CH3I_DATATYPE_IS_PREDEFINED(origin_datatype, origin_predefined);
    MPIDI_CH3I_DATATYPE_IS_PREDEFINED(target_datatype, target_predefined);
    /* Local target: operate directly on this process's window memory.
       (Original note asked whether the != rank case should be tested first
       as the more likely branch.) */
    if (target_rank == rank) {
        MPI_User_function *uop;
        /* MPI_REPLACE is a plain typed copy into the window. */
        if (op == MPI_REPLACE) {
            mpi_errno = MPIR_Localcopy(origin_addr, origin_count, origin_datatype, (char *) win_ptr->base + win_ptr->disp_unit * target_disp, target_count, target_datatype);
            goto fn_exit;
        }
        /* Only builtin ops are accepted past this point. */
        MPIU_ERR_CHKANDJUMP1((HANDLE_GET_KIND(op) != HANDLE_KIND_BUILTIN), mpi_errno, MPI_ERR_OP, "**opnotpredefined", "**opnotpredefined %d", op );
        /* get the function by indexing into the op table: the low four bits
           of the handle, minus one, select the entry */
        uop = MPIR_Op_table[((op)&0xf) - 1];
        if (origin_predefined && target_predefined) {
            /* Both types predefined: one call over the whole target buffer. */
            (*uop)(origin_addr, (char *) win_ptr->base + win_ptr->disp_unit * target_disp, &target_count, &target_datatype);
        } else {
            /* derived datatype on at least one side */
            MPID_Segment *segp;
            DLOOP_VECTOR *dloop_vec;
            MPI_Aint first, last;
            int vec_len, i, type_size, count;
            MPI_Datatype type;
            MPI_Aint true_lb, true_extent, extent;
            void *tmp_buf=NULL, *source_buf, *target_buf;
            if (origin_datatype != target_datatype) {
                /* first copy the data into a temporary buffer with the same datatype as the target. Then do the accumulate operation.
                */
                MPIR_Type_get_true_extent_impl(target_datatype, &true_lb, &true_extent);
                MPID_Datatype_get_extent_macro(target_datatype, extent);
                /* Buffer sized by the larger of extent and true extent per element. */
                MPIU_CHKLMEM_MALLOC(tmp_buf, void *, target_count * (MPIR_MAX(extent,true_extent)), mpi_errno, "temporary buffer");
                /* adjust for potential negative lower bound in datatype */
                tmp_buf = (void *)((char*)tmp_buf - true_lb);
                mpi_errno = MPIR_Localcopy(origin_addr, origin_count, origin_datatype, tmp_buf, target_count, target_datatype);
                if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }
            }
            if (target_predefined) {
                /* target predefined type, origin derived datatype: the data
                   was repacked into tmp_buf above */
                (*uop)(tmp_buf, (char *) win_ptr->base + win_ptr->disp_unit * target_disp, &target_count, &target_datatype);
            } else {
                /* Target is derived: build the list of contiguous pieces of
                   the target layout and apply the op piece by piece. */
                segp = MPID_Segment_alloc();
                MPIU_ERR_CHKANDJUMP1((!segp), mpi_errno, MPI_ERR_OTHER, "**nomem","**nomem %s","MPID_Segment_alloc");
                /* NULL base: the vector entries are then used below as
                   offsets added to source_buf / target_buf. */
                MPID_Segment_init(NULL, target_count, target_datatype, segp, 0);
                first = 0;
                last = SEGMENT_IGNORE_LAST;
                MPID_Datatype_get_ptr(target_datatype, dtp);
                vec_len = dtp->max_contig_blocks * target_count + 1;
                /* +1 needed because Rob says so (one spare vector slot;
                   no further rationale is recorded here) */
                MPIU_CHKLMEM_MALLOC(dloop_vec, DLOOP_VECTOR *, vec_len * sizeof(DLOOP_VECTOR), mpi_errno, "dloop vector");
                /* vec_len is in/out: capacity on entry, entries used on return
                   (it bounds the loop below). */
                MPID_Segment_pack_vector(segp, first, &last, dloop_vec, &vec_len);
                /* Use the repacked copy when one was made, else the origin buffer. */
                source_buf = (tmp_buf != NULL) ? tmp_buf : origin_addr;
                target_buf = (char *) win_ptr->base + win_ptr->disp_unit * target_disp;
                type = dtp->eltype;
                type_size = MPID_Datatype_get_basic_size(type);
                for (i=0; i<vec_len; i++) {
                    /* Number of basic elements in this contiguous piece. */
                    count = (dloop_vec[i].DLOOP_VECTOR_LEN)/type_size;
                    (*uop)((char *)source_buf + MPIU_PtrToAint(dloop_vec[i].DLOOP_VECTOR_BUF), (char *)target_buf + MPIU_PtrToAint(dloop_vec[i].DLOOP_VECTOR_BUF), &count, &type);
                }
                MPID_Segment_free(segp);
            }
        }
        /* NOTE(review): excerpt ends here; the enclosing `if (target_rank == rank)`
           block and the function body are not closed in this file. */
/*
 * MPID_nem_tcp_module_lmt_start_send - sender side of the TCP large-message
 * transfer (LMT).
 *
 * Parses the receiver's cookie (hostname, port, receive length), connects the
 * per-VC LMT socket if it is not connected yet, and then writes the request's
 * user data to that socket, one IOV batch at a time, until the whole message
 * (or the truncated length) has been written.  The request is completed with
 * MPIDI_CH3U_Request_complete() once all data is out.
 *
 * Parameters:
 *   vc        - virtual connection; vc->channel_private holds the TCP state
 *   req       - send request describing user_buf / user_count / datatype
 *   r_cookie  - cookie received from the peer, decoded by read_r_cookie()
 *
 * Returns MPI_SUCCESS or an MPI error code (mpi_errno).  If the receiver's
 * length is smaller than the message, only r_len bytes are sent and a
 * truncation error is stored in req->status.MPI_ERROR.
 *
 * NOTE(review): on gethostbyname()/connect() failure the freshly created
 * socket stays open in vc_ch->net.tcp.lmt_desc while lmt_connected remains 0,
 * so a later call would overwrite (leak) it - confirm whether the caller
 * cleans this up.
 */
int MPID_nem_tcp_module_lmt_start_send (MPIDI_VC_t *vc, MPID_Request *req, MPID_IOV r_cookie)
{
    int mpi_errno = MPI_SUCCESS;
    int ret;
    MPIDI_msg_sz_t data_sz;
    int dt_contig;
    MPI_Aint dt_true_lb;
    MPID_Datatype * dt_ptr;
    MPIDI_msg_sz_t last;
    int nb;
    int s_len = 0;
    int r_len;
    int r_port;
    char *r_hostname;
    MPIDI_CH3I_VC *vc_ch = (MPIDI_CH3I_VC *)vc->channel_private;
    MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_TCP_MODULE_LMT_START_SEND);

    MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_TCP_MODULE_LMT_START_SEND);

    mpi_errno = read_r_cookie (r_cookie, &r_hostname, &r_port, &r_len);
    if (mpi_errno) MPIU_ERR_POP (mpi_errno);

    free_cookie (vc_ch->net.tcp.lmt_cookie);

    if (!vc_ch->net.tcp.lmt_connected)
    {
        struct sockaddr_in saddr;
        struct hostent *hp;

        vc_ch->net.tcp.lmt_desc = socket (AF_INET, SOCK_STREAM, 0);
        MPIU_ERR_CHKANDJUMP2 (vc_ch->net.tcp.lmt_desc == -1, mpi_errno, MPI_ERR_OTHER, "**sock_create", "**sock_create %s %d", strerror (errno), errno);

        /* Non-blocking mode is disabled; the code that used to enable it:
         *   ret = fcntl (vc_ch->net.tcp.lmt_desc, F_SETFL, O_NONBLOCK);
         *   MPIU_ERR_CHKANDJUMP2 (ret == -1, mpi_errno, MPI_ERR_OTHER, "**fail", "**fail %s %d", strerror (errno), errno);
         */

        hp = gethostbyname (r_hostname);
        MPIU_ERR_CHKANDJUMP2 (hp == NULL, mpi_errno, MPI_ERR_OTHER, "**gethostbyname", "**gethostbyname %s %d", hstrerror (h_errno), h_errno);

        /* memset is (dest, value, length): zero the whole address structure
           (including sin_zero) before filling in the fields we use. */
        memset (&saddr, 0, sizeof(saddr));
        saddr.sin_family = AF_INET;
        saddr.sin_port   = htons (r_port);
        MPIU_Memcpy (&saddr.sin_addr, hp->h_addr, hp->h_length);

        set_sockopts (vc_ch->net.tcp.lmt_desc);

        ret = connect (vc_ch->net.tcp.lmt_desc, (struct sockaddr *)&saddr, sizeof(saddr));
        MPIU_ERR_CHKANDJUMP2 (ret == -1, mpi_errno, MPI_ERR_OTHER, "**fail", "**fail %s %d", strerror (errno), errno);

        vc_ch->net.tcp.lmt_connected = 1;
    }

    MPIDI_Datatype_get_info (req->dev.user_count, req->dev.datatype, dt_contig, data_sz, dt_ptr, dt_true_lb);

    if (r_len < data_sz)
    {
        /* message will be truncated: send only what the receiver can take */
        s_len = data_sz;
        data_sz = r_len;
        req->status.MPI_ERROR = MPIU_ERR_SET2 (mpi_errno, MPI_ERR_TRUNCATE, "**truncate", "**truncate %d %d", s_len, r_len);
    }

    MPID_Segment_init (req->dev.user_buf, req->dev.user_count, req->dev.datatype, &req->dev.segment, 0);
    req->dev.segment_first = 0;
    req->dev.segment_size = data_sz;
    req->dev.iov_offset = 0;

    do
    {
        int iov_offset;
        int left_to_send;

        /* Both `last` and `iov_count` are in/out arguments of
           MPID_Segment_pack_vector, so they must be re-armed for every batch;
           otherwise a message needing more than one batch never progresses. */
        req->dev.iov_count = MPID_IOV_LIMIT;
        last = data_sz;

        MPID_Segment_pack_vector (&req->dev.segment, req->dev.segment_first, &last, req->dev.iov, &req->dev.iov_count);

        left_to_send = last - req->dev.segment_first;
        iov_offset = 0;

#ifdef TESTING_CHUNKING
        {
            /* Test path: push the first IOV entry out in CHUNK-sized writes. */
            char *buf = req->dev.iov[0].MPID_IOV_BUF;
            int l;
            while (left_to_send)
            {
                if (left_to_send > CHUNK)
                    l = CHUNK;
                else
                    l = left_to_send;

                do
                    nb = write (vc_ch->net.tcp.lmt_desc, buf, l);
                while (nb == -1 && errno == EINTR);
                MPIU_ERR_CHKANDJUMP (nb == -1, mpi_errno, MPI_ERR_OTHER, "**sock_writev");

                left_to_send -= nb;
                buf += nb;
            }

            MPIDI_CH3U_Request_complete (req);
            goto fn_exit;
        }
#endif

        /* First attempt: try to write the whole batch, retrying on EINTR. */
        do
            nb = writev (vc_ch->net.tcp.lmt_desc, &req->dev.iov[iov_offset], req->dev.iov_count - iov_offset);
        while (nb == -1 && errno == EINTR);
        MPIU_ERR_CHKANDJUMP (nb == -1, mpi_errno, MPI_ERR_OTHER, "**sock_writev");

        left_to_send -= nb;
        while (left_to_send)
        {
            /* Short write: skip the IOV entries that went out completely ... */
            while (nb >= req->dev.iov[iov_offset].MPID_IOV_LEN)
            {
                nb -= req->dev.iov[iov_offset].MPID_IOV_LEN;
                ++iov_offset;
            }
            /* ... and trim the partially written entry to its unsent tail. */
            req->dev.iov[iov_offset].MPID_IOV_BUF = (char *)req->dev.iov[iov_offset].MPID_IOV_BUF + nb;
            req->dev.iov[iov_offset].MPID_IOV_LEN -= nb;

            do
                nb = writev (vc_ch->net.tcp.lmt_desc, &req->dev.iov[iov_offset], req->dev.iov_count - iov_offset);
            while (nb == -1 && errno == EINTR);
            MPIU_ERR_CHKANDJUMP (nb == -1, mpi_errno, MPI_ERR_OTHER, "**sock_writev");

            left_to_send -= nb;
        }

        /* Everything up to `last` has been written; the next batch (if any)
           starts from there. */
        req->dev.segment_first = last;
    }
    while (last < data_sz);

    MPIDI_CH3U_Request_complete (req);

 fn_exit:
    MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_TCP_MODULE_LMT_START_SEND);
    return mpi_errno;
 fn_fail:
    goto fn_exit;
}