/*
 * MPID_nem_mxm_SendNoncontig - send a CH3 packet header followed by a packed
 * copy of the request's payload as a single MXM active message.
 *
 * vc      virtual connection to send on
 * sreq    send request; sreq->dev.segment_ptr / segment_size describe the
 *         payload, and segment_first is asserted to be 0
 * hdr     packet header to send (asserted to be at most one MPIDI_CH3_Pkt_t)
 * hdr_sz  size of hdr in bytes
 *
 * Returns MPI_SUCCESS, or an MPI error code when the pack buffer cannot be
 * allocated or _mxm_isend() reports a failure.
 */
int MPID_nem_mxm_SendNoncontig(MPIDI_VC_t * vc, MPID_Request * sreq, void *hdr,
                               MPIDI_msg_sz_t hdr_sz)
{
    int mpi_errno = MPI_SUCCESS;
    MPIDI_msg_sz_t last;
    MPID_nem_mxm_vc_area *vc_area = NULL;
    MPID_nem_mxm_req_area *req_area = NULL;

    MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_MXM_SENDNONCONTIGMSG);
    MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_MXM_SENDNONCONTIGMSG);

    MPIU_Assert(hdr_sz <= sizeof(MPIDI_CH3_Pkt_t));

    MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, "MPID_nem_mxm_iSendNoncontig");

    /* NOTE(review): a whole MPIDI_CH3_Pkt_t is copied (and later sent)
     * regardless of hdr_sz; this presumes hdr always points at storage of at
     * least that size -- confirm against the callers. */
    MPIU_Memcpy(&(sreq->dev.pending_pkt), (char *) hdr, sizeof(MPIDI_CH3_Pkt_t));

    /* The sizes are cast because the format uses %d for both of them. */
    _dbg_mxm_output(5,
                    "SendNoncontig ========> Sending ADI msg (to=%d type=%d) for req %p (data_size %d, %d) \n",
                    vc->pg_rank, sreq->dev.pending_pkt.type, sreq,
                    (int) sizeof(MPIDI_CH3_Pkt_t), (int) sreq->dev.segment_size);

    vc_area = VC_BASE(vc);
    req_area = REQ_BASE(sreq);

    /* First iov entry: the packet header stored inside the request. */
    req_area->ctx = sreq;
    req_area->iov_buf = req_area->tmp_buf;
    req_area->iov_count = 1;
    req_area->iov_buf[0].ptr = (void *) &(sreq->dev.pending_pkt);
    req_area->iov_buf[0].length = sizeof(MPIDI_CH3_Pkt_t);

    MPIU_Assert(sreq->dev.segment_first == 0);

    last = sreq->dev.segment_size;
    if (last > 0) {
        /* Second iov entry: the payload packed into a temporary buffer that
         * is recorded in the request (sreq->dev.tmpbuf). */
        sreq->dev.tmpbuf = MPIU_Malloc((size_t) sreq->dev.segment_size);
        /* Report allocation failure as an error instead of relying on an
         * assertion, which disappears in non-assert builds. */
        MPIU_ERR_CHKANDJUMP1((sreq->dev.tmpbuf == NULL), mpi_errno, MPI_ERR_OTHER,
                             "**nomem", "**nomem %s", "MPIU_Malloc");
        MPID_Segment_pack(sreq->dev.segment_ptr, sreq->dev.segment_first, &last,
                          sreq->dev.tmpbuf);
        MPIU_Assert(last == sreq->dev.segment_size);

        req_area->iov_count = 2;
        req_area->iov_buf[1].ptr = sreq->dev.tmpbuf;
        req_area->iov_buf[1].length = last;
    }

    vc_area->pending_sends += 1;
    sreq->ch.vc = vc;
    sreq->ch.noncontig = TRUE;

    mpi_errno = _mxm_isend(vc_area->mxm_ep, req_area, MXM_MPICH_ISEND_AM,
                           mxm_obj->mxm_mq, mxm_obj->mxm_rank, MXM_MPICH_HID_ADI_MSG, 0, 0);
    if (mpi_errno)
        MPIU_ERR_POP(mpi_errno);

  fn_exit:
    MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_MXM_SENDNONCONTIGMSG);
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}
/*
 * MPID_nem_mxm_issend - start a synchronous-mode non-blocking send of a user
 * buffer over MXM.
 *
 * vc              virtual connection to send on
 * buf, count,
 * datatype        user buffer description
 * rank, tag       destination rank (used for debug output only here) and tag
 * comm            communicator; supplies the MXM queue, the local rank and
 *                 the context id
 * context_offset  offset added to comm->context_id
 * sreq_ptr        [out] receives the request created here; it is written on
 *                 every exit path, including failures (it is NULL only if the
 *                 request could not be created)
 *
 * Contiguous data is sent directly from the user buffer. Non-contiguous data
 * is packed into sreq->dev.tmpbuf first.
 *
 * Returns MPI_SUCCESS or an MPI error code.
 */
int MPID_nem_mxm_issend(MPIDI_VC_t * vc, const void *buf, int count, MPI_Datatype datatype,
                        int rank, int tag, MPID_Comm * comm, int context_offset,
                        MPID_Request ** sreq_ptr)
{
    int mpi_errno = MPI_SUCCESS;
    MPID_Request *sreq = NULL;
    MPID_Datatype *dt_ptr;
    int dt_contig;
    MPIDI_msg_sz_t data_sz;
    MPI_Aint dt_true_lb;
    MPID_nem_mxm_vc_area *vc_area = NULL;
    MPID_nem_mxm_req_area *req_area = NULL;

    MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_MXM_ISSEND);
    MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_MXM_ISSEND);

    MPIDI_Datatype_get_info(count, datatype, dt_contig, data_sz, dt_ptr, dt_true_lb);

    /* create a request */
    MPIDI_Request_create_sreq(sreq, mpi_errno, goto fn_exit);
    MPIU_Assert(sreq != NULL);
    MPIDI_Request_set_type(sreq, MPIDI_REQUEST_TYPE_SEND);
    MPIDI_VC_FAI_send_seqnum(vc, seqnum);
    MPIDI_Request_set_seqnum(sreq, seqnum);
    if (HANDLE_GET_KIND(datatype) != HANDLE_KIND_BUILTIN) {
        /* The request takes a reference on a non-builtin datatype. */
        MPID_Datatype_get_ptr(datatype, sreq->dev.datatype_ptr);
        MPID_Datatype_add_ref(sreq->dev.datatype_ptr);
    }
    sreq->partner_request = NULL;
    sreq->dev.OnDataAvail = NULL;
    sreq->dev.tmpbuf = NULL;
    sreq->ch.vc = vc;
    sreq->ch.noncontig = FALSE;

    /* data_sz is cast because the format uses %d for it. */
    _dbg_mxm_output(5,
                    "isSend ========> Sending USER msg for req %p (context %d to %d tag %d size %d) \n",
                    sreq, comm->context_id + context_offset, rank, tag, (int) data_sz);

    vc_area = VC_BASE(vc);
    req_area = REQ_BASE(sreq);

    /* Start from an empty iov; it is filled in only when there is data. */
    req_area->ctx = sreq;
    req_area->iov_buf = req_area->tmp_buf;
    req_area->iov_count = 0;
    req_area->iov_buf[0].ptr = NULL;
    req_area->iov_buf[0].length = 0;

    if (data_sz) {
        if (dt_contig) {
            req_area->iov_count = 1;
            req_area->iov_buf[0].ptr = (char *) (buf) + dt_true_lb;
            req_area->iov_buf[0].length = data_sz;
        }
        else {
            MPIDI_msg_sz_t last;
            MPI_Aint packsize = 0;

            sreq->ch.noncontig = TRUE;
            sreq->dev.segment_ptr = MPID_Segment_alloc();
            MPIU_ERR_CHKANDJUMP1((sreq->dev.segment_ptr == NULL), mpi_errno, MPI_ERR_OTHER,
                                 "**nomem", "**nomem %s", "MPID_Segment_alloc");
            MPIR_Pack_size_impl(count, datatype, &packsize);

            last = data_sz;
            if (packsize > 0) {
                sreq->dev.tmpbuf = MPIU_Malloc((size_t) packsize);
                /* Report allocation failure as an error instead of relying
                 * on an assertion, which disappears in non-assert builds. */
                MPIU_ERR_CHKANDJUMP1((sreq->dev.tmpbuf == NULL), mpi_errno, MPI_ERR_OTHER,
                                     "**nomem", "**nomem %s", "MPIU_Malloc");
                MPID_Segment_init(buf, count, datatype, sreq->dev.segment_ptr, 0);
                MPID_Segment_pack(sreq->dev.segment_ptr, 0, &last, sreq->dev.tmpbuf);

                req_area->iov_count = 1;
                req_area->iov_buf[0].ptr = sreq->dev.tmpbuf;
                req_area->iov_buf[0].length = last;
            }
        }
    }

    vc_area->pending_sends += 1;

    mpi_errno = _mxm_isend(vc_area->mxm_ep, req_area, MXM_MPICH_ISEND_SYNC,
                           (mxm_mq_h) comm->dev.ch.netmod_priv, comm->rank, tag,
                           _mxm_tag_mpi2mxm(tag, comm->context_id + context_offset), 0);
    if (mpi_errno)
        MPIU_ERR_POP(mpi_errno);

    _dbg_mxm_out_req(sreq);

  fn_exit:
    *sreq_ptr = sreq;
    MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_MXM_ISSEND);
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}
/** * \brief MPID buffer copy * * Implements non-contiguous buffers correctly. * * \param[in] sbuf The address of the input buffer * \param[in] scount The number of elements in that buffer * \param[in] sdt The datatype of those elements * \param[out] smpi_errno Returns errors * \param[in] rbuf The address of the output buffer * \param[out] rcount The number of elements in that buffer * \param[in] rdt The datatype of those elements * \param[out] rsz The size of the ouput data * \param[out] rmpi_errno Returns errors */ void MPIDI_Buffer_copy( const void * const sbuf, MPI_Aint scount, MPI_Datatype sdt, int * smpi_errno, void * const rbuf, MPI_Aint rcount, MPI_Datatype rdt, MPIDI_msg_sz_t * rsz, int * rmpi_errno) { int sdt_contig; int rdt_contig; MPI_Aint sdt_true_lb, rdt_true_lb; MPIDI_msg_sz_t sdata_sz; MPIDI_msg_sz_t rdata_sz; MPID_Datatype * sdt_ptr; MPID_Datatype * rdt_ptr; MPI_Aint sdt_extent; MPI_Aint rdt_extent; *smpi_errno = MPI_SUCCESS; *rmpi_errno = MPI_SUCCESS; /* printf("bufcopy: src count=%d dt=%d\n", scount, sdt); */ /* printf("bufcopy: dst count=%d dt=%d\n", rcount, rdt); */ MPIDI_Datatype_get_info(scount, sdt, sdt_contig, sdata_sz, sdt_ptr, sdt_true_lb); MPIDI_Datatype_get_info(rcount, rdt, rdt_contig, rdata_sz, rdt_ptr, rdt_true_lb); /* --BEGIN ERROR HANDLING-- */ if (sdata_sz > rdata_sz) { *rmpi_errno = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, __FUNCTION__, __LINE__, MPI_ERR_TRUNCATE, "**truncate", "**truncate %d %d", sdata_sz, rdata_sz ); sdata_sz = rdata_sz; } /* --END ERROR HANDLING-- */ if (sdata_sz == 0) { *rsz = 0; goto fn_exit; } if (sdt_contig && rdt_contig) { #if CUDA_AWARE_SUPPORT if(MPIDI_Process.cuda_aware_support_on && MPIDI_cuda_is_device_buf(rbuf)) { cudaError_t cudaerr = CudaMemcpy(rbuf + rdt_true_lb, sbuf + sdt_true_lb, sdata_sz, cudaMemcpyHostToDevice); } else #endif memcpy((char*)rbuf + rdt_true_lb, (const char *)sbuf + sdt_true_lb, sdata_sz); *rsz = sdata_sz; } else if (sdt_contig) { #if CUDA_AWARE_SUPPORT // This 
will need to be done in two steps: // 1 - Allocate a temp buffer which is the same size as user buffer and unpack in it. // 2 - Copy unpacked data into user buffer from temp buffer. if(MPIDI_Process.cuda_aware_support_on && MPIDI_cuda_is_device_buf(rbuf)) { MPID_Datatype_get_extent_macro(rdt, rdt_extent); char *buf = MPL_malloc(rdt_extent * rcount); memset(buf, 0, rdt_extent * rcount); MPID_Segment seg; DLOOP_Offset last; MPID_Segment_init(buf, rcount, rdt, &seg, 0); last = sdata_sz; MPID_Segment_unpack(&seg, 0, &last, (char*)sbuf + sdt_true_lb); /* --BEGIN ERROR HANDLING-- */ if (last != sdata_sz) { *rmpi_errno = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, __FUNCTION__, __LINE__, MPI_ERR_TYPE, "**dtypemismatch", 0); } /* --END ERROR HANDLING-- */ *rsz = last; cudaError_t cudaerr = CudaMemcpy(rbuf + rdt_true_lb, buf, rdt_extent * rcount, cudaMemcpyHostToDevice); MPL_free(buf); goto fn_exit; } #endif MPID_Segment seg; DLOOP_Offset last; MPID_Segment_init(rbuf, rcount, rdt, &seg, 0); last = sdata_sz; MPID_Segment_unpack(&seg, 0, &last, (char*)sbuf + sdt_true_lb); /* --BEGIN ERROR HANDLING-- */ if (last != sdata_sz) { *rmpi_errno = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, __FUNCTION__, __LINE__, MPI_ERR_TYPE, "**dtypemismatch", 0); } /* --END ERROR HANDLING-- */ *rsz = last; } else if (rdt_contig) { MPID_Segment seg; DLOOP_Offset last; MPID_Segment_init(sbuf, scount, sdt, &seg, 0); last = sdata_sz; MPID_Segment_pack(&seg, 0, &last, (char*)rbuf + rdt_true_lb); /* --BEGIN ERROR HANDLING-- */ if (last != sdata_sz) { *rmpi_errno = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, __FUNCTION__, __LINE__, MPI_ERR_TYPE, "**dtypemismatch", 0); } /* --END ERROR HANDLING-- */ *rsz = last; } else { char * buf; MPIDI_msg_sz_t buf_off; MPID_Segment sseg; MPIDI_msg_sz_t sfirst; MPID_Segment rseg; MPIDI_msg_sz_t rfirst; buf = MPL_malloc(MPIDI_COPY_BUFFER_SZ); /* --BEGIN ERROR HANDLING-- */ if (buf == NULL) { *smpi_errno = 
MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_FATAL, __FUNCTION__, __LINE__, MPI_ERR_OTHER, "**nomem", 0); *rmpi_errno = *smpi_errno; *rsz = 0; goto fn_exit; } /* --END ERROR HANDLING-- */ MPID_Segment_init(sbuf, scount, sdt, &sseg, 0); MPID_Segment_init(rbuf, rcount, rdt, &rseg, 0); sfirst = 0; rfirst = 0; buf_off = 0; for(;;) { DLOOP_Offset last; char * buf_end; if (sdata_sz - sfirst > MPIDI_COPY_BUFFER_SZ - buf_off) { last = sfirst + (MPIDI_COPY_BUFFER_SZ - buf_off); } else { last = sdata_sz; } MPID_Segment_pack(&sseg, sfirst, &last, buf + buf_off); /* --BEGIN ERROR HANDLING-- */ MPID_assert(last > sfirst); /* --END ERROR HANDLING-- */ buf_end = buf + buf_off + (last - sfirst); sfirst = last; MPID_Segment_unpack(&rseg, rfirst, &last, buf); /* --BEGIN ERROR HANDLING-- */ MPID_assert(last > rfirst); /* --END ERROR HANDLING-- */ rfirst = last; if (rfirst == sdata_sz) { /* successful completion */ break; } /* --BEGIN ERROR HANDLING-- */ if (sfirst == sdata_sz) { /* datatype mismatch -- remaining bytes could not be unpacked */ *rmpi_errno = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, __FUNCTION__, __LINE__, MPI_ERR_TYPE, "**dtypemismatch", 0); break; } /* --END ERROR HANDLING-- */ buf_off = sfirst - rfirst; if (buf_off > 0) { memmove(buf, buf_end - buf_off, buf_off); } } *rsz = rfirst; MPL_free(buf); } fn_exit: return; }
/*
 * MPIDI_CH3U_Request_load_send_iov - fill an IOV with the next part of a
 * send request's segment.
 *
 * sreq   send request; dev.segment_ptr must be set and dev.segment_first
 *        marks where loading starts
 * iov    IOV array to fill
 * iov_n  [in/out] on entry the number of usable entries (1..MPID_IOV_LIMIT),
 *        on return the number of entries filled
 *
 * Three outcomes after MPID_Segment_pack_vector():
 *   - everything up to segment_size was loaded: OnDataAvail becomes OnFinal;
 *   - only part was loaded but the average bytes per IOV entry is at least
 *     MPIDI_IOV_DENSITY_MIN: segment_first advances and the reload handler
 *     is installed;
 *   - otherwise the loaded pieces are copied into the request's send/receive
 *     buffer, more data is packed behind them, and a single IOV entry that
 *     covers the buffer is returned.
 *
 * Returns MPI_SUCCESS, or an error code when the buffer cannot be allocated
 * (also stored in sreq->status.MPI_ERROR).
 */
int MPIDI_CH3U_Request_load_send_iov(MPID_Request * const sreq, MPID_IOV * const iov,
                                     int * const iov_n)
{
    MPI_Aint last;
    int mpi_errno = MPI_SUCCESS;
    MPIDI_msg_sz_t bytes_left;
    int idx, staged;
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3U_REQUEST_LOAD_SEND_IOV);

    MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3U_REQUEST_LOAD_SEND_IOV);
    MPIU_Assert(sreq->dev.segment_ptr != NULL);
    last = sreq->dev.segment_size;
    MPIU_DBG_MSG_FMT(CH3_CHANNEL,VERBOSE,(MPIU_DBG_FDEST,
     "pre-pv: first=" MPIDI_MSG_SZ_FMT ", last=" MPIDI_MSG_SZ_FMT ", iov_n=%d",
                      sreq->dev.segment_first, last, *iov_n));
    MPIU_Assert(sreq->dev.segment_first < last);
    MPIU_Assert(last > 0);
    MPIU_Assert(*iov_n > 0 && *iov_n <= MPID_IOV_LIMIT);
    MPID_Segment_pack_vector(sreq->dev.segment_ptr, sreq->dev.segment_first,
                             &last, iov, iov_n);
    MPIU_DBG_MSG_FMT(CH3_CHANNEL,VERBOSE,(MPIU_DBG_FDEST,
    "post-pv: first=" MPIDI_MSG_SZ_FMT ", last=" MPIDI_MSG_SZ_FMT ", iov_n=%d",
                      sreq->dev.segment_first, last, *iov_n));
    MPIU_Assert(*iov_n > 0 && *iov_n <= MPID_IOV_LIMIT);

    /* Case 1: the rest of the segment fit into the IOV. */
    if (last == sreq->dev.segment_size)
    {
        MPIU_DBG_MSG(CH3_CHANNEL,VERBOSE,"remaining data loaded into IOV");
        sreq->dev.OnDataAvail = sreq->dev.OnFinal;
        goto fn_exit;
    }

    /* Case 2: partial load, but each IOV entry carries enough data. */
    if ((last - sreq->dev.segment_first) / *iov_n >= MPIDI_IOV_DENSITY_MIN)
    {
        MPIU_DBG_MSG(CH3_CHANNEL,VERBOSE,"more data loaded into IOV");
        sreq->dev.segment_first = last;
        sreq->dev.OnDataAvail = MPIDI_CH3_ReqHandler_SendReloadIOV;
        goto fn_exit;
    }

    /* Case 3: low density -- gather through the send/receive buffer. */
    MPIU_DBG_MSG(CH3_CHANNEL,VERBOSE,"low density. using SRBuf.");

    bytes_left = sreq->dev.segment_size - sreq->dev.segment_first;
    if (!MPIDI_Request_get_srbuf_flag(sreq))
    {
        MPIDI_CH3U_SRBuf_alloc(sreq, bytes_left);
        /* --BEGIN ERROR HANDLING-- */
        if (sreq->dev.tmpbuf_sz == 0)
        {
            MPIU_DBG_MSG(CH3_CHANNEL,TYPICAL,"SRBuf allocation failure");
            mpi_errno = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_FATAL,
                            FCNAME, __LINE__, MPI_ERR_OTHER, "**nomem",
                            "**nomem %d", bytes_left);
            sreq->status.MPI_ERROR = mpi_errno;
            goto fn_exit;
        }
        /* --END ERROR HANDLING-- */
    }

    /* Copy what pack_vector already described to the front of the buffer. */
    staged = 0;
    idx = 0;
    while (idx < *iov_n)
    {
        MPIU_Memcpy((char*) sreq->dev.tmpbuf + staged,
                    iov[idx].MPID_IOV_BUF, iov[idx].MPID_IOV_LEN);
        staged += iov[idx].MPID_IOV_LEN;
        idx++;
    }
    sreq->dev.segment_first = last;

    /* Pack further data behind the copied bytes: everything that is left if
     * it fits, otherwise as much as the buffer still has room for. */
    last = (bytes_left <= sreq->dev.tmpbuf_sz - staged) ?
        sreq->dev.segment_size :
        sreq->dev.segment_first + sreq->dev.tmpbuf_sz - staged;
    MPIU_DBG_MSG_FMT(CH3_CHANNEL,VERBOSE,(MPIU_DBG_FDEST,
                   "pre-pack: first=" MPIDI_MSG_SZ_FMT ", last=" MPIDI_MSG_SZ_FMT,
                      sreq->dev.segment_first, last));
    MPID_Segment_pack(sreq->dev.segment_ptr, sreq->dev.segment_first,
                      &last, (char*) sreq->dev.tmpbuf + staged);
    MPIU_DBG_MSG_FMT(CH3_CHANNEL,VERBOSE,(MPIU_DBG_FDEST,
                  "post-pack: first=" MPIDI_MSG_SZ_FMT ", last=" MPIDI_MSG_SZ_FMT,
                       sreq->dev.segment_first, last));

    /* The caller gets one entry spanning the copied plus the packed bytes. */
    iov[0].MPID_IOV_BUF = (MPID_IOV_BUF_CAST)sreq->dev.tmpbuf;
    iov[0].MPID_IOV_LEN = last - sreq->dev.segment_first + staged;
    *iov_n = 1;

    if (last != sreq->dev.segment_size)
    {
        MPIU_DBG_MSG(CH3_CHANNEL,VERBOSE,"more data packed into SRBuf");
        sreq->dev.segment_first = last;
        sreq->dev.OnDataAvail = MPIDI_CH3_ReqHandler_SendReloadIOV;
    }
    else
    {
        MPIU_DBG_MSG(CH3_CHANNEL,VERBOSE,"remaining data packed into SRBuf");
        sreq->dev.OnDataAvail = sreq->dev.OnFinal;
    }

  fn_exit:
    MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3U_REQUEST_LOAD_SEND_IOV);
    return mpi_errno;
}
/*
 * MPIDO_Gatherv_simple - gatherv through the first PAMI_XFER_GATHERV_INT
 * algorithm registered for the communicator.
 *
 * sendbuf, sendcount, sendtype   local contribution (sendbuf may be
 *                                MPI_IN_PLACE)
 * recvbuf, recvcounts, displs,
 * recvtype                       receive description; read on the root only,
 *                                except that recvtype is also queried when
 *                                sendbuf is MPI_IN_PLACE
 * root                           rank of the gathering process
 * comm_ptr                       communicator
 * mpierrno                       forwarded to MPIR_Gatherv when the collective
 *                                selection advisor asks for the external
 *                                algorithm; otherwise not written here
 *
 * A non-contiguous send buffer is packed into a temporary byte buffer. When
 * the receive type cannot be expressed as a PAMI type, the root receives
 * bytes into a temporary buffer (with byte counts/displacements derived from
 * recvcounts) and unpacks them into recvbuf afterwards.
 *
 * Returns MPI_SUCCESS, the result of MPIR_Gatherv when that path is taken,
 * or -1 when MPI_IN_PLACE is used without PAMI_IN_PLACE support.
 */
int MPIDO_Gatherv_simple(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
                         void *recvbuf, const int *recvcounts, const int *displs,
                         MPI_Datatype recvtype, int root, MPID_Comm * comm_ptr,
                         int *mpierrno)
{
#ifndef HAVE_PAMI_IN_PLACE
    if (sendbuf == MPI_IN_PLACE)
    {
        MPID_Abort (NULL, 0, 1, "'MPI_IN_PLACE' requries support for `PAMI_IN_PLACE`");
        return -1;
    }
#endif
    TRACE_ERR("Entering MPIDO_Gatherv_optimized\n");
    int snd_contig = 1, rcv_contig = 1;
    void *snd_noncontig_buff = NULL, *rcv_noncontig_buff = NULL;
    void *sbuf = NULL, *rbuf = NULL;
    int *rcounts = NULL;
    int *rdispls = NULL;
    int send_size = 0;
    int recv_size = 0;
    int rcvlen = 0;
    int totalrecvcount = 0;
    pami_type_t rtype = PAMI_TYPE_NULL;
    MPID_Segment segment;
    MPID_Datatype *data_ptr = NULL;
    int send_true_lb = 0, recv_true_lb = 0;
    int i, tmp;
    volatile unsigned gatherv_active = 1;
    const int rank = comm_ptr->rank;
    const int size = comm_ptr->local_size;
#if ASSERT_LEVEL==0
    /* We can't afford the tracing in ndebug/performance libraries */
    const unsigned verbose = 0;
#else
    const unsigned verbose = (MPIDI_Process.verbose >= MPIDI_VERBOSE_DETAILS_ALL) && (rank == 0);
#endif
    const struct MPIDI_Comm* const mpid = &(comm_ptr->mpid);
    int recvok=PAMI_SUCCESS, recvcontinuous=0;

    if(sendbuf != MPI_IN_PLACE)
    {
        MPIDI_Datatype_get_info(sendcount, sendtype, snd_contig, send_size, data_ptr, send_true_lb);
        if(MPIDI_Pamix_collsel_advise != NULL && mpid->collsel_fast_query != NULL)
        {
            advisor_algorithm_t advisor_algorithms[1];
            int num_algorithms = MPIDI_Pamix_collsel_advise(mpid->collsel_fast_query, PAMI_XFER_GATHERV_INT, 64, advisor_algorithms, 1);
            if(num_algorithms)
            {
                if(advisor_algorithms[0].algorithm_type == COLLSEL_EXTERNAL_ALGO)
                {
                    return MPIR_Gatherv(sendbuf, sendcount, sendtype,
                                        recvbuf, recvcounts, displs, recvtype,
                                        root, comm_ptr, mpierrno);
                }
                else if(advisor_algorithms[0].metadata && advisor_algorithms[0].metadata->check_correct.values.asyncflowctl && !(--(comm_ptr->mpid.num_requests)))
                {
                    comm_ptr->mpid.num_requests = MPIDI_Process.optimized.num_requests;
                    int tmpmpierrno;
                    if(unlikely(verbose))
                        fprintf(stderr,"Query barrier required for %s\n", advisor_algorithms[0].metadata->name);
                    MPIDO_Barrier(comm_ptr, &tmpmpierrno);
                }
            }
        }

        sbuf = (char *)sendbuf + send_true_lb;
        if(!snd_contig)
        {
            /* Pack the local contribution into a contiguous byte buffer. */
            snd_noncontig_buff = MPL_malloc(send_size);
            sbuf = snd_noncontig_buff;
            if(snd_noncontig_buff == NULL)
            {
                MPID_Abort(NULL, MPI_ERR_NO_SPACE, 1,
                           "Fatal: Cannot allocate pack buffer");
            }
            DLOOP_Offset last = send_size;
            MPID_Segment_init(sendbuf, sendcount, sendtype, &segment, 0);
            MPID_Segment_pack(&segment, 0, &last, snd_noncontig_buff);
        }
    }
    else
    {
        MPIDI_Datatype_get_info(1, recvtype, rcv_contig, rcvlen, data_ptr, recv_true_lb);
        if(MPIDI_Pamix_collsel_advise != NULL && mpid->collsel_fast_query != NULL)
        {
            advisor_algorithm_t advisor_algorithms[1];
            int num_algorithms = MPIDI_Pamix_collsel_advise(mpid->collsel_fast_query, PAMI_XFER_GATHERV_INT, 64, advisor_algorithms, 1);
            if(num_algorithms)
            {
                if(advisor_algorithms[0].algorithm_type == COLLSEL_EXTERNAL_ALGO)
                {
                    return MPIR_Gatherv(sendbuf, sendcount, sendtype,
                                        recvbuf, recvcounts, displs, recvtype,
                                        root, comm_ptr, mpierrno);
                }
                else if(advisor_algorithms[0].metadata && advisor_algorithms[0].metadata->check_correct.values.asyncflowctl && !(--(comm_ptr->mpid.num_requests)))
                {
                    comm_ptr->mpid.num_requests = MPIDI_Process.optimized.num_requests;
                    int tmpmpierrno;
                    if(unlikely(verbose))
                        fprintf(stderr,"Query barrier required for %s\n", advisor_algorithms[0].metadata->name);
                    MPIDO_Barrier(comm_ptr, &tmpmpierrno);
                }
            }
        }
    }

    pami_xfer_t gatherv;
    rbuf = (char *)recvbuf + recv_true_lb;
    /* By default the caller's arrays are handed to PAMI as they are. */
    rcounts = (int*)recvcounts;
    rdispls = (int*)displs;
    if(rank == root)
    {
        if((recvok = MPIDI_Datatype_to_pami(recvtype, &rtype, -1, NULL, &tmp)) != MPI_SUCCESS)
        {
            /* recvtype has no PAMI equivalent: receive raw bytes into a
             * temporary buffer using byte counts and displacements. */
            MPIDI_Datatype_get_info(1, recvtype, rcv_contig, rcvlen, data_ptr, recv_true_lb);
            totalrecvcount = recvcounts[0];
            recvcontinuous = displs[0] == 0? 1 : 0 ;
            /* One int per rank (the element size was previously missing
             * from these allocations). */
            rcounts = (int*)MPL_malloc((size_t)size * sizeof(int));
            rdispls = (int*)MPL_malloc((size_t)size * sizeof(int));
            if(rcounts == NULL || rdispls == NULL)
            {
                MPID_Abort(NULL, MPI_ERR_NO_SPACE, 1,
                           "Fatal: Cannot allocate pack buffer");
            }
            rdispls[0] = 0;
            rcounts[0] = rcvlen * recvcounts[0];
            for(i = 1; i < size; i++)
            {
                rdispls[i]= rcvlen * totalrecvcount;
                totalrecvcount += recvcounts[i];
                /* The user layout is "continuous" only while every block
                 * starts right where the previous one ended. */
                if(displs[i] != (displs[i-1] + recvcounts[i-1]))
                    recvcontinuous = 0;
                rcounts[i] = rcvlen * recvcounts[i];
            }
            recv_size = rcvlen * totalrecvcount;

            rcv_noncontig_buff = MPL_malloc(recv_size);
            rbuf = rcv_noncontig_buff;
            rtype = PAMI_TYPE_BYTE;
            if(rcv_noncontig_buff == NULL)
            {
                MPID_Abort(NULL, MPI_ERR_NO_SPACE, 1,
                           "Fatal: Cannot allocate pack buffer");
            }
            if(sendbuf == MPI_IN_PLACE)
            {
                /* The root's own block already lives in recvbuf; copy it
                 * into its slot of the temporary buffer. */
                size_t extent;
                MPID_Datatype_get_extent_macro(recvtype,extent);
                MPIR_Localcopy((char *)recvbuf + displs[rank]*extent, recvcounts[rank], recvtype,
                               (char *)rcv_noncontig_buff + rdispls[rank], rcounts[rank],MPI_CHAR);
            }
        }
        if(sendbuf == MPI_IN_PLACE)
        {
            gatherv.cmd.xfer_gatherv_int.sndbuf = PAMI_IN_PLACE;
        }
        else
        {
            gatherv.cmd.xfer_gatherv_int.sndbuf = sbuf;
        }
        gatherv.cmd.xfer_gatherv_int.stype = PAMI_TYPE_BYTE;/* stype is ignored when sndbuf == PAMI_IN_PLACE */
        gatherv.cmd.xfer_gatherv_int.stypecount = send_size;
    }
    else
    {
        gatherv.cmd.xfer_gatherv_int.sndbuf = sbuf;
        gatherv.cmd.xfer_gatherv_int.stype = PAMI_TYPE_BYTE;
        gatherv.cmd.xfer_gatherv_int.stypecount = send_size;
    }

    gatherv.cb_done = cb_gatherv;
    gatherv.cookie = (void *)&gatherv_active;
    gatherv.cmd.xfer_gatherv_int.root = MPIDI_Task_to_endpoint(MPID_VCR_GET_LPID(comm_ptr->vcr, root), 0);
    gatherv.cmd.xfer_gatherv_int.rcvbuf = rbuf;
    gatherv.cmd.xfer_gatherv_int.rtype = rtype;
    gatherv.cmd.xfer_gatherv_int.rtypecounts = (int *) rcounts;
    gatherv.cmd.xfer_gatherv_int.rdispls = (int *) rdispls;

    const pami_metadata_t *my_gatherv_md;

    gatherv.algorithm = mpid->coll_algorithm[PAMI_XFER_GATHERV_INT][0][0];
    my_gatherv_md = &mpid->coll_metadata[PAMI_XFER_GATHERV_INT][0][0];

    MPIDI_Update_last_algorithm(comm_ptr, my_gatherv_md->name);

    MPIDI_Post_coll_t gatherv_post;
    TRACE_ERR("%s gatherv\n", MPIDI_Process.context_post.active>0?"Posting":"Invoking");
    MPIDI_Context_post(MPIDI_Context[0], &gatherv_post.state,
                       MPIDI_Pami_post_wrapper, (void *)&gatherv);
    TRACE_ERR("Gatherv %s\n", MPIDI_Process.context_post.active>0?"posted":"invoked");

    TRACE_ERR("Waiting on active %d\n", gatherv_active);
    /* gatherv_active is the cookie handed to cb_gatherv above. */
    MPID_PROGRESS_WAIT_WHILE(gatherv_active);

    /* Unpack and release the temporary receive state only if it was really
     * created above. Keying this on the contiguity flag could reach this
     * code with rcounts/rdispls still aliasing the caller's arrays, which
     * must never be freed here. */
    if(rcv_noncontig_buff != NULL)
    {
        if(recvcontinuous)
        {
            MPIR_Localcopy(rcv_noncontig_buff, recv_size, MPI_CHAR,
                           recvbuf, totalrecvcount, recvtype);
        }
        else
        {
            size_t extent;
            MPID_Datatype_get_extent_macro(recvtype,extent);
            for(i=0; i<size; ++i)
            {
                char* scbuf = (char*)rcv_noncontig_buff+ rdispls[i];
                char* rcbuf = (char*)recvbuf + displs[i]*extent;
                MPIR_Localcopy(scbuf, rcounts[i], MPI_CHAR,
                               rcbuf, recvcounts[i], recvtype);
                TRACE_ERR("Pack recv src extent %zu, displ[%zu]=%zu, count[%zu]=%zu buf[%zu]=%u\n",
                          (size_t)extent, (size_t)i,(size_t)rdispls[i],(size_t)i,(size_t)rcounts[i],(size_t)rdispls[i], *(int*)scbuf);
                TRACE_ERR("Pack recv dest extent %zu, displ[%zu]=%zu, count[%zu]=%zu buf[%zu]=%u\n",
                          (size_t)extent, (size_t)i,(size_t)displs[i],(size_t)i,(size_t)recvcounts[i],(size_t)displs[i], *(int*)rcbuf);
            }
        }
        MPL_free(rcv_noncontig_buff);
        MPL_free(rcounts);
        MPL_free(rdispls);
    }
    if(!snd_contig) MPL_free(snd_noncontig_buff);

    TRACE_ERR("Leaving MPIDO_Gatherv_optimized\n");
    return MPI_SUCCESS;
}
/*
 * MPIR_Pack_impl - pack incount elements of datatype from inbuf into outbuf,
 * starting at byte offset *position, and advance *position by the number of
 * bytes written.
 *
 * inbuf     source buffer
 * incount   number of elements to pack; 0 is a no-op
 * datatype  datatype of the elements
 * outbuf    destination buffer
 * outsize   size of outbuf (not consulted by this function)
 * position  [in/out] current byte offset into outbuf
 *
 * Returns MPI_SUCCESS or an MPI error code (segment allocation or
 * initialisation failure).
 */
int MPIR_Pack_impl(const void *inbuf, int incount, MPI_Datatype datatype,
                   void *outbuf, MPI_Aint outsize, MPI_Aint *position)
{
    int mpi_errno = MPI_SUCCESS;
    MPI_Aint first, last;
    MPID_Segment *segp;
    int contig;
    MPI_Aint dt_true_lb;
    MPI_Aint data_sz;

    if (incount == 0) {
        goto fn_exit;
    }

    /* Handle contig case quickly */
    if (HANDLE_GET_KIND(datatype) == HANDLE_KIND_BUILTIN) {
        contig = TRUE;
        dt_true_lb = 0;
        /* Widen before multiplying so the product is formed in MPI_Aint. */
        data_sz = (MPI_Aint) incount * MPID_Datatype_get_basic_size(datatype);
    } else {
        MPID_Datatype *dt_ptr;
        MPID_Datatype_get_ptr(datatype, dt_ptr);
        contig = dt_ptr->is_contig;
        dt_true_lb = dt_ptr->true_lb;
        data_sz = (MPI_Aint) incount * dt_ptr->size;
    }

    if (contig) {
        MPIU_Memcpy((char *) outbuf + *position, (char *)inbuf + dt_true_lb, data_sz);
        /* position is an MPI_Aint; do not squeeze the new offset through int. */
        *position = *position + data_sz;
        goto fn_exit;
    }

    /* non-contig case */

    /* TODO: CHECK RETURN VALUES?? */
    /* TODO: SHOULD THIS ALL BE IN A MPID_PACK??? */
    segp = MPID_Segment_alloc();
    MPIU_ERR_CHKANDJUMP1(segp == NULL, mpi_errno, MPI_ERR_OTHER, "**nomem", "**nomem %s", "MPID_Segment");

    mpi_errno = MPID_Segment_init(inbuf, incount, datatype, segp, 0);
    if (mpi_errno) {
        /* Release the segment before leaving through the error path. */
        MPID_Segment_free(segp);
        MPIU_ERR_POP(mpi_errno);
    }

    /* NOTE: the use of buffer values and positions in MPI_Pack and in
     * MPID_Segment_pack are quite different. See code or docs or something.
     */
    first = 0;
    last = SEGMENT_IGNORE_LAST;

    /* Ensure that pointer increment fits in a pointer */
    MPID_Ensure_Aint_fits_in_pointer((MPI_VOID_PTR_CAST_TO_MPI_AINT outbuf) +
                                     (MPI_Aint) *position);

    MPID_Segment_pack(segp, first, &last, (void *) ((char *) outbuf + *position));

    /* Ensure that calculation fits into an int datatype. */
    MPID_Ensure_Aint_fits_in_int((MPI_Aint)*position + last);

    *position = *position + last;

    MPID_Segment_free(segp);

  fn_exit:
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}
/*
 * MPIDO_Scatterv_simple - scatterv through the first PAMI_XFER_SCATTERV_INT
 * algorithm registered for the communicator.
 *
 * sendbuf, sendcounts, displs,
 * sendtype                      send description; read on the root only
 * recvbuf, recvcount, recvtype  local receive description (recvbuf may be
 *                               MPI_IN_PLACE on the root)
 * root                          rank of the scattering process
 * comm_ptr                      communicator
 * mpierrno                      forwarded to MPIR_Scatterv when the collective
 *                               selection advisor asks for the external
 *                               algorithm; otherwise not written here
 *
 * On the root, a send type that has no PAMI equivalent and is not contiguous
 * is packed into a temporary byte buffer with byte counts/displacements. A
 * non-contiguous receive type is received as bytes into a temporary buffer
 * and unpacked into recvbuf afterwards.
 *
 * Returns MPI_SUCCESS, the result of MPIR_Scatterv when that path is taken,
 * or -1 when MPI_IN_PLACE is used without PAMI_IN_PLACE support.
 */
int MPIDO_Scatterv_simple(const void *sendbuf, const int *sendcounts, const int *displs,
                          MPI_Datatype sendtype, void *recvbuf, int recvcount,
                          MPI_Datatype recvtype, int root, MPID_Comm *comm_ptr,
                          int *mpierrno)
{
#ifndef HAVE_PAMI_IN_PLACE
    if (sendbuf == MPI_IN_PLACE)
    {
        MPID_Abort (NULL, 0, 1, "'MPI_IN_PLACE' requries support for `PAMI_IN_PLACE`");
        return -1;
    }
#endif
    int snd_contig = 1;
    int rcv_contig = 1;
    int send_size = 0, recv_size = 0;
    int ssize = 0;
    MPID_Datatype *dt_ptr = NULL;
    MPI_Aint send_true_lb=0, recv_true_lb=0;
    void *snd_noncontig_buff = NULL, *rcv_noncontig_buff = NULL;
    void *sbuf = NULL, *rbuf = NULL;
    int *sdispls = NULL, *scounts = NULL;
    int sndcount = 0;
    MPID_Segment segment;
    int tmp, i;
    pami_type_t stype = PAMI_TYPE_NULL;
    const int rank = comm_ptr->rank;
    const int size = comm_ptr->local_size;
    const struct MPIDI_Comm* const mpid = &(comm_ptr->mpid);

    if (rank == root && sendtype != MPI_DATATYPE_NULL && sendcounts[0] >= 0)
    {
        MPIDI_Datatype_get_info(1, sendtype, snd_contig, ssize, dt_ptr, send_true_lb);
        if(MPIDI_Pamix_collsel_advise != NULL && mpid->collsel_fast_query != NULL)
        {
            advisor_algorithm_t advisor_algorithms[1];
            int num_algorithms = MPIDI_Pamix_collsel_advise(mpid->collsel_fast_query, PAMI_XFER_SCATTERV_INT, 64, advisor_algorithms, 1);
            if(num_algorithms)
            {
                if(advisor_algorithms[0].algorithm_type == COLLSEL_EXTERNAL_ALGO)
                {
                    return MPIR_Scatterv(sendbuf, sendcounts, displs, sendtype,
                                         recvbuf, recvcount, recvtype,
                                         root, comm_ptr, mpierrno);
                }
                else if(advisor_algorithms[0].metadata && advisor_algorithms[0].metadata->check_correct.values.asyncflowctl && !(--(comm_ptr->mpid.num_requests)))
                {
                    comm_ptr->mpid.num_requests = MPIDI_Process.optimized.num_requests;
                    int tmpmpierrno;
                    MPIDO_Barrier(comm_ptr, &tmpmpierrno);
                }
            }
        }
    }

    if (recvtype != MPI_DATATYPE_NULL && recvcount >= 0)
    {
        MPIDI_Datatype_get_info(recvcount, recvtype, rcv_contig, recv_size, dt_ptr, recv_true_lb);
        if(MPIDI_Pamix_collsel_advise != NULL && mpid->collsel_fast_query != NULL)
        {
            advisor_algorithm_t advisor_algorithms[1];
            int num_algorithms = MPIDI_Pamix_collsel_advise(mpid->collsel_fast_query, PAMI_XFER_SCATTERV_INT, 64, advisor_algorithms, 1);
            if(num_algorithms)
            {
                if(advisor_algorithms[0].algorithm_type == COLLSEL_EXTERNAL_ALGO)
                {
                    return MPIR_Scatterv(sendbuf, sendcounts, displs, sendtype,
                                         recvbuf, recvcount, recvtype,
                                         root, comm_ptr, mpierrno);
                }
                else if(advisor_algorithms[0].metadata && advisor_algorithms[0].metadata->check_correct.values.asyncflowctl && !(--(comm_ptr->mpid.num_requests)))
                {
                    comm_ptr->mpid.num_requests = MPIDI_Process.optimized.num_requests;
                    int tmpmpierrno;
                    MPIDO_Barrier(comm_ptr, &tmpmpierrno);
                }
            }
        }
    }

    pami_xfer_t scatterv;
    const pami_metadata_t *my_scatterv_md;
    volatile unsigned scatterv_active = 1;

    sbuf = (char *)sendbuf + send_true_lb;
    rbuf = (char *)recvbuf + recv_true_lb;
    /* By default the caller's arrays are handed to PAMI as they are. */
    scounts = (int*)sendcounts;
    sdispls = (int*)displs;
    if(rank == root)
    {
        if(MPIDI_Datatype_to_pami(sendtype, &stype, -1, NULL, &tmp) != MPI_SUCCESS)
        {
            if (!snd_contig)
            {
                /* One int per rank (the element size was previously missing
                 * from these allocations). */
                scounts = (int*)MPIU_Malloc((size_t)size * sizeof(int));
                sdispls = (int*)MPIU_Malloc((size_t)size * sizeof(int));
                if(scounts == NULL || sdispls == NULL)
                {
                    MPID_Abort(NULL, MPI_ERR_NO_SPACE, 1,
                               "Fatal: Cannot allocate pack buffer");
                }
                /* Convert element counts/displacements to bytes. */
                for(i = 0; i < size; i++)
                {
                    scounts[i] = ssize * sendcounts[i];
                    sdispls[i] = ssize * displs[i];
                    send_size += scounts[i];
                    sndcount += sendcounts[i];
                }
                snd_noncontig_buff = MPIU_Malloc(send_size);
                sbuf = snd_noncontig_buff;
                stype = PAMI_TYPE_BYTE;
                if(snd_noncontig_buff == NULL)
                {
                    MPID_Abort(NULL, MPI_ERR_NO_SPACE, 1,
                               "Fatal: Cannot allocate pack buffer");
                }
                /* NOTE(review): the pack takes sndcount consecutive elements
                 * starting at sendbuf and does not consult displs, while
                 * sdispls is derived from displs -- confirm that callers
                 * only reach this path with gap-free displacements. */
                DLOOP_Offset last = send_size;
                MPID_Segment_init(sendbuf, sndcount, sendtype, &segment, 0);
                MPID_Segment_pack(&segment, 0, &last, snd_noncontig_buff);
            }
        }
        if(recvbuf == MPI_IN_PLACE)
        {
            rbuf = PAMI_IN_PLACE;
        }
    }

    if(recvbuf != MPI_IN_PLACE)
    {
        if (!rcv_contig)
        {
            rcv_noncontig_buff = MPIU_Malloc(recv_size);
            rbuf = rcv_noncontig_buff;
            if(rcv_noncontig_buff == NULL)
            {
                MPID_Abort(NULL, MPI_ERR_NO_SPACE, 1,
                           "Fatal: Cannot allocate pack buffer");
            }
        }
    }

    scatterv.cb_done = cb_scatterv;
    scatterv.cookie = (void *)&scatterv_active;
    scatterv.cmd.xfer_scatterv_int.root = MPIDI_Task_to_endpoint(MPID_VCR_GET_LPID(comm_ptr->vcr, root), 0);

    scatterv.algorithm = mpid->coll_algorithm[PAMI_XFER_SCATTERV_INT][0][0];
    my_scatterv_md = &mpid->coll_metadata[PAMI_XFER_SCATTERV_INT][0][0];

    scatterv.cmd.xfer_scatterv_int.rcvbuf = rbuf;
    scatterv.cmd.xfer_scatterv_int.sndbuf = sbuf;
    scatterv.cmd.xfer_scatterv_int.stype = stype;
    scatterv.cmd.xfer_scatterv_int.rtype = PAMI_TYPE_BYTE;/* rtype is ignored when rcvbuf == PAMI_IN_PLACE */
    scatterv.cmd.xfer_scatterv_int.stypecounts = (int *) scounts;
    scatterv.cmd.xfer_scatterv_int.rtypecount = recv_size;
    scatterv.cmd.xfer_scatterv_int.sdispls = (int *) sdispls;

    MPIDI_Update_last_algorithm(comm_ptr, my_scatterv_md->name);

    MPIDI_Post_coll_t scatterv_post;
    TRACE_ERR("%s scatterv\n", MPIDI_Process.context_post.active>0?"Posting":"Invoking");
    MPIDI_Context_post(MPIDI_Context[0], &scatterv_post.state,
                       MPIDI_Pami_post_wrapper, (void *)&scatterv);

    TRACE_ERR("Waiting on active %d\n", scatterv_active);
    /* scatterv_active is the cookie handed to cb_scatterv above. */
    MPID_PROGRESS_WAIT_WHILE(scatterv_active);

    /* Unpack/release only what this call actually allocated. The contiguity
     * flags alone are not sufficient: they can report "non-contiguous"
     * without any temporary having been created (MPI_IN_PLACE receive, or a
     * send type that converted to a PAMI type), in which case scounts and
     * sdispls still alias the caller's arrays and must not be freed. */
    if(rcv_noncontig_buff != NULL)
    {
        MPIR_Localcopy(rcv_noncontig_buff, recv_size, MPI_CHAR,
                       recvbuf, recvcount, recvtype);
        MPIU_Free(rcv_noncontig_buff);
    }
    if(snd_noncontig_buff != NULL)
    {
        MPIU_Free(snd_noncontig_buff);
        MPIU_Free(scounts);
        MPIU_Free(sdispls);
    }

    TRACE_ERR("Leaving MPIDO_Scatterv_optimized\n");
    return MPI_SUCCESS;
}
int MPID_nem_ib_lmt_switch_send(struct MPIDI_VC *vc, struct MPID_Request *req) { int mpi_errno = MPI_SUCCESS; int dt_contig; MPIDI_msg_sz_t data_sz; MPID_Datatype *dt_ptr; MPI_Aint dt_true_lb; MPID_IOV r_cookie = req->ch.lmt_tmp_cookie; MPID_nem_ib_lmt_cookie_t *r_cookie_buf = r_cookie.iov_base; MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_LMT_SWITCH_SEND); MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_LMT_SWITCH_SEND); MPIDI_Datatype_get_info(req->dev.user_count, req->dev.datatype, dt_contig, data_sz, dt_ptr, dt_true_lb); void *write_from_buf; if (dt_contig) { write_from_buf = req->dev.user_buf; } else { /* see MPIDI_CH3_EagerNoncontigSend (in ch3u_eager.c) */ req->dev.segment_ptr = MPID_Segment_alloc(); MPIU_ERR_CHKANDJUMP((req->dev.segment_ptr == NULL), mpi_errno, MPI_ERR_OTHER, "**outofmemory"); MPID_Segment_init(req->dev.user_buf, req->dev.user_count, req->dev.datatype, req->dev.segment_ptr, 0); req->dev.segment_first = 0; req->dev.segment_size = data_sz; MPIDI_msg_sz_t last; last = req->dev.segment_size; /* segment_size is byte offset */ MPIU_Assert(last > 0); REQ_FIELD(req, lmt_pack_buf) = MPIU_Malloc(data_sz); MPIU_ERR_CHKANDJUMP(!REQ_FIELD(req, lmt_pack_buf), mpi_errno, MPI_ERR_OTHER, "**outofmemory"); MPID_Segment_pack(req->dev.segment_ptr, req->dev.segment_first, &last, (char *) (REQ_FIELD(req, lmt_pack_buf))); MPIU_Assert(last == req->dev.segment_size); write_from_buf = REQ_FIELD(req, lmt_pack_buf); } //assert(dt_true_lb == 0); uint8_t *tailp = (uint8_t *) ((uint8_t *) write_from_buf /*+ dt_true_lb */ + data_sz - sizeof(uint8_t)); #if 0 *is_end_flag_same = (r_cookie_buf->tail == *tailp) ? 
1 : 0; #else REQ_FIELD(req, lmt_receiver_tail) = r_cookie_buf->tail; REQ_FIELD(req, lmt_sender_tail) = *tailp; dprintf("lmt_switch_send,tail on sender=%02x,tail onreceiver=%02x,req=%p\n", *tailp, r_cookie_buf->tail, req); #ifdef MPID_NEM_IB_DEBUG_LMT uint8_t *tail_wordp = (uint8_t *) ((uint8_t *) write_from_buf + data_sz - sizeof(uint32_t) * 2); #endif dprintf("lmt_switch_send,tail on sender=%d\n", *tail_wordp); fflush(stdout); #endif fn_exit: MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_IB_LMT_SWITCH_SEND); return mpi_errno; fn_fail: goto fn_exit; }
/*
 * MPIDO_Allgather_simple - allgather through the first PAMI_XFER_ALLGATHER
 * algorithm registered for the communicator, always transferring bytes.
 *
 * sendbuf, sendcount, sendtype   local contribution (sendbuf may be
 *                                MPI_IN_PLACE)
 * recvbuf, recvcount, recvtype   receive description; recvcount is the number
 *                                of elements received from each rank
 * comm_ptr                       communicator
 * mpierrno                       forwarded to MPIR_Allgather when the
 *                                collective selection advisor asks for the
 *                                external algorithm; otherwise not written
 *
 * Non-contiguous send data is packed into a temporary buffer. For a
 * non-contiguous receive type the data of all ranks is received into a
 * temporary byte buffer and unpacked into recvbuf afterwards.
 *
 * Returns MPI_SUCCESS (also when there is nothing to transfer), the result
 * of MPIR_Allgather when that path is taken, or -1 when MPI_IN_PLACE is used
 * without PAMI_IN_PLACE support.
 */
int MPIDO_Allgather_simple(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
                           void *recvbuf, int recvcount, MPI_Datatype recvtype,
                           MPID_Comm * comm_ptr, int *mpierrno)
{
#ifndef HAVE_PAMI_IN_PLACE
    if (sendbuf == MPI_IN_PLACE)
    {
        MPID_Abort (NULL, 0, 1, "'MPI_IN_PLACE' requries support for `PAMI_IN_PLACE`");
        return -1;
    }
#endif
    /* *********************************
     * Check the nature of the buffers
     * *********************************
     */
    const struct MPIDI_Comm* const mpid = &(comm_ptr->mpid);
    MPID_Datatype * dt_null = NULL;
    void *snd_noncontig_buff = NULL, *rcv_noncontig_buff = NULL;
    MPI_Aint send_true_lb = 0;
    MPI_Aint recv_true_lb = 0;
    int snd_data_contig = 1, rcv_data_contig = 1;
    size_t send_size = 0;
    size_t recv_size = 0;
    MPID_Segment segment;
    volatile unsigned allgather_active = 1;
    const int rank = comm_ptr->rank;
    const int size = comm_ptr->local_size;
#if ASSERT_LEVEL==0
    /* We can't afford the tracing in ndebug/performance libraries */
    const unsigned verbose = 0;
#else
    const unsigned verbose = (MPIDI_Process.verbose >= MPIDI_VERBOSE_DETAILS_ALL) && (rank == 0);
#endif

    const pami_metadata_t *my_md;

    char *rbuf = NULL, *sbuf = NULL;

    /* Nothing to transfer. */
    if ((sendcount < 1 && sendbuf != MPI_IN_PLACE) || recvcount < 1)
        return MPI_SUCCESS;

    /* Gather datatype information */
    MPIDI_Datatype_get_info(recvcount, recvtype, rcv_data_contig,
                            recv_size, dt_null, recv_true_lb);

    /* Per-rank receive size; also the send size until sendtype is queried. */
    send_size = recv_size;

    if(MPIDI_Pamix_collsel_advise != NULL && mpid->collsel_fast_query != NULL)
    {
        advisor_algorithm_t advisor_algorithms[1];
        int num_algorithms = MPIDI_Pamix_collsel_advise(mpid->collsel_fast_query, PAMI_XFER_ALLGATHER, send_size, advisor_algorithms, 1);
        if(num_algorithms)
        {
            if(advisor_algorithms[0].algorithm_type == COLLSEL_EXTERNAL_ALGO)
            {
                return MPIR_Allgather(sendbuf, sendcount, sendtype,
                                      recvbuf, recvcount, recvtype,
                                      comm_ptr, mpierrno);
            }
            else if(advisor_algorithms[0].metadata && advisor_algorithms[0].metadata->check_correct.values.asyncflowctl && !(--(comm_ptr->mpid.num_requests)))
            {
                comm_ptr->mpid.num_requests = MPIDI_Process.optimized.num_requests;
                int tmpmpierrno;
                if(unlikely(verbose))
                    fprintf(stderr,"Query barrier required for %s\n", advisor_algorithms[0].metadata->name);
                MPIDO_Barrier(comm_ptr, &tmpmpierrno);
            }
        }
    }

    rbuf = (char *)recvbuf+recv_true_lb;

    if(!rcv_data_contig)
    {
        /* Temporary buffer for the bytes of all ranks. */
        rcv_noncontig_buff = MPL_malloc(recv_size * size);
        rbuf = rcv_noncontig_buff;
        if(rcv_noncontig_buff == NULL)
        {
            MPID_Abort(NULL, MPI_ERR_NO_SPACE, 1,
                       "Fatal: Cannot allocate pack buffer");
        }
        if(sendbuf == MPI_IN_PLACE)
        {
            /* The local block already lives in recvbuf; copy it into its
             * slot of the temporary buffer. */
            sbuf = PAMI_IN_PLACE;
            size_t extent;
            MPID_Datatype_get_extent_macro(recvtype,extent);
            MPIR_Localcopy((char *)recvbuf + ((size_t)rank*recvcount*extent), recvcount, recvtype,
                           (char *)rcv_noncontig_buff + (rank*recv_size), recv_size,MPI_CHAR);
        }
    }

    if(sendbuf != MPI_IN_PLACE)
    {
        MPIDI_Datatype_get_info(sendcount, sendtype, snd_data_contig,
                                send_size, dt_null, send_true_lb);
        sbuf = (char *)sendbuf+send_true_lb;
        if(!snd_data_contig)
        {
            snd_noncontig_buff = MPL_malloc(send_size);
            sbuf = snd_noncontig_buff;
            if(snd_noncontig_buff == NULL)
            {
                MPID_Abort(NULL, MPI_ERR_NO_SPACE, 1,
                           "Fatal: Cannot allocate pack buffer");
            }
            DLOOP_Offset last = send_size;
            MPID_Segment_init(sendbuf, sendcount, sendtype, &segment, 0);
            MPID_Segment_pack(&segment, 0, &last, snd_noncontig_buff);
        }
    }
    else
        sbuf = PAMI_IN_PLACE;

    TRACE_ERR("Using PAMI-level allgather protocol\n");
    pami_xfer_t allgather;
    allgather.cb_done = allgather_cb_done;
    allgather.cookie = (void *)&allgather_active;
    allgather.cmd.xfer_allgather.rcvbuf = rbuf;
    allgather.cmd.xfer_allgather.sndbuf = sbuf;
    allgather.cmd.xfer_allgather.stype = PAMI_TYPE_BYTE;/* stype is ignored when sndbuf == PAMI_IN_PLACE */
    allgather.cmd.xfer_allgather.rtype = PAMI_TYPE_BYTE;
    allgather.cmd.xfer_allgather.stypecount = send_size;
    allgather.cmd.xfer_allgather.rtypecount = recv_size;
    allgather.algorithm = mpid->coll_algorithm[PAMI_XFER_ALLGATHER][0][0];
    my_md = &mpid->coll_metadata[PAMI_XFER_ALLGATHER][0][0];

    TRACE_ERR("Calling PAMI_Collective with allgather structure\n");
    MPIDI_Post_coll_t allgather_post;
    MPIDI_Context_post(MPIDI_Context[0], &allgather_post.state, MPIDI_Pami_post_wrapper, (void *)&allgather);
    TRACE_ERR("Allgather %s\n", MPIDI_Process.context_post.active>0?"posted":"invoked");

    MPIDI_Update_last_algorithm(comm_ptr, my_md->name);

    /* allgather_active is the cookie handed to allgather_cb_done above. */
    MPID_PROGRESS_WAIT_WHILE(allgather_active);

    if(!rcv_data_contig)
    {
        /* The temporary buffer holds one recv_size block per rank, so the
         * destination must be described as recvcount elements per rank;
         * with recvcount alone only the first block would fit. */
        MPIR_Localcopy(rcv_noncontig_buff, recv_size * size, MPI_CHAR,
                       recvbuf, recvcount * size, recvtype);
        MPL_free(rcv_noncontig_buff);
    }
    if(!snd_data_contig) MPL_free(snd_noncontig_buff);

    TRACE_ERR("Allgather done\n");
    return MPI_SUCCESS;
}
/*
 * MPIDO_Bcast - broadcast through a PAMI collective, falling back to
 * MPIR_Bcast_intra() when PAMI is not selected or the protocol query fails.
 *
 * buffer, count, datatype, root, comm_ptr, mpierrno are forwarded unchanged
 * to MPIR_Bcast_intra() on every fallback path.
 *
 * Processing steps:
 *   1. Messages larger than BCAST_LIMIT bytes are split into recursive
 *      calls of at most BCAST_LIMIT bytes each.
 *   2. Non-contiguous data is packed into a temporary byte buffer (on the
 *      root) and unpacked after the collective (on the other ranks).
 *   3. The protocol is taken from the pre-selected "optimized" table or the
 *      user selection; if that protocol requires a query, the query result
 *      decides between PAMI and the MPICH algorithm.
 *
 * Returns the result of the recursive call / MPIR_Bcast_intra() on those
 * paths, and 0 after a completed PAMI broadcast.
 */
int MPIDO_Bcast(void *buffer,
                int count,
                MPI_Datatype datatype,
                int root,
                MPID_Comm *comm_ptr,
                int *mpierrno)
{
   TRACE_ERR("in mpido_bcast\n");
   const size_t BCAST_LIMIT = 0x40000000;
   int data_contig, rc;
   void *data_buffer = NULL, *noncontig_buff = NULL;
   volatile unsigned active = 1;
   MPI_Aint data_true_lb = 0;
   MPID_Datatype *data_ptr;
   MPID_Segment segment;
   MPIDI_Post_coll_t bcast_post;
   const struct MPIDI_Comm* const mpid = &(comm_ptr->mpid);
   const int rank = comm_ptr->rank;
#if ASSERT_LEVEL==0
   /* We can't afford the tracing in ndebug/performance libraries */
   const unsigned verbose = 0;
#else
   const unsigned verbose = (MPIDI_Process.verbose >= MPIDI_VERBOSE_DETAILS_ALL) && (rank == 0);
#endif
   const int selected_type = mpid->user_selected_type[PAMI_XFER_BROADCAST];

   /* Must calculate data_size based on count=1 in case its total size is > integer */
   int data_size_one;
   MPIDI_Datatype_get_info(1, datatype,
                           data_contig, data_size_one, data_ptr, data_true_lb);
   /* do this calculation once and use twice */
   const size_t data_size_sz = (size_t)data_size_one*(size_t)count;
   if(unlikely(verbose))
      fprintf(stderr,"bcast count %d, size %d (%#zX), root %d, buffer %p\n",
              count,data_size_one, (size_t)data_size_one*(size_t)count, root,buffer);

   if(unlikely( data_size_sz > BCAST_LIMIT) )
   {
      /* Too large for one PAMI call: broadcast new_count elements at a time,
         then the remainder. */
      void *new_buffer=buffer;
      int c, new_count = (int)BCAST_LIMIT/data_size_one;
      MPID_assert(new_count > 0);

      for(c=1; ((size_t)c*(size_t)new_count) <= (size_t)count; ++c)
      {
         if ((rc = MPIDO_Bcast(new_buffer,
                               new_count,
                               datatype,
                               root,
                               comm_ptr,
                               mpierrno)) != MPI_SUCCESS)
            return rc;
         new_buffer = (char*)new_buffer + (size_t)data_size_one*(size_t)new_count;
      }
      new_count = count % new_count; /* 0 is ok, just returns no-op */
      return MPIDO_Bcast(new_buffer,
                         new_count,
                         datatype,
                         root,
                         comm_ptr,
                         mpierrno);
   }

   /* Must use data_size based on count for byte bcast processing.
      Previously calculated as a size_t but large data_sizes were
      handled above so this cast to int should be fine here.
   */
   const int data_size = (int)data_size_sz;

   if(selected_type == MPID_COLL_USE_MPICH || data_size == 0)
   {
      if(unlikely(verbose))
         fprintf(stderr,"Using MPICH bcast algorithm\n");
      MPIDI_Update_last_algorithm(comm_ptr,"BCAST_MPICH");
      return MPIR_Bcast_intra(buffer, count, datatype, root, comm_ptr, mpierrno);
   }

   data_buffer = (char *)buffer + data_true_lb;

   if(!data_contig)
   {
      noncontig_buff = MPIU_Malloc(data_size);
      data_buffer = noncontig_buff;
      if(noncontig_buff == NULL)
      {
         MPID_Abort(NULL, MPI_ERR_NO_SPACE, 1,
                    "Fatal: Cannot allocate pack buffer");
      }
      if(rank == root)
      {
         DLOOP_Offset last = data_size;
         MPID_Segment_init(buffer, count, datatype, &segment, 0);
         MPID_Segment_pack(&segment, 0, &last, noncontig_buff);
      }
   }

   pami_xfer_t bcast;
   pami_algorithm_t my_bcast;
   const pami_metadata_t *my_md = (pami_metadata_t *)NULL;
   int queryreq = 0;

   bcast.cb_done = cb_bcast;
   bcast.cookie = (void *)&active;
   bcast.cmd.xfer_broadcast.root = MPIDI_Task_to_endpoint(MPID_VCR_GET_LPID(comm_ptr->vcr, root), 0);
   bcast.algorithm = mpid->user_selected[PAMI_XFER_BROADCAST];
   bcast.cmd.xfer_broadcast.buf = data_buffer;
   bcast.cmd.xfer_broadcast.type = PAMI_TYPE_BYTE;
   /* Needs to be sizeof(type)*count since we are using bytes as
    * the generic type */
   bcast.cmd.xfer_broadcast.typecount = data_size;

   if(selected_type == MPID_COLL_OPTIMIZED)
   {
      TRACE_ERR("Optimized bcast (%s) and (%s) were pre-selected\n",
                mpid->opt_protocol_md[PAMI_XFER_BROADCAST][0].name,
                mpid->opt_protocol_md[PAMI_XFER_BROADCAST][1].name);

      if(mpid->cutoff_size[PAMI_XFER_BROADCAST][1] != 0)/* SSS: There is FCA cutoff (FCA only sets cutoff for [PAMI_XFER_BROADCAST][1]) */
      {
         if(data_size <= mpid->cutoff_size[PAMI_XFER_BROADCAST][1])
         {
            my_bcast = mpid->opt_protocol[PAMI_XFER_BROADCAST][1];
            my_md = &mpid->opt_protocol_md[PAMI_XFER_BROADCAST][1];
            queryreq = mpid->must_query[PAMI_XFER_BROADCAST][1];
         }
         else
         {
            /* Falling back after the pack buffer may have been allocated:
               release it, MPIR_Bcast_intra works on the user buffer. */
            if(!data_contig)
               MPIU_Free(noncontig_buff);
            return MPIR_Bcast_intra(buffer, count, datatype, root, comm_ptr, mpierrno);
         }
      }

      if(data_size > mpid->cutoff_size[PAMI_XFER_BROADCAST][0])
      {
         my_bcast = mpid->opt_protocol[PAMI_XFER_BROADCAST][1];
         my_md = &mpid->opt_protocol_md[PAMI_XFER_BROADCAST][1];
         queryreq = mpid->must_query[PAMI_XFER_BROADCAST][1];
      }
      else
      {
         my_bcast = mpid->opt_protocol[PAMI_XFER_BROADCAST][0];
         my_md = &mpid->opt_protocol_md[PAMI_XFER_BROADCAST][0];
         queryreq = mpid->must_query[PAMI_XFER_BROADCAST][0];
      }
   }
   else
   {
      TRACE_ERR("Bcast (%s) was specified by user\n",
                mpid->user_metadata[PAMI_XFER_BROADCAST].name);
      my_bcast = mpid->user_selected[PAMI_XFER_BROADCAST];
      my_md = &mpid->user_metadata[PAMI_XFER_BROADCAST];
      queryreq = selected_type;
   }

   bcast.algorithm = my_bcast;

   if(unlikely(queryreq == MPID_COLL_ALWAYS_QUERY ||
               queryreq == MPID_COLL_CHECK_FN_REQUIRED))
   {
      metadata_result_t result = {0};
      TRACE_ERR("querying bcast protocol %s, type was: %d\n",
                my_md->name, queryreq);
      if(my_md->check_fn != NULL) /* calling the check fn is sufficient */
      {
         /* Store into the outer 'result': a second declaration here would
            shadow it and the verdict would never reach the bitmask test. */
         result = my_md->check_fn(&bcast);
         result.check.nonlocal = 0; /* #warning REMOVE THIS WHEN IMPLEMENTED */
      }
      else /* no check_fn, manually look at the metadata fields */
      {
         TRACE_ERR("Optimzed selection line %d\n",__LINE__);
         /* Check if the message range is restricted */
         if(my_md->check_correct.values.rangeminmax)
         {
            if((my_md->range_lo <= data_size) &&
               (my_md->range_hi >= data_size))
               ; /* ok, algorithm selected */
            else
            {
               result.check.range = 1;
               if(unlikely(verbose))
               {
                  fprintf(stderr,"message size (%u) outside range (%zu<->%zu) for %s.\n",
                          (unsigned)data_size,
                          my_md->range_lo,
                          my_md->range_hi,
                          my_md->name);
               }
            }
         }
         /* \todo check the rest of the metadata */
      }
      TRACE_ERR("bitmask: %#X\n", result.bitmask);
      if(result.bitmask)
      {
         if(unlikely(verbose))
            fprintf(stderr,"Using MPICH bcast algorithm - query fn failed\n");
         MPIDI_Update_last_algorithm(comm_ptr,"BCAST_MPICH");
         /* The pack buffer is not needed by the MPICH algorithm. */
         if(!data_contig)
            MPIU_Free(noncontig_buff);
         return MPIR_Bcast_intra(buffer, count, datatype, root, comm_ptr, mpierrno);
      }
      if(my_md->check_correct.values.asyncflowctl && !(--(comm_ptr->mpid.num_requests)))
      {
         /* Request budget used up: reset it and run a barrier. */
         comm_ptr->mpid.num_requests = MPIDI_Process.optimized.num_requests;
         int tmpmpierrno;
         if(unlikely(verbose))
            fprintf(stderr,"Query barrier required for %s\n", my_md->name);
         MPIDO_Barrier(comm_ptr, &tmpmpierrno);
      }
   }

   if(unlikely(verbose))
   {
      unsigned long long int threadID;
      MPIU_Thread_id_t tid;
      MPIU_Thread_self(&tid);
      threadID = (unsigned long long int)tid;
      fprintf(stderr,"<%llx> Using protocol %s for bcast on %u\n",
              threadID,
              my_md->name,
              (unsigned) comm_ptr->context_id);
   }

   MPIDI_Context_post(MPIDI_Context[0], &bcast_post.state, MPIDI_Pami_post_wrapper, (void *)&bcast);
   MPIDI_Update_last_algorithm(comm_ptr, my_md->name);
   /* 'active' is handed to the collective as its cookie (callback cb_bcast). */
   MPID_PROGRESS_WAIT_WHILE(active);
   TRACE_ERR("bcast done\n");

   if(!data_contig)
   {
      /* Non-root ranks received packed bytes: unpack into the user buffer. */
      if(rank != root)
         MPIR_Localcopy(noncontig_buff, data_size, MPI_CHAR,
                        buffer, count, datatype);
      MPIU_Free(noncontig_buff);
   }

   TRACE_ERR("leaving bcast\n");
   return 0;
}
int MPID_nem_ib_lmt_initiate_lmt(struct MPIDI_VC *vc, union MPIDI_CH3_Pkt *rts_pkt, struct MPID_Request *req) { int mpi_errno = MPI_SUCCESS; int dt_contig; MPIDI_msg_sz_t data_sz; MPID_Datatype *dt_ptr; MPI_Aint dt_true_lb; #if 0 MPID_nem_ib_vc_area *vc_ib = VC_IB(vc); #endif MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_LMT_INITIATE_LMT); MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_LMT_INITIATE_LMT); dprintf("lmt_initiate_lmt,enter,%d->%d,req=%p\n", MPID_nem_ib_myrank, vc->pg_rank, req); /* obtain dt_true_lb */ /* see MPIDI_Datatype_get_info(in, in, out, out, out, out) (in src/mpid/ch3/include/mpidimpl.h) */ MPIDI_Datatype_get_info(req->dev.user_count, req->dev.datatype, dt_contig, data_sz, dt_ptr, dt_true_lb); /* FIXME: who frees s_cookie_buf? */ /* malloc memory area for cookie. auto variable is NG because isend does not copy payload */ MPID_nem_ib_lmt_cookie_t *s_cookie_buf = (MPID_nem_ib_lmt_cookie_t *) MPIU_Malloc(sizeof(MPID_nem_ib_lmt_cookie_t)); /* remember address to "free" when receiving DONE from receiver */ req->ch.s_cookie = s_cookie_buf; /* see MPIDI_CH3_PktHandler_RndvClrToSend (in src/mpid/ch3/src/ch3u_rndv.c) */ //assert(dt_true_lb == 0); void *write_from_buf; if (dt_contig) { write_from_buf = (void *) ((char *) req->dev.user_buf + dt_true_lb); } else { /* see MPIDI_CH3_EagerNoncontigSend (in ch3u_eager.c) */ req->dev.segment_ptr = MPID_Segment_alloc(); MPIU_ERR_CHKANDJUMP((req->dev.segment_ptr == NULL), mpi_errno, MPI_ERR_OTHER, "**outofmemory"); MPID_Segment_init(req->dev.user_buf, req->dev.user_count, req->dev.datatype, req->dev.segment_ptr, 0); req->dev.segment_first = 0; req->dev.segment_size = data_sz; MPIDI_msg_sz_t last; last = req->dev.segment_size; /* segment_size is byte offset */ MPIU_Assert(last > 0); REQ_FIELD(req, lmt_pack_buf) = MPIU_Malloc((size_t) req->dev.segment_size); MPIU_ERR_CHKANDJUMP(!REQ_FIELD(req, lmt_pack_buf), mpi_errno, MPI_ERR_OTHER, "**outofmemory"); MPID_Segment_pack(req->dev.segment_ptr, req->dev.segment_first, &last, (char 
*) (REQ_FIELD(req, lmt_pack_buf))); MPIU_Assert(last == req->dev.segment_size); write_from_buf = REQ_FIELD(req, lmt_pack_buf); } dprintf ("lmt_initate_lmt,dt_contig=%d,write_from_buf=%p,req->dev.user_buf=%p,REQ_FIELD(req, lmt_pack_buf)=%p\n", dt_contig, write_from_buf, req->dev.user_buf, REQ_FIELD(req, lmt_pack_buf)); #ifdef HAVE_LIBDCFA #else s_cookie_buf->addr = write_from_buf; #endif /* put sz, see MPID_nem_lmt_RndvSend (in src/mpid/ch3/channels/nemesis/src/mpid_nem_lmt.c) */ /* TODO remove sz field * pkt_RTS_handler (in src/mpid/ch3/channels/nemesis/src/mpid_nem_lmt.c) * rreq->ch.lmt_data_sz = rts_pkt->data_sz; */ //s_cookie_buf->sz = (uint32_t)((MPID_nem_pkt_lmt_rts_t*)rts_pkt)->data_sz; /* preserve and put tail, because tail magic is written on the tail of payload * because we don't want to add another SGE or RDMA command */ MPIU_Assert(((MPID_nem_pkt_lmt_rts_t *) rts_pkt)->data_sz == data_sz); s_cookie_buf->tail = *((uint8_t *) ((uint8_t *) write_from_buf + data_sz - sizeof(uint8_t))); /* prepare magic */ //*((uint32_t*)(write_from_buf + data_sz - sizeof(tailmagic_t))) = MPID_NEM_IB_COM_MAGIC; #if 0 /* moving to packet header */ /* embed RDMA-write-to buffer occupancy information */ dprintf("lmt_initiate_lmt,rsr_seq_num_tail=%d\n", vc_ib->ibcom->rsr_seq_num_tail); /* embed RDMA-write-to buffer occupancy information */ s_cookie_buf->seq_num_tail = vc_ib->ibcom->rsr_seq_num_tail; /* remember the last one sent */ vc_ib->ibcom->rsr_seq_num_tail_last_sent = vc_ib->ibcom->rsr_seq_num_tail; #endif int post_num; uint32_t max_msg_sz; MPID_nem_ib_vc_area *vc_ib = VC_IB(vc); MPID_nem_ib_com_get_info_conn(vc_ib->sc->fd, MPID_NEM_IB_COM_INFOKEY_PATTR_MAX_MSG_SZ, &max_msg_sz, sizeof(uint32_t)); /* Type of max_msg_sz is uint32_t. 
*/ post_num = (data_sz + (long) max_msg_sz - 1) / (long) max_msg_sz; s_cookie_buf->max_msg_sz = max_msg_sz; s_cookie_buf->seg_seq_num = 1; s_cookie_buf->seg_num = post_num; REQ_FIELD(req, buf.from) = write_from_buf; REQ_FIELD(req, data_sz) = data_sz; REQ_FIELD(req, seg_seq_num) = 1; // only send 1st-segment, even if there are some segments. REQ_FIELD(req, seg_num) = post_num; REQ_FIELD(req, max_msg_sz) = max_msg_sz; long length; if (post_num > 1) { length = max_msg_sz; } else { length = data_sz; } /* put IB rkey */ struct MPID_nem_ib_com_reg_mr_cache_entry_t *mr_cache = MPID_nem_ib_com_reg_mr_fetch(write_from_buf, length, 0, MPID_NEM_IB_COM_REG_MR_GLOBAL); MPIU_ERR_CHKANDJUMP(!mr_cache, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_reg_mr_fetch"); struct ibv_mr *mr = mr_cache->mr; REQ_FIELD(req, lmt_mr_cache) = (void *) mr_cache; #ifdef HAVE_LIBDCFA s_cookie_buf->addr = (void *) mr->host_addr; dprintf("lmt_initiate_lmt,s_cookie_buf->addr=%p\n", s_cookie_buf->addr); #endif s_cookie_buf->rkey = mr->rkey; dprintf("lmt_initiate_lmt,tail=%02x,mem-tail=%p,%02x,sz=%ld,raddr=%p,rkey=%08x\n", s_cookie_buf->tail, write_from_buf + data_sz - sizeof(uint8_t), *((uint8_t *) (write_from_buf + data_sz - sizeof(uint8_t))), data_sz, s_cookie_buf->addr, s_cookie_buf->rkey); /* send cookie. rts_pkt as the MPI-header, s_cookie_buf as the payload */ MPID_nem_lmt_send_RTS(vc, (MPID_nem_pkt_lmt_rts_t *) rts_pkt, s_cookie_buf, sizeof(MPID_nem_ib_lmt_cookie_t)); fn_exit: MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_IB_LMT_INITIATE_LMT); return mpi_errno; fn_fail: goto fn_exit; }
/*
 * MPIDO_Bcast_simple - broadcast using the first PAMI broadcast algorithm
 * registered for the communicator (coll_algorithm[PAMI_XFER_BROADCAST][0][0]).
 *
 * If a collective-selection advisor is available and it recommends an
 * external algorithm for this message size, the call is forwarded to
 * MPIR_Bcast_intra() and its result is returned.  Otherwise non-contiguous
 * data is packed on the root, broadcast as bytes, and unpacked on the other
 * ranks.  Returns 0 after a completed PAMI broadcast.
 */
int MPIDO_Bcast_simple(void *buffer,
                       int count,
                       MPI_Datatype datatype,
                       int root,
                       MPID_Comm *comm_ptr,
                       int *mpierrno)
{
   TRACE_ERR("Entering MPIDO_Bcast_optimized\n");

   int data_contig;
   void *data_buffer = NULL, *noncontig_buff = NULL;
   volatile unsigned active = 1;
   MPI_Aint data_true_lb = 0;
   MPID_Datatype *data_ptr;
   MPID_Segment segment;
   MPIDI_Post_coll_t bcast_post;
   const struct MPIDI_Comm* const mpid = &(comm_ptr->mpid);
   const int rank = comm_ptr->rank;

   /* Must calculate data_size based on count=1 in case its total size is > integer */
   int data_size_one;
   MPIDI_Datatype_get_info(1, datatype,
                           data_contig, data_size_one, data_ptr, data_true_lb);

   /* Total byte count, multiplied in size_t so the product cannot overflow
      a signed int before it is handed to the advisor. */
   const size_t data_size_sz = (size_t)data_size_one*(size_t)count;

   if(MPIDI_Pamix_collsel_advise != NULL && mpid->collsel_fast_query != NULL)
   {
      advisor_algorithm_t advisor_algorithms[1];
      int num_algorithms = MPIDI_Pamix_collsel_advise(mpid->collsel_fast_query, PAMI_XFER_BROADCAST, data_size_sz, advisor_algorithms, 1);
      if(num_algorithms)
      {
         if(advisor_algorithms[0].algorithm_type == COLLSEL_EXTERNAL_ALGO)
         {
            return MPIR_Bcast_intra(buffer, count, datatype, root, comm_ptr, mpierrno);
         }
      }
   }

   const int data_size = (int)data_size_sz;

   data_buffer = (char *)buffer + data_true_lb;

   if(!data_contig)
   {
      noncontig_buff = MPIU_Malloc(data_size);
      data_buffer = noncontig_buff;
      if(noncontig_buff == NULL)
      {
         MPID_Abort(NULL, MPI_ERR_NO_SPACE, 1,
                    "Fatal: Cannot allocate pack buffer");
      }
      if(rank == root)
      {
         DLOOP_Offset last = data_size;
         MPID_Segment_init(buffer, count, datatype, &segment, 0);
         MPID_Segment_pack(&segment, 0, &last, noncontig_buff);
      }
   }

   pami_xfer_t bcast;
   const pami_metadata_t *my_bcast_md;

   bcast.cb_done = cb_bcast;
   bcast.cookie = (void *)&active;
   bcast.cmd.xfer_broadcast.root = MPIDI_Task_to_endpoint(MPID_VCR_GET_LPID(comm_ptr->vcr, root), 0);
   bcast.algorithm = mpid->coll_algorithm[PAMI_XFER_BROADCAST][0][0];
   bcast.cmd.xfer_broadcast.buf = data_buffer;
   bcast.cmd.xfer_broadcast.type = PAMI_TYPE_BYTE;
   /* Needs to be sizeof(type)*count since we are using bytes as
    * the generic type */
   bcast.cmd.xfer_broadcast.typecount = data_size;
   my_bcast_md = &mpid->coll_metadata[PAMI_XFER_BROADCAST][0][0];

   MPIDI_Context_post(MPIDI_Context[0], &bcast_post.state, MPIDI_Pami_post_wrapper, (void *)&bcast);
   MPIDI_Update_last_algorithm(comm_ptr, my_bcast_md->name);
   /* 'active' is handed to the collective as its cookie (callback cb_bcast). */
   MPID_PROGRESS_WAIT_WHILE(active);
   TRACE_ERR("bcast done\n");

   if(!data_contig)
   {
      /* Non-root ranks received packed bytes: unpack into the user buffer. */
      if(rank != root)
         MPIR_Localcopy(noncontig_buff, data_size, MPI_CHAR,
                        buffer, count, datatype);
      MPIU_Free(noncontig_buff);
   }

   TRACE_ERR("Exiting MPIDO_Bcast_optimized\n");
   return 0;
}
/*
 * MPIR_Localcopy - copy typed data from sendbuf to recvbuf within a process.
 *
 * The number of bytes moved is the send-side size; when error checking is
 * compiled in and the send side is larger than the receive side, a truncate
 * error is recorded and only the receive-side size is moved.
 *
 * Copy strategy:
 *   - both types builtin              -> direct memcpy of the raw buffers
 *   - both types contiguous           -> memcpy offset by each true lower bound
 *   - only the send type contiguous   -> unpack straight into recvbuf
 *   - only the recv type contiguous   -> pack straight into recvbuf
 *   - neither contiguous              -> pack/unpack through a staging buffer
 *                                        of COPY_BUFFER_SZ bytes
 *
 * Returns MPI_SUCCESS or the recorded error code.
 */
int MPIR_Localcopy(const void *sendbuf, MPI_Aint sendcount, MPI_Datatype sendtype,
                   void *recvbuf, MPI_Aint recvcount, MPI_Datatype recvtype)
{
    int mpi_errno = MPI_SUCCESS;
    int src_contig, dst_contig;
    MPI_Aint src_elem_sz, dst_elem_sz;
    MPI_Aint src_bytes, dst_bytes, nbytes;
    MPI_Aint extent_unused, src_lb, dst_lb;
    MPIU_CHKLMEM_DECL(1);
    MPID_MPI_STATE_DECL(MPID_STATE_MPIR_LOCALCOPY);

    MPID_MPI_FUNC_ENTER(MPID_STATE_MPIR_LOCALCOPY);

    MPID_Datatype_get_size_macro(sendtype, src_elem_sz);
    MPID_Datatype_get_size_macro(recvtype, dst_elem_sz);
    src_bytes = src_elem_sz * sendcount;
    dst_bytes = dst_elem_sz * recvcount;

    /* nothing to move on one side or the other: done */
    if (src_bytes == 0 || dst_bytes == 0)
        goto fn_exit;

    nbytes = src_bytes;
#if defined(HAVE_ERROR_CHECKING)
    if (src_bytes > dst_bytes) {
        MPIU_ERR_SET2(mpi_errno, MPI_ERR_TRUNCATE, "**truncate", "**truncate %d %d", src_bytes, dst_bytes);
        nbytes = dst_bytes;
    }
#endif /* HAVE_ERROR_CHECKING */

    /* Fast path for the common case of two builtin types */
    if ((HANDLE_GET_KIND(sendtype) == HANDLE_KIND_BUILTIN) &&
        HANDLE_GET_KIND(recvtype) == HANDLE_KIND_BUILTIN) {
        MPIU_Memcpy(recvbuf, sendbuf, nbytes);
        goto fn_exit;
    }

    MPIR_Datatype_iscontig(sendtype, &src_contig);
    MPIR_Datatype_iscontig(recvtype, &dst_contig);

    /* only the true lower bounds are used; the extent output is ignored */
    MPIR_Type_get_true_extent_impl(sendtype, &src_lb, &extent_unused);
    MPIR_Type_get_true_extent_impl(recvtype, &dst_lb, &extent_unused);

    if (src_contig) {
        if (dst_contig) {
#if defined(HAVE_ERROR_CHECKING)
            MPIU_ERR_CHKMEMCPYANDJUMP(mpi_errno,
                                      ((char *)recvbuf + dst_lb),
                                      ((char *)sendbuf + src_lb),
                                      nbytes);
#endif
            MPIU_Memcpy(((char *) recvbuf + dst_lb),
                        ((char *) sendbuf + src_lb),
                        nbytes);
        }
        else {
            /* contiguous source: unpack it directly into the receive layout */
            MPID_Segment dst_seg;
            MPI_Aint done;

            MPID_Segment_init(recvbuf, recvcount, recvtype, &dst_seg, 0);
            done = nbytes;
            MPID_Segment_unpack(&dst_seg, 0, &done, (char*)sendbuf + src_lb);
            MPIU_ERR_CHKANDJUMP(done != nbytes, mpi_errno, MPI_ERR_TYPE, "**dtypemismatch");
        }
    }
    else if (dst_contig) {
        /* contiguous destination: pack the send layout directly into it */
        MPID_Segment src_seg;
        MPI_Aint done;

        MPID_Segment_init(sendbuf, sendcount, sendtype, &src_seg, 0);
        done = nbytes;
        MPID_Segment_pack(&src_seg, 0, &done, (char*)recvbuf + dst_lb);
        MPIU_ERR_CHKANDJUMP(done != nbytes, mpi_errno, MPI_ERR_TYPE, "**dtypemismatch");
    }
    else {
        /* neither side contiguous: go through a bounded staging buffer */
        char * stage;
        MPIDI_msg_sz_t carry;      /* bytes left in stage from the previous round */
        MPID_Segment src_seg;
        MPIDI_msg_sz_t packed;     /* send-side offset reached so far */
        MPID_Segment dst_seg;
        MPIDI_msg_sz_t unpacked;   /* recv-side offset reached so far */

        MPIU_CHKLMEM_MALLOC(stage, char *, COPY_BUFFER_SZ, mpi_errno, "buf");

        MPID_Segment_init(sendbuf, sendcount, sendtype, &src_seg, 0);
        MPID_Segment_init(recvbuf, recvcount, recvtype, &dst_seg, 0);

        packed = 0;
        unpacked = 0;
        carry = 0;

        for (;;) {
            MPI_Aint last;
            char * stage_end;

            /* pack as much as fits behind the carried-over bytes */
            last = (nbytes - packed > COPY_BUFFER_SZ - carry)
                ? packed + (COPY_BUFFER_SZ - carry)
                : nbytes;

            MPID_Segment_pack(&src_seg, packed, &last, stage + carry);
            MPIU_Assert(last > packed);

            stage_end = stage + carry + (last - packed);
            packed = last;

            MPID_Segment_unpack(&dst_seg, unpacked, &last, stage);
            MPIU_Assert(last > unpacked);

            unpacked = last;

            if (unpacked == nbytes) {
                /* everything has been delivered */
                break;
            }

            /* the send side is exhausted but the receive side could not
               consume it all: the two datatypes do not match */
            MPIU_ERR_CHKANDJUMP(packed == nbytes, mpi_errno, MPI_ERR_TYPE, "**dtypemismatch");

            /* slide the bytes that were not unpacked to the front of the
               staging buffer for the next round */
            carry = packed - unpacked;
            if (carry > 0) {
                memmove(stage, stage_end - carry, carry);
            }
        }
    }

  fn_exit:
    MPIU_CHKLMEM_FREEALL();
    MPID_MPI_FUNC_EXIT(MPID_STATE_MPIR_LOCALCOPY);
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}
/*
 * MPID_nem_mxm_SendNoncontig - send a packet header plus non-contiguous
 * payload through MXM.
 *
 * The header is copied into sreq->dev.pending_pkt, the payload described by
 * sreq->dev.segment_ptr (from segment_first to segment_size) is packed into
 * a temporary buffer stored in sreq->dev.tmpbuf, and both (plus the extended
 * header, if sreq->dev.ext_hdr_ptr is set) are handed to _mxm_isend() as an
 * IOV list.
 *
 * vc      - destination virtual connection
 * sreq    - send request carrying the segment description
 * hdr     - packet header; sizeof(MPIDI_CH3_Pkt_t) bytes are copied from it
 * hdr_sz  - header size, asserted to be <= sizeof(MPIDI_CH3_Pkt_t)
 *
 * Returns MPI_SUCCESS, or an error code if the pack buffer cannot be
 * allocated or _mxm_isend() fails.
 */
int MPID_nem_mxm_SendNoncontig(MPIDI_VC_t * vc, MPID_Request * sreq, void *hdr,
                               MPIDI_msg_sz_t hdr_sz)
{
    int mpi_errno = MPI_SUCCESS;
    MPIDI_msg_sz_t last;
    MPID_nem_mxm_vc_area *vc_area = NULL;
    MPID_nem_mxm_req_area *req_area = NULL;

    MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_MXM_SENDNONCONTIGMSG);
    MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_MXM_SENDNONCONTIGMSG);

    MPIU_Assert(hdr_sz <= sizeof(MPIDI_CH3_Pkt_t));

    MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, "MPID_nem_mxm_iSendNoncontig");

    MPIU_Memcpy(&(sreq->dev.pending_pkt), (char *) hdr, sizeof(MPIDI_CH3_Pkt_t));

    /* The sizes are cast so the arguments match the %d conversions. */
    _dbg_mxm_output(5,
                    "SendNoncontig ========> Sending ADI msg (to=%d type=%d) for req %p (data_size %d, %d) \n",
                    vc->pg_rank, sreq->dev.pending_pkt.type, (void *) sreq,
                    (int) sizeof(MPIDI_CH3_Pkt_t),
                    (int) (sreq->dev.segment_size - sreq->dev.segment_first));

    vc_area = VC_BASE(vc);
    req_area = REQ_BASE(sreq);

    req_area->ctx = sreq;
    req_area->iov_buf = req_area->tmp_buf;
    req_area->iov_count = 0;

    /* IOV entry: packet header */
    req_area->iov_buf[req_area->iov_count].ptr = (void *) &(sreq->dev.pending_pkt);
    req_area->iov_buf[req_area->iov_count].length = sizeof(MPIDI_CH3_Pkt_t);
    (req_area->iov_count)++;

    /* IOV entry: optional extended header */
    if (sreq->dev.ext_hdr_ptr != NULL) {
        req_area->iov_buf[req_area->iov_count].ptr = (void *) (sreq->dev.ext_hdr_ptr);
        req_area->iov_buf[req_area->iov_count].length = sreq->dev.ext_hdr_sz;
        (req_area->iov_count)++;
    }

    last = sreq->dev.segment_size;

    /* NOTE: currently upper layer never pass packet with data that has
     * either "last <= 0" or "last-sreq->dev.segment_first <=0" to this
     * layer. In future, if upper layer passes such kind of packet, the
     * judgement of the following IF branch needs to be modified. */
    MPIU_Assert(last > 0 && last - sreq->dev.segment_first > 0);

    if (last > 0) {
        /* IOV entry: payload packed into a temporary contiguous buffer */
        sreq->dev.tmpbuf =
            MPIU_Malloc((size_t) (sreq->dev.segment_size - sreq->dev.segment_first));
        /* Checked with an error jump rather than an assertion so the NULL
         * is not handed to MPID_Segment_pack when assertions are disabled. */
        MPIR_ERR_CHKANDJUMP1(sreq->dev.tmpbuf == NULL, mpi_errno, MPI_ERR_OTHER,
                             "**nomem", "**nomem %s", "tmpbuf");

        MPID_Segment_pack(sreq->dev.segment_ptr, sreq->dev.segment_first, &last,
                          sreq->dev.tmpbuf);
        MPIU_Assert(last == sreq->dev.segment_size);

        req_area->iov_buf[req_area->iov_count].ptr = sreq->dev.tmpbuf;
        req_area->iov_buf[req_area->iov_count].length = last - sreq->dev.segment_first;
        (req_area->iov_count)++;
    }

    vc_area->pending_sends += 1;
    sreq->ch.vc = vc;
    sreq->ch.noncontig = TRUE;

    mpi_errno = _mxm_isend(vc_area->mxm_ep, req_area, MXM_MPICH_ISEND_AM,
                           mxm_obj->mxm_mq, mxm_obj->mxm_rank, MXM_MPICH_HID_ADI_MSG, 0, 0);
    if (mpi_errno)
        MPIR_ERR_POP(mpi_errno);

  fn_exit:
    MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_MXM_SENDNONCONTIGMSG);
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}
static int send_msg(ptl_hdr_data_t ssend_flag, struct MPIDI_VC *vc, const void *buf, MPI_Aint count, MPI_Datatype datatype, int dest, int tag, MPID_Comm *comm, int context_offset, struct MPID_Request **request) { int mpi_errno = MPI_SUCCESS; MPID_nem_ptl_vc_area *const vc_ptl = VC_PTL(vc); int ret; MPIDI_msg_sz_t data_sz; int dt_contig; MPI_Aint dt_true_lb; MPID_Datatype *dt_ptr; MPID_Request *sreq = NULL; ptl_me_t me; int initial_iov_count, remaining_iov_count; ptl_md_t md; MPI_Aint last; MPIU_CHKPMEM_DECL(2); MPIDI_STATE_DECL(MPID_STATE_SEND_MSG); MPIDI_FUNC_ENTER(MPID_STATE_SEND_MSG); MPID_nem_ptl_request_create_sreq(sreq, mpi_errno, comm); sreq->dev.match.parts.rank = dest; sreq->dev.match.parts.tag = tag; sreq->dev.match.parts.context_id = comm->context_id + context_offset; sreq->ch.vc = vc; if (!vc_ptl->id_initialized) { mpi_errno = MPID_nem_ptl_init_id(vc); if (mpi_errno) MPIR_ERR_POP(mpi_errno); } MPIDI_Datatype_get_info(count, datatype, dt_contig, data_sz, dt_ptr, dt_true_lb); MPIU_DBG_MSG_FMT(CH3_CHANNEL, VERBOSE, (MPIU_DBG_FDEST, "count="MPI_AINT_FMT_DEC_SPEC" datatype=%#x contig=%d data_sz=%lu", count, datatype, dt_contig, data_sz)); if (data_sz <= PTL_LARGE_THRESHOLD) { /* Small message. 
Send all data eagerly */ if (dt_contig) { void *start = (char *)buf + dt_true_lb; MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, "Small contig message"); REQ_PTL(sreq)->event_handler = handler_send; MPIU_DBG_MSG_P(CH3_CHANNEL, VERBOSE, "&REQ_PTL(sreq)->event_handler = %p", &(REQ_PTL(sreq)->event_handler)); if (start == NULL) ret = MPID_nem_ptl_rptl_put(MPIDI_nem_ptl_global_md, (ptl_size_t)&dummy, data_sz, PTL_NO_ACK_REQ, vc_ptl->id, vc_ptl->pt, NPTL_MATCH(tag, comm->context_id + context_offset, comm->rank), 0, sreq, NPTL_HEADER(ssend_flag, data_sz)); else ret = MPID_nem_ptl_rptl_put(MPIDI_nem_ptl_global_md, (ptl_size_t)start, data_sz, PTL_NO_ACK_REQ, vc_ptl->id, vc_ptl->pt, NPTL_MATCH(tag, comm->context_id + context_offset, comm->rank), 0, sreq, NPTL_HEADER(ssend_flag, data_sz)); MPIR_ERR_CHKANDJUMP1(ret, mpi_errno, MPI_ERR_OTHER, "**ptlput", "**ptlput %s", MPID_nem_ptl_strerror(ret)); DBG_MSG_PUT("global", data_sz, vc->pg_rank, NPTL_MATCH(tag, comm->context_id + context_offset, comm->rank), NPTL_HEADER(ssend_flag, data_sz)); MPIU_DBG_MSG_D(CH3_CHANNEL, VERBOSE, "id.nid = %#x", vc_ptl->id.phys.nid); MPIU_DBG_MSG_D(CH3_CHANNEL, VERBOSE, "id.pid = %#x", vc_ptl->id.phys.pid); MPIU_DBG_MSG_P(CH3_CHANNEL, VERBOSE, "sreq = %p", sreq); MPIU_DBG_MSG_D(CH3_CHANNEL, VERBOSE, "vc_ptl->pt = %d", vc_ptl->pt); MPIU_DBG_MSG_P(CH3_CHANNEL, VERBOSE, "REQ_PTL(sreq)->event_handler = %p", REQ_PTL(sreq)->event_handler); goto fn_exit; } /* noncontig data */ MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, "Small noncontig message"); sreq->dev.segment_ptr = MPID_Segment_alloc(); MPIR_ERR_CHKANDJUMP1(sreq->dev.segment_ptr == NULL, mpi_errno, MPI_ERR_OTHER, "**nomem", "**nomem %s", "MPID_Segment_alloc"); MPID_Segment_init(buf, count, datatype, sreq->dev.segment_ptr, 0); sreq->dev.segment_first = 0; sreq->dev.segment_size = data_sz; last = sreq->dev.segment_size; sreq->dev.iov_count = MPL_IOV_LIMIT; MPID_Segment_pack_vector(sreq->dev.segment_ptr, sreq->dev.segment_first, &last, sreq->dev.iov, &sreq->dev.iov_count); 
if (last == sreq->dev.segment_size) { /* IOV is able to describe entire message */ MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, " entire message fits in IOV"); md.start = sreq->dev.iov; md.length = sreq->dev.iov_count; md.options = PTL_IOVEC; md.eq_handle = MPIDI_nem_ptl_origin_eq; md.ct_handle = PTL_CT_NONE; ret = PtlMDBind(MPIDI_nem_ptl_ni, &md, &REQ_PTL(sreq)->md); MPIR_ERR_CHKANDJUMP1(ret, mpi_errno, MPI_ERR_OTHER, "**ptlmdbind", "**ptlmdbind %s", MPID_nem_ptl_strerror(ret)); REQ_PTL(sreq)->event_handler = handler_send; ret = MPID_nem_ptl_rptl_put(REQ_PTL(sreq)->md, 0, data_sz, PTL_NO_ACK_REQ, vc_ptl->id, vc_ptl->pt, NPTL_MATCH(tag, comm->context_id + context_offset, comm->rank), 0, sreq, NPTL_HEADER(ssend_flag, data_sz)); MPIR_ERR_CHKANDJUMP1(ret, mpi_errno, MPI_ERR_OTHER, "**ptlput", "**ptlput %s", MPID_nem_ptl_strerror(ret)); DBG_MSG_PUT("sreq", data_sz, vc->pg_rank, NPTL_MATCH(tag, comm->context_id + context_offset, comm->rank), NPTL_HEADER(ssend_flag, data_sz)); goto fn_exit; } /* IOV is not long enough to describe entire message */ MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, " IOV too long: using bounce buffer"); MPIU_CHKPMEM_MALLOC(REQ_PTL(sreq)->chunk_buffer[0], void *, data_sz, mpi_errno, "chunk_buffer"); MPID_Segment_init(buf, count, datatype, sreq->dev.segment_ptr, 0); sreq->dev.segment_first = 0; last = data_sz; MPID_Segment_pack(sreq->dev.segment_ptr, sreq->dev.segment_first, &last, REQ_PTL(sreq)->chunk_buffer[0]); MPIU_Assert(last == sreq->dev.segment_size); REQ_PTL(sreq)->event_handler = handler_send; ret = MPID_nem_ptl_rptl_put(MPIDI_nem_ptl_global_md, (ptl_size_t)REQ_PTL(sreq)->chunk_buffer[0], data_sz, PTL_NO_ACK_REQ, vc_ptl->id, vc_ptl->pt, NPTL_MATCH(tag, comm->context_id + context_offset, comm->rank), 0, sreq, NPTL_HEADER(ssend_flag, data_sz)); MPIR_ERR_CHKANDJUMP1(ret, mpi_errno, MPI_ERR_OTHER, "**ptlput", "**ptlput %s", MPID_nem_ptl_strerror(ret)); DBG_MSG_PUT("global", data_sz, vc->pg_rank, NPTL_MATCH(tag, comm->context_id + context_offset, comm->rank), 
NPTL_HEADER(ssend_flag, data_sz)); goto fn_exit; } /* Large message. Send first chunk of data and let receiver get the rest */ if (dt_contig) { /* create ME for buffer so receiver can issue a GET for the data */ MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, "Large contig message"); big_meappend((char *)buf + dt_true_lb + PTL_LARGE_THRESHOLD, data_sz - PTL_LARGE_THRESHOLD, vc, NPTL_MATCH(tag, comm->context_id + context_offset, comm->rank), sreq); REQ_PTL(sreq)->event_handler = handler_send; ret = MPID_nem_ptl_rptl_put(MPIDI_nem_ptl_global_md, (ptl_size_t)((char *)buf + dt_true_lb), PTL_LARGE_THRESHOLD, PTL_NO_ACK_REQ, vc_ptl->id, vc_ptl->pt, NPTL_MATCH(tag, comm->context_id + context_offset, comm->rank), 0, sreq, NPTL_HEADER(ssend_flag | NPTL_LARGE, data_sz)); MPIR_ERR_CHKANDJUMP1(ret, mpi_errno, MPI_ERR_OTHER, "**ptlput", "**ptlput %s", MPID_nem_ptl_strerror(ret)); DBG_MSG_PUT("global", PTL_LARGE_THRESHOLD, vc->pg_rank, NPTL_MATCH(tag, comm->context_id + context_offset, comm->rank), NPTL_HEADER(ssend_flag | NPTL_LARGE, data_sz)); goto fn_exit; } /* Large noncontig data */ MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, "Large noncontig message"); sreq->dev.segment_ptr = MPID_Segment_alloc(); MPIR_ERR_CHKANDJUMP1(sreq->dev.segment_ptr == NULL, mpi_errno, MPI_ERR_OTHER, "**nomem", "**nomem %s", "MPID_Segment_alloc"); MPID_Segment_init(buf, count, datatype, sreq->dev.segment_ptr, 0); sreq->dev.segment_first = 0; sreq->dev.segment_size = data_sz; last = PTL_LARGE_THRESHOLD; sreq->dev.iov_count = MPL_IOV_LIMIT; MPID_Segment_pack_vector(sreq->dev.segment_ptr, sreq->dev.segment_first, &last, sreq->dev.iov, &sreq->dev.iov_count); initial_iov_count = sreq->dev.iov_count; sreq->dev.segment_first = last; if (last == PTL_LARGE_THRESHOLD) { /* first chunk of message fits into IOV */ MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, " first chunk fits in IOV"); if (initial_iov_count < MPL_IOV_LIMIT) { /* There may be space for the rest of the message in this IOV */ sreq->dev.iov_count = MPL_IOV_LIMIT - 
sreq->dev.iov_count; last = sreq->dev.segment_size; MPID_Segment_pack_vector(sreq->dev.segment_ptr, sreq->dev.segment_first, &last, &sreq->dev.iov[initial_iov_count], &sreq->dev.iov_count); remaining_iov_count = sreq->dev.iov_count; if (last == sreq->dev.segment_size && last <= MPIDI_nem_ptl_ni_limits.max_msg_size + PTL_LARGE_THRESHOLD) { /* Entire message fit in one IOV */ int was_incomplete; MPIU_DBG_MSG(CH3_CHANNEL, VERBOSE, " rest of message fits in one IOV"); /* Create ME for remaining data */ me.start = &sreq->dev.iov[initial_iov_count]; me.length = remaining_iov_count; me.ct_handle = PTL_CT_NONE; me.uid = PTL_UID_ANY; me.options = ( PTL_ME_OP_PUT | PTL_ME_OP_GET | PTL_ME_USE_ONCE | PTL_ME_IS_ACCESSIBLE | PTL_ME_EVENT_LINK_DISABLE | PTL_ME_EVENT_UNLINK_DISABLE | PTL_IOVEC ); me.match_id = vc_ptl->id; me.match_bits = NPTL_MATCH(tag, comm->context_id + context_offset, comm->rank); me.ignore_bits = 0; me.min_free = 0; MPIU_CHKPMEM_MALLOC(REQ_PTL(sreq)->get_me_p, ptl_handle_me_t *, sizeof(ptl_handle_me_t), mpi_errno, "get_me_p"); ret = PtlMEAppend(MPIDI_nem_ptl_ni, MPIDI_nem_ptl_get_pt, &me, PTL_PRIORITY_LIST, sreq, &REQ_PTL(sreq)->get_me_p[0]); MPIR_ERR_CHKANDJUMP1(ret, mpi_errno, MPI_ERR_OTHER, "**ptlmeappend", "**ptlmeappend %s", MPID_nem_ptl_strerror(ret)); DBG_MSG_MEAPPEND("CTL", vc->pg_rank, me, sreq); /* increment the cc for the get operation */ MPIDI_CH3U_Request_increment_cc(sreq, &was_incomplete); MPIU_Assert(was_incomplete); /* Create MD for first chunk */ md.start = sreq->dev.iov; md.length = initial_iov_count; md.options = PTL_IOVEC; md.eq_handle = MPIDI_nem_ptl_origin_eq; md.ct_handle = PTL_CT_NONE; ret = PtlMDBind(MPIDI_nem_ptl_ni, &md, &REQ_PTL(sreq)->md); MPIR_ERR_CHKANDJUMP1(ret, mpi_errno, MPI_ERR_OTHER, "**ptlmdbind", "**ptlmdbind %s", MPID_nem_ptl_strerror(ret)); REQ_PTL(sreq)->event_handler = handler_send; ret = MPID_nem_ptl_rptl_put(REQ_PTL(sreq)->md, 0, PTL_LARGE_THRESHOLD, PTL_NO_ACK_REQ, vc_ptl->id, vc_ptl->pt, NPTL_MATCH(tag, 
comm->context_id + context_offset, comm->rank), 0, sreq, NPTL_HEADER(ssend_flag | NPTL_LARGE, data_sz)); MPIR_ERR_CHKANDJUMP1(ret, mpi_errno, MPI_ERR_OTHER, "**ptlput", "**ptlput %s", MPID_nem_ptl_strerror(ret)); DBG_MSG_PUT("req", PTL_LARGE_THRESHOLD, vc->pg_rank, NPTL_MATCH(tag, comm->context_id + context_offset, comm->rank), NPTL_HEADER(ssend_flag | NPTL_LARGE, data_sz)); goto fn_exit; }
void MPIDI_CH3U_Buffer_copy( const void * const sbuf, MPI_Aint scount, MPI_Datatype sdt, int * smpi_errno, void * const rbuf, MPI_Aint rcount, MPI_Datatype rdt, MPIDI_msg_sz_t * rsz, int * rmpi_errno) { int sdt_contig; int rdt_contig; MPI_Aint sdt_true_lb, rdt_true_lb; MPIDI_msg_sz_t sdata_sz; MPIDI_msg_sz_t rdata_sz; MPID_Datatype * sdt_ptr; MPID_Datatype * rdt_ptr; MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3U_BUFFER_COPY); MPIDI_STATE_DECL(MPID_STATE_MEMCPY); MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3U_BUFFER_COPY); *smpi_errno = MPI_SUCCESS; *rmpi_errno = MPI_SUCCESS; MPIDI_Datatype_get_info(scount, sdt, sdt_contig, sdata_sz, sdt_ptr, sdt_true_lb); MPIDI_Datatype_get_info(rcount, rdt, rdt_contig, rdata_sz, rdt_ptr, rdt_true_lb); /* --BEGIN ERROR HANDLING-- */ if (sdata_sz > rdata_sz) { MPIU_DBG_MSG_FMT(CH3_OTHER,TYPICAL,(MPIU_DBG_FDEST, "message truncated, sdata_sz=" MPIDI_MSG_SZ_FMT " rdata_sz=" MPIDI_MSG_SZ_FMT, sdata_sz, rdata_sz)); sdata_sz = rdata_sz; *rmpi_errno = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, FCNAME, __LINE__, MPI_ERR_TRUNCATE, "**truncate", "**truncate %d %d", sdata_sz, rdata_sz ); } /* --END ERROR HANDLING-- */ if (sdata_sz == 0) { *rsz = 0; goto fn_exit; } if (sdt_contig && rdt_contig) { MPIDI_FUNC_ENTER(MPID_STATE_MEMCPY); MPIU_Memcpy((char *)rbuf + rdt_true_lb, (const char *)sbuf + sdt_true_lb, sdata_sz); MPIDI_FUNC_EXIT(MPID_STATE_MEMCPY); *rsz = sdata_sz; } else if (sdt_contig) { MPID_Segment seg; MPI_Aint last; MPID_Segment_init(rbuf, rcount, rdt, &seg, 0); last = sdata_sz; MPIU_DBG_MSG_FMT(CH3_OTHER,VERBOSE,(MPIU_DBG_FDEST, "pre-unpack last=" MPIDI_MSG_SZ_FMT, last )); MPID_Segment_unpack(&seg, 0, &last, (char*)sbuf + sdt_true_lb); MPIU_DBG_MSG_FMT(CH3_OTHER,VERBOSE,(MPIU_DBG_FDEST, "pre-unpack last=" MPIDI_MSG_SZ_FMT, last )); /* --BEGIN ERROR HANDLING-- */ if (last != sdata_sz) { *rmpi_errno = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, FCNAME, __LINE__, MPI_ERR_TYPE, "**dtypemismatch", 0); } /* --END ERROR HANDLING-- 
*/ *rsz = last; } else if (rdt_contig) { MPID_Segment seg; MPI_Aint last; MPID_Segment_init(sbuf, scount, sdt, &seg, 0); last = sdata_sz; MPIU_DBG_MSG_FMT(CH3_OTHER,VERBOSE,(MPIU_DBG_FDEST, "pre-pack last=" MPIDI_MSG_SZ_FMT, last )); MPID_Segment_pack(&seg, 0, &last, (char*)rbuf + rdt_true_lb); MPIU_DBG_MSG_FMT(CH3_OTHER,VERBOSE,(MPIU_DBG_FDEST, "post-pack last=" MPIDI_MSG_SZ_FMT, last )); /* --BEGIN ERROR HANDLING-- */ if (last != sdata_sz) { *rmpi_errno = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, FCNAME, __LINE__, MPI_ERR_TYPE, "**dtypemismatch", 0); } /* --END ERROR HANDLING-- */ *rsz = last; } else { char * buf; MPIDI_msg_sz_t buf_off; MPID_Segment sseg; MPIDI_msg_sz_t sfirst; MPID_Segment rseg; MPIDI_msg_sz_t rfirst; buf = MPIU_Malloc(MPIDI_COPY_BUFFER_SZ); /* --BEGIN ERROR HANDLING-- */ if (buf == NULL) { MPIU_DBG_MSG(CH3_OTHER,TYPICAL,"SRBuf allocation failure"); *smpi_errno = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_FATAL, FCNAME, __LINE__, MPI_ERR_OTHER, "**nomem", 0); *rmpi_errno = *smpi_errno; *rsz = 0; goto fn_exit; } /* --END ERROR HANDLING-- */ MPID_Segment_init(sbuf, scount, sdt, &sseg, 0); MPID_Segment_init(rbuf, rcount, rdt, &rseg, 0); sfirst = 0; rfirst = 0; buf_off = 0; for(;;) { MPI_Aint last; char * buf_end; if (sdata_sz - sfirst > MPIDI_COPY_BUFFER_SZ - buf_off) { last = sfirst + (MPIDI_COPY_BUFFER_SZ - buf_off); } else { last = sdata_sz; } MPIU_DBG_MSG_FMT(CH3_OTHER,VERBOSE,(MPIU_DBG_FDEST, "pre-pack first=" MPIDI_MSG_SZ_FMT ", last=" MPIDI_MSG_SZ_FMT, sfirst, last )); MPID_Segment_pack(&sseg, sfirst, &last, buf + buf_off); MPIU_DBG_MSG_FMT(CH3_OTHER,VERBOSE,(MPIU_DBG_FDEST, "post-pack first=" MPIDI_MSG_SZ_FMT ", last=" MPIDI_MSG_SZ_FMT, sfirst, last )); /* --BEGIN ERROR HANDLING-- */ MPIU_Assert(last > sfirst); /* --END ERROR HANDLING-- */ buf_end = buf + buf_off + (last - sfirst); sfirst = last; MPIU_DBG_MSG_FMT(CH3_OTHER,VERBOSE,(MPIU_DBG_FDEST, "pre-unpack first=" MPIDI_MSG_SZ_FMT ", last=" MPIDI_MSG_SZ_FMT, rfirst, last 
)); MPID_Segment_unpack(&rseg, rfirst, &last, buf); MPIU_DBG_MSG_FMT(CH3_OTHER,VERBOSE,(MPIU_DBG_FDEST, "post-unpack first=" MPIDI_MSG_SZ_FMT ", last=" MPIDI_MSG_SZ_FMT, rfirst, last )); /* --BEGIN ERROR HANDLING-- */ MPIU_Assert(last > rfirst); /* --END ERROR HANDLING-- */ rfirst = last; if (rfirst == sdata_sz) { /* successful completion */ break; } /* --BEGIN ERROR HANDLING-- */ if (sfirst == sdata_sz) { /* datatype mismatch -- remaining bytes could not be unpacked */ *rmpi_errno = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, FCNAME, __LINE__, MPI_ERR_TYPE, "**dtypemismatch", 0); break; } /* --END ERROR HANDLING-- */ buf_off = sfirst - rfirst; if (buf_off > 0) { MPIU_DBG_MSG_FMT(CH3_OTHER, VERBOSE, (MPIU_DBG_FDEST, "moved " MPIDI_MSG_SZ_FMT " bytes to the beginning of the tmp buffer", buf_off)); memmove(buf, buf_end - buf_off, buf_off); } } *rsz = rfirst; MPIU_Free(buf); } fn_exit: MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3U_BUFFER_COPY); }