/** * \brief MPID buffer copy * * Handles non-contiguous buffers correctly. * * \param[in] sbuf The address of the input buffer * \param[in] scount The number of elements in that buffer * \param[in] sdt The datatype of those elements * \param[out] smpi_errno Returns errors * \param[in] rbuf The address of the output buffer * \param[in] rcount The number of elements in that buffer * \param[in] rdt The datatype of those elements * \param[out] rsz The size of the output data * \param[out] rmpi_errno Returns errors */ void MPIDI_Buffer_copy( const void * const sbuf, MPI_Aint scount, MPI_Datatype sdt, int * smpi_errno, void * const rbuf, MPI_Aint rcount, MPI_Datatype rdt, MPIDI_msg_sz_t * rsz, int * rmpi_errno) { int sdt_contig; int rdt_contig; MPI_Aint sdt_true_lb, rdt_true_lb; MPIDI_msg_sz_t sdata_sz; MPIDI_msg_sz_t rdata_sz; MPID_Datatype * sdt_ptr; MPID_Datatype * rdt_ptr; MPI_Aint sdt_extent; MPI_Aint rdt_extent; *smpi_errno = MPI_SUCCESS; *rmpi_errno = MPI_SUCCESS; /* printf("bufcopy: src count=%d dt=%d\n", scount, sdt); */ /* printf("bufcopy: dst count=%d dt=%d\n", rcount, rdt); */ MPIDI_Datatype_get_info(scount, sdt, sdt_contig, sdata_sz, sdt_ptr, sdt_true_lb); MPIDI_Datatype_get_info(rcount, rdt, rdt_contig, rdata_sz, rdt_ptr, rdt_true_lb); /* --BEGIN ERROR HANDLING-- */ if (sdata_sz > rdata_sz) { *rmpi_errno = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, __FUNCTION__, __LINE__, MPI_ERR_TRUNCATE, "**truncate", "**truncate %d %d", sdata_sz, rdata_sz ); sdata_sz = rdata_sz; } /* --END ERROR HANDLING-- */ if (sdata_sz == 0) { *rsz = 0; goto fn_exit; } if (sdt_contig && rdt_contig) { #if CUDA_AWARE_SUPPORT if(MPIDI_Process.cuda_aware_support_on && MPIDI_cuda_is_device_buf(rbuf)) { cudaError_t cudaerr = CudaMemcpy((char *)rbuf + rdt_true_lb, (const char *)sbuf + sdt_true_lb, sdata_sz, cudaMemcpyHostToDevice); } else #endif memcpy((char*)rbuf + rdt_true_lb, (const char *)sbuf + sdt_true_lb, sdata_sz); *rsz = sdata_sz; } else if (sdt_contig) { #if CUDA_AWARE_SUPPORT // This will need to be done in two steps: // 1 - Allocate a temp buffer which is the same size as user buffer and unpack in it. // 2 - Copy unpacked data into user buffer from temp buffer. 
if(MPIDI_Process.cuda_aware_support_on && MPIDI_cuda_is_device_buf(rbuf)) { MPID_Datatype_get_extent_macro(rdt, rdt_extent); char *buf = MPL_malloc(rdt_extent * rcount); memset(buf, 0, rdt_extent * rcount); MPID_Segment seg; DLOOP_Offset last; MPID_Segment_init(buf, rcount, rdt, &seg, 0); last = sdata_sz; MPID_Segment_unpack(&seg, 0, &last, (char*)sbuf + sdt_true_lb); /* --BEGIN ERROR HANDLING-- */ if (last != sdata_sz) { *rmpi_errno = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, __FUNCTION__, __LINE__, MPI_ERR_TYPE, "**dtypemismatch", 0); } /* --END ERROR HANDLING-- */ *rsz = last; cudaError_t cudaerr = CudaMemcpy((char *)rbuf + rdt_true_lb, buf, rdt_extent * rcount, cudaMemcpyHostToDevice); MPL_free(buf); goto fn_exit; } #endif MPID_Segment seg; DLOOP_Offset last; MPID_Segment_init(rbuf, rcount, rdt, &seg, 0); last = sdata_sz; MPID_Segment_unpack(&seg, 0, &last, (char*)sbuf + sdt_true_lb); /* --BEGIN ERROR HANDLING-- */ if (last != sdata_sz) { *rmpi_errno = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, __FUNCTION__, __LINE__, MPI_ERR_TYPE, "**dtypemismatch", 0); } /* --END ERROR HANDLING-- */ *rsz = last; } else if (rdt_contig) { MPID_Segment seg; DLOOP_Offset last; MPID_Segment_init(sbuf, scount, sdt, &seg, 0); last = sdata_sz; MPID_Segment_pack(&seg, 0, &last, (char*)rbuf + rdt_true_lb); /* --BEGIN ERROR HANDLING-- */ if (last != sdata_sz) { *rmpi_errno = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, __FUNCTION__, __LINE__, MPI_ERR_TYPE, "**dtypemismatch", 0); } /* --END ERROR HANDLING-- */ *rsz = last; } else { char * buf; MPIDI_msg_sz_t buf_off; MPID_Segment sseg; MPIDI_msg_sz_t sfirst; MPID_Segment rseg; MPIDI_msg_sz_t rfirst; buf = MPL_malloc(MPIDI_COPY_BUFFER_SZ); /* --BEGIN ERROR HANDLING-- */ if (buf == NULL) { *smpi_errno = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_FATAL, __FUNCTION__, __LINE__, MPI_ERR_OTHER, "**nomem", 0); *rmpi_errno = *smpi_errno; *rsz = 0; goto fn_exit; } /* --END ERROR HANDLING-- */ MPID_Segment_init(sbuf, scount, sdt, &sseg, 0); MPID_Segment_init(rbuf, rcount, rdt, &rseg, 0); sfirst = 0; rfirst = 0; buf_off = 0; for(;;) { DLOOP_Offset last; char * buf_end; if (sdata_sz - sfirst > MPIDI_COPY_BUFFER_SZ - buf_off) { last = sfirst + (MPIDI_COPY_BUFFER_SZ - buf_off); } else { last = sdata_sz; } MPID_Segment_pack(&sseg, sfirst, &last, buf + buf_off); /* --BEGIN ERROR HANDLING-- */ MPID_assert(last > sfirst); /* --END ERROR HANDLING-- */ buf_end = buf + buf_off + (last - sfirst); sfirst = last; MPID_Segment_unpack(&rseg, rfirst, &last, buf); /* --BEGIN ERROR HANDLING-- */ MPID_assert(last > rfirst); /* --END ERROR HANDLING-- */ rfirst = last; if (rfirst == sdata_sz) { /* successful completion */ break; } /* --BEGIN ERROR HANDLING-- */ if (sfirst == sdata_sz) { /* datatype mismatch -- remaining bytes could not be unpacked */ *rmpi_errno = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, __FUNCTION__, __LINE__, MPI_ERR_TYPE, "**dtypemismatch", 0); break; } /* --END ERROR HANDLING-- */ buf_off = sfirst - rfirst; if (buf_off > 0) { memmove(buf, buf_end - buf_off, buf_off); } } *rsz = rfirst; MPL_free(buf); } fn_exit: return; }
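/*
 * The staged pack/unpack loop above is easier to see in isolation.  A
 * minimal, compiled-out sketch: toy_pack(), toy_unpack(), COPY_SZ and
 * staged_copy() are hypothetical stand-ins for MPID_Segment_pack,
 * MPID_Segment_unpack, MPIDI_COPY_BUFFER_SZ and the branch where both
 * sides are non-contiguous.  Both sides are treated as plain bytes here;
 * a real segment walks a datatype's dataloop.
 */
#if 0
#include <string.h>
#include <stddef.h>

#define COPY_SZ 16

/* Pack bytes [first,last) of src into out; return the new stream offset. */
static size_t toy_pack(const char *src, size_t first, size_t last, char *out)
{ memcpy(out, src + first, last - first); return last; }

/* Unpack bytes [first,last) from in into dst; return the new stream offset.
 * A real unpack may consume less than offered when it hits a type boundary. */
static size_t toy_unpack(char *dst, size_t first, size_t last, const char *in)
{ memcpy(dst + first, in, last - first); return last; }

static void staged_copy(const char *src, char *dst, size_t total)
{
  char buf[COPY_SZ];
  size_t sfirst = 0, rfirst = 0, buf_off = 0;
  while (rfirst < total) {
    /* Pack as much as fits behind any bytes carried over from last round. */
    size_t last = (total - sfirst > COPY_SZ - buf_off)
                ? sfirst + (COPY_SZ - buf_off) : total;
    toy_pack(src, sfirst, last, buf + buf_off);
    char *buf_end = buf + buf_off + (last - sfirst);
    sfirst = last;
    /* Unpack as much as the receive side can consume... */
    rfirst = toy_unpack(dst, rfirst, sfirst, buf);
    /* ...and slide any leftover packed bytes back to the buffer front. */
    buf_off = sfirst - rfirst;
    if (buf_off > 0)
      memmove(buf, buf_end - buf_off, buf_off);
  }
}
#endif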
static inline void MPIDI_RecvShortCB(pami_context_t context, const void * _msginfo, const void * sndbuf, size_t sndlen, pami_endpoint_t sender, unsigned isSync) { MPID_assert(_msginfo != NULL); const MPIDI_MsgInfo *msginfo = (const MPIDI_MsgInfo *)_msginfo; MPID_Request * rreq = NULL; pami_task_t source; #if TOKEN_FLOW_CONTROL int rettoks=0; #endif /* -------------------- */ /* Match the request. */ /* -------------------- */ unsigned rank = msginfo->MPIrank; unsigned tag = msginfo->MPItag; unsigned context_id = msginfo->MPIctxt; MPIU_THREAD_CS_ENTER(MSGQUEUE,0); source = PAMIX_Endpoint_query(sender); MPIDI_Receive_tokens(msginfo,source); #ifndef OUT_OF_ORDER_HANDLING rreq = MPIDI_Recvq_FDP(rank, tag, context_id); #else rreq = MPIDI_Recvq_FDP(rank, source, tag, context_id, msginfo->MPIseqno); #endif /* Match not found */ if (unlikely(rreq == NULL)) { #if (MPIDI_STATISTICS) MPID_NSTAT(mpid_statp->earlyArrivals); #endif MPIU_THREAD_CS_EXIT(MSGQUEUE,0); MPID_Request *newreq = MPIDI_Request_create2(); MPID_assert(newreq != NULL); if (sndlen) { newreq->mpid.uebuflen = sndlen; if (!TOKEN_FLOW_CONTROL_ON) { newreq->mpid.uebuf = MPL_malloc(sndlen); newreq->mpid.uebuf_malloc = mpiuMalloc; } else { #if TOKEN_FLOW_CONTROL MPIU_THREAD_CS_ENTER(MSGQUEUE,0); newreq->mpid.uebuf = MPIDI_mm_alloc(sndlen); newreq->mpid.uebuf_malloc = mpidiBufMM; MPIU_THREAD_CS_EXIT(MSGQUEUE,0); #else MPID_assert_always(0); #endif } MPID_assert(newreq->mpid.uebuf != NULL); } MPIU_THREAD_CS_ENTER(MSGQUEUE,0); #ifndef OUT_OF_ORDER_HANDLING rreq = MPIDI_Recvq_FDP(rank, tag, context_id); #else rreq = MPIDI_Recvq_FDP(rank, PAMIX_Endpoint_query(sender), tag, context_id, msginfo->MPIseqno); #endif if (unlikely(rreq == NULL)) { MPIDI_Callback_process_unexp(newreq, context, msginfo, sndlen, sender, sndbuf, NULL, isSync); /* request is always complete now */ if (TOKEN_FLOW_CONTROL_ON && sndlen) { #if TOKEN_FLOW_CONTROL MPIDI_Token_cntr[source].unmatched++; #else MPID_assert_always(0); #endif } MPIU_THREAD_CS_EXIT(MSGQUEUE,0); MPID_Request_release(newreq); goto fn_exit_short; } else { MPIU_THREAD_CS_EXIT(MSGQUEUE,0); MPID_Request_discard(newreq); } } else { #if (MPIDI_STATISTICS) MPID_NSTAT(mpid_statp->earlyArrivalsMatched); #endif if (TOKEN_FLOW_CONTROL_ON && sndlen) { #if TOKEN_FLOW_CONTROL MPIDI_Update_rettoks(source); MPIDI_Must_return_tokens(context,source); #else MPID_assert_always(0); #endif } MPIU_THREAD_CS_EXIT(MSGQUEUE,0); } /* the receive queue processing has been completed and we found a match */ /* ---------------------- */ /* Copy in information. */ /* ---------------------- */ rreq->status.MPI_SOURCE = rank; rreq->status.MPI_TAG = tag; MPIR_STATUS_SET_COUNT(rreq->status, sndlen); MPIDI_Request_setCA (rreq, MPIDI_CA_COMPLETE); MPIDI_Request_cpyPeerRequestH(rreq, msginfo); MPIDI_Request_setSync (rreq, isSync); MPIDI_Request_setRzv (rreq, 0); /* ----------------------------- */ /* Request was already posted. */ /* ----------------------------- */ if (unlikely(isSync)) MPIDI_SyncAck_post(context, rreq, PAMIX_Endpoint_query(sender)); if (unlikely(HANDLE_GET_KIND(rreq->mpid.datatype) != HANDLE_KIND_BUILTIN)) { MPIDI_Callback_process_userdefined_dt(context, sndbuf, sndlen, rreq); goto fn_exit_short; } size_t dt_size = rreq->mpid.userbufcount * MPID_Datatype_get_basic_size(rreq->mpid.datatype); /* ----------------------------- */ /* Test for truncated message. 
*/ /* ----------------------------- */ if (unlikely(sndlen > dt_size)) { #if ASSERT_LEVEL > 0 MPIDI_Callback_process_trunc(context, rreq, NULL, sndbuf); goto fn_exit_short; #else sndlen = dt_size; #endif } MPID_assert(rreq->mpid.uebuf == NULL); MPID_assert(rreq->mpid.uebuflen == 0); void* rcvbuf = rreq->mpid.userbuf; if (sndlen > 0) { #if CUDA_AWARE_SUPPORT if(MPIDI_Process.cuda_aware_support_on && MPIDI_cuda_is_device_buf(rcvbuf)) { cudaError_t cudaerr = CudaMemcpy(rcvbuf, sndbuf, (size_t)sndlen, cudaMemcpyHostToDevice); } else #endif memcpy(rcvbuf, sndbuf, sndlen); } TRACE_SET_R_VAL(source,(rreq->mpid.idx),rlen,sndlen); TRACE_SET_R_BIT(source,(rreq->mpid.idx),fl.f.comp_in_HH); TRACE_SET_R_VAL(source,(rreq->mpid.idx),bufadd,rreq->mpid.userbuf); MPIDI_Request_complete(rreq); fn_exit_short: #ifdef OUT_OF_ORDER_HANDLING MPIU_THREAD_CS_ENTER(MSGQUEUE,0); if (MPIDI_In_cntr[source].n_OutOfOrderMsgs>0) { MPIDI_Recvq_process_out_of_order_msgs(source, context); } MPIU_THREAD_CS_EXIT(MSGQUEUE,0); #endif /* ---------------------------------------- */ /* Signal that the recv has been started. */ /* ---------------------------------------- */ MPIDI_Progress_signal(); }
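/*
 * The control flow in MPIDI_RecvShortCB is subtle because the message-queue
 * lock is dropped while the unexpected-data buffer is allocated, so the
 * posted-receive queue must be searched a second time.  A compiled-out
 * sketch of just that shape; toy_req_t, toy_match(), toy_lock(),
 * toy_unlock(), toy_enqueue_unexpected() and toy_deliver() are hypothetical:
 */
#if 0
#include <stdlib.h>
#include <string.h>

typedef struct toy_req { char *uebuf; size_t uebuflen; } toy_req_t;

extern toy_req_t *toy_match(int rank, int tag);      /* posted-receive lookup */
extern void toy_lock(void);
extern void toy_unlock(void);
extern void toy_enqueue_unexpected(toy_req_t *req);  /* early-arrival queue */
extern void toy_deliver(toy_req_t *req, const char *payload, size_t len);

static void toy_handle_short(int rank, int tag, const char *payload, size_t len)
{
  toy_lock();
  toy_req_t *req = toy_match(rank, tag);
  if (req == NULL) {
    toy_unlock();                          /* allocate outside the lock */
    toy_req_t *newreq = malloc(sizeof *newreq);
    newreq->uebuf = malloc(len);
    newreq->uebuflen = len;
    toy_lock();
    req = toy_match(rank, tag);            /* a recv may have been posted */
    if (req == NULL) {
      memcpy(newreq->uebuf, payload, len); /* park as an early arrival */
      toy_enqueue_unexpected(newreq);
      toy_unlock();
      return;
    }
    toy_unlock();
    free(newreq->uebuf);                   /* lost the race: discard spare */
    free(newreq);
  } else {
    toy_unlock();
  }
  toy_deliver(req, payload, len);          /* matched: copy to the user */
}
#endif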
int MPIDO_Gatherv(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf, const int *recvcounts, const int *displs, MPI_Datatype recvtype, int root, MPID_Comm * comm_ptr, int *mpierrno) { #ifndef HAVE_PAMI_IN_PLACE if (sendbuf == MPI_IN_PLACE) { MPID_Abort (NULL, 0, 1, "'MPI_IN_PLACE' requires support for `PAMI_IN_PLACE`"); return -1; } #endif TRACE_ERR("Entering MPIDO_Gatherv\n"); int i; int contig ATTRIBUTE((unused)), rsize ATTRIBUTE((unused)), ssize ATTRIBUTE((unused)); int pamidt = 1; MPID_Datatype *dt_ptr = NULL; MPI_Aint send_true_lb, recv_true_lb; char *sbuf, *rbuf; pami_type_t stype, rtype; int tmp; volatile unsigned gatherv_active = 1; const int rank = comm_ptr->rank; const int size = comm_ptr->local_size; #if ASSERT_LEVEL==0 /* We can't afford the tracing in ndebug/performance libraries */ const unsigned verbose = 0; #else const unsigned verbose = (MPIDI_Process.verbose >= MPIDI_VERBOSE_DETAILS_ALL) && (rank == 0); #endif const struct MPIDI_Comm* const mpid = &(comm_ptr->mpid); const int selected_type = mpid->user_selected_type[PAMI_XFER_GATHERV_INT]; /* Check for native PAMI types and MPI_IN_PLACE on sendbuf */ /* MPI_IN_PLACE is a nonlocal decision. We will need a preallreduce if we ever have * multiple "good" gathervs that work on different counts for example */ if((sendbuf != MPI_IN_PLACE) && (MPIDI_Datatype_to_pami(sendtype, &stype, -1, NULL, &tmp) != MPI_SUCCESS)) pamidt = 0; if(MPIDI_Datatype_to_pami(recvtype, &rtype, -1, NULL, &tmp) != MPI_SUCCESS) pamidt = 0; if(pamidt == 0 || selected_type == MPID_COLL_USE_MPICH) { if(unlikely(verbose)) fprintf(stderr,"Using MPICH gatherv algorithm\n"); TRACE_ERR("GATHERV using MPICH\n"); MPIDI_Update_last_algorithm(comm_ptr, "GATHERV_MPICH"); #if CUDA_AWARE_SUPPORT if(MPIDI_Process.cuda_aware_support_on) { MPI_Aint sdt_extent,rdt_extent; MPID_Datatype_get_extent_macro(sendtype, sdt_extent); MPID_Datatype_get_extent_macro(recvtype, rdt_extent); char *scbuf = NULL; char *rcbuf = NULL; int is_send_dev_buf = MPIDI_cuda_is_device_buf(sendbuf); int is_recv_dev_buf = (rank == root) ? 
MPIDI_cuda_is_device_buf(recvbuf) : 0; if(is_send_dev_buf) { scbuf = MPL_malloc(sdt_extent * sendcount); cudaError_t cudaerr = CudaMemcpy(scbuf, sendbuf, sdt_extent * sendcount, cudaMemcpyDeviceToHost); if (cudaSuccess != cudaerr) fprintf(stderr, "cudaMemcpy failed: %s\n", CudaGetErrorString(cudaerr)); } else scbuf = (char *)sendbuf; size_t rtotal_buf; if(is_recv_dev_buf) { //Since displs can be non-contiguous, we need to calculate max buffer size int highest_displs = displs[size - 1]; int highest_recvcount = recvcounts[size - 1]; for(i = 0; i < size; i++) { if(displs[i]+recvcounts[i] > highest_displs+highest_recvcount) { highest_displs = displs[i]; highest_recvcount = recvcounts[i]; } } rtotal_buf = (highest_displs+highest_recvcount)*rdt_extent; rcbuf = MPL_malloc(rtotal_buf); if(sendbuf == MPI_IN_PLACE) { cudaError_t cudaerr = CudaMemcpy(rcbuf, recvbuf, rtotal_buf, cudaMemcpyDeviceToHost); if (cudaSuccess != cudaerr) fprintf(stderr, "cudaMemcpy failed: %s\n", CudaGetErrorString(cudaerr)); } else memset(rcbuf, 0, rtotal_buf); } else rcbuf = recvbuf; int cuda_res = MPIR_Gatherv(scbuf, sendcount, sendtype, rcbuf, recvcounts, displs, recvtype, root, comm_ptr, mpierrno); if(is_send_dev_buf)MPL_free(scbuf); if(is_recv_dev_buf) { cudaError_t cudaerr = CudaMemcpy(recvbuf, rcbuf, rtotal_buf, cudaMemcpyHostToDevice); if (cudaSuccess != cudaerr) fprintf(stderr, "cudaMemcpy failed: %s\n", CudaGetErrorString(cudaerr)); MPL_free(rcbuf); } return cuda_res; } else #endif return MPIR_Gatherv(sendbuf, sendcount, sendtype, recvbuf, recvcounts, displs, recvtype, root, comm_ptr, mpierrno); } MPIDI_Datatype_get_info(1, recvtype, contig, rsize, dt_ptr, recv_true_lb); rbuf = (char *)recvbuf + recv_true_lb; sbuf = (void *) sendbuf; pami_xfer_t gatherv; gatherv.cb_done = cb_gatherv; gatherv.cookie = (void *)&gatherv_active; gatherv.cmd.xfer_gatherv_int.root = MPIDI_Task_to_endpoint(MPID_VCR_GET_LPID(comm_ptr->vcr, root), 0); gatherv.cmd.xfer_gatherv_int.rcvbuf = rbuf; gatherv.cmd.xfer_gatherv_int.rtype = rtype; gatherv.cmd.xfer_gatherv_int.rtypecounts = (int *) recvcounts; gatherv.cmd.xfer_gatherv_int.rdispls = (int *) displs; gatherv.cmd.xfer_gatherv_int.sndbuf = NULL; gatherv.cmd.xfer_gatherv_int.stype = stype; gatherv.cmd.xfer_gatherv_int.stypecount = sendcount; if(rank == root) { if(sendbuf == MPI_IN_PLACE) { if(unlikely(verbose)) fprintf(stderr,"gatherv MPI_IN_PLACE buffering\n"); sbuf = PAMI_IN_PLACE; gatherv.cmd.xfer_gatherv_int.stype = rtype; gatherv.cmd.xfer_gatherv_int.stypecount = recvcounts[rank]; } else { MPIDI_Datatype_get_info(1, sendtype, contig, ssize, dt_ptr, send_true_lb); sbuf = (char *)sbuf + send_true_lb; } } gatherv.cmd.xfer_gatherv_int.sndbuf = sbuf; pami_algorithm_t my_gatherv; const pami_metadata_t *my_md = (pami_metadata_t *)NULL; int queryreq = 0; if(selected_type == MPID_COLL_OPTIMIZED) { TRACE_ERR("Optimized gatherv %s was selected\n", mpid->opt_protocol_md[PAMI_XFER_GATHERV_INT][0].name); my_gatherv = mpid->opt_protocol[PAMI_XFER_GATHERV_INT][0]; my_md = &mpid->opt_protocol_md[PAMI_XFER_GATHERV_INT][0]; queryreq = mpid->must_query[PAMI_XFER_GATHERV_INT][0]; } else { TRACE_ERR("Optimized gatherv %s was set by user\n", mpid->user_metadata[PAMI_XFER_GATHERV_INT].name); my_gatherv = mpid->user_selected[PAMI_XFER_GATHERV_INT]; my_md = &mpid->user_metadata[PAMI_XFER_GATHERV_INT]; queryreq = selected_type; } gatherv.algorithm = my_gatherv; if(unlikely(queryreq == MPID_COLL_ALWAYS_QUERY || queryreq == MPID_COLL_CHECK_FN_REQUIRED)) { metadata_result_t result = {0}; TRACE_ERR("querying gatherv 
protocol %s, type was %d\n", my_md->name, queryreq); if(my_md->check_fn == NULL) { /* process metadata bits */ if((!my_md->check_correct.values.inplace) && (sendbuf == MPI_IN_PLACE)) result.check.unspecified = 1; /* Can't check ranges like this. Non-local. Comment out for now. if(my_md->check_correct.values.rangeminmax) { MPI_Aint data_true_lb; MPID_Datatype *data_ptr; int data_size, data_contig; MPIDI_Datatype_get_info(sendcount, sendtype, data_contig, data_size, data_ptr, data_true_lb); if((my_md->range_lo <= data_size) && (my_md->range_hi >= data_size)) ; else { result.check.range = 1; if(unlikely(verbose)) { fprintf(stderr,"message size (%u) outside range (%zu<->%zu) for %s.\n", data_size, my_md->range_lo, my_md->range_hi, my_md->name); } } } */ } else /* calling the check fn is sufficient */ result = my_md->check_fn(&gatherv); TRACE_ERR("bitmask: %#X\n", result.bitmask); result.check.nonlocal = 0; /* #warning REMOVE THIS WHEN IMPLEMENTED */ if(result.bitmask) { if(unlikely(verbose)) fprintf(stderr,"Query failed for %s. Using MPICH gatherv.\n", my_md->name); MPIDI_Update_last_algorithm(comm_ptr, "GATHERV_MPICH"); return MPIR_Gatherv(sendbuf, sendcount, sendtype, recvbuf, recvcounts, displs, recvtype, root, comm_ptr, mpierrno); } if(my_md->check_correct.values.asyncflowctl && !(--(comm_ptr->mpid.num_requests))) { comm_ptr->mpid.num_requests = MPIDI_Process.optimized.num_requests; int tmpmpierrno; if(unlikely(verbose)) fprintf(stderr,"Query barrier required for %s\n", my_md->name); MPIDO_Barrier(comm_ptr, &tmpmpierrno); } } MPIDI_Update_last_algorithm(comm_ptr, my_md->name); if(unlikely(verbose)) { unsigned long long int threadID; MPL_thread_id_t tid; MPL_thread_self(&tid); threadID = (unsigned long long int)tid; fprintf(stderr,"<%llx> Using protocol %s for gatherv on %u\n", threadID, my_md->name, (unsigned) comm_ptr->context_id); } MPIDI_Post_coll_t gatherv_post; MPIDI_Context_post(MPIDI_Context[0], &gatherv_post.state, MPIDI_Pami_post_wrapper, (void *)&gatherv); TRACE_ERR("Waiting on active %d\n", gatherv_active); MPID_PROGRESS_WAIT_WHILE(gatherv_active); TRACE_ERR("Leaving MPIDO_Gatherv\n"); return 0; }
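/*
 * Every MPIDO_* collective in this file repeats the query step used above:
 * if the selected protocol carries a check_fn, that function judges the
 * exact call; otherwise the static metadata bits are consulted, and any
 * nonzero result bitmask means "fall back to MPICH".  A compiled-out sketch
 * with a hypothetical toy_md_t standing in for pami_metadata_t:
 */
#if 0
typedef struct toy_md {
  int inplace_ok;                          /* ~ check_correct.values.inplace */
  unsigned (*check_fn)(const void *xfer);  /* NULL => rely on static bits */
} toy_md_t;

/* Returns 1 when the protocol may run, 0 when the caller should fall back
 * to the MPICH implementation. */
static int toy_query_ok(const toy_md_t *md, const void *xfer, int is_in_place)
{
  unsigned bitmask = 0;
  if (md->check_fn == NULL) {
    if (!md->inplace_ok && is_in_place)
      bitmask = 1;                         /* ~ result.check.unspecified */
  } else {
    bitmask = md->check_fn(xfer);          /* protocol inspects the call */
  }
  return bitmask == 0;
}
#endif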
int MPIDO_Allgather(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf, int recvcount, MPI_Datatype recvtype, MPID_Comm * comm_ptr, int *mpierrno) { #ifndef HAVE_PAMI_IN_PLACE if (sendbuf == MPI_IN_PLACE) { MPID_Abort (NULL, 0, 1, "'MPI_IN_PLACE' requires support for `PAMI_IN_PLACE`"); return -1; } #endif /* ********************************* * Check the nature of the buffers * ********************************* */ const struct MPIDI_Comm* const mpid = &(comm_ptr->mpid); int config[6], i; MPID_Datatype * dt_null = NULL; MPI_Aint send_true_lb = 0; MPI_Aint recv_true_lb = 0; int comm_size = comm_ptr->local_size; size_t send_bytes = 0; size_t recv_bytes = 0; volatile unsigned allred_active = 1; volatile unsigned allgather_active = 1; pami_xfer_t allred; const int rank = comm_ptr->rank; int queryreq = 0; #if ASSERT_LEVEL==0 /* We can't afford the tracing in ndebug/performance libraries */ const unsigned verbose = 0; #else const unsigned verbose = (MPIDI_Process.verbose >= MPIDI_VERBOSE_DETAILS_ALL) && (rank == 0); #endif const int selected_type = mpid->user_selected_type[PAMI_XFER_ALLGATHER]; for (i=0;i<6;i++) config[i] = 1; const pami_metadata_t *my_md = (pami_metadata_t *)NULL; allred.cb_done = allred_cb_done; allred.cookie = (void *)&allred_active; /* Pick an algorithm that is guaranteed to work for the pre-allreduce */ /* TODO: This needs selection for fast(er|est) allreduce protocol */ allred.algorithm = mpid->coll_algorithm[PAMI_XFER_ALLREDUCE][0][0]; allred.cmd.xfer_allreduce.sndbuf = (void *)config; allred.cmd.xfer_allreduce.stype = PAMI_TYPE_SIGNED_INT; allred.cmd.xfer_allreduce.rcvbuf = (void *)config; allred.cmd.xfer_allreduce.rtype = PAMI_TYPE_SIGNED_INT; allred.cmd.xfer_allreduce.stypecount = 6; allred.cmd.xfer_allreduce.rtypecount = 6; allred.cmd.xfer_allreduce.op = PAMI_DATA_BAND; char use_tree_reduce, use_alltoall, use_bcast, use_pami, use_opt; char *rbuf = NULL, *sbuf = NULL; const char * const allgathers = mpid->allgathers; use_alltoall = allgathers[2]; use_tree_reduce = allgathers[0]; use_bcast = allgathers[1]; use_pami = (selected_type == MPID_COLL_USE_MPICH) ? 
0 : 1; use_opt = use_alltoall || use_tree_reduce || use_bcast || use_pami; TRACE_ERR("flags before: b: %d a: %d t: %d p: %d\n", use_bcast, use_alltoall, use_tree_reduce, use_pami); if(!use_opt) { if(unlikely(verbose)) fprintf(stderr,"Using MPICH allgather algorithm\n"); TRACE_ERR("No options set/available; using MPICH for allgather\n"); MPIDI_Update_last_algorithm(comm_ptr, "ALLGATHER_MPICH"); #if CUDA_AWARE_SUPPORT if(MPIDI_Process.cuda_aware_support_on) { MPI_Aint sdt_extent,rdt_extent; MPID_Datatype_get_extent_macro(sendtype, sdt_extent); MPID_Datatype_get_extent_macro(recvtype, rdt_extent); char *scbuf = NULL; char *rcbuf = NULL; int is_send_dev_buf = MPIDI_cuda_is_device_buf(sendbuf); int is_recv_dev_buf = MPIDI_cuda_is_device_buf(recvbuf); if(is_send_dev_buf) { scbuf = MPL_malloc(sdt_extent * sendcount); cudaError_t cudaerr = CudaMemcpy(scbuf, sendbuf, sdt_extent * sendcount, cudaMemcpyDeviceToHost); if (cudaSuccess != cudaerr) fprintf(stderr, "cudaMemcpy failed: %s\n", CudaGetErrorString(cudaerr)); } else scbuf = (char *)sendbuf; if(is_recv_dev_buf) { rcbuf = MPL_malloc(rdt_extent * recvcount); if(sendbuf == MPI_IN_PLACE) { cudaError_t cudaerr = CudaMemcpy(rcbuf, recvbuf, rdt_extent * recvcount, cudaMemcpyDeviceToHost); if (cudaSuccess != cudaerr) fprintf(stderr, "cudaMemcpy failed: %s\n", CudaGetErrorString(cudaerr)); } else memset(rcbuf, 0, rdt_extent * recvcount); } else rcbuf = recvbuf; int cuda_res = MPIR_Allgather(scbuf, sendcount, sendtype, rcbuf, recvcount, recvtype, comm_ptr, mpierrno); if(is_send_dev_buf)MPL_free(scbuf); if(is_recv_dev_buf) { cudaError_t cudaerr = CudaMemcpy(recvbuf, rcbuf, rdt_extent * recvcount, cudaMemcpyHostToDevice); if (cudaSuccess != cudaerr) fprintf(stderr, "cudaMemcpy failed: %s\n", CudaGetErrorString(cudaerr)); MPL_free(rcbuf); } return cuda_res; } else #endif return MPIR_Allgather(sendbuf, sendcount, sendtype, recvbuf, recvcount, recvtype, comm_ptr, mpierrno); } if ((sendcount < 1 && sendbuf != MPI_IN_PLACE) || recvcount < 1) return MPI_SUCCESS; /* Gather datatype information */ MPIDI_Datatype_get_info(recvcount, recvtype, config[MPID_RECV_CONTIG], recv_bytes, dt_null, recv_true_lb); send_bytes = recv_bytes; rbuf = (char *)recvbuf+recv_true_lb; sbuf = PAMI_IN_PLACE; if(sendbuf != MPI_IN_PLACE) { MPIDI_Datatype_get_info(sendcount, sendtype, config[MPID_SEND_CONTIG], send_bytes, dt_null, send_true_lb); sbuf = (char *)sendbuf+send_true_lb; } else if(unlikely(verbose)) fprintf(stderr,"allgather MPI_IN_PLACE buffering\n"); /* verify everyone's datatype contiguity */ /* Check buffer alignment now, since we're pre-allreducing anyway */ /* Only do this if one of the glue protocols is likely to be used */ if(use_alltoall || use_tree_reduce || use_bcast) { config[MPID_ALIGNEDBUFFER] = !((long)sendbuf & 0x0F) && !((long)recvbuf & 0x0F); /* #warning need to determine best allreduce for short messages */ if(mpid->preallreduces[MPID_ALLGATHER_PREALLREDUCE]) { TRACE_ERR("Preallreducing in allgather\n"); MPIDI_Post_coll_t allred_post; MPIDI_Context_post(MPIDI_Context[0], &allred_post.state, MPIDI_Pami_post_wrapper, (void *)&allred); MPID_PROGRESS_WAIT_WHILE(allred_active); } use_alltoall = allgathers[2] && config[MPID_RECV_CONTIG] && config[MPID_SEND_CONTIG]; /* Note: some of the glue protocols use recv_bytes*comm_size rather than * recv_bytes so we use that for comparison here, plus we pass that in * to those protocols. 
*/ use_tree_reduce = allgathers[0] && config[MPID_RECV_CONTIG] && config[MPID_SEND_CONTIG] && config[MPID_RECV_CONTINUOUS] && (recv_bytes*comm_size%sizeof(unsigned)) == 0; use_bcast = allgathers[1]; TRACE_ERR("flags after: b: %d a: %d t: %d p: %d\n", use_bcast, use_alltoall, use_tree_reduce, use_pami); } if(use_pami) { TRACE_ERR("Using PAMI-level allgather protocol\n"); pami_xfer_t allgather; allgather.cb_done = allgather_cb_done; allgather.cookie = (void *)&allgather_active; allgather.cmd.xfer_allgather.rcvbuf = rbuf; allgather.cmd.xfer_allgather.sndbuf = sbuf; allgather.cmd.xfer_allgather.stype = PAMI_TYPE_BYTE; allgather.cmd.xfer_allgather.rtype = PAMI_TYPE_BYTE; allgather.cmd.xfer_allgather.stypecount = send_bytes; allgather.cmd.xfer_allgather.rtypecount = recv_bytes; if(selected_type == MPID_COLL_OPTIMIZED) { if((mpid->cutoff_size[PAMI_XFER_ALLGATHER][0] == 0) || (mpid->cutoff_size[PAMI_XFER_ALLGATHER][0] > 0 && mpid->cutoff_size[PAMI_XFER_ALLGATHER][0] >= send_bytes)) { allgather.algorithm = mpid->opt_protocol[PAMI_XFER_ALLGATHER][0]; my_md = &mpid->opt_protocol_md[PAMI_XFER_ALLGATHER][0]; queryreq = mpid->must_query[PAMI_XFER_ALLGATHER][0]; } else { return MPIR_Allgather(sendbuf, sendcount, sendtype, recvbuf, recvcount, recvtype, comm_ptr, mpierrno); } } else { allgather.algorithm = mpid->user_selected[PAMI_XFER_ALLGATHER]; my_md = &mpid->user_metadata[PAMI_XFER_ALLGATHER]; queryreq = selected_type; } if(unlikely( queryreq == MPID_COLL_ALWAYS_QUERY || queryreq == MPID_COLL_CHECK_FN_REQUIRED)) { metadata_result_t result = {0}; TRACE_ERR("Querying allgather protocol %s, type was: %d\n", my_md->name, selected_type); if(my_md->check_fn == NULL) { /* process metadata bits */ if((!my_md->check_correct.values.inplace) && (sendbuf == MPI_IN_PLACE)) result.check.unspecified = 1; if(my_md->check_correct.values.rangeminmax) { if((my_md->range_lo <= recv_bytes) && (my_md->range_hi >= recv_bytes)) ; /* ok, algorithm selected */ else { result.check.range = 1; if(unlikely(verbose)) { fprintf(stderr,"message size (%zu) outside range (%zu<->%zu) for %s.\n", recv_bytes, my_md->range_lo, my_md->range_hi, my_md->name); } } } } else /* calling the check fn is sufficient */ result = my_md->check_fn(&allgather); TRACE_ERR("bitmask: %#X\n", result.bitmask); result.check.nonlocal = 0; /* #warning REMOVE THIS WHEN IMPLEMENTED */ if(result.bitmask) { if(unlikely(verbose)) fprintf(stderr,"Query failed for %s. 
Using MPICH allgather\n", my_md->name); MPIDI_Update_last_algorithm(comm_ptr, "ALLGATHER_MPICH"); return MPIR_Allgather(sendbuf, sendcount, sendtype, recvbuf, recvcount, recvtype, comm_ptr, mpierrno); } if(my_md->check_correct.values.asyncflowctl && !(--(comm_ptr->mpid.num_requests))) { comm_ptr->mpid.num_requests = MPIDI_Process.optimized.num_requests; int tmpmpierrno; if(unlikely(verbose)) fprintf(stderr,"Query barrier required for %s\n", my_md->name); MPIDO_Barrier(comm_ptr, &tmpmpierrno); } } if(unlikely(verbose)) { unsigned long long int threadID; MPL_thread_id_t tid; MPL_thread_self(&tid); threadID = (unsigned long long int)tid; fprintf(stderr,"<%llx> Using protocol %s for allgather on %u\n", threadID, my_md->name, (unsigned) comm_ptr->context_id); } TRACE_ERR("Calling PAMI_Collective with allgather structure\n"); MPIDI_Post_coll_t allgather_post; MPIDI_Context_post(MPIDI_Context[0], &allgather_post.state, MPIDI_Pami_post_wrapper, (void *)&allgather); MPIDI_Update_last_algorithm(comm_ptr, my_md->name); MPID_PROGRESS_WAIT_WHILE(allgather_active); TRACE_ERR("Allgather done\n"); return PAMI_SUCCESS; } if(use_tree_reduce) { if(unlikely(verbose)) fprintf(stderr,"Using protocol GLUE_ALLREDUCE for allgather\n"); TRACE_ERR("Using allgather via allreduce\n"); MPIDI_Update_last_algorithm(comm_ptr, "ALLGATHER_OPT_ALLREDUCE"); return MPIDO_Allgather_allreduce(sendbuf, sendcount, sendtype, recvbuf, recvcount, recvtype, send_true_lb, recv_true_lb, send_bytes, recv_bytes*comm_size, comm_ptr, mpierrno); } if(use_alltoall) { if(unlikely(verbose)) fprintf(stderr,"Using protocol GLUE_ALLTOALL for allgather\n"); TRACE_ERR("Using allgather via alltoall\n"); MPIDI_Update_last_algorithm(comm_ptr, "ALLGATHER_OPT_ALLTOALL"); return MPIDO_Allgather_alltoall(sendbuf, sendcount, sendtype, recvbuf, recvcount, recvtype, send_true_lb, recv_true_lb, send_bytes, recv_bytes*comm_size, comm_ptr, mpierrno); } if(use_bcast) { if(unlikely(verbose)) fprintf(stderr,"Using protocol GLUE_BCAST for allgather\n"); TRACE_ERR("Using allgather via bcast\n"); MPIDI_Update_last_algorithm(comm_ptr, "ALLGATHER_OPT_BCAST"); return MPIDO_Allgather_bcast(sendbuf, sendcount, sendtype, recvbuf, recvcount, recvtype, send_true_lb, recv_true_lb, send_bytes, recv_bytes*comm_size, comm_ptr, mpierrno); } /* Nothing used yet; dump to MPICH */ if(unlikely(verbose)) fprintf(stderr,"Using MPICH allgather algorithm\n"); TRACE_ERR("Using allgather via mpich\n"); MPIDI_Update_last_algorithm(comm_ptr, "ALLGATHER_MPICH"); return MPIR_Allgather(sendbuf, sendcount, sendtype, recvbuf, recvcount, recvtype, comm_ptr, mpierrno); }
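/*
 * Why a "tree reduce" can implement allgather: if every rank contributes a
 * zero-filled buffer with only its own block populated, a bitwise-OR
 * allreduce over whole unsigned words reproduces the concatenation on all
 * ranks -- which is also why the glue path above requires contiguous
 * buffers and recv_bytes*comm_size to be a multiple of sizeof(unsigned).
 * Compiled-out sketch against the public MPI API (not the PAMI glue itself);
 * allgather_via_allreduce() is a hypothetical name:
 */
#if 0
#include <mpi.h>
#include <string.h>

/* Assumes block_bytes * comm size is a multiple of sizeof(unsigned),
 * as the caller above guarantees before taking this path. */
static int allgather_via_allreduce(const void *sendbuf, int block_bytes,
                                   void *recvbuf, MPI_Comm comm)
{
  int rank, size;
  MPI_Comm_rank(comm, &rank);
  MPI_Comm_size(comm, &size);
  memset(recvbuf, 0, (size_t)block_bytes * size);       /* zero everything  */
  memcpy((char *)recvbuf + (size_t)rank * block_bytes,  /* ...but my block  */
         sendbuf, (size_t)block_bytes);
  return MPI_Allreduce(MPI_IN_PLACE, recvbuf,           /* OR with 0 = copy */
                       block_bytes * size / (int)sizeof(unsigned),
                       MPI_UNSIGNED, MPI_BOR, comm);
}
#endif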
int MPIDO_Reduce_scatter(const void *sendbuf, void *recvbuf, int *recvcounts, MPI_Datatype datatype, MPI_Op op, MPID_Comm *comm_ptr, int *mpierrno) { const int rank = comm_ptr->rank; const int size = comm_ptr->local_size; #if ASSERT_LEVEL==0 /* We can't afford the tracing in ndebug/performance libraries */ const unsigned verbose = 0; #else const unsigned verbose = (MPIDI_Process.verbose >= MPIDI_VERBOSE_DETAILS_ALL) && (rank == 0); #endif if(unlikely(verbose)) fprintf(stderr,"Using MPICH reduce_scatter algorithm\n"); #if CUDA_AWARE_SUPPORT if(MPIDI_Process.cuda_aware_support_on) { MPI_Aint dt_extent; MPID_Datatype_get_extent_macro(datatype, dt_extent); char *scbuf = NULL; char *rcbuf = NULL; int is_send_dev_buf = MPIDI_cuda_is_device_buf(sendbuf); int is_recv_dev_buf = MPIDI_cuda_is_device_buf(recvbuf); int i; size_t total_buf = 0; for(i = 0; i < size; i++) { total_buf += recvcounts[i]; } if(is_send_dev_buf) { scbuf = MPIU_Malloc(dt_extent * total_buf); cudaError_t cudaerr = CudaMemcpy(scbuf, sendbuf, dt_extent * total_buf, cudaMemcpyDeviceToHost); if (cudaSuccess != cudaerr) fprintf(stderr, "cudaMemcpy failed: %s\n", CudaGetErrorString(cudaerr)); } else scbuf = sendbuf; if(is_recv_dev_buf) { rcbuf = MPIU_Malloc(total_buf * dt_extent); if(sendbuf == MPI_IN_PLACE) { cudaError_t cudaerr = CudaMemcpy(rcbuf, recvbuf, dt_extent * total_buf, cudaMemcpyDeviceToHost); if (cudaSuccess != cudaerr) fprintf(stderr, "cudaMemcpy failed: %s\n", CudaGetErrorString(cudaerr)); } else memset(rcbuf, 0, total_buf * dt_extent); } else rcbuf = recvbuf; int cuda_res = MPIR_Reduce_scatter(scbuf, rcbuf, recvcounts, datatype, op, comm_ptr, mpierrno); if(is_send_dev_buf)MPIU_Free(scbuf); if(is_recv_dev_buf) { cudaError_t cudaerr = CudaMemcpy(recvbuf, rcbuf, dt_extent * total_buf, cudaMemcpyHostToDevice); if (cudaSuccess != cudaerr) fprintf(stderr, "cudaMemcpy failed: %s\n", CudaGetErrorString(cudaerr)); MPIU_Free(rcbuf); } return cuda_res; } else #endif return MPIR_Reduce_scatter(sendbuf, recvbuf, recvcounts, datatype, op, comm_ptr, mpierrno); }
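/*
 * The CUDA path in MPIDO_Reduce_scatter above is an instance of the generic
 * stage-through-host pattern: device-to-host copy, run the host-only
 * algorithm, copy back.  Compiled-out sketch using the plain CUDA runtime
 * API directly (the CudaMemcpy/CudaGetErrorString wrappers used in this
 * file are assumed to forward to it); staged_host_op() and host_op are
 * hypothetical:
 */
#if 0
#include <cuda_runtime.h>
#include <stdlib.h>

static int staged_host_op(void *devbuf, size_t nbytes,
                          int (*host_op)(void *buf, size_t nbytes))
{
  void *host = malloc(nbytes);
  if (host == NULL)
    return -1;
  if (cudaMemcpy(host, devbuf, nbytes, cudaMemcpyDeviceToHost) != cudaSuccess) {
    free(host);
    return -1;
  }
  int rc = host_op(host, nbytes);  /* e.g. an MPIR_* host implementation */
  if (cudaMemcpy(devbuf, host, nbytes, cudaMemcpyHostToDevice) != cudaSuccess)
    rc = -1;
  free(host);
  return rc;
}
#endif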
int MPIDO_Reduce(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, MPI_Op op, int root, MPID_Comm *comm_ptr, int *mpierrno) { #ifndef HAVE_PAMI_IN_PLACE if (sendbuf == MPI_IN_PLACE) { MPID_Abort (NULL, 0, 1, "'MPI_IN_PLACE' requires support for `PAMI_IN_PLACE`"); return -1; } #endif MPID_Datatype *dt_null = NULL; MPI_Aint true_lb = 0; int dt_contig ATTRIBUTE((unused)), tsize; int mu; char *sbuf, *rbuf; pami_data_function pop; pami_type_t pdt; int rc; int alg_selected = 0; const int rank = comm_ptr->rank; #if ASSERT_LEVEL==0 /* We can't afford the tracing in ndebug/performance libraries */ const unsigned verbose = 0; #else const unsigned verbose = (MPIDI_Process.verbose >= MPIDI_VERBOSE_DETAILS_ALL) && (rank == 0); #endif const struct MPIDI_Comm* const mpid = &(comm_ptr->mpid); const int selected_type = mpid->user_selected_type[PAMI_XFER_REDUCE]; rc = MPIDI_Datatype_to_pami(datatype, &pdt, op, &pop, &mu); if(unlikely(verbose)) fprintf(stderr,"reduce - rc %u, root %u, count %d, dt: %p, op: %p, mu: %u, selectedvar %u != %u (MPICH) sendbuf %p, recvbuf %p\n", rc, root, count, pdt, pop, mu, (unsigned)selected_type, MPID_COLL_USE_MPICH,sendbuf, recvbuf); pami_xfer_t reduce; pami_algorithm_t my_reduce=0; const pami_metadata_t *my_md = (pami_metadata_t *)NULL; int queryreq = 0; volatile unsigned reduce_active = 1; MPIDI_Datatype_get_info(count, datatype, dt_contig, tsize, dt_null, true_lb); rbuf = (char *)recvbuf + true_lb; sbuf = (char *)sendbuf + true_lb; if(sendbuf == MPI_IN_PLACE) { if(unlikely(verbose)) fprintf(stderr,"reduce MPI_IN_PLACE send buffering (%d,%d)\n",count,tsize); sbuf = PAMI_IN_PLACE; } reduce.cb_done = reduce_cb_done; reduce.cookie = (void *)&reduce_active; if(mpid->optreduce) /* GLUE_ALLREDUCE */ { char* tbuf = NULL; if(unlikely(verbose)) fprintf(stderr,"Using protocol GLUE_ALLREDUCE for reduce (%d,%d)\n",count,tsize); MPIDI_Update_last_algorithm(comm_ptr, "REDUCE_OPT_ALLREDUCE"); void *destbuf = recvbuf; if(rank != root) /* temp buffer for non-root destbuf */ { tbuf = destbuf = MPL_malloc(tsize); } /* Switch to comm->coll_fns->fn() */ MPIDO_Allreduce(sendbuf, destbuf, count, datatype, op, comm_ptr, mpierrno); if(tbuf) MPL_free(tbuf); return 0; } if(selected_type == MPID_COLL_USE_MPICH || rc != MPI_SUCCESS) { if(unlikely(verbose)) fprintf(stderr,"Using MPICH reduce algorithm\n"); #if CUDA_AWARE_SUPPORT if(MPIDI_Process.cuda_aware_support_on) { MPI_Aint dt_extent; MPID_Datatype_get_extent_macro(datatype, dt_extent); char *scbuf = NULL; char *rcbuf = NULL; int is_send_dev_buf = MPIDI_cuda_is_device_buf(sendbuf); int is_recv_dev_buf = MPIDI_cuda_is_device_buf(recvbuf); if(is_send_dev_buf) { scbuf = MPL_malloc(dt_extent * count); cudaError_t cudaerr = CudaMemcpy(scbuf, sendbuf, dt_extent * count, cudaMemcpyDeviceToHost); if (cudaSuccess != cudaerr) fprintf(stderr, "cudaMemcpy failed: %s\n", CudaGetErrorString(cudaerr)); } else scbuf = (char *)sendbuf; if(is_recv_dev_buf) { rcbuf = MPL_malloc(dt_extent * count); if(sendbuf == MPI_IN_PLACE) { cudaError_t cudaerr = CudaMemcpy(rcbuf, recvbuf, dt_extent * count, cudaMemcpyDeviceToHost); if (cudaSuccess != cudaerr) fprintf(stderr, "cudaMemcpy failed: %s\n", CudaGetErrorString(cudaerr)); } else memset(rcbuf, 0, dt_extent * count); } else rcbuf = recvbuf; int cuda_res = MPIR_Reduce(scbuf, rcbuf, count, datatype, op, root, comm_ptr, mpierrno); if(is_send_dev_buf)MPL_free(scbuf); if(is_recv_dev_buf) { cudaError_t cudaerr = CudaMemcpy(recvbuf, rcbuf, dt_extent * count, cudaMemcpyHostToDevice); if (cudaSuccess != cudaerr) 
fprintf(stderr, "cudaMemcpy failed: %s\n", CudaGetErrorString(cudaerr)); MPL_free(rcbuf); } return cuda_res; } else #endif return MPIR_Reduce(sendbuf, recvbuf, count, datatype, op, root, comm_ptr, mpierrno); } if(selected_type == MPID_COLL_OPTIMIZED) { if((mpid->cutoff_size[PAMI_XFER_REDUCE][0] == 0) || (mpid->cutoff_size[PAMI_XFER_REDUCE][0] >= tsize && mpid->cutoff_size[PAMI_XFER_REDUCE][0] > 0)) { TRACE_ERR("Optimized Reduce (%s) was pre-selected\n", mpid->opt_protocol_md[PAMI_XFER_REDUCE][0].name); my_reduce = mpid->opt_protocol[PAMI_XFER_REDUCE][0]; my_md = &mpid->opt_protocol_md[PAMI_XFER_REDUCE][0]; queryreq = mpid->must_query[PAMI_XFER_REDUCE][0]; } } else { TRACE_ERR("Optimized reduce (%s) was specified by user\n", mpid->user_metadata[PAMI_XFER_REDUCE].name); my_reduce = mpid->user_selected[PAMI_XFER_REDUCE]; my_md = &mpid->user_metadata[PAMI_XFER_REDUCE]; queryreq = selected_type; } reduce.algorithm = my_reduce; reduce.cmd.xfer_reduce.sndbuf = sbuf; reduce.cmd.xfer_reduce.rcvbuf = rbuf; reduce.cmd.xfer_reduce.stype = pdt; reduce.cmd.xfer_reduce.rtype = pdt; reduce.cmd.xfer_reduce.stypecount = count; reduce.cmd.xfer_reduce.rtypecount = count; reduce.cmd.xfer_reduce.op = pop; reduce.cmd.xfer_reduce.root = MPIDI_Task_to_endpoint(MPID_VCR_GET_LPID(comm_ptr->vcr, root), 0); if(unlikely(queryreq == MPID_COLL_ALWAYS_QUERY || queryreq == MPID_COLL_CHECK_FN_REQUIRED)) { metadata_result_t result = {0}; TRACE_ERR("Querying reduce protocol %s, type was %d\n", my_md->name, queryreq); if(my_md->check_fn == NULL) { /* process metadata bits */ if((!my_md->check_correct.values.inplace) && (sendbuf == MPI_IN_PLACE)) result.check.unspecified = 1; if(my_md->check_correct.values.rangeminmax) { MPI_Aint data_true_lb ATTRIBUTE((unused)); MPID_Datatype *data_ptr; int data_size, data_contig ATTRIBUTE((unused)); MPIDI_Datatype_get_info(count, datatype, data_contig, data_size, data_ptr, data_true_lb); if((my_md->range_lo <= data_size) && (my_md->range_hi >= data_size)) ; /* ok, algorithm selected */ else { result.check.range = 1; if(unlikely(verbose)) { fprintf(stderr,"message size (%u) outside range (%zu<->%zu) for %s.\n", data_size, my_md->range_lo, my_md->range_hi, my_md->name); } } } }
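/*
 * The GLUE_ALLREDUCE branch near the top of MPIDO_Reduce turns the reduce
 * into an allreduce: every rank computes the full result and non-root ranks
 * simply discard theirs into a scratch buffer.  Compiled-out sketch against
 * the public MPI API (the real code posts MPIDO_Allreduce and sizes the
 * scratch buffer from the packed data size); reduce_via_allreduce() is a
 * hypothetical name and MPI_IN_PLACE is not handled here:
 */
#if 0
#include <mpi.h>
#include <stdlib.h>

static int reduce_via_allreduce(const void *sendbuf, void *recvbuf, int count,
                                MPI_Datatype dt, MPI_Op op, int root,
                                MPI_Comm comm)
{
  int rank;
  MPI_Comm_rank(comm, &rank);
  void *dest = recvbuf, *tmp = NULL;
  if (rank != root) {              /* recvbuf is only significant at root */
    MPI_Aint lb, extent;
    MPI_Type_get_extent(dt, &lb, &extent);
    tmp = dest = malloc((size_t)count * (size_t)extent);
    if (tmp == NULL)
      return MPI_ERR_OTHER;
  }
  int rc = MPI_Allreduce(sendbuf, dest, count, dt, op, comm);
  free(tmp);                       /* free(NULL) is a no-op */
  return rc;
}
#endif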
int MPIDO_Allgatherv(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf, const int *recvcounts, const int *displs, MPI_Datatype recvtype, MPID_Comm * comm_ptr, int *mpierrno) { #ifndef HAVE_PAMI_IN_PLACE if (sendbuf == MPI_IN_PLACE) { MPID_Abort (NULL, 0, 1, "'MPI_IN_PLACE' requires support for `PAMI_IN_PLACE`"); return -1; } #endif TRACE_ERR("Entering MPIDO_Allgatherv\n"); /* function pointer to be used to point to appropriate algorithm */ /* Check the nature of the buffers */ MPID_Datatype *dt_null = NULL; MPI_Aint send_true_lb = 0; MPI_Aint recv_true_lb = 0; size_t send_size = 0; size_t recv_size = 0; int config[6]; int scount=sendcount; int i, rc, buffer_sum = 0; const int size = comm_ptr->local_size; char use_tree_reduce, use_alltoall, use_bcast, use_pami, use_opt; char *sbuf, *rbuf; const int rank = comm_ptr->rank; const struct MPIDI_Comm* const mpid = &(comm_ptr->mpid); int queryreq = 0; #if ASSERT_LEVEL==0 /* We can't afford the tracing in ndebug/performance libraries */ const unsigned verbose = 0; #else const unsigned verbose = (MPIDI_Process.verbose >= MPIDI_VERBOSE_DETAILS_ALL) && (rank == 0); #endif const int selected_type = mpid->user_selected_type[PAMI_XFER_ALLGATHERV_INT]; pami_xfer_t allred; volatile unsigned allred_active = 1; volatile unsigned allgatherv_active = 1; pami_type_t stype, rtype; int tmp; const pami_metadata_t *my_md = (pami_metadata_t *)NULL; for(i=0;i<6;i++) config[i] = 1; allred.cb_done = allred_cb_done; allred.cookie = (void *)&allred_active; allred.algorithm = mpid->coll_algorithm[PAMI_XFER_ALLREDUCE][0][0]; allred.cmd.xfer_allreduce.sndbuf = (void *)config; allred.cmd.xfer_allreduce.stype = PAMI_TYPE_SIGNED_INT; allred.cmd.xfer_allreduce.rcvbuf = (void *)config; allred.cmd.xfer_allreduce.rtype = PAMI_TYPE_SIGNED_INT; allred.cmd.xfer_allreduce.stypecount = 6; allred.cmd.xfer_allreduce.rtypecount = 6; allred.cmd.xfer_allreduce.op = PAMI_DATA_BAND; use_alltoall = mpid->allgathervs[2]; use_tree_reduce = mpid->allgathervs[0]; use_bcast = mpid->allgathervs[1]; use_pami = selected_type != MPID_COLL_USE_MPICH; if((sendbuf != MPI_IN_PLACE) && (MPIDI_Datatype_to_pami(sendtype, &stype, -1, NULL, &tmp) != MPI_SUCCESS)) use_pami = 0; if(MPIDI_Datatype_to_pami(recvtype, &rtype, -1, NULL, &tmp) != MPI_SUCCESS) use_pami = 0; use_opt = use_alltoall || use_tree_reduce || use_bcast || use_pami; if(!use_opt) /* back to MPICH */ { if(unlikely(verbose)) fprintf(stderr,"Using MPICH allgatherv type %u.\n", selected_type); TRACE_ERR("Using MPICH Allgatherv\n"); MPIDI_Update_last_algorithm(comm_ptr, "ALLGATHERV_MPICH"); #if CUDA_AWARE_SUPPORT if(MPIDI_Process.cuda_aware_support_on) { MPI_Aint sdt_extent,rdt_extent; MPID_Datatype_get_extent_macro(sendtype, sdt_extent); MPID_Datatype_get_extent_macro(recvtype, rdt_extent); char *scbuf = NULL; char *rcbuf = NULL; int is_send_dev_buf = MPIDI_cuda_is_device_buf(sendbuf); int is_recv_dev_buf = MPIDI_cuda_is_device_buf(recvbuf); if(is_send_dev_buf) { scbuf = MPIU_Malloc(sdt_extent * sendcount); cudaError_t cudaerr = CudaMemcpy(scbuf, sendbuf, sdt_extent * sendcount, cudaMemcpyDeviceToHost); if (cudaSuccess != cudaerr) fprintf(stderr, "cudaMemcpy failed: %s\n", CudaGetErrorString(cudaerr)); } else scbuf = (char *)sendbuf; size_t rtotal_buf; if(is_recv_dev_buf) { //Since displs can be non-contiguous, we need to calculate max buffer size int highest_displs = displs[size - 1]; int highest_recvcount = recvcounts[size - 1]; for(i = 0; i < size; i++) { if(displs[i]+recvcounts[i] > highest_displs+highest_recvcount) { 
highest_displs = displs[i]; highest_recvcount = recvcounts[i]; } } rtotal_buf = (highest_displs+highest_recvcount)*rdt_extent; rcbuf = MPIU_Malloc(rtotal_buf); if(sendbuf == MPI_IN_PLACE) { cudaError_t cudaerr = CudaMemcpy(rcbuf, recvbuf, rtotal_buf, cudaMemcpyDeviceToHost); if (cudaSuccess != cudaerr) fprintf(stderr, "cudaMemcpy failed: %s\n", CudaGetErrorString(cudaerr)); } else memset(rcbuf, 0, rtotal_buf); } else rcbuf = recvbuf; int cuda_res = MPIR_Allgatherv(scbuf, sendcount, sendtype, rcbuf, recvcounts, displs, recvtype, comm_ptr, mpierrno); if(is_send_dev_buf)MPIU_Free(scbuf); if(is_recv_dev_buf) { cudaError_t cudaerr = CudaMemcpy(recvbuf, rcbuf, rtotal_buf, cudaMemcpyHostToDevice); if (cudaSuccess != cudaerr) fprintf(stderr, "cudaMemcpy failed: %s\n", CudaGetErrorString(cudaerr)); MPIU_Free(rcbuf); } return cuda_res; } else #endif return MPIR_Allgatherv(sendbuf, sendcount, sendtype, recvbuf, recvcounts, displs, recvtype, comm_ptr, mpierrno); } MPIDI_Datatype_get_info(1, recvtype, config[MPID_RECV_CONTIG], recv_size, dt_null, recv_true_lb); if(sendbuf == MPI_IN_PLACE) { sbuf = PAMI_IN_PLACE; if(unlikely(verbose)) fprintf(stderr,"allgatherv MPI_IN_PLACE buffering\n"); stype = rtype; scount = recvcounts[rank]; send_size = recv_size * scount; } else { MPIDI_Datatype_get_info(sendcount, sendtype, config[MPID_SEND_CONTIG], send_size, dt_null, send_true_lb); sbuf = (char *)sendbuf+send_true_lb; } rbuf = (char *)recvbuf+recv_true_lb; if(use_alltoall || use_bcast || use_tree_reduce) { if (displs[0]) config[MPID_RECV_CONTINUOUS] = 0; for (i = 1; i < size; i++) { buffer_sum += recvcounts[i - 1]; if (buffer_sum != displs[i]) { config[MPID_RECV_CONTINUOUS] = 0; break; } } buffer_sum += recvcounts[size - 1]; buffer_sum *= recv_size; /* disable with "safe allgatherv" env var */ if(mpid->preallreduces[MPID_ALLGATHERV_PREALLREDUCE]) { MPIDI_Post_coll_t allred_post; MPIDI_Context_post(MPIDI_Context[0], &allred_post.state, MPIDI_Pami_post_wrapper, (void *)&allred); MPID_PROGRESS_WAIT_WHILE(allred_active); } use_tree_reduce = mpid->allgathervs[0] && config[MPID_RECV_CONTIG] && config[MPID_SEND_CONTIG] && config[MPID_RECV_CONTINUOUS] && buffer_sum % sizeof(unsigned) == 0; use_alltoall = mpid->allgathervs[2] && config[MPID_RECV_CONTIG] && config[MPID_SEND_CONTIG]; use_bcast = mpid->allgathervs[1]; } if(use_pami) { pami_xfer_t allgatherv; allgatherv.cb_done = allgatherv_cb_done; allgatherv.cookie = (void *)&allgatherv_active; if(selected_type == MPID_COLL_OPTIMIZED) { if((mpid->cutoff_size[PAMI_XFER_ALLGATHERV_INT][0] == 0) || (mpid->cutoff_size[PAMI_XFER_ALLGATHERV_INT][0] > 0 && mpid->cutoff_size[PAMI_XFER_ALLGATHERV_INT][0] >= send_size)) { allgatherv.algorithm = mpid->opt_protocol[PAMI_XFER_ALLGATHERV_INT][0]; my_md = &mpid->opt_protocol_md[PAMI_XFER_ALLGATHERV_INT][0]; queryreq = mpid->must_query[PAMI_XFER_ALLGATHERV_INT][0]; } else return MPIR_Allgatherv(sendbuf, sendcount, sendtype, recvbuf, recvcounts, displs, recvtype, comm_ptr, mpierrno); } else { allgatherv.algorithm = mpid->user_selected[PAMI_XFER_ALLGATHERV_INT]; my_md = &mpid->user_metadata[PAMI_XFER_ALLGATHERV_INT]; queryreq = selected_type; } allgatherv.cmd.xfer_allgatherv_int.sndbuf = sbuf; allgatherv.cmd.xfer_allgatherv_int.rcvbuf = rbuf; allgatherv.cmd.xfer_allgatherv_int.stype = stype; allgatherv.cmd.xfer_allgatherv_int.rtype = rtype; allgatherv.cmd.xfer_allgatherv_int.stypecount = scount; allgatherv.cmd.xfer_allgatherv_int.rtypecounts = (int *) recvcounts; allgatherv.cmd.xfer_allgatherv_int.rdispls = (int *) displs; if(unlikely 
(queryreq == MPID_COLL_ALWAYS_QUERY || queryreq == MPID_COLL_CHECK_FN_REQUIRED)) { metadata_result_t result = {0}; TRACE_ERR("Querying allgatherv_int protocol %s, type was %d\n", my_md->name, selected_type); if(my_md->check_fn == NULL) { /* process metadata bits */ if((!my_md->check_correct.values.inplace) && (sendbuf == MPI_IN_PLACE)) result.check.unspecified = 1; /* Can't check ranges like this. Non-local. Comment out for now. if(my_md->check_correct.values.rangeminmax) { MPI_Aint data_true_lb; MPID_Datatype *data_ptr; int data_size, data_contig; MPIDI_Datatype_get_info(sendcount, sendtype, data_contig, data_size, data_ptr, data_true_lb); if((my_md->range_lo <= data_size) && (my_md->range_hi >= data_size)) ; else { result.check.range = 1; if(unlikely(verbose)) { fprintf(stderr,"message size (%u) outside range (%zu<->%zu) for %s.\n", data_size, my_md->range_lo, my_md->range_hi, my_md->name); } } } */ } else /* calling the check fn is sufficient */ result = my_md->check_fn(&allgatherv); TRACE_ERR("Allgatherv bitmask: %#X\n", result.bitmask); result.check.nonlocal = 0; /* #warning REMOVE THIS WHEN IMPLEMENTED */ if(result.bitmask) { if(unlikely(verbose)) fprintf(stderr,"Query failed for %s. Using MPICH allgatherv.\n", my_md->name); MPIDI_Update_last_algorithm(comm_ptr, "ALLGATHERV_MPICH"); return MPIR_Allgatherv(sendbuf, sendcount, sendtype, recvbuf, recvcounts, displs, recvtype, comm_ptr, mpierrno); } if(my_md->check_correct.values.asyncflowctl && !(--(comm_ptr->mpid.num_requests))) { comm_ptr->mpid.num_requests = MPIDI_Process.optimized.num_requests; int tmpmpierrno; if(unlikely(verbose)) fprintf(stderr,"Query barrier required for %s\n", my_md->name); MPIDO_Barrier(comm_ptr, &tmpmpierrno); } } if(unlikely(verbose)) { unsigned long long int threadID; MPIU_Thread_id_t tid; MPIU_Thread_self(&tid); threadID = (unsigned long long int)tid; fprintf(stderr,"<%llx> Using protocol %s for allgatherv on %u\n", threadID, my_md->name, (unsigned) comm_ptr->context_id); } MPIDI_Post_coll_t allgatherv_post; MPIDI_Context_post(MPIDI_Context[0], &allgatherv_post.state, MPIDI_Pami_post_wrapper, (void *)&allgatherv); MPIDI_Update_last_algorithm(comm_ptr, my_md->name); TRACE_ERR("Rank %d waiting on active %d\n", rank, allgatherv_active); MPID_PROGRESS_WAIT_WHILE(allgatherv_active); return PAMI_SUCCESS; } /* TODO: These need to be ordered by speed */ if(use_tree_reduce) { if(unlikely(verbose)) fprintf(stderr,"Using tree reduce allgatherv type %u.\n", selected_type); rc = MPIDO_Allgatherv_allreduce(sendbuf, sendcount, sendtype, recvbuf, recvcounts, buffer_sum, displs, recvtype, send_true_lb, recv_true_lb, send_size, recv_size, comm_ptr, mpierrno); MPIDI_Update_last_algorithm(comm_ptr, "ALLGATHERV_OPT_ALLREDUCE"); return rc; } if(use_bcast) { if(unlikely(verbose)) fprintf(stderr,"Using bcast allgatherv type %u.\n", selected_type); rc = MPIDO_Allgatherv_bcast(sendbuf, sendcount, sendtype, recvbuf, recvcounts, buffer_sum, displs, recvtype, send_true_lb, recv_true_lb, send_size, recv_size, comm_ptr, mpierrno); MPIDI_Update_last_algorithm(comm_ptr, "ALLGATHERV_OPT_BCAST"); return rc; } if(use_alltoall) { if(unlikely(verbose)) fprintf(stderr,"Using alltoall allgatherv type %u.\n", selected_type); rc = MPIDO_Allgatherv_alltoall(sendbuf, sendcount, sendtype, recvbuf, (int *)recvcounts, buffer_sum, displs, recvtype, send_true_lb, recv_true_lb, send_size, recv_size, comm_ptr, mpierrno); MPIDI_Update_last_algorithm(comm_ptr, "ALLGATHERV_OPT_ALLTOALL"); return rc; } if(unlikely(verbose)) fprintf(stderr,"Using MPICH 
allgatherv type %u.\n", selected_type); TRACE_ERR("Using MPICH for Allgatherv\n"); MPIDI_Update_last_algorithm(comm_ptr, "ALLGATHERV_MPICH"); return MPIR_Allgatherv(sendbuf, sendcount, sendtype, recvbuf, recvcounts, displs, recvtype, comm_ptr, mpierrno); }
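/*
 * The MPID_RECV_CONTINUOUS probe in MPIDO_Allgatherv above boils down to one
 * question: do (recvcounts[], displs[]) describe a single dense buffer, i.e.
 * is displs[i] exactly the running sum of recvcounts[0..i-1]?  Only then can
 * the glue protocols treat the receive side as contiguous bytes.  A
 * compiled-out restatement of that check; recv_layout_is_continuous() is a
 * hypothetical name:
 */
#if 0
static int recv_layout_is_continuous(const int *recvcounts, const int *displs,
                                     int size)
{
  int expected = 0;
  int i;
  for (i = 0; i < size; i++) {
    if (displs[i] != expected)
      return 0;                /* hole, overlap, or permutation */
    expected += recvcounts[i];
  }
  return 1;
}
#endif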