int MPIDO_Reduce_binom(void * sendbuf, void * recvbuf, int count,
                       DCMF_Dt dcmf_dt, DCMF_Op dcmf_op,
                       MPI_Datatype data_type, int root, MPID_Comm * comm)
{
  int rc, hw_root = comm->vcr[root];
  DCMF_CollectiveRequest_t request;
  volatile unsigned active = 1;
  DCMF_Callback_t callback = { reduce_cb_done, (void *) &active };
  DCMF_Geometry_t * geometry = &(comm->dcmf.geometry);

  rc = DCMF_Reduce(&MPIDI_CollectiveProtocols.binomial_reduce,
                   &request, callback, DCMF_MATCH_CONSISTENCY, geometry,
                   hw_root, sendbuf, recvbuf, count, dcmf_dt, dcmf_op);

  MPID_PROGRESS_WAIT_WHILE(active);
  return rc;
}
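/* The blocking wrappers in this file share one completion idiom: a
 * stack-allocated `volatile unsigned active = 1` is handed to the messaging
 * layer as the callback cookie, the matching done-callback clears it, and
 * MPID_PROGRESS_WAIT_WHILE(active) spins the progress engine until then.
 * The sketch below shows the shape of such a done-callback.  It is
 * illustrative only: the real reduce_cb_done/barrier_cb_done live elsewhere
 * in this tree, and the DCMF_Error_t parameter is an assumption about the
 * DCMF callback prototype, not a quote of it.
 */
static void coll_cb_done_sketch(void *clientdata, DCMF_Error_t *error)
{
  volatile unsigned *active = (volatile unsigned *) clientdata;
  *active = 0;  /* releases the caller's MPID_PROGRESS_WAIT_WHILE(active) */
}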
int MPIDO_Reduce_global_tree(void * sendbuf, void * recvbuf, int count,
                             DCMF_Dt dcmf_dt, DCMF_Op dcmf_op,
                             MPI_Datatype data_type, int root, MPID_Comm * comm)
{
  int rc, hw_root = comm->vcr[root];
  DCMF_CollectiveRequest_t request;
  volatile unsigned active = 1;
  DCMF_Callback_t callback = { reduce_cb_done, (void *) &active };

  rc = DCMF_GlobalAllreduce(&MPIDI_Protocols.globalallreduce,
                            (DCMF_Request_t *) &request, callback,
                            DCMF_MATCH_CONSISTENCY, hw_root,
                            sendbuf, recvbuf, count, dcmf_dt, dcmf_op);

  MPID_PROGRESS_WAIT_WHILE(active);
  return rc;
}
int MPIDO_Barrier_gi(MPID_Comm * comm)
{
  int rc;
  MPID_Comm * comm_world;
  MPID_Comm_get_ptr(MPI_COMM_WORLD, comm_world);
  DCMF_Callback_t callback = { barrier_cb_done,
                               (void *) &mpid_globalbarrier_active };

  /* initialize global active field */
  mpid_globalbarrier_active = 1;

  if (mpid_globalbarrier_restart)
    rc = DCMF_Restart(&mpid_globalbarrier_request);
  else
  {
    mpid_globalbarrier_restart = 1;
    rc = DCMF_GlobalBarrier(&MPIDI_Protocols.globalbarrier,
                            &mpid_globalbarrier_request, callback);
  }

  if (rc == DCMF_SUCCESS)
    MPID_PROGRESS_WAIT_WHILE(*(int *) callback.clientdata);

  return rc;
}
int MPID_Win_fence(int assert, MPID_Win *win)
{
  int mpi_errno = MPI_SUCCESS;
  MPIR_Errflag_t errflag = MPIR_ERR_NONE;
  struct MPIDI_Win_sync* sync = &win->mpid.sync;

  /* Wait until every RMA operation issued in this epoch has completed. */
  MPID_PROGRESS_WAIT_WHILE(sync->total != sync->complete);
  sync->total    = 0;
  sync->started  = 0;
  sync->complete = 0;

  /* The collective error flag is distinct from the returned error code. */
  mpi_errno = MPIR_Barrier_impl(win->comm_ptr, &errflag);
  return mpi_errno;
}
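/* Standalone usage sketch (public MPI API only; names are illustrative):
 * the fence above waits until every RMA operation issued in the epoch has
 * completed (sync->total == sync->complete), resets the counters, and then
 * barriers.  From the application side that corresponds to the familiar
 * fence-bounded access epoch:
 */
#include <mpi.h>

static void fence_epoch_example(MPI_Comm comm)
{
  int rank, size, peer;
  int local = 0;
  MPI_Win win;

  MPI_Comm_rank(comm, &rank);
  MPI_Comm_size(comm, &size);
  peer = (rank + 1) % size;

  MPI_Win_create(&local, sizeof(int), sizeof(int), MPI_INFO_NULL, comm, &win);

  MPI_Win_fence(0, win);                                 /* open the epoch  */
  MPI_Put(&rank, 1, MPI_INT, peer, 0, 1, MPI_INT, win);  /* RMA traffic     */
  MPI_Win_fence(0, win);                                 /* all puts done   */

  MPI_Win_free(&win);
}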
int MPID_Probe(int source, int tag, MPID_Comm * comm, int context_offset,
               MPI_Status * status)
{
  const int context = comm->recvcontext_id + context_offset;

  if (source == MPI_PROC_NULL)
  {
    MPIR_Status_set_procnull(status);
    return MPI_SUCCESS;
  }

  MPID_PROGRESS_WAIT_WHILE(!MPIDI_Recvq_FU_r(source, tag, context, status));
  return MPI_SUCCESS;
}
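/* Standalone usage sketch (public MPI API only; names are illustrative):
 * MPID_Probe above spins the progress engine until a matching envelope is
 * queued, then fills in the status.  The classic pattern that relies on it
 * is probe-then-receive, with the size taken from the status:
 */
#include <mpi.h>
#include <stdlib.h>

static void probe_then_recv_example(int source, int tag, MPI_Comm comm)
{
  MPI_Status status;
  int count;

  MPI_Probe(source, tag, comm, &status);    /* blocks until a message is matched */
  MPI_Get_count(&status, MPI_INT, &count);  /* size of the pending message       */

  int *buf = malloc(count * sizeof(int));
  MPI_Recv(buf, count, MPI_INT, status.MPI_SOURCE, status.MPI_TAG,
           comm, MPI_STATUS_IGNORE);
  free(buf);
}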
int MPIDO_Alltoallw_torus(void *sendbuf, int * sendcounts, int * senddispls,
                          MPI_Datatype * sendtypes, void * recvbuf,
                          int * recvcounts, int * recvdispls,
                          MPI_Datatype *recvtypes, MPID_Comm * comm)
{
  int rc;
  DCMF_CollectiveRequest_t request;
  volatile unsigned active = 1;
  DCMF_Callback_t callback = { alltoallw_cb_done, (void *) &active };
  DCMF_Geometry_t * geometry = &(comm->dcmf.geometry);

  /* Ignore some of the arguments passed in; use the ones already set up
     in the comm pointer instead. */
  unsigned * sndlen      = comm->dcmf.sndlen;
  unsigned * sdispls     = comm->dcmf.sdispls;
  unsigned * rcvlen      = comm->dcmf.rcvlen;
  unsigned * rdispls     = comm->dcmf.rdispls;
  unsigned * sndcounters = comm->dcmf.sndcounters;
  unsigned * rcvcounters = comm->dcmf.rcvcounters;

  rc = DCMF_Alltoallv(&MPIDI_CollectiveProtocols.torus_alltoallv,
                      &request, callback, DCMF_MATCH_CONSISTENCY, geometry,
                      sendbuf, sndlen, sdispls,
                      recvbuf, rcvlen, rdispls,
                      sndcounters, rcvcounters);

  MPID_PROGRESS_WAIT_WHILE(active);
  return rc;
}
int MPIDO_Barrier_dcmf(MPID_Comm * comm)
{
  int rc;
  /* use local (thread safe) active field */
  volatile unsigned active;
  DCMF_Callback_t callback = { barrier_cb_done, (void *) &active };

  /* initialize local (thread safe) active field */
  active = 1;

  /* geometry sets up the proper barrier for this geometry at init time */
  rc = DCMF_Barrier(&comm->dcmf.geometry, callback, DCMF_MATCH_CONSISTENCY);

  if (rc == DCMF_SUCCESS)
    MPID_PROGRESS_WAIT_WHILE(*(int *) callback.clientdata);

  return rc;
}
int MPIDO_Gatherv(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
                  void *recvbuf, const int *recvcounts, const int *displs,
                  MPI_Datatype recvtype, int root, MPID_Comm * comm_ptr,
                  int *mpierrno)
{
#ifndef HAVE_PAMI_IN_PLACE
  if (sendbuf == MPI_IN_PLACE)
  {
    MPID_Abort(NULL, 0, 1, "'MPI_IN_PLACE' requires support for `PAMI_IN_PLACE`");
    return -1;
  }
#endif
  TRACE_ERR("Entering MPIDO_Gatherv\n");
  int i;
  int contig ATTRIBUTE((unused)), rsize ATTRIBUTE((unused)), ssize ATTRIBUTE((unused));
  int pamidt = 1;
  MPID_Datatype *dt_ptr = NULL;
  MPI_Aint send_true_lb, recv_true_lb;
  char *sbuf, *rbuf;
  pami_type_t stype, rtype;
  int tmp;
  volatile unsigned gatherv_active = 1;
  const int rank = comm_ptr->rank;
  const int size = comm_ptr->local_size;
#if ASSERT_LEVEL==0
  /* We can't afford the tracing in ndebug/performance libraries */
  const unsigned verbose = 0;
#else
  const unsigned verbose = (MPIDI_Process.verbose >= MPIDI_VERBOSE_DETAILS_ALL) && (rank == 0);
#endif
  const struct MPIDI_Comm* const mpid = &(comm_ptr->mpid);
  const int selected_type = mpid->user_selected_type[PAMI_XFER_GATHERV_INT];

  /* Check for native PAMI types and MPI_IN_PLACE on sendbuf */
  /* MPI_IN_PLACE is a nonlocal decision.  We will need a preallreduce if we
   * ever have multiple "good" gathervs that work on different counts, for
   * example. */
  if((sendbuf != MPI_IN_PLACE) &&
     (MPIDI_Datatype_to_pami(sendtype, &stype, -1, NULL, &tmp) != MPI_SUCCESS))
    pamidt = 0;
  if(MPIDI_Datatype_to_pami(recvtype, &rtype, -1, NULL, &tmp) != MPI_SUCCESS)
    pamidt = 0;

  if(pamidt == 0 || selected_type == MPID_COLL_USE_MPICH)
  {
    if(unlikely(verbose))
      fprintf(stderr,"Using MPICH gatherv algorithm\n");
    TRACE_ERR("GATHERV using MPICH\n");
    MPIDI_Update_last_algorithm(comm_ptr, "GATHERV_MPICH");
#if CUDA_AWARE_SUPPORT
    if(MPIDI_Process.cuda_aware_support_on)
    {
      MPI_Aint sdt_extent, rdt_extent;
      MPID_Datatype_get_extent_macro(sendtype, sdt_extent);
      MPID_Datatype_get_extent_macro(recvtype, rdt_extent);
      char *scbuf = NULL;
      char *rcbuf = NULL;
      int is_send_dev_buf = MPIDI_cuda_is_device_buf(sendbuf);
      int is_recv_dev_buf = (rank == root) ? MPIDI_cuda_is_device_buf(recvbuf) : 0;
      if(is_send_dev_buf)
      {
        scbuf = MPL_malloc(sdt_extent * sendcount);
        cudaError_t cudaerr = CudaMemcpy(scbuf, sendbuf, sdt_extent * sendcount, cudaMemcpyDeviceToHost);
        if (cudaSuccess != cudaerr)
          fprintf(stderr, "cudaMemcpy failed: %s\n", CudaGetErrorString(cudaerr));
      }
      else
        scbuf = (char *) sendbuf;

      size_t rtotal_buf;
      if(is_recv_dev_buf)
      {
        /* Since displs can be noncontiguous, we need to calculate the max buffer size */
        int highest_displs = displs[size - 1];
        int highest_recvcount = recvcounts[size - 1];
        for(i = 0; i < size; i++)
        {
          if(displs[i]+recvcounts[i] > highest_displs+highest_recvcount)
          {
            highest_displs = displs[i];
            highest_recvcount = recvcounts[i];
          }
        }
        rtotal_buf = (highest_displs+highest_recvcount)*rdt_extent;
        rcbuf = MPL_malloc(rtotal_buf);
        if(sendbuf == MPI_IN_PLACE)
        {
          cudaError_t cudaerr = CudaMemcpy(rcbuf, recvbuf, rtotal_buf, cudaMemcpyDeviceToHost);
          if (cudaSuccess != cudaerr)
            fprintf(stderr, "cudaMemcpy failed: %s\n", CudaGetErrorString(cudaerr));
        }
        else
          memset(rcbuf, 0, rtotal_buf);
      }
      else
        rcbuf = recvbuf;

      int cuda_res = MPIR_Gatherv(scbuf, sendcount, sendtype,
                                  rcbuf, recvcounts, displs, recvtype,
                                  root, comm_ptr, mpierrno);
      if(is_send_dev_buf) MPL_free(scbuf);
      if(is_recv_dev_buf)
      {
        cudaError_t cudaerr = CudaMemcpy(recvbuf, rcbuf, rtotal_buf, cudaMemcpyHostToDevice);
        if (cudaSuccess != cudaerr)
          fprintf(stderr, "cudaMemcpy failed: %s\n", CudaGetErrorString(cudaerr));
        MPL_free(rcbuf);
      }
      return cuda_res;
    }
    else
#endif
    return MPIR_Gatherv(sendbuf, sendcount, sendtype,
                        recvbuf, recvcounts, displs, recvtype,
                        root, comm_ptr, mpierrno);
  }

  MPIDI_Datatype_get_info(1, recvtype, contig, rsize, dt_ptr, recv_true_lb);
  rbuf = (char *)recvbuf + recv_true_lb;
  sbuf = (char *) sendbuf;

  pami_xfer_t gatherv;
  gatherv.cb_done = cb_gatherv;
  gatherv.cookie = (void *)&gatherv_active;
  gatherv.cmd.xfer_gatherv_int.root = MPIDI_Task_to_endpoint(MPID_VCR_GET_LPID(comm_ptr->vcr, root), 0);
  gatherv.cmd.xfer_gatherv_int.rcvbuf = rbuf;
  gatherv.cmd.xfer_gatherv_int.rtype = rtype;
  gatherv.cmd.xfer_gatherv_int.rtypecounts = (int *) recvcounts;
  gatherv.cmd.xfer_gatherv_int.rdispls = (int *) displs;
  gatherv.cmd.xfer_gatherv_int.sndbuf = NULL;
  gatherv.cmd.xfer_gatherv_int.stype = stype;
  gatherv.cmd.xfer_gatherv_int.stypecount = sendcount;

  if(rank == root)
  {
    if(sendbuf == MPI_IN_PLACE)
    {
      if(unlikely(verbose))
        fprintf(stderr,"gatherv MPI_IN_PLACE buffering\n");
      sbuf = PAMI_IN_PLACE;
      gatherv.cmd.xfer_gatherv_int.stype = rtype;
      gatherv.cmd.xfer_gatherv_int.stypecount = recvcounts[rank];
    }
    else
    {
      MPIDI_Datatype_get_info(1, sendtype, contig, ssize, dt_ptr, send_true_lb);
      sbuf = (char *)sbuf + send_true_lb;
    }
  }
  gatherv.cmd.xfer_gatherv_int.sndbuf = sbuf;

  pami_algorithm_t my_gatherv;
  const pami_metadata_t *my_md = (pami_metadata_t *)NULL;
  int queryreq = 0;

  if(selected_type == MPID_COLL_OPTIMIZED)
  {
    TRACE_ERR("Optimized gatherv %s was selected\n",
              mpid->opt_protocol_md[PAMI_XFER_GATHERV_INT][0].name);
    my_gatherv = mpid->opt_protocol[PAMI_XFER_GATHERV_INT][0];
    my_md = &mpid->opt_protocol_md[PAMI_XFER_GATHERV_INT][0];
    queryreq = mpid->must_query[PAMI_XFER_GATHERV_INT][0];
  }
  else
  {
    TRACE_ERR("Optimized gatherv %s was set by user\n",
              mpid->user_metadata[PAMI_XFER_GATHERV_INT].name);
    my_gatherv = mpid->user_selected[PAMI_XFER_GATHERV_INT];
    my_md = &mpid->user_metadata[PAMI_XFER_GATHERV_INT];
    queryreq = selected_type;
  }

  gatherv.algorithm = my_gatherv;

  if(unlikely(queryreq == MPID_COLL_ALWAYS_QUERY || queryreq == MPID_COLL_CHECK_FN_REQUIRED))
  {
    metadata_result_t result = {0};
    TRACE_ERR("querying gatherv protocol %s, type was %d\n", my_md->name, queryreq);
    if(my_md->check_fn == NULL)
    {
      /* process metadata bits */
      if((!my_md->check_correct.values.inplace) && (sendbuf == MPI_IN_PLACE))
        result.check.unspecified = 1;
      /* Can't check ranges like this.  Non-local.  Comment out for now.
      if(my_md->check_correct.values.rangeminmax)
      {
        MPI_Aint data_true_lb;
        MPID_Datatype *data_ptr;
        int data_size, data_contig;
        MPIDI_Datatype_get_info(sendcount, sendtype, data_contig, data_size, data_ptr, data_true_lb);
        if((my_md->range_lo <= data_size) && (my_md->range_hi >= data_size))
          ;
        else
        {
          result.check.range = 1;
          if(unlikely(verbose))
          {
            fprintf(stderr,"message size (%u) outside range (%zu<->%zu) for %s.\n",
                    data_size, my_md->range_lo, my_md->range_hi, my_md->name);
          }
        }
      }
      */
    }
    else /* calling the check fn is sufficient */
      result = my_md->check_fn(&gatherv);

    TRACE_ERR("bitmask: %#X\n", result.bitmask);
    result.check.nonlocal = 0; /* #warning REMOVE THIS WHEN IMPLEMENTED */
    if(result.bitmask)
    {
      if(unlikely(verbose))
        fprintf(stderr,"Query failed for %s. Using MPICH gatherv.\n", my_md->name);
      MPIDI_Update_last_algorithm(comm_ptr, "GATHERV_MPICH");
      return MPIR_Gatherv(sendbuf, sendcount, sendtype,
                          recvbuf, recvcounts, displs, recvtype,
                          root, comm_ptr, mpierrno);
    }
    if(my_md->check_correct.values.asyncflowctl && !(--(comm_ptr->mpid.num_requests)))
    {
      comm_ptr->mpid.num_requests = MPIDI_Process.optimized.num_requests;
      int tmpmpierrno;
      if(unlikely(verbose))
        fprintf(stderr,"Query barrier required for %s\n", my_md->name);
      MPIDO_Barrier(comm_ptr, &tmpmpierrno);
    }
  }

  MPIDI_Update_last_algorithm(comm_ptr, my_md->name);

  if(unlikely(verbose))
  {
    unsigned long long int threadID;
    MPL_thread_id_t tid;
    MPL_thread_self(&tid);
    threadID = (unsigned long long int)tid;
    fprintf(stderr,"<%llx> Using protocol %s for gatherv on %u\n",
            threadID, my_md->name, (unsigned) comm_ptr->context_id);
  }

  MPIDI_Post_coll_t gatherv_post;
  MPIDI_Context_post(MPIDI_Context[0], &gatherv_post.state,
                     MPIDI_Pami_post_wrapper, (void *)&gatherv);

  TRACE_ERR("Waiting on active %d\n", gatherv_active);
  MPID_PROGRESS_WAIT_WHILE(gatherv_active);
  TRACE_ERR("Leaving MPIDO_Gatherv\n");
  return 0;
}
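/* Standalone usage sketch (public MPI API only; names are illustrative):
 * the PAMI gatherv above consumes per-rank recvcounts/displs arrays exactly
 * as MPI_Gatherv does.  The sketch builds the canonical packed layout where
 * rank r contributes r+1 integers:
 */
#include <mpi.h>
#include <stdlib.h>

static void gatherv_counts_example(MPI_Comm comm)
{
  int rank, size, i;
  MPI_Comm_rank(comm, &rank);
  MPI_Comm_size(comm, &size);

  int sendcount = rank + 1;
  int *sendbuf = malloc(sendcount * sizeof(int));
  for (i = 0; i < sendcount; i++) sendbuf[i] = rank;

  int *recvcounts = NULL, *displs = NULL, *recvbuf = NULL;
  if (rank == 0)
  {
    recvcounts = malloc(size * sizeof(int));
    displs     = malloc(size * sizeof(int));
    int total = 0;
    for (i = 0; i < size; i++)
    {
      recvcounts[i] = i + 1;
      displs[i]     = total;   /* packed: each block starts where the last ended */
      total        += recvcounts[i];
    }
    recvbuf = malloc(total * sizeof(int));
  }

  MPI_Gatherv(sendbuf, sendcount, MPI_INT,
              recvbuf, recvcounts, displs, MPI_INT, 0, comm);

  free(sendbuf); free(recvcounts); free(displs); free(recvbuf);
}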
int MPIDO_Gatherv_simple(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
                         void *recvbuf, const int *recvcounts, const int *displs,
                         MPI_Datatype recvtype, int root, MPID_Comm * comm_ptr,
                         int *mpierrno)
{
#ifndef HAVE_PAMI_IN_PLACE
  if (sendbuf == MPI_IN_PLACE)
  {
    MPID_Abort(NULL, 0, 1, "'MPI_IN_PLACE' requires support for `PAMI_IN_PLACE`");
    return -1;
  }
#endif
  TRACE_ERR("Entering MPIDO_Gatherv_optimized\n");
  int snd_contig = 1, rcv_contig = 1;
  void *snd_noncontig_buff = NULL, *rcv_noncontig_buff = NULL;
  void *sbuf = NULL, *rbuf = NULL;
  int *rcounts = NULL;
  int *rdispls = NULL;
  int send_size = 0;
  int recv_size = 0;
  int rcvlen = 0;
  int totalrecvcount = 0;
  pami_type_t rtype = PAMI_TYPE_NULL;
  MPID_Segment segment;
  MPID_Datatype *data_ptr = NULL;
  MPI_Aint send_true_lb = 0, recv_true_lb = 0;
  int i, tmp;
  volatile unsigned gatherv_active = 1;
  const int rank = comm_ptr->rank;
  const int size = comm_ptr->local_size;
#if ASSERT_LEVEL==0
  /* We can't afford the tracing in ndebug/performance libraries */
  const unsigned verbose = 0;
#else
  const unsigned verbose = (MPIDI_Process.verbose >= MPIDI_VERBOSE_DETAILS_ALL) && (rank == 0);
#endif
  const struct MPIDI_Comm* const mpid = &(comm_ptr->mpid);
  int recvok = PAMI_SUCCESS, recvcontinuous = 0;

  if(sendbuf != MPI_IN_PLACE)
  {
    MPIDI_Datatype_get_info(sendcount, sendtype, snd_contig, send_size, data_ptr, send_true_lb);
    if(MPIDI_Pamix_collsel_advise != NULL && mpid->collsel_fast_query != NULL)
    {
      advisor_algorithm_t advisor_algorithms[1];
      int num_algorithms = MPIDI_Pamix_collsel_advise(mpid->collsel_fast_query,
                             PAMI_XFER_GATHERV_INT, 64, advisor_algorithms, 1);
      if(num_algorithms)
      {
        if(advisor_algorithms[0].algorithm_type == COLLSEL_EXTERNAL_ALGO)
        {
          return MPIR_Gatherv(sendbuf, sendcount, sendtype,
                              recvbuf, recvcounts, displs, recvtype,
                              root, comm_ptr, mpierrno);
        }
        else if(advisor_algorithms[0].metadata &&
                advisor_algorithms[0].metadata->check_correct.values.asyncflowctl &&
                !(--(comm_ptr->mpid.num_requests)))
        {
          comm_ptr->mpid.num_requests = MPIDI_Process.optimized.num_requests;
          int tmpmpierrno;
          if(unlikely(verbose))
            fprintf(stderr,"Query barrier required for %s\n", advisor_algorithms[0].metadata->name);
          MPIDO_Barrier(comm_ptr, &tmpmpierrno);
        }
      }
    }
    sbuf = (char *)sendbuf + send_true_lb;
    if(!snd_contig)
    {
      snd_noncontig_buff = MPL_malloc(send_size);
      sbuf = snd_noncontig_buff;
      if(snd_noncontig_buff == NULL)
      {
        MPID_Abort(NULL, MPI_ERR_NO_SPACE, 1, "Fatal:  Cannot allocate pack buffer");
      }
      DLOOP_Offset last = send_size;
      MPID_Segment_init(sendbuf, sendcount, sendtype, &segment, 0);
      MPID_Segment_pack(&segment, 0, &last, snd_noncontig_buff);
    }
  }
  else
  {
    MPIDI_Datatype_get_info(1, recvtype, rcv_contig, rcvlen, data_ptr, recv_true_lb);
    if(MPIDI_Pamix_collsel_advise != NULL && mpid->collsel_fast_query != NULL)
    {
      advisor_algorithm_t advisor_algorithms[1];
      int num_algorithms = MPIDI_Pamix_collsel_advise(mpid->collsel_fast_query,
                             PAMI_XFER_GATHERV_INT, 64, advisor_algorithms, 1);
      if(num_algorithms)
      {
        if(advisor_algorithms[0].algorithm_type == COLLSEL_EXTERNAL_ALGO)
        {
          return MPIR_Gatherv(sendbuf, sendcount, sendtype,
                              recvbuf, recvcounts, displs, recvtype,
                              root, comm_ptr, mpierrno);
        }
        else if(advisor_algorithms[0].metadata &&
                advisor_algorithms[0].metadata->check_correct.values.asyncflowctl &&
                !(--(comm_ptr->mpid.num_requests)))
        {
          comm_ptr->mpid.num_requests = MPIDI_Process.optimized.num_requests;
          int tmpmpierrno;
          if(unlikely(verbose))
            fprintf(stderr,"Query barrier required for %s\n", advisor_algorithms[0].metadata->name);
          MPIDO_Barrier(comm_ptr, &tmpmpierrno);
        }
      }
    }
  }

  pami_xfer_t gatherv;
  rbuf = (char *)recvbuf + recv_true_lb;
  rcounts = (int*)recvcounts;
  rdispls = (int*)displs;

  if(rank == root)
  {
    if((recvok = MPIDI_Datatype_to_pami(recvtype, &rtype, -1, NULL, &tmp)) != MPI_SUCCESS)
    {
      MPIDI_Datatype_get_info(1, recvtype, rcv_contig, rcvlen, data_ptr, recv_true_lb);
      totalrecvcount = recvcounts[0];
      recvcontinuous = displs[0] == 0 ? 1 : 0;
      rcounts = (int*)MPL_malloc(size * sizeof(int));
      rdispls = (int*)MPL_malloc(size * sizeof(int));
      rdispls[0] = 0;
      rcounts[0] = rcvlen * recvcounts[0];
      for(i = 1; i < size; i++)
      {
        rdispls[i] = rcvlen * totalrecvcount;
        totalrecvcount += recvcounts[i];
        if(displs[i] != (displs[i-1] + recvcounts[i-1]))
          recvcontinuous = 0;
        rcounts[i] = rcvlen * recvcounts[i];
      }
      recv_size = rcvlen * totalrecvcount;
      rcv_noncontig_buff = MPL_malloc(recv_size);
      rbuf = rcv_noncontig_buff;
      rtype = PAMI_TYPE_BYTE;
      if(rcv_noncontig_buff == NULL)
      {
        MPID_Abort(NULL, MPI_ERR_NO_SPACE, 1, "Fatal:  Cannot allocate pack buffer");
      }
      if(sendbuf == MPI_IN_PLACE)
      {
        size_t extent;
        MPID_Datatype_get_extent_macro(recvtype, extent);
        MPIR_Localcopy((char *)recvbuf + displs[rank]*extent, recvcounts[rank], recvtype,
                       (char *)rcv_noncontig_buff + rdispls[rank], rcounts[rank], MPI_CHAR);
      }
    }
    if(sendbuf == MPI_IN_PLACE)
    {
      gatherv.cmd.xfer_gatherv_int.sndbuf = PAMI_IN_PLACE;
    }
    else
    {
      gatherv.cmd.xfer_gatherv_int.sndbuf = sbuf;
    }
    gatherv.cmd.xfer_gatherv_int.stype = PAMI_TYPE_BYTE; /* stype is ignored when sndbuf == PAMI_IN_PLACE */
    gatherv.cmd.xfer_gatherv_int.stypecount = send_size;
  }
  else
  {
    gatherv.cmd.xfer_gatherv_int.sndbuf = sbuf;
    gatherv.cmd.xfer_gatherv_int.stype = PAMI_TYPE_BYTE;
    gatherv.cmd.xfer_gatherv_int.stypecount = send_size;
  }

  gatherv.cb_done = cb_gatherv;
  gatherv.cookie = (void *)&gatherv_active;
  gatherv.cmd.xfer_gatherv_int.root = MPIDI_Task_to_endpoint(MPID_VCR_GET_LPID(comm_ptr->vcr, root), 0);
  gatherv.cmd.xfer_gatherv_int.rcvbuf = rbuf;
  gatherv.cmd.xfer_gatherv_int.rtype = rtype;
  gatherv.cmd.xfer_gatherv_int.rtypecounts = (int *) rcounts;
  gatherv.cmd.xfer_gatherv_int.rdispls = (int *) rdispls;

  const pami_metadata_t *my_gatherv_md;
  gatherv.algorithm = mpid->coll_algorithm[PAMI_XFER_GATHERV_INT][0][0];
  my_gatherv_md = &mpid->coll_metadata[PAMI_XFER_GATHERV_INT][0][0];
  MPIDI_Update_last_algorithm(comm_ptr, my_gatherv_md->name);

  MPIDI_Post_coll_t gatherv_post;
  TRACE_ERR("%s gatherv\n", MPIDI_Process.context_post.active>0?"Posting":"Invoking");
  MPIDI_Context_post(MPIDI_Context[0], &gatherv_post.state,
                     MPIDI_Pami_post_wrapper, (void *)&gatherv);
  TRACE_ERR("Gatherv %s\n", MPIDI_Process.context_post.active>0?"posted":"invoked");

  TRACE_ERR("Waiting on active %d\n", gatherv_active);
  MPID_PROGRESS_WAIT_WHILE(gatherv_active);

  /* Unpack only when a staging buffer was actually allocated above. */
  if(rcv_noncontig_buff != NULL)
  {
    if(recvcontinuous)
    {
      MPIR_Localcopy(rcv_noncontig_buff, recv_size, MPI_CHAR,
                     recvbuf, totalrecvcount, recvtype);
    }
    else
    {
      size_t extent;
      MPID_Datatype_get_extent_macro(recvtype, extent);
      for(i = 0; i < size; ++i)
      {
        char* scbuf = (char*)rcv_noncontig_buff + rdispls[i];
        char* rcbuf = (char*)recvbuf + displs[i]*extent;
        MPIR_Localcopy(scbuf, rcounts[i], MPI_CHAR, rcbuf, recvcounts[i], recvtype);
        TRACE_ERR("Pack recv src  extent %zu, displ[%zu]=%zu, count[%zu]=%zu buf[%zu]=%u\n",
                  (size_t)extent, (size_t)i, (size_t)rdispls[i], (size_t)i,
                  (size_t)rcounts[i], (size_t)rdispls[i], *(int*)scbuf);
        TRACE_ERR("Pack recv dest extent %zu, displ[%zu]=%zu, count[%zu]=%zu buf[%zu]=%u\n",
                  (size_t)extent, (size_t)i, (size_t)displs[i], (size_t)i,
                  (size_t)recvcounts[i], (size_t)displs[i], *(int*)rcbuf);
      }
    }
    MPL_free(rcv_noncontig_buff);
    if(rank == root)
    {
      MPL_free(rcounts);
      MPL_free(rdispls);
    }
  }
  if(!snd_contig)  MPL_free(snd_noncontig_buff);

  TRACE_ERR("Leaving MPIDO_Gatherv_optimized\n");
  return MPI_SUCCESS;
}
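/* Standalone sketch (public MPI API only; the column type is illustrative):
 * the MPID_Segment_init/MPID_Segment_pack pair used above flattens a
 * noncontiguous sendtype into a temporary byte buffer so PAMI can be given
 * a plain PAMI_TYPE_BYTE description.  MPI_Pack expresses the same staging
 * step at the user level:
 */
#include <mpi.h>
#include <stdlib.h>

static void pack_noncontig_example(MPI_Comm comm)
{
  double matrix[4][4];
  MPI_Datatype column;
  int i, j, packsize, position = 0;

  for (i = 0; i < 4; i++)
    for (j = 0; j < 4; j++)
      matrix[i][j] = i * 4 + j;

  /* one column of a 4x4 row-major matrix: 4 doubles with a stride of 4 */
  MPI_Type_vector(4, 1, 4, MPI_DOUBLE, &column);
  MPI_Type_commit(&column);

  MPI_Pack_size(1, column, comm, &packsize);
  char *packed = malloc(packsize);

  /* analogous to MPID_Segment_pack: copy into a contiguous staging buffer */
  MPI_Pack(&matrix[0][0], 1, column, packed, packsize, &position, comm);

  free(packed);
  MPI_Type_free(&column);
}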
int MPIDO_Scatterv_simple(const void *sendbuf, const int *sendcounts, const int *displs,
                          MPI_Datatype sendtype, void *recvbuf, int recvcount,
                          MPI_Datatype recvtype, int root, MPID_Comm *comm_ptr,
                          int *mpierrno)
{
#ifndef HAVE_PAMI_IN_PLACE
  if (sendbuf == MPI_IN_PLACE)
  {
    MPID_Abort(NULL, 0, 1, "'MPI_IN_PLACE' requires support for `PAMI_IN_PLACE`");
    return -1;
  }
#endif
  int snd_contig = 1;
  int rcv_contig = 1;
  int send_size = 0, recv_size = 0;
  int ssize = 0;
  MPID_Datatype *dt_ptr = NULL;
  MPI_Aint send_true_lb = 0, recv_true_lb = 0;
  void *snd_noncontig_buff = NULL, *rcv_noncontig_buff = NULL;
  void *sbuf = NULL, *rbuf = NULL;
  int *sdispls = NULL, *scounts = NULL;
  int sndcount = 0;
  MPID_Segment segment;
  int tmp, i;
  pami_type_t stype = PAMI_TYPE_NULL;
  const int rank = comm_ptr->rank;
  const int size = comm_ptr->local_size;
  const struct MPIDI_Comm* const mpid = &(comm_ptr->mpid);

  if (rank == root && sendtype != MPI_DATATYPE_NULL && sendcounts[0] >= 0)
  {
    MPIDI_Datatype_get_info(1, sendtype, snd_contig, ssize, dt_ptr, send_true_lb);
    if(MPIDI_Pamix_collsel_advise != NULL && mpid->collsel_fast_query != NULL)
    {
      advisor_algorithm_t advisor_algorithms[1];
      int num_algorithms = MPIDI_Pamix_collsel_advise(mpid->collsel_fast_query,
                             PAMI_XFER_SCATTERV_INT, 64, advisor_algorithms, 1);
      if(num_algorithms)
      {
        if(advisor_algorithms[0].algorithm_type == COLLSEL_EXTERNAL_ALGO)
        {
          return MPIR_Scatterv(sendbuf, sendcounts, displs, sendtype,
                               recvbuf, recvcount, recvtype,
                               root, comm_ptr, mpierrno);
        }
        else if(advisor_algorithms[0].metadata &&
                advisor_algorithms[0].metadata->check_correct.values.asyncflowctl &&
                !(--(comm_ptr->mpid.num_requests)))
        {
          comm_ptr->mpid.num_requests = MPIDI_Process.optimized.num_requests;
          int tmpmpierrno;
          MPIDO_Barrier(comm_ptr, &tmpmpierrno);
        }
      }
    }
  }

  if (recvtype != MPI_DATATYPE_NULL && recvcount >= 0)
  {
    MPIDI_Datatype_get_info(recvcount, recvtype, rcv_contig, recv_size, dt_ptr, recv_true_lb);
    if(MPIDI_Pamix_collsel_advise != NULL && mpid->collsel_fast_query != NULL)
    {
      advisor_algorithm_t advisor_algorithms[1];
      int num_algorithms = MPIDI_Pamix_collsel_advise(mpid->collsel_fast_query,
                             PAMI_XFER_SCATTERV_INT, 64, advisor_algorithms, 1);
      if(num_algorithms)
      {
        if(advisor_algorithms[0].algorithm_type == COLLSEL_EXTERNAL_ALGO)
        {
          return MPIR_Scatterv(sendbuf, sendcounts, displs, sendtype,
                               recvbuf, recvcount, recvtype,
                               root, comm_ptr, mpierrno);
        }
        else if(advisor_algorithms[0].metadata &&
                advisor_algorithms[0].metadata->check_correct.values.asyncflowctl &&
                !(--(comm_ptr->mpid.num_requests)))
        {
          comm_ptr->mpid.num_requests = MPIDI_Process.optimized.num_requests;
          int tmpmpierrno;
          MPIDO_Barrier(comm_ptr, &tmpmpierrno);
        }
      }
    }
  }

  pami_xfer_t scatterv;
  const pami_metadata_t *my_scatterv_md;
  volatile unsigned scatterv_active = 1;

  sbuf = (char *)sendbuf + send_true_lb;
  rbuf = (char *)recvbuf + recv_true_lb;
  scounts = (int*)sendcounts;
  sdispls = (int*)displs;

  if(rank == root)
  {
    if(MPIDI_Datatype_to_pami(sendtype, &stype, -1, NULL, &tmp) != MPI_SUCCESS)
    {
      if (!snd_contig)
      {
        scounts = (int*)MPIU_Malloc(size * sizeof(int));
        sdispls = (int*)MPIU_Malloc(size * sizeof(int));
        for(i = 0; i < size; i++)
        {
          scounts[i] = ssize * sendcounts[i];
          sdispls[i] = ssize * displs[i];
          send_size += scounts[i];
          sndcount  += sendcounts[i];
        }
        snd_noncontig_buff = MPIU_Malloc(send_size);
        sbuf = snd_noncontig_buff;
        stype = PAMI_TYPE_BYTE;
        if(snd_noncontig_buff == NULL)
        {
          MPID_Abort(NULL, MPI_ERR_NO_SPACE, 1, "Fatal:  Cannot allocate pack buffer");
        }
        DLOOP_Offset last = send_size;
        MPID_Segment_init(sendbuf, sndcount, sendtype, &segment, 0);
        MPID_Segment_pack(&segment, 0, &last, snd_noncontig_buff);
      }
    }
    if(recvbuf == MPI_IN_PLACE)
    {
      rbuf = PAMI_IN_PLACE;
    }
  }

  if(recvbuf != MPI_IN_PLACE)
  {
    if (!rcv_contig)
    {
      rcv_noncontig_buff = MPIU_Malloc(recv_size);
      rbuf = rcv_noncontig_buff;
      if(rcv_noncontig_buff == NULL)
      {
        MPID_Abort(NULL, MPI_ERR_NO_SPACE, 1, "Fatal:  Cannot allocate pack buffer");
      }
    }
  }

  scatterv.cb_done = cb_scatterv;
  scatterv.cookie = (void *)&scatterv_active;
  scatterv.cmd.xfer_scatterv_int.root = MPIDI_Task_to_endpoint(MPID_VCR_GET_LPID(comm_ptr->vcr, root), 0);
  scatterv.algorithm = mpid->coll_algorithm[PAMI_XFER_SCATTERV_INT][0][0];
  my_scatterv_md = &mpid->coll_metadata[PAMI_XFER_SCATTERV_INT][0][0];
  scatterv.cmd.xfer_scatterv_int.rcvbuf = rbuf;
  scatterv.cmd.xfer_scatterv_int.sndbuf = sbuf;
  scatterv.cmd.xfer_scatterv_int.stype = stype;
  scatterv.cmd.xfer_scatterv_int.rtype = PAMI_TYPE_BYTE; /* rtype is ignored when rcvbuf == PAMI_IN_PLACE */
  scatterv.cmd.xfer_scatterv_int.stypecounts = (int *) scounts;
  scatterv.cmd.xfer_scatterv_int.rtypecount = recv_size;
  scatterv.cmd.xfer_scatterv_int.sdispls = (int *) sdispls;

  MPIDI_Update_last_algorithm(comm_ptr, my_scatterv_md->name);

  MPIDI_Post_coll_t scatterv_post;
  TRACE_ERR("%s scatterv\n", MPIDI_Process.context_post.active>0?"Posting":"Invoking");
  MPIDI_Context_post(MPIDI_Context[0], &scatterv_post.state,
                     MPIDI_Pami_post_wrapper, (void *)&scatterv);
  TRACE_ERR("Waiting on active %d\n", scatterv_active);
  MPID_PROGRESS_WAIT_WHILE(scatterv_active);

  if(!rcv_contig && rcv_noncontig_buff != NULL)
  {
    MPIR_Localcopy(rcv_noncontig_buff, recv_size, MPI_CHAR,
                   recvbuf, recvcount, recvtype);
    MPIU_Free(rcv_noncontig_buff);
  }
  /* Free the staging arrays only when they were actually allocated above. */
  if(snd_noncontig_buff != NULL)
  {
    MPIU_Free(snd_noncontig_buff);
    MPIU_Free(scounts);
    MPIU_Free(sdispls);
  }

  TRACE_ERR("Leaving MPIDO_Scatterv_optimized\n");
  return MPI_SUCCESS;
}
int MPIDO_Barrier(MPID_Comm *comm_ptr, int *mpierrno)
{
  TRACE_ERR("Entering MPIDO_Barrier\n");
  volatile unsigned active = 1;
  MPIDI_Post_coll_t barrier_post;
  pami_xfer_t barrier;
  pami_algorithm_t my_barrier;
  pami_metadata_t *my_barrier_md;
  int queryreq = 0;

  if(comm_ptr->mpid.user_selected_type[PAMI_XFER_BARRIER] == MPID_COLL_USE_MPICH)
  {
    if(unlikely(MPIDI_Process.verbose >= MPIDI_VERBOSE_DETAILS_ALL && comm_ptr->rank == 0))
      fprintf(stderr,"Using MPICH barrier\n");
    TRACE_ERR("Using MPICH Barrier\n");
    return MPIR_Barrier(comm_ptr, mpierrno);
  }

  barrier.cb_done = cb_barrier;
  barrier.cookie = (void *)&active;
  if(comm_ptr->mpid.user_selected_type[PAMI_XFER_BARRIER] == MPID_COLL_OPTIMIZED)
  {
    TRACE_ERR("Optimized barrier (%s) was pre-selected\n",
              comm_ptr->mpid.opt_protocol_md[PAMI_XFER_BARRIER][0].name);
    my_barrier = comm_ptr->mpid.opt_protocol[PAMI_XFER_BARRIER][0];
    my_barrier_md = &comm_ptr->mpid.opt_protocol_md[PAMI_XFER_BARRIER][0];
    queryreq = comm_ptr->mpid.must_query[PAMI_XFER_BARRIER][0];
  }
  else
  {
    TRACE_ERR("Barrier (%s) was specified by user\n",
              comm_ptr->mpid.user_metadata[PAMI_XFER_BARRIER].name);
    my_barrier = comm_ptr->mpid.user_selected[PAMI_XFER_BARRIER];
    my_barrier_md = &comm_ptr->mpid.user_metadata[PAMI_XFER_BARRIER];
    queryreq = comm_ptr->mpid.user_selected_type[PAMI_XFER_BARRIER];
  }

  barrier.algorithm = my_barrier;
  /* There is no support for query-required barrier protocols here */
  MPID_assert_always(queryreq != MPID_COLL_ALWAYS_QUERY);
  MPID_assert_always(queryreq != MPID_COLL_CHECK_FN_REQUIRED);

  /* TODO: the name needs to be fixed somehow */
  MPIDI_Update_last_algorithm(comm_ptr, my_barrier_md->name);

  if(unlikely(MPIDI_Process.verbose >= MPIDI_VERBOSE_DETAILS_ALL && comm_ptr->rank == 0))
  {
    unsigned long long int threadID;
    MPIU_Thread_id_t tid;
    MPIU_Thread_self(&tid);
    threadID = (unsigned long long int)tid;
    fprintf(stderr,"<%llx> Using protocol %s for barrier on %u\n",
            threadID, my_barrier_md->name,
            /* comm_ptr->rank,comm_ptr->local_size,comm_ptr->remote_size, */
            (unsigned) comm_ptr->context_id);
  }

  TRACE_ERR("%s barrier\n", MPIDI_Process.context_post.active>0?"posting":"invoking");
  MPIDI_Context_post(MPIDI_Context[0], &barrier_post.state,
                     MPIDI_Pami_post_wrapper, (void *)&barrier);
  TRACE_ERR("barrier %s\n", MPIDI_Process.context_post.active>0?"posted":"invoked");

  TRACE_ERR("advance spinning\n");
  MPID_PROGRESS_WAIT_WHILE(active);
  TRACE_ERR("exiting mpido_barrier\n");
  return 0;
}
int MPIDO_Allgatherv_simple(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
                            void *recvbuf, const int *recvcounts, const int *displs,
                            MPI_Datatype recvtype, MPID_Comm * comm_ptr, int *mpierrno)
{
#ifndef HAVE_PAMI_IN_PLACE
  if (sendbuf == MPI_IN_PLACE)
  {
    MPID_Abort(NULL, 0, 1, "'MPI_IN_PLACE' requires support for `PAMI_IN_PLACE`");
    return -1;
  }
#endif
  TRACE_ERR("Entering MPIDO_Allgatherv_optimized\n");
  /* function pointer to be used to point to the appropriate algorithm */

  /* Check the nature of the buffers */
  MPID_Datatype *dt_null = NULL;
  MPI_Aint send_true_lb = 0;
  MPI_Aint recv_true_lb = 0;
  size_t send_size = 0;
  size_t recv_size = 0;
  size_t rcvtypelen = 0;
  int snd_data_contig = 0, rcv_data_contig = 0;
  void *snd_noncontig_buff = NULL, *rcv_noncontig_buff = NULL;
  int scount = sendcount;
  char *sbuf, *rbuf;
  pami_type_t stype = PAMI_TYPE_NULL, rtype;
  const int rank = comm_ptr->rank;
  const int size = comm_ptr->local_size;
  const struct MPIDI_Comm* const mpid = &(comm_ptr->mpid);
#if ASSERT_LEVEL==0
  /* We can't afford the tracing in ndebug/performance libraries */
  const unsigned verbose = 0;
#else
  const unsigned verbose = (MPIDI_Process.verbose >= MPIDI_VERBOSE_DETAILS_ALL) && (rank == 0);
#endif
  int recvcontinuous = 0;
  size_t totalrecvcount = 0;
  int *lrecvdispls = NULL; /* local displs calculated for the noncontiguous case */
  int *lrecvcounts = NULL; /* local counts calculated for the noncontiguous case */
  const int *precvdispls = displs;     /* pointer to displs to use as the pami param */
  const int *precvcounts = recvcounts; /* pointer to counts to use as the pami param */
  int inplace = sendbuf == MPI_IN_PLACE ? 1 : 0;
  volatile unsigned allgatherv_active = 1;
  int recvok = PAMI_SUCCESS, sendok = PAMI_SUCCESS;
  int tmp;
  const pami_metadata_t *my_md;

  MPIDI_Datatype_get_info(1, recvtype, rcv_data_contig, rcvtypelen, dt_null, recv_true_lb);

  if(MPIDI_Pamix_collsel_advise != NULL && mpid->collsel_fast_query != NULL)
  {
    advisor_algorithm_t advisor_algorithms[1];
    int num_algorithms = MPIDI_Pamix_collsel_advise(mpid->collsel_fast_query,
                           PAMI_XFER_ALLGATHERV_INT, rcvtypelen * recvcounts[0],
                           advisor_algorithms, 1);
    if(num_algorithms)
    {
      if(advisor_algorithms[0].algorithm_type == COLLSEL_EXTERNAL_ALGO)
      {
        return MPIR_Allgatherv(sendbuf, sendcount, sendtype,
                               recvbuf, recvcounts, displs, recvtype,
                               comm_ptr, mpierrno);
      }
      else if(advisor_algorithms[0].metadata &&
              advisor_algorithms[0].metadata->check_correct.values.asyncflowctl &&
              !(--(comm_ptr->mpid.num_requests)))
      {
        comm_ptr->mpid.num_requests = MPIDI_Process.optimized.num_requests;
        int tmpmpierrno;
        if(unlikely(verbose))
          fprintf(stderr,"Query barrier required for %s\n", advisor_algorithms[0].metadata->name);
        MPIDO_Barrier(comm_ptr, &tmpmpierrno);
      }
    }
  }

  if(!inplace)
  {
    sendok = MPIDI_Datatype_to_pami(sendtype, &stype, -1, NULL, &tmp);
    MPIDI_Datatype_get_info(sendcount, sendtype, snd_data_contig, send_size, dt_null, send_true_lb);
    sbuf = (char *)sendbuf + send_true_lb;
    if(!snd_data_contig || (sendok != PAMI_SUCCESS))
    {
      stype = PAMI_TYPE_UNSIGNED_CHAR;
      scount = send_size;
      if(!snd_data_contig)
      {
        snd_noncontig_buff = MPIU_Malloc(send_size);
        sbuf = snd_noncontig_buff;
        if(snd_noncontig_buff == NULL)
        {
          MPID_Abort(NULL, MPI_ERR_NO_SPACE, 1, "Fatal:  Cannot allocate pack buffer");
        }
        MPIR_Localcopy(sendbuf, sendcount, sendtype,
                       snd_noncontig_buff, send_size, MPI_CHAR);
      }
    }
  }
  else
    sbuf = PAMI_IN_PLACE;

  recvok = MPIDI_Datatype_to_pami(recvtype, &rtype, -1, NULL, &tmp);
  rbuf = (char *)recvbuf + recv_true_lb;
  if(!rcv_data_contig || (recvok != PAMI_SUCCESS))
  {
    rtype = PAMI_TYPE_UNSIGNED_CHAR;
    totalrecvcount = recvcounts[0];
    recvcontinuous = displs[0] == 0 ? 1 : 0;
    int i;
    precvdispls = lrecvdispls = MPIU_Malloc(size*sizeof(int));
    precvcounts = lrecvcounts = MPIU_Malloc(size*sizeof(int));
    lrecvdispls[0] = 0;
    lrecvcounts[0] = rcvtypelen * recvcounts[0];
    for(i = 1; i < size; ++i)
    {
      lrecvdispls[i] = rcvtypelen * totalrecvcount;
      totalrecvcount += recvcounts[i];
      if(displs[i] != (displs[i-1] + recvcounts[i-1]))
        recvcontinuous = 0;
      lrecvcounts[i] = rcvtypelen * recvcounts[i];
    }
    recv_size = rcvtypelen * totalrecvcount;
    TRACE_ERR("Pack receive rcv_contig %zu, recvok %zd, totalrecvcount %zu, recvcontinuous %zu, rcvtypelen %zu, recv_size %zu\n",
              (size_t)rcv_data_contig, (size_t)recvok, (size_t)totalrecvcount,
              (size_t)recvcontinuous, (size_t)rcvtypelen, (size_t)recv_size);

    rcv_noncontig_buff = MPIU_Malloc(recv_size);
    rbuf = rcv_noncontig_buff;
    if(rcv_noncontig_buff == NULL)
    {
      MPID_Abort(NULL, MPI_ERR_NO_SPACE, 1, "Fatal:  Cannot allocate pack buffer");
    }
    if(inplace)
    {
      size_t extent;
      MPID_Datatype_get_extent_macro(recvtype, extent);
      MPIR_Localcopy((char *)recvbuf + displs[rank]*extent, recvcounts[rank], recvtype,
                     (char *)rcv_noncontig_buff + precvdispls[rank], precvcounts[rank], MPI_CHAR);
      scount = precvcounts[rank];
      stype = PAMI_TYPE_UNSIGNED_CHAR;
      sbuf = PAMI_IN_PLACE;
    }
  }

  pami_xfer_t allgatherv;
  allgatherv.cb_done = allgatherv_cb_done;
  allgatherv.cookie = (void *)&allgatherv_active;
  allgatherv.cmd.xfer_allgatherv_int.sndbuf = sbuf;
  allgatherv.cmd.xfer_allgatherv_int.rcvbuf = rbuf;
  allgatherv.cmd.xfer_allgatherv_int.stype = stype; /* stype is ignored when sndbuf == PAMI_IN_PLACE */
  allgatherv.cmd.xfer_allgatherv_int.rtype = rtype;
  allgatherv.cmd.xfer_allgatherv_int.stypecount = scount;
  allgatherv.cmd.xfer_allgatherv_int.rtypecounts = (int *) precvcounts;
  allgatherv.cmd.xfer_allgatherv_int.rdispls = (int *) precvdispls;
  allgatherv.algorithm = mpid->coll_algorithm[PAMI_XFER_ALLGATHERV_INT][0][0];
  my_md = &mpid->coll_metadata[PAMI_XFER_ALLGATHERV_INT][0][0];

  TRACE_ERR("Calling allgatherv via %s()\n",
            MPIDI_Process.context_post.active>0?"PAMI_Collective":"PAMI_Context_post");
  MPIDI_Post_coll_t allgatherv_post;
  MPIDI_Context_post(MPIDI_Context[0], &allgatherv_post.state,
                     MPIDI_Pami_post_wrapper, (void *)&allgatherv);
  MPIDI_Update_last_algorithm(comm_ptr, my_md->name);

  TRACE_ERR("Rank %d waiting on active %d\n", rank, allgatherv_active);
  MPID_PROGRESS_WAIT_WHILE(allgatherv_active);

  if(!rcv_data_contig || (recvok != PAMI_SUCCESS))
  {
    if(recvcontinuous)
    {
      MPIR_Localcopy(rcv_noncontig_buff, recv_size, MPI_CHAR,
                     recvbuf, totalrecvcount, recvtype);
    }
    else
    {
      size_t extent;
      int i;
      MPID_Datatype_get_extent_macro(recvtype, extent);
      for(i = 0; i < size; ++i)
      {
        char* scbuf = (char*)rcv_noncontig_buff + precvdispls[i];
        char* rcbuf = (char*)recvbuf + displs[i]*extent;
        MPIR_Localcopy(scbuf, precvcounts[i], MPI_CHAR,
                       rcbuf, recvcounts[i], recvtype);
        TRACE_ERR("Pack recv src  extent %zu, displ[%zu]=%zu, count[%zu]=%zu buf[%zu]=%u\n",
                  (size_t)extent, (size_t)i, (size_t)precvdispls[i], (size_t)i,
                  (size_t)precvcounts[i], (size_t)precvdispls[i], *(int*)scbuf);
        TRACE_ERR("Pack recv dest extent %zu, displ[%zu]=%zu, count[%zu]=%zu buf[%zu]=%u\n",
                  (size_t)extent, (size_t)i, (size_t)displs[i], (size_t)i,
                  (size_t)recvcounts[i], (size_t)displs[i], *(int*)rcbuf);
      }
    }
    MPIU_Free(rcv_noncontig_buff);
  }
  if(!snd_data_contig)  MPIU_Free(snd_noncontig_buff);
  if(lrecvdispls) MPIU_Free(lrecvdispls);
  if(lrecvcounts) MPIU_Free(lrecvcounts);

  return MPI_SUCCESS;
}
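/* Sketch of the "recvcontinuous" test computed inline above: the receive
 * layout can be treated as one contiguous byte region only when every
 * displacement starts exactly where the previous block ended.  The helper
 * name is illustrative; the loop mirrors the inline logic.
 */
static int displs_are_continuous(int size, const int *recvcounts, const int *displs)
{
  int i;
  if (displs[0] != 0)
    return 0;
  for (i = 1; i < size; ++i)
    if (displs[i] != displs[i - 1] + recvcounts[i - 1])
      return 0;  /* a gap or reordering breaks contiguity */
  return 1;
}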
int MPIDO_Bcast(void *buffer, int count, MPI_Datatype datatype, int root,
                MPID_Comm *comm_ptr, int *mpierrno)
{
  TRACE_ERR("in mpido_bcast\n");
  const size_t BCAST_LIMIT = 0x40000000;
  int data_contig, rc;
  void *data_buffer = NULL, *noncontig_buff = NULL;
  volatile unsigned active = 1;
  MPI_Aint data_true_lb = 0;
  MPID_Datatype *data_ptr;
  MPID_Segment segment;
  MPIDI_Post_coll_t bcast_post;
  const struct MPIDI_Comm* const mpid = &(comm_ptr->mpid);
  const int rank = comm_ptr->rank;
#if ASSERT_LEVEL==0
  /* We can't afford the tracing in ndebug/performance libraries */
  const unsigned verbose = 0;
#else
  const unsigned verbose = (MPIDI_Process.verbose >= MPIDI_VERBOSE_DETAILS_ALL) && (rank == 0);
#endif
  const int selected_type = mpid->user_selected_type[PAMI_XFER_BROADCAST];

  /* Must calculate data_size based on count=1 in case the total size
     exceeds the range of an int */
  int data_size_one;
  MPIDI_Datatype_get_info(1, datatype, data_contig, data_size_one, data_ptr, data_true_lb);
  /* do this calculation once and use it twice */
  const size_t data_size_sz = (size_t)data_size_one*(size_t)count;

  if(unlikely(verbose))
    fprintf(stderr,"bcast count %d, size %d (%#zX), root %d, buffer %p\n",
            count, data_size_one, (size_t)data_size_one*(size_t)count, root, buffer);

  if(unlikely(data_size_sz > BCAST_LIMIT))
  {
    void *new_buffer = buffer;
    int c, new_count = (int)BCAST_LIMIT/data_size_one;
    MPID_assert(new_count > 0);

    for(c = 1; ((size_t)c*(size_t)new_count) <= (size_t)count; ++c)
    {
      if ((rc = MPIDO_Bcast(new_buffer, new_count, datatype, root, comm_ptr, mpierrno)) != MPI_SUCCESS)
        return rc;
      new_buffer = (char*)new_buffer + (size_t)data_size_one*(size_t)new_count;
    }
    new_count = count % new_count; /* 0 is ok, just returns no-op */
    return MPIDO_Bcast(new_buffer, new_count, datatype, root, comm_ptr, mpierrno);
  }

  /* Must use data_size based on count for byte bcast processing.
     Previously calculated as a size_t, but large data_sizes were handled
     above, so this cast to int should be fine here. */
  const int data_size = (int)data_size_sz;

  if(selected_type == MPID_COLL_USE_MPICH || data_size == 0)
  {
    if(unlikely(verbose))
      fprintf(stderr,"Using MPICH bcast algorithm\n");
    MPIDI_Update_last_algorithm(comm_ptr,"BCAST_MPICH");
    return MPIR_Bcast_intra(buffer, count, datatype, root, comm_ptr, mpierrno);
  }

  data_buffer = (char *)buffer + data_true_lb;

  if(!data_contig)
  {
    noncontig_buff = MPIU_Malloc(data_size);
    data_buffer = noncontig_buff;
    if(noncontig_buff == NULL)
    {
      MPID_Abort(NULL, MPI_ERR_NO_SPACE, 1, "Fatal:  Cannot allocate pack buffer");
    }
    if(rank == root)
    {
      DLOOP_Offset last = data_size;
      MPID_Segment_init(buffer, count, datatype, &segment, 0);
      MPID_Segment_pack(&segment, 0, &last, noncontig_buff);
    }
  }

  pami_xfer_t bcast;
  pami_algorithm_t my_bcast;
  const pami_metadata_t *my_md = (pami_metadata_t *)NULL;
  int queryreq = 0;

  bcast.cb_done = cb_bcast;
  bcast.cookie = (void *)&active;
  bcast.cmd.xfer_broadcast.root = MPIDI_Task_to_endpoint(MPID_VCR_GET_LPID(comm_ptr->vcr, root), 0);
  bcast.algorithm = mpid->user_selected[PAMI_XFER_BROADCAST];
  bcast.cmd.xfer_broadcast.buf = data_buffer;
  bcast.cmd.xfer_broadcast.type = PAMI_TYPE_BYTE;
  /* Needs to be sizeof(type)*count since we are using bytes as
   * the generic type */
  bcast.cmd.xfer_broadcast.typecount = data_size;

  if(selected_type == MPID_COLL_OPTIMIZED)
  {
    TRACE_ERR("Optimized bcast (%s) and (%s) were pre-selected\n",
              mpid->opt_protocol_md[PAMI_XFER_BROADCAST][0].name,
              mpid->opt_protocol_md[PAMI_XFER_BROADCAST][1].name);

    if(mpid->cutoff_size[PAMI_XFER_BROADCAST][1] != 0) /* SSS: There is an FCA cutoff (FCA only sets a cutoff for [PAMI_XFER_BROADCAST][1]) */
    {
      if(data_size <= mpid->cutoff_size[PAMI_XFER_BROADCAST][1])
      {
        my_bcast = mpid->opt_protocol[PAMI_XFER_BROADCAST][1];
        my_md = &mpid->opt_protocol_md[PAMI_XFER_BROADCAST][1];
        queryreq = mpid->must_query[PAMI_XFER_BROADCAST][1];
      }
      else
      {
        return MPIR_Bcast_intra(buffer, count, datatype, root, comm_ptr, mpierrno);
      }
    }

    if(data_size > mpid->cutoff_size[PAMI_XFER_BROADCAST][0])
    {
      my_bcast = mpid->opt_protocol[PAMI_XFER_BROADCAST][1];
      my_md = &mpid->opt_protocol_md[PAMI_XFER_BROADCAST][1];
      queryreq = mpid->must_query[PAMI_XFER_BROADCAST][1];
    }
    else
    {
      my_bcast = mpid->opt_protocol[PAMI_XFER_BROADCAST][0];
      my_md = &mpid->opt_protocol_md[PAMI_XFER_BROADCAST][0];
      queryreq = mpid->must_query[PAMI_XFER_BROADCAST][0];
    }
  }
  else
  {
    TRACE_ERR("Bcast (%s) was specified by user\n",
              mpid->user_metadata[PAMI_XFER_BROADCAST].name);
    my_bcast = mpid->user_selected[PAMI_XFER_BROADCAST];
    my_md = &mpid->user_metadata[PAMI_XFER_BROADCAST];
    queryreq = selected_type;
  }

  bcast.algorithm = my_bcast;

  if(unlikely(queryreq == MPID_COLL_ALWAYS_QUERY || queryreq == MPID_COLL_CHECK_FN_REQUIRED))
  {
    metadata_result_t result = {0};
    TRACE_ERR("querying bcast protocol %s, type was: %d\n", my_md->name, queryreq);
    if(my_md->check_fn != NULL) /* calling the check fn is sufficient */
    {
      result = my_md->check_fn(&bcast);
      result.check.nonlocal = 0; /* #warning REMOVE THIS WHEN IMPLEMENTED */
    }
    else /* no check_fn, manually look at the metadata fields */
    {
      TRACE_ERR("Optimized selection line %d\n", __LINE__);
      /* Check if the message size is range restricted */
      if(my_md->check_correct.values.rangeminmax)
      {
        if((my_md->range_lo <= data_size) && (my_md->range_hi >= data_size))
          ; /* ok, algorithm selected */
        else
        {
          result.check.range = 1;
          if(unlikely(verbose))
          {
            fprintf(stderr,"message size (%u) outside range (%zu<->%zu) for %s.\n",
                    data_size, my_md->range_lo, my_md->range_hi, my_md->name);
          }
        }
      }
      /* \todo check the rest of the metadata */
    }

    TRACE_ERR("bitmask: %#X\n", result.bitmask);
    if(result.bitmask)
    {
      if(unlikely(verbose))
        fprintf(stderr,"Using MPICH bcast algorithm - query fn failed\n");
      MPIDI_Update_last_algorithm(comm_ptr,"BCAST_MPICH");
      return MPIR_Bcast_intra(buffer, count, datatype, root, comm_ptr, mpierrno);
    }
    if(my_md->check_correct.values.asyncflowctl && !(--(comm_ptr->mpid.num_requests)))
    {
      comm_ptr->mpid.num_requests = MPIDI_Process.optimized.num_requests;
      int tmpmpierrno;
      if(unlikely(verbose))
        fprintf(stderr,"Query barrier required for %s\n", my_md->name);
      MPIDO_Barrier(comm_ptr, &tmpmpierrno);
    }
  }

  if(unlikely(verbose))
  {
    unsigned long long int threadID;
    MPIU_Thread_id_t tid;
    MPIU_Thread_self(&tid);
    threadID = (unsigned long long int)tid;
    fprintf(stderr,"<%llx> Using protocol %s for bcast on %u\n",
            threadID, my_md->name, (unsigned) comm_ptr->context_id);
  }

  MPIDI_Context_post(MPIDI_Context[0], &bcast_post.state,
                     MPIDI_Pami_post_wrapper, (void *)&bcast);
  MPIDI_Update_last_algorithm(comm_ptr, my_md->name);
  MPID_PROGRESS_WAIT_WHILE(active);
  TRACE_ERR("bcast done\n");

  if(!data_contig)
  {
    if(rank != root)
      MPIR_Localcopy(noncontig_buff, data_size, MPI_CHAR, buffer, count, datatype);
    MPIU_Free(noncontig_buff);
  }

  TRACE_ERR("leaving bcast\n");
  return 0;
}
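/* Standalone sketch (public MPI API only) of the BCAST_LIMIT chunking above:
 * a broadcast whose byte size would overflow the typecount is split into
 * fixed-size pieces plus a remainder, each broadcast with the original
 * datatype.  The constant and names are illustrative, and the sketch assumes
 * a contiguous datatype of elem_size bytes.
 */
#include <mpi.h>
#include <stddef.h>

static int bcast_chunked(void *buffer, size_t count, int elem_size,
                         MPI_Datatype datatype, int root, MPI_Comm comm)
{
  const size_t LIMIT = 0x40000000;    /* 1 GiB, as in BCAST_LIMIT above */
  size_t chunk = LIMIT / elem_size;   /* elements per full-size chunk   */
  char *p = buffer;
  int rc;

  while (count >= chunk)
  {
    rc = MPI_Bcast(p, (int) chunk, datatype, root, comm);
    if (rc != MPI_SUCCESS) return rc;
    p += chunk * (size_t) elem_size;
    count -= chunk;
  }
  /* remainder; count == 0 is a legal no-op broadcast */
  return MPI_Bcast(p, (int) count, datatype, root, comm);
}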
int MPIDO_Bcast_simple(void *buffer, int count, MPI_Datatype datatype, int root,
                       MPID_Comm *comm_ptr, int *mpierrno)
{
  TRACE_ERR("Entering MPIDO_Bcast_optimized\n");

  int data_contig;
  void *data_buffer = NULL, *noncontig_buff = NULL;
  volatile unsigned active = 1;
  MPI_Aint data_true_lb = 0;
  MPID_Datatype *data_ptr;
  MPID_Segment segment;
  MPIDI_Post_coll_t bcast_post;
  const struct MPIDI_Comm* const mpid = &(comm_ptr->mpid);
  const int rank = comm_ptr->rank;

  /* Must calculate data_size based on count=1 in case the total size
     exceeds the range of an int */
  int data_size_one;
  MPIDI_Datatype_get_info(1, datatype, data_contig, data_size_one, data_ptr, data_true_lb);

  if(MPIDI_Pamix_collsel_advise != NULL && mpid->collsel_fast_query != NULL)
  {
    advisor_algorithm_t advisor_algorithms[1];
    int num_algorithms = MPIDI_Pamix_collsel_advise(mpid->collsel_fast_query,
                           PAMI_XFER_BROADCAST, data_size_one * count,
                           advisor_algorithms, 1);
    if(num_algorithms)
    {
      if(advisor_algorithms[0].algorithm_type == COLLSEL_EXTERNAL_ALGO)
      {
        return MPIR_Bcast_intra(buffer, count, datatype, root, comm_ptr, mpierrno);
      }
    }
  }

  const int data_size = data_size_one*(size_t)count;
  data_buffer = (char *)buffer + data_true_lb;

  if(!data_contig)
  {
    noncontig_buff = MPIU_Malloc(data_size);
    data_buffer = noncontig_buff;
    if(noncontig_buff == NULL)
    {
      MPID_Abort(NULL, MPI_ERR_NO_SPACE, 1, "Fatal:  Cannot allocate pack buffer");
    }
    if(rank == root)
    {
      DLOOP_Offset last = data_size;
      MPID_Segment_init(buffer, count, datatype, &segment, 0);
      MPID_Segment_pack(&segment, 0, &last, noncontig_buff);
    }
  }

  pami_xfer_t bcast;
  const pami_metadata_t *my_bcast_md;
  int queryreq = 0;

  bcast.cb_done = cb_bcast;
  bcast.cookie = (void *)&active;
  bcast.cmd.xfer_broadcast.root = MPIDI_Task_to_endpoint(MPID_VCR_GET_LPID(comm_ptr->vcr, root), 0);
  bcast.algorithm = mpid->coll_algorithm[PAMI_XFER_BROADCAST][0][0];
  bcast.cmd.xfer_broadcast.buf = data_buffer;
  bcast.cmd.xfer_broadcast.type = PAMI_TYPE_BYTE;
  /* Needs to be sizeof(type)*count since we are using bytes as
   * the generic type */
  bcast.cmd.xfer_broadcast.typecount = data_size;
  my_bcast_md = &mpid->coll_metadata[PAMI_XFER_BROADCAST][0][0];

  MPIDI_Context_post(MPIDI_Context[0], &bcast_post.state,
                     MPIDI_Pami_post_wrapper, (void *)&bcast);
  MPIDI_Update_last_algorithm(comm_ptr, my_bcast_md->name);
  MPID_PROGRESS_WAIT_WHILE(active);
  TRACE_ERR("bcast done\n");

  if(!data_contig)
  {
    if(rank != root)
      MPIR_Localcopy(noncontig_buff, data_size, MPI_CHAR, buffer, count, datatype);
    MPIU_Free(noncontig_buff);
  }

  TRACE_ERR("Exiting MPIDO_Bcast_optimized\n");
  return 0;
}
int MPIDO_Allgather(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
                    void *recvbuf, int recvcount, MPI_Datatype recvtype,
                    MPID_Comm * comm_ptr, int *mpierrno)
{
  /* *********************************
   * Check the nature of the buffers
   * ********************************* */
  /* MPIDO_Coll_config config = {1,1,1,1,1,1}; */
  int config[6], i;
  MPID_Datatype * dt_null = NULL;
  MPI_Aint send_true_lb = 0;
  MPI_Aint recv_true_lb = 0;
  int rc, comm_size = comm_ptr->local_size;
  size_t send_size = 0;
  size_t recv_size = 0;
  volatile unsigned allred_active = 1;
  volatile unsigned allgather_active = 1;
  pami_xfer_t allred;
  for (i = 0; i < 6; i++) config[i] = 1;
  pami_metadata_t *my_md;

  allred.cb_done = allred_cb_done;
  allred.cookie = (void *)&allred_active;
  /* Pick an algorithm that is guaranteed to work for the pre-allreduce */
  /* TODO: This needs selection for fast(er|est) allreduce protocol */
  allred.algorithm = comm_ptr->mpid.coll_algorithm[PAMI_XFER_ALLREDUCE][0][0];
  allred.cmd.xfer_allreduce.sndbuf = (void *)config;
  allred.cmd.xfer_allreduce.stype = PAMI_TYPE_SIGNED_INT;
  allred.cmd.xfer_allreduce.rcvbuf = (void *)config;
  allred.cmd.xfer_allreduce.rtype = PAMI_TYPE_SIGNED_INT;
  allred.cmd.xfer_allreduce.stypecount = 6;
  allred.cmd.xfer_allreduce.rtypecount = 6;
  allred.cmd.xfer_allreduce.op = PAMI_DATA_BAND;

  char use_tree_reduce, use_alltoall, use_bcast, use_pami, use_opt;
  char *rbuf = NULL, *sbuf = NULL;
  use_alltoall = comm_ptr->mpid.allgathers[2];
  use_tree_reduce = comm_ptr->mpid.allgathers[0];
  use_bcast = comm_ptr->mpid.allgathers[1];
  use_pami = (comm_ptr->mpid.user_selected_type[PAMI_XFER_ALLGATHER] == MPID_COLL_USE_MPICH) ? 0 : 1;
  /* if(sendbuf == MPI_IN_PLACE) use_pami = 0; */
  use_opt = use_alltoall || use_tree_reduce || use_bcast || use_pami;

  TRACE_ERR("flags before: b: %d a: %d t: %d p: %d\n",
            use_bcast, use_alltoall, use_tree_reduce, use_pami);
  if(!use_opt)
  {
    if(unlikely(MPIDI_Process.verbose >= MPIDI_VERBOSE_DETAILS_ALL && comm_ptr->rank == 0))
      fprintf(stderr,"Using MPICH allgather algorithm\n");
    TRACE_ERR("No options set/available; using MPICH for allgather\n");
    MPIDI_Update_last_algorithm(comm_ptr, "ALLGATHER_MPICH");
    return MPIR_Allgather(sendbuf, sendcount, sendtype,
                          recvbuf, recvcount, recvtype,
                          comm_ptr, mpierrno);
  }

  if ((sendcount < 1 && sendbuf != MPI_IN_PLACE) || recvcount < 1)
    return MPI_SUCCESS;

  /* Gather datatype information */
  MPIDI_Datatype_get_info(recvcount, recvtype,
                          config[MPID_RECV_CONTIG], recv_size, dt_null, recv_true_lb);
  send_size = recv_size;
  rbuf = (char *)recvbuf + recv_true_lb;

  if(sendbuf != MPI_IN_PLACE)
  {
    MPIDI_Datatype_get_info(sendcount, sendtype,
                            config[MPID_SEND_CONTIG], send_size, dt_null, send_true_lb);
    sbuf = (char *)sendbuf + send_true_lb;
  }
  else
  {
    if(unlikely(MPIDI_Process.verbose >= MPIDI_VERBOSE_DETAILS_ALL))
      fprintf(stderr,"allgather MPI_IN_PLACE buffering\n");
    sbuf = (char *)recvbuf + recv_size*comm_ptr->rank;
  }
  /* fprintf(stderr,"sendcount: %d, recvcount: %d send_size: %zd recv_size: %zd\n",
            sendcount, recvcount, send_size, recv_size); */

  /* verify everyone's datatype contiguity */
  /* Check buffer alignment now, since we're pre-allreducing anyway */
  /* Only do this if one of the glue protocols is likely to be used */
  if(use_alltoall || use_tree_reduce || use_bcast)
  {
    config[MPID_ALIGNEDBUFFER] = !((long)sendbuf & 0x0F) && !((long)recvbuf & 0x0F);

    /* #warning need to determine best allreduce for short messages */
    if(comm_ptr->mpid.preallreduces[MPID_ALLGATHER_PREALLREDUCE])
    {
      TRACE_ERR("Preallreducing in allgather\n");
      MPIDI_Post_coll_t allred_post;
      MPIDI_Context_post(MPIDI_Context[0], &allred_post.state,
                         MPIDI_Pami_post_wrapper, (void *)&allred);
      MPID_PROGRESS_WAIT_WHILE(allred_active);
    }

    use_alltoall = comm_ptr->mpid.allgathers[2] &&
                   config[MPID_RECV_CONTIG] && config[MPID_SEND_CONTIG];
    /* Note: some of the glue protocols use recv_size*comm_size rather than
     * recv_size so we use that for comparison here, plus we pass that in
     * to those protocols. */
    use_tree_reduce = comm_ptr->mpid.allgathers[0] &&
                      config[MPID_RECV_CONTIG] && config[MPID_SEND_CONTIG] &&
                      config[MPID_RECV_CONTINUOUS] &&
                      (recv_size*comm_size % sizeof(int) == 0);
    use_bcast = comm_ptr->mpid.allgathers[1];

    TRACE_ERR("flags after: b: %d a: %d t: %d p: %d\n",
              use_bcast, use_alltoall, use_tree_reduce, use_pami);
  }

  if(use_pami)
  {
    TRACE_ERR("Using PAMI-level allgather protocol\n");
    pami_xfer_t allgather;
    allgather.cb_done = allgather_cb_done;
    allgather.cookie = (void *)&allgather_active;
    allgather.cmd.xfer_allgather.rcvbuf = rbuf;
    allgather.cmd.xfer_allgather.sndbuf = sbuf;
    allgather.cmd.xfer_allgather.stype = PAMI_TYPE_BYTE;
    allgather.cmd.xfer_allgather.rtype = PAMI_TYPE_BYTE;
    allgather.cmd.xfer_allgather.stypecount = send_size;
    allgather.cmd.xfer_allgather.rtypecount = recv_size;

    if(comm_ptr->mpid.user_selected_type[PAMI_XFER_ALLGATHER] == MPID_COLL_OPTIMIZED)
    {
      allgather.algorithm = comm_ptr->mpid.opt_protocol[PAMI_XFER_ALLGATHER][0];
      my_md = &comm_ptr->mpid.opt_protocol_md[PAMI_XFER_ALLGATHER][0];
    }
    else
    {
      allgather.algorithm = comm_ptr->mpid.user_selected[PAMI_XFER_ALLGATHER];
      my_md = &comm_ptr->mpid.user_metadata[PAMI_XFER_ALLGATHER];
    }

    if(unlikely(comm_ptr->mpid.user_selected_type[PAMI_XFER_ALLGATHER] == MPID_COLL_ALWAYS_QUERY ||
                comm_ptr->mpid.user_selected_type[PAMI_XFER_ALLGATHER] == MPID_COLL_CHECK_FN_REQUIRED))
    {
      metadata_result_t result = {0};
      TRACE_ERR("Querying allgather protocol %s, type was: %d\n",
                my_md->name, comm_ptr->mpid.user_selected_type[PAMI_XFER_ALLGATHER]);
      result = my_md->check_fn(&allgather);
      TRACE_ERR("bitmask: %#X\n", result.bitmask);
      if(result.bitmask)
      {
        fprintf(stderr,"Query failed for %s.\n", my_md->name);
      }
    }

    if(unlikely(MPIDI_Process.verbose >= MPIDI_VERBOSE_DETAILS_ALL && comm_ptr->rank == 0))
    {
      unsigned long long int threadID;
      MPIU_Thread_id_t tid;
      MPIU_Thread_self(&tid);
      threadID = (unsigned long long int)tid;
      fprintf(stderr,"<%llx> Using protocol %s for allgather on %u\n",
              threadID, my_md->name, (unsigned) comm_ptr->context_id);
    }

    TRACE_ERR("Calling PAMI_Collective with allgather structure\n");
    MPIDI_Post_coll_t allgather_post;
    MPIDI_Context_post(MPIDI_Context[0], &allgather_post.state,
                       MPIDI_Pami_post_wrapper, (void *)&allgather);
    TRACE_ERR("Allgather %s\n", MPIDI_Process.context_post.active>0?"posted":"invoked");
    MPIDI_Update_last_algorithm(comm_ptr, my_md->name);
    MPID_PROGRESS_WAIT_WHILE(allgather_active);
    TRACE_ERR("Allgather done\n");
    return PAMI_SUCCESS;
  }

  if(use_tree_reduce)
  {
    TRACE_ERR("Using allgather via allreduce\n");
    MPIDI_Update_last_algorithm(comm_ptr, "ALLGATHER_OPT_ALLREDUCE");
    rc = MPIDO_Allgather_allreduce(sendbuf, sendcount, sendtype,
                                   recvbuf, recvcount, recvtype,
                                   send_true_lb, recv_true_lb,
                                   send_size, recv_size*comm_size,
                                   comm_ptr, mpierrno);
    return rc;
  }
  if(use_alltoall)
  {
    TRACE_ERR("Using allgather via alltoall\n");
    MPIDI_Update_last_algorithm(comm_ptr, "ALLGATHER_OPT_ALLTOALL");
    rc = MPIDO_Allgather_alltoall(sendbuf, sendcount, sendtype,
                                  recvbuf, recvcount, recvtype,
                                  send_true_lb, recv_true_lb,
                                  send_size, recv_size*comm_size,
                                  comm_ptr, mpierrno);
    return rc;
  }
  if(use_bcast)
  {
    TRACE_ERR("Using allgather via bcast\n");
    MPIDI_Update_last_algorithm(comm_ptr, "ALLGATHER_OPT_BCAST");
    rc = MPIDO_Allgather_bcast(sendbuf, sendcount, sendtype,
                               recvbuf, recvcount, recvtype,
                               send_true_lb, recv_true_lb,
                               send_size, recv_size*comm_size,
                               comm_ptr, mpierrno);
    return rc;
  }

  /* Nothing used yet; dump to MPICH */
  if(unlikely(MPIDI_Process.verbose >= MPIDI_VERBOSE_DETAILS_ALL && comm_ptr->rank == 0))
    fprintf(stderr,"Using MPICH allgather algorithm\n");
  TRACE_ERR("Using allgather via mpich\n");
  MPIDI_Update_last_algorithm(comm_ptr, "ALLGATHER_MPICH");
  return MPIR_Allgather(sendbuf, sendcount, sendtype,
                        recvbuf, recvcount, recvtype,
                        comm_ptr, mpierrno);
}
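/* Sketch of the buffer-alignment probe fed into the pre-allreduce above:
 * the glue protocols want both buffers 16-byte aligned, and the test is a
 * mask of the low four address bits.  uintptr_t is used here instead of the
 * (long) cast in the original; the helper name is illustrative.
 */
#include <stdint.h>

static int buffers_are_16b_aligned(const void *sendbuf, const void *recvbuf)
{
  return !((uintptr_t) sendbuf & 0x0F) && !((uintptr_t) recvbuf & 0x0F);
}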
void MPIDI_Coll_comm_create(MPID_Comm *comm)
{
  volatile int geom_init = 1;
  int i;
  MPIDI_Post_geom_create_t geom_post;

  TRACE_ERR("MPIDI_Coll_comm_create enter\n");
  if (!MPIDI_Process.optimized.collectives)
    return;
  if(comm->comm_kind != MPID_INTRACOMM)
    return;

  /* Create a geometry */
  comm->coll_fns = MPIU_Calloc0(1, MPID_Collops);
  MPID_assert(comm->coll_fns != NULL);

  if(comm->mpid.geometry != MPIDI_Process.world_geometry)
  {
    if(unlikely(MPIDI_Process.verbose >= MPIDI_VERBOSE_DETAILS_0 && comm->rank == 0))
      fprintf(stderr,"world geom: %p parent geom: %p\n",
              MPIDI_Process.world_geometry, comm->mpid.parent);
    TRACE_ERR("Creating subgeom\n");

    /* Change to this at some point */
    comm->mpid.tasks = NULL;
    for(i = 1; i < comm->local_size; i++)
    {
      /* Only use a (single) range if the tasks are sequential; multiple
         or reordered ranges are inefficient. */
      if(MPID_VCR_GET_LPID(comm->vcr, i) != (MPID_VCR_GET_LPID(comm->vcr, i-1) + 1))
      {
        /* not sequential, use a task list */
        MPID_VCR_GET_LPIDS(comm, comm->mpid.tasks);
        break;
      }
    }
    /* Should we use a range? (no task list set) */
    if(comm->mpid.tasks == NULL)
    {
      /* one range, {first rank ... last rank} */
      comm->mpid.range.lo = MPID_VCR_GET_LPID(comm->vcr, 0);
      comm->mpid.range.hi = MPID_VCR_GET_LPID(comm->vcr, comm->local_size-1);
    }

    if(unlikely(MPIDI_Process.verbose >= MPIDI_VERBOSE_DETAILS_0 && comm->rank == 0))
      fprintf(stderr,"create geometry tasks %p {%u..%u}\n", comm->mpid.tasks,
              MPID_VCR_GET_LPID(comm->vcr, 0),
              MPID_VCR_GET_LPID(comm->vcr, comm->local_size-1));

    pami_configuration_t config[3];
    size_t numconfigs = 0;
#ifdef HAVE_PAMI_GEOMETRY_NONCONTIG
    config[0].name = PAMI_GEOMETRY_NONCONTIG;
    if(MPIDI_Process.optimized.memory & MPID_OPT_LVL_NONCONTIG)
      config[0].value.intval = 0; /* Disable non-contig; pamid doesn't use pami for
                                     non-contig data collectives */
    else
      config[0].value.intval = 1; /* Enable non-contig even though pamid doesn't use
                                     pami for non-contig data collectives; we may
                                     still want those collectives for other reasons. */
    ++numconfigs;
#endif
    if(MPIDI_Process.optimized.subcomms)
    {
      config[numconfigs].name = PAMI_GEOMETRY_OPTIMIZE;
      config[numconfigs].value.intval = 1;
      ++numconfigs;
    }
#ifdef HAVE_PAMI_GEOMETRY_MEMORY_OPTIMIZE
    if(MPIDI_Process.optimized.memory)
    {
      config[numconfigs].name = PAMI_GEOMETRY_MEMORY_OPTIMIZE;
      config[numconfigs].value.intval = MPIDI_Process.optimized.memory; /* level of optimization */
      ++numconfigs;
    }
#endif

    if((MPIDI_Process.optimized.memory & MPID_OPT_LVL_IRREG) &&
       (comm->local_size & (comm->local_size-1)))
    {
      /* Don't create irregular geometries.  Fall back to MPICH-only collectives. */
      geom_init = 0;
      comm->mpid.geometry = PAMI_GEOMETRY_NULL;
    }
    else if(comm->mpid.tasks == NULL)
    {
      geom_post.client = MPIDI_Client;
      geom_post.configs = config;
      geom_post.context_offset = 0; /* TODO BES investigate */
      geom_post.num_configs = numconfigs;
      geom_post.newgeom = &comm->mpid.geometry;
      geom_post.parent = PAMI_GEOMETRY_NULL;
      geom_post.id     = comm->context_id;
      geom_post.ranges = &comm->mpid.range;
      geom_post.tasks  = NULL;
      geom_post.count  = (size_t)1;
      geom_post.fn     = geom_create_cb_done;
      geom_post.cookie = (void*)&geom_init;

      TRACE_ERR("%s geom_rangelist_create\n", MPIDI_Process.context_post>0?"Posting":"Invoking");
      MPIDI_Context_post(MPIDI_Context[0], &geom_post.state,
                         geom_rangelist_create_wrapper, (void *)&geom_post);
    }
    else
    {
      geom_post.client = MPIDI_Client;
      geom_post.configs = config;
      geom_post.context_offset = 0; /* TODO BES investigate */
      geom_post.num_configs = numconfigs;
      geom_post.newgeom = &comm->mpid.geometry;
      geom_post.parent = PAMI_GEOMETRY_NULL;
      geom_post.id     = comm->context_id;
      geom_post.ranges = NULL;
      geom_post.tasks  = comm->mpid.tasks;
      geom_post.count  = (size_t)comm->local_size;
      geom_post.fn     = geom_create_cb_done;
      geom_post.cookie = (void*)&geom_init;

      TRACE_ERR("%s geom_tasklist_create\n", MPIDI_Process.context_post>0?"Posting":"Invoking");
      MPIDI_Context_post(MPIDI_Context[0], &geom_post.state,
                         geom_tasklist_create_wrapper, (void *)&geom_post);
    }

    TRACE_ERR("Waiting for geom create to finish\n");
    MPID_PROGRESS_WAIT_WHILE(geom_init);

    if(comm->mpid.geometry == PAMI_GEOMETRY_NULL)
    {
      if(unlikely(MPIDI_Process.verbose >= MPIDI_VERBOSE_DETAILS_0 && comm->rank == 0))
        fprintf(stderr,"Created unoptimized communicator id=%u, size=%u\n",
                (unsigned) comm->context_id, comm->local_size);
      MPIU_TestFree(&comm->coll_fns);
      return;
    }
  }

  /* Initialize the async flow control in case it will be used. */
  comm->mpid.num_requests = MPIDI_Process.optimized.num_requests;

  TRACE_ERR("Querying protocols\n");
  /* Determine what protocols are available for this comm/geom */
  /* These two functions moved to mpid_collselect.c */
  MPIDI_Comm_coll_query(comm);
  MPIDI_Comm_coll_envvars(comm);
  if(MPIDI_Process.optimized.select_colls)
    MPIDI_Comm_coll_select(comm);

  TRACE_ERR("mpir barrier\n");
  int mpierrno = FALSE;
  /* Switch to comm->coll_fns->fn() */
  MPIDO_Barrier(comm, &mpierrno);

  TRACE_ERR("MPIDI_Coll_comm_create exit\n");
}
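/* Sketch of the range-versus-tasklist decision made above: a geometry can
 * be described by a single {lo..hi} range only when the communicator's
 * global task ids are strictly sequential; otherwise an explicit task list
 * is needed.  This operates on a plain id array purely for illustration.
 */
static int task_ids_are_sequential(const int *lpids, int n)
{
  int i;
  for (i = 1; i < n; i++)
    if (lpids[i] != lpids[i - 1] + 1)
      return 0;  /* reordered or sparse: fall back to a task list */
  return 1;      /* sequential: a single range {lpids[0]..lpids[n-1]} works */
}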
void MPIDI_Coll_comm_destroy(MPID_Comm *comm)
{
  TRACE_ERR("MPIDI_Coll_comm_destroy enter\n");
  int i;
  volatile int geom_destroy = 1;
  MPIDI_Post_geom_destroy_t geom_destroy_post;

  if (!MPIDI_Process.optimized.collectives)
    return;
  if(comm->comm_kind != MPID_INTRACOMM)
    return;

  /* It's possible (MPIR_Setup_intercomm_localcomm) to have an intracomm
     without a geometry even when using optimized collectives */
  if(comm->mpid.geometry == PAMI_GEOMETRY_NULL)
    return;

  MPIU_TestFree(&comm->coll_fns);
  for(i = 0; i < PAMI_XFER_COUNT; i++)
  {
    TRACE_ERR("Freeing algo/meta %d\n", i);
    /* When allocating comm->mpid.coll_algorithm, we skip allocations for
       AM collectives.  There is also no explicit initialization of
       comm->mpid.coll_algorithm to NULLs, which may cause problems when
       MPIU_TestFree frees the pointers.  We skip the AM collectives here,
       just as we skip allocating them in MPIDI_Comm_coll_query. */
    if(i == PAMI_XFER_AMBROADCAST || i == PAMI_XFER_AMSCATTER ||
       i == PAMI_XFER_AMGATHER || i == PAMI_XFER_AMREDUCE)
      continue;

    MPIU_TestFree(&comm->mpid.coll_algorithm[i][0]);
    MPIU_TestFree(&comm->mpid.coll_algorithm[i][1]);
    MPIU_TestFree(&comm->mpid.coll_metadata[i][0]);
    MPIU_TestFree(&comm->mpid.coll_metadata[i][1]);
  }

  if(MPIDI_Process.optimized.auto_select_colls != MPID_AUTO_SELECT_COLLS_NONE &&
     MPIDI_Process.optimized.auto_select_colls != MPID_AUTO_SELECT_COLLS_TUNE &&
     comm->local_size > 1)
  {
    /* Destroy the fast query object. */
    pami_extension_collsel_query_destroy pamix_collsel_query_destroy =
      (pami_extension_collsel_query_destroy) PAMI_Extension_symbol(MPIDI_Collsel_extension,
                                                                   "Collsel_query_destroy");
    if(pamix_collsel_query_destroy != NULL)
    {
      pamix_collsel_query_destroy(&(comm->mpid.collsel_fast_query));
    }
  }

  TRACE_ERR("Destroying geometry\n");
  geom_destroy_post.client = MPIDI_Client;
  geom_destroy_post.geom   = &comm->mpid.geometry;
  geom_destroy_post.fn     = geom_destroy_cb_done;
  geom_destroy_post.cookie = (void *)&geom_destroy;
  TRACE_ERR("%s geom_destroy\n", MPIDI_Process.context_post>0?"Posting":"Invoking");
  MPIDI_Context_post(MPIDI_Context[0], &geom_destroy_post.state,
                     geom_destroy_wrapper, (void *)&geom_destroy_post);
  TRACE_ERR("Waiting for geom destroy to finish\n");
  MPID_PROGRESS_WAIT_WHILE(geom_destroy);

  MPID_VCR_FREE_LPIDS(comm->mpid.tasks);
/*  TRACE_ERR("Freeing geometry ranges\n");
    MPIU_TestFree(&comm->mpid.tasks_descriptor.ranges); */
  TRACE_ERR("MPIDI_Coll_comm_destroy exit\n");
}
int MPIDO_Allgather(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf, int recvcount, MPI_Datatype recvtype, MPID_Comm * comm_ptr, int *mpierrno) { #ifndef HAVE_PAMI_IN_PLACE if (sendbuf == MPI_IN_PLACE) { MPID_Abort (NULL, 0, 1, "'MPI_IN_PLACE' requires support for `PAMI_IN_PLACE`"); return -1; } #endif /* ********************************* * Check the nature of the buffers * ********************************* */ const struct MPIDI_Comm* const mpid = &(comm_ptr->mpid); int config[6], i; MPID_Datatype * dt_null = NULL; MPI_Aint send_true_lb = 0; MPI_Aint recv_true_lb = 0; int comm_size = comm_ptr->local_size; size_t send_bytes = 0; size_t recv_bytes = 0; volatile unsigned allred_active = 1; volatile unsigned allgather_active = 1; pami_xfer_t allred; const int rank = comm_ptr->rank; int queryreq = 0; #if ASSERT_LEVEL==0 /* We can't afford the tracing in ndebug/performance libraries */ const unsigned verbose = 0; #else const unsigned verbose = (MPIDI_Process.verbose >= MPIDI_VERBOSE_DETAILS_ALL) && (rank == 0); #endif const int selected_type = mpid->user_selected_type[PAMI_XFER_ALLGATHER]; for (i=0;i<6;i++) config[i] = 1; const pami_metadata_t *my_md = (pami_metadata_t *)NULL; allred.cb_done = allred_cb_done; allred.cookie = (void *)&allred_active; /* Pick an algorithm that is guaranteed to work for the pre-allreduce */ /* TODO: This needs selection for fast(er|est) allreduce protocol */ allred.algorithm = mpid->coll_algorithm[PAMI_XFER_ALLREDUCE][0][0]; allred.cmd.xfer_allreduce.sndbuf = (void *)config; allred.cmd.xfer_allreduce.stype = PAMI_TYPE_SIGNED_INT; allred.cmd.xfer_allreduce.rcvbuf = (void *)config; allred.cmd.xfer_allreduce.rtype = PAMI_TYPE_SIGNED_INT; allred.cmd.xfer_allreduce.stypecount = 6; allred.cmd.xfer_allreduce.rtypecount = 6; allred.cmd.xfer_allreduce.op = PAMI_DATA_BAND; char use_tree_reduce, use_alltoall, use_bcast, use_pami, use_opt; char *rbuf = NULL, *sbuf = NULL; const char * const allgathers = mpid->allgathers; use_alltoall = allgathers[2]; use_tree_reduce = allgathers[0]; use_bcast = allgathers[1]; use_pami = (selected_type == MPID_COLL_USE_MPICH) ?
0 : 1; use_opt = use_alltoall || use_tree_reduce || use_bcast || use_pami; TRACE_ERR("flags before: b: %d a: %d t: %d p: %d\n", use_bcast, use_alltoall, use_tree_reduce, use_pami); if(!use_opt) { if(unlikely(verbose)) fprintf(stderr,"Using MPICH allgather algorithm\n"); TRACE_ERR("No options set/available; using MPICH for allgather\n"); MPIDI_Update_last_algorithm(comm_ptr, "ALLGATHER_MPICH"); #if CUDA_AWARE_SUPPORT if(MPIDI_Process.cuda_aware_support_on) { MPI_Aint sdt_extent,rdt_extent; MPID_Datatype_get_extent_macro(sendtype, sdt_extent); MPID_Datatype_get_extent_macro(recvtype, rdt_extent); char *scbuf = NULL; char *rcbuf = NULL; int is_send_dev_buf = MPIDI_cuda_is_device_buf(sendbuf); int is_recv_dev_buf = MPIDI_cuda_is_device_buf(recvbuf); if(is_send_dev_buf) { scbuf = MPL_malloc(sdt_extent * sendcount); cudaError_t cudaerr = cudaMemcpy(scbuf, sendbuf, sdt_extent * sendcount, cudaMemcpyDeviceToHost); if (cudaSuccess != cudaerr) fprintf(stderr, "cudaMemcpy failed: %s\n", cudaGetErrorString(cudaerr)); } else scbuf = sendbuf; if(is_recv_dev_buf) { rcbuf = MPL_malloc(rdt_extent * recvcount); if(sendbuf == MPI_IN_PLACE) { cudaError_t cudaerr = cudaMemcpy(rcbuf, recvbuf, rdt_extent * recvcount, cudaMemcpyDeviceToHost); if (cudaSuccess != cudaerr) fprintf(stderr, "cudaMemcpy failed: %s\n", cudaGetErrorString(cudaerr)); } else memset(rcbuf, 0, rdt_extent * recvcount); } else rcbuf = recvbuf; int cuda_res = MPIR_Allgather(scbuf, sendcount, sendtype, rcbuf, recvcount, recvtype, comm_ptr, mpierrno); if(is_send_dev_buf)MPL_free(scbuf); if(is_recv_dev_buf) { cudaError_t cudaerr = cudaMemcpy(recvbuf, rcbuf, rdt_extent * recvcount, cudaMemcpyHostToDevice); if (cudaSuccess != cudaerr) fprintf(stderr, "cudaMemcpy failed: %s\n", cudaGetErrorString(cudaerr)); MPL_free(rcbuf); } return cuda_res; } else #endif return MPIR_Allgather(sendbuf, sendcount, sendtype, recvbuf, recvcount, recvtype, comm_ptr, mpierrno); } if ((sendcount < 1 && sendbuf != MPI_IN_PLACE) || recvcount < 1) return MPI_SUCCESS; /* Gather datatype information */ MPIDI_Datatype_get_info(recvcount, recvtype, config[MPID_RECV_CONTIG], recv_bytes, dt_null, recv_true_lb); send_bytes = recv_bytes; rbuf = (char *)recvbuf+recv_true_lb; sbuf = PAMI_IN_PLACE; if(sendbuf != MPI_IN_PLACE) { MPIDI_Datatype_get_info(sendcount, sendtype, config[MPID_SEND_CONTIG], send_bytes, dt_null, send_true_lb); sbuf = (char *)sendbuf+send_true_lb; } else if(unlikely(verbose)) fprintf(stderr,"allgather MPI_IN_PLACE buffering\n"); /* verify everyone's datatype contiguity */ /* Check buffer alignment now, since we're pre-allreducing anyway */ /* Only do this if one of the glue protocols is likely to be used */ if(use_alltoall || use_tree_reduce || use_bcast) { config[MPID_ALIGNEDBUFFER] = !((long)sendbuf & 0x0F) && !((long)recvbuf & 0x0F); /* #warning need to determine best allreduce for short messages */ if(mpid->preallreduces[MPID_ALLGATHER_PREALLREDUCE]) { TRACE_ERR("Preallreducing in allgather\n"); MPIDI_Post_coll_t allred_post; MPIDI_Context_post(MPIDI_Context[0], &allred_post.state, MPIDI_Pami_post_wrapper, (void *)&allred); MPID_PROGRESS_WAIT_WHILE(allred_active); } use_alltoall = allgathers[2] && config[MPID_RECV_CONTIG] && config[MPID_SEND_CONTIG]; /* Note: some of the glue protocols use recv_bytes*comm_size rather than * recv_bytes so we use that for comparison here, plus we pass that in * to those protocols. 
*/ use_tree_reduce = allgathers[0] && config[MPID_RECV_CONTIG] && config[MPID_SEND_CONTIG] && config[MPID_RECV_CONTINUOUS] && (recv_bytes*comm_size%sizeof(unsigned)) == 0; use_bcast = allgathers[1]; TRACE_ERR("flags after: b: %d a: %d t: %d p: %d\n", use_bcast, use_alltoall, use_tree_reduce, use_pami); } if(use_pami) { TRACE_ERR("Using PAMI-level allgather protocol\n"); pami_xfer_t allgather; allgather.cb_done = allgather_cb_done; allgather.cookie = (void *)&allgather_active; allgather.cmd.xfer_allgather.rcvbuf = rbuf; allgather.cmd.xfer_allgather.sndbuf = sbuf; allgather.cmd.xfer_allgather.stype = PAMI_TYPE_BYTE; allgather.cmd.xfer_allgather.rtype = PAMI_TYPE_BYTE; allgather.cmd.xfer_allgather.stypecount = send_bytes; allgather.cmd.xfer_allgather.rtypecount = recv_bytes; if(selected_type == MPID_COLL_OPTIMIZED) { if((mpid->cutoff_size[PAMI_XFER_ALLGATHER][0] == 0) || (mpid->cutoff_size[PAMI_XFER_ALLGATHER][0] > 0 && mpid->cutoff_size[PAMI_XFER_ALLGATHER][0] >= send_bytes)) { allgather.algorithm = mpid->opt_protocol[PAMI_XFER_ALLGATHER][0]; my_md = &mpid->opt_protocol_md[PAMI_XFER_ALLGATHER][0]; queryreq = mpid->must_query[PAMI_XFER_ALLGATHER][0]; } else { return MPIR_Allgather(sendbuf, sendcount, sendtype, recvbuf, recvcount, recvtype, comm_ptr, mpierrno); } } else { allgather.algorithm = mpid->user_selected[PAMI_XFER_ALLGATHER]; my_md = &mpid->user_metadata[PAMI_XFER_ALLGATHER]; queryreq = selected_type; } if(unlikely( queryreq == MPID_COLL_ALWAYS_QUERY || queryreq == MPID_COLL_CHECK_FN_REQUIRED)) { metadata_result_t result = {0}; TRACE_ERR("Querying allgather protocol %s, type was: %d\n", my_md->name, selected_type); if(my_md->check_fn == NULL) { /* process metadata bits */ if((!my_md->check_correct.values.inplace) && (sendbuf == MPI_IN_PLACE)) result.check.unspecified = 1; if(my_md->check_correct.values.rangeminmax) { if((my_md->range_lo <= recv_bytes) && (my_md->range_hi >= recv_bytes)) ; /* ok, algorithm selected */ else { result.check.range = 1; if(unlikely(verbose)) { fprintf(stderr,"message size (%zu) outside range (%zu<->%zu) for %s.\n", recv_bytes, my_md->range_lo, my_md->range_hi, my_md->name); } } } } else /* calling the check fn is sufficient */ result = my_md->check_fn(&allgather); TRACE_ERR("bitmask: %#X\n", result.bitmask); result.check.nonlocal = 0; /* #warning REMOVE THIS WHEN IMPLEMENTED */ if(result.bitmask) { if(unlikely(verbose)) fprintf(stderr,"Query failed for %s. 
Using MPICH allgather\n", my_md->name); MPIDI_Update_last_algorithm(comm_ptr, "ALLGATHER_MPICH"); return MPIR_Allgather(sendbuf, sendcount, sendtype, recvbuf, recvcount, recvtype, comm_ptr, mpierrno); } if(my_md->check_correct.values.asyncflowctl && !(--(comm_ptr->mpid.num_requests))) { comm_ptr->mpid.num_requests = MPIDI_Process.optimized.num_requests; int tmpmpierrno; if(unlikely(verbose)) fprintf(stderr,"Query barrier required for %s\n", my_md->name); MPIDO_Barrier(comm_ptr, &tmpmpierrno); } } if(unlikely(verbose)) { unsigned long long int threadID; MPL_thread_id_t tid; MPL_thread_self(&tid); threadID = (unsigned long long int)tid; fprintf(stderr,"<%llx> Using protocol %s for allgather on %u\n", threadID, my_md->name, (unsigned) comm_ptr->context_id); } TRACE_ERR("Calling PAMI_Collective with allgather structure\n"); MPIDI_Post_coll_t allgather_post; MPIDI_Context_post(MPIDI_Context[0], &allgather_post.state, MPIDI_Pami_post_wrapper, (void *)&allgather); MPIDI_Update_last_algorithm(comm_ptr, my_md->name); MPID_PROGRESS_WAIT_WHILE(allgather_active); TRACE_ERR("Allgather done\n"); return PAMI_SUCCESS; } if(use_tree_reduce) { if(unlikely(verbose)) fprintf(stderr,"Using protocol GLUE_ALLREDUCE for allgather\n"); TRACE_ERR("Using allgather via allreduce\n"); MPIDI_Update_last_algorithm(comm_ptr, "ALLGATHER_OPT_ALLREDUCE"); return MPIDO_Allgather_allreduce(sendbuf, sendcount, sendtype, recvbuf, recvcount, recvtype, send_true_lb, recv_true_lb, send_bytes, recv_bytes*comm_size, comm_ptr, mpierrno); } if(use_alltoall) { if(unlikely(verbose)) fprintf(stderr,"Using protocol GLUE_ALLTOALL for allgather\n"); TRACE_ERR("Using allgather via alltoall\n"); MPIDI_Update_last_algorithm(comm_ptr, "ALLGATHER_OPT_ALLTOALL"); return MPIDO_Allgather_alltoall(sendbuf, sendcount, sendtype, recvbuf, recvcount, recvtype, send_true_lb, recv_true_lb, send_bytes, recv_bytes*comm_size, comm_ptr, mpierrno); } if(use_bcast) { if(unlikely(verbose)) fprintf(stderr,"Using protocol GLUE_BCAST for allgather\n"); TRACE_ERR("Using allgather via bcast\n"); MPIDI_Update_last_algorithm(comm_ptr, "ALLGATHER_OPT_BCAST"); return MPIDO_Allgather_bcast(sendbuf, sendcount, sendtype, recvbuf, recvcount, recvtype, send_true_lb, recv_true_lb, send_bytes, recv_bytes*comm_size, comm_ptr, mpierrno); } /* Nothing used yet; dump to MPICH */ if(unlikely(verbose)) fprintf(stderr,"Using MPICH allgather algorithm\n"); TRACE_ERR("Using allgather via mpich\n"); MPIDI_Update_last_algorithm(comm_ptr, "ALLGATHER_MPICH"); return MPIR_Allgather(sendbuf, sendcount, sendtype, recvbuf, recvcount, recvtype, comm_ptr, mpierrno); }
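/*
 * Worked example (stand-alone sketch): MPIDO_Allgather above seeds config[]
 * with all-ones and pre-allreduces it with PAMI_DATA_BAND, so a flag
 * survives only if it is set on every rank -- e.g. glue protocols that
 * require contiguous datatypes stay enabled only when all ranks are
 * contiguous.  The three "ranks" below are hypothetical; the inner loop
 * emulates the bitwise-AND combine that the allreduce performs.
 */
#include <stdio.h>

#define NFLAGS 6 /* same width as config[6] above */

int main(void)
{
  /* per-rank flag vectors; rank 1 reports a noncontiguous recv type */
  int config[3][NFLAGS] = {
    { 1, 1, 1, 1, 1, 1 },
    { 1, 0, 1, 1, 1, 1 },
    { 1, 1, 1, 1, 1, 1 },
  };
  int global[NFLAGS];
  for (int f = 0; f < NFLAGS; f++) {
    global[f] = 1;
    for (int r = 0; r < 3; r++)
      global[f] &= config[r][f]; /* the BAND combine */
    printf("flag %d -> %d\n", f, global[f]);
  }
  /* flag 1 comes back 0 on every rank at once, so the contiguity-dependent
   * paths (use_alltoall, use_tree_reduce) are disabled everywhere. */
  return 0;
}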
int MPIDO_Allgather_simple(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf, int recvcount, MPI_Datatype recvtype, MPID_Comm * comm_ptr, int *mpierrno) { #ifndef HAVE_PAMI_IN_PLACE if (sendbuf == MPI_IN_PLACE) { MPID_Abort (NULL, 0, 1, "'MPI_IN_PLACE' requires support for `PAMI_IN_PLACE`"); return -1; } #endif /* ********************************* * Check the nature of the buffers * ********************************* */ const struct MPIDI_Comm* const mpid = &(comm_ptr->mpid); MPID_Datatype * dt_null = NULL; void *snd_noncontig_buff = NULL, *rcv_noncontig_buff = NULL; MPI_Aint send_true_lb = 0; MPI_Aint recv_true_lb = 0; int snd_data_contig = 1, rcv_data_contig = 1; size_t send_size = 0; size_t recv_size = 0; MPID_Segment segment; volatile unsigned allgather_active = 1; const int rank = comm_ptr->rank; const int size = comm_ptr->local_size; #if ASSERT_LEVEL==0 /* We can't afford the tracing in ndebug/performance libraries */ const unsigned verbose = 0; #else const unsigned verbose = (MPIDI_Process.verbose >= MPIDI_VERBOSE_DETAILS_ALL) && (rank == 0); #endif const pami_metadata_t *my_md; char *rbuf = NULL, *sbuf = NULL; if ((sendcount < 1 && sendbuf != MPI_IN_PLACE) || recvcount < 1) return MPI_SUCCESS; /* Gather datatype information */ MPIDI_Datatype_get_info(recvcount, recvtype, rcv_data_contig, recv_size, dt_null, recv_true_lb); send_size = recv_size; if(MPIDI_Pamix_collsel_advise != NULL && mpid->collsel_fast_query != NULL) { advisor_algorithm_t advisor_algorithms[1]; int num_algorithms = MPIDI_Pamix_collsel_advise(mpid->collsel_fast_query, PAMI_XFER_ALLGATHER, send_size, advisor_algorithms, 1); if(num_algorithms) { if(advisor_algorithms[0].algorithm_type == COLLSEL_EXTERNAL_ALGO) { return MPIR_Allgather(sendbuf, sendcount, sendtype, recvbuf, recvcount, recvtype, comm_ptr, mpierrno); } else if(advisor_algorithms[0].metadata && advisor_algorithms[0].metadata->check_correct.values.asyncflowctl && !(--(comm_ptr->mpid.num_requests))) { comm_ptr->mpid.num_requests = MPIDI_Process.optimized.num_requests; int tmpmpierrno; if(unlikely(verbose)) fprintf(stderr,"Query barrier required for %s\n", advisor_algorithms[0].metadata->name); MPIDO_Barrier(comm_ptr, &tmpmpierrno); } } } rbuf = (char *)recvbuf+recv_true_lb; if(!rcv_data_contig) { rcv_noncontig_buff = MPL_malloc(recv_size * size); rbuf = rcv_noncontig_buff; if(rcv_noncontig_buff == NULL) { MPID_Abort(NULL, MPI_ERR_NO_SPACE, 1, "Fatal: Cannot allocate pack buffer"); } if(sendbuf == MPI_IN_PLACE) { sbuf = PAMI_IN_PLACE; size_t extent; MPID_Datatype_get_extent_macro(recvtype,extent); MPIR_Localcopy(recvbuf + (rank*recvcount*extent), recvcount, recvtype, rcv_noncontig_buff + (rank*recv_size), recv_size,MPI_CHAR); } } if(sendbuf != MPI_IN_PLACE) { MPIDI_Datatype_get_info(sendcount, sendtype, snd_data_contig, send_size, dt_null, send_true_lb); sbuf = (char *)sendbuf+send_true_lb; if(!snd_data_contig) { snd_noncontig_buff = MPL_malloc(send_size); sbuf = snd_noncontig_buff; if(snd_noncontig_buff == NULL) { MPID_Abort(NULL, MPI_ERR_NO_SPACE, 1, "Fatal: Cannot allocate pack buffer"); } DLOOP_Offset last = send_size; MPID_Segment_init(sendbuf, sendcount, sendtype, &segment, 0); MPID_Segment_pack(&segment, 0, &last, snd_noncontig_buff); } } else sbuf = PAMI_IN_PLACE; TRACE_ERR("Using PAMI-level allgather protocol\n"); pami_xfer_t allgather; allgather.cb_done = allgather_cb_done; allgather.cookie = (void *)&allgather_active; allgather.cmd.xfer_allgather.rcvbuf = rbuf; allgather.cmd.xfer_allgather.sndbuf = sbuf; 
allgather.cmd.xfer_allgather.stype = PAMI_TYPE_BYTE;/* stype is ignored when sndbuf == PAMI_IN_PLACE */ allgather.cmd.xfer_allgather.rtype = PAMI_TYPE_BYTE; allgather.cmd.xfer_allgather.stypecount = send_size; allgather.cmd.xfer_allgather.rtypecount = recv_size; allgather.algorithm = mpid->coll_algorithm[PAMI_XFER_ALLGATHER][0][0]; my_md = &mpid->coll_metadata[PAMI_XFER_ALLGATHER][0][0]; TRACE_ERR("Calling PAMI_Collective with allgather structure\n"); MPIDI_Post_coll_t allgather_post; MPIDI_Context_post(MPIDI_Context[0], &allgather_post.state, MPIDI_Pami_post_wrapper, (void *)&allgather); TRACE_ERR("Allgather %s\n", MPIDI_Process.context_post.active>0?"posted":"invoked"); MPIDI_Update_last_algorithm(comm_ptr, my_md->name); MPID_PROGRESS_WAIT_WHILE(allgather_active); if(!rcv_data_contig) { MPIR_Localcopy(rcv_noncontig_buff, recv_size * size, MPI_CHAR, recvbuf, recvcount * size, recvtype); MPL_free(rcv_noncontig_buff); } if(!snd_data_contig) MPL_free(snd_noncontig_buff); TRACE_ERR("Allgather done\n"); return MPI_SUCCESS; }
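/*
 * Illustration (stand-alone sketch): for a noncontiguous send type,
 * MPIDO_Allgather_simple above packs the data into snd_noncontig_buff via
 * MPID_Segment_init()/MPID_Segment_pack() so PAMI can send it as plain
 * PAMI_TYPE_BYTE.  The sketch does the equivalent by hand for a
 * hypothetical strided layout (every other int of an array), which is the
 * effect the segment code achieves for arbitrary datatypes.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
  int src[8] = { 0, 1, 2, 3, 4, 5, 6, 7 }; /* strided source: take every 2nd int */
  const size_t count = 4, stride = 2;

  /* analogue of snd_noncontig_buff: a contiguous staging buffer */
  int *packed = malloc(count * sizeof *packed);
  if (packed == NULL)
    return 1; /* the real code aborts with MPI_ERR_NO_SPACE here */

  for (size_t i = 0; i < count; i++)
    memcpy(&packed[i], &src[i * stride], sizeof *packed); /* gather stride-by-stride */

  for (size_t i = 0; i < count; i++)
    printf("%d ", packed[i]); /* prints: 0 2 4 6 */
  putchar('\n');
  free(packed);
  return 0;
}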
int MPIDO_Allgatherv(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf, const int *recvcounts, const int *displs, MPI_Datatype recvtype, MPID_Comm * comm_ptr, int *mpierrno) { #ifndef HAVE_PAMI_IN_PLACE if (sendbuf == MPI_IN_PLACE) { MPID_Abort (NULL, 0, 1, "'MPI_IN_PLACE' requires support for `PAMI_IN_PLACE`"); return -1; } #endif TRACE_ERR("Entering MPIDO_Allgatherv\n"); /* function pointer to be used to point to the appropriate algorithm */ /* Check the nature of the buffers */ MPID_Datatype *dt_null = NULL; MPI_Aint send_true_lb = 0; MPI_Aint recv_true_lb = 0; size_t send_size = 0; size_t recv_size = 0; int config[6]; int scount=sendcount; int i, rc, buffer_sum = 0; const int size = comm_ptr->local_size; char use_tree_reduce, use_alltoall, use_bcast, use_pami, use_opt; char *sbuf, *rbuf; const int rank = comm_ptr->rank; const struct MPIDI_Comm* const mpid = &(comm_ptr->mpid); int queryreq = 0; #if ASSERT_LEVEL==0 /* We can't afford the tracing in ndebug/performance libraries */ const unsigned verbose = 0; #else const unsigned verbose = (MPIDI_Process.verbose >= MPIDI_VERBOSE_DETAILS_ALL) && (rank == 0); #endif const int selected_type = mpid->user_selected_type[PAMI_XFER_ALLGATHERV_INT]; pami_xfer_t allred; volatile unsigned allred_active = 1; volatile unsigned allgatherv_active = 1; pami_type_t stype, rtype; int tmp; const pami_metadata_t *my_md = (pami_metadata_t *)NULL; for(i=0;i<6;i++) config[i] = 1; allred.cb_done = allred_cb_done; allred.cookie = (void *)&allred_active; allred.algorithm = mpid->coll_algorithm[PAMI_XFER_ALLREDUCE][0][0]; allred.cmd.xfer_allreduce.sndbuf = (void *)config; allred.cmd.xfer_allreduce.stype = PAMI_TYPE_SIGNED_INT; allred.cmd.xfer_allreduce.rcvbuf = (void *)config; allred.cmd.xfer_allreduce.rtype = PAMI_TYPE_SIGNED_INT; allred.cmd.xfer_allreduce.stypecount = 6; allred.cmd.xfer_allreduce.rtypecount = 6; allred.cmd.xfer_allreduce.op = PAMI_DATA_BAND; use_alltoall = mpid->allgathervs[2]; use_tree_reduce = mpid->allgathervs[0]; use_bcast = mpid->allgathervs[1]; use_pami = selected_type != MPID_COLL_USE_MPICH; if((sendbuf != MPI_IN_PLACE) && (MPIDI_Datatype_to_pami(sendtype, &stype, -1, NULL, &tmp) != MPI_SUCCESS)) use_pami = 0; if(MPIDI_Datatype_to_pami(recvtype, &rtype, -1, NULL, &tmp) != MPI_SUCCESS) use_pami = 0; use_opt = use_alltoall || use_tree_reduce || use_bcast || use_pami; if(!use_opt) /* back to MPICH */ { if(unlikely(verbose)) fprintf(stderr,"Using MPICH allgatherv type %u.\n", selected_type); TRACE_ERR("Using MPICH Allgatherv\n"); MPIDI_Update_last_algorithm(comm_ptr, "ALLGATHERV_MPICH"); #if CUDA_AWARE_SUPPORT if(MPIDI_Process.cuda_aware_support_on) { MPI_Aint sdt_extent,rdt_extent; MPID_Datatype_get_extent_macro(sendtype, sdt_extent); MPID_Datatype_get_extent_macro(recvtype, rdt_extent); char *scbuf = NULL; char *rcbuf = NULL; int is_send_dev_buf = MPIDI_cuda_is_device_buf(sendbuf); int is_recv_dev_buf = MPIDI_cuda_is_device_buf(recvbuf); if(is_send_dev_buf) { scbuf = MPL_malloc(sdt_extent * sendcount); cudaError_t cudaerr = cudaMemcpy(scbuf, sendbuf, sdt_extent * sendcount, cudaMemcpyDeviceToHost); if (cudaSuccess != cudaerr) fprintf(stderr, "cudaMemcpy failed: %s\n", cudaGetErrorString(cudaerr)); } else scbuf = sendbuf; size_t rtotal_buf; if(is_recv_dev_buf) { //Since displs can be non-contiguous, we need to calculate max buffer size int highest_displs = displs[size - 1]; int highest_recvcount = recvcounts[size - 1]; for(i = 0; i < size; i++) { if(displs[i]+recvcounts[i] > highest_displs+highest_recvcount) { 
highest_displs = displs[i]; highest_recvcount = recvcounts[i]; } } rtotal_buf = (highest_displs+highest_recvcount)*rdt_extent; rcbuf = MPL_malloc(rtotal_buf); if(sendbuf == MPI_IN_PLACE) { cudaError_t cudaerr = cudaMemcpy(rcbuf, recvbuf, rtotal_buf, cudaMemcpyDeviceToHost); if (cudaSuccess != cudaerr) fprintf(stderr, "cudaMemcpy failed: %s\n", cudaGetErrorString(cudaerr)); } else memset(rcbuf, 0, rtotal_buf); } else rcbuf = recvbuf; int cuda_res = MPIR_Allgatherv(scbuf, sendcount, sendtype, rcbuf, recvcounts, displs, recvtype, comm_ptr, mpierrno); if(is_send_dev_buf)MPL_free(scbuf); if(is_recv_dev_buf) { cudaError_t cudaerr = cudaMemcpy(recvbuf, rcbuf, rtotal_buf, cudaMemcpyHostToDevice); if (cudaSuccess != cudaerr) fprintf(stderr, "cudaMemcpy failed: %s\n", cudaGetErrorString(cudaerr)); MPL_free(rcbuf); } return cuda_res; } else #endif return MPIR_Allgatherv(sendbuf, sendcount, sendtype, recvbuf, recvcounts, displs, recvtype, comm_ptr, mpierrno); } MPIDI_Datatype_get_info(1, recvtype, config[MPID_RECV_CONTIG], recv_size, dt_null, recv_true_lb); if(sendbuf == MPI_IN_PLACE) { sbuf = PAMI_IN_PLACE; if(unlikely(verbose)) fprintf(stderr,"allgatherv MPI_IN_PLACE buffering\n"); stype = rtype; scount = recvcounts[rank]; send_size = recv_size * scount; } else { MPIDI_Datatype_get_info(sendcount, sendtype, config[MPID_SEND_CONTIG], send_size, dt_null, send_true_lb); sbuf = (char *)sendbuf+send_true_lb; } rbuf = (char *)recvbuf+recv_true_lb; if(use_alltoall || use_bcast || use_tree_reduce) { if (displs[0]) config[MPID_RECV_CONTINUOUS] = 0; for (i = 1; i < size; i++) { buffer_sum += recvcounts[i - 1]; if (buffer_sum != displs[i]) { config[MPID_RECV_CONTINUOUS] = 0; break; } } buffer_sum += recvcounts[size - 1]; buffer_sum *= recv_size; /* disable with "safe allgatherv" env var */ if(mpid->preallreduces[MPID_ALLGATHERV_PREALLREDUCE]) { MPIDI_Post_coll_t allred_post; MPIDI_Context_post(MPIDI_Context[0], &allred_post.state, MPIDI_Pami_post_wrapper, (void *)&allred); MPID_PROGRESS_WAIT_WHILE(allred_active); } use_tree_reduce = mpid->allgathervs[0] && config[MPID_RECV_CONTIG] && config[MPID_SEND_CONTIG] && config[MPID_RECV_CONTINUOUS] && buffer_sum % sizeof(unsigned) == 0; use_alltoall = mpid->allgathervs[2] && config[MPID_RECV_CONTIG] && config[MPID_SEND_CONTIG]; use_bcast = mpid->allgathervs[1]; } if(use_pami) { pami_xfer_t allgatherv; allgatherv.cb_done = allgatherv_cb_done; allgatherv.cookie = (void *)&allgatherv_active; if(selected_type == MPID_COLL_OPTIMIZED) { if((mpid->cutoff_size[PAMI_XFER_ALLGATHERV_INT][0] == 0) || (mpid->cutoff_size[PAMI_XFER_ALLGATHERV_INT][0] > 0 && mpid->cutoff_size[PAMI_XFER_ALLGATHERV_INT][0] >= send_size)) { allgatherv.algorithm = mpid->opt_protocol[PAMI_XFER_ALLGATHERV_INT][0]; my_md = &mpid->opt_protocol_md[PAMI_XFER_ALLGATHERV_INT][0]; queryreq = mpid->must_query[PAMI_XFER_ALLGATHERV_INT][0]; } else return MPIR_Allgatherv(sendbuf, sendcount, sendtype, recvbuf, recvcounts, displs, recvtype, comm_ptr, mpierrno); } else { allgatherv.algorithm = mpid->user_selected[PAMI_XFER_ALLGATHERV_INT]; my_md = &mpid->user_metadata[PAMI_XFER_ALLGATHERV_INT]; queryreq = selected_type; } allgatherv.cmd.xfer_allgatherv_int.sndbuf = sbuf; allgatherv.cmd.xfer_allgatherv_int.rcvbuf = rbuf; allgatherv.cmd.xfer_allgatherv_int.stype = stype; allgatherv.cmd.xfer_allgatherv_int.rtype = rtype; allgatherv.cmd.xfer_allgatherv_int.stypecount = scount; allgatherv.cmd.xfer_allgatherv_int.rtypecounts = (int *) recvcounts; allgatherv.cmd.xfer_allgatherv_int.rdispls = (int *) displs; if(unlikely 
(queryreq == MPID_COLL_ALWAYS_QUERY || queryreq == MPID_COLL_CHECK_FN_REQUIRED)) { metadata_result_t result = {0}; TRACE_ERR("Querying allgatherv_int protocol %s, type was %d\n", my_md->name, selected_type); if(my_md->check_fn == NULL) { /* process metadata bits */ if((!my_md->check_correct.values.inplace) && (sendbuf == MPI_IN_PLACE)) result.check.unspecified = 1; /* Can't check ranges like this. Non-local. Comment out for now. if(my_md->check_correct.values.rangeminmax) { MPI_Aint data_true_lb; MPID_Datatype *data_ptr; int data_size, data_contig; MPIDI_Datatype_get_info(sendcount, sendtype, data_contig, data_size, data_ptr, data_true_lb); if((my_md->range_lo <= data_size) && (my_md->range_hi >= data_size)) ; else { result.check.range = 1; if(unlikely(verbose)) { fprintf(stderr,"message size (%u) outside range (%zu<->%zu) for %s.\n", data_size, my_md->range_lo, my_md->range_hi, my_md->name); } } } */ } else /* calling the check fn is sufficient */ result = my_md->check_fn(&allgatherv); TRACE_ERR("Allgatherv bitmask: %#X\n", result.bitmask); result.check.nonlocal = 0; /* #warning REMOVE THIS WHEN IMPLEMENTED */ if(result.bitmask) { if(unlikely(verbose)) fprintf(stderr,"Query failed for %s. Using MPICH allgatherv.\n", my_md->name); MPIDI_Update_last_algorithm(comm_ptr, "ALLGATHERV_MPICH"); return MPIR_Allgatherv(sendbuf, sendcount, sendtype, recvbuf, recvcounts, displs, recvtype, comm_ptr, mpierrno); } if(my_md->check_correct.values.asyncflowctl && !(--(comm_ptr->mpid.num_requests))) { comm_ptr->mpid.num_requests = MPIDI_Process.optimized.num_requests; int tmpmpierrno; if(unlikely(verbose)) fprintf(stderr,"Query barrier required for %s\n", my_md->name); MPIDO_Barrier(comm_ptr, &tmpmpierrno); } } if(unlikely(verbose)) { unsigned long long int threadID; MPL_thread_id_t tid; MPL_thread_self(&tid); threadID = (unsigned long long int)tid; fprintf(stderr,"<%llx> Using protocol %s for allgatherv on %u\n", threadID, my_md->name, (unsigned) comm_ptr->context_id); } MPIDI_Post_coll_t allgatherv_post; MPIDI_Context_post(MPIDI_Context[0], &allgatherv_post.state, MPIDI_Pami_post_wrapper, (void *)&allgatherv); MPIDI_Update_last_algorithm(comm_ptr, my_md->name); TRACE_ERR("Rank %d waiting on active %d\n", rank, allgatherv_active); MPID_PROGRESS_WAIT_WHILE(allgatherv_active); return PAMI_SUCCESS; } /* TODO: these need to be ordered by speed */ if(use_tree_reduce) { if(unlikely(verbose)) fprintf(stderr,"Using tree reduce allgatherv type %u.\n", selected_type); rc = MPIDO_Allgatherv_allreduce(sendbuf, sendcount, sendtype, recvbuf, recvcounts, buffer_sum, displs, recvtype, send_true_lb, recv_true_lb, send_size, recv_size, comm_ptr, mpierrno); MPIDI_Update_last_algorithm(comm_ptr, "ALLGATHERV_OPT_ALLREDUCE"); return rc; } if(use_bcast) { if(unlikely(verbose)) fprintf(stderr,"Using bcast allgatherv type %u.\n", selected_type); rc = MPIDO_Allgatherv_bcast(sendbuf, sendcount, sendtype, recvbuf, recvcounts, buffer_sum, displs, recvtype, send_true_lb, recv_true_lb, send_size, recv_size, comm_ptr, mpierrno); MPIDI_Update_last_algorithm(comm_ptr, "ALLGATHERV_OPT_BCAST"); return rc; } if(use_alltoall) { if(unlikely(verbose)) fprintf(stderr,"Using alltoall allgatherv type %u.\n", selected_type); rc = MPIDO_Allgatherv_alltoall(sendbuf, sendcount, sendtype, recvbuf, (int *)recvcounts, buffer_sum, displs, recvtype, send_true_lb, recv_true_lb, send_size, recv_size, comm_ptr, mpierrno); MPIDI_Update_last_algorithm(comm_ptr, "ALLGATHERV_OPT_ALLTOALL"); return rc; } if(unlikely(verbose)) fprintf(stderr,"Using MPICH 
allgatherv type %u.\n", selected_type); TRACE_ERR("Using MPICH for Allgatherv\n"); MPIDI_Update_last_algorithm(comm_ptr, "ALLGATHERV_MPICH"); return MPIR_Allgatherv(sendbuf, sendcount, sendtype, recvbuf, recvcounts, displs, recvtype, comm_ptr, mpierrno); }
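/*
 * Illustration (stand-alone sketch): the MPID_RECV_CONTINUOUS test in
 * MPIDO_Allgatherv above decides whether the receive layout is packed
 * end-to-end -- displs[0] must be 0 and each displacement must equal the
 * running sum of the preceding recvcounts -- because only then can the
 * glue protocols treat the allgatherv like a plain allgather.  The helper
 * name and the sample vectors below are hypothetical.
 */
#include <stdio.h>

static int recv_is_continuous(const int *recvcounts, const int *displs, int size)
{
  int sum = 0;
  if (displs[0] != 0)
    return 0;
  for (int i = 1; i < size; i++) {
    sum += recvcounts[i - 1];
    if (sum != displs[i])
      return 0; /* gap or overlap in the receive layout */
  }
  return 1;
}

int main(void)
{
  int counts[4] = { 2, 3, 1, 4 };
  int tight[4]  = { 0, 2, 5, 6 };  /* packed end-to-end -> continuous     */
  int gappy[4]  = { 0, 4, 8, 12 }; /* padded placements -> not continuous */
  printf("tight: %d\n", recv_is_continuous(counts, tight, 4)); /* prints 1 */
  printf("gappy: %d\n", recv_is_continuous(counts, gappy, 4)); /* prints 0 */
  return 0;
}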