/**
 * Gatherv with protocol selection: runs either a PAMI gatherv algorithm or
 * falls back to MPIR_Gatherv.
 *
 * The fallback to MPIR_Gatherv is taken when
 *   - sendtype (unless sendbuf is MPI_IN_PLACE) or recvtype cannot be
 *     converted by MPIDI_Datatype_to_pami(), or
 *   - the communicator's selected type is MPID_COLL_USE_MPICH, or
 *   - the chosen protocol must be queried and the query result has any bit
 *     set (after the nonlocal bit has been cleared).
 *
 * \param sendbuf    send buffer, or MPI_IN_PLACE
 * \param sendcount  number of sendtype elements contributed by this rank
 * \param sendtype   datatype of the send buffer
 * \param recvbuf    receive buffer
 * \param recvcounts per-rank receive counts
 * \param displs     per-rank displacements into recvbuf
 * \param recvtype   datatype of the receive buffer
 * \param root       rank that gathers the data
 * \param comm_ptr   communicator
 * \param mpierrno   forwarded to MPIR_Gatherv on the fallback paths only
 *
 * \return -1 after MPID_Abort when MPI_IN_PLACE is used without
 *         HAVE_PAMI_IN_PLACE, the MPIR_Gatherv result on a fallback path,
 *         0 after the PAMI collective has completed.
 */
int MPIDO_Gatherv(const void *sendbuf,
                  int sendcount,
                  MPI_Datatype sendtype,
                  void *recvbuf,
                  const int *recvcounts,
                  const int *displs,
                  MPI_Datatype recvtype,
                  int root,
                  MPID_Comm * comm_ptr,
                  int *mpierrno)
{
#ifndef HAVE_PAMI_IN_PLACE
  /* Without PAMI_IN_PLACE support an in-place gatherv cannot be expressed. */
  if (sendbuf == MPI_IN_PLACE)
  {
    MPID_Abort (NULL, 0, 1, "'MPI_IN_PLACE' requries support for `PAMI_IN_PLACE`");
    return -1;
  }
#endif
   TRACE_ERR("Entering MPIDO_Gatherv\n");
   int i;
   int contig ATTRIBUTE((unused)), rsize ATTRIBUTE((unused)), ssize ATTRIBUTE((unused));
   int pamidt = 1;                        /* stays 1 while both datatypes convert to PAMI types */
   MPID_Datatype *dt_ptr = NULL;
   MPI_Aint send_true_lb, recv_true_lb;
   char *sbuf, *rbuf;
   pami_type_t stype, rtype;
   int tmp;
   volatile unsigned gatherv_active = 1;  /* cookie handed to cb_gatherv; polled below */
   const int rank = comm_ptr->rank;
   const int size = comm_ptr->local_size;
#if ASSERT_LEVEL==0
   /* We can't afford the tracing in ndebug/performance libraries */
   const unsigned verbose = 0;
#else
   /* Verbose output is limited to rank 0. */
   const unsigned verbose = (MPIDI_Process.verbose >= MPIDI_VERBOSE_DETAILS_ALL) && (rank == 0);
#endif
   const struct MPIDI_Comm* const mpid = &(comm_ptr->mpid);
   const int selected_type = mpid->user_selected_type[PAMI_XFER_GATHERV_INT];

   /* Check for native PAMI types and MPI_IN_PLACE on sendbuf */
   /* MPI_IN_PLACE is a nonlocal decision. We will need a preallreduce if we ever have
    * multiple "good" gathervs that work on different counts for example */
   if((sendbuf != MPI_IN_PLACE) && (MPIDI_Datatype_to_pami(sendtype, &stype, -1, NULL, &tmp) != MPI_SUCCESS))
      pamidt = 0;
   if(MPIDI_Datatype_to_pami(recvtype, &rtype, -1, NULL, &tmp) != MPI_SUCCESS)
      pamidt = 0;

   if(pamidt == 0 || selected_type == MPID_COLL_USE_MPICH)
   {
      if(unlikely(verbose))
         fprintf(stderr,"Using MPICH gatherv algorithm\n");
      TRACE_ERR("GATHERV using MPICH\n");
      MPIDI_Update_last_algorithm(comm_ptr, "GATHERV_MPICH");
#if CUDA_AWARE_SUPPORT
      /* When CUDA-aware support is on, buffers reported as device buffers are
       * staged through host allocations around the MPIR_Gatherv call. */
      if(MPIDI_Process.cuda_aware_support_on)
      {
         MPI_Aint sdt_extent,rdt_extent;
         MPID_Datatype_get_extent_macro(sendtype, sdt_extent);
         MPID_Datatype_get_extent_macro(recvtype, rdt_extent);
         char *scbuf = NULL;
         char *rcbuf = NULL;
         int is_send_dev_buf = MPIDI_cuda_is_device_buf(sendbuf);
         /* Only the root's receive buffer is inspected. */
         int is_recv_dev_buf = (rank == root) ? MPIDI_cuda_is_device_buf(recvbuf) : 0;
         if(is_send_dev_buf)
         {
            /* Copy the device send buffer into a host staging buffer. */
            scbuf = MPL_malloc(sdt_extent * sendcount);
            cudaError_t cudaerr = CudaMemcpy(scbuf, sendbuf, sdt_extent * sendcount, cudaMemcpyDeviceToHost);
            if (cudaSuccess != cudaerr)
               fprintf(stderr, "cudaMemcpy failed: %s\n", CudaGetErrorString(cudaerr));
         }
         else
            scbuf = sendbuf;
         size_t rtotal_buf;
         if(is_recv_dev_buf)
         {
            /* Since displs can be non-contiguous, the staging buffer must
             * reach the largest displs[i]+recvcounts[i] over all ranks. */
            int highest_displs = displs[size - 1];
            int highest_recvcount = recvcounts[size - 1];
            for(i = 0; i < size; i++)
            {
               if(displs[i]+recvcounts[i] > highest_displs+highest_recvcount)
               {
                  highest_displs = displs[i];
                  highest_recvcount = recvcounts[i];
               }
            }
            rtotal_buf = (highest_displs+highest_recvcount)*rdt_extent;
            rcbuf = MPL_malloc(rtotal_buf);
            if(sendbuf == MPI_IN_PLACE)
            {
               /* In place: the current device contents are copied to the
                * host first so they are part of the staged receive buffer. */
               cudaError_t cudaerr = CudaMemcpy(rcbuf, recvbuf, rtotal_buf, cudaMemcpyDeviceToHost);
               if (cudaSuccess != cudaerr)
                  fprintf(stderr, "cudaMemcpy failed: %s\n", CudaGetErrorString(cudaerr));
            }
            else
               memset(rcbuf, 0, rtotal_buf);
         }
         else
            rcbuf = recvbuf;
         int cuda_res = MPIR_Gatherv(scbuf, sendcount, sendtype, rcbuf, recvcounts, displs, recvtype, root, comm_ptr, mpierrno);
         if(is_send_dev_buf)MPL_free(scbuf);
         if(is_recv_dev_buf)
         {
            /* Copy the whole staged host buffer back to the device buffer. */
            cudaError_t cudaerr = CudaMemcpy(recvbuf, rcbuf, rtotal_buf, cudaMemcpyHostToDevice);
            if (cudaSuccess != cudaerr)
               fprintf(stderr, "cudaMemcpy failed: %s\n", CudaGetErrorString(cudaerr));
            MPL_free(rcbuf);
         }
         return cuda_res;
      }
      else
#endif
      return MPIR_Gatherv(sendbuf, sendcount, sendtype, recvbuf, recvcounts, displs, recvtype, root, comm_ptr, mpierrno);
   }

   /* PAMI path: the receive buffer is offset by the receive type's true lower bound. */
   MPIDI_Datatype_get_info(1, recvtype, contig, rsize, dt_ptr, recv_true_lb);
   rbuf = (char *)recvbuf + recv_true_lb;
   sbuf = (void *) sendbuf;

   pami_xfer_t gatherv;

   gatherv.cb_done = cb_gatherv;
   gatherv.cookie = (void *)&gatherv_active;
   gatherv.cmd.xfer_gatherv_int.root = MPIDI_Task_to_endpoint(MPID_VCR_GET_LPID(comm_ptr->vcr, root), 0);
   gatherv.cmd.xfer_gatherv_int.rcvbuf = rbuf;
   gatherv.cmd.xfer_gatherv_int.rtype = rtype;
   gatherv.cmd.xfer_gatherv_int.rtypecounts = (int *) recvcounts;
   gatherv.cmd.xfer_gatherv_int.rdispls = (int *) displs;

   gatherv.cmd.xfer_gatherv_int.sndbuf = NULL;   /* placeholder; set from sbuf below */
   gatherv.cmd.xfer_gatherv_int.stype = stype;
   gatherv.cmd.xfer_gatherv_int.stypecount = sendcount;

   if(rank == root)
   {
      if(sendbuf == MPI_IN_PLACE)
      {
         /* In place at root: the send side is described with the receive
          * type and root's own receive count. */
         if(unlikely(verbose))
            fprintf(stderr,"gatherv MPI_IN_PLACE buffering\n");
         sbuf = PAMI_IN_PLACE;
         gatherv.cmd.xfer_gatherv_int.stype = rtype;
         gatherv.cmd.xfer_gatherv_int.stypecount = recvcounts[rank];
      }
      else
      {
         /* NOTE(review): the send true_lb offset is applied on the root only;
          * confirm that non-root ranks never need it on this path. */
         MPIDI_Datatype_get_info(1, sendtype, contig, ssize, dt_ptr, send_true_lb);
         sbuf = (char *)sbuf + send_true_lb;
      }
   }
   gatherv.cmd.xfer_gatherv_int.sndbuf = sbuf;

   pami_algorithm_t my_gatherv;
   const pami_metadata_t *my_md = (pami_metadata_t *)NULL;
   int queryreq = 0;

   /* Pick either the pre-selected optimized protocol or the user's choice. */
   if(selected_type == MPID_COLL_OPTIMIZED)
   {
      TRACE_ERR("Optimized gatherv %s was selected\n", mpid->opt_protocol_md[PAMI_XFER_GATHERV_INT][0].name);
      my_gatherv = mpid->opt_protocol[PAMI_XFER_GATHERV_INT][0];
      my_md = &mpid->opt_protocol_md[PAMI_XFER_GATHERV_INT][0];
      queryreq = mpid->must_query[PAMI_XFER_GATHERV_INT][0];
   }
   else
   {
      TRACE_ERR("Optimized gatherv %s was set by user\n", mpid->user_metadata[PAMI_XFER_GATHERV_INT].name);
      my_gatherv = mpid->user_selected[PAMI_XFER_GATHERV_INT];
      my_md = &mpid->user_metadata[PAMI_XFER_GATHERV_INT];
      queryreq = selected_type;
   }

   gatherv.algorithm = my_gatherv;

   if(unlikely(queryreq == MPID_COLL_ALWAYS_QUERY || queryreq == MPID_COLL_CHECK_FN_REQUIRED))
   {
      metadata_result_t result = {0};
      TRACE_ERR("querying gatherv protocol %s, type was %d\n", my_md->name, queryreq);
      if(my_md->check_fn == NULL)
      {
         /* process metadata bits */
         if((!my_md->check_correct.values.inplace) && (sendbuf == MPI_IN_PLACE))
            result.check.unspecified = 1;
         /* A rangeminmax (message size) check is deliberately not done here:
          * the original authors disabled it because the size is non-local
          * for gatherv. */
      }
      else /* calling the check fn is sufficient */
         result = my_md->check_fn(&gatherv);
      TRACE_ERR("bitmask: %#X\n", result.bitmask);
      result.check.nonlocal = 0; /* #warning REMOVE THIS WHEN IMPLEMENTED */
      if(result.bitmask)
      {
         /* Any remaining failure bit sends the call to the MPICH algorithm. */
         if(unlikely(verbose))
            fprintf(stderr,"Query failed for %s. Using MPICH gatherv.\n", my_md->name);
         MPIDI_Update_last_algorithm(comm_ptr, "GATHERV_MPICH");
         return MPIR_Gatherv(sendbuf, sendcount, sendtype, recvbuf, recvcounts, displs, recvtype, root, comm_ptr, mpierrno);
      }
      /* Flow control: when the per-communicator request counter reaches
       * zero it is reset and a barrier is run before posting. */
      if(my_md->check_correct.values.asyncflowctl && !(--(comm_ptr->mpid.num_requests)))
      {
         comm_ptr->mpid.num_requests = MPIDI_Process.optimized.num_requests;
         int tmpmpierrno;
         if(unlikely(verbose))
            fprintf(stderr,"Query barrier required for %s\n", my_md->name);
         MPIDO_Barrier(comm_ptr, &tmpmpierrno);
      }
   }

   MPIDI_Update_last_algorithm(comm_ptr, my_md->name);

   if(unlikely(verbose))
   {
      unsigned long long int threadID;
      MPL_thread_id_t tid;
      MPL_thread_self(&tid);
      threadID = (unsigned long long int)tid;
      fprintf(stderr,"<%llx> Using protocol %s for gatherv on %u\n", threadID, my_md->name, (unsigned) comm_ptr->context_id);
   }

   /* Post the collective and spin in the progress engine until
    * gatherv_active becomes zero (presumably cleared by cb_gatherv). */
   MPIDI_Post_coll_t gatherv_post;
   MPIDI_Context_post(MPIDI_Context[0], &gatherv_post.state, MPIDI_Pami_post_wrapper, (void *)&gatherv);

   TRACE_ERR("Waiting on active %d\n", gatherv_active);
   MPID_PROGRESS_WAIT_WHILE(gatherv_active);

   TRACE_ERR("Leaving MPIDO_Gatherv\n");
   return 0;
}
/**
 * Gatherv using the first PAMI gatherv algorithm registered on the
 * communicator (coll_algorithm[PAMI_XFER_GATHERV_INT][0][0]).
 *
 * Send data is always shipped as bytes; a non-contiguous send buffer is
 * packed into a temporary buffer first.  On the root, a receive type that
 * MPIDI_Datatype_to_pami() cannot convert is received as bytes into a packed
 * staging buffer (with byte counts and cumulative byte displacements) and
 * copied out into recvbuf once the collective has completed.
 *
 * When the collective-selection advisor is available and recommends an
 * external algorithm, the call is forwarded to MPIR_Gatherv instead.
 *
 * \param sendbuf    send buffer, or MPI_IN_PLACE
 * \param sendcount  number of sendtype elements contributed by this rank
 * \param sendtype   datatype of the send buffer
 * \param recvbuf    receive buffer
 * \param recvcounts per-rank receive counts (read on the root)
 * \param displs     per-rank displacements into recvbuf (read on the root)
 * \param recvtype   datatype of the receive buffer
 * \param root       rank that gathers the data
 * \param comm_ptr   communicator
 * \param mpierrno   forwarded to MPIR_Gatherv when that path is taken
 *
 * \return -1 after MPID_Abort when MPI_IN_PLACE is used without
 *         HAVE_PAMI_IN_PLACE, the MPIR_Gatherv result when the advisor
 *         selects an external algorithm, MPI_SUCCESS otherwise.
 */
int MPIDO_Gatherv_simple(const void *sendbuf,
                         int sendcount,
                         MPI_Datatype sendtype,
                         void *recvbuf,
                         const int *recvcounts,
                         const int *displs,
                         MPI_Datatype recvtype,
                         int root,
                         MPID_Comm * comm_ptr,
                         int *mpierrno)
{
#ifndef HAVE_PAMI_IN_PLACE
  if (sendbuf == MPI_IN_PLACE)
  {
    MPID_Abort (NULL, 0, 1, "'MPI_IN_PLACE' requries support for `PAMI_IN_PLACE`");
    return -1;
  }
#endif
   TRACE_ERR("Entering MPIDO_Gatherv_optimized\n");
   int snd_contig = 1, rcv_contig = 1;
   void *snd_noncontig_buff = NULL, *rcv_noncontig_buff = NULL;
   void *sbuf = NULL, *rbuf = NULL;
   int *rcounts = NULL;
   int *rdispls = NULL;
   int send_size = 0;
   int recv_size = 0;
   int rcvlen = 0;
   int totalrecvcount = 0;
   pami_type_t rtype = PAMI_TYPE_NULL;
   MPID_Segment segment;
   MPID_Datatype *data_ptr = NULL;
   /* True lower bounds are MPI_Aint values; keep them at full width. */
   MPI_Aint send_true_lb = 0, recv_true_lb = 0;
   int i, tmp;
   volatile unsigned gatherv_active = 1;   /* cookie handed to cb_gatherv; polled below */
   const int rank = comm_ptr->rank;
   const int size = comm_ptr->local_size;
#if ASSERT_LEVEL==0
   /* We can't afford the tracing in ndebug/performance libraries */
   const unsigned verbose = 0;
#else
   const unsigned verbose = (MPIDI_Process.verbose >= MPIDI_VERBOSE_DETAILS_ALL) && (rank == 0);
#endif
   const struct MPIDI_Comm* const mpid = &(comm_ptr->mpid);
   int recvok=PAMI_SUCCESS, recvcontinuous=0;

   /* Ask the collective-selection advisor once, before any temporary buffer
    * exists, so the early return below cannot leak.  The query does not
    * depend on the datatype information gathered afterwards. */
   if(MPIDI_Pamix_collsel_advise != NULL && mpid->collsel_fast_query != NULL)
   {
      advisor_algorithm_t advisor_algorithms[1];
      int num_algorithms = MPIDI_Pamix_collsel_advise(mpid->collsel_fast_query, PAMI_XFER_GATHERV_INT, 64, advisor_algorithms, 1);
      if(num_algorithms)
      {
         if(advisor_algorithms[0].algorithm_type == COLLSEL_EXTERNAL_ALGO)
         {
            return MPIR_Gatherv(sendbuf, sendcount, sendtype,
                                recvbuf, recvcounts, displs, recvtype,
                                root, comm_ptr, mpierrno);
         }
         else if(advisor_algorithms[0].metadata &&
                 advisor_algorithms[0].metadata->check_correct.values.asyncflowctl &&
                 !(--(comm_ptr->mpid.num_requests)))
         {
            /* Flow control: reset the counter and synchronise. */
            comm_ptr->mpid.num_requests = MPIDI_Process.optimized.num_requests;
            int tmpmpierrno;
            if(unlikely(verbose))
               fprintf(stderr,"Query barrier required for %s\n", advisor_algorithms[0].metadata->name);
            MPIDO_Barrier(comm_ptr, &tmpmpierrno);
         }
      }
   }

   if(sendbuf != MPI_IN_PLACE)
   {
      MPIDI_Datatype_get_info(sendcount, sendtype, snd_contig, send_size, data_ptr, send_true_lb);
      sbuf = (char *)sendbuf + send_true_lb;
      /* Nothing to pack for an empty contribution; this also avoids treating
       * a NULL result of a zero-byte allocation as out-of-memory. */
      if(!snd_contig && send_size > 0)
      {
         snd_noncontig_buff = MPL_malloc(send_size);
         sbuf = snd_noncontig_buff;
         if(snd_noncontig_buff == NULL)
         {
            MPID_Abort(NULL, MPI_ERR_NO_SPACE, 1,
                       "Fatal: Cannot allocate pack buffer");
         }
         DLOOP_Offset last = send_size;
         MPID_Segment_init(sendbuf, sendcount, sendtype, &segment, 0);
         MPID_Segment_pack(&segment, 0, &last, snd_noncontig_buff);
      }
   }
   else
   {
      MPIDI_Datatype_get_info(1, recvtype, rcv_contig, rcvlen, data_ptr, recv_true_lb);
   }

   pami_xfer_t gatherv;
   rbuf = (char *)recvbuf + recv_true_lb;
   rcounts = (int*)recvcounts;
   rdispls = (int*)displs;
   if(rank == root)
   {
      if((recvok = MPIDI_Datatype_to_pami(recvtype, &rtype, -1, NULL, &tmp)) != MPI_SUCCESS)
      {
         /* Receive as bytes into a packed staging buffer. */
         MPIDI_Datatype_get_info(1, recvtype, rcv_contig, rcvlen, data_ptr, recv_true_lb);
         totalrecvcount = recvcounts[0];
         recvcontinuous = displs[0] == 0? 1 : 0 ;
         /* One int per rank: the allocation size is in bytes. */
         rcounts = (int*)MPL_malloc((size_t)size * sizeof(int));
         rdispls = (int*)MPL_malloc((size_t)size * sizeof(int));
         if(rcounts == NULL || rdispls == NULL)
         {
            MPID_Abort(NULL, MPI_ERR_NO_SPACE, 1,
                       "Fatal: Cannot allocate pack buffer");
         }
         rdispls[0] = 0;
         rcounts[0] = rcvlen * recvcounts[0];
         for(i = 1; i < size; i++)
         {
            rdispls[i]= rcvlen * totalrecvcount;
            totalrecvcount += recvcounts[i];
            /* A single bulk copy-out is only possible when the user's
             * displacements are exactly the running sum of the counts. */
            if(displs[i] != (displs[i-1] + recvcounts[i-1]))
               recvcontinuous = 0;
            rcounts[i] = rcvlen * recvcounts[i];
         }
         recv_size = rcvlen * totalrecvcount;

         rcv_noncontig_buff = MPL_malloc(recv_size);
         rbuf = rcv_noncontig_buff;
         rtype = PAMI_TYPE_BYTE;
         if(rcv_noncontig_buff == NULL)
         {
            MPID_Abort(NULL, MPI_ERR_NO_SPACE, 1,
                       "Fatal: Cannot allocate pack buffer");
         }
         if(sendbuf == MPI_IN_PLACE)
         {
            /* In place: seed the staging buffer with the root's own
             * contribution, which already sits in recvbuf. */
            size_t extent;
            MPID_Datatype_get_extent_macro(recvtype,extent);
            MPIR_Localcopy((char *)recvbuf + displs[rank]*extent, recvcounts[rank], recvtype,
                           (char *)rcv_noncontig_buff + rdispls[rank], rcounts[rank],MPI_CHAR);
         }
      }
      if(sendbuf == MPI_IN_PLACE)
      {
         gatherv.cmd.xfer_gatherv_int.sndbuf = PAMI_IN_PLACE;
      }
      else
      {
         gatherv.cmd.xfer_gatherv_int.sndbuf = sbuf;
      }
      gatherv.cmd.xfer_gatherv_int.stype = PAMI_TYPE_BYTE;/* stype is ignored when sndbuf == PAMI_IN_PLACE */
      gatherv.cmd.xfer_gatherv_int.stypecount = send_size;
   }
   else
   {
      gatherv.cmd.xfer_gatherv_int.sndbuf = sbuf;
      gatherv.cmd.xfer_gatherv_int.stype = PAMI_TYPE_BYTE;
      gatherv.cmd.xfer_gatherv_int.stypecount = send_size;
   }

   gatherv.cb_done = cb_gatherv;
   gatherv.cookie = (void *)&gatherv_active;
   gatherv.cmd.xfer_gatherv_int.root = MPIDI_Task_to_endpoint(MPID_VCR_GET_LPID(comm_ptr->vcr, root), 0);
   gatherv.cmd.xfer_gatherv_int.rcvbuf = rbuf;
   gatherv.cmd.xfer_gatherv_int.rtype = rtype;
   gatherv.cmd.xfer_gatherv_int.rtypecounts = (int *) rcounts;
   gatherv.cmd.xfer_gatherv_int.rdispls = (int *) rdispls;

   const pami_metadata_t *my_gatherv_md;

   gatherv.algorithm = mpid->coll_algorithm[PAMI_XFER_GATHERV_INT][0][0];
   my_gatherv_md = &mpid->coll_metadata[PAMI_XFER_GATHERV_INT][0][0];

   MPIDI_Update_last_algorithm(comm_ptr, my_gatherv_md->name);

   MPIDI_Post_coll_t gatherv_post;
   TRACE_ERR("%s gatherv\n", MPIDI_Process.context_post.active>0?"Posting":"Invoking");
   MPIDI_Context_post(MPIDI_Context[0], &gatherv_post.state,
                      MPIDI_Pami_post_wrapper, (void *)&gatherv);
   TRACE_ERR("Gatherv %s\n", MPIDI_Process.context_post.active>0?"posted":"invoked");

   TRACE_ERR("Waiting on active %d\n", gatherv_active);
   MPID_PROGRESS_WAIT_WHILE(gatherv_active);

   /* Copy out and release only what was actually staged above.  Keying this
    * on the staging buffer (rather than on contiguity flags) guarantees that
    * the caller's recvcounts/displs arrays are never freed. */
   if(rcv_noncontig_buff != NULL)
   {
      if(recvcontinuous)
      {
         MPIR_Localcopy(rcv_noncontig_buff, recv_size, MPI_CHAR,
                        recvbuf, totalrecvcount, recvtype);
      }
      else
      {
         size_t extent;
         MPID_Datatype_get_extent_macro(recvtype,extent);
         for(i=0; i<size; ++i)
         {
            char* scbuf = (char*)rcv_noncontig_buff+ rdispls[i];
            char* rcbuf = (char*)recvbuf + displs[i]*extent;
            MPIR_Localcopy(scbuf, rcounts[i], MPI_CHAR,
                           rcbuf, recvcounts[i], recvtype);
            TRACE_ERR("Pack recv src  extent %zu, displ[%zu]=%zu, count[%zu]=%zu buf[%zu]=%u\n",
                      (size_t)extent, (size_t)i,(size_t)rdispls[i],(size_t)i,(size_t)rcounts[i],(size_t)rdispls[i], *(int*)scbuf);
            TRACE_ERR("Pack recv dest extent %zu, displ[%zu]=%zu, count[%zu]=%zu buf[%zu]=%u\n",
                      (size_t)extent, (size_t)i,(size_t)displs[i],(size_t)i,(size_t)recvcounts[i],(size_t)displs[i], *(int*)rcbuf);
         }
      }
      MPL_free(rcv_noncontig_buff);
      MPL_free(rcounts);
      MPL_free(rdispls);
   }
   if(snd_noncontig_buff != NULL)
      MPL_free(snd_noncontig_buff);

   TRACE_ERR("Leaving MPIDO_Gatherv_optimized\n");
   return MPI_SUCCESS;
}
/**
 * Scatterv using the first PAMI scatterv algorithm registered on the
 * communicator (coll_algorithm[PAMI_XFER_SCATTERV_INT][0][0]).
 *
 * Received data always arrives as bytes; a non-contiguous receive type is
 * received into a temporary buffer and copied into recvbuf afterwards.  On
 * the root, a send type that MPIDI_Datatype_to_pami() cannot convert is
 * packed rank by rank into a dense byte buffer, and the collective is driven
 * with byte counts and cumulative byte displacements into that buffer.
 *
 * When the collective-selection advisor is available and recommends an
 * external algorithm, the call is forwarded to MPIR_Scatterv instead.
 *
 * \param sendbuf    send buffer (read on the root)
 * \param sendcounts per-rank send counts (read on the root)
 * \param displs     per-rank displacements into sendbuf, in sendtype
 *                   elements (read on the root)
 * \param sendtype   datatype of the send buffer
 * \param recvbuf    receive buffer, or MPI_IN_PLACE on the root
 * \param recvcount  number of recvtype elements received by this rank
 * \param recvtype   datatype of the receive buffer
 * \param root       rank that scatters the data
 * \param comm_ptr   communicator
 * \param mpierrno   forwarded to MPIR_Scatterv when that path is taken
 *
 * \return -1 after MPID_Abort when MPI_IN_PLACE is used without
 *         HAVE_PAMI_IN_PLACE, the MPIR_Scatterv result when the advisor
 *         selects an external algorithm, MPI_SUCCESS otherwise.
 */
int MPIDO_Scatterv_simple(const void *sendbuf,
                          const int *sendcounts,
                          const int *displs,
                          MPI_Datatype sendtype,
                          void *recvbuf,
                          int recvcount,
                          MPI_Datatype recvtype,
                          int root,
                          MPID_Comm *comm_ptr,
                          int *mpierrno)
{
#ifndef HAVE_PAMI_IN_PLACE
  /* This function treats recvbuf == MPI_IN_PLACE as the in-place case (see
   * the PAMI_IN_PLACE assignment below), so that is what must be rejected
   * here.  The historical sendbuf test is kept for backward compatibility. */
  if (sendbuf == MPI_IN_PLACE || recvbuf == MPI_IN_PLACE)
  {
    MPID_Abort (NULL, 0, 1, "'MPI_IN_PLACE' requries support for `PAMI_IN_PLACE`");
    return -1;
  }
#endif
  int snd_contig = 1;
  int rcv_contig = 1;
  int send_size = 0, recv_size = 0;
  int ssize = 0;
  int have_send_info = 0;   /* set once the send type has been inspected on the root */
  MPID_Datatype *dt_ptr = NULL;
  MPI_Aint send_true_lb=0, recv_true_lb=0;
  void *snd_noncontig_buff = NULL, *rcv_noncontig_buff = NULL;
  void *sbuf = NULL, *rbuf = NULL;
  int *sdispls = NULL, *scounts = NULL;
  int tmp, i;
  pami_type_t stype = PAMI_TYPE_NULL;
  const int rank = comm_ptr->rank;
  const int size = comm_ptr->local_size;
  const struct MPIDI_Comm* const mpid = &(comm_ptr->mpid);

  /* Ask the collective-selection advisor exactly once per call on every
   * rank.  Running it once on non-root ranks but twice on the root would
   * decrement num_requests at different rates, so the flow-control barrier
   * would be entered at different calls on different ranks. */
  if(MPIDI_Pamix_collsel_advise != NULL && mpid->collsel_fast_query != NULL)
  {
    advisor_algorithm_t advisor_algorithms[1];
    int num_algorithms = MPIDI_Pamix_collsel_advise(mpid->collsel_fast_query, PAMI_XFER_SCATTERV_INT, 64, advisor_algorithms, 1);
    if(num_algorithms)
    {
      if(advisor_algorithms[0].algorithm_type == COLLSEL_EXTERNAL_ALGO)
      {
        return MPIR_Scatterv(sendbuf, sendcounts, displs, sendtype,
                             recvbuf, recvcount, recvtype,
                             root, comm_ptr, mpierrno);
      }
      else if(advisor_algorithms[0].metadata &&
              advisor_algorithms[0].metadata->check_correct.values.asyncflowctl &&
              !(--(comm_ptr->mpid.num_requests)))
      {
        /* Flow control: reset the counter and synchronise. */
        comm_ptr->mpid.num_requests = MPIDI_Process.optimized.num_requests;
        int tmpmpierrno;
        MPIDO_Barrier(comm_ptr, &tmpmpierrno);
      }
    }
  }

  if (rank == root && sendtype != MPI_DATATYPE_NULL && sendcounts[0] >= 0)
  {
    MPIDI_Datatype_get_info(1, sendtype, snd_contig, ssize, dt_ptr, send_true_lb);
    have_send_info = 1;
  }

  if (recvtype != MPI_DATATYPE_NULL && recvcount >= 0)
  {
    MPIDI_Datatype_get_info(recvcount, recvtype, rcv_contig, recv_size, dt_ptr, recv_true_lb);
  }

  pami_xfer_t scatterv;
  const pami_metadata_t *my_scatterv_md;
  volatile unsigned scatterv_active = 1;   /* cookie handed to cb_scatterv; polled below */

  sbuf = (char *)sendbuf + send_true_lb;
  rbuf = (char *)recvbuf + recv_true_lb;
  scounts = (int*)sendcounts;
  sdispls = (int*)displs;
  if(rank == root)
  {
    if(MPIDI_Datatype_to_pami(sendtype, &stype, -1, NULL, &tmp) != MPI_SUCCESS && have_send_info)
    {
      /* No native PAMI type (contiguous or not): send bytes from a dense
       * staging buffer.  Counts become byte counts and displacements become
       * the running byte offset inside the staging buffer. */
      MPI_Aint send_extent;
      MPID_Datatype_get_extent_macro(sendtype, send_extent);

      /* One int per rank: the allocation size is in bytes. */
      scounts = (int*)MPIU_Malloc((size_t)size * sizeof(int));
      sdispls = (int*)MPIU_Malloc((size_t)size * sizeof(int));
      if(scounts == NULL || sdispls == NULL)
      {
        MPID_Abort(NULL, MPI_ERR_NO_SPACE, 1,
                   "Fatal: Cannot allocate pack buffer");
      }
      for(i = 0; i < size; i++)
      {
        scounts[i] = ssize * sendcounts[i];
        sdispls[i] = send_size;
        send_size += scounts[i];
      }
      /* Always allocate at least one byte so that a non-NULL pointer marks
       * "staging in use" for the cleanup below. */
      snd_noncontig_buff = MPIU_Malloc(send_size > 0 ? (size_t)send_size : 1);
      if(snd_noncontig_buff == NULL)
      {
        MPID_Abort(NULL, MPI_ERR_NO_SPACE, 1,
                   "Fatal: Cannot allocate pack buffer");
      }
      /* Pack each rank's segment from where the caller placed it
       * (displs[i] elements into sendbuf), so gaps or reordering in displs
       * are honoured. */
      for(i = 0; i < size; i++)
      {
        if(scounts[i] > 0)
        {
          MPIR_Localcopy((char *)sendbuf + displs[i]*send_extent, sendcounts[i], sendtype,
                         (char *)snd_noncontig_buff + sdispls[i], scounts[i], MPI_CHAR);
        }
      }
      sbuf = snd_noncontig_buff;
      stype = PAMI_TYPE_BYTE;
    }
    if(recvbuf == MPI_IN_PLACE)
    {
      rbuf = PAMI_IN_PLACE;
    }
  }

  if(recvbuf != MPI_IN_PLACE)
  {
    /* A zero-byte receive needs no unpack buffer. */
    if (!rcv_contig && recv_size > 0)
    {
      rcv_noncontig_buff = MPIU_Malloc(recv_size);
      rbuf = rcv_noncontig_buff;
      if(rcv_noncontig_buff == NULL)
      {
        MPID_Abort(NULL, MPI_ERR_NO_SPACE, 1,
                   "Fatal: Cannot allocate pack buffer");
      }
    }
  }

  scatterv.cb_done = cb_scatterv;
  scatterv.cookie = (void *)&scatterv_active;
  scatterv.cmd.xfer_scatterv_int.root = MPIDI_Task_to_endpoint(MPID_VCR_GET_LPID(comm_ptr->vcr, root), 0);

  scatterv.algorithm = mpid->coll_algorithm[PAMI_XFER_SCATTERV_INT][0][0];
  my_scatterv_md = &mpid->coll_metadata[PAMI_XFER_SCATTERV_INT][0][0];

  scatterv.cmd.xfer_scatterv_int.rcvbuf = rbuf;
  scatterv.cmd.xfer_scatterv_int.sndbuf = sbuf;
  scatterv.cmd.xfer_scatterv_int.stype = stype;
  scatterv.cmd.xfer_scatterv_int.rtype = PAMI_TYPE_BYTE;/* rtype is ignored when rcvbuf == PAMI_IN_PLACE */
  scatterv.cmd.xfer_scatterv_int.stypecounts = (int *) scounts;
  scatterv.cmd.xfer_scatterv_int.rtypecount = recv_size;
  scatterv.cmd.xfer_scatterv_int.sdispls = (int *) sdispls;

  MPIDI_Update_last_algorithm(comm_ptr, my_scatterv_md->name);

  MPIDI_Post_coll_t scatterv_post;
  TRACE_ERR("%s scatterv\n", MPIDI_Process.context_post.active>0?"Posting":"Invoking");
  MPIDI_Context_post(MPIDI_Context[0], &scatterv_post.state,
                     MPIDI_Pami_post_wrapper, (void *)&scatterv);

  TRACE_ERR("Waiting on active %d\n", scatterv_active);
  MPID_PROGRESS_WAIT_WHILE(scatterv_active);

  /* Unpack and release only what was actually allocated; contiguity flags
   * alone do not say whether a temporary buffer exists (e.g. in place). */
  if(rcv_noncontig_buff != NULL)
  {
    MPIR_Localcopy(rcv_noncontig_buff, recv_size, MPI_CHAR,
                   recvbuf, recvcount, recvtype);
    MPIU_Free(rcv_noncontig_buff);
  }
  if(snd_noncontig_buff != NULL)
  {
    MPIU_Free(snd_noncontig_buff);
    MPIU_Free(scounts);
    MPIU_Free(sdispls);
  }

  TRACE_ERR("Leaving MPIDO_Scatterv_optimized\n");
  return MPI_SUCCESS;
}
/**
 * Broadcast with protocol selection: runs either a PAMI broadcast protocol
 * (data is always sent as bytes) or falls back to MPIR_Bcast_intra.
 *
 * Messages larger than BCAST_LIMIT bytes are split into several recursive
 * broadcasts of at most BCAST_LIMIT bytes each.  Non-contiguous data is
 * packed into a temporary buffer on the root and unpacked on the other
 * ranks.  The fallback to MPIR_Bcast_intra is taken when the selected type
 * is MPID_COLL_USE_MPICH, the message is empty, the size exceeds a non-zero
 * cutoff_size[PAMI_XFER_BROADCAST][1], or a required protocol query fails.
 *
 * \param buffer    data buffer (source on the root, destination elsewhere)
 * \param count     number of datatype elements
 * \param datatype  element datatype
 * \param root      broadcasting rank
 * \param comm_ptr  communicator
 * \param mpierrno  forwarded to MPIR_Bcast_intra and recursive calls
 *
 * \return the MPIR_Bcast_intra result on a fallback path, the first
 *         non-success result of a chunked sub-broadcast, 0 otherwise.
 */
int MPIDO_Bcast(void *buffer,
                int count,
                MPI_Datatype datatype,
                int root,
                MPID_Comm *comm_ptr,
                int *mpierrno)
{
   TRACE_ERR("in mpido_bcast\n");
   const size_t BCAST_LIMIT = 0x40000000;
   int data_contig, rc;
   void *data_buffer = NULL,
        *noncontig_buff = NULL;
   volatile unsigned active = 1;   /* cookie handed to cb_bcast; polled below */
   MPI_Aint data_true_lb = 0;
   MPID_Datatype *data_ptr;
   MPID_Segment segment;
   MPIDI_Post_coll_t bcast_post;
   const struct MPIDI_Comm* const mpid = &(comm_ptr->mpid);
   const int rank = comm_ptr->rank;
#if ASSERT_LEVEL==0
   /* We can't afford the tracing in ndebug/performance libraries */
   const unsigned verbose = 0;
#else
   const unsigned verbose = (MPIDI_Process.verbose >= MPIDI_VERBOSE_DETAILS_ALL) && (rank == 0);
#endif
   const int selected_type = mpid->user_selected_type[PAMI_XFER_BROADCAST];

   /* Must calculate data_size based on count=1 in case it's total size is > integer */
   int data_size_one;
   MPIDI_Datatype_get_info(1, datatype,
                           data_contig, data_size_one, data_ptr, data_true_lb);
   /* do this calculation once and use twice */
   const size_t data_size_sz = (size_t)data_size_one*(size_t)count;
   if(unlikely(verbose))
      fprintf(stderr,"bcast count %d, size %d (%#zX), root %d, buffer %p\n",
              count,data_size_one, (size_t)data_size_one*(size_t)count, root,buffer);
   if(unlikely( data_size_sz > BCAST_LIMIT) )
   {
      void *new_buffer=buffer;
      int c, new_count = (int)BCAST_LIMIT/data_size_one;
      MPI_Aint data_extent;
      MPID_assert(new_count > 0);
      /* Consecutive elements in the user buffer are one extent apart, which
       * differs from the packed size for non-contiguous or resized types. */
      MPID_Datatype_get_extent_macro(datatype, data_extent);

      for(c=1; ((size_t)c*(size_t)new_count) <= (size_t)count; ++c)
      {
         if ((rc = MPIDO_Bcast(new_buffer,
                               new_count,
                               datatype,
                               root,
                               comm_ptr,
                               mpierrno)) != MPI_SUCCESS)
            return rc;
         new_buffer = (char*)new_buffer + data_extent*(MPI_Aint)new_count;
      }
      new_count = count % new_count; /* 0 is ok, just returns no-op */
      return MPIDO_Bcast(new_buffer,
                         new_count,
                         datatype,
                         root,
                         comm_ptr,
                         mpierrno);
   }

   /* Must use data_size based on count for byte bcast processing.
      Previously calculated as a size_t but large data_sizes were
      handled above so this cast to int should be fine here.
   */
   const int data_size = (int)data_size_sz;

   if(selected_type == MPID_COLL_USE_MPICH || data_size == 0)
   {
      if(unlikely(verbose))
         fprintf(stderr,"Using MPICH bcast algorithm\n");
      MPIDI_Update_last_algorithm(comm_ptr,"BCAST_MPICH");
      return MPIR_Bcast_intra(buffer, count, datatype, root, comm_ptr, mpierrno);
   }

   data_buffer = (char *)buffer + data_true_lb;

   if(!data_contig)
   {
      noncontig_buff = MPIU_Malloc(data_size);
      data_buffer = noncontig_buff;
      if(noncontig_buff == NULL)
      {
         MPID_Abort(NULL, MPI_ERR_NO_SPACE, 1,
                    "Fatal: Cannot allocate pack buffer");
      }
      if(rank == root)
      {
         /* Only the root has data to pack. */
         DLOOP_Offset last = data_size;
         MPID_Segment_init(buffer, count, datatype, &segment, 0);
         MPID_Segment_pack(&segment, 0, &last, noncontig_buff);
      }
   }

   pami_xfer_t bcast;
   pami_algorithm_t my_bcast;
   const pami_metadata_t *my_md = (pami_metadata_t *)NULL;
   int queryreq = 0;

   bcast.cb_done = cb_bcast;
   bcast.cookie = (void *)&active;
   bcast.cmd.xfer_broadcast.root = MPIDI_Task_to_endpoint(MPID_VCR_GET_LPID(comm_ptr->vcr, root), 0);
   bcast.algorithm = mpid->user_selected[PAMI_XFER_BROADCAST];
   bcast.cmd.xfer_broadcast.buf = data_buffer;
   bcast.cmd.xfer_broadcast.type = PAMI_TYPE_BYTE;
   /* Needs to be sizeof(type)*count since we are using bytes as
    * the generic type */
   bcast.cmd.xfer_broadcast.typecount = data_size;

   if(selected_type == MPID_COLL_OPTIMIZED)
   {
      TRACE_ERR("Optimized bcast (%s) and (%s) were pre-selected\n",
                mpid->opt_protocol_md[PAMI_XFER_BROADCAST][0].name,
                mpid->opt_protocol_md[PAMI_XFER_BROADCAST][1].name);

      if(mpid->cutoff_size[PAMI_XFER_BROADCAST][1] != 0)/* SSS: There is FCA cutoff (FCA only sets cutoff for [PAMI_XFER_BROADCAST][1]) */
      {
         if(data_size <= mpid->cutoff_size[PAMI_XFER_BROADCAST][1])
         {
            my_bcast = mpid->opt_protocol[PAMI_XFER_BROADCAST][1];
            my_md = &mpid->opt_protocol_md[PAMI_XFER_BROADCAST][1];
            queryreq = mpid->must_query[PAMI_XFER_BROADCAST][1];
         }
         else
         {
            /* The pack buffer is not used by the MPICH algorithm; release
             * it before leaving so this path does not leak. */
            if(noncontig_buff != NULL)
               MPIU_Free(noncontig_buff);
            return MPIR_Bcast_intra(buffer, count, datatype, root, comm_ptr, mpierrno);
         }
      }

      /* NOTE(review): both arms below assign the protocol again, so the
       * choice made in the cutoff block above is always overwritten;
       * confirm whether an `else` was intended. */
      if(data_size > mpid->cutoff_size[PAMI_XFER_BROADCAST][0])
      {
         my_bcast = mpid->opt_protocol[PAMI_XFER_BROADCAST][1];
         my_md = &mpid->opt_protocol_md[PAMI_XFER_BROADCAST][1];
         queryreq = mpid->must_query[PAMI_XFER_BROADCAST][1];
      }
      else
      {
         my_bcast = mpid->opt_protocol[PAMI_XFER_BROADCAST][0];
         my_md = &mpid->opt_protocol_md[PAMI_XFER_BROADCAST][0];
         queryreq = mpid->must_query[PAMI_XFER_BROADCAST][0];
      }
   }
   else
   {
      TRACE_ERR("Bcast (%s) was specified by user\n",
                mpid->user_metadata[PAMI_XFER_BROADCAST].name);
      my_bcast = mpid->user_selected[PAMI_XFER_BROADCAST];
      my_md = &mpid->user_metadata[PAMI_XFER_BROADCAST];
      queryreq = selected_type;
   }

   bcast.algorithm = my_bcast;

   if(unlikely(queryreq == MPID_COLL_ALWAYS_QUERY ||
               queryreq == MPID_COLL_CHECK_FN_REQUIRED))
   {
      metadata_result_t result = {0};
      TRACE_ERR("querying bcast protocol %s, type was: %d\n",
                my_md->name, queryreq);
      if(my_md->check_fn != NULL) /* calling the check fn is sufficient */
      {
         /* Store the verdict in the outer `result`; a second declaration
          * here would shadow it and the verdict would be lost. */
         result = my_md->check_fn(&bcast);
         result.check.nonlocal = 0; /* #warning REMOVE THIS WHEN IMPLEMENTED */
      }
      else /* no check_fn, manually look at the metadata fields */
      {
         TRACE_ERR("Optimzed selection line %d\n",__LINE__);
         /* Check if the message range if restricted */
         if(my_md->check_correct.values.rangeminmax)
         {
            if((my_md->range_lo <= data_size) &&
               (my_md->range_hi >= data_size))
               ; /* ok, algorithm selected */
            else
            {
               result.check.range = 1;
               if(unlikely(verbose))
               {
                  fprintf(stderr,"message size (%u) outside range (%zu<->%zu) for %s.\n",
                          data_size,
                          my_md->range_lo,
                          my_md->range_hi,
                          my_md->name);
               }
            }
         }
         /* \todo check the rest of the metadata */
      }
      TRACE_ERR("bitmask: %#X\n", result.bitmask);
      if(result.bitmask)
      {
         if(unlikely(verbose))
            fprintf(stderr,"Using MPICH bcast algorithm - query fn failed\n");
         MPIDI_Update_last_algorithm(comm_ptr,"BCAST_MPICH");
         /* Release the pack buffer before handing over to MPICH. */
         if(noncontig_buff != NULL)
            MPIU_Free(noncontig_buff);
         return MPIR_Bcast_intra(buffer, count, datatype, root, comm_ptr, mpierrno);
      }
      /* Flow control: when the per-communicator request counter reaches
       * zero it is reset and a barrier is run before posting. */
      if(my_md->check_correct.values.asyncflowctl && !(--(comm_ptr->mpid.num_requests)))
      {
         comm_ptr->mpid.num_requests = MPIDI_Process.optimized.num_requests;
         int tmpmpierrno;
         if(unlikely(verbose))
            fprintf(stderr,"Query barrier required for %s\n", my_md->name);
         MPIDO_Barrier(comm_ptr, &tmpmpierrno);
      }
   }

   if(unlikely(verbose))
   {
      unsigned long long int threadID;
      MPIU_Thread_id_t tid;
      MPIU_Thread_self(&tid);
      threadID = (unsigned long long int)tid;
      fprintf(stderr,"<%llx> Using protocol %s for bcast on %u\n",
              threadID,
              my_md->name,
              (unsigned) comm_ptr->context_id);
   }

   /* Post the collective and spin in the progress engine until `active`
    * becomes zero (presumably cleared by cb_bcast). */
   MPIDI_Context_post(MPIDI_Context[0], &bcast_post.state, MPIDI_Pami_post_wrapper, (void *)&bcast);
   MPIDI_Update_last_algorithm(comm_ptr, my_md->name);
   MPID_PROGRESS_WAIT_WHILE(active);
   TRACE_ERR("bcast done\n");

   if(!data_contig)
   {
      /* Non-root ranks unpack what they received; the root's data never left
       * its original buffer. */
      if(rank != root)
         MPIR_Localcopy(noncontig_buff, data_size, MPI_CHAR,
                        buffer, count, datatype);
      MPIU_Free(noncontig_buff);
   }

   TRACE_ERR("leaving bcast\n");
   return 0;
}
/**
 * Broadcast using the first PAMI broadcast algorithm registered on the
 * communicator (coll_algorithm[PAMI_XFER_BROADCAST][0][0]); data is always
 * sent as bytes.
 *
 * Non-contiguous data is packed into a temporary buffer on the root and
 * unpacked on the other ranks.  When the collective-selection advisor is
 * available and recommends an external algorithm, the call is forwarded to
 * MPIR_Bcast_intra instead.
 *
 * \param buffer    data buffer (source on the root, destination elsewhere)
 * \param count     number of datatype elements
 * \param datatype  element datatype
 * \param root      broadcasting rank
 * \param comm_ptr  communicator
 * \param mpierrno  forwarded to MPIR_Bcast_intra when that path is taken
 *
 * \return the MPIR_Bcast_intra result when the advisor selects an external
 *         algorithm, 0 otherwise.
 */
int MPIDO_Bcast_simple(void *buffer,
                       int count,
                       MPI_Datatype datatype,
                       int root,
                       MPID_Comm *comm_ptr,
                       int *mpierrno)
{
   TRACE_ERR("Entering MPIDO_Bcast_optimized\n");

   int data_contig;
   void *data_buffer = NULL,
        *noncontig_buff = NULL;
   volatile unsigned active = 1;   /* cookie handed to cb_bcast; polled below */
   MPI_Aint data_true_lb = 0;
   MPID_Datatype *data_ptr;
   MPID_Segment segment;
   MPIDI_Post_coll_t bcast_post;
   const struct MPIDI_Comm* const mpid = &(comm_ptr->mpid);
   const int rank = comm_ptr->rank;

   /* Must calculate data_size based on count=1 in case it's total size is > integer */
   int data_size_one;
   MPIDI_Datatype_get_info(1, datatype,
                           data_contig, data_size_one, data_ptr, data_true_lb);
   if(MPIDI_Pamix_collsel_advise != NULL && mpid->collsel_fast_query != NULL)
   {
      advisor_algorithm_t advisor_algorithms[1];
      int num_algorithms = MPIDI_Pamix_collsel_advise(mpid->collsel_fast_query, PAMI_XFER_BROADCAST, data_size_one * count, advisor_algorithms, 1);
      if(num_algorithms)
      {
         if(advisor_algorithms[0].algorithm_type == COLLSEL_EXTERNAL_ALGO)
         {
            return MPIR_Bcast_intra(buffer, count, datatype, root, comm_ptr, mpierrno);
         }
      }
   }

   /* NOTE(review): the product is narrowed to int; unlike MPIDO_Bcast there
    * is no chunking here for totals that do not fit - confirm callers keep
    * messages below INT_MAX bytes. */
   const int data_size = data_size_one*(size_t)count;

   data_buffer = (char *)buffer + data_true_lb;

   /* An empty message needs no pack buffer.  Skipping the allocation avoids
    * aborting when a zero-byte allocation legitimately returns NULL. */
   if(!data_contig && data_size > 0)
   {
      noncontig_buff = MPIU_Malloc(data_size);
      data_buffer = noncontig_buff;
      if(noncontig_buff == NULL)
      {
         MPID_Abort(NULL, MPI_ERR_NO_SPACE, 1,
                    "Fatal: Cannot allocate pack buffer");
      }
      if(rank == root)
      {
         /* Only the root has data to pack. */
         DLOOP_Offset last = data_size;
         MPID_Segment_init(buffer, count, datatype, &segment, 0);
         MPID_Segment_pack(&segment, 0, &last, noncontig_buff);
      }
   }

   pami_xfer_t bcast;
   const pami_metadata_t *my_bcast_md;

   bcast.cb_done = cb_bcast;
   bcast.cookie = (void *)&active;
   bcast.cmd.xfer_broadcast.root = MPIDI_Task_to_endpoint(MPID_VCR_GET_LPID(comm_ptr->vcr, root), 0);
   bcast.algorithm = mpid->coll_algorithm[PAMI_XFER_BROADCAST][0][0];
   bcast.cmd.xfer_broadcast.buf = data_buffer;
   bcast.cmd.xfer_broadcast.type = PAMI_TYPE_BYTE;
   /* Needs to be sizeof(type)*count since we are using bytes as
    * the generic type */
   bcast.cmd.xfer_broadcast.typecount = data_size;
   my_bcast_md = &mpid->coll_metadata[PAMI_XFER_BROADCAST][0][0];

   /* Post the collective and spin in the progress engine until `active`
    * becomes zero (presumably cleared by cb_bcast). */
   MPIDI_Context_post(MPIDI_Context[0], &bcast_post.state, MPIDI_Pami_post_wrapper, (void *)&bcast);
   MPIDI_Update_last_algorithm(comm_ptr, my_bcast_md->name);
   MPID_PROGRESS_WAIT_WHILE(active);
   TRACE_ERR("bcast done\n");

   /* Unpack and release only if a pack buffer was actually allocated. */
   if(noncontig_buff != NULL)
   {
      if(rank != root)
         MPIR_Localcopy(noncontig_buff, data_size, MPI_CHAR,
                        buffer, count, datatype);
      MPIU_Free(noncontig_buff);
   }

   TRACE_ERR("Exiting MPIDO_Bcast_optimized\n");
   return 0;
}
int MPIDO_Reduce(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, MPI_Op op, int root, MPID_Comm *comm_ptr, int *mpierrno) { #ifndef HAVE_PAMI_IN_PLACE if (sendbuf == MPI_IN_PLACE) { MPID_Abort (NULL, 0, 1, "'MPI_IN_PLACE' requries support for `PAMI_IN_PLACE`"); return -1; } #endif MPID_Datatype *dt_null = NULL; MPI_Aint true_lb = 0; int dt_contig ATTRIBUTE((unused)), tsize; int mu; char *sbuf, *rbuf; pami_data_function pop; pami_type_t pdt; int rc; int alg_selected = 0; const int rank = comm_ptr->rank; #if ASSERT_LEVEL==0 /* We can't afford the tracing in ndebug/performance libraries */ const unsigned verbose = 0; #else const unsigned verbose = (MPIDI_Process.verbose >= MPIDI_VERBOSE_DETAILS_ALL) && (rank == 0); #endif const struct MPIDI_Comm* const mpid = &(comm_ptr->mpid); const int selected_type = mpid->user_selected_type[PAMI_XFER_REDUCE]; rc = MPIDI_Datatype_to_pami(datatype, &pdt, op, &pop, &mu); if(unlikely(verbose)) fprintf(stderr,"reduce - rc %u, root %u, count %d, dt: %p, op: %p, mu: %u, selectedvar %u != %u (MPICH) sendbuf %p, recvbuf %p\n", rc, root, count, pdt, pop, mu, (unsigned)selected_type, MPID_COLL_USE_MPICH,sendbuf, recvbuf); pami_xfer_t reduce; pami_algorithm_t my_reduce=0; const pami_metadata_t *my_md = (pami_metadata_t *)NULL; int queryreq = 0; volatile unsigned reduce_active = 1; MPIDI_Datatype_get_info(count, datatype, dt_contig, tsize, dt_null, true_lb); rbuf = (char *)recvbuf + true_lb; sbuf = (char *)sendbuf + true_lb; if(sendbuf == MPI_IN_PLACE) { if(unlikely(verbose)) fprintf(stderr,"reduce MPI_IN_PLACE send buffering (%d,%d)\n",count,tsize); sbuf = PAMI_IN_PLACE; } reduce.cb_done = reduce_cb_done; reduce.cookie = (void *)&reduce_active; if(mpid->optreduce) /* GLUE_ALLREDUCE */ { char* tbuf = NULL; if(unlikely(verbose)) fprintf(stderr,"Using protocol GLUE_ALLREDUCE for reduce (%d,%d)\n",count,tsize); MPIDI_Update_last_algorithm(comm_ptr, "REDUCE_OPT_ALLREDUCE"); void *destbuf = recvbuf; if(rank != root) /* temp 
buffer for non-root destbuf */ { tbuf = destbuf = MPL_malloc(tsize); } /* Switch to comm->coll_fns->fn() */ MPIDO_Allreduce(sendbuf, destbuf, count, datatype, op, comm_ptr, mpierrno); if(tbuf) MPL_free(tbuf); return 0; } if(selected_type == MPID_COLL_USE_MPICH || rc != MPI_SUCCESS) { if(unlikely(verbose)) fprintf(stderr,"Using MPICH reduce algorithm\n"); #if CUDA_AWARE_SUPPORT if(MPIDI_Process.cuda_aware_support_on) { MPI_Aint dt_extent; MPID_Datatype_get_extent_macro(datatype, dt_extent); char *scbuf = NULL; char *rcbuf = NULL; int is_send_dev_buf = MPIDI_cuda_is_device_buf(sendbuf); int is_recv_dev_buf = MPIDI_cuda_is_device_buf(recvbuf); if(is_send_dev_buf) { scbuf = MPL_malloc(dt_extent * count); cudaError_t cudaerr = CudaMemcpy(scbuf, sendbuf, dt_extent * count, cudaMemcpyDeviceToHost); if (cudaSuccess != cudaerr) fprintf(stderr, "cudaMemcpy failed: %s\n", CudaGetErrorString(cudaerr)); } else scbuf = sendbuf; if(is_recv_dev_buf) { rcbuf = MPL_malloc(dt_extent * count); if(sendbuf == MPI_IN_PLACE) { cudaError_t cudaerr = CudaMemcpy(rcbuf, recvbuf, dt_extent * count, cudaMemcpyDeviceToHost); if (cudaSuccess != cudaerr) fprintf(stderr, "cudaMemcpy failed: %s\n", CudaGetErrorString(cudaerr)); } else memset(rcbuf, 0, dt_extent * count); } else rcbuf = recvbuf; int cuda_res = MPIR_Reduce(scbuf, rcbuf, count, datatype, op, root, comm_ptr, mpierrno); if(is_send_dev_buf)MPL_free(scbuf); if(is_recv_dev_buf) { cudaError_t cudaerr = CudaMemcpy(recvbuf, rcbuf, dt_extent * count, cudaMemcpyHostToDevice); if (cudaSuccess != cudaerr) fprintf(stderr, "cudaMemcpy failed: %s\n", CudaGetErrorString(cudaerr)); MPL_free(rcbuf); } return cuda_res; } else #endif return MPIR_Reduce(sendbuf, recvbuf, count, datatype, op, root, comm_ptr, mpierrno); } if(selected_type == MPID_COLL_OPTIMIZED) { if((mpid->cutoff_size[PAMI_XFER_REDUCE][0] == 0) || (mpid->cutoff_size[PAMI_XFER_REDUCE][0] >= tsize && mpid->cutoff_size[PAMI_XFER_REDUCE][0] > 0)) { TRACE_ERR("Optimized Reduce (%s) was 
pre-selected\n", mpid->opt_protocol_md[PAMI_XFER_REDUCE][0].name); my_reduce = mpid->opt_protocol[PAMI_XFER_REDUCE][0]; my_md = &mpid->opt_protocol_md[PAMI_XFER_REDUCE][0]; queryreq = mpid->must_query[PAMI_XFER_REDUCE][0]; } } else { TRACE_ERR("Optimized reduce (%s) was specified by user\n", mpid->user_metadata[PAMI_XFER_REDUCE].name); my_reduce = mpid->user_selected[PAMI_XFER_REDUCE]; my_md = &mpid->user_metadata[PAMI_XFER_REDUCE]; queryreq = selected_type; } reduce.algorithm = my_reduce; reduce.cmd.xfer_reduce.sndbuf = sbuf; reduce.cmd.xfer_reduce.rcvbuf = rbuf; reduce.cmd.xfer_reduce.stype = pdt; reduce.cmd.xfer_reduce.rtype = pdt; reduce.cmd.xfer_reduce.stypecount = count; reduce.cmd.xfer_reduce.rtypecount = count; reduce.cmd.xfer_reduce.op = pop; reduce.cmd.xfer_reduce.root = MPIDI_Task_to_endpoint(MPID_VCR_GET_LPID(comm_ptr->vcr, root), 0); if(unlikely(queryreq == MPID_COLL_ALWAYS_QUERY || queryreq == MPID_COLL_CHECK_FN_REQUIRED)) { metadata_result_t result = {0}; TRACE_ERR("Querying reduce protocol %s, type was %d\n", my_md->name, queryreq); if(my_md->check_fn == NULL) { /* process metadata bits */ if((!my_md->check_correct.values.inplace) && (sendbuf == MPI_IN_PLACE)) result.check.unspecified = 1; if(my_md->check_correct.values.rangeminmax) { MPI_Aint data_true_lb ATTRIBUTE((unused)); MPID_Datatype *data_ptr; int data_size, data_contig ATTRIBUTE((unused)); MPIDI_Datatype_get_info(count, datatype, data_contig, data_size, data_ptr, data_true_lb); if((my_md->range_lo <= data_size) && (my_md->range_hi >= data_size)) ; /* ok, algorithm selected */ else { result.check.range = 1; if(unlikely(verbose)) { fprintf(stderr,"message size (%u) outside range (%zu<->%zu) for %s.\n", data_size, my_md->range_lo, my_md->range_hi, my_md->name); } } } }