int MPIDO_Allgather(const void *sendbuf,
                    int sendcount,
                    MPI_Datatype sendtype,
                    void *recvbuf,
                    int recvcount,
                    MPI_Datatype recvtype,
                    MPID_Comm * comm_ptr,
                    int *mpierrno)
{
#ifndef HAVE_PAMI_IN_PLACE
  if (sendbuf == MPI_IN_PLACE)
  {
    MPID_Abort (NULL, 0, 1, "'MPI_IN_PLACE' requires support for `PAMI_IN_PLACE`");
    return -1;
  }
#endif
  /* *********************************
   * Check the nature of the buffers
   * ********************************* */
  const struct MPIDI_Comm* const mpid = &(comm_ptr->mpid);
  int config[6], i;
  MPID_Datatype * dt_null = NULL;
  MPI_Aint send_true_lb = 0;
  MPI_Aint recv_true_lb = 0;
  int comm_size = comm_ptr->local_size;
  size_t send_bytes = 0;
  size_t recv_bytes = 0;
  volatile unsigned allred_active = 1;
  volatile unsigned allgather_active = 1;
  pami_xfer_t allred;
  const int rank = comm_ptr->rank;
  int queryreq = 0;
#if ASSERT_LEVEL==0
  /* We can't afford the tracing in ndebug/performance libraries */
  const unsigned verbose = 0;
#else
  const unsigned verbose =
    (MPIDI_Process.verbose >= MPIDI_VERBOSE_DETAILS_ALL) && (rank == 0);
#endif
  const int selected_type = mpid->user_selected_type[PAMI_XFER_ALLGATHER];

  for (i = 0; i < 6; i++) config[i] = 1;
  const pami_metadata_t *my_md = (pami_metadata_t *)NULL;
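  /* Added note: the six config[] flags (send/recv contiguity, continuity,
   * buffer alignment, ...) start out 1 on every rank. The allreduce set up
   * below combines them with a bitwise AND (PAMI_DATA_BAND), so a glue
   * protocol is only kept when its preconditions hold on *every* rank. */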
  allred.cb_done = allred_cb_done;
  allred.cookie = (void *)&allred_active;
  /* Pick an algorithm that is guaranteed to work for the pre-allreduce */
  /* TODO: This needs selection for fast(er|est) allreduce protocol */
  allred.algorithm = mpid->coll_algorithm[PAMI_XFER_ALLREDUCE][0][0];
  allred.cmd.xfer_allreduce.sndbuf = (void *)config;
  allred.cmd.xfer_allreduce.stype = PAMI_TYPE_SIGNED_INT;
  allred.cmd.xfer_allreduce.rcvbuf = (void *)config;
  allred.cmd.xfer_allreduce.rtype = PAMI_TYPE_SIGNED_INT;
  allred.cmd.xfer_allreduce.stypecount = 6;
  allred.cmd.xfer_allreduce.rtypecount = 6;
  allred.cmd.xfer_allreduce.op = PAMI_DATA_BAND;

  char use_tree_reduce, use_alltoall, use_bcast, use_pami, use_opt;
  char *rbuf = NULL, *sbuf = NULL;
  const char * const allgathers = mpid->allgathers;
  use_alltoall = allgathers[2];
  use_tree_reduce = allgathers[0];
  use_bcast = allgathers[1];
  use_pami = (selected_type == MPID_COLL_USE_MPICH) ? 0 : 1;
  use_opt = use_alltoall || use_tree_reduce || use_bcast || use_pami;
  TRACE_ERR("flags before: b: %d a: %d t: %d p: %d\n",
            use_bcast, use_alltoall, use_tree_reduce, use_pami);

  if(!use_opt)
  {
    if(unlikely(verbose))
      fprintf(stderr, "Using MPICH allgather algorithm\n");
    TRACE_ERR("No options set/available; using MPICH for allgather\n");
    MPIDI_Update_last_algorithm(comm_ptr, "ALLGATHER_MPICH");
#if CUDA_AWARE_SUPPORT
    if(MPIDI_Process.cuda_aware_support_on)
    {
      MPI_Aint sdt_extent, rdt_extent;
      MPID_Datatype_get_extent_macro(sendtype, sdt_extent);
      MPID_Datatype_get_extent_macro(recvtype, rdt_extent);
      char *scbuf = NULL;
      char *rcbuf = NULL;
      int is_send_dev_buf = MPIDI_cuda_is_device_buf(sendbuf);
      int is_recv_dev_buf = MPIDI_cuda_is_device_buf(recvbuf);
      if(is_send_dev_buf)
      {
        scbuf = MPL_malloc(sdt_extent * sendcount);
        cudaError_t cudaerr = cudaMemcpy(scbuf, sendbuf, sdt_extent * sendcount,
                                         cudaMemcpyDeviceToHost);
        if (cudaSuccess != cudaerr)
          fprintf(stderr, "cudaMemcpy failed: %s\n", cudaGetErrorString(cudaerr));
      }
      else
        scbuf = (char *)sendbuf;
      if(is_recv_dev_buf)
      {
        rcbuf = MPL_malloc(rdt_extent * recvcount);
        if(sendbuf == MPI_IN_PLACE)
        {
          cudaError_t cudaerr = cudaMemcpy(rcbuf, recvbuf, rdt_extent * recvcount,
                                           cudaMemcpyDeviceToHost);
          if (cudaSuccess != cudaerr)
            fprintf(stderr, "cudaMemcpy failed: %s\n", cudaGetErrorString(cudaerr));
        }
        else
          memset(rcbuf, 0, rdt_extent * recvcount);
      }
      else
        rcbuf = recvbuf;
      int cuda_res = MPIR_Allgather(scbuf, sendcount, sendtype,
                                    rcbuf, recvcount, recvtype,
                                    comm_ptr, mpierrno);
      if(is_send_dev_buf) MPL_free(scbuf);
      if(is_recv_dev_buf)
      {
        cudaError_t cudaerr = cudaMemcpy(recvbuf, rcbuf, rdt_extent * recvcount,
                                         cudaMemcpyHostToDevice);
        if (cudaSuccess != cudaerr)
          fprintf(stderr, "cudaMemcpy failed: %s\n", cudaGetErrorString(cudaerr));
        MPL_free(rcbuf);
      }
      return cuda_res;
    }
    else
#endif
    return MPIR_Allgather(sendbuf, sendcount, sendtype,
                          recvbuf, recvcount, recvtype,
                          comm_ptr, mpierrno);
  }

  if ((sendcount < 1 && sendbuf != MPI_IN_PLACE) || recvcount < 1)
    return MPI_SUCCESS;

  /* Gather datatype information */
  MPIDI_Datatype_get_info(recvcount, recvtype, config[MPID_RECV_CONTIG],
                          recv_bytes, dt_null, recv_true_lb);
  send_bytes = recv_bytes;
  rbuf = (char *)recvbuf + recv_true_lb;
  sbuf = PAMI_IN_PLACE;
  if(sendbuf != MPI_IN_PLACE)
  {
    MPIDI_Datatype_get_info(sendcount, sendtype, config[MPID_SEND_CONTIG],
                            send_bytes, dt_null, send_true_lb);
    sbuf = (char *)sendbuf + send_true_lb;
  }
  else if(unlikely(verbose))
    fprintf(stderr, "allgather MPI_IN_PLACE buffering\n");

  /* verify everyone's datatype contiguity */
  /* Check buffer alignment now, since we're pre-allreducing anyway */
  /* Only do this if one of the glue protocols is likely to be used */
  if(use_alltoall || use_tree_reduce || use_bcast)
  {
    config[MPID_ALIGNEDBUFFER] =
      !((long)sendbuf & 0x0F) && !((long)recvbuf & 0x0F);
    /* #warning need to determine best allreduce for short messages */
    if(mpid->preallreduces[MPID_ALLGATHER_PREALLREDUCE])
    {
      TRACE_ERR("Preallreducing in allgather\n");
      MPIDI_Post_coll_t allred_post;
      MPIDI_Context_post(MPIDI_Context[0], &allred_post.state,
                         MPIDI_Pami_post_wrapper, (void *)&allred);
      MPID_PROGRESS_WAIT_WHILE(allred_active);
    }
    use_alltoall = allgathers[2] &&
                   config[MPID_RECV_CONTIG] && config[MPID_SEND_CONTIG];
    /* Note: some of the glue protocols use recv_bytes*comm_size rather than
     * recv_bytes so we use that for comparison here, plus we pass that in
     * to those protocols. */
    use_tree_reduce = allgathers[0] &&
                      config[MPID_RECV_CONTIG] && config[MPID_SEND_CONTIG] &&
                      config[MPID_RECV_CONTINUOUS] &&
                      (recv_bytes * comm_size % sizeof(unsigned)) == 0;
    use_bcast = allgathers[1];
    TRACE_ERR("flags after: b: %d a: %d t: %d p: %d\n",
              use_bcast, use_alltoall, use_tree_reduce, use_pami);
  }
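  /* Added note: from here on, selection order is (1) a native PAMI allgather
   * protocol if one was optimized or user-selected, then (2) the glue
   * protocols built on allreduce, alltoall, or bcast, and finally (3) the
   * MPICH fallback at the bottom of the function. */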
  if(use_pami)
  {
    TRACE_ERR("Using PAMI-level allgather protocol\n");
    pami_xfer_t allgather;
    allgather.cb_done = allgather_cb_done;
    allgather.cookie = (void *)&allgather_active;
    allgather.cmd.xfer_allgather.rcvbuf = rbuf;
    allgather.cmd.xfer_allgather.sndbuf = sbuf;
    allgather.cmd.xfer_allgather.stype = PAMI_TYPE_BYTE;
    allgather.cmd.xfer_allgather.rtype = PAMI_TYPE_BYTE;
    allgather.cmd.xfer_allgather.stypecount = send_bytes;
    allgather.cmd.xfer_allgather.rtypecount = recv_bytes;
    if(selected_type == MPID_COLL_OPTIMIZED)
    {
      if((mpid->cutoff_size[PAMI_XFER_ALLGATHER][0] == 0) ||
         (mpid->cutoff_size[PAMI_XFER_ALLGATHER][0] > 0 &&
          mpid->cutoff_size[PAMI_XFER_ALLGATHER][0] >= send_bytes))
      {
        allgather.algorithm = mpid->opt_protocol[PAMI_XFER_ALLGATHER][0];
        my_md = &mpid->opt_protocol_md[PAMI_XFER_ALLGATHER][0];
        queryreq = mpid->must_query[PAMI_XFER_ALLGATHER][0];
      }
      else
      {
        return MPIR_Allgather(sendbuf, sendcount, sendtype,
                              recvbuf, recvcount, recvtype,
                              comm_ptr, mpierrno);
      }
    }
    else
    {
      allgather.algorithm = mpid->user_selected[PAMI_XFER_ALLGATHER];
      my_md = &mpid->user_metadata[PAMI_XFER_ALLGATHER];
      queryreq = selected_type;
    }

    if(unlikely(queryreq == MPID_COLL_ALWAYS_QUERY ||
                queryreq == MPID_COLL_CHECK_FN_REQUIRED))
    {
      metadata_result_t result = {0};
      TRACE_ERR("Querying allgather protocol %s, type was: %d\n",
                my_md->name, selected_type);
      if(my_md->check_fn == NULL)
      {
        /* process metadata bits */
        if((!my_md->check_correct.values.inplace) && (sendbuf == MPI_IN_PLACE))
          result.check.unspecified = 1;
        if(my_md->check_correct.values.rangeminmax)
        {
          if((my_md->range_lo <= recv_bytes) && (my_md->range_hi >= recv_bytes))
            ; /* ok, algorithm selected */
          else
          {
            result.check.range = 1;
            if(unlikely(verbose))
            {
              fprintf(stderr,
                      "message size (%zu) outside range (%zu<->%zu) for %s.\n",
                      recv_bytes, my_md->range_lo, my_md->range_hi, my_md->name);
            }
          }
        }
      }
      else /* calling the check fn is sufficient */
        result = my_md->check_fn(&allgather);
      TRACE_ERR("bitmask: %#X\n", result.bitmask);
      result.check.nonlocal = 0; /* #warning REMOVE THIS WHEN IMPLEMENTED */
      if(result.bitmask)
      {
        if(unlikely(verbose))
          fprintf(stderr, "Query failed for %s. Using MPICH allgather\n",
                  my_md->name);
        MPIDI_Update_last_algorithm(comm_ptr, "ALLGATHER_MPICH");
        return MPIR_Allgather(sendbuf, sendcount, sendtype,
                              recvbuf, recvcount, recvtype,
                              comm_ptr, mpierrno);
      }
      if(my_md->check_correct.values.asyncflowctl &&
         !(--(comm_ptr->mpid.num_requests)))
      {
        comm_ptr->mpid.num_requests = MPIDI_Process.optimized.num_requests;
        int tmpmpierrno;
        if(unlikely(verbose))
          fprintf(stderr, "Query barrier required for %s\n", my_md->name);
        MPIDO_Barrier(comm_ptr, &tmpmpierrno);
      }
    }

    if(unlikely(verbose))
    {
      unsigned long long int threadID;
      MPL_thread_id_t tid;
      MPL_thread_self(&tid);
      threadID = (unsigned long long int)tid;
      fprintf(stderr, "<%llx> Using protocol %s for allgather on %u\n",
              threadID, my_md->name, (unsigned) comm_ptr->context_id);
    }
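    /* Added note: the collective is posted to context 0 and runs
     * asynchronously; allgather_cb_done flips allgather_active to 0, and
     * MPID_PROGRESS_WAIT_WHILE spins the progress engine until it does. */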
    TRACE_ERR("Calling PAMI_Collective with allgather structure\n");
    MPIDI_Post_coll_t allgather_post;
    MPIDI_Context_post(MPIDI_Context[0], &allgather_post.state,
                       MPIDI_Pami_post_wrapper, (void *)&allgather);
    MPIDI_Update_last_algorithm(comm_ptr, my_md->name);
    MPID_PROGRESS_WAIT_WHILE(allgather_active);
    TRACE_ERR("Allgather done\n");
    return PAMI_SUCCESS;
  }

  if(use_tree_reduce)
  {
    if(unlikely(verbose))
      fprintf(stderr, "Using protocol GLUE_ALLREDUCE for allgather\n");
    TRACE_ERR("Using allgather via allreduce\n");
    MPIDI_Update_last_algorithm(comm_ptr, "ALLGATHER_OPT_ALLREDUCE");
    return MPIDO_Allgather_allreduce(sendbuf, sendcount, sendtype,
                                     recvbuf, recvcount, recvtype,
                                     send_true_lb, recv_true_lb,
                                     send_bytes, recv_bytes * comm_size,
                                     comm_ptr, mpierrno);
  }

  if(use_alltoall)
  {
    if(unlikely(verbose))
      fprintf(stderr, "Using protocol GLUE_ALLTOALL for allgather\n");
    TRACE_ERR("Using allgather via alltoall\n");
    MPIDI_Update_last_algorithm(comm_ptr, "ALLGATHER_OPT_ALLTOALL");
    return MPIDO_Allgather_alltoall(sendbuf, sendcount, sendtype,
                                    recvbuf, recvcount, recvtype,
                                    send_true_lb, recv_true_lb,
                                    send_bytes, recv_bytes * comm_size,
                                    comm_ptr, mpierrno);
  }

  if(use_bcast)
  {
    if(unlikely(verbose))
      fprintf(stderr, "Using protocol GLUE_BCAST for allgather\n");
    TRACE_ERR("Using allgather via bcast\n");
    MPIDI_Update_last_algorithm(comm_ptr, "ALLGATHER_OPT_BCAST");
    return MPIDO_Allgather_bcast(sendbuf, sendcount, sendtype,
                                 recvbuf, recvcount, recvtype,
                                 send_true_lb, recv_true_lb,
                                 send_bytes, recv_bytes * comm_size,
                                 comm_ptr, mpierrno);
  }

  /* Nothing used yet; dump to MPICH */
  if(unlikely(verbose))
    fprintf(stderr, "Using MPICH allgather algorithm\n");
  TRACE_ERR("Using allgather via mpich\n");
  MPIDI_Update_last_algorithm(comm_ptr, "ALLGATHER_MPICH");
  return MPIR_Allgather(sendbuf, sendcount, sendtype,
                        recvbuf, recvcount, recvtype,
                        comm_ptr, mpierrno);
}
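/* Added illustration (hypothetical helper, not part of this module): a
 * minimal sketch of the idea behind the GLUE_ALLREDUCE path above, written
 * with plain MPI calls. Each rank clears the receive buffer except its own
 * slot; a bitwise-OR allreduce then superimposes the slots, acting as a copy
 * because every other contribution is zero. Assumes contiguous buffers and a
 * total size divisible by sizeof(unsigned) -- the same conditions the
 * use_tree_reduce flag checks -- and that <mpi.h> and <string.h> are
 * available. The real glue protocol lives in MPIDO_Allgather_allreduce. */
static int allgather_via_allreduce_sketch(const void *sendbuf, size_t bytes,
                                          void *recvbuf, MPI_Comm comm)
{
  int rank, size;
  MPI_Comm_rank(comm, &rank);
  MPI_Comm_size(comm, &size);
  memset(recvbuf, 0, bytes * size);                     /* zero every slot  */
  memcpy((char *)recvbuf + (size_t)rank * bytes,        /* fill our own     */
         sendbuf, bytes);
  return MPI_Allreduce(MPI_IN_PLACE, recvbuf,           /* OR-superimpose   */
                       (int)((bytes * size) / sizeof(unsigned)),
                       MPI_UNSIGNED, MPI_BOR, comm);
}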