int Coll_reduce_mvapich2_two_level::reduce( const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, MPI_Op op, int root, MPI_Comm comm) { int mpi_errno = MPI_SUCCESS; int my_rank, total_size, local_rank, local_size; int leader_comm_rank = -1, leader_comm_size = 0; MPI_Comm shmem_comm, leader_comm; int leader_root, leader_of_root; const unsigned char* in_buf = nullptr; unsigned char *out_buf = nullptr, *tmp_buf = nullptr; MPI_Aint true_lb, true_extent, extent; int is_commutative = 0, stride = 0; int intra_node_root=0; //if not set (use of the algo directly, without mvapich2 selector) if(MV2_Reduce_function==NULL) MV2_Reduce_function=Coll_reduce_mpich::reduce; if(MV2_Reduce_intra_function==NULL) MV2_Reduce_intra_function=Coll_reduce_mpich::reduce; if(comm->get_leaders_comm()==MPI_COMM_NULL){ comm->init_smp(); } my_rank = comm->rank(); total_size = comm->size(); shmem_comm = comm->get_intra_comm(); local_rank = shmem_comm->rank(); local_size = shmem_comm->size(); leader_comm = comm->get_leaders_comm(); int* leaders_map = comm->get_leaders_map(); leader_of_root = comm->group()->rank(leaders_map[root]); leader_root = leader_comm->group()->rank(leaders_map[root]); is_commutative= (op==MPI_OP_NULL || op->is_commutative()); datatype->extent(&true_lb, &true_extent); extent =datatype->get_extent(); stride = count * std::max(extent, true_extent); if (local_size == total_size) { /* First handle the case where there is only one node */ if (stride <= MV2_INTRA_SHMEM_REDUCE_MSG && is_commutative == 1) { if (local_rank == 0 ) { tmp_buf = smpi_get_tmp_sendbuffer(count * std::max(extent, true_extent)); tmp_buf = tmp_buf - true_lb; } if (sendbuf != MPI_IN_PLACE) { in_buf = static_cast<const unsigned char*>(sendbuf); } else { in_buf = static_cast<const unsigned char*>(recvbuf); } if (local_rank == 0) { if( my_rank != root) { out_buf = tmp_buf; } else { out_buf = static_cast<unsigned char*>(recvbuf); if (in_buf == out_buf) { in_buf = static_cast<const unsigned char*>(MPI_IN_PLACE); out_buf = static_cast<unsigned char*>(recvbuf); } } } else { in_buf = static_cast<const unsigned char*>(sendbuf); out_buf = nullptr; } if (count * (std::max(extent, true_extent)) < SHMEM_COLL_BLOCK_SIZE) { mpi_errno = MPIR_Reduce_shmem_MV2(in_buf, out_buf, count, datatype, op, 0, shmem_comm); } else { mpi_errno = MPIR_Reduce_intra_knomial_wrapper_MV2(in_buf, out_buf, count, datatype, op, 0, shmem_comm); } if (local_rank == 0 && root != my_rank) { Request::send(out_buf, count, datatype, root, COLL_TAG_REDUCE+1, comm); } if ((local_rank != 0) && (root == my_rank)) { Request::recv(recvbuf, count, datatype, leader_of_root, COLL_TAG_REDUCE+1, comm, MPI_STATUS_IGNORE); } } else { if(mv2_use_knomial_reduce == 1) { reduce_fn = &MPIR_Reduce_intra_knomial_wrapper_MV2; } else { reduce_fn = &MPIR_Reduce_binomial_MV2; } mpi_errno = reduce_fn(sendbuf, recvbuf, count, datatype, op, root, comm); } /* We are done */ if (tmp_buf != nullptr) smpi_free_tmp_buffer(tmp_buf + true_lb); goto fn_exit; } if (local_rank == 0) { leader_comm = comm->get_leaders_comm(); if(leader_comm==MPI_COMM_NULL){ leader_comm = MPI_COMM_WORLD; } leader_comm_size = leader_comm->size(); leader_comm_rank = leader_comm->rank(); tmp_buf = smpi_get_tmp_sendbuffer(count * std::max(extent, true_extent)); tmp_buf = tmp_buf - true_lb; } if (sendbuf != MPI_IN_PLACE) { in_buf = static_cast<const unsigned char*>(sendbuf); } else { in_buf = static_cast<const unsigned char*>(recvbuf); } if (local_rank == 0) { out_buf = static_cast<unsigned char*>(tmp_buf); } else { out_buf = nullptr; } if(local_size > 1) { /* Lets do the intra-node reduce operations, if we have more than one * process in the node */ /*Fix the input and outbuf buffers for the intra-node reduce. *Node leaders will have the reduced data in tmp_buf after *this step*/ if (MV2_Reduce_intra_function == & MPIR_Reduce_shmem_MV2) { if (is_commutative == 1 && (count * (std::max(extent, true_extent)) < SHMEM_COLL_BLOCK_SIZE)) { mpi_errno = MV2_Reduce_intra_function(in_buf, out_buf, count, datatype, op, intra_node_root, shmem_comm); } else { mpi_errno = MPIR_Reduce_intra_knomial_wrapper_MV2(in_buf, out_buf, count, datatype, op, intra_node_root, shmem_comm); } } else { mpi_errno = MV2_Reduce_intra_function(in_buf, out_buf, count, datatype, op, intra_node_root, shmem_comm); } } else { smpi_free_tmp_buffer(tmp_buf + true_lb); tmp_buf = (unsigned char*)in_buf; // xxx } /* Now work on the inter-leader phase. Data is in tmp_buf */ if (local_rank == 0 && leader_comm_size > 1) { /*The leader of root will have the global reduced data in tmp_buf or recv_buf at the end of the reduce */ if (leader_comm_rank == leader_root) { if (my_rank == root) { /* I am the root of the leader-comm, and the * root of the reduce op. So, I will write the * final result directly into my recvbuf */ if(tmp_buf != recvbuf) { in_buf = tmp_buf; out_buf = static_cast<unsigned char*>(recvbuf); } else { unsigned char* buf = smpi_get_tmp_sendbuffer(count * datatype->get_extent()); Datatype::copy(tmp_buf, count, datatype, buf, count, datatype); // in_buf = MPI_IN_PLACE; in_buf = buf; out_buf = static_cast<unsigned char*>(recvbuf); } } else { unsigned char* buf = smpi_get_tmp_sendbuffer(count * datatype->get_extent()); Datatype::copy(tmp_buf, count, datatype, buf, count, datatype); // in_buf = MPI_IN_PLACE; in_buf = buf; out_buf = tmp_buf; } } else { in_buf = tmp_buf; out_buf = nullptr; } /* inter-leader communication */ mpi_errno = MV2_Reduce_function(in_buf, out_buf, count, datatype, op, leader_root, leader_comm); } if (local_size > 1) { /* Send the message to the root if the leader is not the * root of the reduce operation. The reduced data is in tmp_buf */ if ((local_rank == 0) && (root != my_rank) && (leader_root == leader_comm_rank)) { Request::send(tmp_buf, count, datatype, root, COLL_TAG_REDUCE + 1, comm); } if ((local_rank != 0) && (root == my_rank)) { Request::recv(recvbuf, count, datatype, leader_of_root, COLL_TAG_REDUCE + 1, comm, MPI_STATUS_IGNORE); } smpi_free_tmp_buffer(tmp_buf + true_lb); if (leader_comm_rank == leader_root) { if (my_rank != root || (my_rank == root && tmp_buf == recvbuf)) { smpi_free_tmp_buffer(in_buf); } } } fn_exit: return mpi_errno; }
int Coll_scatter_mvapich2_two_level_direct::scatter(void *sendbuf, int sendcnt, MPI_Datatype sendtype, void *recvbuf, int recvcnt, MPI_Datatype recvtype, int root, MPI_Comm comm) { int comm_size, rank; int local_rank, local_size; int leader_comm_rank = -1, leader_comm_size = -1; int mpi_errno = MPI_SUCCESS; int recvtype_size, sendtype_size, nbytes; void *tmp_buf = NULL; void *leader_scatter_buf = NULL; MPI_Status status; int leader_root, leader_of_root = -1; MPI_Comm shmem_comm, leader_comm; //if not set (use of the algo directly, without mvapich2 selector) if(MV2_Scatter_intra_function==NULL) MV2_Scatter_intra_function=Coll_scatter_mpich::scatter; if(comm->get_leaders_comm()==MPI_COMM_NULL){ comm->init_smp(); } comm_size = comm->size(); rank = comm->rank(); if (((rank == root) && (recvcnt == 0)) || ((rank != root) && (sendcnt == 0))) { return MPI_SUCCESS; } /* extract the rank,size information for the intra-node * communicator */ shmem_comm = comm->get_intra_comm(); local_rank = shmem_comm->rank(); local_size = shmem_comm->size(); if (local_rank == 0) { /* Node leader. Extract the rank, size information for the leader * communicator */ leader_comm = comm->get_leaders_comm(); leader_comm_size = leader_comm->size(); leader_comm_rank = leader_comm->rank(); } if (local_size == comm_size) { /* purely intra-node scatter. Just use the direct algorithm and we are done */ mpi_errno = MPIR_Scatter_MV2_Direct(sendbuf, sendcnt, sendtype, recvbuf, recvcnt, recvtype, root, comm); } else { recvtype_size=recvtype->size(); sendtype_size=sendtype->size(); if (rank == root) { nbytes = sendcnt * sendtype_size; } else { nbytes = recvcnt * recvtype_size; } if (local_rank == 0) { /* Node leader, allocate tmp_buffer */ tmp_buf = smpi_get_tmp_sendbuffer(nbytes * local_size); } leader_comm = comm->get_leaders_comm(); int* leaders_map = comm->get_leaders_map(); leader_of_root = comm->group()->rank(leaders_map[root]); leader_root = leader_comm->group()->rank(leaders_map[root]); /* leader_root is the rank of the leader of the root in leader_comm. * leader_root is to be used as the root of the inter-leader gather ops */ if ((local_rank == 0) && (root != rank) && (leader_of_root == rank)) { /* The root of the scatter operation is not the node leader. Recv * data from the node leader */ leader_scatter_buf = smpi_get_tmp_sendbuffer(nbytes * comm_size); Request::recv(leader_scatter_buf, nbytes * comm_size, MPI_BYTE, root, COLL_TAG_SCATTER, comm, &status); } if (rank == root && local_rank != 0) { /* The root of the scatter operation is not the node leader. Send * data to the node leader */ Request::send(sendbuf, sendcnt * comm_size, sendtype, leader_of_root, COLL_TAG_SCATTER, comm ); } if (leader_comm_size > 1 && local_rank == 0) { if (not comm->is_uniform()) { int* displs = NULL; int* sendcnts = NULL; int* node_sizes; int i = 0; node_sizes = comm->get_non_uniform_map(); if (root != leader_of_root) { if (leader_comm_rank == leader_root) { displs = static_cast<int*>(xbt_malloc(sizeof(int) * leader_comm_size)); sendcnts = static_cast<int*>(xbt_malloc(sizeof(int) * leader_comm_size)); sendcnts[0] = node_sizes[0] * nbytes; displs[0] = 0; for (i = 1; i < leader_comm_size; i++) { displs[i] = displs[i - 1] + node_sizes[i - 1] * nbytes; sendcnts[i] = node_sizes[i] * nbytes; } } Colls::scatterv(leader_scatter_buf, sendcnts, displs, MPI_BYTE, tmp_buf, nbytes * local_size, MPI_BYTE, leader_root, leader_comm); } else { if (leader_comm_rank == leader_root) { displs = static_cast<int*>(xbt_malloc(sizeof(int) * leader_comm_size)); sendcnts = static_cast<int*>(xbt_malloc(sizeof(int) * leader_comm_size)); sendcnts[0] = node_sizes[0] * sendcnt; displs[0] = 0; for (i = 1; i < leader_comm_size; i++) { displs[i] = displs[i - 1] + node_sizes[i - 1] * sendcnt; sendcnts[i] = node_sizes[i] * sendcnt; } } Colls::scatterv(sendbuf, sendcnts, displs, sendtype, tmp_buf, nbytes * local_size, MPI_BYTE, leader_root, leader_comm); } if (leader_comm_rank == leader_root) { xbt_free(displs); xbt_free(sendcnts); } } else { if (leader_of_root != root) { mpi_errno = MPIR_Scatter_MV2_Direct(leader_scatter_buf, nbytes * local_size, MPI_BYTE, tmp_buf, nbytes * local_size, MPI_BYTE, leader_root, leader_comm); } else { mpi_errno = MPIR_Scatter_MV2_Direct(sendbuf, sendcnt * local_size, sendtype, tmp_buf, nbytes * local_size, MPI_BYTE, leader_root, leader_comm); } } } /* The leaders are now done with the inter-leader part. Scatter the data within the nodes */ if (rank == root && recvbuf == MPI_IN_PLACE) { mpi_errno = MV2_Scatter_intra_function(tmp_buf, nbytes, MPI_BYTE, (void *)sendbuf, sendcnt, sendtype, 0, shmem_comm); } else { mpi_errno = MV2_Scatter_intra_function(tmp_buf, nbytes, MPI_BYTE, recvbuf, recvcnt, recvtype, 0, shmem_comm); } } /* check if multiple threads are calling this collective function */ if (comm_size != local_size && local_rank == 0) { smpi_free_tmp_buffer(tmp_buf); if (leader_of_root == rank && root != rank) { smpi_free_tmp_buffer(leader_scatter_buf); } } return (mpi_errno); }
int Coll_bcast_mvapich2_inter_node::bcast(void *buffer, int count, MPI_Datatype datatype, int root, MPI_Comm comm) { int rank; int mpi_errno = MPI_SUCCESS; MPI_Comm shmem_comm, leader_comm; int local_rank, local_size, global_rank = -1; int leader_root, leader_of_root; rank = comm->rank(); //comm_size = comm->size(); if (MV2_Bcast_function==NULL){ MV2_Bcast_function=Coll_bcast_mpich::bcast; } if (MV2_Bcast_intra_node_function==NULL){ MV2_Bcast_intra_node_function= Coll_bcast_mpich::bcast; } if(comm->get_leaders_comm()==MPI_COMM_NULL){ comm->init_smp(); } shmem_comm = comm->get_intra_comm(); local_rank = shmem_comm->rank(); local_size = shmem_comm->size(); leader_comm = comm->get_leaders_comm(); if ((local_rank == 0) && (local_size > 1)) { global_rank = leader_comm->rank(); } int* leaders_map = comm->get_leaders_map(); leader_of_root = comm->group()->rank(leaders_map[root]); leader_root = leader_comm->group()->rank(leaders_map[root]); if (local_size > 1) { if ((local_rank == 0) && (root != rank) && (leader_root == global_rank)) { Request::recv(buffer, count, datatype, root, COLL_TAG_BCAST, comm, MPI_STATUS_IGNORE); } if ((local_rank != 0) && (root == rank)) { Request::send(buffer, count, datatype, leader_of_root, COLL_TAG_BCAST, comm); } } #if defined(_MCST_SUPPORT_) if (comm_ptr->ch.is_mcast_ok) { mpi_errno = MPIR_Mcast_inter_node_MV2(buffer, count, datatype, root, comm_ptr, errflag); if (mpi_errno == MPI_SUCCESS) { goto fn_exit; } else { goto fn_fail; } } #endif /* if (local_rank == 0) { leader_comm = comm->get_leaders_comm(); root = leader_root; } if (MV2_Bcast_function == &MPIR_Pipelined_Bcast_MV2) { mpi_errno = MPIR_Pipelined_Bcast_MV2(buffer, count, datatype, root, comm); } else if (MV2_Bcast_function == &MPIR_Bcast_scatter_ring_allgather_shm_MV2) { mpi_errno = MPIR_Bcast_scatter_ring_allgather_shm_MV2(buffer, count, datatype, root, comm); } else */{ if (local_rank == 0) { /* if (MV2_Bcast_function == &MPIR_Knomial_Bcast_inter_node_wrapper_MV2) { mpi_errno = MPIR_Knomial_Bcast_inter_node_wrapper_MV2(buffer, count, datatype, root, comm); } else {*/ mpi_errno = MV2_Bcast_function(buffer, count, datatype, leader_root, leader_comm); // } } } return mpi_errno; }