/* Two-level (SMP-aware) MPI_Reduce from MVAPICH2.
 *
 * Phase 1: each node reduces its local contributions onto the node leader
 *          (over shmem_comm).
 * Phase 2: the node leaders reduce among themselves (over leader_comm),
 *          rooted at the leader of root's node.
 * Phase 3: if the global root is not that leader, the result is shipped to
 *          the root with a point-to-point message tagged COLL_TAG_REDUCE+1.
 *
 * Interface and semantics follow MPI_Reduce; returns an MPI error code
 * (MPI_SUCCESS on success). */
int Coll_reduce_mvapich2_two_level::reduce( const void *sendbuf,
                                     void *recvbuf,
                                     int count,
                                     MPI_Datatype datatype,
                                     MPI_Op op,
                                     int root,
                                     MPI_Comm comm)
{
    int mpi_errno = MPI_SUCCESS;
    int my_rank, total_size, local_rank, local_size;
    int leader_comm_rank = -1, leader_comm_size = 0;
    MPI_Comm shmem_comm, leader_comm;
    int leader_root, leader_of_root;
    const unsigned char* in_buf = nullptr;
    unsigned char *out_buf = nullptr, *tmp_buf = nullptr;
    MPI_Aint true_lb, true_extent, extent;
    int is_commutative = 0, stride = 0;
    int intra_node_root=0;

    //if not set (use of the algo directly, without mvapich2 selector)
    if(MV2_Reduce_function==NULL)
      MV2_Reduce_function=Coll_reduce_mpich::reduce;
    if(MV2_Reduce_intra_function==NULL)
      MV2_Reduce_intra_function=Coll_reduce_mpich::reduce;

    /* Lazily build the intra-node/leader communicators if the caller never
     * went through the selector. */
    if(comm->get_leaders_comm()==MPI_COMM_NULL){
      comm->init_smp();
    }

    my_rank = comm->rank();
    total_size = comm->size();
    shmem_comm = comm->get_intra_comm();
    local_rank = shmem_comm->rank();
    local_size = shmem_comm->size();
    leader_comm = comm->get_leaders_comm();
    int* leaders_map = comm->get_leaders_map();
    /* leaders_map[root] is the global rank of the leader of root's node;
     * translate it into a rank in comm and in leader_comm respectively. */
    leader_of_root = comm->group()->rank(leaders_map[root]);
    leader_root = leader_comm->group()->rank(leaders_map[root]);
    is_commutative= (op==MPI_OP_NULL || op->is_commutative());

    datatype->extent(&true_lb, &true_extent);
    extent =datatype->get_extent();
    /* Per-element footprint used both for buffer sizing and for choosing the
     * shared-memory fast path. */
    stride = count * std::max(extent, true_extent);

    if (local_size == total_size) {
        /* First handle the case where there is only one node */
        if (stride <= MV2_INTRA_SHMEM_REDUCE_MSG && is_commutative == 1) {
            if (local_rank == 0 ) {
                tmp_buf = smpi_get_tmp_sendbuffer(count * std::max(extent, true_extent));
                /* Bias the pointer by -true_lb so that element 0 lands at the
                 * datatype's true lower bound; undone before freeing. */
                tmp_buf = tmp_buf - true_lb;
            }

            if (sendbuf != MPI_IN_PLACE) {
                in_buf = static_cast<const unsigned char*>(sendbuf);
            } else {
                in_buf = static_cast<const unsigned char*>(recvbuf);
            }

            if (local_rank == 0) {
                if( my_rank != root) {
                    /* Leader that is not root: reduce into the scratch buffer
                     * and forward it to root afterwards. */
                    out_buf = tmp_buf;
                } else {
                    out_buf = static_cast<unsigned char*>(recvbuf);
                    if (in_buf == out_buf) {
                        /* MPI_IN_PLACE at root: input and output alias. */
                        in_buf  = static_cast<const unsigned char*>(MPI_IN_PLACE);
                        out_buf = static_cast<unsigned char*>(recvbuf);
                    }
                }
            } else {
                /* Non-leaders only contribute; they receive nothing here. */
                in_buf  = static_cast<const unsigned char*>(sendbuf);
                out_buf = nullptr;
            }

            if (count * (std::max(extent, true_extent)) < SHMEM_COLL_BLOCK_SIZE) {
                mpi_errno = MPIR_Reduce_shmem_MV2(in_buf, out_buf, count,
                                                  datatype, op,
                                                  0, shmem_comm);
            } else {
                mpi_errno = MPIR_Reduce_intra_knomial_wrapper_MV2(in_buf, out_buf, count,
                                                                  datatype, op,
                                                                  0, shmem_comm);
            }

            /* Deliver the result to root when it is not the leader (rank 0). */
            if (local_rank == 0 && root != my_rank) {
                Request::send(out_buf, count, datatype, root,
                              COLL_TAG_REDUCE+1, comm);
            }
            if ((local_rank != 0) && (root == my_rank)) {
                Request::recv(recvbuf, count, datatype,
                              leader_of_root, COLL_TAG_REDUCE+1, comm,
                              MPI_STATUS_IGNORE);
            }
        } else {
            /* Large or non-commutative single-node case: fall back to a plain
             * knomial or binomial reduce over the whole communicator. */
            if(mv2_use_knomial_reduce == 1) {
                reduce_fn = &MPIR_Reduce_intra_knomial_wrapper_MV2;
            } else {
                reduce_fn = &MPIR_Reduce_binomial_MV2;
            }
            mpi_errno = reduce_fn(sendbuf, recvbuf, count,
                                  datatype, op,
                                  root, comm);
        }
        /* We are done */
        if (tmp_buf != nullptr)
          smpi_free_tmp_buffer(tmp_buf + true_lb); /* undo the -true_lb bias */
        goto fn_exit;
    }

    /* Multi-node case: only node leaders allocate scratch space and know the
     * leader communicator geometry. */
    if (local_rank == 0) {
        leader_comm = comm->get_leaders_comm();
        if(leader_comm==MPI_COMM_NULL){
            leader_comm = MPI_COMM_WORLD;
        }
        leader_comm_size = leader_comm->size();
        leader_comm_rank = leader_comm->rank();
        tmp_buf = smpi_get_tmp_sendbuffer(count * std::max(extent, true_extent));
        tmp_buf = tmp_buf - true_lb;
    }
    if (sendbuf != MPI_IN_PLACE) {
        in_buf = static_cast<const unsigned char*>(sendbuf);
    } else {
        in_buf = static_cast<const unsigned char*>(recvbuf);
    }
    if (local_rank == 0) {
        out_buf = static_cast<unsigned char*>(tmp_buf);
    } else {
        out_buf = nullptr;
    }

    if(local_size > 1) {
        /* Lets do the intra-node reduce operations, if we have more than one
         * process in the node */
        /*Fix the input and outbuf buffers for the intra-node reduce.
         *Node leaders will have the reduced data in tmp_buf after
         *this step*/
        if (MV2_Reduce_intra_function == & MPIR_Reduce_shmem_MV2) {
            /* shmem reduce requires commutativity and a bounded block size;
             * otherwise substitute the knomial intra-node reduce. */
            if (is_commutative == 1
                && (count * (std::max(extent, true_extent)) < SHMEM_COLL_BLOCK_SIZE)) {
                    mpi_errno = MV2_Reduce_intra_function(in_buf, out_buf, count,
                                      datatype, op,
                                      intra_node_root, shmem_comm);
            } else {
                    mpi_errno = MPIR_Reduce_intra_knomial_wrapper_MV2(in_buf, out_buf, count,
                                      datatype, op,
                                      intra_node_root, shmem_comm);
            }
        } else {
            mpi_errno = MV2_Reduce_intra_function(in_buf, out_buf, count,
                                      datatype, op,
                                      intra_node_root, shmem_comm);
        }
    } else {
        /* Single process on this node: no intra-node phase, so the scratch
         * buffer is not needed; reuse the input buffer as the "reduced" data.
         * The const is cast away, but tmp_buf is only read below. */
        smpi_free_tmp_buffer(tmp_buf + true_lb);
        tmp_buf = (unsigned char*)in_buf;
    }

    /* Now work on the inter-leader phase. Data is in tmp_buf */
    if (local_rank == 0 && leader_comm_size > 1) {
        /*The leader of root will have the global reduced data in tmp_buf
          or recv_buf
          at the end of the reduce */
        if (leader_comm_rank == leader_root) {
            if (my_rank == root) {
                /* I am the root of the leader-comm, and the
                 * root of the reduce op. So, I will write the
                 * final result directly into my recvbuf */
                if(tmp_buf != recvbuf) {
                    in_buf = tmp_buf;
                    out_buf = static_cast<unsigned char*>(recvbuf);
                } else {
                    /* in == out would be an aliasing violation for the
                     * underlying reduce, so reduce from a private copy.
                     * NOTE(review): this copy is freed below only when
                     * local_size > 1 — with one process per node and
                     * MPI_IN_PLACE it appears to leak; confirm. */
                    unsigned char* buf = smpi_get_tmp_sendbuffer(count * datatype->get_extent());
                    Datatype::copy(tmp_buf, count, datatype, buf, count, datatype);
                    // in_buf = MPI_IN_PLACE;
                    in_buf  = buf;
                    out_buf = static_cast<unsigned char*>(recvbuf);
                }
            } else {
                /* Leader of root's node but not the root itself: keep the
                 * global result in tmp_buf and forward it to root later. */
                unsigned char* buf = smpi_get_tmp_sendbuffer(count * datatype->get_extent());
                Datatype::copy(tmp_buf, count, datatype, buf, count, datatype);
                // in_buf = MPI_IN_PLACE;
                in_buf  = buf;
                out_buf = tmp_buf;
            }
        } else {
            /* Non-root leaders only contribute their node's partial result. */
            in_buf  = tmp_buf;
            out_buf = nullptr;
        }

        /* inter-leader communication  */
        mpi_errno = MV2_Reduce_function(in_buf, out_buf, count,
                              datatype, op,
                              leader_root, leader_comm);
    }

    if (local_size > 1) {
        /* Send the message to the root if the leader is not the
         * root of the reduce operation. The reduced data is in tmp_buf */
        if ((local_rank == 0) && (root != my_rank) && (leader_root == leader_comm_rank)) {
            Request::send(tmp_buf, count, datatype, root,
                             COLL_TAG_REDUCE + 1, comm);
        }
        if ((local_rank != 0) && (root == my_rank)) {
            Request::recv(recvbuf, count, datatype,
                             leader_of_root, COLL_TAG_REDUCE + 1, comm,
                             MPI_STATUS_IGNORE);
        }
        /* NOTE(review): on non-leaders tmp_buf is still nullptr here —
         * presumably smpi_free_tmp_buffer tolerates that; confirm. */
        smpi_free_tmp_buffer(tmp_buf + true_lb);
        if (leader_comm_rank == leader_root) {
            /* Release the private copy made before the inter-leader reduce
             * (in_buf aliases the original data only in the my_rank==root,
             * tmp_buf!=recvbuf case, which the condition excludes). */
            if (my_rank != root || (my_rank == root && tmp_buf == recvbuf)) {
                smpi_free_tmp_buffer(in_buf);
            }
        }
    }

  fn_exit:
    return mpi_errno;
}
int Coll_reduce_mvapich2::reduce(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, MPI_Op op, int root, MPI_Comm comm) { if(mv2_reduce_thresholds_table == NULL) init_mv2_reduce_tables_stampede(); int mpi_errno = MPI_SUCCESS; int range = 0; int range_threshold = 0; int range_intra_threshold = 0; int is_commutative, pof2; int comm_size = 0; long nbytes = 0; int sendtype_size; int is_two_level = 0; comm_size = comm->size(); sendtype_size=datatype->size(); nbytes = count * sendtype_size; if (count == 0) return MPI_SUCCESS; is_commutative = (op==MPI_OP_NULL || op->is_commutative()); /* find nearest power-of-two less than or equal to comm_size */ for( pof2 = 1; pof2 <= comm_size; pof2 <<= 1 ); pof2 >>=1; /* Search for the corresponding system size inside the tuning table */ while ((range < (mv2_size_reduce_tuning_table - 1)) && (comm_size > mv2_reduce_thresholds_table[range].numproc)) { range++; } /* Search for corresponding inter-leader function */ while ((range_threshold < (mv2_reduce_thresholds_table[range].size_inter_table - 1)) && (nbytes > mv2_reduce_thresholds_table[range].inter_leader[range_threshold].max) && (mv2_reduce_thresholds_table[range].inter_leader[range_threshold].max != -1)) { range_threshold++; } /* Search for corresponding intra node function */ while ((range_intra_threshold < (mv2_reduce_thresholds_table[range].size_intra_table - 1)) && (nbytes > mv2_reduce_thresholds_table[range].intra_node[range_intra_threshold].max) && (mv2_reduce_thresholds_table[range].intra_node[range_intra_threshold].max != -1)) { range_intra_threshold++; } /* Set intra-node function pt for reduce_two_level */ MV2_Reduce_intra_function = mv2_reduce_thresholds_table[range].intra_node[range_intra_threshold]. MV2_pt_Reduce_function; /* Set inter-leader pt */ MV2_Reduce_function = mv2_reduce_thresholds_table[range].inter_leader[range_threshold]. 
MV2_pt_Reduce_function; if(mv2_reduce_intra_knomial_factor<0) { mv2_reduce_intra_knomial_factor = mv2_reduce_thresholds_table[range].intra_k_degree; } if(mv2_reduce_inter_knomial_factor<0) { mv2_reduce_inter_knomial_factor = mv2_reduce_thresholds_table[range].inter_k_degree; } if(mv2_reduce_thresholds_table[range].is_two_level_reduce[range_threshold] == 1){ is_two_level = 1; } /* We call Reduce function */ if(is_two_level == 1) { if (is_commutative == 1) { if(comm->get_leaders_comm()==MPI_COMM_NULL){ comm->init_smp(); } mpi_errno = MPIR_Reduce_two_level_helper_MV2(sendbuf, recvbuf, count, datatype, op, root, comm); } else { mpi_errno = MPIR_Reduce_binomial_MV2(sendbuf, recvbuf, count, datatype, op, root, comm); } } else if(MV2_Reduce_function == &MPIR_Reduce_inter_knomial_wrapper_MV2 ){ if(is_commutative ==1) { mpi_errno = MV2_Reduce_function(sendbuf, recvbuf, count, datatype, op, root, comm); } else { mpi_errno = MPIR_Reduce_binomial_MV2(sendbuf, recvbuf, count, datatype, op, root, comm); } } else if(MV2_Reduce_function == &MPIR_Reduce_redscat_gather_MV2){ if (/*(HANDLE_GET_KIND(op) == HANDLE_KIND_BUILTIN) &&*/ (count >= pof2)) { mpi_errno = MV2_Reduce_function(sendbuf, recvbuf, count, datatype, op, root, comm); } else { mpi_errno = MPIR_Reduce_binomial_MV2(sendbuf, recvbuf, count, datatype, op, root, comm); } } else { mpi_errno = MV2_Reduce_function(sendbuf, recvbuf, count, datatype, op, root, comm); } return mpi_errno; }
/* Legacy C-API variant of the two-level (SMP-aware) MVAPICH2 reduce.
 *
 * Same algorithm as the class-based implementation above: reduce inside each
 * node onto the node leader, reduce across leaders, then forward the result
 * to the global root (tag COLL_TAG_REDUCE+1) if it is not a leader.
 * Returns an MPI error code (MPI_SUCCESS on success). */
int smpi_coll_tuned_reduce_mvapich2_two_level( void *sendbuf,
                                     void *recvbuf,
                                     int count,
                                     MPI_Datatype datatype,
                                     MPI_Op op,
                                     int root,
                                     MPI_Comm comm)
{
    int mpi_errno = MPI_SUCCESS;
    int my_rank, total_size, local_rank, local_size;
    int leader_comm_rank = -1, leader_comm_size = 0;
    MPI_Comm shmem_comm, leader_comm;
    int leader_root, leader_of_root;
    void *in_buf = NULL, *out_buf = NULL, *tmp_buf = NULL;
    MPI_Aint true_lb, true_extent, extent;
    int is_commutative = 0, stride = 0;
    int intra_node_root=0;

    //if not set (use of the algo directly, without mvapich2 selector)
    if(MV2_Reduce_function==NULL)
      MV2_Reduce_function=smpi_coll_tuned_reduce_mpich;
    if(MV2_Reduce_intra_function==NULL)
      MV2_Reduce_intra_function=smpi_coll_tuned_reduce_mpich;

    /* Lazily build the intra-node/leader communicators if needed. */
    if(smpi_comm_get_leaders_comm(comm)==MPI_COMM_NULL){
      smpi_comm_init_smp(comm);
    }

    my_rank = smpi_comm_rank(comm);
    total_size = smpi_comm_size(comm);
    shmem_comm = smpi_comm_get_intra_comm(comm);
    local_rank = smpi_comm_rank(shmem_comm);
    local_size = smpi_comm_size(shmem_comm);
    leader_comm = smpi_comm_get_leaders_comm(comm);
    int* leaders_map = smpi_comm_get_leaders_map(comm);
    /* leaders_map[root] is the global rank of root's node leader; translate
     * it into a rank in comm and in leader_comm respectively. */
    leader_of_root = smpi_group_rank(smpi_comm_group(comm),leaders_map[root]);
    leader_root = smpi_group_rank(smpi_comm_group(leader_comm),leaders_map[root]);
    is_commutative=smpi_op_is_commute(op);

    smpi_datatype_extent(datatype, &true_lb, &true_extent);
    extent =smpi_datatype_get_extent(datatype);
    /* Per-element footprint, used for sizing and fast-path selection. */
    stride = count * MAX(extent, true_extent);

    if (local_size == total_size) {
        /* First handle the case where there is only one node */
        if (stride <= MV2_INTRA_SHMEM_REDUCE_MSG &&
            is_commutative == 1) {
            if (local_rank == 0 ) {
                /* Bias the scratch pointer by -true_lb so element 0 lands at
                 * the datatype's true lower bound; undone before freeing. */
                tmp_buf=(void *)smpi_get_tmp_sendbuffer( count *
                                    (MAX(extent, true_extent)));
                tmp_buf = (void *) ((char *) tmp_buf - true_lb);
            }

            if (sendbuf != MPI_IN_PLACE) {
                in_buf = (void *)sendbuf;
            } else {
                in_buf = recvbuf;
            }

            if (local_rank == 0) {
                if( my_rank != root) {
                    /* Leader that is not root: reduce into scratch, forward
                     * to root afterwards. */
                    out_buf = tmp_buf;
                } else {
                    out_buf = recvbuf;
                    if(in_buf == out_buf) {
                        /* MPI_IN_PLACE at root: input and output alias. */
                        in_buf = MPI_IN_PLACE;
                        out_buf = recvbuf;
                    }
                }
            } else {
                /* Non-leaders only contribute; they receive nothing here. */
                in_buf = (void *)sendbuf;
                out_buf = NULL;
            }

            if (count * (MAX(extent, true_extent)) < SHMEM_COLL_BLOCK_SIZE) {
                mpi_errno = MPIR_Reduce_shmem_MV2(in_buf, out_buf, count,
                                                  datatype, op,
                                                  0, shmem_comm);
            } else {
                mpi_errno = MPIR_Reduce_intra_knomial_wrapper_MV2(in_buf, out_buf, count,
                                                  datatype, op,
                                                  0, shmem_comm);
            }

            /* Deliver the result to root when it is not the leader (rank 0). */
            if (local_rank == 0 && root != my_rank) {
                smpi_mpi_send(out_buf, count, datatype, root,
                              COLL_TAG_REDUCE+1, comm);
            }
            if ((local_rank != 0) && (root == my_rank)) {
                smpi_mpi_recv(recvbuf, count, datatype,
                              leader_of_root, COLL_TAG_REDUCE+1, comm,
                              MPI_STATUS_IGNORE);
            }
        } else {
            /* Large or non-commutative single-node case: plain knomial or
             * binomial reduce over the whole communicator. */
            if(mv2_use_knomial_reduce == 1) {
                reduce_fn = &MPIR_Reduce_intra_knomial_wrapper_MV2;
            } else {
                reduce_fn = &MPIR_Reduce_binomial_MV2;
            }
            mpi_errno = reduce_fn(sendbuf, recvbuf, count,
                                  datatype, op,
                                  root, comm);
        }
        /* We are done */
        if(tmp_buf!=NULL)
          smpi_free_tmp_buffer((void *) ((char *) tmp_buf + true_lb)); /* undo bias */
        goto fn_exit;
    }

    /* Multi-node case: only node leaders allocate scratch space and know the
     * leader communicator geometry. */
    if (local_rank == 0) {
        leader_comm = smpi_comm_get_leaders_comm(comm);
        if(leader_comm==MPI_COMM_NULL){
            leader_comm = MPI_COMM_WORLD;
        }
        leader_comm_size = smpi_comm_size(leader_comm);
        leader_comm_rank = smpi_comm_rank(leader_comm);
        tmp_buf=(void *)smpi_get_tmp_sendbuffer(count *
                            (MAX(extent, true_extent)));
        tmp_buf = (void *) ((char *) tmp_buf - true_lb);
    }
    if (sendbuf != MPI_IN_PLACE) {
        in_buf = (void *)sendbuf;
    } else {
        in_buf = recvbuf;
    }
    if (local_rank == 0) {
        out_buf = tmp_buf;
    } else {
        out_buf = NULL;
    }

    if(local_size > 1) {
        /* Lets do the intra-node reduce operations, if we have more than one
         * process in the node */
        /*Fix the input and outbuf buffers for the intra-node reduce.
         *Node leaders will have the reduced data in tmp_buf after
         *this step*/
        if (MV2_Reduce_intra_function == & MPIR_Reduce_shmem_MV2) {
            /* shmem reduce requires commutativity and a bounded block size;
             * otherwise substitute the knomial intra-node reduce. */
            if (is_commutative == 1
                && (count * (MAX(extent, true_extent)) < SHMEM_COLL_BLOCK_SIZE)) {
                    mpi_errno = MV2_Reduce_intra_function(in_buf, out_buf, count,
                                      datatype, op,
                                      intra_node_root, shmem_comm);
            } else {
                    mpi_errno = MPIR_Reduce_intra_knomial_wrapper_MV2(in_buf, out_buf, count,
                                      datatype, op,
                                      intra_node_root, shmem_comm);
            }
        } else {
            mpi_errno = MV2_Reduce_intra_function(in_buf, out_buf, count,
                                      datatype, op,
                                      intra_node_root, shmem_comm);
        }
    } else {
        /* Single process on this node: no intra-node phase, so drop the
         * scratch buffer and treat the input as the node's reduced data. */
        smpi_free_tmp_buffer((void *) ((char *) tmp_buf + true_lb));
        tmp_buf = in_buf;
    }

    /* Now work on the inter-leader phase. Data is in tmp_buf */
    if (local_rank == 0 && leader_comm_size > 1) {
        /*The leader of root will have the global reduced data in tmp_buf
          or recv_buf
          at the end of the reduce */
        if (leader_comm_rank == leader_root) {
            if (my_rank == root) {
                /* I am the root of the leader-comm, and the
                 * root of the reduce op. So, I will write the
                 * final result directly into my recvbuf */
                if(tmp_buf != recvbuf) {
                    in_buf = tmp_buf;
                    out_buf = recvbuf;
                } else {
                    /* in == out would alias for the underlying reduce, so
                     * reduce from a private copy instead of MPI_IN_PLACE.
                     * NOTE(review): the copy is freed below only when
                     * local_size > 1 — with one process per node and
                     * MPI_IN_PLACE it appears to leak; confirm. */
                     in_buf = (char *)smpi_get_tmp_sendbuffer(count*
                                       smpi_datatype_get_extent(datatype));
                     smpi_datatype_copy(tmp_buf, count, datatype,
                                        in_buf, count, datatype);
                    //in_buf = MPI_IN_PLACE;
                    out_buf = recvbuf;
                }
            } else {
                /* Leader of root's node but not root itself: keep the global
                 * result in tmp_buf and forward it to root later. */
                in_buf = (char *)smpi_get_tmp_sendbuffer(count*
                                       smpi_datatype_get_extent(datatype));
                smpi_datatype_copy(tmp_buf, count, datatype,
                                   in_buf, count, datatype);
                //in_buf = MPI_IN_PLACE;
                out_buf = tmp_buf;
            }
        } else {
            /* Non-root leaders only contribute their node's partial result. */
            in_buf = tmp_buf;
            out_buf = NULL;
        }

        /* inter-leader communication  */
        mpi_errno = MV2_Reduce_function(in_buf, out_buf, count,
                              datatype, op,
                              leader_root, leader_comm);
    }

    if (local_size > 1) {
        /* Send the message to the root if the leader is not the
         * root of the reduce operation. The reduced data is in tmp_buf */
        if ((local_rank == 0) && (root != my_rank) && (leader_root == leader_comm_rank)) {
            smpi_mpi_send(tmp_buf, count, datatype, root,
                             COLL_TAG_REDUCE+1, comm);
        }
        if ((local_rank != 0) && (root == my_rank)) {
            smpi_mpi_recv(recvbuf, count, datatype,
                             leader_of_root, COLL_TAG_REDUCE+1, comm,
                             MPI_STATUS_IGNORE);
        }
        /* NOTE(review): on non-leaders tmp_buf is still NULL here —
         * presumably smpi_free_tmp_buffer tolerates that; confirm. */
        smpi_free_tmp_buffer((void *) ((char *) tmp_buf + true_lb));
        if (leader_comm_rank == leader_root) {
            /* Release the private copy made before the inter-leader reduce
             * (in_buf aliases original data only when my_rank==root and
             * tmp_buf!=recvbuf, which the condition excludes). */
            if (my_rank != root || (my_rank == root && tmp_buf == recvbuf)) {
                smpi_free_tmp_buffer(in_buf);
            }
        }
    }

  fn_exit:
    return mpi_errno;
}