int main() { try { cl::Context context; std::vector<cl::Device> devices; std::tie(context, devices) = init_open_cl(); cl::CommandQueue queue(context, devices[0]); cl::Program program = load_program("program.cl", context, devices); cl_fn reduce_fn(program, "do_reduce"); cl_fn sweep_fn(program, "do_sweep"); std::ifstream in(INPUT_FILE); size_t n, npow2; in >> n; npow2 = pow(2.0, ceil(log2(n))); std::vector<float> in_array(npow2); for (size_t i = 0; i < n; ++i) in >> in_array[i]; cl::Buffer out_buf(context, std::begin(in_array), std::end(in_array), false); std::vector<cl::Event> events; for (size_t offset = 1; npow2 / (offset * 2) >= WORKGROUP_SIZE; offset *= 2) exec_fn(reduce_fn, out_buf, npow2, offset, npow2 / offset, events, queue); if (npow2 < 512) exec_fn(reduce_fn, out_buf, npow2, 1, WORKGROUP_SIZE, events, queue); exec_fn(sweep_fn, out_buf, npow2, npow2 / 2, WORKGROUP_SIZE, events, queue); for (size_t offset = npow2 / 1024; offset > 0; offset /= 2) exec_fn(sweep_fn, out_buf, npow2, offset, npow2 / offset, events, queue); std::vector<float> out_array(n); queue.enqueueReadBuffer(out_buf, CL_TRUE, 0, sizeof(float) * n, &out_array[0]); std::ofstream out(OUTPUT_FILE); out << std::fixed << std::setprecision(3); for (size_t i = 0; i < n; i++) out << out_array[i] << " "; out << std::endl; } catch (cl::Error &e) { std::cerr << "ERROR: " << e.what() << " (" << e.err() << ")" << std::endl; } catch (std::runtime_error &e) { std::cerr << e.what() << std::endl; } return 0; }
int Coll_reduce_mvapich2_two_level::reduce( const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, MPI_Op op, int root, MPI_Comm comm) { int mpi_errno = MPI_SUCCESS; int my_rank, total_size, local_rank, local_size; int leader_comm_rank = -1, leader_comm_size = 0; MPI_Comm shmem_comm, leader_comm; int leader_root, leader_of_root; const unsigned char* in_buf = nullptr; unsigned char *out_buf = nullptr, *tmp_buf = nullptr; MPI_Aint true_lb, true_extent, extent; int is_commutative = 0, stride = 0; int intra_node_root=0; //if not set (use of the algo directly, without mvapich2 selector) if(MV2_Reduce_function==NULL) MV2_Reduce_function=Coll_reduce_mpich::reduce; if(MV2_Reduce_intra_function==NULL) MV2_Reduce_intra_function=Coll_reduce_mpich::reduce; if(comm->get_leaders_comm()==MPI_COMM_NULL){ comm->init_smp(); } my_rank = comm->rank(); total_size = comm->size(); shmem_comm = comm->get_intra_comm(); local_rank = shmem_comm->rank(); local_size = shmem_comm->size(); leader_comm = comm->get_leaders_comm(); int* leaders_map = comm->get_leaders_map(); leader_of_root = comm->group()->rank(leaders_map[root]); leader_root = leader_comm->group()->rank(leaders_map[root]); is_commutative= (op==MPI_OP_NULL || op->is_commutative()); datatype->extent(&true_lb, &true_extent); extent =datatype->get_extent(); stride = count * std::max(extent, true_extent); if (local_size == total_size) { /* First handle the case where there is only one node */ if (stride <= MV2_INTRA_SHMEM_REDUCE_MSG && is_commutative == 1) { if (local_rank == 0 ) { tmp_buf = smpi_get_tmp_sendbuffer(count * std::max(extent, true_extent)); tmp_buf = tmp_buf - true_lb; } if (sendbuf != MPI_IN_PLACE) { in_buf = static_cast<const unsigned char*>(sendbuf); } else { in_buf = static_cast<const unsigned char*>(recvbuf); } if (local_rank == 0) { if( my_rank != root) { out_buf = tmp_buf; } else { out_buf = static_cast<unsigned char*>(recvbuf); if (in_buf == out_buf) { in_buf = static_cast<const unsigned char*>(MPI_IN_PLACE); out_buf = static_cast<unsigned char*>(recvbuf); } } } else { in_buf = static_cast<const unsigned char*>(sendbuf); out_buf = nullptr; } if (count * (std::max(extent, true_extent)) < SHMEM_COLL_BLOCK_SIZE) { mpi_errno = MPIR_Reduce_shmem_MV2(in_buf, out_buf, count, datatype, op, 0, shmem_comm); } else { mpi_errno = MPIR_Reduce_intra_knomial_wrapper_MV2(in_buf, out_buf, count, datatype, op, 0, shmem_comm); } if (local_rank == 0 && root != my_rank) { Request::send(out_buf, count, datatype, root, COLL_TAG_REDUCE+1, comm); } if ((local_rank != 0) && (root == my_rank)) { Request::recv(recvbuf, count, datatype, leader_of_root, COLL_TAG_REDUCE+1, comm, MPI_STATUS_IGNORE); } } else { if(mv2_use_knomial_reduce == 1) { reduce_fn = &MPIR_Reduce_intra_knomial_wrapper_MV2; } else { reduce_fn = &MPIR_Reduce_binomial_MV2; } mpi_errno = reduce_fn(sendbuf, recvbuf, count, datatype, op, root, comm); } /* We are done */ if (tmp_buf != nullptr) smpi_free_tmp_buffer(tmp_buf + true_lb); goto fn_exit; } if (local_rank == 0) { leader_comm = comm->get_leaders_comm(); if(leader_comm==MPI_COMM_NULL){ leader_comm = MPI_COMM_WORLD; } leader_comm_size = leader_comm->size(); leader_comm_rank = leader_comm->rank(); tmp_buf = smpi_get_tmp_sendbuffer(count * std::max(extent, true_extent)); tmp_buf = tmp_buf - true_lb; } if (sendbuf != MPI_IN_PLACE) { in_buf = static_cast<const unsigned char*>(sendbuf); } else { in_buf = static_cast<const unsigned char*>(recvbuf); } if (local_rank == 0) { out_buf = static_cast<unsigned char*>(tmp_buf); } else { out_buf = nullptr; } if(local_size > 1) { /* Lets do the intra-node reduce operations, if we have more than one * process in the node */ /*Fix the input and outbuf buffers for the intra-node reduce. *Node leaders will have the reduced data in tmp_buf after *this step*/ if (MV2_Reduce_intra_function == & MPIR_Reduce_shmem_MV2) { if (is_commutative == 1 && (count * (std::max(extent, true_extent)) < SHMEM_COLL_BLOCK_SIZE)) { mpi_errno = MV2_Reduce_intra_function(in_buf, out_buf, count, datatype, op, intra_node_root, shmem_comm); } else { mpi_errno = MPIR_Reduce_intra_knomial_wrapper_MV2(in_buf, out_buf, count, datatype, op, intra_node_root, shmem_comm); } } else { mpi_errno = MV2_Reduce_intra_function(in_buf, out_buf, count, datatype, op, intra_node_root, shmem_comm); } } else { smpi_free_tmp_buffer(tmp_buf + true_lb); tmp_buf = (unsigned char*)in_buf; // xxx } /* Now work on the inter-leader phase. Data is in tmp_buf */ if (local_rank == 0 && leader_comm_size > 1) { /*The leader of root will have the global reduced data in tmp_buf or recv_buf at the end of the reduce */ if (leader_comm_rank == leader_root) { if (my_rank == root) { /* I am the root of the leader-comm, and the * root of the reduce op. So, I will write the * final result directly into my recvbuf */ if(tmp_buf != recvbuf) { in_buf = tmp_buf; out_buf = static_cast<unsigned char*>(recvbuf); } else { unsigned char* buf = smpi_get_tmp_sendbuffer(count * datatype->get_extent()); Datatype::copy(tmp_buf, count, datatype, buf, count, datatype); // in_buf = MPI_IN_PLACE; in_buf = buf; out_buf = static_cast<unsigned char*>(recvbuf); } } else { unsigned char* buf = smpi_get_tmp_sendbuffer(count * datatype->get_extent()); Datatype::copy(tmp_buf, count, datatype, buf, count, datatype); // in_buf = MPI_IN_PLACE; in_buf = buf; out_buf = tmp_buf; } } else { in_buf = tmp_buf; out_buf = nullptr; } /* inter-leader communication */ mpi_errno = MV2_Reduce_function(in_buf, out_buf, count, datatype, op, leader_root, leader_comm); } if (local_size > 1) { /* Send the message to the root if the leader is not the * root of the reduce operation. The reduced data is in tmp_buf */ if ((local_rank == 0) && (root != my_rank) && (leader_root == leader_comm_rank)) { Request::send(tmp_buf, count, datatype, root, COLL_TAG_REDUCE + 1, comm); } if ((local_rank != 0) && (root == my_rank)) { Request::recv(recvbuf, count, datatype, leader_of_root, COLL_TAG_REDUCE + 1, comm, MPI_STATUS_IGNORE); } smpi_free_tmp_buffer(tmp_buf + true_lb); if (leader_comm_rank == leader_root) { if (my_rank != root || (my_rank == root && tmp_buf == recvbuf)) { smpi_free_tmp_buffer(in_buf); } } } fn_exit: return mpi_errno; }
int smpi_coll_tuned_reduce_mvapich2_two_level( void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, MPI_Op op, int root, MPI_Comm comm) { int mpi_errno = MPI_SUCCESS; int my_rank, total_size, local_rank, local_size; int leader_comm_rank = -1, leader_comm_size = 0; MPI_Comm shmem_comm, leader_comm; int leader_root, leader_of_root; void *in_buf = NULL, *out_buf = NULL, *tmp_buf = NULL; MPI_Aint true_lb, true_extent, extent; int is_commutative = 0, stride = 0; int intra_node_root=0; //if not set (use of the algo directly, without mvapich2 selector) if(MV2_Reduce_function==NULL) MV2_Reduce_function=smpi_coll_tuned_reduce_mpich; if(MV2_Reduce_intra_function==NULL) MV2_Reduce_intra_function=smpi_coll_tuned_reduce_mpich; if(smpi_comm_get_leaders_comm(comm)==MPI_COMM_NULL){ smpi_comm_init_smp(comm); } my_rank = smpi_comm_rank(comm); total_size = smpi_comm_size(comm); shmem_comm = smpi_comm_get_intra_comm(comm); local_rank = smpi_comm_rank(shmem_comm); local_size = smpi_comm_size(shmem_comm); leader_comm = smpi_comm_get_leaders_comm(comm); int* leaders_map = smpi_comm_get_leaders_map(comm); leader_of_root = smpi_group_rank(smpi_comm_group(comm),leaders_map[root]); leader_root = smpi_group_rank(smpi_comm_group(leader_comm),leaders_map[root]); is_commutative=smpi_op_is_commute(op); smpi_datatype_extent(datatype, &true_lb, &true_extent); extent =smpi_datatype_get_extent(datatype); stride = count * MAX(extent, true_extent); if (local_size == total_size) { /* First handle the case where there is only one node */ if (stride <= MV2_INTRA_SHMEM_REDUCE_MSG && is_commutative == 1) { if (local_rank == 0 ) { tmp_buf=(void *)smpi_get_tmp_sendbuffer( count * (MAX(extent, true_extent))); tmp_buf = (void *) ((char *) tmp_buf - true_lb); } if (sendbuf != MPI_IN_PLACE) { in_buf = (void *)sendbuf; } else { in_buf = recvbuf; } if (local_rank == 0) { if( my_rank != root) { out_buf = tmp_buf; } else { out_buf = recvbuf; if(in_buf == out_buf) { in_buf = MPI_IN_PLACE; out_buf = recvbuf; } } } else { in_buf = (void *)sendbuf; out_buf = NULL; } if (count * (MAX(extent, true_extent)) < SHMEM_COLL_BLOCK_SIZE) { mpi_errno = MPIR_Reduce_shmem_MV2(in_buf, out_buf, count, datatype, op, 0, shmem_comm); } else { mpi_errno = MPIR_Reduce_intra_knomial_wrapper_MV2(in_buf, out_buf, count, datatype, op, 0, shmem_comm); } if (local_rank == 0 && root != my_rank) { smpi_mpi_send(out_buf, count, datatype, root, COLL_TAG_REDUCE+1, comm); } if ((local_rank != 0) && (root == my_rank)) { smpi_mpi_recv(recvbuf, count, datatype, leader_of_root, COLL_TAG_REDUCE+1, comm, MPI_STATUS_IGNORE); } } else { if(mv2_use_knomial_reduce == 1) { reduce_fn = &MPIR_Reduce_intra_knomial_wrapper_MV2; } else { reduce_fn = &MPIR_Reduce_binomial_MV2; } mpi_errno = reduce_fn(sendbuf, recvbuf, count, datatype, op, root, comm); } /* We are done */ if(tmp_buf!=NULL) smpi_free_tmp_buffer((void *) ((char *) tmp_buf + true_lb)); goto fn_exit; } if (local_rank == 0) { leader_comm = smpi_comm_get_leaders_comm(comm); if(leader_comm==MPI_COMM_NULL){ leader_comm = MPI_COMM_WORLD; } leader_comm_size = smpi_comm_size(leader_comm); leader_comm_rank = smpi_comm_rank(leader_comm); tmp_buf=(void *)smpi_get_tmp_sendbuffer(count * (MAX(extent, true_extent))); tmp_buf = (void *) ((char *) tmp_buf - true_lb); } if (sendbuf != MPI_IN_PLACE) { in_buf = (void *)sendbuf; } else { in_buf = recvbuf; } if (local_rank == 0) { out_buf = tmp_buf; } else { out_buf = NULL; } if(local_size > 1) { /* Lets do the intra-node reduce operations, if we have more than one * process in the node */ /*Fix the input and outbuf buffers for the intra-node reduce. *Node leaders will have the reduced data in tmp_buf after *this step*/ if (MV2_Reduce_intra_function == & MPIR_Reduce_shmem_MV2) { if (is_commutative == 1 && (count * (MAX(extent, true_extent)) < SHMEM_COLL_BLOCK_SIZE)) { mpi_errno = MV2_Reduce_intra_function(in_buf, out_buf, count, datatype, op, intra_node_root, shmem_comm); } else { mpi_errno = MPIR_Reduce_intra_knomial_wrapper_MV2(in_buf, out_buf, count, datatype, op, intra_node_root, shmem_comm); } } else { mpi_errno = MV2_Reduce_intra_function(in_buf, out_buf, count, datatype, op, intra_node_root, shmem_comm); } } else { smpi_free_tmp_buffer((void *) ((char *) tmp_buf + true_lb)); tmp_buf = in_buf; } /* Now work on the inter-leader phase. Data is in tmp_buf */ if (local_rank == 0 && leader_comm_size > 1) { /*The leader of root will have the global reduced data in tmp_buf or recv_buf at the end of the reduce */ if (leader_comm_rank == leader_root) { if (my_rank == root) { /* I am the root of the leader-comm, and the * root of the reduce op. So, I will write the * final result directly into my recvbuf */ if(tmp_buf != recvbuf) { in_buf = tmp_buf; out_buf = recvbuf; } else { in_buf = (char *)smpi_get_tmp_sendbuffer(count* smpi_datatype_get_extent(datatype)); smpi_datatype_copy(tmp_buf, count, datatype, in_buf, count, datatype); //in_buf = MPI_IN_PLACE; out_buf = recvbuf; } } else { in_buf = (char *)smpi_get_tmp_sendbuffer(count* smpi_datatype_get_extent(datatype)); smpi_datatype_copy(tmp_buf, count, datatype, in_buf, count, datatype); //in_buf = MPI_IN_PLACE; out_buf = tmp_buf; } } else { in_buf = tmp_buf; out_buf = NULL; } /* inter-leader communication */ mpi_errno = MV2_Reduce_function(in_buf, out_buf, count, datatype, op, leader_root, leader_comm); } if (local_size > 1) { /* Send the message to the root if the leader is not the * root of the reduce operation. The reduced data is in tmp_buf */ if ((local_rank == 0) && (root != my_rank) && (leader_root == leader_comm_rank)) { smpi_mpi_send(tmp_buf, count, datatype, root, COLL_TAG_REDUCE+1, comm); } if ((local_rank != 0) && (root == my_rank)) { smpi_mpi_recv(recvbuf, count, datatype, leader_of_root, COLL_TAG_REDUCE+1, comm, MPI_STATUS_IGNORE); } smpi_free_tmp_buffer((void *) ((char *) tmp_buf + true_lb)); if (leader_comm_rank == leader_root) { if (my_rank != root || (my_rank == root && tmp_buf == recvbuf)) { smpi_free_tmp_buffer(in_buf); } } } fn_exit: return mpi_errno; }