int PMPI_Sendrecv(const void* sendbuf, int sendcount, MPI_Datatype sendtype, int dst, int sendtag, void* recvbuf,
                  int recvcount, MPI_Datatype recvtype, int src, int recvtag, MPI_Comm comm, MPI_Status* status)
{
  int retval = 0;
  smpi_bench_end();

  if (comm == MPI_COMM_NULL) {
    retval = MPI_ERR_COMM;
  } else if (not sendtype->is_valid() || not recvtype->is_valid()) {
    retval = MPI_ERR_TYPE;
  } else if (src == MPI_PROC_NULL) {
    if (status != MPI_STATUS_IGNORE) {
      simgrid::smpi::Status::empty(status);
      status->MPI_SOURCE = MPI_PROC_NULL;
    }
    if (dst != MPI_PROC_NULL)
      simgrid::smpi::Request::send(sendbuf, sendcount, sendtype, dst, sendtag, comm);
    retval = MPI_SUCCESS;
  } else if (dst == MPI_PROC_NULL) {
    simgrid::smpi::Request::recv(recvbuf, recvcount, recvtype, src, recvtag, comm, status);
    retval = MPI_SUCCESS;
  } else if (dst >= comm->group()->size() || dst < 0 ||
             (src != MPI_ANY_SOURCE && (src >= comm->group()->size() || src < 0))) {
    retval = MPI_ERR_RANK;
  } else if ((sendcount < 0 || recvcount < 0) || (sendbuf == nullptr && sendcount > 0) ||
             (recvbuf == nullptr && recvcount > 0)) {
    retval = MPI_ERR_COUNT;
  } else if ((sendtag < 0 && sendtag != MPI_ANY_TAG) || (recvtag < 0 && recvtag != MPI_ANY_TAG)) {
    retval = MPI_ERR_TAG;
  } else {
    int my_proc_id = simgrid::s4u::this_actor::get_pid();
    int dst_traced = getPid(comm, dst);
    int src_traced = getPid(comm, src);

    // FIXME: Hack the way to trace this one
    std::vector<int>* dst_hack = new std::vector<int>;
    std::vector<int>* src_hack = new std::vector<int>;
    dst_hack->push_back(dst_traced);
    src_hack->push_back(src_traced);
    TRACE_smpi_comm_in(my_proc_id, __func__,
                       new simgrid::instr::VarCollTIData(
                           "sendRecv", -1, sendtype->is_replayable() ? sendcount : sendcount * sendtype->size(),
                           dst_hack, recvtype->is_replayable() ? recvcount : recvcount * recvtype->size(), src_hack,
                           simgrid::smpi::Datatype::encode(sendtype), simgrid::smpi::Datatype::encode(recvtype)));

    TRACE_smpi_send(my_proc_id, my_proc_id, dst_traced, sendtag, sendcount * sendtype->size());

    simgrid::smpi::Request::sendrecv(sendbuf, sendcount, sendtype, dst, sendtag, recvbuf, recvcount, recvtype, src,
                                     recvtag, comm, status);
    retval = MPI_SUCCESS;

    TRACE_smpi_recv(src_traced, my_proc_id, recvtag);
    TRACE_smpi_comm_out(my_proc_id);
  }

  smpi_bench_begin();
  return retval;
}
int PMPI_Ssend(const void* buf, int count, MPI_Datatype datatype, int dst, int tag, MPI_Comm comm)
{
  int retval = 0;
  smpi_bench_end();

  if (comm == MPI_COMM_NULL) {
    retval = MPI_ERR_COMM;
  } else if (dst == MPI_PROC_NULL) {
    retval = MPI_SUCCESS;
  } else if (dst >= comm->group()->size() || dst < 0) {
    retval = MPI_ERR_RANK;
  } else if ((count < 0) || (buf == nullptr && count > 0)) {
    retval = MPI_ERR_COUNT;
  } else if (datatype == MPI_DATATYPE_NULL || not datatype->is_valid()) {
    retval = MPI_ERR_TYPE;
  } else if (tag < 0 && tag != MPI_ANY_TAG) {
    retval = MPI_ERR_TAG;
  } else {
    int my_proc_id = simgrid::s4u::this_actor::get_pid();
    int dst_traced = getPid(comm, dst);
    TRACE_smpi_comm_in(my_proc_id, __func__,
                       new simgrid::instr::Pt2PtTIData("Ssend", dst,
                                                       datatype->is_replayable() ? count : count * datatype->size(),
                                                       tag, simgrid::smpi::Datatype::encode(datatype)));
    TRACE_smpi_send(my_proc_id, my_proc_id, dst_traced, tag, count * datatype->size());

    simgrid::smpi::Request::ssend(buf, count, datatype, dst, tag, comm);
    retval = MPI_SUCCESS;

    TRACE_smpi_comm_out(my_proc_id);
  }

  smpi_bench_begin();
  return retval;
}
int PMPI_Recv(void* buf, int count, MPI_Datatype datatype, int src, int tag, MPI_Comm comm, MPI_Status* status)
{
  int retval = 0;
  smpi_bench_end();

  if (comm == MPI_COMM_NULL) {
    retval = MPI_ERR_COMM;
  } else if (src == MPI_PROC_NULL) {
    if (status != MPI_STATUS_IGNORE) {
      simgrid::smpi::Status::empty(status);
      status->MPI_SOURCE = MPI_PROC_NULL;
    }
    retval = MPI_SUCCESS;
  } else if (src != MPI_ANY_SOURCE && (src >= comm->group()->size() || src < 0)) {
    retval = MPI_ERR_RANK;
  } else if ((count < 0) || (buf == nullptr && count > 0)) {
    retval = MPI_ERR_COUNT;
  } else if (datatype == MPI_DATATYPE_NULL || not datatype->is_valid()) {
    retval = MPI_ERR_TYPE;
  } else if (tag < 0 && tag != MPI_ANY_TAG) {
    retval = MPI_ERR_TAG;
  } else {
    int my_proc_id = simgrid::s4u::this_actor::get_pid();
    TRACE_smpi_comm_in(my_proc_id, __func__,
                       new simgrid::instr::Pt2PtTIData("recv", src,
                                                       datatype->is_replayable() ? count : count * datatype->size(),
                                                       tag, simgrid::smpi::Datatype::encode(datatype)));

    simgrid::smpi::Request::recv(buf, count, datatype, src, tag, comm, status);
    retval = MPI_SUCCESS;

    // the src may not have been known at the beginning of the recv (MPI_ANY_SOURCE)
    int src_traced = 0;
    if (status != MPI_STATUS_IGNORE)
      src_traced = getPid(comm, status->MPI_SOURCE);
    else
      src_traced = getPid(comm, src);
    if (not TRACE_smpi_view_internals()) {
      TRACE_smpi_recv(src_traced, my_proc_id, tag);
    }
    TRACE_smpi_comm_out(my_proc_id);
  }

  smpi_bench_begin();
  return retval;
}
int PMPI_Irecv(void* buf, int count, MPI_Datatype datatype, int src, int tag, MPI_Comm comm, MPI_Request* request)
{
  int retval = 0;
  smpi_bench_end();

  if (request == nullptr) {
    retval = MPI_ERR_ARG;
  } else if (comm == MPI_COMM_NULL) {
    retval = MPI_ERR_COMM;
  } else if (src == MPI_PROC_NULL) {
    *request = MPI_REQUEST_NULL;
    retval   = MPI_SUCCESS;
  } else if (src != MPI_ANY_SOURCE && (src >= comm->group()->size() || src < 0)) {
    retval = MPI_ERR_RANK;
  } else if ((count < 0) || (buf == nullptr && count > 0)) {
    retval = MPI_ERR_COUNT;
  } else if (datatype == MPI_DATATYPE_NULL || not datatype->is_valid()) {
    retval = MPI_ERR_TYPE;
  } else if (tag < 0 && tag != MPI_ANY_TAG) {
    retval = MPI_ERR_TAG;
  } else {
    int my_proc_id = simgrid::s4u::this_actor::get_pid();
    TRACE_smpi_comm_in(my_proc_id, __func__,
                       new simgrid::instr::Pt2PtTIData("irecv", src,
                                                       datatype->is_replayable() ? count : count * datatype->size(),
                                                       tag, simgrid::smpi::Datatype::encode(datatype)));

    *request = simgrid::smpi::Request::irecv(buf, count, datatype, src, tag, comm);
    retval   = MPI_SUCCESS;

    TRACE_smpi_comm_out(my_proc_id);
  }

  smpi_bench_begin();
  if (retval != MPI_SUCCESS && request != nullptr)
    *request = MPI_REQUEST_NULL;
  return retval;
}
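/* Illustrative sketch (added for exposition, not part of the original source):
 * a minimal MPI program exercising the wrappers above -- a ring exchange with
 * MPI_Sendrecv, plus a nonblocking MPI_Irecv completed by MPI_Wait. Under SMPI
 * it would be built with smpicc and launched with smpirun; it is guarded by
 * #if 0 so this translation unit is unaffected. */
#if 0
#include <mpi.h>
#include <stdio.h>

int main(int argc, char* argv[])
{
  MPI_Init(&argc, &argv);
  int rank, size;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &size);

  int right = (rank + 1) % size;
  int left  = (rank - 1 + size) % size;

  /* Shift a value around the ring: send to the right neighbor while
   * receiving from the left one, in a single deadlock-free call. */
  int sendval = rank;
  int recvval = -1;
  MPI_Sendrecv(&sendval, 1, MPI_INT, right, 0, &recvval, 1, MPI_INT, left, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);

  /* Same exchange with a nonblocking receive posted first. */
  int async = -1;
  MPI_Request req;
  MPI_Irecv(&async, 1, MPI_INT, left, 1, MPI_COMM_WORLD, &req);
  MPI_Send(&sendval, 1, MPI_INT, right, 1, MPI_COMM_WORLD);
  MPI_Wait(&req, MPI_STATUS_IGNORE);

  printf("rank %d received %d and %d from rank %d\n", rank, recvval, async, left);
  MPI_Finalize();
  return 0;
}
#endif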
int Coll_scatter_mvapich2_two_level_direct::scatter(void* sendbuf, int sendcnt, MPI_Datatype sendtype, void* recvbuf,
                                                    int recvcnt, MPI_Datatype recvtype, int root, MPI_Comm comm)
{
  int comm_size, rank;
  int local_rank, local_size;
  int leader_comm_rank = -1, leader_comm_size = -1;
  int mpi_errno = MPI_SUCCESS;
  int recvtype_size, sendtype_size, nbytes;
  void* tmp_buf            = NULL;
  void* leader_scatter_buf = NULL;
  MPI_Status status;
  int leader_root, leader_of_root = -1;
  MPI_Comm shmem_comm, leader_comm;

  // if not set (use of the algo directly, without mvapich2 selector)
  if (MV2_Scatter_intra_function == NULL)
    MV2_Scatter_intra_function = Coll_scatter_mpich::scatter;

  if (comm->get_leaders_comm() == MPI_COMM_NULL) {
    comm->init_smp();
  }
  comm_size = comm->size();
  rank      = comm->rank();

  if (((rank == root) && (recvcnt == 0)) || ((rank != root) && (sendcnt == 0))) {
    return MPI_SUCCESS;
  }

  /* extract the rank,size information for the intra-node communicator */
  shmem_comm = comm->get_intra_comm();
  local_rank = shmem_comm->rank();
  local_size = shmem_comm->size();

  if (local_rank == 0) {
    /* Node leader. Extract the rank, size information for the leader communicator */
    leader_comm      = comm->get_leaders_comm();
    leader_comm_size = leader_comm->size();
    leader_comm_rank = leader_comm->rank();
  }

  if (local_size == comm_size) {
    /* purely intra-node scatter. Just use the direct algorithm and we are done */
    mpi_errno = MPIR_Scatter_MV2_Direct(sendbuf, sendcnt, sendtype, recvbuf, recvcnt, recvtype, root, comm);
  } else {
    recvtype_size = recvtype->size();
    sendtype_size = sendtype->size();

    if (rank == root) {
      nbytes = sendcnt * sendtype_size;
    } else {
      nbytes = recvcnt * recvtype_size;
    }

    if (local_rank == 0) {
      /* Node leader, allocate tmp_buffer */
      tmp_buf = smpi_get_tmp_sendbuffer(nbytes * local_size);
    }

    leader_comm      = comm->get_leaders_comm();
    int* leaders_map = comm->get_leaders_map();
    leader_of_root   = comm->group()->rank(leaders_map[root]);
    leader_root      = leader_comm->group()->rank(leaders_map[root]);
    /* leader_root is the rank of the leader of the root in leader_comm.
     * leader_root is to be used as the root of the inter-leader gather ops */

    if ((local_rank == 0) && (root != rank) && (leader_of_root == rank)) {
      /* The root of the scatter operation is not the node leader.
       * Recv data from the node leader */
      leader_scatter_buf = smpi_get_tmp_sendbuffer(nbytes * comm_size);
      Request::recv(leader_scatter_buf, nbytes * comm_size, MPI_BYTE, root, COLL_TAG_SCATTER, comm, &status);
    }

    if (rank == root && local_rank != 0) {
      /* The root of the scatter operation is not the node leader.
       * Send data to the node leader */
      Request::send(sendbuf, sendcnt * comm_size, sendtype, leader_of_root, COLL_TAG_SCATTER, comm);
    }

    if (leader_comm_size > 1 && local_rank == 0) {
      if (not comm->is_uniform()) {
        int* displs   = NULL;
        int* sendcnts = NULL;
        int* node_sizes;
        int i      = 0;
        node_sizes = comm->get_non_uniform_map();

        if (root != leader_of_root) {
          if (leader_comm_rank == leader_root) {
            displs      = static_cast<int*>(xbt_malloc(sizeof(int) * leader_comm_size));
            sendcnts    = static_cast<int*>(xbt_malloc(sizeof(int) * leader_comm_size));
            sendcnts[0] = node_sizes[0] * nbytes;
            displs[0]   = 0;
            for (i = 1; i < leader_comm_size; i++) {
              displs[i]   = displs[i - 1] + node_sizes[i - 1] * nbytes;
              sendcnts[i] = node_sizes[i] * nbytes;
            }
          }
          Colls::scatterv(leader_scatter_buf, sendcnts, displs, MPI_BYTE, tmp_buf, nbytes * local_size, MPI_BYTE,
                          leader_root, leader_comm);
        } else {
          if (leader_comm_rank == leader_root) {
            displs      = static_cast<int*>(xbt_malloc(sizeof(int) * leader_comm_size));
            sendcnts    = static_cast<int*>(xbt_malloc(sizeof(int) * leader_comm_size));
            sendcnts[0] = node_sizes[0] * sendcnt;
            displs[0]   = 0;
            for (i = 1; i < leader_comm_size; i++) {
              displs[i]   = displs[i - 1] + node_sizes[i - 1] * sendcnt;
              sendcnts[i] = node_sizes[i] * sendcnt;
            }
          }
          Colls::scatterv(sendbuf, sendcnts, displs, sendtype, tmp_buf, nbytes * local_size, MPI_BYTE, leader_root,
                          leader_comm);
        }
        if (leader_comm_rank == leader_root) {
          xbt_free(displs);
          xbt_free(sendcnts);
        }
      } else {
        if (leader_of_root != root) {
          mpi_errno = MPIR_Scatter_MV2_Direct(leader_scatter_buf, nbytes * local_size, MPI_BYTE, tmp_buf,
                                              nbytes * local_size, MPI_BYTE, leader_root, leader_comm);
        } else {
          mpi_errno = MPIR_Scatter_MV2_Direct(sendbuf, sendcnt * local_size, sendtype, tmp_buf, nbytes * local_size,
                                              MPI_BYTE, leader_root, leader_comm);
        }
      }
    }

    /* The leaders are now done with the inter-leader part. Scatter the data within the nodes */
    if (rank == root && recvbuf == MPI_IN_PLACE) {
      mpi_errno =
          MV2_Scatter_intra_function(tmp_buf, nbytes, MPI_BYTE, (void*)sendbuf, sendcnt, sendtype, 0, shmem_comm);
    } else {
      mpi_errno = MV2_Scatter_intra_function(tmp_buf, nbytes, MPI_BYTE, recvbuf, recvcnt, recvtype, 0, shmem_comm);
    }
  }

  /* check if multiple threads are calling this collective function */
  if (comm_size != local_size && local_rank == 0) {
    smpi_free_tmp_buffer(tmp_buf);
    if (leader_of_root == rank && root != rank) {
      smpi_free_tmp_buffer(leader_scatter_buf);
    }
  }
  return (mpi_errno);
}
int Coll_reduce_mvapich2_two_level::reduce(const void* sendbuf, void* recvbuf, int count, MPI_Datatype datatype,
                                           MPI_Op op, int root, MPI_Comm comm)
{
  int mpi_errno = MPI_SUCCESS;
  int my_rank, total_size, local_rank, local_size;
  int leader_comm_rank = -1, leader_comm_size = 0;
  MPI_Comm shmem_comm, leader_comm;
  int leader_root, leader_of_root;
  const unsigned char* in_buf = nullptr;
  unsigned char *out_buf = nullptr, *tmp_buf = nullptr;
  MPI_Aint true_lb, true_extent, extent;
  int is_commutative = 0, stride = 0;
  int intra_node_root = 0;

  // if not set (use of the algo directly, without mvapich2 selector)
  if (MV2_Reduce_function == NULL)
    MV2_Reduce_function = Coll_reduce_mpich::reduce;
  if (MV2_Reduce_intra_function == NULL)
    MV2_Reduce_intra_function = Coll_reduce_mpich::reduce;

  if (comm->get_leaders_comm() == MPI_COMM_NULL) {
    comm->init_smp();
  }

  my_rank    = comm->rank();
  total_size = comm->size();
  shmem_comm = comm->get_intra_comm();
  local_rank = shmem_comm->rank();
  local_size = shmem_comm->size();

  leader_comm      = comm->get_leaders_comm();
  int* leaders_map = comm->get_leaders_map();
  leader_of_root   = comm->group()->rank(leaders_map[root]);
  leader_root      = leader_comm->group()->rank(leaders_map[root]);

  is_commutative = (op == MPI_OP_NULL || op->is_commutative());

  datatype->extent(&true_lb, &true_extent);
  extent = datatype->get_extent();
  stride = count * std::max(extent, true_extent);

  if (local_size == total_size) {
    /* First handle the case where there is only one node */
    if (stride <= MV2_INTRA_SHMEM_REDUCE_MSG && is_commutative == 1) {
      if (local_rank == 0) {
        tmp_buf = smpi_get_tmp_sendbuffer(count * std::max(extent, true_extent));
        tmp_buf = tmp_buf - true_lb;
      }

      if (sendbuf != MPI_IN_PLACE) {
        in_buf = static_cast<const unsigned char*>(sendbuf);
      } else {
        in_buf = static_cast<const unsigned char*>(recvbuf);
      }

      if (local_rank == 0) {
        if (my_rank != root) {
          out_buf = tmp_buf;
        } else {
          out_buf = static_cast<unsigned char*>(recvbuf);
          if (in_buf == out_buf) {
            in_buf  = static_cast<const unsigned char*>(MPI_IN_PLACE);
            out_buf = static_cast<unsigned char*>(recvbuf);
          }
        }
      } else {
        in_buf  = static_cast<const unsigned char*>(sendbuf);
        out_buf = nullptr;
      }

      if (count * (std::max(extent, true_extent)) < SHMEM_COLL_BLOCK_SIZE) {
        mpi_errno = MPIR_Reduce_shmem_MV2(in_buf, out_buf, count, datatype, op, 0, shmem_comm);
      } else {
        mpi_errno = MPIR_Reduce_intra_knomial_wrapper_MV2(in_buf, out_buf, count, datatype, op, 0, shmem_comm);
      }

      if (local_rank == 0 && root != my_rank) {
        Request::send(out_buf, count, datatype, root, COLL_TAG_REDUCE + 1, comm);
      }
      if ((local_rank != 0) && (root == my_rank)) {
        Request::recv(recvbuf, count, datatype, leader_of_root, COLL_TAG_REDUCE + 1, comm, MPI_STATUS_IGNORE);
      }
    } else {
      if (mv2_use_knomial_reduce == 1) {
        reduce_fn = &MPIR_Reduce_intra_knomial_wrapper_MV2;
      } else {
        reduce_fn = &MPIR_Reduce_binomial_MV2;
      }
      mpi_errno = reduce_fn(sendbuf, recvbuf, count, datatype, op, root, comm);
    }
    /* We are done */
    if (tmp_buf != nullptr)
      smpi_free_tmp_buffer(tmp_buf + true_lb);
    goto fn_exit;
  }

  if (local_rank == 0) {
    leader_comm = comm->get_leaders_comm();
    if (leader_comm == MPI_COMM_NULL) {
      leader_comm = MPI_COMM_WORLD;
    }
    leader_comm_size = leader_comm->size();
    leader_comm_rank = leader_comm->rank();
    tmp_buf          = smpi_get_tmp_sendbuffer(count * std::max(extent, true_extent));
    tmp_buf          = tmp_buf - true_lb;
  }

  if (sendbuf != MPI_IN_PLACE) {
    in_buf = static_cast<const unsigned char*>(sendbuf);
  } else {
    in_buf = static_cast<const unsigned char*>(recvbuf);
  }
  if (local_rank == 0) {
    out_buf = static_cast<unsigned char*>(tmp_buf);
  } else {
    out_buf = nullptr;
  }

  if (local_size > 1) {
    /* Lets do the intra-node reduce operations, if we have more than one
     * process in the node */
    /* Fix the input and outbuf buffers for the intra-node reduce.
     * Node leaders will have the reduced data in tmp_buf after this step */
    if (MV2_Reduce_intra_function == &MPIR_Reduce_shmem_MV2) {
      if (is_commutative == 1 && (count * (std::max(extent, true_extent)) < SHMEM_COLL_BLOCK_SIZE)) {
        mpi_errno = MV2_Reduce_intra_function(in_buf, out_buf, count, datatype, op, intra_node_root, shmem_comm);
      } else {
        mpi_errno =
            MPIR_Reduce_intra_knomial_wrapper_MV2(in_buf, out_buf, count, datatype, op, intra_node_root, shmem_comm);
      }
    } else {
      mpi_errno = MV2_Reduce_intra_function(in_buf, out_buf, count, datatype, op, intra_node_root, shmem_comm);
    }
  } else {
    smpi_free_tmp_buffer(tmp_buf + true_lb);
    tmp_buf = (unsigned char*)in_buf; // xxx
  }

  /* Now work on the inter-leader phase. Data is in tmp_buf */
  if (local_rank == 0 && leader_comm_size > 1) {
    /* The leader of root will have the global reduced data in tmp_buf
     * or recv_buf at the end of the reduce */
    if (leader_comm_rank == leader_root) {
      if (my_rank == root) {
        /* I am the root of the leader-comm, and the root of the reduce op.
         * So, I will write the final result directly into my recvbuf */
        if (tmp_buf != recvbuf) {
          in_buf  = tmp_buf;
          out_buf = static_cast<unsigned char*>(recvbuf);
        } else {
          unsigned char* buf = smpi_get_tmp_sendbuffer(count * datatype->get_extent());
          Datatype::copy(tmp_buf, count, datatype, buf, count, datatype);
          // in_buf = MPI_IN_PLACE;
          in_buf  = buf;
          out_buf = static_cast<unsigned char*>(recvbuf);
        }
      } else {
        unsigned char* buf = smpi_get_tmp_sendbuffer(count * datatype->get_extent());
        Datatype::copy(tmp_buf, count, datatype, buf, count, datatype);
        // in_buf = MPI_IN_PLACE;
        in_buf  = buf;
        out_buf = tmp_buf;
      }
    } else {
      in_buf  = tmp_buf;
      out_buf = nullptr;
    }

    /* inter-leader communication */
    mpi_errno = MV2_Reduce_function(in_buf, out_buf, count, datatype, op, leader_root, leader_comm);
  }

  if (local_size > 1) {
    /* Send the message to the root if the leader is not the root of the
     * reduce operation. The reduced data is in tmp_buf */
    if ((local_rank == 0) && (root != my_rank) && (leader_root == leader_comm_rank)) {
      Request::send(tmp_buf, count, datatype, root, COLL_TAG_REDUCE + 1, comm);
    }
    if ((local_rank != 0) && (root == my_rank)) {
      Request::recv(recvbuf, count, datatype, leader_of_root, COLL_TAG_REDUCE + 1, comm, MPI_STATUS_IGNORE);
    }
    smpi_free_tmp_buffer(tmp_buf + true_lb);

    if (leader_comm_rank == leader_root) {
      if (my_rank != root || (my_rank == root && tmp_buf == recvbuf)) {
        smpi_free_tmp_buffer(in_buf);
      }
    }
  }

fn_exit:
  return mpi_errno;
}
static int getPid(MPI_Comm comm, int id)
{
  simgrid::s4u::ActorPtr actor = comm->group()->actor(id);
  return (actor == nullptr) ? MPI_UNDEFINED : actor->get_pid();
}
int Coll_bcast_mvapich2_inter_node::bcast(void* buffer, int count, MPI_Datatype datatype, int root, MPI_Comm comm)
{
  int rank;
  int mpi_errno = MPI_SUCCESS;
  MPI_Comm shmem_comm, leader_comm;
  int local_rank, local_size, global_rank = -1;
  int leader_root, leader_of_root;

  rank = comm->rank();
  // comm_size = comm->size();

  if (MV2_Bcast_function == NULL) {
    MV2_Bcast_function = Coll_bcast_mpich::bcast;
  }
  if (MV2_Bcast_intra_node_function == NULL) {
    MV2_Bcast_intra_node_function = Coll_bcast_mpich::bcast;
  }

  if (comm->get_leaders_comm() == MPI_COMM_NULL) {
    comm->init_smp();
  }

  shmem_comm = comm->get_intra_comm();
  local_rank = shmem_comm->rank();
  local_size = shmem_comm->size();

  leader_comm = comm->get_leaders_comm();

  if ((local_rank == 0) && (local_size > 1)) {
    global_rank = leader_comm->rank();
  }

  int* leaders_map = comm->get_leaders_map();
  leader_of_root   = comm->group()->rank(leaders_map[root]);
  leader_root      = leader_comm->group()->rank(leaders_map[root]);

  if (local_size > 1) {
    if ((local_rank == 0) && (root != rank) && (leader_root == global_rank)) {
      Request::recv(buffer, count, datatype, root, COLL_TAG_BCAST, comm, MPI_STATUS_IGNORE);
    }
    if ((local_rank != 0) && (root == rank)) {
      Request::send(buffer, count, datatype, leader_of_root, COLL_TAG_BCAST, comm);
    }
  }

#if defined(_MCST_SUPPORT_)
  if (comm_ptr->ch.is_mcast_ok) {
    mpi_errno = MPIR_Mcast_inter_node_MV2(buffer, count, datatype, root, comm_ptr, errflag);
    if (mpi_errno == MPI_SUCCESS) {
      goto fn_exit;
    } else {
      goto fn_fail;
    }
  }
#endif
  /*
  if (local_rank == 0) {
      leader_comm = comm->get_leaders_comm();
      root = leader_root;
  }

  if (MV2_Bcast_function == &MPIR_Pipelined_Bcast_MV2) {
      mpi_errno = MPIR_Pipelined_Bcast_MV2(buffer, count, datatype, root, comm);
  } else if (MV2_Bcast_function == &MPIR_Bcast_scatter_ring_allgather_shm_MV2) {
      mpi_errno = MPIR_Bcast_scatter_ring_allgather_shm_MV2(buffer, count, datatype, root, comm);
  } else */
  {
    if (local_rank == 0) {
      /*
      if (MV2_Bcast_function == &MPIR_Knomial_Bcast_inter_node_wrapper_MV2) {
          mpi_errno = MPIR_Knomial_Bcast_inter_node_wrapper_MV2(buffer, count, datatype, root, comm);
      } else {
      */
      mpi_errno = MV2_Bcast_function(buffer, count, datatype, leader_root, leader_comm);
      // }
    }
  }

  return mpi_errno;
}
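/* Usage note (an assumption added for exposition, not from the original file):
 * these mvapich2-derived two-level algorithms are normally reached through
 * SimGrid's collective selector at run time rather than called directly,
 * e.g. with something like
 *   smpirun --cfg=smpi/coll-selector:mvapich2 ... ./my_app
 * or a per-collective override (--cfg=smpi/bcast:..., --cfg=smpi/reduce:...).
 * The exact set of accepted option values depends on the SimGrid version;
 * see the SMPI documentation for the list of available algorithms. */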