int Coll_gather_ompi::gather(const void *sbuf, int scount, MPI_Datatype sdtype, void* rbuf, int rcount, MPI_Datatype rdtype, int root, MPI_Comm comm ) { //const int large_segment_size = 32768; //const int small_segment_size = 1024; //const size_t large_block_size = 92160; const size_t intermediate_block_size = 6000; const size_t small_block_size = 1024; const int large_communicator_size = 60; const int small_communicator_size = 10; int communicator_size, rank; size_t dsize, block_size; XBT_DEBUG("smpi_coll_tuned_gather_ompi"); communicator_size = comm->size(); rank = comm->rank(); // Determine block size if (rank == root) { dsize = rdtype->size(); block_size = dsize * rcount; } else { dsize = sdtype->size(); block_size = dsize * scount; } /* if (block_size > large_block_size) {*/ /* return smpi_coll_tuned_gather_ompi_linear_sync (sbuf, scount, sdtype, */ /* rbuf, rcount, rdtype, */ /* root, comm);*/ /* } else*/ if (block_size > intermediate_block_size) { return Coll_gather_ompi_linear_sync::gather (sbuf, scount, sdtype, rbuf, rcount, rdtype, root, comm); } else if ((communicator_size > large_communicator_size) || ((communicator_size > small_communicator_size) && (block_size < small_block_size))) { return Coll_gather_ompi_binomial::gather (sbuf, scount, sdtype, rbuf, rcount, rdtype, root, comm); } // Otherwise, use basic linear return Coll_gather_ompi_basic_linear::gather (sbuf, scount, sdtype, rbuf, rcount, rdtype, root, comm); }
int PMPI_Ssend(const void* buf, int count, MPI_Datatype datatype, int dst, int tag, MPI_Comm comm) { int retval = 0; smpi_bench_end(); if (comm == MPI_COMM_NULL) { retval = MPI_ERR_COMM; } else if (dst == MPI_PROC_NULL) { retval = MPI_SUCCESS; } else if (dst >= comm->group()->size() || dst <0){ retval = MPI_ERR_RANK; } else if ((count < 0) || (buf==nullptr && count > 0)) { retval = MPI_ERR_COUNT; } else if (datatype==MPI_DATATYPE_NULL || not datatype->is_valid()) { retval = MPI_ERR_TYPE; } else if(tag<0 && tag != MPI_ANY_TAG){ retval = MPI_ERR_TAG; } else { int my_proc_id = simgrid::s4u::this_actor::get_pid(); int dst_traced = getPid(comm, dst); TRACE_smpi_comm_in(my_proc_id, __func__, new simgrid::instr::Pt2PtTIData("Ssend", dst, datatype->is_replayable() ? count : count * datatype->size(), tag, simgrid::smpi::Datatype::encode(datatype))); TRACE_smpi_send(my_proc_id, my_proc_id, dst_traced, tag, count * datatype->size()); simgrid::smpi::Request::ssend(buf, count, datatype, dst, tag, comm); retval = MPI_SUCCESS; TRACE_smpi_comm_out(my_proc_id); } smpi_bench_begin(); return retval; }
int PMPI_Sendrecv(const void* sendbuf, int sendcount, MPI_Datatype sendtype, int dst, int sendtag, void* recvbuf, int recvcount, MPI_Datatype recvtype, int src, int recvtag, MPI_Comm comm, MPI_Status* status) { int retval = 0; smpi_bench_end(); if (comm == MPI_COMM_NULL) { retval = MPI_ERR_COMM; } else if (not sendtype->is_valid() || not recvtype->is_valid()) { retval = MPI_ERR_TYPE; } else if (src == MPI_PROC_NULL) { if(status!=MPI_STATUS_IGNORE){ simgrid::smpi::Status::empty(status); status->MPI_SOURCE = MPI_PROC_NULL; } if(dst != MPI_PROC_NULL) simgrid::smpi::Request::send(sendbuf, sendcount, sendtype, dst, sendtag, comm); retval = MPI_SUCCESS; }else if (dst == MPI_PROC_NULL){ simgrid::smpi::Request::recv(recvbuf, recvcount, recvtype, src, recvtag, comm, status); retval = MPI_SUCCESS; }else if (dst >= comm->group()->size() || dst <0 || (src!=MPI_ANY_SOURCE && (src >= comm->group()->size() || src <0))){ retval = MPI_ERR_RANK; } else if ((sendcount < 0 || recvcount<0) || (sendbuf==nullptr && sendcount > 0) || (recvbuf==nullptr && recvcount>0)) { retval = MPI_ERR_COUNT; } else if((sendtag<0 && sendtag != MPI_ANY_TAG)||(recvtag<0 && recvtag != MPI_ANY_TAG)){ retval = MPI_ERR_TAG; } else { int my_proc_id = simgrid::s4u::this_actor::get_pid(); int dst_traced = getPid(comm, dst); int src_traced = getPid(comm, src); // FIXME: Hack the way to trace this one std::vector<int>* dst_hack = new std::vector<int>; std::vector<int>* src_hack = new std::vector<int>; dst_hack->push_back(dst_traced); src_hack->push_back(src_traced); TRACE_smpi_comm_in(my_proc_id, __func__, new simgrid::instr::VarCollTIData( "sendRecv", -1, sendtype->is_replayable() ? sendcount : sendcount * sendtype->size(), dst_hack, recvtype->is_replayable() ? 
recvcount : recvcount * recvtype->size(), src_hack, simgrid::smpi::Datatype::encode(sendtype), simgrid::smpi::Datatype::encode(recvtype))); TRACE_smpi_send(my_proc_id, my_proc_id, dst_traced, sendtag, sendcount * sendtype->size()); simgrid::smpi::Request::sendrecv(sendbuf, sendcount, sendtype, dst, sendtag, recvbuf, recvcount, recvtype, src, recvtag, comm, status); retval = MPI_SUCCESS; TRACE_smpi_recv(src_traced, my_proc_id, recvtag); TRACE_smpi_comm_out(my_proc_id); } smpi_bench_begin(); return retval; }
/**
 * PMPI entry point for MPI_Type_size_x: stores the size of @p datatype into
 * @p size as an MPI_Count.
 *
 * @return MPI_ERR_TYPE for MPI_DATATYPE_NULL, MPI_ERR_ARG when @p size is
 *         null, MPI_SUCCESS otherwise.
 */
int PMPI_Type_size_x(MPI_Datatype datatype, MPI_Count *size)
{
  if (datatype == MPI_DATATYPE_NULL)
    return MPI_ERR_TYPE;

  if (size == nullptr)
    return MPI_ERR_ARG;

  *size = static_cast<MPI_Count>(datatype->size());
  return MPI_SUCCESS;
}
int PMPI_Pack_size(int incount, MPI_Datatype datatype, MPI_Comm comm, int* size) { if(incount<0){ return MPI_ERR_COUNT; } else if (datatype == MPI_DATATYPE_NULL || not datatype->is_valid()){ return MPI_ERR_TYPE; } else if(comm==MPI_COMM_NULL){ return MPI_ERR_COMM; } else { *size=incount*datatype->size(); return MPI_SUCCESS; } }
int Coll_reduce_scatter_ompi::reduce_scatter(const void *sbuf, void *rbuf, const int *rcounts, MPI_Datatype dtype, MPI_Op op, MPI_Comm comm ) { int comm_size, i, pow2; size_t total_message_size, dsize; const double a = 0.0012; const double b = 8.0; const size_t small_message_size = 12 * 1024; const size_t large_message_size = 256 * 1024; int zerocounts = 0; XBT_DEBUG("Coll_reduce_scatter_ompi::reduce_scatter"); comm_size = comm->size(); // We need data size for decision function dsize=dtype->size(); total_message_size = 0; for (i = 0; i < comm_size; i++) { total_message_size += rcounts[i]; if (0 == rcounts[i]) { zerocounts = 1; } } if (((op != MPI_OP_NULL) && not op->is_commutative()) || (zerocounts)) { Coll_reduce_scatter_default::reduce_scatter(sbuf, rbuf, rcounts, dtype, op, comm); return MPI_SUCCESS; } total_message_size *= dsize; // compute the nearest power of 2 for (pow2 = 1; pow2 < comm_size; pow2 <<= 1); if ((total_message_size <= small_message_size) || ((total_message_size <= large_message_size) && (pow2 == comm_size)) || (comm_size >= a * total_message_size + b)) { return Coll_reduce_scatter_ompi_basic_recursivehalving::reduce_scatter(sbuf, rbuf, rcounts, dtype, op, comm); } return Coll_reduce_scatter_ompi_ring::reduce_scatter(sbuf, rbuf, rcounts, dtype, op, comm); }
int Coll_scatter_ompi::scatter(const void *sbuf, int scount, MPI_Datatype sdtype, void* rbuf, int rcount, MPI_Datatype rdtype, int root, MPI_Comm comm ) { const size_t small_block_size = 300; const int small_comm_size = 10; int communicator_size, rank; size_t dsize, block_size; XBT_DEBUG("Coll_scatter_ompi::scatter"); communicator_size = comm->size(); rank = comm->rank(); // Determine block size if (root == rank) { dsize=sdtype->size(); block_size = dsize * scount; } else { dsize=rdtype->size(); block_size = dsize * rcount; } if ((communicator_size > small_comm_size) && (block_size < small_block_size)) { std::unique_ptr<unsigned char[]> tmp_buf; if (rank != root) { tmp_buf.reset(new unsigned char[rcount * rdtype->get_extent()]); sbuf = tmp_buf.get(); scount = rcount; sdtype = rdtype; } return Coll_scatter_ompi_binomial::scatter(sbuf, scount, sdtype, rbuf, rcount, rdtype, root, comm); } return Coll_scatter_ompi_basic_linear::scatter (sbuf, scount, sdtype, rbuf, rcount, rdtype, root, comm); }
int Coll_allgatherv_ompi::allgatherv(const void *sbuf, int scount, MPI_Datatype sdtype, void* rbuf, const int *rcounts, const int *rdispls, MPI_Datatype rdtype, MPI_Comm comm ) { int i; int communicator_size; size_t dsize, total_dsize; communicator_size = comm->size(); /* Special case for 2 processes */ if (communicator_size == 2) { return Coll_allgatherv_pair::allgatherv(sbuf, scount, sdtype, rbuf, rcounts, rdispls, rdtype, comm); } /* Determine complete data size */ dsize=sdtype->size(); total_dsize = 0; for (i = 0; i < communicator_size; i++) { total_dsize += dsize * rcounts[i]; } /* Decision based on allgather decision. */ if (total_dsize < 50000) { return Coll_allgatherv_ompi_bruck::allgatherv(sbuf, scount, sdtype, rbuf, rcounts, rdispls, rdtype, comm); } else { if (communicator_size % 2) { return Coll_allgatherv_ring::allgatherv(sbuf, scount, sdtype, rbuf, rcounts, rdispls, rdtype, comm); } else { return Coll_allgatherv_ompi_neighborexchange::allgatherv(sbuf, scount, sdtype, rbuf, rcounts, rdispls, rdtype, comm); } } }
int PMPI_Rput(const void *origin_addr, int origin_count, MPI_Datatype origin_datatype, int target_rank, MPI_Aint target_disp, int target_count, MPI_Datatype target_datatype, MPI_Win win, MPI_Request* request){ int retval = 0; smpi_bench_end(); if (win == MPI_WIN_NULL) { retval = MPI_ERR_WIN; } else if (target_rank == MPI_PROC_NULL) { *request = MPI_REQUEST_NULL; retval = MPI_SUCCESS; } else if (target_rank <0){ retval = MPI_ERR_RANK; } else if (win->dynamic()==0 && target_disp <0){ //in case of dynamic window, target_disp can be mistakenly seen as negative, as it is an address retval = MPI_ERR_ARG; } else if ((origin_count < 0 || target_count < 0) || (origin_addr==nullptr && origin_count > 0)){ retval = MPI_ERR_COUNT; } else if (((origin_datatype == MPI_DATATYPE_NULL) || (target_datatype == MPI_DATATYPE_NULL)) || ((not origin_datatype->is_valid()) || (not target_datatype->is_valid()))) { retval = MPI_ERR_TYPE; } else if(request == nullptr){ retval = MPI_ERR_REQUEST; } else { int my_proc_id = simgrid::s4u::this_actor::get_pid(); MPI_Group group; win->get_group(&group); int dst_traced = group->actor(target_rank)->get_pid(); TRACE_smpi_comm_in(my_proc_id, __func__, new simgrid::instr::Pt2PtTIData( "Rput", target_rank, origin_datatype->is_replayable() ? origin_count : origin_count * origin_datatype->size(), simgrid::smpi::Datatype::encode(origin_datatype))); TRACE_smpi_send(my_proc_id, my_proc_id, dst_traced, SMPI_RMA_TAG, origin_count * origin_datatype->size()); retval = win->put( origin_addr, origin_count, origin_datatype, target_rank, target_disp, target_count, target_datatype, request); TRACE_smpi_comm_out(my_proc_id); } smpi_bench_begin(); return retval; }
int Coll_reduce_mvapich2::reduce(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, MPI_Op op, int root, MPI_Comm comm) { if(mv2_reduce_thresholds_table == NULL) init_mv2_reduce_tables_stampede(); int mpi_errno = MPI_SUCCESS; int range = 0; int range_threshold = 0; int range_intra_threshold = 0; int is_commutative, pof2; int comm_size = 0; long nbytes = 0; int sendtype_size; int is_two_level = 0; comm_size = comm->size(); sendtype_size=datatype->size(); nbytes = count * sendtype_size; if (count == 0) return MPI_SUCCESS; is_commutative = (op==MPI_OP_NULL || op->is_commutative()); /* find nearest power-of-two less than or equal to comm_size */ for( pof2 = 1; pof2 <= comm_size; pof2 <<= 1 ); pof2 >>=1; /* Search for the corresponding system size inside the tuning table */ while ((range < (mv2_size_reduce_tuning_table - 1)) && (comm_size > mv2_reduce_thresholds_table[range].numproc)) { range++; } /* Search for corresponding inter-leader function */ while ((range_threshold < (mv2_reduce_thresholds_table[range].size_inter_table - 1)) && (nbytes > mv2_reduce_thresholds_table[range].inter_leader[range_threshold].max) && (mv2_reduce_thresholds_table[range].inter_leader[range_threshold].max != -1)) { range_threshold++; } /* Search for corresponding intra node function */ while ((range_intra_threshold < (mv2_reduce_thresholds_table[range].size_intra_table - 1)) && (nbytes > mv2_reduce_thresholds_table[range].intra_node[range_intra_threshold].max) && (mv2_reduce_thresholds_table[range].intra_node[range_intra_threshold].max != -1)) { range_intra_threshold++; } /* Set intra-node function pt for reduce_two_level */ MV2_Reduce_intra_function = mv2_reduce_thresholds_table[range].intra_node[range_intra_threshold]. MV2_pt_Reduce_function; /* Set inter-leader pt */ MV2_Reduce_function = mv2_reduce_thresholds_table[range].inter_leader[range_threshold]. 
MV2_pt_Reduce_function; if(mv2_reduce_intra_knomial_factor<0) { mv2_reduce_intra_knomial_factor = mv2_reduce_thresholds_table[range].intra_k_degree; } if(mv2_reduce_inter_knomial_factor<0) { mv2_reduce_inter_knomial_factor = mv2_reduce_thresholds_table[range].inter_k_degree; } if(mv2_reduce_thresholds_table[range].is_two_level_reduce[range_threshold] == 1){ is_two_level = 1; } /* We call Reduce function */ if(is_two_level == 1) { if (is_commutative == 1) { if(comm->get_leaders_comm()==MPI_COMM_NULL){ comm->init_smp(); } mpi_errno = MPIR_Reduce_two_level_helper_MV2(sendbuf, recvbuf, count, datatype, op, root, comm); } else { mpi_errno = MPIR_Reduce_binomial_MV2(sendbuf, recvbuf, count, datatype, op, root, comm); } } else if(MV2_Reduce_function == &MPIR_Reduce_inter_knomial_wrapper_MV2 ){ if(is_commutative ==1) { mpi_errno = MV2_Reduce_function(sendbuf, recvbuf, count, datatype, op, root, comm); } else { mpi_errno = MPIR_Reduce_binomial_MV2(sendbuf, recvbuf, count, datatype, op, root, comm); } } else if(MV2_Reduce_function == &MPIR_Reduce_redscat_gather_MV2){ if (/*(HANDLE_GET_KIND(op) == HANDLE_KIND_BUILTIN) &&*/ (count >= pof2)) { mpi_errno = MV2_Reduce_function(sendbuf, recvbuf, count, datatype, op, root, comm); } else { mpi_errno = MPIR_Reduce_binomial_MV2(sendbuf, recvbuf, count, datatype, op, root, comm); } } else { mpi_errno = MV2_Reduce_function(sendbuf, recvbuf, count, datatype, op, root, comm); } return mpi_errno; }
/* * gather_intra_linear_sync * * Function: - synchronized gather operation with * Accepts: - same arguments as MPI_Gather(), first segment size * Returns: - MPI_SUCCESS or error code */ int Coll_gather_ompi_linear_sync::gather(void *sbuf, int scount, MPI_Datatype sdtype, void *rbuf, int rcount, MPI_Datatype rdtype, int root, MPI_Comm comm) { int i; int ret, line; int rank, size; int first_segment_count; size_t typelng; MPI_Aint extent; MPI_Aint lb; int first_segment_size=0; size = comm->size(); rank = comm->rank(); size_t dsize, block_size; if (rank == root) { dsize= rdtype->size(); block_size = dsize * rcount; } else { dsize=sdtype->size(); block_size = dsize * scount; } if (block_size > 92160){ first_segment_size = 32768; }else{ first_segment_size = 1024; } XBT_DEBUG("smpi_coll_tuned_gather_ompi_linear_sync rank %d, segment %d", rank, first_segment_size); if (rank != root) { /* Non-root processes: - receive zero byte message from the root, - send the first segment of the data synchronously, - send the second segment of the data. */ typelng = sdtype->size(); sdtype->extent(&lb, &extent); first_segment_count = scount; COLL_TUNED_COMPUTED_SEGCOUNT((size_t)first_segment_size, typelng, first_segment_count); Request::recv(sbuf, 0, MPI_BYTE, root, COLL_TAG_GATHER, comm, MPI_STATUS_IGNORE); Request::send(sbuf, first_segment_count, sdtype, root, COLL_TAG_GATHER, comm); Request::send((char*)sbuf + extent * first_segment_count, (scount - first_segment_count), sdtype, root, COLL_TAG_GATHER, comm); } else { /* Root process, - For every non-root node: - post irecv for the first segment of the message - send zero byte message to signal node to send the message - post irecv for the second segment of the message - wait for the first segment to complete - Copy local data if necessary - Waitall for all the second segments to complete. 
*/ char* ptmp; MPI_Request first_segment_req; MPI_Request* reqs = new (std::nothrow) MPI_Request[size]; if (NULL == reqs) { ret = -1; line = __LINE__; goto error_hndl; } typelng=rdtype->size(); rdtype->extent(&lb, &extent); first_segment_count = rcount; COLL_TUNED_COMPUTED_SEGCOUNT( (size_t)first_segment_size, typelng, first_segment_count ); for (i = 0; i < size; ++i) { if (i == rank) { /* skip myself */ reqs[i] = MPI_REQUEST_NULL; continue; } /* irecv for the first segment from i */ ptmp = (char*)rbuf + i * rcount * extent; first_segment_req = Request::irecv(ptmp, first_segment_count, rdtype, i, COLL_TAG_GATHER, comm ); /* send sync message */ Request::send(rbuf, 0, MPI_BYTE, i, COLL_TAG_GATHER, comm); /* irecv for the second segment */ ptmp = (char*)rbuf + (i * rcount + first_segment_count) * extent; reqs[i]=Request::irecv(ptmp, (rcount - first_segment_count), rdtype, i, COLL_TAG_GATHER, comm ); /* wait on the first segment to complete */ Request::wait(&first_segment_req, MPI_STATUS_IGNORE); } /* copy local data if necessary */ if (MPI_IN_PLACE != sbuf) { ret = Datatype::copy(sbuf, scount, sdtype, (char*)rbuf + rank * rcount * extent, rcount, rdtype); if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } } /* wait all second segments to complete */ ret = Request::waitall(size, reqs, MPI_STATUSES_IGNORE); if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } delete[] reqs; } /* All done */ return MPI_SUCCESS; error_hndl: XBT_DEBUG( "ERROR_HNDL: node %d file %s line %d error %d\n", rank, __FILE__, line, ret ); return ret; }
int Coll_allgather_ompi::allgather(const void *sbuf, int scount, MPI_Datatype sdtype, void* rbuf, int rcount, MPI_Datatype rdtype, MPI_Comm comm ) { int communicator_size, pow2_size; size_t dsize, total_dsize; communicator_size = comm->size(); /* Special case for 2 processes */ if (communicator_size == 2) { return Coll_allgather_pair::allgather (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm/*, module*/); } /* Determine complete data size */ dsize=sdtype->size(); total_dsize = dsize * scount * communicator_size; for (pow2_size = 1; pow2_size < communicator_size; pow2_size <<=1); /* Decision based on MX 2Gb results from Grig cluster at The University of Tennesse, Knoxville - if total message size is less than 50KB use either bruck or recursive doubling for non-power of two and power of two nodes, respectively. - else use ring and neighbor exchange algorithms for odd and even number of nodes, respectively. */ if (total_dsize < 50000) { if (pow2_size == communicator_size) { return Coll_allgather_rdb::allgather(sbuf, scount, sdtype, rbuf, rcount, rdtype, comm); } else { return Coll_allgather_bruck::allgather(sbuf, scount, sdtype, rbuf, rcount, rdtype, comm); } } else { if (communicator_size % 2) { return Coll_allgather_ring::allgather(sbuf, scount, sdtype, rbuf, rcount, rdtype, comm); } else { return Coll_allgather_ompi_neighborexchange::allgather(sbuf, scount, sdtype, rbuf, rcount, rdtype, comm); } } #if defined(USE_MPICH2_DECISION) /* Decision as in MPICH-2 presented in Thakur et.al. "Optimization of Collective Communication Operations in MPICH", International Journal of High Performance Computing Applications, Vol. 19, No. 1, 49-66 (2005) - for power-of-two processes and small and medium size messages (up to 512KB) use recursive doubling - for non-power-of-two processes and small messages (80KB) use bruck, - for everything else use ring. 
*/ if ((pow2_size == communicator_size) && (total_dsize < 524288)) { return Coll_allgather_rdb::allgather(sbuf, scount, sdtype, rbuf, rcount, rdtype, comm); } else if (total_dsize <= 81920) { return Coll_allgather_bruck::allgather(sbuf, scount, sdtype, rbuf, rcount, rdtype, comm); } return Coll_allgather_ring::allgather(sbuf, scount, sdtype, rbuf, rcount, rdtype, comm); #endif /* defined(USE_MPICH2_DECISION) */ }
int Coll_bcast_mvapich2_intra_node::bcast(void *buffer, int count, MPI_Datatype datatype, int root, MPI_Comm comm) { int mpi_errno = MPI_SUCCESS; int comm_size; int two_level_bcast = 1; size_t nbytes = 0; int is_homogeneous, is_contig; MPI_Aint type_size; unsigned char* tmp_buf = nullptr; MPI_Comm shmem_comm; if (count == 0) return MPI_SUCCESS; if (MV2_Bcast_function==NULL){ MV2_Bcast_function=Coll_bcast_mpich::bcast; } if (MV2_Bcast_intra_node_function==NULL){ MV2_Bcast_intra_node_function= Coll_bcast_mpich::bcast; } if(comm->get_leaders_comm()==MPI_COMM_NULL){ comm->init_smp(); } comm_size = comm->size(); // rank = comm->rank(); /* if (HANDLE_GET_KIND(datatype) == HANDLE_KIND_BUILTIN)*/ is_contig = 1; /* else { MPID_Datatype_get_ptr(datatype, dtp); is_contig = dtp->is_contig; } */ is_homogeneous = 1; #ifdef MPID_HAS_HETERO if (comm_ptr->is_hetero) is_homogeneous = 0; #endif /* MPI_Type_size() might not give the accurate size of the packed * datatype for heterogeneous systems (because of padding, encoding, * etc). On the other hand, MPI_Pack_size() can become very * expensive, depending on the implementation, especially for * heterogeneous systems. We want to use MPI_Type_size() wherever * possible, and MPI_Pack_size() in other places. 
*/ //if (is_homogeneous) { type_size=datatype->size(); //} /* else {*/ /* MPIR_Pack_size_impl(1, datatype, &type_size);*/ /* }*/ nbytes = (size_t) (count) * (type_size); if (comm_size <= mv2_bcast_two_level_system_size) { if (nbytes > mv2_bcast_short_msg && nbytes < mv2_bcast_large_msg) { two_level_bcast = 1; } else { two_level_bcast = 0; } } if (two_level_bcast == 1 #if defined(_MCST_SUPPORT_) || comm_ptr->ch.is_mcast_ok #endif ) { if (not is_contig || not is_homogeneous) { tmp_buf = smpi_get_tmp_sendbuffer(nbytes); /* TODO: Pipeline the packing and communication */ // position = 0; /* if (rank == root) {*/ /* mpi_errno =*/ /* MPIR_Pack_impl(buffer, count, datatype, tmp_buf, nbytes, &position);*/ /* if (mpi_errno)*/ /* MPIU_ERR_POP(mpi_errno);*/ /* }*/ } shmem_comm = comm->get_intra_comm(); if (not is_contig || not is_homogeneous) { mpi_errno = MPIR_Bcast_inter_node_helper_MV2(tmp_buf, nbytes, MPI_BYTE, root, comm); } else { mpi_errno = MPIR_Bcast_inter_node_helper_MV2(buffer, count, datatype, root, comm); } /* We are now done with the inter-node phase */ if (nbytes <= mv2_knomial_intra_node_threshold) { if (not is_contig || not is_homogeneous) { mpi_errno = MPIR_Shmem_Bcast_MV2(tmp_buf, nbytes, MPI_BYTE, root, shmem_comm); } else { mpi_errno = MPIR_Shmem_Bcast_MV2(buffer, count, datatype, root, shmem_comm); } } else { if (not is_contig || not is_homogeneous) { mpi_errno = MPIR_Knomial_Bcast_intra_node_MV2(tmp_buf, nbytes, MPI_BYTE, INTRA_NODE_ROOT, shmem_comm); } else { mpi_errno = MPIR_Knomial_Bcast_intra_node_MV2(buffer, count, datatype, INTRA_NODE_ROOT, shmem_comm); } } } else { if (nbytes <= mv2_bcast_short_msg) { mpi_errno = MPIR_Bcast_binomial_MV2(buffer, count, datatype, root, comm); } else { if (mv2_scatter_rd_inter_leader_bcast) { mpi_errno = MPIR_Bcast_scatter_ring_allgather_MV2(buffer, count, datatype, root, comm); } else { mpi_errno = MPIR_Bcast_scatter_doubling_allgather_MV2(buffer, count, datatype, root, comm); } } } return mpi_errno; }
int Coll_reduce_ompi::reduce(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, MPI_Op op, int root, MPI_Comm comm ) { int communicator_size=0; //int segsize = 0; size_t message_size, dsize; const double a1 = 0.6016 / 1024.0; /* [1/B] */ const double b1 = 1.3496; const double a2 = 0.0410 / 1024.0; /* [1/B] */ const double b2 = 9.7128; const double a3 = 0.0422 / 1024.0; /* [1/B] */ const double b3 = 1.1614; //const double a4 = 0.0033 / 1024.0; [1/B] //const double b4 = 1.6761; /* no limit on # of outstanding requests */ //const int max_requests = 0; communicator_size = comm->size(); /* need data size for decision function */ dsize=datatype->size(); message_size = dsize * count; /* needed for decision */ /** * If the operation is non commutative we currently have choice of linear * or in-order binary tree algorithm. */ if ((op != MPI_OP_NULL) && not op->is_commutative()) { if ((communicator_size < 12) && (message_size < 2048)) { return Coll_reduce_ompi_basic_linear::reduce(sendbuf, recvbuf, count, datatype, op, root, comm /*, module*/); } return Coll_reduce_ompi_in_order_binary::reduce(sendbuf, recvbuf, count, datatype, op, root, comm /*, module, 0, max_requests*/); } if ((communicator_size < 8) && (message_size < 512)){ /* Linear_0K */ return Coll_reduce_ompi_basic_linear::reduce (sendbuf, recvbuf, count, datatype, op, root, comm); } else if (((communicator_size < 8) && (message_size < 20480)) || (message_size < 2048) || (count <= 1)) { /* Binomial_0K */ //segsize = 0; return Coll_reduce_ompi_binomial::reduce(sendbuf, recvbuf, count, datatype, op, root, comm/*, module, segsize, max_requests*/); } else if (communicator_size > (a1 * message_size + b1)) { // Binomial_1K //segsize = 1024; return Coll_reduce_ompi_binomial::reduce(sendbuf, recvbuf, count, datatype, op, root, comm/*, module, segsize, max_requests*/); } else if (communicator_size > (a2 * message_size + b2)) { // Pipeline_1K //segsize = 1024; return Coll_reduce_ompi_pipeline::reduce 
(sendbuf, recvbuf, count, datatype, op, root, comm/*, module, segsize, max_requests*/); } else if (communicator_size > (a3 * message_size + b3)) { // Binary_32K //segsize = 32*1024; return Coll_reduce_ompi_binary::reduce( sendbuf, recvbuf, count, datatype, op, root, comm/*, module, segsize, max_requests*/); } // if (communicator_size > (a4 * message_size + b4)) { // Pipeline_32K // segsize = 32*1024; // } else { // Pipeline_64K // segsize = 64*1024; // } return Coll_reduce_ompi_pipeline::reduce (sendbuf, recvbuf, count, datatype, op, root, comm/*, module, segsize, max_requests*/); #if 0 /* for small messages use linear algorithm */ if (message_size <= 4096) { segsize = 0; fanout = communicator_size - 1; /* when linear implemented or taken from basic put here, right now using chain as a linear system */ /* it is implemented and I shouldn't be calling a chain with a fanout bigger than MAXTREEFANOUT from topo.h! */ return Coll_reduce_intra_basic_linear::reduce (sendbuf, recvbuf, count, datatype, op, root, comm, module); /* return Coll_reduce_intra_chain::reduce (sendbuf, recvbuf, count, datatype, op, root, comm, segsize, fanout); */ } if (message_size < 524288) { if (message_size <= 65536 ) { segsize = 32768; fanout = 8; } else { segsize = 1024; fanout = communicator_size/2; } /* later swap this for a binary tree */ /* fanout = 2; */ return Coll_reduce_intra_chain::reduce (sendbuf, recvbuf, count, datatype, op, root, comm, module, segsize, fanout, max_requests); } segsize = 1024; return Coll_reduce_intra_pipeline::reduce (sendbuf, recvbuf, count, datatype, op, root, comm, module, segsize, max_requests); #endif /* 0 */ }
int Coll_bcast_ompi::bcast(void *buff, int count, MPI_Datatype datatype, int root, MPI_Comm comm ) { /* Decision function based on MX results for messages up to 36MB and communicator sizes up to 64 nodes */ const size_t small_message_size = 2048; const size_t intermediate_message_size = 370728; const double a_p16 = 3.2118e-6; /* [1 / byte] */ const double b_p16 = 8.7936; const double a_p64 = 2.3679e-6; /* [1 / byte] */ const double b_p64 = 1.1787; const double a_p128 = 1.6134e-6; /* [1 / byte] */ const double b_p128 = 2.1102; int communicator_size; //int segsize = 0; size_t message_size, dsize; communicator_size = comm->size(); /* else we need data size for decision function */ dsize = datatype->size(); message_size = dsize * (unsigned long)count; /* needed for decision */ /* Handle messages of small and intermediate size, and single-element broadcasts */ if ((message_size < small_message_size) || (count <= 1)) { /* Binomial without segmentation */ return Coll_bcast_binomial_tree::bcast (buff, count, datatype, root, comm); } else if (message_size < intermediate_message_size) { // SplittedBinary with 1KB segments return Coll_bcast_ompi_split_bintree::bcast(buff, count, datatype, root, comm); } //Handle large message sizes else if (communicator_size < (a_p128 * message_size + b_p128)) { //Pipeline with 128KB segments //segsize = 1024 << 7; return Coll_bcast_ompi_pipeline::bcast (buff, count, datatype, root, comm); } else if (communicator_size < 13) { // Split Binary with 8KB segments return Coll_bcast_ompi_split_bintree::bcast(buff, count, datatype, root, comm); } else if (communicator_size < (a_p64 * message_size + b_p64)) { // Pipeline with 64KB segments //segsize = 1024 << 6; return Coll_bcast_ompi_pipeline::bcast (buff, count, datatype, root, comm); } else if (communicator_size < (a_p16 * message_size + b_p16)) { //Pipeline with 16KB segments //segsize = 1024 << 4; return Coll_bcast_ompi_pipeline::bcast (buff, count, datatype, root, comm); } /* Pipeline with 8KB 
segments */ //segsize = 1024 << 3; return Coll_bcast_flattree_pipeline::bcast (buff, count, datatype, root, comm /*segsize*/); #if 0 /* this is based on gige measurements */ if (communicator_size < 4) { return Coll_bcast_intra_basic_linear::bcast (buff, count, datatype, root, comm, module); } if (communicator_size == 4) { if (message_size < 524288) segsize = 0; else segsize = 16384; return Coll_bcast_intra_bintree::bcast (buff, count, datatype, root, comm, module, segsize); } if (communicator_size <= 8 && message_size < 4096) { return Coll_bcast_intra_basic_linear::bcast (buff, count, datatype, root, comm, module); } if (communicator_size > 8 && message_size >= 32768 && message_size < 524288) { segsize = 16384; return Coll_bcast_intra_bintree::bcast (buff, count, datatype, root, comm, module, segsize); } if (message_size >= 524288) { segsize = 16384; return Coll_bcast_intra_pipeline::bcast (buff, count, datatype, root, comm, module, segsize); } segsize = 0; /* once tested can swap this back in */ /* return Coll_bcast_intra_bmtree::bcast (buff, count, datatype, root, comm, segsize); */ return Coll_bcast_intra_bintree::bcast (buff, count, datatype, root, comm, module, segsize); #endif /* 0 */ }
int PMPI_Irecv(void *buf, int count, MPI_Datatype datatype, int src, int tag, MPI_Comm comm, MPI_Request * request) { int retval = 0; smpi_bench_end(); if (request == nullptr) { retval = MPI_ERR_ARG; } else if (comm == MPI_COMM_NULL) { retval = MPI_ERR_COMM; } else if (src == MPI_PROC_NULL) { *request = MPI_REQUEST_NULL; retval = MPI_SUCCESS; } else if (src!=MPI_ANY_SOURCE && (src >= comm->group()->size() || src <0)){ retval = MPI_ERR_RANK; } else if ((count < 0) || (buf==nullptr && count > 0)) { retval = MPI_ERR_COUNT; } else if (datatype==MPI_DATATYPE_NULL || not datatype->is_valid()) { retval = MPI_ERR_TYPE; } else if(tag<0 && tag != MPI_ANY_TAG){ retval = MPI_ERR_TAG; } else { int my_proc_id = simgrid::s4u::this_actor::get_pid(); TRACE_smpi_comm_in(my_proc_id, __func__, new simgrid::instr::Pt2PtTIData("irecv", src, datatype->is_replayable() ? count : count * datatype->size(), tag, simgrid::smpi::Datatype::encode(datatype))); *request = simgrid::smpi::Request::irecv(buf, count, datatype, src, tag, comm); retval = MPI_SUCCESS; TRACE_smpi_comm_out(my_proc_id); } smpi_bench_begin(); if (retval != MPI_SUCCESS && request != nullptr) *request = MPI_REQUEST_NULL; return retval; }
int PMPI_Compare_and_swap(const void* origin_addr, void* compare_addr, void* result_addr, MPI_Datatype datatype, int target_rank, MPI_Aint target_disp, MPI_Win win) { int retval = 0; smpi_bench_end(); if (win == MPI_WIN_NULL) { retval = MPI_ERR_WIN; } else if (target_rank == MPI_PROC_NULL) { retval = MPI_SUCCESS; } else if (target_rank <0){ retval = MPI_ERR_RANK; } else if (win->dynamic()==0 && target_disp <0){ //in case of dynamic window, target_disp can be mistakenly seen as negative, as it is an address retval = MPI_ERR_ARG; } else if (origin_addr==nullptr || result_addr==nullptr || compare_addr==nullptr){ retval = MPI_ERR_COUNT; } else if ((datatype == MPI_DATATYPE_NULL) || (not datatype->is_valid())) { retval = MPI_ERR_TYPE; } else { int my_proc_id = simgrid::s4u::this_actor::get_pid(); MPI_Group group; win->get_group(&group); TRACE_smpi_comm_in(my_proc_id, __func__, new simgrid::instr::Pt2PtTIData("Compare_and_swap", target_rank, datatype->is_replayable() ? 1 : datatype->size(), simgrid::smpi::Datatype::encode(datatype))); retval = win->compare_and_swap(origin_addr, compare_addr, result_addr, datatype, target_rank, target_disp); TRACE_smpi_comm_out(my_proc_id); } smpi_bench_begin(); return retval; }
int Coll_bcast_mvapich2::bcast(void *buffer, int count, MPI_Datatype datatype, int root, MPI_Comm comm) { int mpi_errno = MPI_SUCCESS; int comm_size/*, rank*/; int two_level_bcast = 1; long nbytes = 0; int range = 0; int range_threshold = 0; int range_threshold_intra = 0; // int is_homogeneous, is_contig; MPI_Aint type_size; //, position; // unsigned char *tmp_buf = NULL; MPI_Comm shmem_comm; //MPID_Datatype *dtp; if (count == 0) return MPI_SUCCESS; if(comm->get_leaders_comm()==MPI_COMM_NULL){ comm->init_smp(); } if (not mv2_bcast_thresholds_table) init_mv2_bcast_tables_stampede(); comm_size = comm->size(); //rank = comm->rank(); //is_contig=1; /* if (HANDLE_GET_KIND(datatype) == HANDLE_KIND_BUILTIN)*/ /* is_contig = 1;*/ /* else {*/ /* MPID_Datatype_get_ptr(datatype, dtp);*/ /* is_contig = dtp->is_contig;*/ /* }*/ // is_homogeneous = 1; /* MPI_Type_size() might not give the accurate size of the packed * datatype for heterogeneous systems (because of padding, encoding, * etc). On the other hand, MPI_Pack_size() can become very * expensive, depending on the implementation, especially for * heterogeneous systems. We want to use MPI_Type_size() wherever * possible, and MPI_Pack_size() in other places. 
*/ //if (is_homogeneous) { type_size=datatype->size(); /* } else { MPIR_Pack_size_impl(1, datatype, &type_size); }*/ nbytes = (count) * (type_size); /* Search for the corresponding system size inside the tuning table */ while ((range < (mv2_size_bcast_tuning_table - 1)) && (comm_size > mv2_bcast_thresholds_table[range].numproc)) { range++; } /* Search for corresponding inter-leader function */ while ((range_threshold < (mv2_bcast_thresholds_table[range].size_inter_table - 1)) && (nbytes > mv2_bcast_thresholds_table[range].inter_leader[range_threshold].max) && (mv2_bcast_thresholds_table[range].inter_leader[range_threshold].max != -1)) { range_threshold++; } /* Search for corresponding intra-node function */ while ((range_threshold_intra < (mv2_bcast_thresholds_table[range].size_intra_table - 1)) && (nbytes > mv2_bcast_thresholds_table[range].intra_node[range_threshold_intra].max) && (mv2_bcast_thresholds_table[range].intra_node[range_threshold_intra].max != -1)) { range_threshold_intra++; } MV2_Bcast_function = mv2_bcast_thresholds_table[range].inter_leader[range_threshold]. MV2_pt_Bcast_function; MV2_Bcast_intra_node_function = mv2_bcast_thresholds_table[range]. intra_node[range_threshold_intra].MV2_pt_Bcast_function; /* if (mv2_user_bcast_intra == NULL && */ /* MV2_Bcast_intra_node_function == &MPIR_Knomial_Bcast_intra_node_MV2) {*/ /* MV2_Bcast_intra_node_function = &MPIR_Shmem_Bcast_MV2;*/ /* }*/ if (mv2_bcast_thresholds_table[range].inter_leader[range_threshold]. zcpy_pipelined_knomial_factor != -1) { zcpy_knomial_factor = mv2_bcast_thresholds_table[range].inter_leader[range_threshold]. 
zcpy_pipelined_knomial_factor; } if (mv2_pipelined_zcpy_knomial_factor != -1) { zcpy_knomial_factor = mv2_pipelined_zcpy_knomial_factor; } if(MV2_Bcast_intra_node_function == NULL) { /* if tuning table do not have any intra selection, set func pointer to ** default one for mcast intra node */ MV2_Bcast_intra_node_function = &MPIR_Shmem_Bcast_MV2; } /* Set value of pipeline segment size */ bcast_segment_size = mv2_bcast_thresholds_table[range].bcast_segment_size; /* Set value of inter node knomial factor */ mv2_inter_node_knomial_factor = mv2_bcast_thresholds_table[range].inter_node_knomial_factor; /* Set value of intra node knomial factor */ mv2_intra_node_knomial_factor = mv2_bcast_thresholds_table[range].intra_node_knomial_factor; /* Check if we will use a two level algorithm or not */ two_level_bcast = #if defined(_MCST_SUPPORT_) mv2_bcast_thresholds_table[range].is_two_level_bcast[range_threshold] || comm->ch.is_mcast_ok; #else mv2_bcast_thresholds_table[range].is_two_level_bcast[range_threshold]; #endif if (two_level_bcast == 1) { // if (not is_contig || not is_homogeneous) { // tmp_buf = smpi_get_tmp_sendbuffer(nbytes); /* position = 0;*/ /* if (rank == root) {*/ /* mpi_errno =*/ /* MPIR_Pack_impl(buffer, count, datatype, tmp_buf, nbytes, &position);*/ /* if (mpi_errno)*/ /* MPIU_ERR_POP(mpi_errno);*/ /* }*/ // } #ifdef CHANNEL_MRAIL_GEN2 if ((mv2_enable_zcpy_bcast == 1) && (&MPIR_Pipelined_Bcast_Zcpy_MV2 == MV2_Bcast_function)) { // if (not is_contig || not is_homogeneous) { // mpi_errno = MPIR_Pipelined_Bcast_Zcpy_MV2(tmp_buf, nbytes, MPI_BYTE, root, comm); // } else { mpi_errno = MPIR_Pipelined_Bcast_Zcpy_MV2(buffer, count, datatype, root, comm); // } } else #endif /* defined(CHANNEL_MRAIL_GEN2) */ { shmem_comm = comm->get_intra_comm(); // if (not is_contig || not is_homogeneous) { // MPIR_Bcast_tune_inter_node_helper_MV2(tmp_buf, nbytes, MPI_BYTE, root, comm); // } else { MPIR_Bcast_tune_inter_node_helper_MV2(buffer, count, datatype, root, comm); // } /* 
We are now done with the inter-node phase */ root = INTRA_NODE_ROOT; // if (not is_contig || not is_homogeneous) { // mpi_errno = MV2_Bcast_intra_node_function(tmp_buf, nbytes, MPI_BYTE, root, shmem_comm); // } else { mpi_errno = MV2_Bcast_intra_node_function(buffer, count, datatype, root, shmem_comm); // } } /* if (not is_contig || not is_homogeneous) {*/ /* if (rank != root) {*/ /* position = 0;*/ /* mpi_errno = MPIR_Unpack_impl(tmp_buf, nbytes, &position, buffer,*/ /* count, datatype);*/ /* }*/ /* }*/ } else { /* We use Knomial for intra node */ MV2_Bcast_intra_node_function = &MPIR_Knomial_Bcast_intra_node_MV2; /* if (mv2_enable_shmem_bcast == 0) {*/ /* Fall back to non-tuned version */ /* MPIR_Bcast_intra_MV2(buffer, count, datatype, root, comm);*/ /* } else {*/ mpi_errno = MV2_Bcast_function(buffer, count, datatype, root, comm); /* }*/ } return mpi_errno; }
int Coll_allreduce_mvapich2::allreduce(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, MPI_Op op, MPI_Comm comm) { int mpi_errno = MPI_SUCCESS; //int rank = 0, int comm_size = 0; comm_size = comm->size(); //rank = comm->rank(); if (count == 0) { return MPI_SUCCESS; } if (mv2_allreduce_thresholds_table == NULL) init_mv2_allreduce_tables_stampede(); /* check if multiple threads are calling this collective function */ MPI_Aint sendtype_size = 0; long nbytes = 0; int is_commutative = 0; MPI_Aint true_lb, true_extent; sendtype_size=datatype->size(); nbytes = count * sendtype_size; datatype->extent(&true_lb, &true_extent); is_commutative = op->is_commutative(); { int range = 0, range_threshold = 0, range_threshold_intra = 0; int is_two_level = 0; /* Search for the corresponding system size inside the tuning table */ while ((range < (mv2_size_allreduce_tuning_table - 1)) && (comm_size > mv2_allreduce_thresholds_table[range].numproc)) { range++; } /* Search for corresponding inter-leader function */ /* skip mcast poiters if mcast is not available */ if(mv2_allreduce_thresholds_table[range].mcast_enabled != 1){ while ((range_threshold < (mv2_allreduce_thresholds_table[range].size_inter_table - 1)) && ((mv2_allreduce_thresholds_table[range]. inter_leader[range_threshold].MV2_pt_Allreducection == &MPIR_Allreduce_mcst_reduce_redscat_gather_MV2) || (mv2_allreduce_thresholds_table[range]. 
inter_leader[range_threshold].MV2_pt_Allreducection == &MPIR_Allreduce_mcst_reduce_two_level_helper_MV2) )) { range_threshold++; } } while ((range_threshold < (mv2_allreduce_thresholds_table[range].size_inter_table - 1)) && (nbytes > mv2_allreduce_thresholds_table[range].inter_leader[range_threshold].max) && (mv2_allreduce_thresholds_table[range].inter_leader[range_threshold].max != -1)) { range_threshold++; } if(mv2_allreduce_thresholds_table[range].is_two_level_allreduce[range_threshold] == 1){ is_two_level = 1; } /* Search for corresponding intra-node function */ while ((range_threshold_intra < (mv2_allreduce_thresholds_table[range].size_intra_table - 1)) && (nbytes > mv2_allreduce_thresholds_table[range].intra_node[range_threshold_intra].max) && (mv2_allreduce_thresholds_table[range].intra_node[range_threshold_intra].max != -1)) { range_threshold_intra++; } MV2_Allreducection = mv2_allreduce_thresholds_table[range].inter_leader[range_threshold] .MV2_pt_Allreducection; MV2_Allreduce_intra_function = mv2_allreduce_thresholds_table[range].intra_node[range_threshold_intra] .MV2_pt_Allreducection; /* check if mcast is ready, otherwise replace mcast with other algorithm */ if((MV2_Allreducection == &MPIR_Allreduce_mcst_reduce_redscat_gather_MV2)|| (MV2_Allreducection == &MPIR_Allreduce_mcst_reduce_two_level_helper_MV2)){ { MV2_Allreducection = &MPIR_Allreduce_pt2pt_rd_MV2; } if(is_two_level != 1) { MV2_Allreducection = &MPIR_Allreduce_pt2pt_rd_MV2; } } if(is_two_level == 1){ // check if shm is ready, if not use other algorithm first if (is_commutative) { if(comm->get_leaders_comm()==MPI_COMM_NULL){ comm->init_smp(); } mpi_errno = MPIR_Allreduce_two_level_MV2(sendbuf, recvbuf, count, datatype, op, comm); } else { mpi_errno = MPIR_Allreduce_pt2pt_rd_MV2(sendbuf, recvbuf, count, datatype, op, comm); } } else { mpi_errno = MV2_Allreducection(sendbuf, recvbuf, count, datatype, op, comm); } } //comm->ch.intra_node_done=0; return (mpi_errno); }
int PMPI_Recv(void *buf, int count, MPI_Datatype datatype, int src, int tag, MPI_Comm comm, MPI_Status * status) { int retval = 0; smpi_bench_end(); if (comm == MPI_COMM_NULL) { retval = MPI_ERR_COMM; } else if (src == MPI_PROC_NULL) { if(status != MPI_STATUS_IGNORE){ simgrid::smpi::Status::empty(status); status->MPI_SOURCE = MPI_PROC_NULL; } retval = MPI_SUCCESS; } else if (src!=MPI_ANY_SOURCE && (src >= comm->group()->size() || src <0)){ retval = MPI_ERR_RANK; } else if ((count < 0) || (buf==nullptr && count > 0)) { retval = MPI_ERR_COUNT; } else if (datatype==MPI_DATATYPE_NULL || not datatype->is_valid()) { retval = MPI_ERR_TYPE; } else if(tag<0 && tag != MPI_ANY_TAG){ retval = MPI_ERR_TAG; } else { int my_proc_id = simgrid::s4u::this_actor::get_pid(); TRACE_smpi_comm_in(my_proc_id, __func__, new simgrid::instr::Pt2PtTIData("recv", src, datatype->is_replayable() ? count : count * datatype->size(), tag, simgrid::smpi::Datatype::encode(datatype))); simgrid::smpi::Request::recv(buf, count, datatype, src, tag, comm, status); retval = MPI_SUCCESS; // the src may not have been known at the beginning of the recv (MPI_ANY_SOURCE) int src_traced=0; if (status != MPI_STATUS_IGNORE) src_traced = getPid(comm, status->MPI_SOURCE); else src_traced = getPid(comm, src); if (not TRACE_smpi_view_internals()) { TRACE_smpi_recv(src_traced, my_proc_id, tag); } TRACE_smpi_comm_out(my_proc_id); } smpi_bench_begin(); return retval; }
int PMPI_Get_accumulate(const void *origin_addr, int origin_count, MPI_Datatype origin_datatype, void *result_addr, int result_count, MPI_Datatype result_datatype, int target_rank, MPI_Aint target_disp, int target_count, MPI_Datatype target_datatype, MPI_Op op, MPI_Win win){ int retval = 0; smpi_bench_end(); if (win == MPI_WIN_NULL) { retval = MPI_ERR_WIN; } else if (target_rank == MPI_PROC_NULL) { retval = MPI_SUCCESS; } else if (target_rank <0){ retval = MPI_ERR_RANK; } else if (win->dynamic()==0 && target_disp <0){ //in case of dynamic window, target_disp can be mistakenly seen as negative, as it is an address retval = MPI_ERR_ARG; } else if ((origin_count < 0 || target_count < 0 || result_count <0) || (origin_addr==nullptr && origin_count > 0 && op != MPI_NO_OP) || (result_addr==nullptr && result_count > 0)){ retval = MPI_ERR_COUNT; } else if (((target_datatype == MPI_DATATYPE_NULL) || (result_datatype == MPI_DATATYPE_NULL)) || (((origin_datatype != MPI_DATATYPE_NULL) && (not origin_datatype->is_valid())) || (not target_datatype->is_valid()) || (not result_datatype->is_valid()))) { retval = MPI_ERR_TYPE; } else if (op == MPI_OP_NULL) { retval = MPI_ERR_OP; } else { int my_proc_id = simgrid::s4u::this_actor::get_pid(); MPI_Group group; win->get_group(&group); TRACE_smpi_comm_in(my_proc_id, __func__, new simgrid::instr::Pt2PtTIData( "Get_accumulate", target_rank, target_datatype->is_replayable() ? target_count : target_count * target_datatype->size(), simgrid::smpi::Datatype::encode(target_datatype))); retval = win->get_accumulate( origin_addr, origin_count, origin_datatype, result_addr, result_count, result_datatype, target_rank, target_disp, target_count, target_datatype, op); TRACE_smpi_comm_out(my_proc_id); } smpi_bench_begin(); return retval; }
int Coll_reduce_scatter_mvapich2::reduce_scatter(const void *sendbuf, void *recvbuf, const int *recvcnts, MPI_Datatype datatype, MPI_Op op, MPI_Comm comm) { int mpi_errno = MPI_SUCCESS; int i = 0, comm_size = comm->size(), total_count = 0, type_size = 0, nbytes = 0; int is_commutative = 0; int* disps = new int[comm_size]; if(mv2_red_scat_thresholds_table==NULL) init_mv2_reduce_scatter_tables_stampede(); is_commutative=(op==MPI_OP_NULL || op->is_commutative()); for (i = 0; i < comm_size; i++) { disps[i] = total_count; total_count += recvcnts[i]; } type_size=datatype->size(); nbytes = total_count * type_size; if (is_commutative) { int range = 0; int range_threshold = 0; /* Search for the corresponding system size inside the tuning table */ while ((range < (mv2_size_red_scat_tuning_table - 1)) && (comm_size > mv2_red_scat_thresholds_table[range].numproc)) { range++; } /* Search for corresponding inter-leader function */ while ((range_threshold < (mv2_red_scat_thresholds_table[range].size_inter_table - 1)) && (nbytes > mv2_red_scat_thresholds_table[range].inter_leader[range_threshold].max) && (mv2_red_scat_thresholds_table[range].inter_leader[range_threshold].max != -1)) { range_threshold++; } /* Set inter-leader pt */ MV2_Red_scat_function = mv2_red_scat_thresholds_table[range].inter_leader[range_threshold]. MV2_pt_Red_scat_function; mpi_errno = MV2_Red_scat_function(sendbuf, recvbuf, recvcnts, datatype, op, comm); } else { int is_block_regular = 1; for (i = 0; i < (comm_size - 1); ++i) { if (recvcnts[i] != recvcnts[i+1]) { is_block_regular = 0; break; } } int pof2 = 1; while (pof2 < comm_size) pof2 <<= 1; if (pof2 == comm_size && is_block_regular) { /* noncommutative, pof2 size, and block regular */ MPIR_Reduce_scatter_non_comm_MV2(sendbuf, recvbuf, recvcnts, datatype, op, comm); } mpi_errno = Coll_reduce_scatter_mpich_rdb::reduce_scatter(sendbuf, recvbuf, recvcnts, datatype, op, comm); } delete[] disps; return mpi_errno; }
int Coll_scatter_mvapich2_two_level_direct::scatter(void *sendbuf, int sendcnt, MPI_Datatype sendtype, void *recvbuf, int recvcnt, MPI_Datatype recvtype, int root, MPI_Comm comm) { int comm_size, rank; int local_rank, local_size; int leader_comm_rank = -1, leader_comm_size = -1; int mpi_errno = MPI_SUCCESS; int recvtype_size, sendtype_size, nbytes; void *tmp_buf = NULL; void *leader_scatter_buf = NULL; MPI_Status status; int leader_root, leader_of_root = -1; MPI_Comm shmem_comm, leader_comm; //if not set (use of the algo directly, without mvapich2 selector) if(MV2_Scatter_intra_function==NULL) MV2_Scatter_intra_function=Coll_scatter_mpich::scatter; if(comm->get_leaders_comm()==MPI_COMM_NULL){ comm->init_smp(); } comm_size = comm->size(); rank = comm->rank(); if (((rank == root) && (recvcnt == 0)) || ((rank != root) && (sendcnt == 0))) { return MPI_SUCCESS; } /* extract the rank,size information for the intra-node * communicator */ shmem_comm = comm->get_intra_comm(); local_rank = shmem_comm->rank(); local_size = shmem_comm->size(); if (local_rank == 0) { /* Node leader. Extract the rank, size information for the leader * communicator */ leader_comm = comm->get_leaders_comm(); leader_comm_size = leader_comm->size(); leader_comm_rank = leader_comm->rank(); } if (local_size == comm_size) { /* purely intra-node scatter. 
Just use the direct algorithm and we are done */ mpi_errno = MPIR_Scatter_MV2_Direct(sendbuf, sendcnt, sendtype, recvbuf, recvcnt, recvtype, root, comm); } else { recvtype_size=recvtype->size(); sendtype_size=sendtype->size(); if (rank == root) { nbytes = sendcnt * sendtype_size; } else { nbytes = recvcnt * recvtype_size; } if (local_rank == 0) { /* Node leader, allocate tmp_buffer */ tmp_buf = smpi_get_tmp_sendbuffer(nbytes * local_size); } leader_comm = comm->get_leaders_comm(); int* leaders_map = comm->get_leaders_map(); leader_of_root = comm->group()->rank(leaders_map[root]); leader_root = leader_comm->group()->rank(leaders_map[root]); /* leader_root is the rank of the leader of the root in leader_comm. * leader_root is to be used as the root of the inter-leader gather ops */ if ((local_rank == 0) && (root != rank) && (leader_of_root == rank)) { /* The root of the scatter operation is not the node leader. Recv * data from the node leader */ leader_scatter_buf = smpi_get_tmp_sendbuffer(nbytes * comm_size); Request::recv(leader_scatter_buf, nbytes * comm_size, MPI_BYTE, root, COLL_TAG_SCATTER, comm, &status); } if (rank == root && local_rank != 0) { /* The root of the scatter operation is not the node leader. 
Send * data to the node leader */ Request::send(sendbuf, sendcnt * comm_size, sendtype, leader_of_root, COLL_TAG_SCATTER, comm ); } if (leader_comm_size > 1 && local_rank == 0) { if (not comm->is_uniform()) { int* displs = NULL; int* sendcnts = NULL; int* node_sizes; int i = 0; node_sizes = comm->get_non_uniform_map(); if (root != leader_of_root) { if (leader_comm_rank == leader_root) { displs = static_cast<int*>(xbt_malloc(sizeof(int) * leader_comm_size)); sendcnts = static_cast<int*>(xbt_malloc(sizeof(int) * leader_comm_size)); sendcnts[0] = node_sizes[0] * nbytes; displs[0] = 0; for (i = 1; i < leader_comm_size; i++) { displs[i] = displs[i - 1] + node_sizes[i - 1] * nbytes; sendcnts[i] = node_sizes[i] * nbytes; } } Colls::scatterv(leader_scatter_buf, sendcnts, displs, MPI_BYTE, tmp_buf, nbytes * local_size, MPI_BYTE, leader_root, leader_comm); } else { if (leader_comm_rank == leader_root) { displs = static_cast<int*>(xbt_malloc(sizeof(int) * leader_comm_size)); sendcnts = static_cast<int*>(xbt_malloc(sizeof(int) * leader_comm_size)); sendcnts[0] = node_sizes[0] * sendcnt; displs[0] = 0; for (i = 1; i < leader_comm_size; i++) { displs[i] = displs[i - 1] + node_sizes[i - 1] * sendcnt; sendcnts[i] = node_sizes[i] * sendcnt; } } Colls::scatterv(sendbuf, sendcnts, displs, sendtype, tmp_buf, nbytes * local_size, MPI_BYTE, leader_root, leader_comm); } if (leader_comm_rank == leader_root) { xbt_free(displs); xbt_free(sendcnts); } } else { if (leader_of_root != root) { mpi_errno = MPIR_Scatter_MV2_Direct(leader_scatter_buf, nbytes * local_size, MPI_BYTE, tmp_buf, nbytes * local_size, MPI_BYTE, leader_root, leader_comm); } else { mpi_errno = MPIR_Scatter_MV2_Direct(sendbuf, sendcnt * local_size, sendtype, tmp_buf, nbytes * local_size, MPI_BYTE, leader_root, leader_comm); } } } /* The leaders are now done with the inter-leader part. 
Scatter the data within the nodes */ if (rank == root && recvbuf == MPI_IN_PLACE) { mpi_errno = MV2_Scatter_intra_function(tmp_buf, nbytes, MPI_BYTE, (void *)sendbuf, sendcnt, sendtype, 0, shmem_comm); } else { mpi_errno = MV2_Scatter_intra_function(tmp_buf, nbytes, MPI_BYTE, recvbuf, recvcnt, recvtype, 0, shmem_comm); } } /* check if multiple threads are calling this collective function */ if (comm_size != local_size && local_rank == 0) { smpi_free_tmp_buffer(tmp_buf); if (leader_of_root == rank && root != rank) { smpi_free_tmp_buffer(leader_scatter_buf); } } return (mpi_errno); }
int Coll_scatter_mvapich2::scatter(const void *sendbuf, int sendcnt, MPI_Datatype sendtype, void *recvbuf, int recvcnt, MPI_Datatype recvtype, int root, MPI_Comm comm) { int range = 0, range_threshold = 0, range_threshold_intra = 0; int mpi_errno = MPI_SUCCESS; // int mpi_errno_ret = MPI_SUCCESS; int rank, nbytes, comm_size; int partial_sub_ok = 0; int conf_index = 0; MPI_Comm shmem_comm; // MPID_Comm *shmem_commptr=NULL; if(mv2_scatter_thresholds_table==NULL) init_mv2_scatter_tables_stampede(); if(comm->get_leaders_comm()==MPI_COMM_NULL){ comm->init_smp(); } comm_size = comm->size(); rank = comm->rank(); if (rank == root) { int sendtype_size = sendtype->size(); nbytes = sendcnt * sendtype_size; } else { int recvtype_size = recvtype->size(); nbytes = recvcnt * recvtype_size; } // check if safe to use partial subscription mode if (comm->is_uniform()) { shmem_comm = comm->get_intra_comm(); if (mv2_scatter_table_ppn_conf[0] == -1) { // Indicating user defined tuning conf_index = 0; }else{ int local_size = shmem_comm->size(); int i = 0; do { if (local_size == mv2_scatter_table_ppn_conf[i]) { conf_index = i; partial_sub_ok = 1; break; } i++; } while(i < mv2_scatter_num_ppn_conf); } } if (partial_sub_ok != 1) { conf_index = 0; } /* Search for the corresponding system size inside the tuning table */ while ((range < (mv2_size_scatter_tuning_table[conf_index] - 1)) && (comm_size > mv2_scatter_thresholds_table[conf_index][range].numproc)) { range++; } /* Search for corresponding inter-leader function */ while ((range_threshold < (mv2_scatter_thresholds_table[conf_index][range].size_inter_table - 1)) && (nbytes > mv2_scatter_thresholds_table[conf_index][range].inter_leader[range_threshold].max) && (mv2_scatter_thresholds_table[conf_index][range].inter_leader[range_threshold].max != -1)) { range_threshold++; } /* Search for corresponding intra-node function */ while ((range_threshold_intra < (mv2_scatter_thresholds_table[conf_index][range].size_intra_table - 1)) && (nbytes > 
mv2_scatter_thresholds_table[conf_index][range].intra_node[range_threshold_intra].max) && (mv2_scatter_thresholds_table[conf_index][range].intra_node[range_threshold_intra].max != -1)) { range_threshold_intra++; } MV2_Scatter_function = mv2_scatter_thresholds_table[conf_index][range].inter_leader[range_threshold] .MV2_pt_Scatter_function; if(MV2_Scatter_function == &MPIR_Scatter_mcst_wrap_MV2) { #if defined(_MCST_SUPPORT_) if(comm->ch.is_mcast_ok == 1 && mv2_use_mcast_scatter == 1 && comm->ch.shmem_coll_ok == 1) { MV2_Scatter_function = &MPIR_Scatter_mcst_MV2; } else #endif /*#if defined(_MCST_SUPPORT_) */ { if(mv2_scatter_thresholds_table[conf_index][range].inter_leader[range_threshold + 1]. MV2_pt_Scatter_function != NULL) { MV2_Scatter_function = mv2_scatter_thresholds_table[conf_index][range].inter_leader[range_threshold + 1] .MV2_pt_Scatter_function; } else { /* Fallback! */ MV2_Scatter_function = &MPIR_Scatter_MV2_Binomial; } } } if( (MV2_Scatter_function == &MPIR_Scatter_MV2_two_level_Direct) || (MV2_Scatter_function == &MPIR_Scatter_MV2_two_level_Binomial)) { if( comm->is_blocked()) { MV2_Scatter_intra_function = mv2_scatter_thresholds_table[conf_index][range].intra_node[range_threshold_intra] .MV2_pt_Scatter_function; mpi_errno = MV2_Scatter_function(sendbuf, sendcnt, sendtype, recvbuf, recvcnt, recvtype, root, comm); } else { mpi_errno = MPIR_Scatter_MV2_Binomial(sendbuf, sendcnt, sendtype, recvbuf, recvcnt, recvtype, root, comm); } } else { mpi_errno = MV2_Scatter_function(sendbuf, sendcnt, sendtype, recvbuf, recvcnt, recvtype, root, comm); } return (mpi_errno); }