int smpi_coll_tuned_reduce_ompi_binomial( void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, MPI_Op op, int root, MPI_Comm comm) { uint32_t segsize=0; int segcount = count; size_t typelng; const double a1 = 0.6016 / 1024.0; /* [1/B] */ const double b1 = 1.3496; // COLL_TUNED_UPDATE_IN_ORDER_BMTREE( comm, tuned_module, root ); /** * Determine number of segments and number of elements * sent per operation */ typelng= smpi_datatype_size( datatype); int communicator_size = smpi_comm_size(comm); size_t message_size = typelng * count; if (((communicator_size < 8) && (message_size < 20480)) || (message_size < 2048) || (count <= 1)) { /* Binomial_0K */ segsize = 0; } else if (communicator_size > (a1 * message_size + b1)) { // Binomial_1K segsize = 1024; } XBT_DEBUG("coll:tuned:reduce_intra_binomial rank %d ss %5d", smpi_comm_rank(comm), segsize); COLL_TUNED_COMPUTED_SEGCOUNT( segsize, typelng, segcount ); return smpi_coll_tuned_ompi_reduce_generic( sendbuf, recvbuf, count, datatype, op, root, comm, ompi_coll_tuned_topo_build_in_order_bmtree(comm, root), segcount, 0); }
int smpi_coll_tuned_scatter_ompi_binomial(void *sbuf, int scount, MPI_Datatype sdtype, void *rbuf, int rcount, MPI_Datatype rdtype, int root, MPI_Comm comm ) { int line = -1; int i; int rank; int vrank; int size; int total_send = 0; char *ptmp = NULL; char *tempbuf = NULL; int err; ompi_coll_tree_t* bmtree; MPI_Status status; MPI_Aint sextent, slb, strue_lb, strue_extent; MPI_Aint rextent, rlb, rtrue_lb, rtrue_extent; size = smpi_comm_size(comm); rank = smpi_comm_rank(comm); XBT_DEBUG( "smpi_coll_tuned_scatter_ompi_binomial rank %d", rank); /* create the binomial tree */ // COLL_TUNED_UPDATE_IN_ORDER_BMTREE( comm, tuned_module, root ); bmtree = ompi_coll_tuned_topo_build_in_order_bmtree( comm, root);//ompi_ data->cached_in_order_bmtree; smpi_datatype_extent(sdtype, &slb, &sextent); smpi_datatype_extent(sdtype, &strue_lb, &strue_extent); smpi_datatype_extent(rdtype, &rlb, &rextent); smpi_datatype_extent(rdtype, &rtrue_lb, &rtrue_extent); vrank = (rank - root + size) % size; if (rank == root) { if (0 == root) { /* root on 0, just use the send buffer */ ptmp = (char *) sbuf; if (rbuf != MPI_IN_PLACE) { /* local copy to rbuf */ err = smpi_datatype_copy(sbuf, scount, sdtype, rbuf, rcount, rdtype); if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } } } else { /* root is not on 0, allocate temp buffer for send */ tempbuf = (char *) malloc(strue_extent + (scount*size - 1) * sextent); if (NULL == tempbuf) { err = MPI_ERR_OTHER; line = __LINE__; goto err_hndl; } ptmp = tempbuf - slb; /* and rotate data so they will eventually in the right place */ err = smpi_datatype_copy((char *) sbuf + sextent*root*scount, scount*(size-root), sdtype, ptmp, scount*(size-root), sdtype); if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } err = smpi_datatype_copy((char*)sbuf, scount*root, sdtype, ptmp + sextent*scount*(size - root), scount*root, sdtype); if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } if (rbuf != MPI_IN_PLACE) { /* local copy to rbuf */ err = smpi_datatype_copy(ptmp, scount, sdtype, rbuf, rcount, rdtype); if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } } } total_send = scount; } else if (!(vrank % 2)) { /* non-root, non-leaf nodes, allocte temp buffer for recv * the most we need is rcount*size/2 */ tempbuf = (char *) malloc(rtrue_extent + (rcount*size - 1) * rextent); if (NULL == tempbuf) { err= MPI_ERR_OTHER; line = __LINE__; goto err_hndl; } ptmp = tempbuf - rlb; sdtype = rdtype; scount = rcount; sextent = rextent; total_send = scount; } else { /* leaf nodes, just use rbuf */ ptmp = (char *) rbuf; } if (!(vrank % 2)) { if (rank != root) { /* recv from parent on non-root */ smpi_mpi_recv(ptmp, rcount*size, rdtype, bmtree->tree_prev, COLL_TAG_SCATTER, comm, &status); /* local copy to rbuf */ err = smpi_datatype_copy(ptmp, scount, sdtype, rbuf, rcount, rdtype); } /* send to children on all non-leaf */ for (i = 0; i < bmtree->tree_nextsize; i++) { int mycount = 0, vkid; /* figure out how much data I have to send to this child */ vkid = (bmtree->tree_next[i] - root + size) % size; mycount = vkid - vrank; if (mycount > (size - vkid)) mycount = size - vkid; mycount *= scount; smpi_mpi_send(ptmp + total_send*sextent, mycount, sdtype, bmtree->tree_next[i], COLL_TAG_SCATTER, comm); total_send += mycount; } if (NULL != tempbuf) free(tempbuf); } else { /* recv from parent on leaf nodes */ smpi_mpi_recv(ptmp, rcount, rdtype, bmtree->tree_prev, COLL_TAG_SCATTER, comm, &status); } //!FIXME : store the tree, as done in ompi, instead of calculating it each time ? xbt_free(bmtree); return MPI_SUCCESS; err_hndl: if (NULL != tempbuf) free(tempbuf); XBT_DEBUG( "%s:%4d\tError occurred %d, rank %2d", __FILE__, line, err, rank); return err; }
int Coll_gather_ompi_binomial::gather(void* sbuf, int scount, MPI_Datatype sdtype, void* rbuf, int rcount, MPI_Datatype rdtype, int root, MPI_Comm comm) { int line = -1; int i; int rank; int vrank; int size; int total_recv = 0; char *ptmp = NULL; char *tempbuf = NULL; int err; ompi_coll_tree_t* bmtree; MPI_Status status; MPI_Aint sextent, slb, strue_lb, strue_extent; MPI_Aint rextent, rlb, rtrue_lb, rtrue_extent; size = comm->size(); rank = comm->rank(); XBT_DEBUG("smpi_coll_tuned_gather_ompi_binomial rank %d", rank); /* create the binomial tree */ // COLL_TUNED_UPDATE_IN_ORDER_BMTREE( comm, tuned_module, root ); bmtree = ompi_coll_tuned_topo_build_in_order_bmtree(comm, root); // data->cached_in_order_bmtree; sdtype->extent(&slb, &sextent); sdtype->extent(&strue_lb, &strue_extent); vrank = (rank - root + size) % size; if (rank == root) { rdtype->extent(&rlb, &rextent); rdtype->extent(&rtrue_lb, &rtrue_extent); if (0 == root) { /* root on 0, just use the recv buffer */ ptmp = (char*)rbuf; if (sbuf != MPI_IN_PLACE) { err = Datatype::copy(sbuf, scount, sdtype, ptmp, rcount, rdtype); if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } } } else { /* root is not on 0, allocate temp buffer for recv, * rotate data at the end */ tempbuf = (char*)smpi_get_tmp_recvbuffer(rtrue_extent + (rcount * size - 1) * rextent); if (NULL == tempbuf) { err = MPI_ERR_OTHER; line = __LINE__; goto err_hndl; } ptmp = tempbuf - rlb; if (sbuf != MPI_IN_PLACE) { /* copy from sbuf to temp buffer */ err = Datatype::copy(sbuf, scount, sdtype, ptmp, rcount, rdtype); if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } } else { /* copy from rbuf to temp buffer */ err = Datatype::copy((char*)rbuf + rank * rextent * rcount, rcount, rdtype, ptmp, rcount, rdtype); if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } } } total_recv = rcount; } else if (!(vrank % 2)) { /* other non-leaf nodes, allocate temp buffer for data received from * children, the most we need is half of the total data elements due * to the property of binimoal tree */ tempbuf = (char*)smpi_get_tmp_sendbuffer(strue_extent + (scount * size - 1) * sextent); if (NULL == tempbuf) { err = MPI_ERR_OTHER; line = __LINE__; goto err_hndl; } ptmp = tempbuf - slb; /* local copy to tempbuf */ err = Datatype::copy(sbuf, scount, sdtype, ptmp, scount, sdtype); if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } /* use sdtype,scount as rdtype,rdcount since they are ignored on * non-root procs */ rdtype = sdtype; rcount = scount; rextent = sextent; total_recv = rcount; } else { /* leaf nodes, no temp buffer needed, use sdtype,scount as * rdtype,rdcount since they are ignored on non-root procs */ ptmp = (char*)sbuf; total_recv = scount; } if (!(vrank % 2)) { /* all non-leaf nodes recv from children */ for (i = 0; i < bmtree->tree_nextsize; i++) { int mycount = 0, vkid; /* figure out how much data I have to send to this child */ vkid = (bmtree->tree_next[i] - root + size) % size; mycount = vkid - vrank; if (mycount > (size - vkid)) mycount = size - vkid; mycount *= rcount; XBT_DEBUG("smpi_coll_tuned_gather_ompi_binomial rank %d recv %d mycount = %d", rank, bmtree->tree_next[i], mycount); Request::recv(ptmp + total_recv * rextent, mycount, rdtype, bmtree->tree_next[i], COLL_TAG_GATHER, comm, &status); total_recv += mycount; } } if (rank != root) { /* all nodes except root send to parents */ XBT_DEBUG("smpi_coll_tuned_gather_ompi_binomial rank %d send %d count %d\n", rank, bmtree->tree_prev, total_recv); Request::send(ptmp, total_recv, sdtype, bmtree->tree_prev, COLL_TAG_GATHER, comm); } if (rank == root) { if (root != 0) { /* rotate received data on root if root != 0 */ err = Datatype::copy(ptmp, rcount * (size - root), rdtype, (char*)rbuf + rextent * root * rcount, rcount * (size - root), rdtype); if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } err = Datatype::copy(ptmp + rextent * rcount * (size - root), rcount * root, rdtype, (char*)rbuf, rcount * root, rdtype); if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } smpi_free_tmp_buffer(tempbuf); } } else if (!(vrank % 2)) { /* other non-leaf nodes */ smpi_free_tmp_buffer(tempbuf); } ompi_coll_tuned_topo_destroy_tree(&bmtree); return MPI_SUCCESS; err_hndl: if (NULL != tempbuf) smpi_free_tmp_buffer(tempbuf); XBT_DEBUG("%s:%4d\tError occurred %d, rank %2d", __FILE__, line, err, rank); return err; }