/*
 * Fixed-rule algorithm selection for MPI_Allgather (decision logic ported
 * from Open MPI's tuned collective component).
 *
 * Picks a concrete allgather implementation based on communicator size and
 * total message volume, then delegates to it.  The external interface and
 * the default (non-USE_MPICH2_DECISION) decision rules are unchanged.
 *
 * Parameters mirror MPI_Allgather: send buffer/count/type, receive
 * buffer/count/type, communicator.  Returns whatever the selected
 * implementation returns (MPI_SUCCESS on success).
 */
int smpi_coll_tuned_allgather_ompi(void *sbuf, int scount,
                                   MPI_Datatype sdtype,
                                   void* rbuf, int rcount,
                                   MPI_Datatype rdtype,
                                   MPI_Comm comm
                                   )
{
    int communicator_size, pow2_size;
    size_t dsize, total_dsize;

    communicator_size = smpi_comm_size(comm);

    /* Special case for 2 processes: a single pairwise exchange is optimal. */
    if (communicator_size == 2) {
        return smpi_coll_tuned_allgather_pair (sbuf, scount, sdtype,
                                               rbuf, rcount, rdtype,
                                               comm/*, module*/);
    }

    /* Determine complete data size: bytes gathered across ALL ranks
       (dsize is size_t, so the products are computed in size_t). */
    dsize = smpi_datatype_size(sdtype);
    total_dsize = dsize * scount * communicator_size;

    /* Smallest power of two >= communicator_size; equality below means
       the communicator size itself is a power of two. */
    for (pow2_size = 1; pow2_size < communicator_size; pow2_size <<= 1);

#if defined(USE_MPICH2_DECISION)
    /* Decision as in MPICH-2 presented in Thakur et.al. "Optimization of
       Collective Communication Operations in MPICH", International Journal
       of High Performance Computing Applications, Vol. 19, No. 1, 49-66
       (2005):
       - for power-of-two processes and small and medium size messages
         (up to 512KB) use recursive doubling,
       - for non-power-of-two processes and small messages (80KB) use bruck,
       - for everything else use ring.
       NOTE(review): this branch was previously dead code — it sat after the
       default decision block, whose every path returns.  It is now selected
       by the preprocessor instead, which is the evident intent of the
       macro.  Default builds (macro undefined) are unaffected. */
    if ((pow2_size == communicator_size) && (total_dsize < 524288)) {
        return smpi_coll_tuned_allgather_rdb(sbuf, scount, sdtype,
                                             rbuf, rcount, rdtype,
                                             comm);
    } else if (total_dsize <= 81920) {
        return smpi_coll_tuned_allgather_bruck(sbuf, scount, sdtype,
                                               rbuf, rcount, rdtype,
                                               comm);
    }
    return smpi_coll_tuned_allgather_ring(sbuf, scount, sdtype,
                                          rbuf, rcount, rdtype,
                                          comm);
#else
    /* Decision based on MX 2Gb results from Grig cluster at
       The University of Tennessee, Knoxville:
       - if total message size is less than 50KB use either bruck or
         recursive doubling for non-power of two and power of two nodes,
         respectively,
       - else use ring and neighbor exchange algorithms for odd and even
         number of nodes, respectively. */
    if (total_dsize < 50000) {
        if (pow2_size == communicator_size) {
            return smpi_coll_tuned_allgather_rdb(sbuf, scount, sdtype,
                                                 rbuf, rcount, rdtype,
                                                 comm);
        } else {
            return smpi_coll_tuned_allgather_bruck(sbuf, scount, sdtype,
                                                   rbuf, rcount, rdtype,
                                                   comm);
        }
    } else {
        if (communicator_size % 2) {
            return smpi_coll_tuned_allgather_ring(sbuf, scount, sdtype,
                                                  rbuf, rcount, rdtype,
                                                  comm);
        } else {
            return smpi_coll_tuned_allgather_ompi_neighborexchange(sbuf, scount, sdtype,
                                                                   rbuf, rcount, rdtype,
                                                                   comm);
        }
    }
#endif  /* defined(USE_MPICH2_DECISION) */
}
/*
 * Neighbor-exchange allgather (ported from Open MPI's tuned collectives).
 *
 * Requires an EVEN number of processes; odd sizes are delegated to the
 * ring algorithm (see the guard below).  Each rank exchanges growing
 * chunks of the receive buffer with its two immediate neighbors on a
 * logical ring: one block on the first step, two blocks on each of the
 * remaining (size/2 - 1) steps, alternating neighbors by step parity.
 *
 * Parameters mirror MPI_Allgather.  Returns MPI_SUCCESS, or the error
 * code from a failed datatype-extent query (via the err_hndl path).
 */
int smpi_coll_tuned_allgather_ompi_neighborexchange(void *sbuf, int scount,
                                                    MPI_Datatype sdtype,
                                                    void* rbuf, int rcount,
                                                    MPI_Datatype rdtype,
                                                    MPI_Comm comm
)
{
   int line = -1;                /* __LINE__ of the failing call, for the debug trace */
   int rank, size;
   int neighbor[2], offset_at_step[2], recv_data_from[2], send_data_from;
   int i, even_rank;
   int err = 0;
   ptrdiff_t slb, rlb, sext, rext;
   char *tmpsend = NULL, *tmprecv = NULL;

   size = smpi_comm_size(comm);
   rank = smpi_comm_rank(comm);

   /* The pairing scheme below only works for an even process count;
      fall back to the ring algorithm otherwise. */
   if (size % 2) {
      XBT_DEBUG(
                   "coll:tuned:allgather_intra_neighborexchange WARNING: odd size %d, switching to ring algorithm",
                   size);
      return smpi_coll_tuned_allgather_ring(sbuf, scount, sdtype,
                                                  rbuf, rcount, rdtype,
                                                  comm);
   }

   XBT_DEBUG(
                "coll:tuned:allgather_intra_neighborexchange rank %d", rank);

   err = smpi_datatype_extent (sdtype, &slb, &sext);
   if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }

   err = smpi_datatype_extent (rdtype, &rlb, &rext);
   if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }

   /* Initialization step:
      - if send buffer is not MPI_IN_PLACE, copy send buffer to the
        rank-th block of the receive buffer (with MPI_IN_PLACE the
        caller's data is assumed to already be in that block). */
   tmprecv = (char*) rbuf + rank * rcount * rext;
   if (MPI_IN_PLACE != sbuf) {
      tmpsend = (char*) sbuf;
      smpi_datatype_copy (tmpsend, scount, sdtype, tmprecv, rcount, rdtype);
   }

   /* Determine neighbors, the order in which blocks will arrive, and the
      per-parity offsets.  Even ranks talk "up" first (rank+1), odd ranks
      talk "down" first (rank-1); recv_data_from[] tracks, per step
      parity, which rank's block was received most recently, and
      offset_at_step[] moves that index by +/-2 each step. */
   even_rank = !(rank % 2);
   if (even_rank) {
      neighbor[0] = (rank + 1) % size;
      neighbor[1] = (rank - 1 + size) % size;
      recv_data_from[0] = rank;
      recv_data_from[1] = rank;
      offset_at_step[0] = (+2);
      offset_at_step[1] = (-2);
   } else {
      neighbor[0] = (rank - 1 + size) % size;
      neighbor[1] = (rank + 1) % size;
      recv_data_from[0] = neighbor[0];
      recv_data_from[1] = neighbor[0];
      offset_at_step[0] = (-2);
      offset_at_step[1] = (+2);
   }

   /* Communication loop:
      - First step is special: exchange a single block with neighbor[0].
      - Rest of the steps: update recv_data_from according to offset, and
        exchange two blocks with the appropriate neighbor;
        the send location becomes the previous receive location. */
   tmprecv = (char*)rbuf + neighbor[0] * rcount * rext;
   tmpsend = (char*)rbuf + rank * rcount * rext;
   /* Sendreceive: first step, one block each way with neighbor[0]. */
   smpi_mpi_sendrecv(tmpsend, rcount, rdtype, neighbor[0],
                     COLL_TAG_ALLGATHER,
                     tmprecv, rcount, rdtype, neighbor[0],
                     COLL_TAG_ALLGATHER,
                     comm, MPI_STATUS_IGNORE);

   /* Determine initial sending location: even ranks forward their own
      block next; odd ranks forward the block just received. */
   if (even_rank) {
      send_data_from = rank;
   } else {
      send_data_from = recv_data_from[0];
   }

   for (i = 1; i < (size / 2); i++) {
      const int i_parity = i % 2;   /* selects which neighbor/offset pair this step uses */
      recv_data_from[i_parity] =
         (recv_data_from[i_parity] + offset_at_step[i_parity] + size) % size;

      tmprecv = (char*)rbuf + recv_data_from[i_parity] * rcount * rext;
      tmpsend = (char*)rbuf + send_data_from * rcount * rext;

      /* Sendreceive: steady-state step, two consecutive blocks each way.
         NOTE(review): this addresses blocks [recv_data_from] and
         [recv_data_from+1] as one contiguous 2*rcount region — relies on
         the +/-2 stepping always landing on an even block index. */
      smpi_mpi_sendrecv(tmpsend, 2 * rcount, rdtype,
                        neighbor[i_parity],
                        COLL_TAG_ALLGATHER,
                        tmprecv, 2 * rcount, rdtype,
                        neighbor[i_parity],
                        COLL_TAG_ALLGATHER,
                        comm, MPI_STATUS_IGNORE);

      /* The pair of blocks just received is what we forward next step. */
      send_data_from = recv_data_from[i_parity];
   }

   return MPI_SUCCESS;

 err_hndl:
   XBT_DEBUG( "%s:%4d\tError occurred %d, rank %2d",
                __FILE__, line, err, rank);
   return err;
}