int smpi_coll_tuned_allreduce_ompi(void *sbuf, void *rbuf, int count,
                                   MPI_Datatype dtype, MPI_Op op, MPI_Comm comm)
{
    size_t dsize, block_dsize;
    int comm_size = smpi_comm_size(comm);
    const size_t intermediate_message = 10000;

    /**
     * Decision function based on MX results from the Grig cluster at UTK.
     *
     * Currently, the linear, recursive doubling, and nonoverlapping algorithms
     * can handle both commutative and non-commutative operations.
     * The ring algorithm does not support non-commutative operations.
     */
    dsize = smpi_datatype_size(dtype);
    block_dsize = dsize * count;

    if (block_dsize < intermediate_message) {
        return (smpi_coll_tuned_allreduce_rdb(sbuf, rbuf, count, dtype, op, comm));
    }

    if (smpi_op_is_commute(op) && (count > comm_size)) {
        const size_t segment_size = 1 << 20; /* 1 MB */
        if (comm_size * segment_size >= block_dsize) {
            // FIXME: ok, these are not the right algorithms, try to find closer ones
            // lr is a good match for allreduce_ring (difference is mainly the use of sendrecv)
            return smpi_coll_tuned_allreduce_lr(sbuf, rbuf, count, dtype, op, comm);
        } else {
            return (smpi_coll_tuned_allreduce_ompi_ring_segmented(sbuf, rbuf, count,
                                                                  dtype, op, comm /*segment_size*/));
        }
    }

    return (smpi_coll_tuned_allreduce_redbcast(sbuf, rbuf, count, dtype, op, comm));
}
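/*
 * The ring-segmented routine below uses two helper macros that are not
 * defined in this file. The sketch here is for reference only, assuming the
 * definitions match the upstream Open MPI "tuned" collectives from which
 * this code is derived; the authoritative definitions live in the colls
 * headers.
 */
#if 0 /* reference sketch, not compiled */
/* Derive a per-segment element count from a target segment size in bytes,
   rounding to the nearest whole element. */
#define COLL_TUNED_COMPUTED_SEGCOUNT(SEGSIZE, TYPELNG, SEGCOUNT)        \
    if (((SEGSIZE) >= (TYPELNG)) &&                                     \
        ((SEGSIZE) < ((TYPELNG) * (SEGCOUNT)))) {                       \
        size_t residual;                                                \
        (SEGCOUNT) = (int)((SEGSIZE) / (TYPELNG));                      \
        residual = (SEGSIZE) - (SEGCOUNT) * (TYPELNG);                  \
        if (residual > ((TYPELNG) >> 1))                                \
            (SEGCOUNT)++;                                               \
    }

/* Split COUNT elements into NUM_BLOCKS blocks: the first SPLIT_INDEX
   ("early") blocks get one extra element when COUNT is not evenly
   divisible. */
#define COLL_TUNED_COMPUTE_BLOCKCOUNT(COUNT, NUM_BLOCKS, SPLIT_INDEX,   \
                                      EARLY_BLOCK_COUNT, LATE_BLOCK_COUNT) \
    EARLY_BLOCK_COUNT = LATE_BLOCK_COUNT = COUNT / NUM_BLOCKS;          \
    SPLIT_INDEX = COUNT % NUM_BLOCKS;                                   \
    if (0 != SPLIT_INDEX) {                                             \
        EARLY_BLOCK_COUNT = EARLY_BLOCK_COUNT + 1;                      \
    }
#endif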
int smpi_coll_tuned_allreduce_ompi_ring_segmented(void *sbuf, void *rbuf, int count,
                                                  MPI_Datatype dtype, MPI_Op op, MPI_Comm comm)
{
    int ret = MPI_SUCCESS;
    int line;
    int k, recv_from, send_to;
    int early_blockcount, late_blockcount, split_rank;
    int segcount, max_segcount;
    int num_phases, phase;
    int block_count;
    unsigned int inbi;
    size_t typelng;
    char *tmpsend = NULL, *tmprecv = NULL;
    char *inbuf[2] = {NULL, NULL};
    ptrdiff_t true_extent, extent;
    ptrdiff_t block_offset, max_real_segsize;
    MPI_Request reqs[2] = {NULL, NULL};
    const size_t segsize = 1 << 20; /* 1 MB */
    unsigned int size = smpi_comm_size(comm);
    unsigned int rank = smpi_comm_rank(comm);

    XBT_DEBUG("coll:tuned:allreduce_intra_ring_segmented rank %d, count %d", rank, count);

    /* Special case for size == 1 */
    if (1 == size) {
        if (MPI_IN_PLACE != sbuf) {
            ret = smpi_datatype_copy(sbuf, count, dtype, rbuf, count, dtype);
            if (ret < 0) { line = __LINE__; goto error_hndl; }
        }
        return MPI_SUCCESS;
    }

    /* Determine segment count based on the suggested segment size.
       Note: the SMPI helpers used here (smpi_datatype_get_extent,
       smpi_datatype_size, smpi_mpi_irecv) do not report errors through ret,
       so the "if (MPI_SUCCESS != ret)" checks inherited from the Open MPI
       original were dead code and are dropped. The true extent is
       approximated by the extent, as SMPI exposes no separate getter. */
    extent = smpi_datatype_get_extent(dtype);
    true_extent = smpi_datatype_get_extent(dtype);
    typelng = smpi_datatype_size(dtype);
    segcount = count;
    COLL_TUNED_COMPUTED_SEGCOUNT(segsize, typelng, segcount)

    /* Special case for count less than size * segcount - use regular ring */
    if (count < size * segcount) {
        XBT_DEBUG("coll:tuned:allreduce_ring_segmented rank %d/%d, count %d, switching to regular ring",
                  rank, size, count);
        return (smpi_coll_tuned_allreduce_lr(sbuf, rbuf, count, dtype, op, comm));
    }

    /* Determine the number of phases of the algorithm */
    num_phases = count / (size * segcount);
    if ((count % (size * segcount) >= size) &&
        (count % (size * segcount) > ((size * segcount) / 2))) {
        num_phases++;
    }

    /* Determine the number of elements per block and the corresponding
       block sizes. The blocks are divided into "early" and "late" ones:
       blocks 0 .. (split_rank - 1) are "early" and blocks
       (split_rank) .. (size - 1) are "late". Early blocks are at most one
       element larger than the late ones. Note that these blocks will be
       split into num_phases segments, out of which the largest one will
       have max_segcount elements. */
    COLL_TUNED_COMPUTE_BLOCKCOUNT(count, size, split_rank,
                                  early_blockcount, late_blockcount)
    COLL_TUNED_COMPUTE_BLOCKCOUNT(early_blockcount, num_phases, inbi,
                                  max_segcount, k)
    max_real_segsize = true_extent + (max_segcount - 1) * extent;

    /* Allocate and initialize temporary buffers */
    inbuf[0] = (char*) smpi_get_tmp_sendbuffer(max_real_segsize);
    if (NULL == inbuf[0]) { ret = -1; line = __LINE__; goto error_hndl; }
    if (size > 2) {
        inbuf[1] = (char*) smpi_get_tmp_recvbuffer(max_real_segsize);
        if (NULL == inbuf[1]) { ret = -1; line = __LINE__; goto error_hndl; }
    }

    /* Handle MPI_IN_PLACE */
    if (MPI_IN_PLACE != sbuf) {
        ret = smpi_datatype_copy(sbuf, count, dtype, rbuf, count, dtype);
        if (ret < 0) { line = __LINE__; goto error_hndl; }
    }
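    /* Worked example of the partitioning above (illustrative numbers, not
       from the original source): with size = 4 ranks and count = 4,194,304
       doubles (typelng = 8), the 1 MB segsize gives segcount = 131,072
       elements, so num_phases = count / (size * segcount) = 8. Each rank
       owns one block of early_blockcount = late_blockcount = 1,048,576
       elements, reduced in 8 pipelined segments of 131,072 elements each. */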
    /* Computation loop: for each phase, repeat the ring allreduce computation loop */
    for (phase = 0; phase < num_phases; phase++) {
        ptrdiff_t phase_offset;
        int early_phase_segcount, late_phase_segcount, split_phase, phase_count;

        /* For each of the remote nodes:
           - post irecv for block (r-1)
           - send block (r)
             To do this, first compute block offset and count, and use block
             offset to compute phase offset.
           - in loop for every step k = 2 .. n
             - post irecv for block (r + n - k) % n
             - wait on block (r + n - k + 1) % n to arrive
             - compute on block (r + n - k + 1) % n
             - send block (r + n - k + 1) % n
           - wait on block (r + 1)
           - compute on block (r + 1)
           - send block (r + 1) to rank (r + 1)
           Note that we must be careful when computing the beginning of the
           buffers, and that for send operations and computation we must use
           the exact block size. */
        send_to = (rank + 1) % size;
        recv_from = (rank + size - 1) % size;

        inbi = 0;
        /* Initialize first receive from the neighbor on the left */
        reqs[inbi] = smpi_mpi_irecv(inbuf[inbi], max_segcount, dtype, recv_from, 666, comm);
        /* Send first block (my block) to the neighbor on the right:
           - compute my block and phase offset
           - send data */
        block_offset = ((rank < split_rank) ? (rank * early_blockcount) :
                        (rank * late_blockcount + split_rank));
        block_count = ((rank < split_rank) ? early_blockcount : late_blockcount);
        COLL_TUNED_COMPUTE_BLOCKCOUNT(block_count, num_phases, split_phase,
                                      early_phase_segcount, late_phase_segcount)
        phase_count = ((phase < split_phase) ? early_phase_segcount : late_phase_segcount);
        phase_offset = ((phase < split_phase) ? (phase * early_phase_segcount) :
                        (phase * late_phase_segcount + split_phase));
        tmpsend = ((char*) rbuf) + (block_offset + phase_offset) * extent;
        smpi_mpi_send(tmpsend, phase_count, dtype, send_to, 666, comm);

        for (k = 2; k < size; k++) {
            const int prevblock = (rank + size - k + 1) % size;

            inbi = inbi ^ 0x1;

            /* Post irecv for the current block */
            reqs[inbi] = smpi_mpi_irecv(inbuf[inbi], max_segcount, dtype, recv_from, 666, comm);

            /* Wait on previous block to arrive */
            smpi_mpi_wait(&reqs[inbi ^ 0x1], MPI_STATUS_IGNORE);

            /* Apply operation on previous block: result goes to rbuf
               rbuf[prevblock] = inbuf[inbi ^ 0x1] (op) rbuf[prevblock] */
            block_offset = ((prevblock < split_rank) ? (prevblock * early_blockcount) :
                            (prevblock * late_blockcount + split_rank));
            block_count = ((prevblock < split_rank) ? early_blockcount : late_blockcount);
            COLL_TUNED_COMPUTE_BLOCKCOUNT(block_count, num_phases, split_phase,
                                          early_phase_segcount, late_phase_segcount)
            phase_count = ((phase < split_phase) ? early_phase_segcount : late_phase_segcount);
            phase_offset = ((phase < split_phase) ? (phase * early_phase_segcount) :
                            (phase * late_phase_segcount + split_phase));
            tmprecv = ((char*) rbuf) + (block_offset + phase_offset) * extent;
            smpi_op_apply(op, inbuf[inbi ^ 0x1], tmprecv, &phase_count, &dtype);

            /* send previous block to send_to */
            smpi_mpi_send(tmprecv, phase_count, dtype, send_to, 666, comm);
        }

        /* Wait on the last block to arrive */
        smpi_mpi_wait(&reqs[inbi], MPI_STATUS_IGNORE);

        /* Apply operation on the last block (from neighbor (rank + 1)):
           rbuf[rank + 1] = inbuf[inbi] (op) rbuf[rank + 1] */
        recv_from = (rank + 1) % size;
        block_offset = ((recv_from < split_rank) ? (recv_from * early_blockcount) :
                        (recv_from * late_blockcount + split_rank));
        block_count = ((recv_from < split_rank) ? early_blockcount : late_blockcount);
        COLL_TUNED_COMPUTE_BLOCKCOUNT(block_count, num_phases, split_phase,
                                      early_phase_segcount, late_phase_segcount)
        phase_count = ((phase < split_phase) ? early_phase_segcount : late_phase_segcount);
        phase_offset = ((phase < split_phase) ? (phase * early_phase_segcount) :
                        (phase * late_phase_segcount + split_phase));
        tmprecv = ((char*) rbuf) + (block_offset + phase_offset) * extent;
        smpi_op_apply(op, inbuf[inbi], tmprecv, &phase_count, &dtype);
    }
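    /* At this point each rank holds exactly one fully reduced block: the
       block that made a full trip around the ring and was combined here
       last, i.e. block (rank + 1) % size. The loop below circulates these
       reduced blocks around the ring so that every rank ends up with all
       of them. */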
    /* Distribution loop - variation of ring allgather */
    send_to = (rank + 1) % size;
    recv_from = (rank + size - 1) % size;
    for (k = 0; k < size - 1; k++) {
        const int recv_data_from = (rank + size - k) % size;
        const int send_data_from = (rank + 1 + size - k) % size;
        const int send_block_offset =
            ((send_data_from < split_rank) ? (send_data_from * early_blockcount) :
             (send_data_from * late_blockcount + split_rank));
        const int recv_block_offset =
            ((recv_data_from < split_rank) ? (recv_data_from * early_blockcount) :
             (recv_data_from * late_blockcount + split_rank));
        block_count = ((send_data_from < split_rank) ? early_blockcount : late_blockcount);

        tmprecv = (char*) rbuf + recv_block_offset * extent;
        tmpsend = (char*) rbuf + send_block_offset * extent;

        /* early_blockcount is an upper bound on the size of the incoming block */
        smpi_mpi_sendrecv(tmpsend, block_count, dtype, send_to, 666,
                          tmprecv, early_blockcount, dtype, recv_from, 666,
                          comm, MPI_STATUS_IGNORE);
    }

    if (NULL != inbuf[0]) smpi_free_tmp_buffer(inbuf[0]);
    if (NULL != inbuf[1]) smpi_free_tmp_buffer(inbuf[1]);

    return MPI_SUCCESS;

 error_hndl:
    XBT_DEBUG("%s:%4d\tRank %d Error occurred %d\n", __FILE__, line, rank, ret);
    if (NULL != inbuf[0]) smpi_free_tmp_buffer(inbuf[0]);
    if (NULL != inbuf[1]) smpi_free_tmp_buffer(inbuf[1]);
    return ret;
}
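/*
 * Minimal usage sketch (illustrative only, not part of this file). Under
 * SMPI these routines are normally reached through MPI_Allreduce once the
 * collective selector is pointed at them, e.g. via something like
 * "smpirun --cfg=smpi/allreduce:ompi ..." (the exact option name may vary
 * by SimGrid version). A direct call would look like:
 */
#if 0 /* illustrative only */
int example(MPI_Comm comm)
{
    double in[4]  = {1.0, 2.0, 3.0, 4.0};
    double out[4] = {0.0, 0.0, 0.0, 0.0};
    /* Sum 4 doubles across all ranks; the decision function above picks
       rdb, lr, ring_segmented, or redbcast based on message size and op. */
    return smpi_coll_tuned_allreduce_ompi(in, out, 4, MPI_DOUBLE, MPI_SUM, comm);
}
#endif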