/*
 * Tree barrier: zero-byte messages are first funnelled up a tree towards
 * rank 0 (fan-in) and then pushed back down the same tree (fan-out).
 *
 * At level `mask` a rank is paired with `rank ^ mask`.  The pair is only
 * active when the peer's bits below `mask` are all zero and the peer is a
 * valid rank of the communicator.  Inside an active pair the lower rank
 * receives during fan-in and sends during fan-out; the higher rank does the
 * opposite.
 *
 * Returns MPI_SUCCESS, or the first error code reported by a PML send/recv.
 * The `module` argument is not referenced by this routine.
 */
int ompi_coll_tuned_barrier_intra_tree(struct ompi_communicator_t *comm,
                                       mca_coll_base_module_t *module)
{
    int me, nprocs, span, mask, peer, rc;

    me     = ompi_comm_rank(comm);
    nprocs = ompi_comm_size(comm);
    OPAL_OUTPUT((ompi_coll_tuned_stream,
                 "ompi_coll_tuned_barrier_intra_tree %d",
                 me));

    /* Power-of-two bound on the communicator size; it limits the tree height. */
    span = opal_next_poweroftwo_inclusive(nprocs);

    /* Fan-in: walk the levels from the leaves (mask = 1) upwards. */
    mask = 1;
    while (mask < span) {
        peer = me ^ mask;
        if ((peer & (mask - 1)) || peer >= nprocs) {
            /* No exchange for this rank at this level. */
            mask <<= 1;
            continue;
        }

        if (me < peer) {
            rc = MCA_PML_CALL(recv (NULL, 0, MPI_BYTE, peer,
                                    MCA_COLL_BASE_TAG_BARRIER, comm,
                                    MPI_STATUS_IGNORE));
            if (MPI_SUCCESS != rc) {
                return rc;
            }
        } else if (me > peer) {
            rc = MCA_PML_CALL(send (NULL, 0, MPI_BYTE, peer,
                                    MCA_COLL_BASE_TAG_BARRIER,
                                    MCA_PML_BASE_SEND_STANDARD, comm));
            if (MPI_SUCCESS != rc) {
                return rc;
            }
        }
        mask <<= 1;
    }

    /* Fan-out: same pairs, visited from the top level downwards, with the
     * direction of every message reversed. */
    span >>= 1;
    mask = span;
    while (mask > 0) {
        peer = me ^ mask;
        if ((peer & (mask - 1)) || peer >= nprocs) {
            mask >>= 1;
            continue;
        }

        if (me < peer) {
            rc = MCA_PML_CALL(send (NULL, 0, MPI_BYTE, peer,
                                    MCA_COLL_BASE_TAG_BARRIER,
                                    MCA_PML_BASE_SEND_STANDARD, comm));
            if (MPI_SUCCESS != rc) {
                return rc;
            }
        } else if (me > peer) {
            rc = MCA_PML_CALL(recv (NULL, 0, MPI_BYTE, peer,
                                    MCA_COLL_BASE_TAG_BARRIER, comm,
                                    MPI_STATUS_IGNORE));
            if (MPI_SUCCESS != rc) {
                return rc;
            }
        }
        mask >>= 1;
    }

    return MPI_SUCCESS;
}
/*
 *  reduce_scatter_intra_dec
 *
 *  Function:  - selects the reduce_scatter algorithm to use
 *  Accepts:   - same arguments as MPI_Reduce_scatter()
 *  Returns:   - MPI_SUCCESS or error code (passed through from the
 *               reduce_scatter implementation that was selected)
 *
 *  Selection, in order:
 *   1. non-commutative operation, or any zero entry in rcounts
 *        -> nonoverlapping (the other algorithms do not currently handle
 *           zero-valued counts);
 *   2. total size <= 12KB, or total size <= 256KB on a power-of-two
 *      communicator, or comm_size >= 0.0012 * total size + 8
 *        -> basic recursive halving;
 *   3. everything else
 *        -> ring.
 */
int ompi_coll_tuned_reduce_scatter_intra_dec_fixed( void *sbuf, void *rbuf,
                                                    int *rcounts,
                                                    struct ompi_datatype_t *dtype,
                                                    struct ompi_op_t *op,
                                                    struct ompi_communicator_t *comm,
                                                    mca_coll_base_module_t *module)
{
    const size_t small_limit = 12 * 1024;
    const size_t large_limit = 256 * 1024;
    const double slope = 0.0012;
    const double intercept = 8.0;
    size_t type_bytes, payload = 0;
    bool saw_zero = false;
    bool fits_small, fits_pow2, many_procs;
    int nprocs, idx, next_pow2;

    OPAL_OUTPUT((ompi_coll_tuned_stream, "ompi_coll_tuned_reduce_scatter_intra_dec_fixed"));

    nprocs = ompi_comm_size(comm);

    /* The element size is needed to turn the summed counts into bytes. */
    ompi_datatype_type_size(dtype, &type_bytes);

    /* Accumulate the element count and remember whether any entry is zero. */
    idx = 0;
    while (idx < nprocs) {
        const int count = rcounts[idx++];

        payload += count;
        if (0 == count) {
            saw_zero = true;
        }
    }

    if (!ompi_op_is_commute(op) || saw_zero) {
        return ompi_coll_tuned_reduce_scatter_intra_nonoverlapping (sbuf, rbuf, rcounts,
                                                                    dtype, op,
                                                                    comm, module);
    }

    /* From here on `payload` is expressed in bytes. */
    payload *= type_bytes;

    /* Power-of-two bound on the communicator size. */
    next_pow2 = opal_next_poweroftwo_inclusive (nprocs);

    fits_small = (payload <= small_limit);
    fits_pow2  = (payload <= large_limit) && (next_pow2 == nprocs);
    many_procs = (nprocs >= slope * payload + intercept);

    if (fits_small || fits_pow2 || many_procs) {
        return ompi_coll_tuned_reduce_scatter_intra_basic_recursivehalving(sbuf, rbuf, rcounts,
                                                                           dtype, op,
                                                                           comm, module);
    }

    return ompi_coll_tuned_reduce_scatter_intra_ring(sbuf, rbuf, rcounts,
                                                     dtype, op,
                                                     comm, module);
}
/*
 *  allgather_intra_dec
 *
 *  Function:  - selects the allgather algorithm to use
 *  Accepts:   - same arguments as MPI_Allgather()
 *  Returns:   - MPI_SUCCESS or error code (passed through from the
 *               allgather implementation that was selected)
 *
 *  A two-process communicator always uses the dedicated two_procs routine.
 *  Otherwise the choice depends on the total message size
 *  (datatype size * scount * communicator size) and on the communicator
 *  size.  Exactly one of two selection policies is compiled in:
 *   - default:                         thresholds from measurements (below);
 *   - USE_MPICH2_DECISION defined:     thresholds as in MPICH-2 (below).
 *  The two policies are mutually exclusive so that the MPICH-2 variant is
 *  actually reachable when it is requested at build time.
 */
int ompi_coll_tuned_allgather_intra_dec_fixed(void *sbuf, int scount,
                                              struct ompi_datatype_t *sdtype,
                                              void* rbuf, int rcount,
                                              struct ompi_datatype_t *rdtype,
                                              struct ompi_communicator_t *comm,
                                              mca_coll_base_module_t *module)
{
    int communicator_size, pow2_size;
    size_t dsize, total_dsize;

    communicator_size = ompi_comm_size(comm);

    /* Special case for 2 processes */
    if (communicator_size == 2) {
        return ompi_coll_tuned_allgather_intra_two_procs (sbuf, scount, sdtype,
                                                          rbuf, rcount, rdtype,
                                                          comm, module);
    }

    /* Determine complete data size */
    ompi_datatype_type_size(sdtype, &dsize);
    total_dsize = dsize * (size_t)scount * (size_t)communicator_size;

    OPAL_OUTPUT((ompi_coll_tuned_stream, "ompi_coll_tuned_allgather_intra_dec_fixed"
                 " rank %d com_size %d msg_length %lu",
                 ompi_comm_rank(comm), communicator_size, (unsigned long)total_dsize));

    /* Equal to communicator_size exactly when that size is a power of two
       (this is how the value is used below). */
    pow2_size = opal_next_poweroftwo_inclusive (communicator_size);

#if defined(USE_MPICH2_DECISION)
    /* Decision as in MPICH-2
       presented in Thakur et.al. "Optimization of Collective Communication
       Operations in MPICH", International Journal of High Performance Computing
       Applications, Vol. 19, No. 1, 49-66 (2005)
       - for power-of-two processes and small and medium size messages
       (up to 512KB) use recursive doubling
       - for non-power-of-two processes and small messages (80KB) use bruck,
       - for everything else use ring.
    */
    if ((pow2_size == communicator_size) && (total_dsize < 524288)) {
        return ompi_coll_tuned_allgather_intra_recursivedoubling(sbuf, scount, sdtype,
                                                                 rbuf, rcount, rdtype,
                                                                 comm, module);
    } else if (total_dsize <= 81920) {
        return ompi_coll_tuned_allgather_intra_bruck(sbuf, scount, sdtype,
                                                     rbuf, rcount, rdtype,
                                                     comm, module);
    }
    return ompi_coll_tuned_allgather_intra_ring(sbuf, scount, sdtype,
                                                rbuf, rcount, rdtype,
                                                comm, module);
#else
    /* Decision based on MX 2Gb results from Grig cluster at
       The University of Tennessee, Knoxville
       - if total message size is less than 50KB use either bruck or
       recursive doubling for non-power of two and power of two nodes,
       respectively.
       - else use ring and neighbor exchange algorithms for odd and even
       number of nodes, respectively.
    */
    if (total_dsize < 50000) {
        if (pow2_size == communicator_size) {
            return ompi_coll_tuned_allgather_intra_recursivedoubling(sbuf, scount, sdtype,
                                                                     rbuf, rcount, rdtype,
                                                                     comm, module);
        } else {
            return ompi_coll_tuned_allgather_intra_bruck(sbuf, scount, sdtype,
                                                         rbuf, rcount, rdtype,
                                                         comm, module);
        }
    } else {
        if (communicator_size % 2) {
            return ompi_coll_tuned_allgather_intra_ring(sbuf, scount, sdtype,
                                                        rbuf, rcount, rdtype,
                                                        comm, module);
        } else {
            return ompi_coll_tuned_allgather_intra_neighborexchange(sbuf, scount, sdtype,
                                                                    rbuf, rcount, rdtype,
                                                                    comm, module);
        }
    }
#endif /* defined(USE_MPICH2_DECISION) */
}