/*
 * reduce_intra_binary
 *
 * Function:  Reduce over the binary tree cached on the tuned module.
 * Accepts:   MPI_Reduce()-style arguments plus the collective module, the
 *            requested segment size (segsize) and max_outstanding_reqs,
 *            which is passed through to the generic reduce unchanged.
 * Returns:   whatever ompi_coll_tuned_reduce_generic() returns.
 */
int ompi_coll_tuned_reduce_intra_binary( void *sendbuf, void *recvbuf,
                                         int count, ompi_datatype_t* datatype,
                                         ompi_op_t* op, int root,
                                         ompi_communicator_t* comm,
                                         mca_coll_base_module_t *module,
                                         uint32_t segsize,
                                         int max_outstanding_reqs )
{
    mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
    mca_coll_tuned_comm_t *tuned_comm = tuned_module->tuned_data;
    size_t type_size;
    int seg_elems = count;   /* starts as the full count; the macro below may adjust it */

    OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:reduce_intra_binary rank %d ss %5d",
                 ompi_comm_rank(comm), segsize));

    /* Bring the cached binary tree up to date for this root. */
    COLL_TUNED_UPDATE_BINTREE( comm, tuned_module, root );

    /**
     * Determine number of segments and number of elements
     * sent per operation
     */
    ompi_datatype_type_size( datatype, &type_size );
    COLL_TUNED_COMPUTED_SEGCOUNT( segsize, type_size, seg_elems );

    return ompi_coll_tuned_reduce_generic( sendbuf, recvbuf, count, datatype,
                                           op, root, comm, module,
                                           tuned_comm->cached_bintree,
                                           seg_elems, max_outstanding_reqs );
}
/*
 * reduce_intra_chain
 *
 * Function:  Reduce over the chain topology cached on the communicator,
 *            built with the requested fanout.
 * Accepts:   MPI_Reduce()-style arguments plus the requested segment size
 *            (segsize) and the chain fanout.
 * Returns:   whatever ompi_coll_tuned_reduce_generic() returns.
 *
 * NOTE(review): unlike the other reduce wrappers in this file, this one takes
 * no module / max_outstanding_reqs arguments, queries the type size through
 * ompi_ddt_type_size() and calls the generic reduce with 9 arguments instead
 * of 11. It looks like a leftover from an older interface -- confirm it still
 * matches the prototypes in use.
 */
int ompi_coll_tuned_reduce_intra_chain( void *sendbuf, void *recvbuf,
                                        int count, ompi_datatype_t* datatype,
                                        ompi_op_t* op, int root,
                                        ompi_communicator_t* comm,
                                        uint32_t segsize, int fanout)
{
    size_t type_size;
    int seg_elems = count;   /* starts as the full count; the macro below may adjust it */

    OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:reduce_intra_chain rank %d fo %d ss %5d",
                 ompi_comm_rank(comm), fanout, segsize));

    /* Bring the cached chain up to date for this root and fanout. */
    COLL_TUNED_UPDATE_CHAIN( comm, root, fanout );

    /**
     * Determine number of segments and number of elements
     * sent per operation
     */
    ompi_ddt_type_size( datatype, &type_size );
    COLL_TUNED_COMPUTED_SEGCOUNT( segsize, type_size, seg_elems );

    /* The cached chain is read only after the update macro above has run. */
    return ompi_coll_tuned_reduce_generic( sendbuf, recvbuf, count, datatype,
                                           op, root, comm,
                                           comm->c_coll_selected_data->cached_chain,
                                           seg_elems );
}
/*
 * reduce_intra_in_order_binary
 *
 * Function:  Logarithmic reduce operation for non-commutative operations.
 * Accepts:   same as MPI_Reduce(), plus the collective module, the requested
 *            segment size (segsize) and max_outstanding_reqs, which is passed
 *            through to the generic reduce unchanged.
 * Returns:   MPI_SUCCESS or error code
 *
 * The reduction itself always runs towards rank (size - 1). When that rank
 * is not the requested root, the result is forwarded to the real root with
 * one extra point-to-point message. Any temporary buffer allocated by this
 * function is released on every exit path, including error returns.
 */
int ompi_coll_tuned_reduce_intra_in_order_binary( void *sendbuf, void *recvbuf,
                                                  int count,
                                                  ompi_datatype_t* datatype,
                                                  ompi_op_t* op, int root,
                                                  ompi_communicator_t* comm,
                                                  mca_coll_base_module_t *module,
                                                  uint32_t segsize,
                                                  int max_outstanding_reqs )
{
    int ret, rank, size, io_root, segcount = count;
    void *use_this_sendbuf = sendbuf, *use_this_recvbuf = recvbuf;
    char *tmpbuf = NULL;   /* sole owner of any temporary buffer; freed at cleanup */
    size_t typelng;
    mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
    mca_coll_tuned_comm_t *data = tuned_module->tuned_data;

    rank = ompi_comm_rank(comm);
    size = ompi_comm_size(comm);
    OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:reduce_intra_in_order_binary rank %d ss %5d",
                 rank, segsize));

    COLL_TUNED_UPDATE_IN_ORDER_BINTREE( comm, tuned_module );

    /**
     * Determine number of segments and number of elements
     * sent per operation
     */
    ompi_datatype_type_size( datatype, &typelng );
    COLL_TUNED_COMPUTED_SEGCOUNT( segsize, typelng, segcount );

    /* An in-order binary tree must use root (size-1) to preserve the order of
       operations.  Thus, if root is not rank (size - 1), then we must handle
       1. MPI_IN_PLACE option on real root, and
       2. we must allocate temporary recvbuf on rank (size - 1).
       Note that generic function must be careful not to switch order of
       operations for non-commutative ops.
    */
    io_root = size - 1;
    if (io_root != root) {
        ptrdiff_t tlb, text, lb, ext;

        ompi_datatype_get_extent(datatype, &lb, &ext);
        ompi_datatype_get_true_extent(datatype, &tlb, &text);

        if ((root == rank) && (MPI_IN_PLACE == sendbuf)) {
            /* Real root with MPI_IN_PLACE: its contribution lives in recvbuf,
               so copy it into a private send buffer for the generic reduce. */
            tmpbuf = (char *) malloc(text + (ptrdiff_t)(count - 1) * ext);
            if (NULL == tmpbuf) {
                return MPI_ERR_INTERN;
            }
            ompi_datatype_copy_content_same_ddt(datatype, count,
                                                (char*)tmpbuf,
                                                (char*)recvbuf);
            use_this_sendbuf = tmpbuf;
        } else if (io_root == rank) {
            /* io_root collects the result in a scratch receive buffer and
               forwards it to the real root below. */
            tmpbuf = (char *) malloc(text + (ptrdiff_t)(count - 1) * ext);
            if (NULL == tmpbuf) {
                return MPI_ERR_INTERN;
            }
            use_this_recvbuf = tmpbuf;
        }
    }

    /* Use generic reduce with in-order binary tree topology and io_root */
    ret = ompi_coll_tuned_reduce_generic( use_this_sendbuf, use_this_recvbuf,
                                          count, datatype, op, io_root, comm,
                                          module, data->cached_in_order_bintree,
                                          segcount, max_outstanding_reqs );
    if (MPI_SUCCESS != ret) {
        goto cleanup;
    }

    /* Move the result from io_root to the real root when they differ */
    if (io_root != root) {
        if (root == rank) {
            /* Receive result from rank io_root to recvbuf */
            ret = MCA_PML_CALL(recv(recvbuf, count, datatype, io_root,
                                    MCA_COLL_BASE_TAG_REDUCE, comm,
                                    MPI_STATUS_IGNORE));
        } else if (io_root == rank) {
            /* Send result from use_this_recvbuf to root */
            ret = MCA_PML_CALL(send(use_this_recvbuf, count, datatype, root,
                                    MCA_COLL_BASE_TAG_REDUCE,
                                    MCA_PML_BASE_SEND_STANDARD, comm));
        }
    }

 cleanup:
    /* tmpbuf is NULL unless one of the two branches above allocated it;
       free(NULL) is a no-op, so a single release covers every path. */
    free(tmpbuf);
    return ret;
}
/*
 * reduce_intra_binomial
 *
 * Function:  Reduce using a binomial-style tree chosen by sdn_comp_enable:
 *              > 1  : a hard-coded tree built on the fly (guarded by an
 *                     assert, see below),
 *              > 0  : the precomputed tree sdn_shortest_bmtree[root],
 *              else : the in-order binomial tree cached on the tuned module.
 * Accepts:   MPI_Reduce()-style arguments plus the collective module, the
 *            requested segment size (segsize) and max_outstanding_reqs,
 *            which is passed through to the generic reduce unchanged.
 * Returns:   the result of ompi_coll_tuned_reduce_generic(), or
 *            MPI_ERR_INTERN if the temporary tree cannot be allocated.
 */
int ompi_coll_tuned_reduce_intra_binomial( void *sendbuf, void *recvbuf,
                                           int count, ompi_datatype_t* datatype,
                                           ompi_op_t* op, int root,
                                           ompi_communicator_t* comm,
                                           mca_coll_base_module_t *module,
                                           uint32_t segsize,
                                           int max_outstanding_reqs )
{
    int ret, segcount = count;
    size_t typelng;
    mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
    mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
    ompi_coll_tree_t *tree = NULL;         /* topology handed to the generic reduce */
    ompi_coll_tree_t *owned_tree = NULL;   /* non-NULL only when allocated here */

    OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:reduce_intra_binomial rank %d ss %5d",
                 ompi_comm_rank(comm), segsize));

    if (sdn_comp_enable > 1) {
        /* This branch is not expected to run: the assert aborts in builds
           where assertions are enabled. */
        assert(false);

        owned_tree = (ompi_coll_tree_t*)malloc(sizeof(ompi_coll_tree_t));
        if (NULL == owned_tree) {
            return MPI_ERR_INTERN;
        }
        owned_tree->tree_bmtree   = 1;
        owned_tree->tree_nextsize = MPI_UNDEFINED;
        owned_tree->tree_root     = root;

        /* Hard-coded parent/children table with entries for ranks 0..7 only.
           NOTE(review): every chain of tree_prev links ends at rank 7, yet
           tree_root is set to the caller's root -- confirm this is intended. */
        switch (ompi_comm_rank(comm)) {
        case 0:
            owned_tree->tree_nextsize = 0;
            owned_tree->tree_prev = 1;
            break;
        case 1:
            owned_tree->tree_nextsize = 1;
            owned_tree->tree_next[0] = 0;
            owned_tree->tree_prev = 3;
            break;
        case 2:
            owned_tree->tree_nextsize = 0;
            owned_tree->tree_prev = 3;
            break;
        case 3:
            owned_tree->tree_nextsize = 2;
            owned_tree->tree_next[0] = 1;
            owned_tree->tree_next[1] = 2;
            owned_tree->tree_prev = 7;
            break;
        case 4:
            owned_tree->tree_nextsize = 1;
            owned_tree->tree_next[0] = 5;
            owned_tree->tree_prev = 7;
            break;
        case 5:
            owned_tree->tree_nextsize = 0;
            owned_tree->tree_prev = 4;
            break;
        case 6:
            owned_tree->tree_nextsize = 0;
            owned_tree->tree_prev = 7;
            break;
        case 7:
            owned_tree->tree_nextsize = 3;
            owned_tree->tree_next[0] = 3;
            owned_tree->tree_next[1] = 4;
            owned_tree->tree_next[2] = 6;
            owned_tree->tree_prev = 7;
            break;
        default:
            /* NOTE(review): ranks outside 0..7 have no entry; tree_nextsize
               stays MPI_UNDEFINED and tree_prev is left unset, exactly as in
               the original code. */
            break;
        }
        tree = owned_tree;
    } else if (sdn_comp_enable > 0) {
        tree = sdn_shortest_bmtree[root];
    } else {
        COLL_TUNED_UPDATE_IN_ORDER_BMTREE( comm, tuned_module, root );
        /* Read the cached tree only after the update macro above has run. */
        tree = data->cached_in_order_bmtree;
    }

    /**
     * Determine number of segments and number of elements
     * sent per operation
     */
    ompi_datatype_type_size( datatype, &typelng );
    COLL_TUNED_COMPUTED_SEGCOUNT( segsize, typelng, segcount );

    ret = ompi_coll_tuned_reduce_generic( sendbuf, recvbuf, count, datatype,
                                          op, root, comm, module,
                                          tree, segcount, max_outstanding_reqs );

    /* Release the temporary tree, if any; the cached and precomputed trees
       are not owned by this function. free(NULL) is a no-op.
       NOTE(review): assumes the generic reduce keeps no reference to the tree
       after it returns -- confirm. */
    free(owned_tree);
    return ret;
}