int mca_coll_ml_reduce_nb(void *sbuf, void *rbuf, int count,
                          struct ompi_datatype_t *dtype, struct ompi_op_t *op,
                          int root, struct ompi_communicator_t *comm,
                          ompi_request_t **req,
                          mca_coll_base_module_t *module)
{
    int ret = OMPI_SUCCESS;
    mca_coll_ml_module_t *ml_module = (mca_coll_ml_module_t *) module;

    if (OPAL_UNLIKELY(!ompi_op_is_commute(op) ||
                      !opal_datatype_is_contiguous_memory_layout(&dtype->super, count))) {
        /* coll/ml does not handle non-commutative operations or
         * non-contiguous datatypes at this time. Fall back on another
         * collective module. */
        return ml_module->fallback.coll_ireduce(sbuf, rbuf, count, dtype, op, root,
                                                comm, req,
                                                ml_module->fallback.coll_ireduce_module);
    }

    ML_VERBOSE(10, ("Calling ML Reduce"));

    ret = parallel_reduce_start(sbuf, rbuf, count, dtype, op, root, comm,
                                ml_module, req, ML_SMALL_DATA_REDUCE,
                                ML_LARGE_DATA_REDUCE);
    if (OPAL_UNLIKELY(ret != OMPI_SUCCESS)) {
        ML_VERBOSE(10, ("Failed to launch"));
        return ret;
    }

    ML_VERBOSE(10, ("Non-blocking Reduce is done"));

    return OMPI_SUCCESS;
}
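
/* Editor's sketch (caller side, hypothetical buffers): a routine like the
 * coll/ml non-blocking reduce above is what ultimately services an
 * MPI_Ireduce call when this component is selected. MPI_SUM is commutative
 * and a single double is contiguous, so the fast path applies; a
 * non-commutative user-defined op would take the fallback path instead. */
#include <mpi.h>

static int example_ireduce(MPI_Comm comm)
{
    double local = 1.0, global = 0.0;
    MPI_Request request;

    MPI_Ireduce(&local, &global, 1, MPI_DOUBLE, MPI_SUM, 0, comm, &request);
    /* ... overlap computation with the reduction here ... */
    return MPI_Wait(&request, MPI_STATUS_IGNORE);
}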
int MPI_Op_commutative(MPI_Op op, int *commute)
{
    OPAL_CR_NOOP_PROGRESS();

    /* Error checking */
    if (MPI_PARAM_CHECK) {
        OMPI_ERR_INIT_FINALIZE(FUNC_NAME);
        if (NULL == op || MPI_OP_NULL == op) {
            return OMPI_ERRHANDLER_INVOKE(MPI_COMM_WORLD, MPI_ERR_OP, FUNC_NAME);
        }
        if (NULL == commute) {
            return OMPI_ERRHANDLER_INVOKE(MPI_COMM_WORLD, MPI_ERR_ARG, FUNC_NAME);
        }
    }

    /* We have a valid op; get the flag */
    *commute = ompi_op_is_commute(op);

    /* All done */
    return MPI_SUCCESS;
}
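
/* Usage sketch: query commutativity of a built-in op and of a user-defined
 * op created with commute = 0. The function "my_user_fn" is a hypothetical
 * placeholder for a non-commutative combination rule. */
#include <mpi.h>

static void my_user_fn(void *in, void *inout, int *len, MPI_Datatype *dtype)
{
    /* e.g. some non-commutative combination of in and inout */
}

static void query_commutativity(void)
{
    int commute;
    MPI_Op user_op;

    MPI_Op_commutative(MPI_SUM, &commute);   /* commute == 1 */
    MPI_Op_create(my_user_fn, 0, &user_op);  /* declared non-commutative */
    MPI_Op_commutative(user_op, &commute);   /* commute == 0 */
    MPI_Op_free(&user_op);
}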
/*
 * reduce_scatter_intra_dec
 *
 * Function:  - selects the reduce_scatter algorithm to use
 * Accepts:   - same arguments as MPI_Reduce_scatter()
 * Returns:   - MPI_SUCCESS or error code (passed from
 *              the reduce scatter implementation)
 * Note:      If we detect zero-valued counts in the rcounts array, we
 *            fall back to the nonoverlapping algorithm because the other
 *            algorithms do not currently handle it.
 */
int ompi_coll_tuned_reduce_scatter_intra_dec_fixed(void *sbuf, void *rbuf,
                                                   int *rcounts,
                                                   struct ompi_datatype_t *dtype,
                                                   struct ompi_op_t *op,
                                                   struct ompi_communicator_t *comm,
                                                   mca_coll_base_module_t *module)
{
    int comm_size, i, pow2;
    size_t total_message_size, dsize;
    const double a = 0.0012;
    const double b = 8.0;
    const size_t small_message_size = 12 * 1024;
    const size_t large_message_size = 256 * 1024;
    bool zerocounts = false;

    OPAL_OUTPUT((ompi_coll_tuned_stream,
                 "ompi_coll_tuned_reduce_scatter_intra_dec_fixed"));

    comm_size = ompi_comm_size(comm);
    /* We need the data size for the decision function */
    ompi_ddt_type_size(dtype, &dsize);
    total_message_size = 0;
    for (i = 0; i < comm_size; i++) {
        total_message_size += rcounts[i];
        if (0 == rcounts[i]) {
            zerocounts = true;
        }
    }

    if (!ompi_op_is_commute(op) || zerocounts) {
        return ompi_coll_tuned_reduce_scatter_intra_nonoverlapping(sbuf, rbuf,
                                                                   rcounts, dtype,
                                                                   op, comm, module);
    }

    total_message_size *= dsize;

    /* compute the smallest power of 2 >= comm_size */
    for (pow2 = 1; pow2 < comm_size; pow2 <<= 1)
        ;

    if ((total_message_size <= small_message_size) ||
        ((total_message_size <= large_message_size) && (pow2 == comm_size)) ||
        (comm_size >= a * total_message_size + b)) {
        return ompi_coll_tuned_reduce_scatter_intra_basic_recursivehalving(sbuf, rbuf,
                                                                           rcounts, dtype,
                                                                           op, comm, module);
    }
    return ompi_coll_tuned_reduce_scatter_intra_ring(sbuf, rbuf, rcounts,
                                                     dtype, op, comm, module);
}
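
/* Editor's sketch (hypothetical names, not part of the component): the fixed
 * decision rule above as a self-contained pure function, useful for reasoning
 * about which algorithm a given communicator size and total payload select. */
#include <stdbool.h>
#include <stddef.h>

enum rs_alg { RS_NONOVERLAPPING, RS_RECURSIVE_HALVING, RS_RING };

static enum rs_alg rs_decision(int comm_size, size_t total_bytes,
                               bool commutative, bool zerocounts)
{
    const double a = 0.0012, b = 8.0;
    const size_t small_msg = 12 * 1024, large_msg = 256 * 1024;
    int pow2;

    if (!commutative || zerocounts)
        return RS_NONOVERLAPPING;
    for (pow2 = 1; pow2 < comm_size; pow2 <<= 1)
        ;
    if (total_bytes <= small_msg ||
        (total_bytes <= large_msg && pow2 == comm_size) ||
        comm_size >= a * (double) total_bytes + b)
        return RS_RECURSIVE_HALVING;
    return RS_RING;
}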
/*
 * allreduce_intra
 *
 * Function:  - allreduce using other MPI collectives
 * Accepts:   - same as MPI_Allreduce()
 * Returns:   - MPI_SUCCESS or error code
 */
int ompi_coll_tuned_allreduce_intra_dec_fixed(void *sbuf, void *rbuf, int count,
                                              struct ompi_datatype_t *dtype,
                                              struct ompi_op_t *op,
                                              struct ompi_communicator_t *comm,
                                              mca_coll_base_module_t *module)
{
    size_t dsize, block_dsize;
    int comm_size = ompi_comm_size(comm);
    const size_t intermediate_message = 10000;

    OPAL_OUTPUT((ompi_coll_tuned_stream,
                 "ompi_coll_tuned_allreduce_intra_dec_fixed"));

    /**
     * Decision function based on MX results from the Grig cluster at UTK.
     *
     * Currently, the linear, recursive doubling, and nonoverlapping algorithms
     * can handle both commutative and non-commutative operations.
     * The ring algorithm does not support non-commutative operations.
     */
    ompi_ddt_type_size(dtype, &dsize);
    block_dsize = dsize * count;

    if (block_dsize < intermediate_message) {
        return ompi_coll_tuned_allreduce_intra_recursivedoubling(sbuf, rbuf, count,
                                                                 dtype, op, comm,
                                                                 module);
    }

    if (ompi_op_is_commute(op) && (count > comm_size)) {
        const size_t segment_size = 1 << 20; /* 1 MB */
        if (comm_size * segment_size >= block_dsize) {
            return ompi_coll_tuned_allreduce_intra_ring(sbuf, rbuf, count,
                                                        dtype, op, comm, module);
        }
        return ompi_coll_tuned_allreduce_intra_ring_segmented(sbuf, rbuf, count,
                                                              dtype, op, comm,
                                                              module, segment_size);
    }

    return ompi_coll_tuned_allreduce_intra_nonoverlapping(sbuf, rbuf, count,
                                                          dtype, op, comm, module);
}
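
/* Editor's sketch (hypothetical names): the allreduce rule above as a pure
 * function, with a worked example in the trailing comment. */
#include <stdbool.h>
#include <stddef.h>

enum ar_alg { AR_RECURSIVE_DOUBLING, AR_RING, AR_RING_SEGMENTED, AR_NONOVERLAPPING };

static enum ar_alg ar_decision(int comm_size, int count, size_t dsize,
                               bool commutative)
{
    const size_t intermediate_message = 10000;
    const size_t segment_size = 1 << 20; /* 1 MB */
    size_t block_dsize = dsize * (size_t) count;

    if (block_dsize < intermediate_message)
        return AR_RECURSIVE_DOUBLING;
    if (commutative && count > comm_size)
        return ((size_t) comm_size * segment_size >= block_dsize)
                   ? AR_RING : AR_RING_SEGMENTED;
    return AR_NONOVERLAPPING;
    /* e.g. comm_size = 16, count = 1<<20 doubles (8 MB >= 10000 B),
     * MPI_SUM is commutative and count > comm_size; since
     * 16 ranks * 1 MB = 16 MB >= 8 MB, the plain ring is chosen. */
}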
/*
 * reduce_intra_dec
 *
 * Function:  - selects the reduce algorithm to use
 * Accepts:   - same arguments as MPI_Reduce()
 * Returns:   - MPI_SUCCESS or error code (passed from the reduce implementation)
 */
int ompi_coll_tuned_reduce_intra_dec_fixed(void *sendbuf, void *recvbuf, int count,
                                           struct ompi_datatype_t *datatype,
                                           struct ompi_op_t *op, int root,
                                           struct ompi_communicator_t *comm,
                                           mca_coll_base_module_t *module)
{
    int communicator_size, segsize = 0;
    size_t message_size, dsize;
    const double a1 = 0.6016 / 1024.0; /* [1/B] */
    const double b1 = 1.3496;
    const double a2 = 0.0410 / 1024.0; /* [1/B] */
    const double b2 = 9.7128;
    const double a3 = 0.0422 / 1024.0; /* [1/B] */
    const double b3 = 1.1614;
    const double a4 = 0.0033 / 1024.0; /* [1/B] */
    const double b4 = 1.6761;
    const int max_requests = 0; /* no limit on # of outstanding requests */

    communicator_size = ompi_comm_size(comm);

    /* need data size for decision function */
    ompi_ddt_type_size(datatype, &dsize);
    message_size = dsize * count; /* needed for decision */

    /**
     * If the operation is non-commutative we currently have a choice of the
     * linear or the in-order binary tree algorithm.
     */
    if (!ompi_op_is_commute(op)) {
        if ((communicator_size < 12) && (message_size < 2048)) {
            return ompi_coll_tuned_reduce_intra_basic_linear(sendbuf, recvbuf, count,
                                                             datatype, op, root,
                                                             comm, module);
        }
        return ompi_coll_tuned_reduce_intra_in_order_binary(sendbuf, recvbuf, count,
                                                            datatype, op, root, comm,
                                                            module, 0, max_requests);
    }

    OPAL_OUTPUT((ompi_coll_tuned_stream,
                 "ompi_coll_tuned_reduce_intra_dec_fixed "
                 "root %d rank %d com_size %d msg_length %lu",
                 root, ompi_comm_rank(comm), communicator_size,
                 (unsigned long) message_size));

    if ((communicator_size < 8) && (message_size < 512)) {
        /* Linear_0K */
        return ompi_coll_tuned_reduce_intra_basic_linear(sendbuf, recvbuf, count,
                                                         datatype, op, root,
                                                         comm, module);
    } else if (((communicator_size < 8) && (message_size < 20480)) ||
               (message_size < 2048) || (count <= 1)) {
        /* Binomial_0K */
        segsize = 0;
        return ompi_coll_tuned_reduce_intra_binomial(sendbuf, recvbuf, count,
                                                     datatype, op, root, comm,
                                                     module, segsize, max_requests);
    } else if (communicator_size > (a1 * message_size + b1)) {
        /* Binomial_1K */
        segsize = 1024;
        return ompi_coll_tuned_reduce_intra_binomial(sendbuf, recvbuf, count,
                                                     datatype, op, root, comm,
                                                     module, segsize, max_requests);
    } else if (communicator_size > (a2 * message_size + b2)) {
        /* Pipeline_1K */
        segsize = 1024;
        return ompi_coll_tuned_reduce_intra_pipeline(sendbuf, recvbuf, count,
                                                     datatype, op, root, comm,
                                                     module, segsize, max_requests);
    } else if (communicator_size > (a3 * message_size + b3)) {
        /* Binary_32K */
        segsize = 32 * 1024;
        return ompi_coll_tuned_reduce_intra_binary(sendbuf, recvbuf, count,
                                                   datatype, op, root, comm,
                                                   module, segsize, max_requests);
    }
    if (communicator_size > (a4 * message_size + b4)) {
        /* Pipeline_32K */
        segsize = 32 * 1024;
    } else {
        /* Pipeline_64K */
        segsize = 64 * 1024;
    }
    return ompi_coll_tuned_reduce_intra_pipeline(sendbuf, recvbuf, count,
                                                 datatype, op, root, comm,
                                                 module, segsize, max_requests);

#if 0
    /* Older decision logic, disabled. */
    /* for small messages use linear algorithm */
    if (message_size <= 4096) {
        segsize = 0;
        fanout = communicator_size - 1;
        /* when linear implemented or taken from basic put here, right now
         * using chain as a linear system */
        /* it is implemented and I shouldn't be calling a chain with a fanout
         * bigger than MAXTREEFANOUT from topo.h! */
        return ompi_coll_tuned_reduce_intra_basic_linear(sendbuf, recvbuf, count,
                                                         datatype, op, root,
                                                         comm, module);
        /* return ompi_coll_tuned_reduce_intra_chain(sendbuf, recvbuf, count,
                                                     datatype, op, root, comm,
                                                     segsize, fanout); */
    }
    if (message_size < 524288) {
        if (message_size <= 65536) {
            segsize = 32768;
            fanout = 8;
        } else {
            segsize = 1024;
            fanout = communicator_size / 2;
        }
        /* later swap this for a binary tree */
        /* fanout = 2; */
        return ompi_coll_tuned_reduce_intra_chain(sendbuf, recvbuf, count,
                                                  datatype, op, root, comm,
                                                  module, segsize, fanout,
                                                  max_requests);
    }
    segsize = 1024;
    return ompi_coll_tuned_reduce_intra_pipeline(sendbuf, recvbuf, count,
                                                 datatype, op, root, comm,
                                                 module, segsize, max_requests);
#endif /* 0 */
}
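
/* Worked example (editor's note): for message_size = 8192 B,
 * a1 * message_size + b1 = (0.6016 / 1024) * 8192 + 1.3496 ~= 6.16.
 * Communicators smaller than 8 ranks are already captured by the
 * Linear_0K/Binomial_0K branches, so every communicator that reaches the
 * a1 test (size >= 8 > 6.16) selects Binomial_1K at this message size;
 * the a2..a4 crossovers only matter for larger messages. */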
/*
 * reduce_log_intra
 *
 * Function:  - reduction using O(log N) algorithm
 * Accepts:   - same as MPI_Reduce()
 * Returns:   - MPI_SUCCESS or error code
 *
 * Performing reduction on each dimension of the hypercube.
 * An example for 8 procs (dimensions = 3):
 *
 * Stage 1, reduce on X dimension, 1 -> 0, 3 -> 2, 5 -> 4, 7 -> 6
 *
 *          6----<---7    proc_0: 0+1
 *         /|       /|    proc_1: 1
 *        / |      / |    proc_2: 2+3
 *       /  |     /  |    proc_3: 3
 *      4----<---5   |    proc_4: 4+5
 *      |   2--< |---3    proc_5: 5
 *      |  /     |  /     proc_6: 6+7
 *      | /      | /      proc_7: 7
 *      |/       |/
 *      0----<---1
 *
 * Stage 2, reduce on Y dimension, 2 -> 0, 6 -> 4
 *
 *          6--------7    proc_0: 0+1+2+3
 *         /|       /|    proc_1: 1
 *        v |      / |    proc_2: 2+3
 *       /  |     /  |    proc_3: 3
 *      4--------5   |    proc_4: 4+5+6+7
 *      |   2--- |---3    proc_5: 5
 *      |  /     |  /     proc_6: 6+7
 *      | v      | /      proc_7: 7
 *      |/       |/
 *      0--------1
 *
 * Stage 3, reduce on Z dimension, 4 -> 0
 *
 *          6--------7    proc_0: 0+1+2+3+4+5+6+7
 *         /|       /|    proc_1: 1
 *        / |      / |    proc_2: 2+3
 *       /  |     /  |    proc_3: 3
 *      4--------5   |    proc_4: 4+5+6+7
 *      |   2--- |---3    proc_5: 5
 *      v  /     |  /     proc_6: 6+7
 *      | /      | /      proc_7: 7
 *      |/       |/
 *      0--------1
 */
int mca_coll_basic_reduce_log_intra(void *sbuf, void *rbuf, int count,
                                    struct ompi_datatype_t *dtype,
                                    struct ompi_op_t *op,
                                    int root, struct ompi_communicator_t *comm,
                                    mca_coll_base_module_t *module)
{
    int i, size, rank, vrank;
    int err, peer, dim, mask;
    ptrdiff_t true_lb, true_extent, lb, extent;
    char *free_buffer = NULL;
    char *free_rbuf = NULL;
    char *pml_buffer = NULL;
    char *snd_buffer = NULL;
    char *rcv_buffer = (char *) rbuf;
    char *inplace_temp = NULL;

    /* JMS workaround for now -- if the operation is not commutative, just
     * call the linear algorithm. Need to talk to Edgar / George about
     * fixing this algorithm here to work with non-commutative operations. */
    if (!ompi_op_is_commute(op)) {
        return mca_coll_basic_reduce_lin_intra(sbuf, rbuf, count, dtype,
                                               op, root, comm, module);
    }

    /* Some variables */
    size = ompi_comm_size(comm);
    rank = ompi_comm_rank(comm);
    vrank = ompi_op_is_commute(op) ? (rank - root + size) % size : rank;
    dim = comm->c_cube_dim;

    /* Allocate the incoming and resulting message buffers. See lengthy
     * rationale above. */
    ompi_datatype_get_extent(dtype, &lb, &extent);
    ompi_datatype_get_true_extent(dtype, &true_lb, &true_extent);

    free_buffer = (char *) malloc(true_extent + (count - 1) * extent);
    if (NULL == free_buffer) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }
    pml_buffer = free_buffer - lb;

    /* read the comment about commutative operations (a few lines down
     * the page) */
    if (ompi_op_is_commute(op)) {
        rcv_buffer = pml_buffer;
    }

    /* Allocate a send buffer in case the MPI_IN_PLACE option has been used.
     * See lengthy rationale above. */
    if (MPI_IN_PLACE == sbuf) {
        inplace_temp = (char *) malloc(true_extent + (count - 1) * extent);
        if (NULL == inplace_temp) {
            err = OMPI_ERR_OUT_OF_RESOURCE;
            goto cleanup_and_return;
        }
        sbuf = inplace_temp - lb;
        err = ompi_datatype_copy_content_same_ddt(dtype, count,
                                                  (char *) sbuf, (char *) rbuf);
    }
    snd_buffer = (char *) sbuf;

    if (rank != root && 0 == (vrank & 1)) {
        /* root is the only one required to provide a valid rbuf.
         * Assume rbuf is invalid for all other ranks, so fix it up
         * here to be valid on all non-leaf ranks */
        free_rbuf = (char *) malloc(true_extent + (count - 1) * extent);
        if (NULL == free_rbuf) {
            err = OMPI_ERR_OUT_OF_RESOURCE;
            goto cleanup_and_return;
        }
        rbuf = free_rbuf - lb;
    }

    /* Loop over cube dimensions. High processes send to low ones in the
     * dimension. */
    for (i = 0, mask = 1; i < dim; ++i, mask <<= 1) {

        /* A high-proc sends to low-proc and stops. */
        if (vrank & mask) {
            peer = vrank & ~mask;
            if (ompi_op_is_commute(op)) {
                peer = (peer + root) % size;
            }

            err = MCA_PML_CALL(send(snd_buffer, count, dtype, peer,
                                    MCA_COLL_BASE_TAG_REDUCE,
                                    MCA_PML_BASE_SEND_STANDARD, comm));
            if (MPI_SUCCESS != err) {
                goto cleanup_and_return;
            }
            snd_buffer = (char *) rbuf;
            break;
        }

        /* A low-proc receives, reduces, and moves to a higher
         * dimension. */
        else {
            peer = vrank | mask;
            if (peer >= size) {
                continue;
            }
            if (ompi_op_is_commute(op)) {
                peer = (peer + root) % size;
            }

            /* Most of the time (all except the first round for commutative
             * operations) we receive in the user-provided buffer (rbuf).
             * The exception lets us avoid copying from sbuf to a temporary
             * location: if the operation is commutative we don't care in
             * which order we apply it, so the first time we can receive the
             * data in pml_buffer and then apply the operation between that
             * buffer and the user-provided data. */
            err = MCA_PML_CALL(recv(rcv_buffer, count, dtype, peer,
                                    MCA_COLL_BASE_TAG_REDUCE, comm,
                                    MPI_STATUS_IGNORE));
            if (MPI_SUCCESS != err) {
                goto cleanup_and_return;
            }
            /* Perform the operation. The target is always the user-provided
             * buffer. We do the operation here only if we did not receive
             * into the user buffer. */
            if (snd_buffer != sbuf) {
                /* the target buffer is the locally allocated one */
                ompi_op_reduce(op, rcv_buffer, pml_buffer, count, dtype);
            } else {
                /* If we're commutative, we don't care about the order of
                 * operations and we can just reduce the operations now.
                 * If we are not commutative, we have to copy the send
                 * buffer into a temp buffer (pml_buffer) and then reduce
                 * what we just received against it. */
                if (!ompi_op_is_commute(op)) {
                    ompi_datatype_copy_content_same_ddt(dtype, count, pml_buffer,
                                                        (char *) sbuf);
                    ompi_op_reduce(op, rbuf, pml_buffer, count, dtype);
                } else {
                    ompi_op_reduce(op, sbuf, pml_buffer, count, dtype);
                }
                /* now we have to send the buffer containing the computed data */
                snd_buffer = pml_buffer;
                /* starting from now we always receive in the user-provided
                 * buffer */
                rcv_buffer = (char *) rbuf;
            }
        }
    }

    /* Get the result to the root if needed. */
    err = MPI_SUCCESS;
    if (0 == vrank) {
        if (root == rank) {
            ompi_datatype_copy_content_same_ddt(dtype, count,
                                                (char *) rbuf, snd_buffer);
        } else {
            err = MCA_PML_CALL(send(snd_buffer, count, dtype, root,
                                    MCA_COLL_BASE_TAG_REDUCE,
                                    MCA_PML_BASE_SEND_STANDARD, comm));
        }
    } else if (rank == root) {
        err = MCA_PML_CALL(recv(rcv_buffer, count, dtype, 0,
                                MCA_COLL_BASE_TAG_REDUCE,
                                comm, MPI_STATUS_IGNORE));
        if (rcv_buffer != rbuf) {
            ompi_op_reduce(op, rcv_buffer, rbuf, count, dtype);
        }
    }

  cleanup_and_return:
    if (NULL != inplace_temp) {
        free(inplace_temp);
    }
    if (NULL != free_buffer) {
        free(free_buffer);
    }
    if (NULL != free_rbuf) {
        free(free_rbuf);
    }

    /* All done */
    return err;
}
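
/* Editor's sketch of the hypercube addressing used above (hypothetical
 * helpers, not part of the component). vrank rotates ranks so the root
 * becomes virtual rank 0; cube_dim computes the smallest d with
 * (1 << d) >= size, matching comm->c_cube_dim. */
static int cube_dim(int size)
{
    int dim;
    for (dim = 0; (1 << dim) < size; dim++)
        ;
    return dim;
}

/* In dimension i (mask = 1 << i), a virtual rank v with (v & mask) set
 * sends to v & ~mask and stops; otherwise it receives from v | mask,
 * provided that peer exists (peer < size). */
static int cube_partner(int vrank, int mask)
{
    return (vrank & mask) ? (vrank & ~mask) : (vrank | mask);
}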
/**
 * This is a generic implementation of the reduce protocol. It uses the tree
 * provided as an argument and executes all operations using a segment of
 * count times a datatype.
 * For the last communication it updates the count in order to limit the
 * total number of datatype elements to the original count (original_count).
 *
 * Note that for non-commutative operations we cannot save the memory copy
 * for the first block: thus we must copy sendbuf to accumbuf on intermediate
 * nodes to keep the optimized loop happy.
 */
int ompi_coll_tuned_reduce_generic(void *sendbuf, void *recvbuf,
                                   int original_count,
                                   ompi_datatype_t *datatype, ompi_op_t *op,
                                   int root, ompi_communicator_t *comm,
                                   mca_coll_base_module_t *module,
                                   ompi_coll_tree_t *tree,
                                   int count_by_segment,
                                   int max_outstanding_reqs)
{
    char *inbuf[2] = {NULL, NULL}, *inbuf_free[2] = {NULL, NULL};
    char *accumbuf = NULL, *accumbuf_free = NULL;
    char *local_op_buffer = NULL, *sendtmpbuf = NULL;
    ptrdiff_t extent, lower_bound, segment_increment;
    size_t typelng;
    ompi_request_t *reqs[2] = {MPI_REQUEST_NULL, MPI_REQUEST_NULL};
    int num_segments, line, ret, segindex, i, rank;
    int recvcount, prevcount, inbi;

    /**
     * Determine number of segments and number of elements
     * sent per operation
     */
    ompi_datatype_get_extent(datatype, &lower_bound, &extent);
    ompi_datatype_type_size(datatype, &typelng);
    num_segments = (original_count + count_by_segment - 1) / count_by_segment;
    segment_increment = (ptrdiff_t) count_by_segment * extent;

    sendtmpbuf = (char *) sendbuf;
    if (sendbuf == MPI_IN_PLACE) {
        sendtmpbuf = (char *) recvbuf;
    }

    OPAL_OUTPUT((ompi_coll_tuned_stream,
                 "coll:tuned:reduce_generic count %d, msg size %lu, segsize %lu, max_requests %d",
                 original_count,
                 (unsigned long) ((ptrdiff_t) num_segments * segment_increment),
                 (unsigned long) segment_increment, max_outstanding_reqs));

    rank = ompi_comm_rank(comm);

    /*
    printf("Tree of rank %d - ", rank);
    printf("Parent : %d - ", tree->tree_prev);
    printf("Child : ");
    for (i = 0; i < tree->tree_nextsize; i++)
        printf("%d ", tree->tree_next[i]);
    printf("\n");
    */

    /* non-leaf nodes - wait for children to send me data & forward up
     * (if needed) */
    if (tree->tree_nextsize > 0) {
        ptrdiff_t true_lower_bound, true_extent, real_segment_size;
        ompi_datatype_get_true_extent(datatype, &true_lower_bound,
                                      &true_extent);

        /* handle a non-existent recv buffer (i.e., it is NULL) and
         * protect the recv buffer on non-root nodes */
        accumbuf = (char *) recvbuf;
        if ((NULL == accumbuf) || (root != rank)) {
            /* Allocate temporary accumulator buffer. */
            accumbuf_free = (char *) malloc(true_extent +
                                            (ptrdiff_t) (original_count - 1) * extent);
            if (NULL == accumbuf_free) {
                line = __LINE__; ret = -1; goto error_hndl;
            }
            accumbuf = accumbuf_free - lower_bound;
        }

        /* If this is a non-commutative operation we must copy
         * sendbuf to the accumbuf, in order to simplify the loops */
        if (!ompi_op_is_commute(op)) {
            ompi_datatype_copy_content_same_ddt(datatype, original_count,
                                                (char *) accumbuf,
                                                (char *) sendtmpbuf);
        }
        /* Allocate two buffers for incoming segments */
        real_segment_size = true_extent +
            (ptrdiff_t) (count_by_segment - 1) * extent;
        inbuf_free[0] = (char *) malloc(real_segment_size);
        if (NULL == inbuf_free[0]) {
            line = __LINE__; ret = -1; goto error_hndl;
        }
        inbuf[0] = inbuf_free[0] - lower_bound;
        /* if there is a chance to overlap communication -
         * allocate a second buffer */
        if ((num_segments > 1) || (tree->tree_nextsize > 1)) {
            inbuf_free[1] = (char *) malloc(real_segment_size);
            if (NULL == inbuf_free[1]) {
                line = __LINE__; ret = -1; goto error_hndl;
            }
            inbuf[1] = inbuf_free[1] - lower_bound;
        }

        /* reset input buffer index and receive count */
        inbi = 0;
        recvcount = 0;
        /* for each segment */
        for (segindex = 0; segindex <= num_segments; segindex++) {
            prevcount = recvcount;
            /* recvcount - number of elements in current segment */
            recvcount = count_by_segment;
            if (segindex == (num_segments - 1))
                recvcount = original_count -
                    (ptrdiff_t) count_by_segment * (ptrdiff_t) segindex;

            /* for each child */
            for (i = 0; i < tree->tree_nextsize; i++) {
                /**
                 * We try to overlap communication:
                 * either with next segment or with the next child
                 */
                /* post irecv for current segindex on current child */
                if (segindex < num_segments) {
                    void *local_recvbuf = inbuf[inbi];
                    if (0 == i) {
                        /* for the first step (1st child per segment) and
                         * commutative operations we might be able to irecv
                         * directly into the accumulate buffer so that we can
                         * reduce(op) this with our sendbuf in one step, as
                         * ompi_op_reduce only has two buffer pointers;
                         * this avoids an extra memory copy.
                         *
                         * BUT if the operation is non-commutative or
                         * we are root and are USING MPI_IN_PLACE this is wrong!
                         */
                        if ((ompi_op_is_commute(op)) &&
                            !((MPI_IN_PLACE == sendbuf) &&
                              (rank == tree->tree_root))) {
                            local_recvbuf = accumbuf +
                                (ptrdiff_t) segindex * segment_increment;
                        }
                    }

                    ret = MCA_PML_CALL(irecv(local_recvbuf, recvcount, datatype,
                                             tree->tree_next[i],
                                             MCA_COLL_BASE_TAG_REDUCE, comm,
                                             &reqs[inbi]));
                    if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
                }
                /* wait for previous req to complete, if any.
                 * if there are no requests, reqs[inbi ^ 1] will be
                 * MPI_REQUEST_NULL. */
                /* wait on data from last child for previous segment */
                ret = ompi_request_wait_all(1, &reqs[inbi ^ 1],
                                            MPI_STATUSES_IGNORE);
                if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
                local_op_buffer = inbuf[inbi ^ 1];
                if (i > 0) {
                    /* our first operation is to combine our own [sendbuf] data
                     * with the data we received from downstream (but only if
                     * the operation is commutative and we are not root and
                     * not using MPI_IN_PLACE)
                     */
                    if (1 == i) {
                        if ((ompi_op_is_commute(op)) &&
                            !((MPI_IN_PLACE == sendbuf) &&
                              (rank == tree->tree_root))) {
                            local_op_buffer = sendtmpbuf +
                                (ptrdiff_t) segindex * segment_increment;
                        }
                    }
                    /* apply operation */
                    ompi_op_reduce(op, local_op_buffer,
                                   accumbuf + (ptrdiff_t) segindex * segment_increment,
                                   recvcount, datatype);
                } else if (segindex > 0) {
                    void *accumulator = accumbuf +
                        (ptrdiff_t) (segindex - 1) * segment_increment;
                    if (tree->tree_nextsize <= 1) {
                        if ((ompi_op_is_commute(op)) &&
                            !((MPI_IN_PLACE == sendbuf) &&
                              (rank == tree->tree_root))) {
                            local_op_buffer = sendtmpbuf +
                                (ptrdiff_t) (segindex - 1) * segment_increment;
                        }
                    }
                    ompi_op_reduce(op, local_op_buffer, accumulator,
                                   prevcount, datatype);

                    /* all reductions on the available data for this step (i)
                     * are complete; pass the result to the next process
                     * unless you are the root.
                     */
                    if (rank != tree->tree_root) {
                        /* send combined/accumulated data to parent */
                        ret = MCA_PML_CALL(send(accumulator, prevcount,
                                                datatype, tree->tree_prev,
                                                MCA_COLL_BASE_TAG_REDUCE,
                                                MCA_PML_BASE_SEND_STANDARD,
                                                comm));
                        if (ret != MPI_SUCCESS) {
                            line = __LINE__; goto error_hndl;
                        }
                    }

                    /* we stop when segindex == num_segments (i.e. we do
                     * num_segments + 1 steps for pipelining) */
                    if (segindex == num_segments) break;
                }

                /* update input buffer index */
                inbi = inbi ^ 1;
            } /* end of for each child */
        } /* end of for each segment */

        /* clean up */
        if (NULL != inbuf_free[0]) free(inbuf_free[0]);
        if (NULL != inbuf_free[1]) free(inbuf_free[1]);
        if (NULL != accumbuf_free) free(accumbuf_free);
    }

    /* leaf nodes
     * Depending on the value of max_outstanding_reqs and the number of
     * segments we have two options:
     * - send all segments using a blocking send to the parent, or
     * - avoid flooding the parent nodes by limiting the number of
     *   outstanding requests to max_outstanding_reqs.
     * TODO/POSSIBLE IMPROVEMENT: If there is a way to determine the eager
     * size for the current communication, synchronization should be used
     * only when the message/segment size is smaller than the eager size.
     */
    else {
        /* If the number of segments is less than the maximum number of
         * outstanding requests, or there is no limit on the maximum number
         * of outstanding requests, we send data to the parent using
         * blocking send */
        if ((0 == max_outstanding_reqs) ||
            (num_segments <= max_outstanding_reqs)) {
            segindex = 0;
            while (original_count > 0) {
                if (original_count < count_by_segment) {
                    count_by_segment = original_count;
                }
                ret = MCA_PML_CALL(send((char *) sendbuf +
                                        (ptrdiff_t) segindex * segment_increment,
                                        count_by_segment, datatype,
                                        tree->tree_prev,
                                        MCA_COLL_BASE_TAG_REDUCE,
                                        MCA_PML_BASE_SEND_STANDARD, comm));
                if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
                segindex++;
                original_count -= count_by_segment;
            }
        }
        /* Otherwise, introduce flow control:
         * - post max_outstanding_reqs non-blocking synchronous sends,
         * - for each remaining segment:
         *   - wait for an ssend to complete, and post the next one,
         * - wait for all outstanding sends to complete.
         */
        else {
            int creq = 0;
            ompi_request_t **sreq = NULL;

            sreq = (ompi_request_t **) calloc(max_outstanding_reqs,
                                              sizeof(ompi_request_t *));
            if (NULL == sreq) { line = __LINE__; ret = -1; goto error_hndl; }

            /* post first group of requests */
            for (segindex = 0; segindex < max_outstanding_reqs; segindex++) {
                ret = MCA_PML_CALL(isend((char *) sendbuf +
                                         (ptrdiff_t) segindex * segment_increment,
                                         count_by_segment, datatype,
                                         tree->tree_prev,
                                         MCA_COLL_BASE_TAG_REDUCE,
                                         MCA_PML_BASE_SEND_SYNCHRONOUS, comm,
                                         &sreq[segindex]));
                if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
                original_count -= count_by_segment;
            }

            creq = 0;
            while (original_count > 0) {
                /* wait on a posted request to complete */
                ret = ompi_request_wait(&sreq[creq], MPI_STATUS_IGNORE);
                if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
                sreq[creq] = MPI_REQUEST_NULL;

                if (original_count < count_by_segment) {
                    count_by_segment = original_count;
                }
                ret = MCA_PML_CALL(isend((char *) sendbuf +
                                         (ptrdiff_t) segindex * segment_increment,
                                         count_by_segment, datatype,
                                         tree->tree_prev,
                                         MCA_COLL_BASE_TAG_REDUCE,
                                         MCA_PML_BASE_SEND_SYNCHRONOUS, comm,
                                         &sreq[creq]));
                if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
                creq = (creq + 1) % max_outstanding_reqs;
                segindex++;
                original_count -= count_by_segment;
            }

            /* Wait on the remaining requests to complete */
            ret = ompi_request_wait_all(max_outstanding_reqs, sreq,
                                        MPI_STATUSES_IGNORE);
            if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }

            /* free requests */
            free(sreq);
        }
    }
    return OMPI_SUCCESS;

 error_hndl:  /* error handler */
    OPAL_OUTPUT((ompi_coll_tuned_stream,
                 "ERROR_HNDL: node %d file %s line %d error %d\n",
                 rank, __FILE__, line, ret));
    if (NULL != inbuf_free[0]) free(inbuf_free[0]);
    if (NULL != inbuf_free[1]) free(inbuf_free[1]);
    if (NULL != accumbuf_free) free(accumbuf_free);
    return ret;
}
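
/* Editor's sketch (hypothetical helper, not part of the component): how a
 * caller might derive count_by_segment for reduce_generic from a
 * byte-oriented segment size, mirroring the num_segments arithmetic used
 * above. */
#include <stddef.h>

static int segcount_from_segsize(size_t segsize, size_t type_size,
                                 int total_count)
{
    int count_by_segment = total_count;  /* one segment by default */
    if (segsize > 0 && segsize < type_size * (size_t) total_count) {
        count_by_segment = (int) (segsize / type_size);
        if (count_by_segment < 1)
            count_by_segment = 1;        /* at least one element per segment */
    }
    /* num_segments then follows as
     * (total_count + count_by_segment - 1) / count_by_segment */
    return count_by_segment;
}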
/*
 * exscan_intra
 *
 * Function:  - basic exscan operation
 * Accepts:   - same arguments as MPI_Exscan()
 * Returns:   - MPI_SUCCESS or error code
 */
int mca_coll_basic_exscan_intra(void *sbuf, void *rbuf, int count,
                                struct ompi_datatype_t *dtype,
                                struct ompi_op_t *op,
                                struct ompi_communicator_t *comm,
                                mca_coll_base_module_t *module)
{
    int size, rank, err;
    ptrdiff_t true_lb, true_extent, lb, extent;
    char *free_buffer = NULL;
    char *reduce_buffer = NULL;
    char *source;
    MPI_Request req = MPI_REQUEST_NULL;

    /* Initialize. */
    rank = ompi_comm_rank(comm);
    size = ompi_comm_size(comm);

    /* If we're rank 0, then we send our sbuf to the next rank */
    if (0 == rank) {
        return MCA_PML_CALL(send(sbuf, count, dtype, rank + 1,
                                 MCA_COLL_BASE_TAG_EXSCAN,
                                 MCA_PML_BASE_SEND_STANDARD, comm));
    }

    /* If we're the last rank, then just receive the result from the
     * prior rank */
    else if ((size - 1) == rank) {
        return MCA_PML_CALL(recv(rbuf, count, dtype, rank - 1,
                                 MCA_COLL_BASE_TAG_EXSCAN, comm,
                                 MPI_STATUS_IGNORE));
    }

    /* Otherwise, get the result from the prior rank, combine it with my
     * data, and send it to the next rank */

    /* Start the receive for the prior rank's answer */
    err = MCA_PML_CALL(irecv(rbuf, count, dtype, rank - 1,
                             MCA_COLL_BASE_TAG_EXSCAN, comm, &req));
    if (MPI_SUCCESS != err) {
        goto error;
    }

    /* Get a temporary buffer to perform the reduction into. Rationale
     * for malloc'ing this size is provided in coll_basic_reduce.c. */
    ompi_ddt_get_extent(dtype, &lb, &extent);
    ompi_ddt_get_true_extent(dtype, &true_lb, &true_extent);

    free_buffer = (char *) malloc(true_extent + (count - 1) * extent);
    if (NULL == free_buffer) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }
    reduce_buffer = free_buffer - lb;

    if (ompi_op_is_commute(op)) {
        /* If we're commutative, we can copy my sbuf into the reduction
         * buffer before the receive completes */
        err = ompi_ddt_copy_content_same_ddt(dtype, count,
                                             reduce_buffer, (char *) sbuf);
        if (MPI_SUCCESS != err) {
            goto error;
        }

        /* Now set up the reduction */
        source = (char *) rbuf;

        /* Finally, wait for the receive to complete (so that we can do
         * the reduction). */
        err = ompi_request_wait(&req, MPI_STATUS_IGNORE);
        if (MPI_SUCCESS != err) {
            goto error;
        }
    } else {
        /* Set up the reduction */
        source = (char *) sbuf;

        /* If we're not commutative, we have to wait for the receive to
         * complete and then copy it into the reduce buffer */
        err = ompi_request_wait(&req, MPI_STATUS_IGNORE);
        if (MPI_SUCCESS != err) {
            goto error;
        }

        err = ompi_ddt_copy_content_same_ddt(dtype, count,
                                             reduce_buffer, (char *) rbuf);
        if (MPI_SUCCESS != err) {
            goto error;
        }
    }

    /* Now reduce the received answer with my source into the answer
     * that we send off to the next rank */
    ompi_op_reduce(op, source, reduce_buffer, count, dtype);

    /* Send my result off to the next rank */
    err = MCA_PML_CALL(send(reduce_buffer, count, dtype, rank + 1,
                            MCA_COLL_BASE_TAG_EXSCAN,
                            MCA_PML_BASE_SEND_STANDARD, comm));

    /* Error */
  error:
    free(free_buffer);
    if (MPI_REQUEST_NULL != req) {
        ompi_request_cancel(req);
        ompi_request_wait(&req, MPI_STATUS_IGNORE);
    }

    /* All done */
    return err;
}
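
/* Usage sketch: MPI_Exscan with MPI_SUM gives each rank the sum of the
 * values contributed by ranks 0..rank-1; per the MPI standard, the receive
 * buffer on rank 0 is undefined after the call. */
#include <mpi.h>

static void exscan_example(MPI_Comm comm)
{
    int rank, prefix = 0;
    MPI_Comm_rank(comm, &rank);
    MPI_Exscan(&rank, &prefix, 1, MPI_INT, MPI_SUM, comm);
    /* rank 0: prefix undefined; rank r > 0: prefix == 0 + 1 + ... + (r-1) */
}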