int ompi_osc_ucx_get(void *origin_addr, int origin_count, struct ompi_datatype_t *origin_dt, int target, ptrdiff_t target_disp, int target_count, struct ompi_datatype_t *target_dt, struct ompi_win_t *win) { ompi_osc_ucx_module_t *module = (ompi_osc_ucx_module_t*) win->w_osc_module; ucp_ep_h ep = OSC_UCX_GET_EP(module->comm, target); uint64_t remote_addr = (module->win_info_array[target]).addr + target_disp * OSC_UCX_GET_DISP(module, target); ucp_rkey_h rkey; ptrdiff_t origin_lb, origin_extent, target_lb, target_extent; bool is_origin_contig = false, is_target_contig = false; ucs_status_t status; int ret = OMPI_SUCCESS; ret = check_sync_state(module, target, false); if (ret != OMPI_SUCCESS) { return ret; } if (module->flavor == MPI_WIN_FLAVOR_DYNAMIC) { status = get_dynamic_win_info(remote_addr, module, ep, target); if (status != UCS_OK) { return OMPI_ERROR; } } rkey = (module->win_info_array[target]).rkey; ompi_datatype_get_true_extent(origin_dt, &origin_lb, &origin_extent); ompi_datatype_get_true_extent(target_dt, &target_lb, &target_extent); is_origin_contig = ompi_datatype_is_contiguous_memory_layout(origin_dt, origin_count); is_target_contig = ompi_datatype_is_contiguous_memory_layout(target_dt, target_count); if (is_origin_contig && is_target_contig) { /* fast path */ size_t origin_len; ompi_datatype_type_size(origin_dt, &origin_len); origin_len *= origin_count; status = ucp_get_nbi(ep, (void *)((intptr_t)origin_addr + origin_lb), origin_len, remote_addr + target_lb, rkey); if (status != UCS_OK && status != UCS_INPROGRESS) { opal_output_verbose(1, ompi_osc_base_framework.framework_output, "%s:%d: ucp_get_nbi failed: %d\n", __FILE__, __LINE__, status); return OMPI_ERROR; } return incr_and_check_ops_num(module, target, ep); } else { return ddt_put_get(module, origin_addr, origin_count, origin_dt, is_origin_contig, origin_lb, target, ep, remote_addr, rkey, target_count, target_dt, is_target_contig, target_lb, true); } }
/* * reduce_inter * * Function: - reduction using the local_comm * Accepts: - same as MPI_Reduce() * Returns: - MPI_SUCCESS or error code */ int mca_coll_inter_reduce_inter(void *sbuf, void *rbuf, int count, struct ompi_datatype_t *dtype, struct ompi_op_t *op, int root, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { int rank, err; ptrdiff_t true_lb, true_extent, lb, extent; char *free_buffer = NULL; char *pml_buffer = NULL; /* Initialize */ rank = ompi_comm_rank(comm); if (MPI_PROC_NULL == root) { /* do nothing */ err = OMPI_SUCCESS; } else if (MPI_ROOT != root) { /* Perform the reduce locally with the first process as root */ ompi_datatype_get_extent(dtype, &lb, &extent); ompi_datatype_get_true_extent(dtype, &true_lb, &true_extent); free_buffer = (char*)malloc(true_extent + (count - 1) * extent); if (NULL == free_buffer) { return OMPI_ERR_OUT_OF_RESOURCE; } pml_buffer = free_buffer - true_lb; err = comm->c_local_comm->c_coll.coll_reduce(sbuf, pml_buffer, count, dtype, op, 0, comm->c_local_comm, comm->c_local_comm->c_coll.coll_reduce_module); if (0 == rank) { /* First process sends the result to the root */ err = MCA_PML_CALL(send(pml_buffer, count, dtype, root, MCA_COLL_BASE_TAG_REDUCE, MCA_PML_BASE_SEND_STANDARD, comm)); if (OMPI_SUCCESS != err) { return err; } } if (NULL != free_buffer) { free(free_buffer); } } else { /* Root receives the reduced message from the first process */ err = MCA_PML_CALL(recv(rbuf, count, dtype, 0, MCA_COLL_BASE_TAG_REDUCE, comm, MPI_STATUS_IGNORE)); if (OMPI_SUCCESS != err) { return err; } } /* All done */ return err; }
/** * Computing the correct buffer length for moving a multiple of a datatype * is not an easy task. Define a function to centralize the complexity in a * single location. */ size_t compute_buffer_length(ompi_datatype_t* pdt, int count) { MPI_Aint extent, lb, true_extent, true_lb; size_t length; ompi_datatype_get_extent(pdt, &lb, &extent); ompi_datatype_get_true_extent(pdt, &true_lb, &true_extent); (void)true_lb; length = true_lb + true_extent + (count - 1) * extent; return length; }
/* * reduce_log_inter * * Function: - reduction using O(N) algorithm * Accepts: - same as MPI_Reduce() * Returns: - MPI_SUCCESS or error code */ int mca_coll_cuda_reduce(const void *sbuf, void *rbuf, int count, struct ompi_datatype_t *dtype, struct ompi_op_t *op, int root, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { mca_coll_cuda_module_t *s = (mca_coll_cuda_module_t*) module; ptrdiff_t true_lb, true_extent, lb, extent; char *rbuf1 = NULL, *sbuf1 = NULL, *rbuf2 = NULL; const char *sbuf2; size_t bufsize; int rc; ompi_datatype_get_extent(dtype, &lb, &extent); ompi_datatype_get_true_extent(dtype, &true_lb, &true_extent); bufsize = true_extent + (ptrdiff_t)(count - 1) * extent; if ((MPI_IN_PLACE != sbuf) && (opal_cuda_check_bufs((char *)sbuf, NULL))) { sbuf1 = (char*)malloc(bufsize); if (NULL == sbuf1) { return OMPI_ERR_OUT_OF_RESOURCE; } opal_cuda_memcpy_sync(sbuf1, sbuf, bufsize); sbuf2 = sbuf; /* save away original buffer */ sbuf = sbuf1 - lb; } if (opal_cuda_check_bufs(rbuf, NULL)) { rbuf1 = (char*)malloc(bufsize); if (NULL == rbuf1) { if (NULL != sbuf1) free(sbuf1); return OMPI_ERR_OUT_OF_RESOURCE; } opal_cuda_memcpy_sync(rbuf1, rbuf, bufsize); rbuf2 = rbuf; /* save away original buffer */ rbuf = rbuf1 - lb; } rc = s->c_coll.coll_reduce((void *) sbuf, rbuf, count, dtype, op, root, comm, s->c_coll.coll_reduce_module); if (NULL != sbuf1) { free(sbuf1); } if (NULL != rbuf1) { rbuf = rbuf2; opal_cuda_memcpy_sync(rbuf, rbuf1, bufsize); free(rbuf1); } return rc; }
void ADIOI_Datatype_iscontig(MPI_Datatype datatype, int *flag) { /* * Open MPI contiguous check return true for datatype with * gaps in the beginning and at the end. We have to provide * a count of 2 in order to get these gaps taken into acount. * In addition, if the data is contiguous but true_lb differes * from zero, ROMIO will ignore the displacement. Thus, lie! */ *flag = ompi_datatype_is_contiguous_memory_layout(datatype, 2); if (*flag) { MPI_Aint true_extent, true_lb; ompi_datatype_get_true_extent(datatype, &true_lb, &true_extent); if (true_lb > 0) *flag = 0; } }
/* * scan * * Function: - basic scan operation * Accepts: - same arguments as MPI_Scan() * Returns: - MPI_SUCCESS or error code */ int mca_coll_basic_scan_intra(void *sbuf, void *rbuf, int count, struct ompi_datatype_t *dtype, struct ompi_op_t *op, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { int size, rank, err; ptrdiff_t true_lb, true_extent, lb, extent; char *free_buffer = NULL; char *pml_buffer = NULL; /* Initialize */ rank = ompi_comm_rank(comm); size = ompi_comm_size(comm); /* If I'm rank 0, just copy into the receive buffer */ if (0 == rank) { if (MPI_IN_PLACE != sbuf) { err = ompi_datatype_copy_content_same_ddt(dtype, count, (char*)rbuf, (char*)sbuf); if (MPI_SUCCESS != err) { return err; } } } /* Otherwise receive previous buffer and reduce. */ else { /* Allocate a temporary buffer. Rationale for this size is * listed in coll_basic_reduce.c. Use this temporary buffer to * receive into, later. */ ompi_datatype_get_extent(dtype, &lb, &extent); ompi_datatype_get_true_extent(dtype, &true_lb, &true_extent); free_buffer = (char*)malloc(true_extent + (count - 1) * extent); if (NULL == free_buffer) { return OMPI_ERR_OUT_OF_RESOURCE; } pml_buffer = free_buffer - true_lb; /* Copy the send buffer into the receive buffer. */ if (MPI_IN_PLACE != sbuf) { err = ompi_datatype_copy_content_same_ddt(dtype, count, (char*)rbuf, (char*)sbuf); if (MPI_SUCCESS != err) { if (NULL != free_buffer) { free(free_buffer); } return err; } } /* Receive the prior answer */ err = MCA_PML_CALL(recv(pml_buffer, count, dtype, rank - 1, MCA_COLL_BASE_TAG_SCAN, comm, MPI_STATUS_IGNORE)); if (MPI_SUCCESS != err) { if (NULL != free_buffer) { free(free_buffer); } return err; } /* Perform the operation */ ompi_op_reduce(op, pml_buffer, rbuf, count, dtype); /* All done */ if (NULL != free_buffer) { free(free_buffer); } } /* Send result to next process. */ if (rank < (size - 1)) { return MCA_PML_CALL(send(rbuf, count, dtype, rank + 1, MCA_COLL_BASE_TAG_SCAN, MCA_PML_BASE_SEND_STANDARD, comm)); } /* All done */ return MPI_SUCCESS; }
/* * reduce_lin_intra * * Function: - reduction using O(N) algorithm * Accepts: - same as MPI_Reduce() * Returns: - MPI_SUCCESS or error code */ int mca_coll_basic_reduce_lin_intra(void *sbuf, void *rbuf, int count, struct ompi_datatype_t *dtype, struct ompi_op_t *op, int root, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { int i, rank, err, size; ptrdiff_t true_lb, true_extent, lb, extent; char *free_buffer = NULL; char *pml_buffer = NULL; char *inplace_temp = NULL; char *inbuf; /* Initialize */ rank = ompi_comm_rank(comm); size = ompi_comm_size(comm); /* If not root, send data to the root. */ if (rank != root) { err = MCA_PML_CALL(send(sbuf, count, dtype, root, MCA_COLL_BASE_TAG_REDUCE, MCA_PML_BASE_SEND_STANDARD, comm)); return err; } /* Root receives and reduces messages. Allocate buffer to receive * messages. This comment applies to all collectives in this basic * module where we allocate a temporary buffer. For the next few * lines of code, it's tremendously complicated how we decided that * this was the Right Thing to do. Sit back and enjoy. And prepare * to have your mind warped. :-) * * Recall some definitions (I always get these backwards, so I'm * going to put them here): * * extent: the length from the lower bound to the upper bound -- may * be considerably larger than the buffer required to hold the data * (or smaller! But it's easiest to think about when it's larger). * * true extent: the exact number of bytes required to hold the data * in the layout pattern in the datatype. * * For example, consider the following buffer (just talking about * LB, extent, and true extent -- extrapolate for UB; i.e., assume * the UB equals exactly where the data ends): * * A B C * -------------------------------------------------------- * | | | * -------------------------------------------------------- * * There are multiple cases: * * 1. A is what we give to MPI_Send (and friends), and A is where * the data starts, and C is where the data ends. In this case: * * - extent: C-A * - true extent: C-A * - LB: 0 * * A C * -------------------------------------------------------- * | | * -------------------------------------------------------- * <=======================extent=========================> * <======================true extent=====================> * * 2. A is what we give to MPI_Send (and friends), B is where the * data starts, and C is where the data ends. In this case: * * - extent: C-A * - true extent: C-B * - LB: positive * * A B C * -------------------------------------------------------- * | | User buffer | * -------------------------------------------------------- * <=======================extent=========================> * <===============true extent=============> * * 3. B is what we give to MPI_Send (and friends), A is where the * data starts, and C is where the data ends. In this case: * * - extent: C-A * - true extent: C-A * - LB: negative * * A B C * -------------------------------------------------------- * | | User buffer | * -------------------------------------------------------- * <=======================extent=========================> * <======================true extent=====================> * * 4. MPI_BOTTOM is what we give to MPI_Send (and friends), B is * where the data starts, and C is where the data ends. In this * case: * * - extent: C-MPI_BOTTOM * - true extent: C-B * - LB: [potentially very large] positive * * MPI_BOTTOM B C * -------------------------------------------------------- * | | User buffer | * -------------------------------------------------------- * <=======================extent=========================> * <===============true extent=============> * * So in all cases, for a temporary buffer, all we need to malloc() * is a buffer of size true_extent. We therefore need to know two * pointer values: what value to give to MPI_Send (and friends) and * what value to give to free(), because they might not be the same. * * Clearly, what we give to free() is exactly what was returned from * malloc(). That part is easy. :-) * * What we give to MPI_Send (and friends) is a bit more complicated. * Let's take the 4 cases from above: * * 1. If A is what we give to MPI_Send and A is where the data * starts, then clearly we give to MPI_Send what we got back from * malloc(). * * 2. If B is what we get back from malloc, but we give A to * MPI_Send, then the buffer range [A,B) represents "dead space" * -- no data will be put there. So it's safe to give B-LB to * MPI_Send. More specifically, the LB is positive, so B-LB is * actually A. * * 3. If A is what we get back from malloc, and B is what we give to * MPI_Send, then the LB is negative, so A-LB will actually equal * B. * * 4. Although this seems like the weirdest case, it's actually * quite similar to case #2 -- the pointer we give to MPI_Send is * smaller than the pointer we got back from malloc(). * * Hence, in all cases, we give (return_from_malloc - LB) to MPI_Send. * * This works fine and dandy if we only have (count==1), which we * rarely do. ;-) So we really need to allocate (true_extent + * ((count - 1) * extent)) to get enough space for the rest. This may * be more than is necessary, but it's ok. * * Simple, no? :-) * */ ompi_datatype_get_extent(dtype, &lb, &extent); ompi_datatype_get_true_extent(dtype, &true_lb, &true_extent); if (MPI_IN_PLACE == sbuf) { sbuf = rbuf; inplace_temp = (char*)malloc(true_extent + (count - 1) * extent); if (NULL == inplace_temp) { return OMPI_ERR_OUT_OF_RESOURCE; } rbuf = inplace_temp - lb; } if (size > 1) { free_buffer = (char*)malloc(true_extent + (count - 1) * extent); if (NULL == free_buffer) { if (NULL != inplace_temp) { free(inplace_temp); } return OMPI_ERR_OUT_OF_RESOURCE; } pml_buffer = free_buffer - lb; } /* Initialize the receive buffer. */ if (rank == (size - 1)) { err = ompi_datatype_copy_content_same_ddt(dtype, count, (char*)rbuf, (char*)sbuf); } else { err = MCA_PML_CALL(recv(rbuf, count, dtype, size - 1, MCA_COLL_BASE_TAG_REDUCE, comm, MPI_STATUS_IGNORE)); } if (MPI_SUCCESS != err) { if (NULL != free_buffer) { free(free_buffer); } return err; } /* Loop receiving and calling reduction function (C or Fortran). */ for (i = size - 2; i >= 0; --i) { if (rank == i) { inbuf = (char*)sbuf; } else { err = MCA_PML_CALL(recv(pml_buffer, count, dtype, i, MCA_COLL_BASE_TAG_REDUCE, comm, MPI_STATUS_IGNORE)); if (MPI_SUCCESS != err) { if (NULL != free_buffer) { free(free_buffer); } return err; } inbuf = pml_buffer; } /* Perform the reduction */ ompi_op_reduce(op, inbuf, rbuf, count, dtype); } if (NULL != inplace_temp) { err = ompi_datatype_copy_content_same_ddt(dtype, count, (char*)sbuf, inplace_temp); free(inplace_temp); } if (NULL != free_buffer) { free(free_buffer); } /* All done */ return MPI_SUCCESS; }
/* * reduce_lin_inter * * Function: - reduction using O(N) algorithm * Accepts: - same as MPI_Reduce() * Returns: - MPI_SUCCESS or error code */ int mca_coll_basic_reduce_lin_inter(void *sbuf, void *rbuf, int count, struct ompi_datatype_t *dtype, struct ompi_op_t *op, int root, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { int i, err, size; ptrdiff_t true_lb, true_extent, lb, extent; char *free_buffer = NULL; char *pml_buffer = NULL; /* Initialize */ size = ompi_comm_remote_size(comm); if (MPI_PROC_NULL == root) { /* do nothing */ err = OMPI_SUCCESS; } else if (MPI_ROOT != root) { /* If not root, send data to the root. */ err = MCA_PML_CALL(send(sbuf, count, dtype, root, MCA_COLL_BASE_TAG_REDUCE, MCA_PML_BASE_SEND_STANDARD, comm)); } else { /* Root receives and reduces messages */ ompi_datatype_get_extent(dtype, &lb, &extent); ompi_datatype_get_true_extent(dtype, &true_lb, &true_extent); free_buffer = (char*)malloc(true_extent + (count - 1) * extent); if (NULL == free_buffer) { return OMPI_ERR_OUT_OF_RESOURCE; } pml_buffer = free_buffer - lb; /* Initialize the receive buffer. */ err = MCA_PML_CALL(recv(rbuf, count, dtype, 0, MCA_COLL_BASE_TAG_REDUCE, comm, MPI_STATUS_IGNORE)); if (MPI_SUCCESS != err) { if (NULL != free_buffer) { free(free_buffer); } return err; } /* Loop receiving and calling reduction function (C or Fortran). */ for (i = 1; i < size; i++) { err = MCA_PML_CALL(recv(pml_buffer, count, dtype, i, MCA_COLL_BASE_TAG_REDUCE, comm, MPI_STATUS_IGNORE)); if (MPI_SUCCESS != err) { if (NULL != free_buffer) { free(free_buffer); } return err; } /* Perform the reduction */ ompi_op_reduce(op, pml_buffer, rbuf, count, dtype); } if (NULL != free_buffer) { free(free_buffer); } } /* All done */ return err; }
int ompi_coll_tuned_alltoall_intra_bruck(void *sbuf, int scount, struct ompi_datatype_t *sdtype, void* rbuf, int rcount, struct ompi_datatype_t *rdtype, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { int i, k, line = -1, rank, size, err = 0, weallocated = 0; int sendto, recvfrom, distance, *displs = NULL, *blen = NULL; char *tmpbuf = NULL, *tmpbuf_free = NULL; ptrdiff_t rlb, slb, tlb, sext, rext, tsext; struct ompi_datatype_t *new_ddt; #ifdef blahblah mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module; mca_coll_tuned_comm_t *data = tuned_module->tuned_data; #endif if (MPI_IN_PLACE == sbuf) { return mca_coll_tuned_alltoall_intra_basic_inplace (rbuf, rcount, rdtype, comm, module); } size = ompi_comm_size(comm); rank = ompi_comm_rank(comm); OPAL_OUTPUT((ompi_coll_tuned_stream, "coll:tuned:alltoall_intra_bruck rank %d", rank)); err = ompi_datatype_get_extent (sdtype, &slb, &sext); if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; } err = ompi_datatype_get_true_extent(sdtype, &tlb, &tsext); if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; } err = ompi_datatype_get_extent (rdtype, &rlb, &rext); if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; } #ifdef blahblah /* try and SAVE memory by using the data segment hung off the communicator if possible */ if (data->mcct_num_reqs >= size) { /* we have enought preallocated for displments and lengths */ displs = (int*) data->mcct_reqs; blen = (int *) (displs + size); weallocated = 0; } else { /* allocate the buffers ourself */ #endif displs = (int *) malloc(size * sizeof(int)); if (displs == NULL) { line = __LINE__; err = -1; goto err_hndl; } blen = (int *) malloc(size * sizeof(int)); if (blen == NULL) { line = __LINE__; err = -1; goto err_hndl; } weallocated = 1; #ifdef blahblah } #endif /* tmp buffer allocation for message data */ tmpbuf_free = (char *) malloc(tsext + ((ptrdiff_t)scount * (ptrdiff_t)size - 1) * sext); if (tmpbuf_free == NULL) { line = __LINE__; err = -1; goto err_hndl; } tmpbuf = tmpbuf_free - slb; /* Step 1 - local rotation - shift up by rank */ err = ompi_datatype_copy_content_same_ddt (sdtype, (int32_t) ((ptrdiff_t)(size - rank) * (ptrdiff_t)scount), tmpbuf, ((char*) sbuf) + (ptrdiff_t)rank * (ptrdiff_t)scount * sext); if (err<0) { line = __LINE__; err = -1; goto err_hndl; } if (rank != 0) { err = ompi_datatype_copy_content_same_ddt (sdtype, (ptrdiff_t)rank * (ptrdiff_t)scount, tmpbuf + (ptrdiff_t)(size - rank) * (ptrdiff_t)scount* sext, (char*) sbuf); if (err<0) { line = __LINE__; err = -1; goto err_hndl; } } /* perform communication step */ for (distance = 1; distance < size; distance<<=1) { sendto = (rank + distance) % size; recvfrom = (rank - distance + size) % size; k = 0; /* create indexed datatype */ for (i = 1; i < size; i++) { if (( i & distance) == distance) { displs[k] = (ptrdiff_t)i * (ptrdiff_t)scount; blen[k] = scount; k++; } } /* Set indexes and displacements */ err = ompi_datatype_create_indexed(k, blen, displs, sdtype, &new_ddt); if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; } /* Commit the new datatype */ err = ompi_datatype_commit(&new_ddt); if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; } /* Sendreceive */ err = ompi_coll_tuned_sendrecv ( tmpbuf, 1, new_ddt, sendto, MCA_COLL_BASE_TAG_ALLTOALL, rbuf, 1, new_ddt, recvfrom, MCA_COLL_BASE_TAG_ALLTOALL, comm, MPI_STATUS_IGNORE, rank ); if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; } /* Copy back new data from recvbuf to tmpbuf */ err = ompi_datatype_copy_content_same_ddt(new_ddt, 1,tmpbuf, (char *) rbuf); if (err < 0) { line = __LINE__; err = -1; goto err_hndl; } /* free ddt */ err = ompi_datatype_destroy(&new_ddt); if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; } } /* end of for (distance = 1... */ /* Step 3 - local rotation - */ for (i = 0; i < size; i++) { err = ompi_datatype_copy_content_same_ddt (rdtype, (int32_t) rcount, ((char*)rbuf) + ((ptrdiff_t)((rank - i + size) % size) * (ptrdiff_t)rcount * rext), tmpbuf + (ptrdiff_t)i * (ptrdiff_t)rcount * rext); if (err < 0) { line = __LINE__; err = -1; goto err_hndl; } } /* Step 4 - clean up */ if (tmpbuf != NULL) free(tmpbuf_free); if (weallocated) { if (displs != NULL) free(displs); if (blen != NULL) free(blen); } return OMPI_SUCCESS; err_hndl: OPAL_OUTPUT((ompi_coll_tuned_stream, "%s:%4d\tError occurred %d, rank %2d", __FILE__, line, err, rank)); if (tmpbuf != NULL) free(tmpbuf_free); if (weallocated) { if (displs != NULL) free(displs); if (blen != NULL) free(blen); } return err; }
/* * reduce_log_intra * * Function: - reduction using O(log N) algorithm * Accepts: - same as MPI_Reduce() * Returns: - MPI_SUCCESS or error code * * * Performing reduction on each dimension of the hypercube. * An example for 8 procs (dimensions = 3): * * Stage 1, reduce on X dimension, 1 -> 0, 3 -> 2, 5 -> 4, 7 -> 6 * * 6----<---7 proc_0: 0+1 * /| /| proc_1: 1 * / | / | proc_2: 2+3 * / | / | proc_3: 3 * 4----<---5 | proc_4: 4+5 * | 2--< |---3 proc_5: 5 * | / | / proc_6: 6+7 * | / | / proc_7: 7 * |/ |/ * 0----<---1 * * Stage 2, reduce on Y dimension, 2 -> 0, 6 -> 4 * * 6--------7 proc_0: 0+1+2+3 * /| /| proc_1: 1 * v | / | proc_2: 2+3 * / | / | proc_3: 3 * 4--------5 | proc_4: 4+5+6+7 * | 2--- |---3 proc_5: 5 * | / | / proc_6: 6+7 * | v | / proc_7: 7 * |/ |/ * 0--------1 * * Stage 3, reduce on Z dimension, 4 -> 0 * * 6--------7 proc_0: 0+1+2+3+4+5+6+7 * /| /| proc_1: 1 * / | / | proc_2: 2+3 * / | / | proc_3: 3 * 4--------5 | proc_4: 4+5+6+7 * | 2--- |---3 proc_5: 5 * v / | / proc_6: 6+7 * | / | / proc_7: 7 * |/ |/ * 0--------1 * * */ int mca_coll_basic_reduce_log_intra(void *sbuf, void *rbuf, int count, struct ompi_datatype_t *dtype, struct ompi_op_t *op, int root, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { int i, size, rank, vrank; int err, peer, dim, mask; ptrdiff_t true_lb, true_extent, lb, extent; char *free_buffer = NULL; char *free_rbuf = NULL; char *pml_buffer = NULL; char *snd_buffer = NULL; char *rcv_buffer = (char*)rbuf; char *inplace_temp = NULL; /* JMS Codearound for now -- if the operations is not communative, * just call the linear algorithm. Need to talk to Edgar / George * about fixing this algorithm here to work with non-communative * operations. */ if (!ompi_op_is_commute(op)) { return mca_coll_basic_reduce_lin_intra(sbuf, rbuf, count, dtype, op, root, comm, module); } /* Some variables */ size = ompi_comm_size(comm); rank = ompi_comm_rank(comm); vrank = ompi_op_is_commute(op) ? (rank - root + size) % size : rank; dim = comm->c_cube_dim; /* Allocate the incoming and resulting message buffers. See lengthy * rationale above. */ ompi_datatype_get_extent(dtype, &lb, &extent); ompi_datatype_get_true_extent(dtype, &true_lb, &true_extent); free_buffer = (char*)malloc(true_extent + (count - 1) * extent); if (NULL == free_buffer) { return OMPI_ERR_OUT_OF_RESOURCE; } pml_buffer = free_buffer - lb; /* read the comment about commutative operations (few lines down * the page) */ if (ompi_op_is_commute(op)) { rcv_buffer = pml_buffer; } /* Allocate sendbuf in case the MPI_IN_PLACE option has been used. See lengthy * rationale above. */ if (MPI_IN_PLACE == sbuf) { inplace_temp = (char*)malloc(true_extent + (count - 1) * extent); if (NULL == inplace_temp) { err = OMPI_ERR_OUT_OF_RESOURCE; goto cleanup_and_return; } sbuf = inplace_temp - lb; err = ompi_datatype_copy_content_same_ddt(dtype, count, (char*)sbuf, (char*)rbuf); } snd_buffer = (char*)sbuf; if (rank != root && 0 == (vrank & 1)) { /* root is the only one required to provide a valid rbuf. * Assume rbuf is invalid for all other ranks, so fix it up * here to be valid on all non-leaf ranks */ free_rbuf = (char*)malloc(true_extent + (count - 1) * extent); if (NULL == free_rbuf) { err = OMPI_ERR_OUT_OF_RESOURCE; goto cleanup_and_return; } rbuf = free_rbuf - lb; } /* Loop over cube dimensions. High processes send to low ones in the * dimension. */ for (i = 0, mask = 1; i < dim; ++i, mask <<= 1) { /* A high-proc sends to low-proc and stops. */ if (vrank & mask) { peer = vrank & ~mask; if (ompi_op_is_commute(op)) { peer = (peer + root) % size; } err = MCA_PML_CALL(send(snd_buffer, count, dtype, peer, MCA_COLL_BASE_TAG_REDUCE, MCA_PML_BASE_SEND_STANDARD, comm)); if (MPI_SUCCESS != err) { goto cleanup_and_return; } snd_buffer = (char*)rbuf; break; } /* A low-proc receives, reduces, and moves to a higher * dimension. */ else { peer = vrank | mask; if (peer >= size) { continue; } if (ompi_op_is_commute(op)) { peer = (peer + root) % size; } /* Most of the time (all except the first one for commutative * operations) we receive in the user provided buffer * (rbuf). But the exception is here to allow us to dont have * to copy from the sbuf to a temporary location. If the * operation is commutative we dont care in which order we * apply the operation, so for the first time we can receive * the data in the pml_buffer and then apply to operation * between this buffer and the user provided data. */ err = MCA_PML_CALL(recv(rcv_buffer, count, dtype, peer, MCA_COLL_BASE_TAG_REDUCE, comm, MPI_STATUS_IGNORE)); if (MPI_SUCCESS != err) { goto cleanup_and_return; } /* Perform the operation. The target is always the user * provided buffer We do the operation only if we receive it * not in the user buffer */ if (snd_buffer != sbuf) { /* the target buffer is the locally allocated one */ ompi_op_reduce(op, rcv_buffer, pml_buffer, count, dtype); } else { /* If we're commutative, we don't care about the order of * operations and we can just reduce the operations now. * If we are not commutative, we have to copy the send * buffer into a temp buffer (pml_buffer) and then reduce * what we just received against it. */ if (!ompi_op_is_commute(op)) { ompi_datatype_copy_content_same_ddt(dtype, count, pml_buffer, (char*)sbuf); ompi_op_reduce(op, rbuf, pml_buffer, count, dtype); } else { ompi_op_reduce(op, sbuf, pml_buffer, count, dtype); } /* now we have to send the buffer containing the computed data */ snd_buffer = pml_buffer; /* starting from now we always receive in the user * provided buffer */ rcv_buffer = (char*)rbuf; } } } /* Get the result to the root if needed. */ err = MPI_SUCCESS; if (0 == vrank) { if (root == rank) { ompi_datatype_copy_content_same_ddt(dtype, count, (char*)rbuf, snd_buffer); } else { err = MCA_PML_CALL(send(snd_buffer, count, dtype, root, MCA_COLL_BASE_TAG_REDUCE, MCA_PML_BASE_SEND_STANDARD, comm)); } } else if (rank == root) { err = MCA_PML_CALL(recv(rcv_buffer, count, dtype, 0, MCA_COLL_BASE_TAG_REDUCE, comm, MPI_STATUS_IGNORE)); if (rcv_buffer != rbuf) { ompi_op_reduce(op, rcv_buffer, rbuf, count, dtype); } } cleanup_and_return: if (NULL != inplace_temp) { free(inplace_temp); } if (NULL != free_buffer) { free(free_buffer); } if (NULL != free_rbuf) { free(free_rbuf); } /* All done */ return err; }
/** * This is a generic implementation of the reduce protocol. It used the tree * provided as an argument and execute all operations using a segment of * count times a datatype. * For the last communication it will update the count in order to limit * the number of datatype to the original count (original_count) * * Note that for non-commutative operations we cannot save memory copy * for the first block: thus we must copy sendbuf to accumbuf on intermediate * to keep the optimized loop happy. */ int ompi_coll_tuned_reduce_generic( void* sendbuf, void* recvbuf, int original_count, ompi_datatype_t* datatype, ompi_op_t* op, int root, ompi_communicator_t* comm, mca_coll_base_module_t *module, ompi_coll_tree_t* tree, int count_by_segment, int max_outstanding_reqs ) { char *inbuf[2] = {NULL, NULL}, *inbuf_free[2] = {NULL, NULL}; char *accumbuf = NULL, *accumbuf_free = NULL; char *local_op_buffer = NULL, *sendtmpbuf = NULL; ptrdiff_t extent, lower_bound, segment_increment; size_t typelng; ompi_request_t* reqs[2] = {MPI_REQUEST_NULL, MPI_REQUEST_NULL}; int num_segments, line, ret, segindex, i, rank; int recvcount, prevcount, inbi; /** * Determine number of segments and number of elements * sent per operation */ ompi_datatype_get_extent( datatype, &lower_bound, &extent ); ompi_datatype_type_size( datatype, &typelng ); num_segments = (original_count + count_by_segment - 1) / count_by_segment; segment_increment = (ptrdiff_t)count_by_segment * extent; sendtmpbuf = (char*) sendbuf; if( sendbuf == MPI_IN_PLACE ) { sendtmpbuf = (char *)recvbuf; } OPAL_OUTPUT((ompi_coll_tuned_stream, "coll:tuned:reduce_generic count %d, msg size %ld, segsize %ld, max_requests %d", original_count, (unsigned long)((ptrdiff_t)num_segments * (ptrdiff_t)segment_increment), (unsigned long)segment_increment, max_outstanding_reqs)); rank = ompi_comm_rank(comm); /*printf("Tree of rank %d - ", rank); printf("Parent : %d - ", tree->tree_prev); printf("Child : "); for (i = 0; i < tree->tree_nextsize; i++) printf("%d ", tree->tree_next[i]); printf("\n");*/ /* non-leaf nodes - wait for children to send me data & forward up (if needed) */ if( tree->tree_nextsize > 0 ) { ptrdiff_t true_lower_bound, true_extent, real_segment_size; ompi_datatype_get_true_extent( datatype, &true_lower_bound, &true_extent ); /* handle non existant recv buffer (i.e. its NULL) and protect the recv buffer on non-root nodes */ accumbuf = (char*)recvbuf; if( (NULL == accumbuf) || (root != rank) ) { /* Allocate temporary accumulator buffer. */ accumbuf_free = (char*)malloc(true_extent + (ptrdiff_t)(original_count - 1) * extent); if (accumbuf_free == NULL) { line = __LINE__; ret = -1; goto error_hndl; } accumbuf = accumbuf_free - lower_bound; } /* If this is a non-commutative operation we must copy sendbuf to the accumbuf, in order to simplfy the loops */ if (!ompi_op_is_commute(op)) { ompi_datatype_copy_content_same_ddt(datatype, original_count, (char*)accumbuf, (char*)sendtmpbuf); } /* Allocate two buffers for incoming segments */ real_segment_size = true_extent + (ptrdiff_t)(count_by_segment - 1) * extent; inbuf_free[0] = (char*) malloc(real_segment_size); if( inbuf_free[0] == NULL ) { line = __LINE__; ret = -1; goto error_hndl; } inbuf[0] = inbuf_free[0] - lower_bound; /* if there is chance to overlap communication - allocate second buffer */ if( (num_segments > 1) || (tree->tree_nextsize > 1) ) { inbuf_free[1] = (char*) malloc(real_segment_size); if( inbuf_free[1] == NULL ) { line = __LINE__; ret = -1; goto error_hndl; } inbuf[1] = inbuf_free[1] - lower_bound; } /* reset input buffer index and receive count */ inbi = 0; recvcount = 0; /* for each segment */ for( segindex = 0; segindex <= num_segments; segindex++ ) { prevcount = recvcount; /* recvcount - number of elements in current segment */ recvcount = count_by_segment; if( segindex == (num_segments-1) ) recvcount = original_count - (ptrdiff_t)count_by_segment * (ptrdiff_t)segindex; /* for each child */ for( i = 0; i < tree->tree_nextsize; i++ ) { /** * We try to overlap communication: * either with next segment or with the next child */ /* post irecv for current segindex on current child */ if( segindex < num_segments ) { void* local_recvbuf = inbuf[inbi]; if( 0 == i ) { /* for the first step (1st child per segment) and * commutative operations we might be able to irecv * directly into the accumulate buffer so that we can * reduce(op) this with our sendbuf in one step as * ompi_op_reduce only has two buffer pointers, * this avoids an extra memory copy. * * BUT if the operation is non-commutative or * we are root and are USING MPI_IN_PLACE this is wrong! */ if( (ompi_op_is_commute(op)) && !((MPI_IN_PLACE == sendbuf) && (rank == tree->tree_root)) ) { local_recvbuf = accumbuf + (ptrdiff_t)segindex * (ptrdiff_t)segment_increment; } } ret = MCA_PML_CALL(irecv(local_recvbuf, recvcount, datatype, tree->tree_next[i], MCA_COLL_BASE_TAG_REDUCE, comm, &reqs[inbi])); if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl;} } /* wait for previous req to complete, if any. if there are no requests reqs[inbi ^1] will be MPI_REQUEST_NULL. */ /* wait on data from last child for previous segment */ ret = ompi_request_wait_all( 1, &reqs[inbi ^ 1], MPI_STATUSES_IGNORE ); if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } local_op_buffer = inbuf[inbi ^ 1]; if( i > 0 ) { /* our first operation is to combine our own [sendbuf] data * with the data we recvd from down stream (but only * the operation is commutative and if we are not root and * not using MPI_IN_PLACE) */ if( 1 == i ) { if( (ompi_op_is_commute(op)) && !((MPI_IN_PLACE == sendbuf) && (rank == tree->tree_root)) ) { local_op_buffer = sendtmpbuf + (ptrdiff_t)segindex * (ptrdiff_t)segment_increment; } } /* apply operation */ ompi_op_reduce(op, local_op_buffer, accumbuf + (ptrdiff_t)segindex * (ptrdiff_t)segment_increment, recvcount, datatype ); } else if ( segindex > 0 ) { void* accumulator = accumbuf + (ptrdiff_t)(segindex-1) * (ptrdiff_t)segment_increment; if( tree->tree_nextsize <= 1 ) { if( (ompi_op_is_commute(op)) && !((MPI_IN_PLACE == sendbuf) && (rank == tree->tree_root)) ) { local_op_buffer = sendtmpbuf + (ptrdiff_t)(segindex-1) * (ptrdiff_t)segment_increment; } } ompi_op_reduce(op, local_op_buffer, accumulator, prevcount, datatype ); /* all reduced on available data this step (i) complete, * pass to the next process unless you are the root. */ if (rank != tree->tree_root) { /* send combined/accumulated data to parent */ ret = MCA_PML_CALL( send( accumulator, prevcount, datatype, tree->tree_prev, MCA_COLL_BASE_TAG_REDUCE, MCA_PML_BASE_SEND_STANDARD, comm) ); if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } } /* we stop when segindex = number of segments (i.e. we do num_segment+1 steps for pipelining */ if (segindex == num_segments) break; } /* update input buffer index */ inbi = inbi ^ 1; } /* end of for each child */ } /* end of for each segment */ /* clean up */ if( inbuf_free[0] != NULL) free(inbuf_free[0]); if( inbuf_free[1] != NULL) free(inbuf_free[1]); if( accumbuf_free != NULL ) free(accumbuf_free); } /* leaf nodes Depending on the value of max_outstanding_reqs and the number of segments we have two options: - send all segments using blocking send to the parent, or - avoid overflooding the parent nodes by limiting the number of outstanding requests to max_oustanding_reqs. TODO/POSSIBLE IMPROVEMENT: If there is a way to determine the eager size for the current communication, synchronization should be used only when the message/segment size is smaller than the eager size. */ else { /* If the number of segments is less than a maximum number of oustanding requests or there is no limit on the maximum number of outstanding requests, we send data to the parent using blocking send */ if ((0 == max_outstanding_reqs) || (num_segments <= max_outstanding_reqs)) { segindex = 0; while ( original_count > 0) { if (original_count < count_by_segment) { count_by_segment = original_count; } ret = MCA_PML_CALL( send((char*)sendbuf + (ptrdiff_t)segindex * (ptrdiff_t)segment_increment, count_by_segment, datatype, tree->tree_prev, MCA_COLL_BASE_TAG_REDUCE, MCA_PML_BASE_SEND_STANDARD, comm) ); if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } segindex++; original_count -= count_by_segment; } } /* Otherwise, introduce flow control: - post max_outstanding_reqs non-blocking synchronous send, - for remaining segments - wait for a ssend to complete, and post the next one. - wait for all outstanding sends to complete. */ else { int creq = 0; ompi_request_t **sreq = NULL; sreq = (ompi_request_t**) calloc( max_outstanding_reqs, sizeof(ompi_request_t*) ); if (NULL == sreq) { line = __LINE__; ret = -1; goto error_hndl; } /* post first group of requests */ for (segindex = 0; segindex < max_outstanding_reqs; segindex++) { ret = MCA_PML_CALL( isend((char*)sendbuf + (ptrdiff_t)segindex * (ptrdiff_t)segment_increment, count_by_segment, datatype, tree->tree_prev, MCA_COLL_BASE_TAG_REDUCE, MCA_PML_BASE_SEND_SYNCHRONOUS, comm, &sreq[segindex]) ); if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } original_count -= count_by_segment; } creq = 0; while ( original_count > 0 ) { /* wait on a posted request to complete */ ret = ompi_request_wait(&sreq[creq], MPI_STATUS_IGNORE); if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } sreq[creq] = MPI_REQUEST_NULL; if( original_count < count_by_segment ) { count_by_segment = original_count; } ret = MCA_PML_CALL( isend((char*)sendbuf + (ptrdiff_t)segindex * (ptrdiff_t)segment_increment, count_by_segment, datatype, tree->tree_prev, MCA_COLL_BASE_TAG_REDUCE, MCA_PML_BASE_SEND_SYNCHRONOUS, comm, &sreq[creq]) ); if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } creq = (creq + 1) % max_outstanding_reqs; segindex++; original_count -= count_by_segment; } /* Wait on the remaining request to complete */ ret = ompi_request_wait_all( max_outstanding_reqs, sreq, MPI_STATUSES_IGNORE ); if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } /* free requests */ free(sreq); } } return OMPI_SUCCESS; error_hndl: /* error handler */ OPAL_OUTPUT (( ompi_coll_tuned_stream, "ERROR_HNDL: node %d file %s line %d error %d\n", rank, __FILE__, line, ret )); if( inbuf_free[0] != NULL ) free(inbuf_free[0]); if( inbuf_free[1] != NULL ) free(inbuf_free[1]); if( accumbuf_free != NULL ) free(accumbuf); return ret; }
/* * reduce_lin_intra * * Function: - reduction using O(N) algorithm * Accepts: - same as MPI_Reduce() * Returns: - MPI_SUCCESS or error code */ int ompi_coll_tuned_reduce_intra_basic_linear(void *sbuf, void *rbuf, int count, struct ompi_datatype_t *dtype, struct ompi_op_t *op, int root, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { int i, rank, err, size; ptrdiff_t true_lb, true_extent, lb, extent; char *free_buffer = NULL, *pml_buffer = NULL; char *inplace_temp = NULL, *inbuf; /* Initialize */ rank = ompi_comm_rank(comm); size = ompi_comm_size(comm); OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:reduce_intra_basic_linear rank %d", rank)); /* If not root, send data to the root. */ if (rank != root) { err = MCA_PML_CALL(send(sbuf, count, dtype, root, MCA_COLL_BASE_TAG_REDUCE, MCA_PML_BASE_SEND_STANDARD, comm)); return err; } /* see discussion in ompi_coll_basic_reduce_lin_intra about extent and true extent */ /* for reducing buffer allocation lengths.... */ ompi_datatype_get_extent(dtype, &lb, &extent); ompi_datatype_get_true_extent(dtype, &true_lb, &true_extent); if (MPI_IN_PLACE == sbuf) { sbuf = rbuf; inplace_temp = (char*)malloc(true_extent + (ptrdiff_t)(count - 1) * extent); if (NULL == inplace_temp) { return OMPI_ERR_OUT_OF_RESOURCE; } rbuf = inplace_temp - lb; } if (size > 1) { free_buffer = (char*)malloc(true_extent + (ptrdiff_t)(count - 1) * extent); if (NULL == free_buffer) { return OMPI_ERR_OUT_OF_RESOURCE; } pml_buffer = free_buffer - lb; } /* Initialize the receive buffer. */ if (rank == (size - 1)) { err = ompi_datatype_copy_content_same_ddt(dtype, count, (char*)rbuf, (char*)sbuf); } else { err = MCA_PML_CALL(recv(rbuf, count, dtype, size - 1, MCA_COLL_BASE_TAG_REDUCE, comm, MPI_STATUS_IGNORE)); } if (MPI_SUCCESS != err) { if (NULL != free_buffer) { free(free_buffer); } return err; } /* Loop receiving and calling reduction function (C or Fortran). */ for (i = size - 2; i >= 0; --i) { if (rank == i) { inbuf = (char*)sbuf; } else { err = MCA_PML_CALL(recv(pml_buffer, count, dtype, i, MCA_COLL_BASE_TAG_REDUCE, comm, MPI_STATUS_IGNORE)); if (MPI_SUCCESS != err) { if (NULL != free_buffer) { free(free_buffer); } return err; } inbuf = pml_buffer; } /* Perform the reduction */ ompi_op_reduce(op, inbuf, rbuf, count, dtype); } if (NULL != inplace_temp) { err = ompi_datatype_copy_content_same_ddt(dtype, count, (char*)sbuf, inplace_temp); free(inplace_temp); } if (NULL != free_buffer) { free(free_buffer); } /* All done */ return MPI_SUCCESS; }
/* * reduce_scatter * * Function: - reduce then scatter * Accepts: - same as MPI_Reduce_scatter() * Returns: - MPI_SUCCESS or error code * * Algorithm: * Cummutative, reasonable sized messages * recursive halving algorithm * Others: * reduce and scatterv (needs to be cleaned * up at some point) * * NOTE: that the recursive halving algorithm should be faster than * the reduce/scatter for all message sizes. However, the memory * usage for the recusive halving is msg_size + 2 * comm_size greater * for the recursive halving, so I've limited where the recursive * halving is used to be nice to the app memory wise. There are much * better algorithms for large messages with cummutative operations, * so this should be investigated further. * * NOTE: We default to a simple reduce/scatterv if one of the rcounts * is zero. This is because the existing algorithms do not currently * support a count of zero in the array. */ int mca_coll_basic_reduce_scatter_intra(void *sbuf, void *rbuf, int *rcounts, struct ompi_datatype_t *dtype, struct ompi_op_t *op, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { int i, rank, size, count, err = OMPI_SUCCESS; ptrdiff_t true_lb, true_extent, lb, extent, buf_size; int *disps = NULL; char *recv_buf = NULL, *recv_buf_free = NULL; char *result_buf = NULL, *result_buf_free = NULL; bool zerocounts = false; /* Initialize */ rank = ompi_comm_rank(comm); size = ompi_comm_size(comm); /* Find displacements and the like */ disps = (int*) malloc(sizeof(int) * size); if (NULL == disps) return OMPI_ERR_OUT_OF_RESOURCE; disps[0] = 0; for (i = 0; i < (size - 1); ++i) { disps[i + 1] = disps[i] + rcounts[i]; if (0 == rcounts[i]) { zerocounts = true; } } count = disps[size - 1] + rcounts[size - 1]; if (0 == rcounts[size - 1]) { zerocounts = true; } /* short cut the trivial case */ if (0 == count) { free(disps); return OMPI_SUCCESS; } /* get datatype information */ ompi_datatype_get_extent(dtype, &lb, &extent); ompi_datatype_get_true_extent(dtype, &true_lb, &true_extent); buf_size = true_extent + (count - 1) * extent; /* Handle MPI_IN_PLACE */ if (MPI_IN_PLACE == sbuf) { sbuf = rbuf; } if ((op->o_flags & OMPI_OP_FLAGS_COMMUTE) && (buf_size < COMMUTATIVE_LONG_MSG) && (!zerocounts)) { int tmp_size, remain = 0, tmp_rank; /* temporary receive buffer. See coll_basic_reduce.c for details on sizing */ recv_buf_free = (char*) malloc(buf_size); recv_buf = recv_buf_free - lb; if (NULL == recv_buf_free) { err = OMPI_ERR_OUT_OF_RESOURCE; goto cleanup; } /* allocate temporary buffer for results */ result_buf_free = (char*) malloc(buf_size); result_buf = result_buf_free - lb; /* copy local buffer into the temporary results */ err = ompi_datatype_sndrcv(sbuf, count, dtype, result_buf, count, dtype); if (OMPI_SUCCESS != err) goto cleanup; /* figure out power of two mapping: grow until larger than comm size, then go back one, to get the largest power of two less than comm size */ tmp_size = opal_next_poweroftwo(size); tmp_size >>= 1; remain = size - tmp_size; /* If comm size is not a power of two, have the first "remain" procs with an even rank send to rank + 1, leaving a power of two procs to do the rest of the algorithm */ if (rank < 2 * remain) { if ((rank & 1) == 0) { err = MCA_PML_CALL(send(result_buf, count, dtype, rank + 1, MCA_COLL_BASE_TAG_REDUCE_SCATTER, MCA_PML_BASE_SEND_STANDARD, comm)); if (OMPI_SUCCESS != err) goto cleanup; /* we don't participate from here on out */ tmp_rank = -1; } else { err = MCA_PML_CALL(recv(recv_buf, count, dtype, rank - 1, MCA_COLL_BASE_TAG_REDUCE_SCATTER, comm, MPI_STATUS_IGNORE)); if (OMPI_SUCCESS != err) goto cleanup; /* integrate their results into our temp results */ ompi_op_reduce(op, recv_buf, result_buf, count, dtype); /* adjust rank to be the bottom "remain" ranks */ tmp_rank = rank / 2; } } else { /* just need to adjust rank to show that the bottom "even remain" ranks dropped out */ tmp_rank = rank - remain; } /* For ranks not kicked out by the above code, perform the recursive halving */ if (tmp_rank >= 0) { int *tmp_disps = NULL, *tmp_rcounts = NULL; int mask, send_index, recv_index, last_index; /* recalculate disps and rcounts to account for the special "remainder" processes that are no longer doing anything */ tmp_rcounts = (int*) malloc(tmp_size * sizeof(int)); if (NULL == tmp_rcounts) { err = OMPI_ERR_OUT_OF_RESOURCE; goto cleanup; } tmp_disps = (int*) malloc(tmp_size * sizeof(int)); if (NULL == tmp_disps) { free(tmp_rcounts); err = OMPI_ERR_OUT_OF_RESOURCE; goto cleanup; } for (i = 0 ; i < tmp_size ; ++i) { if (i < remain) { /* need to include old neighbor as well */ tmp_rcounts[i] = rcounts[i * 2 + 1] + rcounts[i * 2]; } else { tmp_rcounts[i] = rcounts[i + remain]; } } tmp_disps[0] = 0; for (i = 0; i < tmp_size - 1; ++i) { tmp_disps[i + 1] = tmp_disps[i] + tmp_rcounts[i]; } /* do the recursive halving communication. Don't use the dimension information on the communicator because I think the information is invalidated by our "shrinking" of the communicator */ mask = tmp_size >> 1; send_index = recv_index = 0; last_index = tmp_size; while (mask > 0) { int tmp_peer, peer, send_count, recv_count; struct ompi_request_t *request; tmp_peer = tmp_rank ^ mask; peer = (tmp_peer < remain) ? tmp_peer * 2 + 1 : tmp_peer + remain; /* figure out if we're sending, receiving, or both */ send_count = recv_count = 0; if (tmp_rank < tmp_peer) { send_index = recv_index + mask; for (i = send_index ; i < last_index ; ++i) { send_count += tmp_rcounts[i]; } for (i = recv_index ; i < send_index ; ++i) { recv_count += tmp_rcounts[i]; } } else { recv_index = send_index + mask; for (i = send_index ; i < recv_index ; ++i) { send_count += tmp_rcounts[i]; } for (i = recv_index ; i < last_index ; ++i) { recv_count += tmp_rcounts[i]; } } /* actual data transfer. Send from result_buf, receive into recv_buf */ if (send_count > 0 && recv_count != 0) { err = MCA_PML_CALL(irecv(recv_buf + tmp_disps[recv_index] * extent, recv_count, dtype, peer, MCA_COLL_BASE_TAG_REDUCE_SCATTER, comm, &request)); if (OMPI_SUCCESS != err) { free(tmp_rcounts); free(tmp_disps); goto cleanup; } } if (recv_count > 0 && send_count != 0) { err = MCA_PML_CALL(send(result_buf + tmp_disps[send_index] * extent, send_count, dtype, peer, MCA_COLL_BASE_TAG_REDUCE_SCATTER, MCA_PML_BASE_SEND_STANDARD, comm)); if (OMPI_SUCCESS != err) { free(tmp_rcounts); free(tmp_disps); goto cleanup; } } if (send_count > 0 && recv_count != 0) { err = ompi_request_wait(&request, MPI_STATUS_IGNORE); if (OMPI_SUCCESS != err) { free(tmp_rcounts); free(tmp_disps); goto cleanup; } } /* if we received something on this step, push it into the results buffer */ if (recv_count > 0) { ompi_op_reduce(op, recv_buf + tmp_disps[recv_index] * extent, result_buf + tmp_disps[recv_index] * extent, recv_count, dtype); } /* update for next iteration */ send_index = recv_index; last_index = recv_index + mask; mask >>= 1; } /* copy local results from results buffer into real receive buffer */ if (0 != rcounts[rank]) { err = ompi_datatype_sndrcv(result_buf + disps[rank] * extent, rcounts[rank], dtype, rbuf, rcounts[rank], dtype); if (OMPI_SUCCESS != err) { free(tmp_rcounts); free(tmp_disps); goto cleanup; } } free(tmp_rcounts); free(tmp_disps); }
/* MPI_IN_PLACE all to all algorithm. TODO: implement a better one. */ int mca_coll_base_alltoall_intra_basic_inplace(const void *rbuf, int rcount, struct ompi_datatype_t *rdtype, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { mca_coll_base_module_t *base_module = (mca_coll_base_module_t*) module; int i, j, size, rank, err = MPI_SUCCESS, line; MPI_Request *preq; char *tmp_buffer; size_t max_size; ptrdiff_t ext, true_lb, true_ext; /* Initialize. */ size = ompi_comm_size(comm); rank = ompi_comm_rank(comm); /* If only one process, we're done. */ if (1 == size) { return MPI_SUCCESS; } /* Find the largest receive amount */ ompi_datatype_type_extent (rdtype, &ext); ompi_datatype_get_true_extent ( rdtype, &true_lb, &true_ext); max_size = true_ext + ext * (rcount-1); /* Allocate a temporary buffer */ tmp_buffer = calloc (max_size, 1); if (NULL == tmp_buffer) { return OMPI_ERR_OUT_OF_RESOURCE; } max_size = ext * rcount; /* in-place alltoall slow algorithm (but works) */ for (i = 0 ; i < size ; ++i) { for (j = i+1 ; j < size ; ++j) { /* Initiate all send/recv to/from others. */ preq = coll_base_comm_get_reqs(base_module->base_data, size * 2); if (i == rank) { /* Copy the data into the temporary buffer */ err = ompi_datatype_copy_content_same_ddt (rdtype, rcount, tmp_buffer, (char *) rbuf + j * max_size); if (MPI_SUCCESS != err) { line = __LINE__; goto error_hndl; } /* Exchange data with the peer */ err = MCA_PML_CALL(irecv ((char *) rbuf + max_size * j, rcount, rdtype, j, MCA_COLL_BASE_TAG_ALLTOALL, comm, preq++)); if (MPI_SUCCESS != err) { line = __LINE__; goto error_hndl; } err = MCA_PML_CALL(isend ((char *) tmp_buffer, rcount, rdtype, j, MCA_COLL_BASE_TAG_ALLTOALL, MCA_PML_BASE_SEND_STANDARD, comm, preq++)); if (MPI_SUCCESS != err) { line = __LINE__; goto error_hndl; } } else if (j == rank) { /* Copy the data into the temporary buffer */ err = ompi_datatype_copy_content_same_ddt (rdtype, rcount, tmp_buffer, (char *) rbuf + i * max_size); if (MPI_SUCCESS != err) { line = __LINE__; goto error_hndl; } /* Exchange data with the peer */ err = MCA_PML_CALL(irecv ((char *) rbuf + max_size * i, rcount, rdtype, i, MCA_COLL_BASE_TAG_ALLTOALL, comm, preq++)); if (MPI_SUCCESS != err) { line = __LINE__; goto error_hndl; } err = MCA_PML_CALL(isend ((char *) tmp_buffer, rcount, rdtype, i, MCA_COLL_BASE_TAG_ALLTOALL, MCA_PML_BASE_SEND_STANDARD, comm, preq++)); if (MPI_SUCCESS != err) { line = __LINE__; goto error_hndl; } } else { continue; } /* Wait for the requests to complete */ err = ompi_request_wait_all (2, base_module->base_data->mcct_reqs, MPI_STATUSES_IGNORE); if (MPI_SUCCESS != err) { line = __LINE__; goto error_hndl; } } } error_hndl: /* Free the temporary buffer */ free (tmp_buffer); if( MPI_SUCCESS != err ) { OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "%s:%4d\tError occurred %d, rank %2d", __FILE__, line, err, rank)); ompi_coll_base_free_reqs(base_module->base_data->mcct_reqs, 2); } /* All done */ return err; }
int ompi_osc_ucx_get_accumulate(const void *origin_addr, int origin_count, struct ompi_datatype_t *origin_dt, void *result_addr, int result_count, struct ompi_datatype_t *result_dt, int target, ptrdiff_t target_disp, int target_count, struct ompi_datatype_t *target_dt, struct ompi_op_t *op, struct ompi_win_t *win) { ompi_osc_ucx_module_t *module = (ompi_osc_ucx_module_t*) win->w_osc_module; ucp_ep_h ep = OSC_UCX_GET_EP(module->comm, target); int ret = OMPI_SUCCESS; ret = check_sync_state(module, target, false); if (ret != OMPI_SUCCESS) { return ret; } ret = start_atomicity(module, ep, target); if (ret != OMPI_SUCCESS) { return ret; } ret = ompi_osc_ucx_get(result_addr, result_count, result_dt, target, target_disp, target_count, target_dt, win); if (ret != OMPI_SUCCESS) { return ret; } if (op != &ompi_mpi_op_no_op.op) { if (op == &ompi_mpi_op_replace.op) { ret = ompi_osc_ucx_put(origin_addr, origin_count, origin_dt, target, target_disp, target_count, target_dt, win); if (ret != OMPI_SUCCESS) { return ret; } } else { void *temp_addr = NULL; uint32_t temp_count; ompi_datatype_t *temp_dt; ptrdiff_t temp_lb, temp_extent; ucs_status_t status; bool is_origin_contig = ompi_datatype_is_contiguous_memory_layout(origin_dt, origin_count); if (ompi_datatype_is_predefined(target_dt)) { temp_dt = target_dt; temp_count = target_count; } else { ret = ompi_osc_base_get_primitive_type_info(target_dt, &temp_dt, &temp_count); if (ret != OMPI_SUCCESS) { return ret; } } ompi_datatype_get_true_extent(temp_dt, &temp_lb, &temp_extent); temp_addr = malloc(temp_extent * temp_count); if (temp_addr == NULL) { return OMPI_ERR_TEMP_OUT_OF_RESOURCE; } ret = ompi_osc_ucx_get(temp_addr, (int)temp_count, temp_dt, target, target_disp, target_count, target_dt, win); if (ret != OMPI_SUCCESS) { return ret; } status = ucp_ep_flush(ep); if (status != UCS_OK) { opal_output_verbose(1, ompi_osc_base_framework.framework_output, "%s:%d: ucp_ep_flush failed: %d\n", __FILE__, __LINE__, status); return OMPI_ERROR; } if (ompi_datatype_is_predefined(origin_dt) || is_origin_contig) { ompi_op_reduce(op, (void *)origin_addr, temp_addr, (int)temp_count, temp_dt); } else { ucx_iovec_t *origin_ucx_iov = NULL; uint32_t origin_ucx_iov_count = 0; uint32_t origin_ucx_iov_idx = 0; ret = create_iov_list(origin_addr, origin_count, origin_dt, &origin_ucx_iov, &origin_ucx_iov_count); if (ret != OMPI_SUCCESS) { return ret; } if ((op != &ompi_mpi_op_maxloc.op && op != &ompi_mpi_op_minloc.op) || ompi_datatype_is_contiguous_memory_layout(temp_dt, temp_count)) { size_t temp_size; ompi_datatype_type_size(temp_dt, &temp_size); while (origin_ucx_iov_idx < origin_ucx_iov_count) { int curr_count = origin_ucx_iov[origin_ucx_iov_idx].len / temp_size; ompi_op_reduce(op, origin_ucx_iov[origin_ucx_iov_idx].addr, temp_addr, curr_count, temp_dt); temp_addr = (void *)((char *)temp_addr + curr_count * temp_size); origin_ucx_iov_idx++; } } else { int i; void *curr_origin_addr = origin_ucx_iov[origin_ucx_iov_idx].addr; for (i = 0; i < (int)temp_count; i++) { ompi_op_reduce(op, curr_origin_addr, (void *)((char *)temp_addr + i * temp_extent), 1, temp_dt); curr_origin_addr = (void *)((char *)curr_origin_addr + temp_extent); origin_ucx_iov_idx++; if (curr_origin_addr >= (void *)((char *)origin_ucx_iov[origin_ucx_iov_idx].addr + origin_ucx_iov[origin_ucx_iov_idx].len)) { origin_ucx_iov_idx++; curr_origin_addr = origin_ucx_iov[origin_ucx_iov_idx].addr; } } } free(origin_ucx_iov); } ret = ompi_osc_ucx_put(temp_addr, (int)temp_count, temp_dt, target, target_disp, target_count, target_dt, win); if (ret != OMPI_SUCCESS) { return ret; } status = ucp_ep_flush(ep); if (status != UCS_OK) { opal_output_verbose(1, ompi_osc_base_framework.framework_output, "%s:%d: ucp_ep_flush failed: %d\n", __FILE__, __LINE__, status); return OMPI_ERROR; } free(temp_addr); } } ret = end_atomicity(module, ep, target); return ret; }
/* Todo: gather_intra_generic, gather_intra_binary, gather_intra_chain, * gather_intra_pipeline, segmentation? */ int ompi_coll_base_gather_intra_binomial(const void *sbuf, int scount, struct ompi_datatype_t *sdtype, void *rbuf, int rcount, struct ompi_datatype_t *rdtype, int root, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { int line = -1, i, rank, vrank, size, total_recv = 0, err; char *ptmp = NULL, *tempbuf = NULL; ompi_coll_tree_t* bmtree; MPI_Status status; MPI_Aint sextent, slb, strue_lb, strue_extent; MPI_Aint rextent, rlb, rtrue_lb, rtrue_extent; mca_coll_base_module_t *base_module = (mca_coll_base_module_t*) module; mca_coll_base_comm_t *data = base_module->base_data; size = ompi_comm_size(comm); rank = ompi_comm_rank(comm); OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "ompi_coll_base_gather_intra_binomial rank %d", rank)); /* create the binomial tree */ COLL_BASE_UPDATE_IN_ORDER_BMTREE( comm, base_module, root ); bmtree = data->cached_in_order_bmtree; ompi_datatype_get_extent(sdtype, &slb, &sextent); ompi_datatype_get_true_extent(sdtype, &strue_lb, &strue_extent); vrank = (rank - root + size) % size; if (rank == root) { ompi_datatype_get_extent(rdtype, &rlb, &rextent); ompi_datatype_get_true_extent(rdtype, &rtrue_lb, &rtrue_extent); if (0 == root){ /* root on 0, just use the recv buffer */ ptmp = (char *) rbuf; if (sbuf != MPI_IN_PLACE) { err = ompi_datatype_sndrcv((void *)sbuf, scount, sdtype, ptmp, rcount, rdtype); if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } } } else { /* root is not on 0, allocate temp buffer for recv, * rotate data at the end */ tempbuf = (char *) malloc(rtrue_extent + ((ptrdiff_t)rcount * (ptrdiff_t)size - 1) * rextent); if (NULL == tempbuf) { err= OMPI_ERR_OUT_OF_RESOURCE; line = __LINE__; goto err_hndl; } ptmp = tempbuf - rtrue_lb; if (sbuf != MPI_IN_PLACE) { /* copy from sbuf to temp buffer */ err = ompi_datatype_sndrcv((void *)sbuf, scount, sdtype, ptmp, rcount, rdtype); if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } } else { /* copy from rbuf to temp buffer */ err = ompi_datatype_copy_content_same_ddt(rdtype, rcount, ptmp, (char *)rbuf + (ptrdiff_t)rank * rextent * (ptrdiff_t)rcount); if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } } } total_recv = rcount; } else if (!(vrank % 2)) { /* other non-leaf nodes, allocate temp buffer for data received from * children, the most we need is half of the total data elements due * to the property of binimoal tree */ tempbuf = (char *) malloc(strue_extent + ((ptrdiff_t)scount * (ptrdiff_t)size - 1) * sextent); if (NULL == tempbuf) { err= OMPI_ERR_OUT_OF_RESOURCE; line = __LINE__; goto err_hndl; } ptmp = tempbuf - strue_lb; /* local copy to tempbuf */ err = ompi_datatype_sndrcv((void *)sbuf, scount, sdtype, ptmp, scount, sdtype); if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } /* use sdtype,scount as rdtype,rdcount since they are ignored on * non-root procs */ rdtype = sdtype; rcount = scount; rextent = sextent; total_recv = rcount; } else { /* leaf nodes, no temp buffer needed, use sdtype,scount as * rdtype,rdcount since they are ignored on non-root procs */ ptmp = (char *) sbuf; total_recv = scount; } if (!(vrank % 2)) { /* all non-leaf nodes recv from children */ for (i = 0; i < bmtree->tree_nextsize; i++) { int mycount = 0, vkid; /* figure out how much data I have to send to this child */ vkid = (bmtree->tree_next[i] - root + size) % size; mycount = vkid - vrank; if (mycount > (size - vkid)) mycount = size - vkid; mycount *= rcount; OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "ompi_coll_base_gather_intra_binomial rank %d recv %d mycount = %d", rank, bmtree->tree_next[i], mycount)); err = MCA_PML_CALL(recv(ptmp + total_recv*rextent, (ptrdiff_t)rcount * size - total_recv, rdtype, bmtree->tree_next[i], MCA_COLL_BASE_TAG_GATHER, comm, &status)); if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } total_recv += mycount; } } if (rank != root) { /* all nodes except root send to parents */ OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "ompi_coll_base_gather_intra_binomial rank %d send %d count %d\n", rank, bmtree->tree_prev, total_recv)); err = MCA_PML_CALL(send(ptmp, total_recv, sdtype, bmtree->tree_prev, MCA_COLL_BASE_TAG_GATHER, MCA_PML_BASE_SEND_STANDARD, comm)); if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } } if (rank == root) { if (root != 0) { /* rotate received data on root if root != 0 */ err = ompi_datatype_copy_content_same_ddt(rdtype, (ptrdiff_t)rcount * (ptrdiff_t)(size - root), (char *)rbuf + rextent * (ptrdiff_t)root * (ptrdiff_t)rcount, ptmp); if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } err = ompi_datatype_copy_content_same_ddt(rdtype, (ptrdiff_t)rcount * (ptrdiff_t)root, (char *) rbuf, ptmp + rextent * (ptrdiff_t)rcount * (ptrdiff_t)(size-root)); if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } free(tempbuf); } } else if (!(vrank % 2)) { /* other non-leaf nodes */ free(tempbuf); } return MPI_SUCCESS; err_hndl: if (NULL != tempbuf) free(tempbuf); OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "%s:%4d\tError occurred %d, rank %2d", __FILE__, line, err, rank)); return err; }
/* * reduce_intra * * Function: - reduction using two level hierarchy algorithm * Accepts: - same as MPI_Reduce() * Returns: - MPI_SUCCESS or error code */ int mca_coll_hierarch_allreduce_intra(void *sbuf, void *rbuf, int count, struct ompi_datatype_t *dtype, struct ompi_op_t *op, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { struct ompi_communicator_t *llcomm=NULL; struct ompi_communicator_t *lcomm=NULL; mca_coll_hierarch_module_t *hierarch_module = (mca_coll_hierarch_module_t *) module; int rank; int lroot, llroot; ptrdiff_t extent, true_extent, lb, true_lb; char *tmpbuf=NULL, *tbuf=NULL; int ret=OMPI_SUCCESS; int root=0; rank = ompi_comm_rank ( comm ); lcomm = hierarch_module->hier_lcomm; if ( mca_coll_hierarch_verbose_param ) { printf("%s:%d: executing hierarchical allreduce with cnt=%d \n", comm->c_name, rank, count ); } llcomm = mca_coll_hierarch_get_llcomm ( root, hierarch_module, &llroot, &lroot); if ( MPI_COMM_NULL != lcomm ) { ompi_datatype_get_extent(dtype, &lb, &extent); ompi_datatype_get_true_extent(dtype, &true_lb, &true_extent); tbuf = (char*)malloc(true_extent + (count - 1) * extent); if (NULL == tbuf) { return OMPI_ERR_OUT_OF_RESOURCE; } tmpbuf = tbuf - lb; if ( MPI_IN_PLACE != sbuf ) { ret = lcomm->c_coll.coll_reduce (sbuf, tmpbuf, count, dtype, op, lroot, lcomm, lcomm->c_coll.coll_reduce_module); } else { ret = lcomm->c_coll.coll_reduce (rbuf, tmpbuf, count, dtype, op, lroot, lcomm, lcomm->c_coll.coll_reduce_module); } if ( OMPI_SUCCESS != ret ) { goto exit; } } if ( MPI_UNDEFINED != llroot ) { if ( MPI_COMM_NULL != lcomm ) { ret = llcomm->c_coll.coll_allreduce (tmpbuf, rbuf, count, dtype, op, llcomm, llcomm->c_coll.coll_allreduce_module); } else { ret = llcomm->c_coll.coll_allreduce (sbuf, rbuf, count, dtype, op, llcomm, llcomm->c_coll.coll_allreduce_module); } } if ( MPI_COMM_NULL != lcomm ) { ret = lcomm->c_coll.coll_bcast(rbuf, count, dtype, lroot, lcomm, lcomm->c_coll.coll_bcast_module ); } exit: if ( NULL != tmpbuf ) { free ( tmpbuf ); } return ret; }
int ompi_coll_base_alltoall_intra_bruck(const void *sbuf, int scount, struct ompi_datatype_t *sdtype, void* rbuf, int rcount, struct ompi_datatype_t *rdtype, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { int i, k, line = -1, rank, size, err = 0; int sendto, recvfrom, distance, *displs = NULL, *blen = NULL; char *tmpbuf = NULL, *tmpbuf_free = NULL; ptrdiff_t rlb, slb, tlb, sext, rext, tsext; struct ompi_datatype_t *new_ddt; if (MPI_IN_PLACE == sbuf) { return mca_coll_base_alltoall_intra_basic_inplace (rbuf, rcount, rdtype, comm, module); } size = ompi_comm_size(comm); rank = ompi_comm_rank(comm); OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "coll:base:alltoall_intra_bruck rank %d", rank)); err = ompi_datatype_get_extent (sdtype, &slb, &sext); if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; } err = ompi_datatype_get_true_extent(sdtype, &tlb, &tsext); if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; } err = ompi_datatype_get_extent (rdtype, &rlb, &rext); if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; } displs = (int *) malloc(size * sizeof(int)); if (displs == NULL) { line = __LINE__; err = -1; goto err_hndl; } blen = (int *) malloc(size * sizeof(int)); if (blen == NULL) { line = __LINE__; err = -1; goto err_hndl; } /* tmp buffer allocation for message data */ tmpbuf_free = (char *) malloc(tsext + ((ptrdiff_t)scount * (ptrdiff_t)size - 1) * sext); if (tmpbuf_free == NULL) { line = __LINE__; err = -1; goto err_hndl; } tmpbuf = tmpbuf_free - slb; /* Step 1 - local rotation - shift up by rank */ err = ompi_datatype_copy_content_same_ddt (sdtype, (int32_t) ((ptrdiff_t)(size - rank) * (ptrdiff_t)scount), tmpbuf, ((char*) sbuf) + (ptrdiff_t)rank * (ptrdiff_t)scount * sext); if (err<0) { line = __LINE__; err = -1; goto err_hndl; } if (rank != 0) { err = ompi_datatype_copy_content_same_ddt (sdtype, (ptrdiff_t)rank * (ptrdiff_t)scount, tmpbuf + (ptrdiff_t)(size - rank) * (ptrdiff_t)scount* sext, (char*) sbuf); if (err<0) { line = __LINE__; err = -1; goto err_hndl; } } /* perform communication step */ for (distance = 1; distance < size; distance<<=1) { sendto = (rank + distance) % size; recvfrom = (rank - distance + size) % size; k = 0; /* create indexed datatype */ for (i = 1; i < size; i++) { if (( i & distance) == distance) { displs[k] = (ptrdiff_t)i * (ptrdiff_t)scount; blen[k] = scount; k++; } } /* Set indexes and displacements */ err = ompi_datatype_create_indexed(k, blen, displs, sdtype, &new_ddt); if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; } /* Commit the new datatype */ err = ompi_datatype_commit(&new_ddt); if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; } /* Sendreceive */ err = ompi_coll_base_sendrecv ( tmpbuf, 1, new_ddt, sendto, MCA_COLL_BASE_TAG_ALLTOALL, rbuf, 1, new_ddt, recvfrom, MCA_COLL_BASE_TAG_ALLTOALL, comm, MPI_STATUS_IGNORE, rank ); if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; } /* Copy back new data from recvbuf to tmpbuf */ err = ompi_datatype_copy_content_same_ddt(new_ddt, 1,tmpbuf, (char *) rbuf); if (err < 0) { line = __LINE__; err = -1; goto err_hndl; } /* free ddt */ err = ompi_datatype_destroy(&new_ddt); if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; } } /* end of for (distance = 1... */ /* Step 3 - local rotation - */ for (i = 0; i < size; i++) { err = ompi_datatype_copy_content_same_ddt (rdtype, (int32_t) rcount, ((char*)rbuf) + ((ptrdiff_t)((rank - i + size) % size) * (ptrdiff_t)rcount * rext), tmpbuf + (ptrdiff_t)i * (ptrdiff_t)rcount * rext); if (err < 0) { line = __LINE__; err = -1; goto err_hndl; } } /* Step 4 - clean up */ if (tmpbuf != NULL) free(tmpbuf_free); if (displs != NULL) free(displs); if (blen != NULL) free(blen); return OMPI_SUCCESS; err_hndl: OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "%s:%4d\tError occurred %d, rank %2d", __FILE__, line, err, rank)); if (tmpbuf != NULL) free(tmpbuf_free); if (displs != NULL) free(displs); if (blen != NULL) free(blen); return err; }
static int ompi_coll_portals4_scatter_intra_linear_top(const void *sbuf, int scount, struct ompi_datatype_t *sdtype, void *rbuf, int rcount, struct ompi_datatype_t *rdtype, int root, struct ompi_communicator_t *comm, ompi_coll_portals4_request_t *request, mca_coll_base_module_t *module) { mca_coll_portals4_module_t *portals4_module = (mca_coll_portals4_module_t*) module; int ret, line; ptl_ct_event_t ct; ptl_ct_event_t sync_incr_event; int8_t i_am_root; int32_t expected_rtrs = 0; int32_t expected_puts = 0; int32_t expected_acks = 0; int32_t expected_ops = 0; int32_t expected_chained_rtrs = 0; int32_t expected_chained_acks = 0; OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "coll:portals4:scatter_intra_linear_top enter rank %d", request->u.scatter.my_rank)); request->type = OMPI_COLL_PORTALS4_TYPE_SCATTER; request->u.scatter.scatter_buf = NULL; request->u.scatter.scatter_mdh = PTL_INVALID_HANDLE; request->u.scatter.scatter_cth = PTL_INVALID_HANDLE; request->u.scatter.scatter_meh = PTL_INVALID_HANDLE; request->u.scatter.sync_mdh = PTL_INVALID_HANDLE; request->u.scatter.sync_cth = PTL_INVALID_HANDLE; request->u.scatter.sync_meh = PTL_INVALID_HANDLE; request->u.scatter.my_rank = ompi_comm_rank(comm); request->u.scatter.size = ompi_comm_size(comm); request->u.scatter.root_rank = root; request->u.scatter.sbuf = sbuf; request->u.scatter.rbuf = rbuf; request->u.scatter.pack_src_buf = sbuf; request->u.scatter.pack_src_count = scount; request->u.scatter.pack_src_dtype = sdtype; ompi_datatype_get_extent(request->u.scatter.pack_src_dtype, &request->u.scatter.pack_src_lb, &request->u.scatter.pack_src_extent); ompi_datatype_get_true_extent(request->u.scatter.pack_src_dtype, &request->u.scatter.pack_src_true_lb, &request->u.scatter.pack_src_true_extent); if ((root == request->u.scatter.my_rank) && (rbuf == MPI_IN_PLACE)) { request->u.scatter.unpack_dst_buf = NULL; request->u.scatter.unpack_dst_count = 0; request->u.scatter.unpack_dst_dtype = MPI_DATATYPE_NULL; } else { request->u.scatter.unpack_dst_buf = rbuf; request->u.scatter.unpack_dst_count = rcount; request->u.scatter.unpack_dst_dtype = rdtype; request->u.scatter.unpack_dst_offset = 0; ompi_datatype_get_extent(request->u.scatter.unpack_dst_dtype, &request->u.scatter.unpack_dst_lb, &request->u.scatter.unpack_dst_extent); ompi_datatype_get_true_extent(request->u.scatter.unpack_dst_dtype, &request->u.scatter.unpack_dst_true_lb, &request->u.scatter.unpack_dst_true_extent); } opal_output_verbose(30, ompi_coll_base_framework.framework_output, "%s:%d:rank(%d): request->u.scatter.unpack_dst_offset(%lu)", __FILE__, __LINE__, request->u.scatter.my_rank, request->u.scatter.unpack_dst_offset); /**********************************/ /* Setup Common Parameters */ /**********************************/ i_am_root = (request->u.scatter.my_rank == request->u.scatter.root_rank); request->u.scatter.coll_count = opal_atomic_add_size_t(&portals4_module->coll_count, 1); ret = setup_scatter_buffers_linear(comm, request, portals4_module); if (MPI_SUCCESS != ret) { line = __LINE__; goto err_hdlr; } ret = setup_scatter_handles(comm, request, portals4_module); if (MPI_SUCCESS != ret) { line = __LINE__; goto err_hdlr; } ret = setup_sync_handles(comm, request, portals4_module); if (MPI_SUCCESS != ret) { line = __LINE__; goto err_hdlr; } /**********************************/ /* do the scatter */ /**********************************/ if (i_am_root) { /* operations on the sync counter */ expected_rtrs = request->u.scatter.size - 1; /* expect RTRs from non-root ranks */ expected_acks = request->u.scatter.size - 1; /* expect Recv-ACKs from non-root ranks */ /* operations on the scatter counter */ expected_puts = 0; expected_chained_rtrs = 1; expected_chained_acks = 1; /* Chain the RTR and Recv-ACK to the Scatter CT */ sync_incr_event.success=1; sync_incr_event.failure=0; ret = PtlTriggeredCTInc(request->u.scatter.scatter_cth, sync_incr_event, request->u.scatter.sync_cth, expected_rtrs); if (PTL_OK != ret) { ret = OMPI_ERROR; line = __LINE__; goto err_hdlr; } ret = PtlTriggeredCTInc(request->u.scatter.scatter_cth, sync_incr_event, request->u.scatter.sync_cth, expected_rtrs + expected_acks); if (PTL_OK != ret) { ret = OMPI_ERROR; line = __LINE__; goto err_hdlr; } /* root, so put packed bytes to other ranks */ for (int32_t i=0; i<request->u.scatter.size; i++) { /* do not put to my scatter_buf. my data gets unpacked into my out buffer in linear_bottom(). */ if (i == request->u.scatter.my_rank) { continue; } ptl_size_t offset = request->u.scatter.packed_size * i; opal_output_verbose(30, ompi_coll_base_framework.framework_output, "%s:%d:rank(%d): offset(%lu)=rank(%d) * packed_size(%ld)", __FILE__, __LINE__, request->u.scatter.my_rank, offset, i, request->u.scatter.packed_size); ret = PtlTriggeredPut(request->u.scatter.scatter_mdh, (ptl_size_t)request->u.scatter.scatter_buf + offset, request->u.scatter.packed_size, PTL_NO_ACK_REQ, ompi_coll_portals4_get_peer(comm, i), mca_coll_portals4_component.pt_idx, request->u.scatter.scatter_match_bits, 0, NULL, 0, request->u.scatter.scatter_cth, expected_chained_rtrs); if (PTL_OK != ret) { ret = OMPI_ERROR; line = __LINE__; goto err_hdlr; } } } else { /* non-root, so do nothing */ /* operations on the sync counter */ expected_rtrs = 0; expected_acks = 0; /* operations on the scatter counter */ expected_puts = 1; /* scatter put from root */ expected_chained_rtrs = 0; expected_chained_acks = 0; } expected_ops = expected_chained_rtrs + expected_puts; /**********************************************/ /* only non-root ranks are PUT to, so only */ /* non-root ranks must PUT a Recv-ACK to root */ /**********************************************/ if (!i_am_root) { ret = PtlTriggeredPut(request->u.scatter.sync_mdh, 0, 0, PTL_NO_ACK_REQ, ompi_coll_portals4_get_peer(comm, request->u.scatter.root_rank), mca_coll_portals4_component.pt_idx, request->u.scatter.sync_match_bits, 0, NULL, 0, request->u.scatter.scatter_cth, expected_ops); if (PTL_OK != ret) { ret = OMPI_ERROR; line = __LINE__; goto err_hdlr; } } expected_ops += expected_chained_acks; if (!request->u.scatter.is_sync) { /******************************************/ /* put to finish pt when all ops complete */ /******************************************/ ret = PtlTriggeredPut(mca_coll_portals4_component.zero_md_h, 0, 0, PTL_NO_ACK_REQ, ompi_coll_portals4_get_peer(comm, request->u.scatter.my_rank), mca_coll_portals4_component.finish_pt_idx, 0, 0, NULL, (uintptr_t) request, request->u.scatter.scatter_cth, expected_ops); if (PTL_OK != ret) { ret = OMPI_ERROR; line = __LINE__; goto err_hdlr; } } /**************************************/ /* all non-root ranks put RTR to root */ /**************************************/ if (!i_am_root) { ret = PtlPut(request->u.scatter.sync_mdh, 0, 0, PTL_NO_ACK_REQ, ompi_coll_portals4_get_peer(comm, request->u.scatter.root_rank), mca_coll_portals4_component.pt_idx, request->u.scatter.sync_match_bits, 0, NULL, 0); if (PTL_OK != ret) { ret = OMPI_ERROR; line = __LINE__; goto err_hdlr; } } if (request->u.scatter.is_sync) { opal_output_verbose(1, ompi_coll_base_framework.framework_output, "calling CTWait(expected_ops=%d)\n", expected_ops); /********************************/ /* Wait for all ops to complete */ /********************************/ ret = PtlCTWait(request->u.scatter.scatter_cth, expected_ops, &ct); if (PTL_OK != ret) { ret = OMPI_ERROR; line = __LINE__; goto err_hdlr; } opal_output_verbose(1, ompi_coll_base_framework.framework_output, "completed CTWait(expected_ops=%d)\n", expected_ops); } OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "coll:portals4:scatter_intra_linear_top exit rank %d", request->u.scatter.my_rank)); return OMPI_SUCCESS; err_hdlr: if (NULL != request->u.scatter.scatter_buf) free(request->u.scatter.scatter_buf); opal_output(ompi_coll_base_framework.framework_output, "%s:%4d:%4d\tError occurred ret=%d, rank %2d", __FILE__, __LINE__, line, ret, request->u.scatter.my_rank); return ret; }
/* * ompi_coll_base_allreduce_intra_recursivedoubling * * Function: Recursive doubling algorithm for allreduce operation * Accepts: Same as MPI_Allreduce() * Returns: MPI_SUCCESS or error code * * Description: Implements recursive doubling algorithm for allreduce. * Original (non-segmented) implementation is used in MPICH-2 * for small and intermediate size messages. * The algorithm preserves order of operations so it can * be used both by commutative and non-commutative operations. * * Example on 7 nodes: * Initial state * # 0 1 2 3 4 5 6 * [0] [1] [2] [3] [4] [5] [6] * Initial adjustment step for non-power of two nodes. * old rank 1 3 5 6 * new rank 0 1 2 3 * [0+1] [2+3] [4+5] [6] * Step 1 * old rank 1 3 5 6 * new rank 0 1 2 3 * [0+1+] [0+1+] [4+5+] [4+5+] * [2+3+] [2+3+] [6 ] [6 ] * Step 2 * old rank 1 3 5 6 * new rank 0 1 2 3 * [0+1+] [0+1+] [0+1+] [0+1+] * [2+3+] [2+3+] [2+3+] [2+3+] * [4+5+] [4+5+] [4+5+] [4+5+] * [6 ] [6 ] [6 ] [6 ] * Final adjustment step for non-power of two nodes * # 0 1 2 3 4 5 6 * [0+1+] [0+1+] [0+1+] [0+1+] [0+1+] [0+1+] [0+1+] * [2+3+] [2+3+] [2+3+] [2+3+] [2+3+] [2+3+] [2+3+] * [4+5+] [4+5+] [4+5+] [4+5+] [4+5+] [4+5+] [4+5+] * [6 ] [6 ] [6 ] [6 ] [6 ] [6 ] [6 ] * */ int ompi_coll_base_allreduce_intra_recursivedoubling(const void *sbuf, void *rbuf, int count, struct ompi_datatype_t *dtype, struct ompi_op_t *op, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { int ret, line, rank, size, adjsize, remote, distance; int newrank, newremote, extra_ranks; char *tmpsend = NULL, *tmprecv = NULL, *tmpswap = NULL, *inplacebuf = NULL; ptrdiff_t true_lb, true_extent, lb, extent; ompi_request_t *reqs[2] = {NULL, NULL}; size = ompi_comm_size(comm); rank = ompi_comm_rank(comm); OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "coll:base:allreduce_intra_recursivedoubling rank %d", rank)); /* Special case for size == 1 */ if (1 == size) { if (MPI_IN_PLACE != sbuf) { ret = ompi_datatype_copy_content_same_ddt(dtype, count, (char*)rbuf, (char*)sbuf); if (ret < 0) { line = __LINE__; goto error_hndl; } } return MPI_SUCCESS; } /* Allocate and initialize temporary send buffer */ ret = ompi_datatype_get_extent(dtype, &lb, &extent); if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; } ret = ompi_datatype_get_true_extent(dtype, &true_lb, &true_extent); if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; } inplacebuf = (char*) malloc(true_extent + (ptrdiff_t)(count - 1) * extent); if (NULL == inplacebuf) { ret = -1; line = __LINE__; goto error_hndl; } if (MPI_IN_PLACE == sbuf) { ret = ompi_datatype_copy_content_same_ddt(dtype, count, inplacebuf, (char*)rbuf); if (ret < 0) { line = __LINE__; goto error_hndl; } } else { ret = ompi_datatype_copy_content_same_ddt(dtype, count, inplacebuf, (char*)sbuf); if (ret < 0) { line = __LINE__; goto error_hndl; } } tmpsend = (char*) inplacebuf; tmprecv = (char*) rbuf; /* Determine nearest power of two less than or equal to size */ adjsize = opal_next_poweroftwo (size); adjsize >>= 1; /* Handle non-power-of-two case: - Even ranks less than 2 * extra_ranks send their data to (rank + 1), and sets new rank to -1. - Odd ranks less than 2 * extra_ranks receive data from (rank - 1), apply appropriate operation, and set new rank to rank/2 - Everyone else sets rank to rank - extra_ranks */ extra_ranks = size - adjsize; if (rank < (2 * extra_ranks)) { if (0 == (rank % 2)) { ret = MCA_PML_CALL(send(tmpsend, count, dtype, (rank + 1), MCA_COLL_BASE_TAG_ALLREDUCE, MCA_PML_BASE_SEND_STANDARD, comm)); if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; } newrank = -1; } else { ret = MCA_PML_CALL(recv(tmprecv, count, dtype, (rank - 1), MCA_COLL_BASE_TAG_ALLREDUCE, comm, MPI_STATUS_IGNORE)); if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; } /* tmpsend = tmprecv (op) tmpsend */ ompi_op_reduce(op, tmprecv, tmpsend, count, dtype); newrank = rank >> 1; } } else {
/******************************************************************************* * ompi_coll_tuned_reduce_scatter_intra_nonoverlapping * * This function just calls a reduce to rank 0, followed by an * appropriate scatterv call. */ int ompi_coll_tuned_reduce_scatter_intra_nonoverlapping(void *sbuf, void *rbuf, int *rcounts, struct ompi_datatype_t *dtype, struct ompi_op_t *op, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { int err, i, rank, size, total_count, *displs = NULL; const int root = 0; char *tmprbuf = NULL, *tmprbuf_free = NULL; rank = ompi_comm_rank(comm); size = ompi_comm_size(comm); OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:reduce_scatter_intra_nonoverlapping, rank %d", rank)); for (i = 0, total_count = 0; i < size; i++) { total_count += rcounts[i]; } /* Reduce to rank 0 (root) and scatterv */ tmprbuf = (char*) rbuf; if (MPI_IN_PLACE == sbuf) { /* rbuf on root (0) is big enough to hold whole data */ if (root == rank) { err = comm->c_coll.coll_reduce (MPI_IN_PLACE, tmprbuf, total_count, dtype, op, root, comm, comm->c_coll.coll_reduce_module); } else { err = comm->c_coll.coll_reduce(tmprbuf, NULL, total_count, dtype, op, root, comm, comm->c_coll.coll_reduce_module); } } else { if (root == rank) { /* We must allocate temporary receive buffer on root to ensure that rbuf is big enough */ ptrdiff_t lb, extent, tlb, textent; ompi_datatype_get_extent(dtype, &lb, &extent); ompi_datatype_get_true_extent(dtype, &tlb, &textent); tmprbuf_free = (char*) malloc(textent + (ptrdiff_t)(total_count - 1) * extent); tmprbuf = tmprbuf_free - lb; } err = comm->c_coll.coll_reduce (sbuf, tmprbuf, total_count, dtype, op, root, comm, comm->c_coll.coll_reduce_module); } if (MPI_SUCCESS != err) { if (NULL != tmprbuf_free) free(tmprbuf_free); return err; } displs = (int*) malloc(size * sizeof(int)); displs[0] = 0; for (i = 1; i < size; i++) { displs[i] = displs[i-1] + rcounts[i-1]; } err = comm->c_coll.coll_scatterv (tmprbuf, rcounts, displs, dtype, rbuf, rcounts[rank], dtype, root, comm, comm->c_coll.coll_scatterv_module); free(displs); if (NULL != tmprbuf_free) free(tmprbuf_free); return err; }
/* * reduce_intra_in_order_binary * * Function: Logarithmic reduce operation for non-commutative operations. * Acecpts: same as MPI_Reduce() * Returns: MPI_SUCCESS or error code */ int ompi_coll_tuned_reduce_intra_in_order_binary( void *sendbuf, void *recvbuf, int count, ompi_datatype_t* datatype, ompi_op_t* op, int root, ompi_communicator_t* comm, mca_coll_base_module_t *module, uint32_t segsize, int max_outstanding_reqs ) { int ret, rank, size, io_root, segcount = count; void *use_this_sendbuf = NULL, *use_this_recvbuf = NULL; size_t typelng; mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module; mca_coll_tuned_comm_t *data = tuned_module->tuned_data; rank = ompi_comm_rank(comm); size = ompi_comm_size(comm); OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:reduce_intra_in_order_binary rank %d ss %5d", rank, segsize)); COLL_TUNED_UPDATE_IN_ORDER_BINTREE( comm, tuned_module ); /** * Determine number of segments and number of elements * sent per operation */ ompi_datatype_type_size( datatype, &typelng ); COLL_TUNED_COMPUTED_SEGCOUNT( segsize, typelng, segcount ); /* An in-order binary tree must use root (size-1) to preserve the order of operations. Thus, if root is not rank (size - 1), then we must handle 1. MPI_IN_PLACE option on real root, and 2. we must allocate temporary recvbuf on rank (size - 1). Note that generic function must be careful not to switch order of operations for non-commutative ops. */ io_root = size - 1; use_this_sendbuf = sendbuf; use_this_recvbuf = recvbuf; if (io_root != root) { ptrdiff_t tlb, text, lb, ext; char *tmpbuf = NULL; ompi_datatype_get_extent(datatype, &lb, &ext); ompi_datatype_get_true_extent(datatype, &tlb, &text); if ((root == rank) && (MPI_IN_PLACE == sendbuf)) { tmpbuf = (char *) malloc(text + (ptrdiff_t)(count - 1) * ext); if (NULL == tmpbuf) { return MPI_ERR_INTERN; } ompi_datatype_copy_content_same_ddt(datatype, count, (char*)tmpbuf, (char*)recvbuf); use_this_sendbuf = tmpbuf; } else if (io_root == rank) { tmpbuf = (char *) malloc(text + (ptrdiff_t)(count - 1) * ext); if (NULL == tmpbuf) { return MPI_ERR_INTERN; } use_this_recvbuf = tmpbuf; } } /* Use generic reduce with in-order binary tree topology and io_root */ ret = ompi_coll_tuned_reduce_generic( use_this_sendbuf, use_this_recvbuf, count, datatype, op, io_root, comm, module, data->cached_in_order_bintree, segcount, max_outstanding_reqs ); if (MPI_SUCCESS != ret) { return ret; } /* Clean up */ if (io_root != root) { if (root == rank) { /* Receive result from rank io_root to recvbuf */ ret = MCA_PML_CALL(recv(recvbuf, count, datatype, io_root, MCA_COLL_BASE_TAG_REDUCE, comm, MPI_STATUS_IGNORE)); if (MPI_SUCCESS != ret) { return ret; } if (MPI_IN_PLACE == sendbuf) { free(use_this_sendbuf); } } else if (io_root == rank) { /* Send result from use_this_recvbuf to root */ ret = MCA_PML_CALL(send(use_this_recvbuf, count, datatype, root, MCA_COLL_BASE_TAG_REDUCE, MCA_PML_BASE_SEND_STANDARD, comm)); if (MPI_SUCCESS != ret) { return ret; } free(use_this_recvbuf); } } return MPI_SUCCESS; }
/* * exscan_intra * * Function: - basic exscan operation * Accepts: - same arguments as MPI_Exscan() * Returns: - MPI_SUCCESS or error code */ int mca_coll_basic_exscan_intra(void *sbuf, void *rbuf, int count, struct ompi_datatype_t *dtype, struct ompi_op_t *op, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { int size, rank, err; ptrdiff_t true_lb, true_extent, lb, extent; char *free_buffer = NULL; char *reduce_buffer = NULL; rank = ompi_comm_rank(comm); size = ompi_comm_size(comm); /* For MPI_IN_PLACE, just adjust send buffer to point to * receive buffer. */ if (MPI_IN_PLACE == sbuf) { sbuf = rbuf; } /* If we're rank 0, then just send our sbuf to the next rank, and * we are done. */ if (0 == rank) { return MCA_PML_CALL(send(sbuf, count, dtype, rank + 1, MCA_COLL_BASE_TAG_EXSCAN, MCA_PML_BASE_SEND_STANDARD, comm)); } /* If we're the last rank, then just receive the result from the * prior rank, and we are done. */ else if ((size - 1) == rank) { return MCA_PML_CALL(recv(rbuf, count, dtype, rank - 1, MCA_COLL_BASE_TAG_EXSCAN, comm, MPI_STATUS_IGNORE)); } /* Otherwise, get the result from the prior rank, combine it with my * data, and send it to the next rank */ /* Get a temporary buffer to perform the reduction into. Rationale * for malloc'ing this size is provided in coll_basic_reduce.c. */ ompi_datatype_get_extent(dtype, &lb, &extent); ompi_datatype_get_true_extent(dtype, &true_lb, &true_extent); free_buffer = (char*)malloc(true_extent + (count - 1) * extent); if (NULL == free_buffer) { return OMPI_ERR_OUT_OF_RESOURCE; } reduce_buffer = free_buffer - lb; err = ompi_datatype_copy_content_same_ddt(dtype, count, reduce_buffer, (char*)sbuf); /* Receive the reduced value from the prior rank */ err = MCA_PML_CALL(recv(rbuf, count, dtype, rank - 1, MCA_COLL_BASE_TAG_EXSCAN, comm, MPI_STATUS_IGNORE)); if (MPI_SUCCESS != err) { goto error; } /* Now reduce the prior rank's result with my source buffer. The source * buffer had been previously copied into the temporary reduce_buffer. */ ompi_op_reduce(op, rbuf, reduce_buffer, count, dtype); /* Send my result off to the next rank */ err = MCA_PML_CALL(send(reduce_buffer, count, dtype, rank + 1, MCA_COLL_BASE_TAG_EXSCAN, MCA_PML_BASE_SEND_STANDARD, comm)); /* Error */ error: free(free_buffer); /* All done */ return err; }