int MPIR_Iscan_rec_dbl(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype,
                       MPI_Op op, MPIR_Comm *comm_ptr, MPIR_Sched_t s)
{
    int mpi_errno = MPI_SUCCESS;
    MPI_Aint true_extent, true_lb, extent;
    int is_commutative;
    int mask, dst, rank, comm_size;
    void *partial_scan = NULL;
    void *tmp_buf = NULL;
    MPIR_SCHED_CHKPMEM_DECL(2);

    if (count == 0)
        goto fn_exit;

    comm_size = comm_ptr->local_size;
    rank = comm_ptr->rank;

    is_commutative = MPIR_Op_is_commutative(op);

    /* need to allocate temporary buffer to store partial scan */
    MPIR_Type_get_true_extent_impl(datatype, &true_lb, &true_extent);

    MPID_Datatype_get_extent_macro(datatype, extent);
    MPIR_SCHED_CHKPMEM_MALLOC(partial_scan, void *, count * (MPL_MAX(extent, true_extent)),
                              mpi_errno, "partial_scan");

    /* This eventually gets malloc()ed as a temp buffer, not added to
     * any user buffers */
    MPIR_Ensure_Aint_fits_in_pointer(count * MPL_MAX(extent, true_extent));

    /* adjust for potential negative lower bound in datatype */
    partial_scan = (void *) ((char *) partial_scan - true_lb);

    /* need to allocate temporary buffer to store incoming data */
    MPIR_SCHED_CHKPMEM_MALLOC(tmp_buf, void *, count * (MPL_MAX(extent, true_extent)),
                              mpi_errno, "tmp_buf");

    /* adjust for potential negative lower bound in datatype */
    tmp_buf = (void *) ((char *) tmp_buf - true_lb);

    /* Since this is an inclusive scan, copy local contribution into recvbuf. */
    if (sendbuf != MPI_IN_PLACE) {
        mpi_errno = MPIR_Sched_copy(sendbuf, count, datatype, recvbuf, count, datatype, s);
        if (mpi_errno)
            MPIR_ERR_POP(mpi_errno);
    }

    if (sendbuf != MPI_IN_PLACE)
        mpi_errno = MPIR_Sched_copy(sendbuf, count, datatype,
                                    partial_scan, count, datatype, s);
    else
        mpi_errno = MPIR_Sched_copy(recvbuf, count, datatype,
                                    partial_scan, count, datatype, s);
    if (mpi_errno)
        MPIR_ERR_POP(mpi_errno);

    mask = 0x1;
    while (mask < comm_size) {
        dst = rank ^ mask;
        if (dst < comm_size) {
            /* Send partial_scan to dst.  Recv into tmp_buf */
            mpi_errno = MPIR_Sched_send(partial_scan, count, datatype, dst, comm_ptr, s);
            if (mpi_errno)
                MPIR_ERR_POP(mpi_errno);
            /* sendrecv, no barrier here */
            mpi_errno = MPIR_Sched_recv(tmp_buf, count, datatype, dst, comm_ptr, s);
            if (mpi_errno)
                MPIR_ERR_POP(mpi_errno);
            MPIR_SCHED_BARRIER(s);

            if (rank > dst) {
                mpi_errno = MPIR_Sched_reduce(tmp_buf, partial_scan, count, datatype, op, s);
                if (mpi_errno)
                    MPIR_ERR_POP(mpi_errno);
                mpi_errno = MPIR_Sched_reduce(tmp_buf, recvbuf, count, datatype, op, s);
                if (mpi_errno)
                    MPIR_ERR_POP(mpi_errno);
                MPIR_SCHED_BARRIER(s);
            } else {
                if (is_commutative) {
                    mpi_errno = MPIR_Sched_reduce(tmp_buf, partial_scan, count, datatype, op, s);
                    if (mpi_errno)
                        MPIR_ERR_POP(mpi_errno);
                    MPIR_SCHED_BARRIER(s);
                } else {
                    mpi_errno = MPIR_Sched_reduce(partial_scan, tmp_buf, count, datatype, op, s);
                    if (mpi_errno)
                        MPIR_ERR_POP(mpi_errno);
                    MPIR_SCHED_BARRIER(s);

                    mpi_errno = MPIR_Sched_copy(tmp_buf, count, datatype,
                                                partial_scan, count, datatype, s);
                    if (mpi_errno)
                        MPIR_ERR_POP(mpi_errno);
                    MPIR_SCHED_BARRIER(s);
                }
            }
        }
        mask <<= 1;
    }

    MPIR_SCHED_CHKPMEM_COMMIT(s);
  fn_exit:
    return mpi_errno;
  fn_fail:
    MPIR_SCHED_CHKPMEM_REAP(s);
    goto fn_exit;
}
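/*
 * A minimal standalone sketch (illustrative only, not MPICH code) of the
 * recursive-doubling scan recurrence that the schedule above builds.  It
 * simulates the commutative path sequentially on ints with addition as the
 * reduction op: at each mask, every rank pairs with (rank ^ mask), always
 * folds the peer's partial_scan into its own, but folds into recvbuf only
 * when the peer is a lower rank.  NPROCS and the per-rank inputs (rank+1)
 * are made-up example values.
 */
#include <stdio.h>

#define NPROCS 6

int main(void)
{
    int recvbuf[NPROCS];    /* inclusive-scan result held by each simulated rank */
    int partial[NPROCS];    /* running "partial_scan" held by each simulated rank */

    for (int r = 0; r < NPROCS; r++)
        recvbuf[r] = partial[r] = r + 1;        /* rank r contributes r+1 */

    for (int mask = 0x1; mask < NPROCS; mask <<= 1) {
        int tmp[NPROCS];
        for (int r = 0; r < NPROCS; r++)
            tmp[r] = partial[r];        /* snapshot: what each rank "sends" */
        for (int r = 0; r < NPROCS; r++) {
            int dst = r ^ mask;
            if (dst >= NPROCS)
                continue;               /* no partner at this step */
            partial[r] += tmp[dst];     /* partial_scan always absorbs the peer */
            if (r > dst)
                recvbuf[r] += tmp[dst]; /* result absorbs only lower-rank data */
        }
    }

    for (int r = 0; r < NPROCS; r++)    /* expect 1, 3, 6, 10, 15, 21 */
        printf("rank %d: scan = %d\n", r, recvbuf[r]);
    return 0;
}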
int MPIR_Ireduce_scatter_sched_intra_recursive_halving(const void *sendbuf, void *recvbuf,
                                                       const int recvcounts[],
                                                       MPI_Datatype datatype, MPI_Op op,
                                                       MPIR_Comm * comm_ptr, MPIR_Sched_t s)
{
    int mpi_errno = MPI_SUCCESS;
    int rank, comm_size, i;
    MPI_Aint extent, true_extent, true_lb;
    int *disps;
    void *tmp_recvbuf, *tmp_results;
    int type_size ATTRIBUTE((unused)), total_count, dst;
    int mask;
    int *newcnts, *newdisps, rem, newdst, send_idx, recv_idx, last_idx, send_cnt, recv_cnt;
    int pof2, old_i, newrank;
    MPIR_SCHED_CHKPMEM_DECL(5);

    comm_size = comm_ptr->local_size;
    rank = comm_ptr->rank;

    MPIR_Datatype_get_extent_macro(datatype, extent);
    MPIR_Type_get_true_extent_impl(datatype, &true_lb, &true_extent);

#ifdef HAVE_ERROR_CHECKING
    MPIR_Assert(MPIR_Op_is_commutative(op));
#endif

    MPIR_SCHED_CHKPMEM_MALLOC(disps, int *, comm_size * sizeof(int), mpi_errno, "disps",
                              MPL_MEM_BUFFER);

    total_count = 0;
    for (i = 0; i < comm_size; i++) {
        disps[i] = total_count;
        total_count += recvcounts[i];
    }

    if (total_count == 0)
        goto fn_exit;

    MPIR_Datatype_get_size_macro(datatype, type_size);

    /* allocate temp. buffer to receive incoming data */
    MPIR_SCHED_CHKPMEM_MALLOC(tmp_recvbuf, void *, total_count * (MPL_MAX(true_extent, extent)),
                              mpi_errno, "tmp_recvbuf", MPL_MEM_BUFFER);
    /* adjust for potential negative lower bound in datatype */
    tmp_recvbuf = (void *) ((char *) tmp_recvbuf - true_lb);

    /* need to allocate another temporary buffer to accumulate
     * results because recvbuf may not be big enough */
    MPIR_SCHED_CHKPMEM_MALLOC(tmp_results, void *, total_count * (MPL_MAX(true_extent, extent)),
                              mpi_errno, "tmp_results", MPL_MEM_BUFFER);
    /* adjust for potential negative lower bound in datatype */
    tmp_results = (void *) ((char *) tmp_results - true_lb);

    /* copy sendbuf into tmp_results */
    if (sendbuf != MPI_IN_PLACE)
        mpi_errno = MPIR_Sched_copy(sendbuf, total_count, datatype,
                                    tmp_results, total_count, datatype, s);
    else
        mpi_errno = MPIR_Sched_copy(recvbuf, total_count, datatype,
                                    tmp_results, total_count, datatype, s);
    if (mpi_errno)
        MPIR_ERR_POP(mpi_errno);
    MPIR_SCHED_BARRIER(s);

    pof2 = comm_ptr->pof2;
    rem = comm_size - pof2;

    /* In the non-power-of-two case, all even-numbered
     * processes of rank < 2*rem send their data to
     * (rank+1). These even-numbered processes no longer
     * participate in the algorithm until the very end. The
     * remaining processes form a nice power-of-two. */
    if (rank < 2 * rem) {
        if (rank % 2 == 0) {    /* even */
            mpi_errno = MPIR_Sched_send(tmp_results, total_count, datatype,
                                        rank + 1, comm_ptr, s);
            if (mpi_errno)
                MPIR_ERR_POP(mpi_errno);
            MPIR_SCHED_BARRIER(s);

            /* temporarily set the rank to -1 so that this
             * process does not participate in recursive
             * doubling */
            newrank = -1;
        } else {        /* odd */
            mpi_errno = MPIR_Sched_recv(tmp_recvbuf, total_count, datatype,
                                        rank - 1, comm_ptr, s);
            if (mpi_errno)
                MPIR_ERR_POP(mpi_errno);
            MPIR_SCHED_BARRIER(s);

            /* do the reduction on received data. since the
             * ordering is right, it doesn't matter whether
             * the operation is commutative or not. */
            mpi_errno = MPIR_Sched_reduce(tmp_recvbuf, tmp_results, total_count, datatype, op, s);
            if (mpi_errno)
                MPIR_ERR_POP(mpi_errno);
            MPIR_SCHED_BARRIER(s);

            /* change the rank */
            newrank = rank / 2;
        }
    } else      /* rank >= 2*rem */
        newrank = rank - rem;

    if (newrank != -1) {
        /* recalculate the recvcounts and disps arrays because the
         * even-numbered processes who no longer participate will
         * have their result calculated by the process to their
         * right (rank+1). */
        MPIR_SCHED_CHKPMEM_MALLOC(newcnts, int *, pof2 * sizeof(int), mpi_errno, "newcnts",
                                  MPL_MEM_BUFFER);
        MPIR_SCHED_CHKPMEM_MALLOC(newdisps, int *, pof2 * sizeof(int), mpi_errno, "newdisps",
                                  MPL_MEM_BUFFER);

        for (i = 0; i < pof2; i++) {
            /* what does i map to in the old ranking? */
            old_i = (i < rem) ? i * 2 + 1 : i + rem;
            if (old_i < 2 * rem) {
                /* This process has to also do its left neighbor's
                 * work */
                newcnts[i] = recvcounts[old_i] + recvcounts[old_i - 1];
            } else
                newcnts[i] = recvcounts[old_i];
        }

        newdisps[0] = 0;
        for (i = 1; i < pof2; i++)
            newdisps[i] = newdisps[i - 1] + newcnts[i - 1];

        mask = pof2 >> 1;
        send_idx = recv_idx = 0;
        last_idx = pof2;
        while (mask > 0) {
            newdst = newrank ^ mask;
            /* find real rank of dest */
            dst = (newdst < rem) ? newdst * 2 + 1 : newdst + rem;

            send_cnt = recv_cnt = 0;
            if (newrank < newdst) {
                send_idx = recv_idx + mask;
                for (i = send_idx; i < last_idx; i++)
                    send_cnt += newcnts[i];
                for (i = recv_idx; i < send_idx; i++)
                    recv_cnt += newcnts[i];
            } else {
                recv_idx = send_idx + mask;
                for (i = send_idx; i < recv_idx; i++)
                    send_cnt += newcnts[i];
                for (i = recv_idx; i < last_idx; i++)
                    recv_cnt += newcnts[i];
            }

            /* Send data from tmp_results. Recv into tmp_recvbuf */
            {
                /* avoid sending and receiving pointless 0-byte messages */
                int send_dst = (send_cnt ? dst : MPI_PROC_NULL);
                int recv_dst = (recv_cnt ? dst : MPI_PROC_NULL);

                mpi_errno = MPIR_Sched_send(((char *) tmp_results + newdisps[send_idx] * extent),
                                            send_cnt, datatype, send_dst, comm_ptr, s);
                if (mpi_errno)
                    MPIR_ERR_POP(mpi_errno);
                mpi_errno = MPIR_Sched_recv(((char *) tmp_recvbuf + newdisps[recv_idx] * extent),
                                            recv_cnt, datatype, recv_dst, comm_ptr, s);
                if (mpi_errno)
                    MPIR_ERR_POP(mpi_errno);
                MPIR_SCHED_BARRIER(s);
            }

            /* tmp_recvbuf contains data received in this step.
             * tmp_results contains data accumulated so far */
            if (recv_cnt) {
                mpi_errno = MPIR_Sched_reduce(((char *) tmp_recvbuf + newdisps[recv_idx] * extent),
                                              ((char *) tmp_results + newdisps[recv_idx] * extent),
                                              recv_cnt, datatype, op, s);
                if (mpi_errno)
                    MPIR_ERR_POP(mpi_errno);
                MPIR_SCHED_BARRIER(s);
            }

            /* update send_idx for next iteration */
            send_idx = recv_idx;
            last_idx = recv_idx + mask;
            mask >>= 1;
        }

        /* copy this process's result from tmp_results to recvbuf */
        if (recvcounts[rank]) {
            mpi_errno = MPIR_Sched_copy(((char *) tmp_results + disps[rank] * extent),
                                        recvcounts[rank], datatype,
                                        recvbuf, recvcounts[rank], datatype, s);
            if (mpi_errno)
                MPIR_ERR_POP(mpi_errno);
            MPIR_SCHED_BARRIER(s);
        }
    }

    /* In the non-power-of-two case, all odd-numbered processes of
     * rank < 2*rem send to (rank-1) the result they calculated for
     * that process */
    if (rank < 2 * rem) {
        if (rank % 2) { /* odd */
            if (recvcounts[rank - 1]) {
                mpi_errno = MPIR_Sched_send(((char *) tmp_results + disps[rank - 1] * extent),
                                            recvcounts[rank - 1], datatype,
                                            rank - 1, comm_ptr, s);
                if (mpi_errno)
                    MPIR_ERR_POP(mpi_errno);
                MPIR_SCHED_BARRIER(s);
            }
        } else {        /* even */
            if (recvcounts[rank]) {
                mpi_errno = MPIR_Sched_recv(recvbuf, recvcounts[rank], datatype,
                                            rank + 1, comm_ptr, s);
                if (mpi_errno)
                    MPIR_ERR_POP(mpi_errno);
                MPIR_SCHED_BARRIER(s);
            }
        }
    }

    MPIR_SCHED_CHKPMEM_COMMIT(s);
  fn_exit:
    return mpi_errno;
  fn_fail:
    MPIR_SCHED_CHKPMEM_REAP(s);
    goto fn_exit;
}
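/*
 * A standalone sketch (illustrative only, not MPICH code) of the rank and
 * count remapping that the recursive-halving function above performs for
 * non-power-of-two communicators: even ranks below 2*rem drop out, odd
 * survivors get newrank = rank/2 and also own their left neighbor's block,
 * and everyone else shifts down by rem.  comm_size and the uniform
 * recvcounts are made-up example inputs.
 */
#include <stdio.h>

int main(void)
{
    int comm_size = 6;
    int recvcounts[6] = { 4, 4, 4, 4, 4, 4 };

    int pof2 = 1;
    while (pof2 * 2 <= comm_size)
        pof2 *= 2;                  /* largest power of two <= comm_size: 4 */
    int rem = comm_size - pof2;     /* 6 - 4 = 2 */

    for (int rank = 0; rank < comm_size; rank++) {
        int newrank = (rank < 2 * rem) ? ((rank % 2) ? rank / 2 : -1)
                                       : rank - rem;
        printf("rank %d -> newrank %d\n", rank, newrank);
    }

    /* merged per-newrank counts: odd survivors below 2*rem also carry
     * their left (dropped-out) neighbor's block */
    for (int i = 0; i < pof2; i++) {
        int old_i = (i < rem) ? i * 2 + 1 : i + rem;
        int cnt = (old_i < 2 * rem) ? recvcounts[old_i] + recvcounts[old_i - 1]
                                    : recvcounts[old_i];
        printf("newrank %d (old rank %d): count %d\n", i, old_i, cnt);
    }
    return 0;
}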
int MPIR_Ireduce_scatter_sched_intra_recursive_doubling(const void *sendbuf, void *recvbuf,
                                                        const int recvcounts[],
                                                        MPI_Datatype datatype, MPI_Op op,
                                                        MPIR_Comm *comm_ptr, MPIR_Sched_t s)
{
    int mpi_errno = MPI_SUCCESS;
    int rank, comm_size, i;
    MPI_Aint extent, true_extent, true_lb;
    int *disps;
    void *tmp_recvbuf, *tmp_results;
    int type_size ATTRIBUTE((unused)), dis[2], blklens[2], total_count, dst;
    int mask, dst_tree_root, my_tree_root, j, k;
    int received;
    MPI_Datatype sendtype, recvtype;
    int nprocs_completed, tmp_mask, tree_root, is_commutative;
    MPIR_SCHED_CHKPMEM_DECL(5);

    comm_size = comm_ptr->local_size;
    rank = comm_ptr->rank;

    MPIR_Datatype_get_extent_macro(datatype, extent);
    MPIR_Type_get_true_extent_impl(datatype, &true_lb, &true_extent);

    is_commutative = MPIR_Op_is_commutative(op);

    MPIR_SCHED_CHKPMEM_MALLOC(disps, int *, comm_size * sizeof(int), mpi_errno, "disps",
                              MPL_MEM_BUFFER);

    total_count = 0;
    for (i = 0; i < comm_size; i++) {
        disps[i] = total_count;
        total_count += recvcounts[i];
    }

    if (total_count == 0)
        goto fn_exit;

    MPIR_Datatype_get_size_macro(datatype, type_size);

    /* total_count*extent eventually gets malloced. it isn't added to
     * a user-passed in buffer */
    MPIR_Ensure_Aint_fits_in_pointer(total_count * MPL_MAX(true_extent, extent));

    /* need to allocate temporary buffer to receive incoming data */
    MPIR_SCHED_CHKPMEM_MALLOC(tmp_recvbuf, void *, total_count * (MPL_MAX(true_extent, extent)),
                              mpi_errno, "tmp_recvbuf", MPL_MEM_BUFFER);
    /* adjust for potential negative lower bound in datatype */
    tmp_recvbuf = (void *) ((char *) tmp_recvbuf - true_lb);

    /* need to allocate another temporary buffer to accumulate results */
    MPIR_SCHED_CHKPMEM_MALLOC(tmp_results, void *, total_count * (MPL_MAX(true_extent, extent)),
                              mpi_errno, "tmp_results", MPL_MEM_BUFFER);
    /* adjust for potential negative lower bound in datatype */
    tmp_results = (void *) ((char *) tmp_results - true_lb);

    /* copy sendbuf into tmp_results */
    if (sendbuf != MPI_IN_PLACE)
        mpi_errno = MPIR_Sched_copy(sendbuf, total_count, datatype,
                                    tmp_results, total_count, datatype, s);
    else
        mpi_errno = MPIR_Sched_copy(recvbuf, total_count, datatype,
                                    tmp_results, total_count, datatype, s);
    if (mpi_errno)
        MPIR_ERR_POP(mpi_errno);
    MPIR_SCHED_BARRIER(s);

    mask = 0x1;
    i = 0;
    while (mask < comm_size) {
        dst = rank ^ mask;

        dst_tree_root = dst >> i;
        dst_tree_root <<= i;

        my_tree_root = rank >> i;
        my_tree_root <<= i;

        /* At step 1, processes exchange (n-n/p) amount of data; at
         * step 2, (n-2n/p) amount of data; at step 3, (n-4n/p)
         * amount of data, and so forth. We use derived datatypes for
         * this.
         *
         * At each step, a process does not need to send data indexed
         * from my_tree_root to my_tree_root+mask-1. Similarly, a
         * process won't receive data indexed from dst_tree_root to
         * dst_tree_root+mask-1. */

        /* calculate sendtype */
        blklens[0] = blklens[1] = 0;
        for (j = 0; j < my_tree_root; j++)
            blklens[0] += recvcounts[j];
        for (j = my_tree_root + mask; j < comm_size; j++)
            blklens[1] += recvcounts[j];

        dis[0] = 0;
        dis[1] = blklens[0];
        for (j = my_tree_root; (j < my_tree_root + mask) && (j < comm_size); j++)
            dis[1] += recvcounts[j];

        mpi_errno = MPIR_Type_indexed_impl(2, blklens, dis, datatype, &sendtype);
        if (mpi_errno)
            MPIR_ERR_POP(mpi_errno);

        mpi_errno = MPIR_Type_commit_impl(&sendtype);
        if (mpi_errno)
            MPIR_ERR_POP(mpi_errno);

        /* calculate recvtype */
        blklens[0] = blklens[1] = 0;
        for (j = 0; j < dst_tree_root && j < comm_size; j++)
            blklens[0] += recvcounts[j];
        for (j = dst_tree_root + mask; j < comm_size; j++)
            blklens[1] += recvcounts[j];

        dis[0] = 0;
        dis[1] = blklens[0];
        for (j = dst_tree_root; (j < dst_tree_root + mask) && (j < comm_size); j++)
            dis[1] += recvcounts[j];

        mpi_errno = MPIR_Type_indexed_impl(2, blklens, dis, datatype, &recvtype);
        if (mpi_errno)
            MPIR_ERR_POP(mpi_errno);

        mpi_errno = MPIR_Type_commit_impl(&recvtype);
        if (mpi_errno)
            MPIR_ERR_POP(mpi_errno);

        received = 0;
        if (dst < comm_size) {
            /* tmp_results contains data to be sent in each step. Data is
             * received in tmp_recvbuf and then accumulated into
             * tmp_results. accumulation is done later below. */
            mpi_errno = MPIR_Sched_send(tmp_results, 1, sendtype, dst, comm_ptr, s);
            if (mpi_errno)
                MPIR_ERR_POP(mpi_errno);
            mpi_errno = MPIR_Sched_recv(tmp_recvbuf, 1, recvtype, dst, comm_ptr, s);
            if (mpi_errno)
                MPIR_ERR_POP(mpi_errno);
            MPIR_SCHED_BARRIER(s);
            received = 1;
        }

        /* if some processes in this process's subtree in this step
         * did not have any destination process to communicate with
         * because of non-power-of-two, we need to send them the
         * result. We use a logarithmic recursive-halving algorithm
         * for this. */
        if (dst_tree_root + mask > comm_size) {
            nprocs_completed = comm_size - my_tree_root - mask;
            /* nprocs_completed is the number of processes in this
             * subtree that have all the data. Send data to others
             * in a tree fashion. First find root of current tree
             * that is being divided into two. k is the number of
             * least-significant bits in this process's rank that
             * must be zeroed out to find the rank of the root */
            j = mask;
            k = 0;
            while (j) {
                j >>= 1;
                k++;
            }
            k--;

            tmp_mask = mask >> 1;
            while (tmp_mask) {
                dst = rank ^ tmp_mask;

                tree_root = rank >> k;
                tree_root <<= k;

                /* send only if this proc has data and destination
                 * doesn't have data. at any step, multiple processes
                 * can send if they have the data */
                if ((dst > rank) &&
                    (rank < tree_root + nprocs_completed) &&
                    (dst >= tree_root + nprocs_completed)) {
                    /* send the current result */
                    mpi_errno = MPIR_Sched_send(tmp_recvbuf, 1, recvtype, dst, comm_ptr, s);
                    if (mpi_errno)
                        MPIR_ERR_POP(mpi_errno);
                    MPIR_SCHED_BARRIER(s);
                }
                /* recv only if this proc. doesn't have data and sender
                 * has data */
                else if ((dst < rank) &&
                         (dst < tree_root + nprocs_completed) &&
                         (rank >= tree_root + nprocs_completed)) {
                    mpi_errno = MPIR_Sched_recv(tmp_recvbuf, 1, recvtype, dst, comm_ptr, s);
                    if (mpi_errno)
                        MPIR_ERR_POP(mpi_errno);
                    MPIR_SCHED_BARRIER(s);
                    received = 1;
                }
                tmp_mask >>= 1;
                k--;
            }
        }

        /* N.B. The following comment comes from the FT version of
         * MPI_Reduce_scatter. It does not currently apply to this code, but
         * will in the future when we update the NBC code to be fault-tolerant
         * in roughly the same fashion. [goodell@ 2011-03-03] */
        /* The following reduction is done here instead of after the
         * MPIC_Sendrecv or MPIC_Recv above. This is because to do it
         * above, in the noncommutative case, we would need an extra
         * temp buffer so as not to overwrite temp_recvbuf, because
         * temp_recvbuf may have to be communicated to other processes
         * in the non-power-of-two case. To avoid that extra
         * allocation, we do the reduce here. */
        if (received) {
            if (is_commutative || (dst_tree_root < my_tree_root)) {
                mpi_errno = MPIR_Sched_reduce(tmp_recvbuf, tmp_results,
                                              blklens[0], datatype, op, s);
                if (mpi_errno)
                    MPIR_ERR_POP(mpi_errno);
                mpi_errno = MPIR_Sched_reduce(((char *) tmp_recvbuf + dis[1] * extent),
                                              ((char *) tmp_results + dis[1] * extent),
                                              blklens[1], datatype, op, s);
                if (mpi_errno)
                    MPIR_ERR_POP(mpi_errno);
                MPIR_SCHED_BARRIER(s);
            } else {
                mpi_errno = MPIR_Sched_reduce(tmp_results, tmp_recvbuf,
                                              blklens[0], datatype, op, s);
                if (mpi_errno)
                    MPIR_ERR_POP(mpi_errno);
                mpi_errno = MPIR_Sched_reduce(((char *) tmp_results + dis[1] * extent),
                                              ((char *) tmp_recvbuf + dis[1] * extent),
                                              blklens[1], datatype, op, s);
                if (mpi_errno)
                    MPIR_ERR_POP(mpi_errno);
                MPIR_SCHED_BARRIER(s);

                /* copy result back into tmp_results */
                mpi_errno = MPIR_Sched_copy(tmp_recvbuf, 1, recvtype,
                                            tmp_results, 1, recvtype, s);
                if (mpi_errno)
                    MPIR_ERR_POP(mpi_errno);
                MPIR_SCHED_BARRIER(s);
            }
        }

        MPIR_Type_free_impl(&sendtype);
        MPIR_Type_free_impl(&recvtype);

        mask <<= 1;
        i++;
    }

    /* copy this process's result from tmp_results to recvbuf */
    if (recvcounts[rank]) {
        mpi_errno = MPIR_Sched_copy(((char *) tmp_results + disps[rank] * extent),
                                    recvcounts[rank], datatype,
                                    recvbuf, recvcounts[rank], datatype, s);
        if (mpi_errno)
            MPIR_ERR_POP(mpi_errno);
        MPIR_SCHED_BARRIER(s);
    }

    MPIR_SCHED_CHKPMEM_COMMIT(s);
  fn_exit:
    return mpi_errno;
  fn_fail:
    MPIR_SCHED_CHKPMEM_REAP(s);
    goto fn_exit;
}
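/*
 * A standalone sketch (illustrative only, not MPICH code) of the two-block
 * layout that the recursive-doubling function above builds with
 * MPIR_Type_indexed_impl: block 0 covers all data below the tree root's
 * range, block 1 covers everything above the mask-wide "hole" that this
 * step does not exchange.  comm_size, the uniform recvcounts, and the
 * my_tree_root/mask values are made-up example inputs for one step.
 */
#include <stdio.h>

int main(void)
{
    int comm_size = 8;
    int recvcounts[8] = { 2, 2, 2, 2, 2, 2, 2, 2 };
    int my_tree_root = 4, mask = 2;     /* example values for step i = 1 */

    int blklens[2] = { 0, 0 }, dis[2];

    for (int j = 0; j < my_tree_root; j++)
        blklens[0] += recvcounts[j];            /* data below the tree root */
    for (int j = my_tree_root + mask; j < comm_size; j++)
        blklens[1] += recvcounts[j];            /* data above the hole */

    dis[0] = 0;
    dis[1] = blklens[0];
    for (int j = my_tree_root; j < my_tree_root + mask && j < comm_size; j++)
        dis[1] += recvcounts[j];                /* skip over the hole itself */

    /* expect: block of 8 elements at 0, block of 4 elements at 12
     * (the counts for ranks 4..5 are skipped) */
    printf("blklens = {%d, %d}, dis = {%d, %d}\n",
           blklens[0], blklens[1], dis[0], dis[1]);
    return 0;
}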
int MPIR_Iallreduce_sched_intra_reduce_scatter_allgather(const void *sendbuf, void *recvbuf,
                                                         int count, MPI_Datatype datatype,
                                                         MPI_Op op, MPIR_Comm * comm_ptr,
                                                         MPIR_Sched_t s)
{
    int mpi_errno = MPI_SUCCESS;
    int comm_size, rank, newrank, pof2, rem;
    int i, send_idx, recv_idx, last_idx, mask, newdst, dst, send_cnt, recv_cnt;
    MPI_Aint true_lb, true_extent, extent;
    void *tmp_buf = NULL;
    int *cnts, *disps;
    MPIR_SCHED_CHKPMEM_DECL(1);
    MPIR_CHKLMEM_DECL(2);

#ifdef HAVE_ERROR_CHECKING
    /* we only support builtin datatypes for now, breaking up user types to do
     * the reduce-scatter is tricky */
    MPIR_Assert(HANDLE_GET_KIND(op) == HANDLE_KIND_BUILTIN);
#endif

    comm_size = comm_ptr->local_size;
    rank = comm_ptr->rank;

    /* need to allocate temporary buffer to store incoming data */
    MPIR_Type_get_true_extent_impl(datatype, &true_lb, &true_extent);
    MPIR_Datatype_get_extent_macro(datatype, extent);

    MPIR_Ensure_Aint_fits_in_pointer(count * MPL_MAX(extent, true_extent));
    MPIR_SCHED_CHKPMEM_MALLOC(tmp_buf, void *, count * (MPL_MAX(extent, true_extent)),
                              mpi_errno, "temporary buffer", MPL_MEM_BUFFER);

    /* adjust for potential negative lower bound in datatype */
    tmp_buf = (void *) ((char *) tmp_buf - true_lb);

    /* copy local data into recvbuf */
    if (sendbuf != MPI_IN_PLACE) {
        mpi_errno = MPIR_Sched_copy(sendbuf, count, datatype, recvbuf, count, datatype, s);
        if (mpi_errno)
            MPIR_ERR_POP(mpi_errno);
        MPIR_SCHED_BARRIER(s);
    }

    /* get nearest power-of-two less than or equal to comm_size */
    pof2 = comm_ptr->pof2;

    rem = comm_size - pof2;

    /* In the non-power-of-two case, all even-numbered
     * processes of rank < 2*rem send their data to
     * (rank+1). These even-numbered processes no longer
     * participate in the algorithm until the very end. The
     * remaining processes form a nice power-of-two. */
    if (rank < 2 * rem) {
        if (rank % 2 == 0) {    /* even */
            mpi_errno = MPIR_Sched_send(recvbuf, count, datatype, rank + 1, comm_ptr, s);
            if (mpi_errno)
                MPIR_ERR_POP(mpi_errno);
            MPIR_SCHED_BARRIER(s);

            /* temporarily set the rank to -1 so that this
             * process does not participate in recursive
             * doubling */
            newrank = -1;
        } else {        /* odd */
            mpi_errno = MPIR_Sched_recv(tmp_buf, count, datatype, rank - 1, comm_ptr, s);
            if (mpi_errno)
                MPIR_ERR_POP(mpi_errno);
            MPIR_SCHED_BARRIER(s);

            /* do the reduction on received data. since the
             * ordering is right, it doesn't matter whether
             * the operation is commutative or not. */
            mpi_errno = MPIR_Sched_reduce(tmp_buf, recvbuf, count, datatype, op, s);
            if (mpi_errno)
                MPIR_ERR_POP(mpi_errno);
            MPIR_SCHED_BARRIER(s);

            /* change the rank */
            newrank = rank / 2;
        }
    } else      /* rank >= 2*rem */
        newrank = rank - rem;

    if (newrank != -1) {
        /* for the reduce-scatter, calculate the count that
         * each process receives and the displacement within
         * the buffer */
        /* TODO I (goodell@) believe that these counts and displacements could be
         * calculated directly during the loop, rather than requiring a less-scalable
         * "2*pof2"-sized memory allocation */
        MPIR_CHKLMEM_MALLOC(cnts, int *, pof2 * sizeof(int), mpi_errno, "counts",
                            MPL_MEM_BUFFER);
        MPIR_CHKLMEM_MALLOC(disps, int *, pof2 * sizeof(int), mpi_errno, "displacements",
                            MPL_MEM_BUFFER);

        MPIR_Assert(count >= pof2);     /* the cnts calculations assume this */
        for (i = 0; i < (pof2 - 1); i++)
            cnts[i] = count / pof2;
        cnts[pof2 - 1] = count - (count / pof2) * (pof2 - 1);

        if (pof2)
            disps[0] = 0;
        for (i = 1; i < pof2; i++)
            disps[i] = disps[i - 1] + cnts[i - 1];

        mask = 0x1;
        send_idx = recv_idx = 0;
        last_idx = pof2;
        while (mask < pof2) {
            newdst = newrank ^ mask;
            /* find real rank of dest */
            dst = (newdst < rem) ? newdst * 2 + 1 : newdst + rem;

            send_cnt = recv_cnt = 0;
            if (newrank < newdst) {
                send_idx = recv_idx + pof2 / (mask * 2);
                for (i = send_idx; i < last_idx; i++)
                    send_cnt += cnts[i];
                for (i = recv_idx; i < send_idx; i++)
                    recv_cnt += cnts[i];
            } else {
                recv_idx = send_idx + pof2 / (mask * 2);
                for (i = send_idx; i < recv_idx; i++)
                    send_cnt += cnts[i];
                for (i = recv_idx; i < last_idx; i++)
                    recv_cnt += cnts[i];
            }

            /* Send data from recvbuf. Recv into tmp_buf */
            mpi_errno = MPIR_Sched_recv(((char *) tmp_buf + disps[recv_idx] * extent),
                                        recv_cnt, datatype, dst, comm_ptr, s);
            if (mpi_errno)
                MPIR_ERR_POP(mpi_errno);
            /* sendrecv, no barrier here */
            mpi_errno = MPIR_Sched_send(((char *) recvbuf + disps[send_idx] * extent),
                                        send_cnt, datatype, dst, comm_ptr, s);
            if (mpi_errno)
                MPIR_ERR_POP(mpi_errno);
            MPIR_SCHED_BARRIER(s);

            /* tmp_buf contains data received in this step.
             * recvbuf contains data accumulated so far */

            /* This algorithm is used only for predefined ops
             * and predefined ops are always commutative. */
            mpi_errno = MPIR_Sched_reduce(((char *) tmp_buf + disps[recv_idx] * extent),
                                          ((char *) recvbuf + disps[recv_idx] * extent),
                                          recv_cnt, datatype, op, s);
            if (mpi_errno)
                MPIR_ERR_POP(mpi_errno);
            MPIR_SCHED_BARRIER(s);

            /* update send_idx for next iteration */
            send_idx = recv_idx;
            mask <<= 1;

            /* update last_idx, but not in last iteration
             * because the value is needed in the allgather
             * step below. */
            if (mask < pof2)
                last_idx = recv_idx + pof2 / mask;
        }

        /* now do the allgather */

        mask >>= 1;
        while (mask > 0) {
            newdst = newrank ^ mask;
            /* find real rank of dest */
            dst = (newdst < rem) ? newdst * 2 + 1 : newdst + rem;

            send_cnt = recv_cnt = 0;
            if (newrank < newdst) {
                /* update last_idx except on first iteration */
                if (mask != pof2 / 2)
                    last_idx = last_idx + pof2 / (mask * 2);

                recv_idx = send_idx + pof2 / (mask * 2);
                for (i = send_idx; i < recv_idx; i++)
                    send_cnt += cnts[i];
                for (i = recv_idx; i < last_idx; i++)
                    recv_cnt += cnts[i];
            } else {
                recv_idx = send_idx - pof2 / (mask * 2);
                for (i = send_idx; i < last_idx; i++)
                    send_cnt += cnts[i];
                for (i = recv_idx; i < send_idx; i++)
                    recv_cnt += cnts[i];
            }

            mpi_errno = MPIR_Sched_recv(((char *) recvbuf + disps[recv_idx] * extent),
                                        recv_cnt, datatype, dst, comm_ptr, s);
            if (mpi_errno)
                MPIR_ERR_POP(mpi_errno);
            /* sendrecv, no barrier here */
            mpi_errno = MPIR_Sched_send(((char *) recvbuf + disps[send_idx] * extent),
                                        send_cnt, datatype, dst, comm_ptr, s);
            if (mpi_errno)
                MPIR_ERR_POP(mpi_errno);
            MPIR_SCHED_BARRIER(s);

            if (newrank > newdst)
                send_idx = recv_idx;

            mask >>= 1;
        }
    }

    /* In the non-power-of-two case, all odd-numbered
     * processes of rank < 2*rem send the result to
     * (rank-1), the ranks who didn't participate above. */
    if (rank < 2 * rem) {
        if (rank % 2)   /* odd */
            mpi_errno = MPIR_Sched_send(recvbuf, count, datatype, rank - 1, comm_ptr, s);
        else    /* even */
            mpi_errno = MPIR_Sched_recv(recvbuf, count, datatype, rank + 1, comm_ptr, s);
        if (mpi_errno)
            MPIR_ERR_POP(mpi_errno);
        MPIR_SCHED_BARRIER(s);
    }

    MPIR_SCHED_CHKPMEM_COMMIT(s);
  fn_exit:
    MPIR_CHKLMEM_FREEALL();
    return mpi_errno;
  fn_fail:
    MPIR_SCHED_CHKPMEM_REAP(s);
    goto fn_exit;
}
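/*
 * A standalone sketch (illustrative only, not MPICH code) of the
 * reduce-scatter index walk in the function above: at each step the live
 * block range halves, and a process keeps either the lower half (when
 * newrank < newdst) or the upper half (when newrank > newdst), so after
 * log2(pof2) steps each process owns exactly one fully reduced block.
 * pof2, count, and newrank are made-up example inputs.
 */
#include <stdio.h>

int main(void)
{
    int pof2 = 8, count = 16, newrank = 5;
    int cnts[8], disps[8];

    /* same partition as the function above: count/pof2 per block, with the
     * remainder folded into the last block */
    for (int i = 0; i < pof2 - 1; i++)
        cnts[i] = count / pof2;
    cnts[pof2 - 1] = count - (count / pof2) * (pof2 - 1);
    disps[0] = 0;
    for (int i = 1; i < pof2; i++)
        disps[i] = disps[i - 1] + cnts[i - 1];

    int send_idx = 0, recv_idx = 0, last_idx = pof2;
    int mask = 0x1;
    while (mask < pof2) {
        int newdst = newrank ^ mask;
        int lo, hi;
        if (newrank < newdst) {
            send_idx = recv_idx + pof2 / (mask * 2);
            lo = recv_idx;
            hi = send_idx;      /* keep and reduce the lower half */
        } else {
            recv_idx = send_idx + pof2 / (mask * 2);
            lo = recv_idx;
            hi = last_idx;      /* keep and reduce the upper half */
        }
        printf("mask %d: reduce blocks [%d,%d) starting at element %d\n",
               mask, lo, hi, disps[lo]);
        send_idx = recv_idx;
        mask <<= 1;
        if (mask < pof2)
            last_idx = recv_idx + pof2 / mask;
    }
    /* expect: newrank 5 ends up owning block 5 */
    printf("newrank %d owns block %d (disp %d, cnt %d)\n",
           newrank, recv_idx, disps[recv_idx], cnts[recv_idx]);
    return 0;
}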