/*
 * MPIR_Iexscan - append the operations of a nonblocking exclusive scan to
 * the schedule `s`.
 *
 * The exchange pattern is recursive doubling: in the step with bit `mask`,
 * this process exchanges its running partial result with rank (rank ^ mask),
 * provided that partner exists in the communicator.
 *
 * Parameters:
 *   sendbuf   - local contribution, or MPI_IN_PLACE to take it from recvbuf
 *   recvbuf   - result buffer; only written on ranks that receive data from
 *               a lower-ranked partner, so it is never written on rank 0
 *   count     - number of elements; when 0 nothing is scheduled
 *   datatype  - datatype of the elements
 *   op        - reduction operation; both commutative and non-commutative
 *               operations are handled
 *   comm_ptr  - communicator (local_size and rank are read from it)
 *   s         - schedule to which the operations are appended
 *
 * Returns MPI_SUCCESS, or the error code that caused the jump to fn_fail.
 *
 * Two scratch buffers are allocated:
 *   partial_scan - reduction over every rank this process has exchanged
 *                  with so far, including itself; this is what gets sent
 *   tmp_buf      - data received from the partner in the current step
 * NOTE(review): MPIR_SCHED_CHKPMEM_COMMIT presumably hands ownership of the
 * scratch buffers to the schedule and MPIR_SCHED_CHKPMEM_REAP releases them
 * on failure -- confirm against the macro definitions.
 */
int MPIR_Iexscan(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, MPI_Op op, MPID_Comm *comm_ptr, MPID_Sched_t s)
{
    int mpi_errno = MPI_SUCCESS;
    int rank, comm_size;
    int mask, dst, is_commutative, flag;
    MPI_Aint true_extent, true_lb, extent;
    void *partial_scan, *tmp_buf;
    MPIR_SCHED_CHKPMEM_DECL(2);

    if (count == 0)
        goto fn_exit;

    comm_size = comm_ptr->local_size;
    rank = comm_ptr->rank;

    is_commutative = MPIR_Op_is_commutative(op);

    /* need to allocate temporary buffer to store partial scan */
    MPIR_Type_get_true_extent_impl(datatype, &true_lb, &true_extent);
    MPID_Datatype_get_extent_macro(datatype, extent);

    MPIR_SCHED_CHKPMEM_MALLOC(partial_scan, void *, (count*(MPIR_MAX(true_extent,extent))), mpi_errno, "partial_scan");
    /* adjust for potential negative lower bound in datatype */
    partial_scan = (void *)((char*)partial_scan - true_lb);

    /* need to allocate temporary buffer to store incoming data */
    MPIR_SCHED_CHKPMEM_MALLOC(tmp_buf, void *, (count*(MPIR_MAX(true_extent,extent))), mpi_errno, "tmp_buf");
    /* adjust for potential negative lower bound in datatype */
    tmp_buf = (void *)((char*)tmp_buf - true_lb);

    /* seed partial_scan with the local contribution */
    mpi_errno = MPID_Sched_copy((sendbuf == MPI_IN_PLACE ? recvbuf : sendbuf), count, datatype,
                                partial_scan, count, datatype, s);
    if (mpi_errno) MPIU_ERR_POP(mpi_errno);
    /* The first send below reads partial_scan, so the copy that fills it
     * must be ordered before that send.  Every other dependent pair of
     * operations in this schedule is already separated by a barrier. */
    MPID_SCHED_BARRIER(s);

    /* flag records whether recvbuf has already received its first value */
    flag = 0;
    mask = 0x1;
    while (mask < comm_size) {
        dst = rank ^ mask;
        if (dst < comm_size) {
            /* Send partial_scan to dst. Recv into tmp_buf */
            mpi_errno = MPID_Sched_send(partial_scan, count, datatype, dst, comm_ptr, s);
            if (mpi_errno) MPIU_ERR_POP(mpi_errno);
            /* sendrecv, no barrier here */
            mpi_errno = MPID_Sched_recv(tmp_buf, count, datatype, dst, comm_ptr, s);
            if (mpi_errno) MPIU_ERR_POP(mpi_errno);
            MPID_SCHED_BARRIER(s);

            if (rank > dst) {
                /* data came from lower ranks: fold it into the running
                 * partial result (tmp_buf is the left operand) */
                mpi_errno = MPID_Sched_reduce(tmp_buf, partial_scan, count, datatype, op, s);
                if (mpi_errno) MPIU_ERR_POP(mpi_errno);
                MPID_SCHED_BARRIER(s);

                /* On rank 0, recvbuf is not defined.  For sendbuf==MPI_IN_PLACE
                   recvbuf must not change (per MPI-2.2).
                   On rank 1, recvbuf is to be set equal to the value
                   in sendbuf on rank 0.
                   On others, recvbuf is the scan of values in the
                   sendbufs on lower ranks. */
                /* (rank > dst >= 0 already implies rank != 0 here; the
                 * check is kept as an explicit guard) */
                if (rank != 0) {
                    if (flag == 0) {
                        /* first data from lower ranks: simply copy it
                         * into recvbuf */
                        mpi_errno = MPID_Sched_copy(tmp_buf, count, datatype,
                                                    recvbuf, count, datatype, s);
                        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
                        MPID_SCHED_BARRIER(s);

                        flag = 1;
                    }
                    else {
                        mpi_errno = MPID_Sched_reduce(tmp_buf, recvbuf, count, datatype, op, s);
                        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
                        MPID_SCHED_BARRIER(s);
                    }
                }
            }
            else {
                /* data came from higher ranks: it only contributes to the
                 * partial result that is forwarded, never to recvbuf */
                if (is_commutative) {
                    mpi_errno = MPID_Sched_reduce(tmp_buf, partial_scan, count, datatype, op, s);
                    if (mpi_errno) MPIU_ERR_POP(mpi_errno);
                    MPID_SCHED_BARRIER(s);
                }
                else {
                    /* keep operand order: the local partial result is the
                     * left operand, so reduce into tmp_buf and copy back */
                    mpi_errno = MPID_Sched_reduce(partial_scan, tmp_buf, count, datatype, op, s);
                    if (mpi_errno) MPIU_ERR_POP(mpi_errno);
                    MPID_SCHED_BARRIER(s);

                    mpi_errno = MPID_Sched_copy(tmp_buf, count, datatype,
                                                partial_scan, count, datatype, s);
                    if (mpi_errno) MPIU_ERR_POP(mpi_errno);
                    MPID_SCHED_BARRIER(s);
                }
            }
        }
        mask <<= 1;
    }

    MPIR_SCHED_CHKPMEM_COMMIT(s);
fn_exit:
    return mpi_errno;
fn_fail:
    MPIR_SCHED_CHKPMEM_REAP(s);
    goto fn_exit;
}
int MPIR_Iallreduce_redscat_allgather(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, MPI_Op op, MPID_Comm *comm_ptr, MPID_Sched_t s) { int mpi_errno = MPI_SUCCESS; int comm_size, rank, newrank, pof2, rem; int i, send_idx, recv_idx, last_idx, mask, newdst, dst, send_cnt, recv_cnt; MPI_Aint true_lb, true_extent, extent; void *tmp_buf = NULL; int *cnts, *disps; MPIR_SCHED_CHKPMEM_DECL(1); MPIU_CHKLMEM_DECL(2); /* we only support builtin datatypes for now, breaking up user types to do * the reduce-scatter is tricky */ MPIU_Assert(HANDLE_GET_KIND(op) == HANDLE_KIND_BUILTIN); comm_size = comm_ptr->local_size; rank = comm_ptr->rank; /* need to allocate temporary buffer to store incoming data*/ MPIR_Type_get_true_extent_impl(datatype, &true_lb, &true_extent); MPID_Datatype_get_extent_macro(datatype, extent); MPID_Ensure_Aint_fits_in_pointer(count * MPIR_MAX(extent, true_extent)); MPIR_SCHED_CHKPMEM_MALLOC(tmp_buf, void *, count*(MPIR_MAX(extent,true_extent)), mpi_errno, "temporary buffer"); /* adjust for potential negative lower bound in datatype */ tmp_buf = (void *)((char*)tmp_buf - true_lb); /* copy local data into recvbuf */ if (sendbuf != MPI_IN_PLACE) { mpi_errno = MPID_Sched_copy(sendbuf, count, datatype, recvbuf, count, datatype, s); if (mpi_errno) MPIU_ERR_POP(mpi_errno); MPID_SCHED_BARRIER(s); } /* find nearest power-of-two less than or equal to comm_size */ pof2 = 1; while (pof2 <= comm_size) pof2 <<= 1; pof2 >>=1; rem = comm_size - pof2; /* In the non-power-of-two case, all even-numbered processes of rank < 2*rem send their data to (rank+1). These even-numbered processes no longer participate in the algorithm until the very end. The remaining processes form a nice power-of-two. 
*/ if (rank < 2*rem) { if (rank % 2 == 0) { /* even */ mpi_errno = MPID_Sched_send(recvbuf, count, datatype, rank+1, comm_ptr, s); if (mpi_errno) MPIU_ERR_POP(mpi_errno); MPID_SCHED_BARRIER(s); /* temporarily set the rank to -1 so that this process does not pariticipate in recursive doubling */ newrank = -1; } else { /* odd */ mpi_errno = MPID_Sched_recv(tmp_buf, count, datatype, rank-1, comm_ptr, s); if (mpi_errno) MPIU_ERR_POP(mpi_errno); MPID_SCHED_BARRIER(s); /* do the reduction on received data. since the ordering is right, it doesn't matter whether the operation is commutative or not. */ mpi_errno = MPID_Sched_reduce(tmp_buf, recvbuf, count, datatype, op, s); if (mpi_errno) MPIU_ERR_POP(mpi_errno); MPID_SCHED_BARRIER(s); /* change the rank */ newrank = rank / 2; } } else /* rank >= 2*rem */ newrank = rank - rem; if (newrank != -1) { /* for the reduce-scatter, calculate the count that each process receives and the displacement within the buffer */ /* TODO I (goodell@) believe that these counts and displacements could be * calculated directly during the loop, rather than requiring a less-scalable * "2*pof2"-sized memory allocation */ MPIU_CHKLMEM_MALLOC(cnts, int *, pof2*sizeof(int), mpi_errno, "counts"); MPIU_CHKLMEM_MALLOC(disps, int *, pof2*sizeof(int), mpi_errno, "displacements"); MPIU_Assert(count >= pof2); /* the cnts calculations assume this */ for (i=0; i<(pof2-1); i++) cnts[i] = count/pof2; cnts[pof2-1] = count - (count/pof2)*(pof2-1); disps[0] = 0; for (i=1; i<pof2; i++) disps[i] = disps[i-1] + cnts[i-1]; mask = 0x1; send_idx = recv_idx = 0; last_idx = pof2; while (mask < pof2) { newdst = newrank ^ mask; /* find real rank of dest */ dst = (newdst < rem) ? 
newdst*2 + 1 : newdst + rem; send_cnt = recv_cnt = 0; if (newrank < newdst) { send_idx = recv_idx + pof2/(mask*2); for (i=send_idx; i<last_idx; i++) send_cnt += cnts[i]; for (i=recv_idx; i<send_idx; i++) recv_cnt += cnts[i]; } else { recv_idx = send_idx + pof2/(mask*2); for (i=send_idx; i<recv_idx; i++) send_cnt += cnts[i]; for (i=recv_idx; i<last_idx; i++) recv_cnt += cnts[i]; } /* Send data from recvbuf. Recv into tmp_buf */ mpi_errno = MPID_Sched_recv(((char *)tmp_buf + disps[recv_idx]*extent), recv_cnt, datatype, dst, comm_ptr, s); if (mpi_errno) MPIU_ERR_POP(mpi_errno); /* sendrecv, no barrier here */ mpi_errno = MPID_Sched_send(((char *)recvbuf + disps[send_idx]*extent), send_cnt, datatype, dst, comm_ptr, s); if (mpi_errno) MPIU_ERR_POP(mpi_errno); MPID_SCHED_BARRIER(s); /* tmp_buf contains data received in this step. recvbuf contains data accumulated so far */ /* This algorithm is used only for predefined ops and predefined ops are always commutative. */ mpi_errno = MPID_Sched_reduce(((char *)tmp_buf + disps[recv_idx]*extent), ((char *)recvbuf + disps[recv_idx]*extent), recv_cnt, datatype, op, s); if (mpi_errno) MPIU_ERR_POP(mpi_errno); MPID_SCHED_BARRIER(s); /* update send_idx for next iteration */ send_idx = recv_idx; mask <<= 1; /* update last_idx, but not in last iteration because the value is needed in the allgather step below. */ if (mask < pof2) last_idx = recv_idx + pof2/mask; } /* now do the allgather */ mask >>= 1; while (mask > 0) { newdst = newrank ^ mask; /* find real rank of dest */ dst = (newdst < rem) ? 
newdst*2 + 1 : newdst + rem; send_cnt = recv_cnt = 0; if (newrank < newdst) { /* update last_idx except on first iteration */ if (mask != pof2/2) last_idx = last_idx + pof2/(mask*2); recv_idx = send_idx + pof2/(mask*2); for (i=send_idx; i<recv_idx; i++) send_cnt += cnts[i]; for (i=recv_idx; i<last_idx; i++) recv_cnt += cnts[i]; } else { recv_idx = send_idx - pof2/(mask*2); for (i=send_idx; i<last_idx; i++) send_cnt += cnts[i]; for (i=recv_idx; i<send_idx; i++) recv_cnt += cnts[i]; } mpi_errno = MPID_Sched_recv(((char *)recvbuf + disps[recv_idx]*extent), recv_cnt, datatype, dst, comm_ptr, s); if (mpi_errno) MPIU_ERR_POP(mpi_errno); /* sendrecv, no barrier here */ mpi_errno = MPID_Sched_send(((char *)recvbuf + disps[send_idx]*extent), send_cnt, datatype, dst, comm_ptr, s); if (mpi_errno) MPIU_ERR_POP(mpi_errno); MPID_SCHED_BARRIER(s); if (newrank > newdst) send_idx = recv_idx; mask >>= 1; } }
int MPIR_Ireduce_scatter_rec_hlv(const void *sendbuf, void *recvbuf, const int recvcounts[], MPI_Datatype datatype, MPI_Op op, MPID_Comm *comm_ptr, MPID_Sched_t s) { int mpi_errno = MPI_SUCCESS; int rank, comm_size, i; MPI_Aint extent, true_extent, true_lb; int *disps; void *tmp_recvbuf, *tmp_results; int type_size ATTRIBUTE((unused)), total_count, dst; int mask; int *newcnts, *newdisps, rem, newdst, send_idx, recv_idx, last_idx, send_cnt, recv_cnt; int pof2, old_i, newrank; MPIR_SCHED_CHKPMEM_DECL(5); comm_size = comm_ptr->local_size; rank = comm_ptr->rank; MPID_Datatype_get_extent_macro(datatype, extent); MPIR_Type_get_true_extent_impl(datatype, &true_lb, &true_extent); MPIU_Assert(MPIR_Op_is_commutative(op)); MPIR_SCHED_CHKPMEM_MALLOC(disps, int *, comm_size * sizeof(int), mpi_errno, "disps"); total_count = 0; for (i=0; i<comm_size; i++) { disps[i] = total_count; total_count += recvcounts[i]; } if (total_count == 0) { goto fn_exit; } MPID_Datatype_get_size_macro(datatype, type_size); /* allocate temp. 
buffer to receive incoming data */ MPIR_SCHED_CHKPMEM_MALLOC(tmp_recvbuf, void *, total_count*(MPL_MAX(true_extent,extent)), mpi_errno, "tmp_recvbuf"); /* adjust for potential negative lower bound in datatype */ tmp_recvbuf = (void *)((char*)tmp_recvbuf - true_lb); /* need to allocate another temporary buffer to accumulate results because recvbuf may not be big enough */ MPIR_SCHED_CHKPMEM_MALLOC(tmp_results, void *, total_count*(MPL_MAX(true_extent,extent)), mpi_errno, "tmp_results"); /* adjust for potential negative lower bound in datatype */ tmp_results = (void *)((char*)tmp_results - true_lb); /* copy sendbuf into tmp_results */ if (sendbuf != MPI_IN_PLACE) mpi_errno = MPID_Sched_copy(sendbuf, total_count, datatype, tmp_results, total_count, datatype, s); else mpi_errno = MPID_Sched_copy(recvbuf, total_count, datatype, tmp_results, total_count, datatype, s); if (mpi_errno) MPIR_ERR_POP(mpi_errno); MPID_SCHED_BARRIER(s); pof2 = 1; while (pof2 <= comm_size) pof2 <<= 1; pof2 >>=1; rem = comm_size - pof2; /* In the non-power-of-two case, all even-numbered processes of rank < 2*rem send their data to (rank+1). These even-numbered processes no longer participate in the algorithm until the very end. The remaining processes form a nice power-of-two. */ if (rank < 2*rem) { if (rank % 2 == 0) { /* even */ mpi_errno = MPID_Sched_send(tmp_results, total_count, datatype, rank+1, comm_ptr, s); if (mpi_errno) MPIR_ERR_POP(mpi_errno); MPID_SCHED_BARRIER(s); /* temporarily set the rank to -1 so that this process does not pariticipate in recursive doubling */ newrank = -1; } else { /* odd */ mpi_errno = MPID_Sched_recv(tmp_recvbuf, total_count, datatype, rank-1, comm_ptr, s); if (mpi_errno) MPIR_ERR_POP(mpi_errno); MPID_SCHED_BARRIER(s); /* do the reduction on received data. since the ordering is right, it doesn't matter whether the operation is commutative or not. 
*/ mpi_errno = MPID_Sched_reduce(tmp_recvbuf, tmp_results, total_count, datatype, op, s); if (mpi_errno) MPIR_ERR_POP(mpi_errno); MPID_SCHED_BARRIER(s); /* change the rank */ newrank = rank / 2; } } else /* rank >= 2*rem */ newrank = rank - rem; if (newrank != -1) { /* recalculate the recvcounts and disps arrays because the even-numbered processes who no longer participate will have their result calculated by the process to their right (rank+1). */ MPIR_SCHED_CHKPMEM_MALLOC(newcnts, int *, pof2*sizeof(int), mpi_errno, "newcnts"); MPIR_SCHED_CHKPMEM_MALLOC(newdisps, int *, pof2*sizeof(int), mpi_errno, "newdisps"); for (i = 0; i < pof2; i++) { /* what does i map to in the old ranking? */ old_i = (i < rem) ? i*2 + 1 : i + rem; if (old_i < 2*rem) { /* This process has to also do its left neighbor's work */ newcnts[i] = recvcounts[old_i] + recvcounts[old_i-1]; } else newcnts[i] = recvcounts[old_i]; } newdisps[0] = 0; for (i=1; i<pof2; i++) newdisps[i] = newdisps[i-1] + newcnts[i-1]; mask = pof2 >> 1; send_idx = recv_idx = 0; last_idx = pof2; while (mask > 0) { newdst = newrank ^ mask; /* find real rank of dest */ dst = (newdst < rem) ? newdst*2 + 1 : newdst + rem; send_cnt = recv_cnt = 0; if (newrank < newdst) { send_idx = recv_idx + mask; for (i=send_idx; i<last_idx; i++) send_cnt += newcnts[i]; for (i=recv_idx; i<send_idx; i++) recv_cnt += newcnts[i]; } else { recv_idx = send_idx + mask; for (i=send_idx; i<recv_idx; i++) send_cnt += newcnts[i]; for (i=recv_idx; i<last_idx; i++) recv_cnt += newcnts[i]; } /* Send data from tmp_results. Recv into tmp_recvbuf */ { /* avoid sending and receiving pointless 0-byte messages */ int send_dst = (send_cnt ? dst : MPI_PROC_NULL); int recv_dst = (recv_cnt ? 
dst : MPI_PROC_NULL); mpi_errno = MPID_Sched_send(((char *)tmp_results + newdisps[send_idx]*extent), send_cnt, datatype, send_dst, comm_ptr, s); if (mpi_errno) MPIR_ERR_POP(mpi_errno); mpi_errno = MPID_Sched_recv(((char *) tmp_recvbuf + newdisps[recv_idx]*extent), recv_cnt, datatype, recv_dst, comm_ptr, s); if (mpi_errno) MPIR_ERR_POP(mpi_errno); MPID_SCHED_BARRIER(s); } /* tmp_recvbuf contains data received in this step. tmp_results contains data accumulated so far */ if (recv_cnt) { mpi_errno = MPID_Sched_reduce(((char *)tmp_recvbuf + newdisps[recv_idx]*extent), ((char *)tmp_results + newdisps[recv_idx]*extent), recv_cnt, datatype, op, s); MPID_SCHED_BARRIER(s); } /* update send_idx for next iteration */ send_idx = recv_idx; last_idx = recv_idx + mask; mask >>= 1; } /* copy this process's result from tmp_results to recvbuf */ if (recvcounts[rank]) { mpi_errno = MPID_Sched_copy(((char *)tmp_results + disps[rank]*extent), recvcounts[rank], datatype, recvbuf, recvcounts[rank], datatype, s); if (mpi_errno) MPIR_ERR_POP(mpi_errno); MPID_SCHED_BARRIER(s); } }