/* Algorithm: Short Linear Gather * * This linear gather algorithm is tuned for short messages. The remote group * does a local intracommunicator gather to rank 0. Rank 0 then sends data to * root. * * Cost: (lgp+1).alpha + n.((p-1)/p).beta + n.beta */ int MPIR_Igather_sched_inter_short(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf, int recvcount, MPI_Datatype recvtype, int root, MPIR_Comm * comm_ptr, MPIR_Sched_t s) { int mpi_errno = MPI_SUCCESS; int rank; MPI_Aint local_size, remote_size; MPIR_Comm *newcomm_ptr = NULL; MPIR_SCHED_CHKPMEM_DECL(1); remote_size = comm_ptr->remote_size; local_size = comm_ptr->local_size; if (root == MPI_ROOT) { /* root receives data from rank 0 on remote group */ mpi_errno = MPIR_Sched_recv(recvbuf, recvcount * remote_size, recvtype, 0, comm_ptr, s); if (mpi_errno) MPIR_ERR_POP(mpi_errno); } else { /* remote group. Rank 0 allocates temporary buffer, does * local intracommunicator gather, and then sends the data * to root. */ MPI_Aint sendtype_sz; void *tmp_buf = NULL; rank = comm_ptr->rank; if (rank == 0) { MPIR_Datatype_get_size_macro(sendtype, sendtype_sz); MPIR_SCHED_CHKPMEM_MALLOC(tmp_buf, void *, sendcount * local_size * sendtype_sz, mpi_errno, "tmp_buf", MPL_MEM_BUFFER); } else {
/* Algorithm: Ring * * In the first step, each process i sends its contribution to process * i+1 and receives the contribution from process i-1 (with * wrap-around). From the second step onwards, each process i * forwards to process i+1 the data it received from process i-1 in * the previous step. This takes a total of p-1 steps. * * Cost = (p-1).alpha + n.((p-1)/p).beta * * This algorithm is preferred to recursive doubling for long messages * because we find that this communication pattern (nearest neighbor) * performs twice as fast as recursive doubling for long messages (on * Myrinet and IBM SP). */ int MPIR_Iallgather_sched_intra_ring(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf, int recvcount, MPI_Datatype recvtype, MPIR_Comm * comm_ptr, MPIR_Sched_t s) { int mpi_errno = MPI_SUCCESS; int rank, comm_size; int i, j, jnext, left, right; MPI_Aint recvtype_extent; comm_size = comm_ptr->local_size; rank = comm_ptr->rank; MPIR_Datatype_get_extent_macro(recvtype, recvtype_extent); /* First, load the "local" version in the recvbuf. */ if (sendbuf != MPI_IN_PLACE) { mpi_errno = MPIR_Sched_copy(sendbuf, sendcount, sendtype, ((char *) recvbuf + rank * recvcount * recvtype_extent), recvcount, recvtype, s); if (mpi_errno) MPIR_ERR_POP(mpi_errno); MPIR_SCHED_BARRIER(s); } /* Now, send left to right. This fills in the receive area in * reverse order. */ left = (comm_size + rank - 1) % comm_size; right = (rank + 1) % comm_size; j = rank; jnext = left; for (i = 1; i < comm_size; i++) { mpi_errno = MPIR_Sched_send(((char *) recvbuf + j * recvcount * recvtype_extent), recvcount, recvtype, right, comm_ptr, s); if (mpi_errno) MPIR_ERR_POP(mpi_errno); /* concurrent, no barrier here */ mpi_errno = MPIR_Sched_recv(((char *) recvbuf + jnext * recvcount * recvtype_extent), recvcount, recvtype, left, comm_ptr, s); if (mpi_errno) MPIR_ERR_POP(mpi_errno); MPIR_SCHED_BARRIER(s); j = jnext; jnext = (comm_size + jnext - 1) % comm_size; } fn_exit: return mpi_errno; fn_fail: goto fn_exit; }
int MPIR_Ineighbor_alltoallw_sched_allcomm_linear(const void *sendbuf, const int sendcounts[], const MPI_Aint sdispls[], const MPI_Datatype sendtypes[], void *recvbuf, const int recvcounts[], const MPI_Aint rdispls[], const MPI_Datatype recvtypes[], MPIR_Comm * comm_ptr, MPIR_Sched_t s) { int mpi_errno = MPI_SUCCESS; int indegree, outdegree, weighted; int k, l; int *srcs, *dsts; MPIR_CHKLMEM_DECL(2); mpi_errno = MPIR_Topo_canon_nhb_count(comm_ptr, &indegree, &outdegree, &weighted); if (mpi_errno) MPIR_ERR_POP(mpi_errno); MPIR_CHKLMEM_MALLOC(srcs, int *, indegree * sizeof(int), mpi_errno, "srcs", MPL_MEM_COMM); MPIR_CHKLMEM_MALLOC(dsts, int *, outdegree * sizeof(int), mpi_errno, "dsts", MPL_MEM_COMM); mpi_errno = MPIR_Topo_canon_nhb(comm_ptr, indegree, srcs, MPI_UNWEIGHTED, outdegree, dsts, MPI_UNWEIGHTED); if (mpi_errno) MPIR_ERR_POP(mpi_errno); for (k = 0; k < outdegree; ++k) { char *sb; MPIR_Ensure_Aint_fits_in_pointer(MPIR_VOID_PTR_CAST_TO_MPI_AINT sendbuf + sdispls[k]); sb = ((char *) sendbuf) + sdispls[k]; mpi_errno = MPIR_Sched_send(sb, sendcounts[k], sendtypes[k], dsts[k], comm_ptr, s); if (mpi_errno) MPIR_ERR_POP(mpi_errno); } for (l = 0; l < indegree; ++l) { char *rb; MPIR_Ensure_Aint_fits_in_pointer(MPIR_VOID_PTR_CAST_TO_MPI_AINT recvbuf + rdispls[l]); rb = ((char *) recvbuf) + rdispls[l]; mpi_errno = MPIR_Sched_recv(rb, recvcounts[l], recvtypes[l], srcs[l], comm_ptr, s); if (mpi_errno) MPIR_ERR_POP(mpi_errno); } MPIR_SCHED_BARRIER(s); fn_exit: MPIR_CHKLMEM_FREEALL(); return mpi_errno; fn_fail: goto fn_exit; }
int MPIR_Ineighbor_allgatherv_sched_allcomm_linear(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf, const int recvcounts[], const int displs[], MPI_Datatype recvtype, MPIR_Comm * comm_ptr, MPIR_Sched_t s) { int mpi_errno = MPI_SUCCESS; int indegree, outdegree, weighted; int k, l; int *srcs, *dsts; MPI_Aint recvtype_extent; MPIR_CHKLMEM_DECL(2); MPIR_Datatype_get_extent_macro(recvtype, recvtype_extent); mpi_errno = MPIR_Topo_canon_nhb_count(comm_ptr, &indegree, &outdegree, &weighted); if (mpi_errno) MPIR_ERR_POP(mpi_errno); MPIR_CHKLMEM_MALLOC(srcs, int *, indegree * sizeof(int), mpi_errno, "srcs", MPL_MEM_COMM); MPIR_CHKLMEM_MALLOC(dsts, int *, outdegree * sizeof(int), mpi_errno, "dsts", MPL_MEM_COMM); mpi_errno = MPIR_Topo_canon_nhb(comm_ptr, indegree, srcs, MPI_UNWEIGHTED, outdegree, dsts, MPI_UNWEIGHTED); if (mpi_errno) MPIR_ERR_POP(mpi_errno); for (k = 0; k < outdegree; ++k) { mpi_errno = MPIR_Sched_send(sendbuf, sendcount, sendtype, dsts[k], comm_ptr, s); if (mpi_errno) MPIR_ERR_POP(mpi_errno); } for (l = 0; l < indegree; ++l) { char *rb = ((char *) recvbuf) + displs[l] * recvtype_extent; mpi_errno = MPIR_Sched_recv(rb, recvcounts[l], recvtype, srcs[l], comm_ptr, s); if (mpi_errno) MPIR_ERR_POP(mpi_errno); } MPIR_SCHED_BARRIER(s); fn_exit: MPIR_CHKLMEM_FREEALL(); return mpi_errno; fn_fail: goto fn_exit; }
int MPIR_Ialltoall_sched_intra_permuted_sendrecv(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf, int recvcount, MPI_Datatype recvtype, MPIR_Comm *comm_ptr, MPIR_Sched_t s) { int mpi_errno = MPI_SUCCESS; int i; int rank, comm_size; int ii, ss, bblock, dst; MPI_Aint sendtype_extent, recvtype_extent; #ifdef HAVE_ERROR_CHECKING MPIR_Assert(sendbuf != MPI_IN_PLACE); /* we do not handle in-place */ #endif comm_size = comm_ptr->local_size; rank = comm_ptr->rank; MPIR_Datatype_get_extent_macro(sendtype, sendtype_extent); MPIR_Datatype_get_extent_macro(recvtype, recvtype_extent); bblock = MPIR_CVAR_ALLTOALL_THROTTLE; if (bblock == 0) bblock = comm_size; for (ii = 0; ii < comm_size; ii += bblock) { ss = comm_size-ii < bblock ? comm_size-ii : bblock; /* do the communication -- post ss sends and receives: */ for (i = 0; i < ss; i++) { dst = (rank+i+ii) % comm_size; mpi_errno = MPIR_Sched_recv(((char *)recvbuf + dst*recvcount*recvtype_extent), recvcount, recvtype, dst, comm_ptr, s); if (mpi_errno) MPIR_ERR_POP(mpi_errno); } for (i = 0; i < ss; i++) { dst = (rank-i-ii+comm_size) % comm_size; mpi_errno = MPIR_Sched_send(((char *)sendbuf + dst*sendcount*sendtype_extent), sendcount, sendtype, dst, comm_ptr, s); if (mpi_errno) MPIR_ERR_POP(mpi_errno); } /* force the (2*ss) sends/recvs above to complete before posting additional ops */ MPIR_SCHED_BARRIER(s); } fn_exit: return mpi_errno; fn_fail: goto fn_exit; }
int MPIR_Ibcast_sched_inter_flat(void *buffer, int count, MPI_Datatype datatype, int root, MPIR_Comm * comm_ptr, MPIR_Sched_t s) { int mpi_errno = MPI_SUCCESS; MPIR_Assert(comm_ptr->comm_kind == MPIR_COMM_KIND__INTERCOMM); /* Intercommunicator broadcast. * Root sends to rank 0 in remote group. Remote group does local * intracommunicator broadcast. */ if (root == MPI_PROC_NULL) { /* local processes other than root do nothing */ mpi_errno = MPI_SUCCESS; } else if (root == MPI_ROOT) { /* root sends to rank 0 on remote group and returns */ mpi_errno = MPIR_Sched_send(buffer, count, datatype, 0, comm_ptr, s); if (mpi_errno) MPIR_ERR_POP(mpi_errno); } else { /* remote group. rank 0 on remote group receives from root */ if (comm_ptr->rank == 0) { mpi_errno = MPIR_Sched_recv(buffer, count, datatype, root, comm_ptr, s); if (mpi_errno) MPIR_ERR_POP(mpi_errno); MPIR_SCHED_BARRIER(s); } if (comm_ptr->local_comm == NULL) { mpi_errno = MPII_Setup_intercomm_localcomm(comm_ptr); if (mpi_errno) MPIR_ERR_POP(mpi_errno); } /* now do the usual broadcast on this intracommunicator * with rank 0 as root. */ mpi_errno = MPIR_Ibcast_sched(buffer, count, datatype, root, comm_ptr->local_comm, s); if (mpi_errno) MPIR_ERR_POP(mpi_errno); } fn_exit: return mpi_errno; fn_fail: goto fn_exit; }
int MPIR_Iscatter_sched_inter_linear(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf, int recvcount, MPI_Datatype recvtype, int root, MPIR_Comm * comm_ptr, MPIR_Sched_t s) { int mpi_errno = MPI_SUCCESS; int remote_size; int i; MPI_Aint extent; if (root == MPI_PROC_NULL) { /* local processes other than root do nothing */ goto fn_exit; } remote_size = comm_ptr->remote_size; if (root == MPI_ROOT) { MPIR_Datatype_get_extent_macro(sendtype, extent); for (i = 0; i < remote_size; i++) { mpi_errno = MPIR_Sched_send(((char *) sendbuf + sendcount * i * extent), sendcount, sendtype, i, comm_ptr, s); if (mpi_errno) MPIR_ERR_POP(mpi_errno); } MPIR_SCHED_BARRIER(s); } else { mpi_errno = MPIR_Sched_recv(recvbuf, recvcount, recvtype, root, comm_ptr, s); if (mpi_errno) MPIR_ERR_POP(mpi_errno); MPIR_SCHED_BARRIER(s); } fn_exit: return mpi_errno; fn_fail: goto fn_exit; }
/* Algorithm: Recursive Doubling * * Restrictions: Intracommunicators Only * * We use the dissemination algorithm described in: * Debra Hensgen, Raphael Finkel, and Udi Manber, "Two Algorithms for * Barrier Synchronization," International Journal of Parallel * Programming, 17(1):1-17, 1988. * * It uses ceiling(lgp) steps. In step k, 0 <= k <= (ceiling(lgp)-1), * process i sends to process (i + 2^k) % p and receives from process * (i - 2^k + p) % p. */ int MPIR_Ibarrier_sched_intra_recursive_doubling(MPIR_Comm * comm_ptr, MPIR_Sched_t s) { int mpi_errno = MPI_SUCCESS; int size, rank, src, dst, mask; MPIR_Assert(comm_ptr->comm_kind == MPIR_COMM_KIND__INTRACOMM); size = comm_ptr->local_size; rank = comm_ptr->rank; /* Trivial barriers return immediately */ if (size == 1) goto fn_exit; mask = 0x1; while (mask < size) { dst = (rank + mask) % size; src = (rank - mask + size) % size; mpi_errno = MPIR_Sched_send(NULL, 0, MPI_BYTE, dst, comm_ptr, s); if (mpi_errno) MPIR_ERR_POP(mpi_errno); mpi_errno = MPIR_Sched_recv(NULL, 0, MPI_BYTE, src, comm_ptr, s); if (mpi_errno) MPIR_ERR_POP(mpi_errno); mpi_errno = MPIR_Sched_barrier(s); if (mpi_errno) MPIR_ERR_POP(mpi_errno); mask <<= 1; } fn_exit: return mpi_errno; fn_fail: goto fn_exit; }
static int sched_cb_gcn_bcast(MPIR_Comm * comm, int tag, void *state) { int mpi_errno = MPI_SUCCESS; struct gcn_state *st = state; if (st->gcn_cid_kind == MPIR_COMM_KIND__INTERCOMM) { if (st->comm_ptr_inter->rank == 0) { mpi_errno = MPIR_Sched_recv(st->ctx1, 1, MPIR_CONTEXT_ID_T_DATATYPE, 0, st->comm_ptr_inter, st->s); if (mpi_errno) MPIR_ERR_POP(mpi_errno); mpi_errno = MPIR_Sched_send(st->ctx0, 1, MPIR_CONTEXT_ID_T_DATATYPE, 0, st->comm_ptr_inter, st->s); if (mpi_errno) MPIR_ERR_POP(mpi_errno); MPIR_SCHED_BARRIER(st->s); } mpi_errno = st->comm_ptr->coll_fns->Ibcast_sched(st->ctx1, 1, MPIR_CONTEXT_ID_T_DATATYPE, 0, st->comm_ptr, st->s); if (mpi_errno) MPIR_ERR_POP(mpi_errno); MPIR_SCHED_BARRIER(st->s); } mpi_errno = MPIR_Sched_cb(&sched_cb_commit_comm, st, st->s); if (mpi_errno) MPIR_ERR_POP(mpi_errno); mpi_errno = MPIR_Sched_cb(&MPIR_Sched_cb_free_buf, st, st->s); if (mpi_errno) MPIR_ERR_POP(mpi_errno); fn_fail: return mpi_errno; }
/* Algorithm: Blocked Alltoallw * * Since each process sends/receives different amounts of data to every other * process, we don't know the total message size for all processes without * additional communication. Therefore we simply use the "middle of the road" * isend/irecv algorithm that works reasonably well in all cases. * * We post all irecvs and isends and then do a waitall. We scatter the order of * sources and destinations among the processes, so that all processes don't * try to send/recv to/from the same process at the same time. * * *** Modification: We post only a small number of isends and irecvs at a time * and wait on them as suggested by Tony Ladd. *** */ int MPIR_Ialltoallw_sched_intra_blocked(const void *sendbuf, const int sendcounts[], const int sdispls[], const MPI_Datatype sendtypes[], void *recvbuf, const int recvcounts[], const int rdispls[], const MPI_Datatype recvtypes[], MPIR_Comm * comm_ptr, MPIR_Sched_t s) { int mpi_errno = MPI_SUCCESS; int comm_size, i; int dst, rank; int ii, ss, bblock; int type_size; #ifdef HAVE_ERROR_CHECKING MPIR_Assert(sendbuf != MPI_IN_PLACE); #endif /* HAVE_ERROR_CHECKING */ comm_size = comm_ptr->local_size; rank = comm_ptr->rank; bblock = MPIR_CVAR_ALLTOALL_THROTTLE; if (bblock == 0) bblock = comm_size; /* post only bblock isends/irecvs at a time as suggested by Tony Ladd */ for (ii = 0; ii < comm_size; ii += bblock) { ss = comm_size - ii < bblock ? comm_size - ii : bblock; /* do the communication -- post ss sends and receives: */ for (i = 0; i < ss; i++) { dst = (rank + i + ii) % comm_size; if (recvcounts[dst]) { MPIR_Datatype_get_size_macro(recvtypes[dst], type_size); if (type_size) { mpi_errno = MPIR_Sched_recv((char *) recvbuf + rdispls[dst], recvcounts[dst], recvtypes[dst], dst, comm_ptr, s); if (mpi_errno) MPIR_ERR_POP(mpi_errno); } } } for (i = 0; i < ss; i++) { dst = (rank - i - ii + comm_size) % comm_size; if (sendcounts[dst]) { MPIR_Datatype_get_size_macro(sendtypes[dst], type_size); if (type_size) { mpi_errno = MPIR_Sched_send((char *) sendbuf + sdispls[dst], sendcounts[dst], sendtypes[dst], dst, comm_ptr, s); if (mpi_errno) MPIR_ERR_POP(mpi_errno); } } } /* force our block of sends/recvs to complete before starting the next block */ MPIR_SCHED_BARRIER(s); } fn_exit: return mpi_errno; fn_fail: goto fn_exit; }
int MPIR_Igatherv_sched_linear(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf, const int recvcounts[], const int displs[], MPI_Datatype recvtype, int root, MPIR_Comm *comm_ptr, MPIR_Sched_t s) { int mpi_errno = MPI_SUCCESS; int i; int comm_size, rank; MPI_Aint extent; int min_procs; rank = comm_ptr->rank; /* If rank == root, then I recv lots, otherwise I send */ if (((comm_ptr->comm_kind == MPIR_COMM_KIND__INTRACOMM) && (root == rank)) || ((comm_ptr->comm_kind == MPIR_COMM_KIND__INTERCOMM) && (root == MPI_ROOT))) { if (comm_ptr->comm_kind == MPIR_COMM_KIND__INTRACOMM) comm_size = comm_ptr->local_size; else comm_size = comm_ptr->remote_size; MPIR_Datatype_get_extent_macro(recvtype, extent); /* each node can make sure it is not going to overflow aint */ MPIR_Ensure_Aint_fits_in_pointer(MPIR_VOID_PTR_CAST_TO_MPI_AINT recvbuf + displs[rank] * extent); for (i = 0; i < comm_size; i++) { if (recvcounts[i]) { if ((comm_ptr->comm_kind == MPIR_COMM_KIND__INTRACOMM) && (i == rank)) { if (sendbuf != MPI_IN_PLACE) { mpi_errno = MPIR_Sched_copy(sendbuf, sendcount, sendtype, ((char *)recvbuf+displs[rank]*extent), recvcounts[rank], recvtype, s); if (mpi_errno) MPIR_ERR_POP(mpi_errno); } } else { mpi_errno = MPIR_Sched_recv(((char *)recvbuf+displs[i]*extent), recvcounts[i], recvtype, i, comm_ptr, s); if (mpi_errno) MPIR_ERR_POP(mpi_errno); } } } } else if (root != MPI_PROC_NULL) { /* non-root nodes, and in the intercomm. case, non-root nodes on remote side */ if (sendcount) { /* we want local size in both the intracomm and intercomm cases because the size of the root's group (group A in the standard) is irrelevant here. */ comm_size = comm_ptr->local_size; min_procs = MPIR_CVAR_GATHERV_INTER_SSEND_MIN_PROCS; if (min_procs == -1) min_procs = comm_size + 1; /* Disable ssend */ else if (min_procs == 0) /* backwards compatibility, use default value */ MPIR_CVAR_GET_DEFAULT_INT(GATHERV_INTER_SSEND_MIN_PROCS,&min_procs); if (comm_size >= min_procs) mpi_errno = MPIR_Sched_ssend(sendbuf, sendcount, sendtype, root, comm_ptr, s); else mpi_errno = MPIR_Sched_send(sendbuf, sendcount, sendtype, root, comm_ptr, s); if (mpi_errno) MPIR_ERR_POP(mpi_errno); } } fn_exit: return mpi_errno; fn_fail: goto fn_exit; }
int MPIR_Iscan_rec_dbl(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, MPI_Op op, MPIR_Comm *comm_ptr, MPIR_Sched_t s) { int mpi_errno = MPI_SUCCESS; MPI_Aint true_extent, true_lb, extent; int is_commutative; int mask, dst, rank, comm_size; void *partial_scan = NULL; void *tmp_buf = NULL; MPIR_SCHED_CHKPMEM_DECL(2); if (count == 0) goto fn_exit; comm_size = comm_ptr->local_size; rank = comm_ptr->rank; is_commutative = MPIR_Op_is_commutative(op); /* need to allocate temporary buffer to store partial scan*/ MPIR_Type_get_true_extent_impl(datatype, &true_lb, &true_extent); MPID_Datatype_get_extent_macro(datatype, extent); MPIR_SCHED_CHKPMEM_MALLOC(partial_scan, void *, count*(MPL_MAX(extent,true_extent)), mpi_errno, "partial_scan"); /* This eventually gets malloc()ed as a temp buffer, not added to * any user buffers */ MPIR_Ensure_Aint_fits_in_pointer(count * MPL_MAX(extent, true_extent)); /* adjust for potential negative lower bound in datatype */ partial_scan = (void *)((char*)partial_scan - true_lb); /* need to allocate temporary buffer to store incoming data*/ MPIR_SCHED_CHKPMEM_MALLOC(tmp_buf, void *, count*(MPL_MAX(extent,true_extent)), mpi_errno, "tmp_buf"); /* adjust for potential negative lower bound in datatype */ tmp_buf = (void *)((char*)tmp_buf - true_lb); /* Since this is an inclusive scan, copy local contribution into recvbuf. */ if (sendbuf != MPI_IN_PLACE) { mpi_errno = MPIR_Sched_copy(sendbuf, count, datatype, recvbuf, count, datatype, s); if (mpi_errno) MPIR_ERR_POP(mpi_errno); } if (sendbuf != MPI_IN_PLACE) mpi_errno = MPIR_Sched_copy(sendbuf, count, datatype, partial_scan, count, datatype, s); else mpi_errno = MPIR_Sched_copy(recvbuf, count, datatype, partial_scan, count, datatype, s); if (mpi_errno) MPIR_ERR_POP(mpi_errno); mask = 0x1; while (mask < comm_size) { dst = rank ^ mask; if (dst < comm_size) { /* Send partial_scan to dst. Recv into tmp_buf */ mpi_errno = MPIR_Sched_send(partial_scan, count, datatype, dst, comm_ptr, s); if (mpi_errno) MPIR_ERR_POP(mpi_errno); /* sendrecv, no barrier here */ mpi_errno = MPIR_Sched_recv(tmp_buf, count, datatype, dst, comm_ptr, s); if (mpi_errno) MPIR_ERR_POP(mpi_errno); MPIR_SCHED_BARRIER(s); if (rank > dst) { mpi_errno = MPIR_Sched_reduce(tmp_buf, partial_scan, count, datatype, op, s); if (mpi_errno) MPIR_ERR_POP(mpi_errno); mpi_errno = MPIR_Sched_reduce(tmp_buf, recvbuf, count, datatype, op, s); if (mpi_errno) MPIR_ERR_POP(mpi_errno); MPIR_SCHED_BARRIER(s); } else { if (is_commutative) { mpi_errno = MPIR_Sched_reduce(tmp_buf, partial_scan, count, datatype, op, s); if (mpi_errno) MPIR_ERR_POP(mpi_errno); MPIR_SCHED_BARRIER(s); } else { mpi_errno = MPIR_Sched_reduce(partial_scan, tmp_buf, count, datatype, op, s); if (mpi_errno) MPIR_ERR_POP(mpi_errno); MPIR_SCHED_BARRIER(s); mpi_errno = MPIR_Sched_copy(tmp_buf, count, datatype, partial_scan, count, datatype, s); if (mpi_errno) MPIR_ERR_POP(mpi_errno); MPIR_SCHED_BARRIER(s); } } } mask <<= 1; } MPIR_SCHED_CHKPMEM_COMMIT(s); fn_exit: return mpi_errno; fn_fail: MPIR_SCHED_CHKPMEM_REAP(s); goto fn_exit; }
int MPIR_Iscatterv_sched_allcomm_linear(const void *sendbuf, const int sendcounts[], const int displs[], MPI_Datatype sendtype, void *recvbuf, int recvcount, MPI_Datatype recvtype, int root, MPIR_Comm * comm_ptr, MPIR_Sched_t s) { int mpi_errno = MPI_SUCCESS; int rank, comm_size; MPI_Aint extent; int i; rank = comm_ptr->rank; /* If I'm the root, then scatter */ if (((comm_ptr->comm_kind == MPIR_COMM_KIND__INTRACOMM) && (root == rank)) || ((comm_ptr->comm_kind == MPIR_COMM_KIND__INTERCOMM) && (root == MPI_ROOT))) { if (comm_ptr->comm_kind == MPIR_COMM_KIND__INTRACOMM) comm_size = comm_ptr->local_size; else comm_size = comm_ptr->remote_size; MPIR_Datatype_get_extent_macro(sendtype, extent); /* We need a check to ensure extent will fit in a * pointer. That needs extent * (max count) but we can't get * that without looping over the input data. This is at least * a minimal sanity check. Maybe add a global var since we do * loop over sendcount[] in MPI_Scatterv before calling * this? */ MPIR_Ensure_Aint_fits_in_pointer(MPIR_VOID_PTR_CAST_TO_MPI_AINT sendbuf + extent); for (i = 0; i < comm_size; i++) { if (sendcounts[i]) { if ((comm_ptr->comm_kind == MPIR_COMM_KIND__INTRACOMM) && (i == rank)) { if (recvbuf != MPI_IN_PLACE) { mpi_errno = MPIR_Sched_copy(((char *) sendbuf + displs[rank] * extent), sendcounts[rank], sendtype, recvbuf, recvcount, recvtype, s); if (mpi_errno) MPIR_ERR_POP(mpi_errno); } } else { mpi_errno = MPIR_Sched_send(((char *) sendbuf + displs[i] * extent), sendcounts[i], sendtype, i, comm_ptr, s); if (mpi_errno) MPIR_ERR_POP(mpi_errno); } } } } else if (root != MPI_PROC_NULL) { /* non-root nodes, and in the intercomm. case, non-root nodes on remote side */ if (recvcount) { mpi_errno = MPIR_Sched_recv(recvbuf, recvcount, recvtype, root, comm_ptr, s); if (mpi_errno) MPIR_ERR_POP(mpi_errno); } } fn_exit: return mpi_errno; fn_fail: goto fn_exit; }
int MPIR_Ireduce_scatter_sched_intra_recursive_doubling(const void *sendbuf, void *recvbuf, const int recvcounts[], MPI_Datatype datatype, MPI_Op op, MPIR_Comm *comm_ptr, MPIR_Sched_t s) { int mpi_errno = MPI_SUCCESS; int rank, comm_size, i; MPI_Aint extent, true_extent, true_lb; int *disps; void *tmp_recvbuf, *tmp_results; int type_size ATTRIBUTE((unused)), dis[2], blklens[2], total_count, dst; int mask, dst_tree_root, my_tree_root, j, k; int received; MPI_Datatype sendtype, recvtype; int nprocs_completed, tmp_mask, tree_root, is_commutative; MPIR_SCHED_CHKPMEM_DECL(5); comm_size = comm_ptr->local_size; rank = comm_ptr->rank; MPIR_Datatype_get_extent_macro(datatype, extent); MPIR_Type_get_true_extent_impl(datatype, &true_lb, &true_extent); is_commutative = MPIR_Op_is_commutative(op); MPIR_SCHED_CHKPMEM_MALLOC(disps, int *, comm_size * sizeof(int), mpi_errno, "disps", MPL_MEM_BUFFER); total_count = 0; for (i=0; i<comm_size; i++) { disps[i] = total_count; total_count += recvcounts[i]; } if (total_count == 0) { goto fn_exit; } MPIR_Datatype_get_size_macro(datatype, type_size); /* total_count*extent eventually gets malloced. it isn't added to * a user-passed in buffer */ MPIR_Ensure_Aint_fits_in_pointer(total_count * MPL_MAX(true_extent, extent)); /* need to allocate temporary buffer to receive incoming data*/ MPIR_SCHED_CHKPMEM_MALLOC(tmp_recvbuf, void *, total_count*(MPL_MAX(true_extent,extent)), mpi_errno, "tmp_recvbuf", MPL_MEM_BUFFER); /* adjust for potential negative lower bound in datatype */ tmp_recvbuf = (void *)((char*)tmp_recvbuf - true_lb); /* need to allocate another temporary buffer to accumulate results */ MPIR_SCHED_CHKPMEM_MALLOC(tmp_results, void *, total_count*(MPL_MAX(true_extent,extent)), mpi_errno, "tmp_results", MPL_MEM_BUFFER); /* adjust for potential negative lower bound in datatype */ tmp_results = (void *)((char*)tmp_results - true_lb); /* copy sendbuf into tmp_results */ if (sendbuf != MPI_IN_PLACE) mpi_errno = MPIR_Sched_copy(sendbuf, total_count, datatype, tmp_results, total_count, datatype, s); else mpi_errno = MPIR_Sched_copy(recvbuf, total_count, datatype, tmp_results, total_count, datatype, s); if (mpi_errno) MPIR_ERR_POP(mpi_errno); MPIR_SCHED_BARRIER(s); mask = 0x1; i = 0; while (mask < comm_size) { dst = rank ^ mask; dst_tree_root = dst >> i; dst_tree_root <<= i; my_tree_root = rank >> i; my_tree_root <<= i; /* At step 1, processes exchange (n-n/p) amount of data; at step 2, (n-2n/p) amount of data; at step 3, (n-4n/p) amount of data, and so forth. We use derived datatypes for this. At each step, a process does not need to send data indexed from my_tree_root to my_tree_root+mask-1. Similarly, a process won't receive data indexed from dst_tree_root to dst_tree_root+mask-1. */ /* calculate sendtype */ blklens[0] = blklens[1] = 0; for (j=0; j<my_tree_root; j++) blklens[0] += recvcounts[j]; for (j=my_tree_root+mask; j<comm_size; j++) blklens[1] += recvcounts[j]; dis[0] = 0; dis[1] = blklens[0]; for (j=my_tree_root; (j<my_tree_root+mask) && (j<comm_size); j++) dis[1] += recvcounts[j]; mpi_errno = MPIR_Type_indexed_impl(2, blklens, dis, datatype, &sendtype); if (mpi_errno) MPIR_ERR_POP(mpi_errno); mpi_errno = MPIR_Type_commit_impl(&sendtype); if (mpi_errno) MPIR_ERR_POP(mpi_errno); /* calculate recvtype */ blklens[0] = blklens[1] = 0; for (j=0; j<dst_tree_root && j<comm_size; j++) blklens[0] += recvcounts[j]; for (j=dst_tree_root+mask; j<comm_size; j++) blklens[1] += recvcounts[j]; dis[0] = 0; dis[1] = blklens[0]; for (j=dst_tree_root; (j<dst_tree_root+mask) && (j<comm_size); j++) dis[1] += recvcounts[j]; mpi_errno = MPIR_Type_indexed_impl(2, blklens, dis, datatype, &recvtype); if (mpi_errno) MPIR_ERR_POP(mpi_errno); mpi_errno = MPIR_Type_commit_impl(&recvtype); if (mpi_errno) MPIR_ERR_POP(mpi_errno); received = 0; if (dst < comm_size) { /* tmp_results contains data to be sent in each step. Data is received in tmp_recvbuf and then accumulated into tmp_results. accumulation is done later below. */ mpi_errno = MPIR_Sched_send(tmp_results, 1, sendtype, dst, comm_ptr, s); if (mpi_errno) MPIR_ERR_POP(mpi_errno); mpi_errno = MPIR_Sched_recv(tmp_recvbuf, 1, recvtype, dst, comm_ptr, s); if (mpi_errno) MPIR_ERR_POP(mpi_errno); MPIR_SCHED_BARRIER(s); received = 1; } /* if some processes in this process's subtree in this step did not have any destination process to communicate with because of non-power-of-two, we need to send them the result. We use a logarithmic recursive-halfing algorithm for this. */ if (dst_tree_root + mask > comm_size) { nprocs_completed = comm_size - my_tree_root - mask; /* nprocs_completed is the number of processes in this subtree that have all the data. Send data to others in a tree fashion. First find root of current tree that is being divided into two. k is the number of least-significant bits in this process's rank that must be zeroed out to find the rank of the root */ j = mask; k = 0; while (j) { j >>= 1; k++; } k--; tmp_mask = mask >> 1; while (tmp_mask) { dst = rank ^ tmp_mask; tree_root = rank >> k; tree_root <<= k; /* send only if this proc has data and destination doesn't have data. at any step, multiple processes can send if they have the data */ if ((dst > rank) && (rank < tree_root + nprocs_completed) && (dst >= tree_root + nprocs_completed)) { /* send the current result */ mpi_errno = MPIR_Sched_send(tmp_recvbuf, 1, recvtype, dst, comm_ptr, s); if (mpi_errno) MPIR_ERR_POP(mpi_errno); MPIR_SCHED_BARRIER(s); } /* recv only if this proc. doesn't have data and sender has data */ else if ((dst < rank) && (dst < tree_root + nprocs_completed) && (rank >= tree_root + nprocs_completed)) { mpi_errno = MPIR_Sched_recv(tmp_recvbuf, 1, recvtype, dst, comm_ptr, s); if (mpi_errno) MPIR_ERR_POP(mpi_errno); MPIR_SCHED_BARRIER(s); received = 1; } tmp_mask >>= 1; k--; } } /* N.B. The following comment comes from the FT version of * MPI_Reduce_scatter. It does not currently apply to this code, but * will in the future when we update the NBC code to be fault-tolerant * in roughly the same fashion. [goodell@ 2011-03-03] */ /* The following reduction is done here instead of after the MPIC_Sendrecv or MPIC_Recv above. This is because to do it above, in the noncommutative case, we would need an extra temp buffer so as not to overwrite temp_recvbuf, because temp_recvbuf may have to be communicated to other processes in the non-power-of-two case. To avoid that extra allocation, we do the reduce here. */ if (received) { if (is_commutative || (dst_tree_root < my_tree_root)) { mpi_errno = MPIR_Sched_reduce(tmp_recvbuf, tmp_results, blklens[0], datatype, op, s); if (mpi_errno) MPIR_ERR_POP(mpi_errno); mpi_errno = MPIR_Sched_reduce(((char *)tmp_recvbuf + dis[1]*extent), ((char *)tmp_results + dis[1]*extent), blklens[1], datatype, op, s); if (mpi_errno) MPIR_ERR_POP(mpi_errno); MPIR_SCHED_BARRIER(s); } else { mpi_errno = MPIR_Sched_reduce(tmp_results, tmp_recvbuf, blklens[0], datatype, op, s); if (mpi_errno) MPIR_ERR_POP(mpi_errno); mpi_errno = MPIR_Sched_reduce(((char *)tmp_results + dis[1]*extent), ((char *)tmp_recvbuf + dis[1]*extent), blklens[1], datatype, op, s); if (mpi_errno) MPIR_ERR_POP(mpi_errno); MPIR_SCHED_BARRIER(s); /* copy result back into tmp_results */ mpi_errno = MPIR_Sched_copy(tmp_recvbuf, 1, recvtype, tmp_results, 1, recvtype, s); if (mpi_errno) MPIR_ERR_POP(mpi_errno); MPIR_SCHED_BARRIER(s); } } MPIR_Type_free_impl(&sendtype); MPIR_Type_free_impl(&recvtype); mask <<= 1; i++; }
int MPIR_Ireduce_scatter_sched_intra_recursive_halving(const void *sendbuf, void *recvbuf, const int recvcounts[], MPI_Datatype datatype, MPI_Op op, MPIR_Comm * comm_ptr, MPIR_Sched_t s) { int mpi_errno = MPI_SUCCESS; int rank, comm_size, i; MPI_Aint extent, true_extent, true_lb; int *disps; void *tmp_recvbuf, *tmp_results; int type_size ATTRIBUTE((unused)), total_count, dst; int mask; int *newcnts, *newdisps, rem, newdst, send_idx, recv_idx, last_idx, send_cnt, recv_cnt; int pof2, old_i, newrank; MPIR_SCHED_CHKPMEM_DECL(5); comm_size = comm_ptr->local_size; rank = comm_ptr->rank; MPIR_Datatype_get_extent_macro(datatype, extent); MPIR_Type_get_true_extent_impl(datatype, &true_lb, &true_extent); #ifdef HAVE_ERROR_CHECKING MPIR_Assert(MPIR_Op_is_commutative(op)); #endif MPIR_SCHED_CHKPMEM_MALLOC(disps, int *, comm_size * sizeof(int), mpi_errno, "disps", MPL_MEM_BUFFER); total_count = 0; for (i = 0; i < comm_size; i++) { disps[i] = total_count; total_count += recvcounts[i]; } if (total_count == 0) { goto fn_exit; } MPIR_Datatype_get_size_macro(datatype, type_size); /* allocate temp. buffer to receive incoming data */ MPIR_SCHED_CHKPMEM_MALLOC(tmp_recvbuf, void *, total_count * (MPL_MAX(true_extent, extent)), mpi_errno, "tmp_recvbuf", MPL_MEM_BUFFER); /* adjust for potential negative lower bound in datatype */ tmp_recvbuf = (void *) ((char *) tmp_recvbuf - true_lb); /* need to allocate another temporary buffer to accumulate * results because recvbuf may not be big enough */ MPIR_SCHED_CHKPMEM_MALLOC(tmp_results, void *, total_count * (MPL_MAX(true_extent, extent)), mpi_errno, "tmp_results", MPL_MEM_BUFFER); /* adjust for potential negative lower bound in datatype */ tmp_results = (void *) ((char *) tmp_results - true_lb); /* copy sendbuf into tmp_results */ if (sendbuf != MPI_IN_PLACE) mpi_errno = MPIR_Sched_copy(sendbuf, total_count, datatype, tmp_results, total_count, datatype, s); else mpi_errno = MPIR_Sched_copy(recvbuf, total_count, datatype, tmp_results, total_count, datatype, s); if (mpi_errno) MPIR_ERR_POP(mpi_errno); MPIR_SCHED_BARRIER(s); pof2 = comm_ptr->pof2; rem = comm_size - pof2; /* In the non-power-of-two case, all even-numbered * processes of rank < 2*rem send their data to * (rank+1). These even-numbered processes no longer * participate in the algorithm until the very end. The * remaining processes form a nice power-of-two. */ if (rank < 2 * rem) { if (rank % 2 == 0) { /* even */ mpi_errno = MPIR_Sched_send(tmp_results, total_count, datatype, rank + 1, comm_ptr, s); if (mpi_errno) MPIR_ERR_POP(mpi_errno); MPIR_SCHED_BARRIER(s); /* temporarily set the rank to -1 so that this * process does not pariticipate in recursive * doubling */ newrank = -1; } else { /* odd */ mpi_errno = MPIR_Sched_recv(tmp_recvbuf, total_count, datatype, rank - 1, comm_ptr, s); if (mpi_errno) MPIR_ERR_POP(mpi_errno); MPIR_SCHED_BARRIER(s); /* do the reduction on received data. since the * ordering is right, it doesn't matter whether * the operation is commutative or not. */ mpi_errno = MPIR_Sched_reduce(tmp_recvbuf, tmp_results, total_count, datatype, op, s); if (mpi_errno) MPIR_ERR_POP(mpi_errno); MPIR_SCHED_BARRIER(s); /* change the rank */ newrank = rank / 2; } } else /* rank >= 2*rem */ newrank = rank - rem; if (newrank != -1) { /* recalculate the recvcounts and disps arrays because the * even-numbered processes who no longer participate will * have their result calculated by the process to their * right (rank+1). */ MPIR_SCHED_CHKPMEM_MALLOC(newcnts, int *, pof2 * sizeof(int), mpi_errno, "newcnts", MPL_MEM_BUFFER); MPIR_SCHED_CHKPMEM_MALLOC(newdisps, int *, pof2 * sizeof(int), mpi_errno, "newdisps", MPL_MEM_BUFFER); for (i = 0; i < pof2; i++) { /* what does i map to in the old ranking? */ old_i = (i < rem) ? i * 2 + 1 : i + rem; if (old_i < 2 * rem) { /* This process has to also do its left neighbor's * work */ newcnts[i] = recvcounts[old_i] + recvcounts[old_i - 1]; } else newcnts[i] = recvcounts[old_i]; } newdisps[0] = 0; for (i = 1; i < pof2; i++) newdisps[i] = newdisps[i - 1] + newcnts[i - 1]; mask = pof2 >> 1; send_idx = recv_idx = 0; last_idx = pof2; while (mask > 0) { newdst = newrank ^ mask; /* find real rank of dest */ dst = (newdst < rem) ? newdst * 2 + 1 : newdst + rem; send_cnt = recv_cnt = 0; if (newrank < newdst) { send_idx = recv_idx + mask; for (i = send_idx; i < last_idx; i++) send_cnt += newcnts[i]; for (i = recv_idx; i < send_idx; i++) recv_cnt += newcnts[i]; } else { recv_idx = send_idx + mask; for (i = send_idx; i < recv_idx; i++) send_cnt += newcnts[i]; for (i = recv_idx; i < last_idx; i++) recv_cnt += newcnts[i]; } /* Send data from tmp_results. Recv into tmp_recvbuf */ { /* avoid sending and receiving pointless 0-byte messages */ int send_dst = (send_cnt ? dst : MPI_PROC_NULL); int recv_dst = (recv_cnt ? dst : MPI_PROC_NULL); mpi_errno = MPIR_Sched_send(((char *) tmp_results + newdisps[send_idx] * extent), send_cnt, datatype, send_dst, comm_ptr, s); if (mpi_errno) MPIR_ERR_POP(mpi_errno); mpi_errno = MPIR_Sched_recv(((char *) tmp_recvbuf + newdisps[recv_idx] * extent), recv_cnt, datatype, recv_dst, comm_ptr, s); if (mpi_errno) MPIR_ERR_POP(mpi_errno); MPIR_SCHED_BARRIER(s); } /* tmp_recvbuf contains data received in this step. * tmp_results contains data accumulated so far */ if (recv_cnt) { mpi_errno = MPIR_Sched_reduce(((char *) tmp_recvbuf + newdisps[recv_idx] * extent), ((char *) tmp_results + newdisps[recv_idx] * extent), recv_cnt, datatype, op, s); MPIR_SCHED_BARRIER(s); } /* update send_idx for next iteration */ send_idx = recv_idx; last_idx = recv_idx + mask; mask >>= 1; } /* copy this process's result from tmp_results to recvbuf */ if (recvcounts[rank]) { mpi_errno = MPIR_Sched_copy(((char *) tmp_results + disps[rank] * extent), recvcounts[rank], datatype, recvbuf, recvcounts[rank], datatype, s); if (mpi_errno) MPIR_ERR_POP(mpi_errno); MPIR_SCHED_BARRIER(s); } }
/* Algorithm: Nonblocking all-to-all for sendbuf==MPI_IN_PLACE. * * Restrictions: Only for MPI_IN_PLACE * * We use nonblocking equivalent of pair-wise sendrecv_replace in order to * conserve memory usage, which is keeping with the spirit of the MPI-2.2 * Standard. But because of this approach all processes must agree on the * global schedule of "sendrecv_replace" operations to avoid deadlock. * * Note that this is not an especially efficient algorithm in terms of time. * Something like MADRE is probably the best solution for the MPI_IN_PLACE * scenario. */ int MPIR_Ialltoall_sched_intra_inplace(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf, int recvcount, MPI_Datatype recvtype, MPIR_Comm * comm_ptr, MPIR_Sched_t s) { int mpi_errno = MPI_SUCCESS; void *tmp_buf = NULL; int i, j; int rank, comm_size; int nbytes, recvtype_size; MPI_Aint recvtype_extent; int peer; MPIR_SCHED_CHKPMEM_DECL(1); #ifdef HAVE_ERROR_CHECKING MPIR_Assert(sendbuf == MPI_IN_PLACE); #endif if (recvcount == 0) goto fn_exit; comm_size = comm_ptr->local_size; rank = comm_ptr->rank; MPIR_Datatype_get_size_macro(recvtype, recvtype_size); MPIR_Datatype_get_extent_macro(recvtype, recvtype_extent); nbytes = recvtype_size * recvcount; MPIR_SCHED_CHKPMEM_MALLOC(tmp_buf, void *, nbytes, mpi_errno, "tmp_buf", MPL_MEM_BUFFER); for (i = 0; i < comm_size; ++i) { /* start inner loop at i to avoid re-exchanging data */ for (j = i; j < comm_size; ++j) { if (rank == i && rank == j) { /* no need to "sendrecv_replace" for ourselves */ } else if (rank == i || rank == j) { if (rank == i) peer = j; else peer = i; /* pack to tmp_buf */ mpi_errno = MPIR_Sched_copy(((char *) recvbuf + peer * recvcount * recvtype_extent), recvcount, recvtype, tmp_buf, nbytes, MPI_BYTE, s); if (mpi_errno) MPIR_ERR_POP(mpi_errno); MPIR_SCHED_BARRIER(s); /* now simultaneously send from tmp_buf and recv to recvbuf */ mpi_errno = MPIR_Sched_send(tmp_buf, nbytes, MPI_BYTE, peer, comm_ptr, s); if (mpi_errno) MPIR_ERR_POP(mpi_errno); mpi_errno = MPIR_Sched_recv(((char *) recvbuf + peer * recvcount * recvtype_extent), recvcount, recvtype, peer, comm_ptr, s); if (mpi_errno) MPIR_ERR_POP(mpi_errno); MPIR_SCHED_BARRIER(s); } } } MPIR_SCHED_CHKPMEM_COMMIT(s); fn_exit: return mpi_errno; fn_fail: MPIR_SCHED_CHKPMEM_REAP(s); goto fn_exit; }
int MPIR_Ialltoallw_sched_inter_pairwise_exchange(const void *sendbuf, const int sendcounts[], const int sdispls[], const MPI_Datatype sendtypes[], void *recvbuf, const int recvcounts[], const int rdispls[], const MPI_Datatype recvtypes[], MPIR_Comm * comm_ptr, MPIR_Sched_t s) { /* Intercommunicator alltoallw. We use a pairwise exchange algorithm similar to the one used in intracommunicator alltoallw. Since the local and remote groups can be of different sizes, we first compute the max of local_group_size, remote_group_size. At step i, 0 <= i < max_size, each process receives from src = (rank - i + max_size) % max_size if src < remote_size, and sends to dst = (rank + i) % max_size if dst < remote_size. FIXME: change algorithm to match intracommunicator alltoallw */ int mpi_errno = MPI_SUCCESS; int local_size, remote_size, max_size, i; int src, dst, rank, sendcount, recvcount; char *sendaddr, *recvaddr; MPI_Datatype sendtype, recvtype; local_size = comm_ptr->local_size; remote_size = comm_ptr->remote_size; rank = comm_ptr->rank; /* Use pairwise exchange algorithm. */ max_size = MPL_MAX(local_size, remote_size); for (i = 0; i < max_size; i++) { src = (rank - i + max_size) % max_size; dst = (rank + i) % max_size; if (src >= remote_size) { src = MPI_PROC_NULL; recvaddr = NULL; recvcount = 0; recvtype = MPI_DATATYPE_NULL; } else { recvaddr = (char *) recvbuf + rdispls[src]; recvcount = recvcounts[src]; recvtype = recvtypes[src]; } if (dst >= remote_size) { dst = MPI_PROC_NULL; sendaddr = NULL; sendcount = 0; sendtype = MPI_DATATYPE_NULL; } else { sendaddr = (char *) sendbuf + sdispls[dst]; sendcount = sendcounts[dst]; sendtype = sendtypes[dst]; } mpi_errno = MPIR_Sched_send(sendaddr, sendcount, sendtype, dst, comm_ptr, s); if (mpi_errno) MPIR_ERR_POP(mpi_errno); /* sendrecv, no barrier here */ mpi_errno = MPIR_Sched_recv(recvaddr, recvcount, recvtype, src, comm_ptr, s); if (mpi_errno) MPIR_ERR_POP(mpi_errno); MPIR_SCHED_BARRIER(s); } fn_exit: return mpi_errno; fn_fail: goto fn_exit; }
int MPIR_Iallgatherv_sched_intra_recursive_doubling(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf, const int recvcounts[], const int displs[], MPI_Datatype recvtype, MPIR_Comm * comm_ptr, MPIR_Sched_t s) { int mpi_errno = MPI_SUCCESS; int comm_size, rank, i, j, k; int curr_count, send_offset, incoming_count, recv_offset; int mask, dst, total_count, position, offset, my_tree_root, dst_tree_root; MPI_Aint recvtype_extent, recvtype_sz; void *tmp_buf = NULL; MPIR_SCHED_CHKPMEM_DECL(1); comm_size = comm_ptr->local_size; rank = comm_ptr->rank; #ifdef HAVE_ERROR_CHECKING /* Currently this algorithm can only handle power-of-2 comm_size. * Non power-of-2 comm_size is still experimental */ MPIR_Assert(!(comm_size & (comm_size - 1))); #endif /* HAVE_ERROR_CHECKING */ /* need to receive contiguously into tmp_buf because * displs could make the recvbuf noncontiguous */ MPIR_Datatype_get_size_macro(recvtype, recvtype_sz); MPIR_Datatype_get_extent_macro(recvtype, recvtype_extent); total_count = 0; for (i = 0; i < comm_size; i++) total_count += recvcounts[i]; if (total_count == 0) goto fn_exit; MPIR_SCHED_CHKPMEM_MALLOC(tmp_buf, void *, total_count * recvtype_sz, mpi_errno, "tmp_buf", MPL_MEM_BUFFER); /* copy local data into right location in tmp_buf */ position = 0; for (i = 0; i < rank; i++) position += recvcounts[i]; if (sendbuf != MPI_IN_PLACE) { mpi_errno = MPIR_Sched_copy(sendbuf, sendcount, sendtype, ((char *) tmp_buf + position * recvtype_sz), recvcounts[rank] * recvtype_sz, MPI_BYTE, s); if (mpi_errno) MPIR_ERR_POP(mpi_errno); } else { /* if in_place specified, local data is found in recvbuf */ mpi_errno = MPIR_Sched_copy(((char *) recvbuf + displs[rank] * recvtype_extent), recvcounts[rank], recvtype, ((char *) tmp_buf + position * recvtype_sz), recvcounts[rank] * recvtype_sz, MPI_BYTE, s); if (mpi_errno) MPIR_ERR_POP(mpi_errno); } curr_count = recvcounts[rank]; /* never used uninitialized w/o this, but compiler can't tell that */ incoming_count = -1; /* [goodell@] random notes that help slightly when deciphering this code: * - mask is also equal to the number of blocks that we are going to recv * (less if comm_size is non-pof2) * - FOO_tree_root is the leftmost (lowest ranked) process with whom FOO has * communicated, directly or indirectly, at the beginning of round the * round. FOO is either "dst" or "my", where "my" means use my rank. * - in each round we are going to recv the blocks * B[dst_tree_root],B[dst_tree_root+1],...,B[min(dst_tree_root+mask,comm_size)] */ mask = 0x1; i = 0; while (mask < comm_size) { dst = rank ^ mask; /* find offset into send and recv buffers. zero out * the least significant "i" bits of rank and dst to * find root of src and dst subtrees. Use ranks of * roots as index to send from and recv into buffer */ dst_tree_root = dst >> i; dst_tree_root <<= i; my_tree_root = rank >> i; my_tree_root <<= i; if (dst < comm_size) { send_offset = 0; for (j = 0; j < my_tree_root; j++) send_offset += recvcounts[j]; recv_offset = 0; for (j = 0; j < dst_tree_root; j++) recv_offset += recvcounts[j]; incoming_count = 0; for (j = dst_tree_root; j < (dst_tree_root + mask) && j < comm_size; ++j) incoming_count += recvcounts[j]; mpi_errno = MPIR_Sched_send(((char *) tmp_buf + send_offset * recvtype_sz), curr_count * recvtype_sz, MPI_BYTE, dst, comm_ptr, s); if (mpi_errno) MPIR_ERR_POP(mpi_errno); /* sendrecv, no barrier here */ mpi_errno = MPIR_Sched_recv(((char *) tmp_buf + recv_offset * recvtype_sz), incoming_count * recvtype_sz, MPI_BYTE, dst, comm_ptr, s); if (mpi_errno) MPIR_ERR_POP(mpi_errno); MPIR_SCHED_BARRIER(s); curr_count += incoming_count; } /* if some processes in this process's subtree in this step * did not have any destination process to communicate with * because of non-power-of-two, we need to send them the * data that they would normally have received from those * processes. That is, the haves in this subtree must send to * the havenots. We use a logarithmic * recursive-halfing algorithm for this. */ /* This part of the code will not currently be * executed because we are not using recursive * doubling for non power of two. Mark it as experimental * so that it doesn't show up as red in the coverage * tests. */ /* --BEGIN EXPERIMENTAL-- */ if (dst_tree_root + mask > comm_size) { int tmp_mask, tree_root; int nprocs_completed = comm_size - my_tree_root - mask; /* nprocs_completed is the number of processes in this * subtree that have all the data. Send data to others * in a tree fashion. First find root of current tree * that is being divided into two. k is the number of * least-significant bits in this process's rank that * must be zeroed out to find the rank of the root */ /* [goodell@] it looks like (k==i) is always true, could possibly * skip the loop below */ j = mask; k = 0; while (j) { j >>= 1; k++; } k--; tmp_mask = mask >> 1; while (tmp_mask) { dst = rank ^ tmp_mask; tree_root = rank >> k; tree_root <<= k; /* send only if this proc has data and destination * doesn't have data. at any step, multiple processes * can send if they have the data */ if ((dst > rank) && (rank < tree_root + nprocs_completed) && (dst >= tree_root + nprocs_completed)) { offset = 0; for (j = 0; j < (my_tree_root + mask); j++) offset += recvcounts[j]; offset *= recvtype_sz; /* incoming_count was set in the previous * receive. that's the amount of data to be * sent now. */ mpi_errno = MPIR_Sched_send(((char *) tmp_buf + offset), incoming_count * recvtype_sz, MPI_BYTE, dst, comm_ptr, s); if (mpi_errno) MPIR_ERR_POP(mpi_errno); MPIR_SCHED_BARRIER(s); } /* recv only if this proc. doesn't have data and sender * has data */ else if ((dst < rank) && (dst < tree_root + nprocs_completed) && (rank >= tree_root + nprocs_completed)) { offset = 0; for (j = 0; j < (my_tree_root + mask); j++) offset += recvcounts[j]; /* recalculate incoming_count, since not all processes will have * this value */ incoming_count = 0; for (j = dst_tree_root; j < (dst_tree_root + mask) && j < comm_size; ++j) incoming_count += recvcounts[j]; mpi_errno = MPIR_Sched_recv(((char *) tmp_buf + offset * recvtype_sz), incoming_count * recvtype_sz, MPI_BYTE, dst, comm_ptr, s); if (mpi_errno) MPIR_ERR_POP(mpi_errno); MPIR_SCHED_BARRIER(s); curr_count += incoming_count; } tmp_mask >>= 1; k--; } } /* --END EXPERIMENTAL-- */ mask <<= 1; i++; }
int MPIR_Igather_sched_intra_binomial(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf, int recvcount, MPI_Datatype recvtype, int root, MPIR_Comm *comm_ptr, MPIR_Sched_t s) { int mpi_errno = MPI_SUCCESS; int comm_size, rank; int relative_rank, is_homogeneous; int mask, src, dst, relative_src; MPI_Aint recvtype_size, sendtype_size, curr_cnt=0, nbytes; int recvblks; int tmp_buf_size, missing; void *tmp_buf = NULL; int blocks[2]; int displs[2]; MPI_Aint struct_displs[2]; MPI_Aint extent=0; int copy_offset = 0, copy_blks = 0; MPI_Datatype types[2], tmp_type; MPIR_SCHED_CHKPMEM_DECL(1); comm_size = comm_ptr->local_size; rank = comm_ptr->rank; if (((rank == root) && (recvcount == 0)) || ((rank != root) && (sendcount == 0))) goto fn_exit; is_homogeneous = TRUE; #ifdef MPID_HAS_HETERO is_homogeneous = !comm_ptr->is_hetero; #endif MPIR_Assert(comm_ptr->comm_kind == MPIR_COMM_KIND__INTRACOMM); /* Use binomial tree algorithm. */ relative_rank = (rank >= root) ? rank - root : rank - root + comm_size; if (rank == root) { MPIR_Datatype_get_extent_macro(recvtype, extent); MPIR_Ensure_Aint_fits_in_pointer(MPIR_VOID_PTR_CAST_TO_MPI_AINT recvbuf+ (extent*recvcount*comm_size)); } if (is_homogeneous) { /* communicator is homogeneous. no need to pack buffer. */ if (rank == root) { MPIR_Datatype_get_size_macro(recvtype, recvtype_size); nbytes = recvtype_size * recvcount; } else { MPIR_Datatype_get_size_macro(sendtype, sendtype_size); nbytes = sendtype_size * sendcount; } /* Find the number of missing nodes in my sub-tree compared to * a balanced tree */ for (mask = 1; mask < comm_size; mask <<= 1); --mask; while (relative_rank & mask) mask >>= 1; missing = (relative_rank | mask) - comm_size + 1; if (missing < 0) missing = 0; tmp_buf_size = (mask - missing); /* If the message is smaller than the threshold, we will copy * our message in there too */ if (nbytes < MPIR_CVAR_GATHER_VSMALL_MSG_SIZE) tmp_buf_size++; tmp_buf_size *= nbytes; /* For zero-ranked root, we don't need any temporary buffer */ if ((rank == root) && (!root || (nbytes >= MPIR_CVAR_GATHER_VSMALL_MSG_SIZE))) tmp_buf_size = 0; if (tmp_buf_size) { MPIR_SCHED_CHKPMEM_MALLOC(tmp_buf, void *, tmp_buf_size, mpi_errno, "tmp_buf", MPL_MEM_BUFFER); } if (rank == root) { if (sendbuf != MPI_IN_PLACE) { mpi_errno = MPIR_Localcopy(sendbuf, sendcount, sendtype, ((char *) recvbuf + extent*recvcount*rank), recvcount, recvtype); if (mpi_errno) MPIR_ERR_POP(mpi_errno); } } else if (tmp_buf_size && (nbytes < MPIR_CVAR_GATHER_VSMALL_MSG_SIZE)) { /* copy from sendbuf into tmp_buf */ mpi_errno = MPIR_Localcopy(sendbuf, sendcount, sendtype, tmp_buf, nbytes, MPI_BYTE); if (mpi_errno) MPIR_ERR_POP(mpi_errno); } curr_cnt = nbytes; mask = 0x1; while (mask < comm_size) { if ((mask & relative_rank) == 0) { src = relative_rank | mask; if (src < comm_size) { src = (src + root) % comm_size; if (rank == root) { recvblks = mask; if ((2 * recvblks) > comm_size) recvblks = comm_size - recvblks; if ((rank + mask + recvblks == comm_size) || (((rank + mask) % comm_size) < ((rank + mask + recvblks) % comm_size))) { /* If the data contiguously fits into the * receive buffer, place it directly. This * should cover the case where the root is * rank 0. */ char *rp = (char *)recvbuf + (((rank + mask) % comm_size)*recvcount*extent); mpi_errno = MPIR_Sched_recv(rp, (recvblks * recvcount), recvtype, src, comm_ptr, s); if (mpi_errno) MPIR_ERR_POP(mpi_errno); mpi_errno = MPIR_Sched_barrier(s); if (mpi_errno) MPIR_ERR_POP(mpi_errno); } else if (nbytes < MPIR_CVAR_GATHER_VSMALL_MSG_SIZE) { mpi_errno = MPIR_Sched_recv(tmp_buf, (recvblks * nbytes), MPI_BYTE, src, comm_ptr, s); if (mpi_errno) MPIR_ERR_POP(mpi_errno); mpi_errno = MPIR_Sched_barrier(s); if (mpi_errno) MPIR_ERR_POP(mpi_errno); copy_offset = rank + mask; copy_blks = recvblks; } else { blocks[0] = recvcount * (comm_size - root - mask); displs[0] = recvcount * (root + mask); blocks[1] = (recvcount * recvblks) - blocks[0]; displs[1] = 0; mpi_errno = MPIR_Type_indexed_impl(2, blocks, displs, recvtype, &tmp_type); if (mpi_errno) MPIR_ERR_POP(mpi_errno); mpi_errno = MPIR_Type_commit_impl(&tmp_type); if (mpi_errno) MPIR_ERR_POP(mpi_errno); mpi_errno = MPIR_Sched_recv(recvbuf, 1, tmp_type, src, comm_ptr, s); if (mpi_errno) MPIR_ERR_POP(mpi_errno); mpi_errno = MPIR_Sched_barrier(s); if (mpi_errno) MPIR_ERR_POP(mpi_errno); /* this "premature" free is safe b/c the sched holds an actual ref to keep it alive */ MPIR_Type_free_impl(&tmp_type); } } else { /* Intermediate nodes store in temporary buffer */ MPI_Aint offset; /* Estimate the amount of data that is going to come in */ recvblks = mask; relative_src = ((src - root) < 0) ? (src - root + comm_size) : (src - root); if (relative_src + mask > comm_size) recvblks -= (relative_src + mask - comm_size); if (nbytes < MPIR_CVAR_GATHER_VSMALL_MSG_SIZE) offset = mask * nbytes; else offset = (mask - 1) * nbytes; mpi_errno = MPIR_Sched_recv(((char *)tmp_buf + offset), (recvblks * nbytes), MPI_BYTE, src, comm_ptr, s); if (mpi_errno) MPIR_ERR_POP(mpi_errno); mpi_errno = MPIR_Sched_barrier(s); if (mpi_errno) MPIR_ERR_POP(mpi_errno); curr_cnt += (recvblks * nbytes); } } } else { dst = relative_rank ^ mask; dst = (dst + root) % comm_size; if (!tmp_buf_size) { /* leaf nodes send directly from sendbuf */ mpi_errno = MPIR_Sched_send(sendbuf, sendcount, sendtype, dst, comm_ptr, s); if (mpi_errno) MPIR_ERR_POP(mpi_errno); mpi_errno = MPIR_Sched_barrier(s); if (mpi_errno) MPIR_ERR_POP(mpi_errno); } else if (nbytes < MPIR_CVAR_GATHER_VSMALL_MSG_SIZE) { mpi_errno = MPIR_Sched_send(tmp_buf, curr_cnt, MPI_BYTE, dst, comm_ptr, s); if (mpi_errno) MPIR_ERR_POP(mpi_errno); mpi_errno = MPIR_Sched_barrier(s); if (mpi_errno) MPIR_ERR_POP(mpi_errno); } else { blocks[0] = sendcount; struct_displs[0] = MPIR_VOID_PTR_CAST_TO_MPI_AINT sendbuf; types[0] = sendtype; /* check for overflow. work around int limits if needed*/ if (curr_cnt - nbytes != (int)(curr_cnt-nbytes)) { blocks[1] = 1; MPIR_Type_contiguous_x_impl(curr_cnt - nbytes, MPI_BYTE, &(types[1])); } else { MPIR_Assign_trunc(blocks[1], curr_cnt - nbytes, int); types[1] = MPI_BYTE; } struct_displs[1] = MPIR_VOID_PTR_CAST_TO_MPI_AINT tmp_buf; mpi_errno = MPIR_Type_create_struct_impl(2, blocks, struct_displs, types, &tmp_type); if (mpi_errno) MPIR_ERR_POP(mpi_errno); mpi_errno = MPIR_Type_commit_impl(&tmp_type); if (mpi_errno) MPIR_ERR_POP(mpi_errno); mpi_errno = MPIR_Sched_send(MPI_BOTTOM, 1, tmp_type, dst, comm_ptr, s); if (mpi_errno) MPIR_ERR_POP(mpi_errno); MPIR_SCHED_BARRIER(s); /* this "premature" free is safe b/c the sched holds an actual ref to keep it alive */ MPIR_Type_free_impl(&tmp_type); } break; } mask <<= 1; } if ((rank == root) && root && (nbytes < MPIR_CVAR_GATHER_VSMALL_MSG_SIZE) && copy_blks) { /* reorder and copy from tmp_buf into recvbuf */ /* FIXME why are there two copies here? */ mpi_errno = MPIR_Sched_copy(tmp_buf, nbytes * (comm_size - copy_offset), MPI_BYTE, ((char *)recvbuf + extent * recvcount * copy_offset), recvcount * (comm_size - copy_offset), recvtype, s); if (mpi_errno) MPIR_ERR_POP(mpi_errno); mpi_errno = MPIR_Sched_copy((char *)tmp_buf + nbytes * (comm_size - copy_offset), nbytes * (copy_blks - comm_size + copy_offset), MPI_BYTE, recvbuf, recvcount * (copy_blks - comm_size + copy_offset), recvtype, s); if (mpi_errno) MPIR_ERR_POP(mpi_errno); } } #ifdef MPID_HAS_HETERO else {
int MPIR_Iallreduce_sched_intra_reduce_scatter_allgather(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, MPI_Op op, MPIR_Comm * comm_ptr, MPIR_Sched_t s) { int mpi_errno = MPI_SUCCESS; int comm_size, rank, newrank, pof2, rem; int i, send_idx, recv_idx, last_idx, mask, newdst, dst, send_cnt, recv_cnt; MPI_Aint true_lb, true_extent, extent; void *tmp_buf = NULL; int *cnts, *disps; MPIR_SCHED_CHKPMEM_DECL(1); MPIR_CHKLMEM_DECL(2); #ifdef HAVE_ERROR_CHECKING /* we only support builtin datatypes for now, breaking up user types to do * the reduce-scatter is tricky */ MPIR_Assert(HANDLE_GET_KIND(op) == HANDLE_KIND_BUILTIN); #endif comm_size = comm_ptr->local_size; rank = comm_ptr->rank; /* need to allocate temporary buffer to store incoming data */ MPIR_Type_get_true_extent_impl(datatype, &true_lb, &true_extent); MPIR_Datatype_get_extent_macro(datatype, extent); MPIR_Ensure_Aint_fits_in_pointer(count * MPL_MAX(extent, true_extent)); MPIR_SCHED_CHKPMEM_MALLOC(tmp_buf, void *, count * (MPL_MAX(extent, true_extent)), mpi_errno, "temporary buffer", MPL_MEM_BUFFER); /* adjust for potential negative lower bound in datatype */ tmp_buf = (void *) ((char *) tmp_buf - true_lb); /* copy local data into recvbuf */ if (sendbuf != MPI_IN_PLACE) { mpi_errno = MPIR_Sched_copy(sendbuf, count, datatype, recvbuf, count, datatype, s); if (mpi_errno) MPIR_ERR_POP(mpi_errno); MPIR_SCHED_BARRIER(s); } /* get nearest power-of-two less than or equal to comm_size */ pof2 = comm_ptr->pof2; rem = comm_size - pof2; /* In the non-power-of-two case, all even-numbered * processes of rank < 2*rem send their data to * (rank+1). These even-numbered processes no longer * participate in the algorithm until the very end. The * remaining processes form a nice power-of-two. */ if (rank < 2 * rem) { if (rank % 2 == 0) { /* even */ mpi_errno = MPIR_Sched_send(recvbuf, count, datatype, rank + 1, comm_ptr, s); if (mpi_errno) MPIR_ERR_POP(mpi_errno); MPIR_SCHED_BARRIER(s); /* temporarily set the rank to -1 so that this * process does not pariticipate in recursive * doubling */ newrank = -1; } else { /* odd */ mpi_errno = MPIR_Sched_recv(tmp_buf, count, datatype, rank - 1, comm_ptr, s); if (mpi_errno) MPIR_ERR_POP(mpi_errno); MPIR_SCHED_BARRIER(s); /* do the reduction on received data. since the * ordering is right, it doesn't matter whether * the operation is commutative or not. */ mpi_errno = MPIR_Sched_reduce(tmp_buf, recvbuf, count, datatype, op, s); if (mpi_errno) MPIR_ERR_POP(mpi_errno); MPIR_SCHED_BARRIER(s); /* change the rank */ newrank = rank / 2; } } else /* rank >= 2*rem */ newrank = rank - rem; if (newrank != -1) { /* for the reduce-scatter, calculate the count that * each process receives and the displacement within * the buffer */ /* TODO I (goodell@) believe that these counts and displacements could be * calculated directly during the loop, rather than requiring a less-scalable * "2*pof2"-sized memory allocation */ MPIR_CHKLMEM_MALLOC(cnts, int *, pof2 * sizeof(int), mpi_errno, "counts", MPL_MEM_BUFFER); MPIR_CHKLMEM_MALLOC(disps, int *, pof2 * sizeof(int), mpi_errno, "displacements", MPL_MEM_BUFFER); MPIR_Assert(count >= pof2); /* the cnts calculations assume this */ for (i = 0; i < (pof2 - 1); i++) cnts[i] = count / pof2; cnts[pof2 - 1] = count - (count / pof2) * (pof2 - 1); if (pof2) disps[0] = 0; for (i = 1; i < pof2; i++) disps[i] = disps[i - 1] + cnts[i - 1]; mask = 0x1; send_idx = recv_idx = 0; last_idx = pof2; while (mask < pof2) { newdst = newrank ^ mask; /* find real rank of dest */ dst = (newdst < rem) ? newdst * 2 + 1 : newdst + rem; send_cnt = recv_cnt = 0; if (newrank < newdst) { send_idx = recv_idx + pof2 / (mask * 2); for (i = send_idx; i < last_idx; i++) send_cnt += cnts[i]; for (i = recv_idx; i < send_idx; i++) recv_cnt += cnts[i]; } else { recv_idx = send_idx + pof2 / (mask * 2); for (i = send_idx; i < recv_idx; i++) send_cnt += cnts[i]; for (i = recv_idx; i < last_idx; i++) recv_cnt += cnts[i]; } /* Send data from recvbuf. Recv into tmp_buf */ mpi_errno = MPIR_Sched_recv(((char *) tmp_buf + disps[recv_idx] * extent), recv_cnt, datatype, dst, comm_ptr, s); if (mpi_errno) MPIR_ERR_POP(mpi_errno); /* sendrecv, no barrier here */ mpi_errno = MPIR_Sched_send(((char *) recvbuf + disps[send_idx] * extent), send_cnt, datatype, dst, comm_ptr, s); if (mpi_errno) MPIR_ERR_POP(mpi_errno); MPIR_SCHED_BARRIER(s); /* tmp_buf contains data received in this step. * recvbuf contains data accumulated so far */ /* This algorithm is used only for predefined ops * and predefined ops are always commutative. */ mpi_errno = MPIR_Sched_reduce(((char *) tmp_buf + disps[recv_idx] * extent), ((char *) recvbuf + disps[recv_idx] * extent), recv_cnt, datatype, op, s); if (mpi_errno) MPIR_ERR_POP(mpi_errno); MPIR_SCHED_BARRIER(s); /* update send_idx for next iteration */ send_idx = recv_idx; mask <<= 1; /* update last_idx, but not in last iteration * because the value is needed in the allgather * step below. */ if (mask < pof2) last_idx = recv_idx + pof2 / mask; } /* now do the allgather */ mask >>= 1; while (mask > 0) { newdst = newrank ^ mask; /* find real rank of dest */ dst = (newdst < rem) ? newdst * 2 + 1 : newdst + rem; send_cnt = recv_cnt = 0; if (newrank < newdst) { /* update last_idx except on first iteration */ if (mask != pof2 / 2) last_idx = last_idx + pof2 / (mask * 2); recv_idx = send_idx + pof2 / (mask * 2); for (i = send_idx; i < recv_idx; i++) send_cnt += cnts[i]; for (i = recv_idx; i < last_idx; i++) recv_cnt += cnts[i]; } else { recv_idx = send_idx - pof2 / (mask * 2); for (i = send_idx; i < last_idx; i++) send_cnt += cnts[i]; for (i = recv_idx; i < send_idx; i++) recv_cnt += cnts[i]; } mpi_errno = MPIR_Sched_recv(((char *) recvbuf + disps[recv_idx] * extent), recv_cnt, datatype, dst, comm_ptr, s); if (mpi_errno) MPIR_ERR_POP(mpi_errno); /* sendrecv, no barrier here */ mpi_errno = MPIR_Sched_send(((char *) recvbuf + disps[send_idx] * extent), send_cnt, datatype, dst, comm_ptr, s); if (mpi_errno) MPIR_ERR_POP(mpi_errno); MPIR_SCHED_BARRIER(s); if (newrank > newdst) send_idx = recv_idx; mask >>= 1; } }