int MPIR_Iscan_rec_dbl(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, MPI_Op op, MPIR_Comm *comm_ptr, MPIR_Sched_t s) { int mpi_errno = MPI_SUCCESS; MPI_Aint true_extent, true_lb, extent; int is_commutative; int mask, dst, rank, comm_size; void *partial_scan = NULL; void *tmp_buf = NULL; MPIR_SCHED_CHKPMEM_DECL(2); if (count == 0) goto fn_exit; comm_size = comm_ptr->local_size; rank = comm_ptr->rank; is_commutative = MPIR_Op_is_commutative(op); /* need to allocate temporary buffer to store partial scan*/ MPIR_Type_get_true_extent_impl(datatype, &true_lb, &true_extent); MPID_Datatype_get_extent_macro(datatype, extent); MPIR_SCHED_CHKPMEM_MALLOC(partial_scan, void *, count*(MPL_MAX(extent,true_extent)), mpi_errno, "partial_scan"); /* This eventually gets malloc()ed as a temp buffer, not added to * any user buffers */ MPIR_Ensure_Aint_fits_in_pointer(count * MPL_MAX(extent, true_extent)); /* adjust for potential negative lower bound in datatype */ partial_scan = (void *)((char*)partial_scan - true_lb); /* need to allocate temporary buffer to store incoming data*/ MPIR_SCHED_CHKPMEM_MALLOC(tmp_buf, void *, count*(MPL_MAX(extent,true_extent)), mpi_errno, "tmp_buf"); /* adjust for potential negative lower bound in datatype */ tmp_buf = (void *)((char*)tmp_buf - true_lb); /* Since this is an inclusive scan, copy local contribution into recvbuf. */ if (sendbuf != MPI_IN_PLACE) { mpi_errno = MPIR_Sched_copy(sendbuf, count, datatype, recvbuf, count, datatype, s); if (mpi_errno) MPIR_ERR_POP(mpi_errno); } if (sendbuf != MPI_IN_PLACE) mpi_errno = MPIR_Sched_copy(sendbuf, count, datatype, partial_scan, count, datatype, s); else mpi_errno = MPIR_Sched_copy(recvbuf, count, datatype, partial_scan, count, datatype, s); if (mpi_errno) MPIR_ERR_POP(mpi_errno); mask = 0x1; while (mask < comm_size) { dst = rank ^ mask; if (dst < comm_size) { /* Send partial_scan to dst. Recv into tmp_buf */ mpi_errno = MPIR_Sched_send(partial_scan, count, datatype, dst, comm_ptr, s); if (mpi_errno) MPIR_ERR_POP(mpi_errno); /* sendrecv, no barrier here */ mpi_errno = MPIR_Sched_recv(tmp_buf, count, datatype, dst, comm_ptr, s); if (mpi_errno) MPIR_ERR_POP(mpi_errno); MPIR_SCHED_BARRIER(s); if (rank > dst) { mpi_errno = MPIR_Sched_reduce(tmp_buf, partial_scan, count, datatype, op, s); if (mpi_errno) MPIR_ERR_POP(mpi_errno); mpi_errno = MPIR_Sched_reduce(tmp_buf, recvbuf, count, datatype, op, s); if (mpi_errno) MPIR_ERR_POP(mpi_errno); MPIR_SCHED_BARRIER(s); } else { if (is_commutative) { mpi_errno = MPIR_Sched_reduce(tmp_buf, partial_scan, count, datatype, op, s); if (mpi_errno) MPIR_ERR_POP(mpi_errno); MPIR_SCHED_BARRIER(s); } else { mpi_errno = MPIR_Sched_reduce(partial_scan, tmp_buf, count, datatype, op, s); if (mpi_errno) MPIR_ERR_POP(mpi_errno); MPIR_SCHED_BARRIER(s); mpi_errno = MPIR_Sched_copy(tmp_buf, count, datatype, partial_scan, count, datatype, s); if (mpi_errno) MPIR_ERR_POP(mpi_errno); MPIR_SCHED_BARRIER(s); } } } mask <<= 1; } MPIR_SCHED_CHKPMEM_COMMIT(s); fn_exit: return mpi_errno; fn_fail: MPIR_SCHED_CHKPMEM_REAP(s); goto fn_exit; }
int MPIR_Ibarrier_inter(MPID_Comm *comm_ptr, MPID_Sched_t s) { int mpi_errno = MPI_SUCCESS; int rank, root; MPIR_SCHED_CHKPMEM_DECL(1); char *buf = NULL; MPIU_Assert(comm_ptr->comm_kind == MPID_INTERCOMM); rank = comm_ptr->rank; /* Get the local intracommunicator */ if (!comm_ptr->local_comm) { mpi_errno = MPIR_Setup_intercomm_localcomm(comm_ptr); if (mpi_errno) MPIR_ERR_POP(mpi_errno); } /* do a barrier on the local intracommunicator */ MPIU_Assert(comm_ptr->local_comm->coll_fns && comm_ptr->local_comm->coll_fns->Ibarrier_sched); if(comm_ptr->local_size != 1) { mpi_errno = comm_ptr->local_comm->coll_fns->Ibarrier_sched(comm_ptr->local_comm, s); if (mpi_errno) MPIR_ERR_POP(mpi_errno); MPID_SCHED_BARRIER(s); } /* rank 0 on each group does an intercommunicator broadcast to the remote group to indicate that all processes in the local group have reached the barrier. We do a 1-byte bcast because a 0-byte bcast will just return without doing anything. */ MPIR_SCHED_CHKPMEM_MALLOC(buf, char *, 1, mpi_errno, "bcast buf"); buf[0] = 'D'; /* avoid valgrind warnings */ /* first broadcast from left to right group, then from right to left group */ MPIU_Assert(comm_ptr->coll_fns && comm_ptr->coll_fns->Ibcast_sched); if (comm_ptr->is_low_group) { root = (rank == 0) ? MPI_ROOT : MPI_PROC_NULL; mpi_errno = comm_ptr->coll_fns->Ibcast_sched(buf, 1, MPI_BYTE, root, comm_ptr, s); if (mpi_errno) MPIR_ERR_POP(mpi_errno); MPID_SCHED_BARRIER(s); /* receive bcast from right */ root = 0; mpi_errno = comm_ptr->coll_fns->Ibcast_sched(buf, 1, MPI_BYTE, root, comm_ptr, s); if (mpi_errno) MPIR_ERR_POP(mpi_errno); } else { /* receive bcast from left */ root = 0; mpi_errno = comm_ptr->coll_fns->Ibcast_sched(buf, 1, MPI_BYTE, root, comm_ptr, s); if (mpi_errno) MPIR_ERR_POP(mpi_errno); MPID_SCHED_BARRIER(s); /* bcast to left */ root = (rank == 0) ? MPI_ROOT : MPI_PROC_NULL; mpi_errno = comm_ptr->coll_fns->Ibcast_sched(buf, 1, MPI_BYTE, root, comm_ptr, s); if (mpi_errno) MPIR_ERR_POP(mpi_errno); } MPIR_SCHED_CHKPMEM_COMMIT(s); fn_exit: return mpi_errno; fn_fail: MPIR_SCHED_CHKPMEM_REAP(s); goto fn_exit; }
int MPIR_Ialltoall_bruck(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf, int recvcount, MPI_Datatype recvtype, MPID_Comm *comm_ptr, MPID_Sched_t s) { int mpi_errno = MPI_SUCCESS; int i; int nbytes, recvtype_size, recvbuf_extent, newtype_size; int rank, comm_size; void *tmp_buf = NULL; MPI_Aint sendtype_extent, recvtype_extent, recvtype_true_lb, recvtype_true_extent; int pof2, dst, src; int count, block; MPI_Datatype newtype; int *displs; MPIU_CHKLMEM_DECL(1); /* displs */ MPIR_SCHED_CHKPMEM_DECL(2); /* tmp_buf (2x) */ MPIU_Assert(sendbuf != MPI_IN_PLACE); /* we do not handle in-place */ comm_size = comm_ptr->local_size; rank = comm_ptr->rank; MPID_Datatype_get_extent_macro(sendtype, sendtype_extent); MPID_Datatype_get_size_macro(recvtype, recvtype_size); MPID_Datatype_get_extent_macro(recvtype, recvtype_extent); /* allocate temporary buffer */ /* must be same size as entire recvbuf for Phase 3 */ nbytes = recvtype_size * recvcount * comm_size; MPIR_SCHED_CHKPMEM_MALLOC(tmp_buf, void *, nbytes, mpi_errno, "tmp_buf"); /* Do Phase 1 of the algorithim. Shift the data blocks on process i * upwards by a distance of i blocks. Store the result in recvbuf. */ mpi_errno = MPID_Sched_copy(((char *) sendbuf + rank*sendcount*sendtype_extent), (comm_size - rank)*sendcount, sendtype, recvbuf, (comm_size - rank)*recvcount, recvtype, s); if (mpi_errno) MPIU_ERR_POP(mpi_errno); mpi_errno = MPID_Sched_copy(sendbuf, rank*sendcount, sendtype, ((char *) recvbuf + (comm_size-rank)*recvcount*recvtype_extent), rank*recvcount, recvtype, s); if (mpi_errno) MPIU_ERR_POP(mpi_errno); MPID_SCHED_BARRIER(s); /* Input data is now stored in recvbuf with datatype recvtype */ /* Now do Phase 2, the communication phase. It takes ceiling(lg p) steps. In each step i, each process sends to rank+2^i and receives from rank-2^i, and exchanges all data blocks whose ith bit is 1. */ /* allocate displacements array for indexed datatype used in communication */ MPIU_CHKLMEM_MALLOC(displs, int *, comm_size * sizeof(int), mpi_errno, "displs"); pof2 = 1; while (pof2 < comm_size) { dst = (rank + pof2) % comm_size; src = (rank - pof2 + comm_size) % comm_size; /* Exchange all data blocks whose ith bit is 1 */ /* Create an indexed datatype for the purpose */ count = 0; for (block=1; block<comm_size; block++) { if (block & pof2) { displs[count] = block * recvcount; count++; } } mpi_errno = MPIR_Type_create_indexed_block_impl(count, recvcount, displs, recvtype, &newtype); if (mpi_errno) MPIU_ERR_POP(mpi_errno); mpi_errno = MPIR_Type_commit_impl(&newtype); if (mpi_errno) MPIU_ERR_POP(mpi_errno); MPID_Datatype_get_size_macro(newtype, newtype_size); /* we will usually copy much less than nbytes */ mpi_errno = MPID_Sched_copy(recvbuf, 1, newtype, tmp_buf, newtype_size, MPI_BYTE, s); if (mpi_errno) MPIU_ERR_POP(mpi_errno); MPID_SCHED_BARRIER(s); /* now send and recv in parallel */ mpi_errno = MPID_Sched_send(tmp_buf, newtype_size, MPI_BYTE, dst, comm_ptr, s); if (mpi_errno) MPIU_ERR_POP(mpi_errno); mpi_errno = MPID_Sched_recv(recvbuf, 1, newtype, src, comm_ptr, s); if (mpi_errno) MPIU_ERR_POP(mpi_errno); MPID_SCHED_BARRIER(s); MPIR_Type_free_impl(&newtype); pof2 *= 2; } /* Phase 3: Rotate blocks in recvbuf upwards by (rank + 1) blocks. Need * a temporary buffer of the same size as recvbuf. */ /* get true extent of recvtype */ MPIR_Type_get_true_extent_impl(recvtype, &recvtype_true_lb, &recvtype_true_extent); recvbuf_extent = recvcount * comm_size * (MPIR_MAX(recvtype_true_extent, recvtype_extent)); /* not a leak, old tmp_buf value is still tracked by CHKPMEM macros */ MPIR_SCHED_CHKPMEM_MALLOC(tmp_buf, void *, recvbuf_extent, mpi_errno, "tmp_buf"); /* adjust for potential negative lower bound in datatype */ tmp_buf = (void *)((char*)tmp_buf - recvtype_true_lb); mpi_errno = MPID_Sched_copy(((char *) recvbuf + (rank+1)*recvcount*recvtype_extent), (comm_size - rank - 1)*recvcount, recvtype, tmp_buf, (comm_size - rank - 1)*recvcount, recvtype, s); if (mpi_errno) MPIU_ERR_POP(mpi_errno); mpi_errno = MPID_Sched_copy(recvbuf, (rank+1)*recvcount, recvtype, ((char *) tmp_buf + (comm_size-rank-1)*recvcount*recvtype_extent), (rank+1)*recvcount, recvtype, s); if (mpi_errno) MPIU_ERR_POP(mpi_errno); MPID_SCHED_BARRIER(s); /* Blocks are in the reverse order now (comm_size-1 to 0). * Reorder them to (0 to comm_size-1) and store them in recvbuf. */ for (i = 0; i < comm_size; i++){ mpi_errno = MPID_Sched_copy(((char *) tmp_buf + i*recvcount*recvtype_extent), recvcount, recvtype, ((char *) recvbuf + (comm_size-i-1)*recvcount*recvtype_extent), recvcount, recvtype, s); if (mpi_errno) MPIU_ERR_POP(mpi_errno); } MPIR_SCHED_CHKPMEM_COMMIT(s); fn_exit: MPIU_CHKLMEM_FREEALL(); return mpi_errno; fn_fail: MPIR_SCHED_CHKPMEM_REAP(s); goto fn_exit; }
int MPIR_Ialltoall_inplace(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf, int recvcount, MPI_Datatype recvtype, MPID_Comm *comm_ptr, MPID_Sched_t s) { int mpi_errno = MPI_SUCCESS; void *tmp_buf = NULL; int i, j; int rank, comm_size; int nbytes, recvtype_size; MPI_Aint recvtype_extent; int peer; MPIR_SCHED_CHKPMEM_DECL(1); MPIU_Assert(sendbuf == MPI_IN_PLACE); if (recvcount == 0) goto fn_exit; comm_size = comm_ptr->local_size; rank = comm_ptr->rank; MPID_Datatype_get_size_macro(recvtype, recvtype_size); MPID_Datatype_get_extent_macro(recvtype, recvtype_extent); nbytes = recvtype_size * recvcount; MPIR_SCHED_CHKPMEM_MALLOC(tmp_buf, void *, nbytes, mpi_errno, "tmp_buf"); for (i = 0; i < comm_size; ++i) { /* start inner loop at i to avoid re-exchanging data */ for (j = i; j < comm_size; ++j) { if (rank == i && rank == j) { /* no need to "sendrecv_replace" for ourselves */ } else if (rank == i || rank == j) { if (rank == i) peer = j; else peer = i; /* pack to tmp_buf */ mpi_errno = MPID_Sched_copy(((char *)recvbuf + peer*recvcount*recvtype_extent), recvcount, recvtype, tmp_buf, nbytes, MPI_BYTE, s); if (mpi_errno) MPIU_ERR_POP(mpi_errno); MPID_SCHED_BARRIER(s); /* now simultaneously send from tmp_buf and recv to recvbuf */ mpi_errno = MPID_Sched_send(tmp_buf, nbytes, MPI_BYTE, peer, comm_ptr, s); if (mpi_errno) MPIU_ERR_POP(mpi_errno); mpi_errno = MPID_Sched_recv(((char *)recvbuf + peer*recvcount*recvtype_extent), recvcount, recvtype, peer, comm_ptr, s); if (mpi_errno) MPIU_ERR_POP(mpi_errno); MPID_SCHED_BARRIER(s); } } } MPIR_SCHED_CHKPMEM_COMMIT(s); fn_exit: return mpi_errno; fn_fail: MPIR_SCHED_CHKPMEM_REAP(s); goto fn_exit; }
int MPIR_Iexscan(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, MPI_Op op, MPID_Comm *comm_ptr, MPID_Sched_t s) { int mpi_errno = MPI_SUCCESS; int rank, comm_size; int mask, dst, is_commutative, flag; MPI_Aint true_extent, true_lb, extent; void *partial_scan, *tmp_buf; MPIR_SCHED_CHKPMEM_DECL(2); if (count == 0) goto fn_exit; comm_size = comm_ptr->local_size; rank = comm_ptr->rank; is_commutative = MPIR_Op_is_commutative(op); /* need to allocate temporary buffer to store partial scan*/ MPIR_Type_get_true_extent_impl(datatype, &true_lb, &true_extent); MPID_Datatype_get_extent_macro(datatype, extent); MPIR_SCHED_CHKPMEM_MALLOC(partial_scan, void *, (count*(MPIR_MAX(true_extent,extent))), mpi_errno, "partial_scan"); /* adjust for potential negative lower bound in datatype */ partial_scan = (void *)((char*)partial_scan - true_lb); /* need to allocate temporary buffer to store incoming data*/ MPIR_SCHED_CHKPMEM_MALLOC(tmp_buf, void *, (count*(MPIR_MAX(true_extent,extent))), mpi_errno, "tmp_buf"); /* adjust for potential negative lower bound in datatype */ tmp_buf = (void *)((char*)tmp_buf - true_lb); mpi_errno = MPID_Sched_copy((sendbuf == MPI_IN_PLACE ? recvbuf : sendbuf), count, datatype, partial_scan, count, datatype, s); if (mpi_errno) MPIU_ERR_POP(mpi_errno); flag = 0; mask = 0x1; while (mask < comm_size) { dst = rank ^ mask; if (dst < comm_size) { /* Send partial_scan to dst. Recv into tmp_buf */ mpi_errno = MPID_Sched_send(partial_scan, count, datatype, dst, comm_ptr, s); if (mpi_errno) MPIU_ERR_POP(mpi_errno); /* sendrecv, no barrier here */ mpi_errno = MPID_Sched_recv(tmp_buf, count, datatype, dst, comm_ptr, s); if (mpi_errno) MPIU_ERR_POP(mpi_errno); MPID_SCHED_BARRIER(s); if (rank > dst) { mpi_errno = MPID_Sched_reduce(tmp_buf, partial_scan, count, datatype, op, s); if (mpi_errno) MPIU_ERR_POP(mpi_errno); MPID_SCHED_BARRIER(s); /* On rank 0, recvbuf is not defined. For sendbuf==MPI_IN_PLACE recvbuf must not change (per MPI-2.2). On rank 1, recvbuf is to be set equal to the value in sendbuf on rank 0. On others, recvbuf is the scan of values in the sendbufs on lower ranks. */ if (rank != 0) { if (flag == 0) { /* simply copy data recd from rank 0 into recvbuf */ mpi_errno = MPID_Sched_copy(tmp_buf, count, datatype, recvbuf, count, datatype, s); if (mpi_errno) MPIU_ERR_POP(mpi_errno); MPID_SCHED_BARRIER(s); flag = 1; } else { mpi_errno = MPID_Sched_reduce(tmp_buf, recvbuf, count, datatype, op, s); if (mpi_errno) MPIU_ERR_POP(mpi_errno); MPID_SCHED_BARRIER(s); } } } else { if (is_commutative) { mpi_errno = MPID_Sched_reduce(tmp_buf, partial_scan, count, datatype, op, s); if (mpi_errno) MPIU_ERR_POP(mpi_errno); MPID_SCHED_BARRIER(s); } else { mpi_errno = MPID_Sched_reduce(partial_scan, tmp_buf, count, datatype, op, s); if (mpi_errno) MPIU_ERR_POP(mpi_errno); MPID_SCHED_BARRIER(s); mpi_errno = MPID_Sched_copy(tmp_buf, count, datatype, partial_scan, count, datatype, s); if (mpi_errno) MPIU_ERR_POP(mpi_errno); MPID_SCHED_BARRIER(s); } } } mask <<= 1; } MPIR_SCHED_CHKPMEM_COMMIT(s); fn_exit: return mpi_errno; fn_fail: MPIR_SCHED_CHKPMEM_REAP(s); goto fn_exit; }