int MPIR_Alltoallv_intra(const void *sendbuf, const int *sendcounts, const int *sdispls,
                         MPI_Datatype sendtype, void *recvbuf, const int *recvcounts,
                         const int *rdispls, MPI_Datatype recvtype, MPID_Comm *comm_ptr,
                         int *errflag)
{
    int comm_size, i, j;
    MPI_Aint send_extent, recv_extent;
    int mpi_errno = MPI_SUCCESS;
    int mpi_errno_ret = MPI_SUCCESS;
    MPI_Status *starray;
    MPI_Status status;
    MPI_Request *reqarray;
    int dst, rank, req_cnt;
    MPI_Comm comm;
    int ii, ss, bblock;
    int type_size;
    MPIU_CHKLMEM_DECL(2);

    comm = comm_ptr->handle;
    comm_size = comm_ptr->local_size;
    rank = comm_ptr->rank;

    /* Get extent of recv type, but send type is only valid if (sendbuf!=MPI_IN_PLACE) */
    MPID_Datatype_get_extent_macro(recvtype, recv_extent);

    /* check if multiple threads are calling this collective function */
    MPIDU_ERR_CHECK_MULTIPLE_THREADS_ENTER( comm_ptr );

    if (sendbuf == MPI_IN_PLACE) {
        /* We use pair-wise sendrecv_replace in order to conserve memory usage,
         * which is keeping with the spirit of the MPI-2.2 Standard.  But
         * because of this approach all processes must agree on the global
         * schedule of sendrecv_replace operations to avoid deadlock.
         *
         * Note that this is not an especially efficient algorithm in terms of
         * time and there will be multiple repeated malloc/free's rather than
         * maintaining a single buffer across the whole loop.  Something like
         * MADRE is probably the best solution for the MPI_IN_PLACE scenario. */
        for (i = 0; i < comm_size; ++i) {
            /* start inner loop at i to avoid re-exchanging data */
            for (j = i; j < comm_size; ++j) {
                if (rank == i) {
                    /* also covers the (rank == i && rank == j) case */
                    mpi_errno = MPIC_Sendrecv_replace_ft(((char *)recvbuf + rdispls[j]*recv_extent),
                                                         recvcounts[j], recvtype,
                                                         j, MPIR_ALLTOALLV_TAG,
                                                         j, MPIR_ALLTOALLV_TAG,
                                                         comm, &status, errflag);
                    if (mpi_errno) {
                        /* for communication errors, just record the error but continue */
                        *errflag = TRUE;
                        MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
                        MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
                    }
                }
                else if (rank == j) {
                    /* same as above with i/j args reversed */
                    mpi_errno = MPIC_Sendrecv_replace_ft(((char *)recvbuf + rdispls[i]*recv_extent),
                                                         recvcounts[i], recvtype,
                                                         i, MPIR_ALLTOALLV_TAG,
                                                         i, MPIR_ALLTOALLV_TAG,
                                                         comm, &status, errflag);
                    if (mpi_errno) {
                        /* for communication errors, just record the error but continue */
                        *errflag = TRUE;
                        MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
                        MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
                    }
                }
            }
        }
    }
    else {
        bblock = MPIR_PARAM_ALLTOALL_THROTTLE;
        if (bblock == 0) bblock = comm_size;

        MPID_Datatype_get_extent_macro(sendtype, send_extent);

        MPIU_CHKLMEM_MALLOC(starray,  MPI_Status*,  2*bblock*sizeof(MPI_Status),  mpi_errno, "starray");
        MPIU_CHKLMEM_MALLOC(reqarray, MPI_Request*, 2*bblock*sizeof(MPI_Request), mpi_errno, "reqarray");

        /* post only bblock isends/irecvs at a time as suggested by Tony Ladd */
        for (ii = 0; ii < comm_size; ii += bblock) {
            req_cnt = 0;
            ss = comm_size - ii < bblock ? comm_size - ii : bblock;

            /* do the communication -- post ss sends and receives: */
            for (i = 0; i < ss; i++) {
                dst = (rank + i + ii) % comm_size;
                if (recvcounts[dst]) {
                    MPID_Datatype_get_size_macro(recvtype, type_size);
                    if (type_size) {
                        MPID_Ensure_Aint_fits_in_pointer(MPI_VOID_PTR_CAST_TO_MPI_AINT recvbuf +
                                                         rdispls[dst]*recv_extent);
                        mpi_errno = MPIC_Irecv_ft((char *)recvbuf + rdispls[dst]*recv_extent,
                                                  recvcounts[dst], recvtype, dst,
                                                  MPIR_ALLTOALLV_TAG, comm,
                                                  &reqarray[req_cnt]);
                        if (mpi_errno) {
                            /* for communication errors, just record the error but continue */
                            *errflag = TRUE;
                            MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
                            MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
                        }
                        req_cnt++;
                    }
                }
            }

            for (i = 0; i < ss; i++) {
                dst = (rank - i - ii + comm_size) % comm_size;
                if (sendcounts[dst]) {
                    MPID_Datatype_get_size_macro(sendtype, type_size);
                    if (type_size) {
                        MPID_Ensure_Aint_fits_in_pointer(MPI_VOID_PTR_CAST_TO_MPI_AINT sendbuf +
                                                         sdispls[dst]*send_extent);
                        mpi_errno = MPIC_Isend_ft((char *)sendbuf + sdispls[dst]*send_extent,
                                                  sendcounts[dst], sendtype, dst,
                                                  MPIR_ALLTOALLV_TAG, comm,
                                                  &reqarray[req_cnt], errflag);
                        if (mpi_errno) {
                            /* for communication errors, just record the error but continue */
                            *errflag = TRUE;
                            MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
                            MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
                        }
                        req_cnt++;
                    }
                }
            }

            mpi_errno = MPIC_Waitall_ft(req_cnt, reqarray, starray, errflag);
            if (mpi_errno && mpi_errno != MPI_ERR_IN_STATUS) MPIU_ERR_POP(mpi_errno);

            /* --BEGIN ERROR HANDLING-- */
            if (mpi_errno == MPI_ERR_IN_STATUS) {
                for (i = 0; i < req_cnt; i++) {
                    if (starray[i].MPI_ERROR != MPI_SUCCESS) {
                        mpi_errno = starray[i].MPI_ERROR;
                        if (mpi_errno) {
                            /* for communication errors, just record the error but continue */
                            *errflag = TRUE;
                            MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
                            MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
                        }
                    }
                }
            }
            /* --END ERROR HANDLING-- */
        }
    }

 fn_exit:
    /* check if multiple threads are calling this collective function */
    MPIDU_ERR_CHECK_MULTIPLE_THREADS_EXIT( comm_ptr );
    MPIU_CHKLMEM_FREEALL();

    if (mpi_errno_ret)
        mpi_errno = mpi_errno_ret;
    else if (*errflag)
        MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**coll_fail");

    return mpi_errno;

 fn_fail:
    goto fn_exit;
}
int MPIR_Gatherv ( void *sendbuf, int sendcnt, MPI_Datatype sendtype,
                   void *recvbuf, int *recvcnts, int *displs, MPI_Datatype recvtype,
                   int root, MPID_Comm *comm_ptr, int *errflag )
{
    int comm_size, rank;
    int mpi_errno = MPI_SUCCESS;
    int mpi_errno_ret = MPI_SUCCESS;
    MPI_Comm comm;
    MPI_Aint extent;
    int i, reqs;
    int min_procs;
    MPI_Request *reqarray;
    MPI_Status *starray;
    MPIU_CHKLMEM_DECL(2);

    comm = comm_ptr->handle;
    rank = comm_ptr->rank;

    /* check if multiple threads are calling this collective function */
    MPIDU_ERR_CHECK_MULTIPLE_THREADS_ENTER( comm_ptr );

    /* If rank == root, then I recv lots, otherwise I send */
    if (((comm_ptr->comm_kind == MPID_INTRACOMM) && (root == rank)) ||
        ((comm_ptr->comm_kind == MPID_INTERCOMM) && (root == MPI_ROOT))) {
        if (comm_ptr->comm_kind == MPID_INTRACOMM)
            comm_size = comm_ptr->local_size;
        else
            comm_size = comm_ptr->remote_size;

        MPID_Datatype_get_extent_macro(recvtype, extent);
        /* each node can make sure it is not going to overflow aint */
        MPID_Ensure_Aint_fits_in_pointer(MPI_VOID_PTR_CAST_TO_MPI_AINT recvbuf +
                                         displs[rank] * extent);

        MPIU_CHKLMEM_MALLOC(reqarray, MPI_Request *, comm_size * sizeof(MPI_Request),
                            mpi_errno, "reqarray");
        MPIU_CHKLMEM_MALLOC(starray, MPI_Status *, comm_size * sizeof(MPI_Status),
                            mpi_errno, "starray");

        reqs = 0;
        for (i = 0; i < comm_size; i++) {
            if (recvcnts[i]) {
                if ((comm_ptr->comm_kind == MPID_INTRACOMM) && (i == rank)) {
                    if (sendbuf != MPI_IN_PLACE) {
                        mpi_errno = MPIR_Localcopy(sendbuf, sendcnt, sendtype,
                                                   ((char *)recvbuf + displs[rank]*extent),
                                                   recvcnts[rank], recvtype);
                        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
                    }
                }
                else {
                    mpi_errno = MPIC_Irecv_ft(((char *)recvbuf + displs[i]*extent),
                                              recvcnts[i], recvtype, i,
                                              MPIR_GATHERV_TAG, comm,
                                              &reqarray[reqs++]);
                    if (mpi_errno) MPIU_ERR_POP(mpi_errno);
                }
            }
        }
        /* ... then wait for *all* of them to finish: */
        mpi_errno = MPIC_Waitall_ft(reqs, reqarray, starray, errflag);
        if (mpi_errno && mpi_errno != MPI_ERR_IN_STATUS) MPIU_ERR_POP(mpi_errno);

        /* --BEGIN ERROR HANDLING-- */
        if (mpi_errno == MPI_ERR_IN_STATUS) {
            for (i = 0; i < reqs; i++) {
                if (starray[i].MPI_ERROR != MPI_SUCCESS) {
                    mpi_errno = starray[i].MPI_ERROR;
                    if (mpi_errno) {
                        /* for communication errors, just record the error but continue */
                        *errflag = TRUE;
                        MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
                        MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
                    }
                }
            }
        }
        /* --END ERROR HANDLING-- */
    }

    else if (root != MPI_PROC_NULL) {
        /* non-root nodes, and in the intercomm. case, non-root nodes on remote side */