int MPIR_Alltoallv_intra(const void *sendbuf, const int *sendcounts, const int *sdispls, MPI_Datatype sendtype, void *recvbuf, const int *recvcounts, const int *rdispls, MPI_Datatype recvtype, MPID_Comm *comm_ptr, MPIR_Errflag_t *errflag) { int comm_size, i, j; MPI_Aint send_extent, recv_extent; int mpi_errno = MPI_SUCCESS; int mpi_errno_ret = MPI_SUCCESS; MPI_Status *starray; MPI_Status status; MPID_Request **reqarray; int dst, rank, req_cnt; int ii, ss, bblock; int type_size; MPIU_CHKLMEM_DECL(2); comm_size = comm_ptr->local_size; rank = comm_ptr->rank; /* Get extent of recv type, but send type is only valid if (sendbuf!=MPI_IN_PLACE) */ MPID_Datatype_get_extent_macro(recvtype, recv_extent); /* check if multiple threads are calling this collective function */ MPIDU_ERR_CHECK_MULTIPLE_THREADS_ENTER( comm_ptr ); if (sendbuf == MPI_IN_PLACE) { /* We use pair-wise sendrecv_replace in order to conserve memory usage, * which is keeping with the spirit of the MPI-2.2 Standard. But * because of this approach all processes must agree on the global * schedule of sendrecv_replace operations to avoid deadlock. * * Note that this is not an especially efficient algorithm in terms of * time and there will be multiple repeated malloc/free's rather than * maintaining a single buffer across the whole loop. Something like * MADRE is probably the best solution for the MPI_IN_PLACE scenario. */ for (i = 0; i < comm_size; ++i) { /* start inner loop at i to avoid re-exchanging data */ for (j = i; j < comm_size; ++j) { if (rank == i) { /* also covers the (rank == i && rank == j) case */ mpi_errno = MPIC_Sendrecv_replace(((char *)recvbuf + rdispls[j]*recv_extent), recvcounts[j], recvtype, j, MPIR_ALLTOALLV_TAG, j, MPIR_ALLTOALLV_TAG, comm_ptr, &status, errflag); if (mpi_errno) { /* for communication errors, just record the error but continue */ *errflag = MPIR_ERR_GET_CLASS(mpi_errno); MPIR_ERR_SET(mpi_errno, *errflag, "**fail"); MPIR_ERR_ADD(mpi_errno_ret, mpi_errno); } } else if (rank == j) { /* same as above with i/j args reversed */ mpi_errno = MPIC_Sendrecv_replace(((char *)recvbuf + rdispls[i]*recv_extent), recvcounts[i], recvtype, i, MPIR_ALLTOALLV_TAG, i, MPIR_ALLTOALLV_TAG, comm_ptr, &status, errflag); if (mpi_errno) { /* for communication errors, just record the error but continue */ *errflag = MPIR_ERR_GET_CLASS(mpi_errno); MPIR_ERR_SET(mpi_errno, *errflag, "**fail"); MPIR_ERR_ADD(mpi_errno_ret, mpi_errno); } } } } } else { bblock = MPIR_CVAR_ALLTOALL_THROTTLE; if (bblock == 0) bblock = comm_size; MPID_Datatype_get_extent_macro(sendtype, send_extent); MPIU_CHKLMEM_MALLOC(starray, MPI_Status*, 2*bblock*sizeof(MPI_Status), mpi_errno, "starray"); MPIU_CHKLMEM_MALLOC(reqarray, MPID_Request**, 2*bblock*sizeof(MPID_Request *), mpi_errno, "reqarray"); /* post only bblock isends/irecvs at a time as suggested by Tony Ladd */ for (ii=0; ii<comm_size; ii+=bblock) { req_cnt = 0; ss = comm_size-ii < bblock ? comm_size-ii : bblock; /* do the communication -- post ss sends and receives: */ for ( i=0; i<ss; i++ ) { dst = (rank+i+ii) % comm_size; if (recvcounts[dst]) { MPID_Datatype_get_size_macro(recvtype, type_size); if (type_size) { MPIU_Ensure_Aint_fits_in_pointer(MPIU_VOID_PTR_CAST_TO_MPI_AINT recvbuf + rdispls[dst]*recv_extent); mpi_errno = MPIC_Irecv((char *)recvbuf+rdispls[dst]*recv_extent, recvcounts[dst], recvtype, dst, MPIR_ALLTOALLV_TAG, comm_ptr, &reqarray[req_cnt]); if (mpi_errno) { /* for communication errors, just record the error but continue */ *errflag = MPIR_ERR_GET_CLASS(mpi_errno); MPIR_ERR_SET(mpi_errno, *errflag, "**fail"); MPIR_ERR_ADD(mpi_errno_ret, mpi_errno); } req_cnt++; } } } for ( i=0; i<ss; i++ ) { dst = (rank-i-ii+comm_size) % comm_size; if (sendcounts[dst]) { MPID_Datatype_get_size_macro(sendtype, type_size); if (type_size) { MPIU_Ensure_Aint_fits_in_pointer(MPIU_VOID_PTR_CAST_TO_MPI_AINT sendbuf + sdispls[dst]*send_extent); mpi_errno = MPIC_Isend((char *)sendbuf+sdispls[dst]*send_extent, sendcounts[dst], sendtype, dst, MPIR_ALLTOALLV_TAG, comm_ptr, &reqarray[req_cnt], errflag); if (mpi_errno) { /* for communication errors, just record the error but continue */ *errflag = MPIR_ERR_GET_CLASS(mpi_errno); MPIR_ERR_SET(mpi_errno, *errflag, "**fail"); MPIR_ERR_ADD(mpi_errno_ret, mpi_errno); } req_cnt++; } } } mpi_errno = MPIC_Waitall(req_cnt, reqarray, starray, errflag); if (mpi_errno && mpi_errno != MPI_ERR_IN_STATUS) MPIR_ERR_POP(mpi_errno); /* --BEGIN ERROR HANDLING-- */ if (mpi_errno == MPI_ERR_IN_STATUS) { for (i=0; i<req_cnt; i++) { if (starray[i].MPI_ERROR != MPI_SUCCESS) { mpi_errno = starray[i].MPI_ERROR; if (mpi_errno) { /* for communication errors, just record the error but continue */ *errflag = MPIR_ERR_GET_CLASS(mpi_errno); MPIR_ERR_SET(mpi_errno, *errflag, "**fail"); MPIR_ERR_ADD(mpi_errno_ret, mpi_errno); } } } } /* --END ERROR HANDLING-- */ } } fn_exit: /* check if multiple threads are calling this collective function */ MPIDU_ERR_CHECK_MULTIPLE_THREADS_EXIT( comm_ptr ); MPIU_CHKLMEM_FREEALL(); if (mpi_errno_ret) mpi_errno = mpi_errno_ret; else if (*errflag != MPIR_ERR_NONE) MPIR_ERR_SET(mpi_errno, *errflag, "**coll_fail"); return mpi_errno; fn_fail: goto fn_exit; }
/*@ MPI_Dist_graph_create - MPI_DIST_GRAPH_CREATE returns a handle to a new communicator to which the distributed graph topology information is attached. Input Parameters: + comm_old - input communicator (handle) . n - number of source nodes for which this process specifies edges (non-negative integer) . sources - array containing the n source nodes for which this process specifies edges (array of non-negative integers) . degrees - array specifying the number of destinations for each source node in the source node array (array of non-negative integers) . destinations - destination nodes for the source nodes in the source node array (array of non-negative integers) . weights - weights for source to destination edges (array of non-negative integers or MPI_UNWEIGHTED) . info - hints on optimization and interpretation of weights (handle) - reorder - the process may be reordered (true) or not (false) (logical) Output Parameters: . comm_dist_graph - communicator with distributed graph topology added (handle) .N ThreadSafe .N Fortran .N Errors .N MPI_SUCCESS .N MPI_ERR_ARG .N MPI_ERR_OTHER @*/ int MPI_Dist_graph_create(MPI_Comm comm_old, int n, const int sources[], const int degrees[], const int destinations[], const int weights[], MPI_Info info, int reorder, MPI_Comm * comm_dist_graph) { int mpi_errno = MPI_SUCCESS; MPIR_Comm *comm_ptr = NULL; MPIR_Comm *comm_dist_graph_ptr = NULL; MPIR_Request **reqs = NULL; MPIR_Topology *topo_ptr = NULL; MPII_Dist_graph_topology *dist_graph_ptr = NULL; int i; int j; int idx; int comm_size = 0; int in_capacity; int out_capacity; int **rout = NULL; int **rin = NULL; int *rin_sizes; int *rout_sizes; int *rin_idx; int *rout_idx; int *rs; int in_out_peers[2] = { -1, -1 }; MPIR_Errflag_t errflag = MPIR_ERR_NONE; MPIR_CHKLMEM_DECL(9); MPIR_CHKPMEM_DECL(1); MPIR_FUNC_TERSE_STATE_DECL(MPID_STATE_MPI_DIST_GRAPH_CREATE); MPIR_ERRTEST_INITIALIZED_ORDIE(); MPID_THREAD_CS_ENTER(GLOBAL, MPIR_THREAD_GLOBAL_ALLFUNC_MUTEX); MPIR_FUNC_TERSE_ENTER(MPID_STATE_MPI_DIST_GRAPH_CREATE); /* Validate parameters, especially handles needing to be converted */ #ifdef HAVE_ERROR_CHECKING { MPID_BEGIN_ERROR_CHECKS; { MPIR_ERRTEST_COMM(comm_old, mpi_errno); MPIR_ERRTEST_INFO_OR_NULL(info, mpi_errno); if (mpi_errno != MPI_SUCCESS) goto fn_fail; } MPID_END_ERROR_CHECKS; } #endif /* Convert MPI object handles to object pointers */ MPIR_Comm_get_ptr(comm_old, comm_ptr); /* Validate parameters and objects (post conversion) */ #ifdef HAVE_ERROR_CHECKING { MPID_BEGIN_ERROR_CHECKS; { /* Validate comm_ptr */ MPIR_Comm_valid_ptr(comm_ptr, mpi_errno, FALSE); /* If comm_ptr is not valid, it will be reset to null */ if (comm_ptr) { MPIR_ERRTEST_COMM_INTRA(comm_ptr, mpi_errno); } MPIR_ERRTEST_ARGNEG(n, "n", mpi_errno); if (n > 0) { int have_degrees = 0; MPIR_ERRTEST_ARGNULL(sources, "sources", mpi_errno); MPIR_ERRTEST_ARGNULL(degrees, "degrees", mpi_errno); for (i = 0; i < n; ++i) { if (degrees[i]) { have_degrees = 1; break; } } if (have_degrees) { MPIR_ERRTEST_ARGNULL(destinations, "destinations", mpi_errno); if (weights != MPI_UNWEIGHTED) MPIR_ERRTEST_ARGNULL(weights, "weights", mpi_errno); } } if (mpi_errno != MPI_SUCCESS) goto fn_fail; } MPID_END_ERROR_CHECKS; } #endif /* HAVE_ERROR_CHECKING */ /* ... body of routine ... */ /* Implementation based on Torsten Hoefler's reference implementation * attached to MPI-2.2 ticket #33. */ *comm_dist_graph = MPI_COMM_NULL; comm_size = comm_ptr->local_size; /* following the spirit of the old topo interface, attributes do not * propagate to the new communicator (see MPI-2.1 pp. 243 line 11) */ mpi_errno = MPII_Comm_copy(comm_ptr, comm_size, &comm_dist_graph_ptr); if (mpi_errno) MPIR_ERR_POP(mpi_errno); MPIR_Assert(comm_dist_graph_ptr != NULL); /* rin is an array of size comm_size containing pointers to arrays of * rin_sizes[x]. rin[x] is locally known number of edges into this process * from rank x. * * rout is an array of comm_size containing pointers to arrays of * rout_sizes[x]. rout[x] is the locally known number of edges out of this * process to rank x. */ MPIR_CHKLMEM_MALLOC(rout, int **, comm_size * sizeof(int *), mpi_errno, "rout", MPL_MEM_COMM); MPIR_CHKLMEM_MALLOC(rin, int **, comm_size * sizeof(int *), mpi_errno, "rin", MPL_MEM_COMM); MPIR_CHKLMEM_MALLOC(rin_sizes, int *, comm_size * sizeof(int), mpi_errno, "rin_sizes", MPL_MEM_COMM); MPIR_CHKLMEM_MALLOC(rout_sizes, int *, comm_size * sizeof(int), mpi_errno, "rout_sizes", MPL_MEM_COMM); MPIR_CHKLMEM_MALLOC(rin_idx, int *, comm_size * sizeof(int), mpi_errno, "rin_idx", MPL_MEM_COMM); MPIR_CHKLMEM_MALLOC(rout_idx, int *, comm_size * sizeof(int), mpi_errno, "rout_idx", MPL_MEM_COMM); memset(rout, 0, comm_size * sizeof(int *)); memset(rin, 0, comm_size * sizeof(int *)); memset(rin_sizes, 0, comm_size * sizeof(int)); memset(rout_sizes, 0, comm_size * sizeof(int)); memset(rin_idx, 0, comm_size * sizeof(int)); memset(rout_idx, 0, comm_size * sizeof(int)); /* compute array sizes */ idx = 0; for (i = 0; i < n; ++i) { MPIR_Assert(sources[i] < comm_size); for (j = 0; j < degrees[i]; ++j) { MPIR_Assert(destinations[idx] < comm_size); /* rout_sizes[i] is twice as long as the number of edges to be * sent to rank i by this process */ rout_sizes[sources[i]] += 2; rin_sizes[destinations[idx]] += 2; ++idx; } } /* allocate arrays */ for (i = 0; i < comm_size; ++i) { /* can't use CHKLMEM macros b/c we are in a loop */ if (rin_sizes[i]) { rin[i] = MPL_malloc(rin_sizes[i] * sizeof(int), MPL_MEM_COMM); } if (rout_sizes[i]) { rout[i] = MPL_malloc(rout_sizes[i] * sizeof(int), MPL_MEM_COMM); } } /* populate arrays */ idx = 0; for (i = 0; i < n; ++i) { /* TODO add this assert as proper error checking above */ int s_rank = sources[i]; MPIR_Assert(s_rank < comm_size); MPIR_Assert(s_rank >= 0); for (j = 0; j < degrees[i]; ++j) { int d_rank = destinations[idx]; int weight = (weights == MPI_UNWEIGHTED ? 0 : weights[idx]); /* TODO add this assert as proper error checking above */ MPIR_Assert(d_rank < comm_size); MPIR_Assert(d_rank >= 0); /* XXX DJG what about self-edges? do we need to drop one of these * cases when there is a self-edge to avoid double-counting? */ /* rout[s][2*x] is the value of d for the j'th edge between (s,d) * with weight rout[s][2*x+1], where x is the current end of the * outgoing edge list for s. x==(rout_idx[s]/2) */ rout[s_rank][rout_idx[s_rank]++] = d_rank; rout[s_rank][rout_idx[s_rank]++] = weight; /* rin[d][2*x] is the value of s for the j'th edge between (s,d) * with weight rout[d][2*x+1], where x is the current end of the * incoming edge list for d. x==(rin_idx[d]/2) */ rin[d_rank][rin_idx[d_rank]++] = s_rank; rin[d_rank][rin_idx[d_rank]++] = weight; ++idx; } } for (i = 0; i < comm_size; ++i) { /* sanity check that all arrays are fully populated */ MPIR_Assert(rin_idx[i] == rin_sizes[i]); MPIR_Assert(rout_idx[i] == rout_sizes[i]); } MPIR_CHKLMEM_MALLOC(rs, int *, 2 * comm_size * sizeof(int), mpi_errno, "red-scat source buffer", MPL_MEM_COMM); for (i = 0; i < comm_size; ++i) { rs[2 * i] = (rin_sizes[i] ? 1 : 0); rs[2 * i + 1] = (rout_sizes[i] ? 1 : 0); } /* compute the number of peers I will recv from */ mpi_errno = MPIR_Reduce_scatter_block(rs, in_out_peers, 2, MPI_INT, MPI_SUM, comm_ptr, &errflag); if (mpi_errno) MPIR_ERR_POP(mpi_errno); MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail"); MPIR_Assert(in_out_peers[0] <= comm_size && in_out_peers[0] >= 0); MPIR_Assert(in_out_peers[1] <= comm_size && in_out_peers[1] >= 0); idx = 0; /* must be 2*comm_size requests because we will possibly send inbound and * outbound edges to everyone in our communicator */ MPIR_CHKLMEM_MALLOC(reqs, MPIR_Request **, 2 * comm_size * sizeof(MPIR_Request *), mpi_errno, "temp request array", MPL_MEM_COMM); for (i = 0; i < comm_size; ++i) { if (rin_sizes[i]) { /* send edges where i is a destination to process i */ mpi_errno = MPIC_Isend(&rin[i][0], rin_sizes[i], MPI_INT, i, MPIR_TOPO_A_TAG, comm_ptr, &reqs[idx++], &errflag); if (mpi_errno) MPIR_ERR_POP(mpi_errno); } if (rout_sizes[i]) { /* send edges where i is a source to process i */ mpi_errno = MPIC_Isend(&rout[i][0], rout_sizes[i], MPI_INT, i, MPIR_TOPO_B_TAG, comm_ptr, &reqs[idx++], &errflag); if (mpi_errno) MPIR_ERR_POP(mpi_errno); } } MPIR_Assert(idx <= (2 * comm_size)); /* Create the topology structure */ MPIR_CHKPMEM_MALLOC(topo_ptr, MPIR_Topology *, sizeof(MPIR_Topology), mpi_errno, "topo_ptr", MPL_MEM_COMM); topo_ptr->kind = MPI_DIST_GRAPH; dist_graph_ptr = &topo_ptr->topo.dist_graph; dist_graph_ptr->indegree = 0; dist_graph_ptr->in = NULL; dist_graph_ptr->in_weights = NULL; dist_graph_ptr->outdegree = 0; dist_graph_ptr->out = NULL; dist_graph_ptr->out_weights = NULL; dist_graph_ptr->is_weighted = (weights != MPI_UNWEIGHTED); /* can't use CHKPMEM macros for this b/c we need to realloc */ in_capacity = 10; /* arbitrary */ dist_graph_ptr->in = MPL_malloc(in_capacity * sizeof(int), MPL_MEM_COMM); if (dist_graph_ptr->is_weighted) { dist_graph_ptr->in_weights = MPL_malloc(in_capacity * sizeof(int), MPL_MEM_COMM); MPIR_Assert(dist_graph_ptr->in_weights != NULL); } out_capacity = 10; /* arbitrary */ dist_graph_ptr->out = MPL_malloc(out_capacity * sizeof(int), MPL_MEM_COMM); if (dist_graph_ptr->is_weighted) { dist_graph_ptr->out_weights = MPL_malloc(out_capacity * sizeof(int), MPL_MEM_COMM); MPIR_Assert(dist_graph_ptr->out_weights); } for (i = 0; i < in_out_peers[0]; ++i) { MPI_Status status; MPI_Aint count; int *buf; /* receive inbound edges */ mpi_errno = MPIC_Probe(MPI_ANY_SOURCE, MPIR_TOPO_A_TAG, comm_old, &status); if (mpi_errno) MPIR_ERR_POP(mpi_errno); MPIR_Get_count_impl(&status, MPI_INT, &count); /* can't use CHKLMEM macros b/c we are in a loop */ /* FIXME: Why not - there is only one allocated at a time. Is it only * that there is no defined macro to pop and free an item? */ buf = MPL_malloc(count * sizeof(int), MPL_MEM_COMM); MPIR_ERR_CHKANDJUMP(!buf, mpi_errno, MPI_ERR_OTHER, "**nomem"); mpi_errno = MPIC_Recv(buf, count, MPI_INT, MPI_ANY_SOURCE, MPIR_TOPO_A_TAG, comm_ptr, MPI_STATUS_IGNORE, &errflag); /* FIXME: buf is never freed on error! */ if (mpi_errno) MPIR_ERR_POP(mpi_errno); for (j = 0; j < count / 2; ++j) { int deg = dist_graph_ptr->indegree++; if (deg >= in_capacity) { in_capacity *= 2; /* FIXME: buf is never freed on error! */ MPIR_REALLOC_ORJUMP(dist_graph_ptr->in, in_capacity * sizeof(int), MPL_MEM_COMM, mpi_errno); if (dist_graph_ptr->is_weighted) /* FIXME: buf is never freed on error! */ MPIR_REALLOC_ORJUMP(dist_graph_ptr->in_weights, in_capacity * sizeof(int), MPL_MEM_COMM, mpi_errno); } dist_graph_ptr->in[deg] = buf[2 * j]; if (dist_graph_ptr->is_weighted) dist_graph_ptr->in_weights[deg] = buf[2 * j + 1]; } MPL_free(buf); } for (i = 0; i < in_out_peers[1]; ++i) { MPI_Status status; MPI_Aint count; int *buf; /* receive outbound edges */ mpi_errno = MPIC_Probe(MPI_ANY_SOURCE, MPIR_TOPO_B_TAG, comm_old, &status); if (mpi_errno) MPIR_ERR_POP(mpi_errno); MPIR_Get_count_impl(&status, MPI_INT, &count); /* can't use CHKLMEM macros b/c we are in a loop */ /* Why not? */ buf = MPL_malloc(count * sizeof(int), MPL_MEM_COMM); MPIR_ERR_CHKANDJUMP(!buf, mpi_errno, MPI_ERR_OTHER, "**nomem"); mpi_errno = MPIC_Recv(buf, count, MPI_INT, MPI_ANY_SOURCE, MPIR_TOPO_B_TAG, comm_ptr, MPI_STATUS_IGNORE, &errflag); /* FIXME: buf is never freed on error! */ if (mpi_errno) MPIR_ERR_POP(mpi_errno); for (j = 0; j < count / 2; ++j) { int deg = dist_graph_ptr->outdegree++; if (deg >= out_capacity) { out_capacity *= 2; /* FIXME: buf is never freed on error! */ MPIR_REALLOC_ORJUMP(dist_graph_ptr->out, out_capacity * sizeof(int), MPL_MEM_COMM, mpi_errno); if (dist_graph_ptr->is_weighted) /* FIXME: buf is never freed on error! */ MPIR_REALLOC_ORJUMP(dist_graph_ptr->out_weights, out_capacity * sizeof(int), MPL_MEM_COMM, mpi_errno); } dist_graph_ptr->out[deg] = buf[2 * j]; if (dist_graph_ptr->is_weighted) dist_graph_ptr->out_weights[deg] = buf[2 * j + 1]; } MPL_free(buf); } mpi_errno = MPIC_Waitall(idx, reqs, MPI_STATUSES_IGNORE, &errflag); if (mpi_errno) MPIR_ERR_POP(mpi_errno); /* remove any excess memory allocation */ MPIR_REALLOC_ORJUMP(dist_graph_ptr->in, dist_graph_ptr->indegree * sizeof(int), MPL_MEM_COMM, mpi_errno); MPIR_REALLOC_ORJUMP(dist_graph_ptr->out, dist_graph_ptr->outdegree * sizeof(int), MPL_MEM_COMM, mpi_errno); if (dist_graph_ptr->is_weighted) { MPIR_REALLOC_ORJUMP(dist_graph_ptr->in_weights, dist_graph_ptr->indegree * sizeof(int), MPL_MEM_COMM, mpi_errno); MPIR_REALLOC_ORJUMP(dist_graph_ptr->out_weights, dist_graph_ptr->outdegree * sizeof(int), MPL_MEM_COMM, mpi_errno); } mpi_errno = MPIR_Topology_put(comm_dist_graph_ptr, topo_ptr); if (mpi_errno) MPIR_ERR_POP(mpi_errno); MPIR_CHKPMEM_COMMIT(); MPIR_OBJ_PUBLISH_HANDLE(*comm_dist_graph, comm_dist_graph_ptr->handle); /* ... end of body of routine ... */ fn_exit: for (i = 0; i < comm_size; ++i) { MPL_free(rin[i]); MPL_free(rout[i]); } MPIR_CHKLMEM_FREEALL(); MPIR_FUNC_TERSE_EXIT(MPID_STATE_MPI_DIST_GRAPH_CREATE); MPID_THREAD_CS_EXIT(GLOBAL, MPIR_THREAD_GLOBAL_ALLFUNC_MUTEX); return mpi_errno; /* --BEGIN ERROR HANDLING-- */ fn_fail: if (dist_graph_ptr) { MPL_free(dist_graph_ptr->in); MPL_free(dist_graph_ptr->in_weights); MPL_free(dist_graph_ptr->out); MPL_free(dist_graph_ptr->out_weights); } MPIR_CHKPMEM_REAP(); #ifdef HAVE_ERROR_CHECKING mpi_errno = MPIR_Err_create_code(mpi_errno, MPIR_ERR_RECOVERABLE, __func__, __LINE__, MPI_ERR_OTHER, "**mpi_dist_graph_create", "**mpi_dist_graph_create %C %d %p %p %p %p %I %d %p", comm_old, n, sources, degrees, destinations, weights, info, reorder, comm_dist_graph); #endif mpi_errno = MPIR_Err_return_comm(comm_ptr, __func__, mpi_errno); goto fn_exit; /* --END ERROR HANDLING-- */ }
int MPIR_Alltoall_intra_scattered( const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf, int recvcount, MPI_Datatype recvtype, MPIR_Comm *comm_ptr, MPIR_Errflag_t *errflag ) { int comm_size, i, j; MPI_Aint sendtype_extent, recvtype_extent; int mpi_errno=MPI_SUCCESS, dst, rank; int mpi_errno_ret = MPI_SUCCESS; MPI_Datatype newtype = MPI_DATATYPE_NULL; MPIR_Request **reqarray; MPI_Status *starray; MPIR_CHKLMEM_DECL(6); if (recvcount == 0) return MPI_SUCCESS; comm_size = comm_ptr->local_size; rank = comm_ptr->rank; #ifdef HAVE_ERROR_CHECKING MPIR_Assert(sendbuf != MPI_IN_PLACE); #endif /* HAVE_ERROR_CHECKING */ /* Get extent of send and recv types */ MPIR_Datatype_get_extent_macro(recvtype, recvtype_extent); MPIR_Datatype_get_extent_macro(sendtype, sendtype_extent); int ii, ss, bblock; bblock = MPIR_CVAR_ALLTOALL_THROTTLE; if (bblock == 0) bblock = comm_size; MPIR_CHKLMEM_MALLOC(reqarray, MPIR_Request **, 2*bblock*sizeof(MPIR_Request*), mpi_errno, "reqarray", MPL_MEM_BUFFER); MPIR_CHKLMEM_MALLOC(starray, MPI_Status *, 2*bblock*sizeof(MPI_Status), mpi_errno, "starray", MPL_MEM_BUFFER); for (ii=0; ii<comm_size; ii+=bblock) { ss = comm_size-ii < bblock ? comm_size-ii : bblock; /* do the communication -- post ss sends and receives: */ for ( i=0; i<ss; i++ ) { dst = (rank+i+ii) % comm_size; mpi_errno = MPIC_Irecv((char *)recvbuf + dst*recvcount*recvtype_extent, recvcount, recvtype, dst, MPIR_ALLTOALL_TAG, comm_ptr, &reqarray[i]); if (mpi_errno) MPIR_ERR_POP(mpi_errno); } for ( i=0; i<ss; i++ ) { dst = (rank-i-ii+comm_size) % comm_size; mpi_errno = MPIC_Isend((char *)sendbuf + dst*sendcount*sendtype_extent, sendcount, sendtype, dst, MPIR_ALLTOALL_TAG, comm_ptr, &reqarray[i+ss], errflag); if (mpi_errno) MPIR_ERR_POP(mpi_errno); } /* ... then wait for them to finish: */ mpi_errno = MPIC_Waitall(2*ss,reqarray,starray, errflag); if (mpi_errno && mpi_errno != MPI_ERR_IN_STATUS) MPIR_ERR_POP(mpi_errno); /* --BEGIN ERROR HANDLING-- */ if (mpi_errno == MPI_ERR_IN_STATUS) { for (j=0; j<2*ss; j++) { if (starray[j].MPI_ERROR != MPI_SUCCESS) { mpi_errno = starray[j].MPI_ERROR; if (mpi_errno) { /* for communication errors, just record the error but continue */ *errflag = MPIR_ERR_GET_CLASS(mpi_errno); MPIR_ERR_SET(mpi_errno, *errflag, "**fail"); MPIR_ERR_ADD(mpi_errno_ret, mpi_errno); } } } } /* --END ERROR HANDLING-- */ } fn_exit: MPIR_CHKLMEM_FREEALL(); if (mpi_errno_ret) mpi_errno = mpi_errno_ret; else if (*errflag != MPIR_ERR_NONE) MPIR_ERR_SET(mpi_errno, *errflag, "**coll_fail"); return mpi_errno; fn_fail: if (newtype != MPI_DATATYPE_NULL) MPIR_Type_free_impl(&newtype); goto fn_exit; }
int MPIR_Alltoallw_intra(const void *sendbuf, const int sendcounts[], const int sdispls[], const MPI_Datatype sendtypes[], void *recvbuf, const int recvcounts[], const int rdispls[], const MPI_Datatype recvtypes[], MPID_Comm *comm_ptr, int *errflag) { int comm_size, i, j; int mpi_errno = MPI_SUCCESS; int mpi_errno_ret = MPI_SUCCESS; MPI_Status status; MPI_Status *starray; MPI_Request *reqarray; int dst, rank; MPI_Comm comm; int outstanding_requests; int ii, ss, bblock; int type_size; MPIU_CHKLMEM_DECL(2); comm = comm_ptr->handle; comm_size = comm_ptr->local_size; rank = comm_ptr->rank; /* check if multiple threads are calling this collective function */ MPIDU_ERR_CHECK_MULTIPLE_THREADS_ENTER( comm_ptr ); if (sendbuf == MPI_IN_PLACE) { /* We use pair-wise sendrecv_replace in order to conserve memory usage, * which is keeping with the spirit of the MPI-2.2 Standard. But * because of this approach all processes must agree on the global * schedule of sendrecv_replace operations to avoid deadlock. * * Note that this is not an especially efficient algorithm in terms of * time and there will be multiple repeated malloc/free's rather than * maintaining a single buffer across the whole loop. Something like * MADRE is probably the best solution for the MPI_IN_PLACE scenario. */ for (i = 0; i < comm_size; ++i) { /* start inner loop at i to avoid re-exchanging data */ for (j = i; j < comm_size; ++j) { if (rank == i) { /* also covers the (rank == i && rank == j) case */ mpi_errno = MPIC_Sendrecv_replace(((char *)recvbuf + rdispls[j]), recvcounts[j], recvtypes[j], j, MPIR_ALLTOALLW_TAG, j, MPIR_ALLTOALLW_TAG, comm, &status, errflag); if (mpi_errno) { /* for communication errors, just record the error but continue */ *errflag = TRUE; MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail"); MPIU_ERR_ADD(mpi_errno_ret, mpi_errno); } } else if (rank == j) { /* same as above with i/j args reversed */ mpi_errno = MPIC_Sendrecv_replace(((char *)recvbuf + rdispls[i]), recvcounts[i], recvtypes[i], i, MPIR_ALLTOALLW_TAG, i, MPIR_ALLTOALLW_TAG, comm, &status, errflag); if (mpi_errno) { /* for communication errors, just record the error but continue */ *errflag = TRUE; MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail"); MPIU_ERR_ADD(mpi_errno_ret, mpi_errno); } } } } } else { bblock = MPIR_CVAR_ALLTOALL_THROTTLE; if (bblock == 0) bblock = comm_size; MPIU_CHKLMEM_MALLOC(starray, MPI_Status*, 2*bblock*sizeof(MPI_Status), mpi_errno, "starray"); MPIU_CHKLMEM_MALLOC(reqarray, MPI_Request*, 2*bblock*sizeof(MPI_Request), mpi_errno, "reqarray"); /* post only bblock isends/irecvs at a time as suggested by Tony Ladd */ for (ii=0; ii<comm_size; ii+=bblock) { outstanding_requests = 0; ss = comm_size-ii < bblock ? comm_size-ii : bblock; /* do the communication -- post ss sends and receives: */ for ( i=0; i<ss; i++ ) { dst = (rank+i+ii) % comm_size; if (recvcounts[dst]) { MPID_Datatype_get_size_macro(recvtypes[dst], type_size); if (type_size) { mpi_errno = MPIC_Irecv((char *)recvbuf+rdispls[dst], recvcounts[dst], recvtypes[dst], dst, MPIR_ALLTOALLW_TAG, comm, &reqarray[outstanding_requests]); if (mpi_errno) { MPIU_ERR_POP(mpi_errno); } outstanding_requests++; } } } for ( i=0; i<ss; i++ ) { dst = (rank-i-ii+comm_size) % comm_size; if (sendcounts[dst]) { MPID_Datatype_get_size_macro(sendtypes[dst], type_size); if (type_size) { mpi_errno = MPIC_Isend((char *)sendbuf+sdispls[dst], sendcounts[dst], sendtypes[dst], dst, MPIR_ALLTOALLW_TAG, comm, &reqarray[outstanding_requests], errflag); if (mpi_errno) { MPIU_ERR_POP(mpi_errno); } outstanding_requests++; } } } mpi_errno = MPIC_Waitall(outstanding_requests, reqarray, starray, errflag); if (mpi_errno && mpi_errno != MPI_ERR_IN_STATUS) MPIU_ERR_POP(mpi_errno); /* --BEGIN ERROR HANDLING-- */ if (mpi_errno == MPI_ERR_IN_STATUS) { for (i=0; i<outstanding_requests; i++) { if (starray[i].MPI_ERROR != MPI_SUCCESS) { mpi_errno = starray[i].MPI_ERROR; if (mpi_errno) { /* for communication errors, just record the error but continue */ *errflag = TRUE; MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail"); MPIU_ERR_ADD(mpi_errno_ret, mpi_errno); } } } } /* --END ERROR HANDLING-- */ } #ifdef FOO /* Use pairwise exchange algorithm. */ /* Make local copy first */ mpi_errno = MPIR_Localcopy(((char *)sendbuf+sdispls[rank]), sendcounts[rank], sendtypes[rank], ((char *)recvbuf+rdispls[rank]), recvcounts[rank], recvtypes[rank]); if (mpi_errno) MPIU_ERR_POP(mpi_errno); /* Do the pairwise exchange. */ for (i=1; i<comm_size; i++) { src = (rank - i + comm_size) % comm_size; dst = (rank + i) % comm_size; mpi_errno = MPIC_Sendrecv(((char *)sendbuf+sdispls[dst]), sendcounts[dst], sendtypes[dst], dst, MPIR_ALLTOALLW_TAG, ((char *)recvbuf+rdispls[src]), recvcounts[src], recvtypes[dst], src, MPIR_ALLTOALLW_TAG, comm, &status, errflag); if (mpi_errno) { /* for communication errors, just record the error but continue */ *errflag = TRUE; MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail"); MPIU_ERR_ADD(mpi_errno_ret, mpi_errno); } } #endif } /* check if multiple threads are calling this collective function */ fn_exit: MPIDU_ERR_CHECK_MULTIPLE_THREADS_EXIT( comm_ptr ); MPIU_CHKLMEM_FREEALL(); if (mpi_errno_ret) mpi_errno = mpi_errno_ret; else if (*errflag) MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**coll_fail"); return mpi_errno; fn_fail: goto fn_exit; }