/* * alltoallv_intra * * Function: - MPI_Alltoallv * Accepts: - same as MPI_Alltoallv() * Returns: - MPI_SUCCESS or an MPI error code */ int mca_coll_self_alltoallv_intra(const void *sbuf, const int *scounts, const int *sdisps, struct ompi_datatype_t *sdtype, void *rbuf, const int *rcounts, const int *rdisps, struct ompi_datatype_t *rdtype, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { int err; ptrdiff_t lb, rextent, sextent; if (MPI_IN_PLACE == sbuf) { return MPI_SUCCESS; } err = ompi_datatype_get_extent(sdtype, &lb, &sextent); if (OMPI_SUCCESS != err) { return OMPI_ERROR; } err = ompi_datatype_get_extent(rdtype, &lb, &rextent); if (OMPI_SUCCESS != err) { return OMPI_ERROR; } return ompi_datatype_sndrcv(((char *) sbuf) + sdisps[0] * sextent, scounts[0], sdtype, ((char *) rbuf) + rdisps[0] * rextent, rcounts[0], rdtype); }
int ompi_coll_base_alltoall_intra_pairwise(const void *sbuf, int scount, struct ompi_datatype_t *sdtype, void* rbuf, int rcount, struct ompi_datatype_t *rdtype, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { int line = -1, err = 0, rank, size, step, sendto, recvfrom; void * tmpsend, *tmprecv; ptrdiff_t lb, sext, rext; if (MPI_IN_PLACE == sbuf) { return mca_coll_base_alltoall_intra_basic_inplace (rbuf, rcount, rdtype, comm, module); } size = ompi_comm_size(comm); rank = ompi_comm_rank(comm); OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "coll:base:alltoall_intra_pairwise rank %d", rank)); err = ompi_datatype_get_extent (sdtype, &lb, &sext); if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; } err = ompi_datatype_get_extent (rdtype, &lb, &rext); if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; } /* Perform pairwise exchange - starting from 1 so the local copy is last */ for (step = 1; step < size + 1; step++) { /* Determine sender and receiver for this step. */ sendto = (rank + step) % size; recvfrom = (rank + size - step) % size; /* Determine sending and receiving locations */ tmpsend = (char*)sbuf + (ptrdiff_t)sendto * sext * (ptrdiff_t)scount; tmprecv = (char*)rbuf + (ptrdiff_t)recvfrom * rext * (ptrdiff_t)rcount; /* send and receive */ err = ompi_coll_base_sendrecv( tmpsend, scount, sdtype, sendto, MCA_COLL_BASE_TAG_ALLTOALL, tmprecv, rcount, rdtype, recvfrom, MCA_COLL_BASE_TAG_ALLTOALL, comm, MPI_STATUS_IGNORE, rank); if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; } } return MPI_SUCCESS; err_hndl: OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "%s:%4d\tError occurred %d, rank %2d", __FILE__, line, err, rank)); (void)line; // silence compiler warning return err; }
int ompi_coll_tuned_alltoall_intra_two_procs(void *sbuf, int scount, struct ompi_datatype_t *sdtype, void* rbuf, int rcount, struct ompi_datatype_t *rdtype, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { int line = -1, err = 0, rank, remote; void * tmpsend, *tmprecv; ptrdiff_t sext, rext, lb; if (MPI_IN_PLACE == sbuf) { return mca_coll_tuned_alltoall_intra_basic_inplace (rbuf, rcount, rdtype, comm, module); } rank = ompi_comm_rank(comm); OPAL_OUTPUT((ompi_coll_tuned_stream, "ompi_coll_tuned_alltoall_intra_two_procs rank %d", rank)); err = ompi_datatype_get_extent (sdtype, &lb, &sext); if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; } err = ompi_datatype_get_extent (rdtype, &lb, &rext); if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; } /* exchange data */ remote = rank ^ 1; tmpsend = (char*)sbuf + (ptrdiff_t)remote * sext * (ptrdiff_t)scount; tmprecv = (char*)rbuf + (ptrdiff_t)remote * rext * (ptrdiff_t)rcount; /* send and receive */ err = ompi_coll_tuned_sendrecv ( tmpsend, scount, sdtype, remote, MCA_COLL_BASE_TAG_ALLTOALL, tmprecv, rcount, rdtype, remote, MCA_COLL_BASE_TAG_ALLTOALL, comm, MPI_STATUS_IGNORE, rank ); if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; } /* ddt sendrecv your own data */ err = ompi_datatype_sndrcv((char*) sbuf + (ptrdiff_t)rank * sext * (ptrdiff_t)scount, (int32_t) scount, sdtype, (char*) rbuf + (ptrdiff_t)rank * rext * (ptrdiff_t)rcount, (int32_t) rcount, rdtype); if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; } /* done */ return MPI_SUCCESS; err_hndl: OPAL_OUTPUT((ompi_coll_tuned_stream, "%s:%4d\tError occurred %d, rank %2d", __FILE__, line, err, rank)); return err; }
static int mca_coll_basic_neighbor_alltoallv_graph(const void *sbuf, const int scounts[], const int sdisps[], struct ompi_datatype_t *sdtype, void *rbuf, const int rcounts[], const int rdisps[], struct ompi_datatype_t *rdtype, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { mca_coll_basic_module_t *basic_module = (mca_coll_basic_module_t *) module; const mca_topo_base_comm_graph_2_2_0_t *graph = comm->c_topo->mtc.graph; int rc = MPI_SUCCESS, neighbor, degree; const int rank = ompi_comm_rank (comm); ptrdiff_t lb, rdextent, sdextent; ompi_request_t **reqs; const int *edges; mca_topo_base_graph_neighbors_count (comm, rank, °ree); /* ensure we have enough storage for requests */ rc = mca_coll_basic_check_for_requests (basic_module, 2 * degree); if (OMPI_SUCCESS != rc) { return rc; } edges = graph->edges; if (rank > 0) { edges += graph->index[rank - 1]; } ompi_datatype_get_extent(rdtype, &lb, &rdextent); ompi_datatype_get_extent(sdtype, &lb, &sdextent); /* post all receives first */ for (neighbor = 0, reqs = basic_module->mccb_reqs ; neighbor < degree ; ++neighbor) { rc = MCA_PML_CALL(irecv((char *) rbuf + rdisps[neighbor] * rdextent, rcounts[neighbor], rdtype, edges[neighbor], MCA_COLL_BASE_TAG_ALLTOALL, comm, reqs++)); if (OMPI_SUCCESS != rc) break; } if (OMPI_SUCCESS != rc) { /* should probably try to clean up here */ return rc; } for (neighbor = 0 ; neighbor < degree ; ++neighbor) { /* remove cast from const when the pml layer is updated to take a const for the send buffer */ rc = MCA_PML_CALL(isend((char *) sbuf + sdisps[neighbor] * sdextent, scounts[neighbor], sdtype, edges[neighbor], MCA_COLL_BASE_TAG_ALLTOALL, MCA_PML_BASE_SEND_STANDARD, comm, reqs++)); if (OMPI_SUCCESS != rc) break; } if (OMPI_SUCCESS != rc) { /* should probably try to clean up here */ return rc; } return ompi_request_wait_all (degree * 2, basic_module->mccb_reqs, MPI_STATUSES_IGNORE); }
static int mca_coll_basic_neighbor_alltoallv_graph(const void *sbuf, const int scounts[], const int sdisps[], struct ompi_datatype_t *sdtype, void *rbuf, const int rcounts[], const int rdisps[], struct ompi_datatype_t *rdtype, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { const mca_topo_base_comm_graph_2_2_0_t *graph = comm->c_topo->mtc.graph; int rc = MPI_SUCCESS, neighbor, degree; const int rank = ompi_comm_rank (comm); ptrdiff_t lb, rdextent, sdextent; ompi_request_t **reqs, **preqs; const int *edges; mca_topo_base_graph_neighbors_count (comm, rank, °ree); if( 0 == degree ) return OMPI_SUCCESS; edges = graph->edges; if (rank > 0) { edges += graph->index[rank - 1]; } ompi_datatype_get_extent(rdtype, &lb, &rdextent); ompi_datatype_get_extent(sdtype, &lb, &sdextent); reqs = preqs = ompi_coll_base_comm_get_reqs( module->base_data, 2 * degree ); if( NULL == reqs ) { return OMPI_ERR_OUT_OF_RESOURCE; } /* post all receives first */ for (neighbor = 0; neighbor < degree ; ++neighbor) { rc = MCA_PML_CALL(irecv((char *) rbuf + rdisps[neighbor] * rdextent, rcounts[neighbor], rdtype, edges[neighbor], MCA_COLL_BASE_TAG_ALLTOALL, comm, preqs++)); if (OMPI_SUCCESS != rc) break; } if (OMPI_SUCCESS != rc) { ompi_coll_base_free_reqs( reqs, neighbor + 1); return rc; } for (neighbor = 0 ; neighbor < degree ; ++neighbor) { /* remove cast from const when the pml layer is updated to take a const for the send buffer */ rc = MCA_PML_CALL(isend((char *) sbuf + sdisps[neighbor] * sdextent, scounts[neighbor], sdtype, edges[neighbor], MCA_COLL_BASE_TAG_ALLTOALL, MCA_PML_BASE_SEND_STANDARD, comm, preqs++)); if (OMPI_SUCCESS != rc) break; } if (OMPI_SUCCESS != rc) { ompi_coll_base_free_reqs( reqs, degree + neighbor + 1); return rc; } rc = ompi_request_wait_all (degree * 2, reqs, MPI_STATUSES_IGNORE); if (OMPI_SUCCESS != rc) { ompi_coll_base_free_reqs( reqs, degree * 2); } return rc; }
static int mca_coll_basic_neighbor_alltoallv_dist_graph(const void *sbuf, const int scounts[], const int sdisps[], struct ompi_datatype_t *sdtype, void *rbuf, const int rcounts[], const int rdisps[], struct ompi_datatype_t *rdtype, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { mca_coll_basic_module_t *basic_module = (mca_coll_basic_module_t *) module; const mca_topo_base_comm_dist_graph_2_1_0_t *dist_graph = comm->c_topo->mtc.dist_graph; ptrdiff_t lb, rdextent, sdextent; int rc = MPI_SUCCESS, neighbor; const int *inedges, *outedges; int indegree, outdegree; ompi_request_t **reqs; indegree = dist_graph->indegree; outdegree = dist_graph->outdegree; inedges = dist_graph->in; outedges = dist_graph->out; ompi_datatype_get_extent(rdtype, &lb, &rdextent); ompi_datatype_get_extent(sdtype, &lb, &sdextent); /* post all receives first */ for (neighbor = 0, reqs = basic_module->mccb_reqs ; neighbor < indegree ; ++neighbor) { rc = MCA_PML_CALL(irecv((char *) rbuf + rdisps[neighbor] * rdextent, rcounts[neighbor], rdtype, inedges[neighbor], MCA_COLL_BASE_TAG_ALLTOALL, comm, reqs++)); if (OMPI_SUCCESS != rc) break; } if (OMPI_SUCCESS != rc) { /* should probably try to clean up here */ return rc; } for (neighbor = 0 ; neighbor < outdegree ; ++neighbor) { /* remove cast from const when the pml layer is updated to take a const for the send buffer */ rc = MCA_PML_CALL(isend((char *) sbuf + sdisps[neighbor] * sdextent, scounts[neighbor], sdtype, outedges[neighbor], MCA_COLL_BASE_TAG_ALLTOALL, MCA_PML_BASE_SEND_STANDARD, comm, reqs++)); if (OMPI_SUCCESS != rc) break; } if (OMPI_SUCCESS != rc) { /* should probably try to clean up here */ return rc; } return ompi_request_wait_all (indegree + outdegree, basic_module->mccb_reqs, MPI_STATUSES_IGNORE); }
/* * allgather_intra * * Function: - allgather using other MPI collections * Accepts: - same as MPI_Allgather() * Returns: - MPI_SUCCESS or error code */ int mca_coll_basic_allgather_intra(void *sbuf, int scount, struct ompi_datatype_t *sdtype, void *rbuf, int rcount, struct ompi_datatype_t *rdtype, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { int err; ptrdiff_t lb, extent; /* Handle MPI_IN_PLACE (see explanantion in reduce.c for how to allocate temp buffer) -- note that rank 0 can use IN_PLACE natively, and we can just alias the right position in rbuf as sbuf and avoid using a temporary buffer if gather is implemented correctly */ if (MPI_IN_PLACE == sbuf && 0 != ompi_comm_rank(comm)) { ompi_datatype_get_extent(rdtype, &lb, &extent); sbuf = ((char*) rbuf) + (ompi_comm_rank(comm) * extent * rcount); sdtype = rdtype; scount = rcount; } /* Gather and broadcast. */ err = comm->c_coll.coll_gather(sbuf, scount, sdtype, rbuf, rcount, rdtype, 0, comm, comm->c_coll.coll_gather_module); if (MPI_SUCCESS == err) { err = comm->c_coll.coll_bcast(rbuf, rcount * ompi_comm_size(comm), rdtype, 0, comm, comm->c_coll.coll_bcast_module); } /* All done */ return err; }
/* * allgather_sched_linear * * Description: an implementation of Iallgather using linear algorithm * * Time: O(comm_size) * Schedule length (rounds): O(comm_size) */ static inline int allgather_sched_linear( int rank, int comm_size, NBC_Schedule *schedule, const void *sendbuf, int scount, struct ompi_datatype_t *sdtype, void *recvbuf, int rcount, struct ompi_datatype_t *rdtype) { int res = OMPI_SUCCESS; ptrdiff_t rlb, rext; res = ompi_datatype_get_extent(rdtype, &rlb, &rext); char *sbuf = (char *)recvbuf + rank * rcount * rext; for (int remote = 0; remote < comm_size ; ++remote) { if (remote != rank) { /* Recv from rank remote */ char *rbuf = (char *)recvbuf + remote * rcount * rext; res = NBC_Sched_recv(rbuf, false, rcount, rdtype, remote, schedule, false); if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { goto cleanup_and_return; } /* Send to rank remote - not from the sendbuf to optimize MPI_IN_PLACE */ res = NBC_Sched_send(sbuf, false, rcount, rdtype, remote, schedule, false); if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { goto cleanup_and_return; } } } cleanup_and_return: return res; }
static int mca_coll_basic_neighbor_allgather_dist_graph(const void *sbuf, int scount, struct ompi_datatype_t *sdtype, void *rbuf, int rcount, struct ompi_datatype_t *rdtype, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { const mca_topo_base_comm_dist_graph_2_2_0_t *dist_graph = comm->c_topo->mtc.dist_graph; const int *inedges, *outedges; int indegree, outdegree; ompi_request_t **reqs, **preqs; ptrdiff_t lb, extent; int rc = MPI_SUCCESS, neighbor; indegree = dist_graph->indegree; outdegree = dist_graph->outdegree; if( 0 == (indegree + outdegree) ) return OMPI_SUCCESS; inedges = dist_graph->in; outedges = dist_graph->out; ompi_datatype_get_extent(rdtype, &lb, &extent); reqs = preqs = coll_base_comm_get_reqs( module->base_data, indegree + outdegree); if( NULL == reqs ) { return OMPI_ERR_OUT_OF_RESOURCE; } for (neighbor = 0; neighbor < indegree ; ++neighbor) { rc = MCA_PML_CALL(irecv(rbuf, rcount, rdtype, inedges[neighbor], MCA_COLL_BASE_TAG_ALLGATHER, comm, preqs++)); if (OMPI_SUCCESS != rc) break; rbuf = (char *) rbuf + extent * rcount; } if (OMPI_SUCCESS != rc) { ompi_coll_base_free_reqs(reqs, neighbor + 1); return rc; } for (neighbor = 0 ; neighbor < outdegree ; ++neighbor) { /* remove cast from const when the pml layer is updated to take * a const for the send buffer. */ rc = MCA_PML_CALL(isend((void *) sbuf, scount, sdtype, outedges[neighbor], MCA_COLL_BASE_TAG_ALLGATHER, MCA_PML_BASE_SEND_STANDARD, comm, preqs++)); if (OMPI_SUCCESS != rc) break; } if (OMPI_SUCCESS != rc) { ompi_coll_base_free_reqs(reqs, indegree + neighbor + 1); return rc; } rc = ompi_request_wait_all (indegree + outdegree, reqs, MPI_STATUSES_IGNORE); if (OMPI_SUCCESS != rc) { ompi_coll_base_free_reqs(reqs, indegree + outdegree); } return rc; }
/* * reduce_inter * * Function: - reduction using the local_comm * Accepts: - same as MPI_Reduce() * Returns: - MPI_SUCCESS or error code */ int mca_coll_inter_reduce_inter(void *sbuf, void *rbuf, int count, struct ompi_datatype_t *dtype, struct ompi_op_t *op, int root, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { int rank, err; ptrdiff_t true_lb, true_extent, lb, extent; char *free_buffer = NULL; char *pml_buffer = NULL; /* Initialize */ rank = ompi_comm_rank(comm); if (MPI_PROC_NULL == root) { /* do nothing */ err = OMPI_SUCCESS; } else if (MPI_ROOT != root) { /* Perform the reduce locally with the first process as root */ ompi_datatype_get_extent(dtype, &lb, &extent); ompi_datatype_get_true_extent(dtype, &true_lb, &true_extent); free_buffer = (char*)malloc(true_extent + (count - 1) * extent); if (NULL == free_buffer) { return OMPI_ERR_OUT_OF_RESOURCE; } pml_buffer = free_buffer - true_lb; err = comm->c_local_comm->c_coll.coll_reduce(sbuf, pml_buffer, count, dtype, op, 0, comm->c_local_comm, comm->c_local_comm->c_coll.coll_reduce_module); if (0 == rank) { /* First process sends the result to the root */ err = MCA_PML_CALL(send(pml_buffer, count, dtype, root, MCA_COLL_BASE_TAG_REDUCE, MCA_PML_BASE_SEND_STANDARD, comm)); if (OMPI_SUCCESS != err) { return err; } } if (NULL != free_buffer) { free(free_buffer); } } else { /* Root receives the reduced message from the first process */ err = MCA_PML_CALL(recv(rbuf, count, dtype, 0, MCA_COLL_BASE_TAG_REDUCE, comm, MPI_STATUS_IGNORE)); if (OMPI_SUCCESS != err) { return err; } } /* All done */ return err; }
/* * gatherv_inter * * Function: - basic gatherv operation * Accepts: - same arguments as MPI_Gatherv() * Returns: - MPI_SUCCESS or error code */ int mca_coll_basic_gatherv_inter(const void *sbuf, int scount, struct ompi_datatype_t *sdtype, void *rbuf, const int *rcounts, const int *disps, struct ompi_datatype_t *rdtype, int root, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { int i, size, err; char *ptmp; ptrdiff_t lb, extent; ompi_request_t **reqs = NULL; size = ompi_comm_remote_size(comm); /* If not root, receive data. Note that we will only get here if * scount > 0 or rank == root. */ if (MPI_PROC_NULL == root) { /* do nothing */ err = OMPI_SUCCESS; } else if (MPI_ROOT != root) { /* Everyone but root sends data and returns. */ err = MCA_PML_CALL(send(sbuf, scount, sdtype, root, MCA_COLL_BASE_TAG_GATHERV, MCA_PML_BASE_SEND_STANDARD, comm)); } else { /* I am the root, loop receiving data. */ err = ompi_datatype_get_extent(rdtype, &lb, &extent); if (OMPI_SUCCESS != err) { return OMPI_ERROR; } reqs = coll_base_comm_get_reqs(module->base_data, size); if( NULL == reqs ) { return OMPI_ERR_OUT_OF_RESOURCE; } for (i = 0; i < size; ++i) { ptmp = ((char *) rbuf) + (extent * disps[i]); err = MCA_PML_CALL(irecv(ptmp, rcounts[i], rdtype, i, MCA_COLL_BASE_TAG_GATHERV, comm, &reqs[i])); if (OMPI_SUCCESS != err) { ompi_coll_base_free_reqs(reqs, i + 1); return err; } } err = ompi_request_wait_all(size, reqs, MPI_STATUSES_IGNORE); if (OMPI_SUCCESS != err) { ompi_coll_base_free_reqs(reqs, size); } } /* All done */ return err; }
/* * scatterv_inter * * Function: - scatterv operation * Accepts: - same arguments as MPI_Scatterv() * Returns: - MPI_SUCCESS or error code */ int mca_coll_basic_scatterv_inter(const void *sbuf, const int *scounts, const int *disps, struct ompi_datatype_t *sdtype, void *rbuf, int rcount, struct ompi_datatype_t *rdtype, int root, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { int i, size, err; char *ptmp; ptrdiff_t lb, extent; ompi_request_t **reqs; /* Initialize */ size = ompi_comm_remote_size(comm); /* If not root, receive data. Note that we will only get here if * rcount > 0 or rank == root. */ if (MPI_PROC_NULL == root) { /* do nothing */ err = OMPI_SUCCESS; } else if (MPI_ROOT != root) { /* If not root, receive data. */ err = MCA_PML_CALL(recv(rbuf, rcount, rdtype, root, MCA_COLL_BASE_TAG_SCATTERV, comm, MPI_STATUS_IGNORE)); } else { /* I am the root, loop sending data. */ err = ompi_datatype_get_extent(sdtype, &lb, &extent); if (OMPI_SUCCESS != err) { return OMPI_ERROR; } reqs = coll_base_comm_get_reqs(module->base_data, size); for (i = 0; i < size; ++i) { ptmp = ((char *) sbuf) + (extent * disps[i]); err = MCA_PML_CALL(isend(ptmp, scounts[i], sdtype, i, MCA_COLL_BASE_TAG_SCATTERV, MCA_PML_BASE_SEND_STANDARD, comm, &(reqs[i]))); if (OMPI_SUCCESS != err) { ompi_coll_base_free_reqs(reqs, i); return err; } } err = ompi_request_wait_all(size, reqs, MPI_STATUSES_IGNORE); if (OMPI_SUCCESS != err) { ompi_coll_base_free_reqs(reqs, size); } } /* All done */ return err; }
/** * Computing the correct buffer length for moving a multiple of a datatype * is not an easy task. Define a function to centralize the complexity in a * single location. */ size_t compute_buffer_length(ompi_datatype_t* pdt, int count) { MPI_Aint extent, lb, true_extent, true_lb; size_t length; ompi_datatype_get_extent(pdt, &lb, &extent); ompi_datatype_get_true_extent(pdt, &true_lb, &true_extent); (void)true_lb; length = true_lb + true_extent + (count - 1) * extent; return length; }
/* * gather_intra * * Function: - basic gather operation * Accepts: - same arguments as MPI_Gather() * Returns: - MPI_SUCCESS or error code */ int ompi_coll_base_gather_intra_basic_linear(const void *sbuf, int scount, struct ompi_datatype_t *sdtype, void *rbuf, int rcount, struct ompi_datatype_t *rdtype, int root, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { int i, err, rank, size; char *ptmp; MPI_Aint incr, extent, lb; size = ompi_comm_size(comm); rank = ompi_comm_rank(comm); /* Everyone but root sends data and returns. */ OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "ompi_coll_base_gather_intra_basic_linear rank %d", rank)); if (rank != root) { return MCA_PML_CALL(send(sbuf, scount, sdtype, root, MCA_COLL_BASE_TAG_GATHER, MCA_PML_BASE_SEND_STANDARD, comm)); } /* I am the root, loop receiving the data. */ ompi_datatype_get_extent(rdtype, &lb, &extent); incr = extent * (ptrdiff_t)rcount; for (i = 0, ptmp = (char *) rbuf; i < size; ++i, ptmp += incr) { if (i == rank) { if (MPI_IN_PLACE != sbuf) { err = ompi_datatype_sndrcv((void *)sbuf, scount, sdtype, ptmp, rcount, rdtype); } else { err = MPI_SUCCESS; } } else { err = MCA_PML_CALL(recv(ptmp, rcount, rdtype, i, MCA_COLL_BASE_TAG_GATHER, comm, MPI_STATUS_IGNORE)); } if (MPI_SUCCESS != err) { return err; } } /* All done */ return MPI_SUCCESS; }
static int mca_coll_basic_neighbor_allgather_graph(const void *sbuf, int scount, struct ompi_datatype_t *sdtype, void *rbuf, int rcount, struct ompi_datatype_t *rdtype, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { const mca_topo_base_comm_graph_2_2_0_t *graph = comm->c_topo->mtc.graph; const int rank = ompi_comm_rank (comm); const int *edges; int degree; ompi_request_t **reqs, **preqs; ptrdiff_t lb, extent; int rc = MPI_SUCCESS, neighbor; mca_topo_base_graph_neighbors_count (comm, rank, °ree); edges = graph->edges; if (rank > 0) { edges += graph->index[rank - 1]; } ompi_datatype_get_extent(rdtype, &lb, &extent); reqs = preqs = coll_base_comm_get_reqs( module->base_data, 2 * degree); if( NULL == reqs ) { return OMPI_ERR_OUT_OF_RESOURCE; } for (neighbor = 0; neighbor < degree ; ++neighbor) { rc = MCA_PML_CALL(irecv(rbuf, rcount, rdtype, edges[neighbor], MCA_COLL_BASE_TAG_ALLGATHER, comm, preqs++)); if (OMPI_SUCCESS != rc) break; rbuf = (char *) rbuf + extent * rcount; /* remove cast from const when the pml layer is updated to take * a const for the send buffer. */ rc = MCA_PML_CALL(isend((void *) sbuf, scount, sdtype, edges[neighbor], MCA_COLL_BASE_TAG_ALLGATHER, MCA_PML_BASE_SEND_STANDARD, comm, preqs++)); if (OMPI_SUCCESS != rc) break; } if (OMPI_SUCCESS != rc) { ompi_coll_base_free_reqs( reqs, (2 * neighbor + 1)); return rc; } rc = ompi_request_wait_all (degree * 2, reqs, MPI_STATUSES_IGNORE); if (OMPI_SUCCESS != rc) { ompi_coll_base_free_reqs( reqs, degree * 2); } return rc; }
/* * reduce_log_inter * * Function: - reduction using O(N) algorithm * Accepts: - same as MPI_Reduce() * Returns: - MPI_SUCCESS or error code */ int mca_coll_cuda_reduce(const void *sbuf, void *rbuf, int count, struct ompi_datatype_t *dtype, struct ompi_op_t *op, int root, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { mca_coll_cuda_module_t *s = (mca_coll_cuda_module_t*) module; ptrdiff_t true_lb, true_extent, lb, extent; char *rbuf1 = NULL, *sbuf1 = NULL, *rbuf2 = NULL; const char *sbuf2; size_t bufsize; int rc; ompi_datatype_get_extent(dtype, &lb, &extent); ompi_datatype_get_true_extent(dtype, &true_lb, &true_extent); bufsize = true_extent + (ptrdiff_t)(count - 1) * extent; if ((MPI_IN_PLACE != sbuf) && (opal_cuda_check_bufs((char *)sbuf, NULL))) { sbuf1 = (char*)malloc(bufsize); if (NULL == sbuf1) { return OMPI_ERR_OUT_OF_RESOURCE; } opal_cuda_memcpy_sync(sbuf1, sbuf, bufsize); sbuf2 = sbuf; /* save away original buffer */ sbuf = sbuf1 - lb; } if (opal_cuda_check_bufs(rbuf, NULL)) { rbuf1 = (char*)malloc(bufsize); if (NULL == rbuf1) { if (NULL != sbuf1) free(sbuf1); return OMPI_ERR_OUT_OF_RESOURCE; } opal_cuda_memcpy_sync(rbuf1, rbuf, bufsize); rbuf2 = rbuf; /* save away original buffer */ rbuf = rbuf1 - lb; } rc = s->c_coll.coll_reduce((void *) sbuf, rbuf, count, dtype, op, root, comm, s->c_coll.coll_reduce_module); if (NULL != sbuf1) { free(sbuf1); } if (NULL != rbuf1) { rbuf = rbuf2; opal_cuda_memcpy_sync(rbuf, rbuf1, bufsize); free(rbuf1); } return rc; }
/* * scatter_inter * * Function: - scatter operation * Accepts: - same arguments as MPI_Scatter() * Returns: - MPI_SUCCESS or error code */ int mca_coll_basic_scatter_inter(void *sbuf, int scount, struct ompi_datatype_t *sdtype, void *rbuf, int rcount, struct ompi_datatype_t *rdtype, int root, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { int i, size, err; char *ptmp; ptrdiff_t lb, incr; mca_coll_basic_module_t *basic_module = (mca_coll_basic_module_t*) module; ompi_request_t **reqs = basic_module->mccb_reqs; /* Initialize */ size = ompi_comm_remote_size(comm); if (MPI_PROC_NULL == root) { /* do nothing */ err = OMPI_SUCCESS; } else if (MPI_ROOT != root) { /* If not root, receive data. */ err = MCA_PML_CALL(recv(rbuf, rcount, rdtype, root, MCA_COLL_BASE_TAG_SCATTER, comm, MPI_STATUS_IGNORE)); } else { /* I am the root, loop sending data. */ err = ompi_datatype_get_extent(sdtype, &lb, &incr); if (OMPI_SUCCESS != err) { return OMPI_ERROR; } incr *= scount; for (i = 0, ptmp = (char *) sbuf; i < size; ++i, ptmp += incr) { err = MCA_PML_CALL(isend(ptmp, scount, sdtype, i, MCA_COLL_BASE_TAG_SCATTER, MCA_PML_BASE_SEND_STANDARD, comm, reqs++)); if (OMPI_SUCCESS != err) { return err; } } err = ompi_request_wait_all(size, basic_module->mccb_reqs, MPI_STATUSES_IGNORE); } return err; }
/* * allgatherv_intra * * Function: - allgather * Accepts: - same as MPI_Allgatherv() * Returns: - MPI_SUCCESS or error code */ int mca_coll_self_allgatherv_intra(void *sbuf, int scount, struct ompi_datatype_t *sdtype, void * rbuf, int *rcounts, int *disps, struct ompi_datatype_t *rdtype, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { if (MPI_IN_PLACE == sbuf) { return MPI_SUCCESS; } else { int err; ptrdiff_t lb, extent; err = ompi_datatype_get_extent(rdtype, &lb, &extent); if (OMPI_SUCCESS != err) { return OMPI_ERROR; } return ompi_datatype_sndrcv(sbuf, scount, sdtype, ((char *) rbuf) + disps[0] * extent, rcounts[0], rdtype); } }
/* * allgather_sched_recursivedoubling * * Description: an implementation of Iallgather using recursive doubling algorithm * Limitation: power-of-two number of processes only * Time: O(log(comm_size)) * Schedule length (rounds): O(log(comm_size)) * Memory: no additional memory requirements beyond user-supplied buffers. * * Example on 4 nodes: * Initialization: everyone has its own buffer at location rank in rbuf * # 0 1 2 3 * [0] [ ] [ ] [ ] * [ ] [1] [ ] [ ] * [ ] [ ] [2] [ ] * [ ] [ ] [ ] [3] * Step 0: exchange data with (rank ^ 2^0) * # 0 1 2 3 * [0] [0] [ ] [ ] * [1] [1] [ ] [ ] * [ ] [ ] [2] [2] * [ ] [ ] [3] [3] * Step 1: exchange data with (rank ^ 2^1) (if you can) * # 0 1 2 3 * [0] [0] [0] [0] * [1] [1] [1] [1] * [2] [2] [2] [2] * [3] [3] [3] [3] * */ static inline int allgather_sched_recursivedoubling( int rank, int comm_size, NBC_Schedule *schedule, const void *sbuf, int scount, struct ompi_datatype_t *sdtype, void *rbuf, int rcount, struct ompi_datatype_t *rdtype) { int res = OMPI_SUCCESS; ptrdiff_t rlb, rext; char *tmpsend = NULL, *tmprecv = NULL; res = ompi_datatype_get_extent(rdtype, &rlb, &rext); if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { goto cleanup_and_return; } int sendblocklocation = rank; for (int distance = 1; distance < comm_size; distance <<= 1) { int remote = rank ^ distance; tmpsend = (char *)rbuf + (ptrdiff_t)sendblocklocation * (ptrdiff_t)rcount * rext; if (rank < remote) { tmprecv = (char *)rbuf + (ptrdiff_t)(sendblocklocation + distance) * (ptrdiff_t)rcount * rext; } else { tmprecv = (char *)rbuf + (ptrdiff_t)(sendblocklocation - distance) * (ptrdiff_t)rcount * rext; sendblocklocation -= distance; } res = NBC_Sched_send(tmpsend, false, (ptrdiff_t)distance * (ptrdiff_t)rcount, rdtype, remote, schedule, false); if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { goto cleanup_and_return; } res = NBC_Sched_recv(tmprecv, false, (ptrdiff_t)distance * (ptrdiff_t)rcount, rdtype, remote, schedule, true); if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { goto cleanup_and_return; } } cleanup_and_return: return res; }
int ompi_coll_libnbc_iallreduce(void* sendbuf, void* recvbuf, int count, MPI_Datatype datatype, MPI_Op op, struct ompi_communicator_t *comm, ompi_request_t ** request, struct mca_coll_base_module_2_1_0_t *module) { int rank, p, res; OPAL_PTRDIFF_TYPE ext, lb; NBC_Schedule *schedule; size_t size; #ifdef NBC_CACHE_SCHEDULE NBC_Allreduce_args *args, *found, search; #endif enum { NBC_ARED_BINOMIAL, NBC_ARED_RING } alg; char inplace; NBC_Handle *handle; ompi_coll_libnbc_module_t *libnbc_module = (ompi_coll_libnbc_module_t*) module; NBC_IN_PLACE(sendbuf, recvbuf, inplace); rank = ompi_comm_rank (comm); p = ompi_comm_size (comm); res = ompi_datatype_get_extent(datatype, &lb, &ext); if (OMPI_SUCCESS != res) { NBC_Error ("MPI Error in MPI_Type_extent() (%i)", res); return res; } res = ompi_datatype_type_size (datatype, &size); if (OMPI_SUCCESS != res) { NBC_Error ("MPI Error in MPI_Type_size() (%i)", res); return res; } res = NBC_Init_handle (comm, &handle, libnbc_module); if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { return res; } handle->tmpbuf = malloc (ext * count); if (OPAL_UNLIKELY(NULL == handle->tmpbuf)) { NBC_Return_handle (handle); return OMPI_ERR_OUT_OF_RESOURCE; } if ((p == 1) && !inplace) { /* for a single node - copy data to receivebuf */ res = NBC_Copy(sendbuf, count, datatype, recvbuf, count, datatype, comm); if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { NBC_Return_handle (handle); return res; } } /* algorithm selection */ if(p < 4 || size*count < 65536 || inplace) { alg = NBC_ARED_BINOMIAL; } else { alg = NBC_ARED_RING; } #ifdef NBC_CACHE_SCHEDULE /* search schedule in communicator specific tree */ search.sendbuf = sendbuf; search.recvbuf = recvbuf; search.count = count; search.datatype = datatype; search.op = op; found = (NBC_Allreduce_args *) hb_tree_search ((hb_tree *) libnbc_module->NBC_Dict[NBC_ALLREDUCE], &search); if (NULL == found) { #endif schedule = OBJ_NEW(NBC_Schedule); if (NULL == schedule) { NBC_Return_handle (handle); return OMPI_ERR_OUT_OF_RESOURCE; } /* ensure the schedule is released with the handle on error */ handle->schedule = schedule; switch(alg) { case NBC_ARED_BINOMIAL: res = allred_sched_diss(rank, p, count, datatype, sendbuf, recvbuf, op, schedule, handle); break; case NBC_ARED_RING: res = allred_sched_ring(rank, p, count, datatype, sendbuf, recvbuf, op, size, ext, schedule, handle); break; } if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { NBC_Return_handle (handle); return res; } res = NBC_Sched_commit(schedule); if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { NBC_Return_handle (handle); return res; } #ifdef NBC_CACHE_SCHEDULE /* save schedule to tree */ args = (NBC_Allreduce_args *) malloc (sizeof(args)); if (NULL != args) { args->sendbuf = sendbuf; args->recvbuf = recvbuf; args->count = count; args->datatype = datatype; args->op = op; args->schedule = schedule; res = hb_tree_insert ((hb_tree *) libnbc_module->NBC_Dict[NBC_ALLREDUCE], args, args, 0); if (0 == res) { OBJ_RETAIN(schedule); /* increase number of elements for A2A */ if (++libnbc_module->NBC_Dict_size[NBC_ALLREDUCE] > NBC_SCHED_DICT_UPPER) { NBC_SchedCache_dictwipe ((hb_tree *) libnbc_module->NBC_Dict[NBC_ALLREDUCE], &libnbc_module->NBC_Dict_size[NBC_ALLREDUCE]); } } else { NBC_Error("error in dict_insert() (%i)", res); free (args); } } } else { /* found schedule */ schedule = found->schedule; OBJ_RETAIN(schedule); } #endif res = NBC_Start (handle, schedule); if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { NBC_Return_handle (handle); return res; } *request = (ompi_request_t *) handle; /* tmpbuf is freed with the handle */ return OMPI_SUCCESS; }
/* * reduce_scatter * * Function: - reduce then scatter * Accepts: - same as MPI_Reduce_scatter() * Returns: - MPI_SUCCESS or error code * * Algorithm: * Cummutative, reasonable sized messages * recursive halving algorithm * Others: * reduce and scatterv (needs to be cleaned * up at some point) * * NOTE: that the recursive halving algorithm should be faster than * the reduce/scatter for all message sizes. However, the memory * usage for the recusive halving is msg_size + 2 * comm_size greater * for the recursive halving, so I've limited where the recursive * halving is used to be nice to the app memory wise. There are much * better algorithms for large messages with cummutative operations, * so this should be investigated further. * * NOTE: We default to a simple reduce/scatterv if one of the rcounts * is zero. This is because the existing algorithms do not currently * support a count of zero in the array. */ int mca_coll_basic_reduce_scatter_intra(void *sbuf, void *rbuf, int *rcounts, struct ompi_datatype_t *dtype, struct ompi_op_t *op, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { int i, rank, size, count, err = OMPI_SUCCESS; ptrdiff_t true_lb, true_extent, lb, extent, buf_size; int *disps = NULL; char *recv_buf = NULL, *recv_buf_free = NULL; char *result_buf = NULL, *result_buf_free = NULL; bool zerocounts = false; /* Initialize */ rank = ompi_comm_rank(comm); size = ompi_comm_size(comm); /* Find displacements and the like */ disps = (int*) malloc(sizeof(int) * size); if (NULL == disps) return OMPI_ERR_OUT_OF_RESOURCE; disps[0] = 0; for (i = 0; i < (size - 1); ++i) { disps[i + 1] = disps[i] + rcounts[i]; if (0 == rcounts[i]) { zerocounts = true; } } count = disps[size - 1] + rcounts[size - 1]; if (0 == rcounts[size - 1]) { zerocounts = true; } /* short cut the trivial case */ if (0 == count) { free(disps); return OMPI_SUCCESS; } /* get datatype information */ ompi_datatype_get_extent(dtype, &lb, &extent); ompi_datatype_get_true_extent(dtype, &true_lb, &true_extent); buf_size = true_extent + (count - 1) * extent; /* Handle MPI_IN_PLACE */ if (MPI_IN_PLACE == sbuf) { sbuf = rbuf; } if ((op->o_flags & OMPI_OP_FLAGS_COMMUTE) && (buf_size < COMMUTATIVE_LONG_MSG) && (!zerocounts)) { int tmp_size, remain = 0, tmp_rank; /* temporary receive buffer. See coll_basic_reduce.c for details on sizing */ recv_buf_free = (char*) malloc(buf_size); recv_buf = recv_buf_free - lb; if (NULL == recv_buf_free) { err = OMPI_ERR_OUT_OF_RESOURCE; goto cleanup; } /* allocate temporary buffer for results */ result_buf_free = (char*) malloc(buf_size); result_buf = result_buf_free - lb; /* copy local buffer into the temporary results */ err = ompi_datatype_sndrcv(sbuf, count, dtype, result_buf, count, dtype); if (OMPI_SUCCESS != err) goto cleanup; /* figure out power of two mapping: grow until larger than comm size, then go back one, to get the largest power of two less than comm size */ tmp_size = opal_next_poweroftwo(size); tmp_size >>= 1; remain = size - tmp_size; /* If comm size is not a power of two, have the first "remain" procs with an even rank send to rank + 1, leaving a power of two procs to do the rest of the algorithm */ if (rank < 2 * remain) { if ((rank & 1) == 0) { err = MCA_PML_CALL(send(result_buf, count, dtype, rank + 1, MCA_COLL_BASE_TAG_REDUCE_SCATTER, MCA_PML_BASE_SEND_STANDARD, comm)); if (OMPI_SUCCESS != err) goto cleanup; /* we don't participate from here on out */ tmp_rank = -1; } else { err = MCA_PML_CALL(recv(recv_buf, count, dtype, rank - 1, MCA_COLL_BASE_TAG_REDUCE_SCATTER, comm, MPI_STATUS_IGNORE)); if (OMPI_SUCCESS != err) goto cleanup; /* integrate their results into our temp results */ ompi_op_reduce(op, recv_buf, result_buf, count, dtype); /* adjust rank to be the bottom "remain" ranks */ tmp_rank = rank / 2; } } else { /* just need to adjust rank to show that the bottom "even remain" ranks dropped out */ tmp_rank = rank - remain; } /* For ranks not kicked out by the above code, perform the recursive halving */ if (tmp_rank >= 0) { int *tmp_disps = NULL, *tmp_rcounts = NULL; int mask, send_index, recv_index, last_index; /* recalculate disps and rcounts to account for the special "remainder" processes that are no longer doing anything */ tmp_rcounts = (int*) malloc(tmp_size * sizeof(int)); if (NULL == tmp_rcounts) { err = OMPI_ERR_OUT_OF_RESOURCE; goto cleanup; } tmp_disps = (int*) malloc(tmp_size * sizeof(int)); if (NULL == tmp_disps) { free(tmp_rcounts); err = OMPI_ERR_OUT_OF_RESOURCE; goto cleanup; } for (i = 0 ; i < tmp_size ; ++i) { if (i < remain) { /* need to include old neighbor as well */ tmp_rcounts[i] = rcounts[i * 2 + 1] + rcounts[i * 2]; } else { tmp_rcounts[i] = rcounts[i + remain]; } } tmp_disps[0] = 0; for (i = 0; i < tmp_size - 1; ++i) { tmp_disps[i + 1] = tmp_disps[i] + tmp_rcounts[i]; } /* do the recursive halving communication. Don't use the dimension information on the communicator because I think the information is invalidated by our "shrinking" of the communicator */ mask = tmp_size >> 1; send_index = recv_index = 0; last_index = tmp_size; while (mask > 0) { int tmp_peer, peer, send_count, recv_count; struct ompi_request_t *request; tmp_peer = tmp_rank ^ mask; peer = (tmp_peer < remain) ? tmp_peer * 2 + 1 : tmp_peer + remain; /* figure out if we're sending, receiving, or both */ send_count = recv_count = 0; if (tmp_rank < tmp_peer) { send_index = recv_index + mask; for (i = send_index ; i < last_index ; ++i) { send_count += tmp_rcounts[i]; } for (i = recv_index ; i < send_index ; ++i) { recv_count += tmp_rcounts[i]; } } else { recv_index = send_index + mask; for (i = send_index ; i < recv_index ; ++i) { send_count += tmp_rcounts[i]; } for (i = recv_index ; i < last_index ; ++i) { recv_count += tmp_rcounts[i]; } } /* actual data transfer. Send from result_buf, receive into recv_buf */ if (send_count > 0 && recv_count != 0) { err = MCA_PML_CALL(irecv(recv_buf + tmp_disps[recv_index] * extent, recv_count, dtype, peer, MCA_COLL_BASE_TAG_REDUCE_SCATTER, comm, &request)); if (OMPI_SUCCESS != err) { free(tmp_rcounts); free(tmp_disps); goto cleanup; } } if (recv_count > 0 && send_count != 0) { err = MCA_PML_CALL(send(result_buf + tmp_disps[send_index] * extent, send_count, dtype, peer, MCA_COLL_BASE_TAG_REDUCE_SCATTER, MCA_PML_BASE_SEND_STANDARD, comm)); if (OMPI_SUCCESS != err) { free(tmp_rcounts); free(tmp_disps); goto cleanup; } } if (send_count > 0 && recv_count != 0) { err = ompi_request_wait(&request, MPI_STATUS_IGNORE); if (OMPI_SUCCESS != err) { free(tmp_rcounts); free(tmp_disps); goto cleanup; } } /* if we received something on this step, push it into the results buffer */ if (recv_count > 0) { ompi_op_reduce(op, recv_buf + tmp_disps[recv_index] * extent, result_buf + tmp_disps[recv_index] * extent, recv_count, dtype); } /* update for next iteration */ send_index = recv_index; last_index = recv_index + mask; mask >>= 1; } /* copy local results from results buffer into real receive buffer */ if (0 != rcounts[rank]) { err = ompi_datatype_sndrcv(result_buf + disps[rank] * extent, rcounts[rank], dtype, rbuf, rcounts[rank], dtype); if (OMPI_SUCCESS != err) { free(tmp_rcounts); free(tmp_disps); goto cleanup; } } free(tmp_rcounts); free(tmp_disps); }
/* * ompi_coll_base_allreduce_intra_recursivedoubling * * Function: Recursive doubling algorithm for allreduce operation * Accepts: Same as MPI_Allreduce() * Returns: MPI_SUCCESS or error code * * Description: Implements recursive doubling algorithm for allreduce. * Original (non-segmented) implementation is used in MPICH-2 * for small and intermediate size messages. * The algorithm preserves order of operations so it can * be used both by commutative and non-commutative operations. * * Example on 7 nodes: * Initial state * # 0 1 2 3 4 5 6 * [0] [1] [2] [3] [4] [5] [6] * Initial adjustment step for non-power of two nodes. * old rank 1 3 5 6 * new rank 0 1 2 3 * [0+1] [2+3] [4+5] [6] * Step 1 * old rank 1 3 5 6 * new rank 0 1 2 3 * [0+1+] [0+1+] [4+5+] [4+5+] * [2+3+] [2+3+] [6 ] [6 ] * Step 2 * old rank 1 3 5 6 * new rank 0 1 2 3 * [0+1+] [0+1+] [0+1+] [0+1+] * [2+3+] [2+3+] [2+3+] [2+3+] * [4+5+] [4+5+] [4+5+] [4+5+] * [6 ] [6 ] [6 ] [6 ] * Final adjustment step for non-power of two nodes * # 0 1 2 3 4 5 6 * [0+1+] [0+1+] [0+1+] [0+1+] [0+1+] [0+1+] [0+1+] * [2+3+] [2+3+] [2+3+] [2+3+] [2+3+] [2+3+] [2+3+] * [4+5+] [4+5+] [4+5+] [4+5+] [4+5+] [4+5+] [4+5+] * [6 ] [6 ] [6 ] [6 ] [6 ] [6 ] [6 ] * */ int ompi_coll_base_allreduce_intra_recursivedoubling(const void *sbuf, void *rbuf, int count, struct ompi_datatype_t *dtype, struct ompi_op_t *op, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { int ret, line, rank, size, adjsize, remote, distance; int newrank, newremote, extra_ranks; char *tmpsend = NULL, *tmprecv = NULL, *tmpswap = NULL, *inplacebuf = NULL; ptrdiff_t true_lb, true_extent, lb, extent; ompi_request_t *reqs[2] = {NULL, NULL}; size = ompi_comm_size(comm); rank = ompi_comm_rank(comm); OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "coll:base:allreduce_intra_recursivedoubling rank %d", rank)); /* Special case for size == 1 */ if (1 == size) { if (MPI_IN_PLACE != sbuf) { ret = ompi_datatype_copy_content_same_ddt(dtype, count, (char*)rbuf, (char*)sbuf); if (ret < 0) { line = __LINE__; goto error_hndl; } } return MPI_SUCCESS; } /* Allocate and initialize temporary send buffer */ ret = ompi_datatype_get_extent(dtype, &lb, &extent); if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; } ret = ompi_datatype_get_true_extent(dtype, &true_lb, &true_extent); if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; } inplacebuf = (char*) malloc(true_extent + (ptrdiff_t)(count - 1) * extent); if (NULL == inplacebuf) { ret = -1; line = __LINE__; goto error_hndl; } if (MPI_IN_PLACE == sbuf) { ret = ompi_datatype_copy_content_same_ddt(dtype, count, inplacebuf, (char*)rbuf); if (ret < 0) { line = __LINE__; goto error_hndl; } } else { ret = ompi_datatype_copy_content_same_ddt(dtype, count, inplacebuf, (char*)sbuf); if (ret < 0) { line = __LINE__; goto error_hndl; } } tmpsend = (char*) inplacebuf; tmprecv = (char*) rbuf; /* Determine nearest power of two less than or equal to size */ adjsize = opal_next_poweroftwo (size); adjsize >>= 1; /* Handle non-power-of-two case: - Even ranks less than 2 * extra_ranks send their data to (rank + 1), and sets new rank to -1. - Odd ranks less than 2 * extra_ranks receive data from (rank - 1), apply appropriate operation, and set new rank to rank/2 - Everyone else sets rank to rank - extra_ranks */ extra_ranks = size - adjsize; if (rank < (2 * extra_ranks)) { if (0 == (rank % 2)) { ret = MCA_PML_CALL(send(tmpsend, count, dtype, (rank + 1), MCA_COLL_BASE_TAG_ALLREDUCE, MCA_PML_BASE_SEND_STANDARD, comm)); if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; } newrank = -1; } else { ret = MCA_PML_CALL(recv(tmprecv, count, dtype, (rank - 1), MCA_COLL_BASE_TAG_ALLREDUCE, comm, MPI_STATUS_IGNORE)); if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; } /* tmpsend = tmprecv (op) tmpsend */ ompi_op_reduce(op, tmprecv, tmpsend, count, dtype); newrank = rank >> 1; } } else {
/* * gather_inter * * Function: - basic gather operation * Accepts: - same arguments as MPI_Gather() * Returns: - MPI_SUCCESS or error code */ int mca_coll_inter_gather_inter(void *sbuf, int scount, struct ompi_datatype_t *sdtype, void *rbuf, int rcount, struct ompi_datatype_t *rdtype, int root, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { int err; int rank; int size,size_local; char *ptmp = NULL; MPI_Aint incr; MPI_Aint extent; MPI_Aint lb; size = ompi_comm_remote_size(comm); rank = ompi_comm_rank(comm); if (MPI_PROC_NULL == root) { /* do nothing */ err = OMPI_SUCCESS; } else if (MPI_ROOT != root) { /* Perform the gather locally with the first process as root */ err = ompi_datatype_get_extent(sdtype, &lb, &extent); if (OMPI_SUCCESS != err) { return OMPI_ERROR; } incr = extent * scount; size_local = ompi_comm_size(comm->c_local_comm); ptmp = (char*)malloc(size_local * incr); if (NULL == ptmp) { return OMPI_ERR_OUT_OF_RESOURCE; } err = comm->c_local_comm->c_coll.coll_gather(sbuf, scount, sdtype, ptmp, scount, sdtype, 0, comm->c_local_comm, comm->c_local_comm->c_coll.coll_gather_module); if (0 == rank) { /* First process sends data to the root */ err = MCA_PML_CALL(send(ptmp, scount*size_local, sdtype, root, MCA_COLL_BASE_TAG_GATHER, MCA_PML_BASE_SEND_STANDARD, comm)); if (OMPI_SUCCESS != err) { return err; } } if (NULL != ptmp) { free(ptmp); } } else { /* I am the root, loop receiving the data. */ err = MCA_PML_CALL(recv(rbuf, rcount*size, rdtype, 0, MCA_COLL_BASE_TAG_GATHER, comm, MPI_STATUS_IGNORE)); if (OMPI_SUCCESS != err) { return err; } } /* All done */ return err; }
static inline __opal_attribute_always_inline__ int mca_coll_ml_allgather_start (const void *sbuf, int scount, struct ompi_datatype_t *sdtype, void* rbuf, int rcount, struct ompi_datatype_t *rdtype, struct ompi_communicator_t *comm, mca_coll_base_module_t *module, ompi_request_t **req) { size_t pack_len, sdt_size; int ret, n_fragments = 1, comm_size; mca_coll_ml_topology_t *topo_info; mca_bcol_base_payload_buffer_desc_t *src_buffer_desc; mca_coll_ml_component_t *cm = &mca_coll_ml_component; mca_coll_ml_collective_operation_progress_t *coll_op; mca_coll_ml_module_t *ml_module = (mca_coll_ml_module_t *) module; ptrdiff_t lb, extent; bool scontig, rcontig, in_place = false; /* check for in place setting */ if (MPI_IN_PLACE == sbuf) { in_place = true; sdtype = rdtype; scount = rcount; } /* scontig could be != to rcontig */ scontig = ompi_datatype_is_contiguous_memory_layout(sdtype, scount); rcontig = ompi_datatype_is_contiguous_memory_layout(rdtype, rcount); comm_size = ompi_comm_size(comm); ML_VERBOSE(10, ("Starting allgather")); assert(NULL != sdtype); /* Calculate size of the data, * at this stage, only contiguous data is supported */ /* this is valid for allagther */ ompi_datatype_type_size(sdtype, &sdt_size); pack_len = scount * sdt_size; if (in_place) { sbuf = (char *) rbuf + ompi_comm_rank(comm) * pack_len; } /* Allocate collective schedule and pack message */ /* this is the total ending message size that will need to fit in the ml-buffer */ if (pack_len <= (size_t) ml_module->small_message_thresholds[BCOL_ALLGATHER]) { /* The len of the message can not be larger than ML buffer size */ ML_VERBOSE(10, ("Single frag %d %d %d", pack_len, comm_size, ml_module->payload_block->size_buffer)); assert(pack_len * comm_size <= ml_module->payload_block->size_buffer); src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module); while (NULL == src_buffer_desc) { opal_progress(); src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module); } /* change 1 */ coll_op = mca_coll_ml_alloc_op_prog_single_frag_dag(ml_module, ml_module->coll_ml_allgather_functions[ML_SMALL_DATA_ALLGATHER], sbuf, rbuf, pack_len, 0 /* offset for first pack */); MCA_COLL_IBOFFLOAD_SET_ML_BUFFER_INFO(coll_op, src_buffer_desc->buffer_index, src_buffer_desc); coll_op->fragment_data.current_coll_op = ML_SMALL_DATA_ALLGATHER; /* task setup callback function */ coll_op->sequential_routine.seq_task_setup = mca_coll_ml_allgather_task_setup; /* change 2 */ if (!scontig) { coll_op->full_message.n_bytes_scheduled = mca_coll_ml_convertor_prepare(sdtype, scount, sbuf, &coll_op->full_message.send_convertor, MCA_COLL_ML_NET_STREAM_SEND); mca_coll_ml_convertor_pack( (void *) ((uintptr_t) src_buffer_desc->data_addr + pack_len * (coll_op->coll_schedule->topo_info->hier_layout_info[0].offset + coll_op->coll_schedule->topo_info->hier_layout_info[0].level_one_index)), pack_len, &coll_op->full_message.send_convertor); } else { /* change 3 */ memcpy((void *)((uintptr_t) src_buffer_desc->data_addr + pack_len * (coll_op->coll_schedule->topo_info->hier_layout_info[0].offset + coll_op->coll_schedule->topo_info->hier_layout_info[0].level_one_index)), sbuf, pack_len); coll_op->full_message.n_bytes_scheduled = pack_len; } if (!rcontig) { mca_coll_ml_convertor_prepare(rdtype, rcount * comm_size, rbuf, &coll_op->full_message.recv_convertor, MCA_COLL_ML_NET_STREAM_RECV); } if (coll_op->coll_schedule->topo_info->ranks_contiguous) { coll_op->process_fn = mca_coll_ml_allgather_small_unpack_data; } else { coll_op->process_fn = mca_coll_ml_allgather_noncontiguous_unpack_data; } /* whole ml-buffer is used to send AND receive */ coll_op->variable_fn_params.sbuf = (void *) src_buffer_desc->data_addr; coll_op->variable_fn_params.rbuf = (void *) src_buffer_desc->data_addr; /* we can set the initial offset here */ coll_op->variable_fn_params.sbuf_offset = 0; coll_op->variable_fn_params.rbuf_offset = 0; coll_op->variable_fn_params.count = scount; coll_op->fragment_data.fragment_size = coll_op->full_message.n_bytes_scheduled; /* For small CINCO, we may use the native datatype */ coll_op->variable_fn_params.dtype = sdtype; coll_op->variable_fn_params.buffer_size = pack_len; coll_op->variable_fn_params.root = 0; } else if (cm->enable_fragmentation || pack_len * comm_size < (1 << 20)) { /* calculate the number of fragments and the size of each frag */ size_t n_dts_per_frag, frag_len; int pipeline_depth = mca_coll_ml_component.pipeline_depth; /* Calculate the number of fragments required for this message careful watch the integer division !*/ frag_len = (pack_len <= (size_t) ml_module->small_message_thresholds[BCOL_ALLGATHER] ? pack_len : (size_t) ml_module->small_message_thresholds[BCOL_ALLGATHER]); n_dts_per_frag = frag_len / sdt_size; n_fragments = (pack_len + sdt_size * n_dts_per_frag - 1) / (sdt_size * n_dts_per_frag); pipeline_depth = (n_fragments < pipeline_depth ? n_fragments : pipeline_depth); src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module); while (NULL == src_buffer_desc) { opal_progress(); src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module); } /* change 4 */ coll_op = mca_coll_ml_alloc_op_prog_single_frag_dag(ml_module, ml_module->coll_ml_allgather_functions[ML_SMALL_DATA_ALLGATHER], sbuf, rbuf, pack_len, 0 /* offset for first pack */); MCA_COLL_IBOFFLOAD_SET_ML_BUFFER_INFO(coll_op, src_buffer_desc->buffer_index, src_buffer_desc); topo_info = coll_op->coll_schedule->topo_info; /* task setup callback function */ coll_op->sequential_routine.seq_task_setup = mca_coll_ml_allgather_task_setup; if (!scontig) { coll_op->full_message.send_converter_bytes_packed = mca_coll_ml_convertor_prepare( sdtype, scount, NULL, &coll_op->full_message.dummy_convertor, MCA_COLL_ML_NET_STREAM_SEND); coll_op->full_message.dummy_conv_position = 0; mca_coll_ml_convertor_get_send_frag_size( ml_module, &frag_len, &coll_op->full_message); /* change 5 */ mca_coll_ml_convertor_prepare(sdtype, scount, sbuf, &coll_op->full_message.send_convertor, MCA_COLL_ML_NET_STREAM_SEND); mca_coll_ml_convertor_pack( (void *) ((uintptr_t) src_buffer_desc->data_addr + frag_len * (topo_info->hier_layout_info[0].offset + topo_info->hier_layout_info[0].level_one_index)), frag_len, &coll_op->full_message.send_convertor); } else { /* change 6 */ memcpy((void *)((uintptr_t)src_buffer_desc->data_addr + frag_len * (topo_info->hier_layout_info[0].offset + topo_info->hier_layout_info[0].level_one_index)), sbuf, frag_len); } if (!rcontig) { mca_coll_ml_convertor_prepare(rdtype, rcount * comm_size, rbuf, &coll_op->full_message.recv_convertor, MCA_COLL_ML_NET_STREAM_RECV); } coll_op->process_fn = mca_coll_ml_allgather_noncontiguous_unpack_data; /* hopefully this doesn't royaly screw things up idea behind this is the * whole ml-buffer is used to send and receive */ coll_op->variable_fn_params.sbuf = (void *) src_buffer_desc->data_addr; coll_op->variable_fn_params.rbuf = (void *) src_buffer_desc->data_addr; /* we can set the initial offset here */ coll_op->variable_fn_params.sbuf_offset = 0; coll_op->variable_fn_params.rbuf_offset = 0; coll_op->fragment_data.buffer_desc = src_buffer_desc; coll_op->fragment_data.fragment_size = frag_len; coll_op->fragment_data.message_descriptor->n_active = 1; coll_op->full_message.n_bytes_scheduled = frag_len; coll_op->full_message.fragment_launcher = mca_coll_ml_allgather_frag_progress; coll_op->full_message.pipeline_depth = pipeline_depth; coll_op->fragment_data.current_coll_op = ML_SMALL_DATA_ALLGATHER; /* remember this is different for frags !! Caused data corruption when * not properly set. Need to be sure you have consistent units. */ coll_op->variable_fn_params.count = frag_len; coll_op->variable_fn_params.dtype = MPI_BYTE; /* for fragmented data, we work in * units of bytes. This means that * all of our arithmetic is done * in terms of bytes */ coll_op->variable_fn_params.root = 0; coll_op->variable_fn_params.frag_size = frag_len; coll_op->variable_fn_params.buffer_size = frag_len; } else { /* change 7 */ ML_VERBOSE(10, ("ML_ALLGATHER_LARGE_DATA_KNOWN case.")); coll_op = mca_coll_ml_alloc_op_prog_single_frag_dag(ml_module, ml_module->coll_ml_allgather_functions[ML_LARGE_DATA_ALLGATHER], sbuf, rbuf, pack_len, 0 /* offset for first pack */); topo_info = coll_op->coll_schedule->topo_info; if (MCA_BCOL_BASE_NO_ML_BUFFER_FOR_LARGE_MSG & topo_info->all_bcols_mode) { MCA_COLL_IBOFFLOAD_SET_ML_BUFFER_INFO(coll_op, MCA_COLL_ML_NO_BUFFER, NULL); } else { src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module); while (NULL == src_buffer_desc) { opal_progress(); src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module); } MCA_COLL_IBOFFLOAD_SET_ML_BUFFER_INFO(coll_op, src_buffer_desc->buffer_index, src_buffer_desc); } /* not sure if I really need this here */ coll_op->sequential_routine.seq_task_setup = mca_coll_ml_allgather_task_setup; coll_op->process_fn = NULL; /* probably the most important piece */ coll_op->variable_fn_params.sbuf = sbuf; coll_op->variable_fn_params.rbuf = rbuf; coll_op->variable_fn_params.sbuf_offset = 0; coll_op->variable_fn_params.rbuf_offset = 0; coll_op->variable_fn_params.count = scount; coll_op->variable_fn_params.dtype = sdtype;/* for zero copy, we want the * native datatype and actual count */ coll_op->variable_fn_params.root = 0; /* you still need to copy in your own data into the rbuf */ /* don't need to do this if you have in place data */ if (!in_place) { memcpy((char *) rbuf + ompi_comm_rank(comm) * pack_len, sbuf, pack_len); } } coll_op->full_message.send_count = scount; coll_op->full_message.recv_count = rcount; coll_op->full_message.send_data_continguous = scontig; coll_op->full_message.recv_data_continguous = rcontig; ompi_datatype_get_extent(sdtype, &lb, &extent); coll_op->full_message.send_extent = (size_t) extent; ompi_datatype_get_extent(rdtype, &lb, &extent); coll_op->full_message.recv_extent = (size_t) extent; /* Fill in the function arguments */ coll_op->variable_fn_params.sequence_num = OPAL_THREAD_ADD32(&(ml_module->collective_sequence_num), 1); coll_op->variable_fn_params.hier_factor = comm_size; MCA_COLL_ML_SET_ORDER_INFO(coll_op, n_fragments); ret = mca_coll_ml_launch_sequential_collective (coll_op); if (OMPI_SUCCESS != ret) { ML_VERBOSE(10, ("Failed to launch")); return ret; } *req = &coll_op->full_message.super; return OMPI_SUCCESS; }
/* * scatterv_inter * * Function: - scatterv operation * Accepts: - same arguments as MPI_Scatterv() * Returns: - MPI_SUCCESS or error code */ int mca_coll_inter_scatterv_inter(void *sbuf, int *scounts, int *disps, struct ompi_datatype_t *sdtype, void *rbuf, int rcount, struct ompi_datatype_t *rdtype, int root, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { int i, rank, size, err, total, size_local; int *counts=NULL,*displace=NULL; char *ptmp=NULL; MPI_Aint incr; MPI_Aint extent; MPI_Aint lb; ompi_datatype_t *ndtype; /* Initialize */ rank = ompi_comm_rank(comm); size = ompi_comm_remote_size(comm); size_local = ompi_comm_size(comm); if (MPI_PROC_NULL == root) { /* do nothing */ err = OMPI_SUCCESS; } else if (MPI_ROOT != root) { if(0 == rank) { /* local root recieves the counts from the root */ counts = (int *)malloc(sizeof(int) * size_local); err = MCA_PML_CALL(recv(counts, size_local, MPI_INT, root, MCA_COLL_BASE_TAG_SCATTERV, comm, MPI_STATUS_IGNORE)); if (OMPI_SUCCESS != err) { return err; } /* calculate the whole buffer size and recieve it from root */ err = ompi_datatype_get_extent(rdtype, &lb, &extent); if (OMPI_SUCCESS != err) { return OMPI_ERROR; } incr = 0; for (i = 0; i < size_local; i++) { incr = incr + extent*counts[i]; } if ( incr > 0 ) { ptmp = (char*)malloc(incr); if (NULL == ptmp) { return OMPI_ERR_OUT_OF_RESOURCE; } } total = 0; for (i = 0; i < size_local; i++) { total = total + counts[i]; } err = MCA_PML_CALL(recv(ptmp, total, rdtype, root, MCA_COLL_BASE_TAG_SCATTERV, comm, MPI_STATUS_IGNORE)); if (OMPI_SUCCESS != err) { return err; } /* set the local displacement i.e. no displacements here */ displace = (int *)malloc(sizeof(int) * size_local); displace[0] = 0; for (i = 1; i < size_local; i++) { displace[i] = displace[i-1] + counts[i-1]; } } /* perform the scatterv locally */ err = comm->c_local_comm->c_coll.coll_scatterv(ptmp, counts, displace, rdtype, rbuf, rcount, rdtype, 0, comm->c_local_comm, comm->c_local_comm->c_coll.coll_scatterv_module); if (OMPI_SUCCESS != err) { return err; } if (NULL != ptmp) { free(ptmp); } if (NULL != displace) { free(displace); } if (NULL != counts) { free(counts); } } else { err = MCA_PML_CALL(send(scounts, size, MPI_INT, 0, MCA_COLL_BASE_TAG_SCATTERV, MCA_PML_BASE_SEND_STANDARD, comm)); if (OMPI_SUCCESS != err) { return err; } ompi_datatype_create_indexed(size,scounts,disps,sdtype,&ndtype); ompi_datatype_commit(&ndtype); err = MCA_PML_CALL(send(sbuf, 1, ndtype, 0, MCA_COLL_BASE_TAG_SCATTERV, MCA_PML_BASE_SEND_STANDARD, comm)); if (OMPI_SUCCESS != err) { return err; } ompi_datatype_destroy(&ndtype); } /* All done */ return err; }
/* * reduce_lin_inter * * Function: - reduction using O(N) algorithm * Accepts: - same as MPI_Reduce() * Returns: - MPI_SUCCESS or error code */ int mca_coll_basic_reduce_lin_inter(void *sbuf, void *rbuf, int count, struct ompi_datatype_t *dtype, struct ompi_op_t *op, int root, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { int i, err, size; ptrdiff_t true_lb, true_extent, lb, extent; char *free_buffer = NULL; char *pml_buffer = NULL; /* Initialize */ size = ompi_comm_remote_size(comm); if (MPI_PROC_NULL == root) { /* do nothing */ err = OMPI_SUCCESS; } else if (MPI_ROOT != root) { /* If not root, send data to the root. */ err = MCA_PML_CALL(send(sbuf, count, dtype, root, MCA_COLL_BASE_TAG_REDUCE, MCA_PML_BASE_SEND_STANDARD, comm)); } else { /* Root receives and reduces messages */ ompi_datatype_get_extent(dtype, &lb, &extent); ompi_datatype_get_true_extent(dtype, &true_lb, &true_extent); free_buffer = (char*)malloc(true_extent + (count - 1) * extent); if (NULL == free_buffer) { return OMPI_ERR_OUT_OF_RESOURCE; } pml_buffer = free_buffer - lb; /* Initialize the receive buffer. */ err = MCA_PML_CALL(recv(rbuf, count, dtype, 0, MCA_COLL_BASE_TAG_REDUCE, comm, MPI_STATUS_IGNORE)); if (MPI_SUCCESS != err) { if (NULL != free_buffer) { free(free_buffer); } return err; } /* Loop receiving and calling reduction function (C or Fortran). */ for (i = 1; i < size; i++) { err = MCA_PML_CALL(recv(pml_buffer, count, dtype, i, MCA_COLL_BASE_TAG_REDUCE, comm, MPI_STATUS_IGNORE)); if (MPI_SUCCESS != err) { if (NULL != free_buffer) { free(free_buffer); } return err; } /* Perform the reduction */ ompi_op_reduce(op, pml_buffer, rbuf, count, dtype); } if (NULL != free_buffer) { free(free_buffer); } } /* All done */ return err; }
static int mca_coll_basic_neighbor_alltoallv_cart(const void *sbuf, const int scounts[], const int sdisps[], struct ompi_datatype_t *sdtype, void *rbuf, const int rcounts[], const int rdisps[], struct ompi_datatype_t *rdtype, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { mca_coll_basic_module_t *basic_module = (mca_coll_basic_module_t *) module; const mca_topo_base_comm_cart_2_2_0_t *cart = comm->c_topo->mtc.cart; const int rank = ompi_comm_rank (comm); int rc = MPI_SUCCESS, dim, i, nreqs; ptrdiff_t lb, rdextent, sdextent; ompi_request_t **reqs; /* ensure we have enough storage for requests */ rc = mca_coll_basic_check_for_requests (basic_module, cart->ndims * 4); if (OMPI_SUCCESS != rc) { return rc; } ompi_datatype_get_extent(rdtype, &lb, &rdextent); ompi_datatype_get_extent(sdtype, &lb, &sdextent); /* post receives first */ for (dim = 0, nreqs = 0, i = 0, reqs = basic_module->mccb_reqs ; dim < cart->ndims ; ++dim, i += 2) { int srank = MPI_PROC_NULL, drank = MPI_PROC_NULL; if (cart->dims[dim] > 1) { mca_topo_base_cart_shift (comm, dim, 1, &srank, &drank); } else if (1 == cart->dims[dim] && cart->periods[dim]) { srank = drank = rank; } if (MPI_PROC_NULL != srank) { rc = MCA_PML_CALL(irecv((char *) rbuf + rdisps[i] * rdextent, rcounts[i], rdtype, srank, MCA_COLL_BASE_TAG_ALLTOALL, comm, reqs++)); if (OMPI_SUCCESS != rc) break; nreqs++; } if (MPI_PROC_NULL != drank) { rc = MCA_PML_CALL(irecv((char *) rbuf + rdisps[i+1] * rdextent, rcounts[i+1], rdtype, drank, MCA_COLL_BASE_TAG_ALLTOALL, comm, reqs++)); if (OMPI_SUCCESS != rc) break; nreqs++; } } if (OMPI_SUCCESS != rc) { /* should probably try to clean up here */ return rc; } for (dim = 0, i = 0 ; dim < cart->ndims ; ++dim, i += 2) { int srank = MPI_PROC_NULL, drank = MPI_PROC_NULL; if (cart->dims[dim] > 1) { mca_topo_base_cart_shift (comm, dim, 1, &srank, &drank); } else if (1 == cart->dims[dim] && cart->periods[dim]) { srank = drank = rank; } if (MPI_PROC_NULL != srank) { /* remove cast from const when the pml layer is updated to take a const for the send buffer */ rc = MCA_PML_CALL(isend((char *) sbuf + sdisps[i] * sdextent, scounts[i], sdtype, srank, MCA_COLL_BASE_TAG_ALLTOALL, MCA_PML_BASE_SEND_STANDARD, comm, reqs++)); if (OMPI_SUCCESS != rc) break; nreqs++; } if (MPI_PROC_NULL != drank) { rc = MCA_PML_CALL(isend((char *) sbuf + sdisps[i+1] * sdextent, scounts[i+1], sdtype, drank, MCA_COLL_BASE_TAG_ALLTOALL, MCA_PML_BASE_SEND_STANDARD, comm, reqs++)); if (OMPI_SUCCESS != rc) break; nreqs++; } } if (OMPI_SUCCESS != rc) { /* should probably try to clean up here */ return rc; } return ompi_request_wait_all (nreqs, basic_module->mccb_reqs, MPI_STATUSES_IGNORE); }
/* * reduce_log_intra * * Function: - reduction using O(log N) algorithm * Accepts: - same as MPI_Reduce() * Returns: - MPI_SUCCESS or error code * * * Performing reduction on each dimension of the hypercube. * An example for 8 procs (dimensions = 3): * * Stage 1, reduce on X dimension, 1 -> 0, 3 -> 2, 5 -> 4, 7 -> 6 * * 6----<---7 proc_0: 0+1 * /| /| proc_1: 1 * / | / | proc_2: 2+3 * / | / | proc_3: 3 * 4----<---5 | proc_4: 4+5 * | 2--< |---3 proc_5: 5 * | / | / proc_6: 6+7 * | / | / proc_7: 7 * |/ |/ * 0----<---1 * * Stage 2, reduce on Y dimension, 2 -> 0, 6 -> 4 * * 6--------7 proc_0: 0+1+2+3 * /| /| proc_1: 1 * v | / | proc_2: 2+3 * / | / | proc_3: 3 * 4--------5 | proc_4: 4+5+6+7 * | 2--- |---3 proc_5: 5 * | / | / proc_6: 6+7 * | v | / proc_7: 7 * |/ |/ * 0--------1 * * Stage 3, reduce on Z dimension, 4 -> 0 * * 6--------7 proc_0: 0+1+2+3+4+5+6+7 * /| /| proc_1: 1 * / | / | proc_2: 2+3 * / | / | proc_3: 3 * 4--------5 | proc_4: 4+5+6+7 * | 2--- |---3 proc_5: 5 * v / | / proc_6: 6+7 * | / | / proc_7: 7 * |/ |/ * 0--------1 * * */ int mca_coll_basic_reduce_log_intra(void *sbuf, void *rbuf, int count, struct ompi_datatype_t *dtype, struct ompi_op_t *op, int root, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { int i, size, rank, vrank; int err, peer, dim, mask; ptrdiff_t true_lb, true_extent, lb, extent; char *free_buffer = NULL; char *free_rbuf = NULL; char *pml_buffer = NULL; char *snd_buffer = NULL; char *rcv_buffer = (char*)rbuf; char *inplace_temp = NULL; /* JMS Codearound for now -- if the operations is not communative, * just call the linear algorithm. Need to talk to Edgar / George * about fixing this algorithm here to work with non-communative * operations. */ if (!ompi_op_is_commute(op)) { return mca_coll_basic_reduce_lin_intra(sbuf, rbuf, count, dtype, op, root, comm, module); } /* Some variables */ size = ompi_comm_size(comm); rank = ompi_comm_rank(comm); vrank = ompi_op_is_commute(op) ? (rank - root + size) % size : rank; dim = comm->c_cube_dim; /* Allocate the incoming and resulting message buffers. See lengthy * rationale above. */ ompi_datatype_get_extent(dtype, &lb, &extent); ompi_datatype_get_true_extent(dtype, &true_lb, &true_extent); free_buffer = (char*)malloc(true_extent + (count - 1) * extent); if (NULL == free_buffer) { return OMPI_ERR_OUT_OF_RESOURCE; } pml_buffer = free_buffer - lb; /* read the comment about commutative operations (few lines down * the page) */ if (ompi_op_is_commute(op)) { rcv_buffer = pml_buffer; } /* Allocate sendbuf in case the MPI_IN_PLACE option has been used. See lengthy * rationale above. */ if (MPI_IN_PLACE == sbuf) { inplace_temp = (char*)malloc(true_extent + (count - 1) * extent); if (NULL == inplace_temp) { err = OMPI_ERR_OUT_OF_RESOURCE; goto cleanup_and_return; } sbuf = inplace_temp - lb; err = ompi_datatype_copy_content_same_ddt(dtype, count, (char*)sbuf, (char*)rbuf); } snd_buffer = (char*)sbuf; if (rank != root && 0 == (vrank & 1)) { /* root is the only one required to provide a valid rbuf. * Assume rbuf is invalid for all other ranks, so fix it up * here to be valid on all non-leaf ranks */ free_rbuf = (char*)malloc(true_extent + (count - 1) * extent); if (NULL == free_rbuf) { err = OMPI_ERR_OUT_OF_RESOURCE; goto cleanup_and_return; } rbuf = free_rbuf - lb; } /* Loop over cube dimensions. High processes send to low ones in the * dimension. */ for (i = 0, mask = 1; i < dim; ++i, mask <<= 1) { /* A high-proc sends to low-proc and stops. */ if (vrank & mask) { peer = vrank & ~mask; if (ompi_op_is_commute(op)) { peer = (peer + root) % size; } err = MCA_PML_CALL(send(snd_buffer, count, dtype, peer, MCA_COLL_BASE_TAG_REDUCE, MCA_PML_BASE_SEND_STANDARD, comm)); if (MPI_SUCCESS != err) { goto cleanup_and_return; } snd_buffer = (char*)rbuf; break; } /* A low-proc receives, reduces, and moves to a higher * dimension. */ else { peer = vrank | mask; if (peer >= size) { continue; } if (ompi_op_is_commute(op)) { peer = (peer + root) % size; } /* Most of the time (all except the first one for commutative * operations) we receive in the user provided buffer * (rbuf). But the exception is here to allow us to dont have * to copy from the sbuf to a temporary location. If the * operation is commutative we dont care in which order we * apply the operation, so for the first time we can receive * the data in the pml_buffer and then apply to operation * between this buffer and the user provided data. */ err = MCA_PML_CALL(recv(rcv_buffer, count, dtype, peer, MCA_COLL_BASE_TAG_REDUCE, comm, MPI_STATUS_IGNORE)); if (MPI_SUCCESS != err) { goto cleanup_and_return; } /* Perform the operation. The target is always the user * provided buffer We do the operation only if we receive it * not in the user buffer */ if (snd_buffer != sbuf) { /* the target buffer is the locally allocated one */ ompi_op_reduce(op, rcv_buffer, pml_buffer, count, dtype); } else { /* If we're commutative, we don't care about the order of * operations and we can just reduce the operations now. * If we are not commutative, we have to copy the send * buffer into a temp buffer (pml_buffer) and then reduce * what we just received against it. */ if (!ompi_op_is_commute(op)) { ompi_datatype_copy_content_same_ddt(dtype, count, pml_buffer, (char*)sbuf); ompi_op_reduce(op, rbuf, pml_buffer, count, dtype); } else { ompi_op_reduce(op, sbuf, pml_buffer, count, dtype); } /* now we have to send the buffer containing the computed data */ snd_buffer = pml_buffer; /* starting from now we always receive in the user * provided buffer */ rcv_buffer = (char*)rbuf; } } } /* Get the result to the root if needed. */ err = MPI_SUCCESS; if (0 == vrank) { if (root == rank) { ompi_datatype_copy_content_same_ddt(dtype, count, (char*)rbuf, snd_buffer); } else { err = MCA_PML_CALL(send(snd_buffer, count, dtype, root, MCA_COLL_BASE_TAG_REDUCE, MCA_PML_BASE_SEND_STANDARD, comm)); } } else if (rank == root) { err = MCA_PML_CALL(recv(rcv_buffer, count, dtype, 0, MCA_COLL_BASE_TAG_REDUCE, comm, MPI_STATUS_IGNORE)); if (rcv_buffer != rbuf) { ompi_op_reduce(op, rcv_buffer, rbuf, count, dtype); } } cleanup_and_return: if (NULL != inplace_temp) { free(inplace_temp); } if (NULL != free_buffer) { free(free_buffer); } if (NULL != free_rbuf) { free(free_rbuf); } /* All done */ return err; }
static int mca_coll_basic_neighbor_allgather_cart(const void *sbuf, int scount, struct ompi_datatype_t *sdtype, void *rbuf, int rcount, struct ompi_datatype_t *rdtype, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { const mca_topo_base_comm_cart_2_2_0_t *cart = comm->c_topo->mtc.cart; const int rank = ompi_comm_rank (comm); ompi_request_t **reqs, **preqs; ptrdiff_t lb, extent; int rc = MPI_SUCCESS, dim, nreqs; if( 0 == cart->ndims ) return OMPI_SUCCESS; ompi_datatype_get_extent(rdtype, &lb, &extent); reqs = preqs = coll_base_comm_get_reqs( module->base_data, 4 * cart->ndims ); if( NULL == reqs ) { return OMPI_ERR_OUT_OF_RESOURCE; } /* The ordering is defined as -1 then +1 in each dimension in * order of dimension. */ for (dim = 0, nreqs = 0 ; dim < cart->ndims ; ++dim) { int srank = MPI_PROC_NULL, drank = MPI_PROC_NULL; if (cart->dims[dim] > 1) { mca_topo_base_cart_shift (comm, dim, 1, &srank, &drank); } else if (1 == cart->dims[dim] && cart->periods[dim]) { srank = drank = rank; } if (MPI_PROC_NULL != srank) { nreqs++; rc = MCA_PML_CALL(irecv(rbuf, rcount, rdtype, srank, MCA_COLL_BASE_TAG_ALLGATHER, comm, preqs++)); if (OMPI_SUCCESS != rc) break; nreqs++; /* remove cast from const when the pml layer is updated to take * a const for the send buffer. */ rc = MCA_PML_CALL(isend((void *) sbuf, scount, sdtype, srank, MCA_COLL_BASE_TAG_ALLGATHER, MCA_PML_BASE_SEND_STANDARD, comm, preqs++)); if (OMPI_SUCCESS != rc) break; } rbuf = (char *) rbuf + extent * rcount; if (MPI_PROC_NULL != drank) { nreqs++; rc = MCA_PML_CALL(irecv(rbuf, rcount, rdtype, drank, MCA_COLL_BASE_TAG_ALLGATHER, comm, preqs++)); if (OMPI_SUCCESS != rc) break; nreqs++; rc = MCA_PML_CALL(isend((void *) sbuf, scount, sdtype, drank, MCA_COLL_BASE_TAG_ALLGATHER, MCA_PML_BASE_SEND_STANDARD, comm, preqs++)); if (OMPI_SUCCESS != rc) break; } rbuf = (char *) rbuf + extent * rcount; } if (OMPI_SUCCESS != rc) { ompi_coll_base_free_reqs(reqs, nreqs); return rc; } rc = ompi_request_wait_all (nreqs, reqs, MPI_STATUSES_IGNORE); if (OMPI_SUCCESS != rc) { ompi_coll_base_free_reqs(reqs, nreqs); } return rc; }
/* * reduce_lin_intra * * Function: - reduction using O(N) algorithm * Accepts: - same as MPI_Reduce() * Returns: - MPI_SUCCESS or error code */ int mca_coll_basic_reduce_lin_intra(void *sbuf, void *rbuf, int count, struct ompi_datatype_t *dtype, struct ompi_op_t *op, int root, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { int i, rank, err, size; ptrdiff_t true_lb, true_extent, lb, extent; char *free_buffer = NULL; char *pml_buffer = NULL; char *inplace_temp = NULL; char *inbuf; /* Initialize */ rank = ompi_comm_rank(comm); size = ompi_comm_size(comm); /* If not root, send data to the root. */ if (rank != root) { err = MCA_PML_CALL(send(sbuf, count, dtype, root, MCA_COLL_BASE_TAG_REDUCE, MCA_PML_BASE_SEND_STANDARD, comm)); return err; } /* Root receives and reduces messages. Allocate buffer to receive * messages. This comment applies to all collectives in this basic * module where we allocate a temporary buffer. For the next few * lines of code, it's tremendously complicated how we decided that * this was the Right Thing to do. Sit back and enjoy. And prepare * to have your mind warped. :-) * * Recall some definitions (I always get these backwards, so I'm * going to put them here): * * extent: the length from the lower bound to the upper bound -- may * be considerably larger than the buffer required to hold the data * (or smaller! But it's easiest to think about when it's larger). * * true extent: the exact number of bytes required to hold the data * in the layout pattern in the datatype. * * For example, consider the following buffer (just talking about * LB, extent, and true extent -- extrapolate for UB; i.e., assume * the UB equals exactly where the data ends): * * A B C * -------------------------------------------------------- * | | | * -------------------------------------------------------- * * There are multiple cases: * * 1. A is what we give to MPI_Send (and friends), and A is where * the data starts, and C is where the data ends. In this case: * * - extent: C-A * - true extent: C-A * - LB: 0 * * A C * -------------------------------------------------------- * | | * -------------------------------------------------------- * <=======================extent=========================> * <======================true extent=====================> * * 2. A is what we give to MPI_Send (and friends), B is where the * data starts, and C is where the data ends. In this case: * * - extent: C-A * - true extent: C-B * - LB: positive * * A B C * -------------------------------------------------------- * | | User buffer | * -------------------------------------------------------- * <=======================extent=========================> * <===============true extent=============> * * 3. B is what we give to MPI_Send (and friends), A is where the * data starts, and C is where the data ends. In this case: * * - extent: C-A * - true extent: C-A * - LB: negative * * A B C * -------------------------------------------------------- * | | User buffer | * -------------------------------------------------------- * <=======================extent=========================> * <======================true extent=====================> * * 4. MPI_BOTTOM is what we give to MPI_Send (and friends), B is * where the data starts, and C is where the data ends. In this * case: * * - extent: C-MPI_BOTTOM * - true extent: C-B * - LB: [potentially very large] positive * * MPI_BOTTOM B C * -------------------------------------------------------- * | | User buffer | * -------------------------------------------------------- * <=======================extent=========================> * <===============true extent=============> * * So in all cases, for a temporary buffer, all we need to malloc() * is a buffer of size true_extent. We therefore need to know two * pointer values: what value to give to MPI_Send (and friends) and * what value to give to free(), because they might not be the same. * * Clearly, what we give to free() is exactly what was returned from * malloc(). That part is easy. :-) * * What we give to MPI_Send (and friends) is a bit more complicated. * Let's take the 4 cases from above: * * 1. If A is what we give to MPI_Send and A is where the data * starts, then clearly we give to MPI_Send what we got back from * malloc(). * * 2. If B is what we get back from malloc, but we give A to * MPI_Send, then the buffer range [A,B) represents "dead space" * -- no data will be put there. So it's safe to give B-LB to * MPI_Send. More specifically, the LB is positive, so B-LB is * actually A. * * 3. If A is what we get back from malloc, and B is what we give to * MPI_Send, then the LB is negative, so A-LB will actually equal * B. * * 4. Although this seems like the weirdest case, it's actually * quite similar to case #2 -- the pointer we give to MPI_Send is * smaller than the pointer we got back from malloc(). * * Hence, in all cases, we give (return_from_malloc - LB) to MPI_Send. * * This works fine and dandy if we only have (count==1), which we * rarely do. ;-) So we really need to allocate (true_extent + * ((count - 1) * extent)) to get enough space for the rest. This may * be more than is necessary, but it's ok. * * Simple, no? :-) * */ ompi_datatype_get_extent(dtype, &lb, &extent); ompi_datatype_get_true_extent(dtype, &true_lb, &true_extent); if (MPI_IN_PLACE == sbuf) { sbuf = rbuf; inplace_temp = (char*)malloc(true_extent + (count - 1) * extent); if (NULL == inplace_temp) { return OMPI_ERR_OUT_OF_RESOURCE; } rbuf = inplace_temp - lb; } if (size > 1) { free_buffer = (char*)malloc(true_extent + (count - 1) * extent); if (NULL == free_buffer) { if (NULL != inplace_temp) { free(inplace_temp); } return OMPI_ERR_OUT_OF_RESOURCE; } pml_buffer = free_buffer - lb; } /* Initialize the receive buffer. */ if (rank == (size - 1)) { err = ompi_datatype_copy_content_same_ddt(dtype, count, (char*)rbuf, (char*)sbuf); } else { err = MCA_PML_CALL(recv(rbuf, count, dtype, size - 1, MCA_COLL_BASE_TAG_REDUCE, comm, MPI_STATUS_IGNORE)); } if (MPI_SUCCESS != err) { if (NULL != free_buffer) { free(free_buffer); } return err; } /* Loop receiving and calling reduction function (C or Fortran). */ for (i = size - 2; i >= 0; --i) { if (rank == i) { inbuf = (char*)sbuf; } else { err = MCA_PML_CALL(recv(pml_buffer, count, dtype, i, MCA_COLL_BASE_TAG_REDUCE, comm, MPI_STATUS_IGNORE)); if (MPI_SUCCESS != err) { if (NULL != free_buffer) { free(free_buffer); } return err; } inbuf = pml_buffer; } /* Perform the reduction */ ompi_op_reduce(op, inbuf, rbuf, count, dtype); } if (NULL != inplace_temp) { err = ompi_datatype_copy_content_same_ddt(dtype, count, (char*)sbuf, inplace_temp); free(inplace_temp); } if (NULL != free_buffer) { free(free_buffer); } /* All done */ return MPI_SUCCESS; }