static int mca_coll_basic_neighbor_allgather_dist_graph(const void *sbuf, int scount, struct ompi_datatype_t *sdtype, void *rbuf, int rcount, struct ompi_datatype_t *rdtype, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { const mca_topo_base_comm_dist_graph_2_2_0_t *dist_graph = comm->c_topo->mtc.dist_graph; const int *inedges, *outedges; int indegree, outdegree; ompi_request_t **reqs, **preqs; ptrdiff_t lb, extent; int rc = MPI_SUCCESS, neighbor; indegree = dist_graph->indegree; outdegree = dist_graph->outdegree; if( 0 == (indegree + outdegree) ) return OMPI_SUCCESS; inedges = dist_graph->in; outedges = dist_graph->out; ompi_datatype_get_extent(rdtype, &lb, &extent); reqs = preqs = coll_base_comm_get_reqs( module->base_data, indegree + outdegree); if( NULL == reqs ) { return OMPI_ERR_OUT_OF_RESOURCE; } for (neighbor = 0; neighbor < indegree ; ++neighbor) { rc = MCA_PML_CALL(irecv(rbuf, rcount, rdtype, inedges[neighbor], MCA_COLL_BASE_TAG_ALLGATHER, comm, preqs++)); if (OMPI_SUCCESS != rc) break; rbuf = (char *) rbuf + extent * rcount; } if (OMPI_SUCCESS != rc) { ompi_coll_base_free_reqs(reqs, neighbor + 1); return rc; } for (neighbor = 0 ; neighbor < outdegree ; ++neighbor) { /* remove cast from const when the pml layer is updated to take * a const for the send buffer. */ rc = MCA_PML_CALL(isend((void *) sbuf, scount, sdtype, outedges[neighbor], MCA_COLL_BASE_TAG_ALLGATHER, MCA_PML_BASE_SEND_STANDARD, comm, preqs++)); if (OMPI_SUCCESS != rc) break; } if (OMPI_SUCCESS != rc) { ompi_coll_base_free_reqs(reqs, indegree + neighbor + 1); return rc; } rc = ompi_request_wait_all (indegree + outdegree, reqs, MPI_STATUSES_IGNORE); if (OMPI_SUCCESS != rc) { ompi_coll_base_free_reqs(reqs, indegree + outdegree); } return rc; }
static int mca_coll_basic_neighbor_alltoallv_graph(const void *sbuf, const int scounts[], const int sdisps[], struct ompi_datatype_t *sdtype, void *rbuf, const int rcounts[], const int rdisps[], struct ompi_datatype_t *rdtype, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { const mca_topo_base_comm_graph_2_2_0_t *graph = comm->c_topo->mtc.graph; int rc = MPI_SUCCESS, neighbor, degree; const int rank = ompi_comm_rank (comm); ptrdiff_t lb, rdextent, sdextent; ompi_request_t **reqs, **preqs; const int *edges; mca_topo_base_graph_neighbors_count (comm, rank, °ree); if( 0 == degree ) return OMPI_SUCCESS; edges = graph->edges; if (rank > 0) { edges += graph->index[rank - 1]; } ompi_datatype_get_extent(rdtype, &lb, &rdextent); ompi_datatype_get_extent(sdtype, &lb, &sdextent); reqs = preqs = ompi_coll_base_comm_get_reqs( module->base_data, 2 * degree ); if( NULL == reqs ) { return OMPI_ERR_OUT_OF_RESOURCE; } /* post all receives first */ for (neighbor = 0; neighbor < degree ; ++neighbor) { rc = MCA_PML_CALL(irecv((char *) rbuf + rdisps[neighbor] * rdextent, rcounts[neighbor], rdtype, edges[neighbor], MCA_COLL_BASE_TAG_ALLTOALL, comm, preqs++)); if (OMPI_SUCCESS != rc) break; } if (OMPI_SUCCESS != rc) { ompi_coll_base_free_reqs( reqs, neighbor + 1); return rc; } for (neighbor = 0 ; neighbor < degree ; ++neighbor) { /* remove cast from const when the pml layer is updated to take a const for the send buffer */ rc = MCA_PML_CALL(isend((char *) sbuf + sdisps[neighbor] * sdextent, scounts[neighbor], sdtype, edges[neighbor], MCA_COLL_BASE_TAG_ALLTOALL, MCA_PML_BASE_SEND_STANDARD, comm, preqs++)); if (OMPI_SUCCESS != rc) break; } if (OMPI_SUCCESS != rc) { ompi_coll_base_free_reqs( reqs, degree + neighbor + 1); return rc; } rc = ompi_request_wait_all (degree * 2, reqs, MPI_STATUSES_IGNORE); if (OMPI_SUCCESS != rc) { ompi_coll_base_free_reqs( reqs, degree * 2); } return rc; }
/* * scatterv_inter * * Function: - scatterv operation * Accepts: - same arguments as MPI_Scatterv() * Returns: - MPI_SUCCESS or error code */ int mca_coll_basic_scatterv_inter(const void *sbuf, const int *scounts, const int *disps, struct ompi_datatype_t *sdtype, void *rbuf, int rcount, struct ompi_datatype_t *rdtype, int root, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { int i, size, err; char *ptmp; ptrdiff_t lb, extent; ompi_request_t **reqs; /* Initialize */ size = ompi_comm_remote_size(comm); /* If not root, receive data. Note that we will only get here if * rcount > 0 or rank == root. */ if (MPI_PROC_NULL == root) { /* do nothing */ err = OMPI_SUCCESS; } else if (MPI_ROOT != root) { /* If not root, receive data. */ err = MCA_PML_CALL(recv(rbuf, rcount, rdtype, root, MCA_COLL_BASE_TAG_SCATTERV, comm, MPI_STATUS_IGNORE)); } else { /* I am the root, loop sending data. */ err = ompi_datatype_get_extent(sdtype, &lb, &extent); if (OMPI_SUCCESS != err) { return OMPI_ERROR; } reqs = coll_base_comm_get_reqs(module->base_data, size); for (i = 0; i < size; ++i) { ptmp = ((char *) sbuf) + (extent * disps[i]); err = MCA_PML_CALL(isend(ptmp, scounts[i], sdtype, i, MCA_COLL_BASE_TAG_SCATTERV, MCA_PML_BASE_SEND_STANDARD, comm, &(reqs[i]))); if (OMPI_SUCCESS != err) { ompi_coll_base_free_reqs(reqs, i); return err; } } err = ompi_request_wait_all(size, reqs, MPI_STATUSES_IGNORE); if (OMPI_SUCCESS != err) { ompi_coll_base_free_reqs(reqs, size); } } /* All done */ return err; }
/* * gatherv_inter * * Function: - basic gatherv operation * Accepts: - same arguments as MPI_Gatherv() * Returns: - MPI_SUCCESS or error code */ int mca_coll_basic_gatherv_inter(const void *sbuf, int scount, struct ompi_datatype_t *sdtype, void *rbuf, const int *rcounts, const int *disps, struct ompi_datatype_t *rdtype, int root, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { int i, size, err; char *ptmp; ptrdiff_t lb, extent; ompi_request_t **reqs = NULL; size = ompi_comm_remote_size(comm); /* If not root, receive data. Note that we will only get here if * scount > 0 or rank == root. */ if (MPI_PROC_NULL == root) { /* do nothing */ err = OMPI_SUCCESS; } else if (MPI_ROOT != root) { /* Everyone but root sends data and returns. */ err = MCA_PML_CALL(send(sbuf, scount, sdtype, root, MCA_COLL_BASE_TAG_GATHERV, MCA_PML_BASE_SEND_STANDARD, comm)); } else { /* I am the root, loop receiving data. */ err = ompi_datatype_get_extent(rdtype, &lb, &extent); if (OMPI_SUCCESS != err) { return OMPI_ERROR; } reqs = coll_base_comm_get_reqs(module->base_data, size); if( NULL == reqs ) { return OMPI_ERR_OUT_OF_RESOURCE; } for (i = 0; i < size; ++i) { ptmp = ((char *) rbuf) + (extent * disps[i]); err = MCA_PML_CALL(irecv(ptmp, rcounts[i], rdtype, i, MCA_COLL_BASE_TAG_GATHERV, comm, &reqs[i])); if (OMPI_SUCCESS != err) { ompi_coll_base_free_reqs(reqs, i + 1); return err; } } err = ompi_request_wait_all(size, reqs, MPI_STATUSES_IGNORE); if (OMPI_SUCCESS != err) { ompi_coll_base_free_reqs(reqs, size); } } /* All done */ return err; }
static int mca_coll_basic_neighbor_allgather_graph(const void *sbuf, int scount, struct ompi_datatype_t *sdtype, void *rbuf, int rcount, struct ompi_datatype_t *rdtype, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { const mca_topo_base_comm_graph_2_2_0_t *graph = comm->c_topo->mtc.graph; const int rank = ompi_comm_rank (comm); const int *edges; int degree; ompi_request_t **reqs, **preqs; ptrdiff_t lb, extent; int rc = MPI_SUCCESS, neighbor; mca_topo_base_graph_neighbors_count (comm, rank, °ree); edges = graph->edges; if (rank > 0) { edges += graph->index[rank - 1]; } ompi_datatype_get_extent(rdtype, &lb, &extent); reqs = preqs = coll_base_comm_get_reqs( module->base_data, 2 * degree); if( NULL == reqs ) { return OMPI_ERR_OUT_OF_RESOURCE; } for (neighbor = 0; neighbor < degree ; ++neighbor) { rc = MCA_PML_CALL(irecv(rbuf, rcount, rdtype, edges[neighbor], MCA_COLL_BASE_TAG_ALLGATHER, comm, preqs++)); if (OMPI_SUCCESS != rc) break; rbuf = (char *) rbuf + extent * rcount; /* remove cast from const when the pml layer is updated to take * a const for the send buffer. */ rc = MCA_PML_CALL(isend((void *) sbuf, scount, sdtype, edges[neighbor], MCA_COLL_BASE_TAG_ALLGATHER, MCA_PML_BASE_SEND_STANDARD, comm, preqs++)); if (OMPI_SUCCESS != rc) break; } if (OMPI_SUCCESS != rc) { ompi_coll_base_free_reqs( reqs, (2 * neighbor + 1)); return rc; } rc = ompi_request_wait_all (degree * 2, reqs, MPI_STATUSES_IGNORE); if (OMPI_SUCCESS != rc) { ompi_coll_base_free_reqs( reqs, degree * 2); } return rc; }
static void coll_base_comm_destruct(mca_coll_base_comm_t *data) { if( NULL != data->mcct_reqs ) { ompi_coll_base_free_reqs( data->mcct_reqs, data->mcct_num_reqs ); free(data->mcct_reqs); data->mcct_reqs = NULL; data->mcct_num_reqs = 0; } assert(0 == data->mcct_num_reqs); /* free any cached information that has been allocated */ if (data->cached_ntree) { /* destroy general tree if defined */ ompi_coll_base_topo_destroy_tree (&data->cached_ntree); } if (data->cached_bintree) { /* destroy bintree if defined */ ompi_coll_base_topo_destroy_tree (&data->cached_bintree); } if (data->cached_bmtree) { /* destroy bmtree if defined */ ompi_coll_base_topo_destroy_tree (&data->cached_bmtree); } if (data->cached_in_order_bmtree) { /* destroy bmtree if defined */ ompi_coll_base_topo_destroy_tree (&data->cached_in_order_bmtree); } if (data->cached_chain) { /* destroy general chain if defined */ ompi_coll_base_topo_destroy_tree (&data->cached_chain); } if (data->cached_pipeline) { /* destroy pipeline if defined */ ompi_coll_base_topo_destroy_tree (&data->cached_pipeline); } if (data->cached_in_order_bintree) { /* destroy in order bintree if defined */ ompi_coll_base_topo_destroy_tree (&data->cached_in_order_bintree); } }
int ompi_coll_base_barrier_intra_basic_linear(struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { int i, err, rank, size, line; ompi_request_t** requests = NULL; rank = ompi_comm_rank(comm); size = ompi_comm_size(comm); /* All non-root send & receive zero-length message. */ if (rank > 0) { err = MCA_PML_CALL(send (NULL, 0, MPI_BYTE, 0, MCA_COLL_BASE_TAG_BARRIER, MCA_PML_BASE_SEND_STANDARD, comm)); if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } err = MCA_PML_CALL(recv (NULL, 0, MPI_BYTE, 0, MCA_COLL_BASE_TAG_BARRIER, comm, MPI_STATUS_IGNORE)); if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } } /* The root collects and broadcasts the messages. */ else { requests = coll_base_comm_get_reqs(module->base_data, size); if( NULL == requests ) { err = OMPI_ERR_OUT_OF_RESOURCE; line = __LINE__; goto err_hndl; } for (i = 1; i < size; ++i) { err = MCA_PML_CALL(irecv(NULL, 0, MPI_BYTE, MPI_ANY_SOURCE, MCA_COLL_BASE_TAG_BARRIER, comm, &(requests[i]))); if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } } err = ompi_request_wait_all( size-1, requests+1, MPI_STATUSES_IGNORE ); if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } requests = NULL; /* we're done the requests array is clean */ for (i = 1; i < size; ++i) { err = MCA_PML_CALL(send(NULL, 0, MPI_BYTE, i, MCA_COLL_BASE_TAG_BARRIER, MCA_PML_BASE_SEND_STANDARD, comm)); if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } } } /* All done */ return MPI_SUCCESS; err_hndl: OPAL_OUTPUT( (ompi_coll_base_framework.framework_output,"%s:%4d\tError occurred %d, rank %2d", __FILE__, line, err, rank) ); (void)line; // silence compiler warning if( NULL != requests ) ompi_coll_base_free_reqs(requests, size); return err; }
/** * This is a generic implementation of the reduce protocol. It used the tree * provided as an argument and execute all operations using a segment of * count times a datatype. * For the last communication it will update the count in order to limit * the number of datatype to the original count (original_count) * * Note that for non-commutative operations we cannot save memory copy * for the first block: thus we must copy sendbuf to accumbuf on intermediate * to keep the optimized loop happy. */ int ompi_coll_base_reduce_generic( const void* sendbuf, void* recvbuf, int original_count, ompi_datatype_t* datatype, ompi_op_t* op, int root, ompi_communicator_t* comm, mca_coll_base_module_t *module, ompi_coll_tree_t* tree, int count_by_segment, int max_outstanding_reqs ) { char *inbuf[2] = {NULL, NULL}, *inbuf_free[2] = {NULL, NULL}; char *accumbuf = NULL, *accumbuf_free = NULL; char *local_op_buffer = NULL, *sendtmpbuf = NULL; ptrdiff_t extent, size, gap = 0, segment_increment; ompi_request_t **sreq = NULL, *reqs[2] = {MPI_REQUEST_NULL, MPI_REQUEST_NULL}; int num_segments, line, ret, segindex, i, rank; int recvcount, prevcount, inbi; /** * Determine number of segments and number of elements * sent per operation */ ompi_datatype_type_extent( datatype, &extent ); num_segments = (int)(((size_t)original_count + (size_t)count_by_segment - (size_t)1) / (size_t)count_by_segment); segment_increment = (ptrdiff_t)count_by_segment * extent; sendtmpbuf = (char*) sendbuf; if( sendbuf == MPI_IN_PLACE ) { sendtmpbuf = (char *)recvbuf; } OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "coll:base:reduce_generic count %d, msg size %ld, segsize %ld, max_requests %d", original_count, (unsigned long)((ptrdiff_t)num_segments * (ptrdiff_t)segment_increment), (unsigned long)segment_increment, max_outstanding_reqs)); rank = ompi_comm_rank(comm); /* non-leaf nodes - wait for children to send me data & forward up (if needed) */ if( tree->tree_nextsize > 0 ) { ptrdiff_t real_segment_size; /* handle non existant recv buffer (i.e. its NULL) and protect the recv buffer on non-root nodes */ accumbuf = (char*)recvbuf; if( (NULL == accumbuf) || (root != rank) ) { /* Allocate temporary accumulator buffer. */ size = opal_datatype_span(&datatype->super, original_count, &gap); accumbuf_free = (char*)malloc(size); if (accumbuf_free == NULL) { line = __LINE__; ret = -1; goto error_hndl; } accumbuf = accumbuf_free - gap; } /* If this is a non-commutative operation we must copy sendbuf to the accumbuf, in order to simplfy the loops */ if (!ompi_op_is_commute(op) && MPI_IN_PLACE != sendbuf) { ompi_datatype_copy_content_same_ddt(datatype, original_count, (char*)accumbuf, (char*)sendtmpbuf); } /* Allocate two buffers for incoming segments */ real_segment_size = opal_datatype_span(&datatype->super, count_by_segment, &gap); inbuf_free[0] = (char*) malloc(real_segment_size); if( inbuf_free[0] == NULL ) { line = __LINE__; ret = -1; goto error_hndl; } inbuf[0] = inbuf_free[0] - gap; /* if there is chance to overlap communication - allocate second buffer */ if( (num_segments > 1) || (tree->tree_nextsize > 1) ) { inbuf_free[1] = (char*) malloc(real_segment_size); if( inbuf_free[1] == NULL ) { line = __LINE__; ret = -1; goto error_hndl; } inbuf[1] = inbuf_free[1] - gap; } /* reset input buffer index and receive count */ inbi = 0; recvcount = 0; /* for each segment */ for( segindex = 0; segindex <= num_segments; segindex++ ) { prevcount = recvcount; /* recvcount - number of elements in current segment */ recvcount = count_by_segment; if( segindex == (num_segments-1) ) recvcount = original_count - (ptrdiff_t)count_by_segment * (ptrdiff_t)segindex; /* for each child */ for( i = 0; i < tree->tree_nextsize; i++ ) { /** * We try to overlap communication: * either with next segment or with the next child */ /* post irecv for current segindex on current child */ if( segindex < num_segments ) { void* local_recvbuf = inbuf[inbi]; if( 0 == i ) { /* for the first step (1st child per segment) and * commutative operations we might be able to irecv * directly into the accumulate buffer so that we can * reduce(op) this with our sendbuf in one step as * ompi_op_reduce only has two buffer pointers, * this avoids an extra memory copy. * * BUT if the operation is non-commutative or * we are root and are USING MPI_IN_PLACE this is wrong! */ if( (ompi_op_is_commute(op)) && !((MPI_IN_PLACE == sendbuf) && (rank == tree->tree_root)) ) { local_recvbuf = accumbuf + (ptrdiff_t)segindex * (ptrdiff_t)segment_increment; } } ret = MCA_PML_CALL(irecv(local_recvbuf, recvcount, datatype, tree->tree_next[i], MCA_COLL_BASE_TAG_REDUCE, comm, &reqs[inbi])); if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl;} } /* wait for previous req to complete, if any. if there are no requests reqs[inbi ^1] will be MPI_REQUEST_NULL. */ /* wait on data from last child for previous segment */ ret = ompi_request_wait(&reqs[inbi ^ 1], MPI_STATUSES_IGNORE ); if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } local_op_buffer = inbuf[inbi ^ 1]; if( i > 0 ) { /* our first operation is to combine our own [sendbuf] data * with the data we recvd from down stream (but only * the operation is commutative and if we are not root and * not using MPI_IN_PLACE) */ if( 1 == i ) { if( (ompi_op_is_commute(op)) && !((MPI_IN_PLACE == sendbuf) && (rank == tree->tree_root)) ) { local_op_buffer = sendtmpbuf + (ptrdiff_t)segindex * (ptrdiff_t)segment_increment; } } /* apply operation */ ompi_op_reduce(op, local_op_buffer, accumbuf + (ptrdiff_t)segindex * (ptrdiff_t)segment_increment, recvcount, datatype ); } else if ( segindex > 0 ) { void* accumulator = accumbuf + (ptrdiff_t)(segindex-1) * (ptrdiff_t)segment_increment; if( tree->tree_nextsize <= 1 ) { if( (ompi_op_is_commute(op)) && !((MPI_IN_PLACE == sendbuf) && (rank == tree->tree_root)) ) { local_op_buffer = sendtmpbuf + (ptrdiff_t)(segindex-1) * (ptrdiff_t)segment_increment; } } ompi_op_reduce(op, local_op_buffer, accumulator, prevcount, datatype ); /* all reduced on available data this step (i) complete, * pass to the next process unless you are the root. */ if (rank != tree->tree_root) { /* send combined/accumulated data to parent */ ret = MCA_PML_CALL( send( accumulator, prevcount, datatype, tree->tree_prev, MCA_COLL_BASE_TAG_REDUCE, MCA_PML_BASE_SEND_STANDARD, comm) ); if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } } /* we stop when segindex = number of segments (i.e. we do num_segment+1 steps for pipelining */ if (segindex == num_segments) break; } /* update input buffer index */ inbi = inbi ^ 1; } /* end of for each child */ } /* end of for each segment */ /* clean up */ if( inbuf_free[0] != NULL) free(inbuf_free[0]); if( inbuf_free[1] != NULL) free(inbuf_free[1]); if( accumbuf_free != NULL ) free(accumbuf_free); } /* leaf nodes Depending on the value of max_outstanding_reqs and the number of segments we have two options: - send all segments using blocking send to the parent, or - avoid overflooding the parent nodes by limiting the number of outstanding requests to max_oustanding_reqs. TODO/POSSIBLE IMPROVEMENT: If there is a way to determine the eager size for the current communication, synchronization should be used only when the message/segment size is smaller than the eager size. */ else { /* If the number of segments is less than a maximum number of oustanding requests or there is no limit on the maximum number of outstanding requests, we send data to the parent using blocking send */ if ((0 == max_outstanding_reqs) || (num_segments <= max_outstanding_reqs)) { segindex = 0; while ( original_count > 0) { if (original_count < count_by_segment) { count_by_segment = original_count; } ret = MCA_PML_CALL( send((char*)sendbuf + (ptrdiff_t)segindex * (ptrdiff_t)segment_increment, count_by_segment, datatype, tree->tree_prev, MCA_COLL_BASE_TAG_REDUCE, MCA_PML_BASE_SEND_STANDARD, comm) ); if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } segindex++; original_count -= count_by_segment; } } /* Otherwise, introduce flow control: - post max_outstanding_reqs non-blocking synchronous send, - for remaining segments - wait for a ssend to complete, and post the next one. - wait for all outstanding sends to complete. */ else { int creq = 0; sreq = ompi_coll_base_comm_get_reqs(module->base_data, max_outstanding_reqs); if (NULL == sreq) { line = __LINE__; ret = -1; goto error_hndl; } /* post first group of requests */ for (segindex = 0; segindex < max_outstanding_reqs; segindex++) { ret = MCA_PML_CALL( isend((char*)sendbuf + (ptrdiff_t)segindex * (ptrdiff_t)segment_increment, count_by_segment, datatype, tree->tree_prev, MCA_COLL_BASE_TAG_REDUCE, MCA_PML_BASE_SEND_SYNCHRONOUS, comm, &sreq[segindex]) ); if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } original_count -= count_by_segment; } creq = 0; while ( original_count > 0 ) { /* wait on a posted request to complete */ ret = ompi_request_wait(&sreq[creq], MPI_STATUS_IGNORE); if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } if( original_count < count_by_segment ) { count_by_segment = original_count; } ret = MCA_PML_CALL( isend((char*)sendbuf + (ptrdiff_t)segindex * (ptrdiff_t)segment_increment, count_by_segment, datatype, tree->tree_prev, MCA_COLL_BASE_TAG_REDUCE, MCA_PML_BASE_SEND_SYNCHRONOUS, comm, &sreq[creq]) ); if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } creq = (creq + 1) % max_outstanding_reqs; segindex++; original_count -= count_by_segment; } /* Wait on the remaining request to complete */ ret = ompi_request_wait_all( max_outstanding_reqs, sreq, MPI_STATUSES_IGNORE ); if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } } } return OMPI_SUCCESS; error_hndl: /* error handler */ OPAL_OUTPUT (( ompi_coll_base_framework.framework_output, "ERROR_HNDL: node %d file %s line %d error %d\n", rank, __FILE__, line, ret )); (void)line; // silence compiler warning if( inbuf_free[0] != NULL ) free(inbuf_free[0]); if( inbuf_free[1] != NULL ) free(inbuf_free[1]); if( accumbuf_free != NULL ) free(accumbuf); if( NULL != sreq ) { ompi_coll_base_free_reqs(sreq, max_outstanding_reqs); } return ret; }
/* * alltoall_intra_linear_sync * * Function: Linear implementation of alltoall with limited number * of outstanding requests. * Accepts: Same as MPI_Alltoall(), and the maximum number of * outstanding requests (actual number is 2 * max, since * we count receive and send requests separately). * Returns: MPI_SUCCESS or error code * * Description: Algorithm is the following: * 1) post K irecvs, K <= N * 2) post K isends, K <= N * 3) while not done * - wait for any request to complete * - replace that request by the new one of the same type. */ int ompi_coll_base_alltoall_intra_linear_sync(const void *sbuf, int scount, struct ompi_datatype_t *sdtype, void* rbuf, int rcount, struct ompi_datatype_t *rdtype, struct ompi_communicator_t *comm, mca_coll_base_module_t *module, int max_outstanding_reqs) { int line, error, ri, si, rank, size, nrreqs, nsreqs, total_reqs; int nreqs = 0; char *psnd, *prcv; ptrdiff_t slb, sext, rlb, rext; ompi_request_t **reqs = NULL; if (MPI_IN_PLACE == sbuf) { return mca_coll_base_alltoall_intra_basic_inplace (rbuf, rcount, rdtype, comm, module); } /* Initialize. */ size = ompi_comm_size(comm); rank = ompi_comm_rank(comm); OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "ompi_coll_base_alltoall_intra_linear_sync rank %d", rank)); error = ompi_datatype_get_extent(sdtype, &slb, &sext); if (OMPI_SUCCESS != error) { return error; } sext *= scount; error = ompi_datatype_get_extent(rdtype, &rlb, &rext); if (OMPI_SUCCESS != error) { return error; } rext *= rcount; /* simple optimization */ psnd = ((char *) sbuf) + (ptrdiff_t)rank * sext; prcv = ((char *) rbuf) + (ptrdiff_t)rank * rext; error = ompi_datatype_sndrcv(psnd, scount, sdtype, prcv, rcount, rdtype); if (MPI_SUCCESS != error) { return error; } /* If only one process, we're done. */ if (1 == size) { return MPI_SUCCESS; } /* Initiate send/recv to/from others. */ total_reqs = (((max_outstanding_reqs > (size - 1)) || (max_outstanding_reqs <= 0)) ? (size - 1) : (max_outstanding_reqs)); if (0 < total_reqs) { reqs = coll_base_comm_get_reqs(module->base_data, 2 * total_reqs); if (NULL == reqs) { error = -1; line = __LINE__; goto error_hndl; } } prcv = (char *) rbuf; psnd = (char *) sbuf; /* Post first batch or ireceive and isend requests */ for (nreqs = 0, nrreqs = 0, ri = (rank + 1) % size; nreqs < total_reqs; ri = (ri + 1) % size, ++nrreqs) { nreqs++; error = MCA_PML_CALL(irecv (prcv + (ptrdiff_t)ri * rext, rcount, rdtype, ri, MCA_COLL_BASE_TAG_ALLTOALL, comm, &reqs[nreqs])); if (MPI_SUCCESS != error) { line = __LINE__; goto error_hndl; } } for (nsreqs = 0, si = (rank + size - 1) % size; nreqs < 2 * total_reqs; si = (si + size - 1) % size, ++nsreqs) { nreqs++; error = MCA_PML_CALL(isend (psnd + (ptrdiff_t)si * sext, scount, sdtype, si, MCA_COLL_BASE_TAG_ALLTOALL, MCA_PML_BASE_SEND_STANDARD, comm, &reqs[nreqs])); if (MPI_SUCCESS != error) { line = __LINE__; goto error_hndl; } } /* Wait for requests to complete */ if (nreqs == 2 * (size - 1)) { /* Optimization for the case when all requests have been posted */ error = ompi_request_wait_all(nreqs, reqs, MPI_STATUSES_IGNORE); if (MPI_SUCCESS != error) { line = __LINE__; goto error_hndl; } } else { /* As requests complete, replace them with corresponding requests: - wait for any request to complete, mark the request as MPI_REQUEST_NULL - If it was a receive request, replace it with new irecv request (if any) - if it was a send request, replace it with new isend request (if any) */ int ncreqs = 0; while (ncreqs < 2 * (size - 1)) { int completed; error = ompi_request_wait_any(2 * total_reqs, reqs, &completed, MPI_STATUS_IGNORE); if (MPI_SUCCESS != error) { line = __LINE__; goto error_hndl; } reqs[completed] = MPI_REQUEST_NULL; ncreqs++; if (completed < total_reqs) { if (nrreqs < (size - 1)) { error = MCA_PML_CALL(irecv (prcv + (ptrdiff_t)ri * rext, rcount, rdtype, ri, MCA_COLL_BASE_TAG_ALLTOALL, comm, &reqs[completed])); if (MPI_SUCCESS != error) { line = __LINE__; goto error_hndl; } ++nrreqs; ri = (ri + 1) % size; } } else { if (nsreqs < (size - 1)) { error = MCA_PML_CALL(isend (psnd + (ptrdiff_t)si * sext, scount, sdtype, si, MCA_COLL_BASE_TAG_ALLTOALL, MCA_PML_BASE_SEND_STANDARD, comm, &reqs[completed])); if (MPI_SUCCESS != error) { line = __LINE__; goto error_hndl; } ++nsreqs; si = (si + size - 1) % size; } } } } /* All done */ return MPI_SUCCESS; error_hndl: OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "%s:%4d\tError occurred %d, rank %2d", __FILE__, line, error, rank)); (void)line; // silence compiler warning ompi_coll_base_free_reqs(reqs, nreqs); return error; }
/** * Linear functions are copied from the basic coll module. For * some small number of nodes and/or small data sizes they are just as * fast as base/tree based segmenting operations and as such may be * selected by the decision functions. These are copied into this module * due to the way we select modules in V1. i.e. in V2 we will handle this * differently and so will not have to duplicate code. */ int ompi_coll_base_alltoallv_intra_basic_linear(const void *sbuf, const int *scounts, const int *sdisps, struct ompi_datatype_t *sdtype, void *rbuf, const int *rcounts, const int *rdisps, struct ompi_datatype_t *rdtype, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { int i, size, rank, err, nreqs; char *psnd, *prcv; ptrdiff_t sext, rext; ompi_request_t **preq, **reqs; mca_coll_base_module_t *base_module = (mca_coll_base_module_t*) module; mca_coll_base_comm_t *data = base_module->base_data; if (MPI_IN_PLACE == sbuf) { return mca_coll_base_alltoallv_intra_basic_inplace (rbuf, rcounts, rdisps, rdtype, comm, module); } size = ompi_comm_size(comm); rank = ompi_comm_rank(comm); OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "coll:base:alltoallv_intra_basic_linear rank %d", rank)); ompi_datatype_type_extent(sdtype, &sext); ompi_datatype_type_extent(rdtype, &rext); /* Simple optimization - handle send to self first */ psnd = ((char *) sbuf) + (ptrdiff_t)sdisps[rank] * sext; prcv = ((char *) rbuf) + (ptrdiff_t)rdisps[rank] * rext; if (0 != scounts[rank]) { err = ompi_datatype_sndrcv(psnd, scounts[rank], sdtype, prcv, rcounts[rank], rdtype); if (MPI_SUCCESS != err) { return err; } } /* If only one process, we're done. */ if (1 == size) { return MPI_SUCCESS; } /* Now, initiate all send/recv to/from others. */ nreqs = 0; reqs = preq = coll_base_comm_get_reqs(data, 2 * size); if( NULL == reqs ) { err = OMPI_ERR_OUT_OF_RESOURCE; goto err_hndl; } /* Post all receives first */ for (i = 0; i < size; ++i) { if (i == rank || 0 == rcounts[i]) { continue; } ++nreqs; prcv = ((char *) rbuf) + (ptrdiff_t)rdisps[i] * rext; err = MCA_PML_CALL(irecv_init(prcv, rcounts[i], rdtype, i, MCA_COLL_BASE_TAG_ALLTOALLV, comm, preq++)); if (MPI_SUCCESS != err) { goto err_hndl; } } /* Now post all sends */ for (i = 0; i < size; ++i) { if (i == rank || 0 == scounts[i]) { continue; } ++nreqs; psnd = ((char *) sbuf) + (ptrdiff_t)sdisps[i] * sext; err = MCA_PML_CALL(isend_init(psnd, scounts[i], sdtype, i, MCA_COLL_BASE_TAG_ALLTOALLV, MCA_PML_BASE_SEND_STANDARD, comm, preq++)); if (MPI_SUCCESS != err) { goto err_hndl; } } /* Start your engines. This will never return an error. */ MCA_PML_CALL(start(nreqs, reqs)); /* Wait for them all. If there's an error, note that we don't care * what the error was -- just that there *was* an error. The PML * will finish all requests, even if one or more of them fail. * i.e., by the end of this call, all the requests are free-able. * So free them anyway -- even if there was an error, and return the * error after we free everything. */ err = ompi_request_wait_all(nreqs, reqs, MPI_STATUSES_IGNORE); err_hndl: /* Free the requests in all cases as they are persistent */ ompi_coll_base_free_reqs(reqs, nreqs); return err; }
int mca_coll_base_alltoallv_intra_basic_inplace(const void *rbuf, const int *rcounts, const int *rdisps, struct ompi_datatype_t *rdtype, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { mca_coll_base_module_t *base_module = (mca_coll_base_module_t*) module; int i, j, size, rank, err=MPI_SUCCESS; ompi_request_t **preq, **reqs; char *allocated_buffer, *tmp_buffer; size_t max_size, rdtype_size; OPAL_PTRDIFF_TYPE ext, gap = 0; /* Initialize. */ size = ompi_comm_size(comm); rank = ompi_comm_rank(comm); ompi_datatype_type_size(rdtype, &rdtype_size); /* If only one process, we're done. */ if (1 == size || 0 == rdtype_size) { return MPI_SUCCESS; } /* Find the largest receive amount */ ompi_datatype_type_extent (rdtype, &ext); for (i = 0, max_size = 0 ; i < size ; ++i) { size_t size = opal_datatype_span(&rdtype->super, rcounts[i], &gap); max_size = size > max_size ? size : max_size; } /* The gap will always be the same as we are working on the same datatype */ /* Allocate a temporary buffer */ allocated_buffer = calloc (max_size, 1); if (NULL == allocated_buffer) { return OMPI_ERR_OUT_OF_RESOURCE; } tmp_buffer = allocated_buffer - gap; /* Initiate all send/recv to/from others. */ reqs = preq = coll_base_comm_get_reqs(base_module->base_data, 2); if( NULL == reqs ) { err = OMPI_ERR_OUT_OF_RESOURCE; goto error_hndl; } /* in-place alltoallv slow algorithm (but works) */ for (i = 0 ; i < size ; ++i) { for (j = i+1 ; j < size ; ++j) { preq = reqs; if (i == rank && rcounts[j]) { /* Copy the data into the temporary buffer */ err = ompi_datatype_copy_content_same_ddt (rdtype, rcounts[j], tmp_buffer, (char *) rbuf + rdisps[j] * ext); if (MPI_SUCCESS != err) { goto error_hndl; } /* Exchange data with the peer */ err = MCA_PML_CALL(irecv ((char *) rbuf + rdisps[j] * ext, rcounts[j], rdtype, j, MCA_COLL_BASE_TAG_ALLTOALLV, comm, preq++)); if (MPI_SUCCESS != err) { goto error_hndl; } err = MCA_PML_CALL(isend ((void *) tmp_buffer, rcounts[j], rdtype, j, MCA_COLL_BASE_TAG_ALLTOALLV, MCA_PML_BASE_SEND_STANDARD, comm, preq++)); if (MPI_SUCCESS != err) { goto error_hndl; } } else if (j == rank && rcounts[i]) { /* Copy the data into the temporary buffer */ err = ompi_datatype_copy_content_same_ddt (rdtype, rcounts[i], tmp_buffer, (char *) rbuf + rdisps[i] * ext); if (MPI_SUCCESS != err) { goto error_hndl; } /* Exchange data with the peer */ err = MCA_PML_CALL(irecv ((char *) rbuf + rdisps[i] * ext, rcounts[i], rdtype, i, MCA_COLL_BASE_TAG_ALLTOALLV, comm, preq++)); if (MPI_SUCCESS != err) { goto error_hndl; } err = MCA_PML_CALL(isend ((void *) tmp_buffer, rcounts[i], rdtype, i, MCA_COLL_BASE_TAG_ALLTOALLV, MCA_PML_BASE_SEND_STANDARD, comm, preq++)); if (MPI_SUCCESS != err) { goto error_hndl; } } else { continue; } /* Wait for the requests to complete */ err = ompi_request_wait_all (2, reqs, MPI_STATUSES_IGNORE); if (MPI_SUCCESS != err) { goto error_hndl; } } } error_hndl: /* Free the temporary buffer */ free (allocated_buffer); if( MPI_SUCCESS != err ) { ompi_coll_base_free_reqs(reqs, 2 ); } /* All done */ return err; }
/* MPI_IN_PLACE all to all algorithm. TODO: implement a better one. */ int mca_coll_base_alltoall_intra_basic_inplace(const void *rbuf, int rcount, struct ompi_datatype_t *rdtype, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { mca_coll_base_module_t *base_module = (mca_coll_base_module_t*) module; int i, j, size, rank, err = MPI_SUCCESS, line; MPI_Request *preq; char *tmp_buffer; size_t max_size; ptrdiff_t ext; /* Initialize. */ size = ompi_comm_size(comm); rank = ompi_comm_rank(comm); /* If only one process, we're done. */ if (1 == size) { return MPI_SUCCESS; } /* Find the largest receive amount */ ompi_datatype_type_extent (rdtype, &ext); max_size = ext * rcount; /* Allocate a temporary buffer */ tmp_buffer = calloc (max_size, 1); if (NULL == tmp_buffer) { return OMPI_ERR_OUT_OF_RESOURCE; } /* in-place alltoall slow algorithm (but works) */ for (i = 0 ; i < size ; ++i) { for (j = i+1 ; j < size ; ++j) { /* Initiate all send/recv to/from others. */ preq = coll_base_comm_get_reqs(base_module->base_data, size * 2); if (i == rank) { /* Copy the data into the temporary buffer */ err = ompi_datatype_copy_content_same_ddt (rdtype, rcount, tmp_buffer, (char *) rbuf + j * max_size); if (MPI_SUCCESS != err) { line = __LINE__; goto error_hndl; } /* Exchange data with the peer */ err = MCA_PML_CALL(irecv ((char *) rbuf + max_size * j, rcount, rdtype, j, MCA_COLL_BASE_TAG_ALLTOALL, comm, preq++)); if (MPI_SUCCESS != err) { line = __LINE__; goto error_hndl; } err = MCA_PML_CALL(isend ((char *) tmp_buffer, rcount, rdtype, j, MCA_COLL_BASE_TAG_ALLTOALL, MCA_PML_BASE_SEND_STANDARD, comm, preq++)); if (MPI_SUCCESS != err) { line = __LINE__; goto error_hndl; } } else if (j == rank) { /* Copy the data into the temporary buffer */ err = ompi_datatype_copy_content_same_ddt (rdtype, rcount, tmp_buffer, (char *) rbuf + i * max_size); if (MPI_SUCCESS != err) { line = __LINE__; goto error_hndl; } /* Exchange data with the peer */ err = MCA_PML_CALL(irecv ((char *) rbuf + max_size * i, rcount, rdtype, i, MCA_COLL_BASE_TAG_ALLTOALL, comm, preq++)); if (MPI_SUCCESS != err) { line = __LINE__; goto error_hndl; } err = MCA_PML_CALL(isend ((char *) tmp_buffer, rcount, rdtype, i, MCA_COLL_BASE_TAG_ALLTOALL, MCA_PML_BASE_SEND_STANDARD, comm, preq++)); if (MPI_SUCCESS != err) { line = __LINE__; goto error_hndl; } } else { continue; } /* Wait for the requests to complete */ err = ompi_request_wait_all (2, base_module->base_data->mcct_reqs, MPI_STATUSES_IGNORE); if (MPI_SUCCESS != err) { line = __LINE__; goto error_hndl; } } } error_hndl: /* Free the temporary buffer */ free (tmp_buffer); if( MPI_SUCCESS != err ) { OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "%s:%4d\tError occurred %d, rank %2d", __FILE__, line, err, rank)); ompi_coll_base_free_reqs(base_module->base_data->mcct_reqs, 2); } /* All done */ return err; }
static int mca_coll_basic_neighbor_alltoallv_cart(const void *sbuf, const int scounts[], const int sdisps[], struct ompi_datatype_t *sdtype, void *rbuf, const int rcounts[], const int rdisps[], struct ompi_datatype_t *rdtype, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { const mca_topo_base_comm_cart_2_2_0_t *cart = comm->c_topo->mtc.cart; const int rank = ompi_comm_rank (comm); int rc = MPI_SUCCESS, dim, i, nreqs; ptrdiff_t lb, rdextent, sdextent; ompi_request_t **reqs, **preqs; if( 0 == cart->ndims ) return OMPI_SUCCESS; ompi_datatype_get_extent(rdtype, &lb, &rdextent); ompi_datatype_get_extent(sdtype, &lb, &sdextent); reqs = preqs = ompi_coll_base_comm_get_reqs( module->base_data, 4 * cart->ndims ); if( NULL == reqs ) { return OMPI_ERR_OUT_OF_RESOURCE; } /* post receives first */ for (dim = 0, nreqs = 0, i = 0; dim < cart->ndims ; ++dim, i += 2) { int srank = MPI_PROC_NULL, drank = MPI_PROC_NULL; if (cart->dims[dim] > 1) { mca_topo_base_cart_shift (comm, dim, 1, &srank, &drank); } else if (1 == cart->dims[dim] && cart->periods[dim]) { srank = drank = rank; } if (MPI_PROC_NULL != srank) { nreqs++; rc = MCA_PML_CALL(irecv((char *) rbuf + rdisps[i] * rdextent, rcounts[i], rdtype, srank, MCA_COLL_BASE_TAG_ALLTOALL, comm, preqs++)); if (OMPI_SUCCESS != rc) break; } if (MPI_PROC_NULL != drank) { nreqs++; rc = MCA_PML_CALL(irecv((char *) rbuf + rdisps[i+1] * rdextent, rcounts[i+1], rdtype, drank, MCA_COLL_BASE_TAG_ALLTOALL, comm, preqs++)); if (OMPI_SUCCESS != rc) break; } } if (OMPI_SUCCESS != rc) { ompi_coll_base_free_reqs( reqs, nreqs ); return rc; } for (dim = 0, i = 0 ; dim < cart->ndims ; ++dim, i += 2) { int srank = MPI_PROC_NULL, drank = MPI_PROC_NULL; if (cart->dims[dim] > 1) { mca_topo_base_cart_shift (comm, dim, 1, &srank, &drank); } else if (1 == cart->dims[dim] && cart->periods[dim]) { srank = drank = rank; } if (MPI_PROC_NULL != srank) { nreqs++; /* remove cast from const when the pml layer is updated to take a const for the send buffer */ rc = MCA_PML_CALL(isend((char *) sbuf + sdisps[i] * sdextent, scounts[i], sdtype, srank, MCA_COLL_BASE_TAG_ALLTOALL, MCA_PML_BASE_SEND_STANDARD, comm, preqs++)); if (OMPI_SUCCESS != rc) break; } if (MPI_PROC_NULL != drank) { nreqs++; rc = MCA_PML_CALL(isend((char *) sbuf + sdisps[i+1] * sdextent, scounts[i+1], sdtype, drank, MCA_COLL_BASE_TAG_ALLTOALL, MCA_PML_BASE_SEND_STANDARD, comm, preqs++)); if (OMPI_SUCCESS != rc) break; } } if (OMPI_SUCCESS != rc) { ompi_coll_base_free_reqs( reqs, nreqs ); return rc; } rc = ompi_request_wait_all (nreqs, reqs, MPI_STATUSES_IGNORE); if (OMPI_SUCCESS != rc) { ompi_coll_base_free_reqs( reqs, nreqs ); } return rc; }
int ompi_coll_base_alltoall_intra_basic_linear(const void *sbuf, int scount, struct ompi_datatype_t *sdtype, void* rbuf, int rcount, struct ompi_datatype_t *rdtype, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { int i, rank, size, err, line; int nreqs = 0; char *psnd, *prcv; MPI_Aint lb, sndinc, rcvinc; ompi_request_t **req, **sreq, **rreq; mca_coll_base_module_t *base_module = (mca_coll_base_module_t*) module; mca_coll_base_comm_t *data = base_module->base_data; if (MPI_IN_PLACE == sbuf) { return mca_coll_base_alltoall_intra_basic_inplace (rbuf, rcount, rdtype, comm, module); } /* Initialize. */ size = ompi_comm_size(comm); rank = ompi_comm_rank(comm); OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "ompi_coll_base_alltoall_intra_basic_linear rank %d", rank)); err = ompi_datatype_get_extent(sdtype, &lb, &sndinc); if (OMPI_SUCCESS != err) { return err; } sndinc *= scount; err = ompi_datatype_get_extent(rdtype, &lb, &rcvinc); if (OMPI_SUCCESS != err) { return err; } rcvinc *= rcount; /* simple optimization */ psnd = ((char *) sbuf) + (ptrdiff_t)rank * sndinc; prcv = ((char *) rbuf) + (ptrdiff_t)rank * rcvinc; err = ompi_datatype_sndrcv(psnd, scount, sdtype, prcv, rcount, rdtype); if (MPI_SUCCESS != err) { return err; } /* If only one process, we're done. */ if (1 == size) { return MPI_SUCCESS; } /* Initiate all send/recv to/from others. */ req = rreq = coll_base_comm_get_reqs(data, (size - 1) * 2); if (NULL == req) { err = OMPI_ERR_OUT_OF_RESOURCE; line = __LINE__; goto err_hndl; } prcv = (char *) rbuf; psnd = (char *) sbuf; /* Post all receives first -- a simple optimization */ for (nreqs = 0, i = (rank + 1) % size; i != rank; i = (i + 1) % size, ++rreq) { nreqs++; err = MCA_PML_CALL(irecv_init (prcv + (ptrdiff_t)i * rcvinc, rcount, rdtype, i, MCA_COLL_BASE_TAG_ALLTOALL, comm, rreq)); if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } } /* Now post all sends in reverse order - We would like to minimize the search time through message queue when messages actually arrive in the order in which they were posted. */ sreq = rreq; for (i = (rank + size - 1) % size; i != rank; i = (i + size - 1) % size, ++sreq) { nreqs++; err = MCA_PML_CALL(isend_init (psnd + (ptrdiff_t)i * sndinc, scount, sdtype, i, MCA_COLL_BASE_TAG_ALLTOALL, MCA_PML_BASE_SEND_STANDARD, comm, sreq)); if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } } /* Start your engines. This will never return an error. */ MCA_PML_CALL(start(nreqs, req)); /* Wait for them all. If there's an error, note that we don't * care what the error was -- just that there *was* an error. The * PML will finish all requests, even if one or more of them fail. * i.e., by the end of this call, all the requests are free-able. * So free them anyway -- even if there was an error, and return * the error after we free everything. */ err = ompi_request_wait_all(nreqs, req, MPI_STATUSES_IGNORE); if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } err_hndl: if( MPI_SUCCESS != err ) { OPAL_OUTPUT( (ompi_coll_base_framework.framework_output,"%s:%4d\tError occurred %d, rank %2d", __FILE__, line, err, rank) ); (void)line; // silence compiler warning } /* Free the reqs in all cases as they are persistent requests */ ompi_coll_base_free_reqs(req, nreqs); /* All done */ return err; }
/* * alltoallw_inter * * Function: - MPI_Alltoallw * Accepts: - same as MPI_Alltoallw() * Returns: - MPI_SUCCESS or an MPI error code */ int mca_coll_basic_alltoallw_inter(const void *sbuf, const int *scounts, const int *sdisps, struct ompi_datatype_t * const *sdtypes, void *rbuf, const int *rcounts, const int *rdisps, struct ompi_datatype_t * const *rdtypes, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { int i; int size; int err; char *psnd; char *prcv; int nreqs; MPI_Request *preq, *reqs; /* Initialize. */ size = ompi_comm_remote_size(comm); /* Initiate all send/recv to/from others. */ nreqs = 0; reqs = preq = coll_base_comm_get_reqs(module->base_data, 2 * size); /* Post all receives first -- a simple optimization */ for (i = 0; i < size; ++i) { size_t msg_size; ompi_datatype_type_size(rdtypes[i], &msg_size); msg_size *= rcounts[i]; if (0 == msg_size) continue; prcv = ((char *) rbuf) + rdisps[i]; err = MCA_PML_CALL(irecv_init(prcv, rcounts[i], rdtypes[i], i, MCA_COLL_BASE_TAG_ALLTOALLW, comm, preq++)); ++nreqs; if (OMPI_SUCCESS != err) { ompi_coll_base_free_reqs(reqs, nreqs); return err; } } /* Now post all sends */ for (i = 0; i < size; ++i) { size_t msg_size; ompi_datatype_type_size(sdtypes[i], &msg_size); msg_size *= scounts[i]; if (0 == msg_size) continue; psnd = ((char *) sbuf) + sdisps[i]; err = MCA_PML_CALL(isend_init(psnd, scounts[i], sdtypes[i], i, MCA_COLL_BASE_TAG_ALLTOALLW, MCA_PML_BASE_SEND_STANDARD, comm, preq++)); ++nreqs; if (OMPI_SUCCESS != err) { ompi_coll_base_free_reqs(reqs, nreqs); return err; } } /* Start your engines. This will never return an error. */ MCA_PML_CALL(start(nreqs, reqs)); /* Wait for them all. If there's an error, note that we don't care * what the error was -- just that there *was* an error. The PML * will finish all requests, even if one or more of them fail. * i.e., by the end of this call, all the requests are free-able. * So free them anyway -- even if there was an error, and return the * error after we free everything. */ err = ompi_request_wait_all(nreqs, reqs, MPI_STATUSES_IGNORE); /* Free the requests in all cases as they are persistent */ ompi_coll_base_free_reqs(reqs, nreqs); /* All done */ return err; }
static int mca_coll_basic_alltoallw_intra_inplace(const void *rbuf, const int *rcounts, const int *rdisps, struct ompi_datatype_t * const *rdtypes, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { int i, j, size, rank, err=MPI_SUCCESS, max_size; MPI_Request *preq, *reqs = NULL; char *tmp_buffer; ptrdiff_t ext; /* Initialize. */ size = ompi_comm_size(comm); rank = ompi_comm_rank(comm); /* If only one process, we're done. */ if (1 == size) { return MPI_SUCCESS; } /* Find the largest receive amount */ for (i = 0, max_size = 0 ; i < size ; ++i) { ompi_datatype_type_extent (rdtypes[i], &ext); ext *= rcounts[i]; max_size = ext > max_size ? ext : max_size; } /* Allocate a temporary buffer */ tmp_buffer = calloc (max_size, 1); if (NULL == tmp_buffer) { return OMPI_ERR_OUT_OF_RESOURCE; } reqs = coll_base_comm_get_reqs( module->base_data, 2); /* in-place alltoallw slow algorithm (but works) */ for (i = 0 ; i < size ; ++i) { size_t msg_size_i; ompi_datatype_type_size(rdtypes[i], &msg_size_i); msg_size_i *= rcounts[i]; for (j = i+1 ; j < size ; ++j) { size_t msg_size_j; ompi_datatype_type_size(rdtypes[j], &msg_size_j); msg_size_j *= rcounts[j]; /* Initiate all send/recv to/from others. */ preq = reqs; if (i == rank && msg_size_j != 0) { /* Copy the data into the temporary buffer */ err = ompi_datatype_copy_content_same_ddt (rdtypes[j], rcounts[j], tmp_buffer, (char *) rbuf + rdisps[j]); if (MPI_SUCCESS != err) { goto error_hndl; } /* Exchange data with the peer */ err = MCA_PML_CALL(irecv ((char *) rbuf + rdisps[j], rcounts[j], rdtypes[j], j, MCA_COLL_BASE_TAG_ALLTOALLW, comm, preq++)); if (MPI_SUCCESS != err) { goto error_hndl; } err = MCA_PML_CALL(isend ((void *) tmp_buffer, rcounts[j], rdtypes[j], j, MCA_COLL_BASE_TAG_ALLTOALLW, MCA_PML_BASE_SEND_STANDARD, comm, preq++)); if (MPI_SUCCESS != err) { goto error_hndl; } } else if (j == rank && msg_size_i != 0) { /* Copy the data into the temporary buffer */ err = ompi_datatype_copy_content_same_ddt (rdtypes[i], rcounts[i], tmp_buffer, (char *) rbuf + rdisps[i]); if (MPI_SUCCESS != err) { goto error_hndl; } /* Exchange data with the peer */ err = MCA_PML_CALL(irecv ((char *) rbuf + rdisps[i], rcounts[i], rdtypes[i], i, MCA_COLL_BASE_TAG_ALLTOALLW, comm, preq++)); if (MPI_SUCCESS != err) { goto error_hndl; } err = MCA_PML_CALL(isend ((void *) tmp_buffer, rcounts[i], rdtypes[i], i, MCA_COLL_BASE_TAG_ALLTOALLW, MCA_PML_BASE_SEND_STANDARD, comm, preq++)); if (MPI_SUCCESS != err) { goto error_hndl; } } else { continue; } /* Wait for the requests to complete */ err = ompi_request_wait_all (2, reqs, MPI_STATUSES_IGNORE); if (MPI_SUCCESS != err) { goto error_hndl; } } } error_hndl: /* Free the temporary buffer */ free (tmp_buffer); if( MPI_SUCCESS != err ) { /* Free the requests. */ if( NULL != reqs ) { ompi_coll_base_free_reqs(reqs, 2); } } /* All done */ return err; }
/* * allgather_inter * * Function: - allgather using other MPI collections * Accepts: - same as MPI_Allgather() * Returns: - MPI_SUCCESS or error code */ int mca_coll_basic_allgather_inter(const void *sbuf, int scount, struct ompi_datatype_t *sdtype, void *rbuf, int rcount, struct ompi_datatype_t *rdtype, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { int rank, root = 0, size, rsize, err, i, line; char *tmpbuf_free = NULL, *tmpbuf, *ptmp; ptrdiff_t rlb, rextent, incr; ptrdiff_t gap, span; ompi_request_t *req; ompi_request_t **reqs = NULL; rank = ompi_comm_rank(comm); size = ompi_comm_size(comm); rsize = ompi_comm_remote_size(comm); /* Algorithm: * - a gather to the root in remote group (simultaniously executed, * thats why we cannot use coll_gather). * - exchange the temp-results between two roots * - inter-bcast (again simultanious). */ /* Step one: gather operations: */ if (rank != root) { /* send your data to root */ err = MCA_PML_CALL(send(sbuf, scount, sdtype, root, MCA_COLL_BASE_TAG_ALLGATHER, MCA_PML_BASE_SEND_STANDARD, comm)); if (OMPI_SUCCESS != err) { line = __LINE__; goto exit; } } else { /* receive a msg. from all other procs. */ err = ompi_datatype_get_extent(rdtype, &rlb, &rextent); if (OMPI_SUCCESS != err) { line = __LINE__; goto exit; } /* Get a requests arrays of the right size */ reqs = ompi_coll_base_comm_get_reqs(module->base_data, rsize + 1); if( NULL == reqs ) { line = __LINE__; err = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } /* Do a send-recv between the two root procs. to avoid deadlock */ err = MCA_PML_CALL(isend(sbuf, scount, sdtype, 0, MCA_COLL_BASE_TAG_ALLGATHER, MCA_PML_BASE_SEND_STANDARD, comm, &reqs[rsize])); if (OMPI_SUCCESS != err) { line = __LINE__; goto exit; } err = MCA_PML_CALL(irecv(rbuf, rcount, rdtype, 0, MCA_COLL_BASE_TAG_ALLGATHER, comm, &reqs[0])); if (OMPI_SUCCESS != err) { line = __LINE__; goto exit; } incr = rextent * rcount; ptmp = (char *) rbuf + incr; for (i = 1; i < rsize; ++i, ptmp += incr) { err = MCA_PML_CALL(irecv(ptmp, rcount, rdtype, i, MCA_COLL_BASE_TAG_ALLGATHER, comm, &reqs[i])); if (MPI_SUCCESS != err) { line = __LINE__; goto exit; } } err = ompi_request_wait_all(rsize + 1, reqs, MPI_STATUSES_IGNORE); if (OMPI_SUCCESS != err) { line = __LINE__; goto exit; } /* Step 2: exchange the resuts between the root processes */ span = opal_datatype_span(&sdtype->super, (int64_t)scount * (int64_t)size, &gap); tmpbuf_free = (char *) malloc(span); if (NULL == tmpbuf_free) { line = __LINE__; err = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } tmpbuf = tmpbuf_free - gap; err = MCA_PML_CALL(isend(rbuf, rsize * rcount, rdtype, 0, MCA_COLL_BASE_TAG_ALLGATHER, MCA_PML_BASE_SEND_STANDARD, comm, &req)); if (OMPI_SUCCESS != err) { line = __LINE__; goto exit; } err = MCA_PML_CALL(recv(tmpbuf, size * scount, sdtype, 0, MCA_COLL_BASE_TAG_ALLGATHER, comm, MPI_STATUS_IGNORE)); if (OMPI_SUCCESS != err) { line = __LINE__; goto exit; } err = ompi_request_wait( &req, MPI_STATUS_IGNORE); if (OMPI_SUCCESS != err) { line = __LINE__; goto exit; } } /* Step 3: bcast the data to the remote group. This * happens in both groups simultaneously, thus we can * not use coll_bcast (this would deadlock). */ if (rank != root) { /* post the recv */ err = MCA_PML_CALL(recv(rbuf, rsize * rcount, rdtype, 0, MCA_COLL_BASE_TAG_ALLGATHER, comm, MPI_STATUS_IGNORE)); if (OMPI_SUCCESS != err) { line = __LINE__; goto exit; } } else { /* Send the data to every other process in the remote group * except to rank zero. which has it already. */ for (i = 1; i < rsize; i++) { err = MCA_PML_CALL(isend(tmpbuf, size * scount, sdtype, i, MCA_COLL_BASE_TAG_ALLGATHER, MCA_PML_BASE_SEND_STANDARD, comm, &reqs[i - 1])); if (OMPI_SUCCESS != err) { line = __LINE__; goto exit; } } err = ompi_request_wait_all(rsize - 1, reqs, MPI_STATUSES_IGNORE); if (OMPI_SUCCESS != err) { line = __LINE__; goto exit; } } exit: if( MPI_SUCCESS != err ) { OPAL_OUTPUT( (ompi_coll_base_framework.framework_output,"%s:%4d\tError occurred %d, rank %2d", __FILE__, line, err, rank) ); (void)line; // silence compiler warning if( NULL != reqs ) ompi_coll_base_free_reqs(reqs, rsize+1); } if (NULL != tmpbuf_free) { free(tmpbuf_free); } return err; }
/* * alltoall_inter * * Function: - MPI_Alltoall * Accepts: - same as MPI_Alltoall() * Returns: - MPI_SUCCESS or an MPI error code */ int mca_coll_basic_alltoall_inter(const void *sbuf, int scount, struct ompi_datatype_t *sdtype, void *rbuf, int rcount, struct ompi_datatype_t *rdtype, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { int i; int size; int err; int nreqs; char *psnd; char *prcv; MPI_Aint lb; MPI_Aint sndinc; MPI_Aint rcvinc; ompi_request_t **req, **sreq, **rreq; /* Initialize. */ size = ompi_comm_remote_size(comm); err = ompi_datatype_get_extent(sdtype, &lb, &sndinc); if (OMPI_SUCCESS != err) { return err; } sndinc *= scount; err = ompi_datatype_get_extent(rdtype, &lb, &rcvinc); if (OMPI_SUCCESS != err) { return err; } rcvinc *= rcount; /* Initiate all send/recv to/from others. */ nreqs = size * 2; req = rreq = coll_base_comm_get_reqs( module->base_data, nreqs); if( NULL == req ) { return OMPI_ERR_OUT_OF_RESOURCE; } sreq = rreq + size; prcv = (char *) rbuf; psnd = (char *) sbuf; /* Post all receives first */ for (i = 0; i < size; i++, ++rreq) { err = MCA_PML_CALL(irecv(prcv + (i * rcvinc), rcount, rdtype, i, MCA_COLL_BASE_TAG_ALLTOALL, comm, rreq)); if (OMPI_SUCCESS != err) { ompi_coll_base_free_reqs(req, i + 1); return err; } } /* Now post all sends */ for (i = 0; i < size; i++, ++sreq) { err = MCA_PML_CALL(isend(psnd + (i * sndinc), scount, sdtype, i, MCA_COLL_BASE_TAG_ALLTOALL, MCA_PML_BASE_SEND_STANDARD, comm, sreq)); if (OMPI_SUCCESS != err) { ompi_coll_base_free_reqs(req, i + size + 1); return err; } } /* Wait for them all. If there's an error, note that we don't * care what the error was -- just that there *was* an error. The * PML will finish all requests, even if one or more of them fail. * i.e., by the end of this call, all the requests are free-able. * So free them anyway -- even if there was an error, and return * the error after we free everything. */ err = ompi_request_wait_all(nreqs, req, MPI_STATUSES_IGNORE); if (OMPI_SUCCESS != err) { ompi_coll_base_free_reqs(req, nreqs); } /* All done */ return err; }
static int mca_coll_basic_neighbor_allgather_cart(const void *sbuf, int scount, struct ompi_datatype_t *sdtype, void *rbuf, int rcount, struct ompi_datatype_t *rdtype, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { const mca_topo_base_comm_cart_2_2_0_t *cart = comm->c_topo->mtc.cart; const int rank = ompi_comm_rank (comm); ompi_request_t **reqs, **preqs; ptrdiff_t lb, extent; int rc = MPI_SUCCESS, dim, nreqs; if( 0 == cart->ndims ) return OMPI_SUCCESS; ompi_datatype_get_extent(rdtype, &lb, &extent); reqs = preqs = coll_base_comm_get_reqs( module->base_data, 4 * cart->ndims ); if( NULL == reqs ) { return OMPI_ERR_OUT_OF_RESOURCE; } /* The ordering is defined as -1 then +1 in each dimension in * order of dimension. */ for (dim = 0, nreqs = 0 ; dim < cart->ndims ; ++dim) { int srank = MPI_PROC_NULL, drank = MPI_PROC_NULL; if (cart->dims[dim] > 1) { mca_topo_base_cart_shift (comm, dim, 1, &srank, &drank); } else if (1 == cart->dims[dim] && cart->periods[dim]) { srank = drank = rank; } if (MPI_PROC_NULL != srank) { nreqs++; rc = MCA_PML_CALL(irecv(rbuf, rcount, rdtype, srank, MCA_COLL_BASE_TAG_ALLGATHER, comm, preqs++)); if (OMPI_SUCCESS != rc) break; nreqs++; /* remove cast from const when the pml layer is updated to take * a const for the send buffer. */ rc = MCA_PML_CALL(isend((void *) sbuf, scount, sdtype, srank, MCA_COLL_BASE_TAG_ALLGATHER, MCA_PML_BASE_SEND_STANDARD, comm, preqs++)); if (OMPI_SUCCESS != rc) break; } rbuf = (char *) rbuf + extent * rcount; if (MPI_PROC_NULL != drank) { nreqs++; rc = MCA_PML_CALL(irecv(rbuf, rcount, rdtype, drank, MCA_COLL_BASE_TAG_ALLGATHER, comm, preqs++)); if (OMPI_SUCCESS != rc) break; nreqs++; rc = MCA_PML_CALL(isend((void *) sbuf, scount, sdtype, drank, MCA_COLL_BASE_TAG_ALLGATHER, MCA_PML_BASE_SEND_STANDARD, comm, preqs++)); if (OMPI_SUCCESS != rc) break; } rbuf = (char *) rbuf + extent * rcount; } if (OMPI_SUCCESS != rc) { ompi_coll_base_free_reqs(reqs, nreqs); return rc; } rc = ompi_request_wait_all (nreqs, reqs, MPI_STATUSES_IGNORE); if (OMPI_SUCCESS != rc) { ompi_coll_base_free_reqs(reqs, nreqs); } return rc; }
/* * gather_intra_linear_sync * * Function: - synchronized gather operation with * Accepts: - same arguments as MPI_Gather(), first segment size * Returns: - MPI_SUCCESS or error code */ int ompi_coll_base_gather_intra_linear_sync(const void *sbuf, int scount, struct ompi_datatype_t *sdtype, void *rbuf, int rcount, struct ompi_datatype_t *rdtype, int root, struct ompi_communicator_t *comm, mca_coll_base_module_t *module, int first_segment_size) { int i, ret, line, rank, size, first_segment_count; ompi_request_t **reqs = NULL; MPI_Aint extent, lb; size_t typelng; size = ompi_comm_size(comm); rank = ompi_comm_rank(comm); OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "ompi_coll_base_gather_intra_linear_sync rank %d, segment %d", rank, first_segment_size)); if (rank != root) { /* Non-root processes: - receive zero byte message from the root, - send the first segment of the data synchronously, - send the second segment of the data. */ ompi_datatype_type_size(sdtype, &typelng); ompi_datatype_get_extent(sdtype, &lb, &extent); first_segment_count = scount; COLL_BASE_COMPUTED_SEGCOUNT( (size_t) first_segment_size, typelng, first_segment_count ); ret = MCA_PML_CALL(recv(rbuf, 0, MPI_BYTE, root, MCA_COLL_BASE_TAG_GATHER, comm, MPI_STATUS_IGNORE)); if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } ret = MCA_PML_CALL(send(sbuf, first_segment_count, sdtype, root, MCA_COLL_BASE_TAG_GATHER, MCA_PML_BASE_SEND_STANDARD, comm)); if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } ret = MCA_PML_CALL(send((char*)sbuf + extent * first_segment_count, (scount - first_segment_count), sdtype, root, MCA_COLL_BASE_TAG_GATHER, MCA_PML_BASE_SEND_STANDARD, comm)); if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } } else { /* Root process, - For every non-root node: - post irecv for the first segment of the message - send zero byte message to signal node to send the message - post irecv for the second segment of the message - wait for the first segment to complete - Copy local data if necessary - Waitall for all the second segments to complete. */ char *ptmp; ompi_request_t *first_segment_req; reqs = coll_base_comm_get_reqs(module->base_data, size); if (NULL == reqs) { ret = -1; line = __LINE__; goto error_hndl; } ompi_datatype_type_size(rdtype, &typelng); ompi_datatype_get_extent(rdtype, &lb, &extent); first_segment_count = rcount; COLL_BASE_COMPUTED_SEGCOUNT( (size_t)first_segment_size, typelng, first_segment_count ); ptmp = (char *) rbuf; for (i = 0; i < size; ++i) { if (i == rank) { /* skip myself */ reqs[i] = MPI_REQUEST_NULL; continue; } /* irecv for the first segment from i */ ptmp = (char*)rbuf + (ptrdiff_t)i * (ptrdiff_t)rcount * extent; ret = MCA_PML_CALL(irecv(ptmp, first_segment_count, rdtype, i, MCA_COLL_BASE_TAG_GATHER, comm, &first_segment_req)); if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } /* send sync message */ ret = MCA_PML_CALL(send(rbuf, 0, MPI_BYTE, i, MCA_COLL_BASE_TAG_GATHER, MCA_PML_BASE_SEND_STANDARD, comm)); if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } /* irecv for the second segment */ ptmp = (char*)rbuf + ((ptrdiff_t)i * (ptrdiff_t)rcount + first_segment_count) * extent; ret = MCA_PML_CALL(irecv(ptmp, (rcount - first_segment_count), rdtype, i, MCA_COLL_BASE_TAG_GATHER, comm, &reqs[i])); if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } /* wait on the first segment to complete */ ret = ompi_request_wait(&first_segment_req, MPI_STATUS_IGNORE); if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } } /* copy local data if necessary */ if (MPI_IN_PLACE != sbuf) { ret = ompi_datatype_sndrcv((void *)sbuf, scount, sdtype, (char*)rbuf + (ptrdiff_t)rank * (ptrdiff_t)rcount * extent, rcount, rdtype); if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } } /* wait all second segments to complete */ ret = ompi_request_wait_all(size, reqs, MPI_STATUSES_IGNORE); if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } } /* All done */ return MPI_SUCCESS; error_hndl: if (NULL != reqs) { ompi_coll_base_free_reqs(reqs, size); } OPAL_OUTPUT (( ompi_coll_base_framework.framework_output, "ERROR_HNDL: node %d file %s line %d error %d\n", rank, __FILE__, line, ret )); (void)line; // silence compiler warning return ret; }