/* Build a linear (flat) alltoallw schedule: post one send and one receive
 * toward every peer except ourselves, all in the same round so that the
 * NBC engine can progress them concurrently.  Zero-size transfers (span
 * of 0) are skipped entirely. */
static inline int a2aw_sched_linear(int rank, int p, NBC_Schedule *schedule,
                                    const void *sendbuf, const int *sendcounts,
                                    const int *sdispls,
                                    struct ompi_datatype_t * const * sendtypes,
                                    void *recvbuf, const int *recvcounts,
                                    const int *rdispls,
                                    struct ompi_datatype_t * const * recvtypes)
{
    for (int peer = 0 ; peer < p ; ++peer) {
        ptrdiff_t dt_gap, dt_span;
        int rc;

        /* the local block is handled by the caller, not scheduled */
        if (peer == rank) {
            continue;
        }

        /* schedule the send toward this peer, unless it carries no data */
        dt_span = opal_datatype_span(&sendtypes[peer]->super, sendcounts[peer], &dt_gap);
        if (OPAL_LIKELY(0 < dt_span)) {
            char *send_ptr = (char *) sendbuf + sdispls[peer];   /* sdispls are byte offsets for alltoallw */
            rc = NBC_Sched_send (send_ptr, false, sendcounts[peer], sendtypes[peer],
                                 peer, schedule, false);
            if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
                return rc;
            }
        }

        /* schedule the matching receive, unless it carries no data */
        dt_span = opal_datatype_span(&recvtypes[peer]->super, recvcounts[peer], &dt_gap);
        if (OPAL_LIKELY(0 < dt_span)) {
            char *recv_ptr = (char *) recvbuf + rdispls[peer];
            rc = NBC_Sched_recv (recv_ptr, false, recvcounts[peer], recvtypes[peer],
                                 peer, schedule, false);
            if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
                return rc;
            }
        }
    }

    return OMPI_SUCCESS;
}
/*
 * reduce_inter
 *
 * Function: - reduction using the local_comm
 * Accepts:  - same as MPI_Reduce()
 * Returns:  - MPI_SUCCESS or error code
 *
 * Non-root group: reduce locally onto rank 0, which forwards the result
 * to the remote root.  Root: receive the already-reduced data.
 *
 * Fixes vs. previous version:
 *  - the result of the local coll_reduce is now checked before the buffer
 *    is used for the send;
 *  - free_buffer is released on the send-failure path (it previously
 *    leaked when MCA_PML_CALL(send) returned an error).
 */
int mca_coll_inter_reduce_inter(const void *sbuf, void *rbuf, int count,
                                struct ompi_datatype_t *dtype, struct ompi_op_t *op,
                                int root, struct ompi_communicator_t *comm,
                                mca_coll_base_module_t *module)
{
    int rank, err;

    /* Initialize */
    rank = ompi_comm_rank(comm);

    if (MPI_PROC_NULL == root) {
        /* do nothing */
        err = OMPI_SUCCESS;
    } else if (MPI_ROOT != root) {
        ptrdiff_t gap, span;
        char *free_buffer = NULL;
        char *pml_buffer = NULL;

        /* Perform the reduce locally with the first process as root.
         * The temporary buffer is span-sized; pml_buffer is gap-adjusted
         * so that the datatype's true lower bound lands at the start of
         * the allocation. */
        span = opal_datatype_span(&dtype->super, count, &gap);

        free_buffer = (char*)malloc(span);
        if (NULL == free_buffer) {
            return OMPI_ERR_OUT_OF_RESOURCE;
        }
        pml_buffer = free_buffer - gap;

        err = comm->c_local_comm->c_coll.coll_reduce(sbuf, pml_buffer, count,
                                                     dtype, op, 0, comm->c_local_comm,
                                                     comm->c_local_comm->c_coll.coll_reduce_module);
        if (OMPI_SUCCESS != err) {
            /* do not send garbage; release the temporary buffer */
            free(free_buffer);
            return err;
        }

        if (0 == rank) {
            /* First process sends the result to the root */
            err = MCA_PML_CALL(send(pml_buffer, count, dtype, root,
                                    MCA_COLL_BASE_TAG_REDUCE,
                                    MCA_PML_BASE_SEND_STANDARD, comm));
            if (OMPI_SUCCESS != err) {
                free(free_buffer);   /* was leaked here before */
                return err;
            }
        }

        free(free_buffer);
    } else {
        /* Root receives the reduced message from the first process */
        err = MCA_PML_CALL(recv(rbuf, count, dtype, 0,
                                MCA_COLL_BASE_TAG_REDUCE, comm,
                                MPI_STATUS_IGNORE));
        if (OMPI_SUCCESS != err) {
            return err;
        }
    }

    /* All done */
    return err;
}
/*
 * reduce_scatter_block
 *
 * Function: - reduce then scatter
 * Accepts:  - same as MPI_Reduce_scatter_block()
 * Returns:  - MPI_SUCCESS or error code
 *
 * Algorithm:
 *     reduce and scatter (needs to be cleaned
 *     up at some point)
 *
 * CUDA wrapper: if the send and/or receive buffers live in device memory,
 * stage them through host-side temporaries (sbuf1/rbuf1), run the
 * underlying reduce_scatter_block on the host copies, then copy the
 * result back to the original device receive buffer.
 */
int mca_coll_cuda_reduce_scatter_block(const void *sbuf, void *rbuf, int rcount,
                                       struct ompi_datatype_t *dtype,
                                       struct ompi_op_t *op,
                                       struct ompi_communicator_t *comm,
                                       mca_coll_base_module_t *module)
{
    mca_coll_cuda_module_t *s = (mca_coll_cuda_module_t*) module;
    ptrdiff_t gap;
    char *rbuf1 = NULL, *sbuf1 = NULL, *rbuf2 = NULL;
    const char *sbuf2;
    size_t sbufsize, rbufsize;
    int rc;

    /* rbufsize covers one rank's block (span of rcount elements);
     * the send side holds one such block per communicator rank */
    rbufsize = opal_datatype_span(&dtype->super, rcount, &gap);
    sbufsize = rbufsize * ompi_comm_size(comm);

    /* stage a device-resident send buffer into host memory */
    if ((MPI_IN_PLACE != sbuf) && (opal_cuda_check_bufs((char *)sbuf, NULL))) {
        sbuf1 = (char*)malloc(sbufsize);
        if (NULL == sbuf1) {
            return OMPI_ERR_OUT_OF_RESOURCE;
        }
        opal_cuda_memcpy_sync(sbuf1, sbuf, sbufsize);
        sbuf2 = sbuf; /* save away original buffer */
        /* gap-adjust so the datatype's true lower bound maps to the
         * start of the allocation — NOTE(review): data was copied to
         * sbuf1 itself; confirm the -gap adjustment matches the copy */
        sbuf = sbuf1 - gap;
    }

    /* stage a device-resident receive buffer into host memory */
    if (opal_cuda_check_bufs(rbuf, NULL)) {
        rbuf1 = (char*)malloc(rbufsize);
        if (NULL == rbuf1) {
            if (NULL != sbuf1) free(sbuf1);
            return OMPI_ERR_OUT_OF_RESOURCE;
        }
        opal_cuda_memcpy_sync(rbuf1, rbuf, rbufsize);
        rbuf2 = rbuf; /* save away original buffer */
        rbuf = rbuf1 - gap;
    }

    /* run the real collective on the (possibly host-staged) buffers */
    rc = s->c_coll.coll_reduce_scatter_block(sbuf, rbuf, rcount, dtype, op, comm,
                                             s->c_coll.coll_reduce_scatter_block_module);

    if (NULL != sbuf1) {
        free(sbuf1);
    }
    if (NULL != rbuf1) {
        /* copy the host-side result back to the original device buffer */
        rbuf = rbuf2;
        opal_cuda_memcpy_sync(rbuf, rbuf1, rbufsize);
        free(rbuf1);
    }
    return rc;
}
/* MPI_IN_PLACE all to all algorithm. TODO: implement a better one.
 *
 * Pairwise in-place exchange: for each pair (i, j) with i < j, the rank
 * that owns one side of the pair copies its outgoing block into a
 * temporary buffer, posts an irecv for the peer's block directly into
 * rbuf, sends the saved copy, and waits. O(size^2) pair iterations, but
 * needs only one block of scratch space. */
int mca_coll_base_alltoall_intra_basic_inplace(const void *rbuf, int rcount,
                                               struct ompi_datatype_t *rdtype,
                                               struct ompi_communicator_t *comm,
                                               mca_coll_base_module_t *module)
{
    int i, j, size, rank, err = MPI_SUCCESS, line;
    OPAL_PTRDIFF_TYPE ext, gap;
    ompi_request_t *req;
    char *allocated_buffer = NULL, *tmp_buffer;
    size_t max_size;

    /* Initialize. */
    size = ompi_comm_size(comm);
    rank = ompi_comm_rank(comm);

    /* If only one process, we're done. */
    if (1 == size) {
        return MPI_SUCCESS;
    }

    /* Find the largest receive amount */
    ompi_datatype_type_extent (rdtype, &ext);
    max_size = opal_datatype_span(&rdtype->super, rcount, &gap);

    /* Initiate all send/recv to/from others. */

    /* Allocate a temporary buffer */
    allocated_buffer = calloc (max_size, 1);
    if( NULL == allocated_buffer) {
        err = OMPI_ERR_OUT_OF_RESOURCE;
        line = __LINE__;
        goto error_hndl;
    }
    /* gap-adjust: tmp_buffer + true_lb falls at the allocation start */
    tmp_buffer = allocated_buffer - gap;
    /* from here on, max_size is the stride (in bytes) between per-rank
     * blocks inside rbuf, not the scratch-buffer size */
    max_size = ext * rcount;

    /* in-place alltoall slow algorithm (but works) */
    for (i = 0 ; i < size ; ++i) {
        for (j = i+1 ; j < size ; ++j) {
            if (i == rank) {
                /* Copy the data into the temporary buffer */
                err = ompi_datatype_copy_content_same_ddt (rdtype, rcount, tmp_buffer,
                                                           (char *) rbuf + j * max_size);
                if (MPI_SUCCESS != err) { line = __LINE__; goto error_hndl; }

                /* Exchange data with the peer */
                err = MCA_PML_CALL(irecv ((char *) rbuf + max_size * j, rcount, rdtype,
                                          j, MCA_COLL_BASE_TAG_ALLTOALL, comm, &req));
                if (MPI_SUCCESS != err) { line = __LINE__; goto error_hndl; }

                err = MCA_PML_CALL(send ((char *) tmp_buffer, rcount, rdtype,
                                         j, MCA_COLL_BASE_TAG_ALLTOALL,
                                         MCA_PML_BASE_SEND_STANDARD, comm));
                if (MPI_SUCCESS != err) { line = __LINE__; goto error_hndl; }
            } else if (j == rank) {
                /* Copy the data into the temporary buffer */
                err = ompi_datatype_copy_content_same_ddt (rdtype, rcount, tmp_buffer,
                                                           (char *) rbuf + i * max_size);
                if (MPI_SUCCESS != err) { line = __LINE__; goto error_hndl; }

                /* Exchange data with the peer */
                err = MCA_PML_CALL(irecv ((char *) rbuf + max_size * i, rcount, rdtype,
                                          i, MCA_COLL_BASE_TAG_ALLTOALL, comm, &req));
                if (MPI_SUCCESS != err) { line = __LINE__; goto error_hndl; }

                err = MCA_PML_CALL(send ((char *) tmp_buffer, rcount, rdtype,
                                         i, MCA_COLL_BASE_TAG_ALLTOALL,
                                         MCA_PML_BASE_SEND_STANDARD, comm));
                if (MPI_SUCCESS != err) { line = __LINE__; goto error_hndl; }
            } else {
                /* this rank is not part of the (i, j) pair */
                continue;
            }

            /* Wait for the requests to complete */
            err = ompi_request_wait ( &req, MPI_STATUSES_IGNORE);
            if (MPI_SUCCESS != err) { line = __LINE__; goto error_hndl; }
        }
    }

 error_hndl:
    /* Free the temporary buffer */
    if( NULL != allocated_buffer )
        free (allocated_buffer);

    if( MPI_SUCCESS != err ) {
        OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
                     "%s:%4d\tError occurred %d, rank %2d", __FILE__, line, err, rank));
        (void)line;  // silence compiler warning
    }

    /* All done */
    return err;
}
/* Bruck's alltoall: O(log2(size)) communication rounds at the cost of
 * local data rotations before and after the exchange phase.
 *
 * Step 1: rotate the local send data down by `rank` blocks into tmpbuf.
 * Step 2: for each power-of-two distance, gather the blocks whose index
 *         has that bit set into an indexed datatype and sendrecv them.
 * Step 3: rotate the received data into its final position in rbuf.
 */
int ompi_coll_base_alltoall_intra_bruck(const void *sbuf, int scount,
                                        struct ompi_datatype_t *sdtype,
                                        void* rbuf, int rcount,
                                        struct ompi_datatype_t *rdtype,
                                        struct ompi_communicator_t *comm,
                                        mca_coll_base_module_t *module)
{
    int i, k, line = -1, rank, size, err = 0;
    int sendto, recvfrom, distance, *displs = NULL, *blen = NULL;
    char *tmpbuf = NULL, *tmpbuf_free = NULL;
    OPAL_PTRDIFF_TYPE sext, rext, span, gap;
    struct ompi_datatype_t *new_ddt;

    /* in-place input is handled by the dedicated pairwise algorithm */
    if (MPI_IN_PLACE == sbuf) {
        return mca_coll_base_alltoall_intra_basic_inplace (rbuf, rcount, rdtype,
                                                           comm, module);
    }

    size = ompi_comm_size(comm);
    rank = ompi_comm_rank(comm);

    OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
                 "coll:base:alltoall_intra_bruck rank %d", rank));

    err = ompi_datatype_type_extent (sdtype, &sext);
    if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }

    err = ompi_datatype_type_extent (rdtype, &rext);
    if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }

    /* scratch buffer must hold the whole outgoing payload (size*scount) */
    span = opal_datatype_span(&sdtype->super, (int64_t)size * scount, &gap);

    displs = (int *) malloc(size * sizeof(int));
    if (displs == NULL) { line = __LINE__; err = -1; goto err_hndl; }
    blen = (int *) malloc(size * sizeof(int));
    if (blen == NULL) { line = __LINE__; err = -1; goto err_hndl; }

    /* tmp buffer allocation for message data */
    tmpbuf_free = (char *)malloc(span);
    if (tmpbuf_free == NULL) { line = __LINE__; err = -1; goto err_hndl; }
    tmpbuf = tmpbuf_free - gap;   /* gap-adjusted working pointer */

    /* Step 1 - local rotation - shift up by rank */
    err = ompi_datatype_copy_content_same_ddt (sdtype,
                                               (int32_t) ((ptrdiff_t)(size - rank) * (ptrdiff_t)scount),
                                               tmpbuf,
                                               ((char*) sbuf) + (ptrdiff_t)rank * (ptrdiff_t)scount * sext);
    if (err<0) { line = __LINE__; err = -1; goto err_hndl; }

    if (rank != 0) {
        /* wrap-around part of the rotation */
        err = ompi_datatype_copy_content_same_ddt (sdtype, (ptrdiff_t)rank * (ptrdiff_t)scount,
                                                   tmpbuf + (ptrdiff_t)(size - rank) * (ptrdiff_t)scount* sext,
                                                   (char*) sbuf);
        if (err<0) { line = __LINE__; err = -1; goto err_hndl; }
    }

    /* perform communication step */
    for (distance = 1; distance < size; distance<<=1) {
        sendto = (rank + distance) % size;
        recvfrom = (rank - distance + size) % size;
        k = 0;

        /* create indexed datatype selecting every block whose index has
         * the `distance` bit set */
        for (i = 1; i < size; i++) {
            if (( i & distance) == distance) {
                displs[k] = (ptrdiff_t)i * (ptrdiff_t)scount;
                blen[k] = scount;
                k++;
            }
        }
        /* Set indexes and displacements */
        err = ompi_datatype_create_indexed(k, blen, displs, sdtype, &new_ddt);
        if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }
        /* Commit the new datatype */
        err = ompi_datatype_commit(&new_ddt);
        if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }

        /* Sendreceive */
        err = ompi_coll_base_sendrecv ( tmpbuf, 1, new_ddt, sendto,
                                        MCA_COLL_BASE_TAG_ALLTOALL,
                                        rbuf, 1, new_ddt, recvfrom,
                                        MCA_COLL_BASE_TAG_ALLTOALL,
                                        comm, MPI_STATUS_IGNORE, rank );
        if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }

        /* Copy back new data from recvbuf to tmpbuf */
        err = ompi_datatype_copy_content_same_ddt(new_ddt, 1,tmpbuf, (char *) rbuf);
        if (err < 0) { line = __LINE__; err = -1; goto err_hndl; }

        /* free ddt */
        err = ompi_datatype_destroy(&new_ddt);
        if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }
    } /* end of for (distance = 1... */

    /* Step 3 - local rotation - */
    for (i = 0; i < size; i++) {
        err = ompi_datatype_copy_content_same_ddt (rdtype, (int32_t) rcount,
                                                   ((char*)rbuf) + ((ptrdiff_t)((rank - i + size) % size) * (ptrdiff_t)rcount * rext),
                                                   tmpbuf + (ptrdiff_t)i * (ptrdiff_t)rcount * rext);
        if (err < 0) { line = __LINE__; err = -1; goto err_hndl; }
    }

    /* Step 4 - clean up */
    if (tmpbuf != NULL) free(tmpbuf_free);
    if (displs != NULL) free(displs);
    if (blen != NULL) free(blen);
    return OMPI_SUCCESS;

 err_hndl:
    OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
                 "%s:%4d\tError occurred %d, rank %2d", __FILE__, line, err, rank));
    (void)line;  // silence compiler warning
    if (tmpbuf != NULL) free(tmpbuf_free);
    if (displs != NULL) free(displs);
    if (blen != NULL) free(blen);
    return err;
}
/*
 * reduce_scatter_block_inter
 *
 * Function: - reduce/scatter operation
 * Accepts:  - same arguments as MPI_Reduce_scatter()
 * Returns:  - MPI_SUCCESS or error code
 */
int mca_coll_basic_reduce_scatter_block_inter(const void *sbuf, void *rbuf, int rcount,
                                              struct ompi_datatype_t *dtype,
                                              struct ompi_op_t *op,
                                              struct ompi_communicator_t *comm,
                                              mca_coll_base_module_t *module)
{
    int err, i, rank, root = 0, rsize, lsize;
    int totalcounts;
    ptrdiff_t gap, span;
    char *tmpbuf = NULL, *tmpbuf2 = NULL;
    char *lbuf = NULL, *buf;
    ompi_request_t *req;

    rank = ompi_comm_rank(comm);
    rsize = ompi_comm_remote_size(comm);
    lsize = ompi_comm_size(comm);

    /* each local rank receives rcount elements, so the full reduction
     * operates on lsize * rcount elements */
    totalcounts = lsize * rcount;

    /*
     * The following code basically does an interreduce followed by a
     * intrascatter.  This is implemented by having the roots of each
     * group exchange their sbuf.  Then, the roots receive the data
     * from each of the remote ranks and execute the reduce.  When
     * this is complete, they have the reduced data available to them
     * for doing the scatter.  They do this on the local communicator
     * associated with the intercommunicator.
     *
     * Note: There are other ways to implement MPI_Reduce_scatter_block on
     * intercommunicators.  For example, one could do a MPI_Reduce locally,
     * then send the results to the other root which could scatter it.
     */
    if (rank == root) {
        span = opal_datatype_span(&dtype->super, totalcounts, &gap);

        tmpbuf = (char *) malloc(span);
        tmpbuf2 = (char *) malloc(span);
        if (NULL == tmpbuf || NULL == tmpbuf2) {
            return OMPI_ERR_OUT_OF_RESOURCE;
        }
        /* gap-adjusted working pointers into the two scratch buffers */
        lbuf = tmpbuf - gap;
        buf = tmpbuf2 - gap;

        /* Do a send-recv between the two root procs. to avoid deadlock */
        err = MCA_PML_CALL(isend(sbuf, totalcounts, dtype, 0,
                                 MCA_COLL_BASE_TAG_REDUCE_SCATTER,
                                 MCA_PML_BASE_SEND_STANDARD, comm, &req));
        if (OMPI_SUCCESS != err) {
            goto exit;
        }

        err = MCA_PML_CALL(recv(lbuf, totalcounts, dtype, 0,
                                MCA_COLL_BASE_TAG_REDUCE_SCATTER, comm,
                                MPI_STATUS_IGNORE));
        if (OMPI_SUCCESS != err) {
            goto exit;
        }

        err = ompi_request_wait( &req, MPI_STATUS_IGNORE);
        if (OMPI_SUCCESS != err) {
            goto exit;
        }

        /* Loop receiving and calling reduction function (C or Fortran)
         * The result of this reduction operations is then in
         * tmpbuf2.
         */
        for (i = 1; i < rsize; i++) {
            char *tbuf;
            err = MCA_PML_CALL(recv(buf, totalcounts, dtype, i,
                                    MCA_COLL_BASE_TAG_REDUCE_SCATTER, comm,
                                    MPI_STATUS_IGNORE));
            if (MPI_SUCCESS != err) {
                goto exit;
            }

            /* Perform the reduction */
            ompi_op_reduce(op, lbuf, buf, totalcounts, dtype);
            /* swap the buffers: the accumulated result is always in lbuf
             * at the top of the next iteration */
            tbuf = lbuf; lbuf = buf; buf = tbuf;
        }
    } else {
        /* If not root, send data to the root. */
        err = MCA_PML_CALL(send(sbuf, totalcounts, dtype, root,
                                MCA_COLL_BASE_TAG_REDUCE_SCATTER,
                                MCA_PML_BASE_SEND_STANDARD, comm));
        if (OMPI_SUCCESS != err) {
            goto exit;
        }
    }

    /* Now do a scatterv on the local communicator (lbuf is NULL on
     * non-root ranks, which is fine for scatter non-roots) */
    err = comm->c_local_comm->c_coll->coll_scatter(lbuf, rcount, dtype,
                                                   rbuf, rcount, dtype, 0,
                                                   comm->c_local_comm,
                                                   comm->c_local_comm->c_coll->coll_scatter_module);

  exit:
    if (NULL != tmpbuf) {
        free(tmpbuf);
    }

    if (NULL != tmpbuf2) {
        free(tmpbuf2);
    }

    return err;
}
/*
 * allgatherv_inter
 *
 * Function: - allgatherv using other MPI collectives
 * Accepts:  - same as MPI_Allgatherv()
 * Returns:  - MPI_SUCCESS or error code
 *
 * Local roots gather their group's contributions (gatherv), exchange the
 * aggregate with the remote root using an indexed datatype that matches
 * the caller's rcounts/disps layout, then broadcast the result locally.
 */
int mca_coll_inter_allgatherv_inter(const void *sbuf, int scount,
                                    struct ompi_datatype_t *sdtype,
                                    void *rbuf, const int *rcounts, const int *disps,
                                    struct ompi_datatype_t *rdtype,
                                    struct ompi_communicator_t *comm,
                                    mca_coll_base_module_t *module)
{
    int i, rank, size, size_local, total=0, err;
    int *count=NULL,*displace=NULL;
    char *ptmp_free=NULL, *ptmp=NULL;
    ompi_datatype_t *ndtype = NULL;

    rank = ompi_comm_rank(comm);
    size_local = ompi_comm_size(comm->c_local_comm);
    size = ompi_comm_remote_size(comm);

    /* only the local root needs the per-rank count/displacement arrays */
    if (0 == rank) {
        count = (int *)malloc(sizeof(int) * size_local);
        displace = (int *)malloc(sizeof(int) * size_local);
        if ((NULL == count) || (NULL == displace)) {
            err = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }
    }
    /* Local gather to get the scount of each process */
    err = comm->c_local_comm->c_coll->coll_gather(&scount, 1, MPI_INT,
                                                  count, 1, MPI_INT,
                                                  0, comm->c_local_comm,
                                                  comm->c_local_comm->c_coll->coll_gather_module);
    if (OMPI_SUCCESS != err) {
        goto exit;
    }
    if(0 == rank) {
        /* build contiguous displacements and the aggregate size */
        displace[0] = 0;
        for (i = 1; i < size_local; i++) {
            displace[i] = displace[i-1] + count[i-1];
        }
        total = 0;
        for (i = 0; i < size_local; i++) {
            total = total + count[i];
        }
        if ( total > 0 ) {
            ptrdiff_t gap, span;
            span = opal_datatype_span(&sdtype->super, total, &gap);
            ptmp_free = (char*)malloc(span);
            if (NULL == ptmp_free) {
                err = OMPI_ERR_OUT_OF_RESOURCE;
                goto exit;
            }
            ptmp = ptmp_free - gap;   /* gap-adjusted working pointer */
        }
    }
    err = comm->c_local_comm->c_coll->coll_gatherv(sbuf, scount, sdtype,
                                                   ptmp, count, displace,
                                                   sdtype,0, comm->c_local_comm,
                                                   comm->c_local_comm->c_coll->coll_gatherv_module);
    if (OMPI_SUCCESS != err) {
        goto exit;
    }

    /* one recv of this indexed type equals the whole remote payload laid
     * out per the caller's rcounts/disps */
    ompi_datatype_create_indexed(size,rcounts,disps,rdtype,&ndtype);
    ompi_datatype_commit(&ndtype);

    if (0 == rank) {
        /* Exchange data between roots */
        err = ompi_coll_base_sendrecv_actual(ptmp, total, sdtype, 0,
                                             MCA_COLL_BASE_TAG_ALLGATHERV,
                                             rbuf, 1, ndtype, 0,
                                             MCA_COLL_BASE_TAG_ALLGATHERV,
                                             comm, MPI_STATUS_IGNORE);
        if (OMPI_SUCCESS != err) {
            goto exit;
        }
    }

    /* bcast the message to all the local processes */
    err = comm->c_local_comm->c_coll->coll_bcast(rbuf, 1, ndtype,
                                                 0, comm->c_local_comm,
                                                 comm->c_local_comm->c_coll->coll_bcast_module);
  exit:
    if( NULL != ndtype ) {
        ompi_datatype_destroy(&ndtype);
    }
    if (NULL != ptmp_free) {
        free(ptmp_free);
    }
    if (NULL != displace) {
        free(displace);
    }
    if (NULL != count) {
        free(count);
    }

    return err;
}
/*
 * allreduce_inter
 *
 * Function: - allreduce using other MPI collectives
 * Accepts:  - same as MPI_Allreduce()
 * Returns:  - MPI_SUCCESS or error code
 *
 * Each group reduces locally onto its root; the two roots swap results
 * with a paired isend/irecv (avoids deadlock), then each root broadcasts
 * the remote group's result locally.
 */
int mca_coll_inter_allreduce_inter(const void *sbuf, void *rbuf, int count,
                                   struct ompi_datatype_t *dtype,
                                   struct ompi_op_t *op,
                                   struct ompi_communicator_t *comm,
                                   mca_coll_base_module_t *module)
{
    int err, rank, root = 0;
    char *tmpbuf = NULL, *pml_buffer = NULL;
    ompi_request_t *req[2];
    ptrdiff_t gap, span;

    rank = ompi_comm_rank(comm);

    /* Perform the reduction locally */
    span = opal_datatype_span(&dtype->super, count, &gap);

    tmpbuf = (char *) malloc(span);
    if (NULL == tmpbuf) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }
    pml_buffer = tmpbuf - gap;   /* gap-adjusted working pointer */

    err = comm->c_local_comm->c_coll->coll_reduce(sbuf, pml_buffer, count,
                                                  dtype, op, root,
                                                  comm->c_local_comm,
                                                  comm->c_local_comm->c_coll->coll_reduce_module);
    if (OMPI_SUCCESS != err) {
        goto exit;
    }

    if (rank == root) {
        /* Do a send-recv between the two root procs. to avoid deadlock */
        err = MCA_PML_CALL(irecv(rbuf, count, dtype, 0,
                                 MCA_COLL_BASE_TAG_ALLREDUCE, comm,
                                 &(req[0])));
        if (OMPI_SUCCESS != err) {
            goto exit;
        }

        err = MCA_PML_CALL(isend(pml_buffer, count, dtype, 0,
                                 MCA_COLL_BASE_TAG_ALLREDUCE,
                                 MCA_PML_BASE_SEND_STANDARD, comm,
                                 &(req[1])));
        if (OMPI_SUCCESS != err) {
            goto exit;
        }

        err = ompi_request_wait_all(2, req, MPI_STATUSES_IGNORE);
        if (OMPI_SUCCESS != err) {
            goto exit;
        }
    }

    /* bcast the message to all the local processes */
    err = comm->c_local_comm->c_coll->coll_bcast(rbuf, count, dtype,
                                                 root, comm->c_local_comm,
                                                 comm->c_local_comm->c_coll->coll_bcast_module);
    if (OMPI_SUCCESS != err) {
        goto exit;
    }

  exit:
    if (NULL != tmpbuf) {
        free(tmpbuf);
    }

    return err;
}
/** * This is a generic implementation of the reduce protocol. It used the tree * provided as an argument and execute all operations using a segment of * count times a datatype. * For the last communication it will update the count in order to limit * the number of datatype to the original count (original_count) * * Note that for non-commutative operations we cannot save memory copy * for the first block: thus we must copy sendbuf to accumbuf on intermediate * to keep the optimized loop happy. */ int ompi_coll_base_reduce_generic( const void* sendbuf, void* recvbuf, int original_count, ompi_datatype_t* datatype, ompi_op_t* op, int root, ompi_communicator_t* comm, mca_coll_base_module_t *module, ompi_coll_tree_t* tree, int count_by_segment, int max_outstanding_reqs ) { char *inbuf[2] = {NULL, NULL}, *inbuf_free[2] = {NULL, NULL}; char *accumbuf = NULL, *accumbuf_free = NULL; char *local_op_buffer = NULL, *sendtmpbuf = NULL; ptrdiff_t extent, size, gap = 0, segment_increment; ompi_request_t **sreq = NULL, *reqs[2] = {MPI_REQUEST_NULL, MPI_REQUEST_NULL}; int num_segments, line, ret, segindex, i, rank; int recvcount, prevcount, inbi; /** * Determine number of segments and number of elements * sent per operation */ ompi_datatype_type_extent( datatype, &extent ); num_segments = (int)(((size_t)original_count + (size_t)count_by_segment - (size_t)1) / (size_t)count_by_segment); segment_increment = (ptrdiff_t)count_by_segment * extent; sendtmpbuf = (char*) sendbuf; if( sendbuf == MPI_IN_PLACE ) { sendtmpbuf = (char *)recvbuf; } OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "coll:base:reduce_generic count %d, msg size %ld, segsize %ld, max_requests %d", original_count, (unsigned long)((ptrdiff_t)num_segments * (ptrdiff_t)segment_increment), (unsigned long)segment_increment, max_outstanding_reqs)); rank = ompi_comm_rank(comm); /* non-leaf nodes - wait for children to send me data & forward up (if needed) */ if( tree->tree_nextsize > 0 ) { ptrdiff_t real_segment_size; 
/* handle non existant recv buffer (i.e. its NULL) and protect the recv buffer on non-root nodes */ accumbuf = (char*)recvbuf; if( (NULL == accumbuf) || (root != rank) ) { /* Allocate temporary accumulator buffer. */ size = opal_datatype_span(&datatype->super, original_count, &gap); accumbuf_free = (char*)malloc(size); if (accumbuf_free == NULL) { line = __LINE__; ret = -1; goto error_hndl; } accumbuf = accumbuf_free - gap; } /* If this is a non-commutative operation we must copy sendbuf to the accumbuf, in order to simplfy the loops */ if (!ompi_op_is_commute(op) && MPI_IN_PLACE != sendbuf) { ompi_datatype_copy_content_same_ddt(datatype, original_count, (char*)accumbuf, (char*)sendtmpbuf); } /* Allocate two buffers for incoming segments */ real_segment_size = opal_datatype_span(&datatype->super, count_by_segment, &gap); inbuf_free[0] = (char*) malloc(real_segment_size); if( inbuf_free[0] == NULL ) { line = __LINE__; ret = -1; goto error_hndl; } inbuf[0] = inbuf_free[0] - gap; /* if there is chance to overlap communication - allocate second buffer */ if( (num_segments > 1) || (tree->tree_nextsize > 1) ) { inbuf_free[1] = (char*) malloc(real_segment_size); if( inbuf_free[1] == NULL ) { line = __LINE__; ret = -1; goto error_hndl; } inbuf[1] = inbuf_free[1] - gap; } /* reset input buffer index and receive count */ inbi = 0; recvcount = 0; /* for each segment */ for( segindex = 0; segindex <= num_segments; segindex++ ) { prevcount = recvcount; /* recvcount - number of elements in current segment */ recvcount = count_by_segment; if( segindex == (num_segments-1) ) recvcount = original_count - (ptrdiff_t)count_by_segment * (ptrdiff_t)segindex; /* for each child */ for( i = 0; i < tree->tree_nextsize; i++ ) { /** * We try to overlap communication: * either with next segment or with the next child */ /* post irecv for current segindex on current child */ if( segindex < num_segments ) { void* local_recvbuf = inbuf[inbi]; if( 0 == i ) { /* for the first step (1st child per 
segment) and * commutative operations we might be able to irecv * directly into the accumulate buffer so that we can * reduce(op) this with our sendbuf in one step as * ompi_op_reduce only has two buffer pointers, * this avoids an extra memory copy. * * BUT if the operation is non-commutative or * we are root and are USING MPI_IN_PLACE this is wrong! */ if( (ompi_op_is_commute(op)) && !((MPI_IN_PLACE == sendbuf) && (rank == tree->tree_root)) ) { local_recvbuf = accumbuf + (ptrdiff_t)segindex * (ptrdiff_t)segment_increment; } } ret = MCA_PML_CALL(irecv(local_recvbuf, recvcount, datatype, tree->tree_next[i], MCA_COLL_BASE_TAG_REDUCE, comm, &reqs[inbi])); if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl;} } /* wait for previous req to complete, if any. if there are no requests reqs[inbi ^1] will be MPI_REQUEST_NULL. */ /* wait on data from last child for previous segment */ ret = ompi_request_wait(&reqs[inbi ^ 1], MPI_STATUSES_IGNORE ); if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } local_op_buffer = inbuf[inbi ^ 1]; if( i > 0 ) { /* our first operation is to combine our own [sendbuf] data * with the data we recvd from down stream (but only * the operation is commutative and if we are not root and * not using MPI_IN_PLACE) */ if( 1 == i ) { if( (ompi_op_is_commute(op)) && !((MPI_IN_PLACE == sendbuf) && (rank == tree->tree_root)) ) { local_op_buffer = sendtmpbuf + (ptrdiff_t)segindex * (ptrdiff_t)segment_increment; } } /* apply operation */ ompi_op_reduce(op, local_op_buffer, accumbuf + (ptrdiff_t)segindex * (ptrdiff_t)segment_increment, recvcount, datatype ); } else if ( segindex > 0 ) { void* accumulator = accumbuf + (ptrdiff_t)(segindex-1) * (ptrdiff_t)segment_increment; if( tree->tree_nextsize <= 1 ) { if( (ompi_op_is_commute(op)) && !((MPI_IN_PLACE == sendbuf) && (rank == tree->tree_root)) ) { local_op_buffer = sendtmpbuf + (ptrdiff_t)(segindex-1) * (ptrdiff_t)segment_increment; } } ompi_op_reduce(op, local_op_buffer, accumulator, 
prevcount, datatype ); /* all reduced on available data this step (i) complete, * pass to the next process unless you are the root. */ if (rank != tree->tree_root) { /* send combined/accumulated data to parent */ ret = MCA_PML_CALL( send( accumulator, prevcount, datatype, tree->tree_prev, MCA_COLL_BASE_TAG_REDUCE, MCA_PML_BASE_SEND_STANDARD, comm) ); if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } } /* we stop when segindex = number of segments (i.e. we do num_segment+1 steps for pipelining */ if (segindex == num_segments) break; } /* update input buffer index */ inbi = inbi ^ 1; } /* end of for each child */ } /* end of for each segment */ /* clean up */ if( inbuf_free[0] != NULL) free(inbuf_free[0]); if( inbuf_free[1] != NULL) free(inbuf_free[1]); if( accumbuf_free != NULL ) free(accumbuf_free); } /* leaf nodes Depending on the value of max_outstanding_reqs and the number of segments we have two options: - send all segments using blocking send to the parent, or - avoid overflooding the parent nodes by limiting the number of outstanding requests to max_oustanding_reqs. TODO/POSSIBLE IMPROVEMENT: If there is a way to determine the eager size for the current communication, synchronization should be used only when the message/segment size is smaller than the eager size. 
*/ else { /* If the number of segments is less than a maximum number of oustanding requests or there is no limit on the maximum number of outstanding requests, we send data to the parent using blocking send */ if ((0 == max_outstanding_reqs) || (num_segments <= max_outstanding_reqs)) { segindex = 0; while ( original_count > 0) { if (original_count < count_by_segment) { count_by_segment = original_count; } ret = MCA_PML_CALL( send((char*)sendbuf + (ptrdiff_t)segindex * (ptrdiff_t)segment_increment, count_by_segment, datatype, tree->tree_prev, MCA_COLL_BASE_TAG_REDUCE, MCA_PML_BASE_SEND_STANDARD, comm) ); if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } segindex++; original_count -= count_by_segment; } } /* Otherwise, introduce flow control: - post max_outstanding_reqs non-blocking synchronous send, - for remaining segments - wait for a ssend to complete, and post the next one. - wait for all outstanding sends to complete. */ else { int creq = 0; sreq = ompi_coll_base_comm_get_reqs(module->base_data, max_outstanding_reqs); if (NULL == sreq) { line = __LINE__; ret = -1; goto error_hndl; } /* post first group of requests */ for (segindex = 0; segindex < max_outstanding_reqs; segindex++) { ret = MCA_PML_CALL( isend((char*)sendbuf + (ptrdiff_t)segindex * (ptrdiff_t)segment_increment, count_by_segment, datatype, tree->tree_prev, MCA_COLL_BASE_TAG_REDUCE, MCA_PML_BASE_SEND_SYNCHRONOUS, comm, &sreq[segindex]) ); if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } original_count -= count_by_segment; } creq = 0; while ( original_count > 0 ) { /* wait on a posted request to complete */ ret = ompi_request_wait(&sreq[creq], MPI_STATUS_IGNORE); if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } if( original_count < count_by_segment ) { count_by_segment = original_count; } ret = MCA_PML_CALL( isend((char*)sendbuf + (ptrdiff_t)segindex * (ptrdiff_t)segment_increment, count_by_segment, datatype, tree->tree_prev, MCA_COLL_BASE_TAG_REDUCE, 
MCA_PML_BASE_SEND_SYNCHRONOUS, comm, &sreq[creq]) ); if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } creq = (creq + 1) % max_outstanding_reqs; segindex++; original_count -= count_by_segment; } /* Wait on the remaining request to complete */ ret = ompi_request_wait_all( max_outstanding_reqs, sreq, MPI_STATUSES_IGNORE ); if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } } } return OMPI_SUCCESS; error_hndl: /* error handler */ OPAL_OUTPUT (( ompi_coll_base_framework.framework_output, "ERROR_HNDL: node %d file %s line %d error %d\n", rank, __FILE__, line, ret )); (void)line; // silence compiler warning if( inbuf_free[0] != NULL ) free(inbuf_free[0]); if( inbuf_free[1] != NULL ) free(inbuf_free[1]); if( accumbuf_free != NULL ) free(accumbuf); if( NULL != sreq ) { ompi_coll_base_free_reqs(sreq, max_outstanding_reqs); } return ret; }
/* simple linear Alltoallv
 *
 * Build a non-blocking schedule: for MPI_IN_PLACE, a scratch buffer large
 * enough for the biggest single receive is allocated and the in-place
 * scheduler is used; otherwise the local block is copied directly and
 * a flat send/recv schedule toward all peers is built. */
static int nbc_alltoallv_init(const void* sendbuf, const int *sendcounts, const int *sdispls,
                              MPI_Datatype sendtype, void* recvbuf, const int *recvcounts, const int *rdispls,
                              MPI_Datatype recvtype, struct ompi_communicator_t *comm, ompi_request_t ** request,
                              struct mca_coll_base_module_2_3_0_t *module, bool persistent)
{
    int rank, p, res;
    MPI_Aint sndext, rcvext;
    NBC_Schedule *schedule;
    char *rbuf, *sbuf, inplace;
    ptrdiff_t gap = 0, span;
    void * tmpbuf = NULL;
    ompi_coll_libnbc_module_t *libnbc_module = (ompi_coll_libnbc_module_t*) module;

    NBC_IN_PLACE(sendbuf, recvbuf, inplace);

    rank = ompi_comm_rank (comm);
    p = ompi_comm_size (comm);

    res = ompi_datatype_type_extent (recvtype, &rcvext);
    if (MPI_SUCCESS != res) {
        NBC_Error("MPI Error in ompi_datatype_type_extent() (%i)", res);
        return res;
    }

    /* copy data to receivbuffer */
    if (inplace) {
        /* scratch space needs to hold the largest single receive */
        int count = 0;
        for (int i = 0; i < p; i++) {
            if (recvcounts[i] > count) {
                count = recvcounts[i];
            }
        }
        span = opal_datatype_span(&recvtype->super, count, &gap);
        if (OPAL_UNLIKELY(0 == span)) {
            /* nothing to exchange at all */
            return nbc_get_noop_request(persistent, request);
        }
        tmpbuf = malloc(span);
        if (OPAL_UNLIKELY(NULL == tmpbuf)) {
            return OMPI_ERR_OUT_OF_RESOURCE;
        }
        /* treat the receive parameters as the send parameters too */
        sendcounts = recvcounts;
        sdispls = rdispls;
    } else {
        res = ompi_datatype_type_extent (sendtype, &sndext);
        if (MPI_SUCCESS != res) {
            NBC_Error("MPI Error in ompi_datatype_type_extent() (%i)", res);
            return res;
        }
    }

    schedule = OBJ_NEW(NBC_Schedule);
    if (OPAL_UNLIKELY(NULL == schedule)) {
        free(tmpbuf);
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    /* the local block does not travel over the wire: schedule a copy */
    if (!inplace && sendcounts[rank] != 0) {
        rbuf = (char *) recvbuf + rdispls[rank] * rcvext;
        sbuf = (char *) sendbuf + sdispls[rank] * sndext;
        res = NBC_Sched_copy (sbuf, false, sendcounts[rank], sendtype,
                              rbuf, false, recvcounts[rank], recvtype, schedule, false);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
            OBJ_RELEASE(schedule);
            return res;
        }
    }

    if (inplace) {
        res = a2av_sched_inplace(rank, p, schedule, recvbuf, recvcounts,
                                 rdispls, rcvext, recvtype, gap);
    } else {
        res = a2av_sched_linear(rank, p, schedule,
                                sendbuf, sendcounts, sdispls, sndext, sendtype,
                                recvbuf, recvcounts, rdispls, rcvext, recvtype);
    }

    if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
        OBJ_RELEASE(schedule);
        free(tmpbuf);
        return res;
    }

    res = NBC_Sched_commit (schedule);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
        OBJ_RELEASE(schedule);
        free(tmpbuf);
        return res;
    }

    /* tmpbuf ownership passes to the request on success */
    res = NBC_Schedule_request(schedule, comm, libnbc_module, persistent, request, tmpbuf);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
        OBJ_RELEASE(schedule);
        free(tmpbuf);
        return res;
    }

    return OMPI_SUCCESS;
}
/* simple linear Alltoallv
 *
 * Non-blocking alltoallv: copy the local block, then schedule sends and
 * receives toward every peer (or use the in-place scheduler with a
 * scratch buffer when MPI_IN_PLACE is given).
 *
 * Fix vs. previous version: when the local NBC_Copy failed, the function
 * returned without releasing the already-initialized handle, leaking it.
 * Every error path now returns the handle, consistent with the others.
 */
int ompi_coll_libnbc_ialltoallv(const void* sendbuf, const int *sendcounts, const int *sdispls,
                                MPI_Datatype sendtype, void* recvbuf, const int *recvcounts, const int *rdispls,
                                MPI_Datatype recvtype, struct ompi_communicator_t *comm, ompi_request_t ** request,
                                struct mca_coll_base_module_2_2_0_t *module)
{
    int rank, p, res;
    MPI_Aint sndext, rcvext;
    NBC_Schedule *schedule;
    char *rbuf, *sbuf, inplace;
    ptrdiff_t gap, span;
    NBC_Handle *handle;
    ompi_coll_libnbc_module_t *libnbc_module = (ompi_coll_libnbc_module_t*) module;

    NBC_IN_PLACE(sendbuf, recvbuf, inplace);

    rank = ompi_comm_rank (comm);
    p = ompi_comm_size (comm);

    res = ompi_datatype_type_extent (recvtype, &rcvext);
    if (MPI_SUCCESS != res) {
        NBC_Error("MPI Error in ompi_datatype_type_extent() (%i)", res);
        return res;
    }

    res = NBC_Init_handle (comm, &handle, libnbc_module);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
        return res;
    }

    /* copy data to receivbuffer */
    if (inplace) {
        /* scratch space needs to hold the largest single receive */
        int count = 0;
        for (int i = 0; i < p; i++) {
            if (recvcounts[i] > count) {
                count = recvcounts[i];
            }
        }
        span = opal_datatype_span(&recvtype->super, count, &gap);
        if (OPAL_UNLIKELY(0 == span)) {
            /* nothing to exchange at all */
            *request = &ompi_request_empty;
            NBC_Return_handle (handle);
            return MPI_SUCCESS;
        }
        handle->tmpbuf = malloc(span);
        if (OPAL_UNLIKELY(NULL == handle->tmpbuf)) {
            NBC_Return_handle (handle);
            return OMPI_ERR_OUT_OF_RESOURCE;
        }
        /* treat the receive parameters as the send parameters too */
        sendcounts = recvcounts;
        sdispls = rdispls;
    } else {
        res = ompi_datatype_type_extent (sendtype, &sndext);
        if (MPI_SUCCESS != res) {
            NBC_Error("MPI Error in ompi_datatype_type_extent() (%i)", res);
            NBC_Return_handle (handle);
            return res;
        }

        /* the local block does not travel over the wire: copy it now */
        if (sendcounts[rank] != 0) {
            rbuf = (char *) recvbuf + rdispls[rank] * rcvext;
            sbuf = (char *) sendbuf + sdispls[rank] * sndext;
            res = NBC_Copy (sbuf, sendcounts[rank], sendtype,
                            rbuf, recvcounts[rank], recvtype, comm);
            if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
                NBC_Return_handle (handle);   /* was leaked here before */
                return res;
            }
        }
    }

    schedule = OBJ_NEW(NBC_Schedule);
    if (OPAL_UNLIKELY(NULL == schedule)) {
        NBC_Return_handle (handle);
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    if (inplace) {
        res = a2av_sched_inplace(rank, p, schedule, recvbuf, recvcounts,
                                 rdispls, rcvext, recvtype, gap);
    } else {
        res = a2av_sched_linear(rank, p, schedule,
                                sendbuf, sendcounts, sdispls, sndext, sendtype,
                                recvbuf, recvcounts, rdispls, rcvext, recvtype);
    }

    if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
        NBC_Return_handle (handle);
        OBJ_RELEASE(schedule);
        return res;
    }

    res = NBC_Sched_commit (schedule);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
        NBC_Return_handle (handle);
        OBJ_RELEASE(schedule);
        return res;
    }

    res = NBC_Start(handle, schedule);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
        NBC_Return_handle (handle);
        OBJ_RELEASE(schedule);
        return res;
    }

    *request = (ompi_request_t *) handle;

    return OMPI_SUCCESS;
}
/* simple linear Alltoallw
 *
 * Build a non-blocking alltoallw schedule. For MPI_IN_PLACE, a scratch
 * buffer sized for the largest per-peer receive span is allocated and
 * the in-place scheduler is used; otherwise the local block is scheduled
 * as a copy and a flat send/recv schedule is built.
 *
 * Fix vs. previous version: when scheduling the local copy failed, the
 * function returned without releasing the schedule, leaking it. The
 * error path now mirrors the other failure paths.
 */
static int nbc_alltoallw_init(const void* sendbuf, const int *sendcounts, const int *sdispls,
                              struct ompi_datatype_t * const *sendtypes,
                              void* recvbuf, const int *recvcounts, const int *rdispls,
                              struct ompi_datatype_t * const *recvtypes,
                              struct ompi_communicator_t *comm, ompi_request_t ** request,
                              struct mca_coll_base_module_2_3_0_t *module, bool persistent)
{
    int rank, p, res;
    NBC_Schedule *schedule;
    char *rbuf, *sbuf, inplace;
    ptrdiff_t span=0;
    void *tmpbuf = NULL;
    ompi_coll_libnbc_module_t *libnbc_module = (ompi_coll_libnbc_module_t*) module;

    NBC_IN_PLACE(sendbuf, recvbuf, inplace);

    rank = ompi_comm_rank (comm);
    p = ompi_comm_size (comm);

    /* copy data to receivbuffer */
    if (inplace) {
        /* scratch space must hold the largest per-peer receive span;
         * each peer may use a different datatype in alltoallw */
        ptrdiff_t lgap, lspan;
        for (int i = 0; i < p; i++) {
            lspan = opal_datatype_span(&recvtypes[i]->super, recvcounts[i], &lgap);
            if (lspan > span) {
                span = lspan;
            }
        }
        if (OPAL_UNLIKELY(0 == span)) {
            /* nothing to exchange at all */
            return nbc_get_noop_request(persistent, request);
        }
        tmpbuf = malloc(span);
        if (OPAL_UNLIKELY(NULL == tmpbuf)) {
            return OMPI_ERR_OUT_OF_RESOURCE;
        }
        /* treat the receive parameters as the send parameters too */
        sendcounts = recvcounts;
        sdispls = rdispls;
        sendtypes = recvtypes;
    }

    schedule = OBJ_NEW(NBC_Schedule);
    if (OPAL_UNLIKELY(NULL == schedule)) {
        free(tmpbuf);
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    /* the local block does not travel over the wire: schedule a copy
     * (alltoallw displacements are byte offsets) */
    if (!inplace && sendcounts[rank] != 0) {
        rbuf = (char *) recvbuf + rdispls[rank];
        sbuf = (char *) sendbuf + sdispls[rank];
        res = NBC_Sched_copy(sbuf, false, sendcounts[rank], sendtypes[rank],
                             rbuf, false, recvcounts[rank], recvtypes[rank], schedule, false);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
            OBJ_RELEASE(schedule);   /* was leaked here before */
            return res;
        }
    }

    if (inplace) {
        res = a2aw_sched_inplace(rank, p, schedule, recvbuf,
                                 recvcounts, rdispls, recvtypes);
    } else {
        res = a2aw_sched_linear(rank, p, schedule,
                                sendbuf, sendcounts, sdispls, sendtypes,
                                recvbuf, recvcounts, rdispls, recvtypes);
    }

    if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
        OBJ_RELEASE(schedule);
        free(tmpbuf);
        return res;
    }

    res = NBC_Sched_commit (schedule);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
        OBJ_RELEASE(schedule);
        free(tmpbuf);
        return res;
    }

    /* tmpbuf ownership passes to the request on success */
    res = NBC_Schedule_request(schedule, comm, libnbc_module, persistent, request, tmpbuf);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
        OBJ_RELEASE(schedule);
        free(tmpbuf);
        return res;
    }

    return OMPI_SUCCESS;
}
/* In-place Alltoallw schedule: pairwise exchange through the schedule's
 * temporary buffer (referenced via the (void *)(-gap)/true "tmpbuf offset"
 * convention used by NBC_Sched_* calls).
 *
 * For each distance i in [1, (p+1)/2), a rank exchanges with speer = rank+i
 * and rpeer = rank-i (mod p):
 *   1. copy own rpeer-block into the temp buffer (it is about to be
 *      overwritten by the recv),
 *   2. send own speer-block to speer,
 *   3. recv the new rpeer-block into place,
 *   4. send the saved (old) rpeer-block from the temp buffer to rpeer,
 *   5. recv the new speer-block into place.
 * The `true` barrier flags on the copy/recv operations enforce the required
 * ordering; gap is recomputed per datatype so the temp-buffer offset is
 * correct for each block.
 *
 * For even p there is one unpaired "opposite" peer at distance p/2, handled
 * by the same save/send/recv dance after the loop.
 *
 * Returns OMPI_SUCCESS or the first scheduling error.
 */
static inline int a2aw_sched_inplace(int rank, int p, NBC_Schedule *schedule,
                                     void *buf, const int *counts, const int *displs,
                                     struct ompi_datatype_t * const * types) {
  ptrdiff_t gap;
  int res;

  for (int i = 1; i < (p+1)/2; i++) {
    int speer = (rank + i) % p;
    int rpeer = (rank + p - i) % p;
    char *sbuf = (char *) buf + displs[speer];
    char *rbuf = (char *) buf + displs[rpeer];

    if (0 != counts[rpeer]) {
      /* span only needed for gap; save old rpeer-block into temp buffer */
      (void)opal_datatype_span(&types[rpeer]->super, counts[rpeer], &gap);
      res = NBC_Sched_copy (rbuf, false, counts[rpeer], types[rpeer],
                            (void *)(-gap), true, counts[rpeer], types[rpeer],
                            schedule, true);
      if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
        return res;
      }
    }
    if (0 != counts[speer]) {
      res = NBC_Sched_send (sbuf, false , counts[speer], types[speer], speer,
                            schedule, false);
      if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
        return res;
      }
    }
    if (0 != counts[rpeer]) {
      /* barrier flag: block must arrive before it is forwarded below */
      res = NBC_Sched_recv (rbuf, false , counts[rpeer], types[rpeer], rpeer,
                            schedule, true);
      if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
        return res;
      }
    }

    if (0 != counts[rpeer]) {
      /* send the saved (pre-overwrite) block from the temp buffer */
      res = NBC_Sched_send ((void *)(-gap), true, counts[rpeer], types[rpeer], rpeer,
                            schedule, false);
      if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
        return res;
      }
    }
    if (0 != counts[speer]) {
      res = NBC_Sched_recv (sbuf, false, counts[speer], types[speer], speer,
                            schedule, true);
      if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
        return res;
      }
    }
  }

  /* even communicator size: one unpaired peer directly opposite */
  if (0 == (p%2)) {
    /* exchange with opposite */
    int peer = (rank + p/2) % p;

    char *tbuf = (char *) buf + displs[peer];
    (void)opal_datatype_span(&types[peer]->super, counts[peer], &gap);
    /* save own block, then swap with the opposite peer via the temp buffer */
    res = NBC_Sched_copy (tbuf, false, counts[peer], types[peer],
                          (void *)(-gap), true, counts[peer], types[peer],
                          schedule, true);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
      return res;
    }
    res = NBC_Sched_send ((void *)(-gap), true , counts[peer], types[peer], peer,
                          schedule, false);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
      return res;
    }
    res = NBC_Sched_recv (tbuf, false , counts[peer], types[peer], peer,
                          schedule, true);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
      return res;
    }
  }

  return OMPI_SUCCESS;
}
/* linear iexscan
 * working principle:
 * 1. each node (but node 0) receives from left neigbor
 * 2. performs op
 * 3. all but rank p-1 do sends to it's right neigbor and exits
 *
 * tmpbuf holds a (gap-shifted) copy of this rank's input; the schedule
 * addresses it through the (void *)(-gap)/true tmpbuf-offset convention.
 * Returns OMPI_SUCCESS or an OMPI error code; on failure all locally
 * allocated resources (schedule, tmpbuf) are released.
 */
int ompi_coll_libnbc_iexscan(const void* sendbuf, void* recvbuf, int count, MPI_Datatype datatype, MPI_Op op,
                             struct ompi_communicator_t *comm, ompi_request_t ** request,
                             struct mca_coll_base_module_2_2_0_t *module) {
    int rank, p, res;
    ptrdiff_t gap, span;
    NBC_Schedule *schedule;
#ifdef NBC_CACHE_SCHEDULE
    NBC_Scan_args *args, *found, search;
#endif
    char inplace;
    void *tmpbuf = NULL;
    ompi_coll_libnbc_module_t *libnbc_module = (ompi_coll_libnbc_module_t*) module;

    NBC_IN_PLACE(sendbuf, recvbuf, inplace);

    rank = ompi_comm_rank (comm);
    p = ompi_comm_size (comm);

    span = opal_datatype_span(&datatype->super, count, &gap);
    if (0 < rank) {
        /* keep a private copy of my contribution; recvbuf will be
         * overwritten by the partial result from the left neighbor */
        tmpbuf = malloc(span);
        if (NULL == tmpbuf) {
            return OMPI_ERR_OUT_OF_RESOURCE;
        }
        if (inplace) {
            res = NBC_Copy(recvbuf, count, datatype, (char *)tmpbuf-gap, count, datatype, comm);
        } else {
            res = NBC_Copy(sendbuf, count, datatype, (char *)tmpbuf-gap, count, datatype, comm);
        }
        if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
            free(tmpbuf);
            return res;
        }
    }

#ifdef NBC_CACHE_SCHEDULE
    /* search schedule in communicator specific tree */
    search.sendbuf = sendbuf;
    search.recvbuf = recvbuf;
    search.count = count;
    search.datatype = datatype;
    search.op = op;
    found = (NBC_Scan_args *) hb_tree_search ((hb_tree *) libnbc_module->NBC_Dict[NBC_EXSCAN], &search);
    if (NULL == found) {
#endif
        schedule = OBJ_NEW(NBC_Schedule);
        if (OPAL_UNLIKELY(NULL == schedule)) {
            free(tmpbuf);
            return OMPI_ERR_OUT_OF_RESOURCE;
        }

        if (rank != 0) {
            /* exscan result for this rank is exactly the left neighbor's prefix */
            res = NBC_Sched_recv (recvbuf, false, count, datatype, rank-1, schedule, false);
            if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
                OBJ_RELEASE(schedule);
                free(tmpbuf);
                return res;
            }

            if (rank < p - 1) {
                /* we have to wait until we have the data */
                res = NBC_Sched_barrier(schedule);
                if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
                    OBJ_RELEASE(schedule);
                    free(tmpbuf);
                    return res;
                }

                /* accumulate my own contribution into the temp copy */
                res = NBC_Sched_op (recvbuf, false, (void *)(-gap), true, count,
                                    datatype, op, schedule, true);
                if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
                    OBJ_RELEASE(schedule);
                    free(tmpbuf);
                    return res;
                }

                /* send reduced data onward */
                res = NBC_Sched_send ((void *)(-gap), true, count, datatype, rank + 1, schedule, false);
                if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
                    OBJ_RELEASE(schedule);
                    free(tmpbuf);
                    return res;
                }
            }
        } else if (p > 1) {
            /* rank 0 only seeds the pipeline with its input */
            if (inplace) {
                res = NBC_Sched_send (recvbuf, false, count, datatype, 1, schedule, false);
            } else {
                res = NBC_Sched_send (sendbuf, false, count, datatype, 1, schedule, false);
            }
            if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
                OBJ_RELEASE(schedule);
                free(tmpbuf);
                return res;
            }
        }

        res = NBC_Sched_commit(schedule);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
            OBJ_RELEASE(schedule);
            free(tmpbuf);
            return res;
        }

#ifdef NBC_CACHE_SCHEDULE
        /* save schedule to tree */
        /* FIX: allocate the struct, not the pointer — sizeof(args) only
         * reserved sizeof(void *) bytes and the field stores below overflowed
         * the heap block */
        args = (NBC_Scan_args *) malloc (sizeof (*args));
        if (NULL != args) {
            args->sendbuf = sendbuf;
            args->recvbuf = recvbuf;
            args->count = count;
            args->datatype = datatype;
            args->op = op;
            args->schedule = schedule;

            res = hb_tree_insert ((hb_tree *) libnbc_module->NBC_Dict[NBC_EXSCAN], args, args, 0);
            if (0 == res) {
                OBJ_RETAIN(schedule);

                /* increase number of elements for A2A */
                if (++libnbc_module->NBC_Dict_size[NBC_EXSCAN] > NBC_SCHED_DICT_UPPER) {
                    NBC_SchedCache_dictwipe ((hb_tree *) libnbc_module->NBC_Dict[NBC_EXSCAN],
                                             &libnbc_module->NBC_Dict_size[NBC_EXSCAN]);
                }
            } else {
                NBC_Error("error in dict_insert() (%i)", res);
                free (args);
            }
        }
    } else {
        /* found schedule */
        schedule = found->schedule;
        OBJ_RETAIN(schedule);
    }
#endif

    res = NBC_Schedule_request(schedule, comm, libnbc_module, request, tmpbuf);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
        OBJ_RELEASE(schedule);
        free(tmpbuf);
        return res;
    }

    return OMPI_SUCCESS;
}
/*
 * In-place MPI_Alltoallv fallback: O(size^2) pairwise exchanges through a
 * single temporary buffer sized for the largest per-peer block.
 *
 * For every pair (i, j) with i < j, the rank that is one endpoint copies its
 * outgoing block to the temp buffer, then does an irecv/isend exchange with
 * the other endpoint and waits for both requests before the next pair.
 *
 * rbuf/rcounts/rdisps/rdtype: same as the recv side of MPI_Alltoallv
 * (in-place, so they describe both directions).
 * Returns MPI_SUCCESS or an error code.
 */
int
mca_coll_base_alltoallv_intra_basic_inplace(const void *rbuf, const int *rcounts, const int *rdisps,
                                            struct ompi_datatype_t *rdtype,
                                            struct ompi_communicator_t *comm,
                                            mca_coll_base_module_t *module)
{
    mca_coll_base_module_t *base_module = (mca_coll_base_module_t*) module;
    int i, j, size, rank, err=MPI_SUCCESS;
    ompi_request_t **preq, **reqs;
    char *allocated_buffer, *tmp_buffer;
    size_t max_size, rdtype_size;
    OPAL_PTRDIFF_TYPE ext, gap = 0;

    /* Initialize. */
    size = ompi_comm_size(comm);
    rank = ompi_comm_rank(comm);
    ompi_datatype_type_size(rdtype, &rdtype_size);

    /* If only one process, we're done. */
    if (1 == size || 0 == rdtype_size) {
        return MPI_SUCCESS;
    }

    /* Find the largest receive amount */
    ompi_datatype_type_extent (rdtype, &ext);
    for (i = 0, max_size = 0 ; i < size ; ++i) {
        /* FIX: renamed from `size` — it shadowed the communicator size above */
        size_t cur_size = opal_datatype_span(&rdtype->super, rcounts[i], &gap);

        max_size = cur_size > max_size ? cur_size : max_size;
    }
    /* The gap will always be the same as we are working on the same datatype */

    /* Allocate a temporary buffer */
    allocated_buffer = calloc (max_size, 1);
    if (NULL == allocated_buffer) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }
    tmp_buffer = allocated_buffer - gap;

    /* Initiate all send/recv to/from others. */
    reqs = preq = coll_base_comm_get_reqs(base_module->base_data, 2);
    if( NULL == reqs ) { err = OMPI_ERR_OUT_OF_RESOURCE; goto error_hndl; }

    /* in-place alltoallv slow algorithm (but works) */
    for (i = 0 ; i < size ; ++i) {
        for (j = i+1 ; j < size ; ++j) {
            preq = reqs;

            if (i == rank && rcounts[j]) {
                /* Copy the data into the temporary buffer */
                err = ompi_datatype_copy_content_same_ddt (rdtype, rcounts[j],
                                                           tmp_buffer, (char *) rbuf + rdisps[j] * ext);
                if (MPI_SUCCESS != err) { goto error_hndl; }

                /* Exchange data with the peer */
                err = MCA_PML_CALL(irecv ((char *) rbuf + rdisps[j] * ext, rcounts[j], rdtype,
                                          j, MCA_COLL_BASE_TAG_ALLTOALLV, comm, preq++));
                if (MPI_SUCCESS != err) { goto error_hndl; }

                err = MCA_PML_CALL(isend ((void *) tmp_buffer, rcounts[j], rdtype,
                                          j, MCA_COLL_BASE_TAG_ALLTOALLV, MCA_PML_BASE_SEND_STANDARD,
                                          comm, preq++));
                if (MPI_SUCCESS != err) { goto error_hndl; }
            } else if (j == rank && rcounts[i]) {
                /* Copy the data into the temporary buffer */
                err = ompi_datatype_copy_content_same_ddt (rdtype, rcounts[i],
                                                           tmp_buffer, (char *) rbuf + rdisps[i] * ext);
                if (MPI_SUCCESS != err) { goto error_hndl; }

                /* Exchange data with the peer */
                err = MCA_PML_CALL(irecv ((char *) rbuf + rdisps[i] * ext, rcounts[i], rdtype,
                                          i, MCA_COLL_BASE_TAG_ALLTOALLV, comm, preq++));
                if (MPI_SUCCESS != err) { goto error_hndl; }

                err = MCA_PML_CALL(isend ((void *) tmp_buffer, rcounts[i], rdtype,
                                          i, MCA_COLL_BASE_TAG_ALLTOALLV, MCA_PML_BASE_SEND_STANDARD,
                                          comm, preq++));
                if (MPI_SUCCESS != err) { goto error_hndl; }
            } else {
                continue;
            }

            /* Wait for the requests to complete */
            err = ompi_request_wait_all (2, reqs, MPI_STATUSES_IGNORE);
            if (MPI_SUCCESS != err) { goto error_hndl; }
        }
    }

 error_hndl:
    /* Free the temporary buffer */
    free (allocated_buffer);

    /* FIX: guard against reqs == NULL — the allocation-failure path above
     * jumps here with err set, and freeing a NULL request array would crash */
    if( MPI_SUCCESS != err && NULL != reqs ) {
        ompi_coll_base_free_reqs(reqs, 2 );
    }

    /* All done */
    return err;
}
/* * reduce_scatter * * Function: - reduce then scatter * Accepts: - same as MPI_Reduce_scatter() * Returns: - MPI_SUCCESS or error code * * Algorithm: * Cummutative, reasonable sized messages * recursive halving algorithm * Others: * reduce and scatterv (needs to be cleaned * up at some point) * * NOTE: that the recursive halving algorithm should be faster than * the reduce/scatter for all message sizes. However, the memory * usage for the recusive halving is msg_size + 2 * comm_size greater * for the recursive halving, so I've limited where the recursive * halving is used to be nice to the app memory wise. There are much * better algorithms for large messages with commutative operations, * so this should be investigated further. */ int mca_coll_basic_reduce_scatter_intra(const void *sbuf, void *rbuf, const int *rcounts, struct ompi_datatype_t *dtype, struct ompi_op_t *op, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { int i, rank, size, count, err = OMPI_SUCCESS; ptrdiff_t extent, buf_size, gap; int *disps = NULL; char *recv_buf = NULL, *recv_buf_free = NULL; char *result_buf = NULL, *result_buf_free = NULL; /* Initialize */ rank = ompi_comm_rank(comm); size = ompi_comm_size(comm); /* Find displacements and the like */ disps = (int*) malloc(sizeof(int) * size); if (NULL == disps) return OMPI_ERR_OUT_OF_RESOURCE; disps[0] = 0; for (i = 0; i < (size - 1); ++i) { disps[i + 1] = disps[i] + rcounts[i]; } count = disps[size - 1] + rcounts[size - 1]; /* short cut the trivial case */ if (0 == count) { free(disps); return OMPI_SUCCESS; } /* get datatype information */ ompi_datatype_type_extent(dtype, &extent); buf_size = opal_datatype_span(&dtype->super, count, &gap); /* Handle MPI_IN_PLACE */ if (MPI_IN_PLACE == sbuf) { sbuf = rbuf; } if ((op->o_flags & OMPI_OP_FLAGS_COMMUTE) && (buf_size < COMMUTATIVE_LONG_MSG)) { int tmp_size, remain = 0, tmp_rank; /* temporary receive buffer. 
See coll_basic_reduce.c for details on sizing */ recv_buf_free = (char*) malloc(buf_size); recv_buf = recv_buf_free - gap; if (NULL == recv_buf_free) { err = OMPI_ERR_OUT_OF_RESOURCE; goto cleanup; } /* allocate temporary buffer for results */ result_buf_free = (char*) malloc(buf_size); result_buf = result_buf_free - gap; /* copy local buffer into the temporary results */ err = ompi_datatype_sndrcv(sbuf, count, dtype, result_buf, count, dtype); if (OMPI_SUCCESS != err) goto cleanup; /* figure out power of two mapping: grow until larger than comm size, then go back one, to get the largest power of two less than comm size */ tmp_size = opal_next_poweroftwo(size); tmp_size >>= 1; remain = size - tmp_size; /* If comm size is not a power of two, have the first "remain" procs with an even rank send to rank + 1, leaving a power of two procs to do the rest of the algorithm */ if (rank < 2 * remain) { if ((rank & 1) == 0) { err = MCA_PML_CALL(send(result_buf, count, dtype, rank + 1, MCA_COLL_BASE_TAG_REDUCE_SCATTER, MCA_PML_BASE_SEND_STANDARD, comm)); if (OMPI_SUCCESS != err) goto cleanup; /* we don't participate from here on out */ tmp_rank = -1; } else { err = MCA_PML_CALL(recv(recv_buf, count, dtype, rank - 1, MCA_COLL_BASE_TAG_REDUCE_SCATTER, comm, MPI_STATUS_IGNORE)); if (OMPI_SUCCESS != err) goto cleanup; /* integrate their results into our temp results */ ompi_op_reduce(op, recv_buf, result_buf, count, dtype); /* adjust rank to be the bottom "remain" ranks */ tmp_rank = rank / 2; } } else { /* just need to adjust rank to show that the bottom "even remain" ranks dropped out */ tmp_rank = rank - remain; } /* For ranks not kicked out by the above code, perform the recursive halving */ if (tmp_rank >= 0) { int *tmp_disps = NULL, *tmp_rcounts = NULL; int mask, send_index, recv_index, last_index; /* recalculate disps and rcounts to account for the special "remainder" processes that are no longer doing anything */ tmp_rcounts = (int*) malloc(tmp_size * sizeof(int)); if 
(NULL == tmp_rcounts) { err = OMPI_ERR_OUT_OF_RESOURCE; goto cleanup; } tmp_disps = (int*) malloc(tmp_size * sizeof(int)); if (NULL == tmp_disps) { free(tmp_rcounts); err = OMPI_ERR_OUT_OF_RESOURCE; goto cleanup; } for (i = 0 ; i < tmp_size ; ++i) { if (i < remain) { /* need to include old neighbor as well */ tmp_rcounts[i] = rcounts[i * 2 + 1] + rcounts[i * 2]; } else { tmp_rcounts[i] = rcounts[i + remain]; } } tmp_disps[0] = 0; for (i = 0; i < tmp_size - 1; ++i) { tmp_disps[i + 1] = tmp_disps[i] + tmp_rcounts[i]; } /* do the recursive halving communication. Don't use the dimension information on the communicator because I think the information is invalidated by our "shrinking" of the communicator */ mask = tmp_size >> 1; send_index = recv_index = 0; last_index = tmp_size; while (mask > 0) { int tmp_peer, peer, send_count, recv_count; struct ompi_request_t *request; tmp_peer = tmp_rank ^ mask; peer = (tmp_peer < remain) ? tmp_peer * 2 + 1 : tmp_peer + remain; /* figure out if we're sending, receiving, or both */ send_count = recv_count = 0; if (tmp_rank < tmp_peer) { send_index = recv_index + mask; for (i = send_index ; i < last_index ; ++i) { send_count += tmp_rcounts[i]; } for (i = recv_index ; i < send_index ; ++i) { recv_count += tmp_rcounts[i]; } } else { recv_index = send_index + mask; for (i = send_index ; i < recv_index ; ++i) { send_count += tmp_rcounts[i]; } for (i = recv_index ; i < last_index ; ++i) { recv_count += tmp_rcounts[i]; } } /* actual data transfer. 
Send from result_buf, receive into recv_buf */ if (recv_count > 0) { err = MCA_PML_CALL(irecv(recv_buf + tmp_disps[recv_index] * extent, recv_count, dtype, peer, MCA_COLL_BASE_TAG_REDUCE_SCATTER, comm, &request)); if (OMPI_SUCCESS != err) { free(tmp_rcounts); free(tmp_disps); goto cleanup; } } if (send_count > 0) { err = MCA_PML_CALL(send(result_buf + tmp_disps[send_index] * extent, send_count, dtype, peer, MCA_COLL_BASE_TAG_REDUCE_SCATTER, MCA_PML_BASE_SEND_STANDARD, comm)); if (OMPI_SUCCESS != err) { free(tmp_rcounts); free(tmp_disps); goto cleanup; } } /* if we received something on this step, push it into the results buffer */ if (recv_count > 0) { err = ompi_request_wait(&request, MPI_STATUS_IGNORE); if (OMPI_SUCCESS != err) { free(tmp_rcounts); free(tmp_disps); goto cleanup; } ompi_op_reduce(op, recv_buf + tmp_disps[recv_index] * extent, result_buf + tmp_disps[recv_index] * extent, recv_count, dtype); } /* update for next iteration */ send_index = recv_index; last_index = recv_index + mask; mask >>= 1; } /* copy local results from results buffer into real receive buffer */ if (0 != rcounts[rank]) { err = ompi_datatype_sndrcv(result_buf + disps[rank] * extent, rcounts[rank], dtype, rbuf, rcounts[rank], dtype); if (OMPI_SUCCESS != err) { free(tmp_rcounts); free(tmp_disps); goto cleanup; } } free(tmp_rcounts); free(tmp_disps); }
/*
 * reduce_intra_in_order_binary
 *
 * Function:    Logarithmic reduce operation for non-commutative operations.
 * Accepts:     same as MPI_Reduce() plus segsize / max_outstanding_reqs
 *              tuning knobs
 * Returns:     MPI_SUCCESS or error code
 *
 * An in-order binary tree must be rooted at rank (size-1) to preserve the
 * order of operations for non-commutative ops.  If the caller's root is not
 * (size-1), rank (size-1) acts as an internal io_root and the result is
 * forwarded to the real root afterwards; a temporary buffer covers the
 * MPI_IN_PLACE-on-root and io_root-receive cases.
 */
int ompi_coll_base_reduce_intra_in_order_binary( const void *sendbuf, void *recvbuf,
                                                 int count, ompi_datatype_t* datatype,
                                                 ompi_op_t* op, int root,
                                                 ompi_communicator_t* comm,
                                                 mca_coll_base_module_t *module,
                                                 uint32_t segsize,
                                                 int max_outstanding_reqs  )
{
    int ret, rank, size, io_root, segcount = count;
    void *use_this_sendbuf = NULL;
    void *use_this_recvbuf = NULL;
    char *tmpbuf_free = NULL;
    size_t typelng;
    mca_coll_base_module_t *base_module = (mca_coll_base_module_t*) module;
    mca_coll_base_comm_t *data = base_module->base_data;

    rank = ompi_comm_rank(comm);
    size = ompi_comm_size(comm);
    OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:reduce_intra_in_order_binary rank %d ss %5d", rank, segsize));

    COLL_BASE_UPDATE_IN_ORDER_BINTREE( comm, base_module );

    /**
     * Determine number of segments and number of elements
     * sent per operation
     */
    ompi_datatype_type_size( datatype, &typelng );
    COLL_BASE_COMPUTED_SEGCOUNT( segsize, typelng, segcount );

    /* An in-order binary tree must use root (size-1) to preserve the order of
       operations.  Thus, if root is not rank (size - 1), then we must handle
       1. MPI_IN_PLACE option on real root, and
       2. we must allocate temporary recvbuf on rank (size - 1).
       Note that generic function must be careful not to switch order of
       operations for non-commutative ops.
    */
    io_root = size - 1;
    use_this_sendbuf = (void *)sendbuf;
    use_this_recvbuf = recvbuf;
    if (io_root != root) {
        ptrdiff_t dsize, gap = 0;
        char *tmpbuf;

        dsize = opal_datatype_span(&datatype->super, count, &gap);

        if ((root == rank) && (MPI_IN_PLACE == sendbuf)) {
            /* real root sends its in-place data from a temp copy */
            tmpbuf_free = (char *) malloc(dsize);
            if (NULL == tmpbuf_free) {
                return MPI_ERR_INTERN;
            }
            tmpbuf = tmpbuf_free - gap;
            ompi_datatype_copy_content_same_ddt(datatype, count,
                                                (char*)tmpbuf,
                                                (char*)recvbuf);
            use_this_sendbuf = tmpbuf;
        } else if (io_root == rank) {
            /* io_root collects the result into a temp buffer */
            tmpbuf_free = (char *) malloc(dsize);
            if (NULL == tmpbuf_free) {
                return MPI_ERR_INTERN;
            }
            tmpbuf = tmpbuf_free - gap;
            use_this_recvbuf = tmpbuf;
        }
    }

    /* Use generic reduce with in-order binary tree topology and io_root */
    ret = ompi_coll_base_reduce_generic( use_this_sendbuf, use_this_recvbuf, count, datatype,
                                         op, io_root, comm, module,
                                         data->cached_in_order_bintree,
                                         segcount, max_outstanding_reqs );
    if (MPI_SUCCESS != ret) {
        /* FIX: the original returned without releasing tmpbuf_free (leak);
           free(NULL) is a no-op so this is safe on every rank */
        free(tmpbuf_free);
        return ret;
    }

    /* Clean up */
    if (io_root != root) {
        if (root == rank) {
            /* Receive result from rank io_root to recvbuf */
            ret = MCA_PML_CALL(recv(recvbuf, count, datatype, io_root,
                                    MCA_COLL_BASE_TAG_REDUCE, comm,
                                    MPI_STATUS_IGNORE));
            if (MPI_SUCCESS != ret) {
                free(tmpbuf_free);  /* FIX: leak on error path */
                return ret;
            }

        } else if (io_root == rank) {
            /* Send result from use_this_recvbuf to root */
            ret = MCA_PML_CALL(send(use_this_recvbuf, count, datatype, root,
                                    MCA_COLL_BASE_TAG_REDUCE,
                                    MCA_PML_BASE_SEND_STANDARD, comm));
            if (MPI_SUCCESS != ret) {
                free(tmpbuf_free);  /* FIX: leak on error path */
                return ret;
            }
        }
    }

    free(tmpbuf_free);

    return MPI_SUCCESS;
}
/* * ompi_coll_base_allreduce_intra_recursivedoubling * * Function: Recursive doubling algorithm for allreduce operation * Accepts: Same as MPI_Allreduce() * Returns: MPI_SUCCESS or error code * * Description: Implements recursive doubling algorithm for allreduce. * Original (non-segmented) implementation is used in MPICH-2 * for small and intermediate size messages. * The algorithm preserves order of operations so it can * be used both by commutative and non-commutative operations. * * Example on 7 nodes: * Initial state * # 0 1 2 3 4 5 6 * [0] [1] [2] [3] [4] [5] [6] * Initial adjustment step for non-power of two nodes. * old rank 1 3 5 6 * new rank 0 1 2 3 * [0+1] [2+3] [4+5] [6] * Step 1 * old rank 1 3 5 6 * new rank 0 1 2 3 * [0+1+] [0+1+] [4+5+] [4+5+] * [2+3+] [2+3+] [6 ] [6 ] * Step 2 * old rank 1 3 5 6 * new rank 0 1 2 3 * [0+1+] [0+1+] [0+1+] [0+1+] * [2+3+] [2+3+] [2+3+] [2+3+] * [4+5+] [4+5+] [4+5+] [4+5+] * [6 ] [6 ] [6 ] [6 ] * Final adjustment step for non-power of two nodes * # 0 1 2 3 4 5 6 * [0+1+] [0+1+] [0+1+] [0+1+] [0+1+] [0+1+] [0+1+] * [2+3+] [2+3+] [2+3+] [2+3+] [2+3+] [2+3+] [2+3+] * [4+5+] [4+5+] [4+5+] [4+5+] [4+5+] [4+5+] [4+5+] * [6 ] [6 ] [6 ] [6 ] [6 ] [6 ] [6 ] * */ int ompi_coll_base_allreduce_intra_recursivedoubling(const void *sbuf, void *rbuf, int count, struct ompi_datatype_t *dtype, struct ompi_op_t *op, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { int ret, line, rank, size, adjsize, remote, distance; int newrank, newremote, extra_ranks; char *tmpsend = NULL, *tmprecv = NULL, *tmpswap = NULL, *inplacebuf_free = NULL, *inplacebuf; ompi_request_t *reqs[2] = {NULL, NULL}; OPAL_PTRDIFF_TYPE span, gap; size = ompi_comm_size(comm); rank = ompi_comm_rank(comm); OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "coll:base:allreduce_intra_recursivedoubling rank %d", rank)); /* Special case for size == 1 */ if (1 == size) { if (MPI_IN_PLACE != sbuf) { ret = ompi_datatype_copy_content_same_ddt(dtype, count, 
(char*)rbuf, (char*)sbuf); if (ret < 0) { line = __LINE__; goto error_hndl; } } return MPI_SUCCESS; } /* Allocate and initialize temporary send buffer */ span = opal_datatype_span(&dtype->super, count, &gap); inplacebuf_free = (char*) malloc(span); if (NULL == inplacebuf_free) { ret = -1; line = __LINE__; goto error_hndl; } inplacebuf = inplacebuf_free - gap; if (MPI_IN_PLACE == sbuf) { ret = ompi_datatype_copy_content_same_ddt(dtype, count, inplacebuf, (char*)rbuf); if (ret < 0) { line = __LINE__; goto error_hndl; } } else { ret = ompi_datatype_copy_content_same_ddt(dtype, count, inplacebuf, (char*)sbuf); if (ret < 0) { line = __LINE__; goto error_hndl; } } tmpsend = (char*) inplacebuf; tmprecv = (char*) rbuf; /* Determine nearest power of two less than or equal to size */ adjsize = opal_next_poweroftwo (size); adjsize >>= 1; /* Handle non-power-of-two case: - Even ranks less than 2 * extra_ranks send their data to (rank + 1), and sets new rank to -1. - Odd ranks less than 2 * extra_ranks receive data from (rank - 1), apply appropriate operation, and set new rank to rank/2 - Everyone else sets rank to rank - extra_ranks */ extra_ranks = size - adjsize; if (rank < (2 * extra_ranks)) { if (0 == (rank % 2)) { ret = MCA_PML_CALL(send(tmpsend, count, dtype, (rank + 1), MCA_COLL_BASE_TAG_ALLREDUCE, MCA_PML_BASE_SEND_STANDARD, comm)); if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; } newrank = -1; } else { ret = MCA_PML_CALL(recv(tmprecv, count, dtype, (rank - 1), MCA_COLL_BASE_TAG_ALLREDUCE, comm, MPI_STATUS_IGNORE)); if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; } /* tmpsend = tmprecv (op) tmpsend */ ompi_op_reduce(op, tmprecv, tmpsend, count, dtype); newrank = rank >> 1; } } else {
/*
 * reduce_lin_intra
 *
 * Function:    - reduction using O(N) algorithm
 * Accepts:     - same as MPI_Reduce()
 * Returns:     - MPI_SUCCESS or error code
 *
 * Non-root ranks just send their contribution to root.  Root seeds its
 * accumulator with rank (size-1)'s data, then receives from ranks
 * size-2 .. 0 in order and folds each one in with ompi_op_reduce —
 * this fixed order keeps non-commutative operations correct.
 * MPI_IN_PLACE on root is handled by accumulating into a temporary and
 * copying back at the end.
 */
int
ompi_coll_base_reduce_intra_basic_linear(const void *sbuf, void *rbuf, int count,
                                         struct ompi_datatype_t *dtype,
                                         struct ompi_op_t *op,
                                         int root,
                                         struct ompi_communicator_t *comm,
                                         mca_coll_base_module_t *module)
{
    int i, rank, err, size;
    ptrdiff_t extent, dsize, gap = 0;
    char *free_buffer = NULL;
    char *pml_buffer = NULL;
    char *inplace_temp_free = NULL;
    char *inbuf;

    /* Initialize */
    rank = ompi_comm_rank(comm);
    size = ompi_comm_size(comm);

    /* If not root, send data to the root. */
    if (rank != root) {
        err = MCA_PML_CALL(send(sbuf, count, dtype, root,
                                MCA_COLL_BASE_TAG_REDUCE,
                                MCA_PML_BASE_SEND_STANDARD, comm));
        return err;
    }

    dsize = opal_datatype_span(&dtype->super, count, &gap);
    ompi_datatype_type_extent(dtype, &extent);

    if (MPI_IN_PLACE == sbuf) {
        /* accumulate into a temp; the caller's buffer is copied back at the end */
        sbuf = rbuf;
        inplace_temp_free = (char*)malloc(dsize);
        if (NULL == inplace_temp_free) {
            return OMPI_ERR_OUT_OF_RESOURCE;
        }
        rbuf = inplace_temp_free - gap;
    }

    if (size > 1) {
        free_buffer = (char*)malloc(dsize);
        if (NULL == free_buffer) {
            if (NULL != inplace_temp_free) {
                free(inplace_temp_free);
            }
            return OMPI_ERR_OUT_OF_RESOURCE;
        }
        pml_buffer = free_buffer - gap;
    }

    /* Initialize the receive buffer. */
    if (rank == (size - 1)) {
        err = ompi_datatype_copy_content_same_ddt(dtype, count, (char*)rbuf, (char*)sbuf);
    } else {
        err = MCA_PML_CALL(recv(rbuf, count, dtype, size - 1,
                                MCA_COLL_BASE_TAG_REDUCE, comm,
                                MPI_STATUS_IGNORE));
    }
    if (MPI_SUCCESS != err) {
        if (NULL != free_buffer) {
            free(free_buffer);
        }
        free(inplace_temp_free);  /* FIX: was leaked on this error path */
        return err;
    }

    /* Loop receiving and calling reduction function (C or Fortran). */
    for (i = size - 2; i >= 0; --i) {
        if (rank == i) {
            inbuf = (char*)sbuf;
        } else {
            err = MCA_PML_CALL(recv(pml_buffer, count, dtype, i,
                                    MCA_COLL_BASE_TAG_REDUCE, comm,
                                    MPI_STATUS_IGNORE));
            if (MPI_SUCCESS != err) {
                if (NULL != free_buffer) {
                    free(free_buffer);
                }
                free(inplace_temp_free);  /* FIX: was leaked on this error path */
                return err;
            }

            inbuf = pml_buffer;
        }

        /* Perform the reduction */
        ompi_op_reduce(op, inbuf, rbuf, count, dtype);
    }

    if (NULL != inplace_temp_free) {
        /* copy accumulated result back into the caller's (in-place) buffer;
         * sbuf aliases the original rbuf here */
        err = ompi_datatype_copy_content_same_ddt(dtype, count, (char*)sbuf, rbuf);
        free(inplace_temp_free);
        if (MPI_SUCCESS != err) {
            /* FIX: the original silently discarded this error */
            free(free_buffer);
            return err;
        }
    }
    if (NULL != free_buffer) {
        free(free_buffer);
    }

    /* All done */
    return MPI_SUCCESS;
}
/*
 * allgather_inter
 *
 * Function:    - allgather using other MPI collections
 * Accepts:     - same as MPI_Allgather()
 * Returns:     - MPI_SUCCESS or error code
 *
 * Three phases, each executed simultaneously by both groups of the
 * inter-communicator (which is why the regular gather/bcast collectives
 * cannot be used — they would deadlock):
 *   1. local gather to the group's rank 0 ("root"),
 *   2. the two roots swap their gathered buffers,
 *   3. each root broadcasts the remote group's data to its own group.
 */
int
mca_coll_basic_allgather_inter(const void *sbuf, int scount,
                               struct ompi_datatype_t *sdtype,
                               void *rbuf, int rcount,
                               struct ompi_datatype_t *rdtype,
                               struct ompi_communicator_t *comm,
                               mca_coll_base_module_t *module)
{
    int rank, root = 0, size, rsize, err, i, line;
    char *tmpbuf_free = NULL, *tmpbuf, *ptmp;
    ptrdiff_t rlb, rextent, incr;
    ptrdiff_t gap, span;
    ompi_request_t *req;
    ompi_request_t **reqs = NULL;

    rank = ompi_comm_rank(comm);
    size = ompi_comm_size(comm);
    rsize = ompi_comm_remote_size(comm);

    /* Algorithm:
     * - a gather to the root in remote group (simultaniously executed,
     * thats why we cannot use coll_gather).
     * - exchange the temp-results between two roots
     * - inter-bcast (again simultanious).
     */

    /* Step one: gather operations: */
    if (rank != root) {
        /* send your data to root */
        err = MCA_PML_CALL(send(sbuf, scount, sdtype, root,
                                MCA_COLL_BASE_TAG_ALLGATHER,
                                MCA_PML_BASE_SEND_STANDARD, comm));
        if (OMPI_SUCCESS != err) {
            line = __LINE__; goto exit;
        }
    } else {
        /* receive a msg. from all other procs. */
        err = ompi_datatype_get_extent(rdtype, &rlb, &rextent);
        if (OMPI_SUCCESS != err) {
            line = __LINE__; goto exit;
        }

        /* Get a requests arrays of the right size */
        reqs = ompi_coll_base_comm_get_reqs(module->base_data, rsize + 1);
        if( NULL == reqs ) {
            line = __LINE__; err = OMPI_ERR_OUT_OF_RESOURCE; goto exit;
        }

        /* Do a send-recv between the two root procs. to avoid deadlock */
        /* NOTE: here "0" addresses the REMOTE group's root; on an
         * inter-communicator point-to-point ranks are remote ranks */
        err = MCA_PML_CALL(isend(sbuf, scount, sdtype, 0,
                                 MCA_COLL_BASE_TAG_ALLGATHER,
                                 MCA_PML_BASE_SEND_STANDARD,
                                 comm, &reqs[rsize]));
        if (OMPI_SUCCESS != err) {
            line = __LINE__; goto exit;
        }

        err = MCA_PML_CALL(irecv(rbuf, rcount, rdtype, 0,
                                 MCA_COLL_BASE_TAG_ALLGATHER, comm,
                                 &reqs[0]));
        if (OMPI_SUCCESS != err) {
            line = __LINE__; goto exit;
        }

        /* gather the local group's contributions, one block per remote rank */
        incr = rextent * rcount;
        ptmp = (char *) rbuf + incr;
        for (i = 1; i < rsize; ++i, ptmp += incr) {
            err = MCA_PML_CALL(irecv(ptmp, rcount, rdtype, i,
                                     MCA_COLL_BASE_TAG_ALLGATHER,
                                     comm, &reqs[i]));
            if (MPI_SUCCESS != err) {
                line = __LINE__; goto exit;
            }
        }

        err = ompi_request_wait_all(rsize + 1, reqs, MPI_STATUSES_IGNORE);
        if (OMPI_SUCCESS != err) {
            line = __LINE__; goto exit;
        }

        /* Step 2: exchange the resuts between the root processes */
        span = opal_datatype_span(&sdtype->super, (int64_t)scount * (int64_t)size, &gap);
        tmpbuf_free = (char *) malloc(span);
        if (NULL == tmpbuf_free) {
            line = __LINE__; err = OMPI_ERR_OUT_OF_RESOURCE; goto exit;
        }
        tmpbuf = tmpbuf_free - gap;

        /* isend+recv pair so the two roots cannot deadlock on each other */
        err = MCA_PML_CALL(isend(rbuf, rsize * rcount, rdtype, 0,
                                 MCA_COLL_BASE_TAG_ALLGATHER,
                                 MCA_PML_BASE_SEND_STANDARD, comm, &req));
        if (OMPI_SUCCESS != err) {
            line = __LINE__; goto exit;
        }

        err = MCA_PML_CALL(recv(tmpbuf, size * scount, sdtype, 0,
                                MCA_COLL_BASE_TAG_ALLGATHER, comm,
                                MPI_STATUS_IGNORE));
        if (OMPI_SUCCESS != err) {
            line = __LINE__; goto exit;
        }

        err = ompi_request_wait( &req, MPI_STATUS_IGNORE);
        if (OMPI_SUCCESS != err) {
            line = __LINE__; goto exit;
        }
    }

    /* Step 3: bcast the data to the remote group. This
     * happens in both groups simultaneously, thus we can
     * not use coll_bcast (this would deadlock).
     */
    if (rank != root) {
        /* post the recv */
        err = MCA_PML_CALL(recv(rbuf, rsize * rcount, rdtype, 0,
                                MCA_COLL_BASE_TAG_ALLGATHER, comm,
                                MPI_STATUS_IGNORE));
        if (OMPI_SUCCESS != err) {
            line = __LINE__; goto exit;
        }
    } else {
        /* Send the data to every other process in the remote group
         * except to rank zero. which has it already. */
        for (i = 1; i < rsize; i++) {
            err = MCA_PML_CALL(isend(tmpbuf, size * scount, sdtype, i,
                                     MCA_COLL_BASE_TAG_ALLGATHER,
                                     MCA_PML_BASE_SEND_STANDARD,
                                     comm, &reqs[i - 1]));
            if (OMPI_SUCCESS != err) {
                line = __LINE__; goto exit;
            }
        }

        err = ompi_request_wait_all(rsize - 1, reqs, MPI_STATUSES_IGNORE);
        if (OMPI_SUCCESS != err) {
            line = __LINE__; goto exit;
        }
    }

 exit:
    if( MPI_SUCCESS != err ) {
        OPAL_OUTPUT( (ompi_coll_base_framework.framework_output,"%s:%4d\tError occurred %d, rank %2d", __FILE__, line, err, rank) );
        (void)line;  // silence compiler warning
        if( NULL != reqs ) ompi_coll_base_free_reqs(reqs, rsize+1);
    }
    if (NULL != tmpbuf_free) {
        free(tmpbuf_free);
    }
    return err;
}
/* linear iscan
 * working principle:
 * 1. each node (but node 0) receives from left neighbor
 * 2. performs op
 * 3. all but rank p-1 do sends to it's right neighbor and exits
 *
 * handle->tmpbuf stages the incoming partial result from the left neighbor
 * (addressed through the (void *)(-gap)/true tmpbuf-offset convention) so
 * that the op can fold it into recvbuf.  The schedule and tmpbuf are owned
 * by the handle and released with it.
 * Returns OMPI_SUCCESS or an OMPI error code.
 */
int ompi_coll_libnbc_iscan(const void* sendbuf, void* recvbuf, int count, MPI_Datatype datatype, MPI_Op op,
                           struct ompi_communicator_t *comm, ompi_request_t ** request,
                           struct mca_coll_base_module_2_1_0_t *module) {
  int rank, p, res;
  ptrdiff_t gap, span;
  NBC_Schedule *schedule;
  char inplace;
  NBC_Handle *handle;
  ompi_coll_libnbc_module_t *libnbc_module = (ompi_coll_libnbc_module_t*) module;

  NBC_IN_PLACE(sendbuf, recvbuf, inplace);

  rank = ompi_comm_rank (comm);
  p = ompi_comm_size (comm);

  if (!inplace) {
    /* copy data to receivebuf */
    res = NBC_Copy (sendbuf, count, datatype, recvbuf, count, datatype, comm);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
      return res;
    }
  }

  res = NBC_Init_handle(comm, &handle, libnbc_module);
  if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
    return res;
  }

#ifdef NBC_CACHE_SCHEDULE
  NBC_Scan_args *args, *found, search;

  /* search schedule in communicator specific tree */
  search.sendbuf = sendbuf;
  search.recvbuf = recvbuf;
  search.count = count;
  search.datatype = datatype;
  search.op = op;
  found = (NBC_Scan_args *) hb_tree_search ((hb_tree *) libnbc_module->NBC_Dict[NBC_SCAN], &search);
  if (NULL == found) {
#endif
    schedule = OBJ_NEW(NBC_Schedule);
    if (OPAL_UNLIKELY(NULL == schedule)) {
      NBC_Return_handle (handle);
      return OMPI_ERR_OUT_OF_RESOURCE;
    }

    /* ensure the schedule is released with the handle */
    handle->schedule = schedule;

    if(rank != 0) {
      span = opal_datatype_span(&datatype->super, count, &gap);
      handle->tmpbuf = malloc (span);
      if (NULL == handle->tmpbuf) {
        NBC_Return_handle (handle);
        return OMPI_ERR_OUT_OF_RESOURCE;
      }

      /* we have to wait until we have the data */
      res = NBC_Sched_recv ((void *)(-gap), true, count, datatype, rank-1, schedule, true);
      if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
        NBC_Return_handle (handle);
        return res;
      }

      /* perform the reduce in my local buffer */
      /* this cannot be done until handle->tmpbuf is unused :-( so barrier after the op */
      res = NBC_Sched_op ((void *)(-gap), true, recvbuf, false, count, datatype, op, schedule,
                          true);
      if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
        NBC_Return_handle (handle);
        return res;
      }
    }

    if (rank != p-1) {
      res = NBC_Sched_send (recvbuf, false, count, datatype, rank+1, schedule, false);
      if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
        NBC_Return_handle (handle);
        return res;
      }
    }

    res = NBC_Sched_commit (schedule);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
      NBC_Return_handle (handle);
      return res;
    }

#ifdef NBC_CACHE_SCHEDULE
    /* save schedule to tree */
    /* FIX: allocate the struct, not the pointer — sizeof(args) only reserved
     * sizeof(void *) bytes and the field stores below overflowed the heap block */
    args = (NBC_Scan_args *) malloc (sizeof (*args));
    if (NULL != args) {
      args->sendbuf = sendbuf;
      args->recvbuf = recvbuf;
      args->count = count;
      args->datatype = datatype;
      args->op = op;
      args->schedule = schedule;

      res = hb_tree_insert ((hb_tree *) libnbc_module->NBC_Dict[NBC_SCAN], args, args, 0);
      if (0 == res) {
        OBJ_RETAIN(schedule);

        /* increase number of elements for A2A */
        if (++libnbc_module->NBC_Dict_size[NBC_SCAN] > NBC_SCHED_DICT_UPPER) {
          NBC_SchedCache_dictwipe ((hb_tree *) libnbc_module->NBC_Dict[NBC_SCAN],
                                   &libnbc_module->NBC_Dict_size[NBC_SCAN]);
        }
      } else {
        NBC_Error("error in dict_insert() (%i)", res);
        free (args);
      }
    }
  } else {
    /* found schedule */
    schedule = found->schedule;
    OBJ_RETAIN(schedule);
  }
#endif

  res = NBC_Start(handle, schedule);
  if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
    NBC_Return_handle (handle);
    return res;
  }

  *request = (ompi_request_t *) handle;

  /* tmpbuf is freed with the handle */
  return OMPI_SUCCESS;
}
/* Todo: gather_intra_generic, gather_intra_binary, gather_intra_chain,
 * gather_intra_pipeline, segmentation? */

/*
 * Binomial-tree gather.
 *
 * Ranks are renumbered relative to the root ("vrank": root becomes
 * virtual rank 0).  Odd vranks are leaves and send their send buffer
 * directly; even vranks are interior nodes that gather their subtree's
 * data into an aligned temporary buffer before forwarding it upward in
 * one message.  When root != 0 the data arrives at the root rotated by
 * root*rcount elements and is rotated back into rbuf at the end.
 *
 * Returns MPI_SUCCESS or an OMPI error code.
 */
int ompi_coll_base_gather_intra_binomial(const void *sbuf, int scount,
                                         struct ompi_datatype_t *sdtype,
                                         void *rbuf, int rcount,
                                         struct ompi_datatype_t *rdtype,
                                         int root,
                                         struct ompi_communicator_t *comm,
                                         mca_coll_base_module_t *module)
{
    int line = -1, i, rank, vrank, size, total_recv = 0, err;
    char *ptmp = NULL, *tempbuf = NULL;
    ompi_coll_tree_t* bmtree;
    MPI_Status status;
    MPI_Aint sextent, sgap, ssize;
    MPI_Aint rextent, rgap, rsize;
    mca_coll_base_module_t *base_module = (mca_coll_base_module_t*) module;
    mca_coll_base_comm_t *data = base_module->base_data;

    size = ompi_comm_size(comm);
    rank = ompi_comm_rank(comm);

    OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
                 "ompi_coll_base_gather_intra_binomial rank %d", rank));

    /* create the binomial tree */
    COLL_BASE_UPDATE_IN_ORDER_BMTREE( comm, base_module, root );
    bmtree = data->cached_in_order_bmtree;

    ompi_datatype_type_extent(sdtype, &sextent);
    ompi_datatype_type_extent(rdtype, &rextent);

    /* span/gap of a buffer holding the full size*count payload for each
     * datatype; the gap is subtracted below so ptmp is properly aligned */
    ssize = opal_datatype_span(&sdtype->super, (int64_t)scount * size, &sgap);
    rsize = opal_datatype_span(&rdtype->super, (int64_t)rcount * size, &rgap);

    /* virtual rank: shift so the root is vrank 0 */
    vrank = (rank - root + size) % size;

    if (rank == root) {
        if (0 == root){
            /* root on 0, just use the recv buffer */
            ptmp = (char *) rbuf;
            if (sbuf != MPI_IN_PLACE) {
                err = ompi_datatype_sndrcv((void *)sbuf, scount, sdtype,
                                           ptmp, rcount, rdtype);
                if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
            }
        } else {
            /* root is not on 0, allocate temp buffer for recv,
             * rotate data at the end */
            tempbuf = (char *) malloc(rsize);
            if (NULL == tempbuf) {
                err= OMPI_ERR_OUT_OF_RESOURCE; line = __LINE__; goto err_hndl;
            }
            ptmp = tempbuf - rgap;
            if (sbuf != MPI_IN_PLACE) {
                /* copy from sbuf to temp buffer */
                err = ompi_datatype_sndrcv((void *)sbuf, scount, sdtype,
                                           ptmp, rcount, rdtype);
                if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
            } else {
                /* copy from rbuf to temp buffer: with MPI_IN_PLACE the
                 * root's own contribution already sits at slot `rank`
                 * of rbuf */
                err = ompi_datatype_copy_content_same_ddt(rdtype, rcount, ptmp,
                                                          (char *)rbuf + (ptrdiff_t)rank * rextent * (ptrdiff_t)rcount);
                if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
            }
        }
        total_recv = rcount;
    } else if (!(vrank % 2)) {
        /* other non-leaf nodes, allocate temp buffer for data received from
         * children, the most we need is half of the total data elements due
         * to the property of binomial tree */
        tempbuf = (char *) malloc(ssize);
        if (NULL == tempbuf) {
            err= OMPI_ERR_OUT_OF_RESOURCE; line = __LINE__; goto err_hndl;
        }
        ptmp = tempbuf - sgap;
        /* local copy to tempbuf */
        err = ompi_datatype_sndrcv((void *)sbuf, scount, sdtype,
                                   ptmp, scount, sdtype);
        if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }

        /* use sdtype,scount as rdtype,rdcount since they are ignored on
         * non-root procs */
        rdtype = sdtype;
        rcount = scount;
        rextent = sextent;
        total_recv = rcount;
    } else {
        /* leaf nodes, no temp buffer needed, use sdtype,scount as
         * rdtype,rdcount since they are ignored on non-root procs */
        ptmp = (char *) sbuf;
        total_recv = scount;
    }

    if (!(vrank % 2)) {
        /* all non-leaf nodes recv from children */
        for (i = 0; i < bmtree->tree_nextsize; i++) {
            int mycount = 0, vkid;
            /* figure out how much data I have to send to this child */
            vkid = (bmtree->tree_next[i] - root + size) % size;
            mycount = vkid - vrank;
            /* clamp at the end of the (virtual) rank range */
            if (mycount > (size - vkid))
                mycount = size - vkid;
            mycount *= rcount;

            OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
                         "ompi_coll_base_gather_intra_binomial rank %d recv %d mycount = %d",
                         rank, bmtree->tree_next[i], mycount));

            /* receive count is the remaining space, not mycount; the
             * actual amount received is accounted for via mycount below */
            err = MCA_PML_CALL(recv(ptmp + total_recv*rextent,
                                    (ptrdiff_t)rcount * size - total_recv, rdtype,
                                    bmtree->tree_next[i], MCA_COLL_BASE_TAG_GATHER,
                                    comm, &status));
            if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }

            total_recv += mycount;
        }
    }

    if (rank != root) {
        /* all nodes except root send to parents */
        OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
                     "ompi_coll_base_gather_intra_binomial rank %d send %d count %d\n",
                     rank, bmtree->tree_prev, total_recv));
        err = MCA_PML_CALL(send(ptmp, total_recv, sdtype,
                                bmtree->tree_prev,
                                MCA_COLL_BASE_TAG_GATHER,
                                MCA_PML_BASE_SEND_STANDARD, comm));
        if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
    }

    if (rank == root) {
        if (root != 0) {
            /* rotate received data on root if root != 0: tempbuf holds
             * [root..size-1] followed by [0..root-1] in rank order */
            err = ompi_datatype_copy_content_same_ddt(rdtype, (ptrdiff_t)rcount * (ptrdiff_t)(size - root),
                                                      (char *)rbuf + rextent * (ptrdiff_t)root * (ptrdiff_t)rcount, ptmp);
            if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }

            err = ompi_datatype_copy_content_same_ddt(rdtype, (ptrdiff_t)rcount * (ptrdiff_t)root,
                                                      (char *) rbuf, ptmp + rextent * (ptrdiff_t)rcount * (ptrdiff_t)(size-root));
            if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }

            free(tempbuf);
        }
    } else if (!(vrank % 2)) {
        /* other non-leaf nodes */
        free(tempbuf);
    }
    return MPI_SUCCESS;

 err_hndl:
    if (NULL != tempbuf)
        free(tempbuf);

    OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
                 "%s:%4d\tError occurred %d, rank %2d",
                 __FILE__, line, err, rank));
    (void)line;  // silence compiler warning
    return err;
}