/*
 * Reduce over a binary tree with a fixed 32 KiB segment size (Binary_32K).
 * Arguments match MPI_Reduce(); the actual work is done by the generic
 * tree-based reduce.
 */
int smpi_coll_tuned_reduce_ompi_binary( void *sendbuf, void *recvbuf, int count,
                                        MPI_Datatype datatype, MPI_Op op, int root,
                                        MPI_Comm comm)
{
    size_t dsize;
    int seg_count = count;
    /* Binary_32K: this variant always pipelines in 32 KiB chunks. */
    uint32_t seg_size = 32 * 1024;

    /* Derive how many elements fit in one segment from the datatype size. */
    dsize = smpi_datatype_size( datatype );

    XBT_DEBUG("coll:tuned:reduce_intra_binary rank %d ss %5d",
              smpi_comm_rank(comm), seg_size);

    COLL_TUNED_COMPUTED_SEGCOUNT( seg_size, dsize, seg_count );

    return smpi_coll_tuned_ompi_reduce_generic( sendbuf, recvbuf, count, datatype,
                                                op, root, comm,
                                                ompi_coll_tuned_topo_build_tree(2, comm, root),
                                                seg_count, 0);
}
/*
 * Binary-tree reduce for the tuned module.  The caller chooses the segment
 * size and the cap on outstanding requests; the topology is the module's
 * cached binary tree rooted at `root`.
 */
int ompi_coll_tuned_reduce_intra_binary( void *sendbuf, void *recvbuf, int count,
                                         ompi_datatype_t* datatype, ompi_op_t* op,
                                         int root, ompi_communicator_t* comm,
                                         mca_coll_base_module_t *module,
                                         uint32_t segsize, int max_outstanding_reqs )
{
    mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
    mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
    size_t type_size;
    int seg_count = count;

    OPAL_OUTPUT((ompi_coll_tuned_stream,
                 "coll:tuned:reduce_intra_binary rank %d ss %5d",
                 ompi_comm_rank(comm), segsize));

    /* (Re)build the cached binary tree if the root changed. */
    COLL_TUNED_UPDATE_BINTREE( comm, tuned_module, root );

    /* Number of elements shipped per operation. */
    ompi_datatype_type_size( datatype, &type_size );
    COLL_TUNED_COMPUTED_SEGCOUNT( segsize, type_size, seg_count );

    return ompi_coll_tuned_reduce_generic( sendbuf, recvbuf, count, datatype, op,
                                           root, comm, module,
                                           data->cached_bintree,
                                           seg_count, max_outstanding_reqs );
}
/*
 * Binomial-tree broadcast for the tuned module.  Segments the message
 * according to `segsize` and forwards to the generic segmented broadcast
 * using the cached binomial tree.
 */
int ompi_coll_tuned_bcast_intra_binomial( void* buffer, int count,
                                          struct ompi_datatype_t* datatype,
                                          int root,
                                          struct ompi_communicator_t* comm,
                                          mca_coll_base_module_t *module,
                                          uint32_t segsize )
{
    mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
    mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
    size_t type_size;
    int seg_count = count;

    /* Make sure the cached binomial tree is rooted at `root`. */
    COLL_TUNED_UPDATE_BMTREE( comm, tuned_module, root );

    /* Elements sent per operation. */
    ompi_datatype_type_size( datatype, &type_size );
    COLL_TUNED_COMPUTED_SEGCOUNT( segsize, type_size, seg_count );

    OPAL_OUTPUT((ompi_coll_tuned_stream,
                 "coll:tuned:bcast_intra_binomial rank %d ss %5d typelng %lu segcount %d",
                 ompi_comm_rank(comm), segsize, (unsigned long)type_size, seg_count));

    return ompi_coll_tuned_bcast_intra_generic( buffer, count, datatype, root,
                                                comm, module, seg_count,
                                                data->cached_bmtree );
}
/*
 * Pipelined (chain, fanout 1) reduce.  The segment size is picked from
 * fitted linear decision boundaries comparing the communicator size against
 * the message size: larger communicators relative to the message get
 * smaller segments.
 */
int smpi_coll_tuned_reduce_ompi_pipeline( void *sendbuf, void *recvbuf, int count,
                                          MPI_Datatype datatype, MPI_Op op, int root,
                                          MPI_Comm comm )
{
    /* Decision-line coefficients (fitted offline). */
    const double a2 = 0.0410 / 1024.0; /* [1/B] */
    const double b2 = 9.7128;
    const double a4 = 0.0033 / 1024.0; /* [1/B] */
    const double b4 = 1.6761;

    uint32_t seg_size;
    int seg_count = count;
    size_t dsize = smpi_datatype_size( datatype );
    int comm_size = smpi_comm_size(comm);
    size_t msg_size = dsize * count;

    if (comm_size > (a2 * msg_size + b2)) {
        seg_size = 1024;        /* Pipeline_1K */
    } else if (comm_size > (a4 * msg_size + b4)) {
        seg_size = 32 * 1024;   /* Pipeline_32K */
    } else {
        seg_size = 64 * 1024;   /* Pipeline_64K */
    }

    XBT_DEBUG("coll:tuned:reduce_intra_pipeline rank %d ss %5d",
              smpi_comm_rank(comm), seg_size);

    COLL_TUNED_COMPUTED_SEGCOUNT( seg_size, dsize, seg_count );

    /* Chain topology with fanout 1 == a pure pipeline. */
    return smpi_coll_tuned_ompi_reduce_generic( sendbuf, recvbuf, count, datatype,
                                                op, root, comm,
                                                ompi_coll_tuned_topo_build_chain(1, comm, root),
                                                seg_count, 0);
}
/*
 * Chain-topology reduce: caller supplies the segment size and chain fanout.
 * Uses the communicator's cached chain topology (older tuned-module API:
 * topology is stored on the communicator, no module argument).
 */
int ompi_coll_tuned_reduce_intra_chain( void *sendbuf, void *recvbuf, int count,
                                        ompi_datatype_t* datatype, ompi_op_t* op,
                                        int root, ompi_communicator_t* comm,
                                        uint32_t segsize, int fanout)
{
    size_t type_size;
    int seg_count = count;

    OPAL_OUTPUT((ompi_coll_tuned_stream,
                 "coll:tuned:reduce_intra_chain rank %d fo %d ss %5d",
                 ompi_comm_rank(comm), fanout, segsize));

    /* Refresh the cached chain for this root/fanout combination. */
    COLL_TUNED_UPDATE_CHAIN( comm, root, fanout );

    /* Elements sent per operation. */
    ompi_ddt_type_size( datatype, &type_size );
    COLL_TUNED_COMPUTED_SEGCOUNT( segsize, type_size, seg_count );

    return ompi_coll_tuned_reduce_generic( sendbuf, recvbuf, count, datatype, op,
                                           root, comm,
                                           comm->c_coll_selected_data->cached_chain,
                                           seg_count );
}
/*
 * Binomial-tree reduce.  Small messages/communicators go unsegmented
 * (Binomial_0K); otherwise a fitted decision line selects 1 KiB segments.
 * Uses an in-order binomial tree, which keeps operand order stable for
 * non-commutative operations.
 */
int smpi_coll_tuned_reduce_ompi_binomial( void *sendbuf, void *recvbuf, int count,
                                          MPI_Datatype datatype, MPI_Op op, int root,
                                          MPI_Comm comm)
{
    /* Decision-line coefficients (fitted offline). */
    const double a1 = 0.6016 / 1024.0; /* [1/B] */
    const double b1 = 1.3496;

    uint32_t seg_size = 0;
    int seg_count = count;
    size_t dsize = smpi_datatype_size( datatype );
    int comm_size = smpi_comm_size(comm);
    size_t msg_size = dsize * count;

    if (((comm_size < 8) && (msg_size < 20480)) || (msg_size < 2048) || (count <= 1)) {
        seg_size = 0;        /* Binomial_0K: no segmentation */
    } else if (comm_size > (a1 * msg_size + b1)) {
        seg_size = 1024;     /* Binomial_1K */
    }

    XBT_DEBUG("coll:tuned:reduce_intra_binomial rank %d ss %5d",
              smpi_comm_rank(comm), seg_size);

    COLL_TUNED_COMPUTED_SEGCOUNT( seg_size, dsize, seg_count );

    return smpi_coll_tuned_ompi_reduce_generic( sendbuf, recvbuf, count, datatype,
                                                op, root, comm,
                                                ompi_coll_tuned_topo_build_in_order_bmtree(comm, root),
                                                seg_count, 0);
}
/*
 * Pipelined broadcast over a chain (fanout 1) topology.
 *
 * The message is cut into segments (size chosen from fitted decision lines on
 * communicator size vs. message size).  The root streams segments to its
 * child; intermediate nodes double-buffer (post next receive, forward the
 * previous segment); leaves simply receive segment after segment.
 *
 * Arguments match MPI_Bcast(); returns MPI_SUCCESS.
 */
int smpi_coll_tuned_bcast_ompi_pipeline( void* buffer, int original_count,
                                         MPI_Datatype datatype, int root,
                                         MPI_Comm comm)
{
    int count_by_segment = original_count;
    size_t type_size;
    int segsize = 1024 << 7;   /* default: 128 KiB segments */
    /* NOTE(review): `tree` is never released on any path below — looks like a
       leak; confirm whether a topo-destroy helper exists in this codebase. */
    ompi_coll_tree_t *tree = ompi_coll_tuned_topo_build_chain( 1, comm, root );
    int i;
    int rank, size;
    int segindex;
    int num_segments;  /* Number of segments */
    int sendcount;     /* number of elements sent in this segment */
    size_t realsegsize;
    char *tmpbuf;
    ptrdiff_t extent;
    MPI_Request recv_reqs[2] = {MPI_REQUEST_NULL, MPI_REQUEST_NULL};
    MPI_Request *send_reqs = NULL;
    int req_index;

    type_size = smpi_datatype_size(datatype);
    size = smpi_comm_size(comm);
    rank = smpi_comm_rank(comm);
    xbt_assert( size > 1 );

    /* Decision-line coefficients (fitted offline). */
    const double a_p16  = 3.2118e-6; /* [1 / byte] */
    const double b_p16  = 8.7936;
    const double a_p64  = 2.3679e-6; /* [1 / byte] */
    const double b_p64  = 1.1787;
    const double a_p128 = 1.6134e-6; /* [1 / byte] */
    const double b_p128 = 2.1102;

    size_t message_size = type_size * (unsigned long)original_count;

    if (size < (a_p128 * message_size + b_p128)) {
        segsize = 1024 << 7;   /* 128 KiB segments */
    } else if (size < (a_p64 * message_size + b_p64)) {
        segsize = 1024 << 6;   /* 64 KiB segments */
    } else if (size < (a_p16 * message_size + b_p16)) {
        segsize = 1024 << 4;   /* 16 KiB segments */
    }

    COLL_TUNED_COMPUTED_SEGCOUNT( segsize, type_size, count_by_segment );
    XBT_DEBUG("coll:tuned:bcast_intra_pipeline rank %d ss %5d type_size %lu count_by_segment %d",
              smpi_comm_rank(comm), segsize, (unsigned long)type_size, count_by_segment);

    extent = smpi_datatype_get_extent (datatype);
    num_segments = (original_count + count_by_segment - 1) / count_by_segment;
    realsegsize = count_by_segment * extent;

    /* Set the buffer pointers */
    tmpbuf = (char *) buffer;

    if( tree->tree_nextsize != 0 ) {
        send_reqs = xbt_new(MPI_Request, tree->tree_nextsize );
    }

    /* Root code: push every segment to all children; the last segment may
       carry fewer elements. */
    if( rank == root ) {
        sendcount = count_by_segment;
        for( segindex = 0; segindex < num_segments; segindex++ ) {
            if( segindex == (num_segments - 1) ) {
                sendcount = original_count - segindex * count_by_segment;
            }
            for( i = 0; i < tree->tree_nextsize; i++ ) {
                send_reqs[i] = smpi_mpi_isend(tmpbuf, sendcount, datatype,
                                              tree->tree_next[i],
                                              COLL_TAG_BCAST, comm);
            }
            /* complete the sends before starting the next sends */
            smpi_mpi_waitall( tree->tree_nextsize, send_reqs, MPI_STATUSES_IGNORE );
            tmpbuf += realsegsize;
        }
    }
    /* Intermediate nodes: double-buffered pipeline.
       1) post the first receive
       2) for each further segment: post the next receive, wait for the
          previous one, forward it to all children
       3) wait for the last segment and forward it (it may be short). */
    else if( tree->tree_nextsize > 0 ) {
        req_index = 0;
        recv_reqs[req_index] = smpi_mpi_irecv(tmpbuf, count_by_segment, datatype,
                                              tree->tree_prev, COLL_TAG_BCAST, comm);
        for( segindex = 1; segindex < num_segments; segindex++ ) {
            req_index = req_index ^ 0x1;
            /* post new irecv */
            recv_reqs[req_index] = smpi_mpi_irecv( tmpbuf + realsegsize,
                                                   count_by_segment, datatype,
                                                   tree->tree_prev,
                                                   COLL_TAG_BCAST, comm);
            /* BUGFIX: waiting on a single request takes MPI_STATUS_IGNORE;
               MPI_STATUSES_IGNORE (plural) is the waitall form.  The leaf
               branch below already used the singular form. */
            smpi_mpi_wait( &recv_reqs[req_index ^ 0x1], MPI_STATUS_IGNORE );
            for( i = 0; i < tree->tree_nextsize; i++ ) {
                send_reqs[i] = smpi_mpi_isend(tmpbuf, count_by_segment, datatype,
                                              tree->tree_next[i],
                                              COLL_TAG_BCAST, comm );
            }
            /* complete the sends before starting the next iteration */
            smpi_mpi_waitall( tree->tree_nextsize, send_reqs, MPI_STATUSES_IGNORE );
            tmpbuf += realsegsize;
        }
        /* Process the last segment */
        smpi_mpi_wait( &recv_reqs[req_index], MPI_STATUS_IGNORE );
        sendcount = original_count - (num_segments - 1) * count_by_segment;
        for( i = 0; i < tree->tree_nextsize; i++ ) {
            send_reqs[i] = smpi_mpi_isend(tmpbuf, sendcount, datatype,
                                          tree->tree_next[i],
                                          COLL_TAG_BCAST, comm);
        }
        smpi_mpi_waitall( tree->tree_nextsize, send_reqs, MPI_STATUSES_IGNORE );
    }
    /* Leaf nodes: receive segments from the parent in a rolling pair of
       requests so the next receive is always posted before we wait. */
    else {
        req_index = 0;
        recv_reqs[req_index] = smpi_mpi_irecv(tmpbuf, count_by_segment, datatype,
                                              tree->tree_prev, COLL_TAG_BCAST, comm);
        for( segindex = 1; segindex < num_segments; segindex++ ) {
            req_index = req_index ^ 0x1;
            tmpbuf += realsegsize;
            /* post receive for the next segment */
            recv_reqs[req_index] = smpi_mpi_irecv(tmpbuf, count_by_segment, datatype,
                                                  tree->tree_prev, COLL_TAG_BCAST, comm);
            /* wait on the previous segment */
            smpi_mpi_wait( &recv_reqs[req_index ^ 0x1], MPI_STATUS_IGNORE );
        }
        smpi_mpi_wait( &recv_reqs[req_index], MPI_STATUS_IGNORE );
    }

    if( NULL != send_reqs ) free(send_reqs);

    return (MPI_SUCCESS);
}
/*
 * reduce_intra_in_order_binary
 *
 * Function: Logarithmic reduce operation for non-commutative operations.
 * Accepts:  same as MPI_Reduce()
 * Returns:  MPI_SUCCESS or error code
 *
 * Forces the reduce root to rank (size-1) so that the in-order binary tree
 * preserves operand order, then ships the result to the real root if needed.
 */
int ompi_coll_tuned_reduce_intra_in_order_binary( void *sendbuf, void *recvbuf,
                                                  int count,
                                                  ompi_datatype_t* datatype,
                                                  ompi_op_t* op, int root,
                                                  ompi_communicator_t* comm,
                                                  mca_coll_base_module_t *module,
                                                  uint32_t segsize,
                                                  int max_outstanding_reqs )
{
    int ret, rank, size, io_root, segcount = count;
    void *use_this_sendbuf = NULL, *use_this_recvbuf = NULL;
    size_t typelng;
    mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
    mca_coll_tuned_comm_t *data = tuned_module->tuned_data;

    rank = ompi_comm_rank(comm);
    size = ompi_comm_size(comm);
    OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:reduce_intra_in_order_binary rank %d ss %5d", rank, segsize));

    /* Refresh the cached in-order binary tree (always rooted at size-1). */
    COLL_TUNED_UPDATE_IN_ORDER_BINTREE( comm, tuned_module );

    /**
     * Determine number of segments and number of elements
     * sent per operation
     */
    ompi_datatype_type_size( datatype, &typelng );
    COLL_TUNED_COMPUTED_SEGCOUNT( segsize, typelng, segcount );

    /* An in-order binary tree must use root (size-1) to preserve the order of
       operations.  Thus, if root is not rank (size - 1), then we must handle
       1. MPI_IN_PLACE option on real root, and
       2. we must allocate temporary recvbuf on rank (size - 1).
       Note that generic function must be careful not to switch order of
       operations for non-commutative ops.
    */
    io_root = size - 1;
    use_this_sendbuf = sendbuf;
    use_this_recvbuf = recvbuf;
    if (io_root != root) {
        ptrdiff_t tlb, text, lb, ext;
        char *tmpbuf = NULL;

        ompi_datatype_get_extent(datatype, &lb, &ext);
        ompi_datatype_get_true_extent(datatype, &tlb, &text);

        if ((root == rank) && (MPI_IN_PLACE == sendbuf)) {
            /* Real root reduced in place: snapshot recvbuf into a temporary
               send buffer so the io_root-rooted reduce can read it. */
            tmpbuf = (char *) malloc(text + (ptrdiff_t)(count - 1) * ext);
            if (NULL == tmpbuf) {
                return MPI_ERR_INTERN;
            }
            ompi_datatype_copy_content_same_ddt(datatype, count,
                                                (char*)tmpbuf,
                                                (char*)recvbuf);
            use_this_sendbuf = tmpbuf;
        } else if (io_root == rank) {
            /* io_root is not the real root: accumulate into a temporary. */
            tmpbuf = (char *) malloc(text + (ptrdiff_t)(count - 1) * ext);
            if (NULL == tmpbuf) {
                return MPI_ERR_INTERN;
            }
            use_this_recvbuf = tmpbuf;
        }
    }

    /* Use generic reduce with in-order binary tree topology and io_root */
    ret = ompi_coll_tuned_reduce_generic( use_this_sendbuf, use_this_recvbuf,
                                          count, datatype, op, io_root, comm,
                                          module,
                                          data->cached_in_order_bintree,
                                          segcount, max_outstanding_reqs );
    /* NOTE(review): the temporary buffer (if any) leaks on this early
       return and on the error returns below — confirm and fix upstream. */
    if (MPI_SUCCESS != ret) { return ret; }

    /* Clean up */
    if (io_root != root) {
        if (root == rank) {
            /* Receive result from rank io_root to recvbuf */
            ret = MCA_PML_CALL(recv(recvbuf, count, datatype, io_root,
                                    MCA_COLL_BASE_TAG_REDUCE, comm,
                                    MPI_STATUS_IGNORE));
            if (MPI_SUCCESS != ret) { return ret; }
            if (MPI_IN_PLACE == sendbuf) {
                free(use_this_sendbuf);
            }
        } else if (io_root == rank) {
            /* Send result from use_this_recvbuf to root */
            ret = MCA_PML_CALL(send(use_this_recvbuf, count, datatype, root,
                                    MCA_COLL_BASE_TAG_REDUCE,
                                    MCA_PML_BASE_SEND_STANDARD, comm));
            if (MPI_SUCCESS != ret) { return ret; }
            free(use_this_recvbuf);
        }
    }

    return MPI_SUCCESS;
}
/*
 * Binomial-tree reduce, instrumented with an experimental `sdn_comp_enable`
 * switch (defined elsewhere in the project):
 *   > 1 : use a hand-built, hard-coded tree (clearly experimental — it only
 *         fills in ranks 0..7 and is guarded by assert(false));
 *   > 0 : use a precomputed tree from the global `sdn_shortest_bmtree`
 *         (presumably SDN-derived shortest paths — TODO confirm);
 *   else: standard path via the cached in-order binomial tree.
 */
int ompi_coll_tuned_reduce_intra_binomial( void *sendbuf, void *recvbuf,
                                           int count,
                                           ompi_datatype_t* datatype,
                                           ompi_op_t* op, int root,
                                           ompi_communicator_t* comm,
                                           mca_coll_base_module_t *module,
                                           uint32_t segsize,
                                           int max_outstanding_reqs )
{
    int segcount = count;
    size_t typelng;
    mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
    mca_coll_tuned_comm_t *data = tuned_module->tuned_data;

    OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:reduce_intra_binomial rank %d ss %5d", ompi_comm_rank(comm), segsize));
    //printf ("USE BINOMIAL REDUCTION\n");

    if (sdn_comp_enable > 1) {
        /* NOTE(review): this branch aborts in debug builds (assert(false));
           the hand-built tree below covers only an 8-rank communicator,
           leaves `tree` unfreed (leak), and for ranks >= 8 the tree fields
           stay at their sentinel values. */
        assert(false);
        ompi_coll_tree_t *tree = (ompi_coll_tree_t*)malloc(sizeof(ompi_coll_tree_t));
        tree->tree_bmtree = 1;
        tree->tree_root = MPI_UNDEFINED;     /* overwritten two lines below */
        tree->tree_nextsize = MPI_UNDEFINED;
        tree->tree_root = root;
        /* Hard-coded parent/child links for an 8-rank binomial-like tree. */
        switch (ompi_comm_rank(comm)) {
        case 0:
            tree->tree_nextsize = 0;
            tree->tree_prev = 1;
            break;
        case 1:
            tree->tree_nextsize = 1;
            tree->tree_next[0] = 0;
            tree->tree_prev = 3;
            break;
        case 2:
            tree->tree_nextsize = 0;
            tree->tree_prev = 3;
            break;
        case 3:
            tree->tree_nextsize = 2;
            tree->tree_next[0] = 1;
            tree->tree_next[1] = 2;
            tree->tree_prev = 7;
            break;
        case 4:
            tree->tree_nextsize = 1;
            tree->tree_next[0] = 5;
            tree->tree_prev = 7;
            break;
        case 5:
            tree->tree_nextsize = 0;
            tree->tree_prev = 4;
            break;
        case 6:
            tree->tree_nextsize = 0;
            tree->tree_prev = 7;
            break;
        case 7:
            tree->tree_nextsize = 3;
            tree->tree_next[0] = 3;
            tree->tree_next[1] = 4;
            tree->tree_next[2] = 6;
            tree->tree_prev = 7;  /* root points at itself */
            break;
        }
        /**
         * Determine number of segments and number of elements
         * sent per operation
         */
        ompi_datatype_type_size( datatype, &typelng );
        COLL_TUNED_COMPUTED_SEGCOUNT( segsize, typelng, segcount );
        return ompi_coll_tuned_reduce_generic( sendbuf, recvbuf, count, datatype,
                                               op, root, comm, module,
                                               tree,
                                               segcount, max_outstanding_reqs );
    } else if (sdn_comp_enable > 0) {
        /**
         * Determine number of segments and number of elements
         * sent per operation
         */
        ompi_datatype_type_size( datatype, &typelng );
        COLL_TUNED_COMPUTED_SEGCOUNT( segsize, typelng, segcount );
        /* Topology comes from the per-root precomputed global table. */
        return ompi_coll_tuned_reduce_generic( sendbuf, recvbuf, count, datatype,
                                               op, root, comm, module,
                                               sdn_shortest_bmtree[root],
                                               segcount, max_outstanding_reqs );
    } else {
        /* Standard tuned path: cached in-order binomial tree. */
        COLL_TUNED_UPDATE_IN_ORDER_BMTREE( comm, tuned_module, root );
        /**
         * Determine number of segments and number of elements
         * sent per operation
         */
        ompi_datatype_type_size( datatype, &typelng );
        COLL_TUNED_COMPUTED_SEGCOUNT( segsize, typelng, segcount );
        return ompi_coll_tuned_reduce_generic( sendbuf, recvbuf, count, datatype,
                                               op, root, comm, module,
                                               data->cached_in_order_bmtree,
                                               segcount, max_outstanding_reqs );
    } /* end if sdn_comp_enable */
}
/*
 * reduce_intra_in_order_binary
 *
 * Function: Logarithmic reduce operation for non-commutative operations.
 * Accepts:  same as MPI_Reduce()
 * Returns:  MPI_SUCCESS or error code
 *
 * Forces the reduce root to rank (size-1) so the in-order binary tree keeps
 * operand order, then forwards the result to the real root if they differ.
 */
int smpi_coll_tuned_reduce_ompi_in_order_binary( void *sendbuf, void *recvbuf,
                                                 int count,
                                                 MPI_Datatype datatype,
                                                 MPI_Op op, int root,
                                                 MPI_Comm comm)
{
    uint32_t segsize = 0;   /* this variant never segments */
    int ret;
    int rank, size, io_root;
    int segcount = count;
    void *use_this_sendbuf = NULL, *use_this_recvbuf = NULL;
    char *tmpbuf = NULL;    /* scratch buffer owned here; released on every path */
    size_t typelng;

    rank = smpi_comm_rank(comm);
    size = smpi_comm_size(comm);
    XBT_DEBUG("coll:tuned:reduce_intra_in_order_binary rank %d ss %5d",
              rank, segsize);

    /**
     * Determine number of segments and number of elements
     * sent per operation
     */
    typelng = smpi_datatype_size( datatype );
    COLL_TUNED_COMPUTED_SEGCOUNT( segsize, typelng, segcount );

    /* An in-order binary tree must use root (size-1) to preserve the order of
       operations.  Thus, if root is not rank (size - 1), then we must handle
       1. MPI_IN_PLACE option on real root, and
       2. we must allocate temporary recvbuf on rank (size - 1).
       Note that generic function must be careful not to switch order of
       operations for non-commutative ops.
    */
    io_root = size - 1;
    use_this_sendbuf = sendbuf;
    use_this_recvbuf = recvbuf;
    if (io_root != root) {
        ptrdiff_t text, ext;

        ext = smpi_datatype_get_extent(datatype);
        /* NOTE(review): upstream OMPI uses the *true* extent for the first
           term; here the plain extent is used twice, which can only
           over-allocate — confirm this is intentional. */
        text = smpi_datatype_get_extent(datatype);

        if ((root == rank) && (MPI_IN_PLACE == sendbuf)) {
            /* Real root reduces in place: snapshot recvbuf so the
               io_root-rooted reduce can use it as its send buffer. */
            tmpbuf = (char *) malloc(text + (count - 1) * ext);
            if (NULL == tmpbuf) {
                return MPI_ERR_INTERN;
            }
            smpi_datatype_copy( (char*)recvbuf, count, datatype,
                                (char*)tmpbuf, count, datatype);
            use_this_sendbuf = tmpbuf;
        } else if (io_root == rank) {
            /* io_root is not the real root: accumulate into a temporary. */
            tmpbuf = (char *) malloc(text + (count - 1) * ext);
            if (NULL == tmpbuf) {
                return MPI_ERR_INTERN;
            }
            use_this_recvbuf = tmpbuf;
        }
    }

    /* Use generic reduce with in-order binary tree topology and io_root */
    ret = smpi_coll_tuned_ompi_reduce_generic( use_this_sendbuf, use_this_recvbuf,
                                               count, datatype, op, io_root, comm,
                                               ompi_coll_tuned_topo_build_in_order_bintree(comm),
                                               segcount, 0 );
    if (MPI_SUCCESS != ret) {
        /* BUGFIX: the scratch buffer used to leak on this error path. */
        if (NULL != tmpbuf) {
            free(tmpbuf);
        }
        return ret;
    }

    /* Clean up */
    if (io_root != root) {
        if (root == rank) {
            /* Receive result from rank io_root to recvbuf */
            smpi_mpi_recv(recvbuf, count, datatype, io_root,
                          COLL_TAG_REDUCE, comm, MPI_STATUS_IGNORE);
            if (MPI_IN_PLACE == sendbuf) {
                free(use_this_sendbuf);
            }
        } else if (io_root == rank) {
            /* Send result from use_this_recvbuf to root */
            smpi_mpi_send(use_this_recvbuf, count, datatype, root,
                          COLL_TAG_REDUCE, comm);
            free(use_this_recvbuf);
        }
    }

    return MPI_SUCCESS;
}
/* * gather_intra_linear_sync * * Function: - synchronized gather operation with * Accepts: - same arguments as MPI_Gather(), first segment size * Returns: - MPI_SUCCESS or error code */ int Coll_gather_ompi_linear_sync::gather(void *sbuf, int scount, MPI_Datatype sdtype, void *rbuf, int rcount, MPI_Datatype rdtype, int root, MPI_Comm comm) { int i; int ret, line; int rank, size; int first_segment_count; size_t typelng; MPI_Aint extent; MPI_Aint lb; int first_segment_size=0; size = comm->size(); rank = comm->rank(); size_t dsize, block_size; if (rank == root) { dsize= rdtype->size(); block_size = dsize * rcount; } else { dsize=sdtype->size(); block_size = dsize * scount; } if (block_size > 92160){ first_segment_size = 32768; }else{ first_segment_size = 1024; } XBT_DEBUG("smpi_coll_tuned_gather_ompi_linear_sync rank %d, segment %d", rank, first_segment_size); if (rank != root) { /* Non-root processes: - receive zero byte message from the root, - send the first segment of the data synchronously, - send the second segment of the data. */ typelng = sdtype->size(); sdtype->extent(&lb, &extent); first_segment_count = scount; COLL_TUNED_COMPUTED_SEGCOUNT((size_t)first_segment_size, typelng, first_segment_count); Request::recv(sbuf, 0, MPI_BYTE, root, COLL_TAG_GATHER, comm, MPI_STATUS_IGNORE); Request::send(sbuf, first_segment_count, sdtype, root, COLL_TAG_GATHER, comm); Request::send((char*)sbuf + extent * first_segment_count, (scount - first_segment_count), sdtype, root, COLL_TAG_GATHER, comm); } else { /* Root process, - For every non-root node: - post irecv for the first segment of the message - send zero byte message to signal node to send the message - post irecv for the second segment of the message - wait for the first segment to complete - Copy local data if necessary - Waitall for all the second segments to complete. 
*/ char* ptmp; MPI_Request first_segment_req; MPI_Request* reqs = new (std::nothrow) MPI_Request[size]; if (NULL == reqs) { ret = -1; line = __LINE__; goto error_hndl; } typelng=rdtype->size(); rdtype->extent(&lb, &extent); first_segment_count = rcount; COLL_TUNED_COMPUTED_SEGCOUNT( (size_t)first_segment_size, typelng, first_segment_count ); for (i = 0; i < size; ++i) { if (i == rank) { /* skip myself */ reqs[i] = MPI_REQUEST_NULL; continue; } /* irecv for the first segment from i */ ptmp = (char*)rbuf + i * rcount * extent; first_segment_req = Request::irecv(ptmp, first_segment_count, rdtype, i, COLL_TAG_GATHER, comm ); /* send sync message */ Request::send(rbuf, 0, MPI_BYTE, i, COLL_TAG_GATHER, comm); /* irecv for the second segment */ ptmp = (char*)rbuf + (i * rcount + first_segment_count) * extent; reqs[i]=Request::irecv(ptmp, (rcount - first_segment_count), rdtype, i, COLL_TAG_GATHER, comm ); /* wait on the first segment to complete */ Request::wait(&first_segment_req, MPI_STATUS_IGNORE); } /* copy local data if necessary */ if (MPI_IN_PLACE != sbuf) { ret = Datatype::copy(sbuf, scount, sdtype, (char*)rbuf + rank * rcount * extent, rcount, rdtype); if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } } /* wait all second segments to complete */ ret = Request::waitall(size, reqs, MPI_STATUSES_IGNORE); if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } delete[] reqs; } /* All done */ return MPI_SUCCESS; error_hndl: XBT_DEBUG( "ERROR_HNDL: node %d file %s line %d error %d\n", rank, __FILE__, line, ret ); return ret; }