int smpi_coll_tuned_reduce_scatter_ompi(void *sbuf, void *rbuf,
                                        int *rcounts,
                                        MPI_Datatype dtype,
                                        MPI_Op op,
                                        MPI_Comm comm)
{
    int comm_size, i, pow2;
    size_t total_message_size, dsize;
    const double a = 0.0012;
    const double b = 8.0;
    const size_t small_message_size = 12 * 1024;
    const size_t large_message_size = 256 * 1024;
    int zerocounts = 0;

    XBT_DEBUG("smpi_coll_tuned_reduce_scatter_ompi");

    comm_size = smpi_comm_size(comm);
    // We need data size for decision function
    dsize = smpi_datatype_size(dtype);
    total_message_size = 0;
    for (i = 0; i < comm_size; i++) {
        total_message_size += rcounts[i];
        if (0 == rcounts[i]) {
            zerocounts = 1;
        }
    }

    if (!smpi_op_is_commute(op) || zerocounts) {
        smpi_mpi_reduce_scatter(sbuf, rbuf, rcounts, dtype, op, comm);
        return MPI_SUCCESS;
    }

    total_message_size *= dsize;

    // compute the nearest power of 2
    for (pow2 = 1; pow2 < comm_size; pow2 <<= 1);

    if ((total_message_size <= small_message_size) ||
        ((total_message_size <= large_message_size) && (pow2 == comm_size)) ||
        (comm_size >= a * total_message_size + b)) {
        return smpi_coll_tuned_reduce_scatter_ompi_basic_recursivehalving(
                   sbuf, rbuf, rcounts, dtype, op, comm);
    }
    return smpi_coll_tuned_reduce_scatter_ompi_ring(sbuf, rbuf, rcounts,
                                                    dtype, op, comm);
}
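/*
 * Allreduce selector from the OpenMPI tuned module: recursive doubling for
 * small messages, ring (or segmented ring) for large commutative reductions
 * when count exceeds the communicator size, and reduce+bcast otherwise.
 * Thresholds come from the UTK Grig measurements referenced below.
 */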
int smpi_coll_tuned_allreduce_ompi(void *sbuf, void *rbuf, int count,
                                   MPI_Datatype dtype, MPI_Op op, MPI_Comm comm)
{
    size_t dsize, block_dsize;
    int comm_size = smpi_comm_size(comm);
    const size_t intermediate_message = 10000;

    /**
     * Decision function based on MX results from the Grig cluster at UTK.
     *
     * Currently, linear, recursive doubling, and nonoverlapping algorithms
     * can handle both commutative and non-commutative operations.
     * Ring algorithm does not support non-commutative operations.
     */
    dsize = smpi_datatype_size(dtype);
    block_dsize = dsize * count;

    if (block_dsize < intermediate_message) {
        return smpi_coll_tuned_allreduce_rdb(sbuf, rbuf, count, dtype, op, comm);
    }

    if (smpi_op_is_commute(op) && (count > comm_size)) {
        const size_t segment_size = 1 << 20; /* 1 MB */
        if (comm_size * segment_size >= block_dsize) {
            //FIXME: ok, these are not the right algorithms, try to find closer ones
            // lr is a good match for allreduce_ring (difference is mainly the use of sendrecv)
            return smpi_coll_tuned_allreduce_lr(sbuf, rbuf, count, dtype, op, comm);
        } else {
            return smpi_coll_tuned_allreduce_ompi_ring_segmented(
                       sbuf, rbuf, count, dtype, op, comm /*segment_size*/);
        }
    }

    return smpi_coll_tuned_allreduce_redbcast(sbuf, rbuf, count, dtype, op, comm);
}
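/*
 * Reduce selector from the OpenMPI tuned module.  The (a_i, b_i) constants
 * below appear to be fitted linear decision boundaries (communicator size
 * versus message size) separating the regions where each algorithm
 * (binomial, pipeline, binary tree) won in the original benchmarks; the
 * commented-out segsize values record the segment sizes those variants
 * were tuned with.
 */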
int smpi_coll_tuned_reduce_ompi(void *sendbuf, void *recvbuf, int count,
                                MPI_Datatype datatype, MPI_Op op, int root,
                                MPI_Comm comm)
{
    int communicator_size = 0;
    //int segsize = 0;
    size_t message_size, dsize;
    const double a1 = 0.6016 / 1024.0; /* [1/B] */
    const double b1 = 1.3496;
    const double a2 = 0.0410 / 1024.0; /* [1/B] */
    const double b2 = 9.7128;
    const double a3 = 0.0422 / 1024.0; /* [1/B] */
    const double b3 = 1.1614;
    //const double a4 = 0.0033 / 1024.0; /* [1/B] */
    //const double b4 = 1.6761;
    //const int max_requests = 0; /* no limit on # of outstanding requests */

    communicator_size = smpi_comm_size(comm);

    /* need data size for decision function */
    dsize = smpi_datatype_size(datatype);
    message_size = dsize * count; /* needed for decision */

    /**
     * If the operation is non-commutative we currently have a choice of the
     * linear or the in-order binary tree algorithm.
     */
    if (!smpi_op_is_commute(op)) {
        if ((communicator_size < 12) && (message_size < 2048)) {
            return smpi_coll_tuned_reduce_ompi_basic_linear(sendbuf, recvbuf, count,
                                                            datatype, op, root, comm/*, module*/);
        }
        return smpi_coll_tuned_reduce_ompi_in_order_binary(sendbuf, recvbuf, count,
                                                           datatype, op, root, comm/*, module, 0, max_requests*/);
    }

    if ((communicator_size < 8) && (message_size < 512)) {
        /* Linear_0K */
        return smpi_coll_tuned_reduce_ompi_basic_linear(sendbuf, recvbuf, count,
                                                        datatype, op, root, comm);
    } else if (((communicator_size < 8) && (message_size < 20480)) ||
               (message_size < 2048) || (count <= 1)) {
        /* Binomial_0K */
        //segsize = 0;
        return smpi_coll_tuned_reduce_ompi_binomial(sendbuf, recvbuf, count,
                                                    datatype, op, root, comm/*, module, segsize, max_requests*/);
    } else if (communicator_size > (a1 * message_size + b1)) {
        /* Binomial_1K */
        //segsize = 1024;
        return smpi_coll_tuned_reduce_ompi_binomial(sendbuf, recvbuf, count,
                                                    datatype, op, root, comm/*, module, segsize, max_requests*/);
    } else if (communicator_size > (a2 * message_size + b2)) {
        /* Pipeline_1K */
        //segsize = 1024;
        return smpi_coll_tuned_reduce_ompi_pipeline(sendbuf, recvbuf, count,
                                                    datatype, op, root, comm/*, module, segsize, max_requests*/);
    } else if (communicator_size > (a3 * message_size + b3)) {
        /* Binary_32K */
        //segsize = 32 * 1024;
        return smpi_coll_tuned_reduce_ompi_binary(sendbuf, recvbuf, count,
                                                  datatype, op, root, comm/*, module, segsize, max_requests*/);
    }
    /*if (communicator_size > (a4 * message_size + b4)) {
        // Pipeline_32K
        segsize = 32 * 1024;
    } else {
        // Pipeline_64K
        segsize = 64 * 1024;
    }*/
    return smpi_coll_tuned_reduce_ompi_pipeline(sendbuf, recvbuf, count,
                                                datatype, op, root, comm/*, module, segsize, max_requests*/);

#if 0
    /* for small messages use linear algorithm */
    if (message_size <= 4096) {
        segsize = 0;
        fanout = communicator_size - 1;
        /* when linear implemented or taken from basic put here, right now
           using chain as a linear system */
        /* it is implemented and I shouldn't be calling a chain with a fanout
           bigger than MAXTREEFANOUT from topo.h! */
        return smpi_coll_tuned_reduce_intra_basic_linear(sendbuf, recvbuf, count,
                                                         datatype, op, root, comm, module);
        /* return smpi_coll_tuned_reduce_intra_chain(sendbuf, recvbuf, count,
                                                     datatype, op, root, comm,
                                                     segsize, fanout); */
    }
    if (message_size < 524288) {
        if (message_size <= 65536) {
            segsize = 32768;
            fanout = 8;
        } else {
            segsize = 1024;
            fanout = communicator_size / 2;
        }
        /* later swap this for a binary tree */
        /* fanout = 2; */
        return smpi_coll_tuned_reduce_intra_chain(sendbuf, recvbuf, count,
                                                  datatype, op, root, comm, module,
                                                  segsize, fanout, max_requests);
    }
    segsize = 1024;
    return smpi_coll_tuned_reduce_intra_pipeline(sendbuf, recvbuf, count,
                                                 datatype, op, root, comm, module,
                                                 segsize, max_requests);
#endif /* 0 */
}
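/*
 * k-nomial tree reduce from MVAPICH2.  MPIR_Reduce_knomial_trace() computes,
 * for this rank, the parent (dst), the expected number of sends and
 * receives, and the list of children (src_array); each child's contribution
 * is reduced into recvbuf as it arrives.  Note that the reduction in the
 * waitany loop is only applied for commutative operations.
 */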
int smpi_coll_tuned_reduce_mvapich2_knomial(void *sendbuf, void *recvbuf,
                                            int count, MPI_Datatype datatype,
                                            MPI_Op op, int root, MPI_Comm comm)
{
    int mpi_errno = MPI_SUCCESS;
    int rank, is_commutative;
    int src, k;
    MPI_Request send_request;
    int index = 0;
    MPI_Aint true_lb, true_extent, extent;
    MPI_Status status;
    int recv_iter = 0, dst = -1, expected_send_count, expected_recv_count;
    int *src_array = NULL;
    void **tmp_buf = NULL;
    MPI_Request *requests = NULL;

    if (count == 0)
        return MPI_SUCCESS;

    rank = smpi_comm_rank(comm);

    /* Create a temporary buffer */
    smpi_datatype_extent(datatype, &true_lb, &true_extent);
    extent = smpi_datatype_get_extent(datatype);

    is_commutative = smpi_op_is_commute(op);

    if (rank != root) {
        recvbuf = (void *)smpi_get_tmp_recvbuffer(count * (MAX(extent, true_extent)));
        recvbuf = (void *)((char *)recvbuf - true_lb);
    }

    if ((rank != root) || (sendbuf != MPI_IN_PLACE)) {
        mpi_errno = smpi_datatype_copy(sendbuf, count, datatype,
                                       recvbuf, count, datatype);
    }

    if (mv2_reduce_intra_knomial_factor < 0) {
        mv2_reduce_intra_knomial_factor = SMPI_DEFAULT_KNOMIAL_FACTOR;
    }
    if (mv2_reduce_inter_knomial_factor < 0) {
        mv2_reduce_inter_knomial_factor = SMPI_DEFAULT_KNOMIAL_FACTOR;
    }

    MPIR_Reduce_knomial_trace(root, mv2_reduce_intra_knomial_factor, comm,
                              &dst, &expected_send_count,
                              &expected_recv_count, &src_array);

    if (expected_recv_count > 0) {
        tmp_buf  = xbt_malloc(sizeof(void *) * expected_recv_count);
        requests = xbt_malloc(sizeof(MPI_Request) * expected_recv_count);
        for (k = 0; k < expected_recv_count; k++) {
            tmp_buf[k] = smpi_get_tmp_sendbuffer(count * (MAX(extent, true_extent)));
            tmp_buf[k] = (void *)((char *)tmp_buf[k] - true_lb);
        }

        while (recv_iter < expected_recv_count) {
            src = src_array[expected_recv_count - (recv_iter + 1)];
            requests[recv_iter] = smpi_mpi_irecv(tmp_buf[recv_iter], count,
                                                 datatype, src,
                                                 COLL_TAG_REDUCE, comm);
            recv_iter++;
        }

        recv_iter = 0;
        while (recv_iter < expected_recv_count) {
            index = smpi_mpi_waitany(expected_recv_count, requests, &status);
            recv_iter++;
            if (is_commutative) {
                smpi_op_apply(op, tmp_buf[index], recvbuf, &count, &datatype);
            }
        }

        for (k = 0; k < expected_recv_count; k++) {
            smpi_free_tmp_buffer(tmp_buf[k]);
        }
        xbt_free(tmp_buf);
        xbt_free(requests);
    }

    if (src_array != NULL) {
        xbt_free(src_array);
    }

    if (rank != root) {
        send_request = smpi_mpi_isend(recvbuf, count, datatype, dst,
                                      COLL_TAG_REDUCE, comm);
        smpi_mpi_waitall(1, &send_request, &status);
        smpi_free_tmp_buffer((void *)((char *)recvbuf + true_lb));
    }

    /* --END ERROR HANDLING-- */

    return mpi_errno;
}
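/*
 * MVAPICH2 allreduce, Rabenseifner-style.  Ranks beyond the largest power
 * of two are first folded onto their neighbors; the surviving ranks then
 * run either recursive doubling (short vectors) or a reduce-scatter
 * followed by an allgather (long vectors), and the folded ranks receive
 * the result at the very end.
 */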
int smpi_coll_tuned_allreduce_mvapich2_rs(void *sendbuf, void *recvbuf,
                                          int count, MPI_Datatype datatype,
                                          MPI_Op op, MPI_Comm comm)
{
    int comm_size, rank;
    int mpi_errno = MPI_SUCCESS;
    int mask, dst, is_commutative, pof2, newrank = 0, rem, newdst, i,
        send_idx, recv_idx, last_idx, send_cnt, recv_cnt, *cnts, *disps;
    MPI_Aint true_lb, true_extent, extent;
    void *tmp_buf, *tmp_buf_free;

    if (count == 0) {
        return MPI_SUCCESS;
    }

    /* homogeneous */
    comm_size = smpi_comm_size(comm);
    rank = smpi_comm_rank(comm);

    is_commutative = smpi_op_is_commute(op);

    /* need to allocate temporary buffer to store incoming data */
    smpi_datatype_extent(datatype, &true_lb, &true_extent);
    extent = smpi_datatype_get_extent(datatype);

    tmp_buf_free = smpi_get_tmp_recvbuffer(count * (MAX(extent, true_extent)));

    /* adjust for potential negative lower bound in datatype */
    tmp_buf = (void *)((char *)tmp_buf_free - true_lb);

    /* copy local data into recvbuf */
    if (sendbuf != MPI_IN_PLACE) {
        mpi_errno = smpi_datatype_copy(sendbuf, count, datatype, recvbuf,
                                       count, datatype);
    }

    /* find nearest power-of-two less than or equal to comm_size */
    for (pof2 = 1; pof2 <= comm_size; pof2 <<= 1);
    pof2 >>= 1;

    rem = comm_size - pof2;

    /* In the non-power-of-two case, all even-numbered
       processes of rank < 2*rem send their data to
       (rank+1). These even-numbered processes no longer
       participate in the algorithm until the very end. The
       remaining processes form a nice power-of-two. */

    if (rank < 2 * rem) {
        if (rank % 2 == 0) {
            /* even */
            smpi_mpi_send(recvbuf, count, datatype, rank + 1,
                          COLL_TAG_ALLREDUCE, comm);

            /* temporarily set the rank to -1 so that this
               process does not participate in recursive doubling */
            newrank = -1;
        } else {
            /* odd */
            smpi_mpi_recv(tmp_buf, count, datatype, rank - 1,
                          COLL_TAG_ALLREDUCE, comm, MPI_STATUS_IGNORE);
            /* do the reduction on received data. since the
               ordering is right, it doesn't matter whether
               the operation is commutative or not. */
            smpi_op_apply(op, tmp_buf, recvbuf, &count, &datatype);

            /* change the rank */
            newrank = rank / 2;
        }
    } else {
        /* rank >= 2*rem */
        newrank = rank - rem;
    }

    /* If op is user-defined or count is less than pof2, use
       recursive doubling algorithm. Otherwise do a reduce-scatter
       followed by allgather. (If op is user-defined,
       derived datatypes are allowed and the user could pass basic
       datatypes on one process and derived on another as long as
       the type maps are the same. Breaking up derived
       datatypes to do the reduce-scatter is tricky, therefore
       using recursive doubling in that case.) */

    if (newrank != -1) {
        if (/*(HANDLE_GET_KIND(op) != HANDLE_KIND_BUILTIN) ||*/ (count < pof2)) {
            /* use recursive doubling */
            mask = 0x1;
            while (mask < pof2) {
                newdst = newrank ^ mask;

                /* find real rank of dest */
                dst = (newdst < rem) ? newdst * 2 + 1 : newdst + rem;

                /* Send the most current data, which is in recvbuf.
                   Recv into tmp_buf */
                smpi_mpi_sendrecv(recvbuf, count, datatype, dst,
                                  COLL_TAG_ALLREDUCE, tmp_buf, count, datatype,
                                  dst, COLL_TAG_ALLREDUCE, comm,
                                  MPI_STATUS_IGNORE);

                /* tmp_buf contains data received in this step.
                   recvbuf contains data accumulated so far */

                if (is_commutative || (dst < rank)) {
                    /* op is commutative OR the order is already right */
                    smpi_op_apply(op, tmp_buf, recvbuf, &count, &datatype);
                } else {
                    /* op is noncommutative and the order is not right */
                    smpi_op_apply(op, recvbuf, tmp_buf, &count, &datatype);

                    /* copy result back into recvbuf */
                    mpi_errno = smpi_datatype_copy(tmp_buf, count, datatype,
                                                   recvbuf, count, datatype);
                }
                mask <<= 1;
            }
        } else {
            /* do a reduce-scatter followed by allgather */

            /* for the reduce-scatter, calculate the count that
               each process receives and the displacement within
               the buffer */
            cnts  = (int *)xbt_malloc(pof2 * sizeof(int));
            disps = (int *)xbt_malloc(pof2 * sizeof(int));

            for (i = 0; i < (pof2 - 1); i++) {
                cnts[i] = count / pof2;
            }
            cnts[pof2 - 1] = count - (count / pof2) * (pof2 - 1);

            disps[0] = 0;
            for (i = 1; i < pof2; i++) {
                disps[i] = disps[i - 1] + cnts[i - 1];
            }

            mask = 0x1;
            send_idx = recv_idx = 0;
            last_idx = pof2;
            while (mask < pof2) {
                newdst = newrank ^ mask;
                /* find real rank of dest */
                dst = (newdst < rem) ? newdst * 2 + 1 : newdst + rem;

                send_cnt = recv_cnt = 0;
                if (newrank < newdst) {
                    send_idx = recv_idx + pof2 / (mask * 2);
                    for (i = send_idx; i < last_idx; i++)
                        send_cnt += cnts[i];
                    for (i = recv_idx; i < send_idx; i++)
                        recv_cnt += cnts[i];
                } else {
                    recv_idx = send_idx + pof2 / (mask * 2);
                    for (i = send_idx; i < recv_idx; i++)
                        send_cnt += cnts[i];
                    for (i = recv_idx; i < last_idx; i++)
                        recv_cnt += cnts[i];
                }

                /* Send data from recvbuf. Recv into tmp_buf */
                smpi_mpi_sendrecv((char *)recvbuf + disps[send_idx] * extent,
                                  send_cnt, datatype, dst, COLL_TAG_ALLREDUCE,
                                  (char *)tmp_buf + disps[recv_idx] * extent,
                                  recv_cnt, datatype, dst, COLL_TAG_ALLREDUCE,
                                  comm, MPI_STATUS_IGNORE);

                /* tmp_buf contains data received in this step.
                   recvbuf contains data accumulated so far */

                /* This algorithm is used only for predefined ops
                   and predefined ops are always commutative. */
                smpi_op_apply(op, (char *)tmp_buf + disps[recv_idx] * extent,
                              (char *)recvbuf + disps[recv_idx] * extent,
                              &recv_cnt, &datatype);

                /* update send_idx for next iteration */
                send_idx = recv_idx;
                mask <<= 1;

                /* update last_idx, but not in last iteration
                   because the value is needed in the allgather
                   step below. */
                if (mask < pof2)
                    last_idx = recv_idx + pof2 / mask;
            }

            /* now do the allgather */

            mask >>= 1;
            while (mask > 0) {
                newdst = newrank ^ mask;
                /* find real rank of dest */
                dst = (newdst < rem) ? newdst * 2 + 1 : newdst + rem;

                send_cnt = recv_cnt = 0;
                if (newrank < newdst) {
                    /* update last_idx except on first iteration */
                    if (mask != pof2 / 2) {
                        last_idx = last_idx + pof2 / (mask * 2);
                    }

                    recv_idx = send_idx + pof2 / (mask * 2);
                    for (i = send_idx; i < recv_idx; i++) {
                        send_cnt += cnts[i];
                    }
                    for (i = recv_idx; i < last_idx; i++) {
                        recv_cnt += cnts[i];
                    }
                } else {
                    recv_idx = send_idx - pof2 / (mask * 2);
                    for (i = send_idx; i < last_idx; i++) {
                        send_cnt += cnts[i];
                    }
                    for (i = recv_idx; i < send_idx; i++) {
                        recv_cnt += cnts[i];
                    }
                }

                smpi_mpi_sendrecv((char *)recvbuf + disps[send_idx] * extent,
                                  send_cnt, datatype, dst, COLL_TAG_ALLREDUCE,
                                  (char *)recvbuf + disps[recv_idx] * extent,
                                  recv_cnt, datatype, dst, COLL_TAG_ALLREDUCE,
                                  comm, MPI_STATUS_IGNORE);

                if (newrank > newdst) {
                    send_idx = recv_idx;
                }

                mask >>= 1;
            }

            /* release the scratch arrays used by the reduce-scatter phase */
            xbt_free(cnts);
            xbt_free(disps);
        }
    }

    /* In the non-power-of-two case, all odd-numbered
       processes of rank < 2*rem send the result to
       (rank-1), the ranks who didn't participate above. */
    if (rank < 2 * rem) {
        if (rank % 2) {
            /* odd */
            smpi_mpi_send(recvbuf, count, datatype, rank - 1,
                          COLL_TAG_ALLREDUCE, comm);
        } else {
            /* even */
            smpi_mpi_recv(recvbuf, count, datatype, rank + 1,
                          COLL_TAG_ALLREDUCE, comm, MPI_STATUS_IGNORE);
        }
    }
    smpi_free_tmp_buffer(tmp_buf_free);
    return (mpi_errno);
}
/**
 * This is a generic implementation of the reduce protocol. It uses the tree
 * provided as an argument and executes all operations using a segment of
 * count times a datatype.
 * For the last communication it will update the count in order to limit
 * the number of datatypes to the original count (original_count).
 *
 * Note that for non-commutative operations we cannot save the memory copy
 * for the first block: thus we must copy sendbuf to accumbuf on intermediate
 * nodes to keep the optimized loop happy.
 */
int smpi_coll_tuned_ompi_reduce_generic(void *sendbuf, void *recvbuf,
                                        int original_count,
                                        MPI_Datatype datatype, MPI_Op op,
                                        int root, MPI_Comm comm,
                                        ompi_coll_tree_t *tree,
                                        int count_by_segment,
                                        int max_outstanding_reqs)
{
    char *inbuf[2] = {NULL, NULL}, *inbuf_free[2] = {NULL, NULL};
    char *accumbuf = NULL, *accumbuf_free = NULL;
    char *local_op_buffer = NULL, *sendtmpbuf = NULL;
    ptrdiff_t extent, lower_bound, segment_increment;
    MPI_Request reqs[2] = {MPI_REQUEST_NULL, MPI_REQUEST_NULL};
    int num_segments, line, ret, segindex, i, rank;
    int recvcount, prevcount, inbi;

    /**
     * Determine number of segments and number of elements
     * sent per operation
     */
    smpi_datatype_extent(datatype, &lower_bound, &extent);
    num_segments = (original_count + count_by_segment - 1) / count_by_segment;
    segment_increment = count_by_segment * extent;

    sendtmpbuf = (char *)sendbuf;
    if (sendbuf == MPI_IN_PLACE) {
        sendtmpbuf = (char *)recvbuf;
    }

    XBT_DEBUG("coll:tuned:reduce_generic count %d, msg size %ld, segsize %ld, max_requests %d",
              original_count,
              (unsigned long)(num_segments * segment_increment),
              (unsigned long)segment_increment, max_outstanding_reqs);

    rank = smpi_comm_rank(comm);

    /* non-leaf nodes - wait for children to send me data & forward up
       (if needed) */
    if (tree->tree_nextsize > 0) {
        ptrdiff_t true_extent, real_segment_size;
        true_extent = smpi_datatype_get_extent(datatype);

        /* handle non-existent recv buffer (i.e. it is NULL) and
           protect the recv buffer on non-root nodes */
        accumbuf = (char *)recvbuf;
        if ((NULL == accumbuf) || (root != rank)) {
            /* Allocate temporary accumulator buffer. */
            accumbuf_free = (char *)malloc(true_extent +
                                           (original_count - 1) * extent);
            if (accumbuf_free == NULL) {
                line = __LINE__; ret = -1; goto error_hndl;
            }
            accumbuf = accumbuf_free - lower_bound;
        }

        /* If this is a non-commutative operation we must copy
           sendbuf to the accumbuf, in order to simplify the loops */
        if (!smpi_op_is_commute(op)) {
            smpi_datatype_copy((char *)sendtmpbuf, original_count, datatype,
                               (char *)accumbuf, original_count, datatype);
        }
        /* Allocate two buffers for incoming segments */
        real_segment_size = true_extent + (count_by_segment - 1) * extent;
        inbuf_free[0] = (char *)malloc(real_segment_size);
        if (inbuf_free[0] == NULL) {
            line = __LINE__; ret = -1; goto error_hndl;
        }
        inbuf[0] = inbuf_free[0] - lower_bound;
        /* if there is chance to overlap communication - allocate second buffer */
        if ((num_segments > 1) || (tree->tree_nextsize > 1)) {
            inbuf_free[1] = (char *)malloc(real_segment_size);
            if (inbuf_free[1] == NULL) {
                line = __LINE__; ret = -1; goto error_hndl;
            }
            inbuf[1] = inbuf_free[1] - lower_bound;
        }

        /* reset input buffer index and receive count */
        inbi = 0;
        recvcount = 0;
        /* for each segment */
        for (segindex = 0; segindex <= num_segments; segindex++) {
            prevcount = recvcount;
            /* recvcount - number of elements in current segment */
            recvcount = count_by_segment;
            if (segindex == (num_segments - 1))
                recvcount = original_count - count_by_segment * segindex;

            /* for each child */
            for (i = 0; i < tree->tree_nextsize; i++) {
                /**
                 * We try to overlap communication:
                 * either with next segment or with the next child
                 */
                /* post irecv for current segindex on current child */
                if (segindex < num_segments) {
                    void *local_recvbuf = inbuf[inbi];
                    if (0 == i) {
                        /* for the first step (1st child per segment) and
                         * commutative operations we might be able to irecv
                         * directly into the accumulate buffer so that we can
                         * reduce(op) this with our sendbuf in one step as
                         * ompi_op_reduce only has two buffer pointers,
                         * this avoids an extra memory copy.
                         *
                         * BUT if the operation is non-commutative or
                         * we are root and are USING MPI_IN_PLACE this is wrong!
                         */
                        if (smpi_op_is_commute(op) &&
                            !((MPI_IN_PLACE == sendbuf) && (rank == tree->tree_root))) {
                            local_recvbuf = accumbuf + segindex * segment_increment;
                        }
                    }
                    reqs[inbi] = smpi_mpi_irecv(local_recvbuf, recvcount, datatype,
                                                tree->tree_next[i],
                                                COLL_TAG_REDUCE, comm);
                }
                /* wait for previous req to complete, if any.
                   if there are no requests reqs[inbi ^ 1] will be
                   MPI_REQUEST_NULL. */
                /* wait on data from last child for previous segment */
                smpi_mpi_waitall(1, &reqs[inbi ^ 1], MPI_STATUSES_IGNORE);
                local_op_buffer = inbuf[inbi ^ 1];
                if (i > 0) {
                    /* our first operation is to combine our own [sendbuf] data
                     * with the data we received from downstream (but only if
                     * the operation is commutative and if we are not root and
                     * not using MPI_IN_PLACE)
                     */
                    if (1 == i) {
                        if (smpi_op_is_commute(op) &&
                            !((MPI_IN_PLACE == sendbuf) && (rank == tree->tree_root))) {
                            local_op_buffer = sendtmpbuf + segindex * segment_increment;
                        }
                    }
                    /* apply operation */
                    smpi_op_apply(op, local_op_buffer,
                                  accumbuf + segindex * segment_increment,
                                  &recvcount, &datatype);
                } else if (segindex > 0) {
                    void *accumulator = accumbuf + (segindex - 1) * segment_increment;
                    if (tree->tree_nextsize <= 1) {
                        if (smpi_op_is_commute(op) &&
                            !((MPI_IN_PLACE == sendbuf) && (rank == tree->tree_root))) {
                            local_op_buffer = sendtmpbuf + (segindex - 1) * segment_increment;
                        }
                    }
                    smpi_op_apply(op, local_op_buffer, accumulator,
                                  &prevcount, &datatype);

                    /* all reduced on available data this step (i) complete,
                     * pass to the next process unless you are the root.
                     */
                    if (rank != tree->tree_root) {
                        /* send combined/accumulated data to parent */
                        smpi_mpi_send(accumulator, prevcount, datatype,
                                      tree->tree_prev, COLL_TAG_REDUCE, comm);
                    }

                    /* we stop when segindex == num_segments
                       (i.e. we do num_segments + 1 steps for pipelining) */
                    if (segindex == num_segments)
                        break;
                }

                /* update input buffer index */
                inbi = inbi ^ 1;
            } /* end of for each child */
        } /* end of for each segment */

        /* clean up */
        if (inbuf_free[0] != NULL) free(inbuf_free[0]);
        if (inbuf_free[1] != NULL) free(inbuf_free[1]);
        if (accumbuf_free != NULL) free(accumbuf_free);
    }

    /* leaf nodes
       Depending on the value of max_outstanding_reqs and
       the number of segments we have two options:
       - send all segments using blocking send to the parent, or
       - avoid flooding the parent nodes by limiting the number of
         outstanding requests to max_outstanding_reqs.
       TODO/POSSIBLE IMPROVEMENT: If there is a way to determine the eager
       size for the current communication, synchronization should be used
       only when the message/segment size is smaller than the eager size.
    */
    else {
        /* If the number of segments is less than a maximum number of
           outstanding requests or there is no limit on the maximum number of
           outstanding requests, we send data to the parent using blocking
           send */
        if ((0 == max_outstanding_reqs) ||
            (num_segments <= max_outstanding_reqs)) {
            segindex = 0;
            while (original_count > 0) {
                if (original_count < count_by_segment) {
                    count_by_segment = original_count;
                }
                smpi_mpi_send((char *)sendbuf + segindex * segment_increment,
                              count_by_segment, datatype, tree->tree_prev,
                              COLL_TAG_REDUCE, comm);
                segindex++;
                original_count -= count_by_segment;
            }
        }
        /* Otherwise, introduce flow control:
           - post max_outstanding_reqs non-blocking synchronous sends,
           - for remaining segments
             - wait for a ssend to complete, and post the next one.
           - wait for all outstanding sends to complete. */
        else {
            int creq = 0;
            MPI_Request *sreq = NULL;

            sreq = (MPI_Request *)calloc(max_outstanding_reqs,
                                         sizeof(MPI_Request));
            if (NULL == sreq) {
                line = __LINE__; ret = -1; goto error_hndl;
            }

            /* post first group of requests */
            for (segindex = 0; segindex < max_outstanding_reqs; segindex++) {
                sreq[segindex] = smpi_mpi_isend((char *)sendbuf +
                                                segindex * segment_increment,
                                                count_by_segment, datatype,
                                                tree->tree_prev,
                                                COLL_TAG_REDUCE, comm);
                original_count -= count_by_segment;
            }

            creq = 0;
            while (original_count > 0) {
                /* wait on a posted request to complete */
                smpi_mpi_wait(&sreq[creq], MPI_STATUS_IGNORE);
                sreq[creq] = MPI_REQUEST_NULL;

                if (original_count < count_by_segment) {
                    count_by_segment = original_count;
                }
                sreq[creq] = smpi_mpi_isend((char *)sendbuf +
                                            segindex * segment_increment,
                                            count_by_segment, datatype,
                                            tree->tree_prev,
                                            COLL_TAG_REDUCE, comm);
                creq = (creq + 1) % max_outstanding_reqs;
                segindex++;
                original_count -= count_by_segment;
            }

            /* Wait on the remaining requests to complete */
            smpi_mpi_waitall(max_outstanding_reqs, sreq, MPI_STATUSES_IGNORE);

            /* free requests */
            free(sreq);
        }
    }
    return MPI_SUCCESS;

error_hndl: /* error handler */
    XBT_DEBUG("ERROR_HNDL: node %d file %s line %d error %d\n",
              rank, __FILE__, line, ret);
    if (inbuf_free[0] != NULL) free(inbuf_free[0]);
    if (inbuf_free[1] != NULL) free(inbuf_free[1]);
    if (accumbuf_free != NULL) free(accumbuf_free);
    return ret;
}
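/*
 * Classic binomial-tree reduce.  For non-commutative operations the tree
 * is rooted at rank 0, so that reductions are applied in rank order, and
 * the result is forwarded from rank 0 to the real root afterwards.
 */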
int smpi_coll_tuned_reduce_binomial(void *sendbuf, void *recvbuf, int count,
                                    MPI_Datatype datatype, MPI_Op op, int root,
                                    MPI_Comm comm)
{
    MPI_Status status;
    int comm_size, rank;
    int mask, relrank, source;
    int dst;
    int tag = COLL_TAG_REDUCE;
    MPI_Aint extent;
    void *tmp_buf;
    MPI_Aint true_lb, true_extent;

    if (count == 0)
        return 0;
    rank = smpi_comm_rank(comm);
    comm_size = smpi_comm_size(comm);

    extent = smpi_datatype_get_extent(datatype);

    tmp_buf = (void *)smpi_get_tmp_sendbuffer(count * extent);
    int is_commutative = smpi_op_is_commute(op);
    mask = 1;

    int lroot;
    if (is_commutative)
        lroot = root;
    else
        lroot = 0;
    relrank = (rank - lroot + comm_size) % comm_size;

    smpi_datatype_extent(datatype, &true_lb, &true_extent);

    /* adjust for potential negative lower bound in datatype */
    tmp_buf = (void *)((char *)tmp_buf - true_lb);

    /* If I'm not the root, then my recvbuf may not be valid, therefore
       I have to allocate a temporary one */
    if (rank != root) {
        recvbuf = (void *)smpi_get_tmp_recvbuffer(count * (max(extent, true_extent)));
        recvbuf = (void *)((char *)recvbuf - true_lb);
    }
    if ((rank != root) || (sendbuf != MPI_IN_PLACE)) {
        smpi_datatype_copy(sendbuf, count, datatype, recvbuf, count, datatype);
    }

    while (mask < comm_size) {
        /* Receive */
        if ((mask & relrank) == 0) {
            source = (relrank | mask);
            if (source < comm_size) {
                source = (source + lroot) % comm_size;
                smpi_mpi_recv(tmp_buf, count, datatype, source, tag, comm, &status);

                if (is_commutative) {
                    smpi_op_apply(op, tmp_buf, recvbuf, &count, &datatype);
                } else {
                    smpi_op_apply(op, recvbuf, tmp_buf, &count, &datatype);
                    smpi_datatype_copy(tmp_buf, count, datatype,
                                       recvbuf, count, datatype);
                }
            }
        } else {
            dst = ((relrank & (~mask)) + lroot) % comm_size;
            smpi_mpi_send(recvbuf, count, datatype, dst, tag, comm);
            break;
        }
        mask <<= 1;
    }

    if (!is_commutative && (root != 0)) {
        if (rank == 0) {
            smpi_mpi_send(recvbuf, count, datatype, root, tag, comm);
        } else if (rank == root) {
            smpi_mpi_recv(recvbuf, count, datatype, 0, tag, comm, &status);
        }
    }

    if (rank != root) {
        smpi_free_tmp_buffer(recvbuf);
    }
    smpi_free_tmp_buffer(tmp_buf);

    return 0;
}
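/*
 * MPICH reduce_scatter, recursive-doubling variant, used for noncommutative
 * operations and/or irregular block sizes.  At each step a rank exchanges
 * everything except its partner's subtree with its partner, using a
 * two-block indexed datatype, and reduces what it received into
 * tmp_results.
 */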
int smpi_coll_tuned_reduce_scatter_mpich_rdb(void *sendbuf, void *recvbuf,
                                             int recvcounts[],
                                             MPI_Datatype datatype, MPI_Op op,
                                             MPI_Comm comm)
{
    int rank, comm_size, i;
    MPI_Aint extent, true_extent, true_lb;
    int *disps;
    void *tmp_recvbuf, *tmp_results;
    int mpi_errno = MPI_SUCCESS;
    int dis[2], blklens[2], total_count, dst;
    int mask, dst_tree_root, my_tree_root, j, k;
    int received;
    MPI_Datatype sendtype, recvtype;
    int nprocs_completed, tmp_mask, tree_root, is_commutative;

    comm_size = smpi_comm_size(comm);
    rank = smpi_comm_rank(comm);

    extent = smpi_datatype_get_extent(datatype);
    smpi_datatype_extent(datatype, &true_lb, &true_extent);

    is_commutative = smpi_op_is_commute(op);

    disps = (int *)xbt_malloc(comm_size * sizeof(int));

    total_count = 0;
    for (i = 0; i < comm_size; i++) {
        disps[i] = total_count;
        total_count += recvcounts[i];
    }

    /* noncommutative and (non-pof2 or block irregular), use recursive doubling. */

    /* need to allocate temporary buffer to receive incoming data */
    tmp_recvbuf = (void *)xbt_malloc(total_count * (max(true_extent, extent)));
    /* adjust for potential negative lower bound in datatype */
    tmp_recvbuf = (void *)((char *)tmp_recvbuf - true_lb);

    /* need to allocate another temporary buffer to accumulate results */
    tmp_results = (void *)xbt_malloc(total_count * (max(true_extent, extent)));
    /* adjust for potential negative lower bound in datatype */
    tmp_results = (void *)((char *)tmp_results - true_lb);

    /* copy sendbuf into tmp_results */
    if (sendbuf != MPI_IN_PLACE)
        mpi_errno = smpi_datatype_copy(sendbuf, total_count, datatype,
                                       tmp_results, total_count, datatype);
    else
        mpi_errno = smpi_datatype_copy(recvbuf, total_count, datatype,
                                       tmp_results, total_count, datatype);

    if (mpi_errno)
        return (mpi_errno);

    mask = 0x1;
    i = 0;
    while (mask < comm_size) {
        dst = rank ^ mask;

        dst_tree_root = dst >> i;
        dst_tree_root <<= i;

        my_tree_root = rank >> i;
        my_tree_root <<= i;

        /* At step 1, processes exchange (n-n/p) amount of
           data; at step 2, (n-2n/p) amount of data; at step 3, (n-4n/p)
           amount of data, and so forth. We use derived datatypes for this.

           At each step, a process does not need to send data
           indexed from my_tree_root to
           my_tree_root+mask-1. Similarly, a process won't receive
           data indexed from dst_tree_root to dst_tree_root+mask-1. */

        /* calculate sendtype */
        blklens[0] = blklens[1] = 0;
        for (j = 0; j < my_tree_root; j++)
            blklens[0] += recvcounts[j];
        for (j = my_tree_root + mask; j < comm_size; j++)
            blklens[1] += recvcounts[j];

        dis[0] = 0;
        dis[1] = blklens[0];
        for (j = my_tree_root; (j < my_tree_root + mask) && (j < comm_size); j++)
            dis[1] += recvcounts[j];

        mpi_errno = smpi_datatype_indexed(2, blklens, dis, datatype, &sendtype);
        if (mpi_errno)
            return (mpi_errno);

        smpi_datatype_commit(&sendtype);

        /* calculate recvtype */
        blklens[0] = blklens[1] = 0;
        for (j = 0; j < dst_tree_root && j < comm_size; j++)
            blklens[0] += recvcounts[j];
        for (j = dst_tree_root + mask; j < comm_size; j++)
            blklens[1] += recvcounts[j];

        dis[0] = 0;
        dis[1] = blklens[0];
        for (j = dst_tree_root; (j < dst_tree_root + mask) && (j < comm_size); j++)
            dis[1] += recvcounts[j];

        mpi_errno = smpi_datatype_indexed(2, blklens, dis, datatype, &recvtype);
        if (mpi_errno)
            return (mpi_errno);

        smpi_datatype_commit(&recvtype);

        received = 0;
        if (dst < comm_size) {
            /* tmp_results contains data to be sent in each step. Data is
               received in tmp_recvbuf and then accumulated into
               tmp_results. accumulation is done later below. */
            smpi_mpi_sendrecv(tmp_results, 1, sendtype, dst,
                              COLL_TAG_SCATTER, tmp_recvbuf, 1, recvtype, dst,
                              COLL_TAG_SCATTER, comm, MPI_STATUS_IGNORE);
            received = 1;
        }

        /* if some processes in this process's subtree in this step
           did not have any destination process to communicate with
           because of non-power-of-two, we need to send them the
           result. We use a logarithmic recursive-halving algorithm
           for this. */
        if (dst_tree_root + mask > comm_size) {
            nprocs_completed = comm_size - my_tree_root - mask;
            /* nprocs_completed is the number of processes in this
               subtree that have all the data. Send data to others
               in a tree fashion. First find root of current tree
               that is being divided into two. k is the number of
               least-significant bits in this process's rank that
               must be zeroed out to find the rank of the root */
            j = mask;
            k = 0;
            while (j) {
                j >>= 1;
                k++;
            }
            k--;

            tmp_mask = mask >> 1;
            while (tmp_mask) {
                dst = rank ^ tmp_mask;

                tree_root = rank >> k;
                tree_root <<= k;

                /* send only if this proc has data and destination
                   doesn't have data. at any step, multiple processes
                   can send if they have the data */
                if ((dst > rank) &&
                    (rank < tree_root + nprocs_completed) &&
                    (dst >= tree_root + nprocs_completed)) {
                    /* send the current result */
                    smpi_mpi_send(tmp_recvbuf, 1, recvtype, dst,
                                  COLL_TAG_SCATTER, comm);
                }
                /* recv only if this proc. doesn't have data and sender
                   has data */
                else if ((dst < rank) &&
                         (dst < tree_root + nprocs_completed) &&
                         (rank >= tree_root + nprocs_completed)) {
                    smpi_mpi_recv(tmp_recvbuf, 1, recvtype, dst,
                                  COLL_TAG_SCATTER, comm, MPI_STATUS_IGNORE);
                    received = 1;
                }
                tmp_mask >>= 1;
                k--;
            }
        }

        /* The following reduction is done here instead of after
           the MPIC_Sendrecv_ft or MPIC_Recv_ft above. This is
           because to do it above, in the noncommutative case, we would
           need an extra temp buffer so as not to overwrite
           temp_recvbuf, because temp_recvbuf may have to be communicated
           to other processes in the non-power-of-two case. To avoid that
           extra allocation, we do the reduce here. */
        if (received) {
            if (is_commutative || (dst_tree_root < my_tree_root)) {
                smpi_op_apply(op, tmp_recvbuf, tmp_results,
                              &blklens[0], &datatype);
                smpi_op_apply(op, ((char *)tmp_recvbuf + dis[1] * extent),
                              ((char *)tmp_results + dis[1] * extent),
                              &blklens[1], &datatype);
            } else {
                smpi_op_apply(op, tmp_results, tmp_recvbuf,
                              &blklens[0], &datatype);
                smpi_op_apply(op, ((char *)tmp_results + dis[1] * extent),
                              ((char *)tmp_recvbuf + dis[1] * extent),
                              &blklens[1], &datatype);
                /* copy result back into tmp_results */
                mpi_errno = smpi_datatype_copy(tmp_recvbuf, 1, recvtype,
                                               tmp_results, 1, recvtype);
                if (mpi_errno)
                    return (mpi_errno);
            }
        }

        //smpi_datatype_free(&sendtype);
        //smpi_datatype_free(&recvtype);

        mask <<= 1;
        i++;
    }

    /* copy this process's portion of the result from tmp_results back
       into recvbuf (standard final step of the MPICH algorithm) */
    if (recvcounts[rank]) {
        mpi_errno = smpi_datatype_copy((char *)tmp_results + disps[rank] * extent,
                                       recvcounts[rank], datatype,
                                       recvbuf, recvcounts[rank], datatype);
        if (mpi_errno)
            return (mpi_errno);
    }

    /* undo the true_lb adjustment before freeing the temporaries */
    xbt_free(disps);
    xbt_free((char *)tmp_recvbuf + true_lb);
    xbt_free((char *)tmp_results + true_lb);
    return mpi_errno;
}
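/*
 * MPICH reduce_scatter, pairwise-exchange variant: comm_size - 1 steps in
 * which rank r sends the block destined to (r + i) and receives its own
 * block from (r - i), reducing incoming data into its result as it goes.
 * Also handles the MPI_IN_PLACE case, where the result is moved to the
 * beginning of recvbuf at the end.
 */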
int smpi_coll_tuned_reduce_scatter_mpich_pair(void *sendbuf, void *recvbuf,
                                              int recvcounts[],
                                              MPI_Datatype datatype, MPI_Op op,
                                              MPI_Comm comm)
{
    int rank, comm_size, i;
    MPI_Aint extent, true_extent, true_lb;
    int *disps;
    void *tmp_recvbuf;
    int mpi_errno = MPI_SUCCESS;
    int total_count, dst, src;
    int is_commutative;

    comm_size = smpi_comm_size(comm);
    rank = smpi_comm_rank(comm);

    extent = smpi_datatype_get_extent(datatype);
    smpi_datatype_extent(datatype, &true_lb, &true_extent);

    is_commutative = smpi_op_is_commute(op);

    disps = (int *)xbt_malloc(comm_size * sizeof(int));

    total_count = 0;
    for (i = 0; i < comm_size; i++) {
        disps[i] = total_count;
        total_count += recvcounts[i];
    }

    if (total_count == 0) {
        xbt_free(disps);
        return MPI_ERR_COUNT;
    }

    if (sendbuf != MPI_IN_PLACE) {
        /* copy local data into recvbuf */
        smpi_datatype_copy(((char *)sendbuf + disps[rank] * extent),
                           recvcounts[rank], datatype, recvbuf,
                           recvcounts[rank], datatype);
    }

    /* allocate temporary buffer to store incoming data */
    tmp_recvbuf = (void *)xbt_malloc(recvcounts[rank] * (max(true_extent, extent)) + 1);
    /* adjust for potential negative lower bound in datatype */
    tmp_recvbuf = (void *)((char *)tmp_recvbuf - true_lb);

    for (i = 1; i < comm_size; i++) {
        src = (rank - i + comm_size) % comm_size;
        dst = (rank + i) % comm_size;

        /* send the data that dst needs. recv data that this process
           needs from src into tmp_recvbuf */
        if (sendbuf != MPI_IN_PLACE)
            smpi_mpi_sendrecv(((char *)sendbuf + disps[dst] * extent),
                              recvcounts[dst], datatype, dst,
                              COLL_TAG_SCATTER, tmp_recvbuf,
                              recvcounts[rank], datatype, src,
                              COLL_TAG_SCATTER, comm, MPI_STATUS_IGNORE);
        else
            smpi_mpi_sendrecv(((char *)recvbuf + disps[dst] * extent),
                              recvcounts[dst], datatype, dst,
                              COLL_TAG_SCATTER, tmp_recvbuf,
                              recvcounts[rank], datatype, src,
                              COLL_TAG_SCATTER, comm, MPI_STATUS_IGNORE);

        if (is_commutative || (src < rank)) {
            if (sendbuf != MPI_IN_PLACE) {
                smpi_op_apply(op, tmp_recvbuf, recvbuf,
                              &recvcounts[rank], &datatype);
            } else {
                smpi_op_apply(op, tmp_recvbuf,
                              ((char *)recvbuf + disps[rank] * extent),
                              &recvcounts[rank], &datatype);
                /* we can't store the result at the beginning of
                   recvbuf right here because there is useful data there
                   that other process/processes need. at the end, we will
                   copy back the result to the beginning of recvbuf. */
            }
        } else {
            if (sendbuf != MPI_IN_PLACE) {
                smpi_op_apply(op, recvbuf, tmp_recvbuf,
                              &recvcounts[rank], &datatype);
                /* copy result back into recvbuf */
                mpi_errno = smpi_datatype_copy(tmp_recvbuf, recvcounts[rank],
                                               datatype, recvbuf,
                                               recvcounts[rank], datatype);
                if (mpi_errno)
                    return (mpi_errno);
            } else {
                smpi_op_apply(op, ((char *)recvbuf + disps[rank] * extent),
                              tmp_recvbuf, &recvcounts[rank], &datatype);
                /* copy result back into recvbuf */
                mpi_errno = smpi_datatype_copy(tmp_recvbuf, recvcounts[rank],
                                               datatype,
                                               ((char *)recvbuf + disps[rank] * extent),
                                               recvcounts[rank], datatype);
                if (mpi_errno)
                    return (mpi_errno);
            }
        }
    }

    /* if MPI_IN_PLACE, move output data to the beginning of
       recvbuf. already done for rank 0. */
    if ((sendbuf == MPI_IN_PLACE) && (rank != 0)) {
        mpi_errno = smpi_datatype_copy(((char *)recvbuf + disps[rank] * extent),
                                       recvcounts[rank], datatype,
                                       recvbuf, recvcounts[rank], datatype);
        if (mpi_errno)
            return (mpi_errno);
    }

    /* undo the true_lb adjustment before freeing the temporaries */
    xbt_free(disps);
    xbt_free((char *)tmp_recvbuf + true_lb);
    return MPI_SUCCESS;
}
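/*
 * MVAPICH2 two-level (SMP-aware) reduce: an intra-node reduce towards each
 * node leader over the shared-memory communicator, an inter-leader reduce
 * towards the leader of the root's node, and a final point-to-point
 * transfer when the root itself is not a leader.
 */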
int smpi_coll_tuned_reduce_mvapich2_two_level(void *sendbuf, void *recvbuf,
                                              int count, MPI_Datatype datatype,
                                              MPI_Op op, int root, MPI_Comm comm)
{
    int mpi_errno = MPI_SUCCESS;
    int my_rank, total_size, local_rank, local_size;
    int leader_comm_rank = -1, leader_comm_size = 0;
    MPI_Comm shmem_comm, leader_comm;
    int leader_root, leader_of_root;
    void *in_buf = NULL, *out_buf = NULL, *tmp_buf = NULL;
    MPI_Aint true_lb, true_extent, extent;
    int is_commutative = 0, stride = 0;
    int intra_node_root = 0;
    /* reduce variant used in the single-node path */
    int (*reduce_fn)(void *, void *, int, MPI_Datatype, MPI_Op, int, MPI_Comm);

    //if not set (use of the algo directly, without mvapich2 selector)
    if (MV2_Reduce_function == NULL)
        MV2_Reduce_function = smpi_coll_tuned_reduce_mpich;
    if (MV2_Reduce_intra_function == NULL)
        MV2_Reduce_intra_function = smpi_coll_tuned_reduce_mpich;

    if (smpi_comm_get_leaders_comm(comm) == MPI_COMM_NULL) {
        smpi_comm_init_smp(comm);
    }

    my_rank = smpi_comm_rank(comm);
    total_size = smpi_comm_size(comm);
    shmem_comm = smpi_comm_get_intra_comm(comm);
    local_rank = smpi_comm_rank(shmem_comm);
    local_size = smpi_comm_size(shmem_comm);

    leader_comm = smpi_comm_get_leaders_comm(comm);
    int *leaders_map = smpi_comm_get_leaders_map(comm);
    leader_of_root = smpi_group_rank(smpi_comm_group(comm), leaders_map[root]);
    leader_root = smpi_group_rank(smpi_comm_group(leader_comm), leaders_map[root]);

    is_commutative = smpi_op_is_commute(op);

    smpi_datatype_extent(datatype, &true_lb, &true_extent);
    extent = smpi_datatype_get_extent(datatype);
    stride = count * MAX(extent, true_extent);

    if (local_size == total_size) {
        /* First handle the case where there is only one node */
        if (stride <= MV2_INTRA_SHMEM_REDUCE_MSG && is_commutative == 1) {
            if (local_rank == 0) {
                tmp_buf = (void *)smpi_get_tmp_sendbuffer(count * (MAX(extent, true_extent)));
                tmp_buf = (void *)((char *)tmp_buf - true_lb);
            }

            if (sendbuf != MPI_IN_PLACE) {
                in_buf = (void *)sendbuf;
            } else {
                in_buf = recvbuf;
            }

            if (local_rank == 0) {
                if (my_rank != root) {
                    out_buf = tmp_buf;
                } else {
                    out_buf = recvbuf;
                    if (in_buf == out_buf) {
                        in_buf = MPI_IN_PLACE;
                        out_buf = recvbuf;
                    }
                }
            } else {
                in_buf = (void *)sendbuf;
                out_buf = NULL;
            }

            if (count * (MAX(extent, true_extent)) < SHMEM_COLL_BLOCK_SIZE) {
                mpi_errno = MPIR_Reduce_shmem_MV2(in_buf, out_buf, count,
                                                  datatype, op, 0, shmem_comm);
            } else {
                mpi_errno = MPIR_Reduce_intra_knomial_wrapper_MV2(in_buf, out_buf,
                                                                  count, datatype, op,
                                                                  0, shmem_comm);
            }

            if (local_rank == 0 && root != my_rank) {
                smpi_mpi_send(out_buf, count, datatype, root,
                              COLL_TAG_REDUCE + 1, comm);
            }
            if ((local_rank != 0) && (root == my_rank)) {
                smpi_mpi_recv(recvbuf, count, datatype,
                              leader_of_root, COLL_TAG_REDUCE + 1, comm,
                              MPI_STATUS_IGNORE);
            }
        } else {
            if (mv2_use_knomial_reduce == 1) {
                reduce_fn = &MPIR_Reduce_intra_knomial_wrapper_MV2;
            } else {
                reduce_fn = &MPIR_Reduce_binomial_MV2;
            }
            mpi_errno = reduce_fn(sendbuf, recvbuf, count,
                                  datatype, op, root, comm);
        }
        /* We are done */
        if (tmp_buf != NULL)
            smpi_free_tmp_buffer((void *)((char *)tmp_buf + true_lb));
        goto fn_exit;
    }

    if (local_rank == 0) {
        leader_comm = smpi_comm_get_leaders_comm(comm);
        if (leader_comm == MPI_COMM_NULL) {
            leader_comm = MPI_COMM_WORLD;
        }
        leader_comm_size = smpi_comm_size(leader_comm);
        leader_comm_rank = smpi_comm_rank(leader_comm);
        tmp_buf = (void *)smpi_get_tmp_sendbuffer(count * (MAX(extent, true_extent)));
        tmp_buf = (void *)((char *)tmp_buf - true_lb);
    }
    if (sendbuf != MPI_IN_PLACE) {
        in_buf = (void *)sendbuf;
    } else {
        in_buf = recvbuf;
    }
    if (local_rank == 0) {
        out_buf = tmp_buf;
    } else {
        out_buf = NULL;
    }

    if (local_size > 1) {
        /* Let's do the intra-node reduce operations, if we have more than one
         * process in the node */

        /* Fix the input and output buffers for the intra-node reduce.
         * Node leaders will have the reduced data in tmp_buf after
         * this step */
        if (MV2_Reduce_intra_function == &MPIR_Reduce_shmem_MV2) {
            if (is_commutative == 1 &&
                (count * (MAX(extent, true_extent)) < SHMEM_COLL_BLOCK_SIZE)) {
                mpi_errno = MV2_Reduce_intra_function(in_buf, out_buf, count,
                                                      datatype, op,
                                                      intra_node_root, shmem_comm);
            } else {
                mpi_errno = MPIR_Reduce_intra_knomial_wrapper_MV2(in_buf, out_buf,
                                                                  count, datatype, op,
                                                                  intra_node_root, shmem_comm);
            }
        } else {
            mpi_errno = MV2_Reduce_intra_function(in_buf, out_buf, count,
                                                  datatype, op,
                                                  intra_node_root, shmem_comm);
        }
    } else {
        smpi_free_tmp_buffer((void *)((char *)tmp_buf + true_lb));
        tmp_buf = in_buf;
    }

    /* Now work on the inter-leader phase. Data is in tmp_buf */
    if (local_rank == 0 && leader_comm_size > 1) {
        /* The leader of root will have the global reduced data in tmp_buf
           or recv_buf at the end of the reduce */
        if (leader_comm_rank == leader_root) {
            if (my_rank == root) {
                /* I am the root of the leader-comm, and the
                 * root of the reduce op. So, I will write the
                 * final result directly into my recvbuf */
                if (tmp_buf != recvbuf) {
                    in_buf = tmp_buf;
                    out_buf = recvbuf;
                } else {
                    in_buf = (char *)smpi_get_tmp_sendbuffer(count * smpi_datatype_get_extent(datatype));
                    smpi_datatype_copy(tmp_buf, count, datatype,
                                       in_buf, count, datatype);
                    //in_buf = MPI_IN_PLACE;
                    out_buf = recvbuf;
                }
            } else {
                in_buf = (char *)smpi_get_tmp_sendbuffer(count * smpi_datatype_get_extent(datatype));
                smpi_datatype_copy(tmp_buf, count, datatype,
                                   in_buf, count, datatype);
                //in_buf = MPI_IN_PLACE;
                out_buf = tmp_buf;
            }
        } else {
            in_buf = tmp_buf;
            out_buf = NULL;
        }

        /* inter-leader communication */
        mpi_errno = MV2_Reduce_function(in_buf, out_buf, count,
                                        datatype, op,
                                        leader_root, leader_comm);
    }

    if (local_size > 1) {
        /* Send the message to the root if the leader is not the
         * root of the reduce operation. The reduced data is in tmp_buf */
        if ((local_rank == 0) && (root != my_rank) &&
            (leader_root == leader_comm_rank)) {
            smpi_mpi_send(tmp_buf, count, datatype, root,
                          COLL_TAG_REDUCE + 1, comm);
        }
        if ((local_rank != 0) && (root == my_rank)) {
            smpi_mpi_recv(recvbuf, count, datatype,
                          leader_of_root, COLL_TAG_REDUCE + 1, comm,
                          MPI_STATUS_IGNORE);
        }
        smpi_free_tmp_buffer((void *)((char *)tmp_buf + true_lb));

        if (leader_comm_rank == leader_root) {
            if (my_rank != root || (my_rank == root && tmp_buf == recvbuf)) {
                smpi_free_tmp_buffer(in_buf);
            }
        }
    }

  fn_exit:
    return mpi_errno;
}