// Allgather: Non-Topology-Specific Logical-Ring algorithm
int smpi_coll_tuned_allgather_NTSLR_NB(void *sbuf, int scount, MPI_Datatype stype,
                                       void *rbuf, int rcount, MPI_Datatype rtype,
                                       MPI_Comm comm)
{
  MPI_Aint rextent, sextent;
  MPI_Status status, status2;
  int i, to, from, rank, size;
  int send_offset, recv_offset;
  int tag = COLL_TAG_ALLGATHER;

  rank = smpi_comm_rank(comm);
  size = smpi_comm_size(comm);
  rextent = smpi_datatype_get_extent(rtype);
  sextent = smpi_datatype_get_extent(stype);
  MPI_Request *rrequest_array;
  MPI_Request *srequest_array;

  // irregular case: use default MPI functions
  if (scount * sextent != rcount * rextent) {
    XBT_WARN("MPI_allgather_NTSLR_NB use default MPI_allgather.");
    smpi_mpi_allgather(sbuf, scount, stype, rbuf, rcount, rtype, comm);
    return MPI_SUCCESS;
  }

  rrequest_array = (MPI_Request *) xbt_malloc(size * sizeof(MPI_Request));
  srequest_array = (MPI_Request *) xbt_malloc(size * sizeof(MPI_Request));

  // topology non-specific
  to = (rank + 1) % size;
  from = (rank + size - 1) % size;

  // copy a single segment from sbuf to rbuf
  send_offset = rank * scount * sextent;
  smpi_mpi_sendrecv(sbuf, scount, stype, rank, tag,
                    (char *)rbuf + send_offset, rcount, rtype, rank, tag,
                    comm, &status);

  // start sending logical ring message
  int increment = scount * sextent;

  // post all irecv first
  for (i = 0; i < size - 1; i++) {
    recv_offset = ((rank - i - 1 + size) % size) * increment;
    rrequest_array[i] = smpi_mpi_irecv((char *)rbuf + recv_offset, rcount, rtype,
                                       from, tag + i, comm);
  }

  for (i = 0; i < size - 1; i++) {
    send_offset = ((rank - i + size) % size) * increment;
    srequest_array[i] = smpi_mpi_isend((char *)rbuf + send_offset, scount, stype,
                                       to, tag + i, comm);
    smpi_mpi_wait(&rrequest_array[i], &status);
    smpi_mpi_wait(&srequest_array[i], &status2);
  }

  free(rrequest_array);
  free(srequest_array);
  return MPI_SUCCESS;
}
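/* Illustrative sketch (not part of SMPI): the logical-ring index arithmetic
 * that the allgather above relies on, reduced to plain integers with no MPI
 * calls.  At step i, rank r forwards block (r - i) mod size to its right
 * neighbour while receiving block (r - i - 1) mod size from its left
 * neighbour, so after size - 1 steps every rank holds all blocks.  The
 * communicator size of 4 is just an example value. */
#include <stdio.h>

int main(void)
{
  const int size = 4;
  for (int rank = 0; rank < size; rank++) {
    int to   = (rank + 1) % size;
    int from = (rank + size - 1) % size;
    printf("rank %d: sends to %d, receives from %d\n", rank, to, from);
    for (int i = 0; i < size - 1; i++) {
      int send_block = (rank - i + size) % size;      /* block forwarded this step */
      int recv_block = (rank - i - 1 + size) % size;  /* block arriving this step  */
      printf("  step %d: send block %d, recv block %d\n", i, send_block, recv_block);
    }
  }
  return 0;
}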
static void action_wait(const char *const *action){ double clock = smpi_process_simulated_elapsed(); MPI_Request request; MPI_Status status; smpi_replay_globals_t globals = (smpi_replay_globals_t) smpi_process_get_user_data(); xbt_assert(xbt_dynar_length(globals->irecvs), "action wait not preceded by any irecv: %s", xbt_str_join_array(action," ")); request = xbt_dynar_pop_as(globals->irecvs,MPI_Request); #ifdef HAVE_TRACING int rank = request && request->comm != MPI_COMM_NULL ? smpi_comm_rank(request->comm) : -1; TRACE_smpi_computing_out(rank); MPI_Group group = smpi_comm_group(request->comm); int src_traced = smpi_group_rank(group, request->src); int dst_traced = smpi_group_rank(group, request->dst); int is_wait_for_receive = request->recv; TRACE_smpi_ptp_in(rank, src_traced, dst_traced, __FUNCTION__); #endif smpi_mpi_wait(&request, &status); #ifdef HAVE_TRACING TRACE_smpi_ptp_out(rank, src_traced, dst_traced, __FUNCTION__); if (is_wait_for_receive) { TRACE_smpi_recv(rank, src_traced, dst_traced); } TRACE_smpi_computing_in(rank); #endif log_timed_action (action, clock); }
static void action_wait(const char *const *action){
  CHECK_ACTION_PARAMS(action, 0, 0);
  double clock = smpi_process_simulated_elapsed();
  MPI_Request request;
  MPI_Status status;

  xbt_assert(xbt_dynar_length(get_reqq_self()),
             "action wait not preceded by any irecv or isend: %s",
             xbt_str_join_array(action," "));
  request = xbt_dynar_pop_as(get_reqq_self(),MPI_Request);

  if (!request){
    /* Assuming that the trace is well formed, this means the comm might have
     * been caught by an MPI_Test. Then just return. */
    return;
  }

  int rank = request->comm != MPI_COMM_NULL ? smpi_comm_rank(request->comm) : -1;

  MPI_Group group = smpi_comm_group(request->comm);
  int src_traced = smpi_group_rank(group, request->src);
  int dst_traced = smpi_group_rank(group, request->dst);
  int is_wait_for_receive = request->recv;
  instr_extra_data extra = xbt_new0(s_instr_extra_data_t,1);
  extra->type = TRACING_WAIT;
  TRACE_smpi_ptp_in(rank, src_traced, dst_traced, __FUNCTION__, extra);

  smpi_mpi_wait(&request, &status);

  TRACE_smpi_ptp_out(rank, src_traced, dst_traced, __FUNCTION__);
  if (is_wait_for_receive)
    TRACE_smpi_recv(rank, src_traced, dst_traced);
  log_timed_action (action, clock);
}
/* Non-topology-specific pipelined linear-bcast function */ int smpi_coll_tuned_bcast_arrival_pattern_aware_wait(void *buf, int count, MPI_Datatype datatype, int root, MPI_Comm comm) { MPI_Status status; MPI_Request request; MPI_Request *send_request_array; MPI_Request *recv_request_array; MPI_Status *send_status_array; MPI_Status *recv_status_array; MPI_Status temp_status_array[BCAST_ARRIVAL_PATTERN_AWARE_MAX_NODE]; int rank, size; int i, j, k; int tag = -COLL_TAG_BCAST; int will_send[BCAST_ARRIVAL_PATTERN_AWARE_MAX_NODE]; int sent_count; int header_index; int flag_array[BCAST_ARRIVAL_PATTERN_AWARE_MAX_NODE]; int already_sent[BCAST_ARRIVAL_PATTERN_AWARE_MAX_NODE]; int header_buf[BCAST_ARRIVAL_PATTERN_AWARE_HEADER_SIZE]; char temp_buf[BCAST_ARRIVAL_PATTERN_AWARE_MAX_NODE]; int max_node = BCAST_ARRIVAL_PATTERN_AWARE_MAX_NODE; int header_size = BCAST_ARRIVAL_PATTERN_AWARE_HEADER_SIZE; MPI_Aint extent; extent = smpi_datatype_get_extent(datatype); /* source and destination */ int to, from; rank = smpi_comm_rank(MPI_COMM_WORLD); size = smpi_comm_size(MPI_COMM_WORLD); /* segment is segment size in number of elements (not bytes) */ int segment = bcast_arrival_pattern_aware_wait_segment_size_in_byte / extent; /* pipeline length */ int pipe_length = count / segment; /* use for buffer offset for sending and receiving data = segment size in byte */ int increment = segment * extent; /* if the input size is not divisible by segment size => the small remainder will be done with native implementation */ int remainder = count % segment; /* if root is not zero send to rank zero first this can be modified to make it faster by using logical src, dst. */ if (root != 0) { if (rank == root) { smpi_mpi_send(buf, count, datatype, 0, tag, comm); } else if (rank == 0) { smpi_mpi_recv(buf, count, datatype, root, tag, comm, &status); } } /* value == 0 means root has not send data (or header) to the node yet */ for (i = 0; i < max_node; i++) { already_sent[i] = 0; } /* when a message is smaller than a block size => no pipeline */ if (count <= segment) { segment = count; pipe_length = 1; } /* start pipeline bcast */ send_request_array = (MPI_Request *) xbt_malloc((size + pipe_length) * sizeof(MPI_Request)); recv_request_array = (MPI_Request *) xbt_malloc((size + pipe_length) * sizeof(MPI_Request)); send_status_array = (MPI_Status *) xbt_malloc((size + pipe_length) * sizeof(MPI_Status)); recv_status_array = (MPI_Status *) xbt_malloc((size + pipe_length) * sizeof(MPI_Status)); /* root */ if (rank == 0) { sent_count = 0; int iteration = 0; for (i = 0; i < BCAST_ARRIVAL_PATTERN_AWARE_MAX_NODE; i++) will_send[i] = 0; while (sent_count < (size - 1)) { iteration++; /* loop k times to let more processes arrive before start sending data */ for (k = 0; k < 3; k++) { for (i = 1; i < size; i++) { if ((already_sent[i] == 0) && (will_send[i] == 0)) { smpi_mpi_iprobe(i, MPI_ANY_TAG, MPI_COMM_WORLD, &flag_array[i], &temp_status_array[i]); if (flag_array[i] == 1) { will_send[i] = 1; smpi_mpi_recv(&temp_buf[i], 1, MPI_CHAR, i, tag, MPI_COMM_WORLD, &status); i = 0; } } } } header_index = 0; /* recv 1-byte message */ for (i = 1; i < size; i++) { /* message arrive */ if ((will_send[i] == 1) && (already_sent[i] == 0)) { header_buf[header_index] = i; header_index++; sent_count++; /* will send in the next step */ already_sent[i] = 1; } } /* send header followed by data */ if (header_index != 0) { header_buf[header_index] = -1; to = header_buf[0]; /* send header */ smpi_mpi_send(header_buf, header_size, MPI_INT, to, tag, comm); /* send data - 
pipeline */ for (i = 0; i < pipe_length; i++) { send_request_array[i] = smpi_mpi_isend((char *)buf + (i * increment), segment, datatype, to, tag, comm); } smpi_mpi_waitall((pipe_length), send_request_array, send_status_array); } /* end - send header followed by data */ /* randomly MPI_Send to one node */ /* this part has been commented out - performance-wise */ else if (2 == 3) { /* search for the first node that never received data before */ for (i = 0; i < size; i++) { if (i == root) continue; if (already_sent[i] == 0) { header_buf[0] = i; header_buf[1] = -1; to = i; smpi_mpi_send(header_buf, header_size, MPI_INT, to, tag, comm); /* still need to chop data so that we can use the same non-root code */ for (j = 0; j < pipe_length; j++) { smpi_mpi_send((char *)buf + (j * increment), segment, datatype, to, tag, comm); } } } } } /* end - while (send_count < size-1) loop */ } /* end - root */ /* none root */ else { /* send 1-byte message to root */ smpi_mpi_send(temp_buf, 1, MPI_CHAR, 0, tag, comm); /* wait for header forward when required */ request = smpi_mpi_irecv(header_buf, header_size, MPI_INT, MPI_ANY_SOURCE, tag, comm); smpi_mpi_wait(&request, MPI_STATUS_IGNORE); /* search for where it is */ int myordering = 0; while (rank != header_buf[myordering]) { myordering++; } to = header_buf[myordering + 1]; if (myordering == 0) { from = 0; } else { from = header_buf[myordering - 1]; } /* send header when required */ if (to != -1) { smpi_mpi_send(header_buf, header_size, MPI_INT, to, tag, comm); } /* receive data */ for (i = 0; i < pipe_length; i++) { recv_request_array[i] = smpi_mpi_irecv((char *)buf + (i * increment), segment, datatype, from, tag, comm); } /* forward data */ if (to != -1) { for (i = 0; i < pipe_length; i++) { smpi_mpi_wait(&recv_request_array[i], MPI_STATUS_IGNORE); send_request_array[i] = smpi_mpi_isend((char *)buf + (i * increment), segment, datatype, to, tag, comm); } smpi_mpi_waitall((pipe_length), send_request_array, send_status_array); } /* recv only */ else { smpi_mpi_waitall((pipe_length), recv_request_array, recv_status_array); } } free(send_request_array); free(recv_request_array); free(send_status_array); free(recv_status_array); /* end pipeline */ /* when count is not divisible by block size, use default BCAST for the remainder */ if ((remainder != 0) && (count > segment)) { XBT_WARN("MPI_bcast_arrival_pattern_aware_wait use default MPI_bcast."); smpi_mpi_bcast((char *)buf + (pipe_length * increment), remainder, datatype, root, comm); } return MPI_SUCCESS; }
int smpi_coll_tuned_bcast_ompi_pipeline( void* buffer, int original_count, MPI_Datatype datatype, int root, MPI_Comm comm) { int count_by_segment = original_count; size_t type_size; int segsize =1024 << 7; //mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module; //mca_coll_tuned_comm_t *data = tuned_module->tuned_data; // return ompi_coll_tuned_bcast_intra_generic( buffer, count, datatype, root, comm, module, // count_by_segment, data->cached_pipeline ); ompi_coll_tree_t * tree = ompi_coll_tuned_topo_build_chain( 1, comm, root ); int i; int rank, size; int segindex; int num_segments; /* Number of segments */ int sendcount; /* number of elements sent in this segment */ size_t realsegsize; char *tmpbuf; ptrdiff_t extent; MPI_Request recv_reqs[2] = {MPI_REQUEST_NULL, MPI_REQUEST_NULL}; MPI_Request *send_reqs = NULL; int req_index; /** * Determine number of elements sent per operation. */ type_size = smpi_datatype_size(datatype); size = smpi_comm_size(comm); rank = smpi_comm_rank(comm); xbt_assert( size > 1 ); const double a_p16 = 3.2118e-6; /* [1 / byte] */ const double b_p16 = 8.7936; const double a_p64 = 2.3679e-6; /* [1 / byte] */ const double b_p64 = 1.1787; const double a_p128 = 1.6134e-6; /* [1 / byte] */ const double b_p128 = 2.1102; size_t message_size; /* else we need data size for decision function */ message_size = type_size * (unsigned long)original_count; /* needed for decision */ if (size < (a_p128 * message_size + b_p128)) { //Pipeline with 128KB segments segsize = 1024 << 7; }else if (size < (a_p64 * message_size + b_p64)) { // Pipeline with 64KB segments segsize = 1024 << 6; }else if (size < (a_p16 * message_size + b_p16)) { //Pipeline with 16KB segments segsize = 1024 << 4; } COLL_TUNED_COMPUTED_SEGCOUNT( segsize, type_size, count_by_segment ); XBT_DEBUG("coll:tuned:bcast_intra_pipeline rank %d ss %5d type_size %lu count_by_segment %d", smpi_comm_rank(comm), segsize, (unsigned long)type_size, count_by_segment); extent = smpi_datatype_get_extent (datatype); num_segments = (original_count + count_by_segment - 1) / count_by_segment; realsegsize = count_by_segment * extent; /* Set the buffer pointers */ tmpbuf = (char *) buffer; if( tree->tree_nextsize != 0 ) { send_reqs = xbt_new(MPI_Request, tree->tree_nextsize ); } /* Root code */ if( rank == root ) { /* For each segment: - send segment to all children. The last segment may have less elements than other segments. */ sendcount = count_by_segment; for( segindex = 0; segindex < num_segments; segindex++ ) { if( segindex == (num_segments - 1) ) { sendcount = original_count - segindex * count_by_segment; } for( i = 0; i < tree->tree_nextsize; i++ ) { send_reqs[i] = smpi_mpi_isend(tmpbuf, sendcount, datatype, tree->tree_next[i], COLL_TAG_BCAST, comm); } /* complete the sends before starting the next sends */ smpi_mpi_waitall( tree->tree_nextsize, send_reqs, MPI_STATUSES_IGNORE ); /* update tmp buffer */ tmpbuf += realsegsize; } } /* Intermediate nodes code */ else if( tree->tree_nextsize > 0 ) { /* Create the pipeline. 1) Post the first receive 2) For segments 1 .. num_segments - post new receive - wait on the previous receive to complete - send this data to children 3) Wait on the last segment 4) Compute number of elements in last segment. 
5) Send the last segment to children */ req_index = 0; recv_reqs[req_index]=smpi_mpi_irecv(tmpbuf, count_by_segment, datatype, tree->tree_prev, COLL_TAG_BCAST, comm); for( segindex = 1; segindex < num_segments; segindex++ ) { req_index = req_index ^ 0x1; /* post new irecv */ recv_reqs[req_index]= smpi_mpi_irecv( tmpbuf + realsegsize, count_by_segment, datatype, tree->tree_prev, COLL_TAG_BCAST, comm); /* wait for and forward the previous segment to children */ smpi_mpi_wait( &recv_reqs[req_index ^ 0x1], MPI_STATUSES_IGNORE ); for( i = 0; i < tree->tree_nextsize; i++ ) { send_reqs[i]=smpi_mpi_isend(tmpbuf, count_by_segment, datatype, tree->tree_next[i], COLL_TAG_BCAST, comm ); } /* complete the sends before starting the next iteration */ smpi_mpi_waitall( tree->tree_nextsize, send_reqs, MPI_STATUSES_IGNORE ); /* Update the receive buffer */ tmpbuf += realsegsize; } /* Process the last segment */ smpi_mpi_wait( &recv_reqs[req_index], MPI_STATUSES_IGNORE ); sendcount = original_count - (num_segments - 1) * count_by_segment; for( i = 0; i < tree->tree_nextsize; i++ ) { send_reqs[i] = smpi_mpi_isend(tmpbuf, sendcount, datatype, tree->tree_next[i], COLL_TAG_BCAST, comm); } smpi_mpi_waitall( tree->tree_nextsize, send_reqs, MPI_STATUSES_IGNORE ); } /* Leaf nodes */ else { /* Receive all segments from parent in a loop: 1) post irecv for the first segment 2) for segments 1 .. num_segments - post irecv for the next segment - wait on the previous segment to arrive 3) wait for the last segment */ req_index = 0; recv_reqs[req_index] = smpi_mpi_irecv(tmpbuf, count_by_segment, datatype, tree->tree_prev, COLL_TAG_BCAST, comm); for( segindex = 1; segindex < num_segments; segindex++ ) { req_index = req_index ^ 0x1; tmpbuf += realsegsize; /* post receive for the next segment */ recv_reqs[req_index] = smpi_mpi_irecv(tmpbuf, count_by_segment, datatype, tree->tree_prev, COLL_TAG_BCAST, comm); /* wait on the previous segment */ smpi_mpi_wait( &recv_reqs[req_index ^ 0x1], MPI_STATUS_IGNORE ); } smpi_mpi_wait( &recv_reqs[req_index], MPI_STATUS_IGNORE ); } if( NULL != send_reqs ) free(send_reqs); return (MPI_SUCCESS); }
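/* Illustrative sketch (not part of SMPI): the two-slot "ping-pong" request
 * scheme used by the pipelined bcast above.  While segment s is being waited
 * on and forwarded, the receive for segment s + 1 is already posted in the
 * other slot; req_index ^ 0x1 always names the slot holding the previous
 * segment.  The segment count of 5 is just an example value. */
#include <stdio.h>

int main(void)
{
  const int num_segments = 5;
  int req_index = 0;
  printf("post recv for segment 0 in slot %d\n", req_index);
  for (int segindex = 1; segindex < num_segments; segindex++) {
    req_index ^= 0x1;
    printf("post recv for segment %d in slot %d\n", segindex, req_index);
    printf("wait on + forward segment %d from slot %d\n", segindex - 1, req_index ^ 0x1);
  }
  printf("wait on + forward segment %d from slot %d\n", num_segments - 1, req_index);
  return 0;
}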
/* * reduce_scatter_ompi_basic_recursivehalving * * Function: - reduce scatter implementation using recursive-halving * algorithm * Accepts: - same as MPI_Reduce_scatter() * Returns: - MPI_SUCCESS or error code * Limitation: - Works only for commutative operations. */ int smpi_coll_tuned_reduce_scatter_ompi_basic_recursivehalving(void *sbuf, void *rbuf, int *rcounts, MPI_Datatype dtype, MPI_Op op, MPI_Comm comm ) { int i, rank, size, count, err = MPI_SUCCESS; int tmp_size=1, remain = 0, tmp_rank, *disps = NULL; ptrdiff_t true_lb, true_extent, lb, extent, buf_size; char *recv_buf = NULL, *recv_buf_free = NULL; char *result_buf = NULL, *result_buf_free = NULL; /* Initialize */ rank = smpi_comm_rank(comm); size = smpi_comm_size(comm); XBT_DEBUG("coll:tuned:reduce_scatter_ompi_basic_recursivehalving, rank %d", rank); /* Find displacements and the like */ disps = (int*) xbt_malloc(sizeof(int) * size); if (NULL == disps) return MPI_ERR_OTHER; disps[0] = 0; for (i = 0; i < (size - 1); ++i) { disps[i + 1] = disps[i] + rcounts[i]; } count = disps[size - 1] + rcounts[size - 1]; /* short cut the trivial case */ if (0 == count) { xbt_free(disps); return MPI_SUCCESS; } /* get datatype information */ smpi_datatype_extent(dtype, &lb, &extent); smpi_datatype_extent(dtype, &true_lb, &true_extent); buf_size = true_extent + (ptrdiff_t)(count - 1) * extent; /* Handle MPI_IN_PLACE */ if (MPI_IN_PLACE == sbuf) { sbuf = rbuf; } /* Allocate temporary receive buffer. */ recv_buf_free = (char*) xbt_malloc(buf_size); recv_buf = recv_buf_free - lb; if (NULL == recv_buf_free) { err = MPI_ERR_OTHER; goto cleanup; } /* allocate temporary buffer for results */ result_buf_free = (char*) xbt_malloc(buf_size); result_buf = result_buf_free - lb; /* copy local buffer into the temporary results */ err =smpi_datatype_copy(sbuf, count, dtype, result_buf, count, dtype); if (MPI_SUCCESS != err) goto cleanup; /* figure out power of two mapping: grow until larger than comm size, then go back one, to get the largest power of two less than comm size */ while (tmp_size <= size) tmp_size <<= 1; tmp_size >>= 1; remain = size - tmp_size; /* If comm size is not a power of two, have the first "remain" procs with an even rank send to rank + 1, leaving a power of two procs to do the rest of the algorithm */ if (rank < 2 * remain) { if ((rank & 1) == 0) { smpi_mpi_send(result_buf, count, dtype, rank + 1, COLL_TAG_REDUCE_SCATTER, comm); /* we don't participate from here on out */ tmp_rank = -1; } else { smpi_mpi_recv(recv_buf, count, dtype, rank - 1, COLL_TAG_REDUCE_SCATTER, comm, MPI_STATUS_IGNORE); /* integrate their results into our temp results */ smpi_op_apply(op, recv_buf, result_buf, &count, &dtype); /* adjust rank to be the bottom "remain" ranks */ tmp_rank = rank / 2; } } else { /* just need to adjust rank to show that the bottom "even remain" ranks dropped out */ tmp_rank = rank - remain; } /* For ranks not kicked out by the above code, perform the recursive halving */ if (tmp_rank >= 0) { int *tmp_disps = NULL, *tmp_rcounts = NULL; int mask, send_index, recv_index, last_index; /* recalculate disps and rcounts to account for the special "remainder" processes that are no longer doing anything */ tmp_rcounts = (int*) xbt_malloc(tmp_size * sizeof(int)); if (NULL == tmp_rcounts) { err = MPI_ERR_OTHER; goto cleanup; } tmp_disps = (int*) xbt_malloc(tmp_size * sizeof(int)); if (NULL == tmp_disps) { xbt_free(tmp_rcounts); err = MPI_ERR_OTHER; goto cleanup; } for (i = 0 ; i < tmp_size ; ++i) { if (i < remain) { /* need to include old neighbor 
as well */ tmp_rcounts[i] = rcounts[i * 2 + 1] + rcounts[i * 2]; } else { tmp_rcounts[i] = rcounts[i + remain]; } } tmp_disps[0] = 0; for (i = 0; i < tmp_size - 1; ++i) { tmp_disps[i + 1] = tmp_disps[i] + tmp_rcounts[i]; } /* do the recursive halving communication. Don't use the dimension information on the communicator because I think the information is invalidated by our "shrinking" of the communicator */ mask = tmp_size >> 1; send_index = recv_index = 0; last_index = tmp_size; while (mask > 0) { int tmp_peer, peer, send_count, recv_count; MPI_Request request; tmp_peer = tmp_rank ^ mask; peer = (tmp_peer < remain) ? tmp_peer * 2 + 1 : tmp_peer + remain; /* figure out if we're sending, receiving, or both */ send_count = recv_count = 0; if (tmp_rank < tmp_peer) { send_index = recv_index + mask; for (i = send_index ; i < last_index ; ++i) { send_count += tmp_rcounts[i]; } for (i = recv_index ; i < send_index ; ++i) { recv_count += tmp_rcounts[i]; } } else { recv_index = send_index + mask; for (i = send_index ; i < recv_index ; ++i) { send_count += tmp_rcounts[i]; } for (i = recv_index ; i < last_index ; ++i) { recv_count += tmp_rcounts[i]; } } /* actual data transfer. Send from result_buf, receive into recv_buf */ if (send_count > 0 && recv_count != 0) { request=smpi_mpi_irecv(recv_buf + (ptrdiff_t)tmp_disps[recv_index] * extent, recv_count, dtype, peer, COLL_TAG_REDUCE_SCATTER, comm); if (MPI_SUCCESS != err) { xbt_free(tmp_rcounts); xbt_free(tmp_disps); goto cleanup; } } if (recv_count > 0 && send_count != 0) { smpi_mpi_send(result_buf + (ptrdiff_t)tmp_disps[send_index] * extent, send_count, dtype, peer, COLL_TAG_REDUCE_SCATTER, comm); if (MPI_SUCCESS != err) { xbt_free(tmp_rcounts); xbt_free(tmp_disps); goto cleanup; } } if (send_count > 0 && recv_count != 0) { smpi_mpi_wait(&request, MPI_STATUS_IGNORE); } /* if we received something on this step, push it into the results buffer */ if (recv_count > 0) { smpi_op_apply(op, recv_buf + (ptrdiff_t)tmp_disps[recv_index] * extent, result_buf + (ptrdiff_t)tmp_disps[recv_index] * extent, &recv_count, &dtype); } /* update for next iteration */ send_index = recv_index; last_index = recv_index + mask; mask >>= 1; } /* copy local results from results buffer into real receive buffer */ if (0 != rcounts[rank]) { err = smpi_datatype_copy(result_buf + disps[rank] * extent, rcounts[rank], dtype, rbuf, rcounts[rank], dtype); if (MPI_SUCCESS != err) { xbt_free(tmp_rcounts); xbt_free(tmp_disps); goto cleanup; } } xbt_free(tmp_rcounts); xbt_free(tmp_disps); }
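/* Illustrative sketch (not part of SMPI): the rank folding performed by the
 * recursive-halving reduce_scatter above when the communicator size is not a
 * power of two.  The largest power of two <= size is kept; the first
 * 2 * remain ranks are folded pairwise (even ranks send their data to the odd
 * neighbour and drop out), so exactly tmp_size ranks take part in the halving
 * steps.  The size of 6 is just an example value. */
#include <stdio.h>

int main(void)
{
  const int size = 6;
  int tmp_size = 1;
  while (tmp_size <= size) tmp_size <<= 1;
  tmp_size >>= 1;                           /* largest power of two <= size */
  int remain = size - tmp_size;
  printf("size=%d tmp_size=%d remain=%d\n", size, tmp_size, remain);
  for (int rank = 0; rank < size; rank++) {
    int tmp_rank;
    if (rank < 2 * remain)
      tmp_rank = (rank & 1) ? rank / 2 : -1;  /* even ranks drop out */
    else
      tmp_rank = rank - remain;
    printf("rank %d -> tmp_rank %d\n", rank, tmp_rank);
  }
  return 0;
}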
/* Non-topology-specific pipelined linear-reduce function */ int smpi_coll_tuned_reduce_arrival_pattern_aware(void *buf, void *rbuf, int count, MPI_Datatype datatype, MPI_Op op, int root, MPI_Comm comm) { int rank; rank = smpi_comm_rank(comm); int tag = -COLL_TAG_REDUCE; MPI_Status status; MPI_Request request; MPI_Request *send_request_array; MPI_Request *recv_request_array; MPI_Status *send_status_array; MPI_Status *recv_status_array; MPI_Status temp_status_array[MAX_NODE]; int size; int i; int sent_count; int header_index; int flag_array[MAX_NODE]; int already_received[MAX_NODE]; int header_buf[HEADER_SIZE]; char temp_buf[MAX_NODE]; MPI_Aint extent, lb; smpi_datatype_extent(datatype, &lb, &extent); /* source and destination */ int to, from; size=smpi_comm_size(comm); rank=smpi_comm_rank(comm); /* segment is segment size in number of elements (not bytes) */ int segment = reduce_arrival_pattern_aware_segment_size_in_byte / extent; /* pipeline length */ int pipe_length = count / segment; /* use for buffer offset for sending and receiving data = segment size in byte */ int increment = segment * extent; /* if the input size is not divisible by segment size => the small remainder will be done with native implementation */ int remainder = count % segment; /* value == 0 means root has not send data (or header) to the node yet */ for (i = 0; i < MAX_NODE; i++) { already_received[i] = 0; } char *tmp_buf; tmp_buf = (char *) xbt_malloc(count * extent); smpi_mpi_sendrecv(buf, count, datatype, rank, tag, rbuf, count, datatype, rank, tag, comm, &status); /* when a message is smaller than a block size => no pipeline */ if (count <= segment) { if (rank == 0) { sent_count = 0; while (sent_count < (size - 1)) { for (i = 1; i < size; i++) { if (already_received[i] == 0) { smpi_mpi_iprobe(i, MPI_ANY_TAG, MPI_COMM_WORLD, &flag_array[i], MPI_STATUSES_IGNORE); simcall_process_sleep(0.0001); } } header_index = 0; /* recv 1-byte message */ for (i = 0; i < size; i++) { if (i == rank) continue; /* 1-byte message arrive */ if ((flag_array[i] == 1) && (already_received[i] == 0)) { smpi_mpi_recv(temp_buf, 1, MPI_CHAR, i, tag, MPI_COMM_WORLD, &status); header_buf[header_index] = i; header_index++; sent_count++; //printf("root send to %d recv from %d : data = ",to,from); /* for (i=0;i<=header_index;i++) { printf("%d ",header_buf[i]); } printf("\n"); */ /* will receive in the next step */ already_received[i] = 1; } } /* send header followed by receive and reduce data */ if (header_index != 0) { header_buf[header_index] = -1; to = header_buf[0]; from = header_buf[header_index - 1]; smpi_mpi_send(header_buf, HEADER_SIZE, MPI_INT, to, tag, comm); smpi_mpi_recv(tmp_buf, count, datatype, from, tag, comm, &status); smpi_op_apply(op, tmp_buf, rbuf, &count, &datatype); } } /* while loop */ } /* root */ /* non-root */ else { /* send 1-byte message to root */ smpi_mpi_send(temp_buf, 1, MPI_CHAR, 0, tag, comm); /* wait for header and data, forward when required */ smpi_mpi_recv(header_buf, HEADER_SIZE, MPI_INT, MPI_ANY_SOURCE, tag, comm, &status); // smpi_mpi_recv(buf,count,datatype,MPI_ANY_SOURCE,tag,comm,&status); /* search for where it is */ int myordering = 0; while (rank != header_buf[myordering]) { myordering++; } /* forward header */ if (header_buf[myordering + 1] != -1) { smpi_mpi_send(header_buf, HEADER_SIZE, MPI_INT, header_buf[myordering + 1], tag, comm); } //printf("node %d ordering %d\n",rank,myordering); /* receive, reduce, and forward data */ /* send only */ if (myordering == 0) { if (header_buf[myordering + 1] == 
-1) { to = 0; } else { to = header_buf[myordering + 1]; } smpi_mpi_send(rbuf, count, datatype, to, tag, comm); } /* recv, reduce, send */ else { if (header_buf[myordering + 1] == -1) { to = 0; } else { to = header_buf[myordering + 1]; } from = header_buf[myordering - 1]; smpi_mpi_recv(tmp_buf, count, datatype, header_buf[myordering - 1], tag, comm, &status); smpi_op_apply(op, tmp_buf, rbuf, &count, &datatype); smpi_mpi_send(rbuf, count, datatype, to, tag, comm); } } /* non-root */ } /* pipeline bcast */ else { // printf("node %d start\n",rank); send_request_array = (MPI_Request *) xbt_malloc((size + pipe_length) * sizeof(MPI_Request)); recv_request_array = (MPI_Request *) xbt_malloc((size + pipe_length) * sizeof(MPI_Request)); send_status_array = (MPI_Status *) xbt_malloc((size + pipe_length) * sizeof(MPI_Status)); recv_status_array = (MPI_Status *) xbt_malloc((size + pipe_length) * sizeof(MPI_Status)); if (rank == 0) { sent_count = 0; int will_send[MAX_NODE]; for (i = 0; i < MAX_NODE; i++) will_send[i] = 0; /* loop until all data are received (sent) */ while (sent_count < (size - 1)) { int k; for (k = 0; k < 1; k++) { for (i = 1; i < size; i++) { //if (i == rank) //continue; if ((already_received[i] == 0) && (will_send[i] == 0)) { smpi_mpi_iprobe(i, MPI_ANY_TAG, MPI_COMM_WORLD, &flag_array[i], &temp_status_array[i]); if (flag_array[i] == 1) { will_send[i] = 1; smpi_mpi_recv(&temp_buf[i], 1, MPI_CHAR, i, tag, MPI_COMM_WORLD, &status); //printf("recv from %d\n",i); i = 1; } } } } /* end of probing */ header_index = 0; /* recv 1-byte message */ for (i = 1; i < size; i++) { //if (i==rank) //continue; /* message arrived in this round (put in the header) */ if ((will_send[i] == 1) && (already_received[i] == 0)) { header_buf[header_index] = i; header_index++; sent_count++; /* will send in the next step */ already_received[i] = 1; } } /* send header followed by data */ if (header_index != 0) { header_buf[header_index] = -1; to = header_buf[0]; /* send header */ smpi_mpi_send(header_buf, HEADER_SIZE, MPI_INT, to, tag, comm); /* recv data - pipeline */ from = header_buf[header_index - 1]; for (i = 0; i < pipe_length; i++) { smpi_mpi_recv(tmp_buf + (i * increment), segment, datatype, from, tag, comm, &status); smpi_op_apply(op, tmp_buf + (i * increment), (char *)rbuf + (i * increment), &segment, &datatype); } } } /* while loop (sent_count < size-1 ) */ } /* root */ /* none root */ else { /* send 1-byte message to root */ smpi_mpi_send(temp_buf, 1, MPI_CHAR, 0, tag, comm); /* wait for header forward when required */ request=smpi_mpi_irecv(header_buf, HEADER_SIZE, MPI_INT, MPI_ANY_SOURCE, tag, comm); smpi_mpi_wait(&request, MPI_STATUS_IGNORE); /* search for where it is */ int myordering = 0; while (rank != header_buf[myordering]) { myordering++; } /* send header when required */ if (header_buf[myordering + 1] != -1) { smpi_mpi_send(header_buf, HEADER_SIZE, MPI_INT, header_buf[myordering + 1], tag, comm); } /* (receive, reduce), and send data */ if (header_buf[myordering + 1] == -1) { to = 0; } else { to = header_buf[myordering + 1]; } /* send only */ if (myordering == 0) { for (i = 0; i < pipe_length; i++) { send_request_array[i]= smpi_mpi_isend((char *)rbuf + (i * increment), segment, datatype, to, tag, comm); } smpi_mpi_waitall((pipe_length), send_request_array, send_status_array); } /* receive, reduce, and send */ else { from = header_buf[myordering - 1]; for (i = 0; i < pipe_length; i++) { recv_request_array[i]=smpi_mpi_irecv(tmp_buf + (i * increment), segment, datatype, from, tag, comm); } for (i 
= 0; i < pipe_length; i++) { smpi_mpi_wait(&recv_request_array[i], MPI_STATUS_IGNORE); smpi_op_apply(op, tmp_buf + (i * increment), (char *)rbuf + (i * increment), &segment, &datatype); send_request_array[i]=smpi_mpi_isend((char *)rbuf + (i * increment), segment, datatype, to, tag, comm); } smpi_mpi_waitall((pipe_length), send_request_array, send_status_array); } } /* non-root */ free(send_request_array); free(recv_request_array); free(send_status_array); free(recv_status_array); //printf("node %d done\n",rank); } /* end pipeline */ /* if root is not zero send root after finished this can be modified to make it faster by using logical src, dst. */ if (root != 0) { if (rank == 0) { smpi_mpi_send(rbuf, count, datatype, root, tag, comm); } else if (rank == root) { smpi_mpi_recv(rbuf, count, datatype, 0, tag, comm, &status); } } /* when count is not divisible by block size, use default BCAST for the remainder */ if ((remainder != 0) && (count > segment)) { smpi_mpi_reduce((char *)buf + (pipe_length * increment), (char *)rbuf + (pipe_length * increment), remainder, datatype, op, root, comm); } free(tmp_buf); return MPI_SUCCESS; }
int smpi_coll_tuned_bcast_NTSB(void *buf, int count, MPI_Datatype datatype, int root, MPI_Comm comm) { int tag = COLL_TAG_BCAST; MPI_Status status; int rank, size; int i; MPI_Request *send_request_array; MPI_Request *recv_request_array; MPI_Status *send_status_array; MPI_Status *recv_status_array; MPI_Aint extent; extent = smpi_datatype_get_extent(datatype); rank = smpi_comm_rank(MPI_COMM_WORLD); size = smpi_comm_size(MPI_COMM_WORLD); /* source node and destination nodes (same through out the functions) */ int from = (rank - 1) / 2; int to_left = rank * 2 + 1; int to_right = rank * 2 + 2; if (to_left >= size) to_left = -1; if (to_right >= size) to_right = -1; /* segment is segment size in number of elements (not bytes) */ int segment = bcast_NTSB_segment_size_in_byte / extent; /* pipeline length */ int pipe_length = count / segment; /* use for buffer offset for sending and receiving data = segment size in byte */ int increment = segment * extent; /* if the input size is not divisible by segment size => the small remainder will be done with native implementation */ int remainder = count % segment; /* if root is not zero send to rank zero first */ if (root != 0) { if (rank == root) { smpi_mpi_send(buf, count, datatype, 0, tag, comm); } else if (rank == 0) { smpi_mpi_recv(buf, count, datatype, root, tag, comm, &status); } } /* when a message is smaller than a block size => no pipeline */ if (count <= segment) { /* case: root */ if (rank == 0) { /* case root has only a left child */ if (to_right == -1) { smpi_mpi_send(buf, count, datatype, to_left, tag, comm); } /* case root has both left and right children */ else { smpi_mpi_send(buf, count, datatype, to_left, tag, comm); smpi_mpi_send(buf, count, datatype, to_right, tag, comm); } } /* case: leaf ==> receive only */ else if (to_left == -1) { smpi_mpi_recv(buf, count, datatype, from, tag, comm, &status); } /* case: intermidiate node with only left child ==> relay message */ else if (to_right == -1) { smpi_mpi_recv(buf, count, datatype, from, tag, comm, &status); smpi_mpi_send(buf, count, datatype, to_left, tag, comm); } /* case: intermidiate node with both left and right children ==> relay message */ else { smpi_mpi_recv(buf, count, datatype, from, tag, comm, &status); smpi_mpi_send(buf, count, datatype, to_left, tag, comm); smpi_mpi_send(buf, count, datatype, to_right, tag, comm); } return MPI_SUCCESS; } // pipelining else { send_request_array = (MPI_Request *) xbt_malloc(2 * (size + pipe_length) * sizeof(MPI_Request)); recv_request_array = (MPI_Request *) xbt_malloc((size + pipe_length) * sizeof(MPI_Request)); send_status_array = (MPI_Status *) xbt_malloc(2 * (size + pipe_length) * sizeof(MPI_Status)); recv_status_array = (MPI_Status *) xbt_malloc((size + pipe_length) * sizeof(MPI_Status)); /* case: root */ if (rank == 0) { /* case root has only a left child */ if (to_right == -1) { for (i = 0; i < pipe_length; i++) { send_request_array[i] = smpi_mpi_isend((char *) buf + (i * increment), segment, datatype, to_left, tag + i, comm); } smpi_mpi_waitall((pipe_length), send_request_array, send_status_array); } /* case root has both left and right children */ else { for (i = 0; i < pipe_length; i++) { send_request_array[i] = smpi_mpi_isend((char *) buf + (i * increment), segment, datatype, to_left, tag + i, comm); send_request_array[i + pipe_length] = smpi_mpi_isend((char *) buf + (i * increment), segment, datatype, to_right, tag + i, comm); } smpi_mpi_waitall((2 * pipe_length), send_request_array, send_status_array); } } /* case: leaf ==> receive 
only */ else if (to_left == -1) { for (i = 0; i < pipe_length; i++) { recv_request_array[i] = smpi_mpi_irecv((char *) buf + (i * increment), segment, datatype, from, tag + i, comm); } smpi_mpi_waitall((pipe_length), recv_request_array, recv_status_array); } /* case: intermidiate node with only left child ==> relay message */ else if (to_right == -1) { for (i = 0; i < pipe_length; i++) { recv_request_array[i] = smpi_mpi_irecv((char *) buf + (i * increment), segment, datatype, from, tag + i, comm); } for (i = 0; i < pipe_length; i++) { smpi_mpi_wait(&recv_request_array[i], &status); send_request_array[i] = smpi_mpi_isend((char *) buf + (i * increment), segment, datatype, to_left, tag + i, comm); } smpi_mpi_waitall(pipe_length, send_request_array, send_status_array); } /* case: intermidiate node with both left and right children ==> relay message */ else { for (i = 0; i < pipe_length; i++) { recv_request_array[i] = smpi_mpi_irecv((char *) buf + (i * increment), segment, datatype, from, tag + i, comm); } for (i = 0; i < pipe_length; i++) { smpi_mpi_wait(&recv_request_array[i], &status); send_request_array[i] = smpi_mpi_isend((char *) buf + (i * increment), segment, datatype, to_left, tag + i, comm); send_request_array[i + pipe_length] = smpi_mpi_isend((char *) buf + (i * increment), segment, datatype, to_right, tag + i, comm); } smpi_mpi_waitall((2 * pipe_length), send_request_array, send_status_array); } free(send_request_array); free(recv_request_array); free(send_status_array); free(recv_status_array); } /* end pipeline */ /* when count is not divisible by block size, use default BCAST for the remainder */ if ((remainder != 0) && (count > segment)) { XBT_WARN("MPI_bcast_NTSB use default MPI_bcast."); smpi_mpi_bcast((char *) buf + (pipe_length * increment), remainder, datatype, root, comm); } return MPI_SUCCESS; }
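/* Illustrative sketch (not part of SMPI): the implicit binary tree used by
 * the NTSB bcast above.  The parent of rank r is (r - 1) / 2 and its children
 * are 2r + 1 and 2r + 2; a child index >= size means "no child" and becomes
 * -1, exactly as in the code.  The size of 7 is just an example value. */
#include <stdio.h>

int main(void)
{
  const int size = 7;
  for (int rank = 0; rank < size; rank++) {
    int from     = (rank - 1) / 2;
    int to_left  = rank * 2 + 1;
    int to_right = rank * 2 + 2;
    if (to_left  >= size) to_left  = -1;
    if (to_right >= size) to_right = -1;
    printf("rank %d: parent=%d left=%d right=%d\n",
           rank, rank == 0 ? -1 : from, to_left, to_right);
  }
  return 0;
}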
int smpi_coll_tuned_allgather_SMP_NTS(void *sbuf, int scount, MPI_Datatype stype, void *rbuf, int rcount, MPI_Datatype rtype, MPI_Comm comm) { int src, dst, comm_size, rank; comm_size = smpi_comm_size(comm); rank = smpi_comm_rank(comm); MPI_Aint rextent, sextent; rextent = smpi_datatype_get_extent(rtype); sextent = smpi_datatype_get_extent(stype); int tag = COLL_TAG_ALLGATHER; MPI_Request request; MPI_Request rrequest_array[128]; MPI_Status status; int i, send_offset, recv_offset; int intra_rank, inter_rank; intra_rank = rank % NUM_CORE; inter_rank = rank / NUM_CORE; int inter_comm_size = (comm_size + NUM_CORE - 1) / NUM_CORE; int num_core_in_current_smp = NUM_CORE; /* for too small number of processes, use default implementation */ if (comm_size <= NUM_CORE) { XBT_WARN("MPI_allgather_SMP_NTS use default MPI_allgather."); smpi_mpi_allgather(sbuf, scount, stype, rbuf, rcount, rtype, comm); return MPI_SUCCESS; } // the last SMP node may have fewer number of running processes than all others if (inter_rank == (inter_comm_size - 1)) { num_core_in_current_smp = comm_size - (inter_rank * NUM_CORE); } //copy corresponding message from sbuf to rbuf recv_offset = rank * rextent * rcount; smpi_mpi_sendrecv(sbuf, scount, stype, rank, tag, ((char *) rbuf + recv_offset), rcount, rtype, rank, tag, comm, &status); //gather to root of each SMP for (i = 1; i < num_core_in_current_smp; i++) { dst = (inter_rank * NUM_CORE) + (intra_rank + i) % (num_core_in_current_smp); src = (inter_rank * NUM_CORE) + (intra_rank - i + num_core_in_current_smp) % (num_core_in_current_smp); recv_offset = src * rextent * rcount; smpi_mpi_sendrecv(sbuf, scount, stype, dst, tag, ((char *) rbuf + recv_offset), rcount, rtype, src, tag, comm, &status); } // INTER-SMP-ALLGATHER // Every root of each SMP node post INTER-Sendrecv, then do INTRA-Bcast for each receiving message // Use logical ring algorithm // root of each SMP if (intra_rank == 0) { src = ((inter_rank - 1 + inter_comm_size) % inter_comm_size) * NUM_CORE; dst = ((inter_rank + 1) % inter_comm_size) * NUM_CORE; // post all inter Irecv for (i = 0; i < inter_comm_size - 1; i++) { recv_offset = ((inter_rank - i - 1 + inter_comm_size) % inter_comm_size) * NUM_CORE * sextent * scount; rrequest_array[i] = smpi_mpi_irecv((char *)rbuf+recv_offset, rcount * NUM_CORE, rtype, src, tag+i, comm); } // send first message send_offset = ((inter_rank + inter_comm_size) % inter_comm_size) * NUM_CORE * sextent * scount; smpi_mpi_isend((char *) rbuf + send_offset, scount * NUM_CORE, stype, dst, tag, comm); // loop : recv-inter , send-inter, send-intra (linear-bcast) for (i = 0; i < inter_comm_size - 2; i++) { recv_offset = ((inter_rank - i - 1 + inter_comm_size) % inter_comm_size) * NUM_CORE * sextent * scount; smpi_mpi_wait(&rrequest_array[i], &status); smpi_mpi_isend((char *) rbuf + recv_offset, scount * NUM_CORE, stype, dst, tag + i + 1, comm); if (num_core_in_current_smp > 1) { request = smpi_mpi_isend((char *) rbuf + recv_offset, scount * NUM_CORE, stype, (rank + 1), tag + i + 1, comm); } } // recv last message and send_intra recv_offset = ((inter_rank - i - 1 + inter_comm_size) % inter_comm_size) * NUM_CORE * sextent * scount; //recv_offset = ((inter_rank + 1) % inter_comm_size) * NUM_CORE * sextent * scount; //i=inter_comm_size-2; smpi_mpi_wait(&rrequest_array[i], &status); if (num_core_in_current_smp > 1) { request = smpi_mpi_isend((char *) rbuf + recv_offset, scount * NUM_CORE, stype, (rank + 1), tag + i + 1, comm); } } // last rank of each SMP else if (intra_rank == 
(num_core_in_current_smp - 1)) { for (i = 0; i < inter_comm_size - 1; i++) { recv_offset = ((inter_rank - i - 1 + inter_comm_size) % inter_comm_size) * NUM_CORE * sextent * scount; request = smpi_mpi_irecv((char *) rbuf + recv_offset, (rcount * NUM_CORE), rtype, rank - 1, tag + i + 1, comm); smpi_mpi_wait(&request, &status); } } // intermediate rank of each SMP else { for (i = 0; i < inter_comm_size - 1; i++) { recv_offset = ((inter_rank - i - 1 + inter_comm_size) % inter_comm_size) * NUM_CORE * sextent * scount; request = smpi_mpi_irecv((char *) rbuf + recv_offset, (rcount * NUM_CORE), rtype, rank - 1, tag + i + 1, comm); smpi_mpi_wait(&request, &status); request = smpi_mpi_isend((char *) rbuf + recv_offset, (scount * NUM_CORE), stype, (rank + 1), tag + i + 1, comm); } } return MPI_SUCCESS; }
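/* Illustrative sketch (not part of SMPI): how the SMP-aware allgather above
 * decomposes a flat rank into an intra-node coordinate (core) and an
 * inter-node coordinate (node), and how it detects a partially filled last
 * node.  NUM_CORE (processes per SMP node) is set to 4 here purely for the
 * example; the communicator size of 10 is also an example value. */
#include <stdio.h>

#define NUM_CORE 4   /* assumed cores per SMP node for this example */

int main(void)
{
  const int comm_size = 10;
  int inter_comm_size = (comm_size + NUM_CORE - 1) / NUM_CORE;
  for (int rank = 0; rank < comm_size; rank++) {
    int intra_rank = rank % NUM_CORE;   /* position inside the node */
    int inter_rank = rank / NUM_CORE;   /* which node */
    int num_core_in_current_smp = NUM_CORE;
    if (inter_rank == inter_comm_size - 1)      /* last node may be partial */
      num_core_in_current_smp = comm_size - inter_rank * NUM_CORE;
    printf("rank %d: node %d, core %d, cores on this node %d\n",
           rank, inter_rank, intra_rank, num_core_in_current_smp);
  }
  return 0;
}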
/**
 * This is a generic implementation of the reduce protocol. It uses the tree
 * provided as an argument and executes all operations using a segment of
 * count times a datatype.
 * For the last communication it will update the count in order to limit
 * the number of datatype elements to the original count (original_count).
 *
 * Note that for non-commutative operations we cannot save the memory copy
 * for the first block: thus we must copy sendbuf to accumbuf on intermediate
 * nodes to keep the optimized loop happy.
 */
int smpi_coll_tuned_ompi_reduce_generic( void* sendbuf, void* recvbuf, int original_count,
                                    MPI_Datatype datatype, MPI_Op op, int root,
                                    MPI_Comm comm,
                                    ompi_coll_tree_t* tree, int count_by_segment,
                                    int max_outstanding_reqs )
{
    char *inbuf[2] = {NULL, NULL}, *inbuf_free[2] = {NULL, NULL};
    char *accumbuf = NULL, *accumbuf_free = NULL;
    char *local_op_buffer = NULL, *sendtmpbuf = NULL;
    ptrdiff_t extent, lower_bound, segment_increment;
    MPI_Request reqs[2] = {MPI_REQUEST_NULL, MPI_REQUEST_NULL};
    int num_segments, line, ret, segindex, i, rank;
    int recvcount, prevcount, inbi;

    /**
     * Determine number of segments and number of elements
     * sent per operation
     */
    smpi_datatype_extent( datatype, &lower_bound, &extent);
    num_segments = (original_count + count_by_segment - 1) / count_by_segment;
    segment_increment = count_by_segment * extent;

    sendtmpbuf = (char*) sendbuf;
    if( sendbuf == MPI_IN_PLACE ) {
        sendtmpbuf = (char *)recvbuf;
    }

    XBT_DEBUG( "coll:tuned:reduce_generic count %d, msg size %ld, segsize %ld, max_requests %d",
               original_count, (unsigned long)(num_segments * segment_increment),
               (unsigned long)segment_increment, max_outstanding_reqs);

    rank = smpi_comm_rank(comm);

    /* non-leaf nodes - wait for children to send me data & forward up (if needed) */
    if( tree->tree_nextsize > 0 ) {
        ptrdiff_t true_extent, real_segment_size;
        true_extent=smpi_datatype_get_extent( datatype);

        /* handle a non-existent recv buffer (i.e. it is NULL) and
           protect the recv buffer on non-root nodes */
        accumbuf = (char*)recvbuf;
        if( (NULL == accumbuf) || (root != rank) ) {
            /* Allocate temporary accumulator buffer.
*/ accumbuf_free = (char*)malloc(true_extent + (original_count - 1) * extent); if (accumbuf_free == NULL) { line = __LINE__; ret = -1; goto error_hndl; } accumbuf = accumbuf_free - lower_bound; } /* If this is a non-commutative operation we must copy sendbuf to the accumbuf, in order to simplfy the loops */ if (!smpi_op_is_commute(op)) { smpi_datatype_copy( (char*)sendtmpbuf, original_count, datatype, (char*)accumbuf, original_count, datatype); } /* Allocate two buffers for incoming segments */ real_segment_size = true_extent + (count_by_segment - 1) * extent; inbuf_free[0] = (char*) malloc(real_segment_size); if( inbuf_free[0] == NULL ) { line = __LINE__; ret = -1; goto error_hndl; } inbuf[0] = inbuf_free[0] - lower_bound; /* if there is chance to overlap communication - allocate second buffer */ if( (num_segments > 1) || (tree->tree_nextsize > 1) ) { inbuf_free[1] = (char*) malloc(real_segment_size); if( inbuf_free[1] == NULL ) { line = __LINE__; ret = -1; goto error_hndl; } inbuf[1] = inbuf_free[1] - lower_bound; } /* reset input buffer index and receive count */ inbi = 0; recvcount = 0; /* for each segment */ for( segindex = 0; segindex <= num_segments; segindex++ ) { prevcount = recvcount; /* recvcount - number of elements in current segment */ recvcount = count_by_segment; if( segindex == (num_segments-1) ) recvcount = original_count - count_by_segment * segindex; /* for each child */ for( i = 0; i < tree->tree_nextsize; i++ ) { /** * We try to overlap communication: * either with next segment or with the next child */ /* post irecv for current segindex on current child */ if( segindex < num_segments ) { void* local_recvbuf = inbuf[inbi]; if( 0 == i ) { /* for the first step (1st child per segment) and * commutative operations we might be able to irecv * directly into the accumulate buffer so that we can * reduce(op) this with our sendbuf in one step as * ompi_op_reduce only has two buffer pointers, * this avoids an extra memory copy. * * BUT if the operation is non-commutative or * we are root and are USING MPI_IN_PLACE this is wrong! */ if( (smpi_op_is_commute(op)) && !((MPI_IN_PLACE == sendbuf) && (rank == tree->tree_root)) ) { local_recvbuf = accumbuf + segindex * segment_increment; } } reqs[inbi]=smpi_mpi_irecv(local_recvbuf, recvcount, datatype, tree->tree_next[i], COLL_TAG_REDUCE, comm ); } /* wait for previous req to complete, if any. if there are no requests reqs[inbi ^1] will be MPI_REQUEST_NULL. 
*/ /* wait on data from last child for previous segment */ smpi_mpi_waitall( 1, &reqs[inbi ^ 1], MPI_STATUSES_IGNORE ); local_op_buffer = inbuf[inbi ^ 1]; if( i > 0 ) { /* our first operation is to combine our own [sendbuf] data * with the data we recvd from down stream (but only * the operation is commutative and if we are not root and * not using MPI_IN_PLACE) */ if( 1 == i ) { if( (smpi_op_is_commute(op)) && !((MPI_IN_PLACE == sendbuf) && (rank == tree->tree_root)) ) { local_op_buffer = sendtmpbuf + segindex * segment_increment; } } /* apply operation */ smpi_op_apply(op, local_op_buffer, accumbuf + segindex * segment_increment, &recvcount, &datatype ); } else if ( segindex > 0 ) { void* accumulator = accumbuf + (segindex-1) * segment_increment; if( tree->tree_nextsize <= 1 ) { if( (smpi_op_is_commute(op)) && !((MPI_IN_PLACE == sendbuf) && (rank == tree->tree_root)) ) { local_op_buffer = sendtmpbuf + (segindex-1) * segment_increment; } } smpi_op_apply(op, local_op_buffer, accumulator, &prevcount, &datatype ); /* all reduced on available data this step (i) complete, * pass to the next process unless you are the root. */ if (rank != tree->tree_root) { /* send combined/accumulated data to parent */ smpi_mpi_send( accumulator, prevcount, datatype, tree->tree_prev, COLL_TAG_REDUCE, comm); } /* we stop when segindex = number of segments (i.e. we do num_segment+1 steps for pipelining */ if (segindex == num_segments) break; } /* update input buffer index */ inbi = inbi ^ 1; } /* end of for each child */ } /* end of for each segment */ /* clean up */ if( inbuf_free[0] != NULL) free(inbuf_free[0]); if( inbuf_free[1] != NULL) free(inbuf_free[1]); if( accumbuf_free != NULL ) free(accumbuf_free); } /* leaf nodes Depending on the value of max_outstanding_reqs and the number of segments we have two options: - send all segments using blocking send to the parent, or - avoid overflooding the parent nodes by limiting the number of outstanding requests to max_oustanding_reqs. TODO/POSSIBLE IMPROVEMENT: If there is a way to determine the eager size for the current communication, synchronization should be used only when the message/segment size is smaller than the eager size. */ else { /* If the number of segments is less than a maximum number of oustanding requests or there is no limit on the maximum number of outstanding requests, we send data to the parent using blocking send */ if ((0 == max_outstanding_reqs) || (num_segments <= max_outstanding_reqs)) { segindex = 0; while ( original_count > 0) { if (original_count < count_by_segment) { count_by_segment = original_count; } smpi_mpi_send((char*)sendbuf + segindex * segment_increment, count_by_segment, datatype, tree->tree_prev, COLL_TAG_REDUCE, comm) ; segindex++; original_count -= count_by_segment; } } /* Otherwise, introduce flow control: - post max_outstanding_reqs non-blocking synchronous send, - for remaining segments - wait for a ssend to complete, and post the next one. - wait for all outstanding sends to complete. 
*/
        else {
            int creq = 0;
            MPI_Request* sreq = NULL;

            sreq = (MPI_Request*) calloc( max_outstanding_reqs,
                                          sizeof(MPI_Request ) );
            if (NULL == sreq) { line = __LINE__; ret = -1; goto error_hndl; }

            /* post first group of requests */
            for (segindex = 0; segindex < max_outstanding_reqs; segindex++) {
                sreq[segindex]=smpi_mpi_isend((char*)sendbuf +
                                              segindex * segment_increment,
                                              count_by_segment, datatype,
                                              tree->tree_prev,
                                              COLL_TAG_REDUCE,
                                              comm);
                original_count -= count_by_segment;
            }

            creq = 0;
            while ( original_count > 0 ) {
                /* wait on a posted request to complete */
                smpi_mpi_wait(&sreq[creq], MPI_STATUS_IGNORE);
                sreq[creq] = MPI_REQUEST_NULL;

                if( original_count < count_by_segment ) {
                    count_by_segment = original_count;
                }
                sreq[creq]=smpi_mpi_isend((char*)sendbuf +
                                          segindex * segment_increment,
                                          count_by_segment, datatype,
                                          tree->tree_prev,
                                          COLL_TAG_REDUCE,
                                          comm );
                creq = (creq + 1) % max_outstanding_reqs;
                segindex++;
                original_count -= count_by_segment;
            }

            /* Wait on the remaining requests to complete */
            smpi_mpi_waitall( max_outstanding_reqs, sreq,
                              MPI_STATUSES_IGNORE );

            /* free requests */
            free(sreq);
        }
    }
    return MPI_SUCCESS;

 error_hndl:  /* error handler */
    XBT_DEBUG("ERROR_HNDL: node %d file %s line %d error %d\n",
              rank, __FILE__, line, ret );
    if( inbuf_free[0] != NULL ) free(inbuf_free[0]);
    if( inbuf_free[1] != NULL ) free(inbuf_free[1]);
    if( accumbuf_free != NULL ) free(accumbuf_free);
    return ret;
}
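/* Illustrative sketch (not part of SMPI): the bounded-outstanding-sends scheme
 * of the leaf branch above.  At most max_outstanding_reqs sends are in flight;
 * slot creq cycles through the request array and each new send reuses the slot
 * that was just waited on.  The segment and request counts are example values. */
#include <stdio.h>

int main(void)
{
  const int num_segments = 7;
  const int max_outstanding_reqs = 3;
  int segindex, creq = 0;

  /* post the first group of sends */
  for (segindex = 0; segindex < max_outstanding_reqs && segindex < num_segments; segindex++)
    printf("post send for segment %d in slot %d\n", segindex, segindex);

  /* steady state: wait on the oldest slot, then reuse it */
  for (; segindex < num_segments; segindex++) {
    printf("wait on slot %d, then post send for segment %d in it\n", creq, segindex);
    creq = (creq + 1) % max_outstanding_reqs;
  }
  printf("wait on all %d remaining slots\n", max_outstanding_reqs);
  return 0;
}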
int smpi_coll_tuned_bcast_SMP_binary(void *buf, int count, MPI_Datatype datatype, int root, MPI_Comm comm) { int tag = COLL_TAG_BCAST; MPI_Status status; MPI_Request request; MPI_Request *request_array; MPI_Status *status_array; int rank, size; int i; MPI_Aint extent; extent = smpi_datatype_get_extent(datatype); rank = smpi_comm_rank(comm); size = smpi_comm_size(comm); if(smpi_comm_get_leaders_comm(comm)==MPI_COMM_NULL){ smpi_comm_init_smp(comm); } int host_num_core=1; if (smpi_comm_is_uniform(comm)){ host_num_core = smpi_comm_size(smpi_comm_get_intra_comm(comm)); }else{ //implementation buggy in this case return smpi_coll_tuned_bcast_mpich( buf , count, datatype, root, comm); } int segment = bcast_SMP_binary_segment_byte / extent; int pipe_length = count / segment; int remainder = count % segment; int to_intra_left = (rank / host_num_core) * host_num_core + (rank % host_num_core) * 2 + 1; int to_intra_right = (rank / host_num_core) * host_num_core + (rank % host_num_core) * 2 + 2; int to_inter_left = ((rank / host_num_core) * 2 + 1) * host_num_core; int to_inter_right = ((rank / host_num_core) * 2 + 2) * host_num_core; int from_inter = (((rank / host_num_core) - 1) / 2) * host_num_core; int from_intra = (rank / host_num_core) * host_num_core + ((rank % host_num_core) - 1) / 2; int increment = segment * extent; int base = (rank / host_num_core) * host_num_core; int num_core = host_num_core; if (((rank / host_num_core) * host_num_core) == ((size / host_num_core) * host_num_core)) num_core = size - (rank / host_num_core) * host_num_core; // if root is not zero send to rank zero first if (root != 0) { if (rank == root) smpi_mpi_send(buf, count, datatype, 0, tag, comm); else if (rank == 0) smpi_mpi_recv(buf, count, datatype, root, tag, comm, &status); } // when a message is smaller than a block size => no pipeline if (count <= segment) { // case ROOT-of-each-SMP if (rank % host_num_core == 0) { // case ROOT if (rank == 0) { //printf("node %d left %d right %d\n",rank,to_inter_left,to_inter_right); if (to_inter_left < size) smpi_mpi_send(buf, count, datatype, to_inter_left, tag, comm); if (to_inter_right < size) smpi_mpi_send(buf, count, datatype, to_inter_right, tag, comm); if ((to_intra_left - base) < num_core) smpi_mpi_send(buf, count, datatype, to_intra_left, tag, comm); if ((to_intra_right - base) < num_core) smpi_mpi_send(buf, count, datatype, to_intra_right, tag, comm); } // case LEAVES ROOT-of-eash-SMP else if (to_inter_left >= size) { //printf("node %d from %d\n",rank,from_inter); request = smpi_mpi_irecv(buf, count, datatype, from_inter, tag, comm); smpi_mpi_wait(&request, &status); if ((to_intra_left - base) < num_core) smpi_mpi_send(buf, count, datatype, to_intra_left, tag, comm); if ((to_intra_right - base) < num_core) smpi_mpi_send(buf, count, datatype, to_intra_right, tag, comm); } // case INTERMEDIAT ROOT-of-each-SMP else { //printf("node %d left %d right %d from %d\n",rank,to_inter_left,to_inter_right,from_inter); request = smpi_mpi_irecv(buf, count, datatype, from_inter, tag, comm); smpi_mpi_wait(&request, &status); smpi_mpi_send(buf, count, datatype, to_inter_left, tag, comm); if (to_inter_right < size) smpi_mpi_send(buf, count, datatype, to_inter_right, tag, comm); if ((to_intra_left - base) < num_core) smpi_mpi_send(buf, count, datatype, to_intra_left, tag, comm); if ((to_intra_right - base) < num_core) smpi_mpi_send(buf, count, datatype, to_intra_right, tag, comm); } } // case non ROOT-of-each-SMP else { // case leaves if ((to_intra_left - base) >= num_core) { request = 
smpi_mpi_irecv(buf, count, datatype, from_intra, tag, comm); smpi_mpi_wait(&request, &status); } // case intermediate else { request = smpi_mpi_irecv(buf, count, datatype, from_intra, tag, comm); smpi_mpi_wait(&request, &status); smpi_mpi_send(buf, count, datatype, to_intra_left, tag, comm); if ((to_intra_right - base) < num_core) smpi_mpi_send(buf, count, datatype, to_intra_right, tag, comm); } } return MPI_SUCCESS; } // pipeline bcast else { request_array = (MPI_Request *) xbt_malloc((size + pipe_length) * sizeof(MPI_Request)); status_array = (MPI_Status *) xbt_malloc((size + pipe_length) * sizeof(MPI_Status)); // case ROOT-of-each-SMP if (rank % host_num_core == 0) { // case ROOT if (rank == 0) { for (i = 0; i < pipe_length; i++) { //printf("node %d left %d right %d\n",rank,to_inter_left,to_inter_right); if (to_inter_left < size) smpi_mpi_send((char *) buf + (i * increment), segment, datatype, to_inter_left, (tag + i), comm); if (to_inter_right < size) smpi_mpi_send((char *) buf + (i * increment), segment, datatype, to_inter_right, (tag + i), comm); if ((to_intra_left - base) < num_core) smpi_mpi_send((char *) buf + (i * increment), segment, datatype, to_intra_left, (tag + i), comm); if ((to_intra_right - base) < num_core) smpi_mpi_send((char *) buf + (i * increment), segment, datatype, to_intra_right, (tag + i), comm); } } // case LEAVES ROOT-of-eash-SMP else if (to_inter_left >= size) { //printf("node %d from %d\n",rank,from_inter); for (i = 0; i < pipe_length; i++) { request_array[i] = smpi_mpi_irecv((char *) buf + (i * increment), segment, datatype, from_inter, (tag + i), comm); } for (i = 0; i < pipe_length; i++) { smpi_mpi_wait(&request_array[i], &status); if ((to_intra_left - base) < num_core) smpi_mpi_send((char *) buf + (i * increment), segment, datatype, to_intra_left, (tag + i), comm); if ((to_intra_right - base) < num_core) smpi_mpi_send((char *) buf + (i * increment), segment, datatype, to_intra_right, (tag + i), comm); } } // case INTERMEDIAT ROOT-of-each-SMP else { //printf("node %d left %d right %d from %d\n",rank,to_inter_left,to_inter_right,from_inter); for (i = 0; i < pipe_length; i++) { request_array[i] = smpi_mpi_irecv((char *) buf + (i * increment), segment, datatype, from_inter, (tag + i), comm); } for (i = 0; i < pipe_length; i++) { smpi_mpi_wait(&request_array[i], &status); smpi_mpi_send((char *) buf + (i * increment), segment, datatype, to_inter_left, (tag + i), comm); if (to_inter_right < size) smpi_mpi_send((char *) buf + (i * increment), segment, datatype, to_inter_right, (tag + i), comm); if ((to_intra_left - base) < num_core) smpi_mpi_send((char *) buf + (i * increment), segment, datatype, to_intra_left, (tag + i), comm); if ((to_intra_right - base) < num_core) smpi_mpi_send((char *) buf + (i * increment), segment, datatype, to_intra_right, (tag + i), comm); } } } // case non-ROOT-of-each-SMP else { // case leaves if ((to_intra_left - base) >= num_core) { for (i = 0; i < pipe_length; i++) { request_array[i] = smpi_mpi_irecv((char *) buf + (i * increment), segment, datatype, from_intra, (tag + i), comm); } smpi_mpi_waitall((pipe_length), request_array, status_array); } // case intermediate else { for (i = 0; i < pipe_length; i++) { request_array[i] = smpi_mpi_irecv((char *) buf + (i * increment), segment, datatype, from_intra, (tag + i), comm); } for (i = 0; i < pipe_length; i++) { smpi_mpi_wait(&request_array[i], &status); smpi_mpi_send((char *) buf + (i * increment), segment, datatype, to_intra_left, (tag + i), comm); if ((to_intra_right - base) < 
num_core)
            smpi_mpi_send((char *) buf + (i * increment), segment, datatype,
                          to_intra_right, (tag + i), comm);
        }
      }
    }
    free(request_array);
    free(status_array);
  }

  // when count is not divisible by block size, use default BCAST for the remainder
  if ((remainder != 0) && (count > segment)) {
    XBT_WARN("MPI_bcast_SMP_binary use default MPI_bcast.");
    smpi_mpi_bcast((char *) buf + (pipe_length * increment), remainder, datatype,
                   root, comm);
  }

  return MPI_SUCCESS;
}
int smpi_coll_tuned_allreduce_ompi_ring_segmented(void *sbuf, void *rbuf, int count, MPI_Datatype dtype, MPI_Op op, MPI_Comm comm) { int ret = MPI_SUCCESS; int line; int k, recv_from, send_to; int early_blockcount, late_blockcount, split_rank; int segcount, max_segcount; int num_phases, phase; int block_count; unsigned int inbi; size_t typelng; char *tmpsend = NULL, *tmprecv = NULL; char *inbuf[2] = {NULL, NULL}; ptrdiff_t true_extent, extent; ptrdiff_t block_offset, max_real_segsize; MPI_Request reqs[2] = {NULL, NULL}; const size_t segsize = 1 << 20; /* 1 MB */ unsigned int size = smpi_comm_size(comm); unsigned int rank = smpi_comm_rank(comm); XBT_DEBUG("coll:tuned:allreduce_intra_ring_segmented rank %d, count %d", rank, count); /* Special case for size == 1 */ if (1 == size) { if (MPI_IN_PLACE != sbuf) { ret= smpi_datatype_copy(sbuf, count, dtype,rbuf, count, dtype); if (ret < 0) { line = __LINE__; goto error_hndl; } } return MPI_SUCCESS; } /* Determine segment count based on the suggested segment size */ extent = smpi_datatype_get_extent(dtype); if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; } true_extent = smpi_datatype_get_extent(dtype); if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; } typelng = smpi_datatype_size(dtype); if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; } segcount = count; COLL_TUNED_COMPUTED_SEGCOUNT(segsize, typelng, segcount) /* Special case for count less than size * segcount - use regular ring */ if (count < size * segcount) { XBT_DEBUG( "coll:tuned:allreduce_ring_segmented rank %d/%d, count %d, switching to regular ring", rank, size, count); return (smpi_coll_tuned_allreduce_lr(sbuf, rbuf, count, dtype, op, comm)); } /* Determine the number of phases of the algorithm */ num_phases = count / (size * segcount); if ((count % (size * segcount) >= size) && (count % (size * segcount) > ((size * segcount) / 2))) { num_phases++; } /* Determine the number of elements per block and corresponding block sizes. The blocks are divided into "early" and "late" ones: blocks 0 .. (split_rank - 1) are "early" and blocks (split_rank) .. (size - 1) are "late". Early blocks are at most 1 element larger than the late ones. Note, these blocks will be split into num_phases segments, out of the largest one will have max_segcount elements. */ COLL_TUNED_COMPUTE_BLOCKCOUNT( count, size, split_rank, early_blockcount, late_blockcount ) COLL_TUNED_COMPUTE_BLOCKCOUNT( early_blockcount, num_phases, inbi, max_segcount, k) max_real_segsize = true_extent + (max_segcount - 1) * extent; /* Allocate and initialize temporary buffers */ inbuf[0] = (char*)smpi_get_tmp_sendbuffer(max_real_segsize); if (NULL == inbuf[0]) { ret = -1; line = __LINE__; goto error_hndl; } if (size > 2) { inbuf[1] = (char*)smpi_get_tmp_recvbuffer(max_real_segsize); if (NULL == inbuf[1]) { ret = -1; line = __LINE__; goto error_hndl; } } /* Handle MPI_IN_PLACE */ if (MPI_IN_PLACE != sbuf) { ret= smpi_datatype_copy(sbuf, count, dtype,rbuf, count, dtype); if (ret < 0) { line = __LINE__; goto error_hndl; } } /* Computation loop: for each phase, repeat ring allreduce computation loop */ for (phase = 0; phase < num_phases; phase ++) { ptrdiff_t phase_offset; int early_phase_segcount, late_phase_segcount, split_phase, phase_count; /* For each of the remote nodes: - post irecv for block (r-1) - send block (r) To do this, first compute block offset and count, and use block offset to compute phase offset. - in loop for every step k = 2 .. 
n - post irecv for block (r + n - k) % n - wait on block (r + n - k + 1) % n to arrive - compute on block (r + n - k + 1) % n - send block (r + n - k + 1) % n - wait on block (r + 1) - compute on block (r + 1) - send block (r + 1) to rank (r + 1) Note that we must be careful when computing the begining of buffers and for send operations and computation we must compute the exact block size. */ send_to = (rank + 1) % size; recv_from = (rank + size - 1) % size; inbi = 0; /* Initialize first receive from the neighbor on the left */ reqs[inbi] = smpi_mpi_irecv(inbuf[inbi], max_segcount, dtype, recv_from, 666, comm); /* Send first block (my block) to the neighbor on the right: - compute my block and phase offset - send data */ block_offset = ((rank < split_rank)? (rank * early_blockcount) : (rank * late_blockcount + split_rank)); block_count = ((rank < split_rank)? early_blockcount : late_blockcount); COLL_TUNED_COMPUTE_BLOCKCOUNT(block_count, num_phases, split_phase, early_phase_segcount, late_phase_segcount) phase_count = ((phase < split_phase)? (early_phase_segcount) : (late_phase_segcount)); phase_offset = ((phase < split_phase)? (phase * early_phase_segcount) : (phase * late_phase_segcount + split_phase)); tmpsend = ((char*)rbuf) + (block_offset + phase_offset) * extent; smpi_mpi_send(tmpsend, phase_count, dtype, send_to, 666, comm); for (k = 2; k < size; k++) { const int prevblock = (rank + size - k + 1) % size; inbi = inbi ^ 0x1; /* Post irecv for the current block */ reqs[inbi] = smpi_mpi_irecv(inbuf[inbi], max_segcount, dtype, recv_from, 666, comm); if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; } /* Wait on previous block to arrive */ smpi_mpi_wait(&reqs[inbi ^ 0x1], MPI_STATUS_IGNORE); /* Apply operation on previous block: result goes to rbuf rbuf[prevblock] = inbuf[inbi ^ 0x1] (op) rbuf[prevblock] */ block_offset = ((prevblock < split_rank)? (prevblock * early_blockcount) : (prevblock * late_blockcount + split_rank)); block_count = ((prevblock < split_rank)? early_blockcount : late_blockcount); COLL_TUNED_COMPUTE_BLOCKCOUNT(block_count, num_phases, split_phase, early_phase_segcount, late_phase_segcount) phase_count = ((phase < split_phase)? (early_phase_segcount) : (late_phase_segcount)); phase_offset = ((phase < split_phase)? (phase * early_phase_segcount) : (phase * late_phase_segcount + split_phase)); tmprecv = ((char*)rbuf) + (block_offset + phase_offset) * extent; smpi_op_apply(op, inbuf[inbi ^ 0x1], tmprecv, &phase_count, &dtype); /* send previous block to send_to */ smpi_mpi_send(tmprecv, phase_count, dtype, send_to, 666, comm); } /* Wait on the last block to arrive */ smpi_mpi_wait(&reqs[inbi], MPI_STATUS_IGNORE); /* Apply operation on the last block (from neighbor (rank + 1) rbuf[rank+1] = inbuf[inbi] (op) rbuf[rank + 1] */ recv_from = (rank + 1) % size; block_offset = ((recv_from < split_rank)? (recv_from * early_blockcount) : (recv_from * late_blockcount + split_rank)); block_count = ((recv_from < split_rank)? early_blockcount : late_blockcount); COLL_TUNED_COMPUTE_BLOCKCOUNT(block_count, num_phases, split_phase, early_phase_segcount, late_phase_segcount) phase_count = ((phase < split_phase)? (early_phase_segcount) : (late_phase_segcount)); phase_offset = ((phase < split_phase)? 
(phase * early_phase_segcount) : (phase * late_phase_segcount + split_phase)); tmprecv = ((char*)rbuf) + (block_offset + phase_offset) * extent; smpi_op_apply(op, inbuf[inbi], tmprecv, &phase_count, &dtype); } /* Distribution loop - variation of ring allgather */ send_to = (rank + 1) % size; recv_from = (rank + size - 1) % size; for (k = 0; k < size - 1; k++) { const int recv_data_from = (rank + size - k) % size; const int send_data_from = (rank + 1 + size - k) % size; const int send_block_offset = ((send_data_from < split_rank)? (send_data_from * early_blockcount) : (send_data_from * late_blockcount + split_rank)); const int recv_block_offset = ((recv_data_from < split_rank)? (recv_data_from * early_blockcount) : (recv_data_from * late_blockcount + split_rank)); block_count = ((send_data_from < split_rank)? early_blockcount : late_blockcount); tmprecv = (char*)rbuf + recv_block_offset * extent; tmpsend = (char*)rbuf + send_block_offset * extent; smpi_mpi_sendrecv(tmpsend, block_count, dtype, send_to, 666, tmprecv, early_blockcount, dtype, recv_from, 666, comm, MPI_STATUS_IGNORE); } if (NULL != inbuf[0]) smpi_free_tmp_buffer(inbuf[0]); if (NULL != inbuf[1]) smpi_free_tmp_buffer(inbuf[1]); return MPI_SUCCESS; error_hndl: XBT_DEBUG("%s:%4d\tRank %d Error occurred %d\n", __FILE__, line, rank, ret); if (NULL != inbuf[0]) smpi_free_tmp_buffer(inbuf[0]); if (NULL != inbuf[1]) smpi_free_tmp_buffer(inbuf[1]); return ret; }
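/* A standalone sketch (plain C, not SMPI code) of the early/late block split used by the
   ring-segmented allreduce above. The helper below is an assumption about what
   COLL_TUNED_COMPUTE_BLOCKCOUNT is expected to produce -- the macro itself is defined
   elsewhere -- but it is consistent with how split_rank, early_blockcount and
   late_blockcount are consumed in the block_offset expressions above: the first
   split_rank ("early") blocks carry one extra element. */
#include <stdio.h>

static void compute_blockcount(int count, int num_blocks,
                               int *split_index, int *early, int *late)
{
  *early = *late = count / num_blocks; /* base block size */
  *split_index = count % num_blocks;   /* number of blocks that get one extra element */
  if (*split_index != 0)
    *early = *late + 1;
}

int main(void)
{
  const int count = 10; /* illustrative element count */
  const int size  = 4;  /* illustrative communicator size */
  int split_rank, early_blockcount, late_blockcount;

  compute_blockcount(count, size, &split_rank, &early_blockcount, &late_blockcount);

  for (int rank = 0; rank < size; rank++) {
    /* Same offset/count selection as in the allreduce above. */
    int block_offset = (rank < split_rank) ? rank * early_blockcount
                                           : rank * late_blockcount + split_rank;
    int block_count  = (rank < split_rank) ? early_blockcount : late_blockcount;
    printf("rank %d: block offset=%d count=%d\n", rank, block_offset, block_count);
  }
  /* With count=10 and size=4: split_rank=2, early=3, late=2, giving blocks
     [0,3), [3,6), [6,8), [8,10) -- early blocks are at most one element larger,
     and the offsets tile [0, count) without gaps. */
  return 0;
}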
/* Non-topology-specific pipelined linear-bcast function */ int smpi_coll_tuned_bcast_arrival_pattern_aware(void *buf, int count, MPI_Datatype datatype, int root, MPI_Comm comm) { int tag = -COLL_TAG_BCAST; MPI_Status status; MPI_Request request; MPI_Request *send_request_array; MPI_Request *recv_request_array; MPI_Status *send_status_array; MPI_Status *recv_status_array; MPI_Status temp_status_array[MAX_NODE]; int rank, size; int i, j; int sent_count; int header_index; int flag_array[MAX_NODE]; int already_sent[MAX_NODE]; int to_clean[MAX_NODE]; int header_buf[HEADER_SIZE]; char temp_buf[MAX_NODE]; MPI_Aint extent; extent = smpi_datatype_get_extent(datatype); /* destination */ int to; rank = smpi_comm_rank(comm); size = smpi_comm_size(comm); /* segment is segment size in number of elements (not bytes) */ int segment = bcast_NTSL_segment_size_in_byte / extent; segment = segment == 0 ? 1 :segment; /* pipeline length */ int pipe_length = count / segment; /* use for buffer offset for sending and receiving data = segment size in byte */ int increment = segment * extent; /* if the input size is not divisible by segment size => the small remainder will be done with native implementation */ int remainder = count % segment; /* if root is not zero send to rank zero first this can be modified to make it faster by using logical src, dst. */ if (root != 0) { if (rank == root) { smpi_mpi_send(buf, count, datatype, 0, tag, comm); } else if (rank == 0) { smpi_mpi_recv(buf, count, datatype, root, tag, comm, &status); } } /* value == 0 means root has not send data (or header) to the node yet */ for (i = 0; i < MAX_NODE; i++) { already_sent[i] = 0; to_clean[i] = 0; } /* when a message is smaller than a block size => no pipeline */ if (count <= segment) { if (rank == 0) { sent_count = 0; while (sent_count < (size - 1)) { for (i = 1; i < size; i++) { smpi_mpi_iprobe(i, MPI_ANY_TAG, comm, &flag_array[i], MPI_STATUSES_IGNORE); } header_index = 0; /* recv 1-byte message */ for (i = 1; i < size; i++) { /* message arrive */ if ((flag_array[i] == 1) && (already_sent[i] == 0)) { smpi_mpi_recv(temp_buf, 1, MPI_CHAR, i, tag, comm, &status); header_buf[header_index] = i; header_index++; sent_count++; /* will send in the next step */ already_sent[i] = 1; } } /* send header followed by data */ if (header_index != 0) { header_buf[header_index] = -1; to = header_buf[0]; smpi_mpi_send(header_buf, HEADER_SIZE, MPI_INT, to, tag, comm); smpi_mpi_send(buf, count, datatype, to, tag, comm); } /* randomly MPI_Send to one */ else { /* search for the first node that never received data before */ for (i = 1; i < size; i++) { if (already_sent[i] == 0) { header_buf[0] = i; header_buf[1] = -1; smpi_mpi_send(header_buf, HEADER_SIZE, MPI_INT, i, tag, comm); smpi_mpi_send(buf, count, datatype, i, tag, comm); already_sent[i] = 1; sent_count++; break; } } } } /* while loop */ } /* non-root */ else { /* send 1-byte message to root */ smpi_mpi_send(temp_buf, 1, MPI_CHAR, 0, tag, comm); /* wait for header and data, forward when required */ smpi_mpi_recv(header_buf, HEADER_SIZE, MPI_INT, MPI_ANY_SOURCE, tag, comm, &status); smpi_mpi_recv(buf, count, datatype, MPI_ANY_SOURCE, tag, comm, &status); /* search for where it is */ int myordering = 0; while (rank != header_buf[myordering]) { myordering++; } /* send header followed by data */ if (header_buf[myordering + 1] != -1) { smpi_mpi_send(header_buf, HEADER_SIZE, MPI_INT, header_buf[myordering + 1], tag, comm); smpi_mpi_send(buf, count, datatype, header_buf[myordering + 1], tag, comm); } } } /* 
pipeline bcast */ else { send_request_array = (MPI_Request *) xbt_malloc((size + pipe_length) * sizeof(MPI_Request)); recv_request_array = (MPI_Request *) xbt_malloc((size + pipe_length) * sizeof(MPI_Request)); send_status_array = (MPI_Status *) xbt_malloc((size + pipe_length) * sizeof(MPI_Status)); recv_status_array = (MPI_Status *) xbt_malloc((size + pipe_length) * sizeof(MPI_Status)); if (rank == 0) { //double start2 = MPI_Wtime(); sent_count = 0; //int iteration = 0; while (sent_count < (size - 1)) { //iteration++; //start = MPI_Wtime(); for (i = 1; i < size; i++) { smpi_mpi_iprobe(i, MPI_ANY_TAG, comm, &flag_array[i], &temp_status_array[i]); } //total = MPI_Wtime() - start; //total *= 1000; //printf("Iprobe time = %.2f\n",total); header_index = 0; MPI_Wtime(); /* recv 1-byte message */ for (i = 1; i < size; i++) { /* message arrive */ if ((flag_array[i] == 1) && (already_sent[i] == 0)) { smpi_mpi_recv(&temp_buf[i], 1, MPI_CHAR, i, tag, comm, &status); header_buf[header_index] = i; header_index++; sent_count++; /* will send in the next step */ already_sent[i] = 1; } } //total = MPI_Wtime() - start; //total *= 1000; //printf("Recv 1-byte time = %.2f\n",total); /* if (header_index != 0) { printf("header index = %d node = ",header_index); for (i=0;i<header_index;i++) { printf("%d ",header_buf[i]); } printf("\n"); } */ /* send header followed by data */ if (header_index != 0) { header_buf[header_index] = -1; to = header_buf[0]; //start = MPI_Wtime(); /* send header */ smpi_mpi_send(header_buf, HEADER_SIZE, MPI_INT, to, tag, comm); //total = MPI_Wtime() - start; //total *= 1000; //printf("\tSend header to %d time = %.2f\n",to,total); //start = MPI_Wtime(); /* send data - non-pipeline case */ if (0 == 1) { //if (header_index == 1) { smpi_mpi_send(buf, count, datatype, to, tag, comm); } /* send data - pipeline */ else { for (i = 0; i < pipe_length; i++) { smpi_mpi_send((char *)buf + (i * increment), segment, datatype, to, tag, comm); } //smpi_mpi_waitall((pipe_length), send_request_array, send_status_array); } //total = MPI_Wtime() - start; //total *= 1000; //printf("\tSend data to %d time = %.2f\n",to,total); } /* randomly MPI_Send to one node */ else { /* search for the first node that never received data before */ for (i = 1; i < size; i++) { if (already_sent[i] == 0) { header_buf[0] = i; header_buf[1] = -1; to = i; //start = MPI_Wtime(); smpi_mpi_send(header_buf, HEADER_SIZE, MPI_INT, to, tag, comm); /* still need to chop data so that we can use the same non-root code */ for (j = 0; j < pipe_length; j++) { smpi_mpi_send((char *)buf + (j * increment), segment, datatype, to, tag, comm); } //smpi_mpi_send(buf,count,datatype,to,tag,comm); //smpi_mpi_wait(&request,MPI_STATUS_IGNORE); //total = MPI_Wtime() - start; //total *= 1000; //printf("SEND TO SINGLE node %d time = %.2f\n",i,total); already_sent[i] = 1; to_clean[i]=1; sent_count++; break; } } } } /* while loop */ for(i=0; i<size; i++) if(to_clean[i]!=0)smpi_mpi_recv(&temp_buf[i], 1, MPI_CHAR, i, tag, comm, &status); //total = MPI_Wtime() - start2; //total *= 1000; //printf("Node zero iter = %d time = %.2f\n",iteration,total); } /* rank 0 */ /* none root */ else { /* send 1-byte message to root */ smpi_mpi_send(temp_buf, 1, MPI_CHAR, 0, tag, comm); /* wait for header forward when required */ request = smpi_mpi_irecv(header_buf, HEADER_SIZE, MPI_INT, MPI_ANY_SOURCE, tag, comm); smpi_mpi_wait(&request, MPI_STATUS_IGNORE); /* search for where it is */ int myordering = 0; while (rank != header_buf[myordering]) { myordering++; } /* send header 
when required */ if (header_buf[myordering + 1] != -1) { smpi_mpi_send(header_buf, HEADER_SIZE, MPI_INT, header_buf[myordering + 1], tag, comm); } /* receive data */ if (0 == -1) { //if (header_buf[1] == -1) { request = smpi_mpi_irecv(buf, count, datatype, 0, tag, comm); smpi_mpi_wait(&request, MPI_STATUS_IGNORE); //printf("\t\tnode %d ordering = %d receive data from root\n",rank,myordering); } else { for (i = 0; i < pipe_length; i++) { recv_request_array[i] = smpi_mpi_irecv((char *)buf + (i * increment), segment, datatype, MPI_ANY_SOURCE, tag, comm); } } /* send data */ if (header_buf[myordering + 1] != -1) { for (i = 0; i < pipe_length; i++) { smpi_mpi_wait(&recv_request_array[i], MPI_STATUS_IGNORE); send_request_array[i] = smpi_mpi_isend((char *)buf + (i * increment), segment, datatype, header_buf[myordering + 1], tag, comm); } smpi_mpi_waitall(pipe_length, send_request_array, send_status_array); } else { smpi_mpi_waitall(pipe_length, recv_request_array, recv_status_array); } } free(send_request_array); free(recv_request_array); free(send_status_array); free(recv_status_array); } /* end pipeline */ /* when count is not divisible by block size, use default BCAST for the remainder */ if ((remainder != 0) && (count > segment)) { XBT_WARN("MPI_bcast_arrival_pattern_aware uses default MPI_bcast."); smpi_mpi_bcast((char *)buf + (pipe_length * increment), remainder, datatype, root, comm); } return MPI_SUCCESS; }
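/* A standalone sketch (plain C, not SMPI code) of the header-chain lookup used by the
   arrival-pattern-aware bcast above. The root fills header_buf with the ranks in the order
   their 1-byte "ready" messages arrived, terminated by -1; every receiver scans the header
   for its own rank and, if the following entry is not -1, forwards the header and the data
   to that rank. The HEADER_SIZE value and the example chain below are illustrative
   assumptions. */
#include <stdio.h>

#define HEADER_SIZE 8 /* assumed capacity; the real constant is defined elsewhere */

int main(void)
{
  /* Example chain built by the root: ranks 3, 1 and 4 signalled readiness in that order. */
  int header_buf[HEADER_SIZE] = {3, 1, 4, -1};
  int rank = 1; /* pretend we are rank 1 */

  /* Locate our position in the chain, exactly like the while loop above. */
  int myordering = 0;
  while (rank != header_buf[myordering])
    myordering++;

  int successor = header_buf[myordering + 1];
  if (successor != -1)
    printf("rank %d is chain position %d and forwards to rank %d\n",
           rank, myordering, successor);
  else
    printf("rank %d is chain position %d, the end of the chain\n", rank, myordering);
  return 0;
}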