int smpi_coll_tuned_reduce_flat_tree(void *sbuf, void *rbuf, int count,
                                     MPI_Datatype dtype, MPI_Op op,
                                     int root, MPI_Comm comm)
{
  int i, tag = 4321;
  int size;
  int rank;
  MPI_Aint extent;
  char *origin = 0;
  char *inbuf;
  MPI_Status status;

  rank = smpi_comm_rank(comm);
  size = smpi_comm_size(comm);

  /* If not root, send data to the root. */
  extent = smpi_datatype_get_extent(dtype);

  if (rank != root) {
    smpi_mpi_send(sbuf, count, dtype, root, tag, comm);
    return 0;
  }

  /* Root receives and reduces messages.  Allocate buffer to receive messages. */
  if (size > 1)
    origin = (char *) xbt_malloc(count * extent);

  /* Initialize the receive buffer. */
  if (rank == (size - 1))
    smpi_mpi_sendrecv(sbuf, count, dtype, rank, tag,
                      rbuf, count, dtype, rank, tag, comm, &status);
  else
    smpi_mpi_recv(rbuf, count, dtype, size - 1, tag, comm, &status);

  /* Loop receiving and calling reduction function (C or Fortran). */
  for (i = size - 2; i >= 0; --i) {
    if (rank == i)
      inbuf = sbuf;
    else {
      smpi_mpi_recv(origin, count, dtype, i, tag, comm, &status);
      inbuf = origin;
    }

    /* Call reduction function. */
    smpi_op_apply(op, inbuf, rbuf, &count, &dtype);
  }

  if (origin)
    free(origin);

  /* All done */
  return 0;
}
/*
 * Simple double-ring version of barrier.
 *
 * The synchronization guarantee comes from the last ring of sends,
 * which are synchronous.
 */
int smpi_coll_tuned_barrier_ompi_doublering(MPI_Comm comm)
{
  int rank, size;
  int left, right;

  rank = smpi_comm_rank(comm);
  size = smpi_comm_size(comm);

  XBT_DEBUG("ompi_coll_tuned_barrier_ompi_doublering rank %d", rank);

  left = ((rank - 1 + size) % size);
  right = ((rank + 1) % size);

  if (rank > 0) { /* receive message from the left */
    smpi_mpi_recv((void*)NULL, 0, MPI_BYTE, left,
                  COLL_TAG_BARRIER, comm, MPI_STATUS_IGNORE);
  }

  /* Send message to the right */
  smpi_mpi_send((void*)NULL, 0, MPI_BYTE, right,
                COLL_TAG_BARRIER, comm);

  /* root needs to receive from the last node */
  if (rank == 0) {
    smpi_mpi_recv((void*)NULL, 0, MPI_BYTE, left,
                  COLL_TAG_BARRIER, comm, MPI_STATUS_IGNORE);
  }

  /* Allow nodes to exit */
  if (rank > 0) { /* post receive from the left */
    smpi_mpi_recv((void*)NULL, 0, MPI_BYTE, left,
                  COLL_TAG_BARRIER, comm, MPI_STATUS_IGNORE);
  }

  /* send message to the right one */
  smpi_mpi_send((void*)NULL, 0, MPI_BYTE, right,
                COLL_TAG_BARRIER, comm);

  /* rank 0 posts a receive from the last node */
  if (rank == 0) {
    smpi_mpi_recv((void*)NULL, 0, MPI_BYTE, left,
                  COLL_TAG_BARRIER, comm, MPI_STATUS_IGNORE);
  }

  return MPI_SUCCESS;
}
static void action_recv(const char *const *action)
{
  int from = atoi(action[2]);
  double size = parse_double(action[3]);
  double clock = smpi_process_simulated_elapsed();
  MPI_Status status;
#ifdef HAVE_TRACING
  int rank = smpi_comm_rank(MPI_COMM_WORLD);
  int src_traced = smpi_group_rank(smpi_comm_group(MPI_COMM_WORLD), from);
  TRACE_smpi_computing_out(rank);
  TRACE_smpi_ptp_in(rank, src_traced, rank, __FUNCTION__);
#endif

  smpi_mpi_recv(NULL, size, MPI_BYTE, from, 0, MPI_COMM_WORLD, &status);

#ifdef HAVE_TRACING
  TRACE_smpi_ptp_out(rank, src_traced, rank, __FUNCTION__);
  TRACE_smpi_recv(rank, src_traced, rank);
  TRACE_smpi_computing_in(rank);
#endif

  if (XBT_LOG_ISENABLED(smpi_replay, xbt_log_priority_verbose)) {
    char *name = xbt_str_join_array(action, " ");
    XBT_VERB("%s %f", name, smpi_process_simulated_elapsed() - clock);
    free(name);
  }
}
static void action_recv(const char *const *action)
{
  int from = atoi(action[2]);
  double size = parse_double(action[3]);
  double clock = smpi_process_simulated_elapsed();
  MPI_Status status;

  if (action[4])
    MPI_CURRENT_TYPE = decode_datatype(action[4]);
  else
    MPI_CURRENT_TYPE = MPI_DEFAULT_TYPE;

#ifdef HAVE_TRACING
  int rank = smpi_comm_rank(MPI_COMM_WORLD);
  int src_traced = smpi_group_rank(smpi_comm_group(MPI_COMM_WORLD), from);
  TRACE_smpi_computing_out(rank);
  TRACE_smpi_ptp_in(rank, src_traced, rank, __FUNCTION__);
#endif

  smpi_mpi_recv(NULL, size, MPI_CURRENT_TYPE, from, 0, MPI_COMM_WORLD, &status);

#ifdef HAVE_TRACING
  TRACE_smpi_ptp_out(rank, src_traced, rank, __FUNCTION__);
  TRACE_smpi_recv(rank, src_traced, rank);
  TRACE_smpi_computing_in(rank);
#endif

  log_timed_action(action, clock);
}
int smpi_coll_tuned_bcast_flattree(void *buff, int count, MPI_Datatype data_type,
                                   int root, MPI_Comm comm)
{
  MPI_Request *req_ptr;
  MPI_Request *reqs;

  int i, rank, num_procs;
  int tag = 1;

  rank = smpi_comm_rank(comm);
  num_procs = smpi_comm_size(comm);

  if (rank != root) {
    smpi_mpi_recv(buff, count, data_type, root, tag, comm, MPI_STATUS_IGNORE);
  } else {
    reqs = (MPI_Request *) xbt_malloc((num_procs - 1) * sizeof(MPI_Request));
    req_ptr = reqs;

    // Root sends data to all others
    for (i = 0; i < num_procs; i++) {
      if (i == rank)
        continue;
      *(req_ptr++) = smpi_mpi_isend(buff, count, data_type, i, tag, comm);
    }

    // wait on all requests
    smpi_mpi_waitall(num_procs - 1, reqs, MPI_STATUSES_IGNORE);

    free(reqs);
  }
  return MPI_SUCCESS;
}
/*
 * Another recursive doubling type algorithm, but in this case
 * we go up the tree and back down the tree.
 */
int smpi_coll_tuned_barrier_ompi_tree(MPI_Comm comm)
{
  int rank, size, depth;
  int jump, partner;

  rank = smpi_comm_rank(comm);
  size = smpi_comm_size(comm);
  XBT_DEBUG("ompi_coll_tuned_barrier_ompi_tree %d", rank);

  /* Find the smallest power of 2 greater than or equal to the communicator size. */
  for (depth = 1; depth < size; depth <<= 1);

  for (jump = 1; jump < depth; jump <<= 1) {
    partner = rank ^ jump;
    if (!(partner & (jump - 1)) && partner < size) {
      if (partner > rank) {
        smpi_mpi_recv(NULL, 0, MPI_BYTE, partner,
                      COLL_TAG_BARRIER, comm, MPI_STATUS_IGNORE);
      } else if (partner < rank) {
        smpi_mpi_send(NULL, 0, MPI_BYTE, partner,
                      COLL_TAG_BARRIER, comm);
      }
    }
  }

  depth >>= 1;
  for (jump = depth; jump > 0; jump >>= 1) {
    partner = rank ^ jump;
    if (!(partner & (jump - 1)) && partner < size) {
      if (partner > rank) {
        smpi_mpi_send(NULL, 0, MPI_BYTE, partner,
                      COLL_TAG_BARRIER, comm);
      } else if (partner < rank) {
        smpi_mpi_recv(NULL, 0, MPI_BYTE, partner,
                      COLL_TAG_BARRIER, comm, MPI_STATUS_IGNORE);
      }
    }
  }

  return MPI_SUCCESS;
}
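/*
 * Illustrative sketch (hypothetical helper, not part of the original sources):
 * the tree barrier above first computes "depth", the smallest power of two
 * greater than or equal to the communicator size, then pairs ranks with
 * partner = rank ^ jump at each level.  The helper below reproduces only
 * that index arithmetic.
 */
static int example_tree_depth(int size)
{
  int depth;
  /* same loop as in smpi_coll_tuned_barrier_ompi_tree */
  for (depth = 1; depth < size; depth <<= 1);
  return depth;            /* e.g. size = 6  ->  depth = 8 */
}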
/*
 * scatter_intra
 *
 * Function:  - basic scatter operation
 * Accepts:   - same arguments as MPI_Scatter()
 * Returns:   - MPI_SUCCESS or error code
 */
int smpi_coll_tuned_scatter_ompi_basic_linear(void *sbuf, int scount,
                                              MPI_Datatype sdtype,
                                              void *rbuf, int rcount,
                                              MPI_Datatype rdtype,
                                              int root, MPI_Comm comm)
{
  int i, rank, size, err;
  char *ptmp;
  ptrdiff_t lb, incr;

  /* Initialize */
  rank = smpi_comm_rank(comm);
  size = smpi_comm_size(comm);

  /* If not root, receive data. */
  if (rank != root) {
    smpi_mpi_recv(rbuf, rcount, rdtype, root,
                  COLL_TAG_SCATTER, comm, MPI_STATUS_IGNORE);
    return MPI_SUCCESS;
  }

  /* I am the root, loop sending data. */
  err = smpi_datatype_extent(sdtype, &lb, &incr);
  if (MPI_SUCCESS != err) {
    return MPI_ERR_OTHER;
  }

  incr *= scount;
  for (i = 0, ptmp = (char *) sbuf; i < size; ++i, ptmp += incr) {
    /* simple optimization */
    if (i == rank) {
      if (MPI_IN_PLACE != rbuf) {
        err = smpi_datatype_copy(ptmp, scount, sdtype, rbuf, rcount, rdtype);
      }
    } else {
      smpi_mpi_send(ptmp, scount, sdtype, i, COLL_TAG_SCATTER, comm);
    }
    if (MPI_SUCCESS != err) {
      return err;
    }
  }

  /* All done */
  return MPI_SUCCESS;
}
int smpi_coll_tuned_barrier_mvapich2_pair(MPI_Comm comm)
{
  int size, rank;
  int d, dst, src;
  int mpi_errno = MPI_SUCCESS;

  size = smpi_comm_size(comm);
  /* Trivial barriers return immediately */
  if (size == 1)
    return MPI_SUCCESS;

  rank = smpi_comm_rank(comm);
  int N2_prev = 1;
  /* N2_prev = greatest power of two <= size of Comm */
  for (N2_prev = 1; N2_prev <= size; N2_prev <<= 1);
  N2_prev >>= 1;

  int surfeit = size - N2_prev;

  /* Perform a combine-like operation */
  if (rank < N2_prev) {
    if (rank < surfeit) {
      /* get the fanin letter from the upper "half" process: */
      dst = N2_prev + rank;
      smpi_mpi_recv(NULL, 0, MPI_BYTE, dst, COLL_TAG_BARRIER,
                    comm, MPI_STATUS_IGNORE);
    }

    /* combine on embedded N2_prev power-of-two processes */
    for (d = 1; d < N2_prev; d <<= 1) {
      dst = (rank ^ d);
      smpi_mpi_sendrecv(NULL, 0, MPI_BYTE, dst, COLL_TAG_BARRIER,
                        NULL, 0, MPI_BYTE, dst, COLL_TAG_BARRIER,
                        comm, MPI_STATUS_IGNORE);
    }

    /* fanout data to nodes above N2_prev... */
    if (rank < surfeit) {
      dst = N2_prev + rank;
      smpi_mpi_send(NULL, 0, MPI_BYTE, dst, COLL_TAG_BARRIER, comm);
    }
  } else {
    /* fanin data to power of 2 subset */
    src = rank - N2_prev;
    smpi_mpi_sendrecv(NULL, 0, MPI_BYTE, src, COLL_TAG_BARRIER,
                      NULL, 0, MPI_BYTE, src, COLL_TAG_BARRIER,
                      comm, MPI_STATUS_IGNORE);
  }

  return mpi_errno;
}
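/*
 * Illustrative sketch (hypothetical helper, not in the original file): the
 * pair barrier above folds the "surfeit" ranks above the largest power of
 * two <= size into that power-of-two subset before the dissemination phase.
 */
static void example_pair_barrier_split(int size, int *N2_prev, int *surfeit)
{
  int p;
  for (p = 1; p <= size; p <<= 1);   /* same loop as above */
  p >>= 1;
  *N2_prev = p;                      /* size = 6 -> N2_prev = 4 */
  *surfeit = size - p;               /* size = 6 -> surfeit = 2 */
}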
int smpi_coll_tuned_barrier_ompi_basic_linear(MPI_Comm comm)
{
  int i;
  int size = smpi_comm_size(comm);
  int rank = smpi_comm_rank(comm);

  /* All non-root send & receive zero-length message. */
  if (rank > 0) {
    smpi_mpi_send(NULL, 0, MPI_BYTE, 0, COLL_TAG_BARRIER, comm);
    smpi_mpi_recv(NULL, 0, MPI_BYTE, 0, COLL_TAG_BARRIER, comm,
                  MPI_STATUS_IGNORE);
  }
  /* The root collects and broadcasts the messages. */
  else {
    MPI_Request* requests;

    requests = (MPI_Request*) malloc(size * sizeof(MPI_Request));
    for (i = 1; i < size; ++i) {
      requests[i] = smpi_mpi_irecv(NULL, 0, MPI_BYTE, MPI_ANY_SOURCE,
                                   COLL_TAG_BARRIER, comm);
    }
    smpi_mpi_waitall(size - 1, requests + 1, MPI_STATUSES_IGNORE);

    for (i = 1; i < size; ++i) {
      requests[i] = smpi_mpi_isend(NULL, 0, MPI_BYTE, i,
                                   COLL_TAG_BARRIER, comm);
    }
    smpi_mpi_waitall(size - 1, requests + 1, MPI_STATUSES_IGNORE);
    free(requests);
  }

  /* All done */
  return MPI_SUCCESS;
}
static void action_recv(const char *const *action)
{
  CHECK_ACTION_PARAMS(action, 2, 1);
  int from = atoi(action[2]);
  double size = parse_double(action[3]);
  double clock = smpi_process_simulated_elapsed();
  MPI_Status status;

  if (action[4])
    MPI_CURRENT_TYPE = decode_datatype(action[4]);
  else
    MPI_CURRENT_TYPE = MPI_DEFAULT_TYPE;

  int rank = smpi_process_index();
  int src_traced = smpi_group_rank(smpi_comm_group(MPI_COMM_WORLD), from);

  instr_extra_data extra = xbt_new0(s_instr_extra_data_t, 1);
  extra->type = TRACING_RECV;
  extra->send_size = size;
  extra->src = src_traced;
  extra->dst = rank;
  extra->datatype1 = encode_datatype(MPI_CURRENT_TYPE, NULL);
  TRACE_smpi_ptp_in(rank, src_traced, rank, __FUNCTION__, extra);

  // unknown size from the receiver's point of view
  if (size == -1) {
    smpi_mpi_probe(from, 0, MPI_COMM_WORLD, &status);
    size = status.count;
  }

  smpi_mpi_recv(NULL, size, MPI_CURRENT_TYPE, from, 0, MPI_COMM_WORLD, &status);

  TRACE_smpi_ptp_out(rank, src_traced, rank, __FUNCTION__);
  if (!TRACE_smpi_view_internals()) {
    TRACE_smpi_recv(rank, src_traced, rank);
  }

  log_timed_action(action, clock);
}
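/*
 * Illustrative sketch (an assumption, not taken from the SimGrid sources):
 * the indices read by action_recv() above.  A replay line is split into the
 * action[] array before the handler is called; only action[2] (source rank),
 * action[3] (size, where -1 means "probe for it") and the optional
 * action[4] (datatype code for decode_datatype()) are used here.
 */
static const char *example_recv_action[] = {
  "1",      /* action[0]: rank issuing the action (assumed layout) */
  "recv",   /* action[1]: action name (assumed layout) */
  "0",      /* action[2]: source rank, read with atoi() */
  "1e6",    /* action[3]: message size, read with parse_double() */
  "0",      /* action[4]: optional datatype code, hypothetical value */
  NULL
};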
/* Non-topology-specific pipelined linear-bcast function */ int smpi_coll_tuned_bcast_arrival_scatter(void *buf, int count, MPI_Datatype datatype, int root, MPI_Comm comm) { int tag = -COLL_TAG_BCAST;//in order to use ANY_TAG, make this one positive int header_tag = 10; MPI_Status status; int curr_remainder; int curr_size; int curr_increment; int send_offset; int recv_offset; int send_count; int recv_count; MPI_Status temp_status_array[BCAST_ARRIVAL_PATTERN_AWARE_MAX_NODE]; int rank, size; int i, k; int sent_count; int header_index; int flag_array[BCAST_ARRIVAL_PATTERN_AWARE_MAX_NODE]; int already_sent[BCAST_ARRIVAL_PATTERN_AWARE_MAX_NODE]; int header_buf[BCAST_ARRIVAL_PATTERN_AWARE_HEADER_SIZE]; char temp_buf[BCAST_ARRIVAL_PATTERN_AWARE_MAX_NODE]; int will_send[BCAST_ARRIVAL_PATTERN_AWARE_MAX_NODE]; int max_node = BCAST_ARRIVAL_PATTERN_AWARE_MAX_NODE; int header_size = BCAST_ARRIVAL_PATTERN_AWARE_HEADER_SIZE; MPI_Aint extent; extent = smpi_datatype_get_extent(datatype); /* source and destination */ int to, from; rank = smpi_comm_rank(comm); size = smpi_comm_size(comm); /* message too small */ if (count < size) { XBT_WARN("MPI_bcast_arrival_scatter use default MPI_bcast."); smpi_mpi_bcast(buf, count, datatype, root, comm); return MPI_SUCCESS; } /* if root is not zero send to rank zero first this can be modified to make it faster by using logical src, dst. */ if (root != 0) { if (rank == root) { smpi_mpi_send(buf, count, datatype, 0, tag - 1, comm); } else if (rank == 0) { smpi_mpi_recv(buf, count, datatype, root, tag - 1, comm, &status); } } /* value == 0 means root has not send data (or header) to the node yet */ for (i = 0; i < max_node; i++) { already_sent[i] = 0; } /* start bcast */ /* root */ if (rank == 0) { for (i = 0; i < max_node; i++) will_send[i] = 0; sent_count = 0; while (sent_count < (size - 1)) { for (k = 0; k < 3; k++) { for (i = 1; i < size; i++) { if ((already_sent[i] == 0) && (will_send[i] == 0)) { smpi_mpi_iprobe(i, MPI_ANY_TAG, comm, &flag_array[i], &temp_status_array[i]); if (flag_array[i] == 1) { will_send[i] = 1; smpi_mpi_recv(&temp_buf[i], 1, MPI_CHAR, i, tag, comm, &status); i = 0; } } } } header_index = 0; /* recv 1-byte message in this round */ for (i = 1; i < size; i++) { /* message arrive */ if ((will_send[i] == 1) && (already_sent[i] == 0)) { header_buf[header_index] = i; header_index++; sent_count++; /* will send in the next step */ already_sent[i] = 1; } } /* if (header_index != 0) { printf("header index = %d node = ",header_index); for (i=0;i<header_index;i++) { printf("%d ",header_buf[i]); } printf("\n"); } */ /* send header followed by data */ if (header_index != 0) { header_buf[header_index] = -1; /* send header */ for (i = 0; i < header_index; i++) { to = header_buf[i]; smpi_mpi_send(header_buf, header_size, MPI_INT, to, header_tag, comm); } curr_remainder = count % header_index; curr_size = (count / header_index); curr_increment = curr_size * extent; /* send data */ for (i = 0; i < header_index; i++) { to = header_buf[i]; if ((i == (header_index - 1)) || (curr_size == 0)) curr_size += curr_remainder; //printf("Root send to %d index %d\n",to,(i*curr_increment)); smpi_mpi_send((char *) buf + (i * curr_increment), curr_size, datatype, to, tag, comm); } } } /* while (sent_count < size-1) */ } /* rank 0 */ /* none root */ else { /* send 1-byte message to root */ smpi_mpi_send(temp_buf, 1, MPI_CHAR, 0, tag, comm); /* wait for header forward when required */ smpi_mpi_recv(header_buf, header_size, MPI_INT, 0, header_tag, comm, &status); /* search for where it 
is */ int myordering = 0; while (rank != header_buf[myordering]) { myordering++; } int total_nodes = 0; while (header_buf[total_nodes] != -1) { total_nodes++; } curr_remainder = count % total_nodes; curr_size = (count / total_nodes); curr_increment = curr_size * extent; int recv_size = curr_size; /* receive data */ if (myordering == (total_nodes - 1)) recv_size += curr_remainder; smpi_mpi_recv((char *) buf + (myordering * curr_increment), recv_size, datatype, 0, tag, comm, &status); /* at this point all nodes in this set perform all-gather operation */ to = header_buf[myordering + 1]; from = header_buf[myordering - 1]; if (myordering == 0) from = header_buf[total_nodes - 1]; if (myordering == (total_nodes - 1)) to = header_buf[0]; /* last segment may have a larger size since it also include the remainder */ int last_segment_ptr = (total_nodes - 1) * (count / total_nodes) * extent; /* allgather */ for (i = 0; i < total_nodes - 1; i++) { send_offset = ((myordering - i + total_nodes) % total_nodes) * curr_increment; recv_offset = ((myordering - i - 1 + total_nodes) % total_nodes) * curr_increment; /* adjust size */ if (send_offset != last_segment_ptr) send_count = curr_size; else send_count = curr_size + curr_remainder; if (recv_offset != last_segment_ptr) recv_count = curr_size; else recv_count = curr_size + curr_remainder; //printf("\t\tnode %d sent_to %d recv_from %d send_size %d recv_size %d\n",rank,to,from,send_count,recv_count); //printf("\tnode %d sent_offset %d send_count %d\n",rank,send_offset,send_count); smpi_mpi_sendrecv((char *) buf + send_offset, send_count, datatype, to, tag + i, (char *) buf + recv_offset, recv_count, datatype, from, tag + i, comm, &status); } } /* non-root */ return MPI_SUCCESS; }
int smpi_coll_tuned_bcast_mvapich2_inter_node(void *buffer,
                                              int count,
                                              MPI_Datatype datatype,
                                              int root,
                                              MPI_Comm comm)
{
  int rank;
  int mpi_errno = MPI_SUCCESS;
  MPI_Comm shmem_comm, leader_comm;
  int local_rank, local_size, global_rank = -1;
  int leader_root, leader_of_root;

  rank = smpi_comm_rank(comm);
  //comm_size = smpi_comm_size(comm);

  if (MV2_Bcast_function == NULL){
    MV2_Bcast_function = smpi_coll_tuned_bcast_mpich;
  }

  if (MV2_Bcast_intra_node_function == NULL){
    MV2_Bcast_intra_node_function = smpi_coll_tuned_bcast_mpich;
  }

  if (smpi_comm_get_leaders_comm(comm) == MPI_COMM_NULL){
    smpi_comm_init_smp(comm);
  }

  shmem_comm = smpi_comm_get_intra_comm(comm);
  local_rank = smpi_comm_rank(shmem_comm);
  local_size = smpi_comm_size(shmem_comm);

  leader_comm = smpi_comm_get_leaders_comm(comm);

  if ((local_rank == 0) && (local_size > 1)) {
    global_rank = smpi_comm_rank(leader_comm);
  }

  int* leaders_map = smpi_comm_get_leaders_map(comm);
  leader_of_root = smpi_group_rank(smpi_comm_group(comm), leaders_map[root]);
  leader_root = smpi_group_rank(smpi_comm_group(leader_comm), leaders_map[root]);

  if (local_size > 1) {
    if ((local_rank == 0) && (root != rank) && (leader_root == global_rank)) {
      smpi_mpi_recv(buffer, count, datatype, root,
                    COLL_TAG_BCAST, comm, MPI_STATUS_IGNORE);
    }
    if ((local_rank != 0) && (root == rank)) {
      smpi_mpi_send(buffer, count, datatype, leader_of_root,
                    COLL_TAG_BCAST, comm);
    }
  }
#if defined(_MCST_SUPPORT_)
  if (comm_ptr->ch.is_mcast_ok) {
    mpi_errno = MPIR_Mcast_inter_node_MV2(buffer, count, datatype, root,
                                          comm_ptr, errflag);
    if (mpi_errno == MPI_SUCCESS) {
      goto fn_exit;
    } else {
      goto fn_fail;
    }
  }
#endif
/*
  if (local_rank == 0) {
    leader_comm = smpi_comm_get_leaders_comm(comm);
    root = leader_root;
  }

  if (MV2_Bcast_function == &MPIR_Pipelined_Bcast_MV2) {
    mpi_errno = MPIR_Pipelined_Bcast_MV2(buffer, count, datatype, root, comm);
  } else if (MV2_Bcast_function == &MPIR_Bcast_scatter_ring_allgather_shm_MV2) {
    mpi_errno = MPIR_Bcast_scatter_ring_allgather_shm_MV2(buffer, count,
                                                          datatype, root, comm);
  } else */{
    if (local_rank == 0) {
      /* if (MV2_Bcast_function == &MPIR_Knomial_Bcast_inter_node_wrapper_MV2) {
           mpi_errno = MPIR_Knomial_Bcast_inter_node_wrapper_MV2(buffer, count,
                                                                 datatype, root, comm);
         } else {*/
      mpi_errno = MV2_Bcast_function(buffer, count, datatype,
                                     leader_root, leader_comm);
      //      }
    }
  }

  return mpi_errno;
}
int smpi_coll_tuned_bcast_mvapich2_knomial_intra_node(void *buffer,
                                                      int count,
                                                      MPI_Datatype datatype,
                                                      int root, MPI_Comm comm)
{
  int local_size = 0, rank;
  int mpi_errno = MPI_SUCCESS;
  MPI_Request *reqarray = NULL;
  MPI_Status *starray = NULL;
  int src, dst, mask, relative_rank;
  int k;

  if (MV2_Bcast_function == NULL){
    MV2_Bcast_function = smpi_coll_tuned_bcast_mpich;
  }

  if (MV2_Bcast_intra_node_function == NULL){
    MV2_Bcast_intra_node_function = smpi_coll_tuned_bcast_mpich;
  }

  if (smpi_comm_get_leaders_comm(comm) == MPI_COMM_NULL){
    smpi_comm_init_smp(comm);
  }

  local_size = smpi_comm_size(comm);
  rank = smpi_comm_rank(comm);

  reqarray = (MPI_Request *) xbt_malloc(2 * mv2_intra_node_knomial_factor *
                                        sizeof(MPI_Request));
  starray = (MPI_Status *) xbt_malloc(2 * mv2_intra_node_knomial_factor *
                                      sizeof(MPI_Status));

  /* intra-node k-nomial bcast */
  if (local_size > 1) {
    relative_rank = (rank >= root) ? rank - root : rank - root + local_size;
    mask = 0x1;

    while (mask < local_size) {
      if (relative_rank % (mv2_intra_node_knomial_factor * mask)) {
        src = relative_rank / (mv2_intra_node_knomial_factor * mask) *
              (mv2_intra_node_knomial_factor * mask) + root;
        if (src >= local_size) {
          src -= local_size;
        }

        smpi_mpi_recv(buffer, count, datatype, src,
                      COLL_TAG_BCAST, comm, MPI_STATUS_IGNORE);
        break;
      }
      mask *= mv2_intra_node_knomial_factor;
    }
    mask /= mv2_intra_node_knomial_factor;

    while (mask > 0) {
      int reqs = 0;
      for (k = 1; k < mv2_intra_node_knomial_factor; k++) {
        if (relative_rank + mask * k < local_size) {
          dst = rank + mask * k;
          if (dst >= local_size) {
            dst -= local_size;
          }
          reqarray[reqs++] = smpi_mpi_isend(buffer, count, datatype, dst,
                                            COLL_TAG_BCAST, comm);
        }
      }
      smpi_mpi_waitall(reqs, reqarray, starray);

      mask /= mv2_intra_node_knomial_factor;
    }
  }

  xbt_free(reqarray);
  xbt_free(starray);
  return mpi_errno;
}
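/*
 * Illustrative sketch (hypothetical helper): the k-nomial bcast above lets a
 * non-root rank find its parent by locating the first level "mask" at which
 * its relative rank is not a multiple of k*mask, then rounding down to that
 * multiple.  Example with k = 4 and 16 ranks rooted at 0: relative rank 7
 * receives from 4, and relative rank 4 receives from 0.
 */
static int example_knomial_parent(int relative_rank, int k, int size)
{
  int mask = 1;
  while (mask < size) {
    if (relative_rank % (k * mask))
      return relative_rank / (k * mask) * (k * mask);
    mask *= k;
  }
  return 0; /* the root has no parent */
}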
int smpi_coll_tuned_allreduce_rdb(void *sbuff, void *rbuff, int count, MPI_Datatype dtype, MPI_Op op, MPI_Comm comm) { int nprocs, rank, tag = COLL_TAG_ALLREDUCE; int mask, dst, pof2, newrank, rem, newdst; MPI_Aint extent, lb; MPI_Status status; void *tmp_buf = NULL; /* #ifdef MPICH2_REDUCTION MPI_User_function * uop = MPIR_Op_table[op % 16 - 1]; #else MPI_User_function *uop; struct MPIR_OP *op_ptr; op_ptr = MPIR_ToPointer(op); uop = op_ptr->op; #endif */ nprocs=smpi_comm_size(comm); rank=smpi_comm_rank(comm); smpi_datatype_extent(dtype, &lb, &extent); tmp_buf = (void *) xbt_malloc(count * extent); smpi_mpi_sendrecv(sbuff, count, dtype, rank, 500, rbuff, count, dtype, rank, 500, comm, &status); // find nearest power-of-two less than or equal to comm_size pof2 = 1; while (pof2 <= nprocs) pof2 <<= 1; pof2 >>= 1; rem = nprocs - pof2; // In the non-power-of-two case, all even-numbered // processes of rank < 2*rem send their data to // (rank+1). These even-numbered processes no longer // participate in the algorithm until the very end. The // remaining processes form a nice power-of-two. if (rank < 2 * rem) { // even if (rank % 2 == 0) { smpi_mpi_send(rbuff, count, dtype, rank + 1, tag, comm); // temporarily set the rank to -1 so that this // process does not pariticipate in recursive // doubling newrank = -1; } else // odd { smpi_mpi_recv(tmp_buf, count, dtype, rank - 1, tag, comm, &status); // do the reduction on received data. since the // ordering is right, it doesn't matter whether // the operation is commutative or not. smpi_op_apply(op, tmp_buf, rbuff, &count, &dtype); // change the rank newrank = rank / 2; } } else // rank >= 2 * rem newrank = rank - rem; // If op is user-defined or count is less than pof2, use // recursive doubling algorithm. Otherwise do a reduce-scatter // followed by allgather. (If op is user-defined, // derived datatypes are allowed and the user could pass basic // datatypes on one process and derived on another as long as // the type maps are the same. Breaking up derived // datatypes to do the reduce-scatter is tricky, therefore // using recursive doubling in that case.) if (newrank != -1) { mask = 0x1; while (mask < pof2) { newdst = newrank ^ mask; // find real rank of dest dst = (newdst < rem) ? newdst * 2 + 1 : newdst + rem; // Send the most current data, which is in recvbuf. Recv // into tmp_buf smpi_mpi_sendrecv(rbuff, count, dtype, dst, tag, tmp_buf, count, dtype, dst, tag, comm, &status); // tmp_buf contains data received in this step. // recvbuf contains data accumulated so far // op is commutative OR the order is already right // we assume it is commuttive op // if (op -> op_commute || (dst < rank)) if ((dst < rank)) { smpi_op_apply(op, tmp_buf, rbuff, &count, &dtype); } else // op is noncommutative and the order is not right { smpi_op_apply(op, rbuff, tmp_buf, &count, &dtype); // copy result back into recvbuf smpi_mpi_sendrecv(tmp_buf, count, dtype, rank, tag, rbuff, count, dtype, rank, tag, comm, &status); } mask <<= 1; } } // In the non-power-of-two case, all odd-numbered processes of // rank < 2 * rem send the result to (rank-1), the ranks who didn't // participate above. if (rank < 2 * rem) { if (rank % 2) // odd smpi_mpi_send(rbuff, count, dtype, rank - 1, tag, comm); else // even smpi_mpi_recv(rbuff, count, dtype, rank + 1, tag, comm, &status); } free(tmp_buf); return MPI_SUCCESS; }
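/*
 * Illustrative sketch (hypothetical helper): the recursive-doubling
 * allreduce above first folds the communicator down to a power of two.
 * With rem = nprocs - pof2, even ranks below 2*rem drop out (newrank = -1),
 * odd ranks below 2*rem become rank/2, and the remaining ranks become
 * rank - rem.  Example with nprocs = 6 (pof2 = 4, rem = 2): ranks 0 and 2
 * drop out, ranks 1 and 3 become 0 and 1, ranks 4 and 5 become 2 and 3.
 */
static int example_rdb_newrank(int rank, int nprocs)
{
  int pof2 = 1, rem;
  while (pof2 <= nprocs)
    pof2 <<= 1;
  pof2 >>= 1;
  rem = nprocs - pof2;
  if (rank < 2 * rem)
    return (rank % 2 == 0) ? -1 : rank / 2;
  return rank - rem;
}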
/* Non-topology-specific pipelined linear-reduce function */ int smpi_coll_tuned_reduce_arrival_pattern_aware(void *buf, void *rbuf, int count, MPI_Datatype datatype, MPI_Op op, int root, MPI_Comm comm) { int rank; rank = smpi_comm_rank(comm); int tag = -COLL_TAG_REDUCE; MPI_Status status; MPI_Request request; MPI_Request *send_request_array; MPI_Request *recv_request_array; MPI_Status *send_status_array; MPI_Status *recv_status_array; MPI_Status temp_status_array[MAX_NODE]; int size; int i; int sent_count; int header_index; int flag_array[MAX_NODE]; int already_received[MAX_NODE]; int header_buf[HEADER_SIZE]; char temp_buf[MAX_NODE]; MPI_Aint extent, lb; smpi_datatype_extent(datatype, &lb, &extent); /* source and destination */ int to, from; size=smpi_comm_size(comm); rank=smpi_comm_rank(comm); /* segment is segment size in number of elements (not bytes) */ int segment = reduce_arrival_pattern_aware_segment_size_in_byte / extent; /* pipeline length */ int pipe_length = count / segment; /* use for buffer offset for sending and receiving data = segment size in byte */ int increment = segment * extent; /* if the input size is not divisible by segment size => the small remainder will be done with native implementation */ int remainder = count % segment; /* value == 0 means root has not send data (or header) to the node yet */ for (i = 0; i < MAX_NODE; i++) { already_received[i] = 0; } char *tmp_buf; tmp_buf = (char *) xbt_malloc(count * extent); smpi_mpi_sendrecv(buf, count, datatype, rank, tag, rbuf, count, datatype, rank, tag, comm, &status); /* when a message is smaller than a block size => no pipeline */ if (count <= segment) { if (rank == 0) { sent_count = 0; while (sent_count < (size - 1)) { for (i = 1; i < size; i++) { if (already_received[i] == 0) { smpi_mpi_iprobe(i, MPI_ANY_TAG, MPI_COMM_WORLD, &flag_array[i], MPI_STATUSES_IGNORE); simcall_process_sleep(0.0001); } } header_index = 0; /* recv 1-byte message */ for (i = 0; i < size; i++) { if (i == rank) continue; /* 1-byte message arrive */ if ((flag_array[i] == 1) && (already_received[i] == 0)) { smpi_mpi_recv(temp_buf, 1, MPI_CHAR, i, tag, MPI_COMM_WORLD, &status); header_buf[header_index] = i; header_index++; sent_count++; //printf("root send to %d recv from %d : data = ",to,from); /* for (i=0;i<=header_index;i++) { printf("%d ",header_buf[i]); } printf("\n"); */ /* will receive in the next step */ already_received[i] = 1; } } /* send header followed by receive and reduce data */ if (header_index != 0) { header_buf[header_index] = -1; to = header_buf[0]; from = header_buf[header_index - 1]; smpi_mpi_send(header_buf, HEADER_SIZE, MPI_INT, to, tag, comm); smpi_mpi_recv(tmp_buf, count, datatype, from, tag, comm, &status); smpi_op_apply(op, tmp_buf, rbuf, &count, &datatype); } } /* while loop */ } /* root */ /* non-root */ else { /* send 1-byte message to root */ smpi_mpi_send(temp_buf, 1, MPI_CHAR, 0, tag, comm); /* wait for header and data, forward when required */ smpi_mpi_recv(header_buf, HEADER_SIZE, MPI_INT, MPI_ANY_SOURCE, tag, comm, &status); // smpi_mpi_recv(buf,count,datatype,MPI_ANY_SOURCE,tag,comm,&status); /* search for where it is */ int myordering = 0; while (rank != header_buf[myordering]) { myordering++; } /* forward header */ if (header_buf[myordering + 1] != -1) { smpi_mpi_send(header_buf, HEADER_SIZE, MPI_INT, header_buf[myordering + 1], tag, comm); } //printf("node %d ordering %d\n",rank,myordering); /* receive, reduce, and forward data */ /* send only */ if (myordering == 0) { if (header_buf[myordering + 1] == 
-1) { to = 0; } else { to = header_buf[myordering + 1]; } smpi_mpi_send(rbuf, count, datatype, to, tag, comm); } /* recv, reduce, send */ else { if (header_buf[myordering + 1] == -1) { to = 0; } else { to = header_buf[myordering + 1]; } from = header_buf[myordering - 1]; smpi_mpi_recv(tmp_buf, count, datatype, header_buf[myordering - 1], tag, comm, &status); smpi_op_apply(op, tmp_buf, rbuf, &count, &datatype); smpi_mpi_send(rbuf, count, datatype, to, tag, comm); } } /* non-root */ } /* pipeline bcast */ else { // printf("node %d start\n",rank); send_request_array = (MPI_Request *) xbt_malloc((size + pipe_length) * sizeof(MPI_Request)); recv_request_array = (MPI_Request *) xbt_malloc((size + pipe_length) * sizeof(MPI_Request)); send_status_array = (MPI_Status *) xbt_malloc((size + pipe_length) * sizeof(MPI_Status)); recv_status_array = (MPI_Status *) xbt_malloc((size + pipe_length) * sizeof(MPI_Status)); if (rank == 0) { sent_count = 0; int will_send[MAX_NODE]; for (i = 0; i < MAX_NODE; i++) will_send[i] = 0; /* loop until all data are received (sent) */ while (sent_count < (size - 1)) { int k; for (k = 0; k < 1; k++) { for (i = 1; i < size; i++) { //if (i == rank) //continue; if ((already_received[i] == 0) && (will_send[i] == 0)) { smpi_mpi_iprobe(i, MPI_ANY_TAG, MPI_COMM_WORLD, &flag_array[i], &temp_status_array[i]); if (flag_array[i] == 1) { will_send[i] = 1; smpi_mpi_recv(&temp_buf[i], 1, MPI_CHAR, i, tag, MPI_COMM_WORLD, &status); //printf("recv from %d\n",i); i = 1; } } } } /* end of probing */ header_index = 0; /* recv 1-byte message */ for (i = 1; i < size; i++) { //if (i==rank) //continue; /* message arrived in this round (put in the header) */ if ((will_send[i] == 1) && (already_received[i] == 0)) { header_buf[header_index] = i; header_index++; sent_count++; /* will send in the next step */ already_received[i] = 1; } } /* send header followed by data */ if (header_index != 0) { header_buf[header_index] = -1; to = header_buf[0]; /* send header */ smpi_mpi_send(header_buf, HEADER_SIZE, MPI_INT, to, tag, comm); /* recv data - pipeline */ from = header_buf[header_index - 1]; for (i = 0; i < pipe_length; i++) { smpi_mpi_recv(tmp_buf + (i * increment), segment, datatype, from, tag, comm, &status); smpi_op_apply(op, tmp_buf + (i * increment), (char *)rbuf + (i * increment), &segment, &datatype); } } } /* while loop (sent_count < size-1 ) */ } /* root */ /* none root */ else { /* send 1-byte message to root */ smpi_mpi_send(temp_buf, 1, MPI_CHAR, 0, tag, comm); /* wait for header forward when required */ request=smpi_mpi_irecv(header_buf, HEADER_SIZE, MPI_INT, MPI_ANY_SOURCE, tag, comm); smpi_mpi_wait(&request, MPI_STATUS_IGNORE); /* search for where it is */ int myordering = 0; while (rank != header_buf[myordering]) { myordering++; } /* send header when required */ if (header_buf[myordering + 1] != -1) { smpi_mpi_send(header_buf, HEADER_SIZE, MPI_INT, header_buf[myordering + 1], tag, comm); } /* (receive, reduce), and send data */ if (header_buf[myordering + 1] == -1) { to = 0; } else { to = header_buf[myordering + 1]; } /* send only */ if (myordering == 0) { for (i = 0; i < pipe_length; i++) { send_request_array[i]= smpi_mpi_isend((char *)rbuf + (i * increment), segment, datatype, to, tag, comm); } smpi_mpi_waitall((pipe_length), send_request_array, send_status_array); } /* receive, reduce, and send */ else { from = header_buf[myordering - 1]; for (i = 0; i < pipe_length; i++) { recv_request_array[i]=smpi_mpi_irecv(tmp_buf + (i * increment), segment, datatype, from, tag, comm); } for (i 
= 0; i < pipe_length; i++) { smpi_mpi_wait(&recv_request_array[i], MPI_STATUS_IGNORE); smpi_op_apply(op, tmp_buf + (i * increment), (char *)rbuf + (i * increment), &segment, &datatype); send_request_array[i]=smpi_mpi_isend((char *)rbuf + (i * increment), segment, datatype, to, tag, comm); } smpi_mpi_waitall((pipe_length), send_request_array, send_status_array); } } /* non-root */ free(send_request_array); free(recv_request_array); free(send_status_array); free(recv_status_array); //printf("node %d done\n",rank); } /* end pipeline */ /* if root is not zero send root after finished this can be modified to make it faster by using logical src, dst. */ if (root != 0) { if (rank == 0) { smpi_mpi_send(rbuf, count, datatype, root, tag, comm); } else if (rank == root) { smpi_mpi_recv(rbuf, count, datatype, 0, tag, comm, &status); } } /* when count is not divisible by block size, use default BCAST for the remainder */ if ((remainder != 0) && (count > segment)) { smpi_mpi_reduce((char *)buf + (pipe_length * increment), (char *)rbuf + (pipe_length * increment), remainder, datatype, op, root, comm); } free(tmp_buf); return MPI_SUCCESS; }
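/*
 * Illustrative sketch (hypothetical helper): like several pipelined
 * collectives in this file, the reduce above chops the buffer into
 * fixed-size segments, and the leftover elements (count % segment) are
 * handled by a separate non-pipelined call at the end.  Assumes the
 * configured segment size in bytes is at least one datatype extent.
 */
static void example_pipeline_split(int count, MPI_Aint extent,
                                   int segment_size_in_byte,
                                   int *segment, int *pipe_length,
                                   int *remainder)
{
  *segment = segment_size_in_byte / extent;  /* elements per segment */
  *pipe_length = count / *segment;           /* number of full segments */
  *remainder = count % *segment;             /* elements left over */
}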
int smpi_coll_tuned_bcast_NTSB(void *buf, int count, MPI_Datatype datatype, int root, MPI_Comm comm) { int tag = COLL_TAG_BCAST; MPI_Status status; int rank, size; int i; MPI_Request *send_request_array; MPI_Request *recv_request_array; MPI_Status *send_status_array; MPI_Status *recv_status_array; MPI_Aint extent; extent = smpi_datatype_get_extent(datatype); rank = smpi_comm_rank(MPI_COMM_WORLD); size = smpi_comm_size(MPI_COMM_WORLD); /* source node and destination nodes (same through out the functions) */ int from = (rank - 1) / 2; int to_left = rank * 2 + 1; int to_right = rank * 2 + 2; if (to_left >= size) to_left = -1; if (to_right >= size) to_right = -1; /* segment is segment size in number of elements (not bytes) */ int segment = bcast_NTSB_segment_size_in_byte / extent; /* pipeline length */ int pipe_length = count / segment; /* use for buffer offset for sending and receiving data = segment size in byte */ int increment = segment * extent; /* if the input size is not divisible by segment size => the small remainder will be done with native implementation */ int remainder = count % segment; /* if root is not zero send to rank zero first */ if (root != 0) { if (rank == root) { smpi_mpi_send(buf, count, datatype, 0, tag, comm); } else if (rank == 0) { smpi_mpi_recv(buf, count, datatype, root, tag, comm, &status); } } /* when a message is smaller than a block size => no pipeline */ if (count <= segment) { /* case: root */ if (rank == 0) { /* case root has only a left child */ if (to_right == -1) { smpi_mpi_send(buf, count, datatype, to_left, tag, comm); } /* case root has both left and right children */ else { smpi_mpi_send(buf, count, datatype, to_left, tag, comm); smpi_mpi_send(buf, count, datatype, to_right, tag, comm); } } /* case: leaf ==> receive only */ else if (to_left == -1) { smpi_mpi_recv(buf, count, datatype, from, tag, comm, &status); } /* case: intermidiate node with only left child ==> relay message */ else if (to_right == -1) { smpi_mpi_recv(buf, count, datatype, from, tag, comm, &status); smpi_mpi_send(buf, count, datatype, to_left, tag, comm); } /* case: intermidiate node with both left and right children ==> relay message */ else { smpi_mpi_recv(buf, count, datatype, from, tag, comm, &status); smpi_mpi_send(buf, count, datatype, to_left, tag, comm); smpi_mpi_send(buf, count, datatype, to_right, tag, comm); } return MPI_SUCCESS; } // pipelining else { send_request_array = (MPI_Request *) xbt_malloc(2 * (size + pipe_length) * sizeof(MPI_Request)); recv_request_array = (MPI_Request *) xbt_malloc((size + pipe_length) * sizeof(MPI_Request)); send_status_array = (MPI_Status *) xbt_malloc(2 * (size + pipe_length) * sizeof(MPI_Status)); recv_status_array = (MPI_Status *) xbt_malloc((size + pipe_length) * sizeof(MPI_Status)); /* case: root */ if (rank == 0) { /* case root has only a left child */ if (to_right == -1) { for (i = 0; i < pipe_length; i++) { send_request_array[i] = smpi_mpi_isend((char *) buf + (i * increment), segment, datatype, to_left, tag + i, comm); } smpi_mpi_waitall((pipe_length), send_request_array, send_status_array); } /* case root has both left and right children */ else { for (i = 0; i < pipe_length; i++) { send_request_array[i] = smpi_mpi_isend((char *) buf + (i * increment), segment, datatype, to_left, tag + i, comm); send_request_array[i + pipe_length] = smpi_mpi_isend((char *) buf + (i * increment), segment, datatype, to_right, tag + i, comm); } smpi_mpi_waitall((2 * pipe_length), send_request_array, send_status_array); } } /* case: leaf ==> receive 
only */ else if (to_left == -1) { for (i = 0; i < pipe_length; i++) { recv_request_array[i] = smpi_mpi_irecv((char *) buf + (i * increment), segment, datatype, from, tag + i, comm); } smpi_mpi_waitall((pipe_length), recv_request_array, recv_status_array); } /* case: intermidiate node with only left child ==> relay message */ else if (to_right == -1) { for (i = 0; i < pipe_length; i++) { recv_request_array[i] = smpi_mpi_irecv((char *) buf + (i * increment), segment, datatype, from, tag + i, comm); } for (i = 0; i < pipe_length; i++) { smpi_mpi_wait(&recv_request_array[i], &status); send_request_array[i] = smpi_mpi_isend((char *) buf + (i * increment), segment, datatype, to_left, tag + i, comm); } smpi_mpi_waitall(pipe_length, send_request_array, send_status_array); } /* case: intermidiate node with both left and right children ==> relay message */ else { for (i = 0; i < pipe_length; i++) { recv_request_array[i] = smpi_mpi_irecv((char *) buf + (i * increment), segment, datatype, from, tag + i, comm); } for (i = 0; i < pipe_length; i++) { smpi_mpi_wait(&recv_request_array[i], &status); send_request_array[i] = smpi_mpi_isend((char *) buf + (i * increment), segment, datatype, to_left, tag + i, comm); send_request_array[i + pipe_length] = smpi_mpi_isend((char *) buf + (i * increment), segment, datatype, to_right, tag + i, comm); } smpi_mpi_waitall((2 * pipe_length), send_request_array, send_status_array); } free(send_request_array); free(recv_request_array); free(send_status_array); free(recv_status_array); } /* end pipeline */ /* when count is not divisible by block size, use default BCAST for the remainder */ if ((remainder != 0) && (count > segment)) { XBT_WARN("MPI_bcast_NTSB use default MPI_bcast."); smpi_mpi_bcast((char *) buf + (pipe_length * increment), remainder, datatype, root, comm); } return MPI_SUCCESS; }
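/*
 * Illustrative sketch (hypothetical helper): the NTSB bcast above uses the
 * classic array encoding of a binary tree rooted at rank 0: the parent of
 * rank r is (r - 1) / 2 and its children are 2r + 1 and 2r + 2, with
 * children that fall outside the communicator marked as -1.
 */
static void example_ntsb_neighbors(int rank, int size,
                                   int *from, int *to_left, int *to_right)
{
  *from = (rank - 1) / 2;
  *to_left = rank * 2 + 1;
  *to_right = rank * 2 + 2;
  if (*to_left >= size)
    *to_left = -1;
  if (*to_right >= size)
    *to_right = -1;
}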
int smpi_coll_tuned_bcast_SMP_binary(void *buf, int count, MPI_Datatype datatype, int root, MPI_Comm comm) { int tag = COLL_TAG_BCAST; MPI_Status status; MPI_Request request; MPI_Request *request_array; MPI_Status *status_array; int rank, size; int i; MPI_Aint extent; extent = smpi_datatype_get_extent(datatype); rank = smpi_comm_rank(comm); size = smpi_comm_size(comm); if(smpi_comm_get_leaders_comm(comm)==MPI_COMM_NULL){ smpi_comm_init_smp(comm); } int host_num_core=1; if (smpi_comm_is_uniform(comm)){ host_num_core = smpi_comm_size(smpi_comm_get_intra_comm(comm)); }else{ //implementation buggy in this case return smpi_coll_tuned_bcast_mpich( buf , count, datatype, root, comm); } int segment = bcast_SMP_binary_segment_byte / extent; int pipe_length = count / segment; int remainder = count % segment; int to_intra_left = (rank / host_num_core) * host_num_core + (rank % host_num_core) * 2 + 1; int to_intra_right = (rank / host_num_core) * host_num_core + (rank % host_num_core) * 2 + 2; int to_inter_left = ((rank / host_num_core) * 2 + 1) * host_num_core; int to_inter_right = ((rank / host_num_core) * 2 + 2) * host_num_core; int from_inter = (((rank / host_num_core) - 1) / 2) * host_num_core; int from_intra = (rank / host_num_core) * host_num_core + ((rank % host_num_core) - 1) / 2; int increment = segment * extent; int base = (rank / host_num_core) * host_num_core; int num_core = host_num_core; if (((rank / host_num_core) * host_num_core) == ((size / host_num_core) * host_num_core)) num_core = size - (rank / host_num_core) * host_num_core; // if root is not zero send to rank zero first if (root != 0) { if (rank == root) smpi_mpi_send(buf, count, datatype, 0, tag, comm); else if (rank == 0) smpi_mpi_recv(buf, count, datatype, root, tag, comm, &status); } // when a message is smaller than a block size => no pipeline if (count <= segment) { // case ROOT-of-each-SMP if (rank % host_num_core == 0) { // case ROOT if (rank == 0) { //printf("node %d left %d right %d\n",rank,to_inter_left,to_inter_right); if (to_inter_left < size) smpi_mpi_send(buf, count, datatype, to_inter_left, tag, comm); if (to_inter_right < size) smpi_mpi_send(buf, count, datatype, to_inter_right, tag, comm); if ((to_intra_left - base) < num_core) smpi_mpi_send(buf, count, datatype, to_intra_left, tag, comm); if ((to_intra_right - base) < num_core) smpi_mpi_send(buf, count, datatype, to_intra_right, tag, comm); } // case LEAVES ROOT-of-eash-SMP else if (to_inter_left >= size) { //printf("node %d from %d\n",rank,from_inter); request = smpi_mpi_irecv(buf, count, datatype, from_inter, tag, comm); smpi_mpi_wait(&request, &status); if ((to_intra_left - base) < num_core) smpi_mpi_send(buf, count, datatype, to_intra_left, tag, comm); if ((to_intra_right - base) < num_core) smpi_mpi_send(buf, count, datatype, to_intra_right, tag, comm); } // case INTERMEDIAT ROOT-of-each-SMP else { //printf("node %d left %d right %d from %d\n",rank,to_inter_left,to_inter_right,from_inter); request = smpi_mpi_irecv(buf, count, datatype, from_inter, tag, comm); smpi_mpi_wait(&request, &status); smpi_mpi_send(buf, count, datatype, to_inter_left, tag, comm); if (to_inter_right < size) smpi_mpi_send(buf, count, datatype, to_inter_right, tag, comm); if ((to_intra_left - base) < num_core) smpi_mpi_send(buf, count, datatype, to_intra_left, tag, comm); if ((to_intra_right - base) < num_core) smpi_mpi_send(buf, count, datatype, to_intra_right, tag, comm); } } // case non ROOT-of-each-SMP else { // case leaves if ((to_intra_left - base) >= num_core) { request = 
smpi_mpi_irecv(buf, count, datatype, from_intra, tag, comm); smpi_mpi_wait(&request, &status); } // case intermediate else { request = smpi_mpi_irecv(buf, count, datatype, from_intra, tag, comm); smpi_mpi_wait(&request, &status); smpi_mpi_send(buf, count, datatype, to_intra_left, tag, comm); if ((to_intra_right - base) < num_core) smpi_mpi_send(buf, count, datatype, to_intra_right, tag, comm); } } return MPI_SUCCESS; } // pipeline bcast else { request_array = (MPI_Request *) xbt_malloc((size + pipe_length) * sizeof(MPI_Request)); status_array = (MPI_Status *) xbt_malloc((size + pipe_length) * sizeof(MPI_Status)); // case ROOT-of-each-SMP if (rank % host_num_core == 0) { // case ROOT if (rank == 0) { for (i = 0; i < pipe_length; i++) { //printf("node %d left %d right %d\n",rank,to_inter_left,to_inter_right); if (to_inter_left < size) smpi_mpi_send((char *) buf + (i * increment), segment, datatype, to_inter_left, (tag + i), comm); if (to_inter_right < size) smpi_mpi_send((char *) buf + (i * increment), segment, datatype, to_inter_right, (tag + i), comm); if ((to_intra_left - base) < num_core) smpi_mpi_send((char *) buf + (i * increment), segment, datatype, to_intra_left, (tag + i), comm); if ((to_intra_right - base) < num_core) smpi_mpi_send((char *) buf + (i * increment), segment, datatype, to_intra_right, (tag + i), comm); } } // case LEAVES ROOT-of-eash-SMP else if (to_inter_left >= size) { //printf("node %d from %d\n",rank,from_inter); for (i = 0; i < pipe_length; i++) { request_array[i] = smpi_mpi_irecv((char *) buf + (i * increment), segment, datatype, from_inter, (tag + i), comm); } for (i = 0; i < pipe_length; i++) { smpi_mpi_wait(&request_array[i], &status); if ((to_intra_left - base) < num_core) smpi_mpi_send((char *) buf + (i * increment), segment, datatype, to_intra_left, (tag + i), comm); if ((to_intra_right - base) < num_core) smpi_mpi_send((char *) buf + (i * increment), segment, datatype, to_intra_right, (tag + i), comm); } } // case INTERMEDIAT ROOT-of-each-SMP else { //printf("node %d left %d right %d from %d\n",rank,to_inter_left,to_inter_right,from_inter); for (i = 0; i < pipe_length; i++) { request_array[i] = smpi_mpi_irecv((char *) buf + (i * increment), segment, datatype, from_inter, (tag + i), comm); } for (i = 0; i < pipe_length; i++) { smpi_mpi_wait(&request_array[i], &status); smpi_mpi_send((char *) buf + (i * increment), segment, datatype, to_inter_left, (tag + i), comm); if (to_inter_right < size) smpi_mpi_send((char *) buf + (i * increment), segment, datatype, to_inter_right, (tag + i), comm); if ((to_intra_left - base) < num_core) smpi_mpi_send((char *) buf + (i * increment), segment, datatype, to_intra_left, (tag + i), comm); if ((to_intra_right - base) < num_core) smpi_mpi_send((char *) buf + (i * increment), segment, datatype, to_intra_right, (tag + i), comm); } } } // case non-ROOT-of-each-SMP else { // case leaves if ((to_intra_left - base) >= num_core) { for (i = 0; i < pipe_length; i++) { request_array[i] = smpi_mpi_irecv((char *) buf + (i * increment), segment, datatype, from_intra, (tag + i), comm); } smpi_mpi_waitall((pipe_length), request_array, status_array); } // case intermediate else { for (i = 0; i < pipe_length; i++) { request_array[i] = smpi_mpi_irecv((char *) buf + (i * increment), segment, datatype, from_intra, (tag + i), comm); } for (i = 0; i < pipe_length; i++) { smpi_mpi_wait(&request_array[i], &status); smpi_mpi_send((char *) buf + (i * increment), segment, datatype, to_intra_left, (tag + i), comm); if ((to_intra_right - base) < 
num_core) smpi_mpi_send((char *) buf + (i * increment), segment, datatype, to_intra_right, (tag + i), comm); } } } free(request_array); free(status_array); } // when count is not divisible by block size, use default BCAST for the remainder if ((remainder != 0) && (count > segment)) { XBT_WARN("MPI_bcast_SMP_binary use default MPI_bcast."); smpi_mpi_bcast((char *) buf + (pipe_length * increment), remainder, datatype, root, comm); } return MPI_SUCCESS; }
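/*
 * Illustrative sketch (hypothetical helper): the SMP-binary bcast above
 * builds two binary trees at once, one across the SMP leaders (ranks that
 * are multiples of host_num_core) and one inside each SMP node.  These are
 * the same neighbor formulas as in the function, gathered in one place.
 */
static void example_smp_binary_neighbors(int rank, int host_num_core,
                                         int *to_inter_left, int *to_inter_right,
                                         int *to_intra_left, int *to_intra_right)
{
  int smp = rank / host_num_core;     /* index of this rank's SMP node */
  int local = rank % host_num_core;   /* position inside the SMP node */
  *to_inter_left = (smp * 2 + 1) * host_num_core;
  *to_inter_right = (smp * 2 + 2) * host_num_core;
  *to_intra_left = smp * host_num_core + local * 2 + 1;
  *to_intra_right = smp * host_num_core + local * 2 + 2;
}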
int smpi_coll_tuned_reduce_scatter_gather(void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, MPI_Op op, int root, MPI_Comm comm) { MPI_Status status; int comm_size, rank, pof2, rem, newrank; int mask, *cnts, *disps, i, j, send_idx = 0; int recv_idx, last_idx = 0, newdst; int dst, send_cnt, recv_cnt, newroot, newdst_tree_root; int newroot_tree_root, new_count; int tag = COLL_TAG_REDUCE; void *send_ptr, *recv_ptr, *tmp_buf; cnts = NULL; disps = NULL; MPI_Aint extent; if (count == 0) return 0; rank = smpi_comm_rank(comm); comm_size = smpi_comm_size(comm); extent = smpi_datatype_get_extent(datatype); /* find nearest power-of-two less than or equal to comm_size */ pof2 = 1; while (pof2 <= comm_size) pof2 <<= 1; pof2 >>= 1; if (count < comm_size) { new_count = comm_size; send_ptr = (void *) xbt_malloc(new_count * extent); recv_ptr = (void *) xbt_malloc(new_count * extent); tmp_buf = (void *) xbt_malloc(new_count * extent); memcpy(send_ptr, sendbuf, extent * count); //if ((rank != root)) smpi_mpi_sendrecv(send_ptr, new_count, datatype, rank, tag, recv_ptr, new_count, datatype, rank, tag, comm, &status); rem = comm_size - pof2; if (rank < 2 * rem) { if (rank % 2 != 0) { /* odd */ smpi_mpi_send(recv_ptr, new_count, datatype, rank - 1, tag, comm); newrank = -1; } else { smpi_mpi_recv(tmp_buf, count, datatype, rank + 1, tag, comm, &status); smpi_op_apply(op, tmp_buf, recv_ptr, &new_count, &datatype); newrank = rank / 2; } } else /* rank >= 2*rem */ newrank = rank - rem; cnts = (int *) xbt_malloc(pof2 * sizeof(int)); disps = (int *) xbt_malloc(pof2 * sizeof(int)); if (newrank != -1) { for (i = 0; i < (pof2 - 1); i++) cnts[i] = new_count / pof2; cnts[pof2 - 1] = new_count - (new_count / pof2) * (pof2 - 1); disps[0] = 0; for (i = 1; i < pof2; i++) disps[i] = disps[i - 1] + cnts[i - 1]; mask = 0x1; send_idx = recv_idx = 0; last_idx = pof2; while (mask < pof2) { newdst = newrank ^ mask; /* find real rank of dest */ dst = (newdst < rem) ? newdst * 2 : newdst + rem; send_cnt = recv_cnt = 0; if (newrank < newdst) { send_idx = recv_idx + pof2 / (mask * 2); for (i = send_idx; i < last_idx; i++) send_cnt += cnts[i]; for (i = recv_idx; i < send_idx; i++) recv_cnt += cnts[i]; } else { recv_idx = send_idx + pof2 / (mask * 2); for (i = send_idx; i < recv_idx; i++) send_cnt += cnts[i]; for (i = recv_idx; i < last_idx; i++) recv_cnt += cnts[i]; } /* Send data from recvbuf. Recv into tmp_buf */ smpi_mpi_sendrecv((char *) recv_ptr + disps[send_idx] * extent, send_cnt, datatype, dst, tag, (char *) tmp_buf + disps[recv_idx] * extent, recv_cnt, datatype, dst, tag, comm, &status); /* tmp_buf contains data received in this step. 
recvbuf contains data accumulated so far */ smpi_op_apply(op, (char *) tmp_buf + disps[recv_idx] * extent, (char *) recv_ptr + disps[recv_idx] * extent, &recv_cnt, &datatype); /* update send_idx for next iteration */ send_idx = recv_idx; mask <<= 1; if (mask < pof2) last_idx = recv_idx + pof2 / mask; } } /* now do the gather to root */ if (root < 2 * rem) { if (root % 2 != 0) { if (rank == root) { /* recv */ for (i = 0; i < (pof2 - 1); i++) cnts[i] = new_count / pof2; cnts[pof2 - 1] = new_count - (new_count / pof2) * (pof2 - 1); disps[0] = 0; for (i = 1; i < pof2; i++) disps[i] = disps[i - 1] + cnts[i - 1]; smpi_mpi_recv(recv_ptr, cnts[0], datatype, 0, tag, comm, &status); newrank = 0; send_idx = 0; last_idx = 2; } else if (newrank == 0) { smpi_mpi_send(recv_ptr, cnts[0], datatype, root, tag, comm); newrank = -1; } newroot = 0; } else newroot = root / 2; } else newroot = root - rem; if (newrank != -1) { j = 0; mask = 0x1; while (mask < pof2) { mask <<= 1; j++; } mask >>= 1; j--; while (mask > 0) { newdst = newrank ^ mask; /* find real rank of dest */ dst = (newdst < rem) ? newdst * 2 : newdst + rem; if ((newdst == 0) && (root < 2 * rem) && (root % 2 != 0)) dst = root; newdst_tree_root = newdst >> j; newdst_tree_root <<= j; newroot_tree_root = newroot >> j; newroot_tree_root <<= j; send_cnt = recv_cnt = 0; if (newrank < newdst) { /* update last_idx except on first iteration */ if (mask != pof2 / 2) last_idx = last_idx + pof2 / (mask * 2); recv_idx = send_idx + pof2 / (mask * 2); for (i = send_idx; i < recv_idx; i++) send_cnt += cnts[i]; for (i = recv_idx; i < last_idx; i++) recv_cnt += cnts[i]; } else { recv_idx = send_idx - pof2 / (mask * 2); for (i = send_idx; i < last_idx; i++) send_cnt += cnts[i]; for (i = recv_idx; i < send_idx; i++) recv_cnt += cnts[i]; } if (newdst_tree_root == newroot_tree_root) { smpi_mpi_send((char *) recv_ptr + disps[send_idx] * extent, send_cnt, datatype, dst, tag, comm); break; } else { smpi_mpi_recv((char *) recv_ptr + disps[recv_idx] * extent, recv_cnt, datatype, dst, tag, comm, &status); } if (newrank > newdst) send_idx = recv_idx; mask >>= 1; j--; } }
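/*
 * Illustrative sketch (hypothetical helper): the reduce-scatter phase above
 * splits new_count elements into pof2 blocks; every block gets
 * new_count / pof2 elements and the last block also absorbs the remainder.
 * disps[] holds the starting offset of each block.
 */
static void example_rsag_blocks(int new_count, int pof2, int *cnts, int *disps)
{
  int i;
  for (i = 0; i < (pof2 - 1); i++)
    cnts[i] = new_count / pof2;
  cnts[pof2 - 1] = new_count - (new_count / pof2) * (pof2 - 1);
  disps[0] = 0;
  for (i = 1; i < pof2; i++)
    disps[i] = disps[i - 1] + cnts[i - 1];
}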
int smpi_coll_tuned_reduce_binomial(void *sendbuf, void *recvbuf, int count,
                                    MPI_Datatype datatype, MPI_Op op, int root,
                                    MPI_Comm comm)
{
  MPI_Status status;
  int comm_size, rank;
  int mask, relrank, source;
  int dst;
  int tag = COLL_TAG_REDUCE;
  MPI_Aint extent;
  void *tmp_buf;
  MPI_Aint true_lb, true_extent;

  if (count == 0)
    return 0;
  rank = smpi_comm_rank(comm);
  comm_size = smpi_comm_size(comm);

  extent = smpi_datatype_get_extent(datatype);

  tmp_buf = (void *) smpi_get_tmp_sendbuffer(count * extent);
  int is_commutative = smpi_op_is_commute(op);
  mask = 1;

  int lroot;
  if (is_commutative)
    lroot = root;
  else
    lroot = 0;
  relrank = (rank - lroot + comm_size) % comm_size;

  smpi_datatype_extent(datatype, &true_lb, &true_extent);

  /* adjust for potential negative lower bound in datatype */
  tmp_buf = (void *)((char*)tmp_buf - true_lb);

  /* If I'm not the root, then my recvbuf may not be valid, therefore
     I have to allocate a temporary one */
  if (rank != root) {
    recvbuf = (void *) smpi_get_tmp_recvbuffer(count * (max(extent, true_extent)));
    recvbuf = (void *)((char*)recvbuf - true_lb);
  }

  if ((rank != root) || (sendbuf != MPI_IN_PLACE)) {
    smpi_datatype_copy(sendbuf, count, datatype, recvbuf, count, datatype);
  }

  while (mask < comm_size) {
    /* Receive */
    if ((mask & relrank) == 0) {
      source = (relrank | mask);
      if (source < comm_size) {
        source = (source + lroot) % comm_size;
        smpi_mpi_recv(tmp_buf, count, datatype, source, tag, comm, &status);

        if (is_commutative) {
          smpi_op_apply(op, tmp_buf, recvbuf, &count, &datatype);
        } else {
          smpi_op_apply(op, recvbuf, tmp_buf, &count, &datatype);
          smpi_datatype_copy(tmp_buf, count, datatype, recvbuf, count, datatype);
        }
      }
    } else {
      dst = ((relrank & (~mask)) + lroot) % comm_size;
      smpi_mpi_send(recvbuf, count, datatype, dst, tag, comm);
      break;
    }
    mask <<= 1;
  }

  if (!is_commutative && (root != 0)) {
    if (rank == 0) {
      smpi_mpi_send(recvbuf, count, datatype, root, tag, comm);
    } else if (rank == root) {
      smpi_mpi_recv(recvbuf, count, datatype, 0, tag, comm, &status);
    }
  }

  if (rank != root) {
    smpi_free_tmp_buffer(recvbuf);
  }
  smpi_free_tmp_buffer(tmp_buf);

  return 0;
}
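/*
 * Illustrative sketch (hypothetical helper): in the binomial reduce above a
 * process keeps receiving from (relrank | mask) while the mask bit of its
 * relative rank is 0; as soon as that bit is 1 it sends its partial result
 * to (relrank & ~mask) and is done.  For 8 ranks rooted at 0, relative
 * rank 5 (0b101) sends to relative rank 4 at mask 1.
 */
static int example_binomial_reduce_parent(int relrank, int comm_size)
{
  int mask = 1;
  while (mask < comm_size) {
    if (relrank & mask)
      return relrank & (~mask);   /* destination of the single send */
    mask <<= 1;
  }
  return -1;                      /* the root never sends */
}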
/* Non-topology-specific pipelined linear-bcast function */ int smpi_coll_tuned_bcast_arrival_pattern_aware_wait(void *buf, int count, MPI_Datatype datatype, int root, MPI_Comm comm) { MPI_Status status; MPI_Request request; MPI_Request *send_request_array; MPI_Request *recv_request_array; MPI_Status *send_status_array; MPI_Status *recv_status_array; MPI_Status temp_status_array[BCAST_ARRIVAL_PATTERN_AWARE_MAX_NODE]; int rank, size; int i, j, k; int tag = -COLL_TAG_BCAST; int will_send[BCAST_ARRIVAL_PATTERN_AWARE_MAX_NODE]; int sent_count; int header_index; int flag_array[BCAST_ARRIVAL_PATTERN_AWARE_MAX_NODE]; int already_sent[BCAST_ARRIVAL_PATTERN_AWARE_MAX_NODE]; int header_buf[BCAST_ARRIVAL_PATTERN_AWARE_HEADER_SIZE]; char temp_buf[BCAST_ARRIVAL_PATTERN_AWARE_MAX_NODE]; int max_node = BCAST_ARRIVAL_PATTERN_AWARE_MAX_NODE; int header_size = BCAST_ARRIVAL_PATTERN_AWARE_HEADER_SIZE; MPI_Aint extent; extent = smpi_datatype_get_extent(datatype); /* source and destination */ int to, from; rank = smpi_comm_rank(MPI_COMM_WORLD); size = smpi_comm_size(MPI_COMM_WORLD); /* segment is segment size in number of elements (not bytes) */ int segment = bcast_arrival_pattern_aware_wait_segment_size_in_byte / extent; /* pipeline length */ int pipe_length = count / segment; /* use for buffer offset for sending and receiving data = segment size in byte */ int increment = segment * extent; /* if the input size is not divisible by segment size => the small remainder will be done with native implementation */ int remainder = count % segment; /* if root is not zero send to rank zero first this can be modified to make it faster by using logical src, dst. */ if (root != 0) { if (rank == root) { smpi_mpi_send(buf, count, datatype, 0, tag, comm); } else if (rank == 0) { smpi_mpi_recv(buf, count, datatype, root, tag, comm, &status); } } /* value == 0 means root has not send data (or header) to the node yet */ for (i = 0; i < max_node; i++) { already_sent[i] = 0; } /* when a message is smaller than a block size => no pipeline */ if (count <= segment) { segment = count; pipe_length = 1; } /* start pipeline bcast */ send_request_array = (MPI_Request *) xbt_malloc((size + pipe_length) * sizeof(MPI_Request)); recv_request_array = (MPI_Request *) xbt_malloc((size + pipe_length) * sizeof(MPI_Request)); send_status_array = (MPI_Status *) xbt_malloc((size + pipe_length) * sizeof(MPI_Status)); recv_status_array = (MPI_Status *) xbt_malloc((size + pipe_length) * sizeof(MPI_Status)); /* root */ if (rank == 0) { sent_count = 0; int iteration = 0; for (i = 0; i < BCAST_ARRIVAL_PATTERN_AWARE_MAX_NODE; i++) will_send[i] = 0; while (sent_count < (size - 1)) { iteration++; /* loop k times to let more processes arrive before start sending data */ for (k = 0; k < 3; k++) { for (i = 1; i < size; i++) { if ((already_sent[i] == 0) && (will_send[i] == 0)) { smpi_mpi_iprobe(i, MPI_ANY_TAG, MPI_COMM_WORLD, &flag_array[i], &temp_status_array[i]); if (flag_array[i] == 1) { will_send[i] = 1; smpi_mpi_recv(&temp_buf[i], 1, MPI_CHAR, i, tag, MPI_COMM_WORLD, &status); i = 0; } } } } header_index = 0; /* recv 1-byte message */ for (i = 1; i < size; i++) { /* message arrive */ if ((will_send[i] == 1) && (already_sent[i] == 0)) { header_buf[header_index] = i; header_index++; sent_count++; /* will send in the next step */ already_sent[i] = 1; } } /* send header followed by data */ if (header_index != 0) { header_buf[header_index] = -1; to = header_buf[0]; /* send header */ smpi_mpi_send(header_buf, header_size, MPI_INT, to, tag, comm); /* send data - 
pipeline */ for (i = 0; i < pipe_length; i++) { send_request_array[i] = smpi_mpi_isend((char *)buf + (i * increment), segment, datatype, to, tag, comm); } smpi_mpi_waitall((pipe_length), send_request_array, send_status_array); } /* end - send header followed by data */ /* randomly MPI_Send to one node */ /* this part has been commented out - performance-wise */ else if (2 == 3) { /* search for the first node that never received data before */ for (i = 0; i < size; i++) { if (i == root) continue; if (already_sent[i] == 0) { header_buf[0] = i; header_buf[1] = -1; to = i; smpi_mpi_send(header_buf, header_size, MPI_INT, to, tag, comm); /* still need to chop data so that we can use the same non-root code */ for (j = 0; j < pipe_length; j++) { smpi_mpi_send((char *)buf + (j * increment), segment, datatype, to, tag, comm); } } } } } /* end - while (send_count < size-1) loop */ } /* end - root */ /* none root */ else { /* send 1-byte message to root */ smpi_mpi_send(temp_buf, 1, MPI_CHAR, 0, tag, comm); /* wait for header forward when required */ request = smpi_mpi_irecv(header_buf, header_size, MPI_INT, MPI_ANY_SOURCE, tag, comm); smpi_mpi_wait(&request, MPI_STATUS_IGNORE); /* search for where it is */ int myordering = 0; while (rank != header_buf[myordering]) { myordering++; } to = header_buf[myordering + 1]; if (myordering == 0) { from = 0; } else { from = header_buf[myordering - 1]; } /* send header when required */ if (to != -1) { smpi_mpi_send(header_buf, header_size, MPI_INT, to, tag, comm); } /* receive data */ for (i = 0; i < pipe_length; i++) { recv_request_array[i] = smpi_mpi_irecv((char *)buf + (i * increment), segment, datatype, from, tag, comm); } /* forward data */ if (to != -1) { for (i = 0; i < pipe_length; i++) { smpi_mpi_wait(&recv_request_array[i], MPI_STATUS_IGNORE); send_request_array[i] = smpi_mpi_isend((char *)buf + (i * increment), segment, datatype, to, tag, comm); } smpi_mpi_waitall((pipe_length), send_request_array, send_status_array); } /* recv only */ else { smpi_mpi_waitall((pipe_length), recv_request_array, recv_status_array); } } free(send_request_array); free(recv_request_array); free(send_status_array); free(recv_status_array); /* end pipeline */ /* when count is not divisible by block size, use default BCAST for the remainder */ if ((remainder != 0) && (count > segment)) { XBT_WARN("MPI_bcast_arrival_pattern_aware_wait use default MPI_bcast."); smpi_mpi_bcast((char *)buf + (pipe_length * increment), remainder, datatype, root, comm); } return MPI_SUCCESS; }
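/*
 * Illustrative sketch (hypothetical helper): the arrival-pattern-aware
 * algorithms above circulate a header_buf[] that lists, in arrival order,
 * the ranks to serve, terminated by -1.  Each non-root rank finds its own
 * position in that list and forwards the header to the next entry, if any.
 */
static int example_header_next(const int *header_buf, int rank)
{
  int myordering = 0;
  while (header_buf[myordering] != rank)
    myordering++;
  return header_buf[myordering + 1];  /* -1 means "last in the chain" */
}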
/* * reduce_scatter_ompi_basic_recursivehalving * * Function: - reduce scatter implementation using recursive-halving * algorithm * Accepts: - same as MPI_Reduce_scatter() * Returns: - MPI_SUCCESS or error code * Limitation: - Works only for commutative operations. */ int smpi_coll_tuned_reduce_scatter_ompi_basic_recursivehalving(void *sbuf, void *rbuf, int *rcounts, MPI_Datatype dtype, MPI_Op op, MPI_Comm comm ) { int i, rank, size, count, err = MPI_SUCCESS; int tmp_size=1, remain = 0, tmp_rank, *disps = NULL; ptrdiff_t true_lb, true_extent, lb, extent, buf_size; char *recv_buf = NULL, *recv_buf_free = NULL; char *result_buf = NULL, *result_buf_free = NULL; /* Initialize */ rank = smpi_comm_rank(comm); size = smpi_comm_size(comm); XBT_DEBUG("coll:tuned:reduce_scatter_ompi_basic_recursivehalving, rank %d", rank); /* Find displacements and the like */ disps = (int*) xbt_malloc(sizeof(int) * size); if (NULL == disps) return MPI_ERR_OTHER; disps[0] = 0; for (i = 0; i < (size - 1); ++i) { disps[i + 1] = disps[i] + rcounts[i]; } count = disps[size - 1] + rcounts[size - 1]; /* short cut the trivial case */ if (0 == count) { xbt_free(disps); return MPI_SUCCESS; } /* get datatype information */ smpi_datatype_extent(dtype, &lb, &extent); smpi_datatype_extent(dtype, &true_lb, &true_extent); buf_size = true_extent + (ptrdiff_t)(count - 1) * extent; /* Handle MPI_IN_PLACE */ if (MPI_IN_PLACE == sbuf) { sbuf = rbuf; } /* Allocate temporary receive buffer. */ recv_buf_free = (char*) xbt_malloc(buf_size); recv_buf = recv_buf_free - lb; if (NULL == recv_buf_free) { err = MPI_ERR_OTHER; goto cleanup; } /* allocate temporary buffer for results */ result_buf_free = (char*) xbt_malloc(buf_size); result_buf = result_buf_free - lb; /* copy local buffer into the temporary results */ err =smpi_datatype_copy(sbuf, count, dtype, result_buf, count, dtype); if (MPI_SUCCESS != err) goto cleanup; /* figure out power of two mapping: grow until larger than comm size, then go back one, to get the largest power of two less than comm size */ while (tmp_size <= size) tmp_size <<= 1; tmp_size >>= 1; remain = size - tmp_size; /* If comm size is not a power of two, have the first "remain" procs with an even rank send to rank + 1, leaving a power of two procs to do the rest of the algorithm */ if (rank < 2 * remain) { if ((rank & 1) == 0) { smpi_mpi_send(result_buf, count, dtype, rank + 1, COLL_TAG_REDUCE_SCATTER, comm); /* we don't participate from here on out */ tmp_rank = -1; } else { smpi_mpi_recv(recv_buf, count, dtype, rank - 1, COLL_TAG_REDUCE_SCATTER, comm, MPI_STATUS_IGNORE); /* integrate their results into our temp results */ smpi_op_apply(op, recv_buf, result_buf, &count, &dtype); /* adjust rank to be the bottom "remain" ranks */ tmp_rank = rank / 2; } } else { /* just need to adjust rank to show that the bottom "even remain" ranks dropped out */ tmp_rank = rank - remain; } /* For ranks not kicked out by the above code, perform the recursive halving */ if (tmp_rank >= 0) { int *tmp_disps = NULL, *tmp_rcounts = NULL; int mask, send_index, recv_index, last_index; /* recalculate disps and rcounts to account for the special "remainder" processes that are no longer doing anything */ tmp_rcounts = (int*) xbt_malloc(tmp_size * sizeof(int)); if (NULL == tmp_rcounts) { err = MPI_ERR_OTHER; goto cleanup; } tmp_disps = (int*) xbt_malloc(tmp_size * sizeof(int)); if (NULL == tmp_disps) { xbt_free(tmp_rcounts); err = MPI_ERR_OTHER; goto cleanup; } for (i = 0 ; i < tmp_size ; ++i) { if (i < remain) { /* need to include old neighbor 
as well */ tmp_rcounts[i] = rcounts[i * 2 + 1] + rcounts[i * 2]; } else { tmp_rcounts[i] = rcounts[i + remain]; } } tmp_disps[0] = 0; for (i = 0; i < tmp_size - 1; ++i) { tmp_disps[i + 1] = tmp_disps[i] + tmp_rcounts[i]; } /* do the recursive halving communication. Don't use the dimension information on the communicator because I think the information is invalidated by our "shrinking" of the communicator */ mask = tmp_size >> 1; send_index = recv_index = 0; last_index = tmp_size; while (mask > 0) { int tmp_peer, peer, send_count, recv_count; MPI_Request request; tmp_peer = tmp_rank ^ mask; peer = (tmp_peer < remain) ? tmp_peer * 2 + 1 : tmp_peer + remain; /* figure out if we're sending, receiving, or both */ send_count = recv_count = 0; if (tmp_rank < tmp_peer) { send_index = recv_index + mask; for (i = send_index ; i < last_index ; ++i) { send_count += tmp_rcounts[i]; } for (i = recv_index ; i < send_index ; ++i) { recv_count += tmp_rcounts[i]; } } else { recv_index = send_index + mask; for (i = send_index ; i < recv_index ; ++i) { send_count += tmp_rcounts[i]; } for (i = recv_index ; i < last_index ; ++i) { recv_count += tmp_rcounts[i]; } } /* actual data transfer. Send from result_buf, receive into recv_buf */ if (send_count > 0 && recv_count != 0) { request=smpi_mpi_irecv(recv_buf + (ptrdiff_t)tmp_disps[recv_index] * extent, recv_count, dtype, peer, COLL_TAG_REDUCE_SCATTER, comm); if (MPI_SUCCESS != err) { xbt_free(tmp_rcounts); xbt_free(tmp_disps); goto cleanup; } } if (recv_count > 0 && send_count != 0) { smpi_mpi_send(result_buf + (ptrdiff_t)tmp_disps[send_index] * extent, send_count, dtype, peer, COLL_TAG_REDUCE_SCATTER, comm); if (MPI_SUCCESS != err) { xbt_free(tmp_rcounts); xbt_free(tmp_disps); goto cleanup; } } if (send_count > 0 && recv_count != 0) { smpi_mpi_wait(&request, MPI_STATUS_IGNORE); } /* if we received something on this step, push it into the results buffer */ if (recv_count > 0) { smpi_op_apply(op, recv_buf + (ptrdiff_t)tmp_disps[recv_index] * extent, result_buf + (ptrdiff_t)tmp_disps[recv_index] * extent, &recv_count, &dtype); } /* update for next iteration */ send_index = recv_index; last_index = recv_index + mask; mask >>= 1; } /* copy local results from results buffer into real receive buffer */ if (0 != rcounts[rank]) { err = smpi_datatype_copy(result_buf + disps[rank] * extent, rcounts[rank], dtype, rbuf, rcounts[rank], dtype); if (MPI_SUCCESS != err) { xbt_free(tmp_rcounts); xbt_free(tmp_disps); goto cleanup; } } xbt_free(tmp_rcounts); xbt_free(tmp_disps); }
    /* Reconstructed ending -- the dump is truncated here.  Hand the scattered
     * result back to the even ranks that were folded out in the
     * non-power-of-two step (mirroring the fold-in above), then release the
     * temporaries targeted by the goto cleanup statements. */
    if (rank < (2 * remain)) {
        if ((rank & 1) == 0) {
            /* folded-out even rank: its block was computed by rank + 1 */
            if (rcounts[rank]) {
                smpi_mpi_recv(rbuf, rcounts[rank], dtype, rank + 1,
                              COLL_TAG_REDUCE_SCATTER, comm, MPI_STATUS_IGNORE);
            }
        } else {
            /* odd rank: send the even neighbour its block of the result */
            if (rcounts[rank - 1]) {
                smpi_mpi_send(result_buf + (ptrdiff_t)disps[rank - 1] * extent,
                              rcounts[rank - 1], dtype, rank - 1,
                              COLL_TAG_REDUCE_SCATTER, comm);
            }
        }
    }

 cleanup:
    if (NULL != disps) xbt_free(disps);
    if (NULL != recv_buf_free) xbt_free(recv_buf_free);
    if (NULL != result_buf_free) xbt_free(result_buf_free);
    return err;
}
int smpi_coll_tuned_scatter_ompi_binomial(void *sbuf, int scount, MPI_Datatype sdtype, void *rbuf, int rcount, MPI_Datatype rdtype, int root, MPI_Comm comm ) { int line = -1; int i; int rank; int vrank; int size; int total_send = 0; char *ptmp = NULL; char *tempbuf = NULL; int err; ompi_coll_tree_t* bmtree; MPI_Status status; MPI_Aint sextent, slb, strue_lb, strue_extent; MPI_Aint rextent, rlb, rtrue_lb, rtrue_extent; size = smpi_comm_size(comm); rank = smpi_comm_rank(comm); XBT_DEBUG( "smpi_coll_tuned_scatter_ompi_binomial rank %d", rank); /* create the binomial tree */ // COLL_TUNED_UPDATE_IN_ORDER_BMTREE( comm, tuned_module, root ); bmtree = ompi_coll_tuned_topo_build_in_order_bmtree( comm, root);//ompi_ data->cached_in_order_bmtree; smpi_datatype_extent(sdtype, &slb, &sextent); smpi_datatype_extent(sdtype, &strue_lb, &strue_extent); smpi_datatype_extent(rdtype, &rlb, &rextent); smpi_datatype_extent(rdtype, &rtrue_lb, &rtrue_extent); vrank = (rank - root + size) % size; if (rank == root) { if (0 == root) { /* root on 0, just use the send buffer */ ptmp = (char *) sbuf; if (rbuf != MPI_IN_PLACE) { /* local copy to rbuf */ err = smpi_datatype_copy(sbuf, scount, sdtype, rbuf, rcount, rdtype); if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } } } else { /* root is not on 0, allocate temp buffer for send */ tempbuf = (char *) malloc(strue_extent + (scount*size - 1) * sextent); if (NULL == tempbuf) { err = MPI_ERR_OTHER; line = __LINE__; goto err_hndl; } ptmp = tempbuf - slb; /* and rotate data so they will eventually in the right place */ err = smpi_datatype_copy((char *) sbuf + sextent*root*scount, scount*(size-root), sdtype, ptmp, scount*(size-root), sdtype); if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } err = smpi_datatype_copy((char*)sbuf, scount*root, sdtype, ptmp + sextent*scount*(size - root), scount*root, sdtype); if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } if (rbuf != MPI_IN_PLACE) { /* local copy to rbuf */ err = smpi_datatype_copy(ptmp, scount, sdtype, rbuf, rcount, rdtype); if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } } } total_send = scount; } else if (!(vrank % 2)) { /* non-root, non-leaf nodes, allocte temp buffer for recv * the most we need is rcount*size/2 */ tempbuf = (char *) malloc(rtrue_extent + (rcount*size - 1) * rextent); if (NULL == tempbuf) { err= MPI_ERR_OTHER; line = __LINE__; goto err_hndl; } ptmp = tempbuf - rlb; sdtype = rdtype; scount = rcount; sextent = rextent; total_send = scount; } else { /* leaf nodes, just use rbuf */ ptmp = (char *) rbuf; } if (!(vrank % 2)) { if (rank != root) { /* recv from parent on non-root */ smpi_mpi_recv(ptmp, rcount*size, rdtype, bmtree->tree_prev, COLL_TAG_SCATTER, comm, &status); /* local copy to rbuf */ err = smpi_datatype_copy(ptmp, scount, sdtype, rbuf, rcount, rdtype); } /* send to children on all non-leaf */ for (i = 0; i < bmtree->tree_nextsize; i++) { int mycount = 0, vkid; /* figure out how much data I have to send to this child */ vkid = (bmtree->tree_next[i] - root + size) % size; mycount = vkid - vrank; if (mycount > (size - vkid)) mycount = size - vkid; mycount *= scount; smpi_mpi_send(ptmp + total_send*sextent, mycount, sdtype, bmtree->tree_next[i], COLL_TAG_SCATTER, comm); total_send += mycount; } if (NULL != tempbuf) free(tempbuf); } else { /* recv from parent on leaf nodes */ smpi_mpi_recv(ptmp, rcount, rdtype, bmtree->tree_prev, COLL_TAG_SCATTER, comm, &status); } //!FIXME : store the tree, as done in ompi, instead of calculating it each time ? 
xbt_free(bmtree); return MPI_SUCCESS; err_hndl: if (NULL != tempbuf) free(tempbuf); XBT_DEBUG( "%s:%4d\tError occurred %d, rank %2d", __FILE__, line, err, rank); return err; }
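/*
 * Illustrative sketch, not part of the original source: how the binomial
 * scatter above sizes the block forwarded to one child.  vrank and vkid are
 * the parent's and child's ranks rotated so that the root becomes 0; a child
 * serves every rank from vkid up to either the parent's next subtree or the
 * end of the communicator.  The helper name is hypothetical.
 */
static int illustrate_binomial_child_count(int vrank, int vkid, int size,
                                           int scount)
{
  int mycount = vkid - vrank;       /* ranks in the child's subtree ...   */
  if (mycount > (size - vkid))
    mycount = size - vkid;          /* ... clipped at the end of the comm */
  return mycount * scount;          /* elements to send to this child     */
}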
int smpi_coll_tuned_reduce_ompi_basic_linear(void *sbuf, void *rbuf, int count, MPI_Datatype dtype, MPI_Op op, int root, MPI_Comm comm) { int i, rank, size; ptrdiff_t true_extent, lb, extent; char *free_buffer = NULL; char *pml_buffer = NULL; char *inplace_temp = NULL; char *inbuf; /* Initialize */ rank = smpi_comm_rank(comm); size = smpi_comm_size(comm); XBT_DEBUG("coll:tuned:reduce_intra_basic_linear rank %d", rank); /* If not root, send data to the root. */ if (rank != root) { smpi_mpi_send(sbuf, count, dtype, root, COLL_TAG_REDUCE, comm); return -1; } /* see discussion in ompi_coll_basic_reduce_lin_intra about extent and true extent */ /* for reducing buffer allocation lengths.... */ smpi_datatype_extent(dtype, &lb, &extent); true_extent = smpi_datatype_get_extent(dtype); if (MPI_IN_PLACE == sbuf) { sbuf = rbuf; inplace_temp = (char*)malloc(true_extent + (count - 1) * extent); if (NULL == inplace_temp) { return -1; } rbuf = inplace_temp - lb; } if (size > 1) { free_buffer = (char*)malloc(true_extent + (count - 1) * extent); pml_buffer = free_buffer - lb; } /* Initialize the receive buffer. */ if (rank == (size - 1)) { smpi_datatype_copy((char*)sbuf, count, dtype,(char*)rbuf, count, dtype); } else { smpi_mpi_recv(rbuf, count, dtype, size - 1, COLL_TAG_REDUCE, comm, MPI_STATUS_IGNORE); } /* Loop receiving and calling reduction function (C or Fortran). */ for (i = size - 2; i >= 0; --i) { if (rank == i) { inbuf = (char*)sbuf; } else { smpi_mpi_recv(pml_buffer, count, dtype, i, COLL_TAG_REDUCE, comm, MPI_STATUS_IGNORE); inbuf = pml_buffer; } /* Perform the reduction */ smpi_op_apply(op, inbuf, rbuf, &count, &dtype); } if (NULL != inplace_temp) { smpi_datatype_copy(inplace_temp, count, dtype,(char*)sbuf ,count , dtype); free(inplace_temp); } if (NULL != free_buffer) { free(free_buffer); } /* All done */ return MPI_SUCCESS; }
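/*
 * Illustrative sketch, not part of the original source: the scratch buffers
 * in the linear reduce above are sized as true_extent + (count - 1) * extent,
 * i.e. the footprint of one element plus count - 1 strides, which also covers
 * datatypes whose true extent differs from their extent.  The helper name is
 * hypothetical.
 */
static MPI_Aint illustrate_reduce_scratch_size(MPI_Aint true_extent,
                                               MPI_Aint extent, int count)
{
  return true_extent + ((MPI_Aint)count - 1) * extent;
}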
/* * reduce_intra_in_order_binary * * Function: Logarithmic reduce operation for non-commutative operations. * Acecpts: same as MPI_Reduce() * Returns: MPI_SUCCESS or error code */ int smpi_coll_tuned_reduce_ompi_in_order_binary( void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, MPI_Op op, int root, MPI_Comm comm) { uint32_t segsize=0; int ret; int rank, size, io_root; int segcount = count; void *use_this_sendbuf = NULL, *use_this_recvbuf = NULL; size_t typelng; rank = smpi_comm_rank(comm); size = smpi_comm_size(comm); XBT_DEBUG("coll:tuned:reduce_intra_in_order_binary rank %d ss %5d", rank, segsize); /** * Determine number of segments and number of elements * sent per operation */ typelng=smpi_datatype_size( datatype); COLL_TUNED_COMPUTED_SEGCOUNT( segsize, typelng, segcount ); /* An in-order binary tree must use root (size-1) to preserve the order of operations. Thus, if root is not rank (size - 1), then we must handle 1. MPI_IN_PLACE option on real root, and 2. we must allocate temporary recvbuf on rank (size - 1). Note that generic function must be careful not to switch order of operations for non-commutative ops. */ io_root = size - 1; use_this_sendbuf = sendbuf; use_this_recvbuf = recvbuf; if (io_root != root) { ptrdiff_t text, ext; char *tmpbuf = NULL; ext=smpi_datatype_get_extent(datatype); text=smpi_datatype_get_extent(datatype); if ((root == rank) && (MPI_IN_PLACE == sendbuf)) { tmpbuf = (char *) malloc(text + (count - 1) * ext); if (NULL == tmpbuf) { return MPI_ERR_INTERN; } smpi_datatype_copy ( (char*)recvbuf, count, datatype, (char*)tmpbuf, count, datatype); use_this_sendbuf = tmpbuf; } else if (io_root == rank) { tmpbuf = (char *) malloc(text + (count - 1) * ext); if (NULL == tmpbuf) { return MPI_ERR_INTERN; } use_this_recvbuf = tmpbuf; } } /* Use generic reduce with in-order binary tree topology and io_root */ ret = smpi_coll_tuned_ompi_reduce_generic( use_this_sendbuf, use_this_recvbuf, count, datatype, op, io_root, comm, ompi_coll_tuned_topo_build_in_order_bintree(comm), segcount, 0 ); if (MPI_SUCCESS != ret) { return ret; } /* Clean up */ if (io_root != root) { if (root == rank) { /* Receive result from rank io_root to recvbuf */ smpi_mpi_recv(recvbuf, count, datatype, io_root, COLL_TAG_REDUCE, comm, MPI_STATUS_IGNORE); if (MPI_IN_PLACE == sendbuf) { free(use_this_sendbuf); } } else if (io_root == rank) { /* Send result from use_this_recvbuf to root */ smpi_mpi_send(use_this_recvbuf, count, datatype, root, COLL_TAG_REDUCE, comm); free(use_this_recvbuf); } } return MPI_SUCCESS; }
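/*
 * Illustrative sketch, not part of the original source: a rough, hypothetical
 * expansion of the COLL_TUNED_COMPUTED_SEGCOUNT macro used above.  The real
 * macro comes from the OMPI-derived headers and may round the segment count
 * differently; the idea is simply "as many elements as fit into segsize
 * bytes, or the whole message if segsize is 0".
 */
static int illustrate_computed_segcount(uint32_t segsize, size_t typelng,
                                        int count)
{
  if (segsize == 0 || typelng == 0 || (size_t)segsize < typelng)
    return count;                            /* a single segment     */
  int segcount = (int)(segsize / typelng);   /* elements per segment */
  return segcount < count ? segcount : count;
}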
int smpi_coll_tuned_allgatherv_mpich_rdb ( void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf, int *recvcounts, int *displs, MPI_Datatype recvtype, MPI_Comm comm) { int comm_size, rank, j, i; MPI_Status status; MPI_Aint recvtype_extent, recvtype_true_extent, recvtype_true_lb; int curr_cnt, dst, total_count; void *tmp_buf, *tmp_buf_rl; int mask, dst_tree_root, my_tree_root, position, send_offset, recv_offset, last_recv_cnt=0, nprocs_completed, k, offset, tmp_mask, tree_root; comm_size = smpi_comm_size(comm); rank = smpi_comm_rank(comm); total_count = 0; for (i=0; i<comm_size; i++) total_count += recvcounts[i]; if (total_count == 0) return MPI_ERR_COUNT; recvtype_extent=smpi_datatype_get_extent( recvtype); /* need to receive contiguously into tmp_buf because displs could make the recvbuf noncontiguous */ smpi_datatype_extent(recvtype, &recvtype_true_lb, &recvtype_true_extent); tmp_buf_rl= (void*)smpi_get_tmp_sendbuffer(total_count*(MAX(recvtype_true_extent,recvtype_extent))); /* adjust for potential negative lower bound in datatype */ tmp_buf = (void *)((char*)tmp_buf_rl - recvtype_true_lb); /* copy local data into right location in tmp_buf */ position = 0; for (i=0; i<rank; i++) position += recvcounts[i]; if (sendbuf != MPI_IN_PLACE) { smpi_datatype_copy(sendbuf, sendcount, sendtype, ((char *)tmp_buf + position* recvtype_extent), recvcounts[rank], recvtype); } else { /* if in_place specified, local data is found in recvbuf */ smpi_datatype_copy(((char *)recvbuf + displs[rank]*recvtype_extent), recvcounts[rank], recvtype, ((char *)tmp_buf + position* recvtype_extent), recvcounts[rank], recvtype); } curr_cnt = recvcounts[rank]; mask = 0x1; i = 0; while (mask < comm_size) { dst = rank ^ mask; /* find offset into send and recv buffers. zero out the least significant "i" bits of rank and dst to find root of src and dst subtrees. Use ranks of roots as index to send from and recv into buffer */ dst_tree_root = dst >> i; dst_tree_root <<= i; my_tree_root = rank >> i; my_tree_root <<= i; if (dst < comm_size) { send_offset = 0; for (j=0; j<my_tree_root; j++) send_offset += recvcounts[j]; recv_offset = 0; for (j=0; j<dst_tree_root; j++) recv_offset += recvcounts[j]; smpi_mpi_sendrecv(((char *)tmp_buf + send_offset * recvtype_extent), curr_cnt, recvtype, dst, COLL_TAG_ALLGATHERV, ((char *)tmp_buf + recv_offset * recvtype_extent), total_count - recv_offset, recvtype, dst, COLL_TAG_ALLGATHERV, comm, &status); /* for convenience, recv is posted for a bigger amount than will be sent */ last_recv_cnt=smpi_mpi_get_count(&status, recvtype); curr_cnt += last_recv_cnt; } /* if some processes in this process's subtree in this step did not have any destination process to communicate with because of non-power-of-two, we need to send them the data that they would normally have received from those processes. That is, the haves in this subtree must send to the havenots. We use a logarithmic recursive-halfing algorithm for this. */ /* This part of the code will not currently be executed because we are not using recursive doubling for non power of two. Mark it as experimental so that it doesn't show up as red in the coverage tests. */ /* --BEGIN EXPERIMENTAL-- */ if (dst_tree_root + mask > comm_size) { nprocs_completed = comm_size - my_tree_root - mask; /* nprocs_completed is the number of processes in this subtree that have all the data. Send data to others in a tree fashion. First find root of current tree that is being divided into two. 
k is the number of least-significant bits in this process's rank that must be zeroed out to find the rank of the root */ j = mask; k = 0; while (j) { j >>= 1; k++; } k--; tmp_mask = mask >> 1; while (tmp_mask) { dst = rank ^ tmp_mask; tree_root = rank >> k; tree_root <<= k; /* send only if this proc has data and destination doesn't have data. at any step, multiple processes can send if they have the data */ if ((dst > rank) && (rank < tree_root + nprocs_completed) && (dst >= tree_root + nprocs_completed)) { offset = 0; for (j=0; j<(my_tree_root+mask); j++) offset += recvcounts[j]; offset *= recvtype_extent; smpi_mpi_send(((char *)tmp_buf + offset), last_recv_cnt, recvtype, dst, COLL_TAG_ALLGATHERV, comm); /* last_recv_cnt was set in the previous receive. that's the amount of data to be sent now. */ } /* recv only if this proc. doesn't have data and sender has data */ else if ((dst < rank) && (dst < tree_root + nprocs_completed) && (rank >= tree_root + nprocs_completed)) { offset = 0; for (j=0; j<(my_tree_root+mask); j++) offset += recvcounts[j]; smpi_mpi_recv(((char *)tmp_buf + offset * recvtype_extent), total_count - offset, recvtype, dst, COLL_TAG_ALLGATHERV, comm, &status); /* for convenience, recv is posted for a bigger amount than will be sent */ last_recv_cnt=smpi_mpi_get_count(&status, recvtype); curr_cnt += last_recv_cnt; } tmp_mask >>= 1; k--; } } /* --END EXPERIMENTAL-- */ mask <<= 1; i++; }
    /* Reconstructed ending -- the dump is truncated here.  The gathered data
     * sits contiguously in tmp_buf; copy every block to its place in recvbuf
     * as given by displs (mirroring the copy-in above) and release the
     * temporary buffer. */
    position = 0;
    for (j = 0; j < comm_size; j++) {
        if ((sendbuf != MPI_IN_PLACE) || (j != rank)) {
            /* no copy needed for our own block when MPI_IN_PLACE was given */
            smpi_datatype_copy(((char *)tmp_buf + position * recvtype_extent),
                               recvcounts[j], recvtype,
                               ((char *)recvbuf + displs[j] * recvtype_extent),
                               recvcounts[j], recvtype);
        }
        position += recvcounts[j];
    }
    smpi_free_tmp_buffer(tmp_buf_rl);
    return MPI_SUCCESS;
}
/* This fucntion performs all-reduce operation as follow. 1) binomial_tree reduce inside each SMP node 2) reduce-scatter -inter between root of each SMP node 3) allgather - inter between root of each SMP node 4) binomial_tree bcast inside each SMP node */ int smpi_coll_tuned_allreduce_smp_rsag(void *send_buf, void *recv_buf, int count, MPI_Datatype dtype, MPI_Op op, MPI_Comm comm) { int comm_size, rank; void *tmp_buf; int tag = COLL_TAG_ALLREDUCE; int mask, src, dst; MPI_Status status; if(smpi_comm_get_leaders_comm(comm)==MPI_COMM_NULL){ smpi_comm_init_smp(comm); } int num_core=1; if (smpi_comm_is_uniform(comm)){ num_core = smpi_comm_size(smpi_comm_get_intra_comm(comm)); } /* #ifdef MPICH2_REDUCTION MPI_User_function * uop = MPIR_Op_table[op % 16 - 1]; #else MPI_User_function *uop; struct MPIR_OP *op_ptr; op_ptr = MPIR_ToPointer(op); uop = op_ptr->op; #endif */ comm_size = smpi_comm_size(comm); rank = smpi_comm_rank(comm); MPI_Aint extent; extent = smpi_datatype_get_extent(dtype); tmp_buf = (void *) smpi_get_tmp_sendbuffer(count * extent); int intra_rank, inter_rank; intra_rank = rank % num_core; inter_rank = rank / num_core; //printf("node %d intra_rank = %d, inter_rank = %d\n", rank, intra_rank, inter_rank); int inter_comm_size = (comm_size + num_core - 1) / num_core; if (!rank) { //printf("intra com size = %d\n",num_core); //printf("inter com size = %d\n",inter_comm_size); } smpi_mpi_sendrecv(send_buf, count, dtype, rank, tag, recv_buf, count, dtype, rank, tag, comm, &status); // SMP_binomial_reduce mask = 1; while (mask < num_core) { if ((mask & intra_rank) == 0) { src = (inter_rank * num_core) + (intra_rank | mask); // if (src < ((inter_rank + 1) * num_core)) { if (src < comm_size) { smpi_mpi_recv(tmp_buf, count, dtype, src, tag, comm, &status); smpi_op_apply(op, tmp_buf, recv_buf, &count, &dtype); //printf("Node %d recv from node %d when mask is %d\n", rank, src, mask); } } else { dst = (inter_rank * num_core) + (intra_rank & (~mask)); smpi_mpi_send(recv_buf, count, dtype, dst, tag, comm); //printf("Node %d send to node %d when mask is %d\n", rank, dst, mask); break; } mask <<= 1; } // INTER: reduce-scatter if (intra_rank == 0) { int send_offset, recv_offset; int seg_count = count / inter_comm_size; int to = ((inter_rank + 1) % inter_comm_size) * num_core; int from = ((inter_rank + inter_comm_size - 1) % inter_comm_size) * num_core; int i; //printf("node %d to %d from %d\n",rank,to,from); for (i = 0; i < (inter_comm_size - 1); i++) { send_offset = ((inter_rank - 1 - i + inter_comm_size) % inter_comm_size) * seg_count * extent; recv_offset = ((inter_rank - 2 - i + inter_comm_size) % inter_comm_size) * seg_count * extent; smpi_mpi_sendrecv((char *) recv_buf + send_offset, seg_count, dtype, to, tag + i, tmp_buf, seg_count, dtype, from, tag + i, comm, &status); // result is in rbuf smpi_op_apply(op, tmp_buf, (char *) recv_buf + recv_offset, &seg_count, &dtype); } // INTER: allgather for (i = 0; i < (inter_comm_size - 1); i++) { send_offset = ((inter_rank - i + inter_comm_size) % inter_comm_size) * seg_count * extent; recv_offset = ((inter_rank - 1 - i + inter_comm_size) % inter_comm_size) * seg_count * extent; smpi_mpi_sendrecv((char *) recv_buf + send_offset, seg_count, dtype, to, tag + i, (char *) recv_buf + recv_offset, seg_count, dtype, from, tag + i, comm, &status); } } // INTER_binomial_reduce // only root node for each SMP // if (intra_rank == 0) { // // mask = 1; // while (mask < inter_comm_size) { // if ((mask & inter_rank) == 0) { // src = (inter_rank | mask) * num_core; // if 
(src < comm_size) { // smpi_mpi_recv(tmp_buf, count, dtype, src, tag, comm, &status); // (* uop) (tmp_buf, recv_buf, &count, &dtype); //printf("Node %d recv from node %d when mask is %d\n", rank, src, mask); // } // } // else { // dst = (inter_rank & (~mask)) * num_core; // smpi_mpi_send(recv_buf, count, dtype, dst, tag, comm); //printf("Node %d send to node %d when mask is %d\n", rank, dst, mask); // break; // } // mask <<=1; // } // } // INTER_binomial_bcast // if (intra_rank == 0) { // mask = 1; // while (mask < inter_comm_size) { // if (inter_rank & mask) { // src = (inter_rank - mask) * num_core; //printf("Node %d recv from node %d when mask is %d\n", rank, src, mask); // smpi_mpi_recv(recv_buf, count, dtype, src, tag, comm, &status); // break; // } // mask <<= 1; // } // // mask >>= 1; //printf("My rank = %d my mask = %d\n", rank,mask); // while (mask > 0) { // if (inter_rank < inter_comm_size) { // dst = (inter_rank + mask) * num_core; // if (dst < comm_size) { //printf("Node %d send to node %d when mask is %d\n", rank, dst, mask); // smpi_mpi_send(recv_buf, count, dtype, dst, tag, comm); // } // } // mask >>= 1; // } // } // INTRA_binomial_bcast int num_core_in_current_smp = num_core; if (inter_rank == (inter_comm_size - 1)) { num_core_in_current_smp = comm_size - (inter_rank * num_core); } // printf("Node %d num_core = %d\n",rank, num_core_in_current_smp); mask = 1; while (mask < num_core_in_current_smp) { if (intra_rank & mask) { src = (inter_rank * num_core) + (intra_rank - mask); //printf("Node %d recv from node %d when mask is %d\n", rank, src, mask); smpi_mpi_recv(recv_buf, count, dtype, src, tag, comm, &status); break; } mask <<= 1; } mask >>= 1; //printf("My rank = %d my mask = %d\n", rank,mask); while (mask > 0) { dst = (inter_rank * num_core) + (intra_rank + mask); if (dst < comm_size) { //printf("Node %d send to node %d when mask is %d\n", rank, dst, mask); smpi_mpi_send(recv_buf, count, dtype, dst, tag, comm); } mask >>= 1; } smpi_free_tmp_buffer(tmp_buf); return MPI_SUCCESS; }
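/*
 * Illustrative sketch, not part of the original source: the SMP-aware
 * all-reduce above addresses every process by a (node, core) pair computed
 * from its flat rank and the per-node core count.  The helper name is
 * hypothetical.
 */
static void illustrate_smp_rank_split(int rank, int comm_size, int num_core,
                                      int *intra_rank, int *inter_rank,
                                      int *inter_comm_size)
{
  *intra_rank      = rank % num_core;                       /* position inside the node  */
  *inter_rank      = rank / num_core;                       /* index of the node         */
  *inter_comm_size = (comm_size + num_core - 1) / num_core; /* number of nodes (ceiling) */
}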
int smpi_coll_tuned_bcast_scatter_rdb_allgather(void *buff, int count, MPI_Datatype data_type, int root, MPI_Comm comm) { MPI_Aint extent; MPI_Status status; int i, j, k, src, dst, rank, num_procs, send_offset, recv_offset; int mask, relative_rank, curr_size, recv_size = 0, send_size, nbytes; int scatter_size, tree_root, relative_dst, dst_tree_root; int my_tree_root, offset, tmp_mask, num_procs_completed; int tag = COLL_TAG_BCAST; rank = smpi_comm_rank(comm); num_procs = smpi_comm_size(comm); extent = smpi_datatype_get_extent(data_type); nbytes = extent * count; scatter_size = (nbytes + num_procs - 1) / num_procs; // ceiling division curr_size = (rank == root) ? nbytes : 0; // root starts with all the data relative_rank = (rank >= root) ? rank - root : rank - root + num_procs; mask = 0x1; while (mask < num_procs) { if (relative_rank & mask) { src = rank - mask; if (src < 0) src += num_procs; recv_size = nbytes - relative_rank * scatter_size; // recv_size is larger than what might actually be sent by the // sender. We don't need compute the exact value because MPI // allows you to post a larger recv. if (recv_size <= 0) curr_size = 0; // this process doesn't receive any data // because of uneven division else { smpi_mpi_recv((char *)buff + relative_rank * scatter_size, recv_size, MPI_BYTE, src, tag, comm, &status); curr_size = smpi_mpi_get_count(&status, MPI_BYTE); } break; } mask <<= 1; } // This process is responsible for all processes that have bits // set from the LSB upto (but not including) mask. Because of // the "not including", we start by shifting mask back down // one. mask >>= 1; while (mask > 0) { if (relative_rank + mask < num_procs) { send_size = curr_size - scatter_size * mask; // mask is also the size of this process's subtree if (send_size > 0) { dst = rank + mask; if (dst >= num_procs) dst -= num_procs; smpi_mpi_send((char *)buff + scatter_size * (relative_rank + mask), send_size, MPI_BYTE, dst, tag, comm); curr_size -= send_size; } } mask >>= 1; } // done scatter now do allgather mask = 0x1; i = 0; while (mask < num_procs) { relative_dst = relative_rank ^ mask; dst = (relative_dst + root) % num_procs; /* find offset into send and recv buffers. zero out the least significant "i" bits of relative_rank and relative_dst to find root of src and dst subtrees. Use ranks of roots as index to send from and recv into buffer */ dst_tree_root = relative_dst >> i; dst_tree_root <<= i; my_tree_root = relative_rank >> i; my_tree_root <<= i; send_offset = my_tree_root * scatter_size; recv_offset = dst_tree_root * scatter_size; if (relative_dst < num_procs) { smpi_mpi_sendrecv((char *)buff + send_offset, curr_size, MPI_BYTE, dst, tag, (char *)buff + recv_offset, scatter_size * mask, MPI_BYTE, dst, tag, comm, &status); recv_size = smpi_mpi_get_count(&status, MPI_BYTE); curr_size += recv_size; } /* if some processes in this process's subtree in this step did not have any destination process to communicate with because of non-power-of-two, we need to send them the data that they would normally have received from those processes. That is, the haves in this subtree must send to the havenots. We use a logarithmic recursive-halfing algorithm for this. */ if (dst_tree_root + mask > num_procs) { num_procs_completed = num_procs - my_tree_root - mask; /* num_procs_completed is the number of processes in this subtree that have all the data. Send data to others in a tree fashion. First find root of current tree that is being divided into two. 
k is the number of least-significant bits in this process's rank that must be zeroed out to find the rank of the root */ j = mask; k = 0; while (j) { j >>= 1; k++; } k--; offset = scatter_size * (my_tree_root + mask); tmp_mask = mask >> 1; while (tmp_mask) { relative_dst = relative_rank ^ tmp_mask; dst = (relative_dst + root) % num_procs; tree_root = relative_rank >> k; tree_root <<= k; /* send only if this proc has data and destination doesn't have data. */ if ((relative_dst > relative_rank) && (relative_rank < tree_root + num_procs_completed) && (relative_dst >= tree_root + num_procs_completed)) { smpi_mpi_send((char *)buff + offset, recv_size, MPI_BYTE, dst, tag, comm); /* recv_size was set in the previous receive. that's the amount of data to be sent now. */ } /* recv only if this proc. doesn't have data and sender has data */ else if ((relative_dst < relative_rank) && (relative_dst < tree_root + num_procs_completed) && (relative_rank >= tree_root + num_procs_completed)) { smpi_mpi_recv((char *)buff + offset, scatter_size * num_procs_completed, MPI_BYTE, dst, tag, comm, &status); /* num_procs_completed is also equal to the no. of processes whose data we don't have */ recv_size = smpi_mpi_get_count(&status, MPI_BYTE); curr_size += recv_size; } tmp_mask >>= 1; k--; } } mask <<= 1; i++; }
    /* Reconstructed ending -- the dump is truncated here.  After the allgather
     * phase every process holds all nbytes of the broadcast data. */
    return MPI_SUCCESS;
}
int smpi_coll_tuned_allreduce_rab_rsag(void *sbuff, void *rbuff, int count, MPI_Datatype dtype, MPI_Op op, MPI_Comm comm) { int nprocs, rank, tag = COLL_TAG_ALLREDUCE; int mask, dst, pof2, newrank, rem, newdst, i, send_idx, recv_idx, last_idx, send_cnt, recv_cnt, *cnts, *disps; MPI_Aint extent; MPI_Status status; void *tmp_buf = NULL; nprocs = smpi_comm_size(comm); rank = smpi_comm_rank(comm); extent = smpi_datatype_get_extent(dtype); tmp_buf = (void *) xbt_malloc(count * extent); smpi_mpi_sendrecv(sbuff, count, dtype, rank, tag, rbuff, count, dtype, rank, tag, comm, &status); // find nearest power-of-two less than or equal to comm_size pof2 = 1; while (pof2 <= nprocs) pof2 <<= 1; pof2 >>= 1; rem = nprocs - pof2; // In the non-power-of-two case, all even-numbered // processes of rank < 2*rem send their data to // (rank+1). These even-numbered processes no longer // participate in the algorithm until the very end. The // remaining processes form a nice power-of-two. if (rank < 2 * rem) { // even if (rank % 2 == 0) { smpi_mpi_send(rbuff, count, dtype, rank + 1, tag, comm); // temporarily set the rank to -1 so that this // process does not pariticipate in recursive // doubling newrank = -1; } else // odd { smpi_mpi_recv(tmp_buf, count, dtype, rank - 1, tag, comm, &status); // do the reduction on received data. since the // ordering is right, it doesn't matter whether // the operation is commutative or not. smpi_op_apply(op, tmp_buf, rbuff, &count, &dtype); // change the rank newrank = rank / 2; } } else // rank >= 2 * rem newrank = rank - rem; // If op is user-defined or count is less than pof2, use // recursive doubling algorithm. Otherwise do a reduce-scatter // followed by allgather. (If op is user-defined, // derived datatypes are allowed and the user could pass basic // datatypes on one process and derived on another as long as // the type maps are the same. Breaking up derived // datatypes to do the reduce-scatter is tricky, therefore // using recursive doubling in that case.) if (newrank != -1) { // do a reduce-scatter followed by allgather. for the // reduce-scatter, calculate the count that each process receives // and the displacement within the buffer cnts = (int *) xbt_malloc(pof2 * sizeof(int)); disps = (int *) xbt_malloc(pof2 * sizeof(int)); for (i = 0; i < (pof2 - 1); i++) cnts[i] = count / pof2; cnts[pof2 - 1] = count - (count / pof2) * (pof2 - 1); disps[0] = 0; for (i = 1; i < pof2; i++) disps[i] = disps[i - 1] + cnts[i - 1]; mask = 0x1; send_idx = recv_idx = 0; last_idx = pof2; while (mask < pof2) { newdst = newrank ^ mask; // find real rank of dest dst = (newdst < rem) ? newdst * 2 + 1 : newdst + rem; send_cnt = recv_cnt = 0; if (newrank < newdst) { send_idx = recv_idx + pof2 / (mask * 2); for (i = send_idx; i < last_idx; i++) send_cnt += cnts[i]; for (i = recv_idx; i < send_idx; i++) recv_cnt += cnts[i]; } else { recv_idx = send_idx + pof2 / (mask * 2); for (i = send_idx; i < recv_idx; i++) send_cnt += cnts[i]; for (i = recv_idx; i < last_idx; i++) recv_cnt += cnts[i]; } // Send data from recvbuf. Recv into tmp_buf smpi_mpi_sendrecv((char *) rbuff + disps[send_idx] * extent, send_cnt, dtype, dst, tag, (char *) tmp_buf + disps[recv_idx] * extent, recv_cnt, dtype, dst, tag, comm, &status); // tmp_buf contains data received in this step. // recvbuf contains data accumulated so far // This algorithm is used only for predefined ops // and predefined ops are always commutative. 
smpi_op_apply(op, (char *) tmp_buf + disps[recv_idx] * extent, (char *) rbuff + disps[recv_idx] * extent, &recv_cnt, &dtype); // update send_idx for next iteration send_idx = recv_idx; mask <<= 1; // update last_idx, but not in last iteration because the value // is needed in the allgather step below. if (mask < pof2) last_idx = recv_idx + pof2 / mask; } // now do the allgather mask >>= 1; while (mask > 0) { newdst = newrank ^ mask; // find real rank of dest dst = (newdst < rem) ? newdst * 2 + 1 : newdst + rem; send_cnt = recv_cnt = 0; if (newrank < newdst) { // update last_idx except on first iteration if (mask != pof2 / 2) last_idx = last_idx + pof2 / (mask * 2); recv_idx = send_idx + pof2 / (mask * 2); for (i = send_idx; i < recv_idx; i++) send_cnt += cnts[i]; for (i = recv_idx; i < last_idx; i++) recv_cnt += cnts[i]; } else { recv_idx = send_idx - pof2 / (mask * 2); for (i = send_idx; i < last_idx; i++) send_cnt += cnts[i]; for (i = recv_idx; i < send_idx; i++) recv_cnt += cnts[i]; } smpi_mpi_sendrecv((char *) rbuff + disps[send_idx] * extent, send_cnt, dtype, dst, tag, (char *) rbuff + disps[recv_idx] * extent, recv_cnt, dtype, dst, tag, comm, &status); if (newrank > newdst) send_idx = recv_idx; mask >>= 1; } free(cnts); free(disps); }
    /* Reconstructed ending -- the dump is truncated here.  Mirror of the
     * non-power-of-two fixup used by smpi_coll_tuned_allreduce_mvapich2_rs
     * below: the odd ranks that took part send the final result back to the
     * even ranks folded out at the start, then the temporary buffer is
     * released. */
    if (rank < 2 * rem) {
        if (rank % 2)   /* odd */
            smpi_mpi_send(rbuff, count, dtype, rank - 1, tag, comm);
        else            /* even */
            smpi_mpi_recv(rbuff, count, dtype, rank + 1, tag, comm, &status);
    }

    free(tmp_buf);
    return MPI_SUCCESS;
}
int smpi_coll_tuned_gather_mvapich2_two_level(void *sendbuf, int sendcnt, MPI_Datatype sendtype, void *recvbuf, int recvcnt, MPI_Datatype recvtype, int root, MPI_Comm comm) { void *leader_gather_buf = NULL; int comm_size, rank; int local_rank, local_size; int leader_comm_rank = -1, leader_comm_size = 0; int mpi_errno = MPI_SUCCESS; int recvtype_size = 0, sendtype_size = 0, nbytes=0; int leader_root, leader_of_root; MPI_Status status; MPI_Aint sendtype_extent = 0, recvtype_extent = 0; /* Datatype extent */ MPI_Aint true_lb, sendtype_true_extent, recvtype_true_extent; MPI_Comm shmem_comm, leader_comm; void* tmp_buf = NULL; //if not set (use of the algo directly, without mvapich2 selector) if(MV2_Gather_intra_node_function==NULL) MV2_Gather_intra_node_function=smpi_coll_tuned_gather_mpich; if(smpi_comm_get_leaders_comm(comm)==MPI_COMM_NULL){ smpi_comm_init_smp(comm); } comm_size = smpi_comm_size(comm); rank = smpi_comm_rank(comm); if (((rank == root) && (recvcnt == 0)) || ((rank != root) && (sendcnt == 0))) { return MPI_SUCCESS; } if (sendtype != MPI_DATATYPE_NULL) { sendtype_extent=smpi_datatype_get_extent(sendtype); sendtype_size=smpi_datatype_size(sendtype); smpi_datatype_extent(sendtype, &true_lb, &sendtype_true_extent); } if (recvtype != MPI_DATATYPE_NULL) { recvtype_extent=smpi_datatype_get_extent(recvtype); recvtype_size=smpi_datatype_size(recvtype); smpi_datatype_extent(recvtype, &true_lb, &recvtype_true_extent); } /* extract the rank,size information for the intra-node * communicator */ shmem_comm = smpi_comm_get_intra_comm(comm); local_rank = smpi_comm_rank(shmem_comm); local_size = smpi_comm_size(shmem_comm); if (local_rank == 0) { /* Node leader. Extract the rank, size information for the leader * communicator */ leader_comm = smpi_comm_get_leaders_comm(comm); if(leader_comm==MPI_COMM_NULL){ leader_comm = MPI_COMM_WORLD; } leader_comm_size = smpi_comm_size(leader_comm); leader_comm_rank = smpi_comm_rank(leader_comm); } if (rank == root) { nbytes = recvcnt * recvtype_size; } else { nbytes = sendcnt * sendtype_size; } #if defined(_SMP_LIMIC_) if((g_use_limic2_coll) && (shmem_commptr->ch.use_intra_sock_comm == 1) && (use_limic_gather) &&((num_scheme == USE_GATHER_PT_PT_BINOMIAL) || (num_scheme == USE_GATHER_PT_PT_DIRECT) ||(num_scheme == USE_GATHER_PT_LINEAR_BINOMIAL) || (num_scheme == USE_GATHER_PT_LINEAR_DIRECT) || (num_scheme == USE_GATHER_LINEAR_PT_BINOMIAL) || (num_scheme == USE_GATHER_LINEAR_PT_DIRECT) || (num_scheme == USE_GATHER_LINEAR_LINEAR) || (num_scheme == USE_GATHER_SINGLE_LEADER))) { mpi_errno = MV2_Gather_intra_node_function(sendbuf, sendcnt, sendtype, recvbuf, recvcnt,recvtype, root, comm); } else #endif/*#if defined(_SMP_LIMIC_)*/ { if (local_rank == 0) { /* Node leader, allocate tmp_buffer */ if (rank == root) { tmp_buf = smpi_get_tmp_recvbuffer(recvcnt * MAX(recvtype_extent, recvtype_true_extent) * local_size); } else { tmp_buf = smpi_get_tmp_sendbuffer(sendcnt * MAX(sendtype_extent, sendtype_true_extent) * local_size); } if (tmp_buf == NULL) { mpi_errno = MPI_ERR_OTHER; return mpi_errno; } } /*while testing mpich2 gather test, we see that * which basically splits the comm, and we come to * a point, where use_intra_sock_comm == 0, but if the * intra node function is MPIR_Intra_node_LIMIC_Gather_MV2, * it would use the intra sock comm. 
In such cases, we * fallback to binomial as a default case.*/ #if defined(_SMP_LIMIC_) if(*MV2_Gather_intra_node_function == MPIR_Intra_node_LIMIC_Gather_MV2) { mpi_errno = MPIR_pt_pt_intra_gather(sendbuf,sendcnt, sendtype, recvbuf, recvcnt, recvtype, root, rank, tmp_buf, nbytes, TEMP_BUF_HAS_NO_DATA, shmem_commptr, MPIR_Gather_intra); } else #endif { /*We are gathering the data into tmp_buf and the output * will be of MPI_BYTE datatype. Since the tmp_buf has no * local data, we pass is_data_avail = TEMP_BUF_HAS_NO_DATA*/ mpi_errno = MPIR_pt_pt_intra_gather(sendbuf,sendcnt, sendtype, recvbuf, recvcnt, recvtype, root, rank, tmp_buf, nbytes, TEMP_BUF_HAS_NO_DATA, shmem_comm, MV2_Gather_intra_node_function ); } } leader_comm = smpi_comm_get_leaders_comm(comm); int* leaders_map = smpi_comm_get_leaders_map(comm); leader_of_root = smpi_group_rank(smpi_comm_group(comm),leaders_map[root]); leader_root = smpi_group_rank(smpi_comm_group(leader_comm),leaders_map[root]); /* leader_root is the rank of the leader of the root in leader_comm. * leader_root is to be used as the root of the inter-leader gather ops */ if (!smpi_comm_is_uniform(comm)) { if (local_rank == 0) { int *displs = NULL; int *recvcnts = NULL; int *node_sizes; int i = 0; /* Node leaders have all the data. But, different nodes can have * different number of processes. Do a Gather first to get the * buffer lengths at each leader, followed by a Gatherv to move * the actual data */ if (leader_comm_rank == leader_root && root != leader_of_root) { /* The root of the Gather operation is not a node-level * leader and this process's rank in the leader_comm * is the same as leader_root */ if(rank == root) { leader_gather_buf = smpi_get_tmp_recvbuffer(recvcnt * MAX(recvtype_extent, recvtype_true_extent) * comm_size); } else { leader_gather_buf = smpi_get_tmp_sendbuffer(sendcnt * MAX(sendtype_extent, sendtype_true_extent) * comm_size); } if (leader_gather_buf == NULL) { mpi_errno = MPI_ERR_OTHER; return mpi_errno; } } node_sizes = smpi_comm_get_non_uniform_map(comm); if (leader_comm_rank == leader_root) { displs = xbt_malloc(sizeof (int) * leader_comm_size); recvcnts = xbt_malloc(sizeof (int) * leader_comm_size); if (!displs || !recvcnts) { mpi_errno = MPI_ERR_OTHER; return mpi_errno; } } if (root == leader_of_root) { /* The root of the gather operation is also the node * leader. Receive into recvbuf and we are done */ if (leader_comm_rank == leader_root) { recvcnts[0] = node_sizes[0] * recvcnt; displs[0] = 0; for (i = 1; i < leader_comm_size; i++) { displs[i] = displs[i - 1] + node_sizes[i - 1] * recvcnt; recvcnts[i] = node_sizes[i] * recvcnt; } } smpi_mpi_gatherv(tmp_buf, local_size * nbytes, MPI_BYTE, recvbuf, recvcnts, displs, recvtype, leader_root, leader_comm); } else { /* The root of the gather operation is not the node leader. * Receive into leader_gather_buf and then send * to the root */ if (leader_comm_rank == leader_root) { recvcnts[0] = node_sizes[0] * nbytes; displs[0] = 0; for (i = 1; i < leader_comm_size; i++) { displs[i] = displs[i - 1] + node_sizes[i - 1] * nbytes; recvcnts[i] = node_sizes[i] * nbytes; } } smpi_mpi_gatherv(tmp_buf, local_size * nbytes, MPI_BYTE, leader_gather_buf, recvcnts, displs, MPI_BYTE, leader_root, leader_comm); } if (leader_comm_rank == leader_root) { xbt_free(displs); xbt_free(recvcnts); } } } else { /* All nodes have the same number of processes. 
* Just do one Gather to get all * the data at the leader of the root process */ if (local_rank == 0) { if (leader_comm_rank == leader_root && root != leader_of_root) { /* The root of the Gather operation is not a node-level leader */ leader_gather_buf = smpi_get_tmp_sendbuffer(nbytes * comm_size); if (leader_gather_buf == NULL) { mpi_errno = MPI_ERR_OTHER; return mpi_errno; } } if (root == leader_of_root) { mpi_errno = MPIR_Gather_MV2_Direct(tmp_buf, nbytes * local_size, MPI_BYTE, recvbuf, recvcnt * local_size, recvtype, leader_root, leader_comm); } else { mpi_errno = MPIR_Gather_MV2_Direct(tmp_buf, nbytes * local_size, MPI_BYTE, leader_gather_buf, nbytes * local_size, MPI_BYTE, leader_root, leader_comm); } } if ((local_rank == 0) && (root != rank) && (leader_of_root == rank)) { smpi_mpi_send(leader_gather_buf, nbytes * comm_size, MPI_BYTE, root, COLL_TAG_GATHER, comm); } if (rank == root && local_rank != 0) { /* The root of the gather operation is not the node leader. Receive the data from the node leader */ smpi_mpi_recv(recvbuf, recvcnt * comm_size, recvtype, leader_of_root, COLL_TAG_GATHER, comm, &status); } /* free the temporary buffers used on the node leaders */ if (local_rank == 0 ) { if (tmp_buf != NULL) { smpi_free_tmp_buffer(tmp_buf); } if (leader_gather_buf != NULL) { smpi_free_tmp_buffer(leader_gather_buf); } } return (mpi_errno); }
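/*
 * Illustrative sketch, not part of the original source: the per-process
 * payload that the two-level gather above moves between node leaders as raw
 * MPI_BYTE data.  The helper name is hypothetical.
 */
static int illustrate_two_level_gather_nbytes(int rank, int root,
                                              int recvcnt, int recvtype_size,
                                              int sendcnt, int sendtype_size)
{
  /* the root contributes recv-sized blocks, everybody else send-sized ones */
  return (rank == root) ? recvcnt * recvtype_size : sendcnt * sendtype_size;
}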
int smpi_coll_tuned_allreduce_mvapich2_rs(void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, MPI_Op op, MPI_Comm comm) { int comm_size, rank; int mpi_errno = MPI_SUCCESS; int mask, dst, is_commutative, pof2, newrank = 0, rem, newdst, i, send_idx, recv_idx, last_idx, send_cnt, recv_cnt, *cnts, *disps; MPI_Aint true_lb, true_extent, extent; void *tmp_buf, *tmp_buf_free; if (count == 0) { return MPI_SUCCESS; } /* homogeneous */ comm_size = smpi_comm_size(comm); rank = smpi_comm_rank(comm); is_commutative = smpi_op_is_commute(op); /* need to allocate temporary buffer to store incoming data */ smpi_datatype_extent(datatype, &true_lb, &true_extent); extent = smpi_datatype_get_extent(datatype); tmp_buf_free= smpi_get_tmp_recvbuffer(count * (MAX(extent, true_extent))); /* adjust for potential negative lower bound in datatype */ tmp_buf = (void *) ((char *) tmp_buf_free - true_lb); /* copy local data into recvbuf */ if (sendbuf != MPI_IN_PLACE) { mpi_errno = smpi_datatype_copy(sendbuf, count, datatype, recvbuf, count, datatype); } /* find nearest power-of-two less than or equal to comm_size */ for( pof2 = 1; pof2 <= comm_size; pof2 <<= 1 ); pof2 >>=1; rem = comm_size - pof2; /* In the non-power-of-two case, all even-numbered processes of rank < 2*rem send their data to (rank+1). These even-numbered processes no longer participate in the algorithm until the very end. The remaining processes form a nice power-of-two. */ if (rank < 2 * rem) { if (rank % 2 == 0) { /* even */ smpi_mpi_send(recvbuf, count, datatype, rank + 1, COLL_TAG_ALLREDUCE, comm); /* temporarily set the rank to -1 so that this process does not pariticipate in recursive doubling */ newrank = -1; } else { /* odd */ smpi_mpi_recv(tmp_buf, count, datatype, rank - 1, COLL_TAG_ALLREDUCE, comm, MPI_STATUS_IGNORE); /* do the reduction on received data. since the ordering is right, it doesn't matter whether the operation is commutative or not. */ smpi_op_apply(op, tmp_buf, recvbuf, &count, &datatype); /* change the rank */ newrank = rank / 2; } } else { /* rank >= 2*rem */ newrank = rank - rem; } /* If op is user-defined or count is less than pof2, use recursive doubling algorithm. Otherwise do a reduce-scatter followed by allgather. (If op is user-defined, derived datatypes are allowed and the user could pass basic datatypes on one process and derived on another as long as the type maps are the same. Breaking up derived datatypes to do the reduce-scatter is tricky, therefore using recursive doubling in that case.) */ if (newrank != -1) { if (/*(HANDLE_GET_KIND(op) != HANDLE_KIND_BUILTIN) ||*/ (count < pof2)) { /* use recursive doubling */ mask = 0x1; while (mask < pof2) { newdst = newrank ^ mask; /* find real rank of dest */ dst = (newdst < rem) ? newdst * 2 + 1 : newdst + rem; /* Send the most current data, which is in recvbuf. Recv into tmp_buf */ smpi_mpi_sendrecv(recvbuf, count, datatype, dst, COLL_TAG_ALLREDUCE, tmp_buf, count, datatype, dst, COLL_TAG_ALLREDUCE, comm, MPI_STATUS_IGNORE); /* tmp_buf contains data received in this step. 
recvbuf contains data accumulated so far */ if (is_commutative || (dst < rank)) { /* op is commutative OR the order is already right */ smpi_op_apply(op, tmp_buf, recvbuf, &count, &datatype); } else { /* op is noncommutative and the order is not right */ smpi_op_apply(op, recvbuf, tmp_buf, &count, &datatype); /* copy result back into recvbuf */ mpi_errno = smpi_datatype_copy(tmp_buf, count, datatype, recvbuf, count, datatype); } mask <<= 1; } } else { /* do a reduce-scatter followed by allgather */ /* for the reduce-scatter, calculate the count that each process receives and the displacement within the buffer */ cnts = (int *)xbt_malloc(pof2 * sizeof (int)); disps = (int *)xbt_malloc(pof2 * sizeof (int)); for (i = 0; i < (pof2 - 1); i++) { cnts[i] = count / pof2; } cnts[pof2 - 1] = count - (count / pof2) * (pof2 - 1); disps[0] = 0; for (i = 1; i < pof2; i++) { disps[i] = disps[i - 1] + cnts[i - 1]; } mask = 0x1; send_idx = recv_idx = 0; last_idx = pof2; while (mask < pof2) { newdst = newrank ^ mask; /* find real rank of dest */ dst = (newdst < rem) ? newdst * 2 + 1 : newdst + rem; send_cnt = recv_cnt = 0; if (newrank < newdst) { send_idx = recv_idx + pof2 / (mask * 2); for (i = send_idx; i < last_idx; i++) send_cnt += cnts[i]; for (i = recv_idx; i < send_idx; i++) recv_cnt += cnts[i]; } else { recv_idx = send_idx + pof2 / (mask * 2); for (i = send_idx; i < recv_idx; i++) send_cnt += cnts[i]; for (i = recv_idx; i < last_idx; i++) recv_cnt += cnts[i]; } /* Send data from recvbuf. Recv into tmp_buf */ smpi_mpi_sendrecv((char *) recvbuf + disps[send_idx] * extent, send_cnt, datatype, dst, COLL_TAG_ALLREDUCE, (char *) tmp_buf + disps[recv_idx] * extent, recv_cnt, datatype, dst, COLL_TAG_ALLREDUCE, comm, MPI_STATUS_IGNORE); /* tmp_buf contains data received in this step. recvbuf contains data accumulated so far */ /* This algorithm is used only for predefined ops and predefined ops are always commutative. */ smpi_op_apply(op, (char *) tmp_buf + disps[recv_idx] * extent, (char *) recvbuf + disps[recv_idx] * extent, &recv_cnt, &datatype); /* update send_idx for next iteration */ send_idx = recv_idx; mask <<= 1; /* update last_idx, but not in last iteration because the value is needed in the allgather step below. */ if (mask < pof2) last_idx = recv_idx + pof2 / mask; } /* now do the allgather */ mask >>= 1; while (mask > 0) { newdst = newrank ^ mask; /* find real rank of dest */ dst = (newdst < rem) ? newdst * 2 + 1 : newdst + rem; send_cnt = recv_cnt = 0; if (newrank < newdst) { /* update last_idx except on first iteration */ if (mask != pof2 / 2) { last_idx = last_idx + pof2 / (mask * 2); } recv_idx = send_idx + pof2 / (mask * 2); for (i = send_idx; i < recv_idx; i++) { send_cnt += cnts[i]; } for (i = recv_idx; i < last_idx; i++) { recv_cnt += cnts[i]; } } else { recv_idx = send_idx - pof2 / (mask * 2); for (i = send_idx; i < last_idx; i++) { send_cnt += cnts[i]; } for (i = recv_idx; i < send_idx; i++) { recv_cnt += cnts[i]; } } smpi_mpi_sendrecv((char *) recvbuf + disps[send_idx] * extent, send_cnt, datatype, dst, COLL_TAG_ALLREDUCE, (char *) recvbuf + disps[recv_idx] * extent, recv_cnt, datatype, dst, COLL_TAG_ALLREDUCE, comm, MPI_STATUS_IGNORE); if (newrank > newdst) { send_idx = recv_idx; } mask >>= 1; } } } /* In the non-power-of-two case, all odd-numbered processes of rank < 2*rem send the result to (rank-1), the ranks who didn't participate above. 
*/ if (rank < 2 * rem) { if (rank % 2) { /* odd */ smpi_mpi_send(recvbuf, count, datatype, rank - 1, COLL_TAG_ALLREDUCE, comm); } else { /* even */ smpi_mpi_recv(recvbuf, count, datatype, rank + 1, COLL_TAG_ALLREDUCE, comm, MPI_STATUS_IGNORE); } } smpi_free_tmp_buffer(tmp_buf_free); return (mpi_errno); }
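/*
 * Illustrative sketch, not part of the original source: how the
 * reduce-scatter phase of the Rabenseifner-style all-reduce algorithms above
 * splits count elements over the pof2 participating ranks.  Every rank gets
 * count / pof2 elements and the last rank absorbs the remainder.  The helper
 * name is hypothetical; cnts and disps must each hold pof2 ints.
 */
static void illustrate_reduce_scatter_layout(int count, int pof2,
                                             int *cnts, int *disps)
{
  int i;
  for (i = 0; i < pof2 - 1; i++)
    cnts[i] = count / pof2;
  cnts[pof2 - 1] = count - (count / pof2) * (pof2 - 1); /* remainder goes last */
  disps[0] = 0;
  for (i = 1; i < pof2; i++)
    disps[i] = disps[i - 1] + cnts[i - 1];              /* prefix sums         */
}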