/* Binomial-tree reduce.
 * Partial results climb a binomial tree and the final value is delivered into
 * recvbuf on 'root'.  For commutative ops the tree is rooted at 'root';
 * otherwise it is rooted at rank 0 (so that operands are always combined in
 * rank order) and the result is forwarded from rank 0 to 'root' at the end.
 * Always returns 0. */
int Coll_reduce_binomial::reduce(void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, MPI_Op op, int root, MPI_Comm comm)
{
  MPI_Status status;
  int comm_size, rank;
  int mask, relrank, source;
  int dst;
  int tag = COLL_TAG_REDUCE;
  MPI_Aint extent;
  void *tmp_buf;
  MPI_Aint true_lb, true_extent;

  if (count == 0)
    return 0;
  rank = comm->rank();
  comm_size = comm->size();

  extent = datatype->get_extent();
  /* Scratch buffer that incoming partial results are received into.
   * NOTE(review): sized with 'extent' only, while the temporary recvbuf below
   * uses max(extent, true_extent) -- confirm this is sufficient for datatypes
   * where true_extent > extent. */
  tmp_buf = (void *) smpi_get_tmp_sendbuffer(count * extent);

  int is_commutative = (op==MPI_OP_NULL || op->is_commutative());
  mask = 1;

  /* Logical root of the tree: the real root when the op commutes, rank 0
   * otherwise. */
  int lroot;
  if (is_commutative)
    lroot = root;
  else
    lroot = 0;
  /* My rank rotated so that lroot becomes 0. */
  relrank = (rank - lroot + comm_size) % comm_size;

  datatype->extent(&true_lb, &true_extent);

  /* adjust for potential negative lower bound in datatype */
  tmp_buf = (void *)((char*)tmp_buf - true_lb);

  /* If I'm not the root, then my recvbuf may not be valid, therefore
   * I have to allocate a temporary one */
  if (rank != root) {
    recvbuf = (void*)smpi_get_tmp_recvbuffer(count * std::max(extent, true_extent));
    recvbuf = (void *)((char*)recvbuf - true_lb);
  }
  /* Seed the accumulator with my own contribution (unless the root passed
   * MPI_IN_PLACE, in which case its recvbuf already holds it). */
  if ((rank != root) || (sendbuf != MPI_IN_PLACE)) {
    Datatype::copy(sendbuf, count, datatype, recvbuf, count, datatype);
  }

  /* Climb the binomial tree: at each level either receive the sibling
   * subtree's partial result and fold it in, or send my accumulated value to
   * my parent and stop participating. */
  while (mask < comm_size) {
    /* Receive */
    if ((mask & relrank) == 0) {
      source = (relrank | mask);
      if (source < comm_size) {
        source = (source + lroot) % comm_size;
        Request::recv(tmp_buf, count, datatype, source, tag, comm, &status);
        if (is_commutative) {
          if (op != MPI_OP_NULL)
            op->apply(tmp_buf, recvbuf, &count, datatype);
        } else {
          /* Non-commutative: combine in tree order (recvbuf op tmp_buf),
           * then move the result back into recvbuf. */
          if (op != MPI_OP_NULL)
            op->apply(recvbuf, tmp_buf, &count, datatype);
          Datatype::copy(tmp_buf, count, datatype, recvbuf, count, datatype);
        }
      }
    } else {
      /* My parent is my relative rank with this tree level's bit cleared. */
      dst = ((relrank & (~mask)) + lroot) % comm_size;
      Request::send(recvbuf, count, datatype, dst, tag, comm);
      break;
    }
    mask <<= 1;
  }

  /* The non-commutative case reduced into rank 0; ship the final result to
   * the real root if that is a different rank. */
  if (not is_commutative && (root != 0)) {
    if (rank == 0) {
      Request::send(recvbuf, count, datatype, root, tag, comm);
    } else if (rank == root) {
      Request::recv(recvbuf, count, datatype, 0, tag, comm, &status);
    }
  }

  /* NOTE(review): both buffers are freed through their lb-shifted pointers;
   * confirm smpi_free_tmp_buffer tolerates that when true_lb != 0. */
  if (rank != root) {
    smpi_free_tmp_buffer(recvbuf);
  }
  smpi_free_tmp_buffer(tmp_buf);
  return 0;
}
/* MVAPICH2 broadcast selector with intra-node awareness.
 * Chooses between a two-level broadcast (inter-node phase between node
 * leaders, then an intra-node phase over the shared-memory communicator) and
 * flat single-level variants (binomial / scatter+allgather), based on message
 * size and communicator size.  Returns the error code of the underlying
 * broadcast. */
int smpi_coll_tuned_bcast_mvapich2_intra_node(void *buffer, int count, MPI_Datatype datatype, int root, MPI_Comm comm)
{
  int mpi_errno = MPI_SUCCESS;
  int comm_size;
  int two_level_bcast = 1;
  size_t nbytes = 0;
  int is_homogeneous, is_contig;
  MPI_Aint type_size;
  void *tmp_buf = NULL;
  MPI_Comm shmem_comm;

  if (count == 0)
    return MPI_SUCCESS;

  /* Fall back to the MPICH implementations when the MV2 function pointers
   * were not set up by the mvapich2 selector (direct use of this algorithm). */
  if (MV2_Bcast_function == NULL) {
    MV2_Bcast_function = smpi_coll_tuned_bcast_mpich;
  }
  if (MV2_Bcast_intra_node_function == NULL) {
    MV2_Bcast_intra_node_function = smpi_coll_tuned_bcast_mpich;
  }

  /* Lazily build the SMP-aware (leaders + intra-node) communicators. */
  if (smpi_comm_get_leaders_comm(comm) == MPI_COMM_NULL) {
    smpi_comm_init_smp(comm);
  }

  comm_size = smpi_comm_size(comm);
  // rank = smpi_comm_rank(comm);

  /* if (HANDLE_GET_KIND(datatype) == HANDLE_KIND_BUILTIN)*/
  is_contig = 1;
  /* else { MPID_Datatype_get_ptr(datatype, dtp); is_contig = dtp->is_contig; } */

  is_homogeneous = 1;
#ifdef MPID_HAS_HETERO
  if (comm_ptr->is_hetero)
    is_homogeneous = 0;
#endif

  /* MPI_Type_size() might not give the accurate size of the packed
   * datatype for heterogeneous systems (because of padding, encoding,
   * etc). On the other hand, MPI_Pack_size() can become very
   * expensive, depending on the implementation, especially for
   * heterogeneous systems. We want to use MPI_Type_size() wherever
   * possible, and MPI_Pack_size() in other places.
   */
  //if (is_homogeneous) {
  type_size = smpi_datatype_size(datatype);
  //}
  /* else {*/
  /*      MPIR_Pack_size_impl(1, datatype, &type_size);*/
  /* }*/
  nbytes = (size_t) (count) * (type_size);

  /* Two-level broadcast only pays off for medium-sized messages on
   * small-enough systems. */
  if (comm_size <= mv2_bcast_two_level_system_size) {
    if (nbytes > mv2_bcast_short_msg && nbytes < mv2_bcast_large_msg) {
      two_level_bcast = 1;
    } else {
      two_level_bcast = 0;
    }
  }

  if (two_level_bcast == 1
#if defined(_MCST_SUPPORT_)
      || comm_ptr->ch.is_mcast_ok
#endif
     ) {
    /* Non-contiguous or heterogeneous data would be packed into tmp_buf and
     * broadcast as raw bytes.  (Unreachable here since is_contig and
     * is_homogeneous are hard-wired to 1 above.) */
    if (!is_contig || !is_homogeneous) {
      tmp_buf = (void *)smpi_get_tmp_sendbuffer(nbytes);
      /* TODO: Pipeline the packing and communication */
      // position = 0;
      /* if (rank == root) {*/
      /*     mpi_errno =*/
      /*         MPIR_Pack_impl(buffer, count, datatype, tmp_buf, nbytes, &position);*/
      /*     if (mpi_errno)*/
      /*         MPIU_ERR_POP(mpi_errno);*/
      /* }*/
    }

    shmem_comm = smpi_comm_get_intra_comm(comm);

    /* Phase 1: broadcast among the node leaders. */
    if (!is_contig || !is_homogeneous) {
      mpi_errno = MPIR_Bcast_inter_node_helper_MV2(tmp_buf, nbytes, MPI_BYTE, root, comm);
    } else {
      mpi_errno = MPIR_Bcast_inter_node_helper_MV2(buffer, count, datatype, root, comm);
    }

    /* We are now done with the inter-node phase */

    /* Phase 2: intra-node broadcast over the shared-memory communicator --
     * shared-memory flavor for small payloads, k-nomial otherwise. */
    if (nbytes <= mv2_knomial_intra_node_threshold) {
      if (!is_contig || !is_homogeneous) {
        mpi_errno = MPIR_Shmem_Bcast_MV2(tmp_buf, nbytes, MPI_BYTE, root, shmem_comm);
      } else {
        mpi_errno = MPIR_Shmem_Bcast_MV2(buffer, count, datatype, root, shmem_comm);
      }
    } else {
      if (!is_contig || !is_homogeneous) {
        mpi_errno = MPIR_Knomial_Bcast_intra_node_MV2(tmp_buf, nbytes, MPI_BYTE, INTRA_NODE_ROOT, shmem_comm);
      } else {
        mpi_errno = MPIR_Knomial_Bcast_intra_node_MV2(buffer, count, datatype, INTRA_NODE_ROOT, shmem_comm);
      }
    }
  } else {
    /* Flat (single-level) broadcast. */
    if (nbytes <= mv2_bcast_short_msg) {
      mpi_errno = MPIR_Bcast_binomial_MV2(buffer, count, datatype, root, comm);
    } else {
      if (mv2_scatter_rd_inter_leader_bcast) {
        mpi_errno = MPIR_Bcast_scatter_ring_allgather_MV2(buffer, count, datatype, root, comm);
      } else {
        mpi_errno = MPIR_Bcast_scatter_doubling_allgather_MV2(buffer, count, datatype, root, comm);
      }
    }
  }

  /* NOTE(review): tmp_buf is neither unpacked back into 'buffer' nor freed on
   * the !is_contig/!is_homogeneous path -- confirm; that path is currently
   * dead code (both flags are forced to 1 above). */
  return mpi_errno;
}
/* MVAPICH2 k-nomial reduce.
 * Each rank receives the partial results of its children in the k-nomial
 * tree (as computed by MPIR_Reduce_knomial_trace), folds them into recvbuf,
 * and forwards the accumulated value to its parent; the final result lands in
 * recvbuf on 'root'. */
int smpi_coll_tuned_reduce_mvapich2_knomial (
        void *sendbuf,
        void *recvbuf,
        int count,
        MPI_Datatype datatype,
        MPI_Op op,
        int root,
        MPI_Comm comm)
{
  int mpi_errno = MPI_SUCCESS;
  int rank, is_commutative;
  int src, k;
  MPI_Request send_request;
  int index = 0;
  MPI_Aint true_lb, true_extent, extent;
  MPI_Status status;
  int recv_iter = 0, dst = -1, expected_send_count, expected_recv_count;
  int *src_array = NULL;
  void **tmp_buf = NULL;
  MPI_Request *requests = NULL;

  if (count == 0)
    return MPI_SUCCESS;

  rank = smpi_comm_rank(comm);

  /* Create a temporary buffer */
  smpi_datatype_extent(datatype, &true_lb, &true_extent);
  extent = smpi_datatype_get_extent(datatype);

  is_commutative = smpi_op_is_commute(op);

  /* Non-root ranks may have an invalid recvbuf: use a temporary accumulator,
   * shifted by true_lb like the other buffers. */
  if (rank != root) {
    recvbuf = (void *)smpi_get_tmp_recvbuffer(count * (MAX(extent, true_extent)));
    recvbuf = (void *)((char*)recvbuf - true_lb);
  }
  /* Seed the accumulator with my own contribution. */
  if ((rank != root) || (sendbuf != MPI_IN_PLACE)) {
    mpi_errno = smpi_datatype_copy(sendbuf, count, datatype, recvbuf, count, datatype);
  }

  /* Defaults when the algorithm is used directly, without the mvapich2
   * selector having set the knomial factors. */
  if (mv2_reduce_intra_knomial_factor < 0) {
    mv2_reduce_intra_knomial_factor = SMPI_DEFAULT_KNOMIAL_FACTOR;
  }
  if (mv2_reduce_inter_knomial_factor < 0) {
    mv2_reduce_inter_knomial_factor = SMPI_DEFAULT_KNOMIAL_FACTOR;
  }

  /* Compute my parent (dst) and my children (src_array) in the k-nomial tree. */
  MPIR_Reduce_knomial_trace(root, mv2_reduce_intra_knomial_factor, comm,
                            &dst, &expected_send_count, &expected_recv_count, &src_array);

  if (expected_recv_count > 0) {
    tmp_buf = xbt_malloc(sizeof(void *) * expected_recv_count);
    requests = xbt_malloc(sizeof(MPI_Request) * expected_recv_count);
    for (k = 0; k < expected_recv_count; k++) {
      tmp_buf[k] = smpi_get_tmp_sendbuffer(count * (MAX(extent, true_extent)));
      tmp_buf[k] = (void *)((char*)tmp_buf[k] - true_lb);
    }

    /* Post all receives from the children up front... */
    while (recv_iter < expected_recv_count) {
      src = src_array[expected_recv_count - (recv_iter + 1)];
      requests[recv_iter] = smpi_mpi_irecv(tmp_buf[recv_iter], count, datatype, src,
                                           COLL_TAG_REDUCE, comm);
      recv_iter++;
    }
    recv_iter = 0;

    /* ...then fold each child's contribution in completion order. */
    while (recv_iter < expected_recv_count) {
      index = smpi_mpi_waitany(expected_recv_count, requests, &status);
      recv_iter++;
      /* NOTE(review): received data is only reduced when the op is
       * commutative; there is no non-commutative fallback here -- confirm the
       * selector never routes non-commutative ops this way. */
      if (is_commutative) {
        smpi_op_apply(op, tmp_buf[index], recvbuf, &count, &datatype);
      }
    }

    /* NOTE(review): freed via the lb-shifted pointers, unlike recvbuf below
     * which re-adds true_lb first -- confirm this is safe for true_lb != 0. */
    for (k = 0; k < expected_recv_count; k++) {
      smpi_free_tmp_buffer(tmp_buf[k]);
    }
    xbt_free(tmp_buf);
    xbt_free(requests);
  }

  if (src_array != NULL) {
    xbt_free(src_array);
  }

  /* Forward my accumulated value to my parent, then release the temporary
   * accumulator (undoing the true_lb shift before freeing). */
  if (rank != root) {
    send_request = smpi_mpi_isend(recvbuf, count, datatype, dst, COLL_TAG_REDUCE, comm);
    smpi_mpi_waitall(1, &send_request, &status);
    smpi_free_tmp_buffer((void *)((char*)recvbuf + true_lb));
  }

  /* --END ERROR HANDLING-- */

  return mpi_errno;
}
static int MPIR_Reduce_knomial_trace(int root, int reduce_knomial_factor, MPI_Comm comm, int *dst, int *expected_send_count, int *expected_recv_count, int **src_array) { int mask=0x1, k, comm_size, src, rank, relative_rank, lroot=0; int orig_mask=0x1; int recv_iter=0, send_iter=0; int *knomial_reduce_src_array=NULL; comm_size = smpi_comm_size(comm); rank = smpi_comm_rank(comm); lroot = root; relative_rank = (rank - lroot + comm_size) % comm_size; /* First compute to whom we need to send data */ while (mask < comm_size) { if (relative_rank % (reduce_knomial_factor*mask)) { *dst = relative_rank/(reduce_knomial_factor*mask)* (reduce_knomial_factor*mask)+root; if (*dst >= comm_size) { *dst -= comm_size; } send_iter++; break; } mask *= reduce_knomial_factor; } mask /= reduce_knomial_factor; /* Now compute how many children we have in the knomial-tree */ orig_mask = mask; while (mask > 0) { for(k=1;k<reduce_knomial_factor;k++) { if (relative_rank + mask*k < comm_size) { recv_iter++; } } mask /= reduce_knomial_factor; } /* Finally, fill up the src array */ if(recv_iter > 0) { knomial_reduce_src_array = smpi_get_tmp_sendbuffer(sizeof(int)*recv_iter); } mask = orig_mask; recv_iter=0; while (mask > 0) { for(k=1;k<reduce_knomial_factor;k++) { if (relative_rank + mask*k < comm_size) { src = rank + mask*k; if (src >= comm_size) { src -= comm_size; } knomial_reduce_src_array[recv_iter++] = src; } } mask /= reduce_knomial_factor; } *expected_recv_count = recv_iter; *expected_send_count = send_iter; *src_array = knomial_reduce_src_array; return 0; }
/* MPICH recursive-doubling allgatherv.
 * All contributions are gathered contiguously into tmp_buf (displs could make
 * recvbuf non-contiguous) by pairwise exchanges with partner rank^mask,
 * doubling mask each round.
 * NOTE(review): the visible text of this function ends inside its body -- the
 * copy-back from tmp_buf into recvbuf and the return statement are not shown
 * here; confirm against the full file. */
int smpi_coll_tuned_allgatherv_mpich_rdb (
  void *sendbuf,
  int sendcount,
  MPI_Datatype sendtype,
  void *recvbuf,
  int *recvcounts,
  int *displs,
  MPI_Datatype recvtype,
  MPI_Comm comm)
{
  int comm_size, rank, j, i;
  MPI_Status status;
  MPI_Aint recvtype_extent, recvtype_true_extent, recvtype_true_lb;
  int curr_cnt, dst, total_count;
  void *tmp_buf, *tmp_buf_rl;
  int mask, dst_tree_root, my_tree_root, position,
      send_offset, recv_offset, last_recv_cnt=0, nprocs_completed, k,
      offset, tmp_mask, tree_root;

  comm_size = smpi_comm_size(comm);
  rank = smpi_comm_rank(comm);

  total_count = 0;
  for (i=0; i<comm_size; i++)
    total_count += recvcounts[i];

  if (total_count == 0)
    return MPI_ERR_COUNT;

  recvtype_extent=smpi_datatype_get_extent( recvtype);

  /* need to receive contiguously into tmp_buf because
     displs could make the recvbuf noncontiguous */

  smpi_datatype_extent(recvtype, &recvtype_true_lb, &recvtype_true_extent);

  tmp_buf_rl= (void*)smpi_get_tmp_sendbuffer(total_count*(MAX(recvtype_true_extent,recvtype_extent)));

  /* adjust for potential negative lower bound in datatype */
  tmp_buf = (void *)((char*)tmp_buf_rl - recvtype_true_lb);

  /* copy local data into right location in tmp_buf */
  position = 0;
  for (i=0; i<rank; i++)
    position += recvcounts[i];
  if (sendbuf != MPI_IN_PLACE) {
    smpi_datatype_copy(sendbuf, sendcount, sendtype,
                       ((char *)tmp_buf + position* recvtype_extent),
                       recvcounts[rank], recvtype);
  } else {
    /* if in_place specified, local data is found in recvbuf */
    smpi_datatype_copy(((char *)recvbuf + displs[rank]*recvtype_extent),
                       recvcounts[rank], recvtype,
                       ((char *)tmp_buf + position* recvtype_extent),
                       recvcounts[rank], recvtype);
  }
  curr_cnt = recvcounts[rank];

  mask = 0x1;
  i = 0;
  while (mask < comm_size) {
    dst = rank ^ mask;

    /* find offset into send and recv buffers. zero out
       the least significant "i" bits of rank and dst to
       find root of src and dst subtrees. Use ranks of
       roots as index to send from and recv into buffer */

    dst_tree_root = dst >> i;
    dst_tree_root <<= i;

    my_tree_root = rank >> i;
    my_tree_root <<= i;

    if (dst < comm_size) {
      /* Offsets are element counts summed over the subtree roots' blocks. */
      send_offset = 0;
      for (j=0; j<my_tree_root; j++)
        send_offset += recvcounts[j];

      recv_offset = 0;
      for (j=0; j<dst_tree_root; j++)
        recv_offset += recvcounts[j];

      smpi_mpi_sendrecv(((char *)tmp_buf + send_offset * recvtype_extent),
                        curr_cnt, recvtype, dst,
                        COLL_TAG_ALLGATHERV,
                        ((char *)tmp_buf + recv_offset * recvtype_extent),
                        total_count - recv_offset, recvtype, dst,
                        COLL_TAG_ALLGATHERV,
                        comm, &status);
      /* for convenience, recv is posted for a bigger amount
         than will be sent */
      last_recv_cnt=smpi_mpi_get_count(&status, recvtype);
      curr_cnt += last_recv_cnt;
    }

    /* if some processes in this process's subtree in this step
       did not have any destination process to communicate with
       because of non-power-of-two, we need to send them the
       data that they would normally have received from those
       processes. That is, the haves in this subtree must send to
       the havenots. We use a logarithmic recursive-halfing algorithm
       for this. */

    /* This part of the code will not currently be
       executed because we are not using recursive
       doubling for non power of two. Mark it as experimental
       so that it doesn't show up as red in the coverage tests. */

    /* --BEGIN EXPERIMENTAL-- */
    if (dst_tree_root + mask > comm_size) {
      nprocs_completed = comm_size - my_tree_root - mask;
      /* nprocs_completed is the number of processes in this
         subtree that have all the data. Send data to others
         in a tree fashion. First find root of current tree
         that is being divided into two. k is the number of
         least-significant bits in this process's rank that
         must be zeroed out to find the rank of the root */
      j = mask;
      k = 0;
      while (j) {
        j >>= 1;
        k++;
      }
      k--;

      tmp_mask = mask >> 1;

      while (tmp_mask) {
        dst = rank ^ tmp_mask;

        tree_root = rank >> k;
        tree_root <<= k;

        /* send only if this proc has data and destination
           doesn't have data. at any step, multiple processes
           can send if they have the data */
        if ((dst > rank)
            && (rank < tree_root + nprocs_completed)
            && (dst >= tree_root + nprocs_completed)) {
          offset = 0;
          for (j=0; j<(my_tree_root+mask); j++)
            offset += recvcounts[j];
          offset *= recvtype_extent;

          smpi_mpi_send(((char *)tmp_buf + offset),
                        last_recv_cnt,
                        recvtype, dst,
                        COLL_TAG_ALLGATHERV, comm);
          /* last_recv_cnt was set in the previous
             receive. that's the amount of data to be
             sent now. */
        }
        /* recv only if this proc. doesn't have data and sender
           has data */
        else if ((dst < rank)
                 && (dst < tree_root + nprocs_completed)
                 && (rank >= tree_root + nprocs_completed)) {
          offset = 0;
          for (j=0; j<(my_tree_root+mask); j++)
            offset += recvcounts[j];

          smpi_mpi_recv(((char *)tmp_buf + offset * recvtype_extent),
                        total_count - offset, recvtype,
                        dst, COLL_TAG_ALLGATHERV,
                        comm, &status);
          /* for convenience, recv is posted for a
             bigger amount than will be sent */
          last_recv_cnt=smpi_mpi_get_count(&status, recvtype);
          curr_cnt += last_recv_cnt;
        }
        tmp_mask >>= 1;
        k--;
      }
    }
    /* --END EXPERIMENTAL-- */

    mask <<= 1;
    i++;
  }
/* Rabenseifner-style allreduce: reduce-scatter (recursive halving) followed
 * by an allgather (recursive doubling), with a pre-phase that folds the
 * processes in excess of the nearest power of two into their odd neighbours.
 * NOTE(review): the visible text of this function ends inside its body -- the
 * final redistribution to the excluded even ranks, the freeing of tmp_buf and
 * the return statement are not shown here; confirm against the full file. */
int smpi_coll_tuned_allreduce_rab_rdb(void *sbuff, void *rbuff, int count,
                                      MPI_Datatype dtype, MPI_Op op, MPI_Comm comm)
{
  int tag = COLL_TAG_ALLREDUCE;
  unsigned int mask, pof2;
  int dst, newrank, rem, newdst, i,
      send_idx, recv_idx, last_idx, send_cnt, recv_cnt, *cnts, *disps;
  MPI_Aint extent;
  MPI_Status status;
  void *tmp_buf = NULL;

  unsigned int nprocs = smpi_comm_size(comm);
  unsigned int rank = smpi_comm_rank(comm);

  extent = smpi_datatype_get_extent(dtype);
  tmp_buf = (void *) smpi_get_tmp_sendbuffer(count * extent);

  /* Start accumulating in rbuff from my own contribution. */
  smpi_datatype_copy(sbuff, count, dtype, rbuff, count, dtype);

  // find nearest power-of-two less than or equal to comm_size
  pof2 = 1;
  while (pof2 <= nprocs)
    pof2 <<= 1;
  pof2 >>= 1;

  rem = nprocs - pof2;

  // In the non-power-of-two case, all even-numbered
  // processes of rank < 2*rem send their data to
  // (rank+1). These even-numbered processes no longer
  // participate in the algorithm until the very end. The
  // remaining processes form a nice power-of-two.

  if (rank < 2 * rem) {
    // even
    if (rank % 2 == 0) {
      smpi_mpi_send(rbuff, count, dtype, rank + 1, tag, comm);

      // temporarily set the rank to -1 so that this
      // process does not pariticipate in recursive
      // doubling
      newrank = -1;
    } else                      // odd
    {
      smpi_mpi_recv(tmp_buf, count, dtype, rank - 1, tag, comm, &status);
      // do the reduction on received data. since the
      // ordering is right, it doesn't matter whether
      // the operation is commutative or not.
      smpi_op_apply(op, tmp_buf, rbuff, &count, &dtype);

      // change the rank
      newrank = rank / 2;
    }
  } else                        // rank >= 2 * rem
    newrank = rank - rem;

  // If op is user-defined or count is less than pof2, use
  // recursive doubling algorithm. Otherwise do a reduce-scatter
  // followed by allgather. (If op is user-defined,
  // derived datatypes are allowed and the user could pass basic
  // datatypes on one process and derived on another as long as
  // the type maps are the same. Breaking up derived
  // datatypes to do the reduce-scatter is tricky, therefore
  // using recursive doubling in that case.)

  if (newrank != -1) {
    // do a reduce-scatter followed by allgather. for the
    // reduce-scatter, calculate the count that each process receives
    // and the displacement within the buffer
    cnts = (int *) xbt_malloc(pof2 * sizeof(int));
    disps = (int *) xbt_malloc(pof2 * sizeof(int));

    /* Evenly split 'count' over the pof2 participants; the last block picks
     * up the remainder. */
    for (i = 0; i < (pof2 - 1); i++)
      cnts[i] = count / pof2;
    cnts[pof2 - 1] = count - (count / pof2) * (pof2 - 1);

    disps[0] = 0;
    for (i = 1; i < pof2; i++)
      disps[i] = disps[i - 1] + cnts[i - 1];

    mask = 0x1;
    send_idx = recv_idx = 0;
    last_idx = pof2;
    /* Reduce-scatter by recursive halving: at each round exchange the half
     * of the active window with partner newrank^mask and reduce the received
     * half into rbuff. */
    while (mask < pof2) {
      newdst = newrank ^ mask;

      // find real rank of dest
      dst = (newdst < rem) ? newdst * 2 + 1 : newdst + rem;

      send_cnt = recv_cnt = 0;
      if (newrank < newdst) {
        send_idx = recv_idx + pof2 / (mask * 2);
        for (i = send_idx; i < last_idx; i++)
          send_cnt += cnts[i];
        for (i = recv_idx; i < send_idx; i++)
          recv_cnt += cnts[i];
      } else {
        recv_idx = send_idx + pof2 / (mask * 2);
        for (i = send_idx; i < recv_idx; i++)
          send_cnt += cnts[i];
        for (i = recv_idx; i < last_idx; i++)
          recv_cnt += cnts[i];
      }

      // Send data from recvbuf. Recv into tmp_buf
      smpi_mpi_sendrecv((char *) rbuff + disps[send_idx] * extent, send_cnt,
                        dtype, dst, tag,
                        (char *) tmp_buf + disps[recv_idx] * extent, recv_cnt,
                        dtype, dst, tag, comm, &status);

      // tmp_buf contains data received in this step.
      // recvbuf contains data accumulated so far

      // This algorithm is used only for predefined ops
      // and predefined ops are always commutative.
      smpi_op_apply(op, (char *) tmp_buf + disps[recv_idx] * extent,
                    (char *) rbuff + disps[recv_idx] * extent, &recv_cnt, &dtype);

      // update send_idx for next iteration
      send_idx = recv_idx;
      mask <<= 1;

      // update last_idx, but not in last iteration because the value
      // is needed in the allgather step below.
      if (mask < pof2)
        last_idx = recv_idx + pof2 / mask;
    }

    // now do the allgather

    mask >>= 1;
    /* Allgather by recursive doubling: each round exchanges the owned blocks
     * with partner newrank^mask, growing the region of final results. */
    while (mask > 0) {
      newdst = newrank ^ mask;

      // find real rank of dest
      dst = (newdst < rem) ? newdst * 2 + 1 : newdst + rem;

      send_cnt = recv_cnt = 0;
      if (newrank < newdst) {
        // update last_idx except on first iteration
        if (mask != pof2 / 2)
          last_idx = last_idx + pof2 / (mask * 2);

        recv_idx = send_idx + pof2 / (mask * 2);
        for (i = send_idx; i < recv_idx; i++)
          send_cnt += cnts[i];
        for (i = recv_idx; i < last_idx; i++)
          recv_cnt += cnts[i];
      } else {
        recv_idx = send_idx - pof2 / (mask * 2);
        for (i = send_idx; i < last_idx; i++)
          send_cnt += cnts[i];
        for (i = recv_idx; i < send_idx; i++)
          recv_cnt += cnts[i];
      }

      smpi_mpi_sendrecv((char *) rbuff + disps[send_idx] * extent, send_cnt,
                        dtype, dst, tag,
                        (char *) rbuff + disps[recv_idx] * extent, recv_cnt,
                        dtype, dst, tag, comm, &status);

      if (newrank > newdst)
        send_idx = recv_idx;

      mask >>= 1;
    }

    free(cnts);
    free(disps);
  }
/* MVAPICH2 two-level (hierarchical) reduce.
 * Phase 1 reduces within each node over the shared-memory communicator (node
 * leaders end up holding their node's partial result in tmp_buf); phase 2
 * reduces across node leaders over the leaders' communicator; finally the
 * result is shipped to 'root' if the leader holding it is not the root.
 * A single-node communicator is handled by a dedicated shortcut branch. */
int Coll_reduce_mvapich2_two_level::reduce( const void *sendbuf,
                                     void *recvbuf,
                                     int count,
                                     MPI_Datatype datatype,
                                     MPI_Op op,
                                     int root,
                                     MPI_Comm comm)
{
  int mpi_errno = MPI_SUCCESS;
  int my_rank, total_size, local_rank, local_size;
  int leader_comm_rank = -1, leader_comm_size = 0;
  MPI_Comm shmem_comm, leader_comm;
  int leader_root, leader_of_root;
  const unsigned char* in_buf = nullptr;
  unsigned char *out_buf = nullptr, *tmp_buf = nullptr;
  MPI_Aint true_lb, true_extent, extent;
  int is_commutative = 0, stride = 0;
  int intra_node_root=0;

  //if not set (use of the algo directly, without mvapich2 selector)
  if (MV2_Reduce_function == NULL)
    MV2_Reduce_function = Coll_reduce_mpich::reduce;
  if (MV2_Reduce_intra_function == NULL)
    MV2_Reduce_intra_function = Coll_reduce_mpich::reduce;

  /* Lazily build the SMP-aware (leaders + intra-node) communicators. */
  if (comm->get_leaders_comm() == MPI_COMM_NULL) {
    comm->init_smp();
  }

  my_rank = comm->rank();
  total_size = comm->size();
  shmem_comm = comm->get_intra_comm();
  local_rank = shmem_comm->rank();
  local_size = shmem_comm->size();

  leader_comm = comm->get_leaders_comm();
  int* leaders_map = comm->get_leaders_map();
  /* Rank of root's node leader, in comm and in the leaders' communicator. */
  leader_of_root = comm->group()->rank(leaders_map[root]);
  leader_root = leader_comm->group()->rank(leaders_map[root]);

  is_commutative = (op==MPI_OP_NULL || op->is_commutative());

  datatype->extent(&true_lb, &true_extent);
  extent = datatype->get_extent();
  stride = count * std::max(extent, true_extent);

  if (local_size == total_size) {
    /* First handle the case where there is only one node */
    if (stride <= MV2_INTRA_SHMEM_REDUCE_MSG && is_commutative == 1) {
      /* The local leader accumulates into a lb-shifted temporary buffer. */
      if (local_rank == 0 ) {
        tmp_buf = smpi_get_tmp_sendbuffer(count * std::max(extent, true_extent));
        tmp_buf = tmp_buf - true_lb;
      }

      if (sendbuf != MPI_IN_PLACE) {
        in_buf = static_cast<const unsigned char*>(sendbuf);
      } else {
        in_buf = static_cast<const unsigned char*>(recvbuf);
      }

      if (local_rank == 0) {
        if (my_rank != root) {
          out_buf = tmp_buf;
        } else {
          out_buf = static_cast<unsigned char*>(recvbuf);
          /* Root reducing in place: signal that with MPI_IN_PLACE to the
           * underlying reduce. */
          if (in_buf == out_buf) {
            in_buf = static_cast<const unsigned char*>(MPI_IN_PLACE);
            out_buf = static_cast<unsigned char*>(recvbuf);
          }
        }
      } else {
        in_buf = static_cast<const unsigned char*>(sendbuf);
        out_buf = nullptr;
      }

      /* Shared-memory flavor when the payload fits the shmem block. */
      if (count * (std::max(extent, true_extent)) < SHMEM_COLL_BLOCK_SIZE) {
        mpi_errno = MPIR_Reduce_shmem_MV2(in_buf, out_buf, count, datatype, op, 0, shmem_comm);
      } else {
        mpi_errno = MPIR_Reduce_intra_knomial_wrapper_MV2(in_buf, out_buf, count, datatype, op, 0, shmem_comm);
      }

      /* If the leader is not the root, forward the result to the root. */
      if (local_rank == 0 && root != my_rank) {
        Request::send(out_buf, count, datatype, root, COLL_TAG_REDUCE+1, comm);
      }
      if ((local_rank != 0) && (root == my_rank)) {
        Request::recv(recvbuf, count, datatype, leader_of_root, COLL_TAG_REDUCE+1, comm, MPI_STATUS_IGNORE);
      }
    } else {
      /* Large or non-commutative: fall back to a flat reduce. */
      if (mv2_use_knomial_reduce == 1) {
        reduce_fn = &MPIR_Reduce_intra_knomial_wrapper_MV2;
      } else {
        reduce_fn = &MPIR_Reduce_binomial_MV2;
      }
      mpi_errno = reduce_fn(sendbuf, recvbuf, count, datatype, op, root, comm);
    }
    /* We are done */
    if (tmp_buf != nullptr)
      smpi_free_tmp_buffer(tmp_buf + true_lb);  /* undo the lb shift before freeing */
    goto fn_exit;
  }

  /* Multi-node case: each node leader prepares a temporary accumulator. */
  if (local_rank == 0) {
    leader_comm = comm->get_leaders_comm();
    if (leader_comm == MPI_COMM_NULL) {
      leader_comm = MPI_COMM_WORLD;
    }
    leader_comm_size = leader_comm->size();
    leader_comm_rank = leader_comm->rank();
    tmp_buf = smpi_get_tmp_sendbuffer(count * std::max(extent, true_extent));
    tmp_buf = tmp_buf - true_lb;
  }
  if (sendbuf != MPI_IN_PLACE) {
    in_buf = static_cast<const unsigned char*>(sendbuf);
  } else {
    in_buf = static_cast<const unsigned char*>(recvbuf);
  }
  if (local_rank == 0) {
    out_buf = static_cast<unsigned char*>(tmp_buf);
  } else {
    out_buf = nullptr;
  }

  if (local_size > 1) {
    /* Lets do the intra-node reduce operations, if we have more than one
     * process in the node */
    /*Fix the input and outbuf buffers for the intra-node reduce.
     *Node leaders will have the reduced data in tmp_buf after
     *this step*/
    if (MV2_Reduce_intra_function == & MPIR_Reduce_shmem_MV2) {
      /* Shmem reduce only applies to commutative ops on payloads fitting the
       * shmem block; otherwise force the k-nomial fallback. */
      if (is_commutative == 1 && (count * (std::max(extent, true_extent)) < SHMEM_COLL_BLOCK_SIZE)) {
        mpi_errno = MV2_Reduce_intra_function(in_buf, out_buf, count, datatype, op, intra_node_root, shmem_comm);
      } else {
        mpi_errno = MPIR_Reduce_intra_knomial_wrapper_MV2(in_buf, out_buf, count, datatype, op, intra_node_root, shmem_comm);
      }
    } else {
      mpi_errno = MV2_Reduce_intra_function(in_buf, out_buf, count, datatype, op, intra_node_root, shmem_comm);
    }
  } else {
    /* Single process on this node: its input is already the node's partial
     * result, so alias tmp_buf to it. */
    smpi_free_tmp_buffer(tmp_buf + true_lb);
    tmp_buf = (unsigned char*)in_buf; // xxx
  }

  /* Now work on the inter-leader phase. Data is in tmp_buf */
  if (local_rank == 0 && leader_comm_size > 1) {
    /*The leader of root will have the global reduced data in tmp_buf
      or recv_buf
      at the end of the reduce */
    if (leader_comm_rank == leader_root) {
      if (my_rank == root) {
        /* I am the root of the leader-comm, and the
         * root of the reduce op. So, I will write the
         * final result directly into my recvbuf */
        if (tmp_buf != recvbuf) {
          in_buf = tmp_buf;
          out_buf = static_cast<unsigned char*>(recvbuf);
        } else {
          /* Reducing in place: snapshot tmp_buf first to keep input and
           * output buffers distinct. */
          unsigned char* buf = smpi_get_tmp_sendbuffer(count * datatype->get_extent());
          Datatype::copy(tmp_buf, count, datatype, buf, count, datatype);
          // in_buf = MPI_IN_PLACE;
          in_buf = buf;
          out_buf = static_cast<unsigned char*>(recvbuf);
        }
      } else {
        unsigned char* buf = smpi_get_tmp_sendbuffer(count * datatype->get_extent());
        Datatype::copy(tmp_buf, count, datatype, buf, count, datatype);
        // in_buf = MPI_IN_PLACE;
        in_buf = buf;
        out_buf = tmp_buf;
      }
    } else {
      in_buf = tmp_buf;
      out_buf = nullptr;
    }

    /* inter-leader communication  */
    mpi_errno = MV2_Reduce_function(in_buf, out_buf, count, datatype, op, leader_root, leader_comm);
  }

  if (local_size > 1) {
    /* Send the message to the root if the leader is not the
     * root of the reduce operation. The reduced data is in tmp_buf */
    if ((local_rank == 0) && (root != my_rank) && (leader_root == leader_comm_rank)) {
      Request::send(tmp_buf, count, datatype, root, COLL_TAG_REDUCE + 1, comm);
    }
    if ((local_rank != 0) && (root == my_rank)) {
      Request::recv(recvbuf, count, datatype, leader_of_root, COLL_TAG_REDUCE + 1, comm, MPI_STATUS_IGNORE);
    }
    smpi_free_tmp_buffer(tmp_buf + true_lb);

    /* Free the snapshot made for the in-place case above. */
    if (leader_comm_rank == leader_root) {
      if (my_rank != root || (my_rank == root && tmp_buf == recvbuf)) {
        smpi_free_tmp_buffer(in_buf);
      }
    }
  }

fn_exit:
  return mpi_errno;
}
/*****************************************************************************
 * Function: alltoall_bruck
 * Return: int
 * Inputs:
    send_buff: send input buffer
    send_count: number of elements to send
    send_type: data type of elements being sent
    recv_buff: receive output buffer
    recv_count: number of elements to received
    recv_type: data type of elements being received
    comm: communicator
 * Descrp: Function realizes the alltoall operation using the bruck algorithm:
    a local rotation, log2(P) rounds of packed exchanges with partners at
    power-of-two distances, then local rotations to put blocks in final order.
 * Author: MPICH / modified by Ahmad Faraj
 ****************************************************************************/
int smpi_coll_tuned_alltoall_bruck(void *send_buff, int send_count,
                                   MPI_Datatype send_type,
                                   void *recv_buff, int recv_count,
                                   MPI_Datatype recv_type, MPI_Comm comm)
{
  MPI_Status status;
  MPI_Aint extent;
  MPI_Datatype new_type;

  int *blocks_length, *disps;
  /* 'remainder' is declared but unused -- kept for source fidelity. */
  int i, src, dst, rank, num_procs, count, remainder, block, position;
  int pack_size, tag = COLL_TAG_ALLTOALL, pof2 = 1;

  char *tmp_buff;
  char *send_ptr = (char *) send_buff;
  char *recv_ptr = (char *) recv_buff;

  num_procs = smpi_comm_size(comm);
  rank = smpi_comm_rank(comm);

  extent = smpi_datatype_get_extent(recv_type);

  tmp_buff = (char *) smpi_get_tmp_sendbuffer(num_procs * recv_count * extent);
  disps = (int *) xbt_malloc(sizeof(int) * num_procs);
  blocks_length = (int *) xbt_malloc(sizeof(int) * num_procs);

  /* Initial local rotation: copy send_buff into recv_buff shifted by 'rank'
   * blocks (done as two self-sendrecvs). */
  smpi_mpi_sendrecv(send_ptr + rank * send_count * extent,
                    (num_procs - rank) * send_count, send_type, rank, tag,
                    recv_ptr, (num_procs - rank) * recv_count, recv_type, rank,
                    tag, comm, &status);

  smpi_mpi_sendrecv(send_ptr, rank * send_count, send_type, rank,
                    tag, recv_ptr + (num_procs - rank) * recv_count * extent,
                    rank * recv_count, recv_type, rank, tag, comm, &status);

  MPI_Pack_size(send_count * num_procs, send_type, comm, &pack_size);

  /* Bruck exchange rounds: in round pof2, every block whose index has the
   * pof2 bit set is packed (via an indexed datatype) and sent to rank+pof2,
   * while the matching blocks are received from rank-pof2. */
  while (pof2 < num_procs) {
    dst = (rank + pof2) % num_procs;
    src = (rank - pof2 + num_procs) % num_procs;

    count = 0;
    for (block = 1; block < num_procs; block++)
      if (block & pof2) {
        blocks_length[count] = send_count;
        disps[count] = block * send_count;
        count++;
      }

    MPI_Type_indexed(count, blocks_length, disps, recv_type, &new_type);
    smpi_datatype_commit(&new_type);

    position = 0;
    MPI_Pack(recv_buff, 1, new_type, tmp_buff, pack_size, &position, comm);

    smpi_mpi_sendrecv(tmp_buff, position, MPI_PACKED, dst, tag, recv_buff, 1,
                      new_type, src, tag, comm, &status);
    smpi_datatype_unuse(new_type);

    pof2 *= 2;
  }

  free(disps);
  free(blocks_length);

  /* Final local rotations (again self-sendrecvs): shift by rank+1 blocks into
   * tmp_buff, then reverse the block order back into recv_buff. */
  smpi_mpi_sendrecv(recv_ptr + (rank + 1) * recv_count * extent,
                    (num_procs - rank - 1) * recv_count, send_type,
                    rank, tag, tmp_buff, (num_procs - rank - 1) * recv_count,
                    recv_type, rank, tag, comm, &status);

  smpi_mpi_sendrecv(recv_ptr, (rank + 1) * recv_count, send_type, rank, tag,
                    tmp_buff + (num_procs - rank - 1) * recv_count * extent,
                    (rank + 1) * recv_count, recv_type, rank, tag, comm, &status);

  for (i = 0; i < num_procs; i++)
    smpi_mpi_sendrecv(tmp_buff + i * recv_count * extent, recv_count, send_type,
                      rank, tag,
                      recv_ptr + (num_procs - i - 1) * recv_count * extent,
                      recv_count, recv_type, rank, tag, comm, &status);

  smpi_free_tmp_buffer(tmp_buff);
  return MPI_SUCCESS;
}
/* Segmented-ring allreduce (ported from Open MPI coll/tuned).
 * The buffer is split into 'size' blocks; each block is further split into
 * num_phases segments of at most ~1MB so that reduction and communication
 * can be pipelined with two alternating receive buffers.
 * Falls back to the plain logical-ring allreduce for small counts.
 * Returns MPI_SUCCESS, or a negative/err code via the error handler. */
int smpi_coll_tuned_allreduce_ompi_ring_segmented(void *sbuf, void *rbuf, int count,
                                                  MPI_Datatype dtype,
                                                  MPI_Op op,
                                                  MPI_Comm comm)
{
  int ret = MPI_SUCCESS;
  int line;
  int k, recv_from, send_to;
  int early_blockcount, late_blockcount, split_rank;
  int segcount, max_segcount;
  int num_phases, phase;
  int block_count;
  unsigned int inbi;              /* index (0/1) of the in-flight receive buffer */
  size_t typelng;
  char *tmpsend = NULL, *tmprecv = NULL;
  char *inbuf[2] = {NULL, NULL};  /* double-buffered receive staging areas */
  ptrdiff_t true_extent, extent;
  ptrdiff_t block_offset, max_real_segsize;
  MPI_Request reqs[2] = {NULL, NULL};
  const size_t segsize = 1 << 20; /* 1 MB */
  unsigned int size = smpi_comm_size(comm);
  unsigned int rank = smpi_comm_rank(comm);

  XBT_DEBUG("coll:tuned:allreduce_intra_ring_segmented rank %d, count %d", rank, count);

  /* Special case for size == 1 */
  if (1 == size) {
    if (MPI_IN_PLACE != sbuf) {
      ret= smpi_datatype_copy(sbuf, count, dtype,rbuf, count, dtype);
      if (ret < 0) { line = __LINE__; goto error_hndl; }
    }
    return MPI_SUCCESS;
  }

  /* Determine segment count based on the suggested segment size */
  extent = smpi_datatype_get_extent(dtype);
  /* NOTE(review): these 'ret' checks are dead — the calls above do not set
   * 'ret'; kept from the Open MPI original where they did. */
  if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; }
  /* NOTE(review): true_extent is taken from the same plain get_extent as
   * 'extent' (no true-lb variant) — confirm this is intended. */
  true_extent = smpi_datatype_get_extent(dtype);
  if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; }
  typelng = smpi_datatype_size(dtype);
  if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; }
  segcount = count;
  COLL_TUNED_COMPUTED_SEGCOUNT(segsize, typelng, segcount)

  /* Special case for count less than size * segcount - use regular ring */
  if (count < size * segcount) {
    XBT_DEBUG("coll:tuned:allreduce_ring_segmented rank %d/%d, count %d, switching to regular ring",
              rank, size, count);
    return (smpi_coll_tuned_allreduce_lr(sbuf, rbuf, count, dtype, op, comm));
  }

  /* Determine the number of phases of the algorithm */
  num_phases = count / (size * segcount);
  if ((count % (size * segcount) >= size) &&
      (count % (size * segcount) > ((size * segcount) / 2))) {
    num_phases++;
  }

  /* Determine the number of elements per block and corresponding block sizes.
     The blocks are divided into "early" and "late" ones: blocks 0 .. (split_rank - 1)
     are "early" and blocks (split_rank) .. (size - 1) are "late". Early blocks
     are at most 1 element larger than the late ones. Note, these blocks will
     be split into num_phases segments, out of the largest one will have
     max_segcount elements. */
  COLL_TUNED_COMPUTE_BLOCKCOUNT( count, size, split_rank, early_blockcount, late_blockcount )
  COLL_TUNED_COMPUTE_BLOCKCOUNT( early_blockcount, num_phases, inbi, max_segcount, k)
  max_real_segsize = true_extent + (max_segcount - 1) * extent;

  /* Allocate and initialize temporary buffers */
  inbuf[0] = (char*)smpi_get_tmp_sendbuffer(max_real_segsize);
  if (NULL == inbuf[0]) { ret = -1; line = __LINE__; goto error_hndl; }
  if (size > 2) {
    inbuf[1] = (char*)smpi_get_tmp_recvbuffer(max_real_segsize);
    if (NULL == inbuf[1]) { ret = -1; line = __LINE__; goto error_hndl; }
  }

  /* Handle MPI_IN_PLACE */
  if (MPI_IN_PLACE != sbuf) {
    ret= smpi_datatype_copy(sbuf, count, dtype,rbuf, count, dtype);
    if (ret < 0) { line = __LINE__; goto error_hndl; }
  }

  /* Computation loop: for each phase, repeat ring allreduce computation loop */
  for (phase = 0; phase < num_phases; phase ++) {
    ptrdiff_t phase_offset;
    int early_phase_segcount, late_phase_segcount, split_phase, phase_count;

    /* For each of the remote nodes:
       - post irecv for block (r-1)
       - send block (r)
         To do this, first compute block offset and count, and use block offset
         to compute phase offset.
       - in loop for every step k = 2 .. n
         - post irecv for block (r + n - k) % n
         - wait on block (r + n - k + 1) % n to arrive
         - compute on block (r + n - k + 1) % n
         - send block (r + n - k + 1) % n
       - wait on block (r + 1)
       - compute on block (r + 1)
       - send block (r + 1) to rank (r + 1)
       Note that we must be careful when computing the begining of buffers and
       for send operations and computation we must compute the exact block
       size. */
    send_to = (rank + 1) % size;
    recv_from = (rank + size - 1) % size;

    inbi = 0;
    /* Initialize first receive from the neighbor on the left */
    reqs[inbi] = smpi_mpi_irecv(inbuf[inbi], max_segcount, dtype, recv_from, 666, comm);
    /* Send first block (my block) to the neighbor on the right:
       - compute my block and phase offset
       - send data */
    block_offset = ((rank < split_rank)?
                    (rank * early_blockcount) :
                    (rank * late_blockcount + split_rank));
    block_count = ((rank < split_rank)? early_blockcount : late_blockcount);
    COLL_TUNED_COMPUTE_BLOCKCOUNT(block_count, num_phases, split_phase,
                                  early_phase_segcount, late_phase_segcount)
    phase_count = ((phase < split_phase)? (early_phase_segcount) : (late_phase_segcount));
    phase_offset = ((phase < split_phase)?
                    (phase * early_phase_segcount) :
                    (phase * late_phase_segcount + split_phase));
    tmpsend = ((char*)rbuf) + (block_offset + phase_offset) * extent;
    smpi_mpi_send(tmpsend, phase_count, dtype, send_to, 666, comm);

    for (k = 2; k < size; k++) {
      const int prevblock = (rank + size - k + 1) % size;

      inbi = inbi ^ 0x1;   /* flip between the two staging buffers */

      /* Post irecv for the current block */
      reqs[inbi] = smpi_mpi_irecv(inbuf[inbi], max_segcount, dtype, recv_from, 666, comm);
      if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; }

      /* Wait on previous block to arrive */
      smpi_mpi_wait(&reqs[inbi ^ 0x1], MPI_STATUS_IGNORE);

      /* Apply operation on previous block: result goes to rbuf
         rbuf[prevblock] = inbuf[inbi ^ 0x1] (op) rbuf[prevblock] */
      block_offset = ((prevblock < split_rank)?
                      (prevblock * early_blockcount) :
                      (prevblock * late_blockcount + split_rank));
      block_count = ((prevblock < split_rank)? early_blockcount : late_blockcount);
      COLL_TUNED_COMPUTE_BLOCKCOUNT(block_count, num_phases, split_phase,
                                    early_phase_segcount, late_phase_segcount)
      phase_count = ((phase < split_phase)? (early_phase_segcount) : (late_phase_segcount));
      phase_offset = ((phase < split_phase)?
                      (phase * early_phase_segcount) :
                      (phase * late_phase_segcount + split_phase));
      tmprecv = ((char*)rbuf) + (block_offset + phase_offset) * extent;
      smpi_op_apply(op, inbuf[inbi ^ 0x1], tmprecv, &phase_count, &dtype);

      /* send previous block to send_to */
      smpi_mpi_send(tmprecv, phase_count, dtype, send_to, 666, comm);
    }

    /* Wait on the last block to arrive */
    smpi_mpi_wait(&reqs[inbi], MPI_STATUS_IGNORE);

    /* Apply operation on the last block (from neighbor (rank + 1)
       rbuf[rank+1] = inbuf[inbi] (op) rbuf[rank + 1] */
    recv_from = (rank + 1) % size;
    block_offset = ((recv_from < split_rank)?
                    (recv_from * early_blockcount) :
                    (recv_from * late_blockcount + split_rank));
    block_count = ((recv_from < split_rank)? early_blockcount : late_blockcount);
    COLL_TUNED_COMPUTE_BLOCKCOUNT(block_count, num_phases, split_phase,
                                  early_phase_segcount, late_phase_segcount)
    phase_count = ((phase < split_phase)? (early_phase_segcount) : (late_phase_segcount));
    phase_offset = ((phase < split_phase)?
                    (phase * early_phase_segcount) :
                    (phase * late_phase_segcount + split_phase));
    tmprecv = ((char*)rbuf) + (block_offset + phase_offset) * extent;
    smpi_op_apply(op, inbuf[inbi], tmprecv, &phase_count, &dtype);
  }

  /* Distribution loop - variation of ring allgather */
  send_to = (rank + 1) % size;
  recv_from = (rank + size - 1) % size;
  for (k = 0; k < size - 1; k++) {
    const int recv_data_from = (rank + size - k) % size;
    const int send_data_from = (rank + 1 + size - k) % size;
    const int send_block_offset = ((send_data_from < split_rank)?
                                   (send_data_from * early_blockcount) :
                                   (send_data_from * late_blockcount + split_rank));
    const int recv_block_offset = ((recv_data_from < split_rank)?
                                   (recv_data_from * early_blockcount) :
                                   (recv_data_from * late_blockcount + split_rank));
    block_count = ((send_data_from < split_rank)? early_blockcount : late_blockcount);

    tmprecv = (char*)rbuf + recv_block_offset * extent;
    tmpsend = (char*)rbuf + send_block_offset * extent;

    /* NOTE(review): the receive side always posts 'early_blockcount' even
     * when the incoming block is a (smaller) late block — an over-posted
     * count; confirm against the Open MPI original. */
    smpi_mpi_sendrecv(tmpsend, block_count, dtype, send_to, 666,
                      tmprecv, early_blockcount, dtype, recv_from, 666,
                      comm, MPI_STATUS_IGNORE);
  }

  if (NULL != inbuf[0]) smpi_free_tmp_buffer(inbuf[0]);
  if (NULL != inbuf[1]) smpi_free_tmp_buffer(inbuf[1]);

  return MPI_SUCCESS;

error_hndl:
  XBT_DEBUG("%s:%4d\tRank %d Error occurred %d\n", __FILE__, line, rank, ret);
  if (NULL != inbuf[0]) smpi_free_tmp_buffer(inbuf[0]);
  if (NULL != inbuf[1]) smpi_free_tmp_buffer(inbuf[1]);
  return ret;
}
int smpi_coll_tuned_reduce_binomial(void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, MPI_Op op, int root, MPI_Comm comm) { MPI_Status status; int comm_size, rank; int mask, relrank, source; int dst; int tag = COLL_TAG_REDUCE; MPI_Aint extent; void *tmp_buf; MPI_Aint true_lb, true_extent; if (count == 0) return 0; rank = smpi_comm_rank(comm); comm_size = smpi_comm_size(comm); extent = smpi_datatype_get_extent(datatype); tmp_buf = (void *) smpi_get_tmp_sendbuffer(count * extent); int is_commutative = smpi_op_is_commute(op); mask = 1; int lroot; if (is_commutative) lroot = root; else lroot = 0; relrank = (rank - lroot + comm_size) % comm_size; smpi_datatype_extent(datatype, &true_lb, &true_extent); /* adjust for potential negative lower bound in datatype */ tmp_buf = (void *)((char*)tmp_buf - true_lb); /* If I'm not the root, then my recvbuf may not be valid, therefore I have to allocate a temporary one */ if (rank != root) { recvbuf = (void *) smpi_get_tmp_recvbuffer(count*(max(extent,true_extent))); recvbuf = (void *)((char*)recvbuf - true_lb); } if ((rank != root) || (sendbuf != MPI_IN_PLACE)) { smpi_datatype_copy(sendbuf, count, datatype, recvbuf,count, datatype); } while (mask < comm_size) { /* Receive */ if ((mask & relrank) == 0) { source = (relrank | mask); if (source < comm_size) { source = (source + lroot) % comm_size; smpi_mpi_recv(tmp_buf, count, datatype, source, tag, comm, &status); if (is_commutative) { smpi_op_apply(op, tmp_buf, recvbuf, &count, &datatype); } else { smpi_op_apply(op, recvbuf, tmp_buf, &count, &datatype); smpi_datatype_copy(tmp_buf, count, datatype,recvbuf, count, datatype); } } } else { dst = ((relrank & (~mask)) + lroot) % comm_size; smpi_mpi_send(recvbuf, count, datatype, dst, tag, comm); break; } mask <<= 1; } if (!is_commutative && (root != 0)){ if (rank == 0){ smpi_mpi_send(recvbuf, count, datatype, root,tag, comm); }else if (rank == root){ smpi_mpi_recv(recvbuf, count, datatype, 0, tag, comm, &status); } } if (rank 
!= root) { smpi_free_tmp_buffer(recvbuf); } smpi_free_tmp_buffer(tmp_buf); return 0; }
/* Alltoall over a logical 2D mesh (X rows x Y columns of ranks).
 * Phase 1: every rank broadcasts its entire send buffer within its row and
 *          collects the row peers' buffers in tmp_buff1.
 * Phase 2: each rank extracts, from the row-collected data, the blocks
 *          destined to itself into recv_buff (self-sendrecvs = local copies).
 * Phase 3: column exchange — each rank forwards to every column peer the Y
 *          blocks (staged through tmp_buff2) that belong to that peer's row.
 * Returns MPI_SUCCESS, or MPI_ERR_OTHER if num_procs has no 2D factorization
 * accepted by alltoall_check_is_2dmesh. */
int Coll_alltoall_2dmesh::alltoall(void *send_buff, int send_count,
                                   MPI_Datatype send_type,
                                   void *recv_buff, int recv_count,
                                   MPI_Datatype recv_type, MPI_Comm comm)
{
  MPI_Status *statuses, s;
  MPI_Request *reqs, *req_ptr;;
  MPI_Aint extent;

  char *tmp_buff1, *tmp_buff2;
  int i, j, src, dst, rank, num_procs, count, num_reqs;
  int X, Y, send_offset, recv_offset;
  int my_row_base, my_col_base, src_row_base, block_size;
  int tag = COLL_TAG_ALLTOALL;

  rank = comm->rank();
  num_procs = comm->size();
  extent = send_type->get_extent();

  if (not alltoall_check_is_2dmesh(num_procs, &X, &Y))
    return MPI_ERR_OTHER;

  my_row_base = (rank / Y) * Y;   /* first rank of my mesh row */
  my_col_base = rank % Y;         /* my column index */

  block_size = extent * send_count;

  tmp_buff1 = (char *) smpi_get_tmp_sendbuffer(block_size * num_procs * Y);
  tmp_buff2 = (char *) smpi_get_tmp_recvbuffer(block_size * Y);

  /* One request array sized for the larger of the two phases. */
  num_reqs = X;
  if (Y > X)
    num_reqs = Y;

  statuses = (MPI_Status *) xbt_malloc(num_reqs * sizeof(MPI_Status));
  reqs = (MPI_Request *) xbt_malloc(num_reqs * sizeof(MPI_Request));

  req_ptr = reqs;

  count = send_count * num_procs;   /* a full send buffer per row peer */

  /* Phase 1: post receives for the full buffers of the Y-1 row peers... */
  for (i = 0; i < Y; i++) {
    src = i + my_row_base;
    if (src == rank)
      continue;
    recv_offset = (src % Y) * block_size * num_procs;
    *(req_ptr++) = Request::irecv(tmp_buff1 + recv_offset, count, recv_type, src, tag, comm);
  }

  /* ...and send my full buffer to each of them. */
  for (i = 0; i < Y; i++) {
    dst = i + my_row_base;
    if (dst == rank)
      continue;
    Request::send(send_buff, count, send_type, dst, tag, comm);
  }

  Request::waitall(Y - 1, reqs, statuses);

  req_ptr = reqs;

  /* Phase 2: local extraction — copy the blocks addressed to my row (and to
   * me in particular) from tmp_buff1 / send_buff into recv_buff.
   * Self-sendrecv is SMPI's idiom for a typed local copy. */
  for (i = 0; i < Y; i++) {
    send_offset = (rank * block_size) + (i * block_size * num_procs);
    recv_offset = (my_row_base * block_size) + (i * block_size);

    if (i + my_row_base == rank)
      Request::sendrecv((char *) send_buff + recv_offset, send_count, send_type,
                        rank, tag,
                        (char *) recv_buff + recv_offset, recv_count, recv_type,
                        rank, tag, comm, &s);
    else
      Request::sendrecv(tmp_buff1 + send_offset, send_count, send_type,
                        rank, tag,
                        (char *) recv_buff + recv_offset, recv_count, recv_type,
                        rank, tag, comm, &s);
  }

  /* Phase 3: post receives for the aggregated row-data from each column peer. */
  for (i = 0; i < X; i++) {
    src = (i * Y + my_col_base);
    if (src == rank)
      continue;
    src_row_base = (src / Y) * Y;
    *(req_ptr++) = Request::irecv((char *) recv_buff + src_row_base * block_size,
                                  recv_count * Y, recv_type, src, tag, comm);
  }

  /* For every column peer, gather the Y blocks it needs into tmp_buff2 and
   * ship them in one message. */
  for (i = 0; i < X; i++) {
    dst = (i * Y + my_col_base);
    if (dst == rank)
      continue;
    recv_offset = 0;
    for (j = 0; j < Y; j++) {
      send_offset = (dst + j * num_procs) * block_size;

      if (j + my_row_base == rank)
        Request::sendrecv((char *) send_buff + dst * block_size, send_count,
                          send_type, rank, tag,
                          tmp_buff2 + recv_offset, recv_count, recv_type,
                          rank, tag, comm, &s);
      else
        Request::sendrecv(tmp_buff1 + send_offset, send_count, send_type,
                          rank, tag,
                          tmp_buff2 + recv_offset, recv_count, recv_type,
                          rank, tag, comm, &s);

      recv_offset += block_size;
    }
    Request::send(tmp_buff2, send_count * Y, send_type, dst, tag, comm);
  }
  Request::waitall(X - 1, reqs, statuses);
  free(reqs);
  free(statuses);
  smpi_free_tmp_buffer(tmp_buff1);
  smpi_free_tmp_buffer(tmp_buff2);
  return MPI_SUCCESS;
}
/* Gather along an in-order binomial tree (ported from Open MPI coll/tuned).
 * Non-leaf ranks stage their own data plus their children's contributions in
 * a temporary buffer and forward the aggregate to their parent; the root
 * finally rotates the data into natural rank order when root != 0.
 * Returns MPI_SUCCESS or an error code via err_hndl. */
int Coll_gather_ompi_binomial::gather(void* sbuf, int scount, MPI_Datatype sdtype, void* rbuf, int rcount,
                                      MPI_Datatype rdtype, int root, MPI_Comm comm)
{
  int line = -1;
  int i;
  int rank;
  int vrank;       /* rank rotated so the tree root is virtual rank 0 */
  int size;
  int total_recv = 0;
  char *ptmp = NULL;     /* working buffer actually used for recv/send */
  char *tempbuf = NULL;  /* owned temporary allocation, if any */
  int err;
  ompi_coll_tree_t* bmtree;
  MPI_Status status;
  MPI_Aint sextent, slb, strue_lb, strue_extent;
  MPI_Aint rextent, rlb, rtrue_lb, rtrue_extent;

  size = comm->size();
  rank = comm->rank();

  XBT_DEBUG("smpi_coll_tuned_gather_ompi_binomial rank %d", rank);

  /* create the binomial tree */
  // COLL_TUNED_UPDATE_IN_ORDER_BMTREE( comm, tuned_module, root );
  bmtree = ompi_coll_tuned_topo_build_in_order_bmtree(comm, root);
  // data->cached_in_order_bmtree;

  /* NOTE(review): the "true" lb/extent are obtained with the same extent()
   * call as the plain ones (no true-extent variant) — confirm intended. */
  sdtype->extent(&slb, &sextent);
  sdtype->extent(&strue_lb, &strue_extent);

  vrank = (rank - root + size) % size;

  if (rank == root) {
    rdtype->extent(&rlb, &rextent);
    rdtype->extent(&rtrue_lb, &rtrue_extent);
    if (0 == root) {
      /* root on 0, just use the recv buffer */
      ptmp = (char*)rbuf;
      if (sbuf != MPI_IN_PLACE) {
        err = Datatype::copy(sbuf, scount, sdtype, ptmp, rcount, rdtype);
        if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
      }
    } else {
      /* root is not on 0, allocate temp buffer for recv,
       * rotate data at the end */
      tempbuf = (char*)smpi_get_tmp_recvbuffer(rtrue_extent + (rcount * size - 1) * rextent);
      if (NULL == tempbuf) { err = MPI_ERR_OTHER; line = __LINE__; goto err_hndl; }

      ptmp = tempbuf - rlb;   /* compensate a possibly negative lower bound */
      if (sbuf != MPI_IN_PLACE) {
        /* copy from sbuf to temp buffer */
        err = Datatype::copy(sbuf, scount, sdtype, ptmp, rcount, rdtype);
        if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
      } else {
        /* copy from rbuf to temp buffer */
        err = Datatype::copy((char*)rbuf + rank * rextent * rcount, rcount, rdtype, ptmp, rcount, rdtype);
        if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
      }
    }
    total_recv = rcount;
  } else if (!(vrank % 2)) {
    /* other non-leaf nodes, allocate temp buffer for data received from
     * children, the most we need is half of the total data elements due
     * to the property of binimoal tree */
    tempbuf = (char*)smpi_get_tmp_sendbuffer(strue_extent + (scount * size - 1) * sextent);
    if (NULL == tempbuf) { err = MPI_ERR_OTHER; line = __LINE__; goto err_hndl; }

    ptmp = tempbuf - slb;
    /* local copy to tempbuf */
    err = Datatype::copy(sbuf, scount, sdtype, ptmp, scount, sdtype);
    if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }

    /* use sdtype,scount as rdtype,rdcount since they are ignored on
     * non-root procs */
    rdtype = sdtype;
    rcount = scount;
    rextent = sextent;
    total_recv = rcount;
  } else {
    /* leaf nodes, no temp buffer needed, use sdtype,scount as
     * rdtype,rdcount since they are ignored on non-root procs */
    ptmp = (char*)sbuf;
    total_recv = scount;
  }

  if (!(vrank % 2)) {
    /* all non-leaf nodes recv from children */
    for (i = 0; i < bmtree->tree_nextsize; i++) {
      int mycount = 0, vkid;
      /* figure out how much data I have to send to this child */
      vkid = (bmtree->tree_next[i] - root + size) % size;
      mycount = vkid - vrank;
      if (mycount > (size - vkid))
        mycount = size - vkid;
      mycount *= rcount;

      XBT_DEBUG("smpi_coll_tuned_gather_ompi_binomial rank %d recv %d mycount = %d", rank,
                bmtree->tree_next[i], mycount);

      Request::recv(ptmp + total_recv * rextent, mycount, rdtype, bmtree->tree_next[i], COLL_TAG_GATHER,
                    comm, &status);

      total_recv += mycount;
    }
  }

  if (rank != root) {
    /* all nodes except root send to parents */
    XBT_DEBUG("smpi_coll_tuned_gather_ompi_binomial rank %d send %d count %d\n", rank, bmtree->tree_prev,
              total_recv);

    Request::send(ptmp, total_recv, sdtype, bmtree->tree_prev, COLL_TAG_GATHER, comm);
  }

  if (rank == root) {
    if (root != 0) {
      /* rotate received data on root if root != 0 */
      err = Datatype::copy(ptmp, rcount * (size - root), rdtype,
                           (char*)rbuf + rextent * root * rcount, rcount * (size - root), rdtype);
      if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }

      err = Datatype::copy(ptmp + rextent * rcount * (size - root), rcount * root, rdtype,
                           (char*)rbuf, rcount * root, rdtype);
      if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }

      smpi_free_tmp_buffer(tempbuf);
    }
  } else if (!(vrank % 2)) {
    /* other non-leaf nodes */
    smpi_free_tmp_buffer(tempbuf);
  }
  ompi_coll_tuned_topo_destroy_tree(&bmtree);
  return MPI_SUCCESS;

err_hndl:
  if (NULL != tempbuf)
    smpi_free_tmp_buffer(tempbuf);

  XBT_DEBUG("%s:%4d\tError occurred %d, rank %2d", __FILE__, line, err, rank);
  return err;
}
/* Two-level (SMP-aware) reduce ported from MVAPICH2:
 * 1) reduce inside each node to the node leader (shmem or k-nomial variant),
 * 2) reduce across node leaders to the leader of the root's node,
 * 3) if that leader is not the root itself, ship the result to the root.
 * tmp_buf is shifted by -true_lb after allocation and freed at '+ true_lb'.
 * Returns the mpi_errno of the last underlying operation. */
int smpi_coll_tuned_reduce_mvapich2_two_level( void *sendbuf,
                                     void *recvbuf,
                                     int count,
                                     MPI_Datatype datatype,
                                     MPI_Op op,
                                     int root,
                                     MPI_Comm comm)
{
  int mpi_errno = MPI_SUCCESS;
  int my_rank, total_size, local_rank, local_size;
  int leader_comm_rank = -1, leader_comm_size = 0;
  MPI_Comm shmem_comm, leader_comm;
  int leader_root, leader_of_root;
  void *in_buf = NULL, *out_buf = NULL, *tmp_buf = NULL;
  MPI_Aint true_lb, true_extent, extent;
  int is_commutative = 0, stride = 0;
  int intra_node_root=0;

  //if not set (use of the algo directly, without mvapich2 selector)
  if(MV2_Reduce_function==NULL)
    MV2_Reduce_function=smpi_coll_tuned_reduce_mpich;
  if(MV2_Reduce_intra_function==NULL)
    MV2_Reduce_intra_function=smpi_coll_tuned_reduce_mpich;

  /* Lazily build the node/leader communicator hierarchy. */
  if(smpi_comm_get_leaders_comm(comm)==MPI_COMM_NULL){
    smpi_comm_init_smp(comm);
  }

  my_rank = smpi_comm_rank(comm);
  total_size = smpi_comm_size(comm);
  shmem_comm = smpi_comm_get_intra_comm(comm);
  local_rank = smpi_comm_rank(shmem_comm);
  local_size = smpi_comm_size(shmem_comm);

  leader_comm = smpi_comm_get_leaders_comm(comm);
  int* leaders_map = smpi_comm_get_leaders_map(comm);
  /* leader_of_root: global rank of the leader on the root's node;
   * leader_root: that leader's rank inside leader_comm. */
  leader_of_root = smpi_group_rank(smpi_comm_group(comm),leaders_map[root]);
  leader_root = smpi_group_rank(smpi_comm_group(leader_comm),leaders_map[root]);

  is_commutative=smpi_op_is_commute(op);
  smpi_datatype_extent(datatype, &true_lb, &true_extent);
  extent =smpi_datatype_get_extent(datatype);
  stride = count * MAX(extent, true_extent);

  if (local_size == total_size) {
    /* First handle the case where there is only one node */
    if (stride <= MV2_INTRA_SHMEM_REDUCE_MSG &&
        is_commutative == 1) {
      if (local_rank == 0 ) {
        tmp_buf=(void *)smpi_get_tmp_sendbuffer( count * (MAX(extent, true_extent)));
        tmp_buf = (void *) ((char *) tmp_buf - true_lb);   /* lb compensation */
      }

      if (sendbuf != MPI_IN_PLACE) {
        in_buf = (void *)sendbuf;
      } else {
        in_buf = recvbuf;
      }

      if (local_rank == 0) {
        if( my_rank != root) {
          out_buf = tmp_buf;
        } else {
          out_buf = recvbuf;
          if(in_buf == out_buf) {
            in_buf = MPI_IN_PLACE;
            out_buf = recvbuf;
          }
        }
      } else {
        in_buf = (void *)sendbuf;
        out_buf = NULL;
      }

      /* Pick shmem vs k-nomial intra-node reduce based on message size. */
      if (count * (MAX(extent, true_extent)) < SHMEM_COLL_BLOCK_SIZE) {
        mpi_errno = MPIR_Reduce_shmem_MV2(in_buf, out_buf, count,
                                          datatype, op,
                                          0, shmem_comm);
      } else {
        mpi_errno = MPIR_Reduce_intra_knomial_wrapper_MV2(in_buf, out_buf, count,
                                                          datatype, op,
                                                          0, shmem_comm);
      }

      /* Leader holds the result; route it to the root if they differ. */
      if (local_rank == 0 && root != my_rank) {
        smpi_mpi_send(out_buf, count, datatype, root, COLL_TAG_REDUCE+1, comm);
      }
      if ((local_rank != 0) && (root == my_rank)) {
        smpi_mpi_recv(recvbuf, count, datatype,
                      leader_of_root, COLL_TAG_REDUCE+1, comm, MPI_STATUS_IGNORE);
      }
    } else {
      if(mv2_use_knomial_reduce == 1) {
        reduce_fn = &MPIR_Reduce_intra_knomial_wrapper_MV2;
      } else {
        reduce_fn = &MPIR_Reduce_binomial_MV2;
      }
      mpi_errno = reduce_fn(sendbuf, recvbuf, count,
                            datatype, op,
                            root, comm);
    }
    /* We are done */
    if(tmp_buf!=NULL)
      smpi_free_tmp_buffer((void *) ((char *) tmp_buf + true_lb));
    goto fn_exit;
  }

  /* Multi-node case: leaders prepare a temporary reduction target. */
  if (local_rank == 0) {
    leader_comm = smpi_comm_get_leaders_comm(comm);
    if(leader_comm==MPI_COMM_NULL){
      leader_comm = MPI_COMM_WORLD;
    }
    leader_comm_size = smpi_comm_size(leader_comm);
    leader_comm_rank = smpi_comm_rank(leader_comm);
    tmp_buf=(void *)smpi_get_tmp_sendbuffer(count * (MAX(extent, true_extent)));
    tmp_buf = (void *) ((char *) tmp_buf - true_lb);
  }
  if (sendbuf != MPI_IN_PLACE) {
    in_buf = (void *)sendbuf;
  } else {
    in_buf = recvbuf;
  }
  if (local_rank == 0) {
    out_buf = tmp_buf;
  } else {
    out_buf = NULL;
  }

  if(local_size > 1) {
    /* Lets do the intra-node reduce operations, if we have more than one
     * process in the node */
    /* Fix the input and outbuf buffers for the intra-node reduce.
     * Node leaders will have the reduced data in tmp_buf after
     * this step */
    if (MV2_Reduce_intra_function == & MPIR_Reduce_shmem_MV2) {
      if (is_commutative == 1 &&
          (count * (MAX(extent, true_extent)) < SHMEM_COLL_BLOCK_SIZE)) {
        mpi_errno = MV2_Reduce_intra_function(in_buf, out_buf, count,
                                              datatype, op,
                                              intra_node_root, shmem_comm);
      } else {
        mpi_errno = MPIR_Reduce_intra_knomial_wrapper_MV2(in_buf, out_buf, count,
                                                          datatype, op,
                                                          intra_node_root, shmem_comm);
      }
    } else {
      mpi_errno = MV2_Reduce_intra_function(in_buf, out_buf, count,
                                            datatype, op,
                                            intra_node_root, shmem_comm);
    }
  } else {
    /* Single process on this node: the allocated tmp_buf is not needed. */
    smpi_free_tmp_buffer((void *) ((char *) tmp_buf + true_lb));
    tmp_buf = in_buf;
  }

  /* Now work on the inter-leader phase. Data is in tmp_buf */
  if (local_rank == 0 && leader_comm_size > 1) {
    /* The leader of root will have the global reduced data in tmp_buf
       or recv_buf at the end of the reduce */
    if (leader_comm_rank == leader_root) {
      if (my_rank == root) {
        /* I am the root of the leader-comm, and the
         * root of the reduce op. So, I will write the
         * final result directly into my recvbuf */
        if(tmp_buf != recvbuf) {
          in_buf = tmp_buf;
          out_buf = recvbuf;
        } else {
          /* aliasing — stage a copy so input and output differ */
          in_buf = (char *)smpi_get_tmp_sendbuffer(count* smpi_datatype_get_extent(datatype));
          smpi_datatype_copy(tmp_buf, count, datatype, in_buf, count, datatype);
          //in_buf = MPI_IN_PLACE;
          out_buf = recvbuf;
        }
      } else {
        in_buf = (char *)smpi_get_tmp_sendbuffer(count* smpi_datatype_get_extent(datatype));
        smpi_datatype_copy(tmp_buf, count, datatype, in_buf, count, datatype);
        //in_buf = MPI_IN_PLACE;
        out_buf = tmp_buf;
      }
    } else {
      in_buf = tmp_buf;
      out_buf = NULL;
    }

    /* inter-leader communication */
    mpi_errno = MV2_Reduce_function(in_buf, out_buf, count,
                                    datatype, op,
                                    leader_root, leader_comm);
  }

  if (local_size > 1) {
    /* Send the message to the root if the leader is not the
     * root of the reduce operation. The reduced data is in tmp_buf */
    if ((local_rank == 0) && (root != my_rank)
        && (leader_root == leader_comm_rank)) {
      smpi_mpi_send(tmp_buf, count, datatype, root, COLL_TAG_REDUCE+1, comm);
    }
    if ((local_rank != 0) && (root == my_rank)) {
      smpi_mpi_recv(recvbuf, count, datatype,
                    leader_of_root, COLL_TAG_REDUCE+1, comm, MPI_STATUS_IGNORE);
    }
    smpi_free_tmp_buffer((void *) ((char *) tmp_buf + true_lb));

    if (leader_comm_rank == leader_root) {
      if (my_rank != root || (my_rank == root && tmp_buf == recvbuf)) {
        smpi_free_tmp_buffer(in_buf);
      }
    }
  }
fn_exit:
  return mpi_errno;
}
/* Two-level (SMP-aware) direct scatter ported from MVAPICH2:
 * 1) if the root is not a node leader, it ships its whole send buffer to the
 *    leader of its node;
 * 2) the leaders scatter among themselves (scatterv when node sizes are
 *    non-uniform, direct scatter otherwise), each leader receiving the data
 *    for its whole node in tmp_buf;
 * 3) every leader scatters tmp_buf within its node.
 * Returns the mpi_errno of the last underlying operation. */
int Coll_scatter_mvapich2_two_level_direct::scatter(void *sendbuf,
                                      int sendcnt,
                                      MPI_Datatype sendtype,
                                      void *recvbuf,
                                      int recvcnt,
                                      MPI_Datatype recvtype,
                                      int root, MPI_Comm comm)
{
  int comm_size, rank;
  int local_rank, local_size;
  int leader_comm_rank = -1, leader_comm_size = -1;
  int mpi_errno = MPI_SUCCESS;
  int recvtype_size, sendtype_size, nbytes;
  void *tmp_buf = NULL;             /* per-node staging buffer (leaders only) */
  void *leader_scatter_buf = NULL;  /* root's full payload when root != leader */
  MPI_Status status;
  int leader_root, leader_of_root = -1;
  MPI_Comm shmem_comm, leader_comm;

  //if not set (use of the algo directly, without mvapich2 selector)
  if(MV2_Scatter_intra_function==NULL)
    MV2_Scatter_intra_function=Coll_scatter_mpich::scatter;

  /* Lazily build the node/leader communicator hierarchy. */
  if(comm->get_leaders_comm()==MPI_COMM_NULL){
    comm->init_smp();
  }
  comm_size = comm->size();
  rank = comm->rank();

  if (((rank == root) && (recvcnt == 0))
      || ((rank != root) && (sendcnt == 0))) {
    return MPI_SUCCESS;
  }

  /* extract the rank,size information for the intra-node
   * communicator */
  shmem_comm = comm->get_intra_comm();
  local_rank = shmem_comm->rank();
  local_size = shmem_comm->size();

  if (local_rank == 0) {
    /* Node leader. Extract the rank, size information for the leader
     * communicator */
    leader_comm = comm->get_leaders_comm();
    leader_comm_size = leader_comm->size();
    leader_comm_rank = leader_comm->rank();
  }

  if (local_size == comm_size) {
    /* purely intra-node scatter. Just use the direct algorithm and we are done */
    mpi_errno = MPIR_Scatter_MV2_Direct(sendbuf, sendcnt, sendtype,
                                        recvbuf, recvcnt, recvtype,
                                        root, comm);
  } else {
    recvtype_size=recvtype->size();
    sendtype_size=sendtype->size();

    /* Per-process payload size in bytes for the inter-leader phase. */
    if (rank == root) {
      nbytes = sendcnt * sendtype_size;
    } else {
      nbytes = recvcnt * recvtype_size;
    }

    if (local_rank == 0) {
      /* Node leader, allocate tmp_buffer */
      tmp_buf = smpi_get_tmp_sendbuffer(nbytes * local_size);
    }

    leader_comm = comm->get_leaders_comm();
    int* leaders_map = comm->get_leaders_map();
    leader_of_root = comm->group()->rank(leaders_map[root]);
    leader_root = leader_comm->group()->rank(leaders_map[root]);
    /* leader_root is the rank of the leader of the root in leader_comm.
     * leader_root is to be used as the root of the inter-leader gather ops */

    if ((local_rank == 0) && (root != rank)
        && (leader_of_root == rank)) {
      /* The root of the scatter operation is not the node leader. Recv
       * data from the node leader */
      leader_scatter_buf = smpi_get_tmp_sendbuffer(nbytes * comm_size);
      Request::recv(leader_scatter_buf, nbytes * comm_size, MPI_BYTE,
                    root, COLL_TAG_SCATTER, comm, &status);
    }

    if (rank == root && local_rank != 0) {
      /* The root of the scatter operation is not the node leader. Send
       * data to the node leader */
      Request::send(sendbuf, sendcnt * comm_size, sendtype,
                    leader_of_root, COLL_TAG_SCATTER, comm );
    }

    if (leader_comm_size > 1 && local_rank == 0) {
      if (not comm->is_uniform()) {
        /* Nodes host different process counts: build per-leader counts and
         * displacements and use scatterv. */
        int* displs = NULL;
        int* sendcnts = NULL;
        int* node_sizes;
        int i = 0;
        node_sizes = comm->get_non_uniform_map();

        if (root != leader_of_root) {
          /* Data comes from leader_scatter_buf, measured in bytes. */
          if (leader_comm_rank == leader_root) {
            displs = static_cast<int*>(xbt_malloc(sizeof(int) * leader_comm_size));
            sendcnts = static_cast<int*>(xbt_malloc(sizeof(int) * leader_comm_size));
            sendcnts[0] = node_sizes[0] * nbytes;
            displs[0] = 0;

            for (i = 1; i < leader_comm_size; i++) {
              displs[i] = displs[i - 1] + node_sizes[i - 1] * nbytes;
              sendcnts[i] = node_sizes[i] * nbytes;
            }
          }
          Colls::scatterv(leader_scatter_buf, sendcnts, displs, MPI_BYTE,
                          tmp_buf, nbytes * local_size, MPI_BYTE, leader_root, leader_comm);
        } else {
          /* Data comes straight from the root's sendbuf, measured in
           * sendtype elements. */
          if (leader_comm_rank == leader_root) {
            displs = static_cast<int*>(xbt_malloc(sizeof(int) * leader_comm_size));
            sendcnts = static_cast<int*>(xbt_malloc(sizeof(int) * leader_comm_size));
            sendcnts[0] = node_sizes[0] * sendcnt;
            displs[0] = 0;

            for (i = 1; i < leader_comm_size; i++) {
              displs[i] = displs[i - 1] + node_sizes[i - 1] * sendcnt;
              sendcnts[i] = node_sizes[i] * sendcnt;
            }
          }
          Colls::scatterv(sendbuf, sendcnts, displs, sendtype,
                          tmp_buf, nbytes * local_size, MPI_BYTE, leader_root, leader_comm);
        }
        if (leader_comm_rank == leader_root) {
          xbt_free(displs);
          xbt_free(sendcnts);
        }
      } else {
        /* Uniform node sizes: a plain direct scatter suffices. */
        if (leader_of_root != root) {
          mpi_errno = MPIR_Scatter_MV2_Direct(leader_scatter_buf,
                                              nbytes * local_size, MPI_BYTE,
                                              tmp_buf, nbytes * local_size,
                                              MPI_BYTE, leader_root, leader_comm);
        } else {
          mpi_errno = MPIR_Scatter_MV2_Direct(sendbuf, sendcnt * local_size,
                                              sendtype, tmp_buf,
                                              nbytes * local_size, MPI_BYTE,
                                              leader_root, leader_comm);
        }
      }
    }
    /* The leaders are now done with the inter-leader part. Scatter the data
     * within the nodes */

    if (rank == root && recvbuf == MPI_IN_PLACE) {
      mpi_errno = MV2_Scatter_intra_function(tmp_buf, nbytes, MPI_BYTE,
                                             (void *)sendbuf, sendcnt, sendtype,
                                             0, shmem_comm);
    } else {
      mpi_errno = MV2_Scatter_intra_function(tmp_buf, nbytes, MPI_BYTE,
                                             recvbuf, recvcnt, recvtype,
                                             0, shmem_comm);
    }
  }

  /* check if multiple threads are calling this collective function */
  if (comm_size != local_size && local_rank == 0) {
    smpi_free_tmp_buffer(tmp_buf);
    if (leader_of_root == rank && root != rank) {
      smpi_free_tmp_buffer(leader_scatter_buf);
    }
  }
  return (mpi_errno);
}
/* This function performs the all-reduce operation as follows:
   1) binomial_tree reduce inside each SMP node
   2) reduce-scatter - inter between root of each SMP node
   3) allgather - inter between root of each SMP node
   4) binomial_tree bcast inside each SMP node
   Assumes count is divisible by the number of SMP nodes (segments of
   count / inter_comm_size elements are exchanged); result ends up in recv_buf
   on every rank. */
int smpi_coll_tuned_allreduce_smp_rsag(void *send_buf, void *recv_buf,
                                       int count, MPI_Datatype dtype, MPI_Op op,
                                       MPI_Comm comm)
{
  int comm_size, rank;
  void *tmp_buf;
  int tag = COLL_TAG_ALLREDUCE;
  int mask, src, dst;
  MPI_Status status;
  if(smpi_comm_get_leaders_comm(comm)==MPI_COMM_NULL){
    smpi_comm_init_smp(comm);
  }
  /* num_core = processes per SMP node (1 when the topology is non-uniform). */
  int num_core=1;
  if (smpi_comm_is_uniform(comm)){
    num_core = smpi_comm_size(smpi_comm_get_intra_comm(comm));
  }
  /*
     #ifdef MPICH2_REDUCTION
     MPI_User_function * uop = MPIR_Op_table[op % 16 - 1];
     #else
     MPI_User_function *uop;
     struct MPIR_OP *op_ptr;
     op_ptr = MPIR_ToPointer(op);
     uop = op_ptr->op;
     #endif
   */
  comm_size = smpi_comm_size(comm);
  rank = smpi_comm_rank(comm);
  MPI_Aint extent;
  extent = smpi_datatype_get_extent(dtype);
  tmp_buf = (void *) smpi_get_tmp_sendbuffer(count * extent);

  int intra_rank, inter_rank;
  intra_rank = rank % num_core;   /* position within my SMP node */
  inter_rank = rank / num_core;   /* index of my SMP node */

  //printf("node %d intra_rank = %d, inter_rank = %d\n", rank, intra_rank, inter_rank);

  int inter_comm_size = (comm_size + num_core - 1) / num_core;

  if (!rank) {
    //printf("intra com size = %d\n",num_core);
    //printf("inter com size = %d\n",inter_comm_size);
  }

  /* Seed recv_buf with my own contribution (self-sendrecv = local copy). */
  smpi_mpi_sendrecv(send_buf, count, dtype, rank, tag,
                    recv_buf, count, dtype, rank, tag, comm, &status);

  // SMP_binomial_reduce: fold each node's data onto its intra-rank-0 process.
  mask = 1;
  while (mask < num_core) {
    if ((mask & intra_rank) == 0) {
      src = (inter_rank * num_core) + (intra_rank | mask);
      //      if (src < ((inter_rank + 1) * num_core)) {
      if (src < comm_size) {
        smpi_mpi_recv(tmp_buf, count, dtype, src, tag, comm, &status);
        smpi_op_apply(op, tmp_buf, recv_buf, &count, &dtype);
        //printf("Node %d recv from node %d when mask is %d\n", rank, src, mask);
      }
    } else {
      dst = (inter_rank * num_core) + (intra_rank & (~mask));
      smpi_mpi_send(recv_buf, count, dtype, dst, tag, comm);
      //printf("Node %d send to node %d when mask is %d\n", rank, dst, mask);
      break;
    }
    mask <<= 1;
  }

  // INTER: reduce-scatter among node leaders (ring over count/inter_comm_size
  // element segments), immediately followed by a ring allgather.
  if (intra_rank == 0) {
    int send_offset, recv_offset;
    int seg_count = count / inter_comm_size;
    int to = ((inter_rank + 1) % inter_comm_size) * num_core;
    int from = ((inter_rank + inter_comm_size - 1) % inter_comm_size) * num_core;
    int i;

    //printf("node %d to %d from %d\n",rank,to,from);

    for (i = 0; i < (inter_comm_size - 1); i++) {
      send_offset = ((inter_rank - 1 - i + inter_comm_size) % inter_comm_size) * seg_count * extent;
      recv_offset = ((inter_rank - 2 - i + inter_comm_size) % inter_comm_size) * seg_count * extent;
      smpi_mpi_sendrecv((char *) recv_buf + send_offset, seg_count, dtype, to,
                        tag + i, tmp_buf, seg_count, dtype, from, tag + i, comm,
                        &status);

      // result is in rbuf
      smpi_op_apply(op, tmp_buf, (char *) recv_buf + recv_offset, &seg_count, &dtype);
    }

    // INTER: allgather
    for (i = 0; i < (inter_comm_size - 1); i++) {
      send_offset = ((inter_rank - i + inter_comm_size) % inter_comm_size) * seg_count * extent;
      recv_offset = ((inter_rank - 1 - i + inter_comm_size) % inter_comm_size) * seg_count * extent;
      smpi_mpi_sendrecv((char *) recv_buf + send_offset, seg_count, dtype, to,
                        tag + i, (char *) recv_buf + recv_offset, seg_count,
                        dtype, from, tag + i, comm, &status);
    }
  }

  // INTER_binomial_reduce (disabled alternative to the reduce-scatter above)
  // only root node for each SMP
  //  if (intra_rank == 0) {
  //
  //    mask = 1;
  //    while (mask < inter_comm_size) {
  //      if ((mask & inter_rank) == 0) {
  //        src = (inter_rank | mask) * num_core;
  //        if (src < comm_size) {
  //          smpi_mpi_recv(tmp_buf, count, dtype, src, tag, comm, &status);
  //          (* uop) (tmp_buf, recv_buf, &count, &dtype);
  //          //printf("Node %d recv from node %d when mask is %d\n", rank, src, mask);
  //        }
  //      }
  //      else {
  //        dst = (inter_rank & (~mask)) * num_core;
  //        smpi_mpi_send(recv_buf, count, dtype, dst, tag, comm);
  //        //printf("Node %d send to node %d when mask is %d\n", rank, dst, mask);
  //        break;
  //      }
  //      mask <<=1;
  //    }
  //  }

  // INTER_binomial_bcast (disabled alternative to the allgather above)
  //  if (intra_rank == 0) {
  //    mask = 1;
  //    while (mask < inter_comm_size) {
  //      if (inter_rank & mask) {
  //        src = (inter_rank - mask) * num_core;
  //        //printf("Node %d recv from node %d when mask is %d\n", rank, src, mask);
  //        smpi_mpi_recv(recv_buf, count, dtype, src, tag, comm, &status);
  //        break;
  //      }
  //      mask <<= 1;
  //    }
  //
  //    mask >>= 1;
  //    //printf("My rank = %d my mask = %d\n", rank,mask);
  //    while (mask > 0) {
  //      if (inter_rank < inter_comm_size) {
  //        dst = (inter_rank + mask) * num_core;
  //        if (dst < comm_size) {
  //          //printf("Node %d send to node %d when mask is %d\n", rank, dst, mask);
  //          smpi_mpi_send(recv_buf, count, dtype, dst, tag, comm);
  //        }
  //      }
  //      mask >>= 1;
  //    }
  //  }

  // INTRA_binomial_bcast: leaders broadcast the final result inside their node.
  int num_core_in_current_smp = num_core;
  if (inter_rank == (inter_comm_size - 1)) {
    /* Last node may be partially populated. */
    num_core_in_current_smp = comm_size - (inter_rank * num_core);
  }
  //  printf("Node %d num_core = %d\n",rank, num_core_in_current_smp);
  mask = 1;
  while (mask < num_core_in_current_smp) {
    if (intra_rank & mask) {
      src = (inter_rank * num_core) + (intra_rank - mask);
      //printf("Node %d recv from node %d when mask is %d\n", rank, src, mask);
      smpi_mpi_recv(recv_buf, count, dtype, src, tag, comm, &status);
      break;
    }
    mask <<= 1;
  }
  mask >>= 1;
  //printf("My rank = %d my mask = %d\n", rank,mask);

  while (mask > 0) {
    dst = (inter_rank * num_core) + (intra_rank + mask);
    if (dst < comm_size) {
      //printf("Node %d send to node %d when mask is %d\n", rank, dst, mask);
      smpi_mpi_send(recv_buf, count, dtype, dst, tag, comm);
    }
    mask >>= 1;
  }

  smpi_free_tmp_buffer(tmp_buf);
  return MPI_SUCCESS;
}
/* Two-level (SMP-aware) gather, ported from MVAPICH2.
 *
 * Algorithm (as implemented below):
 *   1) Intra-node: every node gathers the data of its local processes into a
 *      temporary buffer owned by the node leader (local rank 0 of shmem_comm),
 *      using MV2_Gather_intra_node_function via MPIR_pt_pt_intra_gather.
 *   2) Inter-node: the node leaders gather those per-node chunks (as raw
 *      MPI_BYTE) to the leader of the root's node over leader_comm — a
 *      gatherv when nodes host different numbers of processes, a direct
 *      gather when the mapping is uniform.
 *   3) If the root is not itself a node leader, its node leader forwards the
 *      fully gathered result to it with a point-to-point send/recv pair.
 *
 * Parameters follow the MPI_Gather signature (send buffer/count/type, recv
 * buffer/count/type which are only significant at root, root rank, comm).
 * Returns MPI_SUCCESS, MPI_ERR_OTHER on temporary-buffer allocation failure,
 * or the error code propagated from a sub-collective. */
int smpi_coll_tuned_gather_mvapich2_two_level(void *sendbuf,
                                              int sendcnt,
                                              MPI_Datatype sendtype,
                                              void *recvbuf,
                                              int recvcnt,
                                              MPI_Datatype recvtype,
                                              int root, MPI_Comm comm)
{
    void *leader_gather_buf = NULL;   /* staging buffer when root's leader != root */
    int comm_size, rank;
    int local_rank, local_size;
    int leader_comm_rank = -1, leader_comm_size = 0;
    int mpi_errno = MPI_SUCCESS;
    int recvtype_size = 0, sendtype_size = 0, nbytes=0;
    int leader_root, leader_of_root;
    MPI_Status status;
    MPI_Aint sendtype_extent = 0, recvtype_extent = 0;  /* Datatype extent */
    MPI_Aint true_lb, sendtype_true_extent, recvtype_true_extent;
    MPI_Comm shmem_comm, leader_comm;
    void* tmp_buf = NULL;             /* per-node gather buffer, leaders only */

    /* If not set (use of the algo directly, without the mvapich2 selector),
     * fall back to the mpich intra-node gather implementation. */
    if(MV2_Gather_intra_node_function==NULL)
        MV2_Gather_intra_node_function=smpi_coll_tuned_gather_mpich;

    /* Lazily build the SMP-aware communicator hierarchy (intra-node comm +
     * leaders comm) the first time this algorithm runs on this comm. */
    if(smpi_comm_get_leaders_comm(comm)==MPI_COMM_NULL){
        smpi_comm_init_smp(comm);
    }
    comm_size = smpi_comm_size(comm);
    rank = smpi_comm_rank(comm);

    /* Nothing to contribute / nothing to collect: done. */
    if (((rank == root) && (recvcnt == 0)) ||
        ((rank != root) && (sendcnt == 0))) {
        return MPI_SUCCESS;
    }

    /* Cache extent/size info for whichever datatypes are significant here
     * (sendtype on non-root ranks, recvtype at root). */
    if (sendtype != MPI_DATATYPE_NULL) {
        sendtype_extent=smpi_datatype_get_extent(sendtype);
        sendtype_size=smpi_datatype_size(sendtype);
        smpi_datatype_extent(sendtype, &true_lb, &sendtype_true_extent);
    }
    if (recvtype != MPI_DATATYPE_NULL) {
        recvtype_extent=smpi_datatype_get_extent(recvtype);
        recvtype_size=smpi_datatype_size(recvtype);
        smpi_datatype_extent(recvtype, &true_lb, &recvtype_true_extent);
    }

    /* extract the rank,size information for the intra-node communicator */
    shmem_comm = smpi_comm_get_intra_comm(comm);
    local_rank = smpi_comm_rank(shmem_comm);
    local_size = smpi_comm_size(shmem_comm);

    if (local_rank == 0) {
        /* Node leader. Extract the rank, size information for the leader
         * communicator (one representative per node). */
        leader_comm = smpi_comm_get_leaders_comm(comm);
        if(leader_comm==MPI_COMM_NULL){
            leader_comm = MPI_COMM_WORLD;
        }
        leader_comm_size = smpi_comm_size(leader_comm);
        leader_comm_rank = smpi_comm_rank(leader_comm);
    }

    /* Per-process payload in bytes; inter-node phases move raw MPI_BYTE. */
    if (rank == root) {
        nbytes = recvcnt * recvtype_size;
    } else {
        nbytes = sendcnt * sendtype_size;
    }

#if defined(_SMP_LIMIC_)
    /* LiMIC2 kernel-assisted path (compiled out in this build; note it
     * references identifiers such as shmem_commptr that only exist when
     * _SMP_LIMIC_ is defined). */
    if((g_use_limic2_coll) && (shmem_commptr->ch.use_intra_sock_comm == 1)
        && (use_limic_gather)
        &&((num_scheme == USE_GATHER_PT_PT_BINOMIAL)
           || (num_scheme == USE_GATHER_PT_PT_DIRECT)
           ||(num_scheme == USE_GATHER_PT_LINEAR_BINOMIAL)
           || (num_scheme == USE_GATHER_PT_LINEAR_DIRECT)
           || (num_scheme == USE_GATHER_LINEAR_PT_BINOMIAL)
           || (num_scheme == USE_GATHER_LINEAR_PT_DIRECT)
           || (num_scheme == USE_GATHER_LINEAR_LINEAR)
           || (num_scheme == USE_GATHER_SINGLE_LEADER))) {
        mpi_errno = MV2_Gather_intra_node_function(sendbuf, sendcnt, sendtype,
                                                   recvbuf, recvcnt,recvtype,
                                                   root, comm);
    } else
#endif/*#if defined(_SMP_LIMIC_)*/
    {
        if (local_rank == 0) {
            /* Node leader: allocate the per-node staging buffer, sized with
             * max(extent, true_extent) to cope with datatypes that have a
             * non-trivial lower bound. */
            if (rank == root) {
                tmp_buf = smpi_get_tmp_recvbuffer(recvcnt *
                            MAX(recvtype_extent, recvtype_true_extent) * local_size);
            } else {
                tmp_buf = smpi_get_tmp_sendbuffer(sendcnt *
                            MAX(sendtype_extent, sendtype_true_extent) * local_size);
            }
            if (tmp_buf == NULL) {
                mpi_errno = MPI_ERR_OTHER;
                return mpi_errno;
            }
        }
        /* While testing the mpich2 gather test (which splits the comm), we
         * can reach a point where use_intra_sock_comm == 0; but if the intra
         * node function is MPIR_Intra_node_LIMIC_Gather_MV2, it would still
         * use the intra sock comm. In such cases, we fall back to binomial
         * as a default case. */
#if defined(_SMP_LIMIC_)
        if(*MV2_Gather_intra_node_function == MPIR_Intra_node_LIMIC_Gather_MV2) {
            mpi_errno = MPIR_pt_pt_intra_gather(sendbuf,sendcnt, sendtype,
                                                recvbuf, recvcnt, recvtype,
                                                root, rank,
                                                tmp_buf, nbytes,
                                                TEMP_BUF_HAS_NO_DATA,
                                                shmem_commptr,
                                                MPIR_Gather_intra);
        } else
#endif
        {
            /* We are gathering the data into tmp_buf and the output
             * will be of MPI_BYTE datatype. Since the tmp_buf has no
             * local data, we pass is_data_avail = TEMP_BUF_HAS_NO_DATA */
            mpi_errno = MPIR_pt_pt_intra_gather(sendbuf,sendcnt, sendtype,
                                                recvbuf, recvcnt, recvtype,
                                                root, rank,
                                                tmp_buf, nbytes,
                                                TEMP_BUF_HAS_NO_DATA,
                                                shmem_comm,
                                                MV2_Gather_intra_node_function
                                                );
        }
    }

    /* Identify the root's node leader both as a rank in comm (leader_of_root)
     * and as a rank in leader_comm (leader_root). */
    leader_comm = smpi_comm_get_leaders_comm(comm);
    int* leaders_map = smpi_comm_get_leaders_map(comm);
    leader_of_root = smpi_group_rank(smpi_comm_group(comm),leaders_map[root]);
    leader_root = smpi_group_rank(smpi_comm_group(leader_comm),leaders_map[root]);
    /* leader_root is the rank of the leader of the root in leader_comm.
     * leader_root is to be used as the root of the inter-leader gather ops */

    if (!smpi_comm_is_uniform(comm)) {
        /* Nodes host different numbers of processes: inter-leader phase must
         * be a gatherv with per-node counts. */
        if (local_rank == 0) {
            int *displs = NULL;
            int *recvcnts = NULL;
            int *node_sizes;
            int i = 0;
            /* Node leaders have all the data. But, different nodes can have
             * different numbers of processes. Do a Gather first to get the
             * buffer lengths at each leader, followed by a Gatherv to move
             * the actual data. */

            if (leader_comm_rank == leader_root && root != leader_of_root) {
                /* The root of the Gather operation is not a node-level
                 * leader and this process's rank in the leader_comm
                 * is the same as leader_root */
                if(rank == root) {
                    leader_gather_buf = smpi_get_tmp_recvbuffer(recvcnt *
                            MAX(recvtype_extent, recvtype_true_extent) * comm_size);
                } else {
                    leader_gather_buf = smpi_get_tmp_sendbuffer(sendcnt *
                            MAX(sendtype_extent, sendtype_true_extent) * comm_size);
                }
                if (leader_gather_buf == NULL) {
                    mpi_errno = MPI_ERR_OTHER;
                    return mpi_errno;
                }
            }

            node_sizes = smpi_comm_get_non_uniform_map(comm);

            if (leader_comm_rank == leader_root) {
                displs = xbt_malloc(sizeof (int) * leader_comm_size);
                recvcnts = xbt_malloc(sizeof (int) * leader_comm_size);
                if (!displs || !recvcnts) {
                    mpi_errno = MPI_ERR_OTHER;
                    return mpi_errno;
                }
            }

            if (root == leader_of_root) {
                /* The root of the gather operation is also the node
                 * leader. Receive into recvbuf and we are done.
                 * Counts/displacements are in recvtype elements here, since
                 * the receive side of the gatherv uses recvtype. */
                if (leader_comm_rank == leader_root) {
                    recvcnts[0] = node_sizes[0] * recvcnt;
                    displs[0] = 0;

                    for (i = 1; i < leader_comm_size; i++) {
                        displs[i] = displs[i - 1] + node_sizes[i - 1] * recvcnt;
                        recvcnts[i] = node_sizes[i] * recvcnt;
                    }
                }
                smpi_mpi_gatherv(tmp_buf, local_size * nbytes, MPI_BYTE,
                                 recvbuf, recvcnts, displs, recvtype,
                                 leader_root, leader_comm);
            } else {
                /* The root of the gather operation is not the node leader.
                 * Receive into leader_gather_buf and then send to the root.
                 * Counts/displacements are in bytes here, matching the
                 * MPI_BYTE receive type. */
                if (leader_comm_rank == leader_root) {
                    recvcnts[0] = node_sizes[0] * nbytes;
                    displs[0] = 0;

                    for (i = 1; i < leader_comm_size; i++) {
                        displs[i] = displs[i - 1] + node_sizes[i - 1] * nbytes;
                        recvcnts[i] = node_sizes[i] * nbytes;
                    }
                }
                smpi_mpi_gatherv(tmp_buf, local_size * nbytes, MPI_BYTE,
                                 leader_gather_buf, recvcnts, displs, MPI_BYTE,
                                 leader_root, leader_comm);
            }
            if (leader_comm_rank == leader_root) {
                xbt_free(displs);
                xbt_free(recvcnts);
            }
        }
    } else {
        /* All nodes have the same number of processes.
         * Just do one Gather to get all
         * the data at the leader of the root process */
        if (local_rank == 0) {
            if (leader_comm_rank == leader_root && root != leader_of_root) {
                /* The root of the Gather operation is not a node-level leader */
                leader_gather_buf = smpi_get_tmp_sendbuffer(nbytes * comm_size);
                if (leader_gather_buf == NULL) {
                    mpi_errno = MPI_ERR_OTHER;
                    return mpi_errno;
                }
            }
            if (root == leader_of_root) {
                /* Gather directly into the user's recvbuf at root. */
                mpi_errno = MPIR_Gather_MV2_Direct(tmp_buf,
                                                   nbytes * local_size, MPI_BYTE,
                                                   recvbuf, recvcnt * local_size,
                                                   recvtype, leader_root,
                                                   leader_comm);
            } else {
                /* Gather as raw bytes into the staging buffer; it will be
                 * forwarded to the real root below. */
                mpi_errno = MPIR_Gather_MV2_Direct(tmp_buf, nbytes * local_size,
                                                   MPI_BYTE, leader_gather_buf,
                                                   nbytes * local_size, MPI_BYTE,
                                                   leader_root, leader_comm);
            }
        }
    }

    /* If the root is not a node leader, its node leader now holds the full
     * result in leader_gather_buf: forward it to the root. */
    if ((local_rank == 0) && (root != rank)
        && (leader_of_root == rank)) {
        smpi_mpi_send(leader_gather_buf, nbytes * comm_size, MPI_BYTE,
                      root, COLL_TAG_GATHER, comm);
    }
    if (rank == root && local_rank != 0) {
        /* The root of the gather operation is not the node leader.
         * Receive the gathered data from the node leader. */
        smpi_mpi_recv(recvbuf, recvcnt * comm_size, recvtype,
                      leader_of_root, COLL_TAG_GATHER, comm, &status);
    }

    /* Leaders release their temporary buffers. */
    if (local_rank == 0 ) {
        if (tmp_buf != NULL) {
            smpi_free_tmp_buffer(tmp_buf);
        }
        if (leader_gather_buf != NULL) {
            smpi_free_tmp_buffer(leader_gather_buf);
        }
    }
    return (mpi_errno);
}
/* This fucntion performs all-reduce operation as follow. 1) binomial_tree reduce inside each SMP node 2) Recursive doubling intra-communication between root of each SMP node 3) binomial_tree bcast inside each SMP node */ int smpi_coll_tuned_allreduce_smp_rdb(void *send_buf, void *recv_buf, int count, MPI_Datatype dtype, MPI_Op op, MPI_Comm comm) { int comm_size, rank; void *tmp_buf; int tag = COLL_TAG_ALLREDUCE; int mask, src, dst; MPI_Status status; if(smpi_comm_get_leaders_comm(comm)==MPI_COMM_NULL){ smpi_comm_init_smp(comm); } int num_core=1; if (smpi_comm_is_uniform(comm)){ num_core = smpi_comm_size(smpi_comm_get_intra_comm(comm)); } /* #ifdef MPICH2_REDUCTION MPI_User_function * uop = MPIR_Op_table[op % 16 - 1]; #else MPI_User_function *uop; struct MPIR_OP *op_ptr; op_ptr = MPIR_ToPointer(op); uop = op_ptr->op; #endif */ comm_size = smpi_comm_size(comm); rank = smpi_comm_rank(comm); MPI_Aint extent; extent = smpi_datatype_get_extent(dtype); tmp_buf = (void *) smpi_get_tmp_sendbuffer(count * extent); /* compute intra and inter ranking */ int intra_rank, inter_rank; intra_rank = rank % num_core; inter_rank = rank / num_core; /* size of processes participate in intra communications => should be equal to number of machines */ int inter_comm_size = (comm_size + num_core - 1) / num_core; /* copy input buffer to output buffer */ smpi_mpi_sendrecv(send_buf, count, dtype, rank, tag, recv_buf, count, dtype, rank, tag, comm, &status); /* start binomial reduce intra communication inside each SMP node */ mask = 1; while (mask < num_core) { if ((mask & intra_rank) == 0) { src = (inter_rank * num_core) + (intra_rank | mask); if (src < comm_size) { smpi_mpi_recv(tmp_buf, count, dtype, src, tag, comm, &status); smpi_op_apply(op, tmp_buf, recv_buf, &count, &dtype); } } else { dst = (inter_rank * num_core) + (intra_rank & (~mask)); smpi_mpi_send(recv_buf, count, dtype, dst, tag, comm); break; } mask <<= 1; } /* end binomial reduce intra-communication */ /* start rdb (recursive 
doubling) all-reduce inter-communication between each SMP nodes : each node only have one process that can communicate to other nodes */ if (intra_rank == 0) { /* find nearest power-of-two less than or equal to inter_comm_size */ int pof2, rem, newrank, newdst; pof2 = 1; while (pof2 <= inter_comm_size) pof2 <<= 1; pof2 >>= 1; rem = inter_comm_size - pof2; /* In the non-power-of-two case, all even-numbered processes of rank < 2*rem send their data to (rank+1). These even-numbered processes no longer participate in the algorithm until the very end. */ if (inter_rank < 2 * rem) { if (inter_rank % 2 == 0) { dst = rank + num_core; smpi_mpi_send(recv_buf, count, dtype, dst, tag, comm); newrank = -1; } else { src = rank - num_core; smpi_mpi_recv(tmp_buf, count, dtype, src, tag, comm, &status); smpi_op_apply(op, tmp_buf, recv_buf, &count, &dtype); newrank = inter_rank / 2; } } else { newrank = inter_rank - rem; } /* example inter-communication RDB rank change algorithm 0,4,8,12..36 <= true rank (assume 4 core per SMP) 0123 4567 89 <= inter_rank 1 3 4567 89 (1,3 got data from 0,2 : 0,2 will be idle until the end) 0 1 4567 89 0 1 2345 67 => newrank */ if (newrank != -1) { mask = 1; while (mask < pof2) { newdst = newrank ^ mask; /* find real rank of dest */ dst = (newdst < rem) ? newdst * 2 + 1 : newdst + rem; dst *= num_core; /* exchange data in rdb manner */ smpi_mpi_sendrecv(recv_buf, count, dtype, dst, tag, tmp_buf, count, dtype, dst, tag, comm, &status); smpi_op_apply(op, tmp_buf, recv_buf, &count, &dtype); mask <<= 1; } } /* non pof2 case left-over processes (all even ranks: < 2 * rem) get the result */ if (inter_rank < 2 * rem) { if (inter_rank % 2) { smpi_mpi_send(recv_buf, count, dtype, rank - num_core, tag, comm); } else { smpi_mpi_recv(recv_buf, count, dtype, rank + num_core, tag, comm, &status); } } }