int Coll_allgather_mvapich2_smp::allgather(void *sendbuf,int sendcnt, MPI_Datatype sendtype, void *recvbuf, int recvcnt,MPI_Datatype recvtype, MPI_Comm comm) { int rank, size; int local_rank, local_size; int leader_comm_size = 0; int mpi_errno = MPI_SUCCESS; MPI_Aint recvtype_extent = 0; /* Datatype extent */ MPI_Comm shmem_comm, leader_comm; if(comm->get_leaders_comm()==MPI_COMM_NULL){ comm->init_smp(); } if (not comm->is_uniform() || not comm->is_blocked()) THROWF(arg_error,0, "allgather MVAPICH2 smp algorithm can't be used with irregular deployment. Please insure that processes deployed on the same node are contiguous and that each node has the same number of processes"); if (recvcnt == 0) { return MPI_SUCCESS; } rank = comm->rank(); size = comm->size(); /* extract the rank,size information for the intra-node communicator */ recvtype_extent=recvtype->get_extent(); shmem_comm = comm->get_intra_comm(); local_rank = shmem_comm->rank(); local_size = shmem_comm->size(); if (local_rank == 0) { /* Node leader. 
Extract the rank, size information for the leader communicator */ leader_comm = comm->get_leaders_comm(); if(leader_comm==MPI_COMM_NULL){ leader_comm = MPI_COMM_WORLD; } leader_comm_size = leader_comm->size(); } /*If there is just one node, after gather itself, * root has all the data and it can do bcast*/ if(local_rank == 0) { mpi_errno = Colls::gather(sendbuf, sendcnt,sendtype, (void*)((char*)recvbuf + (rank * recvcnt * recvtype_extent)), recvcnt, recvtype, 0, shmem_comm); } else { /*Since in allgather all the processes could have * its own data in place*/ if(sendbuf == MPI_IN_PLACE) { mpi_errno = Colls::gather((void*)((char*)recvbuf + (rank * recvcnt * recvtype_extent)), recvcnt , recvtype, recvbuf, recvcnt, recvtype, 0, shmem_comm); } else { mpi_errno = Colls::gather(sendbuf, sendcnt,sendtype, recvbuf, recvcnt, recvtype, 0, shmem_comm); } } /* Exchange the data between the node leaders*/ if (local_rank == 0 && (leader_comm_size > 1)) { /*When data in each socket is different*/ if (comm->is_uniform() != 1) { int *displs = NULL; int *recvcnts = NULL; int *node_sizes = NULL; int i = 0; node_sizes = comm->get_non_uniform_map(); displs = static_cast<int *>(xbt_malloc(sizeof (int) * leader_comm_size)); recvcnts = static_cast<int *>(xbt_malloc(sizeof (int) * leader_comm_size)); if (not displs || not recvcnts) { return MPI_ERR_OTHER; } recvcnts[0] = node_sizes[0] * recvcnt; displs[0] = 0; for (i = 1; i < leader_comm_size; i++) { displs[i] = displs[i - 1] + node_sizes[i - 1] * recvcnt; recvcnts[i] = node_sizes[i] * recvcnt; } void* sendbuf=((char*)recvbuf)+recvtype->get_extent()*displs[leader_comm->rank()]; mpi_errno = Colls::allgatherv(sendbuf, (recvcnt*local_size), recvtype, recvbuf, recvcnts, displs, recvtype, leader_comm); xbt_free(displs); xbt_free(recvcnts); } else { void* sendtmpbuf=((char*)recvbuf)+recvtype->get_extent()*(recvcnt*local_size)*leader_comm->rank(); mpi_errno = Coll_allgather_mpich::allgather(sendtmpbuf, (recvcnt*local_size), recvtype, recvbuf, 
(recvcnt*local_size), recvtype, leader_comm); } } /*Bcast the entire data from node leaders to all other cores*/ mpi_errno = Colls::bcast (recvbuf, recvcnt * size, recvtype, 0, shmem_comm); return mpi_errno; }
/**
 * Two-level scatter (MVAPICH2 "direct" variant):
 *  1) if the root is not a node leader, it ships its whole send buffer to its leader;
 *  2) the leaders scatter among themselves (scatterv when the node sizes differ,
 *     direct scatter otherwise) into a per-node temporary buffer;
 *  3) each leader scatters its temporary buffer inside its node via
 *     MV2_Scatter_intra_function.
 * Interface matches MPI_Scatter; only 'root' provides sendbuf/sendcnt/sendtype.
 */
int Coll_scatter_mvapich2_two_level_direct::scatter(void *sendbuf, int sendcnt, MPI_Datatype sendtype,
                                                    void *recvbuf, int recvcnt, MPI_Datatype recvtype,
                                                    int root, MPI_Comm comm)
{
  int comm_size, rank;
  int local_rank, local_size;
  int leader_comm_rank = -1, leader_comm_size = -1;
  int mpi_errno = MPI_SUCCESS;
  int recvtype_size, sendtype_size, nbytes;
  void *tmp_buf = NULL;           // per-node staging buffer, allocated by leaders only
  void *leader_scatter_buf = NULL; // full-size buffer used when root is not a leader
  MPI_Status status;
  int leader_root, leader_of_root = -1;
  MPI_Comm shmem_comm, leader_comm;

  //if not set (use of the algo directly, without mvapich2 selector)
  if (MV2_Scatter_intra_function == NULL)
    MV2_Scatter_intra_function = Coll_scatter_mpich::scatter;

  // Lazily build the intra-node / leader communicators if needed.
  if (comm->get_leaders_comm() == MPI_COMM_NULL) {
    comm->init_smp();
  }
  comm_size = comm->size();
  rank = comm->rank();

  // Nothing to do when the caller-relevant count is zero.
  if (((rank == root) && (recvcnt == 0)) || ((rank != root) && (sendcnt == 0))) {
    return MPI_SUCCESS;
  }

  /* extract the rank,size information for the intra-node communicator */
  shmem_comm = comm->get_intra_comm();
  local_rank = shmem_comm->rank();
  local_size = shmem_comm->size();

  if (local_rank == 0) {
    /* Node leader. Extract the rank, size information for the leader communicator */
    leader_comm = comm->get_leaders_comm();
    leader_comm_size = leader_comm->size();
    leader_comm_rank = leader_comm->rank();
  }

  if (local_size == comm_size) {
    /* purely intra-node scatter. Just use the direct algorithm and we are done */
    mpi_errno = MPIR_Scatter_MV2_Direct(sendbuf, sendcnt, sendtype, recvbuf, recvcnt, recvtype, root, comm);
  } else {
    recvtype_size = recvtype->size();
    sendtype_size = sendtype->size();

    // Per-process payload in bytes; only root knows the send signature.
    if (rank == root) {
      nbytes = sendcnt * sendtype_size;
    } else {
      nbytes = recvcnt * recvtype_size;
    }

    if (local_rank == 0) {
      /* Node leader, allocate tmp_buffer (one slot per local process) */
      tmp_buf = smpi_get_tmp_sendbuffer(nbytes * local_size);
    }

    leader_comm = comm->get_leaders_comm();
    int* leaders_map = comm->get_leaders_map();
    leader_of_root = comm->group()->rank(leaders_map[root]);
    leader_root = leader_comm->group()->rank(leaders_map[root]);
    /* leader_root is the rank of the leader of the root in leader_comm.
     * leader_root is to be used as the root of the inter-leader gather ops */

    if ((local_rank == 0) && (root != rank) && (leader_of_root == rank)) {
      /* The root of the scatter operation is not the node leader. Recv
       * data from the node leader */
      leader_scatter_buf = smpi_get_tmp_sendbuffer(nbytes * comm_size);
      Request::recv(leader_scatter_buf, nbytes * comm_size, MPI_BYTE,
                    root, COLL_TAG_SCATTER, comm, &status);
    }

    if (rank == root && local_rank != 0) {
      /* The root of the scatter operation is not the node leader. Send
       * data to the node leader */
      Request::send(sendbuf, sendcnt * comm_size, sendtype,
                    leader_of_root, COLL_TAG_SCATTER, comm);
    }

    // Inter-leader scatter: only leaders participate, and only when there
    // are at least two nodes.
    if (leader_comm_size > 1 && local_rank == 0) {
      if (not comm->is_uniform()) {
        // Node sizes differ: build counts/displacements (in bytes) on the
        // scatterv root only; non-root leaders pass NULL, which scatterv ignores.
        int* displs = NULL;
        int* sendcnts = NULL;
        int* node_sizes;
        int i = 0;
        node_sizes = comm->get_non_uniform_map();

        if (root != leader_of_root) {
          // Data arrived as raw bytes via leader_scatter_buf.
          if (leader_comm_rank == leader_root) {
            displs = static_cast<int*>(xbt_malloc(sizeof(int) * leader_comm_size));
            sendcnts = static_cast<int*>(xbt_malloc(sizeof(int) * leader_comm_size));
            sendcnts[0] = node_sizes[0] * nbytes;
            displs[0] = 0;
            for (i = 1; i < leader_comm_size; i++) {
              displs[i] = displs[i - 1] + node_sizes[i - 1] * nbytes;
              sendcnts[i] = node_sizes[i] * nbytes;
            }
          }
          Colls::scatterv(leader_scatter_buf, sendcnts, displs, MPI_BYTE,
                          tmp_buf, nbytes * local_size, MPI_BYTE, leader_root, leader_comm);
        } else {
          // Root is itself a leader: scatter straight from sendbuf, counts in
          // sendtype elements.
          if (leader_comm_rank == leader_root) {
            displs = static_cast<int*>(xbt_malloc(sizeof(int) * leader_comm_size));
            sendcnts = static_cast<int*>(xbt_malloc(sizeof(int) * leader_comm_size));
            sendcnts[0] = node_sizes[0] * sendcnt;
            displs[0] = 0;
            for (i = 1; i < leader_comm_size; i++) {
              displs[i] = displs[i - 1] + node_sizes[i - 1] * sendcnt;
              sendcnts[i] = node_sizes[i] * sendcnt;
            }
          }
          Colls::scatterv(sendbuf, sendcnts, displs, sendtype,
                          tmp_buf, nbytes * local_size, MPI_BYTE, leader_root, leader_comm);
        }
        if (leader_comm_rank == leader_root) {
          xbt_free(displs);
          xbt_free(sendcnts);
        }
      } else {
        // Uniform node sizes: plain direct scatter among leaders.
        if (leader_of_root != root) {
          mpi_errno = MPIR_Scatter_MV2_Direct(leader_scatter_buf, nbytes * local_size, MPI_BYTE,
                                              tmp_buf, nbytes * local_size, MPI_BYTE,
                                              leader_root, leader_comm);
        } else {
          mpi_errno = MPIR_Scatter_MV2_Direct(sendbuf, sendcnt * local_size, sendtype,
                                              tmp_buf, nbytes * local_size, MPI_BYTE,
                                              leader_root, leader_comm);
        }
      }
    }

    /* The leaders are now done with the inter-leader part. Scatter the data within the nodes */
    if (rank == root && recvbuf == MPI_IN_PLACE) {
      // MPI_IN_PLACE at root: results land back in sendbuf.
      mpi_errno = MV2_Scatter_intra_function(tmp_buf, nbytes, MPI_BYTE,
                                             (void *)sendbuf, sendcnt, sendtype, 0, shmem_comm);
    } else {
      mpi_errno = MV2_Scatter_intra_function(tmp_buf, nbytes, MPI_BYTE,
                                             recvbuf, recvcnt, recvtype, 0, shmem_comm);
    }
  }

  /* check if multiple threads are calling this collective function */
  // Free only what this rank allocated: tmp_buf on leaders, leader_scatter_buf
  // on a leader that stood in for a non-leader root.
  if (comm_size != local_size && local_rank == 0) {
    smpi_free_tmp_buffer(tmp_buf);
    if (leader_of_root == rank && root != rank) {
      smpi_free_tmp_buffer(leader_scatter_buf);
    }
  }

  return (mpi_errno);
}
int Coll_allgather_loosely_lr::allgather(const void *sbuf, int scount, MPI_Datatype stype, void *rbuf, int rcount, MPI_Datatype rtype, MPI_Comm comm) { int comm_size, rank; int tag = COLL_TAG_ALLGATHER; int i, j, send_offset, recv_offset; int intra_rank, inter_rank, inter_comm_size, intra_comm_size; int inter_dst, inter_src; comm_size = comm->size(); if(comm->get_leaders_comm()==MPI_COMM_NULL){ comm->init_smp(); } int num_core=1; if (comm->is_uniform()){ num_core = comm->get_intra_comm()->size(); } if(comm_size%num_core) THROWF(arg_error,0, "allgather loosely lr algorithm can't be used with non multiple of NUM_CORE=%d number of processes ! ",num_core); rank = comm->rank(); MPI_Aint rextent, sextent; rextent = rtype->get_extent(); sextent = stype->get_extent(); MPI_Request inter_rrequest; MPI_Request rrequest_array[128]; MPI_Request srequest_array[128]; MPI_Request inter_srequest_array[128]; int rrequest_count = 0; int srequest_count = 0; int inter_srequest_count = 0; MPI_Status status; intra_rank = rank % num_core; inter_rank = rank / num_core; inter_comm_size = (comm_size + num_core - 1) / num_core; intra_comm_size = num_core; int src_seg, dst_seg; //copy corresponding message from sbuf to rbuf recv_offset = rank * rextent * rcount; Request::sendrecv(sbuf, scount, stype, rank, tag, (char *)rbuf + recv_offset, rcount, rtype, rank, tag, comm, &status); int dst, src; int inter_send_offset, inter_recv_offset; rrequest_count = 0; srequest_count = 0; inter_srequest_count = 0; for (i = 0; i < inter_comm_size; i++) { // inter_communication inter_dst = (rank + intra_comm_size) % comm_size; inter_src = (rank - intra_comm_size + comm_size) % comm_size; src_seg = ((inter_rank - 1 - i + inter_comm_size) % inter_comm_size) * intra_comm_size + intra_rank; dst_seg = ((inter_rank - i + inter_comm_size) % inter_comm_size) * intra_comm_size + intra_rank; inter_send_offset = dst_seg * sextent * scount; inter_recv_offset = src_seg * rextent * rcount; for (j = 0; j < intra_comm_size; 
j++) { // inter communication if (intra_rank == j) { if (i != inter_comm_size - 1) { inter_rrequest = Request::irecv((char*)rbuf + inter_recv_offset, rcount, rtype, inter_src, tag, comm); inter_srequest_array[inter_srequest_count++] = Request::isend((char*)rbuf + inter_send_offset, scount, stype, inter_dst, tag, comm); } } //intra_communication src = inter_rank * intra_comm_size + j; dst = inter_rank * intra_comm_size + j; src_seg = ((inter_rank - i + inter_comm_size) % inter_comm_size) * intra_comm_size + j; dst_seg = ((inter_rank - i + inter_comm_size) % inter_comm_size) * intra_comm_size + intra_rank; send_offset = dst_seg * sextent * scount; recv_offset = src_seg * rextent * rcount; if (j != intra_rank) { rrequest_array[rrequest_count++] = Request::irecv((char *)rbuf + recv_offset, rcount, rtype, src, tag, comm); srequest_array[srequest_count++] = Request::isend((char *)rbuf + send_offset, scount, stype, dst, tag, comm); } } // intra loop // wait for inter communication to finish for these rounds (# of round equals num_core) if (i != inter_comm_size - 1) { Request::wait(&inter_rrequest, &status); } } //inter loop Request::waitall(rrequest_count, rrequest_array, MPI_STATUSES_IGNORE); Request::waitall(srequest_count, srequest_array, MPI_STATUSES_IGNORE); Request::waitall(inter_srequest_count, inter_srequest_array, MPI_STATUSES_IGNORE); return MPI_SUCCESS; }
int Coll_bcast_SMP_binary::bcast(void *buf, int count, MPI_Datatype datatype, int root, MPI_Comm comm) { int tag = COLL_TAG_BCAST; MPI_Status status; MPI_Request request; MPI_Request *request_array; MPI_Status *status_array; int rank, size; int i; MPI_Aint extent; extent = datatype->get_extent(); rank = comm->rank(); size = comm->size(); if(comm->get_leaders_comm()==MPI_COMM_NULL){ comm->init_smp(); } int host_num_core=1; if (comm->is_uniform()){ host_num_core = comm->get_intra_comm()->size(); }else{ //implementation buggy in this case return Coll_bcast_mpich::bcast( buf , count, datatype, root, comm); } int segment = bcast_SMP_binary_segment_byte / extent; int pipe_length = count / segment; int remainder = count % segment; int to_intra_left = (rank / host_num_core) * host_num_core + (rank % host_num_core) * 2 + 1; int to_intra_right = (rank / host_num_core) * host_num_core + (rank % host_num_core) * 2 + 2; int to_inter_left = ((rank / host_num_core) * 2 + 1) * host_num_core; int to_inter_right = ((rank / host_num_core) * 2 + 2) * host_num_core; int from_inter = (((rank / host_num_core) - 1) / 2) * host_num_core; int from_intra = (rank / host_num_core) * host_num_core + ((rank % host_num_core) - 1) / 2; int increment = segment * extent; int base = (rank / host_num_core) * host_num_core; int num_core = host_num_core; if (((rank / host_num_core) * host_num_core) == ((size / host_num_core) * host_num_core)) num_core = size - (rank / host_num_core) * host_num_core; // if root is not zero send to rank zero first if (root != 0) { if (rank == root) Request::send(buf, count, datatype, 0, tag, comm); else if (rank == 0) Request::recv(buf, count, datatype, root, tag, comm, &status); } // when a message is smaller than a block size => no pipeline if (count <= segment) { // case ROOT-of-each-SMP if (rank % host_num_core == 0) { // case ROOT if (rank == 0) { //printf("node %d left %d right %d\n",rank,to_inter_left,to_inter_right); if (to_inter_left < size) Request::send(buf, 
count, datatype, to_inter_left, tag, comm); if (to_inter_right < size) Request::send(buf, count, datatype, to_inter_right, tag, comm); if ((to_intra_left - base) < num_core) Request::send(buf, count, datatype, to_intra_left, tag, comm); if ((to_intra_right - base) < num_core) Request::send(buf, count, datatype, to_intra_right, tag, comm); } // case LEAVES ROOT-of-eash-SMP else if (to_inter_left >= size) { //printf("node %d from %d\n",rank,from_inter); request = Request::irecv(buf, count, datatype, from_inter, tag, comm); Request::wait(&request, &status); if ((to_intra_left - base) < num_core) Request::send(buf, count, datatype, to_intra_left, tag, comm); if ((to_intra_right - base) < num_core) Request::send(buf, count, datatype, to_intra_right, tag, comm); } // case INTERMEDIAT ROOT-of-each-SMP else { //printf("node %d left %d right %d from %d\n",rank,to_inter_left,to_inter_right,from_inter); request = Request::irecv(buf, count, datatype, from_inter, tag, comm); Request::wait(&request, &status); Request::send(buf, count, datatype, to_inter_left, tag, comm); if (to_inter_right < size) Request::send(buf, count, datatype, to_inter_right, tag, comm); if ((to_intra_left - base) < num_core) Request::send(buf, count, datatype, to_intra_left, tag, comm); if ((to_intra_right - base) < num_core) Request::send(buf, count, datatype, to_intra_right, tag, comm); } } // case non ROOT-of-each-SMP else { // case leaves if ((to_intra_left - base) >= num_core) { request = Request::irecv(buf, count, datatype, from_intra, tag, comm); Request::wait(&request, &status); } // case intermediate else { request = Request::irecv(buf, count, datatype, from_intra, tag, comm); Request::wait(&request, &status); Request::send(buf, count, datatype, to_intra_left, tag, comm); if ((to_intra_right - base) < num_core) Request::send(buf, count, datatype, to_intra_right, tag, comm); } } return MPI_SUCCESS; } // pipeline bcast else { request_array = (MPI_Request *) xbt_malloc((size + pipe_length) * 
sizeof(MPI_Request)); status_array = (MPI_Status *) xbt_malloc((size + pipe_length) * sizeof(MPI_Status)); // case ROOT-of-each-SMP if (rank % host_num_core == 0) { // case ROOT if (rank == 0) { for (i = 0; i < pipe_length; i++) { //printf("node %d left %d right %d\n",rank,to_inter_left,to_inter_right); if (to_inter_left < size) Request::send((char *) buf + (i * increment), segment, datatype, to_inter_left, (tag + i), comm); if (to_inter_right < size) Request::send((char *) buf + (i * increment), segment, datatype, to_inter_right, (tag + i), comm); if ((to_intra_left - base) < num_core) Request::send((char *) buf + (i * increment), segment, datatype, to_intra_left, (tag + i), comm); if ((to_intra_right - base) < num_core) Request::send((char *) buf + (i * increment), segment, datatype, to_intra_right, (tag + i), comm); } } // case LEAVES ROOT-of-eash-SMP else if (to_inter_left >= size) { //printf("node %d from %d\n",rank,from_inter); for (i = 0; i < pipe_length; i++) { request_array[i] = Request::irecv((char *) buf + (i * increment), segment, datatype, from_inter, (tag + i), comm); } for (i = 0; i < pipe_length; i++) { Request::wait(&request_array[i], &status); if ((to_intra_left - base) < num_core) Request::send((char *) buf + (i * increment), segment, datatype, to_intra_left, (tag + i), comm); if ((to_intra_right - base) < num_core) Request::send((char *) buf + (i * increment), segment, datatype, to_intra_right, (tag + i), comm); } } // case INTERMEDIAT ROOT-of-each-SMP else { //printf("node %d left %d right %d from %d\n",rank,to_inter_left,to_inter_right,from_inter); for (i = 0; i < pipe_length; i++) { request_array[i] = Request::irecv((char *) buf + (i * increment), segment, datatype, from_inter, (tag + i), comm); } for (i = 0; i < pipe_length; i++) { Request::wait(&request_array[i], &status); Request::send((char *) buf + (i * increment), segment, datatype, to_inter_left, (tag + i), comm); if (to_inter_right < size) Request::send((char *) buf + (i * increment), 
segment, datatype, to_inter_right, (tag + i), comm); if ((to_intra_left - base) < num_core) Request::send((char *) buf + (i * increment), segment, datatype, to_intra_left, (tag + i), comm); if ((to_intra_right - base) < num_core) Request::send((char *) buf + (i * increment), segment, datatype, to_intra_right, (tag + i), comm); } } } // case non-ROOT-of-each-SMP else { // case leaves if ((to_intra_left - base) >= num_core) { for (i = 0; i < pipe_length; i++) { request_array[i] = Request::irecv((char *) buf + (i * increment), segment, datatype, from_intra, (tag + i), comm); } Request::waitall((pipe_length), request_array, status_array); } // case intermediate else { for (i = 0; i < pipe_length; i++) { request_array[i] = Request::irecv((char *) buf + (i * increment), segment, datatype, from_intra, (tag + i), comm); } for (i = 0; i < pipe_length; i++) { Request::wait(&request_array[i], &status); Request::send((char *) buf + (i * increment), segment, datatype, to_intra_left, (tag + i), comm); if ((to_intra_right - base) < num_core) Request::send((char *) buf + (i * increment), segment, datatype, to_intra_right, (tag + i), comm); } } } free(request_array); free(status_array); } // when count is not divisible by block size, use default BCAST for the remainder if ((remainder != 0) && (count > segment)) { XBT_WARN("MPI_bcast_SMP_binary use default MPI_bcast."); Colls::bcast((char *) buf + (pipe_length * increment), remainder, datatype, root, comm); } return 1; }
int Coll_scatter_mvapich2::scatter(const void *sendbuf, int sendcnt, MPI_Datatype sendtype, void *recvbuf, int recvcnt, MPI_Datatype recvtype, int root, MPI_Comm comm) { int range = 0, range_threshold = 0, range_threshold_intra = 0; int mpi_errno = MPI_SUCCESS; // int mpi_errno_ret = MPI_SUCCESS; int rank, nbytes, comm_size; int partial_sub_ok = 0; int conf_index = 0; MPI_Comm shmem_comm; // MPID_Comm *shmem_commptr=NULL; if(mv2_scatter_thresholds_table==NULL) init_mv2_scatter_tables_stampede(); if(comm->get_leaders_comm()==MPI_COMM_NULL){ comm->init_smp(); } comm_size = comm->size(); rank = comm->rank(); if (rank == root) { int sendtype_size = sendtype->size(); nbytes = sendcnt * sendtype_size; } else { int recvtype_size = recvtype->size(); nbytes = recvcnt * recvtype_size; } // check if safe to use partial subscription mode if (comm->is_uniform()) { shmem_comm = comm->get_intra_comm(); if (mv2_scatter_table_ppn_conf[0] == -1) { // Indicating user defined tuning conf_index = 0; }else{ int local_size = shmem_comm->size(); int i = 0; do { if (local_size == mv2_scatter_table_ppn_conf[i]) { conf_index = i; partial_sub_ok = 1; break; } i++; } while(i < mv2_scatter_num_ppn_conf); } } if (partial_sub_ok != 1) { conf_index = 0; } /* Search for the corresponding system size inside the tuning table */ while ((range < (mv2_size_scatter_tuning_table[conf_index] - 1)) && (comm_size > mv2_scatter_thresholds_table[conf_index][range].numproc)) { range++; } /* Search for corresponding inter-leader function */ while ((range_threshold < (mv2_scatter_thresholds_table[conf_index][range].size_inter_table - 1)) && (nbytes > mv2_scatter_thresholds_table[conf_index][range].inter_leader[range_threshold].max) && (mv2_scatter_thresholds_table[conf_index][range].inter_leader[range_threshold].max != -1)) { range_threshold++; } /* Search for corresponding intra-node function */ while ((range_threshold_intra < (mv2_scatter_thresholds_table[conf_index][range].size_intra_table - 1)) && (nbytes > 
mv2_scatter_thresholds_table[conf_index][range].intra_node[range_threshold_intra].max) && (mv2_scatter_thresholds_table[conf_index][range].intra_node[range_threshold_intra].max != -1)) { range_threshold_intra++; } MV2_Scatter_function = mv2_scatter_thresholds_table[conf_index][range].inter_leader[range_threshold] .MV2_pt_Scatter_function; if(MV2_Scatter_function == &MPIR_Scatter_mcst_wrap_MV2) { #if defined(_MCST_SUPPORT_) if(comm->ch.is_mcast_ok == 1 && mv2_use_mcast_scatter == 1 && comm->ch.shmem_coll_ok == 1) { MV2_Scatter_function = &MPIR_Scatter_mcst_MV2; } else #endif /*#if defined(_MCST_SUPPORT_) */ { if(mv2_scatter_thresholds_table[conf_index][range].inter_leader[range_threshold + 1]. MV2_pt_Scatter_function != NULL) { MV2_Scatter_function = mv2_scatter_thresholds_table[conf_index][range].inter_leader[range_threshold + 1] .MV2_pt_Scatter_function; } else { /* Fallback! */ MV2_Scatter_function = &MPIR_Scatter_MV2_Binomial; } } } if( (MV2_Scatter_function == &MPIR_Scatter_MV2_two_level_Direct) || (MV2_Scatter_function == &MPIR_Scatter_MV2_two_level_Binomial)) { if( comm->is_blocked()) { MV2_Scatter_intra_function = mv2_scatter_thresholds_table[conf_index][range].intra_node[range_threshold_intra] .MV2_pt_Scatter_function; mpi_errno = MV2_Scatter_function(sendbuf, sendcnt, sendtype, recvbuf, recvcnt, recvtype, root, comm); } else { mpi_errno = MPIR_Scatter_MV2_Binomial(sendbuf, sendcnt, sendtype, recvbuf, recvcnt, recvtype, root, comm); } } else { mpi_errno = MV2_Scatter_function(sendbuf, sendcnt, sendtype, recvbuf, recvcnt, recvtype, root, comm); } return (mpi_errno); }
/**
 * SMP-aware NTS allgather:
 *  1) each node gathers its cores' contributions onto the node root via
 *     pairwise sendrecv;
 *  2) node roots circulate node-sized slabs on a logical ring
 *     (inter_comm_size - 1 steps, one distinct tag per step);
 *  3) each received slab is forwarded down a linear intra-node chain
 *     (rank -> rank+1) so every core ends with the full buffer.
 * Requires comm_size to be a multiple of the per-node core count; small
 * communicators fall back to the default allgather.
 */
int Coll_allgather_SMP_NTS::allgather(const void *sbuf, int scount, MPI_Datatype stype,
                                      void *rbuf, int rcount, MPI_Datatype rtype, MPI_Comm comm)
{
  int src, dst, comm_size, rank;
  comm_size = comm->size();
  rank = comm->rank();
  MPI_Aint rextent, sextent;
  rextent = rtype->get_extent();
  sextent = stype->get_extent();
  int tag = COLL_TAG_ALLGATHER;

  int i, send_offset, recv_offset;
  int intra_rank, inter_rank;

  // Lazily build the intra-node / leader communicators if needed.
  if (comm->get_leaders_comm() == MPI_COMM_NULL) {
    comm->init_smp();
  }
  int num_core = 1;
  if (comm->is_uniform()) {
    num_core = comm->get_intra_comm()->size();
  }

  intra_rank = rank % num_core;
  inter_rank = rank / num_core;
  int inter_comm_size = (comm_size + num_core - 1) / num_core;
  int num_core_in_current_smp = num_core;

  if (comm_size % num_core)
    THROWF(arg_error, 0,
           "allgather SMP NTS algorithm can't be used with non multiple of NUM_CORE=%d number of processes ! ",
           num_core);

  /* for too small number of processes, use default implementation */
  if (comm_size <= num_core) {
    XBT_WARN("MPI_allgather_SMP_NTS use default MPI_allgather.");
    Coll_allgather_default::allgather(sbuf, scount, stype, rbuf, rcount, rtype, comm);
    return MPI_SUCCESS;
  }

  // the last SMP node may have fewer number of running processes than all others
  if (inter_rank == (inter_comm_size - 1)) {
    num_core_in_current_smp = comm_size - (inter_rank * num_core);
  }

  //copy corresponding message from sbuf to rbuf
  recv_offset = rank * rextent * rcount;
  Request::sendrecv(sbuf, scount, stype, rank, tag,
                    ((char *) rbuf + recv_offset), rcount, rtype, rank, tag, comm, MPI_STATUS_IGNORE);

  //gather to root of each SMP
  // Round i: send own piece to the node-mate i positions ahead while receiving
  // the piece of the node-mate i positions behind (all within this node).
  for (i = 1; i < num_core_in_current_smp; i++) {
    dst = (inter_rank * num_core) + (intra_rank + i) % (num_core_in_current_smp);
    src = (inter_rank * num_core) + (intra_rank - i + num_core_in_current_smp) % (num_core_in_current_smp);
    recv_offset = src * rextent * rcount;
    Request::sendrecv(sbuf, scount, stype, dst, tag,
                      ((char *) rbuf + recv_offset), rcount, rtype, src, tag, comm, MPI_STATUS_IGNORE);
  }

  // INTER-SMP-ALLGATHER
  // Every root of each SMP node post INTER-Sendrecv, then do INTRA-Bcast for each receiving message
  // Use logical ring algorithm

  // root of each SMP
  if (intra_rank == 0) {
    MPI_Request* rrequest_array = new MPI_Request[inter_comm_size - 1];
    MPI_Request* srequest_array = new MPI_Request[inter_comm_size - 1];

    // Ring neighbors among node roots (each a multiple of num_core).
    src = ((inter_rank - 1 + inter_comm_size) % inter_comm_size) * num_core;
    dst = ((inter_rank + 1) % inter_comm_size) * num_core;

    // post all inter Irecv
    // Step i receives the slab of the node i+1 positions upstream; tag+i keeps
    // the ring steps from matching out of order.
    for (i = 0; i < inter_comm_size - 1; i++) {
      recv_offset = ((inter_rank - i - 1 + inter_comm_size) % inter_comm_size) * num_core * sextent * scount;
      rrequest_array[i] = Request::irecv((char *)rbuf + recv_offset, rcount * num_core, rtype,
                                         src, tag + i, comm);
    }

    // send first message (this node's own slab)
    send_offset = ((inter_rank + inter_comm_size) % inter_comm_size) * num_core * sextent * scount;
    srequest_array[0] = Request::isend((char *)rbuf + send_offset, scount * num_core, stype,
                                       dst, tag, comm);

    // loop : recv-inter , send-inter, send-intra (linear-bcast)
    for (i = 0; i < inter_comm_size - 2; i++) {
      recv_offset = ((inter_rank - i - 1 + inter_comm_size) % inter_comm_size) * num_core * sextent * scount;
      Request::wait(&rrequest_array[i], MPI_STATUS_IGNORE);
      // Forward the just-received slab to the next node on the ring...
      srequest_array[i + 1] = Request::isend((char *)rbuf + recv_offset, scount * num_core, stype,
                                             dst, tag + i + 1, comm);
      // ...and start the intra-node chain (rank+1 relays further).
      if (num_core_in_current_smp > 1) {
        Request::send((char *)rbuf + recv_offset, scount * num_core, stype,
                      (rank + 1), tag + i + 1, comm);
      }
    }

    // recv last message and send_intra
    // (here i == inter_comm_size - 2, left over from the loop above)
    recv_offset = ((inter_rank - i - 1 + inter_comm_size) % inter_comm_size) * num_core * sextent * scount;
    //recv_offset = ((inter_rank + 1) % inter_comm_size) * num_core * sextent * scount;
    //i=inter_comm_size-2;
    Request::wait(&rrequest_array[i], MPI_STATUS_IGNORE);
    if (num_core_in_current_smp > 1) {
      Request::send((char *)rbuf + recv_offset, scount * num_core, stype,
                    (rank + 1), tag + i + 1, comm);
    }

    Request::waitall(inter_comm_size - 1, srequest_array, MPI_STATUSES_IGNORE);
    delete[] rrequest_array;
    delete[] srequest_array;
  }
  // last rank of each SMP
  // End of the intra chain: receive each forwarded slab, forward nothing.
  else if (intra_rank == (num_core_in_current_smp - 1)) {
    for (i = 0; i < inter_comm_size - 1; i++) {
      recv_offset = ((inter_rank - i - 1 + inter_comm_size) % inter_comm_size) * num_core * sextent * scount;
      Request::recv((char *) rbuf + recv_offset, (rcount * num_core), rtype,
                    rank - 1, tag + i + 1, comm, MPI_STATUS_IGNORE);
    }
  }
  // intermediate rank of each SMP
  // Relay: receive each slab from rank-1 and pass it on to rank+1.
  else {
    for (i = 0; i < inter_comm_size - 1; i++) {
      recv_offset = ((inter_rank - i - 1 + inter_comm_size) % inter_comm_size) * num_core * sextent * scount;
      Request::recv((char *) rbuf + recv_offset, (rcount * num_core), rtype,
                    rank - 1, tag + i + 1, comm, MPI_STATUS_IGNORE);
      Request::send((char *) rbuf + recv_offset, (scount * num_core), stype,
                    (rank + 1), tag + i + 1, comm);
    }
  }

  return MPI_SUCCESS;
}