예제 #1
0
int Coll_reduce_mvapich2_two_level::reduce( const void *sendbuf,
                                     void *recvbuf,
                                     int count,
                                     MPI_Datatype datatype,
                                     MPI_Op op,
                                     int root,
                                     MPI_Comm comm)
{
    int mpi_errno = MPI_SUCCESS;
    int my_rank, total_size, local_rank, local_size;
    int leader_comm_rank = -1, leader_comm_size = 0;
    MPI_Comm shmem_comm, leader_comm;
    int leader_root, leader_of_root;
    const unsigned char* in_buf = nullptr;
    unsigned char *out_buf = nullptr, *tmp_buf = nullptr;
    MPI_Aint true_lb, true_extent, extent;
    int is_commutative = 0, stride = 0;
    int intra_node_root=0;

    //if not set (use of the algo directly, without mvapich2 selector)
    if(MV2_Reduce_function==NULL)
      MV2_Reduce_function=Coll_reduce_mpich::reduce;
    if(MV2_Reduce_intra_function==NULL)
      MV2_Reduce_intra_function=Coll_reduce_mpich::reduce;

    if(comm->get_leaders_comm()==MPI_COMM_NULL){
      comm->init_smp();
    }

    my_rank = comm->rank();
    total_size = comm->size();
    shmem_comm = comm->get_intra_comm();
    local_rank = shmem_comm->rank();
    local_size = shmem_comm->size();

    leader_comm = comm->get_leaders_comm();
    int* leaders_map = comm->get_leaders_map();
    leader_of_root = comm->group()->rank(leaders_map[root]);
    leader_root = leader_comm->group()->rank(leaders_map[root]);

    is_commutative= (op==MPI_OP_NULL || op->is_commutative());

    datatype->extent(&true_lb,
                                       &true_extent);
    extent =datatype->get_extent();
    stride = count * std::max(extent, true_extent);

    if (local_size == total_size) {
        /* First handle the case where there is only one node */
        if (stride <= MV2_INTRA_SHMEM_REDUCE_MSG &&
            is_commutative == 1) {
            if (local_rank == 0 ) {
              tmp_buf = smpi_get_tmp_sendbuffer(count * std::max(extent, true_extent));
              tmp_buf = tmp_buf - true_lb;
            }

            if (sendbuf != MPI_IN_PLACE) {
              in_buf = static_cast<const unsigned char*>(sendbuf);
            } else {
              in_buf = static_cast<const unsigned char*>(recvbuf);
            }

            if (local_rank == 0) {
                 if( my_rank != root) {
                     out_buf = tmp_buf;
                 } else {
                   out_buf = static_cast<unsigned char*>(recvbuf);
                   if (in_buf == out_buf) {
                     in_buf  = static_cast<const unsigned char*>(MPI_IN_PLACE);
                     out_buf = static_cast<unsigned char*>(recvbuf);
                     }
                 }
            } else {
              in_buf  = static_cast<const unsigned char*>(sendbuf);
              out_buf = nullptr;
            }

            if (count * (std::max(extent, true_extent)) < SHMEM_COLL_BLOCK_SIZE) {
              mpi_errno = MPIR_Reduce_shmem_MV2(in_buf, out_buf, count, datatype, op, 0, shmem_comm);
            } else {
              mpi_errno = MPIR_Reduce_intra_knomial_wrapper_MV2(in_buf, out_buf, count, datatype, op, 0, shmem_comm);
            }

            if (local_rank == 0 && root != my_rank) {
                Request::send(out_buf, count, datatype, root,
                                         COLL_TAG_REDUCE+1, comm);
            }
            if ((local_rank != 0) && (root == my_rank)) {
                Request::recv(recvbuf, count, datatype,
                                         leader_of_root, COLL_TAG_REDUCE+1, comm,
                                         MPI_STATUS_IGNORE);
            }
        } else {
            if(mv2_use_knomial_reduce == 1) {
                reduce_fn = &MPIR_Reduce_intra_knomial_wrapper_MV2;
            } else {
                reduce_fn = &MPIR_Reduce_binomial_MV2;
            }
            mpi_errno = reduce_fn(sendbuf, recvbuf, count,
                                  datatype, op,
                                  root, comm);
        }
        /* We are done */
        if (tmp_buf != nullptr)
          smpi_free_tmp_buffer(tmp_buf + true_lb);
        goto fn_exit;
    }


    if (local_rank == 0) {
        leader_comm = comm->get_leaders_comm();
        if(leader_comm==MPI_COMM_NULL){
          leader_comm = MPI_COMM_WORLD;
        }
        leader_comm_size = leader_comm->size();
        leader_comm_rank = leader_comm->rank();
        tmp_buf          = smpi_get_tmp_sendbuffer(count * std::max(extent, true_extent));
        tmp_buf          = tmp_buf - true_lb;
    }
    if (sendbuf != MPI_IN_PLACE) {
      in_buf = static_cast<const unsigned char*>(sendbuf);
    } else {
      in_buf = static_cast<const unsigned char*>(recvbuf);
    }
    if (local_rank == 0) {
      out_buf = static_cast<unsigned char*>(tmp_buf);
    } else {
      out_buf = nullptr;
    }


    if(local_size > 1) {
        /* Lets do the intra-node reduce operations, if we have more than one
         * process in the node */

        /*Fix the input and outbuf buffers for the intra-node reduce.
         *Node leaders will have the reduced data in tmp_buf after
         *this step*/
        if (MV2_Reduce_intra_function == & MPIR_Reduce_shmem_MV2)
        {
          if (is_commutative == 1 && (count * (std::max(extent, true_extent)) < SHMEM_COLL_BLOCK_SIZE)) {
            mpi_errno = MV2_Reduce_intra_function(in_buf, out_buf, count, datatype, op, intra_node_root, shmem_comm);
            } else {
                    mpi_errno = MPIR_Reduce_intra_knomial_wrapper_MV2(in_buf, out_buf, count,
                                      datatype, op,
                                      intra_node_root, shmem_comm);
            }
        } else {

            mpi_errno = MV2_Reduce_intra_function(in_buf, out_buf, count,
                                      datatype, op,
                                      intra_node_root, shmem_comm);
        }
    } else {
      smpi_free_tmp_buffer(tmp_buf + true_lb);
      tmp_buf = (unsigned char*)in_buf; // xxx
    }

    /* Now work on the inter-leader phase. Data is in tmp_buf */
    if (local_rank == 0 && leader_comm_size > 1) {
        /*The leader of root will have the global reduced data in tmp_buf
           or recv_buf
           at the end of the reduce */
        if (leader_comm_rank == leader_root) {
            if (my_rank == root) {
                /* I am the root of the leader-comm, and the
                 * root of the reduce op. So, I will write the
                 * final result directly into my recvbuf */
                if(tmp_buf != recvbuf) {
                  in_buf  = tmp_buf;
                  out_buf = static_cast<unsigned char*>(recvbuf);
                } else {

                  unsigned char* buf = smpi_get_tmp_sendbuffer(count * datatype->get_extent());
                  Datatype::copy(tmp_buf, count, datatype, buf, count, datatype);
                  // in_buf = MPI_IN_PLACE;
                  in_buf  = buf;
                  out_buf = static_cast<unsigned char*>(recvbuf);
                }
            } else {
              unsigned char* buf = smpi_get_tmp_sendbuffer(count * datatype->get_extent());
              Datatype::copy(tmp_buf, count, datatype, buf, count, datatype);
              // in_buf = MPI_IN_PLACE;
              in_buf  = buf;
              out_buf = tmp_buf;
            }
        } else {
            in_buf = tmp_buf;
            out_buf = nullptr;
        }

        /* inter-leader communication  */
        mpi_errno = MV2_Reduce_function(in_buf, out_buf, count,
                              datatype, op,
                              leader_root, leader_comm);

    }

    if (local_size > 1) {
      /* Send the message to the root if the leader is not the
       * root of the reduce operation. The reduced data is in tmp_buf */
      if ((local_rank == 0) && (root != my_rank) && (leader_root == leader_comm_rank)) {
        Request::send(tmp_buf, count, datatype, root, COLL_TAG_REDUCE + 1, comm);
      }
      if ((local_rank != 0) && (root == my_rank)) {
        Request::recv(recvbuf, count, datatype, leader_of_root, COLL_TAG_REDUCE + 1, comm, MPI_STATUS_IGNORE);
      }
      smpi_free_tmp_buffer(tmp_buf + true_lb);

      if (leader_comm_rank == leader_root) {
        if (my_rank != root || (my_rank == root && tmp_buf == recvbuf)) {
          smpi_free_tmp_buffer(in_buf);
        }
      }
    }



  fn_exit:
    return mpi_errno;
}
예제 #2
0
int Coll_scatter_mvapich2_two_level_direct::scatter(void *sendbuf,
                                      int sendcnt,
                                      MPI_Datatype sendtype,
                                      void *recvbuf,
                                      int recvcnt,
                                      MPI_Datatype recvtype,
                                      int root, MPI_Comm  comm)
{
    int comm_size, rank;
    int local_rank, local_size;
    int leader_comm_rank = -1, leader_comm_size = -1;
    int mpi_errno = MPI_SUCCESS;
    int recvtype_size, sendtype_size, nbytes;
    void *tmp_buf = NULL;
    void *leader_scatter_buf = NULL;
    MPI_Status status;
    int leader_root, leader_of_root = -1;
    MPI_Comm shmem_comm, leader_comm;
    //if not set (use of the algo directly, without mvapich2 selector)
    if(MV2_Scatter_intra_function==NULL)
      MV2_Scatter_intra_function=Coll_scatter_mpich::scatter;

    if(comm->get_leaders_comm()==MPI_COMM_NULL){
      comm->init_smp();
    }
    comm_size = comm->size();
    rank = comm->rank();

    if (((rank == root) && (recvcnt == 0))
        || ((rank != root) && (sendcnt == 0))) {
        return MPI_SUCCESS;
    }

    /* extract the rank,size information for the intra-node
     * communicator */
    shmem_comm = comm->get_intra_comm();
    local_rank = shmem_comm->rank();
    local_size = shmem_comm->size();

    if (local_rank == 0) {
        /* Node leader. Extract the rank, size information for the leader
         * communicator */
        leader_comm = comm->get_leaders_comm();
        leader_comm_size = leader_comm->size();
        leader_comm_rank = leader_comm->rank();
    }

    if (local_size == comm_size) {
        /* purely intra-node scatter. Just use the direct algorithm and we are done */
        mpi_errno = MPIR_Scatter_MV2_Direct(sendbuf, sendcnt, sendtype,
                                            recvbuf, recvcnt, recvtype,
                                            root, comm);

    } else {
        recvtype_size=recvtype->size();
        sendtype_size=sendtype->size();

        if (rank == root) {
            nbytes = sendcnt * sendtype_size;
        } else {
            nbytes = recvcnt * recvtype_size;
        }

        if (local_rank == 0) {
            /* Node leader, allocate tmp_buffer */
            tmp_buf = smpi_get_tmp_sendbuffer(nbytes * local_size);
        }

        leader_comm = comm->get_leaders_comm();
        int* leaders_map = comm->get_leaders_map();
        leader_of_root = comm->group()->rank(leaders_map[root]);
        leader_root = leader_comm->group()->rank(leaders_map[root]);
        /* leader_root is the rank of the leader of the root in leader_comm.
         * leader_root is to be used as the root of the inter-leader gather ops
         */

        if ((local_rank == 0) && (root != rank)
            && (leader_of_root == rank)) {
            /* The root of the scatter operation is not the node leader. Recv
             * data from the node leader */
            leader_scatter_buf = smpi_get_tmp_sendbuffer(nbytes * comm_size);
            Request::recv(leader_scatter_buf, nbytes * comm_size, MPI_BYTE,
                             root, COLL_TAG_SCATTER, comm, &status);

        }

        if (rank == root && local_rank != 0) {
            /* The root of the scatter operation is not the node leader. Send
             * data to the node leader */
            Request::send(sendbuf, sendcnt * comm_size, sendtype,
                                     leader_of_root, COLL_TAG_SCATTER, comm
                                     );
        }

        if (leader_comm_size > 1 && local_rank == 0) {
          if (not comm->is_uniform()) {
            int* displs   = NULL;
            int* sendcnts = NULL;
            int* node_sizes;
            int i      = 0;
            node_sizes = comm->get_non_uniform_map();

            if (root != leader_of_root) {
              if (leader_comm_rank == leader_root) {
                displs      = static_cast<int*>(xbt_malloc(sizeof(int) * leader_comm_size));
                sendcnts    = static_cast<int*>(xbt_malloc(sizeof(int) * leader_comm_size));
                sendcnts[0] = node_sizes[0] * nbytes;
                displs[0]   = 0;

                for (i = 1; i < leader_comm_size; i++) {
                  displs[i]   = displs[i - 1] + node_sizes[i - 1] * nbytes;
                  sendcnts[i] = node_sizes[i] * nbytes;
                }
              }
              Colls::scatterv(leader_scatter_buf, sendcnts, displs, MPI_BYTE, tmp_buf, nbytes * local_size, MPI_BYTE,
                              leader_root, leader_comm);
            } else {
              if (leader_comm_rank == leader_root) {
                displs      = static_cast<int*>(xbt_malloc(sizeof(int) * leader_comm_size));
                sendcnts    = static_cast<int*>(xbt_malloc(sizeof(int) * leader_comm_size));
                sendcnts[0] = node_sizes[0] * sendcnt;
                displs[0]   = 0;

                for (i = 1; i < leader_comm_size; i++) {
                  displs[i]   = displs[i - 1] + node_sizes[i - 1] * sendcnt;
                  sendcnts[i] = node_sizes[i] * sendcnt;
                }
              }
              Colls::scatterv(sendbuf, sendcnts, displs, sendtype, tmp_buf, nbytes * local_size, MPI_BYTE, leader_root,
                              leader_comm);
            }
            if (leader_comm_rank == leader_root) {
              xbt_free(displs);
              xbt_free(sendcnts);
            }
            } else {
                if (leader_of_root != root) {
                    mpi_errno =
                        MPIR_Scatter_MV2_Direct(leader_scatter_buf,
                                                nbytes * local_size, MPI_BYTE,
                                                tmp_buf, nbytes * local_size,
                                                MPI_BYTE, leader_root,
                                                leader_comm);
                } else {
                    mpi_errno =
                        MPIR_Scatter_MV2_Direct(sendbuf, sendcnt * local_size,
                                                sendtype, tmp_buf,
                                                nbytes * local_size, MPI_BYTE,
                                                leader_root, leader_comm);

                }
            }
        }
        /* The leaders are now done with the inter-leader part. Scatter the data within the nodes */

        if (rank == root && recvbuf == MPI_IN_PLACE) {
            mpi_errno = MV2_Scatter_intra_function(tmp_buf, nbytes, MPI_BYTE,
                                                (void *)sendbuf, sendcnt, sendtype,
                                                0, shmem_comm);
        } else {
            mpi_errno = MV2_Scatter_intra_function(tmp_buf, nbytes, MPI_BYTE,
                                                recvbuf, recvcnt, recvtype,
                                                0, shmem_comm);
        }
    }

    /* check if multiple threads are calling this collective function */
    if (comm_size != local_size && local_rank == 0) {
        smpi_free_tmp_buffer(tmp_buf);
        if (leader_of_root == rank && root != rank) {
            smpi_free_tmp_buffer(leader_scatter_buf);
        }
    }
    return (mpi_errno);
}
예제 #3
0
int Coll_bcast_mvapich2_inter_node::bcast(void *buffer,
                                                 int count,
                                                 MPI_Datatype datatype,
                                                 int root,
                                                 MPI_Comm  comm)
{
    int rank;
    int mpi_errno = MPI_SUCCESS;
    MPI_Comm shmem_comm, leader_comm;
    int local_rank, local_size, global_rank = -1;
    int leader_root, leader_of_root;


    rank = comm->rank();
    //comm_size = comm->size();


    if (MV2_Bcast_function==NULL){
      MV2_Bcast_function=Coll_bcast_mpich::bcast;
    }

    if (MV2_Bcast_intra_node_function==NULL){
      MV2_Bcast_intra_node_function= Coll_bcast_mpich::bcast;
    }

    if(comm->get_leaders_comm()==MPI_COMM_NULL){
      comm->init_smp();
    }

    shmem_comm = comm->get_intra_comm();
    local_rank = shmem_comm->rank();
    local_size = shmem_comm->size();

    leader_comm = comm->get_leaders_comm();

    if ((local_rank == 0) && (local_size > 1)) {
      global_rank = leader_comm->rank();
    }

    int* leaders_map = comm->get_leaders_map();
    leader_of_root = comm->group()->rank(leaders_map[root]);
    leader_root = leader_comm->group()->rank(leaders_map[root]);


    if (local_size > 1) {
        if ((local_rank == 0) && (root != rank) && (leader_root == global_rank)) {
            Request::recv(buffer, count, datatype, root,
                                     COLL_TAG_BCAST, comm, MPI_STATUS_IGNORE);
        }
        if ((local_rank != 0) && (root == rank)) {
            Request::send(buffer, count, datatype,
                                     leader_of_root, COLL_TAG_BCAST, comm);
        }
    }
#if defined(_MCST_SUPPORT_)
    if (comm_ptr->ch.is_mcast_ok) {
        mpi_errno = MPIR_Mcast_inter_node_MV2(buffer, count, datatype, root, comm_ptr,
                                              errflag);
        if (mpi_errno == MPI_SUCCESS) {
            goto fn_exit;
        } else {
            goto fn_fail;
        }
    }
#endif
/*
    if (local_rank == 0) {
        leader_comm = comm->get_leaders_comm();
        root = leader_root;
    }

    if (MV2_Bcast_function == &MPIR_Pipelined_Bcast_MV2) {
        mpi_errno = MPIR_Pipelined_Bcast_MV2(buffer, count, datatype,
                                             root, comm);
    } else if (MV2_Bcast_function == &MPIR_Bcast_scatter_ring_allgather_shm_MV2) {
        mpi_errno = MPIR_Bcast_scatter_ring_allgather_shm_MV2(buffer, count,
                                                              datatype, root,
                                                              comm);
    } else */{
        if (local_rank == 0) {
      /*      if (MV2_Bcast_function == &MPIR_Knomial_Bcast_inter_node_wrapper_MV2) {
                mpi_errno = MPIR_Knomial_Bcast_inter_node_wrapper_MV2(buffer, count,
                                                              datatype, root,
                                                              comm);
            } else {*/
                mpi_errno = MV2_Bcast_function(buffer, count, datatype,
                                               leader_root, leader_comm);
          //  }
        }
    }

    return mpi_errno;
}