int Coll_scatter_mvapich2_two_level_direct::scatter(void *sendbuf,
                                      int sendcnt,
                                      MPI_Datatype sendtype,
                                      void *recvbuf,
                                      int recvcnt,
                                      MPI_Datatype recvtype,
                                      int root, MPI_Comm  comm)
{
    int comm_size, rank;
    int local_rank, local_size;
    int leader_comm_rank = -1, leader_comm_size = -1;
    int mpi_errno = MPI_SUCCESS;
    int recvtype_size, sendtype_size, nbytes;
    void *tmp_buf = NULL;
    void *leader_scatter_buf = NULL;
    MPI_Status status;
    int leader_root, leader_of_root = -1;
    MPI_Comm shmem_comm, leader_comm;
    // if not set (the algorithm was called directly, without the mvapich2 selector)
    if(MV2_Scatter_intra_function==NULL)
      MV2_Scatter_intra_function=Coll_scatter_mpich::scatter;

    if(comm->get_leaders_comm()==MPI_COMM_NULL){
      comm->init_smp();
    }
    comm_size = comm->size();
    rank = comm->rank();

    if (((rank == root) && (recvcnt == 0))
        || ((rank != root) && (sendcnt == 0))) {
        return MPI_SUCCESS;
    }

    /* extract the rank,size information for the intra-node
     * communicator */
    shmem_comm = comm->get_intra_comm();
    local_rank = shmem_comm->rank();
    local_size = shmem_comm->size();

    if (local_rank == 0) {
        /* Node leader. Extract the rank, size information for the leader
         * communicator */
        leader_comm = comm->get_leaders_comm();
        leader_comm_size = leader_comm->size();
        leader_comm_rank = leader_comm->rank();
    }

    if (local_size == comm_size) {
        /* purely intra-node scatter. Just use the direct algorithm and we are done */
        mpi_errno = MPIR_Scatter_MV2_Direct(sendbuf, sendcnt, sendtype,
                                            recvbuf, recvcnt, recvtype,
                                            root, comm);

    } else {
        recvtype_size=recvtype->size();
        sendtype_size=sendtype->size();

        if (rank == root) {
            nbytes = sendcnt * sendtype_size;
        } else {
            nbytes = recvcnt * recvtype_size;
        }

        if (local_rank == 0) {
            /* Node leader, allocate tmp_buffer */
            tmp_buf = smpi_get_tmp_sendbuffer(nbytes * local_size);
        }

        leader_comm = comm->get_leaders_comm();
        int* leaders_map = comm->get_leaders_map();
        leader_of_root = comm->group()->rank(leaders_map[root]);
        leader_root = leader_comm->group()->rank(leaders_map[root]);
        /* leader_root is the rank of the leader of the root in leader_comm.
         * It is used as the root of the inter-leader scatter ops.
         */
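        /* Illustrative example (assuming a blocked placement): with two nodes of
         * four processes each and root = 5, the leader of the root's node is
         * global rank 4, so leader_of_root = 4 and leader_root = 1 (its rank
         * inside leader_comm, whose members are ranks 0 and 4). */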

        if ((local_rank == 0) && (root != rank)
            && (leader_of_root == rank)) {
            /* This process is the node leader of the root's node, but not the
             * root itself. Recv the data from the root */
            leader_scatter_buf = smpi_get_tmp_sendbuffer(nbytes * comm_size);
            Request::recv(leader_scatter_buf, nbytes * comm_size, MPI_BYTE,
                             root, COLL_TAG_SCATTER, comm, &status);

        }

        if (rank == root && local_rank != 0) {
            /* The root of the scatter operation is not the node leader. Send
             * data to the node leader */
            Request::send(sendbuf, sendcnt * comm_size, sendtype,
                                     leader_of_root, COLL_TAG_SCATTER, comm
                                     );
        }

        if (leader_comm_size > 1 && local_rank == 0) {
          if (not comm->is_uniform()) {
            int* displs   = NULL;
            int* sendcnts = NULL;
            int* node_sizes;
            int i      = 0;
            node_sizes = comm->get_non_uniform_map();

            if (root != leader_of_root) {
              if (leader_comm_rank == leader_root) {
                displs      = static_cast<int*>(xbt_malloc(sizeof(int) * leader_comm_size));
                sendcnts    = static_cast<int*>(xbt_malloc(sizeof(int) * leader_comm_size));
                sendcnts[0] = node_sizes[0] * nbytes;
                displs[0]   = 0;

                for (i = 1; i < leader_comm_size; i++) {
                  displs[i]   = displs[i - 1] + node_sizes[i - 1] * nbytes;
                  sendcnts[i] = node_sizes[i] * nbytes;
                }
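                /* e.g. with node_sizes = {4, 2} and nbytes = 8, this yields
                 * sendcnts = {32, 16} and displs = {0, 32}: one contiguous
                 * byte block per node. */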
              }
              Colls::scatterv(leader_scatter_buf, sendcnts, displs, MPI_BYTE, tmp_buf, nbytes * local_size, MPI_BYTE,
                              leader_root, leader_comm);
            } else {
              if (leader_comm_rank == leader_root) {
                displs      = static_cast<int*>(xbt_malloc(sizeof(int) * leader_comm_size));
                sendcnts    = static_cast<int*>(xbt_malloc(sizeof(int) * leader_comm_size));
                sendcnts[0] = node_sizes[0] * sendcnt;
                displs[0]   = 0;

                for (i = 1; i < leader_comm_size; i++) {
                  displs[i]   = displs[i - 1] + node_sizes[i - 1] * sendcnt;
                  sendcnts[i] = node_sizes[i] * sendcnt;
                }
              }
              Colls::scatterv(sendbuf, sendcnts, displs, sendtype, tmp_buf, nbytes * local_size, MPI_BYTE, leader_root,
                              leader_comm);
            }
            if (leader_comm_rank == leader_root) {
              xbt_free(displs);
              xbt_free(sendcnts);
            }
          } else {
            if (leader_of_root != root) {
              mpi_errno = MPIR_Scatter_MV2_Direct(leader_scatter_buf, nbytes * local_size, MPI_BYTE,
                                                  tmp_buf, nbytes * local_size, MPI_BYTE,
                                                  leader_root, leader_comm);
            } else {
              mpi_errno = MPIR_Scatter_MV2_Direct(sendbuf, sendcnt * local_size, sendtype,
                                                  tmp_buf, nbytes * local_size, MPI_BYTE,
                                                  leader_root, leader_comm);
            }
          }
        }
        /* The leaders are now done with the inter-leader part. Scatter the data within the nodes */

        if (rank == root && recvbuf == MPI_IN_PLACE) {
            mpi_errno = MV2_Scatter_intra_function(tmp_buf, nbytes, MPI_BYTE,
                                                (void *)sendbuf, sendcnt, sendtype,
                                                0, shmem_comm);
        } else {
            mpi_errno = MV2_Scatter_intra_function(tmp_buf, nbytes, MPI_BYTE,
                                                recvbuf, recvcnt, recvtype,
                                                0, shmem_comm);
        }
    }

    /* Free the temporary buffers allocated by the node leaders */
    if (comm_size != local_size && local_rank == 0) {
        smpi_free_tmp_buffer(tmp_buf);
        if (leader_of_root == rank && root != rank) {
            smpi_free_tmp_buffer(leader_scatter_buf);
        }
    }
    return (mpi_errno);
}
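/* Two-level SMP allgather: (1) intra-node gather onto each node leader,
 * (2) allgather(v) of the per-node blocks among the leaders, and (3) an
 * intra-node bcast so that every process ends up with the full buffer.
 * It requires a blocked and uniform process placement (checked below). */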
int Coll_allgather_mvapich2_smp::allgather(void *sendbuf,int sendcnt, MPI_Datatype sendtype,
                            void *recvbuf, int recvcnt,MPI_Datatype recvtype,
                            MPI_Comm  comm)
{
    int rank, size;
    int local_rank, local_size;
    int leader_comm_size = 0;
    int mpi_errno = MPI_SUCCESS;
    MPI_Aint recvtype_extent = 0;  /* Datatype extent */
    MPI_Comm shmem_comm, leader_comm;

  if(comm->get_leaders_comm()==MPI_COMM_NULL){
    comm->init_smp();
  }

  if (not comm->is_uniform() || not comm->is_blocked())
    THROWF(arg_error, 0, "allgather MVAPICH2 smp algorithm can't be used with irregular deployment. Please ensure that processes deployed on the same node are contiguous and that each node has the same number of processes");

    if (recvcnt == 0) {
        return MPI_SUCCESS;
    }

    rank = comm->rank();
    size = comm->size();

    /* extract the rank,size information for the intra-node communicator */
    recvtype_extent=recvtype->get_extent();

    shmem_comm = comm->get_intra_comm();
    local_rank = shmem_comm->rank();
    local_size = shmem_comm->size();

    if (local_rank == 0) {
        /* Node leader. Extract the rank, size information for the leader communicator */
        leader_comm = comm->get_leaders_comm();
        if(leader_comm==MPI_COMM_NULL){
          leader_comm = MPI_COMM_WORLD;
        }
        leader_comm_size = leader_comm->size();
    }

    /* Intra-node gather onto the node leader. If there is just one node, the
     * leader already has all the data after this gather and the final bcast
     * completes the allgather. */
    if(local_rank == 0) {
        mpi_errno = Colls::gather(sendbuf, sendcnt,sendtype,
                                    (void*)((char*)recvbuf + (rank * recvcnt * recvtype_extent)),
                                     recvcnt, recvtype,
                                     0, shmem_comm);
    } else {
        /* In allgather, each process may already have its own data in place
         * (MPI_IN_PLACE) */
        if(sendbuf == MPI_IN_PLACE) {
            mpi_errno = Colls::gather((void*)((char*)recvbuf + (rank * recvcnt * recvtype_extent)),
                                         recvcnt , recvtype,
                                         recvbuf, recvcnt, recvtype,
                                         0, shmem_comm);
        } else {
            mpi_errno = Colls::gather(sendbuf, sendcnt,sendtype,
                                         recvbuf, recvcnt, recvtype,
                                         0, shmem_comm);
        }
    }
    /* Exchange the data between the node leaders*/
    if (local_rank == 0 && (leader_comm_size > 1)) {
        /* When the number of processes differs across nodes (non-uniform mapping) */
        if (comm->is_uniform() != 1) {

            int *displs = NULL;
            int *recvcnts = NULL;
            int *node_sizes = NULL;
            int i = 0;

            node_sizes = comm->get_non_uniform_map();

            displs =  static_cast<int *>(xbt_malloc(sizeof (int) * leader_comm_size));
            recvcnts =  static_cast<int *>(xbt_malloc(sizeof (int) * leader_comm_size));
            if (not displs || not recvcnts) {
              return MPI_ERR_OTHER;
            }
            recvcnts[0] = node_sizes[0] * recvcnt;
            displs[0] = 0;

            for (i = 1; i < leader_comm_size; i++) {
                displs[i] = displs[i - 1] + node_sizes[i - 1] * recvcnt;
                recvcnts[i] = node_sizes[i] * recvcnt;
            }
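            /* e.g. with node_sizes = {4, 2} and recvcnt = 3, this yields
             * recvcnts = {12, 6} and displs = {0, 12}: one contiguous block
             * per node leader. */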


            void* sendtmpbuf = ((char*)recvbuf) + recvtype->get_extent() * displs[leader_comm->rank()];

            mpi_errno = Colls::allgatherv(sendtmpbuf, (recvcnt * local_size), recvtype,
                                          recvbuf, recvcnts, displs, recvtype,
                                          leader_comm);
            xbt_free(displs);
            xbt_free(recvcnts);
        } else {
            void* sendtmpbuf = ((char*)recvbuf) + recvtype->get_extent() * (recvcnt * local_size) * leader_comm->rank();

            mpi_errno = Coll_allgather_mpich::allgather(sendtmpbuf, (recvcnt * local_size), recvtype,
                                                        recvbuf, (recvcnt * local_size), recvtype,
                                                        leader_comm);
        }
    }

    /*Bcast the entire data from node leaders to all other cores*/
    mpi_errno = Colls::bcast (recvbuf, recvcnt * size, recvtype, 0, shmem_comm);
    return mpi_errno;
}
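/* SMP-aware binary-tree bcast: ranks are grouped by host (host_num_core per
 * node); the per-node roots form an inter-node binary tree and each node root
 * then forwards along an intra-node binary tree. Messages larger than
 * bcast_SMP_binary_segment_byte are pipelined in fixed-size segments. */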
int Coll_bcast_SMP_binary::bcast(void *buf, int count,
                                     MPI_Datatype datatype, int root,
                                     MPI_Comm comm)
{
  int tag = COLL_TAG_BCAST;
  MPI_Status status;
  MPI_Request request;
  MPI_Request *request_array;
  MPI_Status *status_array;
  int rank, size;
  int i;
  MPI_Aint extent;
  extent = datatype->get_extent();

  rank = comm->rank();
  size = comm->size();
  if(comm->get_leaders_comm()==MPI_COMM_NULL){
    comm->init_smp();
  }
  int host_num_core=1;
  if (comm->is_uniform()){
    host_num_core = comm->get_intra_comm()->size();
  }else{
    //implementation buggy in this case
    return Coll_bcast_mpich::bcast( buf , count, datatype,
              root, comm);
  }

  int segment = bcast_SMP_binary_segment_byte / extent;
  int pipe_length = count / segment;
  int remainder = count % segment;
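  // Illustrative numbers (assumed segment size): with bcast_SMP_binary_segment_byte = 8192,
  // a 4-byte datatype and count = 5000, segment = 2048 elements, pipe_length = 2 and
  // remainder = 904, so two pipelined segments are sent plus a non-pipelined tail.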

  int to_intra_left = (rank / host_num_core) * host_num_core + (rank % host_num_core) * 2 + 1;
  int to_intra_right = (rank / host_num_core) * host_num_core + (rank % host_num_core) * 2 + 2;
  int to_inter_left = ((rank / host_num_core) * 2 + 1) * host_num_core;
  int to_inter_right = ((rank / host_num_core) * 2 + 2) * host_num_core;
  int from_inter = (((rank / host_num_core) - 1) / 2) * host_num_core;
  int from_intra = (rank / host_num_core) * host_num_core + ((rank % host_num_core) - 1) / 2;
  int increment = segment * extent;

  int base = (rank / host_num_core) * host_num_core;
  int num_core = host_num_core;
  if (((rank / host_num_core) * host_num_core) == ((size / host_num_core) * host_num_core))
    num_core = size - (rank / host_num_core) * host_num_core;
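  // Worked example: with host_num_core = 4 and rank = 4 (a node leader), base = 4,
  // the intra-node children are ranks 5 and 6, the inter-node children (if they
  // exist) are the leaders 12 and 16, and the inter-node parent is rank 0.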

  // if root is not zero send to rank zero first
  if (root != 0) {
    if (rank == root)
      Request::send(buf, count, datatype, 0, tag, comm);
    else if (rank == 0)
      Request::recv(buf, count, datatype, root, tag, comm, &status);
  }
  // when a message is smaller than a block size => no pipeline
  if (count <= segment) {
    // case ROOT-of-each-SMP
    if (rank % host_num_core == 0) {
      // case ROOT
      if (rank == 0) {
        //printf("node %d left %d right %d\n",rank,to_inter_left,to_inter_right);
        if (to_inter_left < size)
          Request::send(buf, count, datatype, to_inter_left, tag, comm);
        if (to_inter_right < size)
          Request::send(buf, count, datatype, to_inter_right, tag, comm);
        if ((to_intra_left - base) < num_core)
          Request::send(buf, count, datatype, to_intra_left, tag, comm);
        if ((to_intra_right - base) < num_core)
          Request::send(buf, count, datatype, to_intra_right, tag, comm);
      }
      // case LEAVES ROOT-of-each-SMP
      else if (to_inter_left >= size) {
        //printf("node %d from %d\n",rank,from_inter);
        request = Request::irecv(buf, count, datatype, from_inter, tag, comm);
        Request::wait(&request, &status);
        if ((to_intra_left - base) < num_core)
          Request::send(buf, count, datatype, to_intra_left, tag, comm);
        if ((to_intra_right - base) < num_core)
          Request::send(buf, count, datatype, to_intra_right, tag, comm);
      }
      // case INTERMEDIATE ROOT-of-each-SMP
      else {
        //printf("node %d left %d right %d from %d\n",rank,to_inter_left,to_inter_right,from_inter);
        request = Request::irecv(buf, count, datatype, from_inter, tag, comm);
        Request::wait(&request, &status);
        Request::send(buf, count, datatype, to_inter_left, tag, comm);
        if (to_inter_right < size)
          Request::send(buf, count, datatype, to_inter_right, tag, comm);
        if ((to_intra_left - base) < num_core)
          Request::send(buf, count, datatype, to_intra_left, tag, comm);
        if ((to_intra_right - base) < num_core)
          Request::send(buf, count, datatype, to_intra_right, tag, comm);
      }
    }
    // case non ROOT-of-each-SMP
    else {
      // case leaves
      if ((to_intra_left - base) >= num_core) {
        request = Request::irecv(buf, count, datatype, from_intra, tag, comm);
        Request::wait(&request, &status);
      }
      // case intermediate
      else {
        request = Request::irecv(buf, count, datatype, from_intra, tag, comm);
        Request::wait(&request, &status);
        Request::send(buf, count, datatype, to_intra_left, tag, comm);
        if ((to_intra_right - base) < num_core)
          Request::send(buf, count, datatype, to_intra_right, tag, comm);
      }
    }

    return MPI_SUCCESS;
  }

  // pipeline bcast
  else {
    request_array =
        (MPI_Request *) xbt_malloc((size + pipe_length) * sizeof(MPI_Request));
    status_array =
        (MPI_Status *) xbt_malloc((size + pipe_length) * sizeof(MPI_Status));

    // case ROOT-of-each-SMP
    if (rank % host_num_core == 0) {
      // case ROOT
      if (rank == 0) {
        for (i = 0; i < pipe_length; i++) {
          //printf("node %d left %d right %d\n",rank,to_inter_left,to_inter_right);
          if (to_inter_left < size)
            Request::send((char *) buf + (i * increment), segment, datatype,
                     to_inter_left, (tag + i), comm);
          if (to_inter_right < size)
            Request::send((char *) buf + (i * increment), segment, datatype,
                     to_inter_right, (tag + i), comm);
          if ((to_intra_left - base) < num_core)
            Request::send((char *) buf + (i * increment), segment, datatype,
                     to_intra_left, (tag + i), comm);
          if ((to_intra_right - base) < num_core)
            Request::send((char *) buf + (i * increment), segment, datatype,
                     to_intra_right, (tag + i), comm);
        }
      }
      // case LEAVES ROOT-of-each-SMP
      else if (to_inter_left >= size) {
        //printf("node %d from %d\n",rank,from_inter);
        for (i = 0; i < pipe_length; i++) {
          request_array[i] = Request::irecv((char *) buf + (i * increment), segment, datatype,
                    from_inter, (tag + i), comm);
        }
        for (i = 0; i < pipe_length; i++) {
          Request::wait(&request_array[i], &status);
          if ((to_intra_left - base) < num_core)
            Request::send((char *) buf + (i * increment), segment, datatype,
                     to_intra_left, (tag + i), comm);
          if ((to_intra_right - base) < num_core)
            Request::send((char *) buf + (i * increment), segment, datatype,
                     to_intra_right, (tag + i), comm);
        }
      }
      // case INTERMEDIATE ROOT-of-each-SMP
      else {
        //printf("node %d left %d right %d from %d\n",rank,to_inter_left,to_inter_right,from_inter);
        for (i = 0; i < pipe_length; i++) {
          request_array[i] = Request::irecv((char *) buf + (i * increment), segment, datatype,
                    from_inter, (tag + i), comm);
        }
        for (i = 0; i < pipe_length; i++) {
          Request::wait(&request_array[i], &status);
          Request::send((char *) buf + (i * increment), segment, datatype,
                   to_inter_left, (tag + i), comm);
          if (to_inter_right < size)
            Request::send((char *) buf + (i * increment), segment, datatype,
                     to_inter_right, (tag + i), comm);
          if ((to_intra_left - base) < num_core)
            Request::send((char *) buf + (i * increment), segment, datatype,
                     to_intra_left, (tag + i), comm);
          if ((to_intra_right - base) < num_core)
            Request::send((char *) buf + (i * increment), segment, datatype,
                     to_intra_right, (tag + i), comm);
        }
      }
    }
    // case non-ROOT-of-each-SMP
    else {
      // case leaves
      if ((to_intra_left - base) >= num_core) {
        for (i = 0; i < pipe_length; i++) {
          request_array[i] = Request::irecv((char *) buf + (i * increment), segment, datatype,
                    from_intra, (tag + i), comm);
        }
        Request::waitall((pipe_length), request_array, status_array);
      }
      // case intermediate
      else {
        for (i = 0; i < pipe_length; i++) {
          request_array[i] = Request::irecv((char *) buf + (i * increment), segment, datatype,
                    from_intra, (tag + i), comm);
        }
        for (i = 0; i < pipe_length; i++) {
          Request::wait(&request_array[i], &status);
          Request::send((char *) buf + (i * increment), segment, datatype,
                   to_intra_left, (tag + i), comm);
          if ((to_intra_right - base) < num_core)
            Request::send((char *) buf + (i * increment), segment, datatype,
                     to_intra_right, (tag + i), comm);
        }
      }
    }

    xbt_free(request_array);
    xbt_free(status_array);
  }

  // when count is not divisible by block size, use default BCAST for the remainder
  if ((remainder != 0) && (count > segment)) {
    XBT_WARN("MPI_bcast_SMP_binary uses default MPI_bcast for the remainder.");
    Colls::bcast((char *) buf + (pipe_length * increment), remainder, datatype,
              root, comm);
  }

  return MPI_SUCCESS;
}
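/* "Loosely synchronized" logical-ring allgather for SMP clusters: every process
 * first copies its own block into rbuf; then, for each of inter_comm_size rounds,
 * it shifts one node-sized group of blocks along an inter-node ring (same core
 * slot on the neighbouring node) while exchanging the blocks of the current round
 * with the other cores of its own node. comm_size must be a multiple of the
 * number of cores per node, and the request arrays are fixed at 128 entries. */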
int Coll_allgather_loosely_lr::allgather(const void *sbuf, int scount,
                                         MPI_Datatype stype, void *rbuf,
                                         int rcount, MPI_Datatype rtype,
                                         MPI_Comm comm)
{
  int comm_size, rank;
  int tag = COLL_TAG_ALLGATHER;
  int i, j, send_offset, recv_offset;
  int intra_rank, inter_rank, inter_comm_size, intra_comm_size;
  int inter_dst, inter_src;

  comm_size = comm->size();

if(comm->get_leaders_comm()==MPI_COMM_NULL){
    comm->init_smp();
  }
  int num_core=1;
  if (comm->is_uniform()){
    num_core = comm->get_intra_comm()->size();
  }

  if(comm_size%num_core)
    THROWF(arg_error, 0, "allgather loosely lr algorithm can't be used with a number of processes that is not a multiple of NUM_CORE=%d!", num_core);

  rank = comm->rank();
  MPI_Aint rextent, sextent;
  rextent = rtype->get_extent();
  sextent = stype->get_extent();
  MPI_Request inter_rrequest;
  MPI_Request rrequest_array[128];
  MPI_Request srequest_array[128];
  MPI_Request inter_srequest_array[128];


  int rrequest_count = 0;
  int srequest_count = 0;
  int inter_srequest_count = 0;

  MPI_Status status;

  intra_rank = rank % num_core;
  inter_rank = rank / num_core;
  inter_comm_size = (comm_size + num_core - 1) / num_core;
  intra_comm_size = num_core;
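  /* e.g. with 8 processes and num_core = 4: rank 6 has intra_rank = 2,
   * inter_rank = 1, and the algorithm performs inter_comm_size = 2 ring rounds. */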

  int src_seg, dst_seg;

  //copy corresponding message from sbuf to rbuf
  recv_offset = rank * rextent * rcount;
  Request::sendrecv(sbuf, scount, stype, rank, tag,
               (char *)rbuf + recv_offset, rcount, rtype, rank, tag, comm, &status);

  int dst, src;
  int inter_send_offset, inter_recv_offset;

  rrequest_count = 0;
  srequest_count = 0;
  inter_srequest_count = 0;

  for (i = 0; i < inter_comm_size; i++) {

    // inter_communication

    inter_dst = (rank + intra_comm_size) % comm_size;
    inter_src = (rank - intra_comm_size + comm_size) % comm_size;

    src_seg =
        ((inter_rank - 1 - i +
          inter_comm_size) % inter_comm_size) * intra_comm_size + intra_rank;
    dst_seg =
        ((inter_rank - i +
          inter_comm_size) % inter_comm_size) * intra_comm_size + intra_rank;

    inter_send_offset = dst_seg * sextent * scount;
    inter_recv_offset = src_seg * rextent * rcount;

    for (j = 0; j < intra_comm_size; j++) {

      // inter communication
      if (intra_rank == j) {
        if (i != inter_comm_size - 1) {

          inter_rrequest = Request::irecv((char*)rbuf + inter_recv_offset, rcount, rtype, inter_src, tag, comm);
          inter_srequest_array[inter_srequest_count++] =
              Request::isend((char*)rbuf + inter_send_offset, scount, stype, inter_dst, tag, comm);
        }
      }
      //intra_communication
      src = inter_rank * intra_comm_size + j;
      dst = inter_rank * intra_comm_size + j;

      src_seg =
          ((inter_rank - i +
            inter_comm_size) % inter_comm_size) * intra_comm_size + j;
      dst_seg =
          ((inter_rank - i +
            inter_comm_size) % inter_comm_size) * intra_comm_size + intra_rank;

      send_offset = dst_seg * sextent * scount;
      recv_offset = src_seg * rextent * rcount;


      if (j != intra_rank) {

        rrequest_array[rrequest_count++] = Request::irecv((char *)rbuf + recv_offset, rcount, rtype, src, tag, comm);
        srequest_array[srequest_count++] = Request::isend((char *)rbuf + send_offset, scount, stype, dst, tag, comm);

      }
    }                           // intra loop


    // wait for inter communication to finish for these rounds (# of rounds equals num_core)
    if (i != inter_comm_size - 1) {
      Request::wait(&inter_rrequest, &status);
    }

  }                             //inter loop

  Request::waitall(rrequest_count, rrequest_array, MPI_STATUSES_IGNORE);
  Request::waitall(srequest_count, srequest_array, MPI_STATUSES_IGNORE);
  Request::waitall(inter_srequest_count, inter_srequest_array, MPI_STATUSES_IGNORE);

  return MPI_SUCCESS;
}
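/* MVAPICH2 reduce selector: walks the Stampede-derived tuning tables to pick an
 * inter-leader and an intra-node reduce function based on the communicator size
 * and the message size (nbytes), then dispatches to a two-level, knomial,
 * reduce-scatter+gather or binomial implementation. */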
int Coll_reduce_mvapich2::reduce(const void *sendbuf,
    void *recvbuf,
    int count,
    MPI_Datatype datatype,
    MPI_Op op, int root, MPI_Comm comm)
{
  if(mv2_reduce_thresholds_table == NULL)
    init_mv2_reduce_tables_stampede();

  int mpi_errno = MPI_SUCCESS;
  int range = 0;
  int range_threshold = 0;
  int range_intra_threshold = 0;
  int is_commutative, pof2;
  int comm_size = 0;
  long nbytes = 0;
  int sendtype_size;
  int is_two_level = 0;

  comm_size = comm->size();
  sendtype_size=datatype->size();
  nbytes = count * sendtype_size;

  if (count == 0)
    return MPI_SUCCESS;

  is_commutative = (op==MPI_OP_NULL || op->is_commutative());

  /* find nearest power-of-two less than or equal to comm_size */
  for( pof2 = 1; pof2 <= comm_size; pof2 <<= 1 );
  pof2 >>=1;
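  /* e.g. comm_size = 6: the loop stops with pof2 = 8 and the shift brings it
   * back to 4, the largest power of two <= comm_size. */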


  /* Search for the corresponding system size inside the tuning table */
  while ((range < (mv2_size_reduce_tuning_table - 1)) &&
      (comm_size > mv2_reduce_thresholds_table[range].numproc)) {
      range++;
  }
  /* Search for corresponding inter-leader function */
  while ((range_threshold < (mv2_reduce_thresholds_table[range].size_inter_table - 1)) &&
         (nbytes > mv2_reduce_thresholds_table[range].inter_leader[range_threshold].max) &&
         (mv2_reduce_thresholds_table[range].inter_leader[range_threshold].max != -1)) {
      range_threshold++;
  }

  /* Search for corresponding intra node function */
  while ((range_intra_threshold < (mv2_reduce_thresholds_table[range].size_intra_table - 1)) &&
         (nbytes > mv2_reduce_thresholds_table[range].intra_node[range_intra_threshold].max) &&
         (mv2_reduce_thresholds_table[range].intra_node[range_intra_threshold].max != -1)) {
      range_intra_threshold++;
  }

  /* Set intra-node function pt for reduce_two_level */
  MV2_Reduce_intra_function =
      mv2_reduce_thresholds_table[range].intra_node[range_intra_threshold].
      MV2_pt_Reduce_function;
  /* Set inter-leader pt */
  MV2_Reduce_function =
      mv2_reduce_thresholds_table[range].inter_leader[range_threshold].
      MV2_pt_Reduce_function;

  if(mv2_reduce_intra_knomial_factor<0)
    {
      mv2_reduce_intra_knomial_factor = mv2_reduce_thresholds_table[range].intra_k_degree;
    }
  if(mv2_reduce_inter_knomial_factor<0)
    {
      mv2_reduce_inter_knomial_factor = mv2_reduce_thresholds_table[range].inter_k_degree;
    }
  if(mv2_reduce_thresholds_table[range].is_two_level_reduce[range_threshold] == 1){
      is_two_level = 1;
  }
  /* We call Reduce function */
  if (is_two_level == 1) {
      if (is_commutative == 1) {
          if (comm->get_leaders_comm() == MPI_COMM_NULL) {
              comm->init_smp();
          }
          mpi_errno = MPIR_Reduce_two_level_helper_MV2(sendbuf, recvbuf, count,
                                                       datatype, op, root, comm);
      } else {
          mpi_errno = MPIR_Reduce_binomial_MV2(sendbuf, recvbuf, count,
                                               datatype, op, root, comm);
      }
  } else if (MV2_Reduce_function == &MPIR_Reduce_inter_knomial_wrapper_MV2) {
      if (is_commutative == 1) {
          mpi_errno = MV2_Reduce_function(sendbuf, recvbuf, count,
                                          datatype, op, root, comm);
      } else {
          mpi_errno = MPIR_Reduce_binomial_MV2(sendbuf, recvbuf, count,
                                               datatype, op, root, comm);
      }
  } else if (MV2_Reduce_function == &MPIR_Reduce_redscat_gather_MV2) {
      if (/*(HANDLE_GET_KIND(op) == HANDLE_KIND_BUILTIN) &&*/ (count >= pof2)) {
          mpi_errno = MV2_Reduce_function(sendbuf, recvbuf, count,
                                          datatype, op, root, comm);
      } else {
          mpi_errno = MPIR_Reduce_binomial_MV2(sendbuf, recvbuf, count,
                                               datatype, op, root, comm);
      }
  } else {
      mpi_errno = MV2_Reduce_function(sendbuf, recvbuf, count,
                                      datatype, op, root, comm);
  }


  return mpi_errno;

}
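/* MVAPICH2 scatter selector: picks a processes-per-node configuration
 * (conf_index), then walks the tuning table to choose the inter-leader and
 * intra-node scatter functions; the two-level variants are only used when the
 * communicator is blocked, otherwise it falls back to a binomial scatter. */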
int Coll_scatter_mvapich2::scatter(const void *sendbuf,
    int sendcnt,
    MPI_Datatype sendtype,
    void *recvbuf,
    int recvcnt,
    MPI_Datatype recvtype,
    int root, MPI_Comm comm)
{
  int range = 0, range_threshold = 0, range_threshold_intra = 0;
  int mpi_errno = MPI_SUCCESS;
  //   int mpi_errno_ret = MPI_SUCCESS;
  int rank, nbytes, comm_size;
  int partial_sub_ok = 0;
  int conf_index = 0;
     MPI_Comm shmem_comm;
  //    MPID_Comm *shmem_commptr=NULL;
  if(mv2_scatter_thresholds_table==NULL)
    init_mv2_scatter_tables_stampede();

  if(comm->get_leaders_comm()==MPI_COMM_NULL){
    comm->init_smp();
  }

  comm_size = comm->size();

  rank = comm->rank();

  if (rank == root) {
    int sendtype_size = sendtype->size();
    nbytes            = sendcnt * sendtype_size;
  } else {
    int recvtype_size = recvtype->size();
    nbytes            = recvcnt * recvtype_size;
  }

    // check if safe to use partial subscription mode
    if (comm->is_uniform()) {

        shmem_comm = comm->get_intra_comm();
        if (mv2_scatter_table_ppn_conf[0] == -1) {
            // Indicating user defined tuning
            conf_index = 0;
        }else{
          int local_size = shmem_comm->size();
          int i          = 0;
            do {
                if (local_size == mv2_scatter_table_ppn_conf[i]) {
                    conf_index = i;
                    partial_sub_ok = 1;
                    break;
                }
                i++;
            } while(i < mv2_scatter_num_ppn_conf);
        }
    }

  if (partial_sub_ok != 1) {
      conf_index = 0;
  }

  /* Search for the corresponding system size inside the tuning table */
  while ((range < (mv2_size_scatter_tuning_table[conf_index] - 1)) &&
      (comm_size > mv2_scatter_thresholds_table[conf_index][range].numproc)) {
      range++;
  }
  /* Search for corresponding inter-leader function */
  while ((range_threshold < (mv2_scatter_thresholds_table[conf_index][range].size_inter_table - 1)) &&
         (nbytes > mv2_scatter_thresholds_table[conf_index][range].inter_leader[range_threshold].max) &&
         (mv2_scatter_thresholds_table[conf_index][range].inter_leader[range_threshold].max != -1)) {
      range_threshold++;
  }

  /* Search for corresponding intra-node function */
  while ((range_threshold_intra < (mv2_scatter_thresholds_table[conf_index][range].size_intra_table - 1)) &&
         (nbytes > mv2_scatter_thresholds_table[conf_index][range].intra_node[range_threshold_intra].max) &&
         (mv2_scatter_thresholds_table[conf_index][range].intra_node[range_threshold_intra].max != -1)) {
      range_threshold_intra++;
  }

  MV2_Scatter_function = mv2_scatter_thresholds_table[conf_index][range].inter_leader[range_threshold]
                                                                                      .MV2_pt_Scatter_function;

  if(MV2_Scatter_function == &MPIR_Scatter_mcst_wrap_MV2) {
#if defined(_MCST_SUPPORT_)
      if(comm->ch.is_mcast_ok == 1
          && mv2_use_mcast_scatter == 1
          && comm->ch.shmem_coll_ok == 1) {
          MV2_Scatter_function = &MPIR_Scatter_mcst_MV2;
      } else
#endif /*#if defined(_MCST_SUPPORT_) */
        {
          if(mv2_scatter_thresholds_table[conf_index][range].inter_leader[range_threshold + 1].
              MV2_pt_Scatter_function != NULL) {
              MV2_Scatter_function = mv2_scatter_thresholds_table[conf_index][range].inter_leader[range_threshold + 1]
                                                                                                  .MV2_pt_Scatter_function;
          } else {
              /* Fallback! */
              MV2_Scatter_function = &MPIR_Scatter_MV2_Binomial;
          }
        }
  }

  if ((MV2_Scatter_function == &MPIR_Scatter_MV2_two_level_Direct) ||
      (MV2_Scatter_function == &MPIR_Scatter_MV2_two_level_Binomial)) {
      if (comm->is_blocked()) {
          MV2_Scatter_intra_function =
              mv2_scatter_thresholds_table[conf_index][range].intra_node[range_threshold_intra].MV2_pt_Scatter_function;

          mpi_errno = MV2_Scatter_function(sendbuf, sendcnt, sendtype,
                                           recvbuf, recvcnt, recvtype, root, comm);
      } else {
          mpi_errno = MPIR_Scatter_MV2_Binomial(sendbuf, sendcnt, sendtype,
                                                recvbuf, recvcnt, recvtype, root, comm);
      }
  } else {
      mpi_errno = MV2_Scatter_function(sendbuf, sendcnt, sendtype,
                                       recvbuf, recvcnt, recvtype, root, comm);
  }
  return (mpi_errno);
}
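/* MVAPICH2 allreduce selector: picks the inter-leader and intra-node functions
 * from the tuning table, skipping multicast-based entries when multicast is not
 * available, and uses the two-level shared-memory algorithm only for
 * commutative operations. */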
int Coll_allreduce_mvapich2::allreduce(const void *sendbuf,
    void *recvbuf,
    int count,
    MPI_Datatype datatype,
    MPI_Op op, MPI_Comm comm)
{

  int mpi_errno = MPI_SUCCESS;
  //int rank = 0,
  int comm_size = 0;

  comm_size = comm->size();
  //rank = comm->rank();

  if (count == 0) {
      return MPI_SUCCESS;
  }

  if (mv2_allreduce_thresholds_table == NULL)
    init_mv2_allreduce_tables_stampede();

  /* check if multiple threads are calling this collective function */

  MPI_Aint sendtype_size = 0;
  long nbytes = 0;
  int is_commutative = 0;
  MPI_Aint true_lb, true_extent;

  sendtype_size=datatype->size();
  nbytes = count * sendtype_size;

  datatype->extent(&true_lb, &true_extent);
  is_commutative = op->is_commutative();

  {
    int range = 0, range_threshold = 0, range_threshold_intra = 0;
    int is_two_level = 0;

    /* Search for the corresponding system size inside the tuning table */
    while ((range < (mv2_size_allreduce_tuning_table - 1)) &&
        (comm_size > mv2_allreduce_thresholds_table[range].numproc)) {
        range++;
    }
    /* Search for corresponding inter-leader function */
    /* skip mcast pointers if mcast is not available */
    if(mv2_allreduce_thresholds_table[range].mcast_enabled != 1){
        while ((range_threshold < (mv2_allreduce_thresholds_table[range].size_inter_table - 1))
            && ((mv2_allreduce_thresholds_table[range].
                inter_leader[range_threshold].MV2_pt_Allreducection
                == &MPIR_Allreduce_mcst_reduce_redscat_gather_MV2) ||
                (mv2_allreduce_thresholds_table[range].
                    inter_leader[range_threshold].MV2_pt_Allreducection
                    == &MPIR_Allreduce_mcst_reduce_two_level_helper_MV2)
            )) {
            range_threshold++;
        }
    }
    while ((range_threshold < (mv2_allreduce_thresholds_table[range].size_inter_table - 1)) &&
           (nbytes > mv2_allreduce_thresholds_table[range].inter_leader[range_threshold].max) &&
           (mv2_allreduce_thresholds_table[range].inter_leader[range_threshold].max != -1)) {
        range_threshold++;
    }
    if(mv2_allreduce_thresholds_table[range].is_two_level_allreduce[range_threshold] == 1){
        is_two_level = 1;
    }
    /* Search for corresponding intra-node function */
    while ((range_threshold_intra < (mv2_allreduce_thresholds_table[range].size_intra_table - 1)) &&
           (nbytes > mv2_allreduce_thresholds_table[range].intra_node[range_threshold_intra].max) &&
           (mv2_allreduce_thresholds_table[range].intra_node[range_threshold_intra].max != -1)) {
        range_threshold_intra++;
    }

    MV2_Allreducection = mv2_allreduce_thresholds_table[range].inter_leader[range_threshold]
                                                                                .MV2_pt_Allreducection;

    MV2_Allreduce_intra_function = mv2_allreduce_thresholds_table[range].intra_node[range_threshold_intra]
                                                                                    .MV2_pt_Allreducection;

    /* check if mcast is ready, otherwise replace mcast with other algorithm */
    if((MV2_Allreducection == &MPIR_Allreduce_mcst_reduce_redscat_gather_MV2)||
        (MV2_Allreducection == &MPIR_Allreduce_mcst_reduce_two_level_helper_MV2)){
        {
          MV2_Allreducection = &MPIR_Allreduce_pt2pt_rd_MV2;
        }
        if(is_two_level != 1) {
            MV2_Allreducection = &MPIR_Allreduce_pt2pt_rd_MV2;
        }
    }

    if (is_two_level == 1) {
        // check if shm is ready, if not use other algorithm first
        if (is_commutative) {
            if (comm->get_leaders_comm() == MPI_COMM_NULL) {
                comm->init_smp();
            }
            mpi_errno = MPIR_Allreduce_two_level_MV2(sendbuf, recvbuf, count,
                                                     datatype, op, comm);
        } else {
            mpi_errno = MPIR_Allreduce_pt2pt_rd_MV2(sendbuf, recvbuf, count,
                                                    datatype, op, comm);
        }
    } else {
        mpi_errno = MV2_Allreducection(sendbuf, recvbuf, count,
                                       datatype, op, comm);
    }
  }

  //comm->ch.intra_node_done=0;

  return (mpi_errno);


}
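/* MVAPICH2 bcast selector: computes the message size, picks the inter-leader
 * and intra-node bcast functions plus the knomial factors and pipeline segment
 * size from the tuning table, then runs either a two-level (inter-node +
 * intra-node) bcast or a single flat bcast. */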
int Coll_bcast_mvapich2::bcast(void *buffer,
    int count,
    MPI_Datatype datatype,
    int root, MPI_Comm comm)
{
    int mpi_errno = MPI_SUCCESS;
    int comm_size/*, rank*/;
    int two_level_bcast = 1;
    long nbytes = 0;
    int range = 0;
    int range_threshold = 0;
    int range_threshold_intra = 0;
    // int is_homogeneous, is_contig;
    MPI_Aint type_size;
    //, position;
    // unsigned char *tmp_buf = NULL;
    MPI_Comm shmem_comm;
    //MPID_Datatype *dtp;

    if (count == 0)
        return MPI_SUCCESS;
    if(comm->get_leaders_comm()==MPI_COMM_NULL){
      comm->init_smp();
    }
    if (not mv2_bcast_thresholds_table)
      init_mv2_bcast_tables_stampede();
    comm_size = comm->size();
    //rank = comm->rank();

    //is_contig=1;
/*    if (HANDLE_GET_KIND(datatype) == HANDLE_KIND_BUILTIN)*/
/*        is_contig = 1;*/
/*    else {*/
/*        MPID_Datatype_get_ptr(datatype, dtp);*/
/*        is_contig = dtp->is_contig;*/
/*    }*/

    // is_homogeneous = 1;

    /* MPI_Type_size() might not give the accurate size of the packed
     * datatype for heterogeneous systems (because of padding, encoding,
     * etc). On the other hand, MPI_Pack_size() can become very
     * expensive, depending on the implementation, especially for
     * heterogeneous systems. We want to use MPI_Type_size() wherever
     * possible, and MPI_Pack_size() in other places.
     */
    //if (is_homogeneous) {
        type_size=datatype->size();

   /* } else {
        MPIR_Pack_size_impl(1, datatype, &type_size);
    }*/
    nbytes =  (count) * (type_size);

    /* Search for the corresponding system size inside the tuning table */
    while ((range < (mv2_size_bcast_tuning_table - 1)) &&
           (comm_size > mv2_bcast_thresholds_table[range].numproc)) {
        range++;
    }
    /* Search for corresponding inter-leader function */
    while ((range_threshold < (mv2_bcast_thresholds_table[range].size_inter_table - 1))
           && (nbytes >
               mv2_bcast_thresholds_table[range].inter_leader[range_threshold].max)
           && (mv2_bcast_thresholds_table[range].inter_leader[range_threshold].max != -1)) {
        range_threshold++;
    }

    /* Search for corresponding intra-node function */
    while ((range_threshold_intra <
            (mv2_bcast_thresholds_table[range].size_intra_table - 1))
           && (nbytes >
               mv2_bcast_thresholds_table[range].intra_node[range_threshold_intra].max)
           && (mv2_bcast_thresholds_table[range].intra_node[range_threshold_intra].max !=
               -1)) {
        range_threshold_intra++;
    }

    MV2_Bcast_function =
        mv2_bcast_thresholds_table[range].inter_leader[range_threshold].
        MV2_pt_Bcast_function;

    MV2_Bcast_intra_node_function =
        mv2_bcast_thresholds_table[range].
        intra_node[range_threshold_intra].MV2_pt_Bcast_function;

/*    if (mv2_user_bcast_intra == NULL && */
/*            MV2_Bcast_intra_node_function == &MPIR_Knomial_Bcast_intra_node_MV2) {*/
/*            MV2_Bcast_intra_node_function = &MPIR_Shmem_Bcast_MV2;*/
/*    }*/

    if (mv2_bcast_thresholds_table[range].inter_leader[range_threshold].
        zcpy_pipelined_knomial_factor != -1) {
        zcpy_knomial_factor =
            mv2_bcast_thresholds_table[range].inter_leader[range_threshold].
            zcpy_pipelined_knomial_factor;
    }

    if (mv2_pipelined_zcpy_knomial_factor != -1) {
        zcpy_knomial_factor = mv2_pipelined_zcpy_knomial_factor;
    }

    if(MV2_Bcast_intra_node_function == NULL) {
        /* if the tuning table does not have any intra-node selection, set the
        ** function pointer to the default one for mcast intra node */
        MV2_Bcast_intra_node_function = &MPIR_Shmem_Bcast_MV2;
    }

    /* Set value of pipeline segment size */
    bcast_segment_size = mv2_bcast_thresholds_table[range].bcast_segment_size;

    /* Set value of inter node knomial factor */
    mv2_inter_node_knomial_factor = mv2_bcast_thresholds_table[range].inter_node_knomial_factor;

    /* Set value of intra node knomial factor */
    mv2_intra_node_knomial_factor = mv2_bcast_thresholds_table[range].intra_node_knomial_factor;

    /* Check if we will use a two level algorithm or not */
    two_level_bcast =
#if defined(_MCST_SUPPORT_)
        mv2_bcast_thresholds_table[range].is_two_level_bcast[range_threshold]
        || comm->ch.is_mcast_ok;
#else
        mv2_bcast_thresholds_table[range].is_two_level_bcast[range_threshold];
#endif
     if (two_level_bcast == 1) {
       // if (not is_contig || not is_homogeneous) {
//   tmp_buf = smpi_get_tmp_sendbuffer(nbytes);

/*            position = 0;*/
/*            if (rank == root) {*/
/*                mpi_errno =*/
/*                    MPIR_Pack_impl(buffer, count, datatype, tmp_buf, nbytes, &position);*/
/*                if (mpi_errno)*/
/*                    MPIU_ERR_POP(mpi_errno);*/
/*            }*/
// }
#ifdef CHANNEL_MRAIL_GEN2
        if ((mv2_enable_zcpy_bcast == 1) &&
              (&MPIR_Pipelined_Bcast_Zcpy_MV2 == MV2_Bcast_function)) {
          // if (not is_contig || not is_homogeneous) {
          //   mpi_errno = MPIR_Pipelined_Bcast_Zcpy_MV2(tmp_buf, nbytes, MPI_BYTE, root, comm);
          // } else {
                mpi_errno = MPIR_Pipelined_Bcast_Zcpy_MV2(buffer, count, datatype,
                                                 root, comm);
          // }
        } else
#endif /* defined(CHANNEL_MRAIL_GEN2) */
        {
            shmem_comm = comm->get_intra_comm();
            // if (not is_contig || not is_homogeneous) {
            //   MPIR_Bcast_tune_inter_node_helper_MV2(tmp_buf, nbytes, MPI_BYTE, root, comm);
            // } else {
              MPIR_Bcast_tune_inter_node_helper_MV2(buffer, count, datatype, root, comm);
            // }

            /* We are now done with the inter-node phase */


                    root = INTRA_NODE_ROOT;

                    // if (not is_contig || not is_homogeneous) {
                    //       mpi_errno = MV2_Bcast_intra_node_function(tmp_buf, nbytes, MPI_BYTE, root, shmem_comm);
                    // } else {
                    mpi_errno = MV2_Bcast_intra_node_function(buffer, count,
                                                              datatype, root, shmem_comm);

                    // }
        }
        /*        if (not is_contig || not is_homogeneous) {*/
        /*            if (rank != root) {*/
        /*                position = 0;*/
        /*                mpi_errno = MPIR_Unpack_impl(tmp_buf, nbytes, &position, buffer,*/
        /*                                             count, datatype);*/
        /*            }*/
        /*        }*/
    } else {
        /* We use Knomial for intra node */
        MV2_Bcast_intra_node_function = &MPIR_Knomial_Bcast_intra_node_MV2;
/*        if (mv2_enable_shmem_bcast == 0) {*/
            /* Fall back to non-tuned version */
/*            MPIR_Bcast_intra_MV2(buffer, count, datatype, root, comm);*/
/*        } else {*/
            mpi_errno = MV2_Bcast_function(buffer, count, datatype, root,
                                           comm);

/*        }*/
    }


    return mpi_errno;

}
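/* Two-level reduce: an intra-node reduce onto each node leader (shared-memory
 * or knomial), an inter-leader reduce rooted at the leader of the root's node,
 * and a final point-to-point transfer to the root when the root is not that
 * leader. A shared-memory fast path handles the single-node case. */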
int Coll_reduce_mvapich2_two_level::reduce( const void *sendbuf,
                                     void *recvbuf,
                                     int count,
                                     MPI_Datatype datatype,
                                     MPI_Op op,
                                     int root,
                                     MPI_Comm comm)
{
    int mpi_errno = MPI_SUCCESS;
    int my_rank, total_size, local_rank, local_size;
    int leader_comm_rank = -1, leader_comm_size = 0;
    MPI_Comm shmem_comm, leader_comm;
    int leader_root, leader_of_root;
    const unsigned char* in_buf = nullptr;
    unsigned char *out_buf = nullptr, *tmp_buf = nullptr;
    MPI_Aint true_lb, true_extent, extent;
    int is_commutative = 0, stride = 0;
    int intra_node_root=0;

    // if not set (the algorithm was called directly, without the mvapich2 selector)
    if(MV2_Reduce_function==NULL)
      MV2_Reduce_function=Coll_reduce_mpich::reduce;
    if(MV2_Reduce_intra_function==NULL)
      MV2_Reduce_intra_function=Coll_reduce_mpich::reduce;

    if(comm->get_leaders_comm()==MPI_COMM_NULL){
      comm->init_smp();
    }

    my_rank = comm->rank();
    total_size = comm->size();
    shmem_comm = comm->get_intra_comm();
    local_rank = shmem_comm->rank();
    local_size = shmem_comm->size();

    leader_comm = comm->get_leaders_comm();
    int* leaders_map = comm->get_leaders_map();
    leader_of_root = comm->group()->rank(leaders_map[root]);
    leader_root = leader_comm->group()->rank(leaders_map[root]);

    is_commutative= (op==MPI_OP_NULL || op->is_commutative());

    datatype->extent(&true_lb,
                                       &true_extent);
    extent =datatype->get_extent();
    stride = count * std::max(extent, true_extent);

    if (local_size == total_size) {
        /* First handle the case where there is only one node */
        if (stride <= MV2_INTRA_SHMEM_REDUCE_MSG &&
            is_commutative == 1) {
            if (local_rank == 0 ) {
              tmp_buf = smpi_get_tmp_sendbuffer(count * std::max(extent, true_extent));
              tmp_buf = tmp_buf - true_lb;
            }

            if (sendbuf != MPI_IN_PLACE) {
              in_buf = static_cast<const unsigned char*>(sendbuf);
            } else {
              in_buf = static_cast<const unsigned char*>(recvbuf);
            }

            if (local_rank == 0) {
                 if( my_rank != root) {
                     out_buf = tmp_buf;
                 } else {
                   out_buf = static_cast<unsigned char*>(recvbuf);
                   if (in_buf == out_buf) {
                     in_buf  = static_cast<const unsigned char*>(MPI_IN_PLACE);
                     out_buf = static_cast<unsigned char*>(recvbuf);
                     }
                 }
            } else {
              in_buf  = static_cast<const unsigned char*>(sendbuf);
              out_buf = nullptr;
            }

            if (count * (std::max(extent, true_extent)) < SHMEM_COLL_BLOCK_SIZE) {
              mpi_errno = MPIR_Reduce_shmem_MV2(in_buf, out_buf, count, datatype, op, 0, shmem_comm);
            } else {
              mpi_errno = MPIR_Reduce_intra_knomial_wrapper_MV2(in_buf, out_buf, count, datatype, op, 0, shmem_comm);
            }

            if (local_rank == 0 && root != my_rank) {
                Request::send(out_buf, count, datatype, root,
                                         COLL_TAG_REDUCE+1, comm);
            }
            if ((local_rank != 0) && (root == my_rank)) {
                Request::recv(recvbuf, count, datatype,
                                         leader_of_root, COLL_TAG_REDUCE+1, comm,
                                         MPI_STATUS_IGNORE);
            }
        } else {
            if(mv2_use_knomial_reduce == 1) {
                reduce_fn = &MPIR_Reduce_intra_knomial_wrapper_MV2;
            } else {
                reduce_fn = &MPIR_Reduce_binomial_MV2;
            }
            mpi_errno = reduce_fn(sendbuf, recvbuf, count,
                                  datatype, op,
                                  root, comm);
        }
        /* We are done */
        if (tmp_buf != nullptr)
          smpi_free_tmp_buffer(tmp_buf + true_lb);
        goto fn_exit;
    }


    if (local_rank == 0) {
        leader_comm = comm->get_leaders_comm();
        if(leader_comm==MPI_COMM_NULL){
          leader_comm = MPI_COMM_WORLD;
        }
        leader_comm_size = leader_comm->size();
        leader_comm_rank = leader_comm->rank();
        tmp_buf          = smpi_get_tmp_sendbuffer(count * std::max(extent, true_extent));
        tmp_buf          = tmp_buf - true_lb;
    }
    if (sendbuf != MPI_IN_PLACE) {
      in_buf = static_cast<const unsigned char*>(sendbuf);
    } else {
      in_buf = static_cast<const unsigned char*>(recvbuf);
    }
    if (local_rank == 0) {
      out_buf = static_cast<unsigned char*>(tmp_buf);
    } else {
      out_buf = nullptr;
    }


    if(local_size > 1) {
        /* Lets do the intra-node reduce operations, if we have more than one
         * process in the node */

        /* Set up the input and output buffers for the intra-node reduce.
         * Node leaders will have the reduced data in tmp_buf after this step */
        if (MV2_Reduce_intra_function == &MPIR_Reduce_shmem_MV2) {
            if (is_commutative == 1 && (count * (std::max(extent, true_extent)) < SHMEM_COLL_BLOCK_SIZE)) {
                mpi_errno = MV2_Reduce_intra_function(in_buf, out_buf, count, datatype, op, intra_node_root, shmem_comm);
            } else {
                mpi_errno = MPIR_Reduce_intra_knomial_wrapper_MV2(in_buf, out_buf, count,
                                                                  datatype, op, intra_node_root, shmem_comm);
            }
        } else {
            mpi_errno = MV2_Reduce_intra_function(in_buf, out_buf, count,
                                                  datatype, op, intra_node_root, shmem_comm);
        }
    } else {
      smpi_free_tmp_buffer(tmp_buf + true_lb);
      tmp_buf = (unsigned char*)in_buf; // single process on this node: reuse in_buf as the locally reduced data (const cast)
    }

    /* Now work on the inter-leader phase. Data is in tmp_buf */
    if (local_rank == 0 && leader_comm_size > 1) {
        /*The leader of root will have the global reduced data in tmp_buf
           or recv_buf
           at the end of the reduce */
        if (leader_comm_rank == leader_root) {
            if (my_rank == root) {
                /* I am the root of the leader-comm, and the
                 * root of the reduce op. So, I will write the
                 * final result directly into my recvbuf */
                if(tmp_buf != recvbuf) {
                  in_buf  = tmp_buf;
                  out_buf = static_cast<unsigned char*>(recvbuf);
                } else {

                  unsigned char* buf = smpi_get_tmp_sendbuffer(count * datatype->get_extent());
                  Datatype::copy(tmp_buf, count, datatype, buf, count, datatype);
                  // in_buf = MPI_IN_PLACE;
                  in_buf  = buf;
                  out_buf = static_cast<unsigned char*>(recvbuf);
                }
            } else {
              unsigned char* buf = smpi_get_tmp_sendbuffer(count * datatype->get_extent());
              Datatype::copy(tmp_buf, count, datatype, buf, count, datatype);
              // in_buf = MPI_IN_PLACE;
              in_buf  = buf;
              out_buf = tmp_buf;
            }
        } else {
            in_buf = tmp_buf;
            out_buf = nullptr;
        }

        /* inter-leader communication  */
        mpi_errno = MV2_Reduce_function(in_buf, out_buf, count,
                              datatype, op,
                              leader_root, leader_comm);

    }

    if (local_size > 1) {
      /* Send the message to the root if the leader is not the
       * root of the reduce operation. The reduced data is in tmp_buf */
      if ((local_rank == 0) && (root != my_rank) && (leader_root == leader_comm_rank)) {
        Request::send(tmp_buf, count, datatype, root, COLL_TAG_REDUCE + 1, comm);
      }
      if ((local_rank != 0) && (root == my_rank)) {
        Request::recv(recvbuf, count, datatype, leader_of_root, COLL_TAG_REDUCE + 1, comm, MPI_STATUS_IGNORE);
      }
      smpi_free_tmp_buffer(tmp_buf + true_lb);

      if (leader_comm_rank == leader_root) {
        if (my_rank != root || (my_rank == root && tmp_buf == recvbuf)) {
          smpi_free_tmp_buffer(in_buf);
        }
      }
    }



  fn_exit:
    return mpi_errno;
}
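/* Inter-node part of the two-level bcast: the root first hands its buffer to
 * the leader of its node (when it is not a leader itself), then the selected
 * MV2_Bcast_function broadcasts among the node leaders only; the intra-node
 * broadcast is left to the companion *_intra_node variant below. */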
int Coll_bcast_mvapich2_inter_node::bcast(void *buffer,
                                                 int count,
                                                 MPI_Datatype datatype,
                                                 int root,
                                                 MPI_Comm  comm)
{
    int rank;
    int mpi_errno = MPI_SUCCESS;
    MPI_Comm shmem_comm, leader_comm;
    int local_rank, local_size, global_rank = -1;
    int leader_root, leader_of_root;


    rank = comm->rank();
    //comm_size = comm->size();


    if (MV2_Bcast_function==NULL){
      MV2_Bcast_function=Coll_bcast_mpich::bcast;
    }

    if (MV2_Bcast_intra_node_function==NULL){
      MV2_Bcast_intra_node_function= Coll_bcast_mpich::bcast;
    }

    if(comm->get_leaders_comm()==MPI_COMM_NULL){
      comm->init_smp();
    }

    shmem_comm = comm->get_intra_comm();
    local_rank = shmem_comm->rank();
    local_size = shmem_comm->size();

    leader_comm = comm->get_leaders_comm();

    if ((local_rank == 0) && (local_size > 1)) {
      global_rank = leader_comm->rank();
    }

    int* leaders_map = comm->get_leaders_map();
    leader_of_root = comm->group()->rank(leaders_map[root]);
    leader_root = leader_comm->group()->rank(leaders_map[root]);


    if (local_size > 1) {
        if ((local_rank == 0) && (root != rank) && (leader_root == global_rank)) {
            Request::recv(buffer, count, datatype, root,
                                     COLL_TAG_BCAST, comm, MPI_STATUS_IGNORE);
        }
        if ((local_rank != 0) && (root == rank)) {
            Request::send(buffer, count, datatype,
                                     leader_of_root, COLL_TAG_BCAST, comm);
        }
    }
#if defined(_MCST_SUPPORT_)
    if (comm_ptr->ch.is_mcast_ok) {
        mpi_errno = MPIR_Mcast_inter_node_MV2(buffer, count, datatype, root, comm_ptr,
                                              errflag);
        if (mpi_errno == MPI_SUCCESS) {
            goto fn_exit;
        } else {
            goto fn_fail;
        }
    }
#endif
/*
    if (local_rank == 0) {
        leader_comm = comm->get_leaders_comm();
        root = leader_root;
    }

    if (MV2_Bcast_function == &MPIR_Pipelined_Bcast_MV2) {
        mpi_errno = MPIR_Pipelined_Bcast_MV2(buffer, count, datatype,
                                             root, comm);
    } else if (MV2_Bcast_function == &MPIR_Bcast_scatter_ring_allgather_shm_MV2) {
        mpi_errno = MPIR_Bcast_scatter_ring_allgather_shm_MV2(buffer, count,
                                                              datatype, root,
                                                              comm);
    } else */{
        if (local_rank == 0) {
      /*      if (MV2_Bcast_function == &MPIR_Knomial_Bcast_inter_node_wrapper_MV2) {
                mpi_errno = MPIR_Knomial_Bcast_inter_node_wrapper_MV2(buffer, count,
                                                              datatype, root,
                                                              comm);
            } else {*/
                mpi_errno = MV2_Bcast_function(buffer, count, datatype,
                                               leader_root, leader_comm);
          //  }
        }
    }

    return mpi_errno;
}
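/* Intra-node part of the two-level bcast: for medium-sized messages (or when
 * multicast is available) it first runs the inter-node helper, then finishes
 * with a shared-memory or knomial bcast inside each node, depending on
 * mv2_knomial_intra_node_threshold. */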
int Coll_bcast_mvapich2_intra_node::bcast(void *buffer,
                         int count,
                         MPI_Datatype datatype,
                         int root, MPI_Comm  comm)
{
    int mpi_errno = MPI_SUCCESS;
    int comm_size;
    int two_level_bcast = 1;
    size_t nbytes = 0;
    int is_homogeneous, is_contig;
    MPI_Aint type_size;
    unsigned char* tmp_buf = nullptr;
    MPI_Comm shmem_comm;

    if (count == 0)
        return MPI_SUCCESS;
    if (MV2_Bcast_function==NULL){
      MV2_Bcast_function=Coll_bcast_mpich::bcast;
    }

    if (MV2_Bcast_intra_node_function==NULL){
      MV2_Bcast_intra_node_function= Coll_bcast_mpich::bcast;
    }

    if(comm->get_leaders_comm()==MPI_COMM_NULL){
      comm->init_smp();
    }

    comm_size = comm->size();
   // rank = comm->rank();
/*
    if (HANDLE_GET_KIND(datatype) == HANDLE_KIND_BUILTIN)*/
        is_contig = 1;
/*    else {
        MPID_Datatype_get_ptr(datatype, dtp);
        is_contig = dtp->is_contig;
    }
*/
    is_homogeneous = 1;
#ifdef MPID_HAS_HETERO
    if (comm_ptr->is_hetero)
        is_homogeneous = 0;
#endif

    /* MPI_Type_size() might not give the accurate size of the packed
     * datatype for heterogeneous systems (because of padding, encoding,
     * etc). On the other hand, MPI_Pack_size() can become very
     * expensive, depending on the implementation, especially for
     * heterogeneous systems. We want to use MPI_Type_size() wherever
     * possible, and MPI_Pack_size() in other places.
     */
    //if (is_homogeneous) {
        type_size=datatype->size();
    //}
/*    else {*/
/*        MPIR_Pack_size_impl(1, datatype, &type_size);*/
/*    }*/
    nbytes = (size_t) (count) * (type_size);
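    /* Choose between the two-level (inter-leader + intra-node) scheme and a flat algorithm:
     * on systems no larger than mv2_bcast_two_level_system_size, two-level is only used for
     * medium-sized messages (between mv2_bcast_short_msg and mv2_bcast_large_msg); on larger
     * systems the two-level path is always taken. */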
    if (comm_size <= mv2_bcast_two_level_system_size) {
        if (nbytes > mv2_bcast_short_msg && nbytes < mv2_bcast_large_msg) {
            two_level_bcast = 1;
        } else {
            two_level_bcast = 0;
        }
    }

    if (two_level_bcast == 1
#if defined(_MCST_SUPPORT_)
            || comm_ptr->ch.is_mcast_ok
#endif
        ) {

      if (not is_contig || not is_homogeneous) {
        tmp_buf = smpi_get_tmp_sendbuffer(nbytes);

        /* TODO: Pipeline the packing and communication */
        // position = 0;
        /*            if (rank == root) {*/
        /*                mpi_errno =*/
        /*                    MPIR_Pack_impl(buffer, count, datatype, tmp_buf, nbytes, &position);*/
        /*                if (mpi_errno)*/
        /*                    MPIU_ERR_POP(mpi_errno);*/
        /*            }*/
        }

        shmem_comm = comm->get_intra_comm();
        if (not is_contig || not is_homogeneous) {
          mpi_errno = MPIR_Bcast_inter_node_helper_MV2(tmp_buf, nbytes, MPI_BYTE, root, comm);
        } else {
            mpi_errno =
                MPIR_Bcast_inter_node_helper_MV2(buffer, count, datatype, root,
                                                 comm);
        }

        /* We are now done with the inter-node phase */
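        /* Intra-node phase: plain shared-memory bcast for payloads up to
         * mv2_knomial_intra_node_threshold bytes, k-nomial tree bcast beyond that. */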
        if (nbytes <= mv2_knomial_intra_node_threshold) {
            if (not is_contig || not is_homogeneous) {
                mpi_errno = MPIR_Shmem_Bcast_MV2(tmp_buf, nbytes, MPI_BYTE, root, shmem_comm);
            } else {
                mpi_errno = MPIR_Shmem_Bcast_MV2(buffer, count, datatype, root, shmem_comm);
            }
        } else {
            if (not is_contig || not is_homogeneous) {
                mpi_errno = MPIR_Knomial_Bcast_intra_node_MV2(tmp_buf, nbytes, MPI_BYTE, INTRA_NODE_ROOT, shmem_comm);
            } else {
                mpi_errno = MPIR_Knomial_Bcast_intra_node_MV2(buffer, count, datatype, INTRA_NODE_ROOT, shmem_comm);
            }
        }

    } else {
        if (nbytes <= mv2_bcast_short_msg) {
            mpi_errno = MPIR_Bcast_binomial_MV2(buffer, count, datatype, root,
                                                comm);
        } else {
            if (mv2_scatter_rd_inter_leader_bcast) {
                mpi_errno = MPIR_Bcast_scatter_ring_allgather_MV2(buffer, count,
                                                                  datatype,
                                                                  root,
                                                                  comm);
            } else {
                mpi_errno =
                    MPIR_Bcast_scatter_doubling_allgather_MV2(buffer, count,
                                                              datatype, root,
                                                              comm);
            }
        }
    }

    if (tmp_buf != nullptr)
      smpi_free_tmp_buffer(tmp_buf); /* release the packing buffer allocated in the non-contiguous case */

    return mpi_errno;

}
Example #12
int Coll_bcast_mvapich2_knomial_intra_node::bcast(void *buffer,
                                      int count,
                                      MPI_Datatype datatype,
                                      int root, MPI_Comm  comm)
{
    int local_size = 0, rank;
    int mpi_errno = MPI_SUCCESS;
    int src, dst, mask, relative_rank;
    int k;
    if (MV2_Bcast_function==NULL){
      MV2_Bcast_function=Coll_bcast_mpich::bcast;
    }

    if (MV2_Bcast_intra_node_function==NULL){
      MV2_Bcast_intra_node_function= Coll_bcast_mpich::bcast;
    }

    if(comm->get_leaders_comm()==MPI_COMM_NULL){
      comm->init_smp();
    }

    local_size = comm->size();
    rank = comm->rank();

    MPI_Request* reqarray = new MPI_Request[2 * mv2_intra_node_knomial_factor];

    MPI_Status* starray = new MPI_Status[2 * mv2_intra_node_knomial_factor];

    /* intra-node k-nomial bcast  */
    if (local_size > 1) {
        relative_rank = (rank >= root) ? rank - root : rank - root + local_size;
        mask = 0x1;
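        /* Receive phase: scan the mask levels until this rank's position in the k-nomial
         * tree is found, then receive the data once from the corresponding parent. */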

        while (mask < local_size) {
            if (relative_rank % (mv2_intra_node_knomial_factor * mask)) {
                src = relative_rank / (mv2_intra_node_knomial_factor * mask) *
                    (mv2_intra_node_knomial_factor * mask) + root;
                if (src >= local_size) {
                    src -= local_size;
                }

                Request::recv(buffer, count, datatype, src,
                                         COLL_TAG_BCAST, comm,
                                         MPI_STATUS_IGNORE);
                break;
            }
            mask *= mv2_intra_node_knomial_factor;
        }
        mask /= mv2_intra_node_knomial_factor;
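        /* Send phase: walk the mask back down; at each level forward the data to up to
         * (k-1) children, overlapping the transfers with isend and a single waitall. */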

        while (mask > 0) {
            int reqs = 0;
            for (k = 1; k < mv2_intra_node_knomial_factor; k++) {
                if (relative_rank + mask * k < local_size) {
                    dst = rank + mask * k;
                    if (dst >= local_size) {
                        dst -= local_size;
                    }
                    reqarray[reqs++]=Request::isend(buffer, count, datatype, dst,
                                              COLL_TAG_BCAST, comm);
                }
            }
            Request::waitall(reqs, reqarray, starray);

            mask /= mv2_intra_node_knomial_factor;
        }
    }
    delete[] reqarray;
    delete[] starray;
    return mpi_errno;
}
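/* To make the tree shape above concrete, the following standalone sketch (illustrative
 * only; k and p are arbitrary assumed values) prints, for a k-nomial tree rooted at rank 0,
 * the parent each rank receives from. It reuses the same mask arithmetic as the receive
 * loop above, with relative_rank == rank because the root is 0. */
#include <cstdio>

int main()
{
  const int k = 4;  /* k-nomial factor (stands in for mv2_intra_node_knomial_factor) */
  const int p = 16; /* number of processes on the node */

  for (int rank = 1; rank < p; rank++) {
    int mask   = 1;
    int parent = 0;
    while (mask < p) {
      if (rank % (k * mask)) { /* first level where this rank hangs off the tree */
        parent = rank / (k * mask) * (k * mask);
        break;
      }
      mask *= k;
    }
    std::printf("rank %2d receives from rank %2d\n", rank, parent);
  }
  return 0;
}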
Example #13
int Coll_allgather_SMP_NTS::allgather(const void *sbuf, int scount,
                                      MPI_Datatype stype, void *rbuf,
                                      int rcount, MPI_Datatype rtype,
                                      MPI_Comm comm)
{
  int src, dst, comm_size, rank;
  comm_size = comm->size();
  rank = comm->rank();
  MPI_Aint rextent, sextent;
  rextent = rtype->get_extent();
  sextent = stype->get_extent();
  int tag = COLL_TAG_ALLGATHER;

  int i, send_offset, recv_offset;
  int intra_rank, inter_rank;

  if(comm->get_leaders_comm()==MPI_COMM_NULL){
    comm->init_smp();
  }
  int num_core=1;
  if (comm->is_uniform()){
    num_core = comm->get_intra_comm()->size();
  }


  intra_rank = rank % num_core;
  inter_rank = rank / num_core;
  int inter_comm_size = (comm_size + num_core - 1) / num_core;
  int num_core_in_current_smp = num_core;
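  /* Rank layout: intra_rank is the position inside the SMP node, inter_rank is the node
   * index, and inter_comm_size is the number of nodes. */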

  if (comm_size % num_core)
    THROWF(arg_error, 0, "allgather SMP_NTS algorithm cannot be used when the number of processes is not a multiple of NUM_CORE=%d!", num_core);

  /* for too small number of processes, use default implementation */
  if (comm_size <= num_core) {
    XBT_WARN("MPI_allgather_SMP_NTS uses default MPI_allgather.");
    Coll_allgather_default::allgather(sbuf, scount, stype, rbuf, rcount, rtype, comm);
    return MPI_SUCCESS;
  }

  // the last SMP node may have fewer running processes than the others
  if (inter_rank == (inter_comm_size - 1)) {
    num_core_in_current_smp = comm_size - (inter_rank * num_core);
  }
  //copy corresponding message from sbuf to rbuf
  recv_offset = rank * rextent * rcount;
  Request::sendrecv(sbuf, scount, stype, rank, tag,
               ((char *) rbuf + recv_offset), rcount, rtype, rank, tag, comm,
               MPI_STATUS_IGNORE);

  //gather to root of each SMP

  for (i = 1; i < num_core_in_current_smp; i++) {

    dst = (inter_rank * num_core) + (intra_rank + i) % num_core_in_current_smp;
    src = (inter_rank * num_core) + (intra_rank - i + num_core_in_current_smp) % num_core_in_current_smp;
    recv_offset = src * rextent * rcount;

    Request::sendrecv(sbuf, scount, stype, dst, tag,
                 ((char *) rbuf + recv_offset), rcount, rtype, src, tag, comm,
                 MPI_STATUS_IGNORE);

  }

  // INTER-SMP-ALLGATHER
  // The root of each SMP node posts the INTER-node send/recv operations, then does an INTRA-node bcast for each received message
  // Use logical ring algorithm

  // root of each SMP
  if (intra_rank == 0) {
    MPI_Request* rrequest_array = new MPI_Request[inter_comm_size - 1];
    MPI_Request* srequest_array = new MPI_Request[inter_comm_size - 1];

    src = ((inter_rank - 1 + inter_comm_size) % inter_comm_size) * num_core;
    dst = ((inter_rank + 1) % inter_comm_size) * num_core;
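    /* Ring neighbours are the leaders (first core, intra_rank == 0) of the previous and
     * next SMP nodes. */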

    // post all inter Irecv
    for (i = 0; i < inter_comm_size - 1; i++) {
      recv_offset =
          ((inter_rank - i - 1 +
            inter_comm_size) % inter_comm_size) * num_core * sextent * scount;
      rrequest_array[i] = Request::irecv((char *)rbuf + recv_offset, rcount * num_core,
                                         rtype, src, tag + i, comm);
    }

    // send first message
    send_offset =
        ((inter_rank +
          inter_comm_size) % inter_comm_size) * num_core * sextent * scount;
    srequest_array[0] = Request::isend((char *)rbuf + send_offset, scount * num_core,
                                       stype, dst, tag, comm);

    // loop : recv-inter , send-inter, send-intra (linear-bcast)
    for (i = 0; i < inter_comm_size - 2; i++) {
      recv_offset =
          ((inter_rank - i - 1 +
            inter_comm_size) % inter_comm_size) * num_core * sextent * scount;
      Request::wait(&rrequest_array[i], MPI_STATUS_IGNORE);
      srequest_array[i + 1] = Request::isend((char *)rbuf + recv_offset, scount * num_core,
                                             stype, dst, tag + i + 1, comm);
      if (num_core_in_current_smp > 1) {
        Request::send((char *)rbuf + recv_offset, scount * num_core,
                      stype, (rank + 1), tag + i + 1, comm);
      }
    }

    // recv last message and send_intra
    recv_offset =
        ((inter_rank - i - 1 +
          inter_comm_size) % inter_comm_size) * num_core * sextent * scount;
    //recv_offset = ((inter_rank + 1) % inter_comm_size) * num_core * sextent * scount;
    //i=inter_comm_size-2;
    Request::wait(&rrequest_array[i], MPI_STATUS_IGNORE);
    if (num_core_in_current_smp > 1) {
      Request::send((char *)rbuf + recv_offset, scount * num_core,
                                  stype, (rank + 1), tag + i + 1, comm);
    }

    Request::waitall(inter_comm_size - 1, srequest_array, MPI_STATUSES_IGNORE);
    delete[] rrequest_array;
    delete[] srequest_array;
  }
  // last rank of each SMP
  else if (intra_rank == (num_core_in_current_smp - 1)) {
    for (i = 0; i < inter_comm_size - 1; i++) {
      recv_offset =
          ((inter_rank - i - 1 +
            inter_comm_size) % inter_comm_size) * num_core * sextent * scount;
      Request::recv((char *) rbuf + recv_offset, (rcount * num_core), rtype,
                    rank - 1, tag + i + 1, comm, MPI_STATUS_IGNORE);
    }
  }
  // intermediate rank of each SMP
  else {
    for (i = 0; i < inter_comm_size - 1; i++) {
      recv_offset =
          ((inter_rank - i - 1 +
            inter_comm_size) % inter_comm_size) * num_core * sextent * scount;
      Request::recv((char *) rbuf + recv_offset, (rcount * num_core), rtype,
                    rank - 1, tag + i + 1, comm, MPI_STATUS_IGNORE);
      Request::send((char *) rbuf + recv_offset, (scount * num_core), stype,
                    (rank + 1), tag + i + 1, comm);
    }
  }

  return MPI_SUCCESS;
}
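/* The inter-node phase above is a logical ring: in step i each node leader forwards the
 * block it obtained in step i-1 to its right neighbour while receiving the next block from
 * its left neighbour, so after p-1 steps every leader holds all p blocks. The sketch below
 * shows that pattern with plain MPI on a flat communicator; it is illustrative only and the
 * name ring_allgather_sketch is made up. */
#include <mpi.h>

static int ring_allgather_sketch(const void* sbuf, int count, MPI_Datatype type,
                                 void* rbuf, MPI_Comm comm)
{
  int rank, size;
  MPI_Comm_rank(comm, &rank);
  MPI_Comm_size(comm, &size);

  MPI_Aint lb, extent;
  MPI_Type_get_extent(type, &lb, &extent);
  char* r = static_cast<char*>(rbuf);

  /* place the local block in its slot of the receive buffer */
  MPI_Sendrecv(sbuf, count, type, rank, 0,
               r + (MPI_Aint)rank * count * extent, count, type, rank, 0,
               comm, MPI_STATUS_IGNORE);

  const int right = (rank + 1) % size;
  const int left  = (rank - 1 + size) % size;
  for (int i = 0; i < size - 1; i++) {
    const int send_block = (rank - i + size) % size;     /* block received in the previous step */
    const int recv_block = (rank - i - 1 + size) % size; /* block arriving from the left neighbour */
    MPI_Sendrecv(r + (MPI_Aint)send_block * count * extent, count, type, right, 0,
                 r + (MPI_Aint)recv_block * count * extent, count, type, left, 0,
                 comm, MPI_STATUS_IGNORE);
  }
  return MPI_SUCCESS;
}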