Example #1
int PMPI_Win_allocate_shared( MPI_Aint size, int disp_unit, MPI_Info info, MPI_Comm comm, void *base, MPI_Win *win){
  int retval = 0;
  smpi_bench_end();
  if (comm == MPI_COMM_NULL) {
    retval = MPI_ERR_COMM;
  } else if (disp_unit <= 0 || size < 0) {
    retval = MPI_ERR_OTHER;
  } else {
    void* ptr = nullptr;
    int rank = comm->rank();
    if (rank == 0) {
      ptr = xbt_malloc(size * comm->size());
      if (ptr == nullptr)
        return MPI_ERR_NO_MEM;
    }
    
    simgrid::smpi::Colls::bcast(&ptr, sizeof(void*), MPI_BYTE, 0, comm);
    simgrid::smpi::Colls::barrier(comm);
    
    *static_cast<void**>(base) = (char*)ptr + rank * size;
    *win = new simgrid::smpi::Win(ptr, size, disp_unit, info, comm, rank == 0);
    retval = MPI_SUCCESS;
  }
  smpi_bench_begin();
  return retval;
}
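The wrapper above implements MPI_Win_allocate_shared on top of SMPI. For context, here is a minimal, hypothetical user-level sketch against the standard MPI-3 API (slice size and datatype are arbitrary examples, and window synchronization such as MPI_Win_fence is omitted for brevity):

#include <mpi.h>

int main(int argc, char **argv)
{
  MPI_Init(&argc, &argv);
  int rank, size;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &size);

  /* Each rank contributes a 1024-double slice to one contiguous shared window. */
  double* my_slice;
  MPI_Win win;
  MPI_Win_allocate_shared(1024 * sizeof(double), sizeof(double), MPI_INFO_NULL,
                          MPI_COMM_WORLD, &my_slice, &win);

  /* Query the base address of the left neighbour's slice in the same window. */
  MPI_Aint seg_size;
  int disp_unit;
  double* left_slice;
  MPI_Win_shared_query(win, (rank + size - 1) % size, &seg_size, &disp_unit, &left_slice);

  MPI_Win_free(&win);
  MPI_Finalize();
  return 0;
}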
Example #2
int Coll_barrier_ompi_bruck::barrier(MPI_Comm comm)
{
    int rank, size;
    int distance, to, from;

    rank = comm->rank();
    size = comm->size();
    XBT_DEBUG("ompi_coll_tuned_barrier_ompi_bruck rank %d", rank);

    /* exchange data with rank-2^k and rank+2^k */
    for (distance = 1; distance < size; distance <<= 1) {
        from = (rank + size - distance) % size;
        to   = (rank + distance) % size;

        /* exchange zero-byte messages with both partners at this distance */
        Request::sendrecv(NULL, 0, MPI_BYTE, to, COLL_TAG_BARRIER,
                          NULL, 0, MPI_BYTE, from, COLL_TAG_BARRIER,
                          comm, MPI_STATUS_IGNORE);
    }

    return MPI_SUCCESS;

}
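For readers who want the pattern in isolation, here is the same dissemination round structure sketched with the standard MPI point-to-point API instead of SMPI's Request class: after ceil(log2(size)) rounds every rank has, directly or transitively, heard from every other rank, which is what yields the barrier semantics.

#include <mpi.h>

static void dissemination_barrier(MPI_Comm comm)
{
  int rank, size;
  MPI_Comm_rank(comm, &rank);
  MPI_Comm_size(comm, &size);

  for (int distance = 1; distance < size; distance <<= 1) {
    int to   = (rank + distance) % size;          /* partner 'distance' ahead  */
    int from = (rank + size - distance) % size;   /* partner 'distance' behind */
    MPI_Sendrecv(NULL, 0, MPI_BYTE, to, 0,
                 NULL, 0, MPI_BYTE, from, 0,
                 comm, MPI_STATUS_IGNORE);
  }
}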
Example #3
int Coll_alltoallv_ring::alltoallv(const void* send_buff, const int* send_counts, const int* send_disps, MPI_Datatype send_type,
                                   void* recv_buff, const int* recv_counts, const int* recv_disps, MPI_Datatype recv_type,
                                   MPI_Comm comm)
{
  MPI_Status s;
  MPI_Aint send_chunk, recv_chunk;
  int i, src, dst, rank, num_procs;
  int tag = COLL_TAG_ALLTOALLV;

  char *send_ptr = (char *) send_buff;
  char *recv_ptr = (char *) recv_buff;

  rank = comm->rank();
  num_procs = comm->size();
  send_chunk = send_type->get_extent();
  recv_chunk = recv_type->get_extent();
  /* pof2 is 1 when num_procs is a power of two (num_procs & (~num_procs + 1) isolates the lowest set bit) */
  int pof2 = ((num_procs != 0) && ((num_procs & (~num_procs + 1)) == num_procs));
  for (i = 0; i < num_procs; i++) {

    if (pof2 == 1) {
      /* use exclusive-or algorithm */
      src = dst = rank ^ i;
    } else {
      src = (rank - i + num_procs) % num_procs;
      dst = (rank + i) % num_procs;
    }

    Request::sendrecv(send_ptr + send_disps[dst] * send_chunk, send_counts[dst], send_type, dst,
                 tag, recv_ptr + recv_disps[src] * recv_chunk, recv_counts[src], recv_type,
                 src, tag, comm, &s);

  }
  return MPI_SUCCESS;
}
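For reference, a hypothetical user-level MPI_Alltoallv call that a ring implementation like the one above could serve. Counts and displacements are given in elements of the datatype; the algorithm above turns them into byte offsets by multiplying with the datatype extent.

#include <mpi.h>
#include <stdlib.h>

static void alltoallv_uniform(int chunk, MPI_Comm comm)
{
  int size;
  MPI_Comm_size(comm, &size);

  int* counts = (int*) malloc(size * sizeof(int));
  int* displs = (int*) malloc(size * sizeof(int));
  for (int i = 0; i < size; i++) {
    counts[i] = chunk;          /* same amount for every peer             */
    displs[i] = i * chunk;      /* offsets in MPI_INT elements, not bytes */
  }

  int* sendbuf = (int*) malloc((size_t)size * chunk * sizeof(int));
  int* recvbuf = (int*) malloc((size_t)size * chunk * sizeof(int));
  /* ... fill sendbuf ... */

  MPI_Alltoallv(sendbuf, counts, displs, MPI_INT,
                recvbuf, counts, displs, MPI_INT, comm);

  free(sendbuf); free(recvbuf); free(counts); free(displs);
}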
Example #4
int Coll_alltoallv_pair::alltoallv(void *send_buff, int *send_counts, int *send_disps,
                                  MPI_Datatype send_type,
                                  void *recv_buff, int *recv_counts, int *recv_disps,
                                  MPI_Datatype recv_type, MPI_Comm comm)
{

  MPI_Aint send_chunk, recv_chunk;
  MPI_Status s;
  int i, src, dst, rank, num_procs;
  int tag = COLL_TAG_ALLTOALLV;
  char *send_ptr = (char *) send_buff;
  char *recv_ptr = (char *) recv_buff;

  rank = comm->rank();
  num_procs = comm->size();

  if ((num_procs & (num_procs - 1)) != 0)
    THROWF(arg_error, 0, "alltoallv pair algorithm can't be used with a non power-of-two number of processes!");

  send_chunk = send_type->get_extent();
  recv_chunk = recv_type->get_extent();

  for (i = 0; i < num_procs; i++) {
    src = dst = rank ^ i;
    Request::sendrecv(send_ptr + send_disps[dst] * send_chunk, send_counts[dst], send_type, dst, tag,
                      recv_ptr + recv_disps[src] * recv_chunk, recv_counts[src], recv_type, src, tag, comm, &s);
  }
  return MPI_SUCCESS;
}
int Coll_gather_ompi::gather(const void *sbuf, int scount,
                                           MPI_Datatype sdtype,
                                           void* rbuf, int rcount,
                                           MPI_Datatype rdtype,
                                           int root,
                                           MPI_Comm  comm
                                           )
{
    //const int large_segment_size = 32768;
    //const int small_segment_size = 1024;

    //const size_t large_block_size = 92160;
    const size_t intermediate_block_size = 6000;
    const size_t small_block_size = 1024;

    const int large_communicator_size = 60;
    const int small_communicator_size = 10;

    int communicator_size, rank;
    size_t dsize, block_size;

    XBT_DEBUG("smpi_coll_tuned_gather_ompi");

    communicator_size = comm->size();
    rank = comm->rank();

    // Determine block size
    if (rank == root) {
        dsize = rdtype->size();
        block_size = dsize * rcount;
    } else {
        dsize = sdtype->size();
        block_size = dsize * scount;
    }

/*    if (block_size > large_block_size) {*/
/*        return smpi_coll_tuned_gather_ompi_linear_sync (sbuf, scount, sdtype, */
/*                                                         rbuf, rcount, rdtype, */
/*                                                         root, comm);*/

/*    } else*/ if (block_size > intermediate_block_size) {
        return Coll_gather_ompi_linear_sync::gather (sbuf, scount, sdtype,
                                                         rbuf, rcount, rdtype,
                                                         root, comm);

    } else if ((communicator_size > large_communicator_size) ||
               ((communicator_size > small_communicator_size) &&
                (block_size < small_block_size))) {
        return Coll_gather_ompi_binomial::gather (sbuf, scount, sdtype,
                                                      rbuf, rcount, rdtype,
                                                      root, comm);

    }
    // Otherwise, use basic linear
    return Coll_gather_ompi_basic_linear::gather (sbuf, scount, sdtype,
                                                      rbuf, rcount, rdtype,
                                                      root, comm);
}
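The selection logic above is easy to lose between the commented-out branches, so here is a plain restatement of the same decision (same constants, same order of tests). It is only a readability aid, not additional SMPI code.

// Paraphrase of Coll_gather_ompi::gather's selection, for readability only.
static const char* gather_ompi_choice(size_t block_size, int communicator_size)
{
  const size_t intermediate_block_size = 6000;
  const size_t small_block_size        = 1024;
  const int large_communicator_size    = 60;
  const int small_communicator_size    = 10;

  if (block_size > intermediate_block_size)
    return "linear_sync";      // large per-rank blocks
  if (communicator_size > large_communicator_size ||
      (communicator_size > small_communicator_size && block_size < small_block_size))
    return "binomial";         // many ranks, or mid-sized comm with tiny blocks
  return "basic_linear";       // everything else
}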
Example #6
int
Coll_reduce_flat_tree::reduce(const void *sbuf, void *rbuf, int count,
                                 MPI_Datatype dtype, MPI_Op op,
                                 int root, MPI_Comm comm)
{
  int i, tag = COLL_TAG_REDUCE;
  int size;
  int rank;
  MPI_Aint extent;
  unsigned char* origin = nullptr;
  const unsigned char* inbuf;
  MPI_Status status;

  rank = comm->rank();
  size = comm->size();

  /* If not root, send data to the root. */
  extent = dtype->get_extent();

  if (rank != root) {
    Request::send(sbuf, count, dtype, root, tag, comm);
    return MPI_SUCCESS;
  }

  /* Root receives and reduces messages.  Allocate buffer to receive
     messages. */

  if (size > 1)
    origin = smpi_get_tmp_recvbuffer(count * extent);

  /* Initialize the receive buffer. */
  if (rank == (size - 1))
    Request::sendrecv(sbuf, count, dtype, rank, tag,
                 rbuf, count, dtype, rank, tag, comm, &status);
  else
    Request::recv(rbuf, count, dtype, size - 1, tag, comm, &status);

  /* Loop receiving and calling reduction function (C or Fortran). */

  for (i = size - 2; i >= 0; --i) {
    if (rank == i)
      inbuf = static_cast<const unsigned char*>(sbuf);
    else {
      Request::recv(origin, count, dtype, i, tag, comm, &status);
      inbuf = origin;
    }

    /* Call reduction function. */
    if(op!=MPI_OP_NULL) op->apply( inbuf, rbuf, &count, dtype);

  }

  smpi_free_tmp_buffer(origin);

  /* All done */
  return MPI_SUCCESS;
}
int
Coll_bcast_flattree_pipeline::bcast(void *buff, int count,
                                        MPI_Datatype data_type, int root,
                                        MPI_Comm comm)
{
  int i, j, rank, num_procs;
  int tag = COLL_TAG_BCAST;

  MPI_Aint extent;
  extent = data_type->get_extent();

  int segment = flattree_segment_in_byte / extent;
  segment = segment == 0 ? 1 : segment;
  int pipe_length = count / segment;
  int increment = segment * extent;
  if (pipe_length==0) {
    XBT_WARN("MPI_bcast_flattree_pipeline use default MPI_bcast_flattree.");
    return Coll_bcast_flattree::bcast(buff, count, data_type, root, comm);
  }
  rank = comm->rank();
  num_procs = comm->size();

  MPI_Request *request_array;
  MPI_Status *status_array;

  request_array = (MPI_Request *) xbt_malloc(pipe_length * sizeof(MPI_Request));
  status_array = (MPI_Status *) xbt_malloc(pipe_length * sizeof(MPI_Status));

  if (rank != root) {
    for (i = 0; i < pipe_length; i++) {
      request_array[i] = Request::irecv((char *)buff + (i * increment), segment, data_type, root, tag, comm);
    }
    Request::waitall(pipe_length, request_array, status_array);
  }

  else {
    // Root sends data to all others
    for (j = 0; j < num_procs; j++) {
      if (j == rank)
        continue;
      else {
        for (i = 0; i < pipe_length; i++) {
          Request::send((char *)buff + (i * increment), segment, data_type, j, tag, comm);
        }
      }
    }

  }

  xbt_free(request_array);
  xbt_free(status_array);
  return MPI_SUCCESS;
}
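A worked example of the pipelining arithmetic above; the value of flattree_segment_in_byte is configured elsewhere in SMPI, so the 8192 bytes used here is only an assumed figure.

/* Assume count = 10000 MPI_DOUBLE, extent = 8 bytes, flattree_segment_in_byte = 8192:
 *   segment     = 8192 / 8      = 1024 elements per pipelined chunk
 *   pipe_length = 10000 / 1024  = 9 chunks posted/sent per peer
 *   increment   = 1024 * 8      = 8192 bytes between chunk offsets
 * The 10000 - 9*1024 = 784 trailing elements fall outside the pipelined loop as
 * written above, so a remainder of this kind needs separate handling (compare the
 * explicit remainder bcast at the end of Coll_bcast_SMP_binary further below). */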
int Coll_reduce_scatter_ompi::reduce_scatter(const void *sbuf, void *rbuf,
                                                    const int *rcounts,
                                                    MPI_Datatype dtype,
                                                    MPI_Op  op,
                                                    MPI_Comm  comm
                                                    )
{
    int comm_size, i, pow2;
    size_t total_message_size, dsize;
    const double a = 0.0012;
    const double b = 8.0;
    const size_t small_message_size = 12 * 1024;
    const size_t large_message_size = 256 * 1024;
    int zerocounts = 0;

    XBT_DEBUG("Coll_reduce_scatter_ompi::reduce_scatter");

    comm_size = comm->size();
    // We need data size for decision function
    dsize=dtype->size();
    total_message_size = 0;
    for (i = 0; i < comm_size; i++) {
        total_message_size += rcounts[i];
        if (0 == rcounts[i]) {
            zerocounts = 1;
        }
    }

    if (((op != MPI_OP_NULL) && not op->is_commutative()) || (zerocounts)) {
      Coll_reduce_scatter_default::reduce_scatter(sbuf, rbuf, rcounts, dtype, op, comm);
      return MPI_SUCCESS;
    }

    total_message_size *= dsize;

    // compute the smallest power of 2 >= comm_size
    for (pow2 = 1; pow2 < comm_size; pow2 <<= 1);

    if ((total_message_size <= small_message_size) ||
        ((total_message_size <= large_message_size) && (pow2 == comm_size)) ||
        (comm_size >= a * total_message_size + b)) {
        return
            Coll_reduce_scatter_ompi_basic_recursivehalving::reduce_scatter(sbuf, rbuf, rcounts,
                                                                        dtype, op,
                                                                        comm);
    }
    return Coll_reduce_scatter_ompi_ring::reduce_scatter(sbuf, rbuf, rcounts,
                                                     dtype, op,
                                                     comm);
}
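For reference, a hypothetical user-level MPI_Reduce_scatter call with equal-sized result chunks, of the kind this selector sits behind: every rank contributes size * chunk elements and rank i receives the i-th chunk of the element-wise reduction.

#include <mpi.h>
#include <stdlib.h>

static void reduce_scatter_uniform(int chunk, MPI_Comm comm)
{
  int size;
  MPI_Comm_size(comm, &size);

  int* rcounts = (int*) malloc(size * sizeof(int));
  for (int i = 0; i < size; i++)
    rcounts[i] = chunk;                      /* every rank gets 'chunk' results */

  double* sendbuf = (double*) malloc((size_t)size * chunk * sizeof(double));
  double* recvbuf = (double*) malloc((size_t)chunk * sizeof(double));
  /* ... fill sendbuf ... */

  MPI_Reduce_scatter(sendbuf, recvbuf, rcounts, MPI_DOUBLE, MPI_SUM, comm);

  free(rcounts); free(sendbuf); free(recvbuf);
}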
Example #9
/*
 *  gather_intra
 *
 *  Function:  - basic gather operation
 *  Accepts:  - same arguments as MPI_Gather()
 *  Returns:  - MPI_SUCCESS or error code
 */
int Coll_gather_ompi_basic_linear::gather(void* sbuf, int scount, MPI_Datatype sdtype, void* rbuf, int rcount,
                                          MPI_Datatype rdtype, int root, MPI_Comm comm)
{
    int i;
    int err;
    int rank;
    int size;
    char *ptmp;
    MPI_Aint incr;
    MPI_Aint extent;
    MPI_Aint lb;

    size = comm->size();
    rank = comm->rank();

    /* Everyone but root sends data and returns. */
    XBT_DEBUG("ompi_coll_tuned_gather_intra_basic_linear rank %d", rank);

    if (rank != root) {
        Request::send(sbuf, scount, sdtype, root,
                                 COLL_TAG_GATHER,
                                  comm);
        return MPI_SUCCESS;
    }

    /* I am the root, loop receiving the data. */

    rdtype->extent(&lb, &extent);
    incr = extent * rcount;
    for (i = 0, ptmp = (char *) rbuf; i < size; ++i, ptmp += incr) {
        if (i == rank) {
            if (MPI_IN_PLACE != sbuf) {
                err = Datatype::copy(sbuf, scount, sdtype,
                                      ptmp, rcount, rdtype);
            } else {
                err = MPI_SUCCESS;
            }
        } else {
            Request::recv(ptmp, rcount, rdtype, i,
                                    COLL_TAG_GATHER,
                                    comm, MPI_STATUS_IGNORE);
            err = MPI_SUCCESS;
        }
        if (MPI_SUCCESS != err) {
            return err;
        }
    }

    /* All done */

    return MPI_SUCCESS;
}
int Coll_alltoall_basic_linear::alltoall(void *sendbuf, int sendcount, MPI_Datatype sendtype,
                                          void *recvbuf, int recvcount, MPI_Datatype recvtype, MPI_Comm comm)
{
  int system_tag = 888;
  int i;
  int count;
  MPI_Aint lb = 0, sendext = 0, recvext = 0;
  MPI_Request *requests;

  /* Initialize. */
  int rank = comm->rank();
  int size = comm->size();
  XBT_DEBUG("<%d> algorithm alltoall_basic_linear() called.", rank);
  sendtype->extent(&lb, &sendext);
  recvtype->extent(&lb, &recvext);
  /* simple optimization */
  int err = Datatype::copy(static_cast<char *>(sendbuf) + rank * sendcount * sendext, sendcount, sendtype,
                               static_cast<char *>(recvbuf) + rank * recvcount * recvext, recvcount, recvtype);
  if (err == MPI_SUCCESS && size > 1) {
    /* Initiate all send/recv to/from others. */
    requests = xbt_new(MPI_Request, 2 * (size - 1));
    /* Post all receives first -- a simple optimization */
    count = 0;
    for (i = (rank + 1) % size; i != rank; i = (i + 1) % size) {
      requests[count] = Request::irecv_init(static_cast<char *>(recvbuf) + i * recvcount * recvext, recvcount,
                                        recvtype, i, system_tag, comm);
      count++;
    }
    /* Now post all sends in reverse order
     *   - We would like to minimize the search time through message queue
     *     when messages actually arrive in the order in which they were posted.
     * TODO: check the previous assertion
     */
    for (i = (rank + size - 1) % size; i != rank; i = (i + size - 1) % size) {
      requests[count] = Request::isend_init(static_cast<char *>(sendbuf) + i * sendcount * sendext, sendcount,
                                        sendtype, i, system_tag, comm);
      count++;
    }
    /* Wait for them all. */
    Request::startall(count, requests);
    XBT_DEBUG("<%d> wait for %d requests", rank, count);
    Request::waitall(count, requests, MPI_STATUSES_IGNORE);
    for(i = 0; i < count; i++) {
      if(requests[i]!=MPI_REQUEST_NULL)
        Request::unref(&requests[i]);
    }
    xbt_free(requests);
  }
  return err;
}
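The routine above relies on SMPI's persistent requests (irecv_init/isend_init, then startall, waitall and unref). For comparison, the equivalent pattern in the standard MPI API looks roughly like this, sketched for a single neighbour exchange:

#include <mpi.h>

static void persistent_exchange(double* sendbuf, double* recvbuf, int n, int peer, MPI_Comm comm)
{
  MPI_Request reqs[2];
  MPI_Recv_init(recvbuf, n, MPI_DOUBLE, peer, 0, comm, &reqs[0]);   /* create the descriptors ...   */
  MPI_Send_init(sendbuf, n, MPI_DOUBLE, peer, 0, comm, &reqs[1]);   /* ... without starting them    */

  MPI_Startall(2, reqs);                       /* activate both requests       */
  MPI_Waitall(2, reqs, MPI_STATUSES_IGNORE);   /* complete this round          */

  /* Persistent requests stay allocated and could be restarted for further
   * rounds; release them once the pattern is no longer needed. */
  MPI_Request_free(&reqs[0]);
  MPI_Request_free(&reqs[1]);
}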
Example #11
/*
 * Another recursive doubling type algorithm, but in this case
 * we go up the tree and back down the tree.
 */
int Coll_barrier_ompi_tree::barrier(MPI_Comm comm)
{
    int rank, size, depth;
    int jump, partner;

    rank = comm->rank();
    size = comm->size();
    XBT_DEBUG("ompi_coll_tuned_barrier_ompi_tree %d", rank);

    /* Find the smallest power of 2 that is >= the communicator size. */
    for(depth = 1; depth < size; depth <<= 1 );

    for (jump=1; jump<depth; jump<<=1) {
        partner = rank ^ jump;
        if (!(partner & (jump-1)) && partner < size) {
            if (partner > rank) {
                Request::recv (NULL, 0, MPI_BYTE, partner,
                                         COLL_TAG_BARRIER, comm,
                                         MPI_STATUS_IGNORE);
            } else if (partner < rank) {
                Request::send (NULL, 0, MPI_BYTE, partner,
                                         COLL_TAG_BARRIER,
                                          comm);
            }
        }
    }

    depth>>=1;
    for (jump = depth; jump>0; jump>>=1) {
        partner = rank ^ jump;
        if (!(partner & (jump-1)) && partner < size) {
            if (partner > rank) {
                Request::send (NULL, 0, MPI_BYTE, partner,
                                         COLL_TAG_BARRIER,
                                          comm);
            } else if (partner < rank) {
                Request::recv (NULL, 0, MPI_BYTE, partner,
                                         COLL_TAG_BARRIER, comm,
                                         MPI_STATUS_IGNORE);
            }
        }
    }

    return MPI_SUCCESS;
}
int Coll_allgatherv_ompi::allgatherv(const void *sbuf, int scount,
                                               MPI_Datatype sdtype,
                                               void* rbuf, const int *rcounts,
                                               const int *rdispls,
                                               MPI_Datatype rdtype,
                                               MPI_Comm  comm
                                               )
{
    int i;
    int communicator_size;
    size_t dsize, total_dsize;

    communicator_size = comm->size();

    /* Special case for 2 processes */
    if (communicator_size == 2) {
        return Coll_allgatherv_pair::allgatherv(sbuf, scount, sdtype,
                                                           rbuf, rcounts, rdispls, rdtype,
                                                           comm);
    }

    /* Determine complete data size */
    dsize=sdtype->size();
    total_dsize = 0;
    for (i = 0; i < communicator_size; i++) {
        total_dsize += dsize * rcounts[i];
    }

    /* Decision based on allgather decision.   */
    if (total_dsize < 50000) {
        return Coll_allgatherv_ompi_bruck::allgatherv(sbuf, scount, sdtype,
                                                      rbuf, rcounts, rdispls, rdtype,
                                                      comm);

    } else {
        if (communicator_size % 2) {
            return Coll_allgatherv_ring::allgatherv(sbuf, scount, sdtype,
                                                         rbuf, rcounts, rdispls, rdtype,
                                                         comm);
        } else {
            return  Coll_allgatherv_ompi_neighborexchange::allgatherv(sbuf, scount, sdtype,
                                                                      rbuf, rcounts, rdispls, rdtype,
                                                                      comm);
        }
    }
}
Example #13
int Coll_barrier_ompi_basic_linear::barrier(MPI_Comm comm)
{
    int i;
    int size = comm->size();
    int rank = comm->rank();

    /* All non-root send & receive zero-length message. */

    if (rank > 0) {
        Request::send (NULL, 0, MPI_BYTE, 0,
                                 COLL_TAG_BARRIER,
                                  comm);

        Request::recv (NULL, 0, MPI_BYTE, 0,
                                 COLL_TAG_BARRIER,
                                 comm, MPI_STATUS_IGNORE);
    }

    /* The root collects and broadcasts the messages. */

    else {
        MPI_Request* requests;

        requests = new MPI_Request[size];
        for (i = 1; i < size; ++i) {
            requests[i] = Request::irecv(NULL, 0, MPI_BYTE, MPI_ANY_SOURCE,
                                     COLL_TAG_BARRIER, comm
                                     );
        }
        Request::waitall( size-1, requests+1, MPI_STATUSES_IGNORE );

        for (i = 1; i < size; ++i) {
            requests[i] = Request::isend(NULL, 0, MPI_BYTE, i,
                                     COLL_TAG_BARRIER,
                                      comm
                                     );
        }
        Request::waitall( size-1, requests+1, MPI_STATUSES_IGNORE );
        delete[] requests;
    }

    /* All done */

    return MPI_SUCCESS;

}
int Coll_barrier_ompi::barrier(MPI_Comm  comm)
{
    int communicator_size = comm->size();

    if( 2 == communicator_size )
        return Coll_barrier_ompi_two_procs::barrier(comm);
    /* Basic optimisation. If we have a power of 2 number of nodes
     * then use the recursive doubling algorithm, otherwise
     * bruck is the one we want. */
    {
        int has_one = 0;
        for( ; communicator_size > 0; communicator_size >>= 1 ) {
            if( communicator_size & 0x1 ) {
                if( has_one )
                    return Coll_barrier_ompi_bruck::barrier(comm);
                has_one = 1;
            }
        }
    }
    return Coll_barrier_ompi_recursivedoubling::barrier(comm);
}
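The bit-counting loop above checks whether the communicator size is a power of two; as an aside, the same test is usually written as a single expression:

static inline bool is_power_of_two(int n)
{
  return n > 0 && (n & (n - 1)) == 0;   // exactly one bit set
}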
int Coll_scatter_ompi::scatter(const void *sbuf, int scount,
                                            MPI_Datatype sdtype,
                                            void* rbuf, int rcount,
                                            MPI_Datatype rdtype,
                                            int root, MPI_Comm  comm
                                            )
{
    const size_t small_block_size = 300;
    const int small_comm_size = 10;
    int communicator_size, rank;
    size_t dsize, block_size;

    XBT_DEBUG("Coll_scatter_ompi::scatter");

    communicator_size = comm->size();
    rank = comm->rank();
    // Determine block size
    if (root == rank) {
        dsize=sdtype->size();
        block_size = dsize * scount;
    } else {
        dsize=rdtype->size();
        block_size = dsize * rcount;
    }

    if ((communicator_size > small_comm_size) &&
        (block_size < small_block_size)) {
      std::unique_ptr<unsigned char[]> tmp_buf;
      if (rank != root) {
        tmp_buf.reset(new unsigned char[rcount * rdtype->get_extent()]);
        sbuf   = tmp_buf.get();
        scount = rcount;
        sdtype = rdtype;
      }
      return Coll_scatter_ompi_binomial::scatter(sbuf, scount, sdtype, rbuf, rcount, rdtype, root, comm);
    }
    return Coll_scatter_ompi_basic_linear::scatter (sbuf, scount, sdtype,
                                                       rbuf, rcount, rdtype,
                                                       root, comm);
}
Example #16
int
Coll_allgatherv_pair::allgatherv(void *send_buff, int send_count,
                               MPI_Datatype send_type, void *recv_buff,
                               int *recv_counts, int *recv_disps, MPI_Datatype recv_type,
                               MPI_Comm comm)
{

  MPI_Aint extent;
  unsigned int i, src, dst;
  int tag = COLL_TAG_ALLGATHERV;
  MPI_Status status;

  char *send_ptr = (char *) send_buff;
  char *recv_ptr = (char *) recv_buff;

  unsigned int rank = comm->rank();
  unsigned int num_procs = comm->size();

  if ((num_procs & (num_procs - 1)) != 0)
    THROWF(arg_error, 0, "allgatherv pair algorithm can't be used with a non power-of-two number of processes!");

  extent = send_type->get_extent();

  // local send/recv
  Request::sendrecv(send_ptr, send_count, send_type, rank, tag,
               recv_ptr + recv_disps[rank] * extent,
               recv_counts[rank], recv_type, rank, tag, comm, &status);
  for (i = 1; i < num_procs; i++) {
    src = dst = rank ^ i;
    Request::sendrecv(send_ptr, send_count, send_type, dst, tag,
                 recv_ptr + recv_disps[src] * extent, recv_counts[src], recv_type,
                 src, tag, comm, &status);
  }

  return MPI_SUCCESS;
}
int Coll_reduce_mvapich2_two_level::reduce( const void *sendbuf,
                                     void *recvbuf,
                                     int count,
                                     MPI_Datatype datatype,
                                     MPI_Op op,
                                     int root,
                                     MPI_Comm comm)
{
    int mpi_errno = MPI_SUCCESS;
    int my_rank, total_size, local_rank, local_size;
    int leader_comm_rank = -1, leader_comm_size = 0;
    MPI_Comm shmem_comm, leader_comm;
    int leader_root, leader_of_root;
    const unsigned char* in_buf = nullptr;
    unsigned char *out_buf = nullptr, *tmp_buf = nullptr;
    MPI_Aint true_lb, true_extent, extent;
    int is_commutative = 0, stride = 0;
    int intra_node_root=0;

    //if not set (use of the algo directly, without mvapich2 selector)
    if(MV2_Reduce_function==NULL)
      MV2_Reduce_function=Coll_reduce_mpich::reduce;
    if(MV2_Reduce_intra_function==NULL)
      MV2_Reduce_intra_function=Coll_reduce_mpich::reduce;

    if(comm->get_leaders_comm()==MPI_COMM_NULL){
      comm->init_smp();
    }

    my_rank = comm->rank();
    total_size = comm->size();
    shmem_comm = comm->get_intra_comm();
    local_rank = shmem_comm->rank();
    local_size = shmem_comm->size();

    leader_comm = comm->get_leaders_comm();
    int* leaders_map = comm->get_leaders_map();
    leader_of_root = comm->group()->rank(leaders_map[root]);
    leader_root = leader_comm->group()->rank(leaders_map[root]);

    is_commutative= (op==MPI_OP_NULL || op->is_commutative());

    datatype->extent(&true_lb, &true_extent);
    extent = datatype->get_extent();
    stride = count * std::max(extent, true_extent);
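
    /* Overall structure (mirrors the code below): (1) if all ranks live in one
     * SMP node, reduce directly inside shmem_comm; otherwise (2) reduce inside
     * each node onto its leader (local rank 0) into tmp_buf, (3) reduce across
     * the leaders in leader_comm towards leader_root, and (4) forward the result
     * to root with one extra point-to-point message if the leader of root's
     * node is not root itself. */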

    if (local_size == total_size) {
        /* First handle the case where there is only one node */
        if (stride <= MV2_INTRA_SHMEM_REDUCE_MSG &&
            is_commutative == 1) {
            if (local_rank == 0 ) {
              tmp_buf = smpi_get_tmp_sendbuffer(count * std::max(extent, true_extent));
              tmp_buf = tmp_buf - true_lb;
            }

            if (sendbuf != MPI_IN_PLACE) {
              in_buf = static_cast<const unsigned char*>(sendbuf);
            } else {
              in_buf = static_cast<const unsigned char*>(recvbuf);
            }

            if (local_rank == 0) {
                 if( my_rank != root) {
                     out_buf = tmp_buf;
                 } else {
                   out_buf = static_cast<unsigned char*>(recvbuf);
                   if (in_buf == out_buf) {
                     in_buf  = static_cast<const unsigned char*>(MPI_IN_PLACE);
                     out_buf = static_cast<unsigned char*>(recvbuf);
                     }
                 }
            } else {
              in_buf  = static_cast<const unsigned char*>(sendbuf);
              out_buf = nullptr;
            }

            if (count * (std::max(extent, true_extent)) < SHMEM_COLL_BLOCK_SIZE) {
              mpi_errno = MPIR_Reduce_shmem_MV2(in_buf, out_buf, count, datatype, op, 0, shmem_comm);
            } else {
              mpi_errno = MPIR_Reduce_intra_knomial_wrapper_MV2(in_buf, out_buf, count, datatype, op, 0, shmem_comm);
            }

            if (local_rank == 0 && root != my_rank) {
                Request::send(out_buf, count, datatype, root,
                                         COLL_TAG_REDUCE+1, comm);
            }
            if ((local_rank != 0) && (root == my_rank)) {
                Request::recv(recvbuf, count, datatype,
                                         leader_of_root, COLL_TAG_REDUCE+1, comm,
                                         MPI_STATUS_IGNORE);
            }
        } else {
            if(mv2_use_knomial_reduce == 1) {
                reduce_fn = &MPIR_Reduce_intra_knomial_wrapper_MV2;
            } else {
                reduce_fn = &MPIR_Reduce_binomial_MV2;
            }
            mpi_errno = reduce_fn(sendbuf, recvbuf, count,
                                  datatype, op,
                                  root, comm);
        }
        /* We are done */
        if (tmp_buf != nullptr)
          smpi_free_tmp_buffer(tmp_buf + true_lb);
        goto fn_exit;
    }


    if (local_rank == 0) {
        leader_comm = comm->get_leaders_comm();
        if(leader_comm==MPI_COMM_NULL){
          leader_comm = MPI_COMM_WORLD;
        }
        leader_comm_size = leader_comm->size();
        leader_comm_rank = leader_comm->rank();
        tmp_buf          = smpi_get_tmp_sendbuffer(count * std::max(extent, true_extent));
        tmp_buf          = tmp_buf - true_lb;
    }
    if (sendbuf != MPI_IN_PLACE) {
      in_buf = static_cast<const unsigned char*>(sendbuf);
    } else {
      in_buf = static_cast<const unsigned char*>(recvbuf);
    }
    if (local_rank == 0) {
      out_buf = static_cast<unsigned char*>(tmp_buf);
    } else {
      out_buf = nullptr;
    }


    if(local_size > 1) {
        /* Lets do the intra-node reduce operations, if we have more than one
         * process in the node */

        /*Fix the input and outbuf buffers for the intra-node reduce.
         *Node leaders will have the reduced data in tmp_buf after
         *this step*/
        if (MV2_Reduce_intra_function == &MPIR_Reduce_shmem_MV2) {
          if (is_commutative == 1 && (count * (std::max(extent, true_extent)) < SHMEM_COLL_BLOCK_SIZE)) {
            mpi_errno = MV2_Reduce_intra_function(in_buf, out_buf, count, datatype, op, intra_node_root, shmem_comm);
          } else {
            mpi_errno = MPIR_Reduce_intra_knomial_wrapper_MV2(in_buf, out_buf, count, datatype, op, intra_node_root,
                                                              shmem_comm);
          }
        } else {

            mpi_errno = MV2_Reduce_intra_function(in_buf, out_buf, count,
                                      datatype, op,
                                      intra_node_root, shmem_comm);
        }
    } else {
      smpi_free_tmp_buffer(tmp_buf + true_lb);
      tmp_buf = (unsigned char*)in_buf; // xxx
    }

    /* Now work on the inter-leader phase. Data is in tmp_buf */
    if (local_rank == 0 && leader_comm_size > 1) {
        /*The leader of root will have the global reduced data in tmp_buf
           or recv_buf
           at the end of the reduce */
        if (leader_comm_rank == leader_root) {
            if (my_rank == root) {
                /* I am the root of the leader-comm, and the
                 * root of the reduce op. So, I will write the
                 * final result directly into my recvbuf */
                if(tmp_buf != recvbuf) {
                  in_buf  = tmp_buf;
                  out_buf = static_cast<unsigned char*>(recvbuf);
                } else {

                  unsigned char* buf = smpi_get_tmp_sendbuffer(count * datatype->get_extent());
                  Datatype::copy(tmp_buf, count, datatype, buf, count, datatype);
                  // in_buf = MPI_IN_PLACE;
                  in_buf  = buf;
                  out_buf = static_cast<unsigned char*>(recvbuf);
                }
            } else {
              unsigned char* buf = smpi_get_tmp_sendbuffer(count * datatype->get_extent());
              Datatype::copy(tmp_buf, count, datatype, buf, count, datatype);
              // in_buf = MPI_IN_PLACE;
              in_buf  = buf;
              out_buf = tmp_buf;
            }
        } else {
            in_buf = tmp_buf;
            out_buf = nullptr;
        }

        /* inter-leader communication  */
        mpi_errno = MV2_Reduce_function(in_buf, out_buf, count,
                              datatype, op,
                              leader_root, leader_comm);

    }

    if (local_size > 1) {
      /* Send the message to the root if the leader is not the
       * root of the reduce operation. The reduced data is in tmp_buf */
      if ((local_rank == 0) && (root != my_rank) && (leader_root == leader_comm_rank)) {
        Request::send(tmp_buf, count, datatype, root, COLL_TAG_REDUCE + 1, comm);
      }
      if ((local_rank != 0) && (root == my_rank)) {
        Request::recv(recvbuf, count, datatype, leader_of_root, COLL_TAG_REDUCE + 1, comm, MPI_STATUS_IGNORE);
      }
      smpi_free_tmp_buffer(tmp_buf + true_lb);

      if (leader_comm_rank == leader_root) {
        if (my_rank != root || (my_rank == root && tmp_buf == recvbuf)) {
          smpi_free_tmp_buffer(in_buf);
        }
      }
    }



  fn_exit:
    return mpi_errno;
}
int Coll_reduce_ompi::reduce(const void *sendbuf, void *recvbuf,
                                            int count, MPI_Datatype  datatype,
                                            MPI_Op   op, int root,
                                            MPI_Comm   comm
                                            )
{
    int communicator_size=0;
    //int segsize = 0;
    size_t message_size, dsize;
    const double a1 =  0.6016 / 1024.0; /* [1/B] */
    const double b1 =  1.3496;
    const double a2 =  0.0410 / 1024.0; /* [1/B] */
    const double b2 =  9.7128;
    const double a3 =  0.0422 / 1024.0; /* [1/B] */
    const double b3 =  1.1614;
    //const double a4 =  0.0033 / 1024.0;  [1/B]
    //const double b4 =  1.6761;

    /* no limit on # of outstanding requests */
    //const int max_requests = 0;

    communicator_size = comm->size();

    /* need data size for decision function */
    dsize=datatype->size();
    message_size = dsize * count;   /* needed for decision */

    /**
     * If the operation is non-commutative we currently have a choice of linear
     * or in-order binary tree algorithm.
     */
    if ((op != MPI_OP_NULL) && not op->is_commutative()) {
      if ((communicator_size < 12) && (message_size < 2048)) {
        return Coll_reduce_ompi_basic_linear::reduce(sendbuf, recvbuf, count, datatype, op, root, comm /*, module*/);
      }
      return Coll_reduce_ompi_in_order_binary::reduce(sendbuf, recvbuf, count, datatype, op, root, comm /*, module,
                                                             0, max_requests*/);
    }

    if ((communicator_size < 8) && (message_size < 512)){
        /* Linear_0K */
        return Coll_reduce_ompi_basic_linear::reduce (sendbuf, recvbuf, count, datatype, op, root, comm);
    } else if (((communicator_size < 8) && (message_size < 20480)) ||
               (message_size < 2048) || (count <= 1)) {
        /* Binomial_0K */
        //segsize = 0;
        return Coll_reduce_ompi_binomial::reduce(sendbuf, recvbuf, count, datatype, op, root, comm/*, module,
                                                     segsize, max_requests*/);
    } else if (communicator_size > (a1 * message_size + b1)) {
        // Binomial_1K
        //segsize = 1024;
        return Coll_reduce_ompi_binomial::reduce(sendbuf, recvbuf, count, datatype, op, root, comm/*, module,
                                                     segsize, max_requests*/);
    } else if (communicator_size > (a2 * message_size + b2)) {
        // Pipeline_1K
        //segsize = 1024;
        return Coll_reduce_ompi_pipeline::reduce (sendbuf, recvbuf, count, datatype, op, root, comm/*, module,
                                                      segsize, max_requests*/);
    } else if (communicator_size > (a3 * message_size + b3)) {
        // Binary_32K
        //segsize = 32*1024;
        return Coll_reduce_ompi_binary::reduce( sendbuf, recvbuf, count, datatype, op, root,
                                                    comm/*, module, segsize, max_requests*/);
    }
//    if (communicator_size > (a4 * message_size + b4)) {
        // Pipeline_32K
//        segsize = 32*1024;
//    } else {
        // Pipeline_64K
//        segsize = 64*1024;
//    }
    return Coll_reduce_ompi_pipeline::reduce (sendbuf, recvbuf, count, datatype, op, root, comm/*, module,
                                                  segsize, max_requests*/);

#if 0
    /* for small messages use linear algorithm */
    if (message_size <= 4096) {
        segsize = 0;
        fanout = communicator_size - 1;
        /* when linear implemented or taken from basic put here, right now using chain as a linear system */
        /* it is implemented and I shouldn't be calling a chain with a fanout bigger than MAXTREEFANOUT from topo.h! */
        return Coll_reduce_intra_basic_linear::reduce (sendbuf, recvbuf, count, datatype, op, root, comm, module);
        /*        return Coll_reduce_intra_chain::reduce (sendbuf, recvbuf, count, datatype, op, root, comm, segsize, fanout); */
    }
    if (message_size < 524288) {
        if (message_size <= 65536 ) {
            segsize = 32768;
            fanout = 8;
        } else {
            segsize = 1024;
            fanout = communicator_size/2;
        }
        /* later swap this for a binary tree */
        /*         fanout = 2; */
        return Coll_reduce_intra_chain::reduce (sendbuf, recvbuf, count, datatype, op, root, comm, module,
                                                   segsize, fanout, max_requests);
    }
    segsize = 1024;
    return Coll_reduce_intra_pipeline::reduce (sendbuf, recvbuf, count, datatype, op, root, comm, module,
                                                  segsize, max_requests);
#endif  /* 0 */
}
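To make the crossover formulas concrete, here is the decision in Coll_reduce_ompi::reduce above evaluated for a commutative 100 KiB reduction (message_size = 102400 bytes, count > 1); the numbers follow directly from the constants a1..b3 defined in the function.

/*   a1 * 102400 + b1 = 0.6016 * 100 + 1.3496 ~= 61.5
 *   a2 * 102400 + b2 = 0.0410 * 100 + 9.7128 ~= 13.8
 *   a3 * 102400 + b3 = 0.0422 * 100 + 1.1614 ~=  5.4
 * Since the tests are applied in order: communicator_size > 61 -> binomial,
 * 14..61 -> pipeline, 6..13 -> binary, and anything smaller falls through to
 * the final pipelined reduce. */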
int Coll_bcast_ompi::bcast(void *buff, int count,
                                          MPI_Datatype datatype, int root,
                                          MPI_Comm  comm
                                          )
{
    /* Decision function based on MX results for
       messages up to 36MB and communicator sizes up to 64 nodes */
    const size_t small_message_size = 2048;
    const size_t intermediate_message_size = 370728;
    const double a_p16  = 3.2118e-6; /* [1 / byte] */
    const double b_p16  = 8.7936;
    const double a_p64  = 2.3679e-6; /* [1 / byte] */
    const double b_p64  = 1.1787;
    const double a_p128 = 1.6134e-6; /* [1 / byte] */
    const double b_p128 = 2.1102;

    int communicator_size;
    //int segsize = 0;
    size_t message_size, dsize;

    communicator_size = comm->size();

    /* else we need data size for decision function */
    dsize = datatype->size();
    message_size = dsize * (unsigned long)count;   /* needed for decision */

    /* Handle messages of small and intermediate size, and
       single-element broadcasts */
    if ((message_size < small_message_size) || (count <= 1)) {
        /* Binomial without segmentation */
        return  Coll_bcast_binomial_tree::bcast (buff, count, datatype,
                                                      root, comm);

    } else if (message_size < intermediate_message_size) {
        // SplittedBinary with 1KB segments
        return Coll_bcast_ompi_split_bintree::bcast(buff, count, datatype,
                                                         root, comm);

    }
     //Handle large message sizes
    else if (communicator_size < (a_p128 * message_size + b_p128)) {
        //Pipeline with 128KB segments
        //segsize = 1024  << 7;
        return Coll_bcast_ompi_pipeline::bcast (buff, count, datatype,
                                                     root, comm);


    } else if (communicator_size < 13) {
        // Split Binary with 8KB segments
        return Coll_bcast_ompi_split_bintree::bcast(buff, count, datatype,
                                                         root, comm);

    } else if (communicator_size < (a_p64 * message_size + b_p64)) {
        // Pipeline with 64KB segments
        //segsize = 1024 << 6;
        return Coll_bcast_ompi_pipeline::bcast (buff, count, datatype,
                                                     root, comm);


    } else if (communicator_size < (a_p16 * message_size + b_p16)) {
        //Pipeline with 16KB segments
        //segsize = 1024 << 4;
        return Coll_bcast_ompi_pipeline::bcast (buff, count, datatype,
                                                     root, comm);


    }
    /* Pipeline with 8KB segments */
    //segsize = 1024 << 3;
    return Coll_bcast_flattree_pipeline::bcast (buff, count, datatype,
                                                 root, comm
                                                 /*segsize*/);
#if 0
    /* this is based on gige measurements */

    if (communicator_size  < 4) {
        return Coll_bcast_intra_basic_linear::bcast (buff, count, datatype, root, comm, module);
    }
    if (communicator_size == 4) {
        if (message_size < 524288) segsize = 0;
        else segsize = 16384;
        return Coll_bcast_intra_bintree::bcast (buff, count, datatype, root, comm, module, segsize);
    }
    if (communicator_size <= 8 && message_size < 4096) {
        return Coll_bcast_intra_basic_linear::bcast (buff, count, datatype, root, comm, module);
    }
    if (communicator_size > 8 && message_size >= 32768 && message_size < 524288) {
        segsize = 16384;
        return  Coll_bcast_intra_bintree::bcast (buff, count, datatype, root, comm, module, segsize);
    }
    if (message_size >= 524288) {
        segsize = 16384;
        return Coll_bcast_intra_pipeline::bcast (buff, count, datatype, root, comm, module, segsize);
    }
    segsize = 0;
    /* once tested can swap this back in */
    /* return Coll_bcast_intra_bmtree::bcast (buff, count, datatype, root, comm, segsize); */
    return Coll_bcast_intra_bintree::bcast (buff, count, datatype, root, comm, module, segsize);
#endif  /* 0 */
}
int Coll_allreduce_mvapich2::allreduce(const void *sendbuf,
    void *recvbuf,
    int count,
    MPI_Datatype datatype,
    MPI_Op op, MPI_Comm comm)
{

  int mpi_errno = MPI_SUCCESS;
  //int rank = 0,
  int comm_size = 0;

  comm_size = comm->size();
  //rank = comm->rank();

  if (count == 0) {
      return MPI_SUCCESS;
  }

  if (mv2_allreduce_thresholds_table == NULL)
    init_mv2_allreduce_tables_stampede();

  /* check if multiple threads are calling this collective function */

  MPI_Aint sendtype_size = 0;
  long nbytes = 0;
  int is_commutative = 0;
  MPI_Aint true_lb, true_extent;

  sendtype_size=datatype->size();
  nbytes = count * sendtype_size;

  datatype->extent(&true_lb, &true_extent);
  is_commutative = op->is_commutative();

  {
    int range = 0, range_threshold = 0, range_threshold_intra = 0;
    int is_two_level = 0;

    /* Search for the corresponding system size inside the tuning table */
    while ((range < (mv2_size_allreduce_tuning_table - 1)) &&
        (comm_size > mv2_allreduce_thresholds_table[range].numproc)) {
        range++;
    }
    /* Search for corresponding inter-leader function */
    /* skip mcast poiters if mcast is not available */
    if(mv2_allreduce_thresholds_table[range].mcast_enabled != 1){
        while ((range_threshold < (mv2_allreduce_thresholds_table[range].size_inter_table - 1))
            && ((mv2_allreduce_thresholds_table[range].
                inter_leader[range_threshold].MV2_pt_Allreducection
                == &MPIR_Allreduce_mcst_reduce_redscat_gather_MV2) ||
                (mv2_allreduce_thresholds_table[range].
                    inter_leader[range_threshold].MV2_pt_Allreducection
                    == &MPIR_Allreduce_mcst_reduce_two_level_helper_MV2)
            )) {
            range_threshold++;
        }
    }
    while ((range_threshold < (mv2_allreduce_thresholds_table[range].size_inter_table - 1))
        && (nbytes >
    mv2_allreduce_thresholds_table[range].inter_leader[range_threshold].max)
    && (mv2_allreduce_thresholds_table[range].inter_leader[range_threshold].max != -1)) {
        range_threshold++;
    }
    if(mv2_allreduce_thresholds_table[range].is_two_level_allreduce[range_threshold] == 1){
        is_two_level = 1;
    }
    /* Search for corresponding intra-node function */
    while ((range_threshold_intra <
        (mv2_allreduce_thresholds_table[range].size_intra_table - 1))
        && (nbytes >
    mv2_allreduce_thresholds_table[range].intra_node[range_threshold_intra].max)
    && (mv2_allreduce_thresholds_table[range].intra_node[range_threshold_intra].max !=
        -1)) {
        range_threshold_intra++;
    }

    MV2_Allreducection = mv2_allreduce_thresholds_table[range].inter_leader[range_threshold]
                                                                                .MV2_pt_Allreducection;

    MV2_Allreduce_intra_function = mv2_allreduce_thresholds_table[range].intra_node[range_threshold_intra]
                                                                                    .MV2_pt_Allreducection;

    /* check if mcast is ready, otherwise replace mcast with other algorithm */
    if ((MV2_Allreducection == &MPIR_Allreduce_mcst_reduce_redscat_gather_MV2) ||
        (MV2_Allreducection == &MPIR_Allreduce_mcst_reduce_two_level_helper_MV2)) {
        MV2_Allreducection = &MPIR_Allreduce_pt2pt_rd_MV2;
        if (is_two_level != 1) {
            MV2_Allreducection = &MPIR_Allreduce_pt2pt_rd_MV2;
        }
    }

    if(is_two_level == 1){
        // check if shm is ready, if not use other algorithm first
        if (is_commutative) {
          if(comm->get_leaders_comm()==MPI_COMM_NULL){
            comm->init_smp();
          }
          mpi_errno = MPIR_Allreduce_two_level_MV2(sendbuf, recvbuf, count,
                                                   datatype, op, comm);
        } else {
          mpi_errno = MPIR_Allreduce_pt2pt_rd_MV2(sendbuf, recvbuf, count,
                                                  datatype, op, comm);
        }
    } else {
        mpi_errno = MV2_Allreducection(sendbuf, recvbuf, count,
            datatype, op, comm);
    }
  }

  //comm->ch.intra_node_done=0;

  return mpi_errno;
}
int Coll_allgather_ompi::allgather(const void *sbuf, int scount,
                                              MPI_Datatype sdtype,
                                              void* rbuf, int rcount,
                                              MPI_Datatype rdtype,
                                              MPI_Comm  comm
                                              )
{
    int communicator_size, pow2_size;
    size_t dsize, total_dsize;

    communicator_size = comm->size();

    /* Special case for 2 processes */
    if (communicator_size == 2) {
        return Coll_allgather_pair::allgather (sbuf, scount, sdtype,
                                                          rbuf, rcount, rdtype,
                                                          comm/*, module*/);
    }

    /* Determine complete data size */
    dsize=sdtype->size();
    total_dsize = dsize * scount * communicator_size;

    for (pow2_size  = 1; pow2_size < communicator_size; pow2_size <<=1);

    /* Decision based on MX 2Gb results from Grig cluster at
       The University of Tennessee, Knoxville
       - if total message size is less than 50KB use either bruck or
       recursive doubling for non-power of two and power of two nodes,
       respectively.
       - else use ring and neighbor exchange algorithms for odd and even
       number of nodes, respectively.
    */
    if (total_dsize < 50000) {
        if (pow2_size == communicator_size) {
            return Coll_allgather_rdb::allgather(sbuf, scount, sdtype,
                                                                     rbuf, rcount, rdtype,
                                                                     comm);
        } else {
            return Coll_allgather_bruck::allgather(sbuf, scount, sdtype,
                                                         rbuf, rcount, rdtype,
                                                         comm);
        }
    } else {
        if (communicator_size % 2) {
            return Coll_allgather_ring::allgather(sbuf, scount, sdtype,
                                                        rbuf, rcount, rdtype,
                                                        comm);
        } else {
            return  Coll_allgather_ompi_neighborexchange::allgather(sbuf, scount, sdtype,
                                                                     rbuf, rcount, rdtype,
                                                                     comm);
        }
    }

#if defined(USE_MPICH2_DECISION)
    /* Decision as in MPICH-2
       presented in Thakur et.al. "Optimization of Collective Communication
       Operations in MPICH", International Journal of High Performance Computing
       Applications, Vol. 19, No. 1, 49-66 (2005)
       - for power-of-two processes and small and medium size messages
       (up to 512KB) use recursive doubling
       - for non-power-of-two processes and small messages (80KB) use bruck,
       - for everything else use ring.
    */
    if ((pow2_size == communicator_size) && (total_dsize < 524288)) {
        return Coll_allgather_rdb::allgather(sbuf, scount, sdtype,
                                                                 rbuf, rcount, rdtype,
                                                                 comm);
    } else if (total_dsize <= 81920) {
        return Coll_allgather_bruck::allgather(sbuf, scount, sdtype,
                                                     rbuf, rcount, rdtype,
                                                     comm);
    }
    return Coll_allgather_ring::allgather(sbuf, scount, sdtype,
                                                rbuf, rcount, rdtype,
                                                comm);
#endif  /* defined(USE_MPICH2_DECISION) */
}
Example #22
int Coll_bcast_SMP_binary::bcast(void *buf, int count,
                                     MPI_Datatype datatype, int root,
                                     MPI_Comm comm)
{
  int tag = COLL_TAG_BCAST;
  MPI_Status status;
  MPI_Request request;
  MPI_Request *request_array;
  MPI_Status *status_array;
  int rank, size;
  int i;
  MPI_Aint extent;
  extent = datatype->get_extent();

  rank = comm->rank();
  size = comm->size();
  if(comm->get_leaders_comm()==MPI_COMM_NULL){
    comm->init_smp();
  }
  int host_num_core = 1;
  if (comm->is_uniform()) {
    host_num_core = comm->get_intra_comm()->size();
  } else {
    // implementation buggy in this case
    return Coll_bcast_mpich::bcast(buf, count, datatype, root, comm);
  }

  int segment = bcast_SMP_binary_segment_byte / extent;
  int pipe_length = count / segment;
  int remainder = count % segment;
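
  /* Index arithmetic for the two-level binary tree (derived from the expressions
   * below): ranks are grouped into SMP nodes of host_num_core consecutive ranks.
   * to_intra_left/right are a rank's children inside its own node and from_intra
   * its intra-node parent, while to_inter_left/right and from_inter connect the
   * node leaders (local rank 0) in a second binary tree built over the nodes. */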

  int to_intra_left = (rank / host_num_core) * host_num_core + (rank % host_num_core) * 2 + 1;
  int to_intra_right = (rank / host_num_core) * host_num_core + (rank % host_num_core) * 2 + 2;
  int to_inter_left = ((rank / host_num_core) * 2 + 1) * host_num_core;
  int to_inter_right = ((rank / host_num_core) * 2 + 2) * host_num_core;
  int from_inter = (((rank / host_num_core) - 1) / 2) * host_num_core;
  int from_intra = (rank / host_num_core) * host_num_core + ((rank % host_num_core) - 1) / 2;
  int increment = segment * extent;

  int base = (rank / host_num_core) * host_num_core;
  int num_core = host_num_core;
  if (((rank / host_num_core) * host_num_core) == ((size / host_num_core) * host_num_core))
    num_core = size - (rank / host_num_core) * host_num_core;

  // if root is not zero send to rank zero first
  if (root != 0) {
    if (rank == root)
      Request::send(buf, count, datatype, 0, tag, comm);
    else if (rank == 0)
      Request::recv(buf, count, datatype, root, tag, comm, &status);
  }
  // when a message is smaller than a block size => no pipeline
  if (count <= segment) {
    // case ROOT-of-each-SMP
    if (rank % host_num_core == 0) {
      // case ROOT
      if (rank == 0) {
        //printf("node %d left %d right %d\n",rank,to_inter_left,to_inter_right);
        if (to_inter_left < size)
          Request::send(buf, count, datatype, to_inter_left, tag, comm);
        if (to_inter_right < size)
          Request::send(buf, count, datatype, to_inter_right, tag, comm);
        if ((to_intra_left - base) < num_core)
          Request::send(buf, count, datatype, to_intra_left, tag, comm);
        if ((to_intra_right - base) < num_core)
          Request::send(buf, count, datatype, to_intra_right, tag, comm);
      }
      // case LEAVES ROOT-of-each-SMP
      else if (to_inter_left >= size) {
        //printf("node %d from %d\n",rank,from_inter);
        request = Request::irecv(buf, count, datatype, from_inter, tag, comm);
        Request::wait(&request, &status);
        if ((to_intra_left - base) < num_core)
          Request::send(buf, count, datatype, to_intra_left, tag, comm);
        if ((to_intra_right - base) < num_core)
          Request::send(buf, count, datatype, to_intra_right, tag, comm);
      }
      // case INTERMEDIATE ROOT-of-each-SMP
      else {
        //printf("node %d left %d right %d from %d\n",rank,to_inter_left,to_inter_right,from_inter);
        request = Request::irecv(buf, count, datatype, from_inter, tag, comm);
        Request::wait(&request, &status);
        Request::send(buf, count, datatype, to_inter_left, tag, comm);
        if (to_inter_right < size)
          Request::send(buf, count, datatype, to_inter_right, tag, comm);
        if ((to_intra_left - base) < num_core)
          Request::send(buf, count, datatype, to_intra_left, tag, comm);
        if ((to_intra_right - base) < num_core)
          Request::send(buf, count, datatype, to_intra_right, tag, comm);
      }
    }
    // case non ROOT-of-each-SMP
    else {
      // case leaves
      if ((to_intra_left - base) >= num_core) {
        request = Request::irecv(buf, count, datatype, from_intra, tag, comm);
        Request::wait(&request, &status);
      }
      // case intermediate
      else {
        request = Request::irecv(buf, count, datatype, from_intra, tag, comm);
        Request::wait(&request, &status);
        Request::send(buf, count, datatype, to_intra_left, tag, comm);
        if ((to_intra_right - base) < num_core)
          Request::send(buf, count, datatype, to_intra_right, tag, comm);
      }
    }

    return MPI_SUCCESS;
  }

  // pipeline bcast
  else {
    request_array =
        (MPI_Request *) xbt_malloc((size + pipe_length) * sizeof(MPI_Request));
    status_array =
        (MPI_Status *) xbt_malloc((size + pipe_length) * sizeof(MPI_Status));

    // case ROOT-of-each-SMP
    if (rank % host_num_core == 0) {
      // case ROOT
      if (rank == 0) {
        for (i = 0; i < pipe_length; i++) {
          //printf("node %d left %d right %d\n",rank,to_inter_left,to_inter_right);
          if (to_inter_left < size)
            Request::send((char *) buf + (i * increment), segment, datatype,
                     to_inter_left, (tag + i), comm);
          if (to_inter_right < size)
            Request::send((char *) buf + (i * increment), segment, datatype,
                     to_inter_right, (tag + i), comm);
          if ((to_intra_left - base) < num_core)
            Request::send((char *) buf + (i * increment), segment, datatype,
                     to_intra_left, (tag + i), comm);
          if ((to_intra_right - base) < num_core)
            Request::send((char *) buf + (i * increment), segment, datatype,
                     to_intra_right, (tag + i), comm);
        }
      }
      // case LEAVES ROOT-of-each-SMP
      else if (to_inter_left >= size) {
        //printf("node %d from %d\n",rank,from_inter);
        for (i = 0; i < pipe_length; i++) {
          request_array[i] = Request::irecv((char *) buf + (i * increment), segment, datatype,
                    from_inter, (tag + i), comm);
        }
        for (i = 0; i < pipe_length; i++) {
          Request::wait(&request_array[i], &status);
          if ((to_intra_left - base) < num_core)
            Request::send((char *) buf + (i * increment), segment, datatype,
                     to_intra_left, (tag + i), comm);
          if ((to_intra_right - base) < num_core)
            Request::send((char *) buf + (i * increment), segment, datatype,
                     to_intra_right, (tag + i), comm);
        }
      }
      // case INTERMEDIATE ROOT-of-each-SMP
      else {
        //printf("node %d left %d right %d from %d\n",rank,to_inter_left,to_inter_right,from_inter);
        for (i = 0; i < pipe_length; i++) {
          request_array[i] = Request::irecv((char *) buf + (i * increment), segment, datatype,
                    from_inter, (tag + i), comm);
        }
        for (i = 0; i < pipe_length; i++) {
          Request::wait(&request_array[i], &status);
          Request::send((char *) buf + (i * increment), segment, datatype,
                   to_inter_left, (tag + i), comm);
          if (to_inter_right < size)
            Request::send((char *) buf + (i * increment), segment, datatype,
                     to_inter_right, (tag + i), comm);
          if ((to_intra_left - base) < num_core)
            Request::send((char *) buf + (i * increment), segment, datatype,
                     to_intra_left, (tag + i), comm);
          if ((to_intra_right - base) < num_core)
            Request::send((char *) buf + (i * increment), segment, datatype,
                     to_intra_right, (tag + i), comm);
        }
      }
    }
    // case non-ROOT-of-each-SMP
    else {
      // case leaves
      if ((to_intra_left - base) >= num_core) {
        for (i = 0; i < pipe_length; i++) {
          request_array[i] = Request::irecv((char *) buf + (i * increment), segment, datatype,
                    from_intra, (tag + i), comm);
        }
        Request::waitall((pipe_length), request_array, status_array);
      }
      // case intermediate
      else {
        for (i = 0; i < pipe_length; i++) {
          request_array[i] = Request::irecv((char *) buf + (i * increment), segment, datatype,
                    from_intra, (tag + i), comm);
        }
        for (i = 0; i < pipe_length; i++) {
          Request::wait(&request_array[i], &status);
          Request::send((char *) buf + (i * increment), segment, datatype,
                   to_intra_left, (tag + i), comm);
          if ((to_intra_right - base) < num_core)
            Request::send((char *) buf + (i * increment), segment, datatype,
                     to_intra_right, (tag + i), comm);
        }
      }
    }

    free(request_array);
    free(status_array);
  }

  // when count is not divisible by block size, use default BCAST for the remainder
  if ((remainder != 0) && (count > segment)) {
    XBT_WARN("MPI_bcast_SMP_binary uses default MPI_bcast.");
    Colls::bcast((char *) buf + (pipe_length * increment), remainder, datatype,
              root, comm);
  }

  return MPI_SUCCESS;
}
int Coll_bcast_mvapich2::bcast(void *buffer,
    int count,
    MPI_Datatype datatype,
    int root, MPI_Comm comm)
{
    int mpi_errno = MPI_SUCCESS;
    int comm_size/*, rank*/;
    int two_level_bcast = 1;
    long nbytes = 0;
    int range = 0;
    int range_threshold = 0;
    int range_threshold_intra = 0;
    // int is_homogeneous, is_contig;
    MPI_Aint type_size;
    //, position;
    // unsigned char *tmp_buf = NULL;
    MPI_Comm shmem_comm;
    //MPID_Datatype *dtp;

    if (count == 0)
        return MPI_SUCCESS;
    if(comm->get_leaders_comm()==MPI_COMM_NULL){
      comm->init_smp();
    }
    if (not mv2_bcast_thresholds_table)
      init_mv2_bcast_tables_stampede();
    comm_size = comm->size();
    //rank = comm->rank();

    //is_contig=1;
/*    if (HANDLE_GET_KIND(datatype) == HANDLE_KIND_BUILTIN)*/
/*        is_contig = 1;*/
/*    else {*/
/*        MPID_Datatype_get_ptr(datatype, dtp);*/
/*        is_contig = dtp->is_contig;*/
/*    }*/

    // is_homogeneous = 1;

    /* MPI_Type_size() might not give the accurate size of the packed
     * datatype for heterogeneous systems (because of padding, encoding,
     * etc). On the other hand, MPI_Pack_size() can become very
     * expensive, depending on the implementation, especially for
     * heterogeneous systems. We want to use MPI_Type_size() wherever
     * possible, and MPI_Pack_size() in other places.
     */
    //if (is_homogeneous) {
        type_size=datatype->size();

   /* } else {
        MPIR_Pack_size_impl(1, datatype, &type_size);
    }*/
    nbytes =  (count) * (type_size);
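
    /* The selection below is a three-stage lookup in the Stampede-tuned
     * thresholds table: pick the row matching the communicator size, then the
     * inter-leader function whose message-size bracket contains nbytes, then
     * the intra-node function the same way (a max of -1 marks an open-ended
     * bracket). */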

    /* Search for the corresponding system size inside the tuning table */
    while ((range < (mv2_size_bcast_tuning_table - 1)) &&
           (comm_size > mv2_bcast_thresholds_table[range].numproc)) {
        range++;
    }
    /* Search for corresponding inter-leader function */
    while ((range_threshold < (mv2_bcast_thresholds_table[range].size_inter_table - 1))
           && (nbytes >
               mv2_bcast_thresholds_table[range].inter_leader[range_threshold].max)
           && (mv2_bcast_thresholds_table[range].inter_leader[range_threshold].max != -1)) {
        range_threshold++;
    }

    /* Search for corresponding intra-node function */
    while ((range_threshold_intra <
            (mv2_bcast_thresholds_table[range].size_intra_table - 1))
           && (nbytes >
               mv2_bcast_thresholds_table[range].intra_node[range_threshold_intra].max)
           && (mv2_bcast_thresholds_table[range].intra_node[range_threshold_intra].max !=
               -1)) {
        range_threshold_intra++;
    }

    MV2_Bcast_function =
        mv2_bcast_thresholds_table[range].inter_leader[range_threshold].
        MV2_pt_Bcast_function;

    MV2_Bcast_intra_node_function =
        mv2_bcast_thresholds_table[range].
        intra_node[range_threshold_intra].MV2_pt_Bcast_function;

/*    if (mv2_user_bcast_intra == NULL && */
/*            MV2_Bcast_intra_node_function == &MPIR_Knomial_Bcast_intra_node_MV2) {*/
/*            MV2_Bcast_intra_node_function = &MPIR_Shmem_Bcast_MV2;*/
/*    }*/

    if (mv2_bcast_thresholds_table[range].inter_leader[range_threshold].
        zcpy_pipelined_knomial_factor != -1) {
        zcpy_knomial_factor =
            mv2_bcast_thresholds_table[range].inter_leader[range_threshold].
            zcpy_pipelined_knomial_factor;
    }

    if (mv2_pipelined_zcpy_knomial_factor != -1) {
        zcpy_knomial_factor = mv2_pipelined_zcpy_knomial_factor;
    }

    if(MV2_Bcast_intra_node_function == NULL) {
        /* if the tuning table does not provide an intra-node selection, fall
        ** back to the default shared-memory intra-node bcast */
        MV2_Bcast_intra_node_function = &MPIR_Shmem_Bcast_MV2;
    }

    /* Set value of pipeline segment size */
    bcast_segment_size = mv2_bcast_thresholds_table[range].bcast_segment_size;

    /* Set value of inter node knomial factor */
    mv2_inter_node_knomial_factor = mv2_bcast_thresholds_table[range].inter_node_knomial_factor;

    /* Set value of intra node knomial factor */
    mv2_intra_node_knomial_factor = mv2_bcast_thresholds_table[range].intra_node_knomial_factor;

    /* Check if we will use a two level algorithm or not */
    two_level_bcast =
#if defined(_MCST_SUPPORT_)
        mv2_bcast_thresholds_table[range].is_two_level_bcast[range_threshold]
        || comm->ch.is_mcast_ok;
#else
        mv2_bcast_thresholds_table[range].is_two_level_bcast[range_threshold];
#endif
     if (two_level_bcast == 1) {
       // if (not is_contig || not is_homogeneous) {
//   tmp_buf = smpi_get_tmp_sendbuffer(nbytes);

/*            position = 0;*/
/*            if (rank == root) {*/
/*                mpi_errno =*/
/*                    MPIR_Pack_impl(buffer, count, datatype, tmp_buf, nbytes, &position);*/
/*                if (mpi_errno)*/
/*                    MPIU_ERR_POP(mpi_errno);*/
/*            }*/
// }
#ifdef CHANNEL_MRAIL_GEN2
        if ((mv2_enable_zcpy_bcast == 1) &&
              (&MPIR_Pipelined_Bcast_Zcpy_MV2 == MV2_Bcast_function)) {
          // if (not is_contig || not is_homogeneous) {
          //   mpi_errno = MPIR_Pipelined_Bcast_Zcpy_MV2(tmp_buf, nbytes, MPI_BYTE, root, comm);
          // } else {
                mpi_errno = MPIR_Pipelined_Bcast_Zcpy_MV2(buffer, count, datatype,
                                                 root, comm);
          // }
        } else
#endif /* defined(CHANNEL_MRAIL_GEN2) */
        {
            shmem_comm = comm->get_intra_comm();
            // if (not is_contig || not is_homogeneous) {
            //   MPIR_Bcast_tune_inter_node_helper_MV2(tmp_buf, nbytes, MPI_BYTE, root, comm);
            // } else {
              MPIR_Bcast_tune_inter_node_helper_MV2(buffer, count, datatype, root, comm);
            // }

            /* We are now done with the inter-node phase */

            root = INTRA_NODE_ROOT;

            // if (not is_contig || not is_homogeneous) {
            //     mpi_errno = MV2_Bcast_intra_node_function(tmp_buf, nbytes, MPI_BYTE, root, shmem_comm);
            // } else {
            mpi_errno = MV2_Bcast_intra_node_function(buffer, count,
                                                      datatype, root, shmem_comm);
            // }
        }
        /*        if (not is_contig || not is_homogeneous) {*/
        /*            if (rank != root) {*/
        /*                position = 0;*/
        /*                mpi_errno = MPIR_Unpack_impl(tmp_buf, nbytes, &position, buffer,*/
        /*                                             count, datatype);*/
        /*            }*/
        /*        }*/
    } else {
        /* We use Knomial for intra node */
        MV2_Bcast_intra_node_function = &MPIR_Knomial_Bcast_intra_node_MV2;
/*        if (mv2_enable_shmem_bcast == 0) {*/
            /* Fall back to non-tuned version */
/*            MPIR_Bcast_intra_MV2(buffer, count, datatype, root, comm);*/
/*        } else {*/
            mpi_errno = MV2_Bcast_function(buffer, count, datatype, root,
                                           comm);

/*        }*/
    }


    return mpi_errno;

}
int Coll_scatter_mvapich2::scatter(const void *sendbuf,
    int sendcnt,
    MPI_Datatype sendtype,
    void *recvbuf,
    int recvcnt,
    MPI_Datatype recvtype,
    int root, MPI_Comm comm)
{
  int range = 0, range_threshold = 0, range_threshold_intra = 0;
  int mpi_errno = MPI_SUCCESS;
  //   int mpi_errno_ret = MPI_SUCCESS;
  int rank, nbytes, comm_size;
  int partial_sub_ok = 0;
  int conf_index = 0;
  MPI_Comm shmem_comm;
  //    MPID_Comm *shmem_commptr=NULL;
  if(mv2_scatter_thresholds_table==NULL)
    init_mv2_scatter_tables_stampede();

  if(comm->get_leaders_comm()==MPI_COMM_NULL){
    comm->init_smp();
  }

  comm_size = comm->size();

  rank = comm->rank();

  if (rank == root) {
    int sendtype_size = sendtype->size();
    nbytes            = sendcnt * sendtype_size;
  } else {
    int recvtype_size = recvtype->size();
    nbytes            = recvcnt * recvtype_size;
  }

    // check if safe to use partial subscription mode
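    // ("partial subscription" here means running with fewer processes per node
    // than the tuning tables were generated for; conf_index then selects the
    // per-ppn table whose entry matches the local shmem communicator size)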
    if (comm->is_uniform()) {

        shmem_comm = comm->get_intra_comm();
        if (mv2_scatter_table_ppn_conf[0] == -1) {
            // Indicating user defined tuning
            conf_index = 0;
        }else{
          int local_size = shmem_comm->size();
          int i          = 0;
            do {
                if (local_size == mv2_scatter_table_ppn_conf[i]) {
                    conf_index = i;
                    partial_sub_ok = 1;
                    break;
                }
                i++;
            } while(i < mv2_scatter_num_ppn_conf);
        }
    }

  if (partial_sub_ok != 1) {
      conf_index = 0;
  }

  /* Search for the corresponding system size inside the tuning table */
  while ((range < (mv2_size_scatter_tuning_table[conf_index] - 1)) &&
      (comm_size > mv2_scatter_thresholds_table[conf_index][range].numproc)) {
      range++;
  }
  /* Search for corresponding inter-leader function */
  while ((range_threshold < (mv2_scatter_thresholds_table[conf_index][range].size_inter_table - 1))
      && (nbytes >
  mv2_scatter_thresholds_table[conf_index][range].inter_leader[range_threshold].max)
  && (mv2_scatter_thresholds_table[conf_index][range].inter_leader[range_threshold].max != -1)) {
      range_threshold++;
  }

  /* Search for corresponding intra-node function */
  while ((range_threshold_intra <
      (mv2_scatter_thresholds_table[conf_index][range].size_intra_table - 1))
      && (nbytes >
  mv2_scatter_thresholds_table[conf_index][range].intra_node[range_threshold_intra].max)
  && (mv2_scatter_thresholds_table[conf_index][range].intra_node[range_threshold_intra].max !=
      -1)) {
      range_threshold_intra++;
  }

  MV2_Scatter_function = mv2_scatter_thresholds_table[conf_index][range].inter_leader[range_threshold]
                                                                                      .MV2_pt_Scatter_function;

  if(MV2_Scatter_function == &MPIR_Scatter_mcst_wrap_MV2) {
#if defined(_MCST_SUPPORT_)
      if(comm->ch.is_mcast_ok == 1
          && mv2_use_mcast_scatter == 1
          && comm->ch.shmem_coll_ok == 1) {
          MV2_Scatter_function = &MPIR_Scatter_mcst_MV2;
      } else
#endif /*#if defined(_MCST_SUPPORT_) */
        {
          if(mv2_scatter_thresholds_table[conf_index][range].inter_leader[range_threshold + 1].
              MV2_pt_Scatter_function != NULL) {
              MV2_Scatter_function = mv2_scatter_thresholds_table[conf_index][range].inter_leader[range_threshold + 1]
                                                                                                  .MV2_pt_Scatter_function;
          } else {
              /* Fallback! */
              MV2_Scatter_function = &MPIR_Scatter_MV2_Binomial;
          }
        }
  }

  if ((MV2_Scatter_function == &MPIR_Scatter_MV2_two_level_Direct) ||
      (MV2_Scatter_function == &MPIR_Scatter_MV2_two_level_Binomial)) {
      if (comm->is_blocked()) {
          MV2_Scatter_intra_function =
              mv2_scatter_thresholds_table[conf_index][range].intra_node[range_threshold_intra].MV2_pt_Scatter_function;

          mpi_errno = MV2_Scatter_function(sendbuf, sendcnt, sendtype,
                                           recvbuf, recvcnt, recvtype, root,
                                           comm);
      } else {
          mpi_errno = MPIR_Scatter_MV2_Binomial(sendbuf, sendcnt, sendtype,
                                                recvbuf, recvcnt, recvtype, root,
                                                comm);
      }
  } else {
      mpi_errno = MV2_Scatter_function(sendbuf, sendcnt, sendtype,
          recvbuf, recvcnt, recvtype, root,
          comm);
  }
  return (mpi_errno);
}
int Coll_scatter_mvapich2_two_level_direct::scatter(void *sendbuf,
                                      int sendcnt,
                                      MPI_Datatype sendtype,
                                      void *recvbuf,
                                      int recvcnt,
                                      MPI_Datatype recvtype,
                                      int root, MPI_Comm  comm)
{
    int comm_size, rank;
    int local_rank, local_size;
    int leader_comm_rank = -1, leader_comm_size = -1;
    int mpi_errno = MPI_SUCCESS;
    int recvtype_size, sendtype_size, nbytes;
    void *tmp_buf = NULL;
    void *leader_scatter_buf = NULL;
    MPI_Status status;
    int leader_root, leader_of_root = -1;
    MPI_Comm shmem_comm, leader_comm;
    //if not set (use of the algo directly, without mvapich2 selector)
    if(MV2_Scatter_intra_function==NULL)
      MV2_Scatter_intra_function=Coll_scatter_mpich::scatter;

    if(comm->get_leaders_comm()==MPI_COMM_NULL){
      comm->init_smp();
    }
    comm_size = comm->size();
    rank = comm->rank();

    if (((rank == root) && (recvcnt == 0))
        || ((rank != root) && (sendcnt == 0))) {
        return MPI_SUCCESS;
    }

    /* extract the rank,size information for the intra-node
     * communicator */
    shmem_comm = comm->get_intra_comm();
    local_rank = shmem_comm->rank();
    local_size = shmem_comm->size();

    if (local_rank == 0) {
        /* Node leader. Extract the rank, size information for the leader
         * communicator */
        leader_comm = comm->get_leaders_comm();
        leader_comm_size = leader_comm->size();
        leader_comm_rank = leader_comm->rank();
    }

    if (local_size == comm_size) {
        /* purely intra-node scatter. Just use the direct algorithm and we are done */
        mpi_errno = MPIR_Scatter_MV2_Direct(sendbuf, sendcnt, sendtype,
                                            recvbuf, recvcnt, recvtype,
                                            root, comm);

    } else {
        recvtype_size=recvtype->size();
        sendtype_size=sendtype->size();

        if (rank == root) {
            nbytes = sendcnt * sendtype_size;
        } else {
            nbytes = recvcnt * recvtype_size;
        }

        if (local_rank == 0) {
            /* Node leader, allocate tmp_buffer */
            tmp_buf = smpi_get_tmp_sendbuffer(nbytes * local_size);
        }

        leader_comm = comm->get_leaders_comm();
        int* leaders_map = comm->get_leaders_map();
        leader_of_root = comm->group()->rank(leaders_map[root]);
        leader_root = leader_comm->group()->rank(leaders_map[root]);
        /* leader_root is the rank of the leader of the root in leader_comm.
         * leader_root is to be used as the root of the inter-leader gather ops
         */

        if ((local_rank == 0) && (root != rank)
            && (leader_of_root == rank)) {
            /* The root of the scatter operation is not the node leader. Recv
             * data from the node leader */
            leader_scatter_buf = smpi_get_tmp_sendbuffer(nbytes * comm_size);
            Request::recv(leader_scatter_buf, nbytes * comm_size, MPI_BYTE,
                             root, COLL_TAG_SCATTER, comm, &status);

        }

        if (rank == root && local_rank != 0) {
            /* The root of the scatter operation is not the node leader. Send
             * data to the node leader */
            Request::send(sendbuf, sendcnt * comm_size, sendtype,
                                     leader_of_root, COLL_TAG_SCATTER, comm
                                     );
        }
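        /* At this point the data sits at the leader of the root's node. Next
         * the leaders scatter the per-node blocks among themselves (a scatterv
         * when nodes have different sizes), and finally each leader scatters
         * its block inside the node over shmem_comm. */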

        if (leader_comm_size > 1 && local_rank == 0) {
          if (not comm->is_uniform()) {
            int* displs   = NULL;
            int* sendcnts = NULL;
            int* node_sizes;
            int i      = 0;
            node_sizes = comm->get_non_uniform_map();

            if (root != leader_of_root) {
              if (leader_comm_rank == leader_root) {
                displs      = static_cast<int*>(xbt_malloc(sizeof(int) * leader_comm_size));
                sendcnts    = static_cast<int*>(xbt_malloc(sizeof(int) * leader_comm_size));
                sendcnts[0] = node_sizes[0] * nbytes;
                displs[0]   = 0;

                for (i = 1; i < leader_comm_size; i++) {
                  displs[i]   = displs[i - 1] + node_sizes[i - 1] * nbytes;
                  sendcnts[i] = node_sizes[i] * nbytes;
                }
              }
              Colls::scatterv(leader_scatter_buf, sendcnts, displs, MPI_BYTE, tmp_buf, nbytes * local_size, MPI_BYTE,
                              leader_root, leader_comm);
            } else {
              if (leader_comm_rank == leader_root) {
                displs      = static_cast<int*>(xbt_malloc(sizeof(int) * leader_comm_size));
                sendcnts    = static_cast<int*>(xbt_malloc(sizeof(int) * leader_comm_size));
                sendcnts[0] = node_sizes[0] * sendcnt;
                displs[0]   = 0;

                for (i = 1; i < leader_comm_size; i++) {
                  displs[i]   = displs[i - 1] + node_sizes[i - 1] * sendcnt;
                  sendcnts[i] = node_sizes[i] * sendcnt;
                }
              }
              Colls::scatterv(sendbuf, sendcnts, displs, sendtype, tmp_buf, nbytes * local_size, MPI_BYTE, leader_root,
                              leader_comm);
            }
            if (leader_comm_rank == leader_root) {
              xbt_free(displs);
              xbt_free(sendcnts);
            }
          } else {
            if (leader_of_root != root) {
              mpi_errno = MPIR_Scatter_MV2_Direct(leader_scatter_buf, nbytes * local_size, MPI_BYTE,
                                                  tmp_buf, nbytes * local_size, MPI_BYTE,
                                                  leader_root, leader_comm);
            } else {
              mpi_errno = MPIR_Scatter_MV2_Direct(sendbuf, sendcnt * local_size, sendtype,
                                                  tmp_buf, nbytes * local_size, MPI_BYTE,
                                                  leader_root, leader_comm);
            }
          }
        }
        /* The leaders are now done with the inter-leader part. Scatter the data within the nodes */

        if (rank == root && recvbuf == MPI_IN_PLACE) {
            mpi_errno = MV2_Scatter_intra_function(tmp_buf, nbytes, MPI_BYTE,
                                                (void *)sendbuf, sendcnt, sendtype,
                                                0, shmem_comm);
        } else {
            mpi_errno = MV2_Scatter_intra_function(tmp_buf, nbytes, MPI_BYTE,
                                                recvbuf, recvcnt, recvtype,
                                                0, shmem_comm);
        }
    }

    /* check if multiple threads are calling this collective function */
    if (comm_size != local_size && local_rank == 0) {
        smpi_free_tmp_buffer(tmp_buf);
        if (leader_of_root == rank && root != rank) {
            smpi_free_tmp_buffer(leader_scatter_buf);
        }
    }
    return (mpi_errno);
}
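The non-uniform branch above derives the per-leader scatterv counts and displacements from the node sizes. A minimal, self-contained sketch of that bookkeeping, independent of the SMPI internals (node_sizes and nbytes below are made-up inputs):

#include <cstdio>
#include <vector>

int main() {
  // Hypothetical per-node process counts and per-process payload in bytes.
  std::vector<int> node_sizes = {4, 2, 3};
  int nbytes = 8;

  std::vector<int> sendcnts(node_sizes.size()), displs(node_sizes.size());
  sendcnts[0] = node_sizes[0] * nbytes; // leader 0 gets its whole node's share
  displs[0]   = 0;
  for (size_t i = 1; i < node_sizes.size(); i++) {
    displs[i]   = displs[i - 1] + node_sizes[i - 1] * nbytes; // bytes already handed out
    sendcnts[i] = node_sizes[i] * nbytes;
  }

  for (size_t i = 0; i < node_sizes.size(); i++)
    std::printf("leader %zu: count=%d displ=%d\n", i, sendcnts[i], displs[i]);
  return 0;
}

Each leader then receives sendcnts[i] bytes starting at displs[i] of the root leader's staging buffer, which is what the Colls::scatterv call above does.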
Example #26
0
int Coll_reduce_binomial::reduce(void *sendbuf, void *recvbuf, int count,
                                    MPI_Datatype datatype, MPI_Op op, int root,
                                    MPI_Comm comm)
{
  MPI_Status status;
  int comm_size, rank;
  int mask, relrank, source;
  int dst;
  int tag = COLL_TAG_REDUCE;
  MPI_Aint extent;
  void *tmp_buf;
  MPI_Aint true_lb, true_extent;
  if (count == 0)
    return 0;
  rank = comm->rank();
  comm_size = comm->size();

  extent = datatype->get_extent();

  tmp_buf = (void *) smpi_get_tmp_sendbuffer(count * extent);
  int is_commutative =  (op==MPI_OP_NULL || op->is_commutative());
  mask = 1;

  int lroot;
  if (is_commutative)
        lroot   = root;
  else
        lroot   = 0;
  relrank = (rank - lroot + comm_size) % comm_size;

  datatype->extent(&true_lb, &true_extent);

  /* adjust for potential negative lower bound in datatype */
  tmp_buf = (void *)((char*)tmp_buf - true_lb);

  /* If I'm not the root, then my recvbuf may not be valid, therefore
     I have to allocate a temporary one */
  if (rank != root) {
      recvbuf = (void*)smpi_get_tmp_recvbuffer(count * std::max(extent, true_extent));
      recvbuf = (void *)((char*)recvbuf - true_lb);
  }
   if ((rank != root) || (sendbuf != MPI_IN_PLACE)) {
      Datatype::copy(sendbuf, count, datatype, recvbuf,count, datatype);
  }
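  /* Binomial-tree combine: in the round where mask == 2^k, a process whose
   * k-th bit of relrank is 0 receives a partial result from relrank | mask and
   * folds it in; a process whose bit is 1 sends its partial result to
   * relrank & ~mask and leaves the loop. For non-commutative ops the tree is
   * rooted at rank 0 (lroot == 0) and the result is forwarded to root below. */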

  while (mask < comm_size) {
    /* Receive */
    if ((mask & relrank) == 0) {
      source = (relrank | mask);
      if (source < comm_size) {
        source = (source + lroot) % comm_size;
        Request::recv(tmp_buf, count, datatype, source, tag, comm, &status);

        if (is_commutative) {
          if(op!=MPI_OP_NULL) op->apply( tmp_buf, recvbuf, &count, datatype);
        } else {
          if(op!=MPI_OP_NULL) op->apply( recvbuf, tmp_buf, &count, datatype);
          Datatype::copy(tmp_buf, count, datatype,recvbuf, count, datatype);
        }
      }
    } else {
      dst = ((relrank & (~mask)) + lroot) % comm_size;
      Request::send(recvbuf, count, datatype, dst, tag, comm);
      break;
    }
    mask <<= 1;
  }

  if (not is_commutative && (root != 0)) {
    if (rank == 0){
      Request::send(recvbuf, count, datatype, root,tag, comm);
    }else if (rank == root){
      Request::recv(recvbuf, count, datatype, 0, tag, comm, &status);
    }
  }

  if (rank != root) {
    smpi_free_tmp_buffer(recvbuf);
  }
  smpi_free_tmp_buffer(tmp_buf);

  return 0;
}
int Coll_allgather_loosely_lr::allgather(const void *sbuf, int scount,
                                         MPI_Datatype stype, void *rbuf,
                                         int rcount, MPI_Datatype rtype,
                                         MPI_Comm comm)
{
  int comm_size, rank;
  int tag = COLL_TAG_ALLGATHER;
  int i, j, send_offset, recv_offset;
  int intra_rank, inter_rank, inter_comm_size, intra_comm_size;
  int inter_dst, inter_src;

  comm_size = comm->size();

  if (comm->get_leaders_comm() == MPI_COMM_NULL) {
    comm->init_smp();
  }
  int num_core=1;
  if (comm->is_uniform()){
    num_core = comm->get_intra_comm()->size();
  }

  if (comm_size % num_core)
    THROWF(arg_error, 0,
           "allgather loosely lr algorithm can't be used when the number of processes is not a multiple of NUM_CORE=%d!",
           num_core);

  rank = comm->rank();
  MPI_Aint rextent, sextent;
  rextent = rtype->get_extent();
  sextent = stype->get_extent();
  MPI_Request inter_rrequest;
  MPI_Request rrequest_array[128];
  MPI_Request srequest_array[128];
  MPI_Request inter_srequest_array[128];


  int rrequest_count = 0;
  int srequest_count = 0;
  int inter_srequest_count = 0;

  MPI_Status status;

  intra_rank = rank % num_core;
  inter_rank = rank / num_core;
  inter_comm_size = (comm_size + num_core - 1) / num_core;
  intra_comm_size = num_core;

  int src_seg, dst_seg;
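
  // Two-level logical ring: ranks are grouped into "nodes" of num_core
  // consecutive processes (intra_rank / inter_rank). Each outer iteration
  // shifts one segment along the inter-node ring (neighbours at +/- num_core)
  // while the cores of a node exchange among themselves the segments the node
  // already holds.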

  //copy corresponding message from sbuf to rbuf
  recv_offset = rank * rextent * rcount;
  Request::sendrecv(sbuf, scount, stype, rank, tag,
               (char *)rbuf + recv_offset, rcount, rtype, rank, tag, comm, &status);

  int dst, src;
  int inter_send_offset, inter_recv_offset;

  rrequest_count = 0;
  srequest_count = 0;
  inter_srequest_count = 0;

  for (i = 0; i < inter_comm_size; i++) {

    // inter_communication

    inter_dst = (rank + intra_comm_size) % comm_size;
    inter_src = (rank - intra_comm_size + comm_size) % comm_size;

    src_seg =
        ((inter_rank - 1 - i +
          inter_comm_size) % inter_comm_size) * intra_comm_size + intra_rank;
    dst_seg =
        ((inter_rank - i +
          inter_comm_size) % inter_comm_size) * intra_comm_size + intra_rank;

    inter_send_offset = dst_seg * sextent * scount;
    inter_recv_offset = src_seg * rextent * rcount;

    for (j = 0; j < intra_comm_size; j++) {

      // inter communication
      if (intra_rank == j) {
        if (i != inter_comm_size - 1) {

          inter_rrequest = Request::irecv((char*)rbuf + inter_recv_offset, rcount, rtype, inter_src, tag, comm);
          inter_srequest_array[inter_srequest_count++] =
              Request::isend((char*)rbuf + inter_send_offset, scount, stype, inter_dst, tag, comm);
        }
      }
      //intra_communication
      src = inter_rank * intra_comm_size + j;
      dst = inter_rank * intra_comm_size + j;

      src_seg =
          ((inter_rank - i +
            inter_comm_size) % inter_comm_size) * intra_comm_size + j;
      dst_seg =
          ((inter_rank - i +
            inter_comm_size) % inter_comm_size) * intra_comm_size + intra_rank;

      send_offset = dst_seg * sextent * scount;
      recv_offset = src_seg * rextent * rcount;


      if (j != intra_rank) {

        rrequest_array[rrequest_count++] = Request::irecv((char *)rbuf + recv_offset, rcount, rtype, src, tag, comm);
        srequest_array[srequest_count++] = Request::isend((char *)rbuf + send_offset, scount, stype, dst, tag, comm);

      }
    }                           // intra loop


    // wait for inter communication to finish for these rounds (# of round equals num_core)
    if (i != inter_comm_size - 1) {
      Request::wait(&inter_rrequest, &status);
    }

  }                             //inter loop

  Request::waitall(rrequest_count, rrequest_array, MPI_STATUSES_IGNORE);
  Request::waitall(srequest_count, srequest_array, MPI_STATUSES_IGNORE);
  Request::waitall(inter_srequest_count, inter_srequest_array, MPI_STATUSES_IGNORE);

  return MPI_SUCCESS;
}
int Coll_reduce_mvapich2::reduce(const void *sendbuf,
    void *recvbuf,
    int count,
    MPI_Datatype datatype,
    MPI_Op op, int root, MPI_Comm comm)
{
  if(mv2_reduce_thresholds_table == NULL)
    init_mv2_reduce_tables_stampede();

  int mpi_errno = MPI_SUCCESS;
  int range = 0;
  int range_threshold = 0;
  int range_intra_threshold = 0;
  int is_commutative, pof2;
  int comm_size = 0;
  long nbytes = 0;
  int sendtype_size;
  int is_two_level = 0;

  comm_size = comm->size();
  sendtype_size=datatype->size();
  nbytes = count * sendtype_size;

  if (count == 0)
    return MPI_SUCCESS;

  is_commutative = (op==MPI_OP_NULL || op->is_commutative());

  /* find nearest power-of-two less than or equal to comm_size */
  for( pof2 = 1; pof2 <= comm_size; pof2 <<= 1 );
  pof2 >>=1;


  /* Search for the corresponding system size inside the tuning table */
  while ((range < (mv2_size_reduce_tuning_table - 1)) &&
      (comm_size > mv2_reduce_thresholds_table[range].numproc)) {
      range++;
  }
  /* Search for corresponding inter-leader function */
  while ((range_threshold < (mv2_reduce_thresholds_table[range].size_inter_table - 1))
      && (nbytes >
  mv2_reduce_thresholds_table[range].inter_leader[range_threshold].max)
  && (mv2_reduce_thresholds_table[range].inter_leader[range_threshold].max !=
      -1)) {
      range_threshold++;
  }

  /* Search for corresponding intra node function */
  while ((range_intra_threshold < (mv2_reduce_thresholds_table[range].size_intra_table - 1))
      && (nbytes >
  mv2_reduce_thresholds_table[range].intra_node[range_intra_threshold].max)
  && (mv2_reduce_thresholds_table[range].intra_node[range_intra_threshold].max !=
      -1)) {
      range_intra_threshold++;
  }

  /* Set intra-node function pt for reduce_two_level */
  MV2_Reduce_intra_function =
      mv2_reduce_thresholds_table[range].intra_node[range_intra_threshold].
      MV2_pt_Reduce_function;
  /* Set inter-leader pt */
  MV2_Reduce_function =
      mv2_reduce_thresholds_table[range].inter_leader[range_threshold].
      MV2_pt_Reduce_function;

  if(mv2_reduce_intra_knomial_factor<0)
    {
      mv2_reduce_intra_knomial_factor = mv2_reduce_thresholds_table[range].intra_k_degree;
    }
  if(mv2_reduce_inter_knomial_factor<0)
    {
      mv2_reduce_inter_knomial_factor = mv2_reduce_thresholds_table[range].inter_k_degree;
    }
  if(mv2_reduce_thresholds_table[range].is_two_level_reduce[range_threshold] == 1){
      is_two_level = 1;
  }
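  /* Dispatch: the two-level (intra-node + inter-leader) scheme and the knomial
   * inter-leader function are only used for commutative operations; the
   * reduce-scatter/gather variant additionally needs count >= pof2. Every
   * other case falls back to the binomial reduce. */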
  /* We call Reduce function */
  if(is_two_level == 1)
    {
       if (is_commutative == 1) {
         if(comm->get_leaders_comm()==MPI_COMM_NULL){
           comm->init_smp();
         }
         mpi_errno = MPIR_Reduce_two_level_helper_MV2(sendbuf, recvbuf, count,
                                           datatype, op, root, comm);
        } else {
      mpi_errno = MPIR_Reduce_binomial_MV2(sendbuf, recvbuf, count,
          datatype, op, root, comm);
      }
    } else if(MV2_Reduce_function == &MPIR_Reduce_inter_knomial_wrapper_MV2 ){
        if(is_commutative ==1)
          {
            mpi_errno = MV2_Reduce_function(sendbuf, recvbuf, count,
                datatype, op, root, comm);
          } else {
              mpi_errno = MPIR_Reduce_binomial_MV2(sendbuf, recvbuf, count,
                  datatype, op, root, comm);
          }
    } else if(MV2_Reduce_function == &MPIR_Reduce_redscat_gather_MV2){
        if (/*(HANDLE_GET_KIND(op) == HANDLE_KIND_BUILTIN) &&*/ (count >= pof2))
          {
            mpi_errno = MV2_Reduce_function(sendbuf, recvbuf, count,
                datatype, op, root, comm);
          } else {
              mpi_errno = MPIR_Reduce_binomial_MV2(sendbuf, recvbuf, count,
                  datatype, op, root, comm);
          }
    } else {
        mpi_errno = MV2_Reduce_function(sendbuf, recvbuf, count,
            datatype, op, root, comm);
    }


  return mpi_errno;

}
int Coll_allgather_mvapich2_smp::allgather(void *sendbuf,int sendcnt, MPI_Datatype sendtype,
                            void *recvbuf, int recvcnt,MPI_Datatype recvtype,
                            MPI_Comm  comm)
{
    int rank, size;
    int local_rank, local_size;
    int leader_comm_size = 0;
    int mpi_errno = MPI_SUCCESS;
    MPI_Aint recvtype_extent = 0;  /* Datatype extent */
    MPI_Comm shmem_comm, leader_comm;

  if(comm->get_leaders_comm()==MPI_COMM_NULL){
    comm->init_smp();
  }

  if (not comm->is_uniform() || not comm->is_blocked())
    THROWF(arg_error,0, "allgather MVAPICH2 smp algorithm can't be used with irregular deployment. Please ensure that processes deployed on the same node are contiguous and that each node has the same number of processes");

    if (recvcnt == 0) {
        return MPI_SUCCESS;
    }

    rank = comm->rank();
    size = comm->size();

    /* extract the rank,size information for the intra-node communicator */
    recvtype_extent=recvtype->get_extent();

    shmem_comm = comm->get_intra_comm();
    local_rank = shmem_comm->rank();
    local_size = shmem_comm->size();

    if (local_rank == 0) {
        /* Node leader. Extract the rank, size information for the leader communicator */
        leader_comm = comm->get_leaders_comm();
        if(leader_comm==MPI_COMM_NULL){
          leader_comm = MPI_COMM_WORLD;
        }
        leader_comm_size = leader_comm->size();
    }
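
    /* Three phases: (1) each node gathers its contributions to the node leader
     * over shmem_comm; (2) the leaders exchange the per-node blocks (an
     * allgatherv when nodes have different sizes); (3) the assembled buffer is
     * broadcast from each leader to the other cores of its node. */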

    /*If there is just one node, after gather itself,
     * root has all the data and it can do bcast*/
    if(local_rank == 0) {
        mpi_errno = Colls::gather(sendbuf, sendcnt,sendtype,
                                    (void*)((char*)recvbuf + (rank * recvcnt * recvtype_extent)),
                                     recvcnt, recvtype,
                                     0, shmem_comm);
    } else {
        /*Since in allgather all the processes could have
         * its own data in place*/
        if(sendbuf == MPI_IN_PLACE) {
            mpi_errno = Colls::gather((void*)((char*)recvbuf + (rank * recvcnt * recvtype_extent)),
                                         recvcnt , recvtype,
                                         recvbuf, recvcnt, recvtype,
                                         0, shmem_comm);
        } else {
            mpi_errno = Colls::gather(sendbuf, sendcnt,sendtype,
                                         recvbuf, recvcnt, recvtype,
                                         0, shmem_comm);
        }
    }
    /* Exchange the data between the node leaders*/
    if (local_rank == 0 && (leader_comm_size > 1)) {
        /*When data in each socket is different*/
        if (comm->is_uniform() != 1) {

            int *displs = NULL;
            int *recvcnts = NULL;
            int *node_sizes = NULL;
            int i = 0;

            node_sizes = comm->get_non_uniform_map();

            displs =  static_cast<int *>(xbt_malloc(sizeof (int) * leader_comm_size));
            recvcnts =  static_cast<int *>(xbt_malloc(sizeof (int) * leader_comm_size));
            if (not displs || not recvcnts) {
              return MPI_ERR_OTHER;
            }
            recvcnts[0] = node_sizes[0] * recvcnt;
            displs[0] = 0;

            for (i = 1; i < leader_comm_size; i++) {
                displs[i] = displs[i - 1] + node_sizes[i - 1] * recvcnt;
                recvcnts[i] = node_sizes[i] * recvcnt;
            }


            void* sendbuf=((char*)recvbuf)+recvtype->get_extent()*displs[leader_comm->rank()];

            mpi_errno = Colls::allgatherv(sendbuf,
                                       (recvcnt*local_size),
                                       recvtype,
                                       recvbuf, recvcnts,
                                       displs, recvtype,
                                       leader_comm);
            xbt_free(displs);
            xbt_free(recvcnts);
        } else {
        void* sendtmpbuf=((char*)recvbuf)+recvtype->get_extent()*(recvcnt*local_size)*leader_comm->rank();



            mpi_errno = Coll_allgather_mpich::allgather(sendtmpbuf,
                                               (recvcnt*local_size),
                                               recvtype,
                                               recvbuf, (recvcnt*local_size), recvtype,
                                             leader_comm);

        }
    }

    /*Bcast the entire data from node leaders to all other cores*/
    mpi_errno = Colls::bcast (recvbuf, recvcnt * size, recvtype, 0, shmem_comm);
    return mpi_errno;
}
int Coll_reduce_scatter_mvapich2::reduce_scatter(const void *sendbuf, void *recvbuf, const int *recvcnts,
    MPI_Datatype datatype, MPI_Op op,
    MPI_Comm comm)
{
  int mpi_errno = MPI_SUCCESS;
  int i = 0, comm_size = comm->size(), total_count = 0, type_size = 0, nbytes = 0;
  int is_commutative = 0;
  int* disps          = new int[comm_size];

  if(mv2_red_scat_thresholds_table==NULL)
    init_mv2_reduce_scatter_tables_stampede();

  is_commutative=(op==MPI_OP_NULL || op->is_commutative());
  for (i = 0; i < comm_size; i++) {
      disps[i] = total_count;
      total_count += recvcnts[i];
  }

  type_size=datatype->size();
  nbytes = total_count * type_size;
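
  /* Commutative operations go through the tuned inter-leader function picked
   * from the threshold table below; non-commutative operations use the
   * dedicated non-commutative kernel when the communicator size is a power of
   * two and all blocks are equal, and otherwise the MPICH recursive-doubling
   * fallback. */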

  if (is_commutative) {
    int range           = 0;
    int range_threshold = 0;

      /* Search for the corresponding system size inside the tuning table */
      while ((range < (mv2_size_red_scat_tuning_table - 1)) &&
          (comm_size > mv2_red_scat_thresholds_table[range].numproc)) {
          range++;
      }
      /* Search for corresponding inter-leader function */
      while ((range_threshold < (mv2_red_scat_thresholds_table[range].size_inter_table - 1))
          && (nbytes >
      mv2_red_scat_thresholds_table[range].inter_leader[range_threshold].max)
      && (mv2_red_scat_thresholds_table[range].inter_leader[range_threshold].max !=
          -1)) {
          range_threshold++;
      }

      /* Set inter-leader pt */
      MV2_Red_scat_function =
          mv2_red_scat_thresholds_table[range].inter_leader[range_threshold].
          MV2_pt_Red_scat_function;

      mpi_errno = MV2_Red_scat_function(sendbuf, recvbuf,
          recvcnts, datatype,
          op, comm);
  } else {
      int is_block_regular = 1;
      for (i = 0; i < (comm_size - 1); ++i) {
          if (recvcnts[i] != recvcnts[i+1]) {
              is_block_regular = 0;
              break;
          }
      }
      int pof2 = 1;
      while (pof2 < comm_size) pof2 <<= 1;
      if (pof2 == comm_size && is_block_regular) {
          /* noncommutative, pof2 size, and block regular */
          mpi_errno = MPIR_Reduce_scatter_non_comm_MV2(sendbuf, recvbuf,
              recvcnts, datatype,
              op, comm);
      } else {
          /* noncommutative with irregular blocks or a non-power-of-two size:
           * fall back to the MPICH recursive-doubling variant */
          mpi_errno = Coll_reduce_scatter_mpich_rdb::reduce_scatter(sendbuf, recvbuf,
              recvcnts, datatype,
              op, comm);
      }
  }
  delete[] disps;
  return mpi_errno;

}
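
All of the MVAPICH2 selectors above share the same lookup pattern: walk the table rows until the communicator size fits, then walk that row's message-size brackets until nbytes fits, with max == -1 meaning "no upper bound". Below is a minimal sketch of that pattern with a made-up table; the struct names, algorithm names and thresholds are illustrative, not the real Stampede tuning data.

#include <cstdio>

struct threshold  { long max; const char* algo; };               // one message-size bracket
struct tuning_row { int numproc; int n; threshold inter[3]; };   // one communicator-size row

static const tuning_row table[] = {
    {16,  2, {{8192, "binomial"}, {-1, "scatter_doubling_allgather"}, {0, nullptr}}},
    {256, 3, {{4096, "binomial"}, {524288, "pipelined"}, {-1, "knomial"}}},
};
static const int table_size = sizeof(table) / sizeof(table[0]);

// Pick the algorithm for a given communicator size and message size in bytes.
static const char* select_algo(int comm_size, long nbytes) {
  int range = 0;
  while (range < table_size - 1 && comm_size > table[range].numproc)
    range++;
  int t = 0;
  while (t < table[range].n - 1 && nbytes > table[range].inter[t].max &&
         table[range].inter[t].max != -1)
    t++;
  return table[range].inter[t].algo;
}

int main() {
  std::printf("%s\n", select_algo(64, 2048));     // -> binomial
  std::printf("%s\n", select_algo(64, 1000000));  // -> knomial
  return 0;
}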