Example #1
int Coll_barrier_ompi_bruck::barrier(MPI_Comm comm)
{
    int rank, size;
    int distance, to, from;

    rank = comm->rank();
    size = comm->size();
    XBT_DEBUG(
                 "ompi_coll_tuned_barrier_ompi_bruck rank %d", rank);

    /* exchange data with rank-2^k and rank+2^k */
    for (distance = 1; distance < size; distance <<= 1) {
        from = (rank + size - distance) % size;
        to   = (rank + distance) % size;

        /* send message to lower ranked node */
        Request::sendrecv(NULL, 0, MPI_BYTE, to,
                                              COLL_TAG_BARRIER,
                                              NULL, 0, MPI_BYTE, from,
                                              COLL_TAG_BARRIER,
                                              comm, MPI_STATUS_IGNORE);
    }

    return MPI_SUCCESS;

}
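The loop above is a dissemination pattern: in round k each rank posts a zero-byte sendrecv with the ranks 2^k positions away, so after ceil(log2(size)) rounds every rank has transitively heard from every other rank and may leave the barrier. The following minimal sketch (plain C++ only, no SMPI classes; the communicator size of 6 is an arbitrary assumption) prints the per-round peers to make the pattern visible.

// Sketch (plain C++, no SMPI): print the (from, to) peers that rank `r`
// would use in each round of the dissemination barrier above.
#include <cstdio>

int main() {
  const int size = 6; // hypothetical communicator size
  for (int r = 0; r < size; ++r) {
    std::printf("rank %d:", r);
    for (int distance = 1; distance < size; distance <<= 1) {
      int from = (r + size - distance) % size;
      int to   = (r + distance) % size;
      std::printf("  [d=%d recv<-%d send->%d]", distance, from, to);
    }
    std::printf("\n");
  }
  return 0;
}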
Example #2
int Coll_alltoallv_pair::alltoallv(void *send_buff, int *send_counts, int *send_disps,
                                  MPI_Datatype send_type,
                                  void *recv_buff, int *recv_counts, int *recv_disps,
                                  MPI_Datatype recv_type, MPI_Comm comm)
{

  MPI_Aint send_chunk, recv_chunk;
  MPI_Status s;
  int i, src, dst, rank, num_procs;
  int tag = COLL_TAG_ALLTOALLV;
  char *send_ptr = (char *) send_buff;
  char *recv_ptr = (char *) recv_buff;

  rank = comm->rank();
  num_procs = comm->size();

  if((num_procs&(num_procs-1)))
    THROWF(arg_error,0, "alltoallv pair algorithm can't be used with non power of two number of processes ! ");

  send_chunk = send_type->get_extent();
  recv_chunk = recv_type->get_extent();

  for (i = 0; i < num_procs; i++) {
    src = dst = rank ^ i;
    Request::sendrecv(send_ptr + send_disps[dst] * send_chunk, send_counts[dst], send_type, dst, tag,
                      recv_ptr + recv_disps[src] * recv_chunk, recv_counts[src], recv_type, src, tag, comm, &s);
  }
  return MPI_SUCCESS;
}
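The guard `num_procs & (num_procs - 1)` relies on a standard bit trick: clearing the lowest set bit of a power of two yields zero, so the expression is non-zero exactly for communicator sizes that are not powers of two. A self-contained sketch of the same test, independent of the SMPI code above:

// Sketch of the power-of-two test used by the pair algorithm above.
#include <cassert>

static bool is_power_of_two(int n) {
  return n > 0 && (n & (n - 1)) == 0; // clearing the lowest set bit leaves 0
}

int main() {
  assert(is_power_of_two(1) && is_power_of_two(8) && is_power_of_two(64));
  assert(!is_power_of_two(6) && !is_power_of_two(12));
  return 0;
}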
Example #3
int PMPI_Win_allocate_shared( MPI_Aint size, int disp_unit, MPI_Info info, MPI_Comm comm, void *base, MPI_Win *win){
  int retval = 0;
  smpi_bench_end();
  if (comm == MPI_COMM_NULL) {
    retval= MPI_ERR_COMM;
  }else if (disp_unit <= 0 || size < 0 ){
    retval= MPI_ERR_OTHER;
  }else{
    void* ptr = nullptr;
    int rank = comm->rank();
    if(rank==0){
       ptr = xbt_malloc(size*comm->size());
       if(ptr==nullptr)
         return MPI_ERR_NO_MEM;
    }
    
    simgrid::smpi::Colls::bcast(&ptr, sizeof(void*), MPI_BYTE, 0, comm);
    simgrid::smpi::Colls::barrier(comm);
    
    *static_cast<void**>(base) = (char*)ptr+rank*size;
    *win = new simgrid::smpi::Win( ptr, size, disp_unit, info, comm,rank==0);
    retval = MPI_SUCCESS;
  }
  smpi_bench_begin();
  return retval;
}
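Since SMPI runs every rank of the simulated application inside one address space, rank 0 can allocate the whole segment and broadcast the raw pointer; each rank then derives its own base as ptr + rank * size. From user code this is reached through the standard MPI_Win_allocate_shared call. Below is a hedged usage sketch against the plain MPI C API, not the SMPI internals above; on a real MPI you would normally call it on a communicator obtained via MPI_Comm_split_type(..., MPI_COMM_TYPE_SHARED, ...), and the neighbour access assumes the default contiguous layout.

// Usage sketch: each rank allocates `n` doubles of shared memory and reads
// its left neighbour's slice directly (contiguous allocation assumed).
#include <mpi.h>
#include <cstdio>

int main(int argc, char** argv) {
  MPI_Init(&argc, &argv);
  int rank, size;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &size);

  const int n = 16;
  double* base = nullptr;
  MPI_Win win;
  MPI_Win_allocate_shared(n * sizeof(double), sizeof(double), MPI_INFO_NULL,
                          MPI_COMM_WORLD, &base, &win);
  for (int i = 0; i < n; ++i)
    base[i] = rank;        // fill my own slice
  MPI_Win_fence(0, win);   // make all slices visible before reading

  if (rank > 0) {
    // With a contiguous layout, the left neighbour's slice starts n doubles lower.
    std::printf("rank %d sees left neighbour value %f\n", rank, *(base - n));
  }
  MPI_Win_free(&win);
  MPI_Finalize();
  return 0;
}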
Example #4
int Coll_alltoallv_ring::alltoallv(const void* send_buff, const int* send_counts, const int* send_disps, MPI_Datatype send_type,
                                   void* recv_buff, const int* recv_counts, const int* recv_disps, MPI_Datatype recv_type,
                                   MPI_Comm comm)
{
  MPI_Status s;
  MPI_Aint send_chunk, recv_chunk;
  int i, src, dst, rank, num_procs;
  int tag = COLL_TAG_ALLTOALLV;

  char *send_ptr = (char *) send_buff;
  char *recv_ptr = (char *) recv_buff;

  rank = comm->rank();
  num_procs = comm->size();
  send_chunk = send_type->get_extent();
  recv_chunk = recv_type->get_extent();
  int pof2 = ((num_procs != 0) && ((num_procs & (~num_procs + 1)) == num_procs));
  for (i = 0; i < num_procs; i++) {

    if (pof2 == 1) {
      /* use exclusive-or algorithm */
      src = dst = rank ^ i;
    } else {
      src = (rank - i + num_procs) % num_procs;
      dst = (rank + i) % num_procs;
    }

    Request::sendrecv(send_ptr + send_disps[dst] * send_chunk, send_counts[dst], send_type, dst,
                 tag, recv_ptr + recv_disps[src] * recv_chunk, recv_counts[src], recv_type,
                 src, tag, comm, &s);

  }
  return MPI_SUCCESS;
}
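For a power-of-two communicator, `src = dst = rank ^ i` produces a perfect matching in every round: the relation is symmetric (peer ^ i == rank), so each rank exchanges exactly one block per round and meets every other rank exactly once over the num_procs rounds (round 0 being the self-copy). A standalone sketch, with num_procs = 8 chosen arbitrarily, that prints the matching per round:

// Sketch: the XOR pairing used by the pair/ring alltoallv algorithms above.
#include <cstdio>

int main() {
  const int num_procs = 8; // must be a power of two for the XOR pairing
  for (int i = 1; i < num_procs; ++i) {   // round 0 is the local self-copy
    std::printf("round %d:", i);
    for (int rank = 0; rank < num_procs; ++rank) {
      int peer = rank ^ i;                // symmetric: peer ^ i == rank
      if (rank < peer)
        std::printf("  (%d,%d)", rank, peer);
    }
    std::printf("\n");
  }
  return 0;
}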
Example #5
int PMPI_Sendrecv(const void* sendbuf, int sendcount, MPI_Datatype sendtype, int dst, int sendtag, void* recvbuf,
                  int recvcount, MPI_Datatype recvtype, int src, int recvtag, MPI_Comm comm, MPI_Status* status)
{
  int retval = 0;

  smpi_bench_end();

  if (comm == MPI_COMM_NULL) {
    retval = MPI_ERR_COMM;
  } else if (not sendtype->is_valid() || not recvtype->is_valid()) {
    retval = MPI_ERR_TYPE;
  } else if (src == MPI_PROC_NULL) {
    if(status!=MPI_STATUS_IGNORE){
      simgrid::smpi::Status::empty(status);
      status->MPI_SOURCE = MPI_PROC_NULL;
    }
    if(dst != MPI_PROC_NULL)
      simgrid::smpi::Request::send(sendbuf, sendcount, sendtype, dst, sendtag, comm);
    retval = MPI_SUCCESS;
  }else if (dst == MPI_PROC_NULL){
    simgrid::smpi::Request::recv(recvbuf, recvcount, recvtype, src, recvtag, comm, status);
    retval = MPI_SUCCESS;
  }else if (dst >= comm->group()->size() || dst <0 ||
      (src!=MPI_ANY_SOURCE && (src >= comm->group()->size() || src <0))){
    retval = MPI_ERR_RANK;
  } else if ((sendcount < 0 || recvcount<0) ||
      (sendbuf==nullptr && sendcount > 0) || (recvbuf==nullptr && recvcount>0)) {
    retval = MPI_ERR_COUNT;
  } else if((sendtag<0 && sendtag !=  MPI_ANY_TAG)||(recvtag<0 && recvtag != MPI_ANY_TAG)){
    retval = MPI_ERR_TAG;
  } else {
    int my_proc_id         = simgrid::s4u::this_actor::get_pid();
    int dst_traced         = getPid(comm, dst);
    int src_traced         = getPid(comm, src);

    // FIXME: Hack the way to trace this one
    std::vector<int>* dst_hack = new std::vector<int>;
    std::vector<int>* src_hack = new std::vector<int>;
    dst_hack->push_back(dst_traced);
    src_hack->push_back(src_traced);
    TRACE_smpi_comm_in(my_proc_id, __func__,
                       new simgrid::instr::VarCollTIData(
                           "sendRecv", -1, sendtype->is_replayable() ? sendcount : sendcount * sendtype->size(),
                           dst_hack, recvtype->is_replayable() ? recvcount : recvcount * recvtype->size(), src_hack,
                           simgrid::smpi::Datatype::encode(sendtype), simgrid::smpi::Datatype::encode(recvtype)));

    TRACE_smpi_send(my_proc_id, my_proc_id, dst_traced, sendtag, sendcount * sendtype->size());

    simgrid::smpi::Request::sendrecv(sendbuf, sendcount, sendtype, dst, sendtag, recvbuf, recvcount, recvtype, src,
                                     recvtag, comm, status);
    retval = MPI_SUCCESS;

    TRACE_smpi_recv(src_traced, my_proc_id, recvtag);
    TRACE_smpi_comm_out(my_proc_id);
  }

  smpi_bench_begin();
  return retval;
}
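On the application side, MPI_Sendrecv is the usual tool for deadlock-free neighbour exchanges such as a ring shift. The sketch below uses only the standard MPI C API and is independent of the tracing machinery shown above; the tags and payload are arbitrary.

// Usage sketch: ring shift with MPI_Sendrecv (each rank sends its value to
// the right and receives the left neighbour's value in one call).
#include <mpi.h>
#include <cstdio>

int main(int argc, char** argv) {
  MPI_Init(&argc, &argv);
  int rank, size;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &size);

  int right = (rank + 1) % size;
  int left  = (rank + size - 1) % size;
  int sendval = rank, recvval = -1;
  MPI_Sendrecv(&sendval, 1, MPI_INT, right, /*sendtag=*/0,
               &recvval, 1, MPI_INT, left,  /*recvtag=*/0,
               MPI_COMM_WORLD, MPI_STATUS_IGNORE);
  std::printf("rank %d received %d from rank %d\n", rank, recvval, left);
  MPI_Finalize();
  return 0;
}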
Example #6
int Coll_gather_ompi::gather(const void *sbuf, int scount,
                                           MPI_Datatype sdtype,
                                           void* rbuf, int rcount,
                                           MPI_Datatype rdtype,
                                           int root,
                                           MPI_Comm  comm
                                           )
{
    //const int large_segment_size = 32768;
    //const int small_segment_size = 1024;

    //const size_t large_block_size = 92160;
    const size_t intermediate_block_size = 6000;
    const size_t small_block_size = 1024;

    const int large_communicator_size = 60;
    const int small_communicator_size = 10;

    int communicator_size, rank;
    size_t dsize, block_size;

    XBT_DEBUG("smpi_coll_tuned_gather_ompi");

    communicator_size = comm->size();
    rank = comm->rank();

    // Determine block size
    if (rank == root) {
        dsize = rdtype->size();
        block_size = dsize * rcount;
    } else {
        dsize = sdtype->size();
        block_size = dsize * scount;
    }

/*    if (block_size > large_block_size) {*/
/*        return smpi_coll_tuned_gather_ompi_linear_sync (sbuf, scount, sdtype, */
/*                                                         rbuf, rcount, rdtype, */
/*                                                         root, comm);*/

/*    } else*/ if (block_size > intermediate_block_size) {
        return Coll_gather_ompi_linear_sync::gather (sbuf, scount, sdtype,
                                                         rbuf, rcount, rdtype,
                                                         root, comm);

    } else if ((communicator_size > large_communicator_size) ||
               ((communicator_size > small_communicator_size) &&
                (block_size < small_block_size))) {
        return Coll_gather_ompi_binomial::gather (sbuf, scount, sdtype,
                                                      rbuf, rcount, rdtype,
                                                      root, comm);

    }
    // Otherwise, use basic linear
    return Coll_gather_ompi_basic_linear::gather (sbuf, scount, sdtype,
                                                      rbuf, rcount, rdtype,
                                                      root, comm);
}
Example #7
int
Coll_reduce_flat_tree::reduce(const void *sbuf, void *rbuf, int count,
                                 MPI_Datatype dtype, MPI_Op op,
                                 int root, MPI_Comm comm)
{
  int i, tag = COLL_TAG_REDUCE;
  int size;
  int rank;
  MPI_Aint extent;
  unsigned char* origin = nullptr;
  const unsigned char* inbuf;
  MPI_Status status;

  rank = comm->rank();
  size = comm->size();

  /* If not root, send data to the root. */
  extent = dtype->get_extent();

  if (rank != root) {
    Request::send(sbuf, count, dtype, root, tag, comm);
    return 0;
  }

  /* Root receives and reduces messages.  Allocate buffer to receive
     messages. */

  if (size > 1)
    origin = smpi_get_tmp_recvbuffer(count * extent);

  /* Initialize the receive buffer. */
  if (rank == (size - 1))
    Request::sendrecv(sbuf, count, dtype, rank, tag,
                 rbuf, count, dtype, rank, tag, comm, &status);
  else
    Request::recv(rbuf, count, dtype, size - 1, tag, comm, &status);

  /* Loop receiving and calling reduction function (C or Fortran). */

  for (i = size - 2; i >= 0; --i) {
    if (rank == i)
      inbuf = static_cast<const unsigned char*>(sbuf);
    else {
      Request::recv(origin, count, dtype, i, tag, comm, &status);
      inbuf = origin;
    }

    /* Call reduction function. */
    if(op!=MPI_OP_NULL) op->apply( inbuf, rbuf, &count, dtype);

  }

  smpi_free_tmp_buffer(origin);

  /* All done */
  return 0;
}
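Because op->apply(inbuf, rbuf, ...) folds as rbuf = inbuf op rbuf, receiving first from rank size-1 and then looping down to rank 0 reproduces the rank-ordered combination d0 op (d1 op (... op d_{size-1})), which is what a non-commutative MPI_Op requires. A toy sketch (plain C++, with string concatenation standing in for the operator) that shows the resulting order:

// Sketch: the combination order of the flat-tree reduce above, using string
// concatenation as a stand-in for a non-commutative MPI_Op.
#include <cstdio>
#include <string>

int main() {
  const int size = 4;
  std::string data[size] = {"d0", "d1", "d2", "d3"};
  std::string rbuf = data[size - 1];   // receive from rank size-1 first
  for (int i = size - 2; i >= 0; --i)
    rbuf = data[i] + "*" + rbuf;       // op->apply(inbuf, rbuf): rbuf = inbuf op rbuf
  std::printf("%s\n", rbuf.c_str());   // prints d0*d1*d2*d3: rank order preserved
  return 0;
}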
Example #8
int
Coll_bcast_flattree_pipeline::bcast(void *buff, int count,
                                        MPI_Datatype data_type, int root,
                                        MPI_Comm comm)
{
  int i, j, rank, num_procs;
  int tag = COLL_TAG_BCAST;

  MPI_Aint extent;
  extent = data_type->get_extent();

  int segment = flattree_segment_in_byte / extent;
  segment =  segment == 0 ? 1 :segment;
  int pipe_length = count / segment;
  int increment = segment * extent;
  if (pipe_length==0) {
    XBT_WARN("MPI_bcast_flattree_pipeline use default MPI_bcast_flattree.");
    return Coll_bcast_flattree::bcast(buff, count, data_type, root, comm);
  }
  rank = comm->rank();
  num_procs = comm->size();

  MPI_Request *request_array;
  MPI_Status *status_array;

  request_array = (MPI_Request *) xbt_malloc(pipe_length * sizeof(MPI_Request));
  status_array = (MPI_Status *) xbt_malloc(pipe_length * sizeof(MPI_Status));

  if (rank != root) {
    for (i = 0; i < pipe_length; i++) {
      request_array[i] = Request::irecv((char *)buff + (i * increment), segment, data_type, root, tag, comm);
    }
    Request::waitall(pipe_length, request_array, status_array);
  }

  else {
    // Root sends data to all others
    for (j = 0; j < num_procs; j++) {
      if (j == rank)
        continue;
      else {
        for (i = 0; i < pipe_length; i++) {
          Request::send((char *)buff + (i * increment), segment, data_type, j, tag, comm);
        }
      }
    }

  }

  free(request_array);
  free(status_array);
  return MPI_SUCCESS;
}
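The pipeline parameters come from integer arithmetic: segment is the byte threshold divided by the datatype extent, pipe_length = count / segment full chunks are streamed, and the loops above move pipe_length * segment elements in strides of increment bytes. A quick standalone computation, with assumed values for the flattree_segment_in_byte tuning knob and the extent:

// Sketch: how the pipeline parameters are derived, with assumed values for
// flattree_segment_in_byte (a SimGrid tuning knob) and the datatype extent.
#include <cstdio>

int main() {
  const int flattree_segment_in_byte = 8192; // assumed tuning value
  const int extent = 8;                      // e.g. MPI_DOUBLE
  const int count  = 10000;

  int segment = flattree_segment_in_byte / extent; // elements per chunk
  segment = segment == 0 ? 1 : segment;
  int pipe_length = count / segment;               // full chunks streamed
  int remainder   = count % segment;               // elements beyond the full chunks
  int increment   = segment * extent;              // byte stride between chunks

  std::printf("segment=%d pipe_length=%d remainder=%d increment=%d\n",
              segment, pipe_length, remainder, increment);
  return 0;
}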
Example #9
/*
 *  gather_intra
 *
 *  Function:  - basic gather operation
 *  Accepts:  - same arguments as MPI_Gather()
 *  Returns:  - MPI_SUCCESS or error code
 */
int Coll_gather_ompi_basic_linear::gather(void* sbuf, int scount, MPI_Datatype sdtype, void* rbuf, int rcount,
                                          MPI_Datatype rdtype, int root, MPI_Comm comm)
{
    int i;
    int err;
    int rank;
    int size;
    char *ptmp;
    MPI_Aint incr;
    MPI_Aint extent;
    MPI_Aint lb;

    size = comm->size();
    rank = comm->rank();

    /* Everyone but root sends data and returns. */
    XBT_DEBUG("ompi_coll_tuned_gather_intra_basic_linear rank %d", rank);

    if (rank != root) {
        Request::send(sbuf, scount, sdtype, root,
                                 COLL_TAG_GATHER,
                                  comm);
        return MPI_SUCCESS;
    }

    /* I am the root, loop receiving the data. */

    rdtype->extent(&lb, &extent);
    incr = extent * rcount;
    for (i = 0, ptmp = (char *) rbuf; i < size; ++i, ptmp += incr) {
        if (i == rank) {
            if (MPI_IN_PLACE != sbuf) {
                err = Datatype::copy(sbuf, scount, sdtype,
                                      ptmp, rcount, rdtype);
            } else {
                err = MPI_SUCCESS;
            }
        } else {
            Request::recv(ptmp, rcount, rdtype, i,
                                    COLL_TAG_GATHER,
                                    comm, MPI_STATUS_IGNORE);
            err = MPI_SUCCESS;
        }
        if (MPI_SUCCESS != err) {
            return err;
        }
    }

    /* All done */

    return MPI_SUCCESS;
}
Example #10
int Coll_alltoall_basic_linear::alltoall(void *sendbuf, int sendcount, MPI_Datatype sendtype,
                                          void *recvbuf, int recvcount, MPI_Datatype recvtype, MPI_Comm comm)
{
  int system_tag = 888;
  int i;
  int count;
  MPI_Aint lb = 0, sendext = 0, recvext = 0;
  MPI_Request *requests;

  /* Initialize. */
  int rank = comm->rank();
  int size = comm->size();
  XBT_DEBUG("<%d> algorithm alltoall_basic_linear() called.", rank);
  sendtype->extent(&lb, &sendext);
  recvtype->extent(&lb, &recvext);
  /* simple optimization */
  int err = Datatype::copy(static_cast<char *>(sendbuf) + rank * sendcount * sendext, sendcount, sendtype,
                               static_cast<char *>(recvbuf) + rank * recvcount * recvext, recvcount, recvtype);
  if (err == MPI_SUCCESS && size > 1) {
    /* Initiate all send/recv to/from others. */
    requests = xbt_new(MPI_Request, 2 * (size - 1));
    /* Post all receives first -- a simple optimization */
    count = 0;
    for (i = (rank + 1) % size; i != rank; i = (i + 1) % size) {
      requests[count] = Request::irecv_init(static_cast<char *>(recvbuf) + i * recvcount * recvext, recvcount,
                                        recvtype, i, system_tag, comm);
      count++;
    }
    /* Now post all sends in reverse order
     *   - We would like to minimize the search time through message queue
     *     when messages actually arrive in the order in which they were posted.
     * TODO: check the previous assertion
     */
    for (i = (rank + size - 1) % size; i != rank; i = (i + size - 1) % size) {
      requests[count] = Request::isend_init(static_cast<char *>(sendbuf) + i * sendcount * sendext, sendcount,
                                        sendtype, i, system_tag, comm);
      count++;
    }
    /* Wait for them all. */
    Request::startall(count, requests);
    XBT_DEBUG("<%d> wait for %d requests", rank, count);
    Request::waitall(count, requests, MPI_STATUS_IGNORE);
    for(i = 0; i < count; i++) {
      if(requests[i]!=MPI_REQUEST_NULL)
        Request::unref(&requests[i]);
    }
    xbt_free(requests);
  }
  return err;
}
Example #11
int PMPI_Ssend(const void* buf, int count, MPI_Datatype datatype, int dst, int tag, MPI_Comm comm) {
  int retval = 0;

  smpi_bench_end();

  if (comm == MPI_COMM_NULL) {
    retval = MPI_ERR_COMM;
  } else if (dst == MPI_PROC_NULL) {
    retval = MPI_SUCCESS;
  } else if (dst >= comm->group()->size() || dst <0){
    retval = MPI_ERR_RANK;
  } else if ((count < 0) || (buf==nullptr && count > 0)) {
    retval = MPI_ERR_COUNT;
  } else if (datatype==MPI_DATATYPE_NULL || not datatype->is_valid()) {
    retval = MPI_ERR_TYPE;
  } else if(tag<0 && tag !=  MPI_ANY_TAG){
    retval = MPI_ERR_TAG;
  } else {
    int my_proc_id         = simgrid::s4u::this_actor::get_pid();
    int dst_traced         = getPid(comm, dst);
    TRACE_smpi_comm_in(my_proc_id, __func__,
                       new simgrid::instr::Pt2PtTIData("Ssend", dst,
                                                       datatype->is_replayable() ? count : count * datatype->size(),
                                                       tag, simgrid::smpi::Datatype::encode(datatype)));
    TRACE_smpi_send(my_proc_id, my_proc_id, dst_traced, tag, count * datatype->size());

    simgrid::smpi::Request::ssend(buf, count, datatype, dst, tag, comm);
    retval = MPI_SUCCESS;

    TRACE_smpi_comm_out(my_proc_id);
  }

  smpi_bench_begin();
  return retval;
}
Example #12
/*
 * Another recursive doubling type algorithm, but in this case
 * we go up the tree and back down the tree.
 */
int Coll_barrier_ompi_tree::barrier(MPI_Comm comm)
{
    int rank, size, depth;
    int jump, partner;

    rank = comm->rank();
    size = comm->size();
    XBT_DEBUG(
                 "ompi_coll_tuned_barrier_ompi_tree %d",
                 rank);

    /* Find the nearest power of 2 of the communicator size. */
    for(depth = 1; depth < size; depth <<= 1 );

    for (jump=1; jump<depth; jump<<=1) {
        partner = rank ^ jump;
        if (!(partner & (jump-1)) && partner < size) {
            if (partner > rank) {
                Request::recv (NULL, 0, MPI_BYTE, partner,
                                         COLL_TAG_BARRIER, comm,
                                         MPI_STATUS_IGNORE);
            } else if (partner < rank) {
                Request::send (NULL, 0, MPI_BYTE, partner,
                                         COLL_TAG_BARRIER,
                                          comm);
            }
        }
    }

    depth>>=1;
    for (jump = depth; jump>0; jump>>=1) {
        partner = rank ^ jump;
        if (!(partner & (jump-1)) && partner < size) {
            if (partner > rank) {
                Request::send (NULL, 0, MPI_BYTE, partner,
                                         COLL_TAG_BARRIER,
                                          comm);
            } else if (partner < rank) {
                Request::recv (NULL, 0, MPI_BYTE, partner,
                                         COLL_TAG_BARRIER, comm,
                                         MPI_STATUS_IGNORE);
            }
        }
    }

    return MPI_SUCCESS;
}
Example #13
int Coll_barrier_ompi_basic_linear::barrier(MPI_Comm comm)
{
    int i;
    int size = comm->size();
    int rank = comm->rank();

    /* All non-root send & receive zero-length message. */

    if (rank > 0) {
        Request::send (NULL, 0, MPI_BYTE, 0,
                                 COLL_TAG_BARRIER,
                                  comm);

        Request::recv (NULL, 0, MPI_BYTE, 0,
                                 COLL_TAG_BARRIER,
                                 comm, MPI_STATUS_IGNORE);
    }

    /* The root collects and broadcasts the messages. */

    else {
        MPI_Request* requests;

        requests = new MPI_Request[size];
        for (i = 1; i < size; ++i) {
            requests[i] = Request::irecv(NULL, 0, MPI_BYTE, MPI_ANY_SOURCE,
                                     COLL_TAG_BARRIER, comm
                                     );
        }
        Request::waitall( size-1, requests+1, MPI_STATUSES_IGNORE );

        for (i = 1; i < size; ++i) {
            requests[i] = Request::isend(NULL, 0, MPI_BYTE, i,
                                     COLL_TAG_BARRIER,
                                      comm
                                     );
        }
        Request::waitall( size-1, requests+1, MPI_STATUSES_IGNORE );
        delete[] requests;
    }

    /* All done */

    return MPI_SUCCESS;

}
Example #14
int Coll_reduce_scatter_ompi::reduce_scatter(const void *sbuf, void *rbuf,
                                                    const int *rcounts,
                                                    MPI_Datatype dtype,
                                                    MPI_Op  op,
                                                    MPI_Comm  comm
                                                    )
{
    int comm_size, i, pow2;
    size_t total_message_size, dsize;
    const double a = 0.0012;
    const double b = 8.0;
    const size_t small_message_size = 12 * 1024;
    const size_t large_message_size = 256 * 1024;
    int zerocounts = 0;

    XBT_DEBUG("Coll_reduce_scatter_ompi::reduce_scatter");

    comm_size = comm->size();
    // We need data size for decision function
    dsize=dtype->size();
    total_message_size = 0;
    for (i = 0; i < comm_size; i++) {
        total_message_size += rcounts[i];
        if (0 == rcounts[i]) {
            zerocounts = 1;
        }
    }

    if (((op != MPI_OP_NULL) && not op->is_commutative()) || (zerocounts)) {
      Coll_reduce_scatter_default::reduce_scatter(sbuf, rbuf, rcounts, dtype, op, comm);
      return MPI_SUCCESS;
    }

    total_message_size *= dsize;

    // compute the nearest power of 2
    for (pow2 = 1; pow2 < comm_size; pow2 <<= 1);

    if ((total_message_size <= small_message_size) ||
        ((total_message_size <= large_message_size) && (pow2 == comm_size)) ||
        (comm_size >= a * total_message_size + b)) {
        return
            Coll_reduce_scatter_ompi_basic_recursivehalving::reduce_scatter(sbuf, rbuf, rcounts,
                                                                        dtype, op,
                                                                        comm);
    }
    return Coll_reduce_scatter_ompi_ring::reduce_scatter(sbuf, rbuf, rcounts,
                                                     dtype, op,
                                                     comm);



}
Example #15
int Coll_scatter_ompi::scatter(const void *sbuf, int scount,
                                            MPI_Datatype sdtype,
                                            void* rbuf, int rcount,
                                            MPI_Datatype rdtype,
                                            int root, MPI_Comm  comm
                                            )
{
    const size_t small_block_size = 300;
    const int small_comm_size = 10;
    int communicator_size, rank;
    size_t dsize, block_size;

    XBT_DEBUG("Coll_scatter_ompi::scatter");

    communicator_size = comm->size();
    rank = comm->rank();
    // Determine block size
    if (root == rank) {
        dsize=sdtype->size();
        block_size = dsize * scount;
    } else {
        dsize=rdtype->size();
        block_size = dsize * rcount;
    }

    if ((communicator_size > small_comm_size) &&
        (block_size < small_block_size)) {
      std::unique_ptr<unsigned char[]> tmp_buf;
      if (rank != root) {
        tmp_buf.reset(new unsigned char[rcount * rdtype->get_extent()]);
        sbuf   = tmp_buf.get();
        scount = rcount;
        sdtype = rdtype;
      }
      return Coll_scatter_ompi_binomial::scatter(sbuf, scount, sdtype, rbuf, rcount, rdtype, root, comm);
    }
    return Coll_scatter_ompi_basic_linear::scatter (sbuf, scount, sdtype,
                                                       rbuf, rcount, rdtype,
                                                       root, comm);
}
Example #16
int
Coll_allgatherv_pair::allgatherv(void *send_buff, int send_count,
                               MPI_Datatype send_type, void *recv_buff,
                               int *recv_counts, int *recv_disps, MPI_Datatype recv_type,
                               MPI_Comm comm)
{

  MPI_Aint extent;
  unsigned int i, src, dst;
  int tag = COLL_TAG_ALLGATHERV;
  MPI_Status status;

  char *send_ptr = (char *) send_buff;
  char *recv_ptr = (char *) recv_buff;

  unsigned int rank = comm->rank();
  unsigned int num_procs = comm->size();

  if((num_procs&(num_procs-1)))
    THROWF(arg_error,0, "allgatherv pair algorithm can't be used with non power of two number of processes ! ");

  extent = send_type->get_extent();

  // local send/recv
  Request::sendrecv(send_ptr, send_count, send_type, rank, tag,
               recv_ptr + recv_disps[rank] * extent,
               recv_counts[rank], recv_type, rank, tag, comm, &status);
  for (i = 1; i < num_procs; i++) {
    src = dst = rank ^ i;
    Request::sendrecv(send_ptr, send_count, send_type, dst, tag,
                 recv_ptr + recv_disps[src] * extent, recv_counts[src], recv_type,
                 src, tag, comm, &status);
  }

  return MPI_SUCCESS;
}
Example #17
/* special case for two processes */
int Coll_barrier_ompi_two_procs::barrier(MPI_Comm comm)
{
    int remote;

    remote = comm->rank();
    XBT_DEBUG(
                 "ompi_coll_tuned_barrier_ompi_two_procs rank %d", remote);
    remote = (remote + 1) & 0x1;

    Request::sendrecv(NULL, 0, MPI_BYTE, remote,
                                          COLL_TAG_BARRIER,
                                          NULL, 0, MPI_BYTE, remote,
                                          COLL_TAG_BARRIER,
                                          comm, MPI_STATUS_IGNORE);
    return (MPI_SUCCESS);
}
Example #18
int PMPI_Recv(void *buf, int count, MPI_Datatype datatype, int src, int tag, MPI_Comm comm, MPI_Status * status)
{
  int retval = 0;

  smpi_bench_end();
  if (comm == MPI_COMM_NULL) {
    retval = MPI_ERR_COMM;
  } else if (src == MPI_PROC_NULL) {
    if(status != MPI_STATUS_IGNORE){
      simgrid::smpi::Status::empty(status);
      status->MPI_SOURCE = MPI_PROC_NULL;
    }
    retval = MPI_SUCCESS;
  } else if (src!=MPI_ANY_SOURCE && (src >= comm->group()->size() || src <0)){
    retval = MPI_ERR_RANK;
  } else if ((count < 0) || (buf==nullptr && count > 0)) {
    retval = MPI_ERR_COUNT;
  } else if (datatype==MPI_DATATYPE_NULL || not datatype->is_valid()) {
    retval = MPI_ERR_TYPE;
  } else if(tag<0 && tag !=  MPI_ANY_TAG){
    retval = MPI_ERR_TAG;
  } else {
    int my_proc_id = simgrid::s4u::this_actor::get_pid();
    TRACE_smpi_comm_in(my_proc_id, __func__,
                       new simgrid::instr::Pt2PtTIData("recv", src,
                                                       datatype->is_replayable() ? count : count * datatype->size(),
                                                       tag, simgrid::smpi::Datatype::encode(datatype)));

    simgrid::smpi::Request::recv(buf, count, datatype, src, tag, comm, status);
    retval = MPI_SUCCESS;

    // the src may not have been known at the beginning of the recv (MPI_ANY_SOURCE)
    int src_traced=0;
    if (status != MPI_STATUS_IGNORE) 
      src_traced = getPid(comm, status->MPI_SOURCE);
    else
      src_traced = getPid(comm, src);
    if (not TRACE_smpi_view_internals()) {
      TRACE_smpi_recv(src_traced, my_proc_id, tag);
    }
    
    TRACE_smpi_comm_out(my_proc_id);
  }

  smpi_bench_begin();
  return retval;
}
Example #19
int Coll_allgatherv_ompi::allgatherv(const void *sbuf, int scount,
                                               MPI_Datatype sdtype,
                                               void* rbuf, const int *rcounts,
                                               const int *rdispls,
                                               MPI_Datatype rdtype,
                                               MPI_Comm  comm
                                               )
{
    int i;
    int communicator_size;
    size_t dsize, total_dsize;

    communicator_size = comm->size();

    /* Special case for 2 processes */
    if (communicator_size == 2) {
        return Coll_allgatherv_pair::allgatherv(sbuf, scount, sdtype,
                                                           rbuf, rcounts, rdispls, rdtype,
                                                           comm);
    }

    /* Determine complete data size */
    dsize=sdtype->size();
    total_dsize = 0;
    for (i = 0; i < communicator_size; i++) {
        total_dsize += dsize * rcounts[i];
    }

    /* Decision based on allgather decision.   */
    if (total_dsize < 50000) {
        return Coll_allgatherv_ompi_bruck::allgatherv(sbuf, scount, sdtype,
                                                      rbuf, rcounts, rdispls, rdtype,
                                                      comm);

    } else {
        if (communicator_size % 2) {
            return Coll_allgatherv_ring::allgatherv(sbuf, scount, sdtype,
                                                         rbuf, rcounts, rdispls, rdtype,
                                                         comm);
        } else {
            return  Coll_allgatherv_ompi_neighborexchange::allgatherv(sbuf, scount, sdtype,
                                                                      rbuf, rcounts, rdispls, rdtype,
                                                                      comm);
        }
    }
}
Example #20
int Coll_barrier_ompi::barrier(MPI_Comm  comm)
{
    int communicator_size = comm->size();

    if( 2 == communicator_size )
        return Coll_barrier_ompi_two_procs::barrier(comm);
/*     * Basic optimisation. If we have a power of 2 number of nodes*/
/*     * then use the recursive doubling algorithm, otherwise*/
/*     * bruck is the one we want.*/
    {
        int has_one = 0;
        for( ; communicator_size > 0; communicator_size >>= 1 ) {
            if( communicator_size & 0x1 ) {
                if( has_one )
                    return Coll_barrier_ompi_bruck::barrier(comm);
                has_one = 1;
            }
        }
    }
    return Coll_barrier_ompi_recursivedoubling::barrier(comm);
}
Example #21
int PMPI_Irecv(void *buf, int count, MPI_Datatype datatype, int src, int tag, MPI_Comm comm, MPI_Request * request)
{
  int retval = 0;

  smpi_bench_end();

  if (request == nullptr) {
    retval = MPI_ERR_ARG;
  } else if (comm == MPI_COMM_NULL) {
    retval = MPI_ERR_COMM;
  } else if (src == MPI_PROC_NULL) {
    *request = MPI_REQUEST_NULL;
    retval = MPI_SUCCESS;
  } else if (src!=MPI_ANY_SOURCE && (src >= comm->group()->size() || src <0)){
    retval = MPI_ERR_RANK;
  } else if ((count < 0) || (buf==nullptr && count > 0)) {
    retval = MPI_ERR_COUNT;
  } else if (datatype==MPI_DATATYPE_NULL || not datatype->is_valid()) {
    retval = MPI_ERR_TYPE;
  } else if(tag<0 && tag !=  MPI_ANY_TAG){
    retval = MPI_ERR_TAG;
  } else {

    int my_proc_id = simgrid::s4u::this_actor::get_pid();

    TRACE_smpi_comm_in(my_proc_id, __func__,
                       new simgrid::instr::Pt2PtTIData("irecv", src,
                                                       datatype->is_replayable() ? count : count * datatype->size(),
                                                       tag, simgrid::smpi::Datatype::encode(datatype)));

    *request = simgrid::smpi::Request::irecv(buf, count, datatype, src, tag, comm);
    retval = MPI_SUCCESS;

    TRACE_smpi_comm_out(my_proc_id);
  }

  smpi_bench_begin();
  if (retval != MPI_SUCCESS && request != nullptr)
    *request = MPI_REQUEST_NULL;
  return retval;
}
Example #22
int Coll_allgather_mvapich2_smp::allgather(void *sendbuf,int sendcnt, MPI_Datatype sendtype,
                            void *recvbuf, int recvcnt,MPI_Datatype recvtype,
                            MPI_Comm  comm)
{
    int rank, size;
    int local_rank, local_size;
    int leader_comm_size = 0;
    int mpi_errno = MPI_SUCCESS;
    MPI_Aint recvtype_extent = 0;  /* Datatype extent */
    MPI_Comm shmem_comm, leader_comm;

  if(comm->get_leaders_comm()==MPI_COMM_NULL){
    comm->init_smp();
  }

  if (not comm->is_uniform() || not comm->is_blocked())
    THROWF(arg_error,0, "allgather MVAPICH2 smp algorithm can't be used with irregular deployment. Please insure that processes deployed on the same node are contiguous and that each node has the same number of processes");

    if (recvcnt == 0) {
        return MPI_SUCCESS;
    }

    rank = comm->rank();
    size = comm->size();

    /* extract the rank,size information for the intra-node communicator */
    recvtype_extent=recvtype->get_extent();

    shmem_comm = comm->get_intra_comm();
    local_rank = shmem_comm->rank();
    local_size = shmem_comm->size();

    if (local_rank == 0) {
        /* Node leader. Extract the rank, size information for the leader communicator */
        leader_comm = comm->get_leaders_comm();
        if(leader_comm==MPI_COMM_NULL){
          leader_comm = MPI_COMM_WORLD;
        }
        leader_comm_size = leader_comm->size();
    }

    /*If there is just one node, after gather itself,
     * root has all the data and it can do bcast*/
    if(local_rank == 0) {
        mpi_errno = Colls::gather(sendbuf, sendcnt,sendtype,
                                    (void*)((char*)recvbuf + (rank * recvcnt * recvtype_extent)),
                                     recvcnt, recvtype,
                                     0, shmem_comm);
    } else {
        /*Since in allgather all the processes could have
         * its own data in place*/
        if(sendbuf == MPI_IN_PLACE) {
            mpi_errno = Colls::gather((void*)((char*)recvbuf + (rank * recvcnt * recvtype_extent)),
                                         recvcnt , recvtype,
                                         recvbuf, recvcnt, recvtype,
                                         0, shmem_comm);
        } else {
            mpi_errno = Colls::gather(sendbuf, sendcnt,sendtype,
                                         recvbuf, recvcnt, recvtype,
                                         0, shmem_comm);
        }
    }
    /* Exchange the data between the node leaders*/
    if (local_rank == 0 && (leader_comm_size > 1)) {
        /*When data in each socket is different*/
        if (comm->is_uniform() != 1) {

            int *displs = NULL;
            int *recvcnts = NULL;
            int *node_sizes = NULL;
            int i = 0;

            node_sizes = comm->get_non_uniform_map();

            displs =  static_cast<int *>(xbt_malloc(sizeof (int) * leader_comm_size));
            recvcnts =  static_cast<int *>(xbt_malloc(sizeof (int) * leader_comm_size));
            if (not displs || not recvcnts) {
              return MPI_ERR_OTHER;
            }
            recvcnts[0] = node_sizes[0] * recvcnt;
            displs[0] = 0;

            for (i = 1; i < leader_comm_size; i++) {
                displs[i] = displs[i - 1] + node_sizes[i - 1] * recvcnt;
                recvcnts[i] = node_sizes[i] * recvcnt;
            }


            void* sendbuf=((char*)recvbuf)+recvtype->get_extent()*displs[leader_comm->rank()];

            mpi_errno = Colls::allgatherv(sendbuf,
                                       (recvcnt*local_size),
                                       recvtype,
                                       recvbuf, recvcnts,
                                       displs, recvtype,
                                       leader_comm);
            xbt_free(displs);
            xbt_free(recvcnts);
        } else {
        void* sendtmpbuf=((char*)recvbuf)+recvtype->get_extent()*(recvcnt*local_size)*leader_comm->rank();



            mpi_errno = Coll_allgather_mpich::allgather(sendtmpbuf,
                                               (recvcnt*local_size),
                                               recvtype,
                                               recvbuf, (recvcnt*local_size), recvtype,
                                             leader_comm);

        }
    }

    /*Bcast the entire data from node leaders to all other cores*/
    mpi_errno = Colls::bcast (recvbuf, recvcnt * size, recvtype, 0, shmem_comm);
    return mpi_errno;
}
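The function above follows a three-phase hierarchy: gather onto each node leader over the intra-node communicator, exchange node blocks among the leaders, then broadcast the assembled buffer inside each node. The sketch below expresses the same idea with plain MPI calls for int payloads; hierarchical_allgather is a hypothetical helper, and it assumes the blocked, uniform rank layout that the SMPI code above enforces with THROWF.

// Sketch of the three-phase idea with standard MPI calls. Assumes contiguous
// ranks per node and the same number of processes on every node; recvbuf must
// hold size * count ints on every rank.
#include <mpi.h>
#include <vector>

int hierarchical_allgather(const int* sendbuf, int count, int* recvbuf, MPI_Comm comm) {
  int rank, size;
  MPI_Comm_rank(comm, &rank);
  MPI_Comm_size(comm, &size);

  // Phase 0: build the intra-node communicator and the leader communicator.
  MPI_Comm node_comm;
  MPI_Comm_split_type(comm, MPI_COMM_TYPE_SHARED, rank, MPI_INFO_NULL, &node_comm);
  int local_rank, local_size;
  MPI_Comm_rank(node_comm, &local_rank);
  MPI_Comm_size(node_comm, &local_size);
  MPI_Comm leader_comm;
  MPI_Comm_split(comm, local_rank == 0 ? 0 : MPI_UNDEFINED, rank, &leader_comm);

  // Phase 1: gather all node-local contributions at the node leader.
  std::vector<int> node_buf(local_rank == 0 ? local_size * count : 0);
  MPI_Gather(sendbuf, count, MPI_INT,
             node_buf.data(), count, MPI_INT, 0, node_comm);

  // Phase 2: leaders exchange node blocks (block order equals global rank
  //          order because ranks are blocked per node).
  if (local_rank == 0)
    MPI_Allgather(node_buf.data(), local_size * count, MPI_INT,
                  recvbuf, local_size * count, MPI_INT, leader_comm);

  // Phase 3: leaders broadcast the assembled result inside their node.
  MPI_Bcast(recvbuf, size * count, MPI_INT, 0, node_comm);

  if (leader_comm != MPI_COMM_NULL)
    MPI_Comm_free(&leader_comm);
  MPI_Comm_free(&node_comm);
  return MPI_SUCCESS;
}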
Example #23
int Coll_scatter_mvapich2_two_level_direct::scatter(void *sendbuf,
                                      int sendcnt,
                                      MPI_Datatype sendtype,
                                      void *recvbuf,
                                      int recvcnt,
                                      MPI_Datatype recvtype,
                                      int root, MPI_Comm  comm)
{
    int comm_size, rank;
    int local_rank, local_size;
    int leader_comm_rank = -1, leader_comm_size = -1;
    int mpi_errno = MPI_SUCCESS;
    int recvtype_size, sendtype_size, nbytes;
    void *tmp_buf = NULL;
    void *leader_scatter_buf = NULL;
    MPI_Status status;
    int leader_root, leader_of_root = -1;
    MPI_Comm shmem_comm, leader_comm;
    //if not set (use of the algo directly, without mvapich2 selector)
    if(MV2_Scatter_intra_function==NULL)
      MV2_Scatter_intra_function=Coll_scatter_mpich::scatter;

    if(comm->get_leaders_comm()==MPI_COMM_NULL){
      comm->init_smp();
    }
    comm_size = comm->size();
    rank = comm->rank();

    if (((rank == root) && (recvcnt == 0))
        || ((rank != root) && (sendcnt == 0))) {
        return MPI_SUCCESS;
    }

    /* extract the rank,size information for the intra-node
     * communicator */
    shmem_comm = comm->get_intra_comm();
    local_rank = shmem_comm->rank();
    local_size = shmem_comm->size();

    if (local_rank == 0) {
        /* Node leader. Extract the rank, size information for the leader
         * communicator */
        leader_comm = comm->get_leaders_comm();
        leader_comm_size = leader_comm->size();
        leader_comm_rank = leader_comm->rank();
    }

    if (local_size == comm_size) {
        /* purely intra-node scatter. Just use the direct algorithm and we are done */
        mpi_errno = MPIR_Scatter_MV2_Direct(sendbuf, sendcnt, sendtype,
                                            recvbuf, recvcnt, recvtype,
                                            root, comm);

    } else {
        recvtype_size=recvtype->size();
        sendtype_size=sendtype->size();

        if (rank == root) {
            nbytes = sendcnt * sendtype_size;
        } else {
            nbytes = recvcnt * recvtype_size;
        }

        if (local_rank == 0) {
            /* Node leader, allocate tmp_buffer */
            tmp_buf = smpi_get_tmp_sendbuffer(nbytes * local_size);
        }

        leader_comm = comm->get_leaders_comm();
        int* leaders_map = comm->get_leaders_map();
        leader_of_root = comm->group()->rank(leaders_map[root]);
        leader_root = leader_comm->group()->rank(leaders_map[root]);
        /* leader_root is the rank of the leader of the root in leader_comm.
         * leader_root is to be used as the root of the inter-leader gather ops
         */

        if ((local_rank == 0) && (root != rank)
            && (leader_of_root == rank)) {
            /* The root of the scatter operation is not the node leader. Recv
             * data from the node leader */
            leader_scatter_buf = smpi_get_tmp_sendbuffer(nbytes * comm_size);
            Request::recv(leader_scatter_buf, nbytes * comm_size, MPI_BYTE,
                             root, COLL_TAG_SCATTER, comm, &status);

        }

        if (rank == root && local_rank != 0) {
            /* The root of the scatter operation is not the node leader. Send
             * data to the node leader */
            Request::send(sendbuf, sendcnt * comm_size, sendtype,
                                     leader_of_root, COLL_TAG_SCATTER, comm
                                     );
        }

        if (leader_comm_size > 1 && local_rank == 0) {
          if (not comm->is_uniform()) {
            int* displs   = NULL;
            int* sendcnts = NULL;
            int* node_sizes;
            int i      = 0;
            node_sizes = comm->get_non_uniform_map();

            if (root != leader_of_root) {
              if (leader_comm_rank == leader_root) {
                displs      = static_cast<int*>(xbt_malloc(sizeof(int) * leader_comm_size));
                sendcnts    = static_cast<int*>(xbt_malloc(sizeof(int) * leader_comm_size));
                sendcnts[0] = node_sizes[0] * nbytes;
                displs[0]   = 0;

                for (i = 1; i < leader_comm_size; i++) {
                  displs[i]   = displs[i - 1] + node_sizes[i - 1] * nbytes;
                  sendcnts[i] = node_sizes[i] * nbytes;
                }
              }
              Colls::scatterv(leader_scatter_buf, sendcnts, displs, MPI_BYTE, tmp_buf, nbytes * local_size, MPI_BYTE,
                              leader_root, leader_comm);
            } else {
              if (leader_comm_rank == leader_root) {
                displs      = static_cast<int*>(xbt_malloc(sizeof(int) * leader_comm_size));
                sendcnts    = static_cast<int*>(xbt_malloc(sizeof(int) * leader_comm_size));
                sendcnts[0] = node_sizes[0] * sendcnt;
                displs[0]   = 0;

                for (i = 1; i < leader_comm_size; i++) {
                  displs[i]   = displs[i - 1] + node_sizes[i - 1] * sendcnt;
                  sendcnts[i] = node_sizes[i] * sendcnt;
                }
              }
              Colls::scatterv(sendbuf, sendcnts, displs, sendtype, tmp_buf, nbytes * local_size, MPI_BYTE, leader_root,
                              leader_comm);
            }
            if (leader_comm_rank == leader_root) {
              xbt_free(displs);
              xbt_free(sendcnts);
            }
            } else {
                if (leader_of_root != root) {
                    mpi_errno =
                        MPIR_Scatter_MV2_Direct(leader_scatter_buf,
                                                nbytes * local_size, MPI_BYTE,
                                                tmp_buf, nbytes * local_size,
                                                MPI_BYTE, leader_root,
                                                leader_comm);
                } else {
                    mpi_errno =
                        MPIR_Scatter_MV2_Direct(sendbuf, sendcnt * local_size,
                                                sendtype, tmp_buf,
                                                nbytes * local_size, MPI_BYTE,
                                                leader_root, leader_comm);

                }
            }
        }
        /* The leaders are now done with the inter-leader part. Scatter the data within the nodes */

        if (rank == root && recvbuf == MPI_IN_PLACE) {
            mpi_errno = MV2_Scatter_intra_function(tmp_buf, nbytes, MPI_BYTE,
                                                (void *)sendbuf, sendcnt, sendtype,
                                                0, shmem_comm);
        } else {
            mpi_errno = MV2_Scatter_intra_function(tmp_buf, nbytes, MPI_BYTE,
                                                recvbuf, recvcnt, recvtype,
                                                0, shmem_comm);
        }
    }

    /* check if multiple threads are calling this collective function */
    if (comm_size != local_size && local_rank == 0) {
        smpi_free_tmp_buffer(tmp_buf);
        if (leader_of_root == rank && root != rank) {
            smpi_free_tmp_buffer(leader_scatter_buf);
        }
    }
    return (mpi_errno);
}
Example #24
int Coll_allgather_loosely_lr::allgather(const void *sbuf, int scount,
                                         MPI_Datatype stype, void *rbuf,
                                         int rcount, MPI_Datatype rtype,
                                         MPI_Comm comm)
{
  int comm_size, rank;
  int tag = COLL_TAG_ALLGATHER;
  int i, j, send_offset, recv_offset;
  int intra_rank, inter_rank, inter_comm_size, intra_comm_size;
  int inter_dst, inter_src;

  comm_size = comm->size();

if(comm->get_leaders_comm()==MPI_COMM_NULL){
    comm->init_smp();
  }
  int num_core=1;
  if (comm->is_uniform()){
    num_core = comm->get_intra_comm()->size();
  }

  if(comm_size%num_core)
    THROWF(arg_error,0, "allgather loosely lr algorithm can't be used with non multiple of NUM_CORE=%d number of processes ! ",num_core);

  rank = comm->rank();
  MPI_Aint rextent, sextent;
  rextent = rtype->get_extent();
  sextent = stype->get_extent();
  MPI_Request inter_rrequest;
  MPI_Request rrequest_array[128];
  MPI_Request srequest_array[128];
  MPI_Request inter_srequest_array[128];


  int rrequest_count = 0;
  int srequest_count = 0;
  int inter_srequest_count = 0;

  MPI_Status status;

  intra_rank = rank % num_core;
  inter_rank = rank / num_core;
  inter_comm_size = (comm_size + num_core - 1) / num_core;
  intra_comm_size = num_core;

  int src_seg, dst_seg;

  //copy corresponding message from sbuf to rbuf
  recv_offset = rank * rextent * rcount;
  Request::sendrecv(sbuf, scount, stype, rank, tag,
               (char *)rbuf + recv_offset, rcount, rtype, rank, tag, comm, &status);

  int dst, src;
  int inter_send_offset, inter_recv_offset;

  rrequest_count = 0;
  srequest_count = 0;
  inter_srequest_count = 0;

  for (i = 0; i < inter_comm_size; i++) {

    // inter_communication

    inter_dst = (rank + intra_comm_size) % comm_size;
    inter_src = (rank - intra_comm_size + comm_size) % comm_size;

    src_seg =
        ((inter_rank - 1 - i +
          inter_comm_size) % inter_comm_size) * intra_comm_size + intra_rank;
    dst_seg =
        ((inter_rank - i +
          inter_comm_size) % inter_comm_size) * intra_comm_size + intra_rank;

    inter_send_offset = dst_seg * sextent * scount;
    inter_recv_offset = src_seg * rextent * rcount;

    for (j = 0; j < intra_comm_size; j++) {

      // inter communication
      if (intra_rank == j) {
        if (i != inter_comm_size - 1) {

          inter_rrequest = Request::irecv((char*)rbuf + inter_recv_offset, rcount, rtype, inter_src, tag, comm);
          inter_srequest_array[inter_srequest_count++] =
              Request::isend((char*)rbuf + inter_send_offset, scount, stype, inter_dst, tag, comm);
        }
      }
      //intra_communication
      src = inter_rank * intra_comm_size + j;
      dst = inter_rank * intra_comm_size + j;

      src_seg =
          ((inter_rank - i +
            inter_comm_size) % inter_comm_size) * intra_comm_size + j;
      dst_seg =
          ((inter_rank - i +
            inter_comm_size) % inter_comm_size) * intra_comm_size + intra_rank;

      send_offset = dst_seg * sextent * scount;
      recv_offset = src_seg * rextent * rcount;


      if (j != intra_rank) {

        rrequest_array[rrequest_count++] = Request::irecv((char *)rbuf + recv_offset, rcount, rtype, src, tag, comm);
        srequest_array[srequest_count++] = Request::isend((char *)rbuf + send_offset, scount, stype, dst, tag, comm);

      }
    }                           // intra loop


    // wait for inter communication to finish for these rounds (# of round equals num_core)
    if (i != inter_comm_size - 1) {
      Request::wait(&inter_rrequest, &status);
    }

  }                             //inter loop

  Request::waitall(rrequest_count, rrequest_array, MPI_STATUSES_IGNORE);
  Request::waitall(srequest_count, srequest_array, MPI_STATUSES_IGNORE);
  Request::waitall(inter_srequest_count, inter_srequest_array, MPI_STATUSES_IGNORE);

  return MPI_SUCCESS;
}
Example #25
int Coll_reduce_binomial::reduce(void *sendbuf, void *recvbuf, int count,
                                    MPI_Datatype datatype, MPI_Op op, int root,
                                    MPI_Comm comm)
{
  MPI_Status status;
  int comm_size, rank;
  int mask, relrank, source;
  int dst;
  int tag = COLL_TAG_REDUCE;
  MPI_Aint extent;
  void *tmp_buf;
  MPI_Aint true_lb, true_extent;
  if (count == 0)
    return 0;
  rank = comm->rank();
  comm_size = comm->size();

  extent = datatype->get_extent();

  tmp_buf = (void *) smpi_get_tmp_sendbuffer(count * extent);
  int is_commutative =  (op==MPI_OP_NULL || op->is_commutative());
  mask = 1;

  int lroot;
  if (is_commutative)
        lroot   = root;
  else
        lroot   = 0;
  relrank = (rank - lroot + comm_size) % comm_size;

  datatype->extent(&true_lb, &true_extent);

  /* adjust for potential negative lower bound in datatype */
  tmp_buf = (void *)((char*)tmp_buf - true_lb);

  /* If I'm not the root, then my recvbuf may not be valid, therefore
     I have to allocate a temporary one */
  if (rank != root) {
      recvbuf = (void*)smpi_get_tmp_recvbuffer(count * std::max(extent, true_extent));
      recvbuf = (void *)((char*)recvbuf - true_lb);
  }
   if ((rank != root) || (sendbuf != MPI_IN_PLACE)) {
      Datatype::copy(sendbuf, count, datatype, recvbuf,count, datatype);
  }

  while (mask < comm_size) {
    /* Receive */
    if ((mask & relrank) == 0) {
      source = (relrank | mask);
      if (source < comm_size) {
        source = (source + lroot) % comm_size;
        Request::recv(tmp_buf, count, datatype, source, tag, comm, &status);

        if (is_commutative) {
          if(op!=MPI_OP_NULL) op->apply( tmp_buf, recvbuf, &count, datatype);
        } else {
          if(op!=MPI_OP_NULL) op->apply( recvbuf, tmp_buf, &count, datatype);
          Datatype::copy(tmp_buf, count, datatype,recvbuf, count, datatype);
        }
      }
    } else {
      dst = ((relrank & (~mask)) + lroot) % comm_size;
      Request::send(recvbuf, count, datatype, dst, tag, comm);
      break;
    }
    mask <<= 1;
  }

  if (not is_commutative && (root != 0)) {
    if (rank == 0){
      Request::send(recvbuf, count, datatype, root,tag, comm);
    }else if (rank == root){
      Request::recv(recvbuf, count, datatype, 0, tag, comm, &status);
    }
  }

  if (rank != root) {
    smpi_free_tmp_buffer(recvbuf);
  }
  smpi_free_tmp_buffer(tmp_buf);

  return 0;
}
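The mask/relrank arithmetic above builds a binomial tree rooted (for a commutative op) at root: while the current mask bit of relrank is clear the rank receives from relrank | mask, and as soon as a set bit is reached the rank sends its partial result to (relrank & ~mask) + lroot and is done. A small sketch that prints this pattern for an assumed comm_size of 8 and root 0:

// Sketch: the send/recv pattern of the binomial reduce above for root 0.
#include <cstdio>

int main() {
  const int comm_size = 8, root = 0;
  for (int rank = 0; rank < comm_size; ++rank) {
    int relrank = (rank - root + comm_size) % comm_size;
    std::printf("rank %d:", rank);
    for (int mask = 1; mask < comm_size; mask <<= 1) {
      if ((mask & relrank) == 0) {
        int source = relrank | mask;
        if (source < comm_size)
          std::printf(" recv<-%d", (source + root) % comm_size);
      } else {
        int dst = ((relrank & ~mask) + root) % comm_size;
        std::printf(" send->%d (done)", dst);
        break;
      }
    }
    std::printf("\n");
  }
  return 0;
}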
Example #26
int Coll_scatter_mvapich2::scatter(const void *sendbuf,
    int sendcnt,
    MPI_Datatype sendtype,
    void *recvbuf,
    int recvcnt,
    MPI_Datatype recvtype,
    int root, MPI_Comm comm)
{
  int range = 0, range_threshold = 0, range_threshold_intra = 0;
  int mpi_errno = MPI_SUCCESS;
  //   int mpi_errno_ret = MPI_SUCCESS;
  int rank, nbytes, comm_size;
  int partial_sub_ok = 0;
  int conf_index = 0;
     MPI_Comm shmem_comm;
  //    MPID_Comm *shmem_commptr=NULL;
  if(mv2_scatter_thresholds_table==NULL)
    init_mv2_scatter_tables_stampede();

  if(comm->get_leaders_comm()==MPI_COMM_NULL){
    comm->init_smp();
  }

  comm_size = comm->size();

  rank = comm->rank();

  if (rank == root) {
    int sendtype_size = sendtype->size();
    nbytes            = sendcnt * sendtype_size;
  } else {
    int recvtype_size = recvtype->size();
    nbytes            = recvcnt * recvtype_size;
  }

    // check if safe to use partial subscription mode
    if (comm->is_uniform()) {

        shmem_comm = comm->get_intra_comm();
        if (mv2_scatter_table_ppn_conf[0] == -1) {
            // Indicating user defined tuning
            conf_index = 0;
        }else{
          int local_size = shmem_comm->size();
          int i          = 0;
            do {
                if (local_size == mv2_scatter_table_ppn_conf[i]) {
                    conf_index = i;
                    partial_sub_ok = 1;
                    break;
                }
                i++;
            } while(i < mv2_scatter_num_ppn_conf);
        }
    }

  if (partial_sub_ok != 1) {
      conf_index = 0;
  }

  /* Search for the corresponding system size inside the tuning table */
  while ((range < (mv2_size_scatter_tuning_table[conf_index] - 1)) &&
      (comm_size > mv2_scatter_thresholds_table[conf_index][range].numproc)) {
      range++;
  }
  /* Search for corresponding inter-leader function */
  while ((range_threshold < (mv2_scatter_thresholds_table[conf_index][range].size_inter_table - 1))
      && (nbytes >
  mv2_scatter_thresholds_table[conf_index][range].inter_leader[range_threshold].max)
  && (mv2_scatter_thresholds_table[conf_index][range].inter_leader[range_threshold].max != -1)) {
      range_threshold++;
  }

  /* Search for corresponding intra-node function */
  while ((range_threshold_intra <
      (mv2_scatter_thresholds_table[conf_index][range].size_intra_table - 1))
      && (nbytes >
  mv2_scatter_thresholds_table[conf_index][range].intra_node[range_threshold_intra].max)
  && (mv2_scatter_thresholds_table[conf_index][range].intra_node[range_threshold_intra].max !=
      -1)) {
      range_threshold_intra++;
  }

  MV2_Scatter_function = mv2_scatter_thresholds_table[conf_index][range].inter_leader[range_threshold]
                                                                                      .MV2_pt_Scatter_function;

  if(MV2_Scatter_function == &MPIR_Scatter_mcst_wrap_MV2) {
#if defined(_MCST_SUPPORT_)
      if(comm->ch.is_mcast_ok == 1
          && mv2_use_mcast_scatter == 1
          && comm->ch.shmem_coll_ok == 1) {
          MV2_Scatter_function = &MPIR_Scatter_mcst_MV2;
      } else
#endif /*#if defined(_MCST_SUPPORT_) */
        {
          if(mv2_scatter_thresholds_table[conf_index][range].inter_leader[range_threshold + 1].
              MV2_pt_Scatter_function != NULL) {
              MV2_Scatter_function = mv2_scatter_thresholds_table[conf_index][range].inter_leader[range_threshold + 1]
                                                                                                  .MV2_pt_Scatter_function;
          } else {
              /* Fallback! */
              MV2_Scatter_function = &MPIR_Scatter_MV2_Binomial;
          }
        }
  }

  if( (MV2_Scatter_function == &MPIR_Scatter_MV2_two_level_Direct) ||
      (MV2_Scatter_function == &MPIR_Scatter_MV2_two_level_Binomial)) {
       if( comm->is_blocked()) {
             MV2_Scatter_intra_function = mv2_scatter_thresholds_table[conf_index][range].intra_node[range_threshold_intra]
                                .MV2_pt_Scatter_function;

             mpi_errno =
                   MV2_Scatter_function(sendbuf, sendcnt, sendtype,
                                        recvbuf, recvcnt, recvtype, root,
                                        comm);
         } else {
      mpi_errno = MPIR_Scatter_MV2_Binomial(sendbuf, sendcnt, sendtype,
          recvbuf, recvcnt, recvtype, root,
          comm);

      }
  } else {
      mpi_errno = MV2_Scatter_function(sendbuf, sendcnt, sendtype,
          recvbuf, recvcnt, recvtype, root,
          comm);
  }
  return (mpi_errno);
}
Example #27
int Coll_bcast_ompi::bcast(void *buff, int count,
                                          MPI_Datatype datatype, int root,
                                          MPI_Comm  comm
                                          )
{
    /* Decision function based on MX results for
       messages up to 36MB and communicator sizes up to 64 nodes */
    const size_t small_message_size = 2048;
    const size_t intermediate_message_size = 370728;
    const double a_p16  = 3.2118e-6; /* [1 / byte] */
    const double b_p16  = 8.7936;
    const double a_p64  = 2.3679e-6; /* [1 / byte] */
    const double b_p64  = 1.1787;
    const double a_p128 = 1.6134e-6; /* [1 / byte] */
    const double b_p128 = 2.1102;

    int communicator_size;
    //int segsize = 0;
    size_t message_size, dsize;

    communicator_size = comm->size();

    /* else we need data size for decision function */
    dsize = datatype->size();
    message_size = dsize * (unsigned long)count;   /* needed for decision */

    /* Handle messages of small and intermediate size, and
       single-element broadcasts */
    if ((message_size < small_message_size) || (count <= 1)) {
        /* Binomial without segmentation */
        return  Coll_bcast_binomial_tree::bcast (buff, count, datatype,
                                                      root, comm);

    } else if (message_size < intermediate_message_size) {
        // SplittedBinary with 1KB segments
        return Coll_bcast_ompi_split_bintree::bcast(buff, count, datatype,
                                                         root, comm);

    }
     //Handle large message sizes
    else if (communicator_size < (a_p128 * message_size + b_p128)) {
        //Pipeline with 128KB segments
        //segsize = 1024  << 7;
        return Coll_bcast_ompi_pipeline::bcast (buff, count, datatype,
                                                     root, comm);


    } else if (communicator_size < 13) {
        // Split Binary with 8KB segments
        return Coll_bcast_ompi_split_bintree::bcast(buff, count, datatype,
                                                         root, comm);

    } else if (communicator_size < (a_p64 * message_size + b_p64)) {
        // Pipeline with 64KB segments
        //segsize = 1024 << 6;
        return Coll_bcast_ompi_pipeline::bcast (buff, count, datatype,
                                                     root, comm);


    } else if (communicator_size < (a_p16 * message_size + b_p16)) {
        //Pipeline with 16KB segments
        //segsize = 1024 << 4;
        return Coll_bcast_ompi_pipeline::bcast (buff, count, datatype,
                                                     root, comm);


    }
    /* Pipeline with 8KB segments */
    //segsize = 1024 << 3;
    return Coll_bcast_flattree_pipeline::bcast (buff, count, datatype,
                                                 root, comm
                                                 /*segsize*/);
#if 0
    /* this is based on gige measurements */

    if (communicator_size  < 4) {
        return Coll_bcast_intra_basic_linear::bcast (buff, count, datatype, root, comm, module);
    }
    if (communicator_size == 4) {
        if (message_size < 524288) segsize = 0;
        else segsize = 16384;
        return Coll_bcast_intra_bintree::bcast (buff, count, datatype, root, comm, module, segsize);
    }
    if (communicator_size <= 8 && message_size < 4096) {
        return Coll_bcast_intra_basic_linear::bcast (buff, count, datatype, root, comm, module);
    }
    if (communicator_size > 8 && message_size >= 32768 && message_size < 524288) {
        segsize = 16384;
        return  Coll_bcast_intra_bintree::bcast (buff, count, datatype, root, comm, module, segsize);
    }
    if (message_size >= 524288) {
        segsize = 16384;
        return Coll_bcast_intra_pipeline::bcast (buff, count, datatype, root, comm, module, segsize);
    }
    segsize = 0;
    /* once tested can swap this back in */
    /* return Coll_bcast_intra_bmtree::bcast (buff, count, datatype, root, comm, segsize); */
    return Coll_bcast_intra_bintree::bcast (buff, count, datatype, root, comm, module, segsize);
#endif  /* 0 */
}
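
The function above is a pure decision routine: it measures the payload (datatype size times count) and the communicator, then dispatches to a binomial, split-binary, pipelined or flat-tree broadcast using linear models of the form communicator_size < a * message_size + b fitted to MX measurements. Below is a minimal, self-contained restatement of that selection logic against the plain MPI C API; the enum and function names are illustrative rather than SMPI identifiers, and the thresholds are the ones that appear above.

#include <mpi.h>
#include <stddef.h>

enum bcast_alg { BCAST_BINOMIAL, BCAST_SPLIT_BINTREE, BCAST_PIPELINE, BCAST_FLATTREE_PIPELINE };

/* Mirrors the decision cascade of Coll_bcast_ompi::bcast without performing
 * any communication.  Sketch for illustration only. */
enum bcast_alg choose_bcast_alg(int count, MPI_Datatype datatype, MPI_Comm comm)
{
  int comm_size, type_size;
  MPI_Comm_size(comm, &comm_size);
  MPI_Type_size(datatype, &type_size);
  size_t message_size = (size_t)type_size * (size_t)count;

  if (message_size < 2048 || count <= 1)
    return BCAST_BINOMIAL;                        /* small message: binomial tree    */
  if (message_size < 370728)
    return BCAST_SPLIT_BINTREE;                   /* intermediate: split binary tree */
  if (comm_size < 1.6134e-6 * message_size + 2.1102)
    return BCAST_PIPELINE;                        /* large message, 128KB segments   */
  if (comm_size < 13)
    return BCAST_SPLIT_BINTREE;                   /* split binary, 8KB segments      */
  if (comm_size < 2.3679e-6 * message_size + 1.1787)
    return BCAST_PIPELINE;                        /* pipeline, 64KB segments         */
  if (comm_size < 3.2118e-6 * message_size + 8.7936)
    return BCAST_PIPELINE;                        /* pipeline, 16KB segments         */
  return BCAST_FLATTREE_PIPELINE;                 /* default: flat-tree pipeline     */
}
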
Example #28
0
int Coll_bcast_SMP_binary::bcast(void *buf, int count,
                                     MPI_Datatype datatype, int root,
                                     MPI_Comm comm)
{
  int tag = COLL_TAG_BCAST;
  MPI_Status status;
  MPI_Request request;
  MPI_Request *request_array;
  MPI_Status *status_array;
  int rank, size;
  int i;
  MPI_Aint extent;
  extent = datatype->get_extent();

  rank = comm->rank();
  size = comm->size();
  if(comm->get_leaders_comm()==MPI_COMM_NULL){
    comm->init_smp();
  }
  int host_num_core=1;
  if (comm->is_uniform()){
    host_num_core = comm->get_intra_comm()->size();
  }else{
    // the implementation below is buggy when nodes host different numbers of processes; fall back to the MPICH bcast
    return Coll_bcast_mpich::bcast( buf , count, datatype,
              root, comm);
  }

  int segment = bcast_SMP_binary_segment_byte / extent;
  int pipe_length = count / segment;
  int remainder = count % segment;

  int to_intra_left = (rank / host_num_core) * host_num_core + (rank % host_num_core) * 2 + 1;
  int to_intra_right = (rank / host_num_core) * host_num_core + (rank % host_num_core) * 2 + 2;
  int to_inter_left = ((rank / host_num_core) * 2 + 1) * host_num_core;
  int to_inter_right = ((rank / host_num_core) * 2 + 2) * host_num_core;
  int from_inter = (((rank / host_num_core) - 1) / 2) * host_num_core;
  int from_intra = (rank / host_num_core) * host_num_core + ((rank % host_num_core) - 1) / 2;
  int increment = segment * extent;

  int base = (rank / host_num_core) * host_num_core;
  int num_core = host_num_core;
  if (((rank / host_num_core) * host_num_core) == ((size / host_num_core) * host_num_core))
    num_core = size - (rank / host_num_core) * host_num_core;

  // if root is not zero send to rank zero first
  if (root != 0) {
    if (rank == root)
      Request::send(buf, count, datatype, 0, tag, comm);
    else if (rank == 0)
      Request::recv(buf, count, datatype, root, tag, comm, &status);
  }
  // when the message fits in a single segment, no pipelining is needed
  if (count <= segment) {
    // case ROOT-of-each-SMP
    if (rank % host_num_core == 0) {
      // case ROOT
      if (rank == 0) {
        //printf("node %d left %d right %d\n",rank,to_inter_left,to_inter_right);
        if (to_inter_left < size)
          Request::send(buf, count, datatype, to_inter_left, tag, comm);
        if (to_inter_right < size)
          Request::send(buf, count, datatype, to_inter_right, tag, comm);
        if ((to_intra_left - base) < num_core)
          Request::send(buf, count, datatype, to_intra_left, tag, comm);
        if ((to_intra_right - base) < num_core)
          Request::send(buf, count, datatype, to_intra_right, tag, comm);
      }
      // case LEAVES ROOT-of-each-SMP
      else if (to_inter_left >= size) {
        //printf("node %d from %d\n",rank,from_inter);
        request = Request::irecv(buf, count, datatype, from_inter, tag, comm);
        Request::wait(&request, &status);
        if ((to_intra_left - base) < num_core)
          Request::send(buf, count, datatype, to_intra_left, tag, comm);
        if ((to_intra_right - base) < num_core)
          Request::send(buf, count, datatype, to_intra_right, tag, comm);
      }
      // case INTERMEDIATE ROOT-of-each-SMP
      else {
        //printf("node %d left %d right %d from %d\n",rank,to_inter_left,to_inter_right,from_inter);
        request = Request::irecv(buf, count, datatype, from_inter, tag, comm);
        Request::wait(&request, &status);
        Request::send(buf, count, datatype, to_inter_left, tag, comm);
        if (to_inter_right < size)
          Request::send(buf, count, datatype, to_inter_right, tag, comm);
        if ((to_intra_left - base) < num_core)
          Request::send(buf, count, datatype, to_intra_left, tag, comm);
        if ((to_intra_right - base) < num_core)
          Request::send(buf, count, datatype, to_intra_right, tag, comm);
      }
    }
    // case non-ROOT-of-each-SMP
    else {
      // case leaves
      if ((to_intra_left - base) >= num_core) {
        request = Request::irecv(buf, count, datatype, from_intra, tag, comm);
        Request::wait(&request, &status);
      }
      // case intermediate
      else {
        request = Request::irecv(buf, count, datatype, from_intra, tag, comm);
        Request::wait(&request, &status);
        Request::send(buf, count, datatype, to_intra_left, tag, comm);
        if ((to_intra_right - base) < num_core)
          Request::send(buf, count, datatype, to_intra_right, tag, comm);
      }
    }

    return MPI_SUCCESS;
  }

  // pipeline bcast
  else {
    request_array =
        (MPI_Request *) xbt_malloc((size + pipe_length) * sizeof(MPI_Request));
    status_array =
        (MPI_Status *) xbt_malloc((size + pipe_length) * sizeof(MPI_Status));

    // case ROOT-of-each-SMP
    if (rank % host_num_core == 0) {
      // case ROOT
      if (rank == 0) {
        for (i = 0; i < pipe_length; i++) {
          //printf("node %d left %d right %d\n",rank,to_inter_left,to_inter_right);
          if (to_inter_left < size)
            Request::send((char *) buf + (i * increment), segment, datatype,
                     to_inter_left, (tag + i), comm);
          if (to_inter_right < size)
            Request::send((char *) buf + (i * increment), segment, datatype,
                     to_inter_right, (tag + i), comm);
          if ((to_intra_left - base) < num_core)
            Request::send((char *) buf + (i * increment), segment, datatype,
                     to_intra_left, (tag + i), comm);
          if ((to_intra_right - base) < num_core)
            Request::send((char *) buf + (i * increment), segment, datatype,
                     to_intra_right, (tag + i), comm);
        }
      }
      // case LEAVES ROOT-of-each-SMP
      else if (to_inter_left >= size) {
        //printf("node %d from %d\n",rank,from_inter);
        for (i = 0; i < pipe_length; i++) {
          request_array[i] = Request::irecv((char *) buf + (i * increment), segment, datatype,
                    from_inter, (tag + i), comm);
        }
        for (i = 0; i < pipe_length; i++) {
          Request::wait(&request_array[i], &status);
          if ((to_intra_left - base) < num_core)
            Request::send((char *) buf + (i * increment), segment, datatype,
                     to_intra_left, (tag + i), comm);
          if ((to_intra_right - base) < num_core)
            Request::send((char *) buf + (i * increment), segment, datatype,
                     to_intra_right, (tag + i), comm);
        }
      }
      // case INTERMEDIATE ROOT-of-each-SMP
      else {
        //printf("node %d left %d right %d from %d\n",rank,to_inter_left,to_inter_right,from_inter);
        for (i = 0; i < pipe_length; i++) {
          request_array[i] = Request::irecv((char *) buf + (i * increment), segment, datatype,
                    from_inter, (tag + i), comm);
        }
        for (i = 0; i < pipe_length; i++) {
          Request::wait(&request_array[i], &status);
          Request::send((char *) buf + (i * increment), segment, datatype,
                   to_inter_left, (tag + i), comm);
          if (to_inter_right < size)
            Request::send((char *) buf + (i * increment), segment, datatype,
                     to_inter_right, (tag + i), comm);
          if ((to_intra_left - base) < num_core)
            Request::send((char *) buf + (i * increment), segment, datatype,
                     to_intra_left, (tag + i), comm);
          if ((to_intra_right - base) < num_core)
            Request::send((char *) buf + (i * increment), segment, datatype,
                     to_intra_right, (tag + i), comm);
        }
      }
    }
    // case non-ROOT-of-each-SMP
    else {
      // case leaves
      if ((to_intra_left - base) >= num_core) {
        for (i = 0; i < pipe_length; i++) {
          request_array[i] = Request::irecv((char *) buf + (i * increment), segment, datatype,
                    from_intra, (tag + i), comm);
        }
        Request::waitall((pipe_length), request_array, status_array);
      }
      // case intermediate
      else {
        for (i = 0; i < pipe_length; i++) {
          request_array[i] = Request::irecv((char *) buf + (i * increment), segment, datatype,
                    from_intra, (tag + i), comm);
        }
        for (i = 0; i < pipe_length; i++) {
          Request::wait(&request_array[i], &status);
          Request::send((char *) buf + (i * increment), segment, datatype,
                   to_intra_left, (tag + i), comm);
          if ((to_intra_right - base) < num_core)
            Request::send((char *) buf + (i * increment), segment, datatype,
                     to_intra_right, (tag + i), comm);
        }
      }
    }

    free(request_array);
    free(status_array);
  }

  // when count is not divisible by the segment size, use the default bcast for the remainder
  if ((remainder != 0) && (count > segment)) {
    XBT_WARN("MPI_bcast_SMP_binary uses default MPI_bcast.");
    Colls::bcast((char *) buf + (pipe_length * increment), remainder, datatype,
              root, comm);
  }

  return MPI_SUCCESS;
}
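
Most of the index arithmetic at the top of this algorithm encodes a two-level binary tree: processes are grouped into nodes of host_num_core ranks, the first rank of each node acts as that node's leader, the leaders form an inter-node binary tree rooted at rank 0, and inside each node the leader roots a second binary tree. The hedged sketch below factors that arithmetic into a small helper so the layout is easier to read; the struct and parameter names are invented for the example, and a returned child index must still be range-checked (to_inter_* < size, to_intra_* - base < num_core) before use, exactly as above.

/* Neighbour computation for the two-level (inter-node + intra-node) binary
 * tree used by the SMP-binary broadcast.  Illustrative helper only. */
struct smp_binary_neighbours {
  int to_intra_left, to_intra_right;   /* children inside the same node    */
  int to_inter_left, to_inter_right;   /* leader ranks of the child nodes  */
  int from_inter;                      /* leader rank of the parent node   */
  int from_intra;                      /* parent rank inside the same node */
};

static struct smp_binary_neighbours compute_neighbours(int rank, int cores_per_node)
{
  int node       = rank / cores_per_node;  /* which node this rank belongs to */
  int local_rank = rank % cores_per_node;  /* position inside that node       */
  struct smp_binary_neighbours n;
  n.to_intra_left  = node * cores_per_node + local_rank * 2 + 1;
  n.to_intra_right = node * cores_per_node + local_rank * 2 + 2;
  n.to_inter_left  = (node * 2 + 1) * cores_per_node;
  n.to_inter_right = (node * 2 + 2) * cores_per_node;
  n.from_inter     = ((node - 1) / 2) * cores_per_node;
  n.from_intra     = node * cores_per_node + (local_rank - 1) / 2;
  return n;
}
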
int Coll_reduce_ompi::reduce(const void *sendbuf, void *recvbuf,
                                            int count, MPI_Datatype  datatype,
                                            MPI_Op   op, int root,
                                            MPI_Comm   comm
                                            )
{
    int communicator_size=0;
    //int segsize = 0;
    size_t message_size, dsize;
    const double a1 =  0.6016 / 1024.0; /* [1/B] */
    const double b1 =  1.3496;
    const double a2 =  0.0410 / 1024.0; /* [1/B] */
    const double b2 =  9.7128;
    const double a3 =  0.0422 / 1024.0; /* [1/B] */
    const double b3 =  1.1614;
    //const double a4 =  0.0033 / 1024.0;  [1/B]
    //const double b4 =  1.6761;

    /* no limit on # of outstanding requests */
    //const int max_requests = 0;

    communicator_size = comm->size();

    /* need data size for decision function */
    dsize=datatype->size();
    message_size = dsize * count;   /* needed for decision */

    /**
     * If the operation is non-commutative we currently have a choice of the
     * linear or the in-order binary tree algorithm.
     */
    if ((op != MPI_OP_NULL) && not op->is_commutative()) {
      if ((communicator_size < 12) && (message_size < 2048)) {
        return Coll_reduce_ompi_basic_linear::reduce(sendbuf, recvbuf, count, datatype, op, root, comm /*, module*/);
      }
      return Coll_reduce_ompi_in_order_binary::reduce(sendbuf, recvbuf, count, datatype, op, root, comm /*, module,
                                                             0, max_requests*/);
    }

    if ((communicator_size < 8) && (message_size < 512)){
        /* Linear_0K */
        return Coll_reduce_ompi_basic_linear::reduce (sendbuf, recvbuf, count, datatype, op, root, comm);
    } else if (((communicator_size < 8) && (message_size < 20480)) ||
               (message_size < 2048) || (count <= 1)) {
        /* Binomial_0K */
        //segsize = 0;
        return Coll_reduce_ompi_binomial::reduce(sendbuf, recvbuf, count, datatype, op, root, comm/*, module,
                                                     segsize, max_requests*/);
    } else if (communicator_size > (a1 * message_size + b1)) {
        // Binomial_1K
        //segsize = 1024;
        return Coll_reduce_ompi_binomial::reduce(sendbuf, recvbuf, count, datatype, op, root, comm/*, module,
                                                     segsize, max_requests*/);
    } else if (communicator_size > (a2 * message_size + b2)) {
        // Pipeline_1K
        //segsize = 1024;
        return Coll_reduce_ompi_pipeline::reduce (sendbuf, recvbuf, count, datatype, op, root, comm/*, module,
                                                      segsize, max_requests*/);
    } else if (communicator_size > (a3 * message_size + b3)) {
        // Binary_32K
        //segsize = 32*1024;
        return Coll_reduce_ompi_binary::reduce( sendbuf, recvbuf, count, datatype, op, root,
                                                    comm/*, module, segsize, max_requests*/);
    }
//    if (communicator_size > (a4 * message_size + b4)) {
        // Pipeline_32K
//        segsize = 32*1024;
//    } else {
        // Pipeline_64K
//        segsize = 64*1024;
//    }
    return Coll_reduce_ompi_pipeline::reduce (sendbuf, recvbuf, count, datatype, op, root, comm/*, module,
                                                  segsize, max_requests*/);

#if 0
    /* for small messages use linear algorithm */
    if (message_size <= 4096) {
        segsize = 0;
        fanout = communicator_size - 1;
        /* when linear implemented or taken from basic put here, right now using chain as a linear system */
        /* it is implemented and I shouldn't be calling a chain with a fanout bigger than MAXTREEFANOUT from topo.h! */
        return Coll_reduce_intra_basic_linear::reduce (sendbuf, recvbuf, count, datatype, op, root, comm, module);
        /*        return Coll_reduce_intra_chain::reduce (sendbuf, recvbuf, count, datatype, op, root, comm, segsize, fanout); */
    }
    if (message_size < 524288) {
        if (message_size <= 65536 ) {
            segsize = 32768;
            fanout = 8;
        } else {
            segsize = 1024;
            fanout = communicator_size/2;
        }
        /* later swap this for a binary tree */
        /*         fanout = 2; */
        return Coll_reduce_intra_chain::reduce (sendbuf, recvbuf, count, datatype, op, root, comm, module,
                                                   segsize, fanout, max_requests);
    }
    segsize = 1024;
    return Coll_reduce_intra_pipeline::reduce (sendbuf, recvbuf, count, datatype, op, root, comm, module,
                                                  segsize, max_requests);
#endif  /* 0 */
}
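
The first branch above exists because a non-commutative reduction operation restricts the library to algorithms that combine contributions in rank order (basic linear or in-order binary tree). As a hedged illustration, the sketch below shows how such an operation arises in user code with the standard MPI C API: MPI_Op_create with commute = 0 is exactly what makes op->is_commutative() return false in the decision function. The operation chosen here ("leftmost non-zero element wins") is an arbitrary example; it is associative but order-dependent, and the function names are invented.

#include <mpi.h>

/* Keeps the first non-zero contribution in reduction order:
 * associative but not commutative. */
static void leftmost_nonzero(void *in, void *inout, int *len, MPI_Datatype *dtype)
{
  int *a = (int *)in;     /* operand that precedes in the reduction order */
  int *b = (int *)inout;  /* running result                               */
  for (int i = 0; i < *len; i++)
    b[i] = (a[i] != 0) ? a[i] : b[i];
}

int reduce_noncommutative_example(const int *sendbuf, int *recvbuf, int count, int root, MPI_Comm comm)
{
  MPI_Op op;
  MPI_Op_create(&leftmost_nonzero, /* commute = */ 0, &op);  /* forces the in-order paths */
  int rc = MPI_Reduce(sendbuf, recvbuf, count, MPI_INT, op, root, comm);
  MPI_Op_free(&op);
  return rc;
}
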
int Coll_allgather_ompi::allgather(const void *sbuf, int scount,
                                              MPI_Datatype sdtype,
                                              void* rbuf, int rcount,
                                              MPI_Datatype rdtype,
                                              MPI_Comm  comm
                                              )
{
    int communicator_size, pow2_size;
    size_t dsize, total_dsize;

    communicator_size = comm->size();

    /* Special case for 2 processes */
    if (communicator_size == 2) {
        return Coll_allgather_pair::allgather (sbuf, scount, sdtype,
                                                          rbuf, rcount, rdtype,
                                                          comm/*, module*/);
    }

    /* Determine complete data size */
    dsize=sdtype->size();
    total_dsize = dsize * scount * communicator_size;

    /* round communicator_size up to the next power of two */
    for (pow2_size = 1; pow2_size < communicator_size; pow2_size <<= 1)
        ;

    /* Decision based on MX 2Gb results from the Grig cluster at
       the University of Tennessee, Knoxville:
       - if the total message size is less than 50KB, use either Bruck or
       recursive doubling for non-power-of-two and power-of-two numbers of
       nodes, respectively;
       - else use the ring and neighbor-exchange algorithms for odd and even
       numbers of nodes, respectively.
    */
    if (total_dsize < 50000) {
        if (pow2_size == communicator_size) {
            return Coll_allgather_rdb::allgather(sbuf, scount, sdtype,
                                                                     rbuf, rcount, rdtype,
                                                                     comm);
        } else {
            return Coll_allgather_bruck::allgather(sbuf, scount, sdtype,
                                                         rbuf, rcount, rdtype,
                                                         comm);
        }
    } else {
        if (communicator_size % 2) {
            return Coll_allgather_ring::allgather(sbuf, scount, sdtype,
                                                        rbuf, rcount, rdtype,
                                                        comm);
        } else {
            return  Coll_allgather_ompi_neighborexchange::allgather(sbuf, scount, sdtype,
                                                                     rbuf, rcount, rdtype,
                                                                     comm);
        }
    }

#if defined(USE_MPICH2_DECISION)
    /* Decision as in MPICH-2,
       presented in Thakur et al., "Optimization of Collective Communication
       Operations in MPICH", International Journal of High Performance Computing
       Applications, Vol. 19, No. 1, 49-66 (2005):
       - for power-of-two process counts and small and medium-size messages
       (up to 512KB) use recursive doubling,
       - for non-power-of-two process counts and small messages (up to 80KB) use Bruck,
       - for everything else use ring.
    */
    if ((pow2_size == communicator_size) && (total_dsize < 524288)) {
        return Coll_allgather_rdb::allgather(sbuf, scount, sdtype,
                                                                 rbuf, rcount, rdtype,
                                                                 comm);
    } else if (total_dsize <= 81920) {
        return Coll_allgather_bruck::allgather(sbuf, scount, sdtype,
                                                     rbuf, rcount, rdtype,
                                                     comm);
    }
    return Coll_allgather_ring::allgather(sbuf, scount, sdtype,
                                                rbuf, rcount, rdtype,
                                                comm);
#endif  /* defined(USE_MPICH2_DECISION) */
}
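
Stripped of the SMPI plumbing, the default branch above reads: special-case two processes, then pick by total payload (sdtype size × scount × communicator size) and by whether the process count is a power of two. A self-contained restatement of that rule is sketched below; the enum names are illustrative, the 50000-byte cutoff and the odd/even split come from the code above, and the power-of-two test replaces the shift loop with the usual bit trick.

#include <mpi.h>
#include <stddef.h>

enum allgather_alg { ALLGATHER_PAIR, ALLGATHER_RDB, ALLGATHER_BRUCK,
                     ALLGATHER_RING, ALLGATHER_NEIGHBOREXCHANGE };

/* Mirrors the default decision of Coll_allgather_ompi::allgather. Sketch only. */
enum allgather_alg choose_allgather_alg(int scount, MPI_Datatype sdtype, MPI_Comm comm)
{
  int comm_size, type_size;
  MPI_Comm_size(comm, &comm_size);
  MPI_Type_size(sdtype, &type_size);
  size_t total_dsize = (size_t)type_size * (size_t)scount * (size_t)comm_size;

  if (comm_size == 2)
    return ALLGATHER_PAIR;                               /* special case                  */

  int is_pow2 = (comm_size & (comm_size - 1)) == 0;      /* comm_size >= 1 holds here     */
  if (total_dsize < 50000)
    return is_pow2 ? ALLGATHER_RDB : ALLGATHER_BRUCK;    /* small total payload           */
  return (comm_size % 2) ? ALLGATHER_RING                /* odd process count: ring       */
                         : ALLGATHER_NEIGHBOREXCHANGE;   /* even: neighbor exchange       */
}
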