Example #1
int
Coll_reduce_flat_tree::reduce(const void *sbuf, void *rbuf, int count,
                                 MPI_Datatype dtype, MPI_Op op,
                                 int root, MPI_Comm comm)
{
  int i, tag = COLL_TAG_REDUCE;
  int size;
  int rank;
  MPI_Aint extent;
  unsigned char* origin = nullptr;
  const unsigned char* inbuf;
  MPI_Status status;

  rank = comm->rank();
  size = comm->size();

  extent = dtype->get_extent();

  /* If not root, send data to the root. */
  if (rank != root) {
    Request::send(sbuf, count, dtype, root, tag, comm);
    return 0;
  }

  /* Root receives and reduces messages.  Allocate buffer to receive
     messages. */

  if (size > 1)
    origin = smpi_get_tmp_recvbuffer(count * extent);

  /* Initialize the receive buffer. */
  if (rank == (size - 1))
    Request::sendrecv(sbuf, count, dtype, rank, tag,
                 rbuf, count, dtype, rank, tag, comm, &status);
  else
    Request::recv(rbuf, count, dtype, size - 1, tag, comm, &status);

  /* Loop receiving and calling reduction function (C or Fortran). */

  for (i = size - 2; i >= 0; --i) {
    if (rank == i)
      inbuf = static_cast<const unsigned char*>(sbuf);
    else {
      Request::recv(origin, count, dtype, i, tag, comm, &status);
      inbuf = origin;
    }

    /* Call reduction function. */
    if(op!=MPI_OP_NULL) op->apply( inbuf, rbuf, &count, dtype);

  }

  smpi_free_tmp_buffer(origin);

  /* All done */
  return 0;
}
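The flat-tree reduce above has every non-root rank send its buffer straight to the root, which then folds the contributions in one at a time, from rank size-1 down to rank 0. A minimal sketch of the same pattern in plain MPI, assuming an int/MPI_SUM payload (an illustration of the communication scheme only, not the SMPI internals):

/* Sketch: flat-tree reduce of ints with a sum, mirroring the loop above:
 * leaves send to the root, the root folds contributions in rank order
 * size-1 down to 0. */
#include <mpi.h>
#include <stdlib.h>
#include <string.h>

static void flat_tree_reduce_sum(const int *sbuf, int *rbuf, int count,
                                 int root, MPI_Comm comm)
{
  int rank, size, i, j;
  MPI_Comm_rank(comm, &rank);
  MPI_Comm_size(comm, &size);

  if (rank != root) {                      /* non-root ranks just send */
    MPI_Send(sbuf, count, MPI_INT, root, 0, comm);
    return;
  }

  int *inbuf = malloc(count * sizeof(int));
  /* seed rbuf with the contribution of rank size-1 (our own if we are it) */
  if (rank == size - 1)
    memcpy(rbuf, sbuf, count * sizeof(int));
  else
    MPI_Recv(rbuf, count, MPI_INT, size - 1, 0, comm, MPI_STATUS_IGNORE);

  for (i = size - 2; i >= 0; --i) {        /* fold in the remaining ranks */
    const int *src = sbuf;
    if (i != rank) {
      MPI_Recv(inbuf, count, MPI_INT, i, 0, comm, MPI_STATUS_IGNORE);
      src = inbuf;
    }
    for (j = 0; j < count; ++j)
      rbuf[j] += src[j];
  }
  free(inbuf);
}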
Example #2
int smpi_coll_tuned_gather_mvapich2_two_level(void *sendbuf,
                                            int sendcnt,
                                            MPI_Datatype sendtype,
                                            void *recvbuf,
                                            int recvcnt,
                                            MPI_Datatype recvtype,
                                            int root,
                                            MPI_Comm comm)
{
    void *leader_gather_buf = NULL;
    int comm_size, rank;
    int local_rank, local_size;
    int leader_comm_rank = -1, leader_comm_size = 0;
    int mpi_errno = MPI_SUCCESS;
    int recvtype_size = 0, sendtype_size = 0, nbytes=0;
    int leader_root, leader_of_root;
    MPI_Status status;
    MPI_Aint sendtype_extent = 0, recvtype_extent = 0;  /* Datatype extent */
    MPI_Aint true_lb, sendtype_true_extent, recvtype_true_extent;
    MPI_Comm shmem_comm, leader_comm;
    void* tmp_buf = NULL;
    

    // if not set (i.e. the algorithm is used directly, without the mvapich2 selector)
    if(MV2_Gather_intra_node_function==NULL)
      MV2_Gather_intra_node_function=smpi_coll_tuned_gather_mpich;
    
    if(smpi_comm_get_leaders_comm(comm)==MPI_COMM_NULL){
      smpi_comm_init_smp(comm);
    }
    comm_size = smpi_comm_size(comm);
    rank = smpi_comm_rank(comm);

    if (((rank == root) && (recvcnt == 0)) ||
        ((rank != root) && (sendcnt == 0))) {
        return MPI_SUCCESS;
    }

    if (sendtype != MPI_DATATYPE_NULL) {
        sendtype_extent=smpi_datatype_get_extent(sendtype);
        sendtype_size=smpi_datatype_size(sendtype);
        smpi_datatype_extent(sendtype, &true_lb,
                                       &sendtype_true_extent);
    }
    if (recvtype != MPI_DATATYPE_NULL) {
        recvtype_extent=smpi_datatype_get_extent(recvtype);
        recvtype_size=smpi_datatype_size(recvtype);
        smpi_datatype_extent(recvtype, &true_lb,
                                       &recvtype_true_extent);
    }

    /* extract the rank,size information for the intra-node
     * communicator */
    shmem_comm = smpi_comm_get_intra_comm(comm);
    local_rank = smpi_comm_rank(shmem_comm);
    local_size = smpi_comm_size(shmem_comm);
    
    if (local_rank == 0) {
        /* Node leader. Extract the rank, size information for the leader
         * communicator */
        leader_comm = smpi_comm_get_leaders_comm(comm);
        if(leader_comm==MPI_COMM_NULL){
          leader_comm = MPI_COMM_WORLD;
        }
        leader_comm_size = smpi_comm_size(leader_comm);
        leader_comm_rank = smpi_comm_rank(leader_comm);
    }

    if (rank == root) {
        nbytes = recvcnt * recvtype_size;

    } else {
        nbytes = sendcnt * sendtype_size;
    }

#if defined(_SMP_LIMIC_)
     if((g_use_limic2_coll) && (shmem_commptr->ch.use_intra_sock_comm == 1) 
         && (use_limic_gather)
         &&((num_scheme == USE_GATHER_PT_PT_BINOMIAL) 
            || (num_scheme == USE_GATHER_PT_PT_DIRECT)
            ||(num_scheme == USE_GATHER_PT_LINEAR_BINOMIAL) 
            || (num_scheme == USE_GATHER_PT_LINEAR_DIRECT)
            || (num_scheme == USE_GATHER_LINEAR_PT_BINOMIAL)
            || (num_scheme == USE_GATHER_LINEAR_PT_DIRECT)
            || (num_scheme == USE_GATHER_LINEAR_LINEAR)
            || (num_scheme == USE_GATHER_SINGLE_LEADER))) {
            
            mpi_errno = MV2_Gather_intra_node_function(sendbuf, sendcnt, sendtype,
                                                    recvbuf, recvcnt,recvtype, 
                                                    root, comm);
     } else

#endif/*#if defined(_SMP_LIMIC_)*/    
    {
        if (local_rank == 0) {
            /* Node leader, allocate tmp_buffer */
            if (rank == root) {
                tmp_buf = smpi_get_tmp_recvbuffer(recvcnt * MAX(recvtype_extent,
                            recvtype_true_extent) * local_size);
            } else {
                tmp_buf = smpi_get_tmp_sendbuffer(sendcnt * MAX(sendtype_extent,
                            sendtype_true_extent) *
                        local_size);
            }
            if (tmp_buf == NULL) {
                mpi_errno = MPI_ERR_OTHER;
                return mpi_errno;
            }
        }
         /* While running the mpich2 gather test, which splits the
          * communicator, we can reach a point where use_intra_sock_comm == 0
          * although the intra-node function is MPIR_Intra_node_LIMIC_Gather_MV2,
          * which would still use the intra-socket communicator. In such cases
          * we fall back to binomial as the default. */
#if defined(_SMP_LIMIC_)         
        if(*MV2_Gather_intra_node_function == MPIR_Intra_node_LIMIC_Gather_MV2) {

            mpi_errno  = MPIR_pt_pt_intra_gather(sendbuf,sendcnt, sendtype,
                                                 recvbuf, recvcnt, recvtype,
                                                 root, rank, 
                                                 tmp_buf, nbytes, 
                                                 TEMP_BUF_HAS_NO_DATA,
                                                 shmem_commptr,
                                                 MPIR_Gather_intra);
        } else
#endif
        {
            /*We are gathering the data into tmp_buf and the output
             * will be of MPI_BYTE datatype. Since the tmp_buf has no
             * local data, we pass is_data_avail = TEMP_BUF_HAS_NO_DATA*/
            mpi_errno  = MPIR_pt_pt_intra_gather(sendbuf,sendcnt, sendtype,
                                                 recvbuf, recvcnt, recvtype,
                                                 root, rank, 
                                                 tmp_buf, nbytes, 
                                                 TEMP_BUF_HAS_NO_DATA,
                                                 shmem_comm,
                                                 MV2_Gather_intra_node_function
                                                 );
        }
    }
    leader_comm = smpi_comm_get_leaders_comm(comm);
    int* leaders_map = smpi_comm_get_leaders_map(comm);
    leader_of_root = smpi_group_rank(smpi_comm_group(comm),leaders_map[root]);
    leader_root = smpi_group_rank(smpi_comm_group(leader_comm),leaders_map[root]);
    /* leader_root is the rank of the leader of the root in leader_comm. 
     * leader_root is to be used as the root of the inter-leader gather ops 
     */
    if (!smpi_comm_is_uniform(comm)) {
        if (local_rank == 0) {
            int *displs = NULL;
            int *recvcnts = NULL;
            int *node_sizes;
            int i = 0;
            /* Node leaders have all the data. But, different nodes can have
             * different number of processes. Do a Gather first to get the 
             * buffer lengths at each leader, followed by a Gatherv to move
             * the actual data */

            if (leader_comm_rank == leader_root && root != leader_of_root) {
                /* The root of the Gather operation is not a node-level 
                 * leader and this process's rank in the leader_comm 
                 * is the same as leader_root */
                if(rank == root) { 
                    leader_gather_buf = smpi_get_tmp_recvbuffer(recvcnt *
                                                MAX(recvtype_extent,
                                                recvtype_true_extent) *
                                                comm_size);
                } else { 
                    leader_gather_buf = smpi_get_tmp_sendbuffer(sendcnt *
                                                MAX(sendtype_extent,
                                                sendtype_true_extent) *
                                                comm_size);
                } 
                if (leader_gather_buf == NULL) {
                    mpi_errno =  MPI_ERR_OTHER;
                    return mpi_errno;
                }
            }

            node_sizes = smpi_comm_get_non_uniform_map(comm);

            if (leader_comm_rank == leader_root) {
                displs = xbt_malloc(sizeof (int) * leader_comm_size);
                recvcnts = xbt_malloc(sizeof (int) * leader_comm_size);
                if (!displs || !recvcnts) {
                    mpi_errno = MPI_ERR_OTHER;
                    return mpi_errno;
                }
            }

            if (root == leader_of_root) {
                /* The root of the gather operation is also the node 
                 * leader. Receive into recvbuf and we are done */
                if (leader_comm_rank == leader_root) {
                    recvcnts[0] = node_sizes[0] * recvcnt;
                    displs[0] = 0;

                    for (i = 1; i < leader_comm_size; i++) {
                        displs[i] = displs[i - 1] + node_sizes[i - 1] * recvcnt;
                        recvcnts[i] = node_sizes[i] * recvcnt;
                    }
                } 
                smpi_mpi_gatherv(tmp_buf,
                                         local_size * nbytes,
                                         MPI_BYTE, recvbuf, recvcnts,
                                         displs, recvtype,
                                         leader_root, leader_comm);
            } else {
                /* The root of the gather operation is not the node leader. 
                 * Receive into leader_gather_buf and then send 
                 * to the root */
                if (leader_comm_rank == leader_root) {
                    recvcnts[0] = node_sizes[0] * nbytes;
                    displs[0] = 0;

                    for (i = 1; i < leader_comm_size; i++) {
                        displs[i] = displs[i - 1] + node_sizes[i - 1] * nbytes;
                        recvcnts[i] = node_sizes[i] * nbytes;
                    }
                } 
                smpi_mpi_gatherv(tmp_buf, local_size * nbytes,
                                         MPI_BYTE, leader_gather_buf,
                                         recvcnts, displs, MPI_BYTE,
                                         leader_root, leader_comm);
            }
            if (leader_comm_rank == leader_root) {
                xbt_free(displs);
                xbt_free(recvcnts);
            }
        }
    } else {
        /* All nodes have the same number of processes. 
         * Just do one Gather to get all 
         * the data at the leader of the root process */
        if (local_rank == 0) {
            if (leader_comm_rank == leader_root && root != leader_of_root) {
                /* The root of the Gather operation is not a node-level leader
                 */
                leader_gather_buf = smpi_get_tmp_sendbuffer(nbytes * comm_size);
                if (leader_gather_buf == NULL) {
                    mpi_errno = MPI_ERR_OTHER;
                    return mpi_errno;
                }
            }
            if (root == leader_of_root) {
                mpi_errno = MPIR_Gather_MV2_Direct(tmp_buf,
                                                   nbytes * local_size,
                                                   MPI_BYTE, recvbuf,
                                                   recvcnt * local_size,
                                                   recvtype, leader_root,
                                                   leader_comm);
                 
            } else {
                mpi_errno = MPIR_Gather_MV2_Direct(tmp_buf, nbytes * local_size,
                                                   MPI_BYTE, leader_gather_buf,
                                                   nbytes * local_size,
                                                   MPI_BYTE, leader_root,
                                                   leader_comm);
            }
        }
    }
    if ((local_rank == 0) && (root != rank)
        && (leader_of_root == rank)) {
        smpi_mpi_send(leader_gather_buf,
                                 nbytes * comm_size, MPI_BYTE,
                                 root, COLL_TAG_GATHER, comm);
    }

    if (rank == root && local_rank != 0) {
        /* The root of the gather operation is not the node leader. Receive
         * data from the node leader */
        smpi_mpi_recv(recvbuf, recvcnt * comm_size, recvtype,
                                 leader_of_root, COLL_TAG_GATHER, comm,
                                 &status);
    }

    /* Free the temporary buffers on the node leaders */
    if (local_rank == 0 ) {
        if (tmp_buf != NULL) {
            smpi_free_tmp_buffer(tmp_buf);
        }
        if (leader_gather_buf != NULL) {
            smpi_free_tmp_buffer(leader_gather_buf);
        }
    }

    return (mpi_errno);
}
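The two-level gather depends on the communicator having been split into an intra-node communicator (shmem_comm) and a leader communicator, which SMPI prepares in smpi_comm_init_smp(). Outside SimGrid, a comparable decomposition can be built with standard MPI-3 calls; a minimal sketch (the helper name and the choice of split keys are illustrative only):

/* Sketch: build the two communicator levels the gather above operates on.
 * Requires MPI-3 for MPI_Comm_split_type. */
#include <mpi.h>

static void build_two_level_comms(MPI_Comm comm, MPI_Comm *shmem_comm,
                                  MPI_Comm *leader_comm)
{
  int rank, local_rank;
  MPI_Comm_rank(comm, &rank);

  /* processes on the same node end up in the same shmem_comm */
  MPI_Comm_split_type(comm, MPI_COMM_TYPE_SHARED, rank, MPI_INFO_NULL,
                      shmem_comm);
  MPI_Comm_rank(*shmem_comm, &local_rank);

  /* local rank 0 of every node becomes a leader; everyone else gets
   * MPI_COMM_NULL for leader_comm */
  MPI_Comm_split(comm, (local_rank == 0) ? 0 : MPI_UNDEFINED, rank,
                 leader_comm);
}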
Example #3
int Coll_reduce_binomial::reduce(void *sendbuf, void *recvbuf, int count,
                                    MPI_Datatype datatype, MPI_Op op, int root,
                                    MPI_Comm comm)
{
  MPI_Status status;
  int comm_size, rank;
  int mask, relrank, source;
  int dst;
  int tag = COLL_TAG_REDUCE;
  MPI_Aint extent;
  void *tmp_buf;
  MPI_Aint true_lb, true_extent;
  if (count == 0)
    return 0;
  rank = comm->rank();
  comm_size = comm->size();

  extent = datatype->get_extent();

  tmp_buf = (void *) smpi_get_tmp_sendbuffer(count * extent);
  int is_commutative =  (op==MPI_OP_NULL || op->is_commutative());
  mask = 1;

  int lroot;
  if (is_commutative)
        lroot   = root;
  else
        lroot   = 0;
  relrank = (rank - lroot + comm_size) % comm_size;

  datatype->extent(&true_lb, &true_extent);

  /* adjust for potential negative lower bound in datatype */
  tmp_buf = (void *)((char*)tmp_buf - true_lb);

  /* If I'm not the root, then my recvbuf may not be valid, therefore
     I have to allocate a temporary one */
  if (rank != root) {
      recvbuf = (void*)smpi_get_tmp_recvbuffer(count * std::max(extent, true_extent));
      recvbuf = (void *)((char*)recvbuf - true_lb);
  }
   if ((rank != root) || (sendbuf != MPI_IN_PLACE)) {
      Datatype::copy(sendbuf, count, datatype, recvbuf,count, datatype);
  }

  while (mask < comm_size) {
    /* Receive */
    if ((mask & relrank) == 0) {
      source = (relrank | mask);
      if (source < comm_size) {
        source = (source + lroot) % comm_size;
        Request::recv(tmp_buf, count, datatype, source, tag, comm, &status);

        if (is_commutative) {
          if(op!=MPI_OP_NULL) op->apply( tmp_buf, recvbuf, &count, datatype);
        } else {
          if(op!=MPI_OP_NULL) op->apply( recvbuf, tmp_buf, &count, datatype);
          Datatype::copy(tmp_buf, count, datatype,recvbuf, count, datatype);
        }
      }
    } else {
      dst = ((relrank & (~mask)) + lroot) % comm_size;
      Request::send(recvbuf, count, datatype, dst, tag, comm);
      break;
    }
    mask <<= 1;
  }

  if (not is_commutative && (root != 0)) {
    if (rank == 0){
      Request::send(recvbuf, count, datatype, root,tag, comm);
    }else if (rank == root){
      Request::recv(recvbuf, count, datatype, 0, tag, comm, &status);
    }
  }

  if (rank != root) {
    smpi_free_tmp_buffer(recvbuf);
  }
  smpi_free_tmp_buffer(tmp_buf);

  return 0;
}
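The mask/relrank loop above decides, at each level of the binomial tree, whether a process receives from relrank | mask or sends to its parent and drops out; rotating ranks by lroot puts the (possibly relocated) root at the top of the tree. A standalone sketch of just that schedule, with no MPI involved:

/* Sketch: print the binomial-tree schedule used by the reduce above.
 * For each rank, list who it receives from and whom it finally sends to. */
#include <stdio.h>

static void print_binomial_schedule(int comm_size, int lroot)
{
  for (int rank = 0; rank < comm_size; ++rank) {
    int relrank = (rank - lroot + comm_size) % comm_size;
    printf("rank %d:", rank);
    for (int mask = 1; mask < comm_size; mask <<= 1) {
      if ((mask & relrank) == 0) {
        int source = relrank | mask;
        if (source < comm_size)
          printf(" recv<-%d", (source + lroot) % comm_size);
      } else {
        printf(" send->%d", ((relrank & ~mask) + lroot) % comm_size);
        break;                      /* after sending, this rank is done */
      }
    }
    printf("\n");
  }
}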
 int smpi_coll_tuned_allreduce_mvapich2_rs(void *sendbuf,
                             void *recvbuf,
                             int count,
                             MPI_Datatype datatype,
                             MPI_Op op, MPI_Comm comm)
{
    int comm_size, rank;
    int mpi_errno = MPI_SUCCESS;
    int mask, dst, is_commutative, pof2, newrank = 0, rem, newdst, i,
        send_idx, recv_idx, last_idx, send_cnt, recv_cnt, *cnts, *disps;
    MPI_Aint true_lb, true_extent, extent;
    void *tmp_buf, *tmp_buf_free;

    if (count == 0) {
        return MPI_SUCCESS;
    }

    /* homogeneous */

    comm_size =  smpi_comm_size(comm);
    rank = smpi_comm_rank(comm);

    is_commutative = smpi_op_is_commute(op);

    /* need to allocate temporary buffer to store incoming data */
    smpi_datatype_extent(datatype, &true_lb, &true_extent);
    extent = smpi_datatype_get_extent(datatype);

    tmp_buf_free= smpi_get_tmp_recvbuffer(count * (MAX(extent, true_extent)));

    /* adjust for potential negative lower bound in datatype */
    tmp_buf = (void *) ((char *) tmp_buf_free - true_lb);

    /* copy local data into recvbuf */
    if (sendbuf != MPI_IN_PLACE) {
        mpi_errno =
            smpi_datatype_copy(sendbuf, count, datatype, recvbuf, count,
                           datatype);
    }

    /* find nearest power-of-two less than or equal to comm_size */
    for( pof2 = 1; pof2 <= comm_size; pof2 <<= 1 );
    pof2 >>=1;

    rem = comm_size - pof2;

    /* In the non-power-of-two case, all even-numbered
       processes of rank < 2*rem send their data to
       (rank+1). These even-numbered processes no longer
       participate in the algorithm until the very end. The
       remaining processes form a nice power-of-two. */

    if (rank < 2 * rem) {
        if (rank % 2 == 0) {
            /* even */
            smpi_mpi_send(recvbuf, count, datatype, rank + 1,
                                     COLL_TAG_ALLREDUCE, comm);

            /* temporarily set the rank to -1 so that this
               process does not participate in recursive
               doubling */
            newrank = -1;
        } else {
            /* odd */
            smpi_mpi_recv(tmp_buf, count, datatype, rank - 1,
                                     COLL_TAG_ALLREDUCE, comm,
                                     MPI_STATUS_IGNORE);
            /* do the reduction on received data. since the
               ordering is right, it doesn't matter whether
               the operation is commutative or not. */
            smpi_op_apply(op, tmp_buf, recvbuf, &count, &datatype);
            /* change the rank */
            newrank = rank / 2;
        }
    } else {                /* rank >= 2*rem */
        newrank = rank - rem;
    }

    /* If op is user-defined or count is less than pof2, use
       recursive doubling algorithm. Otherwise do a reduce-scatter
       followed by allgather. (If op is user-defined,
       derived datatypes are allowed and the user could pass basic
       datatypes on one process and derived on another as long as
       the type maps are the same. Breaking up derived
       datatypes to do the reduce-scatter is tricky, therefore
       using recursive doubling in that case.) */

    if (newrank != -1) {
        if (/*(HANDLE_GET_KIND(op) != HANDLE_KIND_BUILTIN) ||*/ (count < pof2)) {  /* use recursive doubling */
            mask = 0x1;
            while (mask < pof2) {
                newdst = newrank ^ mask;
                /* find real rank of dest */
                dst = (newdst < rem) ? newdst * 2 + 1 : newdst + rem;

                /* Send the most current data, which is in recvbuf. Recv
                   into tmp_buf */
                smpi_mpi_sendrecv(recvbuf, count, datatype,
                                             dst, COLL_TAG_ALLREDUCE,
                                             tmp_buf, count, datatype, dst,
                                             COLL_TAG_ALLREDUCE, comm,
                                             MPI_STATUS_IGNORE);

                /* tmp_buf contains data received in this step.
                   recvbuf contains data accumulated so far */

                if (is_commutative || (dst < rank)) {
                    /* op is commutative OR the order is already right */
                     smpi_op_apply(op, tmp_buf, recvbuf, &count, &datatype);
                } else {
                    /* op is noncommutative and the order is not right */
                    smpi_op_apply(op, recvbuf, tmp_buf, &count, &datatype);
                    /* copy result back into recvbuf */
                    mpi_errno = smpi_datatype_copy(tmp_buf, count, datatype,
                                               recvbuf, count, datatype);
                }
                mask <<= 1;
            }
        } else {

            /* do a reduce-scatter followed by allgather */

            /* for the reduce-scatter, calculate the count that
               each process receives and the displacement within
               the buffer */
            cnts = (int *)xbt_malloc(pof2 * sizeof (int));
            disps = (int *)xbt_malloc(pof2 * sizeof (int));

            for (i = 0; i < (pof2 - 1); i++) {
                cnts[i] = count / pof2;
            }
            cnts[pof2 - 1] = count - (count / pof2) * (pof2 - 1);

            disps[0] = 0;
            for (i = 1; i < pof2; i++) {
                disps[i] = disps[i - 1] + cnts[i - 1];
            }

            mask = 0x1;
            send_idx = recv_idx = 0;
            last_idx = pof2;
            while (mask < pof2) {
                newdst = newrank ^ mask;
                /* find real rank of dest */
                dst = (newdst < rem) ? newdst * 2 + 1 : newdst + rem;

                send_cnt = recv_cnt = 0;
                if (newrank < newdst) {
                    send_idx = recv_idx + pof2 / (mask * 2);
                    for (i = send_idx; i < last_idx; i++)
                        send_cnt += cnts[i];
                    for (i = recv_idx; i < send_idx; i++)
                        recv_cnt += cnts[i];
                } else {
                    recv_idx = send_idx + pof2 / (mask * 2);
                    for (i = send_idx; i < recv_idx; i++)
                        send_cnt += cnts[i];
                    for (i = recv_idx; i < last_idx; i++)
                        recv_cnt += cnts[i];
                }

                /* Send data from recvbuf. Recv into tmp_buf */
                smpi_mpi_sendrecv((char *) recvbuf +
                                             disps[send_idx] * extent,
                                             send_cnt, datatype,
                                             dst, COLL_TAG_ALLREDUCE,
                                             (char *) tmp_buf +
                                             disps[recv_idx] * extent,
                                             recv_cnt, datatype, dst,
                                             COLL_TAG_ALLREDUCE, comm,
                                             MPI_STATUS_IGNORE);

                /* tmp_buf contains data received in this step.
                   recvbuf contains data accumulated so far */

                /* This algorithm is used only for predefined ops
                   and predefined ops are always commutative. */

                smpi_op_apply(op, (char *) tmp_buf + disps[recv_idx] * extent,
                        (char *) recvbuf + disps[recv_idx] * extent,
                        &recv_cnt, &datatype);

                /* update send_idx for next iteration */
                send_idx = recv_idx;
                mask <<= 1;

                /* update last_idx, but not in last iteration
                   because the value is needed in the allgather
                   step below. */
                if (mask < pof2)
                    last_idx = recv_idx + pof2 / mask;
            }

            /* now do the allgather */

            mask >>= 1;
            while (mask > 0) {
                newdst = newrank ^ mask;
                /* find real rank of dest */
                dst = (newdst < rem) ? newdst * 2 + 1 : newdst + rem;

                send_cnt = recv_cnt = 0;
                if (newrank < newdst) {
                    /* update last_idx except on first iteration */
                    if (mask != pof2 / 2) {
                        last_idx = last_idx + pof2 / (mask * 2);
                    }

                    recv_idx = send_idx + pof2 / (mask * 2);
                    for (i = send_idx; i < recv_idx; i++) {
                        send_cnt += cnts[i];
                    }
                    for (i = recv_idx; i < last_idx; i++) {
                        recv_cnt += cnts[i];
                    }
                } else {
                    recv_idx = send_idx - pof2 / (mask * 2);
                    for (i = send_idx; i < last_idx; i++) {
                        send_cnt += cnts[i];
                    }
                    for (i = recv_idx; i < send_idx; i++) {
                        recv_cnt += cnts[i];
                    }
                }

               smpi_mpi_sendrecv((char *) recvbuf +
                                             disps[send_idx] * extent,
                                             send_cnt, datatype,
                                             dst, COLL_TAG_ALLREDUCE,
                                             (char *) recvbuf +
                                             disps[recv_idx] * extent,
                                             recv_cnt, datatype, dst,
                                             COLL_TAG_ALLREDUCE, comm,
                                             MPI_STATUS_IGNORE);
                if (newrank > newdst) {
                    send_idx = recv_idx;
                }

                mask >>= 1;
            }
        }
    }

    /* In the non-power-of-two case, all odd-numbered
       processes of rank < 2*rem send the result to
       (rank-1), the ranks who didn't participate above. */
    if (rank < 2 * rem) {
        if (rank % 2) {     /* odd */
            smpi_mpi_send(recvbuf, count,
                                     datatype, rank - 1,
                                     COLL_TAG_ALLREDUCE, comm);
        } else {            /* even */
            smpi_mpi_recv(recvbuf, count,
                                  datatype, rank + 1,
                                  COLL_TAG_ALLREDUCE, comm,
                                  MPI_STATUS_IGNORE);
        }
    }
    smpi_free_tmp_buffer(tmp_buf_free);
    return (mpi_errno);

}
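The reduce-scatter/allgather branch first folds the group down to the nearest power of two (the rem "extra" ranks hand their data to a neighbour and sit out) and then splits count into pof2 nearly equal chunks described by cnts and disps. A sketch of just that setup arithmetic, matching the loops above (the helper name is mine):

/* Sketch: power-of-two shrink and chunk layout used by the
 * reduce-scatter + allgather branch above. */
#include <stdlib.h>

static void rs_layout(int comm_size, int count, int *pof2_out, int *rem_out,
                      int **cnts_out, int **disps_out)
{
  int pof2, rem, i;

  /* nearest power of two <= comm_size */
  for (pof2 = 1; pof2 <= comm_size; pof2 <<= 1)
    ;
  pof2 >>= 1;
  rem = comm_size - pof2;             /* ranks folded away in the pre-step */

  int *cnts  = malloc(pof2 * sizeof(int));
  int *disps = malloc(pof2 * sizeof(int));
  for (i = 0; i < pof2 - 1; i++)
    cnts[i] = count / pof2;
  cnts[pof2 - 1] = count - (count / pof2) * (pof2 - 1); /* last chunk absorbs the remainder */
  disps[0] = 0;
  for (i = 1; i < pof2; i++)
    disps[i] = disps[i - 1] + cnts[i - 1];

  *pof2_out = pof2;
  *rem_out  = rem;
  *cnts_out = cnts;
  *disps_out = disps;
}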
Example #5
int smpi_coll_tuned_reduce_mvapich2_knomial (
        void *sendbuf,
        void *recvbuf,
        int count,
        MPI_Datatype datatype,
        MPI_Op op,
        int root,
        MPI_Comm comm)
{
    int mpi_errno = MPI_SUCCESS;
    int rank, is_commutative;
    int src, k;
    MPI_Request send_request;
    int index=0;
    MPI_Aint true_lb, true_extent, extent;
    MPI_Status status; 
    int recv_iter=0, dst=-1, expected_send_count, expected_recv_count;
    int *src_array=NULL;
    void **tmp_buf=NULL;
    MPI_Request *requests=NULL;


    if (count == 0) return MPI_SUCCESS;

    rank = smpi_comm_rank(comm);

    /* Create a temporary buffer */

    smpi_datatype_extent(datatype, &true_lb, &true_extent);
    extent = smpi_datatype_get_extent(datatype);

    is_commutative = smpi_op_is_commute(op);

    if (rank != root) {
        recvbuf=(void *)smpi_get_tmp_recvbuffer(count*(MAX(extent,true_extent)));
        recvbuf = (void *)((char*)recvbuf - true_lb);
    }

    if ((rank != root) || (sendbuf != MPI_IN_PLACE)) {
        mpi_errno = smpi_datatype_copy(sendbuf, count, datatype, recvbuf,
                count, datatype);
    }


    if(mv2_reduce_intra_knomial_factor<0)
      {
        mv2_reduce_intra_knomial_factor = SMPI_DEFAULT_KNOMIAL_FACTOR;
      }
    if(mv2_reduce_inter_knomial_factor<0)
      {
        mv2_reduce_inter_knomial_factor = SMPI_DEFAULT_KNOMIAL_FACTOR;
      }


    MPIR_Reduce_knomial_trace(root, mv2_reduce_intra_knomial_factor, comm, 
           &dst, &expected_send_count, &expected_recv_count, &src_array);

    if(expected_recv_count > 0 ) {
        tmp_buf  = xbt_malloc(sizeof(void *)*expected_recv_count);
        requests = xbt_malloc(sizeof(MPI_Request)*expected_recv_count);
        for(k=0; k < expected_recv_count; k++ ) {
            tmp_buf[k] = smpi_get_tmp_sendbuffer(count*(MAX(extent,true_extent)));
            tmp_buf[k] = (void *)((char*)tmp_buf[k] - true_lb);
        }

        while(recv_iter  < expected_recv_count) {
            src = src_array[expected_recv_count - (recv_iter+1)];

            requests[recv_iter]=smpi_mpi_irecv (tmp_buf[recv_iter], count, datatype ,src,
                    COLL_TAG_REDUCE, comm);
            recv_iter++;

        }

        recv_iter=0;
        while(recv_iter < expected_recv_count) {
            index=smpi_mpi_waitany(expected_recv_count, requests,
                    &status);
            recv_iter++;

            if (is_commutative) {
              smpi_op_apply(op, tmp_buf[index], recvbuf, &count, &datatype);
            }
        }

        for(k=0; k < expected_recv_count; k++ ) {
            smpi_free_tmp_buffer(tmp_buf[k]);
        }
        xbt_free(tmp_buf);
        xbt_free(requests);
    }

    if(src_array != NULL) { 
        xbt_free(src_array);
    } 

    if(rank != root) {
        send_request=smpi_mpi_isend(recvbuf,count, datatype, dst,
                COLL_TAG_REDUCE,comm);

        smpi_mpi_waitall(1, &send_request, &status);

        smpi_free_tmp_buffer((void *)((char*)recvbuf + true_lb));
    }

    /* --END ERROR HANDLING-- */

    return mpi_errno;
}
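After the k-nomial tree has been traced, the receive side above is simply: post one irecv per child, then fold each message into recvbuf in whatever order MPI_Waitany completes them, which is only valid because the operation is commutative (hence the is_commutative test). A comparable pattern in plain MPI, as a sketch (the helper name is mine; MPI_Reduce_local is the standard call for applying op locally, and the buffer layout assumes a zero lower bound for brevity):

/* Sketch: receive from nsrc children and fold each arrival into recvbuf,
 * which already holds this process's own contribution. Commutative ops only. */
#include <mpi.h>
#include <stdlib.h>

static void fold_children(void *recvbuf, int count, MPI_Datatype dtype,
                          MPI_Op op, const int *sources, int nsrc,
                          int tag, MPI_Comm comm)
{
  MPI_Aint lb, extent;
  MPI_Type_get_extent(dtype, &lb, &extent);        /* assume lb == 0 here */

  MPI_Request *reqs = malloc(nsrc * sizeof(MPI_Request));
  char *tmp = malloc((size_t)nsrc * count * extent);

  for (int k = 0; k < nsrc; ++k)
    MPI_Irecv(tmp + (size_t)k * count * extent, count, dtype,
              sources[k], tag, comm, &reqs[k]);

  for (int done = 0; done < nsrc; ++done) {
    int idx;
    MPI_Waitany(nsrc, reqs, &idx, MPI_STATUS_IGNORE);
    /* fold the freshly arrived contribution into the accumulator */
    MPI_Reduce_local(tmp + (size_t)idx * count * extent, recvbuf,
                     count, dtype, op);
  }
  free(tmp);
  free(reqs);
}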
Example #6
int smpi_coll_tuned_reduce_binomial(void *sendbuf, void *recvbuf, int count,
                                    MPI_Datatype datatype, MPI_Op op, int root,
                                    MPI_Comm comm)
{
  MPI_Status status;
  int comm_size, rank;
  int mask, relrank, source;
  int dst;
  int tag = COLL_TAG_REDUCE;
  MPI_Aint extent;
  void *tmp_buf;
  MPI_Aint true_lb, true_extent;
  if (count == 0)
    return 0;
  rank = smpi_comm_rank(comm);
  comm_size = smpi_comm_size(comm);

  extent = smpi_datatype_get_extent(datatype);

  tmp_buf = (void *) smpi_get_tmp_sendbuffer(count * extent);
  int is_commutative = smpi_op_is_commute(op);
  mask = 1;
  
  int lroot;
  if (is_commutative) 
        lroot   = root;
  else
        lroot   = 0;
  relrank = (rank - lroot + comm_size) % comm_size;

  smpi_datatype_extent(datatype, &true_lb, &true_extent);

  /* adjust for potential negative lower bound in datatype */
  tmp_buf = (void *)((char*)tmp_buf - true_lb);
    
  /* If I'm not the root, then my recvbuf may not be valid, therefore
     I have to allocate a temporary one */
  if (rank != root) {
      recvbuf = (void *) smpi_get_tmp_recvbuffer(count*(MAX(extent,true_extent)));
      recvbuf = (void *)((char*)recvbuf - true_lb);
  }
   if ((rank != root) || (sendbuf != MPI_IN_PLACE)) {
      smpi_datatype_copy(sendbuf, count, datatype, recvbuf,count, datatype);
  }

  while (mask < comm_size) {
    /* Receive */
    if ((mask & relrank) == 0) {
      source = (relrank | mask);
      if (source < comm_size) {
        source = (source + lroot) % comm_size;
        smpi_mpi_recv(tmp_buf, count, datatype, source, tag, comm, &status);
        
        if (is_commutative) {
          smpi_op_apply(op, tmp_buf, recvbuf, &count, &datatype);
        } else {
          smpi_op_apply(op, recvbuf, tmp_buf, &count, &datatype);
          smpi_datatype_copy(tmp_buf, count, datatype,recvbuf, count, datatype);
        }
      }
    } else {
      dst = ((relrank & (~mask)) + lroot) % comm_size;
      smpi_mpi_send(recvbuf, count, datatype, dst, tag, comm);
      break;
    }
    mask <<= 1;
  }

  if (!is_commutative && (root != 0)){
    if (rank == 0){
      smpi_mpi_send(recvbuf, count, datatype, root,tag, comm);
    }else if (rank == root){
      smpi_mpi_recv(recvbuf, count, datatype, 0, tag, comm, &status);
    }
  }

  if (rank != root) {
    smpi_free_tmp_buffer(recvbuf);
  }
  smpi_free_tmp_buffer(tmp_buf);

  return 0;
}
int 
smpi_coll_tuned_allreduce_ompi_ring_segmented(void *sbuf, void *rbuf, int count,
                                               MPI_Datatype dtype,
                                               MPI_Op op,
                                               MPI_Comm comm) 
{
   int ret = MPI_SUCCESS;
   int line;
   int k, recv_from, send_to;
   int early_blockcount, late_blockcount, split_rank; 
   int segcount, max_segcount;
   int num_phases, phase;
   int block_count;
   unsigned int inbi;
   size_t typelng;
   char *tmpsend = NULL, *tmprecv = NULL;
   char *inbuf[2] = {NULL, NULL};
   ptrdiff_t true_extent, extent;
   ptrdiff_t block_offset, max_real_segsize;
   MPI_Request reqs[2] = {NULL, NULL};
   const size_t segsize = 1 << 20; /* 1 MB */
   unsigned int size = smpi_comm_size(comm);
   unsigned int rank = smpi_comm_rank(comm);

   XBT_DEBUG("coll:tuned:allreduce_intra_ring_segmented rank %d, count %d", rank, count);

   /* Special case for size == 1 */
   if (1 == size) {
      if (MPI_IN_PLACE != sbuf) {
      ret= smpi_datatype_copy(sbuf, count, dtype,rbuf, count, dtype);
         if (ret < 0) { line = __LINE__; goto error_hndl; }
      }
      return MPI_SUCCESS;
   }
   
   /* Determine segment count based on the suggested segment size */
   extent = smpi_datatype_get_extent(dtype);
   if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; }
   true_extent = smpi_datatype_get_extent(dtype);
   if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; }
   typelng = smpi_datatype_size(dtype);
   if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; }
   segcount = count;
   COLL_TUNED_COMPUTED_SEGCOUNT(segsize, typelng, segcount)

   /* Special case for count less than size * segcount - use regular ring */
   if (count < size * segcount) {
      XBT_DEBUG( "coll:tuned:allreduce_ring_segmented rank %d/%d, count %d, switching to regular ring", rank, size, count);
      return (smpi_coll_tuned_allreduce_lr(sbuf, rbuf, count, dtype, op, 
                                                   comm));
   }

   /* Determine the number of phases of the algorithm */
   num_phases = count / (size * segcount);
   if ((count % (size * segcount) >= size) && 
       (count % (size * segcount) > ((size * segcount) / 2))) {
      num_phases++;
   }

   /* Determine the number of elements per block and corresponding 
      block sizes.
      The blocks are divided into "early" and "late" ones:
      blocks 0 .. (split_rank - 1) are "early" and 
      blocks (split_rank) .. (size - 1) are "late".
      Early blocks are at most 1 element larger than the late ones.
      Note, these blocks will be split into num_phases segments,
      the largest of which will have max_segcount elements.
    */
   COLL_TUNED_COMPUTE_BLOCKCOUNT( count, size, split_rank, 
                                  early_blockcount, late_blockcount )
   COLL_TUNED_COMPUTE_BLOCKCOUNT( early_blockcount, num_phases, inbi,
                                  max_segcount, k)
   max_real_segsize = true_extent + (max_segcount - 1) * extent;

   /* Allocate and initialize temporary buffers */
   inbuf[0] = (char*)smpi_get_tmp_sendbuffer(max_real_segsize);
   if (NULL == inbuf[0]) { ret = -1; line = __LINE__; goto error_hndl; }
   if (size > 2) {
      inbuf[1] = (char*)smpi_get_tmp_recvbuffer(max_real_segsize);
      if (NULL == inbuf[1]) { ret = -1; line = __LINE__; goto error_hndl; }
   }

   /* Handle MPI_IN_PLACE */
   if (MPI_IN_PLACE != sbuf) {
      ret= smpi_datatype_copy(sbuf, count, dtype,rbuf, count, dtype);
      if (ret < 0) { line = __LINE__; goto error_hndl; }
   }

   /* Computation loop: for each phase, repeat ring allreduce computation loop */
   for (phase = 0; phase < num_phases; phase ++) {
      ptrdiff_t phase_offset;
      int early_phase_segcount, late_phase_segcount, split_phase, phase_count;

      /* 
         For each of the remote nodes:
         - post irecv for block (r-1)
         - send block (r)
           To do this, first compute block offset and count, and use block offset
           to compute phase offset.
         - in loop for every step k = 2 .. n
           - post irecv for block (r + n - k) % n
           - wait on block (r + n - k + 1) % n to arrive
           - compute on block (r + n - k + 1) % n
           - send block (r + n - k + 1) % n
         - wait on block (r + 1)
         - compute on block (r + 1)
         - send block (r + 1) to rank (r + 1)
         Note that we must be careful when computing the beginning of buffers and
         for send operations and computation we must compute the exact block size.
      */
      send_to = (rank + 1) % size;
      recv_from = (rank + size - 1) % size;
      
      inbi = 0;
      /* Initialize first receive from the neighbor on the left */
      reqs[inbi] = smpi_mpi_irecv(inbuf[inbi], max_segcount, dtype, recv_from,
                               666, comm);
      /* Send first block (my block) to the neighbor on the right:
         - compute my block and phase offset
         - send data */
      block_offset = ((rank < split_rank)? 
                      (rank * early_blockcount) : 
                      (rank * late_blockcount + split_rank));
      block_count = ((rank < split_rank)? early_blockcount : late_blockcount);
      COLL_TUNED_COMPUTE_BLOCKCOUNT(block_count, num_phases, split_phase,
                                    early_phase_segcount, late_phase_segcount)
      phase_count = ((phase < split_phase)?
                     (early_phase_segcount) : (late_phase_segcount));
      phase_offset = ((phase < split_phase)?
                      (phase * early_phase_segcount) : 
                      (phase * late_phase_segcount + split_phase));
      tmpsend = ((char*)rbuf) + (block_offset + phase_offset) * extent;
      smpi_mpi_send(tmpsend, phase_count, dtype, send_to,
                              666, comm);
      
      for (k = 2; k < size; k++) {
         const int prevblock = (rank + size - k + 1) % size;
         
         inbi = inbi ^ 0x1;
         
         /* Post irecv for the current block */
         reqs[inbi] = smpi_mpi_irecv(inbuf[inbi], max_segcount, dtype, recv_from,
                               666, comm);
         if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; }
         
         /* Wait on previous block to arrive */
         smpi_mpi_wait(&reqs[inbi ^ 0x1], MPI_STATUS_IGNORE);
         
         /* Apply operation on previous block: result goes to rbuf
            rbuf[prevblock] = inbuf[inbi ^ 0x1] (op) rbuf[prevblock]
         */
         block_offset = ((prevblock < split_rank)?
                         (prevblock * early_blockcount) :
                         (prevblock * late_blockcount + split_rank));
         block_count = ((prevblock < split_rank)? 
                        early_blockcount : late_blockcount);
         COLL_TUNED_COMPUTE_BLOCKCOUNT(block_count, num_phases, split_phase,
                                       early_phase_segcount, late_phase_segcount)
         phase_count = ((phase < split_phase)?
                        (early_phase_segcount) : (late_phase_segcount));
         phase_offset = ((phase < split_phase)?
                         (phase * early_phase_segcount) : 
                         (phase * late_phase_segcount + split_phase));
         tmprecv = ((char*)rbuf) + (block_offset + phase_offset) * extent;
         smpi_op_apply(op, inbuf[inbi ^ 0x1], tmprecv, &phase_count, &dtype);
         /* send previous block to send_to */
         smpi_mpi_send(tmprecv, phase_count, dtype, send_to,
                              666, comm);
      }
      
      /* Wait on the last block to arrive */
      smpi_mpi_wait(&reqs[inbi], MPI_STATUS_IGNORE);

      
      /* Apply operation on the last block (from neighbor (rank + 1)):
         rbuf[rank+1] = inbuf[inbi] (op) rbuf[rank + 1] */
      recv_from = (rank + 1) % size;
      block_offset = ((recv_from < split_rank)?
                      (recv_from * early_blockcount) :
                      (recv_from * late_blockcount + split_rank));
      block_count = ((recv_from < split_rank)? 
                     early_blockcount : late_blockcount);
      COLL_TUNED_COMPUTE_BLOCKCOUNT(block_count, num_phases, split_phase,
                                    early_phase_segcount, late_phase_segcount)
      phase_count = ((phase < split_phase)?
                     (early_phase_segcount) : (late_phase_segcount));
      phase_offset = ((phase < split_phase)?
                      (phase * early_phase_segcount) : 
                      (phase * late_phase_segcount + split_phase));
      tmprecv = ((char*)rbuf) + (block_offset + phase_offset) * extent;
      smpi_op_apply(op, inbuf[inbi], tmprecv, &phase_count, &dtype);
   }

   /* Distribution loop - variation of ring allgather */
   send_to = (rank + 1) % size;
   recv_from = (rank + size - 1) % size;
   for (k = 0; k < size - 1; k++) {
      const int recv_data_from = (rank + size - k) % size;
      const int send_data_from = (rank + 1 + size - k) % size;
      const int send_block_offset = 
         ((send_data_from < split_rank)?
          (send_data_from * early_blockcount) :
          (send_data_from * late_blockcount + split_rank));
      const int recv_block_offset = 
         ((recv_data_from < split_rank)?
          (recv_data_from * early_blockcount) :
          (recv_data_from * late_blockcount + split_rank));
      block_count = ((send_data_from < split_rank)? 
                     early_blockcount : late_blockcount);

      tmprecv = (char*)rbuf + recv_block_offset * extent;
      tmpsend = (char*)rbuf + send_block_offset * extent;

      smpi_mpi_sendrecv(tmpsend, block_count, dtype, send_to,
                                     666,
                                     tmprecv, early_blockcount, dtype, recv_from,
                                     666,
                                     comm, MPI_STATUS_IGNORE);

   }

   if (NULL != inbuf[0]) smpi_free_tmp_buffer(inbuf[0]);
   if (NULL != inbuf[1]) smpi_free_tmp_buffer(inbuf[1]);

   return MPI_SUCCESS;

 error_hndl:
   XBT_DEBUG("%s:%4d\tRank %d Error occurred %d\n",
                __FILE__, line, rank, ret);
   if (NULL != inbuf[0]) smpi_free_tmp_buffer(inbuf[0]);
   if (NULL != inbuf[1]) smpi_free_tmp_buffer(inbuf[1]);
   return ret;
}
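The segmented ring splits the count elements into size blocks, the first split_rank of them ("early") one element larger than the rest ("late"), and each block further into num_phases segments; COLL_TUNED_COMPUTE_BLOCKCOUNT encapsulates the block split, and the block_offset expressions above turn a block index into an element offset. A plain-C sketch of that arithmetic (the helper names are mine, mirroring how the macro and the offsets are used above):

/* Sketch: split 'total' items into 'parts' nearly equal blocks.
 * Blocks 0..split-1 ("early") get one extra element; mirrors how
 * COLL_TUNED_COMPUTE_BLOCKCOUNT is used in the ring-segmented code. */
static void compute_blockcount(int total, int parts,
                               int *split, int *early, int *late)
{
  *early = total / parts + ((total % parts) ? 1 : 0);
  *late  = total / parts;
  *split = total % parts;             /* first 'split' blocks are early */
}

/* Offset of block 'b' when blocks 0..split-1 hold 'early' elements and the
 * rest hold 'late' elements, as in the block_offset expressions above. */
static int block_offset(int b, int split, int early, int late)
{
  return (b < split) ? b * early : b * late + split;
}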
Example #8
int Coll_alltoall_2dmesh::alltoall(void *send_buff, int send_count,
                                    MPI_Datatype send_type,
                                    void *recv_buff, int recv_count,
                                    MPI_Datatype recv_type, MPI_Comm comm)
{
  MPI_Status *statuses, s;
  MPI_Request *reqs, *req_ptr;
  MPI_Aint extent;

  char *tmp_buff1, *tmp_buff2;
  int i, j, src, dst, rank, num_procs, count, num_reqs;
  int X, Y, send_offset, recv_offset;
  int my_row_base, my_col_base, src_row_base, block_size;
  int tag = COLL_TAG_ALLTOALL;

  rank = comm->rank();
  num_procs = comm->size();
  extent = send_type->get_extent();

  if (not alltoall_check_is_2dmesh(num_procs, &X, &Y))
    return MPI_ERR_OTHER;

  my_row_base = (rank / Y) * Y;
  my_col_base = rank % Y;

  block_size = extent * send_count;

  tmp_buff1 = (char *) smpi_get_tmp_sendbuffer(block_size * num_procs * Y);
  tmp_buff2 = (char *) smpi_get_tmp_recvbuffer(block_size * Y);

  num_reqs = X;
  if (Y > X)
    num_reqs = Y;

  statuses = (MPI_Status *) xbt_malloc(num_reqs * sizeof(MPI_Status));
  reqs = (MPI_Request *) xbt_malloc(num_reqs * sizeof(MPI_Request));

  req_ptr = reqs;

  count = send_count * num_procs;

  for (i = 0; i < Y; i++) {
    src = i + my_row_base;
    if (src == rank)
      continue;

    recv_offset = (src % Y) * block_size * num_procs;
    *(req_ptr++) = Request::irecv(tmp_buff1 + recv_offset, count, recv_type, src, tag, comm);
  }

  for (i = 0; i < Y; i++) {
    dst = i + my_row_base;
    if (dst == rank)
      continue;
    Request::send(send_buff, count, send_type, dst, tag, comm);
  }

  Request::waitall(Y - 1, reqs, statuses);
  req_ptr = reqs;

  for (i = 0; i < Y; i++) {
    send_offset = (rank * block_size) + (i * block_size * num_procs);
    recv_offset = (my_row_base * block_size) + (i * block_size);

    if (i + my_row_base == rank)
      Request::sendrecv((char *) send_buff + recv_offset, send_count, send_type,
                   rank, tag,
                   (char *) recv_buff + recv_offset, recv_count, recv_type,
                   rank, tag, comm, &s);

    else
      Request::sendrecv(tmp_buff1 + send_offset, send_count, send_type,
                   rank, tag,
                   (char *) recv_buff + recv_offset, recv_count, recv_type,
                   rank, tag, comm, &s);
  }


  for (i = 0; i < X; i++) {
    src = (i * Y + my_col_base);
    if (src == rank)
      continue;
    src_row_base = (src / Y) * Y;

    *(req_ptr++) = Request::irecv((char *) recv_buff + src_row_base * block_size, recv_count * Y,
              recv_type, src, tag, comm);
  }

  for (i = 0; i < X; i++) {
    dst = (i * Y + my_col_base);
    if (dst == rank)
      continue;

    recv_offset = 0;
    for (j = 0; j < Y; j++) {
      send_offset = (dst + j * num_procs) * block_size;

      if (j + my_row_base == rank)
        Request::sendrecv((char *) send_buff + dst * block_size, send_count,
                     send_type, rank, tag, tmp_buff2 + recv_offset, recv_count,
                     recv_type, rank, tag, comm, &s);
      else
        Request::sendrecv(tmp_buff1 + send_offset, send_count, send_type,
                     rank, tag,
                     tmp_buff2 + recv_offset, recv_count, recv_type,
                     rank, tag, comm, &s);

      recv_offset += block_size;
    }

    Request::send(tmp_buff2, send_count * Y, send_type, dst, tag, comm);
  }
  Request::waitall(X - 1, reqs, statuses);
  xbt_free(reqs);
  xbt_free(statuses);
  smpi_free_tmp_buffer(tmp_buff1);
  smpi_free_tmp_buffer(tmp_buff2);
  return MPI_SUCCESS;
}
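This algorithm only applies when num_procs factors into an X x Y grid (otherwise alltoall_check_is_2dmesh fails and MPI_ERR_OTHER is returned above); data is exchanged within a row first, then across columns. A sketch of one plausible factorization check near the square root; this is only a guess at what the real helper does, written under a different name:

/* Sketch: try to factor num_procs into an X*Y grid with X close to
 * sqrt(num_procs); returns 1 on success. Illustrative guess at the
 * behaviour of alltoall_check_is_2dmesh, not the SimGrid implementation. */
#include <math.h>

static int check_is_2dmesh(int num_procs, int *X, int *Y)
{
  for (int x = (int)sqrt((double)num_procs); x > 1; --x) {
    if (num_procs % x == 0) {
      *X = x;                      /* X <= Y by construction */
      *Y = num_procs / x;
      return 1;
    }
  }
  return 0;                        /* prime (or 1): no usable 2-D mesh */
}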
Example #9
int Coll_gather_ompi_binomial::gather(void* sbuf, int scount, MPI_Datatype sdtype, void* rbuf, int rcount,
                                      MPI_Datatype rdtype, int root, MPI_Comm comm)
{
    int line = -1;
    int i;
    int rank;
    int vrank;
    int size;
    int total_recv = 0;
    char *ptmp     = NULL;
    char *tempbuf  = NULL;
    int err;
    ompi_coll_tree_t* bmtree;
    MPI_Status status;
    MPI_Aint sextent, slb, strue_lb, strue_extent;
    MPI_Aint rextent, rlb, rtrue_lb, rtrue_extent;


    size = comm->size();
    rank = comm->rank();

    XBT_DEBUG("smpi_coll_tuned_gather_ompi_binomial rank %d", rank);

    /* create the binomial tree */
   // COLL_TUNED_UPDATE_IN_ORDER_BMTREE( comm, tuned_module, root );
    bmtree = ompi_coll_tuned_topo_build_in_order_bmtree(comm, root);
    // data->cached_in_order_bmtree;

    sdtype->extent(&slb, &sextent);
    sdtype->extent(&strue_lb, &strue_extent);

    vrank = (rank - root + size) % size;

    if (rank == root) {
        rdtype->extent(&rlb, &rextent);
        rdtype->extent(&rtrue_lb, &rtrue_extent);
        if (0 == root) {
          /* root on 0, just use the recv buffer */
          ptmp = (char*)rbuf;
          if (sbuf != MPI_IN_PLACE) {
            err = Datatype::copy(sbuf, scount, sdtype, ptmp, rcount, rdtype);
            if (MPI_SUCCESS != err) {
              line = __LINE__;
              goto err_hndl;
            }
          }
        } else {
          /* root is not on 0, allocate temp buffer for recv,
           * rotate data at the end */
          tempbuf = (char*)smpi_get_tmp_recvbuffer(rtrue_extent + (rcount * size - 1) * rextent);
          if (NULL == tempbuf) {
            err  = MPI_ERR_OTHER;
            line = __LINE__;
            goto err_hndl;
          }

          ptmp = tempbuf - rlb;
          if (sbuf != MPI_IN_PLACE) {
            /* copy from sbuf to temp buffer */
            err = Datatype::copy(sbuf, scount, sdtype, ptmp, rcount, rdtype);
            if (MPI_SUCCESS != err) {
              line = __LINE__;
              goto err_hndl;
            }
          } else {
            /* copy from rbuf to temp buffer  */
            err = Datatype::copy((char*)rbuf + rank * rextent * rcount, rcount, rdtype, ptmp, rcount, rdtype);
            if (MPI_SUCCESS != err) {
              line = __LINE__;
              goto err_hndl;
            }
          }
        }
        total_recv = rcount;
    } else if (!(vrank % 2)) {
      /* other non-leaf nodes, allocate temp buffer for data received from
       * children, the most we need is half of the total data elements due
       * to the property of binomial tree */
      tempbuf = (char*)smpi_get_tmp_sendbuffer(strue_extent + (scount * size - 1) * sextent);
      if (NULL == tempbuf) {
        err  = MPI_ERR_OTHER;
        line = __LINE__;
        goto err_hndl;
      }

      ptmp = tempbuf - slb;
      /* local copy to tempbuf */
      err = Datatype::copy(sbuf, scount, sdtype, ptmp, scount, sdtype);
      if (MPI_SUCCESS != err) {
        line = __LINE__;
        goto err_hndl;
      }

      /* use sdtype,scount as rdtype,rcount since they are ignored on
       * non-root procs */
      rdtype     = sdtype;
      rcount     = scount;
      rextent    = sextent;
      total_recv = rcount;
    } else {
      /* leaf nodes, no temp buffer needed, use sdtype,scount as
       * rdtype,rcount since they are ignored on non-root procs */
      ptmp       = (char*)sbuf;
      total_recv = scount;
    }

    if (!(vrank % 2)) {
      /* all non-leaf nodes recv from children */
      for (i = 0; i < bmtree->tree_nextsize; i++) {
        int mycount = 0, vkid;
        /* figure out how much data I have to send to this child */
        vkid    = (bmtree->tree_next[i] - root + size) % size;
        mycount = vkid - vrank;
        if (mycount > (size - vkid))
          mycount = size - vkid;
        mycount *= rcount;

        XBT_DEBUG("smpi_coll_tuned_gather_ompi_binomial rank %d recv %d mycount = %d", rank, bmtree->tree_next[i],
                  mycount);

        Request::recv(ptmp + total_recv * rextent, mycount, rdtype, bmtree->tree_next[i], COLL_TAG_GATHER, comm,
                      &status);

        total_recv += mycount;
      }
    }

    if (rank != root) {
      /* all nodes except root send to parents */
      XBT_DEBUG("smpi_coll_tuned_gather_ompi_binomial rank %d send %d count %d\n", rank, bmtree->tree_prev, total_recv);

      Request::send(ptmp, total_recv, sdtype, bmtree->tree_prev, COLL_TAG_GATHER, comm);
  }
    if (rank == root) {
      if (root != 0) {
        /* rotate received data on root if root != 0 */
        err = Datatype::copy(ptmp, rcount * (size - root), rdtype, (char*)rbuf + rextent * root * rcount,
                             rcount * (size - root), rdtype);
        if (MPI_SUCCESS != err) {
          line = __LINE__;
          goto err_hndl;
        }

        err = Datatype::copy(ptmp + rextent * rcount * (size - root), rcount * root, rdtype, (char*)rbuf, rcount * root,
                             rdtype);
        if (MPI_SUCCESS != err) {
          line = __LINE__;
          goto err_hndl;
        }

        smpi_free_tmp_buffer(tempbuf);
      }
    } else if (!(vrank % 2)) {
      /* other non-leaf nodes */
      smpi_free_tmp_buffer(tempbuf);
    }
    ompi_coll_tuned_topo_destroy_tree(&bmtree);
    return MPI_SUCCESS;

 err_hndl:
    if (NULL != tempbuf)
      smpi_free_tmp_buffer(tempbuf);

    XBT_DEBUG("%s:%4d\tError occurred %d, rank %2d", __FILE__, line, err, rank);
    return err;
}
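On the root, the in-order binomial gather accumulates blocks relative to vrank, so when root != 0 the data in the temporary buffer is rotated by root blocks and the two Datatype::copy calls at the end rotate it back into rbuf. A small sketch of that final rotation on a contiguous byte buffer (the helper name is mine; block_bytes stands in for rcount * rextent):

/* Sketch: undo the vrank rotation after an in-order binomial gather.
 * tmp holds blocks for ranks root, root+1, ..., size-1, 0, ..., root-1;
 * rbuf receives them in natural rank order. */
#include <string.h>

static void rotate_back(const char *tmp, char *rbuf,
                        int size, int root, size_t block_bytes)
{
  /* blocks for ranks root .. size-1 sit at the front of tmp */
  memcpy(rbuf + (size_t)root * block_bytes, tmp,
         (size_t)(size - root) * block_bytes);
  /* blocks for ranks 0 .. root-1 follow them */
  memcpy(rbuf, tmp + (size_t)(size - root) * block_bytes,
         (size_t)root * block_bytes);
}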