int smpi_coll_tuned_reduce_scatter_mpich_rdb(void *sendbuf, void *recvbuf, int recvcounts[],
                              MPI_Datatype datatype, MPI_Op op, MPI_Comm comm)
{
    int   rank, comm_size, i;
    MPI_Aint extent, true_extent, true_lb; 
    int  *disps;
    void *tmp_recvbuf, *tmp_results;
    int mpi_errno = MPI_SUCCESS;
    int dis[2], blklens[2], total_count, dst;
    int mask, dst_tree_root, my_tree_root, j, k;
    int received;
    MPI_Datatype sendtype, recvtype;
    int nprocs_completed, tmp_mask, tree_root, is_commutative;
    comm_size = smpi_comm_size(comm);
    rank = smpi_comm_rank(comm);

    extent =smpi_datatype_get_extent(datatype);
    smpi_datatype_extent(datatype, &true_lb, &true_extent);
    
    if (smpi_op_is_commute(op)) {
        is_commutative = 1;
    }

    disps = (int*)xbt_malloc( comm_size * sizeof(int));

    total_count = 0;
    for (i=0; i<comm_size; i++) {
        disps[i] = total_count;
        total_count += recvcounts[i];
    }
    
            /* noncommutative and (non-pof2 or block irregular), use recursive doubling. */

            /* need to allocate temporary buffer to receive incoming data*/
            tmp_recvbuf= (void *) xbt_malloc( total_count*(max(true_extent,extent)));
            /* adjust for potential negative lower bound in datatype */
            tmp_recvbuf = (void *)((char*)tmp_recvbuf - true_lb);

            /* need to allocate another temporary buffer to accumulate
               results */
            tmp_results = (void *)xbt_malloc( total_count*(max(true_extent,extent)));
            /* adjust for potential negative lower bound in datatype */
            tmp_results = (void *)((char*)tmp_results - true_lb);

            /* copy sendbuf into tmp_results */
            if (sendbuf != MPI_IN_PLACE)
                mpi_errno = smpi_datatype_copy(sendbuf, total_count, datatype,
                                           tmp_results, total_count, datatype);
            else
                mpi_errno = smpi_datatype_copy(recvbuf, total_count, datatype,
                                           tmp_results, total_count, datatype);

            if (mpi_errno) return(mpi_errno);

            mask = 0x1;
            i = 0;
            while (mask < comm_size) {
                dst = rank ^ mask;

                dst_tree_root = dst >> i;
                dst_tree_root <<= i;

                my_tree_root = rank >> i;
                my_tree_root <<= i;

                /* At step 1, processes exchange (n-n/p) amount of
                   data; at step 2, (n-2n/p) amount of data; at step 3, (n-4n/p)
                   amount of data, and so forth. We use derived datatypes for this.

                   At each step, a process does not need to send data
                   indexed from my_tree_root to
                   my_tree_root+mask-1. Similarly, a process won't receive
                   data indexed from dst_tree_root to dst_tree_root+mask-1. */

                /* calculate sendtype */
                blklens[0] = blklens[1] = 0;
                for (j=0; j<my_tree_root; j++)
                    blklens[0] += recvcounts[j];
                for (j=my_tree_root+mask; j<comm_size; j++)
                    blklens[1] += recvcounts[j];

                dis[0] = 0;
                dis[1] = blklens[0];
                for (j=my_tree_root; (j<my_tree_root+mask) && (j<comm_size); j++)
                    dis[1] += recvcounts[j];

                mpi_errno = smpi_datatype_indexed(2, blklens, dis, datatype, &sendtype);
                if (mpi_errno) return(mpi_errno);
                
                smpi_datatype_commit(&sendtype);

                /* calculate recvtype */
                blklens[0] = blklens[1] = 0;
                for (j=0; j<dst_tree_root && j<comm_size; j++)
                    blklens[0] += recvcounts[j];
                for (j=dst_tree_root+mask; j<comm_size; j++)
                    blklens[1] += recvcounts[j];

                dis[0] = 0;
                dis[1] = blklens[0];
                for (j=dst_tree_root; (j<dst_tree_root+mask) && (j<comm_size); j++)
                    dis[1] += recvcounts[j];

                mpi_errno = smpi_datatype_indexed(2, blklens, dis, datatype, &recvtype);
                if (mpi_errno) return(mpi_errno);
                
                smpi_datatype_commit(&recvtype);

                received = 0;
                if (dst < comm_size) {
                    /* tmp_results contains data to be sent in each step. Data is
                       received in tmp_recvbuf and then accumulated into
                       tmp_results. accumulation is done later below.   */ 

                    smpi_mpi_sendrecv(tmp_results, 1, sendtype, dst,
                                                 COLL_TAG_SCATTER,
                                                 tmp_recvbuf, 1, recvtype, dst,
                                                 COLL_TAG_SCATTER, comm,
                                                 MPI_STATUS_IGNORE);
                    received = 1;
                }

                /* if some processes in this process's subtree in this step
                   did not have any destination process to communicate with
                   because of non-power-of-two, we need to send them the
                   result. We use a logarithmic recursive-halfing algorithm
                   for this. */

                if (dst_tree_root + mask > comm_size) {
                    nprocs_completed = comm_size - my_tree_root - mask;
                    /* nprocs_completed is the number of processes in this
                       subtree that have all the data. Send data to others
                       in a tree fashion. First find root of current tree
                       that is being divided into two. k is the number of
                       least-significant bits in this process's rank that
                       must be zeroed out to find the rank of the root */ 
                    j = mask;
                    k = 0;
                    while (j) {
                        j >>= 1;
                        k++;
                    }
                    k--;

                    tmp_mask = mask >> 1;
                    while (tmp_mask) {
                        dst = rank ^ tmp_mask;

                        tree_root = rank >> k;
                        tree_root <<= k;

                        /* send only if this proc has data and destination
                           doesn't have data. at any step, multiple processes
                           can send if they have the data */
                        if ((dst > rank) && 
                            (rank < tree_root + nprocs_completed)
                            && (dst >= tree_root + nprocs_completed)) {
                            /* send the current result */
                            smpi_mpi_send(tmp_recvbuf, 1, recvtype,
                                                     dst, COLL_TAG_SCATTER,
                                                     comm);
                        }
                        /* recv only if this proc. doesn't have data and sender
                           has data */
                        else if ((dst < rank) && 
                                 (dst < tree_root + nprocs_completed) &&
                                 (rank >= tree_root + nprocs_completed)) {
                            smpi_mpi_recv(tmp_recvbuf, 1, recvtype, dst,
                                                     COLL_TAG_SCATTER,
                                                     comm, MPI_STATUS_IGNORE); 
                            received = 1;
                        }
                        tmp_mask >>= 1;
                        k--;
                    }
                }

                /* The following reduction is done here instead of after 
                   the MPIC_Sendrecv_ft or MPIC_Recv_ft above. This is
                   because to do it above, in the noncommutative 
                   case, we would need an extra temp buffer so as not to
                   overwrite temp_recvbuf, because temp_recvbuf may have
                   to be communicated to other processes in the
                   non-power-of-two case. To avoid that extra allocation,
                   we do the reduce here. */
                if (received) {
                    if (is_commutative || (dst_tree_root < my_tree_root)) {
                        {
			         smpi_op_apply(op, 
                               tmp_recvbuf, tmp_results, &blklens[0],
			       &datatype); 
			        smpi_op_apply(op, 
                               ((char *)tmp_recvbuf + dis[1]*extent),
			       ((char *)tmp_results + dis[1]*extent),
			       &blklens[1], &datatype); 
                        }
                    }
                    else {
                        {
			         smpi_op_apply(op,
                                   tmp_results, tmp_recvbuf, &blklens[0],
                                   &datatype); 
			         smpi_op_apply(op,
                                   ((char *)tmp_results + dis[1]*extent),
                                   ((char *)tmp_recvbuf + dis[1]*extent),
                                   &blklens[1], &datatype); 
                        }
                        /* copy result back into tmp_results */
                        mpi_errno = smpi_datatype_copy(tmp_recvbuf, 1, recvtype, 
                                                   tmp_results, 1, recvtype);
                        if (mpi_errno) return(mpi_errno);
                    }
                }

                //smpi_datatype_free(&sendtype);
                //smpi_datatype_free(&recvtype);

                mask <<= 1;
                i++;
            }
/*
 * Reduce-scatter by pairwise exchange (MPICH-derived).
 *
 * The input vector is made of comm_size consecutive blocks; block i holds
 * recvcounts[i] elements of `datatype`.  During round i (1 <= i < comm_size)
 * this rank sends block `dst` of its input to rank (rank + i) % comm_size and
 * receives, from rank (rank - i) % comm_size, that peer's contribution to
 * block `rank`, which is then combined with the running result through `op`.
 *
 * sendbuf    input vector, or MPI_IN_PLACE to read the input from recvbuf
 * recvbuf    on return, holds recvcounts[rank] reduced elements at offset 0
 * recvcounts per-rank element counts (comm_size entries)
 * datatype   element type
 * op         reduction operator; when it is not commutative the operand
 *            order is chosen from the relative order of src and rank
 * comm       communicator
 *
 * Returns MPI_SUCCESS, MPI_ERR_COUNT when the recvcounts sum to zero, or the
 * error code of a failed local copy.  All temporaries are released on every
 * return path.
 */
int smpi_coll_tuned_reduce_scatter_mpich_pair(void *sendbuf, void *recvbuf, int recvcounts[],
                              MPI_Datatype datatype, MPI_Op op, MPI_Comm comm)
{
    int   rank, comm_size, i;
    MPI_Aint extent, true_extent, true_lb;
    int  *disps;
    void *tmp_alloc;    /* pointer returned by xbt_malloc: the one to free */
    void *tmp_recvbuf;  /* tmp_alloc shifted by -true_lb: the one to use   */
    char *input;        /* where the local contribution is read from       */
    char *result;       /* where the running result for block `rank` lives */
    int mpi_errno = MPI_SUCCESS;
    int total_count, dst, src;
    int is_commutative;
    int in_place = (sendbuf == MPI_IN_PLACE);

    comm_size = smpi_comm_size(comm);
    rank = smpi_comm_rank(comm);

    extent = smpi_datatype_get_extent(datatype);
    smpi_datatype_extent(datatype, &true_lb, &true_extent);

    /* Always assign the flag: it used to stay uninitialised for
       non-commutative operators and was then read in the loop below. */
    is_commutative = smpi_op_is_commute(op) ? 1 : 0;

    disps = (int*)xbt_malloc( comm_size * sizeof(int));

    /* disps[i] = element offset of block i inside the input vector */
    total_count = 0;
    for (i=0; i<comm_size; i++) {
        disps[i] = total_count;
        total_count += recvcounts[i];
    }

    if (total_count == 0) {
        mpi_errno = MPI_ERR_COUNT;
        goto fn_free_disps;
    }

    input = (char *)(in_place ? recvbuf : sendbuf);

    if (in_place) {
        /* We can't accumulate at the beginning of recvbuf because there is
           useful data there that other processes need; accumulate in our
           own block and move it to the front at the very end. */
        result = (char *)recvbuf + disps[rank]*extent;
    } else {
        result = (char *)recvbuf;
        /* copy local data into recvbuf */
        mpi_errno = smpi_datatype_copy(input + disps[rank]*extent,
                                       recvcounts[rank], datatype, recvbuf,
                                       recvcounts[rank], datatype);
        if (mpi_errno) goto fn_free_disps;
    }

    /* allocate temporary buffer to store incoming data */
    tmp_alloc = (void*)xbt_malloc(recvcounts[rank]*(max(true_extent,extent))+1);
    /* adjust for potential negative lower bound in datatype */
    tmp_recvbuf = (void *)((char*)tmp_alloc - true_lb);

    for (i=1; i<comm_size; i++) {
        src = (rank - i + comm_size) % comm_size;
        dst = (rank + i) % comm_size;

        /* send the data that dst needs. recv data that this process
           needs from src into tmp_recvbuf */
        smpi_mpi_sendrecv(input + disps[dst]*extent,
                          recvcounts[dst], datatype, dst,
                          COLL_TAG_SCATTER, tmp_recvbuf,
                          recvcounts[rank], datatype, src,
                          COLL_TAG_SCATTER, comm,
                          MPI_STATUS_IGNORE);

        if (is_commutative || (src < rank)) {
            /* received data is the left operand; result accumulates in place */
            smpi_op_apply(op, tmp_recvbuf, result, &recvcounts[rank],
                          &datatype);
        }
        else {
            /* our data is the left operand, so the result lands in
               tmp_recvbuf and has to be copied back */
            smpi_op_apply(op, result, tmp_recvbuf, &recvcounts[rank],
                          &datatype);
            mpi_errno = smpi_datatype_copy(tmp_recvbuf, recvcounts[rank],
                                           datatype, result,
                                           recvcounts[rank], datatype);
            if (mpi_errno) goto fn_free_tmp;
        }
    }

    /* if MPI_IN_PLACE, move output data to the beginning of
       recvbuf. already done for rank 0. */
    if (in_place && (rank != 0)) {
        mpi_errno = smpi_datatype_copy(result,
                                       recvcounts[rank], datatype,
                                       recvbuf,
                                       recvcounts[rank], datatype );
    }

fn_free_tmp:
    xbt_free(tmp_alloc);
fn_free_disps:
    xbt_free(disps);
    return mpi_errno;
}
/*
 * Reduce-scatter for non-commutative operators (MPICH-derived), done in
 * log2(comm_size) pairwise exchange rounds that halve the active data each
 * time.
 *
 * Restrictions, enforced with xbt_assert: comm_size must be a power of two
 * and every entry of recvcounts must be equal.
 *
 * sendbuf    input vector of comm_size blocks, or MPI_IN_PLACE to read the
 *            input from recvbuf
 * recvbuf    on return, holds this rank's reduced block
 * recvcounts per-rank element counts (all identical)
 * datatype   element type
 * op         reduction operator
 * comm       communicator
 *
 * Returns MPI_SUCCESS or the error code of a failed local copy.  Both
 * temporary buffers are released on every return path.
 */
int smpi_coll_tuned_reduce_scatter_mpich_noncomm(void *sendbuf, void *recvbuf, int recvcounts[],
                              MPI_Datatype datatype, MPI_Op op, MPI_Comm comm)
{
    int mpi_errno = MPI_SUCCESS;
    int comm_size = smpi_comm_size(comm) ;
    int rank = smpi_comm_rank(comm);
    int pof2;
    int log2_comm_size;
    int i, k;
    int recv_offset, send_offset;
    int block_size, total_count, size;
    MPI_Aint true_extent, true_lb;
    int buf0_was_inout;
    void *tmp_alloc0;   /* pointers returned by xbt_malloc: the ones to free */
    void *tmp_alloc1;
    void *tmp_buf0;     /* the same buffers shifted by -true_lb: the ones to use */
    void *tmp_buf1;
    void *result_ptr;

    smpi_datatype_extent(datatype, &true_lb, &true_extent);

    /* smallest power of two >= comm_size, and its log2 */
    pof2 = 1;
    log2_comm_size = 0;
    while (pof2 < comm_size) {
        pof2 <<= 1;
        ++log2_comm_size;
    }

    /* begin error checking */
    xbt_assert(pof2 == comm_size); /* FIXME this version only works for power of 2 procs */

    for (i = 0; i < (comm_size - 1); ++i) {
        xbt_assert(recvcounts[i] == recvcounts[i+1]);
    }
    /* end error checking */

    /* size of a block (count of datatype per block, NOT bytes per block) */
    block_size = recvcounts[0];
    total_count = block_size * comm_size;

    tmp_alloc0 = (void *)xbt_malloc( true_extent * total_count);
    tmp_alloc1 = (void *)xbt_malloc( true_extent * total_count);
    /* adjust for potential negative lower bound in datatype */
    tmp_buf0 = (void *)((char*)tmp_alloc0 - true_lb);
    tmp_buf1 = (void *)((char*)tmp_alloc1 - true_lb);

    /* Copy our send data to tmp_buf0.  We do this one block at a time and
       permute the blocks as we go according to the mirror permutation. */
    for (i = 0; i < comm_size; ++i) {
        mpi_errno = smpi_datatype_copy((char *)(sendbuf == MPI_IN_PLACE ? recvbuf : sendbuf) + (i * true_extent * block_size), block_size, datatype,
                                   (char *)tmp_buf0 + (MPIU_Mirror_permutation(i, log2_comm_size) * true_extent * block_size), block_size, datatype);
        if (mpi_errno) goto fn_exit;
    }
    buf0_was_inout = 1;

    send_offset = 0;
    recv_offset = 0;
    size = total_count;
    for (k = 0; k < log2_comm_size; ++k) {
        /* use a double-buffering scheme to avoid local copies */
        char *incoming_data = (buf0_was_inout ? tmp_buf1 : tmp_buf0);
        char *outgoing_data = (buf0_was_inout ? tmp_buf0 : tmp_buf1);
        int peer = rank ^ (0x1 << k);
        size /= 2;

        if (rank > peer) {
            /* we have the higher rank: send top half, recv bottom half */
            recv_offset += size;
        }
        else {
            /* we have the lower rank: recv top half, send bottom half */
            send_offset += size;
        }

        smpi_mpi_sendrecv(outgoing_data + send_offset*true_extent,
                                     size, datatype, peer, COLL_TAG_SCATTER,
                                     incoming_data + recv_offset*true_extent,
                                     size, datatype, peer, COLL_TAG_SCATTER,
                                     comm, MPI_STATUS_IGNORE);
        /* always perform the reduction at recv_offset, the data at send_offset
           is now our peer's responsibility */
        if (rank > peer) {
            /* higher ranked value so need to call op(received_data, my_data);
               the result stays in the outgoing buffer, so the roles of the
               two buffers do not change */
            smpi_op_apply(op,
                     incoming_data + recv_offset*true_extent,
                     outgoing_data + recv_offset*true_extent,
                     &size, &datatype );
        }
        else {
            /* lower ranked value so need to call op(my_data, received_data);
               the result lands in the incoming buffer, so swap the roles */
            smpi_op_apply( op,
                     outgoing_data + recv_offset*true_extent,
                     incoming_data + recv_offset*true_extent,
                     &size, &datatype);
            buf0_was_inout = !buf0_was_inout;
        }

        /* the next round of send/recv needs to happen within the block (of size
           "size") that we just received and reduced */
        send_offset = recv_offset;
    }

    xbt_assert(size == recvcounts[rank]);

    /* copy the reduced data to the recvbuf */
    result_ptr = (char *)(buf0_was_inout ? tmp_buf0 : tmp_buf1) + recv_offset * true_extent;
    mpi_errno = smpi_datatype_copy(result_ptr, size, datatype,
                               recvbuf, size, datatype);

fn_exit:
    /* free the unshifted pointers, not the true_lb-adjusted ones */
    xbt_free(tmp_alloc0);
    xbt_free(tmp_alloc1);
    return mpi_errno;
}
/* Example #4 */
/*
 * All-reduce over a logical ring:
 *   1. copy one chunk of sbuf into rbuf,
 *   2. ring reduce-scatter,
 *   3. ring all-gather.
 * The message is cut into `size` chunks of rcount / size elements.  When
 * rcount is smaller than the number of processes the whole operation is
 * delegated to smpi_mpi_allreduce; when rcount is not a multiple of the
 * number of processes the trailing elements are handled by
 * mpi_coll_allreduce_fun after the ring phases.
 */
int
smpi_coll_tuned_allreduce_lr(void *sbuf, void *rbuf, int rcount,
                             MPI_Datatype dtype, MPI_Op op, MPI_Comm comm)
{
  const int base_tag = COLL_TAG_ALLREDUCE;
  MPI_Status st;
  int step;
  int out_off, in_off;

  const int me = smpi_comm_rank(comm);
  const int nprocs = smpi_comm_size(comm);

  /* byte stride of one element, so that any datatype works */
  const MPI_Aint ext = smpi_datatype_get_extent(dtype);

  /* fewer elements than processes: not supported by the ring scheme */
  if (rcount < nprocs) {
    XBT_WARN("MPI_allreduce_lr use default MPI_allreduce.");	  
    smpi_mpi_allreduce(sbuf, rbuf, rcount, dtype, op, comm);
    return MPI_SUCCESS; 
  }

  /* elements that do not fit in the equal-sized chunks, and where they start */
  const int leftover = rcount % nprocs;
  int leftover_off = 0;
  if (leftover != 0) {
    leftover_off = (rcount / nprocs) * nprocs * ext;
  }

  /* every point-to-point message carries one chunk */
  int chunk = rcount / nprocs;

  /* ring neighbours */
  const int right = (me + 1) % nprocs;
  const int left = (me + nprocs - 1) % nprocs;

  /* seed rbuf with our own copy of chunk (me - 1) through a self send/recv */
  out_off = ((me - 1 + nprocs) % nprocs) * chunk * ext;
  in_off = ((me - 1 + nprocs) % nprocs) * chunk * ext;
  smpi_mpi_sendrecv((char *) sbuf + out_off, chunk, dtype, me, base_tag - 1,
                    (char *) rbuf + in_off, chunk, dtype, me, base_tag - 1,
                    comm, &st);

  /* reduce-scatter: pass a partial result to the right, take one from the
     left and fold our own contribution for that chunk into it */
  for (step = 0; step < nprocs - 1; step++) {
    out_off = ((me - 1 - step + 2 * nprocs) % nprocs) * chunk * ext;
    in_off = ((me - 2 - step + 2 * nprocs) % nprocs) * chunk * ext;

    smpi_mpi_sendrecv((char *) rbuf + out_off, chunk, dtype, right,
                      base_tag + step,
                      (char *) rbuf + in_off, chunk, dtype, left,
                      base_tag + step, comm, &st);

    smpi_op_apply(op, (char *) sbuf + in_off, (char *) rbuf + in_off,
                  &chunk, &dtype);
  }

  /* all-gather: circulate the finished chunks around the ring */
  for (step = 0; step < nprocs - 1; step++) {
    out_off = ((me - step + 2 * nprocs) % nprocs) * chunk * ext;
    in_off = ((me - 1 - step + 2 * nprocs) % nprocs) * chunk * ext;

    smpi_mpi_sendrecv((char *) rbuf + out_off, chunk, dtype, right,
                      base_tag + step,
                      (char *) rbuf + in_off, chunk, dtype, left,
                      base_tag + step, comm, &st);
  }

  /* trailing elements, if any, go through the generic implementation */
  if (leftover != 0) {
    return mpi_coll_allreduce_fun((char *) sbuf + leftover_off,
                                  (char *) rbuf + leftover_off, leftover,
                                  dtype, op, comm);
  }

  return 0;
}
/*
This function performs an all-reduce operation as follows.
1) binomial-tree reduce inside each SMP node
2) ring reduce-scatter between the roots of the SMP nodes
3) ring allgather between the roots of the SMP nodes
4) binomial-tree bcast inside each SMP node

Ranks are assumed to be laid out node by node: rank / cores-per-node is the
node index and rank % cores-per-node the position inside the node.

NOTE(review): the inter-node phases move segments of count / node_count
elements, so when count is not a multiple of the node count the trailing
count % node_count elements take no part in the inter-node exchange.
*/
int smpi_coll_tuned_allreduce_smp_rsag(void *send_buf, void *recv_buf,
                                       int count, MPI_Datatype dtype, MPI_Op op,
                                       MPI_Comm comm)
{
  const int tag = COLL_TAG_ALLREDUCE;
  MPI_Status status;
  int bit, peer;

  /* presumably prepares the per-node / leader communicators on first use */
  if (smpi_comm_get_leaders_comm(comm) == MPI_COMM_NULL) {
    smpi_comm_init_smp(comm);
  }

  /* without a uniform layout, fall back to one core per node */
  int cores_per_node = 1;
  if (smpi_comm_is_uniform(comm)) {
    cores_per_node = smpi_comm_size(smpi_comm_get_intra_comm(comm));
  }

  const int nprocs = smpi_comm_size(comm);
  const int me = smpi_comm_rank(comm);
  const MPI_Aint extent = smpi_datatype_get_extent(dtype);
  void *scratch = (void *) smpi_get_tmp_sendbuffer(count * extent);

  const int local_rank = me % cores_per_node;   /* position inside the node */
  const int node_rank = me / cores_per_node;    /* index of the node */
  const int node_base = node_rank * cores_per_node; /* first rank of the node */
  const int node_count = (nprocs + cores_per_node - 1) / cores_per_node;

  /* copy the input into the output buffer through a self send/recv */
  smpi_mpi_sendrecv(send_buf, count, dtype, me, tag,
                    recv_buf, count, dtype, me, tag, comm, &status);

  /* 1) binomial reduce inside the node: a rank keeps receiving while the
     current bit of its local rank is clear, and leaves after sending once */
  for (bit = 1; bit < cores_per_node; bit <<= 1) {
    if (local_rank & bit) {
      peer = node_base + (local_rank & (~bit));
      smpi_mpi_send(recv_buf, count, dtype, peer, tag, comm);
      break;
    }
    peer = node_base + (local_rank | bit);
    if (peer < nprocs) {
      smpi_mpi_recv(scratch, count, dtype, peer, tag, comm, &status);
      smpi_op_apply(op, scratch, recv_buf, &count, &dtype);
    }
  }

  /* 2) + 3) only the first rank of each node talks to the other nodes */
  if (local_rank == 0) {
    int seg_count = count / node_count;
    const int next = ((node_rank + 1) % node_count) * cores_per_node;
    const int prev =
        ((node_rank + node_count - 1) % node_count) * cores_per_node;
    int step;
    int out_off, in_off;

    /* ring reduce-scatter: the incoming segment is combined into recv_buf */
    for (step = 0; step < node_count - 1; step++) {
      out_off =
          ((node_rank - 1 - step + node_count) % node_count) * seg_count * extent;
      in_off =
          ((node_rank - 2 - step + node_count) % node_count) * seg_count * extent;

      smpi_mpi_sendrecv((char *) recv_buf + out_off, seg_count, dtype, next,
                        tag + step, scratch, seg_count, dtype, prev,
                        tag + step, comm, &status);

      smpi_op_apply(op, scratch, (char *) recv_buf + in_off, &seg_count,
                    &dtype);
    }

    /* ring allgather: finished segments are received straight into place */
    for (step = 0; step < node_count - 1; step++) {
      out_off =
          ((node_rank - step + node_count) % node_count) * seg_count * extent;
      in_off =
          ((node_rank - 1 - step + node_count) % node_count) * seg_count * extent;

      smpi_mpi_sendrecv((char *) recv_buf + out_off, seg_count, dtype, next,
                        tag + step, (char *) recv_buf + in_off, seg_count,
                        dtype, prev, tag + step, comm, &status);
    }
  }

  /* 4) binomial bcast inside the node; the last node may be only partly
     populated */
  const int cores_here =
      (node_rank == (node_count - 1)) ? nprocs - node_base : cores_per_node;

  /* wait for the data from the rank that differs in our lowest set bit */
  for (bit = 1; bit < cores_here; bit <<= 1) {
    if (local_rank & bit) {
      peer = node_base + (local_rank - bit);
      smpi_mpi_recv(recv_buf, count, dtype, peer, tag, comm, &status);
      break;
    }
  }

  /* then forward it down the remaining lower bits */
  for (bit >>= 1; bit > 0; bit >>= 1) {
    peer = node_base + (local_rank + bit);
    if (peer < nprocs) {
      smpi_mpi_send(recv_buf, count, dtype, peer, tag, comm);
    }
  }

  smpi_free_tmp_buffer(scratch);
  return MPI_SUCCESS;
}
/*
This function performs an all-reduce operation as follows.
1) binomial_tree reduce inside each SMP node
2) Recursive doubling inter-communication between the roots of the SMP nodes
3) binomial_tree bcast inside each SMP node
*/
int smpi_coll_tuned_allreduce_smp_rdb(void *send_buf, void *recv_buf, int count,
                                      MPI_Datatype dtype, MPI_Op op,
                                      MPI_Comm comm)
{
  int comm_size, rank;
  void *tmp_buf;
  int tag = COLL_TAG_ALLREDUCE;
  int mask, src, dst;
  MPI_Status status;
  if(smpi_comm_get_leaders_comm(comm)==MPI_COMM_NULL){
    smpi_comm_init_smp(comm);
  }
  int num_core=1;
  if (smpi_comm_is_uniform(comm)){
    num_core = smpi_comm_size(smpi_comm_get_intra_comm(comm));
  }
  /*
     #ifdef MPICH2_REDUCTION
     MPI_User_function * uop = MPIR_Op_table[op % 16 - 1];
     #else
     MPI_User_function *uop;
     struct MPIR_OP *op_ptr;
     op_ptr = MPIR_ToPointer(op);
     uop  = op_ptr->op;
     #endif
   */
  comm_size = smpi_comm_size(comm);
  rank = smpi_comm_rank(comm);
  MPI_Aint extent;
  extent = smpi_datatype_get_extent(dtype);
  tmp_buf = (void *) smpi_get_tmp_sendbuffer(count * extent);

  /* compute intra and inter ranking */
  int intra_rank, inter_rank;
  intra_rank = rank % num_core;
  inter_rank = rank / num_core;

  /* size of processes participate in intra communications =>
     should be equal to number of machines */
  int inter_comm_size = (comm_size + num_core - 1) / num_core;

  /* copy input buffer to output buffer */
  smpi_mpi_sendrecv(send_buf, count, dtype, rank, tag,
               recv_buf, count, dtype, rank, tag, comm, &status);

  /* start binomial reduce intra communication inside each SMP node */
  mask = 1;
  while (mask < num_core) {
    if ((mask & intra_rank) == 0) {
      src = (inter_rank * num_core) + (intra_rank | mask);
      if (src < comm_size) {
        smpi_mpi_recv(tmp_buf, count, dtype, src, tag, comm, &status);
        smpi_op_apply(op, tmp_buf, recv_buf, &count, &dtype);
      }
    } else {
      dst = (inter_rank * num_core) + (intra_rank & (~mask));
      smpi_mpi_send(recv_buf, count, dtype, dst, tag, comm);
      break;
    }
    mask <<= 1;
  }                             /* end binomial reduce intra-communication */


  /* start rdb (recursive doubling) all-reduce inter-communication 
     between each SMP nodes : each node only have one process that can communicate
     to other nodes */
  if (intra_rank == 0) {

    /* find nearest power-of-two less than or equal to inter_comm_size */
    int pof2, rem, newrank, newdst;
    pof2 = 1;
    while (pof2 <= inter_comm_size)
      pof2 <<= 1;
    pof2 >>= 1;
    rem = inter_comm_size - pof2;

    /* In the non-power-of-two case, all even-numbered
       processes of rank < 2*rem send their data to
       (rank+1). These even-numbered processes no longer
       participate in the algorithm until the very end.
     */
    if (inter_rank < 2 * rem) {
      if (inter_rank % 2 == 0) {
        dst = rank + num_core;
        smpi_mpi_send(recv_buf, count, dtype, dst, tag, comm);
        newrank = -1;
      } else {
        src = rank - num_core;
        smpi_mpi_recv(tmp_buf, count, dtype, src, tag, comm, &status);
        smpi_op_apply(op, tmp_buf, recv_buf, &count, &dtype);
        newrank = inter_rank / 2;
      }
    } else {
      newrank = inter_rank - rem;
    }

    /* example inter-communication RDB rank change algorithm 
       0,4,8,12..36 <= true rank (assume 4 core per SMP)
       0123 4567 89 <= inter_rank
       1 3 4567 89 (1,3 got data from 0,2 : 0,2 will be idle until the end)
       0 1 4567 89 
       0 1 2345 67 => newrank
     */

    if (newrank != -1) {
      mask = 1;
      while (mask < pof2) {
        newdst = newrank ^ mask;
        /* find real rank of dest */
        dst = (newdst < rem) ? newdst * 2 + 1 : newdst + rem;
        dst *= num_core;

        /* exchange data in rdb manner */
        smpi_mpi_sendrecv(recv_buf, count, dtype, dst, tag, tmp_buf, count, dtype,
                     dst, tag, comm, &status);
        smpi_op_apply(op, tmp_buf, recv_buf, &count, &dtype);
        mask <<= 1;
      }
    }

    /* non pof2 case 
       left-over processes (all even ranks: < 2 * rem) get the result    
     */
    if (inter_rank < 2 * rem) {
      if (inter_rank % 2) {
        smpi_mpi_send(recv_buf, count, dtype, rank - num_core, tag, comm);
      } else {
        smpi_mpi_recv(recv_buf, count, dtype, rank + num_core, tag, comm, &status);
      }
    }
  }