Esempio n. 1
0
/**
 * Decision function for reduce_scatter: picks an algorithm from the
 * communicator size and the total message size in bytes.
 *
 * - If op is not commutative, or any entry of rcounts is zero, the generic
 *   smpi_mpi_reduce_scatter() is called and MPI_SUCCESS is returned.
 * - Otherwise recursive halving is chosen for small messages, for medium
 *   messages on power-of-two communicators, or when the communicator is
 *   large relative to the message (linear threshold slope * bytes + offset).
 * - In every other case the ring algorithm is used.
 *
 * @param sbuf     send buffer, forwarded untouched to the chosen algorithm
 * @param rbuf     receive buffer, forwarded untouched
 * @param rcounts  per-rank element counts; one entry per rank of comm is read
 * @param dtype    element datatype
 * @param op       reduction operation
 * @param comm     communicator
 * @return MPI_SUCCESS on the generic path, otherwise whatever the selected
 *         algorithm returns.
 */
int smpi_coll_tuned_reduce_scatter_ompi( void *sbuf, void *rbuf,
                                                    int *rcounts,
                                                    MPI_Datatype dtype,
                                                    MPI_Op  op,
                                                    MPI_Comm  comm
                                                    )
{
    /* thresholds of the decision rule */
    const double slope = 0.0012;
    const double offset = 8.0;
    const size_t small_limit = 12 * 1024;
    const size_t large_limit = 256 * 1024;

    int nprocs, r, next_pow2;
    int has_empty_block = 0;
    size_t total_bytes = 0;
    size_t type_size;
    int prefer_halving;

    XBT_DEBUG("smpi_coll_tuned_reduce_scatter_ompi");

    nprocs = smpi_comm_size(comm);
    /* the datatype size is needed to express the message size in bytes */
    type_size = smpi_datatype_size(dtype);

    /* accumulate the element count and remember whether a rank gets nothing */
    for (r = 0; r < nprocs; r++) {
        total_bytes += rcounts[r];
        if (rcounts[r] == 0) {
            has_empty_block = 1;
        }
    }

    /* the specialised algorithms are not used for these cases */
    if (!smpi_op_is_commute(op) || has_empty_block) {
        smpi_mpi_reduce_scatter(sbuf, rbuf, rcounts, dtype, op, comm);
        return MPI_SUCCESS;
    }

    /* from element count to byte count */
    total_bytes *= type_size;

    /* smallest power of two that is >= nprocs */
    next_pow2 = 1;
    while (next_pow2 < nprocs) {
        next_pow2 <<= 1;
    }

    prefer_halving =
        (total_bytes <= small_limit) ||
        ((total_bytes <= large_limit) && (next_pow2 == nprocs)) ||
        (nprocs >= slope * total_bytes + offset);

    if (!prefer_halving) {
        return smpi_coll_tuned_reduce_scatter_ompi_ring(sbuf, rbuf, rcounts,
                                                        dtype, op, comm);
    }
    return smpi_coll_tuned_reduce_scatter_ompi_basic_recursivehalving(sbuf, rbuf,
                                                                      rcounts,
                                                                      dtype, op,
                                                                      comm);
}
Esempio n. 2
0
/**
 * Decision function for allreduce, based on MX results from the Grig
 * cluster at UTK.
 *
 * Selection rule, evaluated in this order:
 *  1. payload (datatype size * count) below 10000 bytes -> recursive doubling;
 *  2. op not commutative, or count <= communicator size  -> reduce + bcast;
 *  3. payload fits in comm_size segments of 1 MB          -> "lr" algorithm;
 *  4. otherwise                                           -> segmented ring.
 *
 * The ring-based algorithms do not support non-commutative operations,
 * which is why rule 2 comes before rules 3 and 4.
 *
 * @param sbuf   send buffer
 * @param rbuf   receive buffer
 * @param count  number of elements
 * @param dtype  element datatype
 * @param op     reduction operation
 * @param comm   communicator
 * @return the return value of the selected algorithm.
 */
int smpi_coll_tuned_allreduce_ompi(void *sbuf, void *rbuf, int count,
                        MPI_Datatype dtype, MPI_Op op, MPI_Comm comm)
{
    const size_t intermediate_message = 10000;
    const size_t one_segment = 1 << 20; /* 1 MB */
    int nprocs = smpi_comm_size(comm);
    size_t type_size = smpi_datatype_size(dtype);
    size_t payload = type_size * count;

    /* small payloads: recursive doubling */
    if (payload < intermediate_message) {
        return smpi_coll_tuned_allreduce_rdb(sbuf, rbuf, count, dtype, op, comm);
    }

    /* cases the ring-style algorithms are not selected for */
    if (!smpi_op_is_commute(op) || (count <= nprocs)) {
        return smpi_coll_tuned_allreduce_redbcast(sbuf, rbuf, count,
                                                  dtype, op, comm);
    }

    if (nprocs * one_segment >= payload) {
        //FIXME: ok, these are not the right algorithms, try to find closer ones
        // lr is a good match for allreduce_ring (difference is mainly the use of sendrecv)
        return smpi_coll_tuned_allreduce_lr(sbuf, rbuf, count, dtype, op, comm);
    }

    /* the segment size is not forwarded to the segmented ring implementation */
    return smpi_coll_tuned_allreduce_ompi_ring_segmented(sbuf, rbuf, count,
                                                         dtype, op, comm);
}
Esempio n. 3
0
/**
 * Decision function for reduce: selects an algorithm from the communicator
 * size, the message size in bytes (datatype size * count) and whether op is
 * commutative.
 *
 * Non-commutative operations only choose between the basic linear and the
 * in-order binary tree algorithms. Commutative operations go through a
 * table of thresholds; the a/b constants below define linear boundaries of
 * the form  communicator_size > a * message_size + b.
 *
 * The labels in the branch comments (Linear_0K, Binomial_1K, ...) are kept
 * from the tuning table this function was derived from. No segment size or
 * request limit is passed to the selected algorithm here.
 *
 * @param sendbuf   send buffer
 * @param recvbuf   receive buffer
 * @param count     number of elements
 * @param datatype  element datatype
 * @param op        reduction operation
 * @param root      rank receiving the result
 * @param comm      communicator
 * @return the return value of the selected algorithm.
 */
int smpi_coll_tuned_reduce_ompi( void *sendbuf, void *recvbuf,
                                            int count, MPI_Datatype  datatype,
                                            MPI_Op   op, int root,
                                            MPI_Comm   comm
                                            )
{
    const double a1 =  0.6016 / 1024.0; /* [1/B] */
    const double b1 =  1.3496;
    const double a2 =  0.0410 / 1024.0; /* [1/B] */
    const double b2 =  9.7128;
    const double a3 =  0.0422 / 1024.0; /* [1/B] */
    const double b3 =  1.1614;

    int communicator_size = smpi_comm_size(comm);

    /* the decision is taken on the message size in bytes */
    size_t dsize = smpi_datatype_size(datatype);
    size_t message_size = dsize * count;

    /*
     * For a non-commutative operation the choice is currently limited to
     * the linear or the in-order binary tree algorithm.
     */
    if (!smpi_op_is_commute(op)) {
        if ((communicator_size < 12) && (message_size < 2048)) {
            return smpi_coll_tuned_reduce_ompi_basic_linear(sendbuf, recvbuf, count,
                                                            datatype, op, root, comm);
        }
        return smpi_coll_tuned_reduce_ompi_in_order_binary(sendbuf, recvbuf, count,
                                                           datatype, op, root, comm);
    }

    /* Linear_0K */
    if ((communicator_size < 8) && (message_size < 512)) {
        return smpi_coll_tuned_reduce_ompi_basic_linear(sendbuf, recvbuf, count,
                                                        datatype, op, root, comm);
    }

    /* Binomial_0K */
    if (((communicator_size < 8) && (message_size < 20480)) ||
        (message_size < 2048) || (count <= 1)) {
        return smpi_coll_tuned_reduce_ompi_binomial(sendbuf, recvbuf, count,
                                                    datatype, op, root, comm);
    }

    /* Binomial_1K */
    if (communicator_size > (a1 * message_size + b1)) {
        return smpi_coll_tuned_reduce_ompi_binomial(sendbuf, recvbuf, count,
                                                    datatype, op, root, comm);
    }

    /* Pipeline_1K */
    if (communicator_size > (a2 * message_size + b2)) {
        return smpi_coll_tuned_reduce_ompi_pipeline(sendbuf, recvbuf, count,
                                                    datatype, op, root, comm);
    }

    /* Binary_32K */
    if (communicator_size > (a3 * message_size + b3)) {
        return smpi_coll_tuned_reduce_ompi_binary(sendbuf, recvbuf, count,
                                                  datatype, op, root, comm);
    }

    /* everything else: pipeline */
    return smpi_coll_tuned_reduce_ompi_pipeline(sendbuf, recvbuf, count,
                                                datatype, op, root, comm);
}
Esempio n. 4
0
/**
 * K-nomial tree reduce.
 *
 * Each rank places its own contribution in a working buffer (recvbuf on the
 * root, a temporary buffer on every other rank), posts one non-blocking
 * receive per source listed in src_array (as filled in by
 * MPIR_Reduce_knomial_trace()), folds each received buffer into the working
 * buffer with op as it completes, and, unless it is the root, sends the
 * working buffer to dst.
 *
 * NOTE(review): received data is only combined when op is commutative; for
 * a non-commutative op the received buffers are dropped. Presumably callers
 * only select this algorithm for commutative operations - confirm.
 *
 * @param sendbuf   local contribution, or MPI_IN_PLACE on the root
 * @param recvbuf   result buffer; used as given only on the root
 * @param count     number of elements; 0 returns immediately
 * @param datatype  element datatype
 * @param op        reduction operation
 * @param root      rank receiving the result
 * @param comm      communicator
 * @return MPI_SUCCESS when count is 0; otherwise mpi_errno, which is only
 *         updated by the initial smpi_datatype_copy().
 */
int smpi_coll_tuned_reduce_mvapich2_knomial (
        void *sendbuf,
        void *recvbuf,
        int count,
        MPI_Datatype datatype,
        MPI_Op op,
        int root,
        MPI_Comm comm)
{
    int mpi_errno = MPI_SUCCESS;
    int rank, is_commutative;
    int src, k;
    MPI_Request send_request;
    int index=0;
    MPI_Aint true_lb, true_extent, extent;
    MPI_Status status; 
    int recv_iter=0, dst=-1, expected_send_count, expected_recv_count;
    int *src_array=NULL;
    void **tmp_buf=NULL;
    MPI_Request *requests=NULL;


    if (count == 0) return MPI_SUCCESS;

    rank = smpi_comm_rank(comm);

    /* Create a temporary buffer */

    smpi_datatype_extent(datatype, &true_lb, &true_extent);
    extent = smpi_datatype_get_extent(datatype);

    is_commutative = smpi_op_is_commute(op);

    if (rank != root) {
        /* non-root ranks accumulate in a temporary buffer; the pointer is
           shifted by -true_lb and shifted back before it is released */
        recvbuf=(void *)smpi_get_tmp_recvbuffer(count*(MAX(extent,true_extent)));
        recvbuf = (void *)((char*)recvbuf - true_lb);
    }

    if ((rank != root) || (sendbuf != MPI_IN_PLACE)) {
        mpi_errno = smpi_datatype_copy(sendbuf, count, datatype, recvbuf,
                count, datatype);
    }


    /* negative factors are replaced by the default k-nomial factor */
    if(mv2_reduce_intra_knomial_factor<0)
      {
        mv2_reduce_intra_knomial_factor = SMPI_DEFAULT_KNOMIAL_FACTOR;
      }
    if(mv2_reduce_inter_knomial_factor<0)
      {
        mv2_reduce_inter_knomial_factor = SMPI_DEFAULT_KNOMIAL_FACTOR;
      }


    MPIR_Reduce_knomial_trace(root, mv2_reduce_intra_knomial_factor, comm, 
           &dst, &expected_send_count, &expected_recv_count, &src_array);

    if(expected_recv_count > 0 ) {
        tmp_buf  = xbt_malloc(sizeof(void *)*expected_recv_count);
        requests = xbt_malloc(sizeof(MPI_Request)*expected_recv_count);
        for(k=0; k < expected_recv_count; k++ ) {
            tmp_buf[k] = smpi_get_tmp_sendbuffer(count*(MAX(extent,true_extent)));
            tmp_buf[k] = (void *)((char*)tmp_buf[k] - true_lb);
        }

        /* post all receives; src_array is walked from its last entry */
        while(recv_iter  < expected_recv_count) {
            src = src_array[expected_recv_count - (recv_iter+1)];

            requests[recv_iter]=smpi_mpi_irecv (tmp_buf[recv_iter], count, datatype ,src,
                    COLL_TAG_REDUCE, comm);
            recv_iter++;

        }

        /* combine contributions in completion order */
        recv_iter=0;
        while(recv_iter < expected_recv_count) {
            index=smpi_mpi_waitany(expected_recv_count, requests,
                    &status);
            recv_iter++;

            if (is_commutative) {
              smpi_op_apply(op, tmp_buf[index], recvbuf, &count, &datatype);
            }
        }

        for(k=0; k < expected_recv_count; k++ ) {
            /* undo the -true_lb shift so the pointer released is the one
               that was handed out by smpi_get_tmp_sendbuffer() */
            smpi_free_tmp_buffer((void *)((char*)tmp_buf[k] + true_lb));
        }
        xbt_free(tmp_buf);
        xbt_free(requests);
    }

    if(src_array != NULL) { 
        xbt_free(src_array);
    } 

    if(rank != root) {
        send_request=smpi_mpi_isend(recvbuf,count, datatype, dst,
                COLL_TAG_REDUCE,comm);

        smpi_mpi_waitall(1, &send_request, &status);

        /* release the temporary accumulator (shift undone) */
        smpi_free_tmp_buffer((void *)((char*)recvbuf + true_lb));
    }

    return mpi_errno;
}
Esempio n. 5
0
 /**
  * Allreduce over a power-of-two subset of ranks.
  *
  * When comm_size is not a power of two, the first 2*rem ranks are paired:
  * even ranks hand their data to the next odd rank and sit out until the
  * end, when they receive the final result back. The remaining pof2 ranks
  * then either
  *  - run recursive doubling on the whole buffer (count < pof2), or
  *  - run a reduce-scatter followed by an allgather on pof2 blocks.
  *
  * @param sendbuf   local contribution, or MPI_IN_PLACE
  * @param recvbuf   result buffer, also used as the accumulator
  * @param count     number of elements; 0 returns immediately
  * @param datatype  element datatype
  * @param op        reduction operation
  * @param comm      communicator
  * @return MPI_SUCCESS when count is 0; otherwise mpi_errno, which is only
  *         updated by the smpi_datatype_copy() calls.
  */
 int smpi_coll_tuned_allreduce_mvapich2_rs(void *sendbuf,
                             void *recvbuf,
                             int count,
                             MPI_Datatype datatype,
                             MPI_Op op, MPI_Comm comm)
{
    int comm_size, rank;
    int mpi_errno = MPI_SUCCESS;
    int mask, dst, is_commutative, pof2, newrank = 0, rem, newdst, i,
        send_idx, recv_idx, last_idx, send_cnt, recv_cnt, *cnts, *disps;
    MPI_Aint true_lb, true_extent, extent;
    void *tmp_buf, *tmp_buf_free;

    if (count == 0) {
        return MPI_SUCCESS;
    }

    /* homogeneous */

    comm_size =  smpi_comm_size(comm);
    rank = smpi_comm_rank(comm);

    is_commutative = smpi_op_is_commute(op);

    /* need to allocate temporary buffer to store incoming data */
    smpi_datatype_extent(datatype, &true_lb, &true_extent);
    extent = smpi_datatype_get_extent(datatype);

    tmp_buf_free= smpi_get_tmp_recvbuffer(count * (MAX(extent, true_extent)));

    /* adjust for potential negative lower bound in datatype;
       tmp_buf_free keeps the pointer that has to be released */
    tmp_buf = (void *) ((char *) tmp_buf_free - true_lb);

    /* copy local data into recvbuf */
    if (sendbuf != MPI_IN_PLACE) {
        mpi_errno =
            smpi_datatype_copy(sendbuf, count, datatype, recvbuf, count,
                           datatype);
    }

    /* find nearest power-of-two less than or equal to comm_size */
    for( pof2 = 1; pof2 <= comm_size; pof2 <<= 1 );
    pof2 >>=1;

    rem = comm_size - pof2;

    /* In the non-power-of-two case, all even-numbered
       processes of rank < 2*rem send their data to
       (rank+1). These even-numbered processes no longer
       participate in the algorithm until the very end. The
       remaining processes form a nice power-of-two. */

    if (rank < 2 * rem) {
        if (rank % 2 == 0) {
            /* even */
            smpi_mpi_send(recvbuf, count, datatype, rank + 1,
                                     COLL_TAG_ALLREDUCE, comm);

            /* temporarily set the rank to -1 so that this
               process does not participate in recursive
               doubling */
            newrank = -1;
        } else {
            /* odd */
            smpi_mpi_recv(tmp_buf, count, datatype, rank - 1,
                                     COLL_TAG_ALLREDUCE, comm,
                                     MPI_STATUS_IGNORE);
            /* do the reduction on received data. since the
               ordering is right, it doesn't matter whether
               the operation is commutative or not. */
               smpi_op_apply(op, tmp_buf, recvbuf, &count, &datatype);
                /* change the rank */
                newrank = rank / 2;
        }
    } else {                /* rank >= 2*rem */
        newrank = rank - rem;
    }

    /* If op is user-defined or count is less than pof2, use
       recursive doubling algorithm. Otherwise do a reduce-scatter
       followed by allgather. (If op is user-defined,
       derived datatypes are allowed and the user could pass basic
       datatypes on one process and derived on another as long as
       the type maps are the same. Breaking up derived
       datatypes to do the reduce-scatter is tricky, therefore
       using recursive doubling in that case.)
       Note: the user-defined-op test is commented out below, so only
       the count < pof2 condition is actually checked. */

    if (newrank != -1) {
        if (/*(HANDLE_GET_KIND(op) != HANDLE_KIND_BUILTIN) ||*/ (count < pof2)) {  /* use recursive doubling */
            mask = 0x1;
            while (mask < pof2) {
                newdst = newrank ^ mask;
                /* find real rank of dest */
                dst = (newdst < rem) ? newdst * 2 + 1 : newdst + rem;

                /* Send the most current data, which is in recvbuf. Recv
                   into tmp_buf */
                smpi_mpi_sendrecv(recvbuf, count, datatype,
                                             dst, COLL_TAG_ALLREDUCE,
                                             tmp_buf, count, datatype, dst,
                                             COLL_TAG_ALLREDUCE, comm,
                                             MPI_STATUS_IGNORE);

                /* tmp_buf contains data received in this step.
                   recvbuf contains data accumulated so far */

                if (is_commutative || (dst < rank)) {
                    /* op is commutative OR the order is already right */
                     smpi_op_apply(op, tmp_buf, recvbuf, &count, &datatype);
                } else {
                    /* op is noncommutative and the order is not right */
                    smpi_op_apply(op, recvbuf, tmp_buf, &count, &datatype);
                    /* copy result back into recvbuf */
                    mpi_errno = smpi_datatype_copy(tmp_buf, count, datatype,
                                               recvbuf, count, datatype);
                }
                mask <<= 1;
            }
        } else {

            /* do a reduce-scatter followed by allgather */

            /* for the reduce-scatter, calculate the count that
               each process receives and the displacement within
               the buffer */
            cnts = (int *)xbt_malloc(pof2 * sizeof (int));
            disps = (int *)xbt_malloc(pof2 * sizeof (int));

            /* equal blocks, the last one absorbs the remainder */
            for (i = 0; i < (pof2 - 1); i++) {
                cnts[i] = count / pof2;
            }
            cnts[pof2 - 1] = count - (count / pof2) * (pof2 - 1);

            disps[0] = 0;
            for (i = 1; i < pof2; i++) {
                disps[i] = disps[i - 1] + cnts[i - 1];
            }

            mask = 0x1;
            send_idx = recv_idx = 0;
            last_idx = pof2;
            while (mask < pof2) {
                newdst = newrank ^ mask;
                /* find real rank of dest */
                dst = (newdst < rem) ? newdst * 2 + 1 : newdst + rem;

                send_cnt = recv_cnt = 0;
                if (newrank < newdst) {
                    send_idx = recv_idx + pof2 / (mask * 2);
                    for (i = send_idx; i < last_idx; i++)
                        send_cnt += cnts[i];
                    for (i = recv_idx; i < send_idx; i++)
                        recv_cnt += cnts[i];
                } else {
                    recv_idx = send_idx + pof2 / (mask * 2);
                    for (i = send_idx; i < recv_idx; i++)
                        send_cnt += cnts[i];
                    for (i = recv_idx; i < last_idx; i++)
                        recv_cnt += cnts[i];
                }

                /* Send data from recvbuf. Recv into tmp_buf */
                smpi_mpi_sendrecv((char *) recvbuf +
                                             disps[send_idx] * extent,
                                             send_cnt, datatype,
                                             dst, COLL_TAG_ALLREDUCE,
                                             (char *) tmp_buf +
                                             disps[recv_idx] * extent,
                                             recv_cnt, datatype, dst,
                                             COLL_TAG_ALLREDUCE, comm,
                                             MPI_STATUS_IGNORE);

                /* tmp_buf contains data received in this step.
                   recvbuf contains data accumulated so far */

                /* This algorithm is used only for predefined ops
                   and predefined ops are always commutative. */

                smpi_op_apply(op, (char *) tmp_buf + disps[recv_idx] * extent,
                        (char *) recvbuf + disps[recv_idx] * extent,
                        &recv_cnt, &datatype);

                /* update send_idx for next iteration */
                send_idx = recv_idx;
                mask <<= 1;

                /* update last_idx, but not in last iteration
                   because the value is needed in the allgather
                   step below. */
                if (mask < pof2)
                    last_idx = recv_idx + pof2 / mask;
            }

            /* now do the allgather */

            mask >>= 1;
            while (mask > 0) {
                newdst = newrank ^ mask;
                /* find real rank of dest */
                dst = (newdst < rem) ? newdst * 2 + 1 : newdst + rem;

                send_cnt = recv_cnt = 0;
                if (newrank < newdst) {
                    /* update last_idx except on first iteration */
                    if (mask != pof2 / 2) {
                        last_idx = last_idx + pof2 / (mask * 2);
                    }

                    recv_idx = send_idx + pof2 / (mask * 2);
                    for (i = send_idx; i < recv_idx; i++) {
                        send_cnt += cnts[i];
                    }
                    for (i = recv_idx; i < last_idx; i++) {
                        recv_cnt += cnts[i];
                    }
                } else {
                    recv_idx = send_idx - pof2 / (mask * 2);
                    for (i = send_idx; i < last_idx; i++) {
                        send_cnt += cnts[i];
                    }
                    for (i = recv_idx; i < send_idx; i++) {
                        recv_cnt += cnts[i];
                    }
                }

               smpi_mpi_sendrecv((char *) recvbuf +
                                             disps[send_idx] * extent,
                                             send_cnt, datatype,
                                             dst, COLL_TAG_ALLREDUCE,
                                             (char *) recvbuf +
                                             disps[recv_idx] * extent,
                                             recv_cnt, datatype, dst,
                                             COLL_TAG_ALLREDUCE, comm,
                                             MPI_STATUS_IGNORE);
                if (newrank > newdst) {
                    send_idx = recv_idx;
                }

                mask >>= 1;
            }

            /* the block tables are only needed inside this branch;
               release them here so they do not leak on every call */
            xbt_free(disps);
            xbt_free(cnts);
        }
    }

    /* In the non-power-of-two case, all odd-numbered
       processes of rank < 2*rem send the result to
       (rank-1), the ranks who didn't participate above. */
    if (rank < 2 * rem) {
        if (rank % 2) {     /* odd */
            smpi_mpi_send(recvbuf, count,
                                     datatype, rank - 1,
                                     COLL_TAG_ALLREDUCE, comm);
        } else {            /* even */
            smpi_mpi_recv(recvbuf, count,
                                  datatype, rank + 1,
                                  COLL_TAG_ALLREDUCE, comm,
                                  MPI_STATUS_IGNORE);
        }
    }
    smpi_free_tmp_buffer(tmp_buf_free);
    return (mpi_errno);

}
Esempio n. 6
0
/**
 * Generic implementation of the reduce protocol. It uses the tree
 * provided as an argument and executes all operations on segments of
 * count_by_segment elements of the datatype.
 * For the last communication the count is adjusted so that the total number
 * of elements does not exceed the original count (original_count).
 *
 * Note that for non-commutative operations we cannot save the memory copy
 * for the first block: thus we must copy sendbuf to accumbuf on intermediate
 * nodes to keep the optimized loop happy.
 *
 * @param sendbuf               local contribution, or MPI_IN_PLACE
 * @param recvbuf               result buffer; may be NULL on non-root ranks
 * @param original_count        total number of elements to reduce
 * @param datatype              element datatype
 * @param op                    reduction operation
 * @param root                  rank receiving the result
 * @param comm                  communicator
 * @param tree                  reduction tree; tree_nextsize, tree_next,
 *                              tree_prev and tree_root are read
 * @param count_by_segment      number of elements per segment
 * @param max_outstanding_reqs  on leaf nodes, maximum number of sends in
 *                              flight; 0 means no limit (blocking sends)
 * @return MPI_SUCCESS, or -1 when a temporary buffer cannot be allocated.
 */
int smpi_coll_tuned_ompi_reduce_generic( void* sendbuf, void* recvbuf, int original_count,
                                    MPI_Datatype datatype, MPI_Op  op,
                                    int root, MPI_Comm comm,
                                    ompi_coll_tree_t* tree, int count_by_segment,
                                    int max_outstanding_reqs )
{
    char *inbuf[2] = {NULL, NULL}, *inbuf_free[2] = {NULL, NULL};
    char *accumbuf = NULL, *accumbuf_free = NULL;
    char *local_op_buffer = NULL, *sendtmpbuf = NULL;
    ptrdiff_t extent, lower_bound, segment_increment;
    MPI_Request  reqs[2] = {MPI_REQUEST_NULL, MPI_REQUEST_NULL};
    int num_segments, line, ret, segindex, i, rank;
    int recvcount, prevcount, inbi;

    /**
     * Determine number of segments and number of elements
     * sent per operation
     */
    smpi_datatype_extent( datatype, &lower_bound, &extent);
    num_segments = (original_count + count_by_segment - 1) / count_by_segment;
    segment_increment = count_by_segment * extent;

    sendtmpbuf = (char*) sendbuf; 
    if( sendbuf == MPI_IN_PLACE ) { 
        sendtmpbuf = (char *)recvbuf; 
    }

    /* the size arguments are cast to unsigned long, hence %lu */
    XBT_DEBUG( "coll:tuned:reduce_generic count %d, msg size %lu, segsize %lu, max_requests %d", original_count, (unsigned long)(num_segments * segment_increment), (unsigned long)segment_increment, max_outstanding_reqs);

    rank = smpi_comm_rank(comm);

    /* non-leaf nodes - wait for children to send me data & forward up 
       (if needed) */
    if( tree->tree_nextsize > 0 ) {
        ptrdiff_t true_extent, real_segment_size;
        true_extent=smpi_datatype_get_extent( datatype);

        /* handle non existent recv buffer (i.e. it is NULL) and 
           protect the recv buffer on non-root nodes */
        accumbuf = (char*)recvbuf;
        if( (NULL == accumbuf) || (root != rank) ) {
            /* Allocate temporary accumulator buffer. */
            accumbuf_free = (char*)malloc(true_extent + 
                                          (original_count - 1) * extent);
            if (accumbuf_free == NULL) { 
                line = __LINE__; ret = -1; goto error_hndl; 
            }
            /* accumbuf is a shifted alias; accumbuf_free is what gets freed */
            accumbuf = accumbuf_free - lower_bound;
        } 

        /* If this is a non-commutative operation we must copy
           sendbuf to the accumbuf, in order to simplify the loops */
        if (!smpi_op_is_commute(op)) {
            smpi_datatype_copy(
                                                (char*)sendtmpbuf, original_count, datatype,
                                                (char*)accumbuf, original_count, datatype);
        }
        /* Allocate two buffers for incoming segments */
        real_segment_size = true_extent + (count_by_segment - 1) * extent;
        inbuf_free[0] = (char*) malloc(real_segment_size);
        if( inbuf_free[0] == NULL ) { 
            line = __LINE__; ret = -1; goto error_hndl; 
        }
        inbuf[0] = inbuf_free[0] - lower_bound;
        /* if there is chance to overlap communication -
           allocate second buffer */
        if( (num_segments > 1) || (tree->tree_nextsize > 1) ) {
            inbuf_free[1] = (char*) malloc(real_segment_size);
            if( inbuf_free[1] == NULL ) { 
                line = __LINE__; ret = -1; goto error_hndl;
            }
            inbuf[1] = inbuf_free[1] - lower_bound;
        } 

        /* reset input buffer index and receive count */
        inbi = 0;
        recvcount = 0;
        /* for each segment (one extra iteration drains the pipeline) */
        for( segindex = 0; segindex <= num_segments; segindex++ ) {
            prevcount = recvcount;
            /* recvcount - number of elements in current segment */
            recvcount = count_by_segment;
            if( segindex == (num_segments-1) )
                recvcount = original_count - count_by_segment * segindex;

            /* for each child */
            for( i = 0; i < tree->tree_nextsize; i++ ) {
                /**
                 * We try to overlap communication:
                 * either with next segment or with the next child
                 */
                /* post irecv for current segindex on current child */
                if( segindex < num_segments ) {
                    void* local_recvbuf = inbuf[inbi];
                    if( 0 == i ) {
                        /* for the first step (1st child per segment) and 
                         * commutative operations we might be able to irecv 
                         * directly into the accumulate buffer so that we can 
                         * reduce(op) this with our sendbuf in one step as 
                         * ompi_op_reduce only has two buffer pointers, 
                         * this avoids an extra memory copy.
                         *
                         * BUT if the operation is non-commutative or 
                         * we are root and are USING MPI_IN_PLACE this is wrong!
                         */
                        if( (smpi_op_is_commute(op)) &&
                            !((MPI_IN_PLACE == sendbuf) && (rank == tree->tree_root)) ) {
                            local_recvbuf = accumbuf + segindex * segment_increment;
                        }
                    }

                    reqs[inbi]=smpi_mpi_irecv(local_recvbuf, recvcount, datatype,
                                             tree->tree_next[i], 
                                             COLL_TAG_REDUCE, comm
                                             );
                }
                /* wait for previous req to complete, if any.
                   if there are no requests reqs[inbi ^1] will be 
                   MPI_REQUEST_NULL. */
                /* wait on data from last child for previous segment */
                smpi_mpi_waitall( 1, &reqs[inbi ^ 1], 
                                             MPI_STATUSES_IGNORE );
                local_op_buffer = inbuf[inbi ^ 1];
                if( i > 0 ) {
                    /* our first operation is to combine our own [sendbuf] data 
                     * with the data we recvd from down stream (but only if
                     * the operation is commutative and if we are not root and 
                     * not using MPI_IN_PLACE)
                     */
                    if( 1 == i ) {
                        if( (smpi_op_is_commute(op)) && 
                            !((MPI_IN_PLACE == sendbuf) && (rank == tree->tree_root)) ) {
                            local_op_buffer = sendtmpbuf + segindex * segment_increment;
                        }
                    }
                    /* apply operation */
                    smpi_op_apply(op, local_op_buffer, 
                                   accumbuf + segindex * segment_increment, 
                                   &recvcount, &datatype );
                } else if ( segindex > 0 ) {
                    void* accumulator = accumbuf + (segindex-1) * segment_increment;
                    if( tree->tree_nextsize <= 1 ) {
                        if( (smpi_op_is_commute(op)) &&
                            !((MPI_IN_PLACE == sendbuf) && (rank == tree->tree_root)) ) {
                            local_op_buffer = sendtmpbuf + (segindex-1) * segment_increment;
                        }
                    }
                    smpi_op_apply(op, local_op_buffer, accumulator, &prevcount, 
                                   &datatype );

                    /* all reduced on available data this step (i) complete, 
                     * pass to the next process unless you are the root.
                     */
                    if (rank != tree->tree_root) {
                        /* send combined/accumulated data to parent */
                        smpi_mpi_send( accumulator, prevcount, 
                                                  datatype, tree->tree_prev, 
                                                  COLL_TAG_REDUCE,
                                                  comm);
                    }

                    /* we stop when segindex = number of segments 
                       (i.e. we do num_segment+1 steps for pipelining */
                    if (segindex == num_segments) break;
                }

                /* update input buffer index */
                inbi = inbi ^ 1;
            } /* end of for each child */
        } /* end of for each segment */

        /* clean up (free(NULL) is a no-op) */
        free(inbuf_free[0]);
        free(inbuf_free[1]);
        free(accumbuf_free);
    }

    /* leaf nodes 
       Depending on the value of max_outstanding_reqs and 
       the number of segments we have two options:
       - send all segments using blocking send to the parent, or
       - avoid overflooding the parent nodes by limiting the number of 
       outstanding requests to max_outstanding_reqs.
       TODO/POSSIBLE IMPROVEMENT: If there is a way to determine the eager size 
       for the current communication, synchronization should be used only 
       when the message/segment size is smaller than the eager size.
    */
    else {

        /* If the number of segments is less than a maximum number of outstanding
           requests or there is no limit on the maximum number of outstanding 
           requests, we send data to the parent using blocking send */
        if ((0 == max_outstanding_reqs) || 
            (num_segments <= max_outstanding_reqs)) {
            
            segindex = 0;
            while ( original_count > 0) {
                /* the last segment may be shorter */
                if (original_count < count_by_segment) {
                    count_by_segment = original_count;
                }
                smpi_mpi_send((char*)sendbuf + 
                                         segindex * segment_increment,
                                         count_by_segment, datatype,
                                         tree->tree_prev, 
                                         COLL_TAG_REDUCE,
                                         comm) ;
                segindex++;
                original_count -= count_by_segment;
            }
        }

        /* Otherwise, introduce flow control:
           - post max_outstanding_reqs non-blocking sends,
           - for remaining segments
           - wait for a send to complete, and post the next one.
           - wait for all outstanding sends to complete.
        */
        else {

            int creq = 0;
            MPI_Request* sreq = NULL;

            sreq = (MPI_Request*) calloc( max_outstanding_reqs,
                                              sizeof(MPI_Request ) );
            if (NULL == sreq) { line = __LINE__; ret = -1; goto error_hndl; }

            /* post first group of requests */
            for (segindex = 0; segindex < max_outstanding_reqs; segindex++) {
                sreq[segindex]=smpi_mpi_isend((char*)sendbuf +
                                          segindex * segment_increment,
                                          count_by_segment, datatype,
                                          tree->tree_prev, 
                                          COLL_TAG_REDUCE,
                                          comm);
                original_count -= count_by_segment;
            }

            creq = 0;
            while ( original_count > 0 ) {
                /* wait on a posted request to complete */
                smpi_mpi_wait(&sreq[creq], MPI_STATUS_IGNORE);
                sreq[creq] = MPI_REQUEST_NULL;

                if( original_count < count_by_segment ) {
                    count_by_segment = original_count;
                }
                sreq[creq]=smpi_mpi_isend((char*)sendbuf + 
                                          segindex * segment_increment, 
                                          count_by_segment, datatype, 
                                          tree->tree_prev, 
                                          COLL_TAG_REDUCE,
                                          comm );
                creq = (creq + 1) % max_outstanding_reqs;
                segindex++;
                original_count -= count_by_segment;
            }

            /* Wait on the remaining request to complete */
            smpi_mpi_waitall( max_outstanding_reqs, sreq, 
                                         MPI_STATUSES_IGNORE );

            /* free requests */
            free(sreq);
        }
    }
    return MPI_SUCCESS;

 error_hndl:  /* error handler */
    XBT_DEBUG("ERROR_HNDL: node %d file %s line %d error %d\n", 
                   rank, __FILE__, line, ret );
    free(inbuf_free[0]);
    free(inbuf_free[1]);
    /* release the pointer returned by malloc, not the shifted alias */
    free(accumbuf_free);
    return ret;
}
Esempio n. 7
0
/*
 * Reduce over a binomial tree.
 *
 * Each rank seeds its accumulator with its own contribution, folds in the
 * partial results received from its children in the tree and finally sends
 * the accumulator to its parent.  The tree is rooted at `root` when `op` is
 * commutative; otherwise it is rooted at rank 0 and rank 0 forwards the final
 * value to `root` afterwards.
 *
 * sendbuf   local contribution (may be MPI_IN_PLACE on the root)
 * recvbuf   result buffer; only used as supplied on `root`, every other rank
 *           works in a temporary accumulator
 * count     number of `datatype` elements; 0 returns immediately
 *
 * Returns 0.
 */
int smpi_coll_tuned_reduce_binomial(void *sendbuf, void *recvbuf, int count,
                                    MPI_Datatype datatype, MPI_Op op, int root,
                                    MPI_Comm comm)
{
  MPI_Status status;
  int comm_size, rank;
  int mask, relrank, source;
  int dst;
  int tag = COLL_TAG_REDUCE;
  MPI_Aint extent;
  void *tmp_buf;
  MPI_Aint true_lb, true_extent;
  if (count == 0)
    return 0;
  rank = smpi_comm_rank(comm);
  comm_size = smpi_comm_size(comm);

  extent = smpi_datatype_get_extent(datatype);
  smpi_datatype_extent(datatype, &true_lb, &true_extent);

  /* Size the receive scratch area the same way as the temporary recvbuf
     below, so a datatype whose true extent exceeds its extent still fits. */
  tmp_buf = (void *) smpi_get_tmp_sendbuffer(count * (max(extent, true_extent)));
  int is_commutative = smpi_op_is_commute(op);
  mask = 1;

  /* Logical root of the tree: the real root for commutative operations,
     rank 0 otherwise. */
  int lroot;
  if (is_commutative)
        lroot   = root;
  else
        lroot   = 0;
  relrank = (rank - lroot + comm_size) % comm_size;

  /* adjust for potential negative lower bound in datatype */
  tmp_buf = (void *)((char*)tmp_buf - true_lb);

  /* If I'm not the root, then my recvbuf may not be valid, therefore
     I have to allocate a temporary one */
  if (rank != root) {
      recvbuf = (void *) smpi_get_tmp_recvbuffer(count*(max(extent,true_extent)));
      recvbuf = (void *)((char*)recvbuf - true_lb);
  }
  /* Seed the accumulator with the local contribution, unless the root
     already holds it in place. */
  if ((rank != root) || (sendbuf != MPI_IN_PLACE)) {
      smpi_datatype_copy(sendbuf, count, datatype, recvbuf,count, datatype);
  }

  while (mask < comm_size) {
    /* Receive */
    if ((mask & relrank) == 0) {
      source = (relrank | mask);
      if (source < comm_size) {
        source = (source + lroot) % comm_size;
        smpi_mpi_recv(tmp_buf, count, datatype, source, tag, comm, &status);

        if (is_commutative) {
          smpi_op_apply(op, tmp_buf, recvbuf, &count, &datatype);
        } else {
          /* Operand order matters: reduce into tmp_buf, then copy back. */
          smpi_op_apply(op, recvbuf, tmp_buf, &count, &datatype);
          smpi_datatype_copy(tmp_buf, count, datatype,recvbuf, count, datatype);
        }
      }
    } else {
      /* Send the partial result to the parent and leave the tree. */
      dst = ((relrank & (~mask)) + lroot) % comm_size;
      smpi_mpi_send(recvbuf, count, datatype, dst, tag, comm);
      break;
    }
    mask <<= 1;
  }

  /* Non-commutative case: the result was built on rank 0, move it to root. */
  if (!is_commutative && (root != 0)){
    if (rank == 0){
      smpi_mpi_send(recvbuf, count, datatype, root,tag, comm);
    }else if (rank == root){
      smpi_mpi_recv(recvbuf, count, datatype, 0, tag, comm, &status);
    }
  }

  /* Undo the lower-bound shift so the pointers handed back are the ones the
     temporary-buffer allocator originally returned. */
  if (rank != root) {
    smpi_free_tmp_buffer((void *)((char*)recvbuf + true_lb));
  }
  smpi_free_tmp_buffer((void *)((char*)tmp_buf + true_lb));

  return 0;
}
/*
 * Reduce-scatter by recursive doubling.
 *
 * In each round a rank exchanges with partner `rank ^ mask` everything except
 * the blocks belonging to its own (resp. the partner's) current subtree,
 * described by two-block indexed datatypes, and accumulates the received data
 * into tmp_results.
 *
 * NOTE(review): this definition is truncated in the file -- the copy of the
 * final result into recvbuf, the release of disps / tmp_recvbuf / tmp_results
 * and the closing brace are not present after the main loop.
 */
int smpi_coll_tuned_reduce_scatter_mpich_rdb(void *sendbuf, void *recvbuf, int recvcounts[],
                              MPI_Datatype datatype, MPI_Op op, MPI_Comm comm)
{
    int   rank, comm_size, i;
    MPI_Aint extent, true_extent, true_lb; 
    int  *disps;
    void *tmp_recvbuf, *tmp_results;
    int mpi_errno = MPI_SUCCESS;
    int dis[2], blklens[2], total_count, dst;
    int mask, dst_tree_root, my_tree_root, j, k;
    int received;
    MPI_Datatype sendtype, recvtype;
    int nprocs_completed, tmp_mask, tree_root, is_commutative;
    comm_size = smpi_comm_size(comm);
    rank = smpi_comm_rank(comm);

    extent =smpi_datatype_get_extent(datatype);
    smpi_datatype_extent(datatype, &true_lb, &true_extent);
    
    /* NOTE(review): is_commutative is only assigned when the op commutes; for
       a non-commutative op it is read uninitialized in the reduction step
       below. */
    if (smpi_op_is_commute(op)) {
        is_commutative = 1;
    }

    /* disps[i] = element offset of rank i's block inside the full vector
       (not referenced again in the visible part of this function). */
    disps = (int*)xbt_malloc( comm_size * sizeof(int));

    total_count = 0;
    for (i=0; i<comm_size; i++) {
        disps[i] = total_count;
        total_count += recvcounts[i];
    }
    
            /* noncommutative and (non-pof2 or block irregular), use recursive doubling. */

            /* need to allocate temporary buffer to receive incoming data*/
            tmp_recvbuf= (void *) xbt_malloc( total_count*(max(true_extent,extent)));
            /* adjust for potential negative lower bound in datatype */
            tmp_recvbuf = (void *)((char*)tmp_recvbuf - true_lb);

            /* need to allocate another temporary buffer to accumulate
               results */
            tmp_results = (void *)xbt_malloc( total_count*(max(true_extent,extent)));
            /* adjust for potential negative lower bound in datatype */
            tmp_results = (void *)((char*)tmp_results - true_lb);

            /* copy sendbuf into tmp_results */
            if (sendbuf != MPI_IN_PLACE)
                mpi_errno = smpi_datatype_copy(sendbuf, total_count, datatype,
                                           tmp_results, total_count, datatype);
            else
                mpi_errno = smpi_datatype_copy(recvbuf, total_count, datatype,
                                           tmp_results, total_count, datatype);

            /* NOTE(review): this and the later early returns leave disps,
               tmp_recvbuf and tmp_results allocated. */
            if (mpi_errno) return(mpi_errno);

            mask = 0x1;
            i = 0;
            while (mask < comm_size) {
                dst = rank ^ mask;

                /* Clear the i low bits to get the first rank of each subtree. */
                dst_tree_root = dst >> i;
                dst_tree_root <<= i;

                my_tree_root = rank >> i;
                my_tree_root <<= i;

                /* At step 1, processes exchange (n-n/p) amount of
                   data; at step 2, (n-2n/p) amount of data; at step 3, (n-4n/p)
                   amount of data, and so forth. We use derived datatypes for this.

                   At each step, a process does not need to send data
                   indexed from my_tree_root to
                   my_tree_root+mask-1. Similarly, a process won't receive
                   data indexed from dst_tree_root to dst_tree_root+mask-1. */

                /* calculate sendtype */
                blklens[0] = blklens[1] = 0;
                for (j=0; j<my_tree_root; j++)
                    blklens[0] += recvcounts[j];
                for (j=my_tree_root+mask; j<comm_size; j++)
                    blklens[1] += recvcounts[j];

                dis[0] = 0;
                dis[1] = blklens[0];
                for (j=my_tree_root; (j<my_tree_root+mask) && (j<comm_size); j++)
                    dis[1] += recvcounts[j];

                mpi_errno = smpi_datatype_indexed(2, blklens, dis, datatype, &sendtype);
                if (mpi_errno) return(mpi_errno);
                
                smpi_datatype_commit(&sendtype);

                /* calculate recvtype */
                blklens[0] = blklens[1] = 0;
                for (j=0; j<dst_tree_root && j<comm_size; j++)
                    blklens[0] += recvcounts[j];
                for (j=dst_tree_root+mask; j<comm_size; j++)
                    blklens[1] += recvcounts[j];

                dis[0] = 0;
                dis[1] = blklens[0];
                for (j=dst_tree_root; (j<dst_tree_root+mask) && (j<comm_size); j++)
                    dis[1] += recvcounts[j];

                mpi_errno = smpi_datatype_indexed(2, blklens, dis, datatype, &recvtype);
                if (mpi_errno) return(mpi_errno);
                
                smpi_datatype_commit(&recvtype);

                received = 0;
                if (dst < comm_size) {
                    /* tmp_results contains data to be sent in each step. Data is
                       received in tmp_recvbuf and then accumulated into
                       tmp_results. accumulation is done later below.   */ 

                    smpi_mpi_sendrecv(tmp_results, 1, sendtype, dst,
                                                 COLL_TAG_SCATTER,
                                                 tmp_recvbuf, 1, recvtype, dst,
                                                 COLL_TAG_SCATTER, comm,
                                                 MPI_STATUS_IGNORE);
                    received = 1;
                }

                /* if some processes in this process's subtree in this step
                   did not have any destination process to communicate with
                   because of non-power-of-two, we need to send them the
                   result. We use a logarithmic recursive-halving algorithm
                   for this. */

                if (dst_tree_root + mask > comm_size) {
                    nprocs_completed = comm_size - my_tree_root - mask;
                    /* nprocs_completed is the number of processes in this
                       subtree that have all the data. Send data to others
                       in a tree fashion. First find root of current tree
                       that is being divided into two. k is the number of
                       least-significant bits in this process's rank that
                       must be zeroed out to find the rank of the root */ 
                    j = mask;
                    k = 0;
                    while (j) {
                        j >>= 1;
                        k++;
                    }
                    k--;

                    tmp_mask = mask >> 1;
                    while (tmp_mask) {
                        dst = rank ^ tmp_mask;

                        tree_root = rank >> k;
                        tree_root <<= k;

                        /* send only if this proc has data and destination
                           doesn't have data. at any step, multiple processes
                           can send if they have the data */
                        if ((dst > rank) && 
                            (rank < tree_root + nprocs_completed)
                            && (dst >= tree_root + nprocs_completed)) {
                            /* send the current result */
                            smpi_mpi_send(tmp_recvbuf, 1, recvtype,
                                                     dst, COLL_TAG_SCATTER,
                                                     comm);
                        }
                        /* recv only if this proc. doesn't have data and sender
                           has data */
                        else if ((dst < rank) && 
                                 (dst < tree_root + nprocs_completed) &&
                                 (rank >= tree_root + nprocs_completed)) {
                            smpi_mpi_recv(tmp_recvbuf, 1, recvtype, dst,
                                                     COLL_TAG_SCATTER,
                                                     comm, MPI_STATUS_IGNORE); 
                            received = 1;
                        }
                        tmp_mask >>= 1;
                        k--;
                    }
                }

                /* The following reduction is done here instead of after 
                   the MPIC_Sendrecv_ft or MPIC_Recv_ft above. This is
                   because to do it above, in the noncommutative 
                   case, we would need an extra temp buffer so as not to
                   overwrite temp_recvbuf, because temp_recvbuf may have
                   to be communicated to other processes in the
                   non-power-of-two case. To avoid that extra allocation,
                   we do the reduce here. */
                if (received) {
                    if (is_commutative || (dst_tree_root < my_tree_root)) {
                        {
			         smpi_op_apply(op, 
                               tmp_recvbuf, tmp_results, &blklens[0],
			       &datatype); 
			        smpi_op_apply(op, 
                               ((char *)tmp_recvbuf + dis[1]*extent),
			       ((char *)tmp_results + dis[1]*extent),
			       &blklens[1], &datatype); 
                        }
                    }
                    else {
                        {
			         smpi_op_apply(op,
                                   tmp_results, tmp_recvbuf, &blklens[0],
                                   &datatype); 
			         smpi_op_apply(op,
                                   ((char *)tmp_results + dis[1]*extent),
                                   ((char *)tmp_recvbuf + dis[1]*extent),
                                   &blklens[1], &datatype); 
                        }
                        /* copy result back into tmp_results */
                        mpi_errno = smpi_datatype_copy(tmp_recvbuf, 1, recvtype, 
                                                   tmp_results, 1, recvtype);
                        if (mpi_errno) return(mpi_errno);
                    }
                }

                /* NOTE(review): with the frees below disabled, the sendtype and
                   recvtype built in this iteration are never released in the
                   visible code. */
                //smpi_datatype_free(&sendtype);
                //smpi_datatype_free(&recvtype);

                mask <<= 1;
                i++;
            }
/*
 * Reduce-scatter by pairwise exchange.
 *
 * In round i (1 <= i < comm_size) each rank sends the block destined for
 * rank (rank + i) % comm_size and receives its own block from
 * (rank - i + comm_size) % comm_size, folding the received block into its
 * running result.  After the last round recvbuf holds recvcounts[rank]
 * reduced elements.
 *
 * sendbuf     full input vector, or MPI_IN_PLACE to read it from recvbuf
 * recvbuf     output block (and input vector when sendbuf is MPI_IN_PLACE)
 * recvcounts  number of elements each rank receives, indexed by rank
 *
 * Returns MPI_SUCCESS, MPI_ERR_COUNT when all counts are zero, or the first
 * non-zero code reported by smpi_datatype_copy.  All temporaries are released
 * on every exit path.
 */
int smpi_coll_tuned_reduce_scatter_mpich_pair(void *sendbuf, void *recvbuf, int recvcounts[],
                              MPI_Datatype datatype, MPI_Op op, MPI_Comm comm)
{
    int   rank, comm_size, i;
    MPI_Aint extent, true_extent, true_lb;
    int  *disps = NULL;
    void *tmp_alloc = NULL;   /* scratch buffer exactly as allocated */
    void *tmp_recvbuf;        /* tmp_alloc shifted by the true lower bound */
    int mpi_errno = MPI_SUCCESS;
    int total_count, dst, src;
    int is_commutative;

    comm_size = smpi_comm_size(comm);
    rank = smpi_comm_rank(comm);

    extent = smpi_datatype_get_extent(datatype);
    smpi_datatype_extent(datatype, &true_lb, &true_extent);

    /* Always give the flag a defined value; it is read in every round. */
    is_commutative = smpi_op_is_commute(op) ? 1 : 0;

    /* disps[i] = element offset of rank i's block inside the full vector */
    disps = (int*)xbt_malloc( comm_size * sizeof(int));

    total_count = 0;
    for (i=0; i<comm_size; i++) {
        disps[i] = total_count;
        total_count += recvcounts[i];
    }

    if (total_count == 0) {
        mpi_errno = MPI_ERR_COUNT;
        goto fn_exit;
    }

    if (sendbuf != MPI_IN_PLACE) {
        /* copy local data into recvbuf */
        smpi_datatype_copy(((char *)sendbuf+disps[rank]*extent),
                                   recvcounts[rank], datatype, recvbuf,
                                   recvcounts[rank], datatype);
    }

    /* allocate temporary buffer to store incoming data */
    tmp_alloc = (void*)xbt_malloc(recvcounts[rank]*(max(true_extent,extent))+1);
    /* adjust for potential negative lower bound in datatype */
    tmp_recvbuf = (void *)((char*)tmp_alloc - true_lb);

    for (i=1; i<comm_size; i++) {
        src = (rank - i + comm_size) % comm_size;
        dst = (rank + i) % comm_size;

        /* send the data that dst needs. recv data that this process
           needs from src into tmp_recvbuf */
        if (sendbuf != MPI_IN_PLACE)
            smpi_mpi_sendrecv(((char *)sendbuf+disps[dst]*extent),
                                         recvcounts[dst], datatype, dst,
                                         COLL_TAG_SCATTER, tmp_recvbuf,
                                         recvcounts[rank], datatype, src,
                                         COLL_TAG_SCATTER, comm,
                                         MPI_STATUS_IGNORE);
        else
            smpi_mpi_sendrecv(((char *)recvbuf+disps[dst]*extent),
                                         recvcounts[dst], datatype, dst,
                                         COLL_TAG_SCATTER, tmp_recvbuf,
                                         recvcounts[rank], datatype, src,
                                         COLL_TAG_SCATTER, comm,
                                         MPI_STATUS_IGNORE);

        if (is_commutative || (src < rank)) {
            if (sendbuf != MPI_IN_PLACE) {
                smpi_op_apply(op, tmp_recvbuf, recvbuf, &recvcounts[rank],
                              &datatype);
            }
            else {
                smpi_op_apply(op,
                              tmp_recvbuf, ((char *)recvbuf+disps[rank]*extent),
                              &recvcounts[rank], &datatype);
                /* we can't store the result at the beginning of
                   recvbuf right here because there is useful data
                   there that other process/processes need. at the
                   end, we will copy back the result to the
                   beginning of recvbuf. */
            }
        }
        else {
            /* Operand order matters here: reduce into tmp_recvbuf and copy
               the outcome back. */
            if (sendbuf != MPI_IN_PLACE) {
                smpi_op_apply(op,
                              recvbuf, tmp_recvbuf, &recvcounts[rank], &datatype);
                /* copy result back into recvbuf */
                mpi_errno = smpi_datatype_copy(tmp_recvbuf, recvcounts[rank],
                                           datatype, recvbuf,
                                           recvcounts[rank], datatype);
                if (mpi_errno) goto fn_exit;
            }
            else {
                smpi_op_apply(op,
                              ((char *)recvbuf+disps[rank]*extent),
                              tmp_recvbuf, &recvcounts[rank], &datatype);
                /* copy result back into recvbuf */
                mpi_errno = smpi_datatype_copy(tmp_recvbuf, recvcounts[rank],
                                           datatype,
                                           ((char *)recvbuf +
                                            disps[rank]*extent),
                                           recvcounts[rank], datatype);
                if (mpi_errno) goto fn_exit;
            }
        }
    }

    /* if MPI_IN_PLACE, move output data to the beginning of
       recvbuf. already done for rank 0. */
    if ((sendbuf == MPI_IN_PLACE) && (rank != 0)) {
        mpi_errno = smpi_datatype_copy(((char *)recvbuf +
                                    disps[rank]*extent),
                                   recvcounts[rank], datatype,
                                   recvbuf,
                                   recvcounts[rank], datatype );
        if (mpi_errno) goto fn_exit;
    }

    mpi_errno = MPI_SUCCESS;

  fn_exit:
    /* Release the unshifted scratch pointer and the displacement table.
       NOTE(review): assumes xbt_malloc hands out ordinary heap memory that
       free() may release; switch to the project's own release routine if it
       has a dedicated one. */
    free(tmp_alloc);
    free(disps);
    return mpi_errno;
}
Esempio n. 10
0
/*
 * Two-level reduce: first reduce inside each node onto the node leader
 * (local rank 0 of the intra-node communicator), then reduce across the
 * leaders, and finally hand the result from the root's leader to the root
 * when the two differ.
 *
 * When the whole communicator lives on a single node, the function either
 * uses the shared-memory/k-nomial intra-node reduce (small, commutative
 * messages) or falls back to a flat k-nomial/binomial reduce.
 *
 * Returns the code produced by the last reduce routine that was invoked
 * (MPI_SUCCESS if none reported anything else).
 */
int smpi_coll_tuned_reduce_mvapich2_two_level( void *sendbuf,
                                     void *recvbuf,
                                     int count,
                                     MPI_Datatype datatype,
                                     MPI_Op op,
                                     int root,
                                     MPI_Comm comm)
{
    int mpi_errno = MPI_SUCCESS;
    int my_rank, total_size, local_rank, local_size;
    int leader_comm_rank = -1, leader_comm_size = 0;
    MPI_Comm shmem_comm, leader_comm;
    int leader_root, leader_of_root;
    void *in_buf = NULL, *out_buf = NULL, *tmp_buf = NULL;
    /* Scratch copy used as input of the inter-leader reduce; stays NULL
       unless this rank actually allocates one. */
    void *leader_scratch = NULL;
    MPI_Aint true_lb, true_extent, extent;
    int is_commutative = 0, stride = 0;
    int intra_node_root=0;

    //if not set (use of the algo directly, without mvapich2 selector)
    if(MV2_Reduce_function==NULL)
      MV2_Reduce_function=smpi_coll_tuned_reduce_mpich;
    if(MV2_Reduce_intra_function==NULL)
      MV2_Reduce_intra_function=smpi_coll_tuned_reduce_mpich;

    if(smpi_comm_get_leaders_comm(comm)==MPI_COMM_NULL){
      smpi_comm_init_smp(comm);
    }

    my_rank = smpi_comm_rank(comm);
    total_size = smpi_comm_size(comm);
    shmem_comm = smpi_comm_get_intra_comm(comm);
    local_rank = smpi_comm_rank(shmem_comm);
    local_size = smpi_comm_size(shmem_comm);

    leader_comm = smpi_comm_get_leaders_comm(comm);
    int* leaders_map = smpi_comm_get_leaders_map(comm);
    /* Rank of the root's node leader, in comm and in the leader communicator. */
    leader_of_root = smpi_group_rank(smpi_comm_group(comm),leaders_map[root]);
    leader_root = smpi_group_rank(smpi_comm_group(leader_comm),leaders_map[root]);

    is_commutative=smpi_op_is_commute(op);

    smpi_datatype_extent(datatype, &true_lb,
                                       &true_extent);
    extent =smpi_datatype_get_extent(datatype);
    stride = count * MAX(extent, true_extent);

    if (local_size == total_size) {
        /* First handle the case where there is only one node */
        if (stride <= MV2_INTRA_SHMEM_REDUCE_MSG &&
            is_commutative == 1) {
            if (local_rank == 0 ) {
                tmp_buf=(void *)smpi_get_tmp_sendbuffer( count *
                                    (MAX(extent, true_extent)));
                tmp_buf = (void *) ((char *) tmp_buf - true_lb);
            }

            if (sendbuf != MPI_IN_PLACE) {
                in_buf = (void *)sendbuf;
            } else {
                in_buf = recvbuf;
            }

            if (local_rank == 0) {
                 if( my_rank != root) {
                     out_buf = tmp_buf;
                 } else {
                     out_buf = recvbuf;
                     if(in_buf == out_buf) {
                        in_buf = MPI_IN_PLACE;
                        out_buf = recvbuf;
                     }
                 }
            } else {
                in_buf  = (void *)sendbuf;
                out_buf = NULL;
            }

            if (count * (MAX(extent, true_extent)) < SHMEM_COLL_BLOCK_SIZE) {
                mpi_errno = MPIR_Reduce_shmem_MV2(in_buf, out_buf, count,
                                                  datatype, op,
                                                  0, shmem_comm);
            }
            else {
                mpi_errno = MPIR_Reduce_intra_knomial_wrapper_MV2(in_buf, out_buf, count,
                                                                  datatype, op,
                                                                  0, shmem_comm);
            }

            /* The intra-node reduce targets local rank 0; forward to the
               real root if that is somebody else. */
            if (local_rank == 0 && root != my_rank) {
                smpi_mpi_send(out_buf, count, datatype, root,
                                         COLL_TAG_REDUCE+1, comm);
            }
            if ((local_rank != 0) && (root == my_rank)) {
                smpi_mpi_recv(recvbuf, count, datatype,
                                         leader_of_root, COLL_TAG_REDUCE+1, comm,
                                         MPI_STATUS_IGNORE);
            }
        } else {
            if(mv2_use_knomial_reduce == 1) {
                reduce_fn = &MPIR_Reduce_intra_knomial_wrapper_MV2;
            } else {
                reduce_fn = &MPIR_Reduce_binomial_MV2;
            }
            mpi_errno = reduce_fn(sendbuf, recvbuf, count,
                                  datatype, op,
                                  root, comm);
        }
        /* We are done */
        if(tmp_buf!=NULL)
          smpi_free_tmp_buffer((void *) ((char *) tmp_buf + true_lb));
        goto fn_exit;
    }


    if (local_rank == 0) {
        leader_comm = smpi_comm_get_leaders_comm(comm);
        if(leader_comm==MPI_COMM_NULL){
          leader_comm = MPI_COMM_WORLD;
        }
        leader_comm_size = smpi_comm_size(leader_comm);
        leader_comm_rank = smpi_comm_rank(leader_comm);
        tmp_buf=(void *)smpi_get_tmp_sendbuffer(count *
                            (MAX(extent, true_extent)));
        tmp_buf = (void *) ((char *) tmp_buf - true_lb);
    }
    if (sendbuf != MPI_IN_PLACE) {
        in_buf = (void *)sendbuf;
    } else {
        in_buf = recvbuf;
    }
    if (local_rank == 0) {
        out_buf = tmp_buf;
    } else {
        out_buf = NULL;
    }


    if(local_size > 1) {
        /* Lets do the intra-node reduce operations, if we have more than one
         * process in the node */

        /*Fix the input and outbuf buffers for the intra-node reduce.
         *Node leaders will have the reduced data in tmp_buf after
         *this step*/
        if (MV2_Reduce_intra_function == & MPIR_Reduce_shmem_MV2)
        {
            if (is_commutative == 1
                && (count * (MAX(extent, true_extent)) < SHMEM_COLL_BLOCK_SIZE)) {
                    mpi_errno = MV2_Reduce_intra_function(in_buf, out_buf, count,
                                      datatype, op,
                                      intra_node_root, shmem_comm);
            } else {
                    mpi_errno = MPIR_Reduce_intra_knomial_wrapper_MV2(in_buf, out_buf, count,
                                      datatype, op,
                                      intra_node_root, shmem_comm);
            }
        } else {

            mpi_errno = MV2_Reduce_intra_function(in_buf, out_buf, count,
                                      datatype, op,
                                      intra_node_root, shmem_comm);
        }
    } else {
        /* Alone on this node: nothing to reduce locally, so the caller's
           input doubles as the "node result" and the scratch area goes. */
        smpi_free_tmp_buffer((void *) ((char *) tmp_buf + true_lb));
        tmp_buf = in_buf;
    }

    /* Now work on the inter-leader phase. Data is in tmp_buf */
    if (local_rank == 0 && leader_comm_size > 1) {
        /*The leader of root will have the global reduced data in tmp_buf
           or recv_buf
           at the end of the reduce */
        if (leader_comm_rank == leader_root) {
            if (my_rank == root) {
                /* I am the root of the leader-comm, and the
                 * root of the reduce op. So, I will write the
                 * final result directly into my recvbuf */
                if(tmp_buf != recvbuf) {
                    in_buf = tmp_buf;
                    out_buf = recvbuf;
                } else {
                    /* Input and output would alias: reduce from a copy. */
                    leader_scratch = (char *)smpi_get_tmp_sendbuffer(count*
                                       smpi_datatype_get_extent(datatype));
                    in_buf = leader_scratch;
                    smpi_datatype_copy(tmp_buf, count, datatype,
                                       in_buf, count, datatype);
                    //in_buf = MPI_IN_PLACE;
                    out_buf = recvbuf;
                }
            } else {
                /* Result must end up in tmp_buf, so reduce from a copy. */
                leader_scratch = (char *)smpi_get_tmp_sendbuffer(count*
                                       smpi_datatype_get_extent(datatype));
                in_buf = leader_scratch;
                smpi_datatype_copy(tmp_buf, count, datatype,
                                        in_buf, count, datatype);
                //in_buf = MPI_IN_PLACE;
                out_buf = tmp_buf;
            }
        } else {
            in_buf = tmp_buf;
            out_buf = NULL;
        }

        /* inter-leader communication  */
        mpi_errno = MV2_Reduce_function(in_buf, out_buf, count,
                              datatype, op,
                              leader_root, leader_comm);

    }

    if (local_size > 1) {
        /* Send the message to the root if the leader is not the
         * root of the reduce operation. The reduced data is in tmp_buf */
        if ((local_rank == 0) && (root != my_rank)
            && (leader_root == leader_comm_rank)) {
            smpi_mpi_send(tmp_buf, count, datatype, root,
                                     COLL_TAG_REDUCE+1, comm);
        }
        if ((local_rank != 0) && (root == my_rank)) {
            smpi_mpi_recv(recvbuf, count, datatype,
                                     leader_of_root,
                                     COLL_TAG_REDUCE+1, comm,
                                     MPI_STATUS_IGNORE);
        }
        /* Only node leaders allocated tmp_buf; on other ranks it is still
           NULL and must not be offset and released. */
        if (tmp_buf != NULL) {
            smpi_free_tmp_buffer((void *) ((char *) tmp_buf + true_lb));
        }
    }

    /* Release the inter-leader scratch copy wherever it was allocated,
       including on single-process nodes. */
    if (leader_scratch != NULL) {
        smpi_free_tmp_buffer(leader_scratch);
    }

  fn_exit:
    return mpi_errno;
}