int smpi_coll_tuned_bcast_mvapich2_inter_node(void *buffer,
                                                 int count,
                                                 MPI_Datatype datatype,
                                                 int root,
                                                 MPI_Comm  comm)
{
    int rank;
    int mpi_errno = MPI_SUCCESS;
    MPI_Comm shmem_comm, leader_comm;
    int local_rank, local_size, global_rank = -1;
    int leader_root, leader_of_root;


    rank = smpi_comm_rank(comm);
    //comm_size = smpi_comm_size(comm);


    if (MV2_Bcast_function==NULL){
      MV2_Bcast_function=smpi_coll_tuned_bcast_mpich;
    }
    
    if (MV2_Bcast_intra_node_function==NULL){
      MV2_Bcast_intra_node_function= smpi_coll_tuned_bcast_mpich;
    }
    
    if(smpi_comm_get_leaders_comm(comm)==MPI_COMM_NULL){
      smpi_comm_init_smp(comm);
    }
    
    shmem_comm = smpi_comm_get_intra_comm(comm);
    local_rank = smpi_comm_rank(shmem_comm);
    local_size = smpi_comm_size(shmem_comm);

    leader_comm = smpi_comm_get_leaders_comm(comm);

    if ((local_rank == 0) && (local_size > 1)) {
      global_rank = smpi_comm_rank(leader_comm);
    }

    int* leaders_map = smpi_comm_get_leaders_map(comm);
    leader_of_root = smpi_group_rank(smpi_comm_group(comm),leaders_map[root]);
    leader_root = smpi_group_rank(smpi_comm_group(leader_comm),leaders_map[root]);
    
    
    if (local_size > 1) {
        if ((local_rank == 0) && (root != rank) && (leader_root == global_rank)) {
            smpi_mpi_recv(buffer, count, datatype, root,
                                     COLL_TAG_BCAST, comm, MPI_STATUS_IGNORE);
        }
        if ((local_rank != 0) && (root == rank)) {
            smpi_mpi_send(buffer, count, datatype,
                                     leader_of_root, COLL_TAG_BCAST, comm);
        }
    }
#if defined(_MCST_SUPPORT_)
    if (comm_ptr->ch.is_mcast_ok) {
        mpi_errno = MPIR_Mcast_inter_node_MV2(buffer, count, datatype, root, comm_ptr,
                                              errflag);
        if (mpi_errno == MPI_SUCCESS) {
            goto fn_exit;
        } else {
            goto fn_fail;
        }
    }
#endif
/*
    if (local_rank == 0) {
        leader_comm = smpi_comm_get_leaders_comm(comm);
        root = leader_root;
    }

    if (MV2_Bcast_function == &MPIR_Pipelined_Bcast_MV2) {
        mpi_errno = MPIR_Pipelined_Bcast_MV2(buffer, count, datatype,
                                             root, comm);
    } else if (MV2_Bcast_function == &MPIR_Bcast_scatter_ring_allgather_shm_MV2) {
        mpi_errno = MPIR_Bcast_scatter_ring_allgather_shm_MV2(buffer, count,
                                                              datatype, root,
                                                              comm);
    } else */{
        if (local_rank == 0) {
      /*      if (MV2_Bcast_function == &MPIR_Knomial_Bcast_inter_node_wrapper_MV2) {
                mpi_errno = MPIR_Knomial_Bcast_inter_node_wrapper_MV2(buffer, count,
                                                              datatype, root,
                                                              comm);
            } else {*/
                mpi_errno = MV2_Bcast_function(buffer, count, datatype,
                                               leader_root, leader_comm);
          //  }
        }
    }

    return mpi_errno;
}
int Coll_bcast_mvapich2::bcast(void *buffer,
    int count,
    MPI_Datatype datatype,
    int root, MPI_Comm comm)
{
    int mpi_errno = MPI_SUCCESS;
    int comm_size/*, rank*/;
    int two_level_bcast = 1;
    long nbytes = 0;
    int range = 0;
    int range_threshold = 0;
    int range_threshold_intra = 0;
    // int is_homogeneous, is_contig;
    MPI_Aint type_size;
    //, position;
    // unsigned char *tmp_buf = NULL;
    MPI_Comm shmem_comm;
    //MPID_Datatype *dtp;

    if (count == 0)
        return MPI_SUCCESS;
    if(comm->get_leaders_comm()==MPI_COMM_NULL){
      comm->init_smp();
    }
    if (not mv2_bcast_thresholds_table)
      init_mv2_bcast_tables_stampede();
    comm_size = comm->size();
    //rank = comm->rank();

    //is_contig=1;
/*    if (HANDLE_GET_KIND(datatype) == HANDLE_KIND_BUILTIN)*/
/*        is_contig = 1;*/
/*    else {*/
/*        MPID_Datatype_get_ptr(datatype, dtp);*/
/*        is_contig = dtp->is_contig;*/
/*    }*/

    // is_homogeneous = 1;

    /* MPI_Type_size() might not give the accurate size of the packed
     * datatype for heterogeneous systems (because of padding, encoding,
     * etc). On the other hand, MPI_Pack_size() can become very
     * expensive, depending on the implementation, especially for
     * heterogeneous systems. We want to use MPI_Type_size() wherever
     * possible, and MPI_Pack_size() in other places.
     */
    //if (is_homogeneous) {
        type_size=datatype->size();

   /* } else {
        MPIR_Pack_size_impl(1, datatype, &type_size);
    }*/
    nbytes =  (count) * (type_size);

    /* Search for the corresponding system size inside the tuning table */
    while ((range < (mv2_size_bcast_tuning_table - 1)) &&
           (comm_size > mv2_bcast_thresholds_table[range].numproc)) {
        range++;
    }
    /* Search for corresponding inter-leader function */
    while ((range_threshold < (mv2_bcast_thresholds_table[range].size_inter_table - 1))
           && (nbytes >
               mv2_bcast_thresholds_table[range].inter_leader[range_threshold].max)
           && (mv2_bcast_thresholds_table[range].inter_leader[range_threshold].max != -1)) {
        range_threshold++;
    }

    /* Search for corresponding intra-node function */
    while ((range_threshold_intra <
            (mv2_bcast_thresholds_table[range].size_intra_table - 1))
           && (nbytes >
               mv2_bcast_thresholds_table[range].intra_node[range_threshold_intra].max)
           && (mv2_bcast_thresholds_table[range].intra_node[range_threshold_intra].max !=
               -1)) {
        range_threshold_intra++;
    }

    MV2_Bcast_function =
        mv2_bcast_thresholds_table[range].inter_leader[range_threshold].
        MV2_pt_Bcast_function;

    MV2_Bcast_intra_node_function =
        mv2_bcast_thresholds_table[range].
        intra_node[range_threshold_intra].MV2_pt_Bcast_function;

/*    if (mv2_user_bcast_intra == NULL && */
/*            MV2_Bcast_intra_node_function == &MPIR_Knomial_Bcast_intra_node_MV2) {*/
/*            MV2_Bcast_intra_node_function = &MPIR_Shmem_Bcast_MV2;*/
/*    }*/

    if (mv2_bcast_thresholds_table[range].inter_leader[range_threshold].
        zcpy_pipelined_knomial_factor != -1) {
        zcpy_knomial_factor =
            mv2_bcast_thresholds_table[range].inter_leader[range_threshold].
            zcpy_pipelined_knomial_factor;
    }

    if (mv2_pipelined_zcpy_knomial_factor != -1) {
        zcpy_knomial_factor = mv2_pipelined_zcpy_knomial_factor;
    }

    if(MV2_Bcast_intra_node_function == NULL) {
        /* if tuning table do not have any intra selection, set func pointer to
        ** default one for mcast intra node */
        MV2_Bcast_intra_node_function = &MPIR_Shmem_Bcast_MV2;
    }

    /* Set value of pipeline segment size */
    bcast_segment_size = mv2_bcast_thresholds_table[range].bcast_segment_size;

    /* Set value of inter node knomial factor */
    mv2_inter_node_knomial_factor = mv2_bcast_thresholds_table[range].inter_node_knomial_factor;

    /* Set value of intra node knomial factor */
    mv2_intra_node_knomial_factor = mv2_bcast_thresholds_table[range].intra_node_knomial_factor;

    /* Check if we will use a two level algorithm or not */
    two_level_bcast =
#if defined(_MCST_SUPPORT_)
        mv2_bcast_thresholds_table[range].is_two_level_bcast[range_threshold]
        || comm->ch.is_mcast_ok;
#else
        mv2_bcast_thresholds_table[range].is_two_level_bcast[range_threshold];
#endif
     if (two_level_bcast == 1) {
       // if (not is_contig || not is_homogeneous) {
//   tmp_buf = smpi_get_tmp_sendbuffer(nbytes);

/*            position = 0;*/
/*            if (rank == root) {*/
/*                mpi_errno =*/
/*                    MPIR_Pack_impl(buffer, count, datatype, tmp_buf, nbytes, &position);*/
/*                if (mpi_errno)*/
/*                    MPIU_ERR_POP(mpi_errno);*/
/*            }*/
// }
#ifdef CHANNEL_MRAIL_GEN2
        if ((mv2_enable_zcpy_bcast == 1) &&
              (&MPIR_Pipelined_Bcast_Zcpy_MV2 == MV2_Bcast_function)) {
          // if (not is_contig || not is_homogeneous) {
          //   mpi_errno = MPIR_Pipelined_Bcast_Zcpy_MV2(tmp_buf, nbytes, MPI_BYTE, root, comm);
          // } else {
                mpi_errno = MPIR_Pipelined_Bcast_Zcpy_MV2(buffer, count, datatype,
                                                 root, comm);
          // }
        } else
#endif /* defined(CHANNEL_MRAIL_GEN2) */
        {
            shmem_comm = comm->get_intra_comm();
            // if (not is_contig || not is_homogeneous) {
            //   MPIR_Bcast_tune_inter_node_helper_MV2(tmp_buf, nbytes, MPI_BYTE, root, comm);
            // } else {
              MPIR_Bcast_tune_inter_node_helper_MV2(buffer, count, datatype, root, comm);
            // }

            /* We are now done with the inter-node phase */


                    root = INTRA_NODE_ROOT;

                    // if (not is_contig || not is_homogeneous) {
                    //       mpi_errno = MV2_Bcast_intra_node_function(tmp_buf, nbytes, MPI_BYTE, root, shmem_comm);
                    // } else {
                    mpi_errno = MV2_Bcast_intra_node_function(buffer, count,
                                                              datatype, root, shmem_comm);

                    // }
        }
        /*        if (not is_contig || not is_homogeneous) {*/
        /*            if (rank != root) {*/
        /*                position = 0;*/
        /*                mpi_errno = MPIR_Unpack_impl(tmp_buf, nbytes, &position, buffer,*/
        /*                                             count, datatype);*/
        /*            }*/
        /*        }*/
    } else {
        /* We use Knomial for intra node */
        MV2_Bcast_intra_node_function = &MPIR_Knomial_Bcast_intra_node_MV2;
/*        if (mv2_enable_shmem_bcast == 0) {*/
            /* Fall back to non-tuned version */
/*            MPIR_Bcast_intra_MV2(buffer, count, datatype, root, comm);*/
/*        } else {*/
            mpi_errno = MV2_Bcast_function(buffer, count, datatype, root,
                                           comm);

/*        }*/
    }


    return mpi_errno;

}