Example #1
0
int smpi_coll_tuned_gather_ompi(void *sbuf, int scount, 
                                           MPI_Datatype sdtype,
                                           void* rbuf, int rcount, 
                                           MPI_Datatype rdtype, 
                                           int root,
                                           MPI_Comm  comm
                                           )
{
    //const int large_segment_size = 32768;
    //const int small_segment_size = 1024;

    //const size_t large_block_size = 92160;
    const size_t intermediate_block_size = 6000;
    const size_t small_block_size = 1024;

    const int large_communicator_size = 60;
    const int small_communicator_size = 10;

    int communicator_size, rank;
    size_t dsize, block_size;

    XBT_DEBUG("smpi_coll_tuned_gather_ompi");

    communicator_size = smpi_comm_size(comm);
    rank = smpi_comm_rank(comm);

    // Determine block size 
    if (rank == root) {
        dsize = smpi_datatype_size(rdtype);
        block_size = dsize * rcount;
    } else {
        dsize = smpi_datatype_size(sdtype);
        block_size = dsize * scount;
    }

/*    if (block_size > large_block_size) {*/
/*        return smpi_coll_tuned_gather_ompi_linear_sync (sbuf, scount, sdtype, */
/*                                                         rbuf, rcount, rdtype, */
/*                                                         root, comm);*/

/*    } else*/ if (block_size > intermediate_block_size) {
        return smpi_coll_tuned_gather_ompi_linear_sync (sbuf, scount, sdtype, 
                                                         rbuf, rcount, rdtype, 
                                                         root, comm);

    } else if ((communicator_size > large_communicator_size) ||
               ((communicator_size > small_communicator_size) &&
                (block_size < small_block_size))) {
        return smpi_coll_tuned_gather_ompi_binomial (sbuf, scount, sdtype, 
                                                      rbuf, rcount, rdtype, 
                                                      root, comm);

    }
    // Otherwise, use basic linear 
    return smpi_coll_tuned_gather_ompi_basic_linear (sbuf, scount, sdtype, 
                                                      rbuf, rcount, rdtype, 
                                                      root, comm);
}
Example #2
0
int smpi_coll_tuned_alltoall_ompi( void *sbuf, int scount, 
                                             MPI_Datatype sdtype,
                                             void* rbuf, int rcount, 
                                             MPI_Datatype rdtype, 
                                             MPI_Comm comm)
{
    int communicator_size;
    size_t dsize, block_dsize;
    communicator_size = smpi_comm_size(comm);

    /* Decision function based on measurement on Grig cluster at 
       the University of Tennessee (2GB MX) up to 64 nodes.
       Has better performance for messages of intermediate sizes than the old one */
    /* determine block size */
    dsize = smpi_datatype_size(sdtype);
    block_dsize = dsize * scount;

    if ((block_dsize < 200) && (communicator_size > 12)) {
        return smpi_coll_tuned_alltoall_bruck(sbuf, scount, sdtype, 
                                                    rbuf, rcount, rdtype,
                                                    comm);

    } else if (block_dsize < 3000) {
        return smpi_coll_tuned_alltoall_basic_linear(sbuf, scount, sdtype, 
                                                           rbuf, rcount, rdtype, 
                                                           comm);
    }

    return smpi_coll_tuned_alltoall_ring (sbuf, scount, sdtype, 
                                                    rbuf, rcount, rdtype,
                                                    comm);
}
Example #3
0
static void action_Isend(const char *const *action)
{
  CHECK_ACTION_PARAMS(action, 2, 1);
  int to = atoi(action[2]);
  double size=parse_double(action[3]);
  double clock = smpi_process_simulated_elapsed();
  MPI_Request request;

  if(action[4]) MPI_CURRENT_TYPE=decode_datatype(action[4]);
  else MPI_CURRENT_TYPE= MPI_DEFAULT_TYPE;

  int rank = smpi_process_index();
  int dst_traced = smpi_group_rank(smpi_comm_group(MPI_COMM_WORLD), to);
  instr_extra_data extra = xbt_new0(s_instr_extra_data_t,1);
  extra->type = TRACING_ISEND;
  extra->send_size = size;
  extra->src = rank;
  extra->dst = dst_traced;
  extra->datatype1 = encode_datatype(MPI_CURRENT_TYPE, NULL);
  TRACE_smpi_ptp_in(rank, rank, dst_traced, __FUNCTION__, extra);
  if (!TRACE_smpi_view_internals()) {
    TRACE_smpi_send(rank, rank, dst_traced, size*smpi_datatype_size(MPI_CURRENT_TYPE));
  }

  request = smpi_mpi_isend(NULL, size, MPI_CURRENT_TYPE, to, 0,MPI_COMM_WORLD);

  TRACE_smpi_ptp_out(rank, rank, dst_traced, __FUNCTION__);
  request->send = 1;

  xbt_dynar_push(get_reqq_self(),&request);

  log_timed_action (action, clock);
}
Example #4
0
int smpi_coll_tuned_reduce_ompi_binary( void *sendbuf, void *recvbuf,
                                         int count, MPI_Datatype datatype,
                                         MPI_Op  op, int root,
                                         MPI_Comm  comm)
{
    uint32_t segsize;
    int segcount = count;
    size_t typelng;



    /**
     * Determine number of segments and number of elements
     * sent per operation
     */
    typelng=smpi_datatype_size( datatype );

        // Binary_32K 
    segsize = 32*1024;

    XBT_DEBUG("coll:tuned:reduce_intra_binary rank %d ss %5d",
                 smpi_comm_rank(comm), segsize);

    COLL_TUNED_COMPUTED_SEGCOUNT( segsize, typelng, segcount );

    return smpi_coll_tuned_ompi_reduce_generic( sendbuf, recvbuf, count, datatype, 
                                           op, root, comm, 
                                           ompi_coll_tuned_topo_build_tree(2, comm, root), 
                                           segcount, 0);
}
Example #5
0
int smpi_coll_tuned_alltoall_ompi2(void *sendbuf, int sendcount,
                                   MPI_Datatype sendtype, void *recvbuf,
                                   int recvcount, MPI_Datatype recvtype,
                                   MPI_Comm comm)
{
  int size, sendsize;  
  size = smpi_comm_size(comm);  
  sendsize = smpi_datatype_size(sendtype) * sendcount;  
  if (sendsize < 200 && size > 12) {
    return
        smpi_coll_tuned_alltoall_bruck(sendbuf, sendcount, sendtype,
                                       recvbuf, recvcount, recvtype,
                                       comm);
  } else if (sendsize < 3000) {
    return
        smpi_coll_tuned_alltoall_basic_linear(sendbuf, sendcount,
                                              sendtype, recvbuf,
                                              recvcount, recvtype, comm);
  } else {
    return
        smpi_coll_tuned_alltoall_ring(sendbuf, sendcount, sendtype,
                                      recvbuf, recvcount, recvtype,
                                      comm);
  }
}
Example #6
0
int smpi_coll_tuned_scatter_ompi(void *sbuf, int scount, 
                                            MPI_Datatype sdtype,
                                            void* rbuf, int rcount, 
                                            MPI_Datatype rdtype, 
                                            int root, MPI_Comm  comm
                                            )
{
    const size_t small_block_size = 300;
    const int small_comm_size = 10;
    int communicator_size, rank;
    size_t dsize, block_size;

    XBT_DEBUG("smpi_coll_tuned_scatter_ompi");

    communicator_size = smpi_comm_size(comm);
    rank = smpi_comm_rank(comm);
    // Determine block size 
    if (root == rank) {
        dsize=smpi_datatype_size(sdtype);
        block_size = dsize * scount;
    } else {
        dsize=smpi_datatype_size(rdtype);
        block_size = dsize * rcount;
    } 

    if ((communicator_size > small_comm_size) &&
        (block_size < small_block_size)) {
        if(rank!=root){
            sbuf=xbt_malloc(rcount*smpi_datatype_get_extent(rdtype));
            scount=rcount;
            sdtype=rdtype;
        }
        int ret=smpi_coll_tuned_scatter_ompi_binomial (sbuf, scount, sdtype,
            rbuf, rcount, rdtype,
            root, comm);
        if(rank!=root){
            xbt_free(sbuf);
        }
        return ret;
    }
    return smpi_coll_tuned_scatter_ompi_basic_linear (sbuf, scount, sdtype, 
                                                       rbuf, rcount, rdtype, 
                                                       root, comm);
}
Example #7
0
int smpi_coll_tuned_reduce_scatter_ompi( void *sbuf, void *rbuf,
                                                    int *rcounts,
                                                    MPI_Datatype dtype,
                                                    MPI_Op  op,
                                                    MPI_Comm  comm
                                                    )
{
    int comm_size, i, pow2;
    size_t total_message_size, dsize;
    const double a = 0.0012;
    const double b = 8.0;
    const size_t small_message_size = 12 * 1024;
    const size_t large_message_size = 256 * 1024;
    int zerocounts = 0;

    XBT_DEBUG("smpi_coll_tuned_reduce_scatter_ompi");
    
    comm_size = smpi_comm_size(comm);
    // We need data size for decision function 
    dsize=smpi_datatype_size(dtype);
    total_message_size = 0;
    for (i = 0; i < comm_size; i++) { 
        total_message_size += rcounts[i];
        if (0 == rcounts[i]) {
            zerocounts = 1;
        }
    }

    if( !smpi_op_is_commute(op) || (zerocounts)) {
        smpi_mpi_reduce_scatter (sbuf, rbuf, rcounts, 
                                                                    dtype, op, 
                                                                    comm); 
        return MPI_SUCCESS;
    }
   
    total_message_size *= dsize;

    // compute the nearest power of 2 
    for (pow2 = 1; pow2 < comm_size; pow2 <<= 1);

    if ((total_message_size <= small_message_size) ||
        ((total_message_size <= large_message_size) && (pow2 == comm_size)) ||
        (comm_size >= a * total_message_size + b)) {
        return 
            smpi_coll_tuned_reduce_scatter_ompi_basic_recursivehalving(sbuf, rbuf, rcounts,
                                                                        dtype, op,
                                                                        comm);
    } 
    return smpi_coll_tuned_reduce_scatter_ompi_ring(sbuf, rbuf, rcounts,
                                                     dtype, op,
                                                     comm);



}
Example #8
0
int smpi_coll_tuned_allgatherv_ompi(void *sbuf, int scount, 
                                               MPI_Datatype sdtype,
                                               void* rbuf, int *rcounts, 
                                               int *rdispls,
                                               MPI_Datatype rdtype, 
                                               MPI_Comm  comm
                                               )
{
    int i;
    int communicator_size;
    size_t dsize, total_dsize;
    
    communicator_size = smpi_comm_size(comm);
    
    /* Special case for 2 processes */
    if (communicator_size == 2) {
        return smpi_coll_tuned_allgatherv_pair(sbuf, scount, sdtype,
                                                           rbuf, rcounts, rdispls, rdtype, 
                                                           comm);
    }
    
    /* Determine complete data size */
    dsize=smpi_datatype_size(sdtype);
    total_dsize = 0;
    for (i = 0; i < communicator_size; i++) {
        total_dsize += dsize * rcounts[i];
    }
    
    /* Decision based on allgather decision.   */
    if (total_dsize < 50000) {
/*        return smpi_coll_tuned_allgatherv_intra_bruck(sbuf, scount, sdtype, 
                                                      rbuf, rcounts, rdispls, rdtype, 
                                                      comm, module);*/
    return smpi_coll_tuned_allgatherv_ring(sbuf, scount, sdtype, 
                                                      rbuf, rcounts, rdispls, rdtype, 
                                                      comm);

    } else {
        if (communicator_size % 2) {
            return smpi_coll_tuned_allgatherv_ring(sbuf, scount, sdtype, 
                                                         rbuf, rcounts, rdispls, rdtype, 
                                                         comm);
        } else {
            return  smpi_coll_tuned_allgatherv_ompi_neighborexchange(sbuf, scount, sdtype,
                                                                      rbuf, rcounts, rdispls, rdtype, 
                                                                      comm);
        }
    }
}
Example #9
0
int smpi_coll_tuned_reduce_ompi_pipeline( void *sendbuf, void *recvbuf,
                                           int count, MPI_Datatype datatype,
                                           MPI_Op  op, int root,
                                           MPI_Comm  comm  )
{

    uint32_t segsize;
    int segcount = count;
    size_t typelng;
//    COLL_TUNED_UPDATE_PIPELINE( comm, tuned_module, root );

    /**
     * Determine number of segments and number of elements
     * sent per operation
     */
    const double a2 =  0.0410 / 1024.0; /* [1/B] */
    const double b2 =  9.7128;
    const double a4 =  0.0033 / 1024.0; /* [1/B] */
    const double b4 =  1.6761;
    typelng= smpi_datatype_size( datatype);
    int communicator_size = smpi_comm_size(comm);
    size_t message_size = typelng * count; 

    if (communicator_size > (a2 * message_size + b2)) {
        // Pipeline_1K 
        segsize = 1024;
    }else if (communicator_size > (a4 * message_size + b4)) {
        // Pipeline_32K 
        segsize = 32*1024;
    } else {
        // Pipeline_64K 
        segsize = 64*1024;
    }

    XBT_DEBUG("coll:tuned:reduce_intra_pipeline rank %d ss %5d",
                 smpi_comm_rank(comm), segsize);

    COLL_TUNED_COMPUTED_SEGCOUNT( segsize, typelng, segcount );

    return smpi_coll_tuned_ompi_reduce_generic( sendbuf, recvbuf, count, datatype, 
                                           op, root, comm,
                                           ompi_coll_tuned_topo_build_chain( 1, comm, root), 
                                           segcount, 0);
}
Example #10
0
int smpi_coll_tuned_allreduce_ompi(void *sbuf, void *rbuf, int count,
                        MPI_Datatype dtype, MPI_Op op, MPI_Comm comm)
{
    size_t dsize, block_dsize;
    int comm_size = smpi_comm_size(comm);
    const size_t intermediate_message = 10000;

    /**
     * Decision function based on MX results from the Grig cluster at UTK.
     * 
     * Currently, linear, recursive doubling, and nonoverlapping algorithms 
     * can handle both commutative and non-commutative operations.
     * Ring algorithm does not support non-commutative operations.
     */
    dsize = smpi_datatype_size(dtype);
    block_dsize = dsize * count;

    if (block_dsize < intermediate_message) {
        return (smpi_coll_tuned_allreduce_rdb (sbuf, rbuf, 
                                                                   count, dtype,
                                                                   op, comm));
    } 

    if( smpi_op_is_commute(op) && (count > comm_size) ) {
        const size_t segment_size = 1 << 20; /* 1 MB */
        if ((comm_size * segment_size >= block_dsize)) {
            //FIXME: ok, these are not the right algorithms, try to find closer ones
            // lr is a good match for allreduce_ring (difference is mainly the use of sendrecv)
            return smpi_coll_tuned_allreduce_lr(sbuf, rbuf, count, dtype,
                                              op, comm);
        } else {
           return (smpi_coll_tuned_allreduce_ompi_ring_segmented (sbuf, rbuf,
                                                                    count, dtype, 
                                                                    op, comm 
                                                                    /*segment_size*/));
        }
    }

    return (smpi_coll_tuned_allreduce_redbcast(sbuf, rbuf, count, 
                                                            dtype, op, comm));
}
Example #11
0
int smpi_coll_tuned_reduce_ompi_binomial( void *sendbuf, void *recvbuf,
                                           int count, MPI_Datatype datatype,
                                           MPI_Op  op, int root,
                                           MPI_Comm  comm)
{

    uint32_t segsize=0;
    int segcount = count;
    size_t typelng;

    const double a1 =  0.6016 / 1024.0; /* [1/B] */
    const double b1 =  1.3496;

//    COLL_TUNED_UPDATE_IN_ORDER_BMTREE( comm, tuned_module, root );

    /**
     * Determine number of segments and number of elements
     * sent per operation
     */
    typelng= smpi_datatype_size( datatype);
    int communicator_size = smpi_comm_size(comm);
    size_t message_size = typelng * count; 
    if (((communicator_size < 8) && (message_size < 20480)) ||
               (message_size < 2048) || (count <= 1)) {
        /* Binomial_0K */
        segsize = 0;
    } else if (communicator_size > (a1 * message_size + b1)) {
        // Binomial_1K 
        segsize = 1024;
    }

    XBT_DEBUG("coll:tuned:reduce_intra_binomial rank %d ss %5d",
                 smpi_comm_rank(comm), segsize);
    COLL_TUNED_COMPUTED_SEGCOUNT( segsize, typelng, segcount );

    return smpi_coll_tuned_ompi_reduce_generic( sendbuf, recvbuf, count, datatype, 
                                           op, root, comm, 
                                           ompi_coll_tuned_topo_build_in_order_bmtree(comm, root), 
                                           segcount, 0);
}
Example #12
0
int smpi_coll_tuned_gather_mvapich2_two_level(void *sendbuf,
                                            int sendcnt,
                                            MPI_Datatype sendtype,
                                            void *recvbuf,
                                            int recvcnt,
                                            MPI_Datatype recvtype,
                                            int root,
                                            MPI_Comm comm)
{
    void *leader_gather_buf = NULL;
    int comm_size, rank;
    int local_rank, local_size;
    int leader_comm_rank = -1, leader_comm_size = 0;
    int mpi_errno = MPI_SUCCESS;
    int recvtype_size = 0, sendtype_size = 0, nbytes=0;
    int leader_root, leader_of_root;
    MPI_Status status;
    MPI_Aint sendtype_extent = 0, recvtype_extent = 0;  /* Datatype extent */
    MPI_Aint true_lb, sendtype_true_extent, recvtype_true_extent;
    MPI_Comm shmem_comm, leader_comm;
    void* tmp_buf = NULL;
    

    //if not set (use of the algo directly, without mvapich2 selector)
    if(MV2_Gather_intra_node_function==NULL)
      MV2_Gather_intra_node_function=smpi_coll_tuned_gather_mpich;
    
    if(smpi_comm_get_leaders_comm(comm)==MPI_COMM_NULL){
      smpi_comm_init_smp(comm);
    }
    comm_size = smpi_comm_size(comm);
    rank = smpi_comm_rank(comm);

    if (((rank == root) && (recvcnt == 0)) ||
        ((rank != root) && (sendcnt == 0))) {
        return MPI_SUCCESS;
    }

    if (sendtype != MPI_DATATYPE_NULL) {
        sendtype_extent=smpi_datatype_get_extent(sendtype);
        sendtype_size=smpi_datatype_size(sendtype);
        smpi_datatype_extent(sendtype, &true_lb,
                                       &sendtype_true_extent);
    }
    if (recvtype != MPI_DATATYPE_NULL) {
        recvtype_extent=smpi_datatype_get_extent(recvtype);
        recvtype_size=smpi_datatype_size(recvtype);
        smpi_datatype_extent(recvtype, &true_lb,
                                       &recvtype_true_extent);
    }

    /* extract the rank,size information for the intra-node
     * communicator */
    shmem_comm = smpi_comm_get_intra_comm(comm);
    local_rank = smpi_comm_rank(shmem_comm);
    local_size = smpi_comm_size(shmem_comm);
    
    if (local_rank == 0) {
        /* Node leader. Extract the rank, size information for the leader
         * communicator */
        leader_comm = smpi_comm_get_leaders_comm(comm);
        if(leader_comm==MPI_COMM_NULL){
          leader_comm = MPI_COMM_WORLD;
        }
        leader_comm_size = smpi_comm_size(leader_comm);
        leader_comm_rank = smpi_comm_rank(leader_comm);
    }

    if (rank == root) {
        nbytes = recvcnt * recvtype_size;

    } else {
        nbytes = sendcnt * sendtype_size;
    }

#if defined(_SMP_LIMIC_)
     if((g_use_limic2_coll) && (shmem_commptr->ch.use_intra_sock_comm == 1) 
         && (use_limic_gather)
         &&((num_scheme == USE_GATHER_PT_PT_BINOMIAL) 
            || (num_scheme == USE_GATHER_PT_PT_DIRECT)
            ||(num_scheme == USE_GATHER_PT_LINEAR_BINOMIAL) 
            || (num_scheme == USE_GATHER_PT_LINEAR_DIRECT)
            || (num_scheme == USE_GATHER_LINEAR_PT_BINOMIAL)
            || (num_scheme == USE_GATHER_LINEAR_PT_DIRECT)
            || (num_scheme == USE_GATHER_LINEAR_LINEAR)
            || (num_scheme == USE_GATHER_SINGLE_LEADER))) {
            
            mpi_errno = MV2_Gather_intra_node_function(sendbuf, sendcnt, sendtype,
                                                    recvbuf, recvcnt,recvtype, 
                                                    root, comm);
     } else

#endif/*#if defined(_SMP_LIMIC_)*/    
    {
        if (local_rank == 0) {
            /* Node leader, allocate tmp_buffer */
            if (rank == root) {
                tmp_buf = smpi_get_tmp_recvbuffer(recvcnt * MAX(recvtype_extent,
                            recvtype_true_extent) * local_size);
            } else {
                tmp_buf = smpi_get_tmp_sendbuffer(sendcnt * MAX(sendtype_extent,
                            sendtype_true_extent) *
                        local_size);
            }
            if (tmp_buf == NULL) {
                mpi_errno = MPI_ERR_OTHER;
                return mpi_errno;
            }
        }
         /*while testing mpich2 gather test, we see that
         * which basically splits the comm, and we come to
         * a point, where use_intra_sock_comm == 0, but if the 
         * intra node function is MPIR_Intra_node_LIMIC_Gather_MV2,
         * it would use the intra sock comm. In such cases, we 
         * fallback to binomial as a default case.*/
#if defined(_SMP_LIMIC_)         
        if(*MV2_Gather_intra_node_function == MPIR_Intra_node_LIMIC_Gather_MV2) {

            mpi_errno  = MPIR_pt_pt_intra_gather(sendbuf,sendcnt, sendtype,
                                                 recvbuf, recvcnt, recvtype,
                                                 root, rank, 
                                                 tmp_buf, nbytes, 
                                                 TEMP_BUF_HAS_NO_DATA,
                                                 shmem_commptr,
                                                 MPIR_Gather_intra);
        } else
#endif
        {
            /*We are gathering the data into tmp_buf and the output
             * will be of MPI_BYTE datatype. Since the tmp_buf has no
             * local data, we pass is_data_avail = TEMP_BUF_HAS_NO_DATA*/
            mpi_errno  = MPIR_pt_pt_intra_gather(sendbuf,sendcnt, sendtype,
                                                 recvbuf, recvcnt, recvtype,
                                                 root, rank, 
                                                 tmp_buf, nbytes, 
                                                 TEMP_BUF_HAS_NO_DATA,
                                                 shmem_comm,
                                                 MV2_Gather_intra_node_function
                                                 );
        }
    }
    leader_comm = smpi_comm_get_leaders_comm(comm);
    int* leaders_map = smpi_comm_get_leaders_map(comm);
    leader_of_root = smpi_group_rank(smpi_comm_group(comm),leaders_map[root]);
    leader_root = smpi_group_rank(smpi_comm_group(leader_comm),leaders_map[root]);
    /* leader_root is the rank of the leader of the root in leader_comm. 
     * leader_root is to be used as the root of the inter-leader gather ops 
     */
    if (!smpi_comm_is_uniform(comm)) {
        if (local_rank == 0) {
            int *displs = NULL;
            int *recvcnts = NULL;
            int *node_sizes;
            int i = 0;
            /* Node leaders have all the data. But, different nodes can have
             * different number of processes. Do a Gather first to get the 
             * buffer lengths at each leader, followed by a Gatherv to move
             * the actual data */

            if (leader_comm_rank == leader_root && root != leader_of_root) {
                /* The root of the Gather operation is not a node-level 
                 * leader and this process's rank in the leader_comm 
                 * is the same as leader_root */
                if(rank == root) { 
                    leader_gather_buf = smpi_get_tmp_recvbuffer(recvcnt *
                                                MAX(recvtype_extent,
                                                recvtype_true_extent) *
                                                comm_size);
                } else { 
                    leader_gather_buf = smpi_get_tmp_sendbuffer(sendcnt *
                                                MAX(sendtype_extent,
                                                sendtype_true_extent) *
                                                comm_size);
                } 
                if (leader_gather_buf == NULL) {
                    mpi_errno =  MPI_ERR_OTHER;
                    return mpi_errno;
                }
            }

            node_sizes = smpi_comm_get_non_uniform_map(comm);

            if (leader_comm_rank == leader_root) {
                displs = xbt_malloc(sizeof (int) * leader_comm_size);
                recvcnts = xbt_malloc(sizeof (int) * leader_comm_size);
                if (!displs || !recvcnts) {
                    mpi_errno = MPI_ERR_OTHER;
                    return mpi_errno;
                }
            }

            if (root == leader_of_root) {
                /* The root of the gather operation is also the node 
                 * leader. Receive into recvbuf and we are done */
                if (leader_comm_rank == leader_root) {
                    recvcnts[0] = node_sizes[0] * recvcnt;
                    displs[0] = 0;

                    for (i = 1; i < leader_comm_size; i++) {
                        displs[i] = displs[i - 1] + node_sizes[i - 1] * recvcnt;
                        recvcnts[i] = node_sizes[i] * recvcnt;
                    }
                } 
                smpi_mpi_gatherv(tmp_buf,
                                         local_size * nbytes,
                                         MPI_BYTE, recvbuf, recvcnts,
                                         displs, recvtype,
                                         leader_root, leader_comm);
            } else {
                /* The root of the gather operation is not the node leader. 
                 * Receive into leader_gather_buf and then send 
                 * to the root */
                if (leader_comm_rank == leader_root) {
                    recvcnts[0] = node_sizes[0] * nbytes;
                    displs[0] = 0;

                    for (i = 1; i < leader_comm_size; i++) {
                        displs[i] = displs[i - 1] + node_sizes[i - 1] * nbytes;
                        recvcnts[i] = node_sizes[i] * nbytes;
                    }
                } 
                smpi_mpi_gatherv(tmp_buf, local_size * nbytes,
                                         MPI_BYTE, leader_gather_buf,
                                         recvcnts, displs, MPI_BYTE,
                                         leader_root, leader_comm);
            }
            if (leader_comm_rank == leader_root) {
                xbt_free(displs);
                xbt_free(recvcnts);
            }
        }
    } else {
        /* All nodes have the same number of processes. 
         * Just do one Gather to get all 
         * the data at the leader of the root process */
        if (local_rank == 0) {
            if (leader_comm_rank == leader_root && root != leader_of_root) {
                /* The root of the Gather operation is not a node-level leader
                 */
                leader_gather_buf = smpi_get_tmp_sendbuffer(nbytes * comm_size);
                if (leader_gather_buf == NULL) {
                    mpi_errno = MPI_ERR_OTHER;
                    return mpi_errno;
                }
            }
            if (root == leader_of_root) {
                mpi_errno = MPIR_Gather_MV2_Direct(tmp_buf,
                                                   nbytes * local_size,
                                                   MPI_BYTE, recvbuf,
                                                   recvcnt * local_size,
                                                   recvtype, leader_root,
                                                   leader_comm);
                 
            } else {
                mpi_errno = MPIR_Gather_MV2_Direct(tmp_buf, nbytes * local_size,
                                                   MPI_BYTE, leader_gather_buf,
                                                   nbytes * local_size,
                                                   MPI_BYTE, leader_root,
                                                   leader_comm);
            }
        }
    }
    if ((local_rank == 0) && (root != rank)
        && (leader_of_root == rank)) {
        smpi_mpi_send(leader_gather_buf,
                                 nbytes * comm_size, MPI_BYTE,
                                 root, COLL_TAG_GATHER, comm);
    }

    if (rank == root && local_rank != 0) {
        /* The root of the gather operation is not the node leader. Receive
         y* data from the node leader */
        smpi_mpi_recv(recvbuf, recvcnt * comm_size, recvtype,
                                 leader_of_root, COLL_TAG_GATHER, comm,
                                 &status);
    }

    /* check if multiple threads are calling this collective function */
    if (local_rank == 0 ) {
        if (tmp_buf != NULL) {
            smpi_free_tmp_buffer(tmp_buf);
        }
        if (leader_gather_buf != NULL) {
            smpi_free_tmp_buffer(leader_gather_buf);
        }
    }

    return (mpi_errno);
}
int smpi_coll_tuned_bcast_mvapich2_intra_node(void *buffer,
                         int count,
                         MPI_Datatype datatype,
                         int root, MPI_Comm  comm)
{
    int mpi_errno = MPI_SUCCESS;
    int comm_size;
    int two_level_bcast = 1;
    size_t nbytes = 0; 
    int is_homogeneous, is_contig;
    MPI_Aint type_size;
    void *tmp_buf = NULL;
    MPI_Comm shmem_comm;

    if (count == 0)
        return MPI_SUCCESS;
    if (MV2_Bcast_function==NULL){
      MV2_Bcast_function=smpi_coll_tuned_bcast_mpich;
    }
    
    if (MV2_Bcast_intra_node_function==NULL){
      MV2_Bcast_intra_node_function= smpi_coll_tuned_bcast_mpich;
    }
    
    if(smpi_comm_get_leaders_comm(comm)==MPI_COMM_NULL){
      smpi_comm_init_smp(comm);
    }
    
    comm_size = smpi_comm_size(comm);
   // rank = smpi_comm_rank(comm);
/*
    if (HANDLE_GET_KIND(datatype) == HANDLE_KIND_BUILTIN)*/
        is_contig = 1;
/*    else {
        MPID_Datatype_get_ptr(datatype, dtp);
        is_contig = dtp->is_contig;
    }
*/
    is_homogeneous = 1;
#ifdef MPID_HAS_HETERO
    if (comm_ptr->is_hetero)
        is_homogeneous = 0;
#endif

    /* MPI_Type_size() might not give the accurate size of the packed
     * datatype for heterogeneous systems (because of padding, encoding,
     * etc). On the other hand, MPI_Pack_size() can become very
     * expensive, depending on the implementation, especially for
     * heterogeneous systems. We want to use MPI_Type_size() wherever
     * possible, and MPI_Pack_size() in other places.
     */
    //if (is_homogeneous) {
        type_size=smpi_datatype_size(datatype);
    //}
/*    else {*/
/*        MPIR_Pack_size_impl(1, datatype, &type_size);*/
/*    }*/
    nbytes = (size_t) (count) * (type_size);
    if (comm_size <= mv2_bcast_two_level_system_size) {
        if (nbytes > mv2_bcast_short_msg && nbytes < mv2_bcast_large_msg) {
            two_level_bcast = 1;
        } else {
            two_level_bcast = 0;
        }
    }

    if (two_level_bcast == 1
#if defined(_MCST_SUPPORT_)
            || comm_ptr->ch.is_mcast_ok
#endif
        ) {

        if (!is_contig || !is_homogeneous) {
            tmp_buf=(void *)smpi_get_tmp_sendbuffer(nbytes);

            /* TODO: Pipeline the packing and communication */
           // position = 0;
/*            if (rank == root) {*/
/*                mpi_errno =*/
/*                    MPIR_Pack_impl(buffer, count, datatype, tmp_buf, nbytes, &position);*/
/*                if (mpi_errno)*/
/*                    MPIU_ERR_POP(mpi_errno);*/
/*            }*/
        }

        shmem_comm = smpi_comm_get_intra_comm(comm);
        if (!is_contig || !is_homogeneous) {
            mpi_errno =
                MPIR_Bcast_inter_node_helper_MV2(tmp_buf, nbytes, MPI_BYTE,
                                                 root, comm);
        } else {
            mpi_errno =
                MPIR_Bcast_inter_node_helper_MV2(buffer, count, datatype, root,
                                                 comm);
        }

        /* We are now done with the inter-node phase */
            if (nbytes <= mv2_knomial_intra_node_threshold) {
                if (!is_contig || !is_homogeneous) {
                    mpi_errno = MPIR_Shmem_Bcast_MV2(tmp_buf, nbytes, MPI_BYTE,
                                                     root, shmem_comm);
                } else {
                    mpi_errno = MPIR_Shmem_Bcast_MV2(buffer, count, datatype,
                                                     root, shmem_comm);
                }
            } else {
                if (!is_contig || !is_homogeneous) {
                    mpi_errno =
                        MPIR_Knomial_Bcast_intra_node_MV2(tmp_buf, nbytes,
                                                          MPI_BYTE,
                                                          INTRA_NODE_ROOT,
                                                          shmem_comm);
                } else {
                    mpi_errno =
                        MPIR_Knomial_Bcast_intra_node_MV2(buffer, count,
                                                          datatype,
                                                          INTRA_NODE_ROOT,
                                                          shmem_comm);
                }
            }

    } else {
        if (nbytes <= mv2_bcast_short_msg) {
            mpi_errno = MPIR_Bcast_binomial_MV2(buffer, count, datatype, root,
                                                comm);
        } else {
            if (mv2_scatter_rd_inter_leader_bcast) {
                mpi_errno = MPIR_Bcast_scatter_ring_allgather_MV2(buffer, count,
                                                                  datatype,
                                                                  root,
                                                                  comm);
            } else {
                mpi_errno =
                    MPIR_Bcast_scatter_doubling_allgather_MV2(buffer, count,
                                                              datatype, root,
                                                              comm);
            }
        }
    }


    return mpi_errno;

}
Example #14
0
/*
 * reduce_intra_in_order_binary 
 * 
 * Function:      Logarithmic reduce operation for non-commutative operations.
 * Acecpts:       same as MPI_Reduce()
 * Returns:       MPI_SUCCESS or error code
 */
int smpi_coll_tuned_reduce_ompi_in_order_binary( void *sendbuf, void *recvbuf,
                                                  int count, 
                                                  MPI_Datatype datatype,
                                                  MPI_Op  op, int root,
                                                  MPI_Comm  comm)
{
    uint32_t segsize=0;
    int ret;
    int rank, size, io_root;
    int segcount = count;
    void *use_this_sendbuf = NULL, *use_this_recvbuf = NULL;
    size_t typelng;

    rank = smpi_comm_rank(comm);
    size = smpi_comm_size(comm);
    XBT_DEBUG("coll:tuned:reduce_intra_in_order_binary rank %d ss %5d",
                 rank, segsize);

    /**
     * Determine number of segments and number of elements
     * sent per operation
     */
    typelng=smpi_datatype_size( datatype);
    COLL_TUNED_COMPUTED_SEGCOUNT( segsize, typelng, segcount );

    /* An in-order binary tree must use root (size-1) to preserve the order of
       operations.  Thus, if root is not rank (size - 1), then we must handle
       1. MPI_IN_PLACE option on real root, and 
       2. we must allocate temporary recvbuf on rank (size - 1).
       Note that generic function must be careful not to switch order of 
       operations for non-commutative ops.
    */
    io_root = size - 1;
    use_this_sendbuf = sendbuf;
    use_this_recvbuf = recvbuf;
    if (io_root != root) {
        ptrdiff_t text, ext;
        char *tmpbuf = NULL;
    
        ext=smpi_datatype_get_extent(datatype);
        text=smpi_datatype_get_extent(datatype);

        if ((root == rank) && (MPI_IN_PLACE == sendbuf)) {
            tmpbuf = (char *) malloc(text + (count - 1) * ext);
            if (NULL == tmpbuf) {
                return MPI_ERR_INTERN;
            }
            smpi_datatype_copy (
                                                (char*)recvbuf, count, datatype,
                                                (char*)tmpbuf, count, datatype);
            use_this_sendbuf = tmpbuf;
        } else if (io_root == rank) {
            tmpbuf = (char *) malloc(text + (count - 1) * ext);
            if (NULL == tmpbuf) {
                return MPI_ERR_INTERN;
            }
            use_this_recvbuf = tmpbuf;
        }
    }

    /* Use generic reduce with in-order binary tree topology and io_root */
    ret = smpi_coll_tuned_ompi_reduce_generic( use_this_sendbuf, use_this_recvbuf, count, datatype,
                                          op, io_root, comm, 
                                          ompi_coll_tuned_topo_build_in_order_bintree(comm), 
                                          segcount, 0 );
    if (MPI_SUCCESS != ret) { return ret; }

    /* Clean up */
    if (io_root != root) {
        if (root == rank) {
            /* Receive result from rank io_root to recvbuf */
            smpi_mpi_recv(recvbuf, count, datatype, io_root,
                                    COLL_TAG_REDUCE, comm,
                                    MPI_STATUS_IGNORE);
            if (MPI_IN_PLACE == sendbuf) {
                free(use_this_sendbuf);
            }
          
        } else if (io_root == rank) {
            /* Send result from use_this_recvbuf to root */
            smpi_mpi_send(use_this_recvbuf, count, datatype, root,
                                    COLL_TAG_REDUCE,
                                    comm);
            free(use_this_recvbuf);
        }
    }

    return MPI_SUCCESS;
}
int smpi_coll_tuned_bcast_ompi_pipeline( void* buffer,
                                      int original_count, 
                                      MPI_Datatype datatype, 
                                      int root,
                                      MPI_Comm comm)
{
    int count_by_segment = original_count;
    size_t type_size;
    int segsize =1024  << 7;
    //mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
    //mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
    
//    return ompi_coll_tuned_bcast_intra_generic( buffer, count, datatype, root, comm, module,
//                                                count_by_segment, data->cached_pipeline );
    ompi_coll_tree_t * tree = ompi_coll_tuned_topo_build_chain( 1, comm, root );
    int i;
    int rank, size;
    int segindex;
    int num_segments; /* Number of segments */
    int sendcount;    /* number of elements sent in this segment */ 
    size_t realsegsize;
    char *tmpbuf;
    ptrdiff_t extent;
    MPI_Request recv_reqs[2] = {MPI_REQUEST_NULL, MPI_REQUEST_NULL};
    MPI_Request *send_reqs = NULL;
    int req_index;
    
    /**
     * Determine number of elements sent per operation.
     */
    type_size = smpi_datatype_size(datatype);

    size = smpi_comm_size(comm);
    rank = smpi_comm_rank(comm);
    xbt_assert( size > 1 );


    const double a_p16  = 3.2118e-6; /* [1 / byte] */
    const double b_p16  = 8.7936;   
    const double a_p64  = 2.3679e-6; /* [1 / byte] */
    const double b_p64  = 1.1787;     
    const double a_p128 = 1.6134e-6; /* [1 / byte] */
    const double b_p128 = 2.1102;
    size_t message_size;

    /* else we need data size for decision function */
    message_size = type_size * (unsigned long)original_count;   /* needed for decision */

    if (size < (a_p128 * message_size + b_p128)) {
            //Pipeline with 128KB segments 
            segsize = 1024  << 7;
    }else if (size < (a_p64 * message_size + b_p64)) {
            // Pipeline with 64KB segments 
            segsize = 1024 << 6;
    }else if (size < (a_p16 * message_size + b_p16)) {
            //Pipeline with 16KB segments 
            segsize = 1024 << 4;
    }

    COLL_TUNED_COMPUTED_SEGCOUNT( segsize, type_size, count_by_segment );

    XBT_DEBUG("coll:tuned:bcast_intra_pipeline rank %d ss %5d type_size %lu count_by_segment %d",
                 smpi_comm_rank(comm), segsize, (unsigned long)type_size, count_by_segment);



    extent = smpi_datatype_get_extent (datatype);
    num_segments = (original_count + count_by_segment - 1) / count_by_segment;
    realsegsize = count_by_segment * extent;
    
    /* Set the buffer pointers */
    tmpbuf = (char *) buffer;

    if( tree->tree_nextsize != 0 ) {
        send_reqs = xbt_new(MPI_Request, tree->tree_nextsize  );
    }

    /* Root code */
    if( rank == root ) {
        /* 
           For each segment:
           - send segment to all children.
             The last segment may have less elements than other segments.
        */
        sendcount = count_by_segment;
        for( segindex = 0; segindex < num_segments; segindex++ ) {
            if( segindex == (num_segments - 1) ) {
                sendcount = original_count - segindex * count_by_segment;
            }
            for( i = 0; i < tree->tree_nextsize; i++ ) { 
                send_reqs[i] = smpi_mpi_isend(tmpbuf, sendcount, datatype,
                                         tree->tree_next[i], 
                                         COLL_TAG_BCAST, comm);
           } 

            /* complete the sends before starting the next sends */
            smpi_mpi_waitall( tree->tree_nextsize, send_reqs, 
                                         MPI_STATUSES_IGNORE );

            /* update tmp buffer */
            tmpbuf += realsegsize;

        }
    } 
    
    /* Intermediate nodes code */
    else if( tree->tree_nextsize > 0 ) { 
        /* 
           Create the pipeline. 
           1) Post the first receive
           2) For segments 1 .. num_segments
              - post new receive
              - wait on the previous receive to complete
              - send this data to children
           3) Wait on the last segment
           4) Compute number of elements in last segment.
           5) Send the last segment to children
         */
        req_index = 0;
        recv_reqs[req_index]=smpi_mpi_irecv(tmpbuf, count_by_segment, datatype,
                           tree->tree_prev, COLL_TAG_BCAST,
                           comm);
        
        for( segindex = 1; segindex < num_segments; segindex++ ) {
            
            req_index = req_index ^ 0x1;
            
            /* post new irecv */
            recv_reqs[req_index]= smpi_mpi_irecv( tmpbuf + realsegsize, count_by_segment,
                                datatype, tree->tree_prev, 
                                COLL_TAG_BCAST,
                                comm);
            
            /* wait for and forward the previous segment to children */
            smpi_mpi_wait( &recv_reqs[req_index ^ 0x1], 
                                     MPI_STATUSES_IGNORE );
            
            for( i = 0; i < tree->tree_nextsize; i++ ) { 
                send_reqs[i]=smpi_mpi_isend(tmpbuf, count_by_segment, datatype,
                                         tree->tree_next[i], 
                                         COLL_TAG_BCAST, comm );
            } 
            
            /* complete the sends before starting the next iteration */
            smpi_mpi_waitall( tree->tree_nextsize, send_reqs, 
                                         MPI_STATUSES_IGNORE );
            
            /* Update the receive buffer */
            tmpbuf += realsegsize;
        }

        /* Process the last segment */
        smpi_mpi_wait( &recv_reqs[req_index], MPI_STATUSES_IGNORE );
        sendcount = original_count - (num_segments - 1) * count_by_segment;
        for( i = 0; i < tree->tree_nextsize; i++ ) {
            send_reqs[i] = smpi_mpi_isend(tmpbuf, sendcount, datatype,
                                     tree->tree_next[i], 
                                     COLL_TAG_BCAST, comm);
        }
        
        smpi_mpi_waitall( tree->tree_nextsize, send_reqs, 
                                     MPI_STATUSES_IGNORE );
    }
  
    /* Leaf nodes */
    else {
        /* 
           Receive all segments from parent in a loop:
           1) post irecv for the first segment
           2) for segments 1 .. num_segments
              - post irecv for the next segment
              - wait on the previous segment to arrive
           3) wait for the last segment
        */
        req_index = 0;
        recv_reqs[req_index] = smpi_mpi_irecv(tmpbuf, count_by_segment, datatype,
                                 tree->tree_prev, COLL_TAG_BCAST,
                                 comm);

        for( segindex = 1; segindex < num_segments; segindex++ ) {
            req_index = req_index ^ 0x1;
            tmpbuf += realsegsize;
            /* post receive for the next segment */
            recv_reqs[req_index] = smpi_mpi_irecv(tmpbuf, count_by_segment, datatype, 
                                     tree->tree_prev, COLL_TAG_BCAST,
                                     comm);
            /* wait on the previous segment */
            smpi_mpi_wait( &recv_reqs[req_index ^ 0x1], 
                                     MPI_STATUS_IGNORE );
        }

        smpi_mpi_wait( &recv_reqs[req_index], MPI_STATUS_IGNORE );
    }

    if( NULL != send_reqs ) free(send_reqs);

    return (MPI_SUCCESS);
}
Example #16
0
int smpi_coll_tuned_allgather_ompi(void *sbuf, int scount, 
                                              MPI_Datatype sdtype,
                                              void* rbuf, int rcount, 
                                              MPI_Datatype rdtype, 
                                              MPI_Comm  comm
                                              )
{
    int communicator_size, pow2_size;
    size_t dsize, total_dsize;

    communicator_size = smpi_comm_size(comm);

    /* Special case for 2 processes */
    if (communicator_size == 2) {
        return smpi_coll_tuned_allgather_pair (sbuf, scount, sdtype, 
                                                          rbuf, rcount, rdtype, 
                                                          comm/*, module*/);
    }

    /* Determine complete data size */
    dsize=smpi_datatype_size(sdtype);
    total_dsize = dsize * scount * communicator_size;   
   
    for (pow2_size  = 1; pow2_size < communicator_size; pow2_size <<=1); 

    /* Decision based on MX 2Gb results from Grig cluster at 
       The University of Tennesse, Knoxville 
       - if total message size is less than 50KB use either bruck or 
       recursive doubling for non-power of two and power of two nodes, 
       respectively.
       - else use ring and neighbor exchange algorithms for odd and even 
       number of nodes, respectively.
    */
    if (total_dsize < 50000) {
        if (pow2_size == communicator_size) {
            return smpi_coll_tuned_allgather_rdb(sbuf, scount, sdtype, 
                                                                     rbuf, rcount, rdtype,
                                                                     comm);
        } else {
            return smpi_coll_tuned_allgather_bruck(sbuf, scount, sdtype, 
                                                         rbuf, rcount, rdtype, 
                                                         comm);
        }
    } else {
        if (communicator_size % 2) {
            return smpi_coll_tuned_allgather_ring(sbuf, scount, sdtype, 
                                                        rbuf, rcount, rdtype, 
                                                        comm);
        } else {
            return  smpi_coll_tuned_allgather_ompi_neighborexchange(sbuf, scount, sdtype,
                                                                     rbuf, rcount, rdtype,
                                                                     comm);
        }
    }
   
#if defined(USE_MPICH2_DECISION)
    /* Decision as in MPICH-2 
       presented in Thakur et.al. "Optimization of Collective Communication 
       Operations in MPICH", International Journal of High Performance Computing 
       Applications, Vol. 19, No. 1, 49-66 (2005)
       - for power-of-two processes and small and medium size messages 
       (up to 512KB) use recursive doubling
       - for non-power-of-two processes and small messages (80KB) use bruck,
       - for everything else use ring.
    */
    if ((pow2_size == communicator_size) && (total_dsize < 524288)) {
        return smpi_coll_tuned_allgather_rdb(sbuf, scount, sdtype, 
                                                                 rbuf, rcount, rdtype, 
                                                                 comm);
    } else if (total_dsize <= 81920) { 
        return smpi_coll_tuned_allgather_bruck(sbuf, scount, sdtype, 
                                                     rbuf, rcount, rdtype,
                                                     comm);
    } 
    return smpi_coll_tuned_allgather_ring(sbuf, scount, sdtype, 
                                                rbuf, rcount, rdtype,
                                                comm);
#endif  /* defined(USE_MPICH2_DECISION) */
}
int 
smpi_coll_tuned_allreduce_ompi_ring_segmented(void *sbuf, void *rbuf, int count,
                                               MPI_Datatype dtype,
                                               MPI_Op op,
                                               MPI_Comm comm) 
{
   int ret = MPI_SUCCESS;
   int line;
   int k, recv_from, send_to;
   int early_blockcount, late_blockcount, split_rank; 
   int segcount, max_segcount;
   int num_phases, phase;
   int block_count;
   unsigned int inbi;
   size_t typelng;
   char *tmpsend = NULL, *tmprecv = NULL;
   char *inbuf[2] = {NULL, NULL};
   ptrdiff_t true_extent, extent;
   ptrdiff_t block_offset, max_real_segsize;
   MPI_Request reqs[2] = {NULL, NULL};
   const size_t segsize = 1 << 20; /* 1 MB */
   unsigned int size = smpi_comm_size(comm);
   unsigned int rank = smpi_comm_rank(comm);

   XBT_DEBUG("coll:tuned:allreduce_intra_ring_segmented rank %d, count %d", rank, count);

   /* Special case for size == 1 */
   if (1 == size) {
      if (MPI_IN_PLACE != sbuf) {
      ret= smpi_datatype_copy(sbuf, count, dtype,rbuf, count, dtype);
         if (ret < 0) { line = __LINE__; goto error_hndl; }
      }
      return MPI_SUCCESS;
   }
   
   /* Determine segment count based on the suggested segment size */
   extent = smpi_datatype_get_extent(dtype);
   if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; }
   true_extent = smpi_datatype_get_extent(dtype);
   if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; }
   typelng = smpi_datatype_size(dtype);
   if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; }
   segcount = count;
   COLL_TUNED_COMPUTED_SEGCOUNT(segsize, typelng, segcount)

   /* Special case for count less than size * segcount - use regular ring */
   if (count < size * segcount) {
      XBT_DEBUG( "coll:tuned:allreduce_ring_segmented rank %d/%d, count %d, switching to regular ring", rank, size, count);
      return (smpi_coll_tuned_allreduce_lr(sbuf, rbuf, count, dtype, op, 
                                                   comm));
   }

   /* Determine the number of phases of the algorithm */
   num_phases = count / (size * segcount);
   if ((count % (size * segcount) >= size) && 
       (count % (size * segcount) > ((size * segcount) / 2))) {
      num_phases++;
   }

   /* Determine the number of elements per block and corresponding 
      block sizes.
      The blocks are divided into "early" and "late" ones:
      blocks 0 .. (split_rank - 1) are "early" and 
      blocks (split_rank) .. (size - 1) are "late".
      Early blocks are at most 1 element larger than the late ones.
      Note, these blocks will be split into num_phases segments,
      out of the largest one will have max_segcount elements.
    */
   COLL_TUNED_COMPUTE_BLOCKCOUNT( count, size, split_rank, 
                                  early_blockcount, late_blockcount )
   COLL_TUNED_COMPUTE_BLOCKCOUNT( early_blockcount, num_phases, inbi,
                                  max_segcount, k)
   max_real_segsize = true_extent + (max_segcount - 1) * extent;

   /* Allocate and initialize temporary buffers */
   inbuf[0] = (char*)smpi_get_tmp_sendbuffer(max_real_segsize);
   if (NULL == inbuf[0]) { ret = -1; line = __LINE__; goto error_hndl; }
   if (size > 2) {
      inbuf[1] = (char*)smpi_get_tmp_recvbuffer(max_real_segsize);
      if (NULL == inbuf[1]) { ret = -1; line = __LINE__; goto error_hndl; }
   }

   /* Handle MPI_IN_PLACE */
   if (MPI_IN_PLACE != sbuf) {
      ret= smpi_datatype_copy(sbuf, count, dtype,rbuf, count, dtype);
      if (ret < 0) { line = __LINE__; goto error_hndl; }
   }

   /* Computation loop: for each phase, repeat ring allreduce computation loop */
   for (phase = 0; phase < num_phases; phase ++) {
      ptrdiff_t phase_offset;
      int early_phase_segcount, late_phase_segcount, split_phase, phase_count;

      /* 
         For each of the remote nodes:
         - post irecv for block (r-1)
         - send block (r)
           To do this, first compute block offset and count, and use block offset
           to compute phase offset.
         - in loop for every step k = 2 .. n
           - post irecv for block (r + n - k) % n
           - wait on block (r + n - k + 1) % n to arrive
           - compute on block (r + n - k + 1) % n
           - send block (r + n - k + 1) % n
         - wait on block (r + 1)
         - compute on block (r + 1)
         - send block (r + 1) to rank (r + 1)
         Note that we must be careful when computing the begining of buffers and
         for send operations and computation we must compute the exact block size.
      */
      send_to = (rank + 1) % size;
      recv_from = (rank + size - 1) % size;
      
      inbi = 0;
      /* Initialize first receive from the neighbor on the left */
      reqs[inbi] = smpi_mpi_irecv(inbuf[inbi], max_segcount, dtype, recv_from,
                               666, comm);
      /* Send first block (my block) to the neighbor on the right:
         - compute my block and phase offset
         - send data */
      block_offset = ((rank < split_rank)? 
                      (rank * early_blockcount) : 
                      (rank * late_blockcount + split_rank));
      block_count = ((rank < split_rank)? early_blockcount : late_blockcount);
      COLL_TUNED_COMPUTE_BLOCKCOUNT(block_count, num_phases, split_phase,
                                    early_phase_segcount, late_phase_segcount)
      phase_count = ((phase < split_phase)?
                     (early_phase_segcount) : (late_phase_segcount));
      phase_offset = ((phase < split_phase)?
                      (phase * early_phase_segcount) : 
                      (phase * late_phase_segcount + split_phase));
      tmpsend = ((char*)rbuf) + (block_offset + phase_offset) * extent;
      smpi_mpi_send(tmpsend, phase_count, dtype, send_to,
                              666, comm);
      
      for (k = 2; k < size; k++) {
         const int prevblock = (rank + size - k + 1) % size;
         
         inbi = inbi ^ 0x1;
         
         /* Post irecv for the current block */
         reqs[inbi] = smpi_mpi_irecv(inbuf[inbi], max_segcount, dtype, recv_from,
                               666, comm);
         if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; }
         
         /* Wait on previous block to arrive */
         smpi_mpi_wait(&reqs[inbi ^ 0x1], MPI_STATUS_IGNORE);
         
         /* Apply operation on previous block: result goes to rbuf
            rbuf[prevblock] = inbuf[inbi ^ 0x1] (op) rbuf[prevblock]
         */
         block_offset = ((prevblock < split_rank)?
                         (prevblock * early_blockcount) :
                         (prevblock * late_blockcount + split_rank));
         block_count = ((prevblock < split_rank)? 
                        early_blockcount : late_blockcount);
         COLL_TUNED_COMPUTE_BLOCKCOUNT(block_count, num_phases, split_phase,
                                       early_phase_segcount, late_phase_segcount)
         phase_count = ((phase < split_phase)?
                        (early_phase_segcount) : (late_phase_segcount));
         phase_offset = ((phase < split_phase)?
                         (phase * early_phase_segcount) : 
                         (phase * late_phase_segcount + split_phase));
         tmprecv = ((char*)rbuf) + (block_offset + phase_offset) * extent;
         smpi_op_apply(op, inbuf[inbi ^ 0x1], tmprecv, &phase_count, &dtype);
         /* send previous block to send_to */
         smpi_mpi_send(tmprecv, phase_count, dtype, send_to,
                              666, comm);
      }
      
      /* Wait on the last block to arrive */
      smpi_mpi_wait(&reqs[inbi], MPI_STATUS_IGNORE);

      
      /* Apply operation on the last block (from neighbor (rank + 1) 
         rbuf[rank+1] = inbuf[inbi] (op) rbuf[rank + 1] */
      recv_from = (rank + 1) % size;
      block_offset = ((recv_from < split_rank)?
                      (recv_from * early_blockcount) :
                      (recv_from * late_blockcount + split_rank));
      block_count = ((recv_from < split_rank)? 
                     early_blockcount : late_blockcount);
      COLL_TUNED_COMPUTE_BLOCKCOUNT(block_count, num_phases, split_phase,
                                    early_phase_segcount, late_phase_segcount)
      phase_count = ((phase < split_phase)?
                     (early_phase_segcount) : (late_phase_segcount));
      phase_offset = ((phase < split_phase)?
                      (phase * early_phase_segcount) : 
                      (phase * late_phase_segcount + split_phase));
      tmprecv = ((char*)rbuf) + (block_offset + phase_offset) * extent;
      smpi_op_apply(op, inbuf[inbi], tmprecv, &phase_count, &dtype);
   }

   /* Distribution loop - variation of ring allgather */
   send_to = (rank + 1) % size;
   recv_from = (rank + size - 1) % size;
   for (k = 0; k < size - 1; k++) {
      const int recv_data_from = (rank + size - k) % size;
      const int send_data_from = (rank + 1 + size - k) % size;
      const int send_block_offset = 
         ((send_data_from < split_rank)?
          (send_data_from * early_blockcount) :
          (send_data_from * late_blockcount + split_rank));
      const int recv_block_offset = 
         ((recv_data_from < split_rank)?
          (recv_data_from * early_blockcount) :
          (recv_data_from * late_blockcount + split_rank));
      block_count = ((send_data_from < split_rank)? 
                     early_blockcount : late_blockcount);

      tmprecv = (char*)rbuf + recv_block_offset * extent;
      tmpsend = (char*)rbuf + send_block_offset * extent;

      smpi_mpi_sendrecv(tmpsend, block_count, dtype, send_to,
                                     666,
                                     tmprecv, early_blockcount, dtype, recv_from,
                                     666,
                                     comm, MPI_STATUS_IGNORE);

   }

   if (NULL != inbuf[0]) smpi_free_tmp_buffer(inbuf[0]);
   if (NULL != inbuf[1]) smpi_free_tmp_buffer(inbuf[1]);

   return MPI_SUCCESS;

 error_hndl:
   XBT_DEBUG("%s:%4d\tRank %d Error occurred %d\n",
                __FILE__, line, rank, ret);
   if (NULL != inbuf[0]) smpi_free_tmp_buffer(inbuf[0]);
   if (NULL != inbuf[1]) smpi_free_tmp_buffer(inbuf[1]);
   return ret;
}
Example #18
0
int smpi_coll_tuned_reduce_ompi( void *sendbuf, void *recvbuf,
                                            int count, MPI_Datatype  datatype,
                                            MPI_Op   op, int root,
                                            MPI_Comm   comm
                                            )
{
    int communicator_size=0;
    //int segsize = 0;
    size_t message_size, dsize;
    const double a1 =  0.6016 / 1024.0; /* [1/B] */
    const double b1 =  1.3496;
    const double a2 =  0.0410 / 1024.0; /* [1/B] */
    const double b2 =  9.7128;
    const double a3 =  0.0422 / 1024.0; /* [1/B] */
    const double b3 =  1.1614;
    //const double a4 =  0.0033 / 1024.0; /* [1/B] */
    //const double b4 =  1.6761;

    //const int max_requests = 0; /* no limit on # of outstanding requests */

    communicator_size = smpi_comm_size(comm);

    /* need data size for decision function */
    dsize=smpi_datatype_size(datatype);
    message_size = dsize * count;   /* needed for decision */

    /**
     * If the operation is non commutative we currently have choice of linear 
     * or in-order binary tree algorithm.
     */
    if( !smpi_op_is_commute(op) ) {
        if ((communicator_size < 12) && (message_size < 2048)) {
            return smpi_coll_tuned_reduce_ompi_basic_linear (sendbuf, recvbuf, count, datatype, op, root, comm/*, module*/); 
        } 
        return smpi_coll_tuned_reduce_ompi_in_order_binary (sendbuf, recvbuf, count, datatype, op, root, comm/*, module,
                                                             0, max_requests*/); 
    }

    if ((communicator_size < 8) && (message_size < 512)){
        /* Linear_0K */
        return smpi_coll_tuned_reduce_ompi_basic_linear (sendbuf, recvbuf, count, datatype, op, root, comm); 
    } else if (((communicator_size < 8) && (message_size < 20480)) ||
               (message_size < 2048) || (count <= 1)) {
        /* Binomial_0K */
        //segsize = 0;
        return smpi_coll_tuned_reduce_ompi_binomial(sendbuf, recvbuf, count, datatype, op, root, comm/*, module,
                                                     segsize, max_requests*/);
    } else if (communicator_size > (a1 * message_size + b1)) {
        // Binomial_1K 
        //segsize = 1024;
        return smpi_coll_tuned_reduce_ompi_binomial(sendbuf, recvbuf, count, datatype, op, root, comm/*, module,
                                                     segsize, max_requests*/);
    } else if (communicator_size > (a2 * message_size + b2)) {
        // Pipeline_1K 
        //segsize = 1024;
        return smpi_coll_tuned_reduce_ompi_pipeline (sendbuf, recvbuf, count, datatype, op, root, comm/*, module, 
                                                      segsize, max_requests*/);
    } else if (communicator_size > (a3 * message_size + b3)) {
        // Binary_32K 
        //segsize = 32*1024;
        return smpi_coll_tuned_reduce_ompi_binary( sendbuf, recvbuf, count, datatype, op, root,
                                                    comm/*, module, segsize, max_requests*/);
    }
    /*if (communicator_size > (a4 * message_size + b4)) {
        // Pipeline_32K 
        segsize = 32*1024;
    } else {
        // Pipeline_64K 
        segsize = 64*1024;
    }*/
    return smpi_coll_tuned_reduce_ompi_pipeline (sendbuf, recvbuf, count, datatype, op, root, comm/*, module, 
                                                  segsize, max_requests*/);

#if 0
    /* for small messages use linear algorithm */
    if (message_size <= 4096) {
        segsize = 0;
        fanout = communicator_size - 1;
        /* when linear implemented or taken from basic put here, right now using chain as a linear system */
        /* it is implemented and I shouldn't be calling a chain with a fanout bigger than MAXTREEFANOUT from topo.h! */
        return smpi_coll_tuned_reduce_intra_basic_linear (sendbuf, recvbuf, count, datatype, op, root, comm, module); 
        /*        return smpi_coll_tuned_reduce_intra_chain (sendbuf, recvbuf, count, datatype, op, root, comm, segsize, fanout); */
    }
    if (message_size < 524288) {
        if (message_size <= 65536 ) {
            segsize = 32768;
            fanout = 8;
        } else {
            segsize = 1024;
            fanout = communicator_size/2;
        }
        /* later swap this for a binary tree */
        /*         fanout = 2; */
        return smpi_coll_tuned_reduce_intra_chain (sendbuf, recvbuf, count, datatype, op, root, comm, module,
                                                   segsize, fanout, max_requests);
    }
    segsize = 1024;
    return smpi_coll_tuned_reduce_intra_pipeline (sendbuf, recvbuf, count, datatype, op, root, comm, module,
                                                  segsize, max_requests);
#endif  /* 0 */
}
Example #19
0
int smpi_coll_tuned_bcast_ompi(void *buff, int count,
                                          MPI_Datatype datatype, int root,
                                          MPI_Comm  comm
                                          )
{
    /* Decision function based on MX results for 
       messages up to 36MB and communicator sizes up to 64 nodes */
    const size_t small_message_size = 2048;
    const size_t intermediate_message_size = 370728;
    const double a_p16  = 3.2118e-6; /* [1 / byte] */
    const double b_p16  = 8.7936;   
    const double a_p64  = 2.3679e-6; /* [1 / byte] */
    const double b_p64  = 1.1787;     
    const double a_p128 = 1.6134e-6; /* [1 / byte] */
    const double b_p128 = 2.1102;

    int communicator_size;
    //int segsize = 0;
    size_t message_size, dsize;

    communicator_size = smpi_comm_size(comm);

    /* else we need data size for decision function */
    dsize = smpi_datatype_size(datatype);
    message_size = dsize * (unsigned long)count;   /* needed for decision */

    /* Handle messages of small and intermediate size, and 
       single-element broadcasts */
    if ((message_size < small_message_size) || (count <= 1)) {
        /* Binomial without segmentation */
        return  smpi_coll_tuned_bcast_binomial_tree (buff, count, datatype, 
                                                      root, comm);

    } else if (message_size < intermediate_message_size) {
        // SplittedBinary with 1KB segments
        return smpi_coll_tuned_bcast_ompi_split_bintree(buff, count, datatype, 
                                                         root, comm);

    }
     //Handle large message sizes 
    else if (communicator_size < (a_p128 * message_size + b_p128)) {
        //Pipeline with 128KB segments 
        //segsize = 1024  << 7;
        return smpi_coll_tuned_bcast_ompi_pipeline (buff, count, datatype, 
                                                     root, comm);
                                                     

    } else if (communicator_size < 13) {
        // Split Binary with 8KB segments 
        return smpi_coll_tuned_bcast_ompi_split_bintree(buff, count, datatype, 
                                                         root, comm);
       
    } else if (communicator_size < (a_p64 * message_size + b_p64)) {
        // Pipeline with 64KB segments 
        //segsize = 1024 << 6;
        return smpi_coll_tuned_bcast_ompi_pipeline (buff, count, datatype, 
                                                     root, comm);
                                                     

    } else if (communicator_size < (a_p16 * message_size + b_p16)) {
        //Pipeline with 16KB segments 
        //segsize = 1024 << 4;
        return smpi_coll_tuned_bcast_ompi_pipeline (buff, count, datatype, 
                                                     root, comm);
                                                     

    }
    /* Pipeline with 8KB segments */
    //segsize = 1024 << 3;
    return smpi_coll_tuned_bcast_flattree_pipeline (buff, count, datatype, 
                                                 root, comm
                                                 /*segsize*/);
#if 0
    /* this is based on gige measurements */

    if (communicator_size  < 4) {
        return smpi_coll_tuned_bcast_intra_basic_linear (buff, count, datatype, root, comm, module);
    }
    if (communicator_size == 4) {
        if (message_size < 524288) segsize = 0;
        else segsize = 16384;
        return smpi_coll_tuned_bcast_intra_bintree (buff, count, datatype, root, comm, module, segsize);
    }
    if (communicator_size <= 8 && message_size < 4096) {
        return smpi_coll_tuned_bcast_intra_basic_linear (buff, count, datatype, root, comm, module);
    }
    if (communicator_size > 8 && message_size >= 32768 && message_size < 524288) {
        segsize = 16384;
        return  smpi_coll_tuned_bcast_intra_bintree (buff, count, datatype, root, comm, module, segsize);
    }
    if (message_size >= 524288) {
        segsize = 16384;
        return smpi_coll_tuned_bcast_intra_pipeline (buff, count, datatype, root, comm, module, segsize);
    }
    segsize = 0;
    /* once tested can swap this back in */
    /* return smpi_coll_tuned_bcast_intra_bmtree (buff, count, datatype, root, comm, segsize); */
    return smpi_coll_tuned_bcast_intra_bintree (buff, count, datatype, root, comm, module, segsize);
#endif  /* 0 */
}