Beispiel #1
0
/*
 * smpi_coll_tuned_reduce_ompi_binary
 *
 * Reduce that always uses a 32 KiB segment size ("Binary_32K") and hands the
 * work to smpi_coll_tuned_ompi_reduce_generic() together with the tree
 * returned by ompi_coll_tuned_topo_build_tree(2, comm, root).
 *
 * Accepts:  same arguments as MPI_Reduce()
 * Returns:  whatever the generic reduce returns
 */
int smpi_coll_tuned_reduce_ompi_binary( void *sendbuf, void *recvbuf,
                                         int count, MPI_Datatype datatype,
                                         MPI_Op  op, int root,
                                         MPI_Comm  comm)
{
    /* Binary_32K: the segment size is fixed, no decision function here. */
    uint32_t seg_bytes = 32 * 1024;
    /* Size of one element of the datatype. */
    size_t elem_bytes = smpi_datatype_size(datatype);
    /* Elements per segment; starts as "everything in one segment". */
    int elems_per_seg = count;

    XBT_DEBUG("coll:tuned:reduce_intra_binary rank %d ss %5d",
              smpi_comm_rank(comm), seg_bytes);

    /* May lower elems_per_seg so that one segment is about seg_bytes long. */
    COLL_TUNED_COMPUTED_SEGCOUNT(seg_bytes, elem_bytes, elems_per_seg);

    return smpi_coll_tuned_ompi_reduce_generic(sendbuf, recvbuf, count, datatype,
                                               op, root, comm,
                                               ompi_coll_tuned_topo_build_tree(2, comm, root),
                                               elems_per_seg, 0);
}
/*
 * reduce_intra_binary
 *
 * Refreshes the module's cached binary tree for `root`, derives the number
 * of elements per segment from `segsize` and the datatype size, and then
 * delegates to ompi_coll_tuned_reduce_generic() with that cached tree.
 *
 * Accepts:  MPI_Reduce() arguments plus the coll module, the requested
 *           segment size and max_outstanding_reqs (forwarded unchanged)
 * Returns:  the return value of the generic reduce
 */
int ompi_coll_tuned_reduce_intra_binary( void *sendbuf, void *recvbuf,
                                         int count, ompi_datatype_t* datatype,
                                         ompi_op_t* op, int root,
                                         ompi_communicator_t* comm, 
                                         mca_coll_base_module_t *module,
                                         uint32_t segsize, 
                                         int max_outstanding_reqs  )
{
    mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
    mca_coll_tuned_comm_t *tuned_comm = tuned_module->tuned_data;
    size_t elem_bytes;
    int elems_per_seg = count;   /* default: one segment holds everything */

    OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:reduce_intra_binary rank %d ss %5d",
                 ompi_comm_rank(comm), segsize));

    /* Make sure the cached binary tree matches this root. */
    COLL_TUNED_UPDATE_BINTREE( comm, tuned_module, root );

    /* Element size drives how many elements fit in one segment. */
    ompi_datatype_type_size( datatype, &elem_bytes );
    COLL_TUNED_COMPUTED_SEGCOUNT( segsize, elem_bytes, elems_per_seg );

    return ompi_coll_tuned_reduce_generic( sendbuf, recvbuf, count, datatype,
                                           op, root, comm, module,
                                           tuned_comm->cached_bintree,
                                           elems_per_seg, max_outstanding_reqs );
}
/*
 * bcast_intra_binomial
 *
 * Refreshes the module's cached binomial tree for `root`, computes the
 * number of elements sent per operation from `segsize` and the datatype
 * size, and delegates to ompi_coll_tuned_bcast_intra_generic().
 *
 * Accepts:  MPI_Bcast() arguments plus the coll module and segment size
 * Returns:  the return value of the generic broadcast
 */
int
ompi_coll_tuned_bcast_intra_binomial( void* buffer,
                                      int count,
                                      struct ompi_datatype_t* datatype,
                                      int root,
                                      struct ompi_communicator_t* comm,
                                      mca_coll_base_module_t *module,
                                      uint32_t segsize )
{
    mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
    mca_coll_tuned_comm_t *tuned_comm = tuned_module->tuned_data;
    size_t elem_bytes;
    int elems_per_seg = count;   /* default: the whole message at once */

    /* Make sure the cached binomial tree matches this root. */
    COLL_TUNED_UPDATE_BMTREE( comm, tuned_module, root );

    /* Work out how many elements go into each operation. */
    ompi_datatype_type_size( datatype, &elem_bytes );
    COLL_TUNED_COMPUTED_SEGCOUNT( segsize, elem_bytes, elems_per_seg );

    OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:bcast_intra_binomial rank %d ss %5d typelng %lu segcount %d",
                 ompi_comm_rank(comm), segsize, (unsigned long)elem_bytes, elems_per_seg));

    return ompi_coll_tuned_bcast_intra_generic( buffer, count, datatype, root,
                                                comm, module, elems_per_seg,
                                                tuned_comm->cached_bmtree );
}
Beispiel #4
0
/*
 * smpi_coll_tuned_reduce_ompi_pipeline
 *
 * Picks a segment size (1 KiB, 32 KiB or 64 KiB) by comparing the
 * communicator size against two linear functions of the message size, then
 * delegates to smpi_coll_tuned_ompi_reduce_generic() with the chain returned
 * by ompi_coll_tuned_topo_build_chain(1, comm, root).
 *
 * Accepts:  same arguments as MPI_Reduce()
 * Returns:  whatever the generic reduce returns
 */
int smpi_coll_tuned_reduce_ompi_pipeline( void *sendbuf, void *recvbuf,
                                           int count, MPI_Datatype datatype,
                                           MPI_Op  op, int root,
                                           MPI_Comm  comm  )
{
    /* Coefficients of the two decision lines: threshold = a * bytes + b. */
    const double a2 =  0.0410 / 1024.0; /* [1/B] */
    const double b2 =  9.7128;
    const double a4 =  0.0033 / 1024.0; /* [1/B] */
    const double b4 =  1.6761;

    size_t elem_bytes  = smpi_datatype_size(datatype);
    int    nprocs      = smpi_comm_size(comm);
    size_t total_bytes = elem_bytes * count;
    int    elems_per_seg = count;

    /* Pipeline_1K, else Pipeline_32K, else Pipeline_64K. */
    uint32_t seg_bytes =
        (nprocs > (a2 * total_bytes + b2)) ? 1024
      : (nprocs > (a4 * total_bytes + b4)) ? 32*1024
      :                                      64*1024;

    XBT_DEBUG("coll:tuned:reduce_intra_pipeline rank %d ss %5d",
              smpi_comm_rank(comm), seg_bytes);

    /* Translate the byte-sized segment into a number of elements. */
    COLL_TUNED_COMPUTED_SEGCOUNT(seg_bytes, elem_bytes, elems_per_seg);

    return smpi_coll_tuned_ompi_reduce_generic(sendbuf, recvbuf, count, datatype,
                                               op, root, comm,
                                               ompi_coll_tuned_topo_build_chain(1, comm, root),
                                               elems_per_seg, 0);
}
Beispiel #5
0
/*
 * reduce_intra_chain
 *
 * Refreshes the communicator's cached chain for (`root`, `fanout`), derives
 * the number of elements per segment from `segsize` and the datatype size,
 * and delegates to ompi_coll_tuned_reduce_generic() with that cached chain.
 *
 * Accepts:  MPI_Reduce() arguments plus segment size and chain fanout
 * Returns:  the return value of the generic reduce
 */
int ompi_coll_tuned_reduce_intra_chain( void *sendbuf, void *recvbuf, int count,
                                        ompi_datatype_t* datatype, ompi_op_t* op,
                                        int root, ompi_communicator_t* comm, uint32_t segsize,
                                        int fanout)
{
    size_t elem_bytes;
    int elems_per_seg = count;   /* default: one segment holds everything */

    OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:reduce_intra_chain rank %d fo %d ss %5d", ompi_comm_rank(comm), fanout, segsize));

    /* Make sure the cached chain matches this root and fanout. */
    COLL_TUNED_UPDATE_CHAIN( comm, root, fanout );

    /* Element size drives how many elements fit in one segment. */
    ompi_ddt_type_size( datatype, &elem_bytes );
    COLL_TUNED_COMPUTED_SEGCOUNT( segsize, elem_bytes, elems_per_seg );

    return ompi_coll_tuned_reduce_generic( sendbuf, recvbuf, count, datatype,
                                           op, root, comm,
                                           comm->c_coll_selected_data->cached_chain,
                                           elems_per_seg );
}
Beispiel #6
0
/*
 * smpi_coll_tuned_reduce_ompi_binomial
 *
 * Chooses between an unsegmented reduce (segment size 0, "Binomial_0K") and
 * 1 KiB segments ("Binomial_1K"), then delegates to
 * smpi_coll_tuned_ompi_reduce_generic() with the tree returned by
 * ompi_coll_tuned_topo_build_in_order_bmtree(comm, root).
 *
 * Accepts:  same arguments as MPI_Reduce()
 * Returns:  whatever the generic reduce returns
 */
int smpi_coll_tuned_reduce_ompi_binomial( void *sendbuf, void *recvbuf,
                                           int count, MPI_Datatype datatype,
                                           MPI_Op  op, int root,
                                           MPI_Comm  comm)
{
    /* Decision line: 1 KiB segments when comm size > a1 * bytes + b1. */
    const double a1 =  0.6016 / 1024.0; /* [1/B] */
    const double b1 =  1.3496;

    size_t elem_bytes  = smpi_datatype_size(datatype);
    int    nprocs      = smpi_comm_size(comm);
    size_t total_bytes = elem_bytes * count;
    int    elems_per_seg = count;
    uint32_t seg_bytes = 0;      /* Binomial_0K unless decided otherwise */

    /* Small communicators / small messages / single elements stay at 0. */
    int keep_unsegmented = ((nprocs < 8) && (total_bytes < 20480)) ||
                           (total_bytes < 2048) || (count <= 1);

    if (!keep_unsegmented && (nprocs > (a1 * total_bytes + b1))) {
        /* Binomial_1K */
        seg_bytes = 1024;
    }

    XBT_DEBUG("coll:tuned:reduce_intra_binomial rank %d ss %5d",
              smpi_comm_rank(comm), seg_bytes);

    /* Translate the byte-sized segment into a number of elements. */
    COLL_TUNED_COMPUTED_SEGCOUNT(seg_bytes, elem_bytes, elems_per_seg);

    return smpi_coll_tuned_ompi_reduce_generic(sendbuf, recvbuf, count, datatype,
                                               op, root, comm,
                                               ompi_coll_tuned_topo_build_in_order_bmtree(comm, root),
                                               elems_per_seg, 0);
}
/*
 * smpi_coll_tuned_bcast_ompi_pipeline
 *
 * Segmented broadcast along the chain returned by
 * ompi_coll_tuned_topo_build_chain(1, comm, root).
 *
 * The segment size is 128 KiB, 64 KiB or 16 KiB depending on how the
 * communicator size compares with three linear functions of the message
 * size; it stays at the initial 128 KiB when none of the tests matches.
 * The root sends segment after segment to its children, intermediate ranks
 * receive segment k+1 while forwarding segment k, and leaves only receive.
 *
 * Accepts:  same arguments as MPI_Bcast()
 * Returns:  MPI_SUCCESS
 *
 * Aborts through xbt_assert() when the communicator has fewer than two ranks.
 *
 * NOTE(review): the chain in `tree` is never released in this function;
 * confirm whether the topology module provides a matching destroy helper.
 */
int smpi_coll_tuned_bcast_ompi_pipeline( void* buffer,
                                      int original_count, 
                                      MPI_Datatype datatype, 
                                      int root,
                                      MPI_Comm comm)
{
    int count_by_segment = original_count;
    size_t type_size;
    int segsize =1024  << 7;

    ompi_coll_tree_t * tree = ompi_coll_tuned_topo_build_chain( 1, comm, root );
    int i;
    int rank, size;
    int segindex;
    int num_segments; /* Number of segments */
    int sendcount;    /* number of elements sent in this segment */ 
    size_t realsegsize;
    char *tmpbuf;
    ptrdiff_t extent;
    /* Two receive slots so one segment can arrive while the other is forwarded */
    MPI_Request recv_reqs[2] = {MPI_REQUEST_NULL, MPI_REQUEST_NULL};
    MPI_Request *send_reqs = NULL;
    int req_index;
    
    /**
     * Determine number of elements sent per operation.
     */
    type_size = smpi_datatype_size(datatype);

    size = smpi_comm_size(comm);
    rank = smpi_comm_rank(comm);
    xbt_assert( size > 1 );


    /* Coefficients of the decision lines: threshold = a * bytes + b */
    const double a_p16  = 3.2118e-6; /* [1 / byte] */
    const double b_p16  = 8.7936;   
    const double a_p64  = 2.3679e-6; /* [1 / byte] */
    const double b_p64  = 1.1787;     
    const double a_p128 = 1.6134e-6; /* [1 / byte] */
    const double b_p128 = 2.1102;
    size_t message_size;

    /* the decision function needs the total data size */
    message_size = type_size * (unsigned long)original_count;   /* needed for decision */

    if (size < (a_p128 * message_size + b_p128)) {
            //Pipeline with 128KB segments 
            segsize = 1024  << 7;
    }else if (size < (a_p64 * message_size + b_p64)) {
            // Pipeline with 64KB segments 
            segsize = 1024 << 6;
    }else if (size < (a_p16 * message_size + b_p16)) {
            //Pipeline with 16KB segments 
            segsize = 1024 << 4;
    }

    COLL_TUNED_COMPUTED_SEGCOUNT( segsize, type_size, count_by_segment );

    XBT_DEBUG("coll:tuned:bcast_intra_pipeline rank %d ss %5d type_size %lu count_by_segment %d",
                 smpi_comm_rank(comm), segsize, (unsigned long)type_size, count_by_segment);

    /* With no elements per segment there is nothing to transfer, and the
       segment computation below would divide by zero. */
    if( count_by_segment <= 0 ) {
        return (MPI_SUCCESS);
    }

    extent = smpi_datatype_get_extent (datatype);
    num_segments = (original_count + count_by_segment - 1) / count_by_segment;
    realsegsize = count_by_segment * extent;
    
    /* Set the buffer pointers */
    tmpbuf = (char *) buffer;

    if( tree->tree_nextsize != 0 ) {
        send_reqs = xbt_new(MPI_Request, tree->tree_nextsize  );
    }

    /* Root code */
    if( rank == root ) {
        /* 
           For each segment:
           - send segment to all children.
             The last segment may have less elements than other segments.
        */
        sendcount = count_by_segment;
        for( segindex = 0; segindex < num_segments; segindex++ ) {
            if( segindex == (num_segments - 1) ) {
                sendcount = original_count - segindex * count_by_segment;
            }
            for( i = 0; i < tree->tree_nextsize; i++ ) { 
                send_reqs[i] = smpi_mpi_isend(tmpbuf, sendcount, datatype,
                                         tree->tree_next[i], 
                                         COLL_TAG_BCAST, comm);
           } 

            /* complete the sends before starting the next sends */
            smpi_mpi_waitall( tree->tree_nextsize, send_reqs, 
                                         MPI_STATUSES_IGNORE );

            /* update tmp buffer */
            tmpbuf += realsegsize;

        }
    } 
    
    /* Intermediate nodes code */
    else if( tree->tree_nextsize > 0 ) { 
        /* 
           Create the pipeline. 
           1) Post the first receive
           2) For segments 1 .. num_segments
              - post new receive
              - wait on the previous receive to complete
              - send this data to children
           3) Wait on the last segment
           4) Compute number of elements in last segment.
           5) Send the last segment to children
         */
        req_index = 0;
        recv_reqs[req_index]=smpi_mpi_irecv(tmpbuf, count_by_segment, datatype,
                           tree->tree_prev, COLL_TAG_BCAST,
                           comm);
        
        for( segindex = 1; segindex < num_segments; segindex++ ) {
            
            /* flip between the two receive slots */
            req_index = req_index ^ 0x1;
            
            /* post new irecv */
            recv_reqs[req_index]= smpi_mpi_irecv( tmpbuf + realsegsize, count_by_segment,
                                datatype, tree->tree_prev, 
                                COLL_TAG_BCAST,
                                comm);
            
            /* wait for and forward the previous segment to children */
            smpi_mpi_wait( &recv_reqs[req_index ^ 0x1], 
                                     MPI_STATUS_IGNORE );
            
            for( i = 0; i < tree->tree_nextsize; i++ ) { 
                send_reqs[i]=smpi_mpi_isend(tmpbuf, count_by_segment, datatype,
                                         tree->tree_next[i], 
                                         COLL_TAG_BCAST, comm );
            } 
            
            /* complete the sends before starting the next iteration */
            smpi_mpi_waitall( tree->tree_nextsize, send_reqs, 
                                         MPI_STATUSES_IGNORE );
            
            /* Update the receive buffer */
            tmpbuf += realsegsize;
        }

        /* Process the last segment */
        smpi_mpi_wait( &recv_reqs[req_index], MPI_STATUS_IGNORE );
        sendcount = original_count - (num_segments - 1) * count_by_segment;
        for( i = 0; i < tree->tree_nextsize; i++ ) {
            send_reqs[i] = smpi_mpi_isend(tmpbuf, sendcount, datatype,
                                     tree->tree_next[i], 
                                     COLL_TAG_BCAST, comm);
        }
        
        smpi_mpi_waitall( tree->tree_nextsize, send_reqs, 
                                     MPI_STATUSES_IGNORE );
    }
  
    /* Leaf nodes */
    else {
        /* 
           Receive all segments from parent in a loop:
           1) post irecv for the first segment
           2) for segments 1 .. num_segments
              - post irecv for the next segment
              - wait on the previous segment to arrive
           3) wait for the last segment
        */
        req_index = 0;
        recv_reqs[req_index] = smpi_mpi_irecv(tmpbuf, count_by_segment, datatype,
                                 tree->tree_prev, COLL_TAG_BCAST,
                                 comm);

        for( segindex = 1; segindex < num_segments; segindex++ ) {
            req_index = req_index ^ 0x1;
            tmpbuf += realsegsize;
            /* post receive for the next segment */
            recv_reqs[req_index] = smpi_mpi_irecv(tmpbuf, count_by_segment, datatype, 
                                     tree->tree_prev, COLL_TAG_BCAST,
                                     comm);
            /* wait on the previous segment */
            smpi_mpi_wait( &recv_reqs[req_index ^ 0x1], 
                                     MPI_STATUS_IGNORE );
        }

        smpi_mpi_wait( &recv_reqs[req_index], MPI_STATUS_IGNORE );
    }

    /* free(NULL) is a no-op, so ranks without children need no guard */
    free(send_reqs);

    return (MPI_SUCCESS);
}
/*
 * reduce_intra_in_order_binary 
 * 
 * Function:      Logarithmic reduce operation for non-commutative operations.
 * Accepts:       same as MPI_Reduce(), plus the coll module, the requested
 *                segment size and max_outstanding_reqs (the latter is
 *                forwarded unchanged to the generic reduce)
 * Returns:       MPI_SUCCESS or error code (MPI_ERR_INTERN when the
 *                temporary buffer cannot be allocated)
 *
 * The reduction itself always runs towards rank (size - 1).  When that rank
 * is not the requested root, the result is shipped to the real root with one
 * extra point-to-point message.  Any temporary buffer allocated here is
 * released on every exit path, including error returns.
 */
int ompi_coll_tuned_reduce_intra_in_order_binary( void *sendbuf, void *recvbuf,
                                                  int count, 
                                                  ompi_datatype_t* datatype,
                                                  ompi_op_t* op, int root,
                                                  ompi_communicator_t* comm, 
                                                  mca_coll_base_module_t *module,
                                                  uint32_t segsize,
                                                  int max_outstanding_reqs  )
{
    int ret, rank, size, io_root, segcount = count;
    void *use_this_sendbuf = NULL, *use_this_recvbuf = NULL;
    char *tmpbuf = NULL;   /* scratch buffer owned by this call, if any */
    size_t typelng;
    mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
    mca_coll_tuned_comm_t *data = tuned_module->tuned_data;

    rank = ompi_comm_rank(comm);
    size = ompi_comm_size(comm);
    OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:reduce_intra_in_order_binary rank %d ss %5d",
                 rank, segsize));

    COLL_TUNED_UPDATE_IN_ORDER_BINTREE( comm, tuned_module );

    /**
     * Determine number of segments and number of elements
     * sent per operation
     */
    ompi_datatype_type_size( datatype, &typelng );
    COLL_TUNED_COMPUTED_SEGCOUNT( segsize, typelng, segcount );

    /* An in-order binary tree must use root (size-1) to preserve the order of
       operations.  Thus, if root is not rank (size - 1), then we must handle
       1. MPI_IN_PLACE option on real root, and 
       2. we must allocate temporary recvbuf on rank (size - 1).
       Note that generic function must be careful not to switch order of 
       operations for non-commutative ops.
    */
    io_root = size - 1;
    use_this_sendbuf = sendbuf;
    use_this_recvbuf = recvbuf;
    if (io_root != root) {
        ptrdiff_t tlb, text, lb, ext;
    
        ompi_datatype_get_extent(datatype, &lb, &ext);
        ompi_datatype_get_true_extent(datatype, &tlb, &text);

        if ((root == rank) && (MPI_IN_PLACE == sendbuf)) {
            /* Real root with MPI_IN_PLACE: its contribution lives in recvbuf,
               so copy it aside and send from the copy. */
            tmpbuf = (char *) malloc(text + (ptrdiff_t)(count - 1) * ext);
            if (NULL == tmpbuf) {
                return MPI_ERR_INTERN;
            }
            ompi_datatype_copy_content_same_ddt(datatype, count, 
                                                (char*)tmpbuf,
                                                (char*)recvbuf);
            use_this_sendbuf = tmpbuf;
        } else if (io_root == rank) {
            /* Stand-in root: needs somewhere to accumulate the result. */
            tmpbuf = (char *) malloc(text + (ptrdiff_t)(count - 1) * ext);
            if (NULL == tmpbuf) {
                return MPI_ERR_INTERN;
            }
            use_this_recvbuf = tmpbuf;
        }
    }

    /* Use generic reduce with in-order binary tree topology and io_root */
    ret = ompi_coll_tuned_reduce_generic( use_this_sendbuf, use_this_recvbuf, count, datatype,
                                          op, io_root, comm, module, 
                                          data->cached_in_order_bintree, 
                                          segcount, max_outstanding_reqs );

    /* Move the result from io_root to the real root, if they differ */
    if ((MPI_SUCCESS == ret) && (io_root != root)) {
        if (root == rank) {
            /* Receive result from rank io_root to recvbuf */
            ret = MCA_PML_CALL(recv(recvbuf, count, datatype, io_root,
                                    MCA_COLL_BASE_TAG_REDUCE, comm,
                                    MPI_STATUS_IGNORE));
        } else if (io_root == rank) {
            /* Send result from use_this_recvbuf to root */
            ret = MCA_PML_CALL(send(use_this_recvbuf, count, datatype, root,
                                    MCA_COLL_BASE_TAG_REDUCE, 
                                    MCA_PML_BASE_SEND_STANDARD, comm));
        }
    }

    /* Clean up: tmpbuf is NULL when nothing was allocated, and free(NULL)
       is a no-op, so this single call covers success and failure alike. */
    free(tmpbuf);

    return ret;
}
/*
 * reduce_intra_binomial
 *
 * Computes the number of elements per segment and delegates to
 * ompi_coll_tuned_reduce_generic() with a tree selected by the value of
 * sdn_comp_enable:
 *   > 1 : a hard-coded tree covering ranks 0..7 (the branch starts with
 *         assert(false), so it only runs when assertions are compiled out)
 *   > 0 : sdn_shortest_bmtree[root]
 *   else: the module's cached in-order binomial tree for `root`
 *
 * Accepts:  MPI_Reduce() arguments plus the coll module, the requested
 *           segment size and max_outstanding_reqs (forwarded unchanged)
 * Returns:  the return value of the generic reduce
 */
int ompi_coll_tuned_reduce_intra_binomial( void *sendbuf, void *recvbuf,
                                           int count, ompi_datatype_t* datatype,
                                           ompi_op_t* op, int root,
                                           ompi_communicator_t* comm, 
                                           mca_coll_base_module_t *module,
                                           uint32_t segsize,
                                           int max_outstanding_reqs  )
{
    int segcount = count;
    size_t typelng;
    mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
    mca_coll_tuned_comm_t *data = tuned_module->tuned_data;

    OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:reduce_intra_binomial rank %d ss %5d",
                 ompi_comm_rank(comm), segsize));

    /**
     * Determine number of segments and number of elements
     * sent per operation.  This depends only on the datatype and segsize,
     * so it is done once for all three tree choices.
     */
    ompi_datatype_type_size( datatype, &typelng );
    COLL_TUNED_COMPUTED_SEGCOUNT( segsize, typelng, segcount );

    if (sdn_comp_enable > 1) {
        /* The tree only has to live for the duration of the call below, so
           it is kept on the stack: nothing to leak, no allocation to check.
           NOTE(review): assumes the generic reduce does not keep the tree
           pointer after returning -- confirm. */
        ompi_coll_tree_t tree = {0};

        assert(false);
        tree.tree_bmtree   = 1;
        tree.tree_root     = root;
        tree.tree_nextsize = MPI_UNDEFINED;

        /* Hard-coded parent/children layout for ranks 0..7 */
        switch (ompi_comm_rank(comm)) {
        case 0:
            tree.tree_nextsize = 0;
            tree.tree_prev = 1;
            break;
        case 1:
            tree.tree_nextsize = 1;
            tree.tree_next[0] = 0;
            tree.tree_prev = 3;
            break;
        case 2:
            tree.tree_nextsize = 0;
            tree.tree_prev = 3;
            break;
        case 3:
            tree.tree_nextsize = 2;
            tree.tree_next[0] = 1;
            tree.tree_next[1] = 2;
            tree.tree_prev = 7;
            break;
        case 4:
            tree.tree_nextsize = 1;
            tree.tree_next[0] = 5;
            tree.tree_prev = 7;
            break;
        case 5:
            tree.tree_nextsize = 0;
            tree.tree_prev = 4;
            break;
        case 6:
            tree.tree_nextsize = 0;
            tree.tree_prev = 7;
            break;
        case 7:
            tree.tree_nextsize = 3;
            tree.tree_next[0] = 3;
            tree.tree_next[1] = 4;
            tree.tree_next[2] = 6;
            tree.tree_prev = 7;
            break;
        default:
            /* Ranks outside the 8-process layout keep tree_nextsize set to
               MPI_UNDEFINED.  NOTE(review): this layout presumably expects
               exactly 8 ranks -- confirm. */
            break;
        }

        return ompi_coll_tuned_reduce_generic( sendbuf, recvbuf, count, datatype, 
                                               op, root, comm, module, &tree, 
                                               segcount, max_outstanding_reqs );
    }

    if (sdn_comp_enable > 0) {
        return ompi_coll_tuned_reduce_generic( sendbuf, recvbuf, count, datatype, 
                                               op, root, comm, module,
                                               sdn_shortest_bmtree[root], 
                                               segcount, max_outstanding_reqs );
    }

    /* Default path: make sure the cached in-order binomial tree matches root */
    COLL_TUNED_UPDATE_IN_ORDER_BMTREE( comm, tuned_module, root );

    return ompi_coll_tuned_reduce_generic( sendbuf, recvbuf, count, datatype, 
                                           op, root, comm, module,
                                           data->cached_in_order_bmtree, 
                                           segcount, max_outstanding_reqs );
}
Beispiel #10
0
/*
 * smpi_coll_tuned_reduce_ompi_in_order_binary
 * 
 * Function:      Logarithmic reduce operation for non-commutative operations.
 * Accepts:       same as MPI_Reduce()
 * Returns:       MPI_SUCCESS or error code (MPI_ERR_INTERN when the
 *                temporary buffer cannot be allocated)
 *
 * The reduction itself always runs towards rank (size - 1).  When that rank
 * is not the requested root, the result is shipped to the real root with one
 * extra point-to-point message.  Any temporary buffer allocated here is
 * released on every exit path, including a failing generic reduce.
 */
int smpi_coll_tuned_reduce_ompi_in_order_binary( void *sendbuf, void *recvbuf,
                                                  int count, 
                                                  MPI_Datatype datatype,
                                                  MPI_Op  op, int root,
                                                  MPI_Comm  comm)
{
    uint32_t segsize=0;
    int ret;
    int rank, size, io_root;
    int segcount = count;
    void *use_this_sendbuf = NULL, *use_this_recvbuf = NULL;
    char *tmpbuf = NULL;   /* scratch buffer owned by this call, if any */
    size_t typelng;

    rank = smpi_comm_rank(comm);
    size = smpi_comm_size(comm);
    XBT_DEBUG("coll:tuned:reduce_intra_in_order_binary rank %d ss %5d",
                 rank, segsize);

    /**
     * Determine number of segments and number of elements
     * sent per operation
     */
    typelng=smpi_datatype_size( datatype);
    COLL_TUNED_COMPUTED_SEGCOUNT( segsize, typelng, segcount );

    /* An in-order binary tree must use root (size-1) to preserve the order of
       operations.  Thus, if root is not rank (size - 1), then we must handle
       1. MPI_IN_PLACE option on real root, and 
       2. we must allocate temporary recvbuf on rank (size - 1).
       Note that generic function must be careful not to switch order of 
       operations for non-commutative ops.
    */
    io_root = size - 1;
    use_this_sendbuf = sendbuf;
    use_this_recvbuf = recvbuf;
    if (io_root != root) {
        ptrdiff_t text, ext;
    
        /* Both values come from the same extent query here. */
        ext=smpi_datatype_get_extent(datatype);
        text=smpi_datatype_get_extent(datatype);

        if ((root == rank) && (MPI_IN_PLACE == sendbuf)) {
            /* Real root with MPI_IN_PLACE: its contribution lives in recvbuf,
               so copy it aside and send from the copy. */
            tmpbuf = (char *) malloc(text + (count - 1) * ext);
            if (NULL == tmpbuf) {
                return MPI_ERR_INTERN;
            }
            smpi_datatype_copy (
                                                (char*)recvbuf, count, datatype,
                                                (char*)tmpbuf, count, datatype);
            use_this_sendbuf = tmpbuf;
        } else if (io_root == rank) {
            /* Stand-in root: needs somewhere to accumulate the result. */
            tmpbuf = (char *) malloc(text + (count - 1) * ext);
            if (NULL == tmpbuf) {
                return MPI_ERR_INTERN;
            }
            use_this_recvbuf = tmpbuf;
        }
    }

    /* Use generic reduce with in-order binary tree topology and io_root */
    ret = smpi_coll_tuned_ompi_reduce_generic( use_this_sendbuf, use_this_recvbuf, count, datatype,
                                          op, io_root, comm, 
                                          ompi_coll_tuned_topo_build_in_order_bintree(comm), 
                                          segcount, 0 );

    /* Move the result from io_root to the real root, if they differ */
    if ((MPI_SUCCESS == ret) && (io_root != root)) {
        if (root == rank) {
            /* Receive result from rank io_root to recvbuf */
            smpi_mpi_recv(recvbuf, count, datatype, io_root,
                                    COLL_TAG_REDUCE, comm,
                                    MPI_STATUS_IGNORE);
        } else if (io_root == rank) {
            /* Send result from use_this_recvbuf to root */
            smpi_mpi_send(use_this_recvbuf, count, datatype, root,
                                    COLL_TAG_REDUCE,
                                    comm);
        }
    }

    /* Clean up: tmpbuf is NULL when nothing was allocated, and free(NULL)
       is a no-op, so this single call covers success and failure alike. */
    free(tmpbuf);

    return ret;
}
Beispiel #11
0
/*
 *  gather_intra_linear_sync
 *
 *  Function:  - synchronized gather operation with
 *  Accepts:  - same arguments as MPI_Gather(), first segment size
 *  Returns:  - MPI_SUCCESS or error code
 */
int Coll_gather_ompi_linear_sync::gather(void *sbuf, int scount,
                                         MPI_Datatype sdtype,
                                         void *rbuf, int rcount,
                                         MPI_Datatype rdtype,
                                         int root,
                                         MPI_Comm comm)
{
    int i;
    int ret, line;
    int rank, size;
    int first_segment_count;
    size_t typelng;
    MPI_Aint extent;
    MPI_Aint lb;

    int first_segment_size=0;
    size = comm->size();
    rank = comm->rank();

    size_t dsize, block_size;
    if (rank == root) {
        dsize= rdtype->size();
        block_size = dsize * rcount;
    } else {
        dsize=sdtype->size();
        block_size = dsize * scount;
    }

     if (block_size > 92160){
     first_segment_size = 32768;
     }else{
     first_segment_size = 1024;
     }

     XBT_DEBUG("smpi_coll_tuned_gather_ompi_linear_sync rank %d, segment %d", rank, first_segment_size);

     if (rank != root) {
       /* Non-root processes:
          - receive zero byte message from the root,
          - send the first segment of the data synchronously,
          - send the second segment of the data.
       */

       typelng = sdtype->size();
       sdtype->extent(&lb, &extent);
       first_segment_count = scount;
       COLL_TUNED_COMPUTED_SEGCOUNT((size_t)first_segment_size, typelng, first_segment_count);

       Request::recv(sbuf, 0, MPI_BYTE, root, COLL_TAG_GATHER, comm, MPI_STATUS_IGNORE);

       Request::send(sbuf, first_segment_count, sdtype, root, COLL_TAG_GATHER, comm);

       Request::send((char*)sbuf + extent * first_segment_count, (scount - first_segment_count), sdtype, root,
                     COLL_TAG_GATHER, comm);
    }

    else {
      /* Root process,
         - For every non-root node:
   - post irecv for the first segment of the message
   - send zero byte message to signal node to send the message
   - post irecv for the second segment of the message
   - wait for the first segment to complete
         - Copy local data if necessary
         - Waitall for all the second segments to complete.
*/
      char* ptmp;
      MPI_Request first_segment_req;
      MPI_Request* reqs = new (std::nothrow) MPI_Request[size];
      if (NULL == reqs) {
        ret  = -1;
        line = __LINE__;
        goto error_hndl; }

        typelng=rdtype->size();
        rdtype->extent(&lb, &extent);
        first_segment_count = rcount;
        COLL_TUNED_COMPUTED_SEGCOUNT( (size_t)first_segment_size, typelng,
                                      first_segment_count );

        for (i = 0; i < size; ++i) {
            if (i == rank) {
                /* skip myself */
                reqs[i] = MPI_REQUEST_NULL;
                continue;
            }

            /* irecv for the first segment from i */
            ptmp = (char*)rbuf + i * rcount * extent;
            first_segment_req = Request::irecv(ptmp, first_segment_count, rdtype, i,
                                     COLL_TAG_GATHER, comm
                                     );

            /* send sync message */
            Request::send(rbuf, 0, MPI_BYTE, i,
                                    COLL_TAG_GATHER,
                                     comm);

            /* irecv for the second segment */
            ptmp = (char*)rbuf + (i * rcount + first_segment_count) * extent;
            reqs[i]=Request::irecv(ptmp, (rcount - first_segment_count),
                                     rdtype, i, COLL_TAG_GATHER, comm
                                     );

            /* wait on the first segment to complete */
            Request::wait(&first_segment_req, MPI_STATUS_IGNORE);
        }

        /* copy local data if necessary */
        if (MPI_IN_PLACE != sbuf) {
            ret = Datatype::copy(sbuf, scount, sdtype,
                                  (char*)rbuf + rank * rcount * extent,
                                  rcount, rdtype);
            if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
        }

        /* wait all second segments to complete */
        ret = Request::waitall(size, reqs, MPI_STATUSES_IGNORE);
        if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }

        delete[] reqs;
    }

    /* All done */

    return MPI_SUCCESS;
 error_hndl:
    XBT_DEBUG(
                   "ERROR_HNDL: node %d file %s line %d error %d\n",
                   rank, __FILE__, line, ret );
    return ret;
}