Esempio n. 1
0
/**
 * Shared memory reduction.
 *
 * Simply farms out to the associative or non-associative functions.
 */
int mca_coll_sm_reduce_intra(void *sbuf, void* rbuf, int count, 
                             struct ompi_datatype_t *dtype, 
                             struct ompi_op_t *op, 
                             int root, struct ompi_communicator_t *comm)
{
    size_t size;

    /* There are several possibilities:
     *

     * 0. If the datatype is larger than a segment, fall back to basic
     * 1. If the op is user-defined, use the strict order
     * 2. If the op is intrinsic:
     *    a. If the op is float-associative, use the unordered
     *    b. If the op is not float-asociative:
     *       i. if the data is floating point, use the strict order
     *       ii. if the data is not floating point, use the unordered
     */

    ompi_ddt_type_size(dtype, &size);
    if ((int)size > mca_coll_sm_component.sm_control_size) {
        return comm->c_coll_basic_module->coll_reduce(sbuf, rbuf, count,
                                                      dtype, op, root, comm);
    } 
#if WANT_REDUCE_NO_ORDER
    else if (!ompi_op_is_intrinsic(op) ||
        (ompi_op_is_intrinsic(op) && !ompi_op_is_float_assoc(op) &&
         0 != (dtype->flags & DT_FLAG_DATA_FLOAT))) {
        return reduce_inorder(sbuf, rbuf, count, dtype, op, root, comm);
    } else {
        return reduce_no_order(sbuf, rbuf, count, dtype, op, root, comm);
    }
#else
    else {
        return reduce_inorder(sbuf, rbuf, count, dtype, op, root, comm);
Esempio n. 2
0
/**
 * Shared memory reduction.
 *
 * Simply farms out to the associative or non-associative functions.
 */
int mca_coll_sm_reduce_intra(void *sbuf, void* rbuf, int count, 
                             struct ompi_datatype_t *dtype, 
                             struct ompi_op_t *op, 
                             int root, struct ompi_communicator_t *comm,
                             mca_coll_base_module_t *module)
{
    size_t size;
    mca_coll_sm_module_t *sm_module = (mca_coll_sm_module_t*) module;

    /* There are several possibilities:
     *
     * 0. If the datatype is larger than a segment, fall back to
     *    underlying module
     * 1. If the op is user-defined, use the strict order
     * 2. If the op is intrinsic:
     *    a. If the op is float-associative, use the unordered
     *    b. If the op is not float-associative:
     *       i. if the data is floating point, use the strict order
     *       ii. if the data is not floating point, use the unordered
     */

    ompi_ddt_type_size(dtype, &size);
    if ((int)size > mca_coll_sm_component.sm_control_size) {
        return sm_module->previous_reduce(sbuf, rbuf, count,
                                          dtype, op, root, comm,
                                          sm_module->previous_reduce_module);
    } 
#if WANT_REDUCE_NO_ORDER
    else {
        /* Lazily enable the module the first time we invoke a
           collective on it */
        if (!sm_module->enabled) {
            if (OMPI_SUCCESS != 
                (ret = ompi_coll_sm_lazy_enable(module, comm))) {
                return ret;
            }
        }
        
        if (!ompi_op_is_intrinsic(op) ||
            (ompi_op_is_intrinsic(op) && !ompi_op_is_float_assoc(op) &&
             0 != (dtype->flags & OMPI_DATATYPE_FLAG_DATA_FLOAT))) {
            return reduce_inorder(sbuf, rbuf, count, dtype, op, 
                                  root, comm, module);
        } else {
            return reduce_no_order(sbuf, rbuf, count, dtype, op, 
                                   root, comm, module);
        }
    }
#else
    else {
        /* Lazily enable the module the first time we invoke a
           collective on it */
        if (!sm_module->enabled) {
/*
 *	reduce_scatter_intra_dec 
 *
 *	Function:	- seletects reduce_scatter algorithm to use
 *	Accepts:	- same arguments as MPI_Reduce_scatter()
 *	Returns:	- MPI_SUCCESS or error code (passed from 
 *                        the reduce scatter implementation)
 *      Note: If we detect zero valued counts in the rcounts array, we
 *      fall back to the nonoverlapping algorithm because the other
 *      algorithms do not currently handle it.
 */
int ompi_coll_tuned_reduce_scatter_intra_dec_fixed( void *sbuf, void *rbuf,
                                                    int *rcounts,
                                                    struct ompi_datatype_t *dtype,
                                                    struct ompi_op_t *op,
                                                    struct ompi_communicator_t *comm,
                                                    mca_coll_base_module_t *module)
{
    int comm_size, i, pow2;
    size_t total_message_size, dsize;
    const double a = 0.0012;
    const double b = 8.0;
    const size_t small_message_size = 12 * 1024;
    const size_t large_message_size = 256 * 1024;
    bool zerocounts = false;

    OPAL_OUTPUT((ompi_coll_tuned_stream, "ompi_coll_tuned_reduce_scatter_intra_dec_fixed"));

    comm_size = ompi_comm_size(comm);
    /* We need data size for decision function */
    ompi_ddt_type_size(dtype, &dsize);
    total_message_size = 0;
    for (i = 0; i < comm_size; i++) { 
        total_message_size += rcounts[i];
        if (0 == rcounts[i]) {
            zerocounts = true;
        }
    }

    if( !ompi_op_is_commute(op) || (zerocounts)) {
        return ompi_coll_tuned_reduce_scatter_intra_nonoverlapping (sbuf, rbuf, rcounts, 
                                                                    dtype, op, 
                                                                    comm, module); 
    }
   
    total_message_size *= dsize;

    /* compute the nearest power of 2 */
    for (pow2 = 1; pow2 < comm_size; pow2 <<= 1);

    if ((total_message_size <= small_message_size) ||
        ((total_message_size <= large_message_size) && (pow2 == comm_size)) ||
        (comm_size >= a * total_message_size + b)) {
        return 
            ompi_coll_tuned_reduce_scatter_intra_basic_recursivehalving(sbuf, rbuf, rcounts,
                                                                        dtype, op,
                                                                        comm, module);
    } 
    return ompi_coll_tuned_reduce_scatter_intra_ring(sbuf, rbuf, rcounts,
                                                     dtype, op,
                                                     comm, module);
}
int ompi_coll_tuned_allgatherv_intra_dec_fixed(void *sbuf, int scount, 
                                               struct ompi_datatype_t *sdtype,
                                               void* rbuf, int *rcounts, 
                                               int *rdispls,
                                               struct ompi_datatype_t *rdtype, 
                                               struct ompi_communicator_t *comm,
                                               mca_coll_base_module_t *module)
{
    int i;
    int communicator_size;
    size_t dsize, total_dsize;
    
    communicator_size = ompi_comm_size(comm);
    
    /* Special case for 2 processes */
    if (communicator_size == 2) {
        return ompi_coll_tuned_allgatherv_intra_two_procs (sbuf, scount, sdtype,
                                                           rbuf, rcounts, rdispls, rdtype, 
                                                           comm, module);
    }
    
    /* Determine complete data size */
    ompi_ddt_type_size(sdtype, &dsize);
    total_dsize = 0;
    for (i = 0; i < communicator_size; i++) {
        total_dsize += dsize * rcounts[i];
    }
    
    OPAL_OUTPUT((ompi_coll_tuned_stream, 
                 "ompi_coll_tuned_allgatherv_intra_dec_fixed"
                 " rank %d com_size %d msg_length %lu",
                 ompi_comm_rank(comm), communicator_size, (unsigned long)total_dsize));
    
    /* Decision based on allgather decision.   */
    if (total_dsize < 50000) {
        return ompi_coll_tuned_allgatherv_intra_bruck(sbuf, scount, sdtype, 
                                                      rbuf, rcounts, rdispls, rdtype, 
                                                      comm, module);
    } else {
        if (communicator_size % 2) {
            return ompi_coll_tuned_allgatherv_intra_ring(sbuf, scount, sdtype, 
                                                         rbuf, rcounts, rdispls, rdtype, 
                                                         comm, module);
        } else {
            return  ompi_coll_tuned_allgatherv_intra_neighborexchange(sbuf, scount, sdtype,
                                                                      rbuf, rcounts, rdispls, rdtype, 
                                                                      comm, module);
        }
    }
}
int ompi_coll_tuned_scatter_intra_dec_fixed(void *sbuf, int scount, 
                                            struct ompi_datatype_t *sdtype,
                                            void* rbuf, int rcount, 
                                            struct ompi_datatype_t *rdtype, 
                                            int root, struct ompi_communicator_t *comm,
                                            mca_coll_base_module_t *module)
{
    const size_t small_block_size = 300;
    const int small_comm_size = 10;
    int communicator_size, rank;
    size_t dsize, block_size;

    OPAL_OUTPUT((ompi_coll_tuned_stream, 
                 "ompi_coll_tuned_scatter_intra_dec_fixed"));

    communicator_size = ompi_comm_size(comm);
    rank = ompi_comm_rank(comm);
    /* Determine block size */
    if (root == rank) {
        ompi_ddt_type_size(sdtype, &dsize);
        block_size = dsize * scount;
    } else {
        ompi_ddt_type_size(rdtype, &dsize);
        block_size = dsize * rcount;
    } 

    if ((communicator_size > small_comm_size) &&
        (block_size < small_block_size)) {
        return ompi_coll_tuned_scatter_intra_binomial (sbuf, scount, sdtype, 
                                                       rbuf, rcount, rdtype, 
                                                       root, comm, module);
    }
    return ompi_coll_tuned_scatter_intra_basic_linear (sbuf, scount, sdtype, 
                                                       rbuf, rcount, rdtype, 
                                                       root, comm, module);
}
/*
 *  allreduce_intra
 *
 *  Function:   - allreduce using other MPI collectives
 *  Accepts:    - same as MPI_Allreduce()
 *  Returns:    - MPI_SUCCESS or error code
 */
int
ompi_coll_tuned_allreduce_intra_dec_fixed (void *sbuf, void *rbuf, int count,
                                           struct ompi_datatype_t *dtype,
                                           struct ompi_op_t *op,
                                           struct ompi_communicator_t *comm,
                                           mca_coll_base_module_t *module)
{
    size_t dsize, block_dsize;
    int comm_size = ompi_comm_size(comm);
    const size_t intermediate_message = 10000;
    OPAL_OUTPUT((ompi_coll_tuned_stream, "ompi_coll_tuned_allreduce_intra_dec_fixed"));

    /**
     * Decision function based on MX results from the Grig cluster at UTK.
     * 
     * Currently, linear, recursive doubling, and nonoverlapping algorithms 
     * can handle both commutative and non-commutative operations.
     * Ring algorithm does not support non-commutative operations.
     */
    ompi_ddt_type_size(dtype, &dsize);
    block_dsize = dsize * count;

    if (block_dsize < intermediate_message) {
        return (ompi_coll_tuned_allreduce_intra_recursivedoubling (sbuf, rbuf, 
                                                                   count, dtype,
                                                                   op, comm, module));
    } 

    if( ompi_op_is_commute(op) && (count > comm_size) ) {
        const size_t segment_size = 1 << 20; /* 1 MB */
        if ((comm_size * segment_size >= block_dsize)) {
            return (ompi_coll_tuned_allreduce_intra_ring (sbuf, rbuf, count, dtype, 
                                                          op, comm, module));
        } else {
            return (ompi_coll_tuned_allreduce_intra_ring_segmented (sbuf, rbuf, 
                                                                    count, dtype, 
                                                                    op, comm, module,
                                                                    segment_size));
        }
    }

    return (ompi_coll_tuned_allreduce_intra_nonoverlapping (sbuf, rbuf, count, 
                                                            dtype, op, comm, module));
}
Esempio n. 7
0
int ompi_coll_tuned_reduce_intra_chain( void *sendbuf, void *recvbuf, int count,
                                        ompi_datatype_t* datatype, ompi_op_t* op,
                                        int root, ompi_communicator_t* comm, uint32_t segsize,
                                        int fanout)
{
    int segcount = count;
    size_t typelng;

    OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:reduce_intra_chain rank %d fo %d ss %5d", ompi_comm_rank(comm), fanout, segsize));

    COLL_TUNED_UPDATE_CHAIN( comm, root, fanout );
    /**
     * Determine number of segments and number of elements
     * sent per operation
     */
    ompi_ddt_type_size( datatype, &typelng );
    COLL_TUNED_COMPUTED_SEGCOUNT( segsize, typelng, segcount );

    return ompi_coll_tuned_reduce_generic( sendbuf, recvbuf, count, datatype, op, root, comm,
                                           comm->c_coll_selected_data->cached_chain, segcount );
}
Esempio n. 8
0
/**
 * This is a generic implementation of the reduce protocol. It used the tree
 * provided as an argument and execute all operations using a segment of
 * count times a datatype.
 * For the last communication it will update the count in order to limit
 * th number of datatype to the original count (original_count)
 */
int ompi_coll_tuned_reduce_generic( void* sendbuf, void* recvbuf, int original_count,
                                    ompi_datatype_t* datatype, ompi_op_t* op,
                                    int root, ompi_communicator_t* comm,
                                    ompi_coll_tree_t* tree, int count_by_segment )
{
    char *inbuf[2] = {(char*)NULL, (char*)NULL};
    char *local_op_buffer = NULL, *accumbuf = NULL, *sendtmpbuf = NULL;
    ptrdiff_t extent, lower_bound;
    size_t typelng, realsegsize;
    ompi_request_t* reqs[2] = {MPI_REQUEST_NULL, MPI_REQUEST_NULL};
    int num_segments, line, ret, segindex, i, rank;
    int recvcount, prevcount, inbi, previnbi;

    /**
     * Determine number of segments and number of elements
     * sent per operation
     */
    ompi_ddt_get_extent( datatype, &lower_bound, &extent );
    ompi_ddt_type_size( datatype, &typelng );
    num_segments = (original_count + count_by_segment - 1) / count_by_segment;
    realsegsize = count_by_segment * extent;

    sendtmpbuf = (char*) sendbuf; 
    if( sendbuf == MPI_IN_PLACE ) { 
        sendtmpbuf = (char *)recvbuf; 
    }

    rank = ompi_comm_rank(comm);

    /* non-leaf nodes - wait for children to send me data & forward up (if needed) */
    if( tree->tree_nextsize > 0 ) {
        /* handle non existant recv buffer (i.e. its NULL) and 
           protect the recv buffer on non-root nodes */
        accumbuf = (char*)recvbuf;
        if( (NULL == accumbuf) || (root != rank) ) {
            accumbuf = (char*)malloc(realsegsize * num_segments);  /* TO BE OPTIMIZED */
	    if (accumbuf == NULL) { line = __LINE__; ret = -1; goto error_hndl; }
	} 

        /* Allocate two buffers for incoming segments */
        inbuf[0] = (char*) malloc(realsegsize);
        if( inbuf[0] == NULL ) { line = __LINE__; ret = -1; goto error_hndl; }
        /* if there is chance to overlap communication -
           allocate second buffer */
        if( (num_segments > 1) || (tree->tree_nextsize > 1) ) {
            inbuf[1] = (char*) malloc(realsegsize);
            if( inbuf[1] == NULL ) { line = __LINE__; ret = -1; goto error_hndl;}
        } else {
            inbuf[1] = NULL;
        }

        /* reset input buffer index and receive count */
        inbi = 0;
        recvcount = 0;
        /* for each segment */
        for( segindex = 0; segindex <= num_segments; segindex++ ) {
            prevcount = recvcount;
            /* recvcount - number of elements in current segment */
            recvcount = count_by_segment;
            if( segindex == (num_segments-1) )
                recvcount = original_count - count_by_segment * segindex;

            /* for each child */
            for( i = 0; i < tree->tree_nextsize; i++ ) {
                /**
                 * We try to overlap communication:
                 * either with next segment or with the next child
                 */
                /* post irecv for current segindex on current child */
                if( segindex < num_segments ) {
                    void* local_recvbuf = inbuf[inbi];
                    if( 0 == i ) {
                        /* for the first step (1st child per segment) we might be able to
                         * irecv directly into the accumulate buffer so that we can
                         * reduce(op) this with our sendbuf in one step as ompi_op_reduce
                         * only has two buffer pointers, this avoids an extra memory copy.
                         *
                         * BUT if we are root and are USING MPI_IN_PLACE this is wrong ek!
                         * check for root might not be needed as it should be checked higher up
                         */
                        if( !((MPI_IN_PLACE == sendbuf) && (rank == tree->tree_root)) ) {
                            local_recvbuf = accumbuf + segindex * realsegsize;
                        }
                    }
                    ret = MCA_PML_CALL(irecv(local_recvbuf, recvcount, datatype, tree->tree_next[i],
                                             MCA_COLL_BASE_TAG_REDUCE, comm, &reqs[inbi]));
                    if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl;  }
                }
                /* wait for previous req to complete, if any */
                previnbi = (inbi+1) % 2;
                /* wait on data from last child for previous segment */
                ret = ompi_request_wait_all( 1, &reqs[previnbi], MPI_STATUSES_IGNORE );
                if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl;  }
                local_op_buffer = inbuf[previnbi];
                if( i > 0 ) {
                    /* our first operation is to combine our own [sendbuf] data with the data
                     * we recvd from down stream (but only if we are not root and not using
                     * MPI_IN_PLACE)
                     */
                    if( 1 == i ) {
                        if( !((MPI_IN_PLACE == sendbuf) && (rank == tree->tree_root)) ) {
                            local_op_buffer = sendtmpbuf + segindex * realsegsize;
                        }
                    }
                    /* apply operation */
                    ompi_op_reduce(op, local_op_buffer, accumbuf+segindex*realsegsize, recvcount, datatype );
                } else if ( segindex > 0 ) {
                    void* accumulator = accumbuf + (segindex-1) * realsegsize;

                    if( tree->tree_nextsize <= 1 ) {
                        if( !((MPI_IN_PLACE == sendbuf) && (rank == tree->tree_root)) ) {            
                            local_op_buffer = sendtmpbuf+(segindex-1)*realsegsize;
                        }
                    }
                    ompi_op_reduce(op, local_op_buffer, accumulator, prevcount, datatype );

                    /* all reduced on available data this step (i) complete, pass to
                     * the next process unless your the root
                     */
                    if (rank != tree->tree_root) {
                        /* send combined/accumulated data to parent */
                        ret = MCA_PML_CALL( send( accumulator, prevcount, datatype,
                                                  tree->tree_prev, MCA_COLL_BASE_TAG_REDUCE,
                                                  MCA_PML_BASE_SEND_STANDARD, comm) );
                        if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl;  }
                    }

                    /* we stop when segindex = number of segments (i.e. we do num_segment+1 steps to allow for pipelining */
                    if (segindex == num_segments) break;
                }

                /* update input buffer index */
                inbi = previnbi;
            } /* end of for each child */
        } /* end of for each segment */

        /* clean up */
        if( inbuf[0] != NULL) free(inbuf[0]);
        if( inbuf[1] != NULL) free(inbuf[1]);
        if( (NULL == recvbuf) || (root != rank) ) free(accumbuf);
    }

    /* leaf nodes */
    else {
        /* Send segmented data to parents */
        segindex = 0;
        while( original_count > 0 ) {
            if( original_count < count_by_segment ) count_by_segment = original_count;
            ret = MCA_PML_CALL( send((char*)sendbuf + segindex * realsegsize, count_by_segment,
                                     datatype, tree->tree_prev,
                                     MCA_COLL_BASE_TAG_REDUCE, MCA_PML_BASE_SEND_STANDARD, comm) );
            if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl;  }
            segindex++;
            original_count -= count_by_segment;
        }
    }
    return OMPI_SUCCESS;

 error_hndl:  /* error handler */
    OPAL_OUTPUT (( ompi_coll_tuned_stream, "ERROR_HNDL: node %d file %s line %d error %d\n", rank, __FILE__, line, ret ));
    if( inbuf[0] != NULL ) free(inbuf[0]);
    if( inbuf[1] != NULL ) free(inbuf[1]);
    if( (NULL == recvbuf) && (NULL != accumbuf) ) free(accumbuf);
    return ret;
}
int ompi_coll_tuned_alltoall_intra_dec_fixed(void *sbuf, int scount, 
                                             struct ompi_datatype_t *sdtype,
                                             void* rbuf, int rcount, 
                                             struct ompi_datatype_t *rdtype, 
                                             struct ompi_communicator_t *comm,
                                             mca_coll_base_module_t *module)
{
    int communicator_size;
    size_t dsize, block_dsize;
#if 0
    size_t total_dsize;
#endif

    communicator_size = ompi_comm_size(comm);

    /* special case */
    if (communicator_size==2) {
        return ompi_coll_tuned_alltoall_intra_two_procs(sbuf, scount, sdtype, 
                                                        rbuf, rcount, rdtype, 
                                                        comm, module);
    }

    /* Decision function based on measurement on Grig cluster at 
       the University of Tennessee (2GB MX) up to 64 nodes.
       Has better performance for messages of intermediate sizes than the old one */
    /* determine block size */
    ompi_ddt_type_size(sdtype, &dsize);
    block_dsize = dsize * scount;

    if ((block_dsize < 200) && (communicator_size > 12)) {
        return ompi_coll_tuned_alltoall_intra_bruck(sbuf, scount, sdtype, 
                                                    rbuf, rcount, rdtype,
                                                    comm, module);

    } else if (block_dsize < 3000) {
        return ompi_coll_tuned_alltoall_intra_basic_linear(sbuf, scount, sdtype, 
                                                           rbuf, rcount, rdtype, 
                                                           comm, module);
    }

    return ompi_coll_tuned_alltoall_intra_pairwise (sbuf, scount, sdtype, 
                                                    rbuf, rcount, rdtype,
                                                    comm, module);

#if 0
    /* previous decision */

    /* else we need data size for decision function */
    ompi_ddt_type_size(sdtype, &dsize);
    total_dsize = dsize * scount * communicator_size;   /* needed for decision */

    OPAL_OUTPUT((ompi_coll_tuned_stream, "ompi_coll_tuned_alltoall_intra_dec_fixed rank %d com_size %d msg_length %ld",
                 ompi_comm_rank(comm), communicator_size, total_dsize));

    if (communicator_size >= 12 && total_dsize <= 768) {
        return ompi_coll_tuned_alltoall_intra_bruck (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm, module);
    }
    if (total_dsize <= 131072) {
        return ompi_coll_tuned_alltoall_intra_basic_linear (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm, module);
    }
    return ompi_coll_tuned_alltoall_intra_pairwise (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm, module);
#endif
}
int ompi_coll_tuned_gather_intra_dec_fixed(void *sbuf, int scount, 
                                           struct ompi_datatype_t *sdtype,
                                           void* rbuf, int rcount, 
                                           struct ompi_datatype_t *rdtype, 
                                           int root,
                                           struct ompi_communicator_t *comm,
                                           mca_coll_base_module_t *module)
{
    const int large_segment_size = 32768;
    const int small_segment_size = 1024;

    const size_t large_block_size = 92160;
    const size_t intermediate_block_size = 6000;
    const size_t small_block_size = 1024;

    const int large_communicator_size = 60;
    const int small_communicator_size = 10;

    int communicator_size, rank;
    size_t dsize, block_size;

    OPAL_OUTPUT((ompi_coll_tuned_stream, 
                 "ompi_coll_tuned_gather_intra_dec_fixed"));

    communicator_size = ompi_comm_size(comm);
    rank = ompi_comm_rank(comm);

    /* Determine block size */
    if (rank == root) {
        ompi_ddt_type_size(rdtype, &dsize);
        block_size = dsize * rcount;
    } else {
        ompi_ddt_type_size(sdtype, &dsize);
        block_size = dsize * scount;
    }

    if (block_size > large_block_size) {
        return ompi_coll_tuned_gather_intra_linear_sync (sbuf, scount, sdtype, 
                                                         rbuf, rcount, rdtype, 
                                                         root, comm, module,
                                                         large_segment_size);

    } else if (block_size > intermediate_block_size) {
        return ompi_coll_tuned_gather_intra_linear_sync (sbuf, scount, sdtype, 
                                                         rbuf, rcount, rdtype, 
                                                         root, comm, module,
                                                         small_segment_size);

    } else if ((communicator_size > large_communicator_size) ||
               ((communicator_size > small_communicator_size) &&
                (block_size < small_block_size))) {
        return ompi_coll_tuned_gather_intra_binomial (sbuf, scount, sdtype, 
                                                      rbuf, rcount, rdtype, 
                                                      root, comm, module);

    }
    /* Otherwise, use basic linear */
    return ompi_coll_tuned_gather_intra_basic_linear (sbuf, scount, sdtype, 
                                                      rbuf, rcount, rdtype, 
                                                      root, comm, module);
}
int ompi_coll_tuned_allgather_intra_dec_fixed(void *sbuf, int scount, 
                                              struct ompi_datatype_t *sdtype,
                                              void* rbuf, int rcount, 
                                              struct ompi_datatype_t *rdtype, 
                                              struct ompi_communicator_t *comm,
                                              mca_coll_base_module_t *module)
{
    int communicator_size, pow2_size;
    size_t dsize, total_dsize;

    communicator_size = ompi_comm_size(comm);

    /* Special case for 2 processes */
    if (communicator_size == 2) {
        return ompi_coll_tuned_allgather_intra_two_procs (sbuf, scount, sdtype, 
                                                          rbuf, rcount, rdtype, 
                                                          comm, module);
    }

   /* Determine complete data size */
   ompi_ddt_type_size(sdtype, &dsize);
   total_dsize = dsize * scount * communicator_size;   
   
    OPAL_OUTPUT((ompi_coll_tuned_stream, "ompi_coll_tuned_allgather_intra_dec_fixed"
                 " rank %d com_size %d msg_length %lu",
                 ompi_comm_rank(comm), communicator_size, (unsigned long)total_dsize));

    for (pow2_size  = 1; pow2_size < communicator_size; pow2_size <<=1); 

    /* Decision based on MX 2Gb results from Grig cluster at 
       The University of Tennesse, Knoxville 
       - if total message size is less than 50KB use either bruck or 
       recursive doubling for non-power of two and power of two nodes, 
       respectively.
       - else use ring and neighbor exchange algorithms for odd and even 
       number of nodes, respectively.
    */
    if (total_dsize < 50000) {
        if (pow2_size == communicator_size) {
            return ompi_coll_tuned_allgather_intra_recursivedoubling(sbuf, scount, sdtype, 
                                                                     rbuf, rcount, rdtype,
                                                                     comm, module);
        } else {
            return ompi_coll_tuned_allgather_intra_bruck(sbuf, scount, sdtype, 
                                                         rbuf, rcount, rdtype, 
                                                         comm, module);
        }
    } else {
        if (communicator_size % 2) {
            return ompi_coll_tuned_allgather_intra_ring(sbuf, scount, sdtype, 
                                                        rbuf, rcount, rdtype, 
                                                        comm, module);
        } else {
            return  ompi_coll_tuned_allgather_intra_neighborexchange(sbuf, scount, sdtype,
                                                                     rbuf, rcount, rdtype,
                                                                     comm, module);
        }
    }
   
#if defined(USE_MPICH2_DECISION)
    /* Decision as in MPICH-2 
       presented in Thakur et.al. "Optimization of Collective Communication 
       Operations in MPICH", International Journal of High Performance Computing 
       Applications, Vol. 19, No. 1, 49-66 (2005)
       - for power-of-two processes and small and medium size messages 
       (up to 512KB) use recursive doubling
       - for non-power-of-two processes and small messages (80KB) use bruck,
       - for everything else use ring.
    */
    if ((pow2_size == communicator_size) && (total_dsize < 524288)) {
        return ompi_coll_tuned_allgather_intra_recursivedoubling(sbuf, scount, sdtype, 
                                                                 rbuf, rcount, rdtype, 
                                                                 comm, module);
    } else if (total_dsize <= 81920) { 
        return ompi_coll_tuned_allgather_intra_bruck(sbuf, scount, sdtype, 
                                                     rbuf, rcount, rdtype,
                                                     comm, module);
    } 
    return ompi_coll_tuned_allgather_intra_ring(sbuf, scount, sdtype, 
                                                rbuf, rcount, rdtype,
                                                comm, module);
#endif  /* defined(USE_MPICH2_DECISION) */
}
/*
 *	reduce_intra_dec 
 *
 *	Function:	- seletects reduce algorithm to use
 *	Accepts:	- same arguments as MPI_reduce()
 *	Returns:	- MPI_SUCCESS or error code (passed from the reduce implementation)
 *                                        
 */
int ompi_coll_tuned_reduce_intra_dec_fixed( void *sendbuf, void *recvbuf,
                                            int count, struct ompi_datatype_t* datatype,
                                            struct ompi_op_t* op, int root,
                                            struct ompi_communicator_t* comm,
                                            mca_coll_base_module_t *module)
{
    int communicator_size, segsize = 0;
    size_t message_size, dsize;
    const double a1 =  0.6016 / 1024.0; /* [1/B] */
    const double b1 =  1.3496;
    const double a2 =  0.0410 / 1024.0; /* [1/B] */
    const double b2 =  9.7128;
    const double a3 =  0.0422 / 1024.0; /* [1/B] */
    const double b3 =  1.1614;
    const double a4 =  0.0033 / 1024.0; /* [1/B] */
    const double b4 =  1.6761;

    const int max_requests = 0; /* no limit on # of outstanding requests */

    communicator_size = ompi_comm_size(comm);

    /* need data size for decision function */
    ompi_ddt_type_size(datatype, &dsize);
    message_size = dsize * count;   /* needed for decision */

    /**
     * If the operation is non commutative we currently have choice of linear 
     * or in-order binary tree algorithm.
     */
    if( !ompi_op_is_commute(op) ) {
        if ((communicator_size < 12) && (message_size < 2048)) {
            return ompi_coll_tuned_reduce_intra_basic_linear (sendbuf, recvbuf, count, datatype, op, root, comm, module); 
        } 
        return ompi_coll_tuned_reduce_intra_in_order_binary (sendbuf, recvbuf, count, datatype, op, root, comm, module,
                                                             0, max_requests); 
    }

    OPAL_OUTPUT((ompi_coll_tuned_stream, "ompi_coll_tuned_reduce_intra_dec_fixed "
                 "root %d rank %d com_size %d msg_length %lu",
                 root, ompi_comm_rank(comm), communicator_size, (unsigned long)message_size));

    if ((communicator_size < 8) && (message_size < 512)){
        /* Linear_0K */
        return ompi_coll_tuned_reduce_intra_basic_linear (sendbuf, recvbuf, count, datatype, op, root, comm, module); 
    } else if (((communicator_size < 8) && (message_size < 20480)) ||
               (message_size < 2048) || (count <= 1)) {
        /* Binomial_0K */
        segsize = 0;
        return ompi_coll_tuned_reduce_intra_binomial(sendbuf, recvbuf, count, datatype, op, root, comm, module,
                                                     segsize, max_requests);
    } else if (communicator_size > (a1 * message_size + b1)) {
        /* Binomial_1K */
        segsize = 1024;
        return ompi_coll_tuned_reduce_intra_binomial(sendbuf, recvbuf, count, datatype, op, root, comm, module,
                                                     segsize, max_requests);
    } else if (communicator_size > (a2 * message_size + b2)) {
        /* Pipeline_1K */
        segsize = 1024;
        return ompi_coll_tuned_reduce_intra_pipeline (sendbuf, recvbuf, count, datatype, op, root, comm, module, 
                                                      segsize, max_requests);
    } else if (communicator_size > (a3 * message_size + b3)) {
        /* Binary_32K */
        segsize = 32*1024;
        return ompi_coll_tuned_reduce_intra_pipeline (sendbuf, recvbuf, count, datatype, op, root, comm, module,
						      segsize, max_requests);
    }
    if (communicator_size > (a4 * message_size + b4)) {
        /* Pipeline_32K */
        segsize = 32*1024;
    } else {
        /* Pipeline_64K */
        segsize = 64*1024;
    }
    return ompi_coll_tuned_reduce_intra_pipeline (sendbuf, recvbuf, count, datatype, op, root, comm, module, 
                                                  segsize, max_requests);

#if 0
    /* for small messages use linear algorithm */
    if (message_size <= 4096) {
        segsize = 0;
        fanout = communicator_size - 1;
        /* when linear implemented or taken from basic put here, right now using chain as a linear system */
        /* it is implemented and I shouldn't be calling a chain with a fanout bigger than MAXTREEFANOUT from topo.h! */
        return ompi_coll_tuned_reduce_intra_basic_linear (sendbuf, recvbuf, count, datatype, op, root, comm, module); 
        /*        return ompi_coll_tuned_reduce_intra_chain (sendbuf, recvbuf, count, datatype, op, root, comm, segsize, fanout); */
    }
    if (message_size < 524288) {
        if (message_size <= 65536 ) {
            segsize = 32768;
            fanout = 8;
        } else {
            segsize = 1024;
            fanout = communicator_size/2;
        }
        /* later swap this for a binary tree */
        /*         fanout = 2; */
        return ompi_coll_tuned_reduce_intra_chain (sendbuf, recvbuf, count, datatype, op, root, comm, module,
                                                   segsize, fanout, max_requests);
    }
    segsize = 1024;
    return ompi_coll_tuned_reduce_intra_pipeline (sendbuf, recvbuf, count, datatype, op, root, comm, module,
                                                  segsize, max_requests);
#endif  /* 0 */
}
/*
 *	bcast_intra_dec 
 *
 *	Function:	- seletects broadcast algorithm to use
 *	Accepts:	- same arguments as MPI_Bcast()
 *	Returns:	- MPI_SUCCESS or error code (passed from the bcast implementation)
 */
int ompi_coll_tuned_bcast_intra_dec_fixed(void *buff, int count,
                                          struct ompi_datatype_t *datatype, int root,
                                          struct ompi_communicator_t *comm,
                                          mca_coll_base_module_t *module)
{
    /* Decision function based on MX results for 
       messages up to 36MB and communicator sizes up to 64 nodes */
    const size_t small_message_size = 2048;
    const size_t intermediate_message_size = 370728;
    const double a_p16  = 3.2118e-6; /* [1 / byte] */
    const double b_p16  = 8.7936;   
    const double a_p64  = 2.3679e-6; /* [1 / byte] */
    const double b_p64  = 1.1787;     
    const double a_p128 = 1.6134e-6; /* [1 / byte] */
    const double b_p128 = 2.1102;

    int communicator_size;
    int segsize = 0;
    size_t message_size, dsize;

    communicator_size = ompi_comm_size(comm);

    /* else we need data size for decision function */
    ompi_ddt_type_size(datatype, &dsize);
    message_size = dsize * (unsigned long)count;   /* needed for decision */

    OPAL_OUTPUT((ompi_coll_tuned_stream, "ompi_coll_tuned_bcast_intra_dec_fixed"
                 " root %d rank %d com_size %d msg_length %lu",
                 root, ompi_comm_rank(comm), communicator_size, (unsigned long)message_size));

    /* Handle messages of small and intermediate size, and 
       single-element broadcasts */
    if ((message_size < small_message_size) || (count <= 1)) {
        /* Binomial without segmentation */
        segsize = 0;
        return  ompi_coll_tuned_bcast_intra_binomial (buff, count, datatype, 
                                                      root, comm, module,
                                                      segsize);

    } else if (message_size < intermediate_message_size) {
        /* SplittedBinary with 1KB segments */
        segsize = 1024;
        return ompi_coll_tuned_bcast_intra_split_bintree(buff, count, datatype, 
                                                         root, comm, module,
                                                         segsize);

    } 
    /* Handle large message sizes */
    else if (communicator_size < (a_p128 * message_size + b_p128)) {
        /* Pipeline with 128KB segments */
        segsize = 1024  << 7;
        return ompi_coll_tuned_bcast_intra_pipeline (buff, count, datatype, 
                                                     root, comm, module,
                                                     segsize);

    } else if (communicator_size < 13) {
        /* Split Binary with 8KB segments */
        segsize = 1024 << 3;
        return ompi_coll_tuned_bcast_intra_split_bintree(buff, count, datatype, 
                                                         root, comm, module,
                                                         segsize);
       
    } else if (communicator_size < (a_p64 * message_size + b_p64)) {
        /* Pipeline with 64KB segments */
        segsize = 1024 << 6;
        return ompi_coll_tuned_bcast_intra_pipeline (buff, count, datatype, 
                                                     root, comm, module,
                                                     segsize);

    } else if (communicator_size < (a_p16 * message_size + b_p16)) {
        /* Pipeline with 16KB segments */
        segsize = 1024 << 4;
        return ompi_coll_tuned_bcast_intra_pipeline (buff, count, datatype, 
                                                     root, comm, module,
                                                     segsize);

    }

    /* Pipeline with 8KB segments */
    segsize = 1024 << 3;
    return ompi_coll_tuned_bcast_intra_pipeline (buff, count, datatype, 
                                                 root, comm, module, 
                                                 segsize);
#if 0
    /* this is based on gige measurements */

    if (communicator_size  < 4) {
        return ompi_coll_tuned_bcast_intra_basic_linear (buff, count, datatype, root, comm, module);
    }
    if (communicator_size == 4) {
        if (message_size < 524288) segsize = 0;
        else segsize = 16384;
        return ompi_coll_tuned_bcast_intra_bintree (buff, count, datatype, root, comm, module, segsize);
    }
    if (communicator_size <= 8 && message_size < 4096) {
        return ompi_coll_tuned_bcast_intra_basic_linear (buff, count, datatype, root, comm, module);
    }
    if (communicator_size > 8 && message_size >= 32768 && message_size < 524288) {
        segsize = 16384;
        return  ompi_coll_tuned_bcast_intra_bintree (buff, count, datatype, root, comm, module, segsize);
    }
    if (message_size >= 524288) {
        segsize = 16384;
        return ompi_coll_tuned_bcast_intra_pipeline (buff, count, datatype, root, comm, module, segsize);
    }
    segsize = 0;
    /* once tested can swap this back in */
    /* return ompi_coll_tuned_bcast_intra_bmtree (buff, count, datatype, root, comm, segsize); */
    return ompi_coll_tuned_bcast_intra_bintree (buff, count, datatype, root, comm, module, segsize);
#endif  /* 0 */
}
Esempio n. 14
0
static int mpich_typeub3( void )
{
   int blocklen[2], err = 0, idisp[3];
   size_t sz;
   MPI_Aint disp[3], lb, ub, ex;
   ompi_datatype_t *types[3], *dt1, *dt2, *dt3, *dt4, *dt5;

   /* Create a datatype with explicit LB and UB */
   blocklen[0] = 1;
   blocklen[1] = 1;
   blocklen[2] = 1;
   disp[0] = -3;
   disp[1] = 0; 
   disp[2] = 6;
   types[0] = &ompi_mpi_lb;  /* ompi_ddt_basicDatatypes[DT_LB]; */
   types[1] = &ompi_mpi_int;  /* ompi_ddt_basicDatatypes[DT_INT]; */
   types[2] = &ompi_mpi_ub;  /* ompi_ddt_basicDatatypes[DT_UB]; */
   
   /* Generate samples for contiguous, hindexed, hvector, indexed, and vector (struct and contiguous tested in typeub2) */                                                                                                                         
   ompi_ddt_create_struct(3,blocklen,disp, types,&dt1);
   ompi_ddt_commit(&dt1);

   /* This type is the same as in typeub2, and is tested there */
   types[0]=dt1;               types[1]=dt1;
   blocklen[0]=1;              blocklen[1]=1;
   disp[0]=-4;                 disp[1]=7;
   idisp[0]=-4;                idisp[1]=7;

   ompi_ddt_create_hindexed( 2, blocklen, disp, dt1, &dt2 );
   ompi_ddt_commit( &dt2 );

   ompi_ddt_type_lb( dt2, &lb );       ompi_ddt_type_ub( dt2, &ub );
   ompi_ddt_type_extent( dt2, &ex );   ompi_ddt_type_size( dt2, &sz );

   if (lb != -7 || ub != 13 || ex != 20) {
      printf("hindexed lb %d ub %d extent %d size %d\n", (int)-7, (int)13, (int)20, (int)sz);
      printf("hindexed lb %d ub %d extent %d size %d\n", (int)lb, (int)ub, (int)ex, (int)sz);
      err++;
   }
   else
      printf( "hindexed ok\n" );

   ompi_ddt_create_indexed( 2, blocklen, idisp, dt1, &dt3 );
   ompi_ddt_commit( &dt3 );

   ompi_ddt_type_lb( dt3, &lb );       ompi_ddt_type_ub( dt3, &ub );
   ompi_ddt_type_extent( dt3, &ex );   ompi_ddt_type_size( dt3, &sz );

   if (lb != -39 || ub != 69 || ex != 108) {
      printf("indexed lb %d ub %d extent %d size %d\n", (int)-39, (int)69, (int)108, (int)sz);
      printf("indexed lb %d ub %d extent %d size %d\n", (int)lb, (int)ub, (int)ex, (int)sz);
      err++;
   }
   else
      printf( "indexed ok\n" );

   ompi_ddt_create_hvector( 2, 1, 14, dt1, &dt4 );
   ompi_ddt_commit( &dt4 );

   ompi_ddt_type_lb( dt4, &lb );       ompi_ddt_type_ub( dt4, &ub );
   ompi_ddt_type_extent( dt4, &ex );   ompi_ddt_type_size( dt4, &sz );

   if (lb != -3 || ub != 20 || ex != 23) {
      printf("hvector lb %d ub %d extent %d size %d\n", (int)-3, (int)20, (int)23, (int)sz);
      printf("hvector lb %d ub %d extent %d size %d\n", (int)lb, (int)ub, (int)ex, (int)sz);
      err++;
   }
   else
      printf( "hvector ok\n" );

   ompi_ddt_create_vector( 2, 1, 14, dt1, &dt5 );
   ompi_ddt_commit( &dt5 );

   ompi_ddt_type_lb( dt5, &lb );       ompi_ddt_type_ub( dt5, &ub );
   ompi_ddt_type_extent( dt5, &ex );   ompi_ddt_type_size( dt5, &sz );

   if (lb != -3 || ub != 132 || ex != 135) {
      printf("vector lb %d ub %d extent %d size %d\n", (int)-3, (int)132, (int)135, (int)sz);
      printf("vector lb %d ub %d extent %d size %d\n", (int)lb, (int)ub, (int)ex, (int)sz);
      err++;
   }
   else
      printf( "vector ok\n" );

   OBJ_RELEASE( dt1 ); /*assert( dt1 == NULL );*/
   OBJ_RELEASE( dt2 ); /*assert( dt2 == NULL );*/
   OBJ_RELEASE( dt3 ); /*assert( dt3 == NULL );*/
   OBJ_RELEASE( dt4 ); /*assert( dt4 == NULL );*/
   OBJ_RELEASE( dt5 ); assert( dt5 == NULL );
   return err;
}
Esempio n. 15
0
static int mpich_typeub2( void )
{
   int blocklen[3], err = 0;
   size_t sz1, sz2, sz3;
   MPI_Aint disp[3], lb, ub, ex1, ex2, ex3;
   ompi_datatype_t *types[3], *dt1, *dt2, *dt3;

   blocklen[0] = 1;
   blocklen[1] = 1;
   blocklen[2] = 1;
   disp[0] = -3;
   disp[1] = 0;
   disp[2] = 6;
   types[0] = &ompi_mpi_lb;  /* ompi_ddt_basicDatatypes[DT_LB]; */
   types[1] = &ompi_mpi_int;  /* ompi_ddt_basicDatatypes[DT_INT]; */
   types[2] = &ompi_mpi_ub;  /* ompi_ddt_basicDatatypes[DT_UB]; */

   ompi_ddt_create_struct(3,blocklen,disp, types,&dt1);
   ompi_ddt_commit(&dt1);

   ompi_ddt_type_lb(dt1, &lb);          ompi_ddt_type_ub(dt1, &ub);
   ompi_ddt_type_extent(dt1,&ex1);      ompi_ddt_type_size(dt1,&sz1);

   /* Values should be lb = -3, ub = 6 extent 9; size depends on implementation */
   if (lb != -3 || ub != 6 || ex1 != 9) {
      printf("Example 3.26 type1 lb %d ub %d extent %d size %d\n", (int)lb, (int)ub, (int)ex1, (int)sz1);
      err++;
   }
   else
      printf("Example 3.26 type1 correct\n" );

   ompi_ddt_create_contiguous(2,dt1,&dt2);
   ompi_ddt_type_lb(dt2, &lb);          ompi_ddt_type_ub(dt2, &ub);
   ompi_ddt_type_extent(dt2,&ex2);      ompi_ddt_type_size(dt2,&sz2);
   /* Values should be lb = -3, ub = 15, extent = 18, size depends on implementation */
   if (lb != -3 || ub != 15 || ex2 != 18) {
      printf("Example 3.26 type2 lb %d ub %d extent %d size %d\n", (int)-3, (int)15, (int)18, 8);
      printf("Example 3.26 type2 lb %d ub %d extent %d size %d\n", (int)lb, (int)ub, (int)ex2, (int)sz2);
      err++;
   }
   else
      printf("Example 3.26 type1 correct\n" );
   OBJ_RELEASE( dt2 ); assert( dt2 == NULL );
   ompi_ddt_create_contiguous(2,dt1,&dt2);
   ompi_ddt_type_lb(dt2, &lb);          ompi_ddt_type_ub(dt2, &ub);
   ompi_ddt_type_extent(dt2,&ex2);      ompi_ddt_type_size(dt2,&sz2);
   /* Values should be lb = -3, ub = 15, extent = 18, size depends on implementation */
   if (lb != -3 || ub != 15 || ex2 != 18) {
      printf("Example 3.26 type2 lb %d ub %d extent %d size %d\n", (int)-3, (int)15, (int)18, 8);
      printf("Example 3.26 type2 lb %d ub %d extent %d size %d\n", (int)lb, (int)ub, (int)ex2, (int)sz2);
      err++;
   }
   else
      printf( "Example 3.26 type2 correct\n" );

   types[0]=dt1;               types[1]=dt1;
   blocklen[0]=1;              blocklen[1]=1;
   disp[0]=0;                  disp[1]=ex1;

   ompi_ddt_create_struct(2, blocklen, disp, types, &dt3);
   ompi_ddt_commit(&dt3);

   ompi_ddt_type_lb(dt3, &lb);          ompi_ddt_type_ub(dt3, &ub);
   ompi_ddt_type_extent(dt3,&ex3);      ompi_ddt_type_size(dt3,&sz3);
   /* Another way to express type2 */
   if (lb != -3 || ub != 15 || ex3 != 18) {
      printf("type3 lb %d ub %d extent %d size %d\n", (int)-3, (int)15, (int)18, 8);
      printf("type3 lb %d ub %d extent %d size %d\n", (int)lb, (int)ub, (int)ex3, (int)sz2);
      err++;
   }
   else
      printf( "type3 correct\n" );

   OBJ_RELEASE( dt1 ); /*assert( dt1 == NULL );*/
   OBJ_RELEASE( dt2 ); /*assert( dt2 == NULL );*/
   OBJ_RELEASE( dt3 ); assert( dt3 == NULL );
   return err;
}