Code Example #1
int opal_hash_table_init(opal_hash_table_t* ht, size_t table_size)
{
    size_t i;
    /* Use a power-of-two number of buckets so a hash value can be mapped
       to a bucket with a bitwise AND against ht_mask. */
    size_t power2 = opal_next_poweroftwo (table_size);

    ht->ht_mask = power2-1;
    ht->ht_table = (opal_list_t *)malloc(power2 * sizeof(opal_list_t));
    if(NULL == ht->ht_table) {
        return OPAL_ERR_OUT_OF_RESOURCE;
    }
    /* Construct an empty opal_list_t in each bucket beyond the previous
       table size (0 for a newly constructed table, so all power2 buckets
       get initialized). */
    for(i=ht->ht_table_size; i<power2; i++) {
        opal_list_t* list = ht->ht_table+i;
        OBJ_CONSTRUCT(list, opal_list_t);
    }
    ht->ht_table_size = power2;
    return OPAL_SUCCESS;
}
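
The table uses a power-of-two number of buckets so that a hash value can be mapped to a bucket with a single bitwise AND against ht_mask instead of a modulo. A minimal standalone sketch of that idea; next_pow2_above() is a hypothetical helper that mimics what the listing appears to rely on from opal_next_poweroftwo(), i.e. the next power of two above its argument:

#include <stddef.h>
#include <stdint.h>

/* Hypothetical helper: next power of two strictly above n (5 -> 8, 8 -> 16). */
static size_t next_pow2_above(size_t n)
{
    size_t p = 1;
    while (p <= n) {
        p <<= 1;
    }
    return p;
}

/* With a power-of-two bucket count, masking gives the same result as
 * hash % num_buckets, but without a division. */
static size_t bucket_index(uint64_t hash, size_t num_buckets)
{
    return (size_t)(hash & (num_buckets - 1));
}

For table_size = 100, next_pow2_above() yields 128, the mask is 127, and every hash lands in a bucket in the range [0, 127].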
Code Example #2
/*
 *	reduce_scatter
 *
 *	Function:	- reduce then scatter
 *	Accepts:	- same as MPI_Reduce_scatter()
 *	Returns:	- MPI_SUCCESS or error code
 *
 * Algorithm:
 *   Commutative, reasonably sized messages
 *     recursive halving algorithm
 *   Others:
 *     reduce and scatterv (needs to be cleaned 
 *     up at some point)
 *
 * NOTE: the recursive halving algorithm should be faster than the
 * reduce/scatterv for all message sizes.  However, the memory usage of
 * the recursive halving is msg_size + 2 * comm_size greater, so I've
 * limited where the recursive halving is used to be nice to the app
 * memory-wise.  There are much better algorithms for large messages
 * with commutative operations, so this should be investigated further.
 *
 * NOTE: We default to a simple reduce/scatterv if one of the rcounts
 * is zero.  This is because the existing algorithms do not currently
 * support a count of zero in the array.
 */
int
mca_coll_basic_reduce_scatter_intra(void *sbuf, void *rbuf, int *rcounts,
                                    struct ompi_datatype_t *dtype,
                                    struct ompi_op_t *op,
                                    struct ompi_communicator_t *comm,
                                    mca_coll_base_module_t *module)
{
    int i, rank, size, count, err = OMPI_SUCCESS;
    ptrdiff_t true_lb, true_extent, lb, extent, buf_size;
    int *disps = NULL;
    char *recv_buf = NULL, *recv_buf_free = NULL;
    char *result_buf = NULL, *result_buf_free = NULL;
    bool zerocounts = false;

    /* Initialize */
    rank = ompi_comm_rank(comm);
    size = ompi_comm_size(comm);

    /* Find displacements and the like */
    disps = (int*) malloc(sizeof(int) * size);
    if (NULL == disps) return OMPI_ERR_OUT_OF_RESOURCE;

    disps[0] = 0;
    for (i = 0; i < (size - 1); ++i) {
        disps[i + 1] = disps[i] + rcounts[i];
        if (0 == rcounts[i]) {
            zerocounts = true;
        }
    }
    count = disps[size - 1] + rcounts[size - 1];
    if (0 == rcounts[size - 1]) {
        zerocounts = true;
    }

    /* short cut the trivial case */
    if (0 == count) {
        free(disps);
        return OMPI_SUCCESS;
    }

    /* get datatype information */
    ompi_datatype_get_extent(dtype, &lb, &extent);
    ompi_datatype_get_true_extent(dtype, &true_lb, &true_extent);
    buf_size = true_extent + (count - 1) * extent;

    /* Handle MPI_IN_PLACE */
    if (MPI_IN_PLACE == sbuf) {
        sbuf = rbuf;
    }

    if ((op->o_flags & OMPI_OP_FLAGS_COMMUTE) &&
        (buf_size < COMMUTATIVE_LONG_MSG) && (!zerocounts)) {
        int tmp_size, remain = 0, tmp_rank;

        /* temporary receive buffer.  See coll_basic_reduce.c for details on sizing */
        recv_buf_free = (char*) malloc(buf_size);
        if (NULL == recv_buf_free) {
            err = OMPI_ERR_OUT_OF_RESOURCE;
            goto cleanup;
        }
        recv_buf = recv_buf_free - lb;

        /* allocate temporary buffer for results */
        result_buf_free = (char*) malloc(buf_size);
        if (NULL == result_buf_free) {
            err = OMPI_ERR_OUT_OF_RESOURCE;
            goto cleanup;
        }
        result_buf = result_buf_free - lb;

        /* copy local buffer into the temporary results */
        err = ompi_datatype_sndrcv(sbuf, count, dtype, result_buf, count, dtype);
        if (OMPI_SUCCESS != err) goto cleanup;

        /* figure out power of two mapping: grow until larger than
           comm size, then go back one, to get the largest power of
           two less than or equal to comm size */
        tmp_size = opal_next_poweroftwo(size);
        tmp_size >>= 1;
        remain = size - tmp_size;

        /* If comm size is not a power of two, have the first "remain"
           procs with an even rank send to rank + 1, leaving a power of
           two procs to do the rest of the algorithm */
        if (rank < 2 * remain) {
            if ((rank & 1) == 0) {
                err = MCA_PML_CALL(send(result_buf, count, dtype, rank + 1, 
                                        MCA_COLL_BASE_TAG_REDUCE_SCATTER,
                                        MCA_PML_BASE_SEND_STANDARD,
                                        comm));
                if (OMPI_SUCCESS != err) goto cleanup;

                /* we don't participate from here on out */
                tmp_rank = -1;
            } else {
                err = MCA_PML_CALL(recv(recv_buf, count, dtype, rank - 1,
                                        MCA_COLL_BASE_TAG_REDUCE_SCATTER,
                                        comm, MPI_STATUS_IGNORE));
                if (OMPI_SUCCESS != err) goto cleanup;

                /* integrate their results into our temp results */
                ompi_op_reduce(op, recv_buf, result_buf, count, dtype);

                /* adjust rank to be the bottom "remain" ranks */
                tmp_rank = rank / 2;
            }
        } else {
            /* just need to adjust rank to show that the bottom "even
               remain" ranks dropped out */
            tmp_rank = rank - remain;
        }

        /* For ranks not kicked out by the above code, perform the
           recursive halving */
        if (tmp_rank >= 0) {
            int *tmp_disps = NULL, *tmp_rcounts = NULL;
            int mask, send_index, recv_index, last_index;

            /* recalculate disps and rcounts to account for the
               special "remainder" processes that are no longer doing
               anything */
            tmp_rcounts = (int*) malloc(tmp_size * sizeof(int));
            if (NULL == tmp_rcounts) {
                err = OMPI_ERR_OUT_OF_RESOURCE;
                goto cleanup;
            }
            tmp_disps = (int*) malloc(tmp_size * sizeof(int));
            if (NULL == tmp_disps) {
                free(tmp_rcounts);
                err = OMPI_ERR_OUT_OF_RESOURCE;
                goto cleanup;
            }

            for (i = 0 ; i < tmp_size ; ++i) {
                if (i < remain) {
                    /* need to include old neighbor as well */
                    tmp_rcounts[i] = rcounts[i * 2 + 1] + rcounts[i * 2];
                } else {
                    tmp_rcounts[i] = rcounts[i + remain];
                }
            }

            tmp_disps[0] = 0;
            for (i = 0; i < tmp_size - 1; ++i) {
                tmp_disps[i + 1] = tmp_disps[i] + tmp_rcounts[i];
            }

            /* do the recursive halving communication.  Don't use the
               dimension information on the communicator because I
               think the information is invalidated by our "shrinking"
               of the communicator */
            mask = tmp_size >> 1;
            send_index = recv_index = 0;
            last_index = tmp_size;
            while (mask > 0) {
                int tmp_peer, peer, send_count, recv_count;
                struct ompi_request_t *request;

                tmp_peer = tmp_rank ^ mask;
                peer = (tmp_peer < remain) ? tmp_peer * 2 + 1 : tmp_peer + remain;

                /* figure out if we're sending, receiving, or both */
                send_count = recv_count = 0;
                if (tmp_rank < tmp_peer) {
                    send_index = recv_index + mask;
                    for (i = send_index ; i < last_index ; ++i) {
                        send_count += tmp_rcounts[i];
                    }
                    for (i = recv_index ; i < send_index ; ++i) {
                        recv_count += tmp_rcounts[i];
                    }
                } else {
                    recv_index = send_index + mask;
                    for (i = send_index ; i < recv_index ; ++i) {
                        send_count += tmp_rcounts[i];
                    }
                    for (i = recv_index ; i < last_index ; ++i) {
                        recv_count += tmp_rcounts[i];
                    }
                }

                /* actual data transfer.  Send from result_buf,
                   receive into recv_buf */
                if (recv_count > 0) {
                    err = MCA_PML_CALL(irecv(recv_buf + tmp_disps[recv_index] * extent,
                                             recv_count, dtype, peer,
                                             MCA_COLL_BASE_TAG_REDUCE_SCATTER,
                                             comm, &request));
                    if (OMPI_SUCCESS != err) {
                        free(tmp_rcounts);
                        free(tmp_disps);
                        goto cleanup;
                    }                                             
                }
                if (send_count > 0) {
                    err = MCA_PML_CALL(send(result_buf + tmp_disps[send_index] * extent,
                                            send_count, dtype, peer, 
                                            MCA_COLL_BASE_TAG_REDUCE_SCATTER,
                                            MCA_PML_BASE_SEND_STANDARD,
                                            comm));
                    if (OMPI_SUCCESS != err) {
                        free(tmp_rcounts);
                        free(tmp_disps);
                        goto cleanup;
                    }                                             
                }
                if (recv_count > 0) {
                    err = ompi_request_wait(&request, MPI_STATUS_IGNORE);
                    if (OMPI_SUCCESS != err) {
                        free(tmp_rcounts);
                        free(tmp_disps);
                        goto cleanup;
                    }                                             
                }

                /* if we received something on this step, push it into
                   the results buffer */
                if (recv_count > 0) {
                    ompi_op_reduce(op, 
                                   recv_buf + tmp_disps[recv_index] * extent, 
                                   result_buf + tmp_disps[recv_index] * extent,
                                   recv_count, dtype);
                }

                /* update for next iteration */
                send_index = recv_index;
                last_index = recv_index + mask;
                mask >>= 1;
            }

            /* copy local results from results buffer into real receive buffer */
            if (0 != rcounts[rank]) {
                err = ompi_datatype_sndrcv(result_buf + disps[rank] * extent,
                                      rcounts[rank], dtype, 
                                      rbuf, rcounts[rank], dtype);
                if (OMPI_SUCCESS != err) {
                    free(tmp_rcounts);
                    free(tmp_disps);
                    goto cleanup;
                }                                             
            }

            free(tmp_rcounts);
            free(tmp_disps);
        }
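
The remainder-folding step at the top of the recursive halving is easier to see in isolation. The following standalone sketch (not part of Open MPI) reproduces the tmp_size / remain / tmp_rank arithmetic from the listing for a 7-process communicator:

#include <stdio.h>

/* Fold a communicator of `size` ranks down to the largest power of two
 * <= size: the first `remain` even ranks hand their data to the next odd
 * rank and drop out (-1); the surviving ranks are renumbered 0..tmp_size-1. */
static int fold_rank(int rank, int size, int *tmp_size_out)
{
    int tmp_size = 1;
    while (tmp_size * 2 <= size) {
        tmp_size *= 2;
    }
    *tmp_size_out = tmp_size;

    int remain = size - tmp_size;
    if (rank < 2 * remain) {
        return (rank & 1) ? rank / 2 : -1;
    }
    return rank - remain;
}

int main(void)
{
    int tmp_size, size = 7;
    for (int rank = 0; rank < size; ++rank) {
        printf("rank %d -> tmp_rank %d\n", rank, fold_rank(rank, size, &tmp_size));
    }
    /* prints 0->-1, 1->0, 2->-1, 3->1, 4->-1, 5->2, 6->3 with tmp_size == 4 */
    return 0;
}

The listing's peer calculation, (tmp_peer < remain) ? tmp_peer * 2 + 1 : tmp_peer + remain, is just the inverse of this mapping, translating a surviving tmp rank back to its original rank.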
Code Example #3
File: coll_base_allreduce.c  Project: bureddy/ompi
/*
 *   ompi_coll_base_allreduce_intra_recursivedoubling
 *
 *   Function:       Recursive doubling algorithm for allreduce operation
 *   Accepts:        Same as MPI_Allreduce()
 *   Returns:        MPI_SUCCESS or error code
 *
 *   Description:    Implements recursive doubling algorithm for allreduce.
 *                   Original (non-segmented) implementation is used in MPICH-2
 *                   for small and intermediate size messages.
 *                   The algorithm preserves the order of operations, so it
 *                   can be used with both commutative and non-commutative operations.
 *
 *         Example on 7 nodes:
 *         Initial state
 *         #      0       1      2       3      4       5      6
 *               [0]     [1]    [2]     [3]    [4]     [5]    [6]
 *         Initial adjustment step for non-power of two nodes.
 *         old rank      1              3              5      6
 *         new rank      0              1              2      3
 *                     [0+1]          [2+3]          [4+5]   [6]
 *         Step 1
 *         old rank      1              3              5      6
 *         new rank      0              1              2      3
 *                     [0+1+]         [0+1+]         [4+5+]  [4+5+]
 *                     [2+3+]         [2+3+]         [6   ]  [6   ]
 *         Step 2
 *         old rank      1              3              5      6
 *         new rank      0              1              2      3
 *                     [0+1+]         [0+1+]         [0+1+]  [0+1+]
 *                     [2+3+]         [2+3+]         [2+3+]  [2+3+]
 *                     [4+5+]         [4+5+]         [4+5+]  [4+5+]
 *                     [6   ]         [6   ]         [6   ]  [6   ]
 *         Final adjustment step for non-power of two nodes
 *         #      0       1      2       3      4       5      6
 *              [0+1+] [0+1+] [0+1+]  [0+1+] [0+1+]  [0+1+] [0+1+]
 *              [2+3+] [2+3+] [2+3+]  [2+3+] [2+3+]  [2+3+] [2+3+]
 *              [4+5+] [4+5+] [4+5+]  [4+5+] [4+5+]  [4+5+] [4+5+]
 *              [6   ] [6   ] [6   ]  [6   ] [6   ]  [6   ] [6   ]
 *
 */
int
ompi_coll_base_allreduce_intra_recursivedoubling(const void *sbuf, void *rbuf,
                                                  int count,
                                                  struct ompi_datatype_t *dtype,
                                                  struct ompi_op_t *op,
                                                  struct ompi_communicator_t *comm,
                                                  mca_coll_base_module_t *module)
{
    int ret, line, rank, size, adjsize, remote, distance;
    int newrank, newremote, extra_ranks;
    char *tmpsend = NULL, *tmprecv = NULL, *tmpswap = NULL, *inplacebuf = NULL;
    ptrdiff_t true_lb, true_extent, lb, extent;
    ompi_request_t *reqs[2] = {NULL, NULL};

    size = ompi_comm_size(comm);
    rank = ompi_comm_rank(comm);

    OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
                 "coll:base:allreduce_intra_recursivedoubling rank %d", rank));

    /* Special case for size == 1 */
    if (1 == size) {
        if (MPI_IN_PLACE != sbuf) {
            ret = ompi_datatype_copy_content_same_ddt(dtype, count, (char*)rbuf, (char*)sbuf);
            if (ret < 0) { line = __LINE__; goto error_hndl; }
        }
        return MPI_SUCCESS;
    }

    /* Allocate and initialize temporary send buffer */
    ret = ompi_datatype_get_extent(dtype, &lb, &extent);
    if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; }
    ret = ompi_datatype_get_true_extent(dtype, &true_lb, &true_extent);
    if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; }

    inplacebuf = (char*) malloc(true_extent + (ptrdiff_t)(count - 1) * extent);
    if (NULL == inplacebuf) { ret = -1; line = __LINE__; goto error_hndl; }

    if (MPI_IN_PLACE == sbuf) {
        ret = ompi_datatype_copy_content_same_ddt(dtype, count, inplacebuf, (char*)rbuf);
        if (ret < 0) { line = __LINE__; goto error_hndl; }
    } else {
        ret = ompi_datatype_copy_content_same_ddt(dtype, count, inplacebuf, (char*)sbuf);
        if (ret < 0) { line = __LINE__; goto error_hndl; }
    }

    tmpsend = (char*) inplacebuf;
    tmprecv = (char*) rbuf;

    /* Determine nearest power of two less than or equal to size */
    adjsize = opal_next_poweroftwo (size);
    adjsize >>= 1;

    /* Handle non-power-of-two case:
       - Even ranks less than 2 * extra_ranks send their data to (rank + 1)
       and set their new rank to -1.
       - Odd ranks less than 2 * extra_ranks receive data from (rank - 1),
       apply the operation, and set their new rank to rank/2.
       - Everyone else sets their new rank to rank - extra_ranks.
    */
    extra_ranks = size - adjsize;
    if (rank <  (2 * extra_ranks)) {
        if (0 == (rank % 2)) {
            ret = MCA_PML_CALL(send(tmpsend, count, dtype, (rank + 1),
                                    MCA_COLL_BASE_TAG_ALLREDUCE,
                                    MCA_PML_BASE_SEND_STANDARD, comm));
            if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; }
            newrank = -1;
        } else {
            ret = MCA_PML_CALL(recv(tmprecv, count, dtype, (rank - 1),
                                    MCA_COLL_BASE_TAG_ALLREDUCE, comm,
                                    MPI_STATUS_IGNORE));
            if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; }
            /* tmpsend = tmprecv (op) tmpsend */
            ompi_op_reduce(op, tmprecv, tmpsend, count, dtype);
            newrank = rank >> 1;
        }
    } else {
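
Once the group has been folded down to a power-of-two size adjsize, recursive doubling pairs ranks by XOR-ing the (new) rank with a doubling distance. Below is a standalone sketch of that peer schedule, not the remainder of the Open MPI function above:

#include <stdio.h>

/* Print the XOR peer schedule used by recursive doubling for a
 * power-of-two group: at step k, rank r exchanges with r ^ (1 << k),
 * so after log2(adjsize) steps every rank holds the full reduction. */
int main(void)
{
    int adjsize = 4;   /* e.g. 7 ranks folded down to 4, as in the comment above */

    for (int rank = 0; rank < adjsize; ++rank) {
        printf("rank %d exchanges with:", rank);
        for (int distance = 1; distance < adjsize; distance <<= 1) {
            printf(" %d", rank ^ distance);
        }
        printf("\n");
    }
    return 0;
}

Output: rank 0 pairs with 1 then 2, rank 1 with 0 then 3, rank 2 with 3 then 0, and rank 3 with 2 then 1, which matches the two steps shown in the 7-node example above.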
Code Example #4
int ompi_coll_tuned_barrier_intra_recursivedoubling(struct ompi_communicator_t *comm,
                                                    mca_coll_base_module_t *module)
{
    int rank, size, adjsize, err, line, mask, remote;

    rank = ompi_comm_rank(comm);
    size = ompi_comm_size(comm);
    OPAL_OUTPUT((ompi_coll_tuned_stream,
                 "ompi_coll_tuned_barrier_intra_recursivedoubling rank %d", 
                 rank));

    /* compute the nearest power of 2 less than or equal to size */
    adjsize = opal_next_poweroftwo(size);
    adjsize >>= 1;

    /* if size is not exact power of two, perform an extra step */
    if (adjsize != size) {
        if (rank >= adjsize) {
            /* notify lower ranked node and wait to be released */
            remote = rank - adjsize;
            err = ompi_coll_tuned_sendrecv_actual(NULL, 0, MPI_BYTE, remote,
                                                  MCA_COLL_BASE_TAG_BARRIER,
                                                  NULL, 0, MPI_BYTE, remote,
                                                  MCA_COLL_BASE_TAG_BARRIER,
                                                  comm, MPI_STATUS_IGNORE);
            if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl;}

        } else if (rank < (size - adjsize)) {

            /* receive message from higher ranked node */
            err = MCA_PML_CALL(recv((void*)NULL, 0, MPI_BYTE, rank+adjsize,
                                    MCA_COLL_BASE_TAG_BARRIER, comm, 
                                    MPI_STATUS_IGNORE));

            if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl;}
        }
    }

    /* exchange messages */
    if ( rank < adjsize ) {
        mask = 0x1;
        while ( mask < adjsize ) {
            remote = rank ^ mask;
            mask <<= 1;
            if (remote >= adjsize) continue;

            /* zero-byte exchange with the remote node */
            err = ompi_coll_tuned_sendrecv_actual(NULL, 0, MPI_BYTE, remote,
                                                  MCA_COLL_BASE_TAG_BARRIER,
                                                  NULL, 0, MPI_BYTE, remote,
                                                  MCA_COLL_BASE_TAG_BARRIER,
                                                  comm, MPI_STATUS_IGNORE);
            if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl;}
        }
    }

    /* non-power of 2 case */
    if (adjsize != size) {
        if (rank < (size - adjsize)) {
            /* send release message to higher ranked node */
            remote = rank + adjsize;
            err = MCA_PML_CALL(send((void*)NULL, 0, MPI_BYTE, remote, 
                                    MCA_COLL_BASE_TAG_BARRIER, 
                                    MCA_PML_BASE_SEND_SYNCHRONOUS, comm));

            if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl;}
        }
    }

    return MPI_SUCCESS;

 err_hndl:
    OPAL_OUTPUT((ompi_coll_tuned_stream,"%s:%4d\tError occurred %d, rank %2d",
                 __FILE__, line, err, rank));
    return err;
}
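
The same XOR pairing drives the barrier: each round exchanges an empty message with rank ^ mask, and the arrival of the message is itself the synchronization. Here is a standalone sketch of that structure using plain MPI calls rather than Open MPI's internal sendrecv/PML helpers; it assumes a power-of-two communicator size, whereas the listing above additionally folds the extra ranks in at the start and releases them at the end:

#include <mpi.h>

/* Recursive-doubling barrier for a power-of-two number of ranks: in round k,
 * rank r exchanges a zero-byte message with r ^ (1 << k).  No rank can finish
 * its last round until every rank has entered the barrier. */
static void recursive_doubling_barrier_pow2(MPI_Comm comm)
{
    int rank, size;
    MPI_Comm_rank(comm, &rank);
    MPI_Comm_size(comm, &size);

    for (int mask = 1; mask < size; mask <<= 1) {
        int remote = rank ^ mask;
        MPI_Sendrecv(NULL, 0, MPI_BYTE, remote, 0,
                     NULL, 0, MPI_BYTE, remote, 0,
                     comm, MPI_STATUS_IGNORE);
    }
}

int main(int argc, char **argv)
{
    MPI_Init(&argc, &argv);
    recursive_doubling_barrier_pow2(MPI_COMM_WORLD);   /* power-of-two sizes only */
    MPI_Finalize();
    return 0;
}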