Example #1
0
int ompi_osc_ucx_get(void *origin_addr, int origin_count,
                     struct ompi_datatype_t *origin_dt,
                     int target, ptrdiff_t target_disp, int target_count,
                     struct ompi_datatype_t *target_dt, struct ompi_win_t *win) {
    ompi_osc_ucx_module_t *module = (ompi_osc_ucx_module_t*) win->w_osc_module;
    ucp_ep_h ep = OSC_UCX_GET_EP(module->comm, target);
    uint64_t remote_addr = (module->win_info_array[target]).addr + target_disp * OSC_UCX_GET_DISP(module, target);
    ucp_rkey_h rkey;
    ptrdiff_t origin_lb, origin_extent, target_lb, target_extent;
    bool is_origin_contig = false, is_target_contig = false;
    ucs_status_t status;
    int ret = OMPI_SUCCESS;

    ret = check_sync_state(module, target, false);
    if (ret != OMPI_SUCCESS) {
        return ret;
    }

    if (module->flavor == MPI_WIN_FLAVOR_DYNAMIC) {
        status = get_dynamic_win_info(remote_addr, module, ep, target);
        if (status != UCS_OK) {
            return OMPI_ERROR;
        }
    }

    rkey = (module->win_info_array[target]).rkey;

    ompi_datatype_get_true_extent(origin_dt, &origin_lb, &origin_extent);
    ompi_datatype_get_true_extent(target_dt, &target_lb, &target_extent);

    is_origin_contig = ompi_datatype_is_contiguous_memory_layout(origin_dt, origin_count);
    is_target_contig = ompi_datatype_is_contiguous_memory_layout(target_dt, target_count);

    if (is_origin_contig && is_target_contig) {
        /* fast path */
        size_t origin_len;

        ompi_datatype_type_size(origin_dt, &origin_len);
        origin_len *= origin_count;

        status = ucp_get_nbi(ep, (void *)((intptr_t)origin_addr + origin_lb), origin_len,
                             remote_addr + target_lb, rkey);
        if (status != UCS_OK && status != UCS_INPROGRESS) {
            opal_output_verbose(1, ompi_osc_base_framework.framework_output,
                                "%s:%d: ucp_get_nbi failed: %d\n",
                                __FILE__, __LINE__, status);
            return OMPI_ERROR;
        }

        return incr_and_check_ops_num(module, target, ep);
    } else {
        return ddt_put_get(module, origin_addr, origin_count, origin_dt, is_origin_contig,
                           origin_lb, target, ep, remote_addr, rkey, target_count, target_dt,
                           is_target_contig, target_lb, true);
    }
}
Example #2
0
/*
 *	reduce_inter
 *
 *	Function:	- reduction using the local_comm
 *	Accepts:	- same as MPI_Reduce()
 *	Returns:	- MPI_SUCCESS or error code
 */
int
mca_coll_inter_reduce_inter(void *sbuf, void *rbuf, int count,
                            struct ompi_datatype_t *dtype,
                            struct ompi_op_t *op,
                            int root, struct ompi_communicator_t *comm,
                            mca_coll_base_module_t *module)
{
    int rank, err;
    ptrdiff_t true_lb, true_extent, lb, extent;
    char *free_buffer = NULL;
    char *pml_buffer = NULL;

    /* Initialize */
    rank = ompi_comm_rank(comm);

    if (MPI_PROC_NULL == root) {
        /* do nothing */
        err = OMPI_SUCCESS;
    } else if (MPI_ROOT != root) {
        /* Perform the reduce locally with the first process as root */
        ompi_datatype_get_extent(dtype, &lb, &extent);
        ompi_datatype_get_true_extent(dtype, &true_lb, &true_extent);

        free_buffer = (char*)malloc(true_extent + (count - 1) * extent);
        if (NULL == free_buffer) {
            return OMPI_ERR_OUT_OF_RESOURCE;
        }
        pml_buffer = free_buffer - true_lb;

        err = comm->c_local_comm->c_coll.coll_reduce(sbuf, pml_buffer, count,
                dtype, op, 0, comm->c_local_comm,
                comm->c_local_comm->c_coll.coll_reduce_module);
        if (0 == rank) {
            /* First process sends the result to the root */
            err = MCA_PML_CALL(send(pml_buffer, count, dtype, root,
                                    MCA_COLL_BASE_TAG_REDUCE,
                                    MCA_PML_BASE_SEND_STANDARD, comm));
            if (OMPI_SUCCESS != err) {
                return err;
            }
        }

        if (NULL != free_buffer) {
            free(free_buffer);
        }
    } else {
        /* Root receives the reduced message from the first process  */
        err = MCA_PML_CALL(recv(rbuf, count, dtype, 0,
                                MCA_COLL_BASE_TAG_REDUCE, comm,
                                MPI_STATUS_IGNORE));
        if (OMPI_SUCCESS != err) {
            return err;
        }
    }
    /* All done */
    return err;
}
Example #3
0
/**
 * Computing the correct buffer length for moving a multiple of a datatype
 * is not an easy task. Define a function to centralize the complexity in a
 * single location.
 */
size_t compute_buffer_length(ompi_datatype_t* pdt, int count)
{
    MPI_Aint extent, lb, true_extent, true_lb;
    size_t length;

    ompi_datatype_get_extent(pdt, &lb, &extent);
    ompi_datatype_get_true_extent(pdt, &true_lb, &true_extent);
    length = true_lb + true_extent + (count - 1) * extent;

    return length;
}
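For reference, a minimal usage sketch of the helper above; the wrapper name alloc_staging_buffer is illustrative only and assumes <stdlib.h> plus the OMPI datatype headers are available:
/* Illustrative only (not from the original source): size and allocate a
 * staging buffer for 'count' elements of 'pdt' via compute_buffer_length();
 * the caller is responsible for free()ing the result. */
static char *alloc_staging_buffer(ompi_datatype_t *pdt, int count)
{
    return (char *) malloc(compute_buffer_length(pdt, count));
}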
Example #4
0
/*
 *	cuda_reduce
 *
 *	Function:	- reduce for GPU buffers, staged through host memory
 *	Accepts:	- same as MPI_Reduce()
 *	Returns:	- MPI_SUCCESS or error code
 */
int
mca_coll_cuda_reduce(const void *sbuf, void *rbuf, int count,
                     struct ompi_datatype_t *dtype,
                     struct ompi_op_t *op,
                     int root, struct ompi_communicator_t *comm,
                     mca_coll_base_module_t *module)
{
    mca_coll_cuda_module_t *s = (mca_coll_cuda_module_t*) module;
    ptrdiff_t true_lb, true_extent, lb, extent;
    char *rbuf1 = NULL, *sbuf1 = NULL, *rbuf2 = NULL;
    const char *sbuf2;
    size_t bufsize;
    int rc;

    ompi_datatype_get_extent(dtype, &lb, &extent);
    ompi_datatype_get_true_extent(dtype, &true_lb, &true_extent);
    bufsize = true_extent + (ptrdiff_t)(count - 1) * extent;
    if ((MPI_IN_PLACE != sbuf) && (opal_cuda_check_bufs((char *)sbuf, NULL))) {
        sbuf1 = (char*)malloc(bufsize);
        if (NULL == sbuf1) {
            return OMPI_ERR_OUT_OF_RESOURCE;
        }
        opal_cuda_memcpy_sync(sbuf1, sbuf, bufsize);
        sbuf2 = sbuf; /* save away original buffer */
        sbuf = sbuf1 - lb;
    }

    if (opal_cuda_check_bufs(rbuf, NULL)) {
        rbuf1 = (char*)malloc(bufsize);
        if (NULL == rbuf1) {
            if (NULL != sbuf1) free(sbuf1);
            return OMPI_ERR_OUT_OF_RESOURCE;
        }
        opal_cuda_memcpy_sync(rbuf1, rbuf, bufsize);
        rbuf2 = rbuf; /* save away original buffer */
        rbuf = rbuf1 - lb;
    }
    rc = s->c_coll.coll_reduce((void *) sbuf, rbuf, count,
                               dtype, op, root, comm,
                               s->c_coll.coll_reduce_module);

    if (NULL != sbuf1) {
        free(sbuf1);
    }
    if (NULL != rbuf1) {
        rbuf = rbuf2;
        opal_cuda_memcpy_sync(rbuf, rbuf1, bufsize);
        free(rbuf1);
    }
    return rc;
}
Example #5
0
void ADIOI_Datatype_iscontig(MPI_Datatype datatype, int *flag)
{
    /*
     * Open MPI's contiguous check returns true for datatypes with
     * gaps at the beginning and at the end. We have to provide
     * a count of 2 in order to get these gaps taken into account.
     * In addition, if the data is contiguous but true_lb differs
     * from zero, ROMIO will ignore the displacement. Thus, lie!
     */
    *flag = ompi_datatype_is_contiguous_memory_layout(datatype, 2);
    if (*flag) {
        MPI_Aint true_extent, true_lb;

        ompi_datatype_get_true_extent(datatype, &true_lb, &true_extent);

        if (true_lb > 0)
            *flag = 0;
    }
}
Example #6
0
/*
 *	scan
 *
 *	Function:	- basic scan operation
 *	Accepts:	- same arguments as MPI_Scan()
 *	Returns:	- MPI_SUCCESS or error code
 */
int
mca_coll_basic_scan_intra(void *sbuf, void *rbuf, int count,
                          struct ompi_datatype_t *dtype,
                          struct ompi_op_t *op,
                          struct ompi_communicator_t *comm,
                          mca_coll_base_module_t *module)
{
    int size, rank, err;
    ptrdiff_t true_lb, true_extent, lb, extent;
    char *free_buffer = NULL;
    char *pml_buffer = NULL;

    /* Initialize */

    rank = ompi_comm_rank(comm);
    size = ompi_comm_size(comm);

    /* If I'm rank 0, just copy into the receive buffer */

    if (0 == rank) {
        if (MPI_IN_PLACE != sbuf) {
            err = ompi_datatype_copy_content_same_ddt(dtype, count, (char*)rbuf, (char*)sbuf);
            if (MPI_SUCCESS != err) {
                return err;
            }
        }
    }

    /* Otherwise receive previous buffer and reduce. */

    else {
        /* Allocate a temporary buffer.  Rationale for this size is
         * listed in coll_basic_reduce.c.  Use this temporary buffer to
         * receive into, later. */

        ompi_datatype_get_extent(dtype, &lb, &extent);
        ompi_datatype_get_true_extent(dtype, &true_lb, &true_extent);

        free_buffer = (char*)malloc(true_extent + (count - 1) * extent);
        if (NULL == free_buffer) {
            return OMPI_ERR_OUT_OF_RESOURCE;
        }
        pml_buffer = free_buffer - true_lb;

        /* Copy the send buffer into the receive buffer. */

        if (MPI_IN_PLACE != sbuf) {
            err = ompi_datatype_copy_content_same_ddt(dtype, count, (char*)rbuf, (char*)sbuf);
            if (MPI_SUCCESS != err) {
                if (NULL != free_buffer) {
                    free(free_buffer);
                }
                return err;
            }
        }

        /* Receive the prior answer */

        err = MCA_PML_CALL(recv(pml_buffer, count, dtype,
                                rank - 1, MCA_COLL_BASE_TAG_SCAN, comm,
                                MPI_STATUS_IGNORE));
        if (MPI_SUCCESS != err) {
            if (NULL != free_buffer) {
                free(free_buffer);
            }
            return err;
        }

        /* Perform the operation */

        ompi_op_reduce(op, pml_buffer, rbuf, count, dtype);

        /* All done */

        if (NULL != free_buffer) {
            free(free_buffer);
        }
    }

    /* Send result to next process. */

    if (rank < (size - 1)) {
        return MCA_PML_CALL(send(rbuf, count, dtype, rank + 1,
                                 MCA_COLL_BASE_TAG_SCAN,
                                 MCA_PML_BASE_SEND_STANDARD, comm));
    }

    /* All done */

    return MPI_SUCCESS;
}
/*
 *	reduce_lin_intra
 *
 *	Function:	- reduction using O(N) algorithm
 *	Accepts:	- same as MPI_Reduce()
 *	Returns:	- MPI_SUCCESS or error code
 */
int
mca_coll_basic_reduce_lin_intra(void *sbuf, void *rbuf, int count,
                                struct ompi_datatype_t *dtype,
                                struct ompi_op_t *op,
                                int root, struct ompi_communicator_t *comm,
                                mca_coll_base_module_t *module)
{
    int i, rank, err, size;
    ptrdiff_t true_lb, true_extent, lb, extent;
    char *free_buffer = NULL;
    char *pml_buffer = NULL;
    char *inplace_temp = NULL;
    char *inbuf;

    /* Initialize */

    rank = ompi_comm_rank(comm);
    size = ompi_comm_size(comm);

    /* If not root, send data to the root. */

    if (rank != root) {
        err = MCA_PML_CALL(send(sbuf, count, dtype, root,
                                MCA_COLL_BASE_TAG_REDUCE,
                                MCA_PML_BASE_SEND_STANDARD, comm));
        return err;
    }

    /* Root receives and reduces messages.  Allocate buffer to receive
     * messages.  This comment applies to all collectives in this basic
     * module where we allocate a temporary buffer.  For the next few
     * lines of code, it's tremendously complicated how we decided that
     * this was the Right Thing to do.  Sit back and enjoy.  And prepare
     * to have your mind warped. :-)
     *
     * Recall some definitions (I always get these backwards, so I'm
     * going to put them here):
     *
     * extent: the length from the lower bound to the upper bound -- may
     * be considerably larger than the buffer required to hold the data
     * (or smaller!  But it's easiest to think about when it's larger).
     *
     * true extent: the exact number of bytes required to hold the data
     * in the layout pattern in the datatype.
     *
     * For example, consider the following buffer (just talking about
     * LB, extent, and true extent -- extrapolate for UB; i.e., assume
     * the UB equals exactly where the data ends):
     *
     * A              B                                       C
     * --------------------------------------------------------
     * |              |                                       |
     * --------------------------------------------------------
     *
     * There are multiple cases:
     *
     * 1. A is what we give to MPI_Send (and friends), and A is where
     * the data starts, and C is where the data ends.  In this case:
     *
     * - extent: C-A
     * - true extent: C-A
     * - LB: 0
     *
     * A                                                      C
     * --------------------------------------------------------
     * |                                                      |
     * --------------------------------------------------------
     * <=======================extent=========================>
     * <======================true extent=====================>
     *
     * 2. A is what we give to MPI_Send (and friends), B is where the
     * data starts, and C is where the data ends.  In this case:
     *
     * - extent: C-A
     * - true extent: C-B
     * - LB: positive
     *
     * A              B                                       C
     * --------------------------------------------------------
     * |              |           User buffer                 |
     * --------------------------------------------------------
     * <=======================extent=========================>
     * <===============true extent=============>
     *
     * 3. B is what we give to MPI_Send (and friends), A is where the
     * data starts, and C is where the data ends.  In this case:
     *
     * - extent: C-A
     * - true extent: C-A
     * - LB: negative
     *
     * A              B                                       C
     * --------------------------------------------------------
     * |              |           User buffer                 |
     * --------------------------------------------------------
     * <=======================extent=========================>
     * <======================true extent=====================>
     *
     * 4. MPI_BOTTOM is what we give to MPI_Send (and friends), B is
     * where the data starts, and C is where the data ends.  In this
     * case:
     *
     * - extent: C-MPI_BOTTOM
     * - true extent: C-B
     * - LB: [potentially very large] positive
     *
     * MPI_BOTTOM     B                                       C
     * --------------------------------------------------------
     * |              |           User buffer                 |
     * --------------------------------------------------------
     * <=======================extent=========================>
     * <===============true extent=============>
     *
     * So in all cases, for a temporary buffer, all we need to malloc()
     * is a buffer of size true_extent.  We therefore need to know two
     * pointer values: what value to give to MPI_Send (and friends) and
     * what value to give to free(), because they might not be the same.
     *
     * Clearly, what we give to free() is exactly what was returned from
     * malloc().  That part is easy.  :-)
     *
     * What we give to MPI_Send (and friends) is a bit more complicated.
     * Let's take the 4 cases from above:
     *
     * 1. If A is what we give to MPI_Send and A is where the data
     * starts, then clearly we give to MPI_Send what we got back from
     * malloc().
     *
     * 2. If B is what we get back from malloc, but we give A to
     * MPI_Send, then the buffer range [A,B) represents "dead space"
     * -- no data will be put there.  So it's safe to give B-LB to
     * MPI_Send.  More specifically, the LB is positive, so B-LB is
     * actually A.
     *
     * 3. If A is what we get back from malloc, and B is what we give to
     * MPI_Send, then the LB is negative, so A-LB will actually equal
     * B.
     *
     * 4. Although this seems like the weirdest case, it's actually
     * quite similar to case #2 -- the pointer we give to MPI_Send is
     * smaller than the pointer we got back from malloc().
     *
     * Hence, in all cases, we give (return_from_malloc - LB) to MPI_Send.
     *
     * This works fine and dandy if we only have (count==1), which we
     * rarely do.  ;-) So we really need to allocate (true_extent +
     * ((count - 1) * extent)) to get enough space for the rest.  This may
     * be more than is necessary, but it's ok.
     *
     * Simple, no?  :-)
     *
     */

    ompi_datatype_get_extent(dtype, &lb, &extent);
    ompi_datatype_get_true_extent(dtype, &true_lb, &true_extent);

    if (MPI_IN_PLACE == sbuf) {
        sbuf = rbuf;
        inplace_temp = (char*)malloc(true_extent + (count - 1) * extent);
        if (NULL == inplace_temp) {
            return OMPI_ERR_OUT_OF_RESOURCE;
        }
        rbuf = inplace_temp - lb;
    }

    if (size > 1) {
        free_buffer = (char*)malloc(true_extent + (count - 1) * extent);
        if (NULL == free_buffer) {
            if (NULL != inplace_temp) {
                free(inplace_temp);
            }
            return OMPI_ERR_OUT_OF_RESOURCE;
        }
        pml_buffer = free_buffer - lb;
    }

    /* Initialize the receive buffer. */

    if (rank == (size - 1)) {
        err = ompi_datatype_copy_content_same_ddt(dtype, count, (char*)rbuf, (char*)sbuf);
    } else {
        err = MCA_PML_CALL(recv(rbuf, count, dtype, size - 1,
                                MCA_COLL_BASE_TAG_REDUCE, comm,
                                MPI_STATUS_IGNORE));
    }
    if (MPI_SUCCESS != err) {
        if (NULL != free_buffer) {
            free(free_buffer);
        }
        return err;
    }

    /* Loop receiving and calling reduction function (C or Fortran). */

    for (i = size - 2; i >= 0; --i) {
        if (rank == i) {
            inbuf = (char*)sbuf;
        } else {
            err = MCA_PML_CALL(recv(pml_buffer, count, dtype, i,
                                    MCA_COLL_BASE_TAG_REDUCE, comm,
                                    MPI_STATUS_IGNORE));
            if (MPI_SUCCESS != err) {
                if (NULL != free_buffer) {
                    free(free_buffer);
                }
                return err;
            }

            inbuf = pml_buffer;
        }

        /* Perform the reduction */

        ompi_op_reduce(op, inbuf, rbuf, count, dtype);
    }

    if (NULL != inplace_temp) {
        err = ompi_datatype_copy_content_same_ddt(dtype, count, (char*)sbuf, inplace_temp);
        free(inplace_temp);
    }
    if (NULL != free_buffer) {
        free(free_buffer);
    }

    /* All done */

    return MPI_SUCCESS;
}
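The lengthy rationale above boils down to a single allocation idiom. Here is a minimal standalone sketch of it, assuming only the ompi_datatype extent queries used in this function plus <stdlib.h>; the helper name alloc_reduce_tmpbuf and its to_free out-parameter are illustrative, not part of Open MPI:
/* Sketch: allocate a temporary buffer able to hold 'count' elements of
 * 'dtype' and return the pointer to hand to MPI_Send and friends; the
 * caller frees '*to_free'.  In all four LB cases discussed above the
 * usable pointer is (return_from_malloc - lb). */
static char *alloc_reduce_tmpbuf(struct ompi_datatype_t *dtype, int count,
                                 char **to_free)
{
    ptrdiff_t lb, extent, true_lb, true_extent;

    ompi_datatype_get_extent(dtype, &lb, &extent);
    ompi_datatype_get_true_extent(dtype, &true_lb, &true_extent);
    (void) true_lb;  /* only the true extent matters for sizing */

    *to_free = (char *) malloc(true_extent + (ptrdiff_t)(count - 1) * extent);
    if (NULL == *to_free) {
        return NULL;
    }
    return *to_free - lb;
}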
/*
 *	reduce_lin_inter
 *
 *	Function:	- reduction using O(N) algorithm
 *	Accepts:	- same as MPI_Reduce()
 *	Returns:	- MPI_SUCCESS or error code
 */
int
mca_coll_basic_reduce_lin_inter(void *sbuf, void *rbuf, int count,
                                struct ompi_datatype_t *dtype,
                                struct ompi_op_t *op,
                                int root, struct ompi_communicator_t *comm,
                                mca_coll_base_module_t *module)
{
    int i, err, size;
    ptrdiff_t true_lb, true_extent, lb, extent;
    char *free_buffer = NULL;
    char *pml_buffer = NULL;

    /* Initialize */
    size = ompi_comm_remote_size(comm);

    if (MPI_PROC_NULL == root) {
        /* do nothing */
        err = OMPI_SUCCESS;
    } else if (MPI_ROOT != root) {
        /* If not root, send data to the root. */
        err = MCA_PML_CALL(send(sbuf, count, dtype, root,
                                MCA_COLL_BASE_TAG_REDUCE,
                                MCA_PML_BASE_SEND_STANDARD, comm));
    } else {
        /* Root receives and reduces messages  */
        ompi_datatype_get_extent(dtype, &lb, &extent);
        ompi_datatype_get_true_extent(dtype, &true_lb, &true_extent);

        free_buffer = (char*)malloc(true_extent + (count - 1) * extent);
        if (NULL == free_buffer) {
            return OMPI_ERR_OUT_OF_RESOURCE;
        }
        pml_buffer = free_buffer - lb;


        /* Initialize the receive buffer. */
        err = MCA_PML_CALL(recv(rbuf, count, dtype, 0,
                                MCA_COLL_BASE_TAG_REDUCE, comm,
                                MPI_STATUS_IGNORE));
        if (MPI_SUCCESS != err) {
            if (NULL != free_buffer) {
                free(free_buffer);
            }
            return err;
        }

        /* Loop receiving and calling reduction function (C or Fortran). */
        for (i = 1; i < size; i++) {
            err = MCA_PML_CALL(recv(pml_buffer, count, dtype, i,
                                    MCA_COLL_BASE_TAG_REDUCE, comm,
                                    MPI_STATUS_IGNORE));
            if (MPI_SUCCESS != err) {
                if (NULL != free_buffer) {
                    free(free_buffer);
                }
                return err;
            }

            /* Perform the reduction */
            ompi_op_reduce(op, pml_buffer, rbuf, count, dtype);
        }

        if (NULL != free_buffer) {
            free(free_buffer);
        }
    }

    /* All done */
    return err;
}
int ompi_coll_tuned_alltoall_intra_bruck(void *sbuf, int scount,
                                         struct ompi_datatype_t *sdtype,
                                         void* rbuf, int rcount,
                                         struct ompi_datatype_t *rdtype,
                                         struct ompi_communicator_t *comm,
                                         mca_coll_base_module_t *module)
{
    int i, k, line = -1, rank, size, err = 0, weallocated = 0;
    int sendto, recvfrom, distance, *displs = NULL, *blen = NULL;
    char *tmpbuf = NULL, *tmpbuf_free = NULL;
    ptrdiff_t rlb, slb, tlb, sext, rext, tsext;
    struct ompi_datatype_t *new_ddt;
#ifdef blahblah
    mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
    mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
#endif

    if (MPI_IN_PLACE == sbuf) {
        return mca_coll_tuned_alltoall_intra_basic_inplace (rbuf, rcount, rdtype,
                                                            comm, module);
    }

    size = ompi_comm_size(comm);
    rank = ompi_comm_rank(comm);

    OPAL_OUTPUT((ompi_coll_tuned_stream,
                 "coll:tuned:alltoall_intra_bruck rank %d", rank));

    err = ompi_datatype_get_extent (sdtype, &slb, &sext);
    if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }

    err = ompi_datatype_get_true_extent(sdtype, &tlb,  &tsext);
    if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }

    err = ompi_datatype_get_extent (rdtype, &rlb, &rext);
    if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }


#ifdef blahblah
    /* try and SAVE memory by using the data segment hung off 
       the communicator if possible */
    if (data->mcct_num_reqs >= size) { 
        /* we have enough preallocated for displacements and lengths */
        displs = (int*) data->mcct_reqs;
        blen = (int *) (displs + size);
        weallocated = 0;
    } 
    else { /* allocate the buffers ourself */
#endif
        displs = (int *) malloc(size * sizeof(int));
        if (displs == NULL) { line = __LINE__; err = -1; goto err_hndl; }
        blen = (int *) malloc(size * sizeof(int));
        if (blen == NULL) { line = __LINE__; err = -1; goto err_hndl; }
        weallocated = 1;
#ifdef blahblah
    }
#endif

    /* tmp buffer allocation for message data */
    tmpbuf_free = (char *) malloc(tsext + ((ptrdiff_t)scount * (ptrdiff_t)size - 1) * sext);
    if (tmpbuf_free == NULL) { line = __LINE__; err = -1; goto err_hndl; }
    tmpbuf = tmpbuf_free - slb;

    /* Step 1 - local rotation - shift up by rank */
    err = ompi_datatype_copy_content_same_ddt (sdtype, 
                                               (int32_t) ((ptrdiff_t)(size - rank) * (ptrdiff_t)scount),
                                               tmpbuf, 
                                               ((char*) sbuf) + (ptrdiff_t)rank * (ptrdiff_t)scount * sext);
    if (err<0) {
        line = __LINE__; err = -1; goto err_hndl;
    }

    if (rank != 0) {
        err = ompi_datatype_copy_content_same_ddt (sdtype, (ptrdiff_t)rank * (ptrdiff_t)scount,
                                                   tmpbuf + (ptrdiff_t)(size - rank) * (ptrdiff_t)scount* sext, 
                                                   (char*) sbuf);
        if (err<0) {
            line = __LINE__; err = -1; goto err_hndl;
        }
    }

    /* perform communication step */
    for (distance = 1; distance < size; distance<<=1) {

        sendto = (rank + distance) % size;
        recvfrom = (rank - distance + size) % size;
        k = 0;

        /* create indexed datatype */
        for (i = 1; i < size; i++) {
            if (( i & distance) == distance) {
                displs[k] = (ptrdiff_t)i * (ptrdiff_t)scount; 
                blen[k] = scount;
                k++;
            }
        }
        /* Set indexes and displacements */
        err = ompi_datatype_create_indexed(k, blen, displs, sdtype, &new_ddt);
        if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl;  }
        /* Commit the new datatype */
        err = ompi_datatype_commit(&new_ddt);
        if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl;  }

        /* Sendreceive */
        err = ompi_coll_tuned_sendrecv ( tmpbuf, 1, new_ddt, sendto,
                                         MCA_COLL_BASE_TAG_ALLTOALL,
                                         rbuf, 1, new_ddt, recvfrom,
                                         MCA_COLL_BASE_TAG_ALLTOALL,
                                         comm, MPI_STATUS_IGNORE, rank );
        if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }

        /* Copy back new data from recvbuf to tmpbuf */
        err = ompi_datatype_copy_content_same_ddt(new_ddt, 1,tmpbuf, (char *) rbuf);
        if (err < 0) { line = __LINE__; err = -1; goto err_hndl;  }

        /* free ddt */
        err = ompi_datatype_destroy(&new_ddt);
        if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl;  }
    } /* end of for (distance = 1... */

    /* Step 3 - local rotation - */
    for (i = 0; i < size; i++) {

        err = ompi_datatype_copy_content_same_ddt (rdtype, (int32_t) rcount,
                                                   ((char*)rbuf) + ((ptrdiff_t)((rank - i + size) % size) * (ptrdiff_t)rcount * rext), 
                                                   tmpbuf + (ptrdiff_t)i * (ptrdiff_t)rcount * rext);
        if (err < 0) { line = __LINE__; err = -1; goto err_hndl;  }
    }

    /* Step 4 - clean up */
    if (tmpbuf != NULL) free(tmpbuf_free);
    if (weallocated) {
        if (displs != NULL) free(displs);
        if (blen != NULL) free(blen);
    }
    return OMPI_SUCCESS;

 err_hndl:
    OPAL_OUTPUT((ompi_coll_tuned_stream,
                 "%s:%4d\tError occurred %d, rank %2d", __FILE__, line, err, 
                 rank));
    if (tmpbuf != NULL) free(tmpbuf_free);
    if (weallocated) {
        if (displs != NULL) free(displs);
        if (blen != NULL) free(blen);
    }
    return err;
}
/*
 *	reduce_log_intra
 *
 *	Function:	- reduction using O(log N) algorithm
 *	Accepts:	- same as MPI_Reduce()
 *	Returns:	- MPI_SUCCESS or error code
 *
 *
 *      Performing reduction on each dimension of the hypercube.
 *	An example for 8 procs (dimensions = 3):
 *
 *      Stage 1, reduce on X dimension,  1 -> 0, 3 -> 2, 5 -> 4, 7 -> 6
 *
 *          6----<---7		proc_0: 0+1
 *         /|       /|		proc_1: 1
 *        / |      / |		proc_2: 2+3
 *       /  |     /  |		proc_3: 3
 *      4----<---5   |		proc_4: 4+5
 *      |   2--< |---3		proc_5: 5
 *      |  /     |  /		proc_6: 6+7
 *      | /      | /		proc_7: 7
 *      |/       |/
 *      0----<---1
 *
 *	Stage 2, reduce on Y dimension, 2 -> 0, 6 -> 4
 *
 *          6--------7		proc_0: 0+1+2+3
 *         /|       /|		proc_1: 1
 *        v |      / |		proc_2: 2+3
 *       /  |     /  |		proc_3: 3
 *      4--------5   |		proc_4: 4+5+6+7
 *      |   2--- |---3		proc_5: 5
 *      |  /     |  /		proc_6: 6+7
 *      | v      | /		proc_7: 7
 *      |/       |/
 *      0--------1
 *
 *	Stage 3, reduce on Z dimension, 4 -> 0
 *
 *          6--------7		proc_0: 0+1+2+3+4+5+6+7
 *         /|       /|		proc_1: 1
 *        / |      / |		proc_2: 2+3
 *       /  |     /  |		proc_3: 3
 *      4--------5   |		proc_4: 4+5+6+7
 *      |   2--- |---3		proc_5: 5
 *      v  /     |  /		proc_6: 6+7
 *      | /      | /		proc_7: 7
 *      |/       |/
 *      0--------1
 *
 *
 */
int
mca_coll_basic_reduce_log_intra(void *sbuf, void *rbuf, int count,
                                struct ompi_datatype_t *dtype,
                                struct ompi_op_t *op,
                                int root, struct ompi_communicator_t *comm,
                                mca_coll_base_module_t *module)
{
    int i, size, rank, vrank;
    int err, peer, dim, mask;
    ptrdiff_t true_lb, true_extent, lb, extent;
    char *free_buffer = NULL;
    char *free_rbuf = NULL;
    char *pml_buffer = NULL;
    char *snd_buffer = NULL;
    char *rcv_buffer = (char*)rbuf;
    char *inplace_temp = NULL;

    /* JMS Codearound for now -- if the operation is not commutative,
     * just call the linear algorithm.  Need to talk to Edgar / George
     * about fixing this algorithm here to work with non-commutative
     * operations. */

    if (!ompi_op_is_commute(op)) {
        return mca_coll_basic_reduce_lin_intra(sbuf, rbuf, count, dtype,
                                               op, root, comm, module);
    }

    /* Some variables */
    size = ompi_comm_size(comm);
    rank = ompi_comm_rank(comm);
    vrank = ompi_op_is_commute(op) ? (rank - root + size) % size : rank;
    dim = comm->c_cube_dim;

    /* Allocate the incoming and resulting message buffers.  See lengthy
     * rationale above. */

    ompi_datatype_get_extent(dtype, &lb, &extent);
    ompi_datatype_get_true_extent(dtype, &true_lb, &true_extent);

    free_buffer = (char*)malloc(true_extent + (count - 1) * extent);
    if (NULL == free_buffer) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    pml_buffer = free_buffer - lb;
    /* read the comment about commutative operations (few lines down
     * the page) */
    if (ompi_op_is_commute(op)) {
        rcv_buffer = pml_buffer;
    }

    /* Allocate sendbuf in case the MPI_IN_PLACE option has been used. See lengthy
     * rationale above. */

    if (MPI_IN_PLACE == sbuf) {
        inplace_temp = (char*)malloc(true_extent + (count - 1) * extent);
        if (NULL == inplace_temp) {
            err = OMPI_ERR_OUT_OF_RESOURCE;
            goto cleanup_and_return;
        }
        sbuf = inplace_temp - lb;
        err = ompi_datatype_copy_content_same_ddt(dtype, count, (char*)sbuf, (char*)rbuf);
    }
    snd_buffer = (char*)sbuf;

    if (rank != root && 0 == (vrank & 1)) {
        /* root is the only one required to provide a valid rbuf.
         * Assume rbuf is invalid for all other ranks, so fix it up
         * here to be valid on all non-leaf ranks */
        free_rbuf = (char*)malloc(true_extent + (count - 1) * extent);
        if (NULL == free_rbuf) {
            err = OMPI_ERR_OUT_OF_RESOURCE;
            goto cleanup_and_return;
        }
        rbuf = free_rbuf - lb;
    }

    /* Loop over cube dimensions. High processes send to low ones in the
     * dimension. */

    for (i = 0, mask = 1; i < dim; ++i, mask <<= 1) {

        /* A high-proc sends to low-proc and stops. */
        if (vrank & mask) {
            peer = vrank & ~mask;
            if (ompi_op_is_commute(op)) {
                peer = (peer + root) % size;
            }

            err = MCA_PML_CALL(send(snd_buffer, count,
                                    dtype, peer, MCA_COLL_BASE_TAG_REDUCE,
                                    MCA_PML_BASE_SEND_STANDARD, comm));
            if (MPI_SUCCESS != err) {
                goto cleanup_and_return;
            }
            snd_buffer = (char*)rbuf;
            break;
        }

        /* A low-proc receives, reduces, and moves to a higher
         * dimension. */

        else {
            peer = vrank | mask;
            if (peer >= size) {
                continue;
            }
            if (ompi_op_is_commute(op)) {
                peer = (peer + root) % size;
            }

            /* Most of the time (all except the first step for commutative
             * operations) we receive into the user-provided buffer
             * (rbuf). The exception exists so that we don't have to copy
             * from sbuf to a temporary location. If the operation is
             * commutative we don't care in which order we apply the
             * operation, so for the first step we can receive the data
             * into pml_buffer and then apply the operation between this
             * buffer and the user-provided data. */

            err = MCA_PML_CALL(recv(rcv_buffer, count, dtype, peer,
                                    MCA_COLL_BASE_TAG_REDUCE, comm,
                                    MPI_STATUS_IGNORE));
            if (MPI_SUCCESS != err) {
                goto cleanup_and_return;
            }
            /* Perform the operation. The target is always the user-provided
             * buffer. We only do the operation here if the data was not
             * received directly into the user buffer. */
            if (snd_buffer != sbuf) {
                /* the target buffer is the locally allocated one */
                ompi_op_reduce(op, rcv_buffer, pml_buffer, count, dtype);
            } else {
                /* If the operation is commutative, we don't care about
                 * the order of operations and can just reduce now.
                 * If it is not commutative, we have to copy the send
                 * buffer into a temp buffer (pml_buffer) and then reduce
                 * what we just received against it. */
                if (!ompi_op_is_commute(op)) {
                    ompi_datatype_copy_content_same_ddt(dtype, count, pml_buffer,
                                                   (char*)sbuf);
                    ompi_op_reduce(op, rbuf, pml_buffer, count, dtype);
                } else {
                    ompi_op_reduce(op, sbuf, pml_buffer, count, dtype);
                }
                /* now we have to send the buffer containing the computed data */
                snd_buffer = pml_buffer;
                /* starting from now we always receive in the user
                 * provided buffer */
                rcv_buffer = (char*)rbuf;
            }
        }
    }

    /* Get the result to the root if needed. */
    err = MPI_SUCCESS;
    if (0 == vrank) {
        if (root == rank) {
            ompi_datatype_copy_content_same_ddt(dtype, count, (char*)rbuf, snd_buffer);
        } else {
            err = MCA_PML_CALL(send(snd_buffer, count,
                                    dtype, root, MCA_COLL_BASE_TAG_REDUCE,
                                    MCA_PML_BASE_SEND_STANDARD, comm));
        }
    } else if (rank == root) {
        err = MCA_PML_CALL(recv(rcv_buffer, count, dtype, 0,
                                MCA_COLL_BASE_TAG_REDUCE,
                                comm, MPI_STATUS_IGNORE));
        if (rcv_buffer != rbuf) {
            ompi_op_reduce(op, rcv_buffer, rbuf, count, dtype);
        }
    }

  cleanup_and_return:
    if (NULL != inplace_temp) {
        free(inplace_temp);
    }
    if (NULL != free_buffer) {
        free(free_buffer);
    }
    if (NULL != free_rbuf) {
        free(free_rbuf);
    }

    /* All done */

    return err;
}
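To connect the hypercube diagrams above with the mask loop in mca_coll_basic_reduce_log_intra, here is a small standalone trace in plain C (no Open MPI dependencies; root is assumed to be 0, so vrank equals rank):
/* Trace of the dimension loop for size = 8, mirroring the diagrams above:
 * step 0: 1->0, 3->2, 5->4, 7->6; step 1: 2->0, 6->4; step 2: 4->0. */
#include <stdio.h>

int main(void)
{
    int size = 8, dim = 3;               /* dim: smallest d with 2^d >= size */

    for (int vrank = 0; vrank < size; ++vrank) {
        for (int i = 0, mask = 1; i < dim; ++i, mask <<= 1) {
            if (vrank & mask) {          /* high proc: send to peer and stop */
                printf("vrank %d sends to %d at step %d\n", vrank, vrank & ~mask, i);
                break;
            } else if ((vrank | mask) < size) {
                printf("vrank %d receives from %d at step %d\n", vrank, vrank | mask, i);
            }
        }
    }
    return 0;
}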
Example #11
0
/**
 * This is a generic implementation of the reduce protocol. It uses the tree
 * provided as an argument and executes all operations on segments of
 * count_by_segment elements of the datatype at a time.
 * For the last communication it updates the count in order to limit the
 * total number of elements to the original count (original_count).
 *
 * Note that for non-commutative operations we cannot save the memory copy
 * for the first block: thus we must copy sendbuf to accumbuf on intermediate
 * nodes to keep the optimized loop happy.
 */
int ompi_coll_tuned_reduce_generic( void* sendbuf, void* recvbuf, int original_count,
                                    ompi_datatype_t* datatype, ompi_op_t* op,
                                    int root, ompi_communicator_t* comm,
                                    mca_coll_base_module_t *module,
                                    ompi_coll_tree_t* tree, int count_by_segment,
                                    int max_outstanding_reqs )
{
    char *inbuf[2] = {NULL, NULL}, *inbuf_free[2] = {NULL, NULL};
    char *accumbuf = NULL, *accumbuf_free = NULL;
    char *local_op_buffer = NULL, *sendtmpbuf = NULL;
    ptrdiff_t extent, lower_bound, segment_increment;
    size_t typelng;
    ompi_request_t* reqs[2] = {MPI_REQUEST_NULL, MPI_REQUEST_NULL};
    int num_segments, line, ret, segindex, i, rank;
    int recvcount, prevcount, inbi;

    /**
     * Determine number of segments and number of elements
     * sent per operation
     */
    ompi_datatype_get_extent( datatype, &lower_bound, &extent );
    ompi_datatype_type_size( datatype, &typelng );
    num_segments = (original_count + count_by_segment - 1) / count_by_segment;
    segment_increment = (ptrdiff_t)count_by_segment * extent;

    sendtmpbuf = (char*) sendbuf; 
    if( sendbuf == MPI_IN_PLACE ) { 
        sendtmpbuf = (char *)recvbuf; 
    }

    OPAL_OUTPUT((ompi_coll_tuned_stream, "coll:tuned:reduce_generic count %d, msg size %ld, segsize %ld, max_requests %d",
                 original_count, (unsigned long)((ptrdiff_t)num_segments * (ptrdiff_t)segment_increment),
                 (unsigned long)segment_increment, max_outstanding_reqs));

    rank = ompi_comm_rank(comm);

    /*printf("Tree of rank %d - ", rank);
    printf("Parent : %d - ", tree->tree_prev);
    printf("Child : ");
    for (i = 0; i < tree->tree_nextsize; i++)
        printf("%d ", tree->tree_next[i]);
    printf("\n");*/

    /* non-leaf nodes - wait for children to send me data & forward up 
       (if needed) */
    if( tree->tree_nextsize > 0 ) {
        ptrdiff_t true_lower_bound, true_extent, real_segment_size;
        ompi_datatype_get_true_extent( datatype, &true_lower_bound, 
                                       &true_extent );

        /* handle a non-existent recv buffer (i.e. it is NULL) and
           protect the recv buffer on non-root nodes */
        accumbuf = (char*)recvbuf;
        if( (NULL == accumbuf) || (root != rank) ) {
            /* Allocate temporary accumulator buffer. */
            accumbuf_free = (char*)malloc(true_extent + 
                                          (ptrdiff_t)(original_count - 1) * extent);
            if (accumbuf_free == NULL) { 
                line = __LINE__; ret = -1; goto error_hndl; 
            }
            accumbuf = accumbuf_free - lower_bound;
        } 

        /* If this is a non-commutative operation we must copy
           sendbuf to the accumbuf, in order to simplify the loops */
        if (!ompi_op_is_commute(op)) {
            ompi_datatype_copy_content_same_ddt(datatype, original_count, 
                                                (char*)accumbuf,
                                                (char*)sendtmpbuf);
        }
        /* Allocate two buffers for incoming segments */
        real_segment_size = true_extent + (ptrdiff_t)(count_by_segment - 1) * extent;
        inbuf_free[0] = (char*) malloc(real_segment_size);
        if( inbuf_free[0] == NULL ) { 
            line = __LINE__; ret = -1; goto error_hndl; 
        }
        inbuf[0] = inbuf_free[0] - lower_bound;
        /* if there is a chance to overlap communication,
           allocate a second buffer */
        if( (num_segments > 1) || (tree->tree_nextsize > 1) ) {
            inbuf_free[1] = (char*) malloc(real_segment_size);
            if( inbuf_free[1] == NULL ) { 
                line = __LINE__; ret = -1; goto error_hndl;
            }
            inbuf[1] = inbuf_free[1] - lower_bound;
        } 

        /* reset input buffer index and receive count */
        inbi = 0;
        recvcount = 0;
        /* for each segment */
        for( segindex = 0; segindex <= num_segments; segindex++ ) {
            prevcount = recvcount;
            /* recvcount - number of elements in current segment */
            recvcount = count_by_segment;
            if( segindex == (num_segments-1) )
                recvcount = original_count - (ptrdiff_t)count_by_segment * (ptrdiff_t)segindex;

            /* for each child */
            for( i = 0; i < tree->tree_nextsize; i++ ) {
                /**
                 * We try to overlap communication:
                 * either with next segment or with the next child
                 */
                /* post irecv for current segindex on current child */
                if( segindex < num_segments ) {
                    void* local_recvbuf = inbuf[inbi];
                    if( 0 == i ) {
                        /* For the first step (1st child per segment) and
                         * commutative operations we might be able to irecv
                         * directly into the accumulate buffer, so that we can
                         * reduce(op) this with our sendbuf in one step; as
                         * ompi_op_reduce only has two buffer pointers,
                         * this avoids an extra memory copy.
                         *
                         * BUT if the operation is non-commutative, or
                         * we are root and are using MPI_IN_PLACE, this is wrong!
                         */
                        if( (ompi_op_is_commute(op)) &&
                            !((MPI_IN_PLACE == sendbuf) && (rank == tree->tree_root)) ) {
                            local_recvbuf = accumbuf + (ptrdiff_t)segindex * (ptrdiff_t)segment_increment;
                        }
                    }
                    ret = MCA_PML_CALL(irecv(local_recvbuf, recvcount, datatype,
                                             tree->tree_next[i], 
                                             MCA_COLL_BASE_TAG_REDUCE, comm, 
                                             &reqs[inbi]));
                    if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl;}
                }
                /* wait for previous req to complete, if any.
                   if there are no requests reqs[inbi ^1] will be 
                   MPI_REQUEST_NULL. */
                /* wait on data from last child for previous segment */
                ret = ompi_request_wait_all( 1, &reqs[inbi ^ 1], 
                                             MPI_STATUSES_IGNORE );
                if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl;  }
                local_op_buffer = inbuf[inbi ^ 1];
                if( i > 0 ) {
                    /* our first operation is to combine our own [sendbuf] data
                     * with the data we received from downstream (but only if
                     * the operation is commutative and we are not root and
                     * not using MPI_IN_PLACE)
                     */
                    if( 1 == i ) {
                        if( (ompi_op_is_commute(op)) && 
                            !((MPI_IN_PLACE == sendbuf) && (rank == tree->tree_root)) ) {
                            local_op_buffer = sendtmpbuf + (ptrdiff_t)segindex * (ptrdiff_t)segment_increment;
                        }
                    }
                    /* apply operation */
                    ompi_op_reduce(op, local_op_buffer, 
                                   accumbuf + (ptrdiff_t)segindex * (ptrdiff_t)segment_increment, 
                                   recvcount, datatype );
                } else if ( segindex > 0 ) {
                    void* accumulator = accumbuf + (ptrdiff_t)(segindex-1) * (ptrdiff_t)segment_increment;
                    if( tree->tree_nextsize <= 1 ) {
                        if( (ompi_op_is_commute(op)) &&
                            !((MPI_IN_PLACE == sendbuf) && (rank == tree->tree_root)) ) {
                            local_op_buffer = sendtmpbuf + (ptrdiff_t)(segindex-1) * (ptrdiff_t)segment_increment;
                        }
                    }
                    ompi_op_reduce(op, local_op_buffer, accumulator, prevcount, 
                                   datatype );

                    /* all reductions on the data available at this step (i)
                     * are complete; pass the result to the parent unless you
                     * are the root.
                     */
                    if (rank != tree->tree_root) {
                        /* send combined/accumulated data to parent */
                        ret = MCA_PML_CALL( send( accumulator, prevcount, 
                                                  datatype, tree->tree_prev, 
                                                  MCA_COLL_BASE_TAG_REDUCE,
                                                  MCA_PML_BASE_SEND_STANDARD, 
                                                  comm) );
                        if (ret != MPI_SUCCESS) { 
                            line = __LINE__; goto error_hndl;  
                        }
                    }

                    /* we stop when segindex = number of segments
                       (i.e. we do num_segments+1 steps for pipelining) */
                    if (segindex == num_segments) break;
                }

                /* update input buffer index */
                inbi = inbi ^ 1;
            } /* end of for each child */
        } /* end of for each segment */

        /* clean up */
        if( inbuf_free[0] != NULL) free(inbuf_free[0]);
        if( inbuf_free[1] != NULL) free(inbuf_free[1]);
        if( accumbuf_free != NULL ) free(accumbuf_free);
    }

    /* leaf nodes 
       Depending on the value of max_outstanding_reqs and 
       the number of segments we have two options:
       - send all segments using blocking send to the parent, or
       - avoid flooding the parent node by limiting the number of
       outstanding requests to max_outstanding_reqs.
       TODO/POSSIBLE IMPROVEMENT: If there is a way to determine the eager size 
       for the current communication, synchronization should be used only 
       when the message/segment size is smaller than the eager size.
    */
    else {

        /* If the number of segments is less than the maximum number of
           outstanding requests, or there is no limit on the maximum number
           of outstanding requests, we send data to the parent using blocking send */
        if ((0 == max_outstanding_reqs) || 
            (num_segments <= max_outstanding_reqs)) {
            
            segindex = 0;
            while ( original_count > 0) {
                if (original_count < count_by_segment) {
                    count_by_segment = original_count;
                }
                ret = MCA_PML_CALL( send((char*)sendbuf + 
                                         (ptrdiff_t)segindex * (ptrdiff_t)segment_increment,
                                         count_by_segment, datatype,
                                         tree->tree_prev, 
                                         MCA_COLL_BASE_TAG_REDUCE,
                                         MCA_PML_BASE_SEND_STANDARD,
                                         comm) );
                if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
                segindex++;
                original_count -= count_by_segment;
            }
        }

        /* Otherwise, introduce flow control:
           - post max_outstanding_reqs non-blocking synchronous sends,
           - for each remaining segment, wait for an ssend to complete
             and post the next one,
           - finally, wait for all outstanding sends to complete.
        */
        else {

            int creq = 0;
            ompi_request_t **sreq = NULL;

            sreq = (ompi_request_t**) calloc( max_outstanding_reqs,
                                              sizeof(ompi_request_t*) );
            if (NULL == sreq) { line = __LINE__; ret = -1; goto error_hndl; }

            /* post first group of requests */
            for (segindex = 0; segindex < max_outstanding_reqs; segindex++) {
                ret = MCA_PML_CALL( isend((char*)sendbuf +
                                          (ptrdiff_t)segindex * (ptrdiff_t)segment_increment,
                                          count_by_segment, datatype,
                                          tree->tree_prev, 
                                          MCA_COLL_BASE_TAG_REDUCE,
                                          MCA_PML_BASE_SEND_SYNCHRONOUS, comm,
                                          &sreq[segindex]) );
                if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl;  }
                original_count -= count_by_segment;
            }

            creq = 0;
            while ( original_count > 0 ) {
                /* wait on a posted request to complete */
                ret = ompi_request_wait(&sreq[creq], MPI_STATUS_IGNORE);
                if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl;  }
                sreq[creq] = MPI_REQUEST_NULL;

                if( original_count < count_by_segment ) {
                    count_by_segment = original_count;
                }
                ret = MCA_PML_CALL( isend((char*)sendbuf + 
                                          (ptrdiff_t)segindex * (ptrdiff_t)segment_increment, 
                                          count_by_segment, datatype, 
                                          tree->tree_prev, 
                                          MCA_COLL_BASE_TAG_REDUCE, 
                                          MCA_PML_BASE_SEND_SYNCHRONOUS, comm, 
                                          &sreq[creq]) );
                if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl;  }
                creq = (creq + 1) % max_outstanding_reqs;
                segindex++;
                original_count -= count_by_segment;
            }

            /* Wait on the remaining requests to complete */
            ret = ompi_request_wait_all( max_outstanding_reqs, sreq, 
                                         MPI_STATUSES_IGNORE );
            if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl;  }

            /* free requests */
            free(sreq);
        }
    }
    return OMPI_SUCCESS;

 error_hndl:  /* error handler */
    OPAL_OUTPUT (( ompi_coll_tuned_stream, 
                   "ERROR_HNDL: node %d file %s line %d error %d\n", 
                   rank, __FILE__, line, ret ));
    if( inbuf_free[0] != NULL ) free(inbuf_free[0]);
    if( inbuf_free[1] != NULL ) free(inbuf_free[1]);
    if( accumbuf_free != NULL ) free(accumbuf_free);
    return ret;
}
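As a quick illustration of the segmentation arithmetic used by ompi_coll_tuned_reduce_generic (the concrete numbers are made up for the example, not taken from the original):
/* Worked example of the segment bookkeeping above: every segment carries
 * count_by_segment elements except possibly the last one. */
#include <stdio.h>

int main(void)
{
    int original_count   = 1000;    /* assumed total element count      */
    int count_by_segment = 256;     /* assumed segment size in elements */

    int num_segments = (original_count + count_by_segment - 1) / count_by_segment;
    int last_count   = original_count - (num_segments - 1) * count_by_segment;

    /* prints: 4 segments: 3 full + 1 of 232 elements */
    printf("%d segments: %d full + 1 of %d elements\n",
           num_segments, num_segments - 1, last_count);
    return 0;
}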
Example #12
0
/*
 *  reduce_lin_intra
 *
 *  Function:   - reduction using O(N) algorithm
 *  Accepts:    - same as MPI_Reduce()
 *  Returns:    - MPI_SUCCESS or error code
 */
int
ompi_coll_tuned_reduce_intra_basic_linear(void *sbuf, void *rbuf, int count,
                                          struct ompi_datatype_t *dtype,
                                          struct ompi_op_t *op,
                                          int root,
                                          struct ompi_communicator_t *comm,
                                          mca_coll_base_module_t *module)
{
    int i, rank, err, size;
    ptrdiff_t true_lb, true_extent, lb, extent;
    char *free_buffer = NULL, *pml_buffer = NULL;
    char *inplace_temp = NULL, *inbuf;

    /* Initialize */

    rank = ompi_comm_rank(comm);
    size = ompi_comm_size(comm);

    OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:reduce_intra_basic_linear rank %d", rank));

    /* If not root, send data to the root. */

    if (rank != root) {
        err = MCA_PML_CALL(send(sbuf, count, dtype, root,
                                MCA_COLL_BASE_TAG_REDUCE,
                                MCA_PML_BASE_SEND_STANDARD, comm));
        return err;
    }

    /* see discussion in ompi_coll_basic_reduce_lin_intra about 
       extent and true extent */
    /* for reducing buffer allocation lengths.... */

    ompi_datatype_get_extent(dtype, &lb, &extent);
    ompi_datatype_get_true_extent(dtype, &true_lb, &true_extent);

    if (MPI_IN_PLACE == sbuf) {
        sbuf = rbuf;
        inplace_temp = (char*)malloc(true_extent + (ptrdiff_t)(count - 1) * extent);
        if (NULL == inplace_temp) {
            return OMPI_ERR_OUT_OF_RESOURCE;
        }
        rbuf = inplace_temp - lb;
    }

    if (size > 1) {
        free_buffer = (char*)malloc(true_extent + (ptrdiff_t)(count - 1) * extent);
        if (NULL == free_buffer) {
            return OMPI_ERR_OUT_OF_RESOURCE;
        }
        pml_buffer = free_buffer - lb;
    }

    /* Initialize the receive buffer. */

    if (rank == (size - 1)) {
        err = ompi_datatype_copy_content_same_ddt(dtype, count, (char*)rbuf, 
                                                  (char*)sbuf);
    } else {
        err = MCA_PML_CALL(recv(rbuf, count, dtype, size - 1,
                                MCA_COLL_BASE_TAG_REDUCE, comm,
                                MPI_STATUS_IGNORE));
    }
    if (MPI_SUCCESS != err) {
        if (NULL != free_buffer) {
            free(free_buffer);
        }
        return err;
    }

    /* Loop receiving and calling reduction function (C or Fortran). */

    for (i = size - 2; i >= 0; --i) {
        if (rank == i) {
            inbuf = (char*)sbuf;
        } else {
            err = MCA_PML_CALL(recv(pml_buffer, count, dtype, i,
                                    MCA_COLL_BASE_TAG_REDUCE, comm,
                                    MPI_STATUS_IGNORE));
            if (MPI_SUCCESS != err) {
                if (NULL != free_buffer) {
                    free(free_buffer);
                }
                return err;
            }

            inbuf = pml_buffer;
        }

        /* Perform the reduction */
        ompi_op_reduce(op, inbuf, rbuf, count, dtype);
    }

    if (NULL != inplace_temp) {
        err = ompi_datatype_copy_content_same_ddt(dtype, count, (char*)sbuf, 
                                                  inplace_temp);
        free(inplace_temp);
    }
    if (NULL != free_buffer) {
        free(free_buffer);
    }

    /* All done */
    return MPI_SUCCESS;
}
/*
 *	reduce_scatter
 *
 *	Function:	- reduce then scatter
 *	Accepts:	- same as MPI_Reduce_scatter()
 *	Returns:	- MPI_SUCCESS or error code
 *
 * Algorithm:
 *   Commutative, reasonably sized messages:
 *     recursive halving algorithm
 *   Others:
 *     reduce and scatterv (needs to be cleaned
 *     up at some point)
 *
 * NOTE that the recursive halving algorithm should be faster than
 * the reduce/scatter for all message sizes.  However, the memory
 * usage for recursive halving is msg_size + 2 * comm_size greater,
 * so I've limited where the recursive halving is used to be nice
 * to the app memory-wise.  There are much better algorithms for
 * large messages with commutative operations, so this should be
 * investigated further.
 *
 * NOTE: We default to a simple reduce/scatterv if one of the rcounts
 * is zero.  This is because the existing algorithms do not currently
 * support a count of zero in the array.
 */
int
mca_coll_basic_reduce_scatter_intra(void *sbuf, void *rbuf, int *rcounts,
                                    struct ompi_datatype_t *dtype,
                                    struct ompi_op_t *op,
                                    struct ompi_communicator_t *comm,
                                    mca_coll_base_module_t *module)
{
    int i, rank, size, count, err = OMPI_SUCCESS;
    ptrdiff_t true_lb, true_extent, lb, extent, buf_size;
    int *disps = NULL;
    char *recv_buf = NULL, *recv_buf_free = NULL;
    char *result_buf = NULL, *result_buf_free = NULL;
    bool zerocounts = false;

    /* Initialize */
    rank = ompi_comm_rank(comm);
    size = ompi_comm_size(comm);

    /* Find displacements and the like */
    disps = (int*) malloc(sizeof(int) * size);
    if (NULL == disps) return OMPI_ERR_OUT_OF_RESOURCE;

    disps[0] = 0;
    for (i = 0; i < (size - 1); ++i) {
        disps[i + 1] = disps[i] + rcounts[i];
        if (0 == rcounts[i]) {
            zerocounts = true;
        }
    }
    count = disps[size - 1] + rcounts[size - 1];
    if (0 == rcounts[size - 1]) {
        zerocounts = true;
    }

    /* short cut the trivial case */
    if (0 == count) {
        free(disps);
        return OMPI_SUCCESS;
    }

    /* get datatype information */
    ompi_datatype_get_extent(dtype, &lb, &extent);
    ompi_datatype_get_true_extent(dtype, &true_lb, &true_extent);
    buf_size = true_extent + (count - 1) * extent;

    /* Handle MPI_IN_PLACE */
    if (MPI_IN_PLACE == sbuf) {
        sbuf = rbuf;
    }

    if ((op->o_flags & OMPI_OP_FLAGS_COMMUTE) &&
        (buf_size < COMMUTATIVE_LONG_MSG) && (!zerocounts)) {
        int tmp_size, remain = 0, tmp_rank;

        /* temporary receive buffer.  See coll_basic_reduce.c for details on sizing */
        recv_buf_free = (char*) malloc(buf_size);
        if (NULL == recv_buf_free) {
            err = OMPI_ERR_OUT_OF_RESOURCE;
            goto cleanup;
        }
        recv_buf = recv_buf_free - lb;

        /* allocate temporary buffer for results */
        result_buf_free = (char*) malloc(buf_size);
        if (NULL == result_buf_free) {
            err = OMPI_ERR_OUT_OF_RESOURCE;
            goto cleanup;
        }
        result_buf = result_buf_free - lb;

        /* copy local buffer into the temporary results */
        err = ompi_datatype_sndrcv(sbuf, count, dtype, result_buf, count, dtype);
        if (OMPI_SUCCESS != err) goto cleanup;

        /* figure out the power-of-two mapping: grow until larger than
           the comm size, then go back one, to get the largest power of
           two less than or equal to the comm size */
        tmp_size = opal_next_poweroftwo(size);
        tmp_size >>= 1;
        remain = size - tmp_size;

        /* If the comm size is not a power of two, have the even-ranked
           processes among the first 2 * remain ranks send to rank + 1,
           leaving a power-of-two number of procs to do the rest of the
           algorithm */
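        /* Worked instance (illustrative): size = 7 -> tmp_size = 4,
           remain = 3.  Ranks 0-5 pair up (0->1, 2->3, 4->5); the even
           rank of each pair drops out, the odd rank becomes tmp_rank
           rank/2 (0, 1, 2), and rank 6 becomes tmp_rank 6 - 3 = 3. */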
        if (rank < 2 * remain) {
            if ((rank & 1) == 0) {
                err = MCA_PML_CALL(send(result_buf, count, dtype, rank + 1, 
                                        MCA_COLL_BASE_TAG_REDUCE_SCATTER,
                                        MCA_PML_BASE_SEND_STANDARD,
                                        comm));
                if (OMPI_SUCCESS != err) goto cleanup;

                /* we don't participate from here on out */
                tmp_rank = -1;
            } else {
                err = MCA_PML_CALL(recv(recv_buf, count, dtype, rank - 1,
                                        MCA_COLL_BASE_TAG_REDUCE_SCATTER,
                                        comm, MPI_STATUS_IGNORE));
                if (OMPI_SUCCESS != err) goto cleanup;

                /* integrate their results into our temp results */
                ompi_op_reduce(op, recv_buf, result_buf, count, dtype);

                /* adjust rank to be the bottom "remain" ranks */
                tmp_rank = rank / 2;
            }
        } else {
            /* just need to adjust rank to show that the bottom "even
               remain" ranks dropped out */
            tmp_rank = rank - remain;
        }

        /* For ranks not kicked out by the above code, perform the
           recursive halving */
        if (tmp_rank >= 0) {
            int *tmp_disps = NULL, *tmp_rcounts = NULL;
            int mask, send_index, recv_index, last_index;

            /* recalculate disps and rcounts to account for the
               special "remainder" processes that are no longer doing
               anything */
            tmp_rcounts = (int*) malloc(tmp_size * sizeof(int));
            if (NULL == tmp_rcounts) {
                err = OMPI_ERR_OUT_OF_RESOURCE;
                goto cleanup;
            }
            tmp_disps = (int*) malloc(tmp_size * sizeof(int));
            if (NULL == tmp_disps) {
                free(tmp_rcounts);
                err = OMPI_ERR_OUT_OF_RESOURCE;
                goto cleanup;
            }

            for (i = 0 ; i < tmp_size ; ++i) {
                if (i < remain) {
                    /* need to include old neighbor as well */
                    tmp_rcounts[i] = rcounts[i * 2 + 1] + rcounts[i * 2];
                } else {
                    tmp_rcounts[i] = rcounts[i + remain];
                }
            }
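            /* Continuing the size = 7 illustration: tmp_rcounts[0..2] each
               hold the combined counts of one pair (rcounts[0]+rcounts[1],
               rcounts[2]+rcounts[3], rcounts[4]+rcounts[5]) and
               tmp_rcounts[3] = rcounts[6]. */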

            tmp_disps[0] = 0;
            for (i = 0; i < tmp_size - 1; ++i) {
                tmp_disps[i + 1] = tmp_disps[i] + tmp_rcounts[i];
            }

            /* do the recursive halving communication.  Don't use the
               dimension information on the communicator because I
               think the information is invalidated by our "shrinking"
               of the communicator */
            mask = tmp_size >> 1;
            send_index = recv_index = 0;
            last_index = tmp_size;
            while (mask > 0) {
                int tmp_peer, peer, send_count, recv_count;
                struct ompi_request_t *request;

                tmp_peer = tmp_rank ^ mask;
                peer = (tmp_peer < remain) ? tmp_peer * 2 + 1 : tmp_peer + remain;

                /* figure out if we're sending, receiving, or both */
                send_count = recv_count = 0;
                if (tmp_rank < tmp_peer) {
                    send_index = recv_index + mask;
                    for (i = send_index ; i < last_index ; ++i) {
                        send_count += tmp_rcounts[i];
                    }
                    for (i = recv_index ; i < send_index ; ++i) {
                        recv_count += tmp_rcounts[i];
                    }
                } else {
                    recv_index = send_index + mask;
                    for (i = send_index ; i < recv_index ; ++i) {
                        send_count += tmp_rcounts[i];
                    }
                    for (i = recv_index ; i < last_index ; ++i) {
                        recv_count += tmp_rcounts[i];
                    }
                }

                /* actual data transfer.  Send from result_buf,
                   receive into recv_buf */
                if (recv_count > 0) {
                    err = MCA_PML_CALL(irecv(recv_buf + tmp_disps[recv_index] * extent,
                                             recv_count, dtype, peer,
                                             MCA_COLL_BASE_TAG_REDUCE_SCATTER,
                                             comm, &request));
                    if (OMPI_SUCCESS != err) {
                        free(tmp_rcounts);
                        free(tmp_disps);
                        goto cleanup;
                    }                                             
                }
                if (send_count > 0) {
                    err = MCA_PML_CALL(send(result_buf + tmp_disps[send_index] * extent,
                                            send_count, dtype, peer, 
                                            MCA_COLL_BASE_TAG_REDUCE_SCATTER,
                                            MCA_PML_BASE_SEND_STANDARD,
                                            comm));
                    if (OMPI_SUCCESS != err) {
                        free(tmp_rcounts);
                        free(tmp_disps);
                        goto cleanup;
                    }                                             
                }
                if (recv_count > 0) {
                    err = ompi_request_wait(&request, MPI_STATUS_IGNORE);
                    if (OMPI_SUCCESS != err) {
                        free(tmp_rcounts);
                        free(tmp_disps);
                        goto cleanup;
                    }                                             
                }

                /* if we received something on this step, push it into
                   the results buffer */
                if (recv_count > 0) {
                    ompi_op_reduce(op, 
                                   recv_buf + tmp_disps[recv_index] * extent, 
                                   result_buf + tmp_disps[recv_index] * extent,
                                   recv_count, dtype);
                }

                /* update for next iteration */
                send_index = recv_index;
                last_index = recv_index + mask;
                mask >>= 1;
            }

            /* copy local results from results buffer into real receive buffer */
            if (0 != rcounts[rank]) {
                err = ompi_datatype_sndrcv(result_buf + disps[rank] * extent,
                                      rcounts[rank], dtype, 
                                      rbuf, rcounts[rank], dtype);
                if (OMPI_SUCCESS != err) {
                    free(tmp_rcounts);
                    free(tmp_disps);
                    goto cleanup;
                }                                             
            }

            free(tmp_rcounts);
            free(tmp_disps);
        }
Example #14
0
/* MPI_IN_PLACE all to all algorithm. TODO: implement a better one. */
int
mca_coll_base_alltoall_intra_basic_inplace(const void *rbuf, int rcount,
                                           struct ompi_datatype_t *rdtype,
                                           struct ompi_communicator_t *comm,
                                           mca_coll_base_module_t *module)
{
    mca_coll_base_module_t *base_module = (mca_coll_base_module_t*) module;
    int i, j, size, rank, err = MPI_SUCCESS, line;
    MPI_Request *preq;
    char *tmp_buffer;
    size_t max_size;
    ptrdiff_t ext, true_lb, true_ext;

    /* Initialize. */

    size = ompi_comm_size(comm);
    rank = ompi_comm_rank(comm);

    /* If only one process, we're done. */
    if (1 == size) {
        return MPI_SUCCESS;
    }

    /* Find the largest receive amount */
    ompi_datatype_type_extent (rdtype, &ext);
    ompi_datatype_get_true_extent ( rdtype, &true_lb, &true_ext);
    max_size = true_ext + ext * (rcount-1);

    /* Allocate a temporary buffer */
    tmp_buffer = calloc (max_size, 1);
    if (NULL == tmp_buffer) {
      return OMPI_ERR_OUT_OF_RESOURCE;
    }
    max_size = ext * rcount;

    /* in-place alltoall slow algorithm (but works) */
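    /* Each unordered pair (i, j), i < j, is handled in its own inner
       iteration: only the two ranks of the pair take part (everyone else
       hits the "continue" below), and rank i's block j trades places with
       rank j's block i through tmp_buffer. */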
    for (i = 0 ; i < size ; ++i) {
        for (j = i+1 ; j < size ; ++j) {
            /* Initiate all send/recv to/from others. */
            preq = coll_base_comm_get_reqs(base_module->base_data, size * 2);

            if (i == rank) {
                /* Copy the data into the temporary buffer */
                err = ompi_datatype_copy_content_same_ddt (rdtype, rcount, tmp_buffer,
                                                       (char *) rbuf + j * max_size);
                if (MPI_SUCCESS != err) { line = __LINE__; goto error_hndl; }

                /* Exchange data with the peer */
                err = MCA_PML_CALL(irecv ((char *) rbuf + max_size * j, rcount, rdtype,
                                          j, MCA_COLL_BASE_TAG_ALLTOALL, comm, preq++));
                if (MPI_SUCCESS != err) { line = __LINE__; goto error_hndl; }

                err = MCA_PML_CALL(isend ((char *) tmp_buffer,  rcount, rdtype,
                                          j, MCA_COLL_BASE_TAG_ALLTOALL, MCA_PML_BASE_SEND_STANDARD,
                                          comm, preq++));
                if (MPI_SUCCESS != err) { line = __LINE__; goto error_hndl; }
            } else if (j == rank) {
                /* Copy the data into the temporary buffer */
                err = ompi_datatype_copy_content_same_ddt (rdtype, rcount, tmp_buffer,
                                                       (char *) rbuf + i * max_size);
                if (MPI_SUCCESS != err) { line = __LINE__; goto error_hndl; }

                /* Exchange data with the peer */
                err = MCA_PML_CALL(irecv ((char *) rbuf + max_size * i, rcount, rdtype,
                                          i, MCA_COLL_BASE_TAG_ALLTOALL, comm, preq++));
                if (MPI_SUCCESS != err) { line = __LINE__; goto error_hndl; }

                err = MCA_PML_CALL(isend ((char *) tmp_buffer,  rcount, rdtype,
                                          i, MCA_COLL_BASE_TAG_ALLTOALL, MCA_PML_BASE_SEND_STANDARD,
                                          comm, preq++));
                if (MPI_SUCCESS != err) { line = __LINE__; goto error_hndl; }
            } else {
                continue;
            }

            /* Wait for the requests to complete */
            err = ompi_request_wait_all (2, base_module->base_data->mcct_reqs, MPI_STATUSES_IGNORE);
            if (MPI_SUCCESS != err) { line = __LINE__; goto error_hndl; }
        }
    }

 error_hndl:
    /* Free the temporary buffer */
    free (tmp_buffer);

    if( MPI_SUCCESS != err ) {
        OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
                     "%s:%4d\tError occurred %d, rank %2d", __FILE__, line, err,
                     rank));
        ompi_coll_base_free_reqs(base_module->base_data->mcct_reqs, 2);
    }

    /* All done */
    return err;
}
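/*
 * Hedged usage sketch (not part of the component code above): from the
 * application side, this in-place path is reached simply by passing
 * MPI_IN_PLACE as the send buffer of MPI_Alltoall.  Buffer names and
 * counts below are illustrative assumptions, not taken from the source.
 */
#include <mpi.h>
#include <stdlib.h>

int main(int argc, char **argv)
{
    int rank, size;
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    /* one int per peer, already laid out block-by-block in recvbuf */
    int *recvbuf = (int*) malloc(size * sizeof(int));
    for (int i = 0; i < size; ++i) {
        recvbuf[i] = rank * 100 + i;
    }

    /* sendbuf == MPI_IN_PLACE: sendcount and sendtype are ignored */
    MPI_Alltoall(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL,
                 recvbuf, 1, MPI_INT, MPI_COMM_WORLD);

    free(recvbuf);
    MPI_Finalize();
    return 0;
}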
Example #15
0
int ompi_osc_ucx_get_accumulate(const void *origin_addr, int origin_count,
                                struct ompi_datatype_t *origin_dt,
                                void *result_addr, int result_count,
                                struct ompi_datatype_t *result_dt,
                                int target, ptrdiff_t target_disp,
                                int target_count, struct ompi_datatype_t *target_dt,
                                struct ompi_op_t *op, struct ompi_win_t *win) {
    ompi_osc_ucx_module_t *module = (ompi_osc_ucx_module_t*) win->w_osc_module;
    ucp_ep_h ep = OSC_UCX_GET_EP(module->comm, target);
    int ret = OMPI_SUCCESS;

    ret = check_sync_state(module, target, false);
    if (ret != OMPI_SUCCESS) {
        return ret;
    }

    ret = start_atomicity(module, ep, target);
    if (ret != OMPI_SUCCESS) {
        return ret;
    }

    ret = ompi_osc_ucx_get(result_addr, result_count, result_dt, target,
                           target_disp, target_count, target_dt, win);
    if (ret != OMPI_SUCCESS) {
        return ret;
    }

    if (op != &ompi_mpi_op_no_op.op) {
        if (op == &ompi_mpi_op_replace.op) {
            ret = ompi_osc_ucx_put(origin_addr, origin_count, origin_dt,
                                   target, target_disp, target_count,
                                   target_dt, win);
            if (ret != OMPI_SUCCESS) {
                return ret;
            }
        } else {
            void *temp_addr = NULL;
            uint32_t temp_count;
            ompi_datatype_t *temp_dt;
            ptrdiff_t temp_lb, temp_extent;
            ucs_status_t status;
            bool is_origin_contig = ompi_datatype_is_contiguous_memory_layout(origin_dt, origin_count);

            if (ompi_datatype_is_predefined(target_dt)) {
                temp_dt = target_dt;
                temp_count = target_count;
            } else {
                ret = ompi_osc_base_get_primitive_type_info(target_dt, &temp_dt, &temp_count);
                if (ret != OMPI_SUCCESS) {
                    return ret;
                }
            }
            ompi_datatype_get_true_extent(temp_dt, &temp_lb, &temp_extent);
            temp_addr = malloc(temp_extent * temp_count);
            if (temp_addr == NULL) {
                return OMPI_ERR_TEMP_OUT_OF_RESOURCE;
            }

            ret = ompi_osc_ucx_get(temp_addr, (int)temp_count, temp_dt,
                                   target, target_disp, target_count, target_dt, win);
            if (ret != OMPI_SUCCESS) {
                return ret;
            }

            status = ucp_ep_flush(ep);
            if (status != UCS_OK) {
                opal_output_verbose(1, ompi_osc_base_framework.framework_output,
                                    "%s:%d: ucp_ep_flush failed: %d\n",
                                    __FILE__, __LINE__, status);
                return OMPI_ERROR;
            }

            if (ompi_datatype_is_predefined(origin_dt) || is_origin_contig) {
                ompi_op_reduce(op, (void *)origin_addr, temp_addr, (int)temp_count, temp_dt);
            } else {
                ucx_iovec_t *origin_ucx_iov = NULL;
                uint32_t origin_ucx_iov_count = 0;
                uint32_t origin_ucx_iov_idx = 0;

                ret = create_iov_list(origin_addr, origin_count, origin_dt,
                                      &origin_ucx_iov, &origin_ucx_iov_count);
                if (ret != OMPI_SUCCESS) {
                    return ret;
                }

                if ((op != &ompi_mpi_op_maxloc.op && op != &ompi_mpi_op_minloc.op) ||
                    ompi_datatype_is_contiguous_memory_layout(temp_dt, temp_count)) {
                    size_t temp_size;
                    char *curr_temp_addr = (char *)temp_addr;
                    ompi_datatype_type_size(temp_dt, &temp_size);
                    while (origin_ucx_iov_idx < origin_ucx_iov_count) {
                        int curr_count = origin_ucx_iov[origin_ucx_iov_idx].len / temp_size;
                        ompi_op_reduce(op, origin_ucx_iov[origin_ucx_iov_idx].addr,
                                       curr_temp_addr, curr_count, temp_dt);
                        /* advance a local cursor; temp_addr itself must keep
                         * pointing at the start for the later put and free */
                        curr_temp_addr += curr_count * temp_size;
                        origin_ucx_iov_idx++;
                    }
                } else {
                    int i;
                    void *curr_origin_addr = origin_ucx_iov[origin_ucx_iov_idx].addr;
                    for (i = 0; i < (int)temp_count; i++) {
                        ompi_op_reduce(op, curr_origin_addr,
                                       (void *)((char *)temp_addr + i * temp_extent),
                                       1, temp_dt);
                        curr_origin_addr = (void *)((char *)curr_origin_addr + temp_extent);
                        origin_ucx_iov_idx++;
                        if (curr_origin_addr >= (void *)((char *)origin_ucx_iov[origin_ucx_iov_idx].addr + origin_ucx_iov[origin_ucx_iov_idx].len)) {
                            origin_ucx_iov_idx++;
                            curr_origin_addr = origin_ucx_iov[origin_ucx_iov_idx].addr;
                        }
                    }
                }
                free(origin_ucx_iov);
            }

            ret = ompi_osc_ucx_put(temp_addr, (int)temp_count, temp_dt, target, target_disp,
                                   target_count, target_dt, win);
            if (ret != OMPI_SUCCESS) {
                return ret;
            }

            status = ucp_ep_flush(ep);
            if (status != UCS_OK) {
                opal_output_verbose(1, ompi_osc_base_framework.framework_output,
                                    "%s:%d: ucp_ep_flush failed: %d\n",
                                    __FILE__, __LINE__, status);
                return OMPI_ERROR;
            }

            free(temp_addr);
        }
    }

    ret = end_atomicity(module, ep, target);

    return ret;
}
Example #16
0
/* Todo: gather_intra_generic, gather_intra_binary, gather_intra_chain,
 * gather_intra_pipeline, segmentation? */
int
ompi_coll_base_gather_intra_binomial(const void *sbuf, int scount,
                                      struct ompi_datatype_t *sdtype,
                                      void *rbuf, int rcount,
                                      struct ompi_datatype_t *rdtype,
                                      int root,
                                      struct ompi_communicator_t *comm,
                                      mca_coll_base_module_t *module)
{
    int line = -1, i, rank, vrank, size, total_recv = 0, err;
    char *ptmp     = NULL, *tempbuf  = NULL;
    ompi_coll_tree_t* bmtree;
    MPI_Status status;
    MPI_Aint sextent, slb, strue_lb, strue_extent;
    MPI_Aint rextent, rlb, rtrue_lb, rtrue_extent;
    mca_coll_base_module_t *base_module = (mca_coll_base_module_t*) module;
    mca_coll_base_comm_t *data = base_module->base_data;

    size = ompi_comm_size(comm);
    rank = ompi_comm_rank(comm);

    OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
                 "ompi_coll_base_gather_intra_binomial rank %d", rank));

    /* create the binomial tree */
    COLL_BASE_UPDATE_IN_ORDER_BMTREE( comm, base_module, root );
    bmtree = data->cached_in_order_bmtree;

    ompi_datatype_get_extent(sdtype, &slb, &sextent);
    ompi_datatype_get_true_extent(sdtype, &strue_lb, &strue_extent);

    vrank = (rank - root + size) % size;

    if (rank == root) {
        ompi_datatype_get_extent(rdtype, &rlb, &rextent);
        ompi_datatype_get_true_extent(rdtype, &rtrue_lb, &rtrue_extent);
        if (0 == root){
            /* root on 0, just use the recv buffer */
            ptmp = (char *) rbuf;
            if (sbuf != MPI_IN_PLACE) {
                err = ompi_datatype_sndrcv((void *)sbuf, scount, sdtype,
                                           ptmp, rcount, rdtype);
                if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
            }
        } else {
            /* root is not on 0, allocate temp buffer for recv,
             * rotate data at the end */
            tempbuf = (char *) malloc(rtrue_extent + ((ptrdiff_t)rcount * (ptrdiff_t)size - 1) * rextent);
            if (NULL == tempbuf) {
                err= OMPI_ERR_OUT_OF_RESOURCE; line = __LINE__; goto err_hndl;
            }

            ptmp = tempbuf - rtrue_lb;
            if (sbuf != MPI_IN_PLACE) {
                /* copy from sbuf to temp buffer */
                err = ompi_datatype_sndrcv((void *)sbuf, scount, sdtype,
                                           ptmp, rcount, rdtype);
                if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
            } else {
                /* copy from rbuf to temp buffer  */
                err = ompi_datatype_copy_content_same_ddt(rdtype, rcount, ptmp,
                                                          (char *)rbuf + (ptrdiff_t)rank * rextent * (ptrdiff_t)rcount);
                if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
            }
        }
        total_recv = rcount;
    } else if (!(vrank % 2)) {
        /* other non-leaf nodes: allocate a temp buffer for the data received
         * from children; the most we need is half of the total data elements,
         * due to the properties of the binomial tree */
        tempbuf = (char *) malloc(strue_extent + ((ptrdiff_t)scount * (ptrdiff_t)size - 1) * sextent);
        if (NULL == tempbuf) {
            err= OMPI_ERR_OUT_OF_RESOURCE; line = __LINE__; goto err_hndl;
        }

        ptmp = tempbuf - strue_lb;
        /* local copy to tempbuf */
        err = ompi_datatype_sndrcv((void *)sbuf, scount, sdtype,
                                   ptmp, scount, sdtype);
        if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }

        /* use sdtype,scount as rdtype,rdcount since they are ignored on
         * non-root procs */
        rdtype = sdtype;
        rcount = scount;
        rextent = sextent;
        total_recv = rcount;
    } else {
        /* leaf nodes, no temp buffer needed, use sdtype,scount as
         * rdtype,rdcount since they are ignored on non-root procs */
        ptmp = (char *) sbuf;
        total_recv = scount;
    }

    if (!(vrank % 2)) {
        /* all non-leaf nodes recv from children */
        for (i = 0; i < bmtree->tree_nextsize; i++) {
            int mycount = 0, vkid;
            /* figure out how much data I have to send to this child */
            vkid = (bmtree->tree_next[i] - root + size) % size;
            mycount = vkid - vrank;
            if (mycount > (size - vkid))
                mycount = size - vkid;
            mycount *= rcount;
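            /* Illustrative case: if vrank = 0 has a child at vkid = 4 in an
               8-process tree, mycount = min(4 - 0, 8 - 4) * rcount
               = 4 * rcount, i.e. that child forwards the data of 4 ranks. */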

            OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
                         "ompi_coll_base_gather_intra_binomial rank %d recv %d mycount = %d",
                         rank, bmtree->tree_next[i], mycount));

            err = MCA_PML_CALL(recv(ptmp + total_recv*rextent, (ptrdiff_t)rcount * size - total_recv, rdtype,
                                    bmtree->tree_next[i], MCA_COLL_BASE_TAG_GATHER,
                                    comm, &status));
            if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }

            total_recv += mycount;
        }
    }

    if (rank != root) {
        /* all nodes except root send to parents */
        OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
                     "ompi_coll_base_gather_intra_binomial rank %d send %d count %d\n",
                     rank, bmtree->tree_prev, total_recv));

        err = MCA_PML_CALL(send(ptmp, total_recv, sdtype,
                                bmtree->tree_prev,
                                MCA_COLL_BASE_TAG_GATHER,
                                MCA_PML_BASE_SEND_STANDARD, comm));
        if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
    }

    if (rank == root) {
        if (root != 0) {
            /* rotate received data on root if root != 0 */
            err = ompi_datatype_copy_content_same_ddt(rdtype, (ptrdiff_t)rcount * (ptrdiff_t)(size - root),
                                                      (char *)rbuf + rextent * (ptrdiff_t)root * (ptrdiff_t)rcount, ptmp);
            if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }


            err = ompi_datatype_copy_content_same_ddt(rdtype, (ptrdiff_t)rcount * (ptrdiff_t)root,
                                                      (char *) rbuf, ptmp + rextent * (ptrdiff_t)rcount * (ptrdiff_t)(size-root));
            if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }

            free(tempbuf);
        }
    } else if (!(vrank % 2)) {
        /* other non-leaf nodes */
        free(tempbuf);
    }
    return MPI_SUCCESS;

 err_hndl:
    if (NULL != tempbuf)
        free(tempbuf);

    OPAL_OUTPUT((ompi_coll_base_framework.framework_output,  "%s:%4d\tError occurred %d, rank %2d",
                 __FILE__, line, err, rank));
    return err;
}
/*
 *	allreduce_intra
 *
 *	Function:	- allreduce using a two-level hierarchy algorithm
 *	Accepts:	- same as MPI_Allreduce()
 *	Returns:	- MPI_SUCCESS or error code
 */
int mca_coll_hierarch_allreduce_intra(void *sbuf, void *rbuf, int count,
				      struct ompi_datatype_t *dtype, 
				      struct ompi_op_t *op,
				      struct ompi_communicator_t *comm,
				      mca_coll_base_module_t *module)
{
    struct ompi_communicator_t *llcomm=NULL;
    struct ompi_communicator_t *lcomm=NULL;
    mca_coll_hierarch_module_t *hierarch_module = (mca_coll_hierarch_module_t *) module;
    int rank;
    int lroot, llroot;
    ptrdiff_t extent, true_extent, lb, true_lb;
    char *tmpbuf=NULL, *tbuf=NULL;
    int ret=OMPI_SUCCESS;
    int root=0;

    rank   = ompi_comm_rank ( comm );
    lcomm  = hierarch_module->hier_lcomm;

    if ( mca_coll_hierarch_verbose_param ) {
      printf("%s:%d: executing hierarchical allreduce with cnt=%d \n",
	     comm->c_name, rank, count );
    }

    llcomm = mca_coll_hierarch_get_llcomm ( root, hierarch_module, &llroot, &lroot);
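    /* Three phases: (1) reduce within the node-local communicator lcomm to
       the local leader, (2) allreduce among the leaders over llcomm, and
       (3) broadcast the result back over lcomm. */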

    if ( MPI_COMM_NULL != lcomm ) {
      ompi_datatype_get_extent(dtype, &lb, &extent);
      ompi_datatype_get_true_extent(dtype, &true_lb, &true_extent);
      
      tbuf = (char*)malloc(true_extent + (count - 1) * extent);
      if (NULL == tbuf) {
	return OMPI_ERR_OUT_OF_RESOURCE;
      }
      tmpbuf = tbuf - lb;
      
      if ( MPI_IN_PLACE != sbuf ) {
	ret = lcomm->c_coll.coll_reduce (sbuf, tmpbuf, count, dtype, 
					 op, lroot, lcomm, 
					 lcomm->c_coll.coll_reduce_module);
      }
      else {
	ret = lcomm->c_coll.coll_reduce (rbuf, tmpbuf, count, dtype, 
					 op, lroot, lcomm, 
					 lcomm->c_coll.coll_reduce_module);
      }
      if ( OMPI_SUCCESS != ret ) {
	goto exit;
      }
    }

    if ( MPI_UNDEFINED != llroot ) {
      if ( MPI_COMM_NULL != lcomm ) {
	ret = llcomm->c_coll.coll_allreduce (tmpbuf, rbuf, count, dtype,
					     op, llcomm, 
					     llcomm->c_coll.coll_allreduce_module);
      }
      else {
	ret = llcomm->c_coll.coll_allreduce (sbuf, rbuf, count, dtype,
					     op, llcomm, 
					     llcomm->c_coll.coll_allreduce_module);
      }
    }	

    if ( MPI_COMM_NULL != lcomm ) {
	ret = lcomm->c_coll.coll_bcast(rbuf, count, dtype, lroot, lcomm, 
				       lcomm->c_coll.coll_bcast_module );
    }


 exit:
    if ( NULL != tmpbuf ) {
	free ( tmpbuf );
    }

    return ret;
}
Example #18
0
int ompi_coll_base_alltoall_intra_bruck(const void *sbuf, int scount,
                                         struct ompi_datatype_t *sdtype,
                                         void* rbuf, int rcount,
                                         struct ompi_datatype_t *rdtype,
                                         struct ompi_communicator_t *comm,
                                         mca_coll_base_module_t *module)
{
    int i, k, line = -1, rank, size, err = 0;
    int sendto, recvfrom, distance, *displs = NULL, *blen = NULL;
    char *tmpbuf = NULL, *tmpbuf_free = NULL;
    ptrdiff_t rlb, slb, tlb, sext, rext, tsext;
    struct ompi_datatype_t *new_ddt;

    if (MPI_IN_PLACE == sbuf) {
        return mca_coll_base_alltoall_intra_basic_inplace (rbuf, rcount, rdtype,
                                                            comm, module);
    }

    size = ompi_comm_size(comm);
    rank = ompi_comm_rank(comm);

    OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
                 "coll:base:alltoall_intra_bruck rank %d", rank));

    err = ompi_datatype_get_extent (sdtype, &slb, &sext);
    if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }

    err = ompi_datatype_get_true_extent(sdtype, &tlb,  &tsext);
    if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }

    err = ompi_datatype_get_extent (rdtype, &rlb, &rext);
    if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }


    displs = (int *) malloc(size * sizeof(int));
    if (displs == NULL) { line = __LINE__; err = -1; goto err_hndl; }
    blen = (int *) malloc(size * sizeof(int));
    if (blen == NULL) { line = __LINE__; err = -1; goto err_hndl; }

    /* tmp buffer allocation for message data */
    tmpbuf_free = (char *) malloc(tsext + ((ptrdiff_t)scount * (ptrdiff_t)size - 1) * sext);
    if (tmpbuf_free == NULL) { line = __LINE__; err = -1; goto err_hndl; }
    tmpbuf = tmpbuf_free - slb;

    /* Step 1 - local rotation - shift up by rank */
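    /* e.g. rank 2 of 4: the send blocks [0 1 2 3] land in tmpbuf as
       [2 3 0 1] */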
    err = ompi_datatype_copy_content_same_ddt (sdtype,
                                               (int32_t) ((ptrdiff_t)(size - rank) * (ptrdiff_t)scount),
                                               tmpbuf,
                                               ((char*) sbuf) + (ptrdiff_t)rank * (ptrdiff_t)scount * sext);
    if (err<0) {
        line = __LINE__; err = -1; goto err_hndl;
    }

    if (rank != 0) {
        err = ompi_datatype_copy_content_same_ddt (sdtype, (ptrdiff_t)rank * (ptrdiff_t)scount,
                                                   tmpbuf + (ptrdiff_t)(size - rank) * (ptrdiff_t)scount* sext,
                                                   (char*) sbuf);
        if (err<0) {
            line = __LINE__; err = -1; goto err_hndl;
        }
    }

    /* perform communication step */
    for (distance = 1; distance < size; distance<<=1) {

        sendto = (rank + distance) % size;
        recvfrom = (rank - distance + size) % size;
        k = 0;

        /* create indexed datatype */
        for (i = 1; i < size; i++) {
            if (( i & distance) == distance) {
                displs[k] = (ptrdiff_t)i * (ptrdiff_t)scount;
                blen[k] = scount;
                k++;
            }
        }
        /* Set indexes and displacements */
        err = ompi_datatype_create_indexed(k, blen, displs, sdtype, &new_ddt);
        if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl;  }
        /* Commit the new datatype */
        err = ompi_datatype_commit(&new_ddt);
        if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl;  }

        /* Sendreceive */
        err = ompi_coll_base_sendrecv ( tmpbuf, 1, new_ddt, sendto,
                                         MCA_COLL_BASE_TAG_ALLTOALL,
                                         rbuf, 1, new_ddt, recvfrom,
                                         MCA_COLL_BASE_TAG_ALLTOALL,
                                         comm, MPI_STATUS_IGNORE, rank );
        if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }

        /* Copy back new data from recvbuf to tmpbuf */
        err = ompi_datatype_copy_content_same_ddt(new_ddt, 1,tmpbuf, (char *) rbuf);
        if (err < 0) { line = __LINE__; err = -1; goto err_hndl;  }

        /* free ddt */
        err = ompi_datatype_destroy(&new_ddt);
        if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl;  }
    } /* end of for (distance = 1... */

    /* Step 3 - local rotation - */
    for (i = 0; i < size; i++) {

        err = ompi_datatype_copy_content_same_ddt (rdtype, (int32_t) rcount,
                                                   ((char*)rbuf) + ((ptrdiff_t)((rank - i + size) % size) * (ptrdiff_t)rcount * rext),
                                                   tmpbuf + (ptrdiff_t)i * (ptrdiff_t)rcount * rext);
        if (err < 0) { line = __LINE__; err = -1; goto err_hndl;  }
    }

    /* Step 4 - clean up */
    if (tmpbuf != NULL) free(tmpbuf_free);
    if (displs != NULL) free(displs);
    if (blen != NULL) free(blen);
    return OMPI_SUCCESS;

 err_hndl:
    OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
                 "%s:%4d\tError occurred %d, rank %2d", __FILE__, line, err,
                 rank));
    if (tmpbuf != NULL) free(tmpbuf_free);
    if (displs != NULL) free(displs);
    if (blen != NULL) free(blen);
    return err;
}
Example #19
0
static int
ompi_coll_portals4_scatter_intra_linear_top(const void *sbuf, int scount, struct ompi_datatype_t *sdtype,
        void *rbuf, int rcount, struct ompi_datatype_t *rdtype,
        int root,
        struct ompi_communicator_t *comm,
        ompi_coll_portals4_request_t *request,
        mca_coll_base_module_t *module)
{
    mca_coll_portals4_module_t *portals4_module = (mca_coll_portals4_module_t*) module;
    int ret, line;
    ptl_ct_event_t ct;

    ptl_ct_event_t sync_incr_event;

    int8_t i_am_root;

    int32_t expected_rtrs = 0;
    int32_t expected_puts = 0;
    int32_t expected_acks = 0;
    int32_t expected_ops  = 0;

    int32_t expected_chained_rtrs = 0;
    int32_t expected_chained_acks = 0;


    OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
                 "coll:portals4:scatter_intra_linear_top enter rank %d", request->u.scatter.my_rank));

    request->type                   = OMPI_COLL_PORTALS4_TYPE_SCATTER;
    request->u.scatter.scatter_buf  = NULL;
    request->u.scatter.scatter_mdh  = PTL_INVALID_HANDLE;
    request->u.scatter.scatter_cth  = PTL_INVALID_HANDLE;
    request->u.scatter.scatter_meh  = PTL_INVALID_HANDLE;
    request->u.scatter.sync_mdh     = PTL_INVALID_HANDLE;
    request->u.scatter.sync_cth     = PTL_INVALID_HANDLE;
    request->u.scatter.sync_meh     = PTL_INVALID_HANDLE;

    request->u.scatter.my_rank   = ompi_comm_rank(comm);
    request->u.scatter.size      = ompi_comm_size(comm);
    request->u.scatter.root_rank = root;
    request->u.scatter.sbuf      = sbuf;
    request->u.scatter.rbuf      = rbuf;

    request->u.scatter.pack_src_buf    = sbuf;
    request->u.scatter.pack_src_count  = scount;
    request->u.scatter.pack_src_dtype  = sdtype;
    ompi_datatype_get_extent(request->u.scatter.pack_src_dtype,
                             &request->u.scatter.pack_src_lb,
                             &request->u.scatter.pack_src_extent);
    ompi_datatype_get_true_extent(request->u.scatter.pack_src_dtype,
                                  &request->u.scatter.pack_src_true_lb,
                                  &request->u.scatter.pack_src_true_extent);

    if ((root == request->u.scatter.my_rank) && (rbuf == MPI_IN_PLACE)) {
        request->u.scatter.unpack_dst_buf   = NULL;
        request->u.scatter.unpack_dst_count = 0;
        request->u.scatter.unpack_dst_dtype = MPI_DATATYPE_NULL;
    } else {
        request->u.scatter.unpack_dst_buf   = rbuf;
        request->u.scatter.unpack_dst_count = rcount;
        request->u.scatter.unpack_dst_dtype = rdtype;
        request->u.scatter.unpack_dst_offset = 0;
        ompi_datatype_get_extent(request->u.scatter.unpack_dst_dtype,
                                 &request->u.scatter.unpack_dst_lb,
                                 &request->u.scatter.unpack_dst_extent);
        ompi_datatype_get_true_extent(request->u.scatter.unpack_dst_dtype,
                                      &request->u.scatter.unpack_dst_true_lb,
                                      &request->u.scatter.unpack_dst_true_extent);
    }

    opal_output_verbose(30, ompi_coll_base_framework.framework_output,
                        "%s:%d:rank(%d): request->u.scatter.unpack_dst_offset(%lu)",
                        __FILE__, __LINE__, request->u.scatter.my_rank,
                        request->u.scatter.unpack_dst_offset);

    /**********************************/
    /* Setup Common Parameters        */
    /**********************************/

    i_am_root = (request->u.scatter.my_rank == request->u.scatter.root_rank);

    request->u.scatter.coll_count = opal_atomic_add_size_t(&portals4_module->coll_count, 1);

    ret = setup_scatter_buffers_linear(comm, request, portals4_module);
    if (MPI_SUCCESS != ret) {
        line = __LINE__;
        goto err_hdlr;
    }

    ret = setup_scatter_handles(comm, request, portals4_module);
    if (MPI_SUCCESS != ret) {
        line = __LINE__;
        goto err_hdlr;
    }

    ret = setup_sync_handles(comm, request, portals4_module);
    if (MPI_SUCCESS != ret) {
        line = __LINE__;
        goto err_hdlr;
    }

    /**********************************/
    /* do the scatter                 */
    /**********************************/
    if (i_am_root) {
        /* operations on the sync counter */
        expected_rtrs = request->u.scatter.size - 1; /* expect RTRs from non-root ranks */
        expected_acks = request->u.scatter.size - 1; /* expect Recv-ACKs from non-root ranks */

        /* operations on the scatter counter */
        expected_puts         = 0;
        expected_chained_rtrs = 1;
        expected_chained_acks = 1;

        /* Chain the RTR and Recv-ACK to the Scatter CT */
        sync_incr_event.success=1;
        sync_incr_event.failure=0;
        ret = PtlTriggeredCTInc(request->u.scatter.scatter_cth,
                                sync_incr_event,
                                request->u.scatter.sync_cth,
                                expected_rtrs);
        if (PTL_OK != ret) {
            ret = OMPI_ERROR;
            line = __LINE__;
            goto err_hdlr;
        }

        ret = PtlTriggeredCTInc(request->u.scatter.scatter_cth,
                                sync_incr_event,
                                request->u.scatter.sync_cth,
                                expected_rtrs + expected_acks);
        if (PTL_OK != ret) {
            ret = OMPI_ERROR;
            line = __LINE__;
            goto err_hdlr;
        }

        /* root, so put packed bytes to other ranks */
        for (int32_t i=0; i<request->u.scatter.size; i++) {
            /* do not put to my scatter_buf.  my data gets unpacked into my out buffer in linear_bottom(). */
            if (i == request->u.scatter.my_rank) {
                continue;
            }

            ptl_size_t offset = request->u.scatter.packed_size * i;

            opal_output_verbose(30, ompi_coll_base_framework.framework_output,
                                "%s:%d:rank(%d): offset(%lu)=rank(%d) * packed_size(%ld)",
                                __FILE__, __LINE__, request->u.scatter.my_rank,
                                offset, i, request->u.scatter.packed_size);

            ret = PtlTriggeredPut(request->u.scatter.scatter_mdh,
                                  (ptl_size_t)request->u.scatter.scatter_buf + offset,
                                  request->u.scatter.packed_size,
                                  PTL_NO_ACK_REQ,
                                  ompi_coll_portals4_get_peer(comm, i),
                                  mca_coll_portals4_component.pt_idx,
                                  request->u.scatter.scatter_match_bits,
                                  0,
                                  NULL,
                                  0,
                                  request->u.scatter.scatter_cth,
                                  expected_chained_rtrs);
            if (PTL_OK != ret) {
                ret = OMPI_ERROR;
                line = __LINE__;
                goto err_hdlr;
            }
        }
    } else {
        /* non-root, so do nothing */

        /* operations on the sync counter */
        expected_rtrs = 0;
        expected_acks = 0;

        /* operations on the scatter counter */
        expected_puts         = 1;  /* scatter put from root */
        expected_chained_rtrs = 0;
        expected_chained_acks = 0;
    }

    expected_ops = expected_chained_rtrs + expected_puts;

    /**********************************************/
    /* only non-root ranks are PUT to, so only    */
    /* non-root ranks must PUT a Recv-ACK to root */
    /**********************************************/
    if (!i_am_root) {
        ret = PtlTriggeredPut(request->u.scatter.sync_mdh,
                              0,
                              0,
                              PTL_NO_ACK_REQ,
                              ompi_coll_portals4_get_peer(comm, request->u.scatter.root_rank),
                              mca_coll_portals4_component.pt_idx,
                              request->u.scatter.sync_match_bits,
                              0,
                              NULL,
                              0,
                              request->u.scatter.scatter_cth,
                              expected_ops);
        if (PTL_OK != ret) {
            ret = OMPI_ERROR;
            line = __LINE__;
            goto err_hdlr;
        }
    }

    expected_ops += expected_chained_acks;

    if (!request->u.scatter.is_sync) {
        /******************************************/
        /* put to finish pt when all ops complete */
        /******************************************/
        ret = PtlTriggeredPut(mca_coll_portals4_component.zero_md_h,
                              0,
                              0,
                              PTL_NO_ACK_REQ,
                              ompi_coll_portals4_get_peer(comm, request->u.scatter.my_rank),
                              mca_coll_portals4_component.finish_pt_idx,
                              0,
                              0,
                              NULL,
                              (uintptr_t) request,
                              request->u.scatter.scatter_cth,
                              expected_ops);
        if (PTL_OK != ret) {
            ret = OMPI_ERROR;
            line = __LINE__;
            goto err_hdlr;
        }
    }

    /**************************************/
    /* all non-root ranks put RTR to root */
    /**************************************/
    if (!i_am_root) {
        ret = PtlPut(request->u.scatter.sync_mdh,
                     0,
                     0,
                     PTL_NO_ACK_REQ,
                     ompi_coll_portals4_get_peer(comm, request->u.scatter.root_rank),
                     mca_coll_portals4_component.pt_idx,
                     request->u.scatter.sync_match_bits,
                     0,
                     NULL,
                     0);
        if (PTL_OK != ret) {
            ret = OMPI_ERROR;
            line = __LINE__;
            goto err_hdlr;
        }
    }

    if (request->u.scatter.is_sync) {
        opal_output_verbose(1, ompi_coll_base_framework.framework_output,
                            "calling CTWait(expected_ops=%d)\n", expected_ops);

        /********************************/
        /* Wait for all ops to complete */
        /********************************/
        ret = PtlCTWait(request->u.scatter.scatter_cth, expected_ops, &ct);
        if (PTL_OK != ret) {
            ret = OMPI_ERROR;
            line = __LINE__;
            goto err_hdlr;
        }

        opal_output_verbose(1, ompi_coll_base_framework.framework_output,
                            "completed CTWait(expected_ops=%d)\n", expected_ops);
    }

    OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
                 "coll:portals4:scatter_intra_linear_top exit rank %d", request->u.scatter.my_rank));

    return OMPI_SUCCESS;

err_hdlr:
    if (NULL != request->u.scatter.scatter_buf)
        free(request->u.scatter.scatter_buf);

    opal_output(ompi_coll_base_framework.framework_output,
                "%s:%4d:%4d\tError occurred ret=%d, rank %2d",
                __FILE__, __LINE__, line, ret, request->u.scatter.my_rank);

    return ret;
}
Example #20
0
/*
 *   ompi_coll_base_allreduce_intra_recursivedoubling
 *
 *   Function:       Recursive doubling algorithm for allreduce operation
 *   Accepts:        Same as MPI_Allreduce()
 *   Returns:        MPI_SUCCESS or error code
 *
 *   Description:    Implements recursive doubling algorithm for allreduce.
 *                   Original (non-segmented) implementation is used in MPICH-2
 *                   for small and intermediate size messages.
 *                   The algorithm preserves order of operations so it can
 *                   be used both by commutative and non-commutative operations.
 *
 *         Example on 7 nodes:
 *         Initial state
 *         #      0       1      2       3      4       5      6
 *               [0]     [1]    [2]     [3]    [4]     [5]    [6]
 *         Initial adjustment step for non-power of two nodes.
 *         old rank      1              3              5      6
 *         new rank      0              1              2      3
 *                     [0+1]          [2+3]          [4+5]   [6]
 *         Step 1
 *         old rank      1              3              5      6
 *         new rank      0              1              2      3
 *                     [0+1+]         [0+1+]         [4+5+]  [4+5+]
 *                     [2+3+]         [2+3+]         [6   ]  [6   ]
 *         Step 2
 *         old rank      1              3              5      6
 *         new rank      0              1              2      3
 *                     [0+1+]         [0+1+]         [0+1+]  [0+1+]
 *                     [2+3+]         [2+3+]         [2+3+]  [2+3+]
 *                     [4+5+]         [4+5+]         [4+5+]  [4+5+]
 *                     [6   ]         [6   ]         [6   ]  [6   ]
 *         Final adjustment step for non-power of two nodes
 *         #      0       1      2       3      4       5      6
 *              [0+1+] [0+1+] [0+1+]  [0+1+] [0+1+]  [0+1+] [0+1+]
 *              [2+3+] [2+3+] [2+3+]  [2+3+] [2+3+]  [2+3+] [2+3+]
 *              [4+5+] [4+5+] [4+5+]  [4+5+] [4+5+]  [4+5+] [4+5+]
 *              [6   ] [6   ] [6   ]  [6   ] [6   ]  [6   ] [6   ]
 *
 */
int
ompi_coll_base_allreduce_intra_recursivedoubling(const void *sbuf, void *rbuf,
                                                  int count,
                                                  struct ompi_datatype_t *dtype,
                                                  struct ompi_op_t *op,
                                                  struct ompi_communicator_t *comm,
                                                  mca_coll_base_module_t *module)
{
    int ret, line, rank, size, adjsize, remote, distance;
    int newrank, newremote, extra_ranks;
    char *tmpsend = NULL, *tmprecv = NULL, *tmpswap = NULL, *inplacebuf = NULL;
    ptrdiff_t true_lb, true_extent, lb, extent;
    ompi_request_t *reqs[2] = {NULL, NULL};

    size = ompi_comm_size(comm);
    rank = ompi_comm_rank(comm);

    OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
                 "coll:base:allreduce_intra_recursivedoubling rank %d", rank));

    /* Special case for size == 1 */
    if (1 == size) {
        if (MPI_IN_PLACE != sbuf) {
            ret = ompi_datatype_copy_content_same_ddt(dtype, count, (char*)rbuf, (char*)sbuf);
            if (ret < 0) { line = __LINE__; goto error_hndl; }
        }
        return MPI_SUCCESS;
    }

    /* Allocate and initialize temporary send buffer */
    ret = ompi_datatype_get_extent(dtype, &lb, &extent);
    if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; }
    ret = ompi_datatype_get_true_extent(dtype, &true_lb, &true_extent);
    if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; }

    inplacebuf = (char*) malloc(true_extent + (ptrdiff_t)(count - 1) * extent);
    if (NULL == inplacebuf) { ret = -1; line = __LINE__; goto error_hndl; }

    if (MPI_IN_PLACE == sbuf) {
        ret = ompi_datatype_copy_content_same_ddt(dtype, count, inplacebuf, (char*)rbuf);
        if (ret < 0) { line = __LINE__; goto error_hndl; }
    } else {
        ret = ompi_datatype_copy_content_same_ddt(dtype, count, inplacebuf, (char*)sbuf);
        if (ret < 0) { line = __LINE__; goto error_hndl; }
    }

    tmpsend = (char*) inplacebuf;
    tmprecv = (char*) rbuf;

    /* Determine nearest power of two less than or equal to size */
    adjsize = opal_next_poweroftwo (size);
    adjsize >>= 1;

    /* Handle the non-power-of-two case:
       - Even ranks less than 2 * extra_ranks send their data to (rank + 1)
         and set their new rank to -1.
       - Odd ranks less than 2 * extra_ranks receive data from (rank - 1),
         apply the operation, and set their new rank to rank / 2.
       - Everyone else sets the new rank to rank - extra_ranks.
    */
    extra_ranks = size - adjsize;
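    /* e.g. size = 7: opal_next_poweroftwo(7) = 8, so adjsize = 4 and
       extra_ranks = 3, matching the 7-node example in the header comment */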
    if (rank <  (2 * extra_ranks)) {
        if (0 == (rank % 2)) {
            ret = MCA_PML_CALL(send(tmpsend, count, dtype, (rank + 1),
                                    MCA_COLL_BASE_TAG_ALLREDUCE,
                                    MCA_PML_BASE_SEND_STANDARD, comm));
            if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; }
            newrank = -1;
        } else {
            ret = MCA_PML_CALL(recv(tmprecv, count, dtype, (rank - 1),
                                    MCA_COLL_BASE_TAG_ALLREDUCE, comm,
                                    MPI_STATUS_IGNORE));
            if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; }
            /* tmpsend = tmprecv (op) tmpsend */
            ompi_op_reduce(op, tmprecv, tmpsend, count, dtype);
            newrank = rank >> 1;
        }
    } else {
/*******************************************************************************
 * ompi_coll_tuned_reduce_scatter_intra_nonoverlapping
 *
 * This function just calls a reduce to rank 0, followed by an 
 * appropriate scatterv call.
 */
int ompi_coll_tuned_reduce_scatter_intra_nonoverlapping(void *sbuf, void *rbuf, 
                                                        int *rcounts,
                                                        struct ompi_datatype_t *dtype,
                                                        struct ompi_op_t *op,
                                                        struct ompi_communicator_t *comm,
                                                        mca_coll_base_module_t *module) 
{
    int err, i, rank, size, total_count, *displs = NULL;
    const int root = 0;
    char *tmprbuf = NULL, *tmprbuf_free = NULL;

    rank = ompi_comm_rank(comm);
    size = ompi_comm_size(comm);

    OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:reduce_scatter_intra_nonoverlapping, rank %d", rank));

    for (i = 0, total_count = 0; i < size; i++) { total_count += rcounts[i]; }

    /* Reduce to rank 0 (root) and scatterv */
    tmprbuf = (char*) rbuf;
    if (MPI_IN_PLACE == sbuf) {
        /* rbuf on root (0) is big enough to hold whole data */
        if (root == rank) {
            err = comm->c_coll.coll_reduce (MPI_IN_PLACE, tmprbuf, total_count, 
                                            dtype, op, root, comm, comm->c_coll.coll_reduce_module);
        } else {
            err = comm->c_coll.coll_reduce(tmprbuf, NULL, total_count,
                                           dtype, op, root, comm, comm->c_coll.coll_reduce_module);
        }
    } else {
        if (root == rank) {
            /* We must allocate temporary receive buffer on root to ensure that
               rbuf is big enough */
            ptrdiff_t lb, extent, tlb, textent;
         
            ompi_datatype_get_extent(dtype, &lb, &extent);
            ompi_datatype_get_true_extent(dtype, &tlb, &textent);

            tmprbuf_free = (char*) malloc(textent + (ptrdiff_t)(total_count - 1) * extent);
            if (NULL == tmprbuf_free) {
                return OMPI_ERR_OUT_OF_RESOURCE;
            }
            tmprbuf = tmprbuf_free - lb;
        }
        err = comm->c_coll.coll_reduce (sbuf, tmprbuf, total_count,
                                        dtype, op, root, comm, comm->c_coll.coll_reduce_module);
    }
    if (MPI_SUCCESS != err) {
        if (NULL != tmprbuf_free) free(tmprbuf_free);
        return err;
    }
 
    displs = (int*) malloc(size * sizeof(int));
    if (NULL == displs) {
        if (NULL != tmprbuf_free) free(tmprbuf_free);
        return OMPI_ERR_OUT_OF_RESOURCE;
    }
    displs[0] = 0;
    for (i = 1; i < size; i++) {
        displs[i] = displs[i-1] + rcounts[i-1];
    }
    err =  comm->c_coll.coll_scatterv (tmprbuf, rcounts, displs, dtype,
                                       rbuf, rcounts[rank], dtype,
                                       root, comm, comm->c_coll.coll_scatterv_module);
    free(displs);
    if (NULL != tmprbuf_free) free(tmprbuf_free);

    return err;
}
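/*
 * Hedged usage sketch (not part of the component code above): it shows what
 * the rcounts array means at the MPI level -- rcounts[i] is the number of
 * reduced elements that rank i keeps.  All names and values below are
 * illustrative assumptions, not taken from the source.
 */
#include <mpi.h>
#include <stdlib.h>

int main(int argc, char **argv)
{
    int rank, size;
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    /* every rank keeps two elements of the element-wise sum */
    int *rcounts = (int*) malloc(size * sizeof(int));
    for (int i = 0; i < size; ++i) {
        rcounts[i] = 2;
    }

    int total = 2 * size;
    int *sendbuf = (int*) malloc(total * sizeof(int));
    int recvbuf[2];
    for (int i = 0; i < total; ++i) {
        sendbuf[i] = rank + i;
    }

    /* reduce total elements across all ranks, then scatter them by rcounts */
    MPI_Reduce_scatter(sendbuf, recvbuf, rcounts, MPI_INT, MPI_SUM,
                       MPI_COMM_WORLD);

    free(sendbuf);
    free(rcounts);
    MPI_Finalize();
    return 0;
}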
Example #22
0
/*
 * reduce_intra_in_order_binary 
 * 
 * Function:      Logarithmic reduce operation for non-commutative operations.
 * Accepts:       same as MPI_Reduce()
 * Returns:       MPI_SUCCESS or error code
 */
int ompi_coll_tuned_reduce_intra_in_order_binary( void *sendbuf, void *recvbuf,
                                                  int count, 
                                                  ompi_datatype_t* datatype,
                                                  ompi_op_t* op, int root,
                                                  ompi_communicator_t* comm, 
                                                  mca_coll_base_module_t *module,
                                                  uint32_t segsize,
                                                  int max_outstanding_reqs  )
{
    int ret, rank, size, io_root, segcount = count;
    void *use_this_sendbuf = NULL, *use_this_recvbuf = NULL;
    size_t typelng;
    mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
    mca_coll_tuned_comm_t *data = tuned_module->tuned_data;

    rank = ompi_comm_rank(comm);
    size = ompi_comm_size(comm);
    OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:reduce_intra_in_order_binary rank %d ss %5d",
                 rank, segsize));

    COLL_TUNED_UPDATE_IN_ORDER_BINTREE( comm, tuned_module );

    /**
     * Determine number of segments and number of elements
     * sent per operation
     */
    ompi_datatype_type_size( datatype, &typelng );
    COLL_TUNED_COMPUTED_SEGCOUNT( segsize, typelng, segcount );

    /* An in-order binary tree must use root (size-1) to preserve the order of
       operations.  Thus, if root is not rank (size - 1), then we must handle
       1. MPI_IN_PLACE option on real root, and 
       2. we must allocate temporary recvbuf on rank (size - 1).
       Note that the generic function must be careful not to switch the
       order of operations for non-commutative ops.
    */
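    /* Illustrative case: 4 processes, root = 1.  Then io_root = 3; rank 1
       stages its MPI_IN_PLACE data in a temporary send buffer, rank 3
       reduces into a temporary recvbuf, and the result is shipped from
       rank 3 back to rank 1 afterwards. */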
    io_root = size - 1;
    use_this_sendbuf = sendbuf;
    use_this_recvbuf = recvbuf;
    if (io_root != root) {
        ptrdiff_t tlb, text, lb, ext;
        char *tmpbuf = NULL;
    
        ompi_datatype_get_extent(datatype, &lb, &ext);
        ompi_datatype_get_true_extent(datatype, &tlb, &text);

        if ((root == rank) && (MPI_IN_PLACE == sendbuf)) {
            tmpbuf = (char *) malloc(text + (ptrdiff_t)(count - 1) * ext);
            if (NULL == tmpbuf) {
                return MPI_ERR_INTERN;
            }
            ompi_datatype_copy_content_same_ddt(datatype, count, 
                                                (char*)tmpbuf,
                                                (char*)recvbuf);
            use_this_sendbuf = tmpbuf;
        } else if (io_root == rank) {
            tmpbuf = (char *) malloc(text + (ptrdiff_t)(count - 1) * ext);
            if (NULL == tmpbuf) {
                return MPI_ERR_INTERN;
            }
            use_this_recvbuf = tmpbuf;
        }
    }

    /* Use generic reduce with in-order binary tree topology and io_root */
    ret = ompi_coll_tuned_reduce_generic( use_this_sendbuf, use_this_recvbuf, count, datatype,
                                          op, io_root, comm, module, 
                                          data->cached_in_order_bintree, 
                                          segcount, max_outstanding_reqs );
    if (MPI_SUCCESS != ret) { return ret; }

    /* Clean up */
    if (io_root != root) {
        if (root == rank) {
            /* Receive result from rank io_root to recvbuf */
            ret = MCA_PML_CALL(recv(recvbuf, count, datatype, io_root,
                                    MCA_COLL_BASE_TAG_REDUCE, comm,
                                    MPI_STATUS_IGNORE));
            if (MPI_SUCCESS != ret) { return ret; }
            if (MPI_IN_PLACE == sendbuf) {
                free(use_this_sendbuf);
            }
          
        } else if (io_root == rank) {
            /* Send result from use_this_recvbuf to root */
            ret = MCA_PML_CALL(send(use_this_recvbuf, count, datatype, root,
                                    MCA_COLL_BASE_TAG_REDUCE, 
                                    MCA_PML_BASE_SEND_STANDARD, comm));
            if (MPI_SUCCESS != ret) { return ret; }
            free(use_this_recvbuf);
        }
    }

    return MPI_SUCCESS;
}
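
COLL_TUNED_COMPUTED_SEGCOUNT above turns the byte segment size into a per-segment element count. A minimal sketch of that derivation, under the assumption that the macro essentially divides the segment size by the datatype size (the actual Open MPI macro may differ in detail), is:

/* Sketch: derive how many elements go into each pipeline segment.
 * segsize == 0 is taken to mean "do not segment at all". */
#include <stddef.h>

static int computed_segcount(size_t segsize, size_t typelng, int count)
{
    size_t segcount;

    if (0 == segsize || (size_t)count * typelng <= segsize) {
        return count;               /* one segment holds everything */
    }
    segcount = segsize / typelng;
    if (0 == segcount) {
        segcount = 1;               /* segment smaller than one element */
    }
    return (int)segcount;
}

With segsize = 32768 and a 4-byte datatype, for example, this yields 8192 elements per segment.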
/*
 *	exscan_intra
 *
 *	Function:	- basic exscan operation
 *	Accepts:	- same arguments as MPI_Exscan()
 *	Returns:	- MPI_SUCCESS or error code
 */
int
mca_coll_basic_exscan_intra(void *sbuf, void *rbuf, int count,
                            struct ompi_datatype_t *dtype,
                            struct ompi_op_t *op,
                            struct ompi_communicator_t *comm,
                            mca_coll_base_module_t *module)
{
    int size, rank, err;
    ptrdiff_t true_lb, true_extent, lb, extent;
    char *free_buffer = NULL;
    char *reduce_buffer = NULL;

    rank = ompi_comm_rank(comm);
    size = ompi_comm_size(comm);

    /* For MPI_IN_PLACE, just adjust send buffer to point to
     * receive buffer. */
    if (MPI_IN_PLACE == sbuf) {
        sbuf = rbuf;
    }

    /* If we're rank 0, then just send our sbuf to the next rank, and
     * we are done. */
    if (0 == rank) {
        return MCA_PML_CALL(send(sbuf, count, dtype, rank + 1,
                                 MCA_COLL_BASE_TAG_EXSCAN,
                                 MCA_PML_BASE_SEND_STANDARD, comm));
    }

    /* If we're the last rank, then just receive the result from the
     * prior rank, and we are done. */
    else if ((size - 1) == rank) {
        return MCA_PML_CALL(recv(rbuf, count, dtype, rank - 1,
                                 MCA_COLL_BASE_TAG_EXSCAN, comm,
                                 MPI_STATUS_IGNORE));
    }

    /* Otherwise, get the result from the prior rank, combine it with my
     * data, and send it to the next rank */

    /* Get a temporary buffer to perform the reduction into.  Rationale
     * for malloc'ing this size is provided in coll_basic_reduce.c. */
    ompi_datatype_get_extent(dtype, &lb, &extent);
    ompi_datatype_get_true_extent(dtype, &true_lb, &true_extent);

    free_buffer = (char*)malloc(true_extent + (count - 1) * extent);
    if (NULL == free_buffer) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }
    reduce_buffer = free_buffer - true_lb;
    err = ompi_datatype_copy_content_same_ddt(dtype, count,
                                              reduce_buffer, (char*)sbuf);
    if (MPI_SUCCESS != err) {
        goto error;
    }

    /* Receive the reduced value from the prior rank */
    err = MCA_PML_CALL(recv(rbuf, count, dtype, rank - 1,
                            MCA_COLL_BASE_TAG_EXSCAN, comm, MPI_STATUS_IGNORE));
    if (MPI_SUCCESS != err) {
        goto error;
    }

    /* Now reduce the prior rank's result with my source buffer.  The source
     * buffer had been previously copied into the temporary reduce_buffer. */
    ompi_op_reduce(op, rbuf, reduce_buffer, count, dtype);

    /* Send my result off to the next rank */
    err = MCA_PML_CALL(send(reduce_buffer, count, dtype, rank + 1,
                            MCA_COLL_BASE_TAG_EXSCAN,
                            MCA_PML_BASE_SEND_STANDARD, comm));
    /* Error */
  error:
    free(free_buffer);

    /* All done */
    return err;
}
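
For context, the routine above implements the semantics of MPI_Exscan: rank i ends up with the reduction of the send buffers of ranks 0 through i-1, and rank 0's receive buffer is left untouched. A minimal caller (a standalone test program, not part of this component) could look like:

/* Each rank contributes its own rank number; after MPI_Exscan with MPI_SUM,
 * rank i holds 0 + 1 + ... + (i-1).  Rank 0's result is undefined. */
#include <mpi.h>
#include <stdio.h>

int main(int argc, char **argv)
{
    int rank, partial = 0;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    MPI_Exscan(&rank, &partial, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);

    if (rank > 0) {
        printf("rank %d: exclusive prefix sum = %d\n", rank, partial);
    }

    MPI_Finalize();
    return 0;
}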