Exemple #1
0
/*
 * Prepare an MXM request base for streamed (non-contiguous) data.
 *
 * mxm_req    request base to fill in
 * convertor  yalla convertor wrapper; its embedded OPAL convertor supplies
 *            the packed size that becomes the stream length
 * stream_cb  callback stored as the stream callback of the request
 */
static void mca_pml_yalla_noncontig_req_init(mxm_req_base_t *mxm_req,
                                             mca_pml_yalla_convertor_t *convertor,
                                             mxm_stream_cb_t stream_cb)
{
    /* Stream length is whatever the convertor reports as its packed size. */
    opal_convertor_get_packed_size(&convertor->convertor,
                                   &mxm_req->data.stream.length);

    mxm_req->data.stream.cb = stream_cb;
    mxm_req->data_type      = MXM_REQ_DATA_STREAM;
}
Exemple #2
0
/*
 * Fast path: try to push a small message onto the wire immediately.
 *
 * Picks the next eager BTL of the endpoint and, if that BTL offers a
 * "sendi" entry point and the message is small enough, builds a MATCH
 * header and hands header plus convertor to mca_bml_base_sendi().
 *
 * Returns OMPI_ERR_NOT_AVAILABLE when the fast path cannot be used, the
 * error code reported by mca_bml_base_sendi() when the send fails, and
 * otherwise the number of packed bytes (cast to int).
 */
static inline int mca_pml_ob1_send_inline (const void *buf, size_t count,
                                           ompi_datatype_t * datatype,
                                           int dst, int tag, int16_t seqn,
                                           ompi_proc_t *dst_proc, mca_bml_base_endpoint_t* endpoint,
                                           ompi_communicator_t * comm)
{
    mca_bml_base_btl_t *eager_btl;
    mca_pml_ob1_match_hdr_t hdr;
    opal_convertor_t conv;
    size_t nbytes;
    int status;

    eager_btl = mca_bml_base_btl_array_get_next(&endpoint->btl_eager);
    if (NULL == eager_btl->btl->btl_sendi) {
        return OMPI_ERR_NOT_AVAILABLE;
    }

    /* Only messages of at most 256 bytes (type size * count) qualify;
     * the threshold is an arbitrary cut-off. */
    ompi_datatype_type_size (datatype, &nbytes);
    if (256 < (nbytes * count)) {
        return OMPI_ERR_NOT_AVAILABLE;
    }

    if (0 == count) {
        nbytes = 0;
    } else {
        /* Construct the convertor so that the later call to
         * opal_convertor_cleanup() operates on an initialized object. */
        OBJ_CONSTRUCT(&conv, opal_convertor_t);

        /* Clone the destination process' convertor and prepare it for
         * sending `count` elements of `datatype` starting at `buf`. */
        opal_convertor_copy_and_prepare_for_send (dst_proc->super.proc_convertor,
                                                  (const struct opal_datatype_t *) datatype,
                                                  count, buf, 0, &conv);
        opal_convertor_get_packed_size (&conv, &nbytes);
    }

    /* Fill in the MATCH header, then convert it for the destination. */
    mca_pml_ob1_match_hdr_prepare (&hdr, MCA_PML_OB1_HDR_TYPE_MATCH, 0,
                                   comm->c_contextid, comm->c_my_rank,
                                   tag, seqn);
    ob1_hdr_hton(&hdr, MCA_PML_OB1_HDR_TYPE_MATCH, dst_proc);

    /* Attempt the immediate send. */
    status = mca_bml_base_sendi (eager_btl, &conv, &hdr, OMPI_PML_OB1_MATCH_HDR_LEN,
                                 nbytes, MCA_BTL_NO_ORDER,
                                 MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP,
                                 MCA_PML_OB1_HDR_TYPE_MATCH, NULL);

    if (0 != count) {
        opal_convertor_cleanup (&conv);
    }

    if (OPAL_UNLIKELY(OMPI_SUCCESS != status)) {
        return status;
    }

    return (int) nbytes;
}
 /*
  * Select how MXM should receive the data described by the request's
  * convertor and fill in the MXM receive request accordingly.
  *
  *  - packed size of zero: plain buffer request, request buffer set to NULL
  *  - convertor needs buffers: stream request using
  *    ompi_mtl_mxm_stream_unpack as the callback
  *  - otherwise: plain buffer request pointing at the convertor's base
  *    buffer plus the first-element displacement of its descriptor
  *
  * The request's `buf` and `length` fields are updated as a side effect.
  * Always returns OMPI_SUCCESS.
  */
 static inline __opal_attribute_always_inline__ int
               ompi_mtl_mxm_choose_recv_datatype(mca_mtl_mxm_request_t *mtl_mxm_request)
{
    opal_convertor_t *conv = mtl_mxm_request->convertor;
    mxm_recv_req_t *rreq = &mtl_mxm_request->mxm.recv;

    opal_convertor_get_packed_size(conv, &mtl_mxm_request->length);

    if (0 == mtl_mxm_request->length) {
        /* Nothing to receive. */
        mtl_mxm_request->buf = NULL;
        mtl_mxm_request->length = 0;

        rreq->base.data_type = MXM_REQ_DATA_BUFFER;
    } else if (opal_convertor_need_buffers(conv)) {
        /* Data has to go through the unpack callback. */
        rreq->base.data_type = MXM_REQ_DATA_STREAM;
        rreq->base.data.stream.length = mtl_mxm_request->length;
        rreq->base.data.stream.cb = ompi_mtl_mxm_stream_unpack;
    } else {
        /* Receive straight into the user buffer. */
        rreq->base.data_type = MXM_REQ_DATA_BUFFER;

        mtl_mxm_request->buf = conv->pBaseBuf +
                conv->use_desc->desc[conv->use_desc->used].end_loop.first_elem_disp;

        rreq->base.data.buffer.ptr    = mtl_mxm_request->buf;
        rreq->base.data.buffer.length = mtl_mxm_request->length;
    }

    return OMPI_SUCCESS;
}
Exemple #4
0
/*
 * Select how MXM should send the data described by `convertor` and fill
 * in the MXM send request accordingly.
 *
 *  - packed size of zero: plain buffer request with a NULL pointer
 *  - convertor needs buffers: stream request using
 *    ompi_mtl_mxm_stream_send as the callback, with the convertor stored
 *    as the request context
 *  - otherwise: plain buffer request whose pointer is the iovec base
 *    produced by opal_convertor_pack() when called with a NULL base
 *
 * Always returns OMPI_SUCCESS.
 */
static inline __opal_attribute_always_inline__ int
               ompi_mtl_mxm_choose_send_datatype(mxm_send_req_t *mxm_send_req,
                                           opal_convertor_t *convertor)
{
    /* The packed size is written directly into the request's buffer length. */
    size_t *packed_len = &mxm_send_req->base.data.buffer.length;

    opal_convertor_get_packed_size(convertor, packed_len);

    if (0 == *packed_len) {
        mxm_send_req->base.data.buffer.ptr = NULL;
        mxm_send_req->base.data_type = MXM_REQ_DATA_BUFFER;
    } else if (opal_convertor_need_buffers(convertor)) {
        mxm_send_req->base.context = convertor;
        mxm_send_req->base.data_type = MXM_REQ_DATA_STREAM;
        /* Keep this order: the length is read through the buffer view
         * before the stream callback is stored. */
        mxm_send_req->base.data.stream.length = *packed_len;
        mxm_send_req->base.data.stream.cb = ompi_mtl_mxm_stream_send;
    } else {
        struct iovec vec;
        uint32_t vec_cnt = 1;

        mxm_send_req->base.data_type = MXM_REQ_DATA_BUFFER;

        vec.iov_base = NULL;
        vec.iov_len = *packed_len;

        opal_convertor_pack(convertor, &vec, &vec_cnt, packed_len);
        mxm_send_req->base.data.buffer.ptr = vec.iov_base;
    }

    return OMPI_SUCCESS;
}
Exemple #5
0
/*
 * Fast path: try to push a small message onto the wire immediately.
 *
 * Uses the next eager BTL of the endpoint. The fast path is refused with
 * OMPI_ERR_NOT_AVAILABLE when extent * count exceeds 256 or when the BTL
 * has no "sendi" entry point. On a failed send, any descriptor handed back
 * by mca_bml_base_sendi() is released and the error code is returned;
 * on success the number of packed bytes (cast to int) is returned.
 */
static inline int mca_pml_ob1_send_inline (void *buf, size_t count,
                                           ompi_datatype_t * datatype,
                                           int dst, int tag, int16_t seqn,
                                           ompi_proc_t *dst_proc, mca_bml_base_endpoint_t* endpoint,
                                           ompi_communicator_t * comm)
{
    mca_btl_base_descriptor_t *frag = NULL;
    mca_pml_ob1_match_hdr_t hdr;
    mca_bml_base_btl_t *eager_btl;
    OPAL_PTRDIFF_TYPE lb, extent;
    opal_convertor_t conv;
    size_t nbytes = 0;
    int status;

    eager_btl = mca_bml_base_btl_array_get_next(&endpoint->btl_eager);

    ompi_datatype_get_extent (datatype, &lb, &extent);

    /* Too large for the fast path? (256 is an arbitrary cut-off.) */
    if (OPAL_UNLIKELY((extent * count) > 256)) {
        return OMPI_ERR_NOT_AVAILABLE;
    }

    /* The BTL must provide an immediate-send entry point. */
    if (OPAL_UNLIKELY(!eager_btl->btl->btl_sendi)) {
        return OMPI_ERR_NOT_AVAILABLE;
    }

    if (0 != count) {
        /* Construct the convertor so that the later call to
         * opal_convertor_cleanup() operates on an initialized object. */
        OBJ_CONSTRUCT(&conv, opal_convertor_t);

        /* Clone the destination process' convertor and prepare it for
         * sending `count` elements of `datatype` starting at `buf`. */
        opal_convertor_copy_and_prepare_for_send (dst_proc->proc_convertor,
                                                  (const struct opal_datatype_t *) datatype,
                                                  count, buf, 0, &conv);
        opal_convertor_get_packed_size (&conv, &nbytes);
    }

    /* Fill in the MATCH header by hand. */
    match_hdr_fill:
    hdr.hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_MATCH;
    hdr.hdr_common.hdr_flags = 0;
    hdr.hdr_seq = seqn;
    hdr.hdr_tag = tag;
    hdr.hdr_src = comm->c_my_rank;
    hdr.hdr_ctx = comm->c_contextid;

    ob1_hdr_hton(&hdr, MCA_PML_OB1_HDR_TYPE_MATCH, dst_proc);

    /* Attempt the immediate send. */
    status = mca_bml_base_sendi (eager_btl, &conv, &hdr, OMPI_PML_OB1_MATCH_HDR_LEN,
                                 nbytes, MCA_BTL_NO_ORDER,
                                 MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP,
                                 MCA_PML_OB1_HDR_TYPE_MATCH, &frag);

    if (0 != count) {
        opal_convertor_cleanup (&conv);
    }

    if (OPAL_UNLIKELY(OMPI_SUCCESS != status)) {
        /* Give back any descriptor the BTL handed to us. */
        if (NULL != frag) {
            mca_bml_base_free (eager_btl, frag);
        }

        return status;
    }

    return (int) nbytes;
}
Exemple #6
0
/*
 * Allocate and fill the temporary buffer used by the linear scatter.
 *
 * The packed size of one contribution (pack_src_count elements of
 * pack_src_dtype) is computed first and stored in
 * request->u.scatter.packed_size.
 *
 *  - Root (my_rank == root_rank): allocates packed_size * size bytes and
 *    packs, for every index i in [0, size), the data found at
 *    pack_src_buf + pack_src_extent * pack_src_count * i into slot i of
 *    the buffer.
 *  - Non-root: allocates packed_size bytes; the buffer is left unfilled
 *    here.
 *
 * On success request->u.scatter.free_after is set to 1.
 *
 * comm             communicator, used to look up the local peer
 * request          scatter request whose u.scatter fields are read/written
 * portals4_module  not referenced by this function
 *
 * Returns OMPI_SUCCESS, or OMPI_ERR_OUT_OF_RESOURCE when malloc() fails.
 */
static int
setup_scatter_buffers_linear(struct ompi_communicator_t   *comm,
                             ompi_coll_portals4_request_t *request,
                             mca_coll_portals4_module_t   *portals4_module)
{
    int ret, line;

    int8_t i_am_root = (request->u.scatter.my_rank == request->u.scatter.root_rank);

    /* A throw-away convertor is used only to learn the packed size of one
     * contribution. */
    ompi_coll_portals4_create_send_converter (&request->u.scatter.send_converter,
            request->u.scatter.pack_src_buf,
            ompi_comm_peer_lookup(comm, request->u.scatter.my_rank),
            request->u.scatter.pack_src_count,
            request->u.scatter.pack_src_dtype);
    opal_convertor_get_packed_size(&request->u.scatter.send_converter, &request->u.scatter.packed_size);
    OBJ_DESTRUCT(&request->u.scatter.send_converter);

    /**********************************/
    /* Setup Scatter Buffers           */
    /**********************************/
    if (i_am_root) {

        /*
         * calculate the total size of the packed data
         */
        request->u.scatter.scatter_bytes=request->u.scatter.packed_size * (ptrdiff_t)request->u.scatter.size;

        /* all transfers done using request->u.scatter.sdtype.
         * allocate temp buffer for recv, copy and/or rotate data at the end */
        request->u.scatter.scatter_buf = (char *) malloc(request->u.scatter.scatter_bytes);
        if (NULL == request->u.scatter.scatter_buf) {
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            line = __LINE__;
            goto err_hdlr;
        }
        request->u.scatter.free_after = 1;

        /* Pack each contribution into its own slot of the buffer. */
        for (int32_t i=0; i<request->u.scatter.size; i++) {
            uint32_t iov_count = 1;
            struct iovec iov;
            size_t max_data;

            uint64_t offset = request->u.scatter.pack_src_extent * request->u.scatter.pack_src_count * i;

            /* uint64_t is not necessarily "unsigned long": cast so the
             * argument matches the conversion specifier. */
            opal_output_verbose(30, ompi_coll_base_framework.framework_output,
                                "%s:%d:rank(%d): offset(%llu)",
                                __FILE__, __LINE__, request->u.scatter.my_rank,
                                (unsigned long long) offset);

            ompi_coll_portals4_create_send_converter (&request->u.scatter.send_converter,
                    request->u.scatter.pack_src_buf + offset,
                    ompi_comm_peer_lookup(comm, request->u.scatter.my_rank),
                    request->u.scatter.pack_src_count,
                    request->u.scatter.pack_src_dtype);

            iov.iov_len = request->u.scatter.packed_size;
            iov.iov_base = (IOVBASE_TYPE *) ((char *)request->u.scatter.scatter_buf + (request->u.scatter.packed_size*i));
            opal_convertor_pack(&request->u.scatter.send_converter, &iov, &iov_count, &max_data);

            OBJ_DESTRUCT(&request->u.scatter.send_converter);
        }

        /* %p requires a void pointer; packed_size is a size_t (%zu);
         * scatter_bytes is cast so it matches %llu whatever its declared
         * integer type is. */
        opal_output_verbose(30, ompi_coll_base_framework.framework_output,
                            "%s:%d:rank(%d): root - scatter_buf(%p) - scatter_bytes(%llu)=packed_size(%zu) * size(%d)",
                            __FILE__, __LINE__, request->u.scatter.my_rank,
                            (void *) request->u.scatter.scatter_buf,
                            (unsigned long long) request->u.scatter.scatter_bytes,
                            request->u.scatter.packed_size, request->u.scatter.size);
    } else {
        request->u.scatter.scatter_bytes=request->u.scatter.packed_size;
        request->u.scatter.scatter_buf = (char *) malloc(request->u.scatter.scatter_bytes);
        if (NULL == request->u.scatter.scatter_buf) {
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            line = __LINE__;
            goto err_hdlr;
        }
        request->u.scatter.free_after = 1;

        opal_output_verbose(30, ompi_coll_base_framework.framework_output,
                            "%s:%d:rank(%d): leaf - scatter_buf(%p) - scatter_bytes(%llu)=packed_size(%zu)",
                            __FILE__, __LINE__, request->u.scatter.my_rank,
                            (void *) request->u.scatter.scatter_buf,
                            (unsigned long long) request->u.scatter.scatter_bytes,
                            request->u.scatter.packed_size);
    }

    return OMPI_SUCCESS;

err_hdlr:
    /* `line` is the source line at which the failure was detected. */
    opal_output(ompi_coll_base_framework.framework_output,
                "%s:%4d:%4d\tError occurred ret=%d, rank %2d",
                __FILE__, __LINE__, line, ret, request->u.scatter.my_rank);

    return ret;
}
Exemple #7
0
/**
 * Shared memory broadcast.
 *
 * For the root, the general algorithm is to wait for a set of
 * segments to become available.  Once it is, the root claims the set
 * by writing the current operation number and the number of processes
 * using the set to the flag.  The root then loops over the set of
 * segments; for each segment, it copies a fragment of the user's
 * buffer into the shared data segment and then writes the data size
 * into its children's control buffers.  The process is repeated until
 * all fragments have been written.
 *
 * For non-roots, for each set of buffers, they wait until the current
 * operation number appears in the in-use flag (i.e., written by the
 * root).  Then for each segment, they wait for a nonzero to appear
 * into their control buffers.  If they have children, they copy the
 * data from their parent's shared data segment into their shared data
 * segment, and write the data size into each of their children's
 * control buffers.  They then copy the data from their shared [local]
 * data segment into the user's output buffer.  The process is
 * repeated until all fragments have been received.  If they do not
 * have children, they copy the data directly from the parent's shared
 * data segment into the user's output buffer.
 *
 * @param buff      user buffer: source on the root, destination elsewhere
 * @param count     number of elements of @p datatype in @p buff
 * @param datatype  datatype of the elements
 * @param root      rank of the broadcast root in @p comm
 * @param comm      communicator the broadcast runs on
 * @param module    collective module; cast to mca_coll_sm_module_t
 *
 * @return OMPI_SUCCESS on completion, otherwise the error code returned
 *         by ompi_coll_sm_lazy_enable() or by the convertor preparation.
 */
int mca_coll_sm_bcast_intra(void *buff, int count,
                            struct ompi_datatype_t *datatype, int root,
                            struct ompi_communicator_t *comm,
                            mca_coll_base_module_t *module)
{
    struct iovec iov;
    mca_coll_sm_module_t *sm_module = (mca_coll_sm_module_t*) module;
    mca_coll_sm_comm_t *data;
    int i, ret, rank, size, num_children, src_rank;
    int flag_num, segment_num, max_segment_num;
    int parent_rank;
    size_t total_size, max_data, bytes;
    mca_coll_sm_in_use_flag_t *flag;
    opal_convertor_t convertor;
    mca_coll_sm_tree_node_t *me, *parent, **children;
    mca_coll_sm_data_index_t *index;

    /* Lazily enable the module the first time we invoke a collective
       on it */
    if (!sm_module->enabled) {
        if (OMPI_SUCCESS != (ret = ompi_coll_sm_lazy_enable(module, comm))) {
            return ret;
        }
    }
    data = sm_module->sm_comm_data;

    /* Setup some identities */

    rank = ompi_comm_rank(comm);
    size = ompi_comm_size(comm);

    OBJ_CONSTRUCT(&convertor, opal_convertor_t);
    iov.iov_len = mca_coll_sm_component.sm_fragment_size;
    bytes = 0;

    /* Tree nodes are indexed relative to the root: shift my rank so that
       the root maps to entry 0 */
    me = &data->mcb_tree[(rank + size - root) % size];
    parent = me->mcstn_parent;
    children = me->mcstn_children;
    num_children = me->mcstn_num_children;

    /* Only have one top-level decision as to whether I'm the root or
       not.  Do this at the slight expense of repeating a little logic
       -- but it's better than a conditional branch in every loop
       iteration. */

    /*********************************************************************
     * Root
     *********************************************************************/

    if (root == rank) {

        /* The root needs a send convertor to pack from the user's
           buffer to shared memory */

        if (OMPI_SUCCESS !=
            (ret =
             opal_convertor_copy_and_prepare_for_send(ompi_mpi_local_convertor,
                                                      &(datatype->super),
                                                      count,
                                                      buff,
                                                      0,
                                                      &convertor))) {
            /* The convertor was constructed above; release it before
               bailing out */
            OBJ_DESTRUCT(&convertor);
            return ret;
        }
        opal_convertor_get_packed_size(&convertor, &total_size);

        /* Main loop over sending fragments */

        do {
            flag_num = (data->mcb_operation_count++ %
                        mca_coll_sm_component.sm_comm_num_in_use_flags);

            FLAG_SETUP(flag_num, flag, data);
            FLAG_WAIT_FOR_IDLE(flag, bcast_root_label);
            FLAG_RETAIN(flag, size - 1, data->mcb_operation_count - 1);

            /* Loop over all the segments in this set */

            segment_num =
                flag_num * mca_coll_sm_component.sm_segs_per_inuse_flag;
            max_segment_num =
                (flag_num + 1) * mca_coll_sm_component.sm_segs_per_inuse_flag;
            do {
                index = &(data->mcb_data_index[segment_num]);

                /* Copy the fragment from the user buffer to my fragment
                   in the current segment */
                max_data = mca_coll_sm_component.sm_fragment_size;
                COPY_FRAGMENT_IN(convertor, index, rank, iov, max_data);
                bytes += max_data;

                /* Wait for the write to absolutely complete */
                opal_atomic_wmb();

                /* Tell my children that this fragment is ready */
                PARENT_NOTIFY_CHILDREN(children, num_children, index,
                                       max_data);

                ++segment_num;
            } while (bytes < total_size && segment_num < max_segment_num);
        } while (bytes < total_size);
    }

    /*********************************************************************
     * Non-root
     *********************************************************************/

    else {

        /* Non-root processes need a receive convertor to unpack from
           shared memory to the user's buffer */

        if (OMPI_SUCCESS !=
            (ret =
             opal_convertor_copy_and_prepare_for_recv(ompi_mpi_local_convertor,
                                                      &(datatype->super),
                                                      count,
                                                      buff,
                                                      0,
                                                      &convertor))) {
            /* The convertor was constructed above; release it before
               bailing out */
            OBJ_DESTRUCT(&convertor);
            return ret;
        }
        opal_convertor_get_packed_size(&convertor, &total_size);

        /* Loop over receiving (and possibly re-sending) the
           fragments */

        do {
            flag_num = (data->mcb_operation_count %
                        mca_coll_sm_component.sm_comm_num_in_use_flags);

            /* Wait for the root to mark this set of segments as
               ours */
            FLAG_SETUP(flag_num, flag, data);
            FLAG_WAIT_FOR_OP(flag, data->mcb_operation_count, bcast_nonroot_label1);
            ++data->mcb_operation_count;

            /* Loop over all the segments in this set */

            segment_num =
                flag_num * mca_coll_sm_component.sm_segs_per_inuse_flag;
            max_segment_num =
                (flag_num + 1) * mca_coll_sm_component.sm_segs_per_inuse_flag;
            do {

                /* Pre-calculate some values */
                parent_rank = (parent->mcstn_id + root) % size;
                index = &(data->mcb_data_index[segment_num]);

                /* Wait for my parent to tell me that the segment is ready */
                CHILD_WAIT_FOR_NOTIFY(rank, index, max_data, bcast_nonroot_label2);

                /* If I have children, send the data to them */
                if (num_children > 0) {
                    /* Copy the fragment from the parent's portion in
                       the segment to my portion in the segment. */
                    COPY_FRAGMENT_BETWEEN(parent_rank, rank, index, max_data);

                    /* Wait for the write to absolutely complete */
                    opal_atomic_wmb();

                    /* Tell my children that this fragment is ready */
                    PARENT_NOTIFY_CHILDREN(children, num_children, index,
                                           max_data);

                    /* Set the "copy from buffer" to be my local
                       segment buffer so that we don't potentially
                       incur a non-local memory copy from the parent's
                       fan out data segment [again] when copying to
                       the user's buffer */
                    src_rank = rank;
                }

                /* If I don't have any children, set the "copy from
                   buffer" to be my parent's fan out segment to copy
                   directly from my parent */

                else {
                    src_rank = parent_rank;
                }

                /* Copy to my output buffer */
                COPY_FRAGMENT_OUT(convertor, src_rank, index, iov, max_data);

                bytes += max_data;
                ++segment_num;
            } while (bytes < total_size && segment_num < max_segment_num);

            /* Wait for all copy-out writes to complete before I say
               I'm done with the segments */
            opal_atomic_wmb();

            /* We're finished with this set of segments */
            FLAG_RELEASE(flag);
        } while (bytes < total_size);
    }

    /* Kill the convertor */

    OBJ_DESTRUCT(&convertor);

    /* All done */

    return OMPI_SUCCESS;
}