/**
 * Build a receive-side (destination) descriptor that refers directly to the
 * convertor's current position in the user buffer.
 *
 * @param btl          (IN)     BTL module (not referenced here)
 * @param endpoint     (IN)     BTL peer addressing (not referenced here)
 * @param registration (IN)     Memory registration (not referenced here)
 * @param convertor    (IN)     Data type convertor supplying the buffer address
 * @param order        (IN)     Requested ordering (ignored; MCA_BTL_NO_ORDER is stored)
 * @param reserve      (IN)     Not referenced here
 * @param size         (IN/OUT) Bytes requested; clamped to UINT32_MAX on return
 * @param flags        (IN)     Descriptor flags, stored unchanged
 *
 * @return The prepared descriptor, or NULL if no fragment could be allocated.
 */
mca_btl_base_descriptor_t* mca_btl_tcp_prepare_dst(
    struct mca_btl_base_module_t* btl,
    struct mca_btl_base_endpoint_t* endpoint,
    struct mca_mpool_base_registration_t* registration,
    struct opal_convertor_t* convertor,
    uint8_t order,
    size_t reserve,
    size_t* size,
    uint32_t flags)
{
    mca_btl_tcp_frag_t* frag;

    /* a single segment cannot describe more than UINT32_MAX bytes here */
    if( OPAL_UNLIKELY(*size > UINT32_MAX) ) {
        *size = (size_t)UINT32_MAX;
    }

    MCA_BTL_TCP_FRAG_ALLOC_USER(frag);
    if( OPAL_UNLIKELY(NULL == frag) ) {
        return NULL;
    }

    /* the only segment covers the user buffer in place */
    opal_convertor_get_current_pointer( convertor, (void**)&(frag->segments[0].seg_addr.pval) );
    frag->segments[0].seg_len = *size;

    /* one local segment, no remote segments */
    frag->base.des_local        = frag->segments;
    frag->base.des_local_count  = 1;
    frag->base.des_remote       = NULL;
    frag->base.des_remote_count = 0;

    frag->base.order     = MCA_BTL_NO_ORDER;
    frag->base.des_flags = flags;

    return &frag->base;
}
/**
 * Prepare a destination descriptor whose first segment refers to the
 * convertor's current position in the user buffer.
 *
 * @param btl          (IN) BTL module (not referenced here)
 * @param endpoint     (IN) BTL peer addressing, recorded in the fragment
 * @param registration (IN) Memory registration (not referenced here)
 * @param convertor    (IN) Data type convertor supplying the buffer address
 * @param order        (IN) Ordering value stored in the descriptor
 * @param reserve      (IN) Not referenced here
 * @param size         (IN) Number of bytes the segment should describe
 * @param flags        (IN) Descriptor flags, stored unchanged
 *
 * @return The prepared descriptor, or NULL if no fragment could be allocated.
 */
struct mca_btl_base_descriptor_t *vader_prepare_dst(struct mca_btl_base_module_t *btl,
                                                    struct mca_btl_base_endpoint_t *endpoint,
                                                    struct mca_mpool_base_registration_t *registration,
                                                    struct opal_convertor_t *convertor,
                                                    uint8_t order, size_t reserve, size_t *size,
                                                    uint32_t flags)
{
    mca_btl_vader_frag_t *frag;
    void *user_buf;

    (void) MCA_BTL_VADER_FRAG_ALLOC_USER(frag);
    if (OPAL_UNLIKELY(NULL == frag)) {
        return NULL;
    }

    /* bookkeeping copied straight from the caller's arguments */
    frag->endpoint       = endpoint;
    frag->base.des_flags = flags;
    frag->base.order     = order;

    /* describe the user buffer in place */
    opal_convertor_get_current_pointer (convertor, &user_buf);
    frag->segments[0].seg_len       = *size;
    frag->segments[0].seg_addr.lval = (uint64_t)(uintptr_t) user_buf;

    return &frag->base;
}
Beispiel #3
0
/**
 * Prepare a descriptor for receiving data.
 *
 * The descriptor's single destination segment points directly at the
 * convertor's current position in the user buffer and spans
 * (reserve + *size) bytes.
 *
 * @return The prepared descriptor, or NULL if no fragment could be allocated.
 */
struct mca_btl_base_descriptor_t*
mca_btl_self_prepare_dst( struct mca_btl_base_module_t* btl,
                          struct mca_btl_base_endpoint_t* endpoint,
                          mca_mpool_base_registration_t* registration,
                          struct opal_convertor_t* convertor,
                          uint8_t order,
                          size_t reserve,
                          size_t* size,
                          uint32_t flags )
{
    mca_btl_self_frag_t* frag;
    void *user_buf;

    MCA_BTL_SELF_FRAG_ALLOC_RDMA(frag);
    if(OPAL_UNLIKELY(NULL == frag)) {
        return NULL;
    }

    /* the segment refers to the user buffer itself; nothing is copied */
    opal_convertor_get_current_pointer( convertor, &user_buf );
    frag->segment.seg_len       = reserve + *size;
    frag->segment.seg_addr.lval = (uint64_t)(uintptr_t) user_buf;

    frag->base.des_flags   = flags;
    frag->base.des_dst_cnt = 1;
    frag->base.des_dst     = &frag->segment;

    return &frag->base;
}
/**
 * This function always works in the local representation. This means no representation
 * conversion (i.e. no heterogeneity) has to be taken into account, and that all
 * lengths we're working on are local.
 */
int32_t
opal_convertor_raw( opal_convertor_t* pConvertor, 
		    struct iovec* iov, uint32_t* iov_count,
		    size_t* length )
{
    const opal_datatype_t *pData = pConvertor->pDesc;
    dt_stack_t* pStack;       /* pointer to the position on the stack */
    uint32_t pos_desc;        /* actual position in the description of the derived datatype */
    uint32_t count_desc;      /* the number of items already done in the actual pos_desc */
    dt_elem_desc_t* description, *pElem;
    unsigned char *source_base;  /* origin of the data */
    size_t raw_data = 0;      /* sum of raw data lengths in the iov_len fields */
    uint32_t index = 0, i;    /* the iov index and a simple counter */

    assert( (*iov_count) > 0 );
    if( OPAL_LIKELY(pConvertor->flags & CONVERTOR_NO_OP) ) {
        /* The convertor contain minimal informations, we only use the bConverted
         * to manage the conversion. This function work even after the convertor
         * was moved to a specific position.
         */
        opal_convertor_get_current_pointer( pConvertor, (void**)&iov[0].iov_base );
        iov[0].iov_len = pConvertor->local_size - pConvertor->bConverted;
        *length = iov[0].iov_len;
        pConvertor->bConverted = pConvertor->local_size;
        pConvertor->flags |= CONVERTOR_COMPLETED;
        *iov_count = 1;
        return 1;  /* we're done */
    }

    DO_DEBUG( opal_output( 0, "opal_convertor_raw( %p, {%p, %u}, %lu )\n", (void*)pConvertor,
                           (void*)iov, *iov_count, (unsigned long)*length ); );
Beispiel #5
0
/**
 * Prepare a send descriptor for the supplied convertor.
 *
 * Two strategies are used:
 *  - in-place: when the module advertises MCA_BTL_FLAGS_SEND_INPLACE, the
 *    convertor needs no intermediate buffers and reserve <= 128, the second
 *    segment points directly at the user data;
 *  - buffered: otherwise the data is packed into the fragment's own buffer,
 *    right after the reserved bytes, and *size is updated to the number of
 *    bytes actually packed.
 *
 * @return The prepared descriptor, or NULL if allocation or packing failed.
 */
static inline struct mca_btl_base_descriptor_t *
mca_btl_scif_prepare_src_send (struct mca_btl_base_module_t *btl,
                               mca_btl_base_endpoint_t *endpoint,
                               struct opal_convertor_t *convertor,
                               uint8_t order, size_t reserve, size_t *size,
                               uint32_t flags)
{
    mca_btl_scif_base_frag_t *frag = NULL;

    if (OPAL_LIKELY((mca_btl_scif_module.super.btl_flags & MCA_BTL_FLAGS_SEND_INPLACE) &&
                    !opal_convertor_need_buffers (convertor) &&
                    reserve <= 128)) {
        /* in-place send: header in segment 0, user data referenced by segment 1 */
        void *user_buf;

        opal_convertor_get_current_pointer (convertor, &user_buf);

        (void) MCA_BTL_SCIF_FRAG_ALLOC_DMA(endpoint, frag);
        if (OPAL_UNLIKELY(NULL == frag)) {
            return NULL;
        }

        frag->base.des_segment_count    = 2;
        frag->segments[0].seg_len       = reserve;
        frag->segments[1].seg_len       = *size;
        frag->segments[1].seg_addr.pval = user_buf;
    } else {
        /* buffered send: pack the payload behind the reserved header space */
        (void) MCA_BTL_SCIF_FRAG_ALLOC_EAGER(endpoint, frag);
        if (OPAL_UNLIKELY(NULL == frag)) {
            return NULL;
        }

        if (0 != *size) {
            uint32_t iov_count = 1;
            size_t packed_bytes = *size;
            struct iovec iov;
            int rc;

            iov.iov_base = (IOVBASE_TYPE *) ((uintptr_t) frag->segments[0].seg_addr.pval + reserve);
            iov.iov_len  = *size;

            rc = opal_convertor_pack (convertor, &iov, &iov_count, &packed_bytes);
            if (OPAL_UNLIKELY(rc < 0)) {
                mca_btl_scif_frag_return (frag);
                return NULL;
            }

            /* report how much was really packed */
            *size = packed_bytes;
        }

        frag->base.des_segment_count = 1;
        frag->segments[0].seg_len    = reserve + *size;
    }

    frag->base.des_flags    = flags;
    frag->base.order        = order;
    frag->base.des_segments = frag->segments;

    return &frag->base;
}
/**
 * Pack the data described by a convertor into the supplied iovec array.
 *
 * For a convertor flagged CONVERTOR_NO_OP the data is handled here: each iovec
 * entry either receives a pointer to the source data (when its iov_base is
 * NULL) or a copy of it.  Every other convertor is delegated to its fAdvance
 * callback.
 *
 * @return 0 if everything went fine and there is still data left to convert
 *           (an additional call with other buffers is needed),
 *         1 if everything went fine and the data was completely converted,
 *        -1 if something went wrong.
 */
int32_t opal_convertor_pack( opal_convertor_t* pConv,
                             struct iovec* iov, uint32_t* out_size,
                             size_t* max_data )
{
    OPAL_CONVERTOR_SET_STATUS_BEFORE_PACK_UNPACK( pConv, iov, out_size, max_data );

    if( OPAL_LIKELY(pConv->flags & CONVERTOR_NO_OP) ) {
        /*
         * Contiguous datatype in a homogeneous environment: the convertor
         * holds minimal information and only bConverted tracks the progress.
         */
        unsigned char* src;
        size_t remaining = pConv->local_size - pConv->bConverted;
        uint32_t idx;

        *max_data = remaining;
        opal_convertor_get_current_pointer( pConv, (void**)&src );

        for( idx = 0; idx < *out_size; idx++ ) {
            struct iovec* entry = &iov[idx];
            /* this entry is large enough to take everything that is left */
            const int finishes = (entry->iov_len >= remaining);

            if( finishes ) {
                entry->iov_len = remaining;
            }

            if( OPAL_LIKELY(NULL == entry->iov_base) ) {
                /* no buffer supplied: hand back a pointer into the source */
                entry->iov_base = (IOVBASE_TYPE *) src;
            } else {
#if OPAL_CUDA_SUPPORT
                MEMCPY_CUDA( entry->iov_base, src, entry->iov_len, pConv );
#else
                MEMCPY( entry->iov_base, src, entry->iov_len );
#endif
            }

            if( finishes ) {
                pConv->bConverted = pConv->local_size;
                *out_size = idx + 1;
                pConv->flags |= CONVERTOR_COMPLETED;
                return 1;
            }

            remaining -= entry->iov_len;
            src += entry->iov_len;
        }

        /* ran out of iovec entries before the data was exhausted */
        *max_data -= remaining;
        pConv->bConverted += (*max_data);
        return 0;
    }

    return pConv->fAdvance( pConv, iov, out_size, max_data );
}
Beispiel #7
0
/**
 * Prepare a descriptor for send/rdma using the supplied
 * convertor. If the convertor references data that is contiguous,
 * the descriptor may simply point to the user buffer. Otherwise,
 * this routine is responsible for allocating buffer space and
 * packing if required.
 *
 * @param btl (IN)          BTL module
 * @param endpoint (IN)     BTL peer addressing
 * @param convertor (IN)    Data type convertor
 * @param reserve (IN)      Additional bytes requested by upper layer to precede user data
 * @param size (IN/OUT)     Number of bytes to prepare (IN), number of bytes actually prepared (OUT)
 */
mca_btl_base_descriptor_t* mca_btl_udapl_prepare_dst(
    struct mca_btl_base_module_t* btl,
    struct mca_btl_base_endpoint_t* endpoint,
    struct mca_mpool_base_registration_t* registration,
    struct opal_convertor_t* convertor,
    uint8_t order,
    size_t reserve,
    size_t* size,
    uint32_t flags)
{
    mca_btl_udapl_frag_t* frag;
    int rc;

    MCA_BTL_UDAPL_FRAG_ALLOC_USER(btl, frag, rc);
    if(NULL == frag) {
        return NULL;
    }

    frag->segment.seg_len = *size;
    opal_convertor_get_current_pointer( convertor, (void**)&(frag->segment.seg_addr.pval) );

    if(NULL == registration) {
        /* didn't get a memory registration passed in, so must
         * register the region now
         */ 
        rc = btl->btl_mpool->mpool_register(btl->btl_mpool,
                                   frag->segment.seg_addr.pval,
                                   frag->segment.seg_len,
                                   0,
                                   &registration);
        if(OMPI_SUCCESS != rc || NULL == registration) {
            MCA_BTL_UDAPL_FRAG_RETURN_USER(btl,frag);
            return NULL;
        }
        frag->registration = (mca_btl_udapl_reg_t*)registration;        
    }

    frag->base.des_src = NULL;
    frag->base.des_src_cnt = 0;
    frag->base.des_dst = &frag->segment;
    frag->base.des_dst_cnt = 1;
    frag->base.des_flags = flags;

    frag->segment.seg_key.key32[0] =
        ((mca_btl_udapl_reg_t*)registration)->rmr_context;
    
    frag->base.order = MCA_BTL_NO_ORDER;

    return &frag->base;
}
Beispiel #8
0
static int mca_btl_self_sendi (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
                               struct opal_convertor_t *convertor, void *header, size_t header_size,
                               size_t payload_size, uint8_t order, uint32_t flags, mca_btl_base_tag_t tag,
                               mca_btl_base_descriptor_t **descriptor)
{
    mca_btl_base_descriptor_t *frag;

    if (!payload_size || !opal_convertor_need_buffers(convertor)) {
        void *data_ptr = NULL;
        if (payload_size) {
            opal_convertor_get_current_pointer (convertor, &data_ptr);
        }

        mca_btl_base_segment_t segments[2] = {{.seg_addr.pval = header, .seg_len = header_size},
                                              {.seg_addr.pval = data_ptr, .seg_len = payload_size}};
/**
 * Prepare a destination descriptor for the user buffer at the convertor's
 * current position.  When no registration is passed in, the buffer is
 * registered through the BTL's mpool and the registration is recorded in the
 * fragment.
 *
 * @return The prepared descriptor, or NULL if fragment allocation or memory
 *         registration failed.
 */
static mca_btl_base_descriptor_t *
mca_btl_ugni_prepare_dst (mca_btl_base_module_t *btl,
                          mca_btl_base_endpoint_t *endpoint,
                          mca_mpool_base_registration_t *registration,
                          opal_convertor_t *convertor, uint8_t order,
                          size_t reserve, size_t *size, uint32_t flags)
{
    mca_btl_ugni_base_frag_t *frag;
    mca_btl_ugni_reg_t *ugni_reg;
    void *user_buf;

    opal_convertor_get_current_pointer (convertor, &user_buf);

    (void) MCA_BTL_UGNI_FRAG_ALLOC_RDMA(endpoint, frag);
    if (OPAL_UNLIKELY(NULL == frag)) {
        return NULL;
    }

    /* always need to register the buffer for put/get (even for fma) */
    if (NULL == registration) {
        int rc = btl->btl_mpool->mpool_register(btl->btl_mpool,
                                                user_buf, *size, 0,
                                                &registration);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
            mca_btl_ugni_frag_return (frag);
            return NULL;
        }

        frag->registration = (mca_btl_ugni_reg_t*) registration;
    }
    ugni_reg = (mca_btl_ugni_reg_t *) registration;

    /* describe the registered user buffer in the first segment */
    frag->segments[0].base.seg_addr.lval = (uint64_t)(uintptr_t) user_buf;
    frag->segments[0].base.seg_len       = *size;
    frag->segments[0].memory_handle      = ugni_reg->memory_hdl;

    frag->base.des_flags   = flags;
    frag->base.order       = order;
    frag->base.des_dst_cnt = 1;
    frag->base.des_dst     = &frag->segments->base;

    return (struct mca_btl_base_descriptor_t *) frag;
}
Beispiel #10
0
/**
 * Prepare data for send.
 *
 * When the convertor needs intermediate buffers, the payload is packed into
 * the fragment right after the reserved bytes and *size is updated to the
 * number of bytes actually packed.  Otherwise only the reserved bytes are
 * allocated and a second segment refers to the user data in place.
 *
 * @param btl (IN)      BTL module
 *
 * @return The prepared descriptor, or NULL if allocation or packing failed.
 */
static struct mca_btl_base_descriptor_t *mca_btl_self_prepare_src (struct mca_btl_base_module_t* btl,
                                                                   struct mca_btl_base_endpoint_t *endpoint,
                                                                   struct opal_convertor_t *convertor,
                                                                   uint8_t order, size_t reserve,
                                                                   size_t *size, uint32_t flags)
{
    bool must_pack = opal_convertor_need_buffers(convertor);
    /* payload space is only needed inside the fragment when packing */
    size_t alloc_len = reserve + (must_pack ? *size : 0);
    mca_btl_self_frag_t *self_frag;

    self_frag = (mca_btl_self_frag_t *) mca_btl_self_alloc (btl, endpoint, order, alloc_len, flags);
    if (OPAL_UNLIKELY(NULL == self_frag)) {
        return NULL;
    }

    if (OPAL_UNLIKELY(must_pack)) {
        /* non-contiguous data: pack it behind the reserved header space */
        uint32_t iov_count = 1;
        size_t packed_bytes = *size;
        struct iovec iov;
        int rc;

        iov.iov_base = (IOVBASE_TYPE *) ((uintptr_t) self_frag->data + reserve);
        iov.iov_len  = *size;

        rc = opal_convertor_pack (convertor, &iov, &iov_count, &packed_bytes);
        if(rc < 0) {
            mca_btl_self_free (btl, &self_frag->base);
            return NULL;
        }

        self_frag->segments[0].seg_len = reserve + packed_bytes;
        *size = packed_bytes;

        return &self_frag->base;
    }

    /* contiguous data: reference the user buffer from a second segment */
    {
        void *user_buf;

        opal_convertor_get_current_pointer (convertor, &user_buf);

        self_frag->base.des_segment_count    = 2;
        self_frag->segments[1].seg_len       = *size;
        self_frag->segments[1].seg_addr.pval = user_buf;
    }

    return &self_frag->base;
}
Beispiel #11
0
/**
 * Start a send request whose buffer is handled as a CUDA buffer.
 *
 * With OPAL_CUDA_SUPPORT_41 the CONVERTOR_CUDA flag is cleared while the
 * convertor is asked whether it needs buffers, and set again afterwards.
 * If no buffers are needed, RDMA-capable CUDA BTLs are looked up and the
 * request is started with RDMA when any are found, or with a rendezvous of
 * `size` bytes otherwise.  If buffers are needed, or without
 * OPAL_CUDA_SUPPORT_41, a rendezvous carrying zero bytes is started.
 *
 * @return The status of whichever start routine was invoked.
 */
int mca_pml_bfo_send_request_start_cuda(mca_pml_bfo_send_request_t* sendreq,
                                        mca_bml_base_btl_t* bml_btl,
                                        size_t size) {
    int rc;
#if OPAL_CUDA_SUPPORT_41
    sendreq->req_send.req_base.req_convertor.flags &= ~CONVERTOR_CUDA;
    if (opal_convertor_need_buffers(&sendreq->req_send.req_base.req_convertor)) {
        /* Do not send anything with first rendezvous message as copying GPU
         * memory into RNDV message is expensive. */
        sendreq->req_send.req_base.req_convertor.flags |= CONVERTOR_CUDA;
        rc = mca_pml_bfo_send_request_start_rndv(sendreq, bml_btl, 0, 0);
    } else {
        unsigned char *base;

        opal_convertor_get_current_pointer( &sendreq->req_send.req_base.req_convertor, (void**)&base );
        /* Set flag back */
        sendreq->req_send.req_base.req_convertor.flags |= CONVERTOR_CUDA;

        sendreq->req_rdma_cnt = (uint32_t)mca_pml_bfo_rdma_cuda_btls(sendreq->req_endpoint,
                                                                     base,
                                                                     sendreq->req_send.req_bytes_packed,
                                                                     sendreq->req_rdma);
        if( 0 != sendreq->req_rdma_cnt ) {
            rc = mca_pml_bfo_send_request_start_rdma(sendreq, bml_btl,
                                                     sendreq->req_send.req_bytes_packed);
            if( OPAL_UNLIKELY(OMPI_SUCCESS != rc) ) {
                mca_pml_bfo_free_rdma_resources(sendreq);
            }
        } else {
            /* no RDMA BTL: rendezvous, marked contiguous when the BTL supports CUDA put */
            rc = mca_pml_bfo_send_request_start_rndv(sendreq, bml_btl, size,
                                                     (bml_btl->btl_flags & MCA_BTL_FLAGS_CUDA_PUT)
                                                         ? MCA_PML_BFO_HDR_FLAGS_CONTIG
                                                         : 0);
        }
    }
#else
    /* Just do the rendezvous but set initial data to be sent to zero */
    rc = mca_pml_bfo_send_request_start_rndv(sendreq, bml_btl, 0, 0);
#endif /* OPAL_CUDA_SUPPORT_41 */
    return rc;
}
/*
 * BTL 2.0 prepare_dst function (this function does not exist in BTL
 * 3.0).
 *
 * Allocates a put-destination fragment and makes its first remote
 * segment describe *size bytes starting at the convertor's current
 * position.  Returns NULL if no fragment could be allocated.
 */
mca_btl_base_descriptor_t*
opal_btl_usnic_prepare_dst(
    struct mca_btl_base_module_t* base_module,
    struct mca_btl_base_endpoint_t* endpoint,
    struct mca_mpool_base_registration_t* registration,
    struct opal_convertor_t* convertor,
    uint8_t order,
    size_t reserve,
    size_t* size,
    uint32_t flags)
{
    opal_btl_usnic_module_t *module = (opal_btl_usnic_module_t *)base_module;
    opal_btl_usnic_put_dest_frag_t *put_frag;
    void *data_ptr;

    /* grab a fragment to carry the destination description */
    put_frag = (opal_btl_usnic_put_dest_frag_t *)
        opal_btl_usnic_put_dest_frag_alloc(module);
    if (NULL == put_frag) {
        return NULL;
    }

    /* locate the start of the data and describe it in segment 0 */
    opal_convertor_get_current_pointer(convertor, &data_ptr);
    put_frag->uf_remote_seg[0].seg_len = *size;
    put_frag->uf_remote_seg[0].seg_addr.pval = data_ptr;

    put_frag->uf_base.des_flags   = flags;
    put_frag->uf_base.order       = order;

#if MSGDEBUG2
    opal_output(0, "prep_dst size=%d, addr=%p, pfrag=%p\n", (int)*size,
            data_ptr, (void *)put_frag);
#endif

    return &put_frag->uf_base;
}
Beispiel #13
0
/**
 * Pack data
 *
 * Prepares a send descriptor for the data referenced by the convertor.
 * Depending on whether the convertor needs buffers and on the configured
 * single-copy mechanism, the payload is packed through the convertor,
 * referenced in place (XPMEM single copy), or copied with memcpy.
 *
 * @param btl (IN)          BTL module
 * @param endpoint (IN)     BTL peer addressing, passed to the fragment allocators
 * @param convertor (IN)    Data type convertor describing the payload
 * @param order (IN)        Ordering value stored in the descriptor
 * @param reserve (IN)      Bytes kept free in front of the payload
 * @param size (IN/OUT)     Bytes requested; updated by the convertor when packing
 * @param flags (IN)        Descriptor flags, stored unchanged
 *
 * @return The prepared descriptor, or NULL if allocation or packing failed.
 */
static struct mca_btl_base_descriptor_t *vader_prepare_src (struct mca_btl_base_module_t *btl,
                                                            struct mca_btl_base_endpoint_t *endpoint,
                                                            struct opal_convertor_t *convertor,
                                                            uint8_t order, size_t reserve, size_t *size,
                                                            uint32_t flags)
{
    /* computed from the requested size, before any packing adjusts *size */
    const size_t total_size = reserve + *size;
    mca_btl_vader_frag_t *frag;
    unsigned char *fbox;
    void *data_ptr;
    int rc;

    opal_convertor_get_current_pointer (convertor, &data_ptr);

    /* choose between packing through the convertor and a direct copy/reference */
    if (OPAL_UNLIKELY(opal_convertor_need_buffers(convertor))) {
        uint32_t iov_count = 1;
        struct iovec iov;

        /* non-contiguous data requires the convertor */
        /* without XPMEM, requests above the eager limit get a max-size fragment */
        if (MCA_BTL_VADER_XPMEM != mca_btl_vader_component.single_copy_mechanism &&
            total_size > mca_btl_vader.super.btl_eager_limit) {
            (void) MCA_BTL_VADER_FRAG_ALLOC_MAX(frag, endpoint);
        } else
            (void) MCA_BTL_VADER_FRAG_ALLOC_EAGER(frag, endpoint);

        if (OPAL_UNLIKELY(NULL == frag)) {
            return NULL;
        }

        /* pack into the fragment buffer, leaving room for the reserved bytes */
        iov.iov_len = *size;
        iov.iov_base =
            (IOVBASE_TYPE *)(((uintptr_t)(frag->segments[0].seg_addr.pval)) +
                             reserve);

        rc = opal_convertor_pack (convertor, &iov, &iov_count, size);
        if (OPAL_UNLIKELY(rc < 0)) {
            MCA_BTL_VADER_FRAG_RETURN(frag);
            return NULL;
        }

        /* *size was passed to the convertor, so use it rather than total_size */
        frag->segments[0].seg_len = *size + reserve;
    } else {
        /* without XPMEM the fragment is sized by the eager limit; with XPMEM
         * a user fragment is allocated */
        if (MCA_BTL_VADER_XPMEM != mca_btl_vader_component.single_copy_mechanism) {
            if (OPAL_LIKELY(total_size <= mca_btl_vader.super.btl_eager_limit)) {
                (void) MCA_BTL_VADER_FRAG_ALLOC_EAGER(frag, endpoint);
            } else {
                (void) MCA_BTL_VADER_FRAG_ALLOC_MAX(frag, endpoint);
            }
        } else
            (void) MCA_BTL_VADER_FRAG_ALLOC_USER(frag, endpoint);

        if (OPAL_UNLIKELY(NULL == frag)) {
            return NULL;
        }

/* NOTE: the if/else below is split across this preprocessor conditional; when
 * XPMEM support is compiled out, only the inline-send body remains. */
#if OPAL_BTL_VADER_HAVE_XPMEM
        /* use xpmem to send this segment if it is above the max inline send size */
        if (OPAL_UNLIKELY(MCA_BTL_VADER_XPMEM == mca_btl_vader_component.single_copy_mechanism &&
                          total_size > (size_t) mca_btl_vader_component.max_inline_send)) {
            /* single copy send */
            frag->hdr->flags = MCA_BTL_VADER_FLAG_SINGLE_COPY;

            /* set up single copy io vector */
            frag->hdr->sc_iov.iov_base = data_ptr;
            frag->hdr->sc_iov.iov_len  = *size;

            /* segment 0 holds the reserved bytes, segment 1 refers to the user data */
            frag->segments[0].seg_len = reserve;
            frag->segments[1].seg_len = *size;
            frag->segments[1].seg_addr.pval = data_ptr;
            frag->base.des_segment_count = 2;
        } else {
#endif

            /* inline send */
            if (OPAL_LIKELY(MCA_BTL_DES_FLAGS_BTL_OWNERSHIP & flags)) {
                /* try to reserve a fast box for this transfer only if the
                 * fragment does not belong to the caller */
                fbox = mca_btl_vader_reserve_fbox (endpoint, total_size);
                if (OPAL_LIKELY(fbox)) {
                    /* redirect the segment to the fast box so the copy below lands there */
                    frag->segments[0].seg_addr.pval = fbox;
                }

                /* recorded even when the reservation failed (fbox is then NULL) */
                frag->fbox = fbox;
            }

            /* NTH: the convertor adds some latency so we bypass it here */
            memcpy ((void *)((uintptr_t)frag->segments[0].seg_addr.pval + reserve), data_ptr, *size);
            frag->segments[0].seg_len = total_size;
#if OPAL_BTL_VADER_HAVE_XPMEM
        }
#endif
    }

    frag->base.order       = order;
    frag->base.des_flags   = flags;

    return &frag->base;
}
/**
 * Initiate an inline send to the peer.
 *
 * When the payload does not have to be packed, a fast-box send is attempted
 * first.  If that is not possible, a fragment is allocated, the header and the
 * payload are copied (or packed) into it, and the fragment is queued on the
 * active-sends list and written to the peer's FIFO.
 *
 * @param btl (IN)      BTL module
 * @param peer (IN)     BTL peer addressing
 *
 * @return OMPI_SUCCESS once the data has been handed off, or
 *         OMPI_ERR_OUT_OF_RESOURCE if no fragment could be allocated.
 *         *descriptor is always set to NULL.
 */
int mca_btl_vader_sendi (struct mca_btl_base_module_t *btl,
                         struct mca_btl_base_endpoint_t *endpoint,
                         struct opal_convertor_t *convertor,
                         void *header, size_t header_size,
                         size_t payload_size, uint8_t order,
                         uint32_t flags, mca_btl_base_tag_t tag,
                         mca_btl_base_descriptor_t **descriptor)
{
    const size_t length = header_size + payload_size;
    mca_btl_vader_frag_t *frag;
    void *data_ptr = NULL;
    int must_pack;

    assert (length < mca_btl_vader_component.eager_limit);
    assert (0 == (flags & MCA_BTL_DES_SEND_ALWAYS_CALLBACK));

    /* we won't ever return a descriptor */
    *descriptor = NULL;

    /* packing is only needed for a non-empty payload the convertor must buffer */
    must_pack = payload_size && opal_convertor_need_buffers (convertor);

    if (OPAL_LIKELY(!must_pack)) {
        if (payload_size) {
            opal_convertor_get_current_pointer (convertor, &data_ptr);
        }

        /* fast path: done if the fast box took the message */
        if (mca_btl_vader_fbox_sendi (endpoint, tag, header, header_size, data_ptr, payload_size)) {
            return OMPI_SUCCESS;
        }
    }

    /* allocate a fragment, giving up if we can't get one */
    frag = (mca_btl_vader_frag_t *) mca_btl_vader_alloc (btl, endpoint, order, length,
                                                         flags | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
    if (OPAL_UNLIKELY(NULL == frag)) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    /* fill in fragment fields */
    frag->hdr->tag = tag;
    frag->hdr->len = length;

    /* write the match header (with MPI comm/tag/etc. info) */
    memcpy (frag->segment.seg_addr.pval, header, header_size);

    /* write the message data if there is any */
    /*
      We can add MEMCHECKER calls before and after the packing.
    */
    /* we can't use single-copy semantics here since the caller will consider
       the send complete if we return success */
    if (OPAL_UNLIKELY(must_pack)) {
        /* pack the data into the fragment, right behind the header */
        uint32_t iov_count = 1;
        size_t max_data = payload_size;
        struct iovec iov;

        iov.iov_len  = payload_size;
        iov.iov_base = (IOVBASE_TYPE *)((uintptr_t)frag->segment.seg_addr.pval + header_size);

        (void) opal_convertor_pack (convertor, &iov, &iov_count, &max_data);

        assert (max_data == payload_size);
    } else if (payload_size) {
        /* bypassing the convertor may speed things up a little */
        opal_convertor_get_current_pointer (convertor, &data_ptr);
        memcpy ((void *)((uintptr_t)frag->segment.seg_addr.pval + header_size), data_ptr, payload_size);
    }

    opal_list_append (&mca_btl_vader_component.active_sends, (opal_list_item_t *) frag);

    /* write the fragment pointer to the peer's FIFO */
    vader_fifo_write (frag->hdr, endpoint->peer_smp_rank);

    /* the progress function will return the fragment */

    return OMPI_SUCCESS;
}
/**
 * Pack data
 *
 * Prepares a source descriptor for the data referenced by the convertor.
 * With a non-zero reserve the descriptor is a send fragment: the payload is
 * packed through the convertor, referenced in place (single copy), or copied
 * inline (possibly into a fast box).  With a zero reserve the descriptor is a
 * put/get fragment that refers to the user buffer.
 *
 * @param btl (IN)           BTL module
 * @param endpoint (IN)      BTL peer addressing, recorded in the fragment
 * @param registration (IN)  Memory registration (not referenced here)
 * @param convertor (IN)     Data type convertor describing the payload
 * @param order (IN)         Ordering value stored in the descriptor
 * @param reserve (IN)       Bytes kept free in front of the payload
 * @param size (IN/OUT)      Bytes requested; updated by the convertor when packing
 * @param flags (IN)         Descriptor flags, stored unchanged
 *
 * @return The prepared descriptor, or NULL if allocation or packing failed.
 */
static struct mca_btl_base_descriptor_t *vader_prepare_src (struct mca_btl_base_module_t *btl,
                                                            struct mca_btl_base_endpoint_t *endpoint,
                                                            mca_mpool_base_registration_t *registration,
                                                            struct opal_convertor_t *convertor,
                                                            uint8_t order, size_t reserve, size_t *size,
                                                            uint32_t flags)
{
    /* based on the REQUESTED size; stale once the convertor has adjusted *size */
    const size_t total_size = reserve + *size;
    struct iovec iov;
    mca_btl_vader_frag_t *frag;
    uint32_t iov_count = 1;
    void *data_ptr, *fbox_ptr;
    int rc;

    opal_convertor_get_current_pointer (convertor, &data_ptr);

    if (OPAL_LIKELY(reserve)) {
        /* send fragment */
        if (OPAL_UNLIKELY(opal_convertor_need_buffers(convertor))) {
            /* non-contiguous data requires the convertor */
            (void) MCA_BTL_VADER_FRAG_ALLOC_EAGER(frag);
            if (OPAL_UNLIKELY(NULL == frag)) {
                return NULL;
            }

            /* pack into the fragment buffer, leaving room for the reserved bytes */
            iov.iov_len = *size;
            iov.iov_base =
                (IOVBASE_TYPE *)(((uintptr_t)(frag->segments[0].seg_addr.pval)) +
                                 reserve);

            rc = opal_convertor_pack (convertor, &iov, &iov_count, size);
            if (OPAL_UNLIKELY(rc < 0)) {
                MCA_BTL_VADER_FRAG_RETURN(frag);
                return NULL;
            }

            /* The convertor writes the number of bytes it actually packed back
             * into *size, which may be less than what was requested.  The
             * segment length must reflect that, not the pre-pack total_size. */
            frag->segments[0].seg_len = reserve + *size;
        } else {
            (void) MCA_BTL_VADER_FRAG_ALLOC_USER(frag);
            if (OPAL_UNLIKELY(NULL == frag)) {
                return NULL;
            }

            if (total_size > (size_t) mca_btl_vader_component.max_inline_send) {
                /* single copy send */
                frag->hdr->flags = MCA_BTL_VADER_FLAG_SINGLE_COPY;

                /* set up single copy io vector */
                frag->hdr->sc_iov.iov_base = data_ptr;
                frag->hdr->sc_iov.iov_len  = *size;

                /* segment 0 holds the reserved bytes, segment 1 refers to the user data */
                frag->segments[0].seg_len = reserve;
                frag->segments[1].seg_len = *size;
                frag->segments[1].seg_addr.pval = data_ptr;
                frag->base.des_src_cnt = 2;
            } else {
                /* inline send */
                /* try to reserve a fast box for this transfer */
                fbox_ptr = mca_btl_vader_reserve_fbox (endpoint, total_size);

                if (fbox_ptr) {
                    /* redirect the segment so the copy below lands in the fast box */
                    frag->hdr->flags |= MCA_BTL_VADER_FLAG_FBOX;
                    frag->segments[0].seg_addr.pval = fbox_ptr;
                }

                /* NTH: the convertor adds some latency so we bypass it here */
                vader_memmove ((void *)((uintptr_t)frag->segments[0].seg_addr.pval + reserve),
                               data_ptr, *size);
                frag->segments[0].seg_len = total_size;
            }
        }
    } else {
        /* put/get fragment */
        (void) MCA_BTL_VADER_FRAG_ALLOC_USER(frag);
        if (OPAL_UNLIKELY(NULL == frag)) {
            return NULL;
        }

        frag->segments[0].seg_addr.lval = (uint64_t)(uintptr_t) data_ptr;
        frag->segments[0].seg_len       = total_size;
    }

    frag->base.order       = order;
    frag->base.des_flags   = flags;

    frag->endpoint = endpoint;

    return &frag->base;
}
/* Handles "small" frags (reserve + *size <= max_frag_payload) in the same
 * manner as btl_prepare_src.  Must report a smaller amount than requested (via
 * *size) if the given convertor cannot process the entire (*size).
 *
 * Returns the prepared send fragment, or NULL if none could be allocated.
 */
static inline opal_btl_usnic_send_frag_t *
prepare_src_small(
    struct opal_btl_usnic_module_t* module,
    struct mca_btl_base_endpoint_t* endpoint,
    struct opal_convertor_t* convertor,
    uint8_t order,
    size_t reserve,
    size_t* size,
    uint32_t flags)
{
    opal_btl_usnic_small_send_frag_t *sfrag;
    opal_btl_usnic_send_frag_t *frag;

    /* precondition: the whole request fits into one fragment payload */
    assert(*size + reserve <= module->max_frag_payload);

    sfrag = opal_btl_usnic_small_send_frag_alloc(module);
    if (OPAL_UNLIKELY(NULL == sfrag)) {
        return NULL;
    }
    frag = &sfrag->ssf_base;

    frag->sf_endpoint = endpoint;
    frag->sf_base.uf_base.des_flags = flags;

    /* When a convertor is needed, the data is copied in right now, because
     * that is the cheapest way to find out how much can actually be sent (it
     * has to be packed later anyway).  The alternative would require all of:
     * 1) clone_with_position(convertor) to see where the new position ends up
     *    (see opal_btl_usnic_convertor_pack_peek); otherwise the contract
     *    w.r.t. (*size) is not fulfilled.
     * 2) A number of extra branches for the different cases, both here and in
     *    progress_sends.
     * 3) If packing were deferred, a clone of the convertor, because the PML
     *    owns it and might reuse it for another prepare_src call.
     *
     * Two convertor clones are likely at least as slow as just copying the
     * data and might consume a similar amount of memory, and the data would
     * still have to be packed later in order to send it.
     *
     * A buffer that needs no convertor is NOT copied at this point, because
     * INLINE might still be used for the send, in which case the data should
     * not be copied at all.
     */
    if (OPAL_UNLIKELY(opal_convertor_need_buffers(convertor))) {
        /* put user data just after end of 1st seg (upper layer header) */
        usnic_convertor_pack_simple(
                convertor,
                (IOVBASE_TYPE*)(intptr_t)(frag->sf_base.uf_local_seg[0].seg_addr.lval + reserve),
                *size,
                size);

        /* a single segment: the PML copies its header into the beginning */
        frag->sf_base.uf_base.USNIC_SEND_LOCAL_COUNT = 1;
        frag->sf_base.uf_local_seg[0].seg_len = reserve + *size;
    } else {
        /* two segments: header space first, then the user data in place */
        opal_convertor_get_current_pointer(convertor,
                                           &frag->sf_base.uf_local_seg[1].seg_addr.pval);
        frag->sf_base.uf_base.USNIC_SEND_LOCAL_COUNT = 2;
        frag->sf_base.uf_local_seg[1].seg_len = *size;
        frag->sf_base.uf_local_seg[0].seg_len = reserve;
    }

    return frag;
}
/* Builds the send fragment for a "large" message (reserve + *size >
 * max_frag_payload), handling it in the same manner as btl_prepare_src.
 * On return, *size must hold a smaller amount than was requested if the
 * given convertor cannot process the entire (*size).
 *
 * SG[0] of the returned fragment describes the upper-layer (PML) header and
 * SG[1] describes the payload.  Returns NULL when no large send fragment
 * could be allocated.  The "order" argument is not used here.
 */
static opal_btl_usnic_send_frag_t *
prepare_src_large(
    struct opal_btl_usnic_module_t* module,
    struct mca_btl_base_endpoint_t* endpoint,
    struct opal_convertor_t* convertor,
    uint8_t order,
    size_t reserve,
    size_t* size,
    uint32_t flags)
{
    opal_btl_usnic_large_send_frag_t *large_frag;
    opal_btl_usnic_send_frag_t *send_frag;

    /* Obtain the fragment that will hold this message */
    large_frag = opal_btl_usnic_large_send_frag_alloc(module);
    if (OPAL_UNLIKELY(NULL == large_frag)) {
        return NULL;
    }
    send_frag = &large_frag->lsf_base;

    /* Two SG entries: header location in SG[0], payload in SG[1].  When a
     * convertor is in use, SG[1].seg_len is accurate but seg_addr is NULL. */
    send_frag->sf_base.uf_base.USNIC_SEND_LOCAL_COUNT = 2;

    /* the PML writes its header into the space embedded in the large frag */
    send_frag->sf_base.uf_local_seg[0].seg_len = reserve;
    send_frag->sf_base.uf_local_seg[0].seg_addr.pval = &large_frag->lsf_ompi_header;
    /* the upper-layer header has to fit in that space */
    assert(reserve <= sizeof(large_frag->lsf_ompi_header));

    if (OPAL_UNLIKELY(opal_convertor_need_buffers(convertor))) {
        /* a threshold of -1 means "always pack eagerly" */
        const bool pack_lazily =
            mca_btl_usnic_component.pack_lazy_threshold >= 0 &&
            *size >= (size_t)mca_btl_usnic_component.pack_lazy_threshold;

        if (!pack_lazily) {
            /* eager: pack the whole convertor into a chain of segments right
             * now, keeping room for the PML header in the first segment */
            send_frag->sf_base.uf_local_seg[0].seg_addr.pval =
                pack_chunk_seg_chain_with_reserve(module, large_frag, reserve,
                                                  convertor, *size, size);
        } else {
            int clone_rc;

            MSGDEBUG1_OUT("packing frag %p on the fly", (void *)send_frag);
            large_frag->lsf_pack_on_the_fly = true;

            /* report to the PML how much we will absorb; this is as much as
             * possible without splitting an indivisible convertor element */
            *size = opal_btl_usnic_convertor_pack_peek(convertor, *size);

            /* The BTL does not own the convertor and the PML might mutate it
             * once we return, so keep a private clone for the later pack. */
            clone_rc = opal_convertor_clone(convertor, &send_frag->sf_convertor,
                                            /*copy_stack=*/true);
            if (OPAL_UNLIKELY(OPAL_SUCCESS != clone_rc)) {
                BTL_ERROR(("unexpected convertor clone error"));
                abort(); /* XXX */
            }
        }

        /* SG[1] becomes {NULL,bytes_packed} so that size calculations done by
         * both the PML and this BTL stay correct; e.g. the PML sums the bytes
         * in the descriptor segments to decide whether an MPI-level request
         * is complete. */
        send_frag->sf_base.uf_local_seg[1].seg_addr.pval = NULL;
    } else {
        /* no convertor work needed: SG[1] points straight at the payload */
        large_frag->lsf_pack_on_the_fly = true;
        opal_convertor_get_current_pointer(convertor,
                                           &send_frag->sf_base.uf_local_seg[1].seg_addr.pval);
    }

    /* in every case SG[1] carries the (possibly reduced) payload length */
    send_frag->sf_base.uf_local_seg[1].seg_len = *size;

    send_frag->sf_endpoint = endpoint;
    send_frag->sf_base.uf_base.des_flags = flags;

    return send_frag;
}
/**
 * Initiate an inline send to the peer.
 *
 * When there is no payload, or the convertor does not need buffers, the
 * message is first offered to mca_btl_vader_fbox_sendi(); if that call
 * reports success nothing else is done.  Otherwise a fragment is allocated,
 * the header is copied into it, the payload (if any) is packed right behind
 * the header, and the fragment header is written to the endpoint's FIFO.
 *
 * @param btl (IN)          BTL module
 * @param endpoint (IN)     BTL peer addressing
 * @param convertor (IN)    Convertor describing the payload
 * @param header (IN)       Header to place at the start of the fragment
 * @param header_size (IN)  Size of the header in bytes
 * @param payload_size (IN) Size of the payload in bytes (may be 0)
 * @param order (IN)        Ordering value passed on to the fragment allocator
 * @param flags (IN)        Descriptor flags; MCA_BTL_DES_FLAGS_BTL_OWNERSHIP
 *                          is added when the fragment is allocated
 * @param tag (IN)          Tag stored in the fragment header
 * @param descriptor (OUT)  Set to NULL if no fragment could be allocated;
 *                          left untouched otherwise.  A NULL pointer is
 *                          tolerated and simply skipped.
 *
 * @return OMPI_SUCCESS if the message was handed off,
 *         OMPI_ERR_OUT_OF_RESOURCE if no fragment could be allocated.
 */
int mca_btl_vader_sendi (struct mca_btl_base_module_t *btl,
                         struct mca_btl_base_endpoint_t *endpoint,
                         struct opal_convertor_t *convertor,
                         void *header, size_t header_size,
                         size_t payload_size, uint8_t order,
                         uint32_t flags, mca_btl_base_tag_t tag,
                         mca_btl_base_descriptor_t **descriptor)
{
    mca_btl_vader_frag_t *frag;
    void *data_ptr = NULL;
    size_t length;

    /* the current data pointer is only meaningful when there is a payload */
    if (payload_size) {
        opal_convertor_get_current_pointer (convertor, &data_ptr);
    }

    /* try mca_btl_vader_fbox_sendi first, but only when the payload can be
       read directly from data_ptr (no payload, or no packing needed) */
    if (!(payload_size && opal_convertor_need_buffers (convertor)) &&
        mca_btl_vader_fbox_sendi (endpoint, tag, header, header_size, data_ptr, payload_size)) {
        return OMPI_SUCCESS;
    }

    length = header_size + payload_size;

    /* allocate a fragment, giving up if we can't get one */
    frag = (mca_btl_vader_frag_t *) mca_btl_vader_alloc (btl, endpoint, order, length,
                                                         flags | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
    if (OPAL_UNLIKELY(NULL == frag)) {
        /* guard the out-parameter: do not dereference it if the caller
           supplied no descriptor pointer */
        if (NULL != descriptor) {
            *descriptor = NULL;
        }

        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    /* fill in fragment fields */
    frag->hdr->len = length;
    frag->hdr->tag = tag;

    /* write the match header (with MPI comm/tag/etc. info) */
    memcpy (frag->segments[0].seg_addr.pval, header, header_size);

    /* write the message data if there is any */
    /* we can't use single-copy semantics here since the caller will consider the send
       complete when we return */
    if (payload_size) {
        uint32_t iov_count = 1;
        struct iovec iov;

        /* pack the data into the fragment, directly after the header */
        iov.iov_base = (IOVBASE_TYPE *)((uintptr_t)frag->segments[0].seg_addr.pval + header_size);
        iov.iov_len  = length = payload_size;

        (void) opal_convertor_pack (convertor, &iov, &iov_count, &length);

        /* length now holds the number of bytes packed; all of it must fit */
        assert (length == payload_size);
    }

    /* write the fragment pointer to the peer's FIFO. the progress function will return the fragment */
    vader_fifo_write_ep (frag->hdr, endpoint);

    return OMPI_SUCCESS;
}