Example #1
int mca_btl_vader_get_cma (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint, void *local_address,
                           uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
                           mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
                           int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata)
{
    struct iovec src_iov = {.iov_base = (void *)(intptr_t) remote_address, .iov_len = size};
    struct iovec dst_iov = {.iov_base = local_address, .iov_len = size};
    ssize_t ret;

    /*
     * According to the man page:
     * "On success, process_vm_readv() returns the number of bytes read and
     * process_vm_writev() returns the number of bytes written.  This return
     * value may be less than the total number of requested bytes, if a
     * partial read/write occurred.  (Partial transfers apply at the
     * granularity of iovec elements.  These system calls won't perform a
     * partial transfer that splits a single iovec element.)"
     * Since we use a single iovec element, the returned size should be
     * either 0 or size, and the do loop should not be needed here.
     * We tried on various Linux kernels with size > 2 GB, and surprisingly,
     * the returned value is always 0x7ffff000 (fwiw, that is the largest
     * number of whole pages that fits in a signed 32-bit integer, and on
     * 4 KiB page systems it equals the kernel's MAX_RW_COUNT cap on a
     * single transfer).
     * We do not know whether this is a bug in the kernel, the libc or even
     * the man page, but for the time being, we act as if process_vm_readv()
     * could return any value.
     */
    do {
        ret = process_vm_readv (endpoint->segment_data.other.seg_ds->seg_cpid, &dst_iov, 1, &src_iov, 1, 0);
        if (0 > ret) {
            opal_output(0, "Read %ld, expected %lu, errno = %d\n", (long)ret, (unsigned long)size, errno);
            return OPAL_ERROR;
        }
        src_iov.iov_base = (void *)((char *)src_iov.iov_base + ret);
        src_iov.iov_len -= ret;
        dst_iov.iov_base = (void *)((char *)dst_iov.iov_base + ret);
        dst_iov.iov_len -= ret;
    } while (0 < src_iov.iov_len);

    /* always call the callback function */
    cbfunc (btl, endpoint, local_address, local_handle, cbcontext, cbdata, OPAL_SUCCESS);

    return OPAL_SUCCESS;
}
#endif
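
/*
 * A minimal standalone sketch (not part of the BTL) of the same defensive
 * loop: it points process_vm_readv() at the calling process itself, so the
 * pid and buffers are purely illustrative. Build with -D_GNU_SOURCE.
 */
#include <errno.h>
#include <stdio.h>
#include <sys/uio.h>
#include <unistd.h>

int main (void)
{
    char src[64] = "payload in the (here: our own) remote address space";
    char dst[64] = {0};
    struct iovec src_iov = {.iov_base = src, .iov_len = sizeof (src)};
    struct iovec dst_iov = {.iov_base = dst, .iov_len = sizeof (dst)};
    ssize_t ret;

    /* advance both iovecs until everything has been transferred, exactly
     * as mca_btl_vader_get_cma() does above */
    do {
        ret = process_vm_readv (getpid (), &dst_iov, 1, &src_iov, 1, 0);
        if (0 > ret) {
            fprintf (stderr, "process_vm_readv failed, errno = %d\n", errno);
            return 1;
        }
        src_iov.iov_base = (char *) src_iov.iov_base + ret;
        src_iov.iov_len -= ret;
        dst_iov.iov_base = (char *) dst_iov.iov_base + ret;
        dst_iov.iov_len -= ret;
    } while (0 < src_iov.iov_len);

    printf ("read back: %s\n", dst);
    return 0;
}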

#if OPAL_BTL_VADER_HAVE_KNEM
int mca_btl_vader_get_knem (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint, void *local_address,
                            uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
                            mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
                            int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata)
{
    struct knem_cmd_param_iovec recv_iovec;
    struct knem_cmd_inline_copy icopy;

    /* Fill in the ioctl data fields.  There's no async completion, so
       we don't need to worry about getting a slot, etc. */
    recv_iovec.base = (uintptr_t) local_address;
    recv_iovec.len = size;
    icopy.local_iovec_array = (uintptr_t) &recv_iovec;
    icopy.local_iovec_nr    = 1;
    icopy.remote_cookie     = remote_handle->cookie;
    icopy.remote_offset     = remote_address - remote_handle->base_addr;
    icopy.write             = 0;
    icopy.flags             = 0;

    /* Use the DMA flag if knem supports it *and* the segment length
     * is greater than the cutoff. Note that if DMA is not supported
     * or the user specified 0 for knem_dma_min, knem_dma_min was
     * set to UINT_MAX in mca_btl_vader_knem_init. */
    if (mca_btl_vader_component.knem_dma_min <= size) {
        icopy.flags = KNEM_FLAG_DMA;
    }
    /* synchronous flags only, no need to specify icopy.async_status_index */

    /* When the ioctl returns, the transfer is done and we can invoke
       the btl callback and return the frag */
    if (OPAL_UNLIKELY(0 != ioctl (mca_btl_vader.knem_fd, KNEM_CMD_INLINE_COPY, &icopy))) {
        return OPAL_ERROR;
    }

    if (KNEM_STATUS_FAILED == icopy.current_status) {
        return OPAL_ERROR;
    }

    /* always call the callback function */
    cbfunc (btl, endpoint, local_address, local_handle, cbcontext, cbdata, OPAL_SUCCESS);

    return OPAL_SUCCESS;
}
#endif
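
/*
 * For context: the remote_handle->cookie consumed above is created in the
 * peer process with KNEM_CMD_CREATE_REGION (in this BTL that happens at
 * registration time). A hedged sketch of that side, assuming knem_io.h is
 * on the include path, knem_fd is an open descriptor on /dev/knem, and
 * with error handling reduced to a sentinel return:
 */
#include <stddef.h>
#include <stdint.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <knem_io.h>

static uint64_t register_knem_region (int knem_fd, void *base, size_t size)
{
    struct knem_cmd_param_iovec iov = {
        .base = (uintptr_t) base,
        .len  = size,
    };
    struct knem_cmd_create_region cr = {
        .iovec_array = (uintptr_t) &iov,
        .iovec_nr    = 1,
        .protection  = PROT_READ, /* a get() only needs read access */
        .flags       = 0,         /* no KNEM_FLAG_SINGLEUSE: the owner
                                   * destroys the cookie explicitly */
    };

    if (ioctl (knem_fd, KNEM_CMD_CREATE_REGION, &cr) < 0) {
        return 0;
    }

    /* cr.cookie plus the region's base address are what the peer later
     * presents as remote_handle->cookie and remote_handle->base_addr */
    return cr.cookie;
}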

static void mca_btl_vader_sc_emu_get_complete (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint,
                                               mca_btl_base_descriptor_t *desc, int status)
{
    mca_btl_vader_frag_t *frag = (mca_btl_vader_frag_t *) desc;
    mca_btl_vader_sc_emu_hdr_t *hdr;
    void *local_address = frag->rdma.local_address;
    size_t len = frag->segments[0].seg_len - sizeof (*hdr);
    void *context = frag->rdma.context;
    void *cbdata = frag->rdma.cbdata;
    mca_btl_base_rdma_completion_fn_t cbfunc = frag->rdma.cbfunc;
    void *data;

    hdr = (mca_btl_vader_sc_emu_hdr_t *) frag->segments[0].seg_addr.pval;
    data = (void *) (hdr + 1); /* the payload immediately follows the header */

    memcpy (local_address, data, len);

    /* return the fragment before calling the callback */
    MCA_BTL_VADER_FRAG_RETURN(frag);

    cbfunc (btl, endpoint, local_address, NULL, context, cbdata, status);
}
/**
 * Return a descriptor allocated by this BTL.
 *
 * @param btl (IN)      BTL module
 * @param des (IN)      Allocated descriptor.
 */
static int vader_free (struct mca_btl_base_module_t *btl, mca_btl_base_descriptor_t *des)
{
    MCA_BTL_VADER_FRAG_RETURN((mca_btl_vader_frag_t *) des);

    return OMPI_SUCCESS;
}
/**
 * Pack data into a send descriptor.
 *
 * @param btl (IN)          BTL module
 * @param endpoint (IN)     destination endpoint
 * @param registration (IN) memory registration (unused here)
 * @param convertor (IN)    convertor positioned at the payload
 * @param order (IN)        ordering requirement
 * @param reserve (IN)      bytes reserved for the upper-layer header
 * @param size (IN/OUT)     payload bytes requested / actually packed
 * @param flags (IN)        descriptor flags
 */
static struct mca_btl_base_descriptor_t *vader_prepare_src (struct mca_btl_base_module_t *btl,
                                                            struct mca_btl_base_endpoint_t *endpoint,
                                                            mca_mpool_base_registration_t *registration,
                                                            struct opal_convertor_t *convertor,
                                                            uint8_t order, size_t reserve, size_t *size,
                                                            uint32_t flags)
{
    const size_t total_size = reserve + *size;
    struct iovec iov;
    mca_btl_vader_frag_t *frag;
    uint32_t iov_count = 1;
    void *data_ptr, *fbox_ptr;
    int rc;

    opal_convertor_get_current_pointer (convertor, &data_ptr);

    if (OPAL_LIKELY(reserve)) {
        /* in place send fragment */
        if (OPAL_UNLIKELY(opal_convertor_need_buffers(convertor))) {
            /* non-contiguous data requires the convertor */
            (void) MCA_BTL_VADER_FRAG_ALLOC_EAGER(frag);
            if (OPAL_UNLIKELY(NULL == frag)) {
                return NULL;
            }

            iov.iov_len = *size;
            iov.iov_base =
                (IOVBASE_TYPE *)(((uintptr_t)(frag->segments[0].seg_addr.pval)) +
                                 reserve);

            rc = opal_convertor_pack (convertor, &iov, &iov_count, size);
            if (OPAL_UNLIKELY(rc < 0)) {
                MCA_BTL_VADER_FRAG_RETURN(frag);
                return NULL;
            }

            frag->segments[0].seg_len = total_size;
        } else {
            (void) MCA_BTL_VADER_FRAG_ALLOC_USER(frag);
            if (OPAL_UNLIKELY(NULL == frag)) {
                return NULL;
            }

            if (total_size > (size_t) mca_btl_vader_component.max_inline_send) {
                /* single copy send */
                frag->hdr->flags = MCA_BTL_VADER_FLAG_SINGLE_COPY;

                /* set up single copy io vector */
                frag->hdr->sc_iov.iov_base = data_ptr;
                frag->hdr->sc_iov.iov_len  = *size;

                frag->segments[0].seg_len = reserve;
                frag->segments[1].seg_len = *size;
                frag->segments[1].seg_addr.pval = data_ptr;
                frag->base.des_src_cnt = 2;
            } else {
                /* inline send */
                /* try to reserve a fast box for this transfer */
                fbox_ptr = mca_btl_vader_reserve_fbox (endpoint, total_size);

                if (fbox_ptr) {
                    frag->hdr->flags |= MCA_BTL_VADER_FLAG_FBOX;
                    frag->segments[0].seg_addr.pval = fbox_ptr;
                }

                /* NTH: the convertor adds some latency so we bypass it here */
                vader_memmove ((void *)((uintptr_t)frag->segments[0].seg_addr.pval + reserve),
                               data_ptr, *size);
                frag->segments[0].seg_len = total_size;
            }
        }
    } else {
        /* put/get fragment */
        (void) MCA_BTL_VADER_FRAG_ALLOC_USER(frag);
        if (OPAL_UNLIKELY(NULL == frag)) {
            return NULL;
        }

        frag->segments[0].seg_addr.lval = (uint64_t)(uintptr_t) data_ptr;
        frag->segments[0].seg_len       = total_size;
    }

    frag->base.order       = order;
    frag->base.des_flags   = flags;

    frag->endpoint = endpoint;

    return &frag->base;
}
Example #4
/**
 * Pack data into a send descriptor.
 *
 * @param btl (IN)        BTL module
 * @param endpoint (IN)   destination endpoint
 * @param convertor (IN)  convertor positioned at the payload
 * @param order (IN)      ordering requirement
 * @param reserve (IN)    bytes reserved for the upper-layer header
 * @param size (IN/OUT)   payload bytes requested / actually packed
 * @param flags (IN)      descriptor flags
 */
static struct mca_btl_base_descriptor_t *vader_prepare_src (struct mca_btl_base_module_t *btl,
                                                            struct mca_btl_base_endpoint_t *endpoint,
                                                            struct opal_convertor_t *convertor,
                                                            uint8_t order, size_t reserve, size_t *size,
                                                            uint32_t flags)
{
    const size_t total_size = reserve + *size;
    mca_btl_vader_frag_t *frag;
    unsigned char *fbox;
    void *data_ptr;
    int rc;

    opal_convertor_get_current_pointer (convertor, &data_ptr);

    /* in place send fragment */
    if (OPAL_UNLIKELY(opal_convertor_need_buffers(convertor))) {
        uint32_t iov_count = 1;
        struct iovec iov;

        /* non-contiguous data requires the convertor */
        if (MCA_BTL_VADER_XPMEM != mca_btl_vader_component.single_copy_mechanism &&
            total_size > mca_btl_vader.super.btl_eager_limit) {
            (void) MCA_BTL_VADER_FRAG_ALLOC_MAX(frag, endpoint);
        } else {
            (void) MCA_BTL_VADER_FRAG_ALLOC_EAGER(frag, endpoint);
        }

        if (OPAL_UNLIKELY(NULL == frag)) {
            return NULL;
        }

        iov.iov_len = *size;
        iov.iov_base =
            (IOVBASE_TYPE *)(((uintptr_t)(frag->segments[0].seg_addr.pval)) +
                             reserve);

        rc = opal_convertor_pack (convertor, &iov, &iov_count, size);
        if (OPAL_UNLIKELY(rc < 0)) {
            MCA_BTL_VADER_FRAG_RETURN(frag);
            return NULL;
        }

        frag->segments[0].seg_len = *size + reserve;
    } else {
        if (MCA_BTL_VADER_XPMEM != mca_btl_vader_component.single_copy_mechanism) {
            if (OPAL_LIKELY(total_size <= mca_btl_vader.super.btl_eager_limit)) {
                (void) MCA_BTL_VADER_FRAG_ALLOC_EAGER(frag, endpoint);
            } else {
                (void) MCA_BTL_VADER_FRAG_ALLOC_MAX(frag, endpoint);
            }
        } else {
            (void) MCA_BTL_VADER_FRAG_ALLOC_USER(frag, endpoint);
        }

        if (OPAL_UNLIKELY(NULL == frag)) {
            return NULL;
        }

#if OPAL_BTL_VADER_HAVE_XPMEM
        /* use xpmem to send this segment if it is above the max inline send size */
        if (OPAL_UNLIKELY(MCA_BTL_VADER_XPMEM == mca_btl_vader_component.single_copy_mechanism &&
                          total_size > (size_t) mca_btl_vader_component.max_inline_send)) {
            /* single copy send */
            frag->hdr->flags = MCA_BTL_VADER_FLAG_SINGLE_COPY;

            /* set up single copy io vector */
            frag->hdr->sc_iov.iov_base = data_ptr;
            frag->hdr->sc_iov.iov_len  = *size;

            frag->segments[0].seg_len = reserve;
            frag->segments[1].seg_len = *size;
            frag->segments[1].seg_addr.pval = data_ptr;
            frag->base.des_segment_count = 2;
        } else {
#endif

            /* inline send */
            if (OPAL_LIKELY(MCA_BTL_DES_FLAGS_BTL_OWNERSHIP & flags)) {
                /* try to reserve a fast box for this transfer only if the
                 * fragment does not belong to the caller */
                fbox = mca_btl_vader_reserve_fbox (endpoint, total_size);
                if (OPAL_LIKELY(fbox)) {
                    frag->segments[0].seg_addr.pval = fbox;
                }

                frag->fbox = fbox;
            }

            /* NTH: the convertor adds some latency so we bypass it here */
            memcpy ((void *)((uintptr_t)frag->segments[0].seg_addr.pval + reserve), data_ptr, *size);
            frag->segments[0].seg_len = total_size;
#if OPAL_BTL_VADER_HAVE_XPMEM
        }
#endif
    }

    frag->base.order       = order;
    frag->base.des_flags   = flags;

    return &frag->base;
}
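
/*
 * A hypothetical caller, to make the reserve/size contract concrete. In the
 * real module vader_prepare_src is reached through the btl_prepare_src
 * function pointer; example_pack, header_reserve, and payload_bytes below
 * are illustrative names, not part of the BTL.
 */
static mca_btl_base_descriptor_t *example_pack (struct mca_btl_base_module_t *btl,
                                                struct mca_btl_base_endpoint_t *endpoint,
                                                struct opal_convertor_t *convertor,
                                                size_t payload_bytes)
{
    const size_t header_reserve = 32; /* hypothetical upper-layer header size */
    size_t size = payload_bytes;
    mca_btl_base_descriptor_t *des;

    des = vader_prepare_src (btl, endpoint, convertor, MCA_BTL_NO_ORDER,
                             header_reserve, &size,
                             MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
    if (OPAL_LIKELY(NULL != des)) {
        /* segment 0 now holds header_reserve bytes for the caller's header
         * followed by the packed payload; size was updated to the number
         * of payload bytes actually packed */
    }

    return des;
}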