int mca_btl_vader_get_cma (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint, void *local_address,
                           uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
                           mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
                           int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata)
{
    struct iovec src_iov = {.iov_base = (void *)(intptr_t) remote_address, .iov_len = size};
    struct iovec dst_iov = {.iov_base = local_address, .iov_len = size};
    ssize_t ret;

    /*
     * According to the man page:
     * "On success, process_vm_readv() returns the number of bytes read and
     * process_vm_writev() returns the number of bytes written. This return
     * value may be less than the total number of requested bytes, if a
     * partial read/write occurred. (Partial transfers apply at the
     * granularity of iovec elements. These system calls won't perform a
     * partial transfer that splits a single iovec element.)"
     * Since we use a single iovec element, the returned size should either
     * be 0 or size, and the do loop should not be needed here.
     * We tried on various Linux kernels with size > 2 GB, and surprisingly,
     * the returned value is always 0x7ffff000 (fwiw, it happens to be the size
     * of the largest number of pages that fits in a signed 32-bit integer).
     * We do not know whether this is a bug in the kernel, the libc, or even
     * the man page, but for the time being, we act as if process_vm_readv()
     * could return any value.
     */
    do {
        ret = process_vm_readv (endpoint->segment_data.other.seg_ds->seg_cpid, &dst_iov, 1, &src_iov, 1, 0);
        if (0 > ret) {
            opal_output(0, "Read %ld, expected %lu, errno = %d\n", (long)ret, (unsigned long)size, errno);
            return OPAL_ERROR;
        }

        src_iov.iov_base = (void *)((char *)src_iov.iov_base + ret);
        src_iov.iov_len -= ret;
        dst_iov.iov_base = (void *)((char *)dst_iov.iov_base + ret);
        dst_iov.iov_len -= ret;
    } while (0 < src_iov.iov_len);

    /* always call the callback function */
    cbfunc (btl, endpoint, local_address, local_handle, cbcontext, cbdata, OPAL_SUCCESS);

    return OPAL_SUCCESS;
}
#endif

#if OPAL_BTL_VADER_HAVE_KNEM
int mca_btl_vader_get_knem (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint, void *local_address,
                            uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
                            mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
                            int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata)
{
    struct knem_cmd_param_iovec recv_iovec;
    struct knem_cmd_inline_copy icopy;

    /* Fill in the ioctl data fields. There's no async completion, so
       we don't need to worry about getting a slot, etc. */
    recv_iovec.base = (uintptr_t) local_address;
    recv_iovec.len = size;
    icopy.local_iovec_array = (uintptr_t) &recv_iovec;
    icopy.local_iovec_nr = 1;
    icopy.remote_cookie = remote_handle->cookie;
    icopy.remote_offset = remote_address - remote_handle->base_addr;
    icopy.write = 0;
    icopy.flags = 0;

    /* Use the DMA flag if knem supports it *and* the segment length
     * is greater than the cutoff. Note that if DMA is not supported,
     * or the user specified 0 for knem_dma_min, then knem_dma_min was
     * set to UINT_MAX in mca_btl_vader_knem_init. */
    if (mca_btl_vader_component.knem_dma_min <= size) {
        icopy.flags = KNEM_FLAG_DMA;
    }
    /* synchronous flags only, no need to specify icopy.async_status_index */

    /* When the ioctl returns, the transfer is done and we can invoke
       the btl callback and return the frag */
    if (OPAL_UNLIKELY(0 != ioctl (mca_btl_vader.knem_fd, KNEM_CMD_INLINE_COPY, &icopy))) {
        return OPAL_ERROR;
    }

    if (KNEM_STATUS_FAILED == icopy.current_status) {
        return OPAL_ERROR;
    }

    /* always call the callback function */
    cbfunc (btl, endpoint, local_address, local_handle, cbcontext, cbdata, OPAL_SUCCESS);

    return OPAL_SUCCESS;
}
#endif

static void mca_btl_vader_sc_emu_get_complete (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint,
                                               mca_btl_base_descriptor_t *desc, int status)
{
    mca_btl_vader_frag_t *frag = (mca_btl_vader_frag_t *) desc;
    mca_btl_vader_sc_emu_hdr_t *hdr;
    void *local_address = frag->rdma.local_address;
    size_t len = frag->segments[0].seg_len - sizeof (*hdr);
    void *context = frag->rdma.context;
    void *cbdata = frag->rdma.cbdata;
    mca_btl_base_rdma_completion_fn_t cbfunc = frag->rdma.cbfunc;
    void *data;

    hdr = (mca_btl_vader_sc_emu_hdr_t *) frag->segments[0].seg_addr.pval;
    data = (void *) (hdr + 1);

    memcpy (local_address, data, len);

    /* return the fragment before calling the callback */
    MCA_BTL_VADER_FRAG_RETURN(frag);

    cbfunc (btl, endpoint, local_address, NULL, context, cbdata, status);
}
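/*
 * Illustrative aside (not part of the BTL): a minimal, self-contained sketch
 * of the chunked process_vm_readv() loop used in mca_btl_vader_get_cma()
 * above. The cma_read() helper and its parameters are hypothetical; on Linux,
 * the caller also needs ptrace permission over the target process
 * (CAP_SYS_PTRACE or a permissive /proc/sys/kernel/yama/ptrace_scope).
 */
#if 0 /* example only, never compiled */
#define _GNU_SOURCE
#include <stdio.h>
#include <sys/types.h>
#include <sys/uio.h>

/* Read 'size' bytes at 'remote' in process 'pid' into 'local', retrying on
 * partial transfers (e.g. the 0x7ffff000-byte cap observed above). */
static int cma_read (pid_t pid, void *local, void *remote, size_t size)
{
    struct iovec dst = { .iov_base = local,  .iov_len = size };
    struct iovec src = { .iov_base = remote, .iov_len = size };

    while (src.iov_len > 0) {
        ssize_t ret = process_vm_readv (pid, &dst, 1, &src, 1, 0);
        if (ret <= 0) { /* unlike the BTL, treat 0 (no progress) as an error */
            perror ("process_vm_readv");
            return -1;
        }

        src.iov_base = (char *) src.iov_base + ret;
        src.iov_len -= (size_t) ret;
        dst.iov_base = (char *) dst.iov_base + ret;
        dst.iov_len -= (size_t) ret;
    }

    return 0;
}
#endif /* example */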
/**
 * Return a segment allocated by this BTL.
 *
 * @param btl (IN) BTL module
 * @param des (IN) Descriptor of the allocated segment to return.
 */
static int vader_free (struct mca_btl_base_module_t *btl, mca_btl_base_descriptor_t *des)
{
    MCA_BTL_VADER_FRAG_RETURN((mca_btl_vader_frag_t *) des);

    return OPAL_SUCCESS;
}
/**
 * Pack data
 *
 * @param btl (IN) BTL module
 */
static struct mca_btl_base_descriptor_t *vader_prepare_src (struct mca_btl_base_module_t *btl,
                                                            struct mca_btl_base_endpoint_t *endpoint,
                                                            mca_mpool_base_registration_t *registration,
                                                            struct opal_convertor_t *convertor,
                                                            uint8_t order, size_t reserve, size_t *size,
                                                            uint32_t flags)
{
    const size_t total_size = reserve + *size;
    struct iovec iov;
    mca_btl_vader_frag_t *frag;
    uint32_t iov_count = 1;
    void *data_ptr, *fbox_ptr;
    int rc;

    opal_convertor_get_current_pointer (convertor, &data_ptr);

    if (OPAL_LIKELY(reserve)) {
        /* in place send fragment */
        if (OPAL_UNLIKELY(opal_convertor_need_buffers(convertor))) {
            /* non-contiguous data requires the convertor */
            (void) MCA_BTL_VADER_FRAG_ALLOC_EAGER(frag);
            if (OPAL_UNLIKELY(NULL == frag)) {
                return NULL;
            }

            iov.iov_len = *size;
            iov.iov_base = (IOVBASE_TYPE *)(((uintptr_t)(frag->segments[0].seg_addr.pval)) + reserve);

            rc = opal_convertor_pack (convertor, &iov, &iov_count, size);
            if (OPAL_UNLIKELY(rc < 0)) {
                MCA_BTL_VADER_FRAG_RETURN(frag);
                return NULL;
            }

            frag->segments[0].seg_len = total_size;
        } else {
            (void) MCA_BTL_VADER_FRAG_ALLOC_USER(frag);
            if (OPAL_UNLIKELY(NULL == frag)) {
                return NULL;
            }

            if (total_size > (size_t) mca_btl_vader_component.max_inline_send) {
                /* single copy send */
                frag->hdr->flags = MCA_BTL_VADER_FLAG_SINGLE_COPY;

                /* set up single copy io vector */
                frag->hdr->sc_iov.iov_base = data_ptr;
                frag->hdr->sc_iov.iov_len = *size;

                frag->segments[0].seg_len = reserve;
                frag->segments[1].seg_len = *size;
                frag->segments[1].seg_addr.pval = data_ptr;
                frag->base.des_src_cnt = 2;
            } else {
                /* inline send */

                /* try to reserve a fast box for this transfer */
                fbox_ptr = mca_btl_vader_reserve_fbox (endpoint, total_size);

                if (fbox_ptr) {
                    frag->hdr->flags |= MCA_BTL_VADER_FLAG_FBOX;
                    frag->segments[0].seg_addr.pval = fbox_ptr;
                }

                /* NTH: the convertor adds some latency so we bypass it here */
                vader_memmove ((void *)((uintptr_t)frag->segments[0].seg_addr.pval + reserve),
                               data_ptr, *size);
                frag->segments[0].seg_len = total_size;
            }
        }
    } else {
        /* put/get fragment */
        (void) MCA_BTL_VADER_FRAG_ALLOC_USER(frag);
        if (OPAL_UNLIKELY(NULL == frag)) {
            return NULL;
        }

        frag->segments[0].seg_addr.lval = (uint64_t)(uintptr_t) data_ptr;
        frag->segments[0].seg_len = total_size;
    }

    frag->base.order = order;
    frag->base.des_flags = flags;
    frag->endpoint = endpoint;

    return &frag->base;
}
/**
 * Pack data
 *
 * @param btl (IN) BTL module
 */
static struct mca_btl_base_descriptor_t *vader_prepare_src (struct mca_btl_base_module_t *btl,
                                                            struct mca_btl_base_endpoint_t *endpoint,
                                                            struct opal_convertor_t *convertor,
                                                            uint8_t order, size_t reserve, size_t *size,
                                                            uint32_t flags)
{
    const size_t total_size = reserve + *size;
    mca_btl_vader_frag_t *frag;
    unsigned char *fbox;
    void *data_ptr;
    int rc;

    opal_convertor_get_current_pointer (convertor, &data_ptr);

    /* in place send fragment */
    if (OPAL_UNLIKELY(opal_convertor_need_buffers(convertor))) {
        uint32_t iov_count = 1;
        struct iovec iov;

        /* non-contiguous data requires the convertor */
        if (MCA_BTL_VADER_XPMEM != mca_btl_vader_component.single_copy_mechanism &&
            total_size > mca_btl_vader.super.btl_eager_limit) {
            (void) MCA_BTL_VADER_FRAG_ALLOC_MAX(frag, endpoint);
        } else {
            (void) MCA_BTL_VADER_FRAG_ALLOC_EAGER(frag, endpoint);
        }

        if (OPAL_UNLIKELY(NULL == frag)) {
            return NULL;
        }

        iov.iov_len = *size;
        iov.iov_base = (IOVBASE_TYPE *)(((uintptr_t)(frag->segments[0].seg_addr.pval)) + reserve);

        rc = opal_convertor_pack (convertor, &iov, &iov_count, size);
        if (OPAL_UNLIKELY(rc < 0)) {
            MCA_BTL_VADER_FRAG_RETURN(frag);
            return NULL;
        }

        frag->segments[0].seg_len = *size + reserve;
    } else {
        if (MCA_BTL_VADER_XPMEM != mca_btl_vader_component.single_copy_mechanism) {
            if (OPAL_LIKELY(total_size <= mca_btl_vader.super.btl_eager_limit)) {
                (void) MCA_BTL_VADER_FRAG_ALLOC_EAGER(frag, endpoint);
            } else {
                (void) MCA_BTL_VADER_FRAG_ALLOC_MAX(frag, endpoint);
            }
        } else {
            (void) MCA_BTL_VADER_FRAG_ALLOC_USER(frag, endpoint);
        }

        if (OPAL_UNLIKELY(NULL == frag)) {
            return NULL;
        }

#if OPAL_BTL_VADER_HAVE_XPMEM
        /* use xpmem to send this segment if it is above the max inline send size */
        if (OPAL_UNLIKELY(MCA_BTL_VADER_XPMEM == mca_btl_vader_component.single_copy_mechanism &&
                          total_size > (size_t) mca_btl_vader_component.max_inline_send)) {
            /* single copy send */
            frag->hdr->flags = MCA_BTL_VADER_FLAG_SINGLE_COPY;

            /* set up single copy io vector */
            frag->hdr->sc_iov.iov_base = data_ptr;
            frag->hdr->sc_iov.iov_len = *size;

            frag->segments[0].seg_len = reserve;
            frag->segments[1].seg_len = *size;
            frag->segments[1].seg_addr.pval = data_ptr;
            frag->base.des_segment_count = 2;
        } else {
#endif
            /* inline send */
            if (OPAL_LIKELY(MCA_BTL_DES_FLAGS_BTL_OWNERSHIP & flags)) {
                /* try to reserve a fast box for this transfer only if the
                 * fragment does not belong to the caller */
                fbox = mca_btl_vader_reserve_fbox (endpoint, total_size);
                if (OPAL_LIKELY(fbox)) {
                    frag->segments[0].seg_addr.pval = fbox;
                }

                frag->fbox = fbox;
            }

            /* NTH: the convertor adds some latency so we bypass it here */
            memcpy ((void *)((uintptr_t)frag->segments[0].seg_addr.pval + reserve), data_ptr, *size);
            frag->segments[0].seg_len = total_size;
#if OPAL_BTL_VADER_HAVE_XPMEM
        }
#endif
    }

    frag->base.order = order;
    frag->base.des_flags = flags;

    return &frag->base;
}
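/*
 * Illustrative aside (not part of the BTL): the send-path selection implied
 * by the two vader_prepare_src() variants above, distilled into a
 * hypothetical helper. The enum, function, and parameters are invented for
 * illustration; the real thresholds live in mca_btl_vader_component.
 */
#if 0 /* example only, never compiled */
enum vader_send_path {
    PATH_SINGLE_COPY, /* expose the sender's buffer; the receiver copies once */
    PATH_FBOX,        /* small message through the per-peer fast box */
    PATH_INLINE       /* copy the payload into a shared-memory fragment */
};

static enum vader_send_path choose_send_path (size_t total_size, size_t max_inline_send,
                                              int single_copy_available, int fbox_available)
{
    /* large contiguous payloads go single-copy when a mechanism (e.g. XPMEM)
     * is available */
    if (single_copy_available && total_size > max_inline_send) {
        return PATH_SINGLE_COPY;
    }

    /* otherwise copy inline, preferring a fast box when one can be reserved */
    return fbox_available ? PATH_FBOX : PATH_INLINE;
}
#endif /* example */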