コード例 #1
0
ファイル: spml_ikrit.c プロジェクト: gpaulsen/ompi
int mca_spml_ikrit_get(shmem_ctx_t ctx, void *src_addr, size_t size, void *dst_addr, int src)
{
    mxm_send_req_t sreq;

    if (0 >= size) {
        return OSHMEM_SUCCESS;
    }

    if (OSHMEM_SUCCESS == mca_spml_ikrit_get_shm(src_addr, size, dst_addr, src))
        return OSHMEM_SUCCESS;

    if (OSHMEM_SUCCESS
            != mca_spml_ikrit_get_helper(&sreq,
                                         src_addr,
                                         size,
                                         dst_addr,
                                         src)) {
        oshmem_shmem_abort(-1);
        return OSHMEM_ERROR;
    }

    sreq.base.completed_cb = NULL;
    sreq.flags = 0;

    SPML_IKRIT_MXM_POST_SEND(sreq);

    mca_spml_irkit_req_wait(&sreq.base);
    if (MXM_OK != sreq.base.error) {
        SPML_ERROR("get request failed: %s - aborting",
                   mxm_error_string(sreq.base.error));
        oshmem_shmem_abort(-1);
        return OSHMEM_ERROR;
    }
    return OSHMEM_SUCCESS;
}
コード例 #2
0
ファイル: spml_ikrit.c プロジェクト: gpaulsen/ompi
static inline int mca_spml_ikrit_get_async(void *src_addr,
                                           size_t size,
                                           void *dst_addr,
                                           int src)
{
    mca_spml_ikrit_get_request_t *get_req;

    if (OSHMEM_SUCCESS == mca_spml_ikrit_get_shm(src_addr, size, dst_addr, src))
        return OSHMEM_SUCCESS;

    get_req = alloc_get_req();

    if (OSHMEM_SUCCESS != mca_spml_ikrit_get_helper(&get_req->mxm_req,
                                                    src_addr,
                                                    size,
                                                    dst_addr,
                                                    src)) {
        oshmem_shmem_abort(-1);
        return OSHMEM_ERROR;
    }

    get_req->mxm_req.flags = 0;
    get_req->mxm_req.base.completed_cb = get_completion_cb;
    get_req->mxm_req.base.context = get_req;
    OPAL_THREAD_ADD_FETCH32(&mca_spml_ikrit.n_active_gets, 1);

    SPML_IKRIT_MXM_POST_SEND(get_req->mxm_req);

    return OSHMEM_SUCCESS;
}
コード例 #3
0
ファイル: spml_ikrit.c プロジェクト: gpaulsen/ompi
/* for now only do blocking copy send */
int mca_spml_ikrit_send(void* buf,
                        size_t size,
                        int dst,
                        mca_spml_base_put_mode_t mode)
{
    mxm_send_req_t req;
    char dummy_buf[1];

    SPML_VERBOSE(100,
                 "sending %p size %d to %d, mode %d",
                 buf, (int)size, dst, (int)mode);
    req.opcode = MXM_REQ_OP_SEND;

    req.op.send.tag = oshmem_my_proc_id();

    req.base.state = MXM_REQ_NEW;
    req.base.mq = mca_spml_ikrit.mxm_mq;
    req.base.conn = mca_spml_ikrit.mxm_peers[dst].mxm_conn;
    req.flags             = MXM_REQ_SEND_FLAG_BLOCKING;
    req.base.completed_cb = NULL;

    req.base.data_type = MXM_REQ_DATA_BUFFER;
    req.base.data.buffer.ptr = buf == NULL ? dummy_buf : buf;
    req.base.data.buffer.length = size == 0 ? sizeof(dummy_buf) : size;
    req.base.data.buffer.memh = NULL;

    SPML_IKRIT_MXM_POST_SEND(req);

    mca_spml_irkit_req_wait(&req.base);
    if (req.base.error != MXM_OK) {
        return OSHMEM_ERROR;
    }

    return OSHMEM_SUCCESS;
}
コード例 #4
0
static int mca_spml_ikrit_mxm_fence(int dst)
{
    mca_spml_ikrit_get_request_t *fence_req;

    fence_req = alloc_get_req();
    if (NULL == fence_req) {
        SPML_ERROR("out of get requests - aborting");
        oshmem_shmem_abort(-1);
        return OSHMEM_ERROR;
    }

    fence_req->mxm_req.base.mq = mca_spml_ikrit.mxm_mq;
    fence_req->mxm_req.base.conn = mca_spml_ikrit.mxm_peers[dst]->mxm_conn;
#if MXM_API < MXM_VERSION(2,0)
    fence_req->mxm_req.opcode = MXM_REQ_OP_FENCE;
    fence_req->mxm_req.base.flags = MXM_REQ_FLAG_SEND_SYNC;
#else
    fence_req->mxm_req.opcode = MXM_REQ_OP_PUT_SYNC;
    fence_req->mxm_req.flags  = MXM_REQ_SEND_FLAG_FENCE;
    fence_req->mxm_req.op.mem.remote_vaddr = 0;
    fence_req->mxm_req.op.mem.remote_mkey  = &mxm_empty_mem_key;
    fence_req->mxm_req.base.data_type      = MXM_REQ_DATA_BUFFER;
    fence_req->mxm_req.base.data.buffer.ptr    = 0;
    fence_req->mxm_req.base.data.buffer.length = 0;
#endif
    fence_req->mxm_req.base.state = MXM_REQ_NEW;
    fence_req->mxm_req.base.completed_cb = fence_completion_cb;
    fence_req->mxm_req.base.context = fence_req;
    OPAL_THREAD_ADD32(&mca_spml_ikrit.n_mxm_fences, 1);

    SPML_IKRIT_MXM_POST_SEND(fence_req->mxm_req);
    return OSHMEM_SUCCESS;
}
コード例 #5
0
/* extension. used 4 fence implementation b4 fence was added to mxm */
int mca_spml_ikrit_get_async(void *src_addr,
                             size_t size,
                             void *dst_addr,
                             int src)
{
    mca_spml_ikrit_get_request_t *get_req;

    if (OSHMEM_SUCCESS == mca_spml_ikrit_get_shm(src_addr, size, dst_addr, src))
        return OSHMEM_SUCCESS;

    get_req = alloc_get_req();
    if (NULL == get_req) {
        SPML_ERROR("out of get requests - aborting");
        oshmem_shmem_abort(-1);
        return OSHMEM_ERROR;
    }

    if (OSHMEM_SUCCESS
            != mca_spml_ikrit_get_helper(&get_req->mxm_req,
                                         src_addr,
                                         size,
                                         dst_addr,
                                         src)) {
        oshmem_shmem_abort(-1);
        return OSHMEM_ERROR;
    }

#if MXM_API < MXM_VERSION(2,0)
    get_req->mxm_req.base.flags = 0;
#else
    get_req->mxm_req.flags = 0;
#endif
    get_req->mxm_req.base.completed_cb = get_completion_cb;
    get_req->mxm_req.base.context = get_req;
    OPAL_THREAD_ADD32(&mca_spml_ikrit.n_active_gets, 1);

    SPML_IKRIT_MXM_POST_SEND(get_req->mxm_req);

    return OSHMEM_SUCCESS;
}
コード例 #6
0
ファイル: spml_ikrit.c プロジェクト: gpaulsen/ompi
static int mca_spml_ikrit_mxm_fence(int dst)
{
    mca_spml_ikrit_get_request_t *fence_req;

    fence_req = alloc_get_req();

    fence_req->mxm_req.base.mq = mca_spml_ikrit.mxm_mq;
    fence_req->mxm_req.base.conn = mca_spml_ikrit.mxm_peers[dst].mxm_conn;
    fence_req->mxm_req.opcode = MXM_REQ_OP_PUT_SYNC;
    fence_req->mxm_req.flags  = MXM_REQ_SEND_FLAG_FENCE;
    fence_req->mxm_req.op.mem.remote_vaddr = 0;
    fence_req->mxm_req.op.mem.remote_mkey  = &mxm_empty_mem_key;
    fence_req->mxm_req.base.data_type      = MXM_REQ_DATA_BUFFER;
    fence_req->mxm_req.base.data.buffer.ptr    = 0;
    fence_req->mxm_req.base.data.buffer.length = 0;
    fence_req->mxm_req.base.state = MXM_REQ_NEW;
    fence_req->mxm_req.base.completed_cb = fence_completion_cb;
    fence_req->mxm_req.base.context = fence_req;
    OPAL_THREAD_ADD_FETCH32(&mca_spml_ikrit.n_mxm_fences, 1);

    SPML_IKRIT_MXM_POST_SEND(fence_req->mxm_req);
    return OSHMEM_SUCCESS;
}
コード例 #7
0
ファイル: spml_ikrit.c プロジェクト: gpaulsen/ompi
/* simple buffered put implementation. NOT IN USE
 * Problems:
 * - slighly worse performance than impl based on non buffered put
 * - fence complexity is O(n_active_connections) instead of O(n_connections_with_outstanding_puts).
 *   Later is bounded by the network RTT & mxm ack timer.
 */
int mca_spml_ikrit_put_simple(void* dst_addr,
                              size_t size,
                              void* src_addr,
                              int dst)
{
    void *rva;
    mxm_send_req_t mxm_req;
    mxm_wait_t wait;
    int ptl_id;
    mxm_mem_key_t *mkey;
    static int count;

    ptl_id = get_ptl_id(dst);
    mkey = mca_spml_ikrit_get_mkey(dst, dst_addr, ptl_id, &rva, &mca_spml_ikrit);

    SPML_VERBOSE_FASTPATH(100, "put: pe:%d ptl=%d dst=%p <- src: %p sz=%d. dst_rva=%p, %s",
                          dst, ptl_id, dst_addr, src_addr, (int)size, (void *)rva);

    if (NULL == mkey) {
        memcpy((void *) (unsigned long) rva, src_addr, size);
        /* call progress as often as we would have with regular put */
        if (++count % SPML_IKRIT_PACKETS_PER_SYNC == 0)
            mxm_progress(mca_spml_ikrit.mxm_context);
        return OSHMEM_SUCCESS;
    }

    SPML_VERBOSE_FASTPATH(100, "put: pe:%d ptl=%d dst=%p <- src: %p sz=%d. dst_rva=%p, %s",
                          dst, MXM_PTL_RDMA, dst_addr, src_addr, (int)size, (void *)rva);

    /* fill out request */
    mxm_req.base.mq = mca_spml_ikrit.mxm_mq;
    mxm_req.flags = MXM_REQ_SEND_FLAG_BLOCKING;
    mxm_req.base.conn = mca_spml_ikrit.mxm_peers[dst].mxm_conn;
    mxm_req.base.data_type = MXM_REQ_DATA_BUFFER;
    mxm_req.base.data.buffer.ptr = src_addr;
    mxm_req.base.data.buffer.length = size;
    mxm_req.base.completed_cb = 0;
    mxm_req.base.context = 0;
    mxm_req.opcode = MXM_REQ_OP_PUT;
    mxm_req.op.mem.remote_vaddr = (intptr_t) rva;
    mxm_req.base.state = MXM_REQ_NEW;
    mxm_req.base.error = MXM_OK;

    mxm_req.op.mem.remote_mkey = mkey;

    if (mca_spml_ikrit.mxm_peers[dst].need_fence == 0) {
        opal_list_append(&mca_spml_ikrit.active_peers,
                         &mca_spml_ikrit.mxm_peers[dst].link);
        mca_spml_ikrit.mxm_peers[dst].need_fence = 1;
    }

    SPML_IKRIT_MXM_POST_SEND(mxm_req);

    wait.req = &mxm_req.base;
    wait.state = (mxm_req_state_t)(MXM_REQ_SENT | MXM_REQ_COMPLETED);
    wait.progress_cb = NULL;
    wait.progress_arg = NULL;
    mxm_wait(&wait);

    return OSHMEM_SUCCESS;
}
コード例 #8
0
ファイル: spml_ikrit.c プロジェクト: gpaulsen/ompi
/**
 * TODO: using put request as handle is not good.
 */
static inline int mca_spml_ikrit_put_internal(void* dst_addr,
                                              size_t size,
                                              void* src_addr,
                                              int dst,
                                              void **handle,
                                              int zcopy)
{
    void *rva;
    mca_spml_ikrit_put_request_t *put_req;
    int ptl_id;
    static int count;
    int need_progress = 0;
    mxm_mem_key_t *mkey;

    if (OPAL_UNLIKELY(0 >= size)) {
        return OSHMEM_SUCCESS;
    }

    ptl_id = get_ptl_id(dst);
    mkey = mca_spml_ikrit_get_mkey(dst, dst_addr, ptl_id, &rva, &mca_spml_ikrit);

    if (OPAL_UNLIKELY(NULL == mkey)) {
        memcpy((void *) (unsigned long) rva, src_addr, size);
        /* call progress as often as we would have with regular put */
        if (++count % SPML_IKRIT_PACKETS_PER_SYNC == 0)
            mxm_progress(mca_spml_ikrit.mxm_context);
        return OSHMEM_SUCCESS;
    }

    SPML_VERBOSE_FASTPATH(100, "put: pe:%d ptl=%d dst=%p <- src: %p sz=%d. dst_rva=%p, %s",
                          dst, ptl_id, dst_addr, src_addr, (int)size, (void *)rva);

    put_req = alloc_put_req();

    if (handle)
        *handle = put_req;

    /* fill out request */
    put_req->mxm_req.base.mq = mca_spml_ikrit.mxm_mq;
    /* request immediate responce if we are getting low on send buffers. We only get responce from remote on ack timeout.
     * Also request explicit ack once in a while  */
    put_req->mxm_req.flags = 0;
    if (mca_spml_ikrit.free_list_max - mca_spml_ikrit.n_active_puts <= SPML_IKRIT_PUT_LOW_WATER ||
            (int)opal_list_get_size(&mca_spml_ikrit.active_peers) > mca_spml_ikrit.unsync_conn_max ||
            (mca_spml_ikrit.mxm_peers[dst].n_active_puts + 1) % SPML_IKRIT_PACKETS_PER_SYNC == 0) {
        need_progress = 1;
        put_req->mxm_req.opcode = MXM_REQ_OP_PUT_SYNC;
    } else  {
        put_req->mxm_req.opcode = MXM_REQ_OP_PUT;
    }
    if (!zcopy) {
        if (size < mca_spml_ikrit.put_zcopy_threshold) {
            put_req->mxm_req.flags |= MXM_REQ_SEND_FLAG_BLOCKING;
        } else {
            put_req->mxm_req.opcode = MXM_REQ_OP_PUT_SYNC;
        }
    }

    put_req->mxm_req.base.conn = mca_spml_ikrit.mxm_peers[dst].mxm_conn;
    put_req->mxm_req.base.data_type = MXM_REQ_DATA_BUFFER;
    put_req->mxm_req.base.data.buffer.ptr = src_addr;
    put_req->mxm_req.base.data.buffer.length = size;
    put_req->mxm_req.base.completed_cb = put_completion_cb;
    put_req->mxm_req.base.context = put_req;
    put_req->mxm_req.op.mem.remote_vaddr = (intptr_t) rva;
    put_req->mxm_req.base.state = MXM_REQ_NEW;
    put_req->pe = dst;

    put_req->mxm_req.op.mem.remote_mkey = mkey; 

    OPAL_THREAD_ADD_FETCH32(&mca_spml_ikrit.n_active_puts, 1);
    if (mca_spml_ikrit.mxm_peers[dst].need_fence == 0) {
        opal_list_append(&mca_spml_ikrit.active_peers,
                         &mca_spml_ikrit.mxm_peers[dst].link);
        mca_spml_ikrit.mxm_peers[dst].need_fence = 1;
    }

    mca_spml_ikrit.mxm_peers[dst].n_active_puts++;

    SPML_IKRIT_MXM_POST_SEND(put_req->mxm_req);

    if (need_progress)
        mxm_progress(mca_spml_ikrit.mxm_context);

    return OSHMEM_SUCCESS;
}
コード例 #9
0
/* simple buffered put implementation. NOT IN USE
 * Problems:
 * - slighly worse performance than impl based on non buffered put
 * - fence complexity is O(n_active_connections) instead of O(n_connections_with_outstanding_puts).
 *   Later is bounded by the network RTT & mxm ack timer.
 */
int mca_spml_ikrit_put_simple(void* dst_addr,
                              size_t size,
                              void* src_addr,
                              int dst)
{
    void *rva;
    mxm_send_req_t mxm_req;
    mxm_wait_t wait;
    int ptl_id;
    sshmem_mkey_t *r_mkey;
    static int count;

    ptl_id = get_ptl_id(dst);
    /* Get rkey of remote PE (dst proc) which must be on memheap  */
    r_mkey = mca_memheap_base_get_cached_mkey(dst, dst_addr, ptl_id, &rva);
    if (!r_mkey) {
        SPML_ERROR("pe=%d: %p is not address of shared variable",
                   dst, dst_addr);
        oshmem_shmem_abort(-1);
        return OSHMEM_ERROR;
    }

#if SPML_IKRIT_PUT_DEBUG == 1
    SPML_VERBOSE(100, "put: pe:%d ptl=%d dst=%p <- src: %p sz=%d. dst_rva=%p, %s",
            dst, ptl_id, dst_addr, src_addr, (int)size, (void *)rva, mca_spml_base_mkey2str(r_mkey));
#endif
    if (ptl_id == MXM_PTL_SHM) {

        if (mca_memheap_base_can_local_copy(r_mkey, dst_addr)) {
            memcpy((void *) (unsigned long) rva, src_addr, size);
            /* call progress as often as we would have with regular put */
            if (++count % SPML_IKRIT_PACKETS_PER_SYNC == 0)
                mxm_progress(mca_spml_ikrit.mxm_context);
            return OSHMEM_SUCCESS;
        }
        /* segment not mapped - fallback to rmda */
        ptl_id = MXM_PTL_RDMA;
        r_mkey = mca_memheap_base_get_cached_mkey(dst,
                                                     //(unsigned long) dst_addr,
                                                     dst_addr,
                                                     ptl_id,
                                                     &rva);
        if (!r_mkey) {
            SPML_ERROR("pe=%d: %p is not address of shared variable",
                       dst, dst_addr);
            oshmem_shmem_abort(-1);
            return OSHMEM_ERROR;
        }
    }

#if SPML_IKRIT_PUT_DEBUG == 1
    SPML_VERBOSE(100, "put: pe:%d ptl=%d dst=%p <- src: %p sz=%d. dst_rva=%p, %s",
            dst, ptl_id, dst_addr, src_addr, (int)size, (void *)rva, mca_spml_base_mkey2str(r_mkey));
#endif

    /* fill out request */
    mxm_req.base.mq = mca_spml_ikrit.mxm_mq;
#if MXM_API <  MXM_VERSION(2,0)
    mxm_req.base.flags = MXM_REQ_FLAG_BLOCKING;
#else
    mxm_req.flags = MXM_REQ_SEND_FLAG_BLOCKING;
#endif
    mxm_req.base.conn = mca_spml_ikrit.mxm_peers[dst]->mxm_conn;
    mxm_req.base.data_type = MXM_REQ_DATA_BUFFER;
    mxm_req.base.data.buffer.ptr = src_addr;
    mxm_req.base.data.buffer.length = size;
    mxm_req.base.completed_cb = 0;
    mxm_req.base.context = 0;
    mxm_req.opcode = MXM_REQ_OP_PUT;
    mxm_req.op.mem.remote_vaddr = (intptr_t) rva;
    mxm_req.base.state = MXM_REQ_NEW;
    mxm_req.base.error = MXM_OK;

#if MXM_API < MXM_VERSION(2, 0)
    mxm_req.base.data.buffer.memh = NULL;
    mxm_req.op.mem.remote_memh = NULL;
#else
    mxm_req.op.mem.remote_mkey = to_mxm_mkey(r_mkey);
#endif

    if (mca_spml_ikrit.mxm_peers[dst]->need_fence == 0) {
        opal_list_append(&mca_spml_ikrit.active_peers,
                         &mca_spml_ikrit.mxm_peers[dst]->super);
        mca_spml_ikrit.mxm_peers[dst]->need_fence = 1;
    }

    SPML_IKRIT_MXM_POST_SEND(mxm_req);

    wait.req = &mxm_req.base;
    wait.state = (mxm_req_state_t)(MXM_REQ_SENT | MXM_REQ_COMPLETED);
    wait.progress_cb = NULL;
    wait.progress_arg = NULL;
    mxm_wait(&wait);

    return OSHMEM_SUCCESS;
}
コード例 #10
0
/**
 * TODO: using put request as handle is not good.
 */
static inline int mca_spml_ikrit_put_internal(void* dst_addr,
                                              size_t size,
                                              void* src_addr,
                                              int dst,
                                              void **handle,
                                              int zcopy)
{
    void *rva;
    mca_spml_ikrit_put_request_t *put_req;
    int ptl_id;
    sshmem_mkey_t *r_mkey;
    static int count;
    int need_progress = 0;

    if (0 >= size) {
        return OSHMEM_SUCCESS;
    }

    ptl_id = get_ptl_id(dst);
    /* Get rkey of remote PE (dst proc) which must be on memheap  */
    r_mkey = mca_memheap_base_get_cached_mkey(dst, dst_addr, ptl_id, &rva);
    if (!r_mkey) {
        SPML_ERROR("pe=%d: %p is not address of shared variable",
                   dst, dst_addr);
        oshmem_shmem_abort(-1);
        return OSHMEM_ERROR;
    }

#if SPML_IKRIT_PUT_DEBUG == 1

    SPML_VERBOSE(100, "put: pe:%d ptl=%d dst=%p <- src: %p sz=%d. dst_rva=%p, %s",
            dst, ptl_id, dst_addr, src_addr, (int)size, (void *)rva, mca_spml_base_mkey2str(r_mkey));
#endif
    if (ptl_id == MXM_PTL_SHM) {

        if (mca_memheap_base_can_local_copy(r_mkey, dst_addr)) {
            memcpy((void *) (unsigned long) rva, src_addr, size);
            /* call progress as often as we would have with regular put */
            if (++count % SPML_IKRIT_PACKETS_PER_SYNC == 0)
                mxm_progress(mca_spml_ikrit.mxm_context);
            return OSHMEM_SUCCESS;
        }
        /* segment not mapped - fallback to rmda */
        ptl_id = MXM_PTL_RDMA;
        r_mkey = mca_memheap_base_get_cached_mkey(dst, dst_addr, ptl_id, &rva);
        if (!r_mkey) {
            SPML_ERROR("pe=%d: %p is not address of shared variable",
                       dst, dst_addr);
            oshmem_shmem_abort(-1);
            return OSHMEM_ERROR;
        }
    }

#if SPML_IKRIT_PUT_DEBUG == 1
    SPML_VERBOSE(100, "put: pe:%d ptl=%d dst=%p <- src: %p sz=%d. dst_rva=%p, %s",
            dst, ptl_id, dst_addr, src_addr, (int)size, (void *)rva, mca_spml_base_mkey2str(r_mkey));
#endif

    put_req = alloc_put_req();
    if (NULL == put_req) {
        SPML_ERROR("out of put requests - aborting");
        oshmem_shmem_abort(-1);
        return OSHMEM_ERROR;
    }
    if (handle)
        *handle = put_req;

    /* fill out request */
    put_req->mxm_req.base.mq = mca_spml_ikrit.mxm_mq;
    /* request immediate responce if we are getting low on send buffers. We only get responce from remote on ack timeout.
     * Also request explicit ack once in a while  */
#if MXM_API < MXM_VERSION(2,0)
    put_req->mxm_req.opcode = MXM_REQ_OP_PUT;
    if (mca_spml_ikrit.free_list_max - mca_spml_ikrit.n_active_puts <= SPML_IKRIT_PUT_LOW_WATER ||
            (mca_spml_ikrit.mxm_peers[dst]->n_active_puts + 1) % SPML_IKRIT_PACKETS_PER_SYNC == 0) {
        put_req->mxm_req.base.flags = MXM_REQ_FLAG_SEND_SYNC;
        need_progress = 1;
    } else  {
        put_req->mxm_req.base.flags = MXM_REQ_FLAG_SEND_LAZY|MXM_REQ_FLAG_SEND_SYNC;
    }
#else
    put_req->mxm_req.flags = 0;
    if (mca_spml_ikrit.free_list_max - mca_spml_ikrit.n_active_puts <= SPML_IKRIT_PUT_LOW_WATER ||
            (int)opal_list_get_size(&mca_spml_ikrit.active_peers) > mca_spml_ikrit.unsync_conn_max ||
            (mca_spml_ikrit.mxm_peers[dst]->n_active_puts + 1) % SPML_IKRIT_PACKETS_PER_SYNC == 0) {
        need_progress = 1;
        put_req->mxm_req.opcode = MXM_REQ_OP_PUT_SYNC;
    } else  {
        put_req->mxm_req.opcode = MXM_REQ_OP_PUT;
    }
    if (!zcopy) {
        if (size < mca_spml_ikrit.put_zcopy_threshold) {
            put_req->mxm_req.flags |= MXM_REQ_SEND_FLAG_BLOCKING;
        } else {
            put_req->mxm_req.opcode = MXM_REQ_OP_PUT_SYNC;
        }
    }
#endif

    put_req->mxm_req.base.conn = mca_spml_ikrit.mxm_peers[dst]->mxm_conn;
    put_req->mxm_req.base.data_type = MXM_REQ_DATA_BUFFER;
    put_req->mxm_req.base.data.buffer.ptr = src_addr;
    put_req->mxm_req.base.data.buffer.length = size;
    put_req->mxm_req.base.completed_cb = put_completion_cb;
    put_req->mxm_req.base.context = put_req;
    put_req->mxm_req.op.mem.remote_vaddr = (intptr_t) rva;
    put_req->mxm_req.base.state = MXM_REQ_NEW;
    put_req->pe = dst;

#if MXM_API < MXM_VERSION(2,0)
    put_req->mxm_req.base.data.buffer.memh = NULL;
    put_req->mxm_req.op.mem.remote_memh = NULL;
#else
    put_req->mxm_req.op.mem.remote_mkey = to_mxm_mkey(r_mkey);
#endif

    OPAL_THREAD_ADD32(&mca_spml_ikrit.n_active_puts, 1);
    if (mca_spml_ikrit.mxm_peers[dst]->need_fence == 0) {
        opal_list_append(&mca_spml_ikrit.active_peers,
                         &mca_spml_ikrit.mxm_peers[dst]->super);
        mca_spml_ikrit.mxm_peers[dst]->need_fence = 1;
    }

    mca_spml_ikrit.mxm_peers[dst]->n_active_puts++;

    SPML_IKRIT_MXM_POST_SEND(put_req->mxm_req);

    if (need_progress)
        mxm_progress(mca_spml_ikrit.mxm_context);

    return OSHMEM_SUCCESS;
}