int mca_spml_ikrit_oob_get_mkeys(int pe, uint32_t seg, sshmem_mkey_t *mkeys) { int ptl; ptl = get_ptl_id(pe); if (ptl < 0) return OSHMEM_ERROR; if (ptl != MXM_PTL_RDMA) return OSHMEM_ERROR; /* we are actually registering memory in 2.0 and later. * So can only skip mkey exchange when ud is the only transport */ if (mca_spml_ikrit.ud_only) { /* assumes that remote has the same va_base as we do */ mkeys[ptl].len = 0; mkeys[ptl].va_base = mca_memheap_seg2base_va(seg); mkeys[ptl].u.key = MAP_SEGMENT_SHM_INVALID; mca_spml_ikrit_cache_mkeys(&mkeys[ptl], seg, pe, ptl); return OSHMEM_SUCCESS; } return OSHMEM_ERROR; }
static inline int mca_spml_ikrit_get_shm(void *src_addr, size_t size, void *dst_addr, int src) { int ptl_id; void *rva; ptl_id = get_ptl_id(src); /** * Get the address to the remote rkey. **/ if (ptl_id != MXM_PTL_SHM) return OSHMEM_ERROR; if (NULL != mca_spml_ikrit_get_mkey(src, src_addr, MXM_PTL_SHM, &rva, &mca_spml_ikrit)) return OSHMEM_ERROR; SPML_VERBOSE_FASTPATH(100, "shm get: pe:%d src=%p -> dst: %p sz=%d. src_rva=%p", src, src_addr, dst_addr, (int)size, (void *)rva); memcpy(dst_addr, (void *) (unsigned long) rva, size); opal_progress(); return OSHMEM_SUCCESS; }
static inline int mca_spml_ikrit_get_shm(void *src_addr, size_t size, void *dst_addr, int src) { int ptl_id; void *rva; sshmem_mkey_t *r_mkey; ptl_id = get_ptl_id(src); /** * Get the address to the remote rkey. **/ if (ptl_id != MXM_PTL_SHM) return OSHMEM_ERROR; r_mkey = mca_memheap_base_get_cached_mkey(src, src_addr, ptl_id, &rva); if (!r_mkey) { SPML_ERROR("pe=%d: %p is not address of shared variable", src, src_addr); oshmem_shmem_abort(-1); return OSHMEM_ERROR; } if (!mca_memheap_base_can_local_copy(r_mkey, src_addr)) return OSHMEM_ERROR; SPML_VERBOSE(100, "shm get: pe:%d src=%p -> dst: %p sz=%d. src_rva=%p, %s", src, src_addr, dst_addr, (int)size, (void *)rva, mca_spml_base_mkey2str(r_mkey)); memcpy(dst_addr, (void *) (unsigned long) rva, size); opal_progress(); return OSHMEM_SUCCESS; }
int mca_spml_ikrit_oob_get_mkeys(int pe, uint32_t seg, sshmem_mkey_t *mkeys) { int ptl; ptl = get_ptl_id(pe); if (ptl < 0) return OSHMEM_ERROR; if (ptl != MXM_PTL_RDMA) return OSHMEM_ERROR; #if MXM_API < MXM_VERSION(2,0) if (seg > 1) return OSHMEM_ERROR; mkeys[ptl].len = 0; mkeys[ptl].u.key = MAP_SEGMENT_SHM_INVALID; return OSHMEM_SUCCESS; #else /* we are actually registering memory in 2.0 and later. * So can only skip mkey exchange when ud is the only transport */ if (mca_spml_ikrit.ud_only) { mkeys[ptl].len = 0; mkeys[ptl].u.key = MAP_SEGMENT_SHM_INVALID; return OSHMEM_SUCCESS; } return OSHMEM_ERROR; #endif }
static int mca_spml_ikrit_get_helper(mxm_send_req_t *sreq, void *src_addr, size_t size, void *dst_addr, int src) { /* shmem spec states that get() operations are blocking. So it is enough to have single mxm request. Also we count on mxm doing copy */ void *rva; sshmem_mkey_t *r_mkey; int ptl_id; ptl_id = get_ptl_id(src); /* already tried to send via shm and failed. go via rdma */ if (ptl_id == MXM_PTL_SHM) ptl_id = MXM_PTL_RDMA; /** * Get the address to the remote rkey. **/ r_mkey = mca_memheap.memheap_get_cached_mkey(src, src_addr, ptl_id, &rva); if (!r_mkey) { SPML_ERROR("pe=%d: %p is not address of shared variable", src, src_addr); oshmem_shmem_abort(-1); return OSHMEM_ERROR; } SPML_VERBOSE(100, "get: pe:%d ptl=%d src=%p -> dst: %p sz=%d. src_rva=%p, %s", src, ptl_id, src_addr, dst_addr, (int)size, (void *)rva, mca_spml_base_mkey2str(r_mkey)); /* mxm does not really cares for get lkey */ sreq->base.mq = mca_spml_ikrit.mxm_mq; sreq->base.conn = mca_spml_ikrit.mxm_peers[src]->mxm_conn; sreq->base.data_type = MXM_REQ_DATA_BUFFER; sreq->base.data.buffer.ptr = dst_addr; sreq->base.data.buffer.length = size; #if MXM_API < MXM_VERSION(2,0) sreq->base.data.buffer.memh = NULL; sreq->op.mem.remote_memh = NULL; #else sreq->op.mem.remote_mkey = to_mxm_mkey(r_mkey); #endif sreq->opcode = MXM_REQ_OP_GET; sreq->op.mem.remote_vaddr = (intptr_t) rva; sreq->base.state = MXM_REQ_NEW; return OSHMEM_SUCCESS; }
/* simple buffered put implementation. NOT IN USE * Problems: * - slighly worse performance than impl based on non buffered put * - fence complexity is O(n_active_connections) instead of O(n_connections_with_outstanding_puts). * Later is bounded by the network RTT & mxm ack timer. */ int mca_spml_ikrit_put_simple(void* dst_addr, size_t size, void* src_addr, int dst) { void *rva; mxm_send_req_t mxm_req; mxm_wait_t wait; int ptl_id; mxm_mem_key_t *mkey; static int count; ptl_id = get_ptl_id(dst); mkey = mca_spml_ikrit_get_mkey(dst, dst_addr, ptl_id, &rva, &mca_spml_ikrit); SPML_VERBOSE_FASTPATH(100, "put: pe:%d ptl=%d dst=%p <- src: %p sz=%d. dst_rva=%p, %s", dst, ptl_id, dst_addr, src_addr, (int)size, (void *)rva); if (NULL == mkey) { memcpy((void *) (unsigned long) rva, src_addr, size); /* call progress as often as we would have with regular put */ if (++count % SPML_IKRIT_PACKETS_PER_SYNC == 0) mxm_progress(mca_spml_ikrit.mxm_context); return OSHMEM_SUCCESS; } SPML_VERBOSE_FASTPATH(100, "put: pe:%d ptl=%d dst=%p <- src: %p sz=%d. dst_rva=%p, %s", dst, MXM_PTL_RDMA, dst_addr, src_addr, (int)size, (void *)rva); /* fill out request */ mxm_req.base.mq = mca_spml_ikrit.mxm_mq; mxm_req.flags = MXM_REQ_SEND_FLAG_BLOCKING; mxm_req.base.conn = mca_spml_ikrit.mxm_peers[dst].mxm_conn; mxm_req.base.data_type = MXM_REQ_DATA_BUFFER; mxm_req.base.data.buffer.ptr = src_addr; mxm_req.base.data.buffer.length = size; mxm_req.base.completed_cb = 0; mxm_req.base.context = 0; mxm_req.opcode = MXM_REQ_OP_PUT; mxm_req.op.mem.remote_vaddr = (intptr_t) rva; mxm_req.base.state = MXM_REQ_NEW; mxm_req.base.error = MXM_OK; mxm_req.op.mem.remote_mkey = mkey; if (mca_spml_ikrit.mxm_peers[dst].need_fence == 0) { opal_list_append(&mca_spml_ikrit.active_peers, &mca_spml_ikrit.mxm_peers[dst].link); mca_spml_ikrit.mxm_peers[dst].need_fence = 1; } SPML_IKRIT_MXM_POST_SEND(mxm_req); wait.req = &mxm_req.base; wait.state = (mxm_req_state_t)(MXM_REQ_SENT | MXM_REQ_COMPLETED); wait.progress_cb = NULL; wait.progress_arg = NULL; mxm_wait(&wait); return OSHMEM_SUCCESS; }
/** * TODO: using put request as handle is not good. */ static inline int mca_spml_ikrit_put_internal(void* dst_addr, size_t size, void* src_addr, int dst, void **handle, int zcopy) { void *rva; mca_spml_ikrit_put_request_t *put_req; int ptl_id; static int count; int need_progress = 0; mxm_mem_key_t *mkey; if (OPAL_UNLIKELY(0 >= size)) { return OSHMEM_SUCCESS; } ptl_id = get_ptl_id(dst); mkey = mca_spml_ikrit_get_mkey(dst, dst_addr, ptl_id, &rva, &mca_spml_ikrit); if (OPAL_UNLIKELY(NULL == mkey)) { memcpy((void *) (unsigned long) rva, src_addr, size); /* call progress as often as we would have with regular put */ if (++count % SPML_IKRIT_PACKETS_PER_SYNC == 0) mxm_progress(mca_spml_ikrit.mxm_context); return OSHMEM_SUCCESS; } SPML_VERBOSE_FASTPATH(100, "put: pe:%d ptl=%d dst=%p <- src: %p sz=%d. dst_rva=%p, %s", dst, ptl_id, dst_addr, src_addr, (int)size, (void *)rva); put_req = alloc_put_req(); if (handle) *handle = put_req; /* fill out request */ put_req->mxm_req.base.mq = mca_spml_ikrit.mxm_mq; /* request immediate responce if we are getting low on send buffers. We only get responce from remote on ack timeout. * Also request explicit ack once in a while */ put_req->mxm_req.flags = 0; if (mca_spml_ikrit.free_list_max - mca_spml_ikrit.n_active_puts <= SPML_IKRIT_PUT_LOW_WATER || (int)opal_list_get_size(&mca_spml_ikrit.active_peers) > mca_spml_ikrit.unsync_conn_max || (mca_spml_ikrit.mxm_peers[dst].n_active_puts + 1) % SPML_IKRIT_PACKETS_PER_SYNC == 0) { need_progress = 1; put_req->mxm_req.opcode = MXM_REQ_OP_PUT_SYNC; } else { put_req->mxm_req.opcode = MXM_REQ_OP_PUT; } if (!zcopy) { if (size < mca_spml_ikrit.put_zcopy_threshold) { put_req->mxm_req.flags |= MXM_REQ_SEND_FLAG_BLOCKING; } else { put_req->mxm_req.opcode = MXM_REQ_OP_PUT_SYNC; } } put_req->mxm_req.base.conn = mca_spml_ikrit.mxm_peers[dst].mxm_conn; put_req->mxm_req.base.data_type = MXM_REQ_DATA_BUFFER; put_req->mxm_req.base.data.buffer.ptr = src_addr; put_req->mxm_req.base.data.buffer.length = size; put_req->mxm_req.base.completed_cb = put_completion_cb; put_req->mxm_req.base.context = put_req; put_req->mxm_req.op.mem.remote_vaddr = (intptr_t) rva; put_req->mxm_req.base.state = MXM_REQ_NEW; put_req->pe = dst; put_req->mxm_req.op.mem.remote_mkey = mkey; OPAL_THREAD_ADD_FETCH32(&mca_spml_ikrit.n_active_puts, 1); if (mca_spml_ikrit.mxm_peers[dst].need_fence == 0) { opal_list_append(&mca_spml_ikrit.active_peers, &mca_spml_ikrit.mxm_peers[dst].link); mca_spml_ikrit.mxm_peers[dst].need_fence = 1; } mca_spml_ikrit.mxm_peers[dst].n_active_puts++; SPML_IKRIT_MXM_POST_SEND(put_req->mxm_req); if (need_progress) mxm_progress(mca_spml_ikrit.mxm_context); return OSHMEM_SUCCESS; }
/* simple buffered put implementation. NOT IN USE * Problems: * - slighly worse performance than impl based on non buffered put * - fence complexity is O(n_active_connections) instead of O(n_connections_with_outstanding_puts). * Later is bounded by the network RTT & mxm ack timer. */ int mca_spml_ikrit_put_simple(void* dst_addr, size_t size, void* src_addr, int dst) { void *rva; mxm_send_req_t mxm_req; mxm_wait_t wait; int ptl_id; sshmem_mkey_t *r_mkey; static int count; ptl_id = get_ptl_id(dst); /* Get rkey of remote PE (dst proc) which must be on memheap */ r_mkey = mca_memheap_base_get_cached_mkey(dst, dst_addr, ptl_id, &rva); if (!r_mkey) { SPML_ERROR("pe=%d: %p is not address of shared variable", dst, dst_addr); oshmem_shmem_abort(-1); return OSHMEM_ERROR; } #if SPML_IKRIT_PUT_DEBUG == 1 SPML_VERBOSE(100, "put: pe:%d ptl=%d dst=%p <- src: %p sz=%d. dst_rva=%p, %s", dst, ptl_id, dst_addr, src_addr, (int)size, (void *)rva, mca_spml_base_mkey2str(r_mkey)); #endif if (ptl_id == MXM_PTL_SHM) { if (mca_memheap_base_can_local_copy(r_mkey, dst_addr)) { memcpy((void *) (unsigned long) rva, src_addr, size); /* call progress as often as we would have with regular put */ if (++count % SPML_IKRIT_PACKETS_PER_SYNC == 0) mxm_progress(mca_spml_ikrit.mxm_context); return OSHMEM_SUCCESS; } /* segment not mapped - fallback to rmda */ ptl_id = MXM_PTL_RDMA; r_mkey = mca_memheap_base_get_cached_mkey(dst, //(unsigned long) dst_addr, dst_addr, ptl_id, &rva); if (!r_mkey) { SPML_ERROR("pe=%d: %p is not address of shared variable", dst, dst_addr); oshmem_shmem_abort(-1); return OSHMEM_ERROR; } } #if SPML_IKRIT_PUT_DEBUG == 1 SPML_VERBOSE(100, "put: pe:%d ptl=%d dst=%p <- src: %p sz=%d. dst_rva=%p, %s", dst, ptl_id, dst_addr, src_addr, (int)size, (void *)rva, mca_spml_base_mkey2str(r_mkey)); #endif /* fill out request */ mxm_req.base.mq = mca_spml_ikrit.mxm_mq; #if MXM_API < MXM_VERSION(2,0) mxm_req.base.flags = MXM_REQ_FLAG_BLOCKING; #else mxm_req.flags = MXM_REQ_SEND_FLAG_BLOCKING; #endif mxm_req.base.conn = mca_spml_ikrit.mxm_peers[dst]->mxm_conn; mxm_req.base.data_type = MXM_REQ_DATA_BUFFER; mxm_req.base.data.buffer.ptr = src_addr; mxm_req.base.data.buffer.length = size; mxm_req.base.completed_cb = 0; mxm_req.base.context = 0; mxm_req.opcode = MXM_REQ_OP_PUT; mxm_req.op.mem.remote_vaddr = (intptr_t) rva; mxm_req.base.state = MXM_REQ_NEW; mxm_req.base.error = MXM_OK; #if MXM_API < MXM_VERSION(2, 0) mxm_req.base.data.buffer.memh = NULL; mxm_req.op.mem.remote_memh = NULL; #else mxm_req.op.mem.remote_mkey = to_mxm_mkey(r_mkey); #endif if (mca_spml_ikrit.mxm_peers[dst]->need_fence == 0) { opal_list_append(&mca_spml_ikrit.active_peers, &mca_spml_ikrit.mxm_peers[dst]->super); mca_spml_ikrit.mxm_peers[dst]->need_fence = 1; } SPML_IKRIT_MXM_POST_SEND(mxm_req); wait.req = &mxm_req.base; wait.state = (mxm_req_state_t)(MXM_REQ_SENT | MXM_REQ_COMPLETED); wait.progress_cb = NULL; wait.progress_arg = NULL; mxm_wait(&wait); return OSHMEM_SUCCESS; }
/** * TODO: using put request as handle is not good. */ static inline int mca_spml_ikrit_put_internal(void* dst_addr, size_t size, void* src_addr, int dst, void **handle, int zcopy) { void *rva; mca_spml_ikrit_put_request_t *put_req; int ptl_id; sshmem_mkey_t *r_mkey; static int count; int need_progress = 0; if (0 >= size) { return OSHMEM_SUCCESS; } ptl_id = get_ptl_id(dst); /* Get rkey of remote PE (dst proc) which must be on memheap */ r_mkey = mca_memheap_base_get_cached_mkey(dst, dst_addr, ptl_id, &rva); if (!r_mkey) { SPML_ERROR("pe=%d: %p is not address of shared variable", dst, dst_addr); oshmem_shmem_abort(-1); return OSHMEM_ERROR; } #if SPML_IKRIT_PUT_DEBUG == 1 SPML_VERBOSE(100, "put: pe:%d ptl=%d dst=%p <- src: %p sz=%d. dst_rva=%p, %s", dst, ptl_id, dst_addr, src_addr, (int)size, (void *)rva, mca_spml_base_mkey2str(r_mkey)); #endif if (ptl_id == MXM_PTL_SHM) { if (mca_memheap_base_can_local_copy(r_mkey, dst_addr)) { memcpy((void *) (unsigned long) rva, src_addr, size); /* call progress as often as we would have with regular put */ if (++count % SPML_IKRIT_PACKETS_PER_SYNC == 0) mxm_progress(mca_spml_ikrit.mxm_context); return OSHMEM_SUCCESS; } /* segment not mapped - fallback to rmda */ ptl_id = MXM_PTL_RDMA; r_mkey = mca_memheap_base_get_cached_mkey(dst, dst_addr, ptl_id, &rva); if (!r_mkey) { SPML_ERROR("pe=%d: %p is not address of shared variable", dst, dst_addr); oshmem_shmem_abort(-1); return OSHMEM_ERROR; } } #if SPML_IKRIT_PUT_DEBUG == 1 SPML_VERBOSE(100, "put: pe:%d ptl=%d dst=%p <- src: %p sz=%d. dst_rva=%p, %s", dst, ptl_id, dst_addr, src_addr, (int)size, (void *)rva, mca_spml_base_mkey2str(r_mkey)); #endif put_req = alloc_put_req(); if (NULL == put_req) { SPML_ERROR("out of put requests - aborting"); oshmem_shmem_abort(-1); return OSHMEM_ERROR; } if (handle) *handle = put_req; /* fill out request */ put_req->mxm_req.base.mq = mca_spml_ikrit.mxm_mq; /* request immediate responce if we are getting low on send buffers. We only get responce from remote on ack timeout. * Also request explicit ack once in a while */ #if MXM_API < MXM_VERSION(2,0) put_req->mxm_req.opcode = MXM_REQ_OP_PUT; if (mca_spml_ikrit.free_list_max - mca_spml_ikrit.n_active_puts <= SPML_IKRIT_PUT_LOW_WATER || (mca_spml_ikrit.mxm_peers[dst]->n_active_puts + 1) % SPML_IKRIT_PACKETS_PER_SYNC == 0) { put_req->mxm_req.base.flags = MXM_REQ_FLAG_SEND_SYNC; need_progress = 1; } else { put_req->mxm_req.base.flags = MXM_REQ_FLAG_SEND_LAZY|MXM_REQ_FLAG_SEND_SYNC; } #else put_req->mxm_req.flags = 0; if (mca_spml_ikrit.free_list_max - mca_spml_ikrit.n_active_puts <= SPML_IKRIT_PUT_LOW_WATER || (int)opal_list_get_size(&mca_spml_ikrit.active_peers) > mca_spml_ikrit.unsync_conn_max || (mca_spml_ikrit.mxm_peers[dst]->n_active_puts + 1) % SPML_IKRIT_PACKETS_PER_SYNC == 0) { need_progress = 1; put_req->mxm_req.opcode = MXM_REQ_OP_PUT_SYNC; } else { put_req->mxm_req.opcode = MXM_REQ_OP_PUT; } if (!zcopy) { if (size < mca_spml_ikrit.put_zcopy_threshold) { put_req->mxm_req.flags |= MXM_REQ_SEND_FLAG_BLOCKING; } else { put_req->mxm_req.opcode = MXM_REQ_OP_PUT_SYNC; } } #endif put_req->mxm_req.base.conn = mca_spml_ikrit.mxm_peers[dst]->mxm_conn; put_req->mxm_req.base.data_type = MXM_REQ_DATA_BUFFER; put_req->mxm_req.base.data.buffer.ptr = src_addr; put_req->mxm_req.base.data.buffer.length = size; put_req->mxm_req.base.completed_cb = put_completion_cb; put_req->mxm_req.base.context = put_req; put_req->mxm_req.op.mem.remote_vaddr = (intptr_t) rva; put_req->mxm_req.base.state = MXM_REQ_NEW; put_req->pe = dst; #if MXM_API < MXM_VERSION(2,0) put_req->mxm_req.base.data.buffer.memh = NULL; put_req->mxm_req.op.mem.remote_memh = NULL; #else put_req->mxm_req.op.mem.remote_mkey = to_mxm_mkey(r_mkey); #endif OPAL_THREAD_ADD32(&mca_spml_ikrit.n_active_puts, 1); if (mca_spml_ikrit.mxm_peers[dst]->need_fence == 0) { opal_list_append(&mca_spml_ikrit.active_peers, &mca_spml_ikrit.mxm_peers[dst]->super); mca_spml_ikrit.mxm_peers[dst]->need_fence = 1; } mca_spml_ikrit.mxm_peers[dst]->n_active_puts++; SPML_IKRIT_MXM_POST_SEND(put_req->mxm_req); if (need_progress) mxm_progress(mca_spml_ikrit.mxm_context); return OSHMEM_SUCCESS; }