static int __reg_segment(map_segment_t *s, int *num_btl) { int rc = OSHMEM_SUCCESS; int my_pe; int nprocs; nprocs = oshmem_num_procs(); my_pe = oshmem_my_proc_id(); s->mkeys_cache = (mca_spml_mkey_t **) calloc(nprocs, sizeof(mca_spml_mkey_t *)); if (NULL == s->mkeys_cache) { MEMHEAP_ERROR("Failed to allocate memory for remote segments"); rc = OSHMEM_ERROR; } if (!rc) { s->mkeys = MCA_SPML_CALL(register((void *)(unsigned long)s->start, s->end - s->start, MEMHEAP_SHM_CODE(s->type, s->shmid), num_btl)); if (NULL == s->mkeys) { free(s->mkeys_cache); s->mkeys_cache = NULL; MEMHEAP_ERROR("Failed to register segment"); rc = OSHMEM_ERROR; } }
static int __load_segments(void) { FILE *fp; char line[1024]; struct map_segment_desc seg; memheap_context.n_segments = 0; fp = fopen("/proc/self/maps", "r"); if (NULL == fp) { MEMHEAP_ERROR("Failed to open /proc/self/maps"); return OSHMEM_ERROR; } while (NULL != fgets(line, sizeof(line), fp)) { memset(&seg, 0, sizeof(seg)); sscanf(line, "%llx-%llx %s %llx %s %llx %s", (long long *) &seg.start, (long long *) &seg.end, seg.perms, (long long *) &seg.offset, seg.dev, (long long *) &seg.inode, seg.pathname); if (OSHMEM_ERROR == __check_address(&seg)) continue; if (OSHMEM_ERROR == __check_pathname(&seg)) continue; if (OSHMEM_ERROR == __check_perms(&seg)) continue; MEMHEAP_VERBOSE(5, "add: %s", line); if (MCA_MEMHEAP_MAX_SEGMENTS <= memheap_context.n_segments) { MEMHEAP_ERROR("too many segments (max = %d): skip %s", MCA_MEMHEAP_MAX_SEGMENTS, line); continue; } if (memheap_context.n_segments > 0 && seg.start == memheap_context.mem_segs[memheap_context.n_segments - 1].end) { MEMHEAP_VERBOSE(5, "Coalescing segment"); memheap_context.mem_segs[memheap_context.n_segments - 1].end = seg.end; } else { memheap_context.mem_segs[memheap_context.n_segments].start = seg.start; memheap_context.mem_segs[memheap_context.n_segments].end = seg.end; memheap_context.n_segments++; } } fclose(fp); return OSHMEM_SUCCESS; }
static int oshmem_mkey_recv_cb(void) { MPI_Status status; int flag; int n; int rc; opal_buffer_t *msg; int32_t size; void *tmp_buf; oob_comm_request_t *r; n = 0; r = (oob_comm_request_t *)opal_list_get_first(&memheap_oob.req_list); assert(r); while (1) { my_MPI_Test(&r->recv_req, &flag, &status); if (OPAL_LIKELY(0 == flag)) { return n; } MPI_Get_count(&status, MPI_BYTE, &size); MEMHEAP_VERBOSE(5, "OOB request from PE: %d, size %d", status.MPI_SOURCE, size); n++; opal_list_remove_first(&memheap_oob.req_list); /* to avoid deadlock we must start request * before processing it. Data are copied to * the tmp buffer */ tmp_buf = malloc(size); if (NULL == tmp_buf) { MEMHEAP_ERROR("not enough memory"); ORTE_ERROR_LOG(0); return n; } memcpy(tmp_buf, (void*)&r->buf, size); msg = OBJ_NEW(opal_buffer_t); if (NULL == msg) { MEMHEAP_ERROR("not enough memory"); ORTE_ERROR_LOG(0); return n; } opal_dss.load(msg, (void*)tmp_buf, size); rc = MPI_Start(&r->recv_req); if (MPI_SUCCESS != rc) { MEMHEAP_ERROR("Failed to post recv request %d", rc); ORTE_ERROR_LOG(rc); return n; } opal_list_append(&memheap_oob.req_list, &r->super); do_recv(status.MPI_SOURCE, msg); OBJ_RELEASE(msg); r = (oob_comm_request_t *)opal_list_get_first(&memheap_oob.req_list); assert(r); } return 1; }
/**
 * Obtain the memory keys of segment @p seg on PE @p pe.
 *
 * The SPML is asked first (oob_get_mkeys); when it succeeds only va_base of
 * each of the memheap_map->num_transports entries is filled in locally.
 * Otherwise a MEMHEAP_RKEY_REQ message is sent to @p pe and the caller
 * blocks on memheap_oob.cond until memheap_oob.mkeys_rcvd becomes non-zero.
 *
 * @param pe     remote PE
 * @param seg    segment number
 * @param mkeys  output array; published through memheap_oob.mkeys while
 *               the request is outstanding
 * @return OSHMEM_SUCCESS when the keys are available, OSHMEM_ERROR when the
 *         request could not be built or sent, or the answer was not
 *         MEMHEAP_RKEY_RESP.
 */
static int memheap_oob_get_mkeys(int pe, uint32_t seg, sshmem_mkey_t *mkeys)
{
    opal_buffer_t *msg;
    uint8_t cmd;
    int i;
    int rc;

    if (OSHMEM_SUCCESS == MCA_SPML_CALL(oob_get_mkeys(pe, seg, mkeys))) {
        for (i = 0; i < memheap_map->num_transports; i++) {
            mkeys[i].va_base = mca_memheap_seg2base_va(seg);
            MEMHEAP_VERBOSE(5,
                            "MKEY CALCULATED BY LOCAL SPML: pe: %d tr_id: %d %s",
                            pe, i, mca_spml_base_mkey2str(&mkeys[i]));
        }
        return OSHMEM_SUCCESS;
    }

    /* The lock is taken exactly once here and stays held until one of the
     * unlock calls below; every exit path releases it exactly once. */
    OPAL_THREAD_LOCK(&memheap_oob.lck);

    memheap_oob.mkeys = mkeys;
    memheap_oob.mkeys_rcvd = 0;

    msg = OBJ_NEW(opal_buffer_t);
    if (!msg) {
        OPAL_THREAD_UNLOCK(&memheap_oob.lck);
        MEMHEAP_ERROR("failed to get msg buffer");
        return OSHMEM_ERROR;
    }

    cmd = MEMHEAP_RKEY_REQ;
    opal_dss.pack(msg, &cmd, 1, OPAL_UINT8);
    opal_dss.pack(msg, &seg, 1, OPAL_UINT32);

    rc = send_buffer(pe, msg);
    if (MPI_SUCCESS != rc) {
        OPAL_THREAD_UNLOCK(&memheap_oob.lck);
        MEMHEAP_ERROR("FAILED to send rml message %d", rc);
        return OSHMEM_ERROR;
    }

    while (!memheap_oob.mkeys_rcvd) {
        opal_condition_wait(&memheap_oob.cond, &memheap_oob.lck);
    }

    if (MEMHEAP_RKEY_RESP == memheap_oob.mkeys_rcvd) {
        rc = OSHMEM_SUCCESS;
    } else {
        MEMHEAP_ERROR("failed to get rkey seg#%d pe=%d", seg, pe);
        rc = OSHMEM_ERROR;
    }

    OPAL_THREAD_UNLOCK(&memheap_oob.lck);
    return rc;
}
/**
 * Detach the symmetric heap segment (slot HEAP_SEG_INDEX of @p map) using
 * the routine that matches the way the segment was allocated.
 *
 * A NULL @p map is ignored; an unrecognised segment type is only reported.
 */
void mca_memheap_base_alloc_exit(mca_memheap_map_t *map)
{
    map_segment_t *heap_seg;

    if (!map) {
        return;
    }

    heap_seg = &map->mem_segs[HEAP_SEG_INDEX];
    assert(heap_seg);

    if (MAP_SEGMENT_ALLOC_SHM == heap_seg->type) {
        _shm_detach(heap_seg);
    } else if (MAP_SEGMENT_ALLOC_MMAP == heap_seg->type) {
        _mmap_detach(heap_seg);
    }
#if defined(MPAGE_ENABLE) && (MPAGE_ENABLE > 0)
    else if (MAP_SEGMENT_ALLOC_IBV == heap_seg->type) {
        _ibv_detach(heap_seg);
    }
#endif /* MPAGE_ENABLE */
    else {
        MEMHEAP_ERROR("Unknown segment type: %d", (int)heap_seg->type);
    }
}
/**
 * Allocate the segment as a shared anonymous mapping placed at
 * mca_memheap_base_start_address (MAP_FIXED).
 *
 * @param s     segment descriptor to fill in
 * @param size  number of bytes to map
 * @return OSHMEM_SUCCESS, or OSHMEM_ERR_OUT_OF_RESOURCE when mmap() fails
 *         (the descriptor is left untouched in that case).
 */
static int _mmap_attach(map_segment_t *s, size_t size)
{
    void *addr = NULL;

    assert(s);

    /* An anonymous mapping has no backing file.  Portable code passes -1
     * as the descriptor: some implementations reject anything else, and 0
     * is a valid descriptor (stdin) that must not be referenced by accident. */
    addr = mmap((void *) mca_memheap_base_start_address,
                size,
                PROT_READ | PROT_WRITE,
                MAP_SHARED |
#if defined (__APPLE__)
                MAP_ANON |
#elif defined (__GNUC__)
                MAP_ANONYMOUS |
#endif
                MAP_FIXED,
                -1,
                0);

    if (MAP_FAILED == addr) {
        MEMHEAP_ERROR("Failed to mmap() %llu bytes (errno=%d)",
                      (unsigned long long)size, errno);
        return OSHMEM_ERR_OUT_OF_RESOURCE;
    }

    s->type = MAP_SEGMENT_ALLOC_MMAP;
    s->shmid = MEMHEAP_SHM_INVALID;
    s->start = addr;
    s->size = size;
    s->end = (void*)((uintptr_t)s->start + s->size);
    s->context = NULL;

    return OSHMEM_SUCCESS;
}
/**
 * Serialize the local memory keys of segment @p seg for every transport
 * known to memheap_map.
 *
 * Layout written to @p msg: transport count, then per transport its index,
 * va_base and either the 64-bit key (va_base == 0) or a length followed by
 * that many raw bytes (length may be zero, in which case no bytes follow).
 *
 * @param msg  buffer being filled
 * @param pe   destination PE; used only in the log output
 * @param seg  segment number
 * @return OSHMEM_SUCCESS, or OSHMEM_ERROR when a transport has no local key.
 */
static int pack_local_mkeys(opal_buffer_t *msg, int pe, int seg)
{
    int tr, n_transports;
    sshmem_mkey_t *key_desc;

    n_transports = memheap_map->num_transports;
    opal_dss.pack(msg, &n_transports, 1, OPAL_UINT32);
    MEMHEAP_VERBOSE(5, "found %d transports to %d", n_transports, pe);

    tr = 0;
    while (tr < n_transports) {
        key_desc = mca_memheap_base_get_mkey(mca_memheap_seg2base_va(seg), tr);
        if (NULL == key_desc) {
            MEMHEAP_ERROR("seg#%d tr_id: %d failed to find local mkey", seg, tr);
            return OSHMEM_ERROR;
        }

        opal_dss.pack(msg, &tr, 1, OPAL_UINT32);
        opal_dss.pack(msg, &key_desc->va_base, 1, OPAL_UINT64);

        if (0 != key_desc->va_base) {
            /* opaque key: length first, payload only when non-empty */
            opal_dss.pack(msg, &key_desc->len, 1, OPAL_UINT16);
            if (key_desc->len > 0) {
                opal_dss.pack(msg, key_desc->u.data, key_desc->len, OPAL_BYTE);
            }
        } else {
            opal_dss.pack(msg, &key_desc->u.key, 1, OPAL_UINT64);
        }

        MEMHEAP_VERBOSE(5, "seg#%d tr_id: %d %s",
                        seg, tr, mca_spml_base_mkey2str(key_desc));
        tr++;
    }

    return OSHMEM_SUCCESS;
}
/**
 * Set up the out-of-band key exchange machinery.
 *
 * Remembers @p map in memheap_map, constructs the lock, condition and
 * request list of memheap_oob, creates and starts MEMHEAP_RECV_REQS_MAX
 * persistent receives (any source, tag 0, on oshmem_comm_world) and finally
 * registers oshmem_mkey_recv_cb as a progress callback.
 *
 * @param map  memheap map used by the exchange code
 * @return OSHMEM_SUCCESS, or the MPI error code of the first receive that
 *         could not be created or started (nothing is undone in that case).
 */
int memheap_oob_init(mca_memheap_map_t *map)
{
    int status = OSHMEM_SUCCESS;
    int idx;
    oob_comm_request_t *req;

    memheap_map = map;

    OBJ_CONSTRUCT(&memheap_oob.lck, opal_mutex_t);
    OBJ_CONSTRUCT(&memheap_oob.cond, opal_condition_t);
    OBJ_CONSTRUCT(&memheap_oob.req_list, opal_list_t);

    idx = 0;
    while (idx < MEMHEAP_RECV_REQS_MAX) {
        req = &memheap_oob.req_pool[idx];

        status = PMPI_Recv_init(req->buf, sizeof(req->buf), MPI_BYTE,
                                MPI_ANY_SOURCE, 0, oshmem_comm_world,
                                &req->recv_req);
        if (status != MPI_SUCCESS) {
            MEMHEAP_ERROR("Failed to created recv request %d", status);
            return status;
        }

        status = PMPI_Start(&req->recv_req);
        if (status != MPI_SUCCESS) {
            MEMHEAP_ERROR("Failed to post recv request %d", status);
            return status;
        }

        opal_list_append(&memheap_oob.req_list, &req->super);
        ++idx;
    }

    opal_progress_register(oshmem_mkey_recv_cb);
    memheap_oob.is_inited = 1;

    return status;
}
/**
 * Build the MEMHEAP_RKEY_RESP answer for segment @p seg requested by @p pe.
 *
 * Written to @p msg: the response type, the number of transports to @p pe,
 * and per transport its id, key and va_base.  When the SPML provides
 * get_remote_context_size, the context size follows and, if non-zero, the
 * context bytes themselves.
 *
 * @return OSHMEM_SUCCESS, or OSHMEM_ERROR when a transport has no local key.
 */
static int do_mkey_req(opal_buffer_t *msg, int pe, int seg)
{
    uint8_t resp_type = MEMHEAP_RKEY_RESP;
    oshmem_proc_t *peer;
    int idx, n_transports, tr_id;
    mca_spml_mkey_t *mkey;

    opal_dss.pack(msg, &resp_type, 1, OPAL_UINT8);

    /* go over all transports to remote pe and pack mkeys */
    n_transports = oshmem_get_transport_count(pe);
    peer = oshmem_proc_group_find(oshmem_group_all, pe);
    opal_dss.pack(msg, &n_transports, 1, OPAL_UINT32);
    MEMHEAP_VERBOSE(5, "found %d transports to %d", n_transports, pe);

    for (idx = 0; idx < n_transports; ++idx) {
        tr_id = peer->transport_ids[idx];

        mkey = mca_memheap_base_get_mkey(__seg2base_va(seg), tr_id);
        if (NULL == mkey) {
            MEMHEAP_ERROR("seg#%d tr_id: %d failed to find local mkey", seg, tr_id);
            return OSHMEM_ERROR;
        }

        opal_dss.pack(msg, &tr_id, 1, OPAL_UINT32);
        opal_dss.pack(msg, &mkey->key, 1, OPAL_UINT64);
        opal_dss.pack(msg, &mkey->va_base, 1, OPAL_UINT64);

        if (NULL != MCA_SPML_CALL(get_remote_context_size)) {
            uint32_t context_size;

            if (NULL == mkey->spml_context) {
                context_size = 0;
            } else {
                context_size =
                    (uint32_t) MCA_SPML_CALL(get_remote_context_size(mkey->spml_context));
            }

            opal_dss.pack(msg, &context_size, 1, OPAL_UINT32);
            if (context_size != 0) {
                opal_dss.pack(msg,
                              MCA_SPML_CALL(get_remote_context(mkey->spml_context)),
                              context_size,
                              OPAL_BYTE);
            }
        }

        MEMHEAP_VERBOSE(5, "seg#%d tr_id: %d key %llx base_va %p",
                        seg, tr_id, (unsigned long long)mkey->key, mkey->va_base);
    }

    return OSHMEM_SUCCESS;
}
/**
 * Decode the memory keys sent by @p remote_pe into memheap_oob.mkeys.
 *
 * The message holds a transport count followed, per transport, by the
 * transport id, va_base and then either
 *   - a 64-bit key when va_base is 0 (the segment is additionally attached
 *     through memheap_attach_segment() if the peer is on the local node), or
 *   - a 16-bit length plus that many bytes, which are copied into freshly
 *     allocated memory and passed to the SPML rmkey_unpack hook; a zero
 *     length stores MAP_SEGMENT_SHM_INVALID as the key instead.
 *
 * Allocation failure aborts the job.
 *
 * NOTE(review): the transport id read from the message indexes
 * memheap_oob.mkeys without a range check, and unpack results are not
 * examined - the array bound is not visible here, confirm with the callers.
 */
static void unpack_remote_mkeys(opal_buffer_t *msg, int remote_pe)
{
    int32_t count;
    int32_t n_keys;
    int32_t tr_id;
    int k;
    oshmem_proc_t *peer;

    peer = oshmem_proc_group_find(oshmem_group_all, remote_pe);

    count = 1;
    opal_dss.unpack(msg, &n_keys, &count, OPAL_UINT32);

    k = 0;
    while (k < n_keys) {
        count = 1;
        opal_dss.unpack(msg, &tr_id, &count, OPAL_UINT32);
        count = 1;
        opal_dss.unpack(msg, &memheap_oob.mkeys[tr_id].va_base, &count, OPAL_UINT64);

        if (0 != memheap_oob.mkeys[tr_id].va_base) {
            count = 1;
            opal_dss.unpack(msg, &memheap_oob.mkeys[tr_id].len, &count, OPAL_UINT16);

            if (memheap_oob.mkeys[tr_id].len > 0) {
                memheap_oob.mkeys[tr_id].u.data = malloc(memheap_oob.mkeys[tr_id].len);
                if (NULL == memheap_oob.mkeys[tr_id].u.data) {
                    MEMHEAP_ERROR("Failed allocate %d bytes", memheap_oob.mkeys[tr_id].len);
                    oshmem_shmem_abort(-1);
                }
                count = memheap_oob.mkeys[tr_id].len;
                opal_dss.unpack(msg, memheap_oob.mkeys[tr_id].u.data, &count, OPAL_BYTE);
                MCA_SPML_CALL(rmkey_unpack(&memheap_oob.mkeys[tr_id], remote_pe));
            } else {
                memheap_oob.mkeys[tr_id].u.key = MAP_SEGMENT_SHM_INVALID;
            }
        } else {
            count = 1;
            opal_dss.unpack(msg, &memheap_oob.mkeys[tr_id].u.key, &count, OPAL_UINT64);
            if (OPAL_PROC_ON_LOCAL_NODE(peer->super.proc_flags)) {
                memheap_attach_segment(&memheap_oob.mkeys[tr_id], tr_id);
            }
        }

        MEMHEAP_VERBOSE(5, "tr_id: %d %s",
                        tr_id, mca_spml_base_mkey2str(&memheap_oob.mkeys[tr_id]));
        k++;
    }
}
/**
 * Serialize the local memory keys of segment @p seg into @p msg.
 *
 * @param msg      buffer being filled
 * @param pe       destination PE; selects the transport list when
 *                 @p all_trs is 0, otherwise only used in the log output
 * @param seg      segment number
 * @param all_trs  0 - pack mkeys for the transports to the given pe
 *                 1 - pack mkeys for ALL possible transports
 *                     (memheap_map->num_transports of them)
 * @return OSHMEM_SUCCESS, or OSHMEM_ERROR when a transport has no local key.
 */
static int pack_local_mkeys(opal_buffer_t *msg, int pe, int seg, int all_trs)
{
    oshmem_proc_t *peer;
    int idx, count, tr_id;
    sshmem_mkey_t *key_desc;

    /* decide which transports are going to be described */
    if (all_trs) {
        peer = NULL;
        count = memheap_map->num_transports;
    } else {
        count = oshmem_get_transport_count(pe);
        peer = oshmem_proc_group_find(oshmem_group_all, pe);
    }

    opal_dss.pack(msg, &count, 1, OPAL_UINT32);
    MEMHEAP_VERBOSE(5, "found %d transports to %d", count, pe);

    for (idx = 0; idx < count; ++idx) {
        tr_id = all_trs ? idx : peer->transport_ids[idx];

        key_desc = mca_memheap_base_get_mkey(mca_memheap_seg2base_va(seg), tr_id);
        if (NULL == key_desc) {
            MEMHEAP_ERROR("seg#%d tr_id: %d failed to find local mkey", seg, tr_id);
            return OSHMEM_ERROR;
        }

        opal_dss.pack(msg, &tr_id, 1, OPAL_UINT32);
        opal_dss.pack(msg, &key_desc->va_base, 1, OPAL_UINT64);

        if (0 != key_desc->va_base) {
            /* opaque key: length first, payload only when non-empty */
            opal_dss.pack(msg, &key_desc->len, 1, OPAL_UINT16);
            if (key_desc->len > 0) {
                opal_dss.pack(msg, key_desc->u.data, key_desc->len, OPAL_BYTE);
            }
        } else {
            opal_dss.pack(msg, &key_desc->u.key, 1, OPAL_UINT64);
        }

        MEMHEAP_VERBOSE(5, "seg#%d tr_id: %d %s",
                        seg, tr_id, mca_spml_base_mkey2str(key_desc));
    }

    return OSHMEM_SUCCESS;
}
/**
 * Attach a remote heap segment that was published through sshmem.
 *
 * Such a key is recognisable by:
 *  - key holding the segment id,
 *  - va_base being 0,
 *  - len being 0
 * (the last two are asserted).  mca_sshmem_segment_attach() is called for
 * the local heap segment descriptor and @p mkey; if va_base equals
 * (void *)-1 afterwards the failure is reported and the job is aborted.
 *
 * @param mkey   key to attach
 * @param tr_id  transport id, used only in log messages
 */
static void memheap_attach_segment(sshmem_mkey_t *mkey, int tr_id)
{
    assert(mkey->va_base == 0);
    assert(mkey->len == 0);

    MEMHEAP_VERBOSE(5,
                    "shared memory usage tr_id: %d va_base: 0x%p len: %d key %llx",
                    tr_id,
                    mkey->va_base,
                    mkey->len,
                    (unsigned long long)mkey->u.key);

    mca_sshmem_segment_attach(&(memheap_map->mem_segs[HEAP_SEG_INDEX]), mkey);

    if ((void *) mkey->va_base != (void *) -1) {
        return;
    }

    MEMHEAP_ERROR("tr_id: %d key %llx attach failed: errno = %d",
                  tr_id, (unsigned long long)mkey->u.key, errno);
    oshmem_shmem_abort(-1);
}
void mca_memheap_modex_recv_all(void) { int i; int j; int nprocs, my_pe; opal_buffer_t *msg = NULL; void *send_buffer = NULL; char *rcv_buffer = NULL; int size; int *rcv_size = NULL; int *rcv_n_transports = NULL; int *rcv_offsets = NULL; int rc = OSHMEM_SUCCESS; size_t buffer_size; if (!mca_memheap_base_key_exchange) { oshmem_shmem_barrier(); return; } nprocs = oshmem_num_procs(); my_pe = oshmem_my_proc_id(); /* buffer allocation for num_transports * message sizes and offsets */ rcv_size = (int *)malloc(nprocs * sizeof(int)); if (NULL == rcv_size) { MEMHEAP_ERROR("failed to get rcv_size buffer"); rc = OSHMEM_ERR_OUT_OF_RESOURCE; goto exit_fatal; } rcv_offsets = (int *)malloc(nprocs * sizeof(int)); if (NULL == rcv_offsets) { MEMHEAP_ERROR("failed to get rcv_offsets buffer"); rc = OSHMEM_ERR_OUT_OF_RESOURCE; goto exit_fatal; } rcv_n_transports = (int *)malloc(nprocs * sizeof(int)); if (NULL == rcv_offsets) { MEMHEAP_ERROR("failed to get rcv_offsets buffer"); rc = OSHMEM_ERR_OUT_OF_RESOURCE; goto exit_fatal; } /* serialize our own mkeys */ msg = OBJ_NEW(opal_buffer_t); if (NULL == msg) { MEMHEAP_ERROR("failed to get msg buffer"); rc = OSHMEM_ERR_OUT_OF_RESOURCE; goto exit_fatal; } for (j = 0; j < memheap_map->n_segments; j++) { pack_local_mkeys(msg, 0, j, 1); } /* we assume here that int32_t returned by opal_dss.unload * is equal to size of int we use for MPI_Allgather, MPI_Allgatherv */ assert(sizeof(int32_t) == sizeof(int)); /* Do allgather */ opal_dss.unload(msg, &send_buffer, &size); MEMHEAP_VERBOSE(1, "local keys packed into %d bytes, %d segments", size, memheap_map->n_segments); /* we need to send num_transports and message sizes separately * since message sizes depend on types of btl used */ rc = oshmem_shmem_allgather(&memheap_map->num_transports, rcv_n_transports, sizeof(int)); if (MPI_SUCCESS != rc) { MEMHEAP_ERROR("allgather failed"); goto exit_fatal; } rc = oshmem_shmem_allgather(&size, rcv_size, sizeof(int)); if (MPI_SUCCESS != rc) { 
MEMHEAP_ERROR("allgather failed"); goto exit_fatal; } /* calculating offsets (displacements) for allgatherv */ rcv_offsets[0] = 0; for (i = 1; i < nprocs; i++) { rcv_offsets[i] = rcv_offsets[i - 1] + rcv_size[i - 1]; } buffer_size = rcv_offsets[nprocs - 1] + rcv_size[nprocs - 1]; rcv_buffer = malloc (buffer_size); if (NULL == rcv_buffer) { MEMHEAP_ERROR("failed to allocate recieve buffer"); rc = OSHMEM_ERR_OUT_OF_RESOURCE; goto exit_fatal; } rc = oshmem_shmem_allgatherv(send_buffer, rcv_buffer, size, rcv_size, rcv_offsets); if (MPI_SUCCESS != rc) { free (rcv_buffer); MEMHEAP_ERROR("allgatherv failed"); goto exit_fatal; } opal_dss.load(msg, rcv_buffer, buffer_size); /* deserialize mkeys */ OPAL_THREAD_LOCK(&memheap_oob.lck); for (i = 0; i < nprocs; i++) { if (i == my_pe) { continue; } msg->unpack_ptr = (void *)((intptr_t) msg->base_ptr + rcv_offsets[i]); for (j = 0; j < memheap_map->n_segments; j++) { map_segment_t *s; s = &memheap_map->mem_segs[j]; if (NULL != s->mkeys_cache[i]) { MEMHEAP_VERBOSE(10, "PE%d: segment%d already exists, mkey will be replaced", i, j); } else { s->mkeys_cache[i] = (sshmem_mkey_t *) calloc(rcv_n_transports[i], sizeof(sshmem_mkey_t)); if (NULL == s->mkeys_cache[i]) { MEMHEAP_ERROR("PE%d: segment%d: Failed to allocate mkeys cache entry", i, j); oshmem_shmem_abort(-1); } } memheap_oob.mkeys = s->mkeys_cache[i]; unpack_remote_mkeys(msg, i); } } OPAL_THREAD_UNLOCK(&memheap_oob.lck); exit_fatal: if (rcv_size) { free(rcv_size); } if (rcv_offsets) { free(rcv_offsets); } if (rcv_n_transports) { free(rcv_n_transports); } if (send_buffer) { free(send_buffer); } if (msg) { OBJ_RELEASE(msg); } /* This function requires abort in any error case */ if (OSHMEM_SUCCESS != rc) { oshmem_shmem_abort(rc); } }
/**
 * RML receive callback for the memheap key exchange.
 *
 * The first byte of @p buffer selects the action:
 *  - MEMHEAP_RKEY_REQ: unpack the segment number, build the answer with
 *    do_mkey_req() and send it back to @p process_name;
 *  - MEMHEAP_RKEY_RESP: unpack the keys with do_mkey_resp() and wake the
 *    waiter through memheap_oob.cond;
 *  - MEMHEAP_RKEY_RESP_FAIL: record the failure and wake the waiter.
 * Malformed or unknown messages, and requests that cannot be served, are
 * answered with MEMHEAP_RKEY_RESP_FAIL.
 *
 * @param status        unused
 * @param process_name  sender of the message
 * @param buffer        received message
 * @param tag           unused
 * @param cbdata        unused
 */
static void memheap_buddy_rml_recv_cb(int status,
                                      orte_process_name_t* process_name,
                                      opal_buffer_t* buffer,
                                      orte_rml_tag_t tag,
                                      void* cbdata)
{
    MEMHEAP_VERBOSE(5, "**** get request from %u:%d", process_name->jobid, process_name->vpid);

    int32_t cnt = 1;
    int rc;
    opal_buffer_t *msg;
    uint8_t msg_type;
    uint32_t seg;

    MEMHEAP_VERBOSE(5, "unpacking %d of %d", cnt, OPAL_UINT8);
    rc = opal_dss.unpack(buffer, &msg_type, &cnt, OPAL_UINT8);
    if (ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
        goto send_fail;
    }

    switch (msg_type) {
    case MEMHEAP_RKEY_REQ:
        cnt = 1;
        rc = opal_dss.unpack(buffer, &seg, &cnt, OPAL_UINT32);
        if (ORTE_SUCCESS != rc) {
            MEMHEAP_ERROR("bad RKEY_REQ msg");
            goto send_fail;
        }

        MEMHEAP_VERBOSE(5, "*** RKEY REQ");
        msg = OBJ_NEW(opal_buffer_t);
        if (!msg) {
            MEMHEAP_ERROR("failed to get msg buffer");
            ORTE_ERROR_LOG(rc);
            return;
        }

        if (OSHMEM_SUCCESS != do_mkey_req(msg, process_name->vpid, seg)) {
            OBJ_RELEASE(msg);
            goto send_fail;
        }

        rc = orte_rml.send_buffer_nb(process_name, msg, OMPI_RML_TAG_SHMEM,
                                     orte_rml_send_callback, NULL);
        if (0 > rc) {
            MEMHEAP_ERROR("FAILED to send rml message %d", rc);
            ORTE_ERROR_LOG(rc);
            /* NOTE(review): assumes the buffer stays owned by the caller
             * when send_buffer_nb reports failure - confirm.  Without the
             * release the pointer is overwritten at send_fail and lost. */
            OBJ_RELEASE(msg);
            goto send_fail;
        }
        break;

    case MEMHEAP_RKEY_RESP:
        MEMHEAP_VERBOSE(5, "*** RKEY RESP");
        OPAL_THREAD_LOCK(&memheap_oob.lck);
        do_mkey_resp(buffer);
        memheap_oob.mkeys_rcvd = MEMHEAP_RKEY_RESP;
        opal_condition_broadcast(&memheap_oob.cond);
        OPAL_THREAD_UNLOCK(&memheap_oob.lck);
        break;

    case MEMHEAP_RKEY_RESP_FAIL:
        MEMHEAP_VERBOSE(5, "*** RKEY RESP FAIL");
        /* take the lock before touching the shared state; this case used
         * to unlock a mutex it had never locked */
        OPAL_THREAD_LOCK(&memheap_oob.lck);
        memheap_oob.mkeys_rcvd = MEMHEAP_RKEY_RESP_FAIL;
        opal_condition_broadcast(&memheap_oob.cond);
        OPAL_THREAD_UNLOCK(&memheap_oob.lck);
        break;

    default:
        MEMHEAP_VERBOSE(5, "Unknown message type %x", msg_type);
        goto send_fail;
    }
    return;

send_fail:
    msg = OBJ_NEW(opal_buffer_t);
    if (!msg) {
        MEMHEAP_ERROR("failed to get msg buffer");
        ORTE_ERROR_LOG(rc);
        return;
    }
    msg_type = MEMHEAP_RKEY_RESP_FAIL;
    opal_dss.pack(msg, &msg_type, 1, OPAL_UINT8);

    rc = orte_rml.send_buffer_nb(process_name, msg, OMPI_RML_TAG_SHMEM,
                                 orte_rml_send_callback, NULL);
    if (0 > rc) {
        MEMHEAP_ERROR("FAILED to send rml message %d", rc);
        ORTE_ERROR_LOG(rc);
        /* same ownership assumption as above */
        OBJ_RELEASE(msg);
    }
}
static int oshmem_mkey_recv_cb(void) { MPI_Status status; int flag; int n; int rc; opal_buffer_t *msg; int32_t size; void *tmp_buf; oob_comm_request_t *r; n = 0; r = (oob_comm_request_t *)opal_list_get_first(&memheap_oob.req_list); assert(r); while(r != (oob_comm_request_t *)opal_list_get_end(&memheap_oob.req_list)) { my_MPI_Test(&r->recv_req, &flag, &status); if (OPAL_LIKELY(0 == flag)) { return n; } PMPI_Get_count(&status, MPI_BYTE, &size); MEMHEAP_VERBOSE(5, "OOB request from PE: %d, size %d", status.MPI_SOURCE, size); n++; opal_list_remove_first(&memheap_oob.req_list); /* to avoid deadlock we must start request * before processing it. Data are copied to * the tmp buffer */ tmp_buf = malloc(size); if (NULL == tmp_buf) { MEMHEAP_ERROR("not enough memory"); ORTE_ERROR_LOG(0); return n; } else { memcpy(tmp_buf, (void*)&r->buf, size); msg = OBJ_NEW(opal_buffer_t); if (NULL == msg) { MEMHEAP_ERROR("not enough memory"); ORTE_ERROR_LOG(0); free(tmp_buf); return n; } opal_dss.load(msg, (void*)tmp_buf, size); /* * send reply before posting the receive request again to limit the recursion size to * number of receive requests. * send can call opal_progress which calls this function again. If recv req is started * stack size will be proportional to number of job ranks. */ do_recv(status.MPI_SOURCE, msg); OBJ_RELEASE(msg); } rc = PMPI_Start(&r->recv_req); if (MPI_SUCCESS != rc) { MEMHEAP_ERROR("Failed to post recv request %d", rc); ORTE_ERROR_LOG(rc); return n; } opal_list_append(&memheap_oob.req_list, &r->super); r = (oob_comm_request_t *)opal_list_get_first(&memheap_oob.req_list); assert(r); } return 1; }
/**
 * Process one out-of-band memheap message received from @p source_pe.
 *
 * The first byte of @p buffer selects the action:
 *  - MEMHEAP_RKEY_REQ: unpack the segment number, pack the local keys for
 *    the transports to @p source_pe and send them back as
 *    MEMHEAP_RKEY_RESP;
 *  - MEMHEAP_RKEY_RESP: unpack the remote keys and wake the waiter through
 *    memheap_oob.cond;
 *  - MEMHEAP_RKEY_RESP_FAIL: record the failure and wake the waiter.
 * Malformed or unknown messages, and requests that cannot be served, are
 * answered with MEMHEAP_RKEY_RESP_FAIL.
 *
 * @param source_pe  PE the message came from
 * @param buffer     received message
 */
static void do_recv(int source_pe, opal_buffer_t* buffer)
{
    int32_t cnt = 1;
    int rc;
    opal_buffer_t *msg;
    uint8_t msg_type;
    uint32_t seg;

    MEMHEAP_VERBOSE(5, "unpacking %d of %d", cnt, OPAL_UINT8);
    rc = opal_dss.unpack(buffer, &msg_type, &cnt, OPAL_UINT8);
    if (OPAL_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
        goto send_fail;
    }

    switch (msg_type) {
    case MEMHEAP_RKEY_REQ:
        cnt = 1;
        rc = opal_dss.unpack(buffer, &seg, &cnt, OPAL_UINT32);
        if (OPAL_SUCCESS != rc) {
            MEMHEAP_ERROR("bad RKEY_REQ msg");
            goto send_fail;
        }

        MEMHEAP_VERBOSE(5, "*** RKEY REQ");
        msg = OBJ_NEW(opal_buffer_t);
        if (!msg) {
            MEMHEAP_ERROR("failed to get msg buffer");
            ORTE_ERROR_LOG(rc);
            return;
        }

        msg_type = MEMHEAP_RKEY_RESP;
        opal_dss.pack(msg, &msg_type, 1, OPAL_UINT8);

        if (OSHMEM_SUCCESS != pack_local_mkeys(msg, source_pe, seg, 0)) {
            OBJ_RELEASE(msg);
            goto send_fail;
        }

        rc = send_buffer(source_pe, msg);
        if (MPI_SUCCESS != rc) {
            MEMHEAP_ERROR("FAILED to send rml message %d", rc);
            ORTE_ERROR_LOG(rc);
            goto send_fail;
        }
        break;

    case MEMHEAP_RKEY_RESP:
        MEMHEAP_VERBOSE(5, "*** RKEY RESP");
        OPAL_THREAD_LOCK(&memheap_oob.lck);
        unpack_remote_mkeys(buffer, source_pe);
        memheap_oob.mkeys_rcvd = MEMHEAP_RKEY_RESP;
        opal_condition_broadcast(&memheap_oob.cond);
        OPAL_THREAD_UNLOCK(&memheap_oob.lck);
        break;

    case MEMHEAP_RKEY_RESP_FAIL:
        MEMHEAP_VERBOSE(5, "*** RKEY RESP FAIL");
        /* take the lock before touching the shared state; this case used
         * to unlock a mutex it had never locked */
        OPAL_THREAD_LOCK(&memheap_oob.lck);
        memheap_oob.mkeys_rcvd = MEMHEAP_RKEY_RESP_FAIL;
        opal_condition_broadcast(&memheap_oob.cond);
        OPAL_THREAD_UNLOCK(&memheap_oob.lck);
        break;

    default:
        MEMHEAP_VERBOSE(5, "Unknown message type %x", msg_type);
        goto send_fail;
    }
    return;

send_fail:
    msg = OBJ_NEW(opal_buffer_t);
    if (!msg) {
        MEMHEAP_ERROR("failed to get msg buffer");
        ORTE_ERROR_LOG(rc);
        return;
    }
    msg_type = MEMHEAP_RKEY_RESP_FAIL;
    opal_dss.pack(msg, &msg_type, 1, OPAL_UINT8);

    rc = send_buffer(source_pe, msg);
    if (MPI_SUCCESS != rc) {
        MEMHEAP_ERROR("FAILED to send rml message %d", rc);
        ORTE_ERROR_LOG(rc);
    }
}
/**
 * Obtain the memory keys of segment @p seg on PE @p pe.
 *
 * The SPML is asked first (oob_get_mkeys); when it succeeds only va_base of
 * each of the memheap_map->num_transports entries is filled in locally.
 * Otherwise a MEMHEAP_RKEY_REQ message is sent over RML to the process with
 * vpid @p pe in our own job, and the caller blocks on memheap_oob.cond
 * until memheap_oob.mkeys_rcvd becomes non-zero.
 *
 * @param pe     remote PE
 * @param seg    segment number
 * @param mkeys  output array; published through memheap_oob.mkeys while
 *               the request is outstanding
 * @return OSHMEM_SUCCESS when the keys are available, OSHMEM_ERROR when the
 *         request could not be built or sent, or the answer was not
 *         MEMHEAP_RKEY_RESP.
 */
static int memheap_oob_get_mkeys(int pe, uint32_t seg, mca_spml_mkey_t *mkeys)
{
    orte_process_name_t name;
    opal_buffer_t *msg;
    int rc;
    uint8_t cmd;
    int i;

    if (OSHMEM_SUCCESS == MCA_SPML_CALL(oob_get_mkeys(pe, seg, mkeys))) {
        for (i = 0; i < memheap_map->num_transports; i++) {
            mkeys[i].va_base = __seg2base_va(seg);
            MEMHEAP_VERBOSE(5,
                            "MKEY CALCULATED BY LOCAL SPML: pe: %d tr_id: %d key %llx base_va %p",
                            pe, i, (unsigned long long)mkeys[i].key, mkeys[i].va_base);
        }
        return OSHMEM_SUCCESS;
    }

    /* The lock is taken exactly once here and stays held until one of the
     * unlock calls below; every exit path releases it exactly once. */
    OPAL_THREAD_LOCK(&memheap_oob.lck);

    memheap_oob.mkeys = mkeys;
    memheap_oob.mkeys_rcvd = 0;

    name.jobid = ORTE_PROC_MY_NAME->jobid;
    name.vpid = pe;

    msg = OBJ_NEW(opal_buffer_t);
    if (!msg) {
        OPAL_THREAD_UNLOCK(&memheap_oob.lck);
        MEMHEAP_ERROR("failed to get msg buffer");
        return OSHMEM_ERROR;
    }

    cmd = MEMHEAP_RKEY_REQ;
    opal_dss.pack(msg, &cmd, 1, OPAL_UINT8);
    opal_dss.pack(msg, &seg, 1, OPAL_UINT32);

    rc = orte_rml.send_buffer_nb(&name, msg, OMPI_RML_TAG_SHMEM,
                                 orte_rml_send_callback, NULL);
    if (0 > rc) {
        OBJ_RELEASE(msg);
        OPAL_THREAD_UNLOCK(&memheap_oob.lck);
        MEMHEAP_ERROR("FAILED to send rml message %d", rc);
        return OSHMEM_ERROR;
    }
    MEMHEAP_VERBOSE(5, "message sent: %d bytes!", rc);

    while (!memheap_oob.mkeys_rcvd) {
        opal_condition_wait(&memheap_oob.cond, &memheap_oob.lck);
    }

    if (MEMHEAP_RKEY_RESP == memheap_oob.mkeys_rcvd) {
        rc = OSHMEM_SUCCESS;
    } else {
        MEMHEAP_ERROR("failed to get rkey seg#%d pe=%d", seg, pe);
        rc = OSHMEM_ERROR;
    }

    OPAL_THREAD_UNLOCK(&memheap_oob.lck);
    return rc;
}
void mca_memheap_modex_recv_all(void) { int i; int j; int nprocs, my_pe; oshmem_proc_t *proc; mca_spml_mkey_t *mkey; void* dummy_rva; if (!mca_memheap_base_key_exchange) return; /* init rkey cache */ nprocs = oshmem_num_procs(); my_pe = oshmem_my_proc_id(); /* Note: * Doing exchange via rml till we figure out problem with grpcomm.modex and barrier */ for (i = 0; i < nprocs; i++) { if (i == my_pe) continue; proc = oshmem_proc_group_find(oshmem_group_all, i); for (j = 0; j < memheap_map->n_segments; j++) { mkey = mca_memheap_base_get_cached_mkey(i, memheap_map->mem_segs[j].start, proc->transport_ids[0], &dummy_rva); if (!mkey) { MEMHEAP_ERROR("Failed to receive mkeys"); oshmem_shmem_abort(-1); } } } /* * There is an issue with orte_grpcomm.barrier usage as * ess/pmi directs to use grpcomm/pmi in case slurm srun() call grpcomm/pmi calls PMI_Barrier() * that is a function of external library. * There is no opal_progress() in such way. As a result slow PEs send a request (MEMHEAP_RKEY_REQ) to * fast PEs waiting on barrier and do not get a respond (MEMHEAP_RKEY_RESP). * * there are following ways to solve one: * 1. calculate requests from remote PEs and do ORTE_PROGRESSED_WAIT waiting for expected value; * 2. use shmem_barrier_all(); * 3. rework pmi/barrier to use opal_progress(); * 4. use orte_grpcomm.barrier carefully; * * It seems there is no need to use orte_grpcomm.barrier here */ if (memheap_map->mem_segs[HEAP_SEG_INDEX].shmid != MEMHEAP_SHM_INVALID) { /* unfortunately we must do barrier here to assure that everyone are attached to our segment * good thing that this code path only invoked on older linuxes (-mca shmalloc_use_hugepages 3|4) * try to minimize damage here by waiting 5 seconds and doing progress */ shmem_barrier_all(); /* keys exchanged, segments attached, now we can safely cleanup */ if (memheap_map->mem_segs[HEAP_SEG_INDEX].type == MAP_SEGMENT_ALLOC_SHM) { shmctl(memheap_map->mem_segs[HEAP_SEG_INDEX].shmid, IPC_RMID, NULL ); } } }
/**
 * Attach a remote segment that is described by a shared-memory id.
 *
 * Only keys with va_base == 0 and a valid id encoded in key (type|shmid)
 * are handled; every other key is left untouched.  A segment of type
 * MAP_SEGMENT_ALLOC_SHM is attached with shmat(); one of type
 * MAP_SEGMENT_ALLOC_IBV is registered with ibv_reg_shared_mr() when
 * MPAGE_ENABLE is set (and is otherwise skipped); any other type aborts the
 * job.  A resulting va_base of (void *)-1 aborts the job as well.
 *
 * @param mkey   key to attach; va_base is updated on success
 * @param tr_id  transport id, used only in log messages
 */
static void memheap_attach_segment(mca_spml_mkey_t *mkey, int tr_id)
{
    /* nothing to do unless va is missing and the key carries a usable id */
    if (mkey->va_base || ((int) MEMHEAP_SHM_GET_ID(mkey->key) == MEMHEAP_SHM_INVALID)) {
        return;
    }

    MEMHEAP_VERBOSE(5,
                    "shared memory usage tr_id: %d key %llx base_va %p shmid 0x%X|0x%X",
                    tr_id,
                    (unsigned long long)mkey->key,
                    mkey->va_base,
                    MEMHEAP_SHM_GET_TYPE(mkey->key),
                    MEMHEAP_SHM_GET_ID(mkey->key));

    if (MAP_SEGMENT_ALLOC_SHM == MEMHEAP_SHM_GET_TYPE(mkey->key)) {
        mkey->va_base = shmat(MEMHEAP_SHM_GET_ID(mkey->key), 0, 0);
    } else if (MAP_SEGMENT_ALLOC_IBV == MEMHEAP_SHM_GET_TYPE(mkey->key)) {
#if defined(MPAGE_ENABLE) && (MPAGE_ENABLE > 0)
        openib_device_t *device = NULL;
        struct ibv_mr *ib_mr;
        void *addr;
        static int mr_count;

        int access_flag = IBV_ACCESS_LOCAL_WRITE |
                          IBV_ACCESS_REMOTE_WRITE |
                          IBV_ACCESS_REMOTE_READ |
                          IBV_ACCESS_NO_RDMA;

        device = (openib_device_t *)memheap_map->mem_segs[HEAP_SEG_INDEX].context;
        assert(device);

        /* workaround mtt problem - request aligned addresses */
        ++mr_count;
        addr = (void *)(mca_memheap_base_start_address + mca_memheap_base_mr_interleave_factor*1024ULL*1024ULL*1024ULL*mr_count);
        ib_mr = ibv_reg_shared_mr(MEMHEAP_SHM_GET_ID(mkey->key),
                                  device->ib_pd, addr, access_flag);
        if (NULL != ib_mr) {
            if (ib_mr->addr != addr) {
                MEMHEAP_WARN("Failed to map shared region to address %p got addr %p. Try to increase 'memheap_mr_interleave_factor' from %d", addr, ib_mr->addr, mca_memheap_base_mr_interleave_factor);
            }

            opal_value_array_append_item(&device->ib_mr_array, &ib_mr);
            mkey->va_base = ib_mr->addr;
        } else {
            mkey->va_base = (void*)-1;
            MEMHEAP_ERROR("error to ibv_reg_shared_mr() errno says %d: %s",
                          errno, strerror(errno));
        }
#endif /* MPAGE_ENABLE */
    } else {
        MEMHEAP_ERROR("tr_id: %d key %llx attach failed: incorrect shmid 0x%X|0x%X",
                      tr_id,
                      (unsigned long long)mkey->key,
                      MEMHEAP_SHM_GET_TYPE(mkey->key),
                      MEMHEAP_SHM_GET_ID(mkey->key));
        oshmem_shmem_abort(-1);
    }

    if ((void *) mkey->va_base == (void *) -1) {
        MEMHEAP_ERROR("tr_id: %d key %llx attach failed: errno = %d",
                      tr_id, (unsigned long long)mkey->key, errno);
        oshmem_shmem_abort(-1);
    }
}
/**
 * Release the verbs resources kept in the segment's device context.
 *
 * In order: every memory region stored in ib_mr_array is deregistered and
 * the array destructed, then the protection domain is deallocated, the
 * device closed and the device list freed.  After the first failure the
 * remaining stages are skipped (deregistration itself always runs over the
 * whole array).  Nothing is done when the segment has no context.
 *
 * @param s  segment whose context holds the openib_device_t
 */
static void _ibv_detach(map_segment_t *s)
{
    int status = OSHMEM_SUCCESS;
    openib_device_t *dev = NULL;

    assert(s);

    dev = (openib_device_t *)s->context;
    if (NULL == dev) {
        return;
    }

    /* status is still OSHMEM_SUCCESS here, only the size matters */
    if (opal_value_array_get_size(&dev->ib_mr_array)) {
        struct ibv_mr** regions;
        struct ibv_mr* mr = NULL;

        regions = OPAL_VALUE_ARRAY_GET_BASE(&dev->ib_mr_array, struct ibv_mr *);

        /* always drop the head entry until the array is empty */
        while (opal_value_array_get_size(&dev->ib_mr_array) > 0) {
            mr = regions[0];
            if (ibv_dereg_mr(mr)) {
                MEMHEAP_ERROR("error ibv_dereg_mr(): %d: %s", errno, strerror(errno));
                status = OSHMEM_ERROR;
            }
            opal_value_array_remove_item(&dev->ib_mr_array, 0);
        }

        if (!status && dev->ib_mr_shared) {
            dev->ib_mr_shared = NULL;
        }
        OBJ_DESTRUCT(&dev->ib_mr_array);
    }

    if (!status && dev->ib_pd) {
        if (0 == ibv_dealloc_pd(dev->ib_pd)) {
            dev->ib_pd = NULL;
        } else {
            MEMHEAP_ERROR("error ibv_dealloc_pd(): %d: %s", errno, strerror(errno));
            status = OSHMEM_ERROR;
        }
    }

    if (!status && dev->ib_dev_context) {
        if (0 == ibv_close_device(dev->ib_dev_context)) {
            dev->ib_dev_context = NULL;
        } else {
            MEMHEAP_ERROR("error ibv_close_device(): %d: %s", errno, strerror(errno));
            status = OSHMEM_ERROR;
        }
    }

    if (!status && dev->ib_devs) {
        ibv_free_device_list(dev->ib_devs);
        dev->ib_devs = NULL;
    }
}
/**
 * Allocate the symmetric heap segment of @p map.
 *
 * The segment slot map->mem_segs[map->n_segments] (asserted to be
 * HEAP_SEG_INDEX) is reset and then filled by the attach routine selected
 * through mca_memheap_base_alloc_type:
 *   0    _shm_attach(size, 0, 1)
 *   1    _shm_attach(size, 1, 1), falling back to _shm_attach(size, 0, 1)
 *   2    _shm_attach(size, 1, 1), no fallback
 *   3    _shm_attach(size, 1, 0), no fallback
 *   4    _shm_attach(size, 0, 0)
 *   5    _ibv_attach, falling back to _shm_attach(size, 0, 1)
 *        (only when MPAGE_ENABLE is set)
 *   100  _mmap_attach
 *   101  _shm_attach(size, 1, 1) with shmid invalidated afterwards,
 *        falling back to _mmap_attach
 *   102  _shm_attach(size, 1, 1) with shmid invalidated afterwards,
 *        no fallback
 *   any other value: _adaptive_attach
 *
 * @param map   memheap map receiving the segment
 * @param size  heap size in bytes
 * @return result of the attach routine; map->n_segments is incremented
 *         only on OSHMEM_SUCCESS.
 */
int mca_memheap_base_alloc_init(mca_memheap_map_t *map, size_t size)
{
    int status = OSHMEM_SUCCESS;
    int method = mca_memheap_base_alloc_type;

    assert(map);
    assert(HEAP_SEG_INDEX == map->n_segments);

    MEMHEAP_VERBOSE(5, "memheap method : %d", mca_memheap_base_alloc_type);

    map_segment_t *seg = &map->mem_segs[map->n_segments];

    /* start from a clean descriptor */
    memset(seg, 0, sizeof(*seg));
    seg->is_active = 0;
    seg->shmid = MEMHEAP_SHM_INVALID;
    seg->start = 0;
    seg->end = 0;
    seg->size = 0;
    seg->type = MAP_SEGMENT_UNKNOWN;
    seg->context = NULL;

    switch (method) {
    case 0:
        /* use sysv alloc without hugepages */
        status = _shm_attach(seg, size, 0, 1);
        break;

    case 1:
        status = _shm_attach(seg, size, 1, 1);
        if (status != OSHMEM_SUCCESS) {
            status = _shm_attach(seg, size, 0, 1);
        }
        break;

    case 2:
        /* huge pages only */
        status = _shm_attach(seg, size, 1, 1);
        if (status != OSHMEM_SUCCESS) {
            MEMHEAP_ERROR("FAILED to allocated symmetric heap using hugepages fallback is disabled, errno=%d", errno);
        }
        break;

    case 3:
        /* huge pages only + cleanup shmid */
        status = _shm_attach(seg, size, 1, 0);
        if (status != OSHMEM_SUCCESS) {
            MEMHEAP_ERROR("FAILED to allocated symmetric heap using hugepages fallback is disabled, errno=%d", errno);
        }
        break;

    case 4:
        /* use sysv alloc without hugepages */
        status = _shm_attach(seg, size, 0, 0);
        break;

#if defined(MPAGE_ENABLE) && (MPAGE_ENABLE > 0)
    case 5:
        /* use shared memory registration (mpages) */
        status = _ibv_attach(seg, size);
        if (status != OSHMEM_SUCCESS) {
            status = _shm_attach(seg, size, 0, 1);
        }
        break;
#endif /* MPAGE_ENABLE */

    case 100:
        /* use mmap. It will severely impact performance of intra node communication */
        status = _mmap_attach(seg, size);
        MEMHEAP_VERBOSE(1, "mmap() memheap allocation will severely impact performance of intra node communication");
        break;

    case 101:
        status = _shm_attach(seg, size, 1, 1);
        if (OSHMEM_SUCCESS == status) {
            seg->shmid = MEMHEAP_SHM_INVALID;
        } else {
            MEMHEAP_ERROR("Failed to allocate hugepages. Falling back on regular allocation");
            status = _mmap_attach(seg, size);
        }
        MEMHEAP_VERBOSE(1, "SM BTL will be always used for intranode comm\n");
        break;

    case 102:
        status = _shm_attach(seg, size, 1, 1);
        if (OSHMEM_SUCCESS == status) {
            seg->shmid = MEMHEAP_SHM_INVALID;
        } else {
            MEMHEAP_ERROR("FAILED to allocated symmetric heap using hugepages fallback is disabled, errno=%d", errno);
        }
        break;

    default:
        status = _adaptive_attach(seg, size);
    }

    if (status != OSHMEM_SUCCESS) {
        return status;
    }

    map->n_segments++;
    MEMHEAP_VERBOSE(1,
                    "Memheap alloc memory: %llu byte(s), %d segments by method: %d",
                    (unsigned long long)size, map->n_segments, seg->type);

    return status;
}
static int _ibv_attach(map_segment_t *s, size_t size) { int rc = OSHMEM_SUCCESS; static openib_device_t memheap_device; openib_device_t *device = &memheap_device; int num_devs = 0; assert(s); memset(device, 0, sizeof(*device)); #ifdef HAVE_IBV_GET_DEVICE_LIST device->ib_devs = ibv_get_device_list(&num_devs); #else #error unsupported ibv_get_device_list in infiniband/verbs.h #endif if (num_devs == 0 || !device->ib_devs) { rc = OSHMEM_ERR_NOT_SUPPORTED; } /* Open device */ if (!rc) { int i = 0; if (num_devs > 1) { if (NULL == mca_memheap_base_param_hca_name) { MEMHEAP_VERBOSE(5, "found %d HCAs, choosing the first", num_devs); } else { MEMHEAP_VERBOSE(5, "found %d HCAs, searching for %s", num_devs, mca_memheap_base_param_hca_name); } } for (i = 0; i < num_devs; i++) { device->ib_dev = device->ib_devs[i]; device->ib_dev_context = ibv_open_device(device->ib_dev); if (NULL == device->ib_dev_context) { MEMHEAP_ERROR("error obtaining device context for %s errno says %d: %s", ibv_get_device_name(device->ib_dev), errno, strerror(errno)); rc = OSHMEM_ERR_RESOURCE_BUSY; } else { if (NULL != mca_memheap_base_param_hca_name) { if (0 == strcmp(mca_memheap_base_param_hca_name,ibv_get_device_name(device->ib_dev))) { MEMHEAP_VERBOSE(5, "mca_memheap_base_param_hca_name = %s, selected %s as %d of %d", mca_memheap_base_param_hca_name, ibv_get_device_name(device->ib_dev), i, num_devs); rc = OSHMEM_SUCCESS; break; } } else { MEMHEAP_VERBOSE(5, "mca_memheap_base_param_hca_name = %s, selected %s as %d of %d", mca_memheap_base_param_hca_name, ibv_get_device_name(device->ib_dev), i, num_devs); rc = OSHMEM_SUCCESS; break; } } } } /* Obtain device attributes */ if (!rc) { if (ibv_query_device(device->ib_dev_context, &device->ib_dev_attr)) { MEMHEAP_ERROR("error obtaining device attributes for %s errno says %d: %s", ibv_get_device_name(device->ib_dev), errno, strerror(errno)); rc = OSHMEM_ERR_RESOURCE_BUSY; } else { MEMHEAP_VERBOSE(5, "ibv device %s", ibv_get_device_name(device->ib_dev)); } } 
/* Allocate the protection domain for the device */ if (!rc) { device->ib_pd = ibv_alloc_pd(device->ib_dev_context); if (NULL == device->ib_pd) { MEMHEAP_ERROR("error allocating protection domain for %s errno says %d: %s", ibv_get_device_name(device->ib_dev), errno, strerror(errno)); rc = OSHMEM_ERR_RESOURCE_BUSY; } } /* Allocate memory */ if (!rc) { void *addr = NULL; struct ibv_mr *ib_mr = NULL; int access_flag = IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ; OBJ_CONSTRUCT(&device->ib_mr_array, opal_value_array_t); opal_value_array_init(&device->ib_mr_array, sizeof(struct ibv_mr *)); #if defined(MPAGE_ENABLE) && (MPAGE_ENABLE > 0) access_flag |= IBV_ACCESS_ALLOCATE_MR | IBV_ACCESS_SHARED_MR_USER_READ | IBV_ACCESS_SHARED_MR_USER_WRITE; #endif /* MPAGE_ENABLE */ ib_mr = ibv_reg_mr(device->ib_pd, addr, size, access_flag); if (NULL == ib_mr) { MEMHEAP_ERROR("error to ibv_reg_mr() %llu bytes errno says %d: %s", (unsigned long long)size, errno, strerror(errno)); rc = OSHMEM_ERR_OUT_OF_RESOURCE; } else { device->ib_mr_shared = ib_mr; opal_value_array_append_item(&device->ib_mr_array, &ib_mr); } #if defined(MPAGE_ENABLE) && (MPAGE_ENABLE > 0) if (!rc) { access_flag = IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ| IBV_ACCESS_NO_RDMA; addr = (void *)mca_memheap_base_start_address; ib_mr = ibv_reg_shared_mr(device->ib_mr_shared->handle, device->ib_pd, addr, access_flag); if (NULL == ib_mr) { MEMHEAP_ERROR("error to ibv_reg_shared_mr() %llu bytes errno says %d: %s", (unsigned long long)size, errno, strerror(errno)); rc = OSHMEM_ERR_OUT_OF_RESOURCE; } else { opal_value_array_append_item(&device->ib_mr_array, &ib_mr); } } #endif /* MPAGE_ENABLE */ if (!rc) { assert(size == device->ib_mr_shared->length); s->type = MAP_SEGMENT_ALLOC_IBV; s->shmid = device->ib_mr_shared->handle; s->start = ib_mr->addr; s->size = size; s->end = (void*)((uintptr_t)s->start + s->size); s->context = &memheap_device; } } return rc; }