static int __reg_segment(map_segment_t *s, int *num_btl)
{
    int rc = OSHMEM_SUCCESS;
    int my_pe;
    int nprocs;

    nprocs = oshmem_num_procs();
    my_pe = oshmem_my_proc_id();

    s->mkeys_cache = (mca_spml_mkey_t **) calloc(nprocs,
                                                 sizeof(mca_spml_mkey_t *));
    if (NULL == s->mkeys_cache) {
        MEMHEAP_ERROR("Failed to allocate memory for remote segments");
        rc = OSHMEM_ERROR;
    }

    if (!rc) {
        s->mkeys = MCA_SPML_CALL(register((void *)(unsigned long)s->start,
                        s->end - s->start,
                        MEMHEAP_SHM_CODE(s->type, s->shmid),
                        num_btl));
        if (NULL == s->mkeys) {
            free(s->mkeys_cache);
            s->mkeys_cache = NULL;

            MEMHEAP_ERROR("Failed to register segment");
            rc = OSHMEM_ERROR;
        }
    }
static int __load_segments(void)
{
    FILE *fp;
    char line[1024];
    struct map_segment_desc seg;

    memheap_context.n_segments = 0;

    fp = fopen("/proc/self/maps", "r");
    if (NULL == fp) {
        MEMHEAP_ERROR("Failed to open /proc/self/maps");
        return OSHMEM_ERROR;
    }

    while (NULL != fgets(line, sizeof(line), fp)) {
        memset(&seg, 0, sizeof(seg));
        sscanf(line,
               "%llx-%llx %s %llx %s %llx %s",
               (long long *) &seg.start,
               (long long *) &seg.end,
               seg.perms,
               (long long *) &seg.offset,
               seg.dev,
               (long long *) &seg.inode,
               seg.pathname);

        if (OSHMEM_ERROR == __check_address(&seg))
            continue;

        if (OSHMEM_ERROR == __check_pathname(&seg))
            continue;

        if (OSHMEM_ERROR == __check_perms(&seg))
            continue;

        MEMHEAP_VERBOSE(5, "add: %s", line);
        if (MCA_MEMHEAP_MAX_SEGMENTS <= memheap_context.n_segments) {
            MEMHEAP_ERROR("too many segments (max = %d): skip %s",
                          MCA_MEMHEAP_MAX_SEGMENTS, line);
            continue;
        }
        if (memheap_context.n_segments > 0
                && seg.start
                        == memheap_context.mem_segs[memheap_context.n_segments
                                - 1].end) {
            MEMHEAP_VERBOSE(5, "Coalescing segment");
            memheap_context.mem_segs[memheap_context.n_segments - 1].end =
                    seg.end;
        } else {
            memheap_context.mem_segs[memheap_context.n_segments].start =
                    seg.start;
            memheap_context.mem_segs[memheap_context.n_segments].end = seg.end;
            memheap_context.n_segments++;
        }
    }

    fclose(fp);
    return OSHMEM_SUCCESS;
}
Exemple #3
0
static int oshmem_mkey_recv_cb(void)
{
    MPI_Status status;
    int flag;
    int n;
    int rc;
    opal_buffer_t *msg;
    int32_t size;
    void *tmp_buf;
    oob_comm_request_t *r;

    n = 0;
    r = (oob_comm_request_t *)opal_list_get_first(&memheap_oob.req_list);
    assert(r);
    while (1) {
        my_MPI_Test(&r->recv_req, &flag, &status);
        if (OPAL_LIKELY(0 == flag)) {
            return n;
        }
        MPI_Get_count(&status, MPI_BYTE, &size);
        MEMHEAP_VERBOSE(5, "OOB request from PE: %d, size %d", status.MPI_SOURCE, size);
        n++;
        opal_list_remove_first(&memheap_oob.req_list);

        /* to avoid deadlock we must start request
         * before processing it. Data are copied to
         * the tmp buffer
         */
        tmp_buf = malloc(size);
        if (NULL == tmp_buf) {
            MEMHEAP_ERROR("not enough memory");
            ORTE_ERROR_LOG(0);
            return n;
        }
        memcpy(tmp_buf, (void*)&r->buf, size);
        msg = OBJ_NEW(opal_buffer_t);
        if (NULL == msg) {
            MEMHEAP_ERROR("not enough memory");
            ORTE_ERROR_LOG(0);
            return n;
        }
        opal_dss.load(msg, (void*)tmp_buf, size);

        rc = MPI_Start(&r->recv_req);
        if (MPI_SUCCESS != rc) {
            MEMHEAP_ERROR("Failed to post recv request %d", rc);
            ORTE_ERROR_LOG(rc);
            return n;
        }
        opal_list_append(&memheap_oob.req_list, &r->super);

        do_recv(status.MPI_SOURCE, msg);
        OBJ_RELEASE(msg);

        r = (oob_comm_request_t *)opal_list_get_first(&memheap_oob.req_list);
        assert(r);
    }
    return 1;  
}
Exemple #4
0
static int memheap_oob_get_mkeys(int pe, uint32_t seg, sshmem_mkey_t *mkeys)
{
    opal_buffer_t *msg;
    uint8_t cmd;
    int i;
    int rc;

    if (OSHMEM_SUCCESS == MCA_SPML_CALL(oob_get_mkeys(pe, seg, mkeys))) {
        for (i = 0; i < memheap_map->num_transports; i++) {
            mkeys[i].va_base = mca_memheap_seg2base_va(seg);
            MEMHEAP_VERBOSE(5,
                            "MKEY CALCULATED BY LOCAL SPML: pe: %d tr_id: %d %s",
                            pe,
                            i,
                            mca_spml_base_mkey2str(&mkeys[i]));
        }
        return OSHMEM_SUCCESS;
    }

    OPAL_THREAD_LOCK(&memheap_oob.lck);

    memheap_oob.mkeys = mkeys;
    memheap_oob.mkeys_rcvd = 0;

    msg = OBJ_NEW(opal_buffer_t);
    if (!msg) {
        OPAL_THREAD_UNLOCK(&memheap_oob.lck);
        MEMHEAP_ERROR("failed to get msg buffer");
        return OSHMEM_ERROR;
    }

    OPAL_THREAD_LOCK(&memheap_oob.lck);
    cmd = MEMHEAP_RKEY_REQ;
    opal_dss.pack(msg, &cmd, 1, OPAL_UINT8);
    opal_dss.pack(msg, &seg, 1, OPAL_UINT32);

    rc = send_buffer(pe, msg);
    if (MPI_SUCCESS != rc) {
        OPAL_THREAD_UNLOCK(&memheap_oob.lck);
        MEMHEAP_ERROR("FAILED to send rml message %d", rc);
        return OSHMEM_ERROR;
    }

    while (!memheap_oob.mkeys_rcvd) {
        opal_condition_wait(&memheap_oob.cond, &memheap_oob.lck);
    }

    if (MEMHEAP_RKEY_RESP == memheap_oob.mkeys_rcvd) {
        rc = OSHMEM_SUCCESS;
    } else {
        MEMHEAP_ERROR("failed to get rkey seg#%d pe=%d", seg, pe);
        rc = OSHMEM_ERROR;
    }

    OPAL_THREAD_UNLOCK(&memheap_oob.lck);
    return rc;
}
Exemple #5
0
void mca_memheap_base_alloc_exit(mca_memheap_map_t *map)
{
    if (map) {
        map_segment_t *s = &map->mem_segs[HEAP_SEG_INDEX];

        assert(s);

        switch (s->type) {
        case MAP_SEGMENT_ALLOC_SHM:
            _shm_detach(s);
            break;

        case MAP_SEGMENT_ALLOC_MMAP:
            _mmap_detach(s);
            break;

#if defined(MPAGE_ENABLE) && (MPAGE_ENABLE > 0)
            case MAP_SEGMENT_ALLOC_IBV:
            _ibv_detach(s);
            break;
#endif /* MPAGE_ENABLE */

        default:
            MEMHEAP_ERROR("Unknown segment type: %d", (int)s->type);
        }
    }
}
Exemple #6
0
static int _mmap_attach(map_segment_t *s, size_t size)
{
    void *addr = NULL;

    assert(s);

    addr = mmap((void *) mca_memheap_base_start_address,
                size,
                PROT_READ | PROT_WRITE,
                MAP_SHARED |
#if defined (__APPLE__)
MAP_ANON |
#elif defined (__GNUC__)
MAP_ANONYMOUS |
#endif
                MAP_FIXED,
                0,
                0);

    if (MAP_FAILED == addr) {
        MEMHEAP_ERROR("Failed to mmap() %llu bytes (errno=%d)",
                      (unsigned long long)size, errno);
        return OSHMEM_ERR_OUT_OF_RESOURCE;
    }

    s->type = MAP_SEGMENT_ALLOC_MMAP;
    s->shmid = MEMHEAP_SHM_INVALID;
    s->start = addr;
    s->size = size;
    s->end = (void*)((uintptr_t)s->start + s->size);
    s->context = NULL;

    return OSHMEM_SUCCESS;
}
Exemple #7
0
static int pack_local_mkeys(opal_buffer_t *msg, int pe, int seg)
{
    int i, n;
    sshmem_mkey_t *mkey;

    /* go over all transports and pack mkeys */
    n = memheap_map->num_transports;
    opal_dss.pack(msg, &n, 1, OPAL_UINT32);
    MEMHEAP_VERBOSE(5, "found %d transports to %d", n, pe);
    for (i = 0; i < n; i++) {
        mkey = mca_memheap_base_get_mkey(mca_memheap_seg2base_va(seg), i);
        if (!mkey) {
            MEMHEAP_ERROR("seg#%d tr_id: %d failed to find local mkey",
                          seg, i);
            return OSHMEM_ERROR;
        }
        opal_dss.pack(msg, &i, 1, OPAL_UINT32);
        opal_dss.pack(msg, &mkey->va_base, 1, OPAL_UINT64);
        if (0 == mkey->va_base) {
            opal_dss.pack(msg, &mkey->u.key, 1, OPAL_UINT64);
        } else {
            opal_dss.pack(msg, &mkey->len, 1, OPAL_UINT16);
            if (0 < mkey->len) {
                opal_dss.pack(msg, mkey->u.data, mkey->len, OPAL_BYTE);
            }
        }
        MEMHEAP_VERBOSE(5,
                        "seg#%d tr_id: %d %s",
                        seg, i, mca_spml_base_mkey2str(mkey));
    }
    return OSHMEM_SUCCESS;
}
Exemple #8
0
int memheap_oob_init(mca_memheap_map_t *map)
{
    int rc = OSHMEM_SUCCESS;
    int i;
    oob_comm_request_t *r;

    memheap_map = map;

    OBJ_CONSTRUCT(&memheap_oob.lck, opal_mutex_t);
    OBJ_CONSTRUCT(&memheap_oob.cond, opal_condition_t);
    OBJ_CONSTRUCT(&memheap_oob.req_list, opal_list_t);


    for (i = 0; i < MEMHEAP_RECV_REQS_MAX; i++) {
        r = &memheap_oob.req_pool[i];
        rc = PMPI_Recv_init(r->buf, sizeof(r->buf), MPI_BYTE,
                MPI_ANY_SOURCE, 0,
                oshmem_comm_world,
                &r->recv_req);
        if (MPI_SUCCESS != rc) {
            MEMHEAP_ERROR("Failed to created recv request %d", rc);
            return rc;
        }

        rc = PMPI_Start(&r->recv_req);
        if (MPI_SUCCESS != rc) {
            MEMHEAP_ERROR("Failed to post recv request %d", rc);
            return rc;
        }
        opal_list_append(&memheap_oob.req_list, &r->super);
    }

    opal_progress_register(oshmem_mkey_recv_cb);
    memheap_oob.is_inited = 1;

    return rc;
}
static int do_mkey_req(opal_buffer_t *msg, int pe, int seg)
{
    uint8_t msg_type;
    oshmem_proc_t *proc;
    int i, n, tr_id;
    mca_spml_mkey_t *mkey;

    msg_type = MEMHEAP_RKEY_RESP;
    opal_dss.pack(msg, &msg_type, 1, OPAL_UINT8);

    /* go over all transports to remote pe and pack mkeys */
    n = oshmem_get_transport_count(pe);
    proc = oshmem_proc_group_find(oshmem_group_all, pe);
    opal_dss.pack(msg, &n, 1, OPAL_UINT32);
    MEMHEAP_VERBOSE(5, "found %d transports to %d", n, pe);
    for (i = 0; i < n; i++) {
        tr_id = proc->transport_ids[i];

        mkey = mca_memheap_base_get_mkey(__seg2base_va(seg), tr_id);
        if (!mkey) {
            MEMHEAP_ERROR("seg#%d tr_id: %d failed to find local mkey",
                          seg, tr_id);
            return OSHMEM_ERROR;
        }
        opal_dss.pack(msg, &tr_id, 1, OPAL_UINT32);
        opal_dss.pack(msg, &mkey->key, 1, OPAL_UINT64);
        opal_dss.pack(msg, &mkey->va_base, 1, OPAL_UINT64);

        if (NULL != MCA_SPML_CALL(get_remote_context_size)) {
            uint32_t context_size =
                    (mkey->spml_context == NULL ) ?
                            0 :
                            (uint32_t) MCA_SPML_CALL(get_remote_context_size(mkey->spml_context));
            opal_dss.pack(msg, &context_size, 1, OPAL_UINT32);
            if (0 != context_size) {
                opal_dss.pack(msg,
                              MCA_SPML_CALL(get_remote_context(mkey->spml_context)),
                              context_size,
                              OPAL_BYTE);
            }
        }

        MEMHEAP_VERBOSE(5,
                        "seg#%d tr_id: %d key %llx base_va %p",
                        seg, tr_id, (unsigned long long)mkey->key, mkey->va_base);
    }
    return OSHMEM_SUCCESS;
}
Exemple #10
0
static void unpack_remote_mkeys(opal_buffer_t *msg, int remote_pe)
{
    int32_t cnt;
    int32_t n;
    int32_t tr_id;
    int i;
    oshmem_proc_t *proc;

    proc = oshmem_proc_group_find(oshmem_group_all, remote_pe);
    cnt = 1;
    opal_dss.unpack(msg, &n, &cnt, OPAL_UINT32);
    for (i = 0; i < n; i++) {
        cnt = 1;
        opal_dss.unpack(msg, &tr_id, &cnt, OPAL_UINT32);
        cnt = 1;
        opal_dss.unpack(msg,
                        &memheap_oob.mkeys[tr_id].va_base,
                        &cnt,
                        OPAL_UINT64);

        if (0 == memheap_oob.mkeys[tr_id].va_base) {
            cnt = 1;
            opal_dss.unpack(msg, &memheap_oob.mkeys[tr_id].u.key, &cnt, OPAL_UINT64);
            if (OPAL_PROC_ON_LOCAL_NODE(proc->super.proc_flags)) {
                memheap_attach_segment(&memheap_oob.mkeys[tr_id], tr_id);
            }
        } else {
            cnt = 1;
            opal_dss.unpack(msg, &memheap_oob.mkeys[tr_id].len, &cnt, OPAL_UINT16);
            if (0 < memheap_oob.mkeys[tr_id].len) {
                memheap_oob.mkeys[tr_id].u.data = malloc(memheap_oob.mkeys[tr_id].len);
                if (NULL == memheap_oob.mkeys[tr_id].u.data) {
                    MEMHEAP_ERROR("Failed allocate %d bytes", memheap_oob.mkeys[tr_id].len);
                    oshmem_shmem_abort(-1);
                }
                cnt = memheap_oob.mkeys[tr_id].len;
                opal_dss.unpack(msg, memheap_oob.mkeys[tr_id].u.data, &cnt, OPAL_BYTE);
                MCA_SPML_CALL(rmkey_unpack(&memheap_oob.mkeys[tr_id], remote_pe));
            } else {
                memheap_oob.mkeys[tr_id].u.key = MAP_SEGMENT_SHM_INVALID;
            }
        }

        MEMHEAP_VERBOSE(5,
                        "tr_id: %d %s",
                        tr_id, mca_spml_base_mkey2str(&memheap_oob.mkeys[tr_id]));
    }
}
Exemple #11
0
/**
 * @param all_trs
 * 0 - pack mkeys for transports to given pe
 * 1 - pack mkeys for ALL possible transports. value of pe is ignored
 */
static int pack_local_mkeys(opal_buffer_t *msg, int pe, int seg, int all_trs)
{
    oshmem_proc_t *proc;
    int i, n, tr_id;
    sshmem_mkey_t *mkey;

    /* go over all transports to remote pe and pack mkeys */
    if (!all_trs) {
        n = oshmem_get_transport_count(pe);
        proc = oshmem_proc_group_find(oshmem_group_all, pe);
    }
    else {
        proc = NULL;
        n = memheap_map->num_transports;
    }

    opal_dss.pack(msg, &n, 1, OPAL_UINT32);
    MEMHEAP_VERBOSE(5, "found %d transports to %d", n, pe);
    for (i = 0; i < n; i++) {
        if (!all_trs) {
            tr_id = proc->transport_ids[i];
        }
        else {
            tr_id = i;
        }
        mkey = mca_memheap_base_get_mkey(mca_memheap_seg2base_va(seg), tr_id);
        if (!mkey) {
            MEMHEAP_ERROR("seg#%d tr_id: %d failed to find local mkey",
                          seg, tr_id);
            return OSHMEM_ERROR;
        }
        opal_dss.pack(msg, &tr_id, 1, OPAL_UINT32);
        opal_dss.pack(msg, &mkey->va_base, 1, OPAL_UINT64);
        if (0 == mkey->va_base) {
            opal_dss.pack(msg, &mkey->u.key, 1, OPAL_UINT64);
        } else {
            opal_dss.pack(msg, &mkey->len, 1, OPAL_UINT16);
            if (0 < mkey->len) {
                opal_dss.pack(msg, mkey->u.data, mkey->len, OPAL_BYTE);
            }
        }
        MEMHEAP_VERBOSE(5,
                        "seg#%d tr_id: %d %s",
                        seg, tr_id, mca_spml_base_mkey2str(mkey));
    }
    return OSHMEM_SUCCESS;
}
Exemple #12
0
static void memheap_attach_segment(sshmem_mkey_t *mkey, int tr_id)
{
    /* process special case when va was got using sshmem
     * this case is notable for:
     * - key is set as (seg_id);
     * - va_base is set as 0;
     * - len is set as 0;
     */
    assert(mkey->va_base == 0);
    assert(mkey->len == 0);

    MEMHEAP_VERBOSE(5,
            "shared memory usage tr_id: %d va_base: 0x%p len: %d key %llx",
            tr_id,
            mkey->va_base, mkey->len, (unsigned long long)mkey->u.key);

    mca_sshmem_segment_attach(&(memheap_map->mem_segs[HEAP_SEG_INDEX]), mkey);

    if ((void *) -1 == (void *) mkey->va_base) {
        MEMHEAP_ERROR("tr_id: %d key %llx attach failed: errno = %d",
                tr_id, (unsigned long long)mkey->u.key, errno);
        oshmem_shmem_abort(-1);
    }
}
Exemple #13
0
void mca_memheap_modex_recv_all(void)
{
    int i;
    int j;
    int nprocs, my_pe;
    opal_buffer_t *msg = NULL;
    void *send_buffer = NULL;
    char *rcv_buffer = NULL;
    int size;
    int *rcv_size = NULL;
    int *rcv_n_transports = NULL;
    int *rcv_offsets = NULL;
    int rc = OSHMEM_SUCCESS;
    size_t buffer_size;

    if (!mca_memheap_base_key_exchange) {
        oshmem_shmem_barrier();
        return;
    }

    nprocs = oshmem_num_procs();
    my_pe = oshmem_my_proc_id();

    /* buffer allocation for num_transports
     * message sizes and offsets */

    rcv_size = (int *)malloc(nprocs * sizeof(int));
    if (NULL == rcv_size) {
        MEMHEAP_ERROR("failed to get rcv_size buffer");
        rc = OSHMEM_ERR_OUT_OF_RESOURCE;
        goto exit_fatal;
    }

    rcv_offsets = (int *)malloc(nprocs * sizeof(int));
    if (NULL == rcv_offsets) {
        MEMHEAP_ERROR("failed to get rcv_offsets buffer");
        rc = OSHMEM_ERR_OUT_OF_RESOURCE;
        goto exit_fatal;
    }

    rcv_n_transports = (int *)malloc(nprocs * sizeof(int));
    if (NULL == rcv_offsets) {
        MEMHEAP_ERROR("failed to get rcv_offsets buffer");
        rc = OSHMEM_ERR_OUT_OF_RESOURCE;
        goto exit_fatal;
    }

    /* serialize our own mkeys */
    msg = OBJ_NEW(opal_buffer_t);
    if (NULL == msg) {
        MEMHEAP_ERROR("failed to get msg buffer");
        rc = OSHMEM_ERR_OUT_OF_RESOURCE;
        goto exit_fatal;
    }

    for (j = 0; j < memheap_map->n_segments; j++) {
        pack_local_mkeys(msg, 0, j, 1);
    }

    /* we assume here that int32_t returned by opal_dss.unload
     * is equal to size of int we use for MPI_Allgather, MPI_Allgatherv */

    assert(sizeof(int32_t) == sizeof(int));

    /* Do allgather */
    opal_dss.unload(msg, &send_buffer, &size);
    MEMHEAP_VERBOSE(1, "local keys packed into %d bytes, %d segments", size, memheap_map->n_segments);

    /* we need to send num_transports and message sizes separately
     * since message sizes depend on types of btl used */

    rc = oshmem_shmem_allgather(&memheap_map->num_transports, rcv_n_transports, sizeof(int));
    if (MPI_SUCCESS != rc) {
        MEMHEAP_ERROR("allgather failed");
        goto exit_fatal;
    }

    rc = oshmem_shmem_allgather(&size, rcv_size, sizeof(int));
    if (MPI_SUCCESS != rc) {
        MEMHEAP_ERROR("allgather failed");
        goto exit_fatal;
    }

    /* calculating offsets (displacements) for allgatherv */

    rcv_offsets[0] = 0;
    for (i = 1; i < nprocs; i++) {
        rcv_offsets[i] = rcv_offsets[i - 1] + rcv_size[i - 1];
    }

    buffer_size = rcv_offsets[nprocs - 1] + rcv_size[nprocs - 1];

    rcv_buffer = malloc (buffer_size);
    if (NULL == rcv_buffer) {
        MEMHEAP_ERROR("failed to allocate recieve buffer");
        rc = OSHMEM_ERR_OUT_OF_RESOURCE;
        goto exit_fatal;
    }

    rc = oshmem_shmem_allgatherv(send_buffer, rcv_buffer, size, rcv_size, rcv_offsets);
    if (MPI_SUCCESS != rc) {
        free (rcv_buffer);
        MEMHEAP_ERROR("allgatherv failed");
        goto exit_fatal;
    }

    opal_dss.load(msg, rcv_buffer, buffer_size);

    /* deserialize mkeys */
    OPAL_THREAD_LOCK(&memheap_oob.lck);
    for (i = 0; i < nprocs; i++) {
        if (i == my_pe) {
            continue;
        }

        msg->unpack_ptr = (void *)((intptr_t) msg->base_ptr + rcv_offsets[i]);

        for (j = 0; j < memheap_map->n_segments; j++) {
            map_segment_t *s;

            s = &memheap_map->mem_segs[j];
            if (NULL != s->mkeys_cache[i]) {
                MEMHEAP_VERBOSE(10, "PE%d: segment%d already exists, mkey will be replaced", i, j);
            } else {
                s->mkeys_cache[i] = (sshmem_mkey_t *) calloc(rcv_n_transports[i],
                        sizeof(sshmem_mkey_t));
                if (NULL == s->mkeys_cache[i]) {
                    MEMHEAP_ERROR("PE%d: segment%d: Failed to allocate mkeys cache entry", i, j);
                    oshmem_shmem_abort(-1);
                }
            }
            memheap_oob.mkeys = s->mkeys_cache[i];
            unpack_remote_mkeys(msg, i);
        }
    }

    OPAL_THREAD_UNLOCK(&memheap_oob.lck);

exit_fatal:
    if (rcv_size) {
        free(rcv_size);
    }
    if (rcv_offsets) {
        free(rcv_offsets);
    }
    if (rcv_n_transports) {
        free(rcv_n_transports);
    }
    if (send_buffer) {
        free(send_buffer);
    }
    if (msg) {
        OBJ_RELEASE(msg);
    }

    /* This function requires abort in any error case */
    if (OSHMEM_SUCCESS != rc) {
        oshmem_shmem_abort(rc);
    }
}
static void memheap_buddy_rml_recv_cb(int status,
                                      orte_process_name_t* process_name,
                                      opal_buffer_t* buffer,
                                      orte_rml_tag_t tag,
                                      void* cbdata)
{
    MEMHEAP_VERBOSE(5,
                    "**** get request from %u:%d",
                    process_name->jobid, process_name->vpid);
    int32_t cnt = 1;
    int rc;
    opal_buffer_t *msg;
    uint8_t msg_type;
    uint32_t seg;

    MEMHEAP_VERBOSE(5, "unpacking %d of %d", cnt, OPAL_UINT8);
    rc = opal_dss.unpack(buffer, &msg_type, &cnt, OPAL_UINT8);
    if (ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
        goto send_fail;
    }

    switch (msg_type) {
    case MEMHEAP_RKEY_REQ:
        cnt = 1;
        rc = opal_dss.unpack(buffer, &seg, &cnt, OPAL_UINT32);
        if (ORTE_SUCCESS != rc) {
            MEMHEAP_ERROR("bad RKEY_REQ msg");
            goto send_fail;
        }

        MEMHEAP_VERBOSE(5, "*** RKEY REQ");
        msg = OBJ_NEW(opal_buffer_t);
        if (!msg) {
            MEMHEAP_ERROR("failed to get msg buffer");
            ORTE_ERROR_LOG(rc);
            return;
        }

        if (OSHMEM_SUCCESS != do_mkey_req(msg, process_name->vpid, seg)) {
            OBJ_RELEASE(msg);
            goto send_fail;
        }

        rc = orte_rml.send_buffer_nb(process_name, msg, OMPI_RML_TAG_SHMEM, orte_rml_send_callback, NULL);

        if (0 > rc) {
            MEMHEAP_ERROR("FAILED to send rml message %d", rc);
            ORTE_ERROR_LOG(rc);
            goto send_fail;
        }
        break;

    case MEMHEAP_RKEY_RESP:
        MEMHEAP_VERBOSE(5, "*** RKEY RESP");
        OPAL_THREAD_LOCK(&memheap_oob.lck);
        do_mkey_resp(buffer);
        memheap_oob.mkeys_rcvd = MEMHEAP_RKEY_RESP;
        opal_condition_broadcast(&memheap_oob.cond);
        OPAL_THREAD_UNLOCK(&memheap_oob.lck);
        break;

    case MEMHEAP_RKEY_RESP_FAIL:
        MEMHEAP_VERBOSE(5, "*** RKEY RESP FAIL");
        memheap_oob.mkeys_rcvd = MEMHEAP_RKEY_RESP_FAIL;
        opal_condition_broadcast(&memheap_oob.cond);
        OPAL_THREAD_UNLOCK(&memheap_oob.lck);
        break;

    default:
        MEMHEAP_VERBOSE(5, "Unknown message type %x", msg_type);
        goto send_fail;
    }
    return;

    send_fail: msg = OBJ_NEW(opal_buffer_t);
    if (!msg) {
        MEMHEAP_ERROR("failed to get msg buffer");
        ORTE_ERROR_LOG(rc);
        return;
    }
    msg_type = MEMHEAP_RKEY_RESP_FAIL;
    opal_dss.pack(msg, &msg_type, 1, OPAL_UINT8);

    rc = orte_rml.send_buffer_nb(process_name, msg, OMPI_RML_TAG_SHMEM, orte_rml_send_callback, NULL);
    if (0 > rc) {
        MEMHEAP_ERROR("FAILED to send rml message %d", rc);
        ORTE_ERROR_LOG(rc);
    }

}
Exemple #15
0
static int oshmem_mkey_recv_cb(void)
{
    MPI_Status status;
    int flag;
    int n;
    int rc;
    opal_buffer_t *msg;
    int32_t size;
    void *tmp_buf;
    oob_comm_request_t *r;

    n = 0;
    r = (oob_comm_request_t *)opal_list_get_first(&memheap_oob.req_list);
    assert(r);
    while(r != (oob_comm_request_t *)opal_list_get_end(&memheap_oob.req_list)) {
        my_MPI_Test(&r->recv_req, &flag, &status);
        if (OPAL_LIKELY(0 == flag)) {
            return n;
        }
        PMPI_Get_count(&status, MPI_BYTE, &size);
        MEMHEAP_VERBOSE(5, "OOB request from PE: %d, size %d", status.MPI_SOURCE, size);
        n++;
        opal_list_remove_first(&memheap_oob.req_list);

        /* to avoid deadlock we must start request
         * before processing it. Data are copied to
         * the tmp buffer
         */
        tmp_buf = malloc(size);
        if (NULL == tmp_buf) {
            MEMHEAP_ERROR("not enough memory");
            ORTE_ERROR_LOG(0);
            return n;
        } else {
		    memcpy(tmp_buf, (void*)&r->buf, size);
		    msg = OBJ_NEW(opal_buffer_t);
		    if (NULL == msg) {
		        MEMHEAP_ERROR("not enough memory");
		        ORTE_ERROR_LOG(0);
		        free(tmp_buf);
		        return n;
		    }
		    opal_dss.load(msg, (void*)tmp_buf, size);

            /*
             * send reply before posting the receive request again to limit the recursion size to
             * number of receive requests.
             * send can call opal_progress which calls this function again. If recv req is started
             * stack size will be proportional to number of job ranks.
             */
            do_recv(status.MPI_SOURCE, msg);
            OBJ_RELEASE(msg);
        }

        rc = PMPI_Start(&r->recv_req);
        if (MPI_SUCCESS != rc) {
            MEMHEAP_ERROR("Failed to post recv request %d", rc);
            ORTE_ERROR_LOG(rc);
            return n;
        }
        opal_list_append(&memheap_oob.req_list, &r->super);


        r = (oob_comm_request_t *)opal_list_get_first(&memheap_oob.req_list);
        assert(r);
    }

    return 1;
}
Exemple #16
0
static void do_recv(int source_pe, opal_buffer_t* buffer)
{
    int32_t cnt = 1;
    int rc;
    opal_buffer_t *msg;
    uint8_t msg_type;
    uint32_t seg;

    MEMHEAP_VERBOSE(5, "unpacking %d of %d", cnt, OPAL_UINT8);
    rc = opal_dss.unpack(buffer, &msg_type, &cnt, OPAL_UINT8);
    if (OPAL_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
        goto send_fail;
    }

    switch (msg_type) {
    case MEMHEAP_RKEY_REQ:
        cnt = 1;
        rc = opal_dss.unpack(buffer, &seg, &cnt, OPAL_UINT32);
        if (OPAL_SUCCESS != rc) {
            MEMHEAP_ERROR("bad RKEY_REQ msg");
            goto send_fail;
        }

        MEMHEAP_VERBOSE(5, "*** RKEY REQ");
        msg = OBJ_NEW(opal_buffer_t);
        if (!msg) {
            MEMHEAP_ERROR("failed to get msg buffer");
            ORTE_ERROR_LOG(rc);
            return;
        }

        msg_type = MEMHEAP_RKEY_RESP;
        opal_dss.pack(msg, &msg_type, 1, OPAL_UINT8);

        if (OSHMEM_SUCCESS != pack_local_mkeys(msg, source_pe, seg, 0)) {
            OBJ_RELEASE(msg);
            goto send_fail;
        }

        rc = send_buffer(source_pe, msg);
        if (MPI_SUCCESS != rc) {
            MEMHEAP_ERROR("FAILED to send rml message %d", rc);
            ORTE_ERROR_LOG(rc);
            goto send_fail;
        }
        break;

    case MEMHEAP_RKEY_RESP:
        MEMHEAP_VERBOSE(5, "*** RKEY RESP");
        OPAL_THREAD_LOCK(&memheap_oob.lck);
        unpack_remote_mkeys(buffer, source_pe);
        memheap_oob.mkeys_rcvd = MEMHEAP_RKEY_RESP;
        opal_condition_broadcast(&memheap_oob.cond);
        OPAL_THREAD_UNLOCK(&memheap_oob.lck);
        break;

    case MEMHEAP_RKEY_RESP_FAIL:
        MEMHEAP_VERBOSE(5, "*** RKEY RESP FAIL");
        memheap_oob.mkeys_rcvd = MEMHEAP_RKEY_RESP_FAIL;
        opal_condition_broadcast(&memheap_oob.cond);
        OPAL_THREAD_UNLOCK(&memheap_oob.lck);
        break;

    default:
        MEMHEAP_VERBOSE(5, "Unknown message type %x", msg_type);
        goto send_fail;
    }
    return;

    send_fail: msg = OBJ_NEW(opal_buffer_t);
    if (!msg) {
        MEMHEAP_ERROR("failed to get msg buffer");
        ORTE_ERROR_LOG(rc);
        return;
    }
    msg_type = MEMHEAP_RKEY_RESP_FAIL;
    opal_dss.pack(msg, &msg_type, 1, OPAL_UINT8);

    rc = send_buffer(source_pe, msg);
    if (MPI_SUCCESS != rc) {
        MEMHEAP_ERROR("FAILED to send rml message %d", rc);
        ORTE_ERROR_LOG(rc);
    }

}
static int memheap_oob_get_mkeys(int pe, uint32_t seg, mca_spml_mkey_t *mkeys)
{
    orte_process_name_t name;
    opal_buffer_t *msg;
    int rc;
    uint8_t cmd;
    int i;

    if (OSHMEM_SUCCESS == MCA_SPML_CALL(oob_get_mkeys(pe, seg, mkeys))) {
        for (i = 0; i < memheap_map->num_transports; i++) {
            mkeys[i].va_base = __seg2base_va(seg);
            MEMHEAP_VERBOSE(5,
                            "MKEY CALCULATED BY LOCAL SPML: pe: %d tr_id: %d key %llx base_va %p",
                            pe,
                            i,
                            (unsigned long long)mkeys[i].key,
                            mkeys[i].va_base);
        }
        return OSHMEM_SUCCESS;
    }

    OPAL_THREAD_LOCK(&memheap_oob.lck);

    memheap_oob.mkeys = mkeys;
    memheap_oob.mkeys_rcvd = 0;

    name.jobid = ORTE_PROC_MY_NAME->jobid;
    name.vpid = pe;

    msg = OBJ_NEW(opal_buffer_t);
    if (!msg) {
        OPAL_THREAD_UNLOCK(&memheap_oob.lck);
        MEMHEAP_ERROR("failed to get msg buffer");
        return OSHMEM_ERROR;
    }

    OPAL_THREAD_LOCK(&memheap_oob.lck);
    cmd = MEMHEAP_RKEY_REQ;
    opal_dss.pack(msg, &cmd, 1, OPAL_UINT8);
    opal_dss.pack(msg, &seg, 1, OPAL_UINT32);
    rc = orte_rml.send_buffer_nb(&name, msg, OMPI_RML_TAG_SHMEM, orte_rml_send_callback, NULL);
    if (0 > rc) {
        OBJ_RELEASE(msg);
        OPAL_THREAD_UNLOCK(&memheap_oob.lck);
        MEMHEAP_ERROR("FAILED to send rml message %d", rc);
        return OSHMEM_ERROR;
    }

    MEMHEAP_VERBOSE(5, "message sent: %d bytes!", rc);

    while (!memheap_oob.mkeys_rcvd) {
        opal_condition_wait(&memheap_oob.cond, &memheap_oob.lck);
    }

    if (MEMHEAP_RKEY_RESP == memheap_oob.mkeys_rcvd) {
        rc = OSHMEM_SUCCESS;
    } else {
        MEMHEAP_ERROR("failed to get rkey seg#%d pe=%d", seg, pe);
        rc = OSHMEM_ERROR;
    }

    OPAL_THREAD_UNLOCK(&memheap_oob.lck);
    return rc;
}
void mca_memheap_modex_recv_all(void)
{
    int i;
    int j;
    int nprocs, my_pe;
    oshmem_proc_t *proc;
    mca_spml_mkey_t *mkey;
    void* dummy_rva;

    if (!mca_memheap_base_key_exchange)
        return;

    /* init rkey cache */
    nprocs = oshmem_num_procs();
    my_pe = oshmem_my_proc_id();

    /* Note:
     * Doing exchange via rml till we figure out problem with grpcomm.modex and barrier
     */
    for (i = 0; i < nprocs; i++) {
        if (i == my_pe)
            continue;

        proc = oshmem_proc_group_find(oshmem_group_all, i);
        for (j = 0; j < memheap_map->n_segments; j++) {
            mkey =
                    mca_memheap_base_get_cached_mkey(i,
                                                     memheap_map->mem_segs[j].start,
                                                     proc->transport_ids[0],
                                                     &dummy_rva);
            if (!mkey) {
                MEMHEAP_ERROR("Failed to receive mkeys");
                oshmem_shmem_abort(-1);
            }
        }

    }

    /*
     * There is an issue with orte_grpcomm.barrier usage as
     * ess/pmi directs to use grpcomm/pmi in case slurm srun() call grpcomm/pmi calls PMI_Barrier() 
     * that is a function of external library.
     * There is no opal_progress() in such way. As a result slow PEs send a request (MEMHEAP_RKEY_REQ) to
     * fast PEs waiting on barrier and do not get a respond (MEMHEAP_RKEY_RESP).
     *
     * there are following ways to solve one:
     * 1. calculate requests from remote PEs and do ORTE_PROGRESSED_WAIT waiting for expected value;
     * 2. use shmem_barrier_all();
     * 3. rework pmi/barrier to use opal_progress();
     * 4. use orte_grpcomm.barrier carefully;
     * 
     * It seems there is no need to use orte_grpcomm.barrier here
     */

    if (memheap_map->mem_segs[HEAP_SEG_INDEX].shmid != MEMHEAP_SHM_INVALID) {
        /* unfortunately we must do barrier here to assure that everyone are attached to our segment
         * good thing that this code path only invoked on older linuxes (-mca shmalloc_use_hugepages 3|4)
         * try to minimize damage here by waiting 5 seconds and doing progress
         */
        shmem_barrier_all();
        /* keys exchanged, segments attached, now we can safely cleanup */
        if (memheap_map->mem_segs[HEAP_SEG_INDEX].type
                == MAP_SEGMENT_ALLOC_SHM) {
            shmctl(memheap_map->mem_segs[HEAP_SEG_INDEX].shmid,
                   IPC_RMID,
                   NULL );
        }
    }
}
static void memheap_attach_segment(mca_spml_mkey_t *mkey, int tr_id)
{
    /* process special case when va was got using shmget(IPC_PRIVATE)
     * this case is notable for:
     * - key is set as (type|shmid);
     * - va_base is set as 0;
     */
    if (!mkey->va_base
            && ((int) MEMHEAP_SHM_GET_ID(mkey->key) != MEMHEAP_SHM_INVALID)) {
        MEMHEAP_VERBOSE(5,
                        "shared memory usage tr_id: %d key %llx base_va %p shmid 0x%X|0x%X",
                        tr_id,
                        (unsigned long long)mkey->key,
                        mkey->va_base,
                        MEMHEAP_SHM_GET_TYPE(mkey->key),
                        MEMHEAP_SHM_GET_ID(mkey->key));

        if (MEMHEAP_SHM_GET_TYPE(mkey->key) == MAP_SEGMENT_ALLOC_SHM) {
            mkey->va_base = shmat(MEMHEAP_SHM_GET_ID(mkey->key),
                                             0,
                                             0);
        } else if (MEMHEAP_SHM_GET_TYPE(mkey->key) == MAP_SEGMENT_ALLOC_IBV) {
#if defined(MPAGE_ENABLE) && (MPAGE_ENABLE > 0)
            openib_device_t *device = NULL;
            struct ibv_mr *ib_mr;
            void *addr;
            static int mr_count;

            int access_flag = IBV_ACCESS_LOCAL_WRITE |
            IBV_ACCESS_REMOTE_WRITE |
            IBV_ACCESS_REMOTE_READ |
            IBV_ACCESS_NO_RDMA;

            device = (openib_device_t *)memheap_map->mem_segs[HEAP_SEG_INDEX].context;
            assert(device);

            /* workaround mtt problem - request aligned addresses */
            ++mr_count;
            addr = (void *)(mca_memheap_base_start_address + mca_memheap_base_mr_interleave_factor*1024ULL*1024ULL*1024ULL*mr_count);
            ib_mr = ibv_reg_shared_mr(MEMHEAP_SHM_GET_ID(mkey->key),
                    device->ib_pd, addr, access_flag);
            if (NULL == ib_mr)
            {
                mkey->va_base = (void*)-1;
                MEMHEAP_ERROR("error to ibv_reg_shared_mr() errno says %d: %s",
                        errno, strerror(errno));
            }
            else
            {
                if (ib_mr->addr != addr) {
                    MEMHEAP_WARN("Failed to map shared region to address %p got addr %p. Try to increase 'memheap_mr_interleave_factor' from %d", addr, ib_mr->addr, mca_memheap_base_mr_interleave_factor);
                }

                opal_value_array_append_item(&device->ib_mr_array, &ib_mr);
                mkey->va_base = ib_mr->addr;
            }
#endif /* MPAGE_ENABLE */
        } else {
            MEMHEAP_ERROR("tr_id: %d key %llx attach failed: incorrect shmid 0x%X|0x%X",
                          tr_id, 
                          (unsigned long long)mkey->key,
                          MEMHEAP_SHM_GET_TYPE(mkey->key),
                          MEMHEAP_SHM_GET_ID(mkey->key));
            oshmem_shmem_abort(-1);
        }

        if ((void *) -1 == (void *) mkey->va_base) {
            MEMHEAP_ERROR("tr_id: %d key %llx attach failed: errno = %d",
                          tr_id, (unsigned long long)mkey->key, errno);
            oshmem_shmem_abort(-1);
        }
    }
}
Exemple #20
0
static void _ibv_detach(map_segment_t *s)
{
    int rc = OSHMEM_SUCCESS;
    openib_device_t *device = NULL;

    assert(s);

    device = (openib_device_t *)s->context;

    if (device)
    {
        if(!rc && opal_value_array_get_size(&device->ib_mr_array))
        {
            struct ibv_mr** array;
            struct ibv_mr* ib_mr = NULL;
            array = OPAL_VALUE_ARRAY_GET_BASE(&device->ib_mr_array, struct ibv_mr *);
            while (opal_value_array_get_size(&device->ib_mr_array) > 0)
            {
                ib_mr = array[0];
                if(ibv_dereg_mr(ib_mr))
                {
                    MEMHEAP_ERROR("error ibv_dereg_mr(): %d: %s", errno, strerror(errno));
                    rc = OSHMEM_ERROR;
                }
                opal_value_array_remove_item(&device->ib_mr_array, 0);
            }

            if(!rc && device->ib_mr_shared)
            {
                device->ib_mr_shared = NULL;
            }
            OBJ_DESTRUCT(&device->ib_mr_array);
        }

        if(!rc && device->ib_pd)
        {
            if(ibv_dealloc_pd(device->ib_pd))
            {
                MEMHEAP_ERROR("error ibv_dealloc_pd(): %d: %s", errno, strerror(errno));
                rc = OSHMEM_ERROR;
            }
            else
            {
                device->ib_pd = NULL;
            }
        }

        if(!rc && device->ib_dev_context)
        {
            if(ibv_close_device(device->ib_dev_context))
            {
                MEMHEAP_ERROR("error ibv_close_device(): %d: %s", errno, strerror(errno));
                rc = OSHMEM_ERROR;
            }
            else
            {
                device->ib_dev_context = NULL;
            }
        }

        if(!rc && device->ib_devs)
        {
            ibv_free_device_list(device->ib_devs);
            device->ib_devs = NULL;
        }
    }
}
Exemple #21
0
int mca_memheap_base_alloc_init(mca_memheap_map_t *map, size_t size)
{
    int ret = OSHMEM_SUCCESS;
    int value = mca_memheap_base_alloc_type;

    assert(map);
    assert(HEAP_SEG_INDEX == map->n_segments);

    MEMHEAP_VERBOSE(5,
                    "memheap method : %d",
                    mca_memheap_base_alloc_type);

    map_segment_t *s = &map->mem_segs[map->n_segments];
    memset(s, 0, sizeof(*s));
    s->is_active = 0;
    s->shmid = MEMHEAP_SHM_INVALID;
    s->start = 0;
    s->end = 0;
    s->size = 0;
    s->type = MAP_SEGMENT_UNKNOWN;
    s->context = NULL;

    switch (value) {
    case 0:
        /* use sysv alloc without hugepages */
        ret = _shm_attach(s, size, 0, 1);
        break;

    case 1:
        ret = _shm_attach(s, size, 1, 1);
        if (OSHMEM_SUCCESS != ret)
            ret = _shm_attach(s, size, 0, 1);
        break;

    case 2:
        /* huge pages only */
        ret = _shm_attach(s, size, 1, 1);
        if (OSHMEM_SUCCESS != ret)
            MEMHEAP_ERROR("FAILED to allocated symmetric heap using hugepages fallback is disabled, errno=%d",
                          errno);
        break;

    case 3:
        /* huge pages only + cleanup shmid */
        ret = _shm_attach(s, size, 1, 0);
        if (OSHMEM_SUCCESS != ret)
            MEMHEAP_ERROR("FAILED to allocated symmetric heap using hugepages fallback is disabled, errno=%d",
                          errno);
        break;

    case 4:
        /* use sysv alloc without hugepages */
        ret = _shm_attach(s, size, 0, 0);
        break;

#if defined(MPAGE_ENABLE) && (MPAGE_ENABLE > 0)
        case 5:
        /* use shared memory registration (mpages) */
        ret = _ibv_attach(s, size);
        if (OSHMEM_SUCCESS != ret)
            ret = _shm_attach(s, size, 0, 1);

        break;
#endif /* MPAGE_ENABLE */

    case 100:
        /* use mmap. It will severaly impact performance of intra node communication */
        ret = _mmap_attach(s, size);
        MEMHEAP_VERBOSE(1,
                        "mmap() memheap allocation will severely impact performance of intra node communication");
        break;

    case 101:
        ret = _shm_attach(s, size, 1, 1);
        if (OSHMEM_SUCCESS != ret) {
            MEMHEAP_ERROR("Failed to allocate hugepages. Falling back on regular allocation");
            ret = _mmap_attach(s, size);
        } else {
            s->shmid = MEMHEAP_SHM_INVALID;
        }
        MEMHEAP_VERBOSE(1, "SM BTL will be always used for intranode comm\n");
        break;

    case 102:
        ret = _shm_attach(s, size, 1, 1);
        if (OSHMEM_SUCCESS != ret) {
            MEMHEAP_ERROR("FAILED to allocated symmetric heap using hugepages fallback is disabled, errno=%d",
                          errno);
        } else {
            s->shmid = MEMHEAP_SHM_INVALID;
        }
        break;

    default:
        ret = _adaptive_attach(s, size);
    }

    if (OSHMEM_SUCCESS == ret) {
        map->n_segments++;
        MEMHEAP_VERBOSE(1,
                        "Memheap alloc memory: %llu byte(s), %d segments by method: %d",
                        (unsigned long long)size, map->n_segments, s->type);
    }

    return ret;
}
Exemple #22
0
static int _ibv_attach(map_segment_t *s, size_t size)
{
    int rc = OSHMEM_SUCCESS;
    static openib_device_t memheap_device;
    openib_device_t *device = &memheap_device;
    int num_devs = 0;

    assert(s);

    memset(device, 0, sizeof(*device));

#ifdef HAVE_IBV_GET_DEVICE_LIST
    device->ib_devs = ibv_get_device_list(&num_devs);
#else
#error unsupported ibv_get_device_list in infiniband/verbs.h
#endif

    if (num_devs == 0 || !device->ib_devs)
    {
        rc = OSHMEM_ERR_NOT_SUPPORTED;
    }

    /* Open device */
    if (!rc)
    {
        int i = 0;

        if (num_devs > 1)
        {
            if (NULL == mca_memheap_base_param_hca_name)
            {
                MEMHEAP_VERBOSE(5, "found %d HCAs, choosing the first", num_devs);
            }
            else
            {
                MEMHEAP_VERBOSE(5, "found %d HCAs, searching for %s", num_devs, mca_memheap_base_param_hca_name);
            }
        }

        for (i = 0; i < num_devs; i++)
        {
            device->ib_dev = device->ib_devs[i];

            device->ib_dev_context = ibv_open_device(device->ib_dev);
            if (NULL == device->ib_dev_context)
            {
                MEMHEAP_ERROR("error obtaining device context for %s errno says %d: %s",
                        ibv_get_device_name(device->ib_dev), errno, strerror(errno));
                rc = OSHMEM_ERR_RESOURCE_BUSY;
            }
            else
            {
                if (NULL != mca_memheap_base_param_hca_name)
                {
                    if (0 == strcmp(mca_memheap_base_param_hca_name,ibv_get_device_name(device->ib_dev)))
                    {
                        MEMHEAP_VERBOSE(5, "mca_memheap_base_param_hca_name = %s, selected %s as %d of %d", mca_memheap_base_param_hca_name, ibv_get_device_name(device->ib_dev), i, num_devs);
                        rc = OSHMEM_SUCCESS;
                        break;
                    }
                }
                else
                {
                    MEMHEAP_VERBOSE(5, "mca_memheap_base_param_hca_name = %s, selected %s as %d of %d", mca_memheap_base_param_hca_name, ibv_get_device_name(device->ib_dev), i, num_devs);
                    rc = OSHMEM_SUCCESS;
                    break;
                }
            }
        }
    }

    /* Obtain device attributes */
    if (!rc)
    {
        if (ibv_query_device(device->ib_dev_context, &device->ib_dev_attr))
        {
            MEMHEAP_ERROR("error obtaining device attributes for %s errno says %d: %s",
                    ibv_get_device_name(device->ib_dev), errno, strerror(errno));
            rc = OSHMEM_ERR_RESOURCE_BUSY;
        }
        else
        {
            MEMHEAP_VERBOSE(5, "ibv device %s",
                    ibv_get_device_name(device->ib_dev));
        }
    }

    /* Allocate the protection domain for the device */
    if (!rc)
    {
        device->ib_pd = ibv_alloc_pd(device->ib_dev_context);
        if (NULL == device->ib_pd)
        {
            MEMHEAP_ERROR("error allocating protection domain for %s errno says %d: %s",
                    ibv_get_device_name(device->ib_dev), errno, strerror(errno));
            rc = OSHMEM_ERR_RESOURCE_BUSY;
        }
    }

    /* Allocate memory */
    if (!rc)
    {
        void *addr = NULL;
        struct ibv_mr *ib_mr = NULL;
        int access_flag = IBV_ACCESS_LOCAL_WRITE |
        IBV_ACCESS_REMOTE_WRITE |
        IBV_ACCESS_REMOTE_READ;

        OBJ_CONSTRUCT(&device->ib_mr_array, opal_value_array_t);
        opal_value_array_init(&device->ib_mr_array, sizeof(struct ibv_mr *));

#if defined(MPAGE_ENABLE) && (MPAGE_ENABLE > 0)
        access_flag |= IBV_ACCESS_ALLOCATE_MR |
        IBV_ACCESS_SHARED_MR_USER_READ |
        IBV_ACCESS_SHARED_MR_USER_WRITE;
#endif /* MPAGE_ENABLE */

        ib_mr = ibv_reg_mr(device->ib_pd, addr, size, access_flag);
        if (NULL == ib_mr)
        {
            MEMHEAP_ERROR("error to ibv_reg_mr() %llu bytes errno says %d: %s",
                    (unsigned long long)size, errno, strerror(errno));
            rc = OSHMEM_ERR_OUT_OF_RESOURCE;
        }
        else
        {
            device->ib_mr_shared = ib_mr;
            opal_value_array_append_item(&device->ib_mr_array, &ib_mr);
        }

#if defined(MPAGE_ENABLE) && (MPAGE_ENABLE > 0)
        if (!rc)
        {
            access_flag = IBV_ACCESS_LOCAL_WRITE |
            IBV_ACCESS_REMOTE_WRITE |
            IBV_ACCESS_REMOTE_READ|
            IBV_ACCESS_NO_RDMA;

            addr = (void *)mca_memheap_base_start_address;
            ib_mr = ibv_reg_shared_mr(device->ib_mr_shared->handle,
                    device->ib_pd, addr, access_flag);
            if (NULL == ib_mr)
            {
                MEMHEAP_ERROR("error to ibv_reg_shared_mr() %llu bytes errno says %d: %s",
                        (unsigned long long)size, errno, strerror(errno));
                rc = OSHMEM_ERR_OUT_OF_RESOURCE;
            }
            else
            {
                opal_value_array_append_item(&device->ib_mr_array, &ib_mr);
            }
        }
#endif /* MPAGE_ENABLE */

        if (!rc)
        {
            assert(size == device->ib_mr_shared->length);

            s->type = MAP_SEGMENT_ALLOC_IBV;
            s->shmid = device->ib_mr_shared->handle;
            s->start = ib_mr->addr;
            s->size = size;
            s->end = (void*)((uintptr_t)s->start + s->size);
            s->context = &memheap_device;
        }
    }

    return rc;
}