Beispiel #1
0
static int pack_local_mkeys(opal_buffer_t *msg, int pe, int seg)
{
    int i, n;
    sshmem_mkey_t *mkey;

    /* go over all transports and pack mkeys */
    n = memheap_map->num_transports;
    opal_dss.pack(msg, &n, 1, OPAL_UINT32);
    MEMHEAP_VERBOSE(5, "found %d transports to %d", n, pe);
    for (i = 0; i < n; i++) {
        mkey = mca_memheap_base_get_mkey(mca_memheap_seg2base_va(seg), i);
        if (!mkey) {
            MEMHEAP_ERROR("seg#%d tr_id: %d failed to find local mkey",
                          seg, i);
            return OSHMEM_ERROR;
        }
        opal_dss.pack(msg, &i, 1, OPAL_UINT32);
        opal_dss.pack(msg, &mkey->va_base, 1, OPAL_UINT64);
        if (0 == mkey->va_base) {
            opal_dss.pack(msg, &mkey->u.key, 1, OPAL_UINT64);
        } else {
            opal_dss.pack(msg, &mkey->len, 1, OPAL_UINT16);
            if (0 < mkey->len) {
                opal_dss.pack(msg, mkey->u.data, mkey->len, OPAL_BYTE);
            }
        }
        MEMHEAP_VERBOSE(5,
                        "seg#%d tr_id: %d %s",
                        seg, i, mca_spml_base_mkey2str(mkey));
    }
    return OSHMEM_SUCCESS;
}
Beispiel #2
0
static int _check_address(struct map_segment_desc *seg)
{
    /* FIXME Linux specific code */
#ifdef __linux__
    extern unsigned _end;
    void* data_end = &_end;

    /**
     * SGI shmem only supports globals&static in main program. 
     * It does not support them in shared objects or in dlopen()
     * (Clarified on PGAS 2011 tutorial)
     *
     * So ignored any maps that start higher then process _end
     * FIXME: make sure we do not register symmetric heap twice
     * if we decide to allow shared objects
     */
    if ((uintptr_t)seg->start > (uintptr_t)data_end) {
        MEMHEAP_VERBOSE(100,
                        "skip segment: data _end < segment start (%p < %p)",
                        data_end, seg->start);
        return OSHMEM_ERROR;
    }

    if ((uintptr_t)seg->end > (uintptr_t)data_end) {
        MEMHEAP_VERBOSE(100,
                        "adjust segment: data _end < segment end (%p < %p",
                        data_end, seg->end);
         seg->end = data_end;
    }
#endif
    return OSHMEM_SUCCESS;
}
static int __load_segments(void)
{
    FILE *fp;
    char line[1024];
    struct map_segment_desc seg;

    memheap_context.n_segments = 0;

    fp = fopen("/proc/self/maps", "r");
    if (NULL == fp) {
        MEMHEAP_ERROR("Failed to open /proc/self/maps");
        return OSHMEM_ERROR;
    }

    while (NULL != fgets(line, sizeof(line), fp)) {
        memset(&seg, 0, sizeof(seg));
        sscanf(line,
               "%llx-%llx %s %llx %s %llx %s",
               (long long *) &seg.start,
               (long long *) &seg.end,
               seg.perms,
               (long long *) &seg.offset,
               seg.dev,
               (long long *) &seg.inode,
               seg.pathname);

        if (OSHMEM_ERROR == __check_address(&seg))
            continue;

        if (OSHMEM_ERROR == __check_pathname(&seg))
            continue;

        if (OSHMEM_ERROR == __check_perms(&seg))
            continue;

        MEMHEAP_VERBOSE(5, "add: %s", line);
        if (MCA_MEMHEAP_MAX_SEGMENTS <= memheap_context.n_segments) {
            MEMHEAP_ERROR("too many segments (max = %d): skip %s",
                          MCA_MEMHEAP_MAX_SEGMENTS, line);
            continue;
        }
        if (memheap_context.n_segments > 0
                && seg.start
                        == memheap_context.mem_segs[memheap_context.n_segments
                                - 1].end) {
            MEMHEAP_VERBOSE(5, "Coalescing segment");
            memheap_context.mem_segs[memheap_context.n_segments - 1].end =
                    seg.end;
        } else {
            memheap_context.mem_segs[memheap_context.n_segments].start =
                    seg.start;
            memheap_context.mem_segs[memheap_context.n_segments].end = seg.end;
            memheap_context.n_segments++;
        }
    }

    fclose(fp);
    return OSHMEM_SUCCESS;
}
static int do_mkey_req(opal_buffer_t *msg, int pe, int seg)
{
    uint8_t msg_type;
    oshmem_proc_t *proc;
    int i, n, tr_id;
    mca_spml_mkey_t *mkey;

    msg_type = MEMHEAP_RKEY_RESP;
    opal_dss.pack(msg, &msg_type, 1, OPAL_UINT8);

    /* go over all transports to remote pe and pack mkeys */
    n = oshmem_get_transport_count(pe);
    proc = oshmem_proc_group_find(oshmem_group_all, pe);
    opal_dss.pack(msg, &n, 1, OPAL_UINT32);
    MEMHEAP_VERBOSE(5, "found %d transports to %d", n, pe);
    for (i = 0; i < n; i++) {
        tr_id = proc->transport_ids[i];

        mkey = mca_memheap_base_get_mkey(__seg2base_va(seg), tr_id);
        if (!mkey) {
            MEMHEAP_ERROR("seg#%d tr_id: %d failed to find local mkey",
                          seg, tr_id);
            return OSHMEM_ERROR;
        }
        opal_dss.pack(msg, &tr_id, 1, OPAL_UINT32);
        opal_dss.pack(msg, &mkey->key, 1, OPAL_UINT64);
        opal_dss.pack(msg, &mkey->va_base, 1, OPAL_UINT64);

        if (NULL != MCA_SPML_CALL(get_remote_context_size)) {
            uint32_t context_size =
                    (mkey->spml_context == NULL ) ?
                            0 :
                            (uint32_t) MCA_SPML_CALL(get_remote_context_size(mkey->spml_context));
            opal_dss.pack(msg, &context_size, 1, OPAL_UINT32);
            if (0 != context_size) {
                opal_dss.pack(msg,
                              MCA_SPML_CALL(get_remote_context(mkey->spml_context)),
                              context_size,
                              OPAL_BYTE);
            }
        }

        MEMHEAP_VERBOSE(5,
                        "seg#%d tr_id: %d key %llx base_va %p",
                        seg, tr_id, (unsigned long long)mkey->key, mkey->va_base);
    }
    return OSHMEM_SUCCESS;
}
Beispiel #5
0
int mca_memheap_base_static_init(mca_memheap_map_t *map)
{
    /* read and parse segments from /proc/self/maps */
    int ret = OSHMEM_SUCCESS;

    assert(map);
    assert(SYMB_SEG_INDEX <= map->n_segments);

    ret = _load_segments();

    if (OSHMEM_SUCCESS == ret) {
        int i;
        size_t total_mem;

        for (i = 0, total_mem = 0; i < memheap_context.n_segments; i++) {
            map_segment_t *s = &map->mem_segs[map->n_segments];

            memset(s, 0, sizeof(*s));
            MAP_SEGMENT_RESET_FLAGS(s);
            s->seg_id = MAP_SEGMENT_SHM_INVALID;
            s->seg_base_addr = memheap_context.mem_segs[i].start;
            s->end = memheap_context.mem_segs[i].end;
            s->seg_size = ((uintptr_t)s->end - (uintptr_t)s->seg_base_addr);
            s->type = MAP_SEGMENT_STATIC;
            map->n_segments++;

            total_mem += ((uintptr_t)s->end - (uintptr_t)s->seg_base_addr);
        }
        MEMHEAP_VERBOSE(1,
                        "Memheap static memory: %llu byte(s), %d segments",
                        (unsigned long long)total_mem, map->n_segments);
    }

    return ret;
}
Beispiel #6
0
static inline map_segment_t *__find_va(const void* va)
{
    map_segment_t *s;

    if (OPAL_LIKELY((uintptr_t)va >= (uintptr_t)memheap_map->mem_segs[HEAP_SEG_INDEX].seg_base_addr &&
                    (uintptr_t)va < (uintptr_t)memheap_map->mem_segs[HEAP_SEG_INDEX].end)) {
        s = &memheap_map->mem_segs[HEAP_SEG_INDEX];
    } else {
        s = bsearch(va,
                    &memheap_map->mem_segs[SYMB_SEG_INDEX],
                    memheap_map->n_segments - 1,
                    sizeof(*s),
                    _seg_cmp);
    }

#if MEMHEAP_BASE_DEBUG == 1
    if (s) {
        MEMHEAP_VERBOSE(5, "match seg#%02ld: 0x%llX - 0x%llX %llu bytes va=%p",
                s - memheap_map->mem_segs,
                (long long)s->seg_base_addr,
                (long long)s->end,
                (long long)(s->end - s->seg_base_addr),
                (void *)va);
    }
#endif
    return s;
}
Beispiel #7
0
static int _shm_attach(map_segment_t *s, size_t size, int use_hp, int do_rmid)
{
    static int shm_context = 0;
    ;
    void *addr = NULL;
    int shmid = MEMHEAP_SHM_INVALID;
    int flags;

    assert(s);

    shm_context = use_hp;

    flags = IPC_CREAT | IPC_EXCL | SHM_R | SHM_W;
#if defined (SHM_HUGETLB)
    flags |= (use_hp ? SHM_HUGETLB : 0);
#endif

    /* Create a new shared memory segment and save the shmid. */
    shmid = shmget(IPC_PRIVATE, size, flags);
    if (shmid == MEMHEAP_SHM_INVALID) {
        MEMHEAP_VERBOSE(1, "Failed to get shm segment (errno=%d)", errno);
        return OSHMEM_ERROR;
    }

    /* Attach to the sement */
    addr = shmat(shmid, (void *) mca_memheap_base_start_address, 0);
    if (addr == (void *) -1L) {
        MEMHEAP_VERBOSE(1, "Failed to attach to shm segment (errno=%d)", errno);

        shmctl(shmid, IPC_RMID, NULL );
        return OSHMEM_ERR_OUT_OF_RESOURCE;
    }

    MEMHEAP_VERBOSE(5, "got shmid %d", shmid);

    if (do_rmid)
        shmctl(shmid, IPC_RMID, NULL );

    s->type = MAP_SEGMENT_ALLOC_SHM;
    s->shmid = shmid;
    s->start = addr;
    s->size = size;
    s->end = (void*)((uintptr_t)s->start + s->size);
    s->context = &shm_context;

    return OSHMEM_SUCCESS;
}
Beispiel #8
0
/**
 * @param all_trs
 * 0 - pack mkeys for transports to given pe
 * 1 - pack mkeys for ALL possible transports. value of pe is ignored
 */
static int pack_local_mkeys(opal_buffer_t *msg, int pe, int seg, int all_trs)
{
    oshmem_proc_t *proc;
    int i, n, tr_id;
    sshmem_mkey_t *mkey;

    /* go over all transports to remote pe and pack mkeys */
    if (!all_trs) {
        n = oshmem_get_transport_count(pe);
        proc = oshmem_proc_group_find(oshmem_group_all, pe);
    }
    else {
        proc = NULL;
        n = memheap_map->num_transports;
    }

    opal_dss.pack(msg, &n, 1, OPAL_UINT32);
    MEMHEAP_VERBOSE(5, "found %d transports to %d", n, pe);
    for (i = 0; i < n; i++) {
        if (!all_trs) {
            tr_id = proc->transport_ids[i];
        }
        else {
            tr_id = i;
        }
        mkey = mca_memheap_base_get_mkey(mca_memheap_seg2base_va(seg), tr_id);
        if (!mkey) {
            MEMHEAP_ERROR("seg#%d tr_id: %d failed to find local mkey",
                          seg, tr_id);
            return OSHMEM_ERROR;
        }
        opal_dss.pack(msg, &tr_id, 1, OPAL_UINT32);
        opal_dss.pack(msg, &mkey->va_base, 1, OPAL_UINT64);
        if (0 == mkey->va_base) {
            opal_dss.pack(msg, &mkey->u.key, 1, OPAL_UINT64);
        } else {
            opal_dss.pack(msg, &mkey->len, 1, OPAL_UINT16);
            if (0 < mkey->len) {
                opal_dss.pack(msg, mkey->u.data, mkey->len, OPAL_BYTE);
            }
        }
        MEMHEAP_VERBOSE(5,
                        "seg#%d tr_id: %d %s",
                        seg, tr_id, mca_spml_base_mkey2str(mkey));
    }
    return OSHMEM_SUCCESS;
}
Beispiel #9
0
static int oshmem_mkey_recv_cb(void)
{
    MPI_Status status;
    int flag;
    int n;
    int rc;
    opal_buffer_t *msg;
    int32_t size;
    void *tmp_buf;
    oob_comm_request_t *r;

    n = 0;
    r = (oob_comm_request_t *)opal_list_get_first(&memheap_oob.req_list);
    assert(r);
    while (1) {
        my_MPI_Test(&r->recv_req, &flag, &status);
        if (OPAL_LIKELY(0 == flag)) {
            return n;
        }
        MPI_Get_count(&status, MPI_BYTE, &size);
        MEMHEAP_VERBOSE(5, "OOB request from PE: %d, size %d", status.MPI_SOURCE, size);
        n++;
        opal_list_remove_first(&memheap_oob.req_list);

        /* to avoid deadlock we must start request
         * before processing it. Data are copied to
         * the tmp buffer
         */
        tmp_buf = malloc(size);
        if (NULL == tmp_buf) {
            MEMHEAP_ERROR("not enough memory");
            ORTE_ERROR_LOG(0);
            return n;
        }
        memcpy(tmp_buf, (void*)&r->buf, size);
        msg = OBJ_NEW(opal_buffer_t);
        if (NULL == msg) {
            MEMHEAP_ERROR("not enough memory");
            ORTE_ERROR_LOG(0);
            return n;
        }
        opal_dss.load(msg, (void*)tmp_buf, size);

        rc = MPI_Start(&r->recv_req);
        if (MPI_SUCCESS != rc) {
            MEMHEAP_ERROR("Failed to post recv request %d", rc);
            ORTE_ERROR_LOG(rc);
            return n;
        }
        opal_list_append(&memheap_oob.req_list, &r->super);

        do_recv(status.MPI_SOURCE, msg);
        OBJ_RELEASE(msg);

        r = (oob_comm_request_t *)opal_list_get_first(&memheap_oob.req_list);
        assert(r);
    }
    return 1;  
}
Beispiel #10
0
static int memheap_oob_get_mkeys(int pe, uint32_t seg, sshmem_mkey_t *mkeys)
{
    opal_buffer_t *msg;
    uint8_t cmd;
    int i;
    int rc;

    if (OSHMEM_SUCCESS == MCA_SPML_CALL(oob_get_mkeys(pe, seg, mkeys))) {
        for (i = 0; i < memheap_map->num_transports; i++) {
            mkeys[i].va_base = mca_memheap_seg2base_va(seg);
            MEMHEAP_VERBOSE(5,
                            "MKEY CALCULATED BY LOCAL SPML: pe: %d tr_id: %d %s",
                            pe,
                            i,
                            mca_spml_base_mkey2str(&mkeys[i]));
        }
        return OSHMEM_SUCCESS;
    }

    OPAL_THREAD_LOCK(&memheap_oob.lck);

    memheap_oob.mkeys = mkeys;
    memheap_oob.mkeys_rcvd = 0;

    msg = OBJ_NEW(opal_buffer_t);
    if (!msg) {
        OPAL_THREAD_UNLOCK(&memheap_oob.lck);
        MEMHEAP_ERROR("failed to get msg buffer");
        return OSHMEM_ERROR;
    }

    OPAL_THREAD_LOCK(&memheap_oob.lck);
    cmd = MEMHEAP_RKEY_REQ;
    opal_dss.pack(msg, &cmd, 1, OPAL_UINT8);
    opal_dss.pack(msg, &seg, 1, OPAL_UINT32);

    rc = send_buffer(pe, msg);
    if (MPI_SUCCESS != rc) {
        OPAL_THREAD_UNLOCK(&memheap_oob.lck);
        MEMHEAP_ERROR("FAILED to send rml message %d", rc);
        return OSHMEM_ERROR;
    }

    while (!memheap_oob.mkeys_rcvd) {
        opal_condition_wait(&memheap_oob.cond, &memheap_oob.lck);
    }

    if (MEMHEAP_RKEY_RESP == memheap_oob.mkeys_rcvd) {
        rc = OSHMEM_SUCCESS;
    } else {
        MEMHEAP_ERROR("failed to get rkey seg#%d pe=%d", seg, pe);
        rc = OSHMEM_ERROR;
    }

    OPAL_THREAD_UNLOCK(&memheap_oob.lck);
    return rc;
}
Beispiel #11
0
static int send_buffer(int pe, opal_buffer_t *msg)
{
    void *buffer;
    int32_t size;
    int rc;

    opal_dss.unload(msg, &buffer, &size);
    rc = PMPI_Send(buffer, size, MPI_BYTE, pe, 0, oshmem_comm_world);
    free(buffer);
    OBJ_RELEASE(msg);

    MEMHEAP_VERBOSE(5, "message sent: dst=%d, rc=%d, %d bytes!", pe, rc, size);
    return rc;
}
Beispiel #12
0
static void unpack_remote_mkeys(opal_buffer_t *msg, int remote_pe)
{
    int32_t cnt;
    int32_t n;
    int32_t tr_id;
    int i;
    oshmem_proc_t *proc;

    proc = oshmem_proc_group_find(oshmem_group_all, remote_pe);
    cnt = 1;
    opal_dss.unpack(msg, &n, &cnt, OPAL_UINT32);
    for (i = 0; i < n; i++) {
        cnt = 1;
        opal_dss.unpack(msg, &tr_id, &cnt, OPAL_UINT32);
        cnt = 1;
        opal_dss.unpack(msg,
                        &memheap_oob.mkeys[tr_id].va_base,
                        &cnt,
                        OPAL_UINT64);

        if (0 == memheap_oob.mkeys[tr_id].va_base) {
            cnt = 1;
            opal_dss.unpack(msg, &memheap_oob.mkeys[tr_id].u.key, &cnt, OPAL_UINT64);
            if (OPAL_PROC_ON_LOCAL_NODE(proc->super.proc_flags)) {
                memheap_attach_segment(&memheap_oob.mkeys[tr_id], tr_id);
            }
        } else {
            cnt = 1;
            opal_dss.unpack(msg, &memheap_oob.mkeys[tr_id].len, &cnt, OPAL_UINT16);
            if (0 < memheap_oob.mkeys[tr_id].len) {
                memheap_oob.mkeys[tr_id].u.data = malloc(memheap_oob.mkeys[tr_id].len);
                if (NULL == memheap_oob.mkeys[tr_id].u.data) {
                    MEMHEAP_ERROR("Failed allocate %d bytes", memheap_oob.mkeys[tr_id].len);
                    oshmem_shmem_abort(-1);
                }
                cnt = memheap_oob.mkeys[tr_id].len;
                opal_dss.unpack(msg, memheap_oob.mkeys[tr_id].u.data, &cnt, OPAL_BYTE);
                MCA_SPML_CALL(rmkey_unpack(&memheap_oob.mkeys[tr_id], remote_pe));
            } else {
                memheap_oob.mkeys[tr_id].u.key = MAP_SEGMENT_SHM_INVALID;
            }
        }

        MEMHEAP_VERBOSE(5,
                        "tr_id: %d %s",
                        tr_id, mca_spml_base_mkey2str(&memheap_oob.mkeys[tr_id]));
    }
}
int mca_memheap_base_reg(mca_memheap_map_t *memheap_map)
{
    int ret = OSHMEM_SUCCESS;
    int i;

    for (i = 0; i < memheap_map->n_segments; i++) {
        map_segment_t *s = &memheap_map->mem_segs[i];

        MEMHEAP_VERBOSE(5,
                        "register seg#%02d: 0x%p - 0x%p %llu bytes type=0x%X id=0x%X",
                        i,
                        s->start,
                        s->end,
                        (long long)(s->end - s->start),
                        s->type,
                        s->shmid);
        ret = __reg_segment(s, &memheap_map->num_transports);
    }

    return ret;
}
int mca_memheap_base_dereg(mca_memheap_map_t *memheap_map)
{
    int ret = OSHMEM_SUCCESS;
    int i;

    for (i = 0; i < memheap_map->n_segments; i++) {
        map_segment_t *s = &memheap_map->mem_segs[i];

        if (!s->is_active)
            continue;

        MEMHEAP_VERBOSE(5,
                        "deregistering segment#%d: %p - %p %llu bytes",
                        i,
                        s->start,
                        s->end,
                        (long long)(s->end - s->start));
        ret = __dereg_segment(s);
    }

    return ret;
}
static void do_mkey_resp(opal_buffer_t *msg)
{
    int32_t cnt;
    int32_t n;
    int32_t tr_id;
    int i;

    cnt = 1;
    opal_dss.unpack(msg, &n, &cnt, OPAL_UINT32);
    for (i = 0; i < n; i++) {
        opal_dss.unpack(msg, &tr_id, &cnt, OPAL_UINT32);
        opal_dss.unpack(msg, &memheap_oob.mkeys[tr_id].key, &cnt, OPAL_UINT64);
        opal_dss.unpack(msg,
                        &memheap_oob.mkeys[tr_id].va_base,
                        &cnt,
                        OPAL_UINT64);

        if (NULL != MCA_SPML_CALL(set_remote_context_size)) {
            int32_t context_size;
            opal_dss.unpack(msg, &context_size, &cnt, OPAL_UINT32);
            if (0 != context_size) {
                MCA_SPML_CALL(set_remote_context_size(&(memheap_oob.mkeys[tr_id].spml_context), context_size));
                void* context;
                context = calloc(1, context_size);
                opal_dss.unpack(msg, context, &context_size, OPAL_BYTE);
                MCA_SPML_CALL(set_remote_context(&(memheap_oob.mkeys[tr_id].spml_context),context));
            }
        }

        memheap_attach_segment(&memheap_oob.mkeys[tr_id], tr_id);

        MEMHEAP_VERBOSE(5,
                        "tr_id: %d key %llx base_va %p",
                        tr_id, (unsigned long long)memheap_oob.mkeys[tr_id].key, memheap_oob.mkeys[tr_id].va_base);
    }
}
Beispiel #16
0
static void memheap_attach_segment(sshmem_mkey_t *mkey, int tr_id)
{
    /* process special case when va was got using sshmem
     * this case is notable for:
     * - key is set as (seg_id);
     * - va_base is set as 0;
     * - len is set as 0;
     */
    assert(mkey->va_base == 0);
    assert(mkey->len == 0);

    MEMHEAP_VERBOSE(5,
            "shared memory usage tr_id: %d va_base: 0x%p len: %d key %llx",
            tr_id,
            mkey->va_base, mkey->len, (unsigned long long)mkey->u.key);

    mca_sshmem_segment_attach(&(memheap_map->mem_segs[HEAP_SEG_INDEX]), mkey);

    if ((void *) -1 == (void *) mkey->va_base) {
        MEMHEAP_ERROR("tr_id: %d key %llx attach failed: errno = %d",
                tr_id, (unsigned long long)mkey->u.key, errno);
        oshmem_shmem_abort(-1);
    }
}
Beispiel #17
0
void mca_memheap_modex_recv_all(void)
{
    int i;
    int j;
    int nprocs, my_pe;
    opal_buffer_t *msg = NULL;
    void *send_buffer = NULL;
    char *rcv_buffer = NULL;
    int size;
    int *rcv_size = NULL;
    int *rcv_n_transports = NULL;
    int *rcv_offsets = NULL;
    int rc = OSHMEM_SUCCESS;
    size_t buffer_size;

    if (!mca_memheap_base_key_exchange) {
        oshmem_shmem_barrier();
        return;
    }

    nprocs = oshmem_num_procs();
    my_pe = oshmem_my_proc_id();

    /* buffer allocation for num_transports
     * message sizes and offsets */

    rcv_size = (int *)malloc(nprocs * sizeof(int));
    if (NULL == rcv_size) {
        MEMHEAP_ERROR("failed to get rcv_size buffer");
        rc = OSHMEM_ERR_OUT_OF_RESOURCE;
        goto exit_fatal;
    }

    rcv_offsets = (int *)malloc(nprocs * sizeof(int));
    if (NULL == rcv_offsets) {
        MEMHEAP_ERROR("failed to get rcv_offsets buffer");
        rc = OSHMEM_ERR_OUT_OF_RESOURCE;
        goto exit_fatal;
    }

    rcv_n_transports = (int *)malloc(nprocs * sizeof(int));
    if (NULL == rcv_offsets) {
        MEMHEAP_ERROR("failed to get rcv_offsets buffer");
        rc = OSHMEM_ERR_OUT_OF_RESOURCE;
        goto exit_fatal;
    }

    /* serialize our own mkeys */
    msg = OBJ_NEW(opal_buffer_t);
    if (NULL == msg) {
        MEMHEAP_ERROR("failed to get msg buffer");
        rc = OSHMEM_ERR_OUT_OF_RESOURCE;
        goto exit_fatal;
    }

    for (j = 0; j < memheap_map->n_segments; j++) {
        pack_local_mkeys(msg, 0, j, 1);
    }

    /* we assume here that int32_t returned by opal_dss.unload
     * is equal to size of int we use for MPI_Allgather, MPI_Allgatherv */

    assert(sizeof(int32_t) == sizeof(int));

    /* Do allgather */
    opal_dss.unload(msg, &send_buffer, &size);
    MEMHEAP_VERBOSE(1, "local keys packed into %d bytes, %d segments", size, memheap_map->n_segments);

    /* we need to send num_transports and message sizes separately
     * since message sizes depend on types of btl used */

    rc = oshmem_shmem_allgather(&memheap_map->num_transports, rcv_n_transports, sizeof(int));
    if (MPI_SUCCESS != rc) {
        MEMHEAP_ERROR("allgather failed");
        goto exit_fatal;
    }

    rc = oshmem_shmem_allgather(&size, rcv_size, sizeof(int));
    if (MPI_SUCCESS != rc) {
        MEMHEAP_ERROR("allgather failed");
        goto exit_fatal;
    }

    /* calculating offsets (displacements) for allgatherv */

    rcv_offsets[0] = 0;
    for (i = 1; i < nprocs; i++) {
        rcv_offsets[i] = rcv_offsets[i - 1] + rcv_size[i - 1];
    }

    buffer_size = rcv_offsets[nprocs - 1] + rcv_size[nprocs - 1];

    rcv_buffer = malloc (buffer_size);
    if (NULL == rcv_buffer) {
        MEMHEAP_ERROR("failed to allocate recieve buffer");
        rc = OSHMEM_ERR_OUT_OF_RESOURCE;
        goto exit_fatal;
    }

    rc = oshmem_shmem_allgatherv(send_buffer, rcv_buffer, size, rcv_size, rcv_offsets);
    if (MPI_SUCCESS != rc) {
        free (rcv_buffer);
        MEMHEAP_ERROR("allgatherv failed");
        goto exit_fatal;
    }

    opal_dss.load(msg, rcv_buffer, buffer_size);

    /* deserialize mkeys */
    OPAL_THREAD_LOCK(&memheap_oob.lck);
    for (i = 0; i < nprocs; i++) {
        if (i == my_pe) {
            continue;
        }

        msg->unpack_ptr = (void *)((intptr_t) msg->base_ptr + rcv_offsets[i]);

        for (j = 0; j < memheap_map->n_segments; j++) {
            map_segment_t *s;

            s = &memheap_map->mem_segs[j];
            if (NULL != s->mkeys_cache[i]) {
                MEMHEAP_VERBOSE(10, "PE%d: segment%d already exists, mkey will be replaced", i, j);
            } else {
                s->mkeys_cache[i] = (sshmem_mkey_t *) calloc(rcv_n_transports[i],
                        sizeof(sshmem_mkey_t));
                if (NULL == s->mkeys_cache[i]) {
                    MEMHEAP_ERROR("PE%d: segment%d: Failed to allocate mkeys cache entry", i, j);
                    oshmem_shmem_abort(-1);
                }
            }
            memheap_oob.mkeys = s->mkeys_cache[i];
            unpack_remote_mkeys(msg, i);
        }
    }

    OPAL_THREAD_UNLOCK(&memheap_oob.lck);

exit_fatal:
    if (rcv_size) {
        free(rcv_size);
    }
    if (rcv_offsets) {
        free(rcv_offsets);
    }
    if (rcv_n_transports) {
        free(rcv_n_transports);
    }
    if (send_buffer) {
        free(send_buffer);
    }
    if (msg) {
        OBJ_RELEASE(msg);
    }

    /* This function requires abort in any error case */
    if (OSHMEM_SUCCESS != rc) {
        oshmem_shmem_abort(rc);
    }
}
Beispiel #18
0
static int oshmem_mkey_recv_cb(void)
{
    MPI_Status status;
    int flag;
    int n;
    int rc;
    opal_buffer_t *msg;
    int32_t size;
    void *tmp_buf;
    oob_comm_request_t *r;

    n = 0;
    r = (oob_comm_request_t *)opal_list_get_first(&memheap_oob.req_list);
    assert(r);
    while(r != (oob_comm_request_t *)opal_list_get_end(&memheap_oob.req_list)) {
        my_MPI_Test(&r->recv_req, &flag, &status);
        if (OPAL_LIKELY(0 == flag)) {
            return n;
        }
        PMPI_Get_count(&status, MPI_BYTE, &size);
        MEMHEAP_VERBOSE(5, "OOB request from PE: %d, size %d", status.MPI_SOURCE, size);
        n++;
        opal_list_remove_first(&memheap_oob.req_list);

        /* to avoid deadlock we must start request
         * before processing it. Data are copied to
         * the tmp buffer
         */
        tmp_buf = malloc(size);
        if (NULL == tmp_buf) {
            MEMHEAP_ERROR("not enough memory");
            ORTE_ERROR_LOG(0);
            return n;
        } else {
		    memcpy(tmp_buf, (void*)&r->buf, size);
		    msg = OBJ_NEW(opal_buffer_t);
		    if (NULL == msg) {
		        MEMHEAP_ERROR("not enough memory");
		        ORTE_ERROR_LOG(0);
		        free(tmp_buf);
		        return n;
		    }
		    opal_dss.load(msg, (void*)tmp_buf, size);

            /*
             * send reply before posting the receive request again to limit the recursion size to
             * number of receive requests.
             * send can call opal_progress which calls this function again. If recv req is started
             * stack size will be proportional to number of job ranks.
             */
            do_recv(status.MPI_SOURCE, msg);
            OBJ_RELEASE(msg);
        }

        rc = PMPI_Start(&r->recv_req);
        if (MPI_SUCCESS != rc) {
            MEMHEAP_ERROR("Failed to post recv request %d", rc);
            ORTE_ERROR_LOG(rc);
            return n;
        }
        opal_list_append(&memheap_oob.req_list, &r->super);


        r = (oob_comm_request_t *)opal_list_get_first(&memheap_oob.req_list);
        assert(r);
    }

    return 1;
}
Beispiel #19
0
static void do_recv(int source_pe, opal_buffer_t* buffer)
{
    int32_t cnt = 1;
    int rc;
    opal_buffer_t *msg;
    uint8_t msg_type;
    uint32_t seg;

    MEMHEAP_VERBOSE(5, "unpacking %d of %d", cnt, OPAL_UINT8);
    rc = opal_dss.unpack(buffer, &msg_type, &cnt, OPAL_UINT8);
    if (OPAL_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
        goto send_fail;
    }

    switch (msg_type) {
    case MEMHEAP_RKEY_REQ:
        cnt = 1;
        rc = opal_dss.unpack(buffer, &seg, &cnt, OPAL_UINT32);
        if (OPAL_SUCCESS != rc) {
            MEMHEAP_ERROR("bad RKEY_REQ msg");
            goto send_fail;
        }

        MEMHEAP_VERBOSE(5, "*** RKEY REQ");
        msg = OBJ_NEW(opal_buffer_t);
        if (!msg) {
            MEMHEAP_ERROR("failed to get msg buffer");
            ORTE_ERROR_LOG(rc);
            return;
        }

        msg_type = MEMHEAP_RKEY_RESP;
        opal_dss.pack(msg, &msg_type, 1, OPAL_UINT8);

        if (OSHMEM_SUCCESS != pack_local_mkeys(msg, source_pe, seg, 0)) {
            OBJ_RELEASE(msg);
            goto send_fail;
        }

        rc = send_buffer(source_pe, msg);
        if (MPI_SUCCESS != rc) {
            MEMHEAP_ERROR("FAILED to send rml message %d", rc);
            ORTE_ERROR_LOG(rc);
            goto send_fail;
        }
        break;

    case MEMHEAP_RKEY_RESP:
        MEMHEAP_VERBOSE(5, "*** RKEY RESP");
        OPAL_THREAD_LOCK(&memheap_oob.lck);
        unpack_remote_mkeys(buffer, source_pe);
        memheap_oob.mkeys_rcvd = MEMHEAP_RKEY_RESP;
        opal_condition_broadcast(&memheap_oob.cond);
        OPAL_THREAD_UNLOCK(&memheap_oob.lck);
        break;

    case MEMHEAP_RKEY_RESP_FAIL:
        MEMHEAP_VERBOSE(5, "*** RKEY RESP FAIL");
        memheap_oob.mkeys_rcvd = MEMHEAP_RKEY_RESP_FAIL;
        opal_condition_broadcast(&memheap_oob.cond);
        OPAL_THREAD_UNLOCK(&memheap_oob.lck);
        break;

    default:
        MEMHEAP_VERBOSE(5, "Unknown message type %x", msg_type);
        goto send_fail;
    }
    return;

    send_fail: msg = OBJ_NEW(opal_buffer_t);
    if (!msg) {
        MEMHEAP_ERROR("failed to get msg buffer");
        ORTE_ERROR_LOG(rc);
        return;
    }
    msg_type = MEMHEAP_RKEY_RESP_FAIL;
    opal_dss.pack(msg, &msg_type, 1, OPAL_UINT8);

    rc = send_buffer(source_pe, msg);
    if (MPI_SUCCESS != rc) {
        MEMHEAP_ERROR("FAILED to send rml message %d", rc);
        ORTE_ERROR_LOG(rc);
    }

}
static void memheap_buddy_rml_recv_cb(int status,
                                      orte_process_name_t* process_name,
                                      opal_buffer_t* buffer,
                                      orte_rml_tag_t tag,
                                      void* cbdata)
{
    MEMHEAP_VERBOSE(5,
                    "**** get request from %u:%d",
                    process_name->jobid, process_name->vpid);
    int32_t cnt = 1;
    int rc;
    opal_buffer_t *msg;
    uint8_t msg_type;
    uint32_t seg;

    MEMHEAP_VERBOSE(5, "unpacking %d of %d", cnt, OPAL_UINT8);
    rc = opal_dss.unpack(buffer, &msg_type, &cnt, OPAL_UINT8);
    if (ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
        goto send_fail;
    }

    switch (msg_type) {
    case MEMHEAP_RKEY_REQ:
        cnt = 1;
        rc = opal_dss.unpack(buffer, &seg, &cnt, OPAL_UINT32);
        if (ORTE_SUCCESS != rc) {
            MEMHEAP_ERROR("bad RKEY_REQ msg");
            goto send_fail;
        }

        MEMHEAP_VERBOSE(5, "*** RKEY REQ");
        msg = OBJ_NEW(opal_buffer_t);
        if (!msg) {
            MEMHEAP_ERROR("failed to get msg buffer");
            ORTE_ERROR_LOG(rc);
            return;
        }

        if (OSHMEM_SUCCESS != do_mkey_req(msg, process_name->vpid, seg)) {
            OBJ_RELEASE(msg);
            goto send_fail;
        }

        rc = orte_rml.send_buffer_nb(process_name, msg, OMPI_RML_TAG_SHMEM, orte_rml_send_callback, NULL);

        if (0 > rc) {
            MEMHEAP_ERROR("FAILED to send rml message %d", rc);
            ORTE_ERROR_LOG(rc);
            goto send_fail;
        }
        break;

    case MEMHEAP_RKEY_RESP:
        MEMHEAP_VERBOSE(5, "*** RKEY RESP");
        OPAL_THREAD_LOCK(&memheap_oob.lck);
        do_mkey_resp(buffer);
        memheap_oob.mkeys_rcvd = MEMHEAP_RKEY_RESP;
        opal_condition_broadcast(&memheap_oob.cond);
        OPAL_THREAD_UNLOCK(&memheap_oob.lck);
        break;

    case MEMHEAP_RKEY_RESP_FAIL:
        MEMHEAP_VERBOSE(5, "*** RKEY RESP FAIL");
        memheap_oob.mkeys_rcvd = MEMHEAP_RKEY_RESP_FAIL;
        opal_condition_broadcast(&memheap_oob.cond);
        OPAL_THREAD_UNLOCK(&memheap_oob.lck);
        break;

    default:
        MEMHEAP_VERBOSE(5, "Unknown message type %x", msg_type);
        goto send_fail;
    }
    return;

    send_fail: msg = OBJ_NEW(opal_buffer_t);
    if (!msg) {
        MEMHEAP_ERROR("failed to get msg buffer");
        ORTE_ERROR_LOG(rc);
        return;
    }
    msg_type = MEMHEAP_RKEY_RESP_FAIL;
    opal_dss.pack(msg, &msg_type, 1, OPAL_UINT8);

    rc = orte_rml.send_buffer_nb(process_name, msg, OMPI_RML_TAG_SHMEM, orte_rml_send_callback, NULL);
    if (0 > rc) {
        MEMHEAP_ERROR("FAILED to send rml message %d", rc);
        ORTE_ERROR_LOG(rc);
    }

}
static void memheap_attach_segment(mca_spml_mkey_t *mkey, int tr_id)
{
    /* process special case when va was got using shmget(IPC_PRIVATE)
     * this case is notable for:
     * - key is set as (type|shmid);
     * - va_base is set as 0;
     */
    if (!mkey->va_base
            && ((int) MEMHEAP_SHM_GET_ID(mkey->key) != MEMHEAP_SHM_INVALID)) {
        MEMHEAP_VERBOSE(5,
                        "shared memory usage tr_id: %d key %llx base_va %p shmid 0x%X|0x%X",
                        tr_id,
                        (unsigned long long)mkey->key,
                        mkey->va_base,
                        MEMHEAP_SHM_GET_TYPE(mkey->key),
                        MEMHEAP_SHM_GET_ID(mkey->key));

        if (MEMHEAP_SHM_GET_TYPE(mkey->key) == MAP_SEGMENT_ALLOC_SHM) {
            mkey->va_base = shmat(MEMHEAP_SHM_GET_ID(mkey->key),
                                             0,
                                             0);
        } else if (MEMHEAP_SHM_GET_TYPE(mkey->key) == MAP_SEGMENT_ALLOC_IBV) {
#if defined(MPAGE_ENABLE) && (MPAGE_ENABLE > 0)
            openib_device_t *device = NULL;
            struct ibv_mr *ib_mr;
            void *addr;
            static int mr_count;

            int access_flag = IBV_ACCESS_LOCAL_WRITE |
            IBV_ACCESS_REMOTE_WRITE |
            IBV_ACCESS_REMOTE_READ |
            IBV_ACCESS_NO_RDMA;

            device = (openib_device_t *)memheap_map->mem_segs[HEAP_SEG_INDEX].context;
            assert(device);

            /* workaround mtt problem - request aligned addresses */
            ++mr_count;
            addr = (void *)(mca_memheap_base_start_address + mca_memheap_base_mr_interleave_factor*1024ULL*1024ULL*1024ULL*mr_count);
            ib_mr = ibv_reg_shared_mr(MEMHEAP_SHM_GET_ID(mkey->key),
                    device->ib_pd, addr, access_flag);
            if (NULL == ib_mr)
            {
                mkey->va_base = (void*)-1;
                MEMHEAP_ERROR("error to ibv_reg_shared_mr() errno says %d: %s",
                        errno, strerror(errno));
            }
            else
            {
                if (ib_mr->addr != addr) {
                    MEMHEAP_WARN("Failed to map shared region to address %p got addr %p. Try to increase 'memheap_mr_interleave_factor' from %d", addr, ib_mr->addr, mca_memheap_base_mr_interleave_factor);
                }

                opal_value_array_append_item(&device->ib_mr_array, &ib_mr);
                mkey->va_base = ib_mr->addr;
            }
#endif /* MPAGE_ENABLE */
        } else {
            MEMHEAP_ERROR("tr_id: %d key %llx attach failed: incorrect shmid 0x%X|0x%X",
                          tr_id, 
                          (unsigned long long)mkey->key,
                          MEMHEAP_SHM_GET_TYPE(mkey->key),
                          MEMHEAP_SHM_GET_ID(mkey->key));
            oshmem_shmem_abort(-1);
        }

        if ((void *) -1 == (void *) mkey->va_base) {
            MEMHEAP_ERROR("tr_id: %d key %llx attach failed: errno = %d",
                          tr_id, (unsigned long long)mkey->key, errno);
            oshmem_shmem_abort(-1);
        }
    }
}
static int memheap_oob_get_mkeys(int pe, uint32_t seg, mca_spml_mkey_t *mkeys)
{
    orte_process_name_t name;
    opal_buffer_t *msg;
    int rc;
    uint8_t cmd;
    int i;

    if (OSHMEM_SUCCESS == MCA_SPML_CALL(oob_get_mkeys(pe, seg, mkeys))) {
        for (i = 0; i < memheap_map->num_transports; i++) {
            mkeys[i].va_base = __seg2base_va(seg);
            MEMHEAP_VERBOSE(5,
                            "MKEY CALCULATED BY LOCAL SPML: pe: %d tr_id: %d key %llx base_va %p",
                            pe,
                            i,
                            (unsigned long long)mkeys[i].key,
                            mkeys[i].va_base);
        }
        return OSHMEM_SUCCESS;
    }

    OPAL_THREAD_LOCK(&memheap_oob.lck);

    memheap_oob.mkeys = mkeys;
    memheap_oob.mkeys_rcvd = 0;

    name.jobid = ORTE_PROC_MY_NAME->jobid;
    name.vpid = pe;

    msg = OBJ_NEW(opal_buffer_t);
    if (!msg) {
        OPAL_THREAD_UNLOCK(&memheap_oob.lck);
        MEMHEAP_ERROR("failed to get msg buffer");
        return OSHMEM_ERROR;
    }

    OPAL_THREAD_LOCK(&memheap_oob.lck);
    cmd = MEMHEAP_RKEY_REQ;
    opal_dss.pack(msg, &cmd, 1, OPAL_UINT8);
    opal_dss.pack(msg, &seg, 1, OPAL_UINT32);
    rc = orte_rml.send_buffer_nb(&name, msg, OMPI_RML_TAG_SHMEM, orte_rml_send_callback, NULL);
    if (0 > rc) {
        OBJ_RELEASE(msg);
        OPAL_THREAD_UNLOCK(&memheap_oob.lck);
        MEMHEAP_ERROR("FAILED to send rml message %d", rc);
        return OSHMEM_ERROR;
    }

    MEMHEAP_VERBOSE(5, "message sent: %d bytes!", rc);

    while (!memheap_oob.mkeys_rcvd) {
        opal_condition_wait(&memheap_oob.cond, &memheap_oob.lck);
    }

    if (MEMHEAP_RKEY_RESP == memheap_oob.mkeys_rcvd) {
        rc = OSHMEM_SUCCESS;
    } else {
        MEMHEAP_ERROR("failed to get rkey seg#%d pe=%d", seg, pe);
        rc = OSHMEM_ERROR;
    }

    OPAL_THREAD_UNLOCK(&memheap_oob.lck);
    return rc;
}
Beispiel #23
0
static int _ibv_attach(map_segment_t *s, size_t size)
{
    int rc = OSHMEM_SUCCESS;
    static openib_device_t memheap_device;
    openib_device_t *device = &memheap_device;
    int num_devs = 0;

    assert(s);

    memset(device, 0, sizeof(*device));

#ifdef HAVE_IBV_GET_DEVICE_LIST
    device->ib_devs = ibv_get_device_list(&num_devs);
#else
#error unsupported ibv_get_device_list in infiniband/verbs.h
#endif

    if (num_devs == 0 || !device->ib_devs)
    {
        rc = OSHMEM_ERR_NOT_SUPPORTED;
    }

    /* Open device */
    if (!rc)
    {
        int i = 0;

        if (num_devs > 1)
        {
            if (NULL == mca_memheap_base_param_hca_name)
            {
                MEMHEAP_VERBOSE(5, "found %d HCAs, choosing the first", num_devs);
            }
            else
            {
                MEMHEAP_VERBOSE(5, "found %d HCAs, searching for %s", num_devs, mca_memheap_base_param_hca_name);
            }
        }

        for (i = 0; i < num_devs; i++)
        {
            device->ib_dev = device->ib_devs[i];

            device->ib_dev_context = ibv_open_device(device->ib_dev);
            if (NULL == device->ib_dev_context)
            {
                MEMHEAP_ERROR("error obtaining device context for %s errno says %d: %s",
                        ibv_get_device_name(device->ib_dev), errno, strerror(errno));
                rc = OSHMEM_ERR_RESOURCE_BUSY;
            }
            else
            {
                if (NULL != mca_memheap_base_param_hca_name)
                {
                    if (0 == strcmp(mca_memheap_base_param_hca_name,ibv_get_device_name(device->ib_dev)))
                    {
                        MEMHEAP_VERBOSE(5, "mca_memheap_base_param_hca_name = %s, selected %s as %d of %d", mca_memheap_base_param_hca_name, ibv_get_device_name(device->ib_dev), i, num_devs);
                        rc = OSHMEM_SUCCESS;
                        break;
                    }
                }
                else
                {
                    MEMHEAP_VERBOSE(5, "mca_memheap_base_param_hca_name = %s, selected %s as %d of %d", mca_memheap_base_param_hca_name, ibv_get_device_name(device->ib_dev), i, num_devs);
                    rc = OSHMEM_SUCCESS;
                    break;
                }
            }
        }
    }

    /* Obtain device attributes */
    if (!rc)
    {
        if (ibv_query_device(device->ib_dev_context, &device->ib_dev_attr))
        {
            MEMHEAP_ERROR("error obtaining device attributes for %s errno says %d: %s",
                    ibv_get_device_name(device->ib_dev), errno, strerror(errno));
            rc = OSHMEM_ERR_RESOURCE_BUSY;
        }
        else
        {
            MEMHEAP_VERBOSE(5, "ibv device %s",
                    ibv_get_device_name(device->ib_dev));
        }
    }

    /* Allocate the protection domain for the device */
    if (!rc)
    {
        device->ib_pd = ibv_alloc_pd(device->ib_dev_context);
        if (NULL == device->ib_pd)
        {
            MEMHEAP_ERROR("error allocating protection domain for %s errno says %d: %s",
                    ibv_get_device_name(device->ib_dev), errno, strerror(errno));
            rc = OSHMEM_ERR_RESOURCE_BUSY;
        }
    }

    /* Allocate memory */
    if (!rc)
    {
        void *addr = NULL;
        struct ibv_mr *ib_mr = NULL;
        int access_flag = IBV_ACCESS_LOCAL_WRITE |
        IBV_ACCESS_REMOTE_WRITE |
        IBV_ACCESS_REMOTE_READ;

        OBJ_CONSTRUCT(&device->ib_mr_array, opal_value_array_t);
        opal_value_array_init(&device->ib_mr_array, sizeof(struct ibv_mr *));

#if defined(MPAGE_ENABLE) && (MPAGE_ENABLE > 0)
        access_flag |= IBV_ACCESS_ALLOCATE_MR |
        IBV_ACCESS_SHARED_MR_USER_READ |
        IBV_ACCESS_SHARED_MR_USER_WRITE;
#endif /* MPAGE_ENABLE */

        ib_mr = ibv_reg_mr(device->ib_pd, addr, size, access_flag);
        if (NULL == ib_mr)
        {
            MEMHEAP_ERROR("error to ibv_reg_mr() %llu bytes errno says %d: %s",
                    (unsigned long long)size, errno, strerror(errno));
            rc = OSHMEM_ERR_OUT_OF_RESOURCE;
        }
        else
        {
            device->ib_mr_shared = ib_mr;
            opal_value_array_append_item(&device->ib_mr_array, &ib_mr);
        }

#if defined(MPAGE_ENABLE) && (MPAGE_ENABLE > 0)
        if (!rc)
        {
            access_flag = IBV_ACCESS_LOCAL_WRITE |
            IBV_ACCESS_REMOTE_WRITE |
            IBV_ACCESS_REMOTE_READ|
            IBV_ACCESS_NO_RDMA;

            addr = (void *)mca_memheap_base_start_address;
            ib_mr = ibv_reg_shared_mr(device->ib_mr_shared->handle,
                    device->ib_pd, addr, access_flag);
            if (NULL == ib_mr)
            {
                MEMHEAP_ERROR("error to ibv_reg_shared_mr() %llu bytes errno says %d: %s",
                        (unsigned long long)size, errno, strerror(errno));
                rc = OSHMEM_ERR_OUT_OF_RESOURCE;
            }
            else
            {
                opal_value_array_append_item(&device->ib_mr_array, &ib_mr);
            }
        }
#endif /* MPAGE_ENABLE */

        if (!rc)
        {
            assert(size == device->ib_mr_shared->length);

            s->type = MAP_SEGMENT_ALLOC_IBV;
            s->shmid = device->ib_mr_shared->handle;
            s->start = ib_mr->addr;
            s->size = size;
            s->end = (void*)((uintptr_t)s->start + s->size);
            s->context = &memheap_device;
        }
    }

    return rc;
}
Beispiel #24
0
int mca_memheap_base_alloc_init(mca_memheap_map_t *map, size_t size)
{
    int ret = OSHMEM_SUCCESS;
    int value = mca_memheap_base_alloc_type;

    assert(map);
    assert(HEAP_SEG_INDEX == map->n_segments);

    MEMHEAP_VERBOSE(5,
                    "memheap method : %d",
                    mca_memheap_base_alloc_type);

    map_segment_t *s = &map->mem_segs[map->n_segments];
    memset(s, 0, sizeof(*s));
    s->is_active = 0;
    s->shmid = MEMHEAP_SHM_INVALID;
    s->start = 0;
    s->end = 0;
    s->size = 0;
    s->type = MAP_SEGMENT_UNKNOWN;
    s->context = NULL;

    switch (value) {
    case 0:
        /* use sysv alloc without hugepages */
        ret = _shm_attach(s, size, 0, 1);
        break;

    case 1:
        ret = _shm_attach(s, size, 1, 1);
        if (OSHMEM_SUCCESS != ret)
            ret = _shm_attach(s, size, 0, 1);
        break;

    case 2:
        /* huge pages only */
        ret = _shm_attach(s, size, 1, 1);
        if (OSHMEM_SUCCESS != ret)
            MEMHEAP_ERROR("FAILED to allocated symmetric heap using hugepages fallback is disabled, errno=%d",
                          errno);
        break;

    case 3:
        /* huge pages only + cleanup shmid */
        ret = _shm_attach(s, size, 1, 0);
        if (OSHMEM_SUCCESS != ret)
            MEMHEAP_ERROR("FAILED to allocated symmetric heap using hugepages fallback is disabled, errno=%d",
                          errno);
        break;

    case 4:
        /* use sysv alloc without hugepages */
        ret = _shm_attach(s, size, 0, 0);
        break;

#if defined(MPAGE_ENABLE) && (MPAGE_ENABLE > 0)
        case 5:
        /* use shared memory registration (mpages) */
        ret = _ibv_attach(s, size);
        if (OSHMEM_SUCCESS != ret)
            ret = _shm_attach(s, size, 0, 1);

        break;
#endif /* MPAGE_ENABLE */

    case 100:
        /* use mmap. It will severaly impact performance of intra node communication */
        ret = _mmap_attach(s, size);
        MEMHEAP_VERBOSE(1,
                        "mmap() memheap allocation will severely impact performance of intra node communication");
        break;

    case 101:
        ret = _shm_attach(s, size, 1, 1);
        if (OSHMEM_SUCCESS != ret) {
            MEMHEAP_ERROR("Failed to allocate hugepages. Falling back on regular allocation");
            ret = _mmap_attach(s, size);
        } else {
            s->shmid = MEMHEAP_SHM_INVALID;
        }
        MEMHEAP_VERBOSE(1, "SM BTL will be always used for intranode comm\n");
        break;

    case 102:
        ret = _shm_attach(s, size, 1, 1);
        if (OSHMEM_SUCCESS != ret) {
            MEMHEAP_ERROR("FAILED to allocated symmetric heap using hugepages fallback is disabled, errno=%d",
                          errno);
        } else {
            s->shmid = MEMHEAP_SHM_INVALID;
        }
        break;

    default:
        ret = _adaptive_attach(s, size);
    }

    if (OSHMEM_SUCCESS == ret) {
        map->n_segments++;
        MEMHEAP_VERBOSE(1,
                        "Memheap alloc memory: %llu byte(s), %d segments by method: %d",
                        (unsigned long long)size, map->n_segments, s->type);
    }

    return ret;
}