/*
 * register memory
 */
static int mca_rcache_grdma_register (mca_rcache_base_module_t *rcache, void *addr,
                                      size_t size, uint32_t flags, int32_t access_flags,
                                      mca_rcache_base_registration_t **reg)
{
    mca_rcache_grdma_module_t *rcache_grdma = (mca_rcache_grdma_module_t*)rcache;
    const bool bypass_cache = !!(flags & MCA_RCACHE_FLAGS_CACHE_BYPASS);
    const bool persist = !!(flags & MCA_RCACHE_FLAGS_PERSIST);
    mca_rcache_base_registration_t *grdma_reg;
    opal_free_list_item_t *item;
    unsigned char *base, *bound;
    unsigned int page_size = opal_getpagesize ();
    int rc;

    *reg = NULL;

    /* if cache bypass is requested don't use the cache */
    base = OPAL_DOWN_ALIGN_PTR(addr, page_size, unsigned char *);
    bound = OPAL_ALIGN_PTR((intptr_t) addr + size, page_size, unsigned char *) - 1;

#if OPAL_CUDA_GDR_SUPPORT
    if (flags & MCA_RCACHE_FLAGS_CUDA_GPU_MEM) {
        size_t psize;
        mca_common_cuda_get_address_range(&base, &psize, addr);
        bound = base + psize - 1;
        /* Check to see if this memory is in the cache and if it has been freed. If so,
         * this call will boot it out of the cache. */
        check_for_cuda_freed_memory(rcache, base, psize);
    }
#endif /* OPAL_CUDA_GDR_SUPPORT */

    do_unregistration_gc (rcache);

    /* look through existing regs if not persistent registration requested.
     * Persistent registration are always registered and placed in the cache */
    if (!(bypass_cache || persist)) {
        mca_rcache_base_find_args_t find_args = {.reg = NULL, .rcache_grdma = rcache_grdma,
                                                 .base = base, .bound = bound,
                                                 .access_flags = access_flags};
        /* check to see if memory is registered */
        rc = mca_rcache_base_vma_iterate (rcache_grdma->cache->vma_module, base, size,
                                          mca_rcache_grdma_check_cached, (void *) &find_args);
        if (1 == rc) {
            *reg = find_args.reg;
            return OPAL_SUCCESS;
        }

        /* get updated access flags */
        access_flags = find_args.access_flags;

        OPAL_THREAD_ADD32((volatile int32_t *) &rcache_grdma->stat_cache_miss, 1);
    }
Exemple #2
0
void mca_mpool_grdma_finalize(struct mca_mpool_base_module_t *mpool)
{
    mca_mpool_grdma_module_t *mpool_grdma = (mca_mpool_grdma_module_t*)mpool;
    mca_mpool_base_registration_t *regs[GRDMA_MPOOL_NREGS];
    int reg_cnt, i;

    /* Statistic */
    if (true == mca_mpool_grdma_component.print_stats) {
        opal_output(0, "%s grdma: stats "
                "(hit/miss/found/not found/evicted): %d/%d/%d/%d/%d\n",
                OPAL_NAME_PRINT(OPAL_PROC_MY_NAME),
                mpool_grdma->stat_cache_hit, mpool_grdma->stat_cache_miss,
                mpool_grdma->stat_cache_found, mpool_grdma->stat_cache_notfound,
                mpool_grdma->stat_evicted);
    }

    OPAL_THREAD_LOCK(&mpool->rcache->lock);

    do_unregistration_gc(mpool);

    do {
        reg_cnt = mpool->rcache->rcache_find_all(mpool->rcache, 0, (size_t)-1,
                                                 regs, GRDMA_MPOOL_NREGS);

        for (i = 0 ; i < reg_cnt ; ++i) {
            if (regs[i]->ref_count) {
                regs[i]->ref_count = 0; /* otherwise dereg will fail on assert */
            } else if (mca_mpool_grdma_component.leave_pinned) {
                opal_list_remove_item(&mpool_grdma->pool->lru_list,
                                      (opal_list_item_t *) regs[i]);
            }

	    (void) dereg_mem(regs[i]);
        }
    } while (reg_cnt == GRDMA_MPOOL_NREGS);

    OBJ_RELEASE(mpool_grdma->pool);

    OBJ_DESTRUCT(&mpool_grdma->reg_list);
    OPAL_THREAD_UNLOCK(&mpool->rcache->lock);

    /* Cleanup any vmas that we have deferred deletion on */
    mpool->rcache->rcache_clean(mpool->rcache);

    /* this mpool was allocated by grdma_init in mpool_grdma_component.c */
    free(mpool);
}
/*
 * register memory
 */
int mca_mpool_grdma_register(mca_mpool_base_module_t *mpool, void *addr,
                              size_t size, uint32_t flags,
                              mca_mpool_base_registration_t **reg)
{
    mca_mpool_grdma_module_t *mpool_grdma = (mca_mpool_grdma_module_t*)mpool;
    mca_mpool_base_registration_t *grdma_reg;
    ompi_free_list_item_t *item;
    unsigned char *base, *bound;
    bool bypass_cache = !!(flags & MCA_MPOOL_FLAGS_CACHE_BYPASS);
    int rc;

    OPAL_THREAD_LOCK(&mpool->rcache->lock);

    /* if cache bypass is requested don't use the cache */
    base = (unsigned char *) down_align_addr(addr, mca_mpool_base_page_size_log);
    bound = (unsigned char *) up_align_addr((void*)((char*) addr + size - 1),
                                            mca_mpool_base_page_size_log);
    if (!opal_list_is_empty (&mpool_grdma->pool->gc_list))
        do_unregistration_gc(mpool);

    /* look through existing regs if not persistent registration requested.
     * Persistent registration are always registered and placed in the cache */
    if(!(flags & MCA_MPOOL_FLAGS_PERSIST) && !bypass_cache) {
        /* check to see if memory is registered */
        mpool->rcache->rcache_find(mpool->rcache, addr, size, reg);
        if(*reg != NULL &&
           (mca_mpool_grdma_component.leave_pinned ||
            ((*reg)->flags & MCA_MPOOL_FLAGS_PERSIST) ||
            ((*reg)->base == base && (*reg)->bound == bound))) {
            if(0 == (*reg)->ref_count &&
               mca_mpool_grdma_component.leave_pinned) {
                opal_list_remove_item(&mpool_grdma->pool->lru_list,
                                      (opal_list_item_t*)(*reg));
            }
            mpool_grdma->stat_cache_hit++;
            (*reg)->ref_count++;
            OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
            return OMPI_SUCCESS;
        }

        mpool_grdma->stat_cache_miss++;
        *reg = NULL; /* in case previous find found something */

        /* If no suitable registration is in cache and leave_pinned isn't
         * set don't use the cache.
         * This is optimisation in case limit is not set. If limit is set we
         * have to put registration into the cache to determine when we hit
         * memory registration limit.
         * NONE: cache is still used for persistent registrations so previous
         * find can find something */
        if(!mca_mpool_grdma_component.leave_pinned) {
            bypass_cache = true;
        }
    }

    OMPI_FREE_LIST_GET(&mpool_grdma->reg_list, item, rc);
    if(OMPI_SUCCESS != rc) {
        OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
        return rc;
    }
    grdma_reg = (mca_mpool_base_registration_t*)item;

    grdma_reg->mpool = mpool;
    grdma_reg->base = base;
    grdma_reg->bound = bound;
    grdma_reg->flags = flags;

    if (false == bypass_cache) {
        rc = mpool->rcache->rcache_insert(mpool->rcache, grdma_reg, 0);

        if (OPAL_UNLIKELY(rc != OMPI_SUCCESS)) {
            OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
            OMPI_FREE_LIST_RETURN(&mpool_grdma->reg_list, item);
            return rc;
        }
    }

    while (OMPI_ERR_OUT_OF_RESOURCE ==
           (rc = mpool_grdma->resources.register_mem(mpool_grdma->resources.reg_data,
                                                     base, bound - base + 1, grdma_reg))) {
        /* try to remove one unused reg and retry */
        if (!mca_mpool_grdma_evict (mpool)) {
            break;
        }
    }

    if (OPAL_UNLIKELY(rc != OMPI_SUCCESS)) {
        if (false == bypass_cache) {
            mpool->rcache->rcache_delete(mpool->rcache, grdma_reg);
        }
        OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
        OMPI_FREE_LIST_RETURN(&mpool_grdma->reg_list, item);
        return rc;
    }

    *reg = grdma_reg;
    (*reg)->ref_count++;
    OPAL_THREAD_UNLOCK(&mpool->rcache->lock);

    /* Cleanup any vmas that we have deferred deletion on */
    mpool->rcache->rcache_clean(mpool->rcache);
    return OMPI_SUCCESS;
}
void mca_mpool_rdma_finalize(struct mca_mpool_base_module_t *mpool)
{
    mca_mpool_rdma_module_t *mpool_rdma = (mca_mpool_rdma_module_t*)mpool;
    mca_mpool_base_registration_t *reg;
    mca_mpool_base_registration_t *regs[RDMA_MPOOL_NREGS];
    int reg_cnt, i;
    int rc;

    /* Statistic */
    if(true == mca_mpool_rdma_component.print_stats) {
        opal_output(0, "%s rdma: stats "
                "(hit/miss/found/not found/evicted): %d/%d/%d/%d/%d\n",
                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                mpool_rdma->stat_cache_hit, mpool_rdma->stat_cache_miss,
                mpool_rdma->stat_cache_found, mpool_rdma->stat_cache_notfound,
                mpool_rdma->stat_evicted);
    }

    OPAL_THREAD_LOCK(&mpool->rcache->lock);
    if(!opal_list_is_empty(&mpool_rdma->gc_list))
        do_unregistration_gc(mpool);
    do {
        reg_cnt = mpool->rcache->rcache_find_all(mpool->rcache, 0, (size_t)-1,
                regs, RDMA_MPOOL_NREGS);

        for(i = 0; i < reg_cnt; i++) {
            reg = regs[i];

            if(reg->ref_count) {
                reg->ref_count = 0; /* otherway dereg will fail on assert */
            } else if (mca_mpool_rdma_component.leave_pinned) {
                opal_list_remove_item(&mpool_rdma->mru_list,
                        (opal_list_item_t*)reg);
            }

	    /* Remove from rcache first */
            mpool->rcache->rcache_delete(mpool->rcache, reg);

	    /* Drop lock before deregistering memory */
	    OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
	    rc = dereg_mem(mpool, reg);
	    OPAL_THREAD_LOCK(&mpool->rcache->lock);

            if(rc != OMPI_SUCCESS) {
		/* Potentially lose track of registrations
		   do we have to put it back? */
                continue;
            }

            OMPI_FREE_LIST_RETURN(&mpool_rdma->reg_list,
                    (ompi_free_list_item_t*)reg);
        }
    } while(reg_cnt == RDMA_MPOOL_NREGS);

    OBJ_DESTRUCT(&mpool_rdma->mru_list);
    OBJ_DESTRUCT(&mpool_rdma->gc_list);
    OBJ_DESTRUCT(&mpool_rdma->reg_list);
    OPAL_THREAD_UNLOCK(&mpool->rcache->lock);

    /* Cleanup any vmas that we have deferred deletion on */
    mpool->rcache->rcache_clean(mpool->rcache);

}
/*
 * register memory
 */
int mca_mpool_rdma_register(mca_mpool_base_module_t *mpool, void *addr,
                              size_t size, uint32_t flags,
                              mca_mpool_base_registration_t **reg)
{
    mca_mpool_rdma_module_t *mpool_rdma = (mca_mpool_rdma_module_t*)mpool;
    mca_mpool_base_registration_t *rdma_reg;
    ompi_free_list_item_t *item;
    unsigned char *base, *bound;
    int rc;

    /* if cache bypass is requested don't use the cache */
    if(flags & MCA_MPOOL_FLAGS_CACHE_BYPASS) {
        return register_cache_bypass(mpool, addr, size, flags, reg);
    }

    base = (unsigned char *) down_align_addr(addr, mca_mpool_base_page_size_log);
    bound = (unsigned char *) up_align_addr((void*)((char*) addr + size - 1),
             mca_mpool_base_page_size_log);
    OPAL_THREAD_LOCK(&mpool->rcache->lock);
    if(!opal_list_is_empty(&mpool_rdma->gc_list))
        do_unregistration_gc(mpool);

    /* look through existing regs if not persistent registration requested.
     * Persistent registration are always registered and placed in the cache */
    if(!(flags & MCA_MPOOL_FLAGS_PERSIST)) {
        /* check to see if memory is registered */
        mpool->rcache->rcache_find(mpool->rcache, addr, size, reg);
        if(*reg != NULL &&
                (mca_mpool_rdma_component.leave_pinned ||
                 ((*reg)->flags & MCA_MPOOL_FLAGS_PERSIST) ||
                 ((*reg)->base == base && (*reg)->bound == bound))) {
            if(0 == (*reg)->ref_count &&
                    mca_mpool_rdma_component.leave_pinned) {
                opal_list_remove_item(&mpool_rdma->mru_list,
                        (opal_list_item_t*)(*reg));
            }
            mpool_rdma->stat_cache_hit++;
            (*reg)->ref_count++;
            OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
            return OMPI_SUCCESS;
        }

        mpool_rdma->stat_cache_miss++;
        *reg = NULL; /* in case previous find found something */

        /* If no suitable registration is in cache and leave_pinned isn't
         * set and size of registration cache is unlimited don't use the cache.
         * This is optimisation in case limit is not set. If limit is set we
         * have to put registration into the cache to determine when we hit
         * memory registration limit.
         * NONE: cache is still used for persistent registrations so previous
         * find can find something */
        if(!mca_mpool_rdma_component.leave_pinned &&
                 mca_mpool_rdma_component.rcache_size_limit == 0) {
            OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
            return register_cache_bypass(mpool, addr, size, flags, reg);
        }
    }

    OMPI_FREE_LIST_GET(&mpool_rdma->reg_list, item, rc);
    if(OMPI_SUCCESS != rc) {
        OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
        return rc;
    }
    rdma_reg = (mca_mpool_base_registration_t*)item;

    rdma_reg->mpool = mpool;
    rdma_reg->base = base;
    rdma_reg->bound = bound;
    rdma_reg->flags = flags;

    while((rc = mpool->rcache->rcache_insert(mpool->rcache, rdma_reg,
             mca_mpool_rdma_component.rcache_size_limit)) ==
            OMPI_ERR_TEMP_OUT_OF_RESOURCE) {
        mca_mpool_base_registration_t *old_reg;
        /* try to remove one unused reg and retry */
        old_reg = (mca_mpool_base_registration_t*)
            opal_list_get_last(&mpool_rdma->mru_list);
        if(opal_list_get_end(&mpool_rdma->mru_list) !=
                (opal_list_item_t*)old_reg) {

	    /* Remove the registration from the cache and list before
	       deregistering the memory */
	    mpool->rcache->rcache_delete(mpool->rcache, old_reg);
	    opal_list_remove_item(&mpool_rdma->mru_list,
				  (opal_list_item_t*)old_reg);

	    /* Drop the rcache lock while we deregister the memory */
	    OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
            rc = dereg_mem(mpool, old_reg);
	    OPAL_THREAD_LOCK(&mpool->rcache->lock);

	    /* This introduces a potential leak of registrations if
	       the deregistration fails to occur as we no longer have
	       a reference to it. Is this possible? */
            if(OMPI_SUCCESS == rc) {
                OMPI_FREE_LIST_RETURN(&mpool_rdma->reg_list,
                        (ompi_free_list_item_t*)old_reg);
                mpool_rdma->stat_evicted++;
            } else
                break;
        } else
            break;
    }

    if(rc != OMPI_SUCCESS) {
        OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
        OMPI_FREE_LIST_RETURN(&mpool_rdma->reg_list, item);
        return rc;
    }

    rc = mpool_rdma->resources.register_mem(mpool_rdma->resources.reg_data,
            base, bound - base + 1, rdma_reg);

    if(rc != OMPI_SUCCESS) {
        mpool->rcache->rcache_delete(mpool->rcache, rdma_reg);
        OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
        OMPI_FREE_LIST_RETURN(&mpool_rdma->reg_list, item);
        return rc;
    }

    *reg = rdma_reg;
    (*reg)->ref_count++;
    OPAL_THREAD_UNLOCK(&mpool->rcache->lock);

    /* Cleanup any vmas that we have deferred deletion on */
    mpool->rcache->rcache_clean(mpool->rcache);
    return OMPI_SUCCESS;
}
Exemple #6
0
/*
 * register memory
 */
int mca_mpool_grdma_register(mca_mpool_base_module_t *mpool, void *addr,
                              size_t size, uint32_t flags,
                              mca_mpool_base_registration_t **reg)
{
    mca_mpool_grdma_module_t *mpool_grdma = (mca_mpool_grdma_module_t*)mpool;
    const bool bypass_cache = !!(flags & MCA_MPOOL_FLAGS_CACHE_BYPASS);
    const bool persist = !!(flags & MCA_MPOOL_FLAGS_PERSIST);
    mca_mpool_base_registration_t *grdma_reg;
    opal_free_list_item_t *item;
    unsigned char *base, *bound;
    int rc;

    OPAL_THREAD_LOCK(&mpool->rcache->lock);

    /* if cache bypass is requested don't use the cache */
    base = (unsigned char *) down_align_addr(addr, mca_mpool_base_page_size_log);
    bound = (unsigned char *) up_align_addr((void*)((char*) addr + size - 1),
                                            mca_mpool_base_page_size_log);
    if (!opal_list_is_empty (&mpool_grdma->pool->gc_list))
        do_unregistration_gc(mpool);

#if OPAL_CUDA_GDR_SUPPORT
    if (flags & MCA_MPOOL_FLAGS_CUDA_GPU_MEM) {
        size_t psize;
        mca_common_cuda_get_address_range(&base, &psize, addr);
        bound = base + psize - 1;
        /* Check to see if this memory is in the cache and if it has been freed. If so,
         * this call will boot it out of the cache. */
        check_for_cuda_freed_memory(mpool, base, psize);
    }
#endif /* OPAL_CUDA_GDR_SUPPORT */

    /* look through existing regs if not persistent registration requested.
     * Persistent registration are always registered and placed in the cache */
    if(!(bypass_cache || persist)) {
        /* check to see if memory is registered */
        mpool->rcache->rcache_find(mpool->rcache, base, bound - base + 1, reg);
        if (*reg && !(flags & MCA_MPOOL_FLAGS_INVALID)) {
            if (0 == (*reg)->ref_count) {
                /* Leave pinned must be set for this to still be in the rcache. */
                opal_list_remove_item(&mpool_grdma->pool->lru_list,
                                      (opal_list_item_t *)(*reg));
            }

            /* This segment fits fully within an existing segment. */
            mpool_grdma->stat_cache_hit++;
            (*reg)->ref_count++;
            OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
            return OPAL_SUCCESS;
        }

        mpool_grdma->stat_cache_miss++;
        *reg = NULL; /* in case previous find found something */

        /* Unless explicitly requested by the caller always store the
         * registration in the rcache. This will speed up the case where
         * no leave pinned protocol is in use but the same segment is in
         * use in multiple simultaneous transactions. We used to set bypass_cache
         * here is !mca_mpool_grdma_component.leave_pinned. */
    }

    item = opal_free_list_get (&mpool_grdma->reg_list);
    if(NULL == item) {
        OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
        return OPAL_ERR_OUT_OF_RESOURCE;
    }
    grdma_reg = (mca_mpool_base_registration_t*)item;

    grdma_reg->mpool = mpool;
    grdma_reg->base = base;
    grdma_reg->bound = bound;
    grdma_reg->flags = flags;
#if OPAL_CUDA_GDR_SUPPORT
    if (flags & MCA_MPOOL_FLAGS_CUDA_GPU_MEM) {
        mca_common_cuda_get_buffer_id(grdma_reg);
    }
#endif /* OPAL_CUDA_GDR_SUPPORT */

    if (false == bypass_cache) {
        rc = mpool->rcache->rcache_insert(mpool->rcache, grdma_reg, 0);

        if (OPAL_UNLIKELY(rc != OPAL_SUCCESS)) {
            OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
            opal_free_list_return (&mpool_grdma->reg_list, item);
            return rc;
        }
    }

    while (OPAL_ERR_OUT_OF_RESOURCE ==
           (rc = mpool_grdma->resources.register_mem(mpool_grdma->resources.reg_data,
                                                     base, bound - base + 1, grdma_reg))) {
        /* try to remove one unused reg and retry */
        if (!mca_mpool_grdma_evict (mpool)) {
            break;
        }
    }

    if (OPAL_UNLIKELY(rc != OPAL_SUCCESS)) {
        if (false == bypass_cache) {
            mpool->rcache->rcache_delete(mpool->rcache, grdma_reg);
        }
        OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
        opal_free_list_return (&mpool_grdma->reg_list, item);
        return rc;
    }

    *reg = grdma_reg;
    (*reg)->ref_count++;
    OPAL_THREAD_UNLOCK(&mpool->rcache->lock);

    /* Cleanup any vmas that we have deferred deletion on */
    mpool->rcache->rcache_clean(mpool->rcache);
    return OPAL_SUCCESS;
}