Beispiel #1
0
/*
 * register memory
 */
int mca_mpool_rdma_register(mca_mpool_base_module_t *mpool, void *addr,
                              size_t size, uint32_t flags,
                              mca_mpool_base_registration_t **reg)
{
    mca_mpool_rdma_module_t *mpool_rdma = (mca_mpool_rdma_module_t*)mpool;
    mca_mpool_base_registration_t *rdma_reg;
    ompi_free_list_item_t *item;
    unsigned char *base, *bound;
    int rc;

    /* if cache bypass is requested don't use the cache */
    if(flags & MCA_MPOOL_FLAGS_CACHE_BYPASS) {
        return register_cache_bypass(mpool, addr, size, flags, reg);
    }

    base = down_align_addr(addr, mca_mpool_base_page_size_log);
    bound = up_align_addr((void*)((char*) addr + size - 1),
             mca_mpool_base_page_size_log);
    OPAL_THREAD_LOCK(&mpool->rcache->lock);
    /* look through existing regs if not persistent registration requested.
     * Persistent registration are always registered and placed in the cache */
    if(!(flags & MCA_MPOOL_FLAGS_PERSIST)) {
        /* check to see if memory is registered */
        mpool->rcache->rcache_find(mpool->rcache, addr, size, reg);
        if(*reg != NULL &&
                (mca_mpool_rdma_component.leave_pinned ||
                 ((*reg)->flags & MCA_MPOOL_FLAGS_PERSIST) ||
                 ((*reg)->base == base && (*reg)->bound == bound))) {
            if(0 == (*reg)->ref_count &&
                    mca_mpool_rdma_component.leave_pinned) {
                opal_list_remove_item(&mpool_rdma->mru_list,
                        (opal_list_item_t*)(*reg));
            }
            mpool_rdma->stat_cache_hit++;
            (*reg)->ref_count++;
            OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
            return MPI_SUCCESS;
        }

        mpool_rdma->stat_cache_miss++;
        *reg = NULL; /* in case previous find found something */

        /* If no suitable registration is in cache and leave_pinned isn't
         * set and size of registration cache is unlimited don't use the cache.
         * This is optimisation in case limit is not set. If limit is set we
         * have to put registration into the cache to determine when we hit
         * memory registration limit.
         * NONE: cache is still used for persistent registrations so previous
         * find can find something */
        if(!mca_mpool_rdma_component.leave_pinned &&
                 mca_mpool_rdma_component.rcache_size_limit == 0) {
            OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
            return register_cache_bypass(mpool, addr, size, flags, reg);
        }
    }

    OMPI_FREE_LIST_GET(&mpool_rdma->reg_list, item, rc);
    if(OMPI_SUCCESS != rc) {
        OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
        return rc;
    }
    rdma_reg = (mca_mpool_base_registration_t*)item;

    rdma_reg->mpool = mpool;
    rdma_reg->base = base;
    rdma_reg->bound = bound;
    rdma_reg->flags = flags;

    while((rc = mpool->rcache->rcache_insert(mpool->rcache, rdma_reg,
             mca_mpool_rdma_component.rcache_size_limit)) ==
            OMPI_ERR_TEMP_OUT_OF_RESOURCE) {
        mca_mpool_base_registration_t *old_reg;
        /* try to remove one unused reg and retry */
        old_reg = (mca_mpool_base_registration_t*)
            opal_list_get_last(&mpool_rdma->mru_list);
        if(opal_list_get_end(&mpool_rdma->mru_list) !=
                (opal_list_item_t*)old_reg) {
            rc = dereg_mem(mpool, old_reg);
            if(MPI_SUCCESS == rc) {
                mpool->rcache->rcache_delete(mpool->rcache, old_reg);
                opal_list_remove_item(&mpool_rdma->mru_list,
                        (opal_list_item_t*)old_reg);
                OMPI_FREE_LIST_RETURN(&mpool_rdma->reg_list,
                        (ompi_free_list_item_t*)old_reg);
                mpool_rdma->stat_evicted++;
            } else
                break;
        } else
            break;
    }

    if(rc != OMPI_SUCCESS) {
        OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
        OMPI_FREE_LIST_RETURN(&mpool_rdma->reg_list, item);
        return rc;
    }

    rc = mpool_rdma->resources.register_mem(mpool_rdma->resources.reg_data,
            base, bound - base + 1, rdma_reg);

    if(rc != OMPI_SUCCESS) {
        mpool->rcache->rcache_delete(mpool->rcache, rdma_reg);
        OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
        OMPI_FREE_LIST_RETURN(&mpool_rdma->reg_list, item);
        return rc;
    }

    *reg = rdma_reg;
    (*reg)->ref_count++;
    OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
    return OMPI_SUCCESS;
}
/*
 * register memory
 */
int mca_mpool_grdma_register(mca_mpool_base_module_t *mpool, void *addr,
                              size_t size, uint32_t flags,
                              mca_mpool_base_registration_t **reg)
{
    mca_mpool_grdma_module_t *mpool_grdma = (mca_mpool_grdma_module_t*)mpool;
    mca_mpool_base_registration_t *grdma_reg;
    ompi_free_list_item_t *item;
    unsigned char *base, *bound;
    bool bypass_cache = !!(flags & MCA_MPOOL_FLAGS_CACHE_BYPASS);
    int rc;

    OPAL_THREAD_LOCK(&mpool->rcache->lock);

    /* if cache bypass is requested don't use the cache */
    base = (unsigned char *) down_align_addr(addr, mca_mpool_base_page_size_log);
    bound = (unsigned char *) up_align_addr((void*)((char*) addr + size - 1),
                                            mca_mpool_base_page_size_log);
    if (!opal_list_is_empty (&mpool_grdma->pool->gc_list))
        do_unregistration_gc(mpool);

    /* look through existing regs if not persistent registration requested.
     * Persistent registration are always registered and placed in the cache */
    if(!(flags & MCA_MPOOL_FLAGS_PERSIST) && !bypass_cache) {
        /* check to see if memory is registered */
        mpool->rcache->rcache_find(mpool->rcache, addr, size, reg);
        if(*reg != NULL &&
           (mca_mpool_grdma_component.leave_pinned ||
            ((*reg)->flags & MCA_MPOOL_FLAGS_PERSIST) ||
            ((*reg)->base == base && (*reg)->bound == bound))) {
            if(0 == (*reg)->ref_count &&
               mca_mpool_grdma_component.leave_pinned) {
                opal_list_remove_item(&mpool_grdma->pool->lru_list,
                                      (opal_list_item_t*)(*reg));
            }
            mpool_grdma->stat_cache_hit++;
            (*reg)->ref_count++;
            OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
            return OMPI_SUCCESS;
        }

        mpool_grdma->stat_cache_miss++;
        *reg = NULL; /* in case previous find found something */

        /* If no suitable registration is in cache and leave_pinned isn't
         * set don't use the cache.
         * This is optimisation in case limit is not set. If limit is set we
         * have to put registration into the cache to determine when we hit
         * memory registration limit.
         * NONE: cache is still used for persistent registrations so previous
         * find can find something */
        if(!mca_mpool_grdma_component.leave_pinned) {
            bypass_cache = true;
        }
    }

    OMPI_FREE_LIST_GET(&mpool_grdma->reg_list, item, rc);
    if(OMPI_SUCCESS != rc) {
        OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
        return rc;
    }
    grdma_reg = (mca_mpool_base_registration_t*)item;

    grdma_reg->mpool = mpool;
    grdma_reg->base = base;
    grdma_reg->bound = bound;
    grdma_reg->flags = flags;

    if (false == bypass_cache) {
        rc = mpool->rcache->rcache_insert(mpool->rcache, grdma_reg, 0);

        if (OPAL_UNLIKELY(rc != OMPI_SUCCESS)) {
            OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
            OMPI_FREE_LIST_RETURN(&mpool_grdma->reg_list, item);
            return rc;
        }
    }

    while (OMPI_ERR_OUT_OF_RESOURCE ==
           (rc = mpool_grdma->resources.register_mem(mpool_grdma->resources.reg_data,
                                                     base, bound - base + 1, grdma_reg))) {
        /* try to remove one unused reg and retry */
        if (!mca_mpool_grdma_evict (mpool)) {
            break;
        }
    }

    if (OPAL_UNLIKELY(rc != OMPI_SUCCESS)) {
        if (false == bypass_cache) {
            mpool->rcache->rcache_delete(mpool->rcache, grdma_reg);
        }
        OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
        OMPI_FREE_LIST_RETURN(&mpool_grdma->reg_list, item);
        return rc;
    }

    *reg = grdma_reg;
    (*reg)->ref_count++;
    OPAL_THREAD_UNLOCK(&mpool->rcache->lock);

    /* Cleanup any vmas that we have deferred deletion on */
    mpool->rcache->rcache_clean(mpool->rcache);
    return OMPI_SUCCESS;
}
/*
 * Return a module-specific IO MPI_Request
 */
int mca_io_base_request_alloc(ompi_file_t *file, 
                              mca_io_base_request_t **req)
{
    int err;
    mca_io_base_module_request_once_init_fn_t func;
    ompi_free_list_item_t *item;

    /* See if we've got a request on the module's freelist (which is
       cached on the file, since there's only one module per
       MPI_File).  Use a quick-but-not-entirely-accurate (but good
       enough) check as a slight optimization to potentially having to
       avoid locking and unlocking. */

    if (opal_list_get_size(&file->f_io_requests) > 0) {
        OPAL_THREAD_LOCK(&file->f_io_requests_lock);
        if (opal_list_get_size(&file->f_io_requests) > 0) {
            *req = (mca_io_base_request_t*) 
                opal_list_remove_first(&file->f_io_requests);
            (*req)->free_called = false;
        } else {
            *req = NULL;
        }
        OPAL_THREAD_UNLOCK(&file->f_io_requests_lock);
    } else {
        *req = NULL;
    }
        
    /* Nope, we didn't have one on the file freelist, so let's get one
       off the global freelist */

    if (NULL == *req) {
        OMPI_FREE_LIST_GET(&mca_io_base_requests, item, err);
        *req = (mca_io_base_request_t*) item;

        /* Call the per-use init function, if it exists */

        switch (file->f_io_version) {
        case MCA_IO_BASE_V_2_0_0:

            /* These can be set once for this request since this
               request will always be used with the same module (and
               therefore, the same MPI_File).  Note that
               (*req)->req_ompi.rq_type is already set by the
               constructor. */

            (*req)->req_file = file;
            (*req)->req_ver = file->f_io_version;
            (*req)->free_called = false;
            (*req)->super.req_free =
                file->f_io_selected_module.v2_0_0.io_module_request_free;
            (*req)->super.req_cancel =
                file->f_io_selected_module.v2_0_0.io_module_request_cancel;

            /* Call the module's once-per process init, if it
               exists */

            func = 
                file->f_io_selected_module.v2_0_0.io_module_request_once_init;
            if (NULL != func) {
                if (OMPI_SUCCESS != 
                    (err = func(&file->f_io_selected_module, *req))) {
                    OMPI_FREE_LIST_RETURN(&mca_io_base_requests, item);
                    return err;
                }
            }

            break;
            
        default:
            OMPI_FREE_LIST_RETURN(&mca_io_base_requests, item);
            return OMPI_ERR_NOT_IMPLEMENTED;
            break;
        }
    }

    /* Initialize the request */

    OMPI_REQUEST_INIT(&((*req)->super), false);
    (*req)->super.req_mpi_object.file = file;

    /*
     * Copied from ompi/mca/pml/base/pml_base_recvreq.h:
     * always set the req_status.MPI_TAG to ANY_TAG before starting the
     * request. This field is used if cancelled to find out if the request
     * has been matched or not.
     */
    (*req)->super.req_status.MPI_TAG = MPI_ANY_TAG;
    (*req)->super.req_status.MPI_ERROR = OMPI_SUCCESS;
    (*req)->super.req_status._count = 0;
    (*req)->super.req_status._cancelled = 0;

    /* All done */

    return OMPI_SUCCESS;
}