Example #1
/* Destroy the tree */
int opal_rb_tree_destroy(opal_rb_tree_t *tree)
{
    ompi_free_list_item_t * item;
    /* Recursive inorder traversal for delete    */

    inorder_destroy(tree, tree->root_ptr);
    /* Now free the root -- root does not get free'd in the above
     * inorder destroy    */
    item = (ompi_free_list_item_t *) tree->root_ptr;
    OMPI_FREE_LIST_RETURN_MT(&(tree->free_list), item);

    /* free the tree->nill node */
    item = (ompi_free_list_item_t *) tree->nill;
    OMPI_FREE_LIST_RETURN_MT(&(tree->free_list), item);
    return OPAL_SUCCESS;
}
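
Every example on this page follows the same pairing: an item checked out of an ompi_free_list_t with OMPI_FREE_LIST_GET_MT (or OMPI_FREE_LIST_WAIT_MT) is eventually handed back with OMPI_FREE_LIST_RETURN_MT. A minimal round-trip sketch, assuming only the two-argument macro forms used throughout these snippets (the helper name is illustrative):

/* Hypothetical helper: check an item out of a free list and return it. */
static int free_list_roundtrip(ompi_free_list_t *fl)
{
    ompi_free_list_item_t *item;

    OMPI_FREE_LIST_GET_MT(fl, item);    /* non-blocking; item is NULL on failure */
    if (NULL == item) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    /* ... cast item to the payload type and use it ... */

    OMPI_FREE_LIST_RETURN_MT(fl, item); /* hand the item back to the pool */
    return OMPI_SUCCESS;
}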
/* udreg callback functions */
static void *mca_mpool_udreg_reg_func (void *addr, uint64_t len, void *reg_context)
{
    mca_mpool_udreg_module_t *mpool_udreg = (mca_mpool_udreg_module_t *) reg_context;
    mca_mpool_base_registration_t *udreg_reg;
    ompi_free_list_item_t *item;
    int rc;

    OMPI_FREE_LIST_GET_MT(&mpool_udreg->reg_list, item);
    if (NULL == item) {
        return NULL;
    }
    udreg_reg = (mca_mpool_base_registration_t *) item;

    udreg_reg->mpool = reg_context;
    udreg_reg->base  = addr;
    udreg_reg->bound = (void *)((uintptr_t) addr + len);

    rc = mpool_udreg->resources.register_mem(mpool_udreg->resources.reg_data,
                                             addr, len, udreg_reg);
    if (OMPI_SUCCESS != rc) {
        OMPI_FREE_LIST_RETURN_MT(&mpool_udreg->reg_list, item);
        udreg_reg = NULL;
    }

    return udreg_reg;
}
Example #3
static inline bool mca_mpool_rgpusm_deregister_lru (mca_mpool_base_module_t *mpool) {
    mca_mpool_rgpusm_module_t *mpool_rgpusm = (mca_mpool_rgpusm_module_t *) mpool;
    mca_mpool_base_registration_t *old_reg;
    int rc;

    /* Remove the registration from the cache and list before
       deregistering the memory */
    old_reg = (mca_mpool_base_registration_t*)
        opal_list_remove_first (&mpool_rgpusm->lru_list);
    if (NULL == old_reg) {
        return false;
    }

    mpool->rcache->rcache_delete(mpool->rcache, old_reg);

    /* Drop the rcache lock while we deregister the memory */
    OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
    assert(old_reg->ref_count == 0);
    rc = mpool_rgpusm->resources.deregister_mem(mpool_rgpusm->resources.reg_data,
                                                old_reg);
    OPAL_THREAD_LOCK(&mpool->rcache->lock);

    /* This introduces a potential leak of registrations if
       the deregistration fails to occur as we no longer have
       a reference to it. Is this possible? */
    if (OMPI_SUCCESS != rc) {
        return false;
    }

    OMPI_FREE_LIST_RETURN_MT(&mpool_rgpusm->reg_list,
                          (ompi_free_list_item_t*)old_reg);
    mpool_rgpusm->stat_evicted++;

    return true;
}
/*
 * Return the registration to the free list.
 */
int mca_mpool_gpusm_deregister(struct mca_mpool_base_module_t *mpool,
                               mca_mpool_base_registration_t *reg)
{
    int rc;
    mca_mpool_gpusm_module_t *mpool_gpusm = (mca_mpool_gpusm_module_t *)mpool;

    rc = mpool_gpusm->resources.deregister_mem(mpool, reg);
    OMPI_FREE_LIST_RETURN_MT(&mpool_gpusm->reg_list, (ompi_free_list_item_t*)reg);
    /* rc is ignored; per the register comment below, deregistration is
     * effectively a no-op here, so always report success */
    (void) rc;
    return OMPI_SUCCESS;
}
Example #5
int mca_mpool_rgpusm_deregister(struct mca_mpool_base_module_t *mpool,
                            mca_mpool_base_registration_t *reg)
{
    mca_mpool_rgpusm_module_t *mpool_rgpusm = (mca_mpool_rgpusm_module_t*)mpool;
    int rc = OMPI_SUCCESS;
    assert(reg->ref_count > 0);

    OPAL_THREAD_LOCK(&mpool->rcache->lock);
    reg->ref_count--;
    opal_output(-1, "Deregister: reg->ref_count=%d", (int)reg->ref_count);
    if(reg->ref_count > 0) {
        OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
        return OMPI_SUCCESS;
    }
    if(mca_mpool_rgpusm_component.leave_pinned && registration_is_cachebale(reg))
    {
        /* if leave_pinned is set don't deregister memory, but put it
         * on LRU list for future use */
        opal_list_prepend(&mpool_rgpusm->lru_list, (opal_list_item_t*)reg);
    } else {
        /* Remove from rcache first */
        if(!(reg->flags & MCA_MPOOL_FLAGS_CACHE_BYPASS))
            mpool->rcache->rcache_delete(mpool->rcache, reg);

        /* Drop the rcache lock before deregistering the memory */
        OPAL_THREAD_UNLOCK(&mpool->rcache->lock);

        {
             mca_mpool_rgpusm_module_t *mpool_rgpusm = (mca_mpool_rgpusm_module_t *)mpool;

             assert(reg->ref_count == 0);
             rc = mpool_rgpusm->resources.deregister_mem(mpool_rgpusm->resources.reg_data,
                                                         reg);
         }

        OPAL_THREAD_LOCK(&mpool->rcache->lock);

        if(OMPI_SUCCESS == rc) {
            OMPI_FREE_LIST_RETURN_MT(&mpool_rgpusm->reg_list,
                                  (ompi_free_list_item_t*)reg);
        }
    }
    OPAL_THREAD_UNLOCK(&mpool->rcache->lock);

    /* Cleanup any vmas that we have deferred deletion on */
    mpool->rcache->rcache_clean(mpool->rcache);

    return rc;
}
Example #6
static void
inorder_destroy(opal_rb_tree_t *tree, opal_rb_tree_node_t * node)
{
    ompi_free_list_item_t * item;

    if (node == tree->nill) {
        return;
    }

    inorder_destroy(tree, node->left);

    if (node->left != tree->nill) {
        item = (ompi_free_list_item_t *) node->left;
        --tree->tree_size;
        OMPI_FREE_LIST_RETURN_MT(&(tree->free_list), item);
    }

    inorder_destroy(tree, node->right);
    if (node->right != tree->nill) {
        item = (ompi_free_list_item_t *) node->right;
        --tree->tree_size;
        OMPI_FREE_LIST_RETURN_MT(&(tree->free_list), item);
    }
}
Example #7
/* Delete a node from the tree based on the key */
int opal_rb_tree_delete(opal_rb_tree_t *tree, void *key)
{
    opal_rb_tree_node_t * p;
    opal_rb_tree_node_t * todelete;
    opal_rb_tree_node_t * y;
    ompi_free_list_item_t * item;

    p = opal_rb_tree_find_node(tree, key);
    if (NULL == p) {
        return OPAL_ERR_NOT_FOUND;
    }
    if ((p->left == tree->nill) || (p->right == tree->nill)) {
        todelete = p;
    } else {
        todelete = btree_successor(tree, p);
    }

    if (todelete->left == tree->nill) {
        y = todelete->right;
    } else {
        y = todelete->left;
    }

    y->parent = todelete->parent;

    if (y->parent == tree->root_ptr) {
        tree->root_ptr->left = y;
    } else {
        if (todelete == todelete->parent->left) {
         todelete->parent->left = y;
        } else {
            todelete->parent->right = y;
        }
    }

    if (todelete != p) {
        p->key = todelete->key;
        p->value = todelete->value;
    }

    if (todelete->color == BLACK) {
        btree_delete_fixup(tree, y);
    }
    item = (ompi_free_list_item_t *) todelete;
    OMPI_FREE_LIST_RETURN_MT(&(tree->free_list), item);
    --tree->tree_size;
    return OPAL_SUCCESS;
}
static uint32_t mca_mpool_udreg_dereg_func (void *device_data, void *dreg_context)
{
    mca_mpool_udreg_module_t *mpool_udreg = (mca_mpool_udreg_module_t *) dreg_context;
    mca_mpool_base_registration_t *udreg_reg = (mca_mpool_base_registration_t *) device_data;
    int rc;

    rc = mpool_udreg->resources.deregister_mem(mpool_udreg->resources.reg_data, udreg_reg);

    if (OPAL_LIKELY(OMPI_SUCCESS == rc)) {
        OMPI_FREE_LIST_RETURN_MT(&mpool_udreg->reg_list,
                              (ompi_free_list_item_t *) udreg_reg);
    }
    /* might be worth printing out a warning if an error occurs here */

    return 0;
}
Example #9
/*
 * The free call marks the final stage in a request life-cycle. From this
 * point on the request is completed at both the SPML and user level, and can
 * be reused for other p2p communications. Therefore, in the case of the YODA
 * SPML it should be added to the free request list.
 */
static int mca_spml_yoda_put_request_free(struct oshmem_request_t** request)
{
    mca_spml_yoda_put_request_t* putreq =
            *(mca_spml_yoda_put_request_t**) request;

    assert( false == putreq->req_put.req_base.req_free_called);

    OPAL_THREAD_LOCK(&oshmem_request_lock);
    putreq->req_put.req_base.req_free_called = true;
    OMPI_FREE_LIST_RETURN_MT( &mca_spml_base_put_requests,
                          (ompi_free_list_item_t*)putreq);
    OPAL_THREAD_UNLOCK(&oshmem_request_lock);

    *request = SHMEM_REQUEST_NULL;
    return OSHMEM_SUCCESS;
}
/*
 * This is the one function that does all the work.  It will call into
 * the register function to get the memory handle for the sending
 * buffer.  There is no need to deregister the memory handle so the
 * deregister function is a no-op.
 */
int mca_mpool_gpusm_register(mca_mpool_base_module_t *mpool, void *addr,
                             size_t size, uint32_t flags,
                             mca_mpool_base_registration_t **reg)
{
    mca_mpool_gpusm_module_t *mpool_gpusm = (mca_mpool_gpusm_module_t*)mpool;
    mca_mpool_base_registration_t *gpusm_reg;
    ompi_free_list_item_t *item;
    unsigned char *base, *bound;
    int rc;

    /* In spite of the fact that we return an error code, the existing code
     * checks the registration for a NULL value rather than looking at
     * the return code.  So, initialize the registration to NULL in
     * case we run into a failure. */
    *reg = NULL;

    base = addr;
    bound = (unsigned char *)addr + size - 1;

    OMPI_FREE_LIST_GET_MT(&mpool_gpusm->reg_list, item);
    if(NULL == item) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }
    gpusm_reg = (mca_mpool_base_registration_t*)item;

    gpusm_reg->mpool = mpool;
    gpusm_reg->base = base;
    gpusm_reg->bound = bound;
    gpusm_reg->flags = flags;

    rc = mpool_gpusm->resources.register_mem(base, size, gpusm_reg, NULL);

    if(rc != OMPI_SUCCESS) {
        OMPI_FREE_LIST_RETURN_MT(&mpool_gpusm->reg_list, item);
        return rc;
    }

    *reg = gpusm_reg;
    (*reg)->ref_count++;
    return OMPI_SUCCESS;

}
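
As the comment in this example notes, callers are expected to test the output registration for NULL rather than inspect the return code. A hedged caller-side sketch (mpool, buf, and len are placeholders, not names from the snippets above):

mca_mpool_base_registration_t *reg = NULL;

(void) mca_mpool_gpusm_register(mpool, buf, len, 0, &reg);
if (NULL == reg) {
    /* registration failed; report the error or fall back */
}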
Example #11
/* Create the tree */
int opal_rb_tree_init(opal_rb_tree_t * tree,
                      opal_rb_tree_comp_fn_t comp)
{
    ompi_free_list_item_t * node;
    /* we need to get memory for the root pointer from the free list */
    OMPI_FREE_LIST_GET_MT(&(tree->free_list), node);
    tree->root_ptr = (opal_rb_tree_node_t *) node;
    if (NULL == node) {
        return OPAL_ERR_OUT_OF_RESOURCE;
    }

    OMPI_FREE_LIST_GET_MT(&(tree->free_list), node);
    if (NULL == node) {
        OMPI_FREE_LIST_RETURN_MT(&(tree->free_list), (ompi_free_list_item_t*)tree->root_ptr);
        return OPAL_ERR_OUT_OF_RESOURCE;
    }
    tree->nill = (opal_rb_tree_node_t *) node;
    /* initialize tree->nill */
    tree->nill->color = BLACK;
    tree->nill->left = tree->nill;
    tree->nill->right = tree->nill;
    tree->nill->parent = tree->nill;

    /* initialize the 'root' pointer */
    tree->root_ptr->left = tree->nill;
    tree->root_ptr->right = tree->nill;
    tree->root_ptr->parent = tree->nill;
    tree->root_ptr->color = BLACK;

    tree->comp = comp;

    /* set the tree size to zero */
    tree->tree_size = 0;

    return OPAL_SUCCESS;
}
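
Together with the destroy and delete examples above, this init routine completes the tree's life-cycle. A sketch of how the calls compose, assuming opal_rb_tree_insert mirrors the ompi_rb_tree test in the last example on this page:

/* Illustrative comparison function: strcmp-style over int keys. */
static int int_comp(void *a, void *b)
{
    return (*(int *)a) - (*(int *)b);
}

static void rb_tree_lifecycle(opal_rb_tree_t *tree, int *key, void *value)
{
    opal_rb_tree_init(tree, int_comp);      /* pulls root and nill off the free list */
    opal_rb_tree_insert(tree, key, value);  /* assumed API, as in Example #22 */
    opal_rb_tree_delete(tree, key);         /* returns the node to the free list */
    opal_rb_tree_destroy(tree);             /* inorder_destroy, then root and nill */
}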
Example #12
/*
 * put an item back into the free list
 */
void mca_mpool_base_tree_item_put(mca_mpool_base_tree_item_t* item) { 
    OMPI_FREE_LIST_RETURN_MT(&mca_mpool_base_tree_item_free_list,
                          &(item->super));
}
static int bcol_ptpcoll_barrier_recurs_knomial_new(
                bcol_function_args_t *input_args,
                struct coll_ml_function_t *const_args)
{
    /* local variable */
    uint64_t sequence_number;
    mca_bcol_ptpcoll_module_t *ptpcoll_module =
                        (mca_bcol_ptpcoll_module_t *) const_args->bcol_module;

    netpatterns_k_exchange_node_t *my_exchange_node =
                                       &ptpcoll_module->knomial_exchange_tree;

    int rc, k, pair_comm_rank, exchange, completed,
        tree_order = my_exchange_node->tree_order, tag,
        n_extra_sources = my_exchange_node->n_extra_sources,
        n_exchange = my_exchange_node->n_exchanges, num_reqs;

    ompi_communicator_t *comm =
            ptpcoll_module->super.sbgp_partner_module->group_comm;

    int *extra_sources_array = NULL,
        **rank_exchanges = my_exchange_node->rank_exchanges;

    ompi_request_t **requests;
    ompi_free_list_item_t *item;

    mca_bcol_ptpcoll_collreq_t *collreq;

    OMPI_FREE_LIST_WAIT_MT(&ptpcoll_module->collreqs_free, item);
    if (OPAL_UNLIKELY(NULL == item)) {
        PTPCOLL_ERROR(("Free list waiting failed."));
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    collreq = (mca_bcol_ptpcoll_collreq_t *) item;
    input_args->bcol_opaque_data = (void *) collreq;

    requests = collreq->requests;

    /* TAG Calculation */
    sequence_number = input_args->sequence_num;

    /* Keep tag within the limit supported by the pml */
    tag = (PTPCOLL_TAG_OFFSET + sequence_number * PTPCOLL_TAG_FACTOR) & (ptpcoll_module->tag_mask);

    /* Mark this as a collective tag, to avoid conflict with user-level tags */
    tag = -tag;

    if (0 < n_extra_sources) { /* EXCHANGE_NODE case */
        collreq->need_toserv_extra = 1;
        extra_sources_array = my_exchange_node->rank_extra_sources_array;

        /* I will participate in the exchange (of the algorithm) -
         * wait for signal from extra process */
        for (k = 0; k < n_extra_sources; ++k) {
            pair_comm_rank =
                    ptpcoll_module->super.sbgp_partner_module->group_list[extra_sources_array[k]];

            rc = MCA_PML_CALL(irecv(
                        NULL, 0, MPI_INT,
                        pair_comm_rank, tag,
                        comm, &(requests[k])));
            if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
                PTPCOLL_ERROR(("IRecv failed."));
                return rc;
            }
        }

        num_reqs = n_extra_sources;

        /* Test for completion */
        completed =
            mca_bcol_ptpcoll_test_all_for_match(&num_reqs, requests, &rc);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
            PTPCOLL_ERROR(("Test for all failed."));
            return rc;
        }

        if (!completed) {
            collreq->tag = tag;
            collreq->num_reqs = num_reqs;
            collreq->exchange = 0;

            return BCOL_FN_STARTED;
        }
    } else {
        collreq->need_toserv_extra = 0;
    }

    /* loop over exchange send/recv pairs */
    for (exchange = 0; exchange < n_exchange; ++exchange) {
        for (k = 0; k < tree_order - 1; ++k) {
            /* rank of exchange partner within the group */
            pair_comm_rank =
                ptpcoll_module->super.sbgp_partner_module->group_list[rank_exchanges[exchange][k]];

            assert(2 * ptpcoll_module->k_nomial_radix > (k * 2 + 1));

            /* send to partner - we will wait for completion, as send
             *   completion is at the MPI level, and will not
             *   incur network level completion costs
             */
            rc = MCA_PML_CALL(isend(
                        NULL, 0, MPI_INT,
                        pair_comm_rank, tag,
                        MCA_PML_BASE_SEND_STANDARD,
                        comm, &(requests[k * 2 + 1])));
            if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
                PTPCOLL_ERROR(("ISend failed."));
                return rc;
            }

            PTPCOLL_VERBOSE(10, ("Ex %d, K %d send to %d[%d]", exchange, k,
                                  pair_comm_rank, rank_exchanges[exchange][k]));

            /* receive from partner */
            rc = MCA_PML_CALL(irecv(
                        NULL, 0, MPI_INT,
                        pair_comm_rank, tag,
                        comm, &(requests[k * 2])));
            if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
                PTPCOLL_ERROR(("IRecv failed."));
                return rc;
            }

            PTPCOLL_VERBOSE(10, ("Ex %d, K %d irecv from %d[%d]", exchange, k,
                                  pair_comm_rank, rank_exchanges[exchange][k]));
        }

        num_reqs = 2 * (tree_order - 1);

        /* Test for completion */
        completed =
            mca_bcol_ptpcoll_test_all_for_match(&num_reqs, requests, &rc);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
            PTPCOLL_ERROR(("Test for all failed."));
            return rc;
        }

        if (!completed) {
            collreq->tag = tag;
            collreq->num_reqs = num_reqs;
            collreq->exchange = exchange + 1;

            return BCOL_FN_STARTED;
        }
    }

    /* If non power of 2, may need to send message to "extra" proc */
    if (0 < n_extra_sources)  {  /* EXCHANGE_NODE case */
        for (k = 0; k < n_extra_sources; ++k) {
            pair_comm_rank =
                ptpcoll_module->super.sbgp_partner_module->group_list[extra_sources_array[k]];

            rc = MCA_PML_CALL(isend(
                        NULL, 0, MPI_INT,
                        pair_comm_rank, tag,
                        MCA_PML_BASE_SEND_STANDARD,
                        comm, &(requests[k])));
            if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
                PTPCOLL_ERROR(("ISend failed."));
                return rc;
            }
        }

        num_reqs = n_extra_sources;

        /* Test for completion */
        completed =
            mca_bcol_ptpcoll_test_all_for_match(&num_reqs, requests, &rc);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
            PTPCOLL_ERROR(("Test for all failed."));
            return rc;
        }

        if (!completed) {
            collreq->tag = tag;
            collreq->num_reqs = num_reqs;

            collreq->exchange = n_exchange;
            collreq->need_toserv_extra = 0;

            return BCOL_FN_STARTED;
        }
    }

    OMPI_FREE_LIST_RETURN_MT(&ptpcoll_module->collreqs_free, (ompi_free_list_item_t *) collreq);
    return BCOL_FN_COMPLETE;
}
static int bcol_ptpcoll_barrier_recurs_knomial_extra_new(
                                bcol_function_args_t *input_args,
                                struct coll_ml_function_t *const_args)
{
    /* local variable */
    uint64_t sequence_number;
    int rc, tag, pair_comm_rank,
        completed, num_reqs = 2;

    mca_bcol_ptpcoll_module_t *ptpcoll_module =
                    (mca_bcol_ptpcoll_module_t *) const_args->bcol_module;

    netpatterns_k_exchange_node_t *my_exchange_node =
                                   &ptpcoll_module->knomial_exchange_tree;

    ompi_communicator_t *comm =
                    ptpcoll_module->super.sbgp_partner_module->group_comm;

    int *extra_sources_array = my_exchange_node->rank_extra_sources_array;

    ompi_request_t **requests;
    ompi_free_list_item_t *item;

    mca_bcol_ptpcoll_collreq_t *collreq;

    OMPI_FREE_LIST_WAIT_MT(&ptpcoll_module->collreqs_free, item);
    if (OPAL_UNLIKELY(NULL == item)) {
        PTPCOLL_ERROR(("Free list waiting failed."));
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    collreq = (mca_bcol_ptpcoll_collreq_t *) item;
    input_args->bcol_opaque_data = (void *) collreq;

    requests = collreq->requests;

    /* TAG Calculation */
    sequence_number = input_args->sequence_num;

    /* Keep tag within the limit supported by the pml */
    tag = (PTPCOLL_TAG_OFFSET + sequence_number * PTPCOLL_TAG_FACTOR) & (ptpcoll_module->tag_mask);

    /* Mark this as a collective tag, to avoid conflict with user-level tags */
    tag = -tag;

    pair_comm_rank =
            ptpcoll_module->super.sbgp_partner_module->group_list[extra_sources_array[0]];

    rc = MCA_PML_CALL(isend(
                NULL, 0, MPI_INT,
                pair_comm_rank, tag,
                MCA_PML_BASE_SEND_STANDARD,
                comm, &(requests[0])));
    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
        PTPCOLL_ERROR(("ISend failed."));
        return rc;
    }

    rc = MCA_PML_CALL(irecv(
                NULL, 0, MPI_INT,
                pair_comm_rank, tag,
                comm, &(requests[1])));
    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
        PTPCOLL_ERROR(("IRecv failed."));
        return rc;
    }

    /* Test for completion */
    completed =
        mca_bcol_ptpcoll_test_all_for_match(&num_reqs, requests, &rc);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
        PTPCOLL_ERROR(("Test for all failed."));
        return rc;
    }

    if (!completed) {
        return BCOL_FN_STARTED;
    }

    OMPI_FREE_LIST_RETURN_MT(&ptpcoll_module->collreqs_free, (ompi_free_list_item_t *) collreq);
    return BCOL_FN_COMPLETE;
}
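
Each barrier variant derives a negative, collective-only tag from the collective's sequence number. A standalone check of that arithmetic; the offset, factor, and mask values below are invented for illustration, not the real ptpcoll constants:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    const int PTPCOLL_TAG_OFFSET = 100;  /* illustrative value */
    const int PTPCOLL_TAG_FACTOR = 2;    /* illustrative value */
    const int tag_mask = 0xffff;         /* stands in for ptpcoll_module->tag_mask */
    uint64_t sequence_number = 42;

    /* keep the tag within the range the pml supports ... */
    int tag = (int)((PTPCOLL_TAG_OFFSET + sequence_number * PTPCOLL_TAG_FACTOR) & tag_mask);
    /* ... and negate it so it cannot collide with non-negative user-level tags */
    tag = -tag;

    printf("tag = %d\n", tag);           /* prints: tag = -184 */
    return 0;
}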
Example #15
static void coll_handle_free(void *handle){
    ompi_request_t *ompi_req = (ompi_request_t *)handle;
    OMPI_FREE_LIST_RETURN_MT(&mca_coll_hcoll_component.requests,
                          (ompi_free_list_item_t *)ompi_req);
}
static int bcol_ptpcoll_barrier_recurs_dbl_new(
                                bcol_function_args_t *input_args,
                                struct coll_ml_function_t *const_args)
{
   /* local variable */
    uint64_t sequence_number;
    mca_bcol_ptpcoll_module_t *ptp_module =
                         (mca_bcol_ptpcoll_module_t *) const_args->bcol_module;

    ompi_communicator_t *comm = ptp_module->super.sbgp_partner_module->group_comm;

    int rc, my_extra_partner_comm_rank = 0, exchange, completed,
        pair_comm_rank, pair_rank, delta, tag, num_reqs = 0,
        my_rank = ptp_module->super.sbgp_partner_module->my_index,
        n_exchange = ptp_module->super.sbgp_partner_module->n_levels_pow2;

    ompi_request_t **requests;
    ompi_free_list_item_t *item;

    mca_bcol_ptpcoll_collreq_t *collreq;

    OMPI_FREE_LIST_WAIT_MT(&ptp_module->collreqs_free, item);
    if (OPAL_UNLIKELY(NULL == item)) {
        PTPCOLL_ERROR(("Free list waiting failed."));
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    collreq = (mca_bcol_ptpcoll_collreq_t *) item;
    input_args->bcol_opaque_data = (void *) collreq;

    assert(PTPCOLL_EXTRA != ptp_module->pow_2type);

    requests = collreq->requests;

    /* TAG Calculation */
    sequence_number = input_args->sequence_num;

    /* keep tag within the limit supported by the pml */
    tag = (PTPCOLL_TAG_OFFSET + sequence_number * PTPCOLL_TAG_FACTOR) & (ptp_module->tag_mask);

    /* mark this as a collective tag, to avoid conflict with user-level tags */
    tag = -tag;

    if (PTPCOLL_PROXY == ptp_module->pow_2type) {
        /* I will participate in the exchange - wait for signal from extra
         * process */
        /*
         * recv from extra rank - my_extra_partner_comm_rank
         *  can use blocking recv, as no other communications
         *  need to take place.
         */
        my_extra_partner_comm_rank =
                       ptp_module->super.sbgp_partner_module->group_list[ptp_module->proxy_extra_index];

        collreq->need_toserv_extra = 1;
        collreq->extra_partner_rank = my_extra_partner_comm_rank;

        rc = MCA_PML_CALL(irecv(NULL, 0, MPI_INT,
                    my_extra_partner_comm_rank, tag, comm,
                    &(requests[0])));
        if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
            PTPCOLL_ERROR(("IRecv failed."));
            return rc;
        }

        completed = mca_bcol_ptpcoll_test_for_match(&requests[0], &rc);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
            PTPCOLL_ERROR(("Test for irecv failed."));
            return rc;
        }

        if (!completed) {
            collreq->tag = tag;
            collreq->num_reqs = 1;
            collreq->exchange = 0;

            return BCOL_FN_STARTED;
        }
    } else {
        collreq->need_toserv_extra = 0;
    }

    /* Loop over exchange send/recv pairs */
    delta = 1;
    for (exchange = 0; exchange < n_exchange; ++exchange) {

        /* rank of exchange partner within the group */
        pair_rank = my_rank ^ delta;

        /* rank within the communicator */
        pair_comm_rank =
            ptp_module->super.sbgp_partner_module->group_list[pair_rank];

        /* send to partner - we will wait for completion, as send
         *   completion is at the MPI level, and will not
         *   incur network level completion costs
         */
        rc = MCA_PML_CALL(isend(NULL, 0, MPI_INT,
                    pair_comm_rank, tag,
                    MCA_PML_BASE_SEND_STANDARD, comm,
                    &(requests[0])));
        if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
            PTPCOLL_ERROR(("ISend failed."));
            return rc;
        }

        ++num_reqs;

        /* receive from partner */
        rc = MCA_PML_CALL(irecv(NULL, 0, MPI_INT,
                    pair_comm_rank, tag, comm,
                    &(requests[1])));
        if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
            PTPCOLL_ERROR(("IRecv failed."));
            return rc;
        }

        ++num_reqs;

        PTPCOLL_VERBOSE(5, ("exchange - %d, pair_rank - %d, pair_comm_rank - %d",
                             exchange, pair_rank, pair_comm_rank));

        /* test for completion */
        completed =
            mca_bcol_ptpcoll_test_all_for_match(&num_reqs, requests, &rc);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
            PTPCOLL_ERROR(("Test for all failed."));
            return rc;
        }

        if (!completed) {
            collreq->tag = tag;
            collreq->num_reqs = num_reqs;

            collreq->exchange = exchange + 1;
            assert(collreq->exchange >= 0);

            return BCOL_FN_STARTED;
        }

        delta <<= 1; /* delta *= 2 */
    }

    if (PTPCOLL_PROXY == ptp_module->pow_2type) {
        /* send - let the extra rank know that we are done */
        rc = MCA_PML_CALL(isend(NULL, 0, MPI_INT,
                    my_extra_partner_comm_rank, tag,
                    MCA_PML_BASE_SEND_STANDARD, comm,
                    &(requests[0])));
        if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
            PTPCOLL_ERROR(("ISend failed."));
            return rc;
        }

        completed = mca_bcol_ptpcoll_test_for_match(&requests[0], &rc);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
            PTPCOLL_ERROR(("Test for isend failed."));
            return rc;
        }

        if (!completed) {
            collreq->tag = tag;
            collreq->num_reqs = 1;

            collreq->need_toserv_extra = 0;
            collreq->exchange = n_exchange;

            return BCOL_FN_STARTED;
        }
    }

    OMPI_FREE_LIST_RETURN_MT(&ptp_module->collreqs_free, (ompi_free_list_item_t *) collreq);
    return BCOL_FN_COMPLETE;
}
static int bcol_ptpcoll_barrier_recurs_dbl_extra_new(
                                bcol_function_args_t *input_args,
                                struct coll_ml_function_t *const_args)
{
   /* local variable */
    uint64_t sequence_number;
    int rc, completed, num_reqs = 2,
        tag, my_extra_partner_comm_rank;

    ompi_request_t **requests;
    ompi_free_list_item_t *item;

    mca_bcol_ptpcoll_collreq_t *collreq;

    mca_bcol_ptpcoll_module_t *ptp_module =
                         (mca_bcol_ptpcoll_module_t *) const_args->bcol_module;
    ompi_communicator_t *comm = ptp_module->super.sbgp_partner_module->group_comm;

    OMPI_FREE_LIST_WAIT_MT(&ptp_module->collreqs_free, item);
    if (OPAL_UNLIKELY(NULL == item)) {
        PTPCOLL_ERROR(("Free list waiting failed."));
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    collreq = (mca_bcol_ptpcoll_collreq_t *) item;
    input_args->bcol_opaque_data = (void *) collreq;

    requests = collreq->requests;

    /* TAG Calculation */
    sequence_number = input_args->sequence_num;

    /* Keep tag within the limit supported by the pml */
    tag = (PTPCOLL_TAG_OFFSET + sequence_number * PTPCOLL_TAG_FACTOR) & (ptp_module->tag_mask);

    /* mark this as a collective tag, to avoid conflict with user-level tags */
    tag = -tag;

    /* I will not participate in the exchange itself, so just check in:
     * signal the extra rank that I am here */

    my_extra_partner_comm_rank =
                 ptp_module->super.sbgp_partner_module->group_list[ptp_module->proxy_extra_index];

    rc = MCA_PML_CALL(isend(NULL, 0, MPI_INT,
                my_extra_partner_comm_rank, tag,
                MCA_PML_BASE_SEND_STANDARD, comm,
                &(requests[0])));
    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
        PTPCOLL_ERROR(("Send failed."));
        return rc;
    }

    /* Recv signal that the rest are done - my_extra_partner_comm_rank */
    rc = MCA_PML_CALL(irecv(NULL, 0, MPI_INT,
                my_extra_partner_comm_rank, tag, comm,
                &(requests[1])));
    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
        PTPCOLL_ERROR(("IRecv failed."));
        return rc;
    }

    /* Test for completion */
    completed =
        mca_bcol_ptpcoll_test_all_for_match(&num_reqs, requests, &rc);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
        PTPCOLL_ERROR(("Test for all failed."));
        return rc;
    }

    if (!completed) {
        return BCOL_FN_STARTED;
    }

    OMPI_FREE_LIST_RETURN_MT(&ptp_module->collreqs_free, (ompi_free_list_item_t *) collreq);
    return BCOL_FN_COMPLETE;
}
Example #18
/*
 * We have received a segment, take action based on the 
 * packet type in the BTL header
 */
void ompi_btl_usnic_recv(ompi_btl_usnic_module_t *module,
                           ompi_btl_usnic_recv_segment_t *seg,
                           struct ibv_recv_wr **repost_recv_head)
{
    ompi_btl_usnic_segment_t *bseg;
    mca_btl_active_message_callback_t* reg;
    ompi_btl_usnic_endpoint_t *endpoint;
    ompi_btl_usnic_btl_chunk_header_t *chunk_hdr;
    uint32_t window_index;
#if MSGDEBUG1
    char src_mac[32];
    char dest_mac[32];
#endif

    bseg = &seg->rs_base;

    ++module->num_total_recvs;

    /* Valgrind help */
    opal_memchecker_base_mem_defined((void*)(seg->rs_recv_desc.sg_list[0].addr),
                                     seg->rs_recv_desc.sg_list[0].length);

#if MSGDEBUG1
    memset(src_mac, 0, sizeof(src_mac));
    memset(dest_mac, 0, sizeof(dest_mac));
    ompi_btl_usnic_sprintf_gid_mac(src_mac,
            &seg->rs_protocol_header->grh.sgid);
    ompi_btl_usnic_sprintf_gid_mac(dest_mac, 
            &seg->rs_protocol_header->grh.dgid);

#if MSGDEBUG
    opal_output(0, "Got message from MAC %s", src_mac);
    opal_output(0, "Looking for sender: 0x%016lx",
        bseg->us_btl_header->sender);
#endif
#endif

    /* Find out who sent this segment */
    endpoint = lookup_sender(module, bseg);
    seg->rs_endpoint = endpoint;
    if (FAKE_RECV_FRAG_DROP || OPAL_UNLIKELY(NULL == endpoint)) {
        /* No idea who this was from, so drop it */
#if MSGDEBUG1
        opal_output(0, "=== Unknown sender; dropped: from MAC %s to MAC %s, seq %" UDSEQ, 
                    src_mac, 
                    dest_mac, 
                    bseg->us_btl_header->seq);
#endif
        ++module->num_unk_recvs;
        goto repost_no_endpoint;
    }

    /***********************************************************************/
    /* Segment is an incoming frag */
    if (OMPI_BTL_USNIC_PAYLOAD_TYPE_FRAG == bseg->us_btl_header->payload_type) {

        /* Is incoming sequence # ok? */
        if (!ompi_btl_usnic_check_rx_seq(endpoint, seg, &window_index)) {
            goto repost;
        }

#if MSGDEBUG1
        opal_output(0, "<-- Received FRAG ep %p, seq %" UDSEQ ", len=%d\n",
                    (void*) endpoint,
                    seg->rs_base.us_btl_header->seq,
                    seg->rs_base.us_btl_header->payload_len);
#if 0

        opal_output(0, "<-- Received FRAG ep %p, seq %" UDSEQ " from %s to %s: GOOD! (rel seq %d, lowest seq %" UDSEQ ", highest seq: %" UDSEQ ", rwstart %d) seg %p, module %p\n",
                    (void*) endpoint,
                    seg->rs_base.us_btl_header->seq, 
                    src_mac, dest_mac,
                    window_index,
                    endpoint->endpoint_next_contig_seq_to_recv,
                    endpoint->endpoint_highest_seq_rcvd,
                    endpoint->endpoint_rfstart,
                    (void*) seg, (void*) module);
        if (seg->rs_base.us_btl_header->put_addr != NULL) {
            opal_output(0, "  put_addr = %p\n",
                    seg->rs_base.us_btl_header->put_addr);
        }
#endif
#endif

        /*
         * update window before callback because callback might
         * generate a send, and we'd like to piggy-back ACK if possible
         */
        ompi_btl_usnic_update_window(endpoint, window_index);

        /* Stats */
        ++module->num_frag_recvs;

        /* If this is not a PUT, pass this segment up to the PML.
         * Be sure to get the payload length from the BTL header because
         * the L2 layer may artificially inflate (or otherwise change)
         * the frame length to meet minimum sizes, add protocol information,
         * etc.
         */
        if (seg->rs_base.us_btl_header->put_addr == NULL) {
            reg = mca_btl_base_active_message_trigger +
                bseg->us_payload.pml_header->tag;
            seg->rs_segment.seg_len = bseg->us_btl_header->payload_len;
            reg->cbfunc(&module->super, bseg->us_payload.pml_header->tag, 
                        &seg->rs_desc, reg->cbdata);

        /*
         * If this is a PUT, need to copy it to user buffer
         */
        } else {
#if MSGDEBUG1
            opal_output(0, "Copy %d PUT bytes to %p\n", 
                seg->rs_base.us_btl_header->payload_len,
                chunk_hdr->ch_hdr.put_addr);
#endif
            memcpy(seg->rs_base.us_btl_header->put_addr,
                    seg->rs_base.us_payload.raw,
                    seg->rs_base.us_btl_header->payload_len);
        }

        goto repost;
    }

    /***********************************************************************/
    /* Segment is an incoming chunk */
    if (OMPI_BTL_USNIC_PAYLOAD_TYPE_CHUNK == bseg->us_btl_header->payload_type) {
        int frag_index;
        ompi_btl_usnic_rx_frag_info_t *fip;

        /* Is incoming sequence # ok? */
        if (!ompi_btl_usnic_check_rx_seq(endpoint, seg, &window_index)) {
            goto repost;
        }

#if MSGDEBUG1
        opal_output(0, "<-- Received CHUNK fid %d ep %p, seq %" UDSEQ " from %s to %s: GOOD! (rel seq %d, lowest seq %" UDSEQ ", highest seq: %" UDSEQ ", rwstart %d) seg %p, module %p\n",
                    seg->rs_base.us_btl_chunk_header->ch_frag_id,
                    (void*) endpoint,
                    seg->rs_base.us_btl_chunk_header->ch_hdr.seq, 
                    src_mac, dest_mac,
                    window_index,
                    endpoint->endpoint_next_contig_seq_to_recv,
                    endpoint->endpoint_highest_seq_rcvd,
                    endpoint->endpoint_rfstart,
                    (void*) seg, (void*) module);
#endif

        /* Start a new fragment if one is not already in progress: allocate
         * memory, etc.  When the last byte arrives, release the frag_id and
         * pass the data to the PML.
         */
        chunk_hdr = seg->rs_base.us_btl_chunk_header;
        frag_index = chunk_hdr->ch_frag_id % MAX_ACTIVE_FRAGS;
        fip = &(endpoint->endpoint_rx_frag_info[frag_index]);

        /* frag_id == 0 means this slot is empty, grab it! */
        if (0 == fip->rfi_frag_id) {
            fip->rfi_frag_id = chunk_hdr->ch_frag_id;
            fip->rfi_frag_size = chunk_hdr->ch_frag_size;
            if (chunk_hdr->ch_hdr.put_addr == NULL) {
                int pool;

                fip->rfi_data = NULL;

                /* See which data pool this should come from,
                 * or if it should be malloc()ed
                 */
                pool = fls(chunk_hdr->ch_frag_size-1);
                if (pool >= module->first_pool &&
                        pool <= module->last_pool) {
                    ompi_free_list_item_t* item;
                    OMPI_FREE_LIST_GET_MT(&module->module_recv_buffers[pool],
                                          item);
                    if (OPAL_LIKELY(NULL != item)) {
                        fip->rfi_data = (char *)item;
                        fip->rfi_data_pool = pool;
                    }
                }
                if (fip->rfi_data == NULL) {
                    fip->rfi_data = malloc(chunk_hdr->ch_frag_size);
                    fip->rfi_data_pool = 0;
                }
                if (fip->rfi_data == NULL) {
                    abort();
                }
#if MSGDEBUG2
opal_output(0, "Start large recv to %p, size=%d\n",
        fip->rfi_data, chunk_hdr->ch_frag_size);
#endif
            } else {
#if MSGDEBUG2
opal_output(0, "Start PUT to %p\n", chunk_hdr->ch_hdr.put_addr);
#endif
                fip->rfi_data = chunk_hdr->ch_hdr.put_addr;
            }
            fip->rfi_bytes_left = chunk_hdr->ch_frag_size;
            fip->rfi_frag_id = chunk_hdr->ch_frag_id;

        /* frag_id is not 0 - it must match, drop if not */
        } else if (fip->rfi_frag_id != chunk_hdr->ch_frag_id) {
            ++module->num_badfrag_recvs;
            goto repost;
        }
#if MSGDEBUG1
        opal_output(0, "put_addr=%p, copy_addr=%p, off=%d\n",
                chunk_hdr->ch_hdr.put_addr,
                fip->rfi_data+chunk_hdr->ch_frag_offset,
                chunk_hdr->ch_frag_offset);
#endif

        /* Stats */
        ++module->num_chunk_recvs;

        /* validate offset and len to be within fragment */
        assert(chunk_hdr->ch_frag_offset + chunk_hdr->ch_hdr.payload_len <=
                fip->rfi_frag_size);
        assert(fip->rfi_frag_size == chunk_hdr->ch_frag_size);

        /* copy the data into place */
        memcpy(fip->rfi_data + chunk_hdr->ch_frag_offset, (char *)(chunk_hdr+1),
                chunk_hdr->ch_hdr.payload_len);

        /* update sliding window */
        ompi_btl_usnic_update_window(endpoint, window_index);

        fip->rfi_bytes_left -= chunk_hdr->ch_hdr.payload_len;
        if (0 == fip->rfi_bytes_left) {
            mca_btl_base_header_t *pml_header;
            mca_btl_base_descriptor_t desc;
            mca_btl_base_segment_t segment;

            /* Get access to PML header in assembled fragment so we
             * can pull out the tag
             */
            pml_header = (mca_btl_base_header_t *)(fip->rfi_data);
            segment.seg_addr.pval = pml_header;
            segment.seg_len = fip->rfi_frag_size;
            desc.des_dst = &segment;
            desc.des_dst_cnt = 1;

            /* only pass up to the PML if this was not a put */
            if (chunk_hdr->ch_hdr.put_addr == NULL) {

                /* Pass this segment up to the PML */
#if MSGDEBUG2
                opal_output(0, "  large FRAG complete, pass up %p, %d bytes, tag=%d\n",
                        desc.des_dst->seg_addr.pval, desc.des_dst->seg_len,
                        pml_header->tag);
#endif
                reg = mca_btl_base_active_message_trigger + pml_header->tag;

                /* mca_pml_ob1_recv_frag_callback_frag() */
                reg->cbfunc(&module->super, pml_header->tag,
                        &desc, reg->cbdata);

                /* free temp buffer for non-put */
                if (0 == fip->rfi_data_pool) {
                    free(fip->rfi_data);
                } else {
                    OMPI_FREE_LIST_RETURN_MT(
                            &module->module_recv_buffers[fip->rfi_data_pool],
                            (ompi_free_list_item_t *)fip->rfi_data);
                }

#if MSGDEBUG2
            } else {
                opal_output(0, "PUT complete, suppressing callback\n");
#endif
            }

            /* release the fragment ID */
            fip->rfi_frag_id = 0;

            /* force immediate ACK */
            endpoint->endpoint_acktime = 0;
        }
        goto repost;
    }

    /***********************************************************************/
    /* Frag is an incoming ACK */
    else if (OPAL_LIKELY(OMPI_BTL_USNIC_PAYLOAD_TYPE_ACK == 
                         bseg->us_btl_header->payload_type)) {
        ompi_btl_usnic_seq_t ack_seq;

        /* sequence being ACKed */
        ack_seq = bseg->us_btl_header->ack_seq;

        /* Stats */
        ++module->num_ack_recvs;

#if MSGDEBUG1
        opal_output(0, "    Received ACK for sequence number %" UDSEQ " from %s to %s\n",
                    bseg->us_btl_header->ack_seq, src_mac, dest_mac);
#endif
        ompi_btl_usnic_handle_ack(endpoint, ack_seq);

        goto repost;
    }

    /***********************************************************************/
    /* Have no idea what the frag is; drop it */
    else {
        ++module->num_unk_recvs;
        opal_output(0, "==========================unknown 2");
        goto repost;
    }

    /***********************************************************************/
 repost:

    /* if endpoint exiting, and all ACKs received, release the endpoint */
    if (endpoint->endpoint_exiting && ENDPOINT_DRAINED(endpoint)) {
        OBJ_RELEASE(endpoint);
    }
 repost_no_endpoint:
    ++module->num_recv_reposts;

    /* Add recv to linked list for reposting */
    seg->rs_recv_desc.next = *repost_recv_head;
    *repost_recv_head = &seg->rs_recv_desc;
}
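
The chunk path above selects a receive-buffer pool with pool = fls(chunk_hdr->ch_frag_size - 1), i.e. the index of the smallest power-of-two bucket that can hold the whole fragment. A self-contained illustration; fls is reimplemented here so the sketch does not depend on a platform header:

#include <stdio.h>

/* fls(x): 1-based index of the highest set bit, 0 when x == 0 */
static int my_fls(unsigned int x)
{
    int i = 0;
    while (x) { ++i; x >>= 1; }
    return i;
}

int main(void)
{
    unsigned int sizes[] = { 1, 4096, 4097 };
    for (int i = 0; i < 3; ++i) {
        int pool = my_fls(sizes[i] - 1);
        /* bucket `pool` holds up to 2^pool bytes, always >= sizes[i] */
        printf("size %u -> pool %d (capacity %u)\n", sizes[i], pool, 1u << pool);
    }
    return 0;
}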
Example #19
/*
 * This function opens and handle using the handle that was received
 * from the remote memory.  It uses the addr and size of the remote
 * memory for caching the registration.
 */
int mca_mpool_rgpusm_register(mca_mpool_base_module_t *mpool, void *addr,
                             size_t size, uint32_t flags,
                             mca_mpool_base_registration_t **reg)
{
    mca_mpool_rgpusm_module_t *mpool_rgpusm = (mca_mpool_rgpusm_module_t*)mpool;
    mca_mpool_common_cuda_reg_t *rgpusm_reg;
    mca_mpool_common_cuda_reg_t *rget_reg;
    ompi_free_list_item_t *item;
    int rc;
    int mypeer;  /* just for debugging */

    /* In order to preserve the signature of the mca_mpool_rgpusm_register
     * function, we are using the **reg variable to not only get back the
     * registration information, but to hand in the memory handle received
     * from the remote side. */
    rget_reg = (mca_mpool_common_cuda_reg_t *)*reg;

    mypeer = flags;
    flags = 0;
    /* No need to support MCA_MPOOL_FLAGS_CACHE_BYPASS in here. It is not used. */
    assert(0 == (flags & MCA_MPOOL_FLAGS_CACHE_BYPASS));

    /* This chunk of code handles the case where leave pinned is not
     * set and we do not use the cache.  This is not typically how we
     * will be running.  This means that one can have an unlimited
     * number of registrations occurring at the same time.  Since we
     * are not leaving the registrations pinned, the number of
     * registrations is unlimited and there is no need for a cache. */
    if(!mca_mpool_rgpusm_component.leave_pinned && 0 == mca_mpool_rgpusm_component.rcache_size_limit) {
        OMPI_FREE_LIST_GET_MT(&mpool_rgpusm->reg_list, item);
        if(NULL == item) {
            return OMPI_ERR_OUT_OF_RESOURCE;
        }
        rgpusm_reg = (mca_mpool_common_cuda_reg_t*)item;
        rgpusm_reg->base.mpool = mpool;
        rgpusm_reg->base.base = addr;
        rgpusm_reg->base.bound = (unsigned char *)addr + size - 1;
        rgpusm_reg->base.flags = flags;

        /* Copy the memory handle received into the registration */
        memcpy(rgpusm_reg->memHandle, rget_reg->memHandle, sizeof(rget_reg->memHandle));

        /* The rget_reg registration is holding the memory handle needed
         * to register the remote memory.  This was received from the remote
         * process.  A pointer to the memory is returned in the alloc_base field. */
        rc = mpool_rgpusm->resources.register_mem(addr, size,
                                                 (mca_mpool_base_registration_t *)rgpusm_reg,
                                                 (mca_mpool_base_registration_t *)rget_reg);

        /* This error should not happen with no cache in use. */
        assert(OMPI_ERR_WOULD_BLOCK != rc);

        if(rc != OMPI_SUCCESS) {
            OMPI_FREE_LIST_RETURN_MT(&mpool_rgpusm->reg_list, item);
            return rc;
        }
        rgpusm_reg->base.ref_count++;
        *reg = (mca_mpool_base_registration_t *)rgpusm_reg;
        return OMPI_SUCCESS;
    }

    /* Check to see if memory is registered and stored in the cache. */
    OPAL_THREAD_LOCK(&mpool->rcache->lock);
    SET_PAGE_ALIGNMENT_TO_ZERO();
    mpool->rcache->rcache_find(mpool->rcache, addr, size, reg);
    RESTORE_PAGE_ALIGNMENT();

    /* If *reg is not NULL, we have a registration.  Let us see if the
     * memory handle matches the one we were looking for.  If not, the
     * registration is invalid and needs to be removed. This happens
     * if memory was allocated, freed, and allocated again and ends up
     * with the same virtual address and within the limits of the
     * previous registration.  The memory handle check will catch that
     * scenario as the handles have unique serial numbers.  */
    if (*reg != NULL) {
        mpool_rgpusm->stat_cache_hit++;
        opal_output_verbose(10, mca_mpool_rgpusm_component.output,
                            "RGPUSM: Found addr=%p,size=%d (base=%p,size=%d) in cache",
                            addr, (int)size, (*reg)->base,
                            (int)((*reg)->bound - (*reg)->base));

        if (mca_common_cuda_memhandle_matches((mca_mpool_common_cuda_reg_t *)*reg, rget_reg)) {
            /* Registration matches what was requested.  All is good. */
            mpool_rgpusm->stat_cache_valid++;
        } else {
            /* This is an old registration.  Need to boot it. */
            opal_output_verbose(10, mca_mpool_rgpusm_component.output,
                                "RGPUSM: Mismatched Handle: Evicting/unregistering "
                                "addr=%p,size=%d (base=%p,size=%d) from cache",
                                addr, (int)size, (*reg)->base,
                                (int)((*reg)->bound - (*reg)->base));

            /* The ref_count has to be zero as this memory cannot possibly
             * be in use.  Assert on that just to make sure. */
            assert(0 == (*reg)->ref_count);
            if (mca_mpool_rgpusm_component.leave_pinned) {
                opal_list_remove_item(&mpool_rgpusm->lru_list,
                                      (opal_list_item_t*)(*reg));
            }

            /* Bump the reference count to keep things copacetic in deregister */
            (*reg)->ref_count++;
            /* Invalidate the registration so it will get booted out. */
            (*reg)->flags |= MCA_MPOOL_FLAGS_INVALID;
            mca_mpool_rgpusm_deregister(mpool, *reg);
            *reg = NULL;
            mpool_rgpusm->stat_cache_invalid++;
        }
    } else {
        /* Nothing was found in the cache. */
        mpool_rgpusm->stat_cache_miss++;
    }

    /* If we have a registration here, then we know it is valid. */
    if (*reg != NULL) {
        opal_output_verbose(10, mca_mpool_rgpusm_component.output,
                            "RGPUSM: CACHE HIT is good: ep=%d, addr=%p, size=%d in cache",
                            mypeer, addr, (int)size);

        /* When using leave pinned, we keep an LRU list. */
        if ((0 == (*reg)->ref_count) && mca_mpool_rgpusm_component.leave_pinned) {
            opal_output_verbose(20, mca_mpool_rgpusm_component.output,
                                "RGPUSM: POP OFF LRU: ep=%d, addr=%p, size=%d in cache",
                                mypeer, addr, (int)size);
            opal_list_remove_item(&mpool_rgpusm->lru_list,
                                  (opal_list_item_t*)(*reg));
        }
        (*reg)->ref_count++;
        OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
        opal_output(-1, "reg->ref_count=%d", (int)(*reg)->ref_count);
        opal_output_verbose(80, mca_mpool_rgpusm_component.output,
                           "RGPUSM: Found entry in cache addr=%p, size=%d", addr, (int)size);
        return OMPI_SUCCESS;
    }

    /* If we are here, then we did not find a registration, or it was invalid,
     * so this is a new one, and we are going to use the cache. */
    assert(NULL == *reg);
    opal_output_verbose(10, mca_mpool_rgpusm_component.output,
                        "RGPUSM: New registration ep=%d, addr=%p, size=%d. Need to register and insert in cache",
                         mypeer, addr, (int)size);

    OMPI_FREE_LIST_GET_MT(&mpool_rgpusm->reg_list, item);
    if(NULL == item) {
        OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
        return OMPI_ERR_OUT_OF_RESOURCE;
    }
    rgpusm_reg = (mca_mpool_common_cuda_reg_t*)item;

    rgpusm_reg->base.mpool = mpool;
    rgpusm_reg->base.base = addr;
    rgpusm_reg->base.bound = (unsigned char *)addr + size - 1;
    rgpusm_reg->base.flags = flags;

    /* Need the memory handle saved in the registration */
    memcpy(rgpusm_reg->memHandle, rget_reg->memHandle, sizeof(rget_reg->memHandle));

    /* Actually register the memory, which opens the memory handle.
     * Need to do this prior to putting in the cache as the base and
     * bound values may be changed by the registration.  The memory
     * associated with the handle comes back in the alloc_base
     * value. */
    rc = mpool_rgpusm->resources.register_mem(addr, size, (mca_mpool_base_registration_t *)rgpusm_reg,
                                             (mca_mpool_base_registration_t *)rget_reg);
    /* There is a chance we can get OMPI_ERR_WOULD_BLOCK from the
     * CUDA code's attempt to register the memory.  This can happen
     * as follows.  A block of memory is registered.
     * Then the sending side frees the memory.  The sending side then
     * cuMemAllocs memory again and gets the same base
     * address. However, it cuMemAllocs a block that is larger than
     * the one in the cache.  The cache will return that memory is not
     * registered and call into CUDA to register it.  However, that
     * will fail with CUDA_ERROR_ALREADY_MAPPED.  Therefore we need to
     * boot that previous allocation out and deregister it first.
     */
    if (OMPI_ERR_WOULD_BLOCK == rc) {
        mca_mpool_base_registration_t *oldreg;

        SET_PAGE_ALIGNMENT_TO_ZERO();
        /* Need to make sure it is at least 4 bytes in size.  This will
         * ensure we get a hit in the cache. */
        mpool->rcache->rcache_find(mpool->rcache, addr, 4, &oldreg);
        RESTORE_PAGE_ALIGNMENT();

        /* For most cases, we will find a registration that overlaps.
         * Removal of it should allow the registration we are
         * attempting to succeed. */
        if (NULL != oldreg) {
            /* The ref_count has to be zero as this memory cannot
             * possibly be in use.  Assert on that just to make sure. */
            assert(0 == oldreg->ref_count);
            if (mca_mpool_rgpusm_component.leave_pinned) {
                opal_list_remove_item(&mpool_rgpusm->lru_list,
                                      (opal_list_item_t*)oldreg);
            }

            /* Bump the reference count to keep things copacetic in deregister */
            oldreg->ref_count++;
            /* Invalidate the registration so it will get booted out. */
            oldreg->flags |= MCA_MPOOL_FLAGS_INVALID;
            mca_mpool_rgpusm_deregister(mpool, oldreg);
            mpool_rgpusm->stat_evicted++;

            /* And try again.  This one usually works. */
            rc = mpool_rgpusm->resources.register_mem(addr, size, (mca_mpool_base_registration_t *)rgpusm_reg,
                                                      (mca_mpool_base_registration_t *)rget_reg);
        }

        /* There is a chance that another registration is blocking our
         * ability to register.  Check the rc to see if we still need
         * to try and clear out registrations. */
        while (OMPI_SUCCESS != rc) {
            if (true != mca_mpool_rgpusm_deregister_lru(mpool)) {
                rc = OMPI_ERROR;
                break;
            }
            /* Clear out one registration. */
            rc = mpool_rgpusm->resources.register_mem(addr, size, (mca_mpool_base_registration_t *)rgpusm_reg,
                                                      (mca_mpool_base_registration_t *)rget_reg);
        }
    }

    if(rc != OMPI_SUCCESS) {
        OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
        OMPI_FREE_LIST_RETURN_MT(&mpool_rgpusm->reg_list, item);
        return rc;
    }

    opal_output_verbose(80, mca_mpool_rgpusm_component.output,
                        "RGPUSM: About to insert in rgpusm cache addr=%p, size=%d", addr, (int)size);
    SET_PAGE_ALIGNMENT_TO_ZERO();
    while((rc = mpool->rcache->rcache_insert(mpool->rcache, (mca_mpool_base_registration_t *)rgpusm_reg,
             mca_mpool_rgpusm_component.rcache_size_limit)) ==
            OMPI_ERR_TEMP_OUT_OF_RESOURCE) {
        opal_output(-1, "No room in the cache - boot one out");
        if (!mca_mpool_rgpusm_deregister_lru(mpool)) {
            break;
        }
    }
    RESTORE_PAGE_ALIGNMENT();

    if(rc != OMPI_SUCCESS) {
        OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
        OMPI_FREE_LIST_RETURN_MT(&mpool_rgpusm->reg_list, item);
        /* We cannot recover from this.  We can be here if the size of
         * the cache is smaller than the amount of memory we are
         * trying to register in a single transfer.  In that case, rc
         * is MPI_ERR_OUT_OF_RESOURCES, but everything is stuck at
         * that point.  Therefore, just error out completely.
         */
        return OMPI_ERROR;
    }

    rgpusm_reg->base.ref_count++;
    *reg = (mca_mpool_base_registration_t *)rgpusm_reg;
    OPAL_THREAD_UNLOCK(&mpool->rcache->lock);

    /* Cleanup any vmas that we have deferred deletion on */
    mpool->rcache->rcache_clean(mpool->rcache);
    return OMPI_SUCCESS;
}
/*
 * Receive prepost:
 * return values:
 * 0 - no prepost was done
 * -1 - fatal error during prepost
 * other value - number preposted elements
 */
static int mca_bcol_iboffload_frag_reg_qp_prepost(
                mca_bcol_iboffload_endpoint_t *endpoint,
                int qp_index, int num_to_prepost)
{
    ompi_free_list_item_t *item;
    mca_bcol_iboffload_frag_t *frag;

    struct ibv_recv_wr *recv_wr, *recv_bad;
    int i, ret, num_preposted = 0, start_wr_index;

    mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component;
    mca_bcol_iboffload_device_t *device = endpoint->iboffload_module->device;

    opal_list_t *preposted = &(endpoint->qps[qp_index].preposted_frags);
    mca_bcol_iboffload_recv_wr_manager *recv_wrs = &cm->recv_wrs;

    IBOFFLOAD_VERBOSE(10, ("Recv prepost call: endpoint %p, to prepost %d",
                          (void *) endpoint, num_to_prepost));

    if (OPAL_UNLIKELY(0 == num_to_prepost)) {
        IBOFFLOAD_VERBOSE(10, ("num_to_prepost = 0, return immediate"));
        return OMPI_SUCCESS;
    }

    /* make sure that we do not overrun the number of rd_wqe */
    if (num_to_prepost > endpoint->qps[qp_index].rd_wqe) {
        IBOFFLOAD_VERBOSE(10, ("Reset num_to_prepost = %d, to rd_wqe = %d",
                                num_to_prepost, endpoint->qps[qp_index].rd_wqe));

        num_to_prepost = endpoint->qps[qp_index].rd_wqe;
    }

    OPAL_THREAD_LOCK(&recv_wrs->lock);

    /* calculate start index in array
     * of pre-allocated work requests */
    start_wr_index = cm->qp_infos[qp_index].rd_num - num_to_prepost;
    recv_wr = &recv_wrs->recv_work_requests[qp_index][start_wr_index];

    IBOFFLOAD_VERBOSE(10, ("Endpoint %p, qp_index - %d, to_porepost %d, "
                           "start index of WRs - %d, rd_wqe - %d",
                           (void *) endpoint, qp_index, num_to_prepost,
                            start_wr_index, endpoint->qps[qp_index].rd_wqe));

    while (num_preposted < num_to_prepost) {
        /* put the item on list of preposted */
        OMPI_FREE_LIST_GET_MT(&device->frags_free[qp_index], item);
        if (OPAL_UNLIKELY(NULL == item)) {
            break;
        }

        frag = (mca_bcol_iboffload_frag_t *) item;
        opal_list_append(preposted, (opal_list_item_t *) item);

        recv_wr[num_preposted].sg_list = &frag->sg_entry;
        /* TODO Pasha - fix it later */ /* Vasily: Is it right place to take a size value ???? */
        frag->sg_entry.length = cm->qp_infos[qp_index].size;
        ++num_preposted;
    }

    if (OPAL_LIKELY(num_preposted > 0)) {
        /* Set the tail */
        recv_wr[num_preposted - 1].next = NULL;

        /* post the list of recvs */
        ret = ibv_post_recv(endpoint->qps[qp_index].qp->lcl_qp, recv_wr, &recv_bad);
        if (OPAL_UNLIKELY(0 != ret)) {
            IBOFFLOAD_ERROR(("ibv_post_recv failed (%s), error: %s [%d], "
                             "qp_index - %d.\n",
                              ibv_get_device_name(device->dev.ib_dev),
                              strerror(errno), ret, qp_index));

            /* Return allocated frags */
            for (i = 0; i < num_preposted; i++) {
                OMPI_FREE_LIST_RETURN_MT(&device->frags_free[qp_index],
                        (ompi_free_list_item_t *)
                            opal_list_remove_last(preposted));
            }

            return OMPI_ERROR;
        }

        /* recover last recv_wr if needed */
        if (OPAL_UNLIKELY(num_to_prepost != num_preposted)) {
            recv_wr[num_preposted - 1].next = &recv_wr[num_preposted];
        }

        /* decreasing the number of free recv WQEs */
        endpoint->qps[qp_index].rd_wqe -= num_preposted;
    }

    OPAL_THREAD_UNLOCK(&recv_wrs->lock);

    IBOFFLOAD_VERBOSE(10, ("Endpoint %p, to_porepost %d, num preposted - %d",
                          (void *) endpoint, num_to_prepost, num_preposted));

    return OMPI_SUCCESS;
}
Example #21
void mca_mpool_rgpusm_finalize(struct mca_mpool_base_module_t *mpool)
{
    mca_mpool_rgpusm_module_t *mpool_rgpusm = (mca_mpool_rgpusm_module_t*)mpool;
    mca_mpool_base_registration_t *reg;
    mca_mpool_base_registration_t *regs[RGPUSM_MPOOL_NREGS];
    int reg_cnt, i;
    int rc;

    /* Statistics */
    if(true == mca_mpool_rgpusm_component.print_stats) {
        opal_output(0, "%s rgpusm: stats "
                "(hit/valid/invalid/miss/evicted): %d/%d/%d/%d/%d\n",
                OMPI_NAME_PRINT(OMPI_PROC_MY_NAME),
                mpool_rgpusm->stat_cache_hit, mpool_rgpusm->stat_cache_valid, 
                mpool_rgpusm->stat_cache_invalid, mpool_rgpusm->stat_cache_miss,
                mpool_rgpusm->stat_evicted);
    }

    OPAL_THREAD_LOCK(&mpool->rcache->lock);
    do {
        reg_cnt = mpool->rcache->rcache_find_all(mpool->rcache, 0, (size_t)-1,
                regs, RGPUSM_MPOOL_NREGS);
        opal_output(-1, "Registration size at finalize = %d", reg_cnt);

        for(i = 0; i < reg_cnt; i++) {
            reg = regs[i];

            if(reg->ref_count) {
                reg->ref_count = 0; /* otherwise dereg will fail on the assert */
            } else if (mca_mpool_rgpusm_component.leave_pinned) {
                opal_list_remove_item(&mpool_rgpusm->lru_list,
                        (opal_list_item_t*)reg);
            }

            /* Remove from rcache first */
            mpool->rcache->rcache_delete(mpool->rcache, reg);

            /* Drop lock before deregistering memory */
            OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
            assert(reg->ref_count == 0);
            rc = mpool_rgpusm->resources.deregister_mem(mpool_rgpusm->resources.reg_data,
                                                   reg);
            OPAL_THREAD_LOCK(&mpool->rcache->lock);

            if(rc != OMPI_SUCCESS) {
                /* We potentially lose track of this registration;
                   do we have to put it back? */
                continue;
            }

            OMPI_FREE_LIST_RETURN_MT(&mpool_rgpusm->reg_list,
                                  (ompi_free_list_item_t*)reg);
        }
    } while(reg_cnt == RGPUSM_MPOOL_NREGS);

    OBJ_DESTRUCT(&mpool_rgpusm->lru_list);
    OBJ_DESTRUCT(&mpool_rgpusm->reg_list);
    OPAL_THREAD_UNLOCK(&mpool->rcache->lock);

    /* Cleanup any vmas that we have deferred deletion on */
    mpool->rcache->rcache_clean(mpool->rcache);

}
Example #22
void test2(void)
{
    ompi_free_list_t key_list;
    ompi_free_list_item_t * new_value;
    ompi_rb_tree_t tree;
    int rc, i, size;
    void * result, * lookup;
    void * mem[NUM_ALLOCATIONS];
    ompi_free_list_item_t * key_array[NUM_ALLOCATIONS];
    struct timeval start, end;
    
    OBJ_CONSTRUCT(&key_list, ompi_free_list_t);
    ompi_free_list_init_new(&key_list, sizeof(ompi_test_rb_value_t),
            opal_cache_line_size,
            OBJ_CLASS(ompi_test_rb_value_t), 
            0,opal_cache_line_size,
            0, -1 , 128, NULL);
    
    OBJ_CONSTRUCT(&tree, ompi_rb_tree_t);
    rc = ompi_rb_tree_init(&tree, mem_node_compare);
    if(!test_verify_int(OMPI_SUCCESS, rc)) {
        test_failure("failed to properly initialize the tree");
    }
  
    size = 1;
    for(i = 0; i < NUM_ALLOCATIONS; i++)
    {
        mem[i] = malloc(size);
        if(NULL == mem[i])
        {
            test_failure("system out of memory");
            return;
        }   
        OMPI_FREE_LIST_GET_MT(&key_list, new_value);
        if(NULL == new_value)
        {
            test_failure("failed to get memory from free list");
        }
        key_array[i] = new_value;
        ((ompi_test_rb_value_t *) new_value)->key.bottom = mem[i];
        ((ompi_test_rb_value_t *) new_value)->key.top = 
                                            (void *) ((size_t) mem[i] + size - 1);
        ((ompi_test_rb_value_t *) new_value)->registered_mpools[0] = (void *)(intptr_t) i;
        rc = ompi_rb_tree_insert(&tree, &((ompi_test_rb_value_t *)new_value)->key, 
                        new_value);
        if(OMPI_SUCCESS != rc) 
        {
            test_failure("failed to properly insert a new node");
        }
        size += 1;   
    }
    
    gettimeofday(&start, NULL);
    for(i = 0; i < NUM_ALLOCATIONS; i++)
    {
        lookup = (void *) ((size_t) mem[i] + i);
        result = ompi_rb_tree_find(&tree, &lookup);
        if(NULL == result) 
        {
            test_failure("lookup returned null!");
        } else if(i != ((int)(intptr_t) ((ompi_test_rb_value_t *) result)->registered_mpools[0]))
        {
            test_failure("lookup returned wrong node!");
        }
        result = ompi_rb_tree_find(&tree, &lookup);
        if(NULL == result) 
        {
            test_failure("lookup returned null!");
        } else if(i != ((int)(intptr_t) ((ompi_test_rb_value_t *) result)->registered_mpools[0]))
        {
            test_failure("lookup returned wrong node!");
        }
    }

    gettimeofday(&end, NULL);

#if 0
    i = (end.tv_sec - start.tv_sec) * 1000000 + (end.tv_usec - start.tv_usec);
    printf("In a %d node tree, %d lookups took %f microseonds each\n", 
            NUM_ALLOCATIONS, NUM_ALLOCATIONS * 2, 
            (float) i / (float) (NUM_ALLOCATIONS * 2));
#endif

    for(i = 0; i < NUM_ALLOCATIONS; i++)
    {
        if(NULL != mem[i])
        {
            free(mem[i]);
        }
        OMPI_FREE_LIST_RETURN_MT(&(key_list), key_array[i]);
    }

    OBJ_DESTRUCT(&tree);
    OBJ_DESTRUCT(&key_list);
}