/*
 * Progress the collective fragments queued on the module's pending list.
 *
 * The list length is sampled once on entry and the loop runs at most that
 * many times.  Each fragment taken from the head of the pending list is put
 * back on the work_requests list of its full collective request, and the
 * request's progress_fn is invoked for it.
 *
 * An empty pending list is a no-op: nothing is removed or dereferenced.
 *
 * Returns OMPI_SUCCESS, or OMPI_ERROR as soon as a progress function returns
 * anything other than BCOL_FN_STARTED or OMPI_SUCCESS.
 */
static int progress_pending_collfrags(mca_bcol_iboffload_module_t *iboffload)
{
    mca_bcol_iboffload_collfrag_t *pending_collfrag;
    int rc, size = (int) opal_list_get_size(&iboffload->collfrag_pending);

    IBOFFLOAD_VERBOSE(10, ("Calling progress_pending_collfrags"));

    for (; size > 0; --size) {
        pending_collfrag = (mca_bcol_iboffload_collfrag_t *)
                           opal_list_remove_first(&iboffload->collfrag_pending);

        /* The list ran dry earlier than the sampled length: nothing left */
        if (OPAL_UNLIKELY(NULL == pending_collfrag)) {
            break;
        }

        IBOFFLOAD_VERBOSE(10, ("Get pending_collfrag - %p, iboffload - %p, "
                               "pending list size - %d.", pending_collfrag, iboffload,
                               (int) opal_list_get_size(&iboffload->collfrag_pending)));

        /* Return back coll frag to coll request opal_list */
        opal_list_append(&pending_collfrag->coll_full_req->work_requests,
                         (opal_list_item_t *) pending_collfrag);

        rc = pending_collfrag->coll_full_req->progress_fn
             (iboffload, pending_collfrag->coll_full_req);
        if (OPAL_UNLIKELY(BCOL_FN_STARTED != rc && OMPI_SUCCESS != rc)) {
            return OMPI_ERROR;
        }
    }

    return OMPI_SUCCESS;
}
Example #2
0
/*
 * Progress function for the large message scatter-allgather with zero copy.
 *
 * The completion counters of the collective request stored in
 * fn_arguments->bcol_opaque_data are checked up to max_progress_pull times.
 * Once both the MPI-level and the network-level fragment counters equal
 * n_fragments, the user send buffer is deregistered from the device mpool,
 * the request is released and BCOL_FN_COMPLETE is returned.  Otherwise
 * BCOL_FN_STARTED is returned.
 */
int mca_bcol_iboffload_zero_copy_progress(bcol_function_args_t *fn_arguments,
                                                   struct mca_bcol_base_function_t *const_args)
{
    mca_bcol_iboffload_collreq_t *coll_request =
                 (mca_bcol_iboffload_collreq_t *)fn_arguments->bcol_opaque_data;
    int attempt = 0;

    /* Complete the bcast - progress releases full request descriptors */
    while (attempt < mca_bcol_iboffload_component.max_progress_pull) {
        if (coll_request->n_frag_mpi_complete != coll_request->n_fragments ||
            coll_request->n_frag_net_complete != coll_request->n_fragments) {
            /* not finished yet, look again */
            ++attempt;
            continue;
        }

        IBOFFLOAD_VERBOSE(10, ("Deregister user buff.\n"));
        coll_request->module->device->mpool->mpool_deregister(
                coll_request->module->device->mpool,
                (mca_mpool_base_registration_t *) coll_request->buffer_info[SBUF].iboffload_reg);
        coll_request->buffer_info[SBUF].iboffload_reg = NULL;

        RELEASE_COLLREQ(coll_request);
        IBOFFLOAD_VERBOSE(10, ("New bcast done !!!"));

        return BCOL_FN_COMPLETE;
    }

    /* still in flight */
    return BCOL_FN_STARTED;
}
/*
 * Allocate and initialize the collective request used by a Fan-In operation.
 *
 * input_args    - function arguments; order_info and buffer_index are read
 *                 from it and bcol_opaque_data is set to the new request.
 * iboffload     - module the request belongs to; supplies the fanin_algth
 *                 progress function and the MQ credit consumption.
 * coll_request  - out: receives the request taken from the component's
 *                 collreqs_free list.
 *
 * The request is set up as a single-fragment message that uses its embedded
 * first_collfrag.
 *
 * Returns OMPI_SUCCESS, or OMPI_ERR_OUT_OF_RESOURCE when no request could be
 * obtained from the free list (in which case *coll_request is not written).
 */
static int mca_bcol_iboffload_fanin_init(
                bcol_function_args_t *input_args,
                mca_bcol_iboffload_module_t *iboffload,
                struct mca_bcol_iboffload_collreq_t **coll_request)
{
    ompi_free_list_item_t *item = NULL;
    mca_bcol_iboffload_collfrag_t *coll_fragment = NULL;

    mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component;

    /* The message used to name the barrier init routine (copy/paste) */
    IBOFFLOAD_VERBOSE(10, ("Calling for mca_bcol_iboffload_fanin_init"));

    OMPI_FREE_LIST_WAIT_MT(&cm->collreqs_free, item);
    if(OPAL_UNLIKELY(NULL == item)) {
        IBOFFLOAD_VERBOSE(10, ("Failing for coll request free list waiting.\n"));
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    (*coll_request) = (mca_bcol_iboffload_collreq_t *) item;
    (*coll_request)->progress_fn = iboffload->fanin_algth;

    (*coll_request)->completion_cb_fn = NULL;
    (*coll_request)->order_info = &input_args->order_info;

    (*coll_request)->module = iboffload;
    (*coll_request)->ml_buffer_index = input_args->buffer_index;
    (*coll_request)->buffer_info[SBUF].offset = 0;
    (*coll_request)->buffer_info[RBUF].offset = 0;
    /* Fan-In is posted on the barrier QP index */
    (*coll_request)->qp_index = MCA_BCOL_IBOFFLOAD_QP_BARRIER;

    /* Lets the progress function find the request again */
    input_args->bcol_opaque_data = (void *) (*coll_request);

    /* finish initializing full message descriptor */
    (*coll_request)->n_fragments  = 1;
    (*coll_request)->n_frags_sent = 1;

    (*coll_request)->n_frag_mpi_complete = 0;
    (*coll_request)->n_frag_net_complete = 0;

    (*coll_request)->user_handle_freed = false;

    /*
     * setup collective work request
     */

    /* get collective frag */
    coll_fragment = &(*coll_request)->first_collfrag;
    mca_bcol_iboffload_collfrag_init(coll_fragment);

    coll_fragment->alg = FANIN_ALG;
    coll_fragment->mq_index = COLL_MQ;

    /* Set mq credits */
    coll_fragment->mq_credits = iboffload->alg_task_consump[FANIN_ALG];

    /* set pointers for (coll frag) <-> (coll full request) */
    MCA_BCOL_IBOFFLOAD_SET_COLL_REQ_LINKS(*coll_request, coll_fragment);

    return OMPI_SUCCESS;
}
/*
 * Unload devices.
 *
 * Releases every device object stored in the component's device array
 * (indices 0 .. num_devs-1), empties and destructs the array, and frees the
 * verbs device list held in cm->ib_devs.
 *
 * Slots that hold no device are skipped; they are never dereferenced.
 *
 * Always returns OMPI_SUCCESS.
 */
static int iboffload_release_devices(void)
{
    int i;
    mca_bcol_iboffload_device_t *device = NULL;

    mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component;
    opal_pointer_array_t *devs = &cm->devices;

    IBOFFLOAD_VERBOSE(10, ("Destroy all devices.\n"));

    for (i = 0; i < cm->num_devs; i++) {
        device = opal_pointer_array_get_item(devs, i);

        /* Check before the log call below reads device->dev.ib_dev */
        if (NULL == device) {
            continue;
        }

        IBOFFLOAD_VERBOSE(10, ("Device %s with index %d will be destroyed.\n",
                               ibv_get_device_name(device->dev.ib_dev), i));
        OBJ_RELEASE(device);
    }

    IBOFFLOAD_VERBOSE(10, ("All devices were destroyed.\n"));

    opal_pointer_array_remove_all(devs);
    OBJ_DESTRUCT(devs);

    /* release device list; it may be NULL if loading the devices failed */
    if (NULL != cm->ib_devs) {
        ompi_ibv_free_device_list(cm->ib_devs);
        cm->ib_devs = NULL;
    }

    IBOFFLOAD_VERBOSE(10, ("All devices destroyed.\n"));

    return OMPI_SUCCESS;
}
Example #5
0
/*
 * Progress function for the small message bcast.
 *
 * Returns BCOL_FN_STARTED while BCOL_AND_NET_ARE_COMPLETED() does not hold
 * for the request kept in input_args->bcol_opaque_data.  Once it holds, the
 * user handle is marked as freed, the request is released if
 * COLLREQ_IS_DONE() says so, and BCOL_FN_COMPLETE is returned.
 */
int mca_bcol_iboffload_small_msg_bcast_progress(
                        bcol_function_args_t *input_args,
                        struct mca_bcol_base_function_t *const_args)
{
    mca_bcol_iboffload_collreq_t *coll_request =
        (mca_bcol_iboffload_collreq_t *) input_args->bcol_opaque_data;

    IBOFFLOAD_VERBOSE(10, ("Run progress.\n"));

    /* The data has to reach our children in the tree before the
       upper layer starts recycling the buffers */
    if (!(BCOL_AND_NET_ARE_COMPLETED(coll_request))) {
        return BCOL_FN_STARTED;
    }

    coll_request->user_handle_freed = true;

    if (COLLREQ_IS_DONE(coll_request)) {
        IBOFFLOAD_VERBOSE(10, ("Coll request already done.\n"));
        RELEASE_COLLREQ(coll_request);
    }

    IBOFFLOAD_VERBOSE(10, ("New bcast done !!!"));

    return BCOL_FN_COMPLETE;
}
/*
 * Component init query: loads the IB devices and sets up the QP
 * descriptions.
 *
 * On success the component's collm_init_query hook is replaced with
 * mca_bcol_iboffload_dummy_init_query and OMPI_SUCCESS is returned.
 * If either step fails, the devices are released and the failing step's
 * return code is handed back.
 *
 * Neither enable_progress_threads nor enable_mpi_threads is read here.
 */
int mca_bcol_iboffload_init_query(bool enable_progress_threads,
                                  bool enable_mpi_threads)
{
    mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component;
    int rc;

    IBOFFLOAD_VERBOSE(10, ("Init Iboffload component.\n"));

    /* Get list of HCAs and ports */
    rc = iboffload_load_devices();
    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
        IBOFFLOAD_ERROR(("Load devices error.\n"));
    } else {
        /* Setup the BSRQ QP's based on the final value of
           mca_bcol_iboffload_component.receive_queues. */
        rc = setup_qps();
        if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
            IBOFFLOAD_ERROR(("QPs setup error.\n"));
        }
    }

    if (OMPI_SUCCESS != rc) {
        /* common failure path: give the devices back */
        IBOFFLOAD_ERROR(("Release devices: an error occured.\n"));

        iboffload_release_devices();

        return rc;
    }

    cm->super.collm_init_query = mca_bcol_iboffload_dummy_init_query;

    return OMPI_SUCCESS;
}
/*
 * Entry point of the new style Fan-In: builds the collective request and
 * kicks off the module's fanin_algth on it.
 *
 * Returns BCOL_FN_STARTED when the algorithm was launched, and
 * BCOL_FN_NOT_STARTED when the request could not be initialized or the
 * algorithm returned OMPI_ERROR.
 */
static int mca_bcol_iboffload_new_style_fanin_intra(
                                bcol_function_args_t *input_args,
                                struct coll_ml_function_t *const_args)
{
    int rc = OMPI_SUCCESS;

    mca_bcol_iboffload_module_t *iboffload =
                    (mca_bcol_iboffload_module_t *) const_args->bcol_module;
    struct mca_bcol_iboffload_collreq_t *coll_request = NULL;

    assert(NULL != iboffload);

    MCA_BCOL_CHECK_ORDER(const_args->bcol_module, input_args);

    /* Set up the Fan-In collective request */
    rc = mca_bcol_iboffload_fanin_init(input_args, iboffload, &coll_request);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
        IBOFFLOAD_VERBOSE(10, ("Error from mca_bcol_iboffload_fanin_init.\n"));
        return BCOL_FN_NOT_STARTED;
    }

    /* Launch the algorithm; only OMPI_ERROR counts as a failed start */
    rc = iboffload->fanin_algth(iboffload, coll_request);

    return (OPAL_UNLIKELY(OMPI_ERROR == rc)) ? BCOL_FN_NOT_STARTED
                                             : BCOL_FN_STARTED;
}
/*
 * Register the iboffload Fan-In functions with the bcol base.
 *
 * The start function mca_bcol_iboffload_new_style_fanin_intra and the
 * progress function mca_bcol_iboffload_new_style_fanin_progress are
 * registered as a non-blocking BCOL_FANIN for message sizes 0..20000.
 *
 * Always returns OMPI_SUCCESS.
 */
int mca_bcol_iboffload_fanin_register(mca_bcol_base_module_t *super)
{
    mca_bcol_base_coll_fn_comm_attributes_t comm_attribs;
    mca_bcol_base_coll_fn_invoke_attributes_t inv_attribs;

    IBOFFLOAD_VERBOSE(10, ("Register iboffload Fan-In.\n"));

    /* communicator level attributes */
    comm_attribs.bcoll_type        = BCOL_FANIN;
    comm_attribs.comm_size_min     = 0;
    comm_attribs.comm_size_max     = 1024 * 1024;
    comm_attribs.waiting_semantics = NON_BLOCKING;
    comm_attribs.data_src          = DATA_SRC_KNOWN;

    /* invocation level attributes: message range 1, every datatype and op */
    inv_attribs.bcol_msg_min    = 0;
    inv_attribs.bcol_msg_max    = 20000;
    inv_attribs.datatype_bitmap = 0xffffffff;
    inv_attribs.op_types_bitmap = 0xffffffff;

    mca_bcol_base_set_attributes(super,
                                 &comm_attribs, &inv_attribs,
                                 mca_bcol_iboffload_new_style_fanin_intra,
                                 mca_bcol_iboffload_new_style_fanin_progress);

    return OMPI_SUCCESS;
}
Example #9
0
/*
 * Start function of the scatter-allgather bcast that uses
 * mca_bcol_iboffload_bcast_scatter_allgather_extra_exec as its progress
 * function.
 *
 * Initializes the bcast request with power_of_2 * 3 + 4 MQ credits and runs
 * the request's progress function once.  Returns the init error code if the
 * request could not be initialized, otherwise whatever the progress function
 * returned.
 */
int mca_bcol_iboffload_bcast_scatter_allgather_extra_intra(bcol_function_args_t *fn_arguments,
                                                   struct mca_bcol_base_function_t *const_args)
{
    mca_bcol_iboffload_module_t *iboffload_module =
        (mca_bcol_iboffload_module_t *) const_args->bcol_module;

    int rc;
    int mq_credits = iboffload_module->power_of_2 * 3  + 4;
    bool if_bcol_last = BCOL_IBOFFLOAD_IS_LAST_CALL(const_args);
    mca_bcol_iboffload_collreq_t *coll_request;

    MCA_BCOL_CHECK_ORDER(const_args->bcol_module, fn_arguments);

    rc = mca_bcol_iboffload_bcast_init(fn_arguments, iboffload_module,
                                       &coll_request, if_bcol_last, mq_credits,
                                       mca_bcol_iboffload_bcast_scatter_allgather_extra_exec);

    if (OPAL_LIKELY(OMPI_SUCCESS == rc)) {
        /* request is ready: fire the first progress step */
        rc = coll_request->progress_fn(iboffload_module, coll_request);

        IBOFFLOAD_VERBOSE(10, ("mca_bcol_iboffload_bcast_scatter_allgather_extra_intra was started [%d]\n", rc));
    }

    return rc;
}
Example #10
0
/*
 * Set up the small-buffer sends of a binomial scatter.
 *
 * Starting with the mask 1 << radix_mask_pow and halving it down to 1, one
 * send of send_size bytes is set up towards the peer my_group_index ^ mask.
 * A negative radix_mask_pow sets up no sends at all.
 *
 * Returns OMPI_SUCCESS, or the error code of the first send setup that
 * failed.
 */
static inline __opal_attribute_always_inline__ int
binomial_scatter_smsg(
        mca_bcol_iboffload_module_t *iboffload_module,
        mca_bcol_iboffload_collfrag_t *coll_fragment,
        struct mqe_task **last_send,
        int radix_mask_pow,
        uint32_t my_group_index,
        size_t send_size
        )
{
    int rc, dst, radix_mask;

    if (radix_mask_pow < 0) {
        return OMPI_SUCCESS;
    }

    /* One send per tree level, from the highest mask bit downwards */
    for (radix_mask = 1 << radix_mask_pow; radix_mask > 0; radix_mask >>= 1) {
        dst = my_group_index ^ radix_mask;

        rc = mca_bcol_iboffload_send_small_buff_setup(
                last_send, send_size, dst,
                iboffload_module, coll_fragment);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
            IBOFFLOAD_VERBOSE(10, ("Failed to isend data"));
            return rc;
        }
    }

    return OMPI_SUCCESS;
}
/*
 * Construct and initialize the fragment free list of one QP on a device.
 *
 * The list lives in device->frags_free[qp_index]; the payload length of its
 * fragments is the size configured in cm->qp_infos[qp_index].
 *
 * Returns OMPI_SUCCESS, or OMPI_ERROR if the free list could not be
 * initialized.
 */
static int mca_bcol_iboffload_alloc_reg_qp_resource(int qp_index, mca_bcol_iboffload_device_t *device)
{
    mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component;
    ompi_free_list_t *free_list = &device->frags_free[qp_index];
    int frag_len, rc;

    OBJ_CONSTRUCT(free_list, ompi_free_list_t);

    frag_len = cm->qp_infos[qp_index].size;
    IBOFFLOAD_VERBOSE(10, ("free list len %d\n", frag_len));

    rc = ompi_free_list_init_ex_new(free_list,
                                    sizeof(mca_bcol_iboffload_frag_t),
                                    MCA_IBOFFLOAD_CACHE_LINE_SIZE,
                                    OBJ_CLASS(mca_bcol_iboffload_frag_t),
                                    frag_len, cm->buffer_alignment,
                                    cm->free_list_num,
                                    cm->free_list_max,
                                    cm->free_list_inc,
                                    device->mpool,
                                    mca_bcol_iboffload_frag_init,
                                    (void *) &cm->qp_infos[qp_index].qp_index);
    if (OMPI_SUCCESS == rc) {
        return OMPI_SUCCESS;
    }

    IBOFFLOAD_ERROR(("Failed to allocate frags_free"));

    return OMPI_ERROR;
}
/*
 * Finish a collective fragment if COLLFRAG_IS_DONE() reports it as done.
 *
 * coll_frag     - the fragment a completion was just accounted to.
 * coll_request  - the full collective request the fragment belongs to.
 * device        - device whose frags_free lists receive the freed fragments.
 *
 * When the fragment is done: the request's n_frag_net_complete counter is
 * incremented, the fragment's task/frag resources are freed, its MQ credits
 * are returned to the module, the fragment is released, pending fragments
 * are progressed, and finally the whole request is released if
 * COLLREQ_IS_DONE() holds.  When the fragment is not done nothing happens.
 *
 * Returns 0 on success (including the "not done yet" case) and -1 if freeing
 * the task resources failed (after calling fatal_error).
 */
static inline __opal_attribute_always_inline__ int
handle_collfrag_done(mca_bcol_iboffload_collfrag_t *coll_frag,
                     mca_bcol_iboffload_collreq_t *coll_request,
                     mca_bcol_iboffload_device_t *device)
{
    int rc;

    if (COLLFRAG_IS_DONE(coll_frag)) {
        IBOFFLOAD_VERBOSE(10, ("Coll frag - %p already done.\n", coll_frag));

        /* one more fragment finished from the network point of view */
        coll_request->n_frag_net_complete++;
        IBOFFLOAD_VERBOSE(10, ("Free tasks resourse.\n"));
        /* Check if we are done with this coll_frag and release resources if so.  */
        rc = mca_bcol_iboffload_free_tasks_frags_resources(coll_frag, device->frags_free);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
            IBOFFLOAD_ERROR(("mca_bcol_iboffload_free_tasks_frags_resources FAILED"));
            fatal_error("Failed to mca_bcol_iboffload_free_tasks_frags_resources\n");
            return -1;
        }

        /* give the MQ credits this fragment consumed back to the module */
        BCOL_IBOFFLOAD_MQ_RETURN_CREDITS(coll_request->module, coll_frag->mq_index, coll_frag->mq_credits);

        RELEASE_COLLFRAG(coll_frag);

        /* NOTE(review): coll_frag is still used below (here and for
         * coll_frag->alg in the log message) after RELEASE_COLLFRAG -
         * confirm the macro leaves the fragment memory valid. */
        PROGRESS_PENDING_COLLFRAG(coll_frag);

        IBOFFLOAD_VERBOSE(10, ("Alg %d: user_handle_freed - %d, n_frag_mpi_complete - %d, "
                               "n_fragments- %d, n_frag_net_complete - %d, n_fragments - %d.\n",
                               coll_frag->alg,
                               coll_request->user_handle_freed,
                               coll_request->n_frag_mpi_complete,
                               coll_request->n_fragments,
                               coll_request->n_frag_net_complete,
                               coll_request->n_fragments));

        /* check for full message completion */
        if (COLLREQ_IS_DONE(coll_request)) {
            IBOFFLOAD_VERBOSE(10, ("Coll request already done.\n"));
            RELEASE_COLLREQ(coll_request);
        }
    }

    IBOFFLOAD_VERBOSE(10, ("Exit with success.\n"));

    return 0;
}
Example #13
0
/*
 * Register the iboffload Bcast functions with the bcol base.
 *
 * Two message ranges are registered, in this order:
 *   - 0 .. 20000            : small message bcast
 *   - 10000000 .. 10485760  : zero copy scatter-allgather bcast
 * Ranks whose group index is below power_of_2_ranks get the regular start
 * functions, all other ranks get the "extra" variants.  The progress
 * functions are the same for both kinds of ranks.
 *
 * Always returns OMPI_SUCCESS.
 */
int mca_bcol_iboffload_bcast_register(mca_bcol_base_module_t *super)
{
    mca_bcol_iboffload_module_t *iboffload_module =
                            (mca_bcol_iboffload_module_t *) super;

    int my_group_index = iboffload_module->ibnet->super.my_index;
    bool in_pow2_group;

    mca_bcol_base_coll_fn_comm_attributes_t comm_attribs;
    mca_bcol_base_coll_fn_invoke_attributes_t inv_attribs;

    IBOFFLOAD_VERBOSE(10, ("Register iboffload Bcast.\n"));

    /* communicator level attributes, shared by both registrations */
    comm_attribs.bcoll_type        = BCOL_BCAST;
    comm_attribs.comm_size_min     = 0;
    comm_attribs.comm_size_max     = 1024 * 1024;
    comm_attribs.waiting_semantics = NON_BLOCKING;
    comm_attribs.data_src          = DATA_SRC_KNOWN;

    inv_attribs.datatype_bitmap = 0xffffffff;
    inv_attribs.op_types_bitmap = 0xffffffff;

    in_pow2_group = (my_group_index < iboffload_module->power_of_2_ranks);

    /* range 1: small messages */
    inv_attribs.bcol_msg_min = 0;
    inv_attribs.bcol_msg_max = 20000;

    if (in_pow2_group) {
        mca_bcol_base_set_attributes(super,
                &comm_attribs, &inv_attribs,
                mca_bcol_iboffload_small_msg_bcast_intra,
                mca_bcol_iboffload_small_msg_bcast_progress);
    } else {
        mca_bcol_base_set_attributes(super,
                &comm_attribs, &inv_attribs,
                mca_bcol_iboffload_small_msg_bcast_extra_intra,
                mca_bcol_iboffload_small_msg_bcast_progress);
    }

    /* range 4: large messages, zero copy */
    inv_attribs.bcol_msg_min = 10000000;
    inv_attribs.bcol_msg_max = 10485760;

    if (in_pow2_group) {
        mca_bcol_base_set_attributes(super,
                &comm_attribs, &inv_attribs,
                mca_bcol_iboffload_bcast_scatter_allgather_intra,
                mca_bcol_iboffload_zero_copy_progress);
    } else {
        mca_bcol_base_set_attributes(super,
                &comm_attribs, &inv_attribs,
                mca_bcol_iboffload_bcast_scatter_allgather_extra_intra,
                mca_bcol_iboffload_zero_copy_progress);
    }

    return OMPI_SUCCESS;
}
/************************************************************************
 ************************ New style Fan-In ******************************
 ***********************************************************************/
/*
 * Progress function of the new style Fan-In.
 *
 * Returns BCOL_FN_STARTED while BCOL_IS_COMPLETED() does not hold for the
 * request kept in input_args->bcol_opaque_data.  Once it holds, the user
 * handle is marked as freed, the request is released if COLLREQ_IS_DONE()
 * says so, and BCOL_FN_COMPLETE is returned.
 */
static int mca_bcol_iboffload_new_style_fanin_progress(
                        bcol_function_args_t *input_args,
                        struct coll_ml_function_t *const_args)
{
    mca_bcol_iboffload_collreq_t *coll_request =
        (mca_bcol_iboffload_collreq_t *) input_args->bcol_opaque_data;

    if (!(BCOL_IS_COMPLETED(coll_request))) {
        /* still running */
        return BCOL_FN_STARTED;
    }

    coll_request->user_handle_freed = true;

    if (COLLREQ_IS_DONE(coll_request)) {
        IBOFFLOAD_VERBOSE(10, ("Coll request already done.\n"));
        RELEASE_COLLREQ(coll_request);
    }

    IBOFFLOAD_VERBOSE(10, ("Fan-In already done.\n"));

    return BCOL_FN_COMPLETE;
}
/*
 * Create a device object for every IB HCA in the verbs device list.
 *
 * The device list is stored in cm->ib_devs and its length in cm->num_devs.
 * For each entry a mca_bcol_iboffload_device_t is created and stored in
 * cm->devices at the same index as in the device list.  Entries for which
 * the object cannot be created or stored are skipped, leaving that slot
 * empty.
 *
 * Returns OMPI_SUCCESS if at least one device object was stored, OMPI_ERROR
 * if no IB device was found or none could be stored.  The device list is not
 * freed here on failure; it stays in cm->ib_devs.
 */
static int iboffload_load_devices(void)
{
    int num_devs = 0, num_loaded = 0, i;
    mca_bcol_iboffload_device_t *device = NULL;
    mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component;

    IBOFFLOAD_VERBOSE(10, ("Entering to iboffload_load_devices"));

    /* Get list of devices */
    cm->ib_devs = ompi_ibv_get_device_list(&num_devs);
    if (0 == num_devs || NULL == cm->ib_devs) {
        IBOFFLOAD_ERROR(("No IB devices found"));
        /* No hca error*/
        orte_show_help("help-mpi-btl-openib.txt", "no-nics", true);
        return OMPI_ERROR;
    }

    cm->num_devs = num_devs;

    for (i = 0; i < num_devs; i++) {
        device = OBJ_NEW(mca_bcol_iboffload_device_t);
        if (NULL == device) {
            continue;
        }

        device->dev.ib_dev = cm->ib_devs[i];

        /* Do not keep (or count) a device the array refused to store */
        if (OMPI_SUCCESS !=
                opal_pointer_array_set_item(&cm->devices, i, (void *) device)) {
            OBJ_RELEASE(device);
            continue;
        }

        ++num_loaded;

        IBOFFLOAD_VERBOSE(10, ("Device %s with index %d was appended.\n",
                               ibv_get_device_name(cm->ib_devs[i]), i));
    }

    /* Count the stored devices ourselves: the size of the pointer array is
     * its slot capacity and says nothing about how many slots are in use. */
    if (0 == num_loaded) {
        /* No relevant devices were found, return error */
        IBOFFLOAD_ERROR(("No active devices found.\n"));

        return OMPI_ERROR;
    }

    return OMPI_SUCCESS;
}
/*
 * Close the component.
 *
 * Destructs the component free lists (only if init_done is set),
 * unregisters the component progress function, releases the devices, frees
 * the receive_queues string and destructs the recv_wrs lock.
 *
 * A failure to unregister the progress function is only logged.  If
 * releasing the devices fails, its error code is returned and the remaining
 * cleanup is skipped.  Otherwise OMPI_SUCCESS is returned.
 */
static int iboffload_close(void)
{
    mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component;
    int rc;

    IBOFFLOAD_VERBOSE(10, ("Destroy component free lists.\n"));

    if (true == cm->init_done) {
        OBJ_DESTRUCT(&cm->tasks_free);
        OBJ_DESTRUCT(&cm->collreqs_free);
        OBJ_DESTRUCT(&cm->collfrags_free);
        OBJ_DESTRUCT(&cm->calc_tasks_free);
    }

    /* Drop the progress callback; failing to do so is not fatal */
    if (OMPI_SUCCESS !=
            opal_progress_unregister(mca_bcol_iboffload_component_progress)) {
        IBOFFLOAD_ERROR(("Failed to unregister the progress function"
                         " for iboffload component.\n"));
    }

    rc = iboffload_release_devices();
    if (OMPI_SUCCESS == rc) {
        /* free(NULL) is a no-op, so no guard is needed */
        free(cm->receive_queues);

        OBJ_DESTRUCT(&cm->recv_wrs.lock);

        IBOFFLOAD_VERBOSE(10, ("The component closed.\n"));
    }

    return rc;
}
/*
 * Open the component.
 *
 * Sets the component defaults, constructs the recv_wrs lock and the device
 * pointer array, registers the MCA parameters and the component progress
 * function, and builds the OMPI -> IB datatype and op maps.
 *
 * Returns OMPI_SUCCESS on success.  If initializing the device array,
 * registering the parameters or registering the progress function fails,
 * the device array and the lock are destructed again and the failing
 * step's error code is returned.
 */
static int iboffload_open(void)
{
    mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component;
    int rc;

    IBOFFLOAD_VERBOSE(10, ("Open Iboffload component.\n"));

    cm->super.priority = 100;
    cm->super.n_net_contexts = 0;
    cm->super.network_contexts = NULL;

    OBJ_CONSTRUCT(&cm->recv_wrs.lock, opal_mutex_t);

    /* device container */
    OBJ_CONSTRUCT(&cm->devices, opal_pointer_array_t);
    rc = opal_pointer_array_init(&cm->devices, 10, INT_MAX, 10);

    /* MCA parameters */
    if (OMPI_SUCCESS == rc) {
        rc = mca_bcol_iboffload_register_params();
    }

    /* component progress function */
    if (OMPI_SUCCESS == rc) {
        rc = opal_progress_register(mca_bcol_iboffload_component_progress);
        if (OMPI_SUCCESS != rc) {
            IBOFFLOAD_ERROR(("Failed to register the progress function"
                             " for iboffload component.\n"));
        }
    }

    if (OMPI_SUCCESS != rc) {
        /* undo the two constructions done above */
        OBJ_DESTRUCT(&cm->devices);
        OBJ_DESTRUCT(&cm->recv_wrs.lock);
        return rc;
    }

    map_ompi_to_ib_dtype();
    map_ompi_to_ib_op_type();

    /* init_done becomes true on first component usage */
    cm->init_done = false;

    return OMPI_SUCCESS;
}
/*
 * Poll the completion queues of one device and process what was found.
 *
 * The MQ completion queue (device->ib_mq_cq) is polled first.  If it yields
 * at least one entry, MQ completions are drained one by one and the function
 * returns without looking at the send completion queue.  Only when the first
 * MQ poll returns nothing is the send completion queue (device->ib_cq)
 * drained.
 *
 * The work request id of every completion is interpreted as a pointer to the
 * mca_bcol_iboffload_collfrag_t it belongs to.
 *
 * Returns the number of completions that were successfully polled.  On a
 * poll failure, a completion with error status, a failing completion
 * callback or a failing handle_collfrag_done(), the function returns
 * immediately with the count accumulated so far (fatal_error is called first
 * in all of these cases except the handle_collfrag_done() one, which reports
 * its own failure).
 */
static int progress_one_device(mca_bcol_iboffload_device_t *device)
{
    int ne, rc, count = 0;

    mca_bcol_iboffload_collfrag_t *coll_frag;
    mca_bcol_iboffload_collreq_t *coll_request;

    struct ibv_wc wc;
    memset(&wc, 0, sizeof(struct ibv_wc));

    /*
     * poll for collective completion - does not mean resources can
     * be freed, as incomplete network level sends may still be pending
     */

    /* Poll for completion on completion on wait MQEs */
    if(0 != (ne = ibv_poll_cq(device->ib_mq_cq, 1, &wc))) {
        do {
            /* a negative value means the poll itself failed */
            if (OPAL_UNLIKELY(0 > ne)) {
                IBOFFLOAD_ERROR(("Device %s: "
                                 "failed to poll MQ completion queue\n",
                                 ibv_get_device_name(device->dev.ib_dev)));
                fatal_error("failed to poll MQ completion queue\n");
                return count;
            }

            /* NOTE(review): wc.wr_id is printed with %x here - confirm the
             * specifier matches the field's width. */
            if (OPAL_UNLIKELY(IBV_WC_SUCCESS != wc.status)) {
                IBOFFLOAD_ERROR(("Device %s: "
                                 "the completion with error on wait was gotten, status %d, opcode %d, "
                                 "vendor_err 0x%x, qp %x, id 0x%x\n", ibv_get_device_name(device->dev.ib_dev),
                                 wc.status, wc.opcode, wc.vendor_err, wc.qp_num, wc.wr_id));
                fatal_error("wc.status \n");
                return count;
            }

            IBOFFLOAD_VERBOSE(10, ("The MQ completion was polled.\n"));

            ++count;

            /* get pointer to mca_bcol_iboffload_collfrag_t */
            coll_frag = (mca_bcol_iboffload_collfrag_t*)
                        (uint64_t) (uintptr_t) wc.wr_id;

            /* Only last MQ task of collective frag
               sends completion signal, so if we got it =>
               all MQEs were done. */
            coll_frag->complete = true;

            IBOFFLOAD_VERBOSE(10, ("MQ completion for algorithm %d coll_frag_addr %p ml buffer index %d",
                                   coll_frag->alg, (void *)coll_frag, coll_frag->coll_full_req->ml_buffer_index));

            /* full request descriptor */
            coll_request = coll_frag->coll_full_req;

            coll_request->n_frag_mpi_complete++;

            /*
             * at this stage all receives have been completed, so
             * unpack the data to user buffer, the resources will be released when we will done with all
             * element in the task list
             */

            /* the per-request completion callback is optional */
            if (NULL != coll_request->completion_cb_fn) {
                if (OMPI_SUCCESS !=
                        coll_request->completion_cb_fn(coll_frag)) {
                    fatal_error("coll_request->completion_cb_fn\n");
                    return count;
                }
            }

            /* every fragment is MPI-complete: mark the request complete and
             * signal the request condition variable */
            if (coll_request->n_frag_mpi_complete ==
                    coll_request->n_fragments) {
                coll_request->super.req_complete = true;
                opal_condition_broadcast(&ompi_request_cond);
                IBOFFLOAD_VERBOSE(10, ("After opal_condition_broadcast.\n"));
            }

            rc = handle_collfrag_done(coll_frag, coll_request, device);
            if (0 != rc) {
                return count;
            }
        } while(0 != (ne = ibv_poll_cq(device->ib_mq_cq, 1, &wc)));

        /* MQ completions were handled; the send CQ is left for a later call */
        return count;
    }

    /* poll the send completion queue */
    do {
        ne = ibv_poll_cq(device->ib_cq, 1, &wc);
        if (0 < ne) {
            if (OPAL_UNLIKELY(IBV_WC_SUCCESS != wc.status)) {
                IBOFFLOAD_ERROR(("Device %s, "
                                 "the completion with error on send was gotten, status %d, opcode %d, "
                                 "vendor_err 0x%x, qp %x, id 0x%x\n", ibv_get_device_name(device->dev.ib_dev),
                                 wc.status, wc.opcode, wc.vendor_err, wc.qp_num, wc.wr_id));

#if OPAL_ENABLE_DEBUG
                /* Debug builds only: search the module's endpoints for the
                 * QP whose number matches the failed completion and log it */
                {
                    mca_bcol_iboffload_module_t *iboffload;
                    int i, qp_index, num_qps = mca_bcol_iboffload_component.num_qps;

                    coll_frag = (mca_bcol_iboffload_collfrag_t*)
                                (uint64_t) (uintptr_t) wc.wr_id;

                    iboffload = coll_frag->coll_full_req->module;

                    for (i = 0; i < iboffload->num_endpoints; ++i) {
                        mca_bcol_iboffload_endpoint_t *ep = iboffload->endpoints[i];

                        for (qp_index = 0; qp_index < num_qps; ++qp_index) {
                            if (NULL != ep->qps[qp_index].qp->lcl_qp &&
                                    wc.qp_num == ep->qps[qp_index].qp->lcl_qp->qp_num) {
                                IBOFFLOAD_ERROR(("Module - %p, coll_frag - %p, "
                                                 "destination %d, qp index - %d.",
                                                 iboffload, coll_frag, i, qp_index));
                            }
                        }
                    }
                }
#endif
                fatal_error("Failed to ibv_poll_cq\n");
                return count;
            }

            ++count;

            /* get pointer to mca_bcol_iboffload_collfrag_t */
            coll_frag = (mca_bcol_iboffload_collfrag_t*)
                        (uint64_t) (uintptr_t) wc.wr_id;

            /* update the number of completed sends */
            coll_frag->n_sends_completed++;

            IBOFFLOAD_VERBOSE(10, ("Send CQ completion for algorithm %d coll_frag_addr %p ml buffer index %d",
                                   coll_frag->alg, (void *)coll_frag, coll_frag->coll_full_req->ml_buffer_index));

            IBOFFLOAD_VERBOSE(10, ("Alg %d coll_frag_addr %p: n_sends_completed - %d, n_sends - %d.\n",
                                   coll_frag->alg, (void *)coll_frag,
                                   coll_frag->n_sends_completed,
                                   coll_frag->n_sends));

            /* more completions than posted sends would be a bookkeeping bug */
            assert(coll_frag->n_sends_completed <= coll_frag->n_sends);

            /* full message descriptor */
            coll_request = coll_frag->coll_full_req;

            /* check to see if all sends are complete from the network
             * perspective */
            rc = handle_collfrag_done(coll_frag, coll_request, device);
            if (0 != rc) {
                return count;
            }
        } else if (OPAL_UNLIKELY(0 > ne)) {
            IBOFFLOAD_ERROR(("Device %s: "
                             "failed to poll send completion queue\n",
                             ibv_get_device_name(device->dev.ib_dev)));
            fatal_error("failed to poll send completion queue\n");
            return count;
        }
    } while (0 != ne); /* stop once the send CQ has nothing more to report */

    return count;
}
/*
 * Parse the component's receive_queues string and fill in qp_infos.
 *
 * receive_queues is a ':' separated list of queue specifications, each a
 * ',' separated list of values whose first field selects the type:
 *   "P" - per-peer QP : P,size,rd_num[,rd_low[,rd_win[,rd_rsv]]]
 *   "S" - SRQ QP      : S,size,rd_num[,rd_low[,sd_max]]
 *   "X" - XRC QP      : only accepted when HAVE_XRC is set; parsed with the
 *                       same fields as "S"
 *
 * A first pass validates the type prefixes, a second pass parses the
 * numeric fields.  The values parsed from the LAST specification are then
 * applied to all MCA_BCOL_IBOFFLOAD_QP_LAST entries of qp_infos, and the
 * per-QP setup function (if any) is called for each entry.
 *
 * Returns OMPI_SUCCESS, OMPI_ERROR when no queue is specified,
 * OMPI_ERR_NOT_AVAILABLE for an XRC queue without XRC support, and
 * OMPI_ERR_BAD_PARAM for a malformed specification.
 */
static int setup_qps(void)
{
    int ret = OMPI_SUCCESS, qp = 0;
    int rd_num = 0, rd_low = 0, size = 0,
        rd_win = 0, rd_rsv = 0, sd_max = 0;

    mca_bcol_iboffload_qp_type_t type = 0;

    char **queues = NULL, **params = NULL;

    queues = opal_argv_split(mca_bcol_iboffload_component.receive_queues, ':');
    if (0 == opal_argv_count(queues)) {
        orte_show_help("help-mpi-btl-openib.txt",
                       "no qps in receive_queues", true,
                       orte_process_info.nodename,
                       mca_bcol_iboffload_component.receive_queues);

        ret = OMPI_ERROR;

        goto exit;
    }

    /* First pass: every specification must start with a known type prefix */
    while (queues[qp] != NULL) {
        if (0 == strncmp("P,", queues[qp], 2)) {
            type = MCA_BCOL_IBOFFLOAD_PP_QP;
        } else if (0 == strncmp("S,", queues[qp], 2)) {
            type = MCA_BCOL_IBOFFLOAD_SRQ_QP;
        } else if (0 == strncmp("X,", queues[qp], 2)) {
#if HAVE_XRC
            type = MCA_BCOL_IBOFFLOAD_XRC_QP;
#else
            orte_show_help("help-mpi-btl-openib.txt", "No XRC support", true,
                           orte_process_info.nodename,
                           mca_bcol_iboffload_component.receive_queues);
            ret = OMPI_ERR_NOT_AVAILABLE;
            goto exit;
#endif
        } else {
            orte_show_help("help-mpi-btl-openib.txt",
                           "invalid qp type in receive_queues", true,
                           orte_process_info.nodename,
                           mca_bcol_iboffload_component.receive_queues,
                           queues[qp]);

            ret = OMPI_ERR_BAD_PARAM;

            goto exit;
        }

        ++qp;
    }

    mca_bcol_iboffload_component.num_qps = MCA_BCOL_IBOFFLOAD_QP_LAST;

    /* Second pass: parse the numeric fields of every specification */
    qp = 0;
/* N-th field of the current specification, or NULL if it was not given */
#define P(N) (((N) > count) ? NULL : params[(N)])
    while (NULL != queues[qp]) {
        int count;

        params = opal_argv_split_with_empty(queues[qp], ',');
        count = opal_argv_count(params);

        if ('P' == params[0][0]) {
            if (count < 3 || count > 6) {
                orte_show_help("help-mpi-btl-openib.txt",
                               "invalid pp qp specification", true,
                               orte_process_info.nodename, queues[qp]);

                ret = OMPI_ERR_BAD_PARAM;

                goto exit;
            }

            size = atoi_param(P(1), 0);

            rd_num = atoi_param(P(2), 256);

            /* by default set rd_low to be 3/4 of rd_num */
            rd_low = atoi_param(P(3), rd_num - (rd_num / 4));
            rd_win = atoi_param(P(4), (rd_num - rd_low) * 2);

            /* rd_win is 0 when it is given as 0 or when rd_low equals
             * rd_num; the default expression must not divide by it */
            rd_rsv = atoi_param(P(5),
                                (0 != rd_win) ? (rd_num * 2) / rd_win : 0);


            if ((rd_num - rd_low) > rd_win) {
                orte_show_help("help-mpi-btl-openib.txt", "non optimal rd_win",
                               true, rd_win, rd_num - rd_low);
            }
        } else {
            if (count < 3 || count > 5) {
                orte_show_help("help-mpi-btl-openib.txt",
                               "invalid srq specification", true,
                               orte_process_info.nodename, queues[qp]);

                ret = OMPI_ERR_BAD_PARAM;

                goto exit;
            }

            size = atoi_param(P(1), 0);
            rd_num = atoi_param(P(2), 256);

            /* by default set rd_low to be 3/4 of rd_num */
            rd_low = atoi_param(P(3), rd_num - (rd_num / 4));
            sd_max = atoi_param(P(4), rd_low / 4);

            IBOFFLOAD_VERBOSE(10, ("srq: rd_num is %d rd_low is %d sd_max is %d",
                                   rd_num, rd_low, sd_max));

        }

        if (rd_num <= rd_low) {
            orte_show_help("help-mpi-btl-openib.txt", "rd_num must be > rd_low",
                           true, orte_process_info.nodename, queues[qp]);
            ret = OMPI_ERR_BAD_PARAM;

            goto exit;
        }

        opal_argv_free(params);

        ++qp;
    }

    /* already freed inside the loop; keep the exit path from freeing twice */
    params = NULL;

    /* Apply the values of the last specification to every QP */
    for (qp = 0; qp < MCA_BCOL_IBOFFLOAD_QP_LAST; ++qp) {
        mca_bcol_iboffload_component.qp_infos[qp].qp_index = qp;

        mca_bcol_iboffload_component.qp_infos[qp].type = type;
        mca_bcol_iboffload_component.qp_infos[qp].size = size;

        mca_bcol_iboffload_component.qp_infos[qp].rd_num = rd_num;
        mca_bcol_iboffload_component.qp_infos[qp].rd_low = rd_low;

        mca_bcol_iboffload_component.qp_infos[qp].rd_pp_win = rd_num - rd_low;

        if (MCA_BCOL_IBOFFLOAD_PP_QP == type) {
            mca_bcol_iboffload_component.qp_infos[qp].u.pp_qp.rd_win = rd_win;
            mca_bcol_iboffload_component.qp_infos[qp].u.pp_qp.rd_rsv = rd_rsv;
        } else {
            mca_bcol_iboffload_component.qp_infos[qp].u.srq_qp.sd_max = sd_max;
        }

        /* optional per-QP customization hook */
        if (NULL != setup_qps_fn[qp]) {
            setup_qps_fn[qp](&mca_bcol_iboffload_component.qp_infos[qp]);
        }
    }

exit:
    if (NULL != params) {
        opal_argv_free(params);
    }

    if (NULL != queues) {
        opal_argv_free(queues);
    }

    return ret;
}
/*
 * Prepost receive work requests that all reference the dummy fragment
 * of the given QP (device->dummy_frags[qp_index]).
 *
 * The number of posted work requests is capped by the endpoint's counter
 * of free receive WQEs (rd_wqe). The work requests are taken from the tail
 * of the component-wide pre-allocated array
 * recv_wrs->recv_work_requests[qp_index] while holding recv_wrs->lock.
 *
 * endpoint       - endpoint whose local QP is posted to
 * qp_index       - index of the QP, of its WR array and of its dummy fragment
 * num_to_prepost - requested number of work requests to post
 *
 * Returns OMPI_SUCCESS when nothing had to be posted or when the post
 * succeeded, OMPI_ERROR when ibv_post_recv() fails.
 */
static int mca_bcol_iboffload_dummy_frag_qp_prepost(
                mca_bcol_iboffload_endpoint_t *endpoint,
                int qp_index, int num_to_prepost)
{
    struct ibv_recv_wr *recv_wr, *recv_bad;
    int ret, num_preposted = 0, start_wr_index;

    mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component;
    mca_bcol_iboffload_recv_wr_manager *recv_wrs = &cm->recv_wrs;

    IBOFFLOAD_VERBOSE(10, ("Recv prepost call: endpoint %p, to prepost %d",
                          (void *) endpoint, num_to_prepost));

    if (OPAL_UNLIKELY(0 == num_to_prepost)) {
        IBOFFLOAD_VERBOSE(10, ("num_to_prepost = 0, return immediate"));
        return OMPI_SUCCESS;
    }

    /* make sure that we do not overrun number of rd_wqe */
    if (num_to_prepost > endpoint->qps[qp_index].rd_wqe) {
        IBOFFLOAD_VERBOSE(10, ("Reset num_to_prepost = %d, to rd_wqe = %d",
                                num_to_prepost, endpoint->qps[qp_index].rd_wqe));

        num_to_prepost = endpoint->qps[qp_index].rd_wqe;
    }

    OPAL_THREAD_LOCK(&recv_wrs->lock);

    /* calculate start index in array
     * of pre-allocated work requests */
    start_wr_index = cm->qp_infos[qp_index].rd_num - num_to_prepost;
    recv_wr = &recv_wrs->recv_work_requests[qp_index][start_wr_index];

    IBOFFLOAD_VERBOSE(10, ("Endpoint %p, qp_index - %d, to_porepost %d, "
                           "start index of WRs - %d, rd_wqe - %d",
                           (void *) endpoint, qp_index, num_to_prepost,
                            start_wr_index, endpoint->qps[qp_index].rd_wqe));

    while (num_preposted < num_to_prepost) {
        /* prepost the special barrier frag to recv queue */
        struct ibv_sge *dummy_sg_entry =
                    &endpoint->iboffload_module->device->dummy_frags[qp_index].sg_entry;

        recv_wr[num_preposted].sg_list = dummy_sg_entry;
        ++num_preposted;
    }

    if (OPAL_LIKELY(num_preposted > 0)) {
        /* Set the tail: terminate the chain after the last WR to be posted */
        recv_wr[num_preposted - 1].next = NULL;

        /* post the list of recvs */
        ret = ibv_post_recv(endpoint->qps[qp_index].qp->lcl_qp, recv_wr, &recv_bad);
        if (OPAL_UNLIKELY(0 != ret)) {
            IBOFFLOAD_ERROR(("ibv_post_recv failed, error: %s [%d], "
                             "qp_index - %d.\n", strerror(errno), ret, qp_index));

            /* The WR manager is shared by the whole component: never
             * leave it locked when bailing out on the error path. */
            OPAL_THREAD_UNLOCK(&recv_wrs->lock);

            return OMPI_ERROR;
        }

        /* recover last recv_wr if needed */
        if (OPAL_UNLIKELY(num_to_prepost != num_preposted)) {
            recv_wr[num_preposted - 1].next = &recv_wr[num_preposted];
        }

        /* decreasing the number of free recv wqe */
        endpoint->qps[qp_index].rd_wqe -= num_preposted;
    }

    OPAL_THREAD_UNLOCK(&recv_wrs->lock);

    IBOFFLOAD_VERBOSE(10, ("Endpoint %p, to_porepost %d, num preposted - %d, qp_index - %d",
                          (void *) endpoint, num_to_prepost, num_preposted, qp_index));

    return OMPI_SUCCESS;
}
/*
 * Leader side of the fan-in: build one wait task per non-leader rank of
 * the subgroup (ranks leader_rank + 1 .. sbgp_size - 1), chain them on the
 * last collective fragment of the request, mark the last wait task as
 * signaled and post the chain to the MQ.
 *
 * iboffload    - module the collective runs on
 * coll_request - collective request; its last work request (collfrag)
 *                receives the task list
 *
 * Returns OMPI_SUCCESS when the tasks were posted, the error code of
 * mca_bcol_iboffload_post_mqe_tasks() when posting fails, OMPI_ERROR when
 * no wait task could be built (no non-leader rank in the subgroup), or the
 * result of mca_bcol_iboffload_free_resources_and_move_to_pending() when
 * MQ credits, preposted receive fragments or wait tasks are not available.
 */
static int mca_bcol_iboffload_fanin_leader_progress(
                mca_bcol_iboffload_module_t *iboffload,
                struct mca_bcol_iboffload_collreq_t *coll_request)
{
    int rc = OMPI_SUCCESS, leader_rank = 0, rank,
        sbgp_size = iboffload->ibnet->super.group_size;

    struct mqe_task *last_wait = NULL;

    mca_bcol_iboffload_task_t *wait_task = NULL;
    mca_bcol_iboffload_frag_t *preposted_recv_frag = NULL;

    struct mqe_task **mqe_ptr_to_set;
    mca_bcol_iboffload_collfrag_t *coll_fragment;

    coll_fragment = (mca_bcol_iboffload_collfrag_t *)
                         opal_list_get_last(&coll_request->work_requests);

    mqe_ptr_to_set = &coll_fragment->to_post;

    if (OPAL_UNLIKELY(false == BCOL_IBOFFLOAD_MQ_HAVE_CREDITS(
               iboffload, coll_fragment->mq_index, coll_fragment->mq_credits))) {
        IBOFFLOAD_VERBOSE(10, ("There are not enough credits on MQ.\n"));
        goto out_of_resources;
    }

    for (rank = leader_rank + 1; rank < sbgp_size; ++rank) {
        /* post wait */
        preposted_recv_frag = mca_bcol_iboffload_get_preposted_recv_frag(
                                        iboffload, rank, coll_request->qp_index);
        if(NULL == preposted_recv_frag) {
            IBOFFLOAD_VERBOSE(10, ("Failing for getting prepost recv frag.\n"));
            goto out_of_resources;
        }

        wait_task = mca_bcol_iboffload_get_wait_task(iboffload, rank, 1,
                             preposted_recv_frag, coll_request->qp_index, NULL);
        if(NULL == wait_task) {
            IBOFFLOAD_VERBOSE(10, ("Failing for getting wait task.\n"));
            goto out_of_resources;
        }

        APPEND_TO_TASKLIST(mqe_ptr_to_set, wait_task, last_wait);
        MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(coll_fragment->task_next, wait_task);
    }

    /* end of list */
    *mqe_ptr_to_set = NULL;

    /* The loop above never runs when the subgroup has no rank besides the
     * leader; last_wait is then still NULL and must not be dereferenced. */
    if (OPAL_UNLIKELY(NULL == last_wait)) {
        IBOFFLOAD_VERBOSE(10, ("Fan-in leader has no wait task to post, "
                               "subgroup size - %d.\n", sbgp_size));
        return OMPI_ERROR;
    }

    /* Only the last wait task generates a completion ... */
    last_wait->flags |= MQE_WR_FLAG_SIGNAL;

    /* ... and its wr_id is replaced by the collfrag pointer; the original
     * wr_id is kept in the collfrag. */
    coll_fragment->signal_task_wr_id = last_wait->wr_id;
    last_wait->wr_id = (uint64_t) (uintptr_t) coll_fragment;

    /* post the mwr */
    rc = mca_bcol_iboffload_post_mqe_tasks(iboffload, coll_fragment->to_post);
    if(OMPI_SUCCESS != rc) {
        IBOFFLOAD_VERBOSE(10, ("MQE task posting failing.\n"));
        /* Note: need to clean up */
        return rc;
    }

    MCA_BCOL_UPDATE_ORDER_COUNTER(&iboffload->super, coll_request->order_info);

    return OMPI_SUCCESS;

out_of_resources:
    /* Release all resources */
    IBOFFLOAD_VERBOSE(10, ("Fan-in, adding collfrag to collfrag_pending"));
    return mca_bcol_iboffload_free_resources_and_move_to_pending(coll_fragment, iboffload);
}
/*
 * Fill in the QP creation/connection attributes of one QP in qp_config and
 * initialize the matching per-endpoint QP cache entry (ep->qps[qp_index]).
 *
 * qp_index  - index of the QP inside qp_config and inside ep->qps
 * ep        - endpoint the QP belongs to
 * qp_config - configuration whose init_attr_mask, init_attr and attr
 *             entries for qp_index are written
 */
static void mca_bcol_iboffload_fillin_qp_attr(int qp_index,
                                   mca_bcol_iboffload_endpoint_t *ep,
                                   ompi_common_ofacm_base_qp_config_t *qp_config)
{
    uint32_t max_sge, *init_attr_mask =
                              &qp_config->init_attr_mask[qp_index];

    struct ibv_qp_attr *attr = &qp_config->attr[qp_index];
    struct ibv_qp_init_attr *init_attr = &qp_config->init_attr[qp_index];

    mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component;

    /* Set special init attributes mask */
    *init_attr_mask = IBV_M_QP_EXT_CLASS_1 |
                      IBV_M_QP_EXT_CLASS_2 |
                      IBV_M_QP_EXT_IGNORE_RQ_OVERFLOW;

    /* Set init attributes */
    init_attr->qp_type = IBV_QPT_RC;

    /* max_inline_data must not be left empty, so the component-wide value
     * is used.
     * TODO: copy max_inline_size() from ofacm to a common area and compute
     * the value per QP/device instead. */
    init_attr->cap.max_inline_data = (int32_t) cm->max_inline_data;

    /* We allocate SG list for some algorithms (Bruck's alltoall):
     * half of the group size, rounded up */
    max_sge = ep->iboffload_module->group_size / 2 +
                   ep->iboffload_module->group_size % 2;

    /* the number of SGEs must not exceed the device maximum */
    if (max_sge > (uint32_t)
                         ep->iboffload_module->device->ib_dev_attr.max_sge) {
        max_sge = (uint32_t) ep->iboffload_module->device->ib_dev_attr.max_sge;
    }

    init_attr->cap.max_send_sge = max_sge;
    init_attr->cap.max_recv_sge = max_sge;

    /* TODO: placeholder sizing, the values will be changed later */
    init_attr->cap.max_recv_wr  = (uint32_t) cm->cq_size;
    init_attr->cap.max_send_wr  = (uint32_t) cm->cq_size;

    /* Set attributes */

    /* NOTE: attr->pkey_index is not set here (open question in the
     * original code) */

    attr->port_num = ep->iboffload_module->port;
    /* the value will be changed later */
    attr->path_mtu = (uint32_t)cm->mtu;

    attr->max_dest_rd_atomic = cm->max_rdma_dst_ops;
    attr->min_rnr_timer = (uint32_t)cm->min_rnr_timer;

    attr->ah_attr.is_global = 0;
    attr->ah_attr.sl = (uint32_t)cm->service_level;
    /* NOTE: attr->ah_attr.src_path_bits is not set here (open question in
     * the original code: take it from struct mca_bcol_iboffload_port_t?) */
    attr->ah_attr.port_num = ep->iboffload_module->port;
    /* JMS to be filled in later dynamically */
    attr->ah_attr.static_rate = 0;

    /* RTS params */
    attr->timeout        = (uint32_t)cm->timeout;
    attr->retry_cnt      = (uint32_t)cm->retry_count;
    attr->rnr_retry      = (uint32_t)cm->rnr_retry;
    attr->max_rd_atomic  = (uint32_t)cm->max_rdma_dst_ops;

    /* Init for local mca_bcol_iboffload_endpoint_qp_t qps structure
     * that caches the qp information on endpoint */
    OBJ_CONSTRUCT(&ep->qps[qp_index].preposted_frags, opal_list_t);

    ep->qps[qp_index].ib_inline_max = cm->max_inline_data;

    /* TODO: there is no dedicated info for service qps yet, so both the
     * send and the receive WQE counters start from rd_num. Fix it later. */
    ep->qps[qp_index].sd_wqe = cm->qp_infos[qp_index].rd_num;
    ep->qps[qp_index].rd_wqe = cm->qp_infos[qp_index].rd_num;

    /* %p requires a void pointer argument */
    IBOFFLOAD_VERBOSE(10, ("ep - %p, qp index - %d, num of rd_wqe - %d.",
                           (void *) ep, qp_index, ep->qps[qp_index].rd_wqe));
}
/*
 * Receive prepost of regular fragments.
 *
 * Takes up to num_to_prepost fragments from the device free list of the
 * given QP, appends them to the endpoint's list of preposted fragments and
 * posts them to the endpoint's local QP. The number of posted work
 * requests is capped by the endpoint's counter of free receive WQEs
 * (rd_wqe) and by the number of fragments the free list can provide. The
 * work requests are taken from the component-wide pre-allocated array
 * recv_wrs->recv_work_requests[qp_index] while holding recv_wrs->lock.
 *
 * endpoint       - endpoint whose local QP is posted to
 * qp_index       - index of the QP, of its WR array and of its free list
 * num_to_prepost - requested number of fragments to post
 *
 * Return values:
 * OMPI_SUCCESS - nothing had to be posted, or the post succeeded
 *                (possibly with fewer fragments than requested)
 * OMPI_ERROR   - ibv_post_recv() failed; the fragments taken for this call
 *                were returned to the free list
 */
static int mca_bcol_iboffload_frag_reg_qp_prepost(
                mca_bcol_iboffload_endpoint_t *endpoint,
                int qp_index, int num_to_prepost)
{
    ompi_free_list_item_t *item;
    mca_bcol_iboffload_frag_t *frag;

    struct ibv_recv_wr *recv_wr, *recv_bad;
    int i, ret, num_preposted = 0, start_wr_index;

    mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component;
    mca_bcol_iboffload_device_t *device = endpoint->iboffload_module->device;

    opal_list_t *preposted = &(endpoint->qps[qp_index].preposted_frags);
    mca_bcol_iboffload_recv_wr_manager *recv_wrs = &cm->recv_wrs;

    IBOFFLOAD_VERBOSE(10, ("Recv prepost call: endpoint %p, to prepost %d",
                          (void *) endpoint, num_to_prepost));

    if (OPAL_UNLIKELY(0 == num_to_prepost)) {
        IBOFFLOAD_VERBOSE(10, ("num_to_prepost = 0, return immediate"));
        return OMPI_SUCCESS;
    }

    /* make sure that we do not overrun number of rd_wqe */
    if (num_to_prepost > endpoint->qps[qp_index].rd_wqe) {
        IBOFFLOAD_VERBOSE(10, ("Reset num_to_prepost = %d, to rd_wqe = %d",
                                num_to_prepost, endpoint->qps[qp_index].rd_wqe));

        num_to_prepost = endpoint->qps[qp_index].rd_wqe;
    }

    OPAL_THREAD_LOCK(&recv_wrs->lock);

    /* calculate start index in array
     * of pre-allocated work requests */
    start_wr_index = cm->qp_infos[qp_index].rd_num - num_to_prepost;
    recv_wr = &recv_wrs->recv_work_requests[qp_index][start_wr_index];

    IBOFFLOAD_VERBOSE(10, ("Endpoint %p, qp_index - %d, to_porepost %d, "
                           "start index of WRs - %d, rd_wqe - %d",
                           (void *) endpoint, qp_index, num_to_prepost,
                            start_wr_index, endpoint->qps[qp_index].rd_wqe));

    while (num_preposted < num_to_prepost) {
        /* put the item on list of preposted */
        OMPI_FREE_LIST_GET_MT(&device->frags_free[qp_index], item);
        if (OPAL_UNLIKELY(NULL == item)) {
            /* free list exhausted: post what we have so far */
            break;
        }

        frag = (mca_bcol_iboffload_frag_t *) item;
        opal_list_append(preposted, (opal_list_item_t *) item);

        recv_wr[num_preposted].sg_list = &frag->sg_entry;
        /* TODO: confirm that this is the right place to take the size value */
        frag->sg_entry.length = cm->qp_infos[qp_index].size;
        ++num_preposted;
    }

    if (OPAL_LIKELY(num_preposted > 0)) {
        /* Set the tail: terminate the chain after the last WR to be posted */
        recv_wr[num_preposted - 1].next = NULL;

        /* post the list of recvs */
        ret = ibv_post_recv(endpoint->qps[qp_index].qp->lcl_qp, recv_wr, &recv_bad);

        /* Recover the link of the last posted recv_wr if the chain was cut
         * short. This is done whether or not the post succeeded, because
         * the WR array is shared and must not stay truncated. */
        if (OPAL_UNLIKELY(num_to_prepost != num_preposted)) {
            recv_wr[num_preposted - 1].next = &recv_wr[num_preposted];
        }

        if (OPAL_UNLIKELY(0 != ret)) {
            IBOFFLOAD_ERROR(("ibv_post_recv failed (%s), error: %s [%d], "
                             "qp_index - %d.\n",
                              ibv_get_device_name(device->dev.ib_dev),
                              strerror(errno), ret, qp_index));

            /* Return allocated frags.
             * NOTE(review): work requests preceding recv_bad may already
             * have been posted by ibv_post_recv(); all fragments of this
             * call are returned regardless - confirm this is acceptable. */
            for (i = 0; i < num_preposted; i++) {
                OMPI_FREE_LIST_RETURN_MT(&device->frags_free[qp_index],
                        (ompi_free_list_item_t *)
                            opal_list_remove_last(preposted));
            }

            /* The WR manager is shared by the whole component: never
             * leave it locked when bailing out on the error path. */
            OPAL_THREAD_UNLOCK(&recv_wrs->lock);

            return OMPI_ERROR;
        }

        /* decreasing the number of free recv wqe */
        endpoint->qps[qp_index].rd_wqe -= num_preposted;
    }

    OPAL_THREAD_UNLOCK(&recv_wrs->lock);

    IBOFFLOAD_VERBOSE(10, ("Endpoint %p, to_porepost %d, num preposted - %d",
                          (void *) endpoint, num_to_prepost, num_preposted));

    return OMPI_SUCCESS;
}
/*
 * Destructor of an iboffload device object.
 *
 * Releases, in this order: the per-QP resources and the frags_free array,
 * the memory pool, the dummy memory registration, the two completion
 * queues, the protection domain, the device context and the ports array.
 * Every step is skipped when the corresponding member is NULL; failures are
 * reported through IBOFFLOAD_ERROR and do not stop the teardown.
 */
static void mca_bcol_iboffload_device_destructor
(mca_bcol_iboffload_device_t *device)
{
    int qp, qp_count = mca_bcol_iboffload_component.num_qps;

    IBOFFLOAD_VERBOSE(10, ("Device %s will be destroyed.\n",
                           ibv_get_device_name(device->dev.ib_dev)));

    /* per-QP resources first, then the array that holds the free lists */
    if (NULL != device->frags_free) {
        for (qp = 0; qp < qp_count; qp++) {
            mca_bcol_iboffload_dealloc_qps_resource_fn_t release_fn =
                mca_bcol_iboffload_component.qp_infos[qp].dealloc_resource;

            /* QPs without a dealloc callback have nothing to release */
            if (NULL == release_fn) {
                continue;
            }

            release_fn(qp, device);
        }

        free(device->frags_free);
    }

    /* memory pool */
    if (NULL != device->mpool) {
        IBOFFLOAD_VERBOSE(10, ("Mpool destroy - %p.\n", device->mpool));

        if (OMPI_SUCCESS != mca_mpool_base_module_destroy(device->mpool)) {
            IBOFFLOAD_ERROR(("Device %s, failed to destroy mpool",
                             ibv_get_device_name(device->dev.ib_dev)));
        }
    }

    /* registration of the dummy memory */
    if (NULL != device->dummy_reg.mr) {
        IBOFFLOAD_VERBOSE(10, ("Dummy memory MR unregister - %p.\n", device->dummy_reg.mr));

        if (OMPI_SUCCESS != mca_bcol_iboffload_deregister_mr((void *) device,
                                                             &device->dummy_reg.base)) {
            IBOFFLOAD_ERROR(("Device %s: failed to unregister dummy memory MR.",
                             ibv_get_device_name(device->dev.ib_dev)));
        }
    }

    /* completion queues */
    if (NULL != device->ib_cq && 0 != ibv_destroy_cq(device->ib_cq)) {
        IBOFFLOAD_ERROR(("Device %s, failed to destroy CQ, errno says %s",
                         ibv_get_device_name(device->dev.ib_dev), strerror(errno)));
    }

    if (NULL != device->ib_mq_cq && 0 != ibv_destroy_cq(device->ib_mq_cq)) {
        IBOFFLOAD_ERROR(("Device %s, failed to destroy mq CQ, errno says %s",
                         ibv_get_device_name(device->dev.ib_dev), strerror(errno)));
    }

    /* protection domain, if we have one */
    if (NULL != device->ib_pd && 0 != ibv_dealloc_pd(device->ib_pd)) {
        IBOFFLOAD_ERROR(("Device %s, failed to release PD, errno says %s",
                         ibv_get_device_name(device->dev.ib_dev), strerror(errno)));
    }

    /* device context */
    if (NULL != device->dev.ib_dev_context &&
            0 != ibv_close_device(device->dev.ib_dev_context)) {
        IBOFFLOAD_ERROR(("Device %s "
                         ", failed to close the device, errno says %s",
                         ibv_get_device_name(device->dev.ib_dev), strerror(errno)));
    }

    /* finally the ports array */
    if (NULL != device->ports) {
        free(device->ports);
    }
}
Example #25
0
/*
 * Large message bcast, executed by a rank that exchanges the whole buffer
 * with the peer stored in recursive_doubling_tree.rank_extra_source.
 *
 * The user buffer is registered, then the task list is built on the first
 * collfrag of the request: the root waits for the peer's RTR and sends the
 * full buffer, any other rank sends an RTR and receives the full buffer.
 * The last wait task (or the last send task when there is no wait task) is
 * marked as signaled before the list is posted to the MQ.
 *
 * Returns BCOL_FN_STARTED when the tasks were posted, OMPI_ERROR when the
 * buffer registration or a task setup fails, the error code of
 * mca_bcol_iboffload_post_mqe_tasks() when posting fails. When MQ credits
 * are missing or collfrags are already pending, the collfrag is moved to
 * the pending list and BCOL_FN_STARTED (BCOL_FN_NOT_STARTED if that move
 * fails) is returned.
 */
static int mca_bcol_iboffload_bcast_scatter_allgather_extra_exec(mca_bcol_iboffload_module_t *iboffload_module,
        mca_bcol_iboffload_collreq_t *coll_request)
{
    netpatterns_pair_exchange_node_t *rd_tree =
        &iboffload_module->recursive_doubling_tree;
    mca_bcol_iboffload_collfrag_t *frag = &coll_request->first_collfrag;

    struct mqe_task *last_send = NULL;
    struct mqe_task *last_wait = NULL;
    struct mqe_task *signal_task;

    int rc, peer;
    int nbytes = coll_request->count * coll_request->dtype->super.size;
    int my_rank = iboffload_module->ibnet->super.my_index;

    if (OPAL_UNLIKELY(!iboffload_module->connection_status[RECURSIVE_DOUBLING_TREE_BCAST])) {
        bcol_iboffload_setup_binomial_connection(iboffload_module);
    }

    /* register memory in mpool/rcache */
    rc = mca_bcol_iboffload_prepare_buffer(coll_request->buffer_info[SBUF].buf, nbytes,
            &coll_request->buffer_info[SBUF].iboffload_reg, iboffload_module);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
        IBOFFLOAD_ERROR(("Cannot register memory: "
                         "addr - %p, %d bytes.\n",
                          coll_request->buffer_info[SBUF].buf, nbytes));
        return OMPI_ERROR;
    }

    coll_request->buffer_info[SBUF].lkey =
        coll_request->buffer_info[SBUF].iboffload_reg->mr->lkey;

    /* estimated mq consumption: postpone when credits are missing or other
     * collfrags are already waiting */
    if (OPAL_UNLIKELY(!BCOL_IBOFFLOAD_MQ_HAVE_CREDITS(
                 iboffload_module, frag->mq_index, frag->mq_credits) ||
                 !opal_list_is_empty(&iboffload_module->collfrag_pending))) {
        IBOFFLOAD_VERBOSE(10, ("There are not enough credits on MQ.\n"));
        goto out_of_resources;
    }

    frag->tail_next = &frag->to_post;

    /* both roles talk to the same peer */
    peer = rd_tree->rank_extra_source;

    if (coll_request->root == my_rank) {
        IBOFFLOAD_VERBOSE(10, ("I'm root of the data %d", iboffload_module->power_of_2));

        /* root: wait for the RTR of the peer, then send it all the data */
        rc = mca_bcol_iboffload_recv_rtr_setup(
                &last_wait, peer, iboffload_module, frag);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
            IBOFFLOAD_VERBOSE(10, ("Failed to mca_bcol_iboffload_recv_rtr_setup"));
            return OMPI_ERROR;
        }

        rc = mca_bcol_iboffload_send_large_buff_setup(
                &last_send, SBUF, 0, nbytes, peer,
                iboffload_module, frag);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
            IBOFFLOAD_VERBOSE(10, ("Failed to mca_bcol_iboffload_send_large_buff_setup"));
            return OMPI_ERROR;
        }
    } else {
        /* not root: send the RTR to the peer, then receive all the data */
        rc = mca_bcol_iboffload_send_rtr_setup(&last_send,
                peer, iboffload_module, frag);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
            IBOFFLOAD_VERBOSE(10, ("Failed to setup send rtr"));
            return OMPI_ERROR;
        }

        rc = mca_bcol_iboffload_recv_large_buff_setup(&last_wait,
                SBUF, 0, nbytes, peer,
                iboffload_module, frag);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
            IBOFFLOAD_VERBOSE(10, ("Failed to setup data receive"));
            return OMPI_ERROR;
        }
    }

    IBOFFLOAD_VERBOSE(10, ("Fill in the the rest of the coll_fragment.\n"));

    /* end of list */
    *frag->tail_next = NULL;

    /* finish initializing full message descriptor */
    ++coll_request->n_fragments;
    ++coll_request->n_frags_sent;

    /* the last wait task is the one to signal; fall back to the last send
     * task when no wait task exists */
    signal_task = (NULL != last_wait) ? last_wait : last_send;

    signal_task->flags |= MQE_WR_FLAG_SIGNAL;
    frag->signal_task_wr_id = signal_task->wr_id;
    signal_task->wr_id = (uint64_t) (uintptr_t) frag;

    /* post the mwr */
    rc = mca_bcol_iboffload_post_mqe_tasks(iboffload_module, frag->to_post);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
        IBOFFLOAD_VERBOSE(10, ("MQE task posting failing.\n"));
        /* Note: need to clean up */
        return rc;
    }

    MCA_BCOL_UPDATE_ORDER_COUNTER(&iboffload_module->super, coll_request->order_info);

    IBOFFLOAD_VERBOSE(10, ("Return success.\n"));

    return BCOL_FN_STARTED;

out_of_resources:
    /* Release all resources */
    IBOFFLOAD_VERBOSE(10, ("Barrier, adding collfrag to collfrag_pending.\n"));
    rc = mca_bcol_iboffload_free_resources_and_move_to_pending(frag, iboffload_module);
    if (OMPI_SUCCESS == rc) {
        return BCOL_FN_STARTED;
    }

    return BCOL_FN_NOT_STARTED;
}
Example #26
0
/*
 * Large message bcast (scatter followed by allgather), executed by the
 * ranks that take part in the binomial scatter.
 *
 * The user buffer is registered and the task list is built on the first
 * collfrag of the request:
 *  - the root optionally sends the whole buffer to its extra peer and then
 *    runs BINOMIAL_SCATTER;
 *  - any other rank sends an RTR to its source, receives its part of the
 *    buffer and then runs BINOMIAL_SCATTER (or goes straight to the gather
 *    when there is no data for it to receive);
 *  - everybody runs bcol_iboffload_bcast_binomial_gather();
 *  - a rank with an extra peer that has not served it yet sends it the
 *    whole buffer.
 * The last wait task (or the last send task when there is no wait task) is
 * marked as signaled before the list is posted to the MQ.
 *
 * Returns BCOL_FN_STARTED when the tasks were posted, OMPI_ERROR when the
 * buffer registration or a task setup fails, the error code of the gather
 * setup or of mca_bcol_iboffload_post_mqe_tasks() when those fail. When MQ
 * credits are missing or collfrags are already pending, the collfrag is
 * moved to the pending list and BCOL_FN_STARTED (BCOL_FN_NOT_STARTED if
 * that move fails) is returned.
 */
static int mca_bcol_iboffload_bcast_scatter_allgather_exec(mca_bcol_iboffload_module_t *iboffload_module,
        mca_bcol_iboffload_collreq_t *coll_request)
{
    netpatterns_pair_exchange_node_t *recursive_doubling_tree =
        &iboffload_module->recursive_doubling_tree;

    int rc,
        dst,
        group_src, power_of_2_distance,
        recv_count;
    size_t offset;
    int count = coll_request->count * coll_request->dtype->super.size;
    int my_group_index = iboffload_module->ibnet->super.my_index;
    /* block size per rank: count divided by power_of_2_ranks, rounded up */
    size_t base_block_size =
        (count +  iboffload_module->power_of_2_ranks - 1) /
        iboffload_module->power_of_2_ranks;

    struct mqe_task *last_send = NULL,
                    *last_wait = NULL;
    mca_bcol_iboffload_collfrag_t *coll_fragment = &coll_request->first_collfrag;

    if (OPAL_UNLIKELY(!iboffload_module->connection_status[RECURSIVE_DOUBLING_TREE_BCAST])) {
        bcol_iboffload_setup_binomial_connection(iboffload_module);
    }

    /* register memory in mpool/rcache */
    rc = mca_bcol_iboffload_prepare_buffer(coll_request->buffer_info[SBUF].buf, count,
            &coll_request->buffer_info[SBUF].iboffload_reg, iboffload_module);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
        IBOFFLOAD_ERROR(("Cannot register memory: "
                         "addr - %p, %d bytes.\n",
                          coll_request->buffer_info[SBUF].buf, count));
        return OMPI_ERROR;
    }

    coll_request->buffer_info[SBUF].lkey = coll_request->buffer_info[SBUF].iboffload_reg->mr->lkey;

    /* estimated mq consumption: postpone when credits are missing or other
     * collfrags are already waiting */
    if (OPAL_UNLIKELY(false == BCOL_IBOFFLOAD_MQ_HAVE_CREDITS(
                 iboffload_module, coll_fragment->mq_index, coll_fragment->mq_credits) ||
                 false == opal_list_is_empty(&iboffload_module->collfrag_pending))) {
        IBOFFLOAD_VERBOSE(10, ("There are not enough credits on MQ.\n"));
        goto out_of_resources;
    }

    coll_fragment->tail_next = &coll_fragment->to_post;

    if (coll_request->root == my_group_index) {
        IBOFFLOAD_VERBOSE(10, ("I'm root of the data %d %d",
                    iboffload_module->power_of_2, recursive_doubling_tree->n_extra_sources ));
        /* for proxy we have little bit more work to do */
        if (recursive_doubling_tree->n_extra_sources > 0) {
            /* send the all data to your extra peer */
            dst = recursive_doubling_tree->rank_extra_source;
            rc = mca_bcol_iboffload_recv_rtr_setup(
                    &last_wait, dst, iboffload_module, coll_fragment);
            if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
                IBOFFLOAD_VERBOSE(10, ("Failed to"
                            " mca_bcol_iboffload_recv_rtr_setup"));
                return OMPI_ERROR;
            }
            rc = mca_bcol_iboffload_send_large_buff_setup(
                    &last_send, SBUF, 0, count, dst,
                    iboffload_module, coll_fragment);
            if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
                IBOFFLOAD_VERBOSE(10, ("Failed to"
                            " mca_bcol_iboffload_send_large_buff_setup"));
                return OMPI_ERROR;
            }
        }
        power_of_2_distance = iboffload_module->power_of_2;

        BINOMIAL_SCATTER(iboffload_module, coll_fragment,
                last_wait, last_send,  power_of_2_distance - 1,
                my_group_index, base_block_size, count
                );
        /* EXIT OR GO TO Gather */
        goto GATHER;
    }

    /* prepare and post recv operation */
    group_src = bcol_iboffload_binomial_root_to_src(coll_request->root,
            my_group_index, iboffload_module->power_of_2_ranks,
            iboffload_module->group_size, &power_of_2_distance);

    IBOFFLOAD_VERBOSE(10, ("SRC %d DIST %d ranks %d gsize %d root %d my rank %d",
                group_src, power_of_2_distance, iboffload_module->power_of_2_ranks,
                iboffload_module->group_size,
                coll_request->root, my_group_index));
    assert(group_src >= 0);

    if (0 > power_of_2_distance) {
        /* the rank is virtual root for this group, receive the data
           and scatter gather as root */
        power_of_2_distance =
            iboffload_module->power_of_2;
        offset = 0;
        recv_count = count;
        IBOFFLOAD_VERBOSE(10, ("Virtual root %d , set mask to %d",
                    my_group_index, power_of_2_distance));
    } else {
        int my_left_boundary_rank;
        int delta;
        recv_count = base_block_size * (1 << power_of_2_distance); /* we may receive larger data */
        /* Clear the low power_of_2_distance bits of the rank. The mask is
         * built from a positive value because left-shifting a negative
         * int is undefined behavior. */
        my_left_boundary_rank = my_group_index & ~((1 << power_of_2_distance) - 1);
        offset = (size_t) (base_block_size * my_left_boundary_rank);
        delta = count - offset;
        if (OPAL_UNLIKELY(delta <= 0)) {
            /* no data to recv */
            goto GATHER;
        } else {
            /* never receive past the end of the buffer */
            recv_count = (delta < recv_count) ? delta : recv_count;
        }

        IBOFFLOAD_VERBOSE(10, ("Recv data set mask to %d",
                    power_of_2_distance));
    }

    /* The format has four %d conversions: source rank, its distance,
     * receive count and offset (size_t, narrowed for %d). */
    IBOFFLOAD_VERBOSE(10, ("Bcast, receive data from %d[%d], count %d, offset %d",
                group_src, power_of_2_distance, recv_count, (int) offset));

    /* Receive data to user buffer */
    rc = mca_bcol_iboffload_send_rtr_setup(&last_send,
                                group_src, iboffload_module,
                                coll_fragment);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
        IBOFFLOAD_VERBOSE(10, ("Failed to setup send rtr"));
        return OMPI_ERROR;
    }

    rc = mca_bcol_iboffload_recv_large_buff_setup(&last_wait,
                                SBUF, offset, recv_count, group_src,
                                iboffload_module, coll_fragment);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
        IBOFFLOAD_VERBOSE(10, ("Failed to setup data receive"));
        return OMPI_ERROR;
    }

    BINOMIAL_SCATTER(iboffload_module, coll_fragment,
            last_wait, last_send, power_of_2_distance - 1,
            my_group_index, base_block_size, count);

GATHER:
    rc = bcol_iboffload_bcast_binomial_gather(iboffload_module,
            &last_send, &last_wait, coll_fragment,
            count, base_block_size, power_of_2_distance);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
        IBOFFLOAD_VERBOSE(10, ("Failed to setup gather. Return %d", rc));
        return rc;
    }

    /* Serve the extra peer now, unless this rank acted as (virtual) root,
     * i.e. its distance equals power_of_2 */
    if (recursive_doubling_tree->n_extra_sources > 0 &&
            iboffload_module->power_of_2 != power_of_2_distance) {
        dst = recursive_doubling_tree->rank_extra_source;

        rc = mca_bcol_iboffload_recv_rtr_setup(
                &last_wait, dst, iboffload_module, coll_fragment);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
            IBOFFLOAD_VERBOSE(10, ("Failed to"
                        " mca_bcol_iboffload_recv_rtr_setup"));
            return OMPI_ERROR;
        }

        rc = mca_bcol_iboffload_send_large_buff_setup(
                &last_send, SBUF, 0, count, dst,
                iboffload_module, coll_fragment);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
            IBOFFLOAD_VERBOSE(10, ("Failed to"
                        " mca_bcol_iboffload_send_large_buff_setup"));
            return OMPI_ERROR;
        }
    }

    IBOFFLOAD_VERBOSE(10, ("Fill in the the rest of the coll_fragment.\n"));

    /* end of list */
    *coll_fragment->tail_next = NULL;

    /* finish initializing full message descriptor */
    coll_request->n_fragments  += 1;
    coll_request->n_frags_sent += 1;

    /* signal on the last wait task, or on the last send task when there is
     * no wait task; the original wr_id is kept in the collfrag */
    if (NULL != last_wait) {
        last_wait->flags |= MQE_WR_FLAG_SIGNAL;
        coll_fragment->signal_task_wr_id = last_wait->wr_id;
        last_wait->wr_id = (uint64_t) (uintptr_t) coll_fragment;
    } else {
        last_send->flags |= MQE_WR_FLAG_SIGNAL;
        coll_fragment->signal_task_wr_id = last_send->wr_id;
        last_send->wr_id = (uint64_t) (uintptr_t) coll_fragment;
    }

    /* post the mwr */
    rc = mca_bcol_iboffload_post_mqe_tasks(iboffload_module, coll_fragment->to_post);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
        IBOFFLOAD_VERBOSE(10, ("MQE task posting failing.\n"));
        /* Note: need to clean up */
        return rc;
    }

    MCA_BCOL_UPDATE_ORDER_COUNTER(&iboffload_module->super, coll_request->order_info);

    IBOFFLOAD_VERBOSE(10, ("Return success.\n"));

    return BCOL_FN_STARTED;

out_of_resources:
    /* Release all resources */
    IBOFFLOAD_VERBOSE(10, ("Barrier, adding collfrag to collfrag_pending.\n"));
    rc =
        mca_bcol_iboffload_free_resources_and_move_to_pending(coll_fragment, iboffload_module);
    return (OMPI_SUCCESS != rc) ? BCOL_FN_NOT_STARTED : BCOL_FN_STARTED;
}
Example #27
0
/*
 * Build and post the MQE task list of a small message broadcast.
 *
 * Root: optionally sends the packed data to its extra peer (when the
 * recursive doubling tree reports extra sources) and then calls
 * binomial_scatter_smsg().
 * Non-root: sets up a receive from the source returned by
 * bcol_iboffload_binomial_root_to_src(), calls binomial_scatter_smsg() and,
 * depending on the distance mask, forwards the data to its extra peer.
 *
 * The last wait task (or, if no wait task was created, the last send task)
 * is flagged with MQE_WR_FLAG_SIGNAL; its original wr_id is saved in
 * coll_fragment->signal_task_wr_id and replaced by the collfrag pointer.
 *
 * Parameters:
 *   iboffload_module - module the tasks are built for and posted on
 *   coll_request     - collective request; its first_collfrag is the
 *                      fragment that is filled in
 *
 * Returns:
 *   BCOL_FN_STARTED      - the tasks were posted, or resources ran out and
 *                          mca_bcol_iboffload_free_resources_and_move_to_pending()
 *                          succeeded
 *   BCOL_FN_NOT_STARTED  - resources ran out and the call above failed
 *   other                - error code of mca_bcol_iboffload_post_mqe_tasks()
 */
static int mca_bcol_iboffload_small_msg_bcast_exec(mca_bcol_iboffload_module_t *iboffload_module,
                                                   mca_bcol_iboffload_collreq_t *coll_request)
{
    netpatterns_pair_exchange_node_t *recursive_doubling_tree =
        &iboffload_module->recursive_doubling_tree;

    int rc,
        distance_mask_pow , dst,
        group_src, power_of_2_distance;

    uint32_t pack_len;
    int my_group_index = iboffload_module->super.sbgp_partner_module->my_index;

    struct mqe_task *last_send = NULL,
                    *last_wait = NULL;
    mca_bcol_iboffload_collfrag_t *coll_fragment = &coll_request->first_collfrag;

    IBOFFLOAD_VERBOSE(10,("Entering small msg iboffload bcast"));

    /* Connections for the bcast tree are opened on first use.
       NOTE(review): the result of the setup call is not checked - confirm
       whether it can fail. */
    if (OPAL_UNLIKELY(!iboffload_module->connection_status[RECURSIVE_DOUBLING_TREE_BCAST])) {
        IBOFFLOAD_VERBOSE(10,("Bcast open new connection "));
        bcol_iboffload_setup_binomial_connection(iboffload_module);
    }

    pack_len = coll_request->count * coll_request->dtype->super.size;
    IBOFFLOAD_VERBOSE(10,("My packet length %d pack_len frag_count %d dtype size %d ",
                            pack_len,
                            coll_request->count,
                            coll_request->dtype->super.size));

    /* it is estimated mq consumption... */
    if (OPAL_UNLIKELY(false == BCOL_IBOFFLOAD_MQ_HAVE_CREDITS(
                 iboffload_module, coll_fragment->mq_index, coll_fragment->mq_credits))) {
        IBOFFLOAD_VERBOSE(10, ("There are not enough credits on MQ.\n"));
        goto out_of_resources;
    }

    /* Start with an empty task list: tail_next points at the list head */
    coll_fragment->tail_next = &coll_fragment->to_post;
    coll_request->buffer_info[SBUF].lkey = iboffload_module->rdma_block.ib_info.lkey;

    if (coll_request->root == my_group_index) {
        IBOFFLOAD_VERBOSE(10, ("I'm root of the data"));

        /* Send data to the extra peer */
        if (recursive_doubling_tree->n_extra_sources > 0) {
            /* send the all data to your extra peer */
            dst = recursive_doubling_tree->rank_extra_source;
            IBOFFLOAD_VERBOSE(10,("Sending the dat to Dst %d",dst));
            rc = mca_bcol_iboffload_send_small_buff_setup(
                    &last_send, pack_len, dst,
                    iboffload_module, coll_fragment);
            if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
                IBOFFLOAD_VERBOSE(10, ("Failed to"
                            " mca_bcol_iboffload_send_small_buff_setup"));
                goto out_of_resources;
            }
        }

        distance_mask_pow =
            iboffload_module->power_of_2 - 1;

        rc = binomial_scatter_smsg(iboffload_module, coll_fragment,
                &last_send, distance_mask_pow,
                my_group_index, pack_len);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
            IBOFFLOAD_VERBOSE(10, ("Failed to binomial_scatter_smsg"));
            goto out_of_resources;
        }

        goto finalize;
    }

    /* prepare and post recv operation */
    group_src = bcol_iboffload_binomial_root_to_src(coll_request->root,
            my_group_index, iboffload_module->power_of_2_ranks,
            iboffload_module->group_size, &power_of_2_distance);
    assert(group_src >= 0);

    if (0 > power_of_2_distance) {
        /* the rank is virtual root for this group, receive the data
           and scatter gather as root */
        IBOFFLOAD_VERBOSE(10,("Virtual root distance_mask_pow %d ",iboffload_module->power_of_2));
        distance_mask_pow = iboffload_module->power_of_2 - 1;
    } else {
        distance_mask_pow = power_of_2_distance - 1;
    }

    /* One conversion per argument: the previous format string consumed
       three arguments that were never passed. */
    IBOFFLOAD_VERBOSE(10, ("Bcast, receive data from %d", group_src));

    rc = mca_bcol_iboffload_recv_small_buff_setup(&last_wait,
                                pack_len, group_src,
                                iboffload_module, coll_fragment);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
        IBOFFLOAD_VERBOSE(10, ("Failed to setup data receive"));
        goto out_of_resources;
    }

    rc = binomial_scatter_smsg(iboffload_module, coll_fragment,
            &last_send, distance_mask_pow,
            my_group_index, pack_len);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
        IBOFFLOAD_VERBOSE(10, ("Failed to binomial_scatter_smsg"));
        goto out_of_resources;
    }

    /* Forward the data to the extra peer, but only when the distance mask
       used above is not the full (power_of_2 - 1) mask. */
    if (recursive_doubling_tree->n_extra_sources > 0 &&
            iboffload_module->power_of_2 - 1 != distance_mask_pow) {
        dst = recursive_doubling_tree->rank_extra_source;

        rc = mca_bcol_iboffload_send_small_buff_setup(
                &last_send, pack_len, dst,
                iboffload_module, coll_fragment);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
            IBOFFLOAD_VERBOSE(10, ("Failed to"
                        " mca_bcol_iboffload_send_small_buff_setup"));
            goto out_of_resources;
        }
    }

finalize:
    /* end of list */
    *coll_fragment->tail_next = NULL;

    /* finish initializing full message descriptor */
    (coll_request)->n_fragments  += 1;
    (coll_request)->n_frags_sent += 1;

    /* Request a completion signal on the final task and stash the collfrag
       pointer in its wr_id; the original wr_id is kept in the fragment. */
    if (NULL != last_wait) {
        last_wait->flags |= MQE_WR_FLAG_SIGNAL;
        coll_fragment->signal_task_wr_id = last_wait->wr_id;
        last_wait->wr_id = (uint64_t) (uintptr_t) coll_fragment;
    } else {
        last_send->flags |= MQE_WR_FLAG_SIGNAL;
        coll_fragment->signal_task_wr_id = last_send->wr_id;
        last_send->wr_id = (uint64_t) (uintptr_t) coll_fragment;
    }
    /* post the mwr */
    rc = mca_bcol_iboffload_post_mqe_tasks(iboffload_module, coll_fragment->to_post);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
        IBOFFLOAD_VERBOSE(10, ("MQE task posting failing.\n"));
        /* Note: need to clean up */
        return rc;
    }

    MCA_BCOL_UPDATE_ORDER_COUNTER(&iboffload_module->super, coll_request->order_info);

    IBOFFLOAD_VERBOSE(10, ("Return success.\n"));
    return BCOL_FN_STARTED;

out_of_resources:
    /* Release all resources */
    IBOFFLOAD_VERBOSE(10, ("Bcast, adding collfrag to collfrag_pending.\n"));
    rc =
        mca_bcol_iboffload_free_resources_and_move_to_pending(coll_fragment, iboffload_module);
    return (OMPI_SUCCESS != rc) ? BCOL_FN_NOT_STARTED : BCOL_FN_STARTED;
}
Example #28
0
/*
 * Build and post the MQE task list of a small message broadcast for a rank
 * that only exchanges data with recursive_doubling_tree->rank_extra_source.
 *
 * Root: sends the packed data to that peer.
 * Non-root: sets up a receive of the packed data from that peer.
 *
 * The single resulting task is flagged with MQE_WR_FLAG_SIGNAL; its original
 * wr_id is saved in coll_fragment->signal_task_wr_id and replaced by the
 * collfrag pointer.
 *
 * Parameters:
 *   iboffload_module - module the tasks are built for and posted on
 *   coll_request     - collective request; its first_collfrag is the
 *                      fragment that is filled in
 *
 * Returns:
 *   BCOL_FN_STARTED      - the tasks were posted, or resources ran out and
 *                          mca_bcol_iboffload_free_resources_and_move_to_pending()
 *                          succeeded
 *   BCOL_FN_NOT_STARTED  - resources ran out and the call above failed
 *   other                - error code of mca_bcol_iboffload_post_mqe_tasks()
 */
static int mca_bcol_iboffload_small_msg_bcast_extra_exec(mca_bcol_iboffload_module_t *iboffload_module,
                                                   mca_bcol_iboffload_collreq_t *coll_request)
{
    netpatterns_pair_exchange_node_t *recursive_doubling_tree =
        &iboffload_module->recursive_doubling_tree;

    int rc,
        dst;
    int my_group_index = iboffload_module->super.sbgp_partner_module->my_index;
    uint32_t pack_len;

    struct mqe_task *last_send = NULL,
                    *last_wait = NULL;
    mca_bcol_iboffload_collfrag_t *coll_fragment = &coll_request->first_collfrag;

    IBOFFLOAD_VERBOSE(10,("Entering small msg extra iboffload bcast"));

    /* Connections for the bcast tree are opened on first use.
       NOTE(review): the result of the setup call is not checked - confirm
       whether it can fail. */
    if (OPAL_UNLIKELY(!iboffload_module->connection_status[RECURSIVE_DOUBLING_TREE_BCAST])) {
        IBOFFLOAD_VERBOSE(10,("Bcast open new connection "));
        bcol_iboffload_setup_binomial_connection(iboffload_module);
    }

    pack_len = coll_request->count * coll_request->dtype->super.size;
    coll_request->buffer_info[SBUF].lkey = iboffload_module->rdma_block.ib_info.lkey;

    IBOFFLOAD_VERBOSE(10,("My packet length %d pack_len frag_count %d dtype size %d ",
                            pack_len,
                            coll_request->count,
                            coll_request->dtype->super.size));

    /* it is estimated mq consumption... */
    if (OPAL_UNLIKELY(false == BCOL_IBOFFLOAD_MQ_HAVE_CREDITS(
                 iboffload_module,
                 coll_fragment->mq_index, coll_fragment->mq_credits))) {
        IBOFFLOAD_VERBOSE(10, ("There are not enough credits on MQ.\n"));
        goto out_of_resources;
    }

    /* Start with an empty task list: tail_next points at the list head */
    coll_fragment->tail_next = &coll_fragment->to_post;

    /* Both roles talk to the same peer */
    dst = recursive_doubling_tree->rank_extra_source;

    if (coll_request->root == my_group_index) {
        IBOFFLOAD_VERBOSE(10, ("I'm root of the data %d", iboffload_module->power_of_2));

        /* send the all data to your extra peer */
        IBOFFLOAD_VERBOSE(10,("Im extra root sending data to %d \n",dst));
        rc = mca_bcol_iboffload_send_small_buff_setup(
                &last_send, pack_len, dst,
                iboffload_module, coll_fragment);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
            IBOFFLOAD_VERBOSE(10, ("Failed to"
                        " mca_bcol_iboffload_send_small_buff_setup"));
            goto out_of_resources;
        }
    } else {
        /* Not root case */
        rc = mca_bcol_iboffload_recv_small_buff_setup(&last_wait,
                pack_len, dst,
                iboffload_module, coll_fragment);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
            IBOFFLOAD_VERBOSE(10, ("Failed to setup data receive"));
            /* Take the same release/pending path as every other setup
               failure instead of returning with the fragment half built. */
            goto out_of_resources;
        }
    }

    /* end of list */
    *coll_fragment->tail_next = NULL;

    /* finish initializing full message descriptor */
    (coll_request)->n_fragments  = 1;
    (coll_request)->n_frags_sent = 1;

    /* Request a completion signal on the final task and stash the collfrag
       pointer in its wr_id; the original wr_id is kept in the fragment. */
    if (NULL != last_wait) {
        last_wait->flags |= MQE_WR_FLAG_SIGNAL;
        coll_fragment->signal_task_wr_id = last_wait->wr_id;
        last_wait->wr_id = (uint64_t) (uintptr_t) coll_fragment;
    } else {
        last_send->flags |= MQE_WR_FLAG_SIGNAL;
        coll_fragment->signal_task_wr_id = last_send->wr_id;
        last_send->wr_id = (uint64_t) (uintptr_t) coll_fragment;
    }
    /* post the mwr */
    rc = mca_bcol_iboffload_post_mqe_tasks(iboffload_module, coll_fragment->to_post);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
        IBOFFLOAD_VERBOSE(10, ("MQE task posting failing.\n"));
        /* Note: need to clean up */
        return rc;
    }

    MCA_BCOL_UPDATE_ORDER_COUNTER(&iboffload_module->super, coll_request->order_info);

    IBOFFLOAD_VERBOSE(10, ("Return success.\n"));
    return BCOL_FN_STARTED;

out_of_resources:
    /* Release all resources */
    IBOFFLOAD_VERBOSE(10, ("Bcast, adding collfrag to collfrag_pending.\n"));
    rc =
        mca_bcol_iboffload_free_resources_and_move_to_pending(coll_fragment, iboffload_module);
    return (OMPI_SUCCESS != rc) ? BCOL_FN_NOT_STARTED : BCOL_FN_STARTED;
}
/*
 * Fan-in progress step that posts one send towards rank 0.
 *
 * Works on the last collfrag of coll_request->work_requests: obtains a
 * MCA_BCOL_IBOFFLOAD_SEND_FRAG_DUMMY fragment and an INLINE send task for
 * rank 0, makes that task the only entry of the fragment's MQE list, marks
 * it with MQE_WR_FLAG_SIGNAL and posts the list.
 *
 * Returns OMPI_SUCCESS once the list is posted, the error code of
 * mca_bcol_iboffload_post_mqe_tasks() if posting fails, or the result of
 * mca_bcol_iboffload_free_resources_and_move_to_pending() when MQ credits,
 * the send fragment or the send task are not available.
 */
static int mca_bcol_iboffload_fanin_proxy_progress(
                mca_bcol_iboffload_module_t *iboffload,
                struct mca_bcol_iboffload_collreq_t *coll_request)
{
    int status;
    int target_rank = 0;

    mca_bcol_iboffload_collfrag_t *coll_fragment =
        (mca_bcol_iboffload_collfrag_t *)
            opal_list_get_last(&coll_request->work_requests);

    /* Tail of the MQE list under construction; starts at the list head */
    struct mqe_task **list_tail = &coll_fragment->to_post;
    struct mqe_task *signal_task = NULL;

    mca_bcol_iboffload_frag_t *frag;
    mca_bcol_iboffload_task_t *task;

    if (OPAL_UNLIKELY(false == BCOL_IBOFFLOAD_MQ_HAVE_CREDITS(
               iboffload, coll_fragment->mq_index, coll_fragment->mq_credits))) {
        IBOFFLOAD_VERBOSE(10, ("There are not enough credits on MQ.\n"));
        goto out_of_resources;
    }

    /* Dummy send fragment for the target rank */
    frag = mca_bcol_iboffload_get_send_frag(coll_request,
                                    target_rank, coll_request->qp_index, 0,
                                    0, SBUF, MCA_BCOL_IBOFFLOAD_SEND_FRAG_DUMMY);
    if (NULL == frag) {
        IBOFFLOAD_VERBOSE(10, ("Failing for getting and packing send frag.\n"));
        goto out_of_resources;
    }

    /* Inline send task carrying that fragment */
    task = mca_bcol_iboffload_get_send_task(iboffload, target_rank,
                                            MCA_BCOL_IBOFFLOAD_QP_BARRIER,
                                            frag, coll_fragment, INLINE);
    if (NULL == task) {
        IBOFFLOAD_VERBOSE(10, ("Failing for getting send task.\n"));
        goto out_of_resources;
    }

    APPEND_TO_TASKLIST(list_tail, task, signal_task);
    MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(coll_fragment->task_next, task);

    /* Terminate the list; the task just appended is the one to signal */
    *list_tail = NULL;
    assert(NULL != signal_task);

    /* Keep the task's own wr_id in the fragment and put the collfrag
       pointer in its place. */
    signal_task->flags |= MQE_WR_FLAG_SIGNAL;
    coll_fragment->signal_task_wr_id = signal_task->wr_id;
    signal_task->wr_id = (uint64_t) (uintptr_t) coll_fragment;

    /* Hand the list over to the MQ */
    status = mca_bcol_iboffload_post_mqe_tasks(iboffload, coll_fragment->to_post);
    if (OMPI_SUCCESS != status) {
        IBOFFLOAD_VERBOSE(10, ("MQE task posting failing.\n"));
        /* Note: need to clean up */
        return status;
    }

    MCA_BCOL_UPDATE_ORDER_COUNTER(&iboffload->super, coll_request->order_info);

    return OMPI_SUCCESS;

out_of_resources:
    /* Give back whatever was taken and queue the collfrag as pending */
    IBOFFLOAD_VERBOSE(10, ("Fan-in, adding collfrag to collfrag_pending"));
    return mca_bcol_iboffload_free_resources_and_move_to_pending(coll_fragment, iboffload);
}
/*
 * Open the component
 */
static int iboffload_open(void)
{
    int rc;

    /* local variables */
    mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component;

    IBOFFLOAD_VERBOSE(10, ("Open Iboffload component.\n"));

    (void) mca_bcol_iboffload_verify_params();

    cm->super.priority = 100;
    cm->super.n_net_contexts = 0;
    cm->super.network_contexts = NULL;

    OBJ_CONSTRUCT(&cm->recv_wrs.lock, opal_mutex_t);

    /* construct lists */
    OBJ_CONSTRUCT(&cm->devices, opal_pointer_array_t);
    rc = opal_pointer_array_init(&cm->devices, 10, INT_MAX, 10);
    if (OMPI_SUCCESS != rc) {
        goto close_device;
    }

    /* Check MCA parameters */
    if (0 != (mca_bcol_iboffload_component.exchange_tree_order & (mca_bcol_iboffload_component.exchange_tree_order - 1))) {
        IBOFFLOAD_ERROR(("Warning: ibcol_iboffload_exchange_tree_order is %d which is not a power of 2, setting it to 2", 
                         mca_bcol_iboffload_component.exchange_tree_order));
        mca_bcol_iboffload_component.exchange_tree_order = 2;
    }

    /* Pasha: Since we do not have max inline check like in openib,
       I will put some dummy check here. All mlnx devices support at least 512b */
    if (mca_bcol_iboffload_component.max_inline_data > 512) {
        IBOFFLOAD_ERROR(("Warning the inline %d, is to big and unsupported",
                    mca_bcol_iboffload_component.max_inline_data));
        rc = OMPI_ERROR;
        goto close_device;
    }

    /* Register the progress function */
    rc = opal_progress_register(mca_bcol_iboffload_component_progress);
    if (OMPI_SUCCESS != rc) {
        IBOFFLOAD_ERROR(("Failed to register the progress function"
                         " for iboffload component.\n"));
        goto close_device;
    }

    map_ompi_to_ib_dtype();
    map_ompi_to_ib_op_type();

    /* The init_done set to true on first component usage */
    cm->init_done = false;

    return OMPI_SUCCESS;

close_device:
    OBJ_DESTRUCT(&cm->devices);
    OBJ_DESTRUCT(&cm->recv_wrs.lock);
    return rc;
}