/*
 * Fan-in progress step for a non-leader ("proxy") rank.
 *
 * Works on the last collective fragment queued on coll_request->work_requests:
 * builds a single send task (dummy send fragment, barrier QP, inline) aimed at
 * rank 0, marks it as the signalled task and posts the resulting MQE list.
 *
 * Returns OMPI_SUCCESS once the list is posted, the error code from
 * mca_bcol_iboffload_post_mqe_tasks() if posting fails, or whatever
 * mca_bcol_iboffload_free_resources_and_move_to_pending() returns when MQ
 * credits, the send fragment or the send task cannot be obtained.
 */
static int mca_bcol_iboffload_fanin_proxy_progress(
                mca_bcol_iboffload_module_t *iboffload,
                struct mca_bcol_iboffload_collreq_t *coll_request)
{
    const int leader_rank = 0;
    int rc;

    struct mqe_task *signal_mqe = NULL;   /* last MQE appended to the list */
    struct mqe_task **mqe_tail;           /* slot where the next MQE is linked */

    mca_bcol_iboffload_task_t *task;
    mca_bcol_iboffload_frag_t *frag;

    mca_bcol_iboffload_collfrag_t *coll_fragment =
        (mca_bcol_iboffload_collfrag_t *)
            opal_list_get_last(&coll_request->work_requests);

    mqe_tail = &coll_fragment->to_post;

    /* Check that the MQ can take this fragment before building anything. */
    if (OPAL_UNLIKELY(false == BCOL_IBOFFLOAD_MQ_HAVE_CREDITS(
               iboffload, coll_fragment->mq_index, coll_fragment->mq_credits))) {
        IBOFFLOAD_VERBOSE(10, ("There are not enough credits on MQ.\n"));
        goto out_of_resources;
    }

    /* Dummy send fragment addressed to the leader. */
    frag = mca_bcol_iboffload_get_send_frag(coll_request,
                                    leader_rank, coll_request->qp_index, 0,
                                    0, SBUF, MCA_BCOL_IBOFFLOAD_SEND_FRAG_DUMMY);
    if (NULL == frag) {
        IBOFFLOAD_VERBOSE(10, ("Failing for getting and packing send frag.\n"));
        goto out_of_resources;
    }

    /* Wrap the fragment into a send task on the barrier QP. */
    task = mca_bcol_iboffload_get_send_task(iboffload, leader_rank, MCA_BCOL_IBOFFLOAD_QP_BARRIER,
                                            frag, coll_fragment, INLINE);
    if (NULL == task) {
        IBOFFLOAD_VERBOSE(10, ("Failing for getting send task.\n"));
        goto out_of_resources;
    }

    /* Link the task into both the MQE list and the fragment's task list. */
    APPEND_TO_TASKLIST(mqe_tail, task, signal_mqe);
    MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(coll_fragment->task_next, task);

    /* Terminate the MQE list. */
    *mqe_tail = NULL;
    assert(NULL != signal_mqe);

    /*
     * Request a completion for the last task. Its original wr_id is saved in
     * the fragment and replaced by the fragment's address.
     */
    signal_mqe->flags |= MQE_WR_FLAG_SIGNAL;
    coll_fragment->signal_task_wr_id = signal_mqe->wr_id;
    signal_mqe->wr_id = (uint64_t) (uintptr_t) coll_fragment;

    /* Hand the list over to the MQ. */
    rc = mca_bcol_iboffload_post_mqe_tasks(iboffload, coll_fragment->to_post);
    if (OMPI_SUCCESS != rc) {
        IBOFFLOAD_VERBOSE(10, ("MQE task posting failing.\n"));
        /* Note: need to clean up */
        return rc;
    }

    MCA_BCOL_UPDATE_ORDER_COUNTER(&iboffload->super, coll_request->order_info);

    return OMPI_SUCCESS;

out_of_resources:
    /* Give everything back and park the fragment on the pending list. */
    IBOFFLOAD_VERBOSE(10, ("Fan-in, adding collfrag to collfrag_pending"));
    return mca_bcol_iboffload_free_resources_and_move_to_pending(coll_fragment, iboffload);
}
/*
 * Fan-in progress step for the leader (rank 0).
 *
 * Works on the last collective fragment queued on coll_request->work_requests:
 * for every other rank in the subgroup (1 .. group_size - 1) it takes a
 * preposted receive fragment and appends a wait task for it, then marks the
 * last wait task as the signalled one and posts the MQE list.
 *
 * Precondition: the subgroup has at least one non-leader rank. Otherwise no
 * wait task is created and there is nothing to signal.
 *
 * Returns OMPI_SUCCESS once the list is posted, the error code from
 * mca_bcol_iboffload_post_mqe_tasks() if posting fails, or whatever
 * mca_bcol_iboffload_free_resources_and_move_to_pending() returns when MQ
 * credits, a receive fragment or a wait task cannot be obtained.
 */
static int mca_bcol_iboffload_fanin_leader_progress(
                mca_bcol_iboffload_module_t *iboffload,
                struct mca_bcol_iboffload_collreq_t *coll_request)
{
    int rc = OMPI_SUCCESS, leader_rank = 0, rank,
        sbgp_size = iboffload->ibnet->super.group_size;

    struct mqe_task *last_wait = NULL;

    mca_bcol_iboffload_task_t *wait_task = NULL;
    mca_bcol_iboffload_frag_t *preposted_recv_frag = NULL;

    struct mqe_task **mqe_ptr_to_set;
    mca_bcol_iboffload_collfrag_t *coll_fragment;

    coll_fragment = (mca_bcol_iboffload_collfrag_t *)
                         opal_list_get_last(&coll_request->work_requests);

    mqe_ptr_to_set = &coll_fragment->to_post;

    /* Check that the MQ can take this fragment before building anything. */
    if (OPAL_UNLIKELY(false == BCOL_IBOFFLOAD_MQ_HAVE_CREDITS(
               iboffload, coll_fragment->mq_index, coll_fragment->mq_credits))) {
        IBOFFLOAD_VERBOSE(10, ("There are not enough credits on MQ.\n"));
        goto out_of_resources;
    }

    /* One wait task per non-leader rank. */
    for (rank = leader_rank + 1; rank < sbgp_size; ++rank) {
       /* post wait */
        preposted_recv_frag = mca_bcol_iboffload_get_preposted_recv_frag(
                                        iboffload, rank, coll_request->qp_index);
        if(NULL == preposted_recv_frag) {
            IBOFFLOAD_VERBOSE(10, ("Failing for getting prepost recv frag.\n"));
            goto out_of_resources;
        }

        wait_task = mca_bcol_iboffload_get_wait_task(iboffload, rank, 1,
                             preposted_recv_frag, coll_request->qp_index, NULL);
        if(NULL == wait_task) {
            IBOFFLOAD_VERBOSE(10, ("Failing for getting wait task.\n"));
            goto out_of_resources;
        }

        /* Link the task into both the MQE list and the fragment's task list. */
        APPEND_TO_TASKLIST(mqe_ptr_to_set, wait_task, last_wait);
        MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(coll_fragment->task_next, wait_task);
    }

   /* end of list */
    *mqe_ptr_to_set = NULL;

    /*
     * last_wait is still NULL if the loop above never ran (subgroup without
     * non-leader ranks). Catch that here, as the proxy routine does for its
     * send task, instead of dereferencing a NULL pointer below.
     */
    assert(NULL != last_wait);

    /*
     * Request a completion for the last wait. Its original wr_id is saved in
     * the fragment and replaced by the fragment's address.
     */
    last_wait->flags |= MQE_WR_FLAG_SIGNAL;

    coll_fragment->signal_task_wr_id = last_wait->wr_id;
    last_wait->wr_id = (uint64_t) (uintptr_t) coll_fragment;

    /* post the mwr */
    rc = mca_bcol_iboffload_post_mqe_tasks(iboffload, coll_fragment->to_post);
    if(OMPI_SUCCESS != rc) {
        IBOFFLOAD_VERBOSE(10, ("MQE task posting failing.\n"));
        /* Note: need to clean up */
        return rc;
    }

    MCA_BCOL_UPDATE_ORDER_COUNTER(&iboffload->super, coll_request->order_info);

    return OMPI_SUCCESS;

out_of_resources:
    /* Release all resources */
    IBOFFLOAD_VERBOSE(10, ("Fan-in, adding collfrag to collfrag_pending"));
    return mca_bcol_iboffload_free_resources_and_move_to_pending(coll_fragment, iboffload);
}
Example #3
0
/*
 * Bcast step that exchanges the whole send buffer with the single peer stored
 * in recursive_doubling_tree.rank_extra_source.
 *
 * The send buffer (count * datatype size bytes) is registered first. Then:
 *   - the root sets up an RTR receive from the peer followed by a
 *     large-buffer send of the full buffer;
 *   - any other rank sets up an RTR send to the peer followed by a
 *     large-buffer receive of the full buffer.
 * The last wait task (or, if none, the last send task) is marked as signalled
 * and the MQE list of the request's first collective fragment is posted.
 *
 * Returns BCOL_FN_STARTED on success, OMPI_ERROR when registration or any of
 * the setup helpers fails, and the posting error code if the post fails. When
 * MQ credits are missing or other fragments are already pending, the fragment
 * is moved to the pending list and BCOL_FN_STARTED / BCOL_FN_NOT_STARTED is
 * returned depending on whether that move succeeded.
 */
static int mca_bcol_iboffload_bcast_scatter_allgather_extra_exec(mca_bcol_iboffload_module_t *iboffload_module,
        mca_bcol_iboffload_collreq_t *coll_request)
{
    mca_bcol_iboffload_collfrag_t *coll_fragment = &coll_request->first_collfrag;
    netpatterns_pair_exchange_node_t *recursive_doubling_tree =
        &iboffload_module->recursive_doubling_tree;

    struct mqe_task *last_send = NULL;
    struct mqe_task *last_wait = NULL;
    struct mqe_task *signal_task;

    /* Message length in bytes. */
    int count = coll_request->count * coll_request->dtype->super.size;
    int my_group_index = iboffload_module->ibnet->super.my_index;
    int dst;
    int rc;

    /* Bring up the bcast connections on first use. */
    if (OPAL_UNLIKELY(!iboffload_module->connection_status[RECURSIVE_DOUBLING_TREE_BCAST])) {
        bcol_iboffload_setup_binomial_connection(iboffload_module);
    }

    /* Register the user buffer in mpool/rcache. */
    rc = mca_bcol_iboffload_prepare_buffer(coll_request->buffer_info[SBUF].buf, count,
            &coll_request->buffer_info[SBUF].iboffload_reg, iboffload_module);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
        IBOFFLOAD_ERROR(("Cannot register memory: "
                         "addr - %p, %d bytes.\n",
                          coll_request->buffer_info[SBUF].buf, count));
        return OMPI_ERROR;
    }

    coll_request->buffer_info[SBUF].lkey = coll_request->buffer_info[SBUF].iboffload_reg->mr->lkey;

    /*
     * Estimated MQ consumption. Also back off when other fragments are
     * already waiting on the pending list.
     */
    if (OPAL_UNLIKELY(false == BCOL_IBOFFLOAD_MQ_HAVE_CREDITS(
                 iboffload_module, coll_fragment->mq_index, coll_fragment->mq_credits) ||
                 false == opal_list_is_empty(&iboffload_module->collfrag_pending))) {
        IBOFFLOAD_VERBOSE(10, ("There are not enough credits on MQ.\n"));
        goto out_of_resources;
    }

    /* Start with an empty MQE list. */
    coll_fragment->tail_next = &coll_fragment->to_post;

    if (coll_request->root != my_group_index) {
        /* Non-root: announce readiness to the peer, then receive everything. */
        dst = recursive_doubling_tree->rank_extra_source;

        rc = mca_bcol_iboffload_send_rtr_setup(&last_send,
                dst, iboffload_module,
                coll_fragment);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
            IBOFFLOAD_VERBOSE(10, ("Failed to setup send rtr"));
            return OMPI_ERROR;
        }

        rc = mca_bcol_iboffload_recv_large_buff_setup(&last_wait,
                SBUF, 0, count, dst,
                iboffload_module, coll_fragment);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
            IBOFFLOAD_VERBOSE(10, ("Failed to setup data receive"));
            return OMPI_ERROR;
        }
    } else {
        /* Root: wait for the peer's RTR, then send everything. */
        IBOFFLOAD_VERBOSE(10, ("I'm root of the data %d", iboffload_module->power_of_2));

        dst = recursive_doubling_tree->rank_extra_source;

        rc = mca_bcol_iboffload_recv_rtr_setup(
                &last_wait, dst, iboffload_module, coll_fragment);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
            IBOFFLOAD_VERBOSE(10, ("Failed to"
                        " mca_bcol_iboffload_recv_rtr_setup"));
            return OMPI_ERROR;
        }

        rc = mca_bcol_iboffload_send_large_buff_setup(
                &last_send, SBUF, 0, count, dst,
                iboffload_module, coll_fragment);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
            IBOFFLOAD_VERBOSE(10, ("Failed to"
                        " mca_bcol_iboffload_send_large_buff_setup"));
            return OMPI_ERROR;
        }
    }

    IBOFFLOAD_VERBOSE(10, ("Fill in the the rest of the coll_fragment.\n"));

    /* Terminate the MQE list. */
    *coll_fragment->tail_next = NULL;

    /* One more fragment accounted for on the full message descriptor. */
    coll_request->n_fragments  += 1;
    coll_request->n_frags_sent += 1;

    /*
     * Signal on the last wait when there is one, otherwise on the last send.
     * The task's original wr_id is saved in the fragment and replaced by the
     * fragment's address.
     */
    signal_task = (NULL != last_wait) ? last_wait : last_send;
    signal_task->flags |= MQE_WR_FLAG_SIGNAL;
    coll_fragment->signal_task_wr_id = signal_task->wr_id;
    signal_task->wr_id = (uint64_t) (uintptr_t) coll_fragment;

    /* Hand the list over to the MQ. */
    rc = mca_bcol_iboffload_post_mqe_tasks(iboffload_module, coll_fragment->to_post);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
        IBOFFLOAD_VERBOSE(10, ("MQE task posting failing.\n"));
        /* Note: need to clean up */
        return rc;
    }

    MCA_BCOL_UPDATE_ORDER_COUNTER(&iboffload_module->super, coll_request->order_info);

    IBOFFLOAD_VERBOSE(10, ("Return success.\n"));

    return BCOL_FN_STARTED;

out_of_resources:
    /* Give everything back and park the fragment on the pending list. */
    IBOFFLOAD_VERBOSE(10, ("Barrier, adding collfrag to collfrag_pending.\n"));
    rc =
        mca_bcol_iboffload_free_resources_and_move_to_pending(coll_fragment, iboffload_module);
    if (OMPI_SUCCESS != rc) {
        return BCOL_FN_NOT_STARTED;
    }
    return BCOL_FN_STARTED;
}
Example #4
0
/*
 * Large-message bcast built from a binomial scatter followed by a gather.
 *
 * Outline, as visible in this function:
 *   1. Register the send buffer (count * datatype size bytes).
 *   2. Root: if the tree has an extra source, exchange RTR and send it the
 *      whole buffer; then run BINOMIAL_SCATTER over the full power-of-2 range.
 *   3. Non-root: ask bcol_iboffload_binomial_root_to_src() for the source rank
 *      and the power-of-2 distance, work out which slice of the buffer to
 *      receive, receive it (RTR send + large-buffer receive) and scatter it on.
 *   4. Everyone: run bcol_iboffload_bcast_binomial_gather(), then, unless this
 *      rank acted as (virtual) root, forward the full buffer to the extra
 *      source if there is one.
 *   5. Mark the last wait (or last send) task as signalled and post the list.
 *
 * Returns BCOL_FN_STARTED on success, OMPI_ERROR (or the helper's error code
 * for the gather and post steps) on failure. When MQ credits are missing or
 * other fragments are already pending, the fragment is moved to the pending
 * list and BCOL_FN_STARTED / BCOL_FN_NOT_STARTED is returned depending on
 * whether that move succeeded.
 *
 * NOTE(review): BINOMIAL_SCATTER is a macro defined elsewhere; it may rely on
 * the local names and labels used here, so they are deliberately left as is.
 */
static int mca_bcol_iboffload_bcast_scatter_allgather_exec(mca_bcol_iboffload_module_t *iboffload_module,
        mca_bcol_iboffload_collreq_t *coll_request)
{
    netpatterns_pair_exchange_node_t *recursive_doubling_tree =
        &iboffload_module->recursive_doubling_tree;

    int rc,
        dst,
        group_src, power_of_2_distance,
        recv_count;
    size_t offset;
    /* Message length in bytes. */
    int count = coll_request->count * coll_request->dtype->super.size;
    int my_group_index = iboffload_module->ibnet->super.my_index;
    /* Per-rank slice size: count divided by power_of_2_ranks, rounded up. */
    size_t base_block_size =
        (count +  iboffload_module->power_of_2_ranks - 1) /
        iboffload_module->power_of_2_ranks;

    struct mqe_task *last_send = NULL,
                    *last_wait = NULL;
    mca_bcol_iboffload_collfrag_t *coll_fragment = &coll_request->first_collfrag;

    /* Bring up the bcast connections on first use. */
    if (OPAL_UNLIKELY(!iboffload_module->connection_status[RECURSIVE_DOUBLING_TREE_BCAST])) {
        bcol_iboffload_setup_binomial_connection(iboffload_module);
    }

    /* register memory in mpool/rcache */
    rc = mca_bcol_iboffload_prepare_buffer(coll_request->buffer_info[SBUF].buf, count,
            &coll_request->buffer_info[SBUF].iboffload_reg, iboffload_module);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
        IBOFFLOAD_ERROR(("Cannot register memory: "
                         "addr - %p, %d bytes.\n",
                          coll_request->buffer_info[SBUF].buf, count));
        return OMPI_ERROR;
    }

    coll_request->buffer_info[SBUF].lkey = coll_request->buffer_info[SBUF].iboffload_reg->mr->lkey;

    /*
     * it is estimated mq consumption...
     * Also back off when other fragments are already on the pending list.
     */
    if (OPAL_UNLIKELY(false == BCOL_IBOFFLOAD_MQ_HAVE_CREDITS(
                 iboffload_module, coll_fragment->mq_index, coll_fragment->mq_credits) ||
                 false == opal_list_is_empty(&iboffload_module->collfrag_pending))) {
        IBOFFLOAD_VERBOSE(10, ("There are not enough credits on MQ.\n"));
        goto out_of_resources;
    }

    /* Start with an empty MQE list. */
    coll_fragment->tail_next = &coll_fragment->to_post;

    if (coll_request->root == my_group_index) {
        IBOFFLOAD_VERBOSE(10, ("I'm root of the data %d %d",
                    iboffload_module->power_of_2, recursive_doubling_tree->n_extra_sources ));
        /* for proxy we have little bit more work to do */
        if (recursive_doubling_tree->n_extra_sources > 0) {
            /* send the all data to your extra peer */
            dst = recursive_doubling_tree->rank_extra_source;
            rc = mca_bcol_iboffload_recv_rtr_setup(
                    &last_wait, dst, iboffload_module, coll_fragment);
            if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
                IBOFFLOAD_VERBOSE(10, ("Failed to"
                            " mca_bcol_iboffload_recv_rtr_setup"));
                return OMPI_ERROR;
            }
            rc = mca_bcol_iboffload_send_large_buff_setup(
                    &last_send, SBUF, 0, count, dst,
                    iboffload_module, coll_fragment);
            if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
                IBOFFLOAD_VERBOSE(10, ("Failed to"
                            " mca_bcol_iboffload_send_large_buff_setup"));
                return OMPI_ERROR;
            }
        }
        power_of_2_distance = iboffload_module->power_of_2;

        BINOMIAL_SCATTER(iboffload_module, coll_fragment,
                last_wait, last_send,  power_of_2_distance - 1,
                my_group_index, base_block_size, count
                );
        /* EXIT OR GO TO Gather */
        goto GATHER;
    }

    /* prepare and post recv operation */
    group_src = bcol_iboffload_binomial_root_to_src(coll_request->root,
            my_group_index, iboffload_module->power_of_2_ranks,
            iboffload_module->group_size, &power_of_2_distance);

    IBOFFLOAD_VERBOSE(10, ("SRC %d DIST %d ranks %d gsize %d root %d my rank %d",
                group_src, power_of_2_distance, iboffload_module->power_of_2_ranks,
                iboffload_module->group_size,
                coll_request->root, my_group_index));
    assert(group_src >= 0);

    if (0 > power_of_2_distance) {
        /* the rank is virtual root for this group, receive the data
           and scatter gather as root */
        power_of_2_distance =
            iboffload_module->power_of_2;
        offset = 0;
        recv_count = count;
        IBOFFLOAD_VERBOSE(10, ("Virtual root %d , set mask to %d",
                    my_group_index, power_of_2_distance));
    } else {
        int my_left_boundary_rank;
        int delta;
        recv_count = base_block_size * (1 << power_of_2_distance); /* we may receive larger data */
        /*
         * Round my rank down to a multiple of 2^power_of_2_distance. The mask
         * is built from a positive value because left-shifting a negative int
         * (the previous (~(int)0) << n form) is undefined behaviour in C.
         */
        my_left_boundary_rank = my_group_index & ~((1 << power_of_2_distance) - 1);
        offset = (size_t) (base_block_size * my_left_boundary_rank);
        delta = count - offset;
        if (OPAL_UNLIKELY(delta <= 0)) {
            /* no data to recv */
            goto GATHER;
        } else {
            /* Clamp to what is actually left in the buffer. */
            recv_count = (delta < recv_count) ? delta : recv_count;
        }

        IBOFFLOAD_VERBOSE(10, ("Recv data set mask to %d",
                    power_of_2_distance));
    }

    /* One conversion per argument; offset is a size_t, hence %zu. */
    IBOFFLOAD_VERBOSE(10, ("Bcast, receive data from %d, count %d, offset %zu",
                group_src, recv_count, offset));

    /* Receive data to user buffer */
    rc = mca_bcol_iboffload_send_rtr_setup(&last_send,
                                group_src, iboffload_module,
                                coll_fragment);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
        IBOFFLOAD_VERBOSE(10, ("Failed to setup send rtr"));
        return OMPI_ERROR;
    }

    rc = mca_bcol_iboffload_recv_large_buff_setup(&last_wait,
                                SBUF, offset, recv_count, group_src,
                                iboffload_module, coll_fragment);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
        IBOFFLOAD_VERBOSE(10, ("Failed to setup data receive"));
        return OMPI_ERROR;
    }

    BINOMIAL_SCATTER(iboffload_module, coll_fragment,
            last_wait, last_send, power_of_2_distance - 1,
            my_group_index, base_block_size, count);

GATHER:
    rc = bcol_iboffload_bcast_binomial_gather(iboffload_module,
            &last_send, &last_wait, coll_fragment,
            count, base_block_size, power_of_2_distance);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
        IBOFFLOAD_VERBOSE(10, ("Failed to setup gather. Return %d", rc));
        return rc;
    }

    /*
     * Forward the complete buffer to the extra source, unless this rank
     * acted as (virtual) root; that case is recognised by
     * power_of_2_distance == power_of_2. The real root has already sent to
     * its extra source above.
     */
    if (recursive_doubling_tree->n_extra_sources > 0 &&
            iboffload_module->power_of_2 != power_of_2_distance) {
        dst = recursive_doubling_tree->rank_extra_source;

        rc = mca_bcol_iboffload_recv_rtr_setup(
                &last_wait, dst, iboffload_module, coll_fragment);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
            IBOFFLOAD_VERBOSE(10, ("Failed to"
                        " mca_bcol_iboffload_recv_rtr_setup"));
            return OMPI_ERROR;
        }

        rc = mca_bcol_iboffload_send_large_buff_setup(
                &last_send, SBUF, 0, count, dst,
                iboffload_module, coll_fragment);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
            IBOFFLOAD_VERBOSE(10, ("Failed to"
                        " mca_bcol_iboffload_send_large_buff_setup"));
            return OMPI_ERROR;
        }
    }

    IBOFFLOAD_VERBOSE(10, ("Fill in the the rest of the coll_fragment.\n"));

    /* end of list */
    *coll_fragment->tail_next = NULL;

    /* finish initializing full message descriptor */
    coll_request->n_fragments  += 1;
    coll_request->n_frags_sent += 1;

    /*
     * Signal on the last wait when there is one, otherwise on the last send.
     * The task's original wr_id is saved in the fragment and replaced by the
     * fragment's address.
     */
    if (NULL != last_wait) {
        last_wait->flags |= MQE_WR_FLAG_SIGNAL;
        coll_fragment->signal_task_wr_id = last_wait->wr_id;
        last_wait->wr_id = (uint64_t) (uintptr_t) coll_fragment;
    } else {
        last_send->flags |= MQE_WR_FLAG_SIGNAL;
        coll_fragment->signal_task_wr_id = last_send->wr_id;
        last_send->wr_id = (uint64_t) (uintptr_t) coll_fragment;
    }

    /* post the mwr */
    rc = mca_bcol_iboffload_post_mqe_tasks(iboffload_module, coll_fragment->to_post);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
        IBOFFLOAD_VERBOSE(10, ("MQE task posting failing.\n"));
        /* Note: need to clean up */
        return rc;
    }

    MCA_BCOL_UPDATE_ORDER_COUNTER(&iboffload_module->super, coll_request->order_info);

    IBOFFLOAD_VERBOSE(10, ("Return success.\n"));

    return BCOL_FN_STARTED;

out_of_resources:
    /* Release all resources */
    IBOFFLOAD_VERBOSE(10, ("Bcast, adding collfrag to collfrag_pending.\n"));
    rc =
        mca_bcol_iboffload_free_resources_and_move_to_pending(coll_fragment, iboffload_module);
    return (OMPI_SUCCESS != rc) ? BCOL_FN_NOT_STARTED : BCOL_FN_STARTED;
}
Example #5
0
/*
 * Small-message bcast step that exchanges the packed payload with the single
 * peer stored in recursive_doubling_tree.rank_extra_source.
 *
 * The root sets up a small-buffer send of pack_len bytes to that peer; any
 * other rank sets up a small-buffer receive from it. The resulting task is
 * marked as signalled and the MQE list of the request's first collective
 * fragment is posted. The send-buffer lkey is taken from the module's RDMA
 * block.
 *
 * Returns BCOL_FN_STARTED on success, OMPI_ERROR if the receive setup fails,
 * and the posting error code if the post fails. When MQ credits are missing
 * or the send setup fails, the fragment is moved to the pending list and
 * BCOL_FN_STARTED / BCOL_FN_NOT_STARTED is returned depending on whether that
 * move succeeded.
 */
static int mca_bcol_iboffload_small_msg_bcast_extra_exec(mca_bcol_iboffload_module_t *iboffload_module,
                                                   mca_bcol_iboffload_collreq_t *coll_request)
{
    mca_bcol_iboffload_collfrag_t *coll_fragment = &coll_request->first_collfrag;
    netpatterns_pair_exchange_node_t *recursive_doubling_tree =
        &iboffload_module->recursive_doubling_tree;

    struct mqe_task *last_send = NULL;
    struct mqe_task *last_wait = NULL;
    struct mqe_task *signal_task;

    int my_group_index = iboffload_module->super.sbgp_partner_module->my_index;
    uint32_t pack_len;
    int dst;
    int rc;

    IBOFFLOAD_VERBOSE(10,("Entering small msg extra iboffload bcast"));

    /* Bring up the bcast connections on first use. */
    if (OPAL_UNLIKELY(!iboffload_module->connection_status[RECURSIVE_DOUBLING_TREE_BCAST])) {
        IBOFFLOAD_VERBOSE(10,("Bcast open new connection "));
        bcol_iboffload_setup_binomial_connection(iboffload_module);
    }

    /* Payload length in bytes. */
    pack_len = coll_request->count * coll_request->dtype->super.size;
    coll_request->buffer_info[SBUF].lkey = iboffload_module->rdma_block.ib_info.lkey;

    IBOFFLOAD_VERBOSE(10,("My packet length %d pack_len frag_count %d dtype size %d ",
                            pack_len,
                            coll_request->count,
                            coll_request->dtype->super.size));

    /* Estimated MQ consumption. */
    if (OPAL_UNLIKELY(false == BCOL_IBOFFLOAD_MQ_HAVE_CREDITS(
                 iboffload_module,
                 coll_fragment->mq_index, coll_fragment->mq_credits))) {
        IBOFFLOAD_VERBOSE(10, ("There are not enough credits on MQ.\n"));
        goto out_of_resources;
    }

    /* Start with an empty MQE list. */
    coll_fragment->tail_next = &coll_fragment->to_post;

    if (coll_request->root != my_group_index) {
        /* Non-root: receive the payload from the peer. */
        dst = recursive_doubling_tree->rank_extra_source;

        rc = mca_bcol_iboffload_recv_small_buff_setup(&last_wait,
                pack_len, dst,
                iboffload_module, coll_fragment);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
            IBOFFLOAD_VERBOSE(10, ("Failed to setup data receive"));
            return OMPI_ERROR;
        }
    } else {
        /* Root: send the payload to the peer. */
        IBOFFLOAD_VERBOSE(10, ("I'm root of the data %d", iboffload_module->power_of_2));

        dst = recursive_doubling_tree->rank_extra_source;
        IBOFFLOAD_VERBOSE(10,("Im extra root sending data to %d \n",dst));

        rc = mca_bcol_iboffload_send_small_buff_setup(
                &last_send, pack_len, dst,
                iboffload_module, coll_fragment);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
            IBOFFLOAD_VERBOSE(10, ("Failed to"
                        " mca_bcol_iboffload_send_small_buff_setup"));
            goto out_of_resources;
        }
    }

    /* Terminate the MQE list. */
    *coll_fragment->tail_next = NULL;

    /* The message descriptor covers exactly one fragment. */
    coll_request->n_fragments  = 1;
    coll_request->n_frags_sent = 1;

    /*
     * Signal on the wait task when there is one, otherwise on the send task.
     * The task's original wr_id is saved in the fragment and replaced by the
     * fragment's address.
     */
    signal_task = (NULL != last_wait) ? last_wait : last_send;
    signal_task->flags |= MQE_WR_FLAG_SIGNAL;
    coll_fragment->signal_task_wr_id = signal_task->wr_id;
    signal_task->wr_id = (uint64_t) (uintptr_t) coll_fragment;

    /* Hand the list over to the MQ. */
    rc = mca_bcol_iboffload_post_mqe_tasks(iboffload_module, coll_fragment->to_post);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
        IBOFFLOAD_VERBOSE(10, ("MQE task posting failing.\n"));
        /* Note: need to clean up */
        return rc;
    }

    MCA_BCOL_UPDATE_ORDER_COUNTER(&iboffload_module->super, coll_request->order_info);

    IBOFFLOAD_VERBOSE(10, ("Return success.\n"));
    return BCOL_FN_STARTED;

out_of_resources:
    /* Give everything back and park the fragment on the pending list. */
    IBOFFLOAD_VERBOSE(10, ("Barrier, adding collfrag to collfrag_pending.\n"));
    rc =
        mca_bcol_iboffload_free_resources_and_move_to_pending(coll_fragment, iboffload_module);
    if (OMPI_SUCCESS != rc) {
        return BCOL_FN_NOT_STARTED;
    }
    return BCOL_FN_STARTED;
}
Example #6
0
/*
 * Small-message bcast over the binomial tree.
 *
 * Outline, as visible in this function:
 *   - Root: if the tree has an extra source, set up a small-buffer send of the
 *     payload to it; then run binomial_scatter_smsg() over the full
 *     power-of-2 range.
 *   - Non-root: ask bcol_iboffload_binomial_root_to_src() for the source rank
 *     and the power-of-2 distance, set up a small-buffer receive from that
 *     source, run binomial_scatter_smsg(), and finally forward the payload to
 *     the extra source if there is one and this rank did not act as virtual
 *     root.
 *   - Everyone: mark the last wait (or last send) task as signalled and post
 *     the MQE list of the request's first collective fragment.
 *
 * Returns BCOL_FN_STARTED on success and the posting error code if the post
 * fails. When MQ credits are missing or any setup helper fails, the fragment
 * is moved to the pending list and BCOL_FN_STARTED / BCOL_FN_NOT_STARTED is
 * returned depending on whether that move succeeded.
 */
static int mca_bcol_iboffload_small_msg_bcast_exec(mca_bcol_iboffload_module_t *iboffload_module,
                                                   mca_bcol_iboffload_collreq_t *coll_request)
{
    netpatterns_pair_exchange_node_t *recursive_doubling_tree =
        &iboffload_module->recursive_doubling_tree;

    int rc,
        distance_mask_pow , dst,
        group_src, power_of_2_distance;

    uint32_t pack_len;
    int my_group_index = iboffload_module->super.sbgp_partner_module->my_index;

    struct mqe_task *last_send = NULL,
                    *last_wait = NULL;
    mca_bcol_iboffload_collfrag_t *coll_fragment = &coll_request->first_collfrag;

    IBOFFLOAD_VERBOSE(10,("Entering small msg iboffload bcast"));

    /* Bring up the bcast connections on first use. */
    if (OPAL_UNLIKELY(!iboffload_module->connection_status[RECURSIVE_DOUBLING_TREE_BCAST])) {
        IBOFFLOAD_VERBOSE(10,("Bcast open new connection "));
        bcol_iboffload_setup_binomial_connection(iboffload_module);
    }

    /* Payload length in bytes. */
    pack_len = coll_request->count * coll_request->dtype->super.size;
    IBOFFLOAD_VERBOSE(10,("My packet length %d pack_len frag_count %d dtype size %d ",
                            pack_len,
                            coll_request->count,
                            coll_request->dtype->super.size));

    /* it is estimated mq consumption... */
    if (OPAL_UNLIKELY(false == BCOL_IBOFFLOAD_MQ_HAVE_CREDITS(
                 iboffload_module, coll_fragment->mq_index, coll_fragment->mq_credits))) {
        IBOFFLOAD_VERBOSE(10, ("There are not enough credits on MQ.\n"));
        goto out_of_resources;
    }

    /* Start with an empty MQE list; the lkey comes from the RDMA block. */
    coll_fragment->tail_next = &coll_fragment->to_post;
    coll_request->buffer_info[SBUF].lkey = iboffload_module->rdma_block.ib_info.lkey;

    if (coll_request->root == my_group_index) {
        IBOFFLOAD_VERBOSE(10, ("I'm root of the data"));

        /* Send data to the extra peer */
        if (recursive_doubling_tree->n_extra_sources > 0) {
            /* send the all data to your extra peer */
            dst = recursive_doubling_tree->rank_extra_source;
            IBOFFLOAD_VERBOSE(10,("Sending the dat to Dst %d",dst));
            rc = mca_bcol_iboffload_send_small_buff_setup(
                    &last_send, pack_len, dst,
                    iboffload_module, coll_fragment);
            if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
                /* The log now names the function that was actually called. */
                IBOFFLOAD_VERBOSE(10, ("Failed to"
                            " mca_bcol_iboffload_send_small_buff_setup"));
                goto out_of_resources;
            }
        }

        distance_mask_pow =
            iboffload_module->power_of_2 - 1;

        rc = binomial_scatter_smsg(iboffload_module, coll_fragment,
                &last_send, distance_mask_pow,
                my_group_index, pack_len);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
            IBOFFLOAD_VERBOSE(10, ("Failed to binomial_scatter_smsg"));
            goto out_of_resources;
        }

        goto finalize;
    }

    /* prepare and post recv operation */
    group_src = bcol_iboffload_binomial_root_to_src(coll_request->root,
            my_group_index, iboffload_module->power_of_2_ranks,
            iboffload_module->group_size, &power_of_2_distance);
    assert(group_src >= 0);

    if (0 > power_of_2_distance) {
        /* the rank is virtual root for this group, receive the data
           and scatter gather as root */
        IBOFFLOAD_VERBOSE(10,("Virtual root distance_mask_pow %d ",iboffload_module->power_of_2));
        distance_mask_pow = iboffload_module->power_of_2 - 1;
    } else {
        distance_mask_pow = power_of_2_distance - 1;
    }

    /*
     * One conversion per argument; the previous format string had four
     * conversions but was given only group_src.
     */
    IBOFFLOAD_VERBOSE(10, ("Bcast, receive data from %d, pack_len %u",
                group_src, (unsigned int) pack_len));

    rc = mca_bcol_iboffload_recv_small_buff_setup(&last_wait,
                                pack_len, group_src,
                                iboffload_module, coll_fragment);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
        IBOFFLOAD_VERBOSE(10, ("Failed to setup data receive"));
        goto out_of_resources;
    }

    rc = binomial_scatter_smsg(iboffload_module, coll_fragment,
            &last_send, distance_mask_pow,
            my_group_index, pack_len);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
        IBOFFLOAD_VERBOSE(10, ("Failed to binomial_scatter_smsg"));
        goto out_of_resources;
    }

    /*
     * Forward the payload to the extra source unless this rank acted as
     * virtual root (distance_mask_pow == power_of_2 - 1).
     *
     * NOTE: an earlier variant, previously kept here as commented-out code,
     * used the condition
     *   my_group_index + power_of_2_ranks < group_size
     * and targeted rank my_group_index + power_of_2_ranks instead.
     */
    if (recursive_doubling_tree->n_extra_sources > 0 &&
            iboffload_module->power_of_2 - 1 != distance_mask_pow) {
        dst = recursive_doubling_tree->rank_extra_source;

        rc = mca_bcol_iboffload_send_small_buff_setup(
                &last_send, pack_len, dst,
                iboffload_module, coll_fragment);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
            IBOFFLOAD_VERBOSE(10, ("Failed to"
                        " mca_bcol_iboffload_send_small_buff_setup"));
            goto out_of_resources;
        }
    }

finalize:
    /* end of list */
    *coll_fragment->tail_next = NULL;

    /* finish initializing full message descriptor */
    (coll_request)->n_fragments  += 1;
    (coll_request)->n_frags_sent += 1;

    /*
     * Signal on the last wait when there is one, otherwise on the last send.
     * The task's original wr_id is saved in the fragment and replaced by the
     * fragment's address.
     */
    if (NULL != last_wait) {
        last_wait->flags |= MQE_WR_FLAG_SIGNAL;
        coll_fragment->signal_task_wr_id = last_wait->wr_id;
        last_wait->wr_id = (uint64_t) (uintptr_t) coll_fragment;
    } else {
        last_send->flags |= MQE_WR_FLAG_SIGNAL;
        coll_fragment->signal_task_wr_id = last_send->wr_id;
        last_send->wr_id = (uint64_t) (uintptr_t) coll_fragment;
    }
    /* post the mwr */
    rc = mca_bcol_iboffload_post_mqe_tasks(iboffload_module, coll_fragment->to_post);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
        IBOFFLOAD_VERBOSE(10, ("MQE task posting failing.\n"));
        /* Note: need to clean up */
        return rc;
    }

    MCA_BCOL_UPDATE_ORDER_COUNTER(&iboffload_module->super, coll_request->order_info);

    IBOFFLOAD_VERBOSE(10, ("Return success.\n"));
    return BCOL_FN_STARTED;

out_of_resources:
    /* Release all resources */
    IBOFFLOAD_VERBOSE(10, ("Bcast, adding collfrag to collfrag_pending.\n"));
    rc =
        mca_bcol_iboffload_free_resources_and_move_to_pending(coll_fragment, iboffload_module);
    return (OMPI_SUCCESS != rc) ? BCOL_FN_NOT_STARTED : BCOL_FN_STARTED;
}