Example #1
0
/*
 * Build the table of ML buffer descriptors for a memory region that holds
 * num_banks * num_buffers_per_bank buffers laid out back to back, starting
 * at base_addr, with a stride of size_buffer bytes.
 *
 * desc                  [out] receives the calloc'ed descriptor table; it is
 *                             NULL whenever the function returns OMPI_ERROR
 * base_addr             start of the buffer region
 * num_banks             number of banks
 * num_buffers_per_bank  number of buffers in each bank
 * size_buffer           distance in bytes between two consecutive buffers
 * header_size           bytes skipped at the start of every buffer before
 *                       the data address
 * group_size            not used by this function
 * pow_k                 K-nomial power; 0 is treated as 1
 *
 * Returns OMPI_SUCCESS, or OMPI_ERROR if any allocation fails. On failure
 * everything allocated by this call is released again.
 */
static int init_ml_buf_desc(mca_bcol_ptpcoll_ml_buffer_desc_t **desc, void *base_addr, uint32_t num_banks,
                            uint32_t num_buffers_per_bank, uint32_t size_buffer, uint32_t header_size, int group_size, int pow_k)
{
    uint32_t i, j, ci;
    mca_bcol_ptpcoll_ml_buffer_desc_t *tmp_desc = NULL;
    int k_nomial_radix = mca_bcol_ptpcoll_component.k_nomial_radix;
    int pow_k_val = (0 == pow_k) ? 1 : pow_k;
    /* *2 is for gather session, +1 for extra peer */
    int knomial_reqs = (k_nomial_radix - 1) * pow_k_val * 2 + 1;
    int narray_reqs = mca_bcol_ptpcoll_component.narray_radix * 2;
    /*
     * Take the larger of the two needs. The previous code compared the
     * K-nomial count against narray_radix (not narray_radix * 2) and could
     * therefore pick a value smaller than narray_radix * 2; the maximum
     * never allocates fewer slots than the old expression did.
     */
    int num_to_alloc = (knomial_reqs > narray_reqs) ? knomial_reqs : narray_reqs;

    (void) group_size;

    /* size_t arithmetic so that the element count cannot wrap at 32 bits */
    *desc = (mca_bcol_ptpcoll_ml_buffer_desc_t *)calloc((size_t) num_banks * num_buffers_per_bank,
                                                        sizeof(mca_bcol_ptpcoll_ml_buffer_desc_t));
    if (NULL == *desc) {
        PTPCOLL_ERROR(("Failed to allocate memory"));
        return OMPI_ERROR;
    }

    tmp_desc = *desc;

    for (i = 0; i < num_banks; i++) {
        for (j = 0; j < num_buffers_per_bank; j++) {
            ci = i * num_buffers_per_bank + j;
            tmp_desc[ci].bank_index = i;
            tmp_desc[ci].buffer_index = j;
            tmp_desc[ci].requests = (ompi_request_t **)
                calloc(num_to_alloc, sizeof(ompi_request_t *));
            if (NULL == tmp_desc[ci].requests) {
                PTPCOLL_ERROR(("Failed to allocate memory for requests"));
                /*
                 * Descriptors are filled in increasing ci order, so every
                 * index below ci owns a request array: release them, then
                 * the table itself, and do not hand a half-built table back.
                 */
                while (ci > 0) {
                    --ci;
                    free(tmp_desc[ci].requests);
                }
                free(tmp_desc);
                *desc = NULL;
                return OMPI_ERROR;
            }
            /*
             * ptpcoll does not have any header, but other bcols may have
             * one, so it has to be taken into account. The offset is
             * computed in size_t to avoid 32-bit wrap-around for large
             * regions.
             */
            tmp_desc[ci].data_addr = (void *)
                ((unsigned char*)base_addr + (size_t) ci * size_buffer + header_size);
            PTPCOLL_VERBOSE(10, ("ml memory cache setup %d %d - %p", (int) i, (int) j, tmp_desc[ci].data_addr));

            /* init reduce implementation flags */
            tmp_desc[ci].reduce_init_called = false;
            tmp_desc[ci].reduction_status = 0;
        }
    }

    return OMPI_SUCCESS;
}
/*
 * Progress function registered for the "extra" rank in both barrier
 * flavors (recursive doubling and K-nomial).
 *
 * input_args  carries the collective request in bcol_opaque_data, as stored
 *             there by the matching init function
 * const_args  gives access to the ptpcoll module through bcol_module
 *
 * Asks mca_bcol_ptpcoll_test_all_for_match() about the two requests kept in
 * the collective request. Returns BCOL_FN_STARTED while they are pending,
 * BCOL_FN_COMPLETE once they are done, or the error code reported by the
 * test.
 */
static int bcol_ptpcoll_barrier_extra_node_progress(
                            bcol_function_args_t *input_args,
                            struct coll_ml_function_t *const_args)
{
    int rc, completed, num_reqs = 2;

    mca_bcol_ptpcoll_module_t *ptpcoll_module =
                    (mca_bcol_ptpcoll_module_t *) const_args->bcol_module;

    mca_bcol_ptpcoll_collreq_t *collreq =
                    (mca_bcol_ptpcoll_collreq_t *) input_args->bcol_opaque_data;

    ompi_request_t **requests = collreq->requests;

    /* test for completion */
    completed =
        mca_bcol_ptpcoll_test_all_for_match(&num_reqs, requests, &rc);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
        PTPCOLL_ERROR(("Test for all failed."));
        return rc;
    }

    if (!completed) {
        return BCOL_FN_STARTED;
    }

    /*
     * The init functions give the collective request back to collreqs_free
     * when the barrier completes immediately; do the same when completion
     * is only detected here, otherwise the free-list item is never
     * returned.
     */
    OMPI_FREE_LIST_RETURN_MT(&ptpcoll_module->collreqs_free, (ompi_free_list_item_t *) collreq);
    return BCOL_FN_COMPLETE;
}
/*
 * Register the barrier init/progress function pair for this module.
 *
 * super       the bcol base module; it is also used as the ptpcoll module
 * bcoll_type  collective type stored in the communication attributes
 *
 * The pair is chosen from mca_bcol_ptpcoll_component.barrier_alg:
 *   1 - recursive doubling; the "extra" variant is used when the module's
 *       pow_2type is PTPCOLL_EXTRA
 *   2 - recursive K-nomial; the "extra" variant is used when the exchange
 *       tree has extra sources and this node is an EXTRA_NODE
 * Any other value only logs an error.
 *
 * Always returns OMPI_SUCCESS.
 * NOTE(review): an unknown barrier_alg registers nothing yet still reports
 * success - confirm that callers expect this.
 */
static int mca_bcol_ptpcoll_barrier_setup(mca_bcol_base_module_t *super, int bcoll_type)
{
    mca_bcol_ptpcoll_module_t *module = (mca_bcol_ptpcoll_module_t *) super;
    netpatterns_k_exchange_node_t *knomial_tree;

    mca_bcol_base_coll_fn_comm_attributes_t comm_attribs;
    mca_bcol_base_coll_fn_invoke_attributes_t inv_attribs;

    /* communicator-level attributes */
    comm_attribs.bcoll_type = bcoll_type;
    comm_attribs.comm_size_min = 0;
    comm_attribs.comm_size_max = 1024 * 1024;
    comm_attribs.data_src = DATA_SRC_KNOWN;
    comm_attribs.waiting_semantics = NON_BLOCKING;

    /* invocation-level attributes */
    inv_attribs.bcol_msg_min = 0;
    inv_attribs.bcol_msg_max = 20000; /* range 1 */
    inv_attribs.datatype_bitmap = 0xffffffff;
    inv_attribs.op_types_bitmap = 0xffffffff;

    switch (mca_bcol_ptpcoll_component.barrier_alg) {
        case 1:
            if (PTPCOLL_EXTRA != module->pow_2type) {
                mca_bcol_base_set_attributes(super, &comm_attribs, &inv_attribs,
                    bcol_ptpcoll_barrier_recurs_dbl_new,
                    bcol_ptpcoll_barrier_recurs_dbl_new_progress);
            } else {
                mca_bcol_base_set_attributes(super, &comm_attribs, &inv_attribs,
                    bcol_ptpcoll_barrier_recurs_dbl_extra_new,
                    bcol_ptpcoll_barrier_extra_node_progress);
            }
            break;

        case 2:
            knomial_tree = &module->knomial_exchange_tree;
            if (knomial_tree->n_extra_sources > 0 &&
                           EXTRA_NODE == knomial_tree->node_type) {
                mca_bcol_base_set_attributes(super, &comm_attribs, &inv_attribs,
                    bcol_ptpcoll_barrier_recurs_knomial_extra_new,
                    bcol_ptpcoll_barrier_extra_node_progress);
            } else {
                mca_bcol_base_set_attributes(super, &comm_attribs, &inv_attribs,
                    bcol_ptpcoll_barrier_recurs_knomial_new,
                    bcol_ptpcoll_barrier_recurs_knomial_new_progress);
            }
            break;

        default:
            PTPCOLL_ERROR(("Wrong barrier_alg flag value."));
            break;
    }

    return OMPI_SUCCESS;
}
Example #4
0
/*
 * Fill in the static K-nomial information of a ptpcoll module.
 *
 * The radix is the component's k_nomial_radix clamped to the group size.
 * pow_k and pow_knum come from pow_k_calc(). kn_proxy_extra_index is
 * allocated with (radix - 1) entries and the rank is classified:
 *   - my_index >= pow_knum: PTPCOLL_KN_EXTRA, entry 0 holds the index of
 *     the proxy
 *   - my_index <  pow_knum and my_index < group_size - pow_knum: entries
 *     hold the extra ranks served by this rank (PTPCOLL_KN_PROXY) and
 *     kn_proxy_extra_num holds their count
 *   - otherwise: PTPCOLL_KN_IN_GROUP
 *
 * Returns OMPI_SUCCESS, or OMPI_ERROR if the index array cannot be
 * allocated.
 */
static int load_knomial_info(mca_bcol_ptpcoll_module_t *ptpcoll_module)
{
    int i;
    mca_bcol_ptpcoll_component_t *cm = &mca_bcol_ptpcoll_component;

    ptpcoll_module->k_nomial_radix =
        cm->k_nomial_radix > ptpcoll_module->group_size ?
        ptpcoll_module->group_size :
        cm->k_nomial_radix;

    ptpcoll_module->pow_k = pow_k_calc(ptpcoll_module->k_nomial_radix,
                                       ptpcoll_module->group_size,
                                       &ptpcoll_module->pow_knum);

    ptpcoll_module->kn_proxy_extra_index = (int *)
        malloc(sizeof(int) * (ptpcoll_module->k_nomial_radix - 1));
    if (NULL == ptpcoll_module->kn_proxy_extra_index) {
        /*
         * Nothing has been allocated at this point, so there is nothing to
         * release. The former error label called free() only when the
         * pointer was NULL (inverted check) and never did any real work.
         */
        PTPCOLL_ERROR(("Failed to allocate memory"));
        return OMPI_ERROR;
    }

    /* Setting peer type for K-nomial algorithm */
    if (ptpcoll_module->super.sbgp_partner_module->my_index < ptpcoll_module->pow_knum) {
        if (ptpcoll_module->super.sbgp_partner_module->my_index <
            ptpcoll_module->group_size - ptpcoll_module->pow_knum) {
            /*
             * NOTE(review): when the very first candidate is already
             * outside the group this loop runs zero times, pow_ktype is
             * left untouched and kn_proxy_extra_num becomes 0 - confirm
             * that pow_ktype has a sensible value from elsewhere in that
             * case.
             */
            for (i = 0; i < (ptpcoll_module->k_nomial_radix - 1); i++) {
                /* i-th extra rank served by this proxy */
                const int extra_rank =
                    ptpcoll_module->super.sbgp_partner_module->my_index *
                    (ptpcoll_module->k_nomial_radix - 1) +
                    i + ptpcoll_module->pow_knum;

                if (extra_rank >= ptpcoll_module->group_size) {
                    break;
                }

                ptpcoll_module->pow_ktype = PTPCOLL_KN_PROXY;
                ptpcoll_module->kn_proxy_extra_index[i] = extra_rank;
                PTPCOLL_VERBOSE(10 ,("My type is proxy, pow_knum = %d [%d] my extra %d",
                                     ptpcoll_module->pow_knum,
                                     ptpcoll_module->pow_k,
                                     ptpcoll_module->kn_proxy_extra_index[i]));
            }
            ptpcoll_module->kn_proxy_extra_num = i;
        } else {
            PTPCOLL_VERBOSE(10 ,("My type is in group, pow_knum = %d [%d]", ptpcoll_module->pow_knum,
                                 ptpcoll_module->pow_k));
            ptpcoll_module->pow_ktype = PTPCOLL_KN_IN_GROUP;
        }
    } else {
        ptpcoll_module->pow_ktype = PTPCOLL_KN_EXTRA;
        /* entry 0 stores the proxy that serves this extra rank */
        ptpcoll_module->kn_proxy_extra_index[0] = (ptpcoll_module->super.sbgp_partner_module->my_index -
                                                   ptpcoll_module->pow_knum) / (ptpcoll_module->k_nomial_radix - 1);
        PTPCOLL_VERBOSE(10 ,("My type is extra , pow_knum = %d [%d] my proxy %d",
                             ptpcoll_module->pow_knum,
                             ptpcoll_module->pow_k,
                             ptpcoll_module->kn_proxy_extra_index[0]));
    }

    return OMPI_SUCCESS;
}
Example #5
0
/*
 * Set up the static information for the N-array scatter / K-nomial gather.
 *
 * Computes the full N-array tree size with calc_full_tree_size(), allocates
 * narray_knomial_proxy_extra_index (narray_knomial_radix entries) and
 * narray_knomial_node (one entry per rank of the full tree), and then
 * classifies the rank:
 *   - my_index >= full tree size: PTPCOLL_EXTRA, entry 0 of the index array
 *     holds the index of the proxy
 *   - my_index < group_size - full tree size: PTPCOLL_PROXY, the index
 *     array lists the extra ranks served and narray_knomial_proxy_num
 *     holds their count
 *   - otherwise: PTPCOLL_IN_GROUP
 * For ranks inside the full tree every tree node is initialized with
 * netpatterns_setup_narray_knomial_tree().
 *
 * Returns OMPI_SUCCESS, or OMPI_ERROR on failure. On failure both arrays
 * allocated here are freed and their module pointers reset to NULL.
 * NOTE(review): if netpatterns_setup_narray_knomial_tree() allocates per
 * node, nodes set up before a failing one may need extra cleanup - verify.
 */
static int load_narray_knomial_tree (mca_bcol_ptpcoll_module_t *ptpcoll_module)
{
    int rc, i, peer;
    mca_bcol_ptpcoll_component_t *cm = &mca_bcol_ptpcoll_component;

    ptpcoll_module->full_narray_tree_size = calc_full_tree_size(
        cm->narray_knomial_radix,
        ptpcoll_module->group_size,
        &ptpcoll_module->full_narray_tree_num_leafs);

    ptpcoll_module->narray_knomial_proxy_extra_index = (int *)
        malloc(sizeof(int) * (cm->narray_knomial_radix));
    if (NULL == ptpcoll_module->narray_knomial_proxy_extra_index) {
        /*
         * Nothing owned by this call exists yet. Return directly instead of
         * going through the cleanup below, which would free
         * narray_knomial_node before this function has assigned it.
         */
        PTPCOLL_ERROR(("Failed to allocate memory"));
        return OMPI_ERROR;
    }

    ptpcoll_module->narray_knomial_node = calloc(
        ptpcoll_module->full_narray_tree_size,
        sizeof(netpatterns_narray_knomial_tree_node_t));
    if(NULL == ptpcoll_module->narray_knomial_node) {
        PTPCOLL_ERROR(("Failed to allocate memory"));
        goto Error;
    }

    PTPCOLL_VERBOSE(10 ,("My type is proxy, full tree size = %d [%d]",
                         ptpcoll_module->full_narray_tree_size,
                         cm->narray_knomial_radix
                        ));

    if (ptpcoll_module->super.sbgp_partner_module->my_index <
        ptpcoll_module->full_narray_tree_size) {
        if (ptpcoll_module->super.sbgp_partner_module->my_index <
            ptpcoll_module->group_size - ptpcoll_module->full_narray_tree_size) {
            ptpcoll_module->narray_type = PTPCOLL_PROXY;
            for (i = 0; i < cm->narray_knomial_radix; i++) {
                peer =
                    ptpcoll_module->super.sbgp_partner_module->my_index *
                    cm->narray_knomial_radix + i +
                    ptpcoll_module->full_narray_tree_size;
                if (peer >= ptpcoll_module->group_size) {
                    break;
                }
                ptpcoll_module->narray_knomial_proxy_extra_index[i] = peer;
            }
            ptpcoll_module->narray_knomial_proxy_num = i;
        } else {
            ptpcoll_module->narray_type = PTPCOLL_IN_GROUP;
        }
        /* Setting node info */
        for(i = 0; i < ptpcoll_module->full_narray_tree_size; i++) {
            rc = netpatterns_setup_narray_knomial_tree(
                cm->narray_knomial_radix,
                i,
                ptpcoll_module->full_narray_tree_size,
                &ptpcoll_module->narray_knomial_node[i]);
            if(OMPI_SUCCESS != rc) {
                goto Error;
            }
        }
    } else {
        ptpcoll_module->narray_type = PTPCOLL_EXTRA;
        /* entry 0 stores the proxy that serves this extra rank */
        ptpcoll_module->narray_knomial_proxy_extra_index[0] =
            (ptpcoll_module->super.sbgp_partner_module->my_index -
             ptpcoll_module->full_narray_tree_size) /
            cm->narray_knomial_radix;
    }

    return OMPI_SUCCESS;

Error:
    /*
     * free(NULL) is a no-op, so no guards are needed. The pointers are
     * reset so that the module does not keep dangling references that a
     * later cleanup could free a second time.
     */
    free(ptpcoll_module->narray_knomial_node);
    ptpcoll_module->narray_knomial_node = NULL;
    free(ptpcoll_module->narray_knomial_proxy_extra_index);
    ptpcoll_module->narray_knomial_proxy_extra_index = NULL;
    return OMPI_ERROR;
}
/*
 * Barrier init function for the "extra" rank of the recursive doubling
 * algorithm.
 *
 * input_args  supplies sequence_num and receives the collective request in
 *             bcol_opaque_data
 * const_args  gives access to the ptpcoll module through bcol_module
 *
 * The extra rank does not take part in the exchange. It posts a
 * zero-length isend to its proxy ("I am here") and a zero-length irecv
 * from the same proxy ("the rest are done"), then tests both requests.
 *
 * Returns BCOL_FN_COMPLETE when both requests finished immediately (the
 * collective request is returned to the free list), BCOL_FN_STARTED when
 * they are still pending, OMPI_ERR_OUT_OF_RESOURCE when no collective
 * request could be obtained, or the error code of the failing call.
 * NOTE(review): the error returns after the free-list wait leave the
 * collective request attached to input_args - confirm who releases it.
 */
static int bcol_ptpcoll_barrier_recurs_dbl_extra_new(
                                bcol_function_args_t *input_args,
                                struct coll_ml_function_t *const_args)
{
    mca_bcol_ptpcoll_module_t *ptp_module =
                         (mca_bcol_ptpcoll_module_t *) const_args->bcol_module;
    ompi_communicator_t *comm = ptp_module->super.sbgp_partner_module->group_comm;

    mca_bcol_ptpcoll_collreq_t *collreq;
    ompi_free_list_item_t *item;
    ompi_request_t **reqs;

    uint64_t seq;
    int pending = 2;
    int rc, done, tag, proxy_comm_rank;

    OMPI_FREE_LIST_WAIT_MT(&ptp_module->collreqs_free, item);
    if (OPAL_UNLIKELY(NULL == item)) {
        PTPCOLL_ERROR(("Free list waiting failed."));
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    /* publish the collective request for the progress function */
    collreq = (mca_bcol_ptpcoll_collreq_t *) item;
    input_args->bcol_opaque_data = (void *) collreq;
    reqs = collreq->requests;

    /*
     * Tag: keep it within the limit supported by the pml, then negate it to
     * mark it as a collective tag and avoid conflicts with user-level tags.
     */
    seq = input_args->sequence_num;
    tag = (PTPCOLL_TAG_OFFSET + seq * PTPCOLL_TAG_FACTOR) & (ptp_module->tag_mask);
    tag = -tag;

    /* communicator rank of the proxy that serves this extra rank */
    proxy_comm_rank =
        ptp_module->super.sbgp_partner_module->group_list[ptp_module->proxy_extra_index];

    /* signal the proxy that this rank has arrived */
    rc = MCA_PML_CALL(isend(NULL, 0, MPI_INT,
                proxy_comm_rank, tag,
                MCA_PML_BASE_SEND_STANDARD, comm,
                &(reqs[0])));
    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
        PTPCOLL_ERROR(("Send failed."));
        return rc;
    }

    /* wait for the proxy to report that the rest are done */
    rc = MCA_PML_CALL(irecv(NULL, 0, MPI_INT,
                proxy_comm_rank, tag, comm,
                &(reqs[1])));
    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
        PTPCOLL_ERROR(("IRecv failed."));
        return rc;
    }

    done = mca_bcol_ptpcoll_test_all_for_match(&pending, reqs, &rc);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
        PTPCOLL_ERROR(("Test for all failed."));
        return rc;
    }

    if (done) {
        OMPI_FREE_LIST_RETURN_MT(&ptp_module->collreqs_free, (ompi_free_list_item_t *) collreq);
        return BCOL_FN_COMPLETE;
    }

    return BCOL_FN_STARTED;
}
/*
 * Progress function of the recursive doubling barrier for ranks that take
 * part in the exchange (everything except the "extra" rank).
 *
 * input_args  carries the collective request in bcol_opaque_data; its tag,
 *             num_reqs, exchange, need_toserv_extra and extra_partner_rank
 *             fields hold the state saved by the previous call
 * const_args  gives access to the ptpcoll module through bcol_module
 *
 * First tests the requests that were left pending. Once they are done the
 * exchange loop is resumed at the saved step; at every step a zero-length
 * isend/irecv pair is posted to the partner my_rank ^ (1 << step). When
 * need_toserv_extra is set, a final zero-length isend tells the extra rank
 * that the barrier is done.
 *
 * Returns BCOL_FN_STARTED (state saved in the collective request) while
 * requests are pending, BCOL_FN_COMPLETE when the barrier is finished (the
 * collective request is then returned to the free list), or the error code
 * of the failing call.
 */
static int bcol_ptpcoll_barrier_recurs_dbl_new_progress(
                                bcol_function_args_t *input_args,
                                struct coll_ml_function_t *const_args)
{
   /* local variable */
    mca_bcol_ptpcoll_module_t *ptp_module =
                         (mca_bcol_ptpcoll_module_t *) const_args->bcol_module;

    ompi_communicator_t *comm = ptp_module->super.sbgp_partner_module->group_comm;

    int rc, exchange, pair_comm_rank, tag,
        pair_rank, delta, num_reqs, completed,
        my_rank = ptp_module->super.sbgp_partner_module->my_index,
        n_exchange = ptp_module->super.sbgp_partner_module->n_levels_pow2;

    ompi_request_t **requests;
    mca_bcol_ptpcoll_collreq_t *collreq =
                    (mca_bcol_ptpcoll_collreq_t *) input_args->bcol_opaque_data;

    num_reqs = collreq->num_reqs;
    requests = collreq->requests;

    /* test the requests left pending by the previous call */
    completed =
        mca_bcol_ptpcoll_test_all_for_match(&num_reqs, requests, &rc);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
        PTPCOLL_ERROR(("Test for all failed."));
        return rc;
    }

    if (!completed) {
          return BCOL_FN_STARTED;
    }

    assert(PTPCOLL_EXTRA != ptp_module->pow_2type);

    /* Continue loop over exchange send/recv pairs */
    num_reqs = 0;
    tag = collreq->tag;

    exchange = collreq->exchange;
    assert(exchange >= 0);

    /* distance to the partner at the step where the loop resumes */
    delta = 1 << exchange;
    for (; exchange < n_exchange; ++exchange) {

        /* rank of exchange partner within the group */
        pair_rank = my_rank ^ delta;

        /* rank within the communicator */
        pair_comm_rank =
            ptp_module->super.sbgp_partner_module->group_list[pair_rank];

        /* send to partner - we will wait for completion, as send
         *   completion is at the MPI level, and will not
         *   incur network level completion costs
         */
        rc = MCA_PML_CALL(isend(NULL, 0, MPI_INT,
                    pair_comm_rank, tag,
                    MCA_PML_BASE_SEND_STANDARD, comm,
                    &(requests[0])));
        if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
            PTPCOLL_ERROR(("ISend failed."));
            return rc;
        }

        ++num_reqs;

        /* receive from partner */
        rc = MCA_PML_CALL(irecv(NULL, 0, MPI_INT,
                    pair_comm_rank, tag, comm,
                    &(requests[1])));
        if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
            PTPCOLL_ERROR(("IRecv failed."));
            return rc;
        }

        ++num_reqs;

        PTPCOLL_VERBOSE(5, ("exchange - %d, pair_rank - %d, pair_comm_rank - %d",
                             exchange, pair_rank, pair_comm_rank));

        /* test for completion */
        completed =
            mca_bcol_ptpcoll_test_all_for_match(&num_reqs, requests, &rc);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
            PTPCOLL_ERROR(("Test for all failed."));
            return rc;
        }

        if (!completed) {
            /* remember where to resume on the next progress call */
            collreq->num_reqs = num_reqs;
            collreq->exchange = exchange + 1;
            assert(collreq->exchange >= 0);

            return BCOL_FN_STARTED;
        }

        delta <<= 1; /* delta *= 2 */
    }

    /* if non power of 2, may need to send message to "extra" proc */
    if (collreq->need_toserv_extra) {
        /* send - let the extra rank know that we are done */
        rc = MCA_PML_CALL(isend(NULL, 0, MPI_INT,
                    collreq->extra_partner_rank, tag,
                    MCA_PML_BASE_SEND_STANDARD, comm,
                    &(requests[0])));
        if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
            PTPCOLL_ERROR(("ISend failed."));
            return rc;
        }

        completed = mca_bcol_ptpcoll_test_for_match(&requests[0], &rc);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
            PTPCOLL_ERROR(("Test for isend failed."));
            return rc;
        }

        if (!completed) {
            /*
             * Only the final send is outstanding; clear the flag and skip
             * the exchange loop so that the next call just tests it.
             */
            collreq->num_reqs = 1;
            collreq->need_toserv_extra = 0;
            collreq->exchange = n_exchange;

            return BCOL_FN_STARTED;
        }
    }

    /*
     * The init function returns the collective request to collreqs_free
     * when the barrier completes immediately; do the same when completion
     * is reached here, otherwise the free-list item is never returned.
     */
    OMPI_FREE_LIST_RETURN_MT(&ptp_module->collreqs_free, (ompi_free_list_item_t *) collreq);
    return BCOL_FN_COMPLETE;
}
/*
 * Barrier init function of the recursive doubling algorithm for ranks that
 * take part in the exchange (everything except the "extra" rank).
 *
 * input_args  supplies sequence_num and receives the collective request in
 *             bcol_opaque_data
 * const_args  gives access to the ptpcoll module through bcol_module
 *
 * Phases:
 *   1. a PTPCOLL_PROXY rank posts a zero-length irecv from its extra rank
 *   2. n_levels_pow2 exchange steps; step s posts a zero-length
 *      isend/irecv pair with the group rank my_rank ^ (1 << s)
 *   3. a PTPCOLL_PROXY rank posts a zero-length isend to its extra rank
 * After each phase or step the posted requests are tested. If they are not
 * complete, the state (tag, request count, next step) is saved in the
 * collective request and BCOL_FN_STARTED is returned.
 *
 * Returns BCOL_FN_COMPLETE when everything finished within this call (the
 * collective request is returned to the free list), BCOL_FN_STARTED as
 * described above, OMPI_ERR_OUT_OF_RESOURCE when no collective request
 * could be obtained, or the error code of the failing call.
 * NOTE(review): the error returns after the free-list wait leave the
 * collective request attached to input_args - confirm who releases it.
 */
static int bcol_ptpcoll_barrier_recurs_dbl_new(
                                bcol_function_args_t *input_args,
                                struct coll_ml_function_t *const_args)
{
    mca_bcol_ptpcoll_module_t *ptp_module =
                         (mca_bcol_ptpcoll_module_t *) const_args->bcol_module;
    ompi_communicator_t *comm = ptp_module->super.sbgp_partner_module->group_comm;

    const int my_rank = ptp_module->super.sbgp_partner_module->my_index;
    const int n_exchange = ptp_module->super.sbgp_partner_module->n_levels_pow2;

    mca_bcol_ptpcoll_collreq_t *collreq;
    ompi_free_list_item_t *item;
    ompi_request_t **reqs;

    uint64_t seq;
    int extra_comm_rank = 0;
    int pending = 0;
    int rc, done, step, tag;

    OMPI_FREE_LIST_WAIT_MT(&ptp_module->collreqs_free, item);
    if (OPAL_UNLIKELY(NULL == item)) {
        PTPCOLL_ERROR(("Free list waiting failed."));
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    /* publish the collective request for the progress function */
    collreq = (mca_bcol_ptpcoll_collreq_t *) item;
    input_args->bcol_opaque_data = (void *) collreq;

    assert(PTPCOLL_EXTRA != ptp_module->pow_2type);

    reqs = collreq->requests;

    /*
     * Tag: keep it within the limit supported by the pml, then negate it to
     * mark it as a collective tag and avoid conflicts with user-level tags.
     */
    seq = input_args->sequence_num;
    tag = (PTPCOLL_TAG_OFFSET + seq * PTPCOLL_TAG_FACTOR) & (ptp_module->tag_mask);
    tag = -tag;

    /* Phase 1: a proxy waits for the signal of its extra rank */
    if (PTPCOLL_PROXY != ptp_module->pow_2type) {
        collreq->need_toserv_extra = 0;
    } else {
        extra_comm_rank =
            ptp_module->super.sbgp_partner_module->group_list[ptp_module->proxy_extra_index];

        collreq->need_toserv_extra = 1;
        collreq->extra_partner_rank = extra_comm_rank;

        rc = MCA_PML_CALL(irecv(NULL, 0, MPI_INT,
                    extra_comm_rank, tag, comm,
                    &(reqs[0])));
        if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
            PTPCOLL_ERROR(("IRecv failed."));
            return rc;
        }

        done = mca_bcol_ptpcoll_test_for_match(&reqs[0], &rc);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
            PTPCOLL_ERROR(("Test for irecv failed."));
            return rc;
        }

        if (!done) {
            /* the progress function starts the exchange at step 0 */
            collreq->tag = tag;
            collreq->num_reqs = 1;
            collreq->exchange = 0;

            return BCOL_FN_STARTED;
        }
    }

    /* Phase 2: exchange steps; the partner distance doubles every step */
    for (step = 0; step < n_exchange; ++step) {
        /* partner rank within the group, then within the communicator */
        const int peer_rank = my_rank ^ (1 << step);
        const int peer_comm_rank =
            ptp_module->super.sbgp_partner_module->group_list[peer_rank];

        /* send to partner - completion is at the MPI level and will not
         * incur network level completion costs */
        rc = MCA_PML_CALL(isend(NULL, 0, MPI_INT,
                    peer_comm_rank, tag,
                    MCA_PML_BASE_SEND_STANDARD, comm,
                    &(reqs[0])));
        if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
            PTPCOLL_ERROR(("ISend failed."));
            return rc;
        }

        /* receive from partner */
        rc = MCA_PML_CALL(irecv(NULL, 0, MPI_INT,
                    peer_comm_rank, tag, comm,
                    &(reqs[1])));
        if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
            PTPCOLL_ERROR(("IRecv failed."));
            return rc;
        }

        /* one send and one receive were posted */
        pending += 2;

        PTPCOLL_VERBOSE(5, ("exchange - %d, pair_rank - %d, pair_comm_rank - %d",
                             step, peer_rank, peer_comm_rank));

        done = mca_bcol_ptpcoll_test_all_for_match(&pending, reqs, &rc);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
            PTPCOLL_ERROR(("Test for all failed."));
            return rc;
        }

        if (!done) {
            /* the progress function resumes with the following step */
            collreq->tag = tag;
            collreq->num_reqs = pending;
            collreq->exchange = step + 1;
            assert(collreq->exchange >= 0);

            return BCOL_FN_STARTED;
        }
    }

    /* Phase 3: a proxy lets its extra rank know that we are done */
    if (PTPCOLL_PROXY == ptp_module->pow_2type) {
        rc = MCA_PML_CALL(isend(NULL, 0, MPI_INT,
                    extra_comm_rank, tag,
                    MCA_PML_BASE_SEND_STANDARD, comm,
                    &(reqs[0])));
        if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
            PTPCOLL_ERROR(("ISend failed."));
            return rc;
        }

        done = mca_bcol_ptpcoll_test_for_match(&reqs[0], &rc);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
            PTPCOLL_ERROR(("Test for isend failed."));
            return rc;
        }

        if (!done) {
            /* only the final send is left for the progress function */
            collreq->tag = tag;
            collreq->num_reqs = 1;
            collreq->need_toserv_extra = 0;
            collreq->exchange = n_exchange;

            return BCOL_FN_STARTED;
        }
    }

    OMPI_FREE_LIST_RETURN_MT(&ptp_module->collreqs_free, (ompi_free_list_item_t *) collreq);
    return BCOL_FN_COMPLETE;
}
/*
 * Barrier init function for the "extra" rank of the recursive K-nomial
 * algorithm.
 *
 * input_args  supplies sequence_num and receives the collective request in
 *             bcol_opaque_data
 * const_args  gives access to the ptpcoll module through bcol_module
 *
 * Posts a zero-length isend and a zero-length irecv to the rank found in
 * entry 0 of the exchange tree's rank_extra_sources_array, then tests both
 * requests.
 *
 * Returns BCOL_FN_COMPLETE when both requests finished immediately (the
 * collective request is returned to the free list), BCOL_FN_STARTED when
 * they are still pending, OMPI_ERR_OUT_OF_RESOURCE when no collective
 * request could be obtained, or the error code of the failing call.
 * NOTE(review): the error returns after the free-list wait leave the
 * collective request attached to input_args - confirm who releases it.
 */
static int bcol_ptpcoll_barrier_recurs_knomial_extra_new(
                                bcol_function_args_t *input_args,
                                struct coll_ml_function_t *const_args)
{
    mca_bcol_ptpcoll_module_t *ptpcoll_module =
                    (mca_bcol_ptpcoll_module_t *) const_args->bcol_module;
    netpatterns_k_exchange_node_t *exchange_node =
                    &ptpcoll_module->knomial_exchange_tree;
    ompi_communicator_t *comm =
                    ptpcoll_module->super.sbgp_partner_module->group_comm;
    int *extra_sources = exchange_node->rank_extra_sources_array;

    mca_bcol_ptpcoll_collreq_t *collreq;
    ompi_free_list_item_t *item;
    ompi_request_t **reqs;

    uint64_t seq;
    int pending = 2;
    int rc, done, tag, peer_comm_rank;

    OMPI_FREE_LIST_WAIT_MT(&ptpcoll_module->collreqs_free, item);
    if (OPAL_UNLIKELY(NULL == item)) {
        PTPCOLL_ERROR(("Free list waiting failed."));
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    /* publish the collective request for the progress function */
    collreq = (mca_bcol_ptpcoll_collreq_t *) item;
    input_args->bcol_opaque_data = (void *) collreq;
    reqs = collreq->requests;

    /*
     * Tag: keep it within the limit supported by the pml, then negate it to
     * mark it as a collective tag and avoid conflicts with user-level tags.
     */
    seq = input_args->sequence_num;
    tag = (PTPCOLL_TAG_OFFSET + seq * PTPCOLL_TAG_FACTOR) & (ptpcoll_module->tag_mask);
    tag = -tag;

    /* communicator rank of the first extra source of this node */
    peer_comm_rank =
        ptpcoll_module->super.sbgp_partner_module->group_list[extra_sources[0]];

    rc = MCA_PML_CALL(isend(
                NULL, 0, MPI_INT,
                peer_comm_rank, tag,
                MCA_PML_BASE_SEND_STANDARD,
                comm, &(reqs[0])));
    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
        PTPCOLL_ERROR(("ISend failed."));
        return rc;
    }

    rc = MCA_PML_CALL(irecv(
                NULL, 0, MPI_INT,
                peer_comm_rank, tag,
                comm, &(reqs[1])));
    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
        PTPCOLL_ERROR(("IRecv failed."));
        return rc;
    }

    done = mca_bcol_ptpcoll_test_all_for_match(&pending, reqs, &rc);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
        PTPCOLL_ERROR(("Test for all failed."));
        return rc;
    }

    if (done) {
        OMPI_FREE_LIST_RETURN_MT(&ptpcoll_module->collreqs_free, (ompi_free_list_item_t *) collreq);
        return BCOL_FN_COMPLETE;
    }

    return BCOL_FN_STARTED;
}
/*
 * Barrier init function of the recursive K-nomial algorithm for ranks that
 * take part in the exchange.
 *
 * input_args  supplies sequence_num and receives the collective request in
 *             bcol_opaque_data
 * const_args  gives access to the ptpcoll module through bcol_module
 *
 * Phases:
 *   1. with extra sources: one zero-length irecv per extra source
 *   2. n_exchanges exchange steps; every step posts a zero-length
 *      isend/irecv pair with each of the (tree_order - 1) peers listed in
 *      rank_exchanges[step]
 *   3. with extra sources: one zero-length isend per extra source
 * After each phase or step the posted requests are tested. If they are not
 * complete, the state (tag, request count, next step) is saved in the
 * collective request and BCOL_FN_STARTED is returned.
 *
 * Returns BCOL_FN_COMPLETE when everything finished within this call (the
 * collective request is returned to the free list), BCOL_FN_STARTED as
 * described above, OMPI_ERR_OUT_OF_RESOURCE when no collective request
 * could be obtained, or the error code of the failing call.
 * NOTE(review): the error returns after the free-list wait leave the
 * collective request attached to input_args - confirm who releases it.
 */
static int bcol_ptpcoll_barrier_recurs_knomial_new(
                bcol_function_args_t *input_args,
                struct coll_ml_function_t *const_args)
{
    mca_bcol_ptpcoll_module_t *ptpcoll_module =
                        (mca_bcol_ptpcoll_module_t *) const_args->bcol_module;
    netpatterns_k_exchange_node_t *exchange_node =
                        &ptpcoll_module->knomial_exchange_tree;

    const int n_peers = exchange_node->tree_order - 1;
    const int n_extra = exchange_node->n_extra_sources;
    const int n_exchange = exchange_node->n_exchanges;

    ompi_communicator_t *comm =
            ptpcoll_module->super.sbgp_partner_module->group_comm;

    int **rank_exchanges = exchange_node->rank_exchanges;
    int *extra_sources = NULL;

    mca_bcol_ptpcoll_collreq_t *collreq;
    ompi_free_list_item_t *item;
    ompi_request_t **reqs;

    uint64_t seq;
    int rc, k, step, done, tag, pending;

    OMPI_FREE_LIST_WAIT_MT(&ptpcoll_module->collreqs_free, item);
    if (OPAL_UNLIKELY(NULL == item)) {
        PTPCOLL_ERROR(("Free list waiting failed."));
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    /* publish the collective request for the progress function */
    collreq = (mca_bcol_ptpcoll_collreq_t *) item;
    input_args->bcol_opaque_data = (void *) collreq;
    reqs = collreq->requests;

    /*
     * Tag: keep it within the limit supported by the pml, then negate it to
     * mark it as a collective tag and avoid conflicts with user-level tags.
     */
    seq = input_args->sequence_num;
    tag = (PTPCOLL_TAG_OFFSET + seq * PTPCOLL_TAG_FACTOR) & (ptpcoll_module->tag_mask);
    tag = -tag;

    /* Phase 1: wait for the signal of every extra source */
    if (n_extra <= 0) {
        collreq->need_toserv_extra = 0;
    } else { /* EXCHANGE_NODE case */
        collreq->need_toserv_extra = 1;
        extra_sources = exchange_node->rank_extra_sources_array;

        for (k = 0; k < n_extra; ++k) {
            const int peer_comm_rank =
                ptpcoll_module->super.sbgp_partner_module->group_list[extra_sources[k]];

            rc = MCA_PML_CALL(irecv(
                        NULL, 0, MPI_INT,
                        peer_comm_rank, tag,
                        comm, &(reqs[k])));
            if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
                PTPCOLL_ERROR(("IRecv failed."));
                return rc;
            }
        }

        pending = n_extra;

        done = mca_bcol_ptpcoll_test_all_for_match(&pending, reqs, &rc);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
            PTPCOLL_ERROR(("Test for all failed."));
            return rc;
        }

        if (!done) {
            /* the progress function starts the exchange at step 0 */
            collreq->tag = tag;
            collreq->num_reqs = pending;
            collreq->exchange = 0;

            return BCOL_FN_STARTED;
        }
    }

    /* Phase 2: exchange steps */
    for (step = 0; step < n_exchange; ++step) {
        for (k = 0; k < n_peers; ++k) {
            /* partner rank within the group, then within the communicator */
            const int peer_group_rank = rank_exchanges[step][k];
            const int peer_comm_rank =
                ptpcoll_module->super.sbgp_partner_module->group_list[peer_group_rank];

            /* the request slots 2k and 2k + 1 are used below */
            assert(2 * ptpcoll_module->k_nomial_radix > (k * 2 + 1));

            /* send to partner - completion is at the MPI level and will
             * not incur network level completion costs */
            rc = MCA_PML_CALL(isend(
                        NULL, 0, MPI_INT,
                        peer_comm_rank, tag,
                        MCA_PML_BASE_SEND_STANDARD,
                        comm, &(reqs[k * 2 + 1])));
            if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
                PTPCOLL_ERROR(("ISend failed."));
                return rc;
            }

            PTPCOLL_VERBOSE(10, ("Ex %d, K %d send to %d[%d]", step, k,
                                  peer_comm_rank, peer_group_rank));

            /* receive from partner */
            rc = MCA_PML_CALL(irecv(
                        NULL, 0, MPI_INT,
                        peer_comm_rank, tag,
                        comm, &(reqs[k * 2])));
            if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
                PTPCOLL_ERROR(("IRecv failed."));
                return rc;
            }

            PTPCOLL_VERBOSE(10, ("Ex %d, K %d irecv from %d[%d]", step, k,
                                  peer_comm_rank, peer_group_rank));
        }

        /* one send and one receive per peer */
        pending = 2 * n_peers;

        done = mca_bcol_ptpcoll_test_all_for_match(&pending, reqs, &rc);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
            PTPCOLL_ERROR(("Test for all failed."));
            return rc;
        }

        if (!done) {
            /* the progress function resumes with the following step */
            collreq->tag = tag;
            collreq->num_reqs = pending;
            collreq->exchange = step + 1;

            return BCOL_FN_STARTED;
        }
    }

    /* Phase 3: if non power of K, tell every extra source we are done */
    if (n_extra > 0) { /* EXCHANGE_NODE case */
        for (k = 0; k < n_extra; ++k) {
            const int peer_comm_rank =
                ptpcoll_module->super.sbgp_partner_module->group_list[extra_sources[k]];

            rc = MCA_PML_CALL(isend(
                        NULL, 0, MPI_INT,
                        peer_comm_rank, tag,
                        MCA_PML_BASE_SEND_STANDARD,
                        comm, &(reqs[k])));
            if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
                PTPCOLL_ERROR(("ISend failed."));
                return rc;
            }
        }

        pending = n_extra;

        done = mca_bcol_ptpcoll_test_all_for_match(&pending, reqs, &rc);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
            PTPCOLL_ERROR(("Test for all failed."));
            return rc;
        }

        if (!done) {
            /* only the final sends are left for the progress function */
            collreq->tag = tag;
            collreq->num_reqs = pending;
            collreq->exchange = n_exchange;
            collreq->need_toserv_extra = 0;

            return BCOL_FN_STARTED;
        }
    }

    OMPI_FREE_LIST_RETURN_MT(&ptpcoll_module->collreqs_free, (ompi_free_list_item_t *) collreq);
    return BCOL_FN_COMPLETE;
}
/**
 * Progress function for the recursive k-nomial barrier.
 *
 * Resumes a barrier whose state lives in the collreq attached to
 * input_args->bcol_opaque_data. It first tests the requests that were left
 * outstanding, then continues the exchange loop starting at
 * collreq->exchange, and finally, if collreq->need_toserv_extra is set,
 * sends a zero-byte message to every "extra" rank.
 *
 * @param input_args  carries the collreq in its bcol_opaque_data field
 * @param const_args  carries the ptpcoll module in its bcol_module field
 *
 * @return BCOL_FN_STARTED   some requests are still pending; the resume
 *                           point has been stored back into the collreq
 *         BCOL_FN_COMPLETE  the barrier finished and the collreq was returned
 *                           to ptpcoll_module->collreqs_free
 *         otherwise         the error code reported by the failing PML call
 *                           or by the completion test
 */
static int bcol_ptpcoll_barrier_recurs_knomial_new_progress(
                                bcol_function_args_t *input_args,
                                struct coll_ml_function_t *const_args)
{
    /* local variable */
    mca_bcol_ptpcoll_module_t *ptpcoll_module =
                        (mca_bcol_ptpcoll_module_t *) const_args->bcol_module;

    netpatterns_k_exchange_node_t *my_exchange_node =
                                       &ptpcoll_module->knomial_exchange_tree;

    int rc, k, tag, pair_comm_rank, exchange,
        tree_order = my_exchange_node->tree_order, num_reqs,
        n_exchange = my_exchange_node->n_exchanges, completed,
        n_extra_sources = my_exchange_node->n_extra_sources;

    ompi_communicator_t *comm =
            ptpcoll_module->super.sbgp_partner_module->group_comm;

    int *extra_sources_array,
        **rank_exchanges = my_exchange_node->rank_exchanges;

    mca_bcol_ptpcoll_collreq_t *collreq =
                    (mca_bcol_ptpcoll_collreq_t *) input_args->bcol_opaque_data;

    ompi_request_t **requests = collreq->requests;

    num_reqs = collreq->num_reqs;

    /* Test the requests left pending by the previous call */
    completed =
        mca_bcol_ptpcoll_test_all_for_match(&num_reqs, requests, &rc);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
        PTPCOLL_ERROR(("Test for all failed."));
        return rc;
    }

    if (!completed) {
        return BCOL_FN_STARTED;
    }

    /* Continue loop over exchange send/recv pairs */
    tag = collreq->tag;

    for (exchange = collreq->exchange; exchange < n_exchange; ++exchange) {
        for (k = 0; k < tree_order - 1; ++k) {
            /* rank of exchange partner within the group */
            pair_comm_rank =
                ptpcoll_module->super.sbgp_partner_module->group_list[rank_exchanges[exchange][k]];

            /* The send below uses slot k * 2 + 1; presumably the requests
             * array holds at least 2 * k_nomial_radix entries - TODO confirm
             * against the collreq allocation. */
            assert(2 * ptpcoll_module->k_nomial_radix > (k * 2 + 1));

            /* send to partner - we will wait for completion, as send
             *   completion is at the MPI level, and will not
             *   incur network level completion costs
             */
            rc = MCA_PML_CALL(isend(
                        NULL, 0, MPI_INT,
                        pair_comm_rank, tag,
                        MCA_PML_BASE_SEND_STANDARD,
                        comm, &(requests[k * 2 + 1])));
            if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
                PTPCOLL_ERROR(("ISend failed."));
                return rc;
            }

            PTPCOLL_VERBOSE(10, ("Ex %d, K %d send to %d[%d]", exchange, k,
                                  pair_comm_rank, rank_exchanges[exchange][k]));

            /* receive from partner (even slot, paired with the send above) */
            rc = MCA_PML_CALL(irecv(
                        NULL, 0, MPI_INT,
                        pair_comm_rank, tag,
                        comm, &(requests[k * 2])));
            if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
                PTPCOLL_ERROR(("IRecv failed."));
                return rc;
            }

            PTPCOLL_VERBOSE(10, ("Ex %d, K %d irecv from %d[%d]", exchange, k,
                                  pair_comm_rank, rank_exchanges[exchange][k]));
        }

        /* one send and one receive per partner */
        num_reqs = 2 * (tree_order - 1);

        /* Test for completion */
        completed =
            mca_bcol_ptpcoll_test_all_for_match(&num_reqs, requests, &rc);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
            PTPCOLL_ERROR(("Test for all failed."));
            return rc;
        }

        if (!completed) {
            /* Remember where to resume on the next progress call */
            collreq->num_reqs = num_reqs;
            collreq->exchange = exchange + 1;

            return BCOL_FN_STARTED;
        }
    }

    /* If non power of 2, may need to send message to "extra" proc */
    if (collreq->need_toserv_extra)  {  /* EXCHANGE_NODE case */
        extra_sources_array = my_exchange_node->rank_extra_sources_array;

        for (k = 0; k < n_extra_sources; ++k) {
            pair_comm_rank =
                ptpcoll_module->super.sbgp_partner_module->group_list[extra_sources_array[k]];

            rc = MCA_PML_CALL(isend(
                        NULL, 0, MPI_INT,
                        pair_comm_rank, tag,
                        MCA_PML_BASE_SEND_STANDARD,
                        comm, &(requests[k])));
            if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
                PTPCOLL_ERROR(("ISend failed."));
                return rc;
            }
        }

        num_reqs = n_extra_sources;

        /* Test for completion */
        completed =
            mca_bcol_ptpcoll_test_all_for_match(&num_reqs, requests, &rc);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
            PTPCOLL_ERROR(("Test for all failed."));
            return rc;
        }

        if (!completed) {
            /* Skip both the exchange loop and this phase on the next call:
             * only the pending sends remain to be tested. */
            collreq->num_reqs = num_reqs;
            collreq->exchange = n_exchange;
            collreq->need_toserv_extra = 0;

            return BCOL_FN_STARTED;
        }
    }

    /* The barrier is done: hand the request descriptor back to the module's
     * free list, as the function that starts the barrier does on its own
     * completion path. Without this the collreq is never recycled. */
    OMPI_FREE_LIST_RETURN_MT(&ptpcoll_module->collreqs_free, (ompi_free_list_item_t *) collreq);
    return BCOL_FN_COMPLETE;
}