/*
 * For each step of an up/top/down schedule, compute the step's index
 * within a run of consecutive calls to the same bcol type
 * (*out_scratch_indx) and the length of that run (*out_scratch_num).
 * Both arrays are allocated here; the caller owns and frees them.
 */
int mca_coll_ml_schedule_init_scratch(mca_coll_ml_topology_t *topo_info,
        mca_coll_ml_schedule_hier_info_t *h_info,
        int **out_scratch_indx, int **out_scratch_num)
{
    bool prev_is_zero;
    int i, cnt;
    int n_hiers = h_info->n_hiers;
    int value_to_set = 0;
    mca_bcol_base_module_t *prev_bcol = NULL;
    int *scratch_indx, *scratch_num;

    scratch_indx = *out_scratch_indx = 
        (int *) calloc(n_hiers * 2, sizeof(int));
    if (NULL == *out_scratch_indx) {
        ML_ERROR(("Can't allocate memory.\n"));
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    scratch_num = *out_scratch_num = 
        (int *) calloc(n_hiers * 2, sizeof(int));
    if (NULL == *out_scratch_num) {
        ML_ERROR(("Can't allocate memory.\n"));
        /* free the buffer allocated above, not the caller's pointer */
        free(*out_scratch_indx);
        *out_scratch_indx = NULL;
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    for (i = 0, cnt = 0; i < h_info->num_up_levels; ++i, ++cnt) {
        if (IS_BCOL_TYPE_IDENTICAL(prev_bcol, GET_BCOL(topo_info, i))) {
            scratch_indx[cnt] = scratch_indx[cnt - 1] + 1;
        } else {
            scratch_indx[cnt] = 0;
            prev_bcol = GET_BCOL(topo_info, i);
        }
    }

    /* top - executed only by processes that are members of the highest-level (global) subgroup */
    if (h_info->call_for_top_function) {
        if (IS_BCOL_TYPE_IDENTICAL(prev_bcol, GET_BCOL(topo_info, n_hiers - 1))) {
            scratch_indx[cnt] = scratch_indx[cnt - 1] + 1;
        } else {
            scratch_indx[cnt] = 0;
            prev_bcol = GET_BCOL(topo_info, n_hiers - 1);
        }
        ++cnt;
    }

    /* going down */
    for (i = h_info->num_up_levels - 1; i >= 0; --i, ++cnt) {
        if (IS_BCOL_TYPE_IDENTICAL(prev_bcol, GET_BCOL(topo_info, i))) {
            scratch_indx[cnt] = scratch_indx[cnt - 1] + 1;
        } else {
            scratch_indx[cnt] = 0;
            prev_bcol = GET_BCOL(topo_info, i);
        }
    }

    i = cnt - 1;
    prev_is_zero = true;

    do {
        if (prev_is_zero) {
            value_to_set = scratch_indx[i] + 1;
            prev_is_zero = false;
        }

        if (0 == scratch_indx[i]) {
            prev_is_zero = true;
        }

        scratch_num[i] = value_to_set;
        --i;
    } while(i >= 0);

    return OMPI_SUCCESS;
}
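
/*
 * Usage sketch for the function above (hypothetical caller; the h_info
 * argument is assumed to have been filled in by the ML scheduling code):
 *
 *     int *indx = NULL, *num = NULL;
 *     int rc = mca_coll_ml_schedule_init_scratch(topo, &h_info, &indx, &num);
 *     if (OMPI_SUCCESS == rc) {
 *         ... consume indx[step] / num[step] while building the schedule ...
 *         free(indx);
 *         free(num);
 *     }
 */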
int ml_coll_up_and_down_hier_setup(mca_coll_ml_module_t *ml_module,
                                   mca_coll_ml_topology_t *topo_info,
                                   int up_function_idx,
                                   int top_function_idx,
                                   int down_function_idx,
                                   int collective)
{
    /* local variables */
    int i, j, cnt, value_to_set = -1;
    int ret = OMPI_SUCCESS, num_up_levels;

    int num_hierarchies = topo_info->n_levels;
    int global_high_hierarchy_index = topo_info->global_highest_hier_group_index;

    bool call_for_top_function, prev_is_zero;

    int *scratch_indx = NULL, *scratch_num = NULL;

    coll_ml_collective_description_t *collective_alg = NULL;
    mca_bcol_base_module_t *bcol_module = NULL,
                           *prev_bcol = NULL;

    /* RLG:  one blocking barrier collective algorithm - this is really a hack,
     * we need to figure out how to do this in a bit more extensible
     * manner.
     */
    collective_alg = (coll_ml_collective_description_t *)
        malloc(sizeof(coll_ml_collective_description_t));
    if (NULL == collective_alg) {
        ML_ERROR(("Can't allocate memory."));
        ret = OMPI_ERR_OUT_OF_RESOURCE;
        goto Error;
    }
    /* initialize so the Error path does not free an uninitialized pointer */
    collective_alg->functions = NULL;

    /* Am I a member of the highest level subgroup? */
    if (global_high_hierarchy_index ==
          topo_info->component_pairs[num_hierarchies - 1].bcol_index) {
        /* A process that is a member of the highest level subgroup
           calls the top algorithm in addition to the fan-in/out steps */
        call_for_top_function = true;
        /* The highest level runs only the top algorithm, so exclude it
           from the up/down count */
        num_up_levels = num_hierarchies - 1;
        /* The top algorithm is called only once (not once on the way up
           and once on the way down), so subtract 1 */
        collective_alg->n_functions = 2 * num_hierarchies - 1;
    } else {
        /* The process is not a member of the highest level subgroup;
           as a result it does not call the top algorithm,
           but it does call all fan-in/out steps */
        call_for_top_function = false;
        num_up_levels = num_hierarchies;
        collective_alg->n_functions = 2 * num_hierarchies;
    }
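
    /*
     * Worked example (hypothetical): with num_hierarchies = 3 and this
     * process in the top-level subgroup, num_up_levels = 2 and
     * n_functions = 2 * 3 - 1 = 5 (two up steps, one top step, two down
     * steps); a process outside the top subgroup runs 2 * 3 = 6 steps
     * (three up, three down).
     */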

    ML_VERBOSE(10, ("high_index %d == bcol_index %d: Call top %d, num_up_levels %d, collective_alg->n_functions %d",
                global_high_hierarchy_index,
                topo_info->component_pairs[num_hierarchies - 1].bcol_index,
                call_for_top_function,
                num_up_levels,
                collective_alg->n_functions ));

    /* allocate space for the functions */
    collective_alg->functions = (mca_bcol_base_function_t *)
        calloc(collective_alg->n_functions, sizeof(mca_bcol_base_function_t));
    if (NULL == collective_alg->functions) {
        ML_ERROR(("Can't allocate memory."));
        ret = OMPI_ERR_OUT_OF_RESOURCE;
        goto Error;
    }

    /* Algorithm Description:
     * =====================
     * The algorithm used here for an N-level system:
     *  - levels 0 to N-2, inclusive: up algorithm (fan-in for Barrier, reduce for Allreduce)
     *  - level N-1: top algorithm (barrier or allreduce)
     *  - levels N-2 down to 0: down algorithm (fan-out)
     */
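    /*
     * E.g., for a hypothetical N = 3 topology, a member of the top
     * subgroup executes up(0), up(1), top(2), down(1), down(0), while
     * every other process executes up(0), up(1), up(2), down(2),
     * down(1), down(0).
     */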


    /* Starting scratch_num and scratch_index calculations */
    /* =================================================== */

    /* Figure out how many of the same bcols are called in a row.
     * The index of each bcol within its run is stored in scratch_indx,
     * and the total number of bcols in the run is stored in scratch_num */
    scratch_indx = (int *) calloc (2 * num_hierarchies, sizeof (int));
    if(NULL == scratch_indx) {
        ML_ERROR(("Can't allocate memory."));
        ret = OMPI_ERR_OUT_OF_RESOURCE;
        goto Error;
    }

    scratch_num = (int *) malloc(sizeof(int) * (2 * num_hierarchies));
    if(NULL == scratch_num) {
        ML_ERROR(("Can't allocate memory."));
        ret = OMPI_ERR_OUT_OF_RESOURCE;
        goto Error;
    }

    /* We go through all stages of the algorithm (up, top, down)
     * and calculate the bcol index. If the previous bcol is of the same
     * type as the current one, the index is incremented; otherwise it is
     * reset to zero */
    prev_bcol = NULL;
    /* going up */
    for (i = 0, cnt = 0; i < num_up_levels; ++i, ++cnt) {
        if (IS_BCOL_TYPE_IDENTICAL(prev_bcol, GET_BCOL(topo_info, i))) {
            scratch_indx[cnt] = scratch_indx[cnt - 1] + 1;
        } else {
            scratch_indx[cnt] = 0;
            prev_bcol = GET_BCOL(topo_info, i);
        }
    }

    /* top - executed only by processes that are members of the highest-level (global) subgroup */
    if (call_for_top_function) {
        if (IS_BCOL_TYPE_IDENTICAL(prev_bcol, GET_BCOL(topo_info, num_hierarchies - 1))) {
            scratch_indx[cnt] = scratch_indx[cnt - 1] + 1;
        } else {
            scratch_indx[cnt] = 0;
            prev_bcol = GET_BCOL(topo_info, num_hierarchies - 1);
        }

        ++cnt;
    }

    /* going down */
    for (i = num_up_levels - 1; i >= 0; --i, ++cnt) {
        if (IS_BCOL_TYPE_IDENTICAL(prev_bcol, GET_BCOL(topo_info, i))) {
            scratch_indx[cnt] = scratch_indx[cnt - 1] + 1;
        } else {
            scratch_indx[cnt] = 0;
            prev_bcol = GET_BCOL(topo_info, i);
        }
    }

    /*
     * Calculate the number of the same bcols in a row.
     * We parse the index array from the end; when an index is zero,
     * the current run is done and we move on to the next run of bcols.
     * The run length is equal to the maximal bcol index in the run + 1.
     */
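    /*
     * Worked example (hypothetical): if the bcol sequence for the steps
     * up(0), up(1), top(2), down(1), down(0) is sm, sm, ptp, sm, sm,
     * the two passes produce
     *
     *     scratch_indx: 0  1  0  0  1
     *     scratch_num:  2  2  1  2  2
     *
     * i.e. scratch_num[cnt] holds the length of the run of consecutive
     * same-type bcol calls that step cnt belongs to.
     */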
    i = cnt - 1;
    prev_is_zero = true;
    do {
        if (prev_is_zero) {
            value_to_set = scratch_indx[i] + 1;
            prev_is_zero = false;
        }

        if (0 == scratch_indx[i]) {
            prev_is_zero = true;
        }

        scratch_num[i] = value_to_set;
        --i;
    } while(i >= 0);

    /* =========================================================== */
    /* We are done with scratch_num and scratch_index calculations */

    /* Setup function call for each algorithm step */
    cnt = 0;
    /* up phase */
    for (i = 0; i < num_up_levels; i++) {
        bcol_module = GET_BCOL(topo_info, i);
        collective_alg->functions[cnt].fn_idx = up_function_idx;
        collective_alg->functions[cnt].bcol_module = bcol_module;
        collective_alg->functions[cnt].index_in_consecutive_same_bcol_calls = scratch_indx[cnt];
        collective_alg->functions[cnt].n_of_this_type_in_a_row = scratch_num[cnt];
        ML_VERBOSE(10, ("Setting collective [collective code %d][count %d], fn_idx %d, index_in_consecutive_same_bcol_calls %d, n_of_this_type_in_a_row %d",
                    collective, cnt, collective_alg->functions[cnt].fn_idx,
                    collective_alg->functions[cnt].index_in_consecutive_same_bcol_calls,
                    collective_alg->functions[cnt].n_of_this_type_in_a_row));
        ++cnt;
    }

    /* top function */
    if (call_for_top_function) {
        bcol_module = GET_BCOL(topo_info, num_hierarchies - 1);
        collective_alg->functions[cnt].fn_idx = top_function_idx;
        collective_alg->functions[cnt].bcol_module = bcol_module;
        collective_alg->functions[cnt].index_in_consecutive_same_bcol_calls = scratch_indx[cnt];
        collective_alg->functions[cnt].n_of_this_type_in_a_row = scratch_num[cnt];
        ML_VERBOSE(10, ("Setting collective [collective code %d][count %d], fn_idx %d, index_in_consecutive_same_bcol_calls %d, n_of_this_type_in_a_row %d",
                    collective, cnt, collective_alg->functions[cnt].fn_idx,
                    collective_alg->functions[cnt].index_in_consecutive_same_bcol_calls,
                    collective_alg->functions[cnt].n_of_this_type_in_a_row));
        ++cnt;
    }

    /* down phase*/
    for (i = num_up_levels - 1; i >= 0; i--) {
        bcol_module = GET_BCOL(topo_info, i);
        collective_alg->functions[cnt].fn_idx = down_function_idx;
        collective_alg->functions[cnt].bcol_module = bcol_module;
        collective_alg->functions[cnt].index_in_consecutive_same_bcol_calls = scratch_indx[cnt];
        collective_alg->functions[cnt].n_of_this_type_in_a_row = scratch_num[cnt];
        ML_VERBOSE(10, ("Setting collective [collective code %d][count %d], fn_idx %d, index_in_consecutive_same_bcol_calls %d, n_of_this_type_in_a_row %d",
                    collective, cnt, collective_alg->functions[cnt].fn_idx,
                    collective_alg->functions[cnt].index_in_consecutive_same_bcol_calls,
                    collective_alg->functions[cnt].n_of_this_type_in_a_row));
        ++cnt;
    }

    /* figure out how many times this bcol is used in this collective call */
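    /* For example, if the same bcol module appears four times in the
     * schedule, its four entries get index_of_this_type_in_collective
     * 0..3 and each gets n_of_this_type_in_collective = 4. */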
    for (i = 0; i < collective_alg->n_functions; i++) {
        mca_bcol_base_module_t *current_bcol=
            collective_alg->functions[i].bcol_module;

        cnt = 0;
        for (j = 0; j < collective_alg->n_functions; ++j) {
            if (current_bcol ==
                    collective_alg->functions[j].bcol_module) {
                collective_alg->functions[j].index_of_this_type_in_collective = cnt;
                ML_VERBOSE(10, ("Pasha: Setting collective [collective code %d][count %d], fn_idx %d, collective_alg->functions[i].index_of_this_type_in_collective %d",
                            collective, cnt, i,
                            collective_alg->functions[j].index_of_this_type_in_collective));
                cnt++;
            }
        }

        collective_alg->functions[i].n_of_this_type_in_collective=cnt;
        ML_VERBOSE(10, ("Pasha: Setting collective [collective code %d][count %d], fn_idx %d, collective_alg->functions[i].n_of_this_type_in_collective %d",
                    collective, cnt, i,
                    collective_alg->functions[i].n_of_this_type_in_collective));
    }

    /* Store the constructed algorithm for this collective */
    topo_info->hierarchical_algorithms[collective] = collective_alg;
    /* Update the maximum number of function calls; it is used for resource allocation */
    ml_module->max_fn_calls = (collective_alg->n_functions > ml_module->max_fn_calls) ?
                                    collective_alg->n_functions : ml_module->max_fn_calls;
    /* Ishai: What is this n_buffers? I did not find where it is being used */
    topo_info->hierarchical_algorithms[collective]->n_buffers = 1;

    /* Release temporary memory */
    if (NULL != scratch_indx) {
        free(scratch_indx);
    }

    if (NULL != scratch_num) {
       free(scratch_num);
    }

    return OMPI_SUCCESS;

Error:
    if (NULL != collective_alg) {
        /* check collective_alg before dereferencing it; the functions
         * array is freed first, then the descriptor itself */
        if (NULL != collective_alg->functions) {
            free(collective_alg->functions);
        }

        free(collective_alg);
    }

    if (NULL != scratch_indx) {
        free(scratch_indx);
    }

    if (NULL != scratch_num) {
        free(scratch_num);
    }

    return ret;
}
int ml_coll_barrier_constant_group_data_setup(
                mca_coll_ml_topology_t *topo_info,
                mca_coll_ml_collective_operation_description_t  *schedule)
{
    /* local variables */
    int i, j, cnt, value_to_set = -1, ret = OMPI_SUCCESS, num_up_levels,
        num_hierarchies = topo_info->n_levels, n_functions = schedule->n_fns,
        global_high_hierarchy_index = topo_info->global_highest_hier_group_index;

    bool call_for_top_function, prev_is_zero;
    mca_coll_ml_utility_data_t *constant_group_data = NULL;

    int *scratch_indx = NULL, *scratch_num = NULL;

    mca_bcol_base_module_t *prev_bcol = NULL,
                           *bcol_module = NULL;

    /* Am I a member of the highest level subgroup? */
    if (global_high_hierarchy_index ==
          topo_info->component_pairs[num_hierarchies - 1].bcol_index) {
        /* A process that is a member of the highest level subgroup
           calls the top algorithm in addition to the fan-in/out steps */
        call_for_top_function = true;
        /* The highest level runs only the top algorithm, so exclude it
           from the up/down count */
        num_up_levels = num_hierarchies - 1;
    } else {
        /* The process is not a member of the highest level subgroup;
           as a result it does not call the top algorithm,
           but it does call all fan-in/out steps */
        call_for_top_function = false;
        num_up_levels = num_hierarchies;
    }

    /* Algorithm Description:
     * =====================
     * The algorithm used here for an N-level system:
     *  - levels 0 to N-2, inclusive: up algorithm (fan-in for Barrier)
     *  - level N-1: top algorithm (Barrier algorithm)
     *  - levels N-2 down to 0: down algorithm (fan-out)
     */


    /* Starting scratch_num and scratch_index calculations */
    /* =================================================== */

    /* Figure out how many of the same bcols are called in a row.
     * The index of each bcol within its run is stored in scratch_indx,
     * and the total number of bcols in the run is stored in scratch_num */
    scratch_indx = (int *) calloc (2 * num_hierarchies, sizeof (int));
    if(NULL == scratch_indx) {
        ML_ERROR(("Can't allocate memory."));
        ret = OMPI_ERR_OUT_OF_RESOURCE;
        goto Const_Data_Setup_Error;
    }

    scratch_num = (int *) malloc(sizeof(int) * (2 * num_hierarchies));
    if(NULL == scratch_num) {
        ML_ERROR(("Can't allocate memory."));
        ret = OMPI_ERR_OUT_OF_RESOURCE;
        goto Const_Data_Setup_Error;
    }

    /* We go through all stages of the algorithm (up, top, down)
     * and calculate the bcol index. If the previous bcol is of the same
     * type as the current one, the index is incremented; otherwise it is
     * reset to zero */
    prev_bcol = NULL;

    /* Going up */
    for (i = 0, cnt = 0; i < num_up_levels; ++i, ++cnt) {
        if (IS_BCOL_TYPE_IDENTICAL(prev_bcol, GET_BCOL(topo_info, i))) {
            scratch_indx[cnt] = scratch_indx[cnt - 1] + 1;
        } else {
            scratch_indx[cnt] = 0;
            prev_bcol = GET_BCOL(topo_info, i);
        }
    }

    /* Top - executed only by processes that are members of the highest-level (global) subgroup */
    if (call_for_top_function) {
        if (IS_BCOL_TYPE_IDENTICAL(prev_bcol, GET_BCOL(topo_info, num_hierarchies - 1))) {
            scratch_indx[cnt] = scratch_indx[cnt - 1] + 1;
        } else {
            scratch_indx[cnt] = 0;
            prev_bcol = GET_BCOL(topo_info, num_hierarchies - 1);
        }

        ++cnt;
    }

    /* Going down */
    for (i = num_up_levels - 1; i >= 0; --i, ++cnt) {
        if (IS_BCOL_TYPE_IDENTICAL(prev_bcol, GET_BCOL(topo_info, i))) {
            scratch_indx[cnt] = scratch_indx[cnt - 1] + 1;
        } else {
            scratch_indx[cnt] = 0;
            prev_bcol = GET_BCOL(topo_info, i);
        }
    }

    /*
     * Calculate the number of the same bcols in a row.
     * We parse the index array from the end; when an index is zero,
     * the current run is done and we move on to the next run of bcols.
     * The run length is equal to the maximal bcol index in the run + 1.
     */
    i = cnt - 1;
    prev_is_zero = true;
    do {
        if (prev_is_zero) {
            value_to_set = scratch_indx[i] + 1;
            prev_is_zero = false;
        }

        if (0 == scratch_indx[i]) {
            prev_is_zero = true;
        }

        scratch_num[i] = value_to_set;
        --i;
    } while(i >= 0);

    /* =========================================================== */
    /* We are done with scratch_num and scratch_index calculations */

    /* Setup function call for each algorithm step */
    cnt = 0;

    /* Up phase */
    for (i = 0; i < num_up_levels; ++i) {
        bcol_module = GET_BCOL(topo_info, i);
        constant_group_data = &schedule->component_functions[cnt].constant_group_data;

        constant_group_data->bcol_module = bcol_module;
        constant_group_data->index_in_consecutive_same_bcol_calls = scratch_indx[cnt];
        constant_group_data->n_of_this_type_in_a_row = scratch_num[cnt];

        ++cnt;
    }

    /* Top function */
    if (call_for_top_function) {
        bcol_module = GET_BCOL(topo_info, num_hierarchies - 1);
        constant_group_data = &schedule->component_functions[cnt].constant_group_data;

        constant_group_data->bcol_module = bcol_module;
        constant_group_data->index_in_consecutive_same_bcol_calls = scratch_indx[cnt];
        constant_group_data->n_of_this_type_in_a_row = scratch_num[cnt];

        ++cnt;
    }

    /* Down phase */
    for (i = num_up_levels - 1; i >= 0; --i) {
        bcol_module = GET_BCOL(topo_info, i);
        constant_group_data = &schedule->component_functions[cnt].constant_group_data;

        constant_group_data->bcol_module = bcol_module;

        /* All Fan-Outs will be done in parallel */
        constant_group_data->index_in_consecutive_same_bcol_calls = 0;
        constant_group_data->n_of_this_type_in_a_row = 1;

        ++cnt;
    }
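
    /* Note: unlike the up and top phases, the down phase deliberately
     * ignores the scratch values computed above; recording each fan-out
     * as its own run of length 1 allows all of them to be issued
     * concurrently. */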

    /* Figure out how many times this bcol is used in this collective call */
    for (i = 0; i < n_functions; ++i) {
        struct mca_coll_ml_compound_functions_t *component_functions =
                                 schedule->component_functions;
        mca_bcol_base_module_t *current_bcol =
                                 component_functions[i].constant_group_data.bcol_module;

        /* silence clang warning about possible NULL dereference of
         * component_functions; this case is a developer error if it occurs */
        assert (NULL != component_functions);

        cnt = 0;
        for (j = 0; j < n_functions; ++j) {
            if (current_bcol ==
                    component_functions[j].constant_group_data.bcol_module) {
                /* record this occurrence's position among all uses of the
                 * same bcol module in the schedule */
                component_functions[j].constant_group_data.index_of_this_type_in_collective = cnt;

                ++cnt;
            }
        }

        component_functions[i].constant_group_data.n_of_this_type_in_collective = cnt;
    }

    MCA_COLL_ML_SET_SCHEDULE_ORDER_INFO(schedule);

    /* Release temporary memory */
    free(scratch_num);
    free(scratch_indx);

    return OMPI_SUCCESS;

Const_Data_Setup_Error:
    if (NULL != scratch_indx) {
        free(scratch_indx);
    }

    if (NULL != scratch_num) {
        free(scratch_num);
    }

    return ret;
}
/*
 * Fill in the collective descriptor
 */
static int mca_coll_ml_build_static_reduce_schedule(
                                    mca_coll_ml_topology_t *topo_info,
                                    mca_coll_ml_collective_operation_description_t **coll_desc)
{
    int i_hier, j_hier,  n_fcns,
        n_hiers = topo_info->n_levels;
    int *scratch_indx = NULL,
        *scratch_num = NULL;
    int cnt, value_to_set = 0;
    int ret = OMPI_SUCCESS;
    bool prev_is_zero;
    mca_coll_ml_compound_functions_t *comp_fns_temp;
    mca_bcol_base_module_t *prev_bcol,
                           *bcol_module;
    mca_coll_ml_compound_functions_t *comp_fn;
    mca_coll_ml_collective_operation_description_t  *schedule = NULL;

    *coll_desc = (mca_coll_ml_collective_operation_description_t *)
                  malloc(sizeof(mca_coll_ml_collective_operation_description_t));

    schedule = *coll_desc;
    if (OPAL_UNLIKELY(NULL == schedule)) {
        ML_ERROR(("Can't allocate memory.\n"));
        ret = OMPI_ERR_OUT_OF_RESOURCE;
        goto Error;
    }
    /* initialize so the Error path can safely free this field */
    schedule->component_functions = NULL;

    scratch_indx = (int *) malloc(sizeof(int) * (n_hiers));
    if (NULL == scratch_indx) {
        ML_ERROR(("Can't allocate memory.\n"));
        ret = OMPI_ERR_OUT_OF_RESOURCE;
        goto Error;
    }

    scratch_num = (int *) malloc(sizeof(int) * (n_hiers));
    if (NULL == scratch_num) {
        ML_ERROR(("Can't allocate memory.\n"));
        ret = OMPI_ERR_OUT_OF_RESOURCE;
        goto Error;
    }

    prev_bcol = NULL;

    /* Calculate scratch numbers */
    for (i_hier = 0; i_hier < n_hiers; i_hier++) {
        if (IS_BCOL_TYPE_IDENTICAL(prev_bcol, GET_BCOL(topo_info, i_hier))) {
            scratch_indx[i_hier] = scratch_indx[i_hier - 1] + 1;
        } else {
            scratch_indx[i_hier] = 0;
            prev_bcol = GET_BCOL(topo_info, i_hier);
        }
    }

    --i_hier;
    prev_is_zero = true;

    do {
        if (prev_is_zero) {
            value_to_set = scratch_indx[i_hier] + 1;
            prev_is_zero = false;
        }

        if (0 == scratch_indx[i_hier]) {
            prev_is_zero = true;
        }

        scratch_num[i_hier] = value_to_set;
        --i_hier;
    } while(i_hier >= 0);

    /* Each hierarchy level calls exactly one function, unlike the other collectives */
    n_fcns = n_hiers;

    /* Set the number of functions equal to the number of hierarchies */
    schedule->n_fns = n_fcns;
    schedule->topo_info = topo_info;
    schedule->progress_type = 0;
    /* Allocate the component functions */
    schedule->component_functions = (struct mca_coll_ml_compound_functions_t *)
                                     calloc(n_fcns, sizeof(struct mca_coll_ml_compound_functions_t));

    if (OPAL_UNLIKELY(NULL == schedule->component_functions)) {
        ML_ERROR(("Can't allocate memory.\n"));
        ret = OMPI_ERR_OUT_OF_RESOURCE;
        goto Error;
    }

    for (i_hier = 0; i_hier < n_hiers; ++i_hier) {
        comp_fn = &schedule->component_functions[i_hier];

        /* The hierarchical level */
        comp_fn->h_level = i_hier;
        bcol_module = GET_BCOL(topo_info, i_hier);

        comp_fn->bcol_function =
                bcol_module->filtered_fns_table[DATA_SRC_KNOWN][NON_BLOCKING][BCOL_REDUCE][1][0][0];

        strcpy(comp_fn->fn_name, "REDUCE");
        ML_VERBOSE(10, ("func indx %d set to %p", i_hier, comp_fn->bcol_function));


        ML_VERBOSE(1,("In ML_REDUCE_SETUP  .. looks fine here"));
        /* No need completion func for Barrier */
        comp_fn->task_comp_fn = mca_coll_ml_task_comp_static_reduce;

        /* Constants */
        comp_fn->constant_group_data.bcol_module = bcol_module;
        comp_fn->constant_group_data.index_in_consecutive_same_bcol_calls = scratch_indx[i_hier];
        comp_fn->constant_group_data.n_of_this_type_in_a_row = scratch_num[i_hier];
        comp_fn->constant_group_data.n_of_this_type_in_collective = 0;
        comp_fn->constant_group_data.index_of_this_type_in_collective = 0;

        ML_VERBOSE(10, ("Setting collective [reduce] fn_idx %d, n_of_this_type_in_a_row %d, "
                        "index_in_consecutive_same_bcol_calls %d.",
                         i_hier, comp_fn->constant_group_data.n_of_this_type_in_a_row,
                         comp_fn->constant_group_data.index_in_consecutive_same_bcol_calls));
    }

    /* Fill in the rest of the constant data */
    for (i_hier = 0; i_hier < n_hiers; i_hier++) {
        mca_bcol_base_module_t *current_bcol =
            schedule->component_functions[i_hier].
            constant_group_data.bcol_module;
        cnt = 0;
        for (j_hier = 0; j_hier < n_hiers; j_hier++) {
            if (current_bcol ==
                    schedule->component_functions[j_hier].
                    constant_group_data.bcol_module) {
                schedule->component_functions[j_hier].
                    constant_group_data.index_of_this_type_in_collective = cnt;
                cnt++;
            }
        }
        schedule->component_functions[i_hier].
            constant_group_data.n_of_this_type_in_collective = cnt;
    }

    /* Manju: Reduction should always use the fixed schedule.
     * The subgroups in which this process is a leader should be executed
     * first, then the subgroups where it is not a leader, and finally
     * the subgroup that includes the root.
     */
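
    /*
     * E.g. (hypothetical n_hiers = 3, root function at i_hier = 1): the
     * remaining functions {0, 2} are visited in order; a leader fills
     * comp_fns_temp from the front, a non-leader fills it from the back,
     * and the root's function always goes last, so a leader ends up with
     * [0, 2, 1] and a non-leader with [2, 0, 1].
     */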

    /* Allocate the schedule list */
    schedule->comp_fn_arr = (struct mca_coll_ml_compound_functions_t **)
        calloc(n_hiers,sizeof(struct mca_coll_ml_compound_functions_t *));
    if (NULL == schedule->comp_fn_arr) {
        ML_ERROR(("Can't allocate memory.\n"));
        ret = OMPI_ERR_OUT_OF_RESOURCE;
        goto Error;
    }

    /* Now that the functions have been set up properly, we can simply permute the ordering a bit */

    for (i_hier = 0; i_hier < n_hiers; i_hier++) {
        /* first one is trivial */
        int leader_hierarchy = 0;
        int non_leader_hierarchy = 0;
        int func_index;

        comp_fns_temp = (struct mca_coll_ml_compound_functions_t *)
            calloc(n_hiers, sizeof(struct mca_coll_ml_compound_functions_t));
        if (NULL == comp_fns_temp) {
            ML_ERROR(("Can't allocate memory.\n"));
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto Error;
        }

        leader_hierarchy = 0;
        non_leader_hierarchy = n_hiers - 2;

        for(j_hier = 0; j_hier < n_hiers - 1 ; j_hier++) {

            func_index = j_hier < i_hier ? j_hier : j_hier + 1;
            /* I'm a leader for this group */
            if (0 == topo_info->component_pairs->subgroup_module->my_index) {
                comp_fns_temp[leader_hierarchy++] =
                    schedule->component_functions[func_index];
            }
            else {
                comp_fns_temp[non_leader_hierarchy--] =
                    schedule->component_functions[func_index];
            }
        }

        comp_fns_temp[j_hier] = schedule->component_functions[i_hier];
        /* now let's attach this list to our array of lists */
        schedule->comp_fn_arr[i_hier] = comp_fns_temp;
    }

    /* Manju: Do we need this ? */

    /* I'm going to just loop over each schedule and
     * set up the scratch indices, scratch numbers
     * and other constant data
     */
    /*
    for( i_hier = 1; i_hier < n_hiers; i_hier++) {
        ret = mca_coll_ml_setup_scratch_vals(schedule->comp_fn_arr[i_hier], scratch_indx,
                scratch_num, n_hiers);
        if( OMPI_SUCCESS != ret ) {
            ret = OMPI_ERROR;
            goto Error;
        }

    }
    */

    /* Do I need this ? */
    schedule->task_setup_fn[COLL_ML_ROOT_TASK_FN] = mca_coll_ml_static_reduce_root;
    schedule->task_setup_fn[COLL_ML_GENERAL_TASK_FN] = mca_coll_ml_static_reduce_non_root;

    MCA_COLL_ML_SET_SCHEDULE_ORDER_INFO(schedule);

    free(scratch_num);
    free(scratch_indx);

    return OMPI_SUCCESS;

Error:
    free(scratch_indx);
    free(scratch_num);

    /* schedule may be NULL if its allocation failed */
    if (NULL != schedule) {
        free(schedule->component_functions);
        free(schedule);
        *coll_desc = NULL;
    }

    return ret;
}