Example #1
File: coll_ml_reduce.c  Project: IanYXXL/A1
static int mca_coll_ml_reduce_unpack(mca_coll_ml_collective_operation_progress_t *coll_op)
{
    int ret;
    /* need to put in more */
    int count = coll_op->variable_fn_params.count;
    ompi_datatype_t *dtype = coll_op->variable_fn_params.dtype;

    void *dest = (void *)((uintptr_t)coll_op->full_message.dest_user_addr +
                          (uintptr_t)coll_op->fragment_data.offset_into_user_buffer);
    void *src = (void *)((uintptr_t)coll_op->fragment_data.buffer_desc->data_addr +
                         (size_t)coll_op->variable_fn_params.rbuf_offset);

    ret = ompi_datatype_copy_content_same_ddt(dtype, (int32_t) count, (char *) dest,
            (char *) src);
    if (ret < 0) {
        return OMPI_ERROR;
    }

    if (coll_op->variable_fn_params.root_flag) {
        ML_VERBOSE(1,("In reduce unpack %d",
                      *(int *)((unsigned char*) src)));
    }

    ML_VERBOSE(10, ("sbuf addr %p, sbuf offset %d, sbuf val %lf, rbuf addr %p, rbuf offset %d, rbuf val %lf.",
                    coll_op->variable_fn_params.sbuf, coll_op->variable_fn_params.sbuf_offset,
                    *(double *) ((unsigned char *) coll_op->variable_fn_params.sbuf +
                                 (size_t) coll_op->variable_fn_params.sbuf_offset),
                    coll_op->variable_fn_params.rbuf, coll_op->variable_fn_params.rbuf_offset,
                    *(double *) ((unsigned char *) coll_op->variable_fn_params.rbuf +
                                 (size_t) coll_op->variable_fn_params.rbuf_offset)));

    return OMPI_SUCCESS;
}
Example #2
File: coll_ml_reduce.c  Project: IanYXXL/A1
int mca_coll_ml_reduce_nb(void *sbuf, void *rbuf, int count,
                          struct ompi_datatype_t *dtype, struct ompi_op_t *op,
                          int root, struct ompi_communicator_t *comm,
                          ompi_request_t **req,
                          mca_coll_base_module_t *module) {

    int ret = OMPI_SUCCESS;
    mca_coll_ml_module_t *ml_module = (mca_coll_ml_module_t*)module;

    if (OPAL_UNLIKELY(!ompi_op_is_commute(op) || !opal_datatype_is_contiguous_memory_layout(&dtype->super, count))) {
        /* coll/ml does not handle non-commutative operations or non-contiguous
         * datatypes at this time. Fall back on another collective module */
        return ml_module->fallback.coll_ireduce (sbuf, rbuf, count, dtype, op, root, comm, req,
                ml_module->fallback.coll_ireduce_module);
    }

    ML_VERBOSE(10,("Calling Ml Reduce "));
    ret = parallel_reduce_start(sbuf, rbuf, count, dtype, op,
                                root, comm, ml_module,
                                req, ML_SMALL_DATA_REDUCE,
                                ML_LARGE_DATA_REDUCE);
    if (OPAL_UNLIKELY(ret != OMPI_SUCCESS)) {
        ML_VERBOSE(10, ("Failed to launch"));
        return ret;
    }


    ML_VERBOSE(10, ("Non-blocking Reduce is done"));

    return OMPI_SUCCESS;

}
Example #3
int mca_coll_ml_allgather(const void *sbuf, int scount,
                          struct ompi_datatype_t *sdtype,
                          void* rbuf, int rcount,
                          struct ompi_datatype_t *rdtype,
                          struct ompi_communicator_t *comm,
                          mca_coll_base_module_t *module)
{
    ompi_request_t *req;
    int ret;

    ML_VERBOSE(10, ("Starting blocking allgather"));

    ret = mca_coll_ml_allgather_start (sbuf, scount, sdtype,
                                       rbuf, rcount, rdtype,
                                       comm, module, &req);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
        return ret;
    }

    ret = ompi_request_wait (&req, MPI_STATUS_IGNORE);

    ML_VERBOSE(10, ("Blocking allgather is complete"));

    return ret;
}
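Note that the blocking allgather above is simply the non-blocking start followed by a wait on the returned request. Below is a stand-alone user-level sketch of the same pattern using plain MPI-3 calls (MPI_Iallgather + MPI_Wait); it is illustrative only, not part of coll/ml, and assumes an MPI-3 capable installation and at most 64 ranks.

#include <mpi.h>
#include <stdio.h>

int main(int argc, char **argv)
{
    int rank, size;
    int recvbuf[64];                     /* assumes a communicator of at most 64 ranks */
    MPI_Request req;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    /* start the non-blocking allgather ... */
    MPI_Iallgather(&rank, 1, MPI_INT, recvbuf, 1, MPI_INT, MPI_COMM_WORLD, &req);
    /* ... and wait on it immediately: together this behaves like MPI_Allgather */
    MPI_Wait(&req, MPI_STATUS_IGNORE);

    if (0 == rank) {
        printf("gathered %d ranks, last value %d\n", size, recvbuf[size - 1]);
    }

    MPI_Finalize();
    return 0;
}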
Example #4
/**
 * Hierarchical blocking barrier
 */
int mca_coll_ml_barrier_intra(struct ompi_communicator_t *comm,
                              mca_coll_base_module_t *module)
{
    int rc;
    ompi_request_t *req;

    mca_coll_ml_module_t *ml_module = (mca_coll_ml_module_t *) module;

#if OPAL_ENABLE_DEBUG
    static int barriers_count = 0;
#endif

    ML_VERBOSE(10, ("Barrier num %d start.", ++barriers_count));

    rc = mca_coll_ml_barrier_launch(ml_module, &req);
    if (OPAL_UNLIKELY(rc != OMPI_SUCCESS)) {
        ML_ERROR(("Failed to launch a barrier."));
        return rc;
    }

    /* Blocking barrier */
    ompi_request_wait_completion(req);
    ompi_request_free(&req);

    ML_VERBOSE(10, ("Barrier num %d was done.", barriers_count));

    return OMPI_SUCCESS;
}
Example #5
static int mca_coll_ml_lmngr_init(mca_coll_ml_lmngr_t *lmngr)
{
    int i, num_blocks;
    int rc;
    unsigned char *addr;
    bcol_base_network_context_t *nc;

    ML_VERBOSE(7, ("List initialization"));

#ifdef HAVE_POSIX_MEMALIGN
    if((errno = posix_memalign(&lmngr->base_addr,
                    lmngr->list_alignment,
                    lmngr->list_size * lmngr->list_block_size)) != 0) {
        ML_ERROR(("Failed to allocate memory: %d [%s]", errno, strerror(errno)));
        return OMPI_ERROR;
    }
    lmngr->alloc_base = lmngr->base_addr;
#else
    lmngr->alloc_base =
        malloc(lmngr->list_size * lmngr->list_block_size + lmngr->list_alignment);
    if(NULL == lmngr->alloc_base) {
        ML_ERROR(("Failed to allocate memory: %d [%s]", errno, strerror(errno)));
        return OMPI_ERROR;
    }

    lmngr->base_addr = (void*)OPAL_ALIGN((uintptr_t)lmngr->alloc_base,
            lmngr->list_alignment, uintptr_t);
#endif

    assert(lmngr->n_resources < MCA_COLL_ML_MAX_REG_INFO);

    for(i= 0 ;i < lmngr->n_resources ;i++) {
        nc = lmngr->net_context[i];
        ML_VERBOSE(7, ("Call registration for resource index %d", i));
        rc = lmngr_register(lmngr, nc);
        if (OMPI_SUCCESS != rc) {
            ML_ERROR(("Failed to lmngr register: %d [%s]", errno, strerror(errno)));
            return rc;
        }
    }

    /* slice the memory to blocks */
    addr = (unsigned char *) lmngr->base_addr;
    for(num_blocks = 0; num_blocks < (int)lmngr->list_size; num_blocks++) {
        mca_bcol_base_lmngr_block_t *item = OBJ_NEW(mca_bcol_base_lmngr_block_t);
        item->base_addr = (void *)addr;
        item->lmngr = lmngr;
        /* ML_VERBOSE(10, ("Appending block # %d %p", num_blocks, (void *)addr)); */
        opal_list_append(&lmngr->blocks_list, (opal_list_item_t *)item);
        /* advance the address */
        addr += lmngr->list_block_size;
    }

    ML_VERBOSE(7, ("List initialization done %d",
                opal_list_get_size(&lmngr->blocks_list)));
    return OMPI_SUCCESS;
}
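The #else branch above over-allocates with malloc() and rounds the start address up with OPAL_ALIGN. A minimal stand-alone sketch of that alignment trick using only the C standard library (the names and sizes below are illustrative, not taken from coll/ml):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* Round addr up to the next multiple of align (align must be a power of two);
 * this mirrors what OPAL_ALIGN does in the fallback path above. */
static void *align_up(void *addr, size_t align)
{
    uintptr_t a = (uintptr_t) addr;
    return (void *) ((a + align - 1) & ~(uintptr_t) (align - 1));
}

int main(void)
{
    size_t block_size = 4096, n_blocks = 8, alignment = 64;

    /* over-allocate by "alignment" so the aligned start still leaves room for all blocks */
    void *alloc_base = malloc(n_blocks * block_size + alignment);
    if (NULL == alloc_base) {
        return 1;
    }

    void *base_addr = align_up(alloc_base, alignment);
    printf("raw %p, aligned %p\n", alloc_base, base_addr);

    free(alloc_base);   /* always free the original pointer, never the aligned one */
    return 0;
}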
static int mca_coll_ml_memsync_recycle_memory(mca_coll_ml_collective_operation_progress_t *coll_op)
{
    mca_coll_ml_module_t *ml_module = (mca_coll_ml_module_t *)coll_op->coll_module;
    mca_bcol_base_memory_block_desc_t *ml_memblock = ml_module->payload_block;
    mca_coll_ml_collective_operation_progress_t *pending_op = NULL;
    int bank = coll_op->full_message.bank_index_to_recycle;
    int rc;
    bool have_resources = true;

    assert(bank >= 0 &&
           bank < (int)ml_memblock->num_banks &&
           ML_MEMSYNC == coll_op->fragment_data.current_coll_op);

    ML_VERBOSE(10,("MEMSYNC: bank %d was recycled coll_op %p", bank, coll_op));

    /* set the bank as free */

    ml_memblock->bank_is_busy[bank] = false;
    ml_memblock->bank_release_counters[bank] = 0;

    /* Check if we have any requests that are waiting for memory */
    while(opal_list_get_size(&ml_module->waiting_for_memory_list) && have_resources) {
        pending_op = (mca_coll_ml_collective_operation_progress_t *)
            opal_list_get_first(&ml_module->waiting_for_memory_list);

        ML_VERBOSE(10, ("Trying to start pending %p", pending_op));
        assert(pending_op->pending & REQ_OUT_OF_MEMORY);
        rc = pending_op->fragment_data.message_descriptor->fragment_launcher(pending_op);
        switch (rc) {
            case OMPI_SUCCESS: 
                ML_VERBOSE(10, ("Pending fragment was started %p", pending_op));
                pending_op->pending ^= REQ_OUT_OF_MEMORY;
                opal_list_remove_item(&ml_module->waiting_for_memory_list,
                        (opal_list_item_t *)pending_op);
                if (0 != pending_op->fragment_data.offset_into_user_buffer) {
                    /* non-zero offset ==> this is not fragment 0 */
                    CHECK_AND_RECYCLE(pending_op);
                }
                break;
            case OMPI_ERR_TEMP_OUT_OF_RESOURCE: 
                ML_VERBOSE(10, ("Already on the list %p", pending_op));
                have_resources = false;
                break;
            default:
                ML_ERROR(("Error happened %d", rc));
                return rc;
        }
    }

    ML_VERBOSE(10, ("Memsync done %p", coll_op));
    return OMPI_SUCCESS;
}
int ml_coll_hier_reduce_setup(mca_coll_ml_module_t *ml_module)
{
    int alg, ret, topo_index=0;
    mca_coll_ml_topology_t *topo_info =
           &ml_module->topo_list[ml_module->collectives_topology_map[ML_REDUCE][ML_SMALL_MSG]];

    if ( ml_module->max_fn_calls < topo_info->n_levels ) {
        ml_module->max_fn_calls = topo_info->n_levels;
    }


    alg = mca_coll_ml_component.coll_config[ML_REDUCE][ML_SMALL_MSG].algorithm_id;
    topo_index = ml_module->collectives_topology_map[ML_REDUCE][alg];
    if (ML_UNDEFINED == alg || ML_UNDEFINED == topo_index) {
        ML_ERROR(("No topology index or algorithm was defined"));
        topo_info->hierarchical_algorithms[ML_REDUCE] = NULL;
        return OMPI_ERROR;
    }

    ret = mca_coll_ml_build_static_reduce_schedule(&ml_module->topo_list[topo_index],
            &ml_module->coll_ml_reduce_functions[alg]);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
        ML_VERBOSE(10, ("Failed to setup static reduce"));
        return ret;
    }


    return OMPI_SUCCESS;
}
Example #8
static int lmngr_register(mca_coll_ml_lmngr_t *lmngr, bcol_base_network_context_t *nc)
{
    int rc, j;
    int max_nc = lmngr->n_resources;

    rc = nc->register_memory_fn(nc->context_data,
            lmngr->base_addr,
            lmngr->list_size * lmngr->list_block_size,
            &lmngr->reg_desc[nc->context_id]);

    if(rc != OMPI_SUCCESS) {
        int ret_val;
        ML_VERBOSE(7, ("Failed to register [%d], unrolling the registration", rc));
        /* deregister the successful registrations */
        for( j = 0; j < max_nc; j++ ) {
            /* set the registration parameter to point to the current
             * resource description */
            nc = lmngr->net_context[j];
            ret_val = nc->deregister_memory_fn(nc->context_data,
                    lmngr->reg_desc[nc->context_id]);
            if(ret_val != OMPI_SUCCESS) {
                return ret_val;
            }
        }

        return rc;
    }

    return OMPI_SUCCESS;
}
Example #9
int mca_coll_ml_lmngr_tune(mca_coll_ml_lmngr_t *lmngr,
        size_t block_size, size_t list_size, size_t alignment)
{
    ML_VERBOSE(7, ("Tunning list manager"));

    if (OPAL_UNLIKELY(NULL != lmngr->base_addr)) {
        ML_VERBOSE(7, ("The list manager is already initialized, it can no longer be tuned"));
        return OMPI_ERROR;
    }

    lmngr->list_block_size = block_size;
    lmngr->list_alignment = alignment;
    lmngr->list_size = list_size;

    return OMPI_SUCCESS;
}
Example #10
int mca_coll_ml_lmngr_append_nc(mca_coll_ml_lmngr_t *lmngr, bcol_base_network_context_t *nc)
{
    int i, rc;

    ML_VERBOSE(7, ("Append new network context %p to list manager %p",
                nc, lmngr));

    if (NULL == nc) {
        return OMPI_ERROR;
    }

    if (OPAL_UNLIKELY(MCA_COLL_ML_MAX_REG_INFO == lmngr->n_resources)) {
        ML_ERROR(("Exceeded the maximum number of supported network contexts (%d)", MCA_COLL_ML_MAX_REG_INFO));
        return OMPI_ERROR;
    }

    /* Check if we already have the context on the list;
       if we do, do not do anything - just return success */
    for (i = 0; i < lmngr->n_resources; i++) {
        if (lmngr->net_context[i] == nc) {
            ML_VERBOSE(7, ("It is not new "));
            return OMPI_SUCCESS;
        }
    }

    ML_VERBOSE(7, ("Adding new context"));

    /* Setting context id */
    nc->context_id = lmngr->n_resources;
    lmngr->net_context[lmngr->n_resources] = nc;

    lmngr->n_resources++;

    /* Register the memory with new context */
    if (NULL != lmngr->base_addr) {
        rc = lmngr_register(lmngr, nc);
        if (OMPI_SUCCESS != rc) {
            return rc;
        }
    }

    return OMPI_SUCCESS;
}
Example #11
static int add_to_invoke_table(mca_bcol_base_module_t *bcol_module,
                       mca_bcol_base_coll_fn_desc_t *fn_filtered,
                       mca_coll_ml_module_t *ml_module)
{
    struct mca_bcol_base_coll_fn_invoke_attributes_t *inv_attribs = NULL;
    int bcoll_type, data_src_type, waiting_semantic;
    int range_min,range_max;
    int i=0,j=0,k=0,mask=1;



    if((NULL == fn_filtered->inv_attr)||(NULL == fn_filtered->comm_attr)) {
        return OMPI_ERROR;
    }

    ML_VERBOSE(10, ("Calling add_to_invoke_table %p",fn_filtered->coll_fn));

    inv_attribs = fn_filtered->inv_attr;
    bcoll_type = fn_filtered->comm_attr->bcoll_type;
    data_src_type = fn_filtered->comm_attr->data_src;
    waiting_semantic = fn_filtered->comm_attr->waiting_semantics;

    range_min = msg_to_range(inv_attribs->bcol_msg_min);
    range_max = msg_to_range(inv_attribs->bcol_msg_max);

    for (j=0; j<OMPI_OP_NUM_OF_TYPES; j++) {
        for (k=0; k<OMPI_DATATYPE_MAX_PREDEFINED; k++) {

            if ((inv_attribs->datatype_bitmap & (mask << k)) && (inv_attribs->op_types_bitmap & (mask << j))){

               for (i=range_min; i<=range_max; i++) {
                    bcol_module->filtered_fns_table[data_src_type][waiting_semantic][bcoll_type][i][j][k]
                                                                    = fn_filtered;
                    ML_VERBOSE(21, ("Putting functions %d %d %d %d %p", bcoll_type, i, j, k, fn_filtered));
               }
            }
        }
    }

    return 0;

}
Example #12
static int mca_coll_ml_barrier_launch(mca_coll_ml_module_t *ml_module,
                                     ompi_request_t **req)
{
    int rc;

    ompi_free_list_item_t *item;
    mca_coll_ml_collective_operation_progress_t *coll_op;
    ml_payload_buffer_desc_t *src_buffer_desc = NULL;
    
    /* allocate an ml buffer for signaling purposes */
    src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module);

    while (NULL == src_buffer_desc) {
        opal_progress();
        src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module);
    }

    
    /* Blocking call on fragment allocation (Maybe we want to make it non blocking ?) */
    OMPI_FREE_LIST_WAIT(&(ml_module->coll_ml_collective_descriptors),
                          item,
                          rc);

    coll_op = (mca_coll_ml_collective_operation_progress_t *) item;
    assert(NULL != coll_op);

    ML_VERBOSE(10, ("Get coll request %p", coll_op));

    MCA_COLL_ML_OP_BASIC_SETUP(coll_op, 0, 0, NULL, NULL, ml_module->coll_ml_barrier_function);

    coll_op->fragment_data.buffer_desc = src_buffer_desc;
    coll_op->dag_description.num_tasks_completed = 0;

    coll_op->variable_fn_params.buffer_index = src_buffer_desc->buffer_index;

    coll_op->variable_fn_params.sequence_num =
        OPAL_THREAD_ADD64(&(ml_module->collective_sequence_num), 1);

    /* Pointer to a coll finalize function */
    coll_op->process_fn = NULL;

    (*req) = &coll_op->full_message.super;

    OMPI_REQUEST_INIT((*req), false);

    (*req)->req_status._cancelled = 0;
    (*req)->req_state = OMPI_REQUEST_ACTIVE;
    (*req)->req_status.MPI_ERROR = OMPI_SUCCESS;

    /* Set order info if there is a bcol needs ordering */
    MCA_COLL_ML_SET_ORDER_INFO(coll_op, 1);

    return mca_coll_ml_generic_collectives_launcher(coll_op, mca_coll_ml_barrier_task_setup);
}
Example #13
static inline __opal_attribute_always_inline__ int mca_coll_ml_memsync_launch(mca_coll_ml_module_t *ml_module,
                                     ompi_request_t **req, int bank_index)
{
    mca_coll_ml_collective_operation_progress_t *coll_op;

    coll_op = mca_coll_ml_alloc_op_prog_single_frag_dag(ml_module,
            ml_module->coll_ml_memsync_function,
            NULL, NULL, 0, 0);

    assert(NULL != coll_op);

    ML_VERBOSE(10, ("Get coll request %p", coll_op));

    coll_op->fragment_data.buffer_desc = NULL;
    
    /* Caching bank index for future memory recycling callback */
    coll_op->full_message.bank_index_to_recycle = bank_index;

    coll_op->fragment_data.current_coll_op = ML_MEMSYNC;
    /* Rather than defining one more parameter, we pass the bank index
     * in the root field */
    coll_op->variable_fn_params.root = bank_index;
    /* This is also a little ugly: since nothing waits for this request,
     * in order to recycle it we have to set the offset to some value > 1 */
    coll_op->fragment_data.offset_into_user_buffer = 1;
    coll_op->variable_fn_params.buffer_index = MCA_COLL_ML_NO_BUFFER;
    coll_op->variable_fn_params.sequence_num = -1; /* It should be safe to use -1 */
    /* Pointer to a coll finalize function */
    if (OPAL_LIKELY(ml_module->initialized)) {
        coll_op->process_fn = mca_coll_ml_memsync_recycle_memory;
    } else {
        /* No post work on first call */
        coll_op->process_fn = NULL;
    }

    ML_VERBOSE(10,("Memsync start %p", &coll_op));

    return mca_coll_ml_generic_collectives_append_to_queue(coll_op, mca_coll_ml_barrier_task_setup);
}
Example #14
static void destruct_lmngr(mca_coll_ml_lmngr_t *lmngr)
{
    int max_nc = lmngr->n_resources;
    int rc, i;
    bcol_base_network_context_t *nc;
    opal_list_item_t *item;

    ML_VERBOSE(6, ("Destructing list manager %p", (void *)lmngr));

    while (NULL != (item = opal_list_remove_first(&lmngr->blocks_list))) {
        OBJ_RELEASE(item);
    }

    OBJ_DESTRUCT(&lmngr->blocks_list);

    if (NULL != lmngr->alloc_base) {
        for( i = 0; i < max_nc; i++ ) {
            nc = lmngr->net_context[i];
            rc = nc->deregister_memory_fn(nc->context_data,
                    lmngr->reg_desc[nc->context_id]);
            if(rc != OMPI_SUCCESS) {
                ML_ERROR(("Failed to unregister , lmngr %p", (void *)lmngr));
            }
        }

        ML_VERBOSE(10, ("Release base addr %p", lmngr->alloc_base));

        free(lmngr->alloc_base);
        lmngr->alloc_base = NULL;
        lmngr->base_addr = NULL;
    }

    lmngr->list_block_size = 0;
    lmngr->list_alignment = 0;
    lmngr->list_size = 0;
    lmngr->n_resources = 0;

    OBJ_DESTRUCT(&lmngr->mem_lock);
}
Example #15
File: coll_ml_reduce.c  Project: IanYXXL/A1
static int
mca_coll_ml_reduce_task_setup (mca_coll_ml_collective_operation_progress_t *coll_op)
{
    int fn_idx, h_level, next_h_level, my_index;
    mca_sbgp_base_module_t *sbgp;
    mca_coll_ml_topology_t *topo = coll_op->coll_schedule->topo_info;

    fn_idx      = coll_op->sequential_routine.current_active_bcol_fn;
    h_level     = coll_op->coll_schedule->component_functions[fn_idx].h_level;
    next_h_level = (fn_idx < coll_op->coll_schedule->n_fns - 1) ?
                   coll_op->coll_schedule->component_functions[fn_idx+1].h_level : -1;
    sbgp        = topo->component_pairs[h_level].subgroup_module;
    my_index    = sbgp->my_index;

    if (coll_op->variable_fn_params.root_flag) {
        ML_VERBOSE(1,("In task completion Data in receiver buffer %d ",
                      *(int *)((unsigned char*) coll_op->variable_fn_params.rbuf +
                               coll_op->variable_fn_params.rbuf_offset)));
    }

    /* determine the root for this level of the hierarchy */
    if (coll_op->coll_schedule->topo_info->route_vector[coll_op->global_root].level == next_h_level ||
            coll_op->global_root == sbgp->group_list[my_index]) {
        /* I am the global root or I will be talking to the global root in the next round. */
        coll_op->variable_fn_params.root = my_index;
    } else if (coll_op->coll_schedule->topo_info->route_vector[coll_op->global_root].level == h_level) {
        /* the root is in this level of my hierarchy */
        coll_op->variable_fn_params.root = coll_op->coll_schedule->topo_info->route_vector[coll_op->global_root].rank;
    } else {
        coll_op->variable_fn_params.root = 0;
    }

    /* Set the route vector for this root */
    coll_op->variable_fn_params.root_route =
        &coll_op->coll_schedule->topo_info->route_vector[sbgp->group_list[coll_op->variable_fn_params.root]];

    /* Am I the root of this hierarchy? */
    coll_op->variable_fn_params.root_flag = (my_index == coll_op->variable_fn_params.root);

    /* At each hierarchy level, swap the source and destination buffer offsets.
     * No need to make this switch for the first call.
     */
    if (0 < fn_idx) {
        int tmp_offset = coll_op->variable_fn_params.sbuf_offset;
        coll_op->variable_fn_params.sbuf_offset =
            coll_op->variable_fn_params.rbuf_offset;
        coll_op->variable_fn_params.rbuf_offset = tmp_offset;
    }

    return OMPI_SUCCESS;
}
Example #16
int mca_coll_ml_allgather_nb(const void *sbuf, int scount,
                             struct ompi_datatype_t *sdtype,
                             void* rbuf, int rcount,
                             struct ompi_datatype_t *rdtype,
                             struct ompi_communicator_t *comm,
                             ompi_request_t **req,
                             mca_coll_base_module_t *module)
{
    int ret;

    ML_VERBOSE(10, ("Starting non-blocking allgather"));

    ret = mca_coll_ml_allgather_start (sbuf, scount, sdtype,
                                       rbuf, rcount, rdtype,
                                       comm, module, req);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
        return ret;
    }

    ML_VERBOSE(10, ("Non-blocking allgather started"));

    return ret;
}
Example #17
/**
 * Hierarchical non-blocking barrier
 */
int mca_coll_ml_ibarrier_intra(struct ompi_communicator_t *comm,
                               ompi_request_t **req,
                               mca_coll_base_module_t *module)
{
    int rc;
    mca_coll_ml_module_t *ml_module = (mca_coll_ml_module_t *) module;

#if OPAL_ENABLE_DEBUG
    static int barriers_count = 0;
#endif

    ML_VERBOSE(10, ("IBarrier num %d start.", ++barriers_count));

    rc = mca_coll_ml_barrier_launch(ml_module, req);
    if (OPAL_UNLIKELY(rc != OMPI_SUCCESS)) {
        ML_ERROR(("Failed to launch a barrier."));
        return rc;
    }

    ML_VERBOSE(10, ("IBarrier num %d was done.", barriers_count));

    return OMPI_SUCCESS;
}
Example #18
int mca_coll_ml_lmngr_reg(void)
{
    int tmp, ret = OMPI_SUCCESS;

    mca_coll_ml_component_t *cm = &mca_coll_ml_component;

#define CHECK(expr) do {\
    tmp = (expr); \
    if (0 > tmp) ret = tmp; \
 } while (0)

    ML_VERBOSE(7, ("Setting parameters for list manager"));

    cm->lmngr_size = 8;
    CHECK(mca_base_component_var_register(&mca_coll_ml_component.super.collm_version,
                                          "memory_manager_list_size", "Memory manager list size",
                                          MCA_BASE_VAR_TYPE_SIZE_T, NULL, 0, 0,
                                          OPAL_INFO_LVL_9,
                                          MCA_BASE_VAR_SCOPE_READONLY,
                                          &cm->lmngr_size));

    /* The list size can not be less than the possible maximum number of ML modules,
       i.e. the maximum number of communicators supported by ML */
    if (cm->lmngr_size < cm->max_comm) {
        cm->lmngr_size = cm->max_comm;
    }

    mca_coll_ml_component.lmngr_block_size = cm->payload_buffer_size *
      cm->n_payload_buffs_per_bank *
      cm->n_payload_mem_banks *
      cm->lmngr_size;

    CHECK(mca_base_component_var_register(&mca_coll_ml_component.super.collm_version,
                                          "memory_manager_block_size", "Memory manager block size",
                                          MCA_BASE_VAR_TYPE_SIZE_T, NULL, 0, 0,
                                          OPAL_INFO_LVL_9,
                                          MCA_BASE_VAR_SCOPE_READONLY,
                                          &mca_coll_ml_component.lmngr_block_size));

    cm->lmngr_alignment = opal_getpagesize();
    CHECK(mca_base_component_var_register(&mca_coll_ml_component.super.collm_version,
                                          "memory_manager_alignment", "Memory manager alignment",
                                          MCA_BASE_VAR_TYPE_SIZE_T, NULL, 0, 0,
                                          OPAL_INFO_LVL_9,
                                          MCA_BASE_VAR_SCOPE_READONLY,
                                          &mca_coll_ml_component.lmngr_block_size));

    return ret;
}
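The CHECK() macro above runs every registration and only records the most recent failing return code, instead of aborting on the first error. A stand-alone sketch of that error-accumulation pattern; register_a/register_b are hypothetical stand-ins for the MCA variable registration calls:

#include <stdio.h>

static int register_a(void) { return 0;  }   /* pretend this step succeeds */
static int register_b(void) { return -1; }   /* pretend this step fails */

static int register_all(void)
{
    int tmp, ret = 0;

#define CHECK(expr) do {        \
    tmp = (expr);               \
    if (0 > tmp) ret = tmp;     \
 } while (0)

    CHECK(register_a());
    CHECK(register_b());        /* the failure is recorded but later steps still run */

#undef CHECK

    return ret;                 /* last negative code, or 0 if everything succeeded */
}

int main(void)
{
    printf("register_all() -> %d\n", register_all());
    return 0;
}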
Example #19
mca_bcol_base_lmngr_block_t* mca_coll_ml_lmngr_alloc (
        mca_coll_ml_lmngr_t *lmngr)
{
    int rc;
    opal_list_t *list = &lmngr->blocks_list;

    /* Check if the list manager was initialized */
    if(OPAL_UNLIKELY(NULL == lmngr->base_addr)) {
        ML_VERBOSE(7 ,("Starting memory initialization"));
        rc = mca_coll_ml_lmngr_init(lmngr);
        if (OMPI_SUCCESS != rc) {
            ML_ERROR(("Failed to init memory"));
            return NULL;
        }
    }

    if(OPAL_UNLIKELY(opal_list_is_empty(list))) {
        /* Upper layer needs to handle the NULL */
        ML_VERBOSE(1, ("List manager is empty."));
        return NULL;
    }

    return (mca_bcol_base_lmngr_block_t *)opal_list_remove_first(list);
}
int ml_coll_memsync_setup(mca_coll_ml_module_t *ml_module)
{
    int ret;
    /* For barrier synchronization we use the barrier topology */
    mca_coll_ml_topology_t *topo_info =
           &ml_module->topo_list[ml_module->collectives_topology_map[ML_BARRIER][ML_SMALL_MSG]];

    ret = mca_coll_ml_build_memsync_schedule(topo_info,
                            &ml_module->coll_ml_memsync_function);

    if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
        ML_VERBOSE(10, ("Failed to setup static bcast"));
        return ret;
    }

    return OMPI_SUCCESS;
}
Example #21
/**
 * Non-blocking memory synchronization
 */
int mca_coll_ml_memsync_intra(mca_coll_ml_module_t *ml_module, int bank_index)
{
    int rc;
    ompi_request_t *req;

    ML_VERBOSE(8, ("MEMSYNC start"));

    if (OPAL_UNLIKELY(0 == opal_list_get_size(&ml_module->active_bcols_list))) {
        /* Josh's change: in the case where only p2p is active, we have no way
         * to reset the bank release counters to zero. I am doing that here since it
         * would actually be "correct" to do it outside of this conditional; however,
         * I suspect that resetting the value to zero elsewhere would corrupt the
         * flow for non-contiguous data types.
         */

        /* Nasty hack to ensure that resources are released in the single-level
         * p2p case.
         */
        mca_coll_ml_collective_operation_progress_t dummy_coll;

        dummy_coll.coll_module = (mca_coll_base_module_t *) ml_module;
        dummy_coll.fragment_data.current_coll_op = ML_MEMSYNC;
        dummy_coll.full_message.bank_index_to_recycle = bank_index;

        /* Handling special case when memory syncronization is not required */
        rc = mca_coll_ml_memsync_recycle_memory(&dummy_coll);
        if(OPAL_UNLIKELY(rc != OMPI_SUCCESS)){
            ML_ERROR(("Failed to flush the list."));
            return rc;
        } 
    } else {
        /* retain the communicator until the operation is finished. the communicator
         * will be released by CHECK_AND_RECYCLE */
        OBJ_RETAIN(ml_module->comm);

        rc = mca_coll_ml_memsync_launch(ml_module, &req, bank_index);
        if (OPAL_UNLIKELY(rc != OMPI_SUCCESS)) {
            ML_ERROR(("Failed to launch a barrier."));
            return rc;
        }
    }

    return OMPI_SUCCESS;
}
Example #22
/* The function is very different from the above function */
int mca_coll_ml_check_if_bcol_is_requested(const char *component_name)
{
    mca_base_component_list_item_t *bcol_comp;
    bcol_comp = (mca_base_component_list_item_t *) opal_list_get_first(&mca_bcol_base_components_in_use);

    ML_VERBOSE(10, ("Loop over bcol components\n"));
    for ( bcol_comp  = (mca_base_component_list_item_t *) opal_list_get_first(&mca_bcol_base_components_in_use);
          bcol_comp != (mca_base_component_list_item_t *) opal_list_get_end(&mca_bcol_base_components_in_use);
          bcol_comp  = (mca_base_component_list_item_t *) opal_list_get_next(bcol_comp)) {
        if(0 == strcmp(component_name,
                    ((mca_bcol_base_component_2_0_0_t *)
                     bcol_comp->cli_component)->bcol_version.mca_component_name)) {
            return true;
        }
    }

    /* the component was not requested */
    return false;
}
Example #23
/* Constructor for list memory manager */
static void construct_lmngr(mca_coll_ml_lmngr_t *lmngr)
{
    mca_coll_ml_component_t *cm = &mca_coll_ml_component;

    ML_VERBOSE(7, ("Constructing new list manager %p", (void *)lmngr));

    /* No real memory is allocated here, only basic initialization.
    The real memory will be allocated on demand, on the first block allocation */

    /* We cache the block size, alignment and list size here,
    since in the future we may want to define different parameters
    per list */
    lmngr->list_block_size = cm->lmngr_block_size;
    lmngr->list_alignment = cm->lmngr_alignment;
    lmngr->list_size = cm->lmngr_size;
    lmngr->n_resources = 0;
    lmngr->base_addr = NULL; /* If the base addr is not NULL, the struct was initialized
                                and memory was allocated */
    /* Not sure that lock is required */
    OBJ_CONSTRUCT(&lmngr->mem_lock, opal_mutex_t);

    /* Only construct the list, no memory initialization */
    OBJ_CONSTRUCT(&lmngr->blocks_list, opal_list_t);
}
Example #24
static inline __opal_attribute_always_inline__
int mca_coll_ml_allgather_start (const void *sbuf, int scount,
                                 struct ompi_datatype_t *sdtype,
                                 void* rbuf, int rcount,
                                 struct ompi_datatype_t *rdtype,
                                 struct ompi_communicator_t *comm,
                                 mca_coll_base_module_t *module,
                                 ompi_request_t **req)
{
    size_t pack_len, sdt_size;
    int ret, n_fragments = 1, comm_size;

    mca_coll_ml_topology_t *topo_info;
    mca_bcol_base_payload_buffer_desc_t *src_buffer_desc;

    mca_coll_ml_component_t *cm = &mca_coll_ml_component;

    mca_coll_ml_collective_operation_progress_t *coll_op;
    mca_coll_ml_module_t *ml_module = (mca_coll_ml_module_t *) module;

    ptrdiff_t lb, extent;
    bool scontig, rcontig, in_place = false;

    /* check for in place setting */
    if (MPI_IN_PLACE == sbuf) {
        in_place = true;
        sdtype = rdtype;
        scount = rcount;
    }

    /* scontig could be != to rcontig */
    scontig = ompi_datatype_is_contiguous_memory_layout(sdtype, scount);
    rcontig = ompi_datatype_is_contiguous_memory_layout(rdtype, rcount);

    comm_size = ompi_comm_size(comm);

    ML_VERBOSE(10, ("Starting allgather"));

    assert(NULL != sdtype);
    /* Calculate size of the data,
     * at this stage, only contiguous data is supported */

    /* this is valid for allgather */
    ompi_datatype_type_size(sdtype, &sdt_size);
    pack_len = scount * sdt_size;

    if (in_place) {
        sbuf = (char *) rbuf + ompi_comm_rank(comm) * pack_len;
    }

    /* Allocate collective schedule and pack message */
    /* this is the total ending message size that will need to fit in the ml-buffer */
    if (pack_len <= (size_t) ml_module->small_message_thresholds[BCOL_ALLGATHER]) {
        /* The len of the message can not be larger than ML buffer size */
        ML_VERBOSE(10, ("Single frag %d %d %d", pack_len, comm_size, ml_module->payload_block->size_buffer));
        assert(pack_len * comm_size <= ml_module->payload_block->size_buffer);

        src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module);
        while (NULL == src_buffer_desc) {
            opal_progress();
            src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module);
        }

        /* change 1 */
        coll_op = mca_coll_ml_alloc_op_prog_single_frag_dag(ml_module,
                  ml_module->coll_ml_allgather_functions[ML_SMALL_DATA_ALLGATHER],
                  sbuf, rbuf, pack_len, 0 /* offset for first pack */);

        MCA_COLL_IBOFFLOAD_SET_ML_BUFFER_INFO(coll_op,
                                              src_buffer_desc->buffer_index, src_buffer_desc);

        coll_op->fragment_data.current_coll_op = ML_SMALL_DATA_ALLGATHER;
        /* task setup callback function */
        coll_op->sequential_routine.seq_task_setup = mca_coll_ml_allgather_task_setup;

        /* change 2 */
        if (!scontig) {
            coll_op->full_message.n_bytes_scheduled =
                mca_coll_ml_convertor_prepare(sdtype, scount, sbuf,
                                              &coll_op->full_message.send_convertor, MCA_COLL_ML_NET_STREAM_SEND);

            mca_coll_ml_convertor_pack(
                (void *) ((uintptr_t) src_buffer_desc->data_addr + pack_len *
                          (coll_op->coll_schedule->topo_info->hier_layout_info[0].offset +
                           coll_op->coll_schedule->topo_info->hier_layout_info[0].level_one_index)),
                pack_len, &coll_op->full_message.send_convertor);
        } else {
            /* change 3 */
            memcpy((void *)((uintptr_t) src_buffer_desc->data_addr + pack_len *
                            (coll_op->coll_schedule->topo_info->hier_layout_info[0].offset +
                             coll_op->coll_schedule->topo_info->hier_layout_info[0].level_one_index)),
                   sbuf, pack_len);

            coll_op->full_message.n_bytes_scheduled = pack_len;
        }

        if (!rcontig) {
            mca_coll_ml_convertor_prepare(rdtype, rcount * comm_size, rbuf,
                                          &coll_op->full_message.recv_convertor, MCA_COLL_ML_NET_STREAM_RECV);
        }

        if (coll_op->coll_schedule->topo_info->ranks_contiguous) {
            coll_op->process_fn = mca_coll_ml_allgather_small_unpack_data;
        } else {
            coll_op->process_fn = mca_coll_ml_allgather_noncontiguous_unpack_data;
        }

        /* whole ml-buffer is used to send AND receive */
        coll_op->variable_fn_params.sbuf = (void *) src_buffer_desc->data_addr;
        coll_op->variable_fn_params.rbuf = (void *) src_buffer_desc->data_addr;

        /* we can set the initial offset here */
        coll_op->variable_fn_params.sbuf_offset = 0;
        coll_op->variable_fn_params.rbuf_offset = 0;

        coll_op->variable_fn_params.count = scount;
        coll_op->fragment_data.fragment_size =
            coll_op->full_message.n_bytes_scheduled;

        /* For small CINCO, we may use the native datatype */
        coll_op->variable_fn_params.dtype = sdtype;
        coll_op->variable_fn_params.buffer_size = pack_len;
        coll_op->variable_fn_params.root = 0;
    } else if (cm->enable_fragmentation || pack_len * comm_size < (1 << 20)) {
        /* calculate the number of fragments and the size of each frag */
        size_t n_dts_per_frag, frag_len;
        int pipeline_depth = mca_coll_ml_component.pipeline_depth;

        /* Calculate the number of fragments required for this message; be careful with the integer division! */
        frag_len = (pack_len <= (size_t) ml_module->small_message_thresholds[BCOL_ALLGATHER] ?
                    pack_len : (size_t) ml_module->small_message_thresholds[BCOL_ALLGATHER]);

        n_dts_per_frag = frag_len / sdt_size;
        n_fragments = (pack_len + sdt_size * n_dts_per_frag - 1) / (sdt_size * n_dts_per_frag);
        pipeline_depth = (n_fragments < pipeline_depth ? n_fragments : pipeline_depth);

        src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module);
        while (NULL == src_buffer_desc) {
            opal_progress();
            src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module);
        }

        /* change 4 */
        coll_op = mca_coll_ml_alloc_op_prog_single_frag_dag(ml_module,
                  ml_module->coll_ml_allgather_functions[ML_SMALL_DATA_ALLGATHER],
                  sbuf, rbuf, pack_len,
                  0 /* offset for first pack */);

        MCA_COLL_IBOFFLOAD_SET_ML_BUFFER_INFO(coll_op,
                                              src_buffer_desc->buffer_index, src_buffer_desc);
        topo_info = coll_op->coll_schedule->topo_info;

        /* task setup callback function */
        coll_op->sequential_routine.seq_task_setup = mca_coll_ml_allgather_task_setup;

        if (!scontig) {
            coll_op->full_message.send_converter_bytes_packed =
                mca_coll_ml_convertor_prepare(
                    sdtype, scount, NULL,
                    &coll_op->full_message.dummy_convertor,
                    MCA_COLL_ML_NET_STREAM_SEND);

            coll_op->full_message.dummy_conv_position = 0;
            mca_coll_ml_convertor_get_send_frag_size(
                ml_module, &frag_len,
                &coll_op->full_message);

            /* change 5 */
            mca_coll_ml_convertor_prepare(sdtype, scount, sbuf,
                                          &coll_op->full_message.send_convertor, MCA_COLL_ML_NET_STREAM_SEND);

            mca_coll_ml_convertor_pack(
                (void *) ((uintptr_t) src_buffer_desc->data_addr + frag_len *
                          (topo_info->hier_layout_info[0].offset +
                           topo_info->hier_layout_info[0].level_one_index)),
                frag_len, &coll_op->full_message.send_convertor);
        } else {
            /* change 6 */
            memcpy((void *)((uintptr_t)src_buffer_desc->data_addr + frag_len *
                            (topo_info->hier_layout_info[0].offset +
                             topo_info->hier_layout_info[0].level_one_index)),
                   sbuf, frag_len);
        }

        if (!rcontig) {
            mca_coll_ml_convertor_prepare(rdtype, rcount * comm_size, rbuf,
                                          &coll_op->full_message.recv_convertor, MCA_COLL_ML_NET_STREAM_RECV);
        }

        coll_op->process_fn = mca_coll_ml_allgather_noncontiguous_unpack_data;

        /* Hopefully this doesn't royally screw things up; the idea is that the
         * whole ml-buffer is used to both send and receive
         */
        coll_op->variable_fn_params.sbuf = (void *) src_buffer_desc->data_addr;
        coll_op->variable_fn_params.rbuf = (void *) src_buffer_desc->data_addr;

        /* we can set the initial offset here */
        coll_op->variable_fn_params.sbuf_offset = 0;
        coll_op->variable_fn_params.rbuf_offset = 0;

        coll_op->fragment_data.buffer_desc = src_buffer_desc;

        coll_op->fragment_data.fragment_size = frag_len;
        coll_op->fragment_data.message_descriptor->n_active = 1;

        coll_op->full_message.n_bytes_scheduled = frag_len;
        coll_op->full_message.fragment_launcher = mca_coll_ml_allgather_frag_progress;

        coll_op->full_message.pipeline_depth = pipeline_depth;
        coll_op->fragment_data.current_coll_op = ML_SMALL_DATA_ALLGATHER;

        /* Remember this is different for fragments!! It caused data corruption
         * when not properly set; be sure the units are consistent.
         */
        coll_op->variable_fn_params.count = frag_len;
        coll_op->variable_fn_params.dtype = MPI_BYTE; /* for fragmented data, we work in
                                                       * units of bytes. This means that
                                                       * all of our arithmetic is done
                                                       * in terms of bytes
                                                       */

        coll_op->variable_fn_params.root = 0;
        coll_op->variable_fn_params.frag_size = frag_len;
        coll_op->variable_fn_params.buffer_size = frag_len;
    } else {
        /* change 7 */
        ML_VERBOSE(10, ("ML_ALLGATHER_LARGE_DATA_KNOWN case."));
        coll_op = mca_coll_ml_alloc_op_prog_single_frag_dag(ml_module,
                  ml_module->coll_ml_allgather_functions[ML_LARGE_DATA_ALLGATHER],
                  sbuf, rbuf, pack_len, 0 /* offset for first pack */);
        topo_info = coll_op->coll_schedule->topo_info;
        if (MCA_BCOL_BASE_NO_ML_BUFFER_FOR_LARGE_MSG & topo_info->all_bcols_mode) {
            MCA_COLL_IBOFFLOAD_SET_ML_BUFFER_INFO(coll_op, MCA_COLL_ML_NO_BUFFER, NULL);
        } else {
            src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module);
            while (NULL == src_buffer_desc) {
                opal_progress();
                src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module);
            }

            MCA_COLL_IBOFFLOAD_SET_ML_BUFFER_INFO(coll_op, src_buffer_desc->buffer_index, src_buffer_desc);
        }

        /* not sure if I really need this here */
        coll_op->sequential_routine.seq_task_setup = mca_coll_ml_allgather_task_setup;
        coll_op->process_fn = NULL;
        /* probably the most important piece */
        coll_op->variable_fn_params.sbuf = sbuf;
        coll_op->variable_fn_params.rbuf = rbuf;
        coll_op->variable_fn_params.sbuf_offset = 0;
        coll_op->variable_fn_params.rbuf_offset = 0;
        coll_op->variable_fn_params.count = scount;
        coll_op->variable_fn_params.dtype = sdtype;/* for zero copy, we want the
                                                    * native datatype and actual count
                                                    */
        coll_op->variable_fn_params.root = 0;

        /* you still need to copy in your own data into the rbuf */
        /* don't need to do this if you have in place data */
        if (!in_place) {
            memcpy((char *) rbuf + ompi_comm_rank(comm) * pack_len, sbuf, pack_len);
        }
    }

    coll_op->full_message.send_count = scount;
    coll_op->full_message.recv_count = rcount;

    coll_op->full_message.send_data_continguous = scontig;
    coll_op->full_message.recv_data_continguous = rcontig;

    ompi_datatype_get_extent(sdtype, &lb, &extent);
    coll_op->full_message.send_extent = (size_t) extent;

    ompi_datatype_get_extent(rdtype, &lb, &extent);
    coll_op->full_message.recv_extent = (size_t) extent;


    /* Fill in the function arguments */
    coll_op->variable_fn_params.sequence_num =
        OPAL_THREAD_ADD32(&(ml_module->collective_sequence_num), 1);
    coll_op->variable_fn_params.hier_factor = comm_size;

    MCA_COLL_ML_SET_ORDER_INFO(coll_op, n_fragments);


    ret = mca_coll_ml_launch_sequential_collective (coll_op);
    if (OMPI_SUCCESS != ret) {
        ML_VERBOSE(10, ("Failed to launch"));
        return ret;
    }

    *req = &coll_op->full_message.super;

    return OMPI_SUCCESS;
}
Example #25
File: coll_ml_reduce.c  Project: IanYXXL/A1
static int mca_coll_ml_reduce_frag_progress(mca_coll_ml_collective_operation_progress_t *coll_op)
{
    /* local variables */
    void *buf;

    size_t dt_size;
    int ret, frag_len, count;

    ptrdiff_t lb, extent;

    mca_bcol_base_payload_buffer_desc_t *src_buffer_desc;
    mca_coll_ml_collective_operation_progress_t *new_op;

    mca_coll_ml_module_t *ml_module = OP_ML_MODULE(coll_op);

    ret = ompi_datatype_get_extent(coll_op->variable_fn_params.dtype, &lb, &extent);
    if (ret < 0) {
        return OMPI_ERROR;
    }

    dt_size = (size_t) extent;

    /* Keep the pipeline filled with fragments */
    while (coll_op->fragment_data.message_descriptor->n_active <
            coll_op->fragment_data.message_descriptor->pipeline_depth) {
        /* If an active fragment happens to have completed the collective during
         * a hop into the progress engine, then don't launch a new fragment,
         * instead break and return.
         */
        if (coll_op->fragment_data.message_descriptor->n_bytes_scheduled
                == coll_op->fragment_data.message_descriptor->n_bytes_total) {
            break;
        }

        /* Get an ml buffer */
        src_buffer_desc = mca_coll_ml_alloc_buffer(OP_ML_MODULE(coll_op));
        if (NULL == src_buffer_desc) {
            /* If there exist outstanding fragments, then break out
             * and let an active fragment deal with this later,
             * there are no buffers available.
             */
            if (0 < coll_op->fragment_data.message_descriptor->n_active) {
                return OMPI_SUCCESS;
            } else {
                /* It is useless to call progress from here: ML progress can't
                 * be executed, so the ML memsync call will not complete and no
                 * memory will be recycled. Instead we put the element on the
                 * list and progress it later, when memsync recycles some memory */

                /* The fragment is already on the list and
                 * we still have no ML resources.
                 * Return busy */
                if (coll_op->pending & REQ_OUT_OF_MEMORY) {
                    ML_VERBOSE(10,("Out of resources %p", coll_op));
                    return OMPI_ERR_TEMP_OUT_OF_RESOURCE;
                }

                coll_op->pending |= REQ_OUT_OF_MEMORY;
                opal_list_append(&((OP_ML_MODULE(coll_op))->waiting_for_memory_list),
                                 (opal_list_item_t *)coll_op);
                ML_VERBOSE(10,("Out of resources %p adding to pending queue", coll_op));
                return OMPI_ERR_TEMP_OUT_OF_RESOURCE;
            }
        }

        /* Get a new collective descriptor and initialize it */
        new_op =  mca_coll_ml_alloc_op_prog_single_frag_dag(ml_module,
                  ml_module->coll_ml_reduce_functions[ML_SMALL_DATA_REDUCE],
                  coll_op->fragment_data.message_descriptor->src_user_addr,
                  coll_op->fragment_data.message_descriptor->dest_user_addr,
                  coll_op->fragment_data.message_descriptor->n_bytes_total,
                  coll_op->fragment_data.message_descriptor->n_bytes_scheduled);

        ML_VERBOSE(1,(" In Reduce fragment progress %d %d ",
                      coll_op->fragment_data.message_descriptor->n_bytes_total,
                      coll_op->fragment_data.message_descriptor->n_bytes_scheduled));
        MCA_COLL_IBOFFLOAD_SET_ML_BUFFER_INFO(new_op,
                                              src_buffer_desc->buffer_index, src_buffer_desc);

        new_op->fragment_data.current_coll_op = coll_op->fragment_data.current_coll_op;
        new_op->fragment_data.message_descriptor = coll_op->fragment_data.message_descriptor;

        /* set the task setup callback  */
        new_op->sequential_routine.seq_task_setup = mca_coll_ml_reduce_task_setup;
        /* We need this address for pointer arithmetic in memcpy */
        buf = (void*)coll_op->fragment_data.message_descriptor->src_user_addr;
        /* calculate the number of data types in this packet */
        count = (coll_op->fragment_data.message_descriptor->n_bytes_total -
                 coll_op->fragment_data.message_descriptor->n_bytes_scheduled <
                 ((size_t) OP_ML_MODULE(coll_op)->small_message_thresholds[BCOL_REDUCE]/4 )?
                 (coll_op->fragment_data.message_descriptor->n_bytes_total -
                  coll_op->fragment_data.message_descriptor->n_bytes_scheduled) / dt_size :
                 (size_t) coll_op->variable_fn_params.count);

        /* calculate the fragment length */
        frag_len = count * dt_size;

        ret = ompi_datatype_copy_content_same_ddt(coll_op->variable_fn_params.dtype, count,
                (char *) src_buffer_desc->data_addr, (char *) ((uintptr_t) buf + (uintptr_t)
                        coll_op->fragment_data.message_descriptor->n_bytes_scheduled));
        if (ret < 0) {
            return OMPI_ERROR;
        }

        /* if root unpack the data */
        if (ompi_comm_rank(ml_module->comm) == coll_op->global_root ) {
            new_op->process_fn = mca_coll_ml_reduce_unpack;
            new_op->variable_fn_params.root_flag = true;
        } else {
            new_op->process_fn = NULL;
            new_op->variable_fn_params.root_flag = false;
        }

        new_op->variable_fn_params.root_route = coll_op->variable_fn_params.root_route;

        /* Setup fragment specific data */
        new_op->fragment_data.message_descriptor->n_bytes_scheduled += frag_len;
        new_op->fragment_data.buffer_desc = src_buffer_desc;
        new_op->fragment_data.fragment_size = frag_len;
        (new_op->fragment_data.message_descriptor->n_active)++;

        /* Set in Reduce Buffer arguments */
        ML_SET_VARIABLE_PARAMS_BCAST(new_op, OP_ML_MODULE(new_op), count,
                                     coll_op->variable_fn_params.dtype, src_buffer_desc,
                                     0, (ml_module->payload_block->size_buffer -
                                         ml_module->data_offset)/2, frag_len,
                                     src_buffer_desc->data_addr);

        new_op->variable_fn_params.buffer_size = frag_len;
        new_op->variable_fn_params.sbuf = src_buffer_desc->data_addr;
        new_op->variable_fn_params.rbuf = src_buffer_desc->data_addr;
        new_op->variable_fn_params.root = coll_op->variable_fn_params.root;
        new_op->global_root = coll_op->global_root;
        new_op->variable_fn_params.op = coll_op->variable_fn_params.op;
        new_op->variable_fn_params.hier_factor = coll_op->variable_fn_params.hier_factor;
        new_op->sequential_routine.current_bcol_status = SEQ_TASK_PENDING;
        MCA_COLL_ML_SET_NEW_FRAG_ORDER_INFO(new_op);

        ML_VERBOSE(10,("FFFF Contig + fragmentation [0-sk, 1-lk, 3-su, 4-lu] %d %d %d\n",
                       new_op->variable_fn_params.buffer_size,
                       new_op->fragment_data.fragment_size,
                       new_op->fragment_data.message_descriptor->n_bytes_scheduled));
        /* initialize first coll */
        new_op->sequential_routine.seq_task_setup(new_op);

        /* append this collective !! */
        OPAL_THREAD_LOCK(&(mca_coll_ml_component.sequential_collectives_mutex));
        opal_list_append(&mca_coll_ml_component.sequential_collectives,
                         (opal_list_item_t *)new_op);
        OPAL_THREAD_UNLOCK(&(mca_coll_ml_component.sequential_collectives_mutex));

    }

    return OMPI_SUCCESS;
}
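The ternary that computes count above takes a full fragment's worth of elements unless the remaining bytes are smaller than a quarter of the small-message threshold (the fragment size this reduce path uses). A stand-alone sketch of that per-fragment arithmetic with made-up sizes:

#include <stddef.h>
#include <stdio.h>

/* Pick the element count for the next fragment: a short final fragment if fewer
 * bytes remain than the fragment size (threshold / 4), otherwise a full fragment. */
static size_t next_frag_count(size_t bytes_total, size_t bytes_scheduled,
                              size_t small_msg_threshold, size_t dt_size,
                              size_t full_frag_count)
{
    size_t remaining = bytes_total - bytes_scheduled;

    if (remaining < small_msg_threshold / 4) {
        return remaining / dt_size;     /* short last fragment */
    }
    return full_frag_count;             /* full-sized fragment */
}

int main(void)
{
    /* 1000 four-byte elements, 1024-byte threshold => 256-byte (64-element) fragments */
    size_t total = 4000, scheduled = 0, dt_size = 4;

    while (scheduled < total) {
        size_t count = next_frag_count(total, scheduled, 1024, dt_size, 64);
        scheduled += count * dt_size;
        printf("fragment of %zu elements, %zu/%zu bytes scheduled\n",
               count, scheduled, total);
    }
    return 0;
}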
static int mca_coll_ml_build_barrier_schedule(
                                    mca_coll_ml_topology_t *topo_info,
                                    mca_coll_ml_collective_operation_description_t
                                    **coll_desc,
                                    mca_coll_ml_module_t *ml_module)
{
    int i_hier, rc, i_fn, n_fcns, i,
        n_hiers = topo_info->n_levels;

    bool call_for_top_func;
    mca_bcol_base_module_t *bcol_module;

    mca_coll_ml_compound_functions_t *comp_fn;
    mca_coll_ml_collective_operation_description_t  *schedule;

    *coll_desc = (mca_coll_ml_collective_operation_description_t *)
                  malloc(sizeof(mca_coll_ml_collective_operation_description_t));

    schedule = *coll_desc;
    if (OPAL_UNLIKELY(NULL == schedule)) {
        ML_ERROR(("Can't allocate memory."));
        rc = OMPI_ERR_OUT_OF_RESOURCE;
        goto Barrier_Setup_Error;
    }

    if (topo_info->global_highest_hier_group_index ==
          topo_info->component_pairs[n_hiers - 1].bcol_index) {
        /* The process that is member of highest level subgroup
           should call for top algorithms in addition to fan-in/out steps */
        call_for_top_func = true;
        n_fcns = 2 * n_hiers - 1; /* Up + Top + Down */
    } else {
        /* The process is not member of highest level subgroup,
           as result it does not call for top algorithm,
           but it calls for all fan-in/out steps */
        call_for_top_func = false;
        n_fcns = 2 * n_hiers;
    }

    if( ml_module->max_fn_calls < n_fcns ) {
        ml_module->max_fn_calls = n_fcns;
    }

    /* Set dependencies equal to number of hierarchies */
    schedule->n_fns = n_fcns;
    schedule->topo_info = topo_info;

    /* Allocated the component function */
    schedule->component_functions = (struct mca_coll_ml_compound_functions_t *)
                                     calloc(n_fcns, sizeof(struct mca_coll_ml_compound_functions_t));

    if (OPAL_UNLIKELY(NULL == schedule->component_functions)) {
        ML_ERROR(("Can't allocate memory."));
        rc = OMPI_ERR_OUT_OF_RESOURCE;
        goto Barrier_Setup_Error;
    }
    for (i_fn = 0; i_fn < n_fcns; ++i_fn) {
        i_hier = (i_fn < n_hiers ? i_fn : n_fcns - i_fn - 1);
        comp_fn = &schedule->component_functions[i_fn];

        /* The hierarchical level */
        comp_fn->h_level = i_hier;
        bcol_module = GET_BCOL(topo_info, i_hier);

        /* The UP direction */
        if (1 + i_fn < n_hiers || (1 + i_fn == n_hiers && !call_for_top_func)) {
            comp_fn->bcol_function =
                bcol_module->filtered_fns_table[DATA_SRC_KNOWN][NON_BLOCKING][BCOL_FANIN][1][0][0];

            if (NULL == comp_fn->bcol_function) {
                ML_VERBOSE(10, ("no function available for BCOL_FANIN, NON_BLOCKING, DATA_SRC_KNOWN"));
                rc = OMPI_ERR_NOT_AVAILABLE;
                goto Barrier_Setup_Error;
            }

            /* Each function call with index K depends on all K-1 previous indices -
               in simple words, we do sequential Fan-In calls */
            comp_fn->num_dependencies = (0 == i_fn) ? 0 : 1;
            comp_fn->num_dependent_tasks = 1;
            /* Init component function */
            strcpy(comp_fn->fn_name, "FANIN");
            /* On the highest level */
        } else if ((1 + i_fn == n_hiers && call_for_top_func)) {
            comp_fn->bcol_function =
                bcol_module->filtered_fns_table[DATA_SRC_KNOWN][NON_BLOCKING][BCOL_BARRIER][1][0][0];

            if (NULL == comp_fn->bcol_function) {
                ML_VERBOSE(10, ("no function available for BCOL_BARRIER, NON_BLOCKING, DATA_SRC_KNOWN"));
                rc = OMPI_ERR_NOT_AVAILABLE;
                goto Barrier_Setup_Error;
            }

            /* Each function call with index K depends on all K-1 previous indices -
               in simple words, we do sequential calls */
            comp_fn->num_dependencies = (1 == n_hiers)    ? 0 : 1; /* All Fan-Ins */
            comp_fn->num_dependent_tasks = n_fcns - n_hiers;  /* All Fan-Outs */

            /* Init component function */
            strcpy(comp_fn->fn_name, "BARRIER");

            ML_VERBOSE(10, ("func indx %d set to BARRIER %p", i_fn, comp_fn->bcol_function));

        /* The DOWN direction */
        } else {
            comp_fn->bcol_function =
                bcol_module->filtered_fns_table[DATA_SRC_KNOWN][NON_BLOCKING][BCOL_FANOUT][1][0][0];

            if (NULL == comp_fn->bcol_function) {
                ML_VERBOSE(10, ("no function available for BCOL_FANOUT, NON_BLOCKING, DATA_SRC_KNOWN"));
                rc = OMPI_ERR_NOT_AVAILABLE;
                goto Barrier_Setup_Error;
            }

            /* Each function call with index K depends on all UP and TOP algorithms */
            comp_fn->num_dependencies = 1;
            comp_fn->num_dependent_tasks = call_for_top_func ? 0 :
                                           (i_fn + 1 == n_fcns ? 0 : 1);

            /* Init component function */
            strcpy(comp_fn->fn_name, "FANOUT");
        }

        ML_VERBOSE(10, ("func indx %d set to %p", i_fn, comp_fn->bcol_function));

        if (comp_fn->num_dependent_tasks > 0) {
            comp_fn->dependent_task_indices = (int *) calloc(comp_fn->num_dependent_tasks, sizeof(int));
            if (OPAL_UNLIKELY(NULL == comp_fn->dependent_task_indices)) {
                ML_ERROR(("Can't allocate memory."));
                rc = OMPI_ERR_OUT_OF_RESOURCE;
                goto Barrier_Setup_Error;
            }

            /* All indexes follow after this one */
            for (i = 0; i < comp_fn->num_dependent_tasks; ++i) {
                comp_fn->dependent_task_indices[i] = i_fn + i + 1;
            }
        } else {
            comp_fn->dependent_task_indices = NULL;
        }


        /* No need completion func for Barrier */
        comp_fn->task_comp_fn = NULL;

        ML_VERBOSE(10, ("Setting collective [Barrier] fn_idx %d, n_of_this_type_in_a_row %d, "
                        "index_in_consecutive_same_bcol_calls %d.",
                         i_fn, comp_fn->constant_group_data.n_of_this_type_in_a_row,
                         comp_fn->constant_group_data.index_in_consecutive_same_bcol_calls));
    }

    rc = ml_coll_barrier_constant_group_data_setup(topo_info, schedule);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
        ML_ERROR(("Failed to init const group data."));
        goto Barrier_Setup_Error;
    }

    schedule->progress_type = 0;

    return OMPI_SUCCESS;

Barrier_Setup_Error:
    if (NULL != schedule->component_functions) {
        free(schedule->component_functions);
        schedule->component_functions = NULL;
    }

    return rc;
}
Example #27
0
File: coll_ml_reduce.c Project: IanYXXL/A1
static inline __opal_attribute_always_inline__
int parallel_reduce_start (void *sbuf, void *rbuf, int count,
                           struct ompi_datatype_t *dtype, struct ompi_op_t *op,
                           int root,
                           struct ompi_communicator_t *comm,
                           mca_coll_ml_module_t *ml_module,
                           ompi_request_t **req,
                           int small_data_reduce,
                           int large_data_reduce) {
    ptrdiff_t lb, extent;
    size_t pack_len, dt_size;
    mca_bcol_base_payload_buffer_desc_t *src_buffer_desc = NULL;
    mca_coll_ml_collective_operation_progress_t * coll_op = NULL;
    bool contiguous = ompi_datatype_is_contiguous_memory_layout(dtype, count);
    mca_coll_ml_component_t *cm = &mca_coll_ml_component;
    int ret, n_fragments = 1, frag_len,
             pipeline_depth, n_dts_per_frag, rank;

    if (MPI_IN_PLACE == sbuf) {
        sbuf = rbuf;
    }

    ret = ompi_datatype_get_extent(dtype, &lb, &extent);
    if (ret < 0) {
        return OMPI_ERROR;
    }

    rank = ompi_comm_rank (comm);

    dt_size = (size_t) extent;
    pack_len = count * dt_size;

    /* We use separate receive and send buffers, so only half of the buffer is usable. */
    if (pack_len < (size_t) ml_module->small_message_thresholds[BCOL_REDUCE] / 4) {
        /* The message length cannot be larger than the ML buffer size */
        assert(pack_len <= ml_module->payload_block->size_buffer);

        src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module);

        ML_VERBOSE(10,("Using small data reduce (threshold = %d)",
                       REDUCE_SMALL_MESSAGE_THRESHOLD));
        while (NULL == src_buffer_desc) {
            opal_progress();
            src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module);
        }

        coll_op = mca_coll_ml_alloc_op_prog_single_frag_dag(ml_module,
                  ml_module->coll_ml_reduce_functions[small_data_reduce],
                  sbuf, rbuf, pack_len, 0);

        MCA_COLL_IBOFFLOAD_SET_ML_BUFFER_INFO(coll_op,
                                              src_buffer_desc->buffer_index, src_buffer_desc);

        coll_op->variable_fn_params.rbuf = src_buffer_desc->data_addr;
        coll_op->variable_fn_params.sbuf = src_buffer_desc->data_addr;
        coll_op->variable_fn_params.buffer_index = src_buffer_desc->buffer_index;
        coll_op->variable_fn_params.src_desc = src_buffer_desc;
        coll_op->variable_fn_params.count = count;
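        /* The copy below stages the caller's send data into the ML payload buffer;
         * on the root, mca_coll_ml_reduce_unpack() (set later as process_fn) copies
         * the reduced result back out to the user's receive buffer. */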

        ret = ompi_datatype_copy_content_same_ddt(dtype, count,
                (void *) (uintptr_t) src_buffer_desc->data_addr, (char *) sbuf);
        if (ret < 0) {
            return OMPI_ERROR;
        }

    } else if (cm->enable_fragmentation || !contiguous) {
        ML_VERBOSE(1,("Using Fragmented Reduce "));

        /* fragment the data */
        /* guard against datatypes too large to fit into a single fragment */
        if (dt_size > (size_t) ml_module->small_message_thresholds[BCOL_REDUCE] / 4) {
            ML_ERROR(("Sorry, but we don't support datatypes that large"));
            return OMPI_ERROR;
        }

        /* calculate the number of data types that can fit per ml-buffer */
        n_dts_per_frag = ml_module->small_message_thresholds[BCOL_REDUCE] / (4 * dt_size);

        /* calculate the number of fragments */
        n_fragments = (count + n_dts_per_frag - 1) / n_dts_per_frag; /* round up */

        /* calculate the actual pipeline depth */
        pipeline_depth = n_fragments < cm->pipeline_depth ? n_fragments : cm->pipeline_depth;

        /* calculate the fragment size */
        frag_len = n_dts_per_frag * dt_size;
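        /* Worked example with hypothetical numbers: if small_message_thresholds[BCOL_REDUCE]
         * is 4096 bytes and dt_size is 8, then n_dts_per_frag = 4096 / (4 * 8) = 128;
         * a count of 1000 gives n_fragments = ceil(1000 / 128) = 8 and
         * frag_len = 128 * 8 = 1024 bytes per fragment. */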

        /* allocate an ml buffer */
        src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module);
        while (NULL == src_buffer_desc) {
            opal_progress();
            src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module);
        }

        coll_op = mca_coll_ml_alloc_op_prog_single_frag_dag(ml_module,
                  ml_module->coll_ml_reduce_functions[small_data_reduce],
                  sbuf,rbuf,
                  pack_len,
                  0 /* offset for first pack */);

        MCA_COLL_IBOFFLOAD_SET_ML_BUFFER_INFO(coll_op,
                                              src_buffer_desc->buffer_index, src_buffer_desc);


        coll_op->variable_fn_params.sbuf = (void *) src_buffer_desc->data_addr;
        coll_op->variable_fn_params.rbuf = (void *) src_buffer_desc->data_addr;

        coll_op->fragment_data.message_descriptor->n_active = 1;
        coll_op->full_message.n_bytes_scheduled = frag_len;
        coll_op->full_message.fragment_launcher = mca_coll_ml_reduce_frag_progress;
        coll_op->full_message.pipeline_depth = pipeline_depth;
        coll_op->fragment_data.current_coll_op = small_data_reduce;
        coll_op->fragment_data.fragment_size = frag_len;

        coll_op->variable_fn_params.count = n_dts_per_frag;  /* seems fishy */
        coll_op->variable_fn_params.buffer_size = frag_len;
        coll_op->variable_fn_params.src_desc = src_buffer_desc;
        /* copy into the ml-buffer */
        ret = ompi_datatype_copy_content_same_ddt(dtype, n_dts_per_frag,
                (char *) src_buffer_desc->data_addr, (char *) sbuf);
        if (ret < 0) {
            return OMPI_ERROR;
        }
    } else {
        ML_VERBOSE(1,("Using zero-copy ptp reduce"));
        coll_op = mca_coll_ml_alloc_op_prog_single_frag_dag(ml_module,
                  ml_module->coll_ml_reduce_functions[large_data_reduce],
                  sbuf, rbuf, pack_len, 0);

        coll_op->variable_fn_params.userbuf =
            coll_op->variable_fn_params.sbuf = sbuf;

        coll_op->variable_fn_params.rbuf = rbuf;

        /* The ML buffer is used for testing. Later, when we
         * switch to use knem/mmap/portals this should be replaced
         * appropriately
         */
        src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module);
        while (NULL == src_buffer_desc) {
            opal_progress();
            src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module);
        }

        coll_op->variable_fn_params.buffer_index = src_buffer_desc->buffer_index;
        coll_op->variable_fn_params.src_desc = src_buffer_desc;
        coll_op->variable_fn_params.count = count;
    }

    coll_op->process_fn = (rank != root) ? NULL : mca_coll_ml_reduce_unpack;

    /* Set common parts */
    coll_op->fragment_data.buffer_desc = src_buffer_desc;
    coll_op->variable_fn_params.dtype = dtype;
    coll_op->variable_fn_params.op = op;

    /* NTH: the root, root route, and root flag are set in the task setup */

    /* Fill in the function arguments */
    coll_op->variable_fn_params.sbuf_offset = 0;
    coll_op->variable_fn_params.rbuf_offset = (ml_module->payload_block->size_buffer -
            ml_module->data_offset)/2;
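    /* The send data sits at the start of the payload area; the receive region
     * begins halfway through the usable payload (size_buffer minus data_offset),
     * matching the send/receive split of the ML buffer noted above. */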

    /* Keep track of the global root of this operation */
    coll_op->global_root = root;

    coll_op->variable_fn_params.sequence_num =
        OPAL_THREAD_ADD32(&(ml_module->collective_sequence_num), 1);
    coll_op->sequential_routine.current_active_bcol_fn = 0;
    /* set the task setup callback  */
    coll_op->sequential_routine.seq_task_setup = mca_coll_ml_reduce_task_setup;

    /* Reduce requires a fixed schedule; with a different (changing) schedule the
       operation might produce a different result. The schedule variant is selected
       by the hierarchy level on the route to the root. */
    coll_op->coll_schedule->component_functions = coll_op->coll_schedule->
            comp_fn_arr[coll_op->coll_schedule->topo_info->route_vector[root].level];

    /* Launch the collective */
    ret = mca_coll_ml_launch_sequential_collective (coll_op);
    if (OMPI_SUCCESS != ret) {
        ML_VERBOSE(10, ("Failed to launch reduce collective"));
        return ret;
    }

    *req = &coll_op->full_message.super;

    return OMPI_SUCCESS;
}
Example #28
0
static int mca_coll_ml_allgather_frag_progress(mca_coll_ml_collective_operation_progress_t *coll_op)
{
    /* local variables */
    int ret;
    size_t frag_len, dt_size;

    const void *buf;
    mca_bcol_base_payload_buffer_desc_t *src_buffer_desc;
    mca_coll_ml_collective_operation_progress_t *new_op;

    mca_coll_ml_module_t *ml_module = OP_ML_MODULE(coll_op);
    bool scontig = coll_op->fragment_data.message_descriptor->send_data_continguous;

    ompi_datatype_type_size(coll_op->variable_fn_params.dtype, &dt_size);
    /* Keep the pipeline filled with fragments */
    while (coll_op->fragment_data.message_descriptor->n_active <
            coll_op->fragment_data.message_descriptor->pipeline_depth) {
        /* If an active fragment happens to have completed the collective during
         * a hop into the progress engine, then don't launch a new fragment,
         * instead break and return.
         */
        if (coll_op->fragment_data.message_descriptor->n_bytes_scheduled
                == coll_op->fragment_data.message_descriptor->n_bytes_total) {
            break;
        }
        /* Get an ml buffer */
        src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module);
        if (NULL == src_buffer_desc) {
            /* There are no buffers available. If there are outstanding
             * fragments, break out and let an active fragment deal
             * with this later.
             */
            if (0 < coll_op->fragment_data.message_descriptor->n_active) {
                return OMPI_SUCCESS;
            } else {
                /* The fragment is already on the list and we still
                 * have no ML resources.
                 * Return busy */
                if (coll_op->pending & REQ_OUT_OF_MEMORY) {
                    ML_VERBOSE(10,("Out of resources %p", coll_op));
                    return OMPI_ERR_TEMP_OUT_OF_RESOURCE;
                }

                coll_op->pending |= REQ_OUT_OF_MEMORY;
                opal_list_append(&((OP_ML_MODULE(coll_op))->waiting_for_memory_list),
                                 (opal_list_item_t *)coll_op);
                ML_VERBOSE(10,("Out of resources %p adding to pending queue", coll_op));
                return OMPI_ERR_TEMP_OUT_OF_RESOURCE;
            }
        }

        /* Get a new collective descriptor and initialize it */
        new_op =  mca_coll_ml_alloc_op_prog_single_frag_dag(ml_module,
                  ml_module->coll_ml_allgather_functions[ML_SMALL_DATA_ALLGATHER],
                  coll_op->fragment_data.message_descriptor->src_user_addr,
                  coll_op->fragment_data.message_descriptor->dest_user_addr,
                  coll_op->fragment_data.message_descriptor->n_bytes_total,
                  coll_op->fragment_data.message_descriptor->n_bytes_scheduled);

        new_op->fragment_data.current_coll_op = coll_op->fragment_data.current_coll_op;
        new_op->fragment_data.message_descriptor = coll_op->fragment_data.message_descriptor;

        /* set the task setup callback  */
        new_op->sequential_routine.seq_task_setup = mca_coll_ml_allgather_task_setup;

        /*
        MCA_COLL_IBOFFLOAD_SET_ML_BUFFER_INFO(new_op,
                src_buffer_desc->buffer_index, src_buffer_desc);
        */

        /* We need this address for pointer arithmetic in memcpy */
        buf = coll_op->fragment_data.message_descriptor->src_user_addr;

        if (!scontig) {
            frag_len = ml_module->small_message_thresholds[BCOL_ALLGATHER];
            mca_coll_ml_convertor_get_send_frag_size(
                ml_module, &frag_len,
                coll_op->fragment_data.message_descriptor);

            mca_coll_ml_convertor_pack(
                (void *) ((uintptr_t) src_buffer_desc->data_addr +
                          frag_len * coll_op->coll_schedule->topo_info->hier_layout_info[0].offset +
                          frag_len * coll_op->coll_schedule->topo_info->hier_layout_info[0].level_one_index),
                frag_len, &coll_op->fragment_data.message_descriptor->send_convertor);
        } else {
            /* calculate the new fragment length: the remaining bytes or the
               configured fragment size, whichever is smaller */
            frag_len = (coll_op->fragment_data.message_descriptor->n_bytes_total -
                        coll_op->fragment_data.message_descriptor->n_bytes_scheduled <
                        coll_op->fragment_data.fragment_size ?
                        coll_op->fragment_data.message_descriptor->n_bytes_total -
                        coll_op->fragment_data.message_descriptor->n_bytes_scheduled :
                        coll_op->fragment_data.fragment_size);

            /* everybody copies in, based on the new values */
            memcpy((void *) ((uintptr_t)src_buffer_desc->data_addr +
                             frag_len * new_op->coll_schedule->topo_info->hier_layout_info[0].offset +
                             frag_len * new_op->coll_schedule->topo_info->hier_layout_info[0].level_one_index),
                   (void *) ((uintptr_t) buf + (uintptr_t)
                             coll_op->fragment_data.message_descriptor->n_bytes_scheduled), frag_len);
        }

        new_op->variable_fn_params.sbuf = (void *) src_buffer_desc->data_addr;
        new_op->variable_fn_params.rbuf = (void *) src_buffer_desc->data_addr;

        /* update the number of bytes scheduled */
        new_op->fragment_data.message_descriptor->n_bytes_scheduled += frag_len;
        /* everyone needs an unpack function */
        new_op->process_fn = mca_coll_ml_allgather_noncontiguous_unpack_data;

        new_op->fragment_data.fragment_size = frag_len;
        new_op->fragment_data.buffer_desc = src_buffer_desc;

        /* Setup fragment specific data */
        ++(new_op->fragment_data.message_descriptor->n_active);

        ML_VERBOSE(10, ("Start more, My index %d ",
                        new_op->fragment_data.buffer_desc->buffer_index));

        /* this is a bit buggy */
        ML_SET_VARIABLE_PARAMS_BCAST(
            new_op,
            OP_ML_MODULE(new_op),
            frag_len /* yes, we have consistent units, so this makes sense */,
            MPI_BYTE /* we fragment according to buffer size;
                          * we don't reduce the data, so we needn't
                          * keep "whole" datatypes and may freely
                          * fragment without regard for multiples
                          * of any specific datatype
                          */,
            src_buffer_desc,
            0,
            0,
            frag_len,
            src_buffer_desc->data_addr);
        /* initialize first coll */
        ret = new_op->sequential_routine.seq_task_setup(new_op);
        if (OMPI_SUCCESS != ret) {
            ML_VERBOSE(3, ("Fragment failed to initialize itself"));
            return ret;
        }

        new_op->variable_fn_params.buffer_size = frag_len;
        new_op->variable_fn_params.hier_factor = coll_op->variable_fn_params.hier_factor;
        new_op->variable_fn_params.root = 0;

        MCA_COLL_ML_SET_NEW_FRAG_ORDER_INFO(new_op);

        /* append this collective !! */
        OPAL_THREAD_LOCK(&(mca_coll_ml_component.sequential_collectives_mutex));
        opal_list_append(&mca_coll_ml_component.sequential_collectives,
                         (opal_list_item_t *)new_op);
        OPAL_THREAD_UNLOCK(&(mca_coll_ml_component.sequential_collectives_mutex));
    }

    return OMPI_SUCCESS;
}
/*
 * Fill up the collective descriptor
 *
 */
static int mca_coll_ml_build_static_reduce_schedule(
                                    mca_coll_ml_topology_t *topo_info,
                                    mca_coll_ml_collective_operation_description_t **coll_desc)
{
    int i_hier, j_hier,  n_fcns,
        n_hiers = topo_info->n_levels;
    int *scratch_indx = NULL,
        *scratch_num = NULL;
    int cnt, value_to_set = 0;
    int ret = OMPI_SUCCESS;
    bool prev_is_zero;
    mca_coll_ml_compound_functions_t *comp_fns_temp;
    mca_bcol_base_module_t *prev_bcol,
                           *bcol_module;
    mca_coll_ml_compound_functions_t *comp_fn;
    mca_coll_ml_collective_operation_description_t  *schedule = NULL;

    *coll_desc = (mca_coll_ml_collective_operation_description_t *)
                  malloc(sizeof(mca_coll_ml_collective_operation_description_t));

    schedule = *coll_desc;
    if (OPAL_UNLIKELY(NULL == schedule)) {
        ML_ERROR(("Can't allocate memory.\n"));
        ret = OMPI_ERR_OUT_OF_RESOURCE;
        goto Error;
    }

    scratch_indx = (int *) malloc(sizeof(int) * (n_hiers));
    if (NULL == scratch_indx) {
        ML_ERROR(("Can't allocate memory.\n"));
        ret = OMPI_ERR_OUT_OF_RESOURCE;
        goto Error;
    }

    scratch_num = (int *) malloc(sizeof(int) * (n_hiers));
    if (NULL == scratch_num) {
        ML_ERROR(("Can't allocate memory.\n"));
        ret = OMPI_ERR_OUT_OF_RESOURCE;
        goto Error;
    }

    prev_bcol = NULL;

    /* Calculate scratch numbers */
    for (i_hier = 0; i_hier < n_hiers; i_hier++) {
        if (IS_BCOL_TYPE_IDENTICAL(prev_bcol, GET_BCOL(topo_info, i_hier))) {
            scratch_indx[i_hier] = scratch_indx[i_hier - 1] + 1;
        } else {
            scratch_indx[i_hier] = 0;
            prev_bcol = GET_BCOL(topo_info, i_hier);
        }
    }

    --i_hier;
    prev_is_zero = true;

    do {
        if (prev_is_zero) {
            value_to_set = scratch_indx[i_hier] + 1;
            prev_is_zero = false;
        }

        if (0 == scratch_indx[i_hier]) {
            prev_is_zero = true;
        }

        scratch_num[i_hier] = value_to_set;
        --i_hier;
    } while(i_hier >= 0);
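    /* Example: scratch_indx = {0, 1, 2, 0, 1} (two runs of identical bcols)
       yields scratch_num = {3, 3, 3, 2, 2} - each entry holds the length of
       the run it belongs to. */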

    /* All hierarchies call one function, unlike other collectives */
    n_fcns = n_hiers;

    /* The number of function calls equals the number of hierarchies */
    schedule->n_fns = n_fcns;
    schedule->topo_info = topo_info;
    schedule->progress_type = 0;
    /* Allocate the component functions */
    schedule->component_functions = (struct mca_coll_ml_compound_functions_t *)
                                     calloc(n_fcns, sizeof(struct mca_coll_ml_compound_functions_t));

    if (OPAL_UNLIKELY(NULL == schedule->component_functions)) {
        ML_ERROR(("Can't allocate memory.\n"));
        ret = OMPI_ERR_OUT_OF_RESOURCE;
        goto Error;
    }


    for (i_hier = 0; i_hier < n_hiers; ++i_hier) {
        comp_fn = &schedule->component_functions[i_hier];

        /* The hierarchical level */
        comp_fn->h_level = i_hier;
        bcol_module = GET_BCOL(topo_info, i_hier);

        comp_fn->bcol_function =
                bcol_module->filtered_fns_table[DATA_SRC_KNOWN][NON_BLOCKING][BCOL_REDUCE][1][0][0];

        strcpy(comp_fn->fn_name, "REDUCE");
        ML_VERBOSE(10, ("func indx %d set to %p", i_hier, comp_fn->bcol_function));


        ML_VERBOSE(1,("In ML_REDUCE_SETUP  .. looks fine here"));
        /* Task completion function for the static reduce */
        comp_fn->task_comp_fn = mca_coll_ml_task_comp_static_reduce;

        /* Constants */
        comp_fn->constant_group_data.bcol_module = bcol_module;
        comp_fn->constant_group_data.index_in_consecutive_same_bcol_calls = scratch_indx[i_hier];
        comp_fn->constant_group_data.n_of_this_type_in_a_row = scratch_num[i_hier];
        comp_fn->constant_group_data.n_of_this_type_in_collective = 0;
        comp_fn->constant_group_data.index_of_this_type_in_collective = 0;

        ML_VERBOSE(10, ("Setting collective [reduce] fn_idx %d, n_of_this_type_in_a_row %d, "
                        "index_in_consecutive_same_bcol_calls %d.",
                         i_hier, comp_fn->constant_group_data.n_of_this_type_in_a_row,
                         comp_fn->constant_group_data.index_in_consecutive_same_bcol_calls));
    }


    /* Fill the rest of constant data */
    for (i_hier = 0; i_hier < n_hiers; i_hier++) {
        mca_bcol_base_module_t *current_bcol =
            schedule->component_functions[i_hier].
            constant_group_data.bcol_module;
        cnt = 0;
        for (j_hier = 0; j_hier < n_hiers; j_hier++) {
            if (current_bcol ==
                    schedule->component_functions[j_hier].
                    constant_group_data.bcol_module) {
                schedule->component_functions[j_hier].
                    constant_group_data.index_of_this_type_in_collective = cnt;
                cnt++;
            }
        }
        schedule->component_functions[i_hier].
            constant_group_data.n_of_this_type_in_collective = cnt;
    }
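    /* For example, if levels 0 and 1 use the same bcol module, they receive
       index_of_this_type_in_collective 0 and 1 respectively, and both end up
       with n_of_this_type_in_collective == 2. */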

    /* Manju: Reduction should always use the fixed schedule.
     * The subgroups in which this process is a leader should be executed first,
     * then the subgroups where it is not a leader, and finally the subgroup
     * that includes the root.
     */

    /* Allocate the schedule list */
    schedule->comp_fn_arr = (struct mca_coll_ml_compound_functions_t **)
        calloc(n_hiers,sizeof(struct mca_coll_ml_compound_functions_t *));
    if (NULL == schedule->comp_fn_arr) {
        ML_ERROR(("Can't allocate memory.\n"));
        ret = OMPI_ERR_OUT_OF_RESOURCE;
        goto Error;
    }

    /* Now that the functions have been set up properly, we can simply permute the ordering a bit */
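    /* For each candidate root level i_hier, build a permuted call list: following the
       note above, subgroups led by this process go to the front, the remaining
       subgroups are filled in from the back, and level i_hier itself (the level on
       the route to the root) is executed last. */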

    for (i_hier = 0; i_hier < n_hiers; i_hier++) {
        /* first one is trivial */
        int leader_hierarchy = 0;
        int non_leader_hierarchy = 0;
        int func_index;

        comp_fns_temp = (struct mca_coll_ml_compound_functions_t *)
            calloc(n_hiers, sizeof(struct mca_coll_ml_compound_functions_t));

        leader_hierarchy = 0;
        non_leader_hierarchy = n_hiers - 2;

        for(j_hier = 0; j_hier < n_hiers - 1 ; j_hier++) {

            func_index = j_hier < i_hier ? j_hier : j_hier + 1;
            /* I'm a leader for this group */
            if (0 == topo_info->component_pairs->subgroup_module->my_index) {
                comp_fns_temp[leader_hierarchy++] =
                    schedule->component_functions[func_index];
            } else {
                comp_fns_temp[non_leader_hierarchy--] =
                    schedule->component_functions[func_index];
            }
        }

        comp_fns_temp[j_hier] = schedule->component_functions[i_hier];
        /* now let's attach this list to our array of lists */
        schedule->comp_fn_arr[i_hier] = comp_fns_temp;
    }

    /* Manju: Do we need this ? */

    /* I'm going to just loop over each schedule and
     * set up the scratch indices, scratch numbers
     * and other constant data
     */
    /*
    for( i_hier = 1; i_hier < n_hiers; i_hier++) {
        ret = mca_coll_ml_setup_scratch_vals(schedule->comp_fn_arr[i_hier], scratch_indx,
                scratch_num, n_hiers);
        if( OMPI_SUCCESS != ret ) {
            ret = OMPI_ERROR;
            goto Error;
        }

    }
    */

    /* Do I need this ? */
    schedule->task_setup_fn[COLL_ML_ROOT_TASK_FN] = mca_coll_ml_static_reduce_root;
    schedule->task_setup_fn[COLL_ML_GENERAL_TASK_FN] = mca_coll_ml_static_reduce_non_root;

    MCA_COLL_ML_SET_SCHEDULE_ORDER_INFO(schedule);

    free(scratch_num);
    free(scratch_indx);

    return OMPI_SUCCESS;

Error:
    /* free(NULL) is a no-op, so the scratch arrays need no NULL checks here */
    free(scratch_num);
    free(scratch_indx);

    if (NULL != schedule && NULL != schedule->component_functions) {
        free(schedule->component_functions);
        schedule->component_functions = NULL;
    }

    return ret;
}
int ml_coll_up_and_down_hier_setup(mca_coll_ml_module_t *ml_module,
                                   mca_coll_ml_topology_t *topo_info,
                                   int up_function_idx,
                                   int top_function_idx,
                                   int down_function_idx,
                                   int collective)
{
    /* local variables */
    int i, j, cnt, value_to_set = -1;
    int ret = OMPI_SUCCESS, num_up_levels;

    int num_hierarchies = topo_info->n_levels;
    int global_high_hierarchy_index = topo_info->global_highest_hier_group_index;

    bool call_for_top_function, prev_is_zero;

    int *scratch_indx = NULL, *scratch_num = NULL;

    coll_ml_collective_description_t *collective_alg = NULL;
    mca_bcol_base_module_t *bcol_module = NULL,
                           *prev_bcol = NULL;

    /* RLG:  one blocking barrier collective algorithm - this is really a hack,
     * we need to figure out how to do this in a bit more extensible
     * manner.
     */
    collective_alg = (coll_ml_collective_description_t *)
        malloc(sizeof(coll_ml_collective_description_t));
    if (NULL == collective_alg) {
        ML_ERROR(("Can't allocate memory."));
        ret = OMPI_ERR_OUT_OF_RESOURCE;
        goto Error;
    }

    /* am I a member of the highest level subgroup ? */
    if (global_high_hierarchy_index ==
          topo_info->component_pairs[num_hierarchies - 1].bcol_index) {
        /* A process that is a member of the highest-level subgroup
           calls the top algorithm in addition to the fan-in/out steps */
        call_for_top_function = true;
        /* the highest level runs only the top algorithm, so we deduct 1 from the number of up levels */
        num_up_levels = num_hierarchies - 1;
        /* the top algorithm is called only once, so we deduct 1 from the total function count */
        collective_alg->n_functions = 2 * num_hierarchies - 1;
    } else {
        /* The process is not a member of the highest-level subgroup;
           as a result it does not call the top algorithm,
           but it does call all of the fan-in/out steps */
        call_for_top_function = false;
        num_up_levels = num_hierarchies;
        collective_alg->n_functions = 2 * num_hierarchies;
    }
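    /* Example: with num_hierarchies == 3, a member of the highest-level subgroup
       executes 2 up + 1 top + 2 down = 5 functions, while a non-member executes
       3 up + 3 down = 6 functions. */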

    ML_VERBOSE(10, ("high_index %d == bcol_index %d: Call top %d, num_up_levels %d, collective_alg->n_functions %d",
                global_high_hierarchy_index,
                topo_info->component_pairs[num_hierarchies - 1].bcol_index,
                call_for_top_function,
                num_up_levels,
                collective_alg->n_functions ));

    /* allocate space for the functions */
    collective_alg->functions = (mca_bcol_base_function_t *)
        calloc(collective_alg->n_functions, sizeof(mca_bcol_base_function_t));
    if( NULL == collective_alg->functions) {
        ML_ERROR(("Can't allocate memory."));
        ret = OMPI_ERR_OUT_OF_RESOURCE;
        goto Error;
    }

    /* Algorithm Description:
     * =====================
     * The algorithm used here for an N-level system is:
     *  - levels 0 to N-2, inclusive: up algorithm (fan-in for barrier, reduce for allreduce)
     *  - level N-1: top algorithm (barrier or allreduce)
     *  - levels N-2 down to 0: down algorithm (fan-out)
     */


    /* Starting scratch_num and scratch_index calculations */
    /* =================================================== */

    /* Figure out how many identical bcols are called in a row.
     * The index of each bcol within its run is stored in scratch_indx and
     * the total length of that run in scratch_num */
    scratch_indx = (int *) calloc (2 * num_hierarchies, sizeof (int));
    if(NULL == scratch_indx) {
        ML_ERROR(("Can't allocate memory."));
        ret = OMPI_ERR_OUT_OF_RESOURCE;
        goto Error;
    }

    scratch_num = (int *) malloc(sizeof(int) * (2 * num_hierarchies));
    if(NULL == scratch_num) {
        ML_ERROR(("Can't allocate memory."));
        ret = OMPI_ERR_OUT_OF_RESOURCE;
        goto Error;
    }

    /* We go through all stages of the algorithm (up, top, down)
     * and calculate the bcol index. If the previous bcol is of the same type as the
     * current one the counter is incremented, otherwise the index is reset to zero */
    prev_bcol = NULL;
    /* going up */
    for (i = 0, cnt = 0; i < num_up_levels; ++i, ++cnt) {
        if (IS_BCOL_TYPE_IDENTICAL(prev_bcol, GET_BCOL(topo_info, i))) {
            scratch_indx[cnt] = scratch_indx[cnt - 1] + 1;
        } else {
            scratch_indx[cnt] = 0;
            prev_bcol = GET_BCOL(topo_info, i);
        }
    }

    /* top - only if the process is a member of the globally highest-level subgroup */
    if (call_for_top_function) {
        if (IS_BCOL_TYPE_IDENTICAL(prev_bcol, GET_BCOL(topo_info, num_hierarchies - 1))) {
            scratch_indx[cnt] = scratch_indx[cnt - 1] + 1;
        } else {
            scratch_indx[cnt] = 0;
            prev_bcol = GET_BCOL(topo_info, num_hierarchies - 1);
        }

        ++cnt;
    }

    /* going down */
    for (i = num_up_levels - 1; i >= 0; --i, ++cnt) {
        if (IS_BCOL_TYPE_IDENTICAL(prev_bcol, GET_BCOL(topo_info, i))) {
            scratch_indx[cnt] = scratch_indx[cnt - 1] + 1;
        } else {
            scratch_indx[cnt] = 0;
            prev_bcol = GET_BCOL(topo_info, i);
        }
    }

    /*
     * Calculate the number of identical bcols in each run.
     * We walk the index array backwards; an index of zero
     * means the run is complete and we move on to the next one.
     * The run length is equal to the maximal bcol index in the run plus one.
     */
    i = cnt - 1;
    prev_is_zero = true;
    do {
        if (prev_is_zero) {
            value_to_set = scratch_indx[i] + 1;
            prev_is_zero = false;
        }

        if (0 == scratch_indx[i]) {
            prev_is_zero = true;
        }

        scratch_num[i] = value_to_set;
        --i;
    } while(i >= 0);
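    /* Example: an up/top/down pattern with scratch_indx = {0, 0, 1, 0, 0}
       produces scratch_num = {1, 2, 2, 1, 1}. */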

    /* =========================================================== */
    /* We are done with scratch_num and scratch_index calculations */

    /* Setup function call for each algorithm step */
    cnt = 0;
    /* up phase */
    for (i = 0; i < num_up_levels; i++) {
        bcol_module = GET_BCOL(topo_info, i);
        collective_alg->functions[cnt].fn_idx = up_function_idx;
        collective_alg->functions[cnt].bcol_module = bcol_module;
        collective_alg->functions[cnt].index_in_consecutive_same_bcol_calls = scratch_indx[cnt];
        collective_alg->functions[cnt].n_of_this_type_in_a_row = scratch_num[cnt];
        ML_VERBOSE(10, ("Setting collective [collective code %d][count %d], fn_idx %d, index_in_consecutive_same_bcol_calls %d, n_of_this_type_in_a_row %d",
                    collective, cnt, collective_alg->functions[cnt].fn_idx,
                    collective_alg->functions[cnt].index_in_consecutive_same_bcol_calls,
                    collective_alg->functions[cnt].n_of_this_type_in_a_row));
        ++cnt;
    }

    /* top function */
    if (call_for_top_function) {
        bcol_module = GET_BCOL(topo_info, num_hierarchies - 1);
        collective_alg->functions[cnt].fn_idx = top_function_idx;
        collective_alg->functions[cnt].bcol_module = bcol_module;
        collective_alg->functions[cnt].index_in_consecutive_same_bcol_calls = scratch_indx[cnt];
        collective_alg->functions[cnt].n_of_this_type_in_a_row = scratch_num[cnt];
        ML_VERBOSE(10, ("Setting collective [collective code %d][count %d], fn_idx %d, index_in_consecutive_same_bcol_calls %d, n_of_this_type_in_a_row %d",
                    collective, cnt, collective_alg->functions[cnt].fn_idx,
                    collective_alg->functions[cnt].index_in_consecutive_same_bcol_calls,
                    collective_alg->functions[cnt].n_of_this_type_in_a_row));
        ++cnt;
    }

    /* down phase*/
    for (i = num_up_levels - 1; i >= 0; i--) {
        bcol_module = GET_BCOL(topo_info, i);
        collective_alg->functions[cnt].fn_idx = down_function_idx;
        collective_alg->functions[cnt].bcol_module = bcol_module;
        collective_alg->functions[cnt].index_in_consecutive_same_bcol_calls = scratch_indx[cnt];
        collective_alg->functions[cnt].n_of_this_type_in_a_row = scratch_num[cnt];
        ML_VERBOSE(10, ("Setting collective [collective code %d][count %d], fn_idx %d, index_in_consecutive_same_bcol_calls %d, n_of_this_type_in_a_row %d",
                    collective, cnt, collective_alg->functions[cnt].fn_idx,
                    collective_alg->functions[cnt].index_in_consecutive_same_bcol_calls,
                    collective_alg->functions[cnt].n_of_this_type_in_a_row));
        ++cnt;
    }
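    /* At this point the functions array holds the complete up/top/down call chain
       in execution order, and cnt equals collective_alg->n_functions. */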

    /* figure out how many times this bcol is used in this collective call */
    for (i = 0; i < collective_alg->n_functions; i++) {
        mca_bcol_base_module_t *current_bcol=
            collective_alg->functions[i].bcol_module;

        cnt = 0;
        for (j = 0; j < collective_alg->n_functions; ++j) {
            if (current_bcol ==
                    collective_alg->functions[j].bcol_module) {
                collective_alg->functions[j].index_of_this_type_in_collective = cnt;
                ML_VERBOSE(10, ("Pasha: Setting collective [collective code %d][count %d], fn_idx %d, collective_alg->functions[i].index_of_this_type_in_collective %d",
                            collective, cnt, i,
                            collective_alg->functions[j].index_of_this_type_in_collective));
                cnt++;
            }
        }

        collective_alg->functions[i].n_of_this_type_in_collective=cnt;
        ML_VERBOSE(10, ("Pasha: Setting collective [collective code %d][count %d], fn_idx %d, collective_alg->functions[i].n_of_this_type_in_collective %d",
                    collective, cnt, i,
                    collective_alg->functions[i].n_of_this_type_in_collective));
    }

    /* set Barrier algorithm */
    topo_info->hierarchical_algorithms[collective] = collective_alg;
    /* Setup maximum number function calls, it is used for resource allocation */
    ml_module->max_fn_calls = (collective_alg->n_functions > ml_module->max_fn_calls) ?
                                    collective_alg->n_functions : ml_module->max_fn_calls;
    /* Ishai: What is this n_buffers? I did not find where it is being used */
    topo_info->hierarchical_algorithms[collective]->n_buffers = 1;

    /* Release temporary memories */
    if (NULL != scratch_indx) {
        free(scratch_indx);
    }

    if (NULL != scratch_num) {
       free(scratch_num);
    }

    return OMPI_SUCCESS;

Error:
    if (NULL != collective_alg) {
        if (NULL != collective_alg->functions) {
            free(collective_alg->functions);
        }
        free(collective_alg);
    }

    if (NULL != scratch_indx) {
        free(scratch_indx);
    }

    if (NULL != scratch_num) {
        free(scratch_num);
    }

    return ret;
}