static int mca_coll_ml_reduce_unpack(mca_coll_ml_collective_operation_progress_t *coll_op) { int ret; /* need to put in more */ int count = coll_op->variable_fn_params.count; ompi_datatype_t *dtype = coll_op->variable_fn_params.dtype; void *dest = (void *)((uintptr_t)coll_op->full_message.dest_user_addr + (uintptr_t)coll_op->fragment_data.offset_into_user_buffer); void *src = (void *)((uintptr_t)coll_op->fragment_data.buffer_desc->data_addr + (size_t)coll_op->variable_fn_params.rbuf_offset); ret = ompi_datatype_copy_content_same_ddt(dtype, (int32_t) count, (char *) dest, (char *) src); if (ret < 0) { return OMPI_ERROR; } if (coll_op->variable_fn_params.root_flag) { ML_VERBOSE(1,("In reduce unpack %d", *(int *)((unsigned char*) src))); } ML_VERBOSE(10, ("sbuf addr %p, sbuf offset %d, sbuf val %lf, rbuf addr %p, rbuf offset %d, rbuf val %lf.", coll_op->variable_fn_params.sbuf, coll_op->variable_fn_params.sbuf_offset, *(double *) ((unsigned char *) coll_op->variable_fn_params.sbuf + (size_t) coll_op->variable_fn_params.sbuf_offset), coll_op->variable_fn_params.rbuf, coll_op->variable_fn_params.rbuf_offset, *(double *) ((unsigned char *) coll_op->variable_fn_params.rbuf + (size_t) coll_op->variable_fn_params.rbuf_offset))); return OMPI_SUCCESS; }
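/*
 * Illustrative sketch (not part of this module): the unpack step above copies the
 * reduced fragment out of the ML payload buffer (data_addr + rbuf_offset) into the
 * user's destination buffer at the fragment's offset. For a contiguous datatype the
 * arithmetic reduces to the hypothetical helper below, which uses plain memcpy in
 * place of ompi_datatype_copy_content_same_ddt().
 */
static inline void example_unpack_fragment(void *user_dest, size_t user_offset,
                                           const void *ml_payload, size_t rbuf_offset,
                                           size_t frag_size)
{
    void *dest      = (void *)((uintptr_t) user_dest + (uintptr_t) user_offset);
    const void *src = (const void *)((uintptr_t) ml_payload + (uintptr_t) rbuf_offset);

    memcpy(dest, src, frag_size);
}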
int mca_coll_ml_reduce_nb(void *sbuf, void *rbuf, int count, struct ompi_datatype_t *dtype, struct ompi_op_t *op, int root, struct ompi_communicator_t *comm, ompi_request_t **req, mca_coll_base_module_t *module) { int ret = OMPI_SUCCESS; mca_coll_ml_module_t *ml_module = (mca_coll_ml_module_t*)module; if (OPAL_UNLIKELY(!ompi_op_is_commute(op) || !opal_datatype_is_contiguous_memory_layout(&dtype->super, count))) { /* coll/ml does not handle non-commutative operations or non-contiguous layouts at this time. fall back * on another collective module */ return ml_module->fallback.coll_ireduce (sbuf, rbuf, count, dtype, op, root, comm, req, ml_module->fallback.coll_ireduce_module); } ML_VERBOSE(10,("Calling ML Reduce")); ret = parallel_reduce_start(sbuf, rbuf, count, dtype, op, root, comm, ml_module, req, ML_SMALL_DATA_REDUCE, ML_LARGE_DATA_REDUCE); if (OPAL_UNLIKELY(ret != OMPI_SUCCESS)) { ML_VERBOSE(10, ("Failed to launch")); return ret; } ML_VERBOSE(10, ("Non-blocking reduce was started")); return OMPI_SUCCESS; }
int mca_coll_ml_allgather(const void *sbuf, int scount, struct ompi_datatype_t *sdtype, void* rbuf, int rcount, struct ompi_datatype_t *rdtype, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { ompi_request_t *req; int ret; ML_VERBOSE(10, ("Starting blocking allgather")); ret = mca_coll_ml_allgather_start (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm, module, &req); if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { return ret; } ret = ompi_request_wait (&req, MPI_STATUS_IGNORE); ML_VERBOSE(10, ("Blocking allgather is complete")); return ret; }
/** * Hierarchical blocking barrier */ int mca_coll_ml_barrier_intra(struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { int rc; ompi_request_t *req; mca_coll_ml_module_t *ml_module = (mca_coll_ml_module_t *) module; #if OPAL_ENABLE_DEBUG static int barriers_count = 0; #endif ML_VERBOSE(10, ("Barrier num %d start.", ++barriers_count)); rc = mca_coll_ml_barrier_launch(ml_module, &req); if (OPAL_UNLIKELY(rc != OMPI_SUCCESS)) { ML_ERROR(("Failed to launch a barrier.")); return rc; } /* Blocking barrier */ ompi_request_wait_completion(req); ompi_request_free(&req); ML_VERBOSE(10, ("Barrier num %d was done.", barriers_count)); return OMPI_SUCCESS; }
static int mca_coll_ml_lmngr_init(mca_coll_ml_lmngr_t *lmngr) { int i, num_blocks; int rc; unsigned char *addr; bcol_base_network_context_t *nc; ML_VERBOSE(7, ("List initialization")); #ifdef HAVE_POSIX_MEMALIGN if((errno = posix_memalign(&lmngr->base_addr, lmngr->list_alignment, lmngr->list_size * lmngr->list_block_size)) != 0) { ML_ERROR(("Failed to allocate memory: %d [%s]", errno, strerror(errno))); return OMPI_ERROR; } lmngr->alloc_base = lmngr->base_addr; #else lmngr->alloc_base = malloc(lmngr->list_size * lmngr->list_block_size + lmngr->list_alignment); if(NULL == lmngr->alloc_base) { ML_ERROR(("Failed to allocate memory: %d [%s]", errno, strerror(errno))); return OMPI_ERROR; } lmngr->base_addr = (void*)OPAL_ALIGN((uintptr_t)lmngr->alloc_base, lmngr->list_alignment, uintptr_t); #endif assert(lmngr->n_resources < MCA_COLL_ML_MAX_REG_INFO); for(i= 0 ;i < lmngr->n_resources ;i++) { nc = lmngr->net_context[i]; ML_VERBOSE(7, ("Call registration for resource index %d", i)); rc = lmngr_register(lmngr, nc); if (OMPI_SUCCESS != rc) { ML_ERROR(("Failed to lmngr register: %d [%s]", errno, strerror(errno))); return rc; } } /* slice the memory to blocks */ addr = (unsigned char *) lmngr->base_addr; for(num_blocks = 0; num_blocks < (int)lmngr->list_size; num_blocks++) { mca_bcol_base_lmngr_block_t *item = OBJ_NEW(mca_bcol_base_lmngr_block_t); item->base_addr = (void *)addr; item->lmngr = lmngr; /* ML_VERBOSE(10, ("Appending block # %d %p", num_blocks, (void *)addr)); */ opal_list_append(&lmngr->blocks_list, (opal_list_item_t *)item); /* advance the address */ addr += lmngr->list_block_size; } ML_VERBOSE(7, ("List initialization done %d", opal_list_get_size(&lmngr->blocks_list))); return OMPI_SUCCESS; }
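/*
 * Illustrative sketch (not part of this module): the #else branch above over-allocates
 * by list_alignment bytes and rounds the returned pointer up with OPAL_ALIGN. A minimal
 * stand-alone version of that rounding, assuming the alignment is a power of two
 * (hypothetical helper name):
 */
static inline void *example_align_up(void *raw, uintptr_t alignment)
{
    uintptr_t addr = (uintptr_t) raw;

    /* round up to the next multiple of "alignment" */
    addr = (addr + alignment - 1) & ~(alignment - 1);

    return (void *) addr;
}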
static int mca_coll_ml_memsync_recycle_memory(mca_coll_ml_collective_operation_progress_t *coll_op) { mca_coll_ml_module_t *ml_module = (mca_coll_ml_module_t *)coll_op->coll_module; mca_bcol_base_memory_block_desc_t *ml_memblock = ml_module->payload_block; mca_coll_ml_collective_operation_progress_t *pending_op = NULL; int bank = coll_op->full_message.bank_index_to_recycle; int rc; bool have_resources = true; assert(bank >= 0 || bank < (int)ml_memblock->num_banks || ML_MEMSYNC == coll_op->fragment_data.current_coll_op); ML_VERBOSE(10,("MEMSYNC: bank %d was recycled coll_op %p", bank, coll_op)); /* set the bank as free */ ml_memblock->bank_is_busy[bank] = false; ml_memblock->bank_release_counters[bank] = 0; /* Check if we have any requests that are waiting for memory */ while(opal_list_get_size(&ml_module->waiting_for_memory_list) && have_resources) { pending_op = (mca_coll_ml_collective_operation_progress_t *) opal_list_get_first(&ml_module->waiting_for_memory_list); ML_VERBOSE(10, ("Trying to start pending %p", pending_op)); assert(pending_op->pending & REQ_OUT_OF_MEMORY); rc = pending_op->fragment_data.message_descriptor->fragment_launcher(pending_op); switch (rc) { case OMPI_SUCCESS: ML_VERBOSE(10, ("Pending fragment was started %p", pending_op)); pending_op->pending ^= REQ_OUT_OF_MEMORY; opal_list_remove_item(&ml_module->waiting_for_memory_list, (opal_list_item_t *)pending_op); if (0 != pending_op->fragment_data.offset_into_user_buffer) { /* non-zero offset ==> this is not fragment 0 */ CHECK_AND_RECYCLE(pending_op); } break; case OMPI_ERR_TEMP_OUT_OF_RESOURCE: ML_VERBOSE(10, ("Already on the list %p", pending_op)); have_resources = false; break; default: ML_ERROR(("Error happened %d", rc)); return rc; } } ML_VERBOSE(10, ("Memsync done %p", coll_op)); return OMPI_SUCCESS; }
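/*
 * Illustrative sketch (not part of this module): stripped of the opal_list machinery,
 * the drain loop above restarts queued operations one by one, removes the ones that
 * start successfully, stops (without error) when resources run out again, and
 * propagates any other failure. The return codes and launcher callback below are
 * hypothetical stand-ins for the real fragment_launcher protocol.
 */
enum { EXAMPLE_OK = 0, EXAMPLE_NO_RESOURCE = -1 };

static inline int example_drain_pending(int (*launch)(int op_id), const int *pending,
                                        int n_pending, int *n_started)
{
    int i, rc;

    *n_started = 0;
    for (i = 0; i < n_pending; i++) {
        rc = launch(pending[i]);
        if (EXAMPLE_NO_RESOURCE == rc) {
            break;              /* still out of resources: leave the rest queued */
        } else if (EXAMPLE_OK != rc) {
            return rc;          /* hard error: propagate it to the caller */
        }
        (*n_started)++;         /* started successfully: drop it from the queue */
    }

    return EXAMPLE_OK;
}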
int ml_coll_hier_reduce_setup(mca_coll_ml_module_t *ml_module) { int alg, ret, topo_index=0; mca_coll_ml_topology_t *topo_info = &ml_module->topo_list[ml_module->collectives_topology_map[ML_REDUCE][ML_SMALL_MSG]]; if ( ml_module->max_fn_calls < topo_info->n_levels ) { ml_module->max_fn_calls = topo_info->n_levels; } alg = mca_coll_ml_component.coll_config[ML_REDUCE][ML_SMALL_MSG].algorithm_id; topo_index = ml_module->collectives_topology_map[ML_REDUCE][alg]; if (ML_UNDEFINED == alg || ML_UNDEFINED == topo_index) { ML_ERROR(("No topology index or algorithm was defined")); topo_info->hierarchical_algorithms[ML_REDUCE] = NULL; return OMPI_ERROR; } ret = mca_coll_ml_build_static_reduce_schedule(&ml_module->topo_list[topo_index], &ml_module->coll_ml_reduce_functions[alg]); if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { ML_VERBOSE(10, ("Failed to setup static reduce")); return ret; } return OMPI_SUCCESS; }
static int lmngr_register(mca_coll_ml_lmngr_t *lmngr, bcol_base_network_context_t *nc) { int rc, j; int max_nc = lmngr->n_resources; rc = nc->register_memory_fn(nc->context_data, lmngr->base_addr, lmngr->list_size * lmngr->list_block_size, &lmngr->reg_desc[nc->context_id]); if(rc != OMPI_SUCCESS) { int ret_val; ML_VERBOSE(7, ("Failed to register [%d], unrolling the registration", rc)); /* deregister the successful registrations */ for( j = 0; j < max_nc; j++ ) { /* set the registration parameter to point to the current * resource description */ nc = lmngr->net_context[j]; ret_val = nc->deregister_memory_fn(nc->context_data, lmngr->reg_desc[nc->context_id]); if(ret_val != OMPI_SUCCESS) { return ret_val; } } return rc; } return OMPI_SUCCESS; }
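/*
 * Illustrative sketch (not part of this module): the generic shape of the "unroll on
 * failure" pattern used above is to register resources in order and, on the first
 * failure, deregister what already succeeded before returning the error. The
 * register/deregister callbacks here are hypothetical.
 */
static inline int example_register_all(int n, int (*reg)(int idx), void (*dereg)(int idx))
{
    int i, j, rc;

    for (i = 0; i < n; i++) {
        rc = reg(i);
        if (0 != rc) {
            /* unroll the registrations that did succeed */
            for (j = 0; j < i; j++) {
                dereg(j);
            }
            return rc;
        }
    }

    return 0;
}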
int mca_coll_ml_lmngr_tune(mca_coll_ml_lmngr_t *lmngr, size_t block_size, size_t list_size, size_t alignment) { ML_VERBOSE(7, ("Tuning list manager")); if (OPAL_UNLIKELY(NULL != lmngr->base_addr)) { ML_VERBOSE(7, ("The list manager is already initialized, it cannot be tuned")); return OMPI_ERROR; } lmngr->list_block_size = block_size; lmngr->list_alignment = alignment; lmngr->list_size = list_size; return OMPI_SUCCESS; }
int mca_coll_ml_lmngr_append_nc(mca_coll_ml_lmngr_t *lmngr, bcol_base_network_context_t *nc) { int i, rc; ML_VERBOSE(7, ("Append new network context %p to list manager %p", nc, lmngr)); if (NULL == nc) { return OMPI_ERROR; } /* check if we already have the context on the list. if we do have - do not do anything, just return success */ if (OPAL_UNLIKELY(MCA_COLL_ML_MAX_REG_INFO == lmngr->n_resources)) { ML_ERROR(("Exceeded the maximum number of supported network contexts (%d)", MCA_COLL_ML_MAX_REG_INFO)); return OMPI_ERROR; } for (i = 0; i < lmngr->n_resources; i++) { if (lmngr->net_context[i] == nc) { ML_VERBOSE(7, ("It is not new")); return OMPI_SUCCESS; } } ML_VERBOSE(7, ("Adding new context")); /* Setting context id */ nc->context_id = lmngr->n_resources; lmngr->net_context[lmngr->n_resources] = nc; lmngr->n_resources++; /* Register the memory with the new context */ if (NULL != lmngr->base_addr) { rc = lmngr_register(lmngr, nc); if (OMPI_SUCCESS != rc) { return rc; } } return OMPI_SUCCESS; }
static int add_to_invoke_table(mca_bcol_base_module_t *bcol_module, mca_bcol_base_coll_fn_desc_t *fn_filtered, mca_coll_ml_module_t *ml_module) { struct mca_bcol_base_coll_fn_invoke_attributes_t *inv_attribs = NULL; int bcoll_type, data_src_type, waiting_semantic; int range_min,range_max; int i=0,j=0,k=0,mask=1; if((NULL == fn_filtered->inv_attr)||(NULL == fn_filtered->comm_attr)) { return OMPI_ERROR; } ML_VERBOSE(10, ("Calling add_to_invoke_table %p",fn_filtered->coll_fn)); inv_attribs = fn_filtered->inv_attr; bcoll_type = fn_filtered->comm_attr->bcoll_type; data_src_type = fn_filtered->comm_attr->data_src; waiting_semantic = fn_filtered->comm_attr->waiting_semantics; range_min = msg_to_range(inv_attribs->bcol_msg_min); range_max = msg_to_range(inv_attribs->bcol_msg_max); for (j=0; j<OMPI_OP_NUM_OF_TYPES; j++) { for (k=0; k<OMPI_DATATYPE_MAX_PREDEFINED; k++) { if ((inv_attribs->datatype_bitmap & (mask << k)) && (inv_attribs->op_types_bitmap & (mask << j))){ for (i=range_min; i<=range_max; i++) { bcol_module->filtered_fns_table[data_src_type][waiting_semantic][bcoll_type][i][j][k] = fn_filtered; ML_VERBOSE(21, ("Putting functions %d %d %d %d %p", bcoll_type, i, j, k, fn_filtered)); } } } } return 0; }
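/*
 * Illustrative sketch (not part of this module): add_to_invoke_table() expands two
 * capability bitmaps (operations and datatypes) plus a message-size range into every
 * matching cell of the dispatch table. The reduced, hypothetical table below is filled
 * the same way: one pointer per (size range, op, datatype) combination whose bits are set.
 */
#define EXAMPLE_N_RANGES 4
#define EXAMPLE_N_OPS    8
#define EXAMPLE_N_DTYPES 16

static inline void example_fill_dispatch_table(void *table[EXAMPLE_N_RANGES][EXAMPLE_N_OPS][EXAMPLE_N_DTYPES],
                                               unsigned op_bitmap, unsigned dtype_bitmap,
                                               int range_min, int range_max, void *fn)
{
    int r, op, dt;

    for (op = 0; op < EXAMPLE_N_OPS; op++) {
        if (0 == (op_bitmap & (1u << op))) {
            continue;
        }
        for (dt = 0; dt < EXAMPLE_N_DTYPES; dt++) {
            if (0 == (dtype_bitmap & (1u << dt))) {
                continue;
            }
            for (r = range_min; r <= range_max; r++) {
                table[r][op][dt] = fn;
            }
        }
    }
}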
static int mca_coll_ml_barrier_launch(mca_coll_ml_module_t *ml_module, ompi_request_t **req) { int rc; ompi_free_list_item_t *item; mca_coll_ml_collective_operation_progress_t *coll_op; ml_payload_buffer_desc_t *src_buffer_desc = NULL; /* allocate an ml buffer for signaling purposes */ src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module); while (NULL == src_buffer_desc) { opal_progress(); src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module); } /* Blocking call on fragment allocation (Maybe we want to make it non blocking ?) */ OMPI_FREE_LIST_WAIT(&(ml_module->coll_ml_collective_descriptors), item, rc); coll_op = (mca_coll_ml_collective_operation_progress_t *) item; assert(NULL != coll_op); ML_VERBOSE(10, ("Get coll request %p", coll_op)); MCA_COLL_ML_OP_BASIC_SETUP(coll_op, 0, 0, NULL, NULL, ml_module->coll_ml_barrier_function); coll_op->fragment_data.buffer_desc = src_buffer_desc; coll_op->dag_description.num_tasks_completed = 0; coll_op->variable_fn_params.buffer_index = src_buffer_desc->buffer_index; coll_op->variable_fn_params.sequence_num = OPAL_THREAD_ADD64(&(ml_module->collective_sequence_num), 1); /* Pointer to a coll finalize function */ coll_op->process_fn = NULL; (*req) = &coll_op->full_message.super; OMPI_REQUEST_INIT((*req), false); (*req)->req_status._cancelled = 0; (*req)->req_state = OMPI_REQUEST_ACTIVE; (*req)->req_status.MPI_ERROR = OMPI_SUCCESS; /* Set order info if there is a bcol needs ordering */ MCA_COLL_ML_SET_ORDER_INFO(coll_op, 1); return mca_coll_ml_generic_collectives_launcher(coll_op, mca_coll_ml_barrier_task_setup); }
static inline __opal_attribute_always_inline__ int mca_coll_ml_memsync_launch(mca_coll_ml_module_t *ml_module, ompi_request_t **req, int bank_index) { mca_coll_ml_collective_operation_progress_t *coll_op; coll_op = mca_coll_ml_alloc_op_prog_single_frag_dag(ml_module, ml_module->coll_ml_memsync_function, NULL, NULL, 0, 0); assert(NULL != coll_op); ML_VERBOSE(10, ("Get coll request %p", coll_op)); coll_op->fragment_data.buffer_desc = NULL; /* Caching bank index for future memory recycling callback */ coll_op->full_message.bank_index_to_recycle = bank_index; coll_op->fragment_data.current_coll_op = ML_MEMSYNC; /* Rather than defining one more parameter, we pass the bank index * in the root field */ coll_op->variable_fn_params.root = bank_index; /* This is also a little ugly: nobody waits on this request, so in order * to recycle it we have to set the offset to a non-zero value */ coll_op->fragment_data.offset_into_user_buffer = 1; coll_op->variable_fn_params.buffer_index = MCA_COLL_ML_NO_BUFFER; coll_op->variable_fn_params.sequence_num = -1; /* It should be safe to use -1 */ /* Pointer to a coll finalize function */ if (OPAL_LIKELY(ml_module->initialized)) { coll_op->process_fn = mca_coll_ml_memsync_recycle_memory; } else { /* No post work on first call */ coll_op->process_fn = NULL; } ML_VERBOSE(10,("Memsync start %p", coll_op)); return mca_coll_ml_generic_collectives_append_to_queue(coll_op, mca_coll_ml_barrier_task_setup); }
static void destruct_lmngr(mca_coll_ml_lmngr_t *lmngr) { int max_nc = lmngr->n_resources; int rc, i; bcol_base_network_context_t *nc; opal_list_item_t *item; ML_VERBOSE(6, ("Destructing list manager %p", (void *)lmngr)); while (NULL != (item = opal_list_remove_first(&lmngr->blocks_list))) { OBJ_RELEASE(item); } OBJ_DESTRUCT(&lmngr->blocks_list); if (NULL != lmngr->alloc_base) { for( i = 0; i < max_nc; i++ ) { nc = lmngr->net_context[i]; rc = nc->deregister_memory_fn(nc->context_data, lmngr->reg_desc[nc->context_id]); if(rc != OMPI_SUCCESS) { ML_ERROR(("Failed to unregister , lmngr %p", (void *)lmngr)); } } ML_VERBOSE(10, ("Release base addr %p", lmngr->alloc_base)); free(lmngr->alloc_base); lmngr->alloc_base = NULL; lmngr->base_addr = NULL; } lmngr->list_block_size = 0; lmngr->list_alignment = 0; lmngr->list_size = 0; lmngr->n_resources = 0; OBJ_DESTRUCT(&lmngr->mem_lock); }
static int mca_coll_ml_reduce_task_setup (mca_coll_ml_collective_operation_progress_t *coll_op) { int fn_idx, h_level, next_h_level, my_index; mca_sbgp_base_module_t *sbgp; mca_coll_ml_topology_t *topo = coll_op->coll_schedule->topo_info; fn_idx = coll_op->sequential_routine.current_active_bcol_fn; h_level = coll_op->coll_schedule->component_functions[fn_idx].h_level; next_h_level = (fn_idx < coll_op->coll_schedule->n_fns - 1) ? coll_op->coll_schedule->component_functions[fn_idx+1].h_level : -1; sbgp = topo->component_pairs[h_level].subgroup_module; my_index = sbgp->my_index; if (coll_op->variable_fn_params.root_flag) { ML_VERBOSE(1,("In task completion Data in receiver buffer %d ", *(int *)((unsigned char*) coll_op->variable_fn_params.rbuf + coll_op->variable_fn_params.rbuf_offset))); } /* determine the root for this level of the hierarchy */ if (coll_op->coll_schedule->topo_info->route_vector[coll_op->global_root].level == next_h_level || coll_op->global_root == sbgp->group_list[my_index]) { /* I am the global root or I will be talking to the global root in the next round. */ coll_op->variable_fn_params.root = my_index; } else if (coll_op->coll_schedule->topo_info->route_vector[coll_op->global_root].level == h_level) { /* the root is in this level of my hierarchy */ coll_op->variable_fn_params.root = coll_op->coll_schedule->topo_info->route_vector[coll_op->global_root].rank; } else { coll_op->variable_fn_params.root = 0; } /* Set the route vector for this root */ coll_op->variable_fn_params.root_route = &coll_op->coll_schedule->topo_info->route_vector[sbgp->group_list[coll_op->variable_fn_params.root]]; /* Am I the root of this hierarchy? */ coll_op->variable_fn_params.root_flag = (my_index == coll_op->variable_fn_params.root); /* For hierarchy switch btw source and destination buffer * No need to make this switch for the first call .. * */ if (0 < fn_idx) { int tmp_offset = coll_op->variable_fn_params.sbuf_offset; coll_op->variable_fn_params.sbuf_offset = coll_op->variable_fn_params.rbuf_offset; coll_op->variable_fn_params.rbuf_offset = tmp_offset; } return OMPI_SUCCESS; }
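/*
 * Illustrative sketch (not part of this module): after the first hierarchy level, the
 * task setup above swaps the send and receive offsets so that the result written by
 * level N inside the ML buffer becomes the input of level N+1. In isolation the
 * ping-pong looks like this (hypothetical helper):
 */
static inline void example_pingpong_offsets(size_t *sbuf_offset, size_t *rbuf_offset, int fn_idx)
{
    if (0 < fn_idx) {                /* no swap before the first level runs */
        size_t tmp = *sbuf_offset;

        *sbuf_offset = *rbuf_offset;
        *rbuf_offset = tmp;
    }
}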
int mca_coll_ml_allgather_nb(const void *sbuf, int scount, struct ompi_datatype_t *sdtype, void* rbuf, int rcount, struct ompi_datatype_t *rdtype, struct ompi_communicator_t *comm, ompi_request_t **req, mca_coll_base_module_t *module) { int ret; ML_VERBOSE(10, ("Starting non-blocking allgather")); ret = mca_coll_ml_allgather_start (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm, module, req); if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { return ret; } ML_VERBOSE(10, ("Non-blocking allgather started")); return ret; }
/** * Hierarchical non-blocking barrier */ int mca_coll_ml_ibarrier_intra(struct ompi_communicator_t *comm, ompi_request_t **req, mca_coll_base_module_t *module) { int rc; mca_coll_ml_module_t *ml_module = (mca_coll_ml_module_t *) module; #if OPAL_ENABLE_DEBUG static int barriers_count = 0; #endif ML_VERBOSE(10, ("IBarrier num %d start.", ++barriers_count)); rc = mca_coll_ml_barrier_launch(ml_module, req); if (OPAL_UNLIKELY(rc != OMPI_SUCCESS)) { ML_ERROR(("Failed to launch a barrier.")); return rc; } ML_VERBOSE(10, ("IBarrier num %d was done.", barriers_count)); return OMPI_SUCCESS; }
int mca_coll_ml_lmngr_reg(void) { int tmp, ret = OMPI_SUCCESS; mca_coll_ml_component_t *cm = &mca_coll_ml_component; #define CHECK(expr) do {\ tmp = (expr); \ if (0 > tmp) ret = tmp; \ } while (0) ML_VERBOSE(7, ("Setting parameters for list manager")); cm->lmngr_size = 8; CHECK(mca_base_component_var_register(&mca_coll_ml_component.super.collm_version, "memory_manager_list_size", "Memory manager list size", MCA_BASE_VAR_TYPE_SIZE_T, NULL, 0, 0, OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY, &cm->lmngr_size)); /* The list size cannot be smaller than the maximum possible number of ML modules, i.e. the maximum number of communicators supported by ML */ if (cm->lmngr_size < cm->max_comm) { cm->lmngr_size = cm->max_comm; } mca_coll_ml_component.lmngr_block_size = cm->payload_buffer_size * cm->n_payload_buffs_per_bank * cm->n_payload_mem_banks * cm->lmngr_size; CHECK(mca_base_component_var_register(&mca_coll_ml_component.super.collm_version, "memory_manager_block_size", "Memory manager block size", MCA_BASE_VAR_TYPE_SIZE_T, NULL, 0, 0, OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY, &mca_coll_ml_component.lmngr_block_size)); cm->lmngr_alignment = opal_getpagesize(); CHECK(mca_base_component_var_register(&mca_coll_ml_component.super.collm_version, "memory_manager_alignment", "Memory manager alignment", MCA_BASE_VAR_TYPE_SIZE_T, NULL, 0, 0, OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY, &cm->lmngr_alignment)); return ret; }
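/*
 * Illustrative sketch (not part of this module): the CHECK() macro above keeps
 * registering parameters even after a failure and remembers the last negative return
 * code, so the caller sees a single aggregated status. A stand-alone equivalent with
 * hypothetical registration callbacks:
 */
static inline int example_register_params(int (*reg_a)(void), int (*reg_b)(void))
{
    int tmp, ret = 0;

    tmp = reg_a();
    if (0 > tmp) ret = tmp;   /* remember the failure ...                */

    tmp = reg_b();
    if (0 > tmp) ret = tmp;   /* ... but still register the other params */

    return ret;
}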
mca_bcol_base_lmngr_block_t* mca_coll_ml_lmngr_alloc ( mca_coll_ml_lmngr_t *lmngr) { int rc; opal_list_t *list = &lmngr->blocks_list; /* Check if the list manager was initialized */ if(OPAL_UNLIKELY(NULL == lmngr->base_addr)) { ML_VERBOSE(7 ,("Starting memory initialization")); rc = mca_coll_ml_lmngr_init(lmngr); if (OMPI_SUCCESS != rc) { ML_ERROR(("Failed to init memory")); return NULL; } } if(OPAL_UNLIKELY(opal_list_is_empty(list))) { /* The upper layer needs to handle the NULL */ ML_VERBOSE(1, ("List manager is empty.")); return NULL; } return (mca_bcol_base_lmngr_block_t *)opal_list_remove_first(list); }
int ml_coll_memsync_setup(mca_coll_ml_module_t *ml_module) { int ret; /* For barrier synchronization we use the barrier topology */ mca_coll_ml_topology_t *topo_info = &ml_module->topo_list[ml_module->collectives_topology_map[ML_BARRIER][ML_SMALL_MSG]]; ret = mca_coll_ml_build_memsync_schedule(topo_info, &ml_module->coll_ml_memsync_function); if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { ML_VERBOSE(10, ("Failed to setup the memsync schedule")); return ret; } return OMPI_SUCCESS; }
/** * Non-blocking memory synchronization */ int mca_coll_ml_memsync_intra(mca_coll_ml_module_t *ml_module, int bank_index) { int rc; ompi_request_t *req; ML_VERBOSE(8, ("MEMSYNC start")); if (OPAL_UNLIKELY(0 == opal_list_get_size(&ml_module->active_bcols_list))) { /* Josh's change: In the case where only p2p is active, we have no way * to reset the bank release counters to zero. I am doing that here since it * would actually be "correct" to do it outside of this conditional; however, * I suspect that resetting the value to zero elsewhere would result in a corrupted * flow for non-contiguous data types */ /* nasty hack to ensure that resources are released in the single level * ptp case. */ mca_coll_ml_collective_operation_progress_t dummy_coll; dummy_coll.coll_module = (mca_coll_base_module_t *) ml_module; dummy_coll.fragment_data.current_coll_op = ML_MEMSYNC; dummy_coll.full_message.bank_index_to_recycle = bank_index; /* Handling the special case when memory synchronization is not required */ rc = mca_coll_ml_memsync_recycle_memory(&dummy_coll); if(OPAL_UNLIKELY(rc != OMPI_SUCCESS)){ ML_ERROR(("Failed to flush the list.")); return rc; } } else { /* retain the communicator until the operation is finished. the communicator * will be released by CHECK_AND_RECYCLE */ OBJ_RETAIN(ml_module->comm); rc = mca_coll_ml_memsync_launch(ml_module, &req, bank_index); if (OPAL_UNLIKELY(rc != OMPI_SUCCESS)) { ML_ERROR(("Failed to launch a memory sync operation.")); return rc; } } return OMPI_SUCCESS; }
/* The function is very different from the above function */ int mca_coll_ml_check_if_bcol_is_requested(const char *component_name) { mca_base_component_list_item_t *bcol_comp; ML_VERBOSE(10, ("Loop over bcol components")); for ( bcol_comp = (mca_base_component_list_item_t *) opal_list_get_first(&mca_bcol_base_components_in_use); bcol_comp != (mca_base_component_list_item_t *) opal_list_get_end(&mca_bcol_base_components_in_use); bcol_comp = (mca_base_component_list_item_t *) opal_list_get_next(bcol_comp)) { if(0 == strcmp(component_name, ((mca_bcol_base_component_2_0_0_t *) bcol_comp->cli_component)->bcol_version.mca_component_name)) { return true; } } /* the component was not requested */ return false; }
/* Constructor for list memory manager */ static void construct_lmngr(mca_coll_ml_lmngr_t *lmngr) { mca_coll_ml_component_t *cm = &mca_coll_ml_component; ML_VERBOSE(7, ("Constructing new list manager %p", (void *)lmngr)); /* No real memory is allocated, only basic init. The real memory will be allocated on demand, on first block allocation */ /* I am caching this block size, alignment and list size since maybe in the future we will want to define different parameters for lists */ lmngr->list_block_size = cm->lmngr_block_size; lmngr->list_alignment = cm->lmngr_alignment; lmngr->list_size = cm->lmngr_size; lmngr->n_resources = 0; lmngr->base_addr = NULL; /* If the base addr is not null, the struct was initialized and memory was allocated */ /* Not sure that the lock is required */ OBJ_CONSTRUCT(&lmngr->mem_lock, opal_mutex_t); /* Only construct the list, no memory initialization */ OBJ_CONSTRUCT(&lmngr->blocks_list, opal_list_t); }
static inline __opal_attribute_always_inline__ int mca_coll_ml_allgather_start (const void *sbuf, int scount, struct ompi_datatype_t *sdtype, void* rbuf, int rcount, struct ompi_datatype_t *rdtype, struct ompi_communicator_t *comm, mca_coll_base_module_t *module, ompi_request_t **req) { size_t pack_len, sdt_size; int ret, n_fragments = 1, comm_size; mca_coll_ml_topology_t *topo_info; mca_bcol_base_payload_buffer_desc_t *src_buffer_desc; mca_coll_ml_component_t *cm = &mca_coll_ml_component; mca_coll_ml_collective_operation_progress_t *coll_op; mca_coll_ml_module_t *ml_module = (mca_coll_ml_module_t *) module; ptrdiff_t lb, extent; bool scontig, rcontig, in_place = false; /* check for in place setting */ if (MPI_IN_PLACE == sbuf) { in_place = true; sdtype = rdtype; scount = rcount; } /* scontig could be != to rcontig */ scontig = ompi_datatype_is_contiguous_memory_layout(sdtype, scount); rcontig = ompi_datatype_is_contiguous_memory_layout(rdtype, rcount); comm_size = ompi_comm_size(comm); ML_VERBOSE(10, ("Starting allgather")); assert(NULL != sdtype); /* Calculate size of the data, * at this stage, only contiguous data is supported */ /* this is valid for allagther */ ompi_datatype_type_size(sdtype, &sdt_size); pack_len = scount * sdt_size; if (in_place) { sbuf = (char *) rbuf + ompi_comm_rank(comm) * pack_len; } /* Allocate collective schedule and pack message */ /* this is the total ending message size that will need to fit in the ml-buffer */ if (pack_len <= (size_t) ml_module->small_message_thresholds[BCOL_ALLGATHER]) { /* The len of the message can not be larger than ML buffer size */ ML_VERBOSE(10, ("Single frag %d %d %d", pack_len, comm_size, ml_module->payload_block->size_buffer)); assert(pack_len * comm_size <= ml_module->payload_block->size_buffer); src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module); while (NULL == src_buffer_desc) { opal_progress(); src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module); } /* change 1 */ coll_op = mca_coll_ml_alloc_op_prog_single_frag_dag(ml_module, ml_module->coll_ml_allgather_functions[ML_SMALL_DATA_ALLGATHER], sbuf, rbuf, pack_len, 0 /* offset for first pack */); MCA_COLL_IBOFFLOAD_SET_ML_BUFFER_INFO(coll_op, src_buffer_desc->buffer_index, src_buffer_desc); coll_op->fragment_data.current_coll_op = ML_SMALL_DATA_ALLGATHER; /* task setup callback function */ coll_op->sequential_routine.seq_task_setup = mca_coll_ml_allgather_task_setup; /* change 2 */ if (!scontig) { coll_op->full_message.n_bytes_scheduled = mca_coll_ml_convertor_prepare(sdtype, scount, sbuf, &coll_op->full_message.send_convertor, MCA_COLL_ML_NET_STREAM_SEND); mca_coll_ml_convertor_pack( (void *) ((uintptr_t) src_buffer_desc->data_addr + pack_len * (coll_op->coll_schedule->topo_info->hier_layout_info[0].offset + coll_op->coll_schedule->topo_info->hier_layout_info[0].level_one_index)), pack_len, &coll_op->full_message.send_convertor); } else { /* change 3 */ memcpy((void *)((uintptr_t) src_buffer_desc->data_addr + pack_len * (coll_op->coll_schedule->topo_info->hier_layout_info[0].offset + coll_op->coll_schedule->topo_info->hier_layout_info[0].level_one_index)), sbuf, pack_len); coll_op->full_message.n_bytes_scheduled = pack_len; } if (!rcontig) { mca_coll_ml_convertor_prepare(rdtype, rcount * comm_size, rbuf, &coll_op->full_message.recv_convertor, MCA_COLL_ML_NET_STREAM_RECV); } if (coll_op->coll_schedule->topo_info->ranks_contiguous) { coll_op->process_fn = mca_coll_ml_allgather_small_unpack_data; } else { coll_op->process_fn = mca_coll_ml_allgather_noncontiguous_unpack_data; } 
/* whole ml-buffer is used to send AND receive */ coll_op->variable_fn_params.sbuf = (void *) src_buffer_desc->data_addr; coll_op->variable_fn_params.rbuf = (void *) src_buffer_desc->data_addr; /* we can set the initial offset here */ coll_op->variable_fn_params.sbuf_offset = 0; coll_op->variable_fn_params.rbuf_offset = 0; coll_op->variable_fn_params.count = scount; coll_op->fragment_data.fragment_size = coll_op->full_message.n_bytes_scheduled; /* For small CINCO, we may use the native datatype */ coll_op->variable_fn_params.dtype = sdtype; coll_op->variable_fn_params.buffer_size = pack_len; coll_op->variable_fn_params.root = 0; } else if (cm->enable_fragmentation || pack_len * comm_size < (1 << 20)) { /* calculate the number of fragments and the size of each frag */ size_t n_dts_per_frag, frag_len; int pipeline_depth = mca_coll_ml_component.pipeline_depth; /* Calculate the number of fragments required for this message careful watch the integer division !*/ frag_len = (pack_len <= (size_t) ml_module->small_message_thresholds[BCOL_ALLGATHER] ? pack_len : (size_t) ml_module->small_message_thresholds[BCOL_ALLGATHER]); n_dts_per_frag = frag_len / sdt_size; n_fragments = (pack_len + sdt_size * n_dts_per_frag - 1) / (sdt_size * n_dts_per_frag); pipeline_depth = (n_fragments < pipeline_depth ? n_fragments : pipeline_depth); src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module); while (NULL == src_buffer_desc) { opal_progress(); src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module); } /* change 4 */ coll_op = mca_coll_ml_alloc_op_prog_single_frag_dag(ml_module, ml_module->coll_ml_allgather_functions[ML_SMALL_DATA_ALLGATHER], sbuf, rbuf, pack_len, 0 /* offset for first pack */); MCA_COLL_IBOFFLOAD_SET_ML_BUFFER_INFO(coll_op, src_buffer_desc->buffer_index, src_buffer_desc); topo_info = coll_op->coll_schedule->topo_info; /* task setup callback function */ coll_op->sequential_routine.seq_task_setup = mca_coll_ml_allgather_task_setup; if (!scontig) { coll_op->full_message.send_converter_bytes_packed = mca_coll_ml_convertor_prepare( sdtype, scount, NULL, &coll_op->full_message.dummy_convertor, MCA_COLL_ML_NET_STREAM_SEND); coll_op->full_message.dummy_conv_position = 0; mca_coll_ml_convertor_get_send_frag_size( ml_module, &frag_len, &coll_op->full_message); /* change 5 */ mca_coll_ml_convertor_prepare(sdtype, scount, sbuf, &coll_op->full_message.send_convertor, MCA_COLL_ML_NET_STREAM_SEND); mca_coll_ml_convertor_pack( (void *) ((uintptr_t) src_buffer_desc->data_addr + frag_len * (topo_info->hier_layout_info[0].offset + topo_info->hier_layout_info[0].level_one_index)), frag_len, &coll_op->full_message.send_convertor); } else { /* change 6 */ memcpy((void *)((uintptr_t)src_buffer_desc->data_addr + frag_len * (topo_info->hier_layout_info[0].offset + topo_info->hier_layout_info[0].level_one_index)), sbuf, frag_len); } if (!rcontig) { mca_coll_ml_convertor_prepare(rdtype, rcount * comm_size, rbuf, &coll_op->full_message.recv_convertor, MCA_COLL_ML_NET_STREAM_RECV); } coll_op->process_fn = mca_coll_ml_allgather_noncontiguous_unpack_data; /* hopefully this doesn't royaly screw things up idea behind this is the * whole ml-buffer is used to send and receive */ coll_op->variable_fn_params.sbuf = (void *) src_buffer_desc->data_addr; coll_op->variable_fn_params.rbuf = (void *) src_buffer_desc->data_addr; /* we can set the initial offset here */ coll_op->variable_fn_params.sbuf_offset = 0; coll_op->variable_fn_params.rbuf_offset = 0; coll_op->fragment_data.buffer_desc = src_buffer_desc; 
coll_op->fragment_data.fragment_size = frag_len; coll_op->fragment_data.message_descriptor->n_active = 1; coll_op->full_message.n_bytes_scheduled = frag_len; coll_op->full_message.fragment_launcher = mca_coll_ml_allgather_frag_progress; coll_op->full_message.pipeline_depth = pipeline_depth; coll_op->fragment_data.current_coll_op = ML_SMALL_DATA_ALLGATHER; /* remember this is different for frags !! Caused data corruption when * not properly set. Need to be sure you have consistent units. */ coll_op->variable_fn_params.count = frag_len; coll_op->variable_fn_params.dtype = MPI_BYTE; /* for fragmented data, we work in * units of bytes. This means that * all of our arithmetic is done * in terms of bytes */ coll_op->variable_fn_params.root = 0; coll_op->variable_fn_params.frag_size = frag_len; coll_op->variable_fn_params.buffer_size = frag_len; } else { /* change 7 */ ML_VERBOSE(10, ("ML_ALLGATHER_LARGE_DATA_KNOWN case.")); coll_op = mca_coll_ml_alloc_op_prog_single_frag_dag(ml_module, ml_module->coll_ml_allgather_functions[ML_LARGE_DATA_ALLGATHER], sbuf, rbuf, pack_len, 0 /* offset for first pack */); topo_info = coll_op->coll_schedule->topo_info; if (MCA_BCOL_BASE_NO_ML_BUFFER_FOR_LARGE_MSG & topo_info->all_bcols_mode) { MCA_COLL_IBOFFLOAD_SET_ML_BUFFER_INFO(coll_op, MCA_COLL_ML_NO_BUFFER, NULL); } else { src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module); while (NULL == src_buffer_desc) { opal_progress(); src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module); } MCA_COLL_IBOFFLOAD_SET_ML_BUFFER_INFO(coll_op, src_buffer_desc->buffer_index, src_buffer_desc); } /* not sure if I really need this here */ coll_op->sequential_routine.seq_task_setup = mca_coll_ml_allgather_task_setup; coll_op->process_fn = NULL; /* probably the most important piece */ coll_op->variable_fn_params.sbuf = sbuf; coll_op->variable_fn_params.rbuf = rbuf; coll_op->variable_fn_params.sbuf_offset = 0; coll_op->variable_fn_params.rbuf_offset = 0; coll_op->variable_fn_params.count = scount; coll_op->variable_fn_params.dtype = sdtype;/* for zero copy, we want the * native datatype and actual count */ coll_op->variable_fn_params.root = 0; /* you still need to copy in your own data into the rbuf */ /* don't need to do this if you have in place data */ if (!in_place) { memcpy((char *) rbuf + ompi_comm_rank(comm) * pack_len, sbuf, pack_len); } } coll_op->full_message.send_count = scount; coll_op->full_message.recv_count = rcount; coll_op->full_message.send_data_continguous = scontig; coll_op->full_message.recv_data_continguous = rcontig; ompi_datatype_get_extent(sdtype, &lb, &extent); coll_op->full_message.send_extent = (size_t) extent; ompi_datatype_get_extent(rdtype, &lb, &extent); coll_op->full_message.recv_extent = (size_t) extent; /* Fill in the function arguments */ coll_op->variable_fn_params.sequence_num = OPAL_THREAD_ADD32(&(ml_module->collective_sequence_num), 1); coll_op->variable_fn_params.hier_factor = comm_size; MCA_COLL_ML_SET_ORDER_INFO(coll_op, n_fragments); ret = mca_coll_ml_launch_sequential_collective (coll_op); if (OMPI_SUCCESS != ret) { ML_VERBOSE(10, ("Failed to launch")); return ret; } *req = &coll_op->full_message.super; return OMPI_SUCCESS; }
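/*
 * Illustrative sketch (not part of this module): the fragmented path above sizes each
 * fragment in whole datatypes, rounds the fragment count up, and clamps the pipeline
 * depth to that count. The arithmetic in isolation (hypothetical names; dt_size and the
 * per-fragment datatype count are assumed non-zero):
 */
static inline void example_fragment_math(size_t total_bytes, size_t dt_size,
                                         size_t max_frag_bytes, int max_pipeline_depth,
                                         size_t *n_dts_per_frag, size_t *n_fragments,
                                         int *pipeline_depth)
{
    size_t frag_bytes = (total_bytes <= max_frag_bytes) ? total_bytes : max_frag_bytes;

    *n_dts_per_frag = frag_bytes / dt_size;                      /* whole datatypes per fragment */
    *n_fragments    = (total_bytes + dt_size * (*n_dts_per_frag) - 1) /
                      (dt_size * (*n_dts_per_frag));             /* ceiling division */
    *pipeline_depth = (*n_fragments < (size_t) max_pipeline_depth) ?
                      (int) *n_fragments : max_pipeline_depth;
}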
static int mca_coll_ml_reduce_frag_progress(mca_coll_ml_collective_operation_progress_t *coll_op) { /* local variables */ void *buf; size_t dt_size; int ret, frag_len, count; ptrdiff_t lb, extent; mca_bcol_base_payload_buffer_desc_t *src_buffer_desc; mca_coll_ml_collective_operation_progress_t *new_op; mca_coll_ml_module_t *ml_module = OP_ML_MODULE(coll_op); ret = ompi_datatype_get_extent(coll_op->variable_fn_params.dtype, &lb, &extent); if (ret < 0) { return OMPI_ERROR; } dt_size = (size_t) extent; /* Keep the pipeline filled with fragments */ while (coll_op->fragment_data.message_descriptor->n_active < coll_op->fragment_data.message_descriptor->pipeline_depth) { /* If an active fragment happens to have completed the collective during * a hop into the progress engine, then don't launch a new fragment, * instead break and return. */ if (coll_op->fragment_data.message_descriptor->n_bytes_scheduled == coll_op->fragment_data.message_descriptor->n_bytes_total) { break; } /* Get an ml buffer */ src_buffer_desc = mca_coll_ml_alloc_buffer(OP_ML_MODULE(coll_op)); if (NULL == src_buffer_desc) { /* If there exist outstanding fragments, then break out * and let an active fragment deal with this later, * there are no buffers available. */ if (0 < coll_op->fragment_data.message_descriptor->n_active) { return OMPI_SUCCESS; } else { /* It is useless to call progress from here, since * ml progress can't be executed as result ml memsync * call will not be completed and no memory will be * recycled. So we put the element on the list, and we will * progress it later when memsync will recycle some memory*/ /* The fragment is already on list and * the we still have no ml resources * Return busy */ if (coll_op->pending & REQ_OUT_OF_MEMORY) { ML_VERBOSE(10,("Out of resources %p", coll_op)); return OMPI_ERR_TEMP_OUT_OF_RESOURCE; } coll_op->pending |= REQ_OUT_OF_MEMORY; opal_list_append(&((OP_ML_MODULE(coll_op))->waiting_for_memory_list), (opal_list_item_t *)coll_op); ML_VERBOSE(10,("Out of resources %p adding to pending queue", coll_op)); return OMPI_ERR_TEMP_OUT_OF_RESOURCE; } } /* Get a new collective descriptor and initialize it */ new_op = mca_coll_ml_alloc_op_prog_single_frag_dag(ml_module, ml_module->coll_ml_reduce_functions[ML_SMALL_DATA_REDUCE], coll_op->fragment_data.message_descriptor->src_user_addr, coll_op->fragment_data.message_descriptor->dest_user_addr, coll_op->fragment_data.message_descriptor->n_bytes_total, coll_op->fragment_data.message_descriptor->n_bytes_scheduled); ML_VERBOSE(1,(" In Reduce fragment progress %d %d ", coll_op->fragment_data.message_descriptor->n_bytes_total, coll_op->fragment_data.message_descriptor->n_bytes_scheduled)); MCA_COLL_IBOFFLOAD_SET_ML_BUFFER_INFO(new_op, src_buffer_desc->buffer_index, src_buffer_desc); new_op->fragment_data.current_coll_op = coll_op->fragment_data.current_coll_op; new_op->fragment_data.message_descriptor = coll_op->fragment_data.message_descriptor; /* set the task setup callback */ new_op->sequential_routine.seq_task_setup = mca_coll_ml_reduce_task_setup; /* We need this address for pointer arithmetic in memcpy */ buf = (void*)coll_op->fragment_data.message_descriptor->src_user_addr; /* calculate the number of data types in this packet */ count = (coll_op->fragment_data.message_descriptor->n_bytes_total - coll_op->fragment_data.message_descriptor->n_bytes_scheduled < ((size_t) OP_ML_MODULE(coll_op)->small_message_thresholds[BCOL_REDUCE]/4 )? 
(coll_op->fragment_data.message_descriptor->n_bytes_total - coll_op->fragment_data.message_descriptor->n_bytes_scheduled) / dt_size : (size_t) coll_op->variable_fn_params.count); /* calculate the fragment length */ frag_len = count * dt_size; ret = ompi_datatype_copy_content_same_ddt(coll_op->variable_fn_params.dtype, count, (char *) src_buffer_desc->data_addr, (char *) ((uintptr_t) buf + (uintptr_t) coll_op->fragment_data.message_descriptor->n_bytes_scheduled)); if (ret < 0) { return OMPI_ERROR; } /* if root unpack the data */ if (ompi_comm_rank(ml_module->comm) == coll_op->global_root ) { new_op->process_fn = mca_coll_ml_reduce_unpack; new_op->variable_fn_params.root_flag = true; } else { new_op->process_fn = NULL; new_op->variable_fn_params.root_flag = false; } new_op->variable_fn_params.root_route = coll_op->variable_fn_params.root_route; /* Setup fragment specific data */ new_op->fragment_data.message_descriptor->n_bytes_scheduled += frag_len; new_op->fragment_data.buffer_desc = src_buffer_desc; new_op->fragment_data.fragment_size = frag_len; (new_op->fragment_data.message_descriptor->n_active)++; /* Set in Reduce Buffer arguments */ ML_SET_VARIABLE_PARAMS_BCAST(new_op, OP_ML_MODULE(new_op), count, coll_op->variable_fn_params.dtype, src_buffer_desc, 0, (ml_module->payload_block->size_buffer - ml_module->data_offset)/2, frag_len, src_buffer_desc->data_addr); new_op->variable_fn_params.buffer_size = frag_len; new_op->variable_fn_params.sbuf = src_buffer_desc->data_addr; new_op->variable_fn_params.rbuf = src_buffer_desc->data_addr; new_op->variable_fn_params.root = coll_op->variable_fn_params.root; new_op->global_root = coll_op->global_root; new_op->variable_fn_params.op = coll_op->variable_fn_params.op; new_op->variable_fn_params.hier_factor = coll_op->variable_fn_params.hier_factor; new_op->sequential_routine.current_bcol_status = SEQ_TASK_PENDING; MCA_COLL_ML_SET_NEW_FRAG_ORDER_INFO(new_op); ML_VERBOSE(10,("FFFF Contig + fragmentation [0-sk, 1-lk, 3-su, 4-lu] %d %d %d\n", new_op->variable_fn_params.buffer_size, new_op->fragment_data.fragment_size, new_op->fragment_data.message_descriptor->n_bytes_scheduled)); /* initialize first coll */ new_op->sequential_routine.seq_task_setup(new_op); /* append this collective !! */ OPAL_THREAD_LOCK(&(mca_coll_ml_component.sequential_collectives_mutex)); opal_list_append(&mca_coll_ml_component.sequential_collectives, (opal_list_item_t *)new_op); OPAL_THREAD_UNLOCK(&(mca_coll_ml_component.sequential_collectives_mutex)); } return OMPI_SUCCESS; }
static int mca_coll_ml_build_barrier_schedule( mca_coll_ml_topology_t *topo_info, mca_coll_ml_collective_operation_description_t **coll_desc, mca_coll_ml_module_t *ml_module) { int i_hier, rc, i_fn, n_fcns, i, n_hiers = topo_info->n_levels; bool call_for_top_func; mca_bcol_base_module_t *bcol_module; mca_coll_ml_compound_functions_t *comp_fn; mca_coll_ml_collective_operation_description_t *schedule; *coll_desc = (mca_coll_ml_collective_operation_description_t *) malloc(sizeof(mca_coll_ml_collective_operation_description_t)); schedule = *coll_desc; if (OPAL_UNLIKELY(NULL == schedule)) { ML_ERROR(("Can't allocate memory.")); rc = OMPI_ERR_OUT_OF_RESOURCE; goto Barrier_Setup_Error; } if (topo_info->global_highest_hier_group_index == topo_info->component_pairs[n_hiers - 1].bcol_index) { /* The process that is member of highest level subgroup should call for top algorithms in addition to fan-in/out steps */ call_for_top_func = true; n_fcns = 2 * n_hiers - 1; /* Up + Top + Down */ } else { /* The process is not member of highest level subgroup, as result it does not call for top algorithm, but it calls for all fan-in/out steps */ call_for_top_func = false; n_fcns = 2 * n_hiers; } if( ml_module->max_fn_calls < n_fcns ) { ml_module->max_fn_calls = n_fcns; } /* Set dependencies equal to number of hierarchies */ schedule->n_fns = n_fcns; schedule->topo_info = topo_info; /* Allocated the component function */ schedule->component_functions = (struct mca_coll_ml_compound_functions_t *) calloc(n_fcns, sizeof(struct mca_coll_ml_compound_functions_t)); if (OPAL_UNLIKELY(NULL == schedule->component_functions)) { ML_ERROR(("Can't allocate memory.")); rc = OMPI_ERR_OUT_OF_RESOURCE; goto Barrier_Setup_Error; } for (i_fn = 0; i_fn < n_fcns; ++i_fn) { i_hier = (i_fn < n_hiers ? i_fn : n_fcns - i_fn - 1); comp_fn = &schedule->component_functions[i_fn]; /* The hierarchial level */ comp_fn->h_level = i_hier; bcol_module = GET_BCOL(topo_info, i_hier); /* The UP direction */ if (1 + i_fn < n_hiers || (1 + i_fn == n_hiers && !call_for_top_func)) { comp_fn->bcol_function = bcol_module->filtered_fns_table[DATA_SRC_KNOWN][NON_BLOCKING][BCOL_FANIN][1][0][0]; if (NULL == comp_fn->bcol_function) { ML_VERBOSE(10, ("no function available for BCOL_FANIN, NON_BLOCKING, DATA_SRC_KNOWN")); rc = OMPI_ERR_NOT_AVAILABLE; goto Barrier_Setup_Error; } /* Each function call with index K is depended of all K-1 previous indices - in simple words we will do sequential Fan-In calls */ comp_fn->num_dependencies = (0 == i_fn) ? 0 : 1; comp_fn->num_dependent_tasks = 1; /* Init component function */ strcpy(comp_fn->fn_name, "FANIN"); /* On the highest level */ } else if ((1 + i_fn == n_hiers && call_for_top_func)) { comp_fn->bcol_function = bcol_module->filtered_fns_table[DATA_SRC_KNOWN][NON_BLOCKING][BCOL_BARRIER][1][0][0]; if (NULL == comp_fn->bcol_function) { ML_VERBOSE(10, ("no function available for BCOL_BARRIER, NON_BLOCKING, DATA_SRC_KNOWN")); rc = OMPI_ERR_NOT_AVAILABLE; goto Barrier_Setup_Error; } /* Each function call with index K is depended of all K-1 previous indices - in simple words we do sequential calls */ comp_fn->num_dependencies = (1 == n_hiers) ? 
0 : 1; /* All Fan-Ins */ comp_fn->num_dependent_tasks = n_fcns - n_hiers; /* All Fan-Outs */ /* Init component function */ strcpy(comp_fn->fn_name, "BARRIER"); ML_VERBOSE(10, ("func indx %d set to BARRIER %p", i_fn, comp_fn->bcol_function)); /* The DOWN direction */ } else { comp_fn->bcol_function = bcol_module->filtered_fns_table[DATA_SRC_KNOWN][NON_BLOCKING][BCOL_FANOUT][1][0][0]; if (NULL == comp_fn->bcol_function) { ML_VERBOSE(10, ("no function available for BCOL_FANOUT, NON_BLOCKING, DATA_SRC_KNOWN")); rc = OMPI_ERR_NOT_AVAILABLE; goto Barrier_Setup_Error; } /* Each function call with index K is depended of all UP and TOP algths */ comp_fn->num_dependencies = 1; comp_fn->num_dependent_tasks = call_for_top_func ? 0 : (i_fn + 1 == n_fcns ? 0 : 1); /* Init component function */ strcpy(comp_fn->fn_name, "FANOUT"); } ML_VERBOSE(10, ("func indx %d set to %p", i_fn, comp_fn->bcol_function)); if (comp_fn->num_dependent_tasks > 0) { comp_fn->dependent_task_indices = (int *) calloc(comp_fn->num_dependent_tasks, sizeof(int)); if (OPAL_UNLIKELY(NULL == comp_fn->dependent_task_indices)) { ML_ERROR(("Can't allocate memory.")); rc = OMPI_ERR_OUT_OF_RESOURCE; goto Barrier_Setup_Error; } /* All indexes follow after this one */ for (i = 0; i < comp_fn->num_dependent_tasks; ++i) { comp_fn->dependent_task_indices[i] = i_fn + i + 1; } } else { comp_fn->dependent_task_indices = NULL; } /* No need completion func for Barrier */ comp_fn->task_comp_fn = NULL; ML_VERBOSE(10, ("Setting collective [Barrier] fn_idx %d, n_of_this_type_in_a_row %d, " "index_in_consecutive_same_bcol_calls %d.", i_fn, comp_fn->constant_group_data.n_of_this_type_in_a_row, comp_fn->constant_group_data.index_in_consecutive_same_bcol_calls)); } rc = ml_coll_barrier_constant_group_data_setup(topo_info, schedule); if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { ML_ERROR(("Failed to init const group data.")); goto Barrier_Setup_Error; } schedule->progress_type = 0; return OMPI_SUCCESS; Barrier_Setup_Error: if (NULL != schedule->component_functions) { free(schedule->component_functions); schedule->component_functions = NULL; } return rc; }
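/*
 * Illustrative sketch (not part of this module): the barrier schedule above contains
 * 2*n_hiers - 1 steps for a process that is a member of the top-level subgroup (one
 * fan-in per level, one barrier at the top, one fan-out per level) and 2*n_hiers steps
 * otherwise. The step-to-level mapping is the "up, then back down" formula from the
 * loop, shown here on its own (hypothetical helper):
 */
static inline int example_barrier_step_level(int i_fn, int n_hiers, int calls_top_barrier)
{
    int n_fcns = calls_top_barrier ? (2 * n_hiers - 1) : (2 * n_hiers);

    /* ascend through the levels, then descend in reverse order */
    return (i_fn < n_hiers) ? i_fn : (n_fcns - i_fn - 1);
}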
static inline __opal_attribute_always_inline__ int parallel_reduce_start (void *sbuf, void *rbuf, int count, struct ompi_datatype_t *dtype, struct ompi_op_t *op, int root, struct ompi_communicator_t *comm, mca_coll_ml_module_t *ml_module, ompi_request_t **req, int small_data_reduce, int large_data_reduce) { ptrdiff_t lb, extent; size_t pack_len, dt_size; mca_bcol_base_payload_buffer_desc_t *src_buffer_desc = NULL; mca_coll_ml_collective_operation_progress_t * coll_op = NULL; bool contiguous = ompi_datatype_is_contiguous_memory_layout(dtype, count); mca_coll_ml_component_t *cm = &mca_coll_ml_component; int ret, n_fragments = 1, frag_len, pipeline_depth, n_dts_per_frag, rank; if (MPI_IN_PLACE == sbuf) { sbuf = rbuf; } ret = ompi_datatype_get_extent(dtype, &lb, &extent); if (ret < 0) { return OMPI_ERROR; } rank = ompi_comm_rank (comm); dt_size = (size_t) extent; pack_len = count * dt_size; /* We use a separate recieve and send buffer so only half the buffer is usable. */ if (pack_len < (size_t) ml_module->small_message_thresholds[BCOL_REDUCE] / 4) { /* The len of the message can not be larger than ML buffer size */ assert(pack_len <= ml_module->payload_block->size_buffer); src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module); ML_VERBOSE(10,("Using small data reduce (threshold = %d)", REDUCE_SMALL_MESSAGE_THRESHOLD)); while (NULL == src_buffer_desc) { opal_progress(); src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module); } coll_op = mca_coll_ml_alloc_op_prog_single_frag_dag(ml_module, ml_module->coll_ml_reduce_functions[small_data_reduce], sbuf, rbuf, pack_len, 0); MCA_COLL_IBOFFLOAD_SET_ML_BUFFER_INFO(coll_op, src_buffer_desc->buffer_index, src_buffer_desc); coll_op->variable_fn_params.rbuf = src_buffer_desc->data_addr; coll_op->variable_fn_params.sbuf = src_buffer_desc->data_addr; coll_op->variable_fn_params.buffer_index = src_buffer_desc->buffer_index; coll_op->variable_fn_params.src_desc = src_buffer_desc; coll_op->variable_fn_params.count = count; ret = ompi_datatype_copy_content_same_ddt(dtype, count, (void *) (uintptr_t) src_buffer_desc->data_addr, (char *) sbuf); if (ret < 0) { return OMPI_ERROR; } } else if (cm->enable_fragmentation || !contiguous) { ML_VERBOSE(1,("Using Fragmented Reduce ")); /* fragment the data */ /* check for retarded application programming decisions */ if (dt_size > (size_t) ml_module->small_message_thresholds[BCOL_REDUCE] / 4) { ML_ERROR(("Sorry, but we don't support datatypes that large")); return OMPI_ERROR; } /* calculate the number of data types that can fit per ml-buffer */ n_dts_per_frag = ml_module->small_message_thresholds[BCOL_REDUCE] / (4 * dt_size); /* calculate the number of fragments */ n_fragments = (count + n_dts_per_frag - 1) / n_dts_per_frag; /* round up */ /* calculate the actual pipeline depth */ pipeline_depth = n_fragments < cm->pipeline_depth ? 
n_fragments : cm->pipeline_depth; /* calculate the fragment size */ frag_len = n_dts_per_frag * dt_size; /* allocate an ml buffer */ src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module); while (NULL == src_buffer_desc) { opal_progress(); src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module); } coll_op = mca_coll_ml_alloc_op_prog_single_frag_dag(ml_module, ml_module->coll_ml_reduce_functions[small_data_reduce], sbuf,rbuf, pack_len, 0 /* offset for first pack */); MCA_COLL_IBOFFLOAD_SET_ML_BUFFER_INFO(coll_op, src_buffer_desc->buffer_index, src_buffer_desc); coll_op->variable_fn_params.sbuf = (void *) src_buffer_desc->data_addr; coll_op->variable_fn_params.rbuf = (void *) src_buffer_desc->data_addr; coll_op->fragment_data.message_descriptor->n_active = 1; coll_op->full_message.n_bytes_scheduled = frag_len; coll_op->full_message.fragment_launcher = mca_coll_ml_reduce_frag_progress; coll_op->full_message.pipeline_depth = pipeline_depth; coll_op->fragment_data.current_coll_op = small_data_reduce; coll_op->fragment_data.fragment_size = frag_len; coll_op->variable_fn_params.count = n_dts_per_frag; /* seems fishy */ coll_op->variable_fn_params.buffer_size = frag_len; coll_op->variable_fn_params.src_desc = src_buffer_desc; /* copy into the ml-buffer */ ret = ompi_datatype_copy_content_same_ddt(dtype, n_dts_per_frag, (char *) src_buffer_desc->data_addr, (char *) sbuf); if (ret < 0) { return OMPI_ERROR; } } else { ML_VERBOSE(1,("Using zero-copy ptp reduce")); coll_op = mca_coll_ml_alloc_op_prog_single_frag_dag(ml_module, ml_module->coll_ml_reduce_functions[large_data_reduce], sbuf, rbuf, pack_len, 0); coll_op->variable_fn_params.userbuf = coll_op->variable_fn_params.sbuf = sbuf; coll_op->variable_fn_params.rbuf = rbuf; /* The ML buffer is used for testing. Later, when we * switch to use knem/mmap/portals this should be replaced * appropriately */ src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module); while (NULL == src_buffer_desc) { opal_progress(); src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module); } coll_op->variable_fn_params.buffer_index = src_buffer_desc->buffer_index; coll_op->variable_fn_params.src_desc = src_buffer_desc; coll_op->variable_fn_params.count = count; } coll_op->process_fn = (rank != root) ? NULL : mca_coll_ml_reduce_unpack; /* Set common parts */ coll_op->fragment_data.buffer_desc = src_buffer_desc; coll_op->variable_fn_params.dtype = dtype; coll_op->variable_fn_params.op = op; /* NTH: the root, root route, and root flag are set in the task setup */ /* Fill in the function arguments */ coll_op->variable_fn_params.sbuf_offset = 0; coll_op->variable_fn_params.rbuf_offset = (ml_module->payload_block->size_buffer - ml_module->data_offset)/2; /* Keep track of the global root of this operation */ coll_op->global_root = root; coll_op->variable_fn_params.sequence_num = OPAL_THREAD_ADD32(&(ml_module->collective_sequence_num), 1); coll_op->sequential_routine.current_active_bcol_fn = 0; /* set the task setup callback */ coll_op->sequential_routine.seq_task_setup = mca_coll_ml_reduce_task_setup; /* Reduce requires the schedule to be fixed. If we use other (changing) schedule, the operation might result in different result. 
*/ coll_op->coll_schedule->component_functions = coll_op->coll_schedule-> comp_fn_arr[coll_op->coll_schedule->topo_info->route_vector[root].level]; /* Launch the collective */ ret = mca_coll_ml_launch_sequential_collective (coll_op); if (OMPI_SUCCESS != ret) { ML_VERBOSE(10, ("Failed to launch reduce collective")); return ret; } *req = &coll_op->full_message.super; return OMPI_SUCCESS; }
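/*
 * Illustrative sketch (not part of this module): for the buffered reduce paths above,
 * the usable part of the ML payload buffer is split in half: data is sent from offset 0
 * and the per-level result is received at the midpoint, which is what the rbuf_offset
 * computation does. In isolation (hypothetical helper):
 */
static inline size_t example_reduce_rbuf_offset(size_t buffer_size, size_t data_offset)
{
    size_t usable = buffer_size - data_offset;   /* payload area past the reserved offset */

    return usable / 2;                           /* the second half is the receive area */
}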
static int mca_coll_ml_allgather_frag_progress(mca_coll_ml_collective_operation_progress_t *coll_op) { /* local variables */ int ret; size_t frag_len, dt_size; const void *buf; mca_bcol_base_payload_buffer_desc_t *src_buffer_desc; mca_coll_ml_collective_operation_progress_t *new_op; mca_coll_ml_module_t *ml_module = OP_ML_MODULE(coll_op); bool scontig = coll_op->fragment_data.message_descriptor->send_data_continguous; ompi_datatype_type_size(coll_op->variable_fn_params.dtype, &dt_size); /* Keep the pipeline filled with fragments */ while (coll_op->fragment_data.message_descriptor->n_active < coll_op->fragment_data.message_descriptor->pipeline_depth) { /* If an active fragment happens to have completed the collective during * a hop into the progress engine, then don't launch a new fragment, * instead break and return. */ if (coll_op->fragment_data.message_descriptor->n_bytes_scheduled == coll_op->fragment_data.message_descriptor->n_bytes_total) { break; } /* Get an ml buffer */ src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module); if (NULL == src_buffer_desc) { /* If there exist outstanding fragments, then break out * and let an active fragment deal with this later, * there are no buffers available. */ if (0 < coll_op->fragment_data.message_descriptor->n_active) { return OMPI_SUCCESS; } else { /* The fragment is already on list and * the we still have no ml resources * Return busy */ if (coll_op->pending & REQ_OUT_OF_MEMORY) { ML_VERBOSE(10,("Out of resources %p", coll_op)); return OMPI_ERR_TEMP_OUT_OF_RESOURCE; } coll_op->pending |= REQ_OUT_OF_MEMORY; opal_list_append(&((OP_ML_MODULE(coll_op))->waiting_for_memory_list), (opal_list_item_t *)coll_op); ML_VERBOSE(10,("Out of resources %p adding to pending queue", coll_op)); return OMPI_ERR_TEMP_OUT_OF_RESOURCE; } } /* Get a new collective descriptor and initialize it */ new_op = mca_coll_ml_alloc_op_prog_single_frag_dag(ml_module, ml_module->coll_ml_allgather_functions[ML_SMALL_DATA_ALLGATHER], coll_op->fragment_data.message_descriptor->src_user_addr, coll_op->fragment_data.message_descriptor->dest_user_addr, coll_op->fragment_data.message_descriptor->n_bytes_total, coll_op->fragment_data.message_descriptor->n_bytes_scheduled); new_op->fragment_data.current_coll_op = coll_op->fragment_data.current_coll_op; new_op->fragment_data.message_descriptor = coll_op->fragment_data.message_descriptor; /* set the task setup callback */ new_op->sequential_routine.seq_task_setup = mca_coll_ml_allgather_task_setup; /* MCA_COLL_IBOFFLOAD_SET_ML_BUFFER_INFO(new_op, src_buffer_desc->buffer_index, src_buffer_desc); */ /* We need this address for pointer arithmetic in memcpy */ buf = coll_op->fragment_data.message_descriptor->src_user_addr; if (!scontig) { frag_len = ml_module->small_message_thresholds[BCOL_ALLGATHER]; mca_coll_ml_convertor_get_send_frag_size( ml_module, &frag_len, coll_op->fragment_data.message_descriptor); mca_coll_ml_convertor_pack( (void *) ((uintptr_t) src_buffer_desc->data_addr + frag_len * coll_op->coll_schedule->topo_info->hier_layout_info[0].offset + frag_len * coll_op->coll_schedule->topo_info->hier_layout_info[0].level_one_index), frag_len, &coll_op->fragment_data.message_descriptor->send_convertor); } else { /* calculate new frag length, there are some issues here */ frag_len = (coll_op->fragment_data.message_descriptor->n_bytes_total - coll_op->fragment_data.message_descriptor->n_bytes_scheduled < coll_op->fragment_data.fragment_size ? 
coll_op->fragment_data.message_descriptor->n_bytes_total - coll_op->fragment_data.message_descriptor->n_bytes_scheduled : coll_op->fragment_data.fragment_size); /* everybody copies in, based on the new values */ memcpy((void *) ((uintptr_t)src_buffer_desc->data_addr + frag_len * new_op->coll_schedule->topo_info->hier_layout_info[0].offset + frag_len * new_op->coll_schedule->topo_info->hier_layout_info[0].level_one_index), (void *) ((uintptr_t) buf + (uintptr_t) coll_op->fragment_data.message_descriptor->n_bytes_scheduled), frag_len); } new_op->variable_fn_params.sbuf = (void *) src_buffer_desc->data_addr; new_op->variable_fn_params.rbuf = (void *) src_buffer_desc->data_addr; /* update the number of bytes scheduled */ new_op->fragment_data.message_descriptor->n_bytes_scheduled += frag_len; /* everyone needs an unpack function */ new_op->process_fn = mca_coll_ml_allgather_noncontiguous_unpack_data; new_op->fragment_data.fragment_size = frag_len; new_op->fragment_data.buffer_desc = src_buffer_desc; /* Setup fragment specific data */ ++(new_op->fragment_data.message_descriptor->n_active); ML_VERBOSE(10, ("Start more, My index %d ", new_op->fragment_data.buffer_desc->buffer_index)); /* this is a bit buggy */ ML_SET_VARIABLE_PARAMS_BCAST( new_op, OP_ML_MODULE(new_op), frag_len /* yes, we have consistent units, so this makes sense */, MPI_BYTE /* we fragment according to buffer size * we don't reduce the data thus we needn't * keep "whole" datatypes, we may freely * fragment without regard for multiples * of any specific datatype */, src_buffer_desc, 0, 0, frag_len, src_buffer_desc->data_addr); /* initialize first coll */ ret = new_op->sequential_routine.seq_task_setup(new_op); if (OMPI_SUCCESS != ret) { ML_VERBOSE(3, ("Fragment failed to initialize itself")); return ret; } new_op->variable_fn_params.buffer_size = frag_len; new_op->variable_fn_params.hier_factor = coll_op->variable_fn_params.hier_factor; new_op->variable_fn_params.root = 0; MCA_COLL_ML_SET_NEW_FRAG_ORDER_INFO(new_op); /* append this collective !! */ OPAL_THREAD_LOCK(&(mca_coll_ml_component.sequential_collectives_mutex)); opal_list_append(&mca_coll_ml_component.sequential_collectives, (opal_list_item_t *)new_op); OPAL_THREAD_UNLOCK(&(mca_coll_ml_component.sequential_collectives_mutex)); } return OMPI_SUCCESS; }
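/*
 * Illustrative sketch (not part of this module): the contiguous branch above clamps the
 * fragment length so the final fragment only carries whatever is left of the message.
 * The clamp in isolation (hypothetical helper):
 */
static inline size_t example_next_frag_len(size_t bytes_total, size_t bytes_scheduled,
                                           size_t fragment_size)
{
    size_t remaining = bytes_total - bytes_scheduled;

    return (remaining < fragment_size) ? remaining : fragment_size;
}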
/* * Fill up the collective descriptor * */ static int mca_coll_ml_build_static_reduce_schedule( mca_coll_ml_topology_t *topo_info, mca_coll_ml_collective_operation_description_t **coll_desc) { int i_hier, j_hier, n_fcns, n_hiers = topo_info->n_levels; int *scratch_indx = NULL, *scratch_num = NULL; int cnt, value_to_set = 0; int ret = OMPI_SUCCESS; bool prev_is_zero; mca_coll_ml_compound_functions_t *comp_fns_temp; mca_bcol_base_module_t *prev_bcol, *bcol_module; mca_coll_ml_compound_functions_t *comp_fn; mca_coll_ml_collective_operation_description_t *schedule = NULL; /* zero the descriptor so the error path can safely free partially initialized fields */ *coll_desc = (mca_coll_ml_collective_operation_description_t *) calloc(1, sizeof(mca_coll_ml_collective_operation_description_t)); schedule = *coll_desc; if (OPAL_UNLIKELY(NULL == schedule)) { ML_ERROR(("Can't allocate memory.\n")); ret = OMPI_ERR_OUT_OF_RESOURCE; goto Error; } scratch_indx = (int *) malloc(sizeof(int) * (n_hiers)); if (NULL == scratch_indx) { ML_ERROR(("Can't allocate memory.\n")); ret = OMPI_ERR_OUT_OF_RESOURCE; goto Error; } scratch_num = (int *) malloc(sizeof(int) * (n_hiers)); if (NULL == scratch_num) { ML_ERROR(("Can't allocate memory.\n")); ret = OMPI_ERR_OUT_OF_RESOURCE; goto Error; } prev_bcol = NULL; /* Calculate scratch indices and run lengths */ for (i_hier = 0; i_hier < n_hiers; i_hier++) { if (IS_BCOL_TYPE_IDENTICAL(prev_bcol, GET_BCOL(topo_info, i_hier))) { scratch_indx[i_hier] = scratch_indx[i_hier - 1] + 1; } else { scratch_indx[i_hier] = 0; prev_bcol = GET_BCOL(topo_info, i_hier); } } --i_hier; prev_is_zero = true; do { if (prev_is_zero) { value_to_set = scratch_indx[i_hier] + 1; prev_is_zero = false; } if (0 == scratch_indx[i_hier]) { prev_is_zero = true; } scratch_num[i_hier] = value_to_set; --i_hier; } while(i_hier >= 0); /* All hierarchies call one function, unlike other collectives */ n_fcns = n_hiers; /* Set dependencies equal to number of hierarchies */ schedule->n_fns = n_fcns; schedule->topo_info = topo_info; schedule->progress_type = 0; /* Allocate the component functions */ schedule->component_functions = (struct mca_coll_ml_compound_functions_t *) calloc(n_fcns, sizeof(struct mca_coll_ml_compound_functions_t)); if (OPAL_UNLIKELY(NULL == schedule->component_functions)) { ML_ERROR(("Can't allocate memory.\n")); ret = OMPI_ERR_OUT_OF_RESOURCE; goto Error; } for (i_hier = 0; i_hier < n_hiers; ++i_hier) { comp_fn = &schedule->component_functions[i_hier]; /* The hierarchical level */ comp_fn->h_level = i_hier; bcol_module = GET_BCOL(topo_info, i_hier); comp_fn->bcol_function = bcol_module->filtered_fns_table[DATA_SRC_KNOWN][NON_BLOCKING][BCOL_REDUCE][1][0][0]; strcpy(comp_fn->fn_name, "REDUCE"); ML_VERBOSE(10, ("func indx %d set to %p", i_hier, comp_fn->bcol_function)); ML_VERBOSE(1,("In ML_REDUCE_SETUP .. looks fine here"));
/* Set the task completion function for the static reduce */ comp_fn->task_comp_fn = mca_coll_ml_task_comp_static_reduce; /* Constants */ comp_fn->constant_group_data.bcol_module = bcol_module; comp_fn->constant_group_data.index_in_consecutive_same_bcol_calls = scratch_indx[i_hier]; comp_fn->constant_group_data.n_of_this_type_in_a_row = scratch_num[i_hier]; comp_fn->constant_group_data.n_of_this_type_in_collective = 0; comp_fn->constant_group_data.index_of_this_type_in_collective = 0; ML_VERBOSE(10, ("Setting collective [reduce] fn_idx %d, n_of_this_type_in_a_row %d, " "index_in_consecutive_same_bcol_calls %d.", i_hier, comp_fn->constant_group_data.n_of_this_type_in_a_row, comp_fn->constant_group_data.index_in_consecutive_same_bcol_calls)); } /* Fill the rest of constant data */ for (i_hier = 0; i_hier < n_hiers; i_hier++) { mca_bcol_base_module_t *current_bcol = schedule->component_functions[i_hier]. constant_group_data.bcol_module; cnt = 0; for (j_hier = 0; j_hier < n_hiers; j_hier++) { if (current_bcol == schedule->component_functions[j_hier]. constant_group_data.bcol_module) { schedule->component_functions[j_hier]. constant_group_data.index_of_this_type_in_collective = cnt; cnt++; } } schedule->component_functions[i_hier]. constant_group_data.n_of_this_type_in_collective = cnt; } /* Manju: Reduction should always use the fixed schedule. * The subgroups in which this process is the leader should be executed first, * then the subgroups in which it is not a leader, and finally the subgroup * that includes the root. */ /* Allocate the schedule list */ schedule->comp_fn_arr = (struct mca_coll_ml_compound_functions_t **) calloc(n_hiers,sizeof(struct mca_coll_ml_compound_functions_t *)); if (NULL == schedule->comp_fn_arr) { ML_ERROR(("Can't allocate memory.\n")); ret = OMPI_ERR_OUT_OF_RESOURCE; goto Error; } /* Now that the functions have been set up properly, we can simply permute the ordering a bit */ for (i_hier = 0; i_hier < n_hiers; i_hier++) { /* first one is trivial */ int leader_hierarchy = 0; int non_leader_hierarchy = 0; int func_index; comp_fns_temp = (struct mca_coll_ml_compound_functions_t *) calloc(n_hiers, sizeof(struct mca_coll_ml_compound_functions_t)); if (NULL == comp_fns_temp) { ML_ERROR(("Can't allocate memory.\n")); ret = OMPI_ERR_OUT_OF_RESOURCE; goto Error; } leader_hierarchy = 0; non_leader_hierarchy = n_hiers - 2; for(j_hier = 0; j_hier < n_hiers - 1 ; j_hier++) { func_index = j_hier < i_hier ? j_hier : j_hier + 1; /* I'm a leader for this group */ if (0 == topo_info->component_pairs->subgroup_module->my_index) { comp_fns_temp[leader_hierarchy++] = schedule->component_functions[func_index]; } else { comp_fns_temp[non_leader_hierarchy--] = schedule->component_functions[func_index]; } } comp_fns_temp[j_hier] = schedule->component_functions[i_hier]; /* now let's attach this list to our array of lists */ schedule->comp_fn_arr[i_hier] = comp_fns_temp; } /* Manju: Do we need this ? */ /* I'm going to just loop over each schedule and * set up the scratch indices, scratch numbers * and other constant data */ /* for( i_hier = 1; i_hier < n_hiers; i_hier++) { ret = mca_coll_ml_setup_scratch_vals(schedule->comp_fn_arr[i_hier], scratch_indx, scratch_num, n_hiers); if( OMPI_SUCCESS != ret ) { ret = OMPI_ERROR; goto Error; } } */ /* Do I need this ? */
schedule->task_setup_fn[COLL_ML_ROOT_TASK_FN] = mca_coll_ml_static_reduce_root; schedule->task_setup_fn[COLL_ML_GENERAL_TASK_FN] = mca_coll_ml_static_reduce_non_root; MCA_COLL_ML_SET_SCHEDULE_ORDER_INFO(schedule); free(scratch_num); free(scratch_indx); return OMPI_SUCCESS; Error: /* clean up everything allocated so far; schedule may still be NULL here */ if (NULL != scratch_indx) { free(scratch_indx); } if (NULL != scratch_num) { free(scratch_num); } if (NULL != schedule) { if (NULL != schedule->component_functions) { free(schedule->component_functions); schedule->component_functions = NULL; } free(schedule); *coll_desc = NULL; } return ret; }
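/*
 * Illustrative sketch (hypothetical helper, not used by the schedule
 * builder): the two passes above compute, for every hierarchy level, its
 * position within a run of consecutive identical bcol modules
 * (scratch_indx) and the length of that run (scratch_num).  The version
 * below restates the same two passes over a plain array of module
 * pointers; pointer equality stands in for IS_BCOL_TYPE_IDENTICAL.
 */
#if 0
static void ml_count_consecutive_bcols_sketch(mca_bcol_base_module_t **bcols,
                                              int n_levels,
                                              int *scratch_indx,
                                              int *scratch_num)
{
    int i, run_len = 0;
    bool start_of_run = true;

    /* forward pass: index of each level within its run */
    for (i = 0; i < n_levels; ++i) {
        scratch_indx[i] = (i > 0 && bcols[i] == bcols[i - 1]) ?
            scratch_indx[i - 1] + 1 : 0;
    }

    /* backward pass: every member of a run is tagged with the run length,
     * which equals the last member's index + 1 */
    for (i = n_levels - 1; i >= 0; --i) {
        if (start_of_run) {
            run_len = scratch_indx[i] + 1;
            start_of_run = false;
        }
        scratch_num[i] = run_len;
        if (0 == scratch_indx[i]) {
            start_of_run = true; /* the next (lower) level begins a new run */
        }
    }
}
#endif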
int ml_coll_up_and_down_hier_setup(mca_coll_ml_module_t *ml_module, mca_coll_ml_topology_t *topo_info, int up_function_idx, int top_function_idx, int down_function_idx, int collective) { /* local variables */ int i, j, cnt, value_to_set = -1; int ret = OMPI_SUCCESS, num_up_levels; int num_hierarchies = topo_info->n_levels; int global_high_hierarchy_index = topo_info->global_highest_hier_group_index; bool call_for_top_function, prev_is_zero; int *scratch_indx = NULL, *scratch_num = NULL; coll_ml_collective_description_t *collective_alg = NULL; mca_bcol_base_module_t *bcol_module = NULL, *prev_bcol = NULL; /* RLG: one blocking barrier collective algorithm - this is really a hack, * we need to figure out how to do this in a bit more extensible * manner. */ collective_alg = (coll_ml_collective_description_t *) malloc(sizeof(coll_ml_collective_description_t)); if (NULL == collective_alg) { ML_ERROR(("Can't allocate memory.")); ret = OMPI_ERR_OUT_OF_RESOURCE; goto Error; } /* am I a member of the highest level subgroup ? */ if (global_high_hierarchy_index == topo_info->component_pairs[num_hierarchies - 1].bcol_index) { /* A process that is a member of the highest-level subgroup calls the top algorithm in addition to the fan-in/out steps */ call_for_top_function = true; /* the highest level runs only the top algorithm, so we deduct 1 */ num_up_levels = num_hierarchies - 1; /* The top algorithm is called only once, so we deduct 1 */ collective_alg->n_functions = 2 * num_hierarchies - 1; } else { /* The process is not a member of the highest-level subgroup; as a result it does not call the top algorithm, but it does call all fan-in/out steps */ call_for_top_function = false; num_up_levels = num_hierarchies; collective_alg->n_functions = 2 * num_hierarchies; } ML_VERBOSE(10, ("high_index %d == bcol_index %d: Call top %d, num_up_levels %d, collective_alg->n_functions %d", global_high_hierarchy_index, topo_info->component_pairs[num_hierarchies - 1].bcol_index, call_for_top_function, num_up_levels, collective_alg->n_functions )); /* allocate space for the functions */ collective_alg->functions = (mca_bcol_base_function_t *) calloc(collective_alg->n_functions, sizeof(mca_bcol_base_function_t)); if( NULL == collective_alg->functions) { ML_ERROR(("Can't allocate memory.")); ret = OMPI_ERR_OUT_OF_RESOURCE; goto Error; } /* Algorithm Description: * ===================== * The algorithm used here for an N-level system: * - up to level N-2, inclusive: up algorithm (fan-in for barrier, reduce for allreduce) * - level N-1: top algorithm (barrier or allreduce) * - level N-2 down to level 0: down algorithm (fan-out) */ /* Starting scratch_num and scratch_index calculations */ /* =================================================== */ /* Figure out how many of the same bcols are called in a row. * The index of the bcol within its row is stored in scratch_indx and * the total number of bcols in the row in scratch_num */ scratch_indx = (int *) calloc (2 * num_hierarchies, sizeof (int)); if(NULL == scratch_indx) { ML_ERROR(("Can't allocate memory.")); ret = OMPI_ERR_OUT_OF_RESOURCE; goto Error; } scratch_num = (int *) malloc(sizeof(int) * (2 * num_hierarchies)); if(NULL == scratch_num) { ML_ERROR(("Can't allocate memory.")); ret = OMPI_ERR_OUT_OF_RESOURCE; goto Error; } /* We go through all stages of the algorithm (up, top, down) * and calculate the bcol index.
If the previous bcol is the same type as the current * one the counter index is increased; otherwise the index is reset to zero */ prev_bcol = NULL; /* going up */ for (i = 0, cnt = 0; i < num_up_levels; ++i, ++cnt) { if (IS_BCOL_TYPE_IDENTICAL(prev_bcol, GET_BCOL(topo_info, i))) { scratch_indx[cnt] = scratch_indx[cnt - 1] + 1; } else { scratch_indx[cnt] = 0; prev_bcol = GET_BCOL(topo_info, i); } } /* top - only if this process is a member of the globally highest-level subgroup */ if (call_for_top_function) { if (IS_BCOL_TYPE_IDENTICAL(prev_bcol, GET_BCOL(topo_info, num_hierarchies - 1))) { scratch_indx[cnt] = scratch_indx[cnt - 1] + 1; } else { scratch_indx[cnt] = 0; prev_bcol = GET_BCOL(topo_info, num_hierarchies - 1); } ++cnt; } /* going down */ for (i = num_up_levels - 1; i >= 0; --i, ++cnt) { if (IS_BCOL_TYPE_IDENTICAL(prev_bcol, GET_BCOL(topo_info, i))) { scratch_indx[cnt] = scratch_indx[cnt - 1] + 1; } else { scratch_indx[cnt] = 0; prev_bcol = GET_BCOL(topo_info, i); } } /* * Calculate the number of identical bcols in each row. * We walk the index array backwards; when an index is zero * the row is done and we start on the next row. The count * for a row equals the maximal bcol index in the row + 1 */ i = cnt - 1; prev_is_zero = true; do { if (prev_is_zero) { value_to_set = scratch_indx[i] + 1; prev_is_zero = false; } if (0 == scratch_indx[i]) { prev_is_zero = true; } scratch_num[i] = value_to_set; --i; } while(i >= 0); /* =========================================================== */ /* We are done with scratch_num and scratch_index calculations */ /* Setup the function call for each algorithm step */ cnt = 0; /* up phase */ for (i = 0; i < num_up_levels; i++) { bcol_module = GET_BCOL(topo_info, i); collective_alg->functions[cnt].fn_idx = up_function_idx; collective_alg->functions[cnt].bcol_module = bcol_module; collective_alg->functions[cnt].index_in_consecutive_same_bcol_calls = scratch_indx[cnt]; collective_alg->functions[cnt].n_of_this_type_in_a_row = scratch_num[cnt]; ML_VERBOSE(10, ("Setting collective [collective code %d][count %d], fn_idx %d, index_in_consecutive_same_bcol_calls %d, n_of_this_type_in_a_row %d", collective, cnt, collective_alg->functions[cnt].fn_idx, collective_alg->functions[cnt].index_in_consecutive_same_bcol_calls, collective_alg->functions[cnt].n_of_this_type_in_a_row)); ++cnt; } /* top function */ if (call_for_top_function) { bcol_module = GET_BCOL(topo_info, num_hierarchies - 1); collective_alg->functions[cnt].fn_idx = top_function_idx; collective_alg->functions[cnt].bcol_module = bcol_module; collective_alg->functions[cnt].index_in_consecutive_same_bcol_calls = scratch_indx[cnt]; collective_alg->functions[cnt].n_of_this_type_in_a_row = scratch_num[cnt]; ML_VERBOSE(10, ("Setting collective [collective code %d][count %d], fn_idx %d, index_in_consecutive_same_bcol_calls %d, n_of_this_type_in_a_row %d", collective, cnt, collective_alg->functions[cnt].fn_idx, collective_alg->functions[cnt].index_in_consecutive_same_bcol_calls, collective_alg->functions[cnt].n_of_this_type_in_a_row)); ++cnt; } /* down phase */ for (i = num_up_levels - 1; i >= 0; i--) { bcol_module = GET_BCOL(topo_info, i); collective_alg->functions[cnt].fn_idx = down_function_idx; collective_alg->functions[cnt].bcol_module = bcol_module; collective_alg->functions[cnt].index_in_consecutive_same_bcol_calls = scratch_indx[cnt]; collective_alg->functions[cnt].n_of_this_type_in_a_row = scratch_num[cnt]; ML_VERBOSE(10, ("Setting collective [collective code %d][count %d], fn_idx %d, index_in_consecutive_same_bcol_calls %d, n_of_this_type_in_a_row %d", collective, cnt, collective_alg->functions[cnt].fn_idx, collective_alg->functions[cnt].index_in_consecutive_same_bcol_calls, collective_alg->functions[cnt].n_of_this_type_in_a_row)); ++cnt; }
/* figure out how many times this bcol is used in this collective call */ for (i = 0; i < collective_alg->n_functions; i++) { mca_bcol_base_module_t *current_bcol= collective_alg->functions[i].bcol_module; cnt = 0; for (j = 0; j < collective_alg->n_functions; ++j) { if (current_bcol == collective_alg->functions[j].bcol_module) { collective_alg->functions[j].index_of_this_type_in_collective = cnt; ML_VERBOSE(10, ("Pasha: Setting collective [collective code %d][count %d], fn_idx %d, collective_alg->functions[i].index_of_this_type_in_collective %d", collective, cnt, i, collective_alg->functions[j].index_of_this_type_in_collective)); cnt++; } } collective_alg->functions[i].n_of_this_type_in_collective=cnt; ML_VERBOSE(10, ("Pasha: Setting collective [collective code %d][count %d], fn_idx %d, collective_alg->functions[i].n_of_this_type_in_collective %d", collective, cnt, i, collective_alg->functions[i].n_of_this_type_in_collective)); } /* store the hierarchical algorithm for this collective */ topo_info->hierarchical_algorithms[collective] = collective_alg; /* Set up the maximum number of function calls; it is used for resource allocation */ ml_module->max_fn_calls = (collective_alg->n_functions > ml_module->max_fn_calls) ? collective_alg->n_functions : ml_module->max_fn_calls; /* Ishai: What is this n_buffers? I did not find where it is being used */ topo_info->hierarchical_algorithms[collective]->n_buffers = 1; /* Release temporary memory */ if (NULL != scratch_indx) { free(scratch_indx); } if (NULL != scratch_num) { free(scratch_num); } return OMPI_SUCCESS; Error: if (NULL != collective_alg) { if (NULL != collective_alg->functions) { free(collective_alg->functions); } free(collective_alg); } if (NULL != scratch_indx) { free(scratch_indx); } if (NULL != scratch_num) { free(scratch_num); } return ret; }
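/*
 * Illustrative sketch (hypothetical helper): the schedule built above has an
 * up phase over num_up_levels levels, an optional top call (only for members
 * of the highest-level subgroup), and a mirrored down phase, which is where
 * the 2 * num_hierarchies - 1 versus 2 * num_hierarchies function counts
 * come from.
 */
#if 0
static int ml_up_down_n_functions_sketch(int num_hierarchies,
                                         bool call_for_top_function)
{
    /* members of the highest-level subgroup replace one up/down pair at the
     * top level with a single top call */
    int num_up_levels = call_for_top_function ?
        num_hierarchies - 1 : num_hierarchies;

    /* up phase + optional top function + down phase */
    return num_up_levels + (call_for_top_function ? 1 : 0) + num_up_levels;
}
#endif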