int bcol_basesmuma_fanout_init(mca_bcol_base_module_t *super)
{
    mca_bcol_base_coll_fn_comm_attributes_t comm_attribs;
    mca_bcol_base_coll_fn_invoke_attributes_t inv_attribs;

    BASESMUMA_VERBOSE(10, ("Basesmuma Fan-Out register.\n"));

    comm_attribs.bcoll_type = BCOL_FANOUT;
    comm_attribs.comm_size_min = 0;
    comm_attribs.comm_size_max = 1024 * 1024;
    comm_attribs.waiting_semantics = NON_BLOCKING;

    inv_attribs.bcol_msg_min = 0;
    inv_attribs.bcol_msg_max = 20000; /* range 1 */

    inv_attribs.datatype_bitmap = 0xffffffff;
    inv_attribs.op_types_bitmap = 0xffffffff;

    comm_attribs.data_src = DATA_SRC_KNOWN;

    mca_bcol_base_set_attributes(super, &comm_attribs, &inv_attribs,
                                 bcol_basesmuma_fanout_new,
                                 bcol_basesmuma_fanout_new);

    return OMPI_SUCCESS;
}
/*
 * Allocate buffers for storing non-blocking collective descriptions,
 * required for making the code re-entrant.
 */
static int init_nb_coll_buff_desc(mca_bcol_basesmuma_nb_coll_buff_desc_t **desc,
                                  void *base_addr, uint32_t num_banks,
                                  uint32_t num_buffers_per_bank,
                                  uint32_t size_buffer,
                                  uint32_t header_size,
                                  int group_size,
                                  int pow_k)
{
    uint32_t i, j, ci;
    mca_bcol_basesmuma_nb_coll_buff_desc_t *tmp_desc = NULL;
    int k_nomial_radix = mca_bcol_basesmuma_component.k_nomial_radix;
    int pow_k_val = (0 == pow_k) ? 1 : pow_k;
    int num_to_alloc = (k_nomial_radix - 1) * pow_k_val * 2 + 1;

    *desc = (mca_bcol_basesmuma_nb_coll_buff_desc_t *)
        calloc(num_banks * num_buffers_per_bank,
               sizeof(mca_bcol_basesmuma_nb_coll_buff_desc_t));
    if (NULL == *desc) {
        return OMPI_ERROR;
    }

    tmp_desc = *desc;

    for (i = 0; i < num_banks; i++) {
        for (j = 0; j < num_buffers_per_bank; j++) {
            ci = i * num_buffers_per_bank + j;
            tmp_desc[ci].bank_index = i;
            tmp_desc[ci].buffer_index = j;
            /* *2 is for the gather session, +1 for the extra peer */
            tmp_desc[ci].requests = (ompi_request_t **)
                calloc(num_to_alloc, sizeof(ompi_request_t *));
            tmp_desc[ci].data_addr = (void *)
                ((unsigned char *) base_addr + ci * size_buffer + header_size);
            BASESMUMA_VERBOSE(10, ("ml memory cache setup %d %d - %p",
                                   i, j, tmp_desc[ci].data_addr));
        }
    }

    return OMPI_SUCCESS;
}
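#if 0
/*
 * Illustrative sketch (not part of the original source) of the
 * request-count arithmetic above: each of the pow_k levels of the
 * k-nomial exchange can post up to (radix - 1) requests, doubled for
 * the gather session, plus one slot for the extra peer. For example,
 * with radix 2 and pow_k 3 this yields (2 - 1) * 3 * 2 + 1 = 7 request
 * slots per buffer descriptor. The helper name is hypothetical; the
 * arithmetic mirrors init_nb_coll_buff_desc() above.
 */
static inline int nb_coll_requests_needed(int k_nomial_radix, int pow_k)
{
    int pow_k_val = (0 == pow_k) ? 1 : pow_k;
    return (k_nomial_radix - 1) * pow_k_val * 2 + 1;
}
#endif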
/* New init function used for the new control scheme where we put the
 * control struct at the top of the payload buffer */
int bcol_basesmuma_bank_init_opti(struct mca_bcol_base_memory_block_desc_t *payload_block,
                                  uint32_t data_offset,
                                  mca_bcol_base_module_t *bcol_module,
                                  void *reg_data)
{
    /* assumption here is that the block has been registered with the
     * sm bcol, hence has been mapped by each process; need to be sure
     * that the memory is mapped amongst sm peers */

    /* local variables */
    int ret = OMPI_SUCCESS, i, j;
    sm_buffer_mgmt *pload_mgmt;
    mca_bcol_basesmuma_component_t *cs = &mca_bcol_basesmuma_component;
    bcol_basesmuma_registration_data_t *sm_reg_data =
        (bcol_basesmuma_registration_data_t *) reg_data;
    mca_bcol_basesmuma_module_t *sm_bcol =
        (mca_bcol_basesmuma_module_t *) bcol_module;
    mca_bcol_base_memory_block_desc_t *ml_block = payload_block;
    size_t malloc_size;
    bcol_basesmuma_smcm_file_t input_file;
    int leading_dim, loop_limit, buf_id;
    unsigned char *base_ptr;
    mca_bcol_basesmuma_module_t *sm_bcol_module =
        (mca_bcol_basesmuma_module_t *) bcol_module;
    int my_idx, array_id;
    mca_bcol_basesmuma_header_t *ctl_ptr;
    void **results_array, *mem_offset;
    mca_bcol_basesmuma_local_mlmem_desc_t *ml_mem = &sm_bcol_module->ml_mem;

    /* first, we get a pointer to the payload buffer management struct */
    pload_mgmt = &(sm_bcol->colls_with_user_data);

    /* go ahead and get the header size that is cached on the payload block */
    sm_bcol->total_header_size = data_offset;

    /* allocate memory for pointers to mine and my peers' payload buffers;
     * the difference here is that we now use the new data struct */
    malloc_size = ml_block->num_banks * ml_block->num_buffers_per_bank *
        pload_mgmt->size_of_group * sizeof(mca_bcol_basesmuma_payload_t);
    pload_mgmt->data_buffs = (mca_bcol_basesmuma_payload_t *) malloc(malloc_size);
    if (!pload_mgmt->data_buffs) {
        ret = OMPI_ERR_OUT_OF_RESOURCE;
        goto exit_ERROR;
    }

    /* allocate some memory to hold the offsets */
    results_array = (void **) malloc(pload_mgmt->size_of_group * sizeof(void *));
    if (NULL == results_array) {
        ret = OMPI_ERR_OUT_OF_RESOURCE;
        goto exit_ERROR;
    }

    /* set up the input file for the shared memory connection manager */
    input_file.file_name = sm_reg_data->file_name;
    input_file.size = sm_reg_data->size;
    input_file.size_ctl_structure = 0;
    input_file.data_seg_alignment = BASESMUMA_CACHE_LINE_SIZE;
    input_file.mpool_size = sm_reg_data->size;

    /* call the connection manager and map my shared memory peers' files */
    ret = bcol_basesmuma_smcm_allgather_connection(
        sm_bcol,
        sm_bcol->super.sbgp_partner_module,
        &(cs->sm_connections_list),
        &(sm_bcol->payload_backing_files_info),
        sm_bcol->super.sbgp_partner_module->group_comm,
        input_file, cs->payload_base_fname,
        false);
    if (OMPI_SUCCESS != ret) {
        goto exit_ERROR;
    }

    /* now we exchange offset info - don't assume symmetric virtual memory */
    mem_offset = (void *) ((uintptr_t) ml_block->block->base_addr -
                           (uintptr_t) cs->sm_payload_structs->data_addr);

    /* call into the exchange offsets function */
    ret = comm_allgather_pml(&mem_offset, results_array, sizeof(void *),
                             MPI_BYTE,
                             sm_bcol_module->super.sbgp_partner_module->my_index,
                             sm_bcol_module->super.sbgp_partner_module->group_size,
                             sm_bcol_module->super.sbgp_partner_module->group_list,
                             sm_bcol_module->super.sbgp_partner_module->group_comm);
    if (OMPI_SUCCESS != ret) {
        goto exit_ERROR;
    }

    /* convert each memory offset to a virtual address in the current rank */
    leading_dim = pload_mgmt->size_of_group;
    loop_limit = ml_block->num_banks * ml_block->num_buffers_per_bank;
    for (i = 0; i < sm_bcol_module->super.sbgp_partner_module->group_size; i++) {

        /* get the base pointer */
        int array_id = SM_ARRAY_INDEX(leading_dim, 0, i);
        if (i == sm_bcol_module->super.sbgp_partner_module->my_index) {
            /* me */
            base_ptr = cs->sm_payload_structs->map_addr;
        } else {
            base_ptr = sm_bcol_module->payload_backing_files_info[i]->
                sm_mmap->map_addr;
        }

        /* first, set the pointer to the control struct */
        pload_mgmt->data_buffs[array_id].ctl_struct = (mca_bcol_basesmuma_header_t *)
            (uintptr_t) (((uint64_t)(uintptr_t) results_array[array_id]) +
                         (uint64_t)(uintptr_t) base_ptr);

        /* second, calculate where to set the data pointer */
        pload_mgmt->data_buffs[array_id].payload = (void *)
            (uintptr_t) ((uint64_t)(uintptr_t) pload_mgmt->data_buffs[array_id].ctl_struct +
                         (uint64_t)(uintptr_t) data_offset);

        for (buf_id = 1; buf_id < loop_limit; buf_id++) {
            int array_id_m1 = SM_ARRAY_INDEX(leading_dim, (buf_id - 1), i);
            array_id = SM_ARRAY_INDEX(leading_dim, buf_id, i);

            /* now, play the same game as above:
             * first, set the control struct's position */
            pload_mgmt->data_buffs[array_id].ctl_struct = (mca_bcol_basesmuma_header_t *)
                (uintptr_t) (((uint64_t)(uintptr_t) (pload_mgmt->data_buffs[array_id_m1].ctl_struct) +
                              (uint64_t)(uintptr_t) ml_block->size_buffer));

            /* second, set the payload pointer */
            pload_mgmt->data_buffs[array_id].payload = (void *)
                (uintptr_t) ((uint64_t)(uintptr_t) pload_mgmt->data_buffs[array_id].ctl_struct +
                             (uint64_t)(uintptr_t) data_offset);
        }
    }

    /* done with the index array */
    free(results_array);

    /* initialize my control structures */
    my_idx = sm_bcol_module->super.sbgp_partner_module->my_index;
    leading_dim = sm_bcol_module->super.sbgp_partner_module->group_size;
    for (buf_id = 0; buf_id < loop_limit; buf_id++) {
        array_id = SM_ARRAY_INDEX(leading_dim, buf_id, my_idx);
        ctl_ptr = pload_mgmt->data_buffs[array_id].ctl_struct;

        /* initialize the data structures */
        for (j = 0; j < SM_BCOLS_MAX; j++) {
            for (i = 0; i < NUM_SIGNAL_FLAGS; i++) {
                ctl_ptr->flags[i][j] = -1;
            }
        }
        ctl_ptr->sequence_number = -1;
        ctl_ptr->src = -1;
    }

    /* set up the data structures needed for releasing the payload
     * buffers back to the ml level */
    for (i = 0; i < (int) ml_block->num_banks; i++) {
        sm_bcol->colls_with_user_data.
            ctl_buffs_mgmt[i].nb_barrier_desc.ml_memory_block_descriptor = ml_block;
    }

    ml_mem->num_banks = ml_block->num_banks;
    ml_mem->bank_release_counter = calloc(ml_block->num_banks, sizeof(uint32_t));
    ml_mem->num_buffers_per_bank = ml_block->num_buffers_per_bank;
    ml_mem->size_buffer = ml_block->size_buffer;
    /* pointer to the ml level descriptor */
    ml_mem->ml_mem_desc = ml_block;

    if (OMPI_SUCCESS != init_nb_coll_buff_desc(&ml_mem->nb_coll_desc,
                                               ml_block->block->base_addr,
                                               ml_mem->num_banks,
                                               ml_mem->num_buffers_per_bank,
                                               ml_mem->size_buffer,
                                               data_offset,
                                               sm_bcol_module->super.sbgp_partner_module->group_size,
                                               sm_bcol_module->pow_k)) {
        BASESMUMA_VERBOSE(10, ("Failed to allocate memory descriptors for "
                               "storing the state of non-blocking collectives\n"));
        return OMPI_ERROR;
    }

    return OMPI_SUCCESS;

exit_ERROR:
    return ret;
}
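#if 0
/*
 * Minimal sketch (illustrative, not part of the original source) of the
 * offset exchange performed above: virtual addresses are not portable
 * between processes, so each rank publishes its buffer's offset
 * relative to its own mapping of the backing file, and every peer
 * rebuilds a valid local pointer by adding that offset to wherever *it*
 * mapped the same file. The helper name is hypothetical.
 */
static inline void *sm_offset_to_local_addr(void *exchanged_offset,
                                            unsigned char *local_map_base)
{
    /* exchanged_offset = remote base_addr - remote map_addr, as
     * distributed via comm_allgather_pml() above */
    return (void *) (local_map_base + (uintptr_t) exchanged_offset);
}
#endif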
static void mca_bcol_basesmuma_module_destruct(mca_bcol_basesmuma_module_t *sm_module)
{
    /* local variables */
    int i;
    mca_bcol_basesmuma_component_t *cs = &mca_bcol_basesmuma_component;

    /*
     * release allocated resources
     */

    /* ...but not until you're sure you have no outstanding collectives */
    while (0 != opal_list_get_size(&(cs->nb_admin_barriers))) {
        opal_progress();
    }

#ifdef __PORTALS_AVAIL__
    /* remove portals bcast specific resources */
    if (PTL_OK != PtlEQFree(sm_module->sg_state.read_eq)) {
        BASESMUMA_VERBOSE(10, ("PtlEQFree() failed"));
    }
#endif

    /* remove the large-message reduce offsets array */
    free_lmsg_reduce_offsets_array(sm_module);

    /* collective topology data */
    if (sm_module->fanout_read_tree) {
        for (i = 0; i < sm_module->super.size_of_subgroup; i++) {
            if (0 < sm_module->fanout_read_tree[i].n_children) {
                free(sm_module->fanout_read_tree[i].children_ranks);
                sm_module->fanout_read_tree[i].children_ranks = NULL;
            }
        }
        free(sm_module->fanout_read_tree);
        sm_module->fanout_read_tree = NULL;
    }

    /* gvm leak fix: reduction_tree[].children_ranks also has to be
     * freed; the length of the reduction_tree array is the size of
     * the subgroup */
    if (sm_module->reduction_tree) {
        for (i = 0; i < sm_module->super.size_of_subgroup; i++) {
            if (0 < sm_module->reduction_tree[i].n_children) {
                free(sm_module->reduction_tree[i].children_ranks);
                sm_module->reduction_tree[i].children_ranks = NULL;
            }
        }
        free(sm_module->reduction_tree);
        sm_module->reduction_tree = NULL;
    }

    /* gvm leak fix */
    if (sm_module->fanout_node.children_ranks) {
        free(sm_module->fanout_node.children_ranks);
        sm_module->fanout_node.children_ranks = NULL;
    }

    if (sm_module->fanin_node.children_ranks) {
        free(sm_module->fanin_node.children_ranks);
        sm_module->fanin_node.children_ranks = NULL;
    }

    /* colls_no_user_data resources */
    if (sm_module->colls_no_user_data.ctl_buffs_mgmt) {
        free(sm_module->colls_no_user_data.ctl_buffs_mgmt);
        sm_module->colls_no_user_data.ctl_buffs_mgmt = NULL;
    }
    if (sm_module->colls_no_user_data.ctl_buffs) {
        free(sm_module->colls_no_user_data.ctl_buffs);
        sm_module->colls_no_user_data.ctl_buffs = NULL;
    }
    /* return control */
    opal_list_append(&cs->ctl_structures,
                     (opal_list_item_t *) sm_module->no_userdata_ctl);

    /* colls_with_user_data resources */
    /* debug print:
     * fprintf(stderr, "AAA colls_with_user_data.ctl_buffs %p \n",
     *         sm_module->colls_with_user_data.ctl_buffs_mgmt);
     */
    if (sm_module->colls_with_user_data.ctl_buffs_mgmt) {
        free(sm_module->colls_with_user_data.ctl_buffs_mgmt);
        sm_module->colls_with_user_data.ctl_buffs_mgmt = NULL;
    }
    if (sm_module->colls_with_user_data.ctl_buffs) {
        free(sm_module->colls_with_user_data.ctl_buffs);
        sm_module->colls_with_user_data.ctl_buffs = NULL;
    }

    if (sm_module->shared_memory_scratch_space) {
        free(sm_module->shared_memory_scratch_space);
        sm_module->shared_memory_scratch_space = NULL;
    }
    /* return control */
    opal_list_append(&cs->ctl_structures,
                     (opal_list_item_t *) sm_module->userdata_ctl);

#if 1
    if (sm_module->scatter_kary_tree) {
        for (i = 0; i < sm_module->super.size_of_subgroup; i++) {
            if (0 < sm_module->scatter_kary_tree[i].n_children) {
                free(sm_module->scatter_kary_tree[i].children_ranks);
                sm_module->scatter_kary_tree[i].children_ranks = NULL;
            }
        }
        free(sm_module->scatter_kary_tree);
    }
#endif

    if (NULL != sm_module->super.list_n_connected) {
        free(sm_module->super.list_n_connected);
        sm_module->super.list_n_connected = NULL;
    }

    /* free the k-nomial allgather tree here */

    /* done */
}
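#if 0
/*
 * Sketch (illustrative, not part of the original source) of the
 * drain-then-free pattern the destructor above relies on: progress the
 * library until no non-blocking administrative barriers remain, so no
 * in-flight collective can touch the memory we are about to free. The
 * helper name is hypothetical.
 */
static void drain_pending(opal_list_t *pending)
{
    while (0 != opal_list_get_size(pending)) {
        opal_progress();  /* let outstanding operations complete */
    }
}
#endif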
int bcol_basesmuma_hdl_zerocopy_bcast(bcol_function_args_t *input_args,
                                      coll_ml_function_t *c_input_args)
{
    /* local variables */
    int group_size, process_shift, my_node_index;
    int my_rank, first_instance = 0, flag_offset;
    int rc = OMPI_SUCCESS;
    int my_fanout_parent;
    int leading_dim, buff_idx, idx;
    volatile int64_t ready_flag;
    int count = input_args->count;
    struct ompi_datatype_t *dtype = input_args->dtype;
    int root = input_args->root;
    int64_t sequence_number = input_args->sequence_num;
    mca_bcol_basesmuma_module_t *bcol_module =
        (mca_bcol_basesmuma_module_t *) c_input_args->bcol_module;
    netpatterns_tree_node_t *my_fanout_read_tree;
    size_t pack_len = 0, dt_size;
    void *data_addr = (void *) ((unsigned char *) input_args->src_desc->data_addr);
    struct mca_hdl_base_descriptor_t *hdl_desc;
    struct mca_hdl_base_segment_t *hdl_seg;
    int ret, completed, ridx /* remote rank index */;
    bool status;
    volatile mca_bcol_basesmuma_ctl_struct_t **ctl_structs;
    mca_bcol_basesmuma_ctl_struct_t *my_ctl_pointer = NULL;
    volatile mca_bcol_basesmuma_ctl_struct_t *parent_ctl_pointer = NULL;
    volatile mca_bcol_basesmuma_ctl_struct_t *child_ctl_pointer = NULL;
    struct mca_hdl_base_module_t *hdl = bcol_module->hdl_module[0];

    /* we will work only on packed data - so compute the length */
    ompi_datatype_type_size(dtype, &dt_size);
    pack_len = count * dt_size;

    buff_idx = input_args->src_desc->buffer_index;

    /* get addressing information */
    my_rank = bcol_module->super.sbgp_partner_module->my_index;
    group_size = bcol_module->colls_no_user_data.size_of_group;
    leading_dim = bcol_module->colls_no_user_data.size_of_group;
    idx = SM_ARRAY_INDEX(leading_dim, buff_idx, 0);

    ctl_structs = (volatile mca_bcol_basesmuma_ctl_struct_t **)
        bcol_module->colls_with_user_data.ctl_buffs + idx;
    my_ctl_pointer = ctl_structs[my_rank];

    /* align the node index around the sbgp root */
    process_shift = root;
    my_node_index = my_rank - root;
    if (0 > my_node_index) {
        my_node_index += group_size;
    }

    /* get my node in the bcast tree */
    my_fanout_read_tree = &(bcol_module->fanout_read_tree[my_node_index]);
    my_fanout_parent = my_fanout_read_tree->parent_rank + process_shift;
    if (group_size <= my_fanout_parent) {
        my_fanout_parent -= group_size;
    }

    /* set up resource recycling */
    if (my_ctl_pointer->sequence_number < sequence_number) {
        first_instance = 1;
    }

    if (first_instance) {
        /* signal arrival */
        my_ctl_pointer->flag = -1;
        my_ctl_pointer->index = 1;

        /* this does not need to use any flag values, so only need to
         * set the value for subsequent calls that may need it */
        my_ctl_pointer->starting_flag_value = 0;
        flag_offset = 0;
    } else {
        /* only one thread at a time will be making progress on this
         * collective, so no need to make this atomic */
        my_ctl_pointer->index++;
    }

    /* increment the starting flag by one and return */
    flag_offset = my_ctl_pointer->starting_flag_value;
    ready_flag = flag_offset + sequence_number + 1;
    my_ctl_pointer->sequence_number = sequence_number;

    hdl_desc = (mca_hdl_base_descriptor_t *)
        malloc(sizeof(mca_hdl_base_descriptor_t) * 1);

    /* prepare a hdl data segment */
    hdl_seg = (mca_hdl_base_segment_t *)
        malloc(sizeof(mca_hdl_base_segment_t) * 1);
    hdl_seg->seg_addr.pval = input_args->sbuf;
    hdl_seg->seg_len = pack_len;

    hdl->endpoint->ready_flag = ready_flag;
    hdl->endpoint->local_ctrl = my_ctl_pointer;
    hdl->endpoint->sbgp_contextid =
        bcol_module->super.sbgp_partner_module->group_comm->c_contextid;

    /*
     * fan out from the root
     */
    if (ROOT_NODE == my_fanout_read_tree->my_node_type) {
        input_args->result_in_rbuf = false;

        hdl_desc->des_src = hdl_seg;
        hdl_desc->des_src_cnt = 1;
        hdl_desc->isroot = true;

        /* per the general semantics, there may be multiple pairs of
         * send/recv on the topology tree */
        for (ridx = 0; ridx < my_fanout_read_tree->n_children; ridx++) {
            child_ctl_pointer =
                ctl_structs[my_fanout_read_tree->children_ranks[ridx]];

            hdl->endpoint->remote_ctrl = child_ctl_pointer;
            ret = hdl->hdl_send(hdl, hdl->endpoint, hdl_desc);
            if (ret != OMPI_SUCCESS) {
                BASESMUMA_VERBOSE(1, ("send error on rank %d ........", my_rank));
                goto exit_ERROR;
            }
        }
    } else if (LEAF_NODE == my_fanout_read_tree->my_node_type) {
        input_args->result_in_rbuf = false;

        /*
         * Get parent payload data and control data:
         * the pointer to the base address of the parent's payload buffer,
         * and the parent's control buffer.
         */
        parent_ctl_pointer = ctl_structs[my_fanout_parent];

        hdl_desc->des_dst = hdl_seg;
        hdl_desc->des_dst_cnt = 1;
        hdl_desc->isroot = false;

        hdl->endpoint->remote_ctrl = parent_ctl_pointer;

#if __TEST_BLOCKING__
        ret = hdl->hdl_recv(hdl, hdl->endpoint, hdl_desc);
#else
        ret = hdl->hdl_recvi(hdl, hdl->endpoint, NULL, 0, 0, &hdl_desc);
#endif

#if __TEST_WAIT__
        ret = hdl->hdl_wait(hdl, hdl->endpoint, hdl_desc);
        BASESMUMA_VERBOSE(1, ("wait on rank %d is done!", my_rank));
#endif
        if (OMPI_SUCCESS != ret) {
            BASESMUMA_VERBOSE(1, ("recvi error on rank %d ........", my_rank));
            goto exit_ERROR;
        }

        status = false;
#if __TEST_TEST__
        while (!status) {
            hdl->hdl_test(&hdl_desc, &completed, &status);
            opal_progress();
            BASESMUMA_VERBOSE(1, ("test on rank %d ........", my_rank));
        }
#endif

        goto Release;
    } else {
        /* interior node */
        input_args->result_in_rbuf = false;

        /* get parent payload data and control data */
        parent_ctl_pointer = ctl_structs[my_fanout_parent];

        hdl_desc->des_dst = hdl_seg;
        hdl_desc->des_dst_cnt = 1;
        hdl_desc->isroot = false;

        hdl->endpoint->remote_ctrl = parent_ctl_pointer;
        ret = hdl->hdl_recv(hdl, hdl->endpoint, hdl_desc);
        if (OMPI_SUCCESS != ret) {
            BASESMUMA_VERBOSE(1, ("recv error on rank %d ........", my_rank));
            goto exit_ERROR;
        }

        /* signal to children that they may read the data from my shared buffer */
        MB();

        hdl_desc->des_src = hdl_seg;
        hdl_desc->des_src_cnt = 1;
        for (ridx = 0; ridx < my_fanout_read_tree->n_children; ridx++) {
            child_ctl_pointer =
                ctl_structs[my_fanout_read_tree->children_ranks[ridx]];
            hdl->endpoint->remote_ctrl = child_ctl_pointer;
            ret = hdl->hdl_send(hdl, hdl->endpoint, hdl_desc);
            if (ret != OMPI_SUCCESS) {
                BASESMUMA_VERBOSE(1, ("send error on rank %d ........", my_rank));
                goto exit_ERROR;
            }
        }

        goto Release;
    }

Release:
    /* if I am the last instance of a basesmuma function in this collective,
     * release the resources */
    if (IS_LAST_BCOL_FUNC(c_input_args)) {
        rc = bcol_basesmuma_free_buff(&(bcol_module->colls_with_user_data),
                                      sequence_number);
    }

    my_ctl_pointer->starting_flag_value += 1;

    return BCOL_FN_COMPLETE;

exit_ERROR:
    return OMPI_ERROR;
}
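#if 0
/*
 * Minimal sketch (illustrative, not part of the original source) of the
 * fan-out pattern implemented above: the root only sends, leaves only
 * receive, and interior nodes receive from their parent, issue a memory
 * barrier so children never read stale data, then forward to each
 * child. send_to()/recv_from() are hypothetical stand-ins for the
 * hdl_send()/hdl_recv() calls used in the real code.
 */
static int fanout_step(int is_root, int parent,
                       int n_children, const int *children)
{
    int ridx;
    if (!is_root && 0 != recv_from(parent)) {
        return -1;                /* propagate the receive failure */
    }
    MB();                         /* publish the payload before signalling */
    for (ridx = 0; ridx < n_children; ridx++) {
        if (0 != send_to(children[ridx])) {
            return -1;
        }
    }
    return 0;
}
#endif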