/*
 * Register the basesmuma fan-out collective.
 *
 * Builds the communicator-level and invocation-level attribute records and
 * passes them to mca_bcol_base_set_attributes(); the same routine,
 * bcol_basesmuma_fanout_new, is supplied for both function slots.
 *
 * super  the bcol module the collective is registered on
 *
 * Always returns OMPI_SUCCESS; the outcome of the registration call is not
 * inspected here.
 */
int bcol_basesmuma_fanout_init(mca_bcol_base_module_t *super)
{
    mca_bcol_base_coll_fn_comm_attributes_t comm_attr;
    mca_bcol_base_coll_fn_invoke_attributes_t invoke_attr;

    BASESMUMA_VERBOSE(10, ("Basesmuma Fan-Out register.\n"));

    /* communicator-level attributes */
    comm_attr.bcoll_type        = BCOL_FANOUT;
    comm_attr.comm_size_min     = 0;
    comm_attr.comm_size_max     = 1024 * 1024;
    comm_attr.data_src          = DATA_SRC_KNOWN;
    comm_attr.waiting_semantics = NON_BLOCKING;

    /* invocation-level attributes: message size range 1, every datatype
     * and every operation bit set */
    invoke_attr.bcol_msg_min    = 0;
    invoke_attr.bcol_msg_max    = 20000;
    invoke_attr.datatype_bitmap = 0xffffffff;
    invoke_attr.op_types_bitmap = 0xffffffff;

    mca_bcol_base_set_attributes(super, &comm_attr, &invoke_attr,
                                 bcol_basesmuma_fanout_new,
                                 bcol_basesmuma_fanout_new);

    return OMPI_SUCCESS;
}
/* Example #2 */
/*
 * Allocate buffers for storing non-blocking collective descriptions, required
 * for making code re-entrant.
 *
 * One descriptor is created per payload buffer
 * (num_banks * num_buffers_per_bank in total); each records its bank and
 * buffer index, the address of its data area and a zeroed request array.
 *
 * desc                  [out] receives the descriptor array; NULL on failure
 * base_addr             base address of the payload memory block
 * num_banks             number of banks
 * num_buffers_per_bank  number of buffers in each bank
 * size_buffer           distance in bytes between consecutive buffers
 * header_size           bytes skipped at the start of a buffer to reach data
 * group_size            currently unused
 * pow_k                 scales the request array size; 0 is treated as 1
 *
 * Returns OMPI_SUCCESS, or OMPI_ERROR if any allocation fails.  On failure
 * everything allocated by this call is released again.
 */
static int init_nb_coll_buff_desc(mca_bcol_basesmuma_nb_coll_buff_desc_t **desc,
                                  void *base_addr, uint32_t num_banks,
                                  uint32_t num_buffers_per_bank,
                                  uint32_t size_buffer,
                                  uint32_t header_size,
                                  int group_size,
                                  int pow_k)
{
    uint32_t i, j, ci, k;
    mca_bcol_basesmuma_nb_coll_buff_desc_t *tmp_desc = NULL;
    int k_nomial_radix = mca_bcol_basesmuma_component.k_nomial_radix;
    int pow_k_val = (0 == pow_k) ? 1 : pow_k;
    /* *2  is for gather session  +1 for extra peer */
    int num_to_alloc = (k_nomial_radix - 1) * pow_k_val * 2 + 1 ;

    (void) group_size;

    /* widen before multiplying so the element count cannot wrap in 32 bits */
    tmp_desc = calloc((size_t) num_banks * num_buffers_per_bank,
                      sizeof(mca_bcol_basesmuma_nb_coll_buff_desc_t));
    *desc = tmp_desc;
    if (NULL == tmp_desc) {
        return OMPI_ERROR;
    }

    for (i = 0; i < num_banks; i++) {
        for (j = 0; j < num_buffers_per_bank; j++) {
            ci = i * num_buffers_per_bank + j;
            tmp_desc[ci].bank_index = i;
            tmp_desc[ci].buffer_index = j;
            tmp_desc[ci].requests = calloc(num_to_alloc, sizeof(ompi_request_t *));
            if (NULL == tmp_desc[ci].requests) {
                /* unwind: release the request arrays built so far and the
                 * descriptor array itself, so the caller never sees a
                 * half-initialized result */
                for (k = 0; k < ci; k++) {
                    free(tmp_desc[k].requests);
                }
                free(tmp_desc);
                *desc = NULL;
                return OMPI_ERROR;
            }
            /* byte offset computed in size_t to avoid 32-bit wrap-around */
            tmp_desc[ci].data_addr = (void *)
                ((unsigned char*)base_addr + (size_t) ci * size_buffer + header_size);
            BASESMUMA_VERBOSE(10, ("ml memory cache setup %d %d - %p", i, j, tmp_desc[ci].data_addr));
        }
    }

    return OMPI_SUCCESS;
}
/* Example #3 */
/* New init function used for new control scheme where we put the control
 * struct at the top of the payload buffer.
 *
 * payload_block  ML-level descriptor of the payload memory block (bank and
 *                buffer counts, buffer size, base address)
 * data_offset    offset in bytes from a buffer's control struct to its
 *                payload; also cached as the module's total header size
 * bcol_module    the basesmuma module being initialized
 * reg_data       basesmuma registration data (backing file name and size)
 *
 * Returns OMPI_SUCCESS on success, OMPI_ERR_OUT_OF_RESOURCE when a local
 * allocation fails, the error code of the connection manager or of the
 * offset allgather when one of those fails, and OMPI_ERROR when the
 * non-blocking collective descriptors cannot be set up.
 *
 * NOTE(review): on the error paths pload_mgmt->data_buffs stays allocated,
 * exactly as before; confirm who owns it before releasing it here.
 */
int bcol_basesmuma_bank_init_opti(struct mca_bcol_base_memory_block_desc_t *payload_block,
        uint32_t data_offset,
        mca_bcol_base_module_t *bcol_module,
        void *reg_data)
{
    /* assumption here is that the block has been registered with
     * sm bcol hence has been mapped by each process, need to be
     * sure that memory is mapped amongst sm peers
     */

    /* local variables */
    int ret = OMPI_SUCCESS, i, j;
    sm_buffer_mgmt *pload_mgmt;
    mca_bcol_basesmuma_component_t *cs = &mca_bcol_basesmuma_component;
    bcol_basesmuma_registration_data_t *sm_reg_data =
        (bcol_basesmuma_registration_data_t *) reg_data;
    mca_bcol_basesmuma_module_t *sm_bcol =
        (mca_bcol_basesmuma_module_t *) bcol_module;
    mca_bcol_base_memory_block_desc_t *ml_block = payload_block;
    size_t malloc_size;
    bcol_basesmuma_smcm_file_t input_file;
    int leading_dim,loop_limit,buf_id;
    unsigned char *base_ptr;
    int my_idx, array_id;
    mca_bcol_basesmuma_header_t *ctl_ptr;
    /* starts out NULL so the common error exit can free it unconditionally */
    void **results_array = NULL, *mem_offset;

    mca_bcol_basesmuma_local_mlmem_desc_t *ml_mem = &sm_bcol->ml_mem;

    /* first, we get a pointer to the payload buffer management struct */
    pload_mgmt = &(sm_bcol->colls_with_user_data);

    /* go ahead and get the header size that is cached on the payload block
     */
    sm_bcol->total_header_size = data_offset;

    /* allocate memory for pointers to mine and my peers' payload buffers
     * difference here is that now we use our new data struct
     */
    malloc_size = ml_block->num_banks*ml_block->num_buffers_per_bank*
        pload_mgmt->size_of_group *sizeof(mca_bcol_basesmuma_payload_t);
    pload_mgmt->data_buffs = (mca_bcol_basesmuma_payload_t *) malloc(malloc_size);
    if( !pload_mgmt->data_buffs) {
        ret = OMPI_ERR_OUT_OF_RESOURCE;
        goto exit_ERROR;
    }

    /* allocate some memory to hold the offsets */
    results_array = (void **) malloc(pload_mgmt->size_of_group * sizeof (void *));
    if( NULL == results_array ) {
        ret = OMPI_ERR_OUT_OF_RESOURCE;
        goto exit_ERROR;
    }

    /* setup the input file for the shared memory connection manager */
    input_file.file_name = sm_reg_data->file_name;
    input_file.size = sm_reg_data->size;
    input_file.size_ctl_structure = 0;
    input_file.data_seg_alignment = BASESMUMA_CACHE_LINE_SIZE;
    input_file.mpool_size = sm_reg_data->size;

    /* call the connection manager and map my shared memory peers' file
     */
    ret = bcol_basesmuma_smcm_allgather_connection(
        sm_bcol,
        sm_bcol->super.sbgp_partner_module,
        &(cs->sm_connections_list),
        &(sm_bcol->payload_backing_files_info),
        sm_bcol->super.sbgp_partner_module->group_comm,
        input_file,cs->payload_base_fname,
        false);
    if( OMPI_SUCCESS != ret ) {
        goto exit_ERROR;
    }


    /* now we exchange offset info - don't assume symmetric virtual memory
     */

    mem_offset = (void *) ((uintptr_t) ml_block->block->base_addr -
                           (uintptr_t) cs->sm_payload_structs->data_addr);

    /* call into the exchange offsets function */
    ret=comm_allgather_pml(&mem_offset, results_array, sizeof (void *), MPI_BYTE,
                           sm_bcol->super.sbgp_partner_module->my_index,
                           sm_bcol->super.sbgp_partner_module->group_size,
                           sm_bcol->super.sbgp_partner_module->group_list,
                           sm_bcol->super.sbgp_partner_module->group_comm);
    if( OMPI_SUCCESS != ret ) {
        goto exit_ERROR;
    }

    /* convert memory offset to virtual address in current rank */
    leading_dim = pload_mgmt->size_of_group;
    loop_limit =  ml_block->num_banks*ml_block->num_buffers_per_bank;
    for (i=0;i< sm_bcol->super.sbgp_partner_module->group_size;i++) {

        /* get the base pointer */
        array_id=SM_ARRAY_INDEX(leading_dim,0,i);
        if( i == sm_bcol->super.sbgp_partner_module->my_index) {
            /* me */
            base_ptr=cs->sm_payload_structs->map_addr;
        } else {
            base_ptr=sm_bcol->payload_backing_files_info[i]->
                sm_mmap->map_addr;
        }

        /* first, set the pointer to the control struct */
        pload_mgmt->data_buffs[array_id].ctl_struct=(mca_bcol_basesmuma_header_t *)
            (uintptr_t)(((uint64_t)(uintptr_t)results_array[array_id])+(uint64_t)(uintptr_t)base_ptr);
        /* second, calculate where to set the data pointer */
        pload_mgmt->data_buffs[array_id].payload=(void *)
            (uintptr_t)((uint64_t)(uintptr_t) pload_mgmt->data_buffs[array_id].ctl_struct +
                        (uint64_t)(uintptr_t) data_offset);

        /* every further buffer of this peer sits size_buffer bytes after
         * the previous one */
        for( buf_id = 1 ; buf_id < loop_limit ; buf_id++ ) {
            int array_id_m1=SM_ARRAY_INDEX(leading_dim,(buf_id-1),i);
            array_id=SM_ARRAY_INDEX(leading_dim,buf_id,i);
            /* now, play the same game as above
             *
             * first, set the control struct's position */
            pload_mgmt->data_buffs[array_id].ctl_struct=(mca_bcol_basesmuma_header_t *)
                (uintptr_t)(((uint64_t)(uintptr_t)(pload_mgmt->data_buffs[array_id_m1].ctl_struct) +
                             (uint64_t)(uintptr_t)ml_block->size_buffer));

            /* second, set the payload pointer */
            pload_mgmt->data_buffs[array_id].payload =(void *)
                (uintptr_t)((uint64_t)(uintptr_t) pload_mgmt->data_buffs[array_id].ctl_struct +
                            (uint64_t)(uintptr_t) data_offset);
        }

    }

    /* done with the index array */
    free (results_array);
    results_array = NULL;

    /* initialize my control structures!! */
    my_idx = sm_bcol->super.sbgp_partner_module->my_index;
    leading_dim = sm_bcol->super.sbgp_partner_module->group_size;
    for( buf_id = 0; buf_id < loop_limit; buf_id++){
        array_id = SM_ARRAY_INDEX(leading_dim,buf_id,my_idx);
        ctl_ptr = pload_mgmt->data_buffs[array_id].ctl_struct;

        /* initialize the data structures */
        for( j = 0; j < SM_BCOLS_MAX; j++){
            for( i = 0; i < NUM_SIGNAL_FLAGS; i++){
                ctl_ptr->flags[i][j] = -1;
            }
        }
        ctl_ptr->sequence_number = -1;
        ctl_ptr->src = -1;
    }

    /* setup the data structures needed for releasing the payload
     * buffers back to the ml level
     */
    for( i=0 ; i < (int) ml_block->num_banks ; i++ ) {
        sm_bcol->colls_with_user_data.
            ctl_buffs_mgmt[i].nb_barrier_desc.ml_memory_block_descriptor=
            ml_block;
    }

    ml_mem->num_banks = ml_block->num_banks;
    ml_mem->bank_release_counter = calloc(ml_block->num_banks, sizeof(uint32_t));
    if( NULL == ml_mem->bank_release_counter ) {
        ret = OMPI_ERR_OUT_OF_RESOURCE;
        goto exit_ERROR;
    }
    ml_mem->num_buffers_per_bank = ml_block->num_buffers_per_bank;
    ml_mem->size_buffer = ml_block->size_buffer;
    /* pointer to ml level descriptor */
    ml_mem->ml_mem_desc = ml_block;

    if (OMPI_SUCCESS != init_nb_coll_buff_desc(&ml_mem->nb_coll_desc,
                                               ml_block->block->base_addr,
                                               ml_mem->num_banks,
                                               ml_mem->num_buffers_per_bank,
                                               ml_mem->size_buffer,
                                               data_offset,
                                               sm_bcol->super.sbgp_partner_module->group_size,
                                               sm_bcol->pow_k)) {

        BASESMUMA_VERBOSE(10, ("Failed to allocate memory descriptors for storing state of non-blocking collectives\n"));
        ret = OMPI_ERROR;
        goto exit_ERROR;
    }

    return OMPI_SUCCESS;

exit_ERROR:
    /* the offsets array is purely local scratch; free(NULL) is a no-op */
    free (results_array);
    return ret;
}
/* Example #4 */
/*
 * Destructor for a basesmuma module.
 *
 * Spins on opal_progress() until the component's list of non-blocking admin
 * barriers is empty, then releases the per-module heap allocations (tree
 * topologies, control-buffer bookkeeping, scratch space, connectivity list)
 * and appends the module's two control-structure items back onto the
 * component's ctl_structures list.  Every pointer freed here is reset to
 * NULL so no dangling pointer is left behind.
 *
 * NOTE(review): colls_with_user_data.data_buffs and the ml_mem descriptors
 * set up by the bank init routine are not released here; confirm whether
 * they are freed elsewhere.
 */
static void
mca_bcol_basesmuma_module_destruct(mca_bcol_basesmuma_module_t *sm_module)
{
    /* local variables */
    int i;
    mca_bcol_basesmuma_component_t *cs = &mca_bcol_basesmuma_component;

    /*
     * release allocated resources
     */

    /* ...but not until you're sure you have no outstanding collectives */
    while(0 != opal_list_get_size(&(cs->nb_admin_barriers))) {
        opal_progress();
    }

#ifdef __PORTALS_AVAIL__
    /* Remove portals bcast specific resources */
    if ( PTL_OK != PtlEQFree(sm_module->sg_state.read_eq)) {
        BASESMUMA_VERBOSE(10,("PtlEQFree() failed:  )"));
    }
#endif

    /* Remove Lmsg Reduce Offsets Array */
    free_lmsg_reduce_offsets_array(sm_module);


    /* collective topology data: each tree has one node per subgroup member,
     * and a node owns a children_ranks array only when it has children */
    if( sm_module->fanout_read_tree) {
        for(i=0 ; i < sm_module->super.size_of_subgroup ; i++ ) {
            if(0 < sm_module->fanout_read_tree[i].n_children ) {
                free(sm_module->fanout_read_tree[i].children_ranks);
                sm_module->fanout_read_tree[i].children_ranks=NULL;
            }
        }
        free(sm_module->fanout_read_tree);
        sm_module->fanout_read_tree=NULL;
    }

    if( sm_module->reduction_tree) {
        for(i=0 ; i < sm_module->super.size_of_subgroup ; i++ ) {
            if(0 < sm_module->reduction_tree[i].n_children ) {
                free(sm_module->reduction_tree[i].children_ranks);
                sm_module->reduction_tree[i].children_ranks=NULL;
            }
        }
        free(sm_module->reduction_tree);
        sm_module->reduction_tree=NULL;
    }

    /* single fan-out / fan-in nodes; free(NULL) is a no-op */
    free(sm_module->fanout_node.children_ranks);
    sm_module->fanout_node.children_ranks = NULL;

    free(sm_module->fanin_node.children_ranks);
    sm_module->fanin_node.children_ranks = NULL;

    /* colls_no_user_data resources */
    free(sm_module->colls_no_user_data.ctl_buffs_mgmt);
    sm_module->colls_no_user_data.ctl_buffs_mgmt=NULL;

    free(sm_module->colls_no_user_data.ctl_buffs);
    sm_module->colls_no_user_data.ctl_buffs=NULL;

    /* return control */
    opal_list_append (&cs->ctl_structures,  (opal_list_item_t *) sm_module->no_userdata_ctl);

    /* colls_with_user_data resources */
    free(sm_module->colls_with_user_data.ctl_buffs_mgmt);
    sm_module->colls_with_user_data.ctl_buffs_mgmt=NULL;

    free(sm_module->colls_with_user_data.ctl_buffs);
    sm_module->colls_with_user_data.ctl_buffs=NULL;

    free(sm_module->shared_memory_scratch_space);
    sm_module->shared_memory_scratch_space=NULL;

    /* return control */
    opal_list_append (&cs->ctl_structures,  (opal_list_item_t *) sm_module->userdata_ctl);

    if(sm_module->scatter_kary_tree) {
        for(i=0 ; i < sm_module->super.size_of_subgroup ; i++ ) {
            if(0 < sm_module->scatter_kary_tree[i].n_children) {
                free(sm_module->scatter_kary_tree[i].children_ranks);
                sm_module->scatter_kary_tree[i].children_ranks=NULL;
            }
        }
        free(sm_module->scatter_kary_tree);
        /* reset like every other tree above instead of leaving it dangling */
        sm_module->scatter_kary_tree=NULL;
    }

    free(sm_module->super.list_n_connected);
    sm_module->super.list_n_connected = NULL;

    /* free the k-nomial allgather tree here */

    /* done */
}
/*
 * Broadcast over the module's fan-out read tree using the send/receive
 * entry points of the module's first HDL module (hdl_module[0]).
 *
 * The tree is rotated so that the broadcast root is node 0.  The root sends
 * the user buffer (input_args->sbuf, count * datatype size bytes) to each of
 * its children, a leaf receives from its parent, and an interior node
 * receives from its parent and then sends to each of its children.
 *
 * input_args    collective arguments (count, dtype, root, sequence number,
 *               source buffer descriptor, user buffer)
 * c_input_args  ML function descriptor carrying the bcol module
 *
 * Returns BCOL_FN_COMPLETE on success and OMPI_ERROR if an allocation or any
 * HDL send/receive fails.
 *
 * NOTE(review): hdl_desc and hdl_seg are never freed once they have been
 * handed to the HDL layer; whether that layer keeps references to them
 * cannot be determined from this function, so the behaviour is left as-is.
 * NOTE(review): group_size / leading_dim are read from colls_no_user_data
 * while the control buffers come from colls_with_user_data - confirm that
 * both carry the same group size.
 */
int bcol_basesmuma_hdl_zerocopy_bcast(bcol_function_args_t *input_args,
                                  coll_ml_function_t   *c_input_args)
{
    /* local variables */
    int group_size, process_shift, my_node_index;
    int my_rank, first_instance=0, flag_offset;
    int rc = OMPI_SUCCESS;
    int my_fanout_parent;
    int leading_dim, buff_idx, idx;
    volatile int64_t ready_flag;
    int count=input_args->count;
    struct ompi_datatype_t* dtype=input_args->dtype;
    int root=input_args->root;
    int64_t sequence_number=input_args->sequence_num;
    mca_bcol_basesmuma_module_t* bcol_module=
        (mca_bcol_basesmuma_module_t *)c_input_args->bcol_module;

    netpatterns_tree_node_t* my_fanout_read_tree;
    size_t pack_len = 0, dt_size;

    struct mca_hdl_base_descriptor_t *hdl_desc = NULL;
    struct mca_hdl_base_segment_t *hdl_seg = NULL;
    int ret, completed, ridx/*remote rank index*/;
    bool status;
    volatile mca_bcol_basesmuma_ctl_struct_t **ctl_structs;
    mca_bcol_basesmuma_ctl_struct_t  *my_ctl_pointer= NULL;
    volatile mca_bcol_basesmuma_ctl_struct_t  *parent_ctl_pointer= NULL;
    volatile mca_bcol_basesmuma_ctl_struct_t  *child_ctl_pointer= NULL;
    struct mca_hdl_base_module_t* hdl = bcol_module->hdl_module[0];


    /* we will work only on packed data - so compute the length*/
    ompi_datatype_type_size(dtype, &dt_size);
    pack_len = count * dt_size;

    buff_idx = input_args->src_desc->buffer_index;

    /* Get addressing information */
    my_rank = bcol_module->super.sbgp_partner_module->my_index;
    group_size = bcol_module->colls_no_user_data.size_of_group;
    leading_dim=bcol_module->colls_no_user_data.size_of_group;
    idx=SM_ARRAY_INDEX(leading_dim,buff_idx,0);
    ctl_structs = (volatile mca_bcol_basesmuma_ctl_struct_t **)
        bcol_module->colls_with_user_data.ctl_buffs+idx;
    my_ctl_pointer = ctl_structs[my_rank];

    /* Align node index to around sbgp root */
    process_shift = root;
    my_node_index = my_rank - root;
    if(0 > my_node_index ) {
        my_node_index += group_size;
    }

    /* get my node for the bcast tree */
    my_fanout_read_tree = &(bcol_module->fanout_read_tree[my_node_index]);
    my_fanout_parent = my_fanout_read_tree->parent_rank + process_shift;
    if(group_size <= my_fanout_parent){
        my_fanout_parent -= group_size;
    }

    /* Allocate the hdl descriptor and its data segment before the control
     * struct is modified, so an allocation failure leaves the shared state
     * untouched. */
    hdl_desc = malloc (sizeof *hdl_desc);
    hdl_seg = malloc (sizeof *hdl_seg);
    if (NULL == hdl_desc || NULL == hdl_seg) {
        BASESMUMA_VERBOSE(1, ("failed to allocate hdl descriptor on rank %d", my_rank));
        free(hdl_seg);
        free(hdl_desc);
        return OMPI_ERROR;
    }

    /*prepare a hdl data segment*/
    hdl_seg->seg_addr.pval = input_args->sbuf;
    hdl_seg->seg_len = pack_len;

    /* setup resource recycling */
    if( my_ctl_pointer->sequence_number < sequence_number ) {
        first_instance = 1;
    }

    if( first_instance ) {
        /* Signal arrival */
        my_ctl_pointer->flag  = -1;
        my_ctl_pointer->index = 1;
        /* this does not need to use any flag values , so only need to
         * set the value for subsequent values that may need this */
        my_ctl_pointer->starting_flag_value = 0;
    } else {
        /* only one thread at a time will be making progress on this
         *   collective, so no need to make this atomic */
        my_ctl_pointer->index++;
    }


    /* increment the starting flag by one and return */
    flag_offset = my_ctl_pointer->starting_flag_value;
    ready_flag = flag_offset + sequence_number + 1;
    my_ctl_pointer->sequence_number = sequence_number;

    hdl->endpoint->ready_flag = ready_flag;
    hdl->endpoint->local_ctrl  = my_ctl_pointer;
    hdl->endpoint->sbgp_contextid =
        bcol_module->super.sbgp_partner_module->group_comm->c_contextid;

    /*
     * Fan out from root
     */
    if(ROOT_NODE == my_fanout_read_tree->my_node_type) {
        input_args->result_in_rbuf = false;

        hdl_desc->des_src = hdl_seg;
        hdl_desc->des_src_cnt = 1;
        hdl_desc->isroot = true;

        /*As the general semantics, there might multiple pairs of send/recv
         *on the topology tree*/
        for (ridx = 0; ridx < my_fanout_read_tree->n_children; ridx++) {
            child_ctl_pointer =
                ctl_structs[my_fanout_read_tree->children_ranks[ridx]];
            hdl->endpoint->remote_ctrl = child_ctl_pointer;
            ret = hdl->hdl_send(hdl, hdl->endpoint, hdl_desc);
            if (ret !=  OMPI_SUCCESS) {
                BASESMUMA_VERBOSE(1, ("send eror on rank %d ........", my_rank));
                goto exit_ERROR;
            }
        }
    }else if(LEAF_NODE == my_fanout_read_tree->my_node_type) {
        input_args->result_in_rbuf = false;
        /*
         * Get parent payload data and control data.
         * Get the pointer to the base address of the parent's payload buffer.
         * Get the parent's control buffer.
         */
        parent_ctl_pointer = ctl_structs[my_fanout_parent];

        hdl_desc->des_dst = hdl_seg;
        hdl_desc->des_dst_cnt = 1;
        hdl_desc->isroot = false;
        hdl->endpoint->remote_ctrl = parent_ctl_pointer;

#if __TEST_BLOCKING__
        ret = hdl->hdl_recv(hdl, hdl->endpoint, hdl_desc);
#else
        ret = hdl->hdl_recvi(hdl, hdl->endpoint, NULL, 0, 0, &hdl_desc);
#endif

#if __TEST_WAIT__
        ret = hdl->hdl_wait(hdl, hdl->endpoint, hdl_desc);
        BASESMUMA_VERBOSE(1,("wait on rank %d is done!", my_rank));
#endif
        if (OMPI_SUCCESS != ret) {
            BASESMUMA_VERBOSE(1, ("recvi eror on rank %d ........", my_rank));
            goto exit_ERROR;
        }

        status = false;
#if __TEST_TEST__
        while (!status) {
            hdl->hdl_test(&hdl_desc, &completed, &status);
            opal_progress();
            BASESMUMA_VERBOSE(1, ("test on rank %d ........", my_rank));
        }
#endif

        goto Release;

    }else{
        input_args->result_in_rbuf = false;
        /* Interior node */

        /* Get parent payload data and control data */
        parent_ctl_pointer = ctl_structs[my_fanout_parent];

        hdl_desc->des_dst = hdl_seg;
        hdl_desc->des_dst_cnt = 1;
        hdl_desc->isroot = false;

        hdl->endpoint->remote_ctrl = parent_ctl_pointer;

        ret = hdl->hdl_recv(hdl, hdl->endpoint, hdl_desc);
        /* single check: the failure is now logged before bailing out */
        if (OMPI_SUCCESS != ret) {
            BASESMUMA_VERBOSE(1, ("recvi eror on rank %d ........", my_rank));
            goto exit_ERROR;
        }

        /* Signal to children that they may read the data from my shared buffer */
        MB();
        hdl_desc->des_src = hdl_seg;
        hdl_desc->des_src_cnt = 1;
        for (ridx = 0; ridx < my_fanout_read_tree->n_children; ridx++) {
            child_ctl_pointer =
                ctl_structs[my_fanout_read_tree->children_ranks[ridx]];
            hdl->endpoint->remote_ctrl = child_ctl_pointer;

            ret = hdl->hdl_send(hdl, hdl->endpoint, hdl_desc);
            if (ret !=  OMPI_SUCCESS) {
                BASESMUMA_VERBOSE(1, ("send eror on rank %d ........", my_rank));
                goto exit_ERROR;
            }
        }
        goto Release;
    }

Release:
    /* if I am the last instance of a basesmuma function in this collective,
     *   release the resources */
    if (IS_LAST_BCOL_FUNC(c_input_args)) {
        rc = bcol_basesmuma_free_buff(
                &(bcol_module->colls_with_user_data),
                sequence_number);
    }

    my_ctl_pointer->starting_flag_value += 1;

    return BCOL_FN_COMPLETE;
exit_ERROR:
    return OMPI_ERROR;
}