int mca_pml_ucx_init(void) { ucs_status_t status; int rc; PML_UCX_VERBOSE(1, "mca_pml_ucx_init"); /* TODO check MPI thread mode */ status = ucp_worker_create(ompi_pml_ucx.ucp_context, UCS_THREAD_MODE_SINGLE, &ompi_pml_ucx.ucp_worker); if (UCS_OK != status) { return OMPI_ERROR; } rc = mca_pml_ucx_send_worker_address(); if (rc < 0) { return rc; } /* Initialize the free lists */ OBJ_CONSTRUCT(&ompi_pml_ucx.persistent_reqs, mca_pml_ucx_freelist_t); OBJ_CONSTRUCT(&ompi_pml_ucx.convs, mca_pml_ucx_freelist_t); /* Create a completed request to be returned from isend */ OBJ_CONSTRUCT(&ompi_pml_ucx.completed_send_req, ompi_request_t); mca_pml_ucx_completed_request_init(&ompi_pml_ucx.completed_send_req); opal_progress_register(mca_pml_ucx_progress); PML_UCX_VERBOSE(2, "created ucp context %p, worker %p", (void *)ompi_pml_ucx.ucp_context, (void *)ompi_pml_ucx.ucp_worker); return OMPI_SUCCESS; }
/* * Invoked when there's a new communicator that has been created. * Look at the communicator and decide which set of functions and * priority we want to return. */ mca_coll_base_module_t * mca_coll_hcoll_comm_query(struct ompi_communicator_t *comm, int *priority) { mca_coll_base_module_t *module; mca_coll_hcoll_module_t *hcoll_module; static bool libhcoll_initialized = false; *priority = 0; module = NULL; if (!mca_coll_hcoll_component.hcoll_enable){ goto exit; } if (!libhcoll_initialized) { /* libhcoll should be initialized here since current implmentation of mxm bcol in libhcoll needs world_group fully functional during init world_group, i.e. ompi_comm_world, is not ready at hcoll component open call */ opal_progress_register(hcoll_progress_fn); int rc = hcoll_init(); if (HCOLL_SUCCESS != rc){ mca_coll_hcoll_component.hcoll_enable = 0; opal_progress_unregister(hcoll_progress_fn); HCOL_VERBOSE(0,"Hcol library init failed"); return NULL; } libhcoll_initialized = true; } hcoll_module = OBJ_NEW(mca_coll_hcoll_module_t); if (!hcoll_module){ goto exit; } if (ompi_comm_size(comm) < 2 || OMPI_COMM_IS_INTER(comm)){ goto exit; } hcoll_module->super.coll_module_enable = mca_coll_hcoll_module_enable; hcoll_module->super.coll_barrier = hcoll_collectives.coll_barrier ? mca_coll_hcoll_barrier : NULL; hcoll_module->super.coll_bcast = hcoll_collectives.coll_bcast ? mca_coll_hcoll_bcast : NULL; hcoll_module->super.coll_allgather = hcoll_collectives.coll_allgather ? mca_coll_hcoll_allgather : NULL; hcoll_module->super.coll_allreduce = hcoll_collectives.coll_allreduce ? mca_coll_hcoll_allreduce : NULL; hcoll_module->super.coll_alltoall = /*hcoll_collectives.coll_alltoall ? mca_coll_hcoll_alltoall : */ NULL; hcoll_module->super.coll_ibarrier = hcoll_collectives.coll_ibarrier ? mca_coll_hcoll_ibarrier : NULL; hcoll_module->super.coll_ibcast = hcoll_collectives.coll_ibcast ? mca_coll_hcoll_ibcast : NULL; hcoll_module->super.coll_iallgather = hcoll_collectives.coll_iallgather ? mca_coll_hcoll_iallgather : NULL; hcoll_module->super.coll_iallreduce = hcoll_collectives.coll_iallreduce ? mca_coll_hcoll_iallreduce : NULL; *priority = mca_coll_hcoll_component.hcoll_priority; module = &hcoll_module->super; exit: return module; }
int ompi_mtl_mx_module_init(){ mx_param_t mx_param; mx_return_t mx_return; int32_t nic, ep; /* setup params */ mx_param.key = MX_PARAM_UNEXP_QUEUE_MAX; mx_param.val.unexp_queue_max = ompi_mtl_mx.mx_unexp_queue_max; /* get a local endpoint */ nic = ompi_mtl_mx.mx_board_num; if (nic < 0) { nic = MX_ANY_NIC; } ep = ompi_mtl_mx.mx_endpoint_num; if (ep < 0) { ep = MX_ANY_ENDPOINT; } mx_return = mx_open_endpoint(nic, ep, ompi_mtl_mx.mx_filter, NULL, 0, &ompi_mtl_mx.mx_endpoint); if(mx_return != MX_SUCCESS) { opal_output(ompi_mtl_base_framework.framework_output, "Error in mx_open_endpoint (error %s)\n", mx_strerror(mx_return)); return OMPI_ERROR; } /* get the endpoint address */ mx_return = mx_get_endpoint_addr( ompi_mtl_mx.mx_endpoint, &ompi_mtl_mx.mx_endpoint_addr); if(mx_return != MX_SUCCESS) { opal_output(ompi_mtl_base_framework.framework_output, "Error in mx_get_endpoint_addr (error %s)\n", mx_strerror(mx_return)); return OMPI_ERROR; } mx_return = mx_decompose_endpoint_addr( ompi_mtl_mx.mx_endpoint_addr, &(ompi_mtl_mx.mx_addr.nic_id), &(ompi_mtl_mx.mx_addr.endpoint_id) ); if(mx_return != MX_SUCCESS) { opal_output(ompi_mtl_base_framework.framework_output, "Error in mx_decompose_endpoint_addr (error %s)\n", mx_strerror(mx_return)); return OMPI_ERROR; } opal_output_verbose(10, ompi_mtl_base_framework.framework_output, "mtl:mx: local nic %d, endpoint %d, got nic %d, ep %d\n", nic, ep, (int)ompi_mtl_mx.mx_addr.nic_id, ompi_mtl_mx.mx_addr.endpoint_id); ompi_modex_send( &mca_mtl_mx_component.super.mtl_version, &ompi_mtl_mx.mx_addr, sizeof(mca_mtl_mx_addr_t)); /* register the mtl mx progress function */ opal_progress_register(ompi_mtl_mx_progress); return OMPI_SUCCESS; }
int mca_pml_yalla_init(void) { mxm_error_t error; int rc; PML_YALLA_VERBOSE(1, "mca_pml_yalla_init"); if (ompi_pml_yalla.using_mem_hooks) { opal_mem_hooks_register_release(mca_pml_yalla_mem_release_cb, NULL); } error = mxm_ep_create(ompi_pml_yalla.mxm_context, ompi_pml_yalla.ep_opts, &ompi_pml_yalla.mxm_ep); if (MXM_OK != error) { return OMPI_ERROR; } rc = send_ep_address(); if (rc < 0) { return rc; } OBJ_CONSTRUCT(&ompi_pml_yalla.send_reqs, mca_pml_yalla_freelist_t); OBJ_CONSTRUCT(&ompi_pml_yalla.bsend_reqs, mca_pml_yalla_freelist_t); OBJ_CONSTRUCT(&ompi_pml_yalla.recv_reqs, mca_pml_yalla_freelist_t); OBJ_CONSTRUCT(&ompi_pml_yalla.convs, mca_pml_yalla_freelist_t); opal_progress_register(mca_pml_yalla_progress); PML_YALLA_VERBOSE(2, "created mxm context %p ep %p", (void *)ompi_pml_yalla.mxm_context, (void *)ompi_pml_yalla.mxm_ep); return OMPI_SUCCESS; }
int mca_pml_ob1_enable_progress(int32_t count) { int32_t progress_count = OPAL_ATOMIC_ADD_FETCH32(&mca_pml_ob1_progress_needed, count); if( 1 < progress_count ) return 0; /* progress was already on */ opal_progress_register(mca_pml_ob1_progress); return 1; }
int ompi_mtl_mx_module_init(){ mx_param_t mx_param; mx_return_t mx_return; /* setup params */ mx_param.key = MX_PARAM_UNEXP_QUEUE_MAX; mx_param.val.unexp_queue_max = ompi_mtl_mx.mx_unexp_queue_max; /* get a local endpoint */ mx_return = mx_open_endpoint(MX_ANY_NIC, MX_ANY_ENDPOINT, ompi_mtl_mx.mx_filter, NULL, 0, &ompi_mtl_mx.mx_endpoint); if(mx_return != MX_SUCCESS) { opal_output(ompi_mtl_base_output, "Error in mx_open_endpoint (error %s)\n", mx_strerror(mx_return)); return OMPI_ERROR; } /* get the endpoint address */ mx_return = mx_get_endpoint_addr( ompi_mtl_mx.mx_endpoint, &ompi_mtl_mx.mx_endpoint_addr); if(mx_return != MX_SUCCESS) { opal_output(ompi_mtl_base_output, "Error in mx_get_endpoint_addr (error %s)\n", mx_strerror(mx_return)); return OMPI_ERROR; } mx_return = mx_decompose_endpoint_addr( ompi_mtl_mx.mx_endpoint_addr, &(ompi_mtl_mx.mx_addr.nic_id), &(ompi_mtl_mx.mx_addr.endpoint_id) ); if(mx_return != MX_SUCCESS) { opal_output(ompi_mtl_base_output, "Error in mx_decompose_endpoint_addr (error %s)\n", mx_strerror(mx_return)); return OMPI_ERROR; } ompi_modex_send( &mca_mtl_mx_component.super.mtl_version, &ompi_mtl_mx.mx_addr, sizeof(mca_mtl_mx_addr_t)); /* register the mtl mx progress function */ opal_progress_register(ompi_mtl_mx_progress); return OMPI_SUCCESS; }
int NBC_Init_handle(struct ompi_communicator_t *comm, ompi_coll_libnbc_request_t **request, ompi_coll_libnbc_module_t *comminfo) { int tmp_tag; bool need_register = false; ompi_coll_libnbc_request_t *handle; OMPI_COLL_LIBNBC_REQUEST_ALLOC(comm, handle); if (NULL == handle) return OMPI_ERR_OUT_OF_RESOURCE; *request = handle; handle->tmpbuf = NULL; handle->req_count = 0; handle->req_array = NULL; handle->comm = comm; handle->schedule = NULL; /* first int is the schedule size */ handle->row_offset = sizeof(int); /******************** Do the tag and shadow comm administration ... ***************/ OPAL_THREAD_LOCK(&comminfo->mutex); tmp_tag = comminfo->tag--; if (tmp_tag == MCA_COLL_BASE_TAG_NONBLOCKING_END) { tmp_tag = comminfo->tag = MCA_COLL_BASE_TAG_NONBLOCKING_BASE; NBC_DEBUG(2,"resetting tags ...\n"); } if (true != comminfo->comm_registered) { comminfo->comm_registered = true; need_register = true; } OPAL_THREAD_UNLOCK(&comminfo->mutex); handle->tag=comminfo->tag; /* register progress */ if (need_register) { int32_t tmp = OPAL_THREAD_ADD32(&mca_coll_libnbc_component.active_comms, 1); if (tmp == 1) { opal_progress_register(ompi_coll_libnbc_progress); } } handle->comm=comm; /*printf("got comminfo: %lu tag: %i\n", comminfo, comminfo->tag);*/ /******************** end of tag and shadow comm administration ... ***************/ handle->comminfo = comminfo; NBC_DEBUG(3, "got tag %i\n", handle->tag); return NBC_OK; }
/* * Open the component */ static int iboffload_open(void) { int rc; /* local variables */ mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component; IBOFFLOAD_VERBOSE(10, ("Open Iboffload component.\n")); cm->super.priority = 100; cm->super.n_net_contexts = 0; cm->super.network_contexts = NULL; OBJ_CONSTRUCT(&cm->recv_wrs.lock, opal_mutex_t); /* construct lists */ OBJ_CONSTRUCT(&cm->devices, opal_pointer_array_t); rc = opal_pointer_array_init(&cm->devices, 10, INT_MAX, 10); if (OMPI_SUCCESS != rc) { goto close_device; } /* load mca parametres */ rc = mca_bcol_iboffload_register_params(); if (OMPI_SUCCESS != rc) { goto close_device; } /* Register the progress function */ rc = opal_progress_register(mca_bcol_iboffload_component_progress); if (OMPI_SUCCESS != rc) { IBOFFLOAD_ERROR(("Failed to register the progress function" " for iboffload component.\n")); goto close_device; } map_ompi_to_ib_dtype(); map_ompi_to_ib_op_type(); /* The init_done set to true on first component usage */ cm->init_done = false; return OMPI_SUCCESS; close_device: OBJ_DESTRUCT(&cm->devices); OBJ_DESTRUCT(&cm->recv_wrs.lock); return rc; }
static int hcoll_open(void) { int rc; mca_coll_hcoll_output = opal_output_open(NULL); opal_output_set_verbosity(mca_coll_hcoll_output, mca_coll_hcoll_component.hcoll_verbose); hcoll_rte_fns_setup(); opal_progress_register(hcoll_progress_fn); rc = hcoll_init(); if (HCOLL_SUCCESS != rc){ opal_progress_unregister(hcoll_progress_fn); HCOL_VERBOSE(1,"Hcol library init failed"); return OMPI_ERROR; } return OMPI_SUCCESS; }
int memheap_oob_init(mca_memheap_map_t *map) { int rc = OSHMEM_SUCCESS; int i; oob_comm_request_t *r; memheap_map = map; OBJ_CONSTRUCT(&memheap_oob.lck, opal_mutex_t); OBJ_CONSTRUCT(&memheap_oob.cond, opal_condition_t); OBJ_CONSTRUCT(&memheap_oob.req_list, opal_list_t); for (i = 0; i < MEMHEAP_RECV_REQS_MAX; i++) { r = &memheap_oob.req_pool[i]; rc = PMPI_Recv_init(r->buf, sizeof(r->buf), MPI_BYTE, MPI_ANY_SOURCE, 0, oshmem_comm_world, &r->recv_req); if (MPI_SUCCESS != rc) { MEMHEAP_ERROR("Failed to created recv request %d", rc); return rc; } rc = PMPI_Start(&r->recv_req); if (MPI_SUCCESS != rc) { MEMHEAP_ERROR("Failed to post recv request %d", rc); return rc; } opal_list_append(&memheap_oob.req_list, &r->super); } opal_progress_register(oshmem_mkey_recv_cb); memheap_oob.is_inited = 1; return rc; }
int mca_spml_ikrit_add_procs(ompi_proc_t** procs, size_t nprocs) { spml_ikrit_mxm_ep_conn_info_t *ep_info = NULL; spml_ikrit_mxm_ep_conn_info_t *ep_hw_rdma_info = NULL; spml_ikrit_mxm_ep_conn_info_t my_ep_info = {{0}}; #if MXM_API < MXM_VERSION(2,0) mxm_conn_req_t *conn_reqs; int timeout; #else size_t mxm_addr_len = MXM_MAX_ADDR_LEN; #endif mxm_error_t err; size_t i, n; int rc = OSHMEM_ERROR; ompi_proc_t *proc_self; int my_rank = oshmem_my_proc_id(); OBJ_CONSTRUCT(&mca_spml_ikrit.active_peers, opal_list_t); /* Allocate connection requests */ #if MXM_API < MXM_VERSION(2,0) conn_reqs = malloc(nprocs * sizeof(mxm_conn_req_t)); if (NULL == conn_reqs) { rc = OSHMEM_ERR_OUT_OF_RESOURCE; goto bail; } memset(conn_reqs, 0x0, sizeof(mxm_conn_req_t)); #endif ep_info = calloc(sizeof(spml_ikrit_mxm_ep_conn_info_t), nprocs); if (NULL == ep_info) { rc = OSHMEM_ERR_OUT_OF_RESOURCE; goto bail; } if (mca_spml_ikrit.hw_rdma_channel) { ep_hw_rdma_info = calloc(sizeof(spml_ikrit_mxm_ep_conn_info_t), nprocs); if (NULL == ep_hw_rdma_info) { rc = OSHMEM_ERR_OUT_OF_RESOURCE; goto bail; } } mca_spml_ikrit.mxm_peers = (mxm_peer_t **) malloc(nprocs * sizeof(*(mca_spml_ikrit.mxm_peers))); if (NULL == mca_spml_ikrit.mxm_peers) { rc = OSHMEM_ERR_OUT_OF_RESOURCE; goto bail; } #if MXM_API < MXM_VERSION(2,0) if (OSHMEM_SUCCESS != spml_ikrit_get_ep_address(&my_ep_info, MXM_PTL_SELF)) { rc = OSHMEM_ERROR; goto bail; } if (OSHMEM_SUCCESS != spml_ikrit_get_ep_address(&my_ep_info, MXM_PTL_RDMA)) { rc = OSHMEM_ERROR; goto bail; } #else if (mca_spml_ikrit.hw_rdma_channel) { err = mxm_ep_get_address(mca_spml_ikrit.mxm_hw_rdma_ep, &my_ep_info.addr.ep_addr, &mxm_addr_len); if (MXM_OK != err) { orte_show_help("help-oshmem-spml-ikrit.txt", "unable to get endpoint address", true, mxm_error_string(err)); rc = OSHMEM_ERROR; goto bail; } oshmem_shmem_allgather(&my_ep_info, ep_hw_rdma_info, sizeof(spml_ikrit_mxm_ep_conn_info_t)); } err = mxm_ep_get_address(mca_spml_ikrit.mxm_ep, &my_ep_info.addr.ep_addr, &mxm_addr_len); if (MXM_OK != err) { orte_show_help("help-oshmem-spml-ikrit.txt", "unable to get endpoint address", true, mxm_error_string(err)); rc = OSHMEM_ERROR; goto bail; } #endif oshmem_shmem_allgather(&my_ep_info, ep_info, sizeof(spml_ikrit_mxm_ep_conn_info_t)); opal_progress_register(spml_ikrit_progress); /* Get the EP connection requests for all the processes from modex */ for (n = 0; n < nprocs; ++n) { /* mxm 2.0 keeps its connections on a list. Make sure * that list have different order on every rank */ i = (my_rank + n) % nprocs; mca_spml_ikrit.mxm_peers[i] = OBJ_NEW(mxm_peer_t); if (NULL == mca_spml_ikrit.mxm_peers[i]) { rc = OSHMEM_ERR_OUT_OF_RESOURCE; goto bail; } mca_spml_ikrit.mxm_peers[i]->pe = i; #if MXM_API < MXM_VERSION(2,0) conn_reqs[i].ptl_addr[MXM_PTL_SELF] = (struct sockaddr *) &ep_info[i].addr.ptl_addr[MXM_PTL_SELF]; conn_reqs[i].ptl_addr[MXM_PTL_SHM] = NULL; conn_reqs[i].ptl_addr[MXM_PTL_RDMA] = (struct sockaddr *) &ep_info[i].addr.ptl_addr[MXM_PTL_RDMA]; #else err = mxm_ep_connect(mca_spml_ikrit.mxm_ep, ep_info[i].addr.ep_addr, &mca_spml_ikrit.mxm_peers[i]->mxm_conn); if (MXM_OK != err) { SPML_ERROR("MXM returned connect error: %s\n", mxm_error_string(err)); goto bail; } if (OSHMEM_SUCCESS != create_ptl_idx(i)) goto bail; mxm_conn_ctx_set(mca_spml_ikrit.mxm_peers[i]->mxm_conn, mca_spml_ikrit.mxm_peers[i]); if (mca_spml_ikrit.hw_rdma_channel) { err = mxm_ep_connect(mca_spml_ikrit.mxm_hw_rdma_ep, ep_hw_rdma_info[i].addr.ep_addr, &mca_spml_ikrit.mxm_peers[i]->mxm_hw_rdma_conn); if (MXM_OK != err) { SPML_ERROR("MXM returned connect error: %s\n", mxm_error_string(err)); goto bail; } } else { mca_spml_ikrit.mxm_peers[i]->mxm_hw_rdma_conn = mca_spml_ikrit.mxm_peers[i]->mxm_conn; } #endif } #if MXM_API < MXM_VERSION(2,0) /* Connect to remote peers */ if (mxm_get_version() < MXM_VERSION(1,5)) { timeout = 1000; } else { timeout = -1; } err = mxm_ep_connect(mca_spml_ikrit.mxm_ep, conn_reqs, nprocs, timeout); if (MXM_OK != err) { SPML_ERROR("MXM returned connect error: %s\n", mxm_error_string(err)); for (i = 0; i < nprocs; ++i) { if (MXM_OK != conn_reqs[i].error) { SPML_ERROR("MXM EP connect to %s error: %s\n", procs[i]->proc_hostname, mxm_error_string(conn_reqs[i].error)); } } rc = OSHMEM_ERR_CONNECTION_FAILED; goto bail; } /* Save returned connections */ for (i = 0; i < nprocs; ++i) { mca_spml_ikrit.mxm_peers[i]->mxm_conn = conn_reqs[i].conn; if (OSHMEM_SUCCESS != create_ptl_idx(i)) { rc = OSHMEM_ERR_CONNECTION_FAILED; goto bail; } mxm_conn_ctx_set(conn_reqs[i].conn, mca_spml_ikrit.mxm_peers[i]); } if (conn_reqs) free(conn_reqs); #endif if (ep_info) free(ep_info); if (ep_hw_rdma_info) free(ep_hw_rdma_info); #if MXM_API >= MXM_VERSION(2,0) if (mca_spml_ikrit.bulk_connect) { /* Need a barrier to ensure remote peers already created connection */ oshmem_shmem_barrier(); mxm_ep_wireup(mca_spml_ikrit.mxm_ep); } #endif proc_self = oshmem_proc_group_find(oshmem_group_all, my_rank); /* identify local processes and change transport to SHM */ for (i = 0; i < nprocs; i++) { if (procs[i]->super.proc_name.jobid != proc_self->super.proc_name.jobid || !OPAL_PROC_ON_LOCAL_NODE(procs[i]->super.proc_flags)) { continue; } if (procs[i] == proc_self) continue; /* use zcopy for put/get via sysv shared memory */ OSHMEM_PROC_DATA(procs[i])->transport_ids[0] = MXM_PTL_SHM; OSHMEM_PROC_DATA(procs[i])->transport_ids[1] = MXM_PTL_RDMA; OSHMEM_PROC_DATA(procs[i])->num_transports = 2; } SPML_VERBOSE(50, "*** ADDED PROCS ***"); return OSHMEM_SUCCESS; bail: #if MXM_API < MXM_VERSION(2,0) if (conn_reqs) free(conn_reqs); #endif if (ep_info) free(ep_info); if (ep_hw_rdma_info) free(ep_hw_rdma_info); SPML_ERROR("add procs FAILED rc=%d", rc); return rc; }
static int mca_bml_r2_add_procs( size_t nprocs, struct ompi_proc_t** procs, struct opal_bitmap_t* reachable ) { size_t p, p_index, n_new_procs = 0; struct mca_btl_base_endpoint_t ** btl_endpoints = NULL; struct ompi_proc_t** new_procs = NULL; struct ompi_proc_t *unreach_proc = NULL; int rc, ret = OMPI_SUCCESS; if(0 == nprocs) { return OMPI_SUCCESS; } if(OMPI_SUCCESS != (rc = mca_bml_r2_add_btls()) ) { return rc; } /* Select only the procs that don't yet have the BML proc struct. This prevent * us from calling btl->add_procs several this on the same destination proc. */ for(p_index = 0; p_index < nprocs; p_index++) { struct ompi_proc_t* proc = procs[p_index]; OBJ_RETAIN(proc); if(NULL != proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML]) { continue; /* go to the next proc */ } /* Allocate the new_procs on demand */ if( NULL == new_procs ) { new_procs = (struct ompi_proc_t **)malloc(nprocs * sizeof(struct ompi_proc_t *)); if( NULL == new_procs ) { return OMPI_ERR_OUT_OF_RESOURCE; } } new_procs[n_new_procs++] = proc; } if ( 0 == n_new_procs ) { return OMPI_SUCCESS; } /* Starting from here we only work on the unregistered procs */ procs = new_procs; nprocs = n_new_procs; /* attempt to add all procs to each r2 */ btl_endpoints = (struct mca_btl_base_endpoint_t **) malloc(nprocs * sizeof(struct mca_btl_base_endpoint_t*)); if (NULL == btl_endpoints) { free(new_procs); return OMPI_ERR_OUT_OF_RESOURCE; } for(p_index = 0; p_index < mca_bml_r2.num_btl_modules; p_index++) { mca_btl_base_module_t* btl = mca_bml_r2.btl_modules[p_index]; int btl_inuse = 0; /* if the r2 can reach the destination proc it sets the * corresponding bit (proc index) in the reachable bitmap * and can return addressing information for each proc * that is passed back to the r2 on data transfer calls */ opal_bitmap_clear_all_bits(reachable); memset(btl_endpoints, 0, nprocs *sizeof(struct mca_btl_base_endpoint_t*)); rc = btl->btl_add_procs(btl, n_new_procs, new_procs, btl_endpoints, reachable); if(OMPI_SUCCESS != rc) { /* This BTL has troubles adding the nodes. Let's continue maybe some other BTL * can take care of this task. */ continue; } /* for each proc that is reachable */ for( p = 0; p < n_new_procs; p++ ) { if(opal_bitmap_is_set_bit(reachable, p)) { ompi_proc_t *proc = new_procs[p]; mca_bml_base_endpoint_t * bml_endpoint = (mca_bml_base_endpoint_t*) proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML]; mca_bml_base_btl_t* bml_btl; size_t size; if(NULL == bml_endpoint) { /* allocate bml specific proc data */ bml_endpoint = OBJ_NEW(mca_bml_base_endpoint_t); if (NULL == bml_endpoint) { opal_output(0, "mca_bml_r2_add_procs: unable to allocate resources"); free(btl_endpoints); free(new_procs); return OMPI_ERR_OUT_OF_RESOURCE; } /* preallocate space in array for max number of r2s */ mca_bml_base_btl_array_reserve(&bml_endpoint->btl_eager, mca_bml_r2.num_btl_modules); mca_bml_base_btl_array_reserve(&bml_endpoint->btl_send, mca_bml_r2.num_btl_modules); mca_bml_base_btl_array_reserve(&bml_endpoint->btl_rdma, mca_bml_r2.num_btl_modules); bml_endpoint->btl_max_send_size = -1; bml_endpoint->btl_proc = proc; proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML] = bml_endpoint; bml_endpoint->btl_flags_or = 0; } /* dont allow an additional BTL with a lower exclusivity ranking */ size = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_send); if(size > 0) { bml_btl = mca_bml_base_btl_array_get_index(&bml_endpoint->btl_send, size-1); /* skip this btl if the exclusivity is less than the previous */ if(bml_btl->btl->btl_exclusivity > btl->btl_exclusivity) { btl->btl_del_procs(btl, 1, &proc, &btl_endpoints[p]); continue; } } /* cache the endpoint on the proc */ bml_btl = mca_bml_base_btl_array_insert(&bml_endpoint->btl_send); bml_btl->btl = btl; bml_btl->btl_endpoint = btl_endpoints[p]; bml_btl->btl_weight = 0; bml_btl->btl_flags = btl->btl_flags; if( (bml_btl->btl_flags & MCA_BTL_FLAGS_PUT) && (NULL == btl->btl_put) ) { opal_output(0, "mca_bml_r2_add_procs: The PUT flag is specified for" " the %s BTL without any PUT function attached. Disard the flag !", bml_btl->btl->btl_component->btl_version.mca_component_name); bml_btl->btl_flags ^= MCA_BTL_FLAGS_PUT; } if( (bml_btl->btl_flags & MCA_BTL_FLAGS_GET) && (NULL == btl->btl_get) ) { opal_output(0, "mca_bml_r2_add_procs: The GET flag is specified for" " the %s BTL without any GET function attached. Discard the flag !", bml_btl->btl->btl_component->btl_version.mca_component_name); bml_btl->btl_flags ^= MCA_BTL_FLAGS_GET; } if( (bml_btl->btl_flags & (MCA_BTL_FLAGS_PUT | MCA_BTL_FLAGS_GET | MCA_BTL_FLAGS_SEND)) == 0 ) { /** * If no protocol specified, we have 2 choices: we ignore the BTL * as we don't know which protocl to use, or we suppose that all * BTLs support the send protocol. */ bml_btl->btl_flags |= MCA_BTL_FLAGS_SEND; } /** * calculate the bitwise OR of the btl flags */ bml_endpoint->btl_flags_or |= bml_btl->btl_flags; /* This BTL is in use, allow the progress registration */ btl_inuse++; } } if(btl_inuse > 0 && NULL != btl->btl_component->btl_progress) { size_t p; bool found = false; for( p = 0; p < mca_bml_r2.num_btl_progress; p++ ) { if(mca_bml_r2.btl_progress[p] == btl->btl_component->btl_progress) { found = true; break; } } if(found == false) { mca_bml_r2.btl_progress[mca_bml_r2.num_btl_progress] = btl->btl_component->btl_progress; mca_bml_r2.num_btl_progress++; opal_progress_register( btl->btl_component->btl_progress ); } } } free(btl_endpoints); /* iterate back through procs and compute metrics for registered r2s */ for(p=0; p<n_new_procs; p++) { ompi_proc_t *proc = new_procs[p]; mca_bml_base_endpoint_t* bml_endpoint = (mca_bml_base_endpoint_t*) proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML]; double total_bandwidth = 0; uint32_t latency = 0xffffffff; size_t n_index; size_t n_size; /* skip over procs w/ no btl's registered */ if(NULL == bml_endpoint) { continue; } /* (1) determine the total bandwidth available across all btls * note that we need to do this here, as we may already have btls configured * (2) determine the highest priority ranking for latency * (3) compute the maximum amount of bytes that can be send without any * weighting. Once the left over is smaller than this number we will * start using the weight to compute the correct amount. */ n_size = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_send); /* sort BTLs in descending order according to bandwidth value */ qsort(bml_endpoint->btl_send.bml_btls, n_size, sizeof(mca_bml_base_btl_t), btl_bandwidth_compare); bml_endpoint->btl_rdma_index = 0; for(n_index = 0; n_index < n_size; n_index++) { mca_bml_base_btl_t* bml_btl = mca_bml_base_btl_array_get_index(&bml_endpoint->btl_send, n_index); mca_btl_base_module_t* btl = bml_btl->btl; total_bandwidth += bml_btl->btl->btl_bandwidth; if(btl->btl_latency < latency) { latency = btl->btl_latency; } } /* (1) set the weight of each btl as a percentage of overall bandwidth * (2) copy all btl instances at the highest priority ranking into the * list of btls used for first fragments */ for(n_index = 0; n_index < n_size; n_index++) { mca_bml_base_btl_t* bml_btl = mca_bml_base_btl_array_get_index(&bml_endpoint->btl_send, n_index); mca_btl_base_module_t *btl = bml_btl->btl; /* compute weighting factor for this r2 */ if(btl->btl_bandwidth > 0) { bml_btl->btl_weight = (float)(btl->btl_bandwidth / total_bandwidth); } else { bml_btl->btl_weight = (float)(1.0 / n_size); } /* check to see if this r2 is already in the array of r2s * used for first fragments - if not add it. */ if(btl->btl_latency == latency) { mca_bml_base_btl_t* bml_btl_new = mca_bml_base_btl_array_insert(&bml_endpoint->btl_eager); *bml_btl_new = *bml_btl; } /* set endpoint max send size as min of available btls */ if(bml_endpoint->btl_max_send_size > btl->btl_max_send_size) bml_endpoint->btl_max_send_size = btl->btl_max_send_size; /* check flags - is rdma prefered */ if ((btl->btl_flags & (MCA_BTL_FLAGS_PUT|MCA_BTL_FLAGS_GET)) && !((proc->proc_arch != ompi_proc_local_proc->proc_arch) && (0 == (btl->btl_flags & MCA_BTL_FLAGS_HETEROGENEOUS_RDMA)))) { mca_bml_base_btl_t* bml_btl_rdma = mca_bml_base_btl_array_insert(&bml_endpoint->btl_rdma); mca_btl_base_module_t* btl_rdma = bml_btl->btl; *bml_btl_rdma = *bml_btl; if(bml_endpoint->btl_pipeline_send_length < btl_rdma->btl_rdma_pipeline_send_length) { bml_endpoint->btl_pipeline_send_length = btl_rdma->btl_rdma_pipeline_send_length; } if(bml_endpoint->btl_send_limit < btl_rdma->btl_min_rdma_pipeline_size) { bml_endpoint->btl_send_limit = btl_rdma->btl_min_rdma_pipeline_size; } } } } /* see if we have a connection to everyone else */ for(p=0; p<n_new_procs; p++) { ompi_proc_t *proc = new_procs[p]; if (NULL == proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML]) { if (NULL == unreach_proc) { unreach_proc = proc; } ret = OMPI_ERR_UNREACH; } } if (mca_bml_r2.show_unreach_errors && OMPI_ERR_UNREACH == ret) { opal_show_help("help-mca-bml-r2.txt", "unreachable proc", true, OMPI_NAME_PRINT(&(ompi_proc_local_proc->proc_name)), (NULL != ompi_proc_local_proc->proc_hostname ? ompi_proc_local_proc->proc_hostname : "unknown!"), OMPI_NAME_PRINT(&(unreach_proc->proc_name)), (NULL != ompi_proc_local_proc->proc_hostname ? ompi_proc_local_proc->proc_hostname : "unknown!"), btl_names); } free(new_procs); return ret; }
int ompi_mtl_portals4_add_procs(struct mca_mtl_base_module_t *mtl, size_t nprocs, struct ompi_proc_t** procs) { int ret, me; size_t i; bool new_found = false; ptl_process_t *maptable; if (ompi_mtl_portals4.use_logical) { maptable = malloc(sizeof(ptl_process_t) * nprocs); if (NULL == maptable) { opal_output_verbose(1, ompi_mtl_base_framework.framework_output, "%s:%d: malloc failed\n", __FILE__, __LINE__); return OMPI_ERR_OUT_OF_RESOURCE; } } /* Get the list of ptl_process_id_t from the runtime and copy into structure */ for (i = 0 ; i < nprocs ; ++i) { ptl_process_t *modex_id; size_t size; if( procs[i] == ompi_proc_local_proc ) { me = i; } if (procs[i]->super.proc_arch != ompi_proc_local()->super.proc_arch) { opal_output_verbose(1, ompi_mtl_base_framework.framework_output, "Portals 4 MTL does not support heterogeneous operations."); opal_output_verbose(1, ompi_mtl_base_framework.framework_output, "Proc %s architecture %x, mine %x.", OMPI_NAME_PRINT(&procs[i]->super.proc_name), procs[i]->super.proc_arch, ompi_proc_local()->super.proc_arch); return OMPI_ERR_NOT_SUPPORTED; } OPAL_MODEX_RECV(ret, &mca_mtl_portals4_component.mtl_version, &procs[i]->super.proc_name, (uint8_t**)&modex_id, &size); if (OMPI_SUCCESS != ret) { opal_output_verbose(1, ompi_mtl_base_framework.framework_output, "%s:%d: ompi_modex_recv failed: %d\n", __FILE__, __LINE__, ret); return ret; } else if (sizeof(ptl_process_t) != size) { opal_output_verbose(1, ompi_mtl_base_framework.framework_output, "%s:%d: ompi_modex_recv failed: %d\n", __FILE__, __LINE__, ret); return OMPI_ERR_BAD_PARAM; } if (NULL == procs[i]->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_PORTALS4]) { ptl_process_t *peer_id; peer_id = malloc(sizeof(ptl_process_t)); if (NULL == peer_id) { opal_output_verbose(1, ompi_mtl_base_framework.framework_output, "%s:%d: malloc failed: %d\n", __FILE__, __LINE__, ret); return OMPI_ERR_OUT_OF_RESOURCE; } if (ompi_mtl_portals4.use_logical) { peer_id->rank = i; maptable[i].phys.pid = modex_id->phys.pid; maptable[i].phys.nid = modex_id->phys.nid; opal_output_verbose(50, ompi_mtl_base_framework.framework_output, "logical: global rank=%d pid=%d nid=%d\n", (int)i, maptable[i].phys.pid, maptable[i].phys.nid); } else { *peer_id = *modex_id; } procs[i]->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_PORTALS4] = peer_id; new_found = true; } else { ptl_process_t *proc = (ptl_process_t*) procs[i]->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_PORTALS4]; if (ompi_mtl_portals4.use_logical) { if ((size_t)proc->rank != i) { opal_output_verbose(1, ompi_mtl_base_framework.framework_output, "%s:%d: existing peer and rank don't match\n", __FILE__, __LINE__); return OMPI_ERROR; } maptable[i].phys.pid = modex_id->phys.pid; maptable[i].phys.nid = modex_id->phys.nid; } else if (proc->phys.nid != modex_id->phys.nid || proc->phys.pid != modex_id->phys.pid) { opal_output_verbose(1, ompi_mtl_base_framework.framework_output, "%s:%d: existing peer and modex peer don't match\n", __FILE__, __LINE__); return OMPI_ERROR; } } } if (ompi_mtl_portals4.use_logical) { ret = PtlSetMap(ompi_mtl_portals4.ni_h, nprocs, maptable); if (OMPI_SUCCESS != ret) { opal_output_verbose(1, ompi_mtl_base_framework.framework_output, "%s:%d: logical mapping failed: %d\n", __FILE__, __LINE__, ret); return ret; } opal_output_verbose(1, ompi_mtl_base_framework.framework_output, "logical mapping OK\n"); free(maptable); } portals4_init_interface(); /* activate progress callback */ ret = opal_progress_register(ompi_mtl_portals4_progress); if (OMPI_SUCCESS != ret) { opal_output_verbose(1, ompi_mtl_base_framework.framework_output, "%s:%d: opal_progress_register failed: %d\n", __FILE__, __LINE__, ret); return ret; } #if OMPI_MTL_PORTALS4_FLOW_CONTROL if (new_found) { ret = ompi_mtl_portals4_flowctl_add_procs(me, nprocs, procs); if (OMPI_SUCCESS != ret) { opal_output_verbose(1, ompi_mtl_base_framework.framework_output, "%s:%d: flowctl_add_procs failed: %d\n", __FILE__, __LINE__, ret); return ret; } } #endif return OMPI_SUCCESS; }
/* /!\ Called for each processes /!\ */ static int portals4_init_query(bool enable_progress_threads, bool enable_mpi_threads) { int ret; ptl_md_t md; ptl_me_t me; /* Initialize Portals and create a physical, matching interface */ ret = PtlInit(); if (PTL_OK != ret) { opal_output_verbose(1, ompi_coll_base_framework.framework_output, "%s:%d: PtlInit failed: %d\n", __FILE__, __LINE__, ret); return OMPI_ERROR; } ret = PtlNIInit(PTL_IFACE_DEFAULT, PTL_NI_PHYSICAL | PTL_NI_MATCHING, PTL_PID_ANY, NULL, &mca_coll_portals4_component.ni_limits, &mca_coll_portals4_component.ni_h); if (PTL_OK != ret) { opal_output_verbose(1, ompi_coll_base_framework.framework_output, "%s:%d: PtlNIInit failed: %d\n", __FILE__, __LINE__, ret); return OMPI_ERROR; } ret = PtlGetId(mca_coll_portals4_component.ni_h, &mca_coll_portals4_component.id); if (PTL_OK != ret) { opal_output_verbose(1, ompi_coll_base_framework.framework_output, "%s:%d: PtlGetid failed: %d\n", __FILE__, __LINE__, ret); return OMPI_ERROR; } /* FIX ME: Need to make sure our ID matches with the MTL... */ ret = PtlGetUid(mca_coll_portals4_component.ni_h, &mca_coll_portals4_component.uid); if (PTL_OK != ret) { opal_output_verbose(1, ompi_coll_base_framework.framework_output, "%s:%d: PtlGetUid failed: %d\n", __FILE__, __LINE__, ret); return OMPI_ERROR; } ret = PtlEQAlloc(mca_coll_portals4_component.ni_h, MCA_COLL_PORTALS4_EQ_SIZE, &mca_coll_portals4_component.eq_h); if (PTL_OK != ret) { opal_output_verbose(1, ompi_coll_base_framework.framework_output, "%s:%d: PtlEQAlloc failed: %d\n", __FILE__, __LINE__, ret); return OMPI_ERROR; } ret = PtlPTAlloc(mca_coll_portals4_component.ni_h, 0, mca_coll_portals4_component.eq_h, REQ_COLL_TABLE_ID, &mca_coll_portals4_component.pt_idx); if (PTL_OK != ret) { opal_output_verbose(1, ompi_coll_base_framework.framework_output, "%s:%d: PtlPTAlloc failed: %d\n", __FILE__, __LINE__, ret); return OMPI_ERROR; } if (mca_coll_portals4_component.pt_idx != REQ_COLL_TABLE_ID) { opal_output_verbose(1, ompi_coll_base_framework.framework_output, "%s:%d: PtlPTAlloc return wrong pt_idx: %d\n", __FILE__, __LINE__, mca_coll_portals4_component.finish_pt_idx); return OMPI_ERROR; } ret = PtlPTAlloc(mca_coll_portals4_component.ni_h, 0, mca_coll_portals4_component.eq_h, REQ_COLL_FINISH_TABLE_ID, &mca_coll_portals4_component.finish_pt_idx); if (PTL_OK != ret) { opal_output_verbose(1, ompi_coll_base_framework.framework_output, "%s:%d: PtlPTAlloc failed: %d\n", __FILE__, __LINE__, ret); return OMPI_ERROR; } if (mca_coll_portals4_component.finish_pt_idx != REQ_COLL_FINISH_TABLE_ID) { opal_output_verbose(1, ompi_coll_base_framework.framework_output, "%s:%d: PtlPTAlloc return wrong pt_idx: %d\n", __FILE__, __LINE__, mca_coll_portals4_component.finish_pt_idx); return OMPI_ERROR; } /* Bind MD/MDs across all memory. We prefer (for obvious reasons) to have a single MD across all of memory */ memset(&md, 0, sizeof(ptl_md_t)); md.start = 0; md.length = 0; md.options = 0; md.eq_handle = PTL_EQ_NONE; md.ct_handle = PTL_CT_NONE; ret = PtlMDBind(mca_coll_portals4_component.ni_h, &md, &mca_coll_portals4_component.zero_md_h); if (PTL_OK != ret) { opal_output_verbose(1, ompi_coll_base_framework.framework_output, "%s:%d: PtlMDBind failed: %d\n", __FILE__, __LINE__, ret); return OMPI_ERROR; } md.start = 0; md.length = PTL_SIZE_MAX; md.options = 0; md.eq_handle = PTL_EQ_NONE; md.ct_handle = PTL_CT_NONE; ret = PtlMDBind(mca_coll_portals4_component.ni_h, &md, &mca_coll_portals4_component.data_md_h); if (PTL_OK != ret) { opal_output_verbose(1, ompi_coll_base_framework.framework_output, "%s:%d: PtlMDBind failed: %d\n", __FILE__, __LINE__, ret); return OMPI_ERROR; } OPAL_OUTPUT_VERBOSE((90, ompi_coll_base_framework.framework_output, "PtlMDBind start=%p length=%x\n", md.start, md.length)); /* setup finish ack ME */ me.start = NULL; me.length = 0; me.ct_handle = PTL_CT_NONE; me.min_free = 0; me.uid = mca_coll_portals4_component.uid; me.options = PTL_ME_OP_PUT | PTL_ME_EVENT_LINK_DISABLE | PTL_ME_EVENT_UNLINK_DISABLE; me.match_id.phys.nid = PTL_NID_ANY; me.match_id.phys.pid = PTL_PID_ANY; me.match_bits = 0; me.ignore_bits = 0; ret = PtlMEAppend(mca_coll_portals4_component.ni_h, mca_coll_portals4_component.finish_pt_idx, &me, PTL_PRIORITY_LIST, NULL, &mca_coll_portals4_component.finish_me_h); if (PTL_OK != ret) { opal_output_verbose(1, ompi_coll_base_framework.framework_output, "%s:%d: PtlMEAppend of barrier unexpected failed: %d\n", __FILE__, __LINE__, ret); return OMPI_ERROR; } /* This ME is used for RTR exchange only */ me.start = NULL; me.length = 0; me.ct_handle = PTL_CT_NONE; me.min_free = 0; me.uid = mca_coll_portals4_component.uid; me.options = PTL_ME_OP_PUT | PTL_ME_EVENT_SUCCESS_DISABLE | PTL_ME_EVENT_OVER_DISABLE | PTL_ME_EVENT_LINK_DISABLE | PTL_ME_EVENT_UNLINK_DISABLE; me.match_id.phys.nid = PTL_NID_ANY; me.match_id.phys.pid = PTL_PID_ANY; /* Note : the RTR bit must be set to match this ME, * this allows to discriminate the RTR from data flow * (especially for the Barrier operations) */ COLL_PORTALS4_SET_BITS(me.match_bits, 0, 0, 1, 0, 0, 0); me.ignore_bits = ~COLL_PORTALS4_RTR_MASK; ret = PtlMEAppend(mca_coll_portals4_component.ni_h, mca_coll_portals4_component.pt_idx, &me, PTL_OVERFLOW_LIST, NULL, &mca_coll_portals4_component.unex_me_h); if (PTL_OK != ret) { opal_output_verbose(1, ompi_coll_base_framework.framework_output, "%s:%d: PtlMEAppend of barrier unexpected failed: %d\n", __FILE__, __LINE__, ret); return OMPI_ERROR; } /* activate progress callback */ ret = opal_progress_register(portals4_progress); if (OMPI_SUCCESS != ret) { opal_output_verbose(1, ompi_coll_base_framework.framework_output, "%s:%d: opal_progress_register failed: %d\n", __FILE__, __LINE__, ret); return OMPI_ERROR; } return OMPI_SUCCESS; }
int mca_pml_yalla_init(void) { mxm_context_opts_t *ctx_opts; mxm_ep_opts_t *ep_opts; mxm_error_t error; int rc; PML_YALLA_VERBOSE(1, "mca_pml_yalla_init"); /* Set memory hooks */ if ((OPAL_MEMORY_FREE_SUPPORT | OPAL_MEMORY_MUNMAP_SUPPORT) == ((OPAL_MEMORY_FREE_SUPPORT | OPAL_MEMORY_MUNMAP_SUPPORT) & opal_mem_hooks_support_level())) { PML_YALLA_VERBOSE(1, "enabling on-demand memory mapping"); opal_setenv("MXM_PML_MEM_ON_DEMAND_MAP", "y", false, &environ); ompi_pml_yalla.using_mem_hooks = 1; } else { PML_YALLA_VERBOSE(1, "disabling on-demand memory mapping"); ompi_pml_yalla.using_mem_hooks = 0; } opal_setenv("MXM_PML_SINGLE_THREAD", ompi_mpi_thread_multiple ? "n" : "y", false, &environ); /* Read options */ error = mxm_config_read_opts(&ctx_opts, &ep_opts, "PML", NULL, 0); if (MXM_OK != error) { return OMPI_ERROR; } error = mxm_init(ctx_opts, &ompi_pml_yalla.mxm_context); if (MXM_OK != error) { return OMPI_ERROR; } if (ompi_pml_yalla.using_mem_hooks) { opal_mem_hooks_register_release(mca_pml_yalla_mem_release_cb, NULL); } error = mxm_ep_create(ompi_pml_yalla.mxm_context, ep_opts, &ompi_pml_yalla.mxm_ep); if (MXM_OK != error) { return OMPI_ERROR; } mxm_config_free_context_opts(ctx_opts); mxm_config_free_ep_opts(ep_opts); rc = send_ep_address(); if (rc < 0) { return rc; } OBJ_CONSTRUCT(&ompi_pml_yalla.send_reqs, mca_pml_yalla_freelist_t); OBJ_CONSTRUCT(&ompi_pml_yalla.bsend_reqs, mca_pml_yalla_freelist_t); OBJ_CONSTRUCT(&ompi_pml_yalla.recv_reqs, mca_pml_yalla_freelist_t); OBJ_CONSTRUCT(&ompi_pml_yalla.convs, mca_pml_yalla_freelist_t); opal_progress_register(mca_pml_yalla_progress); PML_YALLA_VERBOSE(2, "created mxm context %p ep %p", ompi_pml_yalla.mxm_context, ompi_pml_yalla.mxm_ep); return OMPI_SUCCESS; }
/* * Open the component */ static int basesmuma_open(void) { /* local variables */ mca_bcol_basesmuma_component_t *cs = &mca_bcol_basesmuma_component; int ret = OMPI_SUCCESS; opal_mutex_t *mutex_ptr; int dummy; /* * Make sure that the number of banks is a power of 2 */ cs->basesmuma_num_mem_banks= roundup_to_power_radix(2,cs->basesmuma_num_mem_banks, &dummy); if ( 0 == cs->basesmuma_num_mem_banks ) { ret=OMPI_ERROR; goto ERROR; } /* * Make sure that the the number of buffers is a power of 2 */ cs->basesmuma_num_regions_per_bank= roundup_to_power_radix(2,cs->basesmuma_num_regions_per_bank, &dummy); if ( 0 == cs->basesmuma_num_regions_per_bank ) { ret=OMPI_ERROR; goto ERROR; } /* Portals initialization */ cs->portals_init = false; cs->portals_info = NULL; /* * initialization */ cs->sm_ctl_structs=NULL; OBJ_CONSTRUCT(&(cs->sm_connections_list),opal_list_t); OBJ_CONSTRUCT(&(cs->nb_admin_barriers),opal_list_t); mutex_ptr= &(cs->nb_admin_barriers_mutex); OBJ_CONSTRUCT(mutex_ptr, opal_mutex_t); /* Control structures object construct */ OBJ_CONSTRUCT(&(cs->ctl_structures), opal_list_t); /* shared memory has not been registered yet */ cs->mpool_inited = false; /* initialize base file names */ cs->clt_base_fname="sm_ctl_mem_"; cs->payload_base_fname="sm_payload_mem_"; /* initialize the size of the shared memory scartch region */ cs->my_scratch_shared_memory_size=getpagesize(); cs->my_scratch_shared_memory=NULL; cs->scratch_offset_from_base_ctl_file=0; /* * register the progess function */ ret=opal_progress_register(bcol_basesmuma_progress); if (MPI_SUCCESS != ret) { opal_output(0, "failed to register the progress function\n"); } return ret; ERROR: return ret; }
int mca_spml_ucx_ctx_create(long options, shmem_ctx_t *ctx) { mca_spml_ucx_ctx_t *ucx_ctx; ucp_worker_params_t params; ucp_ep_params_t ep_params; size_t i, j, nprocs = oshmem_num_procs(); ucs_status_t err; int my_pe = oshmem_my_proc_id(); size_t len; spml_ucx_mkey_t *ucx_mkey; sshmem_mkey_t *mkey; int rc = OSHMEM_ERROR; ucx_ctx = malloc(sizeof(mca_spml_ucx_ctx_t)); ucx_ctx->options = options; params.field_mask = UCP_WORKER_PARAM_FIELD_THREAD_MODE; if (oshmem_mpi_thread_provided == SHMEM_THREAD_SINGLE || options & SHMEM_CTX_PRIVATE || options & SHMEM_CTX_SERIALIZED) { params.thread_mode = UCS_THREAD_MODE_SINGLE; } else { params.thread_mode = UCS_THREAD_MODE_MULTI; } err = ucp_worker_create(mca_spml_ucx.ucp_context, ¶ms, &ucx_ctx->ucp_worker); if (UCS_OK != err) { free(ucx_ctx); return OSHMEM_ERROR; } ucx_ctx->ucp_peers = (ucp_peer_t *) calloc(nprocs, sizeof(*(ucx_ctx->ucp_peers))); if (NULL == ucx_ctx->ucp_peers) { goto error; } if (mca_spml_ucx.active_array.ctxs_count == 0) { opal_progress_register(spml_ucx_ctx_progress); } for (i = 0; i < nprocs; i++) { ep_params.field_mask = UCP_EP_PARAM_FIELD_REMOTE_ADDRESS; ep_params.address = (ucp_address_t *)(mca_spml_ucx.remote_addrs_tbl[i]); err = ucp_ep_create(ucx_ctx->ucp_worker, &ep_params, &ucx_ctx->ucp_peers[i].ucp_conn); if (UCS_OK != err) { SPML_ERROR("ucp_ep_create(proc=%d/%d) failed: %s", i, nprocs, ucs_status_string(err)); goto error2; } for (j = 0; j < MCA_MEMHEAP_SEG_COUNT; j++) { mkey = &memheap_map->mem_segs[j].mkeys_cache[i][0]; ucx_mkey = &ucx_ctx->ucp_peers[i].mkeys[j].key; err = ucp_ep_rkey_unpack(ucx_ctx->ucp_peers[i].ucp_conn, mkey->u.data, &ucx_mkey->rkey); if (UCS_OK != err) { SPML_UCX_ERROR("failed to unpack rkey"); goto error2; } mca_spml_ucx_cache_mkey(ucx_ctx, mkey, j, i); } } SHMEM_MUTEX_LOCK(mca_spml_ucx.internal_mutex); _ctx_add(&mca_spml_ucx.active_array, ucx_ctx); SHMEM_MUTEX_UNLOCK(mca_spml_ucx.internal_mutex); (*ctx) = (shmem_ctx_t)ucx_ctx; return OSHMEM_SUCCESS; error2: for (i = 0; i < nprocs; i++) { if (ucx_ctx->ucp_peers[i].ucp_conn) { ucp_ep_destroy(ucx_ctx->ucp_peers[i].ucp_conn); } } if (ucx_ctx->ucp_peers) free(ucx_ctx->ucp_peers); error: ucp_worker_destroy(ucx_ctx->ucp_worker); free(ucx_ctx); rc = OSHMEM_ERR_OUT_OF_RESOURCE; SPML_ERROR("ctx create FAILED rc=%d", rc); return rc; }
int mca_spml_ucx_add_procs(ompi_proc_t** procs, size_t nprocs) { size_t i, j, n; int rc = OSHMEM_ERROR; int my_rank = oshmem_my_proc_id(); ucs_status_t err; ucp_address_t *wk_local_addr; size_t wk_addr_len; int *wk_roffs = NULL; int *wk_rsizes = NULL; char *wk_raddrs = NULL; ucp_ep_params_t ep_params; mca_spml_ucx_ctx_default.ucp_peers = (ucp_peer_t *) calloc(nprocs, sizeof(*(mca_spml_ucx_ctx_default.ucp_peers))); if (NULL == mca_spml_ucx_ctx_default.ucp_peers) { goto error; } err = ucp_worker_get_address(mca_spml_ucx_ctx_default.ucp_worker, &wk_local_addr, &wk_addr_len); if (err != UCS_OK) { goto error; } dump_address(my_rank, (char *)wk_local_addr, wk_addr_len); rc = oshmem_shmem_xchng(wk_local_addr, wk_addr_len, nprocs, (void **)&wk_raddrs, &wk_roffs, &wk_rsizes); if (rc != OSHMEM_SUCCESS) { goto error; } opal_progress_register(spml_ucx_default_progress); mca_spml_ucx.remote_addrs_tbl = (char **)calloc(nprocs, sizeof(char *)); memset(mca_spml_ucx.remote_addrs_tbl, 0, nprocs * sizeof(char *)); /* Get the EP connection requests for all the processes from modex */ for (n = 0; n < nprocs; ++n) { i = (my_rank + n) % nprocs; dump_address(i, (char *)(wk_raddrs + wk_roffs[i]), wk_rsizes[i]); ep_params.field_mask = UCP_EP_PARAM_FIELD_REMOTE_ADDRESS; ep_params.address = (ucp_address_t *)(wk_raddrs + wk_roffs[i]); err = ucp_ep_create(mca_spml_ucx_ctx_default.ucp_worker, &ep_params, &mca_spml_ucx_ctx_default.ucp_peers[i].ucp_conn); if (UCS_OK != err) { SPML_UCX_ERROR("ucp_ep_create(proc=%zu/%zu) failed: %s", n, nprocs, ucs_status_string(err)); goto error2; } OSHMEM_PROC_DATA(procs[i])->num_transports = 1; OSHMEM_PROC_DATA(procs[i])->transport_ids = spml_ucx_transport_ids; for (j = 0; j < MCA_MEMHEAP_SEG_COUNT; j++) { mca_spml_ucx_ctx_default.ucp_peers[i].mkeys[j].key.rkey = NULL; } mca_spml_ucx.remote_addrs_tbl[i] = (char *)malloc(wk_rsizes[i]); memcpy(mca_spml_ucx.remote_addrs_tbl[i], (char *)(wk_raddrs + wk_roffs[i]), wk_rsizes[i]); } ucp_worker_release_address(mca_spml_ucx_ctx_default.ucp_worker, wk_local_addr); free(wk_raddrs); free(wk_rsizes); free(wk_roffs); SPML_UCX_VERBOSE(50, "*** ADDED PROCS ***"); return OSHMEM_SUCCESS; error2: for (i = 0; i < nprocs; ++i) { if (mca_spml_ucx_ctx_default.ucp_peers[i].ucp_conn) { ucp_ep_destroy(mca_spml_ucx_ctx_default.ucp_peers[i].ucp_conn); } if (mca_spml_ucx.remote_addrs_tbl[i]) { free(mca_spml_ucx.remote_addrs_tbl[i]); } } if (mca_spml_ucx_ctx_default.ucp_peers) free(mca_spml_ucx_ctx_default.ucp_peers); if (mca_spml_ucx.remote_addrs_tbl) free(mca_spml_ucx.remote_addrs_tbl); free(wk_raddrs); free(wk_rsizes); free(wk_roffs); error: rc = OSHMEM_ERR_OUT_OF_RESOURCE; SPML_UCX_ERROR("add procs FAILED rc=%d", rc); return rc; }
int mca_spml_ikrit_add_procs(ompi_proc_t** procs, size_t nprocs) { spml_ikrit_mxm_ep_conn_info_t *ep_info = NULL; spml_ikrit_mxm_ep_conn_info_t *ep_hw_rdma_info = NULL; spml_ikrit_mxm_ep_conn_info_t my_ep_info; size_t mxm_addr_len = MXM_MAX_ADDR_LEN; mxm_error_t err; size_t i, n; int rc = OSHMEM_ERROR; ompi_proc_t *proc_self; int my_rank = oshmem_my_proc_id(); OBJ_CONSTRUCT(&mca_spml_ikrit.active_peers, opal_list_t); /* Allocate connection requests */ ep_info = calloc(sizeof(spml_ikrit_mxm_ep_conn_info_t), nprocs); if (NULL == ep_info) { rc = OSHMEM_ERR_OUT_OF_RESOURCE; goto bail; } if (mca_spml_ikrit.hw_rdma_channel) { ep_hw_rdma_info = calloc(sizeof(spml_ikrit_mxm_ep_conn_info_t), nprocs); if (NULL == ep_hw_rdma_info) { rc = OSHMEM_ERR_OUT_OF_RESOURCE; goto bail; } } mca_spml_ikrit.mxm_peers = (mxm_peer_t *) calloc(nprocs , sizeof(mxm_peer_t)); if (NULL == mca_spml_ikrit.mxm_peers) { rc = OSHMEM_ERR_OUT_OF_RESOURCE; goto bail; } memset(&my_ep_info, 0, sizeof(my_ep_info)); if (mca_spml_ikrit.hw_rdma_channel) { err = mxm_ep_get_address(mca_spml_ikrit.mxm_hw_rdma_ep, &my_ep_info.addr.ep_addr, &mxm_addr_len); if (MXM_OK != err) { orte_show_help("help-oshmem-spml-ikrit.txt", "unable to get endpoint address", true, mxm_error_string(err)); rc = OSHMEM_ERROR; goto bail; } oshmem_shmem_allgather(&my_ep_info, ep_hw_rdma_info, sizeof(spml_ikrit_mxm_ep_conn_info_t)); } err = mxm_ep_get_address(mca_spml_ikrit.mxm_ep, &my_ep_info.addr.ep_addr, &mxm_addr_len); if (MXM_OK != err) { orte_show_help("help-oshmem-spml-ikrit.txt", "unable to get endpoint address", true, mxm_error_string(err)); rc = OSHMEM_ERROR; goto bail; } oshmem_shmem_allgather(&my_ep_info, ep_info, sizeof(spml_ikrit_mxm_ep_conn_info_t)); opal_progress_register(spml_ikrit_progress); /* Get the EP connection requests for all the processes from modex */ for (n = 0; n < nprocs; ++n) { /* mxm 2.0 keeps its connections on a list. Make sure * that list have different order on every rank */ i = (my_rank + n) % nprocs; mxm_peer_construct(&mca_spml_ikrit.mxm_peers[i]); err = mxm_ep_connect(mca_spml_ikrit.mxm_ep, ep_info[i].addr.ep_addr, &mca_spml_ikrit.mxm_peers[i].mxm_conn); if (MXM_OK != err) { SPML_ERROR("MXM returned connect error: %s\n", mxm_error_string(err)); goto bail; } mxm_conn_ctx_set(mca_spml_ikrit.mxm_peers[i].mxm_conn, &mca_spml_ikrit.mxm_peers[i]); if (mca_spml_ikrit.hw_rdma_channel) { err = mxm_ep_connect(mca_spml_ikrit.mxm_hw_rdma_ep, ep_hw_rdma_info[i].addr.ep_addr, &mca_spml_ikrit.mxm_peers[i].mxm_hw_rdma_conn); if (MXM_OK != err) { SPML_ERROR("MXM returned connect error: %s\n", mxm_error_string(err)); goto bail; } } else { mca_spml_ikrit.mxm_peers[i].mxm_hw_rdma_conn = mca_spml_ikrit.mxm_peers[i].mxm_conn; } } if (ep_info) free(ep_info); if (ep_hw_rdma_info) free(ep_hw_rdma_info); if (mca_spml_ikrit.bulk_connect) { /* Need a barrier to ensure remote peers already created connection */ oshmem_shmem_barrier(); mxm_ep_wireup(mca_spml_ikrit.mxm_ep); } proc_self = oshmem_proc_group_find(oshmem_group_all, my_rank); /* identify local processes and change transport to SHM */ for (i = 0; i < nprocs; i++) { if (procs[i]->super.proc_name.jobid != proc_self->super.proc_name.jobid || !OPAL_PROC_ON_LOCAL_NODE(procs[i]->super.proc_flags)) { continue; } if (procs[i] == proc_self) continue; /* use zcopy for put/get via sysv shared memory with fallback to RDMA */ mca_spml_ikrit.mxm_peers[i].ptl_id = MXM_PTL_SHM; } SPML_VERBOSE(50, "*** ADDED PROCS ***"); return OSHMEM_SUCCESS; bail: if (ep_info) free(ep_info); if (ep_hw_rdma_info) free(ep_hw_rdma_info); SPML_ERROR("add procs FAILED rc=%d", rc); return rc; }
static int component_init(bool enable_progress_threads, bool enable_mpi_threads) { ucp_config_t *config = NULL; ucp_params_t context_params; bool progress_registered = false, requests_created = false; int ret = OMPI_SUCCESS; ucs_status_t status; mca_osc_ucx_component.ucp_context = NULL; mca_osc_ucx_component.ucp_worker = NULL; mca_osc_ucx_component.enable_mpi_threads = enable_mpi_threads; status = ucp_config_read("MPI", NULL, &config); if (UCS_OK != status) { opal_output_verbose(1, ompi_osc_base_framework.framework_output, "%s:%d: ucp_config_read failed: %d\n", __FILE__, __LINE__, status); return OMPI_ERROR; } OBJ_CONSTRUCT(&mca_osc_ucx_component.requests, opal_free_list_t); requests_created = true; ret = opal_free_list_init (&mca_osc_ucx_component.requests, sizeof(ompi_osc_ucx_request_t), opal_cache_line_size, OBJ_CLASS(ompi_osc_ucx_request_t), 0, 0, 8, 0, 8, NULL, 0, NULL, NULL, NULL); if (OMPI_SUCCESS != ret) { opal_output_verbose(1, ompi_osc_base_framework.framework_output, "%s:%d: opal_free_list_init failed: %d\n", __FILE__, __LINE__, ret); goto error; } mca_osc_ucx_component.num_incomplete_req_ops = 0; ret = opal_progress_register(progress_callback); progress_registered = true; if (OMPI_SUCCESS != ret) { opal_output_verbose(1, ompi_osc_base_framework.framework_output, "%s:%d: opal_progress_register failed: %d\n", __FILE__, __LINE__, ret); goto error; } /* initialize UCP context */ memset(&context_params, 0, sizeof(ucp_context_h)); context_params.field_mask = UCP_PARAM_FIELD_FEATURES | UCP_PARAM_FIELD_MT_WORKERS_SHARED | UCP_PARAM_FIELD_ESTIMATED_NUM_EPS | UCP_PARAM_FIELD_REQUEST_INIT | UCP_PARAM_FIELD_REQUEST_SIZE; context_params.features = UCP_FEATURE_RMA | UCP_FEATURE_AMO32 | UCP_FEATURE_AMO64; context_params.mt_workers_shared = 0; context_params.estimated_num_eps = ompi_proc_world_size(); context_params.request_init = internal_req_init; context_params.request_size = sizeof(ompi_osc_ucx_internal_request_t); status = ucp_init(&context_params, config, &mca_osc_ucx_component.ucp_context); ucp_config_release(config); if (UCS_OK != status) { opal_output_verbose(1, ompi_osc_base_framework.framework_output, "%s:%d: ucp_init failed: %d\n", __FILE__, __LINE__, status); ret = OMPI_ERROR; goto error; } return ret; error: if (progress_registered) opal_progress_unregister(progress_callback); if (requests_created) OBJ_DESTRUCT(&mca_osc_ucx_component.requests); if (mca_osc_ucx_component.ucp_context) ucp_cleanup(mca_osc_ucx_component.ucp_context); return ret; }
int ompi_mtl_psm2_module_init(int local_rank, int num_local_procs) { psm2_error_t err; psm2_ep_t ep; /* endpoint handle */ psm2_mq_t mq; psm2_epid_t epid; /* unique lid+port identifier */ psm2_uuid_t unique_job_key; struct psm2_ep_open_opts ep_opt; unsigned long long *uu = (unsigned long long *) unique_job_key; char *generated_key; char env_string[256]; int rc; generated_key = getenv("OMPI_MCA_orte_precondition_transports"); memset(uu, 0, sizeof(psm2_uuid_t)); if (!generated_key || (strlen(generated_key) != 33) || sscanf(generated_key, "%016llx-%016llx", &uu[0], &uu[1]) != 2) { opal_show_help("help-mtl-psm2.txt", "no uuid present", true, generated_key ? "could not be parsed from" : "not present in", ompi_process_info.nodename); return OMPI_ERROR; } /* Handle our own errors for opening endpoints */ psm2_error_register_handler(ompi_mtl_psm2.ep, ompi_mtl_psm2_errhandler); /* Setup MPI_LOCALRANKID and MPI_LOCALNRANKS so PSM2 can allocate hardware * contexts correctly. */ snprintf(env_string, sizeof(env_string), "%d", local_rank); setenv("MPI_LOCALRANKID", env_string, 0); snprintf(env_string, sizeof(env_string), "%d", num_local_procs); setenv("MPI_LOCALNRANKS", env_string, 0); /* Setup the endpoint options. */ psm2_ep_open_opts_get_defaults(&ep_opt); ep_opt.timeout = ompi_mtl_psm2.connect_timeout * 1e9; ep_opt.affinity = PSM2_EP_OPEN_AFFINITY_SKIP; /* do not let PSM2 set affinity */ /* Open PSM2 endpoint */ err = psm2_ep_open(unique_job_key, &ep_opt, &ep, &epid); if (err) { opal_show_help("help-mtl-psm2.txt", "unable to open endpoint", true, psm2_error_get_string(err)); return OMPI_ERROR; } /* Future errors are handled by the default error handler */ psm2_error_register_handler(ompi_mtl_psm2.ep, PSM2_ERRHANDLER_DEFAULT); err = psm2_mq_init(ep, 0xffff000000000000ULL, NULL, 0, &mq); if (err) { opal_show_help("help-mtl-psm2.txt", "psm2 init", true, psm2_error_get_string(err)); return OMPI_ERROR; } ompi_mtl_psm2.ep = ep; ompi_mtl_psm2.epid = epid; ompi_mtl_psm2.mq = mq; OPAL_MODEX_SEND(rc, OPAL_PMIX_GLOBAL, &mca_mtl_psm2_component.super.mtl_version, &ompi_mtl_psm2.epid, sizeof(psm2_epid_t)); if (OMPI_SUCCESS != rc) { opal_output(0, "Open MPI couldn't send PSM2 epid to head node process"); return OMPI_ERROR; } /* register the psm2 progress function */ opal_progress_register(ompi_mtl_psm2_progress); return OMPI_SUCCESS; }
int ompi_mtl_psm_module_init(int local_rank, int num_local_procs) { psm_error_t err; psm_ep_t ep; /* endpoint handle */ psm_mq_t mq; psm_epid_t epid; /* unique lid+port identifier */ psm_uuid_t unique_job_key; struct psm_ep_open_opts ep_opt; unsigned long long *uu = (unsigned long long *) unique_job_key; char *generated_key; char env_string[256]; generated_key = getenv("OMPI_MCA_orte_precondition_transports"); memset(uu, 0, sizeof(psm_uuid_t)); if (!generated_key || (strlen(generated_key) != 33) || sscanf(generated_key, "%016llx-%016llx", &uu[0], &uu[1]) != 2) { opal_show_help("help-mtl-psm.txt", "no uuid present", true, generated_key ? "could not be parsed from" : "not present in", ompi_process_info.nodename); return OMPI_ERROR; } /* Handle our own errors for opening endpoints */ psm_error_register_handler(ompi_mtl_psm.ep, ompi_mtl_psm_errhandler); /* Setup MPI_LOCALRANKID and MPI_LOCALNRANKS so PSM can allocate hardware * contexts correctly. */ snprintf(env_string, sizeof(env_string), "%d", local_rank); setenv("MPI_LOCALRANKID", env_string, 0); snprintf(env_string, sizeof(env_string), "%d", num_local_procs); setenv("MPI_LOCALNRANKS", env_string, 0); /* Setup the endpoint options. */ bzero((void*) &ep_opt, sizeof(ep_opt)); ep_opt.timeout = ompi_mtl_psm.connect_timeout * 1e9; ep_opt.unit = ompi_mtl_psm.ib_unit; ep_opt.affinity = PSM_EP_OPEN_AFFINITY_SKIP; /* do not let PSM set affinity */ ep_opt.shm_mbytes = -1; /* Choose PSM defaults */ ep_opt.sendbufs_num = -1; /* Choose PSM defaults */ #if PSM_VERNO >= 0x0101 ep_opt.network_pkey = ompi_mtl_psm.ib_pkey; #endif #if PSM_VERNO >= 0x0107 ep_opt.port = ompi_mtl_psm.ib_port; ep_opt.outsl = ompi_mtl_psm.ib_service_level; #endif #if PSM_VERNO >= 0x010d ep_opt.service_id = ompi_mtl_psm.ib_service_id; ep_opt.path_res_type = ompi_mtl_psm.path_res_type; #endif /* Open PSM endpoint */ err = psm_ep_open(unique_job_key, &ep_opt, &ep, &epid); if (err) { opal_show_help("help-mtl-psm.txt", "unable to open endpoint", true, psm_error_get_string(err)); return OMPI_ERROR; } /* Future errors are handled by the default error handler */ psm_error_register_handler(ompi_mtl_psm.ep, PSM_ERRHANDLER_DEFAULT); err = psm_mq_init(ep, 0xffff000000000000ULL, NULL, 0, &mq); if (err) { opal_show_help("help-mtl-psm.txt", "psm init", true, psm_error_get_string(err)); return OMPI_ERROR; } ompi_mtl_psm.ep = ep; ompi_mtl_psm.epid = epid; ompi_mtl_psm.mq = mq; if (OMPI_SUCCESS != ompi_modex_send( &mca_mtl_psm_component.super.mtl_version, &ompi_mtl_psm.epid, sizeof(psm_epid_t))) { opal_output(0, "Open MPI couldn't send PSM epid to head node process"); return OMPI_ERROR; } /* register the psm progress function */ opal_progress_register(ompi_mtl_psm_progress); return OMPI_SUCCESS; }
static mca_mtl_base_module_t* ompi_mtl_ofi_component_init(bool enable_progress_threads, bool enable_mpi_threads) { int ret, fi_version; struct fi_info *hints; struct fi_info *providers = NULL, *prov = NULL; struct fi_cq_attr cq_attr = {0}; struct fi_av_attr av_attr = {0}; char ep_name[FI_NAME_MAX] = {0}; size_t namelen; /** * Hints to filter providers * See man fi_getinfo for a list of all filters * mode: Select capabilities MTL is prepared to support. * In this case, MTL will pass in context into communication calls * ep_type: reliable datagram operation * caps: Capabilities required from the provider. * Tag matching is specified to implement MPI semantics. * msg_order: Guarantee that messages with same tag are ordered. */ hints = fi_allocinfo(); if (!hints) { opal_output_verbose(1, ompi_mtl_base_framework.framework_output, "%s:%d: Could not allocate fi_info\n", __FILE__, __LINE__); goto error; } hints->mode = FI_CONTEXT; hints->ep_attr->type = FI_EP_RDM; /* Reliable datagram */ hints->caps = FI_TAGGED; /* Tag matching interface */ hints->tx_attr->msg_order = FI_ORDER_SAS; hints->rx_attr->msg_order = FI_ORDER_SAS; hints->domain_attr->threading = FI_THREAD_UNSPEC; if (MTL_OFI_PROG_AUTO == control_progress) { hints->domain_attr->control_progress = FI_PROGRESS_AUTO; } else { hints->domain_attr->control_progress = FI_PROGRESS_MANUAL; } if (MTL_OFI_PROG_MANUAL == data_progress) { hints->domain_attr->data_progress = FI_PROGRESS_MANUAL; } else { hints->domain_attr->data_progress = FI_PROGRESS_AUTO; } if (MTL_OFI_AV_TABLE == av_type) { hints->domain_attr->av_type = FI_AV_TABLE; } else { hints->domain_attr->av_type = FI_AV_MAP; } hints->domain_attr->resource_mgmt = FI_RM_ENABLED; /** * FI_VERSION provides binary backward and forward compatibility support * Specify the version of OFI is coded to, the provider will select struct * layouts that are compatible with this version. */ fi_version = FI_VERSION(1, 0); /** * fi_getinfo: returns information about fabric services for reaching a * remote node or service. this does not necessarily allocate resources. * Pass NULL for name/service because we want a list of providers supported. */ ret = fi_getinfo(fi_version, /* OFI version requested */ NULL, /* Optional name or fabric to resolve */ NULL, /* Optional service name or port to request */ 0ULL, /* Optional flag */ hints, /* In: Hints to filter providers */ &providers); /* Out: List of matching providers */ if (0 != ret) { opal_output_verbose(1, ompi_mtl_base_framework.framework_output, "%s:%d: fi_getinfo failed: %s\n", __FILE__, __LINE__, fi_strerror(-ret)); goto error; } /** * Select a provider from the list returned by fi_getinfo(). */ prov = select_ofi_provider(providers); if (!prov) { opal_output_verbose(1, ompi_mtl_base_framework.framework_output, "%s:%d: select_ofi_provider: no provider found\n", __FILE__, __LINE__); goto error; } /** * Open fabric * The getinfo struct returns a fabric attribute struct that can be used to * instantiate the virtual or physical network. This opens a "fabric * provider". See man fi_fabric for details. */ ret = fi_fabric(prov->fabric_attr, /* In: Fabric attributes */ &ompi_mtl_ofi.fabric, /* Out: Fabric handle */ NULL); /* Optional context for fabric events */ if (0 != ret) { opal_output_verbose(1, ompi_mtl_base_framework.framework_output, "%s:%d: fi_fabric failed: %s\n", __FILE__, __LINE__, fi_strerror(-ret)); goto error; } /** * Create the access domain, which is the physical or virtual network or * hardware port/collection of ports. Returns a domain object that can be * used to create endpoints. See man fi_domain for details. */ ret = fi_domain(ompi_mtl_ofi.fabric, /* In: Fabric object */ prov, /* In: Provider */ &ompi_mtl_ofi.domain, /* Out: Domain oject */ NULL); /* Optional context for domain events */ if (0 != ret) { opal_output_verbose(1, ompi_mtl_base_framework.framework_output, "%s:%d: fi_domain failed: %s\n", __FILE__, __LINE__, fi_strerror(-ret)); goto error; } /** * Create a transport level communication endpoint. To use the endpoint, * it must be bound to completion counters or event queues and enabled, * and the resources consumed by it, such as address vectors, counters, * completion queues, etc. * see man fi_endpoint for more details. */ ret = fi_endpoint(ompi_mtl_ofi.domain, /* In: Domain object */ prov, /* In: Provider */ &ompi_mtl_ofi.ep, /* Out: Endpoint object */ NULL); /* Optional context */ if (0 != ret) { opal_output_verbose(1, ompi_mtl_base_framework.framework_output, "%s:%d: fi_endpoint failed: %s\n", __FILE__, __LINE__, fi_strerror(-ret)); goto error; } /** * Save the maximum inject size. */ ompi_mtl_ofi.max_inject_size = prov->tx_attr->inject_size; /** * Create the objects that will be bound to the endpoint. * The objects include: * - completion queue for events * - address vector of other endpoint addresses * - dynamic memory-spanning memory region */ cq_attr.format = FI_CQ_FORMAT_TAGGED; ret = fi_cq_open(ompi_mtl_ofi.domain, &cq_attr, &ompi_mtl_ofi.cq, NULL); if (ret) { opal_output_verbose(1, ompi_mtl_base_framework.framework_output, "%s:%d: fi_cq_open failed: %s\n", __FILE__, __LINE__, fi_strerror(-ret)); goto error; } /** * The remote fi_addr will be stored in the ofi_endpoint struct. */ av_attr.type = (MTL_OFI_AV_TABLE == av_type) ? FI_AV_TABLE: FI_AV_MAP; ret = fi_av_open(ompi_mtl_ofi.domain, &av_attr, &ompi_mtl_ofi.av, NULL); if (ret) { opal_output_verbose(1, ompi_mtl_base_framework.framework_output, "%s:%d: fi_av_open failed: %s\n", __FILE__, __LINE__, fi_strerror(-ret)); goto error; } /** * Bind the CQ and AV to the endpoint object. */ ret = fi_ep_bind(ompi_mtl_ofi.ep, (fid_t)ompi_mtl_ofi.cq, FI_SEND | FI_RECV); if (0 != ret) { opal_output_verbose(1, ompi_mtl_base_framework.framework_output, "%s:%d: fi_bind CQ-EP failed: %s\n", __FILE__, __LINE__, fi_strerror(-ret)); goto error; } ret = fi_ep_bind(ompi_mtl_ofi.ep, (fid_t)ompi_mtl_ofi.av, 0); if (0 != ret) { opal_output_verbose(1, ompi_mtl_base_framework.framework_output, "%s:%d: fi_bind AV-EP failed: %s\n", __FILE__, __LINE__, fi_strerror(-ret)); goto error; } /** * Enable the endpoint for communication * This commits the bind operations. */ ret = fi_enable(ompi_mtl_ofi.ep); if (0 != ret) { opal_output_verbose(1, ompi_mtl_base_framework.framework_output, "%s:%d: fi_enable failed: %s\n", __FILE__, __LINE__, fi_strerror(-ret)); goto error; } /** * Free providers info since it's not needed anymore. */ fi_freeinfo(hints); hints = NULL; fi_freeinfo(providers); providers = NULL; /** * Get our address and publish it with modex. */ namelen = sizeof(ep_name); ret = fi_getname((fid_t)ompi_mtl_ofi.ep, &ep_name[0], &namelen); if (ret) { opal_output_verbose(1, ompi_mtl_base_framework.framework_output, "%s:%d: fi_getname failed: %s\n", __FILE__, __LINE__, fi_strerror(-ret)); goto error; } OFI_COMPAT_MODEX_SEND(ret, &mca_mtl_ofi_component.super.mtl_version, &ep_name, namelen); if (OMPI_SUCCESS != ret) { opal_output_verbose(1, ompi_mtl_base_framework.framework_output, "%s:%d: modex_send failed: %d\n", __FILE__, __LINE__, ret); goto error; } ompi_mtl_ofi.epnamelen = namelen; /** * Set the ANY_SRC address. */ ompi_mtl_ofi.any_addr = FI_ADDR_UNSPEC; /** * Activate progress callback. */ ret = opal_progress_register(ompi_mtl_ofi_progress_no_inline); if (OMPI_SUCCESS != ret) { opal_output_verbose(1, ompi_mtl_base_framework.framework_output, "%s:%d: opal_progress_register failed: %d\n", __FILE__, __LINE__, ret); goto error; } return &ompi_mtl_ofi.base; error: if (providers) { (void) fi_freeinfo(providers); } if (hints) { (void) fi_freeinfo(hints); } if (ompi_mtl_ofi.av) { (void) fi_close((fid_t)ompi_mtl_ofi.av); } if (ompi_mtl_ofi.cq) { (void) fi_close((fid_t)ompi_mtl_ofi.cq); } if (ompi_mtl_ofi.ep) { (void) fi_close((fid_t)ompi_mtl_ofi.ep); } if (ompi_mtl_ofi.domain) { (void) fi_close((fid_t)ompi_mtl_ofi.domain); } if (ompi_mtl_ofi.fabric) { (void) fi_close((fid_t)ompi_mtl_ofi.fabric); } return NULL; }
/* * Open the component */ static int iboffload_open(void) { int rc; /* local variables */ mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component; IBOFFLOAD_VERBOSE(10, ("Open Iboffload component.\n")); (void) mca_bcol_iboffload_verify_params(); cm->super.priority = 100; cm->super.n_net_contexts = 0; cm->super.network_contexts = NULL; OBJ_CONSTRUCT(&cm->recv_wrs.lock, opal_mutex_t); /* construct lists */ OBJ_CONSTRUCT(&cm->devices, opal_pointer_array_t); rc = opal_pointer_array_init(&cm->devices, 10, INT_MAX, 10); if (OMPI_SUCCESS != rc) { goto close_device; } /* Check MCA parameters */ if (0 != (mca_bcol_iboffload_component.exchange_tree_order & (mca_bcol_iboffload_component.exchange_tree_order - 1))) { IBOFFLOAD_ERROR(("Warning: ibcol_iboffload_exchange_tree_order is %d which is not a power of 2, setting it to 2", mca_bcol_iboffload_component.exchange_tree_order)); mca_bcol_iboffload_component.exchange_tree_order = 2; } /* Pasha: Since we do not have max inline check like in openib, I will put some dummy check here. All mlnx devices support at least 512b */ if (mca_bcol_iboffload_component.max_inline_data > 512) { IBOFFLOAD_ERROR(("Warning the inline %d, is to big and unsupported", mca_bcol_iboffload_component.max_inline_data)); rc = OMPI_ERROR; goto close_device; } /* Register the progress function */ rc = opal_progress_register(mca_bcol_iboffload_component_progress); if (OMPI_SUCCESS != rc) { IBOFFLOAD_ERROR(("Failed to register the progress function" " for iboffload component.\n")); goto close_device; } map_ompi_to_ib_dtype(); map_ompi_to_ib_op_type(); /* The init_done set to true on first component usage */ cm->init_done = false; return OMPI_SUCCESS; close_device: OBJ_DESTRUCT(&cm->devices); OBJ_DESTRUCT(&cm->recv_wrs.lock); return rc; }
int ompi_mtl_mxm_module_init(void) { #if MXM_API < MXM_VERSION(2,0) ompi_mtl_mxm_ep_conn_info_t ep_info; #endif void *ep_address; size_t ep_address_len; mxm_error_t err; uint32_t jobid; uint64_t mxlr; ompi_proc_t **procs; unsigned ptl_bitmap; size_t totps, proc; int lr, nlps; int rc; mxlr = 0; lr = -1; jobid = ompi_mtl_mxm_get_job_id(); if (0 == jobid) { MXM_ERROR("Failed to generate jobid"); return OMPI_ERROR; } if (NULL == (procs = ompi_proc_world(&totps))) { MXM_ERROR("Unable to obtain process list"); return OMPI_ERROR; } if (totps < (size_t)ompi_mtl_mxm.mxm_np) { MXM_VERBOSE(1, "MXM support will be disabled because of total number " "of processes (%lu) is less than the minimum set by the " "mtl_mxm_np MCA parameter (%u)", totps, ompi_mtl_mxm.mxm_np); return OMPI_ERR_NOT_SUPPORTED; } MXM_VERBOSE(1, "MXM support enabled"); if (ORTE_NODE_RANK_INVALID == (lr = ompi_process_info.my_node_rank)) { MXM_ERROR("Unable to obtain local node rank"); return OMPI_ERROR; } nlps = ompi_process_info.num_local_peers + 1; for (proc = 0; proc < totps; proc++) { if (OPAL_PROC_ON_LOCAL_NODE(procs[proc]->proc_flags)) { mxlr = max(mxlr, procs[proc]->proc_name.vpid); } } /* Setup the endpoint options and local addresses to bind to. */ #if MXM_API < MXM_VERSION(2,0) ptl_bitmap = ompi_mtl_mxm.mxm_ctx_opts->ptl_bitmap; #else ptl_bitmap = 0; #endif /* Open MXM endpoint */ err = ompi_mtl_mxm_create_ep(ompi_mtl_mxm.mxm_context, &ompi_mtl_mxm.ep, ptl_bitmap, lr, jobid, mxlr, nlps); if (MXM_OK != err) { opal_show_help("help-mtl-mxm.txt", "unable to create endpoint", true, mxm_error_string(err)); return OMPI_ERROR; } /* * Get address for each PTL on this endpoint, and share it with other ranks. */ #if MXM_API < MXM_VERSION(2,0) if ((ptl_bitmap & MXM_BIT(MXM_PTL_SELF)) && OMPI_SUCCESS != ompi_mtl_mxm_get_ep_address(&ep_info, MXM_PTL_SELF)) { return OMPI_ERROR; } if ((ptl_bitmap & MXM_BIT(MXM_PTL_RDMA)) && OMPI_SUCCESS != ompi_mtl_mxm_get_ep_address(&ep_info, MXM_PTL_RDMA)) { return OMPI_ERROR; } if ((ptl_bitmap & MXM_BIT(MXM_PTL_SHM)) && OMPI_SUCCESS != ompi_mtl_mxm_get_ep_address(&ep_info, MXM_PTL_SHM)) { return OMPI_ERROR; } ep_address = &ep_info; ep_address_len = sizeof(ep_info); #else rc = ompi_mtl_mxm_get_ep_address(&ep_address, &ep_address_len); if (OMPI_SUCCESS != rc) { return rc; } #endif rc = ompi_mtl_mxm_send_ep_address(ep_address, ep_address_len); if (OMPI_SUCCESS != rc) { MXM_ERROR("Modex session failed."); return rc; } #if MXM_API >= MXM_VERSION(2,0) free(ep_address); #endif /* Register the MXM progress function */ opal_progress_register(ompi_mtl_mxm_progress); #if MXM_API >= MXM_VERSION(2,0) if (ompi_mtl_mxm.using_mem_hooks) { opal_mem_hooks_register_release(ompi_mtl_mxm_mem_release_cb, NULL); } #endif return OMPI_SUCCESS; }
int NBC_Schedule_request(NBC_Schedule *schedule, ompi_communicator_t *comm, ompi_coll_libnbc_module_t *module, bool persistent, ompi_request_t **request, void *tmpbuf) { int ret, tmp_tag; bool need_register = false; ompi_coll_libnbc_request_t *handle; /* no operation (e.g. one process barrier)? */ if (((int *)schedule->data)[0] == 0 && schedule->data[sizeof(int)] == 0) { ret = nbc_get_noop_request(persistent, request); if (OMPI_SUCCESS != ret) { return OMPI_ERR_OUT_OF_RESOURCE; } /* update the module->tag here because other processes may have operations * and they may update the module->tag */ OPAL_THREAD_LOCK(&module->mutex); tmp_tag = module->tag--; if (tmp_tag == MCA_COLL_BASE_TAG_NONBLOCKING_END) { tmp_tag = module->tag = MCA_COLL_BASE_TAG_NONBLOCKING_BASE; NBC_DEBUG(2,"resetting tags ...\n"); } OPAL_THREAD_UNLOCK(&module->mutex); OBJ_RELEASE(schedule); free(tmpbuf); return OMPI_SUCCESS; } OMPI_COLL_LIBNBC_REQUEST_ALLOC(comm, persistent, handle); if (NULL == handle) return OMPI_ERR_OUT_OF_RESOURCE; handle->tmpbuf = NULL; handle->req_count = 0; handle->req_array = NULL; handle->comm = comm; handle->schedule = NULL; handle->row_offset = 0; handle->nbc_complete = persistent ? true : false; /******************** Do the tag and shadow comm administration ... ***************/ OPAL_THREAD_LOCK(&module->mutex); tmp_tag = module->tag--; if (tmp_tag == MCA_COLL_BASE_TAG_NONBLOCKING_END) { tmp_tag = module->tag = MCA_COLL_BASE_TAG_NONBLOCKING_BASE; NBC_DEBUG(2,"resetting tags ...\n"); } if (true != module->comm_registered) { module->comm_registered = true; need_register = true; } OPAL_THREAD_UNLOCK(&module->mutex); handle->tag = tmp_tag; /* register progress */ if (need_register) { int32_t tmp = OPAL_THREAD_ADD_FETCH32(&mca_coll_libnbc_component.active_comms, 1); if (tmp == 1) { opal_progress_register(ompi_coll_libnbc_progress); } } handle->comm=comm; /*printf("got module: %lu tag: %i\n", module, module->tag);*/ /******************** end of tag and shadow comm administration ... ***************/ handle->comminfo = module; NBC_DEBUG(3, "got tag %i\n", handle->tag); handle->tmpbuf = tmpbuf; handle->schedule = schedule; *request = (ompi_request_t *) handle; return OMPI_SUCCESS; }
int ompio_io_ompio_file_iread (mca_io_ompio_file_t *fh, void *buf, int count, struct ompi_datatype_t *datatype, ompi_request_t **request) { int ret = OMPI_SUCCESS; mca_ompio_request_t *ompio_req=NULL; ompio_req = OBJ_NEW(mca_ompio_request_t); ompio_req->req_type = MCA_OMPIO_REQUEST_READ; ompio_req->req_ompi.req_state = OMPI_REQUEST_ACTIVE; if ( 0 == count ) { ompi_request_complete (&ompio_req->req_ompi, 0); ompio_req->req_ompi.req_status.MPI_ERROR = OMPI_SUCCESS; ompio_req->req_ompi.req_status._ucount = 0; return OMPI_SUCCESS; } if ( NULL != fh->f_fbtl->fbtl_ipreadv ) { // This fbtl has support for non-blocking operations size_t total_bytes_read = 0; /* total bytes that have been read*/ uint32_t iov_count = 0; struct iovec *decoded_iov = NULL; size_t max_data = 0; int i = 0; /* index into the decoded iovec of the buffer */ int j = 0; /* index into the file vie iovec */ ompi_io_ompio_decode_datatype (fh, datatype, count, buf, &max_data, &decoded_iov, &iov_count); // Non-blocking operations have to occur in a single cycle j = fh->f_index_in_file_view; mca_io_ompio_build_io_array ( fh, 0, // index 1, // no. of cyces max_data, // setting bytes per cycle to match data max_data, iov_count, decoded_iov, &i, &j, &total_bytes_read); if (fh->f_num_of_io_entries) { fh->f_fbtl->fbtl_ipreadv (fh, (ompi_request_t *) ompio_req); } if ( false == mca_io_ompio_progress_is_registered ) { // Lazy initialization of progress function to minimize impact // on other ompi functionality in case its not used. opal_progress_register (mca_io_ompio_component_progress); mca_io_ompio_progress_is_registered=true; } fh->f_num_of_io_entries = 0; if (NULL != fh->f_io_array) { free (fh->f_io_array); fh->f_io_array = NULL; } if (NULL != decoded_iov) { free (decoded_iov); decoded_iov = NULL; } } else { // This fbtl does not support non-blocking operations ompi_status_public_t status; ret = ompio_io_ompio_file_read (fh, buf, count, datatype, &status); ompi_request_complete (&ompio_req->req_ompi, 0); ompio_req->req_ompi.req_status.MPI_ERROR = ret; ompio_req->req_ompi.req_status._ucount = status._ucount; } *request = (ompi_request_t *) ompio_req; return ret; }
int mca_spml_ucx_add_procs(oshmem_proc_t** procs, size_t nprocs) { size_t i, n; int rc = OSHMEM_ERROR; int my_rank = oshmem_my_proc_id(); ucs_status_t err; ucp_address_t *wk_local_addr; size_t wk_addr_len; int *wk_roffs, *wk_rsizes; char *wk_raddrs; mca_spml_ucx.ucp_peers = (ucp_peer_t *) calloc(nprocs, sizeof(*(mca_spml_ucx.ucp_peers))); if (NULL == mca_spml_ucx.ucp_peers) { goto error; } err = ucp_worker_get_address(mca_spml_ucx.ucp_worker, &wk_local_addr, &wk_addr_len); if (err != UCS_OK) { goto error; } dump_address(my_rank, (char *)wk_local_addr, wk_addr_len); rc = oshmem_shmem_xchng(wk_local_addr, wk_addr_len, nprocs, (void **)&wk_raddrs, &wk_roffs, &wk_rsizes); if (rc != OSHMEM_SUCCESS) { goto error; } opal_progress_register(spml_ucx_progress); /* Get the EP connection requests for all the processes from modex */ for (n = 0; n < nprocs; ++n) { i = (my_rank + n) % nprocs; dump_address(i, (char *)(wk_raddrs + wk_roffs[i]), wk_rsizes[i]); err = ucp_ep_create(mca_spml_ucx.ucp_worker, (ucp_address_t *)(wk_raddrs + wk_roffs[i]), &mca_spml_ucx.ucp_peers[i].ucp_conn); if (UCS_OK != err) { SPML_ERROR("ucp_ep_create failed!!!\n"); goto error2; } procs[i]->num_transports = 1; procs[i]->transport_ids = spml_ucx_transport_ids; } ucp_worker_release_address(mca_spml_ucx.ucp_worker, wk_local_addr); free(wk_raddrs); free(wk_rsizes); free(wk_roffs); SPML_VERBOSE(50, "*** ADDED PROCS ***"); return OSHMEM_SUCCESS; error2: for (i = 0; i < nprocs; ++i) { if (mca_spml_ucx.ucp_peers[i].ucp_conn) { ucp_ep_destroy(mca_spml_ucx.ucp_peers[i].ucp_conn); } } if (mca_spml_ucx.ucp_peers) free(mca_spml_ucx.ucp_peers); if (wk_raddrs) free(wk_raddrs); if (wk_rsizes) free(wk_rsizes); if (wk_roffs) free(wk_roffs); if (mca_spml_ucx.ucp_peers) free(mca_spml_ucx.ucp_peers); error: rc = OSHMEM_ERR_OUT_OF_RESOURCE; SPML_ERROR("add procs FAILED rc=%d", rc); return rc; }
static int component_init(bool enable_progress_threads, bool enable_mpi_threads) { int ret; ptl_ni_limits_t actual; ret = PtlInit(); if (PTL_OK != ret) { opal_output_verbose(1, ompi_osc_base_framework.framework_output, "%s:%d: PtlInit failed: %d\n", __FILE__, __LINE__, ret); return OMPI_ERROR; } ret = PtlNIInit(PTL_IFACE_DEFAULT, PTL_NI_PHYSICAL | PTL_NI_MATCHING, PTL_PID_ANY, NULL, &actual, &mca_osc_portals4_component.matching_ni_h); if (PTL_OK != ret) { opal_output_verbose(1, ompi_osc_base_framework.framework_output, "%s:%d: PtlNIInit failed: %d\n", __FILE__, __LINE__, ret); return ret; } /* BWB: FIX ME: Need to make sure our ID matches with the MTL... */ mca_osc_portals4_component.matching_atomic_max = actual.max_atomic_size; mca_osc_portals4_component.matching_fetch_atomic_max = actual.max_fetch_atomic_size; mca_osc_portals4_component.matching_atomic_ordered_size = MAX(actual.max_waw_ordered_size, actual.max_war_ordered_size); ret = PtlEQAlloc(mca_osc_portals4_component.matching_ni_h, 4096, &mca_osc_portals4_component.matching_eq_h); if (PTL_OK != ret) { opal_output_verbose(1, ompi_osc_base_framework.framework_output, "%s:%d: PtlEQAlloc failed: %d\n", __FILE__, __LINE__, ret); return ret; } ret = PtlPTAlloc(mca_osc_portals4_component.matching_ni_h, 0, mca_osc_portals4_component.matching_eq_h, 4, &mca_osc_portals4_component.matching_pt_idx); if (PTL_OK != ret) { opal_output_verbose(1, ompi_osc_base_framework.framework_output, "%s:%d: PtlPTAlloc failed: %d\n", __FILE__, __LINE__, ret); return ret; } OBJ_CONSTRUCT(&mca_osc_portals4_component.requests, opal_free_list_t); ret = opal_free_list_init (&mca_osc_portals4_component.requests, sizeof(ompi_osc_portals4_request_t), opal_cache_line_size, OBJ_CLASS(ompi_osc_portals4_request_t), 0, 0, 8, 0, 8, NULL, 0, NULL, NULL, NULL); if (OMPI_SUCCESS != ret) { opal_output_verbose(1, ompi_osc_base_framework.framework_output, "%s:%d: opal_free_list_init failed: %d\n", __FILE__, __LINE__, ret); return ret; } ret = opal_progress_register(progress_callback); if (OMPI_SUCCESS != ret) { opal_output_verbose(1, ompi_osc_base_framework.framework_output, "%s:%d: opal_progress_register failed: %d\n", __FILE__, __LINE__, ret); return ret; } return OMPI_SUCCESS; }
/* * Invoked when there's a new communicator that has been created. * Look at the communicator and decide which set of functions and * priority we want to return. */ mca_coll_base_module_t * mca_coll_hcoll_comm_query(struct ompi_communicator_t *comm, int *priority) { mca_coll_base_module_t *module; mca_coll_hcoll_module_t *hcoll_module; ompi_attribute_fn_ptr_union_t del_fn; ompi_attribute_fn_ptr_union_t copy_fn; mca_coll_hcoll_component_t *cm; int err; int rc; cm = &mca_coll_hcoll_component; *priority = 0; module = NULL; if (!cm->hcoll_enable) { return NULL; } if (OMPI_COMM_IS_INTER(comm) || ompi_comm_size(comm) < cm->hcoll_np || ompi_comm_size(comm) < 2) { return NULL; } if (!cm->libhcoll_initialized) { /* libhcoll should be initialized here since current implmentation of mxm bcol in libhcoll needs world_group fully functional during init world_group, i.e. ompi_comm_world, is not ready at hcoll component open call */ opal_progress_register(mca_coll_hcoll_progress); HCOL_VERBOSE(10,"Calling hcoll_init();"); #if HCOLL_API >= HCOLL_VERSION(3,2) hcoll_read_init_opts(&cm->init_opts); cm->init_opts->base_tag = MCA_COLL_BASE_TAG_HCOLL_BASE; cm->init_opts->max_tag = mca_pml.pml_max_tag; cm->init_opts->enable_thread_support = ompi_mpi_thread_multiple; rc = hcoll_init_with_opts(&cm->init_opts); #else hcoll_set_runtime_tag_offset(MCA_COLL_BASE_TAG_HCOLL_BASE, mca_pml.pml_max_tag); rc = hcoll_init(); #endif if (HCOLL_SUCCESS != rc) { cm->hcoll_enable = 0; opal_progress_unregister(mca_coll_hcoll_progress); HCOL_ERROR("Hcol library init failed"); return NULL; } #if HCOLL_API >= HCOLL_VERSION(3,2) if (cm->using_mem_hooks && cm->init_opts->mem_hook_needed) { #else if (cm->using_mem_hooks && hcoll_check_mem_release_cb_needed()) { #endif opal_mem_hooks_register_release(mca_coll_hcoll_mem_release_cb, NULL); } else { cm->using_mem_hooks = 0; } copy_fn.attr_communicator_copy_fn = (MPI_Comm_internal_copy_attr_function*) MPI_COMM_NULL_COPY_FN; del_fn.attr_communicator_delete_fn = hcoll_comm_attr_del_fn; err = ompi_attr_create_keyval(COMM_ATTR, copy_fn, del_fn, &hcoll_comm_attr_keyval, NULL ,0, NULL); if (OMPI_SUCCESS != err) { cm->hcoll_enable = 0; hcoll_finalize(); opal_progress_unregister(mca_coll_hcoll_progress); HCOL_ERROR("Hcol comm keyval create failed"); return NULL; } if (mca_coll_hcoll_component.derived_types_support_enabled) { copy_fn.attr_datatype_copy_fn = (MPI_Type_internal_copy_attr_function *) MPI_TYPE_NULL_COPY_FN; del_fn.attr_datatype_delete_fn = hcoll_type_attr_del_fn; err = ompi_attr_create_keyval(TYPE_ATTR, copy_fn, del_fn, &hcoll_type_attr_keyval, NULL ,0, NULL); if (OMPI_SUCCESS != err) { cm->hcoll_enable = 0; hcoll_finalize(); opal_progress_unregister(mca_coll_hcoll_progress); HCOL_ERROR("Hcol type keyval create failed"); return NULL; } } OBJ_CONSTRUCT(&cm->dtypes, opal_free_list_t); opal_free_list_init(&cm->dtypes, sizeof(mca_coll_hcoll_dtype_t), 8, OBJ_CLASS(mca_coll_hcoll_dtype_t), 0, 0, 32, -1, 32, NULL, 0, NULL, NULL, NULL); } hcoll_module = OBJ_NEW(mca_coll_hcoll_module_t); if (!hcoll_module) { if (!cm->libhcoll_initialized) { cm->hcoll_enable = 0; hcoll_finalize(); opal_progress_unregister(mca_coll_hcoll_progress); } return NULL; } hcoll_module->comm = comm; HCOL_VERBOSE(10,"Creating hcoll_context for comm %p, comm_id %d, comm_size %d", (void*)comm,comm->c_contextid,ompi_comm_size(comm)); hcoll_module->hcoll_context = hcoll_create_context((rte_grp_handle_t)comm); if (NULL == hcoll_module->hcoll_context) { HCOL_VERBOSE(1,"hcoll_create_context returned NULL"); OBJ_RELEASE(hcoll_module); if (!cm->libhcoll_initialized) { cm->hcoll_enable = 0; hcoll_finalize(); opal_progress_unregister(mca_coll_hcoll_progress); } return NULL; } hcoll_module->super.coll_module_enable = mca_coll_hcoll_module_enable; hcoll_module->super.coll_barrier = hcoll_collectives.coll_barrier ? mca_coll_hcoll_barrier : NULL; hcoll_module->super.coll_bcast = hcoll_collectives.coll_bcast ? mca_coll_hcoll_bcast : NULL; hcoll_module->super.coll_allgather = hcoll_collectives.coll_allgather ? mca_coll_hcoll_allgather : NULL; hcoll_module->super.coll_allgatherv = hcoll_collectives.coll_allgatherv ? mca_coll_hcoll_allgatherv : NULL; hcoll_module->super.coll_allreduce = hcoll_collectives.coll_allreduce ? mca_coll_hcoll_allreduce : NULL; hcoll_module->super.coll_alltoall = hcoll_collectives.coll_alltoall ? mca_coll_hcoll_alltoall : NULL; hcoll_module->super.coll_alltoallv = hcoll_collectives.coll_alltoallv ? mca_coll_hcoll_alltoallv : NULL; hcoll_module->super.coll_gatherv = hcoll_collectives.coll_gatherv ? mca_coll_hcoll_gatherv : NULL; hcoll_module->super.coll_reduce = hcoll_collectives.coll_reduce ? mca_coll_hcoll_reduce : NULL; hcoll_module->super.coll_ibarrier = hcoll_collectives.coll_ibarrier ? mca_coll_hcoll_ibarrier : NULL; hcoll_module->super.coll_ibcast = hcoll_collectives.coll_ibcast ? mca_coll_hcoll_ibcast : NULL; hcoll_module->super.coll_iallgather = hcoll_collectives.coll_iallgather ? mca_coll_hcoll_iallgather : NULL; #if HCOLL_API >= HCOLL_VERSION(3,5) hcoll_module->super.coll_iallgatherv = hcoll_collectives.coll_iallgatherv ? mca_coll_hcoll_iallgatherv : NULL; #else hcoll_module->super.coll_iallgatherv = NULL; #endif hcoll_module->super.coll_iallreduce = hcoll_collectives.coll_iallreduce ? mca_coll_hcoll_iallreduce : NULL; #if HCOLL_API >= HCOLL_VERSION(3,5) hcoll_module->super.coll_ireduce = hcoll_collectives.coll_ireduce ? mca_coll_hcoll_ireduce : NULL; #else hcoll_module->super.coll_ireduce = NULL; #endif hcoll_module->super.coll_gather = /*hcoll_collectives.coll_gather ? mca_coll_hcoll_gather :*/ NULL; hcoll_module->super.coll_igatherv = hcoll_collectives.coll_igatherv ? mca_coll_hcoll_igatherv : NULL; hcoll_module->super.coll_ialltoall = /*hcoll_collectives.coll_ialltoall ? mca_coll_hcoll_ialltoall : */ NULL; #if HCOLL_API >= HCOLL_VERSION(3,7) hcoll_module->super.coll_ialltoallv = hcoll_collectives.coll_ialltoallv ? mca_coll_hcoll_ialltoallv : NULL; #else hcoll_module->super.coll_ialltoallv = NULL; #endif *priority = cm->hcoll_priority; module = &hcoll_module->super; if (!cm->libhcoll_initialized) { cm->libhcoll_initialized = true; } return module; } OBJ_CLASS_INSTANCE(mca_coll_hcoll_module_t, mca_coll_base_module_t, mca_coll_hcoll_module_construct, mca_coll_hcoll_module_destruct);