static int ompi_comm_allreduce_getnextcid (ompi_comm_request_t *request)
{
    ompi_comm_cid_context_t *context = (ompi_comm_cid_context_t *) request->context;
    int64_t my_id = ((int64_t) ompi_comm_get_cid (context->comm) << 32 | context->pml_tag);
    ompi_request_t *subreq;
    bool flag;
    int ret;

    if (OPAL_THREAD_TRYLOCK(&ompi_cid_lock)) {
        /* lock is busy; reschedule instead of blocking in the progress loop */
        return ompi_comm_request_schedule_append (request, ompi_comm_allreduce_getnextcid, NULL, 0);
    }

    if (ompi_comm_cid_lowest_id < my_id) {
        /* a lower id currently owns the CID allocator; back off and retry so the
         * globally lowest id always makes progress (avoids deadlock) */
        OPAL_THREAD_UNLOCK(&ompi_cid_lock);
        return ompi_comm_request_schedule_append (request, ompi_comm_allreduce_getnextcid, NULL, 0);
    }

    ompi_comm_cid_lowest_id = my_id;

    /* claim the lowest free slot in ompi_mpi_communicators at or above
     * context->start as this process' local CID candidate */
    flag = false;
    context->nextlocal_cid = mca_pml.pml_max_contextid;
    for (unsigned int i = context->start ; i < mca_pml.pml_max_contextid ; ++i) {
        flag = opal_pointer_array_test_and_set_item (&ompi_mpi_communicators, i, context->comm);
        if (true == flag) {
            context->nextlocal_cid = i;
            break;
        }
    }

    /* start a non-blocking allreduce to agree on the MAX of all local candidates */
    ret = context->allreduce_fn (&context->nextlocal_cid, &context->nextcid, 1,
                                 MPI_MAX, context, &subreq);
    if (OMPI_SUCCESS != ret) {
        ompi_comm_cid_lowest_id = INT64_MAX;
        OPAL_THREAD_UNLOCK(&ompi_cid_lock);
        return ret;
    }

    if ((unsigned int) context->nextlocal_cid == mca_pml.pml_max_contextid) {
        /* this process ran out of CIDs; release the slot if one was claimed */
        if (flag) {
            opal_pointer_array_set_item (&ompi_mpi_communicators, context->nextlocal_cid, NULL);
        }

        ompi_comm_cid_lowest_id = INT64_MAX;
        OPAL_THREAD_UNLOCK(&ompi_cid_lock);
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    OPAL_THREAD_UNLOCK(&ompi_cid_lock);

    /* next we want to verify that the resulting commid is ok */
    return ompi_comm_request_schedule_append (request, ompi_comm_checkcid, &subreq, 1);
}
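/*
 * Illustration (not part of the component): the agreement above relies on a
 * simple rule -- every process proposes its lowest locally free CID, the
 * allreduce keeps the MAX, and ompi_comm_checkcid later verifies that the
 * agreed CID is actually free everywhere. A toy, single-process sketch of
 * that rule follows; the busy[][] tables are made-up stand-ins for each
 * rank's view of ompi_mpi_communicators.
 */
#include <stdbool.h>
#include <stdio.h>

#define NRANKS 3
#define NCIDS  8

/* busy[r][c] == true means CID c is taken on rank r (fabricated example data) */
static bool busy[NRANKS][NCIDS] = {
    { true,  true,  false, false, false, false, false, false },
    { true,  false, true,  false, false, false, false, false },
    { true,  true,  true,  false, false, false, false, false },
};

int main(void)
{
    int agreed = 0;

    /* each rank proposes its lowest free CID; the allreduce keeps the MAX */
    for (int r = 0; r < NRANKS; ++r) {
        int local = NCIDS;                  /* sentinel: this rank ran out */
        for (int c = 0; c < NCIDS; ++c) {
            if (!busy[r][c]) { local = c; break; }
        }
        if (local > agreed) agreed = local;
    }

    /* the checkcid step: the agreed CID must be free on every rank */
    bool ok = (agreed < NCIDS);
    for (int r = 0; ok && r < NRANKS; ++r) {
        ok = !busy[r][agreed];
    }

    printf("agreed CID = %d, usable = %s\n", agreed, ok ? "yes" : "no");
    return 0;
}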
static int ompi_osc_pt2pt_dt_send_complete (ompi_request_t *request)
{
    ompi_datatype_t *datatype = (ompi_datatype_t *) request->req_complete_cb_data;
    ompi_osc_pt2pt_module_t *module = NULL;

    OMPI_DATATYPE_RELEASE(datatype);

    OPAL_THREAD_LOCK(&mca_osc_pt2pt_component.lock);
    (void) opal_hash_table_get_value_uint32(&mca_osc_pt2pt_component.modules,
                                            ompi_comm_get_cid(request->req_mpi_object.comm),
                                            (void **) &module);
    OPAL_THREAD_UNLOCK(&mca_osc_pt2pt_component.lock);
    assert (NULL != module);

    ompi_request_free (&request);

    return 1;
}
static int ompi_osc_pt2pt_dt_send_complete (ompi_request_t *request)
{
    ompi_datatype_t *datatype = (ompi_datatype_t *) request->req_complete_cb_data;
    ompi_osc_pt2pt_module_t *module = NULL;

    OBJ_RELEASE(datatype);

    OPAL_THREAD_LOCK(&mca_osc_pt2pt_component.lock);
    (void) opal_hash_table_get_value_uint32(&mca_osc_pt2pt_component.modules,
                                            ompi_comm_get_cid(request->req_mpi_object.comm),
                                            (void **) &module);
    OPAL_THREAD_UNLOCK(&mca_osc_pt2pt_component.lock);
    assert (NULL != module);

    /* put this request on the garbage collection list */
    osc_pt2pt_gc_add_request (module, request);

    return OMPI_SUCCESS;
}
static int component_select(struct ompi_win_t *win, void **base, size_t size, int disp_unit,
                            struct ompi_communicator_t *comm, struct ompi_info_t *info,
                            int flavor, int *model)
{
    ompi_osc_portals4_module_t *module = NULL;
    int ret = OMPI_ERROR;
    int tmp;
    ptl_md_t md;
    ptl_me_t me;
    char *name;

    if (MPI_WIN_FLAVOR_SHARED == flavor) return OMPI_ERR_NOT_SUPPORTED;

    /* create module structure */
    module = (ompi_osc_portals4_module_t*)
        calloc(1, sizeof(ompi_osc_portals4_module_t));
    if (NULL == module) return OMPI_ERR_TEMP_OUT_OF_RESOURCE;

    /* fill in the function pointer part */
    memcpy(module, &ompi_osc_portals4_module_template,
           sizeof(ompi_osc_base_module_t));

    /* fill in our part */
    if (MPI_WIN_FLAVOR_ALLOCATE == flavor) {
        module->free_after = *base = malloc(size);
        if (NULL == *base) goto error;
    } else {
        module->free_after = NULL;
    }

    ret = ompi_comm_dup(comm, &module->comm);
    if (OMPI_SUCCESS != ret) goto error;

    opal_output_verbose(1, ompi_osc_base_framework.framework_output,
                        "portals4 component creating window with id %d",
                        ompi_comm_get_cid(module->comm));

    asprintf(&name, "portals4 window %d", ompi_comm_get_cid(module->comm));
    ompi_win_set_name(win, name);
    free(name);

    /* share everyone's displacement units.  Only do an allgather if
       strictly necessary, since it requires O(p) state. */
    tmp = disp_unit;
    ret = module->comm->c_coll.coll_bcast(&tmp, 1, MPI_INT, 0,
                                          module->comm,
                                          module->comm->c_coll.coll_bcast_module);
    if (OMPI_SUCCESS != ret) {
        opal_output_verbose(1, ompi_osc_base_framework.framework_output,
                            "%s:%d: MPI_Bcast failed: %d\n",
                            __FILE__, __LINE__, ret);
        goto error;
    }
    tmp = (tmp == disp_unit) ? 1 : 0;
    ret = module->comm->c_coll.coll_allreduce(MPI_IN_PLACE, &tmp, 1, MPI_INT, MPI_LAND,
                                              module->comm,
                                              module->comm->c_coll.coll_allreduce_module);
    if (OMPI_SUCCESS != ret) goto error;
    if (tmp == 1) {
        module->disp_unit = disp_unit;
        module->disp_units = NULL;
    } else {
        module->disp_unit = -1;
        module->disp_units = malloc(sizeof(int) * ompi_comm_size(module->comm));
        ret = module->comm->c_coll.coll_allgather(&disp_unit, 1, MPI_INT,
                                                  module->disp_units, 1, MPI_INT,
                                                  module->comm,
                                                  module->comm->c_coll.coll_allgather_module);
        if (OMPI_SUCCESS != ret) goto error;
    }

    module->ni_h = mca_osc_portals4_component.matching_ni_h;
    module->pt_idx = mca_osc_portals4_component.matching_pt_idx;

    ret = PtlCTAlloc(module->ni_h, &(module->ct_h));
    if (PTL_OK != ret) {
        opal_output_verbose(1, ompi_osc_base_framework.framework_output,
                            "%s:%d: PtlCTAlloc failed: %d\n",
                            __FILE__, __LINE__, ret);
        goto error;
    }

    md.start = 0;
    md.length = PTL_SIZE_MAX;
    md.options = PTL_MD_EVENT_SUCCESS_DISABLE | PTL_MD_EVENT_CT_REPLY | PTL_MD_EVENT_CT_ACK;
    md.eq_handle = mca_osc_portals4_component.matching_eq_h;
    md.ct_handle = module->ct_h;
    ret = PtlMDBind(module->ni_h, &md, &module->md_h);
    if (PTL_OK != ret) {
        opal_output_verbose(1, ompi_osc_base_framework.framework_output,
                            "%s:%d: PtlMDBind failed: %d\n",
                            __FILE__, __LINE__, ret);
        goto error;
    }

    md.start = 0;
    md.length = PTL_SIZE_MAX;
    md.options = PTL_MD_EVENT_CT_REPLY | PTL_MD_EVENT_CT_ACK;
    md.eq_handle = mca_osc_portals4_component.matching_eq_h;
    md.ct_handle = module->ct_h;
    ret = PtlMDBind(module->ni_h, &md, &module->req_md_h);
    if (PTL_OK != ret) {
        opal_output_verbose(1, ompi_osc_base_framework.framework_output,
                            "%s:%d: PtlMDBind failed: %d\n",
                            __FILE__, __LINE__, ret);
        goto error;
    }

    if (MPI_WIN_FLAVOR_DYNAMIC == flavor) {
        me.start = 0;
        me.length = PTL_SIZE_MAX;
    } else {
        me.start = *base;
        me.length = size;
    }
    me.ct_handle = PTL_CT_NONE;
    me.uid = PTL_UID_ANY;
    me.options = PTL_ME_OP_PUT | PTL_ME_OP_GET |
        PTL_ME_NO_TRUNCATE | PTL_ME_EVENT_SUCCESS_DISABLE;
    me.match_id.phys.nid = PTL_NID_ANY;
    me.match_id.phys.pid = PTL_PID_ANY;
    me.match_bits = module->comm->c_contextid;
    me.ignore_bits = 0;
    ret = PtlMEAppend(module->ni_h, module->pt_idx, &me, PTL_PRIORITY_LIST,
                      NULL, &module->data_me_h);
    if (PTL_OK != ret) {
        opal_output_verbose(1, ompi_osc_base_framework.framework_output,
                            "%s:%d: PtlMEAppend failed: %d\n",
                            __FILE__, __LINE__, ret);
        goto error;
    }

    me.start = &module->state;
    me.length = sizeof(module->state);
    me.ct_handle = PTL_CT_NONE;
    me.uid = PTL_UID_ANY;
    me.options = PTL_ME_OP_PUT | PTL_ME_OP_GET |
        PTL_ME_NO_TRUNCATE | PTL_ME_EVENT_SUCCESS_DISABLE;
    me.match_id.phys.nid = PTL_NID_ANY;
    me.match_id.phys.pid = PTL_PID_ANY;
    me.match_bits = module->comm->c_contextid | OSC_PORTALS4_MB_CONTROL;
    me.ignore_bits = 0;
    ret = PtlMEAppend(module->ni_h, module->pt_idx, &me, PTL_PRIORITY_LIST,
                      NULL, &module->control_me_h);
    if (PTL_OK != ret) {
        opal_output_verbose(1, ompi_osc_base_framework.framework_output,
                            "%s:%d: PtlMEAppend failed: %d\n",
                            __FILE__, __LINE__, ret);
        goto error;
    }

    module->opcount = 0;
    module->match_bits = module->comm->c_contextid;
    module->atomic_max = (check_config_value_equal("accumulate_ordering", info, "none")) ?
        mca_osc_portals4_component.matching_atomic_max :
        MIN(mca_osc_portals4_component.matching_atomic_max,
            mca_osc_portals4_component.matching_atomic_ordered_size);
    module->fetch_atomic_max = (check_config_value_equal("accumulate_ordering", info, "none")) ?
        mca_osc_portals4_component.matching_fetch_atomic_max :
        MIN(mca_osc_portals4_component.matching_fetch_atomic_max,
            mca_osc_portals4_component.matching_atomic_ordered_size);
    module->zero = 0;
    module->one = 1;
    module->start_group = NULL;
    module->post_group = NULL;
    module->state.post_count = 0;
    module->state.complete_count = 0;
    if (check_config_value_bool("no_locks", info)) {
        module->state.lock = LOCK_ILLEGAL;
    } else {
        module->state.lock = LOCK_UNLOCKED;
    }
    OBJ_CONSTRUCT(&module->outstanding_locks, opal_list_t);
    module->passive_target_access_epoch = false;

#if OPAL_ASSEMBLY_ARCH == OPAL_AMD64 || OPAL_ASSEMBLY_ARCH == OPAL_IA32
    *model = MPI_WIN_UNIFIED;
#else
    *model = MPI_WIN_SEPARATE;
#endif

    win->w_osc_module = &module->super;

    PtlAtomicSync();

    /* Make sure that everyone's ready to receive. */
    module->comm->c_coll.coll_barrier(module->comm,
                                      module->comm->c_coll.coll_barrier_module);

    return OMPI_SUCCESS;

 error:
    /* BWB: FIX ME: This is all wrong... */
    if (0 != module->ct_h) PtlCTFree(module->ct_h);
    if (0 != module->data_me_h) PtlMEUnlink(module->data_me_h);
    if (0 != module->req_md_h) PtlMDRelease(module->req_md_h);
    if (0 != module->md_h) PtlMDRelease(module->md_h);
    if (NULL != module->comm) ompi_comm_free(&module->comm);
    if (NULL != module) free(module);

    return ret;
}
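/*
 * Illustration (not part of the component): both MEs above match on the
 * communicator's context id, and the control ME additionally ORs in
 * OSC_PORTALS4_MB_CONTROL, so data and control traffic share one portal
 * table entry but land in different buffers. The sketch below shows that
 * bit-multiplexing idea; the flag's bit position here is made up, not the
 * real OSC_PORTALS4_MB_CONTROL value.
 */
#include <stdint.h>
#include <stdio.h>

/* hypothetical stand-in for OSC_PORTALS4_MB_CONTROL: one reserved high bit */
#define MB_CONTROL (UINT64_C(1) << 63)

int main(void)
{
    uint64_t cid          = 42;               /* communicator context id */
    uint64_t data_bits    = cid;              /* data ME match bits */
    uint64_t control_bits = cid | MB_CONTROL; /* control ME match bits */

    /* with ignore_bits == 0, a sender picks the stream by setting the flag */
    printf("data ME matches 0x%016llx, control ME matches 0x%016llx\n",
           (unsigned long long) data_bits, (unsigned long long) control_bits);
    return 0;
}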
int ompi_osc_pt2pt_free(ompi_win_t *win)
{
    int ret = OMPI_SUCCESS;
    ompi_osc_pt2pt_module_t *module = GET_MODULE(win);

    if (NULL == module) {
        return OMPI_SUCCESS;
    }

    if (NULL != module->comm) {
        opal_output_verbose(1, ompi_osc_base_framework.framework_output,
                            "pt2pt component destroying window with id %d",
                            ompi_comm_get_cid(module->comm));

        /* finish with a barrier */
        if (ompi_group_size(win->w_group) > 1) {
            ret = module->comm->c_coll.coll_barrier(module->comm,
                                                    module->comm->c_coll.coll_barrier_module);
        }

        /* remove from component information */
        OPAL_THREAD_SCOPED_LOCK(&mca_osc_pt2pt_component.lock,
                                opal_hash_table_remove_value_uint32(&mca_osc_pt2pt_component.modules,
                                                                    ompi_comm_get_cid(module->comm)));
    }

    win->w_osc_module = NULL;

    OBJ_DESTRUCT(&module->outstanding_locks);
    OBJ_DESTRUCT(&module->locks_pending);
    OBJ_DESTRUCT(&module->locks_pending_lock);
    OBJ_DESTRUCT(&module->acc_lock);
    OBJ_DESTRUCT(&module->cond);
    OBJ_DESTRUCT(&module->lock);

    /* it is erroneous to close a window with active operations on it, so we
     * should probably produce an error here instead of cleaning up */
    OPAL_LIST_DESTRUCT(&module->pending_acc);
    OPAL_LIST_DESTRUCT(&module->pending_posts);

    osc_pt2pt_gc_clean (module);
    OPAL_LIST_DESTRUCT(&module->request_gc);
    OPAL_LIST_DESTRUCT(&module->buffer_gc);
    OBJ_DESTRUCT(&module->gc_lock);

    if (NULL != module->peers) {
        for (int i = 0 ; i < ompi_comm_size (module->comm) ; ++i) {
            OBJ_DESTRUCT(module->peers + i);
        }

        free(module->peers);
    }

    if (NULL != module->epoch_outgoing_frag_count) free(module->epoch_outgoing_frag_count);

    if (NULL != module->frag_request) {
        module->frag_request->req_complete_cb = NULL;
        ompi_request_cancel (module->frag_request);
        ompi_request_free (&module->frag_request);
    }
    if (NULL != module->comm) {
        ompi_comm_free(&module->comm);
    }
    if (NULL != module->incoming_buffer) free (module->incoming_buffer);
    if (NULL != module->free_after) free(module->free_after);

    free (module);

    return ret;
}
int ompi_coll_portals4_barrier_intra(struct ompi_communicator_t *comm,
                                     mca_coll_base_module_t *module)
{
    mca_coll_portals4_module_t *portals4_module = (mca_coll_portals4_module_t*) module;
    int ret, i, dim, hibit, mask, num_msgs;
    int size = ompi_comm_size(comm);
    int rank = ompi_comm_rank(comm);
    ptl_ct_event_t ct;
    ptl_handle_ct_t ct_h;
    ptl_handle_me_t me_h;
    ptl_me_t me;
    size_t count;
    ptl_match_bits_t match_bits;
    ptl_handle_md_t md_h;
    void *base;

    ompi_coll_portals4_get_md(0, &md_h, &base);

    count = opal_atomic_add_size_t(&portals4_module->barrier_count, 1);

    ret = PtlCTAlloc(mca_coll_portals4_component.ni_h, &ct_h);
    if (PTL_OK != ret) {
        opal_output_verbose(1, ompi_coll_base_framework.framework_output,
                            "%s:%d: PtlCTAlloc failed: %d\n",
                            __FILE__, __LINE__, ret);
        return OMPI_ERR_TEMP_OUT_OF_RESOURCE;
    }

    COLL_PORTALS4_SET_BITS(match_bits, ompi_comm_get_cid(comm), 0,
                           COLL_PORTALS4_BARRIER, count);

    /* Build "tree" out of hypercube */
    dim = comm->c_cube_dim;
    hibit = opal_hibit(rank, dim);
    --dim;

    /* receive space */
    me.start = NULL;
    me.length = 0;
    me.ct_handle = ct_h;
    me.min_free = 0;
    me.uid = mca_coll_portals4_component.uid;
    me.options = PTL_ME_OP_PUT | PTL_ME_EVENT_SUCCESS_DISABLE |
        PTL_ME_EVENT_LINK_DISABLE | PTL_ME_EVENT_UNLINK_DISABLE |
        PTL_ME_EVENT_CT_COMM | PTL_ME_EVENT_CT_OVERFLOW;
    me.match_id.phys.nid = PTL_NID_ANY;
    me.match_id.phys.pid = PTL_PID_ANY;
    me.match_bits = match_bits;
    me.ignore_bits = 0;
    ret = PtlMEAppend(mca_coll_portals4_component.ni_h,
                      mca_coll_portals4_component.pt_idx,
                      &me, PTL_PRIORITY_LIST, NULL, &me_h);
    if (PTL_OK != ret) {
        opal_output_verbose(1, ompi_coll_base_framework.framework_output,
                            "%s:%d: PtlMEAppend failed: %d\n",
                            __FILE__, __LINE__, ret);
        return OMPI_ERROR;
    }

    /* calculate number of children to receive from */
    num_msgs = ompi_coll_portals4_get_nchildren(dim + 1, hibit, rank, size);

    /* send to parent when children have sent to us */
    if (rank > 0) {
        int parent = rank & ~(1 << hibit);

        ret = PtlTriggeredPut(md_h, 0, 0, PTL_NO_ACK_REQ,
                              ompi_coll_portals4_get_peer(comm, parent),
                              mca_coll_portals4_component.pt_idx,
                              match_bits, 0, NULL, 0, ct_h, num_msgs);
        if (PTL_OK != ret) {
            opal_output_verbose(1, ompi_coll_base_framework.framework_output,
                                "%s:%d: PtlTriggeredPut failed: %d\n",
                                __FILE__, __LINE__, ret);
            return OMPI_ERROR;
        }

        /* we'll need to wait for the parent response before the next set of comms */
        num_msgs++;
    }

    /* send to children when parent (or all children if root) has sent to us */
    for (i = hibit + 1, mask = 1 << i; i <= dim; ++i, mask <<= 1) {
        int peer = rank | mask;
        if (peer < size) {
            ret = PtlTriggeredPut(md_h, 0, 0, PTL_NO_ACK_REQ,
                                  ompi_coll_portals4_get_peer(comm, peer),
                                  mca_coll_portals4_component.pt_idx,
                                  match_bits, 0, NULL, 0, ct_h, num_msgs);
            if (PTL_OK != ret) {
                opal_output_verbose(1, ompi_coll_base_framework.framework_output,
                                    "%s:%d: PtlTriggeredPut failed: %d\n",
                                    __FILE__, __LINE__, ret);
                return OMPI_ERROR;
            }
        }
    }

    /* Wait for all incoming messages */
    ret = PtlCTWait(ct_h, num_msgs, &ct);
    if (PTL_OK != ret) {
        opal_output_verbose(1, ompi_coll_base_framework.framework_output,
                            "%s:%d: PtlCTWait failed: %d\n",
                            __FILE__, __LINE__, ret);
        return OMPI_ERROR;
    }

    /* cleanup */
    ret = PtlMEUnlink(me_h);
    if (PTL_OK != ret) {
        opal_output_verbose(1, ompi_coll_base_framework.framework_output,
                            "%s:%d: PtlMEUnlink failed: %d\n",
                            __FILE__, __LINE__, ret);
        return OMPI_ERROR;
    }
    ret = PtlCTFree(ct_h);
    if (PTL_OK != ret) {
        opal_output_verbose(1, ompi_coll_base_framework.framework_output,
                            "%s:%d: PtlCTFree failed: %d\n",
                            __FILE__, __LINE__, ret);
        return OMPI_ERROR;
    }

    return OMPI_SUCCESS;
}
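/*
 * Illustration (not part of the component): the barrier above derives a tree
 * from the hypercube. A non-root rank's parent is the rank with its highest
 * set bit cleared (rank & ~(1 << hibit)); its children are rank | (1 << i)
 * for each bit i above hibit, kept only when the resulting rank exists. The
 * standalone sketch below prints that tree for a small communicator, with a
 * local hibit() standing in for opal_hibit().
 */
#include <stdio.h>

/* position of the highest set bit within `dim` low bits, or -1 for 0;
 * a local stand-in for opal_hibit() */
static int hibit(int value, int dim)
{
    int pos = -1;
    for (int i = 0; i < dim; ++i) {
        if (value & (1 << i)) pos = i;
    }
    return pos;
}

int main(void)
{
    const int size = 6;                 /* communicator size */
    int dim = 0;
    while ((1 << dim) < size) ++dim;    /* equivalent of comm->c_cube_dim */

    for (int rank = 0; rank < size; ++rank) {
        int hb = hibit(rank, dim);

        if (rank > 0) {
            printf("rank %d: parent %d, children:", rank, rank & ~(1 << hb));
        } else {
            printf("rank 0: root, children:");
        }
        /* children: set each bit above the highest set bit, keep in-range ranks */
        for (int i = hb + 1; i < dim; ++i) {
            int peer = rank | (1 << i);
            if (peer < size) printf(" %d", peer);
        }
        printf("\n");
    }
    return 0;
}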
static int setup_sync_handles(struct ompi_communicator_t *comm,
                              ompi_coll_portals4_request_t *request,
                              mca_coll_portals4_module_t *portals4_module)
{
    int ret, line;
    ptl_me_t me;

    OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
                 "coll:portals4:setup_sync_handles enter rank %d",
                 request->u.scatter.my_rank));

    /**********************************/
    /* Setup Sync Handles             */
    /**********************************/
    COLL_PORTALS4_SET_BITS(request->u.scatter.sync_match_bits, ompi_comm_get_cid(comm),
                           0, 1, COLL_PORTALS4_SCATTER, 0, request->u.scatter.coll_count);

    OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
                 "coll:portals4:setup_sync_handles rank(%d) sync_match_bits(0x%016lX)",
                 request->u.scatter.my_rank, request->u.scatter.sync_match_bits));

    ret = PtlCTAlloc(mca_coll_portals4_component.ni_h, &request->u.scatter.sync_cth);
    if (PTL_OK != ret) {
        ret = OMPI_ERR_TEMP_OUT_OF_RESOURCE;
        line = __LINE__;
        goto err_hdlr;
    }
    request->u.scatter.sync_mdh = mca_coll_portals4_component.zero_md_h;

    me.start = NULL;
    me.length = 0;
    me.ct_handle = request->u.scatter.sync_cth;
    me.min_free = 0;
    me.uid = mca_coll_portals4_component.uid;
    me.options = PTL_ME_OP_PUT | PTL_ME_EVENT_SUCCESS_DISABLE |
        PTL_ME_EVENT_LINK_DISABLE | PTL_ME_EVENT_UNLINK_DISABLE |
        PTL_ME_EVENT_CT_COMM | PTL_ME_EVENT_CT_OVERFLOW;
    me.match_id.phys.nid = PTL_NID_ANY;
    me.match_id.phys.pid = PTL_PID_ANY;
    me.match_bits = request->u.scatter.sync_match_bits;
    me.ignore_bits = 0;
    ret = PtlMEAppend(mca_coll_portals4_component.ni_h,
                      mca_coll_portals4_component.pt_idx,
                      &me, PTL_PRIORITY_LIST, NULL,
                      &request->u.scatter.sync_meh);
    if (PTL_OK != ret) {
        ret = OMPI_ERROR;
        line = __LINE__;
        goto err_hdlr;
    }

    OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
                 "coll:portals4:setup_sync_handles exit rank %d",
                 request->u.scatter.my_rank));

    return OMPI_SUCCESS;

err_hdlr:
    opal_output(ompi_coll_base_framework.framework_output,
                "%s:%4d:%4d\tError occurred ret=%d, rank %2d",
                __FILE__, __LINE__, line, ret, request->u.scatter.my_rank);

    return ret;
}
int ompi_osc_rdma_module_free(ompi_win_t *win)
{
    int ret = OMPI_SUCCESS;
    int tmp, i;
    ompi_osc_rdma_module_t *module = GET_MODULE(win);

    opal_output_verbose(1, ompi_osc_base_output,
                        "rdma component destroying window with id %d",
                        ompi_comm_get_cid(module->m_comm));

    /* finish with a barrier */
    if (ompi_group_size(win->w_group) > 1) {
        ret = module->m_comm->c_coll.coll_barrier(module->m_comm,
                                                  module->m_comm->c_coll.coll_barrier_module);
    }

    /* remove from component information */
    OPAL_THREAD_LOCK(&mca_osc_rdma_component.c_lock);
    tmp = opal_hash_table_remove_value_uint32(&mca_osc_rdma_component.c_modules,
                                              ompi_comm_get_cid(module->m_comm));
    /* only take the output of hash_table_remove if there wasn't already an error */
    ret = (ret != OMPI_SUCCESS) ? ret : tmp;

    if (0 == opal_hash_table_get_size(&mca_osc_rdma_component.c_modules)) {
#if OPAL_ENABLE_PROGRESS_THREADS
        void *foo;

        mca_osc_rdma_component.c_thread_run = false;
        opal_condition_broadcast(&ompi_request_cond);
        opal_thread_join(&mca_osc_rdma_component.c_thread, &foo);
#else
        opal_progress_unregister(ompi_osc_rdma_component_progress);
#endif
    }
    OPAL_THREAD_UNLOCK(&mca_osc_rdma_component.c_lock);

    win->w_osc_module = NULL;

    OBJ_DESTRUCT(&module->m_unlocks_pending);
    OBJ_DESTRUCT(&module->m_locks_pending);
    OBJ_DESTRUCT(&module->m_queued_sendreqs);
    OBJ_DESTRUCT(&module->m_copy_pending_sendreqs);
    OBJ_DESTRUCT(&module->m_pending_sendreqs);
    OBJ_DESTRUCT(&module->m_acc_lock);
    OBJ_DESTRUCT(&module->m_cond);
    OBJ_DESTRUCT(&module->m_lock);

    if (NULL != module->m_sc_remote_ranks) {
        free(module->m_sc_remote_ranks);
    }
    if (NULL != module->m_sc_remote_active_ranks) {
        free(module->m_sc_remote_active_ranks);
    }
    if (NULL != module->m_pending_buffers) {
        free(module->m_pending_buffers);
    }
    if (NULL != module->m_fence_coll_counts) {
        free(module->m_fence_coll_counts);
    }
    if (NULL != module->m_copy_num_pending_sendreqs) {
        free(module->m_copy_num_pending_sendreqs);
    }
    if (NULL != module->m_num_pending_sendreqs) {
        free(module->m_num_pending_sendreqs);
    }
    if (NULL != module->m_peer_info) {
        for (i = 0 ; i < ompi_comm_size(module->m_comm) ; ++i) {
            ompi_osc_rdma_peer_info_free(&module->m_peer_info[i]);
        }
        free(module->m_peer_info);
    }
    if (NULL != module->m_comm) ompi_comm_free(&module->m_comm);
    if (NULL != module) free(module);

    return ret;
}
static int component_select(struct ompi_win_t *win, void **base, size_t size, int disp_unit,
                            struct ompi_communicator_t *comm, struct opal_info_t *info,
                            int flavor, int *model)
{
    ompi_osc_ucx_module_t *module = NULL;
    char *name = NULL;
    long values[2];
    int ret = OMPI_SUCCESS;
    ucs_status_t status;
    int i, comm_size = ompi_comm_size(comm);
    int is_eps_ready;
    bool eps_created = false, worker_created = false;
    ucp_address_t *my_addr = NULL;
    size_t my_addr_len;
    char *recv_buf = NULL;
    void *rkey_buffer = NULL, *state_rkey_buffer = NULL;
    size_t rkey_buffer_size, state_rkey_buffer_size;
    void *state_base = NULL;
    void *my_info = NULL;
    size_t my_info_len;
    int disps[comm_size];
    int rkey_sizes[comm_size];

    /* the osc/sm component is the exclusive provider for support for
     * shared memory windows */
    if (flavor == MPI_WIN_FLAVOR_SHARED) {
        return OMPI_ERR_NOT_SUPPORTED;
    }

    /* if UCP worker has never been initialized before, init it first */
    if (mca_osc_ucx_component.ucp_worker == NULL) {
        ucp_worker_params_t worker_params;
        ucp_worker_attr_t worker_attr;

        memset(&worker_params, 0, sizeof(ucp_worker_params_t));
        worker_params.field_mask = UCP_WORKER_PARAM_FIELD_THREAD_MODE;
        worker_params.thread_mode = (mca_osc_ucx_component.enable_mpi_threads == true) ?
            UCS_THREAD_MODE_MULTI : UCS_THREAD_MODE_SINGLE;
        status = ucp_worker_create(mca_osc_ucx_component.ucp_context, &worker_params,
                                   &(mca_osc_ucx_component.ucp_worker));
        if (UCS_OK != status) {
            opal_output_verbose(1, ompi_osc_base_framework.framework_output,
                                "%s:%d: ucp_worker_create failed: %d\n",
                                __FILE__, __LINE__, status);
            ret = OMPI_ERROR;
            goto error;
        }

        /* query UCP worker attributes */
        worker_attr.field_mask = UCP_WORKER_ATTR_FIELD_THREAD_MODE;
        status = ucp_worker_query(mca_osc_ucx_component.ucp_worker, &worker_attr);
        if (UCS_OK != status) {
            opal_output_verbose(1, ompi_osc_base_framework.framework_output,
                                "%s:%d: ucp_worker_query failed: %d\n",
                                __FILE__, __LINE__, status);
            ret = OMPI_ERROR;
            goto error;
        }

        if (mca_osc_ucx_component.enable_mpi_threads == true &&
            worker_attr.thread_mode != UCS_THREAD_MODE_MULTI) {
            opal_output_verbose(1, ompi_osc_base_framework.framework_output,
                                "%s:%d: ucx does not support multithreading\n",
                                __FILE__, __LINE__);
            ret = OMPI_ERROR;
            goto error;
        }

        worker_created = true;
    }

    /* create module structure */
    module = (ompi_osc_ucx_module_t *)calloc(1, sizeof(ompi_osc_ucx_module_t));
    if (module == NULL) {
        ret = OMPI_ERR_TEMP_OUT_OF_RESOURCE;
        goto error;
    }

    /* fill in the function pointer part */
    memcpy(module, &ompi_osc_ucx_module_template, sizeof(ompi_osc_base_module_t));

    ret = ompi_comm_dup(comm, &module->comm);
    if (ret != OMPI_SUCCESS) {
        goto error;
    }

    asprintf(&name, "ucx window %d", ompi_comm_get_cid(module->comm));
    ompi_win_set_name(win, name);
    free(name);

    /* share everyone's displacement units. Only do an allgather if
       strictly necessary, since it requires O(p) state. */
    values[0] = disp_unit;
    values[1] = -disp_unit;
    ret = module->comm->c_coll->coll_allreduce(MPI_IN_PLACE, values, 2, MPI_LONG,
                                               MPI_MIN, module->comm,
                                               module->comm->c_coll->coll_allreduce_module);
    if (OMPI_SUCCESS != ret) {
        goto error;
    }

    if (values[0] == -values[1]) {
        /* everyone has the same disp_unit, we do not need O(p) space */
        module->disp_unit = disp_unit;
    } else {
        /* different disp_unit sizes, allocate O(p) space to store them */
        module->disp_unit = -1;
        module->disp_units = calloc(comm_size, sizeof(int));
        if (module->disp_units == NULL) {
            ret = OMPI_ERR_TEMP_OUT_OF_RESOURCE;
            goto error;
        }

        ret = module->comm->c_coll->coll_allgather(&disp_unit, 1, MPI_INT,
                                                   module->disp_units, 1, MPI_INT,
                                                   module->comm,
                                                   module->comm->c_coll->coll_allgather_module);
        if (OMPI_SUCCESS != ret) {
            goto error;
        }
    }

    /* exchange endpoints if necessary */
    is_eps_ready = 1;
    for (i = 0; i < comm_size; i++) {
        if (OSC_UCX_GET_EP(module->comm, i) == NULL) {
            is_eps_ready = 0;
            break;
        }
    }

    ret = module->comm->c_coll->coll_allreduce(MPI_IN_PLACE, &is_eps_ready, 1, MPI_INT,
                                               MPI_LAND, module->comm,
                                               module->comm->c_coll->coll_allreduce_module);
    if (OMPI_SUCCESS != ret) {
        goto error;
    }

    if (!is_eps_ready) {
        status = ucp_worker_get_address(mca_osc_ucx_component.ucp_worker,
                                        &my_addr, &my_addr_len);
        if (status != UCS_OK) {
            opal_output_verbose(1, ompi_osc_base_framework.framework_output,
                                "%s:%d: ucp_worker_get_address failed: %d\n",
                                __FILE__, __LINE__, status);
            ret = OMPI_ERROR;
            goto error;
        }

        ret = allgather_len_and_info(my_addr, (int)my_addr_len,
                                     &recv_buf, disps, module->comm);
        if (ret != OMPI_SUCCESS) {
            goto error;
        }

        for (i = 0; i < comm_size; i++) {
            if (OSC_UCX_GET_EP(module->comm, i) == NULL) {
                ucp_ep_params_t ep_params;
                ucp_ep_h ep;
                memset(&ep_params, 0, sizeof(ucp_ep_params_t));
                ep_params.field_mask = UCP_EP_PARAM_FIELD_REMOTE_ADDRESS;
                ep_params.address = (ucp_address_t *)&(recv_buf[disps[i]]);
                status = ucp_ep_create(mca_osc_ucx_component.ucp_worker, &ep_params, &ep);
                if (status != UCS_OK) {
                    opal_output_verbose(1, ompi_osc_base_framework.framework_output,
                                        "%s:%d: ucp_ep_create failed: %d\n",
                                        __FILE__, __LINE__, status);
                    ret = OMPI_ERROR;
                    goto error;
                }

                ompi_comm_peer_lookup(module->comm, i)->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_UCX] = ep;
            }
        }

        ucp_worker_release_address(mca_osc_ucx_component.ucp_worker, my_addr);
        my_addr = NULL;
        free(recv_buf);
        recv_buf = NULL;
        eps_created = true;
    }

    ret = mem_map(base, size, &(module->memh), module, flavor);
    if (ret != OMPI_SUCCESS) {
        goto error;
    }

    state_base = (void *)&(module->state);
    ret = mem_map(&state_base, sizeof(ompi_osc_ucx_state_t), &(module->state_memh),
                  module, MPI_WIN_FLAVOR_CREATE);
    if (ret != OMPI_SUCCESS) {
        goto error;
    }

    module->win_info_array = calloc(comm_size, sizeof(ompi_osc_ucx_win_info_t));
    if (module->win_info_array == NULL) {
        ret = OMPI_ERR_TEMP_OUT_OF_RESOURCE;
        goto error;
    }

    module->state_info_array = calloc(comm_size, sizeof(ompi_osc_ucx_win_info_t));
    if (module->state_info_array == NULL) {
        ret = OMPI_ERR_TEMP_OUT_OF_RESOURCE;
        goto error;
    }

    status = ucp_rkey_pack(mca_osc_ucx_component.ucp_context, module->memh,
                           &rkey_buffer, &rkey_buffer_size);
    if (status != UCS_OK) {
        opal_output_verbose(1, ompi_osc_base_framework.framework_output,
                            "%s:%d: ucp_rkey_pack failed: %d\n",
                            __FILE__, __LINE__, status);
        ret = OMPI_ERROR;
        goto error;
    }

    status = ucp_rkey_pack(mca_osc_ucx_component.ucp_context, module->state_memh,
                           &state_rkey_buffer, &state_rkey_buffer_size);
    if (status != UCS_OK) {
        opal_output_verbose(1, ompi_osc_base_framework.framework_output,
                            "%s:%d: ucp_rkey_pack failed: %d\n",
                            __FILE__, __LINE__, status);
        ret = OMPI_ERROR;
        goto error;
    }

    my_info_len = 2 * sizeof(uint64_t) + rkey_buffer_size + state_rkey_buffer_size;
    my_info = malloc(my_info_len);
    if (my_info == NULL) {
        ret = OMPI_ERR_TEMP_OUT_OF_RESOURCE;
        goto error;
    }

    memcpy(my_info, base, sizeof(uint64_t));
    memcpy((void *)((char *)my_info + sizeof(uint64_t)), &state_base, sizeof(uint64_t));
    memcpy((void *)((char *)my_info + 2 * sizeof(uint64_t)), rkey_buffer, rkey_buffer_size);
    memcpy((void *)((char *)my_info + 2 * sizeof(uint64_t) + rkey_buffer_size),
           state_rkey_buffer, state_rkey_buffer_size);

    ret = allgather_len_and_info(my_info, (int)my_info_len, &recv_buf, disps, module->comm);
    if (ret != OMPI_SUCCESS) {
        goto error;
    }

    ret = comm->c_coll->coll_allgather((void *)&rkey_buffer_size, 1, MPI_INT,
                                       rkey_sizes, 1, MPI_INT, comm,
                                       comm->c_coll->coll_allgather_module);
    if (OMPI_SUCCESS != ret) {
        goto error;
    }

    for (i = 0; i < comm_size; i++) {
        ucp_ep_h ep = OSC_UCX_GET_EP(module->comm, i);
        assert(ep != NULL);

        memcpy(&(module->win_info_array[i]).addr, &recv_buf[disps[i]], sizeof(uint64_t));
        memcpy(&(module->state_info_array[i]).addr, &recv_buf[disps[i] + sizeof(uint64_t)],
               sizeof(uint64_t));

        status = ucp_ep_rkey_unpack(ep, &(recv_buf[disps[i] + 2 * sizeof(uint64_t)]),
                                    &((module->win_info_array[i]).rkey));
        if (status != UCS_OK) {
            opal_output_verbose(1, ompi_osc_base_framework.framework_output,
                                "%s:%d: ucp_ep_rkey_unpack failed: %d\n",
                                __FILE__, __LINE__, status);
            ret = OMPI_ERROR;
            goto error;
        }

        status = ucp_ep_rkey_unpack(ep, &(recv_buf[disps[i] + 2 * sizeof(uint64_t) + rkey_sizes[i]]),
                                    &((module->state_info_array[i]).rkey));
        if (status != UCS_OK) {
            opal_output_verbose(1, ompi_osc_base_framework.framework_output,
                                "%s:%d: ucp_ep_rkey_unpack failed: %d\n",
                                __FILE__, __LINE__, status);
            ret = OMPI_ERROR;
            goto error;
        }
    }

    free(my_info);
    free(recv_buf);
    ucp_rkey_buffer_release(rkey_buffer);
    ucp_rkey_buffer_release(state_rkey_buffer);

    module->state.lock = TARGET_LOCK_UNLOCKED;
    module->state.post_index = 0;
    memset((void *)module->state.post_state, 0, sizeof(uint64_t) * OMPI_OSC_UCX_POST_PEER_MAX);
    module->state.complete_count = 0;
    module->state.req_flag = 0;
    module->state.acc_lock = TARGET_LOCK_UNLOCKED;
    module->epoch_type.access = NONE_EPOCH;
    module->epoch_type.exposure = NONE_EPOCH;
    module->lock_count = 0;
    module->post_count = 0;
    module->start_group = NULL;
    module->post_group = NULL;
    OBJ_CONSTRUCT(&module->outstanding_locks, opal_hash_table_t);
    OBJ_CONSTRUCT(&module->pending_posts, opal_list_t);
    module->global_ops_num = 0;
    module->per_target_ops_nums = calloc(comm_size, sizeof(int));
    module->start_grp_ranks = NULL;
    module->lock_all_is_nocheck = false;

    ret = opal_hash_table_init(&module->outstanding_locks, comm_size);
    if (ret != OPAL_SUCCESS) {
        goto error;
    }

    win->w_osc_module = &module->super;

    /* sync with everyone */
    ret = module->comm->c_coll->coll_barrier(module->comm,
                                             module->comm->c_coll->coll_barrier_module);
    if (ret != OMPI_SUCCESS) {
        goto error;
    }

    return ret;

error:
    if (my_addr) ucp_worker_release_address(mca_osc_ucx_component.ucp_worker, my_addr);
    if (recv_buf) free(recv_buf);
    if (my_info) free(my_info);
    if (rkey_buffer) ucp_rkey_buffer_release(rkey_buffer);
    if (state_rkey_buffer) ucp_rkey_buffer_release(state_rkey_buffer);
    if (module) {
        /* the info arrays may not have been allocated yet; guard before
         * walking them for rkeys to destroy */
        if (module->win_info_array) {
            for (i = 0; i < comm_size; i++) {
                if (module->win_info_array[i].rkey != NULL) {
                    ucp_rkey_destroy(module->win_info_array[i].rkey);
                }
            }
            free(module->win_info_array);
        }
        if (module->state_info_array) {
            for (i = 0; i < comm_size; i++) {
                if (module->state_info_array[i].rkey != NULL) {
                    ucp_rkey_destroy(module->state_info_array[i].rkey);
                }
            }
            free(module->state_info_array);
        }
        if (module->disp_units) free(module->disp_units);
        if (module->per_target_ops_nums) free(module->per_target_ops_nums);
        /* destroy any endpoints we created before releasing the communicator
         * they are looked up through */
        if (eps_created) {
            for (i = 0; i < comm_size; i++) {
                ucp_ep_h ep = OSC_UCX_GET_EP(module->comm, i);
                ucp_ep_destroy(ep);
            }
        }
        if (module->comm) ompi_comm_free(&module->comm);
        free(module);
    }
    if (worker_created) ucp_worker_destroy(mca_osc_ucx_component.ucp_worker);

    return ret;
}
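/*
 * Illustration (not part of the component): the displacement-unit exchange
 * above uses a compact trick. Allreducing the pair {disp_unit, -disp_unit}
 * with MPI_MIN yields min(x_i) in values[0] and -max(x_i) in values[1], so
 * values[0] == -values[1] exactly when every rank contributed the same
 * disp_unit, and the O(p) allgather can be skipped in the common case. The
 * standalone program below demonstrates the test.
 */
#include <mpi.h>
#include <stdio.h>

int main(int argc, char **argv)
{
    int rank;
    long values[2];

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    /* pretend every rank uses the same displacement unit;
     * make this rank-dependent to see the test report "no" */
    long disp_unit = 4;

    values[0] = disp_unit;
    values[1] = -disp_unit;
    MPI_Allreduce(MPI_IN_PLACE, values, 2, MPI_LONG, MPI_MIN, MPI_COMM_WORLD);

    /* min(x_i) == -max(x_i) iff all x_i are equal */
    if (0 == rank) {
        printf("disp_unit identical on all ranks: %s\n",
               (values[0] == -values[1]) ? "yes" : "no");
    }

    MPI_Finalize();
    return 0;
}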
int ompi_osc_pt2pt_component_select(ompi_win_t *win, ompi_info_t *info,
                                    ompi_communicator_t *comm)
{
    ompi_osc_pt2pt_module_t *module = NULL;
    int ret, i;
    ompi_osc_pt2pt_buffer_t *buffer = NULL;
    opal_free_list_item_t *item = NULL;
    char *tmp = NULL;

    /* create module structure */
    module = (ompi_osc_pt2pt_module_t*) calloc(1, sizeof(ompi_osc_pt2pt_module_t));
    if (NULL == module) return OMPI_ERR_TEMP_OUT_OF_RESOURCE;

    /* fill in the function pointer part */
    memcpy(module, &ompi_osc_pt2pt_module_template, sizeof(ompi_osc_base_module_t));

    /* initialize the p2p part */
    OBJ_CONSTRUCT(&(module->p2p_lock), opal_mutex_t);
    OBJ_CONSTRUCT(&(module->p2p_cond), opal_condition_t);
    OBJ_CONSTRUCT(&(module->p2p_acc_lock), opal_mutex_t);
    OBJ_CONSTRUCT(&module->p2p_pending_sendreqs, opal_list_t);
    OBJ_CONSTRUCT(&(module->p2p_copy_pending_sendreqs), opal_list_t);
    OBJ_CONSTRUCT(&(module->p2p_locks_pending), opal_list_t);
    OBJ_CONSTRUCT(&(module->p2p_unlocks_pending), opal_list_t);

    module->p2p_win = win;

    ret = ompi_comm_dup(comm, &(module->p2p_comm));
    if (ret != OMPI_SUCCESS) goto cleanup;

    opal_output_verbose(1, ompi_osc_base_framework.framework_output,
                        "pt2pt component creating window with id %d",
                        ompi_comm_get_cid(module->p2p_comm));

    asprintf(&tmp, "%d", ompi_comm_get_cid(module->p2p_comm));
    ompi_win_set_name(win, tmp);
    free(tmp);

    module->p2p_num_pending_sendreqs = (unsigned int*)
        malloc(sizeof(unsigned int) * ompi_comm_size(module->p2p_comm));
    if (NULL == module->p2p_num_pending_sendreqs) {
        ret = OMPI_ERR_TEMP_OUT_OF_RESOURCE;
        goto cleanup;
    }
    memset(module->p2p_num_pending_sendreqs, 0,
           sizeof(unsigned int) * ompi_comm_size(module->p2p_comm));

    module->p2p_num_pending_out = 0;
    module->p2p_num_pending_in = 0;
    module->p2p_num_post_msgs = 0;
    module->p2p_num_complete_msgs = 0;
    module->p2p_tag_counter = 0;

    module->p2p_copy_num_pending_sendreqs = (unsigned int*)
        malloc(sizeof(unsigned int) * ompi_comm_size(module->p2p_comm));
    if (NULL == module->p2p_copy_num_pending_sendreqs) {
        ret = OMPI_ERR_TEMP_OUT_OF_RESOURCE;
        goto cleanup;
    }
    /* zero the copy array (not the primary array, which was zeroed above) */
    memset(module->p2p_copy_num_pending_sendreqs, 0,
           sizeof(unsigned int) * ompi_comm_size(module->p2p_comm));

    /* fence data */
    module->p2p_fence_coll_counts = (int*)
        malloc(sizeof(int) * ompi_comm_size(module->p2p_comm));
    if (NULL == module->p2p_fence_coll_counts) {
        ret = OMPI_ERR_TEMP_OUT_OF_RESOURCE;
        goto cleanup;
    }
    for (i = 0 ; i < ompi_comm_size(module->p2p_comm) ; ++i) {
        module->p2p_fence_coll_counts[i] = 1;
    }

    /* pwsc data */
    module->p2p_pw_group = NULL;
    module->p2p_sc_group = NULL;
    module->p2p_sc_remote_active_ranks = (bool*)
        malloc(sizeof(bool) * ompi_comm_size(module->p2p_comm));
    if (NULL == module->p2p_sc_remote_active_ranks) {
        ret = OMPI_ERR_TEMP_OUT_OF_RESOURCE;
        goto cleanup;
    }
    module->p2p_sc_remote_ranks = (int*)
        malloc(sizeof(int) * ompi_comm_size(module->p2p_comm));
    if (NULL == module->p2p_sc_remote_ranks) {
        ret = OMPI_ERR_TEMP_OUT_OF_RESOURCE;
        goto cleanup;
    }

    /* lock data */
    module->p2p_lock_status = 0;
    module->p2p_shared_count = 0;
    module->p2p_lock_received_ack = 0;

    /* fill in window information */
    win->w_osc_module = (ompi_osc_base_module_t*) module;

    /* sync memory - make sure all initialization completed */
    opal_atomic_mb();

    /* start up receive for protocol headers */
    OPAL_FREE_LIST_GET(&mca_osc_pt2pt_component.p2p_c_buffers, item, ret);
    if (OMPI_SUCCESS != ret) goto cleanup;
    buffer = (ompi_osc_pt2pt_buffer_t*) item;
    buffer->data = (void*) module;

    ret = ompi_osc_pt2pt_component_irecv(buffer->payload,
                                         mca_osc_pt2pt_component.p2p_c_eager_size,
                                         MPI_BYTE, MPI_ANY_SOURCE, CONTROL_MSG_TAG,
                                         module->p2p_comm, &(buffer->request),
                                         component_fragment_cb, buffer);
    if (OMPI_SUCCESS != ret) goto cleanup;

    return OMPI_SUCCESS;

 cleanup:
    OBJ_DESTRUCT(&module->p2p_unlocks_pending);
    OBJ_DESTRUCT(&module->p2p_locks_pending);
    OBJ_DESTRUCT(&module->p2p_copy_pending_sendreqs);
    OBJ_DESTRUCT(&module->p2p_pending_sendreqs);
    OBJ_DESTRUCT(&module->p2p_acc_lock);
    OBJ_DESTRUCT(&module->p2p_cond);
    OBJ_DESTRUCT(&module->p2p_lock);

    if (NULL != buffer) {
        OPAL_FREE_LIST_RETURN(&mca_osc_pt2pt_component.p2p_c_buffers, item);
    }
    if (NULL != module->p2p_sc_remote_ranks) {
        free(module->p2p_sc_remote_ranks);
    }
    if (NULL != module->p2p_sc_remote_active_ranks) {
        free(module->p2p_sc_remote_active_ranks);
    }
    if (NULL != module->p2p_fence_coll_counts) {
        free(module->p2p_fence_coll_counts);
    }
    if (NULL != module->p2p_copy_num_pending_sendreqs) {
        free(module->p2p_copy_num_pending_sendreqs);
    }
    if (NULL != module->p2p_num_pending_sendreqs) {
        free(module->p2p_num_pending_sendreqs);
    }
    if (NULL != module->p2p_comm) ompi_comm_free(&module->p2p_comm);

#if OPAL_ENABLE_DEBUG
    memset(module, 0, sizeof(ompi_osc_base_module_t));
#endif
    if (NULL != module) free(module);

    return ret;
}