static void ucp_perf_test_destroy_eps(ucx_perf_context_t* perf, unsigned group_size) { ucs_status_ptr_t *reqs; ucp_tag_recv_info_t info; ucs_status_t status; unsigned i; reqs = calloc(sizeof(*reqs), group_size); for (i = 0; i < group_size; ++i) { if (perf->ucp.peers[i].rkey != NULL) { ucp_rkey_destroy(perf->ucp.peers[i].rkey); } if (perf->ucp.peers[i].ep != NULL) { reqs[i] = ucp_disconnect_nb(perf->ucp.peers[i].ep); } } for (i = 0; i < group_size; ++i) { if (!UCS_PTR_IS_PTR(reqs[i])) { continue; } do { ucp_worker_progress(perf->ucp.worker); status = ucp_request_test(reqs[i], &info); } while (status == UCS_INPROGRESS); ucp_request_release(reqs[i]); } free(reqs); free(perf->ucp.peers); }
int mca_spml_ucx_deregister(sshmem_mkey_t *mkeys) { spml_ucx_mkey_t *ucx_mkey; map_segment_t *mem_seg; int segno; int my_pe = oshmem_my_proc_id(); MCA_SPML_CALL(quiet(oshmem_ctx_default)); if (!mkeys) return OSHMEM_SUCCESS; if (!mkeys[0].spml_context) return OSHMEM_SUCCESS; mem_seg = memheap_find_va(mkeys[0].va_base); ucx_mkey = (spml_ucx_mkey_t*)mkeys[0].spml_context; if (OPAL_UNLIKELY(NULL == mem_seg)) { return OSHMEM_ERROR; } if (MAP_SEGMENT_ALLOC_UCX != mem_seg->type) { ucp_mem_unmap(mca_spml_ucx.ucp_context, ucx_mkey->mem_h); } ucp_rkey_destroy(ucx_mkey->rkey); ucx_mkey->rkey = NULL; if (0 < mkeys[0].len) { ucp_rkey_buffer_release(mkeys[0].u.data); } free(mkeys); return OSHMEM_SUCCESS; }
int ompi_osc_ucx_free(struct ompi_win_t *win) { ompi_osc_ucx_module_t *module = (ompi_osc_ucx_module_t*) win->w_osc_module; int i, ret = OMPI_SUCCESS; if ((module->epoch_type.access != NONE_EPOCH && module->epoch_type.access != FENCE_EPOCH) || module->epoch_type.exposure != NONE_EPOCH) { ret = OMPI_ERR_RMA_SYNC; } if (module->start_group != NULL || module->post_group != NULL) { ret = OMPI_ERR_RMA_SYNC; } assert(module->global_ops_num == 0); assert(module->lock_count == 0); assert(opal_list_is_empty(&module->pending_posts) == true); OBJ_DESTRUCT(&module->outstanding_locks); OBJ_DESTRUCT(&module->pending_posts); while (module->state.lock != TARGET_LOCK_UNLOCKED) { /* not sure if this is required */ ucp_worker_progress(mca_osc_ucx_component.ucp_worker); } ret = module->comm->c_coll->coll_barrier(module->comm, module->comm->c_coll->coll_barrier_module); for (i = 0; i < ompi_comm_size(module->comm); i++) { ucp_rkey_destroy((module->win_info_array[i]).rkey); ucp_rkey_destroy((module->state_info_array[i]).rkey); } free(module->win_info_array); free(module->state_info_array); free(module->per_target_ops_nums); ucp_mem_unmap(mca_osc_ucx_component.ucp_context, module->memh); ucp_mem_unmap(mca_osc_ucx_component.ucp_context, module->state_memh); if (module->disp_units) free(module->disp_units); ompi_comm_free(&module->comm); free(module); return ret; }
void mca_spml_ucx_rmkey_free(sshmem_mkey_t *mkey) { spml_ucx_mkey_t *ucx_mkey; if (!mkey->spml_context) { return; } ucx_mkey = (spml_ucx_mkey_t *)(mkey->spml_context); ucp_rkey_destroy(ucx_mkey->rkey); }
static inline int get_dynamic_win_info(uint64_t remote_addr, ompi_osc_ucx_module_t *module, ucp_ep_h ep, int target) { ucp_rkey_h state_rkey = (module->state_info_array)[target].rkey; uint64_t remote_state_addr = (module->state_info_array)[target].addr + OSC_UCX_STATE_DYNAMIC_WIN_CNT_OFFSET; size_t len = sizeof(uint64_t) + sizeof(ompi_osc_dynamic_win_info_t) * OMPI_OSC_UCX_ATTACH_MAX; char *temp_buf = malloc(len); ompi_osc_dynamic_win_info_t *temp_dynamic_wins; int win_count, contain, insert = -1; ucs_status_t status; if ((module->win_info_array[target]).rkey_init == true) { ucp_rkey_destroy((module->win_info_array[target]).rkey); (module->win_info_array[target]).rkey_init == false; } status = ucp_get_nbi(ep, (void *)temp_buf, len, remote_state_addr, state_rkey); if (status != UCS_OK && status != UCS_INPROGRESS) { opal_output_verbose(1, ompi_osc_base_framework.framework_output, "%s:%d: ucp_get_nbi failed: %d\n", __FILE__, __LINE__, status); return OMPI_ERROR; } status = ucp_ep_flush(ep); if (status != UCS_OK) { opal_output_verbose(1, ompi_osc_base_framework.framework_output, "%s:%d: ucp_ep_flush failed: %d\n", __FILE__, __LINE__, status); return OMPI_ERROR; } memcpy(&win_count, temp_buf, sizeof(uint64_t)); assert(win_count > 0 && win_count <= OMPI_OSC_UCX_ATTACH_MAX); temp_dynamic_wins = (ompi_osc_dynamic_win_info_t *)(temp_buf + sizeof(uint64_t)); contain = ompi_osc_find_attached_region_position(temp_dynamic_wins, 0, win_count, remote_addr, 1, &insert); assert(contain >= 0 && contain < win_count); status = ucp_ep_rkey_unpack(ep, temp_dynamic_wins[contain].rkey_buffer, &((module->win_info_array[target]).rkey)); if (status != UCS_OK) { opal_output_verbose(1, ompi_osc_base_framework.framework_output, "%s:%d: ucp_ep_rkey_unpack failed: %d\n", __FILE__, __LINE__, status); return OMPI_ERROR; } (module->win_info_array[target]).rkey_init = true; free(temp_buf); return status; }
static void ucp_perf_test_destroy_eps(ucx_perf_context_t* perf, unsigned group_size) { unsigned i; for (i = 0; i < group_size; ++i) { if (perf->ucp.peers[i].rkey != NULL) { ucp_rkey_destroy(perf->ucp.peers[i].rkey); } if (perf->ucp.peers[i].ep != NULL) { ucp_ep_destroy(perf->ucp.peers[i].ep); } } free(perf->ucp.peers); }
ucs_status_t ucp_ep_rkey_unpack(ucp_ep_h ep, void *rkey_buffer, ucp_rkey_h *rkey_p) { unsigned remote_pd_index, remote_pd_gap; unsigned rkey_index; unsigned pd_count; ucs_status_t status; ucp_rkey_h rkey; uint8_t pd_size; ucp_pd_map_t pd_map; void *p; /* Count the number of remote PDs in the rkey buffer */ p = rkey_buffer; /* Read remote PD map */ pd_map = *(ucp_pd_map_t*)p; ucs_trace("unpacking rkey with pd_map 0x%x", pd_map); if (pd_map == 0) { /* Dummy key return ok */ *rkey_p = &ucp_mem_dummy_rkey; return UCS_OK; } pd_count = ucs_count_one_bits(pd_map); p += sizeof(ucp_pd_map_t); /* Allocate rkey handle which holds UCT rkeys for all remote PDs. * We keep all of them to handle a future transport switch. */ rkey = ucs_malloc(sizeof(*rkey) + (sizeof(rkey->uct[0]) * pd_count), "ucp_rkey"); if (rkey == NULL) { status = UCS_ERR_NO_MEMORY; goto err; } rkey->pd_map = 0; remote_pd_index = 0; /* Index of remote PD */ rkey_index = 0; /* Index of the rkey in the array */ /* Unpack rkey of each UCT PD */ while (pd_map > 0) { pd_size = *((uint8_t*)p++); /* Use bit operations to iterate through the indices of the remote PDs * as provided in the pd_map. pd_map always holds a bitmap of PD indices * that remain to be used. Every time we find the "gap" until the next * valid PD index using ffs operation. If some rkeys cannot be unpacked, * we remove them from the local map. */ remote_pd_gap = ucs_ffs64(pd_map); /* Find the offset for next PD index */ remote_pd_index += remote_pd_gap; /* Calculate next index of remote PD*/ pd_map >>= remote_pd_gap; /* Remove the gap from the map */ ucs_assert(pd_map & 1); /* Unpack only reachable rkeys */ if (UCS_BIT(remote_pd_index) & ucp_ep_config(ep)->key.reachable_pd_map) { ucs_assert(rkey_index < pd_count); status = uct_rkey_unpack(p, &rkey->uct[rkey_index]); if (status != UCS_OK) { ucs_error("Failed to unpack remote key from remote pd[%d]: %s", remote_pd_index, ucs_status_string(status)); goto err_destroy; } ucs_trace("rkey[%d] for remote pd %d is 0x%lx", rkey_index, remote_pd_index, rkey->uct[rkey_index].rkey); rkey->pd_map |= UCS_BIT(remote_pd_index); ++rkey_index; } ++remote_pd_index; pd_map >>= 1; p += pd_size; } if (rkey->pd_map == 0) { ucs_debug("The unpacked rkey from the destination is unreachable"); status = UCS_ERR_UNREACHABLE; goto err_destroy; } *rkey_p = rkey; return UCS_OK; err_destroy: ucp_rkey_destroy(rkey); err: return status; }
static int component_select(struct ompi_win_t *win, void **base, size_t size, int disp_unit, struct ompi_communicator_t *comm, struct opal_info_t *info, int flavor, int *model) { ompi_osc_ucx_module_t *module = NULL; char *name = NULL; long values[2]; int ret = OMPI_SUCCESS; ucs_status_t status; int i, comm_size = ompi_comm_size(comm); int is_eps_ready; bool eps_created = false, worker_created = false; ucp_address_t *my_addr = NULL; size_t my_addr_len; char *recv_buf = NULL; void *rkey_buffer = NULL, *state_rkey_buffer = NULL; size_t rkey_buffer_size, state_rkey_buffer_size; void *state_base = NULL; void * my_info = NULL; size_t my_info_len; int disps[comm_size]; int rkey_sizes[comm_size]; /* the osc/sm component is the exclusive provider for support for * shared memory windows */ if (flavor == MPI_WIN_FLAVOR_SHARED) { return OMPI_ERR_NOT_SUPPORTED; } /* if UCP worker has never been initialized before, init it first */ if (mca_osc_ucx_component.ucp_worker == NULL) { ucp_worker_params_t worker_params; ucp_worker_attr_t worker_attr; memset(&worker_params, 0, sizeof(ucp_worker_h)); worker_params.field_mask = UCP_WORKER_PARAM_FIELD_THREAD_MODE; worker_params.thread_mode = (mca_osc_ucx_component.enable_mpi_threads == true) ? UCS_THREAD_MODE_MULTI : UCS_THREAD_MODE_SINGLE; status = ucp_worker_create(mca_osc_ucx_component.ucp_context, &worker_params, &(mca_osc_ucx_component.ucp_worker)); if (UCS_OK != status) { opal_output_verbose(1, ompi_osc_base_framework.framework_output, "%s:%d: ucp_worker_create failed: %d\n", __FILE__, __LINE__, status); ret = OMPI_ERROR; goto error; } /* query UCP worker attributes */ worker_attr.field_mask = UCP_WORKER_ATTR_FIELD_THREAD_MODE; status = ucp_worker_query(mca_osc_ucx_component.ucp_worker, &worker_attr); if (UCS_OK != status) { opal_output_verbose(1, ompi_osc_base_framework.framework_output, "%s:%d: ucp_worker_query failed: %d\n", __FILE__, __LINE__, status); ret = OMPI_ERROR; goto error; } if (mca_osc_ucx_component.enable_mpi_threads == true && worker_attr.thread_mode != UCS_THREAD_MODE_MULTI) { opal_output_verbose(1, ompi_osc_base_framework.framework_output, "%s:%d: ucx does not support multithreading\n", __FILE__, __LINE__); ret = OMPI_ERROR; goto error; } worker_created = true; } /* create module structure */ module = (ompi_osc_ucx_module_t *)calloc(1, sizeof(ompi_osc_ucx_module_t)); if (module == NULL) { ret = OMPI_ERR_TEMP_OUT_OF_RESOURCE; goto error; } /* fill in the function pointer part */ memcpy(module, &ompi_osc_ucx_module_template, sizeof(ompi_osc_base_module_t)); ret = ompi_comm_dup(comm, &module->comm); if (ret != OMPI_SUCCESS) { goto error; } asprintf(&name, "ucx window %d", ompi_comm_get_cid(module->comm)); ompi_win_set_name(win, name); free(name); /* share everyone's displacement units. Only do an allgather if strictly necessary, since it requires O(p) state. */ values[0] = disp_unit; values[1] = -disp_unit; ret = module->comm->c_coll->coll_allreduce(MPI_IN_PLACE, values, 2, MPI_LONG, MPI_MIN, module->comm, module->comm->c_coll->coll_allreduce_module); if (OMPI_SUCCESS != ret) { goto error; } if (values[0] == -values[1]) { /* everyone has the same disp_unit, we do not need O(p) space */ module->disp_unit = disp_unit; } else { /* different disp_unit sizes, allocate O(p) space to store them */ module->disp_unit = -1; module->disp_units = calloc(comm_size, sizeof(int)); if (module->disp_units == NULL) { ret = OMPI_ERR_TEMP_OUT_OF_RESOURCE; goto error; } ret = module->comm->c_coll->coll_allgather(&disp_unit, 1, MPI_INT, module->disp_units, 1, MPI_INT, module->comm, module->comm->c_coll->coll_allgather_module); if (OMPI_SUCCESS != ret) { goto error; } } /* exchange endpoints if necessary */ is_eps_ready = 1; for (i = 0; i < comm_size; i++) { if (OSC_UCX_GET_EP(module->comm, i) == NULL) { is_eps_ready = 0; break; } } ret = module->comm->c_coll->coll_allreduce(MPI_IN_PLACE, &is_eps_ready, 1, MPI_INT, MPI_LAND, module->comm, module->comm->c_coll->coll_allreduce_module); if (OMPI_SUCCESS != ret) { goto error; } if (!is_eps_ready) { status = ucp_worker_get_address(mca_osc_ucx_component.ucp_worker, &my_addr, &my_addr_len); if (status != UCS_OK) { opal_output_verbose(1, ompi_osc_base_framework.framework_output, "%s:%d: ucp_worker_get_address failed: %d\n", __FILE__, __LINE__, status); ret = OMPI_ERROR; goto error; } ret = allgather_len_and_info(my_addr, (int)my_addr_len, &recv_buf, disps, module->comm); if (ret != OMPI_SUCCESS) { goto error; } for (i = 0; i < comm_size; i++) { if (OSC_UCX_GET_EP(module->comm, i) == NULL) { ucp_ep_params_t ep_params; ucp_ep_h ep; memset(&ep_params, 0, sizeof(ucp_ep_params_t)); ep_params.field_mask = UCP_EP_PARAM_FIELD_REMOTE_ADDRESS; ep_params.address = (ucp_address_t *)&(recv_buf[disps[i]]); status = ucp_ep_create(mca_osc_ucx_component.ucp_worker, &ep_params, &ep); if (status != UCS_OK) { opal_output_verbose(1, ompi_osc_base_framework.framework_output, "%s:%d: ucp_ep_create failed: %d\n", __FILE__, __LINE__, status); ret = OMPI_ERROR; goto error; } ompi_comm_peer_lookup(module->comm, i)->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_UCX] = ep; } } ucp_worker_release_address(mca_osc_ucx_component.ucp_worker, my_addr); my_addr = NULL; free(recv_buf); recv_buf = NULL; eps_created = true; } ret = mem_map(base, size, &(module->memh), module, flavor); if (ret != OMPI_SUCCESS) { goto error; } state_base = (void *)&(module->state); ret = mem_map(&state_base, sizeof(ompi_osc_ucx_state_t), &(module->state_memh), module, MPI_WIN_FLAVOR_CREATE); if (ret != OMPI_SUCCESS) { goto error; } module->win_info_array = calloc(comm_size, sizeof(ompi_osc_ucx_win_info_t)); if (module->win_info_array == NULL) { ret = OMPI_ERR_TEMP_OUT_OF_RESOURCE; goto error; } module->state_info_array = calloc(comm_size, sizeof(ompi_osc_ucx_win_info_t)); if (module->state_info_array == NULL) { ret = OMPI_ERR_TEMP_OUT_OF_RESOURCE; goto error; } status = ucp_rkey_pack(mca_osc_ucx_component.ucp_context, module->memh, &rkey_buffer, &rkey_buffer_size); if (status != UCS_OK) { opal_output_verbose(1, ompi_osc_base_framework.framework_output, "%s:%d: ucp_rkey_pack failed: %d\n", __FILE__, __LINE__, status); ret = OMPI_ERROR; goto error; } status = ucp_rkey_pack(mca_osc_ucx_component.ucp_context, module->state_memh, &state_rkey_buffer, &state_rkey_buffer_size); if (status != UCS_OK) { opal_output_verbose(1, ompi_osc_base_framework.framework_output, "%s:%d: ucp_rkey_pack failed: %d\n", __FILE__, __LINE__, status); ret = OMPI_ERROR; goto error; } my_info_len = 2 * sizeof(uint64_t) + rkey_buffer_size + state_rkey_buffer_size; my_info = malloc(my_info_len); if (my_info == NULL) { ret = OMPI_ERR_TEMP_OUT_OF_RESOURCE; goto error; } memcpy(my_info, base, sizeof(uint64_t)); memcpy((void *)((char *)my_info + sizeof(uint64_t)), &state_base, sizeof(uint64_t)); memcpy((void *)((char *)my_info + 2 * sizeof(uint64_t)), rkey_buffer, rkey_buffer_size); memcpy((void *)((char *)my_info + 2 * sizeof(uint64_t) + rkey_buffer_size), state_rkey_buffer, state_rkey_buffer_size); ret = allgather_len_and_info(my_info, (int)my_info_len, &recv_buf, disps, module->comm); if (ret != OMPI_SUCCESS) { goto error; } ret = comm->c_coll->coll_allgather((void *)&rkey_buffer_size, 1, MPI_INT, rkey_sizes, 1, MPI_INT, comm, comm->c_coll->coll_allgather_module); if (OMPI_SUCCESS != ret) { goto error; } for (i = 0; i < comm_size; i++) { ucp_ep_h ep = OSC_UCX_GET_EP(module->comm, i); assert(ep != NULL); memcpy(&(module->win_info_array[i]).addr, &recv_buf[disps[i]], sizeof(uint64_t)); memcpy(&(module->state_info_array[i]).addr, &recv_buf[disps[i] + sizeof(uint64_t)], sizeof(uint64_t)); status = ucp_ep_rkey_unpack(ep, &(recv_buf[disps[i] + 2 * sizeof(uint64_t)]), &((module->win_info_array[i]).rkey)); if (status != UCS_OK) { opal_output_verbose(1, ompi_osc_base_framework.framework_output, "%s:%d: ucp_ep_rkey_unpack failed: %d\n", __FILE__, __LINE__, status); ret = OMPI_ERROR; goto error; } status = ucp_ep_rkey_unpack(ep, &(recv_buf[disps[i] + 2 * sizeof(uint64_t) + rkey_sizes[i]]), &((module->state_info_array[i]).rkey)); if (status != UCS_OK) { opal_output_verbose(1, ompi_osc_base_framework.framework_output, "%s:%d: ucp_ep_rkey_unpack failed: %d\n", __FILE__, __LINE__, status); ret = OMPI_ERROR; goto error; } } free(my_info); free(recv_buf); ucp_rkey_buffer_release(rkey_buffer); ucp_rkey_buffer_release(state_rkey_buffer); module->state.lock = TARGET_LOCK_UNLOCKED; module->state.post_index = 0; memset((void *)module->state.post_state, 0, sizeof(uint64_t) * OMPI_OSC_UCX_POST_PEER_MAX); module->state.complete_count = 0; module->state.req_flag = 0; module->state.acc_lock = TARGET_LOCK_UNLOCKED; module->epoch_type.access = NONE_EPOCH; module->epoch_type.exposure = NONE_EPOCH; module->lock_count = 0; module->post_count = 0; module->start_group = NULL; module->post_group = NULL; OBJ_CONSTRUCT(&module->outstanding_locks, opal_hash_table_t); OBJ_CONSTRUCT(&module->pending_posts, opal_list_t); module->global_ops_num = 0; module->per_target_ops_nums = calloc(comm_size, sizeof(int)); module->start_grp_ranks = NULL; module->lock_all_is_nocheck = false; ret = opal_hash_table_init(&module->outstanding_locks, comm_size); if (ret != OPAL_SUCCESS) { goto error; } win->w_osc_module = &module->super; /* sync with everyone */ ret = module->comm->c_coll->coll_barrier(module->comm, module->comm->c_coll->coll_barrier_module); if (ret != OMPI_SUCCESS) { goto error; } return ret; error: if (my_addr) ucp_worker_release_address(mca_osc_ucx_component.ucp_worker, my_addr); if (recv_buf) free(recv_buf); if (my_info) free(my_info); for (i = 0; i < comm_size; i++) { if ((module->win_info_array[i]).rkey != NULL) { ucp_rkey_destroy((module->win_info_array[i]).rkey); } if ((module->state_info_array[i]).rkey != NULL) { ucp_rkey_destroy((module->state_info_array[i]).rkey); } } if (rkey_buffer) ucp_rkey_buffer_release(rkey_buffer); if (state_rkey_buffer) ucp_rkey_buffer_release(state_rkey_buffer); if (module->win_info_array) free(module->win_info_array); if (module->state_info_array) free(module->state_info_array); if (module->disp_units) free(module->disp_units); if (module->comm) ompi_comm_free(&module->comm); if (module->per_target_ops_nums) free(module->per_target_ops_nums); if (eps_created) { for (i = 0; i < comm_size; i++) { ucp_ep_h ep = OSC_UCX_GET_EP(module->comm, i); ucp_ep_destroy(ep); } } if (worker_created) ucp_worker_destroy(mca_osc_ucx_component.ucp_worker); if (module) free(module); return ret; }