Exemplo n.º 1
0
static inline int start_shared(ompi_osc_ucx_module_t *module, int target) {
    uint64_t result_value = -1;
    ucp_ep_h ep = OSC_UCX_GET_EP(module->comm, target);
    ucp_rkey_h rkey = (module->state_info_array)[target].rkey;
    uint64_t remote_addr = (module->state_info_array)[target].addr + OSC_UCX_STATE_LOCK_OFFSET;
    ucs_status_t status;
    int ret;

    while (true) {
        ret = opal_common_ucx_atomic_fetch(ep, UCP_ATOMIC_FETCH_OP_FADD, 1,
                                           &result_value, sizeof(result_value),
                                           remote_addr, rkey, mca_osc_ucx_component.ucp_worker);
        if (OMPI_SUCCESS != ret) {
            return ret;
        }
        assert((int64_t)result_value >= 0);
        if (result_value >= TARGET_LOCK_EXCLUSIVE) {
            status = ucp_atomic_post(ep, UCP_ATOMIC_POST_OP_ADD, (-1), sizeof(uint64_t),
                                     remote_addr, rkey);
            if (status != UCS_OK) {
                OSC_UCX_VERBOSE(1, "ucp_atomic_add64 failed: %d", status);
                return OMPI_ERROR;
            }
        } else {
            break;
        }
    }

    return OMPI_SUCCESS;
}
Exemplo n.º 2
0
int ompi_osc_ucx_get(void *origin_addr, int origin_count,
                     struct ompi_datatype_t *origin_dt,
                     int target, ptrdiff_t target_disp, int target_count,
                     struct ompi_datatype_t *target_dt, struct ompi_win_t *win) {
    ompi_osc_ucx_module_t *module = (ompi_osc_ucx_module_t*) win->w_osc_module;
    ucp_ep_h ep = OSC_UCX_GET_EP(module->comm, target);
    uint64_t remote_addr = (module->win_info_array[target]).addr + target_disp * OSC_UCX_GET_DISP(module, target);
    ucp_rkey_h rkey;
    ptrdiff_t origin_lb, origin_extent, target_lb, target_extent;
    bool is_origin_contig = false, is_target_contig = false;
    ucs_status_t status;
    int ret = OMPI_SUCCESS;

    ret = check_sync_state(module, target, false);
    if (ret != OMPI_SUCCESS) {
        return ret;
    }

    if (module->flavor == MPI_WIN_FLAVOR_DYNAMIC) {
        status = get_dynamic_win_info(remote_addr, module, ep, target);
        if (status != UCS_OK) {
            return OMPI_ERROR;
        }
    }

    rkey = (module->win_info_array[target]).rkey;

    ompi_datatype_get_true_extent(origin_dt, &origin_lb, &origin_extent);
    ompi_datatype_get_true_extent(target_dt, &target_lb, &target_extent);

    is_origin_contig = ompi_datatype_is_contiguous_memory_layout(origin_dt, origin_count);
    is_target_contig = ompi_datatype_is_contiguous_memory_layout(target_dt, target_count);

    if (is_origin_contig && is_target_contig) {
        /* fast path */
        size_t origin_len;

        ompi_datatype_type_size(origin_dt, &origin_len);
        origin_len *= origin_count;

        status = ucp_get_nbi(ep, (void *)((intptr_t)origin_addr + origin_lb), origin_len,
                             remote_addr + target_lb, rkey);
        if (status != UCS_OK && status != UCS_INPROGRESS) {
            opal_output_verbose(1, ompi_osc_base_framework.framework_output,
                                "%s:%d: ucp_get_nbi failed: %d\n",
                                __FILE__, __LINE__, status);
            return OMPI_ERROR;
        }

        return incr_and_check_ops_num(module, target, ep);
    } else {
        return ddt_put_get(module, origin_addr, origin_count, origin_dt, is_origin_contig,
                           origin_lb, target, ep, remote_addr, rkey, target_count, target_dt,
                           is_target_contig, target_lb, true);
    }
}
Exemplo n.º 3
0
static inline int end_shared(ompi_osc_ucx_module_t *module, int target) {
    ucp_ep_h ep = OSC_UCX_GET_EP(module->comm, target);
    ucp_rkey_h rkey = (module->state_info_array)[target].rkey;
    uint64_t remote_addr = (module->state_info_array)[target].addr + OSC_UCX_STATE_LOCK_OFFSET;
    ucs_status_t status;

    status = ucp_atomic_post(ep, UCP_ATOMIC_POST_OP_ADD, (-1), sizeof(uint64_t),
                             remote_addr, rkey);
    if (status != UCS_OK) {
        OSC_UCX_VERBOSE(1, "ucp_atomic_post(OP_ADD) failed: %d", status);
        return OMPI_ERROR;
    }

    return OMPI_SUCCESS;
}
Exemplo n.º 4
0
int ompi_osc_ucx_compare_and_swap(const void *origin_addr, const void *compare_addr,
                                  void *result_addr, struct ompi_datatype_t *dt,
                                  int target, ptrdiff_t target_disp,
                                  struct ompi_win_t *win) {
    ompi_osc_ucx_module_t *module = (ompi_osc_ucx_module_t *)win->w_osc_module;
    ucp_ep_h ep = OSC_UCX_GET_EP(module->comm, target);
    uint64_t remote_addr = (module->win_info_array[target]).addr + target_disp * OSC_UCX_GET_DISP(module, target);
    ucp_rkey_h rkey;
    size_t dt_bytes;
    ompi_osc_ucx_internal_request_t *req = NULL;
    int ret = OMPI_SUCCESS;
    ucs_status_t status;

    ret = check_sync_state(module, target, false);
    if (ret != OMPI_SUCCESS) {
        return ret;
    }

    ret = start_atomicity(module, ep, target);
    if (ret != OMPI_SUCCESS) {
        return ret;
    }

    if (module->flavor == MPI_WIN_FLAVOR_DYNAMIC) {
        status = get_dynamic_win_info(remote_addr, module, ep, target);
        if (status != UCS_OK) {
            return OMPI_ERROR;
        }
    }

    rkey = (module->win_info_array[target]).rkey;

    ompi_datatype_type_size(dt, &dt_bytes);
    memcpy(result_addr, origin_addr, dt_bytes);
    req = ucp_atomic_fetch_nb(ep, UCP_ATOMIC_FETCH_OP_CSWAP, *(uint64_t *)compare_addr,
                              result_addr, dt_bytes, remote_addr, rkey, req_completion);
    if (UCS_PTR_IS_PTR(req)) {
        ucp_request_release(req);
    }

    ret = incr_and_check_ops_num(module, target, ep);
    if (ret != OMPI_SUCCESS) {
        return ret;
    }

    return end_atomicity(module, ep, target);
}
Exemplo n.º 5
0
int ompi_osc_ucx_unlock(int target, struct ompi_win_t *win) {
    ompi_osc_ucx_module_t *module = (ompi_osc_ucx_module_t *)win->w_osc_module;
    ompi_osc_ucx_lock_t *lock = NULL;
    int ret = OMPI_SUCCESS;
    ucp_ep_h ep;

    if (module->epoch_type.access != PASSIVE_EPOCH) {
        return OMPI_ERR_RMA_SYNC;
    }

    opal_hash_table_get_value_uint32(&module->outstanding_locks, (uint32_t) target, (void **) &lock);
    if (lock == NULL) {
        return OMPI_ERR_RMA_SYNC;
    }

    opal_hash_table_remove_value_uint32(&module->outstanding_locks,
                                        (uint32_t)target);

    ep = OSC_UCX_GET_EP(module->comm, target);
    ret = opal_common_ucx_ep_flush(ep, mca_osc_ucx_component.ucp_worker);
    if (ret != OMPI_SUCCESS) {
        return ret;
    }

    module->global_ops_num -= module->per_target_ops_nums[target];
    module->per_target_ops_nums[target] = 0;

    if (lock->is_nocheck == false) {
        if (lock->type == LOCK_EXCLUSIVE) {
            ret = end_exclusive(module, target);
        } else {
            ret = end_shared(module, target);
        }
    }

    OBJ_RELEASE(lock);

    module->lock_count--;
    assert(module->lock_count >= 0);
    if (module->lock_count == 0) {
        module->epoch_type.access = NONE_EPOCH;
        assert(module->global_ops_num == 0);
    }

    return ret;
}
Exemplo n.º 6
0
static inline int end_exclusive(ompi_osc_ucx_module_t *module, int target) {
    uint64_t result_value = 0;
    ucp_ep_h ep = OSC_UCX_GET_EP(module->comm, target);
    ucp_rkey_h rkey = (module->state_info_array)[target].rkey;
    uint64_t remote_addr = (module->state_info_array)[target].addr + OSC_UCX_STATE_LOCK_OFFSET;
    int ret;

    ret = opal_common_ucx_atomic_fetch(ep, UCP_ATOMIC_FETCH_OP_SWAP, TARGET_LOCK_UNLOCKED,
                                       &result_value, sizeof(result_value),
                                       remote_addr, rkey, mca_osc_ucx_component.ucp_worker);
    if (OMPI_SUCCESS != ret) {
        return ret;
    }

    assert(result_value >= TARGET_LOCK_EXCLUSIVE);

    return OMPI_SUCCESS;
}
Exemplo n.º 7
0
static inline int start_exclusive(ompi_osc_ucx_module_t *module, int target) {
    uint64_t result_value = -1;
    ucp_ep_h ep = OSC_UCX_GET_EP(module->comm, target);
    ucp_rkey_h rkey = (module->state_info_array)[target].rkey;
    uint64_t remote_addr = (module->state_info_array)[target].addr + OSC_UCX_STATE_LOCK_OFFSET;
    ucs_status_t status;

    while (result_value != TARGET_LOCK_UNLOCKED) {
        status = opal_common_ucx_atomic_cswap(ep, TARGET_LOCK_UNLOCKED, TARGET_LOCK_EXCLUSIVE,
                                              &result_value, sizeof(result_value),
                                              remote_addr, rkey,
                                              mca_osc_ucx_component.ucp_worker);
        if (status != UCS_OK) {
            return OMPI_ERROR;
        }
    }

    return OMPI_SUCCESS;
}
Exemplo n.º 8
0
static int component_finalize(void) {
    int i;
    for (i = 0; i < ompi_proc_world_size(); i++) {
        ucp_ep_h ep = OSC_UCX_GET_EP(&(ompi_mpi_comm_world.comm), i);
        if (ep != NULL) {
            ucp_ep_destroy(ep);
        }
    }

    if (mca_osc_ucx_component.ucp_worker != NULL) {
        ucp_worker_destroy(mca_osc_ucx_component.ucp_worker);
    }

    assert(mca_osc_ucx_component.num_incomplete_req_ops == 0);
    OBJ_DESTRUCT(&mca_osc_ucx_component.requests);
    opal_progress_unregister(progress_callback);
    ucp_cleanup(mca_osc_ucx_component.ucp_context);
    return OMPI_SUCCESS;
}
Exemplo n.º 9
0
int ompi_osc_ucx_flush(int target, struct ompi_win_t *win) {
    ompi_osc_ucx_module_t *module = (ompi_osc_ucx_module_t*) win->w_osc_module;
    ucp_ep_h ep;
    int ret;

    if (module->epoch_type.access != PASSIVE_EPOCH &&
        module->epoch_type.access != PASSIVE_ALL_EPOCH) {
        return OMPI_ERR_RMA_SYNC;
    }

    ep = OSC_UCX_GET_EP(module->comm, target);
    ret = opal_common_ucx_ep_flush(ep, mca_osc_ucx_component.ucp_worker);
    if (ret != OMPI_SUCCESS) {
        return ret;
    }

    module->global_ops_num -= module->per_target_ops_nums[target];
    module->per_target_ops_nums[target] = 0;

    return OMPI_SUCCESS;
}
Exemplo n.º 10
0
int ompi_osc_ucx_rget(void *origin_addr, int origin_count,
                      struct ompi_datatype_t *origin_dt,
                      int target, ptrdiff_t target_disp, int target_count,
                      struct ompi_datatype_t *target_dt, struct ompi_win_t *win,
                      struct ompi_request_t **request) {
    ompi_osc_ucx_module_t *module = (ompi_osc_ucx_module_t*) win->w_osc_module;
    ucp_ep_h ep = OSC_UCX_GET_EP(module->comm, target);
    uint64_t remote_addr = (module->state_info_array[target]).addr + OSC_UCX_STATE_REQ_FLAG_OFFSET;
    ucp_rkey_h rkey;
    ompi_osc_ucx_request_t *ucx_req = NULL;
    ompi_osc_ucx_internal_request_t *internal_req = NULL;
    ucs_status_t status;
    int ret = OMPI_SUCCESS;

    ret = check_sync_state(module, target, true);
    if (ret != OMPI_SUCCESS) {
        return ret;
    }

    if (module->flavor == MPI_WIN_FLAVOR_DYNAMIC) {
        status = get_dynamic_win_info(remote_addr, module, ep, target);
        if (status != UCS_OK) {
            return OMPI_ERROR;
        }
    }

    rkey = (module->win_info_array[target]).rkey;

    OMPI_OSC_UCX_REQUEST_ALLOC(win, ucx_req);
    if (NULL == ucx_req) {
        return OMPI_ERR_TEMP_OUT_OF_RESOURCE;
    }

    ret = ompi_osc_ucx_get(origin_addr, origin_count, origin_dt, target, target_disp,
                           target_count, target_dt, win);
    if (ret != OMPI_SUCCESS) {
        return ret;
    }

    status = ucp_worker_fence(mca_osc_ucx_component.ucp_worker);
    if (status != UCS_OK) {
        opal_output_verbose(1, ompi_osc_base_framework.framework_output,
                            "%s:%d: ucp_worker_fence failed: %d\n",
                            __FILE__, __LINE__, status);
        return OMPI_ERROR;
    }

    internal_req = ucp_atomic_fetch_nb(ep, UCP_ATOMIC_FETCH_OP_FADD, 0,
                                       &(module->req_result), sizeof(uint64_t),
                                       remote_addr, rkey, req_completion);

    if (UCS_PTR_IS_PTR(internal_req)) {
        internal_req->external_req = ucx_req;
        mca_osc_ucx_component.num_incomplete_req_ops++;
    } else {
        ompi_request_complete(&ucx_req->super, true);
    }

    *request = &ucx_req->super;

    return incr_and_check_ops_num(module, target, ep);
}
Exemplo n.º 11
0
int ompi_osc_ucx_get_accumulate(const void *origin_addr, int origin_count,
                                struct ompi_datatype_t *origin_dt,
                                void *result_addr, int result_count,
                                struct ompi_datatype_t *result_dt,
                                int target, ptrdiff_t target_disp,
                                int target_count, struct ompi_datatype_t *target_dt,
                                struct ompi_op_t *op, struct ompi_win_t *win) {
    ompi_osc_ucx_module_t *module = (ompi_osc_ucx_module_t*) win->w_osc_module;
    ucp_ep_h ep = OSC_UCX_GET_EP(module->comm, target);
    int ret = OMPI_SUCCESS;

    ret = check_sync_state(module, target, false);
    if (ret != OMPI_SUCCESS) {
        return ret;
    }

    ret = start_atomicity(module, ep, target);
    if (ret != OMPI_SUCCESS) {
        return ret;
    }

    ret = ompi_osc_ucx_get(result_addr, result_count, result_dt, target,
                           target_disp, target_count, target_dt, win);
    if (ret != OMPI_SUCCESS) {
        return ret;
    }

    if (op != &ompi_mpi_op_no_op.op) {
        if (op == &ompi_mpi_op_replace.op) {
            ret = ompi_osc_ucx_put(origin_addr, origin_count, origin_dt,
                                   target, target_disp, target_count,
                                   target_dt, win);
            if (ret != OMPI_SUCCESS) {
                return ret;
            }
        } else {
            void *temp_addr = NULL;
            uint32_t temp_count;
            ompi_datatype_t *temp_dt;
            ptrdiff_t temp_lb, temp_extent;
            ucs_status_t status;
            bool is_origin_contig = ompi_datatype_is_contiguous_memory_layout(origin_dt, origin_count);

            if (ompi_datatype_is_predefined(target_dt)) {
                temp_dt = target_dt;
                temp_count = target_count;
            } else {
                ret = ompi_osc_base_get_primitive_type_info(target_dt, &temp_dt, &temp_count);
                if (ret != OMPI_SUCCESS) {
                    return ret;
                }
            }
            ompi_datatype_get_true_extent(temp_dt, &temp_lb, &temp_extent);
            temp_addr = malloc(temp_extent * temp_count);
            if (temp_addr == NULL) {
                return OMPI_ERR_TEMP_OUT_OF_RESOURCE;
            }

            ret = ompi_osc_ucx_get(temp_addr, (int)temp_count, temp_dt,
                                   target, target_disp, target_count, target_dt, win);
            if (ret != OMPI_SUCCESS) {
                return ret;
            }

            status = ucp_ep_flush(ep);
            if (status != UCS_OK) {
                opal_output_verbose(1, ompi_osc_base_framework.framework_output,
                                    "%s:%d: ucp_ep_flush failed: %d\n",
                                    __FILE__, __LINE__, status);
                return OMPI_ERROR;
            }

            if (ompi_datatype_is_predefined(origin_dt) || is_origin_contig) {
                ompi_op_reduce(op, (void *)origin_addr, temp_addr, (int)temp_count, temp_dt);
            } else {
                ucx_iovec_t *origin_ucx_iov = NULL;
                uint32_t origin_ucx_iov_count = 0;
                uint32_t origin_ucx_iov_idx = 0;

                ret = create_iov_list(origin_addr, origin_count, origin_dt,
                                      &origin_ucx_iov, &origin_ucx_iov_count);
                if (ret != OMPI_SUCCESS) {
                    return ret;
                }

                if ((op != &ompi_mpi_op_maxloc.op && op != &ompi_mpi_op_minloc.op) ||
                    ompi_datatype_is_contiguous_memory_layout(temp_dt, temp_count)) {
                    size_t temp_size;
                    ompi_datatype_type_size(temp_dt, &temp_size);
                    while (origin_ucx_iov_idx < origin_ucx_iov_count) {
                        int curr_count = origin_ucx_iov[origin_ucx_iov_idx].len / temp_size;
                        ompi_op_reduce(op, origin_ucx_iov[origin_ucx_iov_idx].addr,
                                       temp_addr, curr_count, temp_dt);
                        temp_addr = (void *)((char *)temp_addr + curr_count * temp_size);
                        origin_ucx_iov_idx++;
                    }
                } else {
                    int i;
                    void *curr_origin_addr = origin_ucx_iov[origin_ucx_iov_idx].addr;
                    for (i = 0; i < (int)temp_count; i++) {
                        ompi_op_reduce(op, curr_origin_addr,
                                       (void *)((char *)temp_addr + i * temp_extent),
                                       1, temp_dt);
                        curr_origin_addr = (void *)((char *)curr_origin_addr + temp_extent);
                        origin_ucx_iov_idx++;
                        if (curr_origin_addr >= (void *)((char *)origin_ucx_iov[origin_ucx_iov_idx].addr + origin_ucx_iov[origin_ucx_iov_idx].len)) {
                            origin_ucx_iov_idx++;
                            curr_origin_addr = origin_ucx_iov[origin_ucx_iov_idx].addr;
                        }
                    }
                }
                free(origin_ucx_iov);
            }

            ret = ompi_osc_ucx_put(temp_addr, (int)temp_count, temp_dt, target, target_disp,
                                   target_count, target_dt, win);
            if (ret != OMPI_SUCCESS) {
                return ret;
            }

            status = ucp_ep_flush(ep);
            if (status != UCS_OK) {
                opal_output_verbose(1, ompi_osc_base_framework.framework_output,
                                    "%s:%d: ucp_ep_flush failed: %d\n",
                                    __FILE__, __LINE__, status);
                return OMPI_ERROR;
            }

            free(temp_addr);
        }
    }

    ret = end_atomicity(module, ep, target);

    return ret;
}
Exemplo n.º 12
0
int ompi_osc_ucx_fetch_and_op(const void *origin_addr, void *result_addr,
                              struct ompi_datatype_t *dt, int target,
                              ptrdiff_t target_disp, struct ompi_op_t *op,
                              struct ompi_win_t *win) {
    ompi_osc_ucx_module_t *module = (ompi_osc_ucx_module_t*) win->w_osc_module;
    int ret = OMPI_SUCCESS;

    ret = check_sync_state(module, target, false);
    if (ret != OMPI_SUCCESS) {
        return ret;
    }

    if (op == &ompi_mpi_op_no_op.op || op == &ompi_mpi_op_replace.op ||
        op == &ompi_mpi_op_sum.op) {
        ucp_ep_h ep = OSC_UCX_GET_EP(module->comm, target);
        uint64_t remote_addr = (module->win_info_array[target]).addr + target_disp * OSC_UCX_GET_DISP(module, target);
        ucp_rkey_h rkey;
        uint64_t value = *(uint64_t *)origin_addr;
        ucp_atomic_fetch_op_t opcode;
        size_t dt_bytes;
        ompi_osc_ucx_internal_request_t *req = NULL;
        ucs_status_t status;

        ret = start_atomicity(module, ep, target);
        if (ret != OMPI_SUCCESS) {
            return ret;
        }

        if (module->flavor == MPI_WIN_FLAVOR_DYNAMIC) {
            status = get_dynamic_win_info(remote_addr, module, ep, target);
            if (status != UCS_OK) {
                return OMPI_ERROR;
            }
        }

        rkey = (module->win_info_array[target]).rkey;

        ompi_datatype_type_size(dt, &dt_bytes);

        if (op == &ompi_mpi_op_replace.op) {
            opcode = UCP_ATOMIC_FETCH_OP_SWAP;
        } else {
            opcode = UCP_ATOMIC_FETCH_OP_FADD;
            if (op == &ompi_mpi_op_no_op.op) {
                value = 0;
            }
        }

        req = ucp_atomic_fetch_nb(ep, opcode, value, result_addr,
                                  dt_bytes, remote_addr, rkey, req_completion);
        if (UCS_PTR_IS_PTR(req)) {
            ucp_request_release(req);
        }

        ret = incr_and_check_ops_num(module, target, ep);
        if (ret != OMPI_SUCCESS) {
            return ret;
        }

        return end_atomicity(module, ep, target);
    } else {
        return ompi_osc_ucx_get_accumulate(origin_addr, 1, dt, result_addr, 1, dt,
                                           target, target_disp, 1, dt, op, win);
    }
}
Exemplo n.º 13
0
static int component_select(struct ompi_win_t *win, void **base, size_t size, int disp_unit,
                            struct ompi_communicator_t *comm, struct opal_info_t *info,
                            int flavor, int *model) {
    ompi_osc_ucx_module_t *module = NULL;
    char *name = NULL;
    long values[2];
    int ret = OMPI_SUCCESS;
    ucs_status_t status;
    int i, comm_size = ompi_comm_size(comm);
    int is_eps_ready;
    bool eps_created = false, worker_created = false;
    ucp_address_t *my_addr = NULL;
    size_t my_addr_len;
    char *recv_buf = NULL;
    void *rkey_buffer = NULL, *state_rkey_buffer = NULL;
    size_t rkey_buffer_size, state_rkey_buffer_size;
    void *state_base = NULL;
    void * my_info = NULL;
    size_t my_info_len;
    int disps[comm_size];
    int rkey_sizes[comm_size];

    /* the osc/sm component is the exclusive provider for support for
     * shared memory windows */
    if (flavor == MPI_WIN_FLAVOR_SHARED) {
        return OMPI_ERR_NOT_SUPPORTED;
    }

    /* if UCP worker has never been initialized before, init it first */
    if (mca_osc_ucx_component.ucp_worker == NULL) {
        ucp_worker_params_t worker_params;
        ucp_worker_attr_t worker_attr;

        memset(&worker_params, 0, sizeof(ucp_worker_h));
        worker_params.field_mask = UCP_WORKER_PARAM_FIELD_THREAD_MODE;
        worker_params.thread_mode = (mca_osc_ucx_component.enable_mpi_threads == true)
                                    ? UCS_THREAD_MODE_MULTI : UCS_THREAD_MODE_SINGLE;
        status = ucp_worker_create(mca_osc_ucx_component.ucp_context, &worker_params,
                                   &(mca_osc_ucx_component.ucp_worker));
        if (UCS_OK != status) {
            opal_output_verbose(1, ompi_osc_base_framework.framework_output,
                                "%s:%d: ucp_worker_create failed: %d\n",
                                __FILE__, __LINE__, status);
            ret = OMPI_ERROR;
            goto error;
        }

        /* query UCP worker attributes */
        worker_attr.field_mask = UCP_WORKER_ATTR_FIELD_THREAD_MODE;
        status = ucp_worker_query(mca_osc_ucx_component.ucp_worker, &worker_attr);
        if (UCS_OK != status) {
            opal_output_verbose(1, ompi_osc_base_framework.framework_output,
                                "%s:%d: ucp_worker_query failed: %d\n",
                                __FILE__, __LINE__, status);
            ret = OMPI_ERROR;
            goto error;
        }

        if (mca_osc_ucx_component.enable_mpi_threads == true &&
            worker_attr.thread_mode != UCS_THREAD_MODE_MULTI) {
            opal_output_verbose(1, ompi_osc_base_framework.framework_output,
                                "%s:%d: ucx does not support multithreading\n",
                                __FILE__, __LINE__);
            ret = OMPI_ERROR;
            goto error;
        }

        worker_created = true;
    }

    /* create module structure */
    module = (ompi_osc_ucx_module_t *)calloc(1, sizeof(ompi_osc_ucx_module_t));
    if (module == NULL) {
        ret = OMPI_ERR_TEMP_OUT_OF_RESOURCE;
        goto error;
    }

    /* fill in the function pointer part */
    memcpy(module, &ompi_osc_ucx_module_template, sizeof(ompi_osc_base_module_t));

    ret = ompi_comm_dup(comm, &module->comm);
    if (ret != OMPI_SUCCESS) {
        goto error;
    }

    asprintf(&name, "ucx window %d", ompi_comm_get_cid(module->comm));
    ompi_win_set_name(win, name);
    free(name);

    /* share everyone's displacement units. Only do an allgather if
       strictly necessary, since it requires O(p) state. */
    values[0] = disp_unit;
    values[1] = -disp_unit;

    ret = module->comm->c_coll->coll_allreduce(MPI_IN_PLACE, values, 2, MPI_LONG,
                                               MPI_MIN, module->comm,
                                               module->comm->c_coll->coll_allreduce_module);
    if (OMPI_SUCCESS != ret) {
        goto error;
    }

    if (values[0] == -values[1]) { /* everyone has the same disp_unit, we do not need O(p) space */
        module->disp_unit = disp_unit;
    } else { /* different disp_unit sizes, allocate O(p) space to store them */
        module->disp_unit = -1;
        module->disp_units = calloc(comm_size, sizeof(int));
        if (module->disp_units == NULL) {
            ret = OMPI_ERR_TEMP_OUT_OF_RESOURCE;
            goto error;
        }

        ret = module->comm->c_coll->coll_allgather(&disp_unit, 1, MPI_INT,
                                                   module->disp_units, 1, MPI_INT,
                                                   module->comm,
                                                   module->comm->c_coll->coll_allgather_module);
        if (OMPI_SUCCESS != ret) {
            goto error;
        }
    }

    /* exchange endpoints if necessary */
    is_eps_ready = 1;
    for (i = 0; i < comm_size; i++) {
        if (OSC_UCX_GET_EP(module->comm, i) == NULL) {
            is_eps_ready = 0;
            break;
        }
    }

    ret = module->comm->c_coll->coll_allreduce(MPI_IN_PLACE, &is_eps_ready, 1, MPI_INT,
                                               MPI_LAND,
                                               module->comm,
                                               module->comm->c_coll->coll_allreduce_module);
    if (OMPI_SUCCESS != ret) {
        goto error;
    }

    if (!is_eps_ready) {
        status = ucp_worker_get_address(mca_osc_ucx_component.ucp_worker,
                                        &my_addr, &my_addr_len);
        if (status != UCS_OK) {
            opal_output_verbose(1, ompi_osc_base_framework.framework_output,
                                "%s:%d: ucp_worker_get_address failed: %d\n",
                                __FILE__, __LINE__, status);
            ret = OMPI_ERROR;
            goto error;
        }

        ret = allgather_len_and_info(my_addr, (int)my_addr_len,
                                     &recv_buf, disps, module->comm);
        if (ret != OMPI_SUCCESS) {
            goto error;
        }

        for (i = 0; i < comm_size; i++) {
            if (OSC_UCX_GET_EP(module->comm, i) == NULL) {
                ucp_ep_params_t ep_params;
                ucp_ep_h ep;
                memset(&ep_params, 0, sizeof(ucp_ep_params_t));
                ep_params.field_mask = UCP_EP_PARAM_FIELD_REMOTE_ADDRESS;
                ep_params.address = (ucp_address_t *)&(recv_buf[disps[i]]);
                status = ucp_ep_create(mca_osc_ucx_component.ucp_worker, &ep_params, &ep);
                if (status != UCS_OK) {
                    opal_output_verbose(1, ompi_osc_base_framework.framework_output,
                                        "%s:%d: ucp_ep_create failed: %d\n",
                                        __FILE__, __LINE__, status);
                    ret = OMPI_ERROR;
                    goto error;
                }

                ompi_comm_peer_lookup(module->comm, i)->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_UCX] = ep;
            }
        }

        ucp_worker_release_address(mca_osc_ucx_component.ucp_worker, my_addr);
        my_addr = NULL;
        free(recv_buf);
        recv_buf = NULL;

        eps_created = true;
    }

    ret = mem_map(base, size, &(module->memh), module, flavor);
    if (ret != OMPI_SUCCESS) {
        goto error;
    }

    state_base = (void *)&(module->state);
    ret = mem_map(&state_base, sizeof(ompi_osc_ucx_state_t), &(module->state_memh),
                  module, MPI_WIN_FLAVOR_CREATE);
    if (ret != OMPI_SUCCESS) {
        goto error;
    }

    module->win_info_array = calloc(comm_size, sizeof(ompi_osc_ucx_win_info_t));
    if (module->win_info_array == NULL) {
        ret = OMPI_ERR_TEMP_OUT_OF_RESOURCE;
        goto error;
    }

    module->state_info_array = calloc(comm_size, sizeof(ompi_osc_ucx_win_info_t));
    if (module->state_info_array == NULL) {
        ret = OMPI_ERR_TEMP_OUT_OF_RESOURCE;
        goto error;
    }

    status = ucp_rkey_pack(mca_osc_ucx_component.ucp_context, module->memh,
                           &rkey_buffer, &rkey_buffer_size);
    if (status != UCS_OK) {
        opal_output_verbose(1, ompi_osc_base_framework.framework_output,
                            "%s:%d: ucp_rkey_pack failed: %d\n",
                            __FILE__, __LINE__, status);
        ret = OMPI_ERROR;
        goto error;
    }

    status = ucp_rkey_pack(mca_osc_ucx_component.ucp_context, module->state_memh,
                           &state_rkey_buffer, &state_rkey_buffer_size);
    if (status != UCS_OK) {
        opal_output_verbose(1, ompi_osc_base_framework.framework_output,
                            "%s:%d: ucp_rkey_pack failed: %d\n",
                            __FILE__, __LINE__, status);
        ret = OMPI_ERROR;
        goto error;
    }

    my_info_len = 2 * sizeof(uint64_t) + rkey_buffer_size + state_rkey_buffer_size;
    my_info = malloc(my_info_len);
    if (my_info == NULL) {
        ret = OMPI_ERR_TEMP_OUT_OF_RESOURCE;
        goto error;
    }

    memcpy(my_info, base, sizeof(uint64_t));
    memcpy((void *)((char *)my_info + sizeof(uint64_t)), &state_base, sizeof(uint64_t));
    memcpy((void *)((char *)my_info + 2 * sizeof(uint64_t)), rkey_buffer, rkey_buffer_size);
    memcpy((void *)((char *)my_info + 2 * sizeof(uint64_t) + rkey_buffer_size),
           state_rkey_buffer, state_rkey_buffer_size);

    ret = allgather_len_and_info(my_info, (int)my_info_len, &recv_buf, disps, module->comm);
    if (ret != OMPI_SUCCESS) {
        goto error;
    }

    ret = comm->c_coll->coll_allgather((void *)&rkey_buffer_size, 1, MPI_INT,
                                       rkey_sizes, 1, MPI_INT, comm,
                                       comm->c_coll->coll_allgather_module);
    if (OMPI_SUCCESS != ret) {
        goto error;
    }

    for (i = 0; i < comm_size; i++) {
        ucp_ep_h ep = OSC_UCX_GET_EP(module->comm, i);
        assert(ep != NULL);

        memcpy(&(module->win_info_array[i]).addr, &recv_buf[disps[i]], sizeof(uint64_t));
        memcpy(&(module->state_info_array[i]).addr, &recv_buf[disps[i] + sizeof(uint64_t)], 
               sizeof(uint64_t));

        status = ucp_ep_rkey_unpack(ep, &(recv_buf[disps[i] + 2 * sizeof(uint64_t)]),
                                    &((module->win_info_array[i]).rkey));
        if (status != UCS_OK) {
            opal_output_verbose(1, ompi_osc_base_framework.framework_output,
                                "%s:%d: ucp_ep_rkey_unpack failed: %d\n",
                                __FILE__, __LINE__, status);
            ret = OMPI_ERROR;
            goto error;
        }

        status = ucp_ep_rkey_unpack(ep, &(recv_buf[disps[i] + 2 * sizeof(uint64_t) + rkey_sizes[i]]),
                                    &((module->state_info_array[i]).rkey));
        if (status != UCS_OK) {
            opal_output_verbose(1, ompi_osc_base_framework.framework_output,
                                "%s:%d: ucp_ep_rkey_unpack failed: %d\n",
                                __FILE__, __LINE__, status);
            ret = OMPI_ERROR;
            goto error;
        }
    }

    free(my_info);
    free(recv_buf);

    ucp_rkey_buffer_release(rkey_buffer);
    ucp_rkey_buffer_release(state_rkey_buffer);

    module->state.lock = TARGET_LOCK_UNLOCKED;
    module->state.post_index = 0;
    memset((void *)module->state.post_state, 0, sizeof(uint64_t) * OMPI_OSC_UCX_POST_PEER_MAX);
    module->state.complete_count = 0;
    module->state.req_flag = 0;
    module->state.acc_lock = TARGET_LOCK_UNLOCKED;
    module->epoch_type.access = NONE_EPOCH;
    module->epoch_type.exposure = NONE_EPOCH;
    module->lock_count = 0;
    module->post_count = 0;
    module->start_group = NULL;
    module->post_group = NULL;
    OBJ_CONSTRUCT(&module->outstanding_locks, opal_hash_table_t);
    OBJ_CONSTRUCT(&module->pending_posts, opal_list_t);
    module->global_ops_num = 0;
    module->per_target_ops_nums = calloc(comm_size, sizeof(int));
    module->start_grp_ranks = NULL;
    module->lock_all_is_nocheck = false;

    ret = opal_hash_table_init(&module->outstanding_locks, comm_size);
    if (ret != OPAL_SUCCESS) {
        goto error;
    }

    win->w_osc_module = &module->super;

    /* sync with everyone */

    ret = module->comm->c_coll->coll_barrier(module->comm,
                                             module->comm->c_coll->coll_barrier_module);
    if (ret != OMPI_SUCCESS) {
        goto error;
    }

    return ret;

 error:
    if (my_addr) ucp_worker_release_address(mca_osc_ucx_component.ucp_worker, my_addr);
    if (recv_buf) free(recv_buf);
    if (my_info) free(my_info);
    for (i = 0; i < comm_size; i++) {
        if ((module->win_info_array[i]).rkey != NULL) {
            ucp_rkey_destroy((module->win_info_array[i]).rkey);
        }
        if ((module->state_info_array[i]).rkey != NULL) {
            ucp_rkey_destroy((module->state_info_array[i]).rkey);
        }
    }
    if (rkey_buffer) ucp_rkey_buffer_release(rkey_buffer);
    if (state_rkey_buffer) ucp_rkey_buffer_release(state_rkey_buffer);
    if (module->win_info_array) free(module->win_info_array);
    if (module->state_info_array) free(module->state_info_array);
    if (module->disp_units) free(module->disp_units);
    if (module->comm) ompi_comm_free(&module->comm);
    if (module->per_target_ops_nums) free(module->per_target_ops_nums);
    if (eps_created) {
        for (i = 0; i < comm_size; i++) {
            ucp_ep_h ep = OSC_UCX_GET_EP(module->comm, i);
            ucp_ep_destroy(ep);
        }
    }
    if (worker_created) ucp_worker_destroy(mca_osc_ucx_component.ucp_worker);
    if (module) free(module);
    return ret;
}