Esempio n. 1
0
static void ucp_perf_test_cleanup_endpoints(ucx_perf_context_t *perf)
{
    unsigned group_size;

    rte_call(perf, barrier);

    group_size  = rte_call(perf, group_size);

    ucp_perf_test_destroy_eps(perf, group_size);
}
Esempio n. 2
0
ucs_status_t ucx_perf_run(ucx_perf_params_t *params, ucx_perf_result_t *result)
{
    ucx_perf_context_t perf;
    ucs_status_t status;

    if (params->command == UCX_PERF_CMD_LAST) {
        ucs_error("Test is not selected");
        status = UCS_ERR_INVALID_PARAM;
        goto out;
    }

    if ((params->api != UCX_PERF_API_UCT) && (params->api != UCX_PERF_API_UCP)) {
        ucs_error("Invalid test API parameter (should be UCT or UCP)");
        status = UCS_ERR_INVALID_PARAM;
        goto out;
    }

    if (UCS_THREAD_MODE_SINGLE != params->thread_mode) {
        return ucx_perf_thread_spawn(params, result);
    }
        
    ucx_perf_test_reset(&perf, params);

    status = ucx_perf_funcs[params->api].setup(&perf, params);
    if (status != UCS_OK) {
        goto out;
    }

    if (params->warmup_iter > 0) {
        ucx_perf_set_warmup(&perf, params);
        status = ucx_perf_funcs[params->api].run(&perf);
        if (status != UCS_OK) {
            goto out_cleanup;
        }

        rte_call(&perf, barrier);
        ucx_perf_test_reset(&perf, params);
    }

    /* Run test */
    status = ucx_perf_funcs[params->api].run(&perf);
    rte_call(&perf, barrier);
    if (status == UCS_OK) {
        ucx_perf_calc_result(&perf, result);
        rte_call(&perf, report, result, perf.params.report_arg, 1);
    }

out_cleanup:
    ucx_perf_funcs[params->api].cleanup(&perf);
out:
    return status;
}
Esempio n. 3
0
static void ucp_perf_cleanup(ucx_perf_context_t *perf)
{
    ucp_perf_test_cleanup_endpoints(perf);
    rte_call(perf, barrier);
    ucp_perf_test_free_mem(perf);
    ucp_worker_destroy(perf->ucp.worker);
    ucp_cleanup(perf->ucp.context);
}
Esempio n. 4
0
static void* ucx_perf_thread_run_test(void* arg) {
    ucx_perf_thread_context_t* tctx = (ucx_perf_thread_context_t*) arg;
    ucx_perf_params_t* params = &tctx->params;
    ucx_perf_result_t* result = &tctx->result;
    ucx_perf_context_t* perf = &tctx->perf;
    ucs_status_t* statuses = tctx->statuses;
    pthread_barrier_t* tbarrier = tctx->tbarrier;
    int tid = tctx->tid;
    int i;

    if (params->warmup_iter > 0) {
        ucx_perf_set_warmup(perf, params);
        statuses[tid] = ucx_perf_funcs[params->api].run(perf);
        pthread_barrier_wait(tbarrier);
        for (i = 0; i < tctx->ntid; i++) {
            if (UCS_OK != statuses[i]) {
                goto out;
            }
        }
        if (0 == tid) {
            rte_call(perf, barrier);
            ucx_perf_test_reset(perf, params);
        }
    }

    /* Run test */
    pthread_barrier_wait(tbarrier);
    statuses[tid] = ucx_perf_funcs[params->api].run(perf);
    pthread_barrier_wait(tbarrier);
    for (i = 0; i < tctx->ntid; i++) {
        if (UCS_OK != statuses[i]) {
            goto out;
        }
    }
    if (0 == tid) {
        rte_call(perf, barrier);
        /* Assuming all threads are fairly treated, reporting only tid==0
            TODO: aggregate reports */
        ucx_perf_calc_result(perf, result);
        rte_call(perf, report, result, 1);
    }

out:
    return &statuses[tid];
}
Esempio n. 5
0
static void uct_perf_test_cleanup_endpoints(ucx_perf_context_t *perf)
{
    unsigned group_size, group_index, i;

    rte_call(perf, barrier);

    uct_iface_set_am_handler(perf->uct.iface, UCT_PERF_TEST_AM_ID, NULL, NULL, UCT_AM_CB_FLAG_SYNC);

    group_size  = rte_call(perf, group_size);
    group_index = rte_call(perf, group_index);

    for (i = 0; i < group_size; ++i) {
        if (i != group_index) {
            uct_rkey_release(&perf->uct.peers[i].rkey);
            if (perf->uct.peers[i].ep) {
                uct_ep_destroy(perf->uct.peers[i].ep);
            }
        }
    }
    free(perf->uct.peers);
}
Esempio n. 6
0
static ucs_status_t ucp_perf_test_exchange_status(ucx_perf_context_t *perf,
                                                  ucs_status_t status)
{
    unsigned group_size  = rte_call(perf, group_size);
    ucs_status_t collective_status = UCS_OK;
    struct iovec vec;
    void *req = NULL;
    unsigned i;

    vec.iov_base = &status;
    vec.iov_len  = sizeof(status);

    rte_call(perf, post_vec, &vec, 1, &req);
    rte_call(perf, exchange_vec, req);
    for (i = 0; i < group_size; ++i) {
        rte_call(perf, recv, i, &status, sizeof(status), req);
        if (status != UCS_OK) {
            collective_status = status;
        }
    }
    return collective_status;
}
Esempio n. 7
0
static ucs_status_t ucx_perf_cuda_init(ucx_perf_context_t *perf)
{
    cudaError_t cerr;
    unsigned group_index;
    int num_gpus;
    int gpu_index;

    group_index = rte_call(perf, group_index);

    cerr = cudaGetDeviceCount(&num_gpus);
    if (cerr != cudaSuccess) {
        return UCS_ERR_NO_DEVICE;
    }

    gpu_index = group_index % num_gpus;

    cerr = cudaSetDevice(gpu_index);
    if (cerr != cudaSuccess) {
        return UCS_ERR_NO_DEVICE;
    }

    return UCS_OK;
}
Esempio n. 8
0
static ucs_status_t ucp_perf_test_setup_endpoints(ucx_perf_context_t *perf,
                                                  uint64_t features)
{
    const size_t buffer_size = 2048;
    ucx_perf_ep_info_t info, *remote_info;
    unsigned group_size, i, group_index;
    ucp_address_t *address;
    size_t address_length = 0;
    ucp_ep_params_t ep_params;
    ucs_status_t status;
    struct iovec vec[3];
    void *rkey_buffer;
    void *req = NULL;
    void *buffer;

    group_size  = rte_call(perf, group_size);
    group_index = rte_call(perf, group_index);

    status = ucp_worker_get_address(perf->ucp.worker, &address, &address_length);
    if (status != UCS_OK) {
        if (perf->params.flags & UCX_PERF_TEST_FLAG_VERBOSE) {
            ucs_error("ucp_worker_get_address() failed: %s", ucs_status_string(status));
        }
        goto err;
    }

    info.ucp.addr_len  = address_length;
    info.recv_buffer   = (uintptr_t)perf->recv_buffer;

    vec[0].iov_base    = &info;
    vec[0].iov_len     = sizeof(info);
    vec[1].iov_base    = address;
    vec[1].iov_len     = address_length;

    if (features & (UCP_FEATURE_RMA|UCP_FEATURE_AMO32|UCP_FEATURE_AMO64)) {
        status = ucp_rkey_pack(perf->ucp.context, perf->ucp.recv_memh,
                               &rkey_buffer, &info.rkey_size);
        if (status != UCS_OK) {
            if (perf->params.flags & UCX_PERF_TEST_FLAG_VERBOSE) {
                ucs_error("ucp_rkey_pack() failed: %s", ucs_status_string(status));
            }
            ucp_worker_release_address(perf->ucp.worker, address);
            goto err;
        }

        vec[2].iov_base = rkey_buffer;
        vec[2].iov_len  = info.rkey_size;
        rte_call(perf, post_vec, vec, 3, &req);
        ucp_rkey_buffer_release(rkey_buffer);
    } else {
        info.rkey_size  = 0;
        rte_call(perf, post_vec, vec, 2, &req);
    }

    ucp_worker_release_address(perf->ucp.worker, address);
    rte_call(perf, exchange_vec, req);

    perf->ucp.peers = calloc(group_size, sizeof(*perf->uct.peers));
    if (perf->ucp.peers == NULL) {
        goto err;
    }

    buffer = malloc(buffer_size);
    if (buffer == NULL) {
        ucs_error("Failed to allocate RTE receive buffer");
        status = UCS_ERR_NO_MEMORY;
        goto err_destroy_eps;
    }

    for (i = 0; i < group_size; ++i) {
        if (i == group_index) {
            continue;
        }

        rte_call(perf, recv, i, buffer, buffer_size, req);

        remote_info = buffer;
        address     = (void*)(remote_info + 1);
        rkey_buffer = (void*)address + remote_info->ucp.addr_len;
        perf->ucp.peers[i].remote_addr = remote_info->recv_buffer;

        ep_params.field_mask = UCP_EP_PARAM_FIELD_REMOTE_ADDRESS;
        ep_params.address    = address;

        status = ucp_ep_create(perf->ucp.worker, &ep_params, &perf->ucp.peers[i].ep);
        if (status != UCS_OK) {
            if (perf->params.flags & UCX_PERF_TEST_FLAG_VERBOSE) {
                ucs_error("ucp_ep_create() failed: %s", ucs_status_string(status));
            }
            goto err_free_buffer;
        }

        if (remote_info->rkey_size > 0) {
            status = ucp_ep_rkey_unpack(perf->ucp.peers[i].ep, rkey_buffer,
                                        &perf->ucp.peers[i].rkey);
            if (status != UCS_OK) {
                if (perf->params.flags & UCX_PERF_TEST_FLAG_VERBOSE) {
                    ucs_fatal("ucp_rkey_unpack() failed: %s", ucs_status_string(status));
                }
                goto err_free_buffer;
            }
        } else {
            perf->ucp.peers[i].rkey = NULL;
        }
    }

    free(buffer);

    status = ucp_perf_test_exchange_status(perf, UCS_OK);
    if (status != UCS_OK) {
        ucp_perf_test_destroy_eps(perf, group_size);
    }
    return status;

err_free_buffer:
    free(buffer);
err_destroy_eps:
    ucp_perf_test_destroy_eps(perf, group_size);
err:
    (void)ucp_perf_test_exchange_status(perf, status);
    return status;
}
Esempio n. 9
0
static ucs_status_t uct_perf_test_setup_endpoints(ucx_perf_context_t *perf)
{
    const size_t buffer_size = 2048;
    ucx_perf_ep_info_t info, *remote_info;
    unsigned group_size, i, group_index;
    uct_device_addr_t *dev_addr;
    uct_iface_addr_t *iface_addr;
    uct_ep_addr_t *ep_addr;
    uct_iface_attr_t iface_attr;
    uct_md_attr_t md_attr;
    void *rkey_buffer;
    ucs_status_t status;
    struct iovec vec[5];
    void *buffer;
    void *req;

    buffer = malloc(buffer_size);
    if (buffer == NULL) {
        ucs_error("Failed to allocate RTE buffer");
        status = UCS_ERR_NO_MEMORY;
        goto err;
    }

    status = uct_iface_query(perf->uct.iface, &iface_attr);
    if (status != UCS_OK) {
        ucs_error("Failed to uct_iface_query: %s", ucs_status_string(status));
        goto err_free;
    }

    status = uct_md_query(perf->uct.md, &md_attr);
    if (status != UCS_OK) {
        ucs_error("Failed to uct_md_query: %s", ucs_status_string(status));
        goto err_free;
    }

    if (md_attr.cap.flags & (UCT_MD_FLAG_ALLOC|UCT_MD_FLAG_REG)) {
        info.rkey_size      = md_attr.rkey_packed_size;
    } else {
        info.rkey_size      = 0;
    }
    info.uct.dev_addr_len   = iface_attr.device_addr_len;
    info.uct.iface_addr_len = iface_attr.iface_addr_len;
    info.uct.ep_addr_len    = iface_attr.ep_addr_len;
    info.recv_buffer        = (uintptr_t)perf->recv_buffer;

    rkey_buffer             = buffer;
    dev_addr                = (void*)rkey_buffer + info.rkey_size;
    iface_addr              = (void*)dev_addr    + info.uct.dev_addr_len;
    ep_addr                 = (void*)iface_addr  + info.uct.iface_addr_len;
    ucs_assert_always((void*)ep_addr + info.uct.ep_addr_len <= buffer + buffer_size);

    status = uct_iface_get_device_address(perf->uct.iface, dev_addr);
    if (status != UCS_OK) {
        ucs_error("Failed to uct_iface_get_device_address: %s",
                  ucs_status_string(status));
        goto err_free;
    }

    status = uct_iface_get_address(perf->uct.iface, iface_addr);
    if (status != UCS_OK) {
        ucs_error("Failed to uct_iface_get_address: %s", ucs_status_string(status));
        goto err_free;
    }

    if (info.rkey_size > 0) {
        status = uct_md_mkey_pack(perf->uct.md, perf->uct.recv_mem.memh, rkey_buffer);
        if (status != UCS_OK) {
            ucs_error("Failed to uct_rkey_pack: %s", ucs_status_string(status));
            goto err_free;
        }
    }

    group_size  = rte_call(perf, group_size);
    group_index = rte_call(perf, group_index);

    perf->uct.peers = calloc(group_size, sizeof(*perf->uct.peers));
    if (perf->uct.peers == NULL) {
        goto err_free;
    }

    if (iface_attr.cap.flags & UCT_IFACE_FLAG_CONNECT_TO_EP) {
        for (i = 0; i < group_size; ++i) {
            if (i == group_index) {
                continue;
            }

            status = uct_ep_create(perf->uct.iface, &perf->uct.peers[i].ep);
            if (status != UCS_OK) {
                ucs_error("Failed to uct_ep_create: %s", ucs_status_string(status));
                goto err_destroy_eps;
            }
            status = uct_ep_get_address(perf->uct.peers[i].ep, ep_addr);
            if (status != UCS_OK) {
                ucs_error("Failed to uct_ep_get_address: %s", ucs_status_string(status));
                goto err_destroy_eps;
            }
        }
    }

    vec[0].iov_base         = &info;
    vec[0].iov_len          = sizeof(info);
    vec[1].iov_base         = buffer;
    vec[1].iov_len          = info.rkey_size + info.uct.dev_addr_len +
                              info.uct.iface_addr_len + info.uct.ep_addr_len;

    rte_call(perf, post_vec, vec, 2, &req);
    rte_call(perf, exchange_vec, req);

    for (i = 0; i < group_size; ++i) {
        if (i == group_index) {
            continue;
        }

        rte_call(perf, recv, i, buffer, buffer_size, req);

        remote_info = buffer;
        rkey_buffer = remote_info + 1;
        dev_addr    = (void*)rkey_buffer + remote_info->rkey_size;
        iface_addr  = (void*)dev_addr    + remote_info->uct.dev_addr_len;
        ep_addr     = (void*)iface_addr  + remote_info->uct.iface_addr_len;
        perf->uct.peers[i].remote_addr = remote_info->recv_buffer;

        if (!uct_iface_is_reachable(perf->uct.iface, dev_addr,
                                    remote_info->uct.iface_addr_len ?
                                    iface_addr : NULL)) {
            ucs_error("Destination is unreachable");
            status = UCS_ERR_UNREACHABLE;
            goto err_destroy_eps;
        }

        if (remote_info->rkey_size > 0) {
            status = uct_rkey_unpack(rkey_buffer, &perf->uct.peers[i].rkey);
            if (status != UCS_OK) {
                ucs_error("Failed to uct_rkey_unpack: %s", ucs_status_string(status));
                goto err_destroy_eps;
            }
        } else {
            perf->uct.peers[i].rkey.handle = NULL;
            perf->uct.peers[i].rkey.type   = NULL;
            perf->uct.peers[i].rkey.rkey   = UCT_INVALID_RKEY;
        }

        if (iface_attr.cap.flags & UCT_IFACE_FLAG_CONNECT_TO_EP) {
            status = uct_ep_connect_to_ep(perf->uct.peers[i].ep, dev_addr, ep_addr);
        } else if (iface_attr.cap.flags & UCT_IFACE_FLAG_CONNECT_TO_IFACE) {
            status = uct_ep_create_connected(perf->uct.iface, dev_addr, iface_addr,
                                             &perf->uct.peers[i].ep);
        } else {
            status = UCS_ERR_UNSUPPORTED;
        }
        if (status != UCS_OK) {
            ucs_error("Failed to connect endpoint: %s", ucs_status_string(status));
            goto err_destroy_eps;
        }
    }
    uct_perf_iface_flush_b(perf);

    free(buffer);
    rte_call(perf, barrier);
    return UCS_OK;

err_destroy_eps:
    for (i = 0; i < group_size; ++i) {
        if (perf->uct.peers[i].rkey.type != NULL) {
            uct_rkey_release(&perf->uct.peers[i].rkey);
        }
        if (perf->uct.peers[i].ep != NULL) {
            uct_ep_destroy(perf->uct.peers[i].ep);
        }
    }
    free(perf->uct.peers);
err_free:
    free(buffer);
err:
    return status;
}
Esempio n. 10
0
static ucs_status_t ucp_perf_test_setup_endpoints(ucx_perf_context_t *perf,
                                                  uint64_t features)
{
    unsigned group_size, i, group_index;
    ucp_address_t *address;
    size_t address_length = 0;
    ucs_status_t status;
    struct iovec vec[3];
    void *rkey_buffer;
    size_t rkey_size;
    void *req = NULL;
    int iov_len;

    group_size  = rte_call(perf, group_size);
    group_index = rte_call(perf, group_index);

    status = ucp_worker_get_address(perf->ucp.worker, &address, &address_length);
    if (status != UCS_OK) {
        if (perf->params.flags & UCX_PERF_TEST_FLAG_VERBOSE) {
            ucs_error("ucp_worker_get_address() failed: %s", ucs_status_string(status));
        }
        goto err;
    }

    vec[0].iov_base = address;
    vec[0].iov_len  = address_length;
    vec[1].iov_base = &perf->recv_buffer;
    vec[1].iov_len  = sizeof(uintptr_t);

    if (features & (UCP_FEATURE_RMA|UCP_FEATURE_AMO32|UCP_FEATURE_AMO64)) {
        status = ucp_rkey_pack(perf->ucp.context, perf->ucp.recv_memh,
                               &rkey_buffer, &rkey_size);
        if (status != UCS_OK) {
            if (perf->params.flags & UCX_PERF_TEST_FLAG_VERBOSE) {
                ucs_error("ucp_rkey_pack() failed: %s", ucs_status_string(status));
            }
            ucp_worker_release_address(perf->ucp.worker, address);
            goto err;
        }

        vec[2].iov_base = rkey_buffer;
        vec[2].iov_len  = rkey_size;
        iov_len = 3;
    } else {
        rkey_buffer     = NULL;
        iov_len = 2;
    }

    rte_call(perf, post_vec, vec, iov_len, &req);

    if (rkey_buffer != NULL) {
        ucp_rkey_buffer_release(rkey_buffer);
    }
    ucp_worker_release_address(perf->ucp.worker, address);

    rte_call(perf, exchange_vec, req);

    perf->ucp.peers = calloc(group_size, sizeof(*perf->uct.peers));
    if (perf->ucp.peers == NULL) {
        goto err;
    }

    for (i = 0; i < group_size; ++i) {
        if (i == group_index) {
            continue;
        }

        address         = malloc(address_length);
        rkey_buffer     = NULL;

        vec[0].iov_base = address;
        vec[0].iov_len  = address_length;
        vec[1].iov_base = &perf->ucp.peers[i].remote_addr;
        vec[1].iov_len  = sizeof(uintptr_t);

        if (iov_len > 2) {
            rkey_buffer     = malloc(rkey_size);
            vec[2].iov_base = rkey_buffer;
            vec[2].iov_len  = rkey_size;
        }

        rte_call(perf, recv_vec, i, vec, iov_len, req);

        status = ucp_ep_create(perf->ucp.worker, address, &perf->ucp.peers[i].ep);
        if (status != UCS_OK) {
            if (perf->params.flags & UCX_PERF_TEST_FLAG_VERBOSE) {
                ucs_error("ucp_ep_create() failed: %s", ucs_status_string(status));
            }
            free(rkey_buffer);
            free(address);
            goto err_destroy_eps;
        }

        free(address);

        if (iov_len > 2) {
            status = ucp_ep_rkey_unpack(perf->ucp.peers[i].ep, rkey_buffer,
                                        &perf->ucp.peers[i].rkey);
            if (status != UCS_OK) {
                if (perf->params.flags & UCX_PERF_TEST_FLAG_VERBOSE) {
                    ucs_error("ucp_rkey_unpack() failed: %s", ucs_status_string(status));
                }
                free(rkey_buffer);
                goto err_destroy_eps;
            }
        } else {
            perf->ucp.peers[i].rkey = NULL;
        }

        free(rkey_buffer);
    }

    status = ucp_perf_test_exchange_status(perf, UCS_OK);
    if (status != UCS_OK) {
        ucp_perf_test_destroy_eps(perf, group_size);
    }
    return status;

err_destroy_eps:
    ucp_perf_test_destroy_eps(perf, group_size);
err:
    (void)ucp_perf_test_exchange_status(perf, status);
    return status;
}
Esempio n. 11
0
static ucs_status_t uct_perf_test_setup_endpoints(ucx_perf_context_t *perf)
{
    unsigned group_size, i, group_index;
    uct_device_addr_t *dev_addr;
    uct_iface_addr_t *iface_addr;
    uct_ep_addr_t *ep_addr;
    uct_iface_attr_t iface_attr;
    uct_pd_attr_t pd_attr;
    unsigned long va;
    void *rkey_buffer;
    ucs_status_t status;
    struct iovec vec[5];
    void *req;

    status = uct_iface_query(perf->uct.iface, &iface_attr);
    if (status != UCS_OK) {
        ucs_error("Failed to uct_iface_query: %s", ucs_status_string(status));
        goto err;
    }

    status = uct_pd_query(perf->uct.pd, &pd_attr);
    if (status != UCS_OK) {
        ucs_error("Failed to uct_pd_query: %s", ucs_status_string(status));
        goto err;
    }

    dev_addr    = calloc(1, iface_attr.device_addr_len);
    iface_addr  = calloc(1, iface_attr.iface_addr_len);
    ep_addr     = calloc(1, iface_attr.ep_addr_len);
    rkey_buffer = calloc(1, pd_attr.rkey_packed_size);
    if ((iface_addr == NULL) || (ep_addr == NULL) || (rkey_buffer == NULL)) {
        goto err_free;
    }

    status = uct_iface_get_device_address(perf->uct.iface, dev_addr);
    if (status != UCS_OK) {
        ucs_error("Failed to uct_iface_get_device_address: %s",
                  ucs_status_string(status));
        goto err_free;
    }

    if (iface_attr.cap.flags & UCT_IFACE_FLAG_CONNECT_TO_IFACE) {
        status = uct_iface_get_address(perf->uct.iface, iface_addr);
        if (status != UCS_OK) {
            ucs_error("Failed to uct_iface_get_address: %s", ucs_status_string(status));
            goto err_free;
        }
    }

    status = uct_pd_mkey_pack(perf->uct.pd, perf->uct.recv_mem.memh, rkey_buffer);
    if (status != UCS_OK) {
        ucs_error("Failed to uct_rkey_pack: %s", ucs_status_string(status));
        goto err_free;
    }

    group_size  = rte_call(perf, group_size);
    group_index = rte_call(perf, group_index);

    perf->uct.peers = calloc(group_size, sizeof(*perf->uct.peers));
    if (perf->uct.peers == NULL) {
        goto err_free;
    }

    if (iface_attr.cap.flags & UCT_IFACE_FLAG_CONNECT_TO_EP) {
        for (i = 0; i < group_size; ++i) {
            if (i == group_index) {
                continue;
            }

            status = uct_ep_create(perf->uct.iface, &perf->uct.peers[i].ep);
            if (status != UCS_OK) {
                ucs_error("Failed to uct_ep_create: %s", ucs_status_string(status));
                goto err_destroy_eps;
            }
            status = uct_ep_get_address(perf->uct.peers[i].ep, ep_addr);
            if (status != UCS_OK) {
                ucs_error("Failed to uct_ep_get_address: %s", ucs_status_string(status));
                goto err_destroy_eps;
            }
        }
    }

    va                  = (uintptr_t)perf->recv_buffer;
    vec[0].iov_base     = &va;
    vec[0].iov_len      = sizeof(va);
    vec[1].iov_base     = rkey_buffer;
    vec[1].iov_len      = pd_attr.rkey_packed_size;
    vec[2].iov_base     = dev_addr;
    vec[2].iov_len      = iface_attr.device_addr_len;
    vec[3].iov_base     = iface_addr;
    vec[3].iov_len      = iface_attr.iface_addr_len;
    vec[4].iov_base     = ep_addr;
    vec[4].iov_len      = iface_attr.ep_addr_len;

    rte_call(perf, post_vec, vec, 5, &req);
    rte_call(perf, exchange_vec, req);

    for (i = 0; i < group_size; ++i) {
        if (i == group_index) {
            continue;
        }
        vec[0].iov_base     = &va;
        vec[0].iov_len      = sizeof(va);
        vec[1].iov_base     = rkey_buffer;
        vec[1].iov_len      = pd_attr.rkey_packed_size;
        vec[2].iov_base     = dev_addr;
        vec[2].iov_len      = iface_attr.device_addr_len;
        vec[3].iov_base     = iface_addr;
        vec[3].iov_len      = iface_attr.iface_addr_len;
        vec[4].iov_base     = ep_addr;
        vec[4].iov_len      = iface_attr.ep_addr_len;

        rte_call(perf, recv_vec, i, vec, 5, req);

        perf->uct.peers[i].remote_addr = va;
        status = uct_rkey_unpack(rkey_buffer, &perf->uct.peers[i].rkey);
        if (status != UCS_OK) {
            ucs_error("Failed to uct_rkey_unpack: %s", ucs_status_string(status));
            return status;
        }

        if (iface_attr.cap.flags & UCT_IFACE_FLAG_CONNECT_TO_EP) {
            status = uct_ep_connect_to_ep(perf->uct.peers[i].ep, dev_addr, ep_addr);
        } else if (iface_attr.cap.flags & UCT_IFACE_FLAG_CONNECT_TO_IFACE) {
            status = uct_ep_create_connected(perf->uct.iface, dev_addr, iface_addr,
                                             &perf->uct.peers[i].ep);
        } else {
            status = UCS_ERR_UNSUPPORTED;
        }
        if (status != UCS_OK) {
            ucs_error("Failed to connect endpoint: %s", ucs_status_string(status));
            goto err_destroy_eps;
        }
    }
    uct_perf_iface_flush_b(perf);

    rte_call(perf, barrier);

    free(ep_addr);
    free(iface_addr);
    free(dev_addr);
    free(rkey_buffer);

    return UCS_OK;

err_destroy_eps:
    for (i = 0; i < group_size; ++i) {
        if (perf->uct.peers[i].rkey.type != NULL) {
            uct_rkey_release(&perf->uct.peers[i].rkey);
        }
        if (perf->uct.peers[i].ep != NULL) {
            uct_ep_destroy(perf->uct.peers[i].ep);
        }
    }
    free(perf->uct.peers);
err_free:
    free(ep_addr);
    free(iface_addr);
    free(dev_addr);
    free(rkey_buffer);
err:
    return status;
}