static void ucp_perf_test_cleanup_endpoints(ucx_perf_context_t *perf)
{
    unsigned group_size;

    rte_call(perf, barrier);

    group_size = rte_call(perf, group_size);

    ucp_perf_test_destroy_eps(perf, group_size);
}
ucs_status_t ucx_perf_run(ucx_perf_params_t *params, ucx_perf_result_t *result)
{
    ucx_perf_context_t perf;
    ucs_status_t status;

    if (params->command == UCX_PERF_CMD_LAST) {
        ucs_error("Test is not selected");
        status = UCS_ERR_INVALID_PARAM;
        goto out;
    }

    if ((params->api != UCX_PERF_API_UCT) && (params->api != UCX_PERF_API_UCP)) {
        ucs_error("Invalid test API parameter (should be UCT or UCP)");
        status = UCS_ERR_INVALID_PARAM;
        goto out;
    }

    if (UCS_THREAD_MODE_SINGLE != params->thread_mode) {
        return ucx_perf_thread_spawn(params, result);
    }

    ucx_perf_test_reset(&perf, params);

    status = ucx_perf_funcs[params->api].setup(&perf, params);
    if (status != UCS_OK) {
        goto out;
    }

    if (params->warmup_iter > 0) {
        ucx_perf_set_warmup(&perf, params);
        status = ucx_perf_funcs[params->api].run(&perf);
        if (status != UCS_OK) {
            goto out_cleanup;
        }
        rte_call(&perf, barrier);
        ucx_perf_test_reset(&perf, params);
    }

    /* Run test */
    status = ucx_perf_funcs[params->api].run(&perf);
    rte_call(&perf, barrier);
    if (status == UCS_OK) {
        ucx_perf_calc_result(&perf, result);
        rte_call(&perf, report, result, perf.params.report_arg, 1);
    }

out_cleanup:
    ucx_perf_funcs[params->api].cleanup(&perf);
out:
    return status;
}
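/*
 * Usage sketch (illustrative, not part of the original file): a caller fills
 * ucx_perf_params_t and invokes ucx_perf_run(). Only fields referenced above
 * (command, api, thread_mode, warmup_iter) are shown; anything else a real
 * test needs (message size, iteration count, the RTE callbacks behind
 * rte_call) is assumed to be configured elsewhere.
 */
static ucs_status_t ucx_perf_run_example(ucx_perf_params_t *params)
{
    ucx_perf_result_t result;

    params->api         = UCX_PERF_API_UCP;       /* or UCX_PERF_API_UCT */
    params->thread_mode = UCS_THREAD_MODE_SINGLE; /* other modes spawn threads */
    params->warmup_iter = 100;                    /* enables the warmup pass */
    /* params->command must be a concrete test command, not UCX_PERF_CMD_LAST */

    return ucx_perf_run(params, &result);
}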
static void ucp_perf_cleanup(ucx_perf_context_t *perf)
{
    ucp_perf_test_cleanup_endpoints(perf);
    rte_call(perf, barrier);
    ucp_perf_test_free_mem(perf);
    ucp_worker_destroy(perf->ucp.worker);
    ucp_cleanup(perf->ucp.context);
}
static void* ucx_perf_thread_run_test(void* arg)
{
    ucx_perf_thread_context_t* tctx = (ucx_perf_thread_context_t*)arg;
    ucx_perf_params_t* params       = &tctx->params;
    ucx_perf_result_t* result       = &tctx->result;
    ucx_perf_context_t* perf        = &tctx->perf;
    ucs_status_t* statuses          = tctx->statuses;
    pthread_barrier_t* tbarrier     = tctx->tbarrier;
    int tid                         = tctx->tid;
    int i;

    if (params->warmup_iter > 0) {
        ucx_perf_set_warmup(perf, params);
        statuses[tid] = ucx_perf_funcs[params->api].run(perf);
        pthread_barrier_wait(tbarrier);
        for (i = 0; i < tctx->ntid; i++) {
            if (UCS_OK != statuses[i]) {
                goto out;
            }
        }
        if (0 == tid) {
            rte_call(perf, barrier);
            ucx_perf_test_reset(perf, params);
        }
    }

    /* Run test */
    pthread_barrier_wait(tbarrier);
    statuses[tid] = ucx_perf_funcs[params->api].run(perf);
    pthread_barrier_wait(tbarrier);
    for (i = 0; i < tctx->ntid; i++) {
        if (UCS_OK != statuses[i]) {
            goto out;
        }
    }

    if (0 == tid) {
        rte_call(perf, barrier);
        /* Assuming all threads are treated fairly, report only for tid == 0.
         * TODO: aggregate reports from all threads. */
        ucx_perf_calc_result(perf, result);
        rte_call(perf, report, result, 1);
    }

out:
    return &statuses[tid];
}
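/*
 * Minimal spawning-side sketch (hypothetical; the real ucx_perf_thread_spawn()
 * is not shown in this file). It illustrates the contract that
 * ucx_perf_thread_run_test() relies on: one pthread_barrier_t and one
 * statuses[] array shared by all threads, plus a per-thread
 * ucx_perf_thread_context_t carrying tid/ntid. Error handling and per-thread
 * perf-context initialization are omitted for brevity.
 */
static ucs_status_t example_thread_spawn(ucx_perf_params_t *params,
                                         ucx_perf_result_t *result,
                                         int nthreads)
{
    ucx_perf_thread_context_t *tctx;
    ucs_status_t *statuses;
    pthread_barrier_t tbarrier;
    pthread_t *threads;
    ucs_status_t status = UCS_OK;
    int i;

    tctx     = calloc(nthreads, sizeof(*tctx));
    statuses = calloc(nthreads, sizeof(*statuses));
    threads  = calloc(nthreads, sizeof(*threads));
    pthread_barrier_init(&tbarrier, NULL, nthreads);

    for (i = 0; i < nthreads; ++i) {
        tctx[i].tid      = i;
        tctx[i].ntid     = nthreads;
        tctx[i].statuses = statuses;
        tctx[i].tbarrier = &tbarrier;
        tctx[i].params   = *params;
        /* per-thread perf context setup omitted */
        pthread_create(&threads[i], NULL, ucx_perf_thread_run_test, &tctx[i]);
    }

    for (i = 0; i < nthreads; ++i) {
        pthread_join(threads[i], NULL);
        if (statuses[i] != UCS_OK) {
            status = statuses[i];
        }
    }
    *result = tctx[0].result; /* thread 0 is the one that reported */

    pthread_barrier_destroy(&tbarrier);
    free(threads);
    free(statuses);
    free(tctx);
    return status;
}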
static void uct_perf_test_cleanup_endpoints(ucx_perf_context_t *perf)
{
    unsigned group_size, group_index, i;

    rte_call(perf, barrier);

    uct_iface_set_am_handler(perf->uct.iface, UCT_PERF_TEST_AM_ID, NULL, NULL,
                             UCT_AM_CB_FLAG_SYNC);

    group_size  = rte_call(perf, group_size);
    group_index = rte_call(perf, group_index);

    for (i = 0; i < group_size; ++i) {
        if (i != group_index) {
            uct_rkey_release(&perf->uct.peers[i].rkey);
            if (perf->uct.peers[i].ep) {
                uct_ep_destroy(perf->uct.peers[i].ep);
            }
        }
    }

    free(perf->uct.peers);
}
static ucs_status_t ucp_perf_test_exchange_status(ucx_perf_context_t *perf,
                                                  ucs_status_t status)
{
    unsigned group_size            = rte_call(perf, group_size);
    ucs_status_t collective_status = UCS_OK;
    struct iovec vec;
    void *req = NULL;
    unsigned i;

    vec.iov_base = &status;
    vec.iov_len  = sizeof(status);

    rte_call(perf, post_vec, &vec, 1, &req);
    rte_call(perf, exchange_vec, req);
    for (i = 0; i < group_size; ++i) {
        rte_call(perf, recv, i, &status, sizeof(status), req);
        if (status != UCS_OK) {
            collective_status = status;
        }
    }
    return collective_status;
}
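/*
 * Usage sketch (illustrative, not part of the original file): the exchange
 * lets every rank observe a failure on any other rank, so all processes
 * unwind together instead of hanging in a later barrier. The "local outcome"
 * below is just an example of a per-rank step that may fail.
 */
static ucs_status_t example_collective_setup(ucx_perf_context_t *perf)
{
    ucs_status_t status;

    /* example local outcome: pretend the peer-table allocation failed */
    status = (perf->ucp.peers == NULL) ? UCS_ERR_NO_MEMORY : UCS_OK;

    /* every rank posts its local status and receives everyone else's;
     * the result is non-OK if any single rank reported a failure */
    status = ucp_perf_test_exchange_status(perf, status);
    if (status != UCS_OK) {
        /* all ranks take this branch together */
    }
    return status;
}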
static ucs_status_t ucx_perf_cuda_init(ucx_perf_context_t *perf)
{
    cudaError_t cerr;
    unsigned group_index;
    int num_gpus;
    int gpu_index;

    group_index = rte_call(perf, group_index);

    cerr = cudaGetDeviceCount(&num_gpus);
    if (cerr != cudaSuccess) {
        return UCS_ERR_NO_DEVICE;
    }

    /* distribute group members across the available GPUs round-robin */
    gpu_index = group_index % num_gpus;

    cerr = cudaSetDevice(gpu_index);
    if (cerr != cudaSuccess) {
        return UCS_ERR_NO_DEVICE;
    }

    return UCS_OK;
}
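/*
 * Worked example (illustrative): with num_gpus == 4, group indices 0..5 map
 * to CUDA devices 0, 1, 2, 3, 0, 1 via group_index % num_gpus. Ranks beyond
 * the GPU count wrap around and share a device, which is usually the intent
 * when running more processes than GPUs on a node.
 */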
static ucs_status_t ucp_perf_test_setup_endpoints(ucx_perf_context_t *perf,
                                                  uint64_t features)
{
    const size_t buffer_size = 2048;
    ucx_perf_ep_info_t info, *remote_info;
    unsigned group_size, i, group_index;
    ucp_address_t *address;
    size_t address_length = 0;
    ucp_ep_params_t ep_params;
    ucs_status_t status;
    struct iovec vec[3];
    void *rkey_buffer;
    void *req = NULL;
    void *buffer;

    group_size  = rte_call(perf, group_size);
    group_index = rte_call(perf, group_index);

    status = ucp_worker_get_address(perf->ucp.worker, &address, &address_length);
    if (status != UCS_OK) {
        if (perf->params.flags & UCX_PERF_TEST_FLAG_VERBOSE) {
            ucs_error("ucp_worker_get_address() failed: %s",
                      ucs_status_string(status));
        }
        goto err;
    }

    info.ucp.addr_len = address_length;
    info.recv_buffer  = (uintptr_t)perf->recv_buffer;

    vec[0].iov_base = &info;
    vec[0].iov_len  = sizeof(info);
    vec[1].iov_base = address;
    vec[1].iov_len  = address_length;

    if (features & (UCP_FEATURE_RMA|UCP_FEATURE_AMO32|UCP_FEATURE_AMO64)) {
        status = ucp_rkey_pack(perf->ucp.context, perf->ucp.recv_memh,
                               &rkey_buffer, &info.rkey_size);
        if (status != UCS_OK) {
            if (perf->params.flags & UCX_PERF_TEST_FLAG_VERBOSE) {
                ucs_error("ucp_rkey_pack() failed: %s",
                          ucs_status_string(status));
            }
            ucp_worker_release_address(perf->ucp.worker, address);
            goto err;
        }

        vec[2].iov_base = rkey_buffer;
        vec[2].iov_len  = info.rkey_size;
        rte_call(perf, post_vec, vec, 3, &req);
        ucp_rkey_buffer_release(rkey_buffer);
    } else {
        info.rkey_size = 0;
        rte_call(perf, post_vec, vec, 2, &req);
    }

    ucp_worker_release_address(perf->ucp.worker, address);
    rte_call(perf, exchange_vec, req);

    perf->ucp.peers = calloc(group_size, sizeof(*perf->ucp.peers));
    if (perf->ucp.peers == NULL) {
        status = UCS_ERR_NO_MEMORY;
        goto err;
    }

    buffer = malloc(buffer_size);
    if (buffer == NULL) {
        ucs_error("Failed to allocate RTE receive buffer");
        status = UCS_ERR_NO_MEMORY;
        goto err_destroy_eps;
    }

    for (i = 0; i < group_size; ++i) {
        if (i == group_index) {
            continue;
        }

        rte_call(perf, recv, i, buffer, buffer_size, req);

        remote_info = buffer;
        address     = (void*)(remote_info + 1);
        rkey_buffer = (void*)address + remote_info->ucp.addr_len;
        perf->ucp.peers[i].remote_addr = remote_info->recv_buffer;

        ep_params.field_mask = UCP_EP_PARAM_FIELD_REMOTE_ADDRESS;
        ep_params.address    = address;

        status = ucp_ep_create(perf->ucp.worker, &ep_params,
                               &perf->ucp.peers[i].ep);
        if (status != UCS_OK) {
            if (perf->params.flags & UCX_PERF_TEST_FLAG_VERBOSE) {
                ucs_error("ucp_ep_create() failed: %s",
                          ucs_status_string(status));
            }
            goto err_free_buffer;
        }

        if (remote_info->rkey_size > 0) {
            status = ucp_ep_rkey_unpack(perf->ucp.peers[i].ep, rkey_buffer,
                                        &perf->ucp.peers[i].rkey);
            if (status != UCS_OK) {
                if (perf->params.flags & UCX_PERF_TEST_FLAG_VERBOSE) {
                    ucs_error("ucp_rkey_unpack() failed: %s",
                              ucs_status_string(status));
                }
                goto err_free_buffer;
            }
        } else {
            perf->ucp.peers[i].rkey = NULL;
        }
    }

    free(buffer);

    status = ucp_perf_test_exchange_status(perf, UCS_OK);
    if (status != UCS_OK) {
        ucp_perf_test_destroy_eps(perf, group_size);
    }
    return status;

err_free_buffer:
    free(buffer);
err_destroy_eps:
    ucp_perf_test_destroy_eps(perf, group_size);
err:
    (void)ucp_perf_test_exchange_status(perf, status);
    return status;
}
static ucs_status_t uct_perf_test_setup_endpoints(ucx_perf_context_t *perf)
{
    const size_t buffer_size = 2048;
    ucx_perf_ep_info_t info, *remote_info;
    unsigned group_size, i, group_index;
    uct_device_addr_t *dev_addr;
    uct_iface_addr_t *iface_addr;
    uct_ep_addr_t *ep_addr;
    uct_iface_attr_t iface_attr;
    uct_md_attr_t md_attr;
    void *rkey_buffer;
    ucs_status_t status;
    struct iovec vec[5];
    void *buffer;
    void *req;

    buffer = malloc(buffer_size);
    if (buffer == NULL) {
        ucs_error("Failed to allocate RTE buffer");
        status = UCS_ERR_NO_MEMORY;
        goto err;
    }

    status = uct_iface_query(perf->uct.iface, &iface_attr);
    if (status != UCS_OK) {
        ucs_error("Failed to uct_iface_query: %s", ucs_status_string(status));
        goto err_free;
    }

    status = uct_md_query(perf->uct.md, &md_attr);
    if (status != UCS_OK) {
        ucs_error("Failed to uct_md_query: %s", ucs_status_string(status));
        goto err_free;
    }

    if (md_attr.cap.flags & (UCT_MD_FLAG_ALLOC|UCT_MD_FLAG_REG)) {
        info.rkey_size = md_attr.rkey_packed_size;
    } else {
        info.rkey_size = 0;
    }
    info.uct.dev_addr_len   = iface_attr.device_addr_len;
    info.uct.iface_addr_len = iface_attr.iface_addr_len;
    info.uct.ep_addr_len    = iface_attr.ep_addr_len;
    info.recv_buffer        = (uintptr_t)perf->recv_buffer;

    rkey_buffer = buffer;
    dev_addr    = (void*)rkey_buffer + info.rkey_size;
    iface_addr  = (void*)dev_addr    + info.uct.dev_addr_len;
    ep_addr     = (void*)iface_addr  + info.uct.iface_addr_len;
    ucs_assert_always((void*)ep_addr + info.uct.ep_addr_len <=
                      buffer + buffer_size);

    status = uct_iface_get_device_address(perf->uct.iface, dev_addr);
    if (status != UCS_OK) {
        ucs_error("Failed to uct_iface_get_device_address: %s",
                  ucs_status_string(status));
        goto err_free;
    }

    status = uct_iface_get_address(perf->uct.iface, iface_addr);
    if (status != UCS_OK) {
        ucs_error("Failed to uct_iface_get_address: %s",
                  ucs_status_string(status));
        goto err_free;
    }

    if (info.rkey_size > 0) {
        status = uct_md_mkey_pack(perf->uct.md, perf->uct.recv_mem.memh,
                                  rkey_buffer);
        if (status != UCS_OK) {
            ucs_error("Failed to uct_md_mkey_pack: %s",
                      ucs_status_string(status));
            goto err_free;
        }
    }

    group_size  = rte_call(perf, group_size);
    group_index = rte_call(perf, group_index);

    perf->uct.peers = calloc(group_size, sizeof(*perf->uct.peers));
    if (perf->uct.peers == NULL) {
        status = UCS_ERR_NO_MEMORY;
        goto err_free;
    }

    if (iface_attr.cap.flags & UCT_IFACE_FLAG_CONNECT_TO_EP) {
        for (i = 0; i < group_size; ++i) {
            if (i == group_index) {
                continue;
            }

            status = uct_ep_create(perf->uct.iface, &perf->uct.peers[i].ep);
            if (status != UCS_OK) {
                ucs_error("Failed to uct_ep_create: %s",
                          ucs_status_string(status));
                goto err_destroy_eps;
            }

            status = uct_ep_get_address(perf->uct.peers[i].ep, ep_addr);
            if (status != UCS_OK) {
                ucs_error("Failed to uct_ep_get_address: %s",
                          ucs_status_string(status));
                goto err_destroy_eps;
            }
        }
    }

    vec[0].iov_base = &info;
    vec[0].iov_len  = sizeof(info);
    vec[1].iov_base = buffer;
    vec[1].iov_len  = info.rkey_size + info.uct.dev_addr_len +
                      info.uct.iface_addr_len + info.uct.ep_addr_len;

    rte_call(perf, post_vec, vec, 2, &req);
    rte_call(perf, exchange_vec, req);

    for (i = 0; i < group_size; ++i) {
        if (i == group_index) {
            continue;
        }

        rte_call(perf, recv, i, buffer, buffer_size, req);

        remote_info = buffer;
        rkey_buffer = remote_info + 1;
        dev_addr    = (void*)rkey_buffer + remote_info->rkey_size;
        iface_addr  = (void*)dev_addr    + remote_info->uct.dev_addr_len;
        ep_addr     = (void*)iface_addr  + remote_info->uct.iface_addr_len;

        perf->uct.peers[i].remote_addr = remote_info->recv_buffer;

        if (!uct_iface_is_reachable(perf->uct.iface, dev_addr,
                                    remote_info->uct.iface_addr_len ?
                                    iface_addr : NULL)) {
            ucs_error("Destination is unreachable");
            status = UCS_ERR_UNREACHABLE;
            goto err_destroy_eps;
        }

        if (remote_info->rkey_size > 0) {
            status = uct_rkey_unpack(rkey_buffer, &perf->uct.peers[i].rkey);
            if (status != UCS_OK) {
                ucs_error("Failed to uct_rkey_unpack: %s",
                          ucs_status_string(status));
                goto err_destroy_eps;
            }
        } else {
            perf->uct.peers[i].rkey.handle = NULL;
            perf->uct.peers[i].rkey.type   = NULL;
            perf->uct.peers[i].rkey.rkey   = UCT_INVALID_RKEY;
        }

        if (iface_attr.cap.flags & UCT_IFACE_FLAG_CONNECT_TO_EP) {
            status = uct_ep_connect_to_ep(perf->uct.peers[i].ep, dev_addr,
                                          ep_addr);
        } else if (iface_attr.cap.flags & UCT_IFACE_FLAG_CONNECT_TO_IFACE) {
            status = uct_ep_create_connected(perf->uct.iface, dev_addr,
                                             iface_addr,
                                             &perf->uct.peers[i].ep);
        } else {
            status = UCS_ERR_UNSUPPORTED;
        }
        if (status != UCS_OK) {
            ucs_error("Failed to connect endpoint: %s",
                      ucs_status_string(status));
            goto err_destroy_eps;
        }
    }

    uct_perf_iface_flush_b(perf);
    free(buffer);
    rte_call(perf, barrier);
    return UCS_OK;

err_destroy_eps:
    for (i = 0; i < group_size; ++i) {
        if (perf->uct.peers[i].rkey.type != NULL) {
            uct_rkey_release(&perf->uct.peers[i].rkey);
        }
        if (perf->uct.peers[i].ep != NULL) {
            uct_ep_destroy(perf->uct.peers[i].ep);
        }
    }
    free(perf->uct.peers);
err_free:
    free(buffer);
err:
    return status;
}
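/*
 * Wire layout of the per-peer blob exchanged above (for orientation; the
 * exact field widths come from iface_attr/md_attr at runtime):
 *
 *   vec[0]: ucx_perf_ep_info_t (lengths + remote recv_buffer address)
 *   vec[1]: | rkey (rkey_size) | dev_addr | iface_addr | ep_addr |
 *
 * The receiver walks the same offsets using the lengths carried in info, so
 * both sides must pack and parse in exactly this order.
 */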
static ucs_status_t ucp_perf_test_setup_endpoints(ucx_perf_context_t *perf,
                                                  uint64_t features)
{
    unsigned group_size, i, group_index;
    ucp_address_t *address;
    size_t address_length = 0;
    ucs_status_t status;
    struct iovec vec[3];
    void *rkey_buffer;
    size_t rkey_size;
    void *req = NULL;
    int iov_len;

    group_size  = rte_call(perf, group_size);
    group_index = rte_call(perf, group_index);

    status = ucp_worker_get_address(perf->ucp.worker, &address, &address_length);
    if (status != UCS_OK) {
        if (perf->params.flags & UCX_PERF_TEST_FLAG_VERBOSE) {
            ucs_error("ucp_worker_get_address() failed: %s",
                      ucs_status_string(status));
        }
        goto err;
    }

    vec[0].iov_base = address;
    vec[0].iov_len  = address_length;
    vec[1].iov_base = &perf->recv_buffer;
    vec[1].iov_len  = sizeof(uintptr_t);

    if (features & (UCP_FEATURE_RMA|UCP_FEATURE_AMO32|UCP_FEATURE_AMO64)) {
        status = ucp_rkey_pack(perf->ucp.context, perf->ucp.recv_memh,
                               &rkey_buffer, &rkey_size);
        if (status != UCS_OK) {
            if (perf->params.flags & UCX_PERF_TEST_FLAG_VERBOSE) {
                ucs_error("ucp_rkey_pack() failed: %s",
                          ucs_status_string(status));
            }
            ucp_worker_release_address(perf->ucp.worker, address);
            goto err;
        }

        vec[2].iov_base = rkey_buffer;
        vec[2].iov_len  = rkey_size;
        iov_len         = 3;
    } else {
        rkey_buffer     = NULL;
        iov_len         = 2;
    }

    rte_call(perf, post_vec, vec, iov_len, &req);

    if (rkey_buffer != NULL) {
        ucp_rkey_buffer_release(rkey_buffer);
    }
    ucp_worker_release_address(perf->ucp.worker, address);

    rte_call(perf, exchange_vec, req);

    perf->ucp.peers = calloc(group_size, sizeof(*perf->ucp.peers));
    if (perf->ucp.peers == NULL) {
        status = UCS_ERR_NO_MEMORY;
        goto err;
    }

    for (i = 0; i < group_size; ++i) {
        if (i == group_index) {
            continue;
        }

        address     = malloc(address_length);
        rkey_buffer = NULL;

        vec[0].iov_base = address;
        vec[0].iov_len  = address_length;
        vec[1].iov_base = &perf->ucp.peers[i].remote_addr;
        vec[1].iov_len  = sizeof(uintptr_t);

        if (iov_len > 2) {
            rkey_buffer     = malloc(rkey_size);
            vec[2].iov_base = rkey_buffer;
            vec[2].iov_len  = rkey_size;
        }

        rte_call(perf, recv_vec, i, vec, iov_len, req);

        status = ucp_ep_create(perf->ucp.worker, address,
                               &perf->ucp.peers[i].ep);
        if (status != UCS_OK) {
            if (perf->params.flags & UCX_PERF_TEST_FLAG_VERBOSE) {
                ucs_error("ucp_ep_create() failed: %s",
                          ucs_status_string(status));
            }
            free(rkey_buffer);
            free(address);
            goto err_destroy_eps;
        }
        free(address);

        if (iov_len > 2) {
            status = ucp_ep_rkey_unpack(perf->ucp.peers[i].ep, rkey_buffer,
                                        &perf->ucp.peers[i].rkey);
            if (status != UCS_OK) {
                if (perf->params.flags & UCX_PERF_TEST_FLAG_VERBOSE) {
                    ucs_error("ucp_rkey_unpack() failed: %s",
                              ucs_status_string(status));
                }
                free(rkey_buffer);
                goto err_destroy_eps;
            }
        } else {
            perf->ucp.peers[i].rkey = NULL;
        }
        free(rkey_buffer);
    }

    status = ucp_perf_test_exchange_status(perf, UCS_OK);
    if (status != UCS_OK) {
        ucp_perf_test_destroy_eps(perf, group_size);
    }
    return status;

err_destroy_eps:
    ucp_perf_test_destroy_eps(perf, group_size);
err:
    (void)ucp_perf_test_exchange_status(perf, status);
    return status;
}
static ucs_status_t uct_perf_test_setup_endpoints(ucx_perf_context_t *perf)
{
    unsigned group_size, i, group_index;
    uct_device_addr_t *dev_addr;
    uct_iface_addr_t *iface_addr;
    uct_ep_addr_t *ep_addr;
    uct_iface_attr_t iface_attr;
    uct_pd_attr_t pd_attr;
    unsigned long va;
    void *rkey_buffer;
    ucs_status_t status;
    struct iovec vec[5];
    void *req;

    status = uct_iface_query(perf->uct.iface, &iface_attr);
    if (status != UCS_OK) {
        ucs_error("Failed to uct_iface_query: %s", ucs_status_string(status));
        goto err;
    }

    status = uct_pd_query(perf->uct.pd, &pd_attr);
    if (status != UCS_OK) {
        ucs_error("Failed to uct_pd_query: %s", ucs_status_string(status));
        goto err;
    }

    dev_addr    = calloc(1, iface_attr.device_addr_len);
    iface_addr  = calloc(1, iface_attr.iface_addr_len);
    ep_addr     = calloc(1, iface_attr.ep_addr_len);
    rkey_buffer = calloc(1, pd_attr.rkey_packed_size);
    if ((dev_addr == NULL) || (iface_addr == NULL) || (ep_addr == NULL) ||
        (rkey_buffer == NULL)) {
        status = UCS_ERR_NO_MEMORY;
        goto err_free;
    }

    status = uct_iface_get_device_address(perf->uct.iface, dev_addr);
    if (status != UCS_OK) {
        ucs_error("Failed to uct_iface_get_device_address: %s",
                  ucs_status_string(status));
        goto err_free;
    }

    if (iface_attr.cap.flags & UCT_IFACE_FLAG_CONNECT_TO_IFACE) {
        status = uct_iface_get_address(perf->uct.iface, iface_addr);
        if (status != UCS_OK) {
            ucs_error("Failed to uct_iface_get_address: %s",
                      ucs_status_string(status));
            goto err_free;
        }
    }

    status = uct_pd_mkey_pack(perf->uct.pd, perf->uct.recv_mem.memh,
                              rkey_buffer);
    if (status != UCS_OK) {
        ucs_error("Failed to uct_pd_mkey_pack: %s", ucs_status_string(status));
        goto err_free;
    }

    group_size  = rte_call(perf, group_size);
    group_index = rte_call(perf, group_index);

    perf->uct.peers = calloc(group_size, sizeof(*perf->uct.peers));
    if (perf->uct.peers == NULL) {
        status = UCS_ERR_NO_MEMORY;
        goto err_free;
    }

    if (iface_attr.cap.flags & UCT_IFACE_FLAG_CONNECT_TO_EP) {
        for (i = 0; i < group_size; ++i) {
            if (i == group_index) {
                continue;
            }

            status = uct_ep_create(perf->uct.iface, &perf->uct.peers[i].ep);
            if (status != UCS_OK) {
                ucs_error("Failed to uct_ep_create: %s",
                          ucs_status_string(status));
                goto err_destroy_eps;
            }

            status = uct_ep_get_address(perf->uct.peers[i].ep, ep_addr);
            if (status != UCS_OK) {
                ucs_error("Failed to uct_ep_get_address: %s",
                          ucs_status_string(status));
                goto err_destroy_eps;
            }
        }
    }

    va = (uintptr_t)perf->recv_buffer;
    vec[0].iov_base = &va;
    vec[0].iov_len  = sizeof(va);
    vec[1].iov_base = rkey_buffer;
    vec[1].iov_len  = pd_attr.rkey_packed_size;
    vec[2].iov_base = dev_addr;
    vec[2].iov_len  = iface_attr.device_addr_len;
    vec[3].iov_base = iface_addr;
    vec[3].iov_len  = iface_attr.iface_addr_len;
    vec[4].iov_base = ep_addr;
    vec[4].iov_len  = iface_attr.ep_addr_len;

    rte_call(perf, post_vec, vec, 5, &req);
    rte_call(perf, exchange_vec, req);

    for (i = 0; i < group_size; ++i) {
        if (i == group_index) {
            continue;
        }

        vec[0].iov_base = &va;
        vec[0].iov_len  = sizeof(va);
        vec[1].iov_base = rkey_buffer;
        vec[1].iov_len  = pd_attr.rkey_packed_size;
        vec[2].iov_base = dev_addr;
        vec[2].iov_len  = iface_attr.device_addr_len;
        vec[3].iov_base = iface_addr;
        vec[3].iov_len  = iface_attr.iface_addr_len;
        vec[4].iov_base = ep_addr;
        vec[4].iov_len  = iface_attr.ep_addr_len;

        rte_call(perf, recv_vec, i, vec, 5, req);

        perf->uct.peers[i].remote_addr = va;

        status = uct_rkey_unpack(rkey_buffer, &perf->uct.peers[i].rkey);
        if (status != UCS_OK) {
            ucs_error("Failed to uct_rkey_unpack: %s",
                      ucs_status_string(status));
            goto err_destroy_eps;
        }

        if (iface_attr.cap.flags & UCT_IFACE_FLAG_CONNECT_TO_EP) {
            status = uct_ep_connect_to_ep(perf->uct.peers[i].ep, dev_addr,
                                          ep_addr);
        } else if (iface_attr.cap.flags & UCT_IFACE_FLAG_CONNECT_TO_IFACE) {
            status = uct_ep_create_connected(perf->uct.iface, dev_addr,
                                             iface_addr,
                                             &perf->uct.peers[i].ep);
        } else {
            status = UCS_ERR_UNSUPPORTED;
        }
        if (status != UCS_OK) {
            ucs_error("Failed to connect endpoint: %s",
                      ucs_status_string(status));
            goto err_destroy_eps;
        }
    }

    uct_perf_iface_flush_b(perf);
    rte_call(perf, barrier);

    free(ep_addr);
    free(iface_addr);
    free(dev_addr);
    free(rkey_buffer);
    return UCS_OK;

err_destroy_eps:
    for (i = 0; i < group_size; ++i) {
        if (perf->uct.peers[i].rkey.type != NULL) {
            uct_rkey_release(&perf->uct.peers[i].rkey);
        }
        if (perf->uct.peers[i].ep != NULL) {
            uct_ep_destroy(perf->uct.peers[i].ep);
        }
    }
    free(perf->uct.peers);
err_free:
    free(ep_addr);
    free(iface_addr);
    free(dev_addr);
    free(rkey_buffer);
err:
    return status;
}