ucs_status_t ucx_perf_run(ucx_perf_params_t *params, ucx_perf_result_t *result)
{
    ucx_perf_context_t *perf;
    ucs_status_t status;

    if (params->command == UCX_PERF_CMD_LAST) {
        ucs_error("Test is not selected");
        status = UCS_ERR_INVALID_PARAM;
        goto out;
    }

    if ((params->api != UCX_PERF_API_UCT) && (params->api != UCX_PERF_API_UCP)) {
        ucs_error("Invalid test API parameter (should be UCT or UCP)");
        status = UCS_ERR_INVALID_PARAM;
        goto out;
    }

    if (UCS_THREAD_MODE_SINGLE != params->thread_mode) {
        /* The multi-threaded path sets up and cleans up its own contexts */
        status = ucx_perf_thread_spawn(params, result);
        goto out;
    }

    perf = malloc(sizeof(*perf));
    if (perf == NULL) {
        status = UCS_ERR_NO_MEMORY;
        goto out;
    }

    ucx_perf_test_reset(perf, params);

    status = ucx_perf_funcs[params->api].setup(perf, params);
    if (status != UCS_OK) {
        goto out_free;
    }

    if (params->warmup_iter > 0) {
        ucx_perf_set_warmup(perf, params);
        status = ucx_perf_funcs[params->api].run(perf);
        if (status != UCS_OK) {
            goto out_cleanup;
        }
        rte_call(perf, barrier);
        ucx_perf_test_reset(perf, params);
    }

    /* Run test */
    status = ucx_perf_funcs[params->api].run(perf);
    rte_call(perf, barrier);
    if (status == UCS_OK) {
        ucx_perf_calc_result(perf, result);
        rte_call(perf, report, result, perf->params.report_arg, 1);
    }

out_cleanup:
    ucx_perf_funcs[params->api].cleanup(perf);
out_free:
    free(perf);
out:
    return status;
}
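/*
 * Minimal caller sketch (not from the original source): it fills only the
 * ucx_perf_params_t fields that ucx_perf_run() above actually inspects.
 * UCX_PERF_CMD_TAG is assumed as an example command constant, and a real
 * invocation also has to provide the RTE callbacks and the test-specific
 * parameters that are omitted here.
 */
static void run_example(void)
{
    ucx_perf_params_t params = {0};
    ucx_perf_result_t result;
    ucs_status_t status;

    params.api          = UCX_PERF_API_UCP;       /* or UCX_PERF_API_UCT */
    params.command      = UCX_PERF_CMD_TAG;       /* assumed command constant */
    params.thread_mode  = UCS_THREAD_MODE_SINGLE; /* single-threaded path */
    params.thread_count = 1;
    params.warmup_iter  = 100;
    params.message_size = 8;

    status = ucx_perf_run(&params, &result);
    if (status != UCS_OK) {
        ucs_error("perf run failed: %s", ucs_status_string(status));
    }
}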
/* pthread variant: spawn one worker thread per requested test thread */
static ucs_status_t ucx_perf_thread_spawn(ucx_perf_params_t* params,
                                          ucx_perf_result_t* result)
{
    ucx_perf_context_t perf;
    ucs_status_t status;
    int ti;
    int nti = params->thread_count;

    ucx_perf_thread_context_t* tctx =
        calloc(nti, sizeof(ucx_perf_thread_context_t));
    ucs_status_t* statuses = calloc(nti, sizeof(ucs_status_t));

    pthread_barrier_t tbarrier;
    pthread_barrier_init(&tbarrier, NULL, nti);

    ucx_perf_test_reset(&perf, params);
    status = ucx_perf_funcs[params->api].setup(&perf, params);
    if (UCS_OK != status) {
        goto out_cleanup;
    }

    for (ti = 0; ti < nti; ti++) {
        tctx[ti].tid      = ti;
        tctx[ti].ntid     = nti;
        tctx[ti].tbarrier = &tbarrier;
        tctx[ti].statuses = statuses;
        tctx[ti].params   = *params;
        tctx[ti].perf     = perf;
        /* Doctor the src and dst buffers to make them thread specific */
        tctx[ti].perf.send_buffer += ti * params->message_size;
        tctx[ti].perf.recv_buffer += ti * params->message_size;
        pthread_create(&tctx[ti].pt, NULL,
                       ucx_perf_thread_run_test, (void*)&tctx[ti]);
    }

    for (ti = 0; ti < nti; ti++) {
        pthread_join(tctx[ti].pt, NULL);
        if (UCS_OK != statuses[ti]) {
            ucs_error("Thread %d failed to run test: %s", tctx[ti].tid,
                      ucs_status_string(statuses[ti]));
            status = statuses[ti];
        }
    }

    ucx_perf_funcs[params->api].cleanup(&perf);
out_cleanup:
    pthread_barrier_destroy(&tbarrier);
    free(statuses);
    free(tctx);
    return status;
}
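/*
 * For reference, a plausible layout of ucx_perf_thread_context_t as implied
 * by the two spawn variants and the thread body below; the real definition
 * lives elsewhere in the tool and may differ.
 */
typedef struct ucx_perf_thread_context {
    pthread_t           pt;        /* worker thread handle (pthread variant) */
    int                 tid;       /* this thread's index */
    int                 ntid;      /* total number of threads */
    pthread_barrier_t   *tbarrier; /* shared barrier for intra-process sync */
    ucs_status_t        *statuses; /* per-thread status array, indexed by tid */
    ucx_perf_params_t   params;    /* private copy of the test parameters */
    ucx_perf_context_t  perf;      /* private copy of the perf context */
    ucx_perf_result_t   result;    /* per-thread result (reported by tid 0) */
} ucx_perf_thread_context_t;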
/* OpenMP variant: run the test body once per OpenMP thread */
static ucs_status_t ucx_perf_thread_spawn(ucx_perf_params_t* params,
                                          ucx_perf_result_t* result)
{
    ucx_perf_context_t perf;
    ucs_status_t status = UCS_OK;
    int ti, nti;

    omp_set_num_threads(params->thread_count);
    nti = params->thread_count;

    ucx_perf_thread_context_t* tctx =
        calloc(nti, sizeof(ucx_perf_thread_context_t));
    ucs_status_t* statuses = calloc(nti, sizeof(ucs_status_t));

    ucx_perf_test_reset(&perf, params);
    status = ucx_perf_funcs[params->api].setup(&perf, params);
    if (UCS_OK != status) {
        goto out_cleanup;
    }

#pragma omp parallel private(ti)
    {
        ti = omp_get_thread_num();
        tctx[ti].tid      = ti;
        tctx[ti].ntid     = nti;
        tctx[ti].statuses = statuses;
        tctx[ti].params   = *params;
        tctx[ti].perf     = perf;
        /* Doctor the src and dst buffers to make them thread specific */
        tctx[ti].perf.send_buffer += ti * params->message_size;
        tctx[ti].perf.recv_buffer += ti * params->message_size;
        tctx[ti].perf.offset       = ti * params->message_size;
        ucx_perf_thread_run_test((void*)&tctx[ti]);
    }

    for (ti = 0; ti < nti; ti++) {
        if (UCS_OK != statuses[ti]) {
            ucs_error("Thread %d failed to run test: %s", tctx[ti].tid,
                      ucs_status_string(statuses[ti]));
            status = statuses[ti];
        }
    }

    ucx_perf_funcs[params->api].cleanup(&perf);
out_cleanup:
    free(statuses);
    free(tctx);
    return status;
}
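/*
 * Note: unlike the pthread variant, this OpenMP spawner never initializes a
 * tbarrier, yet the thread body below calls pthread_barrier_wait() on it.
 * One way to reconcile the two (a sketch, not the original code): since
 * ucx_perf_thread_run_test() is invoked from inside the parallel region,
 * the pthread barrier waits can be replaced with OpenMP's orphaned barrier
 * directive, e.g. in the warmup section:
 */
    if (params->warmup_iter > 0) {
        ucx_perf_set_warmup(perf, params);
        statuses[tid] = ucx_perf_funcs[params->api].run(perf);
#pragma omp barrier                 /* instead of pthread_barrier_wait() */
        for (i = 0; i < tctx->ntid; i++) {
            if (UCS_OK != statuses[i]) {
                goto out;           /* uniform decision: all threads bail */
            }
        }
        if (0 == tid) {
            rte_call(perf, barrier);
        }
        ucx_perf_test_reset(perf, params);
#pragma omp barrier                 /* instead of pthread_barrier_wait() */
    }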
static void* ucx_perf_thread_run_test(void* arg)
{
    ucx_perf_thread_context_t* tctx = (ucx_perf_thread_context_t*)arg;
    ucx_perf_params_t* params       = &tctx->params;
    ucx_perf_result_t* result       = &tctx->result;
    ucx_perf_context_t* perf        = &tctx->perf;
    ucs_status_t* statuses          = tctx->statuses;
    pthread_barrier_t* tbarrier     = tctx->tbarrier;
    int tid                         = tctx->tid;
    int i;

    if (params->warmup_iter > 0) {
        ucx_perf_set_warmup(perf, params);
        statuses[tid] = ucx_perf_funcs[params->api].run(perf);
        pthread_barrier_wait(tbarrier);
        for (i = 0; i < tctx->ntid; i++) {
            if (UCS_OK != statuses[i]) {
                goto out;
            }
        }
        if (0 == tid) {
            rte_call(perf, barrier);
        }
        /* Each thread owns a private copy of the perf context, so each one
         * resets its own counters after the warmup run */
        ucx_perf_test_reset(perf, params);
    }

    /* Run test */
    pthread_barrier_wait(tbarrier);
    statuses[tid] = ucx_perf_funcs[params->api].run(perf);
    pthread_barrier_wait(tbarrier);
    for (i = 0; i < tctx->ntid; i++) {
        if (UCS_OK != statuses[i]) {
            goto out;
        }
    }

    if (0 == tid) {
        rte_call(perf, barrier);
        /* Assuming all threads are fairly treated, report only tid==0.
         * TODO: aggregate reports */
        ucx_perf_calc_result(perf, result);
        rte_call(perf, report, result, perf->params.report_arg, 1);
    }

out:
    return &statuses[tid];
}