static UCS_F_ALWAYS_INLINE ucs_status_t
uct_rc_verbs_iface_poll_rx(uct_rc_verbs_iface_t *iface)
{
    uct_ib_iface_recv_desc_t *desc;
    uct_rc_hdr_t *hdr;
    struct ibv_wc wc[UCT_IB_MAX_WC];
    int i, ret;

    ret = ibv_poll_cq(iface->super.super.recv_cq, UCT_IB_MAX_WC, wc);
    if (ret > 0) {
        for (i = 0; i < ret; ++i) {
            if (ucs_unlikely(wc[i].status != IBV_WC_SUCCESS)) {
                ucs_fatal("Receive completion with error: %s",
                          ibv_wc_status_str(wc[i].status));
            }

            UCS_STATS_UPDATE_COUNTER(iface->super.stats,
                                     UCT_RC_IFACE_STAT_RX_COMPLETION, 1);

            desc = (void*)wc[i].wr_id;
            uct_ib_iface_desc_received(&iface->super.super, desc, wc[i].byte_len, 1);

            hdr = uct_ib_iface_recv_desc_hdr(&iface->super.super, desc);
            uct_ib_log_recv_completion(IBV_QPT_RC, &wc[i], hdr,
                                       uct_rc_ep_am_packet_dump);
            uct_rc_iface_invoke_am(&iface->super, hdr, wc[i].byte_len, desc);
        }
        iface->super.rx.available += ret;
        return UCS_OK;
    } else if (ret == 0) {
        uct_rc_verbs_iface_post_recv(iface, 0);
        return UCS_ERR_NO_PROGRESS;
    } else {
        ucs_fatal("Failed to poll receive CQ");
    }
}
static inline ucs_status_t
uct_ud_verbs_iface_poll_rx(uct_ud_verbs_iface_t *iface)
{
    uct_ib_iface_recv_desc_t *desc;
    struct ibv_wc wc[UCT_IB_MAX_WC];
    int i, ret;
    char *packet;

    ret = ibv_poll_cq(iface->super.super.recv_cq, UCT_IB_MAX_WC, wc);
    if (ret == 0) {
        return UCS_ERR_NO_PROGRESS;
    }
    if (ucs_unlikely(ret < 0)) {
        ucs_fatal("Failed to poll receive CQ");
    }

    for (i = 0; i < ret; ++i) {
        if (ucs_unlikely(wc[i].status != IBV_WC_SUCCESS)) {
            ucs_fatal("Receive completion with error: %s",
                      ibv_wc_status_str(wc[i].status));
        }

        desc = (void*)wc[i].wr_id;
        ucs_trace_data("pkt rcvd: buf=%p len=%d", desc, wc[i].byte_len);
        packet = uct_ib_iface_recv_desc_hdr(&iface->super.super, desc);
        VALGRIND_MAKE_MEM_DEFINED(packet, wc[i].byte_len);

        uct_ud_ep_process_rx(&iface->super,
                             (uct_ud_neth_t *)(packet + UCT_IB_GRH_LEN),
                             wc[i].byte_len - UCT_IB_GRH_LEN,
                             (uct_ud_recv_skb_t *)desc);
    }
    iface->super.rx.available += ret;
    uct_ud_verbs_iface_post_recv(iface);
    return UCS_OK;
}
static UCS_F_ALWAYS_INLINE void
uct_rc_verbs_ep_post_send(uct_rc_verbs_iface_t *iface, uct_rc_verbs_ep_t *ep,
                          struct ibv_send_wr *wr, int send_flags, int max_log_sge)
{
    struct ibv_send_wr *bad_wr;
    int ret;

    uct_rc_txqp_check(&ep->super.txqp);

    if (!(send_flags & IBV_SEND_SIGNALED)) {
        send_flags |= uct_rc_iface_tx_moderation(&iface->super, &ep->super.txqp,
                                                 IBV_SEND_SIGNALED);
    }
    if (wr->opcode == IBV_WR_RDMA_READ) {
        send_flags |= uct_rc_ep_atomic_fence(&iface->super, &ep->fi,
                                             IBV_SEND_FENCE);
    }

    wr->send_flags = send_flags;
    wr->wr_id      = uct_rc_txqp_unsignaled(&ep->super.txqp);

    uct_ib_log_post_send(&iface->super.super, ep->super.txqp.qp, wr, max_log_sge,
                         (wr->opcode == IBV_WR_SEND) ? uct_rc_ep_packet_dump : NULL);

    ret = ibv_post_send(ep->super.txqp.qp, wr, &bad_wr);
    if (ret != 0) {
        ucs_fatal("ibv_post_send() returned %d (%m)", ret);
    }

    uct_rc_verbs_txqp_posted(&ep->super.txqp, &ep->txcnt, &iface->super,
                             send_flags & IBV_SEND_SIGNALED);
}
static ucs_status_t ucs_async_handler_dispatch(ucs_async_handler_t *handler)
{
    ucs_async_context_t *async;
    ucs_async_mode_t mode;
    ucs_status_t status;

    mode  = handler->mode;
    async = handler->async;
    if (async != NULL) {
        async->last_wakeup = ucs_get_time();
    }

    if (async == NULL) {
        ucs_trace_async("calling async handler " UCS_ASYNC_HANDLER_FMT,
                        UCS_ASYNC_HANDLER_ARG(handler));
        handler->cb(handler->id, handler->arg);
    } else if (ucs_async_method_call(mode, context_try_block, async)) {
        ucs_trace_async("calling async handler " UCS_ASYNC_HANDLER_FMT,
                        UCS_ASYNC_HANDLER_ARG(handler));
        handler->cb(handler->id, handler->arg);
        ucs_async_method_call(mode, context_unblock, async);
    } else /* async != NULL */ {
        ucs_trace_async("missed " UCS_ASYNC_HANDLER_FMT ", last_wakeup %lu",
                        UCS_ASYNC_HANDLER_ARG(handler), async->last_wakeup);
        if (ucs_atomic_cswap32(&handler->missed, 0, 1) == 0) {
            status = ucs_mpmc_queue_push(&async->missed, handler->id);
            if (status != UCS_OK) {
                ucs_fatal("Failed to push event %d to miss queue: %s",
                          handler->id, ucs_status_string(status));
            }
        }
        return UCS_ERR_NO_PROGRESS;
    }
    return UCS_OK;
}
static UCS_F_ALWAYS_INLINE void
uct_rc_verbs_ep_post_send(uct_rc_verbs_iface_t *iface, uct_rc_verbs_ep_t *ep,
                          struct ibv_send_wr *wr, int send_flags)
{
    struct ibv_send_wr *bad_wr;
    int ret;

    uct_rc_txqp_check(&ep->super.txqp);

    if (!(send_flags & IBV_SEND_SIGNALED)) {
        send_flags |= uct_rc_iface_tx_moderation(&iface->super, &ep->super.txqp,
                                                 IBV_SEND_SIGNALED);
    }

    wr->send_flags = send_flags;
    wr->wr_id      = uct_rc_txqp_unsignaled(&ep->super.txqp);

    uct_ib_log_post_send(&iface->super.super, ep->super.txqp.qp, wr,
                         (wr->opcode == IBV_WR_SEND) ? uct_rc_ep_am_packet_dump : NULL);
    UCT_IB_INSTRUMENT_RECORD_SEND_WR_LEN("uct_rc_verbs_ep_post_send", wr);

    ret = ibv_post_send(ep->super.txqp.qp, wr, &bad_wr);
    if (ret != 0) {
        ucs_fatal("ibv_post_send() returned %d (%m)", ret);
    }

    uct_rc_verbs_txqp_posted(&ep->super.txqp, &ep->txcnt, &iface->super,
                             send_flags & IBV_SEND_SIGNALED);
}
static UCS_F_ALWAYS_INLINE void
uct_rc_verbs_exp_post_send(uct_rc_verbs_ep_t *ep, struct ibv_exp_send_wr *wr,
                           uint64_t signal)
{
    uct_rc_verbs_iface_t *iface = ucs_derived_of(ep->super.super.super.iface,
                                                 uct_rc_verbs_iface_t);
    struct ibv_exp_send_wr *bad_wr;
    int ret;

    uct_rc_txqp_check(&ep->super.txqp);

    signal |= uct_rc_iface_tx_moderation(&iface->super, &ep->super.txqp,
                                         IBV_EXP_SEND_SIGNALED);
    wr->exp_send_flags = signal;
    wr->wr_id          = uct_rc_txqp_unsignaled(&ep->super.txqp);

    uct_ib_log_exp_post_send(&iface->super.super, ep->super.txqp.qp, wr,
                             (wr->exp_opcode == IBV_EXP_WR_SEND) ?
                             uct_rc_ep_am_packet_dump : NULL);
    UCT_IB_INSTRUMENT_RECORD_SEND_EXP_WR_LEN("uct_rc_verbs_exp_post_send", wr);

    ret = ibv_exp_post_send(ep->super.txqp.qp, wr, &bad_wr);
    if (ret != 0) {
        ucs_fatal("ibv_exp_post_send() returned %d (%m)", ret);
    }

    uct_rc_verbs_txqp_posted(&ep->super.txqp, &ep->txcnt, &iface->super, signal);
}
size_t ucp_tag_rndv_rts_pack(void *dest, void *arg)
{
    ucp_request_t *sreq              = arg;   /* send request */
    ucp_rndv_rts_hdr_t *rndv_rts_hdr = dest;
    ucp_worker_h worker              = sreq->send.ep->worker;
    ssize_t packed_rkey_size;

    rndv_rts_hdr->super.tag        = sreq->send.tag.tag;
    rndv_rts_hdr->sreq.reqptr      = (uintptr_t)sreq;
    rndv_rts_hdr->sreq.sender_uuid = worker->uuid;
    rndv_rts_hdr->size             = sreq->send.length;

    /* Pack remote keys (which can be an empty list) */
    if (UCP_DT_IS_CONTIG(sreq->send.datatype) &&
        ucp_rndv_is_get_zcopy(sreq, worker->context->config.ext.rndv_mode)) {
        /* pack rkey, ask target to do get_zcopy */
        rndv_rts_hdr->address = (uintptr_t)sreq->send.buffer;
        packed_rkey_size = ucp_rkey_pack_uct(worker->context,
                                             sreq->send.state.dt.dt.contig.md_map,
                                             sreq->send.state.dt.dt.contig.memh,
                                             rndv_rts_hdr + 1);
        if (packed_rkey_size < 0) {
            ucs_fatal("failed to pack rendezvous remote key: %s",
                      ucs_status_string((ucs_status_t)packed_rkey_size));
        }
    } else {
        rndv_rts_hdr->address = 0;
        packed_rkey_size      = 0;
    }

    return sizeof(*rndv_rts_hdr) + packed_rkey_size;
}
static UCS_F_ALWAYS_INLINE void
uct_rc_verbs_exp_post_send(uct_rc_verbs_ep_t *ep, struct ibv_exp_send_wr *wr,
                           int signal)
{
    uct_rc_verbs_iface_t *iface = ucs_derived_of(ep->super.super.super.iface,
                                                 uct_rc_verbs_iface_t);
    struct ibv_exp_send_wr *bad_wr;
    int ret;

    if (!signal) {
        signal = uct_rc_iface_tx_moderation(&iface->super, &ep->super,
                                            IBV_EXP_SEND_SIGNALED);
    }

    wr->exp_send_flags |= signal;
    wr->wr_id           = ep->super.unsignaled;

    uct_ib_log_exp_post_send(ep->super.qp, wr,
                             (wr->exp_opcode == IBV_EXP_WR_SEND) ?
                             uct_rc_ep_am_packet_dump : NULL);

    ret = ibv_exp_post_send(ep->super.qp, wr, &bad_wr);
    if (ret != 0) {
        ucs_fatal("ibv_exp_post_send() returned %d (%m)", ret);
    }

    uct_rc_verbs_ep_posted(ep, signal);
}
unsigned uct_rc_verbs_iface_post_recv_always(uct_rc_iface_t *iface, unsigned max)
{
    struct ibv_recv_wr *bad_wr;
    uct_ib_recv_wr_t *wrs;
    unsigned count;
    int ret;

    wrs = ucs_alloca(sizeof(*wrs) * max);

    count = uct_ib_iface_prepare_rx_wrs(&iface->super, &iface->rx.mp, wrs, max);
    if (ucs_unlikely(count == 0)) {
        return 0;
    }

    UCT_IB_INSTRUMENT_RECORD_RECV_WR_LEN("uct_rc_iface_post_recv_always",
                                         &wrs[0].ibwr);
    ret = ibv_post_srq_recv(iface->rx.srq, &wrs[0].ibwr, &bad_wr);
    if (ret != 0) {
        ucs_fatal("ibv_post_srq_recv() returned %d: %m", ret);
    }
    iface->rx.available -= count;
    return count;
}
void ucs_global_opts_init()
{
    ucs_status_t status;

    status = ucs_config_parser_fill_opts(&ucs_global_opts, ucs_global_opts_table,
                                         NULL, NULL, 1);
    if (status != UCS_OK) {
        ucs_fatal("failed to parse global configuration - aborting");
    }
}
void ucp_tag_eager_sync_zcopy_req_complete(ucp_request_t *req, ucs_status_t status)
{
    if (req->send.state.dt.offset == req->send.length) {
        ucp_request_send_buffer_dereg(req); /* TODO register+lane change */
        ucp_tag_eager_sync_completion(req, UCP_REQUEST_FLAG_LOCAL_COMPLETED,
                                      status);
    } else if (status != UCS_OK) {
        ucs_fatal("error handling is not supported with tag-sync protocol");
    }
}
static UCS_F_ALWAYS_INLINE void
uct_rc_verbs_iface_poll_tx(uct_rc_verbs_iface_t *iface)
{
    struct ibv_wc wc[UCT_IB_MAX_WC];
    uct_rc_verbs_ep_t *ep;
    uct_rc_iface_send_op_t *op;
    unsigned count;
    uint16_t sn;
    int i, ret;

    ret = ibv_poll_cq(iface->super.super.send_cq, UCT_IB_MAX_WC, wc);
    if (ucs_unlikely(ret <= 0)) {
        if (ucs_unlikely(ret < 0)) {
            ucs_fatal("Failed to poll send CQ");
        }
        return;
    }

    for (i = 0; i < ret; ++i) {
        if (ucs_unlikely(wc[i].status != IBV_WC_SUCCESS)) {
            ucs_fatal("Send completion with error: %s",
                      ibv_wc_status_str(wc[i].status));
        }

        UCS_STATS_UPDATE_COUNTER(iface->super.stats,
                                 UCT_RC_IFACE_STAT_TX_COMPLETION, 1);

        ep = ucs_derived_of(uct_rc_iface_lookup_ep(&iface->super, wc[i].qp_num),
                            uct_rc_verbs_ep_t);
        ucs_assert(ep != NULL);

        count = wc[i].wr_id + 1; /* number of posted sends covered by this
                                    signaled completion */
        ep->super.available     += count;
        ep->tx.completion_count += count;
        ++iface->super.tx.cq_available;

        sn = ep->tx.completion_count;
        ucs_queue_for_each_extract(op, &ep->super.outstanding, queue,
                                   UCS_CIRCULAR_COMPARE16(op->sn, <=, sn)) {
            op->handler(op);
        }
    }
}
void *uct_mm_ep_attach_remote_seg(uct_mm_ep_t *ep, uct_mm_iface_t *iface,
                                  uct_mm_fifo_element_t *elem)
{
    uct_mm_remote_seg_t *remote_seg, search;
    ucs_status_t status;

    /* take the mmid of the chunk that the desc belongs to (the desc that the
     * fifo_elem is 'assigned' to), and check if the ep has already attached
     * to it. */
    search.mmid = elem->desc_mmid;
    remote_seg = sglib_hashed_uct_mm_remote_seg_t_find_member(ep->remote_segments_hash,
                                                              &search);
    if (remote_seg == NULL) {
        /* not in the hash. attach to the memory the mmid refers to. the attach
         * call will return the base address of the mmid's chunk -
         * save this base address in a hash table (which maps mmid to base address). */
        remote_seg = ucs_malloc(sizeof(*remote_seg), "mm_desc");
        if (remote_seg == NULL) {
            ucs_fatal("Failed to allocate memory for a remote segment identifier. %m");
        }

        status = uct_mm_md_mapper_ops(iface->super.md)->attach(elem->desc_mmid,
                                                               elem->desc_mpool_size,
                                                               elem->desc_chunk_base_addr,
                                                               &remote_seg->address,
                                                               &remote_seg->cookie,
                                                               iface->path);
        if (status != UCS_OK) {
            ucs_fatal("Failed to attach to remote mmid %zu: %s", elem->desc_mmid,
                      ucs_status_string(status));
        }

        remote_seg->mmid   = elem->desc_mmid;
        remote_seg->length = elem->desc_mpool_size;

        /* put the base address into the ep's hash table */
        sglib_hashed_uct_mm_remote_seg_t_add(ep->remote_segments_hash, remote_seg);
    }

    return remote_seg->address;
}
static inline void uct_ud_verbs_iface_poll_tx(uct_ud_verbs_iface_t *iface)
{
    struct ibv_wc wc;
    int ret;

    ret = ibv_poll_cq(iface->super.super.send_cq, 1, &wc);
    if (ucs_unlikely(ret < 0)) {
        ucs_fatal("Failed to poll send CQ");
        return;
    }
    if (ret == 0) {
        return;
    }
    if (ucs_unlikely(wc.status != IBV_WC_SUCCESS)) {
        ucs_fatal("Send completion (wr_id=0x%x) with error: %s",
                  (unsigned)wc.wr_id, ibv_wc_status_str(wc.status));
        return;
    }
    iface->super.tx.available += UCT_UD_TX_MODERATION + 1;
}
static void uct_cm_iface_outstanding_remove(uct_cm_iface_t *iface,
                                            struct ib_cm_id *id)
{
    unsigned index;

    for (index = 0; index < iface->num_outstanding; ++index) {
        if (iface->outstanding[index] == id) {
            iface->outstanding[index] = uct_cm_iface_outstanding_pop(iface);
            return;
        }
    }

    ucs_fatal("outstanding cm id %p not found", id);
}
static void uct_tcp_ep_epoll_ctl(uct_tcp_ep_t *ep, int op)
{
    uct_tcp_iface_t *iface = ucs_derived_of(ep->super.super.iface,
                                            uct_tcp_iface_t);
    struct epoll_event epoll_event;
    int ret;

    memset(&epoll_event, 0, sizeof(epoll_event));
    epoll_event.data.ptr = ep;
    epoll_event.events   = ep->events;

    ret = epoll_ctl(iface->epfd, op, ep->fd, &epoll_event);
    if (ret < 0) {
        ucs_fatal("epoll_ctl(epfd=%d, op=%d, fd=%d) failed: %m",
                  iface->epfd, op, ep->fd);
    }
}
static void
ucs_config_parser_print_opts_recurs(FILE *stream, const void *opts,
                                    const ucs_config_field_t *fields,
                                    unsigned flags, const char *env_prefix,
                                    const char *table_prefix)
{
    const ucs_config_field_t *field, *aliased_field;
    size_t alias_table_offset;
    const char *prefix;

    prefix = (table_prefix == NULL) ? "" : table_prefix;

    for (field = fields; field->name; ++field) {
        if (ucs_config_is_table_field(field)) {
            /* Parse with sub-table prefix */
            if (table_prefix == NULL) {
                ucs_config_parser_print_opts_recurs(stream, opts + field->offset,
                                                    field->parser.arg, flags,
                                                    env_prefix, field->name);
            } else {
                ucs_config_parser_print_opts_recurs(stream, opts + field->offset,
                                                    field->parser.arg, flags,
                                                    env_prefix, table_prefix);
            }
        } else if (ucs_config_is_alias_field(field)) {
            if (flags & UCS_CONFIG_PRINT_HIDDEN) {
                aliased_field = ucs_config_find_aliased_field(fields, field,
                                                              &alias_table_offset);
                if (aliased_field == NULL) {
                    ucs_fatal("could not find aliased field of %s", field->name);
                }
                ucs_config_parser_print_field(stream, opts + alias_table_offset,
                                              env_prefix, table_prefix,
                                              field->name, aliased_field, flags,
                                              "(alias of %s%s%s)", env_prefix,
                                              table_prefix, aliased_field->name);
            }
        } else {
            ucs_config_parser_print_field(stream, opts, env_prefix, prefix,
                                          field->name, field, flags, NULL);
        }
    }
}
ucs_status_t ucp_tag_send(ucp_ep_h ep, const void *buffer, size_t length,
                          ucp_tag_t tag)
{
    ucp_worker_h worker = ep->worker;
    ucs_status_t status;

retry:
    if (ucs_likely(length < ep->config.max_short_tag)) {
        UCS_STATIC_ASSERT(sizeof(ucp_tag_t) == sizeof(uint64_t));
        status = uct_ep_am_short(ep->uct_ep, UCP_AM_ID_EAGER_ONLY, tag, buffer,
                                 length);
        if (status == UCS_ERR_NO_RESOURCE) {
            ucp_worker_progress(worker);
            goto retry;
        }
        return status;
    }

    ucs_fatal("unsupported");
}
void uct_dc_ep_set_failed(ucs_class_t *ep_cls, uct_dc_iface_t *iface,
                          uint32_t qp_num)
{
    uint8_t dci     = uct_dc_iface_dci_find(iface, qp_num);
    uct_dc_ep_t *ep = iface->tx.dcis[dci].ep;

    if (!ep) {
        return;
    }

    uct_rc_txqp_purge_outstanding(&iface->tx.dcis[dci].txqp,
                                  UCS_ERR_ENDPOINT_TIMEOUT, 0);
    uct_set_ep_failed(ep_cls, &ep->super.super,
                      &iface->super.super.super.super);

    if (UCS_OK != uct_dc_iface_dci_reconnect(iface, &iface->tx.dcis[dci].txqp)) {
        ucs_fatal("Unsuccessful reconnect of DC QP #%u", qp_num);
    }

    uct_rc_txqp_available_set(&iface->tx.dcis[dci].txqp,
                              iface->super.config.tx_qp_len);
}
static UCS_F_NOINLINE void
uct_ud_verbs_iface_post_recv_always(uct_ud_verbs_iface_t *iface, int max)
{
    struct ibv_recv_wr *bad_wr;
    uct_ib_recv_wr_t *wrs;
    unsigned count;
    int ret;

    wrs = ucs_alloca(sizeof(*wrs) * max);

    count = uct_ib_iface_prepare_rx_wrs(&iface->super.super, &iface->super.rx.mp,
                                        wrs, max);
    if (count == 0) {
        return;
    }

    ret = ibv_post_recv(iface->super.qp, &wrs[0].ibwr, &bad_wr);
    if (ret != 0) {
        ucs_fatal("ibv_post_recv() returned %d: %m", ret);
    }
    iface->super.rx.available -= count;
}
static UCS_F_ALWAYS_INLINE void
uct_rc_verbs_ep_post_send(uct_rc_verbs_iface_t *iface, uct_rc_verbs_ep_t *ep,
                          struct ibv_send_wr *wr, int send_flags)
{
    struct ibv_send_wr *bad_wr;
    int ret;

    if (!(send_flags & IBV_SEND_SIGNALED)) {
        send_flags |= uct_rc_iface_tx_moderation(&iface->super, &ep->super,
                                                 IBV_SEND_SIGNALED);
    }

    wr->send_flags = send_flags;
    wr->wr_id      = ep->super.unsignaled;

    uct_ib_log_post_send(ep->super.qp, wr,
                         (wr->opcode == IBV_WR_SEND) ? uct_rc_ep_am_packet_dump : NULL);

    ret = ibv_post_send(ep->super.qp, wr, &bad_wr);
    if (ret != 0) {
        ucs_fatal("ibv_post_send() returned %d (%m)", ret);
    }

    uct_rc_verbs_ep_posted(ep, send_flags & IBV_SEND_SIGNALED);
}
void __ucs_wtimer_add(ucs_twheel_t *t, ucs_wtimer_t *timer, ucs_time_t delta)
{
    uint64_t slot;

    timer->is_active = 1;
    slot = delta >> t->res_order;
    if (ucs_unlikely(slot == 0)) {
        /* nothing is really wrong with adding the timer to the current slot,
         * but we want to guard against spending too much time in hi-res
         * timer processing */
        ucs_fatal("Timer resolution is too low. Min resolution %lf usec, wanted %lf usec",
                  ucs_time_to_usec(t->res), ucs_time_to_usec(delta));
    }
    ucs_assert(slot > 0);

    if (ucs_unlikely(slot >= t->num_slots)) {
        slot = t->num_slots - 1;
    }

    slot = (t->current + slot) % t->num_slots;
    ucs_assert(slot != t->current);
    ucs_list_add_tail(&t->wheel[slot], &timer->list);
}
static ucs_status_t ucp_perf_test_setup_endpoints(ucx_perf_context_t *perf,
                                                  uint64_t features)
{
    const size_t buffer_size = 2048;
    ucx_perf_ep_info_t info, *remote_info;
    unsigned group_size, i, group_index;
    ucp_address_t *address;
    size_t address_length = 0;
    ucp_ep_params_t ep_params;
    ucs_status_t status;
    struct iovec vec[3];
    void *rkey_buffer;
    void *req = NULL;
    void *buffer;

    group_size  = rte_call(perf, group_size);
    group_index = rte_call(perf, group_index);

    status = ucp_worker_get_address(perf->ucp.worker, &address, &address_length);
    if (status != UCS_OK) {
        if (perf->params.flags & UCX_PERF_TEST_FLAG_VERBOSE) {
            ucs_error("ucp_worker_get_address() failed: %s",
                      ucs_status_string(status));
        }
        goto err;
    }

    info.ucp.addr_len = address_length;
    info.recv_buffer  = (uintptr_t)perf->recv_buffer;

    vec[0].iov_base = &info;
    vec[0].iov_len  = sizeof(info);
    vec[1].iov_base = address;
    vec[1].iov_len  = address_length;

    if (features & (UCP_FEATURE_RMA|UCP_FEATURE_AMO32|UCP_FEATURE_AMO64)) {
        status = ucp_rkey_pack(perf->ucp.context, perf->ucp.recv_memh,
                               &rkey_buffer, &info.rkey_size);
        if (status != UCS_OK) {
            if (perf->params.flags & UCX_PERF_TEST_FLAG_VERBOSE) {
                ucs_error("ucp_rkey_pack() failed: %s", ucs_status_string(status));
            }
            ucp_worker_release_address(perf->ucp.worker, address);
            goto err;
        }

        vec[2].iov_base = rkey_buffer;
        vec[2].iov_len  = info.rkey_size;
        rte_call(perf, post_vec, vec, 3, &req);
        ucp_rkey_buffer_release(rkey_buffer);
    } else {
        info.rkey_size = 0;
        rte_call(perf, post_vec, vec, 2, &req);
    }

    ucp_worker_release_address(perf->ucp.worker, address);
    rte_call(perf, exchange_vec, req);

    perf->ucp.peers = calloc(group_size, sizeof(*perf->ucp.peers));
    if (perf->ucp.peers == NULL) {
        status = UCS_ERR_NO_MEMORY;
        goto err;
    }

    buffer = malloc(buffer_size);
    if (buffer == NULL) {
        ucs_error("Failed to allocate RTE receive buffer");
        status = UCS_ERR_NO_MEMORY;
        goto err_destroy_eps;
    }

    for (i = 0; i < group_size; ++i) {
        if (i == group_index) {
            continue;
        }

        rte_call(perf, recv, i, buffer, buffer_size, req);

        remote_info = buffer;
        address     = (void*)(remote_info + 1);
        rkey_buffer = (void*)address + remote_info->ucp.addr_len;
        perf->ucp.peers[i].remote_addr = remote_info->recv_buffer;

        ep_params.field_mask = UCP_EP_PARAM_FIELD_REMOTE_ADDRESS;
        ep_params.address    = address;

        status = ucp_ep_create(perf->ucp.worker, &ep_params,
                               &perf->ucp.peers[i].ep);
        if (status != UCS_OK) {
            if (perf->params.flags & UCX_PERF_TEST_FLAG_VERBOSE) {
                ucs_error("ucp_ep_create() failed: %s", ucs_status_string(status));
            }
            goto err_free_buffer;
        }

        if (remote_info->rkey_size > 0) {
            status = ucp_ep_rkey_unpack(perf->ucp.peers[i].ep, rkey_buffer,
                                        &perf->ucp.peers[i].rkey);
            if (status != UCS_OK) {
                if (perf->params.flags & UCX_PERF_TEST_FLAG_VERBOSE) {
                    ucs_fatal("ucp_rkey_unpack() failed: %s",
                              ucs_status_string(status));
                }
                goto err_free_buffer;
            }
        } else {
            perf->ucp.peers[i].rkey = NULL;
        }
    }

    free(buffer);

    status = ucp_perf_test_exchange_status(perf, UCS_OK);
    if (status != UCS_OK) {
        ucp_perf_test_destroy_eps(perf, group_size);
    }
    return status;

err_free_buffer:
    free(buffer);
err_destroy_eps:
    ucp_perf_test_destroy_eps(perf, group_size);
err:
    (void)ucp_perf_test_exchange_status(perf, status);
    return status;
}
ucs_status_t ucp_wireup_init_lanes(ucp_ep_h ep, unsigned address_count,
                                   const ucp_address_entry_t *address_list,
                                   uint8_t *addr_indices)
{
    ucp_worker_h worker = ep->worker;
    ucp_ep_config_key_t key;
    uint16_t new_cfg_index;
    ucp_lane_index_t lane;
    ucs_status_t status;
    uint8_t conn_flag;
    char str[32];

    ucs_trace("ep %p: initialize lanes", ep);

    status = ucp_wireup_select_lanes(ep, address_count, address_list,
                                     addr_indices, &key);
    if (status != UCS_OK) {
        goto err;
    }

    key.reachable_md_map |= ucp_ep_config(ep)->key.reachable_md_map;

    new_cfg_index = ucp_worker_get_ep_config(worker, &key);
    if (ep->cfg_index == new_cfg_index) {
        return UCS_OK; /* No change */
    }

    if ((ep->cfg_index != 0) && !ucp_ep_is_stub(ep)) {
        /*
         * TODO handle a case where we have to change lanes and reconfigure the ep:
         *
         * - if we already have uct ep connected to an address - move it to the new lane index
         * - if we don't yet have connection to an address - create it
         * - if an existing lane is not connected anymore - delete it (possibly)
         * - if the configuration has changed - replay all pending operations on all lanes -
         *   need that every pending callback would return, in case of failure, the number
         *   of lane it wants to be queued on.
         */
        ucs_debug("cannot reconfigure ep %p from [%d] to [%d]", ep, ep->cfg_index,
                  new_cfg_index);
        ucp_wireup_print_config(worker->context, &ucp_ep_config(ep)->key, "old",
                                NULL);
        ucp_wireup_print_config(worker->context, &key, "new", NULL);
        ucs_fatal("endpoint reconfiguration not supported yet");
    }

    ep->cfg_index = new_cfg_index;
    ep->am_lane   = key.am_lane;

    snprintf(str, sizeof(str), "ep %p", ep);
    ucp_wireup_print_config(worker->context, &ucp_ep_config(ep)->key, str,
                            addr_indices);

    ucs_trace("ep %p: connect lanes", ep);

    /* establish connections on all underlying endpoints */
    conn_flag = UCP_EP_FLAG_LOCAL_CONNECTED;
    for (lane = 0; lane < ucp_ep_num_lanes(ep); ++lane) {
        status = ucp_wireup_connect_lane(ep, lane, address_count, address_list,
                                         addr_indices[lane]);
        if (status != UCS_OK) {
            goto err;
        }

        if (ucp_worker_is_tl_p2p(worker, ucp_ep_get_rsc_index(ep, lane))) {
            conn_flag = 0; /* If we have a p2p transport, we're not connected */
        }
    }
    ep->flags |= conn_flag;
    return UCS_OK;

err:
    for (lane = 0; lane < ucp_ep_num_lanes(ep); ++lane) {
        if (ep->uct_eps[lane] != NULL) {
            uct_ep_destroy(ep->uct_eps[lane]);
            ep->uct_eps[lane] = NULL;
        }
    }
    return status;
}
/*
 * Generic data-pointer posting function.
 * Parameters which are not relevant to the opcode are ignored.
 *
 *            +------+-----+---------------+-------+
 * SEND       | CTRL | INL | am_id  am_hdr | DPSEG |
 *            +------+-----+-------+-------+-------+
 * RDMA_WRITE | CTRL | RADDR      | DPSEG  |
 *            +------+------------+--------+-------+
 * ATOMIC     | CTRL | RADDR      | ATOMIC | DPSEG |
 *            +------+------------+--------+-------+
 */
static UCS_F_ALWAYS_INLINE void
uct_rc_mlx5_ep_dptr_post(uct_rc_mlx5_ep_t *ep, unsigned opcode_flags,
                         const void *buffer, unsigned length, uint32_t *lkey_p,
                         /* SEND */ uint8_t am_id, const void *am_hdr,
                         unsigned am_hdr_len,
                         /* RDMA/ATOMIC */ uint64_t remote_addr, uct_rkey_t rkey,
                         /* ATOMIC */ uint64_t compare_mask, uint64_t compare,
                         uint64_t swap_add, int signal)
{
    struct mlx5_wqe_ctrl_seg *ctrl;
    struct mlx5_wqe_raddr_seg *raddr;
    struct mlx5_wqe_atomic_seg *atomic;
    struct mlx5_wqe_data_seg *dptr;
    struct mlx5_wqe_inl_data_seg *inl;
    struct uct_ib_mlx5_atomic_masked_cswap32_seg *masked_cswap32;
    struct uct_ib_mlx5_atomic_masked_fadd32_seg *masked_fadd32;
    struct uct_ib_mlx5_atomic_masked_cswap64_seg *masked_cswap64;
    uct_rc_mlx5_iface_t *iface;
    uct_rc_hdr_t *rch;
    unsigned wqe_size, inl_seg_size;
    uint8_t opmod;

    iface = ucs_derived_of(ep->super.super.super.iface, uct_rc_mlx5_iface_t);
    if (!signal) {
        signal = uct_rc_iface_tx_moderation(&iface->super, &ep->super,
                                            MLX5_WQE_CTRL_CQ_UPDATE);
    } else {
        ucs_assert(signal == MLX5_WQE_CTRL_CQ_UPDATE);
    }

    opmod = 0;
    ctrl  = ep->tx.wq.curr;
    switch (opcode_flags) {
    case MLX5_OPCODE_SEND:
        inl_seg_size = ucs_align_up_pow2(sizeof(*inl) + sizeof(*rch) + am_hdr_len,
                                         UCT_IB_MLX5_WQE_SEG_SIZE);

        ucs_assert(sizeof(*ctrl) + inl_seg_size + sizeof(*dptr) <=
                   UCT_RC_MLX5_MAX_BB * MLX5_SEND_WQE_BB);
        ucs_assert(length + sizeof(*rch) + am_hdr_len <=
                   iface->super.super.config.seg_size);

        /* Inline segment with AM ID and header */
        inl             = (void*)(ctrl + 1);
        inl->byte_count = htonl((sizeof(*rch) + am_hdr_len) | MLX5_INLINE_SEG);
        rch             = (void*)(inl + 1);
        rch->am_id      = am_id;

        uct_ib_mlx5_inline_copy(rch + 1, am_hdr, am_hdr_len, &ep->tx.wq);

        /* Data segment with payload */
        if (length == 0) {
            wqe_size = sizeof(*ctrl) + inl_seg_size;
        } else {
            wqe_size = sizeof(*ctrl) + inl_seg_size + sizeof(*dptr);
            dptr     = (void*)(ctrl + 1) + inl_seg_size;
            if (ucs_unlikely((void*)dptr >= ep->tx.wq.qend)) {
                dptr = (void*)dptr - (ep->tx.wq.qend - ep->tx.wq.qstart);
            }

            ucs_assert((void*)dptr >= ep->tx.wq.qstart);
            ucs_assert((void*)(dptr + 1) <= ep->tx.wq.qend);
            uct_ib_mlx5_set_data_seg(dptr, buffer, length, *lkey_p);
        }
        break;

    case MLX5_OPCODE_SEND|UCT_RC_MLX5_OPCODE_FLAG_RAW:
        /* Data segment only */
        ucs_assert(length < (2ul << 30));
        ucs_assert(length <= iface->super.super.config.seg_size);

        wqe_size = sizeof(*ctrl) + sizeof(*dptr);
        uct_ib_mlx5_set_data_seg((void*)(ctrl + 1), buffer, length, *lkey_p);
        break;

    case MLX5_OPCODE_RDMA_READ:
    case MLX5_OPCODE_RDMA_WRITE:
        /* Set RDMA segment */
        ucs_assert(length <= UCT_IB_MAX_MESSAGE_SIZE);

        raddr = (void*)(ctrl + 1);
        uct_rc_mlx5_ep_set_rdma_seg(raddr, remote_addr, rkey);

        /* Data segment */
        if (length == 0) {
            wqe_size = sizeof(*ctrl) + sizeof(*raddr);
        } else {
            wqe_size = sizeof(*ctrl) + sizeof(*raddr) + sizeof(*dptr);
            uct_ib_mlx5_set_data_seg((void*)(raddr + 1), buffer, length, *lkey_p);
        }
        break;

    case MLX5_OPCODE_ATOMIC_FA:
    case MLX5_OPCODE_ATOMIC_CS:
        ucs_assert(length == sizeof(uint64_t));
        raddr = (void*)(ctrl + 1);
        uct_rc_mlx5_ep_set_rdma_seg(raddr, remote_addr, rkey);

        atomic = (void*)(raddr + 1);
        if (opcode_flags == MLX5_OPCODE_ATOMIC_CS) {
            atomic->compare = compare;
        }
        atomic->swap_add = swap_add;

        uct_ib_mlx5_set_data_seg((void*)(atomic + 1), buffer, length, *lkey_p);
        wqe_size = sizeof(*ctrl) + sizeof(*raddr) + sizeof(*atomic) +
                   sizeof(*dptr);
        break;

    case MLX5_OPCODE_ATOMIC_MASKED_CS:
        raddr = (void*)(ctrl + 1);
        uct_rc_mlx5_ep_set_rdma_seg(raddr, remote_addr, rkey);

        switch (length) {
        case sizeof(uint32_t):
            opmod                        = UCT_IB_MLX5_OPMOD_EXT_ATOMIC(2);
            masked_cswap32               = (void*)(raddr + 1);
            masked_cswap32->swap         = swap_add;
            masked_cswap32->compare      = compare;
            masked_cswap32->swap_mask    = (uint32_t)-1;
            masked_cswap32->compare_mask = compare_mask;
            dptr                         = (void*)(masked_cswap32 + 1);
            wqe_size                     = sizeof(*ctrl) + sizeof(*raddr) +
                                           sizeof(*masked_cswap32) + sizeof(*dptr);
            break;
        case sizeof(uint64_t):
            opmod                        = UCT_IB_MLX5_OPMOD_EXT_ATOMIC(3); /* Ext. atomic, size 2**3 */
            masked_cswap64               = (void*)(raddr + 1);
            masked_cswap64->swap         = swap_add;
            masked_cswap64->compare      = compare;
            masked_cswap64->swap_mask    = (uint64_t)-1;
            masked_cswap64->compare_mask = compare_mask;
            dptr                         = (void*)(masked_cswap64 + 1);
            wqe_size                     = sizeof(*ctrl) + sizeof(*raddr) +
                                           sizeof(*masked_cswap64) + sizeof(*dptr);

            /* Handle QP wrap-around. It cannot happen in the middle of
             * masked-cswap segment, because it's still in the first BB.
             */
            ucs_assert((void*)dptr <= ep->tx.wq.qend);
            if (dptr == ep->tx.wq.qend) {
                dptr = ep->tx.wq.qstart;
            } else {
                ucs_assert((void*)masked_cswap64 < ep->tx.wq.qend);
            }
            break;
        default:
            ucs_assert(0);
        }

        uct_ib_mlx5_set_data_seg(dptr, buffer, length, *lkey_p);
        break;

    case MLX5_OPCODE_ATOMIC_MASKED_FA:
        ucs_assert(length == sizeof(uint32_t));
        raddr = (void*)(ctrl + 1);
        uct_rc_mlx5_ep_set_rdma_seg(raddr, remote_addr, rkey);

        opmod                         = UCT_IB_MLX5_OPMOD_EXT_ATOMIC(2);
        masked_fadd32                 = (void*)(raddr + 1);
        masked_fadd32->add            = swap_add;
        masked_fadd32->filed_boundary = 0;

        uct_ib_mlx5_set_data_seg((void*)(masked_fadd32 + 1), buffer, length,
                                 *lkey_p);
        wqe_size = sizeof(*ctrl) + sizeof(*raddr) + sizeof(*masked_fadd32) +
                   sizeof(*dptr);
        break;

    default:
        ucs_fatal("invalid send opcode");
    }

    uct_rc_mlx5_post_send(ep, ctrl, (opcode_flags & UCT_RC_MLX5_OPCODE_MASK),
                          opmod, signal, wqe_size);
}
void uct_ud_ep_process_rx(uct_ud_iface_t *iface, uct_ud_neth_t *neth,
                          unsigned byte_len, uct_ud_recv_skb_t *skb)
{
    uint32_t dest_id;
    uint32_t is_am, am_id;
    uct_ud_ep_t *ep = 0; /* todo: check why gcc complains about an uninitialized var */
    ucs_frag_list_ooo_type_t ooo_type;

    dest_id = uct_ud_neth_get_dest_id(neth);
    am_id   = uct_ud_neth_get_am_id(neth);
    is_am   = neth->packet_type & UCT_UD_PACKET_FLAG_AM;

    if (ucs_unlikely(dest_id == UCT_UD_EP_NULL_ID)) {
        /* must be connection request packet */
        uct_ud_iface_log_rx(iface, NULL, neth, byte_len);
        uct_ud_ep_rx_creq(iface, neth);
        goto out;
    } else if (ucs_unlikely(!ucs_ptr_array_lookup(&iface->eps, dest_id, ep) ||
                            (ep->ep_id != dest_id))) {
        uct_ud_iface_log_rx(iface, ep, neth, byte_len);
        /* TODO: in the future just drop the packet */
        ucs_fatal("Failed to find ep(%d)", dest_id);
        goto out;
    }

    uct_ud_iface_log_rx(iface, ep, neth, byte_len);
    ucs_assert(ep->ep_id != UCT_UD_EP_NULL_ID);
    UCT_UD_EP_HOOK_CALL_RX(ep, neth);

    uct_ud_ep_process_ack(iface, ep, neth->ack_psn);

    if (ucs_unlikely(neth->packet_type & UCT_UD_PACKET_FLAG_ACK_REQ)) {
        uct_ud_iface_queue_pending(iface, ep, UCT_UD_EP_OP_ACK);
    }

    if (ucs_unlikely(!is_am && (byte_len == sizeof(*neth)))) {
        goto out;
    }

    ooo_type = ucs_frag_list_insert(&ep->rx.ooo_pkts, &skb->ooo_elem, neth->psn);
    if (ucs_unlikely(ooo_type != UCS_FRAG_LIST_INSERT_FAST)) {
        ucs_fatal("Out of order is not implemented: got %d", ooo_type);
        goto out;
    }

    if (ucs_unlikely(!is_am && (neth->packet_type & UCT_UD_PACKET_FLAG_CTL))) {
        uct_ud_ep_rx_ctl(iface, ep, (uct_ud_ctl_hdr_t *)(neth + 1));
        goto out;
    }

    if (ucs_unlikely(!is_am && (neth->packet_type & UCT_UD_PACKET_FLAG_PUT))) {
        uct_ud_ep_rx_put(neth, byte_len);
        goto out;
    }

    uct_ib_iface_invoke_am(&iface->super, am_id, neth + 1,
                           byte_len - sizeof(*neth), &skb->super);
    return;

out:
    ucs_mpool_put(skb);
}