static ssize_t uct_ud_verbs_ep_am_bcopy(uct_ep_h tl_ep, uint8_t id,
                                        uct_pack_callback_t pack_cb, void *arg)
{
    uct_ud_verbs_ep_t *ep = ucs_derived_of(tl_ep, uct_ud_verbs_ep_t);
    uct_ud_verbs_iface_t *iface = ucs_derived_of(tl_ep->iface,
                                                 uct_ud_verbs_iface_t);
    uct_ud_send_skb_t *skb;
    ucs_status_t status;
    size_t length;

    uct_ud_enter(&iface->super);
    uct_ud_iface_progress_pending_tx(&iface->super);
    status = uct_ud_am_common(&iface->super, &ep->super, id, &skb);
    if (status != UCS_OK) {
        uct_ud_leave(&iface->super);
        return status;
    }

    length = uct_ud_skb_bcopy(skb, pack_cb, arg);

    uct_ud_verbs_ep_tx_skb(iface, ep, skb, 0);
    ucs_trace_data("TX(iface=%p): AM_BCOPY [%d] skb=%p buf=%p len=%u",
                   iface, id, skb, arg, skb->len);
    uct_ud_iface_complete_tx_skb(&iface->super, &ep->super, skb);
    uct_ud_leave(&iface->super);
    return length;
}
static ssize_t uct_ud_mlx5_ep_am_bcopy(uct_ep_h tl_ep, uint8_t id,
                                       uct_pack_callback_t pack_cb, void *arg)
{
    uct_ud_mlx5_ep_t *ep = ucs_derived_of(tl_ep, uct_ud_mlx5_ep_t);
    uct_ud_mlx5_iface_t *iface = ucs_derived_of(tl_ep->iface,
                                                uct_ud_mlx5_iface_t);
    uct_ud_send_skb_t *skb;
    ucs_status_t status;
    size_t length;

    uct_ud_enter(&iface->super);
    uct_ud_iface_progress_pending_tx(&iface->super);
    status = uct_ud_am_common(&iface->super, &ep->super, id, &skb);
    if (status != UCS_OK) {
        uct_ud_leave(&iface->super);
        return status;
    }

    length = uct_ud_skb_bcopy(skb, pack_cb, arg);

    uct_ud_mlx5_ep_tx_skb(iface, ep, skb);
    uct_ud_iface_complete_tx_skb(&iface->super, &ep->super, skb);
    UCT_TL_EP_STAT_OP(&ep->super.super, AM, BCOPY, length);
    uct_ud_leave(&iface->super);
    return length;
}
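/*
 * Illustrative caller-side sketch, not part of this file: both bcopy paths
 * above hand the skb buffer to a user pack callback, which copies the payload
 * in and returns the number of bytes packed. The message type and helper
 * below are hypothetical; only uct_ep_am_bcopy() and uct_pack_callback_t are
 * assumed from the UCT API.
 */
typedef struct {
    const void *data;
    size_t      length;
} example_msg_t;

static size_t example_pack_cb(void *dest, void *arg)
{
    example_msg_t *msg = arg;

    memcpy(dest, msg->data, msg->length); /* copy payload into the skb buffer
                                             provided by the transport */
    return msg->length;                   /* bytes packed == AM length */
}

/* Usage: ssize_t packed = uct_ep_am_bcopy(ep, am_id, example_pack_cb, &msg); */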
static ucs_status_t
uct_ud_mlx5_ep_am_short(uct_ep_h tl_ep, uint8_t id, uint64_t hdr,
                        const void *buffer, unsigned length)
{
    uct_ud_mlx5_ep_t *ep = ucs_derived_of(tl_ep, uct_ud_mlx5_ep_t);
    uct_ud_mlx5_iface_t *iface = ucs_derived_of(tl_ep->iface,
                                                uct_ud_mlx5_iface_t);
    struct mlx5_wqe_ctrl_seg *ctrl;
    struct mlx5_wqe_inl_data_seg *inl;
    uct_ud_am_short_hdr_t *am;
    uct_ud_neth_t *neth;
    unsigned wqe_size;
    uct_ud_send_skb_t *skb;

    /* Data is written directly into the tx wqe, so it is impossible to use
     * the common ud am code */
    UCT_CHECK_AM_ID(id);
    UCT_CHECK_LENGTH(sizeof(uct_ud_neth_t) + sizeof(hdr) + length,
                     iface->super.config.max_inline, "am_short");

    uct_ud_enter(&iface->super);
    uct_ud_iface_progress_pending_tx(&iface->super);
    skb = uct_ud_ep_get_tx_skb(&iface->super, &ep->super);
    if (!skb) {
        uct_ud_leave(&iface->super);
        return UCS_ERR_NO_RESOURCE;
    }

    ctrl = iface->tx.wq.curr;
    /* Set inline segment which has the AM id, AM header, and AM payload */
    inl = uct_ib_mlx5_get_next_seg(&iface->tx.wq, ctrl, UCT_UD_MLX5_WQE_SIZE);
    wqe_size = length + sizeof(*am) + sizeof(*neth);
    inl->byte_count = htonl(wqe_size | MLX5_INLINE_SEG);

    /* Assume that neth and am header fit into one bb */
    ucs_assert(sizeof(*am) + sizeof(*neth) < MLX5_SEND_WQE_BB);
    neth = (void*)(inl + 1);
    uct_ud_am_set_neth(neth, &ep->super, id);

    am = (void*)(neth + 1);
    am->hdr = hdr;
    uct_ib_mlx5_inline_copy(am + 1, buffer, length, &iface->tx.wq);

    wqe_size += UCT_UD_MLX5_WQE_SIZE + sizeof(*inl);
    UCT_CHECK_LENGTH(wqe_size, UCT_IB_MLX5_MAX_BB * MLX5_SEND_WQE_BB, "am_short");
    UCT_UD_EP_HOOK_CALL_TX(&ep->super, neth);
    uct_ud_mlx5_post_send(iface, ep, ctrl, wqe_size);

    skb->len = sizeof(*neth) + sizeof(*am);
    memcpy(skb->neth, neth, skb->len);
    uct_ud_iface_complete_tx_inl(&iface->super, &ep->super, skb,
                                 (char *)skb->neth + skb->len, buffer, length);
    UCT_TL_EP_STAT_OP(&ep->super.super, AM, SHORT, sizeof(hdr) + length);
    uct_ud_leave(&iface->super);
    return UCS_OK;
}
static ucs_status_t
uct_ud_mlx5_ep_put_short(uct_ep_h tl_ep, const void *buffer, unsigned length,
                         uint64_t remote_addr, uct_rkey_t rkey)
{
    uct_ud_mlx5_ep_t *ep = ucs_derived_of(tl_ep, uct_ud_mlx5_ep_t);
    uct_ud_mlx5_iface_t *iface = ucs_derived_of(tl_ep->iface,
                                                uct_ud_mlx5_iface_t);
    struct mlx5_wqe_ctrl_seg *ctrl;
    struct mlx5_wqe_inl_data_seg *inl;
    unsigned wqe_size;
    uct_ud_put_hdr_t *put_hdr;
    uct_ud_neth_t *neth;
    uct_ud_send_skb_t *skb;

    uct_ud_enter(&iface->super);
    uct_ud_iface_progress_pending_tx(&iface->super);
    skb = uct_ud_ep_get_tx_skb(&iface->super, &ep->super);
    if (!skb) {
        uct_ud_leave(&iface->super);
        return UCS_ERR_NO_RESOURCE;
    }

    ctrl = iface->tx.wq.curr;
    /* Set inline segment which has the put header and the payload */
    inl = uct_ib_mlx5_get_next_seg(&iface->tx.wq, ctrl, UCT_UD_MLX5_WQE_SIZE);
    wqe_size = length + sizeof(*put_hdr) + sizeof(*neth);
    inl->byte_count = htonl(wqe_size | MLX5_INLINE_SEG);

    /* Assume that neth and put header fit into one bb */
    ucs_assert(sizeof(*put_hdr) + sizeof(*neth) < MLX5_SEND_WQE_BB);
    neth = (void*)(inl + 1);
    uct_ud_neth_init_data(&ep->super, neth);
    uct_ud_neth_set_type_put(&ep->super, neth);
    uct_ud_neth_ack_req(&ep->super, neth);

    put_hdr = (uct_ud_put_hdr_t *)(neth + 1);
    put_hdr->rva = remote_addr;
    uct_ib_mlx5_inline_copy(put_hdr + 1, buffer, length, &iface->tx.wq);

    wqe_size += UCT_UD_MLX5_WQE_SIZE + sizeof(*inl);
    UCT_CHECK_LENGTH(wqe_size, UCT_IB_MLX5_MAX_BB * MLX5_SEND_WQE_BB, "put_short");
    UCT_UD_EP_HOOK_CALL_TX(&ep->super, neth);
    uct_ud_mlx5_post_send(iface, ep, ctrl, wqe_size);

    skb->len = sizeof(*neth) + sizeof(*put_hdr);
    memcpy(skb->neth, neth, skb->len);
    uct_ud_iface_complete_tx_inl(&iface->super, &ep->super, skb,
                                 (char *)skb->neth + skb->len, buffer, length);
    UCT_TL_EP_STAT_OP(&ep->super.super, PUT, SHORT, length);
    uct_ud_leave(&iface->super);
    return UCS_OK;
}
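/*
 * Layout sketch of the inline send WQE built by the two short paths above.
 * Offsets are illustrative: UCT_UD_MLX5_WQE_SIZE is assumed to cover the
 * control and UD datagram segments that precede the inline segment.
 *
 *   +---------------------------+ <- ctrl (iface->tx.wq.curr)
 *   | ctrl seg + UD av seg      |    UCT_UD_MLX5_WQE_SIZE bytes
 *   +---------------------------+ <- inl
 *   | inline data seg           |    byte_count | MLX5_INLINE_SEG
 *   +---------------------------+ <- neth
 *   | uct_ud_neth_t             |    UD reliability header
 *   +---------------------------+
 *   | am/put header             |    uct_ud_am_short_hdr_t / uct_ud_put_hdr_t
 *   +---------------------------+
 *   | payload (length bytes)    |    copied by uct_ib_mlx5_inline_copy()
 *   +---------------------------+
 */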
ucs_status_t uct_ud_verbs_ep_create_connected(uct_iface_h iface_h,
                                              const struct sockaddr *addr,
                                              uct_ep_h *new_ep_p)
{
    uct_ud_verbs_iface_t *iface = ucs_derived_of(iface_h, uct_ud_verbs_iface_t);
    uct_ud_verbs_ep_t *ep;
    uct_ud_ep_t *new_ud_ep;
    const uct_sockaddr_ib_t *if_addr = (const uct_sockaddr_ib_t *)addr;
    uct_ud_send_skb_t *skb;
    struct ibv_ah *ah;
    ucs_status_t status;

    uct_ud_enter(&iface->super);
    status = uct_ud_ep_create_connected_common(&iface->super, if_addr,
                                               &new_ud_ep, &skb);
    if (status != UCS_OK) {
        /* release the lock on the error path as well */
        uct_ud_leave(&iface->super);
        return status;
    }

    ep = ucs_derived_of(new_ud_ep, uct_ud_verbs_ep_t);
    *new_ep_p = &ep->super.super.super;
    if (skb == NULL) {
        uct_ud_leave(&iface->super);
        return UCS_OK;
    }

    ucs_assert_always(ep->ah == NULL);
    ah = uct_ib_create_ah(&iface->super.super, if_addr->lid);
    if (ah == NULL) {
        ucs_error("failed to create address handle: %m");
        status = UCS_ERR_INVALID_ADDR;
        goto err;
    }
    ep->ah = ah;

    ucs_trace_data("TX: CREQ (qp=%x lid=%d)", if_addr->qp_num, if_addr->lid);
    uct_ud_verbs_ep_tx_skb(iface, ep, skb, IBV_SEND_INLINE);
    uct_ud_iface_complete_tx_skb(&iface->super, &ep->super, skb);
    uct_ud_leave(&iface->super);
    return UCS_OK;

err:
    uct_ud_ep_destroy_connected(&ep->super, if_addr);
    uct_ud_leave(&iface->super);
    *new_ep_p = NULL;
    return status;
}
ucs_status_t uct_ud_mlx5_ep_create_connected(uct_iface_h iface_h,
                                             const struct sockaddr *addr,
                                             uct_ep_h *new_ep_p)
{
    uct_ud_mlx5_iface_t *iface = ucs_derived_of(iface_h, uct_ud_mlx5_iface_t);
    uct_ud_mlx5_ep_t *ep;
    uct_ud_ep_t *new_ud_ep;
    const uct_sockaddr_ib_t *if_addr = (const uct_sockaddr_ib_t *)addr;
    uct_ud_send_skb_t *skb;
    ucs_status_t status;

    uct_ud_enter(&iface->super);
    status = uct_ud_ep_create_connected_common(&iface->super, if_addr,
                                               &new_ud_ep, &skb);
    if (status != UCS_OK) {
        uct_ud_leave(&iface->super);
        return status;
    }

    ep = ucs_derived_of(new_ud_ep, uct_ud_mlx5_ep_t);
    *new_ep_p = &ep->super.super.super;
    if (skb == NULL) {
        uct_ud_leave(&iface->super);
        return UCS_OK;
    }

    status = uct_ud_mlx5_ep_create_ah(iface, ep, if_addr);
    if (status != UCS_OK) {
        goto err;
    }

    ucs_trace_data("TX: CREQ (qp=%x lid=%d)", if_addr->qp_num, if_addr->lid);
    uct_ud_mlx5_ep_tx_skb(iface, ep, skb);
    uct_ud_iface_complete_tx_skb(&iface->super, &ep->super, skb);
    uct_ud_leave(&iface->super);
    return UCS_OK;

err:
    uct_ud_ep_destroy_connected(&ep->super, if_addr);
    uct_ud_leave(&iface->super);
    *new_ep_p = NULL;
    return status;
}
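/*
 * Illustrative usage sketch, not part of this file: both functions above back
 * the generic uct_ep_create_connected() entry point, assuming the caller has
 * obtained the peer's uct_sockaddr_ib_t out of band. The wrapper name and
 * variable names below are hypothetical.
 */
static ucs_status_t example_connect(uct_iface_h iface,
                                    const uct_sockaddr_ib_t *peer_addr,
                                    uct_ep_h *ep_p)
{
    /* on success the transport may already have queued a CREQ control skb
     * to start the UD reliability handshake with the peer */
    return uct_ep_create_connected(iface, (const struct sockaddr *)peer_addr,
                                   ep_p);
}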
static ucs_status_t
uct_ud_verbs_ep_put_short(uct_ep_h tl_ep, const void *buffer, unsigned length,
                          uint64_t remote_addr, uct_rkey_t rkey)
{
    uct_ud_verbs_ep_t *ep = ucs_derived_of(tl_ep, uct_ud_verbs_ep_t);
    uct_ud_verbs_iface_t *iface = ucs_derived_of(tl_ep->iface,
                                                 uct_ud_verbs_iface_t);
    uct_ud_send_skb_t *skb;
    uct_ud_put_hdr_t *put_hdr;
    uct_ud_neth_t *neth;

    /* TODO: UCT_CHECK_LENGTH(length <= iface->config.max_inline, "put_short"); */
    uct_ud_enter(&iface->super);
    uct_ud_iface_progress_pending_tx(&iface->super);
    skb = uct_ud_ep_get_tx_skb(&iface->super, &ep->super);
    if (!skb) {
        uct_ud_leave(&iface->super);
        return UCS_ERR_NO_RESOURCE;
    }

    neth = skb->neth;
    uct_ud_neth_init_data(&ep->super, neth);
    uct_ud_neth_set_type_put(&ep->super, neth);
    uct_ud_neth_ack_req(&ep->super, neth);

    put_hdr = (uct_ud_put_hdr_t *)(neth + 1);
    put_hdr->rva = remote_addr;

    iface->tx.sge[0].addr   = (uintptr_t)neth;
    iface->tx.sge[0].length = sizeof(*neth) + sizeof(*put_hdr);

    uct_ud_verbs_ep_tx_inlv(iface, ep, buffer, length);
    ucs_trace_data("TX: PUT [%0llx] buf=%p len=%u",
                   (unsigned long long)remote_addr, buffer, length);

    skb->len = iface->tx.sge[0].length;
    uct_ud_iface_complete_tx_inl(&iface->super, &ep->super, skb,
                                 put_hdr + 1, buffer, length);
    uct_ud_leave(&iface->super);
    return UCS_OK;
}
static void uct_ud_mlx5_iface_progress(void *arg)
{
    uct_ud_mlx5_iface_t *iface = arg;
    ucs_status_t status;

    uct_ud_enter(&iface->super);
    status = uct_ud_mlx5_iface_poll_rx(iface);
    if (status == UCS_ERR_NO_PROGRESS) {
        uct_ud_mlx5_iface_poll_tx(iface);
    }
    uct_ud_iface_progress_pending(&iface->super, 0);
    uct_ud_leave(&iface->super);
}
static ucs_status_t
uct_ud_verbs_ep_am_short(uct_ep_h tl_ep, uint8_t id, uint64_t hdr,
                         const void *buffer, unsigned length)
{
    uct_ud_verbs_ep_t *ep = ucs_derived_of(tl_ep, uct_ud_verbs_ep_t);
    uct_ud_verbs_iface_t *iface = ucs_derived_of(tl_ep->iface,
                                                 uct_ud_verbs_iface_t);
    uct_ud_send_skb_t *skb;
    uct_ud_am_short_hdr_t *am_hdr;
    ucs_status_t status;

    UCT_CHECK_LENGTH(sizeof(uct_ud_neth_t) + sizeof(hdr) + length,
                     iface->super.config.max_inline, "am_short");

    uct_ud_enter(&iface->super);
    uct_ud_iface_progress_pending_tx(&iface->super);
    status = uct_ud_am_common(&iface->super, &ep->super, id, &skb);
    if (status != UCS_OK) {
        uct_ud_leave(&iface->super);
        return status;
    }

    am_hdr = (uct_ud_am_short_hdr_t *)(skb->neth + 1);
    am_hdr->hdr = hdr;
    iface->tx.sge[0].length = sizeof(uct_ud_neth_t) + sizeof(*am_hdr);
    iface->tx.sge[0].addr   = (uintptr_t)skb->neth;

    uct_ud_verbs_ep_tx_inlv(iface, ep, buffer, length);
    ucs_trace_data("TX: AM [%d] buf=%p len=%u", id, buffer, length);

    skb->len = iface->tx.sge[0].length;
    uct_ud_iface_complete_tx_inl(&iface->super, &ep->super, skb,
                                 am_hdr + 1, buffer, length);
    uct_ud_leave(&iface->super);
    return UCS_OK;
}
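/*
 * Illustrative caller-side sketch, not part of this file: uct_ep_am_short()
 * carries a 64-bit immediate header plus a small inline payload. The retry
 * policy below (progress once, then retry) is a hypothetical example, not a
 * policy prescribed by the library.
 */
static ucs_status_t example_send_am_short(uct_ep_h ep, uct_worker_h worker,
                                          uint8_t am_id, uint64_t hdr,
                                          const void *payload, unsigned length)
{
    ucs_status_t status;

    status = uct_ep_am_short(ep, am_id, hdr, payload, length);
    if (status == UCS_ERR_NO_RESOURCE) {
        /* no tx skb was available; let the iface make progress and retry */
        uct_worker_progress(worker);
        status = uct_ep_am_short(ep, am_id, hdr, payload, length);
    }
    return status;
}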
static UCS_CLASS_INIT_FUNC(uct_ud_verbs_iface_t, uct_pd_h pd, uct_worker_h worker,
                           const char *dev_name, size_t rx_headroom,
                           const uct_iface_config_t *tl_config)
{
    uct_ud_iface_config_t *config = ucs_derived_of(tl_config,
                                                   uct_ud_iface_config_t);

    ucs_trace_func("");

    UCS_CLASS_CALL_SUPER_INIT(uct_ud_iface_t, &uct_ud_verbs_iface_ops, pd,
                              worker, dev_name, rx_headroom, 0, config);

    self->super.ops.async_progress = uct_ud_verbs_iface_async_progress;
    self->super.ops.tx_skb         = uct_ud_verbs_ep_tx_ctl_skb;

    if (self->super.config.rx_max_batch < UCT_IB_MAX_WC) {
        ucs_warn("max batch is too low (%d < %d), performance may be impacted",
                 self->super.config.rx_max_batch, UCT_IB_MAX_WC);
    }

    while (self->super.rx.available >= self->super.config.rx_max_batch) {
        uct_ud_verbs_iface_post_recv(self);
    }

    /* pre-built work request for inline (short) sends */
    memset(&self->tx.wr_inl, 0, sizeof(self->tx.wr_inl));
    self->tx.wr_inl.opcode            = IBV_WR_SEND;
    self->tx.wr_inl.wr_id             = 0xBEEBBEEB;
    self->tx.wr_inl.wr.ud.remote_qkey = UCT_IB_QKEY;
    self->tx.wr_inl.imm_data          = 0;
    self->tx.wr_inl.next              = 0;
    self->tx.wr_inl.sg_list           = self->tx.sge;
    self->tx.wr_inl.num_sge           = UCT_UD_MAX_SGE;

    /* pre-built work request for skb (bcopy) sends */
    memset(&self->tx.wr_skb, 0, sizeof(self->tx.wr_skb));
    self->tx.wr_skb.opcode            = IBV_WR_SEND;
    self->tx.wr_skb.wr_id             = 0xFAAFFAAF;
    self->tx.wr_skb.wr.ud.remote_qkey = UCT_IB_QKEY;
    self->tx.wr_skb.imm_data          = 0;
    self->tx.wr_skb.next              = 0;
    self->tx.wr_skb.sg_list           = self->tx.sge;
    self->tx.wr_skb.num_sge           = 1;

    /* TODO: add progress on first ep creation */
    uct_worker_progress_register(worker, uct_ud_verbs_iface_progress, self);
    uct_ud_leave(&self->super);
    return UCS_OK;
}
static UCS_CLASS_INIT_FUNC(uct_ud_mlx5_iface_t, uct_pd_h pd, uct_worker_h worker,
                           const char *dev_name, size_t rx_headroom,
                           const uct_iface_config_t *tl_config)
{
    uct_ud_iface_config_t *config = ucs_derived_of(tl_config,
                                                   uct_ud_iface_config_t);
    ucs_status_t status;
    int i;

    ucs_trace_func("");

    UCS_CLASS_CALL_SUPER_INIT(uct_ud_iface_t, &uct_ud_mlx5_iface_ops, pd,
                              worker, dev_name, rx_headroom, 0, config);

    self->super.ops.async_progress = uct_ud_mlx5_iface_async_progress;
    self->super.ops.tx_skb         = uct_ud_mlx5_ep_tx_ctl_skb;

    status = uct_ib_mlx5_get_cq(self->super.super.send_cq, &self->tx.cq);
    if (status != UCS_OK) {
        return status;
    }
    if (uct_ib_mlx5_cqe_size(&self->tx.cq) != sizeof(struct mlx5_cqe64)) {
        ucs_error("TX CQE size (%d) is not %d",
                  uct_ib_mlx5_cqe_size(&self->tx.cq),
                  (int)sizeof(struct mlx5_cqe64));
        return UCS_ERR_IO_ERROR;
    }

    status = uct_ib_mlx5_get_cq(self->super.super.recv_cq, &self->rx.cq);
    if (status != UCS_OK) {
        return UCS_ERR_IO_ERROR;
    }
    if (uct_ib_mlx5_cqe_size(&self->rx.cq) != sizeof(struct mlx5_cqe64)) {
        ucs_error("RX CQE size (%d) is not %d",
                  uct_ib_mlx5_cqe_size(&self->rx.cq),
                  (int)sizeof(struct mlx5_cqe64));
        return UCS_ERR_IO_ERROR;
    }

    status = uct_ib_mlx5_get_txwq(self->super.super.super.worker,
                                  self->super.qp, &self->tx.wq);
    if (status != UCS_OK) {
        return UCS_ERR_IO_ERROR;
    }
    self->super.tx.available = self->tx.wq.bb_max;

    status = uct_ib_mlx5_get_rxwq(self->super.qp, &self->rx.wq);
    if (status != UCS_OK) {
        return UCS_ERR_IO_ERROR;
    }

    /* write buffer sizes */
    for (i = 0; i <= self->rx.wq.mask; i++) {
        self->rx.wq.wqes[i].byte_count =
            htonl(self->super.super.config.rx_payload_offset +
                  self->super.super.config.seg_size);
    }

    while (self->super.rx.available >= self->super.config.rx_max_batch) {
        uct_ud_mlx5_iface_post_recv(self);
    }

    /* TODO: add progress on first ep creation */
    uct_worker_progress_register(worker, uct_ud_mlx5_iface_progress, self);
    uct_ud_leave(&self->super);
    return UCS_OK;
}