static UCS_CLASS_INIT_FUNC(uct_rc_verbs_iface_t, uct_pd_h pd, uct_worker_h worker,
                           const char *dev_name, size_t rx_headroom,
                           const uct_iface_config_t *tl_config)
{
    uct_rc_verbs_iface_config_t *config =
                    ucs_derived_of(tl_config, uct_rc_verbs_iface_config_t);
    struct ibv_exp_device_attr *dev_attr;
    size_t am_hdr_size;
    ucs_status_t status;
    struct ibv_qp_cap cap;
    struct ibv_qp *qp;

    extern uct_iface_ops_t uct_rc_verbs_iface_ops;
    UCS_CLASS_CALL_SUPER_INIT(uct_rc_iface_t, &uct_rc_verbs_iface_ops, pd, worker,
                              dev_name, rx_headroom, 0, &config->super);

    /* Initialize inline work requests */
    memset(&self->inl_am_wr, 0, sizeof(self->inl_am_wr));
    self->inl_am_wr.sg_list        = self->inl_sge;
    self->inl_am_wr.num_sge        = 2;
    self->inl_am_wr.opcode         = IBV_WR_SEND;
    self->inl_am_wr.send_flags     = IBV_SEND_INLINE;

    memset(&self->inl_rwrite_wr, 0, sizeof(self->inl_rwrite_wr));
    self->inl_rwrite_wr.sg_list    = self->inl_sge;
    self->inl_rwrite_wr.num_sge    = 1;
    self->inl_rwrite_wr.opcode     = IBV_WR_RDMA_WRITE;
    self->inl_rwrite_wr.send_flags = IBV_SEND_SIGNALED | IBV_SEND_INLINE;

    memset(self->inl_sge, 0, sizeof(self->inl_sge));

    /* Configuration */
    am_hdr_size = ucs_max(config->max_am_hdr, sizeof(uct_rc_hdr_t));
    self->config.short_desc_size = ucs_max(UCT_RC_MAX_ATOMIC_SIZE, am_hdr_size);
    dev_attr = &uct_ib_iface_device(&self->super.super)->dev_attr;
    if (IBV_EXP_HAVE_ATOMIC_HCA(dev_attr) || IBV_EXP_HAVE_ATOMIC_GLOB(dev_attr)) {
        self->config.atomic32_handler = uct_rc_ep_atomic_handler_32_be0;
        self->config.atomic64_handler = uct_rc_ep_atomic_handler_64_be0;
    } else if (IBV_EXP_HAVE_ATOMIC_HCA_REPLY_BE(dev_attr)) {
        self->config.atomic32_handler = uct_rc_ep_atomic_handler_32_be1;
        self->config.atomic64_handler = uct_rc_ep_atomic_handler_64_be1;
    }

    /* Create a dummy QP in order to find out max_inline */
    status = uct_rc_iface_qp_create(&self->super, &qp, &cap);
    if (status != UCS_OK) {
        goto err;
    }
    ibv_destroy_qp(qp);
    self->config.max_inline = cap.max_inline_data;

    /* Create AM headers and Atomic mempool */
    status = uct_iface_mpool_create(&self->super.super.super.super,
                                    sizeof(uct_rc_iface_send_desc_t) +
                                        self->config.short_desc_size,
                                    sizeof(uct_rc_iface_send_desc_t),
                                    UCS_SYS_CACHE_LINE_SIZE,
                                    &config->super.super.tx.mp,
                                    self->super.config.tx_qp_len,
                                    uct_rc_iface_send_desc_init,
                                    "rc_verbs_short_desc", &self->short_desc_mp);
    if (status != UCS_OK) {
        goto err;
    }

    while (self->super.rx.available > 0) {
        if (uct_rc_verbs_iface_post_recv(self, 1) == 0) {
            ucs_error("failed to post receives");
            status = UCS_ERR_NO_MEMORY;
            goto err_destroy_short_desc_mp;
        }
    }

    ucs_notifier_chain_add(&worker->progress_chain, uct_rc_verbs_iface_progress,
                           self);
    return UCS_OK;

err_destroy_short_desc_mp:
    ucs_mpool_destroy(self->short_desc_mp);
err:
    return status;
}
static ucs_status_t uct_ugni_ep_get_composed_fma_rdma(uct_ep_h tl_ep, void *buffer,
                                                      size_t length, uct_mem_h memh,
                                                      uint64_t remote_addr,
                                                      uct_rkey_t rkey,
                                                      uct_completion_t *comp)
{
    uct_ugni_rdma_fetch_desc_t *fma = NULL;
    uct_ugni_ep_t *ep = ucs_derived_of(tl_ep, uct_ugni_ep_t);
    uct_ugni_rdma_iface_t *iface = ucs_derived_of(tl_ep->iface,
                                                  uct_ugni_rdma_iface_t);
    uct_ugni_rdma_fetch_desc_t *rdma = NULL;
    size_t fma_length, rdma_length, aligned_fma_remote_start;
    uint64_t fma_remote_start, rdma_remote_start;
    ucs_status_t post_result;

    rdma_length = length - iface->config.fma_seg_size;
    fma_length  = iface->config.fma_seg_size;

    UCT_TL_IFACE_GET_TX_DESC(&iface->super.super, &iface->free_desc_get_buffer,
                             fma, return UCS_ERR_NO_RESOURCE);
    UCT_TL_IFACE_GET_TX_DESC(&iface->super.super, &iface->free_desc_get,
                             rdma, return UCS_ERR_NO_RESOURCE);

    rdma_remote_start        = remote_addr;
    fma_remote_start         = rdma_remote_start + rdma_length;
    aligned_fma_remote_start = ucs_align_up_pow2(fma_remote_start,
                                                 UGNI_GET_ALIGN);

    uct_ugni_format_get_fma(fma, GNI_POST_FMA_GET, aligned_fma_remote_start,
                            rkey, fma_length, ep, comp,
                            uct_ugni_unalign_fma_composed_cb, NULL, NULL);
    fma->tail = aligned_fma_remote_start - fma_remote_start;

    uct_ugni_format_unaligned_rdma(rdma, GNI_POST_RDMA_GET, buffer,
                                   rdma_remote_start, memh, rkey,
                                   rdma_length + fma->tail, ep,
                                   iface->super.local_cq, comp,
                                   uct_ugni_unalign_rdma_composed_cb);

    fma->head  = rdma;
    rdma->head = fma;
    fma->network_completed_bytes = rdma->network_completed_bytes = 0;
    fma->user_buffer             = rdma->user_buffer             = buffer;
    fma->expected_bytes          = rdma->expected_bytes          =
        fma->super.desc.length + rdma->super.desc.length;

    ucs_trace_data("Posting split GET ZCOPY, GNI_PostFma of size %"PRIx64" (%lu) "
                   "from %p to %p, with [%"PRIx64" %"PRIx64"] and GNI_PostRdma "
                   "of size %"PRIx64" (%lu) from %p to %p, with [%"PRIx64" %"PRIx64"]",
                   fma->super.desc.length, length,
                   (void *)fma->super.desc.local_addr,
                   (void *)fma->super.desc.remote_addr,
                   fma->super.desc.remote_mem_hndl.qword1,
                   fma->super.desc.remote_mem_hndl.qword2,
                   rdma->super.desc.length, length,
                   (void *)rdma->super.desc.local_addr,
                   (void *)rdma->super.desc.remote_addr,
                   rdma->super.desc.remote_mem_hndl.qword1,
                   rdma->super.desc.remote_mem_hndl.qword2);

    UCT_TL_EP_STAT_OP(ucs_derived_of(tl_ep, uct_base_ep_t), GET, ZCOPY, length);

    post_result = uct_ugni_post_fma(iface, ep, &(fma->super), UCS_INPROGRESS);
    if ((post_result != UCS_OK) && (post_result != UCS_INPROGRESS)) {
        ucs_mpool_put(rdma);
        return post_result;
    }
    return uct_ugni_post_rdma(iface, ep, &(rdma->super));
}
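/*
 * Editor's sketch (not part of the transport): the composed GET above splits a
 * large fetch into an RDMA bulk part plus an FMA tail of fma_seg_size bytes,
 * and aligns the FMA start up to UGNI_GET_ALIGN; the RDMA part then absorbs
 * the alignment gap ("tail"). The self-contained helper below restates that
 * arithmetic so it can be checked in isolation; the DEMO_* constants and
 * function names are assumptions for illustration, not uGNI/UCT API.
 */
#include <assert.h>
#include <stddef.h>
#include <stdint.h>

#define DEMO_FMA_SEG_SIZE 2048   /* assumed value of iface->config.fma_seg_size */
#define DEMO_GET_ALIGN    4      /* assumed value of UGNI_GET_ALIGN */

static inline uint64_t demo_align_up_pow2(uint64_t v, uint64_t align)
{
    return (v + align - 1) & ~(align - 1);  /* align must be a power of two */
}

static void demo_split_get(uint64_t remote_addr, size_t length,
                           uint64_t *rdma_start, size_t *rdma_len,
                           uint64_t *fma_start, size_t *fma_len, size_t *tail)
{
    *rdma_len   = length - DEMO_FMA_SEG_SIZE;  /* bulk part */
    *fma_len    = DEMO_FMA_SEG_SIZE;           /* tail part */
    *rdma_start = remote_addr;
    *fma_start  = demo_align_up_pow2(remote_addr + *rdma_len, DEMO_GET_ALIGN);
    /* bytes between the natural and the aligned FMA start; the RDMA part is
     * extended by this amount, matching rdma_length + fma->tail above */
    *tail       = (size_t)(*fma_start - (remote_addr + *rdma_len));
    assert(*rdma_len + *tail + *fma_len >= length);  /* full coverage */
}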
/*
 * Generic data-pointer posting function.
 * Parameters which are not relevant to the opcode are ignored.
 *
 *            +--------+--------+-------+--------+-------+
 * SEND       | CTRL   | INL    | am_id | am_hdr | DPSEG |
 *            +--------+--------+-------+--------+-------+
 * RDMA_WRITE | CTRL   | RADDR  | DPSEG  |
 *            +--------+--------+--------+
 * ATOMIC     | CTRL   | RADDR  | ATOMIC | DPSEG |
 *            +--------+--------+--------+-------+
 */
static UCS_F_ALWAYS_INLINE ucs_status_t
uct_rc_mlx5_ep_dptr_post(uct_rc_mlx5_ep_t *ep, unsigned opcode_flags,
                         const void *buffer, unsigned length, uint32_t *lkey_p,
                         /* SEND */ uint8_t am_id, const void *am_hdr,
                         unsigned am_hdr_len,
                         /* RDMA/ATOMIC */ uint64_t remote_addr, uct_rkey_t rkey,
                         /* ATOMIC */ uint64_t compare_mask, uint64_t compare,
                         uint64_t swap_add, int signal)
{
    struct mlx5_wqe_ctrl_seg                     *ctrl;
    struct mlx5_wqe_raddr_seg                    *raddr;
    struct mlx5_wqe_atomic_seg                   *atomic;
    struct mlx5_wqe_data_seg                     *dptr;
    struct mlx5_wqe_inl_data_seg                 *inl;
    struct uct_ib_mlx5_atomic_masked_cswap32_seg *masked_cswap32;
    struct uct_ib_mlx5_atomic_masked_fadd32_seg  *masked_fadd32;
    struct uct_ib_mlx5_atomic_masked_cswap64_seg *masked_cswap64;
    uct_rc_mlx5_iface_t *iface;
    uct_rc_hdr_t        *rch;
    unsigned             wqe_size, inl_seg_size;
    uint8_t              opmod;

    iface = ucs_derived_of(ep->super.super.super.iface, uct_rc_mlx5_iface_t);
    if (!signal) {
        signal = uct_rc_iface_tx_moderation(&iface->super, &ep->super,
                                            MLX5_WQE_CTRL_CQ_UPDATE);
    } else {
        ucs_assert(signal == MLX5_WQE_CTRL_CQ_UPDATE);
    }

    opmod = 0;
    ctrl  = ep->tx.seg;
    switch (opcode_flags) {
    case MLX5_OPCODE_SEND:
        UCT_CHECK_LENGTH(length + sizeof(*rch) + am_hdr_len,
                         iface->super.super.config.seg_size, "am_zcopy payload");
        inl_seg_size = ucs_align_up_pow2(sizeof(*inl) + sizeof(*rch) + am_hdr_len,
                                         UCT_IB_MLX5_WQE_SEG_SIZE);
        UCT_CHECK_LENGTH(sizeof(*ctrl) + inl_seg_size + sizeof(*dptr),
                         UCT_RC_MLX5_MAX_BB * MLX5_SEND_WQE_BB, "am_zcopy header");

        /* Inline segment with AM ID and header */
        inl             = (void*)(ctrl + 1);
        inl->byte_count = htonl((sizeof(*rch) + am_hdr_len) | MLX5_INLINE_SEG);
        rch             = (void*)(inl + 1);
        rch->am_id      = am_id;
        uct_rc_mlx5_inline_copy(rch + 1, am_hdr, am_hdr_len, ep);

        /* Data segment with payload */
        if (length == 0) {
            wqe_size = sizeof(*ctrl) + inl_seg_size;
        } else {
            wqe_size = sizeof(*ctrl) + inl_seg_size + sizeof(*dptr);
            dptr     = (void*)(ctrl + 1) + inl_seg_size;
            if (ucs_unlikely((void*)dptr >= ep->tx.qend)) {
                dptr = (void*)dptr - (ep->tx.qend - ep->tx.qstart);
            }
            ucs_assert((void*)dptr       >= ep->tx.qstart);
            ucs_assert((void*)(dptr + 1) <= ep->tx.qend);
            uct_rc_mlx5_ep_set_dptr_seg(dptr, buffer, length, *lkey_p);
        }
        break;

    case MLX5_OPCODE_SEND|UCT_RC_MLX5_OPCODE_FLAG_RAW:
        /* Data segment only */
        UCT_CHECK_LENGTH(length, iface->super.super.config.seg_size, "send");
        ucs_assert(length < (2ul << 30));
        wqe_size = sizeof(*ctrl) + sizeof(*dptr);
        uct_rc_mlx5_ep_set_dptr_seg((void*)(ctrl + 1), buffer, length, *lkey_p);
        break;

    case MLX5_OPCODE_RDMA_READ:
    case MLX5_OPCODE_RDMA_WRITE:
        /* Set RDMA segment */
        UCT_CHECK_LENGTH(length, UCT_IB_MAX_MESSAGE_SIZE, "put/get");
        raddr = (void*)(ctrl + 1);
        uct_rc_mlx5_ep_set_rdma_seg(raddr, remote_addr, rkey);

        /* Data segment */
        if (length == 0) {
            wqe_size = sizeof(*ctrl) + sizeof(*raddr);
        } else {
            wqe_size = sizeof(*ctrl) + sizeof(*raddr) + sizeof(*dptr);
            uct_rc_mlx5_ep_set_dptr_seg((void*)(raddr + 1), buffer, length,
                                        *lkey_p);
        }
        break;

    case MLX5_OPCODE_ATOMIC_FA:
    case MLX5_OPCODE_ATOMIC_CS:
        ucs_assert(length == sizeof(uint64_t));
        raddr = (void*)(ctrl + 1);
        uct_rc_mlx5_ep_set_rdma_seg(raddr, remote_addr, rkey);

        atomic = (void*)(raddr + 1);
        if (opcode_flags == MLX5_OPCODE_ATOMIC_CS) {
            atomic->compare = compare;
        }
        atomic->swap_add = swap_add;

        uct_rc_mlx5_ep_set_dptr_seg((void*)(atomic + 1), buffer, length, *lkey_p);
        wqe_size = sizeof(*ctrl) + sizeof(*raddr) + sizeof(*atomic) +
                   sizeof(*dptr);
        break;

    case MLX5_OPCODE_ATOMIC_MASKED_CS:
        raddr = (void*)(ctrl + 1);
        uct_rc_mlx5_ep_set_rdma_seg(raddr, remote_addr, rkey);

        switch (length) {
        case sizeof(uint32_t):
            opmod                        = UCT_IB_MLX5_OPMOD_EXT_ATOMIC(2);
            masked_cswap32               = (void*)(raddr + 1);
            masked_cswap32->swap         = swap_add;
            masked_cswap32->compare      = compare;
            masked_cswap32->swap_mask    = (uint32_t)-1;
            masked_cswap32->compare_mask = compare_mask;
            dptr                         = (void*)(masked_cswap32 + 1);
            wqe_size                     = sizeof(*ctrl) + sizeof(*raddr) +
                                           sizeof(*masked_cswap32) + sizeof(*dptr);
            break;
        case sizeof(uint64_t):
            opmod                        = UCT_IB_MLX5_OPMOD_EXT_ATOMIC(3); /* Ext. atomic, size 2**3 */
            masked_cswap64               = (void*)(raddr + 1);
            masked_cswap64->swap         = swap_add;
            masked_cswap64->compare      = compare;
            masked_cswap64->swap_mask    = (uint64_t)-1;
            masked_cswap64->compare_mask = compare_mask;
            dptr                         = (void*)(masked_cswap64 + 1);
            wqe_size                     = sizeof(*ctrl) + sizeof(*raddr) +
                                           sizeof(*masked_cswap64) + sizeof(*dptr);

            /* Handle QP wrap-around. It cannot happen in the middle of
             * masked-cswap segment, because it's still in the first BB.
             */
            ucs_assert((void*)dptr <= ep->tx.qend);
            if (dptr == ep->tx.qend) {
                dptr = ep->tx.qstart;
            } else {
                ucs_assert((void*)masked_cswap64 < ep->tx.qend);
            }
            break;
        default:
            ucs_assert(0);
        }
        uct_rc_mlx5_ep_set_dptr_seg(dptr, buffer, length, *lkey_p);
        break;

    case MLX5_OPCODE_ATOMIC_MASKED_FA:
        ucs_assert(length == sizeof(uint32_t));
        raddr = (void*)(ctrl + 1);
        uct_rc_mlx5_ep_set_rdma_seg(raddr, remote_addr, rkey);

        opmod                         = UCT_IB_MLX5_OPMOD_EXT_ATOMIC(2);
        masked_fadd32                 = (void*)(raddr + 1);
        masked_fadd32->add            = swap_add;
        masked_fadd32->filed_boundary = 0;

        uct_rc_mlx5_ep_set_dptr_seg((void*)(masked_fadd32 + 1), buffer, length,
                                    *lkey_p);
        wqe_size = sizeof(*ctrl) + sizeof(*raddr) + sizeof(*masked_fadd32) +
                   sizeof(*dptr);
        break;

    default:
        return UCS_ERR_INVALID_PARAM;
    }

    uct_rc_mlx5_post_send(ep, ctrl, (opcode_flags & UCT_RC_MLX5_OPCODE_MASK),
                          opmod, signal, wqe_size);
    return UCS_OK;
}
static inline int uct_mm_ep_has_tx_resources(uct_mm_ep_t *ep)
{
    uct_mm_iface_t *iface = ucs_derived_of(ep->super.super.iface, uct_mm_iface_t);
    return UCT_MM_EP_IS_ABLE_TO_SEND(ep->fifo_ctl->head, ep->cached_tail,
                                     iface->config.fifo_size);
}
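/*
 * Editor's sketch: UCT_MM_EP_IS_ABLE_TO_SEND is presumably a head/tail
 * distance test on the circular FIFO, where the sender may run at most
 * fifo_size elements ahead of the last tail value it cached. A minimal,
 * self-contained version of that test; this is an assumption about the
 * macro's semantics, not its actual definition:
 */
#include <stdint.h>

static inline int demo_fifo_can_send(uint64_t head, uint64_t cached_tail,
                                     uint64_t fifo_size)
{
    /* head and tail are monotonically increasing counters, so unsigned
     * subtraction gives the number of elements in flight even on wrap */
    return (head - cached_tail) < fifo_size;
}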
static UCS_CLASS_INIT_FUNC(uct_ugni_smsg_iface_t, uct_md_h md, uct_worker_h worker,
                           const uct_iface_params_t *params,
                           const uct_iface_config_t *tl_config)
{
    uct_ugni_iface_config_t *config = ucs_derived_of(tl_config,
                                                     uct_ugni_iface_config_t);
    ucs_status_t status;
    gni_return_t ugni_rc;
    unsigned int bytes_per_mbox;
    gni_smsg_attr_t smsg_attr;

    pthread_mutex_lock(&uct_ugni_global_lock);

    UCS_CLASS_CALL_SUPER_INIT(uct_ugni_iface_t, md, worker, params,
                              &uct_ugni_smsg_iface_ops,
                              &config->super UCS_STATS_ARG(NULL));

    /* Setting initial configuration */
    self->config.smsg_seg_size       = 2048;
    self->config.rx_headroom         = params->rx_headroom;
    self->config.smsg_max_retransmit = 16;
    self->config.smsg_max_credit     = 8;
    self->smsg_id                    = 0;

    smsg_attr.msg_type       = GNI_SMSG_TYPE_MBOX_AUTO_RETRANSMIT;
    smsg_attr.mbox_maxcredit = self->config.smsg_max_credit;
    smsg_attr.msg_maxsize    = self->config.smsg_seg_size;

    /* Check the return code before using the output value */
    ugni_rc = GNI_SmsgBufferSizeNeeded(&(smsg_attr), &bytes_per_mbox);
    if (ugni_rc != GNI_RC_SUCCESS) {
        ucs_error("Smsg buffer size calculation failed");
        status = UCS_ERR_INVALID_PARAM;
        goto exit;
    }
    self->bytes_per_mbox = ucs_align_up_pow2(bytes_per_mbox, ucs_get_page_size());

    status = ucs_mpool_init(&self->free_desc, 0,
                            self->config.smsg_seg_size +
                                sizeof(uct_ugni_smsg_desc_t),
                            0,
                            UCS_SYS_CACHE_LINE_SIZE,    /* alignment */
                            128,                        /* grow */
                            config->mpool.max_bufs,     /* max buffers */
                            &uct_ugni_smsg_desc_mpool_ops, "UGNI-SMSG-DESC");
    if (UCS_OK != status) {
        ucs_error("Desc Mpool creation failed");
        goto exit;
    }

    status = ucs_mpool_init(&self->free_mbox, 0,
                            self->bytes_per_mbox + sizeof(uct_ugni_smsg_mbox_t),
                            sizeof(uct_ugni_smsg_mbox_t),
                            UCS_SYS_CACHE_LINE_SIZE,    /* alignment */
                            128,                        /* grow */
                            config->mpool.max_bufs,     /* max buffers */
                            &uct_ugni_smsg_mbox_mpool_ops, "UGNI-SMSG-MBOX");
    if (UCS_OK != status) {
        ucs_error("Mbox Mpool creation failed");
        goto clean_desc;
    }

    UCT_TL_IFACE_GET_TX_DESC(&self->super.super, &self->free_desc,
                             self->user_desc, self->user_desc = NULL);

    status = ugni_smsg_activate_iface(self);
    if (UCS_OK != status) {
        ucs_error("Failed to activate the interface");
        goto clean_mbox;
    }

    ugni_rc = GNI_SmsgSetMaxRetrans(self->super.nic_handle,
                                    self->config.smsg_max_retransmit);
    if (ugni_rc != GNI_RC_SUCCESS) {
        ucs_error("Smsg setting max retransmit count failed.");
        status = UCS_ERR_INVALID_PARAM;
        goto clean_iface;
    }

    /* TBD: eventually the uct_ugni_progress has to be moved to
     * udt layer so each ugni layer will have own progress */
    uct_worker_progress_register(worker, uct_ugni_smsg_progress, self);
    pthread_mutex_unlock(&uct_ugni_global_lock);
    return UCS_OK;

clean_iface:
    ugni_smsg_deactivate_iface(self);
clean_mbox:
    /* user_desc is only valid once it was taken from free_desc above */
    if (self->user_desc != NULL) {
        ucs_mpool_put(self->user_desc);
    }
    ucs_mpool_cleanup(&self->free_mbox, 1);
clean_desc:
    ucs_mpool_cleanup(&self->free_desc, 1);
exit:
    ucs_error("Failed to activate interface");
    pthread_mutex_unlock(&uct_ugni_global_lock);
    return status;
}
static void uct_ugni_udt_progress(void *arg)
{
    uint32_t rem_addr, rem_id;
    uint64_t id;
    void *payload;
    void *user_desc;
    ucs_status_t status;
    uct_ugni_udt_desc_t *desc;
    uct_ugni_udt_header_t *header;
    uct_ugni_udt_iface_t *iface = (uct_ugni_udt_iface_t *)arg;
    uct_ugni_udt_ep_t *ep;
    gni_ep_handle_t ugni_ep;
    gni_post_state_t post_state;
    gni_return_t ugni_rc;

    pthread_mutex_lock(&uct_ugni_global_lock);

    ugni_rc = GNI_PostDataProbeById(iface->super.nic_handle, &id);
    if (ucs_unlikely(GNI_RC_SUCCESS != ugni_rc)) {
        if (GNI_RC_NO_MATCH != ugni_rc) {
            ucs_error("GNI_PostDataProbeById, Error status: %s %d",
                      gni_err_str[ugni_rc], ugni_rc);
        }
        goto exit;
    }

    if (UCT_UGNI_UDT_ANY == id) {
        /* New incoming message */
        ep      = NULL;
        ugni_ep = iface->ep_any;
        desc    = iface->desc_any;
    } else {
        /* Ack message */
        ep = ucs_derived_of(uct_ugni_iface_lookup_ep(&iface->super, id),
                            uct_ugni_udt_ep_t);
        if (ucs_unlikely(NULL == ep)) {
            ucs_error("Can not lookup ep with id %"PRIx64, id);
            goto exit;
        }
        ugni_ep = ep->super.ep;
        desc    = ep->posted_desc;
    }

    ugni_rc = GNI_EpPostDataWaitById(ugni_ep, id, -1, &post_state, &rem_addr,
                                     &rem_id);
    if (ucs_unlikely(GNI_RC_SUCCESS != ugni_rc)) {
        ucs_error("GNI_EpPostDataWaitById, Error status: %s %d",
                  gni_err_str[ugni_rc], ugni_rc);
        goto exit;
    }

    header    = uct_ugni_udt_get_rheader(desc, iface);
    payload   = uct_ugni_udt_get_rpayload(desc, iface);
    user_desc = uct_ugni_udt_get_user_desc(desc, iface);

    if (UCT_UGNI_UDT_ANY == id) {
        /* New incoming message */
        ucs_assert_always(header->type == UCT_UGNI_UDT_PAYLOAD);
        uct_iface_trace_am(&iface->super.super, UCT_AM_TRACE_TYPE_RECV,
                           header->am_id, payload, header->length, "RX: AM");
        status = uct_iface_invoke_am(&iface->super.super, header->am_id, payload,
                                     header->length, user_desc);
        if (UCS_OK != status) {
            uct_ugni_udt_desc_t *new_desc;
            /* set iface for a later release call */
            uct_recv_desc_iface(user_desc) = &iface->super.super.super;
            /* Allocate a new element */
            UCT_TL_IFACE_GET_TX_DESC(&iface->super.super, &iface->free_desc,
                                     new_desc, goto exit);
            /* set the new desc */
            iface->desc_any = new_desc;
        }
/**
 * @param rx_headroom   Headroom requested by the user.
 * @param rx_priv_len   Length of transport private data to reserve (0 if unused)
 * @param rx_hdr_len    Length of transport network header.
 * @param mss           Maximal segment size (transport limit).
 */
UCS_CLASS_INIT_FUNC(uct_ib_iface_t, uct_ib_iface_ops_t *ops, uct_md_h md,
                    uct_worker_h worker, const uct_iface_params_t *params,
                    unsigned rx_priv_len, unsigned rx_hdr_len,
                    unsigned tx_cq_len, size_t mss,
                    const uct_ib_iface_config_t *config)
{
    uct_ib_device_t *dev = &ucs_derived_of(md, uct_ib_md_t)->dev;
    ucs_status_t status;
    uint8_t port_num;

    UCS_CLASS_CALL_SUPER_INIT(uct_base_iface_t, &ops->super, md, worker,
                              &config->super UCS_STATS_ARG(dev->stats));

    status = uct_ib_device_find_port(dev, params->dev_name, &port_num);
    if (status != UCS_OK) {
        goto err;
    }

    self->ops                       = ops;

    self->config.rx_payload_offset  = sizeof(uct_ib_iface_recv_desc_t) +
                                      ucs_max(sizeof(uct_am_recv_desc_t) +
                                                  params->rx_headroom,
                                              rx_priv_len + rx_hdr_len);
    self->config.rx_hdr_offset      = self->config.rx_payload_offset - rx_hdr_len;
    self->config.rx_headroom_offset = self->config.rx_payload_offset -
                                      params->rx_headroom;
    self->config.seg_size           = ucs_min(mss, config->super.max_bcopy);
    self->config.tx_max_poll        = config->tx.max_poll;
    self->config.rx_max_poll        = config->rx.max_poll;
    self->config.rx_max_batch       = ucs_min(config->rx.max_batch,
                                              config->rx.queue_len / 4);
    self->config.port_num           = port_num;
    self->config.sl                 = config->sl;
    self->config.gid_index          = config->gid_index;

    status = uct_ib_iface_init_pkey(self, config);
    if (status != UCS_OK) {
        goto err;
    }

    status = uct_ib_device_query_gid(dev, self->config.port_num,
                                     self->config.gid_index, &self->gid);
    if (status != UCS_OK) {
        goto err;
    }

    status = uct_ib_iface_init_lmc(self, config);
    if (status != UCS_OK) {
        goto err;
    }

    self->comp_channel = ibv_create_comp_channel(dev->ibv_context);
    if (self->comp_channel == NULL) {
        ucs_error("ibv_create_comp_channel() failed: %m");
        status = UCS_ERR_IO_ERROR;
        goto err_free_path_bits;
    }

    status = ucs_sys_fcntl_modfl(self->comp_channel->fd, O_NONBLOCK, 0);
    if (status != UCS_OK) {
        goto err_destroy_comp_channel;
    }

    status = uct_ib_iface_create_cq(self, tx_cq_len, 0, &self->send_cq);
    if (status != UCS_OK) {
        goto err_destroy_comp_channel;
    }

    status = uct_ib_iface_create_cq(self, config->rx.queue_len, config->rx.inl,
                                    &self->recv_cq);
    if (status != UCS_OK) {
        goto err_destroy_send_cq;
    }

    /* Address scope and size */
    if (config->addr_type == UCT_IB_IFACE_ADDRESS_TYPE_AUTO) {
        if (IBV_PORT_IS_LINK_LAYER_ETHERNET(uct_ib_iface_port_attr(self))) {
            self->addr_type = UCT_IB_ADDRESS_TYPE_ETH;
        } else {
            self->addr_type = uct_ib_address_scope(self->gid.global.subnet_prefix);
        }
    } else {
        ucs_assert(config->addr_type < UCT_IB_ADDRESS_TYPE_LAST);
        self->addr_type = config->addr_type;
    }

    self->addr_size = uct_ib_address_size(self->addr_type);

    ucs_debug("created uct_ib_iface_t headroom_ofs %d payload_ofs %d hdr_ofs %d "
              "data_sz %d", self->config.rx_headroom_offset,
              self->config.rx_payload_offset, self->config.rx_hdr_offset,
              self->config.seg_size);
    return UCS_OK;

err_destroy_send_cq:
    ibv_destroy_cq(self->send_cq);
err_destroy_comp_channel:
    ibv_destroy_comp_channel(self->comp_channel);
err_free_path_bits:
    ucs_free(self->path_bits);
err:
    return status;
}
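/*
 * Editor's sketch of the receive-buffer layout computed above: the payload
 * offset reserves space for whichever is larger, the AM receive descriptor
 * plus user headroom, or the transport private area plus network header; the
 * header and headroom offsets are then derived backwards from it. A plain-C
 * restatement for illustration only (names are local, not UCT API):
 */
#include <stddef.h>

static inline size_t demo_size_max(size_t a, size_t b) { return a > b ? a : b; }

static void demo_rx_offsets(size_t recv_desc_size, size_t am_desc_size,
                            size_t rx_headroom, size_t rx_priv_len,
                            size_t rx_hdr_len, size_t *payload_ofs,
                            size_t *hdr_ofs, size_t *headroom_ofs)
{
    *payload_ofs  = recv_desc_size +
                    demo_size_max(am_desc_size + rx_headroom,
                                  rx_priv_len + rx_hdr_len);
    *hdr_ofs      = *payload_ofs - rx_hdr_len;   /* network header ends at payload */
    *headroom_ofs = *payload_ofs - rx_headroom;  /* user headroom ends at payload */
}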
static UCS_CLASS_CLEANUP_FUNC(uct_ugni_smsg_ep_t)
{
    uct_ugni_smsg_iface_t *iface = ucs_derived_of(self->super.super.super.iface,
                                                  uct_ugni_smsg_iface_t);

    uct_ugni_smsg_mbox_dereg(iface, self->smsg_attr);
    ucs_mpool_put(self->smsg_attr);
}
static UCS_CLASS_INIT_FUNC(uct_ugni_rdma_iface_t, uct_pd_h pd, uct_worker_h worker,
                           const char *dev_name, size_t rx_headroom,
                           const uct_iface_config_t *tl_config)
{
    uct_ugni_rdma_iface_config_t *config =
                    ucs_derived_of(tl_config, uct_ugni_rdma_iface_config_t);
    ucs_status_t status;

    pthread_mutex_lock(&uct_ugni_global_lock);

    UCS_CLASS_CALL_SUPER_INIT(uct_ugni_iface_t, pd, worker, dev_name,
                              &uct_ugni_rdma_iface_ops,
                              &config->super UCS_STATS_ARG(NULL));

    /* Setting initial configuration */
    self->config.fma_seg_size  = UCT_UGNI_MAX_FMA;
    self->config.rdma_max_size = UCT_UGNI_MAX_RDMA;

    status = ucs_mpool_init(&self->free_desc, 0, sizeof(uct_ugni_base_desc_t),
                            0,                          /* alignment offset */
                            UCS_SYS_CACHE_LINE_SIZE,    /* alignment */
                            128,                        /* grow */
                            config->mpool.max_bufs,     /* max buffers */
                            &uct_ugni_rdma_desc_mpool_ops, "UGNI-DESC-ONLY");
    if (UCS_OK != status) {
        ucs_error("Mpool creation failed");
        goto exit;
    }

    status = ucs_mpool_init(&self->free_desc_get, 0,
                            sizeof(uct_ugni_rdma_fetch_desc_t),
                            0,                          /* alignment offset */
                            UCS_SYS_CACHE_LINE_SIZE,    /* alignment */
                            128,                        /* grow */
                            config->mpool.max_bufs,     /* max buffers */
                            &uct_ugni_rdma_desc_mpool_ops, "UGNI-GET-DESC-ONLY");
    if (UCS_OK != status) {
        ucs_error("Mpool creation failed");
        goto clean_desc;
    }

    status = ucs_mpool_init(&self->free_desc_buffer, 0,
                            sizeof(uct_ugni_base_desc_t) +
                                self->config.fma_seg_size,
                            sizeof(uct_ugni_base_desc_t), /* alignment offset */
                            UCS_SYS_CACHE_LINE_SIZE,      /* alignment */
                            128,                          /* grow */
                            config->mpool.max_bufs,       /* max buffers */
                            &uct_ugni_rdma_desc_mpool_ops, "UGNI-DESC-BUFFER");
    if (UCS_OK != status) {
        ucs_error("Mpool creation failed");
        goto clean_desc_get;
    }

    status = uct_iface_mpool_init(&self->super.super, &self->free_desc_famo,
                                  sizeof(uct_ugni_rdma_fetch_desc_t) + 8,
                                  sizeof(uct_ugni_rdma_fetch_desc_t), /* alignment offset */
                                  UCS_SYS_CACHE_LINE_SIZE,     /* alignment */
                                  &config->mpool,              /* mpool config */
                                  128,                         /* grow */
                                  uct_ugni_base_desc_key_init, /* memory/key init */
                                  "UGNI-DESC-FAMO");
    if (UCS_OK != status) {
        ucs_error("Mpool creation failed");
        goto clean_buffer;
    }

    status = uct_iface_mpool_init(&self->super.super,
                                  &self->free_desc_get_buffer,
                                  sizeof(uct_ugni_rdma_fetch_desc_t) +
                                      self->config.fma_seg_size,
                                  sizeof(uct_ugni_rdma_fetch_desc_t), /* alignment offset */
                                  UCS_SYS_CACHE_LINE_SIZE,     /* alignment */
                                  &config->mpool,              /* mpool config */
                                  128,                         /* grow */
                                  uct_ugni_base_desc_key_init, /* memory/key init */
                                  "UGNI-DESC-GET");
    if (UCS_OK != status) {
        ucs_error("Mpool creation failed");
        goto clean_famo;
    }

    status = ugni_activate_iface(&self->super);
    if (UCS_OK != status) {
        ucs_error("Failed to activate the interface");
        goto clean_get_buffer;
    }

    if (GNI_DEVICE_ARIES == self->super.dev->type) {
        uct_ugni_rdma_iface_ops.ep_atomic_swap64  = uct_ugni_ep_atomic_swap64;
        uct_ugni_rdma_iface_ops.ep_atomic_add32   = uct_ugni_ep_atomic_add32;
        uct_ugni_rdma_iface_ops.ep_atomic_fadd32  = uct_ugni_ep_atomic_fadd32;
        uct_ugni_rdma_iface_ops.ep_atomic_cswap32 = uct_ugni_ep_atomic_cswap32;
        uct_ugni_rdma_iface_ops.ep_atomic_swap32  = uct_ugni_ep_atomic_swap32;
    }

    /* TBD: eventually the uct_ugni_progress has to be moved to
     * rdma layer so each ugni layer will have own progress */
    uct_worker_progress_register(worker, uct_ugni_progress, self);
    pthread_mutex_unlock(&uct_ugni_global_lock);
    return UCS_OK;

clean_get_buffer:
    ucs_mpool_cleanup(&self->free_desc_get_buffer, 1);
clean_famo:
    ucs_mpool_cleanup(&self->free_desc_famo, 1);
clean_buffer:
    ucs_mpool_cleanup(&self->free_desc_buffer, 1);
clean_desc_get:
    ucs_mpool_cleanup(&self->free_desc_get, 1);
clean_desc:
    ucs_mpool_cleanup(&self->free_desc, 1);
exit:
    ucs_error("Failed to activate interface");
    pthread_mutex_unlock(&uct_ugni_global_lock);
    return status;
}
static void uct_ib_memh_free(uct_ib_mem_t *memh)
{
    ucs_free(memh);
}

static uct_ib_mem_t *uct_ib_memh_alloc(void)
{
    return ucs_calloc(1, sizeof(uct_ib_mem_t), "ib_memh");
}

static ucs_status_t uct_ib_mem_alloc_internal(uct_md_h uct_md, size_t *length_p,
                                              void **address_p,
                                              uct_ib_mem_t *memh
                                              UCS_MEMTRACK_ARG)
{
    uct_ib_md_t *md = ucs_derived_of(uct_md, uct_ib_md_t);
    struct ibv_exp_reg_mr_in in = {
        md->pd,
        NULL,
        ucs_memtrack_adjust_alloc_size(*length_p),
        UCT_IB_MEM_ACCESS_FLAGS | IBV_EXP_ACCESS_ALLOCATE_MR,
        0
    };

    memh->mr = ibv_exp_reg_mr(&in);
    if (memh->mr == NULL) {
        ucs_error("ibv_exp_reg_mr(in={NULL, length=%Zu, flags=0x%lx}) failed: %m",
                  ucs_memtrack_adjust_alloc_size(*length_p),
                  (unsigned long)(UCT_IB_MEM_ACCESS_FLAGS |
                                  IBV_EXP_ACCESS_ALLOCATE_MR));
        return UCS_ERR_IO_ERROR;
    }
/**
 * @param rx_headroom   Headroom requested by the user.
 * @param rx_priv_len   Length of transport private data to reserve (0 if unused)
 * @param rx_hdr_len    Length of transport network header.
 * @param mss           Maximal segment size (transport limit).
 */
UCS_CLASS_INIT_FUNC(uct_ib_iface_t, uct_ib_iface_ops_t *ops, uct_md_h md,
                    uct_worker_h worker, const char *dev_name,
                    unsigned rx_headroom, unsigned rx_priv_len,
                    unsigned rx_hdr_len, unsigned tx_cq_len, size_t mss,
                    uct_ib_iface_config_t *config)
{
    uct_ib_device_t *dev = &ucs_derived_of(md, uct_ib_md_t)->dev;
    ucs_status_t status;
    uint8_t port_num;

    UCS_CLASS_CALL_SUPER_INIT(uct_base_iface_t, &ops->super, md, worker,
                              &config->super UCS_STATS_ARG(dev->stats));

    status = uct_ib_device_find_port(dev, dev_name, &port_num);
    if (status != UCS_OK) {
        goto err;
    }

    self->port_num                  = port_num;
    self->sl                        = config->sl;
    self->config.rx_payload_offset  = sizeof(uct_ib_iface_recv_desc_t) +
                                      ucs_max(sizeof(uct_am_recv_desc_t) +
                                                  rx_headroom,
                                              rx_priv_len + rx_hdr_len);
    self->config.rx_hdr_offset      = self->config.rx_payload_offset - rx_hdr_len;
    self->config.rx_headroom_offset = self->config.rx_payload_offset - rx_headroom;
    self->config.seg_size           = ucs_min(mss, config->super.max_bcopy);
    self->config.tx_max_poll        = config->tx.max_poll;
    self->config.rx_max_poll        = config->rx.max_poll;
    self->config.rx_max_batch       = ucs_min(config->rx.max_batch,
                                              config->rx.queue_len / 4);
    self->ops                       = ops;

    status = uct_ib_iface_init_pkey(self, config);
    if (status != UCS_OK) {
        goto err;
    }

    status = uct_ib_iface_init_gid(self, config);
    if (status != UCS_OK) {
        goto err;
    }

    status = uct_ib_iface_init_lmc(self, config);
    if (status != UCS_OK) {
        goto err;
    }

    self->comp_channel = ibv_create_comp_channel(dev->ibv_context);
    if (self->comp_channel == NULL) {
        ucs_error("Failed to create completion channel: %m");
        status = UCS_ERR_IO_ERROR;
        goto err_free_path_bits;
    }

    status = ucs_sys_fcntl_modfl(self->comp_channel->fd, O_NONBLOCK, 0);
    if (status != UCS_OK) {
        goto err_destroy_comp_channel;
    }

    /* TODO inline scatter for send SQ */
    self->send_cq = ibv_create_cq(dev->ibv_context, tx_cq_len, NULL,
                                  self->comp_channel, 0);
    if (self->send_cq == NULL) {
        ucs_error("Failed to create send cq: %m");
        status = UCS_ERR_IO_ERROR;
        goto err_destroy_comp_channel;
    }

    if (config->rx.inl > 32 /*UCT_IB_MLX5_CQE64_MAX_INL*/) {
        ibv_exp_setenv(dev->ibv_context, "MLX5_CQE_SIZE", "128", 1);
    }
    self->recv_cq = ibv_create_cq(dev->ibv_context, config->rx.queue_len,
                                  NULL, self->comp_channel, 0);
    ibv_exp_setenv(dev->ibv_context, "MLX5_CQE_SIZE", "64", 1);
    if (self->recv_cq == NULL) {
        ucs_error("Failed to create recv cq: %m");
        status = UCS_ERR_IO_ERROR;
        goto err_destroy_send_cq;
    }

    if (!uct_ib_device_is_port_ib(dev, self->port_num)) {
        ucs_error("Unsupported link layer");
        status = UCS_ERR_UNSUPPORTED;
        goto err_destroy_recv_cq;
    }

    /* Address scope and size */
    self->addr_scope = uct_ib_address_scope(self->gid.global.subnet_prefix);
    self->addr_size  = uct_ib_address_size(self->addr_scope);

    ucs_debug("created uct_ib_iface_t headroom_ofs %d payload_ofs %d hdr_ofs %d "
              "data_sz %d", self->config.rx_headroom_offset,
              self->config.rx_payload_offset, self->config.rx_hdr_offset,
              self->config.seg_size);
    return UCS_OK;

err_destroy_recv_cq:
    ibv_destroy_cq(self->recv_cq);
err_destroy_send_cq:
    ibv_destroy_cq(self->send_cq);
err_destroy_comp_channel:
    ibv_destroy_comp_channel(self->comp_channel);
err_free_path_bits:
    ucs_free(self->path_bits);
err:
    return status;
}
ucs_status_t uct_dc_mlx5_ep_fc_ctrl(uct_ep_t *tl_ep, unsigned op,
                                    uct_rc_fc_request_t *req)
{
    uintptr_t sender_ep;
    uct_ib_iface_t *ib_iface;
    uct_ib_mlx5_base_av_t av;
    uct_dc_fc_request_t *dc_req;
    uct_dc_mlx5_ep_t *dc_mlx5_ep;
    uct_dc_ep_t *dc_ep         = ucs_derived_of(tl_ep, uct_dc_ep_t);
    uct_dc_mlx5_iface_t *iface = ucs_derived_of(tl_ep->iface,
                                                uct_dc_mlx5_iface_t);
    UCT_DC_MLX5_TXQP_DECL(txqp, txwq);

    ucs_assert((sizeof(uint8_t) + sizeof(sender_ep)) <=
               UCT_IB_MLX5_AV_FULL_SIZE);

    UCT_DC_CHECK_RES(&iface->super, dc_ep);
    UCT_DC_MLX5_IFACE_TXQP_GET(iface, dc_ep, txqp, txwq);

    if (op == UCT_RC_EP_FC_PURE_GRANT) {
        ucs_assert(req != NULL);

        dc_req    = ucs_derived_of(req, uct_dc_fc_request_t);
        sender_ep = (uintptr_t)dc_req->sender_ep;
        ib_iface  = &iface->super.super.super;

        /* Note av initialization is copied from exp verbs */
        av.stat_rate_sl = ib_iface->config.sl; /* (attr->static_rate << 4) | attr->sl */
        av.fl_mlid      = ib_iface->path_bits[0] & 0x7f;

        /* lid in dc_req is in BE already */
        av.rlid         = dc_req->lid | htons(ib_iface->path_bits[0]);
        av.dqp_dct      = htonl(dc_req->dct_num);

        if (!iface->ud_common.config.compact_av) {
            av.dqp_dct |= UCT_IB_MLX5_EXTENDED_UD_AV;
        }

        uct_rc_mlx5_txqp_inline_post(&iface->super.super, IBV_EXP_QPT_DC_INI,
                                     txqp, txwq, MLX5_OPCODE_SEND,
                                     NULL, 0, op, sender_ep, 0, 0, 0,
                                     &av, uct_ib_mlx5_wqe_av_size(&av));
    } else {
        ucs_assert(op == UCT_RC_EP_FC_FLAG_HARD_REQ);
        sender_ep  = (uintptr_t)dc_ep;
        dc_mlx5_ep = ucs_derived_of(tl_ep, uct_dc_mlx5_ep_t);

        UCS_STATS_UPDATE_COUNTER(dc_ep->fc.stats, UCT_RC_FC_STAT_TX_HARD_REQ, 1);

        uct_rc_mlx5_txqp_inline_post(&iface->super.super, IBV_EXP_QPT_DC_INI,
                                     txqp, txwq, MLX5_OPCODE_SEND_IMM,
                                     NULL, 0, op, sender_ep,
                                     iface->super.rx.dct->dct_num, 0, 0,
                                     &dc_mlx5_ep->av,
                                     uct_ib_mlx5_wqe_av_size(&dc_mlx5_ep->av));
    }

    return UCS_OK;
}
/**
 * @param rx_headroom   Headroom requested by the user.
 * @param rx_priv_len   Length of transport private data to reserve (0 if unused)
 * @param rx_hdr_len    Length of transport network header.
 * @param mss           Maximal segment size (transport limit).
 */
UCS_CLASS_INIT_FUNC(uct_ib_iface_t, uct_iface_ops_t *ops, uct_pd_h pd,
                    uct_worker_h worker, const char *dev_name,
                    unsigned rx_headroom, unsigned rx_priv_len,
                    unsigned rx_hdr_len, unsigned tx_cq_len, size_t mss,
                    uct_ib_iface_config_t *config)
{
    uct_ib_device_t *dev = &ucs_derived_of(pd, uct_ib_pd_t)->dev;
    ucs_status_t status;
    uint8_t port_num;

    UCS_CLASS_CALL_SUPER_INIT(uct_base_iface_t, ops, pd, worker,
                              &config->super UCS_STATS_ARG(dev->stats));

    status = uct_ib_iface_find_port(dev, dev_name, &port_num);
    if (status != UCS_OK) {
        ucs_error("Failed to find port %s: %s", dev_name,
                  ucs_status_string(status));
        goto err;
    }

    self->port_num                  = port_num;
    self->sl                        = config->sl;
    self->config.rx_payload_offset  = sizeof(uct_ib_iface_recv_desc_t) +
                                      ucs_max(sizeof(uct_am_recv_desc_t) +
                                                  rx_headroom,
                                              rx_priv_len + rx_hdr_len);
    self->config.rx_hdr_offset      = self->config.rx_payload_offset - rx_hdr_len;
    self->config.rx_headroom_offset = self->config.rx_payload_offset - rx_headroom;
    self->config.seg_size           = config->super.max_bcopy;

    status = uct_ib_iface_init_pkey(self, config);
    if (status != UCS_OK) {
        goto err;
    }

    status = uct_ib_iface_init_gid(self, config);
    if (status != UCS_OK) {
        goto err;
    }

    status = uct_ib_iface_init_lmc(self, config);
    if (status != UCS_OK) {
        goto err;
    }

    /* TODO comp_channel */
    /* TODO inline scatter for send SQ */
    self->send_cq = ibv_create_cq(dev->ibv_context, tx_cq_len, NULL, NULL, 0);
    if (self->send_cq == NULL) {
        ucs_error("Failed to create send cq: %m");
        status = UCS_ERR_IO_ERROR;
        goto err_free_path_bits;
    }

    if (config->rx.inl > 32 /*UCT_IB_MLX5_CQE64_MAX_INL*/) {
        ibv_exp_setenv(dev->ibv_context, "MLX5_CQE_SIZE", "128", 1);
    }
    self->recv_cq = ibv_create_cq(dev->ibv_context, config->rx.queue_len,
                                  NULL, NULL, 0);
    ibv_exp_setenv(dev->ibv_context, "MLX5_CQE_SIZE", "64", 1);
    if (self->recv_cq == NULL) {
        ucs_error("Failed to create recv cq: %m");
        status = UCS_ERR_IO_ERROR;
        goto err_destroy_send_cq;
    }

    if (!uct_ib_device_is_port_ib(dev, self->port_num)) {
        ucs_error("Unsupported link layer");
        status = UCS_ERR_UNSUPPORTED;
        goto err_destroy_recv_cq;
    }

    ucs_debug("created uct_ib_iface_t headroom_ofs %d payload_ofs %d hdr_ofs %d "
              "data_sz %d", self->config.rx_headroom_offset,
              self->config.rx_payload_offset, self->config.rx_hdr_offset,
              self->config.seg_size);
    return UCS_OK;

err_destroy_recv_cq:
    ibv_destroy_cq(self->recv_cq);
err_destroy_send_cq:
    ibv_destroy_cq(self->send_cq);
err_free_path_bits:
    ucs_free(self->path_bits);
err:
    return status;
}
static ucs_status_t uct_mm_iface_query(uct_iface_h tl_iface,
                                       uct_iface_attr_t *iface_attr)
{
    uct_mm_iface_t *iface = ucs_derived_of(tl_iface, uct_mm_iface_t);

    memset(iface_attr, 0, sizeof(uct_iface_attr_t));

    /* default values for all shared memory transports */
    iface_attr->cap.put.max_short       = UINT_MAX;
    iface_attr->cap.put.max_bcopy       = SIZE_MAX;
    iface_attr->cap.put.min_zcopy       = 0;
    iface_attr->cap.put.max_zcopy       = SIZE_MAX;
    iface_attr->cap.put.opt_zcopy_align = UCS_SYS_CACHE_LINE_SIZE;
    iface_attr->cap.put.align_mtu       = iface_attr->cap.put.opt_zcopy_align;
    iface_attr->cap.put.max_iov         = 1;

    iface_attr->cap.get.max_bcopy       = SIZE_MAX;
    iface_attr->cap.get.min_zcopy       = 0;
    iface_attr->cap.get.max_zcopy       = SIZE_MAX;
    iface_attr->cap.get.opt_zcopy_align = UCS_SYS_CACHE_LINE_SIZE;
    iface_attr->cap.get.align_mtu       = iface_attr->cap.get.opt_zcopy_align;
    iface_attr->cap.get.max_iov         = 1;

    iface_attr->cap.am.max_short        = iface->config.fifo_elem_size -
                                          sizeof(uct_mm_fifo_element_t);
    iface_attr->cap.am.max_bcopy        = iface->config.seg_size;
    iface_attr->cap.am.min_zcopy        = 0;
    iface_attr->cap.am.max_zcopy        = 0;
    iface_attr->cap.am.opt_zcopy_align  = UCS_SYS_CACHE_LINE_SIZE;
    iface_attr->cap.am.align_mtu        = iface_attr->cap.am.opt_zcopy_align;
    iface_attr->cap.am.max_iov          = 1;

    iface_attr->iface_addr_len          = sizeof(uct_mm_iface_addr_t);
    iface_attr->device_addr_len         = UCT_SM_IFACE_DEVICE_ADDR_LEN;
    iface_attr->ep_addr_len             = 0;
    iface_attr->max_conn_priv           = 0;
    iface_attr->cap.flags               = UCT_IFACE_FLAG_PUT_SHORT |
                                          UCT_IFACE_FLAG_PUT_BCOPY |
                                          UCT_IFACE_FLAG_ATOMIC_CPU |
                                          UCT_IFACE_FLAG_GET_BCOPY |
                                          UCT_IFACE_FLAG_AM_SHORT |
                                          UCT_IFACE_FLAG_AM_BCOPY |
                                          UCT_IFACE_FLAG_PENDING |
                                          UCT_IFACE_FLAG_CB_SYNC |
                                          UCT_IFACE_FLAG_EVENT_SEND_COMP |
                                          UCT_IFACE_FLAG_EVENT_RECV_SIG |
                                          UCT_IFACE_FLAG_CONNECT_TO_IFACE;

    iface_attr->cap.atomic32.op_flags   =
    iface_attr->cap.atomic64.op_flags   = UCS_BIT(UCT_ATOMIC_OP_ADD) |
                                          UCS_BIT(UCT_ATOMIC_OP_AND) |
                                          UCS_BIT(UCT_ATOMIC_OP_OR)  |
                                          UCS_BIT(UCT_ATOMIC_OP_XOR);
    iface_attr->cap.atomic32.fop_flags  =
    iface_attr->cap.atomic64.fop_flags  = UCS_BIT(UCT_ATOMIC_OP_ADD) |
                                          UCS_BIT(UCT_ATOMIC_OP_AND) |
                                          UCS_BIT(UCT_ATOMIC_OP_OR)  |
                                          UCS_BIT(UCT_ATOMIC_OP_XOR) |
                                          UCS_BIT(UCT_ATOMIC_OP_SWAP) |
                                          UCS_BIT(UCT_ATOMIC_OP_CSWAP);

    iface_attr->latency.overhead        = 80e-9; /* 80 ns */
    iface_attr->latency.growth          = 0;
    iface_attr->bandwidth               = 12179 * 1024.0 * 1024.0;
    iface_attr->overhead                = 10e-9; /* 10 ns */
    iface_attr->priority                =
        uct_mm_md_mapper_ops(iface->super.md)->get_priority();

    return UCS_OK;
}
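/*
 * Editor's note: UCT models transport latency as an affine function of
 * message size, latency(n) = overhead + growth * n; with the values reported
 * above the mm transport advertises a flat 80 ns regardless of size. A
 * one-line sketch of evaluating that model (illustrative, not UCT API):
 */
#include <stddef.h>

static inline double demo_latency_sec(double overhead, double growth, size_t n)
{
    return overhead + growth * (double)n;  /* e.g. 80e-9 + 0 * n for mm */
}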
static void uct_cm_iface_release_desc(uct_iface_t *tl_iface, void *desc)
{
    uct_cm_iface_t *iface = ucs_derived_of(tl_iface, uct_cm_iface_t);
    ucs_free(desc - iface->super.config.rx_headroom_offset);
}
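/*
 * Editor's sketch: the release hook above steps the user-visible descriptor
 * pointer back by the headroom offset to recover the base of the original
 * allocation, the inverse of the shift applied when the descriptor was handed
 * to the AM callback. A standalone restatement of that pointer pair (names
 * are illustrative, not UCT API):
 */
#include <stdlib.h>

static inline void *demo_desc_to_user(void *desc_base, size_t headroom_ofs)
{
    return (char*)desc_base + headroom_ofs;  /* pointer handed to the callback */
}

static inline void demo_release_user(void *user_ptr, size_t headroom_ofs)
{
    free((char*)user_ptr - headroom_ofs);    /* back to the allocated base */
}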
static void uct_rc_verbs_ep_am_zcopy_handler(uct_rc_iface_send_op_t *op)
{
    uct_rc_iface_send_desc_t *desc = ucs_derived_of(op, uct_rc_iface_send_desc_t);
    uct_invoke_completion(desc->super.user_comp);
    ucs_mpool_put(desc);
}
static UCS_CLASS_INIT_FUNC(uct_cm_iface_t, uct_pd_h pd, uct_worker_h worker,
                           const char *dev_name, size_t rx_headroom,
                           const uct_iface_config_t *tl_config)
{
    uct_cm_iface_config_t *config = ucs_derived_of(tl_config,
                                                   uct_cm_iface_config_t);
    ucs_status_t status;
    int ret;

    ucs_trace_func("");

    UCS_CLASS_CALL_SUPER_INIT(uct_ib_iface_t, &uct_cm_iface_ops, pd, worker,
                              dev_name, rx_headroom,
                              0 /* rx_priv_len */,
                              0 /* rx_hdr_len */,
                              1 /* tx_cq_len */,
                              IB_CM_SIDR_REQ_PRIVATE_DATA_SIZE, /* mss */
                              &config->super);

    if (worker->async == NULL) {
        ucs_error("cm must have async!=NULL");
        return UCS_ERR_INVALID_PARAM;
    }

    self->service_id      = (uint32_t)(ucs_generate_uuid((uintptr_t)self) &
                                       (~IB_CM_ASSIGN_SERVICE_ID_MASK));
    self->num_outstanding = 0;

    self->config.timeout_ms      = (int)(config->timeout * 1e3 + 0.5);
    self->config.max_outstanding = config->max_outstanding;
    self->config.retry_count     = ucs_min(config->retry_count, UINT8_MAX);

    self->notify_q.head = NULL;
    ucs_queue_head_init(&self->notify_q);

    self->outstanding = ucs_calloc(self->config.max_outstanding,
                                   sizeof(*self->outstanding),
                                   "cm_outstanding");
    if (self->outstanding == NULL) {
        status = UCS_ERR_NO_MEMORY;
        goto err;
    }

    self->cmdev = ib_cm_open_device(uct_ib_iface_device(&self->super)->ibv_context);
    if (self->cmdev == NULL) {
        ucs_error("ib_cm_open_device() failed: %m. "
                  "Check if ib_ucm.ko module is loaded.");
        status = UCS_ERR_NO_DEVICE;
        goto err_free_outstanding;
    }

    status = ucs_sys_fcntl_modfl(self->cmdev->fd, O_NONBLOCK, 0);
    if (status != UCS_OK) {
        goto err_close_device;
    }

    ret = ib_cm_create_id(self->cmdev, &self->listen_id, self);
    if (ret) {
        ucs_error("ib_cm_create_id() failed: %m");
        status = UCS_ERR_NO_DEVICE;
        goto err_close_device;
    }

    ret = ib_cm_listen(self->listen_id, self->service_id, 0);
    if (ret) {
        ucs_error("ib_cm_listen() failed: %m");
        status = UCS_ERR_INVALID_ADDR;
        goto err_destroy_id;
    }

    if (config->async_mode == UCS_ASYNC_MODE_SIGNAL) {
        ucs_warn("ib_cm fd does not support SIGIO");
    }

    status = ucs_async_set_event_handler(config->async_mode, self->cmdev->fd,
                                         POLLIN, uct_cm_iface_event_handler,
                                         self, worker->async);
    if (status != UCS_OK) {
        ucs_error("failed to set event handler");
        goto err_destroy_id;
    }

    ucs_debug("listening for SIDR service_id 0x%x on fd %d", self->service_id,
              self->cmdev->fd);
    return UCS_OK;

err_destroy_id:
    ib_cm_destroy_id(self->listen_id);
err_close_device:
    ib_cm_close_device(self->cmdev);
err_free_outstanding:
    ucs_free(self->outstanding);
err:
    return status;
}
static ucs_status_t uct_ib_mlx5dv_create_ksm(uct_ib_md_t *ibmd,
                                             uct_ib_mem_t *ib_memh,
                                             off_t offset)
{
    uct_ib_mlx5_mem_t *memh = ucs_derived_of(ib_memh, uct_ib_mlx5_mem_t);
    uct_ib_mlx5_md_t *md    = ucs_derived_of(ibmd, uct_ib_mlx5_md_t);
    uint32_t out[UCT_IB_MLX5DV_ST_SZ_DW(create_mkey_out)] = {};
    struct ibv_mr *mr       = memh->super.mr;
    ucs_status_t status     = UCS_OK;
    struct mlx5dv_pd dvpd   = {};
    struct mlx5dv_obj dv    = {};
    size_t reg_length, length, inlen;
    int list_size, i;
    void *mkc, *klm;
    uint32_t *in;
    intptr_t addr;

    if (!(md->flags & UCT_IB_MLX5_MD_FLAG_KSM)) {
        return UCS_ERR_UNSUPPORTED;
    }

    reg_length = UCT_IB_MD_MAX_MR_SIZE;
    addr       = (intptr_t)mr->addr & ~(reg_length - 1);
    length     = mr->length + (intptr_t)mr->addr - addr;
    list_size  = ucs_div_round_up(length, reg_length);
    inlen      = UCT_IB_MLX5DV_ST_SZ_BYTES(create_mkey_in) +
                 UCT_IB_MLX5DV_ST_SZ_BYTES(klm) * list_size;

    in = ucs_calloc(1, inlen, "mkey mailbox");
    if (in == NULL) {
        return UCS_ERR_NO_MEMORY;
    }

    dv.pd.in  = md->super.pd;
    dv.pd.out = &dvpd;
    mlx5dv_init_obj(&dv, MLX5DV_OBJ_PD);

    UCT_IB_MLX5DV_SET(create_mkey_in, in, opcode, UCT_IB_MLX5_CMD_OP_CREATE_MKEY);
    mkc = UCT_IB_MLX5DV_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
    UCT_IB_MLX5DV_SET(mkc, mkc, access_mode_1_0, UCT_IB_MLX5_MKC_ACCESS_MODE_KSM);
    UCT_IB_MLX5DV_SET(mkc, mkc, a, 1);
    UCT_IB_MLX5DV_SET(mkc, mkc, rw, 1);
    UCT_IB_MLX5DV_SET(mkc, mkc, rr, 1);
    UCT_IB_MLX5DV_SET(mkc, mkc, lw, 1);
    UCT_IB_MLX5DV_SET(mkc, mkc, lr, 1);
    UCT_IB_MLX5DV_SET(mkc, mkc, pd, dvpd.pdn);
    UCT_IB_MLX5DV_SET(mkc, mkc, translations_octword_size, list_size);
    UCT_IB_MLX5DV_SET(mkc, mkc, log_entity_size, ucs_ilog2(reg_length));
    UCT_IB_MLX5DV_SET(mkc, mkc, qpn, 0xffffff);
    UCT_IB_MLX5DV_SET(mkc, mkc, mkey_7_0, offset & 0xff);
    UCT_IB_MLX5DV_SET64(mkc, mkc, start_addr, addr + offset);
    UCT_IB_MLX5DV_SET64(mkc, mkc, len, length);
    UCT_IB_MLX5DV_SET(create_mkey_in, in, translations_octword_actual_size,
                      list_size);

    klm = UCT_IB_MLX5DV_ADDR_OF(create_mkey_in, in, klm_pas_mtt);
    for (i = 0; i < list_size; i++) {
        if (i == list_size - 1) {
            UCT_IB_MLX5DV_SET(klm, klm, byte_count, length % reg_length);
        } else {
            UCT_IB_MLX5DV_SET(klm, klm, byte_count, reg_length);
        }
        UCT_IB_MLX5DV_SET(klm, klm, mkey, mr->lkey);
        UCT_IB_MLX5DV_SET64(klm, klm, address, addr + (i * reg_length));
        klm += UCT_IB_MLX5DV_ST_SZ_BYTES(klm);
    }

    memh->atomic_dvmr = mlx5dv_devx_obj_create(md->super.dev.ibv_context,
                                               in, inlen, out, sizeof(out));
    if (memh->atomic_dvmr == NULL) {
        ucs_debug("CREATE_MKEY KSM failed: %m");
        status     = UCS_ERR_UNSUPPORTED;
        md->flags &= ~UCT_IB_MLX5_MD_FLAG_KSM;
        goto out;
    }

    memh->super.atomic_rkey =
        (UCT_IB_MLX5DV_GET(create_mkey_out, out, mkey_index) << 8) |
        (offset & 0xff);

    ucs_debug("KSM registered memory %p..%p offset 0x%lx on %s rkey 0x%x",
              mr->addr, mr->addr + mr->length, offset,
              uct_ib_device_name(&md->super.dev), memh->super.atomic_rkey);
out:
    ucs_free(in);
    return status;
}
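/*
 * Editor's sketch of the KSM chunking arithmetic above: the MR base address
 * is aligned down to the maximal per-entry size (a power of two), the covered
 * length grows by the cut-off prefix, and the KLM list gets one entry per
 * chunk. A self-contained restatement; DEMO_MAX_MR_SIZE is an assumed
 * stand-in for UCT_IB_MD_MAX_MR_SIZE, whose real value depends on the device.
 */
#include <stddef.h>
#include <stdint.h>

#define DEMO_MAX_MR_SIZE (1ULL << 31)  /* assumed per-entry chunk size */

static size_t demo_ksm_list_size(uintptr_t addr, size_t length,
                                 uintptr_t *aligned_addr_p)
{
    uintptr_t aligned = addr & ~(uintptr_t)(DEMO_MAX_MR_SIZE - 1); /* align down */
    size_t    covered = length + (size_t)(addr - aligned);         /* add prefix */

    *aligned_addr_p = aligned;
    /* one KLM entry per chunk, rounding up, as ucs_div_round_up does above */
    return (covered + DEMO_MAX_MR_SIZE - 1) / DEMO_MAX_MR_SIZE;
}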
ucs_status_t uct_rc_verbs_ep_fence(uct_ep_h tl_ep, unsigned flags)
{
    uct_rc_verbs_ep_t *ep = ucs_derived_of(tl_ep, uct_rc_verbs_ep_t);

    return uct_rc_ep_fence(tl_ep, &ep->fi, 1);
}
static UCS_CLASS_INIT_FUNC(uct_ugni_rdma_iface_t, uct_pd_h pd, uct_worker_h worker,
                           const char *dev_name, size_t rx_headroom,
                           const uct_iface_config_t *tl_config)
{
    uct_ugni_iface_config_t *config = ucs_derived_of(tl_config,
                                                     uct_ugni_iface_config_t);
    ucs_status_t rc;

    pthread_mutex_lock(&uct_ugni_global_lock);

    UCS_CLASS_CALL_SUPER_INIT(uct_ugni_iface_t, pd, worker, dev_name,
                              &uct_ugni_iface_ops,
                              &config->super UCS_STATS_ARG(NULL));

    /* Setting initial configuration */
    self->config.fma_seg_size  = UCT_UGNI_MAX_FMA;
    self->config.rdma_max_size = UCT_UGNI_MAX_RDMA;

    rc = ucs_mpool_create("UGNI-DESC-ONLY", sizeof(uct_ugni_base_desc_t),
                          0,                          /* alignment offset */
                          UCS_SYS_CACHE_LINE_SIZE,    /* alignment */
                          128,                        /* grow */
                          config->mpool.max_bufs,     /* max buffers */
                          &self->super.super,         /* iface */
                          ucs_mpool_hugetlb_malloc,   /* allocation hooks */
                          ucs_mpool_hugetlb_free,     /* free hook */
                          uct_ugni_base_desc_init,    /* init func */
                          NULL, &self->free_desc);
    if (UCS_OK != rc) {
        ucs_error("Mpool creation failed");
        goto exit;
    }

    rc = ucs_mpool_create("UGNI-GET-DESC-ONLY", sizeof(uct_ugni_fetch_desc_t),
                          0,                          /* alignment offset */
                          UCS_SYS_CACHE_LINE_SIZE,    /* alignment */
                          128,                        /* grow */
                          config->mpool.max_bufs,     /* max buffers */
                          &self->super.super,         /* iface */
                          ucs_mpool_hugetlb_malloc,   /* allocation hooks */
                          ucs_mpool_hugetlb_free,     /* free hook */
                          uct_ugni_base_desc_init,    /* init func */
                          NULL, &self->free_desc_get);
    if (UCS_OK != rc) {
        ucs_error("Mpool creation failed");
        goto clean_desc;
    }

    rc = ucs_mpool_create("UGNI-DESC-BUFFER",
                          sizeof(uct_ugni_base_desc_t) +
                              self->config.fma_seg_size,
                          sizeof(uct_ugni_base_desc_t), /* alignment offset */
                          UCS_SYS_CACHE_LINE_SIZE,      /* alignment */
                          128,                          /* grow */
                          config->mpool.max_bufs,       /* max buffers */
                          &self->super.super,           /* iface */
                          ucs_mpool_hugetlb_malloc,     /* allocation hooks */
                          ucs_mpool_hugetlb_free,       /* free hook */
                          uct_ugni_base_desc_init,      /* init func */
                          NULL, &self->free_desc_buffer);
    if (UCS_OK != rc) {
        ucs_error("Mpool creation failed");
        goto clean_desc_get;
    }

    rc = uct_iface_mpool_create(&self->super.super.super,
                                sizeof(uct_ugni_fetch_desc_t) + 8,
                                sizeof(uct_ugni_fetch_desc_t), /* alignment offset */
                                UCS_SYS_CACHE_LINE_SIZE,     /* alignment */
                                &config->mpool,              /* mpool config */
                                128,                         /* grow */
                                uct_ugni_base_desc_key_init, /* memory/key init */
                                "UGNI-DESC-FAMO",            /* name */
                                &self->free_desc_famo);      /* mpool */
    if (UCS_OK != rc) {
        ucs_error("Mpool creation failed");
        goto clean_buffer;
    }

    rc = uct_iface_mpool_create(&self->super.super.super,
                                sizeof(uct_ugni_fetch_desc_t) +
                                    self->config.fma_seg_size,
                                sizeof(uct_ugni_fetch_desc_t), /* alignment offset */
                                UCS_SYS_CACHE_LINE_SIZE,     /* alignment */
                                &config->mpool,              /* mpool config */
                                128,                         /* grow */
                                uct_ugni_base_desc_key_init, /* memory/key init */
                                "UGNI-DESC-GET",             /* name */
                                &self->free_desc_get_buffer); /* mpool */
    if (UCS_OK != rc) {
        ucs_error("Mpool creation failed");
        goto clean_famo;
    }

    rc = ugni_activate_iface(&self->super);
    if (UCS_OK != rc) {
        ucs_error("Failed to activate the interface");
        goto clean_get_buffer;
    }

    /* TBD: eventually the uct_ugni_progress has to be moved to
     * rdma layer so each ugni layer will have own progress */
    ucs_notifier_chain_add(&worker->progress_chain, uct_ugni_progress, self);
    pthread_mutex_unlock(&uct_ugni_global_lock);
    return UCS_OK;

clean_get_buffer:
    ucs_mpool_destroy(self->free_desc_get_buffer);
clean_famo:
    ucs_mpool_destroy(self->free_desc_famo);
clean_buffer:
    ucs_mpool_destroy(self->free_desc_buffer);
clean_desc_get:
    ucs_mpool_destroy(self->free_desc_get);
clean_desc:
    ucs_mpool_destroy(self->free_desc);
exit:
    ucs_error("Failed to activate interface");
    pthread_mutex_unlock(&uct_ugni_global_lock);
    return rc;
}
static UCS_CLASS_INIT_FUNC(uct_mm_ep_t, uct_iface_t *tl_iface,
                           const uct_device_addr_t *dev_addr,
                           const uct_iface_addr_t *iface_addr)
{
    uct_mm_iface_t *iface = ucs_derived_of(tl_iface, uct_mm_iface_t);
    const uct_mm_iface_addr_t *addr = (const void*)iface_addr;
    ucs_status_t status;
    size_t size_to_attach;
    uct_mm_fifo_ctl_t *remote_fifo_ctl;

    UCS_CLASS_CALL_SUPER_INIT(uct_base_ep_t, &iface->super);

    /* Connect to the remote address (remote FIFO):
     * attach the address's memory */
    size_to_attach = UCT_MM_GET_FIFO_SIZE(iface);
    status = uct_mm_md_mapper_ops(iface->super.md)->attach(addr->id,
                                                           size_to_attach,
                                                           (void *)addr->vaddr,
                                                           &self->mapped_desc.address,
                                                           &self->mapped_desc.cookie,
                                                           iface->path);
    if (status != UCS_OK) {
        ucs_error("failed to connect to remote peer with mm. remote mm_id: %zu",
                  addr->id);
        return status;
    }

    self->mapped_desc.length = size_to_attach;
    self->mapped_desc.mmid   = addr->id;

    /* Set the fifo ctl to a dummy struct. It will switch to the real fifo_ctl,
     * which points to the remote peer's fifo, once the connection establishment
     * message is sent successfully on the unix socket */
    self->fifo_ctl    = &iface->dummy_fifo_ctl;
    self->cached_tail = self->fifo_ctl->tail;

    /* set the ep->fifo ptr to point to the beginning of the fifo elements at
     * the remote peer */
    uct_mm_set_fifo_elems_ptr(self->mapped_desc.address, &self->fifo);

    remote_fifo_ctl              = uct_mm_set_fifo_ctl(self->mapped_desc.address);
    self->cached_signal_addrlen  = remote_fifo_ctl->signal_addrlen;
    self->cached_signal_sockaddr = remote_fifo_ctl->signal_sockaddr;
    self->cbq_elem_on            = 0;

    /* Send connect message to remote side so it will start polling */
    status = uct_mm_ep_signal_remote(self, UCT_MM_IFACE_SIGNAL_CONNECT);
    if (status != UCS_OK) {
        uct_mm_md_mapper_ops(iface->super.md)->detach(&self->mapped_desc);
        return status;
    }

    /* Initialize the hash which will keep the base addresses of remote memory
     * chunks that hold the descriptors for bcopy. */
    sglib_hashed_uct_mm_remote_seg_t_init(self->remote_segments_hash);

    ucs_arbiter_group_init(&self->arb_group);

    /* Register for send side progress */
    uct_worker_progress_register(iface->super.worker, uct_mm_iface_progress,
                                 iface);

    ucs_debug("mm: ep connected: %p, to remote_shmid: %zu", self, addr->id);
    return UCS_OK;
}
{
    int ret;

    ret = ibv_dereg_mr(mr);
    if (ret != 0) {
        ucs_error("ibv_dereg_mr() failed: %m");
        return UCS_ERR_IO_ERROR;
    }
    return UCS_OK;
}

static ucs_status_t uct_ib_mem_alloc(uct_pd_h uct_pd, size_t *length_p,
                                     void **address_p,
                                     uct_mem_h *memh_p UCS_MEMTRACK_ARG)
{
    uct_ib_pd_t *pd = ucs_derived_of(uct_pd, uct_ib_pd_t);
    struct ibv_exp_reg_mr_in in = {
        pd->pd,
        NULL,
        ucs_memtrack_adjust_alloc_size(*length_p),
        UCT_IB_MEM_ACCESS_FLAGS | IBV_EXP_ACCESS_ALLOCATE_MR,
        0
    };
    struct ibv_mr *mr;

    mr = ibv_exp_reg_mr(&in);
    if (mr == NULL) {
        ucs_error("ibv_exp_reg_mr(in={NULL, length=%Zu, flags=0x%lx}) failed: %m",
                  ucs_memtrack_adjust_alloc_size(*length_p),
                  (unsigned long)(UCT_IB_MEM_ACCESS_FLAGS |
                                  IBV_EXP_ACCESS_ALLOCATE_MR));
        return UCS_ERR_IO_ERROR;
static UCS_CLASS_INIT_FUNC(uct_ud_mlx5_iface_t, uct_pd_h pd, uct_worker_h worker,
                           const char *dev_name, size_t rx_headroom,
                           const uct_iface_config_t *tl_config)
{
    uct_ud_iface_config_t *config = ucs_derived_of(tl_config,
                                                   uct_ud_iface_config_t);
    ucs_status_t status;
    int i;

    ucs_trace_func("");

    UCS_CLASS_CALL_SUPER_INIT(uct_ud_iface_t, &uct_ud_mlx5_iface_ops, pd, worker,
                              dev_name, rx_headroom, 0, config);

    status = uct_ib_mlx5_get_cq(self->super.super.send_cq, &self->tx.cq);
    if (status != UCS_OK) {
        return status;
    }
    if (uct_ib_mlx5_cqe_size(&self->tx.cq) != sizeof(struct mlx5_cqe64)) {
        ucs_error("TX CQE size (%d) is not %d",
                  uct_ib_mlx5_cqe_size(&self->tx.cq),
                  (int)sizeof(struct mlx5_cqe64));
        return UCS_ERR_IO_ERROR;
    }

    status = uct_ib_mlx5_get_cq(self->super.super.recv_cq, &self->rx.cq);
    if (status != UCS_OK) {
        return UCS_ERR_IO_ERROR;
    }
    if (uct_ib_mlx5_cqe_size(&self->rx.cq) != sizeof(struct mlx5_cqe64)) {
        ucs_error("RX CQE size (%d) is not %d",
                  uct_ib_mlx5_cqe_size(&self->rx.cq),
                  (int)sizeof(struct mlx5_cqe64));
        return UCS_ERR_IO_ERROR;
    }

    status = uct_ib_mlx5_get_txwq(self->super.qp, &self->tx.wq);
    if (status != UCS_OK) {
        return UCS_ERR_IO_ERROR;
    }
    self->super.tx.available = self->tx.wq.bb_max;

    status = uct_ib_mlx5_get_rxwq(self->super.qp, &self->rx.wq);
    if (status != UCS_OK) {
        return UCS_ERR_IO_ERROR;
    }

    /* write buffer sizes */
    for (i = 0; i <= self->rx.wq.mask; i++) {
        self->rx.wq.wqes[i].byte_count =
            htonl(self->super.super.config.rx_payload_offset +
                  self->super.super.config.seg_size);
    }

    while (self->super.rx.available >= self->super.config.rx_max_batch) {
        uct_ud_mlx5_iface_post_recv(self);
    }

    /* TODO: add progress on first ep creation */
    uct_worker_progress_register(worker, uct_ud_mlx5_iface_progress, self);
    return UCS_OK;
}
void uct_ud_ep_destroy_connected(uct_ud_ep_t *ep, const uct_sockaddr_ib_t *addr)
{
    uct_ud_iface_t *iface = ucs_derived_of(ep->super.super.iface, uct_ud_iface_t);

    uct_ud_iface_cep_rollback(iface, addr, ep);
    uct_ud_ep_disconnect_from_iface(&ep->super.super);
}
static ucs_status_t uct_gdr_copy_md_open(const char *md_name,
                                         const uct_md_config_t *uct_md_config,
                                         uct_md_h *md_p)
{
    const uct_gdr_copy_md_config_t *md_config =
                    ucs_derived_of(uct_md_config, uct_gdr_copy_md_config_t);
    ucs_status_t status;
    uct_gdr_copy_md_t *md;
    ucs_rcache_params_t rcache_params;

    md = ucs_malloc(sizeof(uct_gdr_copy_md_t), "uct_gdr_copy_md_t");
    if (NULL == md) {
        ucs_error("failed to allocate memory for uct_gdr_copy_md_t");
        return UCS_ERR_NO_MEMORY;
    }

    md->super.ops       = &md_ops;
    md->super.component = &uct_gdr_copy_md_component;
    md->rcache          = NULL;
    md->reg_cost        = md_config->uc_reg_cost;

    md->gdrcpy_ctx = gdr_open();
    if (md->gdrcpy_ctx == NULL) {
        ucs_error("failed to open gdr copy");
        status = UCS_ERR_IO_ERROR;
        goto err_free_md;
    }

    if (md_config->enable_rcache != UCS_NO) {
        rcache_params.region_struct_size = sizeof(uct_gdr_copy_rcache_region_t);
        rcache_params.alignment          = md_config->rcache.alignment;
        rcache_params.max_alignment      = UCT_GDR_COPY_MD_RCACHE_DEFAULT_ALIGN;
        rcache_params.ucm_event_priority = md_config->rcache.event_prio;
        rcache_params.context            = md;
        rcache_params.ops                = &uct_gdr_copy_rcache_ops;
        status = ucs_rcache_create(&rcache_params, "gdr_copy", NULL, &md->rcache);
        if (status == UCS_OK) {
            md->super.ops         = &md_rcache_ops;
            md->reg_cost.overhead = 0;
            md->reg_cost.growth   = 0;
        } else {
            ucs_assert(md->rcache == NULL);
            if (md_config->enable_rcache == UCS_YES) {
                status = UCS_ERR_IO_ERROR;
                goto err_close_gdr;
            } else {
                ucs_debug("could not create registration cache: %s",
                          ucs_status_string(status));
            }
        }
    }

    *md_p  = (uct_md_h)md;
    status = UCS_OK;
out:
    return status;

err_close_gdr:
    gdr_close(md->gdrcpy_ctx);
err_free_md:
    ucs_free(md);
    goto out;
}
static UCS_CLASS_INIT_FUNC(uct_ugni_udt_iface_t, uct_md_h md, uct_worker_h worker,
                           const uct_iface_params_t *params,
                           const uct_iface_config_t *tl_config)
{
    uct_ugni_iface_config_t *config = ucs_derived_of(tl_config,
                                                     uct_ugni_iface_config_t);
    ucs_status_t status;
    uct_ugni_udt_desc_t *desc;
    gni_return_t ugni_rc;
    int rc;

    UCS_CLASS_CALL_SUPER_INIT(uct_ugni_iface_t, md, worker, params,
                              &uct_ugni_udt_iface_ops,
                              &config->super UCS_STATS_ARG(NULL));

    /* Setting initial configuration */
    self->config.udt_seg_size = GNI_DATAGRAM_MAXSIZE;
    self->config.rx_headroom  = params->rx_headroom;
    self->release_desc.cb     = uct_ugni_udt_iface_release_desc;

    status = ucs_async_pipe_create(&self->event_pipe);
    if (UCS_OK != status) {
        ucs_error("Pipe creation failed");
        goto exit;
    }

    status = ucs_mpool_init(&self->free_desc, 0,
                            uct_ugni_udt_get_diff(self) +
                                self->config.udt_seg_size * 2,
                            uct_ugni_udt_get_diff(self),
                            UCS_SYS_CACHE_LINE_SIZE,    /* alignment */
                            128,                        /* grow */
                            config->mpool.max_bufs,     /* max buffers */
                            &uct_ugni_udt_desc_mpool_ops, "UGNI-UDT-DESC");
    if (UCS_OK != status) {
        ucs_error("Mpool creation failed");
        goto clean_pipe;
    }

    ugni_rc = GNI_EpCreate(uct_ugni_udt_iface_nic_handle(self), NULL,
                           &self->ep_any);
    if (GNI_RC_SUCCESS != ugni_rc) {
        ucs_error("GNI_EpCreate failed, Error status: %s %d",
                  gni_err_str[ugni_rc], ugni_rc);
        status = UCS_ERR_NO_DEVICE;
        goto clean_free_desc;
    }

    UCT_TL_IFACE_GET_TX_DESC(&self->super.super, &self->free_desc, desc,
                             goto clean_ep);
    ucs_debug("First wildcard desc is %p", desc);

    /* Init any desc */
    self->desc_any = desc;
    status = uct_ugni_udt_ep_any_post(self);
    if (UCS_OK != status) {
        /* We can't continue if we can't post the first receive */
        ucs_error("Failed to post wildcard request");
        goto clean_any_desc;
    }

    status = ucs_async_set_event_handler(self->super.super.worker->async->mode,
                                         ucs_async_pipe_rfd(&self->event_pipe),
                                         POLLIN, uct_ugni_proccess_datagram_pipe,
                                         self, self->super.super.worker->async);
    if (UCS_OK != status) {
        goto clean_cancel_desc;
    }

    pthread_mutex_init(&self->device_lock, NULL);
    pthread_cond_init(&self->device_condition, NULL);
    self->events_ready = 0;

    rc = pthread_create(&self->event_thread, NULL, uct_ugni_udt_device_thread,
                        self);
    if (0 != rc) {
        status = UCS_ERR_IO_ERROR;
        goto clean_remove_event;
    }

    return UCS_OK;

clean_remove_event:
    /* unregister the pipe event handler; the pipe itself is destroyed
     * exactly once, at clean_pipe */
    ucs_async_remove_handler(ucs_async_pipe_rfd(&self->event_pipe), 1);
clean_cancel_desc:
    uct_ugni_udt_clean_wildcard(self);
clean_any_desc:
    ucs_mpool_put(self->desc_any);
clean_ep:
    ugni_rc = GNI_EpDestroy(self->ep_any);
    if (GNI_RC_SUCCESS != ugni_rc) {
        ucs_warn("GNI_EpDestroy failed, Error status: %s %d",
                 gni_err_str[ugni_rc], ugni_rc);
    }
clean_free_desc:
    ucs_mpool_cleanup(&self->free_desc, 1);
clean_pipe:
    ucs_async_pipe_destroy(&self->event_pipe);
exit:
    uct_ugni_cleanup_base_iface(&self->super);
    ucs_error("Failed to activate interface");
    return status;
}
/*
 * Generic inline posting function.
 * Parameters which are not relevant to the opcode are ignored.
 *
 *            +--------+-----+-------+--------+-------------
 * SEND       | CTRL   | INL | am_id | am_hdr | payload ...
 *            +--------+-----+-------+--------+-------------
 * RDMA_WRITE | CTRL   | RADDR   | INL | payload ...
 *            +--------+---------+-----+---------------------
 */
static UCS_F_ALWAYS_INLINE ucs_status_t
uct_rc_mlx5_ep_inline_post(uct_rc_mlx5_ep_t *ep, unsigned opcode,
                           const void *buffer, unsigned length,
                           /* SEND */ uint8_t am_id, uint64_t am_hdr,
                           /* RDMA */ uint64_t rdma_raddr, uct_rkey_t rdma_rkey)
{
    uct_rc_mlx5_iface_t *iface = ucs_derived_of(ep->super.super.super.iface,
                                                uct_rc_mlx5_iface_t);
    struct mlx5_wqe_ctrl_seg     *ctrl;
    struct mlx5_wqe_raddr_seg    *raddr;
    struct mlx5_wqe_inl_data_seg *inl;
    uct_rc_am_short_hdr_t        *am;
    unsigned wqe_size;
    unsigned sig_flag;

    ctrl = ep->tx.seg;
    UCT_RC_MLX5_CHECK_RES(iface, ep);

    switch (opcode) {
    case MLX5_OPCODE_SEND:
        /* Set inline segment which has AM id, AM header, and AM payload */
        wqe_size = sizeof(*ctrl) + sizeof(*inl) + sizeof(*am) + length;
        UCT_CHECK_LENGTH(wqe_size, UCT_RC_MLX5_MAX_BB * MLX5_SEND_WQE_BB,
                         "am_short");
        inl              = (void*)(ctrl + 1);
        inl->byte_count  = htonl((length + sizeof(*am)) | MLX5_INLINE_SEG);
        am               = (void*)(inl + 1);
        am->rc_hdr.am_id = am_id;
        am->am_hdr       = am_hdr;
        uct_rc_mlx5_inline_copy(am + 1, buffer, length, ep);
        sig_flag = uct_rc_iface_tx_moderation(&iface->super, &ep->super,
                                              MLX5_WQE_CTRL_CQ_UPDATE);
        break;

    case MLX5_OPCODE_RDMA_WRITE:
        /* Set RDMA segment */
        if (length == 0) {
            wqe_size = sizeof(*ctrl) + sizeof(*raddr);
        } else {
            wqe_size = sizeof(*ctrl) + sizeof(*raddr) + sizeof(*inl) + length;
        }
        UCT_CHECK_LENGTH(wqe_size, UCT_RC_MLX5_MAX_BB * MLX5_SEND_WQE_BB,
                         "put_short");
        raddr = (void*)(ctrl + 1);
        uct_rc_mlx5_ep_set_rdma_seg(raddr, rdma_raddr, rdma_rkey);
        inl             = (void*)(raddr + 1);
        inl->byte_count = htonl(length | MLX5_INLINE_SEG);
        uct_rc_mlx5_inline_copy(inl + 1, buffer, length, ep);
        sig_flag = MLX5_WQE_CTRL_CQ_UPDATE;
        break;

    case MLX5_OPCODE_NOP:
        /* Empty inline segment */
        wqe_size        = sizeof(*ctrl);
        inl             = (void*)(ctrl + 1);
        inl->byte_count = htonl(MLX5_INLINE_SEG);
        sig_flag        = MLX5_WQE_CTRL_CQ_UPDATE | MLX5_WQE_CTRL_FENCE;
        break;

    default:
        return UCS_ERR_INVALID_PARAM;
    }

    uct_rc_mlx5_post_send(ep, ctrl, opcode, 0, sig_flag, wqe_size);
    return UCS_OK;
}
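/*
 * Editor's sketch: uct_rc_iface_tx_moderation, used in the SEND path above,
 * presumably requests a signaled completion only every N-th posting so that
 * the CQ needs to be polled less often; RDMA_WRITE and NOP always request one.
 * A minimal counter-based version of that idea (an assumption about the
 * helper's behavior, not its actual implementation):
 */
static inline unsigned demo_tx_moderation(unsigned *unsignaled,
                                          unsigned moderation_period,
                                          unsigned cq_update_flag)
{
    if (++(*unsignaled) >= moderation_period) {
        *unsignaled = 0;
        return cq_update_flag;  /* request a completion for this WQE */
    }
    return 0;                   /* post unsignaled */
}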
ucs_status_t uct_dc_mlx5_ep_fc_ctrl(uct_ep_t *tl_ep, unsigned op,
                                    uct_rc_fc_request_t *req)
{
    uct_dc_mlx5_ep_t *dc_ep    = ucs_derived_of(tl_ep, uct_dc_mlx5_ep_t);
    uct_dc_mlx5_iface_t *iface = ucs_derived_of(tl_ep->iface,
                                                uct_dc_mlx5_iface_t);
    uct_ib_iface_t *ib_iface   = &iface->super.super.super;
    struct ibv_ah_attr ah_attr = {.is_global = 0};
    uct_dc_fc_sender_data_t sender;
    uct_dc_fc_request_t *dc_req;
    struct mlx5_wqe_av mlx5_av;
    uct_ib_mlx5_base_av_t av;
    ucs_status_t status;
    uintptr_t sender_ep;
    struct ibv_ah *ah;
    UCT_DC_MLX5_TXQP_DECL(txqp, txwq);

    ucs_assert((sizeof(uint8_t) + sizeof(sender_ep)) <=
               UCT_IB_MLX5_AV_FULL_SIZE);

    UCT_DC_MLX5_CHECK_RES(iface, dc_ep);
    UCT_DC_MLX5_IFACE_TXQP_GET(iface, dc_ep, txqp, txwq);

    dc_req = ucs_derived_of(req, uct_dc_fc_request_t);

    if (op == UCT_RC_EP_FC_PURE_GRANT) {
        ucs_assert(req != NULL);

        sender_ep = (uintptr_t)dc_req->sender.ep;

        /* TODO: look at common code with uct_ud_mlx5_iface_get_av */
        if (dc_req->sender.global.is_global) {
            uct_ib_iface_fill_ah_attr_from_gid_lid(ib_iface, dc_req->lid,
                                                   ucs_unaligned_ptr(&dc_req->sender.global.gid),
                                                   ib_iface->path_bits[0],
                                                   &ah_attr);

            status = uct_ib_iface_create_ah(ib_iface, &ah_attr, &ah);
            if (status != UCS_OK) {
                return status;
            }

            uct_ib_mlx5_get_av(ah, &mlx5_av);
        }

        /* Note av initialization is copied from exp verbs */
        av.stat_rate_sl = ib_iface->config.sl; /* (attr->static_rate << 4) | attr->sl */
        av.fl_mlid      = ib_iface->path_bits[0] & 0x7f;

        /* lid in dc_req is in BE already */
        av.rlid         = uct_ib_iface_is_roce(ib_iface) ? 0 :
                          (dc_req->lid | htons(ib_iface->path_bits[0]));
        av.dqp_dct      = htonl(dc_req->dct_num);
        uct_dc_mlx5_iface_set_av_sport(iface, &av, dc_req->dct_num);

        if (!iface->ud_common.config.compact_av || ah_attr.is_global) {
            av.dqp_dct |= UCT_IB_MLX5_EXTENDED_UD_AV;
        }

        uct_rc_mlx5_txqp_inline_post(&iface->super, UCT_IB_QPT_DCI,
                                     txqp, txwq, MLX5_OPCODE_SEND,
                                     &av /*dummy*/, 0, op, sender_ep, 0,
                                     0, 0,
                                     &av,
                                     ah_attr.is_global ? mlx5_av_grh(&mlx5_av) :
                                                         NULL,
                                     uct_ib_mlx5_wqe_av_size(&av), 0, INT_MAX);
    } else {
        ucs_assert(op == UCT_RC_EP_FC_FLAG_HARD_REQ);
        sender.ep               = (uint64_t)dc_ep;
        sender.global.gid       = ib_iface->gid;
        sender.global.is_global = dc_ep->flags & UCT_DC_MLX5_EP_FLAG_GRH;

        UCS_STATS_UPDATE_COUNTER(dc_ep->fc.stats, UCT_RC_FC_STAT_TX_HARD_REQ, 1);

        uct_rc_mlx5_txqp_inline_post(&iface->super, UCT_IB_QPT_DCI,
                                     txqp, txwq, MLX5_OPCODE_SEND_IMM,
                                     &sender.global, sizeof(sender.global), op,
                                     sender.ep,
                                     uct_dc_mlx5_get_dct_num(iface),
                                     0, 0,
                                     &dc_ep->av,
                                     uct_dc_mlx5_ep_get_grh(dc_ep),
                                     uct_ib_mlx5_wqe_av_size(&dc_ep->av),
                                     MLX5_WQE_CTRL_SOLICITED, INT_MAX);
    }

    return UCS_OK;
}

UCS_CLASS_INIT_FUNC(uct_dc_mlx5_ep_t, uct_dc_mlx5_iface_t *iface,
                    const uct_dc_mlx5_iface_addr_t *if_addr,
                    uct_ib_mlx5_base_av_t *av)
{
    uint32_t remote_dctn;

    ucs_trace_func("");

    UCS_CLASS_CALL_SUPER_INIT(uct_base_ep_t, &iface->super.super.super.super);

    self->atomic_mr_offset = uct_ib_md_atomic_offset(if_addr->atomic_mr_id);
    remote_dctn            = uct_ib_unpack_uint24(if_addr->qp_num);

    memcpy(&self->av, av, sizeof(*av));
    self->av.dqp_dct |= htonl(remote_dctn);
    uct_dc_mlx5_iface_set_av_sport(iface, &self->av, remote_dctn);

    return uct_dc_mlx5_ep_basic_init(iface, self);
}
ssize_t uct_cm_ep_am_bcopy(uct_ep_h tl_ep, uint8_t am_id,
                           uct_pack_callback_t pack_cb, void *arg)
{
    uct_cm_iface_t *iface = ucs_derived_of(tl_ep->iface, uct_cm_iface_t);
    uct_cm_ep_t *ep       = ucs_derived_of(tl_ep, uct_cm_ep_t);
    struct ib_cm_sidr_req_param req;
    struct ibv_sa_path_rec path;
    struct ib_cm_id *id;
    ucs_status_t status;
    uct_cm_hdr_t *hdr;
    size_t payload_len;
    size_t total_len;
    int ret;

    UCT_CHECK_AM_ID(am_id);

    uct_cm_enter(iface);

    if (iface->num_outstanding >= iface->config.max_outstanding) {
        status = UCS_ERR_NO_RESOURCE;
        goto err;
    }

    /* Allocate temporary contiguous buffer */
    hdr = ucs_malloc(IB_CM_SIDR_REQ_PRIVATE_DATA_SIZE, "cm_send_buf");
    if (hdr == NULL) {
        status = UCS_ERR_NO_MEMORY;
        goto err;
    }

    payload_len = pack_cb(hdr + 1, arg);
    hdr->am_id  = am_id;
    hdr->length = payload_len;
    total_len   = sizeof(*hdr) + payload_len;

    status = uct_cm_ep_fill_path_rec(ep, &path);
    if (status != UCS_OK) {
        goto err_free;
    }

    /* Fill SIDR request */
    memset(&req, 0, sizeof req);
    req.path             = &path;
    req.service_id       = ep->dest_addr.id;
    req.timeout_ms       = iface->config.timeout_ms;
    req.private_data     = hdr;
    req.private_data_len = total_len;
    req.max_cm_retries   = iface->config.retry_count;

    /* Create temporary ID for this message. Will be released when getting REP. */
    ret = ib_cm_create_id(iface->cmdev, &id, NULL);
    if (ret) {
        ucs_error("ib_cm_create_id() failed: %m");
        status = UCS_ERR_IO_ERROR;
        goto err_free;
    }

    uct_cm_dump_path(&path);

    ret = ib_cm_send_sidr_req(id, &req);
    if (ret) {
        ucs_error("ib_cm_send_sidr_req() failed: %m");
        status = UCS_ERR_IO_ERROR;
        goto err_destroy_id;
    }

    iface->outstanding[iface->num_outstanding++] = id;

    UCT_TL_EP_STAT_OP(&ep->super, AM, BCOPY, payload_len);
    uct_cm_leave(iface);

    uct_cm_iface_trace_data(iface, UCT_AM_TRACE_TYPE_SEND, hdr,
                            "TX: SIDR_REQ [dlid %d svc 0x%"PRIx64"]",
                            ntohs(path.dlid), req.service_id);
    ucs_free(hdr);
    return payload_len;

err_destroy_id:
    ib_cm_destroy_id(id);
err_free:
    ucs_free(hdr);
err:
    uct_cm_leave(iface);
    return status;
}
static UCS_F_ALWAYS_INLINE ssize_t
uct_ugni_udt_ep_am_common_send(const unsigned is_short, uct_ugni_udt_ep_t *ep,
                               uct_ugni_udt_iface_t *iface, uint8_t am_id,
                               unsigned length, uint64_t header,
                               const void *payload, uct_pack_callback_t pack_cb,
                               void *arg)
{
    gni_return_t ugni_rc;
    uint16_t msg_length;
    uct_ugni_udt_desc_t *desc;
    uct_ugni_udt_header_t *sheader, *rheader;
    ssize_t packed_length;

    UCT_CHECK_AM_ID(am_id);

    if (ucs_unlikely(NULL != ep->posted_desc)) {
        UCT_TL_IFACE_STAT_TX_NO_DESC(&iface->super.super);
        return UCS_ERR_NO_RESOURCE;
    }

    UCT_TL_IFACE_GET_TX_DESC(&iface->super.super, &iface->free_desc, desc,
                             return UCS_ERR_NO_RESOURCE);

    rheader       = uct_ugni_udt_get_rheader(desc, iface);
    rheader->type = UCT_UGNI_UDT_EMPTY;

    sheader = uct_ugni_udt_get_sheader(desc, iface);

    if (is_short) {
        uint64_t *hdr = (uint64_t *)uct_ugni_udt_get_spayload(desc, iface);
        *hdr = header;
        memcpy((void*)(hdr + 1), payload, length);

        sheader->length = length + sizeof(header);
        msg_length      = sheader->length + sizeof(*sheader);
        UCT_TL_EP_STAT_OP(ucs_derived_of(ep, uct_base_ep_t), AM, SHORT,
                          sizeof(header) + length);
    } else {
        packed_length   = pack_cb((void *)uct_ugni_udt_get_spayload(desc, iface),
                                  arg);
        sheader->length = packed_length;
        msg_length      = sheader->length + sizeof(*sheader);
        UCT_TL_EP_STAT_OP(ucs_derived_of(ep, uct_base_ep_t), AM, BCOPY,
                          packed_length);
    }

    uct_iface_trace_am(&iface->super.super, UCT_AM_TRACE_TYPE_SEND, am_id,
                       uct_ugni_udt_get_spayload(desc, iface), length,
                       is_short ? "TX: AM_SHORT" : "TX: AM_BCOPY");

    sheader->am_id = am_id;
    sheader->type  = UCT_UGNI_UDT_PAYLOAD;

    ucs_assert_always(sheader->length <= GNI_DATAGRAM_MAXSIZE);

    pthread_mutex_lock(&uct_ugni_global_lock);
    ugni_rc = GNI_EpPostDataWId(ep->super.ep,
                                sheader, msg_length,
                                rheader, (uint16_t)iface->config.udt_seg_size,
                                ep->super.hash_key);
    pthread_mutex_unlock(&uct_ugni_global_lock);

    UCT_UGNI_UDT_CHECK_RC(ugni_rc);

    ep->posted_desc = desc;
    ++ep->super.outstanding;
    ++iface->super.outstanding;

    return is_short ? UCS_OK : packed_length;
}