static UCS_CLASS_INIT_FUNC(uct_ugni_smsg_iface_t, uct_md_h md, uct_worker_h worker,
                           const char *dev_name, size_t rx_headroom,
                           const uct_iface_config_t *tl_config)
{
    uct_ugni_iface_config_t *config = ucs_derived_of(tl_config, uct_ugni_iface_config_t);
    ucs_status_t status;
    gni_return_t ugni_rc;
    unsigned int bytes_per_mbox;
    gni_smsg_attr_t smsg_attr;

    pthread_mutex_lock(&uct_ugni_global_lock);

    UCS_CLASS_CALL_SUPER_INIT(uct_ugni_iface_t, md, worker, dev_name,
                              &uct_ugni_smsg_iface_ops,
                              &config->super UCS_STATS_ARG(NULL));

    /* Setting initial configuration */
    self->config.smsg_seg_size       = 2048;
    self->config.rx_headroom         = rx_headroom;
    self->config.smsg_max_retransmit = 16;
    self->config.smsg_max_credit     = 8;
    self->smsg_id                    = 0;

    smsg_attr.msg_type       = GNI_SMSG_TYPE_MBOX_AUTO_RETRANSMIT;
    smsg_attr.mbox_maxcredit = self->config.smsg_max_credit;
    smsg_attr.msg_maxsize    = self->config.smsg_seg_size;

    ugni_rc = GNI_SmsgBufferSizeNeeded(&smsg_attr, &bytes_per_mbox);
    if (ugni_rc != GNI_RC_SUCCESS) {
        ucs_error("Smsg buffer size calculation failed");
        status = UCS_ERR_INVALID_PARAM;
        goto exit;
    }

    /* Round the mailbox footprint up to a whole number of pages. Check the
     * uGNI return code before using the output value. */
    self->bytes_per_mbox = ucs_align_up_pow2(bytes_per_mbox, ucs_get_page_size());

    status = ucs_mpool_init(&self->free_desc, 0,
                            self->config.smsg_seg_size + sizeof(uct_ugni_smsg_desc_t),
                            0, UCS_SYS_CACHE_LINE_SIZE, /* alignment */
                            128,                        /* grow */
                            config->mpool.max_bufs,     /* max buffers */
                            &uct_ugni_smsg_desc_mpool_ops,
                            "UGNI-SMSG-DESC");
    if (UCS_OK != status) {
        ucs_error("Desc mpool creation failed");
        goto exit;
    }

    status = ucs_mpool_init(&self->free_mbox, 0,
                            self->bytes_per_mbox + sizeof(uct_ugni_smsg_mbox_t),
                            sizeof(uct_ugni_smsg_mbox_t),
                            UCS_SYS_CACHE_LINE_SIZE,    /* alignment */
                            128,                        /* grow */
                            config->mpool.max_bufs,     /* max buffers */
                            &uct_ugni_smsg_mbox_mpool_ops,
                            "UGNI-SMSG-MBOX");
    if (UCS_OK != status) {
        ucs_error("Mbox mpool creation failed");
        goto clean_desc;
    }

    UCT_TL_IFACE_GET_TX_DESC(&self->super.super, &self->free_desc,
                             self->user_desc, self->user_desc = NULL);

    status = ugni_smsg_activate_iface(self);
    if (UCS_OK != status) {
        ucs_error("Failed to activate the interface");
        goto clean_mbox;
    }

    ugni_rc = GNI_SmsgSetMaxRetrans(self->super.nic_handle,
                                    self->config.smsg_max_retransmit);
    if (ugni_rc != GNI_RC_SUCCESS) {
        ucs_error("Smsg setting max retransmit count failed");
        status = UCS_ERR_INVALID_PARAM;
        goto clean_iface;
    }

    /* TBD: eventually uct_ugni_progress has to be moved to the uct layer
     * so each ugni transport will have its own progress */
    uct_worker_progress_register(worker, uct_ugni_smsg_progress, self);
    pthread_mutex_unlock(&uct_ugni_global_lock);
    return UCS_OK;

    /* Unwind in reverse order of construction */
clean_iface:
    ugni_smsg_deactivate_iface(self);
clean_mbox:
    if (self->user_desc != NULL) {
        ucs_mpool_put(self->user_desc);
    }
    ucs_mpool_cleanup(&self->free_mbox, 1);
clean_desc:
    ucs_mpool_cleanup(&self->free_desc, 1);
exit:
    ucs_error("Failed to create SMSG interface");
    pthread_mutex_unlock(&uct_ugni_global_lock);
    return status;
}
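/*
 * A minimal sketch (not part of the original file) of the power-of-two
 * rounding that ucs_align_up_pow2() performs on the mailbox footprint
 * above. The bit trick requires 'align' to be a power of two, which holds
 * for page sizes: e.g. a 5000-byte mailbox on 4 KiB pages rounds up to
 * 8192 bytes. The helper name is hypothetical.
 */
static inline size_t example_align_up_pow2(size_t value, size_t align)
{
    /* Add (align - 1), then clear the low bits; assumes align == 2^k */
    return (value + align - 1) & ~(align - 1);
}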
ucs_status_t uct_mem_alloc(size_t min_length, uct_alloc_method_t *methods,
                           unsigned num_methods, uct_pd_h *pds, unsigned num_pds,
                           const char *alloc_name, uct_allocated_memory_t *mem)
{
    uct_alloc_method_t *method;
    uct_pd_attr_t pd_attr;
    ucs_status_t status;
    size_t alloc_length;
    unsigned pd_index;
    uct_mem_h memh;
    uct_pd_h pd;
    void *address;
    int shmid;

    if (min_length == 0) {
        ucs_error("Allocation length cannot be 0");
        return UCS_ERR_INVALID_PARAM;
    }

    if (num_methods == 0) {
        ucs_error("No allocation methods provided");
        return UCS_ERR_INVALID_PARAM;
    }

    for (method = methods; method < methods + num_methods; ++method) {
        ucs_debug("trying allocation method %s", uct_alloc_method_names[*method]);

        switch (*method) {
        case UCT_ALLOC_METHOD_PD:
            /* Allocate with one of the specified protection domains */
            for (pd_index = 0; pd_index < num_pds; ++pd_index) {
                pd = pds[pd_index];
                status = uct_pd_query(pd, &pd_attr);
                if (status != UCS_OK) {
                    ucs_error("Failed to query PD");
                    return status;
                }

                /* Check if PD supports allocation */
                if (!(pd_attr.cap.flags & UCT_PD_FLAG_ALLOC)) {
                    continue;
                }

                /* Allocate memory using the PD.
                 * If the allocation fails, it's considered an error and we
                 * don't fall back, because this PD already exposed support
                 * for memory allocation. */
                alloc_length = min_length;
                status = uct_pd_mem_alloc(pd, &alloc_length, &address,
                                          alloc_name, &memh);
                if (status != UCS_OK) {
                    ucs_error("failed to allocate %zu bytes using pd %s: %s",
                              alloc_length, pd->component->name,
                              ucs_status_string(status));
                    return status;
                }

                mem->pd   = pd;
                mem->memh = memh;
                goto allocated;
            }
            break;

        case UCT_ALLOC_METHOD_HEAP:
            /* Allocate aligned memory using libc allocator */
            alloc_length = min_length;
            address = ucs_memalign(UCS_SYS_CACHE_LINE_SIZE, alloc_length
                                   UCS_MEMTRACK_VAL);
            if (address != NULL) {
                goto allocated_without_pd;
            }

            ucs_debug("failed to allocate %zu bytes from the heap", alloc_length);
            break;

        case UCT_ALLOC_METHOD_MMAP:
            /* Request memory from operating system using mmap() */
            alloc_length = ucs_align_up_pow2(min_length, ucs_get_page_size());
            address = ucs_mmap(NULL, alloc_length, PROT_READ|PROT_WRITE,
                               MAP_PRIVATE|MAP_ANON, -1, 0 UCS_MEMTRACK_VAL);
            if (address != MAP_FAILED) {
                goto allocated_without_pd;
            }

            ucs_debug("failed to mmap %zu bytes: %m", alloc_length);
            break;

        case UCT_ALLOC_METHOD_HUGE:
            /* Allocate huge pages */
            alloc_length = min_length;
            status = ucs_sysv_alloc(&alloc_length, &address, SHM_HUGETLB,
                                    &shmid UCS_MEMTRACK_VAL);
            if (status == UCS_OK) {
                goto allocated_without_pd;
            }

            ucs_debug("failed to allocate %zu bytes from hugetlb: %s",
                      min_length, ucs_status_string(status));
            break;

        default:
            ucs_error("Invalid allocation method %d", *method);
            return UCS_ERR_INVALID_PARAM;
        }
    }

    ucs_debug("Could not allocate memory with any of the provided methods");
    return UCS_ERR_NO_MEMORY;

allocated_without_pd:
    mem->pd   = NULL;
    mem->memh = UCT_INVALID_MEM_HANDLE;
allocated:
    ucs_debug("allocated %zu bytes at %p using %s", alloc_length, address,
              (mem->pd == NULL) ? uct_alloc_method_names[*method]
                                : mem->pd->component->name);
    mem->address = address;
    mem->length  = alloc_length;
    mem->method  = *method;
    return UCS_OK;
}
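/*
 * Hypothetical caller sketch (not part of the original file), assuming the
 * pds[]/num_pds handles come from an already-opened UCT context: prefer
 * PD-based allocation and fall back to mmap(). uct_mem_alloc() records the
 * method that succeeded in mem->method, and the region is later released
 * with uct_mem_free(). The function name and "example-rx" tag are made up
 * for illustration.
 */
static ucs_status_t example_alloc_rx_buffer(uct_pd_h *pds, unsigned num_pds,
                                            uct_allocated_memory_t *mem)
{
    uct_alloc_method_t methods[] = { UCT_ALLOC_METHOD_PD,
                                     UCT_ALLOC_METHOD_MMAP };

    /* On success, mem->length may exceed the 64 KiB minimum because the
     * mmap path rounds the length up to a whole number of pages. */
    return uct_mem_alloc(65536, methods, 2, pds, num_pds, "example-rx", mem);
}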
/*
 * Generic data-pointer posting function.
 * Parameters which are not relevant to the opcode are ignored.
 *
 *            +--------+-----+-------+--------+-------+
 * SEND       | CTRL   | INL | am_id | am_hdr | DPSEG |
 *            +--------+-----+-------+--------+-------+
 * RDMA_WRITE | CTRL   | RADDR   | DPSEG  |
 *            +--------+---------+--------+-------+
 * ATOMIC     | CTRL   | RADDR   | ATOMIC | DPSEG |
 *            +--------+---------+--------+-------+
 */
static UCS_F_ALWAYS_INLINE void
uct_rc_mlx5_ep_dptr_post(uct_rc_mlx5_ep_t *ep, unsigned opcode_flags,
                         const void *buffer, unsigned length, uint32_t *lkey_p,
                         /* SEND */ uint8_t am_id, const void *am_hdr,
                         unsigned am_hdr_len,
                         /* RDMA/ATOMIC */ uint64_t remote_addr, uct_rkey_t rkey,
                         /* ATOMIC */ uint64_t compare_mask, uint64_t compare,
                         uint64_t swap_add, int signal)
{
    struct mlx5_wqe_ctrl_seg                     *ctrl;
    struct mlx5_wqe_raddr_seg                    *raddr;
    struct mlx5_wqe_atomic_seg                   *atomic;
    struct mlx5_wqe_data_seg                     *dptr;
    struct mlx5_wqe_inl_data_seg                 *inl;
    struct uct_ib_mlx5_atomic_masked_cswap32_seg *masked_cswap32;
    struct uct_ib_mlx5_atomic_masked_fadd32_seg  *masked_fadd32;
    struct uct_ib_mlx5_atomic_masked_cswap64_seg *masked_cswap64;
    uct_rc_mlx5_iface_t *iface;
    uct_rc_hdr_t *rch;
    unsigned wqe_size, inl_seg_size;
    uint8_t opmod;

    iface = ucs_derived_of(ep->super.super.super.iface, uct_rc_mlx5_iface_t);

    if (!signal) {
        signal = uct_rc_iface_tx_moderation(&iface->super, &ep->super,
                                            MLX5_WQE_CTRL_CQ_UPDATE);
    } else {
        ucs_assert(signal == MLX5_WQE_CTRL_CQ_UPDATE);
    }

    opmod = 0;
    ctrl  = ep->tx.wq.curr;
    switch (opcode_flags) {
    case MLX5_OPCODE_SEND:
        inl_seg_size = ucs_align_up_pow2(sizeof(*inl) + sizeof(*rch) + am_hdr_len,
                                         UCT_IB_MLX5_WQE_SEG_SIZE);

        ucs_assert(sizeof(*ctrl) + inl_seg_size + sizeof(*dptr) <=
                   UCT_RC_MLX5_MAX_BB * MLX5_SEND_WQE_BB);
        ucs_assert(length + sizeof(*rch) + am_hdr_len <=
                   iface->super.super.config.seg_size);

        /* Inline segment with AM ID and header */
        inl             = (void*)(ctrl + 1);
        inl->byte_count = htonl((sizeof(*rch) + am_hdr_len) | MLX5_INLINE_SEG);
        rch             = (void*)(inl + 1);
        rch->am_id      = am_id;

        uct_ib_mlx5_inline_copy(rch + 1, am_hdr, am_hdr_len, &ep->tx.wq);

        /* Data segment with payload */
        if (length == 0) {
            wqe_size = sizeof(*ctrl) + inl_seg_size;
        } else {
            wqe_size = sizeof(*ctrl) + inl_seg_size + sizeof(*dptr);
            dptr     = (void*)(ctrl + 1) + inl_seg_size;
            if (ucs_unlikely((void*)dptr >= ep->tx.wq.qend)) {
                dptr = (void*)dptr - (ep->tx.wq.qend - ep->tx.wq.qstart);
            }

            ucs_assert((void*)dptr       >= ep->tx.wq.qstart);
            ucs_assert((void*)(dptr + 1) <= ep->tx.wq.qend);

            uct_ib_mlx5_set_data_seg(dptr, buffer, length, *lkey_p);
        }
        break;

    case MLX5_OPCODE_SEND|UCT_RC_MLX5_OPCODE_FLAG_RAW:
        /* Data segment only */
        ucs_assert(length < (2ul << 30));
        ucs_assert(length <= iface->super.super.config.seg_size);

        wqe_size = sizeof(*ctrl) + sizeof(*dptr);
        uct_ib_mlx5_set_data_seg((void*)(ctrl + 1), buffer, length, *lkey_p);
        break;

    case MLX5_OPCODE_RDMA_READ:
    case MLX5_OPCODE_RDMA_WRITE:
        /* Set RDMA segment */
        ucs_assert(length <= UCT_IB_MAX_MESSAGE_SIZE);

        raddr = (void*)(ctrl + 1);
        uct_rc_mlx5_ep_set_rdma_seg(raddr, remote_addr, rkey);

        /* Data segment */
        if (length == 0) {
            wqe_size = sizeof(*ctrl) + sizeof(*raddr);
        } else {
            wqe_size = sizeof(*ctrl) + sizeof(*raddr) + sizeof(*dptr);
            uct_ib_mlx5_set_data_seg((void*)(raddr + 1), buffer, length,
                                     *lkey_p);
        }
        break;

    case MLX5_OPCODE_ATOMIC_FA:
    case MLX5_OPCODE_ATOMIC_CS:
        ucs_assert(length == sizeof(uint64_t));
        raddr = (void*)(ctrl + 1);
        uct_rc_mlx5_ep_set_rdma_seg(raddr, remote_addr, rkey);

        atomic = (void*)(raddr + 1);
        if (opcode_flags == MLX5_OPCODE_ATOMIC_CS) {
            atomic->compare = compare;
        }
        atomic->swap_add = swap_add;

        uct_ib_mlx5_set_data_seg((void*)(atomic + 1), buffer, length, *lkey_p);
        wqe_size = sizeof(*ctrl) + sizeof(*raddr) + sizeof(*atomic) +
                   sizeof(*dptr);
        break;

    case MLX5_OPCODE_ATOMIC_MASKED_CS:
        raddr = (void*)(ctrl + 1);
        uct_rc_mlx5_ep_set_rdma_seg(raddr, remote_addr, rkey);

        switch (length) {
        case sizeof(uint32_t):
            opmod                        = UCT_IB_MLX5_OPMOD_EXT_ATOMIC(2);
            masked_cswap32               = (void*)(raddr + 1);
            masked_cswap32->swap         = swap_add;
            masked_cswap32->compare      = compare;
            masked_cswap32->swap_mask    = (uint32_t)-1;
            masked_cswap32->compare_mask = compare_mask;
            dptr     = (void*)(masked_cswap32 + 1);
            wqe_size = sizeof(*ctrl) + sizeof(*raddr) +
                       sizeof(*masked_cswap32) + sizeof(*dptr);
            break;
        case sizeof(uint64_t):
            opmod = UCT_IB_MLX5_OPMOD_EXT_ATOMIC(3); /* Ext. atomic, size 2**3 */
            masked_cswap64               = (void*)(raddr + 1);
            masked_cswap64->swap         = swap_add;
            masked_cswap64->compare      = compare;
            masked_cswap64->swap_mask    = (uint64_t)-1;
            masked_cswap64->compare_mask = compare_mask;
            dptr     = (void*)(masked_cswap64 + 1);
            wqe_size = sizeof(*ctrl) + sizeof(*raddr) +
                       sizeof(*masked_cswap64) + sizeof(*dptr);

            /* Handle QP wrap-around. It cannot happen in the middle of
             * masked-cswap segment, because it's still in the first BB. */
            ucs_assert((void*)dptr <= ep->tx.wq.qend);
            if (dptr == ep->tx.wq.qend) {
                dptr = ep->tx.wq.qstart;
            } else {
                ucs_assert((void*)masked_cswap64 < ep->tx.wq.qend);
            }
            break;
        default:
            ucs_assert(0);
        }

        uct_ib_mlx5_set_data_seg(dptr, buffer, length, *lkey_p);
        break;

    case MLX5_OPCODE_ATOMIC_MASKED_FA:
        ucs_assert(length == sizeof(uint32_t));
        raddr = (void*)(ctrl + 1);
        uct_rc_mlx5_ep_set_rdma_seg(raddr, remote_addr, rkey);

        opmod                         = UCT_IB_MLX5_OPMOD_EXT_ATOMIC(2);
        masked_fadd32                 = (void*)(raddr + 1);
        masked_fadd32->add            = swap_add;
        masked_fadd32->filed_boundary = 0;

        uct_ib_mlx5_set_data_seg((void*)(masked_fadd32 + 1), buffer, length,
                                 *lkey_p);
        wqe_size = sizeof(*ctrl) + sizeof(*raddr) + sizeof(*masked_fadd32) +
                   sizeof(*dptr);
        break;

    default:
        ucs_fatal("invalid send opcode");
    }

    uct_rc_mlx5_post_send(ep, ctrl, (opcode_flags & UCT_RC_MLX5_OPCODE_MASK),
                          opmod, signal, wqe_size);
}
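/*
 * A minimal sketch (illustration only, names hypothetical) of the
 * work-queue wrap-around handling used in the SEND path above: the WQ is a
 * contiguous ring [qstart, qend), so a segment pointer that runs past qend
 * is brought back by the ring length, which is equivalent to taking the
 * offset modulo the queue size.
 */
static inline void *example_wq_wrap(void *seg, void *qstart, void *qend)
{
    if (seg >= qend) {
        /* Step back by the ring size so the segment lands inside the WQ */
        seg = (char*)seg - ((char*)qend - (char*)qstart);
    }
    return seg;
}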