static void *ucm_memalign_impl(size_t alignment, size_t size, const char *debug_name)
{
    void *ptr;

    ucm_malloc_hook_state.hook_called = 1;
    ptr = ucm_dlmemalign(ucs_max(alignment, ucm_global_config.alloc_alignment), size);
    ucm_malloc_allocated(ptr, size, debug_name);
    return ptr;
}
static ucs_status_t uct_rc_verbs_query_resources(uct_pd_h pd,
                                                 uct_tl_resource_desc_t **resources_p,
                                                 unsigned *num_resources_p)
{
    return uct_ib_device_query_tl_resources(ucs_derived_of(pd, uct_ib_device_t),
                                            "rc", 0,
                                            ucs_max(sizeof(uct_rc_hdr_t), UCT_IB_RETH_LEN),
                                            75, resources_p, num_resources_p);
}
static double ucp_wireup_score_func(uct_tl_resource_desc_t *resource,
                                    uct_iface_h iface, uct_iface_attr_t *iface_attr)
{
    if (!(iface_attr->cap.flags & UCT_IFACE_FLAG_AM_BCOPY) ||
        !(iface_attr->cap.flags & UCT_IFACE_FLAG_CONNECT_TO_IFACE))
    {
        return 0.0;
    }

    return (1e6 / resource->latency) +
           (1e3 * ucs_max(iface_attr->cap.am.max_bcopy, iface_attr->cap.am.max_short));
}
static void ucp_ep_config_print_rma_proto(FILE *stream,
                                          const ucp_ep_rma_config_t* rma_config,
                                          size_t bcopy_thresh)
{
    size_t max_short;

    max_short = ucs_max(rma_config->max_put_short + 1, bcopy_thresh);

    fprintf(stream, "# put: 0");
    if (max_short > 0) {
        fprintf(stream, "..<short>..%zu", max_short);
    }
    fprintf(stream, "..<bcopy>..(inf)\n");
    fprintf(stream, "# get: 0..<bcopy>..(inf)\n");
}
ucs_status_t ucs_async_dispatch_timerq(ucs_timer_queue_t *timerq,
                                       ucs_time_t current_time)
{
    size_t max_timers, num_timers = 0;
    int *expired_timers;
    ucs_timer_t *timer;

    max_timers     = ucs_max(1, ucs_timerq_size(timerq));
    expired_timers = ucs_alloca(max_timers * sizeof(*expired_timers));

    ucs_timerq_for_each_expired(timer, timerq, current_time, {
        expired_timers[num_timers++] = timer->id;
        if (num_timers >= max_timers) {
            break; /* Keep timers which we don't have room for in the queue */
        }
    })

    return ucs_async_dispatch_handlers(expired_timers, num_timers);
}
void ucs_class_call_cleanup_chain(ucs_class_t *cls, void *obj, int limit)
{
    ucs_class_t *c;
    int depth, skip;

    ucs_assert((limit == -1) || (limit >= 1));

    /* Count how many classes there are in the chain */
    for (depth = 0, c = cls; c != NULL; ++depth, c = c->superclass);

    /* Skip some destructors, because we may have a limit here */
    skip = (limit < 0) ? 0 : ucs_max(depth - limit, 0);
    c    = cls;
    while (skip-- > 0) {
        c = c->superclass;
    }

    /* Call the remaining destructors */
    while (c != NULL) {
        c->cleanup(obj);
        c = c->superclass;
    }
}
ucs_status_t uct_rc_verbs_iface_common_init(uct_rc_verbs_iface_common_t *iface,
                                            uct_rc_iface_t *rc_iface,
                                            uct_rc_verbs_iface_common_config_t *config,
                                            uct_rc_iface_config_t *rc_config,
                                            size_t max_hdr_size)
{
    ucs_status_t status;

    memset(iface->inl_sge, 0, sizeof(iface->inl_sge));

    /* Configuration */
    iface->config.short_desc_size = ucs_max(UCT_RC_MAX_ATOMIC_SIZE, max_hdr_size);

    /* Create AM headers and Atomic mempool */
    status = uct_iface_mpool_init(&rc_iface->super.super,
                                  &iface->short_desc_mp,
                                  sizeof(uct_rc_iface_send_desc_t) +
                                      iface->config.short_desc_size,
                                  sizeof(uct_rc_iface_send_desc_t),
                                  UCS_SYS_CACHE_LINE_SIZE,
                                  &rc_config->super.tx.mp,
                                  rc_iface->config.tx_qp_len,
                                  uct_rc_iface_send_desc_init,
                                  "rc_verbs_short_desc");
    if (status != UCS_OK) {
        return status;
    }

    iface->config.notag_hdr_size = 0;

    iface->am_inl_hdr = ucs_mpool_get(&iface->short_desc_mp);
    if (iface->am_inl_hdr == NULL) {
        ucs_error("Failed to allocate AM short header");
        return UCS_ERR_NO_MEMORY;
    }

    return UCS_OK;
}
/**
 * @param rx_headroom   Headroom requested by the user.
 * @param rx_priv_len   Length of transport private data to reserve (0 if unused).
 * @param rx_hdr_len    Length of transport network header.
 * @param mss           Maximal segment size (transport limit).
 */
UCS_CLASS_INIT_FUNC(uct_ib_iface_t, uct_ib_iface_ops_t *ops, uct_md_h md,
                    uct_worker_h worker, const uct_iface_params_t *params,
                    unsigned rx_priv_len, unsigned rx_hdr_len, unsigned tx_cq_len,
                    size_t mss, const uct_ib_iface_config_t *config)
{
    uct_ib_device_t *dev = &ucs_derived_of(md, uct_ib_md_t)->dev;
    ucs_status_t status;
    uint8_t port_num;

    UCS_CLASS_CALL_SUPER_INIT(uct_base_iface_t, &ops->super, md, worker,
                              &config->super UCS_STATS_ARG(dev->stats));

    status = uct_ib_device_find_port(dev, params->dev_name, &port_num);
    if (status != UCS_OK) {
        goto err;
    }

    self->ops                       = ops;
    self->config.rx_payload_offset  = sizeof(uct_ib_iface_recv_desc_t) +
                                      ucs_max(sizeof(uct_am_recv_desc_t) +
                                              params->rx_headroom,
                                              rx_priv_len + rx_hdr_len);
    self->config.rx_hdr_offset      = self->config.rx_payload_offset - rx_hdr_len;
    self->config.rx_headroom_offset = self->config.rx_payload_offset -
                                      params->rx_headroom;
    self->config.seg_size           = ucs_min(mss, config->super.max_bcopy);
    self->config.tx_max_poll        = config->tx.max_poll;
    self->config.rx_max_poll        = config->rx.max_poll;
    self->config.rx_max_batch       = ucs_min(config->rx.max_batch,
                                              config->rx.queue_len / 4);
    self->config.port_num           = port_num;
    self->config.sl                 = config->sl;
    self->config.gid_index          = config->gid_index;

    status = uct_ib_iface_init_pkey(self, config);
    if (status != UCS_OK) {
        goto err;
    }

    status = uct_ib_device_query_gid(dev, self->config.port_num,
                                     self->config.gid_index, &self->gid);
    if (status != UCS_OK) {
        goto err;
    }

    status = uct_ib_iface_init_lmc(self, config);
    if (status != UCS_OK) {
        goto err;
    }

    self->comp_channel = ibv_create_comp_channel(dev->ibv_context);
    if (self->comp_channel == NULL) {
        ucs_error("ibv_create_comp_channel() failed: %m");
        status = UCS_ERR_IO_ERROR;
        goto err_free_path_bits;
    }

    status = ucs_sys_fcntl_modfl(self->comp_channel->fd, O_NONBLOCK, 0);
    if (status != UCS_OK) {
        goto err_destroy_comp_channel;
    }

    status = uct_ib_iface_create_cq(self, tx_cq_len, 0, &self->send_cq);
    if (status != UCS_OK) {
        goto err_destroy_comp_channel;
    }

    status = uct_ib_iface_create_cq(self, config->rx.queue_len, config->rx.inl,
                                    &self->recv_cq);
    if (status != UCS_OK) {
        goto err_destroy_send_cq;
    }

    /* Address scope and size */
    if (config->addr_type == UCT_IB_IFACE_ADDRESS_TYPE_AUTO) {
        if (IBV_PORT_IS_LINK_LAYER_ETHERNET(uct_ib_iface_port_attr(self))) {
            self->addr_type = UCT_IB_ADDRESS_TYPE_ETH;
        } else {
            self->addr_type = uct_ib_address_scope(self->gid.global.subnet_prefix);
        }
    } else {
        ucs_assert(config->addr_type < UCT_IB_ADDRESS_TYPE_LAST);
        self->addr_type = config->addr_type;
    }

    self->addr_size = uct_ib_address_size(self->addr_type);

    ucs_debug("created uct_ib_iface_t headroom_ofs %d payload_ofs %d hdr_ofs %d data_sz %d",
              self->config.rx_headroom_offset, self->config.rx_payload_offset,
              self->config.rx_hdr_offset, self->config.seg_size);
    return UCS_OK;

err_destroy_send_cq:
    ibv_destroy_cq(self->send_cq);
err_destroy_comp_channel:
    ibv_destroy_comp_channel(self->comp_channel);
err_free_path_bits:
    ucs_free(self->path_bits);
err:
    return status;
}
static ucs_status_t uct_ib_iface_create_cq(uct_ib_iface_t *iface, int cq_length,
                                           size_t inl, struct ibv_cq **cq_p)
{
    static const char *cqe_size_env_var = "MLX5_CQE_SIZE";
    uct_ib_device_t *dev = uct_ib_iface_device(iface);
    const char *cqe_size_env_value;
    size_t cqe_size_min, cqe_size;
    char cqe_size_buf[32];
    ucs_status_t status;
    struct ibv_cq *cq;
    int env_var_added = 0;
    int ret;

    cqe_size_min       = (inl > 32) ? 128 : 64;
    cqe_size_env_value = getenv(cqe_size_env_var);

    if (cqe_size_env_value != NULL) {
        cqe_size = atol(cqe_size_env_value);
        if (cqe_size < cqe_size_min) {
            ucs_error("%s is set to %zu, but at least %zu is required (inl: %zu)",
                      cqe_size_env_var, cqe_size, cqe_size_min, inl);
            status = UCS_ERR_INVALID_PARAM;
            goto out;
        }
    } else {
        /* CQE size is not defined by the environment, set it according to
         * the inline size and the cache line size. */
        cqe_size = ucs_max(cqe_size_min, UCS_SYS_CACHE_LINE_SIZE);
        cqe_size = ucs_max(cqe_size, 64);  /* at least 64 */
        cqe_size = ucs_min(cqe_size, 128); /* at most 128 */
        snprintf(cqe_size_buf, sizeof(cqe_size_buf), "%zu", cqe_size);
        ucs_debug("%s: setting %s=%s", uct_ib_device_name(dev), cqe_size_env_var,
                  cqe_size_buf);
        ret = ibv_exp_setenv(dev->ibv_context, cqe_size_env_var, cqe_size_buf, 1);
        if (ret) {
            ucs_error("ibv_exp_setenv(%s=%s) failed: %m", cqe_size_env_var,
                      cqe_size_buf);
            status = UCS_ERR_INVALID_PARAM;
            goto out;
        }
        env_var_added = 1;
    }

    cq = ibv_create_cq(dev->ibv_context, cq_length, NULL, iface->comp_channel, 0);
    if (cq == NULL) {
        ucs_error("ibv_create_cq(cqe=%d) failed: %m", cq_length);
        status = UCS_ERR_IO_ERROR;
        goto out_unsetenv;
    }

    *cq_p  = cq;
    status = UCS_OK;

out_unsetenv:
    if (env_var_added) {
        /* If we created a new environment variable, remove it */
        ret = ibv_exp_unsetenv(dev->ibv_context, cqe_size_env_var);
        if (ret) {
            ucs_warn("unsetenv(%s) failed: %m", cqe_size_env_var);
        }
    }
out:
    return status;
}
void ucp_ep_config_init(ucp_worker_h worker, ucp_ep_config_t *config)
{
    ucp_context_h context = worker->context;
    ucp_ep_rma_config_t *rma_config;
    uct_iface_attr_t *iface_attr;
    ucp_rsc_index_t rsc_index;
    uct_md_attr_t *md_attr;
    ucp_lane_index_t lane;
    double zcopy_thresh, numerator, denumerator;
    size_t rndv_thresh;

    /* Default settings */
    config->zcopy_thresh       = SIZE_MAX;
    config->sync_zcopy_thresh  = -1;
    config->bcopy_thresh       = context->config.ext.bcopy_thresh;
    config->rndv_thresh        = SIZE_MAX;
    config->sync_rndv_thresh   = SIZE_MAX;
    config->max_rndv_get_zcopy = SIZE_MAX;
    config->p2p_lanes          = 0;

    /* Collect p2p lanes */
    for (lane = 0; lane < config->key.num_lanes; ++lane) {
        rsc_index = config->key.lanes[lane];
        if ((rsc_index != UCP_NULL_RESOURCE) &&
            ucp_worker_is_tl_p2p(worker, rsc_index))
        {
            config->p2p_lanes |= UCS_BIT(lane);
        }
    }

    /* Configuration for active messages */
    if (config->key.am_lane != UCP_NULL_LANE) {
        lane      = config->key.am_lane;
        rsc_index = config->key.lanes[lane];
        if (rsc_index != UCP_NULL_RESOURCE) {
            iface_attr = &worker->iface_attrs[rsc_index];
            md_attr    = &context->md_attrs[context->tl_rscs[rsc_index].md_index];

            if (iface_attr->cap.flags & UCT_IFACE_FLAG_AM_SHORT) {
                config->max_eager_short = iface_attr->cap.am.max_short -
                                          sizeof(ucp_eager_hdr_t);
                config->max_am_short    = iface_attr->cap.am.max_short -
                                          sizeof(uint64_t);
            }

            if (iface_attr->cap.flags & UCT_IFACE_FLAG_AM_BCOPY) {
                config->max_am_bcopy = iface_attr->cap.am.max_bcopy;
            }

            if ((iface_attr->cap.flags & UCT_IFACE_FLAG_AM_ZCOPY) &&
                (md_attr->cap.flags & UCT_MD_FLAG_REG))
            {
                config->max_am_zcopy = iface_attr->cap.am.max_zcopy;

                if (context->config.ext.zcopy_thresh == UCS_CONFIG_MEMUNITS_AUTO) {
                    /* auto */
                    zcopy_thresh = md_attr->reg_cost.overhead /
                                   ((1.0 / context->config.ext.bcopy_bw) -
                                    (1.0 / iface_attr->bandwidth) -
                                    md_attr->reg_cost.growth);
                    if (zcopy_thresh < 0) {
                        config->zcopy_thresh      = SIZE_MAX;
                        config->sync_zcopy_thresh = -1;
                    } else {
                        config->zcopy_thresh      = zcopy_thresh;
                        config->sync_zcopy_thresh = zcopy_thresh;
                    }
                } else {
                    config->zcopy_thresh      = context->config.ext.zcopy_thresh;
                    config->sync_zcopy_thresh = context->config.ext.zcopy_thresh;
                }
            }
        } else {
            config->max_am_bcopy = UCP_MIN_BCOPY; /* Stub endpoint */
        }
    }

    /* Configuration for remote memory access */
    for (lane = 0; lane < config->key.num_lanes; ++lane) {
        if (ucp_lane_map_get_lane(config->key.rma_lane_map, lane) == 0) {
            continue;
        }

        rma_config = &config->rma[lane];
        rsc_index  = config->key.lanes[lane];
        iface_attr = &worker->iface_attrs[rsc_index];
        if (rsc_index != UCP_NULL_RESOURCE) {
            if (iface_attr->cap.flags & UCT_IFACE_FLAG_PUT_SHORT) {
                rma_config->max_put_short = iface_attr->cap.put.max_short;
            }
            if (iface_attr->cap.flags & UCT_IFACE_FLAG_PUT_BCOPY) {
                rma_config->max_put_bcopy = iface_attr->cap.put.max_bcopy;
            }
            if (iface_attr->cap.flags & UCT_IFACE_FLAG_GET_BCOPY) {
                rma_config->max_get_bcopy = iface_attr->cap.get.max_bcopy;
            }
        } else {
            rma_config->max_put_bcopy = UCP_MIN_BCOPY; /* Stub endpoint */
        }
    }

    /* Configuration for rendezvous data */
    if (config->key.rndv_lane != UCP_NULL_LANE) {
        lane      = config->key.rndv_lane;
        rsc_index = config->key.lanes[lane];
        if (rsc_index != UCP_NULL_RESOURCE) {
            iface_attr = &worker->iface_attrs[rsc_index];
            md_attr    = &context->md_attrs[context->tl_rscs[rsc_index].md_index];
            ucs_assert_always(iface_attr->cap.flags & UCT_IFACE_FLAG_GET_ZCOPY);

            if (context->config.ext.rndv_thresh == UCS_CONFIG_MEMUNITS_AUTO) {
                /* auto */
                /* Make UCX calculate the rndv threshold on its own.
                 * We do this by finding the message size at which rndv and
                 * eager_zcopy reach the same latency. Starting from this
                 * message size (rndv_thresh), rndv's latency is lower and the
                 * achieved bandwidth is higher.
                 * The latency function for eager_zcopy is:
                 * [ reg_cost.overhead + size * md_attr->reg_cost.growth +
                 *   max(size/bw, size/bcopy_bw) + overhead ]
                 * The latency function for rendezvous is:
                 * [ reg_cost.overhead + size * md_attr->reg_cost.growth + latency + overhead +
                 *   reg_cost.overhead + size * md_attr->reg_cost.growth + overhead + latency +
                 *   size/bw + latency + overhead + latency ]
                 * Isolating 'size' yields the rndv_thresh.
                 * The latency functions used for eager_zcopy and rndv are also
                 * documented in the UCX wiki. */
                numerator   = (2 * iface_attr->overhead) +
                              (4 * iface_attr->latency) +
                              md_attr->reg_cost.overhead;
                denumerator = ucs_max(1.0 / iface_attr->bandwidth,
                                      1.0 / context->config.ext.bcopy_bw) -
                              md_attr->reg_cost.growth -
                              (1.0 / iface_attr->bandwidth);

                if (denumerator > 0) {
                    rndv_thresh = numerator / denumerator;
                    ucs_trace("rendezvous threshold is %zu ( = %f / %f)",
                              rndv_thresh, numerator, denumerator);
                } else {
                    rndv_thresh = context->config.ext.rndv_thresh_fallback;
                    ucs_trace("rendezvous threshold is %zu", rndv_thresh);
                }

                /* In 'auto' mode we enforce the use of rndv starting at a
                 * threshold that the user can still override. To disable rndv,
                 * set a high value for rndv_thresh (without 'auto' mode). */
                config->rndv_thresh        = rndv_thresh;
                config->sync_rndv_thresh   = rndv_thresh;
                config->max_rndv_get_zcopy = iface_attr->cap.get.max_zcopy;
            } else {
                config->rndv_thresh        = context->config.ext.rndv_thresh;
                config->sync_rndv_thresh   = context->config.ext.rndv_thresh;
                config->max_rndv_get_zcopy = iface_attr->cap.get.max_zcopy;
                ucs_trace("rendezvous threshold is %zu", config->rndv_thresh);
            }
        } else {
            ucs_debug("rendezvous protocol is not supported");
        }
    }
}
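/*
 * Worked form of the rndv_thresh derivation above (a sketch reconstructed from
 * the in-code comment, with abbreviated symbols: o = iface_attr->overhead,
 * l = iface_attr->latency, R_o/R_g = reg_cost.overhead/growth, bw = bandwidth,
 * bcopy_bw = copy bandwidth). Equating the two latency models at size = T:
 *
 *   R_o + T*R_g + max(T/bw, T/bcopy_bw) + o            (eager_zcopy)
 * = 2*R_o + 2*T*R_g + T/bw + 3*o + 4*l                 (rendezvous)
 *
 * Collecting the terms proportional to T on the left and the constants on the
 * right gives:
 *
 *   T * [ max(1/bw, 1/bcopy_bw) - R_g - 1/bw ] = 2*o + 4*l + R_o
 *
 * which is exactly the numerator / denumerator computed in the code. When the
 * bracketed factor is not positive, eager_zcopy never becomes slower than
 * rendezvous under this model, so the configurable fallback threshold is used.
 */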
/**
 * @param rx_headroom   Headroom requested by the user.
 * @param rx_priv_len   Length of transport private data to reserve (0 if unused).
 * @param rx_hdr_len    Length of transport network header.
 * @param mss           Maximal segment size (transport limit).
 */
UCS_CLASS_INIT_FUNC(uct_ib_iface_t, uct_ib_iface_ops_t *ops, uct_md_h md,
                    uct_worker_h worker, const char *dev_name, unsigned rx_headroom,
                    unsigned rx_priv_len, unsigned rx_hdr_len, unsigned tx_cq_len,
                    size_t mss, uct_ib_iface_config_t *config)
{
    uct_ib_device_t *dev = &ucs_derived_of(md, uct_ib_md_t)->dev;
    ucs_status_t status;
    uint8_t port_num;

    UCS_CLASS_CALL_SUPER_INIT(uct_base_iface_t, &ops->super, md, worker,
                              &config->super UCS_STATS_ARG(dev->stats));

    status = uct_ib_device_find_port(dev, dev_name, &port_num);
    if (status != UCS_OK) {
        goto err;
    }

    self->port_num                  = port_num;
    self->sl                        = config->sl;
    self->config.rx_payload_offset  = sizeof(uct_ib_iface_recv_desc_t) +
                                      ucs_max(sizeof(uct_am_recv_desc_t) + rx_headroom,
                                              rx_priv_len + rx_hdr_len);
    self->config.rx_hdr_offset      = self->config.rx_payload_offset - rx_hdr_len;
    self->config.rx_headroom_offset = self->config.rx_payload_offset - rx_headroom;
    self->config.seg_size           = ucs_min(mss, config->super.max_bcopy);
    self->config.tx_max_poll        = config->tx.max_poll;
    self->config.rx_max_poll        = config->rx.max_poll;
    self->config.rx_max_batch       = ucs_min(config->rx.max_batch,
                                              config->rx.queue_len / 4);
    self->ops                       = ops;

    status = uct_ib_iface_init_pkey(self, config);
    if (status != UCS_OK) {
        goto err;
    }

    status = uct_ib_iface_init_gid(self, config);
    if (status != UCS_OK) {
        goto err;
    }

    status = uct_ib_iface_init_lmc(self, config);
    if (status != UCS_OK) {
        goto err;
    }

    self->comp_channel = ibv_create_comp_channel(dev->ibv_context);
    if (self->comp_channel == NULL) {
        ucs_error("Failed to create completion channel: %m");
        status = UCS_ERR_IO_ERROR;
        goto err_free_path_bits;
    }

    status = ucs_sys_fcntl_modfl(self->comp_channel->fd, O_NONBLOCK, 0);
    if (status != UCS_OK) {
        goto err_destroy_comp_channel;
    }

    /* TODO inline scatter for send SQ */
    self->send_cq = ibv_create_cq(dev->ibv_context, tx_cq_len, NULL,
                                  self->comp_channel, 0);
    if (self->send_cq == NULL) {
        ucs_error("Failed to create send cq: %m");
        status = UCS_ERR_IO_ERROR;
        goto err_destroy_comp_channel;
    }

    if (config->rx.inl > 32 /* UCT_IB_MLX5_CQE64_MAX_INL */) {
        ibv_exp_setenv(dev->ibv_context, "MLX5_CQE_SIZE", "128", 1);
    }

    self->recv_cq = ibv_create_cq(dev->ibv_context, config->rx.queue_len, NULL,
                                  self->comp_channel, 0);
    ibv_exp_setenv(dev->ibv_context, "MLX5_CQE_SIZE", "64", 1);
    if (self->recv_cq == NULL) {
        ucs_error("Failed to create recv cq: %m");
        status = UCS_ERR_IO_ERROR;
        goto err_destroy_send_cq;
    }

    if (!uct_ib_device_is_port_ib(dev, self->port_num)) {
        ucs_error("Unsupported link layer");
        status = UCS_ERR_UNSUPPORTED;
        goto err_destroy_recv_cq;
    }

    /* Address scope and size */
    self->addr_scope = uct_ib_address_scope(self->gid.global.subnet_prefix);
    self->addr_size  = uct_ib_address_size(self->addr_scope);

    ucs_debug("created uct_ib_iface_t headroom_ofs %d payload_ofs %d hdr_ofs %d data_sz %d",
              self->config.rx_headroom_offset, self->config.rx_payload_offset,
              self->config.rx_hdr_offset, self->config.seg_size);
    return UCS_OK;

err_destroy_recv_cq:
    ibv_destroy_cq(self->recv_cq);
err_destroy_send_cq:
    ibv_destroy_cq(self->send_cq);
err_destroy_comp_channel:
    ibv_destroy_comp_channel(self->comp_channel);
err_free_path_bits:
    ucs_free(self->path_bits);
err:
    return status;
}
/**
 * @param rx_headroom   Headroom requested by the user.
 * @param rx_priv_len   Length of transport private data to reserve (0 if unused).
 * @param rx_hdr_len    Length of transport network header.
 * @param mss           Maximal segment size (transport limit).
 */
UCS_CLASS_INIT_FUNC(uct_ib_iface_t, uct_iface_ops_t *ops, uct_pd_h pd,
                    uct_worker_h worker, const char *dev_name, unsigned rx_headroom,
                    unsigned rx_priv_len, unsigned rx_hdr_len, unsigned tx_cq_len,
                    size_t mss, uct_ib_iface_config_t *config)
{
    uct_ib_device_t *dev = &ucs_derived_of(pd, uct_ib_pd_t)->dev;
    ucs_status_t status;
    uint8_t port_num;

    UCS_CLASS_CALL_SUPER_INIT(uct_base_iface_t, ops, pd, worker,
                              &config->super UCS_STATS_ARG(dev->stats));

    status = uct_ib_iface_find_port(dev, dev_name, &port_num);
    if (status != UCS_OK) {
        ucs_error("Failed to find port %s: %s", dev_name, ucs_status_string(status));
        goto err;
    }

    self->port_num                  = port_num;
    self->sl                        = config->sl;
    self->config.rx_payload_offset  = sizeof(uct_ib_iface_recv_desc_t) +
                                      ucs_max(sizeof(uct_am_recv_desc_t) + rx_headroom,
                                              rx_priv_len + rx_hdr_len);
    self->config.rx_hdr_offset      = self->config.rx_payload_offset - rx_hdr_len;
    self->config.rx_headroom_offset = self->config.rx_payload_offset - rx_headroom;
    self->config.seg_size           = config->super.max_bcopy;

    status = uct_ib_iface_init_pkey(self, config);
    if (status != UCS_OK) {
        goto err;
    }

    status = uct_ib_iface_init_gid(self, config);
    if (status != UCS_OK) {
        goto err;
    }

    status = uct_ib_iface_init_lmc(self, config);
    if (status != UCS_OK) {
        goto err;
    }

    /* TODO comp_channel */
    /* TODO inline scatter for send SQ */
    self->send_cq = ibv_create_cq(dev->ibv_context, tx_cq_len, NULL, NULL, 0);
    if (self->send_cq == NULL) {
        ucs_error("Failed to create send cq: %m");
        status = UCS_ERR_IO_ERROR;
        goto err_free_path_bits;
    }

    if (config->rx.inl > 32 /* UCT_IB_MLX5_CQE64_MAX_INL */) {
        ibv_exp_setenv(dev->ibv_context, "MLX5_CQE_SIZE", "128", 1);
    }

    self->recv_cq = ibv_create_cq(dev->ibv_context, config->rx.queue_len,
                                  NULL, NULL, 0);
    ibv_exp_setenv(dev->ibv_context, "MLX5_CQE_SIZE", "64", 1);
    if (self->recv_cq == NULL) {
        ucs_error("Failed to create recv cq: %m");
        status = UCS_ERR_IO_ERROR;
        goto err_destroy_send_cq;
    }

    if (!uct_ib_device_is_port_ib(dev, self->port_num)) {
        ucs_error("Unsupported link layer");
        status = UCS_ERR_UNSUPPORTED;
        goto err_destroy_recv_cq;
    }

    ucs_debug("created uct_ib_iface_t headroom_ofs %d payload_ofs %d hdr_ofs %d data_sz %d",
              self->config.rx_headroom_offset, self->config.rx_payload_offset,
              self->config.rx_hdr_offset, self->config.seg_size);
    return UCS_OK;

err_destroy_recv_cq:
    ibv_destroy_cq(self->recv_cq);
err_destroy_send_cq:
    ibv_destroy_cq(self->send_cq);
err_free_path_bits:
    ucs_free(self->path_bits);
err:
    return status;
}
static UCS_CLASS_INIT_FUNC(uct_rc_verbs_iface_t, uct_pd_h pd, uct_worker_h worker,
                           const char *dev_name, size_t rx_headroom,
                           const uct_iface_config_t *tl_config)
{
    uct_rc_verbs_iface_config_t *config = ucs_derived_of(tl_config,
                                                         uct_rc_verbs_iface_config_t);
    struct ibv_exp_device_attr *dev_attr;
    size_t am_hdr_size;
    ucs_status_t status;
    struct ibv_qp_cap cap;
    struct ibv_qp *qp;
    extern uct_iface_ops_t uct_rc_verbs_iface_ops;

    UCS_CLASS_CALL_SUPER_INIT(uct_rc_iface_t, &uct_rc_verbs_iface_ops, pd, worker,
                              dev_name, rx_headroom, 0, &config->super);

    /* Initialize inline work request */
    memset(&self->inl_am_wr, 0, sizeof(self->inl_am_wr));
    self->inl_am_wr.sg_list        = self->inl_sge;
    self->inl_am_wr.num_sge        = 2;
    self->inl_am_wr.opcode         = IBV_WR_SEND;
    self->inl_am_wr.send_flags     = IBV_SEND_INLINE;

    memset(&self->inl_rwrite_wr, 0, sizeof(self->inl_rwrite_wr));
    self->inl_rwrite_wr.sg_list    = self->inl_sge;
    self->inl_rwrite_wr.num_sge    = 1;
    self->inl_rwrite_wr.opcode     = IBV_WR_RDMA_WRITE;
    self->inl_rwrite_wr.send_flags = IBV_SEND_SIGNALED | IBV_SEND_INLINE;

    memset(self->inl_sge, 0, sizeof(self->inl_sge));

    /* Configuration */
    am_hdr_size = ucs_max(config->max_am_hdr, sizeof(uct_rc_hdr_t));
    self->config.short_desc_size = ucs_max(UCT_RC_MAX_ATOMIC_SIZE, am_hdr_size);

    dev_attr = &uct_ib_iface_device(&self->super.super)->dev_attr;
    if (IBV_EXP_HAVE_ATOMIC_HCA(dev_attr) || IBV_EXP_HAVE_ATOMIC_GLOB(dev_attr)) {
        self->config.atomic32_handler = uct_rc_ep_atomic_handler_32_be0;
        self->config.atomic64_handler = uct_rc_ep_atomic_handler_64_be0;
    } else if (IBV_EXP_HAVE_ATOMIC_HCA_REPLY_BE(dev_attr)) {
        self->config.atomic32_handler = uct_rc_ep_atomic_handler_32_be1;
        self->config.atomic64_handler = uct_rc_ep_atomic_handler_64_be1;
    }

    /* Create a dummy QP in order to find out max_inline */
    status = uct_rc_iface_qp_create(&self->super, &qp, &cap);
    if (status != UCS_OK) {
        goto err;
    }
    ibv_destroy_qp(qp);
    self->config.max_inline = cap.max_inline_data;

    /* Create AH headers and Atomic mempool */
    status = uct_iface_mpool_create(&self->super.super.super.super,
                                    sizeof(uct_rc_iface_send_desc_t) +
                                        self->config.short_desc_size,
                                    sizeof(uct_rc_iface_send_desc_t),
                                    UCS_SYS_CACHE_LINE_SIZE,
                                    &config->super.super.tx.mp,
                                    self->super.config.tx_qp_len,
                                    uct_rc_iface_send_desc_init,
                                    "rc_verbs_short_desc",
                                    &self->short_desc_mp);
    if (status != UCS_OK) {
        goto err;
    }

    while (self->super.rx.available > 0) {
        if (uct_rc_verbs_iface_post_recv(self, 1) == 0) {
            ucs_error("failed to post receives");
            status = UCS_ERR_NO_MEMORY;
            goto err_destroy_short_desc_mp;
        }
    }

    ucs_notifier_chain_add(&worker->progress_chain, uct_rc_verbs_iface_progress,
                           self);
    return UCS_OK;

err_destroy_short_desc_mp:
    ucs_mpool_destroy(self->short_desc_mp);
err:
    return status;
}