/*
 * Blocking put to remote memory. Sends the whole buffer, fragmenting it as
 * needed, and progresses the worker until every fragment is accepted by the
 * transport.
 *
 * @param ep           Endpoint to the remote peer.
 * @param buffer       Local source buffer.
 * @param length       Number of bytes to send.
 * @param remote_addr  Destination address on the remote side.
 * @param rkey         Remote memory key handle.
 *
 * @return UCS_OK on success, or the first non-retryable transport error.
 */
ucs_status_t ucp_rma_put(ucp_ep_h ep, const void *buffer, size_t length,
                         uint64_t remote_addr, ucp_rkey_h rkey)
{
    ucs_status_t status;
    uct_rkey_t uct_rkey;
    size_t frag_length;

    /* Zero-length put is trivially complete */
    if (length == 0) {
        return UCS_OK;
    }
    if (ENABLE_PARAMS_CHECK && (buffer == NULL)) {
        return UCS_ERR_INVALID_PARAM;
    }

    uct_rkey = UCP_RMA_RKEY_LOOKUP(ep, rkey);

    /* Loop until all message has been sent.
     * We re-check the configuration on every iteration, because it can be
     * changed by transport switch.
     */
    for (;;) {
        if (length <= ep->config.max_short_put) {
            /* Whole remainder fits in a single short put */
            status = uct_ep_put_short(ep->uct.ep, buffer, length, remote_addr,
                                      uct_rkey);
            if (ucs_likely(status != UCS_ERR_NO_RESOURCE)) {
                break; /* done, or a fatal error */
            }
        } else {
            if (length <= ep->worker->context->config.bcopy_thresh) {
                /* Below the bcopy threshold - fragment using short puts */
                frag_length = ucs_min(length, ep->config.max_short_put);
                status = uct_ep_put_short(ep->uct.ep, buffer, frag_length,
                                          remote_addr, uct_rkey);
            } else {
                /* Buffered-copy fragment; memcpy is used as the pack callback */
                frag_length = ucs_min(length, ep->config.max_bcopy_put);
                status = uct_ep_put_bcopy(ep->uct.ep,
                                          (uct_pack_callback_t)memcpy,
                                          (void*)buffer, frag_length,
                                          remote_addr, uct_rkey);
            }
            if (ucs_likely(status == UCS_OK)) {
                /* Fragment sent - advance buffer/address and continue */
                length -= frag_length;
                if (length == 0) {
                    break;
                }
                buffer      += frag_length;
                remote_addr += frag_length;
            } else if (status != UCS_ERR_NO_RESOURCE) {
                break; /* fatal error */
            }
        }
        /* Out of transport resources (or more to send) - progress and retry */
        ucp_worker_progress(ep->worker);
    }
    return status;
}
/*
 * Blocking put to remote memory (lane-aware variant). Re-resolves the RMA
 * lane and rkey on every iteration, since a wireup/transport switch can
 * change them mid-operation.
 *
 * @param ep           Endpoint to the remote peer.
 * @param buffer       Local source buffer.
 * @param length       Number of bytes to send.
 * @param remote_addr  Destination address on the remote side.
 * @param rkey         Remote memory key handle.
 *
 * @return UCS_OK on success, or the first non-retryable transport error.
 */
ucs_status_t ucp_put(ucp_ep_h ep, const void *buffer, size_t length,
                     uint64_t remote_addr, ucp_rkey_h rkey)
{
    ucp_ep_rma_config_t *rma_config;
    ucs_status_t status;
    uct_rkey_t uct_rkey;
    size_t frag_length;
    ssize_t packed_len;
    ucp_lane_index_t lane;

    UCP_RMA_CHECK_PARAMS(buffer, length);

    /* Loop until all message has been sent.
     * We re-check the configuration on every iteration, because it can be
     * changed by transport switch.
     */
    for (;;) {
        UCP_EP_RESOLVE_RKEY_RMA(ep, rkey, lane, uct_rkey, rma_config);
        if (length <= rma_config->max_put_short) {
            /* Whole remainder fits in a single short put */
            status = uct_ep_put_short(ep->uct_eps[lane], buffer, length,
                                      remote_addr, uct_rkey);
            if (ucs_likely(status != UCS_ERR_NO_RESOURCE)) {
                break; /* done, or a fatal error */
            }
        } else {
            if (length <= ucp_ep_config(ep)->bcopy_thresh) {
                /* Below the bcopy threshold - fragment with short puts */
                frag_length = ucs_min(length, rma_config->max_put_short);
                status = uct_ep_put_short(ep->uct_eps[lane], buffer,
                                          frag_length, remote_addr, uct_rkey);
            } else {
                /* Buffered-copy fragment; put_bcopy returns the packed
                 * length on success or a negative ucs_status_t on error */
                ucp_memcpy_pack_context_t pack_ctx;
                pack_ctx.src    = buffer;
                pack_ctx.length = frag_length =
                    ucs_min(length, rma_config->max_put_bcopy);
                packed_len = uct_ep_put_bcopy(ep->uct_eps[lane],
                                              ucp_memcpy_pack, &pack_ctx,
                                              remote_addr, uct_rkey);
                status = (packed_len > 0) ? UCS_OK : (ucs_status_t)packed_len;
            }
            if (ucs_likely(status == UCS_OK)) {
                /* Fragment sent - advance and continue */
                length -= frag_length;
                if (length == 0) {
                    break;
                }
                buffer      += frag_length;
                remote_addr += frag_length;
            } else if (status != UCS_ERR_NO_RESOURCE) {
                break; /* fatal error */
            }
        }
        /* Out of resources - progress the worker and retry */
        ucp_worker_progress(ep->worker);
    }
    return status;
}
/*
 * Pending-queue progress callback for a non-blocking put request.
 * Retries sending the remaining data of the request; on UCS_ERR_NO_RESOURCE
 * the error is returned so the pending framework reschedules the request.
 *
 * @param self  Pending request descriptor embedded in a ucp_request_t.
 *
 * @return UCS_OK when the request is fully sent (and completed), otherwise
 *         the transport status (e.g. UCS_ERR_NO_RESOURCE to reschedule).
 */
static ucs_status_t ucp_progress_put_nbi(uct_pending_req_t *self)
{
    ucp_request_t *req = ucs_container_of(self, ucp_request_t, send.uct);
    ucp_rkey_h rkey    = req->send.rma.rkey;
    ucp_ep_t *ep       = req->send.ep;
    ucp_ep_rma_config_t *rma_config;
    ucs_status_t status;
    uct_rkey_t uct_rkey;
    ssize_t packed_len;
    uct_ep_h uct_ep;

    UCP_EP_RESOLVE_RKEY_RMA(ep, rkey, uct_ep, uct_rkey, rma_config);
    for (;;) {
        if (req->send.length <= ep->worker->context->config.ext.bcopy_thresh) {
            /* Should be replaced with bcopy */
            packed_len = ucs_min(req->send.length, rma_config->max_put_short);
            status = uct_ep_put_short(uct_ep, req->send.buffer, packed_len,
                                      req->send.rma.remote_addr, uct_rkey);
        } else {
            /* We don't do it right now, but in future we have to add
             * an option to use zcopy */
            ucp_memcpy_pack_context_t pack_ctx;
            pack_ctx.src    = req->send.buffer;
            pack_ctx.length = ucs_min(req->send.length,
                                      rma_config->max_put_bcopy);
            /* put_bcopy returns packed length, or negative status on error */
            packed_len = uct_ep_put_bcopy(uct_ep, ucp_memcpy_pack, &pack_ctx,
                                          req->send.rma.remote_addr, uct_rkey);
            status = (packed_len > 0) ? UCS_OK : (ucs_status_t)packed_len;
        }
        if (ucs_likely(status == UCS_OK || status == UCS_INPROGRESS)) {
            /* Fragment posted - advance request state */
            req->send.length -= packed_len;
            if (req->send.length == 0) {
                ucp_request_complete(req, void);
                break;
            }
            req->send.buffer          += packed_len;
            req->send.rma.remote_addr += packed_len;
        } else {
            /* NO_RESOURCE (reschedule) or fatal error - return it */
            break;
        }
    }
    return status;
}
/*
 * Format a textual description of an IB scatter-gather list into 'buf', and
 * optionally invoke a payload dump callback on the (truncated) copied data.
 *
 * @param sg_list        Array of scatter-gather entries.
 * @param num_sge        Number of entries in sg_list.
 * @param inline_bitmap  Bit i set means entry i was sent inline (no lkey).
 * @param data_dump      Optional callback to render the payload bytes; may
 *                       be NULL.
 * @param buf            Output string buffer.
 * @param max            Size of 'buf' in bytes.
 */
void uct_ib_log_dump_sg_list(struct ibv_sge *sg_list, int num_sge,
                             uint64_t inline_bitmap,
                             uct_log_data_dump_func_t data_dump,
                             char *buf, size_t max)
{
    char data[256];                 /* local staging copy of payload bytes */
    size_t total_len       = 0;     /* bytes actually staged in 'data' */
    size_t total_valid_len = 0;     /* fix: was "= 0;;" (stray semicolon) */
    char *s    = buf;
    char *ends = buf + max;
    void *pd   = data;
    size_t len;
    int i;

    for (i = 0; i < num_sge; ++i) {
        if (inline_bitmap & UCS_BIT(i)) {
            snprintf(s, ends - s, " [inl len %d]", sg_list[i].length);
        } else {
            snprintf(s, ends - s, " [va 0x%"PRIx64" len %d lkey 0x%x]",
                     sg_list[i].addr, sg_list[i].length, sg_list[i].lkey);
        }

        /* Copy as much of this entry's payload as still fits in 'data' */
        len = ucs_min(sg_list[i].length, (void*)data + sizeof(data) - pd);
        memcpy(pd, (void*)sg_list[i].addr, len);

        s               += strlen(s);
        pd              += len;
        total_len       += len;
        total_valid_len += sg_list[i].length;
    }

    if (data_dump != NULL) {
        data_dump(data, total_len, total_valid_len, s, ends - s);
    }
}
/*
 * Scatter a contiguous source buffer into an iov array, resuming from the
 * position recorded in (*iovcnt_offset, *iov_offset) and updating it for the
 * next call.
 *
 * @param iov            Destination iov array.
 * @param iovcnt         Number of entries in the iov array.
 * @param src            Contiguous source buffer.
 * @param length         Number of bytes to scatter.
 * @param iov_offset     In/out: byte offset within the current iov entry.
 * @param iovcnt_offset  In/out: index of the current iov entry.
 *
 * @return Number of bytes actually copied.
 */
size_t ucp_dt_iov_scatter(ucp_dt_iov_t *iov, size_t iovcnt, const void *src,
                          size_t length, size_t *iov_offset,
                          size_t *iovcnt_offset)
{
    size_t copied = 0;
    size_t entry_len, chunk;

    while ((copied < length) && (*iovcnt_offset < iovcnt)) {
        entry_len = iov[*iovcnt_offset].length;
        /* Room left in the current entry (clamped at 0), bounded by what
         * remains to be copied */
        chunk = ucs_min(ucs_max((ssize_t)(entry_len - *iov_offset), 0),
                        length - copied);
        ucs_assert(*iov_offset <= entry_len);

        memcpy(iov[*iovcnt_offset].buffer + *iov_offset, src + copied, chunk);
        copied += chunk;
        ucs_assert(copied <= length);

        if (copied < length) {
            /* Current entry is exhausted - move on to the next one */
            *iov_offset = 0;
            ++(*iovcnt_offset);
        } else {
            /* Done - remember the offset inside the current entry */
            *iov_offset += chunk;
        }
    }
    return copied;
}
/*
 * realloc() hook. Pointers created by the original allocator (or any pointer
 * while running under valgrind) are migrated to the hooked heap by
 * allocate-copy-free; otherwise the request is forwarded to ucm_dlrealloc.
 *
 * @param oldptr  Existing allocation, or NULL.
 * @param size    Requested new size.
 * @param caller  Caller address (unused here, part of the hook signature).
 *
 * @return New pointer, or NULL on allocation failure (old block preserved).
 */
static void *ucm_realloc(void *oldptr, size_t size, const void *caller)
{
    void *newptr;
    size_t oldsz;
    int foreign;

    ucm_malloc_hook_state.hook_called = 1;
    if (oldptr != NULL) {
        foreign = !ucm_malloc_address_remove_if_managed(oldptr, "realloc");
        if (RUNNING_ON_VALGRIND || foreign) {
            /* If pointer was created by original malloc(), allocate the new pointer
             * with the new heap, and copy out the data. Then, release the old pointer.
             * We do the same if we are running with valgrind, so we could use client
             * requests properly.
             */
            newptr = ucm_dlmalloc(size);
            if (newptr == NULL) {
                /* Fix: previously memcpy() dereferenced NULL on allocation
                 * failure. Follow the realloc() contract: return NULL and
                 * leave the old block untouched.
                 * NOTE(review): oldptr was already removed from the managed
                 * set above - confirm whether it should be re-registered. */
                return NULL;
            }
            ucm_malloc_allocated(newptr, size, "realloc");
            oldsz = ucm_malloc_hook_state.usable_size(oldptr);
            memcpy(newptr, oldptr, ucs_min(size, oldsz));
            if (foreign) {
                ucm_release_foreign_block(oldptr, ucm_malloc_hook_state.free,
                                          "realloc");
            } else {
                ucm_mem_free(oldptr, oldsz);
            }
            return newptr;
        }
    }

    newptr = ucm_dlrealloc(oldptr, size);
    ucm_malloc_allocated(newptr, size, "realloc");
    return newptr;
}
/*
 * Remove all timers with the given id from the queue, recomputing the
 * minimal interval of the remaining timers along the way.
 *
 * @param timerq    Timer queue.
 * @param timer_id  Identifier of the timer(s) to remove.
 *
 * @return UCS_OK if at least one timer was removed, UCS_ERR_NO_ELEM otherwise.
 */
ucs_status_t ucs_timerq_remove(ucs_timer_queue_t *timerq, int timer_id)
{
    ucs_status_t status;
    ucs_timer_t *ptr;

    ucs_trace_func("timerq=%p timer_id=%d", timerq, timer_id);

    status = UCS_ERR_NO_ELEM;

    pthread_spin_lock(&timerq->lock);
    timerq->min_interval = UCS_TIME_INFINITY;
    ptr = timerq->timers;
    while (ptr < timerq->timers + timerq->num_timers) {
        if (ptr->id == timer_id) {
            /* Swap-erase: move the last timer into this slot. Note that ptr
             * is deliberately NOT advanced, so the swapped-in timer is
             * examined on the next iteration as well. */
            *ptr = timerq->timers[--timerq->num_timers];
            status = UCS_OK;
        } else {
            /* Keep the running minimum over surviving timers */
            timerq->min_interval = ucs_min(timerq->min_interval, ptr->interval);
            ++ptr;
        }
    }

    /* TODO realloc - shrink */

    if (timerq->num_timers == 0) {
        ucs_assert(timerq->min_interval == UCS_TIME_INFINITY);
        free(timerq->timers);
        timerq->timers = NULL;
    } else {
        ucs_assert(timerq->min_interval != UCS_TIME_INFINITY);
    }

    pthread_spin_unlock(&timerq->lock);
    return status;
}
/*
 * Print a one-line map of the tag-matching protocol chosen per message size:
 * eager/short, eager/bcopy, eager/zcopy, and finally rendezvous.
 *
 * @param stream           Output stream.
 * @param name             Protocol row label.
 * @param max_eager_short  Largest size sent with eager/short.
 * @param zcopy_thresh     First size using eager/zcopy.
 * @param rndv_thresh      First size using rendezvous (SIZE_MAX = never).
 */
static void ucp_ep_config_print_tag_proto(FILE *stream, const char *name,
                                          size_t max_eager_short,
                                          size_t zcopy_thresh,
                                          size_t rndv_thresh)
{
    size_t bcopy_limit;

    fprintf(stream, "# %18s: 0", name);

    if (max_eager_short > 0) {
        fprintf(stream, "..<egr/short>..%zu", max_eager_short + 1);
    }

    /* bcopy covers sizes up to whichever of zcopy/rndv kicks in first */
    bcopy_limit = ucs_min(zcopy_thresh, rndv_thresh);
    if (max_eager_short < bcopy_limit) {
        fprintf(stream, "..<egr/bcopy>..%zu", bcopy_limit);
    }

    if (zcopy_thresh < rndv_thresh) {
        fprintf(stream, "..<egr/zcopy>..");
        if (rndv_thresh < SIZE_MAX) {
            fprintf(stream, "%zu", rndv_thresh);
        }
    }

    if (rndv_thresh < SIZE_MAX) {
        fprintf(stream, "..<rndv>..");
    }

    fprintf(stream, "(inf)\n");
}
/*
 * Score an interface pair for RMA wireup selection.
 * The score is the inverse of the estimated time to transfer a 4KB message
 * (latency + CPU overhead + wire time at the bottleneck bandwidth), so a
 * faster path yields a higher score.
 */
static double ucp_wireup_rma_score_func(const uct_md_attr_t *md_attr,
                                        const uct_iface_attr_t *iface_attr,
                                        const ucp_wireup_iface_attr_t *remote_iface_attr)
{
    /* best for 4k messages */
    double bottleneck_bw = ucs_min(iface_attr->bandwidth,
                                   remote_iface_attr->bandwidth);
    double est_time      = iface_attr->latency + iface_attr->overhead +
                           (4096.0 / bottleneck_bw);

    return 1e-3 / est_time;
}
/*
 * Resolve a configured memory-units value.
 * The special value UCS_CONFIG_MEMUNITS_AUTO maps to 'auto_size'; any
 * explicit value is clamped to 'max_size'.
 *
 * @return The effective size to use.
 */
size_t ucs_config_memunits_get(size_t config_size, size_t auto_size,
                               size_t max_size)
{
    if (config_size != UCS_CONFIG_MEMUNITS_AUTO) {
        return ucs_min(config_size, max_size);
    }
    return auto_size;
}
/*
 * Unpack a length-prefixed string (one length byte followed by the
 * characters) into 's', truncating to fit 'max' bytes including the NUL
 * terminator, and return a pointer to the next storage byte.
 *
 * @param src  Packed input: [len:uint8][len chars].
 * @param s    Output buffer (always NUL-terminated).
 * @param max  Size of 's' in bytes; must be at least 1.
 *
 * @return Pointer just past the packed string in 'src'.
 */
static const void* ucp_address_unpack_string(const void *src, char *s,
                                             size_t max)
{
    size_t length, avail;

    ucs_assert(max >= 1);

    length = *(const uint8_t*)src;
    avail  = ucs_min(length, max - 1);
    memcpy(s, src + 1, avail);
    s[avail] = '\0';

    /* Advance past the full packed string even if we truncated the copy */
    return src + 1 + length;
}
/*
 * Blocking get from remote memory. Posts bcopy-get fragments until the whole
 * range is covered, then spins on worker progress until all outstanding
 * fragments complete (tracked via the shared uct_completion_t counter).
 *
 * @param ep           Endpoint to the remote peer.
 * @param buffer       Local destination buffer.
 * @param length       Number of bytes to fetch.
 * @param remote_addr  Source address on the remote side.
 * @param rkey         Remote memory key handle.
 *
 * @return UCS_OK on success, or the first fatal transport error.
 */
ucs_status_t ucp_rma_get(ucp_ep_h ep, void *buffer, size_t length,
                         uint64_t remote_addr, ucp_rkey_h rkey)
{
    uct_completion_t comp;
    ucs_status_t status;
    uct_rkey_t uct_rkey;
    size_t frag_length;

    if (length == 0) {
        return UCS_OK;
    }

    uct_rkey = UCP_RMA_RKEY_LOOKUP(ep, rkey);

    /* comp.count starts at 1 so the final wait loop below can test ">1" for
     * outstanding operations */
    comp.count = 1;

    for (;;) {
        /* Push out all fragments, and request completion only for the last
         * fragment.
         */
        frag_length = ucs_min(ep->config.max_bcopy_get, length);
        status = uct_ep_get_bcopy(ep->uct.ep, (uct_unpack_callback_t)memcpy,
                                  (void*)buffer, frag_length, remote_addr,
                                  uct_rkey, &comp);
        if (ucs_likely(status == UCS_OK)) {
            /* Completed immediately */
            goto posted;
        } else if (status == UCS_INPROGRESS) {
            /* Will complete asynchronously - count it */
            ++comp.count;
            goto posted;
        } else if (status == UCS_ERR_NO_RESOURCE) {
            /* Transport busy - progress and retry the same fragment */
            goto retry;
        } else {
            return status;
        }

posted:
        length -= frag_length;
        if (length == 0) {
            break;
        }
        buffer      += frag_length;
        remote_addr += frag_length;
retry:
        ucp_worker_progress(ep->worker);
    }

    /* Wait for all async fragments to complete */
    /* coverity[loop_condition] */
    while (comp.count > 1) {
        ucp_worker_progress(ep->worker);
    };
    return UCS_OK;
}
/*
 * Non-blocking get from remote memory. Posts bcopy-get fragments; if the
 * transport runs out of resources, the remaining work is queued as a pending
 * request to be continued by ucp_progress_get_nbi().
 *
 * @param ep           Endpoint to the remote peer.
 * @param buffer       Local destination buffer.
 * @param length       Number of bytes to fetch.
 * @param remote_addr  Source address on the remote side.
 * @param rkey         Remote memory key handle.
 *
 * @return UCS_OK/UCS_INPROGRESS on success, error status otherwise.
 */
ucs_status_t ucp_get_nbi(ucp_ep_h ep, void *buffer, size_t length,
                         uint64_t remote_addr, ucp_rkey_h rkey)
{
    ucp_ep_rma_config_t *rma_config;
    ucs_status_t status;
    uct_rkey_t uct_rkey;
    size_t frag_length;
    uct_ep_h uct_ep;

    UCP_RMA_CHECK_PARAMS(buffer, length);

    for (;;) {
        UCP_EP_RESOLVE_RKEY_RMA(ep, rkey, uct_ep, uct_rkey, rma_config);
        frag_length = ucs_min(rma_config->max_get_bcopy, length);
        status = uct_ep_get_bcopy(uct_ep, (uct_unpack_callback_t)memcpy,
                                  (void*)buffer, frag_length, remote_addr,
                                  uct_rkey, NULL);
        if (ucs_likely(status == UCS_OK || status == UCS_INPROGRESS)) {
            /* Get was initiated */
            length      -= frag_length;
            buffer      += frag_length;
            remote_addr += frag_length;
            if (length == 0) {
                break;
            }
        } else if (ucs_unlikely(status == UCS_ERR_NO_RESOURCE)) {
            /* Out of resources - adding request for later schedule */
            ucp_request_t *req;
            req = ucs_mpool_get_inline(&ep->worker->req_mp);
            if (req == NULL) {
                /* can't allocate memory for request - abort */
                status = UCS_ERR_NO_MEMORY;
                break;
            }
            ucp_add_pending_rma(req, ep, uct_ep, buffer, length, remote_addr,
                                rkey, ucp_progress_get_nbi);

            /* Mark it as in progress */
            status = UCS_INPROGRESS;
            break;
        } else {
            /* Error */
            break;
        }
    }
    return status;
}
static ucs_status_t uct_posix_test_mem(size_t length, int shm_fd) { int *buf; int chunk_size = 256 * 1024; ucs_status_t status = UCS_OK; size_t size_to_write, remaining; ssize_t single_write; buf = ucs_malloc(chunk_size, "write buffer"); if (buf == NULL) { ucs_error("Failed to allocate memory for testing space for backing file."); status = UCS_ERR_NO_MEMORY; goto out; } memset(buf, 0, chunk_size); if (lseek(shm_fd, 0, SEEK_SET) < 0) { ucs_error("lseek failed. %m"); status = UCS_ERR_IO_ERROR; goto out_free_buf; } remaining = length; while (remaining > 0) { size_to_write = ucs_min(remaining, chunk_size); single_write = write(shm_fd, buf, size_to_write); if (single_write < 0) { ucs_error("Failed to write %zu bytes. %m", size_to_write); status = UCS_ERR_IO_ERROR; goto out_free_buf; } if (single_write != size_to_write) { ucs_error("Not enough memory to write total of %zu bytes. " "Please check that /dev/shm or the directory you specified has " "more available memory.", length); status = UCS_ERR_NO_MEMORY; goto out_free_buf; } remaining -= size_to_write; } out_free_buf: ucs_free(buf); out: return status; }
/*
 * Report the capabilities of the IB-CM (SIDR) interface.
 * The AM bcopy limit is the SIDR private-data payload minus our header,
 * capped at UINT8_MAX.
 *
 * @return UCS_OK always.
 */
static ucs_status_t uct_cm_iface_query(uct_iface_h tl_iface,
                                       uct_iface_attr_t *iface_attr)
{
    size_t mtu = ucs_min(IB_CM_SIDR_REQ_PRIVATE_DATA_SIZE - sizeof(uct_cm_hdr_t),
                         UINT8_MAX);

    memset(iface_attr, 0, sizeof(*iface_attr));
    iface_attr->cap.am.max_bcopy = mtu;
    iface_attr->iface_addr_len   = sizeof(uct_sockaddr_ib_t);
    iface_attr->ep_addr_len      = 0;
    iface_attr->cap.flags        = /* UCT_IFACE_FLAG_AM_BCOPY | enable only in case of emergency */
                                   UCT_IFACE_FLAG_PENDING |
                                   UCT_IFACE_FLAG_CONNECT_TO_IFACE;
    return UCS_OK;
}
/*
 * Hash the host name into a 64-bit value by folding it in sizeof(uint64_t)
 * chunks, each weighted by a successive prime starting at 'prime_index'.
 *
 * @param prime_index  Index of the first prime to use as a weight.
 *
 * @return Weighted sum of the host-name chunks.
 */
static uint64_t __sumup_host_name(unsigned prime_index)
{
    const char *chunk_ptr = ucs_get_host_name();
    unsigned prime_idx    = prime_index;
    uint64_t sum          = 0;
    uint64_t chunk;

    while (*chunk_ptr != '\0') {
        /* Pack up to 8 characters into an integer (zero-padded) */
        chunk = 0;
        strncpy((char*)&chunk, chunk_ptr, sizeof(chunk));
        sum += ucs_get_prime(prime_idx) * chunk;
        ++prime_idx;
        chunk_ptr += ucs_min(sizeof(chunk), strlen(chunk_ptr));
    }
    return sum;
}
/*
 * Add a periodic timer to the queue.
 *
 * @param timerq    Timer queue.
 * @param timer_id  Unique identifier for the timer; duplicates are rejected.
 * @param interval  Firing interval.
 *
 * @return UCS_OK on success, UCS_ERR_ALREADY_EXISTS if the id is taken,
 *         UCS_ERR_NO_MEMORY if the timer array cannot grow.
 */
ucs_status_t ucs_timerq_add(ucs_timer_queue_t *timerq, int timer_id,
                            ucs_time_t interval)
{
    ucs_status_t status;
    ucs_timer_t *ptr;

    ucs_trace_func("timerq=%p interval=%.2fus timer_id=%d", timerq,
                   ucs_time_to_usec(interval), timer_id);

    pthread_spin_lock(&timerq->lock);

    /* Make sure ID is unique */
    for (ptr = timerq->timers; ptr < timerq->timers + timerq->num_timers; ++ptr) {
        if (ptr->id == timer_id) {
            status = UCS_ERR_ALREADY_EXISTS;
            goto out_unlock;
        }
    }

    /* Resize timer array */
    ptr = ucs_realloc(timerq->timers,
                      (timerq->num_timers + 1) * sizeof(ucs_timer_t), "timerq");
    if (ptr == NULL) {
        status = UCS_ERR_NO_MEMORY;
        goto out_unlock;
    }
    timerq->timers = ptr;

    ++timerq->num_timers;
    /* Track the minimal interval over all timers for cheap sweep scheduling */
    timerq->min_interval = ucs_min(interval, timerq->min_interval);
    ucs_assert(timerq->min_interval != UCS_TIME_INFINITY);

    /* Initialize the new timer */
    ptr = &timerq->timers[timerq->num_timers - 1];
    ptr->expiration = 0; /* will fire the next time sweep is called */
    ptr->interval   = interval;
    ptr->id         = timer_id;
    status = UCS_OK;

out_unlock:
    pthread_spin_unlock(&timerq->lock);
    return status;
}
/*
 * Pending-queue progress callback for a non-blocking get request (looping
 * variant). Keeps posting bcopy-get fragments until the request is fully
 * posted or the transport returns an error.
 *
 * @param self  Pending request descriptor embedded in a ucp_request_t.
 *
 * @return UCS_OK when the request is fully posted (and completed), otherwise
 *         the transport status (e.g. UCS_ERR_NO_RESOURCE to reschedule).
 */
static ucs_status_t ucp_progress_get_nbi(uct_pending_req_t *self)
{
    ucp_request_t *req = ucs_container_of(self, ucp_request_t, send.uct);
    ucp_rkey_h rkey    = req->send.rma.rkey;
    ucp_ep_t *ep       = req->send.ep;
    ucp_ep_rma_config_t *rma_config;
    ucs_status_t status;
    uct_rkey_t uct_rkey;
    size_t frag_length;
    uct_ep_h uct_ep;

    UCP_EP_RESOLVE_RKEY_RMA(ep, rkey, uct_ep, uct_rkey, rma_config);
    for (;;) {
        frag_length = ucs_min(rma_config->max_get_bcopy, req->send.length);
        status = uct_ep_get_bcopy(uct_ep, (uct_unpack_callback_t)memcpy,
                                  (void*)req->send.buffer, frag_length,
                                  req->send.rma.remote_addr, uct_rkey, NULL);
        if (ucs_likely(status == UCS_OK || status == UCS_INPROGRESS)) {
            /* Get was initiated */
            req->send.length          -= frag_length;
            req->send.buffer          += frag_length;
            req->send.rma.remote_addr += frag_length;
            if (req->send.length == 0) {
                /* Get was posted */
                ucp_request_complete(req, void);
                status = UCS_OK;
                break;
            }
        } else {
            /* Error - abort */
            break;
        }
    }
    return status;
}
/*
 * Constructor of an RC mlx5 endpoint: initializes the mlx5 TX work queue on
 * top of the generic RC endpoint, clamps the BB budget to the interface
 * limit, and registers the iface progress callback with the worker.
 */
static UCS_CLASS_INIT_FUNC(uct_rc_mlx5_ep_t, uct_iface_h tl_iface)
{
    uct_rc_mlx5_iface_t *iface = ucs_derived_of(tl_iface, uct_rc_mlx5_iface_t);
    ucs_status_t status;

    UCS_CLASS_CALL_SUPER_INIT(uct_rc_ep_t, &iface->super);

    /* Extract mlx5-specific work-queue information from the QP */
    status = uct_ib_mlx5_txwq_init(iface->super.super.super.worker,
                                   &self->tx.wq, self->super.txqp.qp);
    if (status != UCS_OK) {
        ucs_error("Failed to get mlx5 QP information");
        return status;
    }

    self->qp_num = self->super.txqp.qp->qp_num;
    /* Do not exceed the per-interface building-block budget */
    self->tx.wq.bb_max = ucs_min(self->tx.wq.bb_max, iface->tx.bb_max);
    uct_rc_txqp_available_set(&self->super.txqp, self->tx.wq.bb_max);

    uct_worker_progress_register(iface->super.super.super.worker,
                                 uct_rc_mlx5_iface_progress, iface);
    return UCS_OK;
}
/*
 * Initialize the receive-descriptor memory pool for an IB interface.
 * The pool growth step is at least 1024 elements; for larger RX queues it is
 * the queue length plus ~10% slack (to avoid frequent pool expansion),
 * capped by the configured maximum number of buffers.
 *
 * @return Status of uct_iface_mpool_init().
 */
ucs_status_t uct_ib_iface_recv_mpool_init(uct_ib_iface_t *iface,
                                          const uct_ib_iface_config_t *config,
                                          const char *name, ucs_mpool_t *mp)
{
    unsigned grow;

    if (config->rx.queue_len >= 1024) {
        /* We want to have some free (+10%) elements to avoid mem pool expansion */
        grow = ucs_min((int)(1.1 * config->rx.queue_len + 0.5),
                       config->rx.mp.max_bufs);
    } else {
        grow = 1024;
    }

    return uct_iface_mpool_init(&iface->super, mp,
                                iface->config.rx_payload_offset +
                                iface->config.seg_size,
                                iface->config.rx_hdr_offset,
                                UCS_SYS_CACHE_LINE_SIZE,
                                &config->rx.mp, grow,
                                uct_ib_iface_recv_desc_init, name);
}
/*
 * Report the capabilities of the IB-CM (SIDR) interface, on top of the
 * generic IB interface attributes. The AM bcopy limit is the SIDR
 * private-data payload minus our header, capped at UINT8_MAX.
 *
 * @return UCS_OK always.
 */
static ucs_status_t uct_cm_iface_query(uct_iface_h tl_iface,
                                       uct_iface_attr_t *iface_attr)
{
    uct_cm_iface_t *iface = ucs_derived_of(tl_iface, uct_cm_iface_t);
    size_t mtu;

    uct_ib_iface_query(&iface->super, 32 /* TODO */, iface_attr);

    iface_attr->overhead = 1200e-9;

    mtu = ucs_min(IB_CM_SIDR_REQ_PRIVATE_DATA_SIZE - sizeof(uct_cm_hdr_t),
                  UINT8_MAX);
    iface_attr->cap.am.max_bcopy = mtu;
    iface_attr->iface_addr_len   = sizeof(uint32_t);
    iface_attr->ep_addr_len      = 0;
    iface_attr->cap.flags        = UCT_IFACE_FLAG_AM_BCOPY |
                                   UCT_IFACE_FLAG_AM_DUP |
                                   UCT_IFACE_FLAG_PENDING |
                                   UCT_IFACE_FLAG_AM_CB_ASYNC |
                                   UCT_IFACE_FLAG_CONNECT_TO_IFACE;
    return UCS_OK;
}
/*
 * Select the rendezvous threshold for a tag send, based on the datatype
 * class of the request.
 *
 * @param req              Send request (datatype and endpoint are consulted).
 * @param count            Number of datatype elements (iov entries for IOV).
 * @param max_iov          Maximal iov entries supported by the transport.
 * @param rndv_rma_thresh  RNDV threshold for the RMA-based protocol.
 * @param rndv_am_thresh   RNDV threshold for the AM-based protocol.
 * @param seg_size         Segment size, used to force SW RNDV with offload.
 *
 * @return Message size from which rendezvous should be used; SIZE_MAX if the
 *         datatype class is invalid (after logging an error).
 */
static UCS_F_ALWAYS_INLINE size_t
ucp_tag_get_rndv_threshold(const ucp_request_t *req, size_t count,
                           size_t max_iov, size_t rndv_rma_thresh,
                           size_t rndv_am_thresh, size_t seg_size)
{
    switch (req->send.datatype & UCP_DATATYPE_CLASS_MASK) {
    case UCP_DATATYPE_IOV:
        if ((count > max_iov) &&
            ucp_ep_is_tag_offload_enabled(ucp_ep_config(req->send.ep))) {
            /* Make sure SW RNDV will be used, because tag offload does
             * not support multi-packet eager protocols. */
            return seg_size;
        }
        /* Fall through */
    case UCP_DATATYPE_CONTIG:
        /* Use whichever rendezvous protocol kicks in earlier */
        return ucs_min(rndv_rma_thresh, rndv_am_thresh);

    case UCP_DATATYPE_GENERIC:
        return rndv_am_thresh;

    default:
        ucs_error("Invalid data type %lx", req->send.datatype);
    }

    return SIZE_MAX;
}
/*
 * Pending-queue progress callback for a non-blocking get request
 * (single-fragment variant). Posts one bcopy-get fragment per invocation;
 * returns UCS_INPROGRESS while data remains so the pending framework keeps
 * rescheduling the request.
 *
 * @param self  Pending request descriptor embedded in a ucp_request_t.
 *
 * @return UCS_OK when the request is fully posted (and released),
 *         UCS_INPROGRESS if more fragments remain, or a transport error.
 */
static ucs_status_t ucp_progress_get_nbi(uct_pending_req_t *self)
{
    ucp_request_t *req = ucs_container_of(self, ucp_request_t, send.uct);
    ucp_rkey_h rkey    = req->send.rma.rkey;
    ucp_ep_t *ep       = req->send.ep;
    ucp_ep_rma_config_t *rma_config;
    ucs_status_t status;
    uct_rkey_t uct_rkey;
    size_t frag_length;

    UCP_EP_RESOLVE_RKEY_RMA(ep, rkey, req->send.lane, uct_rkey, rma_config);

    frag_length = ucs_min(rma_config->max_get_bcopy, req->send.length);
    status = uct_ep_get_bcopy(ep->uct_eps[req->send.lane],
                              (uct_unpack_callback_t)memcpy,
                              (void*)req->send.buffer, frag_length,
                              req->send.rma.remote_addr, uct_rkey, NULL);
    if ((status == UCS_OK) || (status == UCS_INPROGRESS)) {
        /* Get was initiated */
        req->send.length          -= frag_length;
        req->send.buffer          += frag_length;
        req->send.rma.remote_addr += frag_length;
        if (req->send.length == 0) {
            /* Get was posted */
            ucp_request_put(req, UCS_OK);
            return UCS_OK;
        } else {
            /* More fragments to post - keep the request pending */
            return UCS_INPROGRESS;
        }
    } else {
        return status;
    }
}
/*
 * Configure the perf context for the warmup phase: cap warmup iterations at
 * 10% of the total iteration budget, and disable periodic reporting.
 */
static void ucx_perf_set_warmup(ucx_perf_context_t* perf,
                                ucx_perf_params_t* params)
{
    perf->report_interval = -1;
    perf->max_iter        = ucs_min(params->warmup_iter,
                                    params->max_iter / 10);
}
/*
 * Non-blocking put to remote memory. Tries short/bcopy puts; if the
 * transport runs out of resources, the remaining work is queued as a pending
 * request to be continued by ucp_progress_put_nbi().
 *
 * @param ep           Endpoint to the remote peer.
 * @param buffer       Local source buffer.
 * @param length       Number of bytes to send.
 * @param remote_addr  Destination address on the remote side.
 * @param rkey         Remote memory key handle.
 *
 * @return UCS_OK if fully sent, UCS_INPROGRESS if queued for later
 *         completion, or an error status.
 */
ucs_status_t ucp_put_nbi(ucp_ep_h ep, const void *buffer, size_t length,
                         uint64_t remote_addr, ucp_rkey_h rkey)
{
    ucp_ep_rma_config_t *rma_config;
    ucs_status_t status;
    uct_rkey_t uct_rkey;
    ssize_t packed_len;
    ucp_request_t *req;
    uct_ep_h uct_ep;

    UCP_RMA_CHECK_PARAMS(buffer, length);

    for (;;) {
        UCP_EP_RESOLVE_RKEY_RMA(ep, rkey, uct_ep, uct_rkey, rma_config);
        if (length <= rma_config->max_put_short) {
            /* Fast path for a single short message */
            status = uct_ep_put_short(uct_ep, buffer, length, remote_addr,
                                      uct_rkey);
            if (ucs_likely(status != UCS_ERR_NO_RESOURCE)) {
                /* Return on error or success */
                break;
            } else {
                /* Out of resources - adding request for later schedule */
                req = ucs_mpool_get_inline(&ep->worker->req_mp);
                if (req == NULL) {
                    status = UCS_ERR_NO_MEMORY;
                    break;
                }
                ucp_add_pending_rma(req, ep, uct_ep, buffer, length,
                                    remote_addr, rkey, ucp_progress_put_nbi);
                status = UCS_INPROGRESS;
                break;
            }
        } else {
            /* Fragmented put */
            if (length <= ucp_ep_config(ep)->bcopy_thresh) {
                /* TBD: Should be replaced with bcopy */
                packed_len = ucs_min(length, rma_config->max_put_short);
                status = uct_ep_put_short(uct_ep, buffer, packed_len,
                                          remote_addr, uct_rkey);
            } else {
                /* TBD: Use z-copy */
                ucp_memcpy_pack_context_t pack_ctx;
                pack_ctx.src    = buffer;
                pack_ctx.length = ucs_min(length, rma_config->max_put_bcopy);
                /* put_bcopy returns the packed length, or a negative
                 * ucs_status_t on error */
                packed_len = uct_ep_put_bcopy(uct_ep, ucp_memcpy_pack,
                                              &pack_ctx, remote_addr, uct_rkey);
                status = (packed_len > 0) ? UCS_OK : (ucs_status_t)packed_len;
            }
            if (ucs_likely(status == UCS_OK || status == UCS_INPROGRESS)) {
                length -= packed_len;
                if (length == 0) {
                    /* Put is completed - return success */
                    break;
                }
                buffer      += packed_len;
                remote_addr += packed_len;
            } else if (status == UCS_ERR_NO_RESOURCE) {
                /* Out of resources - adding request for later schedule */
                req = ucs_mpool_get_inline(&ep->worker->req_mp);
                if (req == NULL) {
                    status = UCS_ERR_NO_MEMORY;
                    break;
                }
                ucp_add_pending_rma(req, ep, uct_ep, buffer, length,
                                    remote_addr, rkey, ucp_progress_put_nbi);
                status = UCS_INPROGRESS;
                break;
            } else {
                /* Return - error occurred */
                break;
            }
        }
    }
    return status;
}
/*
 * Query generic IB interface attributes: latency and bandwidth derived from
 * the active port width/speed/MTU, address length, and device priority.
 *
 * @param iface          IB interface to query.
 * @param xport_hdr_len  Transport-specific header length, added to the
 *                       per-packet overhead in the bandwidth estimate.
 * @param iface_attr     Filled with the interface attributes.
 *
 * @return UCS_OK, or UCS_ERR_IO_ERROR if the port reports an invalid
 *         width/speed.
 */
ucs_status_t uct_ib_iface_query(uct_ib_iface_t *iface, size_t xport_hdr_len,
                                uct_iface_attr_t *iface_attr)
{
    uct_ib_device_t *dev = uct_ib_iface_device(iface);
    /* Maps ilog2(active_width) to the number of physical lanes */
    static const unsigned ib_port_widths[] = {
        [0] = 1, [1] = 4, [2] = 8, [3] = 12
    };
    uint8_t active_width, active_speed, active_mtu;
    double encoding, signal_rate, wire_speed;
    size_t mtu, width, extra_pkt_len;
    int i = 0;

    active_width = uct_ib_iface_port_attr(iface)->active_width;
    active_speed = uct_ib_iface_port_attr(iface)->active_speed;
    active_mtu   = uct_ib_iface_port_attr(iface)->active_mtu;

    /* Get active width */
    if (!ucs_is_pow2(active_width) || (active_width < 1) ||
        (ucs_ilog2(active_width) > 3)) {
        ucs_error("Invalid active_width on %s:%d: %d",
                  UCT_IB_IFACE_ARG(iface), active_width);
        return UCS_ERR_IO_ERROR;
    }

    memset(iface_attr, 0, sizeof(*iface_attr));
    iface_attr->device_addr_len = iface->addr_size;

    /* Map the IB speed code to latency, signaling rate and line encoding */
    switch (active_speed) {
    case 1: /* SDR */
        iface_attr->latency = 5000e-9;
        signal_rate         = 2.5e9;
        encoding            = 8.0/10.0;
        break;
    case 2: /* DDR */
        iface_attr->latency = 2500e-9;
        signal_rate         = 5.0e9;
        encoding            = 8.0/10.0;
        break;
    case 4: /* QDR */
        iface_attr->latency = 1300e-9;
        signal_rate         = 10.0e9;
        encoding            = 8.0/10.0;
        break;
    case 8: /* FDR10 */
        iface_attr->latency = 700e-9;
        signal_rate         = 10.3125e9;
        encoding            = 64.0/66.0;
        break;
    case 16: /* FDR */
        iface_attr->latency = 700e-9;
        signal_rate         = 14.0625e9;
        encoding            = 64.0/66.0;
        break;
    case 32: /* EDR */
        iface_attr->latency = 600e-9;
        signal_rate         = 25.78125e9;
        encoding            = 64.0/66.0;
        break;
    default:
        ucs_error("Invalid active_speed on %s:%d: %d",
                  UCT_IB_IFACE_ARG(iface), active_speed);
        return UCS_ERR_IO_ERROR;
    }

    /* Wire speed calculation: Width * SignalRate * Encoding */
    width      = ib_port_widths[ucs_ilog2(active_width)];
    wire_speed = (width * signal_rate * encoding) / 8.0;

    /* Calculate packet overhead */
    mtu = ucs_min(uct_ib_mtu_value(active_mtu), iface->config.seg_size);

    extra_pkt_len = UCT_IB_BTH_LEN + xport_hdr_len + UCT_IB_ICRC_LEN +
                    UCT_IB_VCRC_LEN + UCT_IB_DELIM_LEN;
    if (IBV_PORT_IS_LINK_LAYER_ETHERNET(uct_ib_iface_port_attr(iface))) {
        extra_pkt_len += UCT_IB_GRH_LEN + UCT_IB_ROCE_LEN;
    } else {
        /* TODO check if UCT_IB_DELIM_LEN is present in RoCE as well */
        extra_pkt_len += UCT_IB_LRH_LEN;
    }

    /* Effective bandwidth = wire speed scaled by payload/packet ratio */
    iface_attr->bandwidth = (wire_speed * mtu) / (mtu + extra_pkt_len);

    /* Set priority of current device */
    iface_attr->priority = 0;
    while (uct_ib_device_info_table[i].vendor_part_id != 0) {
        if (uct_ib_device_info_table[i].vendor_part_id ==
            dev->dev_attr.vendor_part_id) {
            iface_attr->priority = uct_ib_device_info_table[i].priority;
            break;
        }
        i++;
    }

    return UCS_OK;
}
/**
 * Constructor of the base IB interface: computes RX buffer layout, creates
 * the completion channel and the send/receive CQs, and resolves pkey, GID,
 * LMC and address type. Resources are released in reverse order on failure.
 *
 * @param rx_headroom Headroom requested by the user.
 * @param rx_priv_len Length of transport private data to reserve (0 if unused)
 * @param rx_hdr_len  Length of transport network header.
 * @param tx_cq_len   Length of the send completion queue.
 * @param mss         Maximal segment size (transport limit).
 */
UCS_CLASS_INIT_FUNC(uct_ib_iface_t, uct_ib_iface_ops_t *ops, uct_md_h md,
                    uct_worker_h worker, const uct_iface_params_t *params,
                    unsigned rx_priv_len, unsigned rx_hdr_len,
                    unsigned tx_cq_len, size_t mss,
                    const uct_ib_iface_config_t *config)
{
    uct_ib_device_t *dev = &ucs_derived_of(md, uct_ib_md_t)->dev;
    ucs_status_t status;
    uint8_t port_num;

    UCS_CLASS_CALL_SUPER_INIT(uct_base_iface_t, &ops->super, md, worker,
                              &config->super UCS_STATS_ARG(dev->stats));

    status = uct_ib_device_find_port(dev, params->dev_name, &port_num);
    if (status != UCS_OK) {
        goto err;
    }

    self->ops = ops;

    /* RX buffer layout: [desc][headroom|priv+hdr][payload]; the payload
     * offset must accommodate whichever of (am-desc + user headroom) or
     * (transport priv + network header) is larger */
    self->config.rx_payload_offset  = sizeof(uct_ib_iface_recv_desc_t) +
                                      ucs_max(sizeof(uct_am_recv_desc_t) +
                                              params->rx_headroom,
                                              rx_priv_len + rx_hdr_len);
    self->config.rx_hdr_offset      = self->config.rx_payload_offset -
                                      rx_hdr_len;
    self->config.rx_headroom_offset = self->config.rx_payload_offset -
                                      params->rx_headroom;
    self->config.seg_size           = ucs_min(mss, config->super.max_bcopy);
    self->config.tx_max_poll        = config->tx.max_poll;
    self->config.rx_max_poll        = config->rx.max_poll;
    /* Batch replenishment capped at a quarter of the RX queue */
    self->config.rx_max_batch       = ucs_min(config->rx.max_batch,
                                              config->rx.queue_len / 4);
    self->config.port_num           = port_num;
    self->config.sl                 = config->sl;
    self->config.gid_index          = config->gid_index;

    status = uct_ib_iface_init_pkey(self, config);
    if (status != UCS_OK) {
        goto err;
    }

    status = uct_ib_device_query_gid(dev, self->config.port_num,
                                     self->config.gid_index, &self->gid);
    if (status != UCS_OK) {
        goto err;
    }

    status = uct_ib_iface_init_lmc(self, config);
    if (status != UCS_OK) {
        goto err;
    }

    self->comp_channel = ibv_create_comp_channel(dev->ibv_context);
    if (self->comp_channel == NULL) {
        ucs_error("ibv_create_comp_channel() failed: %m");
        status = UCS_ERR_IO_ERROR;
        goto err_free_path_bits;
    }

    status = ucs_sys_fcntl_modfl(self->comp_channel->fd, O_NONBLOCK, 0);
    if (status != UCS_OK) {
        goto err_destroy_comp_channel;
    }

    status = uct_ib_iface_create_cq(self, tx_cq_len, 0, &self->send_cq);
    if (status != UCS_OK) {
        goto err_destroy_comp_channel;
    }

    status = uct_ib_iface_create_cq(self, config->rx.queue_len, config->rx.inl,
                                    &self->recv_cq);
    if (status != UCS_OK) {
        goto err_destroy_send_cq;
    }

    /* Address scope and size */
    if (config->addr_type == UCT_IB_IFACE_ADDRESS_TYPE_AUTO) {
        if (IBV_PORT_IS_LINK_LAYER_ETHERNET(uct_ib_iface_port_attr(self))) {
            self->addr_type = UCT_IB_ADDRESS_TYPE_ETH;
        } else {
            /* Infer scope from the subnet prefix of our GID */
            self->addr_type =
                uct_ib_address_scope(self->gid.global.subnet_prefix);
        }
    } else {
        ucs_assert(config->addr_type < UCT_IB_ADDRESS_TYPE_LAST);
        self->addr_type = config->addr_type;
    }

    self->addr_size = uct_ib_address_size(self->addr_type);

    ucs_debug("created uct_ib_iface_t headroom_ofs %d payload_ofs %d hdr_ofs %d data_sz %d",
              self->config.rx_headroom_offset, self->config.rx_payload_offset,
              self->config.rx_hdr_offset, self->config.seg_size);

    return UCS_OK;

err_destroy_send_cq:
    ibv_destroy_cq(self->send_cq);
err_destroy_comp_channel:
    ibv_destroy_comp_channel(self->comp_channel);
err_free_path_bits:
    ucs_free(self->path_bits);
err:
    return status;
}
/*
 * Create a completion queue, steering the mlx5 CQE size through the
 * MLX5_CQE_SIZE environment variable: if the user already set it, validate
 * it against the inline-data requirement; otherwise set it temporarily for
 * the ibv_create_cq() call and remove it afterwards.
 *
 * @param iface      IB interface owning the CQ.
 * @param cq_length  Number of CQ entries to request.
 * @param inl        Expected inline receive size (drives the CQE size).
 * @param cq_p       Filled with the created CQ on success.
 *
 * @return UCS_OK, UCS_ERR_INVALID_PARAM on a bad/unsettable CQE size, or
 *         UCS_ERR_IO_ERROR if CQ creation fails.
 */
static ucs_status_t uct_ib_iface_create_cq(uct_ib_iface_t *iface, int cq_length,
                                           size_t inl, struct ibv_cq **cq_p)
{
    static const char *cqe_size_env_var = "MLX5_CQE_SIZE";
    uct_ib_device_t *dev = uct_ib_iface_device(iface);
    const char *cqe_size_env_value;
    size_t cqe_size_min, cqe_size;
    char cqe_size_buf[32];
    ucs_status_t status;
    struct ibv_cq *cq;
    int env_var_added = 0;
    int ret;

    /* Inline receives larger than 32 bytes require 128-byte CQEs */
    cqe_size_min       = (inl > 32) ? 128 : 64;
    cqe_size_env_value = getenv(cqe_size_env_var);

    if (cqe_size_env_value != NULL) {
        cqe_size = atol(cqe_size_env_value);
        if (cqe_size < cqe_size_min) {
            ucs_error("%s is set to %zu, but at least %zu is required (inl: %zu)",
                      cqe_size_env_var, cqe_size, cqe_size_min, inl);
            status = UCS_ERR_INVALID_PARAM;
            goto out;
        }
    } else {
        /* CQE size is not defined by the environment, set it according to inline
         * size and cache line size.
         */
        cqe_size = ucs_max(cqe_size_min, UCS_SYS_CACHE_LINE_SIZE);
        cqe_size = ucs_max(cqe_size, 64);  /* at least 64 */
        cqe_size = ucs_min(cqe_size, 128); /* at most 128 */
        snprintf(cqe_size_buf, sizeof(cqe_size_buf),"%zu", cqe_size);
        ucs_debug("%s: setting %s=%s", uct_ib_device_name(dev),
                  cqe_size_env_var, cqe_size_buf);
        ret = ibv_exp_setenv(dev->ibv_context, cqe_size_env_var, cqe_size_buf, 1);
        if (ret) {
            ucs_error("ibv_exp_setenv(%s=%s) failed: %m", cqe_size_env_var,
                      cqe_size_buf);
            status = UCS_ERR_INVALID_PARAM;
            goto out;
        }
        env_var_added = 1;
    }

    cq = ibv_create_cq(dev->ibv_context, cq_length, NULL,
                       iface->comp_channel, 0);
    if (cq == NULL) {
        ucs_error("ibv_create_cq(cqe=%d) failed: %m", cq_length);
        status = UCS_ERR_IO_ERROR;
        goto out_unsetenv;
    }

    *cq_p = cq;
    status = UCS_OK;

out_unsetenv:
    if (env_var_added) {
        /* if we created a new environment variable, remove it */
        ret = ibv_exp_unsetenv(dev->ibv_context, cqe_size_env_var);
        if (ret) {
            ucs_warn("unsetenv(%s) failed: %m", cqe_size_env_var);
        }
    }
out:
    return status;
}
/*
 * Expand a filename template into 'buf' (at most 'max' bytes, always
 * NUL-terminated). Recognized substitutions:
 *   %p - pid, %h - host name, %c - first CPU, %t - timestamp,
 *   %u - user name (basename), %e - executable name (basename).
 * Any other character following '%' is copied literally.
 *
 * @param tmpl  Template string.
 * @param buf   Output buffer.
 * @param max   Size of 'buf' in bytes.
 */
void ucs_fill_filename_template(const char *tmpl, char *buf, size_t max)
{
    char *p, *end;
    const char *pf, *pp;
    size_t length;
    time_t t;

    p    = buf;
    end  = buf + max - 1;
    *end = 0; /* guarantee termination even if we fill the buffer */
    pf   = tmpl;
    while (*pf != 0 && p < end) {
        pp = strchr(pf, '%');
        if (pp == NULL) {
            /* No more substitutions - copy the tail and stop */
            strncpy(p, pf, end - p);
            p = end;
            break;
        }

        /* Copy the literal text preceding the '%' */
        length = ucs_min(pp - pf, end - p);
        strncpy(p, pf, length);
        p += length;

        switch (*(pp + 1)) {
        case 'p':
            snprintf(p, end - p, "%d", getpid());
            pf = pp + 2;
            p += strlen(p);
            break;
        case 'h':
            snprintf(p, end - p, "%s", ucs_get_host_name());
            pf = pp + 2;
            p += strlen(p);
            break;
        case 'c':
            snprintf(p, end - p, "%02d", ucs_get_first_cpu());
            pf = pp + 2;
            p += strlen(p);
            break;
        case 't':
            t = time(NULL);
            strftime(p, end - p, "%Y-%m-%d-%H:%M:%S", localtime(&t));
            pf = pp + 2;
            p += strlen(p);
            break;
        case 'u':
            snprintf(p, end - p, "%s", basename(ucs_get_user_name()));
            pf = pp + 2;
            p += strlen(p);
            break;
        case 'e':
            snprintf(p, end - p, "%s", basename(ucs_get_exe()));
            pf = pp + 2;
            p += strlen(p);
            break;
        default:
            /* Unknown specifier - emit the '%' literally */
            *(p++) = *pp;
            pf = pp + 1;
            break;
        }

        p += strlen(p);
    }
    *p = 0;
}
/*
 * Constructor of the IB-CM interface: allocates the outstanding-operations
 * table, opens the CM device, creates and binds a listening CM id for our
 * SIDR service id, and installs an async event handler on the CM fd.
 * Resources are released in reverse order on any failure.
 */
static UCS_CLASS_INIT_FUNC(uct_cm_iface_t, uct_pd_h pd, uct_worker_h worker,
                           const char *dev_name, size_t rx_headroom,
                           const uct_iface_config_t *tl_config)
{
    uct_cm_iface_config_t *config = ucs_derived_of(tl_config, uct_cm_iface_config_t);
    ucs_status_t status;
    int ret;

    ucs_trace_func("");

    UCS_CLASS_CALL_SUPER_INIT(uct_ib_iface_t, &uct_cm_iface_ops, pd, worker,
                              dev_name, rx_headroom,
                              0 /* rx_priv_len */,
                              0 /* rx_hdr_len */,
                              1 /* tx_cq_len */,
                              IB_CM_SIDR_REQ_PRIVATE_DATA_SIZE, /* mss */
                              &config->super);

    /* The event handler below requires an async context */
    if (worker->async == NULL) {
        ucs_error("cm must have async!=NULL");
        return UCS_ERR_INVALID_PARAM;
    }

    /* Random service id, with the CM's auto-assign bits cleared */
    self->service_id             = (uint32_t)(ucs_generate_uuid((uintptr_t)self) &
                                              (~IB_CM_ASSIGN_SERVICE_ID_MASK));
    self->num_outstanding        = 0;

    self->config.timeout_ms      = (int)(config->timeout * 1e3 + 0.5);
    self->config.max_outstanding = config->max_outstanding;
    /* CM retry count is an 8-bit field - clamp it */
    self->config.retry_count     = ucs_min(config->retry_count, UINT8_MAX);
    self->notify_q.head          = NULL;
    ucs_queue_head_init(&self->notify_q);

    self->outstanding = ucs_calloc(self->config.max_outstanding,
                                   sizeof(*self->outstanding),
                                   "cm_outstanding");
    if (self->outstanding == NULL) {
        status = UCS_ERR_NO_MEMORY;
        goto err;
    }

    self->cmdev = ib_cm_open_device(uct_ib_iface_device(&self->super)->ibv_context);
    if (self->cmdev == NULL) {
        ucs_error("ib_cm_open_device() failed: %m. Check if ib_ucm.ko module is loaded.");
        status = UCS_ERR_NO_DEVICE;
        goto err_free_outstanding;
    }

    status = ucs_sys_fcntl_modfl(self->cmdev->fd, O_NONBLOCK, 0);
    if (status != UCS_OK) {
        goto err_close_device;
    }

    ret = ib_cm_create_id(self->cmdev, &self->listen_id, self);
    if (ret) {
        ucs_error("ib_cm_create_id() failed: %m");
        status = UCS_ERR_NO_DEVICE;
        goto err_close_device;
    }

    ret = ib_cm_listen(self->listen_id, self->service_id, 0);
    if (ret) {
        ucs_error("ib_cm_listen() failed: %m");
        status = UCS_ERR_INVALID_ADDR;
        goto err_destroy_id;
    }

    if (config->async_mode == UCS_ASYNC_MODE_SIGNAL) {
        ucs_warn("ib_cm fd does not support SIGIO");
    }

    status = ucs_async_set_event_handler(config->async_mode, self->cmdev->fd,
                                         POLLIN, uct_cm_iface_event_handler,
                                         self, worker->async);
    if (status != UCS_OK) {
        ucs_error("failed to set event handler");
        goto err_destroy_id;
    }

    ucs_debug("listening for SIDR service_id 0x%x on fd %d", self->service_id,
              self->cmdev->fd);
    return UCS_OK;

err_destroy_id:
    ib_cm_destroy_id(self->listen_id);
err_close_device:
    ib_cm_close_device(self->cmdev);
err_free_outstanding:
    ucs_free(self->outstanding);
err:
    return status;
}