/* Detach and release a remote XPMEM segment previously mapped by the attach
 * path. mm_desc->cookie carries the apid obtained from xpmem_get();
 * mm_desc->address is the user-visible (possibly page-unaligned) address.
 * Returns UCS_OK, or UCS_ERR_IO_ERROR if either xpmem call fails. */
static ucs_status_t uct_xpmem_detach(uct_mm_remote_seg_t *mm_desc)
{
    xpmem_apid_t apid = mm_desc->cookie;
    void *address;
    int ret;

    /* The attach mapped a page-aligned region; detach must be given the
     * aligned base address, not the offset user address. */
    address = ucs_align_down_pow2_ptr(mm_desc->address, ucs_get_page_size());

    ucs_trace("xpmem detaching address %p", address);
    ret = xpmem_detach(address);
    if (ret < 0) {
        ucs_error("Failed to xpmem_detach: %m");
        return UCS_ERR_IO_ERROR;
    }

    /* Tell valgrind the remote memory contents are no longer valid */
    VALGRIND_MAKE_MEM_UNDEFINED(mm_desc->address, mm_desc->length);

    ucs_trace("xpmem releasing segment apid 0x%llx", apid);
    ret = xpmem_release(apid);
    if (ret < 0) {
        ucs_error("Failed to release xpmem segment apid 0x%llx", apid);
        return UCS_ERR_IO_ERROR;
    }

    return UCS_OK;
}
/* Advance an endpoint flush request: try to start a UCT flush on every lane
 * still marked in req->send.flush.lanes.
 *
 * Per-lane outcomes:
 *  - NULL uct_ep:        lane never existed; drop it and its completion count
 *  - UCS_OK:             lane flushed synchronously; drop lane and count
 *  - UCS_INPROGRESS:     completion callback will decrement the count later;
 *                        drop only the lane bit
 *  - UCS_ERR_NO_RESOURCE: queue this request on the lane's pending queue
 *                        (only one lane at a time may hold the request)
 *  - other error:        report via ucp_ep_flush_error() and keep going
 */
static void ucp_ep_flush_progress(ucp_request_t *req)
{
    ucp_ep_h ep = req->send.ep;
    ucp_lane_index_t lane;
    ucs_status_t status;
    uct_ep_h uct_ep;

    ucs_trace("ep %p: progress flush req %p, lanes 0x%x count %d", ep, req,
              req->send.flush.lanes, req->send.uct_comp.count);

    while (req->send.flush.lanes) {

        /* Search for next lane to start flush */
        lane   = ucs_ffs64(req->send.flush.lanes);
        uct_ep = ep->uct_eps[lane];
        if (uct_ep == NULL) {
            req->send.flush.lanes &= ~UCS_BIT(lane);
            --req->send.uct_comp.count;
            continue;
        }

        /* Start flush operation on UCT endpoint */
        status = uct_ep_flush(uct_ep, 0, &req->send.uct_comp);
        ucs_trace("flushing ep %p lane[%d]: %s", ep, lane,
                  ucs_status_string(status));
        if (status == UCS_OK) {
            req->send.flush.lanes &= ~UCS_BIT(lane);
            --req->send.uct_comp.count;
        } else if (status == UCS_INPROGRESS) {
            req->send.flush.lanes &= ~UCS_BIT(lane);
        } else if (status == UCS_ERR_NO_RESOURCE) {
            /* The request can sit on at most one pending queue at a time */
            if (req->send.lane != UCP_NULL_LANE) {
                ucs_trace("ep %p: not adding pending flush %p on lane %d, "
                          "because it's already pending on lane %d",
                          ep, req, lane, req->send.lane);
                break;
            }

            req->send.lane = lane;
            status = uct_ep_pending_add(uct_ep, &req->send.uct);
            ucs_trace("adding pending flush on ep %p lane[%d]: %s", ep, lane,
                      ucs_status_string(status));
            if (status == UCS_OK) {
                req->send.flush.lanes &= ~UCS_BIT(lane);
            } else if (status != UCS_ERR_BUSY) {
                /* UCS_ERR_BUSY means "retry the flush"; anything else fails
                 * the whole request */
                ucp_ep_flush_error(req, status);
            }
        } else {
            ucp_ep_flush_error(req, status);
        }
    }
}
static void *uct_ugni_udt_device_thread(void *arg) { uct_ugni_udt_iface_t *iface = (uct_ugni_udt_iface_t *)arg; gni_return_t ugni_rc; uint64_t id; while (1) { pthread_mutex_lock(&iface->device_lock); while (iface->events_ready) { pthread_cond_wait(&iface->device_condition, &iface->device_lock); } pthread_mutex_unlock(&iface->device_lock); ugni_rc = GNI_PostdataProbeWaitById(uct_ugni_udt_iface_nic_handle(iface),-1,&id); if (ucs_unlikely(GNI_RC_SUCCESS != ugni_rc)) { ucs_error("GNI_PostDataProbeWaitById, Error status: %s %d\n", gni_err_str[ugni_rc], ugni_rc); continue; } if (ucs_unlikely(UCT_UGNI_UDT_CANCEL == id)) { /* When the iface is torn down, it will post and cancel a datagram with a * magic cookie as it's id that tells us to shut down. */ break; } iface->events_ready = 1; ucs_trace("Recieved a new datagram"); ucs_async_pipe_push(&iface->event_pipe); } return NULL; }
/* Async-pipe handler: drain and process all pending uGNI datagrams.
 *
 * Invoked when the device thread signals the event pipe. Repeatedly probes
 * for completed datagram post ids and hands each to recieve_datagram();
 * UCS_INPROGRESS means a payload datagram that still needs AM dispatch via
 * processs_datagram(). After the probe loop runs dry, clears events_ready and
 * signals the device thread to resume blocking in its wait loop. */
void uct_ugni_proccess_datagram_pipe(int event_id, void *arg)
{
    uct_ugni_udt_iface_t *iface = (uct_ugni_udt_iface_t *)arg;
    uct_ugni_udt_ep_t *ep;
    uct_ugni_udt_desc_t *datagram;
    ucs_status_t status;
    void *user_desc;
    gni_return_t ugni_rc;
    uint64_t id;

    ucs_trace_func("");

    uct_ugni_device_lock(&iface->super.cdm);
    ugni_rc = GNI_PostDataProbeById(uct_ugni_udt_iface_nic_handle(iface), &id);
    uct_ugni_device_unlock(&iface->super.cdm);
    while (GNI_RC_SUCCESS == ugni_rc) {
        status = recieve_datagram(iface, id, &ep);
        if (UCS_INPROGRESS == status) {
            if (ep != NULL){
                /* Datagram matched a specific endpoint's posted descriptor */
                ucs_trace_data("Processing reply");
                datagram = ep->posted_desc;
                status = processs_datagram(iface, datagram);
                if (UCS_OK != status) {
                    /* AM handler kept the data; arm the release descriptor so
                     * the user returns it later */
                    user_desc = uct_ugni_udt_get_user_desc(datagram, iface);
                    uct_recv_desc(user_desc) = &iface->release_desc;
                } else {
                    ucs_mpool_put(datagram);
                }
                ep->posted_desc = NULL;
                uct_ugni_check_flush(ep->desc_flush_group);
            } else {
                /* Wildcard (any-source) datagram on the iface endpoint */
                ucs_trace_data("Processing wildcard");
                datagram = iface->desc_any;
                status = processs_datagram(iface, datagram);
                if (UCS_OK != status) {
                    /* Replace the consumed wildcard descriptor from the pool */
                    UCT_TL_IFACE_GET_TX_DESC(&iface->super.super, &iface->free_desc,
                                             iface->desc_any, iface->desc_any=NULL);
                    user_desc = uct_ugni_udt_get_user_desc(datagram, iface);
                    uct_recv_desc(user_desc) = &iface->release_desc;
                }
                status = uct_ugni_udt_ep_any_post(iface);
                if (UCS_OK != status) {
                    /* We can't continue if we can't post the first receive */
                    ucs_error("Failed to post wildcard request");
                    return;
                }
            }
        }
        uct_ugni_device_lock(&iface->super.cdm);
        ugni_rc = GNI_PostDataProbeById(uct_ugni_udt_iface_nic_handle(iface), &id);
        uct_ugni_device_unlock(&iface->super.cdm);
    }

    /* All events consumed - let the device thread block again */
    ucs_async_pipe_drain(&iface->event_pipe);
    pthread_mutex_lock(&iface->device_lock);
    iface->events_ready = 0;
    pthread_mutex_unlock(&iface->device_lock);
    ucs_trace("Signaling device thread to resume monitoring");
    pthread_cond_signal(&iface->device_condition);
}
/* Complete a matched datagram exchange for post id 'id'.
 *
 * id == UCT_UGNI_UDT_ANY selects the iface-level wildcard endpoint/descriptor;
 * any other id identifies the endpoint that posted the datagram. *ep_out is
 * set accordingly (NULL for wildcard).
 *
 * Returns:
 *   UCS_OK           - ack-only datagram; descriptor already recycled here
 *   UCS_INPROGRESS   - payload datagram; caller must process the descriptor
 *   UCS_ERR_CANCELED - the posted datagram was terminated (teardown)
 *   UCS_ERR_IO_ERROR - uGNI failure or unexpected post state
 */
static ucs_status_t recieve_datagram(uct_ugni_udt_iface_t *iface, uint64_t id,
                                     uct_ugni_udt_ep_t **ep_out)
{
    uint32_t rem_addr, rem_id;
    gni_post_state_t post_state;
    gni_return_t ugni_rc;
    uct_ugni_udt_ep_t *ep;
    gni_ep_handle_t gni_ep;
    uct_ugni_udt_desc_t *desc;
    uct_ugni_udt_header_t *header;

    ucs_trace_func("iface=%p, id=%lx", iface, id);

    if (UCT_UGNI_UDT_ANY == id) {
        ep = NULL;
        gni_ep = iface->ep_any;
        desc = iface->desc_any;
    } else {
        ep = ucs_derived_of(uct_ugni_iface_lookup_ep(&iface->super, id),
                            uct_ugni_udt_ep_t);
        gni_ep = ep->super.ep;
        desc = ep->posted_desc;
    }

    *ep_out = ep;

    uct_ugni_device_lock(&iface->super.cdm);
    ugni_rc = GNI_EpPostDataWaitById(gni_ep, id, -1, &post_state,
                                     &rem_addr, &rem_id);
    uct_ugni_device_unlock(&iface->super.cdm);
    if (ucs_unlikely(GNI_RC_SUCCESS != ugni_rc)) {
        ucs_error("GNI_EpPostDataWaitById, id=%lu Error status: %s %d",
                  id, gni_err_str[ugni_rc], ugni_rc);
        return UCS_ERR_IO_ERROR;
    }

    if (GNI_POST_TERMINATED == post_state) {
        return UCS_ERR_CANCELED;
    }

    if (GNI_POST_COMPLETED != post_state) {
        ucs_error("GNI_EpPostDataWaitById gave unexpected response: %u",
                  post_state);
        return UCS_ERR_IO_ERROR;
    }

    /* Endpoint-specific datagrams count against outstanding operations;
     * the wildcard does not */
    if (UCT_UGNI_UDT_ANY != id) {
        --iface->super.outstanding;
    }

    header = uct_ugni_udt_get_rheader(desc, iface);

    ucs_trace("Got datagram id: %lu type: %i len: %i am_id: %i",
              id, header->type, header->length, header->am_id);

    if (UCT_UGNI_UDT_PAYLOAD != header->type) {
        /* ack message, no data */
        ucs_assert_always(NULL != ep);
        ucs_mpool_put(ep->posted_desc);
        uct_ugni_check_flush(ep->desc_flush_group);
        ep->posted_desc = NULL;
        return UCS_OK;
    }

    return UCS_INPROGRESS;
}
/* Common SMSG active-message send path.
 *
 * Sends header+payload as a tagged SMSG message; on success the descriptor is
 * tracked in the iface hash (keyed by msg_id) until its TX completion, and the
 * flush-group / outstanding counters are bumped. On any failure (endpoint not
 * sendable, or GNI_SmsgSendWTag rejects the send) the descriptor is returned
 * to its pool and UCS_ERR_NO_RESOURCE is reported. */
static UCS_F_ALWAYS_INLINE ucs_status_t
uct_ugni_smsg_ep_am_common_send(uct_ugni_smsg_ep_t *ep, uct_ugni_smsg_iface_t *iface,
                                uint8_t am_id, unsigned header_length, void *header,
                                unsigned payload_length, void *payload,
                                uct_ugni_smsg_desc_t *desc)
{
    gni_return_t gni_rc;

    if (ucs_unlikely(!uct_ugni_ep_can_send(&ep->super))) {
        goto exit_no_res;
    }

    /* msg_id doubles as the SMSG tag-wait id and the hash key */
    desc->msg_id = iface->smsg_id++;
    desc->flush_group = ep->super.flush_group;
    uct_ugni_cdm_lock(&iface->super.cdm);
    gni_rc = GNI_SmsgSendWTag(ep->super.ep, header, header_length,
                              payload, payload_length, desc->msg_id, am_id);
    uct_ugni_cdm_unlock(&iface->super.cdm);
    if(GNI_RC_SUCCESS != gni_rc){
        goto exit_no_res;
    }

    /* Send accepted: account for it until local completion */
    ++desc->flush_group->flush_comp.count;
    ++iface->super.outstanding;
    sglib_hashed_uct_ugni_smsg_desc_t_add(iface->smsg_list, desc);
    return UCS_OK;

exit_no_res:
    ucs_trace("Smsg send failed.");
    ucs_mpool_put(desc);
    UCS_STATS_UPDATE_COUNTER(ep->super.super.stats, UCT_EP_STAT_NO_RES, 1);
    return UCS_ERR_NO_RESOURCE;
}
/* Pending-queue progress callback that sends a wireup control message.
 *
 * A REQUEST is skipped (and the request completed as success) if the remote
 * side already reported connected. ACKs go on the AM lane; other message
 * types use the dedicated wireup lane. Returns the negative bcopy status on
 * failure so UCS_ERR_NO_RESOURCE keeps the request on the pending queue. */
ucs_status_t ucp_wireup_msg_progress(uct_pending_req_t *self)
{
    ucp_request_t *req = ucs_container_of(self, ucp_request_t, send.uct);
    ucp_ep_h ep = req->send.ep;
    ssize_t packed_len;

    if (req->send.wireup.type == UCP_WIREUP_MSG_REQUEST) {
        if (ep->flags & UCP_EP_FLAG_REMOTE_CONNECTED) {
            ucs_trace("ep %p: not sending wireup message - remote already connected",
                      ep);
            goto out;
        }
    }

    /* send the active message */
    if (req->send.wireup.type == UCP_WIREUP_MSG_ACK) {
        req->send.lane = ucp_ep_get_am_lane(ep);
    } else {
        req->send.lane = ucp_ep_get_wireup_msg_lane(ep);
    }
    packed_len = uct_ep_am_bcopy(ep->uct_eps[req->send.lane], UCP_AM_ID_WIREUP,
                                 ucp_wireup_msg_pack, req);
    if (packed_len < 0) {
        /* Negative bcopy return is a ucs_status_t error code */
        if (packed_len != UCS_ERR_NO_RESOURCE) {
            ucs_error("failed to send wireup: %s",
                      ucs_status_string(packed_len));
        }
        return (ucs_status_t)packed_len;
    }

out:
    ucp_request_complete_send(req, UCS_OK);
    return UCS_OK;
}
/* Attach to a remote XPMEM segment.
 *
 * Acquires an apid for 'mmid' and maps 'length' bytes around the remote
 * address. Because xpmem maps whole pages, the remote address's in-page
 * offset is added to the mapped length and to the returned local pointer.
 * On success *local_address points at the remote data and *cookie holds the
 * apid for the later detach/release. 'path' is unused here (kept for the
 * common mm-MD attach signature). */
static ucs_status_t uct_xpmem_attach(uct_mm_id_t mmid, size_t length,
                                     void *remote_address, void **local_address,
                                     uint64_t *cookie, const char *path)
{
    struct xpmem_addr addr;
    ucs_status_t status;
    ptrdiff_t offset;
    void *address;

    addr.offset = 0;
    addr.apid = xpmem_get(mmid, XPMEM_RDWR, XPMEM_PERMIT_MODE, NULL);
    VALGRIND_MAKE_MEM_DEFINED(&addr.apid, sizeof(addr.apid));
    if (addr.apid < 0) {
        ucs_error("Failed to acquire xpmem segment 0x%"PRIx64": %m", mmid);
        status = UCS_ERR_IO_ERROR;
        goto err_xget;
    }

    ucs_trace("xpmem acquired segment 0x%"PRIx64" apid 0x%llx remote_address %p",
              mmid, addr.apid, remote_address);

    /* Map from the page boundary; remember the in-page offset */
    offset = ((uintptr_t)remote_address) % ucs_get_page_size();
    address = xpmem_attach(addr, length + offset, NULL);
    VALGRIND_MAKE_MEM_DEFINED(&address, sizeof(address));
    if (address == MAP_FAILED) {
        ucs_error("Failed to attach xpmem segment 0x%"PRIx64" apid 0x%llx "
                  "with length %zu: %m", mmid, addr.apid, length);
        status = UCS_ERR_IO_ERROR;
        goto err_xattach;
    }

    VALGRIND_MAKE_MEM_DEFINED(address + offset, length);

    *local_address = address + offset;
    *cookie = addr.apid;

    ucs_trace("xpmem attached segment 0x%"PRIx64" apid 0x%llx %p..%p at %p (+%zd)",
              mmid, addr.apid, remote_address, remote_address + length,
              address, offset);
    return UCS_OK;

err_xattach:
    xpmem_release(addr.apid);
err_xget:
    return status;
}
/* Completion callback fired when a flush group's counter drains: notify the
 * user's completion, re-check the parent group, and recycle this group.
 * Locals are captured up front since the group is released at the end. */
static void uct_ugni_flush_cb(uct_completion_t *self, ucs_status_t status)
{
    uct_ugni_flush_group_t *group;
    uct_ugni_flush_group_t *parent;
    uct_completion_t *user_comp;

    group     = ucs_container_of(self, uct_ugni_flush_group_t, flush_comp);
    parent    = group->parent;
    user_comp = group->user_comp;

    ucs_trace("group=%p, parent=%p, user_comp=%p", group, parent, user_comp);

    uct_invoke_completion(user_comp, UCS_OK);
    uct_ugni_check_flush(parent);
    uct_ugni_put_flush_group(group);
}
/* Pin and map a GPU memory range through gdrcopy.
 *
 * On success fills mem_hndl with the pin handle, BAR mapping info, mapped
 * pointer and registered size. A zero length is accepted as a no-op
 * (mem_hndl->mh cleared). On failure all partially-acquired gdrcopy
 * resources are rolled back and UCS_ERR_IO_ERROR is returned.
 *
 * BUGFIX: the unmap_buffer error path previously read mem_hndl->mh,
 * mem_hndl->bar_ptr and mem_hndl->reg_size, which are only assigned after
 * all three gdr calls succeed - so the rollback after a gdr_get_info()
 * failure used uninitialized handle fields. It now uses the local
 * mh/bar_ptr/length values. Also fixed the "ret;%d" typo in the warn. */
static ucs_status_t uct_gdr_copy_mem_reg_internal(uct_md_h uct_md, void *address,
                                                  size_t length, unsigned flags,
                                                  uct_gdr_copy_mem_t *mem_hndl)
{
    uct_gdr_copy_md_t *md = ucs_derived_of(uct_md, uct_gdr_copy_md_t);
    CUdeviceptr d_ptr = ((CUdeviceptr )(char *) address);
    gdr_mh_t mh;
    void *bar_ptr;
    gdr_info_t info;
    int ret;

    if (!length) {
        mem_hndl->mh = 0;
        return UCS_OK;
    }

    ret = gdr_pin_buffer(md->gdrcpy_ctx, d_ptr, length, 0, 0, &mh);
    if (ret) {
        ucs_error("gdr_pin_buffer failed. length :%lu ret:%d", length, ret);
        goto err;
    }

    ret = gdr_map(md->gdrcpy_ctx, mh, &bar_ptr, length);
    if (ret) {
        ucs_error("gdr_map failed. length :%lu ret:%d", length, ret);
        goto unpin_buffer;
    }

    ret = gdr_get_info(md->gdrcpy_ctx, mh, &info);
    if (ret) {
        ucs_error("gdr_get_info failed. ret:%d", ret);
        goto unmap_buffer;
    }

    /* All gdrcopy resources acquired - publish them in the handle */
    mem_hndl->mh       = mh;
    mem_hndl->info     = info;
    mem_hndl->bar_ptr  = bar_ptr;
    mem_hndl->reg_size = length;

    ucs_trace("registered memory:%p..%p length:%lu info.va:0x%"PRIx64" bar_ptr:%p",
              address, address + length, length, info.va, bar_ptr);
    return UCS_OK;

unmap_buffer:
    /* Roll back using the locals; mem_hndl was never populated */
    ret = gdr_unmap(md->gdrcpy_ctx, mh, bar_ptr, length);
    if (ret) {
        ucs_warn("gdr_unmap failed. unpin_size:%lu ret:%d", length, ret);
    }
unpin_buffer:
    ret = gdr_unpin_buffer(md->gdrcpy_ctx, mh);
    if (ret) {
        ucs_warn("gdr_unpin_buffer failed. ret:%d", ret);
    }
err:
    return UCS_ERR_IO_ERROR;
}
/* Pending-queue callback that resumes a flush on the lane it was queued on.
 *
 * Retries uct_ep_flush() on req->send.lane, then lets ucp_ep_flush_progress()
 * try the remaining lanes and checks for overall completion. Returning
 * UCS_ERR_NO_RESOURCE keeps this request on the pending queue; any other
 * status removes it (with req->send.lane reset on success/inprogress). */
static ucs_status_t ucp_ep_flush_progress_pending(uct_pending_req_t *self)
{
    ucp_request_t *req = ucs_container_of(self, ucp_request_t, send.uct);
    ucp_lane_index_t lane = req->send.lane;
    ucp_ep_h ep = req->send.ep;
    ucs_status_t status;
    int completed;

    ucs_assert(!(req->flags & UCP_REQUEST_FLAG_COMPLETED));

    status = uct_ep_flush(ep->uct_eps[lane], 0, &req->send.uct_comp);
    ucs_trace("flushing ep %p lane[%d]: %s", ep, lane,
              ucs_status_string(status));
    if (status == UCS_OK) {
        --req->send.uct_comp.count; /* UCT endpoint is flushed */
    }

    /* since req->flush.pend.lane is still non-NULL, this function will not
     * put anything on pending. */
    ucp_ep_flush_progress(req);
    completed = ucp_flush_check_completion(req);

    /* If the operation has not completed, add slow-path progress to resume */
    if (!completed && req->send.flush.lanes && !req->send.flush.cbq_elem_on) {
        ucs_trace("ep %p: adding slow-path callback to resume flush", ep);
        req->send.flush.cbq_elem.cb = ucp_ep_flush_resume_slow_path_callback;
        req->send.flush.cbq_elem_on = 1;
        uct_worker_slowpath_progress_register(ep->worker->uct,
                                              &req->send.flush.cbq_elem);
    }

    if ((status == UCS_OK) || (status == UCS_INPROGRESS)) {
        /* Release the lane so future NO_RESOURCE lanes can queue the request */
        req->send.lane = UCP_NULL_LANE;
        return UCS_OK;
    } else if (status == UCS_ERR_NO_RESOURCE) {
        return UCS_ERR_NO_RESOURCE;
    } else {
        ucp_ep_flush_error(req, status);
        return UCS_OK;
    }
}
/* Serialize a KNEM memory handle into a remote-key buffer.
 * The wire format is simply the cookie and the registered address. */
static ucs_status_t uct_knem_rkey_pack(uct_md_h md, uct_mem_h memh,
                                       void *rkey_buffer)
{
    uct_knem_key_t *src = (uct_knem_key_t *)memh;
    uct_knem_key_t *dst = (uct_knem_key_t*)rkey_buffer;

    dst->cookie  = (uint64_t)src->cookie;
    dst->address = (uintptr_t)src->address;

    ucs_trace("packed rkey: cookie 0x%"PRIx64" address %"PRIxPTR,
              src->cookie, src->address);
    return UCS_OK;
}
/* Decide whether a TL resource is enabled by the user configuration.
 *
 * A resource is enabled iff its device appears in the configured device list
 * for its device type AND its transport is selected - either directly by name
 * or through a transport alias (e.g. a family name expanding to several TLs).
 * 'masks' carries the per-device-type match bitmaps updated by the device
 * check. Returns non-zero when both checks pass. */
static int ucp_is_resource_enabled(uct_tl_resource_desc_t *resource,
                                   const ucp_config_t *config, uint64_t *masks)
{
    int device_enabled, tl_enabled;
    ucp_tl_alias_t *alias;

    /* Find the enabled devices */
    device_enabled = ucp_is_resource_in_device_list(resource, config->devices,
                                                    masks, resource->dev_type);

    /* Find the enabled UCTs */
    ucs_assert(config->tls.count > 0);
    if (ucp_config_is_tl_enabled(config, resource->tl_name, 0)) {
        tl_enabled = 1;
    } else {
        tl_enabled = 0;

        /* check aliases */
        for (alias = ucp_tl_aliases; alias->alias != NULL; ++alias) {
            /* If an alias is enabled, and the transport is part of this alias,
             * enable the transport. */
            if (ucp_config_is_tl_enabled(config, alias->alias, 1) &&
                (ucp_str_array_search(alias->tls, ucp_tl_alias_count(alias),
                                      resource->tl_name) >= 0)) {
                tl_enabled = 1;
                ucs_trace("enabling tl '%s' for alias '%s'", resource->tl_name,
                          alias->alias);
                break;
            }
        }
    }

    ucs_trace(UCT_TL_RESOURCE_DESC_FMT " is %sabled",
              UCT_TL_RESOURCE_DESC_ARG(resource),
              (device_enabled && tl_enabled) ? "en" : "dis");
    return device_enabled && tl_enabled;
}
/* Add/remove epoll event bits on a TCP endpoint; re-arms epoll only when the
 * effective event mask actually changes. */
void uct_tcp_ep_mod_events(uct_tcp_ep_t *ep, uint32_t add, uint32_t remove)
{
    int updated = (ep->events | add) & ~remove;

    if (updated == ep->events) {
        return; /* mask unchanged - nothing to do */
    }

    ep->events = updated;
    ucs_trace("tcp_ep %p: set events to %c%c", ep,
              (updated & EPOLLIN)  ? 'i' : '-',
              (updated & EPOLLOUT) ? 'o' : '-');
    uct_tcp_ep_epoll_ctl(ep, EPOLL_CTL_MOD);
}
/* Decide whether a TL resource is enabled by the user configuration.
 *
 * Matches the resource's device against config->devices ("all" enables
 * everything) and its transport against config->tls. *devices_mask_p gets one
 * bit per config entry that matched this resource's device, so the caller can
 * later detect config entries that matched nothing.
 *
 * BUGFIX: the device bitmap was built with UCS_MASK(config_idx), which is the
 * mask of all bits BELOW config_idx (and zero for index 0) - so matched
 * entries set the wrong bits. UCS_BIT(config_idx) sets exactly the matched
 * entry's bit. */
static int ucp_is_resource_enabled(uct_tl_resource_desc_t *resource,
                                   const ucp_config_t *config,
                                   uint64_t *devices_mask_p)
{
    int device_enabled, tl_enabled;
    unsigned config_idx;

    ucs_assert(config->devices.count > 0);
    if (!strcmp(config->devices.names[0], "all")) {
        /* if the user's list is 'all', use all the available resources */
        device_enabled  = 1;
        *devices_mask_p = 1;
    } else {
        /* go over the device list from the user and check (against the available resources)
         * which can be satisfied */
        device_enabled  = 0;
        *devices_mask_p = 0;
        ucs_assert_always(config->devices.count <= 64); /* Using uint64_t bitmap */
        for (config_idx = 0; config_idx < config->devices.count; ++config_idx) {
            if (!strcmp(config->devices.names[config_idx], resource->dev_name)) {
                device_enabled   = 1;
                *devices_mask_p |= UCS_BIT(config_idx);
            }
        }
    }

    /* Disable the posix mmap and xpmem 'devices'. ONLY for now - use sysv for mm .
     * This will be removed after multi-rail is supported */
    if (!strcmp(resource->dev_name, "posix") ||
        !strcmp(resource->dev_name, "xpmem")) {
        device_enabled = 0;
    }

    ucs_assert(config->tls.count > 0);
    if (!strcmp(config->tls.names[0], "all")) {
        /* if the user's list is 'all', use all the available tls */
        tl_enabled = 1;
    } else {
        /* go over the tls list from the user and compare it against the available resources */
        tl_enabled = 0;
        for (config_idx = 0; config_idx < config->tls.count; ++config_idx) {
            if (!strcmp(config->tls.names[config_idx], resource->tl_name)) {
                tl_enabled = 1;
                break;
            }
        }
    }

    ucs_trace(UCT_TL_RESOURCE_DESC_FMT " is %sabled",
              UCT_TL_RESOURCE_DESC_ARG(resource),
              (device_enabled && tl_enabled) ? "en" : "dis");
    return device_enabled && tl_enabled;
}
/* Return the MAC address of the first non-loopback interface, packed into the
 * low 6 bytes of a uint64_t. The result is cached in a function-local static
 * after the first successful lookup; 0 is returned (and not cached as an
 * error marker) on any failure.
 *
 * NOTE(review): the static cache is written without synchronization -
 * presumably this is only called from one thread during init; confirm. */
static uint64_t ucs_get_mac_address()
{
    static uint64_t mac_address = 0;
    struct ifreq ifr, *it, *end;
    struct ifconf ifc;
    char buf[1024];
    int sock;

    if (mac_address == 0) {
        sock = socket(AF_INET, SOCK_DGRAM, IPPROTO_IP);
        if (sock == -1) {
            ucs_error("failed to create socket: %m");
            return 0;
        }

        /* Enumerate configured interfaces into buf */
        ifc.ifc_len = sizeof(buf);
        ifc.ifc_buf = buf;
        if (ioctl(sock, SIOCGIFCONF, &ifc) == -1) {
            ucs_error("ioctl(SIOCGIFCONF) failed: %m");
            close(sock);
            return 0;
        }

        it  = ifc.ifc_req;
        end = it + (ifc.ifc_len / sizeof *it);
        for (it = ifc.ifc_req; it != end; ++it) {
            strcpy(ifr.ifr_name, it->ifr_name);
            if (ioctl(sock, SIOCGIFFLAGS, &ifr) != 0) {
                ucs_error("ioctl(SIOCGIFFLAGS) failed: %m");
                close(sock);
                return 0;
            }

            /* Take the hardware address of the first non-loopback iface */
            if (!(ifr.ifr_flags & IFF_LOOPBACK)) {
                if (ioctl(sock, SIOCGIFHWADDR, &ifr) != 0) {
                    ucs_error("ioctl(SIOCGIFHWADDR) failed: %m");
                    close(sock);
                    return 0;
                }
                memcpy(&mac_address, ifr.ifr_hwaddr.sa_data, 6);
                break;
            }
        }

        close(sock);
        ucs_trace("MAC address is 0x%012"PRIX64, mac_address);
    }

    return mac_address;
}
/* Handle a wireup ACK from the peer identified by 'uuid': mark the endpoint
 * remote-connected and promote its stub/wireup lanes. An unknown uuid is
 * ignored (the endpoint may already be gone). */
static void ucp_wireup_process_ack(ucp_worker_h worker, uint64_t uuid)
{
    ucp_ep_h ep;

    ep = ucp_worker_ep_find(worker, uuid);
    if (NULL == ep) {
        ucs_debug("ignoring connection ack - ep not exists");
        return;
    }

    ucs_trace("ep %p: got wireup ack", ep);

    ep->flags |= UCP_EP_FLAG_REMOTE_CONNECTED;
    ucp_wireup_ep_remote_connected(ep);
}
/* Deregister an XPMEM segment. Always reports success: a failing
 * xpmem_remove() is expected when the segment was already released or
 * deregistered elsewhere, so it is only logged at debug level. */
static ucs_status_t uct_xpmem_dereg(uct_mm_id_t mmid)
{
    if (xpmem_remove(mmid) < 0) {
        /* No error since there a chance that it already was released
         * or deregistered */
        ucs_debug("Failed to remove xpmem segment 0x%"PRIx64": %m", mmid);
    }

    ucs_trace("xpmem removed segment 0x%"PRIx64, mmid);
    return UCS_OK;
}
/* Destroy a DC endpoint, deferring the actual free while a flow-control grant
 * is still outstanding: such endpoints are marked invalid and parked on the
 * iface garbage-collection list so the grant can still be matched when it
 * arrives. Otherwise the endpoint memory is freed immediately. */
void uct_dc_ep_cleanup(uct_ep_h tl_ep, ucs_class_t *cls)
{
    uct_dc_ep_t *ep = ucs_derived_of(tl_ep, uct_dc_ep_t);
    uct_dc_iface_t *iface = ucs_derived_of(ep->super.super.iface, uct_dc_iface_t);

    UCS_CLASS_CLEANUP_CALL(cls, ep);

    if (uct_dc_ep_fc_wait_for_grant(ep)) {
        /* Keep the ep alive until the FC grant shows up */
        ucs_trace("not releasing dc_ep %p - waiting for grant", ep);
        ep->state = UCT_DC_EP_INVALID;
        ucs_list_add_tail(&iface->tx.gc_list, &ep->list);
    } else {
        ucs_free(ep);
    }
}
/* Notify every point-to-point lane of this endpoint that the remote side is
 * connected, so its stub endpoint can switch over to the real transport. */
static void ucp_wireup_ep_remote_connected(ucp_ep_h ep)
{
    ucp_worker_h worker = ep->worker;
    ucp_rsc_index_t rsc;
    ucp_lane_index_t lane;

    ucs_trace("ep %p: remote connected", ep);

    for (lane = 0; lane < ucp_ep_num_lanes(ep); ++lane) {
        rsc = ucp_ep_get_rsc_index(ep, lane);
        if (!ucp_worker_is_tl_p2p(worker, rsc)) {
            continue; /* lane connects via iface address, nothing to promote */
        }
        ucp_stub_ep_remote_connected(ep->uct_eps[lane]);
    }
}
/* Tear down an endpoint after a local disconnect, unless the remote side is
 * still connected - in that case destruction is deferred to worker teardown
 * so the peer can keep sending. */
static void ucp_ep_disconnected(ucp_request_t *req)
{
    ucp_ep_h ep = req->send.ep;

    if (!(ep->flags & UCP_EP_FLAG_REMOTE_CONNECTED)) {
        ucp_ep_delete_from_hash(ep);
        ucp_ep_destroy_internal(ep, " from disconnect");
        return;
    }

    /* Endpoints which have remote connection are destroyed only when the
     * worker is destroyed, to enable remote endpoints keep sending.
     * TODO negotiate disconnect. */
    ucs_trace("not destroying ep %p because of connection from remote", ep);
}
/* Pack the remote keys of a memory handle into a newly-allocated buffer.
 *
 * Wire layout: [uint64_t pd_map][for each PD in pd_map: uint8_t rkey_size,
 * packed uct rkey bytes]. The buffer is sized up-front from each PD's
 * rkey_packed_size attribute; the caller owns *rkey_buffer_p and must free
 * it. Returns UCS_ERR_NO_MEMORY if allocation fails. */
ucs_status_t ucp_rkey_pack(ucp_context_h context, ucp_mem_h memh,
                           void **rkey_buffer_p, size_t *size_p)
{
    unsigned pd_index, uct_memh_index;
    void *rkey_buffer, *p;
    size_t size, pd_size;

    ucs_trace("packing rkeys for buffer %p memh %p pd_map 0x%"PRIx64,
              memh->address, memh, memh->pd_map);

    /* First pass: compute the total packed size */
    size = sizeof(uint64_t);
    for (pd_index = 0; pd_index < context->num_pds; ++pd_index) {
        size += sizeof(uint8_t);
        pd_size = context->pd_attrs[pd_index].rkey_packed_size;
        ucs_assert_always(pd_size < UINT8_MAX); /* size must fit in one byte */
        size += pd_size;
    }

    rkey_buffer = ucs_malloc(size, "ucp_rkey_buffer");
    if (rkey_buffer == NULL) {
        return UCS_ERR_NO_MEMORY;
    }

    p = rkey_buffer;

    /* Write the PD map */
    *(uint64_t*)p = memh->pd_map;
    p += sizeof(uint64_t);

    /* Write both size and rkey_buffer for each UCT rkey */
    uct_memh_index = 0;
    for (pd_index = 0; pd_index < context->num_pds; ++pd_index) {
        if (!(memh->pd_map & UCS_BIT(pd_index))) {
            continue; /* memory not registered with this PD */
        }

        pd_size = context->pd_attrs[pd_index].rkey_packed_size;
        *((uint8_t*)p++) = pd_size;
        uct_pd_mkey_pack(context->pds[pd_index], memh->uct[uct_memh_index], p);
        ++uct_memh_index;
        p += pd_size;
    }

    *rkey_buffer_p = rkey_buffer;
    *size_p = size;
    return UCS_OK;
}
/* Check whether a flush request has finished on all lanes.
 *
 * Returns 0 while completions are still outstanding. Once the count hits
 * zero, swaps the request's slow-path element from the "resume flush"
 * callback to the "flushed" callback (remove first, then re-register) and
 * returns 1. */
static int ucp_flush_check_completion(ucp_request_t *req)
{
    ucp_ep_h ep = req->send.ep;

    /* Check if flushed all lanes */
    if (req->send.uct_comp.count != 0) {
        return 0;
    }

    ucs_trace("adding slow-path callback to destroy ep %p", ep);
    /* Remove any pending resume callback before re-using cbq_elem */
    ucp_ep_flush_slow_path_remove(req);
    req->send.flush.cbq_elem.cb = ucp_ep_flushed_slow_path_callback;
    req->send.flush.cbq_elem_on = 1;
    uct_worker_slowpath_progress_register(ep->worker->uct,
                                          &req->send.flush.cbq_elem);
    return 1;
}
/* Connect the local p2p lanes of 'ep' to the peer's endpoint addresses.
 *
 * 'tli' maps each lane to an index in 'address_list' (as carried in the
 * wireup message). Lanes whose transport connects via iface address are
 * skipped. Debug builds assert that a lane used for RMA/AMO points at the
 * matching remote memory-domain index. Returns the first failing
 * uct_ep_connect_to_ep() status, or UCS_OK. */
static ucs_status_t ucp_wireup_connect_local(ucp_ep_h ep, const uint8_t *tli,
                                             unsigned address_count,
                                             const ucp_address_entry_t *address_list)
{
    ucp_worker_h worker = ep->worker;
    const ucp_address_entry_t *address;
    ucp_rsc_index_t rsc_index;
    ucp_lane_index_t lane, amo_index;
    ucs_status_t status;
    ucp_md_map_t UCS_V_UNUSED md_map;

    ucs_trace("ep %p: connect local transports", ep);

    for (lane = 0; lane < ucp_ep_num_lanes(ep); ++lane) {
        rsc_index = ucp_ep_get_rsc_index(ep, lane);
        if (!ucp_worker_is_tl_p2p(worker, rsc_index)) {
            continue;
        }

        address = &address_list[tli[lane]];
        ucs_assert(address->tl_addr_len > 0);

        /* Check that if the lane is used for RMA/AMO, destination md index matches */
        md_map = ucp_lane_map_get_lane(ucp_ep_config(ep)->key.rma_lane_map, lane);
        ucs_assertv((md_map == 0) || (md_map == UCS_BIT(address->md_index)),
                    "lane=%d ai=%d md_map=0x%x md_index=%d", lane, tli[lane],
                    md_map, address->md_index);
        amo_index = ucp_ep_get_amo_lane_index(&ucp_ep_config(ep)->key, lane);
        if (amo_index != UCP_NULL_LANE) {
            md_map = ucp_lane_map_get_lane(ucp_ep_config(ep)->key.amo_lane_map,
                                           amo_index);
            ucs_assertv((md_map == 0) || (md_map == UCS_BIT(address->md_index)),
                        "lane=%d ai=%d md_map=0x%x md_index=%d", lane, tli[lane],
                        md_map, address->md_index);
        }

        status = uct_ep_connect_to_ep(ep->uct_eps[lane], address->dev_addr,
                                      address->ep_addr);
        if (status != UCS_OK) {
            return status;
        }
    }

    return UCS_OK;
}
/* Slow-path callback invoked once a flush request has fully completed.
 *
 * Deregisters the slow-path element, runs the request's flushed_cb, and then
 * completes/releases the request - done from here (not earlier) so the
 * request is not freed while its slow-path element is still registered. */
static void ucp_ep_flushed_slow_path_callback(ucs_callbackq_slow_elem_t *self)
{
    ucp_request_t *req = ucs_container_of(self, ucp_request_t, send.flush.cbq_elem);
    ucp_ep_h ep = req->send.ep;

    ucs_assert(!(req->flags & UCP_REQUEST_FLAG_COMPLETED));

    ucs_trace("flush req %p ep %p remove from uct_worker %p", req, ep,
              ep->worker->uct);
    ucp_ep_flush_slow_path_remove(req);
    req->send.flush.flushed_cb(req);

    /* Complete send request from here, to avoid releasing the request while
     * slow-path element is still pending */
    ucs_trace_req("completing flush request %p (%p) with status %s",
                  req, req + 1, ucs_status_string(req->status));
    ucp_request_put(req, req->status);
}
/* Handle a wireup REPLY message from the peer identified by 'uuid'.
 *
 * Connects any not-yet-connected local p2p lanes using the address list from
 * the message, marks the endpoint locally/remotely connected, and - only if
 * this reply triggered the local connect - sends back an ACK (without
 * addresses, which were already sent in the request). An unknown uuid is
 * ignored. */
static void ucp_wireup_process_reply(ucp_worker_h worker, ucp_wireup_msg_t *msg,
                                     uint64_t uuid, unsigned address_count,
                                     const ucp_address_entry_t *address_list)
{
    ucp_ep_h ep = ucp_worker_ep_find(worker, uuid);
    ucp_rsc_index_t rsc_tli[UCP_MAX_LANES];
    ucs_status_t status;
    int ack;

    if (ep == NULL) {
        ucs_debug("ignoring connection reply - not exists");
        return;
    }

    ucs_trace("ep %p: got wireup reply", ep);

    /* Connect p2p addresses to remote endpoint */
    if (!(ep->flags & UCP_EP_FLAG_LOCAL_CONNECTED)) {
        status = ucp_wireup_connect_local(ep, msg->tli, address_count,
                                          address_list);
        if (status != UCS_OK) {
            return;
        }

        ep->flags |= UCP_EP_FLAG_LOCAL_CONNECTED;
        ack = 1;
    } else {
        ack = 0;
    }

    if (!(ep->flags & UCP_EP_FLAG_REMOTE_CONNECTED)) {
        ucp_wireup_ep_remote_connected(ep);
        ep->flags |= UCP_EP_FLAG_REMOTE_CONNECTED;
    }

    if (ack) {
        /* Send ACK without any address, we've already sent it as part of the request */
        memset(rsc_tli, -1, sizeof(rsc_tli));
        status = ucp_wireup_msg_send(ep, UCP_WIREUP_MSG_ACK, 0, rsc_tli);
        if (status != UCS_OK) {
            return;
        }
    }
}
/* Deserialize a KNEM remote key: allocate a key object and copy the cookie
 * and address out of the wire buffer. The caller owns the returned key (via
 * *rkey_p); *handle_p is unused for KNEM and set to NULL. */
static ucs_status_t uct_knem_rkey_unpack(uct_md_component_t *mdc,
                                         const void *rkey_buffer,
                                         uct_rkey_t *rkey_p, void **handle_p)
{
    const uct_knem_key_t *wire = (uct_knem_key_t *)rkey_buffer;
    uct_knem_key_t *key;

    key = ucs_malloc(sizeof(*key), "uct_knem_key_t");
    if (NULL == key) {
        ucs_error("Failed to allocate memory for uct_knem_key_t");
        return UCS_ERR_NO_MEMORY;
    }

    key->cookie  = wire->cookie;
    key->address = wire->address;

    *handle_p = NULL;
    *rkey_p   = (uintptr_t)key;

    ucs_trace("unpacked rkey: key %p cookie 0x%"PRIx64" address %"PRIxPTR,
              key, key->cookie, key->address);
    return UCS_OK;
}
/* Expose [address, address+size) via XPMEM, widened to page boundaries as
 * xpmem_make() requires. On success *mmid_p receives the segment id. */
static ucs_status_t uct_xmpem_reg(void *address, size_t size, uct_mm_id_t *mmid_p)
{
    size_t page_size = ucs_get_page_size();
    void *start      = ucs_align_down_pow2_ptr(address, page_size);
    void *end        = ucs_align_up_pow2_ptr(address + size, page_size);
    xpmem_segid_t segid;

    ucs_assert_always(start <= end);

    segid = xpmem_make(start, end - start, XPMEM_PERMIT_MODE, (void*)0666);
    VALGRIND_MAKE_MEM_DEFINED(&segid, sizeof(segid));
    if (segid < 0) {
        ucs_error("Failed to register %p..%p with xpmem: %m", start, end);
        return UCS_ERR_IO_ERROR;
    }

    ucs_trace("xpmem registered %p..%p segment 0x%llx", start, end, segid);
    *mmid_p = segid;
    return UCS_OK;
}
/* Unmap and unpin a gdrcopy registration created by the reg path.
 *
 * Unmaps the BAR mapping, then unpins the buffer; stops at the first failure
 * and returns UCS_ERR_IO_ERROR. Fixes the "memorory" typo in the success
 * trace message. */
static ucs_status_t uct_gdr_copy_mem_dereg_internal(uct_md_h uct_md,
                                                    uct_gdr_copy_mem_t *mem_hndl)
{
    uct_gdr_copy_md_t *md = ucs_derived_of(uct_md, uct_gdr_copy_md_t);
    int ret;

    ret = gdr_unmap(md->gdrcpy_ctx, mem_hndl->mh, mem_hndl->bar_ptr,
                    mem_hndl->reg_size);
    if (ret) {
        ucs_error("gdr_unmap failed. unpin_size:%lu ret:%d",
                  mem_hndl->reg_size, ret);
        return UCS_ERR_IO_ERROR;
    }

    ret = gdr_unpin_buffer(md->gdrcpy_ctx, mem_hndl->mh);
    if (ret) {
        ucs_error("gdr_unpin_buffer failed. ret:%d", ret);
        return UCS_ERR_IO_ERROR;
    }

    ucs_trace("deregistered memory. info.va:0x%"PRIx64" bar_ptr:%p",
              mem_hndl->info.va, mem_hndl->bar_ptr);
    return UCS_OK;
}
/* Test whether 'flags' satisfies all 'required_flags' for using a resource
 * as 'title'. Returns 1 on success; otherwise logs/records (in 'reason',
 * up to 'max' bytes) the first missing capability and returns 0. */
static int ucp_wireup_check_flags(const uct_tl_resource_desc_t *resource,
                                  uint64_t flags, uint64_t required_flags,
                                  const char *title, const char ** flag_descs,
                                  char *reason, size_t max)
{
    const char *missing;

    if (ucs_test_all_flags(flags, required_flags)) {
        return 1;
    }

    if (required_flags != 0) {
        missing = ucp_wireup_get_missing_flag_desc(flags, required_flags,
                                                   flag_descs);
        ucs_trace(UCT_TL_RESOURCE_DESC_FMT " : not suitable for %s, no %s",
                  UCT_TL_RESOURCE_DESC_ARG(resource), title, missing);
        snprintf(reason, max, UCT_TL_RESOURCE_DESC_FMT" - no %s",
                 UCT_TL_RESOURCE_DESC_ARG(resource), missing);
    }

    return 0;
}