/**
 * Register the memory on all PDs, except maybe for alloc_pd.
 * In case alloc_pd != NULL, alloc_pd_memh will hold the memory key obtained from
 * allocation. It will be put in the array of keys in the proper index.
 */
static ucs_status_t ucp_memh_reg_pds(ucp_context_h context, ucp_mem_h memh,
                                     uct_mem_h alloc_pd_memh)
{
    uct_mem_h dummy_pd_memh;
    unsigned uct_memh_count;
    ucs_status_t status;
    unsigned pd_index;

    memh->pd_map   = 0;
    uct_memh_count = 0;

    /* Register on all transports (except the one we used to allocate) */
    for (pd_index = 0; pd_index < context->num_pds; ++pd_index) {
        if (context->pds[pd_index] == memh->alloc_pd) {
            /* Add the memory handle we got from allocation */
            ucs_assert(memh->alloc_method == UCT_ALLOC_METHOD_PD);
            memh->pd_map |= UCS_BIT(pd_index);
            memh->uct[uct_memh_count++] = alloc_pd_memh;
        } else if (context->pd_attrs[pd_index].cap.flags & UCT_PD_FLAG_REG) {
            /* If the PD supports registration, register on it as well */
            status = uct_pd_mem_reg(context->pds[pd_index], memh->address,
                                    memh->length, &memh->uct[uct_memh_count]);
            if (status != UCS_OK) {
                ucp_memh_dereg_pds(context, memh, &dummy_pd_memh);
                return status;
            }

            memh->pd_map |= UCS_BIT(pd_index);
            ++uct_memh_count;
        }
    }

    return UCS_OK;
}
static void ucp_address_pack_iface_attr(ucp_address_packed_iface_attr_t *packed,
                                        const uct_iface_attr_t *iface_attr,
                                        int enable_atomics)
{
    uint32_t packed_flag;
    uint64_t cap_flags;
    uint64_t bit;

    cap_flags = iface_attr->cap.flags;
    if (!enable_atomics) {
        cap_flags &= ~(UCP_UCT_IFACE_ATOMIC32_FLAGS | UCP_UCT_IFACE_ATOMIC64_FLAGS);
    }

    packed->prio_cap_flags = ((uint8_t)iface_attr->priority);
    packed->overhead       = iface_attr->overhead;
    packed->bandwidth      = iface_attr->bandwidth;
    packed->lat_ovh        = iface_attr->latency.overhead;

    /* Keep only the bits defined by UCP_ADDRESS_IFACE_FLAGS, to shrink address. */
    packed_flag = UCS_BIT(8);
    bit         = 1;
    while (UCP_ADDRESS_IFACE_FLAGS & ~(bit - 1)) {
        if (UCP_ADDRESS_IFACE_FLAGS & bit) {
            if (cap_flags & bit) {
                packed->prio_cap_flags |= packed_flag;
            }
            packed_flag <<= 1;
        }
        bit <<= 1;
    }
}
void uct_ib_log_dump_sg_list(struct ibv_sge *sg_list, int num_sge,
                             uint64_t inline_bitmap,
                             uct_log_data_dump_func_t data_dump,
                             char *buf, size_t max)
{
    char data[256];
    size_t total_len       = 0;
    size_t total_valid_len = 0;
    char *s    = buf;
    char *ends = buf + max;
    void *pd   = data;
    size_t len;
    int i;

    for (i = 0; i < num_sge; ++i) {
        if (inline_bitmap & UCS_BIT(i)) {
            snprintf(s, ends - s, " [inl len %d]", sg_list[i].length);
        } else {
            snprintf(s, ends - s, " [va 0x%"PRIx64" len %d lkey 0x%x]",
                     sg_list[i].addr, sg_list[i].length, sg_list[i].lkey);
        }
        len = ucs_min(sg_list[i].length, (void*)data + sizeof(data) - pd);
        memcpy(pd, (void*)sg_list[i].addr, len);

        s               += strlen(s);
        pd              += len;
        total_len       += len;
        total_valid_len += sg_list[i].length;
    }

    if (data_dump != NULL) {
        data_dump(data, total_len, total_valid_len, s, ends - s);
    }
}
static int ucp_is_resource_in_device_list(uct_tl_resource_desc_t *resource,
                                          const str_names_array_t *devices,
                                          uint64_t *masks, int index)
{
    int device_enabled, config_idx;

    if (devices[index].count == 0) {
        return 0;
    }

    if (!strcmp(devices[index].names[0], "all")) {
        /* if the user's list is 'all', use all the available resources */
        device_enabled = 1;
        masks[index]   = -1;   /* using all available devices. can satisfy 'all' */
    } else {
        /* go over the device list from the user and check (against the available
         * resources) which can be satisfied */
        device_enabled = 0;
        ucs_assert_always(devices[index].count <= 64); /* Using uint64_t bitmap */
        config_idx = ucp_str_array_search((const char**)devices[index].names,
                                          devices[index].count,
                                          resource->dev_name);
        if (config_idx >= 0) {
            device_enabled  = 1;
            masks[index]   |= UCS_BIT(config_idx);
        }
    }

    return device_enabled;
}
static void ucp_address_unpack_iface_attr(ucp_address_iface_attr_t *iface_attr,
                                          const ucp_address_packed_iface_attr_t *packed)
{
    uint32_t packed_flag;
    uint64_t bit;

    iface_attr->cap_flags = 0;
    iface_attr->priority  = packed->prio_cap_flags & UCS_MASK(8);
    iface_attr->overhead  = packed->overhead;
    iface_attr->bandwidth = packed->bandwidth;
    iface_attr->lat_ovh   = packed->lat_ovh;

    packed_flag = UCS_BIT(8);
    bit         = 1;
    while (UCP_ADDRESS_IFACE_FLAGS & ~(bit - 1)) {
        if (UCP_ADDRESS_IFACE_FLAGS & bit) {
            if (packed->prio_cap_flags & packed_flag) {
                iface_attr->cap_flags |= bit;
            }
            packed_flag <<= 1;
        }
        bit <<= 1;
    }
}
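/*
 * The pack/unpack pair above transmits only the capability bits selected by
 * UCP_ADDRESS_IFACE_FLAGS, placing them above the 8-bit priority field of
 * prio_cap_flags. Below is a minimal standalone sketch of the same round trip;
 * SELECTED_FLAGS, BIT() and MASK() are hypothetical stand-ins for
 * UCP_ADDRESS_IFACE_FLAGS, UCS_BIT() and UCS_MASK(), not the UCX definitions.
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define BIT(i)          (1ull << (i))
#define MASK(i)         (BIT(i) - 1)

/* Hypothetical subset of capability bits worth transmitting */
#define SELECTED_FLAGS  (BIT(3) | BIT(5) | BIT(12))

/* Pack 'priority' into bits 0..7 and the selected capability bits above them */
static uint32_t pack(uint8_t priority, uint64_t cap_flags)
{
    uint32_t packed      = priority;
    uint32_t packed_flag = BIT(8);
    uint64_t bit         = 1;

    while (SELECTED_FLAGS & ~(bit - 1)) {
        if (SELECTED_FLAGS & bit) {
            if (cap_flags & bit) {
                packed |= packed_flag;
            }
            packed_flag <<= 1;
        }
        bit <<= 1;
    }
    return packed;
}

/* Recover the selected capability bits and the priority from the packed field */
static uint64_t unpack(uint32_t packed, uint8_t *priority)
{
    uint64_t cap_flags   = 0;
    uint32_t packed_flag = BIT(8);
    uint64_t bit         = 1;

    *priority = packed & MASK(8);
    while (SELECTED_FLAGS & ~(bit - 1)) {
        if (SELECTED_FLAGS & bit) {
            if (packed & packed_flag) {
                cap_flags |= bit;
            }
            packed_flag <<= 1;
        }
        bit <<= 1;
    }
    return cap_flags;
}

int main(void)
{
    uint8_t prio;
    uint32_t packed = pack(7, BIT(3) | BIT(12) | BIT(40)); /* BIT(40) is not selected */
    uint64_t caps   = unpack(packed, &prio);

    assert(prio == 7);
    assert(caps == (BIT(3) | BIT(12)));  /* only the selected bits survive */
    printf("packed=0x%x caps=0x%llx\n", (unsigned)packed, (unsigned long long)caps);
    return 0;
}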
static ucs_status_t ucp_wireup_connect_local(ucp_ep_h ep, const uint8_t *tli,
                                             unsigned address_count,
                                             const ucp_address_entry_t *address_list)
{
    ucp_worker_h worker = ep->worker;
    const ucp_address_entry_t *address;
    ucp_rsc_index_t rsc_index;
    ucp_lane_index_t lane, amo_index;
    ucs_status_t status;
    ucp_md_map_t UCS_V_UNUSED md_map;

    ucs_trace("ep %p: connect local transports", ep);

    for (lane = 0; lane < ucp_ep_num_lanes(ep); ++lane) {
        rsc_index = ucp_ep_get_rsc_index(ep, lane);
        if (!ucp_worker_is_tl_p2p(worker, rsc_index)) {
            continue;
        }

        address = &address_list[tli[lane]];
        ucs_assert(address->tl_addr_len > 0);

        /* Check that if the lane is used for RMA/AMO, destination md index matches */
        md_map = ucp_lane_map_get_lane(ucp_ep_config(ep)->key.rma_lane_map, lane);
        ucs_assertv((md_map == 0) || (md_map == UCS_BIT(address->md_index)),
                    "lane=%d ai=%d md_map=0x%x md_index=%d",
                    lane, tli[lane], md_map, address->md_index);

        amo_index = ucp_ep_get_amo_lane_index(&ucp_ep_config(ep)->key, lane);
        if (amo_index != UCP_NULL_LANE) {
            md_map = ucp_lane_map_get_lane(ucp_ep_config(ep)->key.amo_lane_map,
                                           amo_index);
            ucs_assertv((md_map == 0) || (md_map == UCS_BIT(address->md_index)),
                        "lane=%d ai=%d md_map=0x%x md_index=%d",
                        lane, tli[lane], md_map, address->md_index);
        }

        status = uct_ep_connect_to_ep(ep->uct_eps[lane], address->dev_addr,
                                      address->ep_addr);
        if (status != UCS_OK) {
            return status;
        }
    }

    return UCS_OK;
}
/* Parse a number of bits and convert it to a mask of that many low-order bits
 * (e.g. "3" becomes 0x7) */
int ucs_config_sscanf_bitmask(const char *buf, void *dest, const void *arg)
{
    int ret = sscanf(buf, "%u", (unsigned*)dest);

    if (*(unsigned*)dest != 0) {
        *(unsigned*)dest = UCS_BIT(*(unsigned*)dest) - 1;
    }
    return ret;
}
static void print_atomic_info(uct_atomic_op_t opcode, const char *name,
                              const char *suffix, uint64_t op32, uint64_t op64)
{
    char amo[256] = "atomic_";
    char *s;

    if ((op32 & UCS_BIT(opcode)) || (op64 & UCS_BIT(opcode))) {
        s = strduplower(name);
        strncat(amo, suffix, sizeof(amo) - strlen(amo) - 1);
        strncat(amo, s, sizeof(amo) - strlen(amo) - 1);
        free(s);

        if ((op32 & UCS_BIT(opcode)) && (op64 & UCS_BIT(opcode))) {
            printf("#         %12s: 32, 64 bit\n", amo);
        } else {
            printf("#         %12s: %d bit\n", amo,
                   (op32 & UCS_BIT(opcode)) ? 32 : 64);
        }
    }
}
static ucs_status_t ucp_address_gather_devices(ucp_worker_h worker, uint64_t tl_bitmap,
                                               int has_ep,
                                               ucp_address_packed_device_t **devices_p,
                                               ucp_rsc_index_t *num_devices_p)
{
    ucp_context_h context = worker->context;
    ucp_address_packed_device_t *dev, *devices;
    uct_iface_attr_t *iface_attr;
    ucp_rsc_index_t num_devices;
    ucp_rsc_index_t i;
    uint64_t mask;

    devices = ucs_calloc(context->num_tls, sizeof(*devices), "packed_devices");
    if (devices == NULL) {
        return UCS_ERR_NO_MEMORY;
    }

    num_devices = 0;
    for (i = 0; i < context->num_tls; ++i) {
        mask = UCS_BIT(i);
        if (!(mask & tl_bitmap)) {
            continue;
        }

        iface_attr = &worker->ifaces[i].attr;

        if (!(iface_attr->cap.flags & UCT_IFACE_FLAG_CONNECT_TO_IFACE) &&
            !(iface_attr->cap.flags & UCT_IFACE_FLAG_CONNECT_TO_EP)) {
            continue;
        }

        dev = ucp_address_get_device(context->tl_rscs[i].tl_rsc.dev_name,
                                     devices, &num_devices);

        dev->tl_addrs_size += iface_attr->iface_addr_len;

        if (!(iface_attr->cap.flags & UCT_IFACE_FLAG_CONNECT_TO_IFACE) && has_ep) {
            /* ep address and its length */
            dev->tl_addrs_size += 1 + iface_attr->ep_addr_len;
        }

        dev->tl_addrs_size += sizeof(uint16_t); /* tl name checksum */
        dev->tl_addrs_size += sizeof(ucp_address_packed_iface_attr_t); /* iface attr */
        dev->tl_addrs_size += 1;                /* iface address length */

        dev->rsc_index      = i;
        dev->dev_addr_len   = iface_attr->device_addr_len;
        dev->tl_bitmap     |= mask;
    }

    *devices_p     = devices;
    *num_devices_p = num_devices;
    return UCS_OK;
}
/**
 * Print a bitmap as a list of ranges.
 *
 * @param n       Number equivalent to the first bit in the bitmap.
 * @param bitmap  Compressed array of bits.
 * @param length  Number of bits in the bitmap.
 */
const char *ucs_log_bitmap_to_str(unsigned n, uint8_t *bitmap, size_t length)
{
    static char buf[512] = {0};
    int first, in_range;
    unsigned prev = 0, end = 0;
    char *p, *endp;
    size_t i;

    p    = buf;
    endp = buf + sizeof(buf) - 4;

    first    = 1;
    in_range = 0;
    for (i = 0; i < length; ++i) {
        if (bitmap[i / 8] & UCS_BIT(i % 8)) {
            if (first) {
                p += snprintf(p, endp - p, "%d", n);
                if (p > endp) {
                    goto overflow;
                }
            } else if (n == prev + 1) {
                in_range = 1;
                end      = n;
            } else {
                if (in_range) {
                    p += snprintf(p, endp - p, "-%d", end);
                    if (p > endp) {
                        goto overflow;
                    }
                }
                in_range = 0;
                p += snprintf(p, endp - p, ",%d", n);
                if (p > endp) {
                    goto overflow;
                }
            }
            first = 0;
            prev  = n;
        }
        ++n;
    }

    if (in_range) {
        p += snprintf(p, endp - p, "-%d", end);
        if (p > endp) {
            goto overflow;
        }
    }
    return buf;

overflow:
    strcpy(p, "...");
    return buf;
}
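/*
 * ucs_log_bitmap_to_str() collapses consecutive set bits into ranges: with
 * n=0, the set bits {3,5,6,7} render as "3,5-7". It addresses a byte-packed
 * bitmap with the idiom bitmap[i / 8] & UCS_BIT(i % 8). A small self-contained
 * illustration of that indexing idiom, assuming only that UCS_BIT(i) expands
 * to a shift (here a local BIT() macro is used instead):
 */
#include <assert.h>
#include <stdint.h>
#include <string.h>

#define BIT(i) (1ul << (i))

/* Set bit 'i' in a byte-packed bitmap */
static void bitmap_set(uint8_t *bitmap, unsigned i)
{
    bitmap[i / 8] |= BIT(i % 8);
}

/* Test bit 'i' in a byte-packed bitmap */
static int bitmap_test(const uint8_t *bitmap, unsigned i)
{
    return !!(bitmap[i / 8] & BIT(i % 8));
}

int main(void)
{
    uint8_t bitmap[4];

    memset(bitmap, 0, sizeof(bitmap));
    bitmap_set(bitmap, 0);
    bitmap_set(bitmap, 9);    /* lands in bitmap[1], bit 1 */

    assert(bitmap_test(bitmap, 0));
    assert(bitmap_test(bitmap, 9));
    assert(!bitmap_test(bitmap, 8));
    return 0;
}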
static ucs_status_t ucp_address_gather_devices(ucp_worker_h worker, uint64_t tl_bitmap,
                                               int has_ep,
                                               ucp_address_packed_device_t **devices_p,
                                               ucp_rsc_index_t *num_devices_p)
{
    ucp_context_h context = worker->context;
    ucp_address_packed_device_t *dev, *devices;
    uct_iface_attr_t *iface_attr;
    ucp_rsc_index_t num_devices;
    ucp_rsc_index_t i;
    uint64_t mask;

    devices = ucs_calloc(context->num_tls, sizeof(*devices), "packed_devices");
    if (devices == NULL) {
        return UCS_ERR_NO_MEMORY;
    }

    num_devices = 0;
    for (i = 0; i < context->num_tls; ++i) {
        mask = UCS_BIT(i);
        if (!(mask & tl_bitmap)) {
            continue;
        }

        dev = ucp_address_get_device(context->tl_rscs[i].tl_rsc.dev_name,
                                     devices, &num_devices);

        iface_attr = &worker->iface_attrs[i];
        if (iface_attr->cap.flags & UCT_IFACE_FLAG_CONNECT_TO_IFACE) {
            dev->tl_addrs_size += iface_attr->iface_addr_len;
        } else if (iface_attr->cap.flags & UCT_IFACE_FLAG_CONNECT_TO_EP) {
            if (has_ep) {
                dev->tl_addrs_size += iface_attr->ep_addr_len;
            } else {
                /* Empty address */
            }
        } else {
            continue;
        }

        dev->rsc_index      = i;
        dev->dev_addr_len   = iface_attr->device_addr_len;
        dev->tl_bitmap     |= mask;
        dev->tl_addrs_size += 1; /* address length */
        dev->tl_addrs_size += ucp_address_string_packed_size(context->tl_rscs[i].tl_rsc.tl_name);
    }

    *devices_p     = devices;
    *num_devices_p = num_devices;
    return UCS_OK;
}
static void ucp_check_unavailable_devices(const str_names_array_t *devices,
                                          uint64_t *masks)
{
    int dev_type_idx, i;

    /* Go over the devices lists and check which devices were marked as unavailable */
    for (dev_type_idx = 0; dev_type_idx < UCT_DEVICE_TYPE_LAST; dev_type_idx++) {
        for (i = 0; i < devices[dev_type_idx].count; i++) {
            if (!(masks[dev_type_idx] & UCS_BIT(i))) {
                ucs_info("Device %s is not available",
                         devices[dev_type_idx].names[i]);
            }
        }
    }
}
ucs_status_t ucp_wireup_send_request(ucp_ep_h ep)
{
    ucp_worker_h worker = ep->worker;
    ucp_rsc_index_t rsc_tli[UCP_MAX_LANES];
    ucp_rsc_index_t rsc_index;
    uint64_t tl_bitmap = 0;
    ucp_lane_index_t lane;
    ucs_status_t status;

    if (ep->flags & UCP_EP_FLAG_CONNECT_REQ_SENT) {
        return UCS_OK;
    }

    ucs_assert_always(!ucp_ep_is_stub(ep));

    for (lane = 0; lane < UCP_MAX_LANES; ++lane) {
        if (lane < ucp_ep_num_lanes(ep)) {
            rsc_index     = ucp_ep_get_rsc_index(ep, lane);
            rsc_tli[lane] = ucp_worker_is_tl_p2p(worker, rsc_index) ? rsc_index :
                                                                      UCP_NULL_RESOURCE;
            tl_bitmap    |= UCS_BIT(rsc_index);
        } else {
            rsc_tli[lane] = UCP_NULL_RESOURCE;
        }
    }

    /* TODO make sure such lane would exist */
    rsc_index = ucp_stub_ep_get_aux_rsc_index(
                    ep->uct_eps[ucp_ep_get_wireup_msg_lane(ep)]);
    if (rsc_index != UCP_NULL_RESOURCE) {
        tl_bitmap |= UCS_BIT(rsc_index);
    }

    ucs_debug("ep %p: send wireup request (flags=0x%x)", ep, ep->flags);
    status = ucp_wireup_msg_send(ep, UCP_WIREUP_MSG_REQUEST, tl_bitmap, rsc_tli);

    ep->flags |= UCP_EP_FLAG_CONNECT_REQ_SENT;
    return status;
}
static ucs_status_t uct_gdr_copy_md_query(uct_md_h md, uct_md_attr_t *md_attr)
{
    md_attr->cap.flags         = UCT_MD_FLAG_REG | UCT_MD_FLAG_NEED_RKEY;
    md_attr->cap.reg_mem_types = UCS_BIT(UCT_MD_MEM_TYPE_CUDA);
    md_attr->cap.mem_type      = UCT_MD_MEM_TYPE_CUDA;
    md_attr->cap.max_alloc     = 0;
    md_attr->cap.max_reg       = ULONG_MAX;
    md_attr->rkey_packed_size  = sizeof(uct_gdr_copy_key_t);
    md_attr->reg_cost.overhead = 0;
    md_attr->reg_cost.growth   = 0;
    memset(&md_attr->local_cpus, 0xff, sizeof(md_attr->local_cpus));
    return UCS_OK;
}
static ucs_status_t uct_self_md_query(uct_md_h md, uct_md_attr_t *attr)
{
    /* Dummy memory registration provided. No real memory handling exists */
    attr->cap.flags         = UCT_MD_FLAG_REG |
                              UCT_MD_FLAG_NEED_RKEY; /* TODO ignore rkey in rma/amo ops */
    attr->cap.reg_mem_types = UCS_BIT(UCT_MD_MEM_TYPE_HOST);
    attr->cap.mem_type      = UCT_MD_MEM_TYPE_HOST;
    attr->cap.max_alloc     = 0;
    attr->cap.max_reg       = ULONG_MAX;
    attr->rkey_packed_size  = 0; /* uct_md_query adds UCT_MD_COMPONENT_NAME_MAX to this */
    attr->reg_cost.overhead = 0;
    attr->reg_cost.growth   = 0;
    memset(&attr->local_cpus, 0xff, sizeof(attr->local_cpus));
    return UCS_OK;
}
static void ucp_ep_config_print_md_map(FILE *stream, const char *name,
                                       ucp_md_map_t md_map)
{
    ucp_rsc_index_t md_index;
    int first;

    first = 1;
    fprintf(stream, "%s", name);
    for (md_index = 0; md_index < UCP_MD_INDEX_BITS; ++md_index) {
        if (md_map & UCS_BIT(md_index)) {
            fprintf(stream, "%c%d", first ? '{' : ',', md_index);
            first = 0;
        }
    }
    fprintf(stream, "}");
}
ucs_status_t uct_iface_mem_alloc(uct_iface_h tl_iface, size_t length, unsigned flags,
                                 const char *name, uct_allocated_memory_t *mem)
{
    uct_base_iface_t *iface = ucs_derived_of(tl_iface, uct_base_iface_t);
    uct_md_attr_t md_attr;
    ucs_status_t status;

    status = uct_mem_alloc(NULL, length, UCT_MD_MEM_ACCESS_ALL,
                           iface->config.alloc_methods,
                           iface->config.num_alloc_methods, &iface->md, 1,
                           name, mem);
    if (status != UCS_OK) {
        goto err;
    }

    /* If the memory was not allocated using MD, register it */
    if (mem->method != UCT_ALLOC_METHOD_MD) {
        status = uct_md_query(iface->md, &md_attr);
        if (status != UCS_OK) {
            goto err_free;
        }

        /* If MD does not support registration, allow only the MD method */
        if ((md_attr.cap.flags & UCT_MD_FLAG_REG) &&
            (md_attr.cap.reg_mem_types & UCS_BIT(mem->mem_type))) {
            status = uct_md_mem_reg(iface->md, mem->address, mem->length,
                                    flags, &mem->memh);
            if (status != UCS_OK) {
                goto err_free;
            }
            ucs_assert(mem->memh != UCT_MEM_HANDLE_NULL);
        } else {
            mem->memh = UCT_MEM_HANDLE_NULL;
        }

        mem->md = iface->md;
    }

    return UCS_OK;

err_free:
    uct_mem_free(mem);
err:
    return status;
}
ucs_status_t ucp_rkey_pack(ucp_context_h context, ucp_mem_h memh,
                           void **rkey_buffer_p, size_t *size_p)
{
    unsigned pd_index, uct_memh_index;
    void *rkey_buffer, *p;
    size_t size, pd_size;

    ucs_trace("packing rkeys for buffer %p memh %p pd_map 0x%"PRIx64,
              memh->address, memh, memh->pd_map);

    size = sizeof(uint64_t);
    for (pd_index = 0; pd_index < context->num_pds; ++pd_index) {
        size += sizeof(uint8_t);
        pd_size = context->pd_attrs[pd_index].rkey_packed_size;
        ucs_assert_always(pd_size < UINT8_MAX);
        size += pd_size;
    }

    rkey_buffer = ucs_malloc(size, "ucp_rkey_buffer");
    if (rkey_buffer == NULL) {
        return UCS_ERR_NO_MEMORY;
    }

    p = rkey_buffer;

    /* Write the PD map */
    *(uint64_t*)p = memh->pd_map;
    p += sizeof(uint64_t);

    /* Write both size and rkey_buffer for each UCT rkey */
    uct_memh_index = 0;
    for (pd_index = 0; pd_index < context->num_pds; ++pd_index) {
        if (!(memh->pd_map & UCS_BIT(pd_index))) {
            continue;
        }

        pd_size = context->pd_attrs[pd_index].rkey_packed_size;
        *((uint8_t*)p++) = pd_size;
        uct_pd_mkey_pack(context->pds[pd_index], memh->uct[uct_memh_index], p);
        ++uct_memh_index;
        p += pd_size;
    }

    *rkey_buffer_p = rkey_buffer;
    *size_p        = size;
    return UCS_OK;
}
static uint64_t ucp_wireup_get_reachable_mds(ucp_worker_h worker,
                                             unsigned address_count,
                                             const ucp_address_entry_t *address_list)
{
    ucp_context_h context  = worker->context;
    uint64_t reachable_mds = 0;
    const ucp_address_entry_t *ae;
    ucp_rsc_index_t rsc_index;

    for (rsc_index = 0; rsc_index < context->num_tls; ++rsc_index) {
        for (ae = address_list; ae < address_list + address_count; ++ae) {
            if (ucp_wireup_is_reachable(worker, rsc_index, ae)) {
                reachable_mds |= UCS_BIT(ae->md_index);
            }
        }
    }

    return reachable_mds;
}
int ucs_config_sprintf_bitmap(char *buf, size_t max, void *src, const void *arg)
{
    char * const *table;
    int i, len;

    len = 0;
    for (table = arg, i = 0; *table; ++table, ++i) {
        if (*((unsigned*)src) & UCS_BIT(i)) {
            snprintf(buf + len, max - len, "%s,", *table);
            len = strlen(buf);
        }
    }

    if (len > 0) {
        buf[len - 1] = '\0'; /* remove last ',' */
    } else {
        buf[0] = '\0';
    }
    return 1;
}
int ucs_config_sscanf_bitmap(const char *buf, void *dest, const void *arg)
{
    char *str = strdup(buf);
    char *p;
    int ret, i;

    ret = 1;
    *((unsigned*)dest) = 0;
    p = strtok(str, ",");
    while (p != NULL) {
        i = __find_string_in_list(p, (const char**)arg);
        if (i < 0) {
            ret = 0;
            break;
        }
        *((unsigned*)dest) |= UCS_BIT(i);
        p = strtok(NULL, ",");
    }

    free(str);
    return ret;
}
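/*
 * The sprintf/sscanf pair above translates between a comma-separated list of
 * names and a bitmask whose bit i corresponds to entry i of a NULL-terminated
 * string table passed via 'arg'. A hedged, self-contained sketch of the same
 * idea follows; the table contents and find_in_table() are hypothetical
 * stand-ins for the configuration table and __find_string_in_list().
 */
#include <assert.h>
#include <string.h>

#define BIT(i) (1u << (i))

static const char *table[] = {"debug", "trace", "stats", NULL};

/* Return the index of 'name' in a NULL-terminated table, or -1 if absent */
static int find_in_table(const char *name, const char **tbl)
{
    int i;
    for (i = 0; tbl[i] != NULL; ++i) {
        if (!strcmp(name, tbl[i])) {
            return i;
        }
    }
    return -1;
}

/* Parse e.g. "debug,stats" into a bitmask; return 0 on an unknown name */
static int parse_bitmap(const char *buf, unsigned *mask_p)
{
    char str[128];
    char *p, *saveptr;
    int i;

    *mask_p = 0;
    strncpy(str, buf, sizeof(str) - 1);
    str[sizeof(str) - 1] = '\0';

    for (p = strtok_r(str, ",", &saveptr); p != NULL;
         p = strtok_r(NULL, ",", &saveptr)) {
        i = find_in_table(p, table);
        if (i < 0) {
            return 0;
        }
        *mask_p |= BIT(i);
    }
    return 1;
}

int main(void)
{
    unsigned mask;

    assert(parse_bitmap("debug,stats", &mask) && (mask == (BIT(0) | BIT(2))));
    assert(!parse_bitmap("debug,bogus", &mask));
    return 0;
}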
static int ucp_is_resource_in_device_list(uct_tl_resource_desc_t *resource,
                                          const str_names_array_t *devices,
                                          uint64_t *masks, int index)
{
    int device_enabled, config_idx;

    if (devices[index].count == 0) {
        return 0;
    }

    if (!strcmp(devices[index].names[0], "all")) {
        /* if the user's list is 'all', use all the available resources */
        device_enabled = 1;
        masks[index]   = -1;   /* using all available devices. can satisfy 'all' */
    } else {
        /* go over the device list from the user and check (against the available
         * resources) which can be satisfied */
        device_enabled = 0;
        ucs_assert_always(devices[index].count <= 64); /* Using uint64_t bitmap */
        config_idx = ucp_str_array_search((const char**)devices[index].names,
                                          devices[index].count,
                                          resource->dev_name);
        if (config_idx >= 0) {
            device_enabled  = 1;
            masks[index]   |= UCS_BIT(config_idx);
        }
    }

    /* Disable the posix mmap and xpmem 'devices'. ONLY for now - use sysv for mm.
     * This will be removed after multi-rail is supported */
    if ((!strcmp(resource->dev_name, "posix") ||
         !strcmp(resource->dev_name, "xpmem")) && device_enabled) {
        device_enabled = 0;
        ucs_info("posix and xpmem are currently unavailable");
    }

    return device_enabled;
}
/**
 * Unregister memory from all protection domains.
 * Save in *alloc_pd_memh_p the memory handle of the allocating PD, if such exists.
 */
static ucs_status_t ucp_memh_dereg_pds(ucp_context_h context, ucp_mem_h memh,
                                       uct_mem_h* alloc_pd_memh_p)
{
    unsigned pd_index, uct_index;
    ucs_status_t status;

    uct_index        = 0;
    *alloc_pd_memh_p = UCT_INVALID_MEM_HANDLE;

    for (pd_index = 0; pd_index < context->num_pds; ++pd_index) {
        if (!(memh->pd_map & UCS_BIT(pd_index))) {
            /* PD not present in the array */
            continue;
        }

        if (memh->alloc_pd == context->pds[pd_index]) {
            /* If we used a pd to allocate the memory, remember the memh - for
             * releasing the memory later. We cannot release the memory at this
             * point because we have to unregister it from other PDs first. */
            ucs_assert(memh->alloc_method == UCT_ALLOC_METHOD_PD);
            *alloc_pd_memh_p = memh->uct[uct_index];
        } else {
            status = uct_pd_mem_dereg(context->pds[pd_index],
                                      memh->uct[uct_index]);
            if (status != UCS_OK) {
                ucs_error("Failed to dereg address %p with pd %s",
                          memh->address, context->pd_rscs[pd_index].pd_name);
                return status;
            }
        }

        ++uct_index;
    }

    return UCS_OK;
}
static ucs_status_t ucp_pick_best_wireup(ucp_worker_h worker, ucp_address_t *address,
                                         ucp_wireup_score_function_t score_func,
                                         ucp_rsc_index_t *src_rsc_index_p,
                                         ucp_rsc_index_t *dst_rsc_index_p,
                                         ucp_rsc_index_t *dst_pd_index_p,
                                         struct sockaddr **addr_p,
                                         uint64_t *reachable_pds,
                                         const char *title)
{
    ucp_context_h context = worker->context;
    ucp_rsc_index_t src_rsc_index, dst_rsc_index;
    ucp_rsc_index_t pd_index;
    struct sockaddr *addr, *best_addr;
    double score, best_score;
    uct_iface_attr_t *iface_attr;
    uct_tl_resource_desc_t *resource;
    char tl_name[UCT_TL_NAME_MAX];
    uct_iface_h iface;
    void *iter;

    best_addr        = NULL;
    best_score       = 1e-9;
    *src_rsc_index_p = -1;
    *dst_rsc_index_p = -1;
    *dst_pd_index_p  = -1;
    *reachable_pds   = 0;

    /*
     * Find the best combination of local resource and reachable remote address.
     */
    dst_rsc_index = 0;
    ucp_address_iter_init(address, &iter);
    while (ucp_address_iter_next(&iter, &addr, tl_name, &pd_index)) {
        for (src_rsc_index = 0; src_rsc_index < context->num_tls; ++src_rsc_index) {
            resource   = &context->tl_rscs[src_rsc_index].tl_rsc;
            iface      = worker->ifaces[src_rsc_index];
            iface_attr = &worker->iface_attrs[src_rsc_index];

            /* Must be reachable address, on same transport */
            if (strcmp(tl_name, resource->tl_name) ||
                !uct_iface_is_reachable(iface, addr)) {
                continue;
            }

            *reachable_pds |= UCS_BIT(pd_index);

            score = score_func(resource, iface, iface_attr);
            ucs_trace("%s " UCT_TL_RESOURCE_DESC_FMT " score %.2f", title,
                      UCT_TL_RESOURCE_DESC_ARG(resource), score);
            if (score > best_score) {
                ucs_assert(addr != NULL);
                best_score       = score;
                best_addr        = addr;
                *src_rsc_index_p = src_rsc_index;
                *dst_rsc_index_p = dst_rsc_index;
                *dst_pd_index_p  = pd_index;
            }
        }
        ++dst_rsc_index;
    }

    if (best_addr == NULL) {
        return UCS_ERR_UNREACHABLE;
    }

    ucs_debug("%s: " UCT_TL_RESOURCE_DESC_FMT " to %d pd %d", title,
              UCT_TL_RESOURCE_DESC_ARG(&context->tl_rscs[*src_rsc_index_p].tl_rsc),
              *dst_rsc_index_p, *dst_pd_index_p);
    *addr_p = best_addr;
    return UCS_OK;
}
#include <netdb.h>
#include <getopt.h>
#include <string.h>
#include <sys/types.h>
#include <locale.h>
#if HAVE_MPI
#  include <mpi.h>
#elif HAVE_RTE
#  include <rte.h>
#endif

#define MAX_BATCH_FILES  32

enum {
    TEST_FLAG_PRINT_RESULTS = UCS_BIT(0),
    TEST_FLAG_PRINT_TEST    = UCS_BIT(1),
    TEST_FLAG_SET_AFFINITY  = UCS_BIT(8),
    TEST_FLAG_NUMERIC_FMT   = UCS_BIT(9),
    TEST_FLAG_PRINT_FINAL   = UCS_BIT(10),
    TEST_FLAG_PRINT_CSV     = UCS_BIT(11)
};

typedef struct sock_rte_group {
    int          is_server;
    int          connfd;
} sock_rte_group_t;

typedef struct test_type {
    const char   *name;
ucs_status_t ucp_rkey_pack(ucp_context_h context, ucp_mem_h memh,
                           void **rkey_buffer_p, size_t *size_p)
{
    unsigned pd_index, uct_memh_index;
    void *rkey_buffer, *p;
    size_t size, pd_size;
    ucs_status_t status;
    char UCS_V_UNUSED buf[128];

    ucs_trace("packing rkeys for buffer %p memh %p pd_map 0x%x",
              memh->address, memh, memh->pd_map);

    if (memh->length == 0) {
        /* dummy memh, return dummy key */
        *rkey_buffer_p = &ucp_mem_dummy_buffer;
        *size_p        = sizeof(ucp_mem_dummy_buffer);
        return UCS_OK;
    }

    size = sizeof(ucp_pd_map_t);
    for (pd_index = 0; pd_index < context->num_pds; ++pd_index) {
        size += sizeof(uint8_t);
        pd_size = context->pd_attrs[pd_index].rkey_packed_size;
        ucs_assert_always(pd_size < UINT8_MAX);
        size += pd_size;
    }

    rkey_buffer = ucs_malloc(size, "ucp_rkey_buffer");
    if (rkey_buffer == NULL) {
        status = UCS_ERR_NO_MEMORY;
        goto err;
    }

    p = rkey_buffer;

    /* Write the PD map */
    *(ucp_pd_map_t*)p = memh->pd_map;
    p += sizeof(ucp_pd_map_t);

    /* Write both size and rkey_buffer for each UCT rkey */
    uct_memh_index = 0;
    for (pd_index = 0; pd_index < context->num_pds; ++pd_index) {
        if (!(memh->pd_map & UCS_BIT(pd_index))) {
            continue;
        }

        pd_size = context->pd_attrs[pd_index].rkey_packed_size;
        *((uint8_t*)p++) = pd_size;
        uct_pd_mkey_pack(context->pds[pd_index], memh->uct[uct_memh_index], p);

        ucs_trace("rkey[%d]=%s for pd[%d]=%s", uct_memh_index,
                  ucs_log_dump_hex(p, pd_size, buf, sizeof(buf)),
                  pd_index, context->pd_rscs[pd_index].pd_name);

        ++uct_memh_index;
        p += pd_size;
    }

    if (uct_memh_index == 0) {
        status = UCS_ERR_UNSUPPORTED;
        goto err_destroy;
    }

    *rkey_buffer_p = rkey_buffer;
    *size_p        = size;
    return UCS_OK;

err_destroy:
    ucs_free(rkey_buffer);
err:
    return status;
}
ucs_status_t ucp_ep_rkey_unpack(ucp_ep_h ep, void *rkey_buffer, ucp_rkey_h *rkey_p)
{
    unsigned remote_pd_index, remote_pd_gap;
    unsigned rkey_index;
    unsigned pd_count;
    ucs_status_t status;
    ucp_rkey_h rkey;
    uint8_t pd_size;
    ucp_pd_map_t pd_map;
    void *p;

    /* Count the number of remote PDs in the rkey buffer */
    p = rkey_buffer;

    /* Read remote PD map */
    pd_map = *(ucp_pd_map_t*)p;

    ucs_trace("unpacking rkey with pd_map 0x%x", pd_map);

    if (pd_map == 0) {
        /* Dummy key, return ok */
        *rkey_p = &ucp_mem_dummy_rkey;
        return UCS_OK;
    }

    pd_count = ucs_count_one_bits(pd_map);
    p       += sizeof(ucp_pd_map_t);

    /* Allocate rkey handle which holds UCT rkeys for all remote PDs.
     * We keep all of them to handle a future transport switch. */
    rkey = ucs_malloc(sizeof(*rkey) + (sizeof(rkey->uct[0]) * pd_count), "ucp_rkey");
    if (rkey == NULL) {
        status = UCS_ERR_NO_MEMORY;
        goto err;
    }

    rkey->pd_map    = 0;
    remote_pd_index = 0; /* Index of remote PD */
    rkey_index      = 0; /* Index of the rkey in the array */

    /* Unpack rkey of each UCT PD */
    while (pd_map > 0) {
        pd_size = *((uint8_t*)p++);

        /* Use bit operations to iterate through the indices of the remote PDs
         * as provided in the pd_map. pd_map always holds a bitmap of PD indices
         * that remain to be used. Every time we find the "gap" until the next
         * valid PD index using ffs operation. If some rkeys cannot be unpacked,
         * we remove them from the local map. */
        remote_pd_gap    = ucs_ffs64(pd_map); /* Find the offset for next PD index */
        remote_pd_index += remote_pd_gap;     /* Calculate next index of remote PD */
        pd_map         >>= remote_pd_gap;     /* Remove the gap from the map */
        ucs_assert(pd_map & 1);

        /* Unpack only reachable rkeys */
        if (UCS_BIT(remote_pd_index) & ucp_ep_config(ep)->key.reachable_pd_map) {
            ucs_assert(rkey_index < pd_count);
            status = uct_rkey_unpack(p, &rkey->uct[rkey_index]);
            if (status != UCS_OK) {
                ucs_error("Failed to unpack remote key from remote pd[%d]: %s",
                          remote_pd_index, ucs_status_string(status));
                goto err_destroy;
            }

            ucs_trace("rkey[%d] for remote pd %d is 0x%lx", rkey_index,
                      remote_pd_index, rkey->uct[rkey_index].rkey);
            rkey->pd_map |= UCS_BIT(remote_pd_index);
            ++rkey_index;
        }

        ++remote_pd_index;
        pd_map >>= 1;
        p += pd_size;
    }

    if (rkey->pd_map == 0) {
        ucs_debug("The unpacked rkey from the destination is unreachable");
        status = UCS_ERR_UNREACHABLE;
        goto err_destroy;
    }

    *rkey_p = rkey;
    return UCS_OK;

err_destroy:
    ucp_rkey_destroy(rkey);
err:
    return status;
}
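/*
 * ucp_ep_rkey_unpack() walks the set bits of pd_map with ucs_ffs64(), shifting
 * the consumed bits away as it goes. The same traversal pattern is sketched
 * standalone below, with a local ffs64() assumed to behave like ucs_ffs64()
 * (index of the least-significant set bit of a non-zero map):
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Index of the least-significant set bit; 'map' must be non-zero */
static unsigned ffs64(uint64_t map)
{
    unsigned idx = 0;
    while (!(map & 1)) {
        map >>= 1;
        ++idx;
    }
    return idx;
}

int main(void)
{
    uint64_t map   = (1ull << 1) | (1ull << 4) | (1ull << 9);
    unsigned index = 0;
    unsigned gap;

    /* Visit the indices of the set bits in ascending order: 1, 4, 9 */
    while (map > 0) {
        gap    = ffs64(map);   /* distance to the next set bit */
        index += gap;
        map  >>= gap;          /* the next set bit is now bit 0 */
        assert(map & 1);

        printf("set bit at index %u\n", index);

        ++index;               /* step past the bit just handled */
        map >>= 1;
    }
    return 0;
}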
static void ucp_wireup_process_request(ucp_worker_h worker, const ucp_wireup_msg_t *msg,
                                       uint64_t uuid, const char *peer_name,
                                       unsigned address_count,
                                       const ucp_address_entry_t *address_list)
{
    ucp_ep_h ep = ucp_worker_ep_find(worker, uuid);
    ucp_rsc_index_t rsc_tli[UCP_MAX_LANES];
    uint8_t addr_indices[UCP_MAX_LANES];
    ucp_lane_index_t lane, remote_lane;
    ucp_rsc_index_t rsc_index;
    ucs_status_t status;
    uint64_t tl_bitmap = 0;

    ucs_trace("ep %p: got wireup request from %s", ep, peer_name);

    /* Create a new endpoint if it does not exist */
    if (ep == NULL) {
        status = ucp_ep_new(worker, uuid, peer_name, "remote-request", &ep);
        if (status != UCS_OK) {
            return;
        }
    }

    /* Initialize lanes (possibly destroy existing lanes) */
    status = ucp_wireup_init_lanes(ep, address_count, address_list, addr_indices);
    if (status != UCS_OK) {
        return;
    }

    /* Connect p2p addresses to remote endpoint */
    if (!(ep->flags & UCP_EP_FLAG_LOCAL_CONNECTED)) {
        status = ucp_wireup_connect_local(ep, addr_indices, address_count,
                                          address_list);
        if (status != UCS_OK) {
            return;
        }

        ep->flags |= UCP_EP_FLAG_LOCAL_CONNECTED;

        /* Construct the list that tells the remote side with which address we
         * have connected to each of its lanes. */
        memset(rsc_tli, -1, sizeof(rsc_tli));
        for (lane = 0; lane < ucp_ep_num_lanes(ep); ++lane) {
            rsc_index = ucp_ep_get_rsc_index(ep, lane);
            for (remote_lane = 0; remote_lane < UCP_MAX_LANES; ++remote_lane) {
                /* If 'lane' has connected to 'remote_lane' ... */
                if (addr_indices[lane] == msg->tli[remote_lane]) {
                    ucs_assert(ucp_worker_is_tl_p2p(worker, rsc_index));
                    rsc_tli[remote_lane] = rsc_index;
                    tl_bitmap           |= UCS_BIT(rsc_index);
                }
            }
        }

        ucs_trace("ep %p: sending wireup reply", ep);
        status = ucp_wireup_msg_send(ep, UCP_WIREUP_MSG_REPLY, tl_bitmap, rsc_tli);
        if (status != UCS_OK) {
            return;
        }
    }
}
static ucs_status_t uct_ib_iface_init_lmc(uct_ib_iface_t *iface,
                                          const uct_ib_iface_config_t *config)
{
    unsigned i, j, num_path_bits;
    unsigned first, last;
    uint8_t lmc;
    int step;

    if (config->lid_path_bits.count == 0) {
        ucs_error("List of path bits must not be empty");
        return UCS_ERR_INVALID_PARAM;
    }

    /* count the number of lid_path_bits */
    num_path_bits = 0;
    for (i = 0; i < config->lid_path_bits.count; i++) {
        num_path_bits += 1 + abs(config->lid_path_bits.ranges[i].first -
                                 config->lid_path_bits.ranges[i].last);
    }

    iface->path_bits = ucs_calloc(1, num_path_bits * sizeof(*iface->path_bits),
                                  "ib_path_bits");
    if (iface->path_bits == NULL) {
        return UCS_ERR_NO_MEMORY;
    }

    lmc = uct_ib_iface_port_attr(iface)->lmc;

    /* go over the list of values (ranges) for the lid_path_bits and set them */
    iface->path_bits_count = 0;
    for (i = 0; i < config->lid_path_bits.count; ++i) {
        first = config->lid_path_bits.ranges[i].first;
        last  = config->lid_path_bits.ranges[i].last;

        /* range of values or one value */
        if (first < last) {
            step = 1;
        } else {
            step = -1;
        }

        /* fill the value/s */
        for (j = first; j != (last + step); j += step) {
            if (j >= UCS_BIT(lmc)) {
                ucs_debug("Not using value %d for path_bits - must be < 2^lmc (lmc=%d)",
                          j, lmc);
                if (step == 1) {
                    break;
                } else {
                    continue;
                }
            }

            ucs_assert(iface->path_bits_count <= num_path_bits);
            iface->path_bits[iface->path_bits_count] = j;
            iface->path_bits_count++;
        }
    }

    return UCS_OK;
}
ucs_status_t ucp_wireup_select_lanes(ucp_ep_h ep, unsigned address_count,
                                     const ucp_address_entry_t *address_list,
                                     uint8_t *addr_indices,
                                     ucp_ep_config_key_t *key)
{
    ucp_worker_h worker            = ep->worker;
    ucp_lane_index_t num_amo_lanes = 0;
    ucp_wireup_lane_desc_t lane_descs[UCP_MAX_LANES];
    ucp_rsc_index_t rsc_index, dst_md_index;
    ucp_lane_index_t lane;
    ucs_status_t status;

    memset(lane_descs, 0, sizeof(lane_descs));
    memset(key, 0, sizeof(*key));

    status = ucp_wireup_add_rma_lanes(ep, address_count, address_list,
                                      lane_descs, &key->num_lanes);
    if (status != UCS_OK) {
        return status;
    }

    status = ucp_wireup_add_amo_lanes(ep, address_count, address_list,
                                      lane_descs, &key->num_lanes);
    if (status != UCS_OK) {
        return status;
    }

    status = ucp_wireup_add_am_lane(ep, address_count, address_list,
                                    lane_descs, &key->num_lanes);
    if (status != UCS_OK) {
        return status;
    }

    status = ucp_wireup_add_rndv_lane(ep, address_count, address_list,
                                      lane_descs, &key->num_lanes);
    if (status != UCS_OK) {
        return status;
    }

    /* The user should not create endpoints unless communication features were
     * requested */
    if (key->num_lanes == 0) {
        ucs_error("No transports selected to %s", ucp_ep_peer_name(ep));
        return UCS_ERR_UNREACHABLE;
    }

    /* Sort lanes according to RMA score */
    qsort(lane_descs, key->num_lanes, sizeof(*lane_descs),
          ucp_wireup_compare_lane_rma_score);

    /* Construct the endpoint configuration key:
     * - arrange lane description in the EP configuration
     * - create remote MD bitmap
     * - create bitmap of lanes used for RMA and AMO
     * - if AM lane exists and fits for wireup messages, select it for this purpose.
     */
    key->am_lane   = UCP_NULL_LANE;
    key->rndv_lane = UCP_NULL_LANE;
    for (lane = 0; lane < key->num_lanes; ++lane) {
        rsc_index          = lane_descs[lane].rsc_index;
        dst_md_index       = lane_descs[lane].dst_md_index;
        key->lanes[lane]   = rsc_index;
        addr_indices[lane] = lane_descs[lane].addr_index;
        ucs_assert(lane_descs[lane].usage != 0);

        if (lane_descs[lane].usage & UCP_WIREUP_LANE_USAGE_AM) {
            ucs_assert(key->am_lane == UCP_NULL_LANE);
            key->am_lane = lane;
        }
        if (lane_descs[lane].usage & UCP_WIREUP_LANE_USAGE_RMA) {
            key->rma_lane_map |= UCS_BIT(dst_md_index + lane * UCP_MD_INDEX_BITS);
        }
        if (lane_descs[lane].usage & UCP_WIREUP_LANE_USAGE_AMO) {
            key->amo_lanes[num_amo_lanes] = lane;
            ++num_amo_lanes;
        }
        if (lane_descs[lane].usage & UCP_WIREUP_LANE_USAGE_RNDV) {
            ucs_assert(key->rndv_lane == UCP_NULL_LANE);
            key->rndv_lane = lane;
        }
    }

    /* Sort and add AMO lanes */
    ucs_qsort_r(key->amo_lanes, num_amo_lanes, sizeof(*key->amo_lanes),
                ucp_wireup_compare_lane_amo_score, lane_descs);
    for (lane = 0; lane < UCP_MAX_LANES; ++lane) {
        if (lane < num_amo_lanes) {
            dst_md_index       = lane_descs[key->amo_lanes[lane]].dst_md_index;
            key->amo_lane_map |= UCS_BIT(dst_md_index + lane * UCP_MD_INDEX_BITS);
        } else {
            key->amo_lanes[lane] = UCP_NULL_LANE;
        }
    }

    key->reachable_md_map = ucp_wireup_get_reachable_mds(worker, address_count,
                                                         address_list);
    key->wireup_msg_lane  = ucp_wireup_select_wireup_msg_lane(worker, address_list,
                                                              lane_descs,
                                                              key->num_lanes);

    return UCS_OK;
}
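/*
 * The RMA/AMO lane maps built above encode one MD-index bitmap per lane:
 * bit (md_index + lane * UCP_MD_INDEX_BITS) marks that 'lane' targets remote
 * MD 'md_index'. The sketch below illustrates that encoding under stated
 * assumptions: MD_INDEX_BITS, lane_map_set() and lane_map_get_lane() are
 * hypothetical stand-ins for UCP_MD_INDEX_BITS, UCS_BIT() and
 * ucp_lane_map_get_lane(), not the actual UCX definitions.
 */
#include <assert.h>
#include <stdint.h>

#define MD_INDEX_BITS  8   /* assumed width of one lane's MD bitmap */

typedef uint64_t lane_map_t;

/* Mark that 'lane' targets remote MD 'md_index' */
static lane_map_t lane_map_set(lane_map_t map, unsigned lane, unsigned md_index)
{
    return map | (1ull << (md_index + lane * MD_INDEX_BITS));
}

/* Extract the MD bitmap of a single lane */
static uint64_t lane_map_get_lane(lane_map_t map, unsigned lane)
{
    return (map >> (lane * MD_INDEX_BITS)) & ((1ull << MD_INDEX_BITS) - 1);
}

int main(void)
{
    lane_map_t rma_lane_map = 0;

    rma_lane_map = lane_map_set(rma_lane_map, 0, 2);  /* lane 0 -> MD 2 */
    rma_lane_map = lane_map_set(rma_lane_map, 1, 5);  /* lane 1 -> MD 5 */

    assert(lane_map_get_lane(rma_lane_map, 0) == (1ull << 2));
    assert(lane_map_get_lane(rma_lane_map, 1) == (1ull << 5));
    assert(lane_map_get_lane(rma_lane_map, 2) == 0);
    return 0;
}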