/*
 * Decide whether a caller-supplied fabric name matches the name this provider
 * generates for the given device.
 *
 * version: API version used to generate the reference name; 0 means "unknown".
 * dap:     device attributes the reference name is built from.
 * hint:    name to validate; it is passed straight to strstr()/strcmp().
 *
 * Returns true only when the generated name equals the hint exactly.  A
 * failure to generate the reference name is reported as "no match".
 */
static bool usdf_fabric_checkname(uint32_t version,
		struct usd_device_attrs *dap, const char *hint)
{
	char *expected;
	bool matched;
	uint32_t assumed;

	USDF_DBG("checking devname: version=%d, devname='%s'\n", version, hint);

	if (!version) {
		/*
		 * No version supplied: infer one from the shape of the hint.
		 * Before 1.4 the fabric name was the plain device name; from
		 * 1.4 on it is written in CIDR notation, which always
		 * contains a '/'.
		 */
		assumed = strstr(hint, "/") ? FI_VERSION(1, 4) :
					      FI_VERSION(1, 3);
		return usdf_fabric_checkname(assumed, dap, hint);
	}

	if (usdf_fabric_getname(version, dap, &expected) < 0)
		return false;

	matched = !strcmp(expected, hint);
	if (!matched)
		USDF_DBG("hint %s failed to match %s\n", hint, expected);

	free(expected);
	return matched;
}
/*
 * Build the fabric name for a device.
 *
 * For API versions >= 1.4 the name is "<network address>/<prefix length>",
 * where the network address is the device IP masked with its netmask.  For
 * older versions the name is a copy of the device name.
 *
 * version: API version selecting the name format.
 * dap:     device attributes to build the name from.
 * name:    out parameter; receives a heap-allocated string that the caller
 *          must free(), or NULL when an error is returned.
 *
 * Returns FI_SUCCESS on success or a negative errno-style value on failure.
 */
static int usdf_fabric_getname(uint32_t version, struct usd_device_attrs *dap,
		char **name)
{
	int ret = FI_SUCCESS;
	char *bufp = NULL;
	struct in_addr in;
	char *addrnetw;

	if (FI_VERSION_GE(version, FI_VERSION(1, 4))) {
		in.s_addr = dap->uda_ipaddr_be & dap->uda_netmask_be;
		addrnetw = inet_ntoa(in);

		/*
		 * asprintf() returns the number of bytes written on success,
		 * so its result must not be used as the status code.  On
		 * failure the contents of bufp are undefined, so reset it to
		 * keep a garbage pointer from reaching the caller.
		 */
		if (asprintf(&bufp, "%s/%d", addrnetw,
			     dap->uda_prefixlen) < 0) {
			USDF_DBG(
				"asprintf failed while creating fabric name\n");
			bufp = NULL;
			ret = -ENOMEM;
		}
	} else {
		bufp = strdup(dap->uda_devname);
		if (!bufp) {
			/* Capture errno before logging can overwrite it. */
			ret = -errno;
			USDF_DBG("strdup failed while creating fabric name\n");
		}
	}

	*name = bufp;

	return ret;
}
/*
 * Validate a domain name hint against the name generated for a device.
 *
 * Before 1.4 the domain name was NULL, which makes it hard to tell whether a
 * caller meant to supply a name at all.  The outcome is decided as follows:
 *
 *   version >= 1.4, hint given : valid only if the strings are equal.
 *   version >= 1.4, hint NULL  : valid; the application may want a 1.4
 *                                domain name without naming one explicitly.
 *   version <  1.4, hint given : always invalid.
 *   version <  1.4, hint NULL  : always valid.
 *
 * A version of 0 means "not provided": 1.4 is assumed when a hint is present
 * and 1.3 otherwise.  A failure to generate the reference name is reported
 * as invalid.
 */
bool usdf_domain_checkname(uint32_t version, struct usd_device_attrs *dap,
		const char *hint)
{
	char *expected;
	bool ok;

	USDF_DBG("checking domain name: version=%d, domain name='%s'\n",
		 version, hint);

	if (version == 0) {
		/* Pick the version implied by whether a hint was passed. */
		return usdf_domain_checkname(
			hint ? FI_VERSION(1, 4) : FI_VERSION(1, 3), dap, hint);
	}

	if (usdf_domain_getname(version, dap, &expected) < 0)
		return false;

	if (expected == NULL) {
		/* No generated name means pre-1.4: only a NULL hint is OK. */
		ok = (hint == NULL);
	} else if (hint == NULL) {
		/* 1.4 or later without an explicit hint. */
		ok = true;
	} else {
		/* 1.4 or later with a hint: exact match required. */
		ok = !strcmp(expected, hint);
	}

	if (!ok)
		USDF_DBG("given hint %s does not match %s -- invalid\n", hint,
			 expected);

	free(expected);
	return ok;
}
ssize_t usdf_msg_recvmsg(struct fid_ep *fep, const struct fi_msg *msg, uint64_t flags) { size_t i; struct usdf_ep *ep; struct usdf_rx *rx; struct usdf_msg_qe *rqe; struct usdf_domain *udp; size_t tot_len; const struct iovec *iov; ep = ep_ftou(fep); rx = ep->ep_rx; udp = ep->ep_domain; iov = msg->msg_iov; if (TAILQ_EMPTY(&rx->r.msg.rx_free_rqe)) { return -FI_EAGAIN; } if (flags & ~USDF_MSG_SUPP_RECVMSG_FLAGS) { USDF_DBG("one or more flags in 0x%llx not supported\n", flags); return -FI_EOPNOTSUPP; } pthread_spin_lock(&udp->dom_progress_lock); rqe = TAILQ_FIRST(&rx->r.msg.rx_free_rqe); TAILQ_REMOVE(&rx->r.msg.rx_free_rqe, rqe, ms_link); --rx->r.msg.rx_num_free_rqe; rqe->ms_context = msg->context; tot_len = 0; for (i = 0; i < msg->iov_count; ++i) { rqe->ms_iov[i].iov_base = (void *)iov[i].iov_base; rqe->ms_iov[i].iov_len = iov[i].iov_len; tot_len += iov[i].iov_len; } rqe->ms_last_iov = msg->iov_count - 1; rqe->ms_cur_iov = 0; rqe->ms_resid = tot_len; rqe->ms_length = tot_len; rqe->ms_cur_ptr = iov[0].iov_base; rqe->ms_iov_resid = iov[0].iov_len; rqe->ms_signal_comp = ep->ep_rx_dflt_signal_comp || (flags & FI_COMPLETION) ? 1 : 0; TAILQ_INSERT_TAIL(&rx->r.msg.rx_posted_rqe, rqe, ms_link); pthread_spin_unlock(&udp->dom_progress_lock); return 0; }
/*
 * Produce the domain name for a device.
 *
 * For API versions >= 1.4 the name is a heap-allocated copy of the device
 * name, which the caller must free().  For older versions no name is
 * produced and *name is set to NULL.
 *
 * Returns FI_SUCCESS, or -errno when the copy cannot be allocated (in which
 * case *name is NULL).
 */
int usdf_domain_getname(uint32_t version, struct usd_device_attrs *dap,
		char **name)
{
	char *copy;
	int rc = FI_SUCCESS;

	/* Pre-1.4 domains have no name. */
	if (!(FI_VERSION_GE(version, FI_VERSION(1, 4)))) {
		*name = NULL;
		return rc;
	}

	copy = strdup(dap->uda_devname);
	if (copy == NULL) {
		/* errno is captured before the log call. */
		rc = -errno;
		USDF_DBG("strdup failed while creating domain name\n");
	}

	*name = copy;
	return rc;
}
/*
 * Turn the node/service arguments into a source or destination address.
 *
 * When both node and service are NULL nothing is done.  Otherwise the
 * address is taken either from usdf_format_to_sin() (when hints request
 * FI_ADDR_STR, in which case a service string is rejected) or from
 * getaddrinfo(), whose result list is handed back through ai.  The address
 * is then converted with usdf_sin_to_format() and stored in *src when
 * FI_SOURCE is set in flags, or in *dest otherwise.
 *
 * Returns FI_SUCCESS, -FI_EINVAL for an unusable FI_ADDR_STR request, or the
 * raw non-zero getaddrinfo() result when the lookup fails.
 */
static int usdf_handle_node_and_service(const char *node, const char *service,
		uint64_t flags, void **src, void **dest,
		const struct fi_info *hints, struct addrinfo **ai)
{
	struct sockaddr_in *addr;
	void **target;
	int rc;

	if (node == NULL && service == NULL)
		return FI_SUCCESS;

	if (hints != NULL && hints->addr_format == FI_ADDR_STR) {
		/* FI_ADDR_STR can't have service param. */
		if (service != NULL)
			return -FI_EINVAL;

		addr = usdf_format_to_sin(hints, node);
		/* This could be invalid or no memory. */
		if (addr == NULL)
			return -FI_EINVAL;
	} else {
		rc = getaddrinfo(node, service, NULL, ai);
		if (rc != 0) {
			USDF_DBG("getaddrinfo failed: %d: <%s>\n", rc,
				 gai_strerror(rc));
			return rc;
		}
		addr = (struct sockaddr_in *)(*ai)->ai_addr;
	}

	/* FI_SOURCE selects which of the two outputs is filled in. */
	target = (flags & FI_SOURCE) ? src : dest;
	*target = usdf_sin_to_format(hints, addr, NULL);

	return FI_SUCCESS;
}
static int usdf_fabric_open(struct fi_fabric_attr *fattrp, struct fid_fabric **fabric, void *context) { struct fid_fabric *ff; struct usdf_fabric *fp; struct usdf_usnic_info *dp; struct usdf_dev_entry *dep; struct epoll_event ev; struct sockaddr_in sin; int ret; int d; USDF_TRACE("\n"); /* Make sure this fabric exists */ dp = __usdf_devinfo; for (d = 0; d < dp->uu_num_devs; ++d) { dep = &dp->uu_info[d]; if (dep->ue_dev_ok && strcmp(fattrp->name, dep->ue_dattr.uda_devname) == 0) { break; } } if (d >= dp->uu_num_devs) { USDF_INFO("device \"%s\" does not exit, returning -FI_ENODEV\n", fattrp->name); return -FI_ENODEV; } fp = calloc(1, sizeof(*fp)); if (fp == NULL) { USDF_INFO("unable to allocate memory for fabric\n"); return -FI_ENOMEM; } fp->fab_epollfd = -1; fp->fab_arp_sockfd = -1; LIST_INIT(&fp->fab_domain_list); fp->fab_attr.fabric = fab_utof(fp); fp->fab_attr.name = strdup(fattrp->name); fp->fab_attr.prov_name = strdup(USDF_PROV_NAME); fp->fab_attr.prov_version = USDF_PROV_VERSION; if (fp->fab_attr.name == NULL || fp->fab_attr.prov_name == NULL) { ret = -FI_ENOMEM; goto fail; } fp->fab_fid.fid.fclass = FI_CLASS_FABRIC; fp->fab_fid.fid.context = context; fp->fab_fid.fid.ops = &usdf_fi_ops; fp->fab_fid.ops = &usdf_ops_fabric; fp->fab_dev_attrs = &dep->ue_dattr; fp->fab_epollfd = epoll_create(1024); if (fp->fab_epollfd == -1) { ret = -errno; USDF_INFO("unable to allocate epoll fd\n"); goto fail; } fp->fab_eventfd = eventfd(0, EFD_NONBLOCK | EFD_SEMAPHORE); if (fp->fab_eventfd == -1) { ret = -errno; USDF_INFO("unable to allocate event fd\n"); goto fail; } fp->fab_poll_item.pi_rtn = usdf_fabric_progression_cb; fp->fab_poll_item.pi_context = fp; ev.events = EPOLLIN; ev.data.ptr = &fp->fab_poll_item; ret = epoll_ctl(fp->fab_epollfd, EPOLL_CTL_ADD, fp->fab_eventfd, &ev); if (ret == -1) { ret = -errno; USDF_INFO("unable to EPOLL_CTL_ADD\n"); goto fail; } /* initialize timer subsystem */ ret = usdf_timer_init(fp); if (ret != 0) { USDF_INFO("unable to initialize timer\n"); 
goto fail; } ret = pthread_create(&fp->fab_thread, NULL, usdf_fabric_progression_thread, fp); if (ret != 0) { ret = -ret; USDF_INFO("unable to create progress thread\n"); goto fail; } /* create and bind socket for ARP resolution */ memset(&sin, 0, sizeof(sin)); sin.sin_family = AF_INET; sin.sin_addr.s_addr = fp->fab_dev_attrs->uda_ipaddr_be; fp->fab_arp_sockfd = socket(AF_INET, SOCK_DGRAM, 0); if (fp->fab_arp_sockfd == -1) { USDF_INFO("unable to create socket\n"); goto fail; } ret = bind(fp->fab_arp_sockfd, (struct sockaddr *) &sin, sizeof(sin)); if (ret == -1) { ret = -errno; goto fail; } atomic_initialize(&fp->fab_refcnt, 0); fattrp->fabric = fab_utof(fp); fattrp->prov_version = USDF_PROV_VERSION; *fabric = fab_utof(fp); USDF_INFO("successfully opened %s/%s\n", fattrp->name, fp->fab_dev_attrs->uda_ifname); return 0; fail: ff = fab_utof(fp); usdf_fabric_close(&ff->fid); USDF_DBG("returning %d (%s)\n", ret, fi_strerror(-ret)); return ret; }
/*
 * Provider getinfo entry point: build the list of fi_info structures that
 * match the caller's request.
 *
 * The cached device list is loaded on first use.  An optional node/service
 * pair is resolved with getaddrinfo() and used as the source address when
 * FI_SOURCE is set in flags, otherwise as the destination; addresses in
 * hints are used for whichever of the two is still unset.  Each usable
 * device is then checked for reachability of the destination and against
 * the hints, and info entries are generated for the requested endpoint
 * type(s) (all of DGRAM, MSG and RDM when the type is unspecified).
 *
 * Returns 0 with *info set to the head of the list on success, -FI_ENODATA
 * when nothing matches, -FI_EINVAL when node/service cannot be resolved, or
 * another negative error code.  *info is written only on success.
 */
static int usdf_getinfo(uint32_t version, const char *node,
		const char *service, uint64_t flags, struct fi_info *hints,
		struct fi_info **info)
{
	struct usdf_usnic_info *dp;
	struct usdf_dev_entry *dep;
	struct usd_device_attrs *dap;
	struct fi_info *fi_first;
	struct fi_info *fi_last;
	struct addrinfo *ai;
	struct sockaddr_in *src;
	struct sockaddr_in *dest;
	enum fi_ep_type ep_type;
	int metric;
	int d;
	int ret;

	USDF_TRACE("\n");

	fi_first = NULL;
	fi_last = NULL;
	ai = NULL;
	src = NULL;
	dest = NULL;

	/*
	 * Get and cache usNIC device info
	 */
	if (__usdf_devinfo == NULL) {
		ret = usdf_get_devinfo();
		if (ret != 0) {
			USDF_WARN("failed to usdf_get_devinfo, ret=%d (%s)\n",
				  ret, fi_strerror(-ret));
			if (ret == -FI_ENODEV)
				ret = -FI_ENODATA;
			goto fail;
		}
	}
	dp = __usdf_devinfo;

	if (node != NULL || service != NULL) {
		ret = getaddrinfo(node, service, NULL, &ai);
		if (ret != 0) {
			/*
			 * getaddrinfo() reports failures through its return
			 * value and only sets errno for EAI_SYSTEM.  Using
			 * errno unconditionally could yield 0 here, which
			 * would make this function return success without
			 * setting *info.
			 */
			if (ret == EAI_SYSTEM && errno != 0)
				ret = -errno;
			else
				ret = -FI_EINVAL;
			USDF_DBG("getaddrinfo failed, likely bad node/service specified (%s:%s)\n",
				 node, service);
			goto fail;
		}
		/*
		 * NOTE(review): the result is used as a sockaddr_in without
		 * checking ai_family -- confirm only AF_INET is expected.
		 */
		if (flags & FI_SOURCE) {
			src = (struct sockaddr_in *)ai->ai_addr;
		} else {
			dest = (struct sockaddr_in *)ai->ai_addr;
		}
	}
	/* Addresses from node/service take precedence over the hints. */
	if (hints != NULL) {
		if (dest == NULL && hints->dest_addr != NULL) {
			dest = hints->dest_addr;
		}
		if (src == NULL && hints->src_addr != NULL) {
			src = hints->src_addr;
		}
	}

	for (d = 0; d < dp->uu_num_devs; ++d) {
		dep = &dp->uu_info[d];
		dap = &dep->ue_dattr;

		/* skip this device if it has some problem */
		if (!dep->ue_dev_ok) {
			USDF_DBG("skipping %s/%s\n", dap->uda_devname,
				 dap->uda_ifname);
			continue;
		}

		/* See if dest is reachable from this device */
		if (dest != NULL && dest->sin_addr.s_addr != INADDR_ANY) {
			ret = usdf_get_distance(dap, dest->sin_addr.s_addr,
						&metric);
			if (ret != 0) {
				goto fail;
			}
			if (metric == -1) {
				USDF_DBG("dest %s unreachable from %s/%s, skipping\n",
					 inet_ntoa(dest->sin_addr),
					 dap->uda_devname, dap->uda_ifname);
				continue;
			}
		}

		/* Does this device match requested attributes? */
		if (hints != NULL) {
			ret = usdf_validate_hints(hints, dap);
			if (ret != 0) {
				USDF_DBG("hints do not match for %s/%s, skipping\n",
					 dap->uda_devname, dap->uda_ifname);
				continue;
			}

			ep_type = hints->ep_attr ? hints->ep_attr->type :
						   FI_EP_UNSPEC;
		} else {
			ep_type = FI_EP_UNSPEC;
		}

		/*
		 * -FI_ENODATA from a fill routine is not treated as fatal
		 * here; any other non-zero result aborts the whole request.
		 */
		if (ep_type == FI_EP_DGRAM || ep_type == FI_EP_UNSPEC) {
			ret = usdf_fill_info_dgram(version, hints, src, dest,
						   dap, &fi_first, &fi_last);
			if (ret != 0 && ret != -FI_ENODATA) {
				goto fail;
			}
		}

		if (ep_type == FI_EP_MSG || ep_type == FI_EP_UNSPEC) {
			ret = usdf_fill_info_msg(hints, src, dest, dap,
						 &fi_first, &fi_last);
			if (ret != 0 && ret != -FI_ENODATA) {
				goto fail;
			}
		}

		if (ep_type == FI_EP_RDM || ep_type == FI_EP_UNSPEC) {
			ret = usdf_fill_info_rdm(hints, src, dest, dap,
						 &fi_first, &fi_last);
			if (ret != 0 && ret != -FI_ENODATA) {
				goto fail;
			}
		}
	}

	if (fi_first != NULL) {
		*info = fi_first;
		ret = 0;
	} else {
		ret = -FI_ENODATA;
	}

fail:
	/* Shared exit path: also reached on success with ret == 0. */
	if (ret != 0) {
		fi_freeinfo(fi_first);
	}
	if (ai != NULL) {
		freeaddrinfo(ai);
	}
	if (ret != 0) {
		USDF_INFO("returning %d (%s)\n", ret, fi_strerror(-ret));
	}
	return ret;
}
static int usdf_fill_addr_info(struct fi_info *fi, uint32_t addr_format, struct sockaddr_in *src, struct sockaddr_in *dest, struct usd_device_attrs *dap) { struct sockaddr_in *sin; int ret; #if ENABLE_DEBUG char requested[INET_ADDRSTRLEN], actual[INET_ADDRSTRLEN]; #endif if (addr_format != FI_FORMAT_UNSPEC) { fi->addr_format = addr_format; } else { fi->addr_format = FI_SOCKADDR_IN; } switch (fi->addr_format) { case FI_SOCKADDR: case FI_SOCKADDR_IN: if (src != NULL && src->sin_addr.s_addr != INADDR_ANY && src->sin_addr.s_addr != dap->uda_ipaddr_be) { USDF_DBG("src addr (%s) does not match device addr (%s)\n", inet_ntop(AF_INET, &src->sin_addr.s_addr, requested, sizeof(requested)), inet_ntop(AF_INET, &dap->uda_ipaddr_be, actual, sizeof(actual))); ret = -FI_ENODATA; goto fail; } sin = calloc(1, sizeof(*sin)); fi->src_addr = sin; if (sin == NULL) { ret = -FI_ENOMEM; goto fail; } fi->src_addrlen = sizeof(*sin); sin->sin_family = AF_INET; sin->sin_addr.s_addr = dap->uda_ipaddr_be; if (src != NULL) { sin->sin_port = src->sin_port; } /* copy in dest if specified */ if (dest != NULL) { sin = calloc(1, sizeof(*sin)); *sin = *dest; fi->dest_addr = sin; fi->dest_addrlen = sizeof(*sin); } break; default: ret = -FI_ENODATA; goto fail; } return 0; fail: return ret; // fi_freeinfo() in caller frees all }
int usdf_domain_open(struct fid_fabric *fabric, struct fi_info *info, struct fid_domain **domain, void *context) { struct usdf_fabric *fp; struct usdf_domain *udp; struct sockaddr_in *sin; size_t addrlen; int ret; #if ENABLE_DEBUG char requested[INET_ADDRSTRLEN], actual[INET_ADDRSTRLEN]; #endif USDF_TRACE_SYS(DOMAIN, "\n"); sin = NULL; fp = fab_fidtou(fabric); if (info->domain_attr != NULL) { /* No versioning information available here. */ if (!usdf_domain_checkname(0, fp->fab_dev_attrs, info->domain_attr->name)) { USDF_WARN_SYS(DOMAIN, "domain name mismatch\n"); return -FI_ENODATA; } if (ofi_check_mr_mode(fabric->api_version, OFI_MR_BASIC_MAP | FI_MR_LOCAL, info->domain_attr->mr_mode)) { /* the caller ignored our fi_getinfo results */ USDF_WARN_SYS(DOMAIN, "MR mode (%d) not supported\n", info->domain_attr->mr_mode); return -FI_ENODATA; } } udp = calloc(1, sizeof *udp); if (udp == NULL) { USDF_DBG("unable to alloc mem for domain\n"); ret = -FI_ENOMEM; goto fail; } USDF_DBG("uda_devname=%s\n", fp->fab_dev_attrs->uda_devname); /* * Make sure address format is good and matches this fabric */ switch (info->addr_format) { case FI_SOCKADDR: addrlen = sizeof(struct sockaddr); sin = info->src_addr; break; case FI_SOCKADDR_IN: addrlen = sizeof(struct sockaddr_in); sin = info->src_addr; break; case FI_ADDR_STR: sin = usdf_format_to_sin(info, info->src_addr); goto skip_size_check; default: ret = -FI_EINVAL; goto fail; } if (info->src_addrlen != addrlen) { ret = -FI_EINVAL; goto fail; } skip_size_check: if (sin->sin_family != AF_INET || sin->sin_addr.s_addr != fp->fab_dev_attrs->uda_ipaddr_be) { USDF_DBG_SYS(DOMAIN, "requested src_addr (%s) != fabric addr (%s)\n", inet_ntop(AF_INET, &sin->sin_addr.s_addr, requested, sizeof(requested)), inet_ntop(AF_INET, &fp->fab_dev_attrs->uda_ipaddr_be, actual, sizeof(actual))); ret = -FI_EINVAL; usdf_free_sin_if_needed(info, sin); goto fail; } usdf_free_sin_if_needed(info, sin); ret = usd_open(fp->fab_dev_attrs->uda_devname, &udp->dom_dev); 
if (ret != 0) { goto fail; } udp->dom_fid.fid.fclass = FI_CLASS_DOMAIN; udp->dom_fid.fid.context = context; udp->dom_fid.fid.ops = &usdf_fid_ops; udp->dom_fid.ops = &usdf_domain_ops; udp->dom_fid.mr = &usdf_domain_mr_ops; ret = pthread_spin_init(&udp->dom_progress_lock, PTHREAD_PROCESS_PRIVATE); if (ret != 0) { ret = -ret; goto fail; } TAILQ_INIT(&udp->dom_tx_ready); TAILQ_INIT(&udp->dom_hcq_list); udp->dom_info = fi_dupinfo(info); if (udp->dom_info == NULL) { ret = -FI_ENOMEM; goto fail; } if (udp->dom_info->dest_addr != NULL) { free(udp->dom_info->dest_addr); udp->dom_info->dest_addr = NULL; } ret = usdf_dom_rdc_alloc_data(udp); if (ret != 0) { goto fail; } udp->dom_fabric = fp; LIST_INSERT_HEAD(&fp->fab_domain_list, udp, dom_link); ofi_atomic_initialize32(&udp->dom_refcnt, 0); ofi_atomic_inc32(&fp->fab_refcnt); *domain = &udp->dom_fid; return 0; fail: if (udp != NULL) { if (udp->dom_info != NULL) { fi_freeinfo(udp->dom_info); } if (udp->dom_dev != NULL) { usd_close(udp->dom_dev); } usdf_dom_rdc_free_data(udp); free(udp); } return ret; }
ssize_t usdf_msg_sendmsg(struct fid_ep *fep, const struct fi_msg *msg, uint64_t flags) { size_t i; struct usdf_ep *ep; struct usdf_tx *tx; struct usdf_msg_qe *wqe; struct usdf_domain *udp; size_t tot_len; const struct iovec *iov; ep = ep_ftou(fep); tx = ep->ep_tx; udp = ep->ep_domain; iov = msg->msg_iov; if (flags & ~USDF_MSG_SUPP_SENDMSG_FLAGS) { USDF_DBG("one or more flags in 0x%llx not supported\n", flags); return -FI_EOPNOTSUPP; } /* check for inject overrun before acquiring lock and allocating wqe, * easier to unwind this way */ if (flags & FI_INJECT) { iov = msg->msg_iov; tot_len = 0; for (i = 0; i < msg->iov_count; ++i) { tot_len += iov[i].iov_len; if (tot_len > USDF_MSG_MAX_INJECT_SIZE) { USDF_DBG_SYS(EP_DATA, "max inject len exceeded (%zu)\n", tot_len); return -FI_EINVAL; } } } if (TAILQ_EMPTY(&tx->t.msg.tx_free_wqe)) { return -FI_EAGAIN; } pthread_spin_lock(&udp->dom_progress_lock); wqe = TAILQ_FIRST(&tx->t.msg.tx_free_wqe); TAILQ_REMOVE(&tx->t.msg.tx_free_wqe, wqe, ms_link); wqe->ms_context = msg->context; if (flags & FI_INJECT) { tot_len = 0; for (i = 0; i < msg->iov_count; ++i) { assert(tot_len + iov[i].iov_len <= USDF_MSG_MAX_INJECT_SIZE); memcpy(&wqe->ms_inject_buf[tot_len], iov[i].iov_base, iov[i].iov_len); tot_len += iov[i].iov_len; } wqe->ms_iov[0].iov_base = wqe->ms_inject_buf; wqe->ms_iov[0].iov_len = tot_len; wqe->ms_last_iov = 0; } else { tot_len = 0; for (i = 0; i < msg->iov_count; ++i) { wqe->ms_iov[i].iov_base = (void *)iov[i].iov_base; wqe->ms_iov[i].iov_len = iov[i].iov_len; tot_len += iov[i].iov_len; } wqe->ms_last_iov = msg->iov_count - 1; } wqe->ms_cur_iov = 0; wqe->ms_resid = tot_len; wqe->ms_length = tot_len; wqe->ms_cur_ptr = iov[0].iov_base; wqe->ms_iov_resid = iov[0].iov_len; wqe->ms_signal_comp = ep->ep_tx_dflt_signal_comp || (flags & FI_COMPLETION) ? 
1 : 0; /* add send to EP, and add EP to TX list if not present */ TAILQ_INSERT_TAIL(&ep->e.msg.ep_posted_wqe, wqe, ms_link); usdf_msg_ep_ready(ep); pthread_spin_unlock(&udp->dom_progress_lock); usdf_domain_progress(udp); return 0; }