static inline ssize_t usdf_cq_copy_soft_entry(void *dst, const struct usdf_cq_soft_entry *src, enum fi_cq_format dst_format) { struct fi_cq_entry *ctx_entry; struct fi_cq_msg_entry *msg_entry; struct fi_cq_data_entry *data_entry; switch (dst_format) { case FI_CQ_FORMAT_CONTEXT: ctx_entry = (struct fi_cq_entry *)dst; ctx_entry->op_context = src->cse_context; break; case FI_CQ_FORMAT_MSG: msg_entry = (struct fi_cq_msg_entry *)dst; msg_entry->op_context = src->cse_context; msg_entry->flags = src->cse_flags; msg_entry->len = src->cse_len; break; case FI_CQ_FORMAT_DATA: data_entry = (struct fi_cq_data_entry *)dst; data_entry->op_context = src->cse_context; data_entry->flags = src->cse_flags; data_entry->len = src->cse_len; data_entry->buf = src->cse_buf; data_entry->data = src->cse_data; break; default: USDF_WARN("unexpected CQ format, internal error\n"); return -FI_EOPNOTSUPP; } return FI_SUCCESS; }
/* * poll a soft CQ * This will loop over all the hard CQs within, collecting results. * Since this routine is an inline and is always called with format as * a constant, I am counting on the compiler optimizing away all the switches * on format. */ static inline ssize_t usdf_cq_read_common_soft(struct fid_cq *fcq, void *buf, size_t count, enum fi_cq_format format) { struct usdf_cq *cq; uint8_t *entry; uint8_t *last; void *tail; size_t entry_len; cq = cq_ftou(fcq); if (cq->cq_comp.uc_status != 0) { return -FI_EAVAIL; } /* progress... */ usdf_domain_progress(cq->cq_domain); switch (format) { case FI_CQ_FORMAT_CONTEXT: entry_len = sizeof(struct fi_cq_entry); break; case FI_CQ_FORMAT_MSG: entry_len = sizeof(struct fi_cq_msg_entry); break; case FI_CQ_FORMAT_DATA: entry_len = sizeof(struct fi_cq_data_entry); break; default: USDF_WARN("unexpected CQ format, internal error\n"); return -FI_EOPNOTSUPP; } entry = buf; last = entry + (entry_len * count); tail = cq->c.soft.cq_tail; // XXX ... handle error comps while (entry < last && tail != cq->c.soft.cq_head) { memcpy(entry, tail, entry_len); entry += entry_len; tail = (uint8_t *)tail + entry_len; if (tail == cq->c.soft.cq_end) { tail = cq->c.soft.cq_comps; } } cq->c.soft.cq_tail = tail; if (entry > (uint8_t *)buf) { return (entry - (uint8_t *)buf) / entry_len; } else { return -FI_EAGAIN; } }
/*
 * Validate that "addr" really is an IPv4 socket address.
 *
 * "addr_format" is the FI_ address-format constant the caller claims for
 * the address; only FI_SOCKADDR_IN and FI_SOCKADDR are accepted.  The
 * address must additionally be exactly sizeof(struct sockaddr_in) bytes
 * long and carry sa_family == AF_INET.
 *
 * Returns true when all checks pass, in which case "addr" may be cast to
 * "struct sockaddr_in *" or "struct sockaddr *".  Returns false (after
 * logging a warning) otherwise.
 *
 * "addr" must not be NULL (asserted).
 */
bool
usdf_cm_addr_is_valid_sin(void *addr, size_t addrlen, uint32_t addr_format)
{
	const struct sockaddr *sa = addr;

	assert(addr != NULL);

	/* reject any claimed format other than the two sockaddr flavours */
	if (addr_format != FI_SOCKADDR_IN && addr_format != FI_SOCKADDR) {
		USDF_WARN("unknown/unsupported addr_format\n");
		return false;
	}

	if (addrlen != sizeof(struct sockaddr_in)) {
		USDF_WARN("addrlen is incorrect\n");
		return false;
	}

	if (sa->sa_family != AF_INET) {
		USDF_WARN("unknown/unsupported addr_format\n");
		return false;
	}

	return true;
}
/*
 * Inject a small message on a MSG endpoint.
 *
 * The payload is copied into the work-queue entry's private inject buffer
 * and the entry is queued on the endpoint's posted list; no completion is
 * ever signalled for it.
 *
 * fep       - endpoint to send on
 * buf       - payload to send; "len" bytes are copied out of it
 * len       - payload length, at most USDF_MSG_MAX_INJECT_SIZE
 * dest_addr - not used by this function
 *
 * Returns 0 on success, -EINVAL if len exceeds the inject limit, or
 * -FI_EAGAIN if no free work-queue entry is available.
 */
ssize_t
usdf_msg_inject(struct fid_ep *fep, const void *buf, size_t len,
		fi_addr_t dest_addr)
{
	struct usdf_ep *ep;
	struct usdf_tx *tx;
	struct usdf_msg_qe *wqe;
	struct usdf_domain *udp;

	if (len > USDF_MSG_MAX_INJECT_SIZE) {
		USDF_WARN("cannot inject more than inject_size bytes\n");
		return -EINVAL;
	}

	ep = ep_ftou(fep);
	tx = ep->ep_tx;
	udp = ep->ep_domain;

	/*
	 * NOTE(review): this emptiness check runs before the progress lock is
	 * taken; presumably only one thread posts to a given TX at a time --
	 * confirm against the provider's threading model.
	 */
	if (TAILQ_EMPTY(&tx->t.msg.tx_free_wqe)) {
		return -FI_EAGAIN;
	}

	pthread_spin_lock(&udp->dom_progress_lock);

	wqe = TAILQ_FIRST(&tx->t.msg.tx_free_wqe);
	TAILQ_REMOVE(&tx->t.msg.tx_free_wqe, wqe, ms_link);

	wqe->ms_context = NULL;

	/* take a private copy so the caller may reuse buf once we return */
	memcpy(wqe->ms_inject_buf, buf, len);
	wqe->ms_iov[0].iov_base = wqe->ms_inject_buf;
	wqe->ms_iov[0].iov_len = len;
	wqe->ms_last_iov = 0;

	wqe->ms_cur_iov = 0;
	/*
	 * The send cursor must walk the private copy, not the caller's
	 * buffer: the caller's memory is not guaranteed to stay intact after
	 * this function returns.
	 */
	wqe->ms_cur_ptr = wqe->ms_inject_buf;
	wqe->ms_iov_resid = len;
	wqe->ms_resid = len;
	wqe->ms_length = len;

	/* fi_inject() never signals a completion */
	wqe->ms_signal_comp = 0;

	/* add send to EP, and add EP to TX list if not present */
	TAILQ_INSERT_TAIL(&ep->e.msg.ep_posted_wqe, wqe, ms_link);
	usdf_msg_ep_ready(ep);

	pthread_spin_unlock(&udp->dom_progress_lock);

	usdf_domain_progress(udp);

	return 0;
}
static inline ssize_t usdf_cq_copy_cq_entry(void *dst, struct usd_completion *src, enum fi_cq_format format) { struct fi_cq_entry *ctx_entry; struct fi_cq_msg_entry *msg_entry; struct fi_cq_data_entry *data_entry; switch (format) { case FI_CQ_FORMAT_CONTEXT: ctx_entry = (struct fi_cq_entry *)dst; ctx_entry->op_context = src->uc_context; break; case FI_CQ_FORMAT_MSG: msg_entry = (struct fi_cq_msg_entry *)dst; msg_entry->op_context = src->uc_context; msg_entry->flags = usdf_cqe_to_flags(src); msg_entry->len = src->uc_bytes; usdf_cq_adjust_len(src, &msg_entry->len); break; case FI_CQ_FORMAT_DATA: data_entry = (struct fi_cq_data_entry *)dst; data_entry->op_context = src->uc_context; data_entry->flags = usdf_cqe_to_flags(src); data_entry->len = src->uc_bytes; data_entry->buf = 0; /* XXX */ data_entry->data = 0; usdf_cq_adjust_len(src, &data_entry->len); break; default: USDF_WARN("unexpected CQ format, internal error\n"); return -FI_EOPNOTSUPP; } return FI_SUCCESS; }
/* * poll a soft CQ * This will loop over all the hard CQs within, collecting results. * Since this routine is an inline and is always called with format as * a constant, I am counting on the compiler optimizing away all the switches * on format. */ static inline ssize_t usdf_cq_read_common_soft(struct fid_cq *fcq, void *buf, size_t count, enum fi_cq_format format) { struct usdf_cq *cq; uint8_t *entry; uint8_t *last; struct usdf_cq_soft_entry *tail; size_t entry_len; ssize_t ret; cq = cq_ftou(fcq); if (cq->cq_comp.uc_status != 0) { return -FI_EAVAIL; } /* progress... */ usdf_domain_progress(cq->cq_domain); switch (format) { case FI_CQ_FORMAT_CONTEXT: entry_len = sizeof(struct fi_cq_entry); break; case FI_CQ_FORMAT_MSG: entry_len = sizeof(struct fi_cq_msg_entry); break; case FI_CQ_FORMAT_DATA: entry_len = sizeof(struct fi_cq_data_entry); break; default: USDF_WARN("unexpected CQ format, internal error\n"); return -FI_EOPNOTSUPP; } entry = buf; last = entry + (entry_len * count); tail = cq->c.soft.cq_tail; while (entry < last) { /* If the head and tail are equal and the last * operation was a read then that means we have an * empty queue. */ if ((tail == cq->c.soft.cq_head) && (cq->c.soft.cq_last_op == USDF_SOFT_CQ_READ)) break; if (tail->cse_prov_errno > 0) { if (entry > (uint8_t *) buf) break; else return -FI_EAVAIL; } ret = usdf_cq_copy_soft_entry(entry, tail, format); if (ret < 0) { return ret; } entry += entry_len; tail++; if (tail == cq->c.soft.cq_end) { tail = cq->c.soft.cq_comps; } cq->c.soft.cq_last_op = USDF_SOFT_CQ_READ; } cq->c.soft.cq_tail = tail; if (entry > (uint8_t *)buf) { return (entry - (uint8_t *)buf) / entry_len; } else { return -FI_EAGAIN; } }
static ssize_t usdf_cq_sread_common_soft(struct fid_cq *fcq, void *buf, size_t count, const void *cond, int timeout_ms, enum fi_cq_format format) { struct usdf_cq *cq; uint8_t *entry; uint8_t *last; struct usdf_cq_soft_entry *tail; size_t entry_len; size_t sleep_time_us; size_t time_spent_us = 0; ssize_t ret; cq = cq_ftou(fcq); if (cq->cq_attr.wait_obj == FI_WAIT_NONE) return -FI_EOPNOTSUPP; sleep_time_us = SREAD_INIT_SLEEP_TIME_US; switch (format) { case FI_CQ_FORMAT_CONTEXT: entry_len = sizeof(struct fi_cq_entry); break; case FI_CQ_FORMAT_MSG: entry_len = sizeof(struct fi_cq_msg_entry); break; case FI_CQ_FORMAT_DATA: entry_len = sizeof(struct fi_cq_data_entry); break; default: USDF_WARN("unexpected CQ format, internal error\n"); return -FI_EOPNOTSUPP; } entry = buf; last = entry + (entry_len * count); while (1) { /* progress... */ usdf_domain_progress(cq->cq_domain); tail = cq->c.soft.cq_tail; while (entry < last) { /* If the head and tail are equal and the last * operation was a read then that means we have an * empty queue. */ if ((tail == cq->c.soft.cq_head) && (cq->c.soft.cq_last_op == USDF_SOFT_CQ_READ)) break; if (tail->cse_prov_errno > 0) { if (entry > (uint8_t *)buf) break; else return -FI_EAVAIL; } ret = usdf_cq_copy_soft_entry(entry, tail, format); if (ret < 0) return ret; entry += entry_len; tail++; if (tail == cq->c.soft.cq_end) tail = cq->c.soft.cq_comps; cq->c.soft.cq_last_op = USDF_SOFT_CQ_READ; } if (entry > (uint8_t *)buf) { cq->c.soft.cq_tail = tail; return (entry - (uint8_t *)buf) / entry_len; } else { if (timeout_ms >= 0 && (time_spent_us >= 1000 * timeout_ms)) break; usleep(sleep_time_us); time_spent_us += sleep_time_us; /* exponentially back off up to a limit */ if (sleep_time_us < SREAD_MAX_SLEEP_TIME_US) sleep_time_us *= SREAD_EXP_BASE; sleep_time_us = MIN(sleep_time_us, SREAD_MAX_SLEEP_TIME_US); } } return -FI_EAGAIN; }
/*
 * Provider getinfo entry point: build the list of fi_info structures
 * describing every usable usNIC device / endpoint-type combination.
 *
 * version - API version, forwarded to usdf_fill_info_dgram()
 * node    - optional node name, resolved with getaddrinfo()
 * service - optional service name, resolved with getaddrinfo()
 * flags   - if FI_SOURCE is set the resolved address is the source,
 *           otherwise it is the destination
 * hints   - optional; src_addr/dest_addr fill in whichever address was not
 *           resolved from node/service, and ep_attr->type selects which
 *           endpoint types are reported
 * info    - on success, set to the head of the resulting list
 *
 * Devices that are flagged not-ok, that cannot reach the destination, or
 * that fail usdf_validate_hints() are skipped.
 *
 * Returns 0 on success, -FI_ENODATA if nothing matched, the device
 * information is unavailable, or node/service could not be resolved to an
 * IPv4 address, or another negative error code from a helper.  *info is
 * written only on success.
 */
static int
usdf_getinfo(uint32_t version, const char *node, const char *service,
		uint64_t flags, struct fi_info *hints, struct fi_info **info)
{
	struct usdf_usnic_info *dp;
	struct usdf_dev_entry *dep;
	struct usd_device_attrs *dap;
	struct fi_info *fi_first;
	struct fi_info *fi_last;
	struct addrinfo *ai;
	struct addrinfo ai_hints = { .ai_family = AF_INET };
	struct sockaddr_in *src;
	struct sockaddr_in *dest;
	enum fi_ep_type ep_type;
	int metric;
	int d;
	int ret;

	USDF_TRACE("\n");

	fi_first = NULL;
	fi_last = NULL;
	ai = NULL;
	src = NULL;
	dest = NULL;

	/*
	 * Get and cache usNIC device info
	 */
	if (__usdf_devinfo == NULL) {
		ret = usdf_get_devinfo();
		if (ret != 0) {
			USDF_WARN("failed to usdf_get_devinfo, ret=%d (%s)\n",
					ret, fi_strerror(-ret));
			if (ret == -FI_ENODEV)
				ret = -FI_ENODATA;
			goto fail;
		}
	}
	dp = __usdf_devinfo;

	if (node != NULL || service != NULL) {
		/*
		 * The result is used as a sockaddr_in below, so restrict the
		 * lookup to IPv4; with no hints getaddrinfo() may return an
		 * IPv6 address first.
		 */
		ret = getaddrinfo(node, service, &ai_hints, &ai);
		if (ret != 0) {
			/* either argument may be NULL; never hand NULL to %s */
			USDF_DBG("getaddrinfo failed, likely bad node/service specified (%s:%s)\n",
				node != NULL ? node : "(null)",
				service != NULL ? service : "(null)");
			/*
			 * getaddrinfo() reports failure through its return
			 * value, not errno.  Using -errno here could yield 0
			 * and make this function report success without
			 * setting *info.
			 */
			ret = -FI_ENODATA;
			goto fail;
		}
		if (flags & FI_SOURCE) {
			src = (struct sockaddr_in *)ai->ai_addr;
		} else {
			dest = (struct sockaddr_in *)ai->ai_addr;
		}
	}

	/* addresses from hints only fill in what node/service did not */
	if (hints != NULL) {
		if (dest == NULL && hints->dest_addr != NULL) {
			dest = hints->dest_addr;
		}
		if (src == NULL && hints->src_addr != NULL) {
			src = hints->src_addr;
		}
	}

	for (d = 0; d < dp->uu_num_devs; ++d) {
		dep = &dp->uu_info[d];
		dap = &dep->ue_dattr;

		/* skip this device if it has some problem */
		if (!dep->ue_dev_ok) {
			USDF_DBG("skipping %s/%s\n", dap->uda_devname,
					dap->uda_ifname);
			continue;
		}

		/* See if dest is reachable from this device */
		if (dest != NULL && dest->sin_addr.s_addr != INADDR_ANY) {
			ret = usdf_get_distance(dap,
					dest->sin_addr.s_addr, &metric);
			if (ret != 0) {
				goto fail;
			}
			if (metric == -1) {
				USDF_DBG("dest %s unreachable from %s/%s, skipping\n",
					inet_ntoa(dest->sin_addr),
					dap->uda_devname, dap->uda_ifname);
				continue;
			}
		}

		/* Does this device match requested attributes? */
		if (hints != NULL) {
			ret = usdf_validate_hints(hints, dap);
			if (ret != 0) {
				USDF_DBG("hints do not match for %s/%s, skipping\n",
					dap->uda_devname, dap->uda_ifname);
				continue;
			}

			ep_type = hints->ep_attr ? hints->ep_attr->type :
					FI_EP_UNSPEC;
		} else {
			ep_type = FI_EP_UNSPEC;
		}

		/*
		 * -FI_ENODATA from a fill routine is tolerated and does not
		 * abort the scan; any other error does.
		 */
		if (ep_type == FI_EP_DGRAM || ep_type == FI_EP_UNSPEC) {
			ret = usdf_fill_info_dgram(version, hints, src, dest,
					dap, &fi_first, &fi_last);
			if (ret != 0 && ret != -FI_ENODATA) {
				goto fail;
			}
		}

		if (ep_type == FI_EP_MSG || ep_type == FI_EP_UNSPEC) {
			ret = usdf_fill_info_msg(hints, src, dest, dap,
					&fi_first, &fi_last);
			if (ret != 0 && ret != -FI_ENODATA) {
				goto fail;
			}
		}

		if (ep_type == FI_EP_RDM || ep_type == FI_EP_UNSPEC) {
			ret = usdf_fill_info_rdm(hints, src, dest, dap,
					&fi_first, &fi_last);
			if (ret != 0 && ret != -FI_ENODATA) {
				goto fail;
			}
		}
	}

	if (fi_first != NULL) {
		*info = fi_first;
		ret = 0;
	} else {
		ret = -FI_ENODATA;
	}

fail:
	if (ret != 0) {
		fi_freeinfo(fi_first);
	}
	if (ai != NULL) {
		freeaddrinfo(ai);
	}
	if (ret != 0) {
		USDF_INFO("returning %d (%s)\n", ret, fi_strerror(-ret));
	}
	return ret;
}
static int usdf_getinfo(uint32_t version, const char *node, const char *service, uint64_t flags, const struct fi_info *hints, struct fi_info **info) { struct usdf_usnic_info *dp; struct usdf_dev_entry *dep; struct usd_device_attrs *dap; struct fi_info *fi_first; struct fi_info *fi_last; struct addrinfo *ai; void *src; void *dest; enum fi_ep_type ep_type; int d; int ret; USDF_TRACE("\n"); fi_first = NULL; fi_last = NULL; ai = NULL; src = NULL; dest = NULL; /* * Get and cache usNIC device info */ if (__usdf_devinfo == NULL) { ret = usdf_get_devinfo(); if (ret != 0) { USDF_WARN("failed to usdf_get_devinfo, ret=%d (%s)\n", ret, fi_strerror(-ret)); if (ret == -FI_ENODEV) ret = -FI_ENODATA; goto fail; } } dp = __usdf_devinfo; /* Check the hints up front and fail if they're invalid. */ if (hints) { ret = usdf_validate_hints(version, hints); if (ret) { USDF_WARN_SYS(FABRIC, "hints failed to validate\n"); goto fail; } } /* Get the src and dest if user specified. */ ret = usdf_handle_node_and_service(node, service, flags, &src, &dest, hints, &ai); if (ret) { USDF_WARN_SYS(FABRIC, "failed to handle node and service.\n"); goto fail; } if (hints != NULL) { if (dest == NULL && hints->dest_addr != NULL) dest = hints->dest_addr; if (src == NULL && hints->src_addr != NULL) src = hints->src_addr; } for (d = 0; d < dp->uu_num_devs; ++d) { dep = &dp->uu_info[d]; dap = &dep->ue_dattr; /* If the device has an issue or the hints don't match the * device information, then skip. 
*/ if (!usdf_check_device(version, hints, src, dest, dep)) continue; if (hints && hints->ep_attr) ep_type = hints->ep_attr->type; else ep_type = FI_EP_UNSPEC; if (ep_type == FI_EP_DGRAM || ep_type == FI_EP_UNSPEC) { ret = usdf_fill_info_dgram(version, hints, src, dest, dap, &fi_first, &fi_last); if (ret != 0 && ret != -FI_ENODATA) { goto fail; } } if (ep_type == FI_EP_MSG || ep_type == FI_EP_UNSPEC) { ret = usdf_fill_info_msg(version, hints, src, dest, dap, &fi_first, &fi_last); if (ret != 0 && ret != -FI_ENODATA) { goto fail; } } if (ep_type == FI_EP_RDM || ep_type == FI_EP_UNSPEC) { ret = usdf_fill_info_rdm(version, hints, src, dest, dap, &fi_first, &fi_last); if (ret != 0 && ret != -FI_ENODATA) { goto fail; } } } if (fi_first != NULL) { *info = fi_first; ret = 0; } else { ret = -FI_ENODATA; } fail: if (ai) freeaddrinfo(ai); if (ret != 0) { fi_freeinfo(fi_first); USDF_INFO("returning %d (%s)\n", ret, fi_strerror(-ret)); } return ret; }