/*
 * poll a soft CQ
 * This will loop over all the hard CQs within, collecting results.
 * Since this routine is an inline and is always called with format as
 * a constant, I am counting on the compiler optimizing away all the switches
 * on format.
 */
static inline ssize_t
usdf_cq_read_common_soft(struct fid_cq *fcq, void *buf, size_t count,
		enum fi_cq_format format)
{
	struct usdf_cq *cq;
	uint8_t *entry;
	uint8_t *last;
	void *tail;
	size_t entry_len;

	cq = cq_ftou(fcq);
	if (cq->cq_comp.uc_status != 0) {
		return -FI_EAVAIL;
	}

	/* progress... */
	usdf_domain_progress(cq->cq_domain);

	switch (format) {
	case FI_CQ_FORMAT_CONTEXT:
		entry_len = sizeof(struct fi_cq_entry);
		break;
	case FI_CQ_FORMAT_MSG:
		entry_len = sizeof(struct fi_cq_msg_entry);
		break;
	case FI_CQ_FORMAT_DATA:
		entry_len = sizeof(struct fi_cq_data_entry);
		break;
	default:
		USDF_WARN("unexpected CQ format, internal error\n");
		return -FI_EOPNOTSUPP;
	}

	entry = buf;
	last = entry + (entry_len * count);
	tail = cq->c.soft.cq_tail;

	// XXX ... handle error comps
	while (entry < last && tail != cq->c.soft.cq_head) {
		memcpy(entry, tail, entry_len);
		entry += entry_len;

		tail = (uint8_t *)tail + entry_len;
		if (tail == cq->c.soft.cq_end) {
			tail = cq->c.soft.cq_comps;
		}
	}
	cq->c.soft.cq_tail = tail;

	if (entry > (uint8_t *)buf) {
		return (entry - (uint8_t *)buf) / entry_len;
	} else {
		return -FI_EAGAIN;
	}
}
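/*
 * The header comment above relies on thin per-format wrappers, each passing
 * a compile-time-constant format so the inlined switch collapses to a single
 * arm.  A minimal sketch of what those wrappers look like follows; the
 * wrapper names here are assumptions for illustration, not necessarily the
 * exact ones used elsewhere in the provider.
 */
static ssize_t
usdf_cq_read_context_soft(struct fid_cq *fcq, void *buf, size_t count)
{
	return usdf_cq_read_common_soft(fcq, buf, count, FI_CQ_FORMAT_CONTEXT);
}

static ssize_t
usdf_cq_read_msg_soft(struct fid_cq *fcq, void *buf, size_t count)
{
	return usdf_cq_read_common_soft(fcq, buf, count, FI_CQ_FORMAT_MSG);
}

static ssize_t
usdf_cq_read_data_soft(struct fid_cq *fcq, void *buf, size_t count)
{
	return usdf_cq_read_common_soft(fcq, buf, count, FI_CQ_FORMAT_DATA);
}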
ssize_t
usdf_msg_sendv(struct fid_ep *fep, const struct iovec *iov, void **desc,
		size_t count, fi_addr_t dest_addr, void *context)
{
	size_t i;
	struct usdf_ep *ep;
	struct usdf_tx *tx;
	struct usdf_msg_qe *wqe;
	struct usdf_domain *udp;
	size_t tot_len;
	uint64_t op_flags;

	ep = ep_ftou(fep);
	tx = ep->ep_tx;
	udp = ep->ep_domain;

	if (TAILQ_EMPTY(&tx->t.msg.tx_free_wqe)) {
		return -FI_EAGAIN;
	}

	pthread_spin_lock(&udp->dom_progress_lock);

	wqe = TAILQ_FIRST(&tx->t.msg.tx_free_wqe);
	TAILQ_REMOVE(&tx->t.msg.tx_free_wqe, wqe, ms_link);

	wqe->ms_context = context;
	tot_len = 0;
	for (i = 0; i < count; ++i) {
		wqe->ms_iov[i].iov_base = (void *)iov[i].iov_base;
		wqe->ms_iov[i].iov_len = iov[i].iov_len;
		tot_len += iov[i].iov_len;
	}
	wqe->ms_last_iov = count - 1;

	wqe->ms_cur_iov = 0;
	wqe->ms_cur_ptr = iov[0].iov_base;
	wqe->ms_iov_resid = iov[0].iov_len;
	wqe->ms_resid = tot_len;
	wqe->ms_length = tot_len;

	op_flags = ep->ep_tx->tx_attr.op_flags;
	wqe->ms_signal_comp = ep->ep_tx_dflt_signal_comp ||
		(op_flags & FI_COMPLETION) ? 1 : 0;

	/* add send to EP, and add EP to TX list if not present */
	TAILQ_INSERT_TAIL(&ep->e.msg.ep_posted_wqe, wqe, ms_link);
	usdf_msg_ep_ready(ep);

	pthread_spin_unlock(&udp->dom_progress_lock);

	usdf_domain_progress(udp);

	return 0;
}
ssize_t
usdf_msg_inject(struct fid_ep *fep, const void *buf, size_t len,
		fi_addr_t dest_addr)
{
	struct usdf_ep *ep;
	struct usdf_tx *tx;
	struct usdf_msg_qe *wqe;
	struct usdf_domain *udp;

	if (len > USDF_MSG_MAX_INJECT_SIZE) {
		USDF_WARN_SYS(EP_DATA,
				"cannot inject more than inject_size bytes\n");
		return -FI_EINVAL;
	}

	ep = ep_ftou(fep);
	tx = ep->ep_tx;
	udp = ep->ep_domain;

	if (TAILQ_EMPTY(&tx->t.msg.tx_free_wqe)) {
		return -FI_EAGAIN;
	}

	pthread_spin_lock(&udp->dom_progress_lock);

	wqe = usdf_msg_get_tx_wqe(tx);

	wqe->ms_context = NULL;
	memcpy(wqe->ms_inject_buf, buf, len);
	wqe->ms_iov[0].iov_base = wqe->ms_inject_buf;
	wqe->ms_iov[0].iov_len = len;
	wqe->ms_last_iov = 0;

	wqe->ms_cur_iov = 0;
	/* point at the private inject copy, not the caller's buffer, so the
	 * caller may safely reuse buf as soon as this call returns */
	wqe->ms_cur_ptr = wqe->ms_inject_buf;
	wqe->ms_iov_resid = len;
	wqe->ms_resid = len;
	wqe->ms_length = len;

	/* fi_inject() never signals a completion */
	wqe->ms_signal_comp = 0;

	/* add send to EP, and add EP to TX list if not present */
	TAILQ_INSERT_TAIL(&ep->e.msg.ep_posted_wqe, wqe, ms_link);
	usdf_msg_ep_ready(ep);

	pthread_spin_unlock(&udp->dom_progress_lock);

	usdf_domain_progress(udp);

	return 0;
}
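/*
 * Caller-side sketch (illustrative, not from this source file): because the
 * data is copied into ms_inject_buf above, the application may reuse its
 * buffer the moment fi_inject() returns, and no CQ entry will ever be
 * generated for the operation.  The txcq parameter here is an assumption
 * about how the application bound its endpoint.
 */
static void
example_inject(struct fid_ep *ep, struct fid_cq *txcq, fi_addr_t peer)
{
	char msg[32] = "ping";
	struct fi_cq_entry comp;

	/* -FI_EAGAIN means no free wqe; reading the CQ drives provider
	 * progress, which eventually frees one */
	while (fi_inject(ep, msg, sizeof(msg), peer) == -FI_EAGAIN)
		(void) fi_cq_read(txcq, &comp, 1);

	/* msg may be overwritten immediately; no completion to reap */
}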
ssize_t
usdf_msg_send(struct fid_ep *fep, const void *buf, size_t len, void *desc,
		fi_addr_t dest_addr, void *context)
{
	struct usdf_ep *ep;
	struct usdf_tx *tx;
	struct usdf_msg_qe *wqe;
	struct usdf_domain *udp;
	uint64_t op_flags;

	ep = ep_ftou(fep);
	tx = ep->ep_tx;
	udp = ep->ep_domain;

	if (TAILQ_EMPTY(&tx->t.msg.tx_free_wqe)) {
		return -FI_EAGAIN;
	}

	pthread_spin_lock(&udp->dom_progress_lock);

	wqe = usdf_msg_get_tx_wqe(tx);

	wqe->ms_context = context;
	wqe->ms_iov[0].iov_base = (void *)buf;
	wqe->ms_iov[0].iov_len = len;
	wqe->ms_last_iov = 0;

	wqe->ms_cur_iov = 0;
	wqe->ms_cur_ptr = buf;
	wqe->ms_iov_resid = len;
	wqe->ms_resid = len;
	wqe->ms_length = len;

	op_flags = ep->ep_tx->tx_attr.op_flags;
	wqe->ms_signal_comp = ep->ep_tx_dflt_signal_comp ||
		(op_flags & FI_COMPLETION) ? 1 : 0;

	/* add send to EP, and add EP to TX list if not present */
	TAILQ_INSERT_TAIL(&ep->e.msg.ep_posted_wqe, wqe, ms_link);
	usdf_msg_ep_ready(ep);

	pthread_spin_unlock(&udp->dom_progress_lock);

	usdf_domain_progress(udp);

	return 0;
}
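/*
 * Illustrative sketch (not from this source): ms_signal_comp implements the
 * libfabric selective-completion contract.  If the application binds its TX
 * CQ without FI_SELECTIVE_COMPLETION, ep_tx_dflt_signal_comp makes every
 * send visible; with the bind below, only operations posted with
 * FI_COMPLETION (e.g. via fi_sendmsg()) generate CQ entries.
 */
static int
example_bind_selective(struct fid_ep *ep, struct fid_cq *cq)
{
	/* after this bind, a plain fi_send() produces no CQ entry */
	return fi_ep_bind(ep, &cq->fid, FI_TRANSMIT | FI_SELECTIVE_COMPLETION);
}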
ssize_t
usdf_msg_send(struct fid_ep *fep, const void *buf, size_t len, void *desc,
		fi_addr_t dest_addr, void *context)
{
	struct usdf_ep *ep;
	struct usdf_tx *tx;
	struct usdf_msg_qe *wqe;
	struct usdf_domain *udp;

	ep = ep_ftou(fep);
	tx = ep->ep_tx;
	udp = ep->ep_domain;

	if (TAILQ_EMPTY(&tx->t.msg.tx_free_wqe)) {
		return -FI_EAGAIN;
	}

	pthread_spin_lock(&udp->dom_progress_lock);

	wqe = TAILQ_FIRST(&tx->t.msg.tx_free_wqe);
	TAILQ_REMOVE(&tx->t.msg.tx_free_wqe, wqe, ms_link);

	wqe->ms_context = context;
	wqe->ms_iov[0].iov_base = (void *)buf;
	wqe->ms_iov[0].iov_len = len;
	wqe->ms_last_iov = 0;

	wqe->ms_cur_iov = 0;
	wqe->ms_cur_ptr = buf;
	wqe->ms_iov_resid = len;
	wqe->ms_resid = len;
	wqe->ms_length = len;

	/* add send to EP, and add EP to TX list if not present */
	TAILQ_INSERT_TAIL(&ep->e.msg.ep_posted_wqe, wqe, ms_link);
	usdf_msg_ep_ready(ep);

	pthread_spin_unlock(&udp->dom_progress_lock);

	usdf_domain_progress(udp);

	return 0;
}
ssize_t
usdf_msg_sendmsg(struct fid_ep *fep, const struct fi_msg *msg, uint64_t flags)
{
	size_t i;
	struct usdf_ep *ep;
	struct usdf_tx *tx;
	struct usdf_msg_qe *wqe;
	struct usdf_domain *udp;
	size_t tot_len;
	const struct iovec *iov;

	ep = ep_ftou(fep);
	tx = ep->ep_tx;
	udp = ep->ep_domain;
	iov = msg->msg_iov;

	if (flags & ~USDF_MSG_SUPP_SENDMSG_FLAGS) {
		USDF_DBG_SYS(EP_DATA,
				"one or more flags in %#" PRIx64 " not supported\n",
				flags);
		return -FI_EOPNOTSUPP;
	}

	/* check for inject overrun before acquiring lock and allocating wqe,
	 * easier to unwind this way */
	if (flags & FI_INJECT) {
		iov = msg->msg_iov;
		tot_len = 0;
		for (i = 0; i < msg->iov_count; ++i) {
			tot_len += iov[i].iov_len;
			if (tot_len > USDF_MSG_MAX_INJECT_SIZE) {
				USDF_DBG_SYS(EP_DATA,
						"max inject len exceeded (%zu)\n",
						tot_len);
				return -FI_EINVAL;
			}
		}
	}

	if (TAILQ_EMPTY(&tx->t.msg.tx_free_wqe)) {
		return -FI_EAGAIN;
	}

	pthread_spin_lock(&udp->dom_progress_lock);

	wqe = usdf_msg_get_tx_wqe(tx);
	wqe->ms_context = msg->context;

	if (flags & FI_INJECT) {
		tot_len = 0;
		for (i = 0; i < msg->iov_count; ++i) {
			assert(tot_len + iov[i].iov_len <=
					USDF_MSG_MAX_INJECT_SIZE);
			memcpy(&wqe->ms_inject_buf[tot_len], iov[i].iov_base,
					iov[i].iov_len);
			tot_len += iov[i].iov_len;
		}
		wqe->ms_iov[0].iov_base = wqe->ms_inject_buf;
		wqe->ms_iov[0].iov_len = tot_len;
		wqe->ms_last_iov = 0;
	} else {
		tot_len = 0;
		for (i = 0; i < msg->iov_count; ++i) {
			wqe->ms_iov[i].iov_base = (void *)iov[i].iov_base;
			wqe->ms_iov[i].iov_len = iov[i].iov_len;
			tot_len += iov[i].iov_len;
		}
		wqe->ms_last_iov = msg->iov_count - 1;
	}

	wqe->ms_cur_iov = 0;
	wqe->ms_resid = tot_len;
	wqe->ms_length = tot_len;
	/* use the wqe's own iov here: in the FI_INJECT case it points at the
	 * private inject copy rather than at the caller's iovec */
	wqe->ms_cur_ptr = wqe->ms_iov[0].iov_base;
	wqe->ms_iov_resid = wqe->ms_iov[0].iov_len;

	wqe->ms_signal_comp = ep->ep_tx_dflt_signal_comp ||
		(flags & FI_COMPLETION) ? 1 : 0;

	/* add send to EP, and add EP to TX list if not present */
	TAILQ_INSERT_TAIL(&ep->e.msg.ep_posted_wqe, wqe, ms_link);
	usdf_msg_ep_ready(ep);

	pthread_spin_unlock(&udp->dom_progress_lock);

	usdf_domain_progress(udp);

	return 0;
}
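/*
 * Caller-side sketch (illustrative): posting a two-element gather with
 * FI_INJECT.  The provider copies both pieces into ms_inject_buf, so hdr
 * and body may be reused on return; FI_COMPLETION requests a CQ entry even
 * when the endpoint was bound with FI_SELECTIVE_COMPLETION.
 */
static ssize_t
example_sendmsg(struct fid_ep *ep, fi_addr_t peer, void *hdr, size_t hdr_len,
		void *body, size_t body_len, void *context)
{
	struct iovec iov[2] = {
		{ .iov_base = hdr, .iov_len = hdr_len },
		{ .iov_base = body, .iov_len = body_len },
	};
	struct fi_msg msg = {
		.msg_iov = iov,
		.desc = NULL,
		.iov_count = 2,
		.addr = peer,
		.context = context,
		.data = 0,
	};

	return fi_sendmsg(ep, &msg, FI_INJECT | FI_COMPLETION);
}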
/*
 * poll a soft CQ
 * This will loop over all the hard CQs within, collecting results.
 * Since this routine is an inline and is always called with format as
 * a constant, I am counting on the compiler optimizing away all the switches
 * on format.
 */
static inline ssize_t
usdf_cq_read_common_soft(struct fid_cq *fcq, void *buf, size_t count,
		enum fi_cq_format format)
{
	struct usdf_cq *cq;
	uint8_t *entry;
	uint8_t *last;
	struct usdf_cq_soft_entry *tail;
	size_t entry_len;
	ssize_t ret;

	cq = cq_ftou(fcq);
	if (cq->cq_comp.uc_status != 0) {
		return -FI_EAVAIL;
	}

	/* progress... */
	usdf_domain_progress(cq->cq_domain);

	switch (format) {
	case FI_CQ_FORMAT_CONTEXT:
		entry_len = sizeof(struct fi_cq_entry);
		break;
	case FI_CQ_FORMAT_MSG:
		entry_len = sizeof(struct fi_cq_msg_entry);
		break;
	case FI_CQ_FORMAT_DATA:
		entry_len = sizeof(struct fi_cq_data_entry);
		break;
	default:
		USDF_WARN("unexpected CQ format, internal error\n");
		return -FI_EOPNOTSUPP;
	}

	entry = buf;
	last = entry + (entry_len * count);
	tail = cq->c.soft.cq_tail;

	while (entry < last) {
		/* If the head and tail are equal and the last
		 * operation was a read then that means we have an
		 * empty queue.
		 */
		if ((tail == cq->c.soft.cq_head) &&
				(cq->c.soft.cq_last_op == USDF_SOFT_CQ_READ))
			break;

		if (tail->cse_prov_errno > 0) {
			if (entry > (uint8_t *)buf)
				break;
			else
				return -FI_EAVAIL;
		}

		ret = usdf_cq_copy_soft_entry(entry, tail, format);
		if (ret < 0) {
			return ret;
		}

		entry += entry_len;

		tail++;
		if (tail == cq->c.soft.cq_end) {
			tail = cq->c.soft.cq_comps;
		}

		cq->c.soft.cq_last_op = USDF_SOFT_CQ_READ;
	}
	cq->c.soft.cq_tail = tail;

	if (entry > (uint8_t *)buf) {
		return (entry - (uint8_t *)buf) / entry_len;
	} else {
		return -FI_EAGAIN;
	}
}
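/*
 * Sketch of the usdf_cq_copy_soft_entry() helper used above, which expands
 * one soft-CQ entry into the caller's requested completion format.  This is
 * an assumed shape for illustration only: apart from cse_prov_errno, which
 * appears in the code above, the cse_* field names are guesses.
 */
static inline ssize_t
usdf_cq_copy_soft_entry(void *entry, const struct usdf_cq_soft_entry *tail,
		enum fi_cq_format format)
{
	struct fi_cq_entry *ctx_entry;
	struct fi_cq_msg_entry *msg_entry;
	struct fi_cq_data_entry *data_entry;

	switch (format) {
	case FI_CQ_FORMAT_CONTEXT:
		ctx_entry = entry;
		ctx_entry->op_context = tail->cse_context;
		break;
	case FI_CQ_FORMAT_MSG:
		msg_entry = entry;
		msg_entry->op_context = tail->cse_context;
		msg_entry->flags = tail->cse_flags;
		msg_entry->len = tail->cse_len;
		break;
	case FI_CQ_FORMAT_DATA:
		data_entry = entry;
		data_entry->op_context = tail->cse_context;
		data_entry->flags = tail->cse_flags;
		data_entry->len = tail->cse_len;
		data_entry->buf = tail->cse_buf;
		data_entry->data = tail->cse_data;
		break;
	default:
		return -FI_EOPNOTSUPP;
	}

	return 0;
}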
static ssize_t
usdf_cq_sread_common_soft(struct fid_cq *fcq, void *buf, size_t count,
		const void *cond, int timeout_ms, enum fi_cq_format format)
{
	struct usdf_cq *cq;
	uint8_t *entry;
	uint8_t *last;
	struct usdf_cq_soft_entry *tail;
	size_t entry_len;
	size_t sleep_time_us;
	size_t time_spent_us = 0;
	ssize_t ret;

	cq = cq_ftou(fcq);

	if (cq->cq_attr.wait_obj == FI_WAIT_NONE)
		return -FI_EOPNOTSUPP;

	sleep_time_us = SREAD_INIT_SLEEP_TIME_US;

	switch (format) {
	case FI_CQ_FORMAT_CONTEXT:
		entry_len = sizeof(struct fi_cq_entry);
		break;
	case FI_CQ_FORMAT_MSG:
		entry_len = sizeof(struct fi_cq_msg_entry);
		break;
	case FI_CQ_FORMAT_DATA:
		entry_len = sizeof(struct fi_cq_data_entry);
		break;
	default:
		USDF_WARN("unexpected CQ format, internal error\n");
		return -FI_EOPNOTSUPP;
	}

	entry = buf;
	last = entry + (entry_len * count);

	while (1) {
		/* progress... */
		usdf_domain_progress(cq->cq_domain);

		tail = cq->c.soft.cq_tail;

		while (entry < last) {
			/* If the head and tail are equal and the last
			 * operation was a read then that means we have an
			 * empty queue.
			 */
			if ((tail == cq->c.soft.cq_head) &&
					(cq->c.soft.cq_last_op == USDF_SOFT_CQ_READ))
				break;

			if (tail->cse_prov_errno > 0) {
				if (entry > (uint8_t *)buf)
					break;
				else
					return -FI_EAVAIL;
			}

			ret = usdf_cq_copy_soft_entry(entry, tail, format);
			if (ret < 0)
				return ret;

			entry += entry_len;

			tail++;
			if (tail == cq->c.soft.cq_end)
				tail = cq->c.soft.cq_comps;

			cq->c.soft.cq_last_op = USDF_SOFT_CQ_READ;
		}

		if (entry > (uint8_t *)buf) {
			cq->c.soft.cq_tail = tail;
			return (entry - (uint8_t *)buf) / entry_len;
		} else {
			if (timeout_ms >= 0 &&
					(time_spent_us >= 1000 * timeout_ms))
				break;

			usleep(sleep_time_us);
			time_spent_us += sleep_time_us;

			/* exponentially back off up to a limit */
			if (sleep_time_us < SREAD_MAX_SLEEP_TIME_US)
				sleep_time_us *= SREAD_EXP_BASE;
			sleep_time_us = MIN(sleep_time_us,
					SREAD_MAX_SLEEP_TIME_US);
		}
	}

	return -FI_EAGAIN;
}
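/*
 * The backoff constants referenced above are not shown in this excerpt.
 * Plausible definitions for illustration (the exact values are assumptions,
 * not taken from the source): start at 1 us, double on each empty poll, and
 * cap the sleep at 5 ms so a late completion is still noticed promptly.
 */
#define SREAD_INIT_SLEEP_TIME_US	1
#define SREAD_MAX_SLEEP_TIME_US		5000
#define SREAD_EXP_BASE			2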