ssize_t usdf_dgram_sendv(struct fid_ep *fep, const struct iovec *iov,
		void **desc, size_t count, fi_addr_t dest_addr, void *context)
{
	struct usd_dest *dest;
	struct usdf_ep *ep;
	size_t len;

	ep = ep_ftou(fep);
	len = sizeof(struct usd_udp_hdr);

	dest = (struct usd_dest *)(uintptr_t) dest_addr;
	len += _usdf_iov_len(iov, count);

	if (len <= USD_SEND_MAX_COPY) {
		return _usdf_dgram_send_iov_copy(ep, dest, iov, count,
				context, ep->ep_tx_completion);
	} else if (ep->e.dg.tx_op_flags & FI_INJECT) {
		USDF_DBG_SYS(EP_DATA,
				"given inject length (%zu) exceeds max inject length (%d)\n",
				len, USD_SEND_MAX_COPY);
		return -FI_ENOSPC;
	}

	if (count > ep->e.dg.tx_iov_limit) {
		USDF_DBG_SYS(EP_DATA, "max iov count exceeded: %zu\n", count);
		return -FI_ENOSPC;
	}

	return _usdf_dgram_send_iov(ep, dest, iov, count, context,
			ep->ep_tx_completion);
}
/* Registered as a callback triggered by the socket becoming writeable. Writes
 * as much data as can be written in a single write() and keeps track of how
 * much data is left. If the data is not fully written, writing resumes on a
 * later iteration of the progression.
 */
static int usdf_pep_reject_async(void *vreq)
{
	struct usdf_connreq *crp;
	int ret;

	crp = vreq;

	do {
		ret = write(crp->cr_sockfd, crp->cr_ptr, crp->cr_resid);
	} while ((ret < 0) && (errno == EINTR));

	if ((ret <= 0) && (errno != EAGAIN)) {
		/* capture errno before any later call can clobber it */
		ret = -errno;
		USDF_DBG_SYS(EP_CTRL, "write failed: %s\n",
				strerror(-ret));
		usdf_cm_msg_connreq_failed(crp, ret);
		return ret;
	}

	/* EAGAIN: socket not writeable yet, retry on a later progression
	 * pass without corrupting the residual byte count */
	if (ret < 0)
		return FI_SUCCESS;

	crp->cr_resid -= ret;
	crp->cr_ptr += ret;

	if (crp->cr_resid == 0)
		usdf_cm_msg_connreq_cleanup(crp);

	return FI_SUCCESS;
}
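/* Illustrative sketch, not provider code: one way a writeable-socket callback
 * such as usdf_pep_reject_async() can be driven. A progression pass polls the
 * socket for EPOLLOUT and re-invokes the callback until cr_resid reaches
 * zero. The name poll_fd and the epoll registration are hypothetical; the
 * provider's real event plumbing is not shown in this excerpt.
 */
#include <sys/epoll.h>

static void example_progress_pending_writes(int poll_fd)
{
	struct epoll_event ev;
	int n;

	/* non-blocking progression pass: timeout of 0 */
	n = epoll_wait(poll_fd, &ev, 1, 0);
	if (n == 1 && (ev.events & EPOLLOUT)) {
		/* ev.data.ptr is assumed to be the connreq registered via
		 * epoll_ctl(); each call writes as much as the socket
		 * accepts and cleans up once everything is sent. */
		(void) usdf_pep_reject_async(ev.data.ptr);
	}
}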
ssize_t usdf_dgram_sendmsg(struct fid_ep *fep, const struct fi_msg *msg,
		uint64_t flags)
{
	USDF_DBG_SYS(EP_DATA, "flags ignored!");
	return usdf_dgram_sendv(fep, msg->msg_iov, msg->desc, msg->iov_count,
			(fi_addr_t)msg->addr, msg->context);
}
ssize_t usdf_dgram_send(struct fid_ep *fep, const void *buf, size_t len,
		void *desc, fi_addr_t dest_addr, void *context)
{
	struct usdf_dest *dest;
	struct usdf_ep *ep;
	uint32_t flags;

	ep = ep_ftou(fep);
	dest = (struct usdf_dest *)(uintptr_t) dest_addr;
	flags = (ep->ep_tx_completion) ? USD_SF_SIGNAL : 0;

	if (len + sizeof(struct usd_udp_hdr) <= USD_SEND_MAX_COPY) {
		return usd_post_send_one_copy(ep->e.dg.ep_qp,
				&dest->ds_dest, buf, len, flags, context);
	} else if (ep->e.dg.tx_op_flags & FI_INJECT) {
		USDF_DBG_SYS(EP_DATA,
				"given inject length (%zu) exceeds max inject length (%d)\n",
				len + sizeof(struct usd_udp_hdr),
				USD_SEND_MAX_COPY);
		return -FI_ENOSPC;
	}

	return usd_post_send_one(ep->e.dg.ep_qp, &dest->ds_dest, buf, len,
			flags, context);
}
ssize_t usdf_dgram_prefix_sendmsg(struct fid_ep *fep,
		const struct fi_msg *msg, uint64_t flags)
{
	struct iovec send_iov[USDF_DGRAM_MAX_SGE];
	struct usd_dest *dest;
	struct usdf_ep *ep;
	uint8_t completion;
	size_t len;
	size_t padding;

	ep = ep_ftou(fep);
	dest = (struct usd_dest *)(uintptr_t) msg->addr;
	len = _usdf_iov_len(msg->msg_iov, msg->iov_count);
	completion = ep->ep_tx_dflt_signal_comp || (flags & FI_COMPLETION);
	padding = USDF_HDR_BUF_ENTRY - sizeof(struct usd_udp_hdr);

	if (msg->iov_count > ep->e.dg.tx_iov_limit) {
		USDF_DBG_SYS(EP_DATA, "max iov count exceeded: %zu\n",
				msg->iov_count);
		return -FI_ENOSPC;
	}

	if ((len - padding) <= USD_SEND_MAX_COPY) {
		/* _usdf_dgram_send_iov_copy isn't prefix aware and allocates
		 * its own prefix. Reorganize iov[0] base to point to the
		 * data and len to reflect the data length.
		 */
		memcpy(send_iov, msg->msg_iov,
				sizeof(struct iovec) * msg->iov_count);
		send_iov[0].iov_base = ((char *) send_iov[0].iov_base +
				USDF_HDR_BUF_ENTRY);
		send_iov[0].iov_len -= USDF_HDR_BUF_ENTRY;

		return _usdf_dgram_send_iov_copy(ep, dest, send_iov,
				msg->iov_count, msg->context, completion);
	} else if (flags & FI_INJECT) {
		USDF_DBG_SYS(EP_DATA,
				"given inject length (%zu) exceeds max inject length (%d)\n",
				len, USD_SEND_MAX_COPY);
		return -FI_ENOSPC;
	}

	return _usdf_dgram_send_iov_prefix(ep, dest, msg->msg_iov,
			msg->iov_count, msg->context, completion);
}
ssize_t usdf_msg_recvmsg(struct fid_ep *fep, const struct fi_msg *msg,
		uint64_t flags)
{
	size_t i;
	struct usdf_ep *ep;
	struct usdf_rx *rx;
	struct usdf_msg_qe *rqe;
	struct usdf_domain *udp;
	size_t tot_len;
	const struct iovec *iov;

	ep = ep_ftou(fep);
	rx = ep->ep_rx;
	udp = ep->ep_domain;
	iov = msg->msg_iov;

	if (TAILQ_EMPTY(&rx->r.msg.rx_free_rqe)) {
		return -FI_EAGAIN;
	}

	if (flags & ~USDF_MSG_SUPP_RECVMSG_FLAGS) {
		USDF_DBG_SYS(EP_DATA,
				"one or more flags in %#" PRIx64 " not supported\n",
				flags);
		return -FI_EOPNOTSUPP;
	}

	pthread_spin_lock(&udp->dom_progress_lock);

	rqe = usdf_msg_get_rx_rqe(rx);

	rqe->ms_context = msg->context;
	tot_len = 0;
	for (i = 0; i < msg->iov_count; ++i) {
		rqe->ms_iov[i].iov_base = (void *)iov[i].iov_base;
		rqe->ms_iov[i].iov_len = iov[i].iov_len;
		tot_len += iov[i].iov_len;
	}
	rqe->ms_last_iov = msg->iov_count - 1;

	rqe->ms_cur_iov = 0;
	rqe->ms_resid = tot_len;
	rqe->ms_length = 0;
	rqe->ms_cur_ptr = iov[0].iov_base;
	rqe->ms_iov_resid = iov[0].iov_len;
	rqe->ms_signal_comp = (ep->ep_rx_dflt_signal_comp ||
			(flags & FI_COMPLETION)) ? 1 : 0;

	TAILQ_INSERT_TAIL(&rx->r.msg.rx_posted_rqe, rqe, ms_link);

	pthread_spin_unlock(&udp->dom_progress_lock);

	return 0;
}
ssize_t usdf_dgram_prefix_sendv(struct fid_ep *fep, const struct iovec *iov,
		void **desc, size_t count, fi_addr_t dest_addr, void *context)
{
	struct iovec send_iov[USDF_DGRAM_MAX_SGE];
	struct usd_dest *dest;
	struct usdf_ep *ep;
	size_t len;
	size_t padding;

	ep = ep_ftou(fep);
	dest = (struct usd_dest *)(uintptr_t) dest_addr;
	len = _usdf_iov_len(iov, count);
	padding = USDF_HDR_BUF_ENTRY - sizeof(struct usd_udp_hdr);

	if (count > ep->e.dg.tx_iov_limit) {
		USDF_DBG_SYS(EP_DATA, "max iov count exceeded: %zu\n", count);
		return -FI_ENOSPC;
	}

	if ((len - padding) <= USD_SEND_MAX_COPY) {
		/* _usdf_dgram_send_iov_copy isn't prefix aware and allocates
		 * its own prefix. Reorganize iov[0] base to point to the
		 * data and len to reflect the data length.
		 */
		memcpy(send_iov, iov, sizeof(struct iovec) * count);
		send_iov[0].iov_base = ((char *) send_iov[0].iov_base +
				USDF_HDR_BUF_ENTRY);
		send_iov[0].iov_len -= USDF_HDR_BUF_ENTRY;

		return _usdf_dgram_send_iov_copy(ep, dest, send_iov, count,
				context, ep->ep_tx_completion);
	} else if (ep->e.dg.tx_op_flags & FI_INJECT) {
		USDF_DBG_SYS(EP_DATA,
				"given inject length (%zu) exceeds max inject length (%d)\n",
				len, USD_SEND_MAX_COPY);
		return -FI_ENOSPC;
	}

	return _usdf_dgram_send_iov_prefix(ep, dest, iov, count, context,
			ep->ep_tx_completion);
}
static inline void usdf_msg_process_ack(struct usdf_ep *ep, uint16_t seq)
{
	struct usdf_cq_hard *hcq;
	struct usdf_msg_qe *wqe;
	struct usdf_tx *tx;
	uint16_t max_ack;
	unsigned credits;

	tx = ep->ep_tx;

	/* don't try to ACK what we don't think we've sent */
	max_ack = ep->e.msg.ep_next_tx_seq - 1;
	if (RUDP_SEQ_GT(seq, max_ack)) {
		seq = max_ack;
	}

	hcq = tx->t.msg.tx_hcq;
	while (!TAILQ_EMPTY(&ep->e.msg.ep_sent_wqe)) {
		wqe = TAILQ_FIRST(&ep->e.msg.ep_sent_wqe);
		if (RUDP_SEQ_LE(wqe->ms_last_seq, seq)) {
			TAILQ_REMOVE(&ep->e.msg.ep_sent_wqe, wqe, ms_link);
			USDF_DBG_SYS(EP_DATA,
					"send complete, signal_comp=%u\n",
					wqe->ms_signal_comp);
			if (wqe->ms_signal_comp)
				hcq->cqh_post(hcq, wqe->ms_context,
						wqe->ms_length, FI_SUCCESS,
						FI_MSG | FI_SEND);

			usdf_msg_put_tx_wqe(tx, wqe);
		} else {
			break;
		}
	}

	credits = RUDP_SEQ_DIFF(seq, ep->e.msg.ep_last_rx_ack);
	if (ep->e.msg.ep_seq_credits == 0 && credits > 0 &&
			!TAILQ_EMPTY(&ep->e.msg.ep_posted_wqe)) {
		usdf_msg_ep_ready(ep);
	}
	ep->e.msg.ep_seq_credits += credits;
	ep->e.msg.ep_last_rx_ack = seq;

	/* If all ACKed, cancel timer, else reset it */
	if (seq == max_ack) {
		usdf_timer_cancel(ep->ep_domain->dom_fabric,
				ep->e.msg.ep_ack_timer);
	} else {
		usdf_timer_reset(ep->ep_domain->dom_fabric,
				ep->e.msg.ep_ack_timer,
				USDF_RUDP_ACK_TIMEOUT);
	}
}
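/* A minimal sketch of wrap-safe 16-bit sequence comparison, showing the
 * serial-number arithmetic the RUDP_SEQ_* helpers above rely on. These EX_*
 * definitions are an assumption for illustration; the provider's actual
 * macros live in its RUDP header and are not part of this excerpt.
 */
#define EX_SEQ_GT(a, b)   ((int16_t)((uint16_t)(a) - (uint16_t)(b)) > 0)
#define EX_SEQ_LE(a, b)   ((int16_t)((uint16_t)(a) - (uint16_t)(b)) <= 0)
#define EX_SEQ_DIFF(a, b) ((uint16_t)((uint16_t)(a) - (uint16_t)(b)))

/* Example: an ACK for seq 0x0002 arriving after the sequence space wraps
 * past 0xfffe still compares correctly: the subtraction wraps, so
 * EX_SEQ_DIFF(0x0002, 0xfffe) == 4 and EX_SEQ_GT(0x0002, 0xfffe) is true,
 * and the credit accounting above keeps working across rollover. */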
ssize_t usdf_msg_rx_size_left(struct fid_ep *fep)
{
	struct usdf_ep *ep;
	struct usdf_rx *rx;

	USDF_DBG_SYS(EP_DATA, "\n");

	ep = ep_ftou(fep);
	rx = ep->ep_rx;
	if (rx == NULL)
		return -FI_EOPBADSTATE; /* EP not enabled */

	return rx->r.msg.rx_num_free_rqe;
}
ssize_t usdf_msg_tx_size_left(struct fid_ep *fep)
{
	struct usdf_ep *ep;
	struct usdf_tx *tx;

	USDF_DBG_SYS(EP_DATA, "\n");

	ep = ep_ftou(fep);
	tx = ep->ep_tx;
	if (tx == NULL)
		return -FI_EOPBADSTATE; /* EP not enabled */

	return tx->t.msg.tx_num_free_wqe;
}
static inline int usdf_cqe_to_flags(struct usd_completion *comp)
{
	switch (comp->uc_type) {
	case USD_COMPTYPE_SEND:
		return (FI_MSG | FI_SEND);
	case USD_COMPTYPE_RECV:
		return (FI_MSG | FI_RECV);
	default:
		USDF_DBG_SYS(CQ, "WARNING: unknown completion type! (%d)\n",
				comp->uc_type);
		return 0;
	}
}
ssize_t usdf_msg_rx_size_left(struct fid_ep *fep)
{
	struct usdf_ep *ep;
	struct usdf_rx *rx;

	USDF_DBG_SYS(EP_DATA, "\n");

	ep = ep_ftou(fep);
	rx = ep->ep_rx;

	if (!(ep->flags & USDF_EP_ENABLED))
		return -FI_EOPBADSTATE;

	return rx->r.msg.rx_num_free_rqe;
}
ssize_t usdf_msg_tx_size_left(struct fid_ep *fep)
{
	struct usdf_ep *ep;
	struct usdf_tx *tx;

	USDF_DBG_SYS(EP_DATA, "\n");

	ep = ep_ftou(fep);
	tx = ep->ep_tx;

	if (!(ep->flags & USDF_EP_ENABLED))
		return -FI_EOPBADSTATE;

	return tx->t.msg.tx_num_free_wqe;
}
ssize_t usdf_dgram_sendmsg(struct fid_ep *fep, const struct fi_msg *msg,
		uint64_t flags)
{
	struct usd_dest *dest;
	struct usdf_ep *ep;
	uint8_t completion;
	size_t len;

	ep = ep_ftou(fep);
	len = sizeof(struct usd_udp_hdr);
	dest = (struct usd_dest *)(uintptr_t) msg->addr;
	completion = ep->ep_tx_dflt_signal_comp || (flags & FI_COMPLETION);

	len += _usdf_iov_len(msg->msg_iov, msg->iov_count);

	if (len <= USD_SEND_MAX_COPY) {
		return _usdf_dgram_send_iov_copy(ep, dest, msg->msg_iov,
				msg->iov_count, msg->context, completion);
	} else if (flags & FI_INJECT) {
		USDF_DBG_SYS(EP_DATA,
				"given inject length (%zu) exceeds max inject length (%d)\n",
				len, USD_SEND_MAX_COPY);
		return -FI_ENOSPC;
	}

	if (msg->iov_count > ep->e.dg.tx_iov_limit) {
		USDF_DBG_SYS(EP_DATA, "max iov count exceeded: %zu\n",
				msg->iov_count);
		return -FI_ENOSPC;
	}

	return _usdf_dgram_send_iov(ep, dest, msg->msg_iov, msg->iov_count,
			msg->context, completion);
}
ssize_t usdf_dgram_prefix_send(struct fid_ep *fep, const void *buf,
		size_t len, void *desc, fi_addr_t dest_addr, void *context)
{
	struct usd_udp_hdr *hdr;
	struct usd_qp_impl *qp;
	struct usdf_dest *dest;
	struct usdf_ep *ep;
	struct usd_wq *wq;
	uint32_t last_post;
	uint32_t flags;
	size_t padding;

	ep = ep_ftou(fep);
	dest = (struct usdf_dest *)(uintptr_t) dest_addr;
	padding = USDF_HDR_BUF_ENTRY - sizeof(struct usd_udp_hdr);
	flags = (ep->ep_tx_completion) ? USD_SF_SIGNAL : 0;

	if (ep->e.dg.tx_op_flags & FI_INJECT) {
		if ((len - padding) > USD_SEND_MAX_COPY) {
			USDF_DBG_SYS(EP_DATA,
					"given inject length (%zu) exceeds max inject length (%d)\n",
					len, USD_SEND_MAX_COPY);
			return -FI_ENOSPC;
		}

		/* cast before the offset: arithmetic on void * is a GNU
		 * extension, not standard C */
		return usd_post_send_one_copy(ep->e.dg.ep_qp,
				&dest->ds_dest,
				(const char *) buf + USDF_HDR_BUF_ENTRY,
				len - USDF_HDR_BUF_ENTRY, flags, context);
	}

	qp = to_qpi(ep->e.dg.ep_qp);
	wq = &qp->uq_wq;

	hdr = (struct usd_udp_hdr *) ((char *) buf + padding);
	memcpy(hdr, &dest->ds_dest.ds_dest.ds_udp.u_hdr, sizeof(*hdr));

	_usdf_adjust_prefix_hdr(hdr, qp, len, padding);

	last_post = _usd_post_send_one(wq, hdr, len - padding,
			ep->ep_tx_completion);

	_usdf_adjust_post_info(wq, last_post, context,
			len - USDF_HDR_BUF_ENTRY);

	return FI_SUCCESS;
}
ssize_t usdf_dgram_tx_size_left(struct fid_ep *fep)
{
	struct usdf_ep *ep;

	USDF_DBG_SYS(EP_DATA, "\n");

	if (fep == NULL)
		return -FI_EINVAL;

	ep = ep_ftou(fep);
	if (ep->e.dg.ep_qp == NULL)
		return -FI_EOPBADSTATE; /* EP not enabled */

	return usd_get_send_credits(ep->e.dg.ep_qp) /
			(ep->e.dg.tx_iov_limit + 1);
}
ssize_t usdf_dgram_tx_size_left(struct fid_ep *fep)
{
	struct usdf_ep *ep;

	USDF_DBG_SYS(EP_DATA, "\n");

	if (fep == NULL)
		return -FI_EINVAL;

	ep = ep_ftou(fep);
	if (ep->e.dg.ep_qp == NULL)
		return -FI_EOPBADSTATE; /* EP not enabled */

	/* see NOTE-SIZE-LEFT */
	return usd_get_send_credits(ep->e.dg.ep_qp) /
			(USDF_DGRAM_DFLT_SGE + 1);
}
/* Given a connection request structure containing data, make a copy of the
 * data that can be accessed in error entries on the EQ. The return value is
 * the size of the data stored in the error entry. If the return value is
 * non-negative, then the function has succeeded and the size and output data
 * can be assumed valid. If the function fails, then the data will be NULL
 * and the size will be a negative error value.
 */
static int usdf_cm_generate_err_data(struct usdf_eq *eq,
		struct usdf_connreq *crp, void **data)
{
	struct usdf_err_data_entry *err_data_entry;
	struct usdf_connreq_msg *reqp;
	size_t entry_size;
	size_t data_size;

	if (!eq || !crp || !data) {
		USDF_DBG_SYS(EP_CTRL, "eq, crp, or data is NULL.\n");
		return -FI_EINVAL;
	}

	/* Initialize to NULL so data can't be used in the error case. */
	*data = NULL;

	reqp = (struct usdf_connreq_msg *) crp->cr_data;

	/* This is a normal case, maybe there was no data. */
	if (!reqp || !reqp->creq_datalen)
		return 0;

	data_size = reqp->creq_datalen;

	entry_size = sizeof(*err_data_entry) + data_size;

	err_data_entry = calloc(1, entry_size);
	if (!err_data_entry) {
		USDF_WARN_SYS(EP_CTRL,
				"failed to allocate err data entry\n");
		return -FI_ENOMEM;
	}

	/* This data should be copied and owned by the provider. Keep track
	 * of it in the EQ; it will be freed in the next EQ read call after
	 * it has been read.
	 */
	memcpy(err_data_entry->err_data, reqp->creq_data, data_size);
	slist_insert_tail(&err_data_entry->entry, &eq->eq_err_data);

	*data = err_data_entry->err_data;

	return data_size;
}
ssize_t usdf_dgram_prefix_tx_size_left(struct fid_ep *fep)
{
	struct usdf_ep *ep;

	USDF_DBG_SYS(EP_DATA, "\n");

	if (fep == NULL)
		return -FI_EINVAL;

	ep = ep_ftou(fep);
	if (ep->e.dg.ep_qp == NULL)
		return -FI_EOPBADSTATE; /* EP not enabled */

	/* prefix_sendv can post up to iov_limit descriptors */
	return (usd_get_send_credits(ep->e.dg.ep_qp) /
			ep->e.dg.tx_iov_limit);
}
ssize_t usdf_dgram_prefix_rx_size_left(struct fid_ep *fep)
{
	struct usdf_ep *ep;

	USDF_DBG_SYS(EP_DATA, "\n");

	if (fep == NULL)
		return -FI_EINVAL;

	ep = ep_ftou(fep);
	if (ep->e.dg.ep_qp == NULL)
		return -FI_EOPBADSTATE; /* EP not enabled */

	/* prefix_recvv can post up to iov_limit descriptors
	 *
	 * also see NOTE-SIZE-LEFT */
	return (usd_get_recv_credits(ep->e.dg.ep_qp) /
			USDF_DGRAM_DFLT_SGE);
}
/* Report a connection-management-related failure. Sometimes there is
 * connection event data that should be copied into the generated event. If
 * the copy_data parameter evaluates to true, then the data will be copied.
 *
 * If data is to be generated for the error entry, then the connection
 * request is assumed to have the data size in host order. If something fails
 * during processing of the error data, then the EQ entry will still be
 * generated without the error data.
 */
void usdf_cm_report_failure(struct usdf_connreq *crp, int error,
		bool copy_data)
{
	struct fi_eq_err_entry err = {0};
	struct usdf_pep *pep;
	struct usdf_ep *ep;
	struct usdf_eq *eq;
	fid_t fid;
	int ret;

	USDF_DBG_SYS(EP_CTRL, "error=%d (%s)\n", error, fi_strerror(error));

	pep = crp->cr_pep;
	ep = crp->cr_ep;

	if (ep != NULL) {
		fid = ep_utofid(ep);
		eq = ep->ep_eq;
		ep->ep_domain->dom_peer_tab[ep->e.msg.ep_rem_peer_id] = NULL;
	} else {
		fid = pep_utofid(pep);
		eq = pep->pep_eq;
	}

	/* Try to generate the space necessary for the error data. If the
	 * function returns a number greater than or equal to 0, then it was
	 * a success. The return value is the size of the data.
	 */
	if (copy_data) {
		ret = usdf_cm_generate_err_data(eq, crp, &err.err_data);
		if (ret >= 0)
			err.err_data_size = ret;
	}

	err.fid = fid;
	err.err = -error;

	usdf_eq_write_internal(eq, 0, &err, sizeof(err),
			USDF_EVENT_FLAG_ERROR);
	usdf_cm_msg_connreq_cleanup(crp);
}
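/* Illustrative application-side sketch, not provider code: the error entry
 * written by usdf_cm_report_failure() surfaces through fi_eq_readerr(). Per
 * the ownership comment in usdf_cm_generate_err_data(), err_data points at
 * provider-owned memory that stays valid only until the next EQ read call.
 * Assumes the usual <rdma/fi_eq.h> and <stdio.h> includes.
 */
static void example_read_cm_error(struct fid_eq *eq)
{
	struct fi_eq_err_entry err = {0};
	ssize_t ret;

	/* returns the size of the error entry on success */
	ret = fi_eq_readerr(eq, &err, 0);
	if (ret > 0)
		fprintf(stderr, "connection failed: %s (err_data_size=%zu)\n",
				fi_strerror(err.err), err.err_data_size);
}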
ssize_t usdf_dgram_rx_size_left(struct fid_ep *fep)
{
	struct usdf_ep *ep;

	USDF_DBG_SYS(EP_DATA, "\n");

	if (fep == NULL)
		return -FI_EINVAL;

	ep = ep_ftou(fep);
	if (ep->e.dg.ep_qp == NULL)
		return -FI_EOPBADSTATE; /* EP not enabled */

	/* NOTE-SIZE-LEFT: divide by constant right now, rather than keeping
	 * track of the rx_attr->iov_limit value we gave to the user. This
	 * sometimes under-reports the number of RX ops that could be posted,
	 * but it avoids touching a cache line that we don't otherwise need.
	 *
	 * sendv/recvv could potentially post iov_limit+1 descriptors
	 */
	return usd_get_recv_credits(ep->e.dg.ep_qp) /
			(USDF_DGRAM_DFLT_SGE + 1);
}
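/* Worked example for NOTE-SIZE-LEFT above (the numbers, including the value
 * of USDF_DGRAM_DFLT_SGE, are illustrative assumptions): with 512 receive
 * credits and USDF_DGRAM_DFLT_SGE == 8, this reports 512 / (8 + 1) = 56
 * postable RX ops. A caller posting single-iov receives consumes fewer
 * credits per op, so the true capacity is higher; the constant divisor
 * deliberately under-reports instead of tracking the per-EP iov_limit on a
 * separate cache line. */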
ssize_t usdf_dgram_inject(struct fid_ep *fep, const void *buf, size_t len,
		fi_addr_t dest_addr)
{
	struct usdf_dest *dest;
	struct usdf_ep *ep;

	ep = ep_ftou(fep);
	dest = (struct usdf_dest *)(uintptr_t) dest_addr;

	if (len + sizeof(struct usd_udp_hdr) > USD_SEND_MAX_COPY) {
		USDF_DBG_SYS(EP_DATA,
				"given inject length (%zu) exceeds max inject length (%d)\n",
				len + sizeof(struct usd_udp_hdr),
				USD_SEND_MAX_COPY);
		return -FI_ENOSPC;
	}

	/*
	 * fi_inject never generates a completion
	 */
	return usd_post_send_one_copy(ep->e.dg.ep_qp, &dest->ds_dest, buf,
			len, 0, NULL);
}
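/* Illustrative caller-side sketch, not provider code: fi_inject() is the
 * public entry point that lands in usdf_dgram_inject() above. The buffer is
 * reusable as soon as the call returns, and no TX completion is generated.
 * "ep" and "dest_addr" are assumed to come from the usual
 * fi_endpoint()/fi_av_insert() setup; assumes <rdma/fi_endpoint.h> and
 * <stdio.h> includes.
 */
static void example_inject_ping(struct fid_ep *ep, fi_addr_t dest_addr)
{
	static const char payload[] = "ping";
	ssize_t ret;

	do {
		ret = fi_inject(ep, payload, sizeof(payload), dest_addr);
	} while (ret == -FI_EAGAIN);	/* retry while the TX queue is full */

	if (ret)
		fprintf(stderr, "fi_inject: %s\n", fi_strerror((int) -ret));
}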
int usdf_pep_open(struct fid_fabric *fabric, struct fi_info *info,
		struct fid_pep **pep_o, void *context)
{
	struct usdf_pep *pep;
	struct usdf_fabric *fp;
	struct sockaddr_in *sin;
	int ret;
	int optval;

	USDF_TRACE_SYS(EP_CTRL, "\n");

	if (!info) {
		USDF_DBG_SYS(EP_CTRL, "null fi_info struct is invalid\n");
		return -FI_EINVAL;
	}

	if (info->ep_attr->type != FI_EP_MSG) {
		return -FI_ENODEV;
	}

	if ((info->caps & ~USDF_MSG_CAPS) != 0) {
		return -FI_EBADF;
	}

	switch (info->addr_format) {
	case FI_SOCKADDR:
		if (((struct sockaddr *)info->src_addr)->sa_family !=
				AF_INET) {
			USDF_WARN_SYS(EP_CTRL,
					"non-AF_INET src_addr specified\n");
			return -FI_EINVAL;
		}
		break;
	case FI_SOCKADDR_IN:
		break;
	default:
		USDF_WARN_SYS(EP_CTRL, "unknown/unsupported addr_format\n");
		return -FI_EINVAL;
	}

	if (info->src_addrlen &&
			info->src_addrlen != sizeof(struct sockaddr_in)) {
		USDF_WARN_SYS(EP_CTRL, "unexpected src_addrlen\n");
		return -FI_EINVAL;
	}

	fp = fab_ftou(fabric);

	pep = calloc(1, sizeof(*pep));
	if (pep == NULL) {
		return -FI_ENOMEM;
	}

	pep->pep_fid.fid.fclass = FI_CLASS_PEP;
	pep->pep_fid.fid.context = context;
	pep->pep_fid.fid.ops = &usdf_pep_ops;
	pep->pep_fid.ops = &usdf_pep_base_ops;
	pep->pep_fid.cm = &usdf_pep_cm_ops;
	pep->pep_fabric = fp;

	pep->pep_state = USDF_PEP_UNBOUND;
	pep->pep_sock = socket(AF_INET, SOCK_STREAM, 0);
	if (pep->pep_sock == -1) {
		ret = -errno;
		goto fail;
	}

	ret = fcntl(pep->pep_sock, F_GETFL, 0);
	if (ret == -1) {
		ret = -errno;
		goto fail;
	}
	ret = fcntl(pep->pep_sock, F_SETFL, ret | O_NONBLOCK);
	if (ret == -1) {
		ret = -errno;
		goto fail;
	}

	/* set SO_REUSEADDR to prevent annoying "Address already in use"
	 * errors on successive runs of programs listening on a well known
	 * port */
	optval = 1;
	ret = setsockopt(pep->pep_sock, SOL_SOCKET, SO_REUSEADDR, &optval,
			sizeof(optval));
	if (ret == -1) {
		ret = -errno;
		goto fail;
	}

	pep->pep_info = fi_dupinfo(info);
	if (!pep->pep_info) {
		ret = -FI_ENOMEM;
		goto fail;
	}

	if (info->src_addrlen == 0) {
		/* Copy the source address information from the device
		 * attributes.
		 */
		pep->pep_info->src_addrlen = sizeof(struct sockaddr_in);
		sin = calloc(1, pep->pep_info->src_addrlen);
		if (!sin) {
			USDF_WARN_SYS(EP_CTRL,
					"calloc for src address failed\n");
			ret = -FI_ENOMEM;
			goto fail;
		}

		sin->sin_family = AF_INET;
		sin->sin_addr.s_addr = fp->fab_dev_attrs->uda_ipaddr_be;
		pep->pep_info->src_addr = sin;
	}

	memcpy(&pep->pep_src_addr, pep->pep_info->src_addr,
			pep->pep_info->src_addrlen);

	/* initialize connreq freelist */
	ret = pthread_spin_init(&pep->pep_cr_lock, PTHREAD_PROCESS_PRIVATE);
	if (ret != 0) {
		ret = -ret;
		goto fail;
	}
	TAILQ_INIT(&pep->pep_cr_free);
	TAILQ_INIT(&pep->pep_cr_pending);
	pep->pep_backlog = 10;
	ret = usdf_pep_grow_backlog(pep);
	if (ret != 0) {
		goto fail;
	}

	atomic_initialize(&pep->pep_refcnt, 0);
	atomic_inc(&fp->fab_refcnt);

	*pep_o = pep_utof(pep);
	return 0;

fail:
	if (pep != NULL) {
		usdf_pep_free_cr_lists(pep);
		if (pep->pep_sock != -1) {
			close(pep->pep_sock);
		}
		fi_freeinfo(pep->pep_info);
		free(pep);
	}
	return ret;
}
ssize_t usdf_msg_sendmsg(struct fid_ep *fep, const struct fi_msg *msg,
		uint64_t flags)
{
	size_t i;
	struct usdf_ep *ep;
	struct usdf_tx *tx;
	struct usdf_msg_qe *wqe;
	struct usdf_domain *udp;
	size_t tot_len;
	const struct iovec *iov;

	ep = ep_ftou(fep);
	tx = ep->ep_tx;
	udp = ep->ep_domain;
	iov = msg->msg_iov;

	if (flags & ~USDF_MSG_SUPP_SENDMSG_FLAGS) {
		USDF_DBG_SYS(EP_DATA,
				"one or more flags in %#" PRIx64 " not supported\n",
				flags);
		return -FI_EOPNOTSUPP;
	}

	/* check for inject overrun before acquiring lock and allocating wqe,
	 * easier to unwind this way */
	if (flags & FI_INJECT) {
		iov = msg->msg_iov;
		tot_len = 0;
		for (i = 0; i < msg->iov_count; ++i) {
			tot_len += iov[i].iov_len;
			if (tot_len > USDF_MSG_MAX_INJECT_SIZE) {
				USDF_DBG_SYS(EP_DATA,
						"max inject len exceeded (%zu)\n",
						tot_len);
				return -FI_EINVAL;
			}
		}
	}

	if (TAILQ_EMPTY(&tx->t.msg.tx_free_wqe)) {
		return -FI_EAGAIN;
	}

	pthread_spin_lock(&udp->dom_progress_lock);

	wqe = usdf_msg_get_tx_wqe(tx);

	wqe->ms_context = msg->context;
	if (flags & FI_INJECT) {
		tot_len = 0;
		for (i = 0; i < msg->iov_count; ++i) {
			assert(tot_len + iov[i].iov_len <=
					USDF_MSG_MAX_INJECT_SIZE);
			memcpy(&wqe->ms_inject_buf[tot_len], iov[i].iov_base,
					iov[i].iov_len);
			tot_len += iov[i].iov_len;
		}
		wqe->ms_iov[0].iov_base = wqe->ms_inject_buf;
		wqe->ms_iov[0].iov_len = tot_len;
		wqe->ms_last_iov = 0;
	} else {
		tot_len = 0;
		for (i = 0; i < msg->iov_count; ++i) {
			wqe->ms_iov[i].iov_base = (void *)iov[i].iov_base;
			wqe->ms_iov[i].iov_len = iov[i].iov_len;
			tot_len += iov[i].iov_len;
		}
		wqe->ms_last_iov = msg->iov_count - 1;
	}

	wqe->ms_cur_iov = 0;
	wqe->ms_resid = tot_len;
	wqe->ms_length = tot_len;
	/* start from the wqe's own iov so that, in the FI_INJECT case, the
	 * copy in ms_inject_buf is what actually gets sent */
	wqe->ms_cur_ptr = wqe->ms_iov[0].iov_base;
	wqe->ms_iov_resid = wqe->ms_iov[0].iov_len;
	wqe->ms_signal_comp = (ep->ep_tx_dflt_signal_comp ||
			(flags & FI_COMPLETION)) ? 1 : 0;

	/* add send to EP, and add EP to TX list if not present */
	TAILQ_INSERT_TAIL(&ep->e.msg.ep_posted_wqe, wqe, ms_link);
	usdf_msg_ep_ready(ep);

	pthread_spin_unlock(&udp->dom_progress_lock);

	usdf_domain_progress(udp);

	return 0;
}
int usdf_msg_fill_dom_attr(uint32_t version, const struct fi_info *hints,
		struct fi_info *fi, struct usd_device_attrs *dap)
{
	int ret;
	struct fi_domain_attr defaults;

	defaults = msg_dflt_domain_attr;

	ret = usdf_domain_getname(version, dap, &defaults.name);
	if (ret < 0)
		return -FI_ENODATA;

	if (!hints || !hints->domain_attr)
		goto catch;

	/* how to handle fi_thread_fid, fi_thread_completion, etc? */
	switch (hints->domain_attr->threading) {
	case FI_THREAD_UNSPEC:
	case FI_THREAD_ENDPOINT:
		break;
	default:
		return -FI_ENODATA;
	}

	/* how to handle fi_progress_manual? */
	switch (hints->domain_attr->control_progress) {
	case FI_PROGRESS_UNSPEC:
	case FI_PROGRESS_AUTO:
		break;
	default:
		return -FI_ENODATA;
	}

	switch (hints->domain_attr->data_progress) {
	case FI_PROGRESS_UNSPEC:
	case FI_PROGRESS_MANUAL:
		break;
	default:
		return -FI_ENODATA;
	}

	switch (hints->domain_attr->resource_mgmt) {
	case FI_RM_UNSPEC:
	case FI_RM_DISABLED:
		break;
	default:
		return -FI_ENODATA;
	}

	switch (hints->domain_attr->caps) {
	case 0:
	case FI_REMOTE_COMM:
		break;
	default:
		USDF_WARN_SYS(DOMAIN, "invalid domain capabilities\n");
		return -FI_ENODATA;
	}

	if (usdf_check_mr_mode(version, hints, defaults.mr_mode))
		return -FI_ENODATA;

	if (hints->domain_attr->mr_cnt <= USDF_MSG_MR_CNT) {
		defaults.mr_cnt = hints->domain_attr->mr_cnt;
	} else {
		USDF_DBG_SYS(DOMAIN, "mr_count exceeded provider limit\n");
		return -FI_ENODATA;
	}

catch:
	/* all hints validated (or absent); report the resulting attributes */
	*fi->domain_attr = defaults;

	return FI_SUCCESS;
}
/*
 * Handle a receive on a queue servicing a message endpoint
 */
static inline void usdf_msg_handle_recv(struct usdf_domain *udp,
		struct usd_completion *comp)
{
	struct rudp_pkt *pkt;
	struct usdf_msg_qe *rqe;
	struct usdf_ep *ep;
	struct usd_qp *qp;
	struct usdf_rx *rx;
	uint32_t peer_id;
	uint32_t opcode;
	uint8_t *rx_ptr;
	uint8_t *rqe_ptr;
	size_t cur_iov;
	size_t iov_resid;
	size_t ms_resid;
	size_t rxlen;
	size_t copylen;
	int ret;

	pkt = comp->uc_context;
	opcode = ntohs(pkt->msg.opcode);
	peer_id = ntohs(pkt->msg.src_peer_id);
	if (peer_id > USDF_MAX_PEERS) {
		qp = comp->uc_qp;
		rx = qp->uq_context;
		goto dropit;
	}
	ep = udp->dom_peer_tab[peer_id];
	if (ep == NULL) {
		qp = comp->uc_qp;
		rx = qp->uq_context;
		goto dropit;
	}
	rx = ep->ep_rx;

	if (comp->uc_status != USD_COMPSTAT_SUCCESS)
		goto dropit;

	switch (opcode) {
	case RUDP_OP_ACK:
		usdf_msg_rx_ack(ep, pkt);
		goto dropit;

	case RUDP_OP_NAK:
		usdf_msg_rx_nak(ep, pkt);
		goto dropit;

	case RUDP_OP_FIRST:
	case RUDP_OP_LAST:
		break;

	default:
		USDF_DBG_SYS(EP_DATA,
				"encountered unexpected opcode %" PRIu32 "\n",
				opcode);
		goto dropit;
	}

	ret = usdf_msg_check_seq(ep, pkt);
	if (ret == -1) {
		goto dropit;
	}

	rqe = ep->e.msg.ep_cur_recv;
	if (rqe == NULL) {
		if (TAILQ_EMPTY(&rx->r.msg.rx_posted_rqe)) {
			goto dropit;
		}

		rqe = TAILQ_FIRST(&rx->r.msg.rx_posted_rqe);
		TAILQ_REMOVE(&rx->r.msg.rx_posted_rqe, rqe, ms_link);
		ep->e.msg.ep_cur_recv = rqe;
	}

	rx_ptr = (uint8_t *)(pkt + 1);
	rxlen = ntohs(pkt->msg.m.rc_data.length);
	rqe->ms_length += rxlen;
	rqe_ptr = (uint8_t *)rqe->ms_cur_ptr;
	iov_resid = rqe->ms_iov_resid;
	cur_iov = rqe->ms_cur_iov;
	ms_resid = rqe->ms_resid;
	while (rxlen > 0) {
		copylen = MIN(rxlen, iov_resid);
		memcpy(rqe_ptr, rx_ptr, copylen);
		rx_ptr += copylen;
		rxlen -= copylen;
		iov_resid -= copylen;
		ms_resid -= copylen;
		if (iov_resid == 0) {
			if (cur_iov == rqe->ms_last_iov) {
				break;
			}
			++cur_iov;
			rqe_ptr = rqe->ms_iov[cur_iov].iov_base;
			iov_resid = rqe->ms_iov[cur_iov].iov_len;
		} else {
			rqe_ptr += copylen;
		}
	}

	if (opcode & RUDP_OP_LAST) {
		/*
		 * Normally we would need to store back the updated values of
		 * ms_resid, ms_cur_iov, ms_cur_ptr and ms_iov_resid. But
		 * since this is the last step of the process, updating
		 * these values is not necessary.
		 */
		if (rxlen > 0) {
			USDF_DBG_SYS(EP_DATA,
					"message truncated by %zu bytes\n",
					rxlen);
			rqe->ms_length -= rxlen;
			usdf_msg_recv_complete(ep, rqe, FI_ETRUNC);
		} else {
			usdf_msg_recv_complete(ep, rqe, FI_SUCCESS);
		}

		ep->e.msg.ep_cur_recv = NULL;
	} else {
		rqe->ms_cur_ptr = rqe_ptr;
		rqe->ms_iov_resid = iov_resid;
		rqe->ms_cur_iov = cur_iov;
		rqe->ms_resid = ms_resid;
	}

dropit:
	/* repost buffer */
	_usdf_msg_post_recv(rx, pkt,
			rx->rx_domain->dom_fabric->fab_dev_attrs->uda_mtu);
}
int usdf_domain_open(struct fid_fabric *fabric, struct fi_info *info,
		struct fid_domain **domain, void *context)
{
	struct usdf_fabric *fp;
	struct usdf_domain *udp;
	struct sockaddr_in *sin;
	size_t addrlen;
	int ret;
#if ENABLE_DEBUG
	char requested[INET_ADDRSTRLEN], actual[INET_ADDRSTRLEN];
#endif

	USDF_TRACE_SYS(DOMAIN, "\n");
	sin = NULL;

	fp = fab_fidtou(fabric);

	if (info->domain_attr != NULL) {
		/* No versioning information available here. */
		if (!usdf_domain_checkname(0, fp->fab_dev_attrs,
					info->domain_attr->name)) {
			USDF_WARN_SYS(DOMAIN, "domain name mismatch\n");
			return -FI_ENODATA;
		}

		if (ofi_check_mr_mode(fabric->api_version,
				OFI_MR_BASIC_MAP | FI_MR_LOCAL,
				info->domain_attr->mr_mode)) {
			/* the caller ignored our fi_getinfo results */
			USDF_WARN_SYS(DOMAIN, "MR mode (%d) not supported\n",
					info->domain_attr->mr_mode);
			return -FI_ENODATA;
		}
	}

	udp = calloc(1, sizeof *udp);
	if (udp == NULL) {
		USDF_DBG("unable to alloc mem for domain\n");
		ret = -FI_ENOMEM;
		goto fail;
	}

	USDF_DBG("uda_devname=%s\n", fp->fab_dev_attrs->uda_devname);

	/*
	 * Make sure address format is good and matches this fabric
	 */
	switch (info->addr_format) {
	case FI_SOCKADDR:
		addrlen = sizeof(struct sockaddr);
		sin = info->src_addr;
		break;
	case FI_SOCKADDR_IN:
		addrlen = sizeof(struct sockaddr_in);
		sin = info->src_addr;
		break;
	case FI_ADDR_STR:
		sin = usdf_format_to_sin(info, info->src_addr);
		goto skip_size_check;
	default:
		ret = -FI_EINVAL;
		goto fail;
	}

	if (info->src_addrlen != addrlen) {
		ret = -FI_EINVAL;
		goto fail;
	}

skip_size_check:
	if (sin->sin_family != AF_INET ||
			sin->sin_addr.s_addr !=
				fp->fab_dev_attrs->uda_ipaddr_be) {
		USDF_DBG_SYS(DOMAIN,
				"requested src_addr (%s) != fabric addr (%s)\n",
				inet_ntop(AF_INET, &sin->sin_addr.s_addr,
					requested, sizeof(requested)),
				inet_ntop(AF_INET,
					&fp->fab_dev_attrs->uda_ipaddr_be,
					actual, sizeof(actual)));

		ret = -FI_EINVAL;
		usdf_free_sin_if_needed(info, sin);
		goto fail;
	}
	usdf_free_sin_if_needed(info, sin);

	ret = usd_open(fp->fab_dev_attrs->uda_devname, &udp->dom_dev);
	if (ret != 0) {
		goto fail;
	}

	udp->dom_fid.fid.fclass = FI_CLASS_DOMAIN;
	udp->dom_fid.fid.context = context;
	udp->dom_fid.fid.ops = &usdf_fid_ops;
	udp->dom_fid.ops = &usdf_domain_ops;
	udp->dom_fid.mr = &usdf_domain_mr_ops;

	ret = pthread_spin_init(&udp->dom_progress_lock,
			PTHREAD_PROCESS_PRIVATE);
	if (ret != 0) {
		ret = -ret;
		goto fail;
	}
	TAILQ_INIT(&udp->dom_tx_ready);
	TAILQ_INIT(&udp->dom_hcq_list);

	udp->dom_info = fi_dupinfo(info);
	if (udp->dom_info == NULL) {
		ret = -FI_ENOMEM;
		goto fail;
	}
	if (udp->dom_info->dest_addr != NULL) {
		free(udp->dom_info->dest_addr);
		udp->dom_info->dest_addr = NULL;
	}

	ret = usdf_dom_rdc_alloc_data(udp);
	if (ret != 0) {
		goto fail;
	}

	udp->dom_fabric = fp;
	LIST_INSERT_HEAD(&fp->fab_domain_list, udp, dom_link);
	ofi_atomic_initialize32(&udp->dom_refcnt, 0);
	ofi_atomic_inc32(&fp->fab_refcnt);

	*domain = &udp->dom_fid;
	return 0;

fail:
	if (udp != NULL) {
		if (udp->dom_info != NULL) {
			fi_freeinfo(udp->dom_info);
		}
		if (udp->dom_dev != NULL) {
			usd_close(udp->dom_dev);
		}
		usdf_dom_rdc_free_data(udp);
		free(udp);
	}
	return ret;
}
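/* Illustrative caller-side sketch, not provider code: usdf_domain_open() is
 * reached through the standard fi_domain() call with an fi_info obtained
 * from fi_getinfo(), which is what makes the domain-name and MR-mode checks
 * above pass. Assumes <rdma/fi_domain.h> and <stdio.h> includes.
 */
static int example_open_domain(struct fid_fabric *fabric,
		struct fi_info *info, struct fid_domain **domain)
{
	int ret;

	ret = fi_domain(fabric, info, domain, NULL);
	if (ret)
		fprintf(stderr, "fi_domain: %s\n", fi_strerror(-ret));

	return ret;
}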