static void rxd_set_rx_credits(struct rxd_ep *ep, struct rxd_rx_entry *rx_entry)
{
	size_t num_pkts, avail, size_left;

	/* Ceil-divide the remaining bytes by the MTU, then grant at most
	 * the endpoint's available credits, capped at RXD_MAX_RX_CREDITS. */
	size_left = rx_entry->op_hdr.size - rx_entry->done;
	num_pkts = (size_left + rxd_ep_domain(ep)->max_mtu_sz - 1) /
		   rxd_ep_domain(ep)->max_mtu_sz;
	avail = MIN(ep->credits, num_pkts);
	rx_entry->credits = MIN(avail, RXD_MAX_RX_CREDITS);
	rx_entry->last_win_seg += rx_entry->credits;
	ep->credits -= rx_entry->credits;
}
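/*
 * A minimal standalone sketch (not part of the provider) of the credit
 * arithmetic above: the remaining bytes are ceil-divided by the MTU to get
 * the packet count, which is then clamped by both the endpoint's available
 * credits and RXD_MAX_RX_CREDITS.  All numeric values are hypothetical.
 */
#include <stdio.h>
#include <stddef.h>

#define SKETCH_MIN(a, b) ((a) < (b) ? (a) : (b))

int main(void)
{
	size_t size_left = 10000;	/* hypothetical bytes still expected */
	size_t max_mtu_sz = 4096;	/* hypothetical datagram MTU */
	size_t ep_credits = 8;		/* hypothetical endpoint credits */
	size_t max_rx_credits = 4;	/* stand-in for RXD_MAX_RX_CREDITS */

	size_t num_pkts = (size_left + max_mtu_sz - 1) / max_mtu_sz;
	size_t credits = SKETCH_MIN(SKETCH_MIN(ep_credits, num_pkts),
				    max_rx_credits);

	/* prints "pkts=3 credits=3" */
	printf("pkts=%zu credits=%zu\n", num_pkts, credits);
	return 0;
}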
void rxd_init_data_pkt(struct rxd_ep *ep, struct rxd_x_entry *tx_entry,
		       struct rxd_pkt_entry *pkt_entry)
{
	struct rxd_data_pkt *data_pkt = (struct rxd_data_pkt *) (pkt_entry->pkt);
	uint32_t seg_size;

	/* Cap this segment at the remaining bytes or the max segment size. */
	seg_size = tx_entry->cq_entry.len - tx_entry->bytes_done;
	seg_size = MIN(rxd_ep_domain(ep)->max_seg_sz, seg_size);

	data_pkt->base_hdr.version = RXD_PROTOCOL_VERSION;
	data_pkt->base_hdr.type = (tx_entry->cq_entry.flags &
				  (FI_READ | FI_REMOTE_READ)) ?
				  RXD_DATA_READ : RXD_DATA;

	data_pkt->ext_hdr.rx_id = tx_entry->rx_id;
	data_pkt->ext_hdr.tx_id = tx_entry->tx_id;
	data_pkt->ext_hdr.seg_no = tx_entry->next_seg_no++;
	data_pkt->base_hdr.peer = ep->peers[tx_entry->peer].peer_addr;

	pkt_entry->pkt_size = ofi_copy_from_iov(data_pkt->msg, seg_size,
						tx_entry->iov,
						tx_entry->iov_count,
						tx_entry->bytes_done);

	pkt_entry->peer = tx_entry->peer;

	tx_entry->bytes_done += pkt_entry->pkt_size;

	pkt_entry->pkt_size += sizeof(*data_pkt) + ep->tx_prefix_size;
}
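/*
 * Standalone sketch (hypothetical values, not provider code) of the
 * segmentation pattern rxd_init_data_pkt implements: each call consumes
 * MIN(max_seg_sz, remaining) payload bytes and advances bytes_done, so
 * repeated calls walk the IOV until the transfer is fully packetized.
 */
#include <stdio.h>
#include <stddef.h>

int main(void)
{
	size_t len = 10000;		/* hypothetical cq_entry.len */
	size_t max_seg_sz = 4096;	/* hypothetical domain max_seg_sz */
	size_t bytes_done = 0;
	unsigned seg_no = 0;

	while (bytes_done < len) {
		size_t remaining = len - bytes_done;
		size_t seg = remaining < max_seg_sz ? remaining : max_seg_sz;

		printf("seg_no=%u bytes=%zu\n", seg_no++, seg);
		bytes_done += seg;
	}
	/* prints segments of 4096, 4096, 1808 */
	return 0;
}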
static ssize_t rxd_generic_write_inject(struct rxd_ep *rxd_ep,
			const struct iovec *iov, size_t iov_count,
			const struct fi_rma_iov *rma_iov, size_t rma_count,
			fi_addr_t addr, void *context, uint32_t op,
			uint64_t data, uint32_t rxd_flags)
{
	struct rxd_x_entry *tx_entry;
	fi_addr_t rxd_addr;
	ssize_t ret = -FI_EAGAIN;

	assert(iov_count <= RXD_IOV_LIMIT && rma_count <= RXD_IOV_LIMIT);
	assert(ofi_total_iov_len(iov, iov_count) <=
	       rxd_ep_domain(rxd_ep)->max_inline_rma);

	/* Take the EP lock before the CQ lock; released in reverse order. */
	fastlock_acquire(&rxd_ep->util_ep.lock);
	fastlock_acquire(&rxd_ep->util_ep.tx_cq->cq_lock);

	if (ofi_cirque_isfull(rxd_ep->util_ep.tx_cq->cirq))
		goto out;

	rxd_addr = rxd_ep_av(rxd_ep)->fi_addr_table[addr];
	ret = rxd_send_rts_if_needed(rxd_ep, rxd_addr);
	if (ret)
		goto out;

	tx_entry = rxd_tx_entry_init(rxd_ep, iov, iov_count, NULL, 0, rma_count,
				     data, 0, context, rxd_addr, op, rxd_flags);
	if (!tx_entry)
		goto out;

	ret = rxd_ep_send_op(rxd_ep, tx_entry, rma_iov, rma_count, NULL, 0, 0, 0);
	if (ret) {
		rxd_tx_entry_free(rxd_ep, tx_entry);
		goto out;
	}

	if (tx_entry->op == RXD_READ_REQ)
		goto out;

	ret = 0;
out:
	fastlock_release(&rxd_ep->util_ep.tx_cq->cq_lock);
	fastlock_release(&rxd_ep->util_ep.lock);
	return ret;
}
int rxd_ep_post_buf(struct rxd_ep *ep)
{
	struct rxd_pkt_entry *pkt_entry;
	ssize_t ret;

	pkt_entry = rxd_get_rx_pkt(ep);
	if (!pkt_entry)
		return -FI_ENOMEM;

	ret = fi_recv(ep->dg_ep, rxd_pkt_start(pkt_entry),
		      rxd_ep_domain(ep)->max_mtu_sz,
		      rxd_mr_desc(pkt_entry->mr, ep),
		      FI_ADDR_UNSPEC, &pkt_entry->context);
	if (ret) {
		ofi_buf_free(pkt_entry);
		FI_WARN(&rxd_prov, FI_LOG_EP_CTRL, "failed to repost\n");
		return ret;
	}

	ep->posted_bufs++;
	slist_insert_tail(&pkt_entry->s_entry, &ep->rx_pkt_list);
	return 0;
}
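/*
 * Hypothetical usage sketch, not provider code: callers typically prepost
 * a full set of receive buffers up front (e.g. when the endpoint is
 * enabled) and repost one buffer per completed receive.  The helper name
 * and the rx_size parameter below are illustrative assumptions only.
 */
static int rxd_ep_prepost_bufs_example(struct rxd_ep *ep, size_t rx_size)
{
	size_t i;
	int ret;

	for (i = 0; i < rx_size; i++) {
		ret = rxd_ep_post_buf(ep);
		if (ret)
			return ret;	/* stop on alloc or fi_recv failure */
	}
	return 0;
}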
static size_t rxd_avail_buf(struct rxd_ep *rxd_ep, struct rxd_base_hdr *hdr,
			    void *ptr)
{
	return rxd_ep_domain(rxd_ep)->max_inline_msg -
	       ((uintptr_t) ptr - (uintptr_t) hdr);
}
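/*
 * Standalone sketch of the pointer arithmetic in rxd_avail_buf: the space
 * left for inline data is the inline limit minus how far the write cursor
 * has advanced past the start of the header.  Values are hypothetical.
 */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	char pkt[256];			/* stand-in packet buffer */
	size_t max_inline_msg = 200;	/* hypothetical inline limit */
	char *hdr = pkt;		/* header starts the buffer */
	char *ptr = pkt + 24;		/* cursor after 24 bytes of headers */

	size_t avail = max_inline_msg - ((uintptr_t) ptr - (uintptr_t) hdr);
	printf("avail=%zu\n", avail);	/* prints "avail=176" */
	return 0;
}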
struct rxd_x_entry *rxd_tx_entry_init(struct rxd_ep *ep, const struct iovec *iov,
				      size_t iov_count, const struct iovec *res_iov,
				      size_t res_count, size_t rma_count,
				      uint64_t data, uint64_t tag, void *context,
				      fi_addr_t addr, uint32_t op, uint32_t flags)
{
	struct rxd_x_entry *tx_entry;
	struct rxd_domain *rxd_domain = rxd_ep_domain(ep);
	size_t max_inline;

	tx_entry = rxd_get_tx_entry(ep, op);
	if (!tx_entry) {
		FI_WARN(&rxd_prov, FI_LOG_EP_CTRL, "could not get tx entry\n");
		return NULL;
	}

	tx_entry->op = op;
	tx_entry->peer = addr;
	tx_entry->flags = flags;
	tx_entry->bytes_done = 0;
	tx_entry->offset = 0;
	tx_entry->next_seg_no = 0;
	tx_entry->iov_count = iov_count;
	memcpy(&tx_entry->iov[0], iov, sizeof(*iov) * iov_count);
	if (res_count) {
		tx_entry->res_count = res_count;
		memcpy(&tx_entry->res_iov[0], res_iov, sizeof(*res_iov) * res_count);
	}

	tx_entry->cq_entry.op_context = context;
	tx_entry->cq_entry.len = ofi_total_iov_len(iov, iov_count);
	tx_entry->cq_entry.buf = iov[0].iov_base;
	tx_entry->cq_entry.flags = ofi_tx_cq_flags(op);
	tx_entry->cq_entry.tag = tag;
	tx_entry->pkt = NULL;

	/* Compute how much payload can ride in the first packet after
	 * subtracting the optional headers the operation requires. */
	max_inline = rxd_domain->max_inline_msg;
	if (tx_entry->cq_entry.flags & FI_RMA)
		max_inline -= sizeof(struct ofi_rma_iov) * rma_count;
	if (tx_entry->flags & RXD_TAG_HDR)
		max_inline -= sizeof(tx_entry->cq_entry.tag);
	if (tx_entry->flags & RXD_REMOTE_CQ_DATA) {
		max_inline -= sizeof(tx_entry->cq_entry.data);
		tx_entry->cq_entry.data = data;
	}

	if (rma_count > 1 || tx_entry->cq_entry.flags & FI_READ ||
	    tx_entry->cq_entry.len > max_inline)
		max_inline -= sizeof(struct rxd_sar_hdr);
	else
		tx_entry->flags |= RXD_INLINE;

	if (tx_entry->cq_entry.flags & FI_ATOMIC ||
	    tx_entry->cq_entry.len <= max_inline)
		tx_entry->num_segs = 1;
	else if (tx_entry->cq_entry.flags & FI_READ)
		tx_entry->num_segs = ofi_div_ceil(tx_entry->cq_entry.len,
						  rxd_domain->max_seg_sz);
	else
		tx_entry->num_segs = ofi_div_ceil(tx_entry->cq_entry.len -
						  max_inline,
						  rxd_domain->max_seg_sz) + 1;

	if ((tx_entry->op == RXD_READ_REQ || tx_entry->op == RXD_ATOMIC_FETCH ||
	     tx_entry->op == RXD_ATOMIC_COMPARE) &&
	    ep->peers[tx_entry->peer].unacked_cnt <
	    ep->peers[tx_entry->peer].tx_window &&
	    ep->peers[tx_entry->peer].peer_addr != FI_ADDR_UNSPEC)
		dlist_insert_tail(&tx_entry->entry,
				  &ep->peers[tx_entry->peer].rma_rx_list);
	else
		dlist_insert_tail(&tx_entry->entry,
				  &ep->peers[tx_entry->peer].tx_list);

	return tx_entry;
}
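/*
 * Standalone sketch (hypothetical sizes) of the segment-count logic in
 * rxd_tx_entry_init: if the payload fits in the first packet's inline
 * space, one segment suffices; otherwise the first packet carries
 * max_inline bytes and the rest is ceil-divided into max_seg_sz segments.
 */
#include <stdio.h>
#include <stddef.h>

static size_t div_ceil(size_t a, size_t b)
{
	return (a + b - 1) / b;
}

int main(void)
{
	size_t max_inline = 160;	/* hypothetical post-header inline space */
	size_t max_seg_sz = 4096;	/* hypothetical segment payload size */
	size_t len = 10000;		/* hypothetical transfer length */

	size_t num_segs = (len <= max_inline) ? 1 :
			  div_ceil(len - max_inline, max_seg_sz) + 1;

	printf("num_segs=%zu\n", num_segs);	/* prints "num_segs=4" */
	return 0;
}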
int rxd_ep_init_res(struct rxd_ep *ep, struct fi_info *fi_info)
{
	struct rxd_domain *rxd_domain = rxd_ep_domain(ep);
	struct ofi_bufpool_attr entry_pool_attr = {
		.size		= sizeof(struct rxd_x_entry),
		.alignment	= RXD_BUF_POOL_ALIGNMENT,
		.max_cnt	= 0,
		.flags		= OFI_BUFPOOL_INDEXED,
	};
	int ret;

	ret = ofi_bufpool_create_ex(&ep->tx_pkt_pool,
				    rxd_domain->max_mtu_sz +
				    sizeof(struct rxd_pkt_entry),
				    RXD_BUF_POOL_ALIGNMENT, 0,
				    RXD_TX_POOL_CHUNK_CNT,
				    ep->do_local_mr ?
				    rxd_buf_region_alloc_fn : NULL,
				    ep->do_local_mr ?
				    rxd_buf_region_free_fn : NULL,
				    rxd_domain);
	if (ret)
		return ret;

	ret = ofi_bufpool_create_ex(&ep->rx_pkt_pool,
				    rxd_domain->max_mtu_sz +
				    sizeof(struct rxd_pkt_entry),
				    RXD_BUF_POOL_ALIGNMENT, 0,
				    RXD_RX_POOL_CHUNK_CNT,
				    ep->do_local_mr ?
				    rxd_buf_region_alloc_fn : NULL,
				    ep->do_local_mr ?
				    rxd_buf_region_free_fn : NULL,
				    rxd_domain);
	if (ret)
		goto err;

	entry_pool_attr.flags |= OFI_BUFPOOL_NO_TRACK;
	entry_pool_attr.chunk_cnt = ep->tx_size;
	ret = ofi_bufpool_create_attr(&entry_pool_attr, &ep->tx_entry_pool);
	if (ret)
		goto err;

	entry_pool_attr.chunk_cnt = ep->rx_size;
	ret = ofi_bufpool_create_attr(&entry_pool_attr, &ep->rx_entry_pool);
	if (ret)
		goto err;

	dlist_init(&ep->rx_list);
	dlist_init(&ep->rx_tag_list);
	dlist_init(&ep->active_peers);
	dlist_init(&ep->rts_sent_list);
	dlist_init(&ep->unexp_list);
	dlist_init(&ep->unexp_tag_list);
	dlist_init(&ep->ctrl_pkts);
	slist_init(&ep->rx_pkt_list);

	return 0;
err:
	if (ep->tx_pkt_pool)
		ofi_bufpool_destroy(ep->tx_pkt_pool);
	if (ep->rx_pkt_pool)
		ofi_bufpool_destroy(ep->rx_pkt_pool);
	if (ep->tx_entry_pool)
		ofi_bufpool_destroy(ep->tx_entry_pool);
	if (ep->rx_entry_pool)
		ofi_bufpool_destroy(ep->rx_entry_pool);
	return ret;
}

static void rxd_init_peer(struct rxd_ep *ep, uint64_t rxd_addr)
{
	ep->peers[rxd_addr].peer_addr = FI_ADDR_UNSPEC;
	ep->peers[rxd_addr].tx_seq_no = 0;
	ep->peers[rxd_addr].rx_seq_no = 0;
	ep->peers[rxd_addr].last_rx_ack = 0;
	ep->peers[rxd_addr].last_tx_ack = 0;
	ep->peers[rxd_addr].rx_window = rxd_env.max_unacked;
	ep->peers[rxd_addr].tx_window = rxd_env.max_unacked;
	ep->peers[rxd_addr].unacked_cnt = 0;
	ep->peers[rxd_addr].retry_cnt = 0;
	ep->peers[rxd_addr].active = 0;
	dlist_init(&ep->peers[rxd_addr].unacked);
	dlist_init(&ep->peers[rxd_addr].tx_list);
	dlist_init(&ep->peers[rxd_addr].rx_list);
	dlist_init(&ep->peers[rxd_addr].rma_rx_list);
	dlist_init(&ep->peers[rxd_addr].buf_pkts);
}
int rxd_process_start_data(struct rxd_ep *ep, struct rxd_rx_entry *rx_entry,
			   struct rxd_peer *peer, struct ofi_ctrl_hdr *ctrl,
			   struct fi_cq_msg_entry *comp,
			   struct rxd_rx_buf *rx_buf)
{
	uint64_t idx;
	int i, offset, ret;
	struct ofi_rma_iov *rma_iov;
	struct rxd_pkt_data_start *pkt_start;
	struct rxd_tx_entry *tx_entry;

	pkt_start = (struct rxd_pkt_data_start *) ctrl;
	switch (rx_entry->op_hdr.op) {
	case ofi_op_msg:
		rx_entry->recv = rxd_get_recv_entry(ep, rx_entry);
		if (!rx_entry->recv) {
			if (ep->num_unexp_msg < RXD_EP_MAX_UNEXP_MSG) {
				dlist_insert_tail(&rx_entry->unexp_entry,
						  &ep->unexp_msg_list);
				rx_entry->unexp_buf = rx_buf;
				ep->num_unexp_msg++;
				return -FI_ENOENT;
			} else {
				FI_WARN(&rxd_prov, FI_LOG_EP_CTRL,
					"dropping msg\n");
				return -FI_ENOMEM;
			}
		}
		rxd_ep_handle_data_msg(ep, peer, rx_entry, rx_entry->recv->iov,
				       rx_entry->recv->msg.iov_count, ctrl,
				       pkt_start->data, rx_buf);
		break;
	case ofi_op_tagged:
		rx_entry->trecv = rxd_get_trecv_entry(ep, rx_entry);
		if (!rx_entry->trecv) {
			if (ep->num_unexp_msg < RXD_EP_MAX_UNEXP_MSG) {
				dlist_insert_tail(&rx_entry->unexp_entry,
						  &ep->unexp_tag_list);
				rx_entry->unexp_buf = rx_buf;
				ep->num_unexp_msg++;
				return -FI_ENOENT;
			} else {
				FI_WARN(&rxd_prov, FI_LOG_EP_CTRL,
					"dropping msg\n");
				return -FI_ENOMEM;
			}
		}
		rxd_ep_handle_data_msg(ep, peer, rx_entry, rx_entry->trecv->iov,
				       rx_entry->trecv->msg.iov_count, ctrl,
				       pkt_start->data, rx_buf);
		break;
	case ofi_op_write:
		rma_iov = (struct ofi_rma_iov *) pkt_start->data;
		for (i = 0; i < rx_entry->op_hdr.iov_count; i++) {
			ret = rxd_mr_verify(rxd_ep_domain(ep),
					    rma_iov[i].len,
					    (uintptr_t *) &rma_iov[i].addr,
					    rma_iov[i].key, FI_REMOTE_WRITE);
			if (ret) {
				/* todo: handle invalid key case */
				FI_WARN(&rxd_prov, FI_LOG_EP_CTRL,
					"invalid key/access permissions\n");
				return -FI_EACCES;
			}
			rx_entry->write.iov[i].iov_base =
				(void *) (uintptr_t) rma_iov[i].addr;
			rx_entry->write.iov[i].iov_len = rma_iov[i].len;
		}
		offset = sizeof(struct ofi_rma_iov) * rx_entry->op_hdr.iov_count;
		ctrl->seg_size -= offset;
		rxd_ep_handle_data_msg(ep, peer, rx_entry, rx_entry->write.iov,
				       rx_entry->op_hdr.iov_count, ctrl,
				       pkt_start->data + offset, rx_buf);
		break;
	case ofi_op_read_req:
		rma_iov = (struct ofi_rma_iov *) pkt_start->data;
		tx_entry = rxd_tx_entry_alloc(ep, peer, rx_entry->peer, 0,
					      RXD_TX_READ_RSP);
		if (!tx_entry) {
			FI_WARN(&rxd_prov, FI_LOG_EP_CTRL,
				"no free tx-entry\n");
			return -FI_ENOMEM;
		}
		tx_entry->peer = rx_entry->peer;
		tx_entry->read_rsp.iov_count = rx_entry->op_hdr.iov_count;
		for (i = 0; i < rx_entry->op_hdr.iov_count; i++) {
			ret = rxd_mr_verify(rxd_ep_domain(ep),
					    rma_iov[i].len,
					    (uintptr_t *) &rma_iov[i].addr,
					    rma_iov[i].key, FI_REMOTE_READ);
			if (ret) {
				/* todo: handle invalid key case */
				FI_WARN(&rxd_prov, FI_LOG_EP_CTRL,
					"invalid key/access permissions\n");
				return -FI_EACCES;
			}
			tx_entry->read_rsp.src_iov[i].iov_base =
				(void *) (uintptr_t) rma_iov[i].addr;
			tx_entry->read_rsp.src_iov[i].iov_len = rma_iov[i].len;
		}
		tx_entry->read_rsp.peer_msg_id = ctrl->msg_id;
		ret = rxd_ep_start_xfer(ep, peer, ofi_op_read_rsp, tx_entry);
		if (ret)
			rxd_tx_entry_free(ep, tx_entry);
		rxd_rx_entry_free(ep, rx_entry);
		break;
	case ofi_op_read_rsp:
		idx = rx_entry->op_hdr.remote_idx & RXD_TX_IDX_BITS;
		tx_entry = &ep->tx_entry_fs->buf[idx];
		if (tx_entry->msg_id != rx_entry->op_hdr.remote_idx)
			return -FI_ENOMEM;
		rx_entry->read_rsp.tx_entry = tx_entry;
		rxd_ep_handle_data_msg(ep, peer, rx_entry,
				       tx_entry->read_req.dst_iov,
				       tx_entry->read_req.msg.iov_count,
				       ctrl, pkt_start->data, rx_buf);
		break;
	case ofi_op_atomic:
	default:
		FI_WARN(&rxd_prov, FI_LOG_EP_CTRL, "invalid op type\n");
		return -FI_EINVAL;
	}
	return 0;
}