int wait_receive_data() {
    /* Wait for receive completion */
    err = ibv_get_cq_event(comp_chan, &evt_cq, &cq_context);
    if (err)
        return 1;

    ibv_ack_cq_events(evt_cq, 1);

    err = ibv_req_notify_cq(cq, 0);
    if (err)
        return 1;

    n = ibv_poll_cq(cq, 1, &wc);
    if (n <= 0)
        return 1;

    if (wc.status != IBV_WC_SUCCESS)
        return 1;

    return 0;
}
void * poll_cq(void *ctx)
{
    struct ibv_cq *cq;
    struct ibv_wc wc;

    while (1) {
        TEST_NZ(ibv_get_cq_event(s_ctx->comp_channel, &cq, &ctx));
        ibv_ack_cq_events(cq, 1);
        TEST_NZ(ibv_req_notify_cq(cq, 0));

        while (ibv_poll_cq(cq, 1, &wc))
            on_completion(&wc);
    }

    return NULL;
}
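The poller above relies on two names defined elsewhere in its project: a global `s_ctx` holding the completion channel and a `TEST_NZ` macro that aborts on a non-zero return. A minimal sketch of how such a poller is usually wired up; the `die()` helper, the `TEST_NZ` definition, and the `pthread_create` launch are assumptions modeled on common tutorial code, not part of the snippet:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

/* Abort-on-nonzero helper, as commonly defined next to poll_cq() (assumed). */
#define TEST_NZ(x) do { if ((x)) die("error: " #x " failed"); } while (0)

static void die(const char *reason)
{
    fprintf(stderr, "%s\n", reason);
    exit(EXIT_FAILURE);
}

void *poll_cq(void *ctx);   /* the poller shown above */

/* Start the poller on its own thread; completions are then handled
 * asynchronously by on_completion() while the main thread posts work. */
static void start_poller(void)
{
    pthread_t poller;
    TEST_NZ(pthread_create(&poller, NULL, poll_cq, NULL));
}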
static int get_thread_wc(struct thread_context_t *t_ctx, struct ibv_wc *wc, int is_send)
{
    struct ibv_cq *cq;
    struct ibv_comp_channel *comp_channel;
    struct rdma_resource_t *rdma_resource;
    struct user_param_t *user_param;
    void *ectx;
    int rc = 0;

    rdma_resource = t_ctx->rdma_resource;
    user_param = &(rdma_resource->user_param);

    if (is_send) {
        cq           = t_ctx->send_cq;
        comp_channel = t_ctx->send_comp_channel;
    } else {
        cq           = t_ctx->recv_cq;
        comp_channel = t_ctx->recv_comp_channel;
    }

    if (user_param->use_event) {
        rc = ibv_get_cq_event(comp_channel, &cq, &ectx);
        if (rc != 0) {
            ERROR("Failed to do ibv_get_cq_event.\n");
            return 1;
        }

        ibv_ack_cq_events(cq, 1);

        rc = ibv_req_notify_cq(cq, 0);
        if (rc != 0) {
            ERROR("Failed to do ibv_req_notify_cq.\n");
            return 1;
        }
    }

    do {
        rc = ibv_poll_cq(cq, 1, wc);
        if (rc < 0) {
            ERROR("Failed to poll CQ.\n");
            return 1;
        }
    } while (!user_param->use_event && (rc == 0)); /* TODO: needs a timeout */

    return 0;
}
static int rdma_poll_cq(RdmaDeviceResources *rdma_dev_res, struct ibv_cq *ibcq)
{
    int i, ne, total_ne = 0;
    BackendCtx *bctx;
    struct ibv_wc wc[2];
    RdmaProtectedGSList *cqe_ctx_list;

    qemu_mutex_lock(&rdma_dev_res->lock);
    do {
        ne = ibv_poll_cq(ibcq, ARRAY_SIZE(wc), wc);

        trace_rdma_poll_cq(ne, ibcq);

        for (i = 0; i < ne; i++) {
            bctx = rdma_rm_get_cqe_ctx(rdma_dev_res, wc[i].wr_id);
            if (unlikely(!bctx)) {
                rdma_error_report("No matching ctx for req %"PRId64,
                                  wc[i].wr_id);
                continue;
            }

            comp_handler(bctx->up_ctx, &wc[i]);

            if (bctx->backend_qp) {
                cqe_ctx_list = &bctx->backend_qp->cqe_ctx_list;
            } else {
                cqe_ctx_list = &bctx->backend_srq->cqe_ctx_list;
            }

            rdma_protected_gslist_remove_int32(cqe_ctx_list, wc[i].wr_id);
            rdma_rm_dealloc_cqe_ctx(rdma_dev_res, wc[i].wr_id);
            g_free(bctx);
        }
        total_ne += ne;
    } while (ne > 0);
    atomic_sub(&rdma_dev_res->stats.missing_cqe, total_ne);
    qemu_mutex_unlock(&rdma_dev_res->lock);

    if (ne < 0) {
        rdma_error_report("ibv_poll_cq fail, rc=%d, errno=%d", ne, errno);
    }

    rdma_dev_res->stats.completions += total_ne;

    return total_ne;
}
/**
 * Polling for events on an inner thread allows management messages such as
 * buffer connections to be processed immediately, even if the user is not
 * polling. Otherwise buffer constructors would block indefinitely.
 *
 * Deep learning workloads are about sending small numbers of large messages,
 * and this model works well for that case. If the library were used to
 * exchange large numbers of short messages, it would be useful to split
 * management and data messages over two different queue pairs. User threads
 * could then wait or poll on the data queue pair directly.
 */
void RDMAAdapter::InternalThreadEntry() {
  while (!must_stop()) {
    ibv_cq* cq;
    void* cq_context;
    CHECK(!ibv_get_cq_event(channel_, &cq, &cq_context));
    CHECK(cq == cq_);
    ibv_ack_cq_events(cq, 1);
    CHECK(!ibv_req_notify_cq(cq_, 0));

    int ne = ibv_poll_cq(cq_, MAX_CONCURRENT_WRITES * 2,
                         static_cast<ibv_wc*>(wc_));
    CHECK_GE(ne, 0);

    for (int i = 0; i < ne; ++i) {
      CHECK(wc_[i].status == IBV_WC_SUCCESS)
          << "Failed status \n"
          << ibv_wc_status_str(wc_[i].status) << " "
          << wc_[i].status << " "
          << static_cast<int>(wc_[i].wr_id) << " "
          << wc_[i].vendor_err;

      if (wc_[i].opcode == IBV_WC_RECV_RDMA_WITH_IMM) {
        // Data message, add it to user received queue
        RDMAChannel* channel = reinterpret_cast<RDMAChannel*>(wc_[i].wr_id);
        channel->recv();
        int id = wc_[i].imm_data;
        if (id >= CTRL_ID_OFFSET) {
          // ctrl signal
          ctrl_received_.push(channel->buffers_[id - CTRL_ID_OFFSET]);
        } else {
          // data
          received_.push(channel->buffers_[id]);
        }
      } else {
        if (wc_[i].opcode & IBV_WC_RECV) {
          // Buffer connection message
          RDMAChannel* channel = reinterpret_cast<RDMAChannel*>(wc_[i].wr_id);
          int id = wc_[i].imm_data;
          channel->memory_regions_queue_.push(channel->memory_regions_[id]);
          CHECK(id == channel->memory_regions_received_++);
          CHECK(!ibv_dereg_mr(channel->region_regions_[id]));
        }
      }
    }
  }
}
static int send_qp_num_for_ah(struct pingpong_context *ctx,
                              struct perftest_parameters *user_param)
{
    struct ibv_send_wr wr;
    struct ibv_send_wr *bad_wr;
    struct ibv_sge list;
    struct ibv_wc wc;
    int ne;

    memcpy(ctx->buf, &ctx->qp[0]->qp_num, sizeof(uint32_t));

    list.addr   = (uintptr_t)ctx->buf;
    list.length = sizeof(uint32_t);
    list.lkey   = ctx->mr->lkey;

    wr.wr_id      = 0;
    wr.sg_list    = &list;
    wr.num_sge    = 1;
    wr.opcode     = IBV_WR_SEND_WITH_IMM;
    wr.send_flags = IBV_SEND_SIGNALED;
    wr.next       = NULL;
    wr.imm_data   = htonl(ctx->qp[0]->qp_num);

    wr.wr.ud.ah          = ctx->ah[0];
    wr.wr.ud.remote_qpn  = user_param->rem_ud_qpn;
    wr.wr.ud.remote_qkey = user_param->rem_ud_qkey;

    if (ibv_post_send(ctx->qp[0], &wr, &bad_wr)) {
        fprintf(stderr, "Function ibv_post_send failed\n");
        return 1;
    }

    do {
        ne = ibv_poll_cq(ctx->send_cq, 1, &wc);
    } while (ne == 0);

    if (wc.status || wc.opcode != IBV_WC_SEND || wc.wr_id != 0) {
        fprintf(stderr, " Couldn't post send my QP number %d\n", (int)wc.status);
        return 1;
    }

    return 0;
}
inline void cfio_rdma_client_wait(void *ctx)
{
    struct ibv_cq *cq;
    struct ibv_wc wc;

    while (request_stack_size) {
        // rdma_debug("get cq event ...");
        TEST_NZ(ibv_get_cq_event(s_ctx->comp_channel, &cq, &ctx));

        // rdma_debug("ibv_ack_cq_events...");
        ibv_ack_cq_events(cq, 1);

        TEST_NZ(ibv_req_notify_cq(cq, 0));

        while (ibv_poll_cq(cq, 1, &wc)) {
            // rdma_debug("handle cq ...");
            on_completion(&wc);
        }
    }
}
void event_handler(struct ibv_cq *cq)
{
    int ret;

    while (1) {
        /* int ibv_poll_cq(a, b, c):
         *   a: completion queue to poll
         *   b: max number of completions to return
         *   c: array of at least (b) entries of ibv_wc where these
         *      completion events will be returned.
         */
        ret = ibv_poll_cq(cq, 1, &wc);

        if (ret == 0) {
            LOGPRINTF(("Empty completion queue, requesting next notification"));
            ibv_req_notify_cq(r_cq_hndl, 0); /* ... explained in prev line.. */
            return;
        } else if (ret < 0) {
            fprintf(stderr, "Error in event_handler (polling cq)\n");
            exit(-1);
        } else if (wc.status != IBV_WC_SUCCESS) {
            fprintf(stderr, "Error in event_handler, on returned work completion "
                    "status: %d\n", wc.status);
            exit(-1);
        }

        LOGPRINTF(("Retrieved work completion"));

        /* For ping-pong mode at least, this check shouldn't be needed for
         * normal operation, but it will help catch any bugs with multiple
         * sends coming through when we're only expecting one.
         */
        if (receive_complete == 1) {
            while (receive_complete != 0)
                sched_yield();
        }

        receive_complete = 1;
    }
}
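One detail worth noting about the handler above: a completion that arrives between the empty `ibv_poll_cq` and the `ibv_req_notify_cq` call does not trigger a new channel event, so the usual pattern (shown in the libibverbs `ibv_get_cq_event` man page example) is to poll once more after re-arming. A minimal sketch of that ordering; `handle_wc()` is a placeholder for the caller's own completion handler:

#include <infiniband/verbs.h>

/* Drain a CQ and re-arm it without losing a completion that may have
 * arrived between the last empty poll and the notification request. */
static int drain_and_rearm(struct ibv_cq *cq)
{
    struct ibv_wc wc;
    int n;

    while ((n = ibv_poll_cq(cq, 1, &wc)) > 0)
        handle_wc(&wc);                 /* caller-provided handler */
    if (n < 0)
        return -1;

    if (ibv_req_notify_cq(cq, 0))       /* re-arm first ...        */
        return -1;

    while ((n = ibv_poll_cq(cq, 1, &wc)) > 0)
        handle_wc(&wc);                 /* ... then poll once more */

    return (n < 0) ? -1 : 0;
}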
static int poll_cqs(enum CQ_INDEX index)
{
    struct ibv_wc wc[8];
    int done, i, ret;

    for (i = 0; i < connections; i++) {
        if (!test.nodes[i].connected)
            continue;

        for (done = 0; done < message_count; done += ret) {
            ret = ibv_poll_cq(test.nodes[i].cq[index], 8, wc);
            if (ret < 0) {
                printf("cmatose: failed polling CQ: %d\n", ret);
                return ret;
            }
        }
    }
    return 0;
}
void * poll_cq(void *ctx)
{
    struct ibv_cq *cq;
    struct ibv_wc wc;
    IbvConnection *conn = (IbvConnection *)ctx;

    while (1) {
        TEST_NZ(ibv_get_cq_event(conn->comp_channel, &cq, &ctx));
        ibv_ack_cq_events(cq, 1);
        TEST_NZ(ibv_req_notify_cq(cq, 0));

        while (ibv_poll_cq(cq, 1, &wc)) {
            (OnCompletionHandler)(&wc);
        }
    }

    return NULL;
}
/*-----------------------------------------------------------------------------------*/
static err_t ibvif_thread(struct netif *netif)
{
    struct ibvif *ibvif;
    int ne, i;
    struct ibv_wc wc[PBUF_READ_DEPTH];

    ibvif = (struct ibvif *)netif->state;

    ne = ibv_poll_cq(ibvif->send_cq, PBUF_READ_DEPTH, wc);
    for (i = 0; i < ne; i++) {
        if (wc[i].status != IBV_WC_SUCCESS) {
            perror("tapif: write 2");
        }
    }

    /* Wait for a packet to arrive. */
    low_level_input(netif);
}
static UCS_F_ALWAYS_INLINE ucs_status_t
uct_ud_verbs_iface_poll_rx(uct_ud_verbs_iface_t *iface)
{
    uct_ib_iface_recv_desc_t *desc;
    struct ibv_wc wc[UCT_IB_MAX_WC];
    int i, ret;
    char *packet;
    ucs_status_t status;

    ret = ibv_poll_cq(iface->super.super.recv_cq, UCT_IB_MAX_WC, wc);
    if (ret == 0) {
        status = UCS_ERR_NO_PROGRESS;
        goto out;
    }
    if (ucs_unlikely(ret < 0)) {
        ucs_fatal("Failed to poll receive CQ");
    }

    for (i = 0; i < ret; ++i) {
        if (ucs_unlikely(wc[i].status != IBV_WC_SUCCESS)) {
            ucs_fatal("Receive completion with error: %s",
                      ibv_wc_status_str(wc[i].status));
        }

        desc = (void*)wc[i].wr_id;
        ucs_trace_data("pkt rcvd: buf=%p len=%d", desc, wc[i].byte_len);
        packet = uct_ib_iface_recv_desc_hdr(&iface->super.super, desc);
        VALGRIND_MAKE_MEM_DEFINED(packet, wc[i].byte_len);

        uct_ud_ep_process_rx(&iface->super,
                             (uct_ud_neth_t *)(packet + UCT_IB_GRH_LEN),
                             wc[i].byte_len - UCT_IB_GRH_LEN,
                             (uct_ud_recv_skb_t *)desc);
    }
    iface->super.rx.available += ret;
    status = UCS_OK;

out:
    uct_ud_verbs_iface_post_recv(iface);
    return status;
}
static int __xfer_rdma_poll_cq(struct xfer_context *ctx, struct ibv_wc *ret_wc, int sleep)
{
    //void *ctx_ptr;
    int ne;

    /*
    if (ibv_get_cq_event(ctx->ch, &ctx->cq, &ctx_ptr)) {
        fprintf(stderr, "Failed to get cq_event\n");
        return -1;
    }

    ibv_ack_cq_events(ctx->cq, 1);

    if (ibv_req_notify_cq(ctx->cq, 0)) {
        fprintf(stderr, "Couldn't request CQ notification\n");
        return -1;
    }
    */

    do {
        ne = ibv_poll_cq(ctx->cq, 1, ret_wc);
        if (ne < 0) {
            fprintf(stderr, "Failed to poll completions from the CQ\n");
            return -1;
        }
        if (!ne && sleep)
            usleep(100);
    } while (ne == 0);

    //printf("got events: %d, opcode: %d\n", ne, ret_wc->opcode);

    if (ret_wc->status != IBV_WC_SUCCESS) {
        fprintf(stderr, "Completion with status 0x%x was found\n", ret_wc->status);
        return -1;
    }

    return 0;
}
void mvdev_flush_qp(mv_qp_pool_entry *rqp, int num_to_flush)
{
    struct ibv_qp_attr qp_attr;
    struct ibv_wc wc;
    int ne;

    memset(&qp_attr, 0, sizeof(qp_attr));
    qp_attr.qp_state = IBV_QPS_ERR;

    /* need to transition to the error state so we can flush
     * all the posted buffers */
    if (ibv_modify_qp(rqp->ud_qp, &qp_attr, IBV_QP_STATE)) {
        error_abort_all(IBV_RETURN_ERR, "Error changing to the err state\n");
    }

    /* pull failed completions */
    {
        int total_pulled = 0;
        do {
            ne = ibv_poll_cq(rqp->ud_cq, 1, &wc);
            total_pulled += ne;
        } while (total_pulled < num_to_flush);
    }

    {
        struct ibv_qp_attr attr;
        memset(&attr, 0, sizeof(struct ibv_qp_attr));
        attr.qp_state = IBV_QPS_RESET;

        if (ibv_modify_qp(rqp->ud_qp, &attr, IBV_QP_STATE)) {
            error_abort_all(IBV_RETURN_ERR, "Failed to modify QP to RESET");
        }
    }

    /* now we need to re-transition it back to the RTS phase */
    MV_Transition_UD_QP(&mvdev.rndv_si, rqp->ud_qp);
}
static UCS_F_ALWAYS_INLINE void
uct_rc_verbs_iface_poll_tx(uct_rc_verbs_iface_t *iface)
{
    struct ibv_wc wc[UCT_IB_MAX_WC];
    uct_rc_verbs_ep_t *ep;
    uct_rc_iface_send_op_t *op;
    unsigned count;
    uint16_t sn;
    int i, ret;

    ret = ibv_poll_cq(iface->super.super.send_cq, UCT_IB_MAX_WC, wc);
    if (ucs_unlikely(ret <= 0)) {
        if (ucs_unlikely(ret < 0)) {
            ucs_fatal("Failed to poll send CQ");
        }
        return;
    }

    for (i = 0; i < ret; ++i) {
        if (ucs_unlikely(wc[i].status != IBV_WC_SUCCESS)) {
            ucs_fatal("Send completion with error: %s",
                      ibv_wc_status_str(wc[i].status));
        }

        UCS_STATS_UPDATE_COUNTER(iface->super.stats,
                                 UCT_RC_IFACE_STAT_TX_COMPLETION, 1);

        ep = ucs_derived_of(uct_rc_iface_lookup_ep(&iface->super, wc[i].qp_num),
                            uct_rc_verbs_ep_t);
        ucs_assert(ep != NULL);

        count = wc[i].wr_id + 1; /* Number of sends with WC completes in batch */
        ep->super.available     += count;
        ep->tx.completion_count += count;
        ++iface->super.tx.cq_available;

        sn = ep->tx.completion_count;
        ucs_queue_for_each_extract(op, &ep->super.outstanding, queue,
                                   UCS_CIRCULAR_COMPARE16(op->sn, <=, sn)) {
            op->handler(op);
        }
    }
}
static int poll_cqs(void)
{
    struct ibv_wc wc[8];
    int done, i, ret;

    for (i = 0; i < connections; i++) {
        if (!test.nodes[i].connected)
            continue;

        for (done = 0; done < message_count; done += ret) {
            ret = ibv_poll_cq(test.nodes[i].cq, 8, wc);
            if (ret < 0) {
                printf("udaddy: failed polling CQ: %d\n", ret);
                return ret;
            }

            if (ret && !test.nodes[i].ah)
                create_reply_ah(&test.nodes[i], wc);
        }
    }
    return 0;
}
void RDMABuffer::Write(bool data) {
  struct ibv_sge list;
  list.addr = (uint64_t) addr_;
  list.length = size_;
  list.lkey = self_->lkey;

  struct ibv_send_wr wr;
  caffe_memset(sizeof(wr), 0, &wr);
  wr.wr_id = (uint64_t) this;
  wr.sg_list = &list;
  wr.num_sge = 1;
  wr.opcode = IBV_WR_RDMA_WRITE_WITH_IMM;
  wr.send_flags = IBV_SEND_SIGNALED;

  wr.imm_data = id_;
  if (!data) {
    // ctrl signal
    wr.imm_data += CTRL_ID_OFFSET;
  }

  wr.wr.rdma.remote_addr = (uint64_t) peer_->addr;
  wr.wr.rdma.rkey = peer_->rkey;

  struct ibv_send_wr* bad_wr;
  // lock the channel since there may be multiple threads calling write()
  boost::mutex::scoped_lock lock(channel_->mutex_);
  CHECK(!ibv_post_send(channel_->qp_, &wr, &bad_wr)) << "Failed to post send";

  // TODO poll only every N writes to improve performance
  for (;;) {
    ibv_wc wc;
    int ne = ibv_poll_cq(channel_->write_cq_, 1, &wc);
    CHECK_GE(ne, 0);
    if (ne) {
      CHECK(wc.wr_id == (uint64_t) this)
          << "Oops. Polled a work completion that belongs to a different buffer";
      break;
    }
  }
}
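The TODO above refers to a standard verbs optimization, send-completion moderation: only request a signaled completion every N posts, then reap one completion per signaled post. A generic sketch of that idea, not Caffe's code; SIGNAL_BATCH, pending, and post_write_moderated are illustrative names, and the QP's max_send_wr is assumed to be at least SIGNAL_BATCH:

#include <infiniband/verbs.h>

#define SIGNAL_BATCH 32

static unsigned pending;   /* posts since the last signaled WR */

static int post_write_moderated(struct ibv_qp *qp, struct ibv_cq *cq,
                                struct ibv_send_wr *wr)
{
    struct ibv_send_wr *bad_wr;
    struct ibv_wc wc;
    int n;

    /* Signal only every SIGNAL_BATCH-th post. */
    wr->send_flags &= ~IBV_SEND_SIGNALED;
    if (++pending == SIGNAL_BATCH) {
        wr->send_flags |= IBV_SEND_SIGNALED;
        pending = 0;
    }

    if (ibv_post_send(qp, wr, &bad_wr))
        return -1;

    /* Reap the completion of the signaled post so the send queue drains;
     * before teardown the caller must issue one final signaled post to
     * retire any remaining unsignaled work requests. */
    if (wr->send_flags & IBV_SEND_SIGNALED) {
        do {
            n = ibv_poll_cq(cq, 1, &wc);
        } while (n == 0);
        if (n < 0 || wc.status != IBV_WC_SUCCESS)
            return -1;
    }
    return 0;
}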
static inline void uct_ud_verbs_iface_poll_tx(uct_ud_verbs_iface_t *iface)
{
    struct ibv_wc wc;
    int ret;

    ret = ibv_poll_cq(iface->super.super.send_cq, 1, &wc);
    if (ucs_unlikely(ret < 0)) {
        ucs_fatal("Failed to poll send CQ");
        return;
    }

    if (ret == 0) {
        return;
    }

    if (ucs_unlikely(wc.status != IBV_WC_SUCCESS)) {
        ucs_fatal("Send completion (wr_id=0x%0X) with error: %s",
                  (unsigned)wc.wr_id, ibv_wc_status_str(wc.status));
        return;
    }

    iface->super.tx.available += UCT_UD_TX_MODERATION + 1;
}
static int rdma_read_keys(struct pingpong_dest *rem_dest,
                          struct perftest_comm *comm)
{
#ifdef HAVE_ENDIAN
    struct pingpong_dest a_rem_dest;
#endif
    struct ibv_wc wc;
    int ne;

    do {
        ne = ibv_poll_cq(comm->rdma_ctx->recv_cq, 1, &wc);
    } while (ne == 0);

    if (wc.status || !(wc.opcode & IBV_WC_RECV) || wc.wr_id != SYNC_SPEC_ID) {
        fprintf(stderr, "Bad wc status -- %d -- %d \n", (int)wc.status, (int)wc.wr_id);
        return 1;
    }

#ifdef HAVE_ENDIAN
    memcpy(&a_rem_dest, comm->rdma_ctx->buf, sizeof(struct pingpong_dest));
    rem_dest->lid       = ntohl(a_rem_dest.lid);
    rem_dest->out_reads = ntohl(a_rem_dest.out_reads);
    rem_dest->qpn       = ntohl(a_rem_dest.qpn);
    rem_dest->psn       = ntohl(a_rem_dest.psn);
    rem_dest->rkey      = ntohl(a_rem_dest.rkey);
    rem_dest->vaddr     = be64toh(a_rem_dest.vaddr);
    memcpy(rem_dest->gid.raw, &(a_rem_dest.gid), 16*sizeof(uint8_t));
#else
    /* copy into the struct itself, not into the pointer variable */
    memcpy(rem_dest, comm->rdma_ctx->buf, sizeof(struct pingpong_dest));
#endif

    if (post_one_recv_wqe(comm->rdma_ctx)) {
        fprintf(stderr, "Couldn't post recv\n");
        return 1;
    }

    return 0;
}
void RDMAWriteSocket::send_close() {
    Buffer send_buf = this->rsock->get_send_buf();

    // clear send cq
    struct ibv_wc wc[PACKET_WINDOW_SIZE];
    //this->rsock->poll_send_cq(PACKET_WINDOW_SIZE, wc);
    int ret = ibv_poll_cq(this->rsock->client_id->send_cq, PACKET_WINDOW_SIZE, wc);
    if (ret < 0) {
        perror("ibv_poll_cq");
        exit(1);
    }

    // send close msg
    MessageHeader header(MessageType::CLOSE, 0);
    int is_arrived = 0xffffffff;
    send_buf.write(header).write(is_arrived);
    this->rsock->post_write(send_buf, this->rka);

    // check send
    struct ibv_wc close_wc;
    this->rsock->poll_send_cq(1, &close_wc);
}
void poll_cq(void *ctx)
{
    struct ibv_cq *cq;
    struct ibv_wc wc;
    int ne;

    TEST_NZ(ibv_get_cq_event(s_ctx->comp_channel, &cq, &ctx)); //block by default
    ibv_ack_cq_events(cq, 1);
    TEST_NZ(ibv_req_notify_cq(cq, 0));

    do {
        ne = ibv_poll_cq(cq, 1, &wc);
        if (ne < 0) {
            printf("fail to poll completion from the CQ. ret = %d\n", ne);
            return;
        } else if (ne == 0) {
            continue;
        } else {
            on_completion(&wc);
        }
    } while (ne == 0);

    return;
}
static int create_ah_from_wc_recv(struct pingpong_context *ctx,
                                  struct perftest_parameters *user_param)
{
    struct ibv_qp_attr attr;
    struct ibv_qp_init_attr init_attr;
    struct ibv_wc wc;
    int ne;

    do {
        ne = ibv_poll_cq(ctx->recv_cq, 1, &wc);
    } while (ne == 0);

    if (wc.status || !(wc.opcode & IBV_WC_RECV) || wc.wr_id != 0) {
        fprintf(stderr, "Bad wc status when trying to create AH -- %d -- %d \n",
                (int)wc.status, (int)wc.wr_id);
        return 1;
    }

    ctx->ah[0] = ibv_create_ah_from_wc(ctx->pd, &wc, (struct ibv_grh*)ctx->buf,
                                       ctx->cm_id->port_num);
    user_param->rem_ud_qpn = ntohl(wc.imm_data);
    ibv_query_qp(ctx->qp[0], &attr, IBV_QP_QKEY, &init_attr);
    user_param->rem_ud_qkey = attr.qkey;

    return 0;
}
void * poll_cq(void *ctx)
{
    void *tmp_ctx;
    struct ibv_cq *tmp_cq = NULL;  /* declaration missing from the original snippet;
                                      likely a file-scope variable in its project */
    struct ibv_wc wc;
    int num_entries, nument = 1;

    while (1) {
        if (tmp_cq != NULL) {
            while ((num_entries = ibv_poll_cq(tmp_cq, nument, &wc))) {
                on_completion(&wc);
            }
        }
        if (ibv_get_cq_event(s_ctx->comp_channel, &tmp_cq, &tmp_ctx)) {
        }
        ibv_ack_cq_events(tmp_cq, 1);
        if (ibv_req_notify_cq(tmp_cq, 0) > 0) {
        }
    }

    return 0;
}
/* Proxy-in service - RX thread
 *
 *  <- Work request in (RW_imm - WR idata), remote initiated RW
 *  <- Work completion in (RW_imm - WC idata), local initiated RW
 */
void m_rcv_event(struct mcm_cq *m_cq, int *events)
{
    struct ibv_wc wc[mcm_wrc_max];
    struct ibv_cq *ib_cq;
    struct mcm_qp *m_qp;
    void *cq_ctx;
    int i, wc_cnt, ret, err = 0, notify = 0;

    ret = ibv_get_cq_event(m_cq->ib_ch, &ib_cq, (void *)&cq_ctx);
    if (ret == 0)
        ibv_ack_cq_events(m_cq->ib_cq, 1);

    wc_cnt = 0;
retry:
    if (wc_cnt >= mcm_wrc_max) {
        if (wc[0].status == 0)
            mlog(0x10, " m_cq %p processed max %d, exit\n", m_cq, wc_cnt);
        *events += 1; /* pending */
        return;
    }

    ret = ibv_poll_cq(m_cq->ib_cq, mcm_wrc_max, wc);
    if (ret <= 0) {
        if (!ret && !notify) {
            ibv_req_notify_cq(m_cq->ib_cq, 0);
            notify = 1;
            goto retry;
        }
        return;
    } else
        notify = 0;

    wc_cnt += ret;

    for (i = 0; i < ret; i++) {
        m_qp = (struct mcm_qp *)wc[i].wr_id;

        mlog(0x40, " wr_id[%d of %d] m_qp %p\n", i+1, ret, m_qp);
        mlog(0x40, " ib_wc: st %d, vn %x idata %x op %x wr_id %Lx\n",
             wc[i].status, wc[i].vendor_err, ntohl(wc[i].imm_data),
             wc[i].opcode, wc[i].wr_id);

        if (wc[i].status != IBV_WC_SUCCESS) {
            if (wc[i].status != IBV_WC_WR_FLUSH_ERR)
                mlog(0, " DTO ERR: st %d, vn %x idata %x qstate 0x%x\n",
                     wc[i].status, wc[i].vendor_err,
                     ntohl(wc[i].imm_data), m_qp->ib_qp2->state);
            continue;
        }

        if (m_qp->cm && (m_qp->cm->state == MCM_DISCONNECTED)) {
            mlog(1, " WARN: RX data on DISC m_qp %p qp1 %p qp2 %p %s\n",
                 m_qp, m_qp->ib_qp1, m_qp->ib_qp2,
                 mcm_state_str(m_qp->cm->state));
            continue;
        }

        if (wc[i].opcode == IBV_WC_RECV_RDMA_WITH_IMM) {
            struct ibv_recv_wr r_wr, *r_err;
            wrc_idata_t wrc;
            struct ibv_qp *ib_qp;

            wrc.id = WRC_ID_DATA(ntohl(wc[i].imm_data));
            wrc.type = WRC_TYPE_DATA(ntohl(wc[i].imm_data));
            wrc.flags = WRC_FLAGS_DATA(ntohl(wc[i].imm_data));

            /* process WR or WC */
            m_pi_rcv_event(m_qp, &wrc);

            /* re-post message */
            r_wr.next = NULL;
            r_wr.sg_list = NULL;
            r_wr.num_sge = 0;
            r_wr.wr_id = (uint64_t)(uintptr_t) m_qp;

            /* MXS -> MSS or HST, PI service will be on QP1 */
            if (MXS_EP(&m_qp->smd->md->addr) &&
                (MSS_EP(&m_qp->cm->msg.daddr1) || HST_EP(&m_qp->cm->msg.daddr1)))
                ib_qp = m_qp->ib_qp1;
            else
                ib_qp = m_qp->ib_qp2;

            errno = 0;
            if (ib_qp) {
                err = ibv_post_recv(ib_qp, &r_wr, &r_err);
                if (err) {
                    mlog(0, "ERR: qp %p (%s) qpn %x ibv_post_recv ret = %d %s\n",
                         m_qp,
                         (MXS_EP(&m_qp->smd->md->addr) &&
                          MSS_EP(&m_qp->cm->msg.daddr1)) ? "QP1" : "QP2",
                         m_qp->ib_qp2 ? m_qp->ib_qp2->qp_num : m_qp->ib_qp1->qp_num,
                         ret, strerror(errno));
                }
            }
            MCNTR(m_qp->smd->md, MCM_QP_RECV);
        } else {
            mlog(0, "ERR: unexpected WC opcode = %d on m_qp %p\n",
                 wc[i].opcode, m_qp);
        }
    }
    goto retry;
}
static int rping_cq_event_handler(struct rping_cb *cb)
{
    struct ibv_wc wc;
    struct ibv_recv_wr *bad_wr;
    int ret;
    int flushed = 0;

    while ((ret = ibv_poll_cq(cb->cq, 1, &wc)) == 1) {
        ret = 0;

        if (wc.status) {
            if (wc.status == IBV_WC_WR_FLUSH_ERR) {
                flushed = 1;
                continue;
            }
            fprintf(stderr, "cq completion failed status %d\n", wc.status);
            ret = -1;
            goto error;
        }

        switch (wc.opcode) {
        case IBV_WC_SEND:
            DEBUG_LOG("send completion\n");
            break;

        case IBV_WC_RDMA_WRITE:
            DEBUG_LOG("rdma write completion\n");
            cb->state = RDMA_WRITE_COMPLETE;
            sem_post(&cb->sem);
            break;

        case IBV_WC_RDMA_READ:
            DEBUG_LOG("rdma read completion\n");
            cb->state = RDMA_READ_COMPLETE;
            sem_post(&cb->sem);
            break;

        case IBV_WC_RECV:
            DEBUG_LOG("recv completion\n");
            ret = cb->server ? server_recv(cb, &wc) : client_recv(cb, &wc);
            if (ret) {
                fprintf(stderr, "recv wc error: %d\n", ret);
                goto error;
            }

            ret = ibv_post_recv(cb->qp, &cb->rq_wr, &bad_wr);
            if (ret) {
                fprintf(stderr, "post recv error: %d\n", ret);
                goto error;
            }
            sem_post(&cb->sem);
            break;

        default:
            DEBUG_LOG("unknown!!!!! completion\n");
            ret = -1;
            goto error;
        }
    }

    if (ret) {
        fprintf(stderr, "poll error %d\n", ret);
        goto error;
    }
    return flushed;

error:
    cb->state = ERROR;
    sem_post(&cb->sem);
    return ret;
}
static void ibw_event_handler_verbs(struct tevent_context *ev,
                                    struct tevent_fd *fde,
                                    uint16_t flags, void *private_data)
{
    struct ibw_conn *conn = talloc_get_type(private_data, struct ibw_conn);
    struct ibw_conn_priv *pconn = talloc_get_type(conn->internal, struct ibw_conn_priv);
    struct ibw_ctx_priv *pctx = talloc_get_type(conn->ctx->internal, struct ibw_ctx_priv);

    struct ibv_wc wc;
    int rc;
    struct ibv_cq *ev_cq;
    void *ev_ctx;

    DEBUG(DEBUG_DEBUG, ("ibw_event_handler_verbs(%u)\n", (uint32_t)flags));

    /* TODO: check whether if it's good to have more channels here... */
    rc = ibv_get_cq_event(pconn->verbs_channel, &ev_cq, &ev_ctx);
    if (rc) {
        sprintf(ibw_lasterr, "Failed to get cq_event with %d\n", rc);
        goto error;
    }
    if (ev_cq != pconn->cq) {
        sprintf(ibw_lasterr, "ev_cq(%p) != pconn->cq(%p)\n", ev_cq, pconn->cq);
        goto error;
    }
    rc = ibv_req_notify_cq(pconn->cq, 0);
    if (rc) {
        sprintf(ibw_lasterr, "Couldn't request CQ notification (%d)\n", rc);
        goto error;
    }

    while ((rc = ibv_poll_cq(pconn->cq, 1, &wc)) == 1) {
        if (wc.status) {
            sprintf(ibw_lasterr, "cq completion failed status=%d, opcode=%d, rc=%d\n",
                    wc.status, wc.opcode, rc);
            goto error;
        }

        switch (wc.opcode) {
        case IBV_WC_SEND:
            DEBUG(DEBUG_DEBUG, ("send completion\n"));
            if (ibw_wc_send(conn, &wc))
                goto error;
            break;

        case IBV_WC_RDMA_WRITE:
            DEBUG(DEBUG_DEBUG, ("rdma write completion\n"));
            break;

        case IBV_WC_RDMA_READ:
            DEBUG(DEBUG_DEBUG, ("rdma read completion\n"));
            break;

        case IBV_WC_RECV:
            DEBUG(DEBUG_DEBUG, ("recv completion\n"));
            if (ibw_wc_recv(conn, &wc))
                goto error;
            break;

        default:
            sprintf(ibw_lasterr, "unknown completion %d\n", wc.opcode);
            goto error;
        }
    }
    if (rc != 0) {
        sprintf(ibw_lasterr, "ibv_poll_cq error %d\n", rc);
        goto error;
    }

    ibv_ack_cq_events(pconn->cq, 1);
    return;

error:
    ibv_ack_cq_events(pconn->cq, 1);
    DEBUG(DEBUG_ERR, (ibw_lasterr));

    if (conn->state != IBWC_ERROR) {
        conn->state = IBWC_ERROR;
        pctx->connstate_func(NULL, conn);
    }
}
//static void* poll_cq(struct RDMA_communicator* comm)
static void* poll_cq(struct poll_cq_args* args)
{
    struct ibv_cq *cq;
    struct ibv_wc wc;
    struct connection *conn;
    struct RDMA_communicator *comm;
    // struct RDMA_message *msg;
    struct control_msg cmsg;
    void* ctx;
    char* buff;
    uint64_t buff_size;
    int tag;

    uint64_t mr_size = 0;
    uint64_t sent_size = 0;
    char* send_base_addr;

    int* flag = args->flag;

    comm = args->comm;
    buff = args->msg->buff;
    send_base_addr = args->msg->buff;
    buff_size = args->msg->size;
    tag = args->msg->tag;

    cmsg.type = MR_INIT;
    cmsg.data1.buff_size = buff_size;
    send_control_msg(comm->cm_id->context, &cmsg);
    post_receives(comm->cm_id->context);

    while (1) {
        TEST_NZ(ibv_get_cq_event(s_ctx->comp_channel, &cq, &ctx));
        ibv_ack_cq_events(cq, 1);
        TEST_NZ(ibv_req_notify_cq(cq, 0));

        while (ibv_poll_cq(cq, 1, &wc)) {
            conn = (struct connection *)(uintptr_t)wc.wr_id;

            if (wc.status != IBV_WC_SUCCESS) {
                die("on_completion: status is not IBV_WC_SUCCESS.");
            }

            if (wc.opcode == IBV_WC_RECV) {
                switch (conn->recv_msg->type) {
                case MR_INIT_ACK:
                case MR_CHUNK_ACK:
                    debug(printf("Received: Type=%d\n", conn->recv_msg->type), 1);
                    if (sent_size == buff_size) {
                        /* sent all data */
                        cmsg.type = MR_FIN;
                        cmsg.data1.tag = tag;
                    } else {
                        /* not sent all data yet */
                        if (sent_size + RDMA_BUF_SIZE_C > buff_size) {
                            mr_size = buff_size - sent_size;
                        } else {
                            mr_size = RDMA_BUF_SIZE_C;
                        }
                        debug(printf("mr_size=%lu\n", mr_size), 1);
                        // printf("%s\n", send_base_addr);
                        register_rdma_region(conn, send_base_addr, mr_size);
                        send_base_addr += mr_size;
                        sent_size += mr_size;

                        cmsg.type = MR_CHUNK;
                        cmsg.data1.mr_size = mr_size;
                        memcpy(&cmsg.data.mr, conn->rdma_msg_mr, sizeof(struct ibv_mr));
                        // cmsg.data.mr = conn->rdma_msg_mr;
                    }
                    break;
                case MR_FIN_ACK:
                    debug(printf("Received: Type=%d\n", conn->recv_msg->type), 1);
                    *flag = 1;
                    // rdma_disconnect(comm->cm_id);
                    // rdma_disconnect(conn->id);
                    // exit(0);
                    return NULL;
                default:
                    debug(printf("Unknown TYPE"), 1);
                    return NULL;
                }
                send_control_msg(conn, &cmsg);
                post_receives(conn);
            } else if (wc.opcode == IBV_WC_SEND) {
                debug(printf("Sent: TYPE=%d\n", conn->send_msg->type), 1);
            } else {
                die("unknown opcode.");
            }
        }
    }
    return NULL;
}
void mvdev_rendezvous_push_zcopy(MPIR_SHANDLE * s, mvdev_connection_t *c)
{
    int pkt_count, malloc_count, bytes_to_send, i, ne;
    struct ibv_send_wr *sr, *bad_wr;
    struct ibv_sge *sg_entry;
    struct ibv_wc wc_list[30];
    mv_qp *qp = &(mvdev.rndv_qp[s->hca_index]);

    MV_ASSERT(s->dreg_entry != NULL);

    pkt_count = ceil((double) s->bytes_total / mvparams.mtu);
    malloc_count = MIN(pkt_count, 64);

    D_PRINT("Sending %u of data for shandle %p\n", s->bytes_total, REQ_TO_ID(s));
    D_PRINT("Local addr: %p\n", s->local_address);

    sr = (struct ibv_send_wr *) malloc(sizeof(struct ibv_send_wr) * malloc_count);
    sg_entry = (struct ibv_sge *) malloc(sizeof(struct ibv_sge) * malloc_count);

    D_PRINT("Entering push zcopy (%d)\n", s->bytes_total);

    c->last_ah = (c->last_ah + 1) % mvparams.max_lmc_total;

    while (s->bytes_sent < s->bytes_total) {
        int empty = 0;

        do {
            ne = ibv_poll_cq(mvdev.rndv_cq[s->hca_index], 30, wc_list);
            if (ne < 0) {
                error_abort_all(IBV_RETURN_ERR, "Error polling RNDV CQ\n");
            } else if (ne > 0) {
                for (i = 0; i < ne; i++) {
                    if (wc_list[i].status != IBV_WC_SUCCESS) {
                        error_abort_all(IBV_STATUS_ERR, "got completion with "
                                "error code %d, wr_id: %lu\n",
                                wc_list[i].status, wc_list[i].wr_id);
                    }

                    qp->send_wqes_avail++;

                    if (wc_list[i].wr_id) {
                        mv_sbuf * v = (mv_sbuf *) ((mv_sdescriptor *) wc_list[i].wr_id)->parent;
                        v->left_to_send--;
                        if (0 == v->left_to_send) {
                            v->in_progress = 0;
                            if (0 == v->seqnum) {
                                release_mv_sbuf(v);
                            }
                        }
                    }
                }
                empty = 0;
            } else {
                empty = 1;
            }
        } while (qp->send_wqes_avail < 500 || !empty);

        for (i = 0; i < malloc_count; i++) {
            bytes_to_send = MIN(s->bytes_total - s->bytes_sent, mvparams.mtu);

            if (i > 0) {
                sr[i-1].next = &(sr[i]);
            }

            sr[i].next = NULL;
            sr[i].opcode = IBV_WR_SEND_WITH_IMM;
            sr[i].wr_id = 0;
            sr[i].num_sge = 1;
            sr[i].sg_list = &(sg_entry[i]);
            sr[i].imm_data = s->seqnum++;
            sr[i].send_flags = IBV_SEND_SIGNALED;
            sr[i].wr.ud.ah = c->data_ud_ah[c->last_ah];
            sr[i].wr.ud.remote_qpn = s->remote_qpn;
            sr[i].wr.ud.remote_qkey = 0;

            sg_entry[i].addr = (uintptr_t) ((char *) (s->local_address) + s->bytes_sent);
            sg_entry[i].length = bytes_to_send;
            sg_entry[i].lkey = ((dreg_entry *) s->dreg_entry)->memhandle[0]->lkey;

            s->bytes_sent += bytes_to_send;
            qp->send_wqes_avail--;

            if (s->bytes_total == s->bytes_sent) {
                break;
            }
        }

        if (ibv_post_send(qp->qp, sr, &bad_wr)) {
            error_abort_all(IBV_RETURN_ERR, "Error posting to UD RNDV QP (%d) - %lu\n",
                            qp->send_wqes_avail, bad_wr->wr_id);
        }
    }

    MV_ASSERT(s->bytes_total == s->bytes_sent);

    mvdev_ud_zcopy_finish(s, c->last_ah);
    s->nearly_complete = 1;
}
void mvdev_incoming_ud_zcopy_finish(mv_rbuf * v, mvdev_connection_t * c,
                                    mvdev_packet_ud_zcopy_finish * h)
{
    MPIR_RHANDLE *rhandle;
    mv_qp_pool_entry *rqp;
    struct ibv_wc *wc;
    int ne, count = 0, empty = 0, i = 0, next_to_recv = 0, in_order = 1;
    int posted_buffers;

    /* find the rhandle for this data */
    rhandle = (MPIR_RHANDLE *) ID_TO_REQ(h->rreq);
    rqp = (mv_qp_pool_entry *) rhandle->qp_entry;

    /* make sure all the data is here by checking the associated
     * cq for the qp used for this data transfer. All messages
     * are hopefully here.... otherwise we need to do
     * cleanup */

    D_PRINT("Got a zcopy finish message\n");

    posted_buffers = ceil((double) rhandle->len / mvparams.mtu);
    wc = (struct ibv_wc *) malloc(sizeof(struct ibv_wc) * posted_buffers);

    do {
        ne = ibv_poll_cq(rqp->ud_cq, posted_buffers - 1, wc);
        if (ne < 0) {
            error_abort_all(IBV_RETURN_ERR, "Error polling CQ\n");
        } else if (ne > 0) {
            for (i = 0; i < ne; i++) {
                if (wc[i].status != IBV_WC_SUCCESS) {
                    error_abort_all(IBV_STATUS_ERR, "got completion with "
                            "error. code: %d, wr_id: %lu",
                            wc[i].status, wc[i].wr_id);
                } else {
                    if (IBV_WC_RECV == wc[i].opcode) {
                        if (wc[i].imm_data != next_to_recv) {
                            D_PRINT("Out of order! %u %u\n",
                                    wc[i].imm_data, next_to_recv);
                            in_order = 0;
                        }
                        next_to_recv++;
                    }
                }
                count++;
            }
        } else {
            empty = 1;
        }
    } while (!empty && posted_buffers != count);

    D_PRINT("Finished polling... -- got %d or %d\n", count, posted_buffers);

    if (count == posted_buffers && in_order) {
        mv_sbuf *v = get_sbuf(c, sizeof(mvdev_packet_ud_zcopy_ack));
        mvdev_packet_ud_zcopy_ack *h = (mvdev_packet_ud_zcopy_ack *) v->header_ptr;

        PACKET_SET_HEADER(h, c, MVDEV_PACKET_UD_ZCOPY_ACK);
        h->sreq = rhandle->send_id;
        v->shandle = NULL;

        mvdev_post_channel_send(c, v, sizeof(mvdev_packet_ud_zcopy_ack));
        D_PRINT("finished sending MVDEV_PACKET_UD_ZCOPY_ACK\n");

        RELEASE_RNDV_QP(rqp);
        RECV_COMPLETE(rhandle);
    } else {
        if (count != posted_buffers) {
            mvdev_flush_qp((mv_qp_pool_entry *) rhandle->qp_entry,
                           posted_buffers - count);
        }
        mvdev_post_zcopy_recv(rhandle);
        MV_Rndv_Send_Reply(rhandle);
    }

    free(wc);
}
int main(int argc, char *argv[])
{
    struct pdata rep_pdata;

    struct rdma_event_channel *cm_channel;
    struct rdma_cm_id *listen_id;
    struct rdma_cm_id *cm_id;
    struct rdma_cm_event *event;
    struct rdma_conn_param conn_param = { };

    struct ibv_pd *pd;
    struct ibv_comp_channel *comp_chan;
    struct ibv_cq *cq;
    struct ibv_cq *evt_cq;
    struct ibv_mr *mr;
    struct ibv_qp_init_attr qp_attr = { };
    struct ibv_sge sge;
    struct ibv_send_wr send_wr = { };
    struct ibv_send_wr *bad_send_wr;
    struct ibv_recv_wr recv_wr = { };
    struct ibv_recv_wr *bad_recv_wr;
    struct ibv_wc wc;
    void *cq_context;

    struct sockaddr_in sin;
    uint32_t *buf;
    int err;

    /* Set up RDMA CM structures */

    cm_channel = rdma_create_event_channel();
    if (!cm_channel)
        return 1;

    err = rdma_create_id(cm_channel, &listen_id, NULL, RDMA_PS_TCP);
    if (err)
        return err;

    sin.sin_family = AF_INET;
    sin.sin_port = htons(20079);
    sin.sin_addr.s_addr = INADDR_ANY;

    /* Bind to local port and listen for connection request */

    err = rdma_bind_addr(listen_id, (struct sockaddr *) &sin);
    if (err)
        return 1;

    err = rdma_listen(listen_id, 1);
    if (err)
        return 1;

    err = rdma_get_cm_event(cm_channel, &event);
    if (err)
        return err;
    printf("after get_cm_event\n");

    if (event->event != RDMA_CM_EVENT_CONNECT_REQUEST)
        return 1;

    cm_id = event->id;

    rdma_ack_cm_event(event);

    /* Create verbs objects now that we know which device to use */

    pd = ibv_alloc_pd(cm_id->verbs);
    if (!pd)
        return 1;

    comp_chan = ibv_create_comp_channel(cm_id->verbs);
    if (!comp_chan)
        return 1;

    cq = ibv_create_cq(cm_id->verbs, 2, NULL, comp_chan, 0);
    if (!cq)
        return 1;

    if (ibv_req_notify_cq(cq, 0))
        return 1;

    buf = calloc(2, sizeof(uint32_t));
    if (!buf)
        return 1;

    mr = ibv_reg_mr(pd, buf, 2 * sizeof(uint32_t),
                    IBV_ACCESS_LOCAL_WRITE |
                    IBV_ACCESS_REMOTE_READ |
                    IBV_ACCESS_REMOTE_WRITE);
    if (!mr)
        return 1;

    qp_attr.cap.max_send_wr = 1;
    qp_attr.cap.max_send_sge = 1;
    qp_attr.cap.max_recv_wr = 1;
    qp_attr.cap.max_recv_sge = 1;

    qp_attr.send_cq = cq;
    qp_attr.recv_cq = cq;
    qp_attr.qp_type = IBV_QPT_RC;

    err = rdma_create_qp(cm_id, pd, &qp_attr);
    if (err)
        return err;

    /* Post receive before accepting connection */

    sge.addr = (uintptr_t) buf + sizeof(uint32_t);
    sge.length = sizeof(uint32_t);
    sge.lkey = mr->lkey;

    recv_wr.sg_list = &sge;
    recv_wr.num_sge = 1;

    if (ibv_post_recv(cm_id->qp, &recv_wr, &bad_recv_wr))
        return 1;

    rep_pdata.buf_va = htonll((uintptr_t) buf);
    rep_pdata.buf_rkey = htonl(mr->rkey);

    conn_param.responder_resources = 1;
    conn_param.private_data = &rep_pdata;
    conn_param.private_data_len = sizeof rep_pdata;

    /* Accept connection */

    printf("before accept\n");
    err = rdma_accept(cm_id, &conn_param);
    if (err)
        return 1;
    printf("after accept\n");

    err = rdma_get_cm_event(cm_channel, &event);
    if (err)
        return err;

    if (event->event != RDMA_CM_EVENT_ESTABLISHED)
        return 1;

    rdma_ack_cm_event(event);

    /* Wait for receive completion */

    if (ibv_get_cq_event(comp_chan, &evt_cq, &cq_context))
        return 1;

    if (ibv_req_notify_cq(cq, 0))
        return 1;

    if (ibv_poll_cq(cq, 1, &wc) < 1)
        return 1;

    if (wc.status != IBV_WC_SUCCESS)
        return 1;

    /* Add two integers and send reply back */

    buf[0] = htonl(ntohl(buf[0]) + ntohl(buf[1]));

    sge.addr = (uintptr_t) buf;
    sge.length = sizeof(uint32_t);
    sge.lkey = mr->lkey;

    send_wr.opcode = IBV_WR_SEND;
    send_wr.send_flags = IBV_SEND_SIGNALED;
    send_wr.sg_list = &sge;
    send_wr.num_sge = 1;

    if (ibv_post_send(cm_id->qp, &send_wr, &bad_send_wr))
        return 1;

    /* Wait for send completion */

    if (ibv_get_cq_event(comp_chan, &evt_cq, &cq_context))
        return 1;

    if (ibv_poll_cq(cq, 1, &wc) < 1)
        return 1;

    if (wc.status != IBV_WC_SUCCESS)
        return 1;

    printf("before ack cq 2\n");
    ibv_ack_cq_events(cq, 2);

    return 0;
}