/*****************************************
 * Function: poll_completion
 *****************************************/
static int poll_completion(struct resources *res)
{
    struct ibv_wc wc;
    void *ev_ctx;
    struct ibv_cq *ev_cq;
    int rc;

    fprintf(stdout, "waiting for completion event\n");

    /* Wait for the completion event */
    if (ibv_get_cq_event(res->comp_channel, &ev_cq, &ev_ctx)) {
        fprintf(stderr, "failed to get cq_event\n");
        return 1;
    }
    fprintf(stdout, "got completion event\n");

    /* Ack the event */
    ibv_ack_cq_events(ev_cq, 1);

    /* Request notification upon the next completion event */
    rc = ibv_req_notify_cq(ev_cq, 0);
    if (rc) {
        fprintf(stderr, "Couldn't request CQ notification\n");
        return 1;
    }

    /* In a real program, the user should empty the CQ before waiting for
       the next completion event. */
    /* Poll the completion that caused the event (if one exists). */
    rc = ibv_poll_cq(res->cq, 1, &wc);
    if (rc < 0) {
        fprintf(stderr, "poll CQ failed\n");
        return 1;
    }

    /* Check whether the CQ is empty. An event can be generated even when
       the CQ is empty, since several completions may be coalesced into one
       event. Here only one completion is expected, so an empty CQ means
       there is an error. */
    if (rc == 0) {
        fprintf(stderr, "completion wasn't found in the CQ after timeout\n");
        return 1;
    }

    fprintf(stdout, "completion was found in CQ with status 0x%x\n", wc.status);

    /* Check the completion status (here we don't care about the
       completion opcode). */
    if (wc.status != IBV_WC_SUCCESS) {
        fprintf(stderr,
                "got bad completion with status: 0x%x, vendor syndrome: 0x%x\n",
                wc.status, wc.vendor_err);
        return 1;
    }

    return 0;
}
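As the comment in poll_completion() notes, a production event loop should empty the CQ after re-arming it, since one channel event may cover several completions. A minimal sketch of that drain loop, assuming a `cq` handle and a hypothetical handle_wc() callback (neither is part of the example above):

/* Sketch: after ibv_ack_cq_events() and ibv_req_notify_cq(), drain the CQ.
   `cq` and handle_wc() are illustrative names, not from the example above. */
struct ibv_wc wc;
int n;

while ((n = ibv_poll_cq(cq, 1, &wc)) > 0)
    handle_wc(&wc);         /* process each completion */
if (n < 0)
    fprintf(stderr, "ibv_poll_cq failed\n");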
void * poll_cq(void *ctx)
{
    struct ibv_cq *cq;
    struct ibv_wc wc;

    while (1) {
        if (!paused) {
            // rdma_debug("get cq event ...");
            TEST_NZ(ibv_get_cq_event(s_ctx->comp_channel, &cq, &ctx));
            ibv_ack_cq_events(cq, 1);
            TEST_NZ(ibv_req_notify_cq(cq, 0));

            while (ibv_poll_cq(cq, 1, &wc)) {
                // rdma_debug("handle cq ...");
                on_completion(&wc);
            }
        } else {
            // rdma_debug("wait signal ...");
            pthread_mutex_lock(&mutex);
            pthread_cond_wait(&resume_cond, &mutex);
            pthread_mutex_unlock(&mutex);
        }
    }

    return NULL;
}
void send_ack()
{
    /* Send ack */
    ack_buffer = client_pdata.index;

    sge_send.addr = (uintptr_t) &ack_buffer;
    sge_send.length = sizeof(ack_buffer);
    sge_send.lkey = mr_ack_buffer->lkey;

    send_wr.wr_id = 1;
    send_wr.opcode = IBV_WR_SEND;
    send_wr.send_flags = IBV_SEND_SIGNALED;
    send_wr.sg_list = &sge_send;
    send_wr.num_sge = 1;

    err = ibv_post_send(cm_id->qp, &send_wr, &bad_send_wr);
    assert(err == 0);

    /* Wait for send completion */
    err = ibv_get_cq_event(comp_chan, &evt_cq, &cq_context);
    assert(err == 0);

    ibv_ack_cq_events(evt_cq, 1);

    err = ibv_req_notify_cq(cq, 0);
    assert(err == 0);

    n = ibv_poll_cq(cq, 1, &wc);
    assert(n >= 1);

    if (wc.status != IBV_WC_SUCCESS)
        printf("Warning: Client %d send ack failed\n", client_pdata.index);
}
static void *cq_thread(void *arg)
{
    struct rping_cb *cb = arg;
    struct ibv_cq *ev_cq;
    void *ev_ctx;
    int ret;

    DEBUG_LOG("cq_thread started.\n");

    while (1) {
        pthread_testcancel();

        ret = ibv_get_cq_event(cb->channel, &ev_cq, &ev_ctx);
        if (ret) {
            fprintf(stderr, "Failed to get cq event!\n");
            pthread_exit(NULL);
        }
        if (ev_cq != cb->cq) {
            fprintf(stderr, "Unknown CQ!\n");
            pthread_exit(NULL);
        }
        ret = ibv_req_notify_cq(cb->cq, 0);
        if (ret) {
            fprintf(stderr, "Failed to set notify!\n");
            pthread_exit(NULL);
        }
        ret = rping_cq_event_handler(cb);
        ibv_ack_cq_events(cb->cq, 1);
        if (ret)
            pthread_exit(NULL);
    }
}
static void *comp_handler_thread(void *arg)
{
    RdmaBackendDev *backend_dev = (RdmaBackendDev *)arg;
    int rc;
    struct ibv_cq *ev_cq;
    void *ev_ctx;
    int flags;
    GPollFD pfds[1];

    /* Change to non-blocking mode */
    flags = fcntl(backend_dev->channel->fd, F_GETFL);
    rc = fcntl(backend_dev->channel->fd, F_SETFL, flags | O_NONBLOCK);
    if (rc < 0) {
        rdma_error_report("Failed to change backend channel FD to non-blocking");
        return NULL;
    }

    pfds[0].fd = backend_dev->channel->fd;
    pfds[0].events = G_IO_IN | G_IO_HUP | G_IO_ERR;

    backend_dev->comp_thread.is_running = true;

    while (backend_dev->comp_thread.run) {
        do {
            rc = qemu_poll_ns(pfds, 1, THR_POLL_TO * (int64_t)SCALE_MS);
            if (!rc) {
                backend_dev->rdma_dev_res->stats.poll_cq_ppoll_to++;
            }
        } while (!rc && backend_dev->comp_thread.run);

        if (backend_dev->comp_thread.run) {
            rc = ibv_get_cq_event(backend_dev->channel, &ev_cq, &ev_ctx);
            if (unlikely(rc)) {
                rdma_error_report("ibv_get_cq_event fail, rc=%d, errno=%d", rc,
                                  errno);
                continue;
            }

            rc = ibv_req_notify_cq(ev_cq, 0);
            if (unlikely(rc)) {
                rdma_error_report("ibv_req_notify_cq fail, rc=%d, errno=%d", rc,
                                  errno);
            }

            backend_dev->rdma_dev_res->stats.poll_cq_from_bk++;
            rdma_poll_cq(backend_dev->rdma_dev_res, ev_cq);

            ibv_ack_cq_events(ev_cq, 1);
        }
    }

    backend_dev->comp_thread.is_running = false;

    qemu_thread_exit(0);

    return NULL;
}
ucs_status_t uct_ib_iface_wakeup_arm(uct_wakeup_h wakeup)
{
    int res, send_cq_count = 0, recv_cq_count = 0;
    ucs_status_t status;
    struct ibv_cq *cq;
    void *cq_context;
    uct_ib_iface_t *iface = ucs_derived_of(wakeup->iface, uct_ib_iface_t);

    do {
        res = ibv_get_cq_event(iface->comp_channel, &cq, &cq_context);
        if (0 == res) {
            if (iface->send_cq == cq) {
                ++send_cq_count;
            }
            if (iface->recv_cq == cq) {
                ++recv_cq_count;
            }
        }
    } while (res == 0);

    if (errno != EAGAIN) {
        return UCS_ERR_IO_ERROR;
    }

    if (send_cq_count > 0) {
        ibv_ack_cq_events(iface->send_cq, send_cq_count);
    }

    if (recv_cq_count > 0) {
        ibv_ack_cq_events(iface->recv_cq, recv_cq_count);
    }

    /* avoid re-arming the interface if any events exist */
    if ((send_cq_count > 0) || (recv_cq_count > 0)) {
        return UCS_ERR_BUSY;
    }

    if (wakeup->events & UCT_WAKEUP_TX_COMPLETION) {
        status = iface->ops->arm_tx_cq(iface);
        if (status != UCS_OK) {
            return status;
        }
    }

    if (wakeup->events & (UCT_WAKEUP_RX_AM | UCT_WAKEUP_RX_SIGNALED_AM)) {
        status = iface->ops->arm_rx_cq(iface, 0);
        if (status != UCS_OK) {
            return status;
        }
    }

    return UCS_OK;
}
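This loop only terminates because the completion channel's file descriptor has been put into non-blocking mode, so a drained channel makes ibv_get_cq_event() fail with errno set to EAGAIN (the same trick comp_handler_thread() above applies with fcntl()). A minimal sketch of that setup, with `channel` as an illustrative ibv_comp_channel pointer:

/* Sketch: switch a completion channel to non-blocking mode so that
   ibv_get_cq_event() returns immediately with errno == EAGAIN once the
   channel has no more events. `channel` is an illustrative name. */
#include <fcntl.h>

int set_channel_nonblocking(struct ibv_comp_channel *channel)
{
    int flags = fcntl(channel->fd, F_GETFL);
    if (flags < 0)
        return -1;
    return fcntl(channel->fd, F_SETFL, flags | O_NONBLOCK);
}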
/* EventThread:
 * Continuously polls the Completion Queue for events, and registers them with
 * the event_handler(..) function.
 */
void *EventThread(void *unused)
{
    struct ibv_cq *cq;
    void *ev_ctx;

    while (1) {
        if (ibv_get_cq_event(0, &cq, &ev_ctx)) {
            fprintf(stderr, "Failed to get CQ event\n");
            return NULL;
        }
        event_handler(cq);
    }
}
void * poll_cq2(void *ctx)
{
    struct ibv_cq *cq;
    struct ibv_wc wc;

    while (1) {
        TEST_NZ(ibv_get_cq_event(s_ctx->comp_channel, &cq, &ctx));
        ibv_ack_cq_events(cq, 1);
        TEST_NZ(ibv_req_notify_cq(cq, 0));

        while (ibv_poll_cq(cq, 1, &wc))
            on_completion(&wc);
    }

    return NULL;
}
void ParallelRenderingClientIBVerbs::run()
{
    struct ibv_wc wc;
    struct ibv_cq *ev_cq;
    void *ev_ctx;

    while (keepRunning) {
        lock.lock();

        int ne;
        do {
            ne = ibv_poll_cq(ctx->cq, 1, &wc);
            if (ne > 0) {
                if (ibv_get_cq_event(ctx->ch, &ev_cq, &ev_ctx)) {
                    fprintf(stderr, "Failed to get cq event!\n");
                    return;
                }
                if (ev_cq != ctx->cq) {
                    fprintf(stderr, "Unknown CQ!\n");
                    return;
                }
                ibv_ack_cq_events(ctx->cq, 1);
                ibv_req_notify_cq(ctx->cq, 0);
            }
            microSleep(100);
        } while (ne == 0);

        if (ne < 0) {
            fprintf(stderr, "poll CQ failed %d\n", ne);
            return;
        }

        if (wc.status != IBV_WC_SUCCESS) {
            fprintf(stderr, "Completion with error at client\n");
            fprintf(stderr, "Failed status %d: wr_id %d\n", wc.status,
                    (int)wc.wr_id);
            return;
        }
    }
}
int wait_receive_data()
{
    /* Wait for receive completion */
    err = ibv_get_cq_event(comp_chan, &evt_cq, &cq_context);
    if (err)
        return 1;

    ibv_ack_cq_events(evt_cq, 1);

    err = ibv_req_notify_cq(cq, 0);
    if (err)
        return 1;

    n = ibv_poll_cq(cq, 1, &wc);
    if (n <= 0)
        return 1;

    if (wc.status != IBV_WC_SUCCESS)
        return 1;

    return 0;
}
int __ibv_get_cq_event_1_0(struct ibv_comp_channel *channel,
                           struct ibv_cq_1_0 **cq, void **cq_context)
{
    fprintf(stderr, "%s:%s:%d \n", __func__, __FILE__, __LINE__);
    struct ibv_cq *real_cq;
    void *cq_ptr;
    int ret;

    ret = ibv_get_cq_event(channel, &real_cq, &cq_ptr);
    if (ret)
        return ret;

    *cq = cq_ptr;
    *cq_context = (*cq)->cq_context;

    return 0;
}
int __ibv_get_cq_event_1_0(struct ibv_comp_channel *channel,
                           struct ibv_cq_1_0 **cq, void **cq_context)
{
    struct ibv_cq *real_cq;
    void *cq_ptr;
    int ret;

    ret = ibv_get_cq_event(channel, &real_cq, &cq_ptr);
    if (ret)
        return ret;

    *cq = cq_ptr;
    *cq_context = (*cq)->cq_context;

    return 0;
}
static void async_completion_thread()
{
    int ret;
    struct ibv_comp_channel *ev_ch;
    struct ibv_cq *ev_cq;
    void *ev_ctx;

    /* This thread should be in a cancel enabled state */
    pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL);

    ev_ch = viadev.comp_channel;

    while (1) {
        pthread_testcancel();
        pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, NULL);

        do {
            ret = ibv_get_cq_event(ev_ch, &ev_cq, &ev_ctx);
            if (ret && errno != EINTR) {
                error_abort_all(IBV_RETURN_ERR,
                                "Failed to get cq event: %d\n", ret);
            }
        } while (ret && errno == EINTR);

        pthread_setcanceltype(PTHREAD_CANCEL_DEFERRED, NULL);

        if (ev_cq != viadev.cq_hndl) {
            error_abort_all(GEN_ASSERT_ERR, "Event in unknown CQ\n");
        }

        pthread_kill(parent_threadId, SIGUSR1);
        ibv_ack_cq_events(viadev.cq_hndl, 1);

        pthread_testcancel();

        /* Note the second argument is 1: request notification only for
           solicited completions. */
        if (ibv_req_notify_cq(viadev.cq_hndl, 1)) {
            error_abort_all(IBV_RETURN_ERR,
                            "Couldn't request for CQ notification\n");
        }
    }
}
static int get_thread_wc(struct thread_context_t *t_ctx, struct ibv_wc *wc,
                         int is_send)
{
    struct ibv_cq *cq;
    struct ibv_comp_channel *comp_channel;
    struct rdma_resource_t *rdma_resource;
    struct user_param_t *user_param;
    void *ectx;
    int rc = 0;

    rdma_resource = t_ctx->rdma_resource;
    user_param = &(rdma_resource->user_param);

    if (is_send) {
        cq = t_ctx->send_cq;
        comp_channel = t_ctx->send_comp_channel;
    } else {
        cq = t_ctx->recv_cq;
        comp_channel = t_ctx->recv_comp_channel;
    }

    if (user_param->use_event) {
        rc = ibv_get_cq_event(comp_channel, &cq, &ectx);
        if (rc != 0) {
            ERROR("Failed to do ibv_get_cq_event.\n");
            return 1;
        }

        ibv_ack_cq_events(cq, 1);

        rc = ibv_req_notify_cq(cq, 0);
        if (rc != 0) {
            ERROR("Failed to do ibv_req_notify_cq.\n");
            return 1;
        }
    }

    do {
        rc = ibv_poll_cq(cq, 1, wc);
        if (rc < 0) {
            ERROR("Failed to poll CQ.\n");
            return 1;
        }
    } while (!user_param->use_event && (rc == 0)); /* TODO: needs a timeout */

    return 0;
}
/**
 * Polling for events on an inner thread allows processing of management
 * messages like buffer connection immediately, even if the user is not
 * polling. Otherwise buffer constructors would block indefinitely.
 *
 * Deep learning workloads are about sending small numbers of large messages,
 * in which case this model works great. If the library were to be used to
 * exchange large numbers of short messages, it would be useful to split
 * management and data messages over two different queue pairs. User threads
 * could then wait or poll on the data queue pair directly.
 */
void RDMAAdapter::InternalThreadEntry() {
  while (!must_stop()) {
    ibv_cq* cq;
    void* cq_context;
    CHECK(!ibv_get_cq_event(channel_, &cq, &cq_context));
    CHECK(cq == cq_);
    ibv_ack_cq_events(cq, 1);
    CHECK(!ibv_req_notify_cq(cq_, 0));

    int ne = ibv_poll_cq(cq_, MAX_CONCURRENT_WRITES * 2,
                         static_cast<ibv_wc*>(wc_));
    CHECK_GE(ne, 0);

    for (int i = 0; i < ne; ++i) {
      CHECK(wc_[i].status == IBV_WC_SUCCESS)
          << "Failed status \n"
          << ibv_wc_status_str(wc_[i].status) << " " << wc_[i].status << " "
          << static_cast<int>(wc_[i].wr_id) << " " << wc_[i].vendor_err;

      if (wc_[i].opcode == IBV_WC_RECV_RDMA_WITH_IMM) {
        // Data message; add it to the user received queue
        RDMAChannel* channel = reinterpret_cast<RDMAChannel*>(wc_[i].wr_id);
        channel->recv();
        int id = wc_[i].imm_data;
        if (id >= CTRL_ID_OFFSET) {
          // ctrl signal
          ctrl_received_.push(channel->buffers_[id - CTRL_ID_OFFSET]);
        } else {
          // data
          received_.push(channel->buffers_[id]);
        }
      } else {
        if (wc_[i].opcode & IBV_WC_RECV) {
          // Buffer connection message
          RDMAChannel* channel = reinterpret_cast<RDMAChannel*>(wc_[i].wr_id);
          int id = wc_[i].imm_data;
          channel->memory_regions_queue_.push(channel->memory_regions_[id]);
          CHECK(id == channel->memory_regions_received_++);
          CHECK(!ibv_dereg_mr(channel->region_regions_[id]));
        }
      }
    }
  }
}
inline void cfio_rdma_client_wait(void *ctx)
{
    struct ibv_cq *cq;
    struct ibv_wc wc;

    while (request_stack_size) {
        // rdma_debug("get cq event ...");
        TEST_NZ(ibv_get_cq_event(s_ctx->comp_channel, &cq, &ctx));
        // rdma_debug("ibv_ack_cq_events ...");
        ibv_ack_cq_events(cq, 1);
        TEST_NZ(ibv_req_notify_cq(cq, 0));

        while (ibv_poll_cq(cq, 1, &wc)) {
            // rdma_debug("handle cq ...");
            on_completion(&wc);
        }
    }
}
void * poll_cq(void *ctx)
{
    struct ibv_cq *cq;
    struct ibv_wc wc;
    IbvConnection *conn = (IbvConnection *)ctx;

    while (1) {
        TEST_NZ(ibv_get_cq_event(conn->comp_channel, &cq, &ctx));
        ibv_ack_cq_events(cq, 1);
        TEST_NZ(ibv_req_notify_cq(cq, 0));

        while (ibv_poll_cq(cq, 1, &wc)) {
            (OnCompletionHandler)(&wc);
        }
    }

    return NULL;
}
static inline int fi_ibv_poll_events(struct fi_ibv_cq *_cq, int timeout)
{
    int ret, rc;
    void *context;
    struct pollfd fds[2];
    char data;

    fds[0].fd = _cq->channel->fd;
    fds[1].fd = _cq->signal_fd[0];
    fds[0].events = fds[1].events = POLLIN;

    rc = poll(fds, 2, timeout);
    if (rc == 0)
        return -FI_EAGAIN;
    else if (rc < 0)
        return -errno;

    if (fds[0].revents & POLLIN) {
        ret = ibv_get_cq_event(_cq->channel, &_cq->cq, &context);
        if (ret)
            return ret;

        atomic_inc(&_cq->nevents);
        rc--;
    }

    if (fds[1].revents & POLLIN) {
        do {
            ret = read(fds[1].fd, &data, 1);
        } while (ret > 0);
        ret = -FI_EAGAIN;
        rc--;
    }

    if (rc) {
        FI_WARN(&fi_ibv_prov, FI_LOG_CQ, "Unknown poll error: check revents\n");
        return -FI_EOTHER;
    }

    return ret;
}
static int get_cq_event(void)
{
    struct ibv_cq *ev_cq;
    void *ev_ctx;

    if (ibv_get_cq_event(ctx.channel, &ev_cq, &ev_ctx)) {
        fprintf(stderr, "Failed to get cq_event\n");
        return 1;
    }

    if (ev_cq != ctx.recv_cq) {
        fprintf(stderr, "CQ event for unknown CQ %p\n", ev_cq);
        return 1;
    }

    if (ibv_req_notify_cq(ctx.recv_cq, 0)) {
        fprintf(stderr, "Couldn't request CQ notification\n");
        return 1;
    }

    return 0;
}
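Note that get_cq_event() re-arms the CQ but never acknowledges the event it consumed. Every event returned by ibv_get_cq_event() must eventually be acknowledged with ibv_ack_cq_events(); destroying a CQ with unacknowledged events blocks until they are acked. A minimal sketch of a caller that keeps the books balanced (the drain step is elided):

/* Sketch: match every successful get_cq_event() with one acknowledged
   event once the completion has been handled. */
if (get_cq_event() == 0) {
    /* ... drain ctx.recv_cq with ibv_poll_cq() here ... */
    ibv_ack_cq_events(ctx.recv_cq, 1);  /* balance the event taken above */
}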
static void *mca_oob_ud_event_dispatch(int fd, int flags, void *context)
{
    int rc;
    mca_oob_ud_device_t *device = (mca_oob_ud_device_t *) context;
    mca_oob_ud_port_t *port = NULL;
    struct ibv_cq *event_cq = NULL;
    void *event_context = NULL;

    do {
        rc = ibv_get_cq_event (device->ib_channel, &event_cq, &event_context);
    } while (rc && errno == EINTR);

    if (NULL == event_cq) {
        /* re-arm the event */
        opal_event_add (&port->device->event, NULL);

        return NULL;
    }

    port = (mca_oob_ud_port_t *) event_context;

    rc = mca_oob_ud_process_messages (event_cq, port);
    if (rc < 0) {
        opal_output (0, "%s oob:ud:event_dispatch error processing messages",
                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
        return NULL;
    }

    if (ibv_req_notify_cq(event_cq, 0)) {
        opal_output (0, "%s oob:ud:event_dispatch error asking for cq notifications",
                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
    }

    /* re-arm the event */
    opal_event_add (&port->device->event, NULL);

    return NULL;
}
ucs_status_t uct_ib_iface_wakeup_arm(uct_wakeup_h wakeup)
{
    int res, ack_count = 0;
    ucs_status_t status;
    struct ibv_cq *cq;
    void *cq_context;
    uct_ib_iface_t *iface = ucs_derived_of(wakeup->iface, uct_ib_iface_t);

    do {
        res = ibv_get_cq_event(iface->comp_channel, &cq, &cq_context);
        ack_count++;
    } while (res == 0);

    if (errno != EAGAIN) {
        return UCS_ERR_IO_ERROR;
    }

    /* the loop also counted the final failed call, hence ack_count - 1 */
    if (ack_count > 1) {
        ibv_ack_cq_events(cq, ack_count - 1);
    }

    if (wakeup->events & UCT_WAKEUP_TX_COMPLETION) {
        status = iface->ops->arm_tx_cq(iface);
        if (status != UCS_OK) {
            return status;
        }
    }

    if (wakeup->events & (UCT_WAKEUP_RX_AM | UCT_WAKEUP_RX_SIGNALED_AM)) {
        status = iface->ops->arm_rx_cq(iface, 0);
        if (status != UCS_OK) {
            return status;
        }
    }

    return UCS_OK;
}
void poll_cq(void *ctx)
{
    struct ibv_cq *cq;
    struct ibv_wc wc;
    int ne;

    TEST_NZ(ibv_get_cq_event(s_ctx->comp_channel, &cq, &ctx)); // blocks by default
    ibv_ack_cq_events(cq, 1);
    TEST_NZ(ibv_req_notify_cq(cq, 0));

    do {
        ne = ibv_poll_cq(cq, 1, &wc);
        if (ne < 0) {
            printf("failed to poll completion from the CQ. ret = %d\n", ne);
            return;
        } else if (ne == 0) {
            continue;
        } else {
            on_completion(&wc);
        }
    } while (ne == 0);

    return;
}
/*
 * Return -1 for error and 'nr events' for a positive number
 * of events
 */
static int rdma_poll_wait(struct thread_data *td, enum ibv_wc_opcode opcode)
{
    struct rdmaio_data *rd = td->io_ops->data;
    struct ibv_cq *ev_cq;
    void *ev_ctx;
    int ret;

    if (rd->cq_event_num > 0) { /* previous left */
        rd->cq_event_num--;
        return 0;
    }

again:
    if (ibv_get_cq_event(rd->channel, &ev_cq, &ev_ctx) != 0) {
        log_err("fio: Failed to get cq event!\n");
        return -1;
    }
    if (ev_cq != rd->cq) {
        log_err("fio: Unknown CQ!\n");
        return -1;
    }
    if (ibv_req_notify_cq(rd->cq, 0) != 0) {
        log_err("fio: Failed to set notify!\n");
        return -1;
    }

    ret = cq_event_handler(td, opcode);
    if (ret == 0)
        goto again;

    ibv_ack_cq_events(rd->cq, ret);

    rd->cq_event_num--;

    return ret;
}
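The single ibv_ack_cq_events(rd->cq, ret) call above batches several acknowledgements at once. That is deliberate: acking takes a mutex internally, so the ibv_get_cq_event(3) man page suggests keeping a count of outstanding events and acknowledging them in one call. A minimal sketch of that bookkeeping, assuming a single CQ per channel (all names are illustrative):

/* Sketch: defer acknowledgements and issue them in one amortized call.
   Assumes one CQ on the channel; `unacked_events` and both functions
   are illustrative names. */
#include <infiniband/verbs.h>

static unsigned int unacked_events;

static void on_channel_event(struct ibv_comp_channel *channel)
{
    struct ibv_cq *ev_cq;
    void *ev_ctx;

    if (ibv_get_cq_event(channel, &ev_cq, &ev_ctx))
        return;                      /* no event available */
    unacked_events++;                /* ack later, in a batch */
    ibv_req_notify_cq(ev_cq, 0);     /* re-arm, then drain with ibv_poll_cq() */
}

static void flush_acks(struct ibv_cq *cq)
{
    if (unacked_events) {
        ibv_ack_cq_events(cq, unacked_events);
        unacked_events = 0;
    }
}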
void * poll_cq(void *ctx)
{
    void* tmp_ctx;
    struct ibv_cq *tmp_cq = NULL; /* assumed local here so the snippet is self-contained */
    struct ibv_wc wc;
    int num_entries, nument = 1;

    while (1) {
        if (tmp_cq != NULL) {
            while ((num_entries = ibv_poll_cq(tmp_cq, nument, &wc))) {
                on_completion(&wc);
            }
        }
        if (ibv_get_cq_event(s_ctx->comp_channel, &tmp_cq, &tmp_ctx)) {
        }
        ibv_ack_cq_events(tmp_cq, 1);
        if (ibv_req_notify_cq(tmp_cq, 0) > 0) {
        }
    }

    return 0;
}
int CompletionChannel::retrieveEvents(ibv_cq** cq) {
    void* context;
    return ibv_get_cq_event(mChannel, cq, &context);
}
/* Proxy-in service - RX thread
 *
 *  <- Work request in (RW_imm - WR idata), remote initiated RW
 *  <- Work completion in (RW_imm - WC idata), local initiated RW
 */
void m_rcv_event(struct mcm_cq *m_cq, int *events)
{
    struct ibv_wc wc[mcm_wrc_max];
    struct ibv_cq *ib_cq;
    struct mcm_qp *m_qp;
    void *cq_ctx;
    int i, wc_cnt, ret, err = 0, notify = 0;

    ret = ibv_get_cq_event(m_cq->ib_ch, &ib_cq, (void *)&cq_ctx);
    if (ret == 0)
        ibv_ack_cq_events(m_cq->ib_cq, 1);

    wc_cnt = 0;
retry:
    if (wc_cnt >= mcm_wrc_max) {
        if (wc[0].status == 0)
            mlog(0x10, " m_cq %p processed max %d, exit\n", m_cq, wc_cnt);
        *events += 1; /* pending */
        return;
    }

    ret = ibv_poll_cq(m_cq->ib_cq, mcm_wrc_max, wc);
    if (ret <= 0) {
        if (!ret && !notify) {
            ibv_req_notify_cq(m_cq->ib_cq, 0);
            notify = 1;
            goto retry;
        }
        return;
    } else
        notify = 0;

    wc_cnt += ret;

    for (i = 0; i < ret; i++) {
        m_qp = (struct mcm_qp *)wc[i].wr_id;

        mlog(0x40, " wr_id[%d of %d] m_qp %p\n", i + 1, ret, m_qp);
        mlog(0x40, " ib_wc: st %d, vn %x idata %x op %x wr_id %Lx\n",
             wc[i].status, wc[i].vendor_err, ntohl(wc[i].imm_data),
             wc[i].opcode, wc[i].wr_id);

        if (wc[i].status != IBV_WC_SUCCESS) {
            if (wc[i].status != IBV_WC_WR_FLUSH_ERR)
                mlog(0, " DTO ERR: st %d, vn %x idata %x qstate 0x%x\n",
                     wc[i].status, wc[i].vendor_err,
                     ntohl(wc[i].imm_data), m_qp->ib_qp2->state);
            continue;
        }

        if (m_qp->cm && (m_qp->cm->state == MCM_DISCONNECTED)) {
            mlog(1, " WARN: RX data on DISC m_qp %p qp1 %p qp2 %p %s\n",
                 m_qp, m_qp->ib_qp1, m_qp->ib_qp2,
                 mcm_state_str(m_qp->cm->state));
            continue;
        }

        if (wc[i].opcode == IBV_WC_RECV_RDMA_WITH_IMM) {
            struct ibv_recv_wr r_wr, *r_err;
            wrc_idata_t wrc;
            struct ibv_qp *ib_qp;

            wrc.id = WRC_ID_DATA(ntohl(wc[i].imm_data));
            wrc.type = WRC_TYPE_DATA(ntohl(wc[i].imm_data));
            wrc.flags = WRC_FLAGS_DATA(ntohl(wc[i].imm_data));

            /* process WR or WC */
            m_pi_rcv_event(m_qp, &wrc);

            /* re-post message */
            r_wr.next = NULL;
            r_wr.sg_list = NULL;
            r_wr.num_sge = 0;
            r_wr.wr_id = (uint64_t)(uintptr_t) m_qp;

            /* MXS -> MSS or HST, PI service will be on QP1 */
            if (MXS_EP(&m_qp->smd->md->addr) &&
                (MSS_EP(&m_qp->cm->msg.daddr1) || HST_EP(&m_qp->cm->msg.daddr1)))
                ib_qp = m_qp->ib_qp1;
            else
                ib_qp = m_qp->ib_qp2;

            errno = 0;
            if (ib_qp) {
                err = ibv_post_recv(ib_qp, &r_wr, &r_err);
                if (err) {
                    mlog(0, "ERR: qp %p (%s) qpn %x ibv_post_recv ret = %d %s\n",
                         m_qp,
                         (MXS_EP(&m_qp->smd->md->addr) &&
                          MSS_EP(&m_qp->cm->msg.daddr1)) ? "QP1" : "QP2",
                         m_qp->ib_qp2 ? m_qp->ib_qp2->qp_num : m_qp->ib_qp1->qp_num,
                         ret, strerror(errno));
                }
            }
            MCNTR(m_qp->smd->md, MCM_QP_RECV);
        } else {
            mlog(0, "ERR: unexpected WC opcode = %d on m_qp %p\n",
                 wc[i].opcode, m_qp);
        }
    }
    goto retry;
}
static void ibw_event_handler_verbs(struct tevent_context *ev,
                                    struct tevent_fd *fde,
                                    uint16_t flags, void *private_data)
{
    struct ibw_conn *conn = talloc_get_type(private_data, struct ibw_conn);
    struct ibw_conn_priv *pconn = talloc_get_type(conn->internal,
                                                  struct ibw_conn_priv);
    struct ibw_ctx_priv *pctx = talloc_get_type(conn->ctx->internal,
                                                struct ibw_ctx_priv);
    struct ibv_wc wc;
    int rc;
    struct ibv_cq *ev_cq;
    void *ev_ctx;

    DEBUG(DEBUG_DEBUG, ("ibw_event_handler_verbs(%u)\n", (uint32_t)flags));

    /* TODO: check whether it's good to have more channels here... */
    rc = ibv_get_cq_event(pconn->verbs_channel, &ev_cq, &ev_ctx);
    if (rc) {
        sprintf(ibw_lasterr, "Failed to get cq_event with %d\n", rc);
        goto error;
    }
    if (ev_cq != pconn->cq) {
        sprintf(ibw_lasterr, "ev_cq(%p) != pconn->cq(%p)\n", ev_cq, pconn->cq);
        goto error;
    }
    rc = ibv_req_notify_cq(pconn->cq, 0);
    if (rc) {
        sprintf(ibw_lasterr, "Couldn't request CQ notification (%d)\n", rc);
        goto error;
    }

    while ((rc = ibv_poll_cq(pconn->cq, 1, &wc)) == 1) {
        if (wc.status) {
            sprintf(ibw_lasterr, "cq completion failed status=%d, opcode=%d, rc=%d\n",
                    wc.status, wc.opcode, rc);
            goto error;
        }

        switch (wc.opcode) {
        case IBV_WC_SEND:
            DEBUG(DEBUG_DEBUG, ("send completion\n"));
            if (ibw_wc_send(conn, &wc))
                goto error;
            break;
        case IBV_WC_RDMA_WRITE:
            DEBUG(DEBUG_DEBUG, ("rdma write completion\n"));
            break;
        case IBV_WC_RDMA_READ:
            DEBUG(DEBUG_DEBUG, ("rdma read completion\n"));
            break;
        case IBV_WC_RECV:
            DEBUG(DEBUG_DEBUG, ("recv completion\n"));
            if (ibw_wc_recv(conn, &wc))
                goto error;
            break;
        default:
            sprintf(ibw_lasterr, "unknown completion %d\n", wc.opcode);
            goto error;
        }
    }
    if (rc != 0) {
        sprintf(ibw_lasterr, "ibv_poll_cq error %d\n", rc);
        goto error;
    }

    ibv_ack_cq_events(pconn->cq, 1);
    return;

error:
    ibv_ack_cq_events(pconn->cq, 1);
    DEBUG(DEBUG_ERR, (ibw_lasterr));

    if (conn->state != IBWC_ERROR) {
        conn->state = IBWC_ERROR;
        pctx->connstate_func(NULL, conn);
    }
}
//static void* poll_cq(struct RDMA_communicator* comm)
static void *poll_cq(struct poll_cq_args *args)
{
    struct ibv_cq *cq;
    struct ibv_wc wc;
    struct connection *conn;
    struct RDMA_communicator *comm;
    // struct RDMA_message *msg;
    struct control_msg cmsg;
    void *ctx;
    char *buff;
    uint64_t buff_size;
    int tag;
    uint64_t mr_size = 0;
    uint64_t sent_size = 0;
    char *send_base_addr;
    int *flag = args->flag;

    comm = args->comm;
    buff = args->msg->buff;
    send_base_addr = args->msg->buff;
    buff_size = args->msg->size;
    tag = args->msg->tag;

    cmsg.type = MR_INIT;
    cmsg.data1.buff_size = buff_size;
    send_control_msg(comm->cm_id->context, &cmsg);
    post_receives(comm->cm_id->context);

    while (1) {
        TEST_NZ(ibv_get_cq_event(s_ctx->comp_channel, &cq, &ctx));
        ibv_ack_cq_events(cq, 1);
        TEST_NZ(ibv_req_notify_cq(cq, 0));

        while (ibv_poll_cq(cq, 1, &wc)) {
            conn = (struct connection *)(uintptr_t)wc.wr_id;

            if (wc.status != IBV_WC_SUCCESS) {
                die("on_completion: status is not IBV_WC_SUCCESS.");
            }

            if (wc.opcode == IBV_WC_RECV) {
                switch (conn->recv_msg->type) {
                case MR_INIT_ACK:
                case MR_CHUNK_ACK:
                    debug(printf("Received: Type=%d\n", conn->recv_msg->type), 1);
                    if (sent_size == buff_size) {
                        /* sent all data */
                        cmsg.type = MR_FIN;
                        cmsg.data1.tag = tag;
                    } else {
                        /* not sent all data yet */
                        if (sent_size + RDMA_BUF_SIZE_C > buff_size) {
                            mr_size = buff_size - sent_size;
                        } else {
                            mr_size = RDMA_BUF_SIZE_C;
                        }
                        debug(printf("mr_size=%lu\n", mr_size), 1);
                        // printf("%s\n", send_base_addr);
                        register_rdma_region(conn, send_base_addr, mr_size);
                        send_base_addr += mr_size;
                        sent_size += mr_size;
                        cmsg.type = MR_CHUNK;
                        cmsg.data1.mr_size = mr_size;
                        memcpy(&cmsg.data.mr, conn->rdma_msg_mr,
                               sizeof(struct ibv_mr));
                        // cmsg.data.mr = conn->rdma_msg_mr;
                    }
                    break;
                case MR_FIN_ACK:
                    debug(printf("Received: Type=%d\n", conn->recv_msg->type), 1);
                    *flag = 1;
                    // rdma_disconnect(comm->cm_id);
                    // rdma_disconnect(conn->id);
                    // exit(0);
                    return NULL;
                default:
                    debug(printf("Unknown TYPE\n"), 1);
                    return NULL;
                }
                send_control_msg(conn, &cmsg);
                post_receives(conn);
            } else if (wc.opcode == IBV_WC_SEND) {
                debug(printf("Sent: TYPE=%d\n", conn->send_msg->type), 1);
            } else {
                die("unknown opcode.");
            }
        }
    }
    return NULL;
}
int main(int argc, char *argv[])
{
    struct pdata rep_pdata;

    struct rdma_event_channel *cm_channel;
    struct rdma_cm_id *listen_id;
    struct rdma_cm_id *cm_id;
    struct rdma_cm_event *event;
    struct rdma_conn_param conn_param = { };

    struct ibv_pd *pd;
    struct ibv_comp_channel *comp_chan;
    struct ibv_cq *cq;
    struct ibv_cq *evt_cq;
    struct ibv_mr *mr;
    struct ibv_qp_init_attr qp_attr = { };
    struct ibv_sge sge;
    struct ibv_send_wr send_wr = { };
    struct ibv_send_wr *bad_send_wr;
    struct ibv_recv_wr recv_wr = { };
    struct ibv_recv_wr *bad_recv_wr;
    struct ibv_wc wc;
    void *cq_context;

    struct sockaddr_in sin;

    uint32_t *buf;
    int err;

    /* Set up RDMA CM structures */
    cm_channel = rdma_create_event_channel();
    if (!cm_channel)
        return 1;

    err = rdma_create_id(cm_channel, &listen_id, NULL, RDMA_PS_TCP);
    if (err)
        return err;

    sin.sin_family = AF_INET;
    sin.sin_port = htons(20079);
    sin.sin_addr.s_addr = INADDR_ANY;

    /* Bind to local port and listen for connection request */
    err = rdma_bind_addr(listen_id, (struct sockaddr *) &sin);
    if (err)
        return 1;

    err = rdma_listen(listen_id, 1);
    if (err)
        return 1;

    err = rdma_get_cm_event(cm_channel, &event);
    if (err)
        return err;
    printf("after get_cm_event\n");

    if (event->event != RDMA_CM_EVENT_CONNECT_REQUEST)
        return 1;

    cm_id = event->id;

    rdma_ack_cm_event(event);

    /* Create verbs objects now that we know which device to use */
    pd = ibv_alloc_pd(cm_id->verbs);
    if (!pd)
        return 1;

    comp_chan = ibv_create_comp_channel(cm_id->verbs);
    if (!comp_chan)
        return 1;

    cq = ibv_create_cq(cm_id->verbs, 2, NULL, comp_chan, 0);
    if (!cq)
        return 1;

    if (ibv_req_notify_cq(cq, 0))
        return 1;

    buf = calloc(2, sizeof(uint32_t));
    if (!buf)
        return 1;

    mr = ibv_reg_mr(pd, buf, 2 * sizeof(uint32_t),
                    IBV_ACCESS_LOCAL_WRITE |
                    IBV_ACCESS_REMOTE_READ |
                    IBV_ACCESS_REMOTE_WRITE);
    if (!mr)
        return 1;

    qp_attr.cap.max_send_wr = 1;
    qp_attr.cap.max_send_sge = 1;
    qp_attr.cap.max_recv_wr = 1;
    qp_attr.cap.max_recv_sge = 1;

    qp_attr.send_cq = cq;
    qp_attr.recv_cq = cq;

    qp_attr.qp_type = IBV_QPT_RC;

    err = rdma_create_qp(cm_id, pd, &qp_attr);
    if (err)
        return err;

    /* Post receive before accepting connection */
    sge.addr = (uintptr_t) buf + sizeof(uint32_t);
    sge.length = sizeof(uint32_t);
    sge.lkey = mr->lkey;

    recv_wr.sg_list = &sge;
    recv_wr.num_sge = 1;

    if (ibv_post_recv(cm_id->qp, &recv_wr, &bad_recv_wr))
        return 1;

    rep_pdata.buf_va = htonll((uintptr_t) buf);
    rep_pdata.buf_rkey = htonl(mr->rkey);

    conn_param.responder_resources = 1;
    conn_param.private_data = &rep_pdata;
    conn_param.private_data_len = sizeof rep_pdata;

    /* Accept connection */
    printf("before accept\n");
    err = rdma_accept(cm_id, &conn_param);
    if (err)
        return 1;
    printf("after accept\n");

    err = rdma_get_cm_event(cm_channel, &event);
    if (err)
        return err;

    if (event->event != RDMA_CM_EVENT_ESTABLISHED)
        return 1;

    rdma_ack_cm_event(event);

    /* Wait for receive completion */
    if (ibv_get_cq_event(comp_chan, &evt_cq, &cq_context))
        return 1;

    if (ibv_req_notify_cq(cq, 0))
        return 1;

    if (ibv_poll_cq(cq, 1, &wc) < 1)
        return 1;

    if (wc.status != IBV_WC_SUCCESS)
        return 1;

    /* Add two integers and send reply back */
    buf[0] = htonl(ntohl(buf[0]) + ntohl(buf[1]));

    sge.addr = (uintptr_t) buf;
    sge.length = sizeof(uint32_t);
    sge.lkey = mr->lkey;

    send_wr.opcode = IBV_WR_SEND;
    send_wr.send_flags = IBV_SEND_SIGNALED;
    send_wr.sg_list = &sge;
    send_wr.num_sge = 1;

    if (ibv_post_send(cm_id->qp, &send_wr, &bad_send_wr))
        return 1;

    /* Wait for send completion */
    if (ibv_get_cq_event(comp_chan, &evt_cq, &cq_context))
        return 1;

    if (ibv_poll_cq(cq, 1, &wc) < 1)
        return 1;

    if (wc.status != IBV_WC_SUCCESS)
        return 1;

    printf("before ack cq 2\n");
    ibv_ack_cq_events(cq, 2);

    return 0;
}
gaspi_return_t
pgaspi_passive_receive (const gaspi_segment_id_t segment_id_local,
                        const gaspi_offset_t offset_local,
                        gaspi_rank_t * const rem_rank, const gaspi_size_t size,
                        const gaspi_timeout_t timeout_ms)
{
#ifdef DEBUG
  if (glb_gaspi_ctx_ib.rrmd[segment_id_local] == NULL)
    {
      gaspi_printf ("Debug: Invalid local segment (gaspi_passive_receive)\n");
      return GASPI_ERROR;
    }

  if (rem_rank == NULL)
    {
      gaspi_printf ("Debug: Invalid pointer parameter: rem_rank (gaspi_passive_receive)\n");
      return GASPI_ERROR;
    }

  if (offset_local > glb_gaspi_ctx_ib.rrmd[segment_id_local][glb_gaspi_ctx.rank].size)
    {
      gaspi_printf ("Debug: Invalid offsets (gaspi_passive_receive)\n");
      return GASPI_ERROR;
    }

  if (size < 1 || size > GASPI_MAX_TSIZE_P)
    {
      gaspi_printf ("Debug: Invalid size (gaspi_passive_receive)\n");
      return GASPI_ERROR;
    }
#endif

  struct ibv_recv_wr *bad_wr;
  struct ibv_wc wc_recv;
  struct ibv_sge rlist;
  struct ibv_recv_wr rwr;
  struct ibv_cq *ev_cq;
  void *ev_ctx;
  int i;
  fd_set rfds;
  struct timeval tout;

  lock_gaspi_tout (&glb_gaspi_ctx.lockPR, timeout_ms);

  rlist.addr =
    (uintptr_t) (glb_gaspi_ctx_ib.rrmd[segment_id_local][glb_gaspi_ctx.rank].addr +
                 NOTIFY_OFFSET + offset_local);
  rlist.length = size;
  rlist.lkey =
    glb_gaspi_ctx_ib.rrmd[segment_id_local][glb_gaspi_ctx.rank].mr->lkey;

  rwr.wr_id = glb_gaspi_ctx.rank;
  rwr.sg_list = &rlist;
  rwr.num_sge = 1;
  rwr.next = NULL;

  if (ibv_post_srq_recv (glb_gaspi_ctx_ib.srqP, &rwr, &bad_wr))
    {
      unlock_gaspi (&glb_gaspi_ctx.lockPR);
      return GASPI_ERROR;
    }

  FD_ZERO (&rfds);
  FD_SET (glb_gaspi_ctx_ib.channelP->fd, &rfds);

  const long ts = (timeout_ms / 1000);
  const long tus = (timeout_ms - ts * 1000) * 1000;

  tout.tv_sec = ts;
  tout.tv_usec = tus;

  const int selret = select (FD_SETSIZE, &rfds, NULL, NULL, &tout);
  if (selret < 0)
    {
      unlock_gaspi (&glb_gaspi_ctx.lockPR);
      return GASPI_ERROR;
    }
  else if (selret == 0)
    {
      unlock_gaspi (&glb_gaspi_ctx.lockPR);
      return GASPI_TIMEOUT;
    }

  if (ibv_get_cq_event (glb_gaspi_ctx_ib.channelP, &ev_cq, &ev_ctx))
    {
      unlock_gaspi (&glb_gaspi_ctx.lockPR);
      return GASPI_ERROR;
    }

  ibv_ack_cq_events (ev_cq, 1);

  if (ev_cq != glb_gaspi_ctx_ib.rcqP)
    {
      unlock_gaspi (&glb_gaspi_ctx.lockPR);
      return GASPI_ERROR;
    }

  if (ibv_req_notify_cq (glb_gaspi_ctx_ib.rcqP, 0))
    {
      unlock_gaspi (&glb_gaspi_ctx.lockPR);
      return GASPI_ERROR;
    }

  int ne = 0;
  do
    {
      ne = ibv_poll_cq (glb_gaspi_ctx_ib.rcqP, 1, &wc_recv);
    }
  while (ne == 0);

  if ((ne < 0) || (wc_recv.status != IBV_WC_SUCCESS))
    {
      glb_gaspi_ctx.qp_state_vec[GASPI_PASSIVE_QP][wc_recv.wr_id] = 1;
      unlock_gaspi (&glb_gaspi_ctx.lockPR);
      return GASPI_ERROR;
    }

  *rem_rank = 0xffff;
  for (i = 0; i < glb_gaspi_ctx.tnc; i++)
    {
      if (glb_gaspi_ctx_ib.qpP[i]->qp_num == wc_recv.qp_num)
        {
          *rem_rank = i;
          break;
        }
    }

  unlock_gaspi (&glb_gaspi_ctx.lockPR);

  return GASPI_SUCCESS;
}