/**
 * Post a receive work request for `buf` on this socket's connection.
 *
 * `buf.addr` is passed both as the work-request context and as the receive
 * address, `buf.size` as its length, and `verbs_mr` as the memory region.
 *
 * This call does not return on failure: it reports the error, releases the
 * memory region, the verbs buffer and the endpoint, and terminates the
 * process with exit status 1.
 */
void RDMACMSocket::post_recv(const Buffer& buf) {
    if (rdma_post_recv(this->client_id, buf.addr, buf.addr, buf.size, this->verbs_mr) < 0) {
        // Report before cleaning up: the teardown calls below may overwrite
        // errno, which would make perror() print an unrelated reason.
        perror("rdma_post_recv");
        rdma_dereg_mr(this->verbs_mr);
        this->verbs_buf.free();
        rdma_destroy_ep(this->client_id);
        exit(1);
    }
}
static int connect_client (struct rdma_cm_id *client) { if (!client) return -1; if ( -1 == kiro_attach_qp (client)) { g_critical ("Could not create a QP for the new connection"); rdma_destroy_id (client); return -1; } struct kiro_connection_context *ctx = (struct kiro_connection_context *)g_try_malloc0 (sizeof (struct kiro_connection_context)); if (!ctx) { g_critical ("Failed to create connection context"); rdma_destroy_id (client); return -1; } ctx->cf_mr_recv = kiro_create_rdma_memory (client->pd, sizeof (struct kiro_ctrl_msg), IBV_ACCESS_LOCAL_WRITE); ctx->cf_mr_send = kiro_create_rdma_memory (client->pd, sizeof (struct kiro_ctrl_msg), IBV_ACCESS_LOCAL_WRITE); if (!ctx->cf_mr_recv || !ctx->cf_mr_send) { g_critical ("Failed to register control message memory"); goto error; } ctx->cf_mr_recv->size = ctx->cf_mr_send->size = sizeof (struct kiro_ctrl_msg); client->context = ctx; if (rdma_post_recv (client, client, ctx->cf_mr_recv->mem, ctx->cf_mr_recv->size, ctx->cf_mr_recv->mr)) { g_critical ("Posting preemtive receive for connection failed: %s", strerror (errno)); goto error; } if (rdma_accept (client, NULL)) { g_warning ("Failed to establish connection to the client: %s", strerror (errno)); goto error; } g_debug ("Client connection setup successfull"); return 0; error: rdma_reject (client, NULL, 0); kiro_destroy_connection_context (&ctx); rdma_destroy_id (client); return -1; }
/** * \brief Create an RDMA transport server * * \param cmid The CM id passed up in the connect event * \param q_depth A hint from the client on the depth of it's SQ/RQ * \param msize The max message size * \returns A pointer to the newly allocated transport */ Nptrans * np_rdmatrans_create(struct rdma_cm_id *cmid, int q_depth, int msize) { int i, ret; u8 *p; struct Nptrans *trans; struct Rdmatrans *rdma; struct ibv_qp_init_attr qp_attr; struct rdma_conn_param cparam; rdma = calloc(1, sizeof *rdma); if (!rdma) goto error; ret = pthread_mutex_init(&rdma->lock, NULL); if (ret) goto error; ret = pthread_cond_init(&rdma->cond, NULL); if (ret) goto error; rdma->connected = 0; rdma->cm_id = cmid; rdma->context = cmid->verbs; rdma->q_depth = q_depth; rdma->msize = msize + sizeof(Rdmactx); rdma->pd = ibv_alloc_pd(rdma->context); if (!rdma->pd) goto error; /* Create receive buffer space and register it */ rdma->rcv_buf = malloc(rdma->msize * q_depth); if (!rdma->rcv_buf) goto error; rdma->rcv_mr = ibv_reg_mr(rdma->pd, rdma->rcv_buf, rdma->msize * q_depth, IBV_ACCESS_LOCAL_WRITE); if (!rdma->rcv_mr) goto error; /* Create send buffer space and register it */ rdma->snd_buf = malloc(rdma->msize * q_depth); if (!rdma->snd_buf) goto error; rdma->next_buf = 0; rdma->snd_mr = ibv_reg_mr(rdma->pd, rdma->snd_buf, rdma->msize * q_depth, 0); if (!rdma->snd_mr) goto error; rdma->ch = ibv_create_comp_channel(rdma->context); if (!rdma->ch) goto error; rdma->fd = rdma->ch->fd; rdma->cq = ibv_create_cq(rdma->context, 2*q_depth, rdma, rdma->ch, 0); if (!rdma->cq) goto error; ibv_req_notify_cq(rdma->cq, 0); /* Create the CQ */ memset(&qp_attr, 0, sizeof qp_attr); qp_attr.send_cq = rdma->cq; qp_attr.recv_cq = rdma->cq; qp_attr.cap.max_send_wr = q_depth; qp_attr.cap.max_recv_wr = q_depth; qp_attr.cap.max_send_sge = 1; qp_attr.cap.max_send_sge = 1; qp_attr.cap.max_recv_sge = 1; qp_attr.cap.max_inline_data = 64; qp_attr.qp_type = IBV_QPT_RC; ret = rdma_create_qp(rdma->cm_id, rdma->pd, &qp_attr); 
if (ret) goto error; rdma->qp = rdma->cm_id->qp; p = rdma->rcv_buf; for (i = 0; i < q_depth; i++) rdma_post_recv(rdma, (Rdmactx *)(p + i*rdma->msize)); trans = np_trans_create(rdma, rdma_trans_recv, rdma_trans_send, rdma_trans_destroy); if (!trans) goto error; rdma->trans = trans; memset(&cparam, 0, sizeof(cparam)); cparam.responder_resources = 1; cparam.initiator_depth = 1; cparam.private_data = NULL; cparam.private_data_len = 0; ret = rdma_accept(cmid, &cparam); if (ret) { np_uerror(ret); goto error; } rdma->connected = 1; return trans; error: if (rdma) rdma_trans_destroy(rdma); rdma_reject(cmid, NULL, 0); return NULL; }
/*
 * Receive callback of the RDMA transport.
 *
 * Copies up to msize bytes from the oldest queued receive context into a
 * freshly allocated Npfcall. When nothing is queued it blocks on the
 * completion channel, drains the CQ, queues the first successful receive
 * completion and retries. Send completions mark their context unused and
 * signal rdma->cond.
 *
 * fcp:   out parameter, set to the new Npfcall on success only
 * msize: size passed to np_alloc_fcall() and upper bound of the copy
 * a:     the Rdmatrans this callback was registered with
 *
 * Returns 0 on success. Returns -1 if the Npfcall cannot be allocated, if
 * waiting on the completion channel fails, if polling the CQ fails, or once
 * a completion with a non-success status was seen and no further receive
 * completion is available.
 */
static int
rdma_trans_recv(Npfcall **fcp, u32 msize, void *a)
{
	int n, ret, closing = 0;
	struct ibv_cq *cq;
	struct ibv_wc wc;
	void *context;
	Rdmatrans *rdma = (Rdmatrans *)a;
	Rdmactx *ctx;
	Npfcall *fc = NULL;

	if (!(fc = np_alloc_fcall (msize))) {
		np_uerror(ENOMEM);
		return -1;
	}

	pthread_mutex_lock(&rdma->lock);
again:
	/* rdma->lock is held whenever control reaches this label */
	if (rdma->rfirst) {
		ctx = rdma->rfirst;
		n = ctx->len - ctx->pos;
		if (n > msize)
			n = msize;

		memmove(fc->pkt, ctx->buf + ctx->pos, n);
		ctx->pos += n;
		if (ctx->pos == ctx->len) {
			/* Fully consumed: unlink and hand the buffer back */
			rdma->rfirst = ctx->next;
			if (ctx == rdma->rlast)
				rdma->rlast = NULL;

			rdma_post_recv(rdma, ctx);
		}
		pthread_mutex_unlock(&rdma->lock);

		fc->size = n;
		*fcp = fc;
		return 0;
	}
	pthread_mutex_unlock(&rdma->lock);

poll:
	ret = ibv_get_cq_event(rdma->ch, &cq, &context);
	if (ret) {
		np_uerror(ret);
		/* fc was never handed to the caller; release it here.
		 * NOTE(review): assumes np_alloc_fcall() memory is released
		 * with free() - confirm against its definition. */
		free(fc);
		return -1;
	}

	ibv_ack_cq_events(rdma->cq, 1);
	ibv_req_notify_cq(cq, 0);

	while ((ret = ibv_poll_cq(rdma->cq, 1, &wc)) > 0) {
		/* Check if it's a flush */
		if (wc.status != IBV_WC_SUCCESS) {
			closing = 1;
			continue;
		}

		if (wc.opcode == IBV_WC_RECV) {
			ctx = (Rdmactx *) wc.wr_id;
			pthread_mutex_lock(&rdma->lock);
			ctx->used = 0;
			ctx->len = wc.byte_len;
			ctx->pos = 0;
			/* Append to the tail of the received-context queue */
			if (rdma->rlast)
				rdma->rlast->next = ctx;
			else
				rdma->rfirst = ctx;

			rdma->rlast = ctx;
			ctx->next = NULL;
			/* Lock stays held, as the 'again' label expects */
			goto again;
		} else if (wc.opcode == IBV_WC_SEND) {
			ctx = (Rdmactx *) wc.wr_id;
			pthread_mutex_lock(&rdma->lock);
			ctx->used = 0;
			pthread_cond_signal(&rdma->cond);
			pthread_mutex_unlock(&rdma->lock);
		}
	}

	/* CQ drained without error and no failed completion: wait again */
	if (!ret && !closing)
		goto poll;

	np_uerror(ret);
	/* Same ownership rule as above: nothing was returned through fcp */
	free(fc);
	return -1;
}
/*
 * GIOChannel watch callback that handles one receive completion of a client
 * connection.
 *
 * 'data' is a GList node whose payload is the struct kiro_client_connection
 * to serve. The handler polls one completion from the connection's receive
 * CQ, acknowledges the matching CQ event, dispatches on the type of the
 * control message in the receive buffer, re-posts a receive and re-arms CQ
 * notification.
 *
 * The whole handler runs under the 'rdma_handling' lock; if the lock is
 * already taken the call returns immediately. Always returns TRUE.
 */
static gboolean
process_rdma_event (GIOChannel *source, GIOCondition condition, gpointer data)
{
    // 'source' is not needed right now; (void)-ing it silences the
    // unused-parameter warning
    (void) source;

    if (!G_TRYLOCK (rdma_handling)) {
        g_debug ("RDMA handling will wait for the next dispatch.");
        return TRUE;
    }

    g_debug ("Got message on condition: %i", condition);
    void *payload = ((GList *)data)->data;
    struct kiro_client_connection *cc = (struct kiro_client_connection *)payload;
    struct ibv_wc wc;

    gint num_comp = ibv_poll_cq (cc->conn->recv_cq, 1, &wc);

    if (!num_comp) {
        g_critical ("RDMA event handling was triggered, but there is no completion on the queue");
        goto end_rmda_eh;
    }

    if (num_comp < 0) {
        g_critical ("Failure getting receive completion event from the queue: %s", strerror (errno));
        goto end_rmda_eh;
    }

    g_debug ("Got %i receive events from the queue", num_comp);
    void *cq_ctx;
    struct ibv_cq *cq;
    int err = ibv_get_cq_event (cc->conn->recv_cq_channel, &cq, &cq_ctx);

    if (!err)
        ibv_ack_cq_events (cq, 1);

    struct kiro_connection_context *ctx = (struct kiro_connection_context *)cc->conn->context;
    guint type = ((struct kiro_ctrl_msg *)ctx->cf_mr_recv->mem)->msg_type;
    g_debug ("Received a message from Client %u of type %u", cc->id, type);

    switch (type) {
        case KIRO_PING: {
            // Answer a PING with a PONG from the send control buffer
            struct kiro_ctrl_msg *msg = (struct kiro_ctrl_msg *) (ctx->cf_mr_send->mem);
            msg->msg_type = KIRO_PONG;

            // NOTE(review): this treats a zero return of send_msg() as
            // failure - confirm against its definition
            if (!send_msg (cc->conn, ctx->cf_mr_send)) {
                g_warning ("Failure while trying to post PONG send: %s", strerror (errno));
                goto done;
            }
            break;
        }
        case KIRO_ACK_RDMA: {
            g_debug ("ACK received");
            if (G_TRYLOCK (realloc_timeout)) {
                g_debug ("Client %i has ACKed the reallocation request", cc->id);
                GList *client = g_list_find (realloc_list, (gpointer)cc);

                if (client) {
                    // g_list_delete_link() unlinks AND frees the list node;
                    // g_list_remove_link() would only unlink it and leak the
                    // node on every ACK.
                    realloc_list = g_list_delete_link (realloc_list, client);

                    if (cc->backup_mri->mr)
                        ibv_dereg_mr (cc->backup_mri->mr);

                    g_free (cc->backup_mri);
                    cc->backup_mri = NULL;
                    g_debug ("Client %i removed from realloc_list", cc->id);
                }

                G_UNLOCK (realloc_timeout);
            }
            break;
        }
        default:
            g_debug ("Message Type is unknow. Ignoring...");
    }

done:
    // Post a generic receive in order to stay responsive to any messages
    // from the client
    if (rdma_post_recv (cc->conn, cc->conn, ctx->cf_mr_recv->mem, ctx->cf_mr_recv->size, ctx->cf_mr_recv->mr)) {
        // TODO: Connection teardown in an event handler routine? Not a good
        // idea...
        g_critical ("Posting generic receive for event handling failed: %s", strerror (errno));
        kiro_destroy_connection_context (&ctx);
        rdma_destroy_ep (cc->conn);
        goto end_rmda_eh;
    }

    ibv_req_notify_cq (cc->conn->recv_cq, 0); // Make the respective Queue push events onto the channel
    g_debug ("Finished RDMA event handling");

end_rmda_eh:
    G_UNLOCK (rdma_handling);
    return TRUE;
}
/*
 * Passive-side test over a reliable connection.
 *
 * Listens on the file-scope 'port', takes one connection request, posts a
 * receive for 'recv_msg', accepts, waits for the receive completion, sends
 * 'send_msg' inline and waits for the send completion.
 *
 * Uses the file-scope 'listen_id', 'id' and 'mr'. Every resource acquired so
 * far is released on all exit paths, including failures.
 *
 * Returns 0 on success and a non-zero value on failure.
 */
static int rc_test(void)
{
	struct rdma_addrinfo hints, *res;
	struct ibv_qp_init_attr attr;
	struct ibv_wc wc;
	int ret;

	memset(&hints, 0, sizeof hints);
	hints.ai_flags = RAI_PASSIVE;
	hints.ai_port_space = RDMA_PS_TCP;
	ret = rdma_getaddrinfo(NULL, port, &hints, &res);
	if (ret) {
		printf("rdma_getaddrinfo %d\n", errno);
		return ret;
	}

	memset(&attr, 0, sizeof attr);
	attr.cap.max_send_wr = attr.cap.max_recv_wr = 1;
	attr.cap.max_send_sge = attr.cap.max_recv_sge = 1;
	attr.cap.max_inline_data = sizeof send_msg;
	attr.sq_sig_all = 1;
	ret = rdma_create_ep(&listen_id, res, NULL, &attr);
	rdma_freeaddrinfo(res);
	if (ret) {
		printf("rdma_create_ep %d\n", errno);
		return ret;
	}

	ret = rdma_listen(listen_id, 0);
	if (ret) {
		printf("rdma_listen %d\n", errno);
		goto out_destroy_listen_ep;
	}

	ret = rdma_get_request(listen_id, &id);
	if (ret) {
		printf("rdma_get_request %d\n", errno);
		goto out_destroy_listen_ep;
	}

	mr = rdma_reg_msgs(id, recv_msg, sizeof recv_msg);
	if (!mr) {
		printf("rdma_reg_msgs %d\n", errno);
		/* ret is still 0 from the previous call; report a failure */
		ret = -1;
		goto out_destroy_accept_ep;
	}

	ret = rdma_post_recv(id, NULL, recv_msg, sizeof recv_msg, mr);
	if (ret) {
		printf("rdma_post_recv %d\n", errno);
		goto out_dereg;
	}

	ret = rdma_accept(id, NULL);
	if (ret) {
		printf("rdma_accept %d\n", errno);
		goto out_dereg;
	}

	ret = rdma_get_recv_comp(id, &wc);
	if (ret <= 0) {
		printf("rdma_get_recv_comp %d\n", ret);
		/* A count of 0 must not be mistaken for success */
		ret = -1;
		goto out_disconnect;
	}

	ret = rdma_post_send(id, NULL, send_msg, sizeof send_msg, NULL, IBV_SEND_INLINE);
	if (ret) {
		printf("rdma_post_send %d\n", errno);
		goto out_disconnect;
	}

	ret = rdma_get_send_comp(id, &wc);
	if (ret <= 0) {
		printf("rdma_get_send_comp %d\n", ret);
		ret = -1;
		goto out_disconnect;
	}

	ret = 0;

out_disconnect:
	rdma_disconnect(id);
out_dereg:
	rdma_dereg_mr(mr);
out_destroy_accept_ep:
	rdma_destroy_ep(id);
out_destroy_listen_ep:
	rdma_destroy_ep(listen_id);
	return ret;
}
static int xrc_test(void) { struct rdma_cm_id *conn_id, *lookup_id; struct ibv_qp_init_attr attr; struct rdma_conn_param param; struct rdma_cm_event *event; struct ibv_wc wc; int ret; conn_id = xrc_listen_recv(); if (!conn_id) return -1; ret = xrc_create_srq_listen(rdma_get_local_addr(conn_id), sizeof(struct sockaddr_storage)); if (ret) return -1; memset(&attr, 0, sizeof attr); attr.qp_type = IBV_QPT_XRC_RECV; attr.ext.xrc_recv.xrcd = srq_id->srq->ext.xrc.xrcd; ret = rdma_create_qp(conn_id, NULL, &attr); if (ret) { printf("Unable to create xrc recv qp %d\n", errno); return ret; } ret = rdma_accept(conn_id, NULL); if (ret) { printf("rdma_accept failed for xrc recv qp %d\n", errno); return ret; } ret = rdma_get_request(srq_id, &lookup_id); if (ret) { printf("rdma_get_request %d\n", errno); return ret; } mr = rdma_reg_msgs(srq_id, recv_msg, sizeof recv_msg); if (!mr) { printf("ibv_reg_msgs %d\n", errno); return ret; } ret = rdma_post_recv(srq_id, NULL, recv_msg, sizeof recv_msg, mr); if (ret) { printf("rdma_post_recv %d\n", errno); return ret; } memset(¶m, 0, sizeof param); param.qp_num = srq_id->srq->ext.xrc.srq_num; ret = rdma_accept(lookup_id, ¶m); if (ret) { printf("rdma_accept failed for srqn lookup %d\n", errno); return ret; } rdma_destroy_id(lookup_id); ret = rdma_get_recv_comp(srq_id, &wc); if (ret <= 0) { printf("rdma_get_recv_comp %d\n", ret); return ret; } ret = rdma_get_cm_event(conn_id->channel, &event); if (ret || event->event != RDMA_CM_EVENT_DISCONNECTED) { printf("Failed to get disconnect event\n"); return -1; } rdma_ack_cm_event(event); rdma_disconnect(conn_id); rdma_destroy_ep(conn_id); rdma_dereg_mr(mr); rdma_destroy_ep(srq_id); rdma_destroy_ep(listen_id); return 0; }
/*
 * Active-side test: connect to 'server':'port', send the first 16 bytes of
 * 'send_msg' inline and wait for a 16-byte reply in 'recv_msg'.
 *
 * The printed time is the difference between the get_dtime() readings taken
 * immediately before and immediately after rdma_post_send().
 *
 * Uses the file-scope 'id', 'mr', 's' and 'e'. Every resource acquired so far
 * is released on all exit paths, including failures.
 *
 * Returns 0 on success and a non-zero value on failure.
 */
static int run(void)
{
	struct rdma_addrinfo hints, *res;
	struct ibv_qp_init_attr attr;
	struct ibv_wc wc;
	int ret;

	memset(&hints, 0, sizeof hints);
	hints.ai_port_space = RDMA_PS_TCP;
	ret = rdma_getaddrinfo(server, port, &hints, &res);
	if (ret) {
		printf("rdma_getaddrinfo %d\n", errno);
		return ret;
	}

	memset(&attr, 0, sizeof attr);
	attr.cap.max_send_wr = attr.cap.max_recv_wr = 1;
	attr.cap.max_send_sge = attr.cap.max_recv_sge = 1;
	attr.cap.max_inline_data = 16;
	/* NOTE(review): 'id' is only assigned by rdma_create_ep() below, so
	 * this stores whatever value 'id' held before - confirm intent. */
	attr.qp_context = id;
	attr.sq_sig_all = 1;
	ret = rdma_create_ep(&id, res, NULL, &attr);
	rdma_freeaddrinfo(res);
	if (ret) {
		printf("rdma_create_ep %d\n", errno);
		return ret;
	}

	mr = rdma_reg_msgs(id, recv_msg, 16);
	if (!mr) {
		printf("rdma_reg_msgs %d\n", errno);
		/* ret is still 0 from the previous call; report a failure */
		ret = -1;
		goto out_destroy_ep;
	}

	/* Post the receive before connecting */
	ret = rdma_post_recv(id, NULL, recv_msg, 16, mr);
	if (ret) {
		printf("rdma_post_recv %d\n", errno);
		goto out_dereg;
	}

	ret = rdma_connect(id, NULL);
	if (ret) {
		printf("rdma_connect %d\n", errno);
		goto out_dereg;
	}

	s = get_dtime();
	ret = rdma_post_send(id, NULL, send_msg, 16, NULL, IBV_SEND_INLINE);
	if (ret) {
		printf("rdma_post_send %d\n", errno);
		goto out_disconnect;
	}
	e = get_dtime();

	ret = rdma_get_recv_comp(id, &wc);
	if (ret <= 0) {
		printf("rdma_get_recv_comp %d\n", ret);
		/* A count of 0 must not be mistaken for success */
		ret = -1;
		goto out_disconnect;
	}

	printf("time %f\n", e - s);
	ret = 0;

out_disconnect:
	rdma_disconnect(id);
out_dereg:
	rdma_dereg_mr(mr);
out_destroy_ep:
	rdma_destroy_ep(id);
	return ret;
}