void build_connection(struct rdma_cm_id *id) { struct connection *conn; struct ibv_qp_init_attr qp_attr; //init semaphores sem_init(&read_ops, 0, 0); sem_init(&done_ops, 0, 0); sem_init(&write_ops, 0, 1); build_context(id->verbs); build_qp_attr(&qp_attr); TEST_NZ(rdma_create_qp(id, s_ctx->pd, &qp_attr)); id->context = conn = (struct connection *)malloc(sizeof(struct connection)); conn->id = id; conn->qp = id->qp; conn->send_state = SS_INIT; conn->recv_state = RS_INIT; conn->connected = 0; register_memory(conn); post_receives(conn); }
void build_connection(struct rdma_cm_id *id) { rdma_conn_t *conn; struct ibv_qp_init_attr qp_attr; build_context(id->verbs); build_qp_attr(&qp_attr); TEST_NZ(rdma_create_qp(id, s_ctx->pd, &qp_attr)); conn = malloc(sizeof(rdma_conn_t)); id->context = conn; rdma_conn = conn; conn->id = id; conn->qp = id->qp; conn->send_state = SS_INIT; conn->recv_state = RS_INIT; conn->connected = 0; register_memory(conn); post_receives(conn); }
int on_addr_resolved(struct rdma_cm_id *id) { struct ibv_qp_init_attr qp_attr; struct connection *conn; printf("address resolved.\n"); build_context(id->verbs); build_qp_attr(&qp_attr); TEST_NZ(rdma_create_qp(id, s_ctx->pd, &qp_attr)); id->context = conn = (struct connection *)malloc(sizeof(struct connection)); conn->id = id; conn->qp = id->qp; conn->num_completions = 0; register_memory(conn); post_receives(conn); TEST_NZ(rdma_resolve_route(id, TIMEOUT_IN_MS)); return 0; }
void on_completion(struct ibv_wc *wc) { struct connection *conn = (struct connection *)(uintptr_t)wc->wr_id; if (wc->status != IBV_WC_SUCCESS) die("on_completion: status is not IBV_WC_SUCCESS."); if (wc->opcode & IBV_WC_RECV) { conn->recv_state++; printf("RECV: Recieved: TYPE=%d\n", conn->recv_msg->type); if (conn->recv_msg->type == MSG_MR) { memcpy(&conn->peer_mr, &conn->recv_msg->data.mr, sizeof(conn->peer_mr)); post_receives(conn); /* only rearm for MSG_MR */ if (conn->send_state == SS_INIT) /* received peer's MR before sending ours, so send ours back */ send_mr(conn); } } else { conn->send_state++; printf("SEND: Sent out: TYPE=%d\n", conn->send_msg->type); } if (conn->send_state == SS_MR_SENT && conn->recv_state == RS_MR_RECV) { /* struct ibv_send_wr wr, *bad_wr = NULL; struct ibv_sge sge; if (s_mode == M_WRITE) printf(" -> received MSG_MR. writing message to remote memory...\n"); else printf(" -> received MSG_MR. reading message from remote memory...\n"); memset(&wr, 0, sizeof(wr)); wr.wr_id = (uintptr_t)conn; wr.opcode = (s_mode == M_WRITE) ? IBV_WR_RDMA_WRITE : IBV_WR_RDMA_READ; wr.sg_list = &sge; wr.num_sge = 1; wr.send_flags = IBV_SEND_SIGNALED; wr.wr.rdma.remote_addr = (uintptr_t)conn->peer_mr.addr; wr.wr.rdma.rkey = conn->peer_mr.rkey; sge.addr = (uintptr_t)conn->rdma_local_region; sge.length = RDMA_BUFFER_SIZE; sge.lkey = conn->rdma_local_mr->lkey; TEST_NZ(ibv_post_send(conn->qp, &wr, &bad_wr)); printf("PSEND: Posted send request: MSG=%s\n", conn->rdma_local_region); */ conn->send_msg->type = MSG_DONE; send_message(conn); } else if (conn->send_state == SS_DONE_SENT && conn->recv_state == RS_DONE_RECV) { printf(" -> remote buffer: %s\n", get_peer_message_region(conn)); rdma_disconnect(conn->id); } printf("== STATE: send=%d / recv=%d ==\n", conn->send_state, conn->recv_state); }
static void on_completion(struct ibv_wc *wc, bool is_server) { IbvConnection *conn = (IbvConnection *)(uintptr_t)wc->wr_id; check_msgsanity(wc); switch (wc->opcode) { case IBV_WC_RECV: switch (conn->recv_msg->type) { case MSG_MR: memcpy(&conn->peer_mr, &conn->recv_msg->data.mr, sizeof(conn->peer_mr)); // server receives MR before sending ours, so send ours back. if (is_server) { rdmaSendMr(conn); } pthread_mutex_lock(&RdmaMutex); conn->rdma_state = STATE_READY; pthread_mutex_unlock(&RdmaMutex); break; default: fprintf(stderr, "received a message of unknown type.\n"); exit(EXIT_FAILURE); } post_receives(conn); // rearm for next message. break; case IBV_WC_SEND: // nop. break; case IBV_WC_RDMA_WRITE: pthread_mutex_lock(&RdmaMutex); switch (conn->rdma_state) { case STATE_READY: break; case STATE_BUSY: // kickoff_rdma_with_offset(offset, ...) done. conn->rdma_nreq_pending--; if (conn->rdma_nreq_pending == 0) { conn->rdma_state = STATE_READY; } break; default: fprintf(stderr, "invalid conn->rdma_state:%d\n", conn->rdma_state); exit(1); } pthread_mutex_unlock(&RdmaMutex); break; default: fprintf(stderr, "completion with unexpected opcode : 0x%x\n", wc->opcode); exit(1); } }
static void on_completion(struct ibv_wc *wc, bool is_server) { IbvConnection *conn = (IbvConnection *)(uintptr_t)wc->wr_id; check_msgsanity(wc); switch (wc->opcode) { case IBV_WC_RECV: switch (conn->recv_msg->type) { case MSG_MR: memcpy(&conn->peer_mr, &conn->recv_msg->data.mr, sizeof(conn->peer_mr)); // server receives MR before sending ours, so send ours back. if (is_server) { send_mr(conn); } conn->rdma_state = STATE_READY; break; case MSG_DONE: break; default: fprintf(stderr, "received a message of unknown type.\n"); exit(EXIT_FAILURE); } post_receives(conn); // rearm for next message. break; case IBV_WC_SEND: // nop. break; case IBV_WC_RDMA_WRITE: switch (conn->rdma_state) { case STATE_READY: break; case STATE_BUSY0: conn->rdma_state = STATE_BUSY1; break; case STATE_BUSY1: conn->rdma_state = STATE_READY; break; default: fprintf(stderr, "invalid conn->rdma_state:%d\n", conn->rdma_state); exit(1); } break; default: fprintf(stderr, "completion with unexpected opcode : 0x%x\n", wc->opcode); exit(1); } }
void on_completion(struct ibv_wc *wc) { struct connection *conn = (struct connection *)(uintptr_t)wc->wr_id; if (wc->status != IBV_WC_SUCCESS){ if (wc->opcode == IBV_WC_RDMA_WRITE) printf("IBV_WC_RDMA_WRITE failed!\n"); die("on_completion: status is not IBV_WC_SUCCESS."); } if (wc->opcode & IBV_WC_RECV) { if ((conn->recv_state == RS_INIT) && (conn->recv_msg->type == MSG_MR)) { //RECV Server MR memcpy(&conn->peer_mr, &conn->recv_msg->data.mr, sizeof(conn->peer_mr)); post_receives(conn); /* only rearm for MSG_MR */ conn->recv_state = RS_MR_RECV; if (conn->send_state == SS_INIT && conn->recv_state == RS_MR_RECV) { sem_post(&init_wait); //end of init } }else if (conn->recv_state == RS_MR_RECV){ //RECV Server Done msg conn->recv_state = RS_DONE_RECV; } }else { //Send completion if ((conn->send_state == SS_INIT) && (conn->recv_state == RS_MR_RECV)) { if (wc->opcode == IBV_WC_RDMA_READ){ time_stamp(7); sem_post(&read_ops); } if (wc->opcode == IBV_WC_SEND){ //Send Done msg conn->send_state = SS_DONE_SENT; sem_post(&done_ops); } if (wc->opcode == IBV_WC_RDMA_WRITE){ sem_post(&write_ops); time_stamp(3); } } } if ((conn->send_state == SS_DONE_SENT) && (conn->recv_state == RS_DONE_RECV)) { //printf("remote buffer: %s\n", get_local_message_region(conn)); rdma_disconnect(conn->id); } }
void build_connection(struct rdma_cm_id *id) { IbvConnection *conn; struct ibv_qp_init_attr qp_attr; id->context = conn = (IbvConnection *)malloc(sizeof(IbvConnection)); build_verbs(conn, id->verbs); build_qp_attr(conn, &qp_attr); TEST_NZ(rdma_create_qp(id, conn->pd, &qp_attr)); conn->id = id; conn->qp = id->qp; conn->connected = 0; register_memory(conn); post_receives(conn); }
int on_connect_request(struct rdma_cm_id *id) { struct ibv_qp_init_attr qp_attr; struct rdma_conn_param cm_params; struct connection *conn; printf("received connection request.\n"); build_context(id->verbs); build_qp_attr(&qp_attr); TEST_NZ(rdma_create_qp(id, s_ctx->pd, &qp_attr)); id->context = conn = (struct connection *)malloc(sizeof(struct connection)); conn->qp = id->qp; register_memory(conn); post_receives(conn); memset(&cm_params, 0, sizeof(cm_params)); TEST_NZ(rdma_accept(id, &cm_params)); return 0; }
void on_completion(struct ibv_wc *wc) { struct connection *conn = (struct connection *)(uintptr_t)wc->wr_id; if (wc->status != IBV_WC_SUCCESS) die("on_completion: status is not IBV_WC_SUCCESS."); if (wc->opcode & IBV_WC_RECV) { conn->recv_state++; if (conn->recv_msg->type == MSG_MR) { memcpy(&conn->peer_mr, &conn->recv_msg->data.mr, sizeof(conn->peer_mr)); post_receives(conn); /* only rearm for MSG_MR */ if (conn->send_state == SS_INIT) /* received peer's MR before sending ours, so send ours back */ send_mr(conn); } } else { conn->send_state++; printf("send completed successfully.\n"); } if (conn->send_state == SS_MR_SENT && conn->recv_state == RS_MR_RECV) { struct ibv_send_wr wr, *bad_wr = NULL; struct ibv_sge sge; if (s_mode == M_WRITE) printf("received MSG_MR. writing message to remote memory...\n"); else printf("received MSG_MR. reading message from remote memory...\n"); memset(&wr, 0, sizeof(wr)); wr.wr_id = (uintptr_t)conn; wr.opcode = (s_mode == M_WRITE) ? IBV_WR_RDMA_WRITE : IBV_WR_RDMA_READ; wr.sg_list = &sge; wr.num_sge = 1; wr.send_flags = IBV_SEND_SIGNALED; wr.wr.rdma.remote_addr = (uintptr_t)conn->peer_mr.addr; wr.wr.rdma.rkey = conn->peer_mr.rkey; sge.addr = (uintptr_t)conn->rdma_local_region; sge.length = RDMA_BUFFER_SIZE; sge.lkey = conn->rdma_local_mr->lkey; /* CODE TO MESS UP PSN */ srand48(getpid()); struct ibv_qp_attr attr; attr.sq_psn = lrand48() & 0xffffff; attr.rq_psn = lrand48() & 0xffffff; if (ibv_modify_qp(conn->qp, &attr, IBV_QP_RQ_PSN | IBV_QP_SQ_PSN)) { fprintf(stderr, "Failed to set the PSN."); return; } /* END CODE TO MESS UP PSN */ TEST_NZ(ibv_post_send(conn->qp, &wr, &bad_wr)); conn->send_msg->type = MSG_DONE; send_message(conn); } else if (conn->send_state == SS_DONE_SENT && conn->recv_state == RS_DONE_RECV) { printf("remote buffer: %s\n", get_peer_message_region(conn)); rdma_disconnect(conn->id); } }
//static void* poll_cq(struct RDMA_communicator* comm) static void* poll_cq(struct poll_cq_args* args) { struct ibv_cq *cq; struct ibv_wc wc; struct connection *conn; struct RDMA_communicator *comm; // struct RDMA_message *msg; struct control_msg cmsg; void* ctx; char* buff; uint64_t buff_size; int tag; uint64_t mr_size=0; uint64_t sent_size=0; char* send_base_addr; int* flag = args->flag; comm= args->comm; buff= args->msg->buff; send_base_addr = args->msg->buff;; buff_size= args->msg->size; tag= args->msg->tag; cmsg.type=MR_INIT; cmsg.data1.buff_size=buff_size; send_control_msg(comm->cm_id->context, &cmsg); post_receives(comm->cm_id->context); while (1) { TEST_NZ(ibv_get_cq_event(s_ctx->comp_channel, &cq, &ctx)); ibv_ack_cq_events(cq, 1); TEST_NZ(ibv_req_notify_cq(cq, 0)); while (ibv_poll_cq(cq, 1, &wc)){ conn = (struct connection *)(uintptr_t)wc.wr_id; if (wc.status != IBV_WC_SUCCESS) { die("on_completion: status is not IBV_WC_SUCCESS."); } if (wc.opcode == IBV_WC_RECV) { switch (conn->recv_msg->type) { case MR_INIT_ACK: case MR_CHUNK_ACK: debug(printf("Recived: Type=%d\n", conn->recv_msg->type), 1); if (sent_size == buff_size) { /*sent all data*/ cmsg.type=MR_FIN; cmsg.data1.tag=tag; } else { /*not sent all data yet*/ if (sent_size + RDMA_BUF_SIZE_C > buff_size) { mr_size = buff_size - sent_size; } else { mr_size = RDMA_BUF_SIZE_C; } debug(printf("mr_size=%lu\n", mr_size),1); // printf("%s\n", send_base_addr); register_rdma_region(conn, send_base_addr, mr_size); send_base_addr += mr_size; sent_size += mr_size; cmsg.type=MR_CHUNK; cmsg.data1.mr_size=mr_size; memcpy(&cmsg.data.mr, conn->rdma_msg_mr, sizeof(struct ibv_mr)); // cmsg.data.mr = conn->rdma_msg_mr; } break; case MR_FIN_ACK: debug(printf("Recived: Type=%d\n", conn->recv_msg->type),1); *flag = 1; // rdma_disconnect(comm->cm_id); // rdma_disconnect(conn->id); //exit(0); return NULL; default: debug(printf("Unknown TYPE"), 1); return NULL; } send_control_msg(conn, &cmsg); post_receives(conn); } else if 
(wc.opcode == IBV_WC_SEND) { debug(printf("Sent: TYPE=%d\n", conn->send_msg->type),1); } else { die("unknow opecode."); } } } return NULL; }
//static void* poll_cq(struct RDMA_communicator* comm) static void* poll_cq(struct poll_cq_args* args) { struct ibv_cq *cq; struct ibv_wc wc; struct connection *conn; struct RDMA_communicator *comm; // struct RDMA_message *msg; double s, e; char* ip; struct control_msg cmsg; void* ctx; char* buff; uint64_t buff_size; int tag; uint64_t mr_size=0; uint64_t sent_size=0; char* send_base_addr; int* flag = args->flag; int mr_index; //for (i = 0; i < RDMA_BUF_NUM_C; i++){ rdma_msg_mr[i] = NULL;} comm = args->comm; buff = args->msg->buff; send_base_addr = args->msg->buff; buff_size= args->msg->size; tag= args->msg->tag; cmsg.type=MR_INIT; cmsg.data1.buff_size=buff_size; send_control_msg(comm->cm_id->context, &cmsg); // fprintf(stderr, "RDMA lib: SEND: INIT: tag=%d\n", tag); post_receives(comm->cm_id->context); s = get_dtime(); while (1) { if (ibv_get_cq_event(s_ctx->comp_channel, &cq, &ctx)) { fprintf(stderr, "RDMA lib: SEND: ERROR: get cq event failed @ %s:%d", __FILE__, __LINE__); exit(1); } ibv_ack_cq_events(cq, 1); if (ibv_req_notify_cq(cq, 0)) { fprintf(stderr, "RDMA lib: SEND: ERROR: request notification failed @ %s:%d", __FILE__, __LINE__); exit(1); } while (ibv_poll_cq(cq, 1, &wc)){ conn = (struct connection *)(uintptr_t)wc.wr_id; debug(printf("Control MSG from: %lu\n", (uintptr_t)conn->id), 1); if (wc.status != IBV_WC_SUCCESS) { die("RDMA lib: SEND: ERROR: on_completion: status is not IBV_WC_SUCCESS."); } if (wc.opcode == IBV_WC_RECV) { switch (conn->recv_msg->type) { case MR_INIT_ACK: debug(printf("Recived: Type=%d\n", conn->recv_msg->type), 1); for (mr_index = 0; mr_index < RDMA_BUF_NUM_C; mr_index++) { debug(printf("Recived: Type=%d\n", conn->recv_msg->type), 1); if (sent_size == buff_size) { /*sent all data*/ cmsg.type=MR_FIN; cmsg.data1.tag=tag; send_control_msg(conn, &cmsg); // fprintf(stderr,"Yahoooooooooo !!\n"); post_receives(conn); debug(printf("RDMA lib: SEND: Recieved MR_INIT_ACK: for tag=%d\n", tag), 1); } else { debug(printf("RDMA lib: SEND: Recieved 
MR_INIT_ACK: for tag=%d\n", tag), 1); /*not sent all data yet*/ if (sent_size + rdma_buf_size > buff_size) { mr_size = buff_size - sent_size; } else { mr_size = rdma_buf_size; } debug(printf("mr_size=%lu\n", mr_size),1); // printf("%s\n", send_base_addr); // register_rdma_region(conn, send_base_addr, mr_size); register_rdma_msg_mr(mr_index, send_base_addr, mr_size); send_base_addr += mr_size; sent_size += mr_size; cmsg.type=MR_CHUNK; cmsg.data1.mr_size=mr_size; memcpy(&cmsg.data.mr, rdma_msg_mr[mr_index], sizeof(struct ibv_mr)); // cmsg.data.mr = conn->rdma_msg_mr; send_control_msg(conn, &cmsg); // fprintf(stderr, "RDMA lib: SEND: CHUNK: tag=%d\n", tag); post_receives(conn); } } break; case MR_CHUNK_ACK: if (sent_size == buff_size) { /*sent all data*/ cmsg.type=MR_FIN; cmsg.data1.tag=tag; debug(printf("RDMA lib: SEND: Recieved MR_CHUNK_ACK => FIN: for tag=%d\n", tag), 1); } else { /*not sent all data yet*/ debug(printf("RDMA lib: SEND: Recieved MR_CHUNK_ACK: for tag=%d\n", tag), 1); if (sent_size + rdma_buf_size > buff_size) { mr_size = buff_size - sent_size; } else { mr_size = rdma_buf_size; } debug(printf("mr_size=%lu\n", mr_size),1); // printf("%s\n", send_base_addr); // register_rdma_region(conn, send_base_addr, mr_size); // mr_index = (mr_index+ 1) % RDMA_BUF_NUM_C; mr_index = (mr_index+ 1) % RDMA_BUF_NUM_C; debug(printf("mr_index=%d\n", mr_index),1); register_rdma_msg_mr(mr_index, send_base_addr, mr_size); send_base_addr += mr_size; sent_size += mr_size; cmsg.type=MR_CHUNK; cmsg.data1.mr_size=mr_size; memcpy(&cmsg.data.mr, rdma_msg_mr[mr_index], sizeof(struct ibv_mr)); // cmsg.data.mr = conn->rdma_msg_mr; } send_control_msg(conn, &cmsg); // fprintf(stderr, "RDMA lib: SEND: CHUNK2: tag=%d, slid=%lu\n", tag, (uintptr_t)wc.slid); post_receives(conn); break; case MR_FIN_ACK: debug(printf("Recived: Type=%d\n", conn->recv_msg->type),1); *flag = 1; // rdma_disconnect(comm->cm_id); // rdma_disconnect(conn->id); //exit(0); e = get_dtime(); free(args->msg); free(args); 
// fprintf(stderr, "RDMA lib: SEND: FIN_ACK: tag=%d\n", tag); //ip = get_ip_addr("ib0"); // printf("RDMA lib: SEND: %s: send time= %f secs, send size= %lu MB, throughput = %f MB/s\n", ip, e - s, buff_size/1000000, buff_size/(e - s)/1000000.0); return NULL; default: debug(printf("Unknown TYPE"), 1); return NULL; } } else if (wc.opcode == IBV_WC_SEND) { // fprintf(stderr, "RDMA lib: SENT: DONE: tag=%d\n", tag); debug(printf("RDMA lib: SEND: Sent: TYPE=%d, tag=%d\n", conn->send_msg->type, tag),1); } else { die("unknow opecode."); } } } return NULL; }
void on_completion(struct ibv_wc *wc) { struct connection *conn = (struct connection *)(uintptr_t)wc->wr_id; if (wc->status != IBV_WC_SUCCESS) die("on_completion: status is not IBV_WC_SUCCESS."); if (wc->opcode == IBV_WC_RDMA_WRITE) { e[idx] = get_dtime(); int *v = (int*)conn->rdma_local_region; printf("RDMA Write: %f (v=%d)\n", e[idx] - s[idx], v[0]); idx++; return; } else if (wc->opcode & IBV_WC_RDMA_READ) { // printf("RDMA Read: %f\n", e - s); return; } if (wc->opcode & IBV_WC_RECV) { conn->recv_state++; if (conn->recv_msg->type == MSG_MR) { memcpy(&conn->peer_mr, &conn->recv_msg->data.mr, sizeof(conn->peer_mr)); post_receives(conn); /* only rearm for MSG_MR */ if (conn->send_state == SS_INIT) /* received peer's MR before sending ours, so send ours back */ send_mr(conn); } } else { conn->send_state++; printf("send completed successfully.\n"); } if (conn->send_state == SS_MR_SENT && conn->recv_state == RS_MR_RECV) { struct ibv_send_wr wr, *bad_wr = NULL; struct ibv_sge sge; if (s_mode == M_WRITE) printf("received MSG_MR. writing message to remote memory...\n"); else printf("received MSG_MR. reading message from remote memory...\n"); memset(&wr, 0, sizeof(wr)); wr.wr_id = (uintptr_t)conn; wr.opcode = (s_mode == M_WRITE) ? IBV_WR_RDMA_WRITE : IBV_WR_RDMA_READ; wr.sg_list = &sge; wr.num_sge = 1; wr.send_flags = IBV_SEND_SIGNALED; wr.wr.rdma.remote_addr = (uintptr_t)conn->peer_mr.addr; wr.wr.rdma.rkey = conn->peer_mr.rkey; sge.addr = (uintptr_t)conn->rdma_local_region; sge.length = RDMA_BUFFER_SIZE; sge.lkey = conn->rdma_local_mr->lkey; int *v = (int*)conn->peer_mr.addr; v[0] = 15; int i; for (i = 0; i < CNUM; i++) { s[i] = get_dtime(); TEST_NZ(ibv_post_send(conn->qp, &wr, &bad_wr)); } v[0] = 11; conn->send_msg->type = MSG_DONE; send_message(conn); } else if (conn->send_state == SS_DONE_SENT && conn->recv_state == RS_DONE_RECV) { printf("remote buffer: %s\n", get_peer_message_region(conn)); rdma_disconnect(conn->id); } }
/* Pass the file-level `rdma_conn` connection to post_receives(). */
void cfio_rdma_client_recv_ack()
{
  post_receives(rdma_conn);
}