void FreeBuff(char *buff1, char *buff2) { VAPI_ret_t ret; if(s_mr_hndl != VAPI_INVAL_HNDL) { LOGPRINTF("Deregistering send buffer\n"); ret = VAPI_deregister_mr(hca_hndl, s_mr_hndl); if(ret != VAPI_OK) { fprintf(stderr, "Error deregistering send mr: %s\n", VAPI_strerror(ret)); } else { s_mr_hndl = VAPI_INVAL_HNDL; } } if(r_mr_hndl != VAPI_INVAL_HNDL) { LOGPRINTF("Deregistering recv buffer\n"); ret = VAPI_deregister_mr(hca_hndl, r_mr_hndl); if(ret != VAPI_OK) { fprintf(stderr, "Error deregistering recv mr: %s\n", VAPI_strerror(ret)); } else { r_mr_hndl = VAPI_INVAL_HNDL; } } if(buff1 != NULL) free(buff1); if(buff2 != NULL) free(buff2); }
/*
 * SendData - post one send-side work request for the current buffer.
 *
 * The opcode is selected from p->prot.commtype: plain send, send with
 * immediate data, RDMA write, or RDMA write with immediate data.  RDMA
 * variants target the peer's buffer at the same offset as our local
 * send pointer.  Completions are unsignaled; the benchmark detects
 * progress on the receive side instead.
 */
void SendData(ArgStruct *p)
{
   VAPI_ret_t ret;               /* Return code */
   VAPI_sr_desc_t sr;            /* Send request */
   VAPI_sg_lst_entry_t sg_entry; /* Scatter/Gather list - holds buff addr */

   /* Fill in send request struct */
   if(p->prot.commtype == NP_COMM_SENDRECV) {
      sr.opcode = VAPI_SEND;
      LOGPRINTF("Doing regular send\n");
   } else if(p->prot.commtype == NP_COMM_SENDRECV_WITH_IMM) {
      sr.opcode = VAPI_SEND_WITH_IMM;
      LOGPRINTF("Doing regular send with imm\n");
   } else if(p->prot.commtype == NP_COMM_RDMAWRITE) {
      sr.opcode = VAPI_RDMA_WRITE;
      /* Remote target = peer base address + our offset into the send buffer */
      sr.remote_addr = (VAPI_virt_addr_t)(MT_virt_addr_t)(remote_address + (p->s_ptr - p->s_buff));
      sr.r_key = remote_key;
      /* BUGFIX: remote_addr is a 64-bit integer type, not a pointer;
       * passing it to %p is undefined behavior.  Print as unsigned hex. */
      LOGPRINTF("Doing RDMA write (raddr=0x%llx)\n",
                (unsigned long long)sr.remote_addr);
   } else if(p->prot.commtype == NP_COMM_RDMAWRITE_WITH_IMM) {
      sr.opcode = VAPI_RDMA_WRITE_WITH_IMM;
      sr.remote_addr = (VAPI_virt_addr_t)(MT_virt_addr_t)(remote_address + (p->s_ptr - p->s_buff));
      sr.r_key = remote_key;
      LOGPRINTF("Doing RDMA write with imm (raddr=0x%llx)\n",
                (unsigned long long)sr.remote_addr);
   } else {
      fprintf(stderr, "Error, invalid communication type in SendData\n");
      exit(-1);
   }

   sr.comp_type = VAPI_UNSIGNALED;
   sr.set_se = FALSE; /* This needed due to a bug in Mellanox HW rel a-0 */

   /* Single scatter/gather entry covering the message */
   sr.sg_lst_len = 1;
   sr.sg_lst_p = &sg_entry;
   sg_entry.lkey = s_mr_out.l_key; /* Local memory region key */
   sg_entry.len = p->bufflen;
   sg_entry.addr = (VAPI_virt_addr_t)(MT_virt_addr_t)p->s_ptr;

   ret = VAPI_post_sr(hca_hndl, qp_hndl, &sr);
   if(ret != VAPI_OK) {
      fprintf(stderr, "Error posting send request: %s\n", VAPI_strerror(ret));
   } else {
      LOGPRINTF("Posted send request\n");
   }
}
/*
 * PrepareToReceive - pre-post a single receive work request for the next
 * incoming message.  Skipped entirely for RDMA write with local polling,
 * since that mode never consumes a receive request.
 */
void PrepareToReceive(ArgStruct *p)
{
   VAPI_ret_t rc;           /* VAPI return code */
   VAPI_rr_desc_t recv_req; /* Receive work request */
   VAPI_sg_lst_entry_t sge; /* Single scatter/gather entry for the buffer */

   /* We don't need to post a receive if doing RDMA write with local polling */
   if( p->prot.commtype == NP_COMM_RDMAWRITE &&
       p->prot.comptype == NP_COMP_LOCALPOLL )
      return;

   recv_req.opcode = VAPI_RECEIVE;

   /* Signaled completions are only needed when completion is detected
    * through VAPI (poll or event); pure local polling skips them. */
   recv_req.comp_type = (p->prot.comptype == NP_COMP_LOCALPOLL)
                        ? VAPI_UNSIGNALED : VAPI_SIGNALED;

   recv_req.sg_lst_len = 1;
   recv_req.sg_lst_p = &sge;
   sge.lkey = r_mr_out.l_key;
   sge.len = p->bufflen;
   sge.addr = (VAPI_virt_addr_t)(MT_virt_addr_t)p->r_ptr;

   rc = VAPI_post_rr(hca_hndl, qp_hndl, &recv_req);
   if(rc != VAPI_OK) {
      fprintf(stderr, "Error posting recv request: %s\n", VAPI_strerror(rc));
      CleanUp(p);
      exit(-1);
   } else {
      LOGPRINTF("Posted recv request\n");
   }

   /* Clear the flag the event handler raises when this receive completes,
    * so the waiter can spin on it. */
   if( p->prot.comptype == NP_COMP_EVENT ) {
      receive_complete = 0;
   }
}
/*
 * psib_poll - service the completion queue once, or (blocking) keep
 * servicing it until it drains.  Returns nonzero iff the CQ was empty
 * on the last check.
 */
static int psib_poll(hca_info_t *hca_info, int blocking)
{
    VAPI_ret_t rc;

    rc = psib_check_cq(hca_info);
    while (blocking && (rc != VAPI_CQ_EMPTY)) {
        rc = psib_check_cq(hca_info);
    }

    /* Report anything other than "ok" or "nothing to do" when debugging */
    if (psib_debug && (rc != VAPI_CQ_EMPTY) && (rc != VAPI_OK)) {
        printf("psib_poll: %s: %s\n", VAPI_strerror_sym(rc), VAPI_strerror(rc));
    }

    return (rc == VAPI_CQ_EMPTY);
}
/*
 * psib_err_rc - report an error message augmented with the symbolic and
 * human-readable forms of a VAPI return code.
 *
 * str: caller's message prefix (not modified).
 * rc:  VAPI return code to decode.
 *
 * The message is heap-allocated, handed to psib_err(), then freed here.
 */
static void psib_err_rc(char *str, VAPI_ret_t rc)
{
    const char *vapi_sym = VAPI_strerror_sym(rc);
    const char *vapi_err = VAPI_strerror(rc);
    /* separators " : " and " - " plus NUL need 7 bytes; 20 keeps slack */
    size_t len = strlen(str) + strlen(vapi_sym) + strlen(vapi_err) + 20;
    char *msg = malloc(len);
    assert(msg);
    /* snprintf is bounded and always NUL-terminates, unlike the old
     * strcpy/strcat chain which relied on the length arithmetic above */
    snprintf(msg, len, "%s : %s - %s", str, vapi_sym, vapi_err);
    psib_err(msg);
    free(msg);
}
/*
 * event_handler - VAPI completion-event callback for the receive CQ.
 *
 * Drains the CQ: polls until empty, validating each work completion.
 * When the CQ is empty it re-arms notification (so this handler fires
 * again on the next completion) and returns.  Any poll error or failed
 * completion status aborts the process.
 *
 * For each successful completion the shared flag receive_complete is
 * set to 1 for the main thread to observe.
 * NOTE(review): receive_complete is a plain shared flag with no atomics
 * or barriers visible here — presumably adequate for this benchmark's
 * platform; verify against the flag's declaration.
 */
void event_handler(VAPI_hca_hndl_t hca, VAPI_cq_hndl_t cq, void* data)
{
   VAPI_ret_t ret;

   while(1) {
      ret = VAPI_poll_cq(hca, cq, &wc);

      if(ret == VAPI_CQ_EMPTY) {
         /* Nothing left: re-arm so we get called on the next completion */
         LOGPRINTF("Empty completion queue, requesting next notification\n");
         VAPI_req_comp_notif(hca_hndl, r_cq_hndl, VAPI_NEXT_COMP);
         return;
      } else if(ret != VAPI_OK) {
         fprintf(stderr, "Error in event_handler, polling cq: %s\n",
                 VAPI_strerror(ret));
         exit(-1);
      } else if(wc.status != VAPI_SUCCESS) {
         fprintf(stderr, "Error in event_handler, on returned work completion "
                 "status: %s\n", VAPI_wc_status_sym(wc.status));
         exit(-1);
      }

      LOGPRINTF("Retrieved work completion\n");

      /* For ping-pong mode at least, this check shouldn't be needed for
       * normal operation, but it will help catch any bugs with multiple
       * sends coming through when we're only expecting one. */
      if(receive_complete == 1) {
         /* Previous completion not yet consumed: yield until the main
          * thread clears the flag before publishing this one */
         while(receive_complete != 0) sched_yield();
      }

      receive_complete = 1;
   }
}
/* Reset is used after a trial to empty the work request queues so we
   have enough room for the next trial to run.  It performs one zero-length
   send/recv exchange with the peer: pre-post a recv, sync, post a send,
   then wait for both completions. */
void Reset(ArgStruct *p)
{
   VAPI_ret_t ret;    /* Return code */
   VAPI_sr_desc_t sr; /* Send request */
   VAPI_rr_desc_t rr; /* Recv request */

   /* If comptype is event, then we'll use event handler to detect receive,
    * so initialize receive_complete flag */
   if(p->prot.comptype == NP_COMP_EVENT) receive_complete = 0;

   /* Prepost receive (zero-length: no scatter/gather entries) */
   rr.opcode = VAPI_RECEIVE;
   rr.comp_type = VAPI_SIGNALED;
   rr.sg_lst_len = 0;

   LOGPRINTF("Posting recv request in Reset\n");
   ret = VAPI_post_rr(hca_hndl, qp_hndl, &rr);
   if(ret != VAPI_OK) {
      fprintf(stderr, "  Error posting recv request: %s\n", VAPI_strerror(ret));
      CleanUp(p);
      exit(-1);
   }

   /* Make sure both nodes have preposted receives */
   Sync(p);

   /* Post Send (zero-length) */
   sr.opcode = VAPI_SEND;
   sr.comp_type = VAPI_SIGNALED;
   sr.set_se = FALSE; /* This needed due to a bug in Mellanox HW rel a-0 */
   sr.sg_lst_len = 0;

   LOGPRINTF("Posting send request \n");
   ret = VAPI_post_sr(hca_hndl, qp_hndl, &sr);
   if(ret != VAPI_OK) {
      fprintf(stderr, "  Error posting send request in Reset: %s\n",
              VAPI_strerror(ret));
      exit(-1);
   }

   /* BUGFIX: the original checked wc.status here, before any poll in this
    * call had filled wc -- that examined stale data left over from a
    * previous operation.  The status is checked below, after polling. */

   LOGPRINTF("Polling for completion of send request\n");
   ret = VAPI_CQ_EMPTY;
   while(ret == VAPI_CQ_EMPTY)
      ret = VAPI_poll_cq(hca_hndl, s_cq_hndl, &wc);
   if(ret != VAPI_OK) {
      fprintf(stderr, "Error polling CQ for send in Reset: %s\n",
              VAPI_strerror(ret));
      exit(-1);
   }
   if(wc.status != VAPI_SUCCESS) {
      fprintf(stderr, "  Error in completion status: %s\n",
              VAPI_wc_status_sym(wc.status));
      exit(-1);
   }
   LOGPRINTF("Status of send completion: %s\n", VAPI_wc_status_sym(wc.status));

   if(p->prot.comptype == NP_COMP_EVENT) {
      /* If using event completion, the event handler will set
       * receive_complete when it gets the completion event. */
      LOGPRINTF("Waiting for receive_complete flag\n");
      while(receive_complete == 0) { /* BUSY WAIT */ }
   } else {
      LOGPRINTF("Polling for completion of recv request\n");
      ret = VAPI_CQ_EMPTY;
      while(ret == VAPI_CQ_EMPTY)
         ret = VAPI_poll_cq(hca_hndl, r_cq_hndl, &wc);
      if(ret != VAPI_OK) {
         fprintf(stderr, "Error polling CQ for recv in Reset: %s\n",
                 VAPI_strerror(ret));
         exit(-1);
      }
      if(wc.status != VAPI_SUCCESS) {
         fprintf(stderr, "  Error in completion status: %s\n",
                 VAPI_wc_status_sym(wc.status));
         exit(-1);
      }
      LOGPRINTF("Status of recv completion: %s\n", VAPI_wc_status_sym(wc.status));
   }
   LOGPRINTF("Done with reset\n");
}
/*
 * RecvData - wait until the incoming message has fully arrived, using
 * the completion mechanism selected by p->prot.comptype:
 *   NP_COMP_LOCALPOLL - spin on the last byte of the receive buffer
 *   NP_COMP_VAPIPOLL  - spin on VAPI_poll_cq for the recv CQ
 *   NP_COMP_EVENT     - spin on the receive_complete flag set by the
 *                       completion-event handler
 */
void RecvData(ArgStruct *p)
{
   VAPI_ret_t ret;

   /* Busy wait for incoming data */
   LOGPRINTF("Receiving at buffer address %p\n", p->r_ptr);

   if( p->prot.comptype == NP_COMP_LOCALPOLL ) {
      /* Poll for receive completion locally on the receive data */
      LOGPRINTF("Waiting for last byte of data to arrive\n");
      /* The sender toggles the last byte; the expected value depends on
       * cache mode and which side we are (p->tr) */
      while(p->r_ptr[p->bufflen-1] != 'a' + (p->cache ? 1 - p->tr : 1) ) {
         /* BUSY WAIT -- this should be fine since we
          * declared r_ptr with volatile qualifier */
      }

      /* Reset last byte */
      p->r_ptr[p->bufflen-1] = 'a' + (p->cache ? p->tr : 0);

      LOGPRINTF("Received all of data\n");

   } else if( p->prot.comptype == NP_COMP_VAPIPOLL ) {
      /* Poll for receive completion using VAPI poll function */
      LOGPRINTF("Polling completion queue for VAPI work completion\n");

      ret = VAPI_CQ_EMPTY;
      while(ret == VAPI_CQ_EMPTY)
         ret = VAPI_poll_cq(hca_hndl, r_cq_hndl, &wc);

      if(ret != VAPI_OK) {
         fprintf(stderr, "Error in RecvData, polling for completion: %s\n",
                 VAPI_strerror(ret));
         exit(-1);
      }

      if(wc.status != VAPI_SUCCESS) {
         fprintf(stderr, "Error in status of returned completion: %s\n",
                 VAPI_wc_status_sym(wc.status));
         exit(-1);
      }

      LOGPRINTF("Retrieved successful completion\n");

   } else if( p->prot.comptype == NP_COMP_EVENT ) {
      /* Instead of polling directly on data or VAPI completion queue,
       * let the VAPI event completion handler set a flag when the receive
       * completes, and poll on that instead. Could try using semaphore here
       * as well to eliminate busy polling */
      LOGPRINTF("Polling receive flag\n");
      while( receive_complete == 0 ) { /* BUSY WAIT */ }

      /* If in prepost-burst mode, we won't be calling PrepareToReceive
       * between ping-pongs, so we need to reset the receive_complete
       * flag here. */
      if( p->preburst ) receive_complete = 0;

      LOGPRINTF("Receive completed\n");
   }
}
/*
 * finalizeIB - tear down all Infiniband resources in dependency order:
 * event handler, QP, CQs, memory regions, PD, then the HCA handle.
 * Errors are reported but do not stop the teardown.  Always returns 0.
 */
int finalizeIB(ArgStruct *p)
{
   VAPI_ret_t rc;

   LOGPRINTF("Finalizing IB stuff\n");

   /* Clear completion event handler first so no callbacks fire mid-teardown */
   if(p->prot.comptype == NP_COMP_EVENT ) {
      LOGPRINTF("Clearing comp handler\n");
      rc = EVAPI_clear_comp_eventh(hca_hndl, ceh_hndl);
      if(rc != VAPI_OK)
         fprintf(stderr, "Error clearing event handler: %s\n", VAPI_strerror(rc));
   }

   if(qp_hndl != VAPI_INVAL_HNDL) {
      LOGPRINTF("Destroying QP\n");
      rc = VAPI_destroy_qp(hca_hndl, qp_hndl);
      if(rc != VAPI_OK)
         fprintf(stderr, "Error destroying Queue Pair: %s\n", VAPI_strerror(rc));
   }

   if(r_cq_hndl != VAPI_INVAL_HNDL) {
      LOGPRINTF("Destroying Recv CQ\n");
      rc = VAPI_destroy_cq(hca_hndl, r_cq_hndl);
      if(rc != VAPI_OK)
         fprintf(stderr, "Error destroying recv CQ: %s\n", VAPI_strerror(rc));
   }

   if(s_cq_hndl != VAPI_INVAL_HNDL) {
      LOGPRINTF("Destroying Send CQ\n");
      rc = VAPI_destroy_cq(hca_hndl, s_cq_hndl);
      if(rc != VAPI_OK)
         fprintf(stderr, "Error destroying send CQ: %s\n", VAPI_strerror(rc));
   }

   /* Check memory registrations just in case user bailed out */
   if(s_mr_hndl != VAPI_INVAL_HNDL) {
      LOGPRINTF("Deregistering send buffer\n");
      rc = VAPI_deregister_mr(hca_hndl, s_mr_hndl);
      if(rc != VAPI_OK)
         fprintf(stderr, "Error deregistering send mr: %s\n", VAPI_strerror(rc));
   }

   if(r_mr_hndl != VAPI_INVAL_HNDL) {
      LOGPRINTF("Deregistering recv buffer\n");
      rc = VAPI_deregister_mr(hca_hndl, r_mr_hndl);
      if(rc != VAPI_OK)
         fprintf(stderr, "Error deregistering recv mr: %s\n", VAPI_strerror(rc));
   }

   if(pd_hndl != VAPI_INVAL_HNDL) {
      LOGPRINTF("Deallocating PD\n");
      rc = VAPI_dealloc_pd(hca_hndl, pd_hndl);
      if(rc != VAPI_OK)
         fprintf(stderr, "Error deallocating PD: %s\n", VAPI_strerror(rc));
   }

   /* Application code should not close HCA, just release handle */
   if(hca_hndl != VAPI_INVAL_HNDL) {
      LOGPRINTF("Releasing HCA\n");
      rc = EVAPI_release_hca_hndl(hca_hndl);
      if(rc != VAPI_OK)
         fprintf(stderr, "Error releasing HCA: %s\n", VAPI_strerror(rc));
   }

   return 0;
}
/*
 * initIB - bring up the whole Infiniband stack for the benchmark:
 * open/acquire the HCA, query port 1 for our LID, allocate a PD,
 * create send/recv CQs and an RC queue pair, exchange LID and QP number
 * with the peer over the TCP control socket (p->commfd), then walk the
 * QP through INIT -> RTR -> RTS.  Returns 0 on success, -1 on failure.
 */
int initIB(ArgStruct *p)
{
  VAPI_ret_t ret;

  /* Open HCA */
  /* open hca just in case it was not opened by system earlier */
  /* NOTE(review): the VAPI_open_hca result is deliberately overwritten --
   * it fails harmlessly if the HCA is already open; only the
   * EVAPI_get_hca_hndl result below is checked. */
  ret = VAPI_open_hca("InfiniHost0", &hca_hndl);
  ret = EVAPI_get_hca_hndl("InfiniHost0", &hca_hndl);
  if(ret != VAPI_OK) {
    fprintf(stderr, "Error opening Infiniband HCA: %s\n", VAPI_strerror(ret));
    return -1;
  } else {
    LOGPRINTF("Opened Infiniband HCA\n");
  }

  /* Get HCA properties (we use port 1 and need its LID for addressing) */
  port_num=1;
  ret = VAPI_query_hca_port_prop(hca_hndl, (IB_port_t)port_num,
                                 (VAPI_hca_port_t *)&hca_port);
  if(ret != VAPI_OK) {
    fprintf(stderr, "Error querying Infiniband HCA: %s\n", VAPI_strerror(ret));
    return -1;
  } else {
    LOGPRINTF("Queried Infiniband HCA\n");
  }
  lid = hca_port.lid;
  LOGPRINTF("  lid = %d\n", lid);

  /* Allocate Protection Domain */
  ret = VAPI_alloc_pd(hca_hndl, &pd_hndl);
  if(ret != VAPI_OK) {
    fprintf(stderr, "Error allocating PD: %s\n", VAPI_strerror(ret));
    return -1;
  } else {
    LOGPRINTF("Allocated Protection Domain\n");
  }

  /* Create send completion queue */
  num_cqe = 30000; /* Requested number of completion q elements */
  ret = VAPI_create_cq(hca_hndl, num_cqe, &s_cq_hndl, &act_num_cqe);
  if(ret != VAPI_OK) {
    fprintf(stderr, "Error creating send CQ: %s\n", VAPI_strerror(ret));
    return -1;
  } else {
    LOGPRINTF("Created Send Completion Queue with %d elements\n", act_num_cqe);
  }

  /* Create recv completion queue */
  num_cqe = 20000; /* Requested number of completion q elements */
  ret = VAPI_create_cq(hca_hndl, num_cqe, &r_cq_hndl, &act_num_cqe);
  if(ret != VAPI_OK) {
    fprintf(stderr, "Error creating recv CQ: %s\n", VAPI_strerror(ret));
    return -1;
  } else {
    LOGPRINTF("Created Recv Completion Queue with %d elements\n", act_num_cqe);
  }

  /* Placeholder for MR */

  /* Create Queue Pair (reliable connected, one SGE each direction) */
  qp_init_attr.cap.max_oust_wr_rq = max_wq; /* Max outstanding WR on RQ */
  qp_init_attr.cap.max_oust_wr_sq = max_wq; /* Max outstanding WR on SQ */
  qp_init_attr.cap.max_sg_size_rq = 1; /* Max scatter/gather entries on RQ */
  qp_init_attr.cap.max_sg_size_sq = 1; /* Max scatter/gather entries on SQ */
  qp_init_attr.pd_hndl = pd_hndl; /* Protection domain handle */
  qp_init_attr.rdd_hndl = 0; /* Reliable datagram domain handle */
  qp_init_attr.rq_cq_hndl = r_cq_hndl; /* CQ handle for RQ */
  qp_init_attr.rq_sig_type = VAPI_SIGNAL_REQ_WR; /* Signalling type */
  qp_init_attr.sq_cq_hndl = s_cq_hndl; /* CQ handle for SQ */
  qp_init_attr.sq_sig_type = VAPI_SIGNAL_REQ_WR; /* Signalling type */
  qp_init_attr.ts_type = IB_TS_RC; /* Transmission type */

  ret = VAPI_create_qp(hca_hndl, &qp_init_attr, &qp_hndl, &qp_prop);
  if(ret != VAPI_OK) {
    fprintf(stderr, "Error creating Queue Pair: %s\n", VAPI_strerror(ret));
    return -1;
  } else {
    LOGPRINTF("Created Queue Pair, max outstanding WR on RQ: %d, on SQ: %d\n",
              qp_prop.cap.max_oust_wr_rq, qp_prop.cap.max_oust_wr_sq);
  }

  /* Exchange lid and qp_num with other node over the control socket.
   * NOTE(review): write()/read() are assumed to transfer the full value
   * in one call; a short transfer is treated as failure. */
  if( write(p->commfd, &lid, sizeof(lid) ) != sizeof(lid) ) {
    fprintf(stderr, "Failed to send lid over socket\n");
    return -1;
  }
  if( write(p->commfd, &qp_prop.qp_num, sizeof(qp_prop.qp_num) ) != sizeof(qp_prop.qp_num) ) {
    fprintf(stderr, "Failed to send qpnum over socket\n");
    return -1;
  }
  if( read(p->commfd, &d_lid, sizeof(d_lid) ) != sizeof(d_lid) ) {
    fprintf(stderr, "Failed to read lid from socket\n");
    return -1;
  }
  if( read(p->commfd, &d_qp_num, sizeof(d_qp_num) ) != sizeof(d_qp_num) ) {
    fprintf(stderr, "Failed to read qpnum from socket\n");
    return -1;
  }

  LOGPRINTF("Local: lid=%d qp_num=%d Remote: lid=%d qp_num=%d\n",
            lid, qp_prop.qp_num, d_lid, d_qp_num);

  /* Bring up Queue Pair */

  /******* INIT state ******/
  QP_ATTR_MASK_CLR_ALL(qp_attr_mask);
  qp_attr.qp_state = VAPI_INIT;
  QP_ATTR_MASK_SET(qp_attr_mask, QP_ATTR_QP_STATE);
  qp_attr.pkey_ix = 0;
  QP_ATTR_MASK_SET(qp_attr_mask, QP_ATTR_PKEY_IX);
  qp_attr.port = port_num;
  QP_ATTR_MASK_SET(qp_attr_mask, QP_ATTR_PORT);
  qp_attr.remote_atomic_flags = VAPI_EN_REM_WRITE | VAPI_EN_REM_READ;
  QP_ATTR_MASK_SET(qp_attr_mask, QP_ATTR_REMOTE_ATOMIC_FLAGS);

  ret = VAPI_modify_qp(hca_hndl, qp_hndl, &qp_attr, &qp_attr_mask, &qp_cap);
  if(ret != VAPI_OK)
  {
    fprintf(stderr, "Error modifying QP to INIT: %s\n", VAPI_strerror(ret));
    return -1;
  }

  LOGPRINTF("Modified QP to INIT\n");

  /******* RTR (Ready-To-Receive) state *******/
  QP_ATTR_MASK_CLR_ALL(qp_attr_mask);
  qp_attr.qp_state = VAPI_RTR;
  QP_ATTR_MASK_SET(qp_attr_mask, QP_ATTR_QP_STATE);
  qp_attr.qp_ous_rd_atom = 1;
  QP_ATTR_MASK_SET(qp_attr_mask, QP_ATTR_QP_OUS_RD_ATOM);
  qp_attr.dest_qp_num = d_qp_num;
  QP_ATTR_MASK_SET(qp_attr_mask, QP_ATTR_DEST_QP_NUM);
  /* Address vector: destination LID from the peer, default path params */
  qp_attr.av.sl = 0;
  qp_attr.av.grh_flag = FALSE;
  qp_attr.av.dlid = d_lid;
  qp_attr.av.static_rate = 0;
  qp_attr.av.src_path_bits = 0;
  QP_ATTR_MASK_SET(qp_attr_mask, QP_ATTR_AV);
  qp_attr.path_mtu = p->prot.ib_mtu;
  QP_ATTR_MASK_SET(qp_attr_mask, QP_ATTR_PATH_MTU);
  qp_attr.rq_psn = 0;
  QP_ATTR_MASK_SET(qp_attr_mask, QP_ATTR_RQ_PSN);
  qp_attr.pkey_ix = 0;
  QP_ATTR_MASK_SET(qp_attr_mask, QP_ATTR_PKEY_IX);
  qp_attr.min_rnr_timer = 5;
  QP_ATTR_MASK_SET(qp_attr_mask, QP_ATTR_MIN_RNR_TIMER);

  ret = VAPI_modify_qp(hca_hndl, qp_hndl, &qp_attr, &qp_attr_mask, &qp_cap);
  if(ret != VAPI_OK)
  {
    fprintf(stderr, "Error modifying QP to RTR: %s\n", VAPI_strerror(ret));
    return -1;
  }

  LOGPRINTF("Modified QP to RTR\n");

  /* Sync before going to RTS state */
  Sync(p);

  /******* RTS (Ready-to-Send) state *******/
  QP_ATTR_MASK_CLR_ALL(qp_attr_mask);
  qp_attr.qp_state = VAPI_RTS;
  QP_ATTR_MASK_SET(qp_attr_mask, QP_ATTR_QP_STATE);
  qp_attr.sq_psn = 0;
  QP_ATTR_MASK_SET(qp_attr_mask, QP_ATTR_SQ_PSN);
  qp_attr.timeout = 31;
  QP_ATTR_MASK_SET(qp_attr_mask, QP_ATTR_TIMEOUT);
  qp_attr.retry_count = 1;
  QP_ATTR_MASK_SET(qp_attr_mask, QP_ATTR_RETRY_COUNT);
  qp_attr.rnr_retry = 1;
  QP_ATTR_MASK_SET(qp_attr_mask, QP_ATTR_RNR_RETRY);
  qp_attr.ous_dst_rd_atom = 1;
  QP_ATTR_MASK_SET(qp_attr_mask, QP_ATTR_OUS_DST_RD_ATOM);

  ret = VAPI_modify_qp(hca_hndl, qp_hndl, &qp_attr, &qp_attr_mask, &qp_cap);
  if(ret != VAPI_OK)
  {
    fprintf(stderr, "Error modifying QP to RTS: %s\n", VAPI_strerror(ret));
    return -1;
  }

  LOGPRINTF("Modified QP to RTS\n");

  /* If using event completion, register event completion handler and
   * request the initial notification */
  if( p->prot.comptype == NP_COMP_EVENT ) {
    EVAPI_set_comp_eventh(hca_hndl, r_cq_hndl, event_handler, p, &ceh_hndl);
    VAPI_req_comp_notif(hca_hndl, r_cq_hndl, VAPI_NEXT_COMP);
  }

  return 0;
}
/*
 * MyMalloc - allocate the send/recv buffers and register them with the
 * HCA.  In cache mode both buffers alias the same memory; the Infiniband
 * spec permits registering the same region twice, so it is registered
 * once as the recv MR and once as the send MR either way.
 */
void MyMalloc(ArgStruct *p, int bufflen, int soffset, int roffset)
{
   VAPI_ret_t rc;

   /* Allocate buffers: recv buffer sized for the larger of the offsets */
   p->r_buff = malloc(bufflen+MAX(soffset,roffset));
   if(p->r_buff == NULL) {
      fprintf(stderr, "Error malloc'ing buffer\n");
      exit(-1);
   }

   if(p->cache) {
      /* Infiniband spec says we can register same memory region
       * more than once, so just copy buffer address. We will register
       * the same buffer twice with Infiniband. */
      p->s_buff = p->r_buff;
   } else {
      p->s_buff = malloc(bufflen+soffset);
      if(p->s_buff == NULL) {
         fprintf(stderr, "Error malloc'ing buffer\n");
         exit(-1);
      }
   }

   /* Register recv buffer: remote peer may RDMA-write into it */
   mr_in.acl = VAPI_EN_LOCAL_WRITE | VAPI_EN_REMOTE_WRITE;
   mr_in.l_key = 0;
   mr_in.pd_hndl = pd_hndl;
   mr_in.r_key = 0;
   mr_in.size = bufflen+MAX(soffset,roffset);
   mr_in.start = (VAPI_virt_addr_t)(MT_virt_addr_t)p->r_buff;
   mr_in.type = VAPI_MR;

   rc = VAPI_register_mr(hca_hndl, &mr_in, &r_mr_hndl, &r_mr_out);
   if(rc != VAPI_OK) {
      fprintf(stderr, "Error registering recv buffer: %s\n", VAPI_strerror(rc));
      exit(-1);
   }
   LOGPRINTF("Registered Recv Buffer\n");

   /* Register send buffer: only local write access is needed */
   mr_in.acl = VAPI_EN_LOCAL_WRITE;
   mr_in.l_key = 0;
   mr_in.pd_hndl = pd_hndl;
   mr_in.r_key = 0;
   mr_in.size = bufflen+soffset;
   mr_in.start = (VAPI_virt_addr_t)(MT_virt_addr_t)p->s_buff;
   mr_in.type = VAPI_MR;

   rc = VAPI_register_mr(hca_hndl, &mr_in, &s_mr_hndl, &s_mr_out);
   if(rc != VAPI_OK) {
      fprintf(stderr, "Error registering send buffer: %s\n", VAPI_strerror(rc));
      exit(-1);
   }
   LOGPRINTF("Registered Send Buffer\n");
}
/* * The queue pair has been created and we have received the remote * queue pair information from the peer so we init this queue pair * and are ready to roll. */ int mca_btl_mvapi_endpoint_qp_init_query( mca_btl_mvapi_module_t* mvapi_btl, VAPI_hca_hndl_t nic, VAPI_qp_hndl_t qp_hndl, VAPI_qp_num_t remote_qp_num, IB_lid_t remote_lid, IB_port_t port_id ) { VAPI_ret_t ret; VAPI_qp_attr_t qp_attr; VAPI_qp_attr_mask_t qp_attr_mask; VAPI_qp_init_attr_t qp_init_attr; VAPI_qp_cap_t qp_cap; /* Modifying QP to INIT */ QP_ATTR_MASK_CLR_ALL(qp_attr_mask); qp_attr.qp_state = VAPI_INIT; QP_ATTR_MASK_SET(qp_attr_mask, QP_ATTR_QP_STATE); qp_attr.pkey_ix = mca_btl_mvapi_component.ib_pkey_ix; QP_ATTR_MASK_SET(qp_attr_mask, QP_ATTR_PKEY_IX); qp_attr.port = port_id; QP_ATTR_MASK_SET(qp_attr_mask, QP_ATTR_PORT); qp_attr.remote_atomic_flags = VAPI_EN_REM_WRITE | VAPI_EN_REM_READ; QP_ATTR_MASK_SET(qp_attr_mask, QP_ATTR_REMOTE_ATOMIC_FLAGS); ret = VAPI_modify_qp(nic, qp_hndl, &qp_attr, &qp_attr_mask, &qp_cap); if(VAPI_OK != ret) { BTL_ERROR(("Error modifying the queue pair: %s", VAPI_strerror(ret))); return OMPI_ERROR; } BTL_VERBOSE(("Modified to init..Qp %d", qp_hndl)); /********************** INIT --> RTR ************************/ QP_ATTR_MASK_CLR_ALL(qp_attr_mask); qp_attr.qp_state = VAPI_RTR; QP_ATTR_MASK_SET(qp_attr_mask, QP_ATTR_QP_STATE); qp_attr.qp_ous_rd_atom = mca_btl_mvapi_component.ib_qp_ous_rd_atom; QP_ATTR_MASK_SET(qp_attr_mask, QP_ATTR_QP_OUS_RD_ATOM); qp_attr.path_mtu = mca_btl_mvapi_component.ib_mtu; QP_ATTR_MASK_SET(qp_attr_mask, QP_ATTR_PATH_MTU); qp_attr.rq_psn = mca_btl_mvapi_component.ib_psn; QP_ATTR_MASK_SET(qp_attr_mask, QP_ATTR_RQ_PSN); qp_attr.pkey_ix = mca_btl_mvapi_component.ib_pkey_ix; QP_ATTR_MASK_SET(qp_attr_mask, QP_ATTR_PKEY_IX); qp_attr.min_rnr_timer = mca_btl_mvapi_component.ib_min_rnr_timer; QP_ATTR_MASK_SET(qp_attr_mask, QP_ATTR_MIN_RNR_TIMER); qp_attr.av.sl = mca_btl_mvapi_component.ib_service_level; qp_attr.av.grh_flag = FALSE; qp_attr.av.static_rate 
= mca_btl_mvapi_component.ib_static_rate; qp_attr.av.src_path_bits = mca_btl_mvapi_component.ib_src_path_bits; qp_attr.dest_qp_num = remote_qp_num; QP_ATTR_MASK_SET(qp_attr_mask, QP_ATTR_DEST_QP_NUM); qp_attr.av.dlid = remote_lid; QP_ATTR_MASK_SET(qp_attr_mask, QP_ATTR_AV); ret = VAPI_modify_qp(nic, qp_hndl, &qp_attr, &qp_attr_mask, &qp_cap); if(VAPI_OK != ret) { BTL_ERROR(("Error modifying the queue pair: %s", VAPI_strerror(ret))); return OMPI_ERROR; } BTL_VERBOSE(("Modified to RTR..Qp %d", qp_hndl)); /************** RTS *******************/ QP_ATTR_MASK_CLR_ALL(qp_attr_mask); qp_attr.qp_state = VAPI_RTS; QP_ATTR_MASK_SET(qp_attr_mask, QP_ATTR_QP_STATE); qp_attr.sq_psn = mca_btl_mvapi_component.ib_psn; QP_ATTR_MASK_SET(qp_attr_mask, QP_ATTR_SQ_PSN); qp_attr.timeout = mca_btl_mvapi_component.ib_timeout; QP_ATTR_MASK_SET(qp_attr_mask, QP_ATTR_TIMEOUT); qp_attr.retry_count = mca_btl_mvapi_component.ib_retry_count; QP_ATTR_MASK_SET(qp_attr_mask, QP_ATTR_RETRY_COUNT); qp_attr.rnr_retry = mca_btl_mvapi_component.ib_rnr_retry; QP_ATTR_MASK_SET(qp_attr_mask, QP_ATTR_RNR_RETRY); qp_attr.ous_dst_rd_atom = mca_btl_mvapi_component.ib_max_rdma_dst_ops; QP_ATTR_MASK_SET(qp_attr_mask, QP_ATTR_OUS_DST_RD_ATOM); ret = VAPI_modify_qp(nic, qp_hndl, &qp_attr, &qp_attr_mask, &qp_cap); if(VAPI_OK != ret) { return OMPI_ERROR; } BTL_VERBOSE(("Modified to RTS..Qp %d", qp_hndl)); ret = VAPI_query_qp(nic, qp_hndl, &qp_attr, &qp_attr_mask, &qp_init_attr ); if (ret != VAPI_OK) { BTL_ERROR(("Error modifying the queue pair: %s", VAPI_strerror(ret))); return OMPI_ERROR; } mvapi_btl->ib_inline_max = qp_init_attr.cap.max_inline_data_sq; return OMPI_SUCCESS; }
/*
 * mca_btl_mvapi_endpoint_create_qp - create a queue pair for an endpoint.
 *
 * Only the reliable-connected (VAPI_TS_RC) transport is implemented;
 * anything else returns OMPI_ERR_NOT_IMPLEMENTED.  When SRQ support is
 * compiled in and enabled, the QP is created against the shared receive
 * queue via VAPI_create_qp_ext; otherwise via plain VAPI_create_qp.
 * Returns OMPI_SUCCESS, OMPI_ERROR, or OMPI_ERR_NOT_IMPLEMENTED.
 */
int mca_btl_mvapi_endpoint_create_qp(
    mca_btl_mvapi_module_t* mvapi_btl,
    VAPI_hca_hndl_t nic,
    VAPI_pd_hndl_t ptag,
    VAPI_cq_hndl_t cq_hndl,
#ifdef VAPI_FEATURE_SRQ
    VAPI_srq_hndl_t srq_hndl,
#endif
    VAPI_qp_hndl_t* qp_hndl,
    VAPI_qp_prop_t* qp_prop,
    int transport_type)
{
    VAPI_ret_t ret;
    VAPI_qp_init_attr_t qp_init_attr;
#ifdef VAPI_FEATURE_SRQ
    VAPI_qp_init_attr_ext_t qp_init_attr_ext;
#endif

    /* worst case number of credit messages could be queued */
    switch(transport_type) {

    case VAPI_TS_RC: /* Set up RC qp parameters */
        /* +1 send slot for the credit message in the worst case */
        qp_init_attr.cap.max_oust_wr_sq = mca_btl_mvapi_component.rd_num + 1;
        qp_init_attr.cap.max_oust_wr_rq = mca_btl_mvapi_component.rd_num + mca_btl_mvapi_component.rd_rsv;
        qp_init_attr.cap.max_sg_size_sq = mca_btl_mvapi_component.ib_sg_list_size;
        qp_init_attr.cap.max_sg_size_rq = mca_btl_mvapi_component.ib_sg_list_size;
        qp_init_attr.pd_hndl            = ptag;
        /* We don't have Reliable Datagram Handle right now */
        qp_init_attr.rdd_hndl           = 0;

        /* Signal all work requests on this queue pair */
        qp_init_attr.rq_sig_type        = VAPI_SIGNAL_REQ_WR;
        qp_init_attr.sq_sig_type        = VAPI_SIGNAL_REQ_WR;

        /* Use Reliable Connected transport service */
        qp_init_attr.ts_type            = VAPI_TS_RC;

        /* Set Send and Recv completion queues */
        qp_init_attr.rq_cq_hndl         = cq_hndl;
        qp_init_attr.sq_cq_hndl         = cq_hndl;

        break;
    case VAPI_TS_UD: /* Set up UD qp parameters */
        /* fallthrough: UD is not implemented */
    default:
        return OMPI_ERR_NOT_IMPLEMENTED;
    }

#ifdef VAPI_FEATURE_SRQ
    if(mca_btl_mvapi_component.use_srq) {
        /* Attach the shared receive queue via the extended create call */
        qp_init_attr_ext.srq_hndl = srq_hndl;
        ret = VAPI_create_qp_ext(nic, &qp_init_attr, &qp_init_attr_ext,
                                 qp_hndl, qp_prop);
    } else
#endif
    {
        ret = VAPI_create_qp(nic, &qp_init_attr, qp_hndl, qp_prop);
    }
    if(VAPI_OK != ret) {
        BTL_ERROR(("error creating the queue pair: %s", VAPI_strerror(ret)));
        return OMPI_ERROR;
    }
    return OMPI_SUCCESS;
}
/*
 * mca_btl_mvapi_endpoint_post_send - post a fragment on the endpoint's
 * high- or low-priority QP, or queue it when resources are exhausted.
 *
 * Resource accounting uses atomic decrement-then-test: each limit
 * (send WQEs, send tokens, eager-RDMA tokens) is acquired with
 * OPAL_THREAD_ADD32(-1); a negative result means the resource was not
 * available, so the count is rolled back (+1) and the fragment is
 * appended to the appropriate pending list for later retry.  Note that
 * queuing a fragment still returns OMPI_SUCCESS.
 *
 * Small high-priority fragments for which an eager-RDMA token was
 * acquired are sent as RDMA writes into the peer's eager RDMA region;
 * everything else goes as a normal send.  Sends small enough for the
 * HCA's inline limit use EVAPI_post_inline_sr.
 */
static inline int mca_btl_mvapi_endpoint_post_send(
                                                   mca_btl_mvapi_module_t* mvapi_btl,
                                                   mca_btl_mvapi_endpoint_t * endpoint,
                                                   mca_btl_mvapi_frag_t * frag)
{
    int do_rdma = 0;
    VAPI_qp_hndl_t qp_hndl;
    int ret;

    if(frag->base.des_flags & MCA_BTL_DES_FLAGS_PRIORITY &&
       frag->size <= mvapi_btl->super.btl_eager_limit){
        /* High-priority path */
        /* check for a send wqe */
        if (OPAL_THREAD_ADD32(&endpoint->sd_wqe_hp,-1) < 0) {
            /* no WQE: roll back and queue for later */
            OPAL_THREAD_ADD32(&endpoint->sd_wqe_hp,1);
            opal_list_append(&endpoint->pending_frags_hp, (opal_list_item_t *)frag);
            return OMPI_SUCCESS;
        }
        /* check for rdma tocken */
        if (OPAL_THREAD_ADD32(&endpoint->eager_rdma_remote.tokens,-1) < 0) {
            /* no eager-RDMA token: roll back, fall back to plain send,
             * which needs a send token (per-endpoint or per-module
             * depending on SRQ use) */
            OPAL_THREAD_ADD32(&endpoint->eager_rdma_remote.tokens,1);
            /* check for a token */
            if(!mca_btl_mvapi_component.use_srq &&
               OPAL_THREAD_ADD32(&endpoint->sd_tokens_hp,-1) < 0) {
                /* no token: release the WQE too, then queue */
                OPAL_THREAD_ADD32(&endpoint->sd_wqe_hp,1);
                OPAL_THREAD_ADD32(&endpoint->sd_tokens_hp,1);
                opal_list_append(&endpoint->pending_frags_hp, (opal_list_item_t *)frag);
                return OMPI_SUCCESS;
            } else if( mca_btl_mvapi_component.use_srq &&
                       OPAL_THREAD_ADD32(&mvapi_btl->sd_tokens_hp,-1) < 0) {
                /* SRQ mode: tokens and the pending list are module-wide */
                OPAL_THREAD_ADD32(&endpoint->sd_wqe_hp,1);
                OPAL_THREAD_ADD32(&mvapi_btl->sd_tokens_hp,1);
                OPAL_THREAD_LOCK(&mvapi_btl->ib_lock);
                opal_list_append(&mvapi_btl->pending_frags_hp, (opal_list_item_t *)frag);
                OPAL_THREAD_UNLOCK(&mvapi_btl->ib_lock);
                return OMPI_SUCCESS;
            }
        } else {
            /* eager-RDMA token acquired: write into the peer's region */
            do_rdma = 1;
        }

        /* Piggyback flow-control credits onto the outgoing header */
        frag->hdr->credits = (endpoint->rd_credits_hp > 0) ?
            endpoint->rd_credits_hp : 0;
        OPAL_THREAD_ADD32(&endpoint->rd_credits_hp, -frag->hdr->credits);
        frag->hdr->rdma_credits = endpoint->eager_rdma_local.credits;
        OPAL_THREAD_ADD32(&endpoint->eager_rdma_local.credits,
                          -frag->hdr->rdma_credits);
        qp_hndl = endpoint->lcl_qp_hndl_hp;
    } else {
        /* Low-priority path: same pattern, low-priority resources */
        /* check for a send wqe */
        if (OPAL_THREAD_ADD32(&endpoint->sd_wqe_lp,-1) < 0) {
            OPAL_THREAD_ADD32(&endpoint->sd_wqe_lp,1);
            opal_list_append(&endpoint->pending_frags_lp, (opal_list_item_t *)frag);
            return OMPI_SUCCESS;

            /* check for a token */
        } else if(!mca_btl_mvapi_component.use_srq &&
                  OPAL_THREAD_ADD32(&endpoint->sd_tokens_lp,-1) < 0 ) {
            OPAL_THREAD_ADD32(&endpoint->sd_wqe_lp,1);
            OPAL_THREAD_ADD32(&endpoint->sd_tokens_lp,1);
            opal_list_append(&endpoint->pending_frags_lp, (opal_list_item_t *)frag);
            return OMPI_SUCCESS;

        } else if(mca_btl_mvapi_component.use_srq &&
                  OPAL_THREAD_ADD32(&mvapi_btl->sd_tokens_lp,-1) < 0) {
            OPAL_THREAD_ADD32(&endpoint->sd_wqe_lp,1);
            OPAL_THREAD_ADD32(&mvapi_btl->sd_tokens_lp,1);
            OPAL_THREAD_LOCK(&mvapi_btl->ib_lock);
            opal_list_append(&mvapi_btl->pending_frags_lp, (opal_list_item_t *)frag);
            OPAL_THREAD_UNLOCK(&mvapi_btl->ib_lock);
            return OMPI_SUCCESS;

            /* queue the request */
        } else {
            frag->hdr->credits = (endpoint->rd_credits_lp > 0) ?
                endpoint->rd_credits_lp : 0;
            OPAL_THREAD_ADD32(&endpoint->rd_credits_lp, -frag->hdr->credits);
            qp_hndl = endpoint->lcl_qp_hndl_lp;
        }
    }

    frag->desc.sr_desc.remote_qkey = 0;
    frag->sg_entry.addr = (VAPI_virt_addr_t) (MT_virt_addr_t) frag->hdr;
    /* Payload + header, plus a footer when the data goes via eager RDMA */
    frag->sg_entry.len = frag->segment.seg_len + sizeof(mca_btl_mvapi_header_t) +
        (do_rdma ? sizeof(mca_btl_mvapi_footer_t) : 0);

    if(do_rdma) {
        /* Footer lives immediately after the payload; the receiver polls
         * it to detect arrival */
        mca_btl_mvapi_footer_t* ftr =
            (mca_btl_mvapi_footer_t*)(((char*)frag->segment.seg_addr.pval) +
                                      frag->segment.seg_len);
        frag->desc.sr_desc.opcode = VAPI_RDMA_WRITE;
        MCA_BTL_MVAPI_RDMA_FRAG_SET_SIZE(ftr, frag->sg_entry.len);
        MCA_BTL_MVAPI_RDMA_MAKE_LOCAL(ftr);
#ifdef OMPI_ENABLE_DEBUG
        ftr->seq = endpoint->eager_rdma_remote.seq++;
#endif
        frag->desc.sr_desc.r_key = (VAPI_rkey_t)endpoint->eager_rdma_remote.rkey;
        /* Target = end of the current slot in the peer's eager RDMA ring,
         * then back up by the message length so the footer lands at the
         * end of the slot */
        frag->desc.sr_desc.remote_addr = (VAPI_virt_addr_t)
            endpoint->eager_rdma_remote.base.lval +
            endpoint->eager_rdma_remote.head * mvapi_btl->eager_rdma_frag_size +
            sizeof(mca_btl_mvapi_frag_t) +
            sizeof(mca_btl_mvapi_header_t) +
            frag->size +
            sizeof(mca_btl_mvapi_footer_t);
        frag->desc.sr_desc.remote_addr -= frag->sg_entry.len;
        MCA_BTL_MVAPI_RDMA_NEXT_INDEX (endpoint->eager_rdma_remote.head);
    } else {
        frag->desc.sr_desc.opcode = VAPI_SEND;
    }

    /* Inline-post small messages; the HCA copies the data at post time */
    if(frag->sg_entry.len <= mvapi_btl->ib_inline_max) {
        ret = EVAPI_post_inline_sr(mvapi_btl->nic, qp_hndl, &frag->desc.sr_desc);
    } else {
        ret = VAPI_post_sr(mvapi_btl->nic, qp_hndl, &frag->desc.sr_desc);
    }
    if(VAPI_OK != ret) {
        BTL_ERROR(("VAPI_post_sr: %s\n", VAPI_strerror(ret)));
        return OMPI_ERROR;
    }

    /* Replenish receive resources for the next incoming fragments */
#ifdef VAPI_FEATURE_SRQ
    if(mca_btl_mvapi_component.use_srq) {
        MCA_BTL_MVAPI_POST_SRR_HIGH(mvapi_btl, 1);
        MCA_BTL_MVAPI_POST_SRR_LOW(mvapi_btl, 1);
    } else
#endif
    {
        MCA_BTL_MVAPI_ENDPOINT_POST_RR_HIGH(endpoint, 1);
        MCA_BTL_MVAPI_ENDPOINT_POST_RR_LOW(endpoint, 1);
    }
    return OMPI_SUCCESS;
}