/*
 * Send an ACK for a received message back to the peer.
 *
 * The received header (msg_hdr) is rewritten in place into an ACK header and
 * posted on the port's listen QP; the original contents are saved first and
 * written back afterwards.
 *
 * port    - port whose listen QP and msg_buf memory registration are used
 * peer    - destination of the ack
 * msg_hdr - header of the message being acknowledged; modified temporarily
 *           and restored before returning (on success AND on failure)
 *
 * Returns ORTE_SUCCESS, or the error code from mca_oob_ud_qp_post_send.
 */
static int mca_oob_ud_event_send_ack (mca_oob_ud_port_t *port, mca_oob_ud_peer_t *peer,
                                      mca_oob_ud_msg_hdr_t *msg_hdr)
{
    mca_oob_ud_msg_hdr_t tmp_hdr;
    int rc = ORTE_SUCCESS;
    struct ibv_send_wr wr;
    struct ibv_sge sge;

    OPAL_OUTPUT_VERBOSE((10, mca_oob_base_output, "%s oob:ud:event_send_ack sending ack for message id %"
                         PRIu64 " peer = %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), msg_hdr->msg_id,
                         ORTE_NAME_PRINT(&peer->peer_name)));

    /* reuse registered buffer to send ack (just need to change the type/return address) */
    memcpy (&tmp_hdr, msg_hdr, sizeof (tmp_hdr));

    msg_hdr->msg_type = MCA_OOB_UD_MSG_ACK;

    /* set return address */
    msg_hdr->ra.qkey     = 0;
    msg_hdr->ra.name     = *ORTE_PROC_MY_NAME;
    msg_hdr->ra.port_num = port->port_num;

    /* the lkey used is the one of port->msg_buf, so msg_hdr is presumably
     * located inside that registered region -- TODO confirm against callers */
    mca_oob_ud_fill_sge (&sge, msg_hdr, sizeof (*msg_hdr), port->msg_buf.mr->lkey);
    mca_oob_ud_fill_send_wr (&wr, &sge, 1, peer);

    rc = mca_oob_ud_qp_post_send (&port->listen_qp, &wr, 1);

    /* Put the original header back no matter how the post went. Previously the
     * error path returned early and left the caller's header overwritten with
     * the ack type and our return address. */
    memcpy (msg_hdr, &tmp_hdr, sizeof (tmp_hdr));

    if (ORTE_SUCCESS != rc) {
        opal_output (0, "oob:ud:event_send_ack error posting ack!");
    }

    return rc;
}
/*
 * Post a single receive work request on the port's listen QP.
 *
 * The receive uses two scatter/gather entries: slot msg_num of the port's
 * GRH buffer (sizeof (struct ibv_grh) bytes) followed by slot msg_num of the
 * port's message buffer (port->mtu bytes).
 *
 * port    - port owning the buffers and the listen QP
 * msg_num - index of the buffer slot; it is also OR'ed into the work request
 *           id together with MCA_OOB_UD_RECV_WR
 *
 * Returns whatever mca_oob_ud_qp_post_recv returns.
 */
int mca_oob_ud_port_post_one_recv (mca_oob_ud_port_t *port, int msg_num)
{
    struct ibv_sge sg_list[2];
    struct ibv_recv_wr recv_wr;
    char *grh_slot;
    char *msg_slot;

    /* locate the slot for this message number in each buffer */
    grh_slot = port->grh_buf.ptr + msg_num * sizeof (struct ibv_grh);
    msg_slot = port->msg_buf.ptr + msg_num * port->mtu;

    /* first entry: GRH */
    mca_oob_ud_fill_sge (sg_list, grh_slot, sizeof (struct ibv_grh), port->grh_buf.mr->lkey);

    /* second entry: message payload */
    mca_oob_ud_fill_sge (sg_list + 1, msg_slot, port->mtu, port->msg_buf.mr->lkey);

    mca_oob_ud_fill_recv_wr (&recv_wr, sg_list, 2);
    recv_wr.wr_id = MCA_OOB_UD_RECV_WR | (uint64_t) msg_num;

    return mca_oob_ud_qp_post_recv (&port->listen_qp, &recv_wr);
}
/*
 * Take a message from the port's free list and prepare it for sending.
 *
 * port    - port whose free_msgs list supplies the message
 * req     - request stored in the message
 * qp      - queue pair stored in the message
 * peer    - destination peer (may be NULL); retained when non-NULL
 * persist - stored in the message's persist field
 * msgp    - out: receives the prepared message; untouched on failure
 *
 * The message header is zeroed, the message's sge/send work request are
 * filled in (sge length is port->mtu) and the header's return address is set
 * to this process's name and the port number.
 *
 * Returns ORTE_SUCCESS, or ORTE_ERROR if no free-list item was obtained.
 */
int mca_oob_ud_msg_get (struct mca_oob_ud_port_t *port, mca_oob_ud_req_t *req, mca_oob_ud_qp_t *qp,
                        mca_oob_ud_peer_t *peer, bool persist, mca_oob_ud_msg_t **msgp)
{
    opal_free_list_t *msg_pool = &port->free_msgs;
    opal_free_list_item_t *free_item;
    mca_oob_ud_msg_t *msg;

    free_item = opal_free_list_wait_st (msg_pool);
    if (NULL == free_item) {
        opal_output_verbose(5, orte_oob_base_framework.framework_output,
                            "%s oob:ud:msg_get error getting message buffer",
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
        return ORTE_ERROR;
    }

    msg = (mca_oob_ud_msg_t *) free_item;
    *msgp = msg;

    /* bookkeeping */
    msg->persist = persist;
    msg->req     = req;
    msg->peer    = peer;
    msg->qp      = qp;

    if (NULL != peer) {
        OBJ_RETAIN(peer);
    }

    /* start from a clean header */
    memset (msg->hdr, 0, sizeof (*msg->hdr));

    mca_oob_ud_fill_sge (&msg->sge, msg->hdr, port->mtu, msg->mr->lkey);
    mca_oob_ud_fill_send_wr (&msg->wr, &msg->sge, 1, peer);

    /* return address */
    msg->hdr->ra.name     = *ORTE_PROC_MY_NAME;
    msg->hdr->ra.qkey     = 0;
    msg->hdr->ra.port_num = port->port_num;

    return ORTE_SUCCESS;
}
/*
 * Try to set up the receive side of a data transfer for recv_req.
 *
 * Steps (all inside a single-pass do/while so any failure can break out):
 *   1. acquire a data QP if the request has none, then purge it
 *   2. get a reply message (sent later on the port's listen QP)
 *   3. register the user iovec and compute how many bytes will be received
 *   4. allocate (once) the work requests, sges, GRH buffers and GRH
 *      registration
 *   5. build one receive work request per mtu-sized packet, each starting
 *      with a GRH sge followed by the user-buffer sges
 *   6. post the receives and send the reply carrying qpn/data_len/mtu
 *
 * Result handling:
 *   - ORTE_ERR_TEMP_OUT_OF_RESOURCE: a retry timer (mca_oob_ud_recv_try_to)
 *     is set and the call is treated as successful.
 *   - any other error: the request is completed with the error (unless its
 *     type is MCA_OOB_UD_REQ_UNEX), released, and the error is returned.
 *   - success: the request state becomes MCA_OOB_UD_REQ_ACTIVE.
 *
 * NOTE(review): when the loop is left between mca_oob_ud_msg_get and
 * mca_oob_ud_msg_post_send, rep_msg is never handed back here -- confirm
 * whether it must be returned to the port's free list on those paths.
 */
int mca_oob_ud_recv_try (mca_oob_ud_req_t *recv_req)
{
    int rc, data_len;
    int wr_count, sge_count, wr_index, sge_index, iov_index;
    unsigned int iov_left, iov_offset, packet_size;
    const unsigned int mtu = recv_req->req_mtu;
    struct timeval aquire_timeout = {0, 500000};
    mca_oob_ud_msg_t *rep_msg = NULL;

    OPAL_OUTPUT_VERBOSE((10, mca_oob_base_output, "%s oob:ud:recv_try receiving from %s. rem ctx = %p",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_NAME_PRINT(&recv_req->req_peer->peer_name),
                         recv_req->req_rem_ctx));

    do {
        if (NULL == recv_req->req_qp) {
            rc = mca_oob_ud_qp_data_aquire (recv_req->req_port, &recv_req->req_qp);
            if (ORTE_SUCCESS != rc) {
                break;
            }
        }

        (void) mca_oob_ud_qp_purge (recv_req->req_qp);

        /* the persist argument is a bool: pass false rather than NULL */
        rc = mca_oob_ud_msg_get (recv_req->req_port, recv_req, &recv_req->req_port->listen_qp,
                                 recv_req->req_peer, false, &rep_msg);
        if (ORTE_SUCCESS != rc) {
            break;
        }

        if (NULL == recv_req->req_mr) {
            /* allocate space for memory registers */
            recv_req->req_mr = (struct ibv_mr **) calloc (recv_req->req_count, sizeof (struct ibv_mr *));
            if (NULL == recv_req->req_mr) {
                opal_output (0, "%s oob:ud:recv_try error allocating space for memory registers. errno = %d",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), errno);
                rc = ORTE_ERR_OUT_OF_RESOURCE;
                break;
            }
        }

        rc = mca_oob_ud_register_iov (recv_req->req_uiov, recv_req->req_count,
                                      recv_req->req_mr, recv_req->req_port->device->ib_pd,
                                      mtu, &sge_count, &wr_count, &data_len);
        if (ORTE_SUCCESS != rc) {
            break;
        }

        /* never receive more than the remote side announced */
        data_len = min(data_len, recv_req->req_rem_data_len);
        if (data_len < recv_req->req_rem_data_len && !(recv_req->req_flags & ORTE_RML_TRUNC)) {
            /* receive buffers are not big enough and ORTE_RML_TRUNC was not specified.
               this is probably an error condition */
            rc = ORTE_ERR_BAD_PARAM;
            break;
        }

        /* one work request per mtu-sized packet; each one needs an extra sge for its GRH */
        wr_count = (data_len + mtu - 1) / mtu;
        sge_count += wr_count;

        OPAL_OUTPUT_VERBOSE((5, mca_oob_base_output, "%s oob:ud:recv_try receiving %d bytes in %d "
                             "work requests, %d sges", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), data_len,
                             wr_count, sge_count));

        recv_req->req_packet_count = wr_count;

        if (NULL == recv_req->req_wr.recv) {
            /* allocate work requests */
            recv_req->req_wr.recv = (struct ibv_recv_wr *) calloc (wr_count, sizeof (struct ibv_recv_wr));
            if (NULL == recv_req->req_wr.recv) {
                opal_output (0, "%s oob:ud:recv_try error allocating work requests. errno = %d",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), errno);
                rc = ORTE_ERR_OUT_OF_RESOURCE;
                break;
            }
        }

        if (NULL == recv_req->req_sge) {
            /* allocate scatter-gather lists. we need more to hold the grh */
            recv_req->req_sge = (struct ibv_sge *) calloc (sge_count, sizeof (struct ibv_sge));
            if (NULL == recv_req->req_sge) {
                opal_output (0, "%s oob:ud:recv_try error allocating sges. errno = %d",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), errno);
                rc = ORTE_ERR_OUT_OF_RESOURCE;
                break;
            }
        }

        if (NULL == recv_req->req_grh) {
            /* allocate grh buffers */
            recv_req->req_grh = (struct ibv_grh *) calloc (wr_count, sizeof (struct ibv_grh));
            if (NULL == recv_req->req_grh) {
                opal_output (0, "%s oob:ud:recv_try error allocating space for GRHs. errno = %d",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), errno);
                rc = ORTE_ERR_OUT_OF_RESOURCE;
                break;
            }
        }

        if (NULL == recv_req->req_grh_mr) {
            /* register grh buffers */
            recv_req->req_grh_mr = ibv_reg_mr (recv_req->req_port->device->ib_pd, recv_req->req_grh,
                                               wr_count * sizeof (struct ibv_grh),
                                               IBV_ACCESS_LOCAL_WRITE);
            if (NULL == recv_req->req_grh_mr) {
                opal_output (0, "%s oob:ud:recv_try error allocating registering GRH memory. "
                             "errno = %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), errno);
                /* could not register memory */
                rc = ORTE_ERR_OUT_OF_RESOURCE;
                break;
            }
        }

        rc = ORTE_SUCCESS;

        /* walk the user iovec, cutting it into packets of at most mtu bytes */
        iov_left   = recv_req->req_uiov[0].iov_len;
        iov_offset = 0;
        iov_index  = 0;

        for (wr_index = 0, sge_index = 0 ; wr_index < wr_count ; ++wr_index) {
            int sge_first = sge_index;

            packet_size = 0;

            /* grh */
            mca_oob_ud_fill_sge(recv_req->req_sge + sge_index++,
                                recv_req->req_grh + wr_index,
                                sizeof (struct ibv_grh),
                                recv_req->req_grh_mr->lkey);

            do {
                int to_recv = min (iov_left, mtu - packet_size);

                mca_oob_ud_fill_sge(recv_req->req_sge + sge_index++,
                                    (char *)recv_req->req_uiov[iov_index].iov_base + iov_offset,
                                    to_recv, recv_req->req_mr[iov_index]->lkey);

                iov_offset  += to_recv;
                iov_left    -= to_recv;
                packet_size += to_recv;

                if (0 == iov_left) {
                    /* current iovec entry exhausted: move on to the next one, if any */
                    iov_index++;
                    iov_offset = 0;

                    if (iov_index < recv_req->req_count) {
                        iov_left = recv_req->req_uiov[iov_index].iov_len;
                    }
                }
            } while ((packet_size < mtu) && (iov_left > 0));

            mca_oob_ud_fill_recv_wr(recv_req->req_wr.recv + wr_index,
                                    recv_req->req_sge + sge_first,
                                    sge_index - sge_first);

            /* chain the work requests so they can be posted in one call */
            if (wr_index + 1 < wr_count) {
                recv_req->req_wr.recv[wr_index].next = recv_req->req_wr.recv + wr_index + 1;
            }
        }

        rc = mca_oob_ud_qp_post_recv (recv_req->req_qp, recv_req->req_wr.recv);
        if (ORTE_SUCCESS != rc) {
            break;
        }

        OPAL_OUTPUT_VERBOSE((5, mca_oob_base_output, "%s oob:ud:recv_try posting reply message",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

        /* ok, we have a data queue pair */
        rep_msg->hdr->msg_type = MCA_OOB_UD_MSG_REPLY;
        rep_msg->hdr->msg_lcl_ctx = recv_req->req_rem_ctx;
        rep_msg->hdr->msg_rem_ctx = recv_req;

        rep_msg->hdr->msg_data.rep.qpn      = recv_req->req_qp->ib_qp->qp_num;
        rep_msg->hdr->msg_data.rep.data_len = data_len;
        rep_msg->hdr->msg_data.rep.mtu      = mtu;

        rc = mca_oob_ud_msg_post_send (rep_msg);

        /* post send already returned the message */
        rep_msg = NULL;
    } while (0);

    if (ORTE_ERR_TEMP_OUT_OF_RESOURCE == rc) {
        /* temporary shortage: schedule a retry and report success for now */
        mca_oob_ud_req_timer_set (recv_req, &aquire_timeout, 1, mca_oob_ud_recv_try_to);
        rc = ORTE_SUCCESS;
    }

    if (ORTE_SUCCESS != rc) {
        /* bad stuff happened */
        if (MCA_OOB_UD_REQ_UNEX != recv_req->type) {
            mca_oob_ud_req_complete (recv_req, rc);
        }

        OBJ_RELEASE(recv_req);
        return rc;
    }

    recv_req->state = MCA_OOB_UD_REQ_ACTIVE;

    return rc;
}
/*
 * Finish one data work request of send_req after its sges were attached:
 * set the send flags, the sequence number (immediate data), the remote QP
 * number and the opcode, and link it to the following work request if there
 * is one.
 */
static void mca_oob_ud_send_try_finish_wr (mca_oob_ud_req_t *send_req, int wr_index, int wr_count)
{
    struct ibv_send_wr *wr = send_req->req_wr.send + wr_index;

    /* we don't care about completions for data */
    wr->send_flags = IBV_SEND_SOLICITED;

    /* sequence number */
    wr->imm_data          = wr_index;
    wr->wr.ud.remote_qpn  = send_req->req_rem_qpn;
    wr->opcode            = IBV_WR_SEND_WITH_IMM;

    if (wr_index + 1 < wr_count) {
        wr->next = wr + 1;
    }
}

/*
 * Try to send the data of send_req to its peer.
 *
 * Steps (all inside a single-pass do/while so any failure can break out):
 *   1. acquire a data QP if the request has none, then purge it
 *   2. get the completion message (bound to the data QP)
 *   3. register the payload (iovec or flat buffer, per req_data_type)
 *   4. allocate (once) the send work requests and sges
 *   5. cut the payload into packets of at most mtu bytes, one work request
 *      per packet, numbered through the immediate data
 *   6. post the data, then post the completion message
 *
 * Result handling:
 *   - ORTE_ERR_TEMP_OUT_OF_RESOURCE: a retry timer (mca_oob_ud_send_try_to)
 *     is set and the call is treated as successful.
 *   - any other error: logged, and the result of
 *     mca_oob_ud_send_complete (send_req, rc) is returned.
 *   - success: the request state becomes MCA_OOB_UD_REQ_ACTIVE.
 *
 * NOTE(review): when the loop is left between mca_oob_ud_msg_get and
 * mca_oob_ud_msg_post_send, com_msg is never handed back here -- confirm
 * whether it must be returned to the port's free list on those paths.
 */
int mca_oob_ud_send_try (mca_oob_ud_req_t *send_req)
{
    int wr_index, wr_count, sge_count, sge_index, iov_index;
    unsigned int iov_left, iov_offset, packet_size;
    const unsigned int mtu = send_req->req_mtu;
    const struct timeval aquire_timeout = {0, 500000};
    mca_oob_ud_msg_t *com_msg = NULL;
    int data_len;
    int rc = ORTE_SUCCESS;

    opal_output_verbose(10, orte_oob_base_framework.framework_output,
                        "%s oob:ud:send_try sending to %s, tag = %d, "
                        "req = %p", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                        ORTE_NAME_PRINT(&send_req->req_peer->peer_name),
                        send_req->req_tag, (void *) send_req);

    do {
        if (NULL == send_req->req_qp) {
            rc = mca_oob_ud_qp_data_aquire (send_req->req_port, &send_req->req_qp);
            if (ORTE_SUCCESS != rc) {
                break;
            }
        }

        (void) mca_oob_ud_qp_purge (send_req->req_qp);

        rc = mca_oob_ud_msg_get (send_req->req_port, send_req, send_req->req_qp, send_req->req_peer,
                                 false, &com_msg);
        if (ORTE_SUCCESS != rc) {
            break;
        }

        if (MCA_OOB_UD_REQ_IOV == send_req->req_data_type) {
            if (NULL == send_req->req_data.iov.mr) {
                /* allocate space for memory registers */
                send_req->req_data.iov.mr = (struct ibv_mr **) calloc (send_req->req_data.iov.count,
                                                                       sizeof (struct ibv_mr *));
                if (NULL == send_req->req_data.iov.mr) {
                    rc = ORTE_ERR_OUT_OF_RESOURCE;
                    ORTE_ERROR_LOG(rc);
                    break;
                }
            }

            rc = mca_oob_ud_register_iov (send_req->req_data.iov.uiov, send_req->req_data.iov.count,
                                          send_req->req_data.iov.mr, send_req->req_port->device->ib_pd,
                                          mtu, &sge_count, &wr_count, &data_len);
            if (ORTE_SUCCESS != rc) {
                break;
            }
        } else {
            data_len = send_req->req_data.buf.size;
            rc = mca_oob_ud_register_buf(send_req->req_data.buf.p, send_req->req_data.buf.size,
                                         &send_req->req_data.buf.mr, send_req->req_port->device->ib_pd,
                                         mtu, &sge_count, &wr_count);
            if (ORTE_SUCCESS != rc) {
                break;
            }
        }

        /* one work request per mtu-sized packet (zero when there is no data) */
        wr_count = (data_len + mtu - 1) / mtu;

        if (MCA_OOB_UD_REQ_IOV == send_req->req_data_type) {
            opal_output_verbose(5, orte_oob_base_framework.framework_output,
                                "%s oob:ud:send_try sending %d bytes in %d "
                                "work requests, %d sges. uiov = %p.", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                data_len, wr_count, sge_count, (void *) send_req->req_data.iov.uiov);
        } else {
            opal_output_verbose(5, orte_oob_base_framework.framework_output,
                                "%s oob:ud:send_try sending %d bytes in %d "
                                "work requests, %d sges. buf = %p", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                data_len, wr_count, sge_count, (void *) send_req->req_data.buf.p);
        }

        if (wr_count && NULL == send_req->req_wr.send) {
            send_req->req_wr.send = (struct ibv_send_wr *) calloc (wr_count, sizeof (struct ibv_send_wr));
            if (NULL == send_req->req_wr.send) {
                rc = ORTE_ERR_OUT_OF_RESOURCE;
                ORTE_ERROR_LOG(rc);
                break;
            }
        }

        if (wr_count && NULL == send_req->req_sge) {
            send_req->req_sge = (struct ibv_sge *) calloc (sge_count, sizeof (struct ibv_sge));
            if (NULL == send_req->req_sge) {
                rc = ORTE_ERR_OUT_OF_RESOURCE;
                ORTE_ERROR_LOG(rc);
                break;
            }
        }

        if (MCA_OOB_UD_REQ_IOV == send_req->req_data_type) {
            opal_output_verbose(10, orte_oob_base_framework.framework_output,
                                "%s oob:ud:send_try posting message using iovec",
                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));

            /* walk the user iovec, cutting it into packets of at most mtu bytes */
            iov_left   = send_req->req_data.iov.uiov[0].iov_len;
            iov_offset = 0;
            iov_index  = 0;

            for (wr_index = 0, sge_index = 0 ; wr_index < wr_count ; ++wr_index) {
                int sge_first = sge_index;

                packet_size = 0;

                do {
                    int to_send = min (iov_left, mtu - packet_size);

                    mca_oob_ud_fill_sge(send_req->req_sge + sge_index++,
                                        (char *)send_req->req_data.iov.uiov[iov_index].iov_base + iov_offset,
                                        to_send, send_req->req_data.iov.mr[iov_index]->lkey);

                    iov_offset  += to_send;
                    iov_left    -= to_send;
                    packet_size += to_send;

                    if (0 == iov_left) {
                        /* current iovec entry exhausted: move on to the next one, if any */
                        iov_index++;
                        iov_offset = 0;

                        if (iov_index < send_req->req_data.iov.count) {
                            iov_left = send_req->req_data.iov.uiov[iov_index].iov_len;
                        }
                    }
                } while ((packet_size < mtu) && (iov_left > 0));

                mca_oob_ud_fill_send_wr(send_req->req_wr.send + wr_index,
                                        send_req->req_sge + sge_first,
                                        sge_index - sge_first, send_req->req_peer);

                mca_oob_ud_send_try_finish_wr (send_req, wr_index, wr_count);
            }
        } else {
            /* data is in buffer */
            unsigned int buffer_offset = 0;
            unsigned int buffer_size = send_req->req_data.buf.size;

            opal_output_verbose(10, orte_oob_base_framework.framework_output,
                                "%s oob:ud:send_try posting message using buffer",
                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));

            for (wr_index = 0, sge_index = 0 ; wr_index < wr_count ; ++wr_index) {
                int sge_first = sge_index;

                packet_size = 0;

                do {
                    int to_send = min (buffer_size, mtu - packet_size);

                    mca_oob_ud_fill_sge(send_req->req_sge + sge_index++,
                                        (char *)send_req->req_data.buf.p + buffer_offset,
                                        to_send, send_req->req_data.buf.mr->lkey);

                    buffer_offset += to_send;
                    buffer_size   -= to_send;
                    packet_size   += to_send;
                } while ((packet_size < mtu) && (buffer_size > 0));

                mca_oob_ud_fill_send_wr(send_req->req_wr.send + wr_index,
                                        send_req->req_sge + sge_first,
                                        sge_index - sge_first, send_req->req_peer);

                mca_oob_ud_send_try_finish_wr (send_req, wr_index, wr_count);

                opal_output_verbose(10, orte_oob_base_framework.framework_output,
                                    "%s oob:ud:send_try imm_data = %d",
                                    ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), wr_index);
            }
        }

        /* send data */
        rc = mca_oob_ud_qp_post_send (send_req->req_qp, send_req->req_wr.send, 0);
        if (ORTE_SUCCESS != rc) {
            ORTE_ERROR_LOG(rc);
            break;
        }

        opal_output_verbose(10, orte_oob_base_framework.framework_output,
                            "%s oob:ud:send_try posting completion message",
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));

        /* Fill in completion message. This message will go to the peer's listen QP but
           must originate from our data qp to ensure that it is sent last. */
        com_msg->hdr->msg_type    = MCA_OOB_UD_MSG_COMPLETE;
        com_msg->hdr->msg_lcl_ctx = send_req->req_rem_ctx;
        com_msg->hdr->msg_rem_ctx = send_req;

        /* send message header */
        rc = mca_oob_ud_msg_post_send (com_msg);

        /* post_send already returned the message */
        com_msg = NULL;
    } while (0);

    if (ORTE_ERR_TEMP_OUT_OF_RESOURCE == rc) {
        /* set timer to retry post */
        mca_oob_ud_req_timer_set (send_req, &aquire_timeout, 1, mca_oob_ud_send_try_to);
        rc = ORTE_SUCCESS;
    }

    if (ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
        /* damn */
        return mca_oob_ud_send_complete (send_req, rc);
    }

    send_req->state = MCA_OOB_UD_REQ_ACTIVE;

    return rc;
}