/*
 * Send an ACK for a received message back to the given peer.
 *
 * The caller's header (msg_hdr) is temporarily rewritten in place into an
 * ACK header and posted on the port's listen QP; the lkey used for the send
 * is the one of port->msg_buf.mr. The original header contents are saved
 * before the rewrite and copied back before returning, on both the success
 * and the failure path, so the caller always sees msg_hdr unchanged.
 *
 * @param port     port whose listen QP and message buffer registration are used
 * @param peer     destination of the ACK
 * @param msg_hdr  header of the message being acknowledged (modified
 *                 temporarily, restored before return)
 *
 * @return ORTE_SUCCESS, or the error code returned by mca_oob_ud_qp_post_send.
 */
static int mca_oob_ud_event_send_ack (mca_oob_ud_port_t *port, mca_oob_ud_peer_t *peer, mca_oob_ud_msg_hdr_t *msg_hdr)
{
    mca_oob_ud_msg_hdr_t tmp_hdr;
    int rc = ORTE_SUCCESS;
    struct ibv_send_wr wr;
    struct ibv_sge sge;

    OPAL_OUTPUT_VERBOSE((10, mca_oob_base_output, "%s oob:ud:event_send_ack sending ack for message id %"
                         PRIu64 " peer = %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), msg_hdr->msg_id,
                         ORTE_NAME_PRINT(&peer->peer_name)));

    /* reuse registered buffer to send ack (just need to change the type/return address);
       keep a copy of the original header so it can be put back afterwards */
    memcpy (&tmp_hdr, msg_hdr, sizeof (tmp_hdr));

    msg_hdr->msg_type = MCA_OOB_UD_MSG_ACK;

    /* set return address */
    msg_hdr->ra.qkey     = 0;
    msg_hdr->ra.name     = *ORTE_PROC_MY_NAME;
    msg_hdr->ra.port_num = port->port_num;

    mca_oob_ud_fill_sge (&sge, msg_hdr, sizeof (*msg_hdr), port->msg_buf.mr->lkey);
    mca_oob_ud_fill_send_wr (&wr, &sge, 1, peer);

    rc = mca_oob_ud_qp_post_send (&port->listen_qp, &wr, 1);

    /* restore the caller's header whether or not the post succeeded; previously
       a failed post left the header rewritten as an ACK */
    memcpy (msg_hdr, &tmp_hdr, sizeof (tmp_hdr));

    if (ORTE_SUCCESS != rc) {
        opal_output (0, "oob:ud:event_send_ack error posting ack!");
        return rc;
    }

    return ORTE_SUCCESS;
}
/* Example #2 */
/* 0 */
/*
 * Take a message object from the port's free message list and initialize it.
 *
 * The item is obtained with opal_free_list_wait_st on port->free_msgs. The
 * message records persist/req/peer/qp, its header is zeroed, its
 * scatter/gather entry is set up over the header with length port->mtu, and
 * its send work request is filled for the given peer. The header's return
 * address is set to this process (name, qkey 0, port number of the port).
 * A non-NULL peer is retained with OBJ_RETAIN.
 *
 * @param port     port providing the free list, mtu and port number
 * @param req      request stored in the message
 * @param qp       queue pair stored in the message
 * @param peer     destination peer (may be NULL)
 * @param persist  stored in the message's persist flag
 * @param msgp     receives the message on success; not written on failure
 *
 * @return ORTE_SUCCESS, or ORTE_ERROR if no item could be obtained.
 */
int mca_oob_ud_msg_get (struct mca_oob_ud_port_t *port, mca_oob_ud_req_t *req,
                        mca_oob_ud_qp_t *qp, mca_oob_ud_peer_t *peer, bool persist,
                        mca_oob_ud_msg_t **msgp)
{
    mca_oob_ud_msg_t *msg;
    opal_free_list_item_t *free_item = opal_free_list_wait_st (&port->free_msgs);

    if (NULL == free_item) {
        opal_output_verbose(5, orte_oob_base_framework.framework_output,
                            "%s oob:ud:msg_get error getting message buffer",
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
        return ORTE_ERROR;
    }

    /* the free list item is the message object itself */
    msg   = (mca_oob_ud_msg_t *) free_item;
    *msgp = msg;

    msg->persist = persist;
    msg->req     = req;
    msg->peer    = peer;
    msg->qp      = qp;

    /* the message keeps its own reference on the peer */
    if (NULL != peer) {
        OBJ_RETAIN(peer);
    }

    /* start from a clean header */
    memset (msg->hdr, 0, sizeof (*(msg->hdr)));

    mca_oob_ud_fill_sge (&msg->sge, msg->hdr, port->mtu, msg->mr->lkey);
    mca_oob_ud_fill_send_wr (&msg->wr, &msg->sge, 1, peer);

    /* return address: this process, reached through this port */
    msg->hdr->ra.name     = *ORTE_PROC_MY_NAME;
    msg->hdr->ra.qkey     = 0;
    msg->hdr->ra.port_num = port->port_num;

    return ORTE_SUCCESS;
}
/* Example #3 */
/* 0 */
/*
 * Finish setting up data work request number wr_index of send_req.
 *
 * The work request covers sge_used scatter/gather entries starting at
 * send_req->req_sge[sge_first]. It is addressed to the request's peer and
 * remote QP number, carries wr_index as immediate data, and is linked to the
 * following work request unless it is the last of wr_count.
 */
static void mca_oob_ud_send_try_fill_data_wr (mca_oob_ud_req_t *send_req, int wr_index, int wr_count,
                                              int sge_first, int sge_used)
{
    struct ibv_send_wr *wr = send_req->req_wr.send + wr_index;

    mca_oob_ud_fill_send_wr(wr, send_req->req_sge + sge_first, sge_used, send_req->req_peer);

    /* we don't care about completions for data  */
    wr->send_flags       = IBV_SEND_SOLICITED;

    /* sequence number */
    wr->imm_data         = wr_index;
    wr->wr.ud.remote_qpn = send_req->req_rem_qpn;
    wr->opcode           = IBV_WR_SEND_WITH_IMM;

    /* chain to the next work request; the last one keeps the next pointer
       left by mca_oob_ud_fill_send_wr */
    if (wr_index + 1 < wr_count) {
        wr->next = wr + 1;
    }
}

/*
 * Try to post the data of a send request followed by its completion message.
 *
 * Steps, in order:
 *   1. acquire a data QP for the request if it has none, and purge it;
 *   2. get a message object for the completion message;
 *   3. register the user iovec or buffer;
 *   4. allocate (once) the work request and scatter/gather arrays;
 *   5. split the payload into work requests of at most req_mtu bytes each;
 *   6. post the data work requests, then the completion message on the
 *      same QP.
 *
 * If any step yields ORTE_ERR_TEMP_OUT_OF_RESOURCE a timer is set to call
 * mca_oob_ud_send_try_to after aquire_timeout ({0, 500000}) and the function
 * reports success. Any other error completes the request through
 * mca_oob_ud_send_complete.
 *
 * NOTE(review): when the loop is left through an error break after
 * mca_oob_ud_msg_get succeeded, com_msg is not handed back anywhere in this
 * function - confirm whether it is released elsewhere or leaks.
 *
 * @param send_req  send request to post
 *
 * @return ORTE_SUCCESS (also when a retry was scheduled), otherwise the
 *         return value of mca_oob_ud_send_complete (send_req, rc).
 */
int mca_oob_ud_send_try (mca_oob_ud_req_t *send_req) {
    int wr_index, wr_count, sge_count, sge_index, iov_index;
    unsigned int iov_left, iov_offset, packet_size;
    const unsigned int mtu = send_req->req_mtu;
    const struct timeval aquire_timeout = {0, 500000};
    mca_oob_ud_msg_t *com_msg = NULL;
    int data_len;
    int rc = ORTE_SUCCESS;

    opal_output_verbose(10, orte_oob_base_framework.framework_output,
                         "%s oob:ud:send_try sending to %s, tag = %d, "
                         "req = %p",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_NAME_PRINT(&send_req->req_peer->peer_name),
                         send_req->req_tag, (void *) send_req);

    /* single-pass loop: break jumps to the common error handling below */
    do {
        if (NULL == send_req->req_qp) {
            rc = mca_oob_ud_qp_data_aquire (send_req->req_port, &send_req->req_qp);
            if (ORTE_SUCCESS != rc) {
                break;
            }
        }

        (void) mca_oob_ud_qp_purge (send_req->req_qp);

        rc = mca_oob_ud_msg_get (send_req->req_port, send_req, send_req->req_qp, send_req->req_peer, false,
                                 &com_msg);
        if (ORTE_SUCCESS != rc) {
            break;
        }

        if (MCA_OOB_UD_REQ_IOV == send_req->req_data_type) {
            if (NULL == send_req->req_data.iov.mr) {
                /* allocate space for memory registers */
                send_req->req_data.iov.mr = (struct ibv_mr **) calloc (send_req->req_data.iov.count, sizeof (struct ibv_mr *));
                if (NULL == send_req->req_data.iov.mr) {
                    rc = ORTE_ERR_OUT_OF_RESOURCE;
                    ORTE_ERROR_LOG(rc);
                    break;
                }
            }

            rc = mca_oob_ud_register_iov (send_req->req_data.iov.uiov, send_req->req_data.iov.count,
                                          send_req->req_data.iov.mr, send_req->req_port->device->ib_pd,
                                          mtu, &sge_count, &wr_count, &data_len);

            if (ORTE_SUCCESS != rc) {
                break;
            }
        } else {
            data_len = send_req->req_data.buf.size;
            rc = mca_oob_ud_register_buf(send_req->req_data.buf.p, send_req->req_data.buf.size,
                                         &send_req->req_data.buf.mr, send_req->req_port->device->ib_pd,
                                         mtu, &sge_count, &wr_count);

            if (ORTE_SUCCESS != rc) {
                break;
            }
        }

        /* one work request per mtu-sized packet (rounded up); this replaces
           the count reported by the register call above */
        wr_count = (data_len + mtu - 1) / mtu;

        if (MCA_OOB_UD_REQ_IOV == send_req->req_data_type) {
            opal_output_verbose(5, orte_oob_base_framework.framework_output,
                                 "%s oob:ud:send_try sending %d bytes in %d "
                                 "work requests, %d sges. uiov = %p.", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), data_len,
                                 wr_count, sge_count, (void *) send_req->req_data.iov.uiov);
        } else {
            opal_output_verbose(5, orte_oob_base_framework.framework_output,
                                 "%s oob:ud:send_try sending %d bytes in %d "
                                 "work requests, %d sges. buf = %p", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), data_len,
                                 wr_count, sge_count, (void *) send_req->req_data.buf.p);
        }

        /* the arrays are allocated on the first attempt only and reused afterwards */
        if (wr_count && NULL == send_req->req_wr.send) {
            send_req->req_wr.send = (struct ibv_send_wr *) calloc (wr_count, sizeof (struct ibv_send_wr));
            if (NULL == send_req->req_wr.send) {
                rc = ORTE_ERR_OUT_OF_RESOURCE;
                ORTE_ERROR_LOG(rc);
                break;
            }
        }

        if (wr_count && NULL == send_req->req_sge) {
            send_req->req_sge = (struct ibv_sge *) calloc (sge_count, sizeof (struct ibv_sge));

            if (NULL == send_req->req_sge) {
                rc = ORTE_ERR_OUT_OF_RESOURCE;
                ORTE_ERROR_LOG(rc);
                break;
            }
        }

        if (MCA_OOB_UD_REQ_IOV == send_req->req_data_type) {
            opal_output_verbose(10, orte_oob_base_framework.framework_output,
                 "%s oob:ud:send_try posting message using iovec",
                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));

            iov_left   = send_req->req_data.iov.uiov[0].iov_len;
            iov_offset = 0;
            iov_index  = 0;

            for (wr_index = 0, sge_index = 0 ; wr_index < wr_count ; ++wr_index) {
                int sge_first = sge_index;

                packet_size = 0;

                /* add sges until the packet holds mtu bytes or the current
                   iovec entry has nothing left */
                do {
                    int to_send = min (iov_left, mtu - packet_size);

                    mca_oob_ud_fill_sge(send_req->req_sge + sge_index++,
                                        (char *)send_req->req_data.iov.uiov[iov_index].iov_base + iov_offset,
                                        to_send, send_req->req_data.iov.mr[iov_index]->lkey);

                    iov_offset  += to_send;
                    iov_left    -= to_send;
                    packet_size += to_send;

                    if (0 == iov_left) {
                        /* current entry consumed: move on to the next one, if any */
                        iov_index++;
                        iov_offset = 0;

                        if (iov_index < send_req->req_data.iov.count) {
                            iov_left = send_req->req_data.iov.uiov[iov_index].iov_len;
                        }
                    }
                } while ((packet_size < mtu) && (iov_left > 0));

                mca_oob_ud_send_try_fill_data_wr (send_req, wr_index, wr_count, sge_first,
                                                  sge_index - sge_first);
            }
        } else {//data is in buffer
            unsigned int buffer_offset = 0;
            unsigned int buffer_size = send_req->req_data.buf.size;

            opal_output_verbose(10, orte_oob_base_framework.framework_output,
                                 "%s oob:ud:send_try posting message using buffer",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));

            for (wr_index = 0, sge_index = 0 ; wr_index < wr_count ; ++wr_index) {
                int sge_first = sge_index;

                packet_size = 0;

                /* add sges until the packet holds mtu bytes or the buffer is exhausted */
                do {
                    int to_send = min (buffer_size, mtu - packet_size);

                    mca_oob_ud_fill_sge(send_req->req_sge + sge_index++,
                                        (char *)send_req->req_data.buf.p + buffer_offset,
                                        to_send, send_req->req_data.buf.mr->lkey);

                    buffer_offset  += to_send;
                    buffer_size    -= to_send;
                    packet_size += to_send;
                } while ((packet_size < mtu) && (buffer_size > 0));

                mca_oob_ud_send_try_fill_data_wr (send_req, wr_index, wr_count, sge_first,
                                                  sge_index - sge_first);

                opal_output_verbose(10, orte_oob_base_framework.framework_output,
                                     "%s oob:ud:send_try imm_data = %d",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), wr_index);
            }
        }

        /* send data */
        rc = mca_oob_ud_qp_post_send (send_req->req_qp, send_req->req_wr.send, 0);
        if (ORTE_SUCCESS != rc) {
            ORTE_ERROR_LOG(rc);
            break;
        }

        opal_output_verbose(10, orte_oob_base_framework.framework_output,
                             "%s oob:ud:send_try posting completion message",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));

        /* Fill in completion message. This message will go to the peers listen QP but
           must originate from our data qp to ensure that it is sent last. */
        com_msg->hdr->msg_type    = MCA_OOB_UD_MSG_COMPLETE;
        com_msg->hdr->msg_lcl_ctx = send_req->req_rem_ctx;
        com_msg->hdr->msg_rem_ctx = send_req;

        /* send message header */
        rc = mca_oob_ud_msg_post_send (com_msg);

        /* post_send already returned the message */
        com_msg = NULL;
    } while (0);

    if (ORTE_ERR_TEMP_OUT_OF_RESOURCE == rc) {
        /* set timer to retry post */
        mca_oob_ud_req_timer_set (send_req, &aquire_timeout, 1, mca_oob_ud_send_try_to);
        rc = ORTE_SUCCESS;
    }

    if (ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
        /* damn */
        return mca_oob_ud_send_complete (send_req, rc);
    }

    send_req->state = MCA_OOB_UD_REQ_ACTIVE;

    return rc;
}