Exemple #1
0
/**
 * ACK-timeout callback for a message waiting in the channel's hotel.
 *
 * Marks the message as failed with ORTE_ERR_ACK_TIMEOUT_SENDER, clears the
 * room recorded for its sequence number and completes the send.
 *
 * @param hotel     hotel the occupant was staying in (not used here)
 * @param room_num  room the occupant was staying in (not used here)
 * @param occupant  the orte_rml_send_t whose ACK did not arrive
 */
void orte_qos_ack_msg_ack_timeout_callback (struct opal_hotel_t *hotel,
                                            int room_num, void *occupant)
{
    orte_rml_send_t *timed_out = (orte_rml_send_t *) occupant;
    orte_qos_ack_channel_t *chan =
        (orte_qos_ack_channel_t *) timed_out->channel->qos_channel_ptr;

    OPAL_OUTPUT_VERBOSE((1, orte_qos_base_framework.framework_output,
                         "%s orte_qos_ack_msg_ack_timeout_callback for msg = %p seq num =%d\n",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         (void*)timed_out, timed_out->seq_num));

    /* Only the message that actually timed out is completed for now.
     * TODO: handle the completion of all messages in the window. */
    timed_out->status = ORTE_ERR_ACK_TIMEOUT_SENDER;

    /* -1 records that this sequence number no longer occupies a room */
    orte_qos_ack_channel_set_msg_room (chan, timed_out->seq_num, -1);

    /* hand the failed message back through the send-complete path */
    ORTE_RML_SEND_COMPLETE(timed_out);
}
Exemple #2
0
/**
 * Process an ACK received for messages sent on an ack-QoS channel.
 *
 * The buffer is unpacked as: channel number, ack type, number of acked
 * messages, then that many sequence numbers (all OPAL_UINT32).
 *
 *  - For a normal ACK every acked message still checked in to the channel's
 *    hotel is checked out and completed with ORTE_SUCCESS.
 *  - For an ACK_OUT_OF_ORDER ack, messages acked in sequence are completed;
 *    when a gap is found the last acked message is checked back in and every
 *    message in the gap is marked ORTE_ERR_LOST_MSG_IN_WINDOW, checked back
 *    in and resent.
 *
 * @param status  not used here
 * @param sender  not used here
 * @param buffer  packed ACK as described above
 * @param tag     not used here
 * @param cbdata  not used here
 */
void orte_qos_ack_channel_process_ack (int status, orte_process_name_t* sender,
                                       opal_buffer_t *buffer,
                                       orte_rml_tag_t tag, void *cbdata)
{
    /* values read off the wire start at 0 so that a failed unpack cannot
     * leave them indeterminate */
    uint32_t num_msgs_acked = 0, channel_num = 0, i;
    int32_t num_values, room_num;
    orte_rml_send_t *msg, *missed_msg;
    void *occupant = NULL;
    orte_rml_channel_t *channel;
    orte_qos_ack_channel_t *ack_chan;
    uint32_t *seq_num_array;
    uint32_t ack_type = 0;
    uint32_t missed_msg_seq_num = 0;

    /* unpack channel number first */
    num_values = 1;
    opal_dss.unpack(buffer, (void*) &channel_num, &num_values, OPAL_UINT32);
    OPAL_OUTPUT_VERBOSE((5, orte_qos_base_framework.framework_output,
                         "orte_qos_ack_channel_process_ack recieved ack on channel = %d",
                         channel_num));

    /* Both the channel and its QoS state must exist.  The original test
     * joined the two checks with '||', which dereferenced a NULL channel. */
    channel = orte_rml_base_get_channel (channel_num);
    if ((NULL == channel) || (NULL == channel->qos_channel_ptr)) {
        OPAL_OUTPUT_VERBOSE((5, orte_qos_base_framework.framework_output,
                             "orte_qos_ack_channel_msg_ack_recv_callback recieved ack on non existent channel = %d",
                             channel_num));
        return;
    }
    ack_chan = (orte_qos_ack_channel_t *) (channel->qos_channel_ptr);

    /* unpack ack type */
    num_values = 1;
    opal_dss.unpack(buffer, (void*) &ack_type, &num_values, OPAL_UINT32);
    /* unpack num messages acked */
    num_values = 1;
    opal_dss.unpack(buffer, (void*) &num_msgs_acked, &num_values, OPAL_UINT32);
    OPAL_OUTPUT_VERBOSE((5, orte_qos_base_framework.framework_output,
                         "orte_qos_ack_channel_process_ack recieved ack type %d for %d msgs on channel = %d",
                         ack_type, num_msgs_acked, channel_num));

    /* seq_num_array holds one entry per window slot; the count comes from
     * the peer, so refuse anything that would write past the array */
    if (num_msgs_acked > ack_chan->window) {
        OPAL_OUTPUT_VERBOSE((1, orte_qos_base_framework.framework_output,
                             "orte_qos_ack_channel_process_ack ack for %d msgs exceeds window on channel = %d - ignoring",
                             num_msgs_acked, channel_num));
        return;
    }

    seq_num_array = malloc (sizeof(uint32_t) * ack_chan->window);
    if (NULL == seq_num_array) {
        OPAL_OUTPUT_VERBOSE((1, orte_qos_base_framework.framework_output,
                             "orte_qos_ack_channel_process_ack out of memory processing ack on channel = %d",
                             channel_num));
        return;
    }

    if (ACK_OUT_OF_ORDER != ack_type) {
        /* handle normal ACK */
        for (i = 0; i < num_msgs_acked; i++) {
            num_values = 1;
            opal_dss.unpack(buffer, (void*) &seq_num_array[i], &num_values, OPAL_UINT32);
            room_num = orte_qos_ack_channel_get_msg_room (ack_chan, seq_num_array[i]);
            /* -1 means the message holds no room; never hand that to the hotel */
            occupant = NULL;
            if (-1 != room_num) {
                opal_hotel_checkout_and_return_occupant(&ack_chan->outstanding_msgs, room_num, &occupant);
            }
            orte_qos_ack_channel_set_msg_room(ack_chan, seq_num_array[i], -1);
            if (NULL != occupant) {
                msg = (orte_rml_send_t*) occupant;
                OPAL_OUTPUT_VERBOSE((10, orte_rml_base_framework.framework_output,
                                     "Releasing sent message with tag %d and seq_num %d after receiving Ack from dest ",
                                     msg->tag, msg->seq_num ));
                msg->status = ORTE_SUCCESS;
                ORTE_RML_SEND_COMPLETE(msg);
            } else {
                OPAL_OUTPUT_VERBOSE((10, orte_rml_base_framework.framework_output,
                                     "OOPS received an ACK for already completed seq_num =%d ",
                                     seq_num_array[i] ));
            }
        }
    } else {
        /* handle out of order ACK - complete msgs received in order, retry the lost msg. */
        for (i = 0; i < num_msgs_acked; i++) {
            num_values = 1;
            opal_dss.unpack(buffer, (void*) &seq_num_array[i], &num_values, OPAL_UINT32);
            room_num = orte_qos_ack_channel_get_msg_room (ack_chan, seq_num_array[i]);
            occupant = NULL;
            if (-1 != room_num) {
                opal_hotel_checkout_and_return_occupant(&ack_chan->outstanding_msgs, room_num, &occupant);
            }
            orte_qos_ack_channel_set_msg_room(ack_chan, seq_num_array[i], -1);

            if (NULL == occupant) {
                OPAL_OUTPUT_VERBOSE((10, orte_rml_base_framework.framework_output,
                                     "OOPS received an ACK for already completed seq_num =%d ",
                                     seq_num_array[i] ));
                continue;
            }

            if ((i == 0) || (seq_num_array[i] == seq_num_array[i-1] + 1)) {
                /* acked in sequence - the send is done */
                msg = (orte_rml_send_t*) occupant;
                msg->status = ORTE_SUCCESS;
                ORTE_RML_SEND_COMPLETE(msg);
                continue;
            }

            /* A gap precedes this sequence number (so i > 0 here). */
            assert( i == num_msgs_acked -1);
            /* recheck the ith msg */
            opal_hotel_checkin(&ack_chan->outstanding_msgs, (void*)occupant, &room_num);
            orte_qos_ack_channel_set_msg_room (ack_chan, seq_num_array[i], room_num);
            /* resend and recheck all the missed msgs */
            for (missed_msg_seq_num = seq_num_array[i-1] + 1;
                 missed_msg_seq_num < seq_num_array[i];
                 missed_msg_seq_num++) {
                room_num = orte_qos_ack_channel_get_msg_room (ack_chan, missed_msg_seq_num);
                occupant = NULL;
                if (-1 != room_num) {
                    opal_hotel_checkout_and_return_occupant (&ack_chan->outstanding_msgs, room_num, &occupant);
                }
                assert ( NULL != occupant);
                if (NULL == occupant) {
                    /* with asserts compiled out, skip rather than
                     * dereference a message that is no longer checked in */
                    continue;
                }
                missed_msg = (orte_rml_send_t*) occupant;
                missed_msg->status = ORTE_ERR_LOST_MSG_IN_WINDOW;
                opal_hotel_checkin(&ack_chan->outstanding_msgs, (void*)missed_msg, &room_num);
                orte_qos_ack_channel_set_msg_room (ack_chan, missed_msg_seq_num, room_num);
                /* send this out on wire directly */
                ORTE_OOB_SEND (missed_msg);
            }
        }
    }

    free(seq_num_array);
}
/*
 * A file descriptor is available/ready for send. Check the state
 * of the socket and take the appropriate action.
 *
 * Event callback: cbdata is the mca_oob_tcp_peer_t being serviced; the
 * sd and flags arguments are not read by this function (the socket is
 * taken from peer->sd).
 *
 * Behaviour by peer->state:
 *   - CONNECTING / CLOSED: finish the connect and de-activate the send
 *     event.
 *   - CONNECTED: push the header, then the payload, of peer->send_msg via
 *     send_bytes(); when the message finishes, move the next entry of
 *     peer->send_queue into peer->send_msg; de-activate the send event
 *     once nothing is left to send.
 *   - anything else: report the invalid state and de-activate the send
 *     event.
 */
void mca_oob_tcp_send_handler(int sd, short flags, void *cbdata)
{
    mca_oob_tcp_peer_t* peer = (mca_oob_tcp_peer_t*)cbdata;
    /* snapshot of the message currently "on deck" for this peer */
    mca_oob_tcp_send_t* msg = peer->send_msg;
    int rc;

    opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
                        "%s tcp:send_handler called to send to peer %s",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                        ORTE_NAME_PRINT(&peer->name));

    switch (peer->state) {
    case MCA_OOB_TCP_CONNECTING:
    case MCA_OOB_TCP_CLOSED:
        opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
                            "%s tcp:send_handler %s",
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                            mca_oob_tcp_state_print(peer->state));
        mca_oob_tcp_peer_complete_connect(peer);
        /* de-activate the send event until the connection
         * handshake completes
         */
        if (peer->send_ev_active) {
            opal_event_del(&peer->send_event);
            peer->send_ev_active = false;
        }
        break;
    case MCA_OOB_TCP_CONNECTED:
        opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
                            "%s tcp:send_handler SENDING TO %s",
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                            (NULL == peer->send_msg) ? "NULL" : ORTE_NAME_PRINT(&peer->name));
        if (NULL != msg) {
            /* if the header hasn't been completely sent, send it */
            if (!msg->hdr_sent) {
                OPAL_TIMING_EVENT((&tm,"Send header to %s", ORTE_NAME_PRINT(&peer->name)));
                if (ORTE_SUCCESS == (rc = send_bytes(peer))) {
                    /* header is completely sent */
                    msg->hdr_sent = true;
                    /* setup to send the data: pick the payload source in
                     * priority order - relay data, RML buffer, RML iovec
                     * array, then raw RML data
                     */
                    if (NULL != msg->data) {
                        /* relay msg - send that data */
                        msg->sdptr = msg->data;
                        msg->sdbytes = (int)ntohl(msg->hdr.nbytes);
                    } else if (NULL == msg->msg) {
                        /* this was a zero-byte relay - nothing more to do */
                        OBJ_RELEASE(msg);
                        peer->send_msg = NULL;
                        goto next;
                    } else if (NULL != msg->msg->buffer) {
                        /* send the buffer data as a single block */
                        msg->sdptr = msg->msg->buffer->base_ptr;
                        msg->sdbytes = msg->msg->buffer->bytes_used;
                    } else if (NULL != msg->msg->iov) {
                        /* start with the first iovec */
                        msg->sdptr = msg->msg->iov[0].iov_base;
                        msg->sdbytes = msg->msg->iov[0].iov_len;
                        msg->iovnum = 0;
                    } else {
                        /* just send the data */
                        msg->sdptr = msg->msg->data;
                        msg->sdbytes = msg->msg->count;
                    }
                    /* fall thru and let the send progress */
                } else if (ORTE_ERR_RESOURCE_BUSY == rc ||
                           ORTE_ERR_WOULD_BLOCK == rc) {
                    /* exit this event and let the event lib progress */
                    return;
                } else {
                    // report the error
                    opal_output(0, "%s-%s mca_oob_tcp_peer_send_handler: unable to send header",
                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                ORTE_NAME_PRINT(&(peer->name)));
                    /* NOTE(review): the event is deleted here without
                     * clearing peer->send_ev_active - confirm this is
                     * intended, since "next" may queue another message.
                     */
                    opal_event_del(&peer->send_event);
                    /* NOTE(review): msg->msg is dereferenced without a NULL
                     * check, yet the relay branches above treat
                     * NULL == msg->msg as a valid case - verify relays
                     * cannot reach this path.
                     */
                    msg->msg->status = rc;
                    ORTE_RML_SEND_COMPLETE(msg->msg);
                    OBJ_RELEASE(msg);
                    peer->send_msg = NULL;
                    goto next;
                }
            }
            /* progress the data transmission */
            if (msg->hdr_sent) {
                OPAL_TIMING_EVENT((&tm,"Send msg to %s", ORTE_NAME_PRINT(&peer->name)));
                if (ORTE_SUCCESS == (rc = send_bytes(peer))) {
                    /* this block is complete */
                    if (NULL != msg->data || NULL == msg->msg) {
                        /* the relay is complete - release the data */
                        opal_output_verbose(2, orte_oob_base_framework.framework_output,
                                            "%s MESSAGE RELAY COMPLETE TO %s OF %d BYTES ON SOCKET %d",
                                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                            ORTE_NAME_PRINT(&(peer->name)),
                                            (int)ntohl(msg->hdr.nbytes), peer->sd);
                        OBJ_RELEASE(msg);
                        peer->send_msg = NULL;
                    } else if (NULL != msg->msg->buffer) {
                        /* we are done - notify the RML */
                        opal_output_verbose(2, orte_oob_base_framework.framework_output,
                                            "%s MESSAGE SEND COMPLETE TO %s OF %d BYTES ON SOCKET %d",
                                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                            ORTE_NAME_PRINT(&(peer->name)),
                                            (int)ntohl(msg->hdr.nbytes), peer->sd);
                        msg->msg->status = ORTE_SUCCESS;
                        ORTE_RML_SEND_COMPLETE(msg->msg);
                        OBJ_RELEASE(msg);
                        peer->send_msg = NULL;
                    } else if (NULL != msg->msg->data) {
                        /* this was a relay we have now completed - no need to
                         * notify the RML as the local proc didn't initiate
                         * the send
                         */
                        opal_output_verbose(2, orte_oob_base_framework.framework_output,
                                            "%s MESSAGE RELAY COMPLETE TO %s OF %d BYTES ON SOCKET %d",
                                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                            ORTE_NAME_PRINT(&(peer->name)),
                                            (int)ntohl(msg->hdr.nbytes), peer->sd);
                        msg->msg->status = ORTE_SUCCESS;
                        OBJ_RELEASE(msg);
                        peer->send_msg = NULL;
                    } else {
                        /* rotate to the next iovec */
                        msg->iovnum++;
                        if (msg->iovnum < msg->msg->count) {
                            msg->sdptr = msg->msg->iov[msg->iovnum].iov_base;
                            msg->sdbytes = msg->msg->iov[msg->iovnum].iov_len;
                            /* exit this event to give the event lib
                             * a chance to progress any other pending
                             * actions
                             */
                            return;
                        } else {
                            /* this message is complete - notify the RML */
                            opal_output_verbose(2, orte_oob_base_framework.framework_output,
                                                "%s MESSAGE SEND COMPLETE TO %s OF %d BYTES ON SOCKET %d",
                                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                                ORTE_NAME_PRINT(&(peer->name)),
                                                (int)ntohl(msg->hdr.nbytes), peer->sd);
                            msg->msg->status = ORTE_SUCCESS;
                            ORTE_RML_SEND_COMPLETE(msg->msg);
                            OBJ_RELEASE(msg);
                            peer->send_msg = NULL;
                        }
                    }
                    /* fall thru to queue the next message */
                } else if (ORTE_ERR_RESOURCE_BUSY == rc ||
                           ORTE_ERR_WOULD_BLOCK == rc) {
                    /* exit this event and let the event lib progress */
                    return;
                } else {
                    // report the error
                    opal_output(0, "%s-%s mca_oob_tcp_peer_send_handler: unable to send message ON SOCKET %d",
                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                ORTE_NAME_PRINT(&(peer->name)), peer->sd);
                    opal_event_del(&peer->send_event);
                    msg->msg->status = rc;
                    ORTE_RML_SEND_COMPLETE(msg->msg);
                    OBJ_RELEASE(msg);
                    peer->send_msg = NULL;
                    /* unlike the header failure above, a payload failure
                     * requests termination and returns without queuing the
                     * next message
                     */
                    ORTE_FORCED_TERMINATE(1);
                    return;
                }
            }

        next:
            /* if current message completed - progress any pending sends by
             * moving the next in the queue into the "on-deck" position. Note
             * that this doesn't mean we send the message right now - we will
             * wait for another send_event to fire before doing so. This gives
             * us a chance to service any pending recvs.
             */
            peer->send_msg = (mca_oob_tcp_send_t*)
                opal_list_remove_first(&peer->send_queue);
        }
        
        /* if nothing else to do unregister for send event notifications */
        if (NULL == peer->send_msg && peer->send_ev_active) {
            opal_event_del(&peer->send_event);
            peer->send_ev_active = false;
        }
        break;
    default:
        /* any other state is unexpected here: report it and stop
         * listening for send readiness
         */
        opal_output(0, "%s-%s mca_oob_tcp_peer_send_handler: invalid connection state (%d) on socket %d",
                    ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                    ORTE_NAME_PRINT(&(peer->name)),
                    peer->state, peer->sd);
        if (peer->send_ev_active) {
            opal_event_del(&peer->send_event);
            peer->send_ev_active = false;
        }
        break;
    }
}
Exemple #4
0
/**
 * Deliver a message addressed to this process without using the wire.
 *
 * A receive request is obtained for the message's tag, storage for it is
 * allocated, and the payload is copied into it: segment by segment when the
 * message carries an iovec array, or by copying and unloading the message's
 * opal_buffer_t otherwise.  The request is then marked complete and queued
 * for its receive callback, and the send is completed.
 *
 * @param msg  the RML send to loop back
 * @return     the payload size in bytes on success, otherwise the error code
 *             of the step that failed
 */
static int mca_oob_ud_send_self (orte_rml_send_t *msg)
{
    unsigned int srco, dsto;
    mca_oob_ud_req_t *req;
    int srci, dsti;
    int rc, size;

    MCA_OOB_UD_IOV_SIZE(msg, size);

    opal_output_verbose(10, orte_oob_base_framework.framework_output,
                         "%s mca_oob_ud_send_self: sending %d bytes to myself",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), size);

    rc = mca_oob_ud_get_recv_req (*ORTE_PROC_MY_NAME, msg->tag, &req, (msg->iov != NULL) ? true : false);
    if (ORTE_SUCCESS != rc) {
        return rc;
    }

    req->req_rem_data_len = size;
    req->req_is_eager     = true;

    rc = mca_oob_ud_recv_alloc (req);
    if (ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
        if (MCA_OOB_UD_REQ_IOV == req->req_data_type) {
            free (req->req_data.iov.uiov);
        }
        OBJ_RELEASE(req);
        return rc;
    }

    /* srci/srco: current source segment and byte offset within it;
     * dsti/dsto: the same for the destination */
    srci = dsti = 0;
    srco = dsto = 0;

    if (msg->iov != NULL) {
        req->req_data_type = MCA_OOB_UD_REQ_IOV;

        /* Each index is bounded by the length of the array it indexes:
         * srci walks msg->iov (msg->count entries) and dsti walks the
         * request's uiov (req->req_data.iov.count entries).  The original
         * had the two bounds swapped, which truncated or overran the copy
         * whenever the counts differed.  A plain while loop also avoids
         * touching entry 0 when either array is empty. */
        while (srci < msg->count && dsti < req->req_data.iov.count) {
            /* copy as much as fits in what remains of both segments */
            size_t copy = min(msg->iov[srci].iov_len - srco,
                              req->req_data.iov.uiov[dsti].iov_len - dsto);

            memmove ((unsigned char *) req->req_data.iov.uiov[dsti].iov_base + dsto,
                     (unsigned char *) msg->iov[srci].iov_base + srco, copy);

            /* advance to the next source segment once this one is drained */
            srco += copy;
            if (srco == msg->iov[srci].iov_len) {
                srci++;
                srco = 0;
            }

            /* advance to the next destination segment once this one is full */
            dsto += copy;
            if (dsto == req->req_data.iov.uiov[dsti].iov_len) {
                dsti++;
                dsto = 0;
            }
        }
    } else {
        req->req_data_type = MCA_OOB_UD_REQ_BUF;

        opal_buffer_t *buffer;
        buffer = OBJ_NEW(opal_buffer_t);

        /* NOTE(review): the two error returns below leave req allocated,
         * unlike the recv_alloc failure path above - confirm who owns req
         * at this point before adding a release here. */
        if (OPAL_SUCCESS != (rc = opal_dss.copy_payload(buffer, msg->buffer))) {
            ORTE_ERROR_LOG(rc);
            OBJ_RELEASE(buffer);
            return rc;
        }
        if (OPAL_SUCCESS != (rc = opal_dss.unload(buffer, (void **)&req->req_data.buf.p, &req->req_data.buf.size)))
        {
            ORTE_ERROR_LOG(rc);
            OBJ_RELEASE(buffer);
            free(req->req_data.buf.p);
            return rc;
        }
        OBJ_RELEASE(buffer);
    }

    req->state = MCA_OOB_UD_REQ_COMPLETE;

    opal_output_verbose(10, orte_oob_base_framework.framework_output,
                         "%s mca_oob_ud_send_self: complete. calling callbacks",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));

    /* queue up recv callback */
    mca_oob_ud_event_queue_completed (req);

    /* NOTE(review): completion goes through req->rml_msg, which this
     * function never assigns - presumably it refers to msg; verify. */
    req->rml_msg->status = ORTE_SUCCESS;

    ORTE_RML_SEND_COMPLETE(req->rml_msg);

    return size;
}
Exemple #5
0
void mca_oob_ud_req_complete (mca_oob_ud_req_t *req, int rc)
{
    int i;
    opal_output_verbose(10, orte_oob_base_framework.framework_output,
                         "%s oob:ud:req_complete %s request %p completed with status %d",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (req->type == MCA_OOB_UD_REQ_SEND) ? "SEND":"RECV", (void *) req, rc);

    if (NULL != req->req_qp) {
        (void) mca_oob_ud_qp_data_release (req->req_qp);
        req->req_qp = NULL;
    }

    /* deregister memory *before* handing it to the callback */
    MCA_OOB_UD_REQ_DEREG_MR(req);

    switch (req->type) {
    case MCA_OOB_UD_REQ_SEND:
        if (req->req_data_type != MCA_OOB_UD_REQ_TR) {
            req->rml_msg->status = rc;
            if( NULL == req->rml_msg->channel) {
                ORTE_RML_SEND_COMPLETE(req->rml_msg);
            } else {
                ORTE_QOS_SEND_COMPLETE(req->rml_msg);
            }
        }
        break;
    case MCA_OOB_UD_REQ_RECV:
        if ((req->req_target.jobid == ORTE_PROC_MY_NAME->jobid) &&
            (req->req_target.vpid == ORTE_PROC_MY_NAME->vpid)) {
            opal_output_verbose(1, orte_oob_base_framework.framework_output,
                "%s DELIVERING TO RML",
                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
            if (MCA_OOB_UD_REQ_IOV == req->req_data_type) {
                char *data = (char *)calloc(req->req_data.iov.count, sizeof(struct iovec));
                int datalen = 0;
                for (i = 0 ; i < req->req_data.iov.count; ++i) {
                    memcpy (&data[datalen], req->req_data.iov.uiov[i].iov_base, req->req_data.iov.uiov[i].iov_len);
                    datalen += req->req_data.iov.uiov[i].iov_len;
                }
                ORTE_RML_POST_MESSAGE(&req->req_origin, req->req_tag, req->req_channel, req->req_seq_num, data, datalen, 0);
                free(data);
            } else {
                ORTE_RML_POST_MESSAGE(&req->req_origin, req->req_tag, req->req_channel, req->req_seq_num,
                                      req->req_data.buf.p, req->req_data.buf.size, 0);
            }
        } else {
            opal_output_verbose(1, orte_oob_base_framework.framework_output,
                                "%s UD PROMOTING ROUTED MESSAGE FOR %s TO OOB",
                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                ORTE_NAME_PRINT(&req->req_target));

            orte_rml_send_t *snd = OBJ_NEW(orte_rml_send_t);
            snd->dst = req->req_target;
            snd->origin =  req->req_origin;
            snd->tag = req->req_tag;
            snd->dst_channel = req->req_channel;
            snd->seq_num = req->req_seq_num;
            if (MCA_OOB_UD_REQ_IOV == req->req_data_type) {
                char *data = (char *)calloc(req->req_data.iov.count, sizeof(struct iovec));
                int datalen = 0;
                for (i = 0 ; i < req->req_data.iov.count; ++i) {
                    memcpy (&data[datalen], req->req_data.iov.uiov[i].iov_base, req->req_data.iov.uiov[i].iov_len);
                    datalen += req->req_data.iov.uiov[i].iov_len;
                }
                snd->data = data;
                snd->count = datalen;
            } else {
                char *data = (char *)calloc(req->req_data.buf.size, sizeof(char));
                memcpy (data, req->req_data.buf.p, req->req_data.buf.size);
                snd->data = data;
                snd->count = req->req_data.buf.size;
            }
            snd->cbfunc.iov = NULL;
            snd->cbdata = NULL;
            /* activate the OOB send state */
            ORTE_OOB_SEND(snd);
        }
        break;
    default:
        break;
    }

    mca_oob_ud_req_return (req);
}