示例#1
0
/*
 * Receive callback for a peer's "close channel" request.
 *
 * Unpacks one OPAL_UINT32 channel number from the buffer, looks the channel
 * up, and if it exists closes its QoS channel, clears its slot in
 * orte_rml_base.open_channels and releases the channel object.  An unpack
 * failure or an unknown channel number is reported through ORTE_ERROR_LOG.
 *
 * status, tag and cbdata are not used by this function.
 */
void orte_rml_close_channel_recv_callback (int status,
        orte_process_name_t* peer,
        struct opal_buffer_t* buffer,
        orte_rml_tag_t tag,
        void* cbdata)
{
    /* the initial value is a placeholder; unpack overwrites it on success */
    orte_rml_channel_num_t channel_num = 5;
    orte_rml_channel_t *target;
    int32_t num_vals = 1;
    int32_t ret;

    OPAL_OUTPUT_VERBOSE((1, orte_rml_base_framework.framework_output,
                         "%s rml_close_channel_recv_callback from peer %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_NAME_PRINT(peer)));

    /* the message carries exactly one value: the channel number */
    ret = opal_dss.unpack(buffer, &channel_num, &num_vals, OPAL_UINT32);
    if (ORTE_SUCCESS != ret) {
        ORTE_ERROR_LOG(ret);
        return;
    }

    target = orte_rml_base_get_channel(channel_num);
    OPAL_OUTPUT_VERBOSE((1, orte_rml_base_framework.framework_output,
                         "%s rml_close_channel_recv_callback for channel num =%d channel=%p",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         channel_num, (void*)target));

    /* nothing to close when the peer names a channel we do not know */
    if (NULL == target) {
        ORTE_ERROR_LOG(OPAL_ERR_BAD_PARAM);
        return;
    }

    /* close the QoS side first, then drop the channel from the open list */
    orte_qos_close_channel(target->qos, target->qos_channel_ptr);
    opal_pointer_array_set_item(&orte_rml_base.open_channels, target->channel_num, NULL);
    OBJ_RELEASE(target);
}
示例#2
0
void orte_qos_ack_recv_msg_timeout_callback (struct opal_hotel_t *hotel,
                                             int room_num, void *occupant)
{
#if OPAL_ENABLE_DEBUG
    orte_rml_recv_t *msg = (orte_rml_recv_t *) occupant;
#endif
#if 0
    orte_qos_ack_channel_t *ack_chan;
    orte_rml_channel_t *channel;

    channel = orte_rml_base_get_channel(msg->channel_num);
    ack_chan = (orte_qos_ack_channel_t*) channel->qos_channel_ptr;
#endif

    OPAL_OUTPUT_VERBOSE((1, orte_qos_base_framework.framework_output,
                         "%s OOPS received msg = %p seq num =%d timed out on ACK Queue\n",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         (void*)msg, msg->seq_num));
    /* Need to determine correct action here as the sender hasn't responded yet to
       a lost msg event */
    /* This is highly unlikely - lets assert to enable debug*/
    assert(0);
    /*
    // set room num to -1 for the msg's seq number
    ack_chan->seq_num_to_room_num[msg->seq_num % QOS_ACK_MAX_OUTSTANDING_MSGS] = -1;
    // complete the msg
    ORTE_RML_REACTIVATE_MESSAGE(msg);*/
}
示例#3
0
void orte_qos_ack_channel_process_ack (int status, orte_process_name_t* sender,
                                       opal_buffer_t *buffer,
                                       orte_rml_tag_t tag, void *cbdata)
{
    /*  process ack received for the msg */
    uint32_t num_msgs_acked, channel_num, i;
    int32_t num_values, room_num;
    orte_rml_send_t *msg, *missed_msg;
    void *occupant = NULL;
    orte_rml_channel_t *channel;
    orte_qos_ack_channel_t *ack_chan;
    uint32_t *seq_num_array;
    uint32_t ack_type;
    uint32_t missed_msg_seq_num = 0;
    num_values = 1;
    /* unpack channel number first */
    opal_dss.unpack(buffer, (void*) &channel_num, &num_values, OPAL_UINT32);
    OPAL_OUTPUT_VERBOSE((5, orte_qos_base_framework.framework_output,
                         "orte_qos_ack_channel_process_ack recieved ack on channel = %d",
                         channel_num));
    channel = orte_rml_base_get_channel (channel_num);
    if ((NULL != channel) || (NULL != channel->qos_channel_ptr)) {
        ack_chan = (orte_qos_ack_channel_t *) (channel->qos_channel_ptr);
        seq_num_array = malloc (sizeof(uint32_t) * ack_chan->window);
        num_values = 1;
        /* unpack ack type */
        opal_dss.unpack(buffer, (void*) &ack_type, &num_values, OPAL_UINT32);
        num_values = 1;
        /* unpack num messages acked */
        opal_dss.unpack(buffer, (void*) &num_msgs_acked, &num_values, OPAL_UINT32);
        OPAL_OUTPUT_VERBOSE((5, orte_qos_base_framework.framework_output,
                             "orte_qos_ack_channel_process_ack recieved ack type %d for %d msgs on channel = %d",
                             ack_type, num_msgs_acked, channel_num));
        if (ACK_OUT_OF_ORDER != ack_type)   {
            //handle normal ACK
            for (i = 0; i < num_msgs_acked; i++)
                {
                    opal_dss.unpack(buffer, (void*) &seq_num_array[i], &num_values, OPAL_UINT32);
                    room_num = orte_qos_ack_channel_get_msg_room (ack_chan, seq_num_array[i]);
                    opal_hotel_checkout_and_return_occupant(&ack_chan->outstanding_msgs, room_num, &occupant);
                    orte_qos_ack_channel_set_msg_room(ack_chan, seq_num_array[i], -1);
                    if((occupant != NULL) && (room_num != -1)) {
                        msg = (orte_rml_send_t*) occupant;
                        OPAL_OUTPUT_VERBOSE((10, orte_rml_base_framework.framework_output,
                                             "Releasing sent message with tag %d and seq_num %d after receiving Ack from dest ",
                                             msg->tag, msg->seq_num ));
                        msg->status = ORTE_SUCCESS;
                        ORTE_RML_SEND_COMPLETE(msg);
                    } else {
                        OPAL_OUTPUT_VERBOSE((10, orte_rml_base_framework.framework_output,
                                             "OOPS received an ACK for already completed seq_num =%d ",
                                             seq_num_array[i] ));
                    }
                }
        } else {
            // handle out of order ACK - complete msgs received in order, retry the lost msg.
            for (i = 0; i < num_msgs_acked; i++)
                {
                    opal_dss.unpack(buffer, (void*) &seq_num_array[i], &num_values, OPAL_UINT32);
                    room_num = orte_qos_ack_channel_get_msg_room (ack_chan, seq_num_array[i]);
                    opal_hotel_checkout_and_return_occupant(&ack_chan->outstanding_msgs, room_num, &occupant);
                    orte_qos_ack_channel_set_msg_room(ack_chan, seq_num_array[i], -1);
                    if ((NULL != occupant) && ((i == 0 )|| (seq_num_array[i] == seq_num_array[i-1] +1 ))) {
                        msg = (orte_rml_send_t*) occupant;
                        msg->status = ORTE_SUCCESS;
                        ORTE_RML_SEND_COMPLETE(msg);
                    } else {
                        if (NULL != occupant) {
                            // num_missed_msgs = (seq_num_array[i] - seq_num_array [i-1] - 1);
                            assert( i == num_msgs_acked -1);
                            /* recheck the ith msg */
                            opal_hotel_checkin(&ack_chan->outstanding_msgs, (void*)occupant, &room_num);
                            orte_qos_ack_channel_set_msg_room (ack_chan, seq_num_array[i], room_num);
                            /* resend and recheck all the missed msgs*/
                            missed_msg_seq_num = seq_num_array[i-1] + 1;
                            for (; missed_msg_seq_num < seq_num_array[i]; missed_msg_seq_num++) {
                                room_num = orte_qos_ack_channel_get_msg_room (ack_chan, missed_msg_seq_num);
                                opal_hotel_checkout_and_return_occupant (&ack_chan->outstanding_msgs, room_num, &occupant);
                                assert ( NULL != occupant);
                                missed_msg = (orte_rml_send_t*) occupant;
                                missed_msg->status = ORTE_ERR_LOST_MSG_IN_WINDOW;
                                opal_hotel_checkin(&ack_chan->outstanding_msgs, (void*)missed_msg, &room_num);
                                orte_qos_ack_channel_set_msg_room (ack_chan, missed_msg_seq_num, room_num);
                                /* send this out on wire directly */
                                ORTE_OOB_SEND (missed_msg);
                            } //end for
                        } else {
                            OPAL_OUTPUT_VERBOSE((10, orte_rml_base_framework.framework_output,
                                                 "OOPS received an ACK for already completed seq_num =%d ",
                                                 seq_num_array[i] ));
                        }//end  if (NULL != occupant)
                    } //end else
                } // end for
        }//end out of order ack processing
        free(seq_num_array);
    }else {
        OPAL_OUTPUT_VERBOSE((5, orte_qos_base_framework.framework_output,
                             "orte_qos_ack_channel_msg_ack_recv_callback recieved ack on non existent channel = %d",
                             channel_num));
    }
}
示例#4
0
/*
 * Build and send an ACK message to the peer of the given RML channel.
 *
 * The ACK covers in_msg_seq_num - ack_msg_seq_num + 1 sequence numbers:
 * ack_msg_seq_num + 1 onwards, with the final entry replaced by
 * last_msg_seq_num.  The buffer carries, each as OPAL_UINT32: the peer's
 * channel number, the ack type, the number of sequence numbers, then the
 * sequence numbers themselves.
 *
 * NOTE(review): assumes in_msg_seq_num >= ack_msg_seq_num so that the count
 * is at least 1 -- confirm against callers.
 *
 * ack_chan          ack channel state; ack_msg_seq_num is advanced to
 *                   last_msg_seq_num when the send is accepted
 * channel_num       local RML channel number used to find the peer
 * ack_type          ack type value placed in the message
 * last_msg_seq_num  sequence number stored in the last slot of the ACK
 *
 * Returns the result of orte_rml.send_buffer_nb on the normal path,
 * ORTE_ERR_BAD_PARAM when the channel does not exist,
 * ORTE_ERR_TEMP_OUT_OF_RESOURCE when an allocation fails, or the pack error.
 */
static inline int send_ack (orte_qos_ack_channel_t * ack_chan,
                             orte_rml_channel_num_t channel_num,
                             uint32_t ack_type, uint32_t last_msg_seq_num)
{
    int rc;
    orte_rml_channel_t *rml_channel;
    opal_buffer_t *buffer = NULL;
    uint32_t num_msgs_to_ack = 0;
    uint32_t *ack_seq_num_array = NULL;
    uint32_t i;

    rml_channel = orte_rml_base_get_channel (channel_num);
    if (NULL == rml_channel) {
        /* the peer and peer_channel fields below need a live channel */
        OPAL_OUTPUT_VERBOSE((1, orte_qos_base_framework.framework_output,
                             "%s ack_recv cannot send ack on non existent channel = %d\n",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             channel_num));
        return ORTE_ERR_BAD_PARAM;
    }

    num_msgs_to_ack = ack_chan->in_msg_seq_num - ack_chan->ack_msg_seq_num + 1;
    OPAL_OUTPUT_VERBOSE((1, orte_qos_base_framework.framework_output,
                         "%s sending ack type = %d \n",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ack_type));

    ack_seq_num_array = malloc (sizeof(*ack_seq_num_array) * num_msgs_to_ack);
    if (NULL == ack_seq_num_array) {
        OPAL_OUTPUT_VERBOSE((1, orte_qos_base_framework.framework_output,
                             "%s ack_recv cannot allocate ack array to send ack to peer = %s\n",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(&rml_channel->peer)));
        return ORTE_ERR_TEMP_OUT_OF_RESOURCE;
    }

    for (i = 1; i <= num_msgs_to_ack ; i++) {
        ack_seq_num_array[i-1] = ack_chan->ack_msg_seq_num + i;
        OPAL_OUTPUT_VERBOSE((1, orte_qos_base_framework.framework_output,
                             "%s ack_recv acking msg %d to peer = %s\n",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ack_seq_num_array[i-1],
                             ORTE_NAME_PRINT(&rml_channel->peer)));
    }
    /* the final slot always reports the caller-supplied sequence number */
    ack_seq_num_array[num_msgs_to_ack - 1] = last_msg_seq_num;
    OPAL_OUTPUT_VERBOSE((1, orte_qos_base_framework.framework_output,
                         "%s ack_recv acking last msg %d to peer = %s\n",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ack_seq_num_array[num_msgs_to_ack - 1],
                         ORTE_NAME_PRINT(&rml_channel->peer)));

    buffer = OBJ_NEW (opal_buffer_t);
    if (NULL == buffer) {
        rc = ORTE_ERR_TEMP_OUT_OF_RESOURCE;
        goto cleanup;
    }

    /* pack channel number */
    if (ORTE_SUCCESS != (rc = opal_dss.pack (buffer, &rml_channel->peer_channel, 1, OPAL_UINT32))) {
        goto cleanup;
    }
    /* pack ack type */
    if (ORTE_SUCCESS != (rc = opal_dss.pack (buffer, &ack_type, 1, OPAL_UINT32))) {
        goto cleanup;
    }
    /* pack num messages */
    if (ORTE_SUCCESS != (rc = opal_dss.pack (buffer, &num_msgs_to_ack, 1, OPAL_UINT32))) {
        goto cleanup;
    }
    /* pack seq number array */
    for (i = 0; i < num_msgs_to_ack; i++) {
        if (ORTE_SUCCESS != (rc = opal_dss.pack (buffer, &ack_seq_num_array[i], 1 , OPAL_UINT32))) {
            goto cleanup;
        }
    }

    rc = orte_rml.send_buffer_nb  (&rml_channel->peer, buffer, ORTE_RML_TAG_MSG_ACK,
                                   orte_qos_ack_msg_send_callback, rml_channel);
    if (ORTE_SUCCESS == rc) {
        /* update last acked msg */
        ack_chan->ack_msg_seq_num = last_msg_seq_num;
        /* the send was accepted, so the buffer is no longer ours to release
         * NOTE(review): presumably the send callback releases it -- confirm */
        buffer = NULL;
    }

cleanup:
    /* a buffer that was never handed to the RML is still owned here */
    if (NULL != buffer) {
        OBJ_RELEASE(buffer);
    }
    /* the sequence numbers were copied into the buffer by pack */
    free(ack_seq_num_array);
    return rc;
}