Пример #1
0
/**
 * Stop a peer's timer if it is currently marked active.
 *
 * Clears the peer's timer "active" flag and then removes the timer event.
 * Does nothing when the flag is already false.
 *
 * @param peer  peer whose timer should be stopped
 */
void mca_oob_ud_peer_stop_timer (mca_oob_ud_peer_t *peer)
{
    /* nothing armed - nothing to do */
    if (!peer->peer_timer.active) {
        return;
    }

    /* mark the timer inactive before pulling the event */
    peer->peer_timer.active = false;
    opal_event_evtimer_del (&peer->peer_timer.event);
}
Пример #2
0
/**
 * Send-completion callback: cancel the pending timeout timer (if one is
 * armed) and flag the operation as finished.
 *
 * None of the callback arguments are used; only the file-scope
 * quicktime / timer_fired state is touched.
 */
static void send_cbfunc(int status, orte_process_name_t* sender,
                        opal_buffer_t* buffer, orte_rml_tag_t tag,
                        void* cbdata)
{
    if (NULL == quicktime) {
        /* no timer outstanding - just declare the work done */
        timer_fired = true;
        return;
    }

    /* cancel the timer and drop its storage */
    opal_event_evtimer_del(quicktime);
    free(quicktime);
    quicktime = NULL;

    /* declare the work done */
    timer_fired = true;
}
Пример #3
0
/**
 * Receive callback: cancel the pending timeout timer (if one is armed),
 * copy the received payload into the file-scope `answer` buffer and flag
 * the operation as finished.
 *
 * A failed payload copy is logged but does not prevent completion from
 * being signalled.
 *
 * @param buffer  received buffer whose payload is copied into `answer`
 *                (status, sender, tag and cbdata are not used)
 */
static void recv_info(int status, orte_process_name_t* sender,
                      opal_buffer_t* buffer, orte_rml_tag_t tag,
                      void* cbdata)
{
    int copy_rc;

    /* cancel the timer */
    if (NULL != quicktime) {
        opal_event_evtimer_del(quicktime);
        free(quicktime);
        quicktime = NULL;
    }

    /* xfer the answer */
    copy_rc = opal_dss.copy_payload(&answer, buffer);
    if (ORTE_SUCCESS != copy_rc) {
        ORTE_ERROR_LOG(copy_rc);
    }

    /* declare the work done */
    timer_fired = true;
}
Пример #4
0
/**
 * Process a message received on an ACK QoS channel.
 *
 * If the message is the next expected one (and has not been ACKed yet) the
 * channel's in-order sequence number is advanced; an ACK is sent when the
 * message closes the current window, and the window ACK timer is started
 * when the message opens a new window. Any other message is handed to
 * process_out_of_order_msg().
 *
 * @param qos_channel  channel object, treated as an orte_qos_ack_channel_t
 * @param msg          the received message
 * @return the result of send_ack() when the window completes, ORTE_SUCCESS
 *         for an in-order message inside the window, otherwise whatever
 *         process_out_of_order_msg() returns
 */
static int ack_recv (void *qos_channel, orte_rml_recv_t *msg) {
    orte_qos_ack_channel_t *ack_chan = (orte_qos_ack_channel_t*) qos_channel;
    struct timeval ack_timeout;
    int32_t rc;

    OPAL_OUTPUT_VERBOSE((1, orte_qos_base_framework.framework_output,
                         "%s ack_recv msg = %p seq_num = %d from peer = %s\n",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         (void*)msg, msg->seq_num,
                         ORTE_NAME_PRINT(&msg->sender)));

    /* anything that is not the next expected, not-yet-ACKed msg goes
     * down the out-of-order path */
    if ((ack_chan->in_msg_seq_num + 1 != msg->seq_num) ||
        (ack_chan->ack_msg_seq_num >= msg->seq_num)) {
        return process_out_of_order_msg(ack_chan, msg);
    }

    if ((msg->seq_num - ack_chan->ack_msg_seq_num) == ack_chan->window) {
        /* end of the window: stop the window ack timer and ACK it */
        opal_event_evtimer_del (&ack_chan->msg_ack_timer_event);
        rc = send_ack (ack_chan, msg->channel_num, ACK_WINDOW_COMPLETE, msg->seq_num);
    } else {
        if (ack_chan->ack_msg_seq_num == ack_chan->in_msg_seq_num) {
            /* beginning of a window: start the window ack timer */
            ack_timeout.tv_sec = ack_chan->timeout_secs;
            ack_timeout.tv_usec = 0;
            opal_event_evtimer_add (&ack_chan->msg_ack_timer_event, &ack_timeout);
        }
        rc = ORTE_SUCCESS;
    }

    /* this msg is now the latest one received in order */
    ack_chan->in_msg_seq_num = msg->seq_num;
    return rc;
}
Пример #5
0
/**
 * Shut down the show_help subsystem.
 *
 * Does nothing unless the subsystem is marked ready. Otherwise it clears
 * the ready flag and restores the saved opal_show_help pointer. On the HNP
 * it additionally shows accumulated duplicate messages, destructs the tuple
 * list, removes the show_help timer event if it was set, and cancels the
 * show_help receive.
 */
void orte_show_help_finalize(void)
{
    if (!ready) {
        return;
    }
    ready = false;

    /* put back the show_help function that was saved earlier */
    opal_show_help = save_help;
    save_help = NULL;

    /* only the HNP has anything further to tear down */
    if (!ORTE_PROC_IS_HNP) {
        return;
    }

    /* Shutdown show_help, showing final messages */
    show_accumulated_duplicates(0, 0, NULL);
    OBJ_DESTRUCT(&abd_tuples);
    if (show_help_timer_set) {
        opal_event_evtimer_del(&show_help_timer_event);
    }

    /* cancel the recv */
    orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_SHOW_HELP);
}
Пример #6
0
/**
 * Handle a message that is not the next in-order message on an ACK channel.
 *
 * A message whose sequence number already has a room recorded is treated as
 * a duplicate. Otherwise the message is checked into the channel's
 * outstanding-message hotel and its room is recorded. Then either:
 *  - an out-of-order ACK is sent and the window ACK timer is stopped (when
 *    ack_msg_seq_num <= in_msg_seq_num), or
 *  - if the message fills a gap below ack_msg_seq_num and every message in
 *    that gap is now present, the parked messages are checked out in
 *    sequence order, completed to the RML, and an ACK_RECV_MISSED_MSG ACK
 *    is sent.
 *
 * @param ack_chan  channel the message arrived on
 * @param msg       the out-of-order message
 * @return ORTE_ERR_DUPLICATE_MSG for a duplicate, the opal_hotel_checkin()
 *         error code if check-in fails, otherwise ORTE_ERR_OUT_OF_ORDER_MSG
 */
static inline int process_out_of_order_msg ( orte_qos_ack_channel_t *ack_chan,
        orte_rml_recv_t *msg)
{
    int32_t rc, room_num, first_lost_msg_seq_num, num_lost_msgs, i;
    orte_rml_recv_t *out_msg;
    void *occupant = NULL;
    OPAL_OUTPUT_VERBOSE((1, orte_qos_base_framework.framework_output,
                         "%s process_out_of_order_msg msg %d",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         msg->seq_num));
    /* if this msg is a duplicate - then do nothing */
    if ((orte_qos_ack_channel_get_msg_room(ack_chan, msg->seq_num)) != -1) {
        OPAL_OUTPUT_VERBOSE((1, orte_qos_base_framework.framework_output,
                             "%s process_out_of_order_msg msg %d",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             msg->seq_num));
        rc = ORTE_ERR_DUPLICATE_MSG;
    }
    else {
        if (OPAL_SUCCESS != (rc = opal_hotel_checkin(&ack_chan->outstanding_msgs, (void*)msg, &room_num))) {
            return rc;
        }
        OPAL_OUTPUT_VERBOSE((1, orte_qos_base_framework.framework_output,
                             "process_out_of_order_msg checked in msg %d in room %d\n",
                              msg->seq_num, room_num));
        orte_qos_ack_channel_set_msg_room (ack_chan, msg->seq_num, room_num);
        rc = ORTE_ERR_OUT_OF_ORDER_MSG;
        /*  check if we need to send an ACK */
        if (ack_chan->ack_msg_seq_num <= ack_chan->in_msg_seq_num) {
            OPAL_OUTPUT_VERBOSE((1, orte_qos_base_framework.framework_output,
                                 "%s process_out_of_order_msg sending ack last seq_num = %d\n",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 msg->seq_num));
            /* send ACK. */
            send_ack (ack_chan, msg->channel_num, ACK_OUT_OF_ORDER, msg->seq_num);
            /* stop window ack timer */
            opal_event_evtimer_del (&ack_chan->msg_ack_timer_event);
        }
        else {
            /* if we got a lost msg - any seq num between in_msg_seq_num and ack_seq_num*/
            if (ack_chan->ack_msg_seq_num > msg->seq_num) {
                /* check if we have got all lost msgs */
                first_lost_msg_seq_num = ack_chan->in_msg_seq_num + 1;
                num_lost_msgs = ack_chan->ack_msg_seq_num - ack_chan->in_msg_seq_num;
                OPAL_OUTPUT_VERBOSE((1, orte_qos_base_framework.framework_output,
                                     "%s process_out_of_order_msg msg %d first_lost_msg =%d num_lost_msgs =%d",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                     msg->seq_num, first_lost_msg_seq_num, num_lost_msgs));
                /* stop at the first sequence number in the gap that has no room yet */
                for (i =0 ; i < num_lost_msgs; i++) {
                    if ((orte_qos_ack_channel_get_msg_room(ack_chan, first_lost_msg_seq_num +i)) == -1) {
                        break;
                    }
                }
                if (i == num_lost_msgs) {

                    /* we got all the lost msgs so we can complete all the msgs in the hotel now */
                    /* reset ack_seq_num */
                    ack_chan->ack_msg_seq_num = first_lost_msg_seq_num -1;
                    room_num = 0;
                    /* walk forward in sequence order until a seq num with no room is hit */
                    for ( i = 0; room_num != -1; i++) {
                        OPAL_OUTPUT_VERBOSE((1, orte_qos_base_framework.framework_output,
                                             "%s process_out_of_order_msg got all lost msgs  completing outstanding msgs %d",
                                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                             (first_lost_msg_seq_num + i)));
                        /* evict msg and complete it */
                        room_num = orte_qos_ack_channel_get_msg_room (ack_chan, first_lost_msg_seq_num +i);
                        /* -1 is the "no room" marker used throughout this function, not a
                         * room index, so it must never be handed to the hotel. Reset the
                         * occupant as well so a pointer from the previous iteration can
                         * never be mistaken for this iteration's message. */
                        occupant = NULL;
                        if (room_num != -1) {
                            opal_hotel_checkout_and_return_occupant(&ack_chan->outstanding_msgs, room_num, &occupant);
                        }
                        orte_qos_ack_channel_set_msg_room(ack_chan, first_lost_msg_seq_num +i, -1);
                        out_msg = (orte_rml_recv_t *) occupant;
                        if ((NULL != out_msg) && (room_num != -1)) {
                            /* set in seq num */
                            ack_chan->in_msg_seq_num = out_msg->seq_num;
                            orte_rml_base_complete_recv_msg(&out_msg);
                            /* completing recv msg to rml */
                            OPAL_OUTPUT_VERBOSE((1, orte_qos_base_framework.framework_output,
                                                 "process_out_of_order_msg completed recv msg %d",
                                                 (first_lost_msg_seq_num + i)));
                        } else {
                            OPAL_OUTPUT_VERBOSE((1, orte_qos_base_framework.framework_output,
                                                 "%s process_out_of_order_msg lost msg %d not in hotel",
                                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                                 (first_lost_msg_seq_num + i)));
                        }
                    } /* end for */
                    /* send ACK */
                    send_ack (ack_chan, ack_chan->channel_num, ACK_RECV_MISSED_MSG,
                                ack_chan->in_msg_seq_num);
                } /* end if (i == num_lost_msgs) */
            } /* end if (ack_chan->ack_msg_seq_num > msg->seq_num) */
        } /* end else */
    } /* end duplicate else */
    return rc;
}
Пример #7
0
/*
 * Query the HNP for info on the procs of a job.
 *
 * Packs a REPORT_PROC_INFO command (job, vpid and - when epochs are enabled -
 * epoch), sends it to the HNP with a timeout, then waits with a timeout for
 * the answer and unpacks it.
 *
 * On success *num_procs holds the number of entries and *proc_info_array
 * points to a malloc'ed array of orte_proc_t pointers that the caller must
 * free; both stay 0 / NULL when the answer reports no procs or on any error.
 *
 * Returns ORTE_SUCCESS, the error code of the failing pack / send / recv /
 * unpack call, the value of error_exit when a wait ends with an error, or
 * ORTE_ERR_OUT_OF_RESOURCE when the result array cannot be allocated.
 */
int orte_util_comm_query_proc_info(const orte_process_name_t *hnp, orte_jobid_t job, orte_vpid_t vpid,
                                   int *num_procs, orte_proc_t ***proc_info_array)
#endif
{
    int ret;
    int32_t cnt, cnt_procs, n, i;
    opal_buffer_t *cmd;
    orte_daemon_cmd_flag_t command = ORTE_DAEMON_REPORT_PROC_INFO_CMD;
    orte_proc_t **proc_info;

    /* set default response */
    *num_procs = 0;
    *proc_info_array = NULL;
    
    /* query the HNP for info on the procs in this job */
    cmd = OBJ_NEW(opal_buffer_t);
    if (ORTE_SUCCESS != (ret = opal_dss.pack(cmd, &command, 1, ORTE_DAEMON_CMD))) {
        ORTE_ERROR_LOG(ret);
        OBJ_RELEASE(cmd);
        return ret;
    }
    if (ORTE_SUCCESS != (ret = opal_dss.pack(cmd, &job, 1, ORTE_JOBID))) {
        ORTE_ERROR_LOG(ret);
        OBJ_RELEASE(cmd);
        return ret;
    }
    if (ORTE_SUCCESS != (ret = opal_dss.pack(cmd, &vpid, 1, ORTE_VPID))) {
        ORTE_ERROR_LOG(ret);
        OBJ_RELEASE(cmd);
        return ret;
    }
#if ORTE_ENABLE_EPOCH
    if (ORTE_SUCCESS != (ret = opal_dss.pack(cmd, &epoch, 1, ORTE_EPOCH))) {
        ORTE_ERROR_LOG(ret);
        OBJ_RELEASE(cmd);
        return ret;
    }
#endif
    /* define a max time to wait for send to complete */
    timer_fired = false;
    error_exit = ORTE_SUCCESS;
    ORTE_DETECT_TIMEOUT(&quicktime, 100, 1000, 100000, quicktime_cb);
    
    /* do the send */
    if (0 > (ret = orte_rml.send_buffer_nb((orte_process_name_t*)hnp, cmd, ORTE_RML_TAG_DAEMON, 0,
                                           send_cbfunc, NULL))) {
        /* the send never started, so send_cbfunc will not run to cancel
         * the timer - do it here, same as the failed-recv path below */
        if (NULL != quicktime) {
            opal_event_evtimer_del(quicktime);
            free(quicktime);
            quicktime = NULL;
        }
        ORTE_ERROR_LOG(ret);
        OBJ_RELEASE(cmd);
        return ret;
    }
    
    /* wait for send to complete */
    ORTE_PROGRESSED_WAIT(timer_fired, 0, 1);
    
    /* release the buffer */
    OBJ_RELEASE(cmd);
    
    /* did it succeed? */
    if (ORTE_SUCCESS != error_exit) {
        return error_exit;
    }

    /* define a max time to wait for an answer */
    timer_fired = false;
    error_exit = ORTE_SUCCESS;
    ORTE_DETECT_TIMEOUT(&quicktime, 10, 1000, 10000, quicktime_cb);
    
    /* get the answer */
    OBJ_CONSTRUCT(&answer, opal_buffer_t);
    if (ORTE_SUCCESS != (ret = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD,
                                                      ORTE_RML_TAG_TOOL,
                                                      ORTE_RML_NON_PERSISTENT,
                                                      recv_info,
                                                      NULL))) {
        /* cancel the timer */
        if (NULL != quicktime) {
            opal_event_evtimer_del(quicktime);
            free(quicktime);
            quicktime = NULL;
        }
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&answer);
        return ret;
    }
    
    ORTE_PROGRESSED_WAIT(timer_fired, 0, 1);
    
    /* NOTE(review): on this error path the non-blocking recv posted above is
     * presumably still outstanding while `answer` is destructed - confirm
     * whether it needs to be cancelled here. */
    if (ORTE_SUCCESS != error_exit) {
        OBJ_DESTRUCT(&answer);
        return error_exit;
    }
    
    cnt = 1;
    if (ORTE_SUCCESS != (ret = opal_dss.unpack(&answer, &cnt_procs, &cnt, OPAL_INT32))) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&answer);
        return ret;
    }

    /* allocate the required memory */
    if (0 < cnt_procs) {
        proc_info = (orte_proc_t**)malloc(cnt_procs * sizeof(orte_proc_t*));
        if (NULL == proc_info) {
            ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
            OBJ_DESTRUCT(&answer);
            return ORTE_ERR_OUT_OF_RESOURCE;
        }
        /* unpack the procs */
        for (n=0; n < cnt_procs; n++) {
            cnt = 1;
            if (ORTE_SUCCESS != (ret = opal_dss.unpack(&answer, &proc_info[n], &cnt, ORTE_PROC))) {
                ORTE_ERROR_LOG(ret);
                /* drop the entries unpacked so far before freeing the array
                 * (assumes each successful ORTE_PROC unpack hands back an
                 * object owned by the caller - TODO confirm) */
                for (i=0; i < n; i++) {
                    OBJ_RELEASE(proc_info[i]);
                }
                OBJ_DESTRUCT(&answer);
                free(proc_info);
                return ret;
            }
        }
        *proc_info_array = proc_info;
        *num_procs = (int)cnt_procs;
    }
    OBJ_DESTRUCT(&answer);

    return ORTE_SUCCESS;
}
Пример #8
0
/*
 * Report an event to a connected tool.
 *
 * Does nothing (and returns ORTE_SUCCESS) when no tool is connected.
 * Otherwise packs the event type - plus the name of every node in
 * orte_node_pool for ORTE_COMM_EVENT_ALLOCATE - into a buffer and sends it
 * to the tool. When `step` is set, it then waits, with a timeout, for an
 * answer from the tool before returning.
 *
 * Returns ORTE_SUCCESS, ORTE_ERROR for an unrecognized event, the error code
 * of a failing pack / send / recv call, or the value of error_exit when the
 * wait for the answer ends with an error.
 */
int orte_util_comm_report_event(orte_comm_event_t ev)
{
    int rc, i;
    opal_buffer_t buf;
    orte_node_t *node;
    
    /* if nothing is connected, ignore this */
    if (!tool_connected) {
        return ORTE_SUCCESS;
    }
    
    /* init a buffer for the data */
    OBJ_CONSTRUCT(&buf, opal_buffer_t);
    /* flag the type of event */
    if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &ev, 1, ORTE_COMM_EVENT))) {
        ORTE_ERROR_LOG(rc);
        OBJ_DESTRUCT(&buf);
        return rc;
    }

    switch (ev) {
        case ORTE_COMM_EVENT_ALLOCATE:
            /* loop through nodes, storing just node names */
            for (i=0; i < orte_node_pool->size; i++) {
                if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, i))) {
                    continue;
                }
                /* do not send a partially packed report */
                if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &node->name, 1, OPAL_STRING))) {
                    ORTE_ERROR_LOG(rc);
                    OBJ_DESTRUCT(&buf);
                    return rc;
                }
            }
            break;
        
        case ORTE_COMM_EVENT_MAP:
            break;
        
        case ORTE_COMM_EVENT_LAUNCH:
            break;
        
        default:
            ORTE_ERROR_LOG(ORTE_ERROR);
            OBJ_DESTRUCT(&buf);
            return ORTE_ERROR;
    }
    
    /* do the send, then release the local buffer on every path: it is no
     * longer needed once send_buffer has returned, and previously it was
     * only destructed when the send failed */
    rc = orte_rml.send_buffer(&tool, &buf, ORTE_RML_TAG_TOOL, 0);
    OBJ_DESTRUCT(&buf);
    if (0 > rc) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }
    
    if (step) {
        /* the caller wants to wait until an ack is received -
         * define a max time to wait for an answer
         */
        OBJ_CONSTRUCT(&answer, opal_buffer_t);
        timer_fired = false;
        error_exit = ORTE_SUCCESS;
        ORTE_DETECT_TIMEOUT(&quicktime, 100, 1000, 100000, quicktime_cb);
        
        /* get the answer */
        if (ORTE_SUCCESS != (rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD,
                                                           ORTE_RML_TAG_TOOL,
                                                           ORTE_RML_NON_PERSISTENT,
                                                           recv_info,
                                                           NULL))) {
            /* cancel the timer */
            if (NULL != quicktime) {
                opal_event_evtimer_del(quicktime);
                free(quicktime);
                quicktime = NULL;
            }
            ORTE_ERROR_LOG(rc);
            OBJ_DESTRUCT(&answer);
            return rc;
        }
        
        ORTE_PROGRESSED_WAIT(timer_fired, 0, 1);
        
        /* cleanup */
        OBJ_DESTRUCT(&answer);

        if (ORTE_SUCCESS != error_exit) {
            return error_exit;
        }
    }
    
    return ORTE_SUCCESS;
}