/*
 * Stop the timer attached to a peer.
 *
 * Does nothing when the peer's timer is not marked active. Otherwise the
 * active flag is cleared first and the timer event is then removed.
 */
void mca_oob_ud_peer_stop_timer (mca_oob_ud_peer_t *peer)
{
    if (!peer->peer_timer.active) {
        /* timer is not running - nothing to remove */
        return;
    }

    peer->peer_timer.active = false;
    opal_event_evtimer_del (&peer->peer_timer.event);
}
/*
 * Completion callback handed to the non-blocking send.
 *
 * Removes and frees the pending timeout timer (if one is still set) and
 * then raises timer_fired. None of the callback arguments are examined.
 */
static void send_cbfunc(int status,
                        orte_process_name_t* sender,
                        opal_buffer_t* buffer,
                        orte_rml_tag_t tag,
                        void* cbdata)
{
    /* the send finished, so the timeout is no longer needed */
    if (quicktime != NULL) {
        opal_event_evtimer_del(quicktime);
        free(quicktime);
        quicktime = NULL;
    }

    /* flag completion */
    timer_fired = true;
}
/*
 * Receive callback for the reply to a query.
 *
 * Removes and frees the pending timeout timer (if one is still set),
 * copies the payload of the received buffer into the file-level "answer"
 * buffer (a copy failure is logged but otherwise ignored), and finally
 * raises timer_fired. status, sender, tag and cbdata are not examined.
 */
static void recv_info(int status,
                      orte_process_name_t* sender,
                      opal_buffer_t* buffer,
                      orte_rml_tag_t tag,
                      void* cbdata)
{
    int copy_rc;

    /* a reply arrived, so the timeout is no longer needed */
    if (quicktime != NULL) {
        opal_event_evtimer_del(quicktime);
        free(quicktime);
        quicktime = NULL;
    }

    /* move the reply into the shared answer buffer */
    copy_rc = opal_dss.copy_payload(&answer, buffer);
    if (ORTE_SUCCESS != copy_rc) {
        ORTE_ERROR_LOG(copy_rc);
    }

    /* flag completion */
    timer_fired = true;
}
/*
 * Handle a message received on an ACK QoS channel.
 *
 * A message is "in order" when its seq_num is exactly in_msg_seq_num + 1
 * and is greater than ack_msg_seq_num. Anything else is passed to
 * process_out_of_order_msg() and that function's result is returned.
 *
 * For an in-order message:
 *   - if it closes the window (seq_num - ack_msg_seq_num == window), the
 *     window ACK timer is removed and an ACK_WINDOW_COMPLETE ack is sent;
 *     the result of send_ack() is returned;
 *   - otherwise ORTE_SUCCESS is returned, and the window ACK timer is
 *     armed with timeout_secs when in_msg_seq_num == ack_msg_seq_num
 *     (i.e. this is the first message of a new window).
 * In both cases in_msg_seq_num is advanced to the message's seq_num.
 */
static int ack_recv (void *qos_channel, orte_rml_recv_t *msg)
{
    orte_qos_ack_channel_t *chan = (orte_qos_ack_channel_t*) qos_channel;
    struct timeval window_timeout;
    int32_t ret;

    OPAL_OUTPUT_VERBOSE((1, orte_qos_base_framework.framework_output,
                         "%s ack_recv msg = %p seq_num = %d from peer = %s\n",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         (void*)msg, msg->seq_num,
                         ORTE_NAME_PRINT(&msg->sender)));

    /* not the next expected msg - take the out-of-order path */
    if ((chan->in_msg_seq_num + 1 != msg->seq_num) ||
        (chan->ack_msg_seq_num >= msg->seq_num)) {
        return process_out_of_order_msg(chan, msg);
    }

    if (chan->window == (msg->seq_num - chan->ack_msg_seq_num)) {
        /* end of the window: stop the window ack timer and ack it */
        opal_event_evtimer_del (&chan->msg_ack_timer_event);
        ret = send_ack (chan, msg->channel_num, ACK_WINDOW_COMPLETE, msg->seq_num);
    } else {
        if (chan->in_msg_seq_num == chan->ack_msg_seq_num) {
            /* beginning of a window: start the window ack timer */
            window_timeout.tv_sec = chan->timeout_secs;
            window_timeout.tv_usec = 0;
            opal_event_evtimer_add (&chan->msg_ack_timer_event, &window_timeout);
        }
        ret = ORTE_SUCCESS;
    }

    /* record this msg as the latest one received in order */
    chan->in_msg_seq_num = msg->seq_num;
    return ret;
}
void orte_show_help_finalize(void) { if (!ready) { return; } ready = false; opal_show_help = save_help; save_help = NULL; /* Shutdown show_help, showing final messages */ if (ORTE_PROC_IS_HNP) { show_accumulated_duplicates(0, 0, NULL); OBJ_DESTRUCT(&abd_tuples); if (show_help_timer_set) { opal_event_evtimer_del(&show_help_timer_event); } /* cancel the recv */ orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_SHOW_HELP); return; } }
/*
 * Handle a message that did not arrive as the next expected one on an
 * ACK QoS channel.
 *
 * A per-channel lookup (orte_qos_ack_channel_get_msg_room) maps a seq num
 * to the hotel room holding that message; -1 means "no room recorded".
 *
 *  - If the message's seq num already has a room it is a duplicate and
 *    ORTE_ERR_DUPLICATE_MSG is returned without further action.
 *  - Otherwise the message is checked into the outstanding_msgs hotel and
 *    its room is recorded. If check-in fails, that error is returned.
 *  - If ack_msg_seq_num <= in_msg_seq_num an ACK_OUT_OF_ORDER ack is sent
 *    and the window ack timer is removed.
 *  - Else, if the message's seq num is below ack_msg_seq_num it is one of
 *    the lost messages. Once every seq num in
 *    (in_msg_seq_num, ack_msg_seq_num] has a room, the stored messages are
 *    completed in seq num order, starting at the first lost one and
 *    stopping at the first seq num with no room, and an
 *    ACK_RECV_MISSED_MSG ack is sent for the last one completed.
 *
 * Returns ORTE_ERR_DUPLICATE_MSG, the opal_hotel_checkin() error, or
 * ORTE_ERR_OUT_OF_ORDER_MSG. The results of send_ack() are not propagated.
 */
static inline int process_out_of_order_msg ( orte_qos_ack_channel_t *ack_chan, orte_rml_recv_t *msg)
{
    int32_t rc, room_num, first_lost_msg_seq_num, num_lost_msgs, i;
    orte_rml_recv_t *out_msg;
    void *occupant = NULL;

    OPAL_OUTPUT_VERBOSE((1, orte_qos_base_framework.framework_output,
                         "%s process_out_of_order_msg msg %d",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), msg->seq_num));

    /* if this msg is a duplicate - then do nothing */
    if ((orte_qos_ack_channel_get_msg_room(ack_chan, msg->seq_num)) != -1) {
        OPAL_OUTPUT_VERBOSE((1, orte_qos_base_framework.framework_output,
                             "%s process_out_of_order_msg msg %d",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), msg->seq_num));
        return ORTE_ERR_DUPLICATE_MSG;
    }

    /* park the msg in the hotel and remember which room it got */
    if (OPAL_SUCCESS != (rc = opal_hotel_checkin(&ack_chan->outstanding_msgs, (void*)msg, &room_num))) {
        return rc;
    }
    OPAL_OUTPUT_VERBOSE((1, orte_qos_base_framework.framework_output,
                         "process_out_of_order_msg checked in msg %d in room %d\n",
                         msg->seq_num, room_num));
    orte_qos_ack_channel_set_msg_room (ack_chan, msg->seq_num, room_num);
    rc = ORTE_ERR_OUT_OF_ORDER_MSG;

    /* check if we need to send an ACK */
    if (ack_chan->ack_msg_seq_num <= ack_chan->in_msg_seq_num) {
        OPAL_OUTPUT_VERBOSE((1, orte_qos_base_framework.framework_output,
                             "%s process_out_of_order_msg sending ack last seq_num = %d\n",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), msg->seq_num));
        /* send ACK. */
        send_ack (ack_chan, msg->channel_num, ACK_OUT_OF_ORDER, msg->seq_num);
        /* stop window ack timer */
        opal_event_evtimer_del (&ack_chan->msg_ack_timer_event);
    } else if (ack_chan->ack_msg_seq_num > msg->seq_num) {
        /* we got a lost msg - any seq num between in_msg_seq_num and ack_seq_num */

        /* check if we have got all lost msgs */
        first_lost_msg_seq_num = ack_chan->in_msg_seq_num + 1;
        num_lost_msgs = ack_chan->ack_msg_seq_num - ack_chan->in_msg_seq_num;
        OPAL_OUTPUT_VERBOSE((1, orte_qos_base_framework.framework_output,
                             "%s process_out_of_order_msg msg %d first_lost_msg =%d num_lost_msgs =%d",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), msg->seq_num,
                             first_lost_msg_seq_num, num_lost_msgs));
        for (i = 0; i < num_lost_msgs; i++) {
            if ((orte_qos_ack_channel_get_msg_room(ack_chan, first_lost_msg_seq_num + i)) == -1) {
                break;
            }
        }

        if (i == num_lost_msgs) {
            /* we got all the lost msgs so we can complete all the msgs in the hotel now */
            /* reset ack_seq_num */
            ack_chan->ack_msg_seq_num = first_lost_msg_seq_num - 1;
            room_num = 0;
            /* walk forward in seq num order until a seq num with no room is hit */
            for (i = 0; room_num != -1; i++) {
                OPAL_OUTPUT_VERBOSE((1, orte_qos_base_framework.framework_output,
                                     "%s process_out_of_order_msg got all lost msgs completing outstanding msgs %d",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                     (first_lost_msg_seq_num + i)));
                /* evict msg and complete it */
                room_num = orte_qos_ack_channel_get_msg_room (ack_chan, first_lost_msg_seq_num + i);
                if (-1 != room_num) {
                    opal_hotel_checkout_and_return_occupant(&ack_chan->outstanding_msgs,
                                                            room_num, &occupant);
                } else {
                    /* -1 is the "no room" sentinel, not a room index: it must
                     * never be handed to the hotel. This is always the case on
                     * the final iteration, which is what ends the loop. */
                    occupant = NULL;
                }
                orte_qos_ack_channel_set_msg_room(ack_chan, first_lost_msg_seq_num + i, -1);
                out_msg = (orte_rml_recv_t *) occupant;
                if ((NULL != out_msg) && (room_num != -1)) {
                    /* set in seq num */
                    ack_chan->in_msg_seq_num = out_msg->seq_num;
                    /* completing recv msg to rml */
                    orte_rml_base_complete_recv_msg(&out_msg);
                    OPAL_OUTPUT_VERBOSE((1, orte_qos_base_framework.framework_output,
                                         "process_out_of_order_msg completed recv msg %d",
                                         (first_lost_msg_seq_num + i)));
                } else {
                    OPAL_OUTPUT_VERBOSE((1, orte_qos_base_framework.framework_output,
                                         "%s process_out_of_order_msg lost msg %d not in hotel",
                                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                         (first_lost_msg_seq_num + i)));
                }
            }
            /* send ACK */
            send_ack (ack_chan, ack_chan->channel_num, ACK_RECV_MISSED_MSG, ack_chan->in_msg_seq_num);
        }
    }

    return rc;
}
int orte_util_comm_query_proc_info(const orte_process_name_t *hnp, orte_jobid_t job, orte_vpid_t vpid, int *num_procs, orte_proc_t ***proc_info_array) #endif { int ret; int32_t cnt, cnt_procs, n; opal_buffer_t *cmd; orte_daemon_cmd_flag_t command = ORTE_DAEMON_REPORT_PROC_INFO_CMD; orte_proc_t **proc_info; /* set default response */ *num_procs = 0; *proc_info_array = NULL; /* query the HNP for info on the procs in this job */ cmd = OBJ_NEW(opal_buffer_t); if (ORTE_SUCCESS != (ret = opal_dss.pack(cmd, &command, 1, ORTE_DAEMON_CMD))) { ORTE_ERROR_LOG(ret); OBJ_RELEASE(cmd); return ret; } if (ORTE_SUCCESS != (ret = opal_dss.pack(cmd, &job, 1, ORTE_JOBID))) { ORTE_ERROR_LOG(ret); OBJ_RELEASE(cmd); return ret; } if (ORTE_SUCCESS != (ret = opal_dss.pack(cmd, &vpid, 1, ORTE_VPID))) { ORTE_ERROR_LOG(ret); OBJ_RELEASE(cmd); return ret; } #if ORTE_ENABLE_EPOCH if (ORTE_SUCCESS != (ret = opal_dss.pack(cmd, &epoch, 1, ORTE_EPOCH))) { ORTE_ERROR_LOG(ret); OBJ_RELEASE(cmd); return ret; } #endif /* define a max time to wait for send to complete */ timer_fired = false; error_exit = ORTE_SUCCESS; ORTE_DETECT_TIMEOUT(&quicktime, 100, 1000, 100000, quicktime_cb); /* do the send */ if (0 > (ret = orte_rml.send_buffer_nb((orte_process_name_t*)hnp, cmd, ORTE_RML_TAG_DAEMON, 0, send_cbfunc, NULL))) { ORTE_ERROR_LOG(ret); OBJ_RELEASE(cmd); return ret; } /* wait for send to complete */ ORTE_PROGRESSED_WAIT(timer_fired, 0, 1); /* release the buffer */ OBJ_RELEASE(cmd); /* did it succeed? 
*/ if (ORTE_SUCCESS != error_exit) { return error_exit; } /* define a max time to wait for an answer */ timer_fired = false; error_exit = ORTE_SUCCESS; ORTE_DETECT_TIMEOUT(&quicktime, 10, 1000, 10000, quicktime_cb); /* get the answer */ OBJ_CONSTRUCT(&answer, opal_buffer_t); if (ORTE_SUCCESS != (ret = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_TOOL, ORTE_RML_NON_PERSISTENT, recv_info, NULL))) { /* cancel the timer */ if (NULL != quicktime) { opal_event_evtimer_del(quicktime); free(quicktime); quicktime = NULL; } ORTE_ERROR_LOG(ret); OBJ_DESTRUCT(&answer); return ret; } ORTE_PROGRESSED_WAIT(timer_fired, 0, 1); if (ORTE_SUCCESS != error_exit) { OBJ_DESTRUCT(&answer); return error_exit; } cnt = 1; if (ORTE_SUCCESS != (ret = opal_dss.unpack(&answer, &cnt_procs, &cnt, OPAL_INT32))) { ORTE_ERROR_LOG(ret); OBJ_DESTRUCT(&answer); return ret; } /* allocate the required memory */ if (0 < cnt_procs) { proc_info = (orte_proc_t**)malloc(cnt_procs * sizeof(orte_proc_t*)); /* unpack the procs */ for (n=0; n < cnt_procs; n++) { cnt = 1; if (ORTE_SUCCESS != (ret = opal_dss.unpack(&answer, &proc_info[n], &cnt, ORTE_PROC))) { ORTE_ERROR_LOG(ret); OBJ_DESTRUCT(&answer); free(proc_info); return ret; } } *proc_info_array = proc_info; *num_procs = (int)cnt_procs; } OBJ_DESTRUCT(&answer); return ORTE_SUCCESS; }
/*
 * Report an event to a connected tool.
 *
 * Does nothing (and returns ORTE_SUCCESS) when no tool is connected.
 * Otherwise a buffer is built that starts with the event type; for
 * ORTE_COMM_EVENT_ALLOCATE it is followed by the name of every non-NULL
 * entry in orte_node_pool. ORTE_COMM_EVENT_MAP and ORTE_COMM_EVENT_LAUNCH
 * carry no extra payload. Any other event value is logged and rejected
 * with ORTE_ERROR.
 *
 * The buffer is sent to the tool on ORTE_RML_TAG_TOOL. When the file-level
 * "step" flag is set, the function then posts a non-persistent receive on
 * the same tag and waits (bounded by an ORTE_DETECT_TIMEOUT timer) for a
 * reply before returning.
 *
 * Returns ORTE_SUCCESS, ORTE_ERROR for an unknown event, the error code of
 * a failed pack/send/receive, or error_exit when the wait did not end
 * with ORTE_SUCCESS.
 */
int orte_util_comm_report_event(orte_comm_event_t ev)
{
    int rc, i;
    opal_buffer_t buf;
    orte_node_t *node;

    /* if nothing is connected, ignore this */
    if (!tool_connected) {
        return ORTE_SUCCESS;
    }

    /* init a buffer for the data */
    OBJ_CONSTRUCT(&buf, opal_buffer_t);

    /* flag the type of event */
    if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &ev, 1, ORTE_COMM_EVENT))) {
        ORTE_ERROR_LOG(rc);
        OBJ_DESTRUCT(&buf);
        return rc;
    }

    switch (ev) {
        case ORTE_COMM_EVENT_ALLOCATE:
            /* loop through nodes, storing just node names */
            for (i = 0; i < orte_node_pool->size; i++) {
                if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, i))) {
                    continue;
                }
                /* do not send a buffer that is missing an entry */
                if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &node->name, 1, OPAL_STRING))) {
                    ORTE_ERROR_LOG(rc);
                    OBJ_DESTRUCT(&buf);
                    return rc;
                }
            }
            break;

        case ORTE_COMM_EVENT_MAP:
            break;

        case ORTE_COMM_EVENT_LAUNCH:
            break;

        default:
            ORTE_ERROR_LOG(ORTE_ERROR);
            OBJ_DESTRUCT(&buf);
            return ORTE_ERROR;
    }

    /* do the send */
    if (0 > (rc = orte_rml.send_buffer(&tool, &buf, ORTE_RML_TAG_TOOL, 0))) {
        ORTE_ERROR_LOG(rc);
        OBJ_DESTRUCT(&buf);
        return rc;
    }

    /* buf is an automatic variable that nothing below touches again, so
     * the send must be done with it by now: release its contents here
     * instead of leaking them on every successful report */
    OBJ_DESTRUCT(&buf);

    if (step) {
        /* the caller wants to wait until an ack is received -
         * define a max time to wait for an answer
         */
        OBJ_CONSTRUCT(&answer, opal_buffer_t);
        timer_fired = false;
        error_exit = ORTE_SUCCESS;
        ORTE_DETECT_TIMEOUT(&quicktime, 100, 1000, 100000, quicktime_cb);

        /* get the answer */
        if (ORTE_SUCCESS != (rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD,
                                                          ORTE_RML_TAG_TOOL,
                                                          ORTE_RML_NON_PERSISTENT,
                                                          recv_info,
                                                          NULL))) {
            /* cancel the timer */
            if (NULL != quicktime) {
                opal_event_evtimer_del(quicktime);
                free(quicktime);
                quicktime = NULL;
            }
            ORTE_ERROR_LOG(rc);
            OBJ_DESTRUCT(&answer);
            return rc;
        }

        ORTE_PROGRESSED_WAIT(timer_fired, 0, 1);

        /* cleanup */
        OBJ_DESTRUCT(&answer);

        if (ORTE_SUCCESS != error_exit) {
            return error_exit;
        }
    }

    return ORTE_SUCCESS;
}