/*
 * Broadcast a daemon command to the daemons.
 *
 * The command is packed into a freshly allocated buffer and handed to
 * orte_grpcomm.xcast together with this process's jobid and the daemon
 * RML tag.  The buffer is released before returning on every path.
 *
 * @param command  daemon command flag to pack and send
 * @return ORTE_SUCCESS, otherwise the code returned by the pack or
 *         xcast call that failed (the failure is also logged)
 */
int orte_plm_base_orted_exit(orte_daemon_cmd_flag_t command)
{
    opal_buffer_t *msg;
    int status;

    OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
                         "%s plm:base:orted_cmd sending orted_exit commands",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    /* record that termination of the orteds has been ordered */
    orte_orteds_term_ordered = true;

    msg = OBJ_NEW(opal_buffer_t);

    /* pack first; only attempt the xcast when packing worked */
    status = opal_dss.pack(msg, &command, 1, ORTE_DAEMON_CMD);
    if (ORTE_SUCCESS == status) {
        status = orte_grpcomm.xcast(ORTE_PROC_MY_NAME->jobid, msg, ORTE_RML_TAG_DAEMON);
    }

    /* either step failing is logged once, then we fall through to cleanup */
    if (ORTE_SUCCESS != status) {
        ORTE_ERROR_LOG(status);
    }
    OBJ_RELEASE(msg);

#if 0
    /* if we are abnormally ordering the termination, then
     * set a timeout in case it never finishes
     */
    if (orte_abnormal_term_ordered) {
        ORTE_DETECT_TIMEOUT(orte_process_info.num_procs, 100, 3, failed_cmd, NULL);
    }
#endif

    return status;
}
/*
 * Broadcast a daemon command to all daemons of this process's job.
 *
 * The command actually sent is ORTE_DAEMON_HALT_VM_CMD instead of the
 * caller's value whenever termination is abnormal, nothing was ever
 * launched, or routing is not enabled.  The xcast target is a
 * one-entry signature holding this job's id with a wildcard vpid.
 *
 * @param command  daemon command flag requested by the caller
 * @return ORTE_SUCCESS, otherwise the code returned by the pack or
 *         xcast call that failed (the failure is also logged)
 */
int orte_plm_base_orted_exit(orte_daemon_cmd_flag_t command)
{
    orte_grpcomm_signature_t *targets;
    orte_daemon_cmd_flag_t effective;
    opal_buffer_t *msg;
    int status;

    OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
                         "%s plm:base:orted_cmd sending orted_exit commands",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    /* record that termination of the orteds has been ordered */
    orte_orteds_term_ordered = true;

    /* When terminating before launch, or abnormally, the daemons may
     * not be wired up and so cannot rely on detecting their routed
     * children to determine termination - tell them to halt the VM.
     */
    if (!orte_abnormal_term_ordered && !orte_never_launched && orte_routing_is_enabled) {
        effective = command;
    } else {
        effective = ORTE_DAEMON_HALT_VM_CMD;
    }

    /* build the message carrying the command */
    msg = OBJ_NEW(opal_buffer_t);
    status = opal_dss.pack(msg, &effective, 1, ORTE_DAEMON_CMD);
    if (ORTE_SUCCESS != status) {
        ORTE_ERROR_LOG(status);
        OBJ_RELEASE(msg);
        return status;
    }

    /* address it to every daemon: own jobid, wildcard vpid */
    targets = OBJ_NEW(orte_grpcomm_signature_t);
    /* NOTE(review): the malloc result is used without a NULL check,
     * exactly as in the previous implementation - consider handling
     * allocation failure here.
     */
    targets->signature = (orte_process_name_t*)malloc(sizeof(orte_process_name_t));
    targets->signature->jobid = ORTE_PROC_MY_NAME->jobid;
    targets->signature->vpid = ORTE_VPID_WILDCARD;

    status = orte_grpcomm.xcast(targets, ORTE_RML_TAG_DAEMON, msg);
    if (ORTE_SUCCESS != status) {
        ORTE_ERROR_LOG(status);
    }

    OBJ_RELEASE(msg);
    OBJ_RELEASE(targets);

#if 0
    /* if we are abnormally ordering the termination, then
     * set a timeout in case it never finishes
     */
    if (orte_abnormal_term_ordered) {
        ORTE_DETECT_TIMEOUT(orte_process_info.num_procs, 100, 3, failed_cmd, NULL);
    }
#endif

    return status;
}
static void poll_spawns(int fd, short args, void *cbdata) { orte_state_caddy_t *state = (orte_state_caddy_t*)cbdata; int i, rc; bool failed_launch = true; int local_err; tm_event_t event; /* TM poll for all the spawns */ for (i = 0; i < launched; ++i) { rc = tm_poll(TM_NULL_EVENT, &event, 1, &local_err); if (TM_SUCCESS != rc) { opal_output(0, "plm:tm: failed to poll for a spawned daemon, return status = %d", rc); goto cleanup; } if (TM_SUCCESS != local_err) { opal_output(0, "plm:tm: failed to spawn daemon, error code = %d", local_err ); goto cleanup; } } failed_launch = false; #if 0 /* set a timer to tell us if one or more daemon's fails to start - use the * millisec/daemon timeout provided by the user to compute time */ if (0 < orte_startup_timeout) { OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, "%s plm:tm: setting startup timer for %d milliseconds", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), orte_startup_timeout)); ORTE_DETECT_TIMEOUT(map->num_new_daemons, orte_startup_timeout*1000, -1, failed_start, state->jdata); } #endif cleanup: /* cleanup */ OBJ_RELEASE(state); /* check for failed launch - if so, force terminate */ if (failed_launch) { ORTE_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); } }
int orte_util_comm_query_proc_info(const orte_process_name_t *hnp, orte_jobid_t job, orte_vpid_t vpid, int *num_procs, orte_proc_t ***proc_info_array) #endif { int ret; int32_t cnt, cnt_procs, n; opal_buffer_t *cmd; orte_daemon_cmd_flag_t command = ORTE_DAEMON_REPORT_PROC_INFO_CMD; orte_proc_t **proc_info; /* set default response */ *num_procs = 0; *proc_info_array = NULL; /* query the HNP for info on the procs in this job */ cmd = OBJ_NEW(opal_buffer_t); if (ORTE_SUCCESS != (ret = opal_dss.pack(cmd, &command, 1, ORTE_DAEMON_CMD))) { ORTE_ERROR_LOG(ret); OBJ_RELEASE(cmd); return ret; } if (ORTE_SUCCESS != (ret = opal_dss.pack(cmd, &job, 1, ORTE_JOBID))) { ORTE_ERROR_LOG(ret); OBJ_RELEASE(cmd); return ret; } if (ORTE_SUCCESS != (ret = opal_dss.pack(cmd, &vpid, 1, ORTE_VPID))) { ORTE_ERROR_LOG(ret); OBJ_RELEASE(cmd); return ret; } #if ORTE_ENABLE_EPOCH if (ORTE_SUCCESS != (ret = opal_dss.pack(cmd, &epoch, 1, ORTE_EPOCH))) { ORTE_ERROR_LOG(ret); OBJ_RELEASE(cmd); return ret; } #endif /* define a max time to wait for send to complete */ timer_fired = false; error_exit = ORTE_SUCCESS; ORTE_DETECT_TIMEOUT(&quicktime, 100, 1000, 100000, quicktime_cb); /* do the send */ if (0 > (ret = orte_rml.send_buffer_nb((orte_process_name_t*)hnp, cmd, ORTE_RML_TAG_DAEMON, 0, send_cbfunc, NULL))) { ORTE_ERROR_LOG(ret); OBJ_RELEASE(cmd); return ret; } /* wait for send to complete */ ORTE_PROGRESSED_WAIT(timer_fired, 0, 1); /* release the buffer */ OBJ_RELEASE(cmd); /* did it succeed? 
*/ if (ORTE_SUCCESS != error_exit) { return error_exit; } /* define a max time to wait for an answer */ timer_fired = false; error_exit = ORTE_SUCCESS; ORTE_DETECT_TIMEOUT(&quicktime, 10, 1000, 10000, quicktime_cb); /* get the answer */ OBJ_CONSTRUCT(&answer, opal_buffer_t); if (ORTE_SUCCESS != (ret = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_TOOL, ORTE_RML_NON_PERSISTENT, recv_info, NULL))) { /* cancel the timer */ if (NULL != quicktime) { opal_event_evtimer_del(quicktime); free(quicktime); quicktime = NULL; } ORTE_ERROR_LOG(ret); OBJ_DESTRUCT(&answer); return ret; } ORTE_PROGRESSED_WAIT(timer_fired, 0, 1); if (ORTE_SUCCESS != error_exit) { OBJ_DESTRUCT(&answer); return error_exit; } cnt = 1; if (ORTE_SUCCESS != (ret = opal_dss.unpack(&answer, &cnt_procs, &cnt, OPAL_INT32))) { ORTE_ERROR_LOG(ret); OBJ_DESTRUCT(&answer); return ret; } /* allocate the required memory */ if (0 < cnt_procs) { proc_info = (orte_proc_t**)malloc(cnt_procs * sizeof(orte_proc_t*)); /* unpack the procs */ for (n=0; n < cnt_procs; n++) { cnt = 1; if (ORTE_SUCCESS != (ret = opal_dss.unpack(&answer, &proc_info[n], &cnt, ORTE_PROC))) { ORTE_ERROR_LOG(ret); OBJ_DESTRUCT(&answer); free(proc_info); return ret; } } *proc_info_array = proc_info; *num_procs = (int)cnt_procs; } OBJ_DESTRUCT(&answer); return ORTE_SUCCESS; }
/*
 * Report an event to a connected tool.
 *
 * Does nothing (and returns ORTE_SUCCESS) when no tool is connected.
 * Otherwise the event type is packed into a buffer; for
 * ORTE_COMM_EVENT_ALLOCATE the name of every non-NULL entry in
 * orte_node_pool is appended.  The buffer is sent to `tool` on the
 * tool tag with the blocking send_buffer call.  When the file-level
 * `step` flag is set, the function then posts a receive on the tool
 * tag and waits, guarded by a timer, before returning.
 *
 * @param ev  event to report; values other than ALLOCATE, MAP and
 *            LAUNCH are rejected with ORTE_ERROR
 * @return ORTE_SUCCESS, ORTE_ERROR for an unknown event, a
 *         pack/send/receive error code, or the value of error_exit
 *         when the wait ended with it set
 */
int orte_util_comm_report_event(orte_comm_event_t ev)
{
    int rc, i;
    opal_buffer_t buf;
    orte_node_t *node;

    /* if nothing is connected, ignore this */
    if (!tool_connected) {
        return ORTE_SUCCESS;
    }

    /* init a buffer for the data */
    OBJ_CONSTRUCT(&buf, opal_buffer_t);

    /* flag the type of event */
    if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &ev, 1, ORTE_COMM_EVENT))) {
        ORTE_ERROR_LOG(rc);
        OBJ_DESTRUCT(&buf);
        return rc;
    }

    switch (ev) {
    case ORTE_COMM_EVENT_ALLOCATE:
        /* loop through nodes, storing just node names */
        for (i = 0; i < orte_node_pool->size; i++) {
            node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, i);
            if (NULL == node) {
                continue;
            }
            if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &node->name, 1, OPAL_STRING))) {
                ORTE_ERROR_LOG(rc);
                OBJ_DESTRUCT(&buf);
                return rc;
            }
        }
        break;

    case ORTE_COMM_EVENT_MAP:
    case ORTE_COMM_EVENT_LAUNCH:
        /* nothing beyond the event type is sent for these */
        break;

    default:
        ORTE_ERROR_LOG(ORTE_ERROR);
        OBJ_DESTRUCT(&buf);
        return ORTE_ERROR;
    }

    /* do the send; the call is blocking, so the buffer is torn down
     * right after it on both the success and the failure path
     */
    rc = orte_rml.send_buffer(&tool, &buf, ORTE_RML_TAG_TOOL, 0);
    OBJ_DESTRUCT(&buf);
    if (0 > rc) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }

    if (step) {
        /* the caller wants to wait until an ack is received -
         * define a max time to wait for an answer
         */
        OBJ_CONSTRUCT(&answer, opal_buffer_t);
        timer_fired = false;
        error_exit = ORTE_SUCCESS;
        ORTE_DETECT_TIMEOUT(&quicktime, 100, 1000, 100000, quicktime_cb);

        /* get the answer */
        if (ORTE_SUCCESS != (rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD,
                                                          ORTE_RML_TAG_TOOL,
                                                          ORTE_RML_NON_PERSISTENT,
                                                          recv_info,
                                                          NULL))) {
            /* cancel the timer */
            if (NULL != quicktime) {
                opal_event_evtimer_del(quicktime);
                free(quicktime);
                quicktime = NULL;
            }
            ORTE_ERROR_LOG(rc);
            OBJ_DESTRUCT(&answer);
            return rc;
        }

        ORTE_PROGRESSED_WAIT(timer_fired, 0, 1);

        /* cleanup */
        OBJ_DESTRUCT(&answer);

        if (ORTE_SUCCESS != error_exit) {
            return error_exit;
        }
    }

    return ORTE_SUCCESS;
}
static int onesided_barrier(void) { int num_participating; opal_list_t daemon_tree; opal_buffer_t buf; orte_process_name_t my_parent; opal_event_t *quicktime=NULL; struct timeval quicktimeval; int rc; OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_output, "%s grpcomm:bad: onesided barrier called", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); /* if we are not to use the barrier, then just return */ if (!orte_orted_exit_with_barrier) { if (ORTE_PROC_IS_HNP) { /* if we are the HNP, we need to do a little delay to give * the orteds a chance to exit before we leave */ OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_output, "%s grpcomm:bad: onesided barrier adding delay timer", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); quicktimeval.tv_sec = 0; quicktimeval.tv_usec = 100; timer_fired = false; ORTE_DETECT_TIMEOUT(&quicktime, orte_process_info.num_procs, 1000, 10000, quicktime_cb); ORTE_PROGRESSED_WAIT(timer_fired, 0, 1); } return ORTE_SUCCESS; } /* initialize things */ num_onesided_barrier_recvd = 0; num_participating = 0; /* figure out how many participants we should be expecting */ OBJ_CONSTRUCT(&daemon_tree, opal_list_t); my_parent.jobid = ORTE_PROC_MY_NAME->jobid; my_parent.vpid = orte_routed.get_routing_tree(&daemon_tree); num_participating = opal_list_get_size(&daemon_tree); OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_output, "%s grpcomm:bad: onesided barrier num_participating %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), num_participating)); /* set the recv */ if (ORTE_SUCCESS != (rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_ONESIDED_BARRIER, ORTE_RML_NON_PERSISTENT, onesided_barrier_recv, NULL))) { ORTE_ERROR_LOG(rc); } /* wait to recv them */ ORTE_PROGRESSED_WAIT(false, num_onesided_barrier_recvd, num_participating); /* cancel the recv */ orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_ONESIDED_BARRIER); /* if I am the HNP, then we are done */ if (ORTE_PROC_IS_HNP) { return ORTE_SUCCESS; } /* send a zero-byte msg to my parent */ OBJ_CONSTRUCT(&buf, opal_buffer_t); /* 
send it */ OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_output, "%s grpcomm:bad:onsided:barrier not the HNP - sending to parent %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&my_parent))); if (0 > (rc = orte_rml.send_buffer(&my_parent, &buf, ORTE_RML_TAG_ONESIDED_BARRIER, 0))) { ORTE_ERROR_LOG(rc); OBJ_DESTRUCT(&buf); return rc; } OBJ_DESTRUCT(&buf); return ORTE_SUCCESS; }