/*
 * Answer a request for the name of the node hosting a given process.
 *
 * The requested process name is unpacked from the incoming buffer, the
 * owning job and proc objects are looked up locally, and the node name
 * is packed into a reply sent back on the FILEM response tag.  Any
 * failure is treated as fatal and forces termination.
 */
static void filem_base_process_get_proc_node_name_cmd(orte_process_name_t* sender,
                                                      opal_buffer_t* buffer)
{
    opal_buffer_t *reply;
    orte_std_cntr_t n = 1;
    orte_job_t *jdata = NULL;
    orte_proc_t *proc = NULL;
    orte_process_name_t name;
    int ret;

    /* unpack the name of the process whose node we must report */
    if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &name, &n, ORTE_NAME))) {
        ORTE_ERROR_LOG(ret);
        ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
        return;
    }

    /* locate the job this process belongs to */
    jdata = orte_get_job_data_object(name.jobid);
    if (NULL == jdata) {
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
        return;
    }

    /* and the proc object within that job - it must have a node */
    proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, name.vpid);
    if (NULL == proc || NULL == proc->node) {
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
        return;
    }

    /* pack the node name into the reply */
    reply = OBJ_NEW(opal_buffer_t);
    if (ORTE_SUCCESS != (ret = opal_dss.pack(reply, &(proc->node->name), 1, OPAL_STRING))) {
        ORTE_ERROR_LOG(ret);
        ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
        OBJ_RELEASE(reply);
        return;
    }

    /* send it back to the requestor; the RML owns the buffer on success */
    if (0 > (ret = orte_rml.send_buffer_nb(sender, reply,
                                           ORTE_RML_TAG_FILEM_BASE_RESP,
                                           orte_rml_send_callback, NULL))) {
        ORTE_ERROR_LOG(ret);
        ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
        OBJ_RELEASE(reply);
        return;
    }
}
/* after we allocate, we need to map the processes * so we know what nodes will be used */ static void allocation_complete(int fd, short args, void *cbdata) { orte_state_caddy_t *state = (orte_state_caddy_t*)cbdata; orte_job_t *jdata; orte_job_t *daemons; orte_topology_t *t; orte_node_t *node; int i; ORTE_ACQUIRE_OBJECT(caddy); jdata = state->jdata; jdata->state = ORTE_JOB_STATE_ALLOCATION_COMPLETE; /* get the daemon job object */ if (NULL == (daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid))) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); goto done; } /* mark that we are not using a VM */ orte_set_attribute(&daemons->attributes, ORTE_JOB_NO_VM, ORTE_ATTR_GLOBAL, NULL, OPAL_BOOL); /* ensure that all nodes point to our topology - we * cannot support hetero nodes with this state machine */ t = (orte_topology_t*)opal_pointer_array_get_item(orte_node_topologies, 0); for (i=1; i < orte_node_pool->size; i++) { if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, i))) { continue; } node->topology = t; } if (!orte_managed_allocation) { if (NULL != orte_set_slots && 0 != strncmp(orte_set_slots, "none", strlen(orte_set_slots))) { for (i=0; i < orte_node_pool->size; i++) { if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, i))) { continue; } if (!ORTE_FLAG_TEST(node, ORTE_NODE_FLAG_SLOTS_GIVEN)) { OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output, "%s plm:base:setting slots for node %s by %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), node->name, orte_set_slots)); orte_plm_base_set_slots(node); } } } } /* move to the map stage */ ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_MAP); done: /* cleanup */ OBJ_RELEASE(state); }
/* Callback fired when file prepositioning finishes: on success, advance
 * the job to the system-prep state; on failure, force termination with
 * the reported status. */
static void files_ready(int status, void *cbdata)
{
    orte_job_t *jdata = (orte_job_t*)cbdata;

    if (ORTE_SUCCESS == status) {
        ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_SYSTEM_PREP);
        return;
    }
    /* prepositioning failed - abort with the failure status */
    ORTE_FORCED_TERMINATE(status);
}
static void init_complete(int sd, short args, void *cbdata) { orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata; /* nothing to do here but move along - if it is the * daemon job, then next step is allocate */ if (caddy->jdata->jobid == ORTE_PROC_MY_NAME->jobid) { ORTE_ACTIVATE_JOB_STATE(caddy->jdata, ORTE_JOB_STATE_ALLOCATE); } else { /* next step - position any required files */ if (ORTE_SUCCESS != orte_filem.preposition_files(caddy->jdata, files_ready, caddy->jdata)) { ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); } } OBJ_RELEASE(caddy); }
/* after we allocate, we need to map the processes * so we know what nodes will be used */ static void allocation_complete(int fd, short args, void *cbdata) { orte_state_caddy_t *state = (orte_state_caddy_t*)cbdata; orte_job_t *jdata = state->jdata; orte_job_t *daemons; jdata->state = ORTE_JOB_STATE_ALLOCATION_COMPLETE; /* get the daemon job object */ if (NULL == (daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid))) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); goto done; } /* mark that we are not using a VM */ orte_set_attribute(&daemons->attributes, ORTE_JOB_NO_VM, ORTE_ATTR_GLOBAL, NULL, OPAL_BOOL); #if OPAL_HAVE_HWLOC { orte_topology_t *t; orte_node_t *node; int i; /* ensure that all nodes point to our topology - we * cannot support hetero nodes with this state machine */ t = (orte_topology_t*)opal_pointer_array_get_item(orte_node_topologies, 0); for (i=1; i < orte_node_pool->size; i++) { if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, i))) { continue; } node->topology = t->topo; } } #endif /* move to the map stage */ ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_MAP); done: /* cleanup */ OBJ_RELEASE(state); }
/*
 * State callback fired when the DVM (distributed virtual machine) is up.
 *
 * If the job is our own (the daemon job), broadcast the nidmap - and,
 * unless static ports / port forwarding are in use, the daemon wireup
 * info - to every daemon, then announce "DVM ready" on stdout.  For any
 * other job, mark it VM_READY and kick off file prepositioning.
 */
static void vm_ready(int fd, short args, void *cbdata)
{
    orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
    int rc;
    opal_buffer_t *buf;
    orte_daemon_cmd_flag_t command = ORTE_DAEMON_DVM_NIDMAP_CMD;
    orte_grpcomm_signature_t *sig;
    opal_buffer_t *wireup;
    opal_byte_object_t bo, *boptr;
    int8_t flag;
    int32_t numbytes;

    /* if this is my job, then we are done */
    if (ORTE_PROC_MY_NAME->jobid == caddy->jdata->jobid) {
        /* send the daemon map to every daemon in this DVM - we
         * do this here so we don't have to do it for every
         * job we are going to launch */
        buf = OBJ_NEW(opal_buffer_t);
        /* pack the "load nidmap" cmd */
        if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &command, 1, ORTE_DAEMON_CMD))) {
            ORTE_ERROR_LOG(rc);
            OBJ_RELEASE(buf);
            return;
        }
        /* flag that daemons were launched so we will update the nidmap
         * (pack return deliberately ignored here, as elsewhere for flags) */
        flag = 1;
        opal_dss.pack(buf, &flag, 1, OPAL_INT8);
        /* construct a nodemap with everything in it */
        if (ORTE_SUCCESS != (rc = orte_util_encode_nodemap(buf))) {
            ORTE_ERROR_LOG(rc);
            OBJ_RELEASE(buf);
            return;
        }
        if (!orte_static_ports && !orte_fwd_mpirun_port) {
            /* pack a flag indicating wiring info is provided */
            flag = 1;
            opal_dss.pack(buf, &flag, 1, OPAL_INT8);
            /* get wireup info for daemons per the selected routing module */
            wireup = OBJ_NEW(opal_buffer_t);
            if (ORTE_SUCCESS != (rc = orte_rml_base_get_contact_info(ORTE_PROC_MY_NAME->jobid, wireup))) {
                ORTE_ERROR_LOG(rc);
                OBJ_RELEASE(wireup);
                OBJ_RELEASE(buf);
                return;
            }
            /* put it in a byte object for xmission - unload transfers
             * ownership of the bytes to bo.bytes */
            opal_dss.unload(wireup, (void**)&bo.bytes, &numbytes);
            /* pack the byte object - zero-byte objects are fine */
            bo.size = numbytes;
            boptr = &bo;
            if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &boptr, 1, OPAL_BYTE_OBJECT))) {
                ORTE_ERROR_LOG(rc);
                OBJ_RELEASE(wireup);
                OBJ_RELEASE(buf);
                return;
            }
            /* release the data since it has now been copied into our buffer */
            if (NULL != bo.bytes) {
                free(bo.bytes);
            }
            OBJ_RELEASE(wireup);
        } else {
            /* no wiring info will follow */
            flag = 0;
            opal_dss.pack(buf, &flag, 1, OPAL_INT8);
        }

        /* goes to all daemons - wildcard vpid within our jobid */
        sig = OBJ_NEW(orte_grpcomm_signature_t);
        sig->signature = (orte_process_name_t*)malloc(sizeof(orte_process_name_t));
        sig->signature[0].jobid = ORTE_PROC_MY_NAME->jobid;
        sig->signature[0].vpid = ORTE_VPID_WILDCARD;
        /* NOTE(review): sig is only released on the error path - presumably
         * xcast takes ownership on success; confirm against grpcomm docs */
        if (ORTE_SUCCESS != (rc = orte_grpcomm.xcast(sig, ORTE_RML_TAG_DAEMON, buf))) {
            ORTE_ERROR_LOG(rc);
            OBJ_RELEASE(buf);
            OBJ_RELEASE(sig);
            ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
            return;
        }
        OBJ_RELEASE(buf);
        /* notify that the vm is ready */
        fprintf(stdout, "DVM ready\n");
        OBJ_RELEASE(caddy);
        return;
    }

    /* progress the job */
    caddy->jdata->state = ORTE_JOB_STATE_VM_READY;

    /* position any required files */
    if (ORTE_SUCCESS != orte_filem.preposition_files(caddy->jdata, files_ready, caddy->jdata)) {
        ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
    }

    /* cleanup */
    OBJ_RELEASE(caddy);
}
static void track_procs(int fd, short args, void *cbdata) { orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata; orte_process_name_t *proc = &caddy->name; orte_proc_state_t state = caddy->proc_state; orte_job_t *jdata; orte_proc_t *pdata; opal_output_verbose(2, orte_state_base_framework.framework_output, "%s state:staged_hnp:track_procs called for proc %s state %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc), orte_proc_state_to_str(state)); /* get the job object for this proc */ if (NULL == (jdata = orte_get_job_data_object(proc->jobid))) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); OBJ_RELEASE(caddy); return; } pdata = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid); if (ORTE_PROC_STATE_RUNNING == state) { /* update the proc state */ pdata->state = state; jdata->num_launched++; if (jdata->num_launched == jdata->num_procs) { ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_RUNNING); } } /* if this is a registration, check to see if it came from * inside MPI_Init - if it did, that is not acceptable */ if (ORTE_PROC_STATE_REGISTERED == state) { if (ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_AS_MPI) && !ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_GANG_LAUNCHED)) { /* we can't support this - issue an error and abort */ orte_show_help("help-state-staged-hnp.txt", "mpi-procs-not-supported", true); ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_SILENT_ABORT); } /* update the proc state */ pdata->state = state; jdata->num_reported++; if (jdata->num_reported == jdata->num_procs) { ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_REGISTERED); } OBJ_RELEASE(caddy); return; } if (ORTE_PROC_STATE_IOF_COMPLETE == state) { /* update the proc state */ pdata->state = state; /* Release only the stdin IOF file descriptor for this child, if one * was defined. 
File descriptors for the other IOF channels - stdout, * stderr, and stddiag - were released when their associated pipes * were cleared and closed due to termination of the process */ if (NULL != orte_iof.close) { orte_iof.close(proc, ORTE_IOF_STDIN); } ORTE_FLAG_SET(pdata, ORTE_PROC_FLAG_IOF_COMPLETE); if (ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_WAITPID)) { goto terminated; } OBJ_RELEASE(caddy); return; } if (ORTE_PROC_STATE_WAITPID_FIRED == state) { /* update the proc state */ pdata->state = state; ORTE_FLAG_SET(pdata, ORTE_PROC_FLAG_WAITPID); if (ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_IOF_COMPLETE)) { goto terminated; } OBJ_RELEASE(caddy); return; } /* if the proc terminated, see if any other procs are * waiting to run. We assume that the app_contexts are * in priority order, with the highest priority being * at position 0 in the app_context array for this job */ if (ORTE_PROC_STATE_TERMINATED == state) { terminated: /* update the proc state */ ORTE_FLAG_UNSET(pdata, ORTE_PROC_FLAG_ALIVE); pdata->state = ORTE_PROC_STATE_TERMINATED; if (ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_LOCAL)) { /* Clean up the session directory as if we were the process * itself. This covers the case where the process died abnormally * and didn't cleanup its own session directory. */ orte_session_dir_finalize(proc); } /* return the allocated slot for reuse */ cleanup_node(pdata); /* track job status */ jdata->num_terminated++; if (jdata->num_terminated == jdata->num_procs) { /* no other procs are waiting, so end this job */ ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_TERMINATED); } else if (jdata->num_mapped < jdata->num_procs) { /* schedule the job for re-mapping so that procs * waiting for resources can execute */ ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_MAP); } /* otherwise, do nothing until more procs terminate */ OBJ_RELEASE(caddy); return; } }
/* * Function for selecting one component from all those that are * available. */ void orte_ras_base_allocate(int fd, short args, void *cbdata) { int rc; orte_job_t *jdata; opal_list_t nodes; orte_node_t *node; orte_std_cntr_t i; orte_app_context_t *app; orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata; OPAL_OUTPUT_VERBOSE((5, orte_ras_base_framework.framework_output, "%s ras:base:allocate", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); /* convenience */ jdata = caddy->jdata; /* if we already did this, don't do it again - the pool of * global resources is set. */ if (orte_ras_base.allocation_read) { OPAL_OUTPUT_VERBOSE((5, orte_ras_base_framework.framework_output, "%s ras:base:allocate allocation already read", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); goto next_state; } orte_ras_base.allocation_read = true; /* Otherwise, we have to create * the initial set of resources that will delineate all * further operations serviced by this HNP. This list will * contain ALL nodes that can be used by any subsequent job. * * In other words, if a node isn't found in this step, then * no job launched by this HNP will be able to utilize it. */ /* construct a list to hold the results */ OBJ_CONSTRUCT(&nodes, opal_list_t); /* if a component was selected, then we know we are in a managed * environment. 
- the active module will return a list of what it found */ if (NULL != orte_ras_base.active_module) { /* read the allocation */ if (ORTE_SUCCESS != (rc = orte_ras_base.active_module->allocate(jdata, &nodes))) { if (ORTE_ERR_ALLOCATION_PENDING == rc) { /* an allocation request is underway, so just do nothing */ OBJ_DESTRUCT(&nodes); OBJ_RELEASE(caddy); return; } if (ORTE_ERR_SYSTEM_WILL_BOOTSTRAP == rc) { /* this module indicates that nodes will be discovered * on a bootstrap basis, so all we do here is add our * own node to the list */ goto addlocal; } if (ORTE_ERR_TAKE_NEXT_OPTION == rc) { /* we have an active module, but it is unable to * allocate anything for this job - this indicates * that it isn't a fatal error, but could be if * an allocation is required */ if (orte_allocation_required) { /* an allocation is required, so this is fatal */ OBJ_DESTRUCT(&nodes); orte_show_help("help-ras-base.txt", "ras-base:no-allocation", true); ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); OBJ_RELEASE(caddy); return; } else { /* an allocation is not required, so we can just * run on the local node - go add it */ goto addlocal; } } ORTE_ERROR_LOG(rc); OBJ_DESTRUCT(&nodes); ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); OBJ_RELEASE(caddy); return; } } /* If something came back, save it and we are done */ if (!opal_list_is_empty(&nodes)) { /* store the results in the global resource pool - this removes the * list items */ if (ORTE_SUCCESS != (rc = orte_ras_base_node_insert(&nodes, jdata))) { ORTE_ERROR_LOG(rc); OBJ_DESTRUCT(&nodes); ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); OBJ_RELEASE(caddy); return; } OBJ_DESTRUCT(&nodes); /* default to no-oversubscribe-allowed for managed systems */ if (!(ORTE_MAPPING_SUBSCRIBE_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping))) { ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_NO_OVERSUBSCRIBE); } /* flag that the allocation is managed */ orte_managed_allocation = true; goto DISPLAY; } else 
if (orte_allocation_required) { /* if nothing was found, and an allocation is * required, then error out */ OBJ_DESTRUCT(&nodes); orte_show_help("help-ras-base.txt", "ras-base:no-allocation", true); ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); OBJ_RELEASE(caddy); return; } OPAL_OUTPUT_VERBOSE((5, orte_ras_base_framework.framework_output, "%s ras:base:allocate nothing found in module - proceeding to hostfile", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); /* nothing was found, or no active module was alive. Our next * option is to look for a hostfile and assign our global * pool from there. * * Individual hostfile names, if given, are included * in the app_contexts for this job. We therefore need to * retrieve the app_contexts for the job, and then cycle * through them to see if anything is there. The parser will * add the nodes found in each hostfile to our list - i.e., * the resulting list contains the UNION of all nodes specified * in hostfiles from across all app_contexts * * We then continue to add any hosts provided by dash-host and * the default hostfile, if we have it. 
We will then filter out * all the non-desired hosts (i.e., those not specified by * -host and/or -hostfile) when we start the mapping process * * Note that any relative node syntax found in the hostfiles will * generate an error in this scenario, so only non-relative syntax * can be present */ if (NULL != orte_default_hostfile) { OPAL_OUTPUT_VERBOSE((5, orte_ras_base_framework.framework_output, "%s ras:base:allocate parsing default hostfile %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), orte_default_hostfile)); /* a default hostfile was provided - parse it */ if (ORTE_SUCCESS != (rc = orte_util_add_hostfile_nodes(&nodes, orte_default_hostfile))) { ORTE_ERROR_LOG(rc); OBJ_DESTRUCT(&nodes); ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); OBJ_RELEASE(caddy); return; } } for (i=0; i < jdata->apps->size; i++) { if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) { continue; } if (NULL != app->hostfile) { OPAL_OUTPUT_VERBOSE((5, orte_ras_base_framework.framework_output, "%s ras:base:allocate adding hostfile %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), app->hostfile)); /* hostfile was specified - parse it and add it to the list */ if (ORTE_SUCCESS != (rc = orte_util_add_hostfile_nodes(&nodes, app->hostfile))) { ORTE_ERROR_LOG(rc); OBJ_DESTRUCT(&nodes); /* set an error event */ ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); OBJ_RELEASE(caddy); return; } } else if (!orte_soft_locations && NULL != app->dash_host) { /* if we are using soft locations, then any dash-host would * just include desired nodes and not required. 
We don't want * to pick them up here as this would mean the request was * always satisfied - instead, we want to allow the request * to fail later on and use whatever nodes are actually * available */ OPAL_OUTPUT_VERBOSE((5, orte_ras_base_framework.framework_output, "%s ras:base:allocate adding dash_hosts", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); if (ORTE_SUCCESS != (rc = orte_util_add_dash_host_nodes(&nodes, app->dash_host))) { ORTE_ERROR_LOG(rc); OBJ_DESTRUCT(&nodes); ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); OBJ_RELEASE(caddy); return; } } } /* if something was found in the hostfile(s), we use that as our global * pool - set it and we are done */ if (!opal_list_is_empty(&nodes)) { /* store the results in the global resource pool - this removes the * list items */ if (ORTE_SUCCESS != (rc = orte_ras_base_node_insert(&nodes, jdata))) { ORTE_ERROR_LOG(rc); ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); OBJ_RELEASE(caddy); return; } /* cleanup */ OBJ_DESTRUCT(&nodes); goto DISPLAY; } OPAL_OUTPUT_VERBOSE((5, orte_ras_base_framework.framework_output, "%s ras:base:allocate nothing found in hostfiles - checking for rankfile", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); /* Our next option is to look for a rankfile - if one was provided, we * will use its nodes to create a default allocation pool */ if (NULL != orte_rankfile) { /* check the rankfile for node information */ if (ORTE_SUCCESS != (rc = orte_util_add_hostfile_nodes(&nodes, orte_rankfile))) { ORTE_ERROR_LOG(rc); OBJ_DESTRUCT(&nodes); ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); OBJ_RELEASE(caddy); return ; } } /* if something was found in rankfile, we use that as our global * pool - set it and we are done */ if (!opal_list_is_empty(&nodes)) { /* store the results in the global resource pool - this removes the * list items */ if (ORTE_SUCCESS != (rc = orte_ras_base_node_insert(&nodes, jdata))) { ORTE_ERROR_LOG(rc); ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); OBJ_RELEASE(caddy); return; } 
/* rankfile is considered equivalent to an RM allocation */ if (!(ORTE_MAPPING_SUBSCRIBE_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping))) { ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_NO_OVERSUBSCRIBE); } /* cleanup */ OBJ_DESTRUCT(&nodes); goto DISPLAY; } OPAL_OUTPUT_VERBOSE((5, orte_ras_base_framework.framework_output, "%s ras:base:allocate nothing found in rankfile - inserting current node", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); addlocal: /* if nothing was found by any of the above methods, then we have no * earthly idea what to do - so just add the local host */ node = OBJ_NEW(orte_node_t); if (NULL == node) { ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); OBJ_DESTRUCT(&nodes); ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); OBJ_RELEASE(caddy); return; } /* use the same name we got in orte_process_info so we avoid confusion in * the session directories */ node->name = strdup(orte_process_info.nodename); node->state = ORTE_NODE_STATE_UP; node->slots_inuse = 0; node->slots_max = 0; node->slots = 1; opal_list_append(&nodes, &node->super); /* store the results in the global resource pool - this removes the * list items */ if (ORTE_SUCCESS != (rc = orte_ras_base_node_insert(&nodes, jdata))) { ORTE_ERROR_LOG(rc); OBJ_DESTRUCT(&nodes); ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); OBJ_RELEASE(caddy); return; } OBJ_DESTRUCT(&nodes); DISPLAY: /* shall we display the results? */ if (4 < opal_output_get_verbosity(orte_ras_base_framework.framework_output)) { orte_ras_base_display_alloc(); } next_state: /* are we to report this event? 
*/ if (orte_report_events) { if (ORTE_SUCCESS != (rc = orte_util_comm_report_event(ORTE_COMM_EVENT_ALLOCATE))) { ORTE_ERROR_LOG(rc); ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); OBJ_RELEASE(caddy); } } /* set total slots alloc */ jdata->total_slots_alloc = orte_ras_base.total_slots_alloc; /* set the job state to the next position */ ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_ALLOCATION_COMPLETE); /* cleanup */ OBJ_RELEASE(caddy); }
/*
 * HNP error-manager callback for job-level error states.
 *
 * Handles three classes of failure: jobs that never launched (abort the
 * job, and if it was a dynamic spawn, notify the originator of the
 * failure); jobs that failed to start/launch (decode and report the
 * aborted proc's exit status); and an aborted daemon job with missing
 * reports (likely a daemon that never phoned home).  All paths end by
 * forcing exit and flagging abnormal termination, except the
 * never-launched path which terminates the job directly.
 */
static void job_errors(int fd, short args, void *cbdata)
{
    orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
    orte_job_t *jdata;
    orte_job_state_t jobstate;
    orte_exit_code_t sts;
    orte_proc_t *aborted_proc;
    opal_buffer_t *answer;
    int32_t rc, ret;
    int room, *rmptr;

    /*
     * if orte is trying to shutdown, just let it
     */
    if (orte_finalizing) {
        return;
    }

    /* if the jdata is NULL, then we abort as this
     * is reporting an unrecoverable error
     */
    if (NULL == caddy->jdata) {
        ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_FORCED_EXIT);
        OBJ_RELEASE(caddy);
        return;
    }

    /* update the state */
    jdata = caddy->jdata;
    jobstate = caddy->job_state;
    jdata->state = jobstate;

    OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_framework.framework_output,
                         "%s errmgr:default_hnp: job %s reported state %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_JOBID_PRINT(jdata->jobid),
                         orte_job_state_to_str(jobstate)));

    if (ORTE_JOB_STATE_NEVER_LAUNCHED == jobstate ||
        ORTE_JOB_STATE_ALLOC_FAILED == jobstate ||
        ORTE_JOB_STATE_MAP_FAILED == jobstate ||
        ORTE_JOB_STATE_CANNOT_LAUNCH == jobstate) {
        if (1 == ORTE_LOCAL_JOBID(jdata->jobid)) {
            /* this is the primary job */
            orte_never_launched = true;
        }
        /* disable routing as we may not have performed the daemon
         * wireup - e.g., in a managed environment, all the daemons
         * "phone home", but don't actually wireup into the routed
         * network until they receive the launch message
         */
        orte_routing_is_enabled = false;
        jdata->num_terminated = jdata->num_procs;
        ORTE_ACTIVATE_JOB_STATE(caddy->jdata, ORTE_JOB_STATE_TERMINATED);
        /* if it was a dynamic spawn, then we better tell them this didn't work */
        if (ORTE_JOBID_INVALID != jdata->originator.jobid) {
            /* reply payload: failure status (the job state), the jobid,
             * and - if known - the originator's room number */
            rc = jobstate;
            answer = OBJ_NEW(opal_buffer_t);
            if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &rc, 1, OPAL_INT32))) {
                ORTE_ERROR_LOG(ret);
                ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
                OBJ_RELEASE(caddy);
                return;
            }
            if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &jdata->jobid, 1, ORTE_JOBID))) {
                ORTE_ERROR_LOG(ret);
                ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
                OBJ_RELEASE(caddy);
                return;
            }
            /* pack the room number */
            rmptr = &room;
            if (orte_get_attribute(&jdata->attributes, ORTE_JOB_ROOM_NUM, (void**)&rmptr, OPAL_INT)) {
                if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &room, 1, OPAL_INT))) {
                    ORTE_ERROR_LOG(ret);
                    ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
                    OBJ_RELEASE(caddy);
                    return;
                }
            }
            OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
                                 "%s errmgr:hnp sending dyn error release of job %s to %s",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 ORTE_JOBID_PRINT(jdata->jobid),
                                 ORTE_NAME_PRINT(&jdata->originator)));
            if (0 > (ret = orte_rml.send_buffer_nb(&jdata->originator, answer,
                                                   ORTE_RML_TAG_LAUNCH_RESP,
                                                   orte_rml_send_callback, NULL))) {
                ORTE_ERROR_LOG(ret);
                OBJ_RELEASE(answer);
                ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
            }
        }
        OBJ_RELEASE(caddy);
        return;
    }

    if (ORTE_JOB_STATE_FAILED_TO_START == jobstate ||
        ORTE_JOB_STATE_FAILED_TO_LAUNCH == jobstate) {
        /* the job object for this job will have been NULL'd
         * in the array if the job was solely local. If it isn't
         * NULL, then we need to tell everyone else to die
         */
        aborted_proc = NULL;
        if (orte_get_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, (void**)&aborted_proc, OPAL_PTR)) {
            /* decode the waitpid-style status of the aborted proc and
             * report how it died (signal, core dump, or exit code) */
            sts = aborted_proc->exit_code;
            if (ORTE_PROC_MY_NAME->jobid == jdata->jobid) {
                if (WIFSIGNALED(sts)) { /* died on signal */
#ifdef WCOREDUMP
                    if (WCOREDUMP(sts)) {
                        orte_show_help("help-plm-base.txt", "daemon-died-signal-core", true,
                                       WTERMSIG(sts));
                        sts = WTERMSIG(sts);
                    } else {
                        orte_show_help("help-plm-base.txt", "daemon-died-signal", true,
                                       WTERMSIG(sts));
                        sts = WTERMSIG(sts);
                    }
#else
                    orte_show_help("help-plm-base.txt", "daemon-died-signal", true,
                                   WTERMSIG(sts));
                    sts = WTERMSIG(sts);
#endif /* WCOREDUMP */
                } else {
                    orte_show_help("help-plm-base.txt", "daemon-died-no-signal", true,
                                   WEXITSTATUS(sts));
                    sts = WEXITSTATUS(sts);
                }
            }
        }
        /* if this is the daemon job, then we need to ensure we
         * output an error message indicating we couldn't launch the
         * daemons */
        if (jdata->jobid == ORTE_PROC_MY_NAME->jobid) {
            orte_show_help("help-errmgr-base.txt", "failed-daemon-launch", true);
        }
    }

    /* if the daemon job aborted and we haven't heard from everyone yet,
     * then this could well have been caused by a daemon not finding
     * a way back to us. In this case, output a message indicating a daemon
     * died without reporting. Otherwise, say nothing as we
     * likely already output an error message */
    if (ORTE_JOB_STATE_ABORTED == jobstate &&
        jdata->jobid == ORTE_PROC_MY_NAME->jobid &&
        jdata->num_procs != jdata->num_reported) {
        orte_show_help("help-errmgr-base.txt", "failed-daemon", true);
    }

    /* abort the job */
    ORTE_ACTIVATE_JOB_STATE(caddy->jdata, ORTE_JOB_STATE_FORCED_EXIT);
    /* set the global abnormal exit flag  */
    orte_abnormal_term_ordered = true;
    OBJ_RELEASE(caddy);
}
static void launch_daemons(int fd, short args, void *cbdata) { orte_job_map_t *map; size_t num_nodes; char *param; char **argv = NULL; int argc; int rc; char** env = NULL; char **nodelist_argv; char *nodelist; int nodelist_argc; char *vpid_string; int i; char *cur_prefix; int proc_vpid_index = 0; bool failed_launch = true; orte_app_context_t *app; orte_node_t *node; orte_std_cntr_t nnode; orte_job_t *daemons; orte_state_caddy_t *state = (orte_state_caddy_t*)cbdata; orte_job_t *jdata = state->jdata; /* start by setting up the virtual machine */ daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid); if (ORTE_SUCCESS != (rc = orte_plm_base_setup_virtual_machine(jdata))) { ORTE_ERROR_LOG(rc); goto cleanup; } /* if we don't want to launch, then don't attempt to * launch the daemons - the user really wants to just * look at the proposed process map */ if (orte_do_not_launch) { /* set the state to indicate the daemons reported - this * will trigger the daemons_reported event and cause the * job to move to the following step */ state->jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED; ORTE_ACTIVATE_JOB_STATE(state->jdata, ORTE_JOB_STATE_DAEMONS_REPORTED); OBJ_RELEASE(state); return; } OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output, "%s plm:lsf: launching vm", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); /* Get the map for this job */ if (NULL == (map = daemons->map)) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); rc = ORTE_ERR_NOT_FOUND; goto cleanup; } num_nodes = map->num_new_daemons; if (0 == num_nodes) { /* set the state to indicate the daemons reported - this * will trigger the daemons_reported event and cause the * job to move to the following step */ OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output, "%s plm:lsf: no new daemons to launch", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); state->jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED; ORTE_ACTIVATE_JOB_STATE(state->jdata, ORTE_JOB_STATE_DAEMONS_REPORTED); OBJ_RELEASE(state); return; } /* create 
nodelist */ nodelist_argv = NULL; nodelist_argc = 0; for (nnode=0; nnode < map->nodes->size; nnode++) { if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, nnode))) { continue; } /* if the daemon already exists on this node, then * don't include it */ if (ORTE_FLAG_TEST(node, ORTE_NODE_FLAG_DAEMON_LAUNCHED)) { continue; } /* otherwise, add it to the list of nodes upon which * we need to launch a daemon */ opal_argv_append(&nodelist_argc, &nodelist_argv, node->name); } nodelist = opal_argv_join(nodelist_argv, ','); /* * start building argv array */ argv = NULL; argc = 0; /* * ORTED OPTIONS */ /* add the daemon command (as specified by user) */ orte_plm_base_setup_orted_cmd(&argc, &argv); /* Add basic orted command line options */ orte_plm_base_orted_append_basic_args(&argc, &argv, "lsf", &proc_vpid_index, nodelist); free(nodelist); /* tell the new daemons the base of the name list so they can compute * their own name on the other end */ rc = orte_util_convert_vpid_to_string(&vpid_string, map->daemon_vpid_start); if (ORTE_SUCCESS != rc) { opal_output(0, "plm_lsf: unable to get daemon vpid as string"); goto cleanup; } free(argv[proc_vpid_index]); argv[proc_vpid_index] = strdup(vpid_string); free(vpid_string); if (0 < opal_output_get_verbosity(orte_plm_base_framework.framework_output)) { param = opal_argv_join(argv, ' '); if (NULL != param) { opal_output(0, "plm:lsf: final top-level argv:"); opal_output(0, "plm:lsf: %s", param); free(param); } } /* Copy the prefix-directory specified in the corresponding app_context. 
If there are multiple, different prefix's in the app context, complain (i.e., only allow one --prefix option for the entire lsf run -- we don't support different --prefix'es for different nodes in the LSF plm) */ cur_prefix = NULL; for (i=0; i < jdata->apps->size; i++) { char *app_prefix_dir; if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) { continue; } orte_get_attribute(&app->attributes, ORTE_APP_PREFIX_DIR, (void**)&app_prefix_dir, OPAL_STRING); /* Check for already set cur_prefix_dir -- if different, complain */ if (NULL != app_prefix_dir) { if (NULL != cur_prefix && 0 != strcmp (cur_prefix, app_prefix_dir)) { orte_show_help("help-plm-lsf.txt", "multiple-prefixes", true, cur_prefix, app_prefix_dir); rc = ORTE_ERR_FAILED_TO_START; goto cleanup; } /* If not yet set, copy it; iff set, then it's the same anyway */ if (NULL == cur_prefix) { cur_prefix = strdup(app_prefix_dir); OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output, "%s plm:lsf: Set prefix:%s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), cur_prefix)); } free(app_prefix_dir); } } /* setup environment */ env = opal_argv_copy(orte_launch_environ); /* lsb_launch tampers with SIGCHLD. * After the call to lsb_launch, the signal handler for SIGCHLD is NULL. * So, we disable the SIGCHLD handler of libevent for the duration of * the call to lsb_launch */ orte_wait_disable(); /* exec the daemon(s). Do NOT wait for lsb_launch to complete as * it only completes when the processes it starts - in this case, * the orteds - complete. We need to go ahead and return so * orterun can do the rest of its stuff. 
Instead, we'll catch any * failures and deal with them elsewhere */ if (lsb_launch(nodelist_argv, argv, LSF_DJOB_REPLACE_ENV | LSF_DJOB_NOWAIT, env) < 0) { ORTE_ERROR_LOG(ORTE_ERR_FAILED_TO_START); opal_output(0, "lsb_launch failed: %d", rc); rc = ORTE_ERR_FAILED_TO_START; orte_wait_enable(); /* re-enable our SIGCHLD handler */ goto cleanup; } orte_wait_enable(); /* re-enable our SIGCHLD handler */ /* indicate that the daemons for this job were launched */ state->jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED; daemons->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED; /* flag that launch was successful, so far as we currently know */ failed_launch = false; cleanup: if (NULL != argv) { opal_argv_free(argv); } if (NULL != env) { opal_argv_free(env); } /* cleanup the caddy */ OBJ_RELEASE(state); /* check for failed launch - if so, force terminate */ if (failed_launch) { ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); } }
void mca_oob_tcp_recv_handler(int sd, short flags, void *cbdata) { mca_oob_tcp_peer_t* peer = (mca_oob_tcp_peer_t*)cbdata; int rc; orte_process_name_t hop; mca_oob_tcp_peer_t *relay; uint64_t ui64; if (orte_abnormal_term_ordered) { return; } opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output, "%s:tcp:recv:handler called for peer %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&peer->name)); switch (peer->state) { case MCA_OOB_TCP_CONNECT_ACK: if (ORTE_SUCCESS == (rc = mca_oob_tcp_peer_recv_connect_ack(peer, peer->sd, NULL))) { opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output, "%s:tcp:recv:handler starting send/recv events", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); /* we connected! Start the send/recv events */ if (!peer->recv_ev_active) { opal_event_add(&peer->recv_event, 0); peer->recv_ev_active = true; } if (peer->timer_ev_active) { opal_event_del(&peer->timer_event); peer->timer_ev_active = false; } /* if there is a message waiting to be sent, queue it */ if (NULL == peer->send_msg) { peer->send_msg = (mca_oob_tcp_send_t*)opal_list_remove_first(&peer->send_queue); } if (NULL != peer->send_msg && !peer->send_ev_active) { opal_event_add(&peer->send_event, 0); peer->send_ev_active = true; } /* update our state */ peer->state = MCA_OOB_TCP_CONNECTED; } else { opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output, "%s UNABLE TO COMPLETE CONNECT ACK WITH %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&peer->name)); opal_event_del(&peer->recv_event); ORTE_FORCED_TERMINATE(1); return; } break; case MCA_OOB_TCP_CONNECTED: opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output, "%s:tcp:recv:handler CONNECTED", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); /* allocate a new message and setup for recv */ if (NULL == peer->recv_msg) { opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output, "%s:tcp:recv:handler 
allocate new recv msg", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); peer->recv_msg = OBJ_NEW(mca_oob_tcp_recv_t); if (NULL == peer->recv_msg) { opal_output(0, "%s-%s mca_oob_tcp_peer_recv_handler: unable to allocate recv message\n", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&(peer->name))); return; } /* start by reading the header */ peer->recv_msg->rdptr = (char*)&peer->recv_msg->hdr; peer->recv_msg->rdbytes = sizeof(mca_oob_tcp_hdr_t); } /* if the header hasn't been completely read, read it */ if (!peer->recv_msg->hdr_recvd) { opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output, "%s:tcp:recv:handler read hdr", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); if (ORTE_SUCCESS == (rc = read_bytes(peer))) { OPAL_TIMING_EVENT((&tm,"Header received from %s", ORTE_NAME_PRINT(&peer->name))); /* completed reading the header */ peer->recv_msg->hdr_recvd = true; /* convert the header */ MCA_OOB_TCP_HDR_NTOH(&peer->recv_msg->hdr); /* if this is a zero-byte message, then we are done */ if (0 == peer->recv_msg->hdr.nbytes) { opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output, "%s RECVD ZERO-BYTE MESSAGE FROM %s for tag %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&peer->name), peer->recv_msg->hdr.tag); peer->recv_msg->data = NULL; // make sure } else { opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output, "%s:tcp:recv:handler allocate data region of size %lu", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (unsigned long)peer->recv_msg->hdr.nbytes); /* allocate the data region */ peer->recv_msg->data = (char*)malloc(peer->recv_msg->hdr.nbytes); /* point to it */ peer->recv_msg->rdptr = peer->recv_msg->data; peer->recv_msg->rdbytes = peer->recv_msg->hdr.nbytes; } /* fall thru and attempt to read the data */ } else if (ORTE_ERR_RESOURCE_BUSY == rc || ORTE_ERR_WOULD_BLOCK == rc) { /* exit this event and let the event lib progress */ return; } else { /* close the connection */ 
opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output, "%s:tcp:recv:handler error reading bytes - closing connection", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); mca_oob_tcp_peer_close(peer); return; } } if (peer->recv_msg->hdr_recvd) { /* continue to read the data block - we start from * wherever we left off, which could be at the * beginning or somewhere in the message */ if (ORTE_SUCCESS == (rc = read_bytes(peer))) { OPAL_TIMING_EVENT((&tm,"Msg received from %s", ORTE_NAME_PRINT(&peer->name))); /* we recvd all of the message */ opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output, "%s RECVD COMPLETE MESSAGE FROM %s (ORIGIN %s) OF %d BYTES FOR DEST %s TAG %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&peer->name), ORTE_NAME_PRINT(&peer->recv_msg->hdr.origin), (int)peer->recv_msg->hdr.nbytes, ORTE_NAME_PRINT(&peer->recv_msg->hdr.dst), peer->recv_msg->hdr.tag); /* am I the intended recipient (header was already converted back to host order)? 
*/ if (peer->recv_msg->hdr.dst.jobid == ORTE_PROC_MY_NAME->jobid && peer->recv_msg->hdr.dst.vpid == ORTE_PROC_MY_NAME->vpid) { /* yes - post it to the RML for delivery */ opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output, "%s DELIVERING TO RML", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); ORTE_RML_POST_MESSAGE(&peer->recv_msg->hdr.origin, peer->recv_msg->hdr.tag, peer->recv_msg->data, peer->recv_msg->hdr.nbytes); OBJ_RELEASE(peer->recv_msg); } else { /* no - find the next hop in the route */ hop = orte_routed.get_route(&peer->recv_msg->hdr.dst); if (hop.jobid == ORTE_JOBID_INVALID || hop.vpid == ORTE_VPID_INVALID) { /* no hop known - post the error to the component * and let the OOB see if there is another way * to get there from here */ opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output, "%s NO ROUTE TO %s FROM HERE", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&peer->name)); /* let the component know about the problem */ ORTE_ACTIVATE_TCP_MSG_ERROR(NULL, peer->recv_msg, &hop, mca_oob_tcp_component_no_route); /* cleanup */ OBJ_RELEASE(peer->recv_msg); return; } else { /* does we know how to reach the next hop? 
*/ memcpy(&ui64, (char*)&hop, sizeof(uint64_t)); if (OPAL_SUCCESS != opal_hash_table_get_value_uint64(&mca_oob_tcp_module.peers, ui64, (void**)&relay)) { opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output, "%s ADDRESS OF NEXT HOP %s TO %s IS UNKNOWN", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&hop), ORTE_NAME_PRINT(&peer->recv_msg->hdr.dst)); /* let the component know about the problem */ ORTE_ACTIVATE_TCP_MSG_ERROR(NULL, peer->recv_msg, &hop, mca_oob_tcp_component_hop_unknown); /* cleanup */ OBJ_RELEASE(peer->recv_msg); return; } opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output, "%s ROUTING TO %s FROM HERE", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&relay->name)); /* if this came from a different job family, then ensure * we know how to return */ if (ORTE_JOB_FAMILY(peer->recv_msg->hdr.origin.jobid) != ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid)) { orte_routed.update_route(&(peer->recv_msg->hdr.origin), &peer->name); } /* post the message for retransmission */ MCA_OOB_TCP_QUEUE_RELAY(peer->recv_msg, relay); OBJ_RELEASE(peer->recv_msg); } } peer->recv_msg = NULL; return; } else if (ORTE_ERR_RESOURCE_BUSY == rc || ORTE_ERR_WOULD_BLOCK == rc) { /* exit this event and let the event lib progress */ return; } else { // report the error opal_output(0, "%s-%s mca_oob_tcp_peer_recv_handler: unable to recv message", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&(peer->name))); /* turn off the recv event */ opal_event_del(&peer->recv_event); ORTE_FORCED_TERMINATE(1); return; } } break; default: opal_output(0, "%s-%s mca_oob_tcp_peer_recv_handler: invalid socket state(%d)", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&(peer->name)), peer->state); // mca_oob_tcp_peer_close(peer); break; } }
/* * A file descriptor is available/ready for send. Check the state * of the socket and take the appropriate action. */ void mca_oob_tcp_send_handler(int sd, short flags, void *cbdata) { mca_oob_tcp_peer_t* peer = (mca_oob_tcp_peer_t*)cbdata; mca_oob_tcp_send_t* msg = peer->send_msg; int rc; opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output, "%s tcp:send_handler called to send to peer %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&peer->name)); switch (peer->state) { case MCA_OOB_TCP_CONNECTING: case MCA_OOB_TCP_CLOSED: opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output, "%s tcp:send_handler %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), mca_oob_tcp_state_print(peer->state)); mca_oob_tcp_peer_complete_connect(peer); /* de-activate the send event until the connection * handshake completes */ if (peer->send_ev_active) { opal_event_del(&peer->send_event); peer->send_ev_active = false; } break; case MCA_OOB_TCP_CONNECTED: opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output, "%s tcp:send_handler SENDING TO %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (NULL == peer->send_msg) ? 
"NULL" : ORTE_NAME_PRINT(&peer->name)); if (NULL != msg) { /* if the header hasn't been completely sent, send it */ if (!msg->hdr_sent) { OPAL_TIMING_EVENT((&tm,"Send header to %s", ORTE_NAME_PRINT(&peer->name))); if (ORTE_SUCCESS == (rc = send_bytes(peer))) { /* header is completely sent */ msg->hdr_sent = true; /* setup to send the data */ if (NULL != msg->data) { /* relay msg - send that data */ msg->sdptr = msg->data; msg->sdbytes = (int)ntohl(msg->hdr.nbytes); } else if (NULL == msg->msg) { /* this was a zero-byte relay - nothing more to do */ OBJ_RELEASE(msg); peer->send_msg = NULL; goto next; } else if (NULL != msg->msg->buffer) { /* send the buffer data as a single block */ msg->sdptr = msg->msg->buffer->base_ptr; msg->sdbytes = msg->msg->buffer->bytes_used; } else if (NULL != msg->msg->iov) { /* start with the first iovec */ msg->sdptr = msg->msg->iov[0].iov_base; msg->sdbytes = msg->msg->iov[0].iov_len; msg->iovnum = 0; } else { /* just send the data */ msg->sdptr = msg->msg->data; msg->sdbytes = msg->msg->count; } /* fall thru and let the send progress */ } else if (ORTE_ERR_RESOURCE_BUSY == rc || ORTE_ERR_WOULD_BLOCK == rc) { /* exit this event and let the event lib progress */ return; } else { // report the error opal_output(0, "%s-%s mca_oob_tcp_peer_send_handler: unable to send header", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&(peer->name))); opal_event_del(&peer->send_event); msg->msg->status = rc; ORTE_RML_SEND_COMPLETE(msg->msg); OBJ_RELEASE(msg); peer->send_msg = NULL; goto next; } } /* progress the data transmission */ if (msg->hdr_sent) { OPAL_TIMING_EVENT((&tm,"Send msg to %s", ORTE_NAME_PRINT(&peer->name))); if (ORTE_SUCCESS == (rc = send_bytes(peer))) { /* this block is complete */ if (NULL != msg->data || NULL == msg->msg) { /* the relay is complete - release the data */ opal_output_verbose(2, orte_oob_base_framework.framework_output, "%s MESSAGE RELAY COMPLETE TO %s OF %d BYTES ON SOCKET %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), 
ORTE_NAME_PRINT(&(peer->name)), (int)ntohl(msg->hdr.nbytes), peer->sd); OBJ_RELEASE(msg); peer->send_msg = NULL; } else if (NULL != msg->msg->buffer) { /* we are done - notify the RML */ opal_output_verbose(2, orte_oob_base_framework.framework_output, "%s MESSAGE SEND COMPLETE TO %s OF %d BYTES ON SOCKET %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&(peer->name)), (int)ntohl(msg->hdr.nbytes), peer->sd); msg->msg->status = ORTE_SUCCESS; ORTE_RML_SEND_COMPLETE(msg->msg); OBJ_RELEASE(msg); peer->send_msg = NULL; } else if (NULL != msg->msg->data) { /* this was a relay we have now completed - no need to * notify the RML as the local proc didn't initiate * the send */ opal_output_verbose(2, orte_oob_base_framework.framework_output, "%s MESSAGE RELAY COMPLETE TO %s OF %d BYTES ON SOCKET %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&(peer->name)), (int)ntohl(msg->hdr.nbytes), peer->sd); msg->msg->status = ORTE_SUCCESS; OBJ_RELEASE(msg); peer->send_msg = NULL; } else { /* rotate to the next iovec */ msg->iovnum++; if (msg->iovnum < msg->msg->count) { msg->sdptr = msg->msg->iov[msg->iovnum].iov_base; msg->sdbytes = msg->msg->iov[msg->iovnum].iov_len; /* exit this event to give the event lib * a chance to progress any other pending * actions */ return; } else { /* this message is complete - notify the RML */ opal_output_verbose(2, orte_oob_base_framework.framework_output, "%s MESSAGE SEND COMPLETE TO %s OF %d BYTES ON SOCKET %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&(peer->name)), (int)ntohl(msg->hdr.nbytes), peer->sd); msg->msg->status = ORTE_SUCCESS; ORTE_RML_SEND_COMPLETE(msg->msg); OBJ_RELEASE(msg); peer->send_msg = NULL; } } /* fall thru to queue the next message */ } else if (ORTE_ERR_RESOURCE_BUSY == rc || ORTE_ERR_WOULD_BLOCK == rc) { /* exit this event and let the event lib progress */ return; } else { // report the error opal_output(0, "%s-%s mca_oob_tcp_peer_send_handler: unable to send message ON SOCKET %d", 
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&(peer->name)), peer->sd); opal_event_del(&peer->send_event); msg->msg->status = rc; ORTE_RML_SEND_COMPLETE(msg->msg); OBJ_RELEASE(msg); peer->send_msg = NULL; ORTE_FORCED_TERMINATE(1); return; } } next: /* if current message completed - progress any pending sends by * moving the next in the queue into the "on-deck" position. Note * that this doesn't mean we send the message right now - we will * wait for another send_event to fire before doing so. This gives * us a chance to service any pending recvs. */ peer->send_msg = (mca_oob_tcp_send_t*) opal_list_remove_first(&peer->send_queue); } /* if nothing else to do unregister for send event notifications */ if (NULL == peer->send_msg && peer->send_ev_active) { opal_event_del(&peer->send_event); peer->send_ev_active = false; } break; default: opal_output(0, "%s-%s mca_oob_tcp_peer_send_handler: invalid connection state (%d) on socket %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&(peer->name)), peer->state, peer->sd); if (peer->send_ev_active) { opal_event_del(&peer->send_event); peer->send_ev_active = false; } break; } }
/*
 * This function is responsible for:
 * - Constructing the remote absolute path for the specified file/dir
 * - Verifying the existence of the file/dir
 * - Determining if the specified path is a file, a directory, or unknown
 *   if not found
 * The result (absolute path + type code) is packed and sent back to the
 * requester on ORTE_RML_TAG_FILEM_BASE_RESP.
 */
static void filem_base_process_get_remote_path_cmd(orte_process_name_t* sender,
                                                   opal_buffer_t* buffer)
{
    opal_buffer_t *answer;
    orte_std_cntr_t count;
    char *filename = NULL;
    char *tmp_name = NULL;
    char cwd[OPAL_PATH_MAX];
    int file_type = ORTE_FILEM_TYPE_UNKNOWN;
    struct stat file_status;
    int rc;

    /* unpack the requested filename */
    count = 1;
    if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &filename, &count, OPAL_STRING))) {
        ORTE_ERROR_LOG(rc);
        ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
        goto CLEANUP;
    }

    /*
     * Determine the absolute path of the file
     */
    if (filename[0] != '/') { /* if it is not an absolute path already */
        /* fix: check getcwd/asprintf results (previously unchecked) - if
         * either fails, fall back to the raw name so we still answer the
         * sender; the subsequent stat() will simply report UNKNOWN */
        if (NULL == getcwd(cwd, sizeof(cwd)) ||
            0 > asprintf(&tmp_name, "%s/%s", cwd, filename)) {
            tmp_name = strdup(filename);
        }
    } else {
        tmp_name = strdup(filename);
    }

    opal_output_verbose(10, orte_filem_base_framework.framework_output,
                        "filem:base: process_get_remote_path_cmd: %s -> %s: Filename Requested (%s) translated to (%s)",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                        ORTE_NAME_PRINT(sender),
                        filename, tmp_name);

    /*
     * Determine if the file/dir exists at that absolute path
     * Determine if the file/dir is a file or a directory
     * (guard against a NULL tmp_name from an allocation failure above)
     */
    if (NULL == tmp_name || 0 != stat(tmp_name, &file_status)) {
        file_type = ORTE_FILEM_TYPE_UNKNOWN;
    } else {
        /* Is it a directory? */
        if (S_ISDIR(file_status.st_mode)) {
            file_type = ORTE_FILEM_TYPE_DIR;
        } else if (S_ISREG(file_status.st_mode)) {
            file_type = ORTE_FILEM_TYPE_FILE;
        }
    }

    /*
     * Pack up the response
     * Send back the reference type
     * - ORTE_FILEM_TYPE_FILE    = File
     * - ORTE_FILEM_TYPE_DIR     = Directory
     * - ORTE_FILEM_TYPE_UNKNOWN = Could not be determined, or does not exist
     */
    answer = OBJ_NEW(opal_buffer_t);
    if (ORTE_SUCCESS != (rc = opal_dss.pack(answer, &tmp_name, 1, OPAL_STRING))) {
        ORTE_ERROR_LOG(rc);
        ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
        OBJ_RELEASE(answer);
        goto CLEANUP;
    }
    if (ORTE_SUCCESS != (rc = opal_dss.pack(answer, &file_type, 1, OPAL_INT))) {
        ORTE_ERROR_LOG(rc);
        ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
        OBJ_RELEASE(answer);
        goto CLEANUP;
    }
    if (0 > (rc = orte_rml.send_buffer_nb(sender, answer,
                                          ORTE_RML_TAG_FILEM_BASE_RESP,
                                          orte_rml_send_callback, NULL))) {
        ORTE_ERROR_LOG(rc);
        ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
        OBJ_RELEASE(answer);
    }

 CLEANUP:
    /* free(NULL) is a no-op, so no guards needed */
    free(filename);
    filename = NULL;
    free(tmp_name);
    tmp_name = NULL;
}
void mca_oob_usock_recv_handler(int sd, short flags, void *cbdata) { mca_oob_usock_peer_t* peer = (mca_oob_usock_peer_t*)cbdata; int rc; orte_rml_send_t *snd; if (orte_abnormal_term_ordered) { return; } opal_output_verbose(OOB_USOCK_DEBUG_CONNECT, orte_oob_base_framework.framework_output, "%s:usock:recv:handler called for peer %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&peer->name)); switch (peer->state) { case MCA_OOB_USOCK_CONNECT_ACK: if (ORTE_SUCCESS == (rc = mca_oob_usock_peer_recv_connect_ack(peer, peer->sd, NULL))) { opal_output_verbose(OOB_USOCK_DEBUG_CONNECT, orte_oob_base_framework.framework_output, "%s:usock:recv:handler starting send/recv events", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); /* we connected! Start the send/recv events */ if (!peer->recv_ev_active) { opal_event_add(&peer->recv_event, 0); peer->recv_ev_active = true; } if (peer->timer_ev_active) { opal_event_del(&peer->timer_event); peer->timer_ev_active = false; } /* if there is a message waiting to be sent, queue it */ if (NULL == peer->send_msg) { peer->send_msg = (mca_oob_usock_send_t*)opal_list_remove_first(&peer->send_queue); } if (NULL != peer->send_msg && !peer->send_ev_active) { opal_event_add(&peer->send_event, 0); peer->send_ev_active = true; } /* update our state */ peer->state = MCA_OOB_USOCK_CONNECTED; } else { opal_output_verbose(OOB_USOCK_DEBUG_CONNECT, orte_oob_base_framework.framework_output, "%s UNABLE TO COMPLETE CONNECT ACK WITH %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&peer->name)); opal_event_del(&peer->recv_event); peer->recv_ev_active = false; ORTE_FORCED_TERMINATE(1); return; } break; case MCA_OOB_USOCK_CONNECTED: opal_output_verbose(OOB_USOCK_DEBUG_CONNECT, orte_oob_base_framework.framework_output, "%s:usock:recv:handler CONNECTED", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); /* allocate a new message and setup for recv */ if (NULL == peer->recv_msg) { opal_output_verbose(OOB_USOCK_DEBUG_CONNECT, orte_oob_base_framework.framework_output, 
"%s:usock:recv:handler allocate new recv msg", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); peer->recv_msg = OBJ_NEW(mca_oob_usock_recv_t); if (NULL == peer->recv_msg) { opal_output(0, "%s-%s mca_oob_usock_peer_recv_handler: unable to allocate recv message\n", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&(peer->name))); return; } /* start by reading the header */ peer->recv_msg->rdptr = (char*)&peer->recv_msg->hdr; peer->recv_msg->rdbytes = sizeof(mca_oob_usock_hdr_t); } /* if the header hasn't been completely read, read it */ if (!peer->recv_msg->hdr_recvd) { opal_output_verbose(OOB_USOCK_DEBUG_CONNECT, orte_oob_base_framework.framework_output, "%s:usock:recv:handler read hdr", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); if (ORTE_SUCCESS == (rc = read_bytes(peer))) { /* completed reading the header */ peer->recv_msg->hdr_recvd = true; /* if this is a zero-byte message, then we are done */ if (0 == peer->recv_msg->hdr.nbytes) { opal_output_verbose(OOB_USOCK_DEBUG_CONNECT, orte_oob_base_framework.framework_output, "%s RECVD ZERO-BYTE MESSAGE FROM %s for tag %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&peer->name), peer->recv_msg->hdr.tag); peer->recv_msg->data = NULL; // make sure peer->recv_msg->rdptr = NULL; peer->recv_msg->rdbytes = 0; } else { opal_output_verbose(OOB_USOCK_DEBUG_CONNECT, orte_oob_base_framework.framework_output, "%s:usock:recv:handler allocate data region of size %lu", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (unsigned long)peer->recv_msg->hdr.nbytes); /* allocate the data region */ peer->recv_msg->data = (char*)malloc(peer->recv_msg->hdr.nbytes); /* point to it */ peer->recv_msg->rdptr = peer->recv_msg->data; peer->recv_msg->rdbytes = peer->recv_msg->hdr.nbytes; } /* fall thru and attempt to read the data */ } else if (ORTE_ERR_RESOURCE_BUSY == rc || ORTE_ERR_WOULD_BLOCK == rc) { /* exit this event and let the event lib progress */ return; } else { /* close the connection */ opal_output_verbose(OOB_USOCK_DEBUG_CONNECT, 
orte_oob_base_framework.framework_output, "%s:usock:recv:handler error reading bytes - closing connection", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); mca_oob_usock_peer_close(peer); return; } } if (peer->recv_msg->hdr_recvd) { /* continue to read the data block - we start from * wherever we left off, which could be at the * beginning or somewhere in the message */ if (ORTE_SUCCESS == (rc = read_bytes(peer))) { /* we recvd all of the message */ opal_output_verbose(OOB_USOCK_DEBUG_CONNECT, orte_oob_base_framework.framework_output, "%s RECVD COMPLETE MESSAGE FROM %s OF %d BYTES FOR DEST %s TAG %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&peer->recv_msg->hdr.origin), (int)peer->recv_msg->hdr.nbytes, ORTE_NAME_PRINT(&peer->recv_msg->hdr.dst), peer->recv_msg->hdr.tag); /* am I the intended recipient? */ if (peer->recv_msg->hdr.dst.jobid == ORTE_PROC_MY_NAME->jobid && peer->recv_msg->hdr.dst.vpid == ORTE_PROC_MY_NAME->vpid) { /* yes - post it to the RML for delivery */ opal_output_verbose(OOB_USOCK_DEBUG_CONNECT, orte_oob_base_framework.framework_output, "%s DELIVERING TO RML", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); ORTE_RML_POST_MESSAGE(&peer->recv_msg->hdr.origin, peer->recv_msg->hdr.tag, peer->recv_msg->hdr.seq_num, peer->recv_msg->data, peer->recv_msg->hdr.nbytes); OBJ_RELEASE(peer->recv_msg); } else { /* no - we don't route things, so we promote this * back to the OOB and let another transport move * it along. 
If we are a daemon and it is intended * for another of our local procs, it will just come * back to us and be handled then */ snd = OBJ_NEW(orte_rml_send_t); snd->dst = peer->recv_msg->hdr.dst; snd->origin = peer->recv_msg->hdr.origin; snd->tag = peer->recv_msg->hdr.tag; snd->data = peer->recv_msg->data; snd->seq_num = peer->recv_msg->hdr.seq_num; snd->count = peer->recv_msg->hdr.nbytes; snd->cbfunc.iov = NULL; snd->cbdata = NULL; /* activate the OOB send state */ ORTE_OOB_SEND(snd); /* protect the data */ peer->recv_msg->data = NULL; /* cleanup */ OBJ_RELEASE(peer->recv_msg); return; } } else if (ORTE_ERR_RESOURCE_BUSY == rc || ORTE_ERR_WOULD_BLOCK == rc) { /* exit this event and let the event lib progress */ return; } else { // report the error opal_output(0, "%s-%s mca_oob_usock_peer_recv_handler: unable to recv message", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&(peer->name))); /* turn off the recv event */ opal_event_del(&peer->recv_event); peer->recv_ev_active = false; ORTE_FORCED_TERMINATE(1); return; } } break; default: opal_output(0, "%s-%s mca_oob_usock_peer_recv_handler: invalid socket state(%d)", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&(peer->name)), peer->state); // mca_oob_usock_peer_close(peer); break; } }
/* * Function for selecting one component from all those that are * available. */ void orte_rmaps_base_map_job(int fd, short args, void *cbdata) { orte_job_t *jdata; orte_job_map_t *map; int rc; bool did_map; opal_list_item_t *item; orte_rmaps_base_selected_module_t *mod; orte_job_t *parent; orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata; /* convenience */ jdata = caddy->jdata; jdata->state = ORTE_JOB_STATE_MAP; /* NOTE: NO PROXY COMPONENT REQUIRED - REMOTE PROCS ARE NOT * ALLOWED TO CALL RMAPS INDEPENDENTLY. ONLY THE PLM CAN * DO SO, AND ALL PLM COMMANDS ARE RELAYED TO HNP */ opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "mca:rmaps: mapping job %s", ORTE_JOBID_PRINT(jdata->jobid)); /* NOTE: CHECK FOR JDATA->MAP == NULL. IF IT IS, THEN USE * THE VALUES THAT WERE READ BY THE LOCAL MCA PARAMS. THE * PLM PROXY WILL SEND A JOB-OBJECT THAT WILL INCLUDE ANY * MAPPING DIRECTIVES - OTHERWISE, THAT OBJECT WILL HAVE A * NULL MAP FIELD * LONE EXCEPTION - WE COPY DISPLAY MAP ACROSS IF THEY * DIDN'T SET IT */ if (NULL == jdata->map) { opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "mca:rmaps: creating new map for job %s", ORTE_JOBID_PRINT(jdata->jobid)); /* create a map object where we will store the results */ map = OBJ_NEW(orte_job_map_t); if (NULL == map) { ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); OBJ_RELEASE(caddy); return; } /* load it with the system defaults */ map->mapping = orte_rmaps_base.mapping; map->ranking = orte_rmaps_base.ranking; #if OPAL_HAVE_HWLOC map->binding = opal_hwloc_binding_policy; #endif if (NULL != orte_rmaps_base.ppr) { map->ppr = strdup(orte_rmaps_base.ppr); } map->cpus_per_rank = orte_rmaps_base.cpus_per_rank; map->display_map = orte_rmaps_base.display_map; /* assign the map object to this job */ jdata->map = map; } else { opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "mca:rmaps: setting mapping policies for job %s", 
ORTE_JOBID_PRINT(jdata->jobid)); if (!jdata->map->display_map) { jdata->map->display_map = orte_rmaps_base.display_map; } /* set the default mapping policy IFF it wasn't provided */ if (!ORTE_MAPPING_POLICY_IS_SET(jdata->map->mapping)) { ORTE_SET_MAPPING_POLICY(jdata->map->mapping, orte_rmaps_base.mapping); } if (!ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) { ORTE_SET_MAPPING_DIRECTIVE(jdata->map->mapping, ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping)); } /* ditto for rank and bind policies */ if (!ORTE_RANKING_POLICY_IS_SET(jdata->map->ranking)) { ORTE_SET_RANKING_POLICY(jdata->map->ranking, orte_rmaps_base.ranking); } #if OPAL_HAVE_HWLOC if (!OPAL_BINDING_POLICY_IS_SET(jdata->map->binding)) { jdata->map->binding = opal_hwloc_binding_policy; } #endif } #if OPAL_HAVE_HWLOC /* if we are not going to launch, then we need to set any * undefined topologies to match our own so the mapper * can operate */ if (orte_do_not_launch) { orte_node_t *node; hwloc_topology_t t0; int i; node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, 0); t0 = node->topology; for (i=1; i < orte_node_pool->size; i++) { if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, i))) { continue; } if (NULL == node->topology) { node->topology = t0; } } } #endif /* cycle thru the available mappers until one agrees to map * the job */ did_map = false; if (1 == opal_list_get_size(&orte_rmaps_base.selected_modules)) { /* forced selection */ mod = (orte_rmaps_base_selected_module_t*)opal_list_get_first(&orte_rmaps_base.selected_modules); jdata->map->req_mapper = strdup(mod->component->mca_component_name); } for (item = opal_list_get_first(&orte_rmaps_base.selected_modules); item != opal_list_get_end(&orte_rmaps_base.selected_modules); item = opal_list_get_next(item)) { mod = (orte_rmaps_base_selected_module_t*)item; if (ORTE_SUCCESS == (rc = mod->module->map_job(jdata)) || ORTE_ERR_RESOURCE_BUSY == rc) { did_map = true; break; } /* mappers return "next option" 
if they didn't attempt to * map the job. anything else is a true error. */ if (ORTE_ERR_TAKE_NEXT_OPTION != rc) { ORTE_ERROR_LOG(rc); ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); OBJ_RELEASE(caddy); return; } } if (did_map && ORTE_ERR_RESOURCE_BUSY == rc) { /* the map was done but nothing could be mapped * for launch as all the resources were busy */ OBJ_RELEASE(caddy); return; } /* if we get here without doing the map, or with zero procs in * the map, then that's an error */ if (!did_map || 0 == jdata->num_procs) { orte_show_help("help-orte-rmaps-base.txt", "failed-map", true); ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); OBJ_RELEASE(caddy); return; } /* compute and save local ranks */ if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_local_ranks(jdata))) { ORTE_ERROR_LOG(rc); ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); OBJ_RELEASE(caddy); return; } #if OPAL_HAVE_HWLOC /* compute and save bindings */ if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_bindings(jdata))) { ORTE_ERROR_LOG(rc); ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); OBJ_RELEASE(caddy); return; } #endif /* set the offset so shared memory components can potentially * connect to any spawned jobs */ jdata->offset = orte_total_procs; /* track the total number of procs launched by us */ orte_total_procs += jdata->num_procs; /* if it is a dynamic spawn, save the bookmark on the parent's job too */ if (ORTE_JOBID_INVALID != jdata->originator.jobid) { if (NULL != (parent = orte_get_job_data_object(jdata->originator.jobid))) { parent->bookmark = jdata->bookmark; } } /* if we wanted to display the map, now is the time to do it - ignore * daemon job */ if (jdata->map->display_map) { char *output=NULL; int i, j; orte_node_t *node; orte_proc_t *proc; if (orte_display_diffable_output) { /* intended solely to test mapping methods, this output * can become quite long when testing at scale. 
Rather * than enduring all the malloc/free's required to * create an arbitrary-length string, custom-generate * the output a line at a time here */ /* display just the procs in a diffable format */ opal_output(orte_clean_output, "<map>\n\t<jobid=%s>\n\t<offset=%s>", ORTE_JOBID_PRINT(jdata->jobid), ORTE_VPID_PRINT(jdata->offset)); fflush(stderr); /* loop through nodes */ for (i=0; i < jdata->map->nodes->size; i++) { if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(jdata->map->nodes, i))) { continue; } opal_output(orte_clean_output, "\t<host name=%s>", (NULL == node->name) ? "UNKNOWN" : node->name); fflush(stderr); for (j=0; j < node->procs->size; j++) { if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) { continue; } #if OPAL_HAVE_HWLOC { char locale[64]; if (NULL != proc->locale) { hwloc_bitmap_list_snprintf(locale, 64, proc->locale->cpuset); } opal_output(orte_clean_output, "\t\t<process rank=%s app_idx=%ld local_rank=%lu node_rank=%lu locale=%s binding=%s>", ORTE_VPID_PRINT(proc->name.vpid), (long)proc->app_idx, (unsigned long)proc->local_rank, (unsigned long)proc->node_rank, locale, (NULL == proc->cpu_bitmap) ? 
"NULL" : proc->cpu_bitmap); } #else opal_output(orte_clean_output, "\t\t<process rank=%s app_idx=%ld local_rank=%lu node_rank=%lu>", ORTE_VPID_PRINT(proc->name.vpid), (long)proc->app_idx, (unsigned long)proc->local_rank, (unsigned long)proc->node_rank); #endif fflush(stderr); } opal_output(orte_clean_output, "\t</host>"); fflush(stderr); } #if OPAL_HAVE_HWLOC { opal_hwloc_locality_t locality; orte_proc_t *p0; /* test locality - for the first node, print the locality of each proc relative to the first one */ node = (orte_node_t*)opal_pointer_array_get_item(jdata->map->nodes, 0); p0 = (orte_proc_t*)opal_pointer_array_get_item(node->procs, 0); opal_output(orte_clean_output, "\t<locality>"); for (j=1; j < node->procs->size; j++) { if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) { continue; } locality = opal_hwloc_base_get_relative_locality(node->topology, p0->cpu_bitmap, proc->cpu_bitmap); opal_output(orte_clean_output, "\t\t<rank=%s rank=%s locality=%s>", ORTE_VPID_PRINT(p0->name.vpid), ORTE_VPID_PRINT(proc->name.vpid), opal_hwloc_base_print_locality(locality)); } opal_output(orte_clean_output, "\t</locality>\n</map>"); fflush(stderr); } #else opal_output(orte_clean_output, "\n</map>"); fflush(stderr); #endif } else { opal_output(orte_clean_output, " Data for JOB %s offset %s", ORTE_JOBID_PRINT(jdata->jobid), ORTE_VPID_PRINT(jdata->offset)); opal_dss.print(&output, NULL, jdata->map, ORTE_JOB_MAP); if (orte_xml_output) { fprintf(orte_xml_fp, "%s\n", output); fflush(orte_xml_fp); } else { opal_output(orte_clean_output, "%s", output); } free(output); } } /* set the job state to the next position */ ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_MAP_COMPLETE); /* cleanup */ OBJ_RELEASE(caddy); }
static void launch_daemons(int fd, short args, void *cbdata) { orte_app_context_t *app; orte_node_t *node; orte_std_cntr_t n; orte_job_map_t *map; char *jobid_string = NULL; char *param; char **argv = NULL; int argc; int rc; char *tmp; char** env = NULL; char *nodelist_flat; char **nodelist_argv; char *name_string; char **custom_strings; int num_args, i; char *cur_prefix; int proc_vpid_index; bool failed_launch=true; orte_job_t *daemons; orte_state_caddy_t *state = (orte_state_caddy_t*)cbdata; OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output, "%s plm:slurm: LAUNCH DAEMONS CALLED", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); /* if we are launching debugger daemons, then just go * do it - no new daemons will be launched */ if (ORTE_JOB_CONTROL_DEBUGGER_DAEMON & state->jdata->controls) { state->jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED; ORTE_ACTIVATE_JOB_STATE(state->jdata, ORTE_JOB_STATE_DAEMONS_REPORTED); OBJ_RELEASE(state); return; } /* start by setting up the virtual machine */ daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid); if (ORTE_SUCCESS != (rc = orte_plm_base_setup_virtual_machine(state->jdata))) { ORTE_ERROR_LOG(rc); goto cleanup; } /* if we don't want to launch, then don't attempt to * launch the daemons - the user really wants to just * look at the proposed process map */ if (orte_do_not_launch) { /* set the state to indicate the daemons reported - this * will trigger the daemons_reported event and cause the * job to move to the following step */ state->jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED; ORTE_ACTIVATE_JOB_STATE(state->jdata, ORTE_JOB_STATE_DAEMONS_REPORTED); OBJ_RELEASE(state); return; } /* Get the map for this job */ if (NULL == (map = daemons->map)) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); rc = ORTE_ERR_NOT_FOUND; goto cleanup; } if (0 == map->num_new_daemons) { /* set the state to indicate the daemons reported - this * will trigger the daemons_reported event and cause the * job to move to the following step */ 
OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output, "%s plm:slurm: no new daemons to launch", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); state->jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED; ORTE_ACTIVATE_JOB_STATE(state->jdata, ORTE_JOB_STATE_DAEMONS_REPORTED); OBJ_RELEASE(state); return; } /* need integer value for command line parameter */ asprintf(&jobid_string, "%lu", (unsigned long) daemons->jobid); /* * start building argv array */ argv = NULL; argc = 0; /* * SLURM srun OPTIONS */ /* add the srun command */ opal_argv_append(&argc, &argv, "srun"); /* start one orted on each node */ opal_argv_append(&argc, &argv, "--ntasks-per-node=1"); /* alert us if any orteds die during startup */ opal_argv_append(&argc, &argv, "--kill-on-bad-exit"); /* Append user defined arguments to srun */ if ( NULL != mca_plm_slurm_component.custom_args ) { custom_strings = opal_argv_split(mca_plm_slurm_component.custom_args, ' '); num_args = opal_argv_count(custom_strings); for (i = 0; i < num_args; ++i) { opal_argv_append(&argc, &argv, custom_strings[i]); } opal_argv_free(custom_strings); } /* create nodelist */ nodelist_argv = NULL; for (n=0; n < map->nodes->size; n++ ) { if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, n))) { continue; } /* if the daemon already exists on this node, then * don't include it */ if (node->daemon_launched) { continue; } /* otherwise, add it to the list of nodes upon which * we need to launch a daemon */ opal_argv_append_nosize(&nodelist_argv, node->name); } if (0 == opal_argv_count(nodelist_argv)) { orte_show_help("help-plm-slurm.txt", "no-hosts-in-list", true); rc = ORTE_ERR_FAILED_TO_START; goto cleanup; } nodelist_flat = opal_argv_join(nodelist_argv, ','); opal_argv_free(nodelist_argv); /* if we are using all allocated nodes, then srun doesn't * require any further arguments */ if (map->num_new_daemons < orte_num_allocated_nodes) { asprintf(&tmp, "--nodes=%lu", (unsigned long)map->num_new_daemons); 
opal_argv_append(&argc, &argv, tmp); free(tmp); asprintf(&tmp, "--nodelist=%s", nodelist_flat); opal_argv_append(&argc, &argv, tmp); free(tmp); } /* tell srun how many tasks to run */ asprintf(&tmp, "--ntasks=%lu", (unsigned long)map->num_new_daemons); opal_argv_append(&argc, &argv, tmp); free(tmp); OPAL_OUTPUT_VERBOSE((2, orte_plm_base_framework.framework_output, "%s plm:slurm: launching on nodes %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), nodelist_flat)); /* * ORTED OPTIONS */ /* add the daemon command (as specified by user) */ orte_plm_base_setup_orted_cmd(&argc, &argv); /* Add basic orted command line options, including debug flags */ orte_plm_base_orted_append_basic_args(&argc, &argv, NULL, &proc_vpid_index, nodelist_flat); free(nodelist_flat); /* tell the new daemons the base of the name list so they can compute * their own name on the other end */ rc = orte_util_convert_vpid_to_string(&name_string, map->daemon_vpid_start); if (ORTE_SUCCESS != rc) { opal_output(0, "plm_slurm: unable to get daemon vpid as string"); goto cleanup; } free(argv[proc_vpid_index]); argv[proc_vpid_index] = strdup(name_string); free(name_string); /* Copy the prefix-directory specified in the corresponding app_context. 
If there are multiple, different prefix's in the app context, complain (i.e., only allow one --prefix option for the entire slurm run -- we don't support different --prefix'es for different nodes in the SLURM plm) */ cur_prefix = NULL; for (n=0; n < state->jdata->apps->size; n++) { char * app_prefix_dir; if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(state->jdata->apps, n))) { continue; } app_prefix_dir = app->prefix_dir; /* Check for already set cur_prefix_dir -- if different, complain */ if (NULL != app_prefix_dir) { if (NULL != cur_prefix && 0 != strcmp (cur_prefix, app_prefix_dir)) { orte_show_help("help-plm-slurm.txt", "multiple-prefixes", true, cur_prefix, app_prefix_dir); goto cleanup; } /* If not yet set, copy it; iff set, then it's the * same anyway */ if (NULL == cur_prefix) { cur_prefix = strdup(app_prefix_dir); OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output, "%s plm:slurm: Set prefix:%s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), cur_prefix)); } } } /* setup environment */ env = opal_argv_copy(orte_launch_environ); if (0 < opal_output_get_verbosity(orte_plm_base_framework.framework_output)) { param = opal_argv_join(argv, ' '); OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output, "%s plm:slurm: final top-level argv:\n\t%s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (NULL == param) ? 
"NULL" : param)); if (NULL != param) free(param); } /* exec the daemon(s) */ if (ORTE_SUCCESS != (rc = plm_slurm_start_proc(argc, argv, env, cur_prefix))) { ORTE_ERROR_LOG(rc); goto cleanup; } /* indicate that the daemons for this job were launched */ state->jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED; daemons->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED; /* flag that launch was successful, so far as we currently know */ failed_launch = false; cleanup: if (NULL != argv) { opal_argv_free(argv); } if (NULL != env) { opal_argv_free(env); } if(NULL != jobid_string) { free(jobid_string); } /* cleanup the caddy */ OBJ_RELEASE(state); /* check for failed launch - if so, force terminate */ if (failed_launch) { ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); } }
/*
 * Receive an xcast (broadcast) message, relay it to our routing-tree
 * children, and deliver the payload to ourselves.  The payload may be
 * compressed (leading int8 flag); daemon-command payloads carrying
 * wireup/nidmap info are specially processed before relay so routing
 * is enabled before we forward.
 */
static void xcast_recv(int status, orte_process_name_t* sender,
                       opal_buffer_t* buffer,
                       orte_rml_tag_t tg, void* cbdata)
{
    opal_list_item_t *item;
    orte_namelist_t *nm;
    int ret, cnt;
    opal_buffer_t *relay=NULL, *rly;
    orte_daemon_cmd_flag_t command = ORTE_DAEMON_NULL_CMD;
    opal_buffer_t wireup, datbuf, *data;
    opal_byte_object_t *bo;
    int8_t flag;
    orte_job_t *jdata;
    orte_proc_t *rec;
    opal_list_t coll;
    orte_grpcomm_signature_t *sig;
    orte_rml_tag_t tag;
    char *rtmod, *nidmap;
    size_t inlen, cmplen;
    uint8_t *packed_data, *cmpdata;
    int32_t nvals, i;
    opal_value_t kv, *kval;
    orte_process_name_t dmn;

    OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base_framework.framework_output,
                         "%s grpcomm:direct:xcast:recv: with %d bytes",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         (int)buffer->bytes_used));

    /* we need a passthru buffer to send to our children - we leave it
     * as compressed data */
    rly = OBJ_NEW(opal_buffer_t);
    opal_dss.copy_payload(rly, buffer);
    OBJ_CONSTRUCT(&datbuf, opal_buffer_t);
    /* setup the relay list */
    OBJ_CONSTRUCT(&coll, opal_list_t);

    /* unpack the flag to see if this payload is compressed */
    cnt=1;
    if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &flag, &cnt, OPAL_INT8))) {
        ORTE_ERROR_LOG(ret);
        ORTE_FORCED_TERMINATE(ret);
        OBJ_DESTRUCT(&datbuf);
        OBJ_DESTRUCT(&coll);
        OBJ_RELEASE(rly);
        return;
    }
    if (flag) {
        /* unpack the data size */
        cnt=1;
        if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &inlen, &cnt, OPAL_SIZE))) {
            ORTE_ERROR_LOG(ret);
            ORTE_FORCED_TERMINATE(ret);
            OBJ_DESTRUCT(&datbuf);
            OBJ_DESTRUCT(&coll);
            OBJ_RELEASE(rly);
            return;
        }
        /* unpack the unpacked data size */
        cnt=1;
        if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &cmplen, &cnt, OPAL_SIZE))) {
            ORTE_ERROR_LOG(ret);
            ORTE_FORCED_TERMINATE(ret);
            OBJ_DESTRUCT(&datbuf);
            OBJ_DESTRUCT(&coll);
            OBJ_RELEASE(rly);
            return;
        }
        /* allocate the space */
        packed_data = (uint8_t*)malloc(inlen);
        /* FIX: malloc result was previously used unchecked */
        if (NULL == packed_data) {
            ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
            ORTE_FORCED_TERMINATE(ORTE_ERR_OUT_OF_RESOURCE);
            OBJ_DESTRUCT(&datbuf);
            OBJ_DESTRUCT(&coll);
            OBJ_RELEASE(rly);
            return;
        }
        /* unpack the data blob */
        cnt = inlen;
        if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, packed_data, &cnt, OPAL_UINT8))) {
            ORTE_ERROR_LOG(ret);
            free(packed_data);
            ORTE_FORCED_TERMINATE(ret);
            OBJ_DESTRUCT(&datbuf);
            OBJ_DESTRUCT(&coll);
            OBJ_RELEASE(rly);
            return;
        }
        /* decompress the data */
        if (orte_util_uncompress_block(&cmpdata, cmplen,
                                       packed_data, inlen)) {
            /* the data has been uncompressed */
            opal_dss.load(&datbuf, cmpdata, cmplen);
            data = &datbuf;
        } else {
            data = buffer;
        }
        free(packed_data);
    } else {
        data = buffer;
    }

    /* get the signature that we do not need */
    cnt=1;
    if (ORTE_SUCCESS != (ret = opal_dss.unpack(data, &sig, &cnt, ORTE_SIGNATURE))) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&datbuf);
        OBJ_DESTRUCT(&coll);
        OBJ_RELEASE(rly);
        ORTE_FORCED_TERMINATE(ret);
        return;
    }
    OBJ_RELEASE(sig);

    /* get the target tag */
    cnt=1;
    if (ORTE_SUCCESS != (ret = opal_dss.unpack(data, &tag, &cnt, ORTE_RML_TAG))) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&datbuf);
        OBJ_DESTRUCT(&coll);
        OBJ_RELEASE(rly);
        ORTE_FORCED_TERMINATE(ret);
        return;
    }

    /* get our conduit's routed module name */
    rtmod = orte_rml.get_routed(orte_coll_conduit);

    /* if this is headed for the daemon command processor,
     * then we first need to check for add_local_procs
     * as that command includes some needed wireup info */
    if (ORTE_RML_TAG_DAEMON == tag) {
        /* peek at the command */
        cnt=1;
        if (ORTE_SUCCESS == (ret = opal_dss.unpack(data, &command, &cnt, ORTE_DAEMON_CMD))) {
            /* if it is an exit cmd, then flag that we are quitting so we will properly
             * handle connection losses from our downstream peers */
            if (ORTE_DAEMON_EXIT_CMD == command ||
                ORTE_DAEMON_HALT_VM_CMD == command) {
                orte_orteds_term_ordered = true;
                if (ORTE_DAEMON_HALT_VM_CMD == command) {
                    /* this is an abnormal termination */
                    orte_abnormal_term_ordered = true;
                }
                /* copy the msg for relay to ourselves */
                relay = OBJ_NEW(opal_buffer_t);
                /* repack the command */
                if (OPAL_SUCCESS != (ret = opal_dss.pack(relay, &command, 1, ORTE_DAEMON_CMD))) {
                    ORTE_ERROR_LOG(ret);
                    goto relay;
                }
                opal_dss.copy_payload(relay, data);
            } else if (ORTE_DAEMON_ADD_LOCAL_PROCS == command ||
                       ORTE_DAEMON_DVM_NIDMAP_CMD == command ||
                       ORTE_DAEMON_DVM_ADD_PROCS == command) {
                /* setup our internal relay buffer */
                relay = OBJ_NEW(opal_buffer_t);
                /* repack the command */
                if (OPAL_SUCCESS != (ret = opal_dss.pack(relay, &command, 1, ORTE_DAEMON_CMD))) {
                    ORTE_ERROR_LOG(ret);
                    goto relay;
                }
                /* unpack the nidmap string - may be NULL */
                cnt = 1;
                if (OPAL_SUCCESS != (ret = opal_dss.unpack(data, &nidmap, &cnt, OPAL_STRING))) {
                    ORTE_ERROR_LOG(ret);
                    goto relay;
                }
                if (NULL != nidmap) {
                    if (ORTE_SUCCESS != (ret = orte_regx.nidmap_parse(nidmap))) {
                        ORTE_ERROR_LOG(ret);
                        goto relay;
                    }
                    free(nidmap);
                }
                /* see if they included info on node capabilities */
                cnt = 1;
                if (OPAL_SUCCESS != (ret = opal_dss.unpack(data, &flag, &cnt, OPAL_INT8))) {
                    ORTE_ERROR_LOG(ret);
                    goto relay;
                }
                if (0 != flag) {
                    /* update our local nidmap, if required - the decode function
                     * knows what to do */
                    OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_framework.framework_output,
                                         "%s grpcomm:direct:xcast updating daemon nidmap",
                                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
                    if (ORTE_SUCCESS != (ret = orte_regx.decode_daemon_nodemap(data))) {
                        ORTE_ERROR_LOG(ret);
                        goto relay;
                    }
                    if (!ORTE_PROC_IS_HNP) {
                        /* update the routing plan - the HNP already did
                         * it when it computed the VM, so don't waste time
                         * re-doing it here */
                        orte_routed.update_routing_plan(rtmod);
                    }
                    /* routing is now possible */
                    orte_routed_base.routing_enabled = true;

                    /* unpack the byte object */
                    cnt=1;
                    if (ORTE_SUCCESS != (ret = opal_dss.unpack(data, &bo, &cnt, OPAL_BYTE_OBJECT))) {
                        ORTE_ERROR_LOG(ret);
                        goto relay;
                    }
                    if (0 < bo->size) {
                        /* load it into a buffer */
                        OBJ_CONSTRUCT(&wireup, opal_buffer_t);
                        opal_dss.load(&wireup, bo->bytes, bo->size);
                        /* decode it, pushing the info into our database */
                        if (opal_pmix.legacy_get()) {
                            OBJ_CONSTRUCT(&kv, opal_value_t);
                            kv.key = OPAL_PMIX_PROC_URI;
                            kv.type = OPAL_STRING;
                            cnt=1;
                            while (OPAL_SUCCESS == (ret = opal_dss.unpack(&wireup, &dmn, &cnt, ORTE_NAME))) {
                                cnt = 1;
                                if (ORTE_SUCCESS != (ret = opal_dss.unpack(&wireup, &kv.data.string, &cnt, OPAL_STRING))) {
                                    ORTE_ERROR_LOG(ret);
                                    break;
                                }
                                if (OPAL_SUCCESS != (ret = opal_pmix.store_local(&dmn, &kv))) {
                                    ORTE_ERROR_LOG(ret);
                                    free(kv.data.string);
                                    break;
                                }
                                free(kv.data.string);
                                kv.data.string = NULL;
                            }
                            if (ORTE_ERR_UNPACK_READ_PAST_END_OF_BUFFER != ret) {
                                ORTE_ERROR_LOG(ret);
                            }
                        } else {
                            cnt=1;
                            while (OPAL_SUCCESS == (ret = opal_dss.unpack(&wireup, &dmn, &cnt, ORTE_NAME))) {
                                cnt = 1;
                                if (ORTE_SUCCESS != (ret = opal_dss.unpack(&wireup, &nvals, &cnt, OPAL_INT32))) {
                                    ORTE_ERROR_LOG(ret);
                                    break;
                                }
                                for (i=0; i < nvals; i++) {
                                    cnt = 1;
                                    if (ORTE_SUCCESS != (ret = opal_dss.unpack(&wireup, &kval, &cnt, OPAL_VALUE))) {
                                        ORTE_ERROR_LOG(ret);
                                        break;
                                    }
                                    OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_framework.framework_output,
                                                         "%s STORING MODEX DATA FOR PROC %s KEY %s",
                                                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                                         ORTE_NAME_PRINT(&dmn), kval->key));
                                    if (OPAL_SUCCESS != (ret = opal_pmix.store_local(&dmn, kval))) {
                                        ORTE_ERROR_LOG(ret);
                                        OBJ_RELEASE(kval);
                                        break;
                                    }
                                    OBJ_RELEASE(kval);
                                }
                            }
                            if (ORTE_ERR_UNPACK_READ_PAST_END_OF_BUFFER != ret) {
                                ORTE_ERROR_LOG(ret);
                            }
                        }
                        /* done with the wireup buffer - dump it */
                        OBJ_DESTRUCT(&wireup);
                    }
                    free(bo);
                }
                /* copy the remainder of the payload - we don't pass wiring info
                 * to the odls */
                opal_dss.copy_payload(relay, data);
            } else {
                relay = OBJ_NEW(opal_buffer_t);
                /* repack the command */
                if (OPAL_SUCCESS != (ret = opal_dss.pack(relay, &command, 1, ORTE_DAEMON_CMD))) {
                    ORTE_ERROR_LOG(ret);
                    goto relay;
                }
                /* copy the msg for relay to ourselves */
                opal_dss.copy_payload(relay, data);
            }
        } else {
            ORTE_ERROR_LOG(ret);
            goto CLEANUP;
        }
    } else {
        /* copy the msg for relay to ourselves */
        relay = OBJ_NEW(opal_buffer_t);
        opal_dss.copy_payload(relay, data);
    }

 relay:
    if (!orte_do_not_launch) {
        /* get the list of next recipients from the routed module */
        orte_routed.get_routing_list(rtmod, &coll);

        /* if list is empty, no relay is required */
        if (opal_list_is_empty(&coll)) {
            OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_framework.framework_output,
                                 "%s grpcomm:direct:send_relay - recipient list is empty!",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
            goto CLEANUP;
        }

        /* send the message to each recipient on list, deconstructing it as we go */
        while (NULL != (item = opal_list_remove_first(&coll))) {
            nm = (orte_namelist_t*)item;

            OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_framework.framework_output,
                                 "%s grpcomm:direct:send_relay sending relay msg of %d bytes to %s",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (int)rly->bytes_used,
                                 ORTE_NAME_PRINT(&nm->name)));
            OBJ_RETAIN(rly);
            /* check the state of the recipient - no point
             * sending to someone not alive
             * FIX: also guard the job-object lookup, which could
             * previously return NULL and crash on jdata->procs */
            if (NULL == (jdata = orte_get_job_data_object(nm->name.jobid)) ||
                NULL == (rec = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, nm->name.vpid))) {
                if (!orte_abnormal_term_ordered && !orte_orteds_term_ordered) {
                    opal_output(0, "%s grpcomm:direct:send_relay proc %s not found - cannot relay",
                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&nm->name));
                }
                OBJ_RELEASE(rly);
                OBJ_RELEASE(item);
                ORTE_FORCED_TERMINATE(ORTE_ERR_UNREACH);
                continue;
            }
            if ((ORTE_PROC_STATE_RUNNING < rec->state &&
                ORTE_PROC_STATE_CALLED_ABORT != rec->state) ||
                !ORTE_FLAG_TEST(rec, ORTE_PROC_FLAG_ALIVE)) {
                if (!orte_abnormal_term_ordered && !orte_orteds_term_ordered) {
                    opal_output(0, "%s grpcomm:direct:send_relay proc %s not running - cannot relay: %s ",
                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&nm->name),
                                ORTE_FLAG_TEST(rec, ORTE_PROC_FLAG_ALIVE) ? orte_proc_state_to_str(rec->state) : "NOT ALIVE");
                }
                OBJ_RELEASE(rly);
                OBJ_RELEASE(item);
                ORTE_FORCED_TERMINATE(ORTE_ERR_UNREACH);
                continue;
            }
            if (ORTE_SUCCESS != (ret = orte_rml.send_buffer_nb(orte_coll_conduit,
                                                               &nm->name, rly, ORTE_RML_TAG_XCAST,
                                                               orte_rml_send_callback, NULL))) {
                ORTE_ERROR_LOG(ret);
                OBJ_RELEASE(rly);
                OBJ_RELEASE(item);
                ORTE_FORCED_TERMINATE(ORTE_ERR_UNREACH);
                continue;
            }
            OBJ_RELEASE(item);
        }
    }

 CLEANUP:
    /* cleanup */
    OPAL_LIST_DESTRUCT(&coll);
    OBJ_RELEASE(rly);  // retain accounting

    /* now pass the relay buffer to myself for processing - don't
     * inject it into the RML system via send as that will compete
     * with the relay messages down in the OOB. Instead, pass it
     * directly to the RML message processor.
     * FIX: relay is NULL when we jumped to CLEANUP after failing to
     * unpack the daemon command (command is still NULL_CMD there), so
     * the post must be guarded against a NULL dereference */
    if (NULL != relay) {
        if (ORTE_DAEMON_DVM_NIDMAP_CMD != command) {
            ORTE_RML_POST_MESSAGE(ORTE_PROC_MY_NAME, tag, 1,
                                  relay->base_ptr, relay->bytes_used);
            relay->base_ptr = NULL;
            relay->bytes_used = 0;
        }
        OBJ_RELEASE(relay);
    }
    OBJ_DESTRUCT(&datbuf);
}
/*
 * Receive an xcast (broadcast) message, relay it to our routing-tree
 * children, and deliver a stripped copy of the payload to ourselves.
 * Daemon commands carrying a nodemap/wireup byte object are processed
 * here first so routing information is in place before we forward.
 */
static void xcast_recv(int status, orte_process_name_t* sender,
                       opal_buffer_t* buffer,
                       orte_rml_tag_t tg, void* cbdata)
{
    opal_list_item_t *item;
    orte_namelist_t *nm;
    int ret, cnt;
    opal_buffer_t *relay, *rly;
    orte_daemon_cmd_flag_t command = ORTE_DAEMON_NULL_CMD;
    opal_buffer_t wireup;
    opal_byte_object_t *bo;
    int8_t flag;
    orte_job_t *jdata;
    orte_proc_t *rec;
    opal_list_t coll;
    orte_grpcomm_signature_t *sig;
    orte_rml_tag_t tag;

    OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base_framework.framework_output,
                         "%s grpcomm:direct:xcast:recv: with %d bytes",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         (int)buffer->bytes_used));

    /* we need a passthru buffer to send to our children */
    rly = OBJ_NEW(opal_buffer_t);
    opal_dss.copy_payload(rly, buffer);

    /* get the signature that we do not need */
    cnt=1;
    if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &sig, &cnt, ORTE_SIGNATURE))) {
        ORTE_ERROR_LOG(ret);
        /* FIX: rly was previously leaked on this early return */
        OBJ_RELEASE(rly);
        ORTE_FORCED_TERMINATE(ret);
        return;
    }
    OBJ_RELEASE(sig);

    /* get the target tag */
    cnt=1;
    if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &tag, &cnt, ORTE_RML_TAG))) {
        ORTE_ERROR_LOG(ret);
        /* FIX: rly was previously leaked on this early return */
        OBJ_RELEASE(rly);
        ORTE_FORCED_TERMINATE(ret);
        return;
    }

    /* setup a buffer we can pass to ourselves - this just contains
     * the initial message, minus the headers inserted by xcast itself */
    relay = OBJ_NEW(opal_buffer_t);
    opal_dss.copy_payload(relay, buffer);

    /* setup the relay list */
    OBJ_CONSTRUCT(&coll, opal_list_t);

    /* if this is headed for the daemon command processor,
     * then we first need to check for add_local_procs
     * as that command includes some needed wireup info */
    if (ORTE_RML_TAG_DAEMON == tag) {
        /* peek at the command */
        cnt=1;
        if (ORTE_SUCCESS == (ret = opal_dss.unpack(buffer, &command, &cnt, ORTE_DAEMON_CMD))) {
            /* if it is add_procs, then... */
            if (ORTE_DAEMON_ADD_LOCAL_PROCS == command ||
                ORTE_DAEMON_DVM_NIDMAP_CMD == command) {
                /* extract the byte object holding the daemonmap */
                cnt=1;
                if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &bo, &cnt, OPAL_BYTE_OBJECT))) {
                    ORTE_ERROR_LOG(ret);
                    goto relay;
                }

                /* update our local nidmap, if required - the decode function
                 * knows what to do - it will also free the bytes in the byte object */
                if (ORTE_PROC_IS_HNP) {
                    /* no need - already have the info */
                    if (NULL != bo) {
                        if (NULL != bo->bytes) {
                            free(bo->bytes);
                        }
                        free(bo);
                    }
                } else {
                    OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_framework.framework_output,
                                         "%s grpcomm:direct:xcast updating daemon nidmap",
                                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
                    if (ORTE_SUCCESS != (ret = orte_util_decode_daemon_nodemap(bo))) {
                        ORTE_ERROR_LOG(ret);
                        goto relay;
                    }
                }

                /* update the routing plan */
                orte_routed.update_routing_plan();

                /* see if we have wiring info as well */
                cnt=1;
                if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &flag, &cnt, OPAL_INT8))) {
                    ORTE_ERROR_LOG(ret);
                    goto relay;
                }

                if (ORTE_DAEMON_ADD_LOCAL_PROCS == command) {
                    OBJ_RELEASE(relay);
                    relay = OBJ_NEW(opal_buffer_t);
                    /* repack the command */
                    if (OPAL_SUCCESS != (ret = opal_dss.pack(relay, &command, 1, ORTE_DAEMON_CMD))) {
                        ORTE_ERROR_LOG(ret);
                        goto relay;
                    }
                    if (0 == flag) {
                        /* copy the remainder of the payload */
                        opal_dss.copy_payload(relay, buffer);
                        /* no - just return */
                        goto relay;
                    }
                }

                /* unpack the byte object */
                cnt=1;
                if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &bo, &cnt, OPAL_BYTE_OBJECT))) {
                    ORTE_ERROR_LOG(ret);
                    goto relay;
                }
                if (0 < bo->size) {
                    /* load it into a buffer */
                    OBJ_CONSTRUCT(&wireup, opal_buffer_t);
                    opal_dss.load(&wireup, bo->bytes, bo->size);
                    /* pass it for processing */
                    if (ORTE_SUCCESS != (ret = orte_routed.init_routes(ORTE_PROC_MY_NAME->jobid, &wireup))) {
                        ORTE_ERROR_LOG(ret);
                        OBJ_DESTRUCT(&wireup);
                        goto relay;
                    }
                    /* done with the wireup buffer - dump it */
                    OBJ_DESTRUCT(&wireup);
                }
                free(bo);
                if (ORTE_DAEMON_ADD_LOCAL_PROCS == command) {
                    /* copy the remainder of the payload */
                    opal_dss.copy_payload(relay, buffer);
                }
            }
        } else {
            ORTE_ERROR_LOG(ret);
            /* FIX: rly was previously leaked on this error path */
            OBJ_RELEASE(rly);
            goto CLEANUP;
        }
    }

 relay:
    /* get the list of next recipients from the routed module */
    orte_routed.get_routing_list(&coll);

    /* if list is empty, no relay is required */
    if (opal_list_is_empty(&coll)) {
        OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_framework.framework_output,
                             "%s grpcomm:direct:send_relay - recipient list is empty!",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        OBJ_RELEASE(rly);
        goto CLEANUP;
    }

    /* send the message to each recipient on list, deconstructing it as we go */
    while (NULL != (item = opal_list_remove_first(&coll))) {
        nm = (orte_namelist_t*)item;

        OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_framework.framework_output,
                             "%s grpcomm:direct:send_relay sending relay msg of %d bytes to %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (int)rly->bytes_used,
                             ORTE_NAME_PRINT(&nm->name)));
        OBJ_RETAIN(rly);
        /* check the state of the recipient - no point
         * sending to someone not alive
         * FIX: also guard the job-object lookup, which could
         * previously return NULL and crash on jdata->procs */
        if (NULL == (jdata = orte_get_job_data_object(nm->name.jobid)) ||
            NULL == (rec = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, nm->name.vpid))) {
            opal_output(0, "%s grpcomm:direct:send_relay proc %s not found - cannot relay",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&nm->name));
            OBJ_RELEASE(rly);
            OBJ_RELEASE(item);
            continue;
        }
        if (ORTE_PROC_STATE_RUNNING < rec->state) {
            opal_output(0, "%s grpcomm:direct:send_relay proc %s not running - cannot relay",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&nm->name));
            OBJ_RELEASE(rly);
            OBJ_RELEASE(item);
            continue;
        }
        if (ORTE_SUCCESS != (ret = orte_rml.send_buffer_nb(&nm->name, rly, ORTE_RML_TAG_XCAST,
                                                           orte_rml_send_callback, NULL))) {
            ORTE_ERROR_LOG(ret);
            OBJ_RELEASE(rly);
            OBJ_RELEASE(item);
            continue;
        }
        OBJ_RELEASE(item);
    }
    OBJ_RELEASE(rly);  // retain accounting

 CLEANUP:
    /* cleanup */
    OBJ_DESTRUCT(&coll);

    /* now send the relay buffer to myself for processing */
    if (ORTE_DAEMON_DVM_NIDMAP_CMD != command) {
        if (ORTE_SUCCESS != (ret = orte_rml.send_buffer_nb(ORTE_PROC_MY_NAME, relay, tag,
                                                           orte_rml_send_callback, NULL))) {
            ORTE_ERROR_LOG(ret);
            OBJ_RELEASE(relay);
        }
    } else {
        /* FIX: nidmap commands are not self-delivered, so the relay
         * buffer was previously leaked on this path */
        OBJ_RELEASE(relay);
    }
}
static void recv_data(int fd, short args, void *cbdata) { bool found; int i, rc; orte_node_t *nd, *nd2; opal_list_t nds, ndtmp; opal_list_item_t *item, *itm; char recv_msg[8192]; int nbytes, idx, sjob; char **alloc, *nodelist, *tpn; local_jobtracker_t *ptr, *jtrk; local_apptracker_t *aptrk; orte_app_context_t *app; orte_jobid_t jobid; orte_job_t *jdata; char **dash_host = NULL; opal_output_verbose(2, orte_ras_base_framework.framework_output, "%s ras:slurm: dynamic allocation - data recvd", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); /* read the data from the socket and put it in the * nodes field of op */ memset(recv_msg, 0, sizeof(recv_msg)); nbytes = read(fd, recv_msg, sizeof(recv_msg) - 1); opal_output_verbose(2, orte_ras_base_framework.framework_output, "%s ras:slurm: dynamic allocation msg: %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), recv_msg); /* check if we got something */ if (0 == nbytes || 0 == strlen(recv_msg) || strstr(recv_msg, "failure") != NULL) { /* show an error here - basically, a "nothing was available" * message */ orte_show_help("help-ras-slurm.txt", "slurm-dyn-alloc-failed", true, (0 == strlen(recv_msg)) ? 
"NO MSG" : recv_msg); ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_ALLOC_FAILED); return; } /* break the message into its component parts, separated by colons */ alloc = opal_argv_split(recv_msg, ':'); /* the first section contains the ORTE jobid for this allocation */ tpn = strchr(alloc[0], '='); orte_util_convert_string_to_jobid(&jobid, tpn+1); /* get the corresponding job object */ jdata = orte_get_job_data_object(jobid); jtrk = NULL; /* find the associated tracking object */ for (item = opal_list_get_first(&jobs); item != opal_list_get_end(&jobs); item = opal_list_get_next(item)) { ptr = (local_jobtracker_t*)item; if (ptr->jobid == jobid) { jtrk = ptr; break; } } if (NULL == jtrk) { orte_show_help("help-ras-slurm.txt", "slurm-dyn-alloc-failed", true, "NO JOB TRACKER"); ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_ALLOC_FAILED); opal_argv_free(alloc); return; } /* stop the timeout event */ opal_event_del(&jtrk->timeout_ev); /* cycle across all the remaining parts - each is the allocation for * an app in this job */ OBJ_CONSTRUCT(&nds, opal_list_t); OBJ_CONSTRUCT(&ndtmp, opal_list_t); idx = -1; sjob = -1; nodelist = NULL; tpn = NULL; for (i=1; NULL != alloc[i]; i++) { if (ORTE_SUCCESS != parse_alloc_msg(alloc[i], &idx, &sjob, &nodelist, &tpn)) { orte_show_help("help-ras-slurm.txt", "slurm-dyn-alloc-failed", true, jtrk->cmd); ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_ALLOC_FAILED); opal_argv_free(alloc); if (NULL != nodelist) { free(nodelist); } if (NULL != tpn) { free(tpn); } return; } if (idx < 0) { orte_show_help("help-ras-slurm.txt", "slurm-dyn-alloc-failed", true, jtrk->cmd); ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_ALLOC_FAILED); opal_argv_free(alloc); free(nodelist); free(tpn); return; } if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, idx))) { orte_show_help("help-ras-slurm.txt", "slurm-dyn-alloc-failed", true, jtrk->cmd); ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_ALLOC_FAILED); opal_argv_free(alloc); 
free(nodelist); free(tpn); return; } /* release the current dash_host as that contained the *desired* allocation */ orte_remove_attribute(&app->attributes, ORTE_APP_DASH_HOST); /* track the Slurm jobid */ if (NULL == (aptrk = (local_apptracker_t*)opal_pointer_array_get_item(&jtrk->apps, idx))) { aptrk = OBJ_NEW(local_apptracker_t); opal_pointer_array_set_item(&jtrk->apps, idx, aptrk); } aptrk->sjob = sjob; /* since the nodelist/tpn may contain regular expressions, parse them */ if (ORTE_SUCCESS != (rc = orte_ras_slurm_discover(nodelist, tpn, &ndtmp))) { ORTE_ERROR_LOG(rc); ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_ALLOC_FAILED); opal_argv_free(alloc); free(nodelist); free(tpn); return; } /* transfer the discovered nodes to our node list, and construct * the new dash_host entry to match what was allocated */ while (NULL != (item = opal_list_remove_first(&ndtmp))) { nd = (orte_node_t*)item; opal_argv_append_nosize(&dash_host, nd->name); /* check for duplicates */ found = false; for (itm = opal_list_get_first(&nds); itm != opal_list_get_end(&nds); itm = opal_list_get_next(itm)) { nd2 = (orte_node_t*)itm; if (0 == strcmp(nd->name, nd2->name)) { found = true; nd2->slots += nd->slots; OBJ_RELEASE(item); break; } } if (!found) { /* append the new node to our list */ opal_list_append(&nds, item); } } /* cleanup */ free(nodelist); free(tpn); } /* cleanup */ opal_argv_free(alloc); OBJ_DESTRUCT(&ndtmp); if (NULL != dash_host) { tpn = opal_argv_join(dash_host, ','); orte_set_attribute(&app->attributes, ORTE_APP_DASH_HOST, ORTE_ATTR_LOCAL, (void*)tpn, OPAL_STRING); opal_argv_free(dash_host); free(tpn); } if (opal_list_is_empty(&nds)) { /* if we get here, then we were able to contact slurm, * which means we are in an actively managed cluster. * However, slurm indicated that nothing is currently * available that meets our requirements. This is a fatal * situation - we do NOT have the option of running on * user-specified hosts as the cluster is managed. 
*/ OBJ_DESTRUCT(&nds); orte_show_help("help-ras-base.txt", "ras-base:no-allocation", true); ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); } /* store the found nodes */ if (ORTE_SUCCESS != (rc = orte_ras_base_node_insert(&nds, jdata))) { ORTE_ERROR_LOG(rc); OBJ_DESTRUCT(&nds); ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); return; } OBJ_DESTRUCT(&nds); /* default to no-oversubscribe-allowed for managed systems */ if (!(ORTE_MAPPING_SUBSCRIBE_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping))) { ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_NO_OVERSUBSCRIBE); } /* flag that the allocation is managed */ orte_managed_allocation = true; /* move the job along */ ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_ALLOCATION_COMPLETE); /* all done */ return; }
/* process incoming messages in order of receipt.
 *
 * RML callback for the PLM command tag.  Unpacks a command flag and
 * dispatches: job-launch requests (proxied comm_spawn), proc-state
 * updates from daemons, and registration notifications.  On any error
 * on an HNP, forces termination via DEPART. */
void orte_plm_base_recv(int status, orte_process_name_t* sender,
                        opal_buffer_t* buffer, orte_rml_tag_t tag,
                        void* cbdata)
{
    orte_plm_cmd_flag_t command;
    orte_std_cntr_t count;
    orte_jobid_t job;
    orte_job_t *jdata, *parent;
    opal_buffer_t *answer;
    orte_vpid_t vpid;
    orte_proc_t *proc;
    orte_proc_state_t state;
    orte_exit_code_t exit_code;
    int32_t rc=ORTE_SUCCESS, ret;
    orte_app_context_t *app, *child_app;
    orte_process_name_t name;
    pid_t pid;
    bool running;
    int i;
    char **env;
    char *prefix_dir;

    OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
                         "%s plm:base:receive processing msg",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    /* every message starts with a command flag */
    count = 1;
    if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &command, &count, ORTE_PLM_CMD))) {
        ORTE_ERROR_LOG(rc);
        goto CLEANUP;
    }

    switch (command) {
    case ORTE_PLM_LAUNCH_JOB_CMD:
        OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
                             "%s plm:base:receive job launch command from %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(sender)));

        /* unpack the job object */
        count = 1;
        if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &jdata, &count, ORTE_JOB))) {
            ORTE_ERROR_LOG(rc);
            goto ANSWER_LAUNCH;
        }

        /* record the sender so we know who to respond to */
        jdata->originator.jobid = sender->jobid;
        jdata->originator.vpid = sender->vpid;

        /* get the parent's job object */
        if (NULL != (parent = orte_get_job_data_object(sender->jobid))) {
            /* if the prefix was set in the parent's job, we need to transfer
             * that prefix to the child's app_context so any further launch of
             * orteds can find the correct binary. There always has to be at
             * least one app_context in both parent and child, so we don't
             * need to check that here. However, be sure not to overwrite
             * the prefix if the user already provided it!
             */
            app = (orte_app_context_t*)opal_pointer_array_get_item(parent->apps, 0);
            child_app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, 0);
            prefix_dir = NULL;
            if (orte_get_attribute(&app->attributes, ORTE_APP_PREFIX_DIR,
                                   (void**)&prefix_dir, OPAL_STRING) &&
                !orte_get_attribute(&child_app->attributes, ORTE_APP_PREFIX_DIR,
                                    NULL, OPAL_STRING)) {
                orte_set_attribute(&child_app->attributes, ORTE_APP_PREFIX_DIR,
                                   ORTE_ATTR_GLOBAL, prefix_dir, OPAL_STRING);
            }
            if (NULL != prefix_dir) {
                free(prefix_dir);
            }
        }

        /* if the user asked to forward any envars, cycle through the app contexts
         * in the comm_spawn request and add them */
        if (NULL != orte_forwarded_envars) {
            for (i=0; i < jdata->apps->size; i++) {
                if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
                    continue;
                }
                env = opal_environ_merge(orte_forwarded_envars, app->env);
                opal_argv_free(app->env);
                app->env = env;
            }
        }

        OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
                             "%s plm:base:receive adding hosts",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

        /* process any add-hostfile and add-host options that were provided */
        if (ORTE_SUCCESS != (rc = orte_ras_base_add_hosts(jdata))) {
            ORTE_ERROR_LOG(rc);
            goto ANSWER_LAUNCH;
        }

        if (NULL != parent) {
            if (NULL == parent->bookmark) {
                /* find the sender's node in the job map */
                if (NULL != (proc = (orte_proc_t*)opal_pointer_array_get_item(parent->procs, sender->vpid))) {
                    /* set the bookmark so the child starts from that place - this means
                     * that the first child process could be co-located with the proc
                     * that called comm_spawn, assuming slots remain on that node. Otherwise,
                     * the procs will start on the next available node */
                    jdata->bookmark = proc->node;
                }
            } else {
                jdata->bookmark = parent->bookmark;
            }
        }

        /* launch it */
        OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
                             "%s plm:base:receive calling spawn",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        if (ORTE_SUCCESS != (rc = orte_plm.spawn(jdata))) {
            ORTE_ERROR_LOG(rc);
            goto ANSWER_LAUNCH;
        }
        break;

    ANSWER_LAUNCH:
        /* NOTE(review): jdata unpacked above is not released on this error
         * path - presumably intentional best-effort, but verify ownership */
        OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
                             "%s plm:base:receive - error on launch: %d",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), rc));

        /* setup the response - report the error code back to the requestor */
        answer = OBJ_NEW(opal_buffer_t);

        /* pack the error code to be returned */
        if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &rc, 1, OPAL_INT32))) {
            ORTE_ERROR_LOG(ret);
        }

        /* send the response back to the sender */
        if (0 > (ret = orte_rml.send_buffer_nb(sender, answer, ORTE_RML_TAG_PLM_PROXY,
                                               orte_rml_send_callback, NULL))) {
            ORTE_ERROR_LOG(ret);
            OBJ_RELEASE(answer);
        }
        break;

    case ORTE_PLM_UPDATE_PROC_STATE:
        opal_output_verbose(5, orte_plm_base_framework.framework_output,
                            "%s plm:base:receive update proc state command from %s",
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                            ORTE_NAME_PRINT(sender));

        /* buffer contains one or more (jobid, vpid/pid/state/exit_code...)
         * groups; ORTE_VPID_INVALID terminates a group */
        count = 1;
        while (ORTE_SUCCESS == (rc = opal_dss.unpack(buffer, &job, &count, ORTE_JOBID))) {
            opal_output_verbose(5, orte_plm_base_framework.framework_output,
                                "%s plm:base:receive got update_proc_state for job %s",
                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                ORTE_JOBID_PRINT(job));

            name.jobid = job;
            running = false;
            /* get the job object */
            jdata = orte_get_job_data_object(job);
            count = 1;
            while (ORTE_SUCCESS == (rc = opal_dss.unpack(buffer, &vpid, &count, ORTE_VPID))) {
                if (ORTE_VPID_INVALID == vpid) {
                    /* flag indicates that this job is complete - move on */
                    break;
                }
                name.vpid = vpid;
                /* unpack the pid */
                count = 1;
                if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &pid, &count, OPAL_PID))) {
                    ORTE_ERROR_LOG(rc);
                    goto CLEANUP;
                }
                /* unpack the state */
                count = 1;
                if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &state, &count, ORTE_PROC_STATE))) {
                    ORTE_ERROR_LOG(rc);
                    goto CLEANUP;
                }
                if (ORTE_PROC_STATE_RUNNING == state) {
                    running = true;
                }
                /* unpack the exit code */
                count = 1;
                if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &exit_code, &count, ORTE_EXIT_CODE))) {
                    ORTE_ERROR_LOG(rc);
                    goto CLEANUP;
                }

                OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
                                     "%s plm:base:receive got update_proc_state for vpid %lu state %s exit_code %d",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                     (unsigned long)vpid, orte_proc_state_to_str(state),
                                     (int)exit_code));

                if (NULL != jdata) {
                    /* get the proc data object */
                    if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, vpid))) {
                        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
                        ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
                        /* FIX: ORTE_FORCED_TERMINATE does not return - without
                         * this goto we would dereference the NULL proc below */
                        goto CLEANUP;
                    }
                    /* NEVER update the proc state before activating the state machine - let
                     * the state cbfunc update it as it may need to compare this
                     * state against the prior proc state */
                    proc->pid = pid;
                    proc->exit_code = exit_code;
                    ORTE_ACTIVATE_PROC_STATE(&name, state);
                }
            }
            /* record that we heard back from a daemon during app launch */
            if (running && NULL != jdata) {
                jdata->num_daemons_reported++;
                if (orte_report_launch_progress) {
                    if (0 == jdata->num_daemons_reported % 100 ||
                        jdata->num_daemons_reported == orte_process_info.num_procs) {
                        ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_REPORT_PROGRESS);
                    }
                }
            }
            /* prepare for next job */
            count = 1;
        }
        /* running off the end of the buffer is the normal exit condition */
        if (ORTE_ERR_UNPACK_READ_PAST_END_OF_BUFFER != rc) {
            ORTE_ERROR_LOG(rc);
        } else {
            rc = ORTE_SUCCESS;
        }
        break;

    case ORTE_PLM_REGISTERED_CMD:
        count=1;
        if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &job, &count, ORTE_JOBID))) {
            ORTE_ERROR_LOG(rc);
            goto DEPART;
        }
        name.jobid = job;
        /* get the job object */
        if (NULL == (jdata = orte_get_job_data_object(job))) {
            ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
            rc = ORTE_ERR_NOT_FOUND;
            goto DEPART;
        }
        /* activate the REGISTERED state for each vpid in the message */
        count=1;
        while (ORTE_SUCCESS == opal_dss.unpack(buffer, &vpid, &count, ORTE_VPID)) {
            name.vpid = vpid;
            ORTE_ACTIVATE_PROC_STATE(&name, ORTE_PROC_STATE_REGISTERED);
            count=1;
        }
        break;

    default:
        ORTE_ERROR_LOG(ORTE_ERR_VALUE_OUT_OF_BOUNDS);
        rc = ORTE_ERR_VALUE_OUT_OF_BOUNDS;
        break;
    }

 CLEANUP:
    /* the old "if (rc != SUCCESS) goto DEPART" here was a no-op -
     * control falls into DEPART either way */
 DEPART:
    /* see if an error occurred - if so, wakeup the HNP so we can exit */
    if (ORTE_PROC_IS_HNP && ORTE_SUCCESS != rc) {
        jdata = NULL;
        ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
    }

    OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
                         "%s plm:base:receive done processing commands",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
}
static void job_errors(int fd, short args, void *cbdata) { orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata; orte_job_t *jdata; orte_job_state_t jobstate; int rc; orte_plm_cmd_flag_t cmd; opal_buffer_t *alert; /* * if orte is trying to shutdown, just let it */ if (orte_finalizing) { return; } /* if the jdata is NULL, then we abort as this * is reporting an unrecoverable error */ if (NULL == caddy->jdata) { ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_FORCED_EXIT); OBJ_RELEASE(caddy); return; } /* update the state */ jdata = caddy->jdata; jobstate = caddy->job_state; jdata->state = jobstate; OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_framework.framework_output, "%s errmgr:default_orted: job %s reported error state %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_JOBID_PRINT(jdata->jobid), orte_job_state_to_str(jobstate))); switch (jobstate) { case ORTE_JOB_STATE_FAILED_TO_START: failed_start(jdata); break; case ORTE_JOB_STATE_COMM_FAILED: /* kill all local procs */ killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD); /* order termination */ ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); goto cleanup; break; case ORTE_JOB_STATE_HEARTBEAT_FAILED: /* let the HNP handle this */ goto cleanup; break; default: break; } alert = OBJ_NEW(opal_buffer_t); /* pack update state command */ cmd = ORTE_PLM_UPDATE_PROC_STATE; if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) { ORTE_ERROR_LOG(rc); OBJ_RELEASE(alert); goto cleanup; } /* pack the job info */ if (ORTE_SUCCESS != (rc = pack_state_update(alert, jdata))) { ORTE_ERROR_LOG(rc); OBJ_RELEASE(alert); goto cleanup; } /* send it */ if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert, ORTE_RML_TAG_PLM, orte_rml_send_callback, NULL))) { ORTE_ERROR_LOG(rc); OBJ_RELEASE(alert); } cleanup: OBJ_RELEASE(caddy); }
/* Launch the ORTE daemons (orteds) for a job via Cray ALPS "aprun".
 *
 * Builds the aprun command line (process count, node list, custom args),
 * appends the standard orted options, and execs the daemons via
 * plm_alps_start_proc().  Consumes the state caddy on every path.
 *
 * NOTE(review): this function reads/writes `failed_launch` and calls
 * plm_alps_start_proc(), neither of which is visible here - presumably
 * file-scope static state/helpers (failed_launch is assumed to be
 * initialized true so the error paths trigger the forced terminate at
 * the bottom - TODO confirm). */
static void launch_daemons(int fd, short args, void *cbdata)
{
    orte_job_map_t *map;
    char *jobid_string = NULL;
    char *param;
    char **argv = NULL;
    int argc;
    int rc;
    char *tmp;
    char** env = NULL;
    char *nodelist_flat;
    char **nodelist_argv;
    int nodelist_argc;
    char *vpid_string;
    char **custom_strings;
    int num_args, i;
    char *cur_prefix;
    int proc_vpid_index;
    orte_app_context_t *app;
    orte_node_t *node;
    orte_std_cntr_t nnode;
    orte_job_t *daemons;
    orte_state_caddy_t *state = (orte_state_caddy_t*)cbdata;

    /* if we are launching debugger daemons, then just go
     * do it - no new daemons will be launched */
    if (ORTE_FLAG_TEST(state->jdata, ORTE_JOB_FLAG_DEBUGGER_DAEMON)) {
        state->jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED;
        ORTE_ACTIVATE_JOB_STATE(state->jdata, ORTE_JOB_STATE_DAEMONS_REPORTED);
        OBJ_RELEASE(state);
        return;
    }

    /* start by setting up the virtual machine */
    daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
    /* NOTE(review): daemons is not NULL-checked before daemons->map below -
     * presumably the daemon job object always exists here; verify */
    if (ORTE_SUCCESS != (rc = orte_plm_base_setup_virtual_machine(state->jdata))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    /* if we don't want to launch, then don't attempt to
     * launch the daemons - the user really wants to just
     * look at the proposed process map
     */
    if (orte_do_not_launch) {
        /* set the state to indicate the daemons reported - this
         * will trigger the daemons_reported event and cause the
         * job to move to the following step
         */
        state->jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED;
        ORTE_ACTIVATE_JOB_STATE(state->jdata, ORTE_JOB_STATE_DAEMONS_REPORTED);
        OBJ_RELEASE(state);
        return;
    }

    /* Get the map for this job */
    if (NULL == (map = daemons->map)) {
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        rc = ORTE_ERR_NOT_FOUND;
        goto cleanup;
    }

    if (0 == map->num_new_daemons) {
        /* set the state to indicate the daemons reported - this
         * will trigger the daemons_reported event and cause the
         * job to move to the following step
         */
        OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
                             "%s plm:alps: no new daemons to launch",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        state->jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED;
        ORTE_ACTIVATE_JOB_STATE(state->jdata, ORTE_JOB_STATE_DAEMONS_REPORTED);
        OBJ_RELEASE(state);
        return;
    }

    /* need integer value for command line parameter */
    orte_util_convert_jobid_to_string(&jobid_string, daemons->jobid);

    /*
     * start building argv array
     */
    argv = NULL;
    argc = 0;

    /*
     * ALPS aprun OPTIONS
     */

    /* add the aprun command */
    opal_argv_append(&argc, &argv, mca_plm_alps_component.aprun_cmd);

    /* Append user defined arguments to aprun */
    if ( NULL != mca_plm_alps_component.custom_args ) {
        custom_strings = opal_argv_split(mca_plm_alps_component.custom_args, ' ');
        num_args       = opal_argv_count(custom_strings);
        for (i = 0; i < num_args; ++i) {
            opal_argv_append(&argc, &argv, custom_strings[i]);
        }
        opal_argv_free(custom_strings);
    }

    /* number of processors needed */
    opal_argv_append(&argc, &argv, "-n");
    asprintf(&tmp, "%lu", (unsigned long) map->num_new_daemons);
    opal_argv_append(&argc, &argv, tmp);
    free(tmp);
    /* one daemon per node, no CPU binding by aprun */
    opal_argv_append(&argc, &argv, "-N");
    opal_argv_append(&argc, &argv, "1");
    opal_argv_append(&argc, &argv, "-cc");
    opal_argv_append(&argc, &argv, "none");

    /* create nodelist - only nodes that do not already host a daemon */
    nodelist_argv = NULL;
    nodelist_argc = 0;

    for (nnode=0; nnode < map->nodes->size; nnode++) {
        if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, nnode))) {
            continue;
        }

        /* if the daemon already exists on this node, then
         * don't include it
         */
        if (ORTE_FLAG_TEST(node, ORTE_NODE_FLAG_DAEMON_LAUNCHED)) {
            continue;
        }

        /* otherwise, add it to the list of nodes upon which
         * we need to launch a daemon
         */
        opal_argv_append(&nodelist_argc, &nodelist_argv, node->name);
    }
    if (0 == opal_argv_count(nodelist_argv)) {
        orte_show_help("help-plm-alps.txt", "no-hosts-in-list", true);
        rc = ORTE_ERR_FAILED_TO_START;
        goto cleanup;
    }
    nodelist_flat = opal_argv_join(nodelist_argv, ',');
    opal_argv_free(nodelist_argv);

    /* if we are using all allocated nodes, then alps
     * doesn't need a nodelist, or if running without
     * a batch scheduler
     */
    if ((map->num_new_daemons < orte_num_allocated_nodes) || (orte_num_allocated_nodes == 0)) {
        opal_argv_append(&argc, &argv, "-L");
        opal_argv_append(&argc, &argv, nodelist_flat);
    }


    /*
     * ORTED OPTIONS
     */

    /* add the daemon command (as specified by user) */
    orte_plm_base_setup_orted_cmd(&argc, &argv);

    /* Add basic orted command line options, including debug flags */
    orte_plm_base_orted_append_basic_args(&argc, &argv,
                                          NULL,
                                          &proc_vpid_index,
                                          nodelist_flat);
    free(nodelist_flat);

    /* tell the new daemons the base of the name list so they can compute
     * their own name on the other end
     */
    rc = orte_util_convert_vpid_to_string(&vpid_string, map->daemon_vpid_start);
    if (ORTE_SUCCESS != rc) {
        opal_output(0, "plm_alps: unable to create process name");
        goto cleanup;
    }

    /* substitute the starting vpid into the placeholder slot that
     * orted_append_basic_args reserved for it */
    free(argv[proc_vpid_index]);
    argv[proc_vpid_index] = strdup(vpid_string);
    free(vpid_string);

    if (mca_plm_alps_component.debug) {
        param = opal_argv_join(argv, ' ');
        if (NULL != param) {
            opal_output(0, "plm:alps: final top-level argv:");
            opal_output(0, "plm:alps:     %s", param);
            free(param);
        }
    }

    /* Copy the prefix-directory specified in the
       corresponding app_context.  If there are multiple,
       different prefix's in the app context, complain (i.e., only
       allow one --prefix option for the entire alps run -- we
       don't support different --prefix'es for different nodes in
       the ALPS plm) */
    cur_prefix = NULL;
    for (i=0; i < state->jdata->apps->size; i++) {
        char *app_prefix_dir = NULL;
        if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(state->jdata->apps, i))) {
            continue;
        }
        orte_get_attribute(&app->attributes, ORTE_APP_PREFIX_DIR, (void**)&app_prefix_dir, OPAL_STRING);
        /* Check for already set cur_prefix_dir -- if different,
           complain */
        if (NULL != app_prefix_dir) {
            if (NULL != cur_prefix &&
                0 != strcmp (cur_prefix, app_prefix_dir)) {
                orte_show_help("help-plm-alps.txt", "multiple-prefixes",
                               true, cur_prefix, app_prefix_dir);
                /* NOTE(review): rc is not set to an error here, and
                 * cur_prefix/app_prefix_dir are not freed - confirm intended */
                goto cleanup;
            }

            /* If not yet set, copy it; iff set, then it's the
               same anyway */
            if (NULL == cur_prefix) {
                cur_prefix = strdup(app_prefix_dir);
                if (mca_plm_alps_component.debug) {
                    opal_output (0, "plm:alps: Set prefix:%s",
                                 cur_prefix);
                }
            }
            free(app_prefix_dir);
        }
    }
    /* NOTE(review): cur_prefix is never freed - it is passed to
     * plm_alps_start_proc below; ownership unclear, possible leak */

    /* protect the args in case someone has a script wrapper around aprun */
    mca_base_cmd_line_wrap_args(argv);

    /* setup environment */
    env = opal_argv_copy(orte_launch_environ);

    if (0 < opal_output_get_verbosity(orte_plm_base_framework.framework_output)) {
        param = opal_argv_join(argv, ' ');
        OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
                             "%s plm:alps: final top-level argv:\n\t%s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             (NULL == param) ? "NULL" : param));
        if (NULL != param) free(param);
    }

    /* exec the daemon(s) */
    if (ORTE_SUCCESS != (rc = plm_alps_start_proc(argc, argv, env, cur_prefix))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    /* indicate that the daemons for this job were launched */
    state->jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED;
    daemons->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED;

    /* flag that launch was successful, so far as we currently know */
    failed_launch = false;

 cleanup:
    if (NULL != argv) {
        opal_argv_free(argv);
    }
    if (NULL != env) {
        opal_argv_free(env);
    }

    if(NULL != jobid_string) {
        free(jobid_string);
    }

    /* cleanup the caddy */
    OBJ_RELEASE(state);

    /* check for failed launch - if so, force terminate */
    if (failed_launch) {
        ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
    }
}
static void proc_errors(int fd, short args, void *cbdata) { orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata; orte_job_t *jdata; orte_process_name_t *proc = &caddy->name; orte_proc_state_t state = caddy->proc_state; orte_proc_t *child, *ptr; opal_buffer_t *alert; orte_plm_cmd_flag_t cmd; int rc=ORTE_SUCCESS; int i; OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output, "%s errmgr:default_orted:proc_errors process %s error state %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc), orte_proc_state_to_str(state))); /* * if orte is trying to shutdown, just let it */ if (orte_finalizing) { OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output, "%s errmgr:default_orted:proc_errors finalizing - ignoring error", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); goto cleanup; } /* if this is a heartbeat failure, let the HNP handle it */ if (ORTE_PROC_STATE_HEARTBEAT_FAILED == state) { OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output, "%s errmgr:default_orted:proc_errors heartbeat failed - ignoring error", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); goto cleanup; } /* if this was a failed comm, then see if it was to our * lifeline */ if (ORTE_PROC_STATE_LIFELINE_LOST == state || ORTE_PROC_STATE_UNABLE_TO_SEND_MSG == state) { OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output, "%s errmgr:orted lifeline lost - exiting", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); /* set our exit status */ ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE); /* kill our children */ killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD); /* terminate - our routed children will see * us leave and automatically die */ ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); goto cleanup; } /* get the job object */ if (NULL == (jdata = orte_get_job_data_object(proc->jobid))) { /* must already be complete */ OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output, "%s errmgr:default_orted:proc_errors NULL jdata - ignoring error", 
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); goto cleanup; } if (ORTE_PROC_STATE_COMM_FAILED == state) { /* if it is our own connection, ignore it */ if (OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, ORTE_PROC_MY_NAME, proc)) { OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output, "%s errmgr:default_orted:proc_errors comm_failed to self - ignoring error", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); goto cleanup; } /* was it a daemon? */ if (proc->jobid != ORTE_PROC_MY_NAME->jobid) { /* nope - we can't seem to trust that we will catch the waitpid * in this situation, so push this over to be handled as if * it were a waitpid trigger so we don't create a bunch of * duplicate code */ OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output, "%s errmgr:default_orted:proc_errors comm_failed to non-daemon - handling as waitpid", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); /* get the proc_t */ if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid))) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); goto cleanup; } /* leave the exit code alone - process this as a waitpid */ odls_base_default_wait_local_proc(child, NULL); goto cleanup; } OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output, "%s errmgr:default:orted daemon %s exited", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc))); /* if we are using static ports, then it is possible that the HNP * will not see this termination. 
So if the HNP didn't order us * to terminate, then we should ensure it knows */ if (orte_static_ports && !orte_orteds_term_ordered) { /* send an alert to the HNP */ alert = OBJ_NEW(opal_buffer_t); /* pack update state command */ cmd = ORTE_PLM_UPDATE_PROC_STATE; if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) { ORTE_ERROR_LOG(rc); return; } /* get the proc_t */ if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid))) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); goto cleanup; } /* set the exit code to reflect the problem */ child->exit_code = ORTE_ERR_COMM_FAILURE; /* pack only the data for this daemon - have to start with the jobid * so the receiver can unpack it correctly */ if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &proc->jobid, 1, ORTE_JOBID))) { ORTE_ERROR_LOG(rc); return; } /* now pack the daemon's info */ if (ORTE_SUCCESS != (rc = pack_state_for_proc(alert, child))) { ORTE_ERROR_LOG(rc); return; } /* send it */ OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output, "%s errmgr:default_orted reporting lost connection to daemon %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc))); if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert, ORTE_RML_TAG_PLM, orte_rml_send_callback, NULL))) { ORTE_ERROR_LOG(rc); OBJ_RELEASE(alert); } /* mark that we notified the HNP for this job so we don't do it again */ orte_set_attribute(&jdata->attributes, ORTE_JOB_FAIL_NOTIFIED, ORTE_ATTR_LOCAL, NULL, OPAL_BOOL); /* continue on */ goto cleanup; } if (orte_orteds_term_ordered) { /* are any of my children still alive */ for (i=0; i < orte_local_children->size; i++) { if (NULL != (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) { if (ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_ALIVE)) { OPAL_OUTPUT_VERBOSE((5, orte_state_base_framework.framework_output, "%s errmgr:default:orted[%s(%d)] proc %s is alive", 
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), __FILE__, __LINE__, ORTE_NAME_PRINT(&child->name))); goto cleanup; } } } /* if all my routes and children are gone, then terminate ourselves nicely (i.e., this is a normal termination) */ if (0 == orte_routed.num_routes()) { OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output, "%s errmgr:default:orted all routes gone - exiting", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED); } else { OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output, "%s errmgr:default:orted not exiting, num_routes() == %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (int)orte_routed.num_routes())); } } /* if not, then we can continue */ goto cleanup; } if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid))) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); goto cleanup; } /* if this is not a local proc for this job, we can * ignore this call */ if (!ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_LOCAL)) { OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output, "%s errmgr:default_orted:proc_errors proc is not local - ignoring error", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); goto cleanup; } OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output, "%s errmgr:default_orted got state %s for proc %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), orte_proc_state_to_str(state), ORTE_NAME_PRINT(proc))); if (ORTE_PROC_STATE_TERM_NON_ZERO == state) { /* update the state */ child->state = state; /* report this as abnormal termination to the HNP, unless we already have * done so for this job */ if (!orte_get_attribute(&jdata->attributes, ORTE_JOB_FAIL_NOTIFIED, NULL, OPAL_BOOL)) { alert = OBJ_NEW(opal_buffer_t); /* pack update state command */ cmd = ORTE_PLM_UPDATE_PROC_STATE; if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) { ORTE_ERROR_LOG(rc); return; } /* pack only the data for this proc 
- have to start with the jobid * so the receiver can unpack it correctly */ if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &proc->jobid, 1, ORTE_JOBID))) { ORTE_ERROR_LOG(rc); return; } /* now pack the child's info */ if (ORTE_SUCCESS != (rc = pack_state_for_proc(alert, child))) { ORTE_ERROR_LOG(rc); return; } /* send it */ OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output, "%s errmgr:default_orted reporting proc %s abnormally terminated with non-zero status (local procs = %d)", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&child->name), jdata->num_local_procs)); if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert, ORTE_RML_TAG_PLM, orte_rml_send_callback, NULL))) { ORTE_ERROR_LOG(rc); OBJ_RELEASE(alert); } /* mark that we notified the HNP for this job so we don't do it again */ orte_set_attribute(&jdata->attributes, ORTE_JOB_FAIL_NOTIFIED, ORTE_ATTR_LOCAL, NULL, OPAL_BOOL); } /* if the proc has terminated, notify the state machine */ if (ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_IOF_COMPLETE) && ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_WAITPID) && !ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_RECORDED)) { ORTE_ACTIVATE_PROC_STATE(proc, ORTE_PROC_STATE_TERMINATED); } goto cleanup; } if (ORTE_PROC_STATE_FAILED_TO_START == state || ORTE_PROC_STATE_FAILED_TO_LAUNCH == state) { /* update the proc state */ child->state = state; /* count the proc as having "terminated" */ jdata->num_terminated++; /* leave the error report in this case to the * state machine, which will receive notice * when all local procs have attempted to start * so that we send a consolidated error report * back to the HNP */ goto cleanup; } if (ORTE_PROC_STATE_TERMINATED < state) { /* if we were ordered to terminate, see if * any of our routes or local children remain alive - if not, then * terminate ourselves. */ if (orte_orteds_term_ordered) { /* mark the child as no longer alive and update the counters, if necessary. 
* we have to do this here as we aren't going to send this to the state * machine, and we want to keep the bookkeeping accurate just in case */ if (ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_ALIVE)) { ORTE_FLAG_UNSET(child, ORTE_PROC_FLAG_ALIVE); } if (!ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_RECORDED)) { ORTE_FLAG_SET(child, ORTE_PROC_FLAG_RECORDED); jdata->num_terminated++; } for (i=0; i < orte_local_children->size; i++) { if (NULL != (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) { if (ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_ALIVE)) { goto keep_going; } } } /* if all my routes and children are gone, then terminate ourselves nicely (i.e., this is a normal termination) */ if (0 == orte_routed.num_routes()) { OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output, "%s errmgr:default:orted all routes gone - exiting", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED); } /* no need to alert the HNP - we are already on our way out */ goto cleanup; } keep_going: /* if the job hasn't completed and the state is abnormally * terminated, then we need to alert the HNP right away - but * only do this once! 
*/ if (!orte_get_attribute(&jdata->attributes, ORTE_JOB_FAIL_NOTIFIED, NULL, OPAL_BOOL)) { alert = OBJ_NEW(opal_buffer_t); /* pack update state command */ cmd = ORTE_PLM_UPDATE_PROC_STATE; if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) { ORTE_ERROR_LOG(rc); return; } /* pack only the data for this proc - have to start with the jobid * so the receiver can unpack it correctly */ if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &proc->jobid, 1, ORTE_JOBID))) { ORTE_ERROR_LOG(rc); return; } child->state = state; /* now pack the child's info */ if (ORTE_SUCCESS != (rc = pack_state_for_proc(alert, child))) { ORTE_ERROR_LOG(rc); return; } OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output, "%s errmgr:default_orted reporting proc %s aborted to HNP (local procs = %d)", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&child->name), jdata->num_local_procs)); /* send it */ if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert, ORTE_RML_TAG_PLM, orte_rml_send_callback, NULL))) { ORTE_ERROR_LOG(rc); } /* mark that we notified the HNP for this job so we don't do it again */ orte_set_attribute(&jdata->attributes, ORTE_JOB_FAIL_NOTIFIED, ORTE_ATTR_LOCAL, NULL, OPAL_BOOL); } /* if the proc has terminated, notify the state machine */ if (ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_IOF_COMPLETE) && ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_WAITPID) && !ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_RECORDED)) { ORTE_ACTIVATE_PROC_STATE(proc, ORTE_PROC_STATE_TERMINATED); } goto cleanup; } /* only other state is terminated - see if anyone is left alive */ if (!any_live_children(proc->jobid)) { alert = OBJ_NEW(opal_buffer_t); /* pack update state command */ cmd = ORTE_PLM_UPDATE_PROC_STATE; if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) { ORTE_ERROR_LOG(rc); return; } /* pack the data for the job */ if (ORTE_SUCCESS != (rc = pack_state_update(alert, jdata))) { ORTE_ERROR_LOG(rc); return; } OPAL_OUTPUT_VERBOSE((5, 
orte_errmgr_base_framework.framework_output, "%s errmgr:default_orted reporting all procs in %s terminated", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_JOBID_PRINT(jdata->jobid))); /* remove all of this job's children from the global list */ for (i=0; i < orte_local_children->size; i++) { if (NULL == (ptr = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) { continue; } if (jdata->jobid == ptr->name.jobid) { opal_pointer_array_set_item(orte_local_children, i, NULL); OBJ_RELEASE(ptr); } } /* ensure the job's local session directory tree is removed */ orte_session_dir_cleanup(jdata->jobid); /* remove this job from our local job data since it is complete */ opal_pointer_array_set_item(orte_job_data, ORTE_LOCAL_JOBID(jdata->jobid), NULL); OBJ_RELEASE(jdata); /* send it */ if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert, ORTE_RML_TAG_PLM, orte_rml_send_callback, NULL))) { ORTE_ERROR_LOG(rc); } return; } cleanup: OBJ_RELEASE(caddy); }