static int setup_child(orte_job_t *jdata,
                       orte_proc_t *child,
                       orte_app_context_t *app)
{
    char *param, *value;
    int rc, i;
    int32_t nrestarts=0, *nrptr;
    bool takeus = false;

    /* see if we are included */
    for (i=0; NULL != jdata->personality[i]; i++) {
        if (0 == strcmp(jdata->personality[i], "ompi")) {
            takeus = true;
            break;
        }
    }
    if (!takeus) {
        return ORTE_ERR_TAKE_NEXT_OPTION;
    }

    /* setup the jobid */
    if (ORTE_SUCCESS != (rc = orte_util_convert_jobid_to_string(&value, child->name.jobid))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }
    opal_setenv("OMPI_MCA_ess_base_jobid", value, true, &app->env);
    free(value);

    /* setup the vpid */
    if (ORTE_SUCCESS != (rc = orte_util_convert_vpid_to_string(&value, child->name.vpid))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }
    opal_setenv("OMPI_MCA_ess_base_vpid", value, true, &app->env);

    /* although the vpid IS the process' rank within the job, users
     * would appreciate being given a public environmental variable
     * that also represents this value - something MPI specific - so
     * do that here.
     *
     * AND YES - THIS BREAKS THE ABSTRACTION BARRIER TO SOME EXTENT.
     * We know - just live with it
     */
    opal_setenv("OMPI_COMM_WORLD_RANK", value, true, &app->env);
    free(value);  /* done with this now */

    /* users would appreciate being given a public environmental variable
     * that also represents the local rank value - something MPI specific - so
     * do that here.
     *
     * AND YES - THIS BREAKS THE ABSTRACTION BARRIER TO SOME EXTENT.
     * We know - just live with it
     */
    if (ORTE_LOCAL_RANK_INVALID == child->local_rank) {
        ORTE_ERROR_LOG(ORTE_ERR_VALUE_OUT_OF_BOUNDS);
        rc = ORTE_ERR_VALUE_OUT_OF_BOUNDS;
        return rc;
    }
    asprintf(&value, "%lu", (unsigned long) child->local_rank);
    opal_setenv("OMPI_COMM_WORLD_LOCAL_RANK", value, true, &app->env);
    free(value);

    /* users would appreciate being given a public environmental variable
     * that also represents the node rank value - something MPI specific - so
     * do that here.
     *
     * AND YES - THIS BREAKS THE ABSTRACTION BARRIER TO SOME EXTENT.
     * We know - just live with it
     */
    if (ORTE_NODE_RANK_INVALID == child->node_rank) {
        ORTE_ERROR_LOG(ORTE_ERR_VALUE_OUT_OF_BOUNDS);
        rc = ORTE_ERR_VALUE_OUT_OF_BOUNDS;
        return rc;
    }
    asprintf(&value, "%lu", (unsigned long) child->node_rank);
    opal_setenv("OMPI_COMM_WORLD_NODE_RANK", value, true, &app->env);
    /* set an mca param for it too */
    opal_setenv("OMPI_MCA_orte_ess_node_rank", value, true, &app->env);
    free(value);

    /* provide the identifier for the PMIx connection - the
     * PMIx connection is made prior to setting the process
     * name itself. Although in most cases the ID and the
     * process name are the same, it isn't necessarily
     * required
     */
    orte_util_convert_process_name_to_string(&value, &child->name);
    opal_setenv("PMIX_ID", value, true, &app->env);
    free(value);

    nrptr = &nrestarts;
    if (orte_get_attribute(&child->attributes, ORTE_PROC_NRESTARTS, (void**)&nrptr, OPAL_INT32)) {
        /* pass the number of restarts for this proc - will be zero for
         * an initial start, but procs would like to know if they are being
         * restarted so they can take appropriate action
         */
        asprintf(&value, "%d", nrestarts);
        opal_setenv("OMPI_MCA_orte_num_restarts", value, true, &app->env);
        free(value);
    }

    /* if the proc should not barrier in orte_init, tell it */
    if (orte_get_attribute(&child->attributes, ORTE_PROC_NOBARRIER, NULL, OPAL_BOOL)
        || 0 < nrestarts) {
        opal_setenv("OMPI_MCA_orte_do_not_barrier", "1", true, &app->env);
    }

    /* if we are using staged execution, tell it */
    if (orte_staged_execution) {
        opal_setenv("OMPI_MCA_orte_staged_execution", "1", true, &app->env);
    }

    /* if the proc isn't going to forward IO, then we need to flag that
     * it has "completed" iof termination as otherwise it will never fire
     */
    if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_FORWARD_OUTPUT)) {
        ORTE_FLAG_SET(child, ORTE_PROC_FLAG_IOF_COMPLETE);
    }

    /* construct the proc's session dir name */
    if (NULL != orte_process_info.tmpdir_base) {
        value = strdup(orte_process_info.tmpdir_base);
    } else {
        value = NULL;
    }
    param = NULL;
    if (ORTE_SUCCESS != (rc = orte_session_dir_get_name(&param, &value, NULL,
                                                        orte_process_info.nodename,
                                                        NULL, &child->name))) {
        ORTE_ERROR_LOG(rc);
        if (NULL != value) {
            free(value);
        }
        return rc;
    }
    free(value);
    /* pass an envar so the proc can find any files it had prepositioned */
    opal_setenv("OMPI_FILE_LOCATION", param, true, &app->env);

    /* if the user wanted the cwd to be the proc's session dir, then
     * switch to that location now
     */
    if (orte_get_attribute(&app->attributes, ORTE_APP_SSNDIR_CWD, NULL, OPAL_BOOL)) {
        /* create the session dir - may not exist */
        if (OPAL_SUCCESS != (rc = opal_os_dirpath_create(param, S_IRWXU))) {
            ORTE_ERROR_LOG(rc);
            /* doesn't exist with correct permissions, and/or we can't
             * create it - either way, we are done
             */
            free(param);
            return rc;
        }
        /* change to it */
        if (0 != chdir(param)) {
            free(param);
            return ORTE_ERROR;
        }
        /* It seems that chdir doesn't adjust the $PWD enviro variable when
         * it changes the directory. This can cause a user to get a different
         * response when doing getcwd vs looking at the enviro variable. To
         * keep this consistent, we explicitly ensure that the PWD enviro
         * variable matches the CWD we moved to.
         *
         * NOTE: if a user's program does a chdir(), then $PWD will once
         * again not match getcwd! This is beyond our control - we are only
         * ensuring they start out matching.
         */
        opal_setenv("PWD", param, true, &app->env);
        /* update the initial wdir value too */
        opal_setenv("OMPI_MCA_initial_wdir", param, true, &app->env);
    }
    free(param);

    return ORTE_SUCCESS;
}
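/*
 * Standalone illustration (not part of the original source) of the
 * chdir()/$PWD mismatch that setup_child() compensates for above:
 * chdir() changes the kernel's notion of the working directory, but
 * the PWD environment variable only follows if we update it ourselves.
 */
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <limits.h>

int main(void)
{
    char buf[PATH_MAX];
    const char *pwd;

    if (0 != chdir("/tmp")) {
        perror("chdir");
        return 1;
    }
    /* getcwd() now reports the new directory, but $PWD is stale */
    pwd = getenv("PWD");
    printf("getcwd: %s\nPWD:    %s\n",
           getcwd(buf, sizeof(buf)), (NULL != pwd) ? pwd : "(unset)");
    /* keep the two consistent, as the launcher does for its children */
    setenv("PWD", buf, 1);
    printf("PWD:    %s\n", getenv("PWD"));
    return 0;
}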
static void launch_daemons(int fd, short args, void *cbdata)
{
    orte_job_map_t *map;
    char *jobid_string = NULL;
    char *param;
    char **argv = NULL;
    int argc;
    int rc;
    char *tmp;
    char** env = NULL;
    char *nodelist_flat;
    char **nodelist_argv;
    int nodelist_argc;
    char *vpid_string;
    char **custom_strings;
    int num_args, i;
    char *cur_prefix;
    int proc_vpid_index;
    bool failed_launch = true;  /* flag the launch as failed by default;
                                 * tested at "cleanup" below */
    orte_app_context_t *app;
    orte_node_t *node;
    orte_std_cntr_t nnode;
    orte_job_t *daemons;
    orte_state_caddy_t *state = (orte_state_caddy_t*)cbdata;

    /* if we are launching debugger daemons, then just go
     * do it - no new daemons will be launched
     */
    if (ORTE_FLAG_TEST(state->jdata, ORTE_JOB_FLAG_DEBUGGER_DAEMON)) {
        state->jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED;
        ORTE_ACTIVATE_JOB_STATE(state->jdata, ORTE_JOB_STATE_DAEMONS_REPORTED);
        OBJ_RELEASE(state);
        return;
    }

    /* start by setting up the virtual machine */
    daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
    if (ORTE_SUCCESS != (rc = orte_plm_base_setup_virtual_machine(state->jdata))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    /* if we don't want to launch, then don't attempt to
     * launch the daemons - the user really wants to just
     * look at the proposed process map
     */
    if (orte_do_not_launch) {
        /* set the state to indicate the daemons reported - this
         * will trigger the daemons_reported event and cause the
         * job to move to the following step
         */
        state->jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED;
        ORTE_ACTIVATE_JOB_STATE(state->jdata, ORTE_JOB_STATE_DAEMONS_REPORTED);
        OBJ_RELEASE(state);
        return;
    }

    /* Get the map for this job */
    if (NULL == (map = daemons->map)) {
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        rc = ORTE_ERR_NOT_FOUND;
        goto cleanup;
    }

    if (0 == map->num_new_daemons) {
        /* set the state to indicate the daemons reported - this
         * will trigger the daemons_reported event and cause the
         * job to move to the following step
         */
        OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
                             "%s plm:alps: no new daemons to launch",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        state->jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED;
        ORTE_ACTIVATE_JOB_STATE(state->jdata, ORTE_JOB_STATE_DAEMONS_REPORTED);
        OBJ_RELEASE(state);
        return;
    }

    /* need integer value for command line parameter */
    orte_util_convert_jobid_to_string(&jobid_string, daemons->jobid);

    /*
     * start building argv array
     */
    argv = NULL;
    argc = 0;

    /*
     * ALPS aprun OPTIONS
     */

    /* add the aprun command */
    opal_argv_append(&argc, &argv, mca_plm_alps_component.aprun_cmd);

    /* Append user defined arguments to aprun */
    if (NULL != mca_plm_alps_component.custom_args) {
        custom_strings = opal_argv_split(mca_plm_alps_component.custom_args, ' ');
        num_args = opal_argv_count(custom_strings);
        for (i = 0; i < num_args; ++i) {
            opal_argv_append(&argc, &argv, custom_strings[i]);
        }
        opal_argv_free(custom_strings);
    }

    /* number of processors needed */
    opal_argv_append(&argc, &argv, "-n");
    asprintf(&tmp, "%lu", (unsigned long) map->num_new_daemons);
    opal_argv_append(&argc, &argv, tmp);
    free(tmp);
    opal_argv_append(&argc, &argv, "-N");
    opal_argv_append(&argc, &argv, "1");
    opal_argv_append(&argc, &argv, "-cc");
    opal_argv_append(&argc, &argv, "none");

    /* create nodelist */
    nodelist_argv = NULL;
    nodelist_argc = 0;
    for (nnode=0; nnode < map->nodes->size; nnode++) {
        if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, nnode))) {
            continue;
        }
        /* if the daemon already exists on this node, then
         * don't include it
         */
        if (ORTE_FLAG_TEST(node, ORTE_NODE_FLAG_DAEMON_LAUNCHED)) {
            continue;
        }
        /* otherwise, add it to the list of nodes upon which
         * we need to launch a daemon
         */
        opal_argv_append(&nodelist_argc, &nodelist_argv, node->name);
    }
    if (0 == opal_argv_count(nodelist_argv)) {
        orte_show_help("help-plm-alps.txt", "no-hosts-in-list", true);
        rc = ORTE_ERR_FAILED_TO_START;
        goto cleanup;
    }
    nodelist_flat = opal_argv_join(nodelist_argv, ',');
    opal_argv_free(nodelist_argv);

    /* if we are using all allocated nodes, then alps doesn't need a
     * nodelist - pass one only when launching on a subset of the
     * allocation, or when running without a batch scheduler
     */
    if ((map->num_new_daemons < orte_num_allocated_nodes) ||
        (orte_num_allocated_nodes == 0)) {
        opal_argv_append(&argc, &argv, "-L");
        opal_argv_append(&argc, &argv, nodelist_flat);
    }

    /*
     * ORTED OPTIONS
     */

    /* add the daemon command (as specified by user) */
    orte_plm_base_setup_orted_cmd(&argc, &argv);

    /* Add basic orted command line options, including debug flags */
    orte_plm_base_orted_append_basic_args(&argc, &argv, NULL,
                                          &proc_vpid_index, nodelist_flat);
    free(nodelist_flat);

    /* tell the new daemons the base of the name list so they can compute
     * their own name on the other end
     */
    rc = orte_util_convert_vpid_to_string(&vpid_string, map->daemon_vpid_start);
    if (ORTE_SUCCESS != rc) {
        opal_output(0, "plm_alps: unable to create process name");
        goto cleanup;
    }
    free(argv[proc_vpid_index]);
    argv[proc_vpid_index] = strdup(vpid_string);
    free(vpid_string);

    if (mca_plm_alps_component.debug) {
        param = opal_argv_join(argv, ' ');
        if (NULL != param) {
            opal_output(0, "plm:alps: final top-level argv:");
            opal_output(0, "plm:alps: %s", param);
            free(param);
        }
    }

    /* Copy the prefix-directory specified in the corresponding
     * app_context. If there are multiple, different prefixes in the
     * app context, complain (i.e., only allow one --prefix option for
     * the entire alps run -- we don't support different --prefix'es
     * for different nodes in the ALPS plm)
     */
    cur_prefix = NULL;
    for (i=0; i < state->jdata->apps->size; i++) {
        char *app_prefix_dir = NULL;
        if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(state->jdata->apps, i))) {
            continue;
        }
        orte_get_attribute(&app->attributes, ORTE_APP_PREFIX_DIR,
                           (void**)&app_prefix_dir, OPAL_STRING);
        /* Check for already set cur_prefix_dir -- if different, complain */
        if (NULL != app_prefix_dir) {
            if (NULL != cur_prefix &&
                0 != strcmp(cur_prefix, app_prefix_dir)) {
                orte_show_help("help-plm-alps.txt", "multiple-prefixes",
                               true, cur_prefix, app_prefix_dir);
                goto cleanup;
            }
            /* If not yet set, copy it; if set, then it's the same anyway */
            if (NULL == cur_prefix) {
                cur_prefix = strdup(app_prefix_dir);
                if (mca_plm_alps_component.debug) {
                    opal_output(0, "plm:alps: Set prefix:%s", cur_prefix);
                }
            }
            free(app_prefix_dir);
        }
    }

    /* protect the args in case someone has a script wrapper around aprun */
    mca_base_cmd_line_wrap_args(argv);

    /* setup environment */
    env = opal_argv_copy(orte_launch_environ);

    if (0 < opal_output_get_verbosity(orte_plm_base_framework.framework_output)) {
        param = opal_argv_join(argv, ' ');
        OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
                             "%s plm:alps: final top-level argv:\n\t%s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             (NULL == param) ? "NULL" : param));
        if (NULL != param) {
            free(param);
        }
    }

    /* exec the daemon(s) */
    if (ORTE_SUCCESS != (rc = plm_alps_start_proc(argc, argv, env, cur_prefix))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    /* indicate that the daemons for this job were launched */
    state->jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED;
    daemons->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED;

    /* flag that launch was successful, so far as we currently know */
    failed_launch = false;

 cleanup:
    if (NULL != argv) {
        opal_argv_free(argv);
    }
    if (NULL != env) {
        opal_argv_free(env);
    }
    if (NULL != jobid_string) {
        free(jobid_string);
    }
    /* cleanup the caddy */
    OBJ_RELEASE(state);

    /* check for failed launch - if so, force terminate */
    if (failed_launch) {
        ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
    }
}
static void launch_daemons(int fd, short args, void *cbdata)
{
    orte_job_map_t *map;
    size_t num_nodes;
    char *param;
    char **argv = NULL;
    int argc;
    int rc;
    char** env = NULL;
    char **nodelist_argv;
    char *nodelist;
    int nodelist_argc;
    char *vpid_string;
    int i;
    char *cur_prefix;
    int proc_vpid_index = 0;
    bool failed_launch = true;
    orte_app_context_t *app;
    orte_node_t *node;
    orte_std_cntr_t nnode;
    orte_job_t *daemons;
    orte_state_caddy_t *state = (orte_state_caddy_t*)cbdata;
    orte_job_t *jdata = state->jdata;

    /* start by setting up the virtual machine */
    daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
    if (ORTE_SUCCESS != (rc = orte_plm_base_setup_virtual_machine(jdata))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    /* if we don't want to launch, then don't attempt to
     * launch the daemons - the user really wants to just
     * look at the proposed process map
     */
    if (orte_do_not_launch) {
        /* set the state to indicate the daemons reported - this
         * will trigger the daemons_reported event and cause the
         * job to move to the following step
         */
        state->jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED;
        ORTE_ACTIVATE_JOB_STATE(state->jdata, ORTE_JOB_STATE_DAEMONS_REPORTED);
        OBJ_RELEASE(state);
        return;
    }

    OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
                         "%s plm:lsf: launching vm",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    /* Get the map for this job */
    if (NULL == (map = daemons->map)) {
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        rc = ORTE_ERR_NOT_FOUND;
        goto cleanup;
    }

    num_nodes = map->num_new_daemons;
    if (0 == num_nodes) {
        /* set the state to indicate the daemons reported - this
         * will trigger the daemons_reported event and cause the
         * job to move to the following step
         */
        OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
                             "%s plm:lsf: no new daemons to launch",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        state->jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED;
        ORTE_ACTIVATE_JOB_STATE(state->jdata, ORTE_JOB_STATE_DAEMONS_REPORTED);
        OBJ_RELEASE(state);
        return;
    }

    /* create nodelist */
    nodelist_argv = NULL;
    nodelist_argc = 0;
    for (nnode=0; nnode < map->nodes->size; nnode++) {
        if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, nnode))) {
            continue;
        }
        /* if the daemon already exists on this node, then
         * don't include it
         */
        if (ORTE_FLAG_TEST(node, ORTE_NODE_FLAG_DAEMON_LAUNCHED)) {
            continue;
        }
        /* otherwise, add it to the list of nodes upon which
         * we need to launch a daemon
         */
        opal_argv_append(&nodelist_argc, &nodelist_argv, node->name);
    }
    nodelist = opal_argv_join(nodelist_argv, ',');

    /*
     * start building argv array
     */
    argv = NULL;
    argc = 0;

    /*
     * ORTED OPTIONS
     */

    /* add the daemon command (as specified by user) */
    orte_plm_base_setup_orted_cmd(&argc, &argv);

    /* Add basic orted command line options */
    orte_plm_base_orted_append_basic_args(&argc, &argv, "lsf",
                                          &proc_vpid_index, nodelist);
    free(nodelist);

    /* tell the new daemons the base of the name list so they can compute
     * their own name on the other end
     */
    rc = orte_util_convert_vpid_to_string(&vpid_string, map->daemon_vpid_start);
    if (ORTE_SUCCESS != rc) {
        opal_output(0, "plm_lsf: unable to get daemon vpid as string");
        goto cleanup;
    }
    free(argv[proc_vpid_index]);
    argv[proc_vpid_index] = strdup(vpid_string);
    free(vpid_string);

    if (0 < opal_output_get_verbosity(orte_plm_base_framework.framework_output)) {
        param = opal_argv_join(argv, ' ');
        if (NULL != param) {
            opal_output(0, "plm:lsf: final top-level argv:");
            opal_output(0, "plm:lsf: %s", param);
            free(param);
        }
    }

    /* Copy the prefix-directory specified in the corresponding
     * app_context. If there are multiple, different prefixes in the
     * app context, complain (i.e., only allow one --prefix option for
     * the entire lsf run -- we don't support different --prefix'es
     * for different nodes in the LSF plm)
     */
    cur_prefix = NULL;
    for (i=0; i < jdata->apps->size; i++) {
        char *app_prefix_dir = NULL;
        if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
            continue;
        }
        orte_get_attribute(&app->attributes, ORTE_APP_PREFIX_DIR,
                           (void**)&app_prefix_dir, OPAL_STRING);
        /* Check for already set cur_prefix_dir -- if different, complain */
        if (NULL != app_prefix_dir) {
            if (NULL != cur_prefix &&
                0 != strcmp(cur_prefix, app_prefix_dir)) {
                orte_show_help("help-plm-lsf.txt", "multiple-prefixes",
                               true, cur_prefix, app_prefix_dir);
                rc = ORTE_ERR_FAILED_TO_START;
                goto cleanup;
            }
            /* If not yet set, copy it; if set, then it's the same anyway */
            if (NULL == cur_prefix) {
                cur_prefix = strdup(app_prefix_dir);
                OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
                                     "%s plm:lsf: Set prefix:%s",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                     cur_prefix));
            }
            free(app_prefix_dir);
        }
    }

    /* setup environment */
    env = opal_argv_copy(orte_launch_environ);

    /* lsb_launch tampers with SIGCHLD.
     * After the call to lsb_launch, the signal handler for SIGCHLD is NULL.
     * So, we disable the SIGCHLD handler of libevent for the duration of
     * the call to lsb_launch
     */
    orte_wait_disable();

    /* exec the daemon(s). Do NOT wait for lsb_launch to complete as
     * it only completes when the processes it starts - in this case,
     * the orteds - complete. We need to go ahead and return so
     * orterun can do the rest of its stuff. Instead, we'll catch any
     * failures and deal with them elsewhere
     */
    if (0 > (rc = lsb_launch(nodelist_argv, argv,
                             LSF_DJOB_REPLACE_ENV | LSF_DJOB_NOWAIT, env))) {
        ORTE_ERROR_LOG(ORTE_ERR_FAILED_TO_START);
        opal_output(0, "lsb_launch failed: %d", rc);
        rc = ORTE_ERR_FAILED_TO_START;
        orte_wait_enable();  /* re-enable our SIGCHLD handler */
        goto cleanup;
    }
    orte_wait_enable();  /* re-enable our SIGCHLD handler */

    /* indicate that the daemons for this job were launched */
    state->jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED;
    daemons->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED;

    /* flag that launch was successful, so far as we currently know */
    failed_launch = false;

 cleanup:
    if (NULL != argv) {
        opal_argv_free(argv);
    }
    if (NULL != env) {
        opal_argv_free(env);
    }
    /* cleanup the caddy */
    OBJ_RELEASE(state);

    /* check for failed launch - if so, force terminate */
    if (failed_launch) {
        ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
    }
}
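/*
 * Standalone POSIX sketch of the save/restore discipline used around
 * lsb_launch() above: the launcher disables its SIGCHLD handling before
 * the call and re-enables it afterwards, because lsb_launch() replaces
 * the handler. orte_wait_disable()/orte_wait_enable() are the
 * libevent-aware versions of this pattern; the stand-in library call
 * below is hypothetical.
 */
#include <signal.h>
#include <stddef.h>

static void call_handler_clobbering_library(void)
{
    /* stand-in for a library call that resets SIGCHLD */
}

static void launch_with_sigchld_protection(void)
{
    struct sigaction saved;

    sigaction(SIGCHLD, NULL, &saved);   /* save the current disposition */
    call_handler_clobbering_library();  /* may clobber SIGCHLD */
    sigaction(SIGCHLD, &saved, NULL);   /* restore our handler */
}

int main(void)
{
    launch_with_sigchld_protection();
    return 0;
}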
/* When working in this function, ALWAYS jump to "cleanup" if
 * you encounter an error so that orterun will be woken up and
 * the job can cleanly terminate
 */
static int plm_slurm_launch_job(orte_job_t *jdata)
{
    orte_app_context_t **apps;
    orte_node_t **nodes;
    orte_std_cntr_t n;
    orte_job_map_t *map;
    char *jobid_string = NULL;
    char *param;
    char **argv = NULL;
    int argc;
    int rc;
    char *tmp;
    char** env = NULL;
    char* var;
    char *nodelist_flat;
    char **nodelist_argv;
    int nodelist_argc;
    char *name_string;
    char **custom_strings;
    int num_args, i;
    char *cur_prefix;
    struct timeval launchstart, launchstop;
    int proc_vpid_index;
    orte_jobid_t failed_job;
    bool failed_launch = true;

    /* flag the daemons as failing by default */
    failed_job = ORTE_PROC_MY_NAME->jobid;

    if (orte_timing) {
        if (0 != gettimeofday(&launchstart, NULL)) {
            opal_output(0, "plm_slurm: could not obtain job start time");
            launchstart.tv_sec = 0;
            launchstart.tv_usec = 0;
        }
    }

    /* indicate the state of the launch */
    launching_daemons = true;

    /* create a jobid for this job */
    if (ORTE_SUCCESS != (rc = orte_plm_base_create_jobid(&jdata->jobid))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
                         "%s plm:slurm: launching job %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_JOBID_PRINT(jdata->jobid)));

    /* setup the job */
    if (ORTE_SUCCESS != (rc = orte_plm_base_setup_job(jdata))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    /* set the active jobid */
    active_job = jdata->jobid;

    /* Get the map for this job */
    if (NULL == (map = orte_rmaps.get_job_map(active_job))) {
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        rc = ORTE_ERR_NOT_FOUND;
        goto cleanup;
    }
    apps = (orte_app_context_t**)jdata->apps->addr;
    nodes = (orte_node_t**)map->nodes->addr;

    if (0 == map->num_new_daemons) {
        /* no new daemons required - just launch apps */
        OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
                             "%s plm:slurm: no new daemons to launch",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        goto launch_apps;
    }

    /* need integer value for command line parameter */
    asprintf(&jobid_string, "%lu", (unsigned long) jdata->jobid);

    /*
     * start building argv array
     */
    argv = NULL;
    argc = 0;

    /*
     * SLURM srun OPTIONS
     */

    /* add the srun command */
    opal_argv_append(&argc, &argv, "srun");

    /* Append user defined arguments to srun */
    if (NULL != mca_plm_slurm_component.custom_args) {
        custom_strings = opal_argv_split(mca_plm_slurm_component.custom_args, ' ');
        num_args = opal_argv_count(custom_strings);
        for (i = 0; i < num_args; ++i) {
            opal_argv_append(&argc, &argv, custom_strings[i]);
        }
        opal_argv_free(custom_strings);
    }

    asprintf(&tmp, "--nodes=%lu", (unsigned long) map->num_new_daemons);
    opal_argv_append(&argc, &argv, tmp);
    free(tmp);

    asprintf(&tmp, "--ntasks=%lu", (unsigned long) map->num_new_daemons);
    opal_argv_append(&argc, &argv, tmp);
    free(tmp);

    /* alert us if any orteds die during startup */
    opal_argv_append(&argc, &argv, "--kill-on-bad-exit");

    /* create nodelist */
    nodelist_argv = NULL;
    nodelist_argc = 0;
    for (n=0; n < map->num_nodes; n++) {
        /* if the daemon already exists on this node, then
         * don't include it
         */
        if (nodes[n]->daemon_launched) {
            continue;
        }
        /* otherwise, add it to the list of nodes upon which
         * we need to launch a daemon
         */
        opal_argv_append(&nodelist_argc, &nodelist_argv, nodes[n]->name);
    }
    if (0 == opal_argv_count(nodelist_argv)) {
        orte_show_help("help-plm-slurm.txt", "no-hosts-in-list", true);
        rc = ORTE_ERR_FAILED_TO_START;
        goto cleanup;
    }
    nodelist_flat = opal_argv_join(nodelist_argv, ',');
    opal_argv_free(nodelist_argv);
    asprintf(&tmp, "--nodelist=%s", nodelist_flat);
    opal_argv_append(&argc, &argv, tmp);
    free(tmp);

    OPAL_OUTPUT_VERBOSE((2, orte_plm_globals.output,
                         "%s plm:slurm: launching on nodes %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), nodelist_flat));

    /*
     * ORTED OPTIONS
     */

    /* add the daemon command (as specified by user) */
    orte_plm_base_setup_orted_cmd(&argc, &argv);

    /* Add basic orted command line options, including debug flags */
    orte_plm_base_orted_append_basic_args(&argc, &argv, "slurm",
                                          &proc_vpid_index, false);

    /* tell the new daemons the base of the name list so they can compute
     * their own name on the other end
     */
    rc = orte_util_convert_vpid_to_string(&name_string, map->daemon_vpid_start);
    if (ORTE_SUCCESS != rc) {
        opal_output(0, "plm_slurm: unable to get daemon vpid as string");
        goto cleanup;
    }
    free(argv[proc_vpid_index]);
    argv[proc_vpid_index] = strdup(name_string);
    free(name_string);

    if (0 < opal_output_get_verbosity(orte_plm_globals.output)) {
        param = opal_argv_join(argv, ' ');
        OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
                             "%s plm:slurm: final top-level argv:\n\t%s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             (NULL == param) ? "NULL" : param));
        if (NULL != param) {
            free(param);
        }
    }

    /* Copy the prefix-directory specified in the corresponding
     * app_context. If there are multiple, different prefixes in the
     * app context, complain (i.e., only allow one --prefix option for
     * the entire slurm run -- we don't support different --prefix'es
     * for different nodes in the SLURM plm)
     */
    cur_prefix = NULL;
    for (n=0; n < jdata->num_apps; n++) {
        char *app_prefix_dir = apps[n]->prefix_dir;
        /* Check for already set cur_prefix_dir -- if different, complain */
        if (NULL != app_prefix_dir) {
            if (NULL != cur_prefix &&
                0 != strcmp(cur_prefix, app_prefix_dir)) {
                orte_show_help("help-plm-slurm.txt", "multiple-prefixes",
                               true, cur_prefix, app_prefix_dir);
                rc = ORTE_ERR_FATAL;
                goto cleanup;
            }
            /* If not yet set, copy it; if set, then it's the same anyway */
            if (NULL == cur_prefix) {
                cur_prefix = strdup(app_prefix_dir);
                OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
                                     "%s plm:slurm: Set prefix:%s",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                     cur_prefix));
            }
        }
    }

    /* setup environment */
    env = opal_argv_copy(orte_launch_environ);

    /* add the nodelist */
    var = mca_base_param_environ_variable("orte", "slurm", "nodelist");
    opal_setenv(var, nodelist_flat, true, &env);
    free(nodelist_flat);
    free(var);

    /* exec the daemon(s) */
    if (ORTE_SUCCESS != (rc = plm_slurm_start_proc(argc, argv, env, cur_prefix))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    /* do NOT wait for srun to complete. Srun only completes when the
     * processes it starts - in this case, the orteds - complete.
     * Instead, we'll catch any srun failures and deal with them elsewhere
     */

    /* wait for daemons to callback */
    if (ORTE_SUCCESS != (rc = orte_plm_base_daemon_callback(map->num_new_daemons))) {
        OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
                             "%s plm:slurm: daemon launch failed for job %s on error %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_JOBID_PRINT(active_job), ORTE_ERROR_NAME(rc)));
        goto cleanup;
    }

 launch_apps:
    /* get here if daemons launch okay - any failures now are due to the apps */
    launching_daemons = false;
    failed_job = active_job;
    if (ORTE_SUCCESS != (rc = orte_plm_base_launch_apps(active_job))) {
        OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
                             "%s plm:slurm: launch of apps failed for job %s on error %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_JOBID_PRINT(active_job), ORTE_ERROR_NAME(rc)));
        goto cleanup;
    }

    /* declare the launch a success */
    failed_launch = false;

    if (orte_timing) {
        if (0 != gettimeofday(&launchstop, NULL)) {
            opal_output(0, "plm_slurm: could not obtain stop time");
        } else {
            opal_output(0, "plm_slurm: total job launch time is %ld usec",
                        (launchstop.tv_sec - launchstart.tv_sec)*1000000 +
                        (launchstop.tv_usec - launchstart.tv_usec));
        }
    }

    if (ORTE_SUCCESS != rc) {
        opal_output(0, "plm:slurm: start_procs returned error %d", rc);
        goto cleanup;
    }

 cleanup:
    if (NULL != argv) {
        opal_argv_free(argv);
    }
    if (NULL != env) {
        opal_argv_free(env);
    }
    if (NULL != jobid_string) {
        free(jobid_string);
    }

    /* check for failed launch - if so, force terminate */
    if (failed_launch) {
        orte_plm_base_launch_failed(failed_job, -1,
                                    ORTE_ERROR_DEFAULT_EXIT_CODE,
                                    ORTE_JOB_STATE_FAILED_TO_START);
    }

    return rc;
}
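/*
 * Standalone sketch of the gettimeofday() elapsed-time arithmetic used
 * for the orte_timing report above: seconds scaled to microseconds,
 * plus the (possibly negative) microsecond remainder.
 */
#include <stdio.h>
#include <sys/time.h>

int main(void)
{
    struct timeval start, stop;

    gettimeofday(&start, NULL);
    /* ... work being timed ... */
    gettimeofday(&stop, NULL);
    printf("elapsed: %ld usec\n",
           (stop.tv_sec - start.tv_sec) * 1000000L +
           (stop.tv_usec - start.tv_usec));
    return 0;
}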
static void launch_daemons(int fd, short args, void *cbdata)
{
    orte_app_context_t *app;
    orte_node_t *node;
    orte_std_cntr_t n;
    orte_job_map_t *map;
    char *jobid_string = NULL;
    char *param;
    char **argv = NULL;
    int argc;
    int rc;
    char *tmp;
    char** env = NULL;
    char *nodelist_flat;
    char **nodelist_argv;
    char *name_string;
    char **custom_strings;
    int num_args, i;
    char *cur_prefix;
    int proc_vpid_index;
    bool failed_launch = true;
    orte_job_t *daemons;
    orte_state_caddy_t *state = (orte_state_caddy_t*)cbdata;

    OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
                         "%s plm:slurm: LAUNCH DAEMONS CALLED",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    /* if we are launching debugger daemons, then just go
     * do it - no new daemons will be launched
     */
    if (ORTE_JOB_CONTROL_DEBUGGER_DAEMON & state->jdata->controls) {
        state->jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED;
        ORTE_ACTIVATE_JOB_STATE(state->jdata, ORTE_JOB_STATE_DAEMONS_REPORTED);
        OBJ_RELEASE(state);
        return;
    }

    /* start by setting up the virtual machine */
    daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
    if (ORTE_SUCCESS != (rc = orte_plm_base_setup_virtual_machine(state->jdata))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    /* if we don't want to launch, then don't attempt to
     * launch the daemons - the user really wants to just
     * look at the proposed process map
     */
    if (orte_do_not_launch) {
        /* set the state to indicate the daemons reported - this
         * will trigger the daemons_reported event and cause the
         * job to move to the following step
         */
        state->jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED;
        ORTE_ACTIVATE_JOB_STATE(state->jdata, ORTE_JOB_STATE_DAEMONS_REPORTED);
        OBJ_RELEASE(state);
        return;
    }

    /* Get the map for this job */
    if (NULL == (map = daemons->map)) {
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        rc = ORTE_ERR_NOT_FOUND;
        goto cleanup;
    }

    if (0 == map->num_new_daemons) {
        /* set the state to indicate the daemons reported - this
         * will trigger the daemons_reported event and cause the
         * job to move to the following step
         */
        OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
                             "%s plm:slurm: no new daemons to launch",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        state->jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED;
        ORTE_ACTIVATE_JOB_STATE(state->jdata, ORTE_JOB_STATE_DAEMONS_REPORTED);
        OBJ_RELEASE(state);
        return;
    }

    /* need integer value for command line parameter */
    asprintf(&jobid_string, "%lu", (unsigned long) daemons->jobid);

    /*
     * start building argv array
     */
    argv = NULL;
    argc = 0;

    /*
     * SLURM srun OPTIONS
     */

    /* add the srun command */
    opal_argv_append(&argc, &argv, "srun");

    /* start one orted on each node */
    opal_argv_append(&argc, &argv, "--ntasks-per-node=1");

    /* alert us if any orteds die during startup */
    opal_argv_append(&argc, &argv, "--kill-on-bad-exit");

    /* Append user defined arguments to srun */
    if (NULL != mca_plm_slurm_component.custom_args) {
        custom_strings = opal_argv_split(mca_plm_slurm_component.custom_args, ' ');
        num_args = opal_argv_count(custom_strings);
        for (i = 0; i < num_args; ++i) {
            opal_argv_append(&argc, &argv, custom_strings[i]);
        }
        opal_argv_free(custom_strings);
    }

    /* create nodelist */
    nodelist_argv = NULL;
    for (n=0; n < map->nodes->size; n++) {
        if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, n))) {
            continue;
        }
        /* if the daemon already exists on this node, then
         * don't include it
         */
        if (node->daemon_launched) {
            continue;
        }
        /* otherwise, add it to the list of nodes upon which
         * we need to launch a daemon
         */
        opal_argv_append_nosize(&nodelist_argv, node->name);
    }
    if (0 == opal_argv_count(nodelist_argv)) {
        orte_show_help("help-plm-slurm.txt", "no-hosts-in-list", true);
        rc = ORTE_ERR_FAILED_TO_START;
        goto cleanup;
    }
    nodelist_flat = opal_argv_join(nodelist_argv, ',');
    opal_argv_free(nodelist_argv);

    /* if we are using all allocated nodes, then srun doesn't
     * require any further arguments
     */
    if (map->num_new_daemons < orte_num_allocated_nodes) {
        asprintf(&tmp, "--nodes=%lu", (unsigned long)map->num_new_daemons);
        opal_argv_append(&argc, &argv, tmp);
        free(tmp);
        asprintf(&tmp, "--nodelist=%s", nodelist_flat);
        opal_argv_append(&argc, &argv, tmp);
        free(tmp);
    }

    /* tell srun how many tasks to run */
    asprintf(&tmp, "--ntasks=%lu", (unsigned long)map->num_new_daemons);
    opal_argv_append(&argc, &argv, tmp);
    free(tmp);

    OPAL_OUTPUT_VERBOSE((2, orte_plm_base_framework.framework_output,
                         "%s plm:slurm: launching on nodes %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), nodelist_flat));

    /*
     * ORTED OPTIONS
     */

    /* add the daemon command (as specified by user) */
    orte_plm_base_setup_orted_cmd(&argc, &argv);

    /* Add basic orted command line options, including debug flags */
    orte_plm_base_orted_append_basic_args(&argc, &argv, NULL,
                                          &proc_vpid_index, nodelist_flat);
    free(nodelist_flat);

    /* tell the new daemons the base of the name list so they can compute
     * their own name on the other end
     */
    rc = orte_util_convert_vpid_to_string(&name_string, map->daemon_vpid_start);
    if (ORTE_SUCCESS != rc) {
        opal_output(0, "plm_slurm: unable to get daemon vpid as string");
        goto cleanup;
    }
    free(argv[proc_vpid_index]);
    argv[proc_vpid_index] = strdup(name_string);
    free(name_string);

    /* Copy the prefix-directory specified in the corresponding
     * app_context. If there are multiple, different prefixes in the
     * app context, complain (i.e., only allow one --prefix option for
     * the entire slurm run -- we don't support different --prefix'es
     * for different nodes in the SLURM plm)
     */
    cur_prefix = NULL;
    for (n=0; n < state->jdata->apps->size; n++) {
        char *app_prefix_dir;
        if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(state->jdata->apps, n))) {
            continue;
        }
        app_prefix_dir = app->prefix_dir;
        /* Check for already set cur_prefix_dir -- if different, complain */
        if (NULL != app_prefix_dir) {
            if (NULL != cur_prefix &&
                0 != strcmp(cur_prefix, app_prefix_dir)) {
                orte_show_help("help-plm-slurm.txt", "multiple-prefixes",
                               true, cur_prefix, app_prefix_dir);
                goto cleanup;
            }
            /* If not yet set, copy it; if set, then it's the same anyway */
            if (NULL == cur_prefix) {
                cur_prefix = strdup(app_prefix_dir);
                OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
                                     "%s plm:slurm: Set prefix:%s",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                     cur_prefix));
            }
        }
    }

    /* setup environment */
    env = opal_argv_copy(orte_launch_environ);

    if (0 < opal_output_get_verbosity(orte_plm_base_framework.framework_output)) {
        param = opal_argv_join(argv, ' ');
        OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
                             "%s plm:slurm: final top-level argv:\n\t%s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             (NULL == param) ? "NULL" : param));
        if (NULL != param) {
            free(param);
        }
    }

    /* exec the daemon(s) */
    if (ORTE_SUCCESS != (rc = plm_slurm_start_proc(argc, argv, env, cur_prefix))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    /* indicate that the daemons for this job were launched */
    state->jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED;
    daemons->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED;

    /* flag that launch was successful, so far as we currently know */
    failed_launch = false;

 cleanup:
    if (NULL != argv) {
        opal_argv_free(argv);
    }
    if (NULL != env) {
        opal_argv_free(env);
    }
    if (NULL != jobid_string) {
        free(jobid_string);
    }
    /* cleanup the caddy */
    OBJ_RELEASE(state);

    /* check for failed launch - if so, force terminate */
    if (failed_launch) {
        ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
    }
}
/* When working in this function, ALWAYS jump to "cleanup" if
 * you encounter an error so that orterun will be woken up and
 * the job can cleanly terminate
 */
static void launch_daemons(int fd, short args, void *cbdata)
{
    orte_job_map_t *map = NULL;
    orte_app_context_t *app;
    orte_node_t *node;
    int proc_vpid_index;
    char *param;
    char **env = NULL;
    char *var;
    char **argv = NULL;
    char **nodeargv;
    int argc = 0;
    int rc;
    orte_std_cntr_t i;
    char *bin_base = NULL, *lib_base = NULL;
    tm_event_t *tm_events = NULL;
    tm_task_id *tm_task_ids = NULL;
    int launched = 0;  /* count of daemons spawned so far; indexes into
                        * the tm_events/tm_task_ids arrays */
    bool failed_launch = true;
    mode_t current_umask;
    char *nodelist;
    char* vpid_string;
    orte_job_t *daemons, *jdata;
    orte_state_caddy_t *state = (orte_state_caddy_t*)cbdata;
    int32_t launchid, *ldptr;
    char *prefix_dir = NULL;

    jdata = state->jdata;

    /* if we are launching debugger daemons, then just go
     * do it - no new daemons will be launched
     */
    if (ORTE_FLAG_TEST(state->jdata, ORTE_JOB_FLAG_DEBUGGER_DAEMON)) {
        jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED;
        ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_DAEMONS_REPORTED);
        OBJ_RELEASE(state);
        return;
    }

    /* setup the virtual machine */
    daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
    if (ORTE_SUCCESS != (rc = orte_plm_base_setup_virtual_machine(jdata))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    /* if we don't want to launch, then don't attempt to
     * launch the daemons - the user really wants to just
     * look at the proposed process map
     */
    if (orte_do_not_launch) {
        /* set the state to indicate the daemons reported - this
         * will trigger the daemons_reported event and cause the
         * job to move to the following step
         */
        jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED;
        ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_DAEMONS_REPORTED);
        OBJ_RELEASE(state);
        return;
    }

    /* Get the map for this job */
    if (NULL == (map = daemons->map)) {
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        rc = ORTE_ERR_NOT_FOUND;
        goto cleanup;
    }

    if (0 == map->num_new_daemons) {
        /* set the state to indicate the daemons reported - this
         * will trigger the daemons_reported event and cause the
         * job to move to the following step
         */
        jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED;
        ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_DAEMONS_REPORTED);
        OBJ_RELEASE(state);
        return;
    }

    OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
                         "%s plm:tm: launching vm",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    /* Allocate a bunch of TM events to use for tm_spawn()ing */
    tm_events = malloc(sizeof(tm_event_t) * map->num_new_daemons);
    if (NULL == tm_events) {
        rc = ORTE_ERR_OUT_OF_RESOURCE;
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }
    tm_task_ids = malloc(sizeof(tm_task_id) * map->num_new_daemons);
    if (NULL == tm_task_ids) {
        rc = ORTE_ERR_OUT_OF_RESOURCE;
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    /* add the daemon command (as specified by user) */
    orte_plm_base_setup_orted_cmd(&argc, &argv);

    /* create a list of nodes in this launch */
    nodeargv = NULL;
    for (i = 0; i < map->nodes->size; i++) {
        if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, i))) {
            continue;
        }
        /* if this daemon already exists, don't launch it! */
        if (ORTE_FLAG_TEST(node, ORTE_NODE_FLAG_DAEMON_LAUNCHED)) {
            continue;
        }
        /* add to list */
        opal_argv_append_nosize(&nodeargv, node->name);
    }
    nodelist = opal_argv_join(nodeargv, ',');
    opal_argv_free(nodeargv);

    /* Add basic orted command line options */
    orte_plm_base_orted_append_basic_args(&argc, &argv, "tm",
                                          &proc_vpid_index, nodelist);
    free(nodelist);

    if (0 < opal_output_get_verbosity(orte_plm_base_framework.framework_output)) {
        param = opal_argv_join(argv, ' ');
        OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
                             "%s plm:tm: final top-level argv:\n\t%s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             (NULL == param) ? "NULL" : param));
        if (NULL != param) {
            free(param);
        }
    }

    if (!connected) {
        if (ORTE_SUCCESS != plm_tm_connect()) {
            goto cleanup;
        }
        connected = true;
    }

    /* Figure out the basenames for the libdir and bindir. There is a
     * lengthy comment about this in plm_rsh_module.c explaining all the
     * rationale for how / why we're doing this.
     */
    lib_base = opal_basename(opal_install_dirs.libdir);
    bin_base = opal_basename(opal_install_dirs.bindir);

    /* setup environment */
    env = opal_argv_copy(orte_launch_environ);

    /* enable local launch by the orteds */
    (void) mca_base_var_env_name("plm", &var);
    opal_setenv(var, "rsh", true, &env);
    free(var);

    /* add our umask -- see big note in orted.c */
    current_umask = umask(0);
    umask(current_umask);
    (void)asprintf(&var, "0%o", current_umask);
    opal_setenv("ORTE_DAEMON_UMASK_VALUE", var, true, &env);
    free(var);

    /* If we have a prefix, then modify the PATH and LD_LIBRARY_PATH
     * environment variables. We only allow a single prefix to be
     * specified. Since there will always be at least one app_context,
     * we take it from there
     */
    app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, 0);
    orte_get_attribute(&app->attributes, ORTE_APP_PREFIX_DIR,
                       (void**)&prefix_dir, OPAL_STRING);
    if (NULL != prefix_dir) {
        char *newenv;
        for (i = 0; NULL != env && NULL != env[i]; ++i) {
            /* Reset PATH */
            if (0 == strncmp("PATH=", env[i], 5)) {
                (void)asprintf(&newenv, "%s/%s:%s", prefix_dir, bin_base, env[i] + 5);
                OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
                                     "%s plm:tm: resetting PATH: %s",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), newenv));
                opal_setenv("PATH", newenv, true, &env);
                free(newenv);
            }
            /* Reset LD_LIBRARY_PATH */
            else if (0 == strncmp("LD_LIBRARY_PATH=", env[i], 16)) {
                (void)asprintf(&newenv, "%s/%s:%s", prefix_dir, lib_base, env[i] + 16);
                OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
                                     "%s plm:tm: resetting LD_LIBRARY_PATH: %s",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), newenv));
                opal_setenv("LD_LIBRARY_PATH", newenv, true, &env);
                free(newenv);
            }
        }
        free(prefix_dir);
    }

    /* Iterate through each of the nodes and spin up a daemon */
    ldptr = &launchid;
    for (i = 0; i < map->nodes->size; i++) {
        if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, i))) {
            continue;
        }
        /* if this daemon already exists, don't launch it! */
        if (ORTE_FLAG_TEST(node, ORTE_NODE_FLAG_DAEMON_LAUNCHED)) {
            continue;
        }

        OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
                             "%s plm:tm: launching on node %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), node->name));

        /* setup process name */
        rc = orte_util_convert_vpid_to_string(&vpid_string, node->daemon->name.vpid);
        if (ORTE_SUCCESS != rc) {
            opal_output(0, "plm:tm: unable to get daemon vpid as string");
            exit(-1);
        }
        free(argv[proc_vpid_index]);
        argv[proc_vpid_index] = strdup(vpid_string);
        free(vpid_string);

        /* exec the daemon */
        if (0 < opal_output_get_verbosity(orte_plm_base_framework.framework_output)) {
            param = opal_argv_join(argv, ' ');
            OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
                                 "%s plm:tm: executing:\n\t%s",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 (NULL == param) ? "NULL" : param));
            if (NULL != param) {
                free(param);
            }
        }

        launchid = 0;
        if (!orte_get_attribute(&node->attributes, ORTE_NODE_LAUNCH_ID,
                                (void**)&ldptr, OPAL_INT32)) {
            orte_show_help("help-plm-tm.txt", "tm-spawn-failed",
                           true, argv[0], node->name, 0);
            rc = ORTE_ERROR;
            goto cleanup;
        }
        rc = tm_spawn(argc, argv, env, launchid,
                      tm_task_ids + launched, tm_events + launched);
        if (TM_SUCCESS != rc) {
            orte_show_help("help-plm-tm.txt", "tm-spawn-failed",
                           true, argv[0], node->name, launchid);
            rc = ORTE_ERROR;
            goto cleanup;
        }
        launched++;
    }

    /* indicate that the daemons for this job were launched */
    state->jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED;
    daemons->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED;

    /* flag that launch was successful, so far as we currently know */
    failed_launch = false;

    OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
                         "%s plm:tm:launch: finished spawning orteds",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

 cleanup:
    /* cleanup the caddy */
    OBJ_RELEASE(state);

    /* check for failed launch - if so, force terminate */
    if (failed_launch) {
        ORTE_ACTIVATE_JOB_STATE(daemons, ORTE_JOB_STATE_FAILED_TO_START);
    }
}
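/*
 * Standalone sketch of the umask read-back idiom used above: POSIX
 * provides no way to read the umask without setting it, so set it to 0,
 * capture the previous value, and immediately restore it.
 */
#include <stdio.h>
#include <sys/stat.h>

int main(void)
{
    mode_t current_umask = umask(0);   /* read (and clobber) the umask */
    umask(current_umask);              /* immediately restore it */
    printf("0%o\n", (unsigned int)current_umask);
    return 0;
}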
/* setup argv for daemon process */
static int setup_daemon_proc_env_and_argv(orte_proc_t* proc, char ***pargv,
                                          int *argc, char ***penv)
{
    orte_job_t* daemons;
    int rc;
    char* param;

    /* get daemon job object */
    daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
    *penv = opal_argv_copy(orte_launch_environ);

    /* prepend orted to argv */
    opal_argv_append(argc, pargv, "orted");

    /* ess */
    opal_argv_append(argc, pargv, "-mca");
    opal_argv_append(argc, pargv, "ess");
    opal_argv_append(argc, pargv, "env");

    /* jobid */
    opal_argv_append(argc, pargv, "-mca");
    opal_argv_append(argc, pargv, "orte_ess_jobid");
    if (ORTE_SUCCESS != (rc = orte_util_convert_jobid_to_string(&param, ORTE_PROC_MY_NAME->jobid))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }
    opal_argv_append(argc, pargv, param);
    free(param);

    /* vpid */
    opal_argv_append(argc, pargv, "-mca");
    opal_argv_append(argc, pargv, "orte_ess_vpid");
    if (ORTE_SUCCESS != (rc = orte_util_convert_vpid_to_string(&param, proc->name.vpid))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }
    opal_argv_append(argc, pargv, param);
    free(param);

    /* num processes */
    opal_argv_append(argc, pargv, "-mca");
    opal_argv_append(argc, pargv, "orte_ess_num_procs");
    asprintf(&param, "%lu", (unsigned long)daemons->num_procs);
    opal_argv_append(argc, pargv, param);
    free(param);

    /* pass the uri of the hnp */
    asprintf(&param, "\\\"%s\\\"", orte_rml.get_contact_info());
    opal_argv_append(argc, pargv, "-mca");
    opal_argv_append(argc, pargv, "orte_hnp_uri");
    opal_argv_append(argc, pargv, param);
    free(param);

    /* oob */
    opal_argv_append(argc, pargv, "-mca");
    opal_argv_append(argc, pargv, "oob");
    opal_argv_append(argc, pargv, "tcp");

    /* odls */
    opal_argv_append(argc, pargv, "-mca");
    opal_argv_append(argc, pargv, "odls");
    opal_argv_append(argc, pargv, "yarn");

    /* add stdout, stderr to orted */
    opal_argv_append_nosize(pargv, "1><LOG_DIR>/stdout");
    opal_argv_append_nosize(pargv, "2><LOG_DIR>/stderr");

    /* state - select the orted state machine component */
    opal_argv_append(argc, pargv, "-mca");
    opal_argv_append(argc, pargv, "state");
    opal_argv_append(argc, pargv, "orted");

    /* print launch commandline and env when this env is specified */
    if (getenv("HAMSTER_VERBOSE")) {
        char* join_argv = opal_argv_join(*pargv, ' ');
        OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
                             "%s plm:yarn launch_daemon argv=%s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), join_argv));
        if (join_argv) {
            free(join_argv);
        }
    }
    return 0;
}
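/*
 * Standalone sketch of the quoting applied to the HNP URI above:
 * asprintf(&param, "\\\"%s\\\"", uri) wraps the URI in literal \" so it
 * survives an extra round of shell parsing when the daemon command line
 * is embedded in a wrapper script. The URI value here is hypothetical.
 */
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
    char *param;

    if (0 <= asprintf(&param, "\\\"%s\\\"", "tcp://10.0.0.1:4000")) {
        printf("%s\n", param);   /* -> \"tcp://10.0.0.1:4000\" */
        free(param);
    }
    return 0;
}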
/*
 * setup env and argv for specified process
 */
static int setup_proc_env_and_argv(orte_job_t* jdata, orte_app_context_t* app,
                                   orte_proc_t* proc, char ***pargv, char ***penv)
{
    char* param;
    char* param2;
    char* value;
    char* vp_id_str;
    char* job_id_str;
    int rc;
    int i, num_nodes;

    /* obtain app->argv */
    if (!(app->argv)) {
        opal_output(0, "%s plm::yarn::setup_proc_env_and_argv: app->argv is null",
                    ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
        return ORTE_ERROR;
    }
    *pargv = opal_argv_copy(app->argv);

    if (ORTE_SUCCESS != orte_util_convert_jobid_to_string(&job_id_str, jdata->jobid)) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }
    if (ORTE_SUCCESS != orte_util_convert_vpid_to_string(&vp_id_str, proc->name.vpid)) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }

    // add stdout, stderr to app
    opal_argv_append_nosize(pargv, "1><LOG_DIR>/stdout");
    opal_argv_append_nosize(pargv, "2><LOG_DIR>/stderr");

    // add java executor to app
    opal_argv_prepend_nosize(pargv, vp_id_str);
    opal_argv_prepend_nosize(pargv, job_id_str);
    opal_argv_prepend_nosize(pargv, "com.pivotal.hamster.yarnexecutor.YarnExecutor");
    opal_argv_prepend_nosize(pargv, "hamster-core.jar");
    opal_argv_prepend_nosize(pargv, "-cp");
    opal_argv_prepend_nosize(pargv, getenv("HAMSTER_JAVA_OPT") == NULL ?
                             "-Xmx32M -Xms8M" : getenv("HAMSTER_JAVA_OPT"));
    opal_argv_prepend_nosize(pargv, "$JAVA_HOME/bin/java");

    /* obtain app->env */
    *penv = opal_environ_merge(environ, app->env);

    if (!proc->node) {
        opal_output(0, "%s plm::yarn::setup_proc_env_and_argv: node of proc[%d] is NULL",
                    ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), proc->name.vpid);
        return ORTE_ERROR;
    }
    if (!proc->node->daemon) {
        opal_output(0, "%s plm::yarn::setup_proc_env_and_argv: daemon of node[%s] is NULL",
                    ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), proc->node->name);
        return ORTE_ERROR;
    }

    /* set the app_context number into the environment */
    param = mca_base_param_env_var("orte_app_num");
    asprintf(&param2, "%ld", (long)app->idx);
    opal_setenv(param, param2, true, penv);
    free(param);
    free(param2);

    // pass the daemon's name
    param = mca_base_param_env_var("orte_local_daemon_uri");
    opal_setenv(param, proc->node->daemon->rml_uri, true, penv);
    free(param);

    /* pass my contact info */
    param = mca_base_param_env_var("orte_hnp_uri");
    opal_setenv(param, orte_process_info.my_hnp_uri, true, penv);
    free(param);

    /* pass the jobid */
    param = mca_base_param_env_var("orte_ess_jobid");
    opal_setenv(param, job_id_str, true, penv);
    free(param);
    free(job_id_str);

    /* pass the rank */
    param = mca_base_param_env_var("orte_ess_vpid");
    opal_setenv(param, vp_id_str, true, penv);
    free(param);
    opal_setenv("OMPI_COMM_WORLD_RANK", vp_id_str, true, penv);
    free(vp_id_str);  /* done with this now */

    /* pass local rank */
    asprintf(&value, "%lu", (unsigned long) proc->local_rank);
    opal_setenv("OMPI_COMM_WORLD_LOCAL_RANK", value, true, penv);
    free(value);

    /* pass node rank */
    asprintf(&value, "%lu", (unsigned long) proc->node_rank);
    opal_setenv("OMPI_COMM_WORLD_NODE_RANK", value, true, penv);
    /* set an mca param for it too */
    param = mca_base_param_env_var("orte_ess_node_rank");
    opal_setenv(param, value, true, penv);
    free(param);
    free(value);

    /* pass a param telling the child what model of cpu we are on,
     * if we know it
     */
    if (NULL != orte_local_cpu_type) {
        param = mca_base_param_env_var("orte_cpu_type");
        /* do not overwrite what the user may have provided */
        opal_setenv(param, orte_local_cpu_type, false, penv);
        free(param);
    }
    if (NULL != orte_local_cpu_model) {
        param = mca_base_param_env_var("orte_cpu_model");
        /* do not overwrite what the user may have provided */
        opal_setenv(param, orte_local_cpu_model, false, penv);
        free(param);
    }

    /* pass the number of nodes involved in this job */
    param = mca_base_param_env_var("orte_num_nodes");
    /* we have to count the number of nodes as the size of orte_node_pool
     * is only guaranteed to be equal or larger than that number - i.e.,
     * the pointer_array increases the size by a block each time, so some
     * of the locations are left empty
     */
    num_nodes = 0;
    for (i = 0; i < orte_node_pool->size; i++) {
        if (NULL != opal_pointer_array_get_item(orte_node_pool, i)) {
            num_nodes++;
        }
    }
    asprintf(&value, "%d", num_nodes);
    opal_setenv(param, value, true, penv);
    free(param);
    free(value);

    /* setup yield schedule */
    param = mca_base_param_env_var("mpi_yield_when_idle");
    opal_setenv(param, "0", false, penv);
    free(param);

    /* set MPI universe envar */
    orte_ess_env_put(jdata->num_procs, proc->node->num_procs, penv);
    asprintf(&value, "%ld", (long) jdata->num_procs);
    opal_setenv("OMPI_UNIVERSE_SIZE", value, true, penv);
    free(value);

    /* pass collective ids for the std MPI operations */
    param = mca_base_param_env_var("orte_peer_modex_id");
    asprintf(&value, "%d", jdata->peer_modex);
    opal_setenv(param, value, true, penv);
    free(param);
    free(value);

    param = mca_base_param_env_var("orte_peer_init_barrier_id");
    asprintf(&value, "%d", jdata->peer_init_barrier);
    opal_setenv(param, value, true, penv);
    free(param);
    free(value);

    param = mca_base_param_env_var("orte_peer_fini_barrier_id");
    asprintf(&value, "%d", jdata->peer_fini_barrier);
    opal_setenv(param, value, true, penv);
    free(param);
    free(value);

    /* finally, we will set/unset some mca params to select modules */
    opal_unsetenv("OMPI_MCA_plm", penv);
    opal_unsetenv("OMPI_MCA_ras", penv);
    opal_unsetenv("OMPI_MCA_ess", penv);
    opal_unsetenv("OMPI_MCA_state", penv);
    opal_unsetenv("OMPI_MCA_errmgr", penv);

    return 0;
}
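/*
 * Standalone sketch of the prepend ordering used above: each prepend
 * pushes onto the front of the vector, so the calls are written in
 * reverse of the final command line. prepend() is a local stand-in for
 * opal_argv_prepend_nosize(), written only for illustration; the jobid,
 * vpid, and Java options are omitted for brevity.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static void prepend(char ***argv, int *argc, const char *arg)
{
    int i;

    *argv = realloc(*argv, (*argc + 2) * sizeof(char *));
    for (i = *argc; i > 0; i--) {       /* shift existing entries right */
        (*argv)[i] = (*argv)[i - 1];
    }
    (*argv)[0] = strdup(arg);
    (*argv)[++(*argc)] = NULL;          /* keep the vector NULL-terminated */
}

int main(void)
{
    char **argv = NULL;
    int argc = 0, i;

    /* one existing app argument, then prepends in source order */
    prepend(&argv, &argc, "./mpi_app");
    prepend(&argv, &argc, "com.pivotal.hamster.yarnexecutor.YarnExecutor");
    prepend(&argv, &argc, "hamster-core.jar");
    prepend(&argv, &argc, "-cp");
    prepend(&argv, &argc, "$JAVA_HOME/bin/java");

    for (i = 0; i < argc; i++) {
        printf("%s ", argv[i]);         /* -> java -cp jar Executor app */
    }
    printf("\n");
    return 0;
}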
int orte_session_dir_finalize(orte_process_name_t *proc)
{
    int rc;
    char *tmp;
    char *job_session_dir, *vpid, *proc_session_dir;

    if (!orte_create_session_dirs) {
        /* we didn't create them */
        return ORTE_SUCCESS;
    }

    /* need to setup the top_session_dir with the prefix */
    tmp = opal_os_path(false,
                       orte_process_info.tmpdir_base,
                       orte_process_info.top_session_dir, NULL);

    /* define the proc and job session directories for this process */
    if (ORTE_SUCCESS != (rc = orte_util_convert_vpid_to_string(&vpid, proc->vpid))) {
        ORTE_ERROR_LOG(rc);
        free(tmp);
        return rc;
    }
    job_session_dir = orte_build_job_session_dir(tmp, proc, proc->jobid);
    if (NULL == job_session_dir) {
        free(tmp);
        free(vpid);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }
    proc_session_dir = opal_os_path(false, job_session_dir, vpid, NULL);
    if (NULL == proc_session_dir) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        free(tmp);
        free(vpid);
        free(job_session_dir);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }

    opal_os_dirpath_destroy(proc_session_dir, false, orte_dir_check_file);
    opal_os_dirpath_destroy(job_session_dir, false, orte_dir_check_file);
    opal_os_dirpath_destroy(tmp, false, orte_dir_check_file);

    if (opal_os_dirpath_is_empty(proc_session_dir)) {
        if (orte_debug_flag) {
            opal_output(0, "sess_dir_finalize: found proc session dir empty - deleting");
        }
        rmdir(proc_session_dir);
    } else {
        if (orte_debug_flag) {
            opal_output(0, "sess_dir_finalize: proc session dir not empty - leaving");
        }
        goto CLEANUP;
    }

    if (opal_os_dirpath_is_empty(job_session_dir)) {
        if (orte_debug_flag) {
            opal_output(0, "sess_dir_finalize: found job session dir empty - deleting");
        }
        rmdir(job_session_dir);
    } else {
        if (orte_debug_flag) {
            opal_output(0, "sess_dir_finalize: job session dir not empty - leaving");
        }
        goto CLEANUP;
    }

    if (opal_os_dirpath_is_empty(tmp)) {
        if (orte_debug_flag) {
            opal_output(0, "sess_dir_finalize: found top session dir empty - deleting");
        }
        rmdir(tmp);
    } else {
        if (orte_debug_flag) {
            opal_output(0, "sess_dir_finalize: top session dir not empty - leaving");
        }
    }

 CLEANUP:
    free(tmp);
    free(vpid);
    free(job_session_dir);
    free(proc_session_dir);
    return ORTE_SUCCESS;
}
/*
 * Construct the fullpath to the session directory
 */
int orte_session_dir_get_name(char **fulldirpath,
                              char **return_prefix,   /* This will come back as the valid tmp dir */
                              char **return_frontend,
                              char *hostid,
                              char *batchid,
                              orte_process_name_t *proc)
{
    char *hostname = NULL,
         *batchname = NULL,
         *sessions = NULL,
         *user = NULL,
         *prefix = NULL,
         *frontend = NULL,
         *jobfam = NULL,
         *job = NULL,
         *vpidstr = NULL;
    bool prefix_provided = false;
    int exit_status = ORTE_SUCCESS;
    size_t len;
    int uid;
    struct passwd *pwdent;

    /* Ensure that system info is set */
    orte_proc_info();

    /* get the name of the user */
    uid = getuid();
#ifdef HAVE_GETPWUID
    pwdent = getpwuid(uid);
#else
    pwdent = NULL;
#endif
    if (NULL != pwdent) {
        user = strdup(pwdent->pw_name);
    } else {
        orte_show_help("help-orte-runtime.txt",
                       "orte:session:dir:nopwname", true);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }

    /*
     * set the 'hostname'
     */
    if (NULL != hostid) { /* User specified version */
        hostname = strdup(hostid);
    } else {              /* check if it is set elsewhere */
        if (NULL != orte_process_info.nodename) {
            hostname = strdup(orte_process_info.nodename);
        } else {
            /* Couldn't find it, so fail */
            ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
            exit_status = ORTE_ERR_BAD_PARAM;
            goto cleanup;
        }
    }

    /*
     * set the 'batchid'
     */
    if (NULL != batchid) {
        batchname = strdup(batchid);
    } else {
        batchname = strdup("0");
    }

    /*
     * get the front part of the session directory
     * Will look something like:
     *     openmpi-sessions-USERNAME@HOSTNAME_BATCHID
     */
    if (NULL != orte_process_info.top_session_dir) {
        frontend = strdup(orte_process_info.top_session_dir);
    } else { /* If not set then construct it */
        if (0 > asprintf(&frontend, "openmpi-sessions-%s@%s_%s",
                         user, hostname, batchname)) {
            ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
            exit_status = ORTE_ERR_OUT_OF_RESOURCE;
            goto cleanup;
        }
    }

    /*
     * Construct the session directory
     */
    if (NULL != proc) {
        if (ORTE_VPID_INVALID != proc->vpid) {
            /* If we were given a valid vpid then we can construct it fully into:
             *     openmpi-sessions-USERNAME@HOSTNAME_BATCHID/JOB-FAMILY/JOBID/VPID
             */
            if (0 > asprintf(&jobfam, "%d", ORTE_JOB_FAMILY(proc->jobid))) {
                ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
                exit_status = ORTE_ERR_OUT_OF_RESOURCE;
                goto cleanup;
            }
            if (0 > asprintf(&job, "%d", ORTE_LOCAL_JOBID(proc->jobid))) {
                ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
                exit_status = ORTE_ERR_OUT_OF_RESOURCE;
                goto cleanup;
            }
            if (ORTE_SUCCESS != orte_util_convert_vpid_to_string(&vpidstr, proc->vpid)) {
                ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
                exit_status = ORTE_ERR_OUT_OF_RESOURCE;
                goto cleanup;
            }
            sessions = opal_os_path(false, frontend, jobfam, job, vpidstr, NULL);
            if (NULL == sessions) {
                ORTE_ERROR_LOG(ORTE_ERROR);
                exit_status = ORTE_ERROR;
                goto cleanup;
            }
        } else if (ORTE_JOBID_INVALID != proc->jobid) {
            /* If we were given a valid jobid then we can construct it partially into:
             *     openmpi-sessions-USERNAME@HOSTNAME_BATCHID/JOB-FAMILY/JOBID
             */
            if (0 > asprintf(&jobfam, "%d", ORTE_JOB_FAMILY(proc->jobid))) {
                ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
                exit_status = ORTE_ERR_OUT_OF_RESOURCE;
                goto cleanup;
            }
            if (0 > asprintf(&job, "%d", ORTE_LOCAL_JOBID(proc->jobid))) {
                ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
                exit_status = ORTE_ERR_OUT_OF_RESOURCE;
                goto cleanup;
            }
            sessions = opal_os_path(false, frontend, jobfam, job, NULL);
            if (NULL == sessions) {
                ORTE_ERROR_LOG(ORTE_ERROR);
                exit_status = ORTE_ERROR;
                goto cleanup;
            }
        } else {
            /* if both are invalid */
            sessions = strdup(frontend); /* must dup this to avoid double-free later */
        }
    } else {
        /* If we were not given a proc at all, then we just set it to the frontend */
        sessions = strdup(frontend); /* must dup this to avoid double-free later */
    }

    /*
     * If the user specified an invalid prefix, or no prefix at all,
     * we need to keep looking
     */
    if (NULL != fulldirpath && NULL != *fulldirpath) {
        free(*fulldirpath);
        *fulldirpath = NULL;
    }
    if (NULL != return_prefix && NULL != *return_prefix) {
        /* use the user specified one, if available */
        prefix = strdup(*return_prefix);
        prefix_provided = true;
    } else if (NULL != orte_process_info.tmpdir_base) {
        /* try to find a proper alternative prefix: the stored value */
        prefix = strdup(orte_process_info.tmpdir_base);
    } else {
        /* fall back to the general environment variable */
        prefix = strdup(opal_tmp_directory());
    }
    len = strlen(prefix);
    /* check for a trailing path separator */
    if (OPAL_PATH_SEP[0] == prefix[len-1]) {
        prefix[len-1] = '\0';
    }

    /* BEFORE doing anything else, check to see if this prefix is
     * allowed by the system
     */
    if (NULL != orte_prohibited_session_dirs) {
        char **list;
        int i, len;
        /* break the string into tokens - it should be separated by ',' */
        list = opal_argv_split(orte_prohibited_session_dirs, ',');
        len = opal_argv_count(list);
        /* cycle through the list */
        for (i=0; i < len; i++) {
            /* check if prefix matches */
            if (0 == strncmp(prefix, list[i], strlen(list[i]))) {
                /* this is a prohibited location */
                orte_show_help("help-orte-runtime.txt",
                               "orte:session:dir:prohibited",
                               true, prefix, orte_prohibited_session_dirs);
                opal_argv_free(list);
                exit_status = ORTE_ERR_FATAL;
                goto cleanup;
            }
        }
        opal_argv_free(list);  /* done with this */
    }

    /*
     * Construct the absolute final path, if requested
     */
    if (NULL != fulldirpath) {
        *fulldirpath = opal_os_path(false, prefix, sessions, NULL);
    }

    /*
     * Return the frontend and prefix, if user requested we do so
     */
    if (NULL != return_frontend) {
        *return_frontend = strdup(frontend);
    }
    if (!prefix_provided && NULL != return_prefix) {
        *return_prefix = strdup(prefix);
    }

 cleanup:
    if (NULL != hostname) {
        free(hostname);
    }
    if (NULL != batchname) {
        free(batchname);
    }
    if (NULL != sessions) {
        free(sessions);
    }
    if (NULL != user) {
        free(user);
    }
    if (NULL != prefix) {
        free(prefix);
    }
    if (NULL != frontend) {
        free(frontend);
    }
    if (NULL != jobfam) {
        free(jobfam);
    }
    if (NULL != job) {
        free(job);
    }
    if (NULL != vpidstr) {
        free(vpidstr);
    }
    return exit_status;
}
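/*
 * Standalone sketch of the session-directory layout constructed above,
 * using hypothetical user/host/jobid values. opal_os_path() joins the
 * components with the platform path separator, which plain snprintf
 * approximates here; the prefix is the tmp dir selected by the function.
 */
#include <stdio.h>

int main(void)
{
    char fulldirpath[256];

    /* <prefix>/openmpi-sessions-USERNAME@HOSTNAME_BATCHID/JOB-FAMILY/JOBID/VPID */
    snprintf(fulldirpath, sizeof(fulldirpath), "%s/%s/%d/%d/%d",
             "/tmp", "openmpi-sessions-alice@node01_0", 1234, 1, 7);
    printf("%s\n", fulldirpath);
    return 0;
}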
int orte_session_dir_finalize(orte_process_name_t *proc)
{
    int rc;
    char *tmp;
    char *job_session_dir, *vpid, *proc_session_dir;

    if (!orte_create_session_dirs) {
        /* we didn't create them */
        return ORTE_SUCCESS;
    }

    if (NULL == orte_process_info.tmpdir_base &&
        NULL == orte_process_info.top_session_dir) {
        /* this should never happen - it means we are calling
         * cleanup *before* properly setting up the session
         * dir system. This leaves open the possibility of
         * accidentally removing directories we shouldn't touch
         */
        ORTE_ERROR_LOG(ORTE_ERR_NOT_INITIALIZED);
        return ORTE_ERR_NOT_INITIALIZED;
    }

    /* need to setup the top_session_dir with the prefix */
    tmp = opal_os_path(false,
                       orte_process_info.tmpdir_base,
                       orte_process_info.top_session_dir, NULL);

    /* define the proc and job session directories for this process */
    if (ORTE_SUCCESS != (rc = orte_util_convert_vpid_to_string(&vpid, proc->vpid))) {
        ORTE_ERROR_LOG(rc);
        free(tmp);
        return rc;
    }
    job_session_dir = orte_build_job_session_dir(tmp, proc, proc->jobid);
    if (NULL == job_session_dir) {
        free(tmp);
        free(vpid);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }
    proc_session_dir = opal_os_path(false, job_session_dir, vpid, NULL);
    if (NULL == proc_session_dir) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        free(tmp);
        free(vpid);
        free(job_session_dir);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }

    opal_os_dirpath_destroy(proc_session_dir, false, orte_dir_check_file);
    opal_os_dirpath_destroy(job_session_dir, false, orte_dir_check_file);
    opal_os_dirpath_destroy(tmp, false, orte_dir_check_file);

    if (opal_os_dirpath_is_empty(proc_session_dir)) {
        if (orte_debug_flag) {
            opal_output(0, "sess_dir_finalize: found proc session dir empty - deleting");
        }
        rmdir(proc_session_dir);
    } else {
        if (orte_debug_flag) {
            if (OPAL_ERR_NOT_FOUND == opal_os_dirpath_access(proc_session_dir, 0)) {
                opal_output(0, "sess_dir_finalize: proc session dir does not exist");
            } else {
                opal_output(0, "sess_dir_finalize: proc session dir not empty - leaving");
            }
        }
        goto CLEANUP;
    }

    if (opal_os_dirpath_is_empty(job_session_dir)) {
        if (orte_debug_flag) {
            opal_output(0, "sess_dir_finalize: found job session dir empty - deleting");
        }
        rmdir(job_session_dir);
    } else {
        if (orte_debug_flag) {
            if (OPAL_ERR_NOT_FOUND == opal_os_dirpath_access(job_session_dir, 0)) {
                opal_output(0, "sess_dir_finalize: job session dir does not exist");
            } else {
                opal_output(0, "sess_dir_finalize: job session dir not empty - leaving");
            }
        }
        goto CLEANUP;
    }

    if (opal_os_dirpath_is_empty(tmp)) {
        if (orte_debug_flag) {
            opal_output(0, "sess_dir_finalize: found top session dir empty - deleting");
        }
        rmdir(tmp);
    } else {
        if (orte_debug_flag) {
            if (OPAL_ERR_NOT_FOUND == opal_os_dirpath_access(tmp, 0)) {
                opal_output(0, "sess_dir_finalize: top session dir does not exist");
            } else {
                opal_output(0, "sess_dir_finalize: top session dir not empty - leaving");
            }
        }
    }

 CLEANUP:
    free(tmp);
    free(vpid);
    free(job_session_dir);
    free(proc_session_dir);
    return ORTE_SUCCESS;
}
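/*
 * Standalone sketch of the bottom-up "remove only if empty" cleanup
 * performed above, with a hypothetical session-dir hierarchy. The
 * original checks emptiness explicitly via opal_os_dirpath_is_empty();
 * this approximation leans on the fact that rmdir() itself fails (with
 * ENOTEMPTY) on a non-empty directory, so deleting proc, then job, then
 * top dirs stops naturally at the first level still in use.
 */
#include <stdio.h>
#include <unistd.h>

static void remove_if_empty(const char *path)
{
    if (0 == rmdir(path)) {
        printf("removed %s\n", path);
    } else {
        printf("left %s in place\n", path);   /* non-empty or missing */
    }
}

int main(void)
{
    /* deepest level first, mirroring proc -> job -> top order */
    remove_if_empty("/tmp/openmpi-sessions-alice@node01_0/1234/1/7");
    remove_if_empty("/tmp/openmpi-sessions-alice@node01_0/1234/1");
    remove_if_empty("/tmp/openmpi-sessions-alice@node01_0");
    return 0;
}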