Esempio n. 1
0
/*
 * Set the environment-variable form of MCA parameter "name" to "value"
 * in *penv.  The variable name is obtained from mca_base_param_env_var()
 * and released here.  Nothing is set if that name cannot be generated.
 */
static void yarn_setenv_mca(const char *name, const char *value,
                            bool overwrite, char ***penv)
{
    char *var = mca_base_param_env_var(name);

    if (NULL == var) {
        return;
    }
    opal_setenv(var, value, overwrite, penv);
    free(var);
}

/*
 * Build the argv and environment used to launch one process.
 *
 * jdata  job the process belongs to (jobid, num_procs and the peer
 *        collective ids are read from it)
 * app    application context; app->argv and app->env are the starting
 *        points for the generated argv / environment
 * proc   process being launched; proc->node and proc->node->daemon must
 *        be non-NULL
 * pargv  out: receives the generated argv
 * penv   out: receives the generated environment
 *
 * Returns ORTE_SUCCESS on success.  Returns ORTE_ERROR when app->argv,
 * proc->node or proc->node->daemon is NULL, and ORTE_ERR_OUT_OF_RESOURCE
 * when a required string/array cannot be created.  On any failure *pargv
 * and *penv are left untouched and nothing allocated here is leaked.
 */
static int setup_proc_env_and_argv(orte_job_t* jdata, orte_app_context_t* app,
        orte_proc_t* proc, char ***pargv, char ***penv)
{
    char **argv = NULL;
    char **env = NULL;
    char *vp_id_str = NULL;
    char *job_id_str = NULL;
    const char *java_opt;
    char num[32];   /* large enough for any 64-bit integer plus sign and NUL */
    int rc = ORTE_SUCCESS;
    int i, num_nodes;

    /* validate every input up front so that the failure paths below
     * never have partially built results to release */
    if (!(app->argv)) {
        opal_output(0, "%s plm::yarn::setup_proc_env_and_argv: app->argv is null",
            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
        return ORTE_ERROR;
    }
    if (!proc->node) {
        opal_output(0, "%s plm::yarn::setup_proc_env_and_argv: node of proc[%lu] is NULL",
            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (unsigned long)proc->name.vpid);
        return ORTE_ERROR;
    }
    if (!proc->node->daemon) {
        opal_output(0, "%s plm::yarn::setup_proc_env_and_argv: daemon of node[%s] is NULL",
            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), proc->node->name);
        return ORTE_ERROR;
    }

    /* string forms of the jobid and vpid; both are owned (and freed) here */
    if (ORTE_SUCCESS != orte_util_convert_jobid_to_string(&job_id_str, jdata->jobid)) {
        job_id_str = NULL;  /* do not trust the pointer after a failure */
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        rc = ORTE_ERR_OUT_OF_RESOURCE;
        goto cleanup;
    }
    if (ORTE_SUCCESS != orte_util_convert_vpid_to_string(&vp_id_str, proc->name.vpid)) {
        vp_id_str = NULL;   /* do not trust the pointer after a failure */
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        rc = ORTE_ERR_OUT_OF_RESOURCE;
        goto cleanup;
    }

    /* start from the application's own argv */
    argv = opal_argv_copy(app->argv);
    if (NULL == argv) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        rc = ORTE_ERR_OUT_OF_RESOURCE;
        goto cleanup;
    }

    /* add stdout, stderr redirections to the app command */
    opal_argv_append_nosize(&argv, "1><LOG_DIR>/stdout");
    opal_argv_append_nosize(&argv, "2><LOG_DIR>/stderr");

    /* wrap the app in the java executor; each prepend goes in front of
     * the previous one, so the final order is
     *   $JAVA_HOME/bin/java <opts> -cp hamster-core.jar <class> <jobid> <vpid> app...
     */
    java_opt = getenv("HAMSTER_JAVA_OPT");
    if (NULL == java_opt) {
        java_opt = "-Xmx32M -Xms8M";
    }
    opal_argv_prepend_nosize(&argv, vp_id_str);
    opal_argv_prepend_nosize(&argv, job_id_str);
    opal_argv_prepend_nosize(&argv, "com.pivotal.hamster.yarnexecutor.YarnExecutor");
    opal_argv_prepend_nosize(&argv, "hamster-core.jar");
    opal_argv_prepend_nosize(&argv, "-cp");
    opal_argv_prepend_nosize(&argv, java_opt);
    opal_argv_prepend_nosize(&argv, "$JAVA_HOME/bin/java");

    /* start from our environment merged with the app's */
    env = opal_environ_merge(environ, app->env);

    /* set the app_context number into the environment */
    snprintf(num, sizeof(num), "%ld", (long)app->idx);
    yarn_setenv_mca("orte_app_num", num, true, &env);

    /* pass the daemon's name */
    yarn_setenv_mca("orte_local_daemon_uri", proc->node->daemon->rml_uri, true, &env);

    /* pass my contact info */
    yarn_setenv_mca("orte_hnp_uri", orte_process_info.my_hnp_uri, true, &env);

    /* pass the jobid */
    yarn_setenv_mca("orte_ess_jobid", job_id_str, true, &env);

    /* pass the rank, both as an MCA param and as the OMPI envar */
    yarn_setenv_mca("orte_ess_vpid", vp_id_str, true, &env);
    opal_setenv("OMPI_COMM_WORLD_RANK", vp_id_str, true, &env);

    /* pass local rank */
    snprintf(num, sizeof(num), "%lu", (unsigned long) proc->local_rank);
    opal_setenv("OMPI_COMM_WORLD_LOCAL_RANK", num, true, &env);

    /* pass node rank, and set an mca param for it too */
    snprintf(num, sizeof(num), "%lu", (unsigned long) proc->node_rank);
    opal_setenv("OMPI_COMM_WORLD_NODE_RANK", num, true, &env);
    yarn_setenv_mca("orte_ess_node_rank", num, true, &env);

    /* pass a param telling the child what model of cpu we are on,
     * if we know it - do not overwrite what the user may have provided
     */
    if (NULL != orte_local_cpu_type) {
        yarn_setenv_mca("orte_cpu_type", orte_local_cpu_type, false, &env);
    }
    if (NULL != orte_local_cpu_model) {
        yarn_setenv_mca("orte_cpu_model", orte_local_cpu_model, false, &env);
    }

    /* pass the number of nodes involved in this job.
     * we have to count the number of nodes as the size of orte_node_pool
     * is only guaranteed to be equal or larger than that number - i.e.,
     * the pointer_array increases the size by a block each time, so some
     * of the locations are left empty
     */
    num_nodes = 0;
    for (i = 0; i < orte_node_pool->size; i++) {
        if (NULL != opal_pointer_array_get_item(orte_node_pool, i)) {
            num_nodes++;
        }
    }
    snprintf(num, sizeof(num), "%d", num_nodes);
    yarn_setenv_mca("orte_num_nodes", num, true, &env);

    /* setup yield schedule, unless the user already chose one */
    yarn_setenv_mca("mpi_yield_when_idle", "0", false, &env);

    /* set MPI universe envar */
    orte_ess_env_put(jdata->num_procs, proc->node->num_procs, &env);

    snprintf(num, sizeof(num), "%ld", (long) jdata->num_procs);
    opal_setenv("OMPI_UNIVERSE_SIZE", num, true, &env);

    /* pass collective ids for the std MPI operations */
    snprintf(num, sizeof(num), "%d", jdata->peer_modex);
    yarn_setenv_mca("orte_peer_modex_id", num, true, &env);

    snprintf(num, sizeof(num), "%d", jdata->peer_init_barrier);
    yarn_setenv_mca("orte_peer_init_barrier_id", num, true, &env);

    snprintf(num, sizeof(num), "%d", jdata->peer_fini_barrier);
    yarn_setenv_mca("orte_peer_fini_barrier_id", num, true, &env);

    /* finally, drop the module-selection params inherited from our own
     * environment so they are not forced onto the child */
    opal_unsetenv("OMPI_MCA_plm", &env);
    opal_unsetenv("OMPI_MCA_ras", &env);
    opal_unsetenv("OMPI_MCA_ess", &env);
    opal_unsetenv("OMPI_MCA_state", &env);
    opal_unsetenv("OMPI_MCA_errmgr", &env);

cleanup:
    free(job_id_str);
    free(vp_id_str);
    if (ORTE_SUCCESS == rc) {
        /* publish the results only once everything succeeded */
        *pargv = argv;
        *penv = env;
    } else {
        if (NULL != argv) {
            opal_argv_free(argv);
        }
        if (NULL != env) {
            opal_argv_free(env);
        }
    }
    return rc;
}
Esempio n. 2
0
/*
 * Prepare an app context so that it is executed through Singularity.
 *
 * The function acts when the job lists the "singularity" personality, or
 * when argv[0] is "singularity", "sapprun", or contains ".sapp".
 * Otherwise it returns ORTE_ERR_TAKE_NEXT_OPTION without touching the app.
 *
 * When it acts it:
 *   - exports SINGULARITY_CACHEDIR (into app->env and our own environ)
 *     unless the ORTE_APP_NO_CACHEDIR attribute is set,
 *   - for a ".sapp" argv[0], runs "singularity install" from the app's
 *     working directory and replaces argv[0] with the basename stripped
 *     of the extension,
 *   - rewrites app->app / app->argv to "singularity [-vv|--quiet] run ..."
 *     when app->app is not already "singularity".
 *
 * Returns ORTE_SUCCESS, ORTE_ERR_TAKE_NEXT_OPTION, ORTE_ERROR when the
 * working directory cannot be saved/changed/restored, or
 * ORTE_ERR_OUT_OF_RESOURCE when the install command cannot be built.
 */
static int setup_fork(orte_job_t *jdata,
                      orte_app_context_t *app)
{
    int i;
    int len;
    int rc = ORTE_SUCCESS;
    bool takeus = false;
    char *cmd, *ext, *stem;
    char dir[MAXPATHLEN];

    /* guard jdata->personality as well: it is indexed below */
    if (NULL != orte_schizo_base.personalities && NULL != jdata->personality) {
        /* see if we are included */
        for (i=0; NULL != jdata->personality[i]; i++) {
            if (0 == strcmp(jdata->personality[i], "singularity")) {
                takeus = true;
                break;
            }
        }
    }
    if (!takeus) {
        /* even if they didn't specify, check to see if
         * this involves a singularity container */
        if (0 != strcmp(app->argv[0],"singularity") &&
            0 != strcmp(app->argv[0],"sapprun") &&
            NULL == strstr(app->argv[0], ".sapp")) {
            /* guess not! */
            return ORTE_ERR_TAKE_NEXT_OPTION;
        }
    }

    /* set the singularity cache dir, unless asked not to do so */
    if (!orte_get_attribute(&app->attributes, ORTE_APP_NO_CACHEDIR, NULL, OPAL_BOOL)) {
        opal_setenv("SINGULARITY_CACHEDIR", orte_process_info.job_session_dir, true, &app->env);
        opal_setenv("SINGULARITY_CACHEDIR", orte_process_info.job_session_dir, true, &environ);
    }

    /* save our current directory - without it we could not come back */
    if (NULL == getcwd(dir, sizeof(dir))) {
        return ORTE_ERROR;
    }

    /* change to the working directory for this context, if one was given */
    if (NULL != app->cwd && 0 != chdir(app->cwd)) {
        return ORTE_ERROR;
    }

    /* if the app contains .sapp, then we need to strip that
     * extension so singularity doesn't bark at us */
    if (NULL != strstr(app->argv[0], ".sapp")) {
        /* ensure the app is installed */
        opal_output_verbose(1, orte_schizo_base_framework.framework_output,
                            "%s schizo:singularity: installing app %s",
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), app->argv[0]);
        stem = opal_basename(app->argv[0]);
        /* ".sapp" may have matched only a directory component of argv[0],
         * in which case the basename has no extension to strip */
        ext = (NULL != stem) ? strstr(stem, ".sapp") : NULL;
        if (NULL == ext) {
            free(stem);
        } else {
            *ext = '\0'; // strip the extension
            /* NOTE(review): argv[0] is interpolated into a shell command
             * line; confirm it can never carry shell metacharacters */
            if (0 < opal_output_get_verbosity(orte_schizo_base_framework.framework_output)) {
                len = asprintf(&cmd, "singularity -vv install --runkey %s %s", stem, app->argv[0]);
            } else {
                len = asprintf(&cmd, "singularity --quiet install --runkey %s %s", stem, app->argv[0]);
            }
            if (0 > len) {
                /* cmd is not usable after a failed asprintf */
                free(stem);
                rc = ORTE_ERR_OUT_OF_RESOURCE;
                goto restore;
            }
            /* installation stays best-effort, but no longer fails silently */
            if (0 != system(cmd)) {
                opal_output_verbose(1, orte_schizo_base_framework.framework_output,
                                    "%s schizo:singularity: install command failed for %s",
                                    ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), app->argv[0]);
            }
            free(cmd);
            /* argv[0] takes ownership of the stripped basename */
            free(app->argv[0]);
            app->argv[0] = stem;
        }
    }

    /* ensure that we use "singularity run" to execute this app */
    if (0 != strcmp(app->app, "singularity")) {
        opal_output_verbose(1, orte_schizo_base_framework.framework_output,
                            "%s schizo:singularity: adding singularity cmd",
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
        /* change the app to the "singularity" command */
        free(app->app);
        app->app = strdup("singularity");
        /* prepends are applied in reverse of the final order */
        opal_argv_prepend_nosize(&app->argv, "run");
        if (0 < opal_output_get_verbosity(orte_schizo_base_framework.framework_output)) {
            opal_argv_prepend_nosize(&app->argv, "-vv");
        } else {
            opal_argv_prepend_nosize(&app->argv, "--quiet");
        }
        opal_argv_prepend_nosize(&app->argv, "singularity");
    }

restore:
    /* return to the original directory; keep an earlier error code if any */
    if (0 != chdir(dir) && ORTE_SUCCESS == rc) {
        rc = ORTE_ERROR;
    }

    return rc;
}
Esempio n. 3
0
/*
 * Link the binaries of a run to the matching executable/version defined
 * in the application, then hand a spawn request to the launcher.
 *
 * app      application whose executables (and their versions) are searched
 * run      run request whose binaries are to be linked
 * linkall  when true, a binary that was already linked still counts as a
 *          reason to launch
 *
 * A binary is linked only when an executable with the same appname and a
 * version with the same version string both exist, and adding its procs
 * does not exceed the executable's process limit (a negative limit means
 * no check).  If at least one binary qualifies, a job holding one app
 * context per linked binary is created and a pointer to a spawn caddy is
 * written to the launch pipe.
 */
static void link_launch(orcm_cfgi_app_t *app,
                        orcm_cfgi_run_t *run,
                        bool linkall)
{
    orcm_cfgi_caddy_t *caddy;
    orcm_cfgi_exec_t *exec, *ecand;
    orcm_cfgi_version_t *vers, *vcand;
    orcm_cfgi_bin_t *bin;
    orte_job_t *jdat;
    orte_app_context_t *ax;
    bool launch = false;
    int b, n;

    /* pass 1: link all the required binaries */
    for (b = 0; b < run->binaries.size; b++) {
        bin = (orcm_cfgi_bin_t*)opal_pointer_array_get_item(&run->binaries, b);
        if (NULL == bin) {
            continue;
        }

        /* locate the executable with the same appname; the scan stops at
         * the first match */
        exec = NULL;
        for (n = 0; NULL == exec && n < app->executables.size; n++) {
            ecand = (orcm_cfgi_exec_t*)opal_pointer_array_get_item(&app->executables, n);
            if (NULL != ecand && 0 == strcmp(ecand->appname, bin->appname)) {
                exec = ecand;
            }
        }
        if (NULL == exec) {
            /* executable hasn't been defined yet */
            continue;
        }

        /* locate the requested version of that executable */
        vers = NULL;
        for (n = 0; NULL == vers && n < exec->versions.size; n++) {
            vcand = (orcm_cfgi_version_t*)opal_pointer_array_get_item(&exec->versions, n);
            if (NULL != vcand && 0 == strcmp(vcand->version, bin->version)) {
                vers = vcand;
            }
        }
        if (NULL == vers) {
            /* version hasn't been defined yet */
            continue;
        }

        /* already linked earlier: only counts when asked to launch all */
        if (NULL != bin->vers) {
            launch = launch || linkall;
            continue;
        }

        /* is there enough room left for this number of procs? */
        if (0 <= exec->process_limit &&
            exec->process_limit < (bin->num_procs + exec->total_procs)) {
            opal_output(0, "%s EXECUTABLE %s: MAX NUMBER OF ALLOWED PROCS (%d) EXCEEDED - CANNOT ADD %d PROCS, ALREADY HAVE %d",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                        (NULL == exec->appname) ? "NULL" : exec->appname,
                        exec->process_limit, bin->num_procs, exec->total_procs);
            continue;
        }

        /* make the link in both directions */
        bin->vers = vers;
        bin->vers_idx = opal_pointer_array_add(&vers->binaries, bin);
        bin->exec = exec;
        launch = true;
        OPAL_OUTPUT_VERBOSE((2, orcm_cfgi_base.output,
                             "%s LINKED BINARY %s TO VERSION %s:%s with num_procs %d",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             bin->binary, exec->appname, vers->version, bin->num_procs));
    }

    /* nothing qualified - nothing to launch */
    if (!launch) {
        return;
    }

    /* pass 2: create the ORTE job object, one app_context per linked binary */
    jdat = OBJ_NEW(orte_job_t);
    jdat->name = strdup(run->application);
    jdat->instance = strdup(run->instance);
    for (b = 0; b < run->binaries.size; b++) {
        bin = (orcm_cfgi_bin_t*)opal_pointer_array_get_item(&run->binaries, b);
        if (NULL == bin || NULL == bin->vers) {
            /* empty slot, or binary not installed yet */
            continue;
        }
        ax = OBJ_NEW(orte_app_context_t);
        ax->app = strdup(bin->binary);
        /* argv = the binary itself followed by the version's arguments */
        if (NULL != bin->vers->argv) {
            ax->argv = opal_argv_copy(bin->vers->argv);
        }
        opal_argv_prepend_nosize(&ax->argv, bin->binary);
        ax->num_procs = bin->num_procs;
        /* add it to the job */
        ax->idx = opal_pointer_array_add(jdat->apps, ax);
        jdat->num_apps++;
    }

    /* notify the launcher: the caddy pointer itself is what travels
     * through the pipe */
    caddy = OBJ_NEW(orcm_cfgi_caddy_t);
    caddy->cmd = ORCM_CFGI_SPAWN;
    caddy->jdata = jdat;
    opal_fd_write(orcm_cfgi_base.launch_pipe[1], sizeof(orcm_cfgi_caddy_t*), &caddy);
}