/*
 * Build the environment for a spawned srun and exec it.
 *
 * Starts from a copy of job_info.job_env, overlays the job id and the PMI2
 * spawn variables (spawner job id, PMI job id, spawn sequence, spawner port)
 * plus one key/value pair of environment variables per preput entry, then
 * hands off to _exec_srun_single() or _exec_srun_multiple() depending on
 * req->subcmd_cnt.
 *
 * Per the call-site contract those helpers do not return on success, so
 * everything after them is the failure path: a spawn response carrying rc is
 * sent to a faked local srun address and the process exits.
 *
 * This function never returns.
 */
static void _setup_exec_srun(spawn_req_t *req)
{
	char **env, env_key[32];
	int i, rc, saved_errno;
	spawn_resp_t *resp;

	debug3("mpi/pmi2: in _setup_exec_srun");

	/* setup environments */
	env = env_array_copy((const char **)job_info.job_env);
	/* TODO: unset some env-vars */

	env_array_overwrite_fmt(&env, "SLURM_JOB_ID", "%u", job_info.jobid);
	env_array_overwrite_fmt(&env, PMI2_SPAWNER_JOBID_ENV, "%s",
				job_info.pmi_jobid);
	env_array_overwrite_fmt(&env, PMI2_PMI_JOBID_ENV, "%s-%u",
				job_info.pmi_jobid, req->seq);
	env_array_overwrite_fmt(&env, PMI2_SPAWN_SEQ_ENV, "%u", req->seq);
	env_array_overwrite_fmt(&env, PMI2_SPAWNER_PORT_ENV, "%hu",
				tree_info.pmi_port);

	/* preput kvs */
	env_array_overwrite_fmt(&env, PMI2_PREPUT_CNT_ENV, "%d",
				req->preput_cnt);
	for (i = 0; i < req->preput_cnt; i++) {
		snprintf(env_key, sizeof(env_key), PMI2_PPKEY_ENV"%d", i);
		env_array_overwrite_fmt(&env, env_key, "%s", req->pp_keys[i]);
		snprintf(env_key, sizeof(env_key), PMI2_PPVAL_ENV"%d", i);
		env_array_overwrite_fmt(&env, env_key, "%s", req->pp_vals[i]);
	}

	if (req->subcmd_cnt == 1) {
		/* no return if success */
		rc = _exec_srun_single(req, env);
	} else {
		/* no return if success */
		rc = _exec_srun_multiple(req, env);
	}

	/*
	 * Capture errno now: the allocation, formatting and network calls
	 * below may overwrite it before it is used as the exit status.
	 */
	saved_errno = errno;

	resp = spawn_resp_new();
	resp->seq = req->seq;
	xstrfmtcat(resp->jobid, "%s-%u", job_info.pmi_jobid, req->seq);
	resp->error_cnt = 0;
	resp->rc = rc;

	/* fake a srun address */
	tree_info.srun_addr = xmalloc(sizeof(slurm_addr_t));
	slurm_set_addr(tree_info.srun_addr, tree_info.pmi_port, "127.0.0.1");
	spawn_resp_send_to_srun(resp);
	spawn_resp_free(resp);

	exit(saved_errno);
}
/*
 * Current process is running as the user when this is called.
 *
 * Prepare the environment for local task number i of the job and replace the
 * current process image with the task's executable.
 *
 * Steps, in order: create the tmpdir (task 0 only), fill in job->envtp and
 * rebuild job->env from it, resolve a relative argv[0], attach the
 * interconnect and MPI plugin (non-batch only), run the spank and task plugin
 * hooks, run the configured and user task prologs, apply user limits, then
 * execve().
 *
 * This function never returns: every failure path calls exit(), and a
 * successful execve() replaces the process.
 */
void exec_task(slurmd_job_t *job, int i)
{
	uint32_t *gtids;		/* pointer to array of ranks */
	int fd, j;
	int saved_errno;
	slurmd_task_info_t *task = job->task[i];
	char **tmp_env;

	if (i == 0)
		_make_tmpdir(job);

	/* Collect the global task ids of every task on this node */
	gtids = xmalloc(job->node_tasks * sizeof(uint32_t));
	for (j = 0; j < job->node_tasks; j++)
		gtids[j] = job->task[j]->gtid;
	job->envtp->sgtids = _uint32_array_to_str(job->node_tasks, gtids);
	xfree(gtids);

	job->envtp->jobid = job->jobid;
	job->envtp->stepid = job->stepid;
	job->envtp->nodeid = job->nodeid;
	job->envtp->cpus_on_node = job->cpus;
	job->envtp->procid = task->gtid;
	job->envtp->localid = task->id;
	job->envtp->task_pid = getpid();
	job->envtp->distribution = job->task_dist;
	job->envtp->cpu_bind = xstrdup(job->cpu_bind);
	job->envtp->cpu_bind_type = job->cpu_bind_type;
	job->envtp->cpu_freq = job->cpu_freq;
	job->envtp->mem_bind = xstrdup(job->mem_bind);
	job->envtp->mem_bind_type = job->mem_bind_type;
	/*
	 * NOTE(review): this overwrites the task_dist value stored a few
	 * lines above, so -1 is what setup_env() sees. Kept as-is; confirm
	 * which of the two assignments is intended.
	 */
	job->envtp->distribution = -1;
	job->envtp->ckpt_dir = xstrdup(job->ckpt_dir);
	job->envtp->batch_flag = job->batch;

	/* Modify copy of job's environment. Do not alter in place or
	 * concurrent searches of the environment can generate invalid memory
	 * references. */
	job->envtp->env = env_array_copy((const char **) job->env);
	setup_env(job->envtp, false);
	setenvf(&job->envtp->env, "SLURMD_NODENAME", "%s", conf->node_name);

	/* Swap the rebuilt environment in and release the old one */
	tmp_env = job->env;
	job->env = job->envtp->env;
	env_array_free(tmp_env);
	job->envtp->env = NULL;

	xfree(job->envtp->task_count);

	if (task->argv[0] && *task->argv[0] != '/') {
		/*
		 * Normally the client (srun) expands the command name
		 * to a fully qualified path, but in --multi-prog mode it
		 * is left up to the server to search the PATH for the
		 * executable.
		 */
		task->argv[0] = _build_path(task->argv[0], job->env);
	}

	if (!job->batch) {
		if (interconnect_attach(job->switch_job, &job->env,
					job->nodeid, (uint32_t) i,
					job->nnodes, job->ntasks,
					task->gtid) < 0) {
			error("Unable to attach to interconnect: %m");
			log_fini();
			exit(1);
		}

		if (_setup_mpi(job, i) != SLURM_SUCCESS) {
			error("Unable to configure MPI plugin: %m");
			log_fini();
			exit(1);
		}
	}

	/* task-specific pre-launch activities */
	if (spank_user_task(job, i) < 0) {
		error("Failed to invoke task plugin stack");
		exit(1);
	}

	/* task plugin hook */
	if (pre_launch(job)) {
		error("Failed task affinity setup");
		exit(1);
	}

	if (conf->task_prolog) {
		char *my_prolog;

		/* Copy the path under the config lock, run it unlocked */
		slurm_mutex_lock(&conf->config_mutex);
		my_prolog = xstrdup(conf->task_prolog);
		slurm_mutex_unlock(&conf->config_mutex);
		_run_script_and_set_env("slurm task_prolog", my_prolog, job);
		xfree(my_prolog);
	}
	if (job->task_prolog) {
		_run_script_and_set_env("user task_prolog",
					job->task_prolog, job);
	}

	if (!job->batch)
		pdebug_stop_current(job);

	if (job->env == NULL) {
		debug("job->env is NULL");
		job->env = (char **)xmalloc(sizeof(char *));
		job->env[0] = (char *)NULL;
	}

	if (job->restart_dir) {
		info("restart from %s", job->restart_dir);
		/* no return on success */
		checkpoint_restart_task(job, job->restart_dir, task->gtid);
		error("Restart task failed: %m");
		exit(errno);
	}

	if (task->argv[0] == NULL) {
		error("No executable program specified for this task");
		exit(2);
	}

	/* Do this last so you don't worry too much about the users
	   limits including the slurmstepd in with it.
	*/
	if (set_user_limits(job) < 0) {
		debug("Unable to set user limits");
		log_fini();
		exit(5);
	}

	execve(task->argv[0], task->argv, job->env);
	/* Keep execve()'s errno: open(), read() and error() may change it */
	saved_errno = errno;

	/*
	 * print error message and clean up if execve() returns:
	 */
	if ((saved_errno == ENOENT) &&
	    ((fd = open(task->argv[0], O_RDONLY)) >= 0)) {
		char buf[256], *eol;
		int sz;

		/*
		 * Leave room for a terminator and place it right after the
		 * bytes actually read, so strchr() and "%s" never look at
		 * uninitialised or out-of-bounds memory.
		 */
		sz = read(fd, buf, sizeof(buf) - 1);
		if ((sz >= 3) && (strncmp(buf, "#!", 2) == 0)) {
			buf[sz] = '\0';
			eol = strchr(buf, '\n');
			if (eol)
				eol[0] = '\0';
			errno = saved_errno;
			error("execve(): bad interpreter(%s): %m", buf+2);
			exit(saved_errno);
		}
	}
	errno = saved_errno;
	error("execve(): %s: %m", task->argv[0]);
	exit(saved_errno);
}
/*
 * Current process is running as the user when this is called.
 *
 * Prepare the environment for local task local_proc_id of the step and
 * replace the current process image with the task's executable.
 *
 * Steps, in order: fill in job->envtp (applying the node/task offsets and
 * pack job id when they are not NO_VAL) and rebuild job->env from it, resolve
 * a relative argv[0], attach the switch and MPI plugins (skipped for batch
 * steps and for SLURM_EXTERN_CONT), run the task plugin pre-launch hook, add
 * the GRES environment when binding/frequency options are set, run the spank
 * hook and the task prologs, create the tmpdir (task 0 only), apply user
 * limits, then execve().
 *
 * This function never returns: every failure path calls exit(), and a
 * successful execve() replaces the process.
 */
extern void exec_task(stepd_step_rec_t *job, int local_proc_id)
{
	uint32_t *gtids;		/* pointer to array of ranks */
	int fd, j;
	stepd_step_task_info_t *task = job->task[local_proc_id];
	char **tmp_env;
	int saved_errno;
	uint32_t node_offset = 0, task_offset = 0;

	if (job->node_offset != NO_VAL)
		node_offset = job->node_offset;
	if (job->pack_task_offset != NO_VAL)
		task_offset = job->pack_task_offset;

	/* Collect the (offset) global task ids of every task on this node */
	gtids = xmalloc(job->node_tasks * sizeof(uint32_t));
	for (j = 0; j < job->node_tasks; j++)
		gtids[j] = job->task[j]->gtid + task_offset;
	job->envtp->sgtids = _uint32_array_to_str(job->node_tasks, gtids);
	xfree(gtids);

	if (job->pack_jobid != NO_VAL)
		job->envtp->jobid = job->pack_jobid;
	else
		job->envtp->jobid = job->jobid;
	job->envtp->stepid = job->stepid;
	job->envtp->nodeid = job->nodeid + node_offset;
	job->envtp->cpus_on_node = job->cpus;
	job->envtp->procid = task->gtid + task_offset;
	job->envtp->localid = task->id;
	job->envtp->task_pid = getpid();
	job->envtp->distribution = job->task_dist;
	job->envtp->cpu_bind = xstrdup(job->cpu_bind);
	job->envtp->cpu_bind_type = job->cpu_bind_type;
	job->envtp->cpu_freq_min = job->cpu_freq_min;
	job->envtp->cpu_freq_max = job->cpu_freq_max;
	job->envtp->cpu_freq_gov = job->cpu_freq_gov;
	job->envtp->mem_bind = xstrdup(job->mem_bind);
	job->envtp->mem_bind_type = job->mem_bind_type;
	/*
	 * NOTE(review): this overwrites the task_dist value stored a few
	 * lines above, so -1 is what setup_env() sees. Kept as-is; confirm
	 * which of the two assignments is intended.
	 */
	job->envtp->distribution = -1;
	job->envtp->ckpt_dir = xstrdup(job->ckpt_dir);
	job->envtp->batch_flag = job->batch;
	job->envtp->uid = job->uid;
	job->envtp->user_name = xstrdup(job->user_name);

	/*
	 * Modify copy of job's environment. Do not alter in place or
	 * concurrent searches of the environment can generate invalid memory
	 * references.
	 */
	job->envtp->env = env_array_copy((const char **) job->env);
	setup_env(job->envtp, false);
	setenvf(&job->envtp->env, "SLURM_JOB_GID", "%d", job->gid);
	setenvf(&job->envtp->env, "SLURMD_NODENAME", "%s", conf->node_name);
	if (job->tres_bind) {
		setenvf(&job->envtp->env, "SLURMD_TRES_BIND", "%s",
			job->tres_bind);
	}
	if (job->tres_freq) {
		setenvf(&job->envtp->env, "SLURMD_TRES_FREQ", "%s",
			job->tres_freq);
	}

	/* Swap the rebuilt environment in and release the old one */
	tmp_env = job->env;
	job->env = job->envtp->env;
	env_array_free(tmp_env);
	job->envtp->env = NULL;

	xfree(job->envtp->task_count);

	if (task->argv[0] && *task->argv[0] != '/') {
		/*
		 * Normally the client (srun) expands the command name
		 * to a fully qualified path, but in --multi-prog mode it
		 * is left up to the server to search the PATH for the
		 * executable.
		 */
		task->argv[0] = _build_path(task->argv[0], job->env, NULL);
	}

	if (!job->batch && (job->stepid != SLURM_EXTERN_CONT)) {
		if (switch_g_job_attach(job->switch_job, &job->env,
					job->nodeid, (uint32_t) local_proc_id,
					job->nnodes, job->ntasks,
					task->gtid) < 0) {
			error("Unable to attach to interconnect: %m");
			log_fini();
			exit(1);
		}

		if (_setup_mpi(job, local_proc_id) != SLURM_SUCCESS) {
			error("Unable to configure MPI plugin: %m");
			log_fini();
			exit(1);
		}
	}

	/* task-specific pre-launch activities */

	/* task plugin hook */
	if (task_g_pre_launch(job)) {
		error("Failed to invoke task plugins: task_p_pre_launch error");
		exit(1);
	}
	if (!job->batch &&
	    (job->accel_bind_type || job->tres_bind || job->tres_freq)) {
		/*
		 * Modify copy of job's environment. Do not alter in place or
		 * concurrent searches of the environment can generate invalid
		 * memory references.
		 *
		 * Also sets GRES frequency as needed.
		 */
		job->envtp->env = env_array_copy((const char **) job->env);
		gres_plugin_step_set_env(&job->envtp->env, job->step_gres_list,
					 job->accel_bind_type, job->tres_bind,
					 job->tres_freq, local_proc_id);
		tmp_env = job->env;
		job->env = job->envtp->env;
		env_array_free(tmp_env);
		/*
		 * Ownership moved to job->env; drop the alias, matching the
		 * first environment swap above.
		 */
		job->envtp->env = NULL;
	}

	if (spank_user_task(job, local_proc_id) < 0) {
		error("Failed to invoke spank plugin stack");
		exit(1);
	}

	if (conf->task_prolog) {
		char *my_prolog;

		/* Copy the path under the config lock, run it unlocked */
		slurm_mutex_lock(&conf->config_mutex);
		my_prolog = xstrdup(conf->task_prolog);
		slurm_mutex_unlock(&conf->config_mutex);
		_run_script_and_set_env("slurm task_prolog", my_prolog, job);
		xfree(my_prolog);
	}
	if (job->task_prolog) {
		_run_script_and_set_env("user task_prolog",
					job->task_prolog, job);
	}

	/*
	 * Set TMPDIR after running prolog scripts, since TMPDIR
	 * might be set or changed in one of the prolog scripts.
	 */
	if (local_proc_id == 0)
		_make_tmpdir(job);

	if (!job->batch)
		pdebug_stop_current(job);
	if (job->env == NULL) {
		debug("job->env is NULL");
		job->env = (char **)xmalloc(sizeof(char *));
		job->env[0] = (char *)NULL;
	}

	if (job->restart_dir) {
		info("restart from %s", job->restart_dir);
		/* no return on success */
		checkpoint_restart_task(job, job->restart_dir, task->gtid);
		error("Restart task failed: %m");
		exit(errno);
	}

	if (task->argv[0] == NULL) {
		error("No executable program specified for this task");
		exit(2);
	}

	/* Do this last so you don't worry too much about the users
	   limits including the slurmstepd in with it.
	*/
	if (set_user_limits(job) < 0) {
		debug("Unable to set user limits");
		log_fini();
		exit(5);
	}

	execve(task->argv[0], task->argv, job->env);
	/* Keep execve()'s errno: open(), read() and error() may change it */
	saved_errno = errno;

	/*
	 * print error message and clean up if execve() returns:
	 */
	if ((saved_errno == ENOENT) &&
	    ((fd = open(task->argv[0], O_RDONLY)) >= 0)) {
		char buf[256], *eol;
		int sz;

		/*
		 * Leave room for a terminator and place it right after the
		 * bytes actually read. Terminating only the last byte of the
		 * array would let strchr() and "%s" walk uninitialised stack
		 * bytes after a short read.
		 */
		sz = read(fd, buf, sizeof(buf) - 1);
		if ((sz >= 3) && (xstrncmp(buf, "#!", 2) == 0)) {
			buf[sz] = '\0';
			eol = strchr(buf, '\n');
			if (eol)
				eol[0] = '\0';
			slurm_seterrno(saved_errno);
			error("execve(): bad interpreter(%s): %m", buf+2);
			exit(saved_errno);
		}
	}
	slurm_seterrno(saved_errno);
	error("execve(): %s: %m", task->argv[0]);
	exit(saved_errno);
}
static int _setup_stepd_job_info(const stepd_step_rec_t *job, char ***env) { char *p; int i; memset(&job_info, 0, sizeof(job_info)); job_info.jobid = job->jobid; job_info.stepid = job->stepid; job_info.nnodes = job->nnodes; job_info.nodeid = job->nodeid; job_info.ntasks = job->ntasks; job_info.ltasks = job->node_tasks; job_info.gtids = xmalloc(job->node_tasks * sizeof(uint32_t)); for (i = 0; i < job->node_tasks; i ++) { job_info.gtids[i] = job->task[i]->gtid; } p = getenvp(*env, PMI2_PMI_DEBUGGED_ENV); if (p) { job_info.pmi_debugged = atoi(p); } else { job_info.pmi_debugged = 0; } p = getenvp(*env, PMI2_SPAWN_SEQ_ENV); if (p) { /* spawned */ job_info.spawn_seq = atoi(p); unsetenvp(*env, PMI2_SPAWN_SEQ_ENV); p = getenvp(*env, PMI2_SPAWNER_JOBID_ENV); job_info.spawner_jobid = xstrdup(p); unsetenvp(*env, PMI2_SPAWNER_JOBID_ENV); } else { job_info.spawn_seq = 0; job_info.spawner_jobid = NULL; } p = getenvp(*env, PMI2_PMI_JOBID_ENV); if (p) { job_info.pmi_jobid = xstrdup(p); unsetenvp(*env, PMI2_PMI_JOBID_ENV); } else { xstrfmtcat(job_info.pmi_jobid, "%u.%u", job->jobid, job->stepid); } p = getenvp(*env, PMI2_STEP_NODES_ENV); if (!p) { error("mpi/pmi2: unable to find nodes in job environment"); return SLURM_ERROR; } else { job_info.step_nodelist = xstrdup(p); unsetenvp(*env, PMI2_STEP_NODES_ENV); } /* * how to get the mapping info from stepd directly? * there is the task distribution info in the launch_tasks_request_msg_t, * but it is not stored in the stepd_step_rec_t. */ p = getenvp(*env, PMI2_PROC_MAPPING_ENV); if (!p) { error("PMI2_PROC_MAPPING_ENV not found"); return SLURM_ERROR; } else { job_info.proc_mapping = xstrdup(p); unsetenvp(*env, PMI2_PROC_MAPPING_ENV); } job_info.job_env = env_array_copy((const char **)*env); job_info.MPIR_proctable = NULL; job_info.srun_opt = NULL; return SLURM_SUCCESS; }
static int _setup_srun_job_info(const mpi_plugin_client_info_t *job) { char *p; void *handle = NULL, *sym = NULL; memset(&job_info, 0, sizeof(job_info)); job_info.jobid = job->jobid; job_info.stepid = job->stepid; job_info.nnodes = job->step_layout->node_cnt; job_info.nodeid = -1; /* id in tree. not used. */ job_info.ntasks = job->step_layout->task_cnt; job_info.ltasks = 0; /* not used */ job_info.gtids = NULL; /* not used */ p = getenv(PMI2_PMI_DEBUGGED_ENV); if (p) { job_info.pmi_debugged = atoi(p); } else { job_info.pmi_debugged = 0; } p = getenv(PMI2_SPAWN_SEQ_ENV); if (p) { /* spawned */ job_info.spawn_seq = atoi(p); p = getenv(PMI2_SPAWNER_JOBID_ENV); job_info.spawner_jobid = xstrdup(p); /* env unset in stepd */ } else { job_info.spawn_seq = 0; job_info.spawner_jobid = NULL; } job_info.step_nodelist = xstrdup(job->step_layout->node_list); job_info.proc_mapping = _get_proc_mapping(job); if (job_info.proc_mapping == NULL) { return SLURM_ERROR; } p = getenv(PMI2_PMI_JOBID_ENV); if (p) { /* spawned */ job_info.pmi_jobid = xstrdup(p); } else { xstrfmtcat(job_info.pmi_jobid, "%u.%u", job->jobid, job->stepid); } job_info.job_env = env_array_copy((const char **)environ); /* hjcao: this is really dirty. But writing a new launcher is not desirable. */ handle = dlopen(NULL, RTLD_LAZY); if (handle == NULL) { error("mpi/pmi2: failed to dlopen()"); return SLURM_ERROR; } sym = dlsym(handle, "MPIR_proctable"); if (sym == NULL) { /* if called directly in API, there may be no symbol available */ verbose ("mpi/pmi2: failed to find symbol 'MPIR_proctable'"); job_info.MPIR_proctable = NULL; } else { job_info.MPIR_proctable = *(MPIR_PROCDESC **)sym; } sym = dlsym(handle, "opt"); if (sym == NULL) { verbose("mpi/pmi2: failed to find symbol 'opt'"); job_info.srun_opt = NULL; } else { job_info.srun_opt = (opt_t *)sym; } dlclose(handle); return SLURM_SUCCESS; }
/*
 * Build the environment for one srun job (component) and store it in
 * job->env.
 *
 * A temporary env_t is filled from opt_local and job, a copy of environ is
 * attached to it, and setup_env() is run on it; the resulting environment
 * array is handed to job->env and the temporary env_t is released.
 *
 * Along the way this also broadcasts the executable when
 * opt_local->bcast_flag is set and, when --pty is in effect, switches the
 * local terminal to raw mode, registers _pty_restore() with atexit() and
 * starts the pty thread. If set_winsize() fails, --pty is disabled in
 * opt_local.
 *
 * opt_local  - options for this job component (pty may be cleared)
 * job        - job record; must not be NULL
 * got_alloc  - when false, env->nhosts is left untouched
 */
static void _setup_one_job_env(opt_t *opt_local, srun_job_t *job,
			       bool got_alloc)
{
	env_t *env = xmalloc(sizeof(*env));
	uint16_t *task_cnts = NULL;

	xassert(job);

	/* Identifiers that are not known yet start out as -1 */
	env->localid = -1;
	env->nodeid = -1;
	env->procid = -1;
	env->stepid = -1;

	if (opt_local->bcast_flag)
		_file_bcast(opt_local, job);

	/* Only copy the options that were actually set */
	if (opt_local->cpus_set)
		env->cpus_per_task = opt_local->cpus_per_task;
	if (opt_local->ntasks_per_node != NO_VAL)
		env->ntasks_per_node = opt_local->ntasks_per_node;
	if (opt_local->ntasks_per_socket != NO_VAL)
		env->ntasks_per_socket = opt_local->ntasks_per_socket;
	if (opt_local->ntasks_per_core != NO_VAL)
		env->ntasks_per_core = opt_local->ntasks_per_core;
	env->distribution = opt_local->distribution;
	if (opt_local->plane_size != NO_VAL)
		env->plane_size = opt_local->plane_size;

	env->cpu_bind_type = opt_local->cpu_bind_type;
	env->cpu_bind = opt_local->cpu_bind;
	env->cpu_freq_min = opt_local->cpu_freq_min;
	env->cpu_freq_max = opt_local->cpu_freq_max;
	env->cpu_freq_gov = opt_local->cpu_freq_gov;
	env->mem_bind_type = opt_local->mem_bind_type;
	env->mem_bind = opt_local->mem_bind;
	env->overcommit = opt_local->overcommit;
	env->slurmd_debug = opt_local->slurmd_debug;
	env->labelio = opt_local->labelio;
	env->comm_port = slurmctld_comm_addr.port;
	if (opt_local->job_name)
		env->job_name = opt_local->job_name;

	slurm_step_ctx_get(job->step_ctx, SLURM_STEP_CTX_TASKS, &task_cnts);

	env->select_jobinfo = job->select_jobinfo;
	env->nodelist = job->pack_node_list ? job->pack_node_list
					    : job->nodelist;
	env->partition = job->partition;

	/*
	 * If we didn't get the allocation don't overwrite the previous info.
	 */
	if (got_alloc)
		env->nhosts = job->nhosts;

	env->ntasks = job->ntasks;
	if (job->pack_ntasks != NO_VAL)
		env->ntasks = job->pack_ntasks;
	env->task_count = _uint16_array_to_str(job->nhosts, task_cnts);

	if (job->pack_jobid != NO_VAL)
		env->jobid = job->pack_jobid;
	else
		env->jobid = job->jobid;
	env->stepid = job->stepid;
	env->account = job->account;
	env->qos = job->qos;
	env->resv_name = job->resv_name;

	if (opt_local->pty && (set_winsize(job) < 0)) {
		error("Not using a pseudo-terminal, disregarding --pty option");
		opt_local->pty = false;
	}
	if (opt_local->pty) {
		struct termios raw_term;
		int tty_fd = STDIN_FILENO;

		/* Save terminal settings for restore */
		tcgetattr(tty_fd, &termdefaults);
		tcgetattr(tty_fd, &raw_term);

		/* Set raw mode on local tty */
		cfmakeraw(&raw_term);

		/* Re-enable output processing such that debug() and
		 * and error() work properly. */
		raw_term.c_oflag |= OPOST;
		tcsetattr(tty_fd, TCSANOW, &raw_term);
		atexit(&_pty_restore);

		block_sigwinch();
		pty_thread_create(job);

		env->pty_port = job->pty_port;
		env->ws_col = job->ws_col;
		env->ws_row = job->ws_row;
	}

	env->env = env_array_copy((const char **) environ);
	setup_env(env, opt_local->preserve_env);

	/* job->env takes over the array; only the env_t wrapper is freed */
	job->env = env->env;
	xfree(env->task_count);
	xfree(env);
}