static void _job_init_task_info(stepd_step_rec_t *job, uint32_t *gtid,
				char *ifname, char *ofname, char *efname)
{
	int i;
	char *in, *out, *err;

	if (job->node_tasks == 0) {
		error("User requested launch of zero tasks!");
		job->task = NULL;
		return;
	}

	job->task = (stepd_step_task_info_t **)
		xmalloc(job->node_tasks * sizeof(stepd_step_task_info_t *));

	for (i = 0; i < job->node_tasks; i++) {
		in  = _expand_stdio_filename(ifname, gtid[i], job);
		out = _expand_stdio_filename(ofname, gtid[i], job);
		err = _expand_stdio_filename(efname, gtid[i], job);
		job->task[i] = task_info_create(i, gtid[i], in, out, err);
		if (job->multi_prog) {
			multi_prog_get_argv(job->argv[1], job->env, gtid[i],
					    &job->task[i]->argc,
					    &job->task[i]->argv,
					    job->argc, job->argv);
		} else {
			job->task[i]->argc = job->argc;
			job->task[i]->argv = job->argv;
		}
	}
}

static void _job_init_task_info(stepd_step_rec_t *job, uint32_t **gtid,
				char *ifname, char *ofname, char *efname)
{
	int i, node_id = job->nodeid;
	char *in, *out, *err;

	if (job->node_tasks == 0) {
		error("User requested launch of zero tasks!");
		job->task = NULL;
		return;
	}

#if defined(HAVE_NATIVE_CRAY)
	/* Flag the step as non-SMP if any node's global task IDs
	 * are not consecutive. */
	for (i = 0; i < job->nnodes; i++) {
		int j;
		for (j = 1; j < job->task_cnts[i]; j++) {
			if (gtid[i][j] != gtid[i][j-1] + 1) {
				job->non_smp = 1;
				break;
			}
		}
	}
#endif

	job->task = (stepd_step_task_info_t **)
		xmalloc(job->node_tasks * sizeof(stepd_step_task_info_t *));

	/* For a normal (non multi-prog) launch, resolve argv[0] to a
	 * full path before tasks inherit it. */
	if (((job->flags & LAUNCH_MULTI_PROG) == 0) && job->argv) {
		char *new_path = build_path(job->argv[0], job->env, job->cwd);
		xfree(job->argv[0]);
		job->argv[0] = new_path;
	}

	for (i = 0; i < job->node_tasks; i++) {
		in  = _expand_stdio_filename(ifname, gtid[node_id][i], job);
		out = _expand_stdio_filename(ofname, gtid[node_id][i], job);
		err = _expand_stdio_filename(efname, gtid[node_id][i], job);
		job->task[i] = task_info_create(i, gtid[node_id][i], in, out,
						err);
		if ((job->flags & LAUNCH_MULTI_PROG) == 0) {
			job->task[i]->argc = job->argc;
			job->task[i]->argv = job->argv;
		}
	}

	if (job->flags & LAUNCH_MULTI_PROG) {
		char *switch_type = slurm_get_switch_type();
		if (!xstrcmp(switch_type, "switch/cray"))
			multi_prog_parse(job, gtid);
		xfree(switch_type);
		for (i = 0; i < job->node_tasks; i++) {
			multi_prog_get_argv(job->argv[1], job->env,
					    gtid[node_id][i],
					    &job->task[i]->argc,
					    &job->task[i]->argv,
					    job->argc, job->argv);
		}
	}
}

static void _job_init_task_info(stepd_step_rec_t *job, uint32_t **gtid,
				char *ifname, char *ofname, char *efname)
{
	int i, node_id = job->nodeid;
	char *in, *out, *err;

	if (job->node_tasks == 0) {
		error("User requested launch of zero tasks!");
		job->task = NULL;
		return;
	}

	job->task = (stepd_step_task_info_t **)
		xmalloc(job->node_tasks * sizeof(stepd_step_task_info_t *));

	for (i = 0; i < job->node_tasks; i++) {
		in  = _expand_stdio_filename(ifname, gtid[node_id][i], job);
		out = _expand_stdio_filename(ofname, gtid[node_id][i], job);
		err = _expand_stdio_filename(efname, gtid[node_id][i], job);
		job->task[i] = task_info_create(i, gtid[node_id][i], in, out,
						err);
		if (!job->multi_prog) {
			job->task[i]->argc = job->argc;
			job->task[i]->argv = job->argv;
		}
	}

	if (job->multi_prog) {
		char *switch_type = slurm_get_switch_type();
		if (!strcmp(switch_type, "switch/cray"))
			multi_prog_parse(job, gtid);
		xfree(switch_type);
		for (i = 0; i < job->node_tasks; i++) {
			multi_prog_get_argv(job->argv[1], job->env,
					    gtid[node_id][i],
					    &job->task[i]->argc,
					    &job->task[i]->argv,
					    job->argc, job->argv);
		}
	}
}

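/*
 * Illustrative sketch only: _expand_stdio_filename() is called in the
 * variants above but its body is not shown here.  The hypothetical
 * expand_task_pattern() below is a minimal stand-in, assuming a "%t"
 * specifier expands to the task's global rank, which is the general idea
 * behind giving each task its own stdin/stdout/stderr file name.
 */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static char *expand_task_pattern(const char *pattern, uint32_t gtid)
{
	/* Worst case: every "%t" pair expands to a 10-digit rank. */
	size_t len = strlen(pattern) * 10 + 1;
	char *out = malloc(len);
	char *p = out;

	if (!out)
		return NULL;
	while (*pattern) {
		if ((pattern[0] == '%') && (pattern[1] == 't')) {
			p += sprintf(p, "%u", (unsigned int) gtid);
			pattern += 2;
		} else {
			*p++ = *pattern++;
		}
	}
	*p = '\0';
	return out;
}

int main(void)
{
	/* "step.%t.out" yields a distinct file per task, e.g. "step.7.out". */
	char *name = expand_task_pattern("step.%t.out", 7);

	printf("%s\n", name);
	free(name);
	return 0;
}
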
extern stepd_step_rec_t *
batch_stepd_step_rec_create(batch_job_launch_msg_t *msg)
{
	stepd_step_rec_t *job;
	srun_info_t *srun = NULL;
	char *in_name;

	xassert(msg != NULL);

	debug3("entering batch_stepd_step_rec_create");

	if (!_valid_uid_gid((uid_t)msg->uid, &(msg->gid), &(msg->user_name)))
		return NULL;

	if (_check_acct_freq_task(msg->job_mem, msg->acctg_freq))
		return NULL;

	job = xmalloc(sizeof(stepd_step_rec_t));

	job->state = SLURMSTEPD_STEP_STARTING;
	if (msg->cpus_per_node)
		job->cpus = msg->cpus_per_node[0];
	job->node_tasks = 1;
	job->ntasks = msg->ntasks;
	job->jobid = msg->job_id;
	job->stepid = msg->step_id;
	job->array_job_id = msg->array_job_id;
	job->array_task_id = msg->array_task_id;

	job->batch = true;
	/* This needs to happen before acct_gather_profile_startpoll
	 * and only really looks at the profile in the job. */
	acct_gather_profile_g_node_step_start(job);
	/* needed for the jobacct_gather plugin to start */
	acct_gather_profile_startpoll(msg->acctg_freq,
				      conf->job_acct_gather_freq);

	job->multi_prog = 0;
	job->open_mode = msg->open_mode;
	job->overcommit = (bool) msg->overcommit;

	job->node_name = xstrdup(conf->node_name);

	job->uid = (uid_t) msg->uid;
	job->user_name = xstrdup(msg->user_name);
	job->gid = (gid_t) msg->gid;
	job->cwd = xstrdup(msg->work_dir);

	job->ckpt_dir = xstrdup(msg->ckpt_dir);
	job->restart_dir = xstrdup(msg->restart_dir);

	job->env = _array_copy(msg->envc, msg->environment);
	job->eio = eio_handle_create();
	job->sruns = list_create((ListDelF) _srun_info_destructor);
	job->envtp = xmalloc(sizeof(env_t));
	job->envtp->jobid = -1;
	job->envtp->stepid = -1;
	job->envtp->procid = -1;
	job->envtp->localid = -1;
	job->envtp->nodeid = -1;
	job->envtp->distribution = 0;
	job->cpu_bind_type = msg->cpu_bind_type;
	job->cpu_bind = xstrdup(msg->cpu_bind);
	job->envtp->mem_bind_type = 0;
	job->envtp->mem_bind = NULL;
	job->envtp->ckpt_dir = NULL;
	job->envtp->restart_cnt = msg->restart_cnt;

	if (msg->cpus_per_node)
		job->cpus = msg->cpus_per_node[0];

	format_core_allocs(msg->cred, conf->node_name, conf->cpus,
			   &job->job_alloc_cores, &job->step_alloc_cores,
			   &job->job_mem, &job->step_mem);
	if (job->step_mem)
		jobacct_gather_set_mem_limit(job->jobid, NO_VAL,
					     job->step_mem);
	else if (job->job_mem)
		jobacct_gather_set_mem_limit(job->jobid, NO_VAL, job->job_mem);

	get_cred_gres(msg->cred, conf->node_name,
		      &job->job_gres_list, &job->step_gres_list);

	srun = srun_info_create(NULL, NULL, NULL);

	list_append(job->sruns, (void *) srun);

	if (msg->argc) {
		job->argc = msg->argc;
		job->argv = _array_copy(job->argc, msg->argv);
	} else {
		job->argc = 1;
		/* job script has not yet been written out to disk --
		 * argv will be filled in later by _make_batch_script() */
		job->argv = (char **) xmalloc(2 * sizeof(char *));
	}

	job->task = xmalloc(sizeof(stepd_step_task_info_t *));
	if (msg->std_err == NULL)
		msg->std_err = xstrdup(msg->std_out);

	if (msg->std_in == NULL)
		in_name = xstrdup("/dev/null");
	else
		in_name = fname_create(job, msg->std_in, 0);

	job->task[0] = task_info_create(0, 0, in_name,
					_batchfilename(job, msg->std_out),
					_batchfilename(job, msg->std_err));
	job->task[0]->argc = job->argc;
	job->task[0]->argv = job->argv;

#ifdef HAVE_ALPS_CRAY
	select_g_select_jobinfo_get(msg->select_jobinfo, SELECT_JOBDATA_RESV_ID,
				    &job->resv_id);
#endif

	return job;
}

slurmd_job_t *
job_batch_job_create(batch_job_launch_msg_t *msg)
{
	struct passwd *pwd;
	slurmd_job_t *job;
	srun_info_t *srun = NULL;
	char *in_name;

	xassert(msg != NULL);

	debug3("entering batch_job_create");

	if ((pwd = _pwd_create((uid_t)msg->uid)) == NULL) {
		error("uid %ld not found on system", (long) msg->uid);
		slurm_seterrno (ESLURMD_UID_NOT_FOUND);
		return NULL;
	}
	if (!_valid_gid(pwd, &(msg->gid))) {
		slurm_seterrno (ESLURMD_GID_NOT_FOUND);
		_pwd_destroy(pwd);
		return NULL;
	}
	if (msg->job_mem && (msg->acctg_freq != (uint16_t) NO_VAL) &&
	    (msg->acctg_freq > conf->job_acct_gather_freq)) {
		error("Can't set frequency to %u, it is higher than %u. "
		      "We need it to be at least at this level to "
		      "monitor memory usage.",
		      msg->acctg_freq, conf->job_acct_gather_freq);
		slurm_seterrno (ESLURMD_INVALID_ACCT_FREQ);
		_pwd_destroy(pwd);
		return NULL;
	}

	job = xmalloc(sizeof(slurmd_job_t));

	job->state = SLURMSTEPD_STEP_STARTING;
	job->pwd = pwd;
	if (msg->cpus_per_node)
		job->cpus = msg->cpus_per_node[0];
	job->node_tasks = 1;
	job->ntasks = msg->ntasks;
	job->jobid = msg->job_id;
	job->stepid = msg->step_id;

	job->batch = true;
	if (msg->acctg_freq != (uint16_t) NO_VAL)
		jobacct_gather_change_poll(msg->acctg_freq);
	job->multi_prog = 0;
	job->open_mode = msg->open_mode;
	job->overcommit = (bool) msg->overcommit;

	job->node_name = xstrdup(conf->node_name);

	job->uid = (uid_t) msg->uid;
	job->gid = (gid_t) msg->gid;
	job->cwd = xstrdup(msg->work_dir);

	job->ckpt_dir = xstrdup(msg->ckpt_dir);
	job->restart_dir = xstrdup(msg->restart_dir);

	job->env = _array_copy(msg->envc, msg->environment);
	job->eio = eio_handle_create();
	job->sruns = list_create((ListDelF) _srun_info_destructor);
	job->envtp = xmalloc(sizeof(env_t));
	job->envtp->jobid = -1;
	job->envtp->stepid = -1;
	job->envtp->procid = -1;
	job->envtp->localid = -1;
	job->envtp->nodeid = -1;
	job->envtp->distribution = 0;
	job->cpu_bind_type = msg->cpu_bind_type;
	job->cpu_bind = xstrdup(msg->cpu_bind);
	job->envtp->mem_bind_type = 0;
	job->envtp->mem_bind = NULL;
	job->envtp->ckpt_dir = NULL;
	job->envtp->restart_cnt = msg->restart_cnt;

	if (msg->cpus_per_node)
		job->cpus = msg->cpus_per_node[0];

	format_core_allocs(msg->cred, conf->node_name,
			   &job->job_alloc_cores, &job->step_alloc_cores,
			   &job->job_mem, &job->step_mem);
	if (job->step_mem)
		jobacct_gather_set_mem_limit(job->jobid, NO_VAL,
					     job->step_mem);
	else if (job->job_mem)
		jobacct_gather_set_mem_limit(job->jobid, NO_VAL, job->job_mem);

	get_cred_gres(msg->cred, conf->node_name,
		      &job->job_gres_list, &job->step_gres_list);

	srun = srun_info_create(NULL, NULL, NULL);

	list_append(job->sruns, (void *) srun);

	if (msg->argc) {
		job->argc = msg->argc;
		job->argv = _array_copy(job->argc, msg->argv);
	} else {
		job->argc = 1;
		/* job script has not yet been written out to disk --
		 * argv will be filled in later by _make_batch_script() */
		job->argv = (char **) xmalloc(2 * sizeof(char *));
	}

	job->task = xmalloc(sizeof(slurmd_task_info_t *));
	if (msg->std_err == NULL)
		msg->std_err = xstrdup(msg->std_out);

	if (msg->std_in == NULL)
		in_name = xstrdup("/dev/null");
	else
		in_name = fname_create(job, msg->std_in, 0);

	job->task[0] = task_info_create(0, 0, in_name,
					_batchfilename(job, msg->std_out),
					_batchfilename(job, msg->std_err));
	job->task[0]->argc = job->argc;
	job->task[0]->argv = job->argv;

#ifdef HAVE_CRAY
	select_g_select_jobinfo_get(msg->select_jobinfo, SELECT_JOBDATA_RESV_ID,
				    &job->resv_id);
#endif

	return job;
}
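
/*
 * Illustrative sketch only: _array_copy() is used above to duplicate the
 * launch environment and argv, but its body is not shown here.  The
 * hypothetical copy_string_array() below shows the usual shape of such a
 * helper, assuming it duplicates cnt strings into a fresh NULL-terminated
 * array so the job keeps its own copy after the launch message is freed.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static char **copy_string_array(int cnt, char **src)
{
	char **dst;
	int i;

	if (!src)
		return NULL;
	/* One extra slot for the terminating NULL pointer. */
	dst = calloc(cnt + 1, sizeof(char *));
	if (!dst)
		return NULL;
	for (i = 0; i < cnt; i++)
		dst[i] = strdup(src[i]);
	dst[cnt] = NULL;
	return dst;
}

int main(void)
{
	char *env[] = { "PATH=/usr/bin", "HOME=/tmp" };
	char **copy = copy_string_array(2, env);
	int i;

	for (i = 0; copy && copy[i]; i++) {
		printf("%s\n", copy[i]);
		free(copy[i]);
	}
	free(copy);
	return 0;
}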