Exemplo n.º 1
0
/*
 * batch_stepd_step_rec_create - build the step record used to run a batch job
 * IN msg - batch launch request; must not be NULL (enforced with xassert).
 *	NOTE: when msg->std_err is NULL it is replaced with a copy of
 *	msg->std_out, so the caller's message is modified.
 * RET newly allocated step record, or NULL if _valid_uid_gid() rejects the
 *	requested user/group or acct_gather_check_acct_freq_task() returns
 *	non-zero.
 */
extern stepd_step_rec_t *
batch_stepd_step_rec_create(batch_job_launch_msg_t *msg)
{
	stepd_step_rec_t *step;
	srun_info_t *launcher = NULL;
	char *stdin_path;

	xassert(msg != NULL);

	debug3("entering batch_stepd_step_rec_create");

	/* Refuse the request before anything is allocated */
	if (!_valid_uid_gid((uid_t)msg->uid, &(msg->gid), &(msg->user_name))) {
		return NULL;
	}
	if (acct_gather_check_acct_freq_task(msg->job_mem, msg->acctg_freq)) {
		return NULL;
	}

	step = xmalloc(sizeof(*step));

	/* Identity and sizing copied from the launch request */
	step->state         = SLURMSTEPD_STEP_STARTING;
	if (msg->cpus_per_node) {
		step->cpus  = msg->cpus_per_node[0];
	}
	step->node_tasks    = 1;
	step->ntasks        = msg->ntasks;
	step->jobid         = msg->job_id;
	step->stepid        = msg->step_id;
	step->array_job_id  = msg->array_job_id;
	step->array_task_id = msg->array_task_id;
	step->job_core_spec = msg->job_core_spec;
	step->batch         = true;

	/*
	 * The profile plugin must be told about the step before
	 * acct_gather_profile_startpoll() runs; it only really looks at
	 * the profile in the step record.
	 */
	acct_gather_profile_g_node_step_start(step);
	/* The jobacct_gather plugin needs the polling to be started */
	acct_gather_profile_startpoll(msg->acctg_freq,
				      conf->job_acct_gather_freq);

	step->multi_prog  = 0;
	step->open_mode   = msg->open_mode;
	step->overcommit  = (bool) msg->overcommit;
	step->node_name   = xstrdup(conf->node_name);

	/* Credentials and working directory */
	step->uid         = (uid_t) msg->uid;
	step->user_name   = xstrdup(msg->user_name);
	step->gid         = (gid_t) msg->gid;
	step->cwd         = xstrdup(msg->work_dir);

	step->ckpt_dir    = xstrdup(msg->ckpt_dir);
	step->restart_dir = xstrdup(msg->restart_dir);

	step->env         = _array_copy(msg->envc, msg->environment);
	step->eio         = eio_handle_create();
	step->sruns       = list_create((ListDelF) _srun_info_destructor);
	step->envtp       = xmalloc(sizeof(env_t));

	step->cpu_bind_type = msg->cpu_bind_type;
	step->cpu_bind      = xstrdup(msg->cpu_bind);

	/* Environment template: ids are set to -1 here */
	step->envtp->jobid         = -1;
	step->envtp->stepid        = -1;
	step->envtp->procid        = -1;
	step->envtp->localid       = -1;
	step->envtp->nodeid        = -1;
	step->envtp->distribution  = 0;
	step->envtp->mem_bind_type = 0;
	step->envtp->mem_bind      = NULL;
	step->envtp->ckpt_dir      = NULL;
	step->envtp->restart_cnt   = msg->restart_cnt;

	/* Same assignment as performed earlier in this function */
	if (msg->cpus_per_node) {
		step->cpus = msg->cpus_per_node[0];
	}

	format_core_allocs(msg->cred, conf->node_name, conf->cpus,
			   &step->job_alloc_cores, &step->step_alloc_cores,
			   &step->job_mem, &step->step_mem);

	/* With enforcement on, a step limit is used in preference to a job limit */
	if (conf->mem_limit_enforce) {
		if (step->step_mem) {
			jobacct_gather_set_mem_limit(step->jobid, NO_VAL,
						     step->step_mem);
		} else if (step->job_mem) {
			jobacct_gather_set_mem_limit(step->jobid, NO_VAL,
						     step->job_mem);
		}
	}

	get_cred_gres(msg->cred, conf->node_name,
		      &step->job_gres_list, &step->step_gres_list);

	/* One srun record, created with all-NULL arguments, goes on the list */
	launcher = srun_info_create(NULL, NULL, NULL);
	list_append(step->sruns, launcher);

	if (!msg->argc) {
		/*
		 * The job script has not been written to disk yet, so only
		 * two pointer slots are reserved here; _make_batch_script()
		 * fills argv in later.
		 */
		step->argc = 1;
		step->argv = xmalloc(2 * sizeof(char *));
	} else {
		step->argc = msg->argc;
		step->argv = _array_copy(step->argc, msg->argv);
	}

	step->task = xmalloc(sizeof(stepd_step_task_info_t *));

	/* stderr falls back to the stdout specification when unset */
	if (msg->std_err == NULL) {
		msg->std_err = xstrdup(msg->std_out);
	}

	/* stdin falls back to /dev/null when unset */
	stdin_path = (msg->std_in == NULL)
		? xstrdup("/dev/null")
		: fname_create(step, msg->std_in, 0);

	step->task[0] = task_info_create(0, 0, stdin_path,
					 _batchfilename(step, msg->std_out),
					 _batchfilename(step, msg->std_err));
	/* The single task shares the step's argument vector (no copy) */
	step->task[0]->argc = step->argc;
	step->task[0]->argv = step->argv;

#ifdef HAVE_ALPS_CRAY
	select_g_select_jobinfo_get(msg->select_jobinfo, SELECT_JOBDATA_RESV_ID,
				    &step->resv_id);
#endif

	return step;
}
Exemplo n.º 2
0
/*
 * job_batch_job_create - build the slurmd job record used to run a batch job
 * IN msg - batch launch request; must not be NULL (enforced with xassert).
 *	NOTE: when msg->std_err is NULL it is replaced with a copy of
 *	msg->std_out, so the caller's message is modified.
 * RET newly allocated job record (which keeps the passwd entry in job->pwd),
 *	or NULL with the slurm errno set to ESLURMD_UID_NOT_FOUND,
 *	ESLURMD_GID_NOT_FOUND or ESLURMD_INVALID_ACCT_FREQ.
 */
slurmd_job_t *
job_batch_job_create(batch_job_launch_msg_t *msg)
{
    struct passwd *pw_entry;
    slurmd_job_t *rec;
    srun_info_t *launcher = NULL;
    char *stdin_path;

    xassert(msg != NULL);

    debug3("entering batch_job_create");

    pw_entry = _pwd_create((uid_t)msg->uid);
    if (pw_entry == NULL) {
        error("uid %ld not found on system", (long) msg->uid);
        slurm_seterrno (ESLURMD_UID_NOT_FOUND);
        return NULL;
    }

    if (!_valid_gid(pw_entry, &(msg->gid))) {
        slurm_seterrno (ESLURMD_GID_NOT_FOUND);
        goto reject;
    }

    /*
     * With a memory limit requested, a sampling frequency above the
     * configured one is refused (see the error text below).
     */
    if (msg->job_mem && (msg->acctg_freq != (uint16_t) NO_VAL)
        && (msg->acctg_freq > conf->job_acct_gather_freq)) {
        error("Can't set frequency to %u, it is higher than %u.  "
              "We need it to be at least at this level to "
              "monitor memory usage.",
              msg->acctg_freq, conf->job_acct_gather_freq);
        slurm_seterrno (ESLURMD_INVALID_ACCT_FREQ);
        goto reject;
    }

    rec = xmalloc(sizeof(*rec));

    /* Identity and sizing copied from the launch request */
    rec->state      = SLURMSTEPD_STEP_STARTING;
    rec->pwd        = pw_entry;
    if (msg->cpus_per_node) {
        rec->cpus   = msg->cpus_per_node[0];
    }
    rec->node_tasks = 1;
    rec->ntasks     = msg->ntasks;
    rec->jobid      = msg->job_id;
    rec->stepid     = msg->step_id;
    rec->batch      = true;

    /* Apply a polling frequency only when the request carries one */
    if (msg->acctg_freq != (uint16_t) NO_VAL) {
        jobacct_gather_change_poll(msg->acctg_freq);
    }

    rec->multi_prog  = 0;
    rec->open_mode   = msg->open_mode;
    rec->overcommit  = (bool) msg->overcommit;
    rec->node_name   = xstrdup(conf->node_name);

    /* Credentials and working directory */
    rec->uid         = (uid_t) msg->uid;
    rec->gid         = (gid_t) msg->gid;
    rec->cwd         = xstrdup(msg->work_dir);

    rec->ckpt_dir    = xstrdup(msg->ckpt_dir);
    rec->restart_dir = xstrdup(msg->restart_dir);

    rec->env         = _array_copy(msg->envc, msg->environment);
    rec->eio         = eio_handle_create();
    rec->sruns       = list_create((ListDelF) _srun_info_destructor);
    rec->envtp       = xmalloc(sizeof(env_t));

    rec->cpu_bind_type = msg->cpu_bind_type;
    rec->cpu_bind      = xstrdup(msg->cpu_bind);

    /* Environment template: ids are set to -1 here */
    rec->envtp->jobid         = -1;
    rec->envtp->stepid        = -1;
    rec->envtp->procid        = -1;
    rec->envtp->localid       = -1;
    rec->envtp->nodeid        = -1;
    rec->envtp->distribution  = 0;
    rec->envtp->mem_bind_type = 0;
    rec->envtp->mem_bind      = NULL;
    rec->envtp->ckpt_dir      = NULL;
    rec->envtp->restart_cnt   = msg->restart_cnt;

    /* Same assignment as performed earlier in this function */
    if (msg->cpus_per_node) {
        rec->cpus = msg->cpus_per_node[0];
    }

    format_core_allocs(msg->cred, conf->node_name,
                       &rec->job_alloc_cores, &rec->step_alloc_cores,
                       &rec->job_mem, &rec->step_mem);

    /* A step memory limit is used in preference to a job memory limit */
    if (rec->step_mem) {
        jobacct_gather_set_mem_limit(rec->jobid, NO_VAL, rec->step_mem);
    } else if (rec->job_mem) {
        jobacct_gather_set_mem_limit(rec->jobid, NO_VAL, rec->job_mem);
    }

    get_cred_gres(msg->cred, conf->node_name,
                  &rec->job_gres_list, &rec->step_gres_list);

    /* One srun record, created with all-NULL arguments, goes on the list */
    launcher = srun_info_create(NULL, NULL, NULL);
    list_append(rec->sruns, launcher);

    if (!msg->argc) {
        /*
         * The job script has not been written to disk yet, so only two
         * pointer slots are reserved here; _make_batch_script() fills
         * argv in later.
         */
        rec->argc = 1;
        rec->argv = xmalloc(2 * sizeof(char *));
    } else {
        rec->argc = msg->argc;
        rec->argv = _array_copy(rec->argc, msg->argv);
    }

    rec->task = xmalloc(sizeof(slurmd_task_info_t *));

    /* stderr falls back to the stdout specification when unset */
    if (msg->std_err == NULL) {
        msg->std_err = xstrdup(msg->std_out);
    }

    /* stdin falls back to /dev/null when unset */
    stdin_path = (msg->std_in == NULL)
        ? xstrdup("/dev/null")
        : fname_create(rec, msg->std_in, 0);

    rec->task[0] = task_info_create(0, 0, stdin_path,
                                    _batchfilename(rec, msg->std_out),
                                    _batchfilename(rec, msg->std_err));
    /* The single task shares the job's argument vector (no copy) */
    rec->task[0]->argc = rec->argc;
    rec->task[0]->argv = rec->argv;

#ifdef HAVE_CRAY
    select_g_select_jobinfo_get(msg->select_jobinfo, SELECT_JOBDATA_RESV_ID,
                                &rec->resv_id);
#endif

    return rec;

reject:
    /* Validation failed after the passwd lookup: release it and bail out */
    _pwd_destroy(pw_entry);
    return NULL;
}