Exemple #1
0
/*
 * select_p_job_suspend - suspend hook of this select plugin.
 *
 * IN job_ptr   - job being suspended; a NULL pointer is accepted and
 *                treated as a successful no-op
 * IN indf_susp - forwarded unchanged to other_job_suspend()
 * RET SLURM_SUCCESS for a NULL job, SLURM_ERROR if do_basil_switch() does
 *     not return SLURM_SUCCESS, otherwise whatever other_job_suspend()
 *     returns
 */
extern int select_p_job_suspend(struct job_record *job_ptr, bool indf_susp)
{
	if (!job_ptr)
		return SLURM_SUCCESS;

	/*
	 * Jobs for which _zero_size_job() reports true skip the BASIL call
	 * entirely. The second argument (1) selects the suspend direction;
	 * select_p_job_resume() passes 0.
	 */
	if (!_zero_size_job(job_ptr)) {
		if (do_basil_switch(job_ptr, 1) != SLURM_SUCCESS)
			return SLURM_ERROR;
	}

	return other_job_suspend(job_ptr, indf_susp);
}
Exemple #2
0
/*
 * select_p_job_resume - resume hook of this select plugin.
 *
 * IN job_ptr   - job being resumed; a NULL pointer is accepted and treated
 *                as a successful no-op
 * IN indf_susp - forwarded unchanged to other_job_resume()
 * RET SLURM_SUCCESS for a NULL job, SLURM_ERROR if do_basil_switch() does
 *     not return SLURM_SUCCESS, otherwise whatever other_job_resume()
 *     returns
 */
extern int select_p_job_resume(struct job_record *job_ptr, bool indf_susp)
{
	if (!job_ptr)
		return SLURM_SUCCESS;

	/*
	 * The BASIL switch is only attempted when slurmctld_primary is set
	 * and _zero_size_job() reports false for this job. The second
	 * argument (0) selects the resume direction.
	 */
	if (slurmctld_primary && !_zero_size_job(job_ptr)) {
		if (do_basil_switch(job_ptr, 0) != SLURM_SUCCESS)
			return SLURM_ERROR;
	}

	return other_job_resume(job_ptr, indf_susp);
}
Exemple #3
0
/*
 * select_p_job_begin - job start hook of this select plugin.
 *
 * IN/OUT job_ptr - job being started (must not be NULL, enforced with
 *                  xassert); on a failed do_basil_reserve() its
 *                  state_reason is set to WAIT_RESOURCES and its
 *                  state_desc is released with xfree()
 * RET SLURM_ERROR if do_basil_reserve() was attempted and did not return
 *     SLURM_SUCCESS, otherwise whatever other_job_begin() returns
 */
extern int select_p_job_begin(struct job_record *job_ptr)
{
	xassert(job_ptr);

	/* No BASIL reservation is attempted in these two cases. */
	if (!slurmctld_primary || _zero_size_job(job_ptr))
		return other_job_begin(job_ptr);

	if (do_basil_reserve(job_ptr) == SLURM_SUCCESS)
		return other_job_begin(job_ptr);

	/* Reservation failed: record why the job cannot start yet. */
	job_ptr->state_reason = WAIT_RESOURCES;
	xfree(job_ptr->state_desc);
	return SLURM_ERROR;
}
Exemple #4
0
/*
 * select_p_job_signal - signal hook of this select plugin.
 *
 * IN job_ptr - job to signal (must not be NULL, enforced with xassert)
 * IN signal  - signal number to deliver
 * RET SLURM_ERROR if any do_basil_signal() call does not return
 *     SLURM_SUCCESS, otherwise whatever other_job_signal() returns
 */
extern int select_p_job_signal(struct job_record *job_ptr, int signal)
{
	xassert(job_ptr);

	/*
	 * Signals that are likely to terminate the job release the ALPS
	 * reservation right away. Without this there is a race when a job
	 * script contains several aprun lines: while apkill is still busy
	 * with the current aprun, the script moves on and launches the next
	 * one, and so on until the script ends. That causes long delays and
	 * can break the cleanup after the job. With the reservation gone, no
	 * further aprun line can start.
	 */
	if (slurmctld_primary) {
		switch (signal) {
		case SIGCHLD:
		case SIGCONT:
		case SIGSTOP:
		case SIGTSTP:
		case SIGTTIN:
		case SIGTTOU:
		case SIGURG:
		case SIGWINCH:
			/* These signals never trigger a release. */
			break;
		case SIGTERM:
		case SIGKILL:
			/*
			 * With no_apid_signal_on_kill configured, batch jobs
			 * bypass both the release and the BASIL signalling
			 * below and go straight to other_job_signal().
			 */
			if (cray_conf->no_apid_signal_on_kill &&
			    job_ptr->batch_flag)
				return other_job_signal(job_ptr, signal);
			/* fall through */
		default:
			/* Signals at or above SIGRTMIN keep the reservation. */
			if (signal < SIGRTMIN)
				do_basil_release(job_ptr);
			break;
		}
	}

	/* Nothing to signal through BASIL in these two cases. */
	if (!slurmctld_primary || _zero_size_job(job_ptr))
		return other_job_signal(job_ptr, signal);

	if (signal != SIGKILL) {
		if (do_basil_signal(job_ptr, signal) != SLURM_SUCCESS)
			return SLURM_ERROR;
		return other_job_signal(job_ptr, signal);
	}

	/*
	 * SIGKILL is delivered in stages: SIGCONT and SIGTERM are sent
	 * immediately, then SIGKILL is handed to queue_basil_signal() together
	 * with the configured kill-wait value (presumably the delay before it
	 * is sent -- confirm against queue_basil_signal()).
	 */
	uint16_t grace = slurm_get_kill_wait();

	if (do_basil_signal(job_ptr, SIGCONT) != SLURM_SUCCESS)
		return SLURM_ERROR;
	if (do_basil_signal(job_ptr, SIGTERM) != SLURM_SUCCESS)
		return SLURM_ERROR;
	queue_basil_signal(job_ptr, SIGKILL, grace);

	return other_job_signal(job_ptr, signal);
}
Exemple #5
0
/*
 * select_p_job_fini - job termination hook of this select plugin.
 *
 * IN job_ptr - job that has finished; a NULL pointer is accepted and
 *              treated as a successful no-op
 * RET SLURM_SUCCESS for a NULL job or when job_state is NO_VAL,
 *     SLURM_ERROR if do_basil_release() does not return SLURM_SUCCESS,
 *     otherwise whatever other_job_fini() returns
 */
extern int select_p_job_fini(struct job_record *job_ptr)
{
	if (!job_ptr)
		return SLURM_SUCCESS;

	/* Jobs for which _zero_size_job() reports true skip the release. */
	if (!_zero_size_job(job_ptr)) {
		if (do_basil_release(job_ptr) != SLURM_SUCCESS)
			return SLURM_ERROR;
	}

	/*
	 * Same convention as select_p_job_ready: this function can also be
	 * invoked from stepdmgr, which marks its calls with
	 * job_state == NO_VAL. Only the slurmctld context continues on to
	 * other_job_fini().
	 */
	if (job_ptr->job_state != (uint16_t)NO_VAL)
		return other_job_fini(job_ptr);

	return SLURM_SUCCESS;
}
Exemple #6
0
/*
 * select_p_job_ready - job readiness hook of this select plugin.
 *
 * IN job_ptr - job to check (must not be NULL, enforced with xassert)
 * RET the do_basil_confirm() result if it is not SLURM_SUCCESS,
 *     SLURM_SUCCESS when job_state is NO_VAL, otherwise whatever
 *     other_job_ready() returns
 *
 * Convention: besides slurmctld, stepdmgr may call this function to
 * confirm the ALPS reservation of a batch job. Such a call carries a
 * job_ptr with only minimal information, with job_state == NO_VAL to mark
 * the calling context and batch_flag == 0. Hence the confirmation is done
 * only when batch_flag is 0, and other_job_ready() runs only in the
 * slurmctld context.
 */
extern int select_p_job_ready(struct job_record *job_ptr)
{
	xassert(job_ptr);

	if (!job_ptr->batch_flag && !_zero_size_job(job_ptr)) {
		int rc = do_basil_confirm(job_ptr);

		if (rc != SLURM_SUCCESS)
			return rc;
	}

	/* stepdmgr context: nothing more to do after the confirmation. */
	if (job_ptr->job_state == (uint16_t)NO_VAL)
		return SLURM_SUCCESS;

	return other_job_ready(job_ptr);
}
Exemple #7
0
/*
 * select_p_job_fini - job termination hook of this select plugin.
 *
 * IN job_ptr - job that has finished; a NULL pointer is accepted and
 *              treated as a successful no-op
 * RET SLURM_SUCCESS for a NULL job or when job_state is NO_VAL,
 *     SLURM_ERROR if do_basil_release() was attempted and did not return
 *     SLURM_SUCCESS, otherwise whatever other_job_fini() returns
 */
extern int select_p_job_fini(struct job_record *job_ptr)
{
	if (!job_ptr)
		return SLURM_SUCCESS;

	/*
	 * The controller does not release batch jobs; that is handled on the
	 * stepd end. So the release is attempted here either from the primary
	 * controller for non-batch jobs, or from the context that marks its
	 * calls with job_state == NO_VAL.
	 */
	int release_here = (slurmctld_primary && !job_ptr->batch_flag) ||
			   (job_ptr->job_state == NO_VAL);

	if (release_here && !_zero_size_job(job_ptr)) {
		if (do_basil_release(job_ptr) != SLURM_SUCCESS)
			return SLURM_ERROR;
	}

	/*
	 * Same convention as select_p_job_ready: stepdmgr may also call this
	 * function and uses job_state == NO_VAL to distinguish itself from
	 * slurmctld. Only the slurmctld context continues on to
	 * other_job_fini().
	 */
	if (job_ptr->job_state != NO_VAL)
		return other_job_fini(job_ptr);

	return SLURM_SUCCESS;
}