Beispiel #1
0
/*
 * select_p_job_signal - deliver a signal to a job via BASIL, then pass it
 *	on to the underlying select plugin.
 * IN job_ptr - job to signal (must not be NULL, checked via xassert)
 * IN signal  - signal number to deliver
 * RET SLURM_ERROR if do_basil_signal() fails, otherwise the return value
 *	of other_job_signal()
 */
extern int select_p_job_signal(struct job_record *job_ptr, int signal)
{
	int non_fatal;

	xassert(job_ptr);

	/*
	 * Signals in this set are not expected to end the job, so the ALPS
	 * reservation is left alone for them.
	 */
	non_fatal = (signal == SIGCONT)  || (signal == SIGSTOP) ||
		    (signal == SIGTSTP)  || (signal == SIGTTIN) ||
		    (signal == SIGTTOU)  || (signal == SIGURG)  ||
		    (signal == SIGCHLD)  || (signal == SIGWINCH);

	/*
	 * For any other non-realtime signal the job is likely to terminate,
	 * so drop the ALPS reservation up front. Without this, a job script
	 * holding several aprun lines races with the apkill of the current
	 * aprun: the script carries on and launches the next aprun line, and
	 * so on until the script ends. That causes long delays and can
	 * disturb the cleanup after the job. Once the reservation is gone no
	 * further aprun line can start.
	 */
	if (!non_fatal && (signal < SIGRTMIN)) {
		do_basil_release(job_ptr);
	}

	if (do_basil_signal(job_ptr, signal) == SLURM_SUCCESS) {
		return other_job_signal(job_ptr, signal);
	}
	return SLURM_ERROR;
}
Beispiel #2
0
/*
 * select_p_job_signal - deliver a signal to a job via BASIL, then pass it
 *	on to the underlying select plugin.
 * IN job_ptr - job to signal (must not be NULL, checked via xassert)
 * IN signal  - signal number to deliver
 * RET SLURM_ERROR if a do_basil_signal() call fails, otherwise the return
 *	value of other_job_signal()
 *
 * All BASIL interaction is skipped unless slurmctld_primary is set.
 */
extern int select_p_job_signal(struct job_record *job_ptr, int signal)
{
	uint16_t kill_wait;

	xassert(job_ptr);

	if (slurmctld_primary) {
		/*
		 * Signals in this set are not expected to end the job, so
		 * the ALPS reservation is left alone for them.
		 */
		int non_fatal =
			(signal == SIGCHLD) || (signal == SIGCONT) ||
			(signal == SIGSTOP) || (signal == SIGTSTP) ||
			(signal == SIGTTIN) || (signal == SIGTTOU) ||
			(signal == SIGURG)  || (signal == SIGWINCH);

		if (!non_fatal) {
			/*
			 * With no_apid_signal_on_kill configured, a batch
			 * job receiving SIGTERM/SIGKILL bypasses BASIL
			 * entirely and goes straight to the other plugin.
			 */
			if (((signal == SIGTERM) || (signal == SIGKILL)) &&
			    cray_conf->no_apid_signal_on_kill &&
			    job_ptr->batch_flag) {
				return other_job_signal(job_ptr, signal);
			}

			/*
			 * The job is likely to terminate: drop the ALPS
			 * reservation up front. Without this, a job script
			 * holding several aprun lines races with the apkill
			 * of the current aprun: the script carries on and
			 * launches the next aprun line, and so on until the
			 * script ends. That causes long delays and can
			 * disturb the cleanup after the job. Once the
			 * reservation is gone no further aprun line can
			 * start.
			 */
			if (signal < SIGRTMIN) {
				do_basil_release(job_ptr);
			}
		}
	}

	/* Nothing to signal through BASIL in these cases. */
	if (!slurmctld_primary || _zero_size_job(job_ptr)) {
		return other_job_signal(job_ptr, signal);
	}

	if (signal != SIGKILL) {
		if (do_basil_signal(job_ptr, signal) != SLURM_SUCCESS) {
			return SLURM_ERROR;
		}
		return other_job_signal(job_ptr, signal);
	}

	/*
	 * SIGKILL is not sent immediately: SIGCONT and SIGTERM go out first
	 * and the SIGKILL itself is queued together with the configured
	 * kill-wait value.
	 */
	kill_wait = slurm_get_kill_wait();
	if (do_basil_signal(job_ptr, SIGCONT) != SLURM_SUCCESS) {
		return SLURM_ERROR;
	}
	if (do_basil_signal(job_ptr, SIGTERM) != SLURM_SUCCESS) {
		return SLURM_ERROR;
	}
	queue_basil_signal(job_ptr, SIGKILL, kill_wait);

	return other_job_signal(job_ptr, signal);
}
Beispiel #3
0
/*
 * select_p_job_fini - release a job's BASIL reservation and, when running
 *	in slurmctld context, finish the job in the underlying select plugin.
 * IN job_ptr - job being finished; NULL is accepted and treated as success
 * RET SLURM_SUCCESS, SLURM_ERROR if do_basil_release() fails, or the
 *	return value of other_job_fini()
 */
extern int select_p_job_fini(struct job_record *job_ptr)
{
	int rc = SLURM_SUCCESS;

	if (job_ptr != NULL) {
		if (do_basil_release(job_ptr) != SLURM_SUCCESS) {
			rc = SLURM_ERROR;
		} else if (job_ptr->job_state != (uint16_t)NO_VAL) {
			/*
			 * As with select_p_job_ready, this function can also
			 * be invoked from stepdmgr. By convention that
			 * context is marked with job_state == NO_VAL, in
			 * which case the other plugin is not called; only
			 * the slurmctld context reaches this branch.
			 */
			rc = other_job_fini(job_ptr);
		}
	}

	return rc;
}
Beispiel #4
0
/*
 * select_p_job_fini - release a job's BASIL reservation where appropriate
 *	and, when running in slurmctld context, finish the job in the
 *	underlying select plugin.
 * IN job_ptr - job being finished; NULL is accepted and treated as success
 * RET SLURM_SUCCESS, SLURM_ERROR if do_basil_release() fails, or the
 *	return value of other_job_fini()
 */
extern int select_p_job_fini(struct job_record *job_ptr)
{
	int release_here;

	if (!job_ptr) {
		return SLURM_SUCCESS;
	}

	/*
	 * The controller does not release batch jobs itself; that is handled
	 * on the stepd end. The release therefore runs here only for
	 * non-batch jobs on the primary controller, or when job_state is
	 * NO_VAL (see the convention described below).
	 */
	release_here = (slurmctld_primary && !job_ptr->batch_flag) ||
		       (job_ptr->job_state == NO_VAL);

	if (release_here && !_zero_size_job(job_ptr)) {
		if (do_basil_release(job_ptr) != SLURM_SUCCESS) {
			return SLURM_ERROR;
		}
	}

	/*
	 * As with select_p_job_ready, this function can also be invoked from
	 * stepdmgr. By convention that context is marked with
	 * job_state == NO_VAL, which tells it apart from slurmctld; the
	 * other plugin is only called in the slurmctld case.
	 */
	if (job_ptr->job_state != NO_VAL) {
		return other_job_fini(job_ptr);
	}
	return SLURM_SUCCESS;
}