Exemplo n.º 1
0
Arquivo: gang.c Projeto: corburn/slurm
/*
 * gs_job_fini - notify the gang scheduler that a job has completed.
 *
 * The gang partition is looked up by job_ptr->part_ptr->name when that is
 * set, otherwise by job_ptr->partition.  If a matching gang partition is
 * found, the job is removed from it and all active rows are updated; if
 * none is found, no gang state is changed.
 *
 * IN job_ptr - the job that has finished
 * RET SLURM_SUCCESS in every case
 */
extern int gs_job_fini(struct job_record *job_ptr)
{
	struct gs_part *gang_part;
	char *lookup_name;

	if (slurmctld_conf.debug_flags & DEBUG_FLAG_GANG)
		info("gang: entering gs_job_fini for job %u", job_ptr->job_id);

	/* prefer the name held by the partition record, if there is one */
	lookup_name = (job_ptr->part_ptr && job_ptr->part_ptr->name) ?
		      job_ptr->part_ptr->name : job_ptr->partition;

	pthread_mutex_lock(&data_mutex);
	gang_part = list_find_first(gs_part_list, _find_gs_part, lookup_name);
	if (gang_part) {
		/* drop the job from its partition */
		_remove_job_from_part(job_ptr->job_id, gang_part, true);
		/* the finished job may have been preempting other jobs,
		 * so re-evaluate every active row */
		_update_all_active_rows();
	}
	pthread_mutex_unlock(&data_mutex);

	if (slurmctld_conf.debug_flags & DEBUG_FLAG_GANG)
		info("gang: leaving gs_job_fini");

	return SLURM_SUCCESS;
}
Exemplo n.º 2
0
/*
 * gs_job_start - notify the gang scheduler that a job has been started.
 *
 * The job is added to the gang partition named by job_ptr->partition.
 * When the add reports GS_RESUME, all active rows are updated so that
 * preemption is checked.  If no such gang partition exists an error is
 * logged and the job is simply not tracked.
 *
 * IN job_ptr - the job that has started
 * RET SLURM_SUCCESS in every case
 */
extern int gs_job_start(struct job_record *job_ptr)
{
	struct gs_part *gang_part;
	uint16_t add_result;

	if (gs_debug_flags & DEBUG_FLAG_GANG)
		info("gang: entering gs_job_start for job %u", job_ptr->job_id);

	pthread_mutex_lock(&data_mutex);
	gang_part = list_find_first(gs_part_list, _find_gs_part,
				    job_ptr->partition);
	if (gang_part) {
		/* register the job with its partition */
		add_result = _add_job_to_part(gang_part, job_ptr);
		/* a job that is running may need to preempt others */
		if (add_result == GS_RESUME)
			_update_all_active_rows();
	}
	pthread_mutex_unlock(&data_mutex);

	/* Without a matching partition there is nothing to schedule
	 * against, so the job is left to run uninterrupted. */
	if (!gang_part) {
		error("gang: could not find partition %s for job %u",
		      job_ptr->partition, job_ptr->job_id);
	}

	/* must be called without holding data_mutex */
	_preempt_job_dequeue();

	if (gs_debug_flags & DEBUG_FLAG_GANG)
		info("gang: leaving gs_job_start");

	return SLURM_SUCCESS;
}
Exemplo n.º 3
0
/*
 * gs_job_start - notify the gang scheduler that a job has been resumed
 * or started; either way the job is added to gang scheduling.
 *
 * Does nothing unless PREEMPT_MODE_GANG is set in
 * slurmctld_conf.preempt_mode.  The gang partition is looked up by
 * job_ptr->part_ptr->name when that is set, otherwise by
 * job_ptr->partition.  When adding the job reports GS_RESUME, all active
 * rows are updated so that preemption is checked.  If no matching gang
 * partition exists an error is logged and the job is not tracked.
 *
 * IN job_ptr - the job that has started or resumed
 */
extern void gs_job_start(struct job_record *job_ptr)
{
	struct gs_part *gang_part;
	uint16_t add_result;
	char *lookup_name;

	/* gang scheduling is not enabled */
	if (!(slurmctld_conf.preempt_mode & PREEMPT_MODE_GANG))
		return;

	if (slurmctld_conf.debug_flags & DEBUG_FLAG_GANG)
		info("gang: entering gs_job_start for job %u", job_ptr->job_id);

	/* prefer the name held by the partition record, if there is one */
	lookup_name = (job_ptr->part_ptr && job_ptr->part_ptr->name) ?
		      job_ptr->part_ptr->name : job_ptr->partition;

	slurm_mutex_lock(&data_mutex);
	gang_part = list_find_first(gs_part_list, _find_gs_part, lookup_name);
	if (gang_part) {
		/* register the job with its partition */
		add_result = _add_job_to_part(gang_part, job_ptr);
		/* a job that is running may need to preempt others */
		if (add_result == GS_RESUME)
			_update_all_active_rows();
	}
	slurm_mutex_unlock(&data_mutex);

	/* Without a matching partition there is nothing to schedule
	 * against, so the job is left to run uninterrupted. */
	if (!gang_part) {
		error("gang: could not find partition %s for job %u",
		      lookup_name, job_ptr->job_id);
	}

	/* must be called without holding data_mutex */
	_preempt_job_dequeue();

	if (slurmctld_conf.debug_flags & DEBUG_FLAG_GANG)
		info("gang: leaving gs_job_start");
}
Exemplo n.º 4
0
Arquivo: gang.c Projeto: corburn/slurm
/*
 * _scan_slurm_job_list - make sure every job in job_list is accounted
 * for by the gang scheduler.
 *
 * For each job in job_list:
 *  - pending jobs, and suspended jobs with priority 0 (not suspended by
 *    gang), are skipped;
 *  - jobs whose partition has no entry in gs_part_list are skipped;
 *  - suspended or running jobs that are not yet tracked are added to
 *    their partition, after first being resumed if suspended;
 *  - jobs in any other state (completing or completed) are removed
 *    from their partition.
 * Once the scan is finished, all active rows are updated.
 *
 * NOTE: the caller must already hold the lock protecting the gs data.
 */
static void _scan_slurm_job_list(void)
{
	struct job_record *job_ptr;
	struct gs_part *gang_part;
	ListIterator iter;
	char *lookup_name;
	int slot;

	if (!job_list) {	/* no jobs */
		if (slurmctld_conf.debug_flags & DEBUG_FLAG_GANG)
			info("gang: _scan_slurm_job_list: job_list NULL");
		return;
	}
	if (slurmctld_conf.debug_flags & DEBUG_FLAG_GANG)
		info("gang: _scan_slurm_job_list: job_list exists...");

	iter = list_iterator_create(job_list);
	for (job_ptr = (struct job_record *) list_next(iter);
	     job_ptr;
	     job_ptr = (struct job_record *) list_next(iter)) {
		if (slurmctld_conf.debug_flags & DEBUG_FLAG_GANG) {
			info("gang: _scan_slurm_job_list: checking job %u",
			    job_ptr->job_id);
		}

		if (IS_JOB_PENDING(job_ptr))
			continue;
		/* priority 0 on a suspended job: not suspended by us */
		if (IS_JOB_SUSPENDED(job_ptr) && (job_ptr->priority == 0))
			continue;

		/* prefer the name held by the partition record */
		lookup_name = (job_ptr->part_ptr && job_ptr->part_ptr->name) ?
			      job_ptr->part_ptr->name : job_ptr->partition;

		/* every remaining case needs the gang partition */
		gang_part = list_find_first(gs_part_list, _find_gs_part,
					    lookup_name);
		if (!gang_part)	/* no partition */
			continue;

		if (!IS_JOB_SUSPENDED(job_ptr) && !IS_JOB_RUNNING(job_ptr)) {
			/* Neither pending, suspended nor running, so the
			 * job is completing or completed: make sure it
			 * has been released. */
			_remove_job_from_part(job_ptr->job_id, gang_part,
					      false);
			continue;
		}

		/* suspended or running: is it being tracked already? */
		slot = _find_job_index(gang_part, job_ptr->job_id);
		if (slot >= 0)
			continue;

		/* Untracked job.  If it is suspended, the likely cause
		 * (per the original author's note) is a failover: gang
		 * suspended it earlier, and the previous job ordering
		 * cannot be recovered because gang state is not
		 * preserved.  Resume it, then start tracking it. */
		if (IS_JOB_SUSPENDED(job_ptr) && job_ptr->priority)
			_resume_job(job_ptr->job_id);

		_add_job_to_part(gang_part, job_ptr);
	}
	list_iterator_destroy(iter);

	/* stale jobs are flushed out by now, so refresh the active row
	 * of all partitions */
	_update_all_active_rows();
}