Пример #1
0
extern int select_p_step_finish(struct step_record *step_ptr)
{
	select_jobinfo_t *jobinfo = step_ptr->select_jobinfo->data;

	if (slurmctld_conf.select_type_param & CR_NHC_STEP_NO) {
		debug3("NHC_No_Steps set not running NHC on steps.");
		other_step_finish(step_ptr);
		/* free resources on the job */
		post_job_step(step_ptr);
		return SLURM_SUCCESS;
	}
	/* The NHC needs to be ran after each step even if the job is
	   about to run the NHC for the allocation.  The NHC
	   developers feel this is needed.  If it ever changes just
	   remove the below commented code.
	*/

	/*  else if (IS_JOB_COMPLETING(step_ptr->job_ptr)) { */
	/* 	debug3("step completion %u.%u was received after job " */
	/* 	      "allocation is already completing, no extra NHC needed.", */
	/* 	      step_ptr->job_ptr->job_id, step_ptr->step_id); */
	/* 	other_step_finish(step_ptr); */
	/* 	/\* free resources on the job *\/ */
	/* 	post_job_step(step_ptr); */
	/* 	return SLURM_SUCCESS; */
	/* } */

	jobinfo->cleaning = 1;
	_spawn_cleanup_thread(step_ptr, _step_fini);

	return SLURM_SUCCESS;
}
Пример #2
0
static void *_step_fini(void *args)
{
	struct step_record *step_ptr = (struct step_record *)args;
	select_jobinfo_t *jobinfo = NULL;
	nhc_info_t nhc_info;

	/* Locks: Write job, write node */
	slurmctld_lock_t job_write_lock = {
		NO_LOCK, WRITE_LOCK, WRITE_LOCK, NO_LOCK
	};
	slurmctld_lock_t job_read_lock = {
		NO_LOCK, READ_LOCK, NO_LOCK, NO_LOCK };


	if (!step_ptr) {
		error("_step_fini: no step ptr given, "
		      "this should never happen");
		return NULL;
	}

	memset(&nhc_info, 0, sizeof(nhc_info_t));
	nhc_info.step = 1;
	lock_slurmctld(job_read_lock);
	nhc_info.jobid = step_ptr->job_ptr->job_id;
	nhc_info.apid = SLURM_ID_HASH(step_ptr->job_ptr->job_id,
				      step_ptr->step_id);
	nhc_info.exit_code = step_ptr->exit_code;
	nhc_info.user_id = step_ptr->job_ptr->user_id;

	if (!step_ptr->step_layout || !step_ptr->step_layout->node_list) {
		if (step_ptr->job_ptr)
			nhc_info.nodelist = xstrdup(step_ptr->job_ptr->nodes);
	} else
		nhc_info.nodelist = xstrdup(step_ptr->step_layout->node_list);
	unlock_slurmctld(job_read_lock);

	/* run NHC */
	_run_nhc(&nhc_info);
	/***********/

	xfree(nhc_info.nodelist);

	lock_slurmctld(job_write_lock);
	if (!step_ptr->job_ptr || !step_ptr->step_node_bitmap) {
		error("For some reason we don't have a step_node_bitmap or "
		      "a job_ptr for %"PRIu64".  This should never happen.",
		      nhc_info.apid);
	} else {
		other_step_finish(step_ptr);

		jobinfo = step_ptr->select_jobinfo->data;
		jobinfo->cleaning = 0;

		/* free resources on the job */
		post_job_step(step_ptr);
	}
	unlock_slurmctld(job_write_lock);

	return NULL;
}
Пример #3
0
static void *_step_fini(void *args)
{
	struct step_record *step_ptr = (struct step_record *)args;
	select_jobinfo_t *jobinfo = NULL;
	uint64_t apid = 0;
	char *node_list = NULL;

	/* Locks: Write job, write node */
	slurmctld_lock_t job_write_lock = {
		NO_LOCK, WRITE_LOCK, WRITE_LOCK, NO_LOCK
	};
	slurmctld_lock_t job_read_lock = {
		NO_LOCK, READ_LOCK, NO_LOCK, NO_LOCK };


	if (!step_ptr) {
		error("_step_fini: no step ptr given, "
		      "this should never happen");
		return NULL;
	}

	lock_slurmctld(job_read_lock);
	apid = SLURM_ID_HASH(step_ptr->job_ptr->job_id, step_ptr->step_id);

	if (!step_ptr->step_layout || !step_ptr->step_layout->node_list) {
		if (step_ptr->job_ptr)
			node_list = xstrdup(step_ptr->job_ptr->nodes);
	} else
		node_list = xstrdup(step_ptr->step_layout->node_list);
	unlock_slurmctld(job_read_lock);

	/* run NHC */
	_run_nhc(apid, node_list, 0);
	/***********/

	xfree(node_list);

	lock_slurmctld(job_write_lock);
	if (!step_ptr->job_ptr || !step_ptr->step_node_bitmap) {
		error("For some reason we don't have a step_node_bitmap or "
		      "a job_ptr for %"PRIu64".  This should never happen.",
		      apid);
	} else {
		other_step_finish(step_ptr);

		jobinfo = step_ptr->select_jobinfo->data;
		jobinfo->cleaning = 0;

		/* free resources on the job */
		post_job_step(step_ptr);
	}
	unlock_slurmctld(job_write_lock);

	return NULL;
}
Пример #4
0
extern int select_p_step_finish(struct step_record *step_ptr)
{
	select_jobinfo_t *jobinfo = step_ptr->select_jobinfo->data;

	if (IS_JOB_COMPLETING(step_ptr->job_ptr)) {
		debug3("step completion %u.%u was received after job "
		      "allocation is already completing, no extra NHC needed.",
		      step_ptr->job_ptr->job_id, step_ptr->step_id);
		other_step_finish(step_ptr);
		/* free resources on the job */
		post_job_step(step_ptr);
		return SLURM_SUCCESS;
	}

	jobinfo->cleaning = 1;
	_spawn_cleanup_thread(step_ptr, _step_fini);

	return SLURM_SUCCESS;
}