Exemplo n.º 1
0
static void *_step_fini(void *args)
{
	struct step_record *step_ptr = (struct step_record *)args;
	select_jobinfo_t *jobinfo = NULL;
	nhc_info_t nhc_info;

	/* Locks: Write job, write node */
	slurmctld_lock_t job_write_lock = {
		NO_LOCK, WRITE_LOCK, WRITE_LOCK, NO_LOCK
	};
	slurmctld_lock_t job_read_lock = {
		NO_LOCK, READ_LOCK, NO_LOCK, NO_LOCK };


	if (!step_ptr) {
		error("_step_fini: no step ptr given, "
		      "this should never happen");
		return NULL;
	}

	memset(&nhc_info, 0, sizeof(nhc_info_t));
	nhc_info.step = 1;
	lock_slurmctld(job_read_lock);
	nhc_info.jobid = step_ptr->job_ptr->job_id;
	nhc_info.apid = SLURM_ID_HASH(step_ptr->job_ptr->job_id,
				      step_ptr->step_id);
	nhc_info.exit_code = step_ptr->exit_code;
	nhc_info.user_id = step_ptr->job_ptr->user_id;

	if (!step_ptr->step_layout || !step_ptr->step_layout->node_list) {
		if (step_ptr->job_ptr)
			nhc_info.nodelist = xstrdup(step_ptr->job_ptr->nodes);
	} else
		nhc_info.nodelist = xstrdup(step_ptr->step_layout->node_list);
	unlock_slurmctld(job_read_lock);

	/* run NHC */
	_run_nhc(&nhc_info);
	/***********/

	xfree(nhc_info.nodelist);

	lock_slurmctld(job_write_lock);
	if (!step_ptr->job_ptr || !step_ptr->step_node_bitmap) {
		error("For some reason we don't have a step_node_bitmap or "
		      "a job_ptr for %"PRIu64".  This should never happen.",
		      nhc_info.apid);
	} else {
		other_step_finish(step_ptr);

		jobinfo = step_ptr->select_jobinfo->data;
		jobinfo->cleaning = 0;

		/* free resources on the job */
		post_job_step(step_ptr);
	}
	unlock_slurmctld(job_write_lock);

	return NULL;
}
Exemplo n.º 2
0
/*
 * select_p_step_finish - plugin hook called when a step finishes.
 *
 * If CR_NHC_STEP_NO is set in slurmctld_conf.select_type_param the step is
 * finished right here (other_step_finish() + post_job_step()).  Otherwise
 * the step is flagged as cleaning and passed to _spawn_cleanup_thread()
 * with _step_fini as the routine to run.
 *
 * IN step_ptr - the step that finished
 * RET SLURM_SUCCESS in every case
 */
extern int select_p_step_finish(struct step_record *step_ptr)
{
	select_jobinfo_t *jobinfo = step_ptr->select_jobinfo->data;

	if (slurmctld_conf.select_type_param & CR_NHC_STEP_NO) {
		debug3("NHC_No_Steps set not running NHC on steps.");
		other_step_finish(step_ptr);
		/* free resources on the job */
		post_job_step(step_ptr);
	} else {
		/*
		 * The NHC has to run after every step, even when the job
		 * is about to run the NHC for the whole allocation; the
		 * NHC developers consider this necessary.  Should that
		 * ever change, the disabled shortcut below is kept for
		 * reference.
		 */

		/*  else if (IS_JOB_COMPLETING(step_ptr->job_ptr)) { */
		/* 	debug3("step completion %u.%u was received after job " */
		/* 	      "allocation is already completing, no extra NHC needed.", */
		/* 	      step_ptr->job_ptr->job_id, step_ptr->step_id); */
		/* 	other_step_finish(step_ptr); */
		/* 	/\* free resources on the job *\/ */
		/* 	post_job_step(step_ptr); */
		/* 	return SLURM_SUCCESS; */
		/* } */

		jobinfo->cleaning = 1;
		_spawn_cleanup_thread(step_ptr, _step_fini);
	}

	return SLURM_SUCCESS;
}
Exemplo n.º 3
0
/*
 * select_p_step_start - plugin hook called when a step starts.
 *
 * When built with HAVE_NATIVE_CRAY and aeld_running is set, the step is
 * first reported through _update_app() with ALPSC_EV_START.  The call is
 * then passed on to the step-start hook of the underlying select plugin.
 *
 * IN step_ptr - the step being started
 * RET the return code of other_step_start()
 */
extern int select_p_step_start(struct step_record *step_ptr)
{
#ifdef HAVE_NATIVE_CRAY
	if (aeld_running) {
		_update_app(step_ptr->job_ptr, step_ptr, ALPSC_EV_START);
	}
#endif

	/*
	 * A start hook must chain to the start hook; this used to call
	 * other_step_finish(), which runs step-completion handling on a
	 * step that is only just starting.
	 */
	return other_step_start(step_ptr);
}
Exemplo n.º 4
0
/*
 * _step_fini - cleanup routine handed to _spawn_cleanup_thread() when a
 *	step finishes.
 *
 * Phase 1 (job read lock): compute the step's apid and copy its node list.
 * Phase 2 (no slurmctld locks held): run the NHC through _run_nhc().
 * Phase 3 (job + node write locks): call other_step_finish(), clear the
 *	step's "cleaning" flag and call post_job_step().
 *
 * IN args - the struct step_record * of the finished step
 * RET always NULL
 */
static void *_step_fini(void *args)
{
	struct step_record *step_ptr = (struct step_record *)args;
	select_jobinfo_t *jobinfo = NULL;
	uint64_t apid = 0;
	char *node_list = NULL;

	/* Locks: Write job, write node */
	slurmctld_lock_t job_write_lock = {
		NO_LOCK, WRITE_LOCK, WRITE_LOCK, NO_LOCK
	};
	slurmctld_lock_t job_read_lock = {
		NO_LOCK, READ_LOCK, NO_LOCK, NO_LOCK };

	if (!step_ptr) {
		error("_step_fini: no step ptr given, "
		      "this should never happen");
		return NULL;
	}

	lock_slurmctld(job_read_lock);
	/*
	 * The apid is derived from the job id, so job_ptr must be verified
	 * before it is dereferenced rather than only afterwards.
	 */
	if (!step_ptr->job_ptr) {
		unlock_slurmctld(job_read_lock);
		error("_step_fini: step %u has no job_ptr, "
		      "this should never happen", step_ptr->step_id);
		return NULL;
	}

	apid = SLURM_ID_HASH(step_ptr->job_ptr->job_id, step_ptr->step_id);

	/* Use the step's own node list if it has one, else the job's. */
	if (step_ptr->step_layout && step_ptr->step_layout->node_list) {
		node_list = xstrdup(step_ptr->step_layout->node_list);
	} else {
		node_list = xstrdup(step_ptr->job_ptr->nodes);
	}
	unlock_slurmctld(job_read_lock);

	/* run NHC */
	_run_nhc(apid, node_list, 0);
	/***********/

	xfree(node_list);

	lock_slurmctld(job_write_lock);
	/* No lock was held while the NHC ran, so validate the step again. */
	if (!step_ptr->job_ptr || !step_ptr->step_node_bitmap) {
		error("For some reason we don't have a step_node_bitmap or "
		      "a job_ptr for %"PRIu64".  This should never happen.",
		      apid);
	} else {
		other_step_finish(step_ptr);

		jobinfo = step_ptr->select_jobinfo->data;
		jobinfo->cleaning = 0;

		/* free resources on the job */
		post_job_step(step_ptr);
	}
	unlock_slurmctld(job_write_lock);

	return NULL;
}
Exemplo n.º 5
0
/*
 * select_p_step_finish - plugin hook called when a step finishes.
 *
 * If the owning job is already completing, no per-step NHC is started and
 * the step is finished right here (other_step_finish() + post_job_step()).
 * Otherwise the step is flagged as cleaning and passed to
 * _spawn_cleanup_thread() with _step_fini as the routine to run.
 *
 * IN step_ptr - the step that finished
 * RET SLURM_SUCCESS in every case
 */
extern int select_p_step_finish(struct step_record *step_ptr)
{
	select_jobinfo_t *jobinfo = step_ptr->select_jobinfo->data;

	if (IS_JOB_COMPLETING(step_ptr->job_ptr)) {
		debug3("step completion %u.%u was received after job "
		      "allocation is already completing, no extra NHC needed.",
		      step_ptr->job_ptr->job_id, step_ptr->step_id);
		other_step_finish(step_ptr);
		/* free resources on the job */
		post_job_step(step_ptr);
	} else {
		jobinfo->cleaning = 1;
		_spawn_cleanup_thread(step_ptr, _step_fini);
	}

	return SLURM_SUCCESS;
}
Exemplo n.º 6
0
/*
 * select_p_step_start - plugin hook called when a step starts.
 *
 * Nothing is done at this layer; the call is passed on to the step-start
 * hook of the underlying select plugin.
 *
 * IN step_ptr - the step being started
 * RET the return code of other_step_start()
 */
extern int select_p_step_start(struct step_record *step_ptr)
{
	/*
	 * A start hook must chain to the start hook; this used to call
	 * other_step_finish(), which runs step-completion handling on a
	 * step that is only just starting.
	 */
	return other_step_start(step_ptr);
}
Exemplo n.º 7
0
/*
 * select_p_step_finish - plugin hook called when a step finishes.
 *
 * Pure pass-through: both arguments are handed unchanged to
 * other_step_finish() and its result is returned.
 *
 * IN step_ptr - the step that finished
 * IN killing_step - forwarded as-is to other_step_finish()
 * RET the return code of other_step_finish()
 */
extern int select_p_step_finish(struct step_record *step_ptr, bool killing_step)
{
	int rc = other_step_finish(step_ptr, killing_step);

	return rc;
}