コード例 #1
0
ファイル: bg_job_run.c プロジェクト: HPCNow/slurm
/*
 * Perform any work required to terminate a jobs on a block.
 * bg_block_id IN - block name
 * RET - SLURM_SUCCESS or an error code
 *
 * NOTE: The job is killed before the function returns. This can take
 * many seconds. Do not call from slurmctld  or any other entity that
 * can not wait.
 */
int term_jobs_on_block(char *bg_block_id)
{
	int rc = SLURM_SUCCESS;
	bg_action_t *bg_action_ptr;

	bg_action_ptr = xmalloc(sizeof(bg_action_t));
	bg_action_ptr->op = TERM_OP;
	bg_action_ptr->bg_block_id = xstrdup(bg_block_id);
	_block_op(bg_action_ptr);

	return rc;
}
コード例 #2
0
ファイル: bg_job_run.c プロジェクト: HPCNow/slurm
/* Update block user and reboot as needed block_state_mutex needs to
 * be locked before coming in. */
static void _sync_agent(bg_action_t *bg_action_ptr, bg_record_t *bg_record)
{
	struct job_record *job_ptr = bg_action_ptr->job_ptr;

	debug3("Queue sync of job %u in BG block %s ending at %ld",
	       job_ptr->job_id, bg_action_ptr->bg_block_id,
	       job_ptr->end_time);

	last_bg_update = time(NULL);

	ba_sync_job_to_block(bg_record, job_ptr);

	set_select_jobinfo(job_ptr->select_jobinfo->data,
			   SELECT_JOBDATA_BLOCK_PTR,
			   bg_record);

	num_unused_cpus -= job_ptr->total_cpus;

	if (!block_ptr_exist_in_list(bg_lists->job_running, bg_record))
		list_push(bg_lists->job_running, bg_record);

	if (!block_ptr_exist_in_list(bg_lists->booted, bg_record))
		list_push(bg_lists->booted, bg_record);

	if (bg_record->state == BG_BLOCK_INITED) {
		int sync_user_rc;
		job_ptr->job_state &= (~JOB_CONFIGURING);
		last_job_update = time(NULL);
		/* Just in case reset the boot flags */
		bg_record->boot_state = 0;
		bg_record->boot_count = 0;
		sync_user_rc = bridge_block_sync_users(bg_record);

		if (sync_user_rc == SLURM_ERROR) {
			slurm_mutex_unlock(&block_state_mutex);
			(void) slurm_fail_job(job_ptr->job_id, JOB_BOOT_FAIL);
			slurm_mutex_lock(&block_state_mutex);
		}
		_destroy_bg_action(bg_action_ptr);
	} else {
		if (bg_record->state != BG_BLOCK_BOOTING) {
			error("Block %s isn't ready and isn't "
			      "being configured! Starting job again.",
			      bg_action_ptr->bg_block_id);
		} else {
			debug("Block %s is booting, job ok",
			      bg_action_ptr->bg_block_id);
		}
		/* the function _block_op calls will destroy the
		   bg_action_ptr */
		_block_op(bg_action_ptr);
	}
}
コード例 #3
0
ファイル: bg_job_run.c プロジェクト: HPCNow/slurm
/*
 * Perform any work required to terminate a job
 * job_ptr IN - pointer to the job being terminated
 * RET - SLURM_SUCCESS or an error code
 *
 * NOTE: This happens in parallel with srun and slurmd terminating
 * the job. Insure that this function, mpirun and the epilog can
 * all deal with termination race conditions.
 */
int term_job(struct job_record *job_ptr)
{
	int rc = SLURM_SUCCESS;
	bg_action_t *bg_action_ptr = NULL;

	bg_action_ptr = xmalloc(sizeof(bg_action_t));
	bg_action_ptr->op = TERM_OP;
	bg_action_ptr->job_ptr = job_ptr;
	get_select_jobinfo(job_ptr->select_jobinfo->data,
			   SELECT_JOBDATA_BLOCK_ID,
			   &(bg_action_ptr->bg_block_id));
	info("Queue termination of job %u in BG block %s",
	     job_ptr->job_id, bg_action_ptr->bg_block_id);
	_block_op(bg_action_ptr);

	return rc;
}
コード例 #4
0
ファイル: bg_job_run.c プロジェクト: HPCNow/slurm
/*
 * Perform any setup required to initiate a job
 * job_ptr IN - pointer to the job being initiated
 * RET - SLURM_SUCCESS or an error code
 *
 * NOTE: This happens in parallel with srun and slurmd spawning
 * the job. A prolog script is expected to defer initiation of
 * the job script until the BG block is available for use.
 */
extern int start_job(struct job_record *job_ptr)
{
	int rc = SLURM_SUCCESS;
	bg_record_t *bg_record = NULL;
	bg_action_t *bg_action_ptr = NULL;
	select_jobinfo_t *jobinfo = job_ptr->select_jobinfo->data;

	slurm_mutex_lock(&block_state_mutex);
	bg_record = jobinfo->bg_record;

	if (!bg_record || !block_ptr_exist_in_list(bg_lists->main, bg_record)) {
		slurm_mutex_unlock(&block_state_mutex);
		error("bg_record %s doesn't exist, requested for job (%d)",
		      jobinfo->bg_block_id, job_ptr->job_id);
		return SLURM_ERROR;
	}

	if ((jobinfo->conn_type[0] != SELECT_NAV)
	    && (jobinfo->conn_type[0] < SELECT_SMALL)) {
		int dim;
		for (dim=0; dim<SYSTEM_DIMENSIONS; dim++)
			jobinfo->conn_type[dim] = bg_record->conn_type[dim];
	}

	/* If it isn't 0 then it was setup previous (sub-block)
	*/
	if (jobinfo->geometry[SYSTEM_DIMENSIONS] == 0)
		memcpy(jobinfo->geometry, bg_record->geo,
		       sizeof(bg_record->geo));

	if (bg_record->job_list) {
		/* Mark the ba_mp cnodes as used now. */
		ba_mp_t *ba_mp = list_peek(bg_record->ba_mp_list);
		xassert(ba_mp);
		xassert(ba_mp->cnode_bitmap);
		bit_or(ba_mp->cnode_bitmap, jobinfo->units_avail);
		if (!find_job_in_bg_record(bg_record, job_ptr->job_id))
			list_append(bg_record->job_list, job_ptr);
	} else {
		bg_record->job_running = job_ptr->job_id;
		bg_record->job_ptr = job_ptr;
	}

	job_ptr->job_state |= JOB_CONFIGURING;

	bg_action_ptr = xmalloc(sizeof(bg_action_t));
	bg_action_ptr->op = START_OP;
	bg_action_ptr->job_ptr = job_ptr;

	/* FIXME: The below get_select_jobinfo calls could be avoided
	 * by just using the jobinfo as we do above.
	 */
	get_select_jobinfo(job_ptr->select_jobinfo->data,
			   SELECT_JOBDATA_BLOCK_ID,
			   &(bg_action_ptr->bg_block_id));
	get_select_jobinfo(job_ptr->select_jobinfo->data,
			   SELECT_JOBDATA_REBOOT,
			   &(bg_action_ptr->reboot));
	get_select_jobinfo(job_ptr->select_jobinfo->data,
			   SELECT_JOBDATA_CONN_TYPE,
			   &(bg_action_ptr->conn_type));
	get_select_jobinfo(job_ptr->select_jobinfo->data,
			   SELECT_JOBDATA_MLOADER_IMAGE,
			   &(bg_action_ptr->mloaderimage));
#ifdef HAVE_BG_L_P
# ifdef HAVE_BGL
	get_select_jobinfo(job_ptr->select_jobinfo->data,
			   SELECT_JOBDATA_BLRTS_IMAGE,
			   &(bg_action_ptr->blrtsimage));
	if (!bg_action_ptr->blrtsimage) {
		bg_action_ptr->blrtsimage =
			xstrdup(bg_conf->default_blrtsimage);
		set_select_jobinfo(job_ptr->select_jobinfo->data,
				   SELECT_JOBDATA_BLRTS_IMAGE,
				   bg_action_ptr->blrtsimage);
	}
# elif defined HAVE_BGP
	get_select_jobinfo(job_ptr->select_jobinfo->data,
			   SELECT_JOBDATA_CONN_TYPE,
			   &(bg_action_ptr->conn_type));
# endif
	get_select_jobinfo(job_ptr->select_jobinfo->data,
			   SELECT_JOBDATA_LINUX_IMAGE,
			   &(bg_action_ptr->linuximage));
	if (!bg_action_ptr->linuximage) {
		bg_action_ptr->linuximage =
			xstrdup(bg_conf->default_linuximage);
		set_select_jobinfo(job_ptr->select_jobinfo->data,
				   SELECT_JOBDATA_LINUX_IMAGE,
				   bg_action_ptr->linuximage);
	}

	get_select_jobinfo(job_ptr->select_jobinfo->data,
			   SELECT_JOBDATA_RAMDISK_IMAGE,
			   &(bg_action_ptr->ramdiskimage));
	if (!bg_action_ptr->ramdiskimage) {
		bg_action_ptr->ramdiskimage =
			xstrdup(bg_conf->default_ramdiskimage);
		set_select_jobinfo(job_ptr->select_jobinfo->data,
				   SELECT_JOBDATA_RAMDISK_IMAGE,
				   bg_action_ptr->ramdiskimage);
	}

#endif
	if (!bg_action_ptr->mloaderimage) {
		bg_action_ptr->mloaderimage =
			xstrdup(bg_conf->default_mloaderimage);
		set_select_jobinfo(job_ptr->select_jobinfo->data,
				   SELECT_JOBDATA_MLOADER_IMAGE,
				   bg_action_ptr->mloaderimage);
	}

	num_unused_cpus -= job_ptr->total_cpus;

	if (!block_ptr_exist_in_list(bg_lists->job_running, bg_record))
		list_push(bg_lists->job_running, bg_record);

	if (!block_ptr_exist_in_list(bg_lists->booted, bg_record))
		list_push(bg_lists->booted, bg_record);
	/* Just in case something happens to free this block before we
	   start the job we will make it so this job doesn't get blown
	   away.
	*/
	bg_record->modifying = 1;
	last_bg_update = time(NULL);

	slurm_mutex_unlock(&block_state_mutex);

	info("Queue start of job %u in BG block %s",
	     job_ptr->job_id,
	     bg_action_ptr->bg_block_id);
	_block_op(bg_action_ptr);
	return rc;
}
コード例 #5
0
ファイル: bg_job_run.c プロジェクト: Cray/slurm
/*
 * Perform any setup required to initiate a job
 * job_ptr IN - pointer to the job being initiated
 * RET - SLURM_SUCCESS or an error code
 *
 * NOTE: This happens in parallel with srun and slurmd spawning
 * the job. A prolog script is expected to defer initiation of
 * the job script until the BG block is available for use.
 */
extern int start_job(struct job_record *job_ptr)
{
	int rc = SLURM_SUCCESS;
	bg_record_t *bg_record = NULL;

	bg_action_t *bg_action_ptr = NULL;

	bg_action_ptr = xmalloc(sizeof(bg_action_t));
	bg_action_ptr->op = START_OP;
	bg_action_ptr->job_ptr = job_ptr;

	get_select_jobinfo(job_ptr->select_jobinfo->data,
			   SELECT_JOBDATA_BLOCK_ID,
			   &(bg_action_ptr->bg_block_id));
	get_select_jobinfo(job_ptr->select_jobinfo->data,
			   SELECT_JOBDATA_REBOOT,
			   &(bg_action_ptr->reboot));
	get_select_jobinfo(job_ptr->select_jobinfo->data,
			   SELECT_JOBDATA_CONN_TYPE,
			   &(bg_action_ptr->conn_type));
	get_select_jobinfo(job_ptr->select_jobinfo->data,
			   SELECT_JOBDATA_MLOADER_IMAGE,
			   &(bg_action_ptr->mloaderimage));
#ifdef HAVE_BG_L_P
# ifdef HAVE_BGL
	get_select_jobinfo(job_ptr->select_jobinfo->data,
			   SELECT_JOBDATA_BLRTS_IMAGE,
			   &(bg_action_ptr->blrtsimage));
	if (!bg_action_ptr->blrtsimage) {
		bg_action_ptr->blrtsimage =
			xstrdup(bg_conf->default_blrtsimage);
		set_select_jobinfo(job_ptr->select_jobinfo->data,
				   SELECT_JOBDATA_BLRTS_IMAGE,
				   bg_action_ptr->blrtsimage);
	}
# elif defined HAVE_BGP
	get_select_jobinfo(job_ptr->select_jobinfo->data,
			   SELECT_JOBDATA_CONN_TYPE,
			   &(bg_action_ptr->conn_type));
# endif
	get_select_jobinfo(job_ptr->select_jobinfo->data,
			   SELECT_JOBDATA_LINUX_IMAGE,
			   &(bg_action_ptr->linuximage));
	if (!bg_action_ptr->linuximage) {
		bg_action_ptr->linuximage =
			xstrdup(bg_conf->default_linuximage);
		set_select_jobinfo(job_ptr->select_jobinfo->data,
				   SELECT_JOBDATA_LINUX_IMAGE,
				   bg_action_ptr->linuximage);
	}

	get_select_jobinfo(job_ptr->select_jobinfo->data,
			   SELECT_JOBDATA_RAMDISK_IMAGE,
			   &(bg_action_ptr->ramdiskimage));
	if (!bg_action_ptr->ramdiskimage) {
		bg_action_ptr->ramdiskimage =
			xstrdup(bg_conf->default_ramdiskimage);
		set_select_jobinfo(job_ptr->select_jobinfo->data,
				   SELECT_JOBDATA_RAMDISK_IMAGE,
				   bg_action_ptr->ramdiskimage);
	}

#endif
	if (!bg_action_ptr->mloaderimage) {
		bg_action_ptr->mloaderimage =
			xstrdup(bg_conf->default_mloaderimage);
		set_select_jobinfo(job_ptr->select_jobinfo->data,
				   SELECT_JOBDATA_MLOADER_IMAGE,
				   bg_action_ptr->mloaderimage);
	}

	slurm_mutex_lock(&block_state_mutex);
	bg_record = find_bg_record_in_list(bg_lists->main,
					   bg_action_ptr->bg_block_id);
	if (!bg_record) {
		slurm_mutex_unlock(&block_state_mutex);
		error("bg_record %s doesn't exist, requested for job (%d)",
		      bg_action_ptr->bg_block_id, job_ptr->job_id);
		_destroy_bg_action(bg_action_ptr);
		return SLURM_ERROR;
	}

	last_bg_update = time(NULL);

	if (bg_record->job_list) {
		if (!find_job_in_bg_record(bg_record, job_ptr->job_id))
			list_append(bg_record->job_list, job_ptr);
	} else {
		bg_record->job_running = bg_action_ptr->job_ptr->job_id;
		bg_record->job_ptr = bg_action_ptr->job_ptr;
	}
	num_unused_cpus -= job_ptr->total_cpus;

	if (!block_ptr_exist_in_list(bg_lists->job_running, bg_record))
		list_push(bg_lists->job_running, bg_record);

	if (!block_ptr_exist_in_list(bg_lists->booted, bg_record))
		list_push(bg_lists->booted, bg_record);
	/* Just incase something happens to free this block before we
	   start the job we will make it so this job doesn't get blown
	   away.
	*/
	bg_record->modifying = 1;
	slurm_mutex_unlock(&block_state_mutex);

	info("Queue start of job %u in BG block %s",
	     job_ptr->job_id,
	     bg_action_ptr->bg_block_id);
	_block_op(bg_action_ptr);
	return rc;
}
コード例 #6
0
ファイル: bg_job_run.c プロジェクト: tpatki/slurm_test
/*
 * Synchronize BG block state to that of currently active jobs.
 * This can recover from slurmctld crashes when block usership
 * changes were queued
 */
extern int sync_jobs(List job_list)
{
	ListIterator job_iterator;
	struct job_record  *job_ptr = NULL;
	bg_action_t *bg_action_ptr = NULL;
	List block_list = NULL;
	static bool run_already = false;

	/* Execute only on initial startup. We don't support bgblock
	 * creation on demand today, so there is no need to re-sync data. */
	if (run_already)
		return SLURM_SUCCESS;
	run_already = true;

	if (!job_list) {
		error("sync_jobs: no job_list");
		return SLURM_ERROR;
	}
	/* Insure that all running jobs own the specified block */
	block_list = _get_all_allocated_blocks();
	job_iterator = list_iterator_create(job_list);
	while ((job_ptr = list_next(job_iterator))) {
		bool good_block = true;
		if (!IS_JOB_RUNNING(job_ptr))
			continue;

		bg_action_ptr = xmalloc(sizeof(bg_action_t));
		bg_action_ptr->op = SYNC_OP;
		bg_action_ptr->job_ptr = job_ptr;

		get_select_jobinfo(job_ptr->select_jobinfo->data,
				   SELECT_JOBDATA_BLOCK_ID,
				   &(bg_action_ptr->bg_block_id));
#ifdef HAVE_BG_L_P
# ifdef HAVE_BGL
		get_select_jobinfo(job_ptr->select_jobinfo->data,
				   SELECT_JOBDATA_BLRTS_IMAGE,
				   &(bg_action_ptr->blrtsimage));
# else
		get_select_jobinfo(job_ptr->select_jobinfo->data,
				   SELECT_JOBDATA_CONN_TYPE,
				   &(bg_action_ptr->conn_type));
# endif
		get_select_jobinfo(job_ptr->select_jobinfo->data,
				   SELECT_JOBDATA_LINUX_IMAGE,
				   &(bg_action_ptr->linuximage));
		get_select_jobinfo(job_ptr->select_jobinfo->data,
				   SELECT_JOBDATA_RAMDISK_IMAGE,
				   &(bg_action_ptr->ramdiskimage));
#endif
		get_select_jobinfo(job_ptr->select_jobinfo->data,
				   SELECT_JOBDATA_MLOADER_IMAGE,
				   &(bg_action_ptr->mloaderimage));

		if (bg_action_ptr->bg_block_id == NULL) {
			error("Running job %u has bgblock==NULL",
			      job_ptr->job_id);
			good_block = false;
		} else if (job_ptr->nodes == NULL) {
			error("Running job %u has nodes==NULL",
			      job_ptr->job_id);
			good_block = false;
		} else if (_excise_block(block_list,
					 bg_action_ptr->bg_block_id,
					 job_ptr->nodes)
			   != SLURM_SUCCESS) {
			error("Kill job %u belongs to defunct "
			      "bgblock %s",
			      job_ptr->job_id,
			      bg_action_ptr->bg_block_id);
			good_block = false;
		}
		if (!good_block) {
			job_ptr->job_state = JOB_FAILED
				| JOB_COMPLETING;
			job_ptr->end_time = time(NULL);
			last_job_update = time(NULL);
			_destroy_bg_action(bg_action_ptr);
			continue;
		}

		debug3("Queue sync of job %u in BG block %s "
		       "ending at %ld",
		       job_ptr->job_id,
		       bg_action_ptr->bg_block_id,
		       job_ptr->end_time);
		_block_op(bg_action_ptr);
	}
	list_iterator_destroy(job_iterator);

	/* Insure that all other blocks are free of users */
	if (block_list) {
		bridge_reset_block_list(block_list);
		list_destroy(block_list);
	} else {
		/* this should never happen,
		 * vestigial logic */
		error("sync_jobs: no block_list");
		return SLURM_ERROR;
	}
	return SLURM_SUCCESS;
}