Пример #1
0
/* Update block user and reboot as needed block_state_mutex needs to
 * be locked before coming in. */
static void _sync_agent(bg_action_t *bg_action_ptr, bg_record_t *bg_record)
{
	struct job_record *job_ptr = bg_action_ptr->job_ptr;

	debug3("Queue sync of job %u in BG block %s ending at %ld",
	       job_ptr->job_id, bg_action_ptr->bg_block_id,
	       job_ptr->end_time);

	last_bg_update = time(NULL);

	ba_sync_job_to_block(bg_record, job_ptr);

	set_select_jobinfo(job_ptr->select_jobinfo->data,
			   SELECT_JOBDATA_BLOCK_PTR,
			   bg_record);

	num_unused_cpus -= job_ptr->total_cpus;

	if (!block_ptr_exist_in_list(bg_lists->job_running, bg_record))
		list_push(bg_lists->job_running, bg_record);

	if (!block_ptr_exist_in_list(bg_lists->booted, bg_record))
		list_push(bg_lists->booted, bg_record);

	if (bg_record->state == BG_BLOCK_INITED) {
		int sync_user_rc;
		job_ptr->job_state &= (~JOB_CONFIGURING);
		last_job_update = time(NULL);
		/* Just in case reset the boot flags */
		bg_record->boot_state = 0;
		bg_record->boot_count = 0;
		sync_user_rc = bridge_block_sync_users(bg_record);

		if (sync_user_rc == SLURM_ERROR) {
			slurm_mutex_unlock(&block_state_mutex);
			(void) slurm_fail_job(job_ptr->job_id, JOB_BOOT_FAIL);
			slurm_mutex_lock(&block_state_mutex);
		}
		_destroy_bg_action(bg_action_ptr);
	} else {
		if (bg_record->state != BG_BLOCK_BOOTING) {
			error("Block %s isn't ready and isn't "
			      "being configured! Starting job again.",
			      bg_action_ptr->bg_block_id);
		} else {
			debug("Block %s is booting, job ok",
			      bg_action_ptr->bg_block_id);
		}
		/* the function _block_op calls will destroy the
		   bg_action_ptr */
		_block_op(bg_action_ptr);
	}
}
Пример #2
0
static void *_block_agent(void *args)
{
	bg_action_t *bg_action_ptr = (bg_action_t *)args;

	if (bg_action_ptr->op == START_OP)
		_start_agent(bg_action_ptr);
	else if (bg_action_ptr->op == TERM_OP)
		bridge_block_post_job(bg_action_ptr->bg_block_id,
				      bg_action_ptr->job_ptr);
	_destroy_bg_action(bg_action_ptr);

	return NULL;
}
Пример #3
0
/*
 * Synchronize BG block state to that of currently active jobs.
 * This can recover from slurmctld crashes when block usership
 * changes were queued
 */
extern int sync_jobs(List job_list)
{
	ListIterator itr;
	struct job_record  *job_ptr = NULL;
	List block_list = NULL, kill_list = NULL;
	static bool run_already = false;
	bg_record_t *bg_record = NULL;

	/* Execute only on initial startup. We don't support bgblock
	 * creation on demand today, so there is no need to re-sync data. */
	if (run_already)
		return SLURM_SUCCESS;
	run_already = true;

	if (!job_list) {
		error("sync_jobs: no job_list");
		return SLURM_ERROR;
	}
	slurm_mutex_lock(&block_state_mutex);
	/* Insure that all running jobs own the specified block */
	itr = list_iterator_create(job_list);
	while ((job_ptr = list_next(itr))) {
		bg_action_t *bg_action_ptr = NULL;
		if (!IS_JOB_RUNNING(job_ptr) && !IS_JOB_COMPLETING(job_ptr))
			continue;

		bg_action_ptr = xmalloc(sizeof(bg_action_t));
		if (IS_JOB_COMPLETING(job_ptr))
			bg_action_ptr->op = TERM_OP;
		else
			bg_action_ptr->op = START_OP;
		bg_action_ptr->job_ptr = job_ptr;

		get_select_jobinfo(job_ptr->select_jobinfo->data,
				   SELECT_JOBDATA_BLOCK_ID,
				   &(bg_action_ptr->bg_block_id));
#ifdef HAVE_BG_L_P
# ifdef HAVE_BGL
		get_select_jobinfo(job_ptr->select_jobinfo->data,
				   SELECT_JOBDATA_BLRTS_IMAGE,
				   &(bg_action_ptr->blrtsimage));
# else
		get_select_jobinfo(job_ptr->select_jobinfo->data,
				   SELECT_JOBDATA_CONN_TYPE,
				   &(bg_action_ptr->conn_type));
# endif
		get_select_jobinfo(job_ptr->select_jobinfo->data,
				   SELECT_JOBDATA_LINUX_IMAGE,
				   &(bg_action_ptr->linuximage));
		get_select_jobinfo(job_ptr->select_jobinfo->data,
				   SELECT_JOBDATA_RAMDISK_IMAGE,
				   &(bg_action_ptr->ramdiskimage));
#endif
		get_select_jobinfo(job_ptr->select_jobinfo->data,
				   SELECT_JOBDATA_MLOADER_IMAGE,
				   &(bg_action_ptr->mloaderimage));

		if (bg_action_ptr->bg_block_id == NULL) {
			error("Running job %u has bgblock==NULL",
			      job_ptr->job_id);
		} else if (job_ptr->nodes == NULL) {
			error("Running job %u has nodes==NULL",
			      job_ptr->job_id);
		} else if (!(bg_record = find_bg_record_in_list(
				     bg_lists->main,
				     bg_action_ptr->bg_block_id))) {
			error("Kill job %u belongs to defunct "
			      "bgblock %s",
			      job_ptr->job_id,
			      bg_action_ptr->bg_block_id);
		}

		if (!bg_record) {
			/* Can't fail it just now, we have locks in
			   place. */
			bg_status_add_job_kill_list(job_ptr, &kill_list);
			_destroy_bg_action(bg_action_ptr);
			continue;
		}
		/* _sync_agent will destroy the bg_action_ptr */
		_sync_agent(bg_action_ptr, bg_record);
	}
	list_iterator_destroy(itr);

	block_list = list_create(destroy_bg_record);
	itr = list_iterator_create(bg_lists->main);
	while ((bg_record = list_next(itr))) {
		bg_record_t *rm_record;
		if (bg_record->job_ptr
		    || (bg_record->job_list
			&& list_count(bg_record->job_list)))
			continue;
		rm_record = xmalloc(sizeof(bg_record_t));
		rm_record->magic = BLOCK_MAGIC;
		rm_record->bg_block_id = xstrdup(bg_record->bg_block_id);
		rm_record->mp_str = xstrdup(bg_record->mp_str);
		list_append(block_list, rm_record);
	}
	list_iterator_destroy(itr);
	slurm_mutex_unlock(&block_state_mutex);

	if (kill_list) {
		/* slurmctld is already locked up, so handle this right after
		 * the unlock of block_state_mutex.
		 */
		bg_status_process_kill_job_list(kill_list, JOB_BOOT_FAIL, 1);
		FREE_NULL_LIST(kill_list);
	}

	/* Insure that all other blocks are free of users */
	if (block_list) {
		itr = list_iterator_create(block_list);
		while ((bg_record = list_next(itr))) {
			info("Queue clearing of users of BG block %s",
			     bg_record->bg_block_id);
			term_jobs_on_block(bg_record->bg_block_id);
		}
		list_iterator_destroy(itr);
		FREE_NULL_LIST(block_list);
	} else {
		/* this should never happen,
		 * vestigial logic */
		error("sync_jobs: no block_list");
		return SLURM_ERROR;
	}
	return SLURM_SUCCESS;
}
Пример #4
0
/*
 * Perform any setup required to initiate a job
 * job_ptr IN - pointer to the job being initiated
 * RET - SLURM_SUCCESS or an error code
 *
 * NOTE: This happens in parallel with srun and slurmd spawning
 * the job. A prolog script is expected to defer initiation of
 * the job script until the BG block is available for use.
 */
extern int start_job(struct job_record *job_ptr)
{
	int rc = SLURM_SUCCESS;
	bg_record_t *bg_record = NULL;

	bg_action_t *bg_action_ptr = NULL;

	bg_action_ptr = xmalloc(sizeof(bg_action_t));
	bg_action_ptr->op = START_OP;
	bg_action_ptr->job_ptr = job_ptr;

	get_select_jobinfo(job_ptr->select_jobinfo->data,
			   SELECT_JOBDATA_BLOCK_ID,
			   &(bg_action_ptr->bg_block_id));
	get_select_jobinfo(job_ptr->select_jobinfo->data,
			   SELECT_JOBDATA_REBOOT,
			   &(bg_action_ptr->reboot));
	get_select_jobinfo(job_ptr->select_jobinfo->data,
			   SELECT_JOBDATA_CONN_TYPE,
			   &(bg_action_ptr->conn_type));
	get_select_jobinfo(job_ptr->select_jobinfo->data,
			   SELECT_JOBDATA_MLOADER_IMAGE,
			   &(bg_action_ptr->mloaderimage));
#ifdef HAVE_BG_L_P
# ifdef HAVE_BGL
	get_select_jobinfo(job_ptr->select_jobinfo->data,
			   SELECT_JOBDATA_BLRTS_IMAGE,
			   &(bg_action_ptr->blrtsimage));
	if (!bg_action_ptr->blrtsimage) {
		bg_action_ptr->blrtsimage =
			xstrdup(bg_conf->default_blrtsimage);
		set_select_jobinfo(job_ptr->select_jobinfo->data,
				   SELECT_JOBDATA_BLRTS_IMAGE,
				   bg_action_ptr->blrtsimage);
	}
# elif defined HAVE_BGP
	get_select_jobinfo(job_ptr->select_jobinfo->data,
			   SELECT_JOBDATA_CONN_TYPE,
			   &(bg_action_ptr->conn_type));
# endif
	get_select_jobinfo(job_ptr->select_jobinfo->data,
			   SELECT_JOBDATA_LINUX_IMAGE,
			   &(bg_action_ptr->linuximage));
	if (!bg_action_ptr->linuximage) {
		bg_action_ptr->linuximage =
			xstrdup(bg_conf->default_linuximage);
		set_select_jobinfo(job_ptr->select_jobinfo->data,
				   SELECT_JOBDATA_LINUX_IMAGE,
				   bg_action_ptr->linuximage);
	}

	get_select_jobinfo(job_ptr->select_jobinfo->data,
			   SELECT_JOBDATA_RAMDISK_IMAGE,
			   &(bg_action_ptr->ramdiskimage));
	if (!bg_action_ptr->ramdiskimage) {
		bg_action_ptr->ramdiskimage =
			xstrdup(bg_conf->default_ramdiskimage);
		set_select_jobinfo(job_ptr->select_jobinfo->data,
				   SELECT_JOBDATA_RAMDISK_IMAGE,
				   bg_action_ptr->ramdiskimage);
	}

#endif
	if (!bg_action_ptr->mloaderimage) {
		bg_action_ptr->mloaderimage =
			xstrdup(bg_conf->default_mloaderimage);
		set_select_jobinfo(job_ptr->select_jobinfo->data,
				   SELECT_JOBDATA_MLOADER_IMAGE,
				   bg_action_ptr->mloaderimage);
	}

	slurm_mutex_lock(&block_state_mutex);
	bg_record = find_bg_record_in_list(bg_lists->main,
					   bg_action_ptr->bg_block_id);
	if (!bg_record) {
		slurm_mutex_unlock(&block_state_mutex);
		error("bg_record %s doesn't exist, requested for job (%d)",
		      bg_action_ptr->bg_block_id, job_ptr->job_id);
		_destroy_bg_action(bg_action_ptr);
		return SLURM_ERROR;
	}

	last_bg_update = time(NULL);

	if (bg_record->job_list) {
		if (!find_job_in_bg_record(bg_record, job_ptr->job_id))
			list_append(bg_record->job_list, job_ptr);
	} else {
		bg_record->job_running = bg_action_ptr->job_ptr->job_id;
		bg_record->job_ptr = bg_action_ptr->job_ptr;
	}
	num_unused_cpus -= job_ptr->total_cpus;

	if (!block_ptr_exist_in_list(bg_lists->job_running, bg_record))
		list_push(bg_lists->job_running, bg_record);

	if (!block_ptr_exist_in_list(bg_lists->booted, bg_record))
		list_push(bg_lists->booted, bg_record);
	/* Just incase something happens to free this block before we
	   start the job we will make it so this job doesn't get blown
	   away.
	*/
	bg_record->modifying = 1;
	slurm_mutex_unlock(&block_state_mutex);

	info("Queue start of job %u in BG block %s",
	     job_ptr->job_id,
	     bg_action_ptr->bg_block_id);
	_block_op(bg_action_ptr);
	return rc;
}
Пример #5
0
/*
 * Synchronize BG block state to that of currently active jobs.
 * This can recover from slurmctld crashes when block usership
 * changes were queued
 */
extern int sync_jobs(List job_list)
{
	ListIterator job_iterator;
	struct job_record  *job_ptr = NULL;
	bg_action_t *bg_action_ptr = NULL;
	List block_list = NULL;
	static bool run_already = false;

	/* Execute only on initial startup. We don't support bgblock
	 * creation on demand today, so there is no need to re-sync data. */
	if (run_already)
		return SLURM_SUCCESS;
	run_already = true;

	if (!job_list) {
		error("sync_jobs: no job_list");
		return SLURM_ERROR;
	}
	/* Insure that all running jobs own the specified block */
	block_list = _get_all_allocated_blocks();
	job_iterator = list_iterator_create(job_list);
	while ((job_ptr = list_next(job_iterator))) {
		bool good_block = true;
		if (!IS_JOB_RUNNING(job_ptr))
			continue;

		bg_action_ptr = xmalloc(sizeof(bg_action_t));
		bg_action_ptr->op = SYNC_OP;
		bg_action_ptr->job_ptr = job_ptr;

		get_select_jobinfo(job_ptr->select_jobinfo->data,
				   SELECT_JOBDATA_BLOCK_ID,
				   &(bg_action_ptr->bg_block_id));
#ifdef HAVE_BG_L_P
# ifdef HAVE_BGL
		get_select_jobinfo(job_ptr->select_jobinfo->data,
				   SELECT_JOBDATA_BLRTS_IMAGE,
				   &(bg_action_ptr->blrtsimage));
# else
		get_select_jobinfo(job_ptr->select_jobinfo->data,
				   SELECT_JOBDATA_CONN_TYPE,
				   &(bg_action_ptr->conn_type));
# endif
		get_select_jobinfo(job_ptr->select_jobinfo->data,
				   SELECT_JOBDATA_LINUX_IMAGE,
				   &(bg_action_ptr->linuximage));
		get_select_jobinfo(job_ptr->select_jobinfo->data,
				   SELECT_JOBDATA_RAMDISK_IMAGE,
				   &(bg_action_ptr->ramdiskimage));
#endif
		get_select_jobinfo(job_ptr->select_jobinfo->data,
				   SELECT_JOBDATA_MLOADER_IMAGE,
				   &(bg_action_ptr->mloaderimage));

		if (bg_action_ptr->bg_block_id == NULL) {
			error("Running job %u has bgblock==NULL",
			      job_ptr->job_id);
			good_block = false;
		} else if (job_ptr->nodes == NULL) {
			error("Running job %u has nodes==NULL",
			      job_ptr->job_id);
			good_block = false;
		} else if (_excise_block(block_list,
					 bg_action_ptr->bg_block_id,
					 job_ptr->nodes)
			   != SLURM_SUCCESS) {
			error("Kill job %u belongs to defunct "
			      "bgblock %s",
			      job_ptr->job_id,
			      bg_action_ptr->bg_block_id);
			good_block = false;
		}
		if (!good_block) {
			job_ptr->job_state = JOB_FAILED
				| JOB_COMPLETING;
			job_ptr->end_time = time(NULL);
			last_job_update = time(NULL);
			_destroy_bg_action(bg_action_ptr);
			continue;
		}

		debug3("Queue sync of job %u in BG block %s "
		       "ending at %ld",
		       job_ptr->job_id,
		       bg_action_ptr->bg_block_id,
		       job_ptr->end_time);
		_block_op(bg_action_ptr);
	}
	list_iterator_destroy(job_iterator);

	/* Insure that all other blocks are free of users */
	if (block_list) {
		bridge_reset_block_list(block_list);
		list_destroy(block_list);
	} else {
		/* this should never happen,
		 * vestigial logic */
		error("sync_jobs: no block_list");
		return SLURM_ERROR;
	}
	return SLURM_SUCCESS;
}