Пример #1
0
/*
 * bg_requeue_job - requeue a job, failing it when the requeue is refused.
 *
 * job_id           IN - id of the job to requeue
 * wait_for_start   IN - when true, sleep two seconds before doing anything
 * slurmctld_locked IN - when true, lock_slurmctld()/unlock_slurmctld() are
 *                       skipped (presumably the caller already holds the
 *                       lock - confirm against callers)
 * job_state        IN - handed to job_fail() if the requeue fails
 * preempted        IN - forwarded to job_requeue()
 *
 * A job that is already pending (ESLURM_JOB_PENDING) is only reported; any
 * other requeue error is reported and the job is failed.
 *
 * NOTE: block_state_mutex must be unlocked before calling this.
 */
extern void bg_requeue_job(uint32_t job_id, bool wait_for_start,
			   bool slurmctld_locked, uint32_t job_state,
			   bool preempted)
{
	slurmctld_lock_t job_write_lock = {
		NO_LOCK, WRITE_LOCK, WRITE_LOCK, NO_LOCK, NO_LOCK };
	const bool take_lock = !slurmctld_locked;
	int requeue_rc;

	/* Give the slurmd time to begin the batch script: slurm_fail_job()
	 * is a no-op when issued before the script has been initiated, so
	 * waiting makes sure the clean up below is not skipped. */
	if (wait_for_start)
		sleep(2);

	if (take_lock)
		lock_slurmctld(job_write_lock);

	requeue_rc = job_requeue(0, job_id, NULL, preempted, 0);
	if (requeue_rc == ESLURM_JOB_PENDING) {
		/* Report only; the job is not failed in this case. */
		error("%s: Could not requeue pending job %u", __func__, job_id);
	} else if (requeue_rc != SLURM_SUCCESS) {
		error("%s: Could not requeue job %u, failing it: %s",
		      __func__, job_id, slurm_strerror(requeue_rc));
		job_fail(job_id, job_state);
	}

	if (take_lock)
		unlock_slurmctld(job_write_lock);
}
Пример #2
0
/*
 * job_requeue_wiki - handle a wiki REQUEUEJOB command.
 *
 * cmd_ptr  IN  - command text; must contain "ARG=<jobid>" where <jobid> is
 *                a decimal number followed by whitespace or end of string
 * err_code OUT - set to -300 when ARG is missing or malformed and to -700
 *                when job_requeue() refuses the request; left untouched on
 *                success
 * err_msg  OUT - description of the failure or, on success, a confirmation
 *                message stored in a static buffer (overwritten by the
 *                next successful call)
 *
 * RET 0 on success, -1 on failure
 */
extern int	job_requeue_wiki(char *cmd_ptr, int *err_code, char **err_msg)
{
	char *arg_ptr, *num_ptr, *tmp_char;
	unsigned long parsed_id;
	uint32_t jobid;
	struct job_record *job_ptr;
	static char reply_msg[128];
	int slurm_rc;
	/* Write lock on job and node info */
	slurmctld_lock_t job_write_lock = {
		NO_LOCK, WRITE_LOCK, WRITE_LOCK, NO_LOCK, NO_LOCK };

	arg_ptr = strstr(cmd_ptr, "ARG=");
	if (arg_ptr == NULL) {
		*err_code = -300;
		*err_msg = "REQUEUEJOB lacks ARG";
		error("wiki: REQUEUEJOB lacks ARG");
		return -1;
	}

	num_ptr = arg_ptr + 4;
	parsed_id = strtoul(num_ptr, &tmp_char, 10);
	jobid = (uint32_t) parsed_id;
	/* Reject the value when no digits were consumed (strtoul() would
	 * otherwise hand back 0), when it is followed by junk, or when it
	 * does not fit in a 32-bit job id. The cast to unsigned char keeps
	 * isspace() well defined for bytes with the high bit set. */
	if ((tmp_char == num_ptr) ||
	    ((tmp_char[0] != '\0') &&
	     (!isspace((unsigned char) tmp_char[0]))) ||
	    ((unsigned long) jobid != parsed_id)) {
		*err_code = -300;
		*err_msg = "Invalid ARG value";
		error("wiki: REQUEUEJOB has invalid jobid");
		return -1;
	}

	lock_slurmctld(job_write_lock);
	slurm_rc = job_requeue(0, jobid, NULL, false, 0);
	if (slurm_rc != SLURM_SUCCESS) {
		unlock_slurmctld(job_write_lock);
		*err_code = -700;
		*err_msg = slurm_strerror(slurm_rc);
		/* Log the code job_requeue() returned; errno ("%m") is not
		 * where the failure reason is reported. */
		error("wiki: Failed to requeue job %u (%s)", jobid, *err_msg);
		return -1;
	}

	/* We need to clear the required node list here.
	 * If the job was submitted with srun and a
	 * required node list, it gets lost here. */
	job_ptr = find_job_record(jobid);
	if (job_ptr && job_ptr->details) {
		xfree(job_ptr->details->req_nodes);
		FREE_NULL_BITMAP(job_ptr->details->req_node_bitmap);
	}
	info("wiki: requeued job %u", jobid);
	unlock_slurmctld(job_write_lock);
	snprintf(reply_msg, sizeof(reply_msg),
		"job %u requeued successfully", jobid);
	*err_msg = reply_msg;
	return 0;
}
Пример #3
0
/*
 * bg_requeue_job - requeue a job under the slurmctld job write lock and
 * fail it when the requeue is refused.
 *
 * job_id         IN - id of the job to requeue
 * wait_for_start IN - when true, sleep two seconds before doing anything
 *
 * NOTE: block_state_mutex must be unlocked before calling this.
 */
extern void bg_requeue_job(uint32_t job_id, bool wait_for_start)
{
	slurmctld_lock_t job_write_lock = {
		NO_LOCK, WRITE_LOCK, WRITE_LOCK, NO_LOCK };
	int requeue_rc;

	/* Give the slurmd time to begin the batch script: slurm_fail_job()
	 * is a no-op when issued before the script has been initiated, so
	 * waiting makes sure the clean up below is not skipped. */
	if (wait_for_start)
		sleep(2);

	lock_slurmctld(job_write_lock);

	requeue_rc = job_requeue(0, job_id, -1, (uint16_t)NO_VAL, false);
	if (requeue_rc) {
		/* Any non-zero result means the job could not be requeued. */
		error("Couldn't requeue job %u, failing it: %s",
		      job_id, slurm_strerror(requeue_rc));
		job_fail(job_id);
	}

	unlock_slurmctld(job_write_lock);
}
Пример #4
0
static void _preempt_job_dequeue(void)
{
	struct job_record *job_ptr;
	uint32_t job_id, *tmp_id;
	uint16_t preempt_mode;

	xassert(preempt_job_list);
	while ((tmp_id = list_pop(preempt_job_list))) {
		int rc = SLURM_ERROR;
		job_id = *tmp_id;
		xfree(tmp_id);

		if ((job_ptr = find_job_record(job_id)) == NULL) {
			error("_preempt_job_dequeue could not find job %u",
			      job_id);
			continue;
		}
		preempt_mode = slurm_job_preempt_mode(job_ptr);

		if (preempt_mode == PREEMPT_MODE_SUSPEND) {
			if ((rc = _suspend_job(job_id)) == ESLURM_DISABLED)
				rc = SLURM_SUCCESS;
		} else if (preempt_mode == PREEMPT_MODE_CANCEL) {
			rc = job_signal(job_ptr->job_id, SIGKILL, 0, 0, true);
			if (rc == SLURM_SUCCESS) {
				info("preempted job %u has been killed",
				     job_ptr->job_id);
			}
		} else if (preempt_mode == PREEMPT_MODE_CHECKPOINT) {
			checkpoint_msg_t ckpt_msg;
			memset(&ckpt_msg, 0, sizeof(checkpoint_msg_t));
			ckpt_msg.op	   = CHECK_REQUEUE;
			ckpt_msg.job_id    = job_ptr->job_id;
			rc = job_checkpoint(&ckpt_msg, 0, -1,
					    (uint16_t)NO_VAL);
			if (rc == ESLURM_NOT_SUPPORTED) {
				memset(&ckpt_msg, 0, sizeof(checkpoint_msg_t));
				ckpt_msg.op	   = CHECK_VACATE;
				ckpt_msg.job_id    = job_ptr->job_id;
				rc = job_checkpoint(&ckpt_msg, 0, -1,
						    (uint16_t)NO_VAL);
			}
			if (rc == SLURM_SUCCESS) {
				info("preempted job %u has been checkpointed",
				     job_ptr->job_id);
			} else
				error("preempted job %u could not be "
				      "checkpointed: %s",
				      job_ptr->job_id, slurm_strerror(rc));
		} else if ((preempt_mode == PREEMPT_MODE_REQUEUE) &&
			   job_ptr->batch_flag && job_ptr->details &&
			   (job_ptr->details->requeue > 0)) {
			rc = job_requeue(0, job_ptr->job_id, -1,
					 (uint16_t)NO_VAL, true, 0);
			if (rc == SLURM_SUCCESS) {
				info("preempted job %u has been requeued",
				     job_ptr->job_id);
			} else
				error("preempted job %u could not be "
				      "requeued: %s",
				      job_ptr->job_id, slurm_strerror(rc));
		}

		if (rc != SLURM_SUCCESS) {
			rc = job_signal(job_ptr->job_id, SIGKILL, 0, 0, true);
			if (rc == SLURM_SUCCESS)
				info("preempted job %u had to be killed",
				     job_ptr->job_id);
			else {
				info("preempted job %u kill failure %s",
				     job_ptr->job_id, slurm_strerror(rc));
			}
		}
	}

	return;
}
Пример #5
0
/*
 * Walk block_list looking for blocks that overlap bg_record and that make
 * bg_record unusable right now (booted, running a job or in an error state).
 *
 * block_list      IN     - blocks to compare against bg_record
 * bg_record_itr   IN     - iterator passed to list_remove() in dynamic
 *                          layout mode (presumably positioned on bg_record
 *                          - confirm against callers)
 * bg_record       IN     - candidate block being evaluated
 * overlap_check   IN     - LAYOUT_OVERLAP only: 0 rejects the candidate when
 *                          bg_record is not BG_BLOCK_INITED, 1 rejects it
 *                          when an overlapping block is not BG_BLOCK_FREE
 * overlapped_list IN/OUT - may be NULL; in test mode bg_record is appended
 *                          here with job_ptr set to an overlapping block's
 *                          job
 * query_mode      IN     - select query mode flags
 *
 * In dynamic layout mode an unusable bg_record is removed from its list and
 * freed via free_block_list(); a job still attached to the block is requeued
 * (or failed if that is not possible) first.
 *
 * RET 1 if bg_record cannot be used, 0 otherwise
 */
static int _check_for_booted_overlapping_blocks(
	List block_list, ListIterator bg_record_itr,
	bg_record_t *bg_record, int overlap_check, List overlapped_list,
	uint16_t query_mode)
{
	bg_record_t *found_record = NULL;
	ListIterator itr = NULL;
	int rc = 0;
	int overlap = 0;
	bool is_test = SELECT_IS_TEST(query_mode);

	/* this test only is for actually picking a block not testing */
	if (is_test && bg_conf->layout_mode == LAYOUT_DYNAMIC)
		return rc;

	/* Make sure no other blocks under this block
	   are booted and running jobs
	*/
	itr = list_iterator_create(block_list);
	while ((found_record = (bg_record_t*)list_next(itr)) != NULL) {
		if ((!found_record->bg_block_id)
		    || (bg_record == found_record)) {
			if (bg_conf->slurm_debug_flags & DEBUG_FLAG_BG_PICK)
				info("Don't need to look at myself %s %s",
				     bg_record->bg_block_id,
				     found_record->bg_block_id);
			continue;
		}

		slurm_mutex_lock(&block_state_mutex);
		overlap = blocks_overlap(bg_record, found_record);
		slurm_mutex_unlock(&block_state_mutex);

		if (overlap) {
			overlap = 0;
			/* make the available time on this block
			 * (bg_record) the max of this found_record's job
			 * or the one already set if in overlapped_block_list
			 * since we aren't setting job_running we
			 * don't have to remove them since the
			 * block_list should always be destroyed afterwards.
			 */
			if (is_test && overlapped_list
			    && found_record->job_ptr
			    && bg_record->job_running == NO_JOB_RUNNING) {
				/* Named distinctly so it does not shadow the
				 * outer block_list iterator. */
				ListIterator overlap_itr = list_iterator_create(
					overlapped_list);
				bg_record_t *tmp_rec = NULL;

				if (bg_conf->slurm_debug_flags
				    & DEBUG_FLAG_BG_PICK)
					info("found overlapping block %s "
					     "overlapped %s with job %u",
					     found_record->bg_block_id,
					     bg_record->bg_block_id,
					     found_record->job_ptr->job_id);

				/* Is bg_record already on overlapped_list? */
				while ((tmp_rec = list_next(overlap_itr))) {
					if (tmp_rec == bg_record)
						break;
				}
				list_iterator_destroy(overlap_itr);
				if (tmp_rec && tmp_rec->job_ptr->end_time
				    < found_record->job_ptr->end_time)
					tmp_rec->job_ptr =
						found_record->job_ptr;
				else if (!tmp_rec) {
					bg_record->job_ptr =
						found_record->job_ptr;
					list_append(overlapped_list,
						    bg_record);
				}
			}
			/* We already know this block doesn't work right
			 * now, so we only keep scanning to see whether
			 * another overlapping block ends later.
			 */
			if (rc)
				continue;
			/* This test is here to check if the block we
			 * chose is not booted or if there is a block
			 * overlapping that we could avoid freeing if
			 * we choose something else
			 */
			if (bg_conf->layout_mode == LAYOUT_OVERLAP
			    && ((overlap_check == 0 && bg_record->state
				 != BG_BLOCK_INITED)
				|| (overlap_check == 1 && found_record->state
				    != BG_BLOCK_FREE))) {

				if (!is_test) {
					rc = 1;
					break;
				}
			}

			if (((bg_conf->layout_mode == LAYOUT_DYNAMIC)
			     || ((!SELECT_IS_CHECK_FULL_SET(query_mode)
				  || SELECT_IS_MODE_RUN_NOW(query_mode))
				 && (bg_conf->layout_mode != LAYOUT_DYNAMIC)))
			    && ((found_record->job_running != NO_JOB_RUNNING)
				|| (found_record->state
				    & BG_BLOCK_ERROR_FLAG))) {
				if ((found_record->job_running
				     == BLOCK_ERROR_STATE)
				    || (found_record->state
					& BG_BLOCK_ERROR_FLAG))
					error("can't use %s, "
					      "overlapping block %s "
					      "is in an error state.",
					      bg_record->bg_block_id,
					      found_record->bg_block_id);
				else if (bg_conf->slurm_debug_flags
					 & DEBUG_FLAG_BG_PICK)
					info("can't use %s, there is "
					     "a job (%d) running on "
					     "an overlapping "
					     "block %s",
					     bg_record->bg_block_id,
					     found_record->job_running,
					     found_record->bg_block_id);

				if (bg_conf->layout_mode == LAYOUT_DYNAMIC) {
					List tmp_list = list_create(NULL);
					/* Take bg_record off the list it is
					 * being iterated from; its memory is
					 * released below.
					 */
					list_remove(bg_record_itr);
					slurm_mutex_lock(&block_state_mutex);

					/* From here on found_record is
					 * reused to point at the record that
					 * will be freed.
					 */
					if (bg_record->original) {
						if (bg_conf->slurm_debug_flags
						    & DEBUG_FLAG_BG_PICK)
							info("This was a "
							     "copy %s",
							     bg_record->
							     bg_block_id);
						found_record =
							bg_record->original;
					} else {
						if (bg_conf->slurm_debug_flags
						    & DEBUG_FLAG_BG_PICK)
							info("looking for "
							     "original");
						found_record =
							find_org_in_bg_list(
								bg_lists->main,
								bg_record);
					}

					if (bg_conf->slurm_debug_flags
					    & DEBUG_FLAG_BG_PICK)
						info("Removing unusable block "
						     "%s from the system.",
						     bg_record->bg_block_id);

					if (!found_record) {
						if (bg_conf->slurm_debug_flags
						    & DEBUG_FLAG_BG_PICK)
							info("This record %s "
							     "wasn't found in "
							     "the "
							     "bg_lists->main, "
							     "no big deal, it "
							     "probably wasn't "
							     "added",
							     bg_record->
							     bg_block_id);
						found_record = bg_record;
					} else
						destroy_bg_record(bg_record);

					list_push(tmp_list, found_record);
					slurm_mutex_unlock(&block_state_mutex);

					/* We need to make sure if a
					   job is running here to not
					   call the regular method since
					   we are inside the job write
					   lock already.
					*/
					if (found_record->job_ptr
					    && !IS_JOB_FINISHED(
						    found_record->job_ptr)) {
						/* Kept separate from rc: rc
						 * is this function's result
						 * flag, not an error code.
						 */
						int requeue_rc;

						info("Somehow block %s "
						     "is being freed, but "
						     "appears to already have "
						     "a job %u(%u) running "
						     "on it.",
						     found_record->bg_block_id,
						     found_record->
						     job_ptr->job_id,
						     found_record->job_running);
						requeue_rc = job_requeue(
							0,
							found_record->
							job_ptr->job_id,
							-1,
							(uint16_t)NO_VAL,
							false);
						if (requeue_rc) {
							error("Couldn't "
							      "requeue job %u, "
							      "failing it: %s",
							      found_record->
							      job_ptr->job_id,
							      slurm_strerror(
								      requeue_rc));
							job_fail(found_record->
								 job_ptr->
								 job_id);
						}
					}

					free_block_list(NO_VAL, tmp_list, 0, 0);
					list_destroy(tmp_list);
				}
				rc = 1;

				if (!is_test)
					break;
			}
		}
	}
	list_iterator_destroy(itr);

	return rc;
}