Esempio n. 1
0
/*
 * Test driver entry point.
 *
 * Requires exactly five operands after the program name, in this order:
 *   control_addr job_id1 job_id2 sched_port is_bluegene
 * They are stored in the variables of the same names (declared outside
 * this function), echoed to stdout, and then a fixed sequence of helper
 * calls is issued. A wrong operand count prints the usage text and exits
 * with status 1; otherwise "SUCCESS" is printed and the exit status is 0.
 */
int main(int argc, char * argv[])
{
	int next = 1;	/* index of the next operand to consume */

	if (argc != 6) {
		printf("Usage: %s, control_addr job_id1 job_id2 sched_port is_bluegene\n",
			argv[0]);
		exit(1);
	}

	/* Consume the operands left to right */
	control_addr = argv[next++];
	job_id1 = atoi(argv[next++]);
	job_id2 = atoi(argv[next++]);
	sched_port = atoi(argv[next++]);
	is_bluegene = atoi(argv[next++]);
	printf("control_addr=%s job_id=%ld,%ld sched_port=%d is_bluegene=%d\n",
		control_addr, job_id1, job_id2, sched_port, is_bluegene);

	/* Query state, then modify and start the first job */
	_get_jobs();
	_get_nodes();
	_modify_job(job_id1);
	_get_jobs();
	_start_job(job_id1);

	/* The suspend/resume round trip is skipped when is_bluegene is set */
	if (is_bluegene == 0) {
		_suspend_job(job_id1);
		_resume_job(job_id1);
	}

	/* Cancel the second job, wait five seconds, then list jobs again */
	_cancel_job(job_id2);
	sleep(5);
	_get_jobs();

	printf("SUCCESS\n");
	exit(0);
}
Esempio n. 2
0
/*
 * Test driver entry point.
 *
 * Requires six operands after the program name, in this order:
 *   auth_key control_addr e_port job_id sched_port is_bluegene
 * They are stored in the variables of the same names (declared outside
 * this function) and echoed to stdout. When built with _DEBUG set only
 * _single_msg() is run; otherwise the full request sequence below is
 * issued. Prints the usage text and exits with status 1 when too few
 * operands are given; otherwise prints "SUCCESS" and exits with status 0.
 */
int main(int argc, char * argv[])
{
	/* Six operands plus the program name: argv[6] is read below, so
	 * argc must be at least 7. With fewer, argv[6] would be NULL (or
	 * out of bounds) and atoi() on it is undefined behavior. */
	if (argc < 7) {
		printf("Usage: %s, auth_key control_addr e_port "
			"job_id sched_port is_bluegene\n", argv[0]);
		exit(1);
	}

	auth_key     = argv[1];
	control_addr = argv[2];
	e_port       = atoi(argv[3]);
	job_id       = atoi(argv[4]);
	sched_port   = atoi(argv[5]);
	is_bluegene  = atoi(argv[6]);
	printf("auth_key=%s control_addr=%s e_port=%d job_id=%d sched_port=%d "
		"is_bluegene=%d\n",
		auth_key, control_addr, e_port, job_id, sched_port, is_bluegene);

#if _DEBUG
	_single_msg();
#else
	_initialize();
	_get_jobs();
	_get_nodes();
	_job_will_run(job_id);
	_modify_job(job_id);
	_get_jobs();
	_start_job(job_id);
	_get_jobs();
	/* The suspend/resume round trip is skipped when is_bluegene is set */
	if (!is_bluegene) {
		_suspend_job(job_id);
		_resume_job(job_id);
	}
	_notify_job(job_id);
	_signal_job(job_id);
	/* A non-zero e_port runs the event manager; otherwise announce
	 * readiness on stdout and pause for three seconds */
	if (e_port)
		_event_mgr();
	else {
		printf("READY\n");
		sleep(3);
	}
	_cancel_job(job_id+1);
	_job_requeue(job_id);	/* Put job back into HELD state */
	sleep(15);
	_start_job(job_id);
	_get_jobs();
#endif
	printf("SUCCESS\n");
	exit(0);
}
Esempio n. 3
0
File: gang.c Progetto: corburn/slurm
/* Add the given job to the given partition, and if it remains running
 * then "cast it's shadow" over the active row of any partition with a
 * lower priority than the given partition. Return the sig state of the
 * job (GS_SUSPEND or GS_RESUME) */
static uint16_t _add_job_to_part(struct gs_part *p_ptr,
				 struct job_record *job_ptr)
{
	int i;
	struct gs_job *j_ptr;

	xassert(p_ptr);
	xassert(job_ptr->job_id > 0);
	xassert(job_ptr->job_resrcs);
	xassert(job_ptr->job_resrcs->node_bitmap);
	xassert(job_ptr->job_resrcs->core_bitmap);

	if (slurmctld_conf.debug_flags & DEBUG_FLAG_GANG) {
		info("gang: _add_job_to_part: adding job %u to %s",
		     job_ptr->job_id, p_ptr->part_name);
	}

	/* take care of any memory needs */
	if (!p_ptr->job_list) {
		p_ptr->job_list_size = default_job_list_size;
		p_ptr->job_list = xmalloc(p_ptr->job_list_size *
					  sizeof(struct gs_job *));
		/* job_list is initialized to be NULL filled */
	}

	/* protect against duplicates */
	i = _find_job_index(p_ptr, job_ptr->job_id);
	if (i >= 0) {
		/* This job already exists, but the resource allocation
		 * may have changed. In any case, remove the existing
		 * job before adding this new one.
		 */
		if (slurmctld_conf.debug_flags & DEBUG_FLAG_GANG) {
			info("gang: _add_job_to_part: duplicate job %u "
			     "detected", job_ptr->job_id);
		}
		_remove_job_from_part(job_ptr->job_id, p_ptr, false);
		_update_active_row(p_ptr, 0);
	}

	/* more memory management */
	if ((p_ptr->num_jobs + 1) == p_ptr->job_list_size) {
		p_ptr->job_list_size *= 2;
		xrealloc(p_ptr->job_list, p_ptr->job_list_size *
			 sizeof(struct gs_job *));
		/* enlarged job_list is initialized to be NULL filled */
	}
	j_ptr = xmalloc(sizeof(struct gs_job));

	/* gather job info */
	j_ptr->job_id    = job_ptr->job_id;
	j_ptr->job_ptr   = job_ptr;
	j_ptr->sig_state = GS_RESUME;  /* all jobs are running initially */
	j_ptr->row_state = GS_NO_ACTIVE; /* job is not in the active row */

	/* append this job to the job_list */
	p_ptr->job_list[p_ptr->num_jobs++] = j_ptr;

	/* determine the immediate fate of this job (run or suspend) */
	if (_job_fits_in_active_row(job_ptr, p_ptr)) {
		if (slurmctld_conf.debug_flags & DEBUG_FLAG_GANG) {
			info("gang: _add_job_to_part: job %u remains running",
			     job_ptr->job_id);
		}
		_add_job_to_active(job_ptr, p_ptr);
		/* note that this job is a "filler" for this row */
		j_ptr->row_state = GS_FILLER;
		/* all jobs begin in the run state, so
		 * there's no need to signal this job */

		/* since this job is running we need to "cast it's shadow"
		 * over lower priority partitions */
		_cast_shadow(j_ptr, p_ptr->priority);

	} else {
		if (slurmctld_conf.debug_flags & DEBUG_FLAG_GANG) {
			info("gang: _add_job_to_part: suspending job %u",
			     job_ptr->job_id);
		}
		if (p_ptr->num_shadows &&
		    (slurm_job_preempt_mode(job_ptr) !=
		     PREEMPT_MODE_SUSPEND)) {
			_preempt_job_queue(job_ptr->job_id);
		} else
			_suspend_job(job_ptr->job_id);
		j_ptr->sig_state = GS_SUSPEND;
	}

	_print_jobs(p_ptr);

	return j_ptr->sig_state;
}
Esempio n. 4
0
File: gang.c Progetto: corburn/slurm
/* Re-evaluate every job in the partition whose row_state equals the given
 * value, in job_list order.
 *
 * A job that still fits in the active row is re-added to it and casts its
 * shadow again. A job that no longer fits is marked GS_NO_ACTIVE; if it
 * was not already suspended it is first suspended (or queued for
 * preemption when the partition has shadows and the job's preempt mode is
 * not PREEMPT_MODE_SUSPEND) and its shadow is cleared. The position of
 * each job in the job_list is left untouched.
 *
 * IN p_ptr     - partition to process
 * IN row_state - row state selecting which jobs to process */
static void _refit_jobs_in_row_state(struct gs_part *p_ptr, int row_state)
{
	int i;
	struct gs_job *j_ptr;

	for (i = 0; i < p_ptr->num_jobs; i++) {
		j_ptr = p_ptr->job_list[i];
		if (j_ptr->row_state != row_state)
			continue;
		if (_job_fits_in_active_row(j_ptr->job_ptr, p_ptr)) {
			_add_job_to_active(j_ptr->job_ptr, p_ptr);
			_cast_shadow(j_ptr, p_ptr->priority);
			continue;
		}

		/* this job has been preempted by a shadow job.
		 * suspend it and preserve its job_list order */
		if (j_ptr->sig_state != GS_SUSPEND) {
			if (p_ptr->num_shadows &&
			    (slurm_job_preempt_mode(j_ptr->job_ptr) !=
			     PREEMPT_MODE_SUSPEND)) {
				_preempt_job_queue(j_ptr->job_id);
			} else
				_suspend_job(j_ptr->job_id);
			j_ptr->sig_state = GS_SUSPEND;
			_clear_shadow(j_ptr);
		}
		j_ptr->row_state = GS_NO_ACTIVE;
	}
}

/* Rebuild the active row BUT preserve the order of existing jobs.
 * This is called after one or more jobs have been removed from
 * the partition or if a higher priority "shadow" has been added
 * which could preempt running jobs.
 *
 * IN p_ptr        - partition whose active row is rebuilt
 * IN add_new_jobs - if zero, stop after re-evaluating the jobs that were
 *                   already GS_ACTIVE or GS_FILLER; otherwise also try to
 *                   bring GS_NO_ACTIVE jobs into the row
 */
static void _update_active_row(struct gs_part *p_ptr, int add_new_jobs)
{
	int i;
	struct gs_job *j_ptr;

	if (slurmctld_conf.debug_flags & DEBUG_FLAG_GANG) {
		info("gang: update_active_row: rebuilding part %s...",
		     p_ptr->part_name);
	}
	/* rebuild the active row, starting with any shadows */
	p_ptr->jobs_active = 0;
	for (i = 0; p_ptr->shadow && p_ptr->shadow[i]; i++) {
		_add_job_to_active(p_ptr->shadow[i]->job_ptr, p_ptr);
	}

	/* attempt to add the existing 'active' jobs first, then the
	 * existing 'filler' jobs; both passes apply the same rules */
	_refit_jobs_in_row_state(p_ptr, GS_ACTIVE);
	_refit_jobs_in_row_state(p_ptr, GS_FILLER);

	if (!add_new_jobs)
		return;

	/* attempt to add any new jobs; jobs with a priority of zero are
	 * skipped */
	for (i = 0; i < p_ptr->num_jobs; i++) {
		j_ptr = p_ptr->job_list[i];
		if ((j_ptr->row_state != GS_NO_ACTIVE) ||
		    (j_ptr->job_ptr->priority == 0))
			continue;
		if (_job_fits_in_active_row(j_ptr->job_ptr, p_ptr)) {
			_add_job_to_active(j_ptr->job_ptr, p_ptr);
			_cast_shadow(j_ptr, p_ptr->priority);
			/* note that this job is a "filler" for this row,
			 * blocked by a higher priority job */
			j_ptr->row_state = GS_FILLER;
			/* resume the job */
			if (j_ptr->sig_state == GS_SUSPEND) {
				_resume_job(j_ptr->job_id);
				j_ptr->sig_state = GS_RESUME;
			}
		}
	}
}
Esempio n. 5
0
File: gang.c Progetto: corburn/slurm
/* Drain preempt_job_list, acting on each queued job id according to the
 * job's preempt mode: suspend, cancel (SIGKILL), checkpoint (requeue,
 * falling back to vacate when requeue is not supported) or requeue
 * (only for batch jobs whose details allow requeue). Whenever the chosen
 * action does not end in SLURM_SUCCESS, including when no action
 * applies, the job is sent SIGKILL. Ids with no matching job record are
 * logged and skipped. */
static void _preempt_job_dequeue(void)
{
	uint32_t *queued_id;

	xassert(preempt_job_list);
	while ((queued_id = list_pop(preempt_job_list))) {
		struct job_record *job_ptr;
		uint16_t preempt_mode;
		uint32_t job_id = *queued_id;
		int rc = SLURM_ERROR;

		/* The list entry only carried the id; release it now */
		xfree(queued_id);

		job_ptr = find_job_record(job_id);
		if (job_ptr == NULL) {
			error("_preempt_job_dequeue could not find job %u",
			      job_id);
			continue;
		}
		preempt_mode = slurm_job_preempt_mode(job_ptr);

		if (preempt_mode == PREEMPT_MODE_SUSPEND) {
			/* ESLURM_DISABLED from the suspend is treated as
			 * success */
			rc = _suspend_job(job_id);
			if (rc == ESLURM_DISABLED)
				rc = SLURM_SUCCESS;
		} else if (preempt_mode == PREEMPT_MODE_CANCEL) {
			rc = job_signal(job_ptr->job_id, SIGKILL, 0, 0, true);
			if (rc == SLURM_SUCCESS)
				info("preempted job %u has been killed",
				     job_ptr->job_id);
		} else if (preempt_mode == PREEMPT_MODE_CHECKPOINT) {
			checkpoint_msg_t ckpt_msg;

			/* Try checkpoint + requeue first */
			memset(&ckpt_msg, 0, sizeof(checkpoint_msg_t));
			ckpt_msg.op     = CHECK_REQUEUE;
			ckpt_msg.job_id = job_ptr->job_id;
			rc = job_checkpoint(&ckpt_msg, 0, -1,
					    (uint16_t)NO_VAL);
			if (rc == ESLURM_NOT_SUPPORTED) {
				/* Fall back to checkpoint + vacate */
				memset(&ckpt_msg, 0, sizeof(checkpoint_msg_t));
				ckpt_msg.op     = CHECK_VACATE;
				ckpt_msg.job_id = job_ptr->job_id;
				rc = job_checkpoint(&ckpt_msg, 0, -1,
						    (uint16_t)NO_VAL);
			}
			if (rc != SLURM_SUCCESS)
				error("preempted job %u could not be "
				      "checkpointed: %s",
				      job_ptr->job_id, slurm_strerror(rc));
			else
				info("preempted job %u has been checkpointed",
				     job_ptr->job_id);
		} else if ((preempt_mode == PREEMPT_MODE_REQUEUE) &&
			   job_ptr->batch_flag && job_ptr->details &&
			   (job_ptr->details->requeue > 0)) {
			rc = job_requeue(0, job_ptr->job_id, -1,
					 (uint16_t)NO_VAL, true, 0);
			if (rc != SLURM_SUCCESS)
				error("preempted job %u could not be "
				      "requeued: %s",
				      job_ptr->job_id, slurm_strerror(rc));
			else
				info("preempted job %u has been requeued",
				     job_ptr->job_id);
		}

		if (rc == SLURM_SUCCESS)
			continue;

		/* Last resort: nothing above succeeded, so kill the job */
		rc = job_signal(job_ptr->job_id, SIGKILL, 0, 0, true);
		if (rc != SLURM_SUCCESS)
			info("preempted job %u kill failure %s",
			     job_ptr->job_id, slurm_strerror(rc));
		else
			info("preempted job %u had to be killed",
			     job_ptr->job_id);
	}
}
Esempio n. 6
0
File: gang.c Progetto: corburn/slurm
/* _cycle_job_list
 *
 * This is the heart of the timeslicer. The algorithm works as follows:
 *
 * 1. Each new job is added to the end of the job list, so the earliest job
 *    is at the front of the list.
 * 2. Any "shadow" jobs are first applied to the active_resmap. Then the
 *    active_resmap is filled out by starting with the first job in the list,
 *    and adding to it any job that doesn't conflict with the resources.
 * 3. When the timeslice has passed, all jobs that were added to the active
 *    resmap are moved to the back of the list (preserving their order among
 *    each other).
 * 4. Loop back to step 2, starting with the new "first job in the list".
 *
 * IN p_ptr - partition whose job_list is cycled
 */
static void _cycle_job_list(struct gs_part *p_ptr)
{
	int slot, k;
	struct gs_job *job;

	if (slurmctld_conf.debug_flags & DEBUG_FLAG_GANG)
		info("gang: entering _cycle_job_list");

	/* Re-prioritize the job_list and reset every row_state to
	 * GS_NO_ACTIVE */
	for (slot = 0; slot < p_ptr->num_jobs; slot++) {
		for (;;) {
			job = p_ptr->job_list[slot];
			if (job->row_state != GS_ACTIVE)
				break;
			/* Deactivate this job and rotate it to the back
			 * of the list; whichever job slides into this slot
			 * is examined on the next pass */
			job->row_state = GS_NO_ACTIVE;
			for (k = slot; k + 1 < p_ptr->num_jobs; k++)
				p_ptr->job_list[k] = p_ptr->job_list[k + 1];
			p_ptr->job_list[k] = job;
		}
		/* "job" is the entry now in this slot; it is not GS_ACTIVE */
		if (job->row_state == GS_FILLER)
			job->row_state = GS_NO_ACTIVE;
	}
	if (slurmctld_conf.debug_flags & DEBUG_FLAG_GANG)
		info("gang: _cycle_job_list reordered job list:");

	/* Rebuild the active row from the reordered list */
	_build_active_row(p_ptr);
	if (slurmctld_conf.debug_flags & DEBUG_FLAG_GANG)
		info("gang: _cycle_job_list new active job list:");
	_print_jobs(p_ptr);

	/* Suspend running jobs that did not make it into the active row */
	for (slot = 0; slot < p_ptr->num_jobs; slot++) {
		job = p_ptr->job_list[slot];
		if ((job->row_state != GS_NO_ACTIVE) ||
		    (job->sig_state != GS_RESUME))
			continue;

		if (slurmctld_conf.debug_flags & DEBUG_FLAG_GANG)
			info("gang: _cycle_job_list: suspending job %u",
			     job->job_id);
		/* With shadows on this partition and a preempt mode other
		 * than suspend, queue the job for preemption; otherwise
		 * suspend it directly */
		if (p_ptr->num_shadows &&
		    (slurm_job_preempt_mode(job->job_ptr) !=
		     PREEMPT_MODE_SUSPEND))
			_preempt_job_queue(job->job_id);
		else
			_suspend_job(job->job_id);
		job->sig_state = GS_SUSPEND;
		_clear_shadow(job);
	}

	/* Resume suspended jobs that are now in the active row */
	for (slot = 0; slot < p_ptr->num_jobs; slot++) {
		job = p_ptr->job_list[slot];
		if ((job->row_state != GS_ACTIVE) ||
		    (job->sig_state != GS_SUSPEND) ||
		    (job->job_ptr->priority == 0))	/* Redundant check */
			continue;

		if (slurmctld_conf.debug_flags & DEBUG_FLAG_GANG)
			info("gang: _cycle_job_list: resuming job %u",
			     job->job_id);
		_resume_job(job->job_id);
		job->sig_state = GS_RESUME;
		_cast_shadow(job, p_ptr->priority);
	}
	if (slurmctld_conf.debug_flags & DEBUG_FLAG_GANG)
		info("gang: leaving _cycle_job_list");
}