Exemplo n.º 1
0
/*
 * _wait_nodes_ready - poll the controller until the allocated nodes are ready.
 *
 * alloc IN - allocation response; job_id is polled and node_list is only
 *            used in log messages.
 * RET 1 if the job and its nodes are ready for the job to begin, 0 otherwise.
 *
 * Returns 1 immediately when either the suspend or the resume timeout is
 * zero (power save mode disabled), or when opt.wait_all_nodes is zero.
 * Otherwise polls slurm_job_node_ready() every POLL_SLEEP seconds for at most
 * 5 * (suspend_time + resume_time) seconds of accumulated sleep.
 *
 * pending_job_id is set to the job id for the duration of the wait and
 * cleared before returning.
 */
static int _wait_nodes_ready(resource_allocation_response_msg_t *alloc)
{
	int is_ready = 0, i, rc;
	int cur_delay = 0;
	int suspend_time, resume_time, max_delay;

	suspend_time = slurm_get_suspend_timeout();
	resume_time  = slurm_get_resume_timeout();
	if ((suspend_time == 0) || (resume_time == 0))
		return 1;	/* Power save mode disabled */
	max_delay = suspend_time + resume_time;
	max_delay *= 5;		/* Allow for ResumeRate support */

	pending_job_id = alloc->job_id;

	if (opt.wait_all_nodes == (uint16_t) NO_VAL)
		opt.wait_all_nodes = DEFAULT_WAIT_ALL_NODES;

	for (i = 0; (cur_delay < max_delay); i++) {
		if (i) {
			if (i == 1)
				info("Waiting for nodes to boot");
			else
				debug("still waiting");
			sleep(POLL_SLEEP);
			cur_delay += POLL_SLEEP;
		}

		if (!opt.wait_all_nodes) {
			/* Caller does not want to wait for every node */
			is_ready = 1;
			break;
		}

		rc = slurm_job_node_ready(alloc->job_id);

		if (rc == READY_JOB_FATAL)
			break;				/* fatal error */
		if ((rc == READY_JOB_ERROR) || (rc == EAGAIN)) {
			/*
			 * Honor an interrupt here too; otherwise an
			 * unresponsive controller would keep us looping
			 * until max_delay expires.
			 */
			if (allocation_interrupted)
				break;
			continue;			/* retry */
		}
		if ((rc & READY_JOB_STATE) == 0)	/* job killed */
			break;
		if (rc & READY_NODE_STATE) {		/* job and node ready */
			is_ready = 1;
			break;
		}
		if (allocation_interrupted)
			break;
	}

	if (is_ready) {
		if (i > 0)
			info ("Nodes %s are ready for job", alloc->node_list);
	} else if (!allocation_interrupted) {
		error("Nodes %s are still not ready", alloc->node_list);
	}
	/* else: allocation interrupted or slurmctld not responding; stay quiet */

	pending_job_id = 0;

	return is_ready;
}
Exemplo n.º 2
0
/*
 * _wait_bluegene_block_ready - poll the controller until the job's block is
 * ready.
 *
 * alloc IN - allocation response; job_id is polled, node_cnt scales the
 *            maximum wait, select_jobinfo supplies the block id for logging.
 * RET 1 if the job and its nodes are ready for the job to begin, 0 otherwise.
 *
 * The sleep before poll number i is POLL_SLEEP * i seconds. Sleep time is
 * charged against max_delay only when _blocks_dealloc() returns 0 or -1
 * (NOTE(review): presumably "no blocks are still being freed" - confirm
 * against _blocks_dealloc()).
 */
static int _wait_bluegene_block_ready(resource_allocation_response_msg_t *alloc)
{
	int is_ready = 0, i, rc;
	char *block_id = NULL;
	double cur_delay = 0;
	double cur_sleep = 0;
	int max_delay = BG_FREE_PREVIOUS_BLOCK + BG_MIN_BLOCK_BOOT +
		(BG_INCR_BLOCK_BOOT * alloc->node_cnt);

	select_g_select_jobinfo_get(alloc->select_jobinfo,
				    SELECT_JOBDATA_BLOCK_ID,
				    &block_id);

	for (i = 0; cur_delay < max_delay; i++) {
		cur_sleep = POLL_SLEEP * i;
		if (i == 1) {
			debug("Waiting for block %s to become ready for job",
			      block_id);
		}
		if (i) {
			usleep(1000000 * cur_sleep);
			rc = _blocks_dealloc();
			if ((rc == 0) || (rc == -1))
				cur_delay += cur_sleep;
			debug2("still waiting");
		}

		rc = slurm_job_node_ready(alloc->job_id);

		if (rc == READY_JOB_FATAL)
			break;				/* fatal error */
		if ((rc == READY_JOB_ERROR) || (rc == EAGAIN)) {
			/*
			 * Check destroy_job on the retry path as well so a
			 * cancelled job does not keep polling an
			 * unresponsive controller until max_delay.
			 */
			if (destroy_job)
				break;
			continue;			/* retry */
		}
		if ((rc & READY_JOB_STATE) == 0)	/* job killed */
			break;
		if (rc & READY_NODE_STATE) {		/* job and node ready */
			is_ready = 1;
			break;
		}
		if (destroy_job)
			break;
	}

	if (is_ready)
		debug("Block %s is ready for job", block_id);
	else if (!destroy_job)
		error("Block %s still not ready", block_id);
	/* else: destroy_job set or slurmctld not responding; stay quiet */

	xfree(block_id);

	return is_ready;
}
Exemplo n.º 3
0
/*
 * _wait_bluegene_block_ready - poll the controller until the job's block is
 * ready.
 *
 * alloc IN - allocation response; job_id is polled, node_cnt scales the
 *            maximum wait, select_jobinfo supplies the block id for logging.
 * RET SLURM_SUCCESS once both the job and node state bits are reported,
 *     SLURM_ERROR otherwise (fatal error, job gone, or time budget used up).
 *
 * POLL_SLEEP seconds are slept between polls; that time counts toward the
 * budget only when _blocks_dealloc() returns 0 or -1.
 */
static int _wait_bluegene_block_ready(resource_allocation_response_msg_t *alloc)
{
	int status = SLURM_ERROR;
	int ready_flags = 0;	/* last slurm_job_node_ready() result */
	int dealloc_rc;
	int attempt;
	int waited = 0;
	char *block_id = NULL;
	int limit = BG_FREE_PREVIOUS_BLOCK + BG_MIN_BLOCK_BOOT +
		(BG_INCR_BLOCK_BOOT * alloc->node_cnt);

	select_g_select_jobinfo_get(alloc->select_jobinfo,
				    SELECT_JOBDATA_BLOCK_ID,
				    &block_id);

	for (attempt = 0; waited < limit; attempt++) {
		if (attempt != 0) {
			/* Announce the wait once, then log quietly */
			if (attempt != 1)
				debug("still waiting");
			else
				info("Waiting for block %s to become ready for job",
				     block_id);

			sleep(POLL_SLEEP);
			dealloc_rc = _blocks_dealloc();
			if ((dealloc_rc == 0) || (dealloc_rc == -1))
				waited += POLL_SLEEP;
		}

		ready_flags = slurm_job_node_ready(alloc->job_id);

		if (ready_flags == READY_JOB_FATAL) {
			/* fatal error */
			break;
		}
		if ((ready_flags == READY_JOB_ERROR) ||
		    (ready_flags == EAGAIN)) {
			/* transient failure, poll again */
			continue;
		}
		if (!(ready_flags & READY_JOB_STATE)) {
			/* job killed */
			break;
		}
		if (ready_flags & READY_NODE_STATE) {
			/* job and node ready */
			status = SLURM_SUCCESS;
			break;
		}
	}

	if (status == SLURM_SUCCESS) {
		info("Block %s is ready for job %u", block_id, alloc->job_id);
	} else if (ready_flags & READY_JOB_STATE) {
		info("Problem running job %u", alloc->job_id);
	} else {
		info("Job %u no longer running", alloc->job_id);
	}

	xfree(block_id);

	return status;
}
Exemplo n.º 4
0
/*
 * _wait_part_ready - poll the controller until the job is ready to start.
 *
 * job_id IN - job to poll.
 * RET 1 if the job and its nodes are ready, or if the job has been killed
 *     (so the caller does not report a prolog error); 0 on fatal error or
 *     when the time budget is used up.
 *
 * Uses the cur_delay / max_delay variables declared outside this function.
 * Sleep time counts toward the budget only when _partitions_dealloc()
 * returns 0 or -1.
 */
static int _wait_part_ready(uint32_t job_id)
{
	int ready = 0;
	int attempt;
	int dealloc_rc;
	int state;

	max_delay = BG_FREE_PREVIOUS_BLOCK + BG_MIN_BLOCK_BOOT +
		(BG_INCR_BLOCK_BOOT * _get_job_size(job_id));

#if _DEBUG
	printf("Waiting for job %u to become ready.", job_id);
#endif

	for (attempt = 0; cur_delay < max_delay; attempt++) {
		if (attempt != 0) {
			sleep(POLL_SLEEP);
			dealloc_rc = _partitions_dealloc();
			if ((dealloc_rc == 0) || (dealloc_rc == -1))
				cur_delay += POLL_SLEEP;
#if _DEBUG
			printf(".");
#endif
		}

		state = slurm_job_node_ready(job_id);
		if (state == READY_JOB_FATAL) {
			/* fatal error */
			break;
		}
		if (state == READY_JOB_ERROR) {
			/* error, retry */
			continue;
		}
		/*
		 * A killed job (job state bit clear) is reported as ready so
		 * we don't get a prolog error; otherwise the node state bit
		 * decides.
		 */
		if (((state & READY_JOB_STATE) == 0) ||
		    (state & READY_NODE_STATE)) {
			ready = 1;
			break;
		}
	}

#if _DEBUG
	if (ready)
		printf("\nJob %u is ready.\n", job_id);
	else
		printf("\n");
#endif
	if (!ready)
		fprintf(stderr, "Job %u is not ready.\n", job_id);

	return ready;
}
Exemplo n.º 5
0
/*
 * _wait_nodes_ready - poll the controller until the job's nodes are ready.
 *
 * job_id IN - job to poll.
 * RET SLURM_SUCCESS when the nodes are ready, or immediately when either
 *     the suspend or resume timeout is zero (power save mode disabled);
 *     SLURM_ERROR on fatal error, job termination, or when the time budget
 *     of 5 * (suspend timeout + resume timeout) seconds is used up.
 */
static int _wait_nodes_ready(uint32_t job_id)
{
	int status = SLURM_ERROR;
	int state = 0;		/* last slurm_job_node_ready() result */
	int attempt;
	int waited = 0;
	int limit;
	int suspend_secs = slurm_get_suspend_timeout();
	int resume_secs  = slurm_get_resume_timeout();

	if (!suspend_secs || !resume_secs) {
		/* Power save mode disabled */
		return SLURM_SUCCESS;
	}

	/* Allow for ResumeRate support */
	limit = (suspend_secs + resume_secs) * 5;

	for (attempt = 0; waited < limit; attempt++) {
		if (attempt != 0) {
			if (attempt == 1)
				info("Waiting for nodes to boot");
			sleep(POLL_SLEEP);
			waited += POLL_SLEEP;
		}

		state = slurm_job_node_ready(job_id);

		if (state == READY_JOB_FATAL) {
			/* fatal error */
			break;
		}
		if ((state == READY_JOB_ERROR) || (state == EAGAIN)) {
			/* transient failure, poll again */
			continue;
		}
		if (!(state & READY_JOB_STATE)) {
			/* job killed */
			break;
		}
		if (state & READY_NODE_STATE) {
			/* job and node ready */
			status = SLURM_SUCCESS;
			break;
		}
	}

	if (status == SLURM_SUCCESS) {
		info("Nodes are ready for job %u", job_id);
	} else if (state & READY_JOB_STATE) {
		info("Problem running job %u", job_id);
	} else {
		info("Job %u no longer running", job_id);
	}

	return status;
}
Exemplo n.º 6
0
/*
 * _wait_part_not_ready - poll the controller until the job's nodes are no
 * longer reported as ready.
 *
 * job_id IN - job to poll.
 *
 * Gives up on a fatal error or once the time budget
 * MIN_DELAY + INCR_DELAY * _get_job_size(job_id) is used up, in which case
 * "Job %u is still ready." is written to stderr. Uses the cur_delay /
 * max_delay variables declared outside this function.
 */
static void _wait_part_not_ready(uint32_t job_id)
{
	int still_ready = 1;
	int attempt;
	int state;

	max_delay = MIN_DELAY + (INCR_DELAY * _get_job_size(job_id));

#if _DEBUG
	printf("Waiting for job %u to be not ready.", job_id);
#endif

	for (attempt = 0; cur_delay < max_delay; attempt++) {
		if (attempt != 0) {
			sleep(POLL_SLEEP);
			cur_delay += POLL_SLEEP;
#if _DEBUG
			printf(".");
#endif
		}

		state = slurm_job_node_ready(job_id);
		if (state == READY_JOB_FATAL) {
			/* fatal error */
			break;
		}
		if (state == READY_JOB_ERROR) {
			/* error, retry */
			continue;
		}
		if (!(state & READY_NODE_STATE)) {
			still_ready = 0;
			break;
		}
	}

#if _DEBUG
	if (still_ready != 1)
		printf("\nJob %u is not ready.\n", job_id);
	else
		printf("\n");
#endif
	if (still_ready == 1)
		fprintf(stderr, "Job %u is still ready.\n", job_id);

}
Exemplo n.º 7
0
static void _wait_part_not_ready(uint32_t job_id)
{
	int is_ready = 1, rc;

#if _DEBUG
	printf("Waiting for job %u to be not ready.", job_id);
#endif

	/* It has been decided waiting forever is a better solution
	   than ending early and saying we are done when in reality
	   the job is still running.  So now we trust the slurmctld to
	   tell us when we are done and never end until that happens.
	*/
	while (1) {
		rc = slurm_job_node_ready(job_id);
		if (rc == READY_JOB_FATAL)
			break;				/* fatal error */
		if (rc == READY_JOB_ERROR)		/* error */
			continue;			/* retry */
		if ((rc & READY_NODE_STATE) == 0) {
			is_ready = 0;
			break;
		}
		sleep(POLL_SLEEP);
#if _DEBUG
		printf(".");
#endif
	}

#if _DEBUG
	if (is_ready == 1)
		printf("\n");
	else
     		printf("\nJob %u is not ready.\n", job_id);
#endif
	if (is_ready == 1)
		fprintf(stderr, "Job %u is still ready.\n", job_id);

}
Exemplo n.º 8
0
/*
 * _wait_nodes_ready - poll the controller until the allocated nodes are ready.
 *
 * alloc IN/OUT - allocation response; job_id is polled and node_list is used
 *                in log messages. If alias_list is "TBD" once the nodes are
 *                ready, the allocation is looked up again and alias_list is
 *                replaced with the value from that lookup.
 * RET 1 if the job and its nodes are ready for the job to begin, 0 otherwise.
 *
 * Returns 1 immediately when either the suspend or the resume timeout is
 * zero (power save mode disabled). Otherwise polls every POLL_SLEEP seconds
 * for at most 5 * (suspend_time + resume_time) seconds of accumulated sleep.
 *
 * pending_job_id is set to the job id for the duration of the wait and
 * cleared before returning.
 */
static int _wait_nodes_ready(resource_allocation_response_msg_t *alloc)
{
	int is_ready = 0, i, rc;
	int cur_delay = 0;
	int suspend_time, resume_time, max_delay;

	suspend_time = slurm_get_suspend_timeout();
	resume_time  = slurm_get_resume_timeout();
	if ((suspend_time == 0) || (resume_time == 0))
		return 1;	/* Power save mode disabled */
	max_delay = suspend_time + resume_time;
	max_delay *= 5;		/* Allow for ResumeRate support */

	pending_job_id = alloc->job_id;

	for (i = 0; (cur_delay < max_delay); i++) {
		if (i) {
			if (i == 1)
				verbose("Waiting for nodes to boot");
			else
				debug("still waiting");
			sleep(POLL_SLEEP);
			cur_delay += POLL_SLEEP;
		}

		rc = slurm_job_node_ready(alloc->job_id);

		if (rc == READY_JOB_FATAL)
			break;				/* fatal error */
		if ((rc == READY_JOB_ERROR) || (rc == EAGAIN)) {
			/*
			 * Check destroy_job on the retry path as well so a
			 * cancelled job does not keep polling an
			 * unresponsive controller until max_delay.
			 */
			if (destroy_job)
				break;
			continue;			/* retry */
		}
		if ((rc & READY_JOB_STATE) == 0)	/* job killed */
			break;
		if (rc & READY_NODE_STATE) {		/* job and node ready */
			is_ready = 1;
			break;
		}
		if (destroy_job)
			break;
	}

	if (is_ready) {
		resource_allocation_response_msg_t *resp;
		char *tmp_str;
		if (i > 0)
			verbose("Nodes %s are ready for job", alloc->node_list);
		if (alloc->alias_list && !strcmp(alloc->alias_list, "TBD") &&
		    (slurm_allocation_lookup_lite(pending_job_id, &resp)
		     == SLURM_SUCCESS)) {
			/*
			 * Swap the strings so the placeholder is released
			 * together with resp and alloc keeps the new list.
			 */
			tmp_str = alloc->alias_list;
			alloc->alias_list = resp->alias_list;
			resp->alias_list = tmp_str;
			slurm_free_resource_allocation_response_msg(resp);
		}
	} else if (!destroy_job) {
		error("Nodes %s are still not ready", alloc->node_list);
	}
	/* else: destroy_job set or slurmctld not responding; stay quiet */

	pending_job_id = 0;

	return is_ready;
}
Exemplo n.º 9
0
/*
 * _wait_nodes_ready - poll the controller until the allocated nodes are ready.
 *
 * alloc IN/OUT - allocation response; job_id is polled and node_list is used
 *                in log messages. If alias_list is "TBD" once the nodes are
 *                ready, the allocation is looked up again and alias_list is
 *                replaced with the value from that lookup.
 * RET 1 if the job and its nodes are ready for the job to begin, 0 otherwise.
 *
 * Returns 1 immediately when either the suspend or the resume timeout is
 * zero (power save mode disabled). Otherwise the sleep before poll number i
 * is POLL_SLEEP * i seconds, and polling stops once the accumulated sleep
 * reaches 5 * (suspend_time + resume_time) seconds.
 *
 * If the job is found to have been killed while destroy_job is not already
 * set, an error is logged and destroy_job is set to true.
 *
 * pending_job_id is set to the job id for the duration of the wait and
 * cleared before returning.
 */
static int _wait_nodes_ready(resource_allocation_response_msg_t *alloc)
{
	int is_ready = 0, i, rc;
	double cur_delay = 0;
	double cur_sleep = 0;
	int suspend_time, resume_time, max_delay;
	bool job_killed = false;

	suspend_time = slurm_get_suspend_timeout();
	resume_time  = slurm_get_resume_timeout();
	if ((suspend_time == 0) || (resume_time == 0))
		return 1;	/* Power save mode disabled */
	max_delay = suspend_time + resume_time;
	max_delay *= 5;		/* Allow for ResumeRate support */

	pending_job_id = alloc->job_id;

	for (i = 0; cur_delay < max_delay; i++) {
		if (i) {
			cur_sleep = POLL_SLEEP * i;
			if (i == 1) {
				verbose("Waiting for nodes to boot (delay looping %d times @ %f secs x index)",
					max_delay, POLL_SLEEP);
			} else {
				debug("Waited %f sec and still waiting: next sleep for %f sec",
				      cur_delay, cur_sleep);
			}
			usleep(1000000 * cur_sleep);
			cur_delay += cur_sleep;
		}

		rc = slurm_job_node_ready(alloc->job_id);
		if (rc == READY_JOB_FATAL)
			break;				/* fatal error */
		if ((rc == READY_JOB_ERROR) || (rc == EAGAIN)) {
			/*
			 * Check destroy_job on the retry path as well so a
			 * cancelled job does not keep polling an
			 * unresponsive controller until max_delay.
			 */
			if (destroy_job)
				break;
			continue;			/* retry */
		}
		if ((rc & READY_JOB_STATE) == 0) {	/* job killed */
			job_killed = true;
			break;
		}
		if (rc & READY_NODE_STATE) {		/* job and node ready */
			is_ready = 1;
			break;
		}
		if (destroy_job)
			break;
	}

	if (is_ready) {
		resource_allocation_response_msg_t *resp;
		char *tmp_str;
		if (i > 0)
			verbose("Nodes %s are ready for job", alloc->node_list);
		if (alloc->alias_list && !xstrcmp(alloc->alias_list, "TBD") &&
		    (slurm_allocation_lookup(pending_job_id, &resp)
		     == SLURM_SUCCESS)) {
			/*
			 * Swap the strings so the placeholder is released
			 * together with resp and alloc keeps the new list.
			 */
			tmp_str = alloc->alias_list;
			alloc->alias_list = resp->alias_list;
			resp->alias_list = tmp_str;
			slurm_free_resource_allocation_response_msg(resp);
		}
	} else if (!destroy_job) {
		if (job_killed) {
			error("Job allocation %u has been revoked",
			      alloc->job_id);
			destroy_job = true;
		} else
			error("Nodes %s are still not ready", alloc->node_list);
	}
	/* else: destroy_job set or slurmctld not responding; stay quiet */

	pending_job_id = 0;

	return is_ready;
}