Пример #1
0
/* returns 1 if job and nodes are ready for job to begin, 0 otherwise */
static int _wait_bluegene_block_ready(resource_allocation_response_msg_t *alloc)
{
	int is_ready = 0, i, rc;
	char *block_id = NULL;
	double cur_delay = 0;
	double cur_sleep = 0;
	int max_delay = BG_FREE_PREVIOUS_BLOCK + BG_MIN_BLOCK_BOOT +
		(BG_INCR_BLOCK_BOOT * alloc->node_cnt);

	select_g_select_jobinfo_get(alloc->select_jobinfo,
				    SELECT_JOBDATA_BLOCK_ID,
				    &block_id);

	for (i = 0; cur_delay < max_delay; i++) {
		cur_sleep = POLL_SLEEP * i;
		if (i == 1) {
			debug("Waiting for block %s to become ready for job",
			      block_id);
		}
		if (i) {
			usleep(1000000 * cur_sleep);
			rc = _blocks_dealloc();
			if ((rc == 0) || (rc == -1))
				cur_delay += cur_sleep;
			debug2("still waiting");
		}

		rc = slurm_job_node_ready(alloc->job_id);

		if (rc == READY_JOB_FATAL)
			break;				/* fatal error */
		if ((rc == READY_JOB_ERROR) || (rc == EAGAIN))
			continue;			/* retry */
		if ((rc & READY_JOB_STATE) == 0)	/* job killed */
			break;
		if (rc & READY_NODE_STATE) {		/* job and node ready */
			is_ready = 1;
			break;
		}
		if (destroy_job)
			break;
	}
	if (is_ready)
     		debug("Block %s is ready for job", block_id);
	else if (!destroy_job)
		error("Block %s still not ready", block_id);
	else	/* destroy_job set and slurmctld not responing */
		is_ready = 0;

	xfree(block_id);

	return is_ready;
}
Пример #2
0
static int _wait_bluegene_block_ready(resource_allocation_response_msg_t *alloc)
{
	int is_ready = SLURM_ERROR, i, rc = 0;
	char *block_id = NULL;
	int cur_delay = 0;
	int max_delay = BG_FREE_PREVIOUS_BLOCK + BG_MIN_BLOCK_BOOT +
		(BG_INCR_BLOCK_BOOT * alloc->node_cnt);

	select_g_select_jobinfo_get(alloc->select_jobinfo,
				    SELECT_JOBDATA_BLOCK_ID,
				    &block_id);

	for (i=0; (cur_delay < max_delay); i++) {
		if (i) {
			if (i == 1) {
				info("Waiting for block %s to become ready for "
				     "job", block_id);
			} else
				debug("still waiting");
			sleep(POLL_SLEEP);
			rc = _blocks_dealloc();
			if ((rc == 0) || (rc == -1))
				cur_delay += POLL_SLEEP;
		}

		rc = slurm_job_node_ready(alloc->job_id);

		if (rc == READY_JOB_FATAL)
			break;				/* fatal error */
		if ((rc == READY_JOB_ERROR) || (rc == EAGAIN))
			continue;			/* retry */
		if ((rc & READY_JOB_STATE) == 0)	/* job killed */
			break;
		if (rc & READY_NODE_STATE) {		/* job and node ready */
			is_ready = SLURM_SUCCESS;
			break;
		}
	}

	if (is_ready == SLURM_SUCCESS)
     		info("Block %s is ready for job %u", block_id, alloc->job_id);
	else if ((rc & READY_JOB_STATE) == 0)
		info("Job %u no longer running", alloc->job_id);
	else
		info("Problem running job %u", alloc->job_id);
	xfree(block_id);

	return is_ready;
}