Пример #1
0
/*
 * assign_front_end - choose the front end node on which a job is started
 * job_ptr IN - job needing a front end node; access is checked with
 *	_front_end_access()
 * RET the selected front end record (marked ALLOCATED, running job count
 *	incremented) or NULL if no node could be used. Always NULL when
 *	built without HAVE_FRONT_END.
 */
extern front_end_record_t *assign_front_end(struct job_record *job_ptr)
{
#ifdef HAVE_FRONT_END
	front_end_record_t *candidate = NULL, *chosen = NULL;
	uint32_t flag_bits;
	int inx;

	/* An interactive job without a batch host prefers its submit host */
	if (!job_ptr->batch_host && (job_ptr->batch_flag == 0))
		candidate = find_front_end_record(job_ptr->alloc_node);

	if (candidate) {
		/* The submit host is a front end node: use it or give up */
		if (IS_NODE_DOWN(candidate)  ||
		    IS_NODE_DRAIN(candidate) ||
		    IS_NODE_NO_RESPOND(candidate) ||
		    !_front_end_access(candidate, job_ptr)) {
			info("%s: front-end node %s not available for job %u",
			     __func__, job_ptr->alloc_node, job_ptr->job_id);
			return NULL;
		}
		chosen = candidate;
	} else {
		for (inx = 0; inx < front_end_node_cnt; inx++) {
			candidate = &front_end_nodes[inx];

			if (job_ptr->batch_host) {
				/* Only the named front end node qualifies */
				if (xstrcmp(job_ptr->batch_host,
					    candidate->name))
					continue;
				/* Named node denies access: stop searching */
				if (!_front_end_access(candidate, job_ptr))
					break;
			} else if (IS_NODE_DOWN(candidate)  ||
				   IS_NODE_DRAIN(candidate) ||
				   IS_NODE_NO_RESPOND(candidate) ||
				   !_front_end_access(candidate, job_ptr)) {
				/* Not usable for a new job */
				continue;
			}

			/* Keep the node running the fewest jobs; the first
			 * one seen wins a tie */
			if (!chosen ||
			    (candidate->job_cnt_run < chosen->job_cnt_run))
				chosen = candidate;
		}
	}

	if (!chosen) {
		if (job_ptr->batch_host) {
			/* A specific front end node was required */
			error("assign_front_end: front end node %s not found",
			      job_ptr->batch_host);
		} else {
			/* Any usable front end node would have done */
			error("assign_front_end: no available front end nodes found");
		}
		return NULL;
	}

	/* Preserve the flag bits, replace the base state with ALLOCATED */
	flag_bits = chosen->node_state & NODE_STATE_FLAGS;
	chosen->node_state = NODE_STATE_ALLOCATED | flag_bits;
	chosen->job_cnt_run++;
	return chosen;
#else
	return NULL;
#endif
}
Пример #2
0
/*
 * sync_front_end_state - synchronize job pointers and front-end node state
 *
 * Resets the running/completing job counters of every front end node,
 * walks job_list to re-resolve each job's front_end_ptr from its batch_host
 * and rebuild those counters, then adjusts each node's ALLOCATED/IDLE base
 * state and COMPLETING flag to agree with the counters.
 * A running job whose front end node can no longer be found is moved to
 * JOB_NODE_FAIL | JOB_COMPLETING.
 * Does nothing when built without HAVE_FRONT_END.
 */
extern void sync_front_end_state(void)
{
#ifdef HAVE_FRONT_END
	ListIterator job_iterator;
	struct job_record *job_ptr;
	front_end_record_t *front_end_ptr;
	/* Same width as assign_front_end() uses for this mask, so no flag
	 * bit of node_state can be lost in the temporary.
	 * NOTE(review): node_state's declared width is not visible here;
	 * widening is harmless if the field is only 16 bits. */
	uint32_t state_flags;
	int i;

	/* Start from zero; the counters are rebuilt from the job list */
	for (i = 0, front_end_ptr = front_end_nodes;
	     i < front_end_node_cnt; i++, front_end_ptr++) {
		front_end_ptr->job_cnt_comp = 0;
		front_end_ptr->job_cnt_run  = 0;
	}

	job_iterator = list_iterator_create(job_list);
	while ((job_ptr = (struct job_record *) list_next(job_iterator))) {
		if (job_ptr->batch_host) {
			job_ptr->front_end_ptr =
				find_front_end_record(job_ptr->batch_host);
			if ((job_ptr->front_end_ptr == NULL) &&
			    IS_JOB_RUNNING(job_ptr)) {
				error("front end node %s has vanished, "
				      "killing job %u",
				      job_ptr->batch_host, job_ptr->job_id);
				job_ptr->job_state = JOB_NODE_FAIL |
						     JOB_COMPLETING;
			} else if (job_ptr->front_end_ptr == NULL) {
				/* Job is not running: only report it */
				info("front end node %s has vanished",
				     job_ptr->batch_host);
			} else if (IS_JOB_COMPLETING(job_ptr)) {
				job_ptr->front_end_ptr->job_cnt_comp++;
			} else if (IS_JOB_RUNNING(job_ptr)) {
				job_ptr->front_end_ptr->job_cnt_run++;
			}
		} else {
			job_ptr->front_end_ptr = NULL;
		}
	}
	list_iterator_destroy(job_iterator);

	for (i = 0, front_end_ptr = front_end_nodes;
	     i < front_end_node_cnt; i++, front_end_ptr++) {
		/* IDLE/UNKNOWN node with running jobs becomes ALLOCATED */
		if ((IS_NODE_IDLE(front_end_ptr) ||
		     IS_NODE_UNKNOWN(front_end_ptr)) &&
		    (front_end_ptr->job_cnt_run != 0)) {
			state_flags = front_end_ptr->node_state &
				      NODE_STATE_FLAGS;
			front_end_ptr->node_state = NODE_STATE_ALLOCATED |
						    state_flags;
		}
		/* ALLOCATED node without running jobs becomes IDLE */
		if (IS_NODE_ALLOCATED(front_end_ptr) &&
		    (front_end_ptr->job_cnt_run == 0)) {
			state_flags = front_end_ptr->node_state &
				      NODE_STATE_FLAGS;
			front_end_ptr->node_state = NODE_STATE_IDLE |
						    state_flags;
		}
		/* COMPLETING flag must mirror the completing job count */
		if (IS_NODE_COMPLETING(front_end_ptr) &&
		    (front_end_ptr->job_cnt_comp == 0)) {
			front_end_ptr->node_state &= (~NODE_STATE_COMPLETING);
		}
		if (!IS_NODE_COMPLETING(front_end_ptr) &&
		    (front_end_ptr->job_cnt_comp != 0)) {
			front_end_ptr->node_state |= NODE_STATE_COMPLETING;
		}
	}

	if (slurmctld_conf.debug_flags & DEBUG_FLAG_FRONT_END)
		log_front_end_state();
#endif
}
Пример #3
0
/*
 * load_all_front_end_state - Load the front_end node state from file, recover
 *	on slurmctld restart. Execute this after loading the configuration
 *	file data. Data goes into common storage.
 * IN state_only - if true, overwrite only front_end node state and reason.
 *	Use this to overwrite the "UNKNOWN" state typically used in slurm.conf
 * RET 0 on success, ENOENT if the state file could not be opened, EFAULT if
 *	the file has an incompatible version or is truncated/corrupt
 *	(always 0 when built without HAVE_FRONT_END)
 * NOTE: READ lock_slurmctld config before entry
 */
extern int load_all_front_end_state(bool state_only)
{
#ifdef HAVE_FRONT_END
	char *node_name = NULL, *reason = NULL, *data = NULL, *state_file;
	int data_allocated, data_read = 0, error_code = 0, node_cnt = 0;
	uint16_t node_state;
	uint32_t data_size = 0, name_len;
	uint32_t reason_uid = NO_VAL;
	time_t reason_time = 0;
	front_end_record_t *front_end_ptr;
	int state_fd;
	time_t time_stamp;
	Buf buffer;
	char *ver_str = NULL;
	uint16_t protocol_version = (uint16_t) NO_VAL;

	/* read the file */
	lock_state_files ();
	state_fd = _open_front_end_state_file(&state_file);
	if (state_fd < 0) {
		/* Nothing to recover. Return now rather than parsing an
		 * empty buffer, which would log a bogus "incomplete
		 * checkpoint file" error and turn ENOENT into EFAULT. */
		info ("No node state file (%s) to recover", state_file);
		xfree (state_file);
		unlock_state_files ();
		return ENOENT;
	}

	/* Keep BUF_SIZE bytes of free space past the data read so far */
	data_allocated = BUF_SIZE;
	data = xmalloc(data_allocated);
	while (1) {
		data_read = read(state_fd, &data[data_size], BUF_SIZE);
		if (data_read < 0) {
			if (errno == EINTR)
				continue;
			/* Parse whatever was read before the error */
			error ("Read error on %s: %m", state_file);
			break;
		} else if (data_read == 0)     /* eof */
			break;
		data_size      += data_read;
		data_allocated += data_read;
		xrealloc(data, data_allocated);
	}
	close (state_fd);
	xfree (state_file);
	unlock_state_files ();

	/* data is released only through free_buf(buffer) from here on */
	buffer = create_buf (data, data_size);

	/* The safe_unpack* macros are expected to jump to unpack_error
	 * when the buffer runs short */
	safe_unpackstr_xmalloc( &ver_str, &name_len, buffer);
	debug3("Version string in front_end_state header is %s", ver_str);
	if (ver_str) {
		if (!strcmp(ver_str, FRONT_END_STATE_VERSION)) {
			protocol_version = SLURM_PROTOCOL_VERSION;
		}
	}

	if (protocol_version == (uint16_t) NO_VAL) {
		error("*****************************************************");
		error("Can not recover front_end state, version incompatible");
		error("*****************************************************");
		xfree(ver_str);
		free_buf(buffer);
		return EFAULT;
	}
	xfree(ver_str);

	/* The save time is unpacked to advance the buffer; it is not used */
	safe_unpack_time(&time_stamp, buffer);

	while (remaining_buf (buffer) > 0) {
		uint16_t base_state;
		if (protocol_version >= SLURM_2_5_PROTOCOL_VERSION) {
			safe_unpackstr_xmalloc (&node_name, &name_len, buffer);
			safe_unpack16 (&node_state,  buffer);
			safe_unpackstr_xmalloc (&reason,    &name_len, buffer);
			safe_unpack_time (&reason_time, buffer);
			safe_unpack32 (&reason_uid,  buffer);
			base_state = node_state & NODE_STATE_BASE;
		} else
			goto unpack_error;

		/* validity test as possible */

		/* find record and perform update */
		front_end_ptr = find_front_end_record(node_name);
		if (front_end_ptr == NULL) {
			error("Front_end node %s has vanished from "
			      "configuration", node_name);
		} else if (state_only) {
			uint16_t orig_flags;
			orig_flags = front_end_ptr->node_state &
				     NODE_STATE_FLAGS;
			node_cnt++;
			/* Only a node still in UNKNOWN state takes over the
			 * saved DOWN / DRAIN / FAIL information */
			if (IS_NODE_UNKNOWN(front_end_ptr)) {
				if (base_state == NODE_STATE_DOWN) {
					orig_flags &= (~NODE_STATE_COMPLETING);
					front_end_ptr->node_state =
						NODE_STATE_DOWN | orig_flags;
				}
				if (node_state & NODE_STATE_DRAIN) {
					 front_end_ptr->node_state |=
						 NODE_STATE_DRAIN;
				}
				if (node_state & NODE_STATE_FAIL) {
					front_end_ptr->node_state |=
						NODE_STATE_FAIL;
				}
			}
			/* Keep a reason that is already set */
			if (front_end_ptr->reason == NULL) {
				front_end_ptr->reason = reason;
				reason = NULL;	/* Nothing to free */
				front_end_ptr->reason_time = reason_time;
				front_end_ptr->reason_uid = reason_uid;
			}
		} else {
			/* Full recovery: saved state replaces current state */
			node_cnt++;
			front_end_ptr->node_state = node_state;
			xfree(front_end_ptr->reason);
			front_end_ptr->reason	= reason;
			reason			= NULL;	/* Nothing to free */
			front_end_ptr->reason_time	= reason_time;
			front_end_ptr->reason_uid	= reason_uid;
			front_end_ptr->last_response	= (time_t) 0;
		}

		xfree(node_name);
		xfree(reason);
	}

fini:	info("Recovered state of %d front_end nodes", node_cnt);
	free_buf (buffer);
	return error_code;

unpack_error:
	error("Incomplete front_end node data checkpoint file");
	error_code = EFAULT;
	xfree (node_name);
	xfree(reason);
	goto fini;
#else
	return 0;
#endif
}