示例#1
0
文件: front_end.c 项目: A1ve5/slurm
/*
 * assign_front_end - pick the front end node on which a job will be started
 * job_ptr IN - job needing a front end node; every candidate is tested
 *	against the front end access controls for this job
 * RET the selected front end record (state set to ALLOCATED and its running
 *	job count incremented), or NULL if no node qualifies or the build has
 *	no front end support
 */
extern front_end_record_t *assign_front_end(struct job_record *job_ptr)
{
#ifdef HAVE_FRONT_END
	front_end_record_t *candidate, *chosen = NULL;
	uint32_t kept_flags;
	int inx;

	if (!job_ptr->batch_host && (job_ptr->batch_flag == 0) &&
	    (candidate = find_front_end_record(job_ptr->alloc_node))) {
		/* Interactive job: the submit host is the only candidate */
		if (IS_NODE_DOWN(candidate) ||
		    IS_NODE_DRAIN(candidate) ||
		    IS_NODE_NO_RESPOND(candidate) ||
		    !_front_end_access(candidate, job_ptr)) {
			info("%s: front-end node %s not available for job %u",
			     __func__, job_ptr->alloc_node, job_ptr->job_id);
			return NULL;
		}
		chosen = candidate;
	} else {
		for (inx = 0; inx < front_end_node_cnt; inx++) {
			candidate = front_end_nodes + inx;
			if (job_ptr->batch_host) {
				/* Only the named front end is acceptable */
				if (xstrcmp(job_ptr->batch_host,
					    candidate->name) != 0)
					continue;
				if (!_front_end_access(candidate, job_ptr))
					break;
			} else if (IS_NODE_DOWN(candidate) ||
				   IS_NODE_DRAIN(candidate) ||
				   IS_NODE_NO_RESPOND(candidate) ||
				   !_front_end_access(candidate, job_ptr)) {
				/* Any usable node will do; this one is not */
				continue;
			}
			/* Prefer the node running the fewest jobs */
			if (!chosen ||
			    (candidate->job_cnt_run < chosen->job_cnt_run))
				chosen = candidate;
		}
	}

	if (!chosen) {
		if (job_ptr->batch_host) {
			error("assign_front_end: front end node %s not found",
			      job_ptr->batch_host);
		} else {
			error("assign_front_end: no available front end nodes found");
		}
		return NULL;
	}

	/* Replace the base state with ALLOCATED, keeping the flag bits */
	kept_flags = chosen->node_state & NODE_STATE_FLAGS;
	chosen->node_state = NODE_STATE_ALLOCATED | kept_flags;
	chosen->job_cnt_run++;
	return chosen;
#else
	return NULL;
#endif
}
示例#2
0
文件: ping_nodes.c 项目: A1ve5/slurm
/* Update acct_gather data for every node that is not DOWN */
extern void update_nodes_acct_gather_data(void)
{
#ifdef HAVE_FRONT_END
	front_end_record_t *front_end_ptr;
#else
	struct node_record *node_ptr;
#endif
	int i;
	char *host_str = NULL;
	agent_arg_t *agent_args = NULL;

	agent_args = xmalloc (sizeof (agent_arg_t));
	agent_args->msg_type = REQUEST_ACCT_GATHER_UPDATE;
	agent_args->retry = 0;
	agent_args->protocol_version = SLURM_PROTOCOL_VERSION;
	agent_args->hostlist = hostlist_create(NULL);

#ifdef HAVE_FRONT_END
	for (i = 0, front_end_ptr = front_end_nodes;
	     i < front_end_node_cnt; i++, front_end_ptr++) {
		if (IS_NODE_NO_RESPOND(front_end_ptr))
			continue;
		if (agent_args->protocol_version >
		    front_end_ptr->protocol_version)
			agent_args->protocol_version =
				front_end_ptr->protocol_version;

		hostlist_push_host(agent_args->hostlist, front_end_ptr->name);
		agent_args->node_count++;
	}
#else
	for (i = 0, node_ptr = node_record_table_ptr;
	     i < node_record_count; i++, node_ptr++) {
		if (IS_NODE_NO_RESPOND(node_ptr) || IS_NODE_FUTURE(node_ptr) ||
		    IS_NODE_POWER_SAVE(node_ptr))
			continue;
		if (agent_args->protocol_version > node_ptr->protocol_version)
			agent_args->protocol_version =
				node_ptr->protocol_version;
		hostlist_push_host(agent_args->hostlist, node_ptr->name);
		agent_args->node_count++;
	}
#endif

	if (agent_args->node_count == 0) {
		hostlist_destroy(agent_args->hostlist);
		xfree (agent_args);
	} else {
		hostlist_uniq(agent_args->hostlist);
		host_str = hostlist_ranged_string_xmalloc(agent_args->hostlist);
		if (slurmctld_conf.debug_flags & DEBUG_FLAG_ENERGY)
			info("Updating acct_gather data for %s", host_str);
		xfree(host_str);
		ping_begin();
		agent_queue_request(agent_args);
	}
}
示例#3
0
/*
 * Queue a REQUEST_HEALTH_CHECK agent request covering every eligible node.
 * With front end support, front end nodes flagged NO_RESPOND are skipped;
 * otherwise compute nodes flagged NO_RESPOND, FUTURE or POWER_SAVE are
 * skipped. Nothing is queued if no node qualifies.
 */
extern void run_health_check(void)
{
#ifdef HAVE_FRONT_END
	front_end_record_t *fe_ptr;
#else
	struct node_record *nd_ptr;
#endif
	agent_arg_t *hc_args;
	char *names = NULL;
	int inx;

	hc_args = xmalloc(sizeof(agent_arg_t));
	hc_args->msg_type = REQUEST_HEALTH_CHECK;
	hc_args->retry = 0;
	hc_args->hostlist = hostlist_create("");
	if (!hc_args->hostlist)
		fatal("hostlist_create: malloc failure");

#ifdef HAVE_FRONT_END
	for (inx = 0; inx < front_end_node_cnt; inx++) {
		fe_ptr = front_end_nodes + inx;
		if (IS_NODE_NO_RESPOND(fe_ptr))
			continue;
		hostlist_push(hc_args->hostlist, fe_ptr->name);
		hc_args->node_count++;
	}
#else
	for (inx = 0; inx < node_record_count; inx++) {
		nd_ptr = node_record_table_ptr + inx;
		if (IS_NODE_NO_RESPOND(nd_ptr) || IS_NODE_FUTURE(nd_ptr) ||
		    IS_NODE_POWER_SAVE(nd_ptr))
			continue;
		hostlist_push(hc_args->hostlist, nd_ptr->name);
		hc_args->node_count++;
	}
#endif

	if (hc_args->node_count == 0) {
		/* No targets: discard the unused request */
		hostlist_destroy(hc_args->hostlist);
		xfree(hc_args);
		return;
	}

	hostlist_uniq(hc_args->hostlist);
	names = hostlist_ranged_string_xmalloc(hc_args->hostlist);
	debug("Spawning health check agent for %s", names);
	xfree(names);
	ping_begin();
	agent_queue_request(hc_args);
}
示例#4
0
文件: power_save.c 项目: edsw/slurm
/* If slurmctld crashes, the node state that it recovers could differ
 * from the actual hardware state (e.g. ResumeProgram failed to complete).
 * To address that, when a node that should be powered up for a running
 * job is not responding, they try running ResumeProgram again. */
static void _re_wake(void)
{
	struct node_record *node_ptr;
	bitstr_t *wake_node_bitmap = NULL;
	int i;

	node_ptr = node_record_table_ptr;
	for (i=0; i<node_record_count; i++, node_ptr++) {
		if (IS_NODE_ALLOCATED(node_ptr)   &&
		    IS_NODE_NO_RESPOND(node_ptr)  &&
		    !IS_NODE_POWER_SAVE(node_ptr) &&
		    (bit_test(suspend_node_bitmap, i) == 0) &&
		    (bit_test(resume_node_bitmap,  i) == 0)) {
			if (wake_node_bitmap == NULL) {
				wake_node_bitmap =
					bit_alloc(node_record_count);
			}
			bit_set(wake_node_bitmap, i);
		}
	}

	if (wake_node_bitmap) {
		char *nodes;
		nodes = bitmap2node_name(wake_node_bitmap);
		if (nodes) {
			pid_t pid = _run_prog(resume_prog, nodes, NULL);
			info("power_save: pid %d rewaking nodes %s",
			     (int) pid, nodes);
		} else
			error("power_save: bitmap2nodename");
		xfree(nodes);
		FREE_NULL_BITMAP(wake_node_bitmap);
	}
}
示例#5
0
文件: front_end.c 项目: lipari/slurm
/*
 * assign_front_end - pick a front end node for starting a job, cycling
 *	through the front end table from the entry after the last one used
 * RET the selected front end record (state set to ALLOCATED and its running
 *	job count incremented); calls fatal() if every front end node is
 *	DOWN, DRAIN or NO_RESPOND; NULL if built without front end support
 */
extern front_end_record_t *assign_front_end(void)
{
#ifdef HAVE_FRONT_END
	static int last_assigned = -1;	/* table index of previous pick */
	front_end_record_t *candidate;
	uint16_t kept_flags;
	int tries;

	for (tries = 0; tries < front_end_node_cnt; tries++) {
		last_assigned = (last_assigned + 1) % front_end_node_cnt;
		candidate = front_end_nodes + last_assigned;
		if (!IS_NODE_DOWN(candidate) &&
		    !IS_NODE_DRAIN(candidate) &&
		    !IS_NODE_NO_RESPOND(candidate)) {
			/* Set base state to ALLOCATED, keep the flag bits */
			kept_flags = candidate->node_state & NODE_STATE_FLAGS;
			candidate->node_state =
				NODE_STATE_ALLOCATED | kept_flags;
			candidate->job_cnt_run++;
			return candidate;
		}
	}
	fatal("assign_front_end: no available front end nodes found");
#endif
	return NULL;
}
示例#6
0
文件: front_end.c 项目: lipari/slurm
/*
 * avail_front_end - report whether a job could be started on some front end
 * RET true if at least one front end node is neither DOWN, DRAIN nor
 *	NO_RESPOND; always true if built without front end support
 */
extern bool avail_front_end(void)
{
#ifdef HAVE_FRONT_END
	front_end_record_t *fe_ptr;
	int inx;

	for (inx = 0; inx < front_end_node_cnt; inx++) {
		fe_ptr = front_end_nodes + inx;
		if (!IS_NODE_DOWN(fe_ptr)  &&
		    !IS_NODE_DRAIN(fe_ptr) &&
		    !IS_NODE_NO_RESPOND(fe_ptr))
			return true;
	}
	return false;
#else
	return true;
#endif
}
示例#7
0
/*
 * assign_front_end - select the least loaded front end node for a job
 * job_ptr IN - job needing a front end node; every candidate is tested
 *	against the front end access controls for this job
 * RET the selected front end record (state set to ALLOCATED and its running
 *	job count incremented), or NULL if no node qualifies or the build has
 *	no front end support
 */
extern front_end_record_t *assign_front_end(struct job_record *job_ptr)
{
#ifdef HAVE_FRONT_END
	front_end_record_t *fe_ptr, *least_loaded = NULL;
	uint32_t kept_flags;
	int inx;

	for (inx = 0; inx < front_end_node_cnt; inx++) {
		fe_ptr = front_end_nodes + inx;
		if (job_ptr->batch_host) {
			/* The job is tied to one named front end node */
			if (strcmp(job_ptr->batch_host, fe_ptr->name) != 0)
				continue;
			if (!_front_end_access(fe_ptr, job_ptr))
				break;
		} else if (IS_NODE_DOWN(fe_ptr) ||
			   IS_NODE_DRAIN(fe_ptr) ||
			   IS_NODE_NO_RESPOND(fe_ptr) ||
			   !_front_end_access(fe_ptr, job_ptr)) {
			/* Any usable node will do, but this one is not */
			continue;
		}
		if (!least_loaded ||
		    (fe_ptr->job_cnt_run < least_loaded->job_cnt_run))
			least_loaded = fe_ptr;
	}

	if (!least_loaded) {
		if (job_ptr->batch_host) {
			error("assign_front_end: front end node %s not found",
			      job_ptr->batch_host);
		} else {
			error("assign_front_end: no available front end nodes found");
		}
		return NULL;
	}

	/* Replace the base state with ALLOCATED, keeping the flag bits */
	kept_flags = least_loaded->node_state & NODE_STATE_FLAGS;
	least_loaded->node_state = NODE_STATE_ALLOCATED | kept_flags;
	least_loaded->job_cnt_run++;
	return least_loaded;
#else
	return NULL;
#endif
}
示例#8
0
文件: front_end.c 项目: Cray/slurm
/*
 * assign_front_end - select a front end node for a job, cycling through the
 *	front end table from the entry after the last one used
 * job_ptr IN - job needing a front end node; every candidate is tested
 *	against the front end access controls for this job
 * RET the selected front end record (state set to ALLOCATED and its running
 *	job count incremented), or NULL if no node qualifies or the build has
 *	no front end support
 */
extern front_end_record_t *assign_front_end(struct job_record *job_ptr)
{
#ifdef HAVE_FRONT_END
	static int last_assigned = -1;	/* table index of previous pick */
	front_end_record_t *candidate;
	uint16_t kept_flags;
	int tries;

	for (tries = 0; tries < front_end_node_cnt; tries++) {
		last_assigned = (last_assigned + 1) % front_end_node_cnt;
		candidate = front_end_nodes + last_assigned;
		if (job_ptr->batch_host) {
			/* The job is tied to one named front end node */
			if (strcmp(job_ptr->batch_host, candidate->name) != 0)
				continue;
			if (!_front_end_access(candidate, job_ptr))
				break;
		} else if (IS_NODE_DOWN(candidate) ||
			   IS_NODE_DRAIN(candidate) ||
			   IS_NODE_NO_RESPOND(candidate) ||
			   !_front_end_access(candidate, job_ptr)) {
			/* Any usable node will do, but this one is not */
			continue;
		}
		/* Replace the base state with ALLOCATED, keep the flag bits */
		kept_flags = candidate->node_state & NODE_STATE_FLAGS;
		candidate->node_state = NODE_STATE_ALLOCATED | kept_flags;
		candidate->job_cnt_run++;
		return candidate;
	}

	if (job_ptr->batch_host) {
		error("assign_front_end: front end node %s not found",
		      job_ptr->batch_host);
	} else {
		error("assign_front_end: no available front end nodes found");
	}
#endif
	return NULL;
}
示例#9
0
/**
 * basil_inventory - Periodic node-state query via ALPS XML-RPC.
 * This should be run immediately before each scheduling cycle.
 *
 * The query is rate limited: if fewer than inv_interval seconds have passed
 * since the previous run, the function returns SLURM_SUCCESS without
 * contacting ALPS.
 *
 * For every non-interactive node in the inventory the SLURM node record is
 * brought in line with the ALPS state (NO_RESPOND, DOWN or back UP), and
 * every ALPS reservation without a matching SLURM job is released.
 *
 * Returns
 * - SLURM_SUCCESS if rate limited, or if nothing below applies
 * - SLURM_ERROR if the INVENTORY method failed
 * - ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE if the inventory has no nodes
 *   or no batch nodes, or if SLURM and ALPS state disagree and
 *   cray_conf->sync_timeout has not yet expired
 */
extern int basil_inventory(void)
{
	enum basil_version version = get_basil_version();
	struct basil_inventory *inv;
	struct basil_node *node;
	struct basil_rsvn *rsvn;
	int slurm_alps_mismatch = 0;
	int rc = SLURM_SUCCESS;
	int rel_rc;
	time_t now = time(NULL);
	/* Time at which the current SLURM/ALPS mismatch was first seen,
	 * zero while the two are in sync */
	static time_t slurm_alps_mismatch_time = (time_t) 0;
	/* Ensures the sync timeout error is logged once per mismatch period */
	static bool logged_sync_timeout = false;
	/* Time of the last inventory actually performed (rate limiting) */
	static time_t last_inv_run = 0;

	if ((now - last_inv_run) < inv_interval)
		return SLURM_SUCCESS;

	last_inv_run = now;

	inv = get_full_inventory(version);
	if (inv == NULL) {
		error("BASIL %s INVENTORY failed", bv_names_long[version]);
		return SLURM_ERROR;
	}

	debug("BASIL %s INVENTORY: %d/%d batch nodes available",
	      bv_names_long[version], inv->batch_avail, inv->batch_total);

	/* Avoid checking for inv->batch_avail here since if we are
	   gang scheduling returning an error for a full system is
	   probably the wrong thing to do. (the schedule() function
	   in the slurmctld will never run ;)).
	*/
	if (!inv->f->node_head || !inv->batch_total)
		rc = ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE;

	/* Pass 1: reconcile each SLURM node record with its ALPS state */
	for (node = inv->f->node_head; node; node = node->next) {
		int node_inx;
		struct node_record *node_ptr;
		char *reason = NULL;

		/* This will ignore interactive nodes when iterating through
		 * the apbasil inventory.  If we don't do this, SLURM is
		 * unable to resolve the ID to a nidXXX name since it's not in
		 * the slurm.conf file.  (Chris North)
		 */
		if (node->role == BNR_INTER)
			continue;

		node_ptr = _find_node_by_basil_id(node->node_id);
		if (node_ptr == NULL) {
			error("nid%05u (%s node in state %s) not in slurm.conf",
			      node->node_id, nam_noderole[node->role],
			      nam_nodestate[node->state]);
			continue;
		}
		/* Index of this node in the node table, used for bitmaps */
		node_inx = node_ptr - node_record_table_ptr;

		if (node_is_allocated(node) && !IS_NODE_ALLOCATED(node_ptr)) {
			/*
			 * ALPS still hangs on to the node while SLURM considers
			 * it already unallocated. Possible causes are partition
			 * cleanup taking too long (can be 10sec ... minutes),
			 * and orphaned ALPS reservations (caught below).
			 *
			 * The converse case (SLURM hanging on to the node while
			 * ALPS has already freed it) happens frequently during
			 * job completion: select_g_job_fini() is called before
			 * make_node_comp(). Rely on SLURM logic for this case.
			 */
			slurm_alps_mismatch++;
		}

		/* A non-NULL reason means ALPS considers the node unusable
		 * for batch work; the first matching test supplies the text */
		if (node->state == BNS_DOWN) {
			reason = "ALPS marked it DOWN";
		} else if (node->state == BNS_UNAVAIL) {
			reason = "node is UNAVAILABLE";
		} else if (node->state == BNS_ROUTE) {
			reason = "node does ROUTING";
		} else if (node->state == BNS_SUSPECT) {
			reason = "entered SUSPECT mode";
		} else if (node->state == BNS_ADMINDOWN) {
			reason = "node is ADMINDOWN";
		} else if (node->state != BNS_UP) {
			reason = "state not UP";
		} else if (node->role != BNR_BATCH) {
			reason = "mode not BATCH";
		} else if (node->arch != BNA_XT) {
			reason = "arch not XT/XE";
		}

		/* Base state entirely derives from ALPS */
		if (reason) {
			/* Remember when the node was first seen unusable */
			if (node_ptr->down_time == 0)
				node_ptr->down_time = now;
			if (IS_NODE_DOWN(node_ptr)) {
				/* node still down */
			} else if ((slurmctld_conf.slurmd_timeout == 0) ||
				   ((now - node_ptr->down_time) <
				    slurmctld_conf.slurmd_timeout)) {
				/* Timeout disabled or not yet reached: only
				 * flag NO_RESPOND and make it unavailable */
				node_ptr->node_state |= NODE_STATE_NO_RESPOND;
				bit_clear(avail_node_bitmap, node_inx);
			} else {
				xfree(node_ptr->reason);
				info("MARKING %s DOWN (%s)",
				     node_ptr->name, reason);
				/* set_node_down also kills any running jobs */
				set_node_down_ptr(node_ptr, reason);
			}
		} else if (IS_NODE_DOWN(node_ptr)) {
			/* ALPS reports the node usable but SLURM has it DOWN */
			xfree(node_ptr->reason);
			node_ptr->down_time = 0;
			info("MARKING %s UP", node_ptr->name);

			/* Reset state, make_node_idle figures out the rest */
			node_ptr->node_state &= NODE_STATE_FLAGS;
			node_ptr->node_state &= (~NODE_STATE_NO_RESPOND);
			node_ptr->node_state |= NODE_STATE_UNKNOWN;

			make_node_idle(node_ptr, NULL);
			if (!IS_NODE_DRAIN(node_ptr) &&
			    !IS_NODE_FAIL(node_ptr)) {
				xfree(node_ptr->reason);
				node_ptr->reason_time = 0;
				node_ptr->reason_uid = NO_VAL;
				clusteracct_storage_g_node_up(
					acct_db_conn, node_ptr, now);
			}
		} else if (IS_NODE_NO_RESPOND(node_ptr)) {
			/* Node is usable again: clear NO_RESPOND and make it
			 * available unless it is DRAIN or FAIL */
			node_ptr->node_state &= (~NODE_STATE_NO_RESPOND);
			if (!IS_NODE_DRAIN(node_ptr) &&
			    !IS_NODE_FAIL(node_ptr)) {
				bit_set(avail_node_bitmap, node_inx);
			}
		}
	}

	if (slurm_alps_mismatch)
		debug("ALPS: %d node(s) still held", slurm_alps_mismatch);

	/*
	 * Check that each ALPS reservation corresponds to a SLURM job.
	 * Purge orphaned reservations, which may result from stale or
	 * messed up system state, or are indicative of ALPS problems
	 * (stuck in pending cancel calls).
	 */
	for (rsvn = inv->f->rsvn_head; rsvn; rsvn = rsvn->next) {
		ListIterator job_iter = list_iterator_create(job_list);
		struct job_record *job_ptr;
		uint32_t resv_id;

		/* job_ptr is left NULL if no job holds this reservation ID */
		while ((job_ptr = (struct job_record *)list_next(job_iter))) {
			if (_get_select_jobinfo(job_ptr->select_jobinfo->data,
						SELECT_JOBDATA_RESV_ID,
						&resv_id) == SLURM_SUCCESS
			    && resv_id == rsvn->rsvn_id)
				break;
		}
		list_iterator_destroy(job_iter);

		/*
		 * Changed to ignore reservations for "UNKNOWN" batch
		 * ids (e.g. the interactive region) (Chris North)
		 */

		if ((job_ptr == NULL) && (xstrcmp(rsvn->batch_id, "UNKNOWN"))) {
			error("orphaned ALPS reservation %u, trying to remove",
			      rsvn->rsvn_id);
			rel_rc = basil_safe_release(rsvn->rsvn_id, inv);
			if (rel_rc) {
				error("ALPS reservation %u removal FAILED: %s",
				      rsvn->rsvn_id, basil_strerror(rel_rc));
			} else {
				debug("ALPS reservation %u removed",
				      rsvn->rsvn_id);
			}
			/* An orphan counts as a mismatch whether or not the
			 * release succeeded */
			slurm_alps_mismatch = true;
		}
	}
	free_inv(inv);

	if (slurm_alps_mismatch) {
		/* If SLURM and ALPS state are not in synchronization,
		 * do not schedule any more jobs until waiting at least
		 * SyncTimeout seconds. */
		if (slurm_alps_mismatch_time == 0) {
			/* First detection: start the clock, return rc below */
			slurm_alps_mismatch_time = now;
		} else if (cray_conf->sync_timeout == 0) {
			/* Wait indefinitely */
			/* NOTE(review): this branch is empty, so rc is
			 * returned unchanged below rather than an error;
			 * confirm this matches the intended "wait" */
		} else if (difftime(now, slurm_alps_mismatch_time) <
			   cray_conf->sync_timeout) {
			return ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE;
		} else if (!logged_sync_timeout) {
			error("Could not synchronize SLURM with ALPS for %u "
			      "seconds, proceeding with job scheduling",
			      cray_conf->sync_timeout);
			logged_sync_timeout = true;
		}
	} else {
		/* Back in sync: reset the mismatch tracking state */
		slurm_alps_mismatch_time = 0;
		logged_sync_timeout = false;
	}
	return rc;
}
示例#10
0
/*
 * ping_nodes - check that all nodes and daemons are alive,
 *	get nodes in UNKNOWN state to register
 *
 * Builds two agent requests, a REQUEST_PING and a
 * REQUEST_NODE_REGISTRATION_STATUS, and queues each one that ends up with at
 * least one target. Nodes whose last response is older than the computed
 * dead time are set DOWN with reason "Not responding" and reported in a
 * single error message.
 */
void ping_nodes (void)
{
	static bool restart_flag = true;	/* system just restarted */
	static int offset = 0;	/* mutex via node table write lock on entry */
	static int max_reg_threads = 0;	/* max node registration threads
					 * this can include DOWN nodes, so
					 * limit the number to avoid huge
					 * communication delays */
	int i;
	time_t now, still_live_time, node_dead_time;
	static time_t last_ping_time = (time_t) 0;
	hostlist_t down_hostlist = NULL;
	char *host_str = NULL;
	agent_arg_t *ping_agent_args = NULL;
	agent_arg_t *reg_agent_args = NULL;
#ifdef HAVE_FRONT_END
	front_end_record_t *front_end_ptr = NULL;
#else
	struct node_record *node_ptr = NULL;
#endif

	now = time (NULL);

	ping_agent_args = xmalloc (sizeof (agent_arg_t));
	ping_agent_args->msg_type = REQUEST_PING;
	ping_agent_args->retry = 0;
	ping_agent_args->hostlist = hostlist_create("");

	reg_agent_args = xmalloc (sizeof (agent_arg_t));
	reg_agent_args->msg_type = REQUEST_NODE_REGISTRATION_STATUS;
	reg_agent_args->retry = 0;
	reg_agent_args->hostlist = hostlist_create("");

	/*
	 * If there are a large number of down nodes, the node ping
	 * can take a long time to complete:
	 *  ping_time = down_nodes * agent_timeout / agent_parallelism
	 *  ping_time = down_nodes * 10_seconds / 10
	 *  ping_time = down_nodes (seconds)
	 * Because of this, we extend the SlurmdTimeout by the
	 * time needed to complete a ping of all nodes.
	 */
	/* A dead time of zero (timeout disabled or first call) can never
	 * match a node, since nodes with last_response == 0 are excluded
	 * from the DOWN test below */
	if ((slurmctld_conf.slurmd_timeout == 0) ||
	    (last_ping_time == (time_t) 0)) {
		node_dead_time = (time_t) 0;
	} else {
		node_dead_time = last_ping_time -
				 slurmctld_conf.slurmd_timeout;
	}
	/* Nodes heard from within a third of SlurmdTimeout are not pinged */
	still_live_time = now - (slurmctld_conf.slurmd_timeout / 3);
	last_ping_time  = now;

	if (max_reg_threads == 0) {
		max_reg_threads = MAX(slurm_get_tree_width(), 1);
	}
	/* Advance the window of table indexes [offset, offset +
	 * max_reg_threads) that get a periodic registration request */
	/* NOTE(review): the wrap test uses node_record_count even in the
	 * HAVE_FRONT_END build, which iterates front_end_node_cnt entries;
	 * confirm this is intended */
	offset += max_reg_threads;
	if ((offset > node_record_count) &&
	    (offset >= (max_reg_threads * MAX_REG_FREQUENCY)))
		offset = 0;

#ifdef HAVE_FRONT_END
	for (i = 0, front_end_ptr = front_end_nodes;
	     i < front_end_node_cnt; i++, front_end_ptr++) {
		/* With the timeout disabled, only contact nodes that are
		 * UNKNOWN or not responding (or all nodes after a restart) */
		if ((slurmctld_conf.slurmd_timeout == 0)	&&
		    (!restart_flag)				&&
		    (!IS_NODE_UNKNOWN(front_end_ptr))		&&
		    (!IS_NODE_NO_RESPOND(front_end_ptr)))
			continue;

		/* Silent for too long and not already DOWN: set it DOWN */
		if ((front_end_ptr->last_response != (time_t) 0)     &&
		    (front_end_ptr->last_response <= node_dead_time) &&
		    (!IS_NODE_DOWN(front_end_ptr))) {
			if (down_hostlist)
				(void) hostlist_push_host(down_hostlist,
					front_end_ptr->name);
			else {
				down_hostlist =
					hostlist_create(front_end_ptr->name);
				if (down_hostlist == NULL)
					fatal("hostlist_create: malloc error");
			}
			set_front_end_down(front_end_ptr, "Not responding");
			front_end_ptr->not_responding = false;
			continue;
		}

		if (restart_flag) {
			front_end_ptr->last_response =
				slurmctld_conf.last_update;
		}

		/* Request a node registration if its state is UNKNOWN or
		 * on a periodic basis (about every MAX_REG_FREQUENCY ping,
		 * this mechanism avoids an additional (per node) timer or
		 * counter and gets updated configuration information
		 * once in a while). We limit these requests since they
		 * can generate a flood of incoming RPCs. */
		if (IS_NODE_UNKNOWN(front_end_ptr) || restart_flag ||
		    ((i >= offset) && (i < (offset + max_reg_threads)))) {
			hostlist_push(reg_agent_args->hostlist,
				      front_end_ptr->name);
			reg_agent_args->node_count++;
			continue;
		}

		/* Responding and heard from recently: no ping needed */
		if ((!IS_NODE_NO_RESPOND(front_end_ptr)) &&
		    (front_end_ptr->last_response >= still_live_time))
			continue;

		/* Do not keep pinging down nodes since this can induce
		 * huge delays in hierarchical communication fail-over */
		if (IS_NODE_NO_RESPOND(front_end_ptr) &&
		    IS_NODE_DOWN(front_end_ptr))
			continue;

		hostlist_push(ping_agent_args->hostlist, front_end_ptr->name);
		ping_agent_args->node_count++;
	}
#else
	for (i=0, node_ptr=node_record_table_ptr;
	     i<node_record_count; i++, node_ptr++) {
		if (IS_NODE_FUTURE(node_ptr) || IS_NODE_POWER_SAVE(node_ptr))
			continue;
		/* With the timeout disabled, only contact nodes that are
		 * UNKNOWN or not responding (or all nodes after a restart) */
		if ((slurmctld_conf.slurmd_timeout == 0) &&
		    (!restart_flag)			 &&
		    (!IS_NODE_UNKNOWN(node_ptr))         &&
		    (!IS_NODE_NO_RESPOND(node_ptr)))
			continue;

		/* Silent for too long and not already DOWN: set it DOWN */
		if ((node_ptr->last_response != (time_t) 0)     &&
		    (node_ptr->last_response <= node_dead_time) &&
		    (!IS_NODE_DOWN(node_ptr))) {
			if (down_hostlist)
				(void) hostlist_push_host(down_hostlist,
					node_ptr->name);
			else {
				down_hostlist =
					hostlist_create(node_ptr->name);
				if (down_hostlist == NULL)
					fatal("hostlist_create: malloc error");
			}
			set_node_down_ptr(node_ptr, "Not responding");
			node_ptr->not_responding = false;  /* logged below */
			continue;
		}

		if (restart_flag)
			node_ptr->last_response = slurmctld_conf.last_update;

		/* Request a node registration if its state is UNKNOWN or
		 * on a periodic basis (about every MAX_REG_FREQUENCY ping,
		 * this mechanism avoids an additional (per node) timer or
		 * counter and gets updated configuration information
		 * once in a while). We limit these requests since they
		 * can generate a flood of incoming RPCs. */
		if (IS_NODE_UNKNOWN(node_ptr) || restart_flag ||
		    ((i >= offset) && (i < (offset + max_reg_threads)))) {
			hostlist_push(reg_agent_args->hostlist,
				      node_ptr->name);
			reg_agent_args->node_count++;
			continue;
		}

		/* Responding and heard from recently: no ping needed */
		if ((!IS_NODE_NO_RESPOND(node_ptr)) &&
		    (node_ptr->last_response >= still_live_time))
			continue;

		/* Do not keep pinging down nodes since this can induce
		 * huge delays in hierarchical communication fail-over */
		if (IS_NODE_NO_RESPOND(node_ptr) && IS_NODE_DOWN(node_ptr))
			continue;

		hostlist_push(ping_agent_args->hostlist, node_ptr->name);
		ping_agent_args->node_count++;
	}
#endif

	/* Restart handling applies to the first call only */
	restart_flag = false;
	/* Queue the ping request, or discard it if it has no targets */
	if (ping_agent_args->node_count == 0) {
		hostlist_destroy(ping_agent_args->hostlist);
		xfree (ping_agent_args);
	} else {
		hostlist_uniq(ping_agent_args->hostlist);
		host_str = hostlist_ranged_string_xmalloc(
				ping_agent_args->hostlist);
		debug("Spawning ping agent for %s", host_str);
		xfree(host_str);
		ping_begin();
		agent_queue_request(ping_agent_args);
	}

	/* Queue the registration request, or discard it if it has no
	 * targets */
	if (reg_agent_args->node_count == 0) {
		hostlist_destroy(reg_agent_args->hostlist);
		xfree (reg_agent_args);
	} else {
		hostlist_uniq(reg_agent_args->hostlist);
		host_str = hostlist_ranged_string_xmalloc(
				reg_agent_args->hostlist);
		debug("Spawning registration agent for %s %d hosts",
		      host_str, reg_agent_args->node_count);
		xfree(host_str);
		ping_begin();
		agent_queue_request(reg_agent_args);
	}

	/* Report all nodes set DOWN in this pass with a single message */
	if (down_hostlist) {
		hostlist_uniq(down_hostlist);
		host_str = hostlist_ranged_string_xmalloc(down_hostlist);
		error("Nodes %s not responding, setting DOWN", host_str);
		xfree(host_str);
		hostlist_destroy(down_hostlist);
	}
}
示例#11
0
/* Spawn health check function for every node that is not DOWN */
extern void run_health_check(void)
{
#ifdef HAVE_FRONT_END
	front_end_record_t *front_end_ptr;
#else
	struct node_record *node_ptr;
	int node_test_cnt = 0, node_limit, node_states, run_cyclic;
	static int base_node_loc = -1;
	static time_t cycle_start_time = (time_t) 0;
#endif
	int i;
	char *host_str = NULL;
	agent_arg_t *check_agent_args = NULL;

	/* Sync plugin internal data with
	 * node select_nodeinfo. This is important
	 * after reconfig otherwise select_nodeinfo
	 * will not return the correct number of
	 * allocated cpus.
	 */
	select_g_select_nodeinfo_set_all();

	check_agent_args = xmalloc (sizeof (agent_arg_t));
	check_agent_args->msg_type = REQUEST_HEALTH_CHECK;
	check_agent_args->retry = 0;
	check_agent_args->hostlist = hostlist_create(NULL);
#ifdef HAVE_FRONT_END
	for (i = 0, front_end_ptr = front_end_nodes;
	     i < front_end_node_cnt; i++, front_end_ptr++) {
		if (IS_NODE_NO_RESPOND(front_end_ptr))
			continue;
		hostlist_push_host(check_agent_args->hostlist,
				   front_end_ptr->name);
		check_agent_args->node_count++;
	}
#else
	run_cyclic = slurmctld_conf.health_check_node_state &
		     HEALTH_CHECK_CYCLE;
	node_states = slurmctld_conf.health_check_node_state &
		      (~HEALTH_CHECK_CYCLE);
	if (run_cyclic) {
		time_t now = time(NULL);
		if (cycle_start_time == (time_t) 0)
			cycle_start_time = now;
		else if (base_node_loc >= 0)
			;	/* mid-cycle */
		else if (difftime(now, cycle_start_time) <
			 slurmctld_conf.health_check_interval) {
			return;	/* Wait to start next cycle */
		}
		cycle_start_time = now;
		/* Determine how many nodes we want to test on each call of
		 * run_health_check() to spread out the work. */
		node_limit = (node_record_count * 2) /
			     slurmctld_conf.health_check_interval;
		node_limit = MAX(node_limit, 10);
	}
	if ((node_states != HEALTH_CHECK_NODE_ANY) &&
	    (node_states != HEALTH_CHECK_NODE_IDLE)) {
		/* Update each node's alloc_cpus count */
		select_g_select_nodeinfo_set_all();
	}

	for (i = 0; i < node_record_count; i++) {
		if (run_cyclic) {
			if (node_test_cnt++ >= node_limit)
				break;
			base_node_loc++;
			if (base_node_loc >= node_record_count) {
				base_node_loc = -1;
				break;
			}
			node_ptr = node_record_table_ptr + base_node_loc;
		} else {
			node_ptr = node_record_table_ptr + i;
		}
		if (IS_NODE_NO_RESPOND(node_ptr) || IS_NODE_FUTURE(node_ptr) ||
		    IS_NODE_POWER_SAVE(node_ptr))
			continue;
		if (node_states != HEALTH_CHECK_NODE_ANY) {
			uint16_t cpus_total, cpus_used = 0;
			if (slurmctld_conf.fast_schedule) {
				cpus_total = node_ptr->config_ptr->cpus;
			} else {
				cpus_total = node_ptr->cpus;
			}
			if (!IS_NODE_IDLE(node_ptr)) {
				select_g_select_nodeinfo_get(
						node_ptr->select_nodeinfo,
						SELECT_NODEDATA_SUBCNT,
						NODE_STATE_ALLOCATED,
						&cpus_used);
			}
			/* Here the node state is inferred from
			 * the cpus allocated on it.
			 * - cpus_used == 0
			 *       means node is idle
			 * - cpus_used < cpus_total
			 *       means the node is in mixed state
			 * else cpus_used == cpus_total
			 *       means the node is allocated
			 */
			if (cpus_used == 0) {
				if (!(node_states & HEALTH_CHECK_NODE_IDLE))
					continue;
				if (!IS_NODE_IDLE(node_ptr))
					continue;
			} else if (cpus_used < cpus_total) {
				if (!(node_states & HEALTH_CHECK_NODE_MIXED))
					continue;
			} else {
				if (!(node_states & HEALTH_CHECK_NODE_ALLOC))
					continue;
			}
		}
		hostlist_push_host(check_agent_args->hostlist, node_ptr->name);
		check_agent_args->node_count++;
	}
	if (run_cyclic && (i >= node_record_count))
		base_node_loc = -1;
#endif

	if (check_agent_args->node_count == 0) {
		hostlist_destroy(check_agent_args->hostlist);
		xfree (check_agent_args);
	} else {
		hostlist_uniq(check_agent_args->hostlist);
		host_str = hostlist_ranged_string_xmalloc(
				check_agent_args->hostlist);
		debug("Spawning health check agent for %s", host_str);
		xfree(host_str);
		ping_begin();
		agent_queue_request(check_agent_args);
	}
}
示例#12
0
/*
 * _filter_out - decide whether a node is excluded from the report by the
 *	node name, responding/dead and state filters held in params
 * node_ptr IN - node to test
 * RET - true if the node must not be reported, false if it passes every
 *	filter that is set
 */
static bool _filter_out(node_info_t *node_ptr)
{
	/* Built from params.nodes on first use and kept across calls */
	static hostlist_t host_list = NULL;
	/* Scratch record so the IS_NODE_* tests can be applied to a
	 * requested state value */
	node_info_t wanted_node, *wanted_ptr = &wanted_node;
	ListIterator state_iter;
	int *wanted_state;
	uint16_t base_state;
	uint16_t cpus = 0;
	bool match = false;

	if (params.nodes) {
		if (!host_list)
			host_list = hostlist_create(params.nodes);
		if (hostlist_find(host_list, node_ptr->name) == -1)
			return true;
	}

	if (IS_NODE_NO_RESPOND(node_ptr)) {
		if (params.responding_nodes)
			return true;
	} else if (params.dead_nodes) {
		return true;
	}

	if (!params.state_list)
		return false;

	/* The node passes if it matches any one of the requested states */
	state_iter = list_iterator_create(params.state_list);
	while ((wanted_state = list_next(state_iter))) {
		wanted_ptr->node_state = *wanted_state;
		if (*wanted_state == NODE_STATE_DRAIN) {
			/* Anything with the drain flag set */
			match = IS_NODE_DRAIN(node_ptr) ? true : false;
		} else if (IS_NODE_DRAINING(wanted_ptr)) {
			/* Anything that node_state_string maps to DRAINING */
			match = IS_NODE_DRAINING(node_ptr) ? true : false;
		} else if (IS_NODE_DRAINED(wanted_ptr)) {
			/* Anything that node_state_string maps to DRAINED */
			match = IS_NODE_DRAINED(node_ptr) ? true : false;
		} else if (*wanted_state & NODE_STATE_FLAGS) {
			match = (*wanted_state & node_ptr->node_state) ?
				true : false;
		} else if (*wanted_state == NODE_STATE_ERROR) {
			slurm_get_select_nodeinfo(node_ptr->select_nodeinfo,
						  SELECT_NODEDATA_SUBCNT,
						  NODE_STATE_ERROR,
						  &cpus);
			match = cpus ? true : false;
		} else if (*wanted_state == NODE_STATE_ALLOCATED) {
			slurm_get_select_nodeinfo(node_ptr->select_nodeinfo,
						  SELECT_NODEDATA_SUBCNT,
						  NODE_STATE_ALLOCATED,
						  &cpus);
			/* With CLUSTER_FLAG_BG set and no allocated count
			 * reported, fall back to the node's cpu count when
			 * its state is ALLOCATED or COMPLETING */
			if ((params.cluster_flags & CLUSTER_FLAG_BG)
			    && !cpus &&
			    (IS_NODE_ALLOCATED(node_ptr) ||
			     IS_NODE_COMPLETING(node_ptr)))
				cpus = node_ptr->cpus;
			match = cpus ? true : false;
		} else if (*wanted_state == NODE_STATE_IDLE) {
			/* Compare the whole state word, ignoring only the
			 * NO_RESPOND flag */
			base_state = node_ptr->node_state &
				(~NODE_STATE_NO_RESPOND);
			match = (base_state == NODE_STATE_IDLE);
		} else {
			base_state = node_ptr->node_state & NODE_STATE_BASE;
			match = (base_state == *wanted_state);
		}
		if (match)
			break;
	}
	list_iterator_destroy(state_iter);

	return !match;
}
示例#13
0
/*
 * Decide whether a queued batch launch request should run now, wait, or be
 * dropped. Requests of any other message type always run now.
 * queued_req_ptr IN/OUT - queued request; its first_attempt and last_attempt
 *	times are updated
 * RET -1: abort the request, pending job cancelled or its node is missing
 *      0: execute the request now
 *      1: defer the request
 */
static int _batch_launch_defer(queued_request_t *queued_req_ptr)
{
	agent_arg_t *req_args = queued_req_ptr->agent_arg_ptr;
	batch_job_launch_msg_t *launch_msg;
	struct job_record *job_ptr;
	time_t now = time(NULL);
	int boot_delay, ready = 0;

	if (req_args->msg_type != REQUEST_BATCH_JOB_LAUNCH)
		return 0;

	/* Reduce overhead by only testing once every 10 secs */
	if (difftime(now, queued_req_ptr->last_attempt) < 10)
		return 1;

	launch_msg = (batch_job_launch_msg_t *) req_args->msg_args;
	job_ptr = find_job_record(launch_msg->job_id);
	if (!job_ptr ||
	    (!IS_JOB_RUNNING(job_ptr) && !IS_JOB_SUSPENDED(job_ptr))) {
		/* job cancelled while waiting */
		info("agent(batch_launch): removed pending request for "
		     "cancelled job %u",
		     launch_msg->job_id);
		return -1;
	}

	if (job_ptr->wait_all_nodes) {
		(void) job_node_ready(launch_msg->job_id, &ready);
	} else {
#ifdef HAVE_FRONT_END
		ready = 1;
#else
		struct node_record *batch_node;
		char *batch_host;

		batch_host = hostlist_deranged_string_xmalloc(
					req_args->hostlist);
		batch_node = find_node_record(batch_host);
		if (!batch_node) {
			/* invalid request?? */
			error("agent(batch_launch) removed pending request for "
			      "job %u, missing node %s",
			      launch_msg->job_id, batch_host);
			xfree(batch_host);
			return -1;
		}
		xfree(batch_host);
		if (!IS_NODE_POWER_SAVE(batch_node) &&
		    !IS_NODE_NO_RESPOND(batch_node))
			ready = 1;
#endif
	}

	boot_delay = difftime(now, job_ptr->start_time);
	if (!ready) {
		if (queued_req_ptr->last_attempt == 0) {
			/* First deferral: start timing the wait */
			queued_req_ptr->first_attempt = now;
			queued_req_ptr->last_attempt  = now;
			return 1;
		}
		if (difftime(now, queued_req_ptr->first_attempt) <
		    slurm_get_resume_timeout()) {
			queued_req_ptr->last_attempt = now;
			return 1;
		}
		/* Resume timeout reached: fall through and launch */
		error("agent waited too long for nodes to respond, "
		      "sending batch request anyway...");
	}

	/* Launching now: extend the job's end time by the time spent
	 * waiting */
	if (boot_delay && (job_ptr->time_limit != INFINITE) &&
	    (!wiki2_sched)) {
		info("Job %u launch delayed by %d secs, "
		     "updating end_time",
		     launch_msg->job_id, boot_delay);
		job_ptr->end_time += boot_delay;
	}
	queued_req_ptr->last_attempt = (time_t) 0;
	return 0;
}