Esempio n. 1
0
/* If slurmctld crashes, the node state that it recovers could differ
 * from the actual hardware state (e.g. ResumeProgram failed to complete).
 * To address that, when a node that should be powered up for a running
 * job is not responding, they try running ResumeProgram again. */
static void _re_wake(void)
{
	struct node_record *node_ptr;
	bitstr_t *wake_node_bitmap = NULL;
	int i;

	node_ptr = node_record_table_ptr;
	for (i=0; i<node_record_count; i++, node_ptr++) {
		if (IS_NODE_ALLOCATED(node_ptr)   &&
		    IS_NODE_NO_RESPOND(node_ptr)  &&
		    !IS_NODE_POWER_SAVE(node_ptr) &&
		    (bit_test(suspend_node_bitmap, i) == 0) &&
		    (bit_test(resume_node_bitmap,  i) == 0)) {
			if (wake_node_bitmap == NULL) {
				wake_node_bitmap =
					bit_alloc(node_record_count);
			}
			bit_set(wake_node_bitmap, i);
		}
	}

	if (wake_node_bitmap) {
		char *nodes;
		nodes = bitmap2node_name(wake_node_bitmap);
		if (nodes) {
			pid_t pid = _run_prog(resume_prog, nodes, NULL);
			info("power_save: pid %d rewaking nodes %s",
			     (int) pid, nodes);
		} else
			error("power_save: bitmap2nodename");
		xfree(nodes);
		FREE_NULL_BITMAP(wake_node_bitmap);
	}
}
Esempio n. 2
0
/*
 * _do_suspend - run suspend_prog for the given node name expression and log
 *	the value returned by _run_prog() as the pid.
 * IN host - node name(s) passed to suspend_prog and written to the log
 *
 * Logged with info() when _DEBUG is set, otherwise with verbose().
 */
static void _do_suspend(char *host)
{
	pid_t child;

	child = _run_prog(suspend_prog, host, NULL);
#if !_DEBUG
	verbose("power_save: pid %d suspending nodes %s", (int) child, host);
#else
	info("power_save: pid %d suspending nodes %s", (int) child, host);
#endif
}
Esempio n. 3
0
/*
 * _do_resume - run resume_prog for the given node name expression and log
 *	the value returned by _run_prog() as the pid.
 * IN host - node name(s) passed to resume_prog and written to the log
 *
 * Logged with info() when _DEBUG is set, otherwise with verbose().
 */
static void _do_resume(char *host)
{
	pid_t child;

	child = _run_prog(resume_prog, host, NULL);
#if !_DEBUG
	verbose("power_save: pid %d waking nodes %s", (int) child, host);
#else
	info("power_save: pid %d waking nodes %s", (int) child, host);
#endif
}
Esempio n. 4
0
/*
 * _do_suspend - log the suspend request, then run suspend_prog for it.
 * IN host - node name(s) written to the log and passed to suspend_prog
 *
 * The message is emitted before the program is started; it goes to info()
 * when _DEBUG is set and to verbose() otherwise.
 */
static void _do_suspend(char *host)
{
#if !_DEBUG
	verbose("power_save: suspending nodes %s", host);
#else
	info("power_save: suspending nodes %s", host);
#endif

	(void) _run_prog(suspend_prog, host, NULL);
}
Esempio n. 5
0
/*
 * _do_resume - log the resume request, then run resume_prog for it.
 * IN host - node name(s) written to the log and passed to resume_prog
 *
 * The message is emitted before the program is started; it goes to info()
 * when _DEBUG is set and to verbose() otherwise.
 */
static void _do_resume(char *host)
{
#if !_DEBUG
	verbose("power_save: waking nodes %s", host);
#else
	info("power_save: waking nodes %s", host);
#endif

	(void) _run_prog(resume_prog, host, NULL);
}
Esempio n. 6
0
/* power_job_reboot - Reboot compute nodes for a job from the head node */
extern int power_job_reboot(struct job_record *job_ptr)
{
	int rc = SLURM_SUCCESS;
	int i, i_first, i_last;
	struct node_record *node_ptr;
	bitstr_t *wake_node_bitmap = NULL;
	time_t now = time(NULL);
	char *nodes, *features = NULL;

	wake_node_bitmap = bit_alloc(node_record_count);
	i_first = bit_ffs(job_ptr->node_bitmap);
	i_last = bit_fls(job_ptr->node_bitmap);
	for (i = i_first; i <= i_last; i++) {
		if (!bit_test(job_ptr->node_bitmap, i))
			continue;
		node_ptr = node_record_table_ptr + i;
		resume_cnt++;
		resume_cnt_f++;
		node_ptr->node_state &= (~NODE_STATE_POWER_SAVE);
		node_ptr->node_state |=   NODE_STATE_POWER_UP;
		node_ptr->node_state |=   NODE_STATE_NO_RESPOND;
		bit_clear(power_node_bitmap, i);
		bit_clear(avail_node_bitmap, i);
		node_ptr->last_response = now + resume_timeout;
		bit_set(wake_node_bitmap,    i);
		bit_set(resume_node_bitmap,  i);
	}

	nodes = bitmap2node_name(wake_node_bitmap);
	if (nodes) {
#if _DEBUG
		info("power_save: reboot nodes %s", nodes);
#else
		verbose("power_save: reboot nodes %s", nodes);
#endif
		if (job_ptr->details && job_ptr->details->features)
			features = xlate_features(job_ptr->details->features);
		_run_prog(resume_prog, nodes, features);
		xfree(features);
	} else {
		error("power_save: bitmap2nodename");
		rc = SLURM_ERROR;
	}
	xfree(nodes);
	FREE_NULL_BITMAP(wake_node_bitmap);
	last_node_update = now;

	return rc;
}
Esempio n. 7
0
/* power_job_reboot - Reboot compute nodes for a job from the head node */
extern int power_job_reboot(struct job_record *job_ptr)
{
	int rc = SLURM_SUCCESS;
	int i, i_first, i_last;
	struct node_record *node_ptr;
	bitstr_t *boot_node_bitmap = NULL;
	time_t now = time(NULL);
	char *nodes, *features = NULL;
	pid_t pid;

	boot_node_bitmap = node_features_reboot(job_ptr);
	if (boot_node_bitmap == NULL)
		return SLURM_SUCCESS;

	i_first = bit_ffs(boot_node_bitmap);
	if (i_first >= 0)
		i_last = bit_fls(boot_node_bitmap);
	else
		i_last = i_first - 1;
	for (i = i_first; i <= i_last; i++) {
		if (!bit_test(boot_node_bitmap, i))
			continue;
		node_ptr = node_record_table_ptr + i;
		resume_cnt++;
		resume_cnt_f++;
		node_ptr->node_state &= (~NODE_STATE_POWER_SAVE);
		node_ptr->node_state |=   NODE_STATE_POWER_UP;
		node_ptr->node_state |=   NODE_STATE_NO_RESPOND;
		bit_clear(power_node_bitmap, i);
		bit_clear(avail_node_bitmap, i);
		node_ptr->last_response = now + resume_timeout;
		bit_set(resume_node_bitmap,  i);
	}

	nodes = bitmap2node_name(boot_node_bitmap);
	if (nodes) {
		job_ptr->job_state |= JOB_CONFIGURING;
		job_ptr->wait_all_nodes = 1;
		if (job_ptr->details && job_ptr->details->features &&
		    node_features_g_user_update(job_ptr->user_id)) {
			features = node_features_g_job_xlate(
					job_ptr->details->features);
		}
		pid = _run_prog(resume_prog, nodes, features);
#if _DEBUG
		info("power_save: pid %d reboot nodes %s features %s",
		     (int) pid, nodes, features);
#else
		verbose("power_save: pid %d reboot nodes %s features %s",
			(int) pid, nodes, features);
#endif
		xfree(features);
	} else {
		error("power_save: bitmap2nodename");
		rc = SLURM_ERROR;
	}
	xfree(nodes);
	FREE_NULL_BITMAP(boot_node_bitmap);
	last_node_update = now;

	return rc;
}