コード例 #1
0
ファイル: capmc_suspend.c プロジェクト: A1ve5/slurm
/*
 * Power down a set of nodes with a single capmc invocation.
 *
 * node_names - node name expression, converted to a NID list by
 *		_node_names_2_nid_list()
 *
 * The request is treated as accepted when the script exit status is zero
 * or when its response text contains "Success" (case-insensitive).
 *
 * RET 0 if the request was accepted, -1 if the NID list could not be
 * built or capmc reported a failure.
 */
static int _update_all_nodes(char *node_names)
{
	/* Example command line: "capmc node_off -n 43".
	 * Slot 3 receives the NID list; the remaining slots stay NULL and
	 * terminate the argument vector. */
	char *cmd[10] = { "capmc", "node_off", "-n" };
	char *nids, *reply;
	int exit_code = 0, result = -1;

	nids = _node_names_2_nid_list(node_names);
	if (!nids)
		return -1;
	cmd[3] = nids;

	reply = _run_script(cmd, &exit_code);
	if ((exit_code != 0) &&
	    !(reply && strcasestr(reply, "Success"))) {
		error("%s: capmc(%s,%s,%s): %d %s", prog_name,
		      cmd[1], cmd[2], cmd[3], exit_code, reply);
	} else {
		debug("%s: node_off sent to %s", prog_name, cmd[3]);
		result = 0;
	}

	xfree(reply);
	xfree(nids);
	return result;
}
コード例 #2
0
ファイル: capmc_suspend.c プロジェクト: elodina/slurm
/*
 * Thread body: power down one node through capmc, then poll until the
 * node is reported "off" or NODE_OFF_STATE_WAIT seconds have elapsed.
 *
 * args - node name string (char *); the first run of decimal digits in it
 *	  is used as the NID
 *
 * On every exit path thread_cnt is decremented and thread_cnt_cond is
 * signalled under thread_cnt_mutex.
 *
 * RET always NULL
 */
static void *_node_update(void *args)
{
	char *node_name = (char *) args;
	char *argv[10], nid_str[32], *resp_msg;
	int i, nid = -1, status = 0;
	/* Both flags must start false: node_state_ok is tested by the wait
	 * loop before _check_node_state() has ever been called. */
	bool node_state_ok = false, node_off_sent = false;
	time_t poll_start;

	/* Extract the NID from the first digit sequence in the node name */
	for (i = 0; node_name[i]; i++) {
		if ((node_name[i] >= '0') && (node_name[i] <= '9')) {
			nid = strtol(node_name + i, NULL, 10);
			break;
		}
	}
	if (nid < 0) {
		error("%s: No valid NID: %s", prog_name, node_name);
		/* Still account for this thread's exit below */
		goto fini;
	}
	snprintf(nid_str, sizeof(nid_str), "%d", nid);

	/* Request node power down.
	 * Example: "capmc node_off -n 43" */
	argv[0] = "capmc";
	argv[1] = "node_off";
	argv[2] = "-n";
	argv[3] = nid_str;
	argv[4] = NULL;
	/* Retry (one second apart) until capmc accepts the request or
	 * NODE_OFF_RETRIES attempts have been made */
	for (i = 0; ((i < NODE_OFF_RETRIES) && !node_off_sent); i++) {
		resp_msg = _run_script(argv, &status);
		if ((status != 0) ||
		    (resp_msg && (strcasestr(resp_msg, "Success") == NULL))) {
			error("%s: capmc(%s,%s,%s): %d %s", prog_name,
			      argv[1], argv[2], argv[3], status, resp_msg);
			sleep(1);
		} else {
			debug("%s: node_off sent to %s", prog_name, nid_str);
			node_off_sent = true;
		}
		xfree(resp_msg);
	}

	/* Wait for node in "off" state */
	poll_start = time(NULL);
	while (!node_state_ok &&
	      (difftime(time(NULL), poll_start) < NODE_OFF_STATE_WAIT)) {
		sleep(capmc_poll_freq);
		node_state_ok = _check_node_state(nid, nid_str, "off");
	}

fini:	slurm_mutex_lock(&thread_cnt_mutex);
	thread_cnt--;
	pthread_cond_signal(&thread_cnt_cond);
	slurm_mutex_unlock(&thread_cnt_mutex);
	return NULL;
}
コード例 #3
0
ファイル: burst_buffer_generic.c プロジェクト: natefoo/slurm
/*
 * Trigger a job's burst buffer stage-out to begin
 *
 * Returns a SLURM errno.
 */
extern int bb_p_job_start_stage_out(struct job_record *job_ptr)
{
	bb_alloc_t *bb_ptr;
	char **script_argv, *resp;
	int i;

	if (debug_flag) {
		info("%s: %s",  __func__, plugin_type);
		info("%s: job_id:%u", __func__, job_ptr->job_id);
	}
	if ((job_ptr->burst_buffer == NULL) ||
	    (job_ptr->burst_buffer[0] == '\0') ||
	    (_get_bb_size(job_ptr) == 0))
		return SLURM_SUCCESS;

	pthread_mutex_lock(&bb_mutex);
	bb_ptr = _find_bb_job_rec(job_ptr);
	if (!bb_ptr) {
		/* No job buffers. Assuming use of persistent buffers only */
		debug("%s: job_id:%u bb_rec not found",
		      __func__, job_ptr->job_id);
	} else {
		script_argv = _build_stage_args(start_stage_out, "stage_out",
						job_ptr);
		if (script_argv) {
			bb_ptr->state = BB_STATE_STAGING_OUT;
			resp = _run_script("StartStageOut", start_stage_out,
					   script_argv, -1);
			if (resp) {
				error("%s: StartStageOut: %s", __func__, resp);
				xfree(resp);
			}
			for (i = 0; script_argv[i]; i++)
				xfree(script_argv[i]);
			xfree(script_argv);
		} else {
			bb_ptr->state = BB_STATE_STAGED_OUT;
		}
	}
	pthread_mutex_unlock(&bb_mutex);

	return SLURM_SUCCESS;
}
コード例 #4
0
ファイル: capmc_suspend.c プロジェクト: elodina/slurm
/*
 * Query capmc for the status of one node and report whether the node is
 * listed under the requested state.
 *
 * nid     - numeric node ID, compared against the IDs parsed from the reply
 * nid_str - the same node ID as a string, passed to "capmc node_status -n"
 * state   - name of the state to look up in the JSON reply (e.g. "off")
 *
 * RET true if nid is among the IDs _json_parse_nids() returns for state;
 * false otherwise, including when capmc exits non-zero or its output
 * cannot be parsed as JSON.
 */
static bool _check_node_state(int nid, char *nid_str, char *state)
{
	bool node_state_ok = false;
	char *argv[10], *resp_msg;
	int i, nid_cnt, status = 0;
	uint32_t *nid_array;
	json_object *j;

	argv[0] = "capmc";
	argv[1] = "node_status";
	argv[2] = "-n";
	argv[3] = nid_str;
	argv[4] = NULL;
	resp_msg = _run_script(argv, &status);
	if (status != 0) {
		error("%s: capmc(%s,%s,%s): %d %s", prog_name,
			argv[1], argv[2], argv[3], status, resp_msg);
		xfree(resp_msg);
		return node_state_ok;
	}
	j = json_tokener_parse(resp_msg);
	if (j == NULL) {
		error("%s: json parser failed on %s", prog_name, resp_msg);
		xfree(resp_msg);
		return node_state_ok;
	}
	xfree(resp_msg);

	/* Look up the caller-requested state rather than a fixed "off",
	 * so the state argument is actually honored */
	nid_cnt = 0;
	nid_array = _json_parse_nids(j, state, &nid_cnt);
	json_object_put(j);	/* Frees json memory */
	for (i = 0; i < nid_cnt; i++) {
		if (nid_array[i] == (uint32_t) nid) {
			node_state_ok = true;
			break;
		}
	}
	xfree(nid_array);

	return node_state_ok;
}
コード例 #5
0
ファイル: burst_buffer_generic.c プロジェクト: natefoo/slurm
/*
 * Determine the current actual burst buffer state.
 *
 * Runs the get_sys_state program with "get_sys" as its argument, feeds
 * each line of its output to the s_p parser and updates total_space from
 * the "TotalSize" value. Logs an error if no TotalSize is reported.
 * Returns without changing anything if the program produces no response.
 */
static void _load_state(void)
{
	static uint32_t last_total_space = 0;
	char *save_ptr = NULL, *tok, *leftover = NULL, *resp, *tmp = NULL;
	char *script_args[3] = { NULL, "get_sys", NULL };
	s_p_hashtbl_t *state_hashtbl = NULL;
	static s_p_options_t state_options[] = {
		{"ENOENT", S_P_STRING},
		{"UserID", S_P_ARRAY, _parse_job_info, _destroy_job_info},
		{"TotalSize", S_P_STRING},
		{NULL}
	};

	/* argv[0] is the program's base name (text after the last '/') */
	tok = strrchr(get_sys_state, '/');
	if (tok)
		script_args[0] = tok + 1;
	else
		script_args[0] = get_sys_state;
	resp = _run_script("GetSysState", get_sys_state, script_args, 10);
	if (resp == NULL)
		return;

	/* Parse the response one line at a time; strtok_r() splits resp
	 * in place, so resp must stay allocated until parsing is done */
	state_hashtbl = s_p_hashtbl_create(state_options);
	tok = strtok_r(resp, "\n", &save_ptr);
	while (tok) {
		s_p_parse_line(state_hashtbl, tok, &leftover);
		tok = strtok_r(NULL, "\n", &save_ptr);
	}
	if (s_p_get_string(&tmp, "TotalSize", state_hashtbl)) {
		total_space = _get_size_num(tmp);
		xfree(tmp);
	} else {
		error("%s: GetSysState failed to respond with TotalSize",
		      plugin_type);
	}
	s_p_hashtbl_destroy(state_hashtbl);
	/* Release the script output, which was previously leaked on every
	 * call; nothing references it once the table is destroyed */
	xfree(resp);

	/* Only report total_space when it differs from the previous call */
	if (debug_flag && (total_space != last_total_space))
		info("%s: total_space:%u",  __func__, total_space);
	last_total_space = total_space;
}
コード例 #6
0
ファイル: capmc_resume.c プロジェクト: jwhite530/slurm
/* Wait for all identified computed nodes to enter "on" state */
static void _wait_all_nodes_on(void)
{
	char *argv[10], *resp_msg;
	int i, nid_cnt = 0, status = 0;
	json_object *j;
	uint32_t *nid_array;
	time_t start_time = time(NULL);

	while ((difftime(time(NULL), start_time) < (30 * 60)) &&
	       (bit_set_count(node_bitmap) > 0)) {
		sleep(20);
		argv[0] = "capmc";
		argv[1] = "node_status";
		argv[2] = NULL;
		resp_msg = _run_script(argv, &status);
		if (status != 0) {
			error("%s: capmc(%s,%s,%s): %d %s", log_file,
				argv[1], argv[2], argv[3], status, resp_msg);
			break;
		}
		j = json_tokener_parse(resp_msg);
		if (j == NULL) {
			error("%s: json parser failed on %s",
			      log_file, resp_msg);
			xfree(resp_msg);
			break;
		}
		xfree(resp_msg);
		nid_cnt = 0;
		nid_array = _json_parse_nids(j, "on", &nid_cnt);
		json_object_put(j);	/* Frees json memory */
		for (i = 0; i < nid_cnt; i++) {
			bit_clear(node_bitmap, nid_array[i]);
		}
		xfree(nid_array);
	}
}
コード例 #7
0
ファイル: capmc_resume.c プロジェクト: jwhite530/slurm
/*
 * Thread body: reconfigure and power-cycle one node through capmc.
 *
 * args - node name string (char *); the first run of decimal digits in it
 *	  is used as the NID
 *
 * Steps, in order:
 *  1. If mcdram_mode is set, run "capmc set_mcdram_cfg".
 *  2. If numa_mode is set, run "capmc set_numa_cfg".
 *  3. Unless the node is already "off", run "capmc node_off" and poll
 *     every 2 seconds until it reports "off" (bounded, see below).
 *  4. Request power up.
 * capmc failures are logged and the sequence continues.
 *
 * On every exit path thread_cnt is decremented and thread_cnt_cond is
 * signalled under thread_cnt_mutex.
 *
 * RET always NULL
 */
static void *_node_update(void *args)
{
	/* Upper bound on the wait for "off" state. Without a limit, a node
	 * that never powers down would keep this thread (and thread_cnt)
	 * alive forever. */
	const int off_state_wait_secs = 30 * 60;
	char *node_name = (char *) args;
	char *argv[10], nid_str[32], *resp_msg;
	int i, nid = -1, status = 0;
	bool node_state_ok;
	time_t poll_start;

	/* Extract the NID from the first digit sequence in the node name */
	for (i = 0; node_name[i]; i++) {
		if ((node_name[i] >= '0') && (node_name[i] <= '9')) {
			nid = strtol(node_name + i, NULL, 10);
			break;
		}
	}
	if (nid < 0) {
		error("%s: No valid NID: %s", log_file, node_name);
		goto fini;
	}
	snprintf(nid_str, sizeof(nid_str), "%d", nid);

	if (mcdram_mode) {
		/* Update MCDRAM mode.
		 * Example: "capmc set_mcdram_cfg -n 43 -m cache" */
		argv[0] = "capmc";
		argv[1] = "set_mcdram_cfg";
		argv[2] = "-n";
		argv[3] = nid_str;
		argv[4] = "-m";
		argv[5] = mcdram_mode;
		argv[6] = NULL;
		resp_msg = _run_script(argv, &status);
		if (status != 0) {
			error("%s: capmc(%s,%s,%s,%s,%s): %d %s", log_file,
			      argv[1], argv[2], argv[3], argv[4], argv[5],
			      status, resp_msg);
		}
		xfree(resp_msg);
	}

	if (numa_mode) {
		/* Update NUMA mode.
		 * Example: "capmc set_numa_cfg -n 43 -m a2a" */
		argv[0] = "capmc";
		argv[1] = "set_numa_cfg";
		argv[2] = "-n";
		argv[3] = nid_str;
		argv[4] = "-m";
		argv[5] = numa_mode;
		argv[6] = NULL;
		resp_msg = _run_script(argv, &status);
		if (status != 0) {
			error("%s: capmc(%s,%s,%s,%s,%s): %d %s", log_file,
			      argv[1], argv[2], argv[3], argv[4], argv[5],
			      status, resp_msg);
		}
		xfree(resp_msg);
	}

	/* Test if already in "off" state */
	node_state_ok = _check_node_state(nid, nid_str, "off");

	/* Request node power down.
	 * Example: "capmc node_off -n 43" */
	if (!node_state_ok) {
		argv[0] = "capmc";
		argv[1] = "node_off";
		argv[2] = "-n";
		argv[3] = nid_str;
		argv[4] = NULL;
		resp_msg = _run_script(argv, &status);
		if (status != 0) {
			error("%s: capmc(%s,%s,%s): %d %s", log_file,
			      argv[1], argv[2], argv[3], status, resp_msg);
		}
		xfree(resp_msg);
	}

	/* Wait for node in "off" state, giving up after the time limit */
	poll_start = time(NULL);
	while (!node_state_ok &&
	       (difftime(time(NULL), poll_start) < off_state_wait_secs)) {
		sleep(2);
		node_state_ok = _check_node_state(nid, nid_str, "off");
	}
	if (!node_state_ok) {
		/* Logged only; the power-up request below is still sent */
		error("%s: nid %s not in \"off\" state after %d seconds",
		      log_file, nid_str, off_state_wait_secs);
	}

	/* Request node power up.
	 * Example: "capmc node_up -n 43"
	 * NOTE(review): Cray's capmc documentation appears to name this
	 * applet "node_on"; confirm that "node_up" is accepted. */
	argv[0] = "capmc";
	argv[1] = "node_up";
	argv[2] = "-n";
	argv[3] = nid_str;
	argv[4] = NULL;
	resp_msg = _run_script(argv, &status);
	if (status != 0) {
		error("%s: capmc(%s,%s,%s): %d %s", log_file,
			argv[1], argv[2], argv[3], status, resp_msg);
	}
	xfree(resp_msg);

fini:	slurm_mutex_lock(&thread_cnt_mutex);
	thread_cnt--;
	pthread_cond_signal(&thread_cnt_cond);
	slurm_mutex_unlock(&thread_cnt_mutex);
	return NULL;
}