Example 1
/*
 * slurm_get_node_energy - issue RPC to get the energy data of all
 * configured sensors on the target machine
 * IN  host  - name of node to query, NULL if localhost
 * IN  delta - use cached data if it is newer than this many seconds
 * OUT sensor_cnt - number of sensors
 * OUT energy - array of acct_gather_energy_t structures on success or
 *                NULL otherwise
 * RET 0 on success or a slurm error code
 * NOTE: free the response using xfree
 */
extern int slurm_get_node_energy(char *host, uint16_t delta,
				 uint16_t *sensor_cnt,
				 acct_gather_energy_t **energy)
{
	int rc;
	slurm_msg_t req_msg;
	slurm_msg_t resp_msg;
	acct_gather_energy_req_msg_t req;
	uint32_t cluster_flags = slurmdb_setup_cluster_flags();
	char *this_addr;

	xassert(sensor_cnt);
	xassert(energy);

	*sensor_cnt = 0;
	*energy = NULL;

	slurm_msg_t_init(&req_msg);
	slurm_msg_t_init(&resp_msg);

	if (host)
		slurm_conf_get_addr(host, &req_msg.address);
	else if (cluster_flags & CLUSTER_FLAG_MULTSD) {
		if ((this_addr = getenv("SLURMD_NODENAME"))) {
			slurm_conf_get_addr(this_addr, &req_msg.address);
		} else {
			this_addr = "localhost";
			slurm_set_addr(&req_msg.address,
				       (uint16_t)slurm_get_slurmd_port(),
				       this_addr);
		}
	} else {
		char this_host[256];
		/*
		 *  Set request message address to slurmd on localhost
		 */
		gethostname_short(this_host, sizeof(this_host));
		this_addr = slurm_conf_get_nodeaddr(this_host);
		if (this_addr == NULL)
			this_addr = xstrdup("localhost");
		slurm_set_addr(&req_msg.address,
			       (uint16_t)slurm_get_slurmd_port(),
			       this_addr);
		xfree(this_addr);
	}

	req.delta        = delta;
	req_msg.msg_type = REQUEST_ACCT_GATHER_ENERGY;
	req_msg.data     = &req;

	rc = slurm_send_recv_node_msg(&req_msg, &resp_msg, 0);

	if (rc != 0 || !resp_msg.auth_cred) {
		error("slurm_get_node_energy: %m");
		if (resp_msg.auth_cred)
			g_slurm_auth_destroy(resp_msg.auth_cred);
		return SLURM_ERROR;
	}
	if (resp_msg.auth_cred)
		g_slurm_auth_destroy(resp_msg.auth_cred);
	switch (resp_msg.msg_type) {
	case RESPONSE_ACCT_GATHER_ENERGY:
		*sensor_cnt = ((acct_gather_node_resp_msg_t *)
			       resp_msg.data)->sensor_cnt;
		*energy = ((acct_gather_node_resp_msg_t *)
			   resp_msg.data)->energy;
		((acct_gather_node_resp_msg_t *) resp_msg.data)->energy = NULL;
		slurm_free_acct_gather_node_resp_msg(resp_msg.data);
		break;
	case RESPONSE_SLURM_RC:
		rc = ((return_code_msg_t *) resp_msg.data)->return_code;
		slurm_free_return_code_msg(resp_msg.data);
		if (rc)
			slurm_seterrno_ret(rc);
		break;
	default:
		slurm_seterrno_ret(SLURM_UNEXPECTED_MSG_ERROR);
		break;
	}

	return SLURM_PROTOCOL_SUCCESS;
}
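
A minimal caller sketch for the function above. Only the call signature and the xfree() cleanup come from the header comment; the node name "node01", the 10-second delta, and the consumed_energy field are illustrative assumptions.

/* Hypothetical usage sketch: query one node's energy sensors,
 * accepting cached data up to 10 seconds old, then free the
 * returned array with xfree() as the NOTE above directs. */
static void _print_node_energy(void)
{
	uint16_t sensor_cnt = 0;
	acct_gather_energy_t *energy = NULL;
	int i;

	if (slurm_get_node_energy("node01", 10, &sensor_cnt, &energy)
	    != SLURM_PROTOCOL_SUCCESS) {
		error("slurm_get_node_energy: %m");
		return;
	}
	for (i = 0; i < sensor_cnt; i++)
		info("sensor %d: %llu joules consumed", i,
		     (unsigned long long) energy[i].consumed_energy);
	xfree(energy);
}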
Example 2
/*
 * slurm_load_slurmd_status - issue RPC to get the status of slurmd
 *	daemon on this machine
 * OUT slurmd_status_ptr - place to store slurmd status information
 * RET 0 on success or -1 on error
 * NOTE: free the response using slurm_free_slurmd_status()
 */
extern int
slurm_load_slurmd_status(slurmd_status_t **slurmd_status_ptr)
{
	int rc;
	slurm_msg_t req_msg;
	slurm_msg_t resp_msg;
	uint32_t cluster_flags = slurmdb_setup_cluster_flags();
	char *this_addr;

	slurm_msg_t_init(&req_msg);
	slurm_msg_t_init(&resp_msg);

	if (cluster_flags & CLUSTER_FLAG_MULTSD) {
		if ((this_addr = getenv("SLURMD_NODENAME"))) {
			slurm_conf_get_addr(this_addr, &req_msg.address);
		} else {
			this_addr = "localhost";
			slurm_set_addr(&req_msg.address,
				       (uint16_t)slurm_get_slurmd_port(),
				       this_addr);
		}
	} else {
		char this_host[256];

		/*
		 *  Set request message address to slurmd on localhost
		 */
		gethostname_short(this_host, sizeof(this_host));
		this_addr = slurm_conf_get_nodeaddr(this_host);
		if (this_addr == NULL)
			this_addr = xstrdup("localhost");
		slurm_set_addr(&req_msg.address,
			       (uint16_t)slurm_get_slurmd_port(),
			       this_addr);
		xfree(this_addr);
	}
	req_msg.msg_type = REQUEST_DAEMON_STATUS;
	req_msg.data     = NULL;

	rc = slurm_send_recv_node_msg(&req_msg, &resp_msg, 0);

	if ((rc != 0) || !resp_msg.auth_cred) {
		error("slurm_slurmd_info: %m");
		if (resp_msg.auth_cred)
			g_slurm_auth_destroy(resp_msg.auth_cred);
		return SLURM_ERROR;
	}
	if (resp_msg.auth_cred)
		g_slurm_auth_destroy(resp_msg.auth_cred);

	switch (resp_msg.msg_type) {
	case RESPONSE_SLURMD_STATUS:
		*slurmd_status_ptr = (slurmd_status_t *) resp_msg.data;
		break;
	case RESPONSE_SLURM_RC:
		rc = ((return_code_msg_t *) resp_msg.data)->return_code;
		slurm_free_return_code_msg(resp_msg.data);
		if (rc)
			slurm_seterrno_ret(rc);
		break;
	default:
		slurm_seterrno_ret(SLURM_UNEXPECTED_MSG_ERROR);
		break;
	}

	return SLURM_PROTOCOL_SUCCESS;
}
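
A matching usage sketch for this RPC. The hostname and pid fields printed below are assumptions about slurmd_status_t; the call pattern and the slurm_free_slurmd_status() cleanup follow the header comment.

/* Hypothetical usage sketch: fetch the local slurmd status and
 * release it with slurm_free_slurmd_status() as the NOTE directs. */
static int _print_slurmd_status(void)
{
	slurmd_status_t *status = NULL;

	if (slurm_load_slurmd_status(&status) != SLURM_PROTOCOL_SUCCESS) {
		error("slurm_load_slurmd_status: %m");
		return SLURM_ERROR;
	}
	info("slurmd on %s: pid %u", status->hostname, status->pid);
	slurm_free_slurmd_status(status);
	return SLURM_SUCCESS;
}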
Example 3
/*
 * Read the slurm configuration file (slurm.conf) and substitute some
 * values into the slurmd configuration in preference to the defaults.
 */
static void
_read_config(void)
{
	char *path_pubkey = NULL;
	slurm_ctl_conf_t *cf = NULL;
	int tmp16 = 0;	/* int rather than uint16_t so the -1 error
			 * return of acct_gather_parse_freq() survives
			 * the comparison below */

#ifndef HAVE_FRONT_END
	bool cr_flag = false, gang_flag = false;
	uint16_t fast_schedule = 0;	/* copied out while cf is locked */
#endif

	cf = slurm_conf_lock();

	slurm_mutex_lock(&conf->config_mutex);

	if (conf->conffile == NULL)
		conf->conffile = xstrdup(cf->slurm_conf);

	conf->slurm_user_id = cf->slurm_user_id;

	conf->cr_type = cf->select_type_param;

	path_pubkey = xstrdup(cf->job_credential_public_certificate);

	if (!conf->logfile)
		conf->logfile = xstrdup(cf->slurmd_logfile);

#ifndef HAVE_FRONT_END
	if (!strcmp(cf->select_type, "select/cons_res"))
		cr_flag = true;
	if (cf->preempt_mode & PREEMPT_MODE_GANG)
		gang_flag = true;
	/* Copy this now: cf must not be dereferenced after the
	 * slurm_conf_unlock() call below. */
	fast_schedule = cf->fast_schedule;
#endif

	slurm_conf_unlock();
	/* node_name may already be set from a command line parameter */
	if (conf->node_name == NULL)
		conf->node_name = slurm_conf_get_nodename(conf->hostname);
	/* If we didn't match the form of the hostname already
	 * stored in conf->hostname, check to see if we match any
	 * valid aliases */
	if (conf->node_name == NULL)
		conf->node_name = slurm_conf_get_aliased_nodename();

	if (conf->node_name == NULL)
		conf->node_name = slurm_conf_get_nodename("localhost");

	if (conf->node_name == NULL)
		fatal("Unable to determine this slurmd's NodeName");

	_massage_pathname(&conf->logfile);

	/* set node_addr if relevant */
	if ((conf->node_addr == NULL) &&
	    (conf->node_addr = slurm_conf_get_nodeaddr(conf->hostname)) &&
	    (strcmp(conf->node_addr, conf->hostname) == 0)) {
		xfree(conf->node_addr);	/* Sets to NULL */
	}

	conf->port = slurm_conf_get_port(conf->node_name);
	slurm_conf_get_cpus_bsct(conf->node_name,
				 &conf->conf_cpus, &conf->conf_boards,
				 &conf->conf_sockets, &conf->conf_cores,
				 &conf->conf_threads);

	/* store hardware properties in slurmd_config */
	xfree(conf->block_map);
	xfree(conf->block_map_inv);

	_update_logging();
	_update_nice();

	get_cpuinfo(&conf->actual_cpus,
		    &conf->actual_boards,
		    &conf->actual_sockets,
		    &conf->actual_cores,
		    &conf->actual_threads,
		    &conf->block_map_size,
		    &conf->block_map, &conf->block_map_inv);
#ifdef HAVE_FRONT_END
	/*
	 * When running with multiple frontends, the slurmd S:C:T values are not
	 * relevant, hence ignored by both _register_front_ends (sets all to 1)
	 * and validate_nodes_via_front_end (uses slurm.conf values).
	 * Report actual hardware configuration, irrespective of FastSchedule.
	 */
	conf->cpus    = conf->actual_cpus;
	conf->boards  = conf->actual_boards;
	conf->sockets = conf->actual_sockets;
	conf->cores   = conf->actual_cores;
	conf->threads = conf->actual_threads;
#else
	/* If the actual resources on a node differ from what is in
	 * the configuration file and we are using cons_res or gang
	 * scheduling, we have to use what is in the configuration file
	 * because the slurmctld creates bitmaps for scheduling before
	 * these nodes check in.
	 */
	if (((fast_schedule == 0) && !cr_flag && !gang_flag) ||
	    ((fast_schedule == 1) &&
	     (conf->actual_cpus < conf->conf_cpus))) {
		conf->cpus    = conf->actual_cpus;
		conf->boards  = conf->actual_boards;
		conf->sockets = conf->actual_sockets;
		conf->cores   = conf->actual_cores;
		conf->threads = conf->actual_threads;
	} else {
		conf->cpus    = conf->conf_cpus;
		conf->boards  = conf->conf_boards;
		conf->sockets = conf->conf_sockets;
		conf->cores   = conf->conf_cores;
		conf->threads = conf->conf_threads;
	}

	if ((conf->cpus    != conf->actual_cpus)    ||
	    (conf->sockets != conf->actual_sockets) ||
	    (conf->cores   != conf->actual_cores)   ||
	    (conf->threads != conf->actual_threads)) {
		if (fast_schedule) {
			info("Node configuration differs from hardware: "
			     "CPUs=%u:%u(hw) Boards=%u:%u(hw) "
			     "SocketsPerBoard=%u:%u(hw) CoresPerSocket=%u:%u(hw) "
			     "ThreadsPerCore=%u:%u(hw)",
			     conf->cpus,    conf->actual_cpus,
			     conf->boards,  conf->actual_boards,
			     conf->sockets, conf->actual_sockets,
			     conf->cores,   conf->actual_cores,
			     conf->threads, conf->actual_threads);
		} else if ((fast_schedule == 0) && (cr_flag || gang_flag)) {
			error("You are using cons_res or gang scheduling with "
			      "FastSchedule=0 and node configuration differs "
			      "from hardware.  The node configuration used "
			      "will be what is in the slurm.conf because of "
			      "the bitmaps the slurmctld must create before "
			      "the slurmd registers.\n"
			      "   CPUs=%u:%u(hw) Boards=%u:%u(hw) "
			      "SocketsPerBoard=%u:%u(hw) CoresPerSocket=%u:%u(hw) "
			      "ThreadsPerCore=%u:%u(hw)",
			      conf->cpus,    conf->actual_cpus,
			      conf->boards,  conf->actual_boards,
			      conf->sockets, conf->actual_sockets,
			      conf->cores,   conf->actual_cores,
			      conf->threads, conf->actual_threads);
		}
	}
#endif

	get_memory(&conf->real_memory_size);
	get_up_time(&conf->up_time);

	cf = slurm_conf_lock();
	get_tmp_disk(&conf->tmp_disk_space, cf->tmp_fs);
	_free_and_set(&conf->epilog,   xstrdup(cf->epilog));
	_free_and_set(&conf->prolog,   xstrdup(cf->prolog));
	_free_and_set(&conf->tmpfs,    xstrdup(cf->tmp_fs));
	_free_and_set(&conf->health_check_program,
		      xstrdup(cf->health_check_program));
	_free_and_set(&conf->spooldir, xstrdup(cf->slurmd_spooldir));
	_massage_pathname(&conf->spooldir);
	_free_and_set(&conf->pidfile,  xstrdup(cf->slurmd_pidfile));
	_massage_pathname(&conf->pidfile);
	_free_and_set(&conf->select_type, xstrdup(cf->select_type));
	_free_and_set(&conf->task_prolog, xstrdup(cf->task_prolog));
	_free_and_set(&conf->task_epilog, xstrdup(cf->task_epilog));
	_free_and_set(&conf->pubkey,   path_pubkey);

	conf->debug_flags = cf->debug_flags;
	conf->propagate_prio = cf->propagate_prio_process;

	_free_and_set(&conf->job_acct_gather_freq,
		      xstrdup(cf->job_acct_gather_freq));

	conf->acct_freq_task = (uint16_t)NO_VAL;
	tmp16 = acct_gather_parse_freq(PROFILE_TASK,
				       conf->job_acct_gather_freq);
	if (tmp16 != -1)
		conf->acct_freq_task = tmp16;

	_free_and_set(&conf->acct_gather_energy_type,
		      xstrdup(cf->acct_gather_energy_type));
	_free_and_set(&conf->acct_gather_filesystem_type,
		      xstrdup(cf->acct_gather_filesystem_type));
	_free_and_set(&conf->acct_gather_infiniband_type,
		      xstrdup(cf->acct_gather_infiniband_type));
	_free_and_set(&conf->acct_gather_profile_type,
		      xstrdup(cf->acct_gather_profile_type));
	_free_and_set(&conf->job_acct_gather_type,
		      xstrdup(cf->job_acct_gather_type));

	if ( (conf->node_name == NULL) ||
	     (conf->node_name[0] == '\0') )
		fatal("Node name lookup failure");

	if (cf->control_addr == NULL)
		fatal("Unable to establish controller machine");
	if (cf->slurmctld_port == 0)
		fatal("Unable to establish controller port");
	conf->slurmd_timeout = cf->slurmd_timeout;
	conf->use_pam = cf->use_pam;
	conf->task_plugin_param = cf->task_plugin_param;

	slurm_mutex_unlock(&conf->config_mutex);
	slurm_conf_unlock();
}
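
_free_and_set() and _massage_pathname() are static helpers defined elsewhere in slurmd.c and not shown in this excerpt. Inferred purely from its call sites above, _free_and_set() plausibly reduces to the following sketch: release the previous string and take ownership of the freshly xstrdup()'d one.

/* Sketch inferred from the call sites, not the actual definition. */
static void _free_and_set(char **conf_var, char *new_value)
{
	xfree(*conf_var);	/* xfree() is NULL-safe */
	*conf_var = new_value;
}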