/*
 * slurm_get_node_energy - issue RPC to get the energy data of all
 *	configured sensors on the target machine
 * IN  host - name of node to query, NULL if localhost
 * IN  delta - use cached data if it is newer than this many seconds
 * OUT sensor_cnt - number of energy sensors reported by the node
 * OUT energy - array of acct_gather_energy_t structures on success or
 *	NULL otherwise; ownership transfers to the caller
 * RET 0 on success or a slurm error code
 * NOTE: free the response using xfree
 */
extern int slurm_get_node_energy(char *host, uint16_t delta,
				 uint16_t *sensor_cnt,
				 acct_gather_energy_t **energy)
{
	int rc;
	slurm_msg_t req_msg;
	slurm_msg_t resp_msg;
	acct_gather_energy_req_msg_t req;
	uint32_t cluster_flags = slurmdb_setup_cluster_flags();
	char *this_addr;

	xassert(sensor_cnt);
	xassert(energy);

	/* Initialize outputs so callers see sane values on any error path */
	*sensor_cnt = 0;
	*energy = NULL;

	slurm_msg_t_init(&req_msg);
	slurm_msg_t_init(&resp_msg);

	/*
	 * Resolve the slurmd address to contact: an explicit host wins;
	 * otherwise, on multiple-slurmd clusters, use SLURMD_NODENAME from
	 * the environment; otherwise fall back to this host / localhost.
	 */
	if (host)
		slurm_conf_get_addr(host, &req_msg.address);
	else if (cluster_flags & CLUSTER_FLAG_MULTSD) {
		if ((this_addr = getenv("SLURMD_NODENAME"))) {
			slurm_conf_get_addr(this_addr, &req_msg.address);
		} else {
			this_addr = "localhost";
			slurm_set_addr(&req_msg.address,
				       (uint16_t)slurm_get_slurmd_port(),
				       this_addr);
		}
	} else {
		char this_host[256];
		/*
		 * Set request message address to slurmd on localhost
		 */
		gethostname_short(this_host, sizeof(this_host));
		this_addr = slurm_conf_get_nodeaddr(this_host);
		if (this_addr == NULL)
			this_addr = xstrdup("localhost");
		slurm_set_addr(&req_msg.address,
			       (uint16_t)slurm_get_slurmd_port(),
			       this_addr);
		xfree(this_addr);	/* xstrdup'ed by either branch above */
	}

	req.delta        = delta;
	req_msg.msg_type = REQUEST_ACCT_GATHER_ENERGY;
	req_msg.data     = &req;	/* req lives on the stack; not freed */

	rc = slurm_send_recv_node_msg(&req_msg, &resp_msg, 0);

	if (rc != 0 || !resp_msg.auth_cred) {
		error("slurm_get_node_energy: %m");
		if (resp_msg.auth_cred)
			g_slurm_auth_destroy(resp_msg.auth_cred);
		return SLURM_ERROR;
	}
	if (resp_msg.auth_cred)
		g_slurm_auth_destroy(resp_msg.auth_cred);
	switch (resp_msg.msg_type) {
	case RESPONSE_ACCT_GATHER_ENERGY:
		*sensor_cnt = ((acct_gather_node_resp_msg_t *)
			       resp_msg.data)->sensor_cnt;
		*energy = ((acct_gather_node_resp_msg_t *)
			   resp_msg.data)->energy;
		/*
		 * Steal the energy array from the response before freeing
		 * it, so the free routine does not release what we return.
		 */
		((acct_gather_node_resp_msg_t *) resp_msg.data)->energy = NULL;
		slurm_free_acct_gather_node_resp_msg(resp_msg.data);
		break;
	case RESPONSE_SLURM_RC:
		rc = ((return_code_msg_t *) resp_msg.data)->return_code;
		slurm_free_return_code_msg(resp_msg.data);
		if (rc)
			slurm_seterrno_ret(rc);
		break;
	default:
		slurm_seterrno_ret(SLURM_UNEXPECTED_MSG_ERROR);
		break;
	}

	return SLURM_PROTOCOL_SUCCESS;
}
/*
 * slurm_load_slurmd_status - issue RPC to get the status of the slurmd
 *	daemon on this machine
 * OUT slurmd_status_ptr - place to store slurmd status information;
 *	ownership transfers to the caller on success
 * RET 0 or -1 on error
 * NOTE: free the response using slurm_free_slurmd_status()
 */
extern int slurm_load_slurmd_status(slurmd_status_t **slurmd_status_ptr)
{
	int rc;
	slurm_msg_t req_msg;
	slurm_msg_t resp_msg;
	uint32_t cluster_flags = slurmdb_setup_cluster_flags();
	char *this_addr;

	slurm_msg_t_init(&req_msg);
	slurm_msg_t_init(&resp_msg);

	/*
	 * Resolve the local slurmd address: on multiple-slurmd clusters use
	 * SLURMD_NODENAME from the environment, otherwise this host (falling
	 * back to localhost).
	 */
	if (cluster_flags & CLUSTER_FLAG_MULTSD) {
		if ((this_addr = getenv("SLURMD_NODENAME"))) {
			slurm_conf_get_addr(this_addr, &req_msg.address);
		} else {
			this_addr = "localhost";
			slurm_set_addr(&req_msg.address,
				       (uint16_t)slurm_get_slurmd_port(),
				       this_addr);
		}
	} else {
		char this_host[256];
		/*
		 * Set request message address to slurmd on localhost
		 */
		gethostname_short(this_host, sizeof(this_host));
		this_addr = slurm_conf_get_nodeaddr(this_host);
		if (this_addr == NULL)
			this_addr = xstrdup("localhost");
		slurm_set_addr(&req_msg.address,
			       (uint16_t)slurm_get_slurmd_port(),
			       this_addr);
		xfree(this_addr);	/* xstrdup'ed by either branch above */
	}
	req_msg.msg_type = REQUEST_DAEMON_STATUS;
	req_msg.data     = NULL;

	rc = slurm_send_recv_node_msg(&req_msg, &resp_msg, 0);

	if ((rc != 0) || !resp_msg.auth_cred) {
		/* Was "slurm_slurmd_info"; log the actual function name */
		error("slurm_load_slurmd_status: %m");
		if (resp_msg.auth_cred)
			g_slurm_auth_destroy(resp_msg.auth_cred);
		return SLURM_ERROR;
	}
	/* auth_cred is known non-NULL here (checked above) */
	g_slurm_auth_destroy(resp_msg.auth_cred);

	switch (resp_msg.msg_type) {
	case RESPONSE_SLURMD_STATUS:
		/* Hand the unpacked status structure to the caller */
		*slurmd_status_ptr = (slurmd_status_t *) resp_msg.data;
		break;
	case RESPONSE_SLURM_RC:
		rc = ((return_code_msg_t *) resp_msg.data)->return_code;
		slurm_free_return_code_msg(resp_msg.data);
		if (rc)
			slurm_seterrno_ret(rc);
		break;
	default:
		slurm_seterrno_ret(SLURM_UNEXPECTED_MSG_ERROR);
		break;
	}

	return SLURM_PROTOCOL_SUCCESS;
}
/* * Read the slurm configuration file (slurm.conf) and substitute some * values into the slurmd configuration in preference of the defaults. */ static void _read_config(void) { char *path_pubkey = NULL; slurm_ctl_conf_t *cf = NULL; uint16_t tmp16 = 0; #ifndef HAVE_FRONT_END bool cr_flag = false, gang_flag = false; #endif cf = slurm_conf_lock(); slurm_mutex_lock(&conf->config_mutex); if (conf->conffile == NULL) conf->conffile = xstrdup(cf->slurm_conf); conf->slurm_user_id = cf->slurm_user_id; conf->cr_type = cf->select_type_param; path_pubkey = xstrdup(cf->job_credential_public_certificate); if (!conf->logfile) conf->logfile = xstrdup(cf->slurmd_logfile); #ifndef HAVE_FRONT_END if (!strcmp(cf->select_type, "select/cons_res")) cr_flag = true; if (cf->preempt_mode & PREEMPT_MODE_GANG) gang_flag = true; #endif slurm_conf_unlock(); /* node_name may already be set from a command line parameter */ if (conf->node_name == NULL) conf->node_name = slurm_conf_get_nodename(conf->hostname); /* if we didn't match the form of the hostname already * stored in conf->hostname, check to see if we match any * valid aliases */ if (conf->node_name == NULL) conf->node_name = slurm_conf_get_aliased_nodename(); if (conf->node_name == NULL) conf->node_name = slurm_conf_get_nodename("localhost"); if (conf->node_name == NULL) fatal("Unable to determine this slurmd's NodeName"); _massage_pathname(&conf->logfile); /* set node_addr if relevant */ if ((conf->node_addr == NULL) && (conf->node_addr = slurm_conf_get_nodeaddr(conf->hostname)) && (strcmp(conf->node_addr, conf->hostname) == 0)) { xfree(conf->node_addr); /* Sets to NULL */ } conf->port = slurm_conf_get_port(conf->node_name); slurm_conf_get_cpus_bsct(conf->node_name, &conf->conf_cpus, &conf->conf_boards, &conf->conf_sockets, &conf->conf_cores, &conf->conf_threads); /* store hardware properties in slurmd_config */ xfree(conf->block_map); xfree(conf->block_map_inv); _update_logging(); _update_nice(); get_cpuinfo(&conf->actual_cpus, 
&conf->actual_boards, &conf->actual_sockets, &conf->actual_cores, &conf->actual_threads, &conf->block_map_size, &conf->block_map, &conf->block_map_inv); #ifdef HAVE_FRONT_END /* * When running with multiple frontends, the slurmd S:C:T values are not * relevant, hence ignored by both _register_front_ends (sets all to 1) * and validate_nodes_via_front_end (uses slurm.conf values). * Report actual hardware configuration, irrespective of FastSchedule. */ conf->cpus = conf->actual_cpus; conf->boards = conf->actual_boards; conf->sockets = conf->actual_sockets; conf->cores = conf->actual_cores; conf->threads = conf->actual_threads; #else /* If the actual resources on a node differ than what is in * the configuration file and we are using * cons_res or gang scheduling we have to use what is in the * configuration file because the slurmctld creates bitmaps * for scheduling before these nodes check in. */ if (((cf->fast_schedule == 0) && !cr_flag && !gang_flag) || ((cf->fast_schedule == 1) && (conf->actual_cpus < conf->conf_cpus))) { conf->cpus = conf->actual_cpus; conf->boards = conf->actual_boards; conf->sockets = conf->actual_sockets; conf->cores = conf->actual_cores; conf->threads = conf->actual_threads; } else { conf->cpus = conf->conf_cpus; conf->boards = conf->conf_boards; conf->sockets = conf->conf_sockets; conf->cores = conf->conf_cores; conf->threads = conf->conf_threads; } if ((conf->cpus != conf->actual_cpus) || (conf->sockets != conf->actual_sockets) || (conf->cores != conf->actual_cores) || (conf->threads != conf->actual_threads)) { if (cf->fast_schedule) { info("Node configuration differs from hardware: " "CPUs=%u:%u(hw) Boards=%u:%u(hw) " "SocketsPerBoard=%u:%u(hw) CoresPerSocket=%u:%u(hw) " "ThreadsPerCore=%u:%u(hw)", conf->cpus, conf->actual_cpus, conf->boards, conf->actual_boards, conf->sockets, conf->actual_sockets, conf->cores, conf->actual_cores, conf->threads, conf->actual_threads); } else if ((cf->fast_schedule == 0) && (cr_flag || gang_flag)) { 
error("You are using cons_res or gang scheduling with " "Fastschedule=0 and node configuration differs " "from hardware. The node configuration used " "will be what is in the slurm.conf because of " "the bitmaps the slurmctld must create before " "the slurmd registers.\n" " CPUs=%u:%u(hw) Boards=%u:%u(hw) " "SocketsPerBoard=%u:%u(hw) CoresPerSocket=%u:%u(hw) " "ThreadsPerCore=%u:%u(hw)", conf->cpus, conf->actual_cpus, conf->boards, conf->actual_boards, conf->sockets, conf->actual_sockets, conf->cores, conf->actual_cores, conf->threads, conf->actual_threads); } } #endif get_memory(&conf->real_memory_size); get_up_time(&conf->up_time); cf = slurm_conf_lock(); get_tmp_disk(&conf->tmp_disk_space, cf->tmp_fs); _free_and_set(&conf->epilog, xstrdup(cf->epilog)); _free_and_set(&conf->prolog, xstrdup(cf->prolog)); _free_and_set(&conf->tmpfs, xstrdup(cf->tmp_fs)); _free_and_set(&conf->health_check_program, xstrdup(cf->health_check_program)); _free_and_set(&conf->spooldir, xstrdup(cf->slurmd_spooldir)); _massage_pathname(&conf->spooldir); _free_and_set(&conf->pidfile, xstrdup(cf->slurmd_pidfile)); _massage_pathname(&conf->pidfile); _free_and_set(&conf->select_type, xstrdup(cf->select_type)); _free_and_set(&conf->task_prolog, xstrdup(cf->task_prolog)); _free_and_set(&conf->task_epilog, xstrdup(cf->task_epilog)); _free_and_set(&conf->pubkey, path_pubkey); conf->debug_flags = cf->debug_flags; conf->propagate_prio = cf->propagate_prio_process; _free_and_set(&conf->job_acct_gather_freq, xstrdup(cf->job_acct_gather_freq)); conf->acct_freq_task = (uint16_t)NO_VAL; tmp16 = acct_gather_parse_freq(PROFILE_TASK, conf->job_acct_gather_freq); if (tmp16 != -1) conf->acct_freq_task = tmp16; _free_and_set(&conf->acct_gather_energy_type, xstrdup(cf->acct_gather_energy_type)); _free_and_set(&conf->acct_gather_filesystem_type, xstrdup(cf->acct_gather_filesystem_type)); _free_and_set(&conf->acct_gather_infiniband_type, xstrdup(cf->acct_gather_infiniband_type)); 
_free_and_set(&conf->acct_gather_profile_type, xstrdup(cf->acct_gather_profile_type)); _free_and_set(&conf->job_acct_gather_type, xstrdup(cf->job_acct_gather_type)); if ( (conf->node_name == NULL) || (conf->node_name[0] == '\0') ) fatal("Node name lookup failure"); if (cf->control_addr == NULL) fatal("Unable to establish controller machine"); if (cf->slurmctld_port == 0) fatal("Unable to establish controller port"); conf->slurmd_timeout = cf->slurmd_timeout; conf->use_pam = cf->use_pam; conf->task_plugin_param = cf->task_plugin_param; slurm_mutex_unlock(&conf->config_mutex); slurm_conf_unlock(); }